;
; jcgryext.asm - grayscale colorspace conversion (64-bit AVX2)
;
; Copyright (C) 2011, 2016, D. R. Commander.
; Copyright (C) 2015, Intel Corporation.
; Copyright (C) 2018, Matthias Räncker.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
17
18%include "jcolsamp.inc"
19
; --------------------------------------------------------------------------
;
; Convert some rows of samples to the output colorspace.
;
; GLOBAL(void)
; jsimd_rgb_gray_convert_avx2(JDIMENSION img_width, JSAMPARRAY input_buf,
;                             JSAMPIMAGE output_buf, JDIMENSION output_row,
;                             int num_rows);
;
; For every pixel of every row the routine computes
;     Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
; in fixed point (PD_ONEHALF is added, then the sum is shifted right by
; SCALEBITS) and stores the Y bytes to component 0 of output_buf, starting
; at row output_row.
;
; After collect_args the arguments live in:
;   r10d = JDIMENSION img_width
;   r11  = JSAMPARRAY input_buf
;   r12  = JSAMPIMAGE output_buf
;   r13d = JDIMENSION output_row
;   r14d = int num_rows
;
; Register roles inside the loops:
;   rax = rows still to convert
;   rcx = columns (pixels) still to convert in the current row
;   rsi = input row pointer array / inptr inside a row
;   rdi = output row pointer array / outptr0 inside a row
;
; Column processing:
;   * .columnloop converts SIZEOF_YMMWORD pixels per pass and stores one
;     YMMWORD of Y samples.
;   * When fewer than SIZEOF_YMMWORD pixels remain, the .column_ld* ladder
;     loads only the bytes that belong to the row, then forces
;     rcx = SIZEOF_YMMWORD so that exactly one more conversion pass runs.
;     That pass still stores a full YMMWORD at outptr0.
;     NOTE(review): this presumably relies on the output rows being padded
;     by the caller -- confirm against the buffer allocation code.
;
; Stack frame: rbp is re-pointed at a SIZEOF_YMMWORD-aligned address and
; WK_NUM ymmword scratch slots (wk(0)..wk(WK_NUM-1)) sit directly below it.
;

%define wk(i)   rbp - (WK_NUM - (i)) * SIZEOF_YMMWORD  ; ymmword wk[WK_NUM]
%define WK_NUM  2

    align       32
    GLOBAL_FUNCTION(jsimd_rgb_gray_convert_avx2)

EXTN(jsimd_rgb_gray_convert_avx2):
    push        rbp
    mov         rax, rsp                     ; rax = rsp after the push
                                             ;       (points at the saved rbp)
    sub         rsp, byte 4                  ; step below the saved rbp so the
                                             ; slot written at [rsp] below can
                                             ; never overlap it
    and         rsp, byte (-SIZEOF_YMMWORD)  ; align to 256 bits
    mov         [rsp], rax                   ; remember the unaligned rsp
    mov         rbp, rsp                     ; rbp = aligned frame base
    lea         rsp, [wk(0)]                 ; reserve wk[WK_NUM]
    collect_args 5
    push        rbx                          ; saved/restored; not referenced
                                             ; by the code below

    mov         ecx, r10d                    ; rcx = img_width (zero-extended)
    test        rcx, rcx
    jz          near .return                 ; nothing to do for width == 0

    push        rcx                          ; rcx is reused as the row index

    mov         rsi, r12
    mov         ecx, r13d
    mov         rdip, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY]
    lea         rdi, [rdi+rcx*SIZEOF_JSAMPROW]  ; rdi = &output_buf[0][output_row]

    pop         rcx                          ; rcx = img_width again

    mov         rsi, r11
    mov         eax, r14d                    ; rax = num_rows, zero-extended
    ; num_rows is a signed int.  The sign has to be examined on the 32-bit
    ; register: after the zero-extending mov above, bit 63 of rax is always
    ; clear, so a 64-bit test could never observe a negative count.
    test        eax, eax
    jle         near .return                 ; num_rows <= 0: nothing to do
.rowloop:
    push        rdi
    push        rsi
    push        rcx                     ; col

    mov         rsip, JSAMPROW [rsi]    ; inptr
    mov         rdip, JSAMPROW [rdi]    ; outptr0

    cmp         rcx, byte SIZEOF_YMMWORD
    jae         near .columnloop

%if RGB_PIXELSIZE == 3  ; ---------------

    ; Partial block, 3 bytes per pixel.  rcx is turned into a byte count and
    ; its bits are consumed from least to most significant.  Each set bit
    ; loads that many bytes from the current end of the data; bytes loaded
    ; earlier (higher addresses) are shifted up to make room.
.column_ld1:
    push        rax                     ; rax/rdx are scratch for the
    push        rdx                     ; byte/word sized loads
    lea         rcx, [rcx+rcx*2]        ; imul ecx,RGB_PIXELSIZE
    test        cl, SIZEOF_BYTE
    jz          short .column_ld2
    sub         rcx, byte SIZEOF_BYTE
    movzx       rax, byte [rsi+rcx]
.column_ld2:
    test        cl, SIZEOF_WORD
    jz          short .column_ld4
    sub         rcx, byte SIZEOF_WORD
    movzx       rdx, word [rsi+rcx]
    shl         rax, WORD_BIT
    or          rax, rdx
.column_ld4:
    vmovd       xmmA, eax
    pop         rdx
    pop         rax
    test        cl, SIZEOF_DWORD
    jz          short .column_ld8
    sub         rcx, byte SIZEOF_DWORD
    vmovd       xmmF, XMM_DWORD [rsi+rcx]
    vpslldq     xmmA, xmmA, SIZEOF_DWORD
    vpor        xmmA, xmmA, xmmF
.column_ld8:
    test        cl, SIZEOF_MMWORD
    jz          short .column_ld16
    sub         rcx, byte SIZEOF_MMWORD
    vmovq       xmmB, XMM_MMWORD [rsi+rcx]
    vpslldq     xmmA, xmmA, SIZEOF_MMWORD
    vpor        xmmA, xmmA, xmmB
.column_ld16:
    test        cl, SIZEOF_XMMWORD
    jz          short .column_ld32
    sub         rcx, byte SIZEOF_XMMWORD
    vmovdqu     xmmB, XMM_MMWORD [rsi+rcx]
    vperm2i128  ymmA, ymmA, ymmA, 1     ; move what was loaded so far to the
                                        ; upper 128-bit lane
    vpor        ymmA, ymmA, ymmB
.column_ld32:
    test        cl, SIZEOF_YMMWORD
    jz          short .column_ld64
    sub         rcx, byte SIZEOF_YMMWORD
    vmovdqa     ymmF, ymmA
    vmovdqu     ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD]
.column_ld64:
    test        cl, 2*SIZEOF_YMMWORD
    mov         rcx, SIZEOF_YMMWORD     ; mov leaves the flags of the test
                                        ; intact; one final pass will run
    jz          short .rgb_gray_cnv
    vmovdqa     ymmB, ymmA
    vmovdqu     ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD]
    vmovdqu     ymmF, YMMWORD [rsi+1*SIZEOF_YMMWORD]
    jmp         short .rgb_gray_cnv

.columnloop:
    vmovdqu     ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD]
    vmovdqu     ymmF, YMMWORD [rsi+1*SIZEOF_YMMWORD]
    vmovdqu     ymmB, YMMWORD [rsi+2*SIZEOF_YMMWORD]

.rgb_gray_cnv:
    ; ymmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
    ;       15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
    ; ymmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
    ;       0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
    ; ymmB=(1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q
    ;       2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)

    vmovdqa     ymmC, ymmA
    vinserti128 ymmA, ymmF, xmmA, 0  ; ymmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
                                     ;       0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
    vinserti128 ymmC, ymmC, xmmB, 0  ; ymmC=(1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q
                                     ;       15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
    vinserti128 ymmB, ymmB, xmmF, 0  ; ymmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
                                     ;       2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)
    vperm2i128  ymmF, ymmC, ymmC, 1  ; ymmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A
                                     ;       1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q)

    vmovdqa     ymmG, ymmA
    vpslldq     ymmA, ymmA, 8     ; ymmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12
                                  ;       22 03 13 23 04 14 24 05 0G 1G 2G 0H 1H 2H 0I 1I)
    vpsrldq     ymmG, ymmG, 8     ; ymmG=(22 03 13 23 04 14 24 05 0G 1G 2G 0H 1H 2H 0I 1I
                                  ;       2I 0J 1J 2J 0K 1K 2K 0L -- -- -- -- -- -- -- --)

    vpunpckhbw  ymmA, ymmA, ymmF  ; ymmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A
                                  ;       0G 0O 1G 1O 2G 2O 0H 0P 1H 1P 2H 2P 0I 0Q 1I 1Q)
    vpslldq     ymmF, ymmF, 8     ; ymmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27
                                  ;       08 18 28 09 19 29 0A 1A 1L 2L 0M 1M 2M 0N 1N 2N)

    vpunpcklbw  ymmG, ymmG, ymmB  ; ymmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D
                                  ;       2I 2Q 0J 0R 1J 1R 2J 2R 0K 0S 1K 1S 2K 2S 0L 0T)
    vpunpckhbw  ymmF, ymmF, ymmB  ; ymmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F
                                  ;       1L 1T 2L 2T 0M 0U 1M 1U 2M 2U 0N 0V 1N 1V 2N 2V)

    vmovdqa     ymmD, ymmA
    vpslldq     ymmA, ymmA, 8     ; ymmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09
                                  ;       11 19 21 29 02 0A 12 1A 0G 0O 1G 1O 2G 2O 0H 0P)
    vpsrldq     ymmD, ymmD, 8     ; ymmD=(11 19 21 29 02 0A 12 1A 0G 0O 1G 1O 2G 2O 0H 0P
                                  ;       1H 1P 2H 2P 0I 0Q 1I 1Q -- -- -- -- -- -- -- --)

    vpunpckhbw  ymmA, ymmA, ymmG  ; ymmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D
                                  ;       0G 0K 0O 0S 1G 1K 1O 1S 2G 2K 2O 2S 0H 0L 0P 0T)
    vpslldq     ymmG, ymmG, 8     ; ymmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B
                                  ;       04 0C 14 1C 24 2C 05 0D 2I 2Q 0J 0R 1J 1R 2J 2R)

    vpunpcklbw  ymmD, ymmD, ymmF  ; ymmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E
                                  ;       1H 1L 1P 1T 2H 2L 2P 2T 0I 0M 0Q 0U 1I 1M 1Q 1U)
    vpunpckhbw  ymmG, ymmG, ymmF  ; ymmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F
                                  ;       2I 2M 2Q 2U 0J 0N 0R 0V 1J 1N 1R 1V 2J 2N 2R 2V)

    vmovdqa     ymmE, ymmA
    vpslldq     ymmA, ymmA, 8     ; ymmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C
                                  ;       20 24 28 2C 01 05 09 0D 0G 0K 0O 0S 1G 1K 1O 1S)
    vpsrldq     ymmE, ymmE, 8     ; ymmE=(20 24 28 2C 01 05 09 0D 0G 0K 0O 0S 1G 1K 1O 1S
                                  ;       2G 2K 2O 2S 0H 0L 0P 0T -- -- -- -- -- -- -- --)

    vpunpckhbw  ymmA, ymmA, ymmD  ; ymmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E
                                  ;       0G 0I 0K 0M 0O 0Q 0S 0U 1G 1I 1K 1M 1O 1Q 1S 1U)
    vpslldq     ymmD, ymmD, 8     ; ymmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D
                                  ;       02 06 0A 0E 12 16 1A 1E 1H 1L 1P 1T 2H 2L 2P 2T)

    vpunpcklbw  ymmE, ymmE, ymmG  ; ymmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F
                                  ;       2G 2I 2K 2M 2O 2Q 2S 2U 0H 0J 0L 0N 0P 0R 0T 0V)
    vpunpckhbw  ymmD, ymmD, ymmG  ; ymmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F
                                  ;       1H 1J 1L 1N 1P 1R 1T 1V 2H 2J 2L 2N 2P 2R 2T 2V)

    vpxor       ymmH, ymmH, ymmH  ; zero, used to widen bytes to words

    vmovdqa     ymmC, ymmA
    vpunpcklbw  ymmA, ymmA, ymmH  ; ymmA=(00 02 04 06 08 0A 0C 0E 0G 0I 0K 0M 0O 0Q 0S 0U)
    vpunpckhbw  ymmC, ymmC, ymmH  ; ymmC=(10 12 14 16 18 1A 1C 1E 1G 1I 1K 1M 1O 1Q 1S 1U)

    vmovdqa     ymmB, ymmE
    vpunpcklbw  ymmE, ymmE, ymmH  ; ymmE=(20 22 24 26 28 2A 2C 2E 2G 2I 2K 2M 2O 2Q 2S 2U)
    vpunpckhbw  ymmB, ymmB, ymmH  ; ymmB=(01 03 05 07 09 0B 0D 0F 0H 0J 0L 0N 0P 0R 0T 0V)

    vmovdqa     ymmF, ymmD
    vpunpcklbw  ymmD, ymmD, ymmH  ; ymmD=(11 13 15 17 19 1B 1D 1F 1H 1J 1L 1N 1P 1R 1T 1V)
    vpunpckhbw  ymmF, ymmF, ymmH  ; ymmF=(21 23 25 27 29 2B 2D 2F 2H 2J 2L 2N 2P 2R 2T 2V)

%else  ; RGB_PIXELSIZE == 4 ; -----------

    ; Partial block, 4 bytes per pixel.  rcx stays a pixel count; its bits
    ; are consumed from least to most significant, each set bit loading that
    ; many pixels from the current end of the data.
.column_ld1:
    test        cl, SIZEOF_XMMWORD/16
    jz          short .column_ld2
    sub         rcx, byte SIZEOF_XMMWORD/16
    vmovd       xmmA, XMM_DWORD [rsi+rcx*RGB_PIXELSIZE]
.column_ld2:
    test        cl, SIZEOF_XMMWORD/8
    jz          short .column_ld4
    sub         rcx, byte SIZEOF_XMMWORD/8
    vmovq       xmmF, XMM_MMWORD [rsi+rcx*RGB_PIXELSIZE]
    vpslldq     xmmA, xmmA, SIZEOF_MMWORD
    vpor        xmmA, xmmA, xmmF
.column_ld4:
    test        cl, SIZEOF_XMMWORD/4
    jz          short .column_ld8
    sub         rcx, byte SIZEOF_XMMWORD/4
    vmovdqa     xmmF, xmmA
    vperm2i128  ymmF, ymmF, ymmF, 1     ; earlier data goes to the upper lane
    vmovdqu     xmmA, XMMWORD [rsi+rcx*RGB_PIXELSIZE]
    vpor        ymmA, ymmA, ymmF
.column_ld8:
    test        cl, SIZEOF_XMMWORD/2
    jz          short .column_ld16
    sub         rcx, byte SIZEOF_XMMWORD/2
    vmovdqa     ymmF, ymmA
    vmovdqu     ymmA, YMMWORD [rsi+rcx*RGB_PIXELSIZE]
.column_ld16:
    test        cl, SIZEOF_XMMWORD
    mov         rcx, SIZEOF_YMMWORD     ; mov leaves the flags of the test
                                        ; intact; one final pass will run
    jz          short .rgb_gray_cnv
    vmovdqa     ymmE, ymmA
    vmovdqa     ymmH, ymmF
    vmovdqu     ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD]
    vmovdqu     ymmF, YMMWORD [rsi+1*SIZEOF_YMMWORD]
    jmp         short .rgb_gray_cnv

.columnloop:
    vmovdqu     ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD]
    vmovdqu     ymmF, YMMWORD [rsi+1*SIZEOF_YMMWORD]
    vmovdqu     ymmE, YMMWORD [rsi+2*SIZEOF_YMMWORD]
    vmovdqu     ymmH, YMMWORD [rsi+3*SIZEOF_YMMWORD]

.rgb_gray_cnv:
    ; ymmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
    ;       04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
    ; ymmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
    ;       0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
    ; ymmE=(0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J
    ;       0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)
    ; ymmH=(0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R
    ;       0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)

    vmovdqa     ymmB, ymmA
    vinserti128 ymmA, ymmA, xmmE, 1     ; ymmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
                                        ;       0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J)
    vperm2i128  ymmE, ymmB, ymmE, 0x31  ; ymmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
                                        ;       0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)

    vmovdqa     ymmB, ymmF
    vinserti128 ymmF, ymmF, xmmH, 1     ; ymmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
                                        ;       0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R)
    vperm2i128  ymmH, ymmB, ymmH, 0x31  ; ymmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F
                                        ;       0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)

    vmovdqa     ymmD, ymmA
    vpunpcklbw  ymmA, ymmA, ymmE      ; ymmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35
                                      ;       0G 0K 1G 1K 2G 2K 3G 3K 0H 0L 1H 1L 2H 2L 3H 3L)
    vpunpckhbw  ymmD, ymmD, ymmE      ; ymmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37
                                      ;       0I 0M 1I 1M 2I 2M 3I 3M 0J 0N 1J 1N 2J 2N 3J 3N)

    vmovdqa     ymmC, ymmF
    vpunpcklbw  ymmF, ymmF, ymmH      ; ymmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D
                                      ;       0O 0S 1O 1S 2O 2S 3O 3S 0P 0T 1P 1T 2P 2T 3P 3T)
    vpunpckhbw  ymmC, ymmC, ymmH      ; ymmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F
                                      ;       0Q 0U 1Q 1U 2Q 2U 3Q 3U 0R 0V 1R 1V 2R 2V 3R 3V)

    vmovdqa     ymmB, ymmA
    vpunpcklwd  ymmA, ymmA, ymmF      ; ymmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C
                                      ;       0G 0K 0O 0S 1G 1K 1O 1S 2G 2K 2O 2S 3G 3K 3O 3S)
    vpunpckhwd  ymmB, ymmB, ymmF      ; ymmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D
                                      ;       0H 0L 0P 0T 1H 1L 1P 1T 2H 2L 2P 2T 3H 3L 3P 3T)

    vmovdqa     ymmG, ymmD
    vpunpcklwd  ymmD, ymmD, ymmC      ; ymmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E
                                      ;       0I 0M 0Q 0U 1I 1M 1Q 1U 2I 2M 2Q 2U 3I 3M 3Q 3U)
    vpunpckhwd  ymmG, ymmG, ymmC      ; ymmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F
                                      ;       0J 0N 0R 0V 1J 1N 1R 1V 2J 2N 2R 2V 3J 3N 3R 3V)

    vmovdqa     ymmE, ymmA
    vpunpcklbw  ymmA, ymmA, ymmD      ; ymmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E
                                      ;       0G 0I 0K 0M 0O 0Q 0S 0U 1G 1I 1K 1M 1O 1Q 1S 1U)
    vpunpckhbw  ymmE, ymmE, ymmD      ; ymmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E
                                      ;       2G 2I 2K 2M 2O 2Q 2S 2U 3G 3I 3K 3M 3O 3Q 3S 3U)

    vmovdqa     ymmH, ymmB
    vpunpcklbw  ymmB, ymmB, ymmG      ; ymmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F
                                      ;       0H 0J 0L 0N 0P 0R 0T 0V 1H 1J 1L 1N 1P 1R 1T 1V)
    vpunpckhbw  ymmH, ymmH, ymmG      ; ymmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F
                                      ;       2H 2J 2L 2N 2P 2R 2T 2V 3H 3J 3L 3N 3P 3R 3T 3V)

    vpxor       ymmF, ymmF, ymmF      ; zero, used to widen bytes to words

    vmovdqa     ymmC, ymmA
    vpunpcklbw  ymmA, ymmA, ymmF      ; ymmA=(00 02 04 06 08 0A 0C 0E 0G 0I 0K 0M 0O 0Q 0S 0U)
    vpunpckhbw  ymmC, ymmC, ymmF      ; ymmC=(10 12 14 16 18 1A 1C 1E 1G 1I 1K 1M 1O 1Q 1S 1U)

    vmovdqa     ymmD, ymmB
    vpunpcklbw  ymmB, ymmB, ymmF      ; ymmB=(01 03 05 07 09 0B 0D 0F 0H 0J 0L 0N 0P 0R 0T 0V)
    vpunpckhbw  ymmD, ymmD, ymmF      ; ymmD=(11 13 15 17 19 1B 1D 1F 1H 1J 1L 1N 1P 1R 1T 1V)

    vmovdqa     ymmG, ymmE
    vpunpcklbw  ymmE, ymmE, ymmF      ; ymmE=(20 22 24 26 28 2A 2C 2E 2G 2I 2K 2M 2O 2Q 2S 2U)
    vpunpckhbw  ymmG, ymmG, ymmF      ; ymmG=(30 32 34 36 38 3A 3C 3E 3G 3I 3K 3M 3O 3Q 3S 3U)

    ; The zero register is consumed here, so the last pair is widened by
    ; placing each byte in the high half of a word and shifting it down.
    vpunpcklbw  ymmF, ymmF, ymmH
    vpunpckhbw  ymmH, ymmH, ymmH
    vpsrlw      ymmF, ymmF, BYTE_BIT  ; ymmF=(21 23 25 27 29 2B 2D 2F 2H 2J 2L 2N 2P 2R 2T 2V)
    vpsrlw      ymmH, ymmH, BYTE_BIT  ; ymmH=(31 33 35 37 39 3B 3D 3F 3H 3J 3L 3N 3P 3R 3T 3V)

%endif  ; RGB_PIXELSIZE ; ---------------

    ; ymm0=R(02468ACEGIKMOQSU)=RE, ymm2=G(02468ACEGIKMOQSU)=GE, ymm4=B(02468ACEGIKMOQSU)=BE
    ; ymm1=R(13579BDFHJLNPRTV)=RO, ymm3=G(13579BDFHJLNPRTV)=GO, ymm5=B(13579BDFHJLNPRTV)=BO

    ; (Original)
    ; Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
    ;
    ; (This implementation)
    ; Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
    ;
    ; The G coefficient is split in two so that each vpmaddwd handles one
    ; (R,G) or (B,G) word pair.

    vmovdqa     ymm6, ymm1
    vpunpcklwd  ymm1, ymm1, ymm3
    vpunpckhwd  ymm6, ymm6, ymm3
    vpmaddwd    ymm1, ymm1, [rel PW_F0299_F0337]  ; ymm1=ROL*FIX(0.299)+GOL*FIX(0.337)
    vpmaddwd    ymm6, ymm6, [rel PW_F0299_F0337]  ; ymm6=ROH*FIX(0.299)+GOH*FIX(0.337)

    vmovdqa     ymm7, ymm6              ; ymm7=ROH*FIX(0.299)+GOH*FIX(0.337)

    vmovdqa     ymm6, ymm0
    vpunpcklwd  ymm0, ymm0, ymm2
    vpunpckhwd  ymm6, ymm6, ymm2
    vpmaddwd    ymm0, ymm0, [rel PW_F0299_F0337]  ; ymm0=REL*FIX(0.299)+GEL*FIX(0.337)
    vpmaddwd    ymm6, ymm6, [rel PW_F0299_F0337]  ; ymm6=REH*FIX(0.299)+GEH*FIX(0.337)

    ; Spill the even-pixel partial sums to the aligned scratch slots.
    vmovdqa     YMMWORD [wk(0)], ymm0   ; wk(0)=REL*FIX(0.299)+GEL*FIX(0.337)
    vmovdqa     YMMWORD [wk(1)], ymm6   ; wk(1)=REH*FIX(0.299)+GEH*FIX(0.337)

    vmovdqa     ymm0, ymm5              ; ymm0=BO
    vmovdqa     ymm6, ymm4              ; ymm6=BE

    vmovdqa     ymm4, ymm0
    vpunpcklwd  ymm0, ymm0, ymm3
    vpunpckhwd  ymm4, ymm4, ymm3
    vpmaddwd    ymm0, ymm0, [rel PW_F0114_F0250]  ; ymm0=BOL*FIX(0.114)+GOL*FIX(0.250)
    vpmaddwd    ymm4, ymm4, [rel PW_F0114_F0250]  ; ymm4=BOH*FIX(0.114)+GOH*FIX(0.250)

    vmovdqa     ymm3, [rel PD_ONEHALF]            ; ymm3=[PD_ONEHALF]

    vpaddd      ymm0, ymm0, ymm1
    vpaddd      ymm4, ymm4, ymm7
    vpaddd      ymm0, ymm0, ymm3
    vpaddd      ymm4, ymm4, ymm3
    vpsrld      ymm0, ymm0, SCALEBITS   ; ymm0=YOL
    vpsrld      ymm4, ymm4, SCALEBITS   ; ymm4=YOH
    vpackssdw   ymm0, ymm0, ymm4        ; ymm0=YO

    vmovdqa     ymm4, ymm6
    vpunpcklwd  ymm6, ymm6, ymm2
    vpunpckhwd  ymm4, ymm4, ymm2
    vpmaddwd    ymm6, ymm6, [rel PW_F0114_F0250]  ; ymm6=BEL*FIX(0.114)+GEL*FIX(0.250)
    vpmaddwd    ymm4, ymm4, [rel PW_F0114_F0250]  ; ymm4=BEH*FIX(0.114)+GEH*FIX(0.250)

    vmovdqa     ymm2, [rel PD_ONEHALF]            ; ymm2=[PD_ONEHALF]

    vpaddd      ymm6, ymm6, YMMWORD [wk(0)]
    vpaddd      ymm4, ymm4, YMMWORD [wk(1)]
    vpaddd      ymm6, ymm6, ymm2
    vpaddd      ymm4, ymm4, ymm2
    vpsrld      ymm6, ymm6, SCALEBITS   ; ymm6=YEL
    vpsrld      ymm4, ymm4, SCALEBITS   ; ymm4=YEH
    vpackssdw   ymm6, ymm6, ymm4        ; ymm6=YE

    ; Interleave: even-pixel Y in the low byte of each word, odd-pixel Y in
    ; the high byte.
    vpsllw      ymm0, ymm0, BYTE_BIT
    vpor        ymm6, ymm6, ymm0        ; ymm6=Y
    vmovdqu     YMMWORD [rdi], ymm6     ; Save Y

    sub         rcx, byte SIZEOF_YMMWORD
    add         rsi, RGB_PIXELSIZE*SIZEOF_YMMWORD  ; inptr
    add         rdi, byte SIZEOF_YMMWORD           ; outptr0
    cmp         rcx, byte SIZEOF_YMMWORD
    jae         near .columnloop        ; another full block remains
    test        rcx, rcx
    jnz         near .column_ld1        ; partial block remains

    pop         rcx                     ; col
    pop         rsi
    pop         rdi

    add         rsi, byte SIZEOF_JSAMPROW  ; input_buf
    add         rdi, byte SIZEOF_JSAMPROW
    dec         rax                        ; num_rows
    jg          near .rowloop

.return:
    pop         rbx
    vzeroupper
    uncollect_args 5
    mov         rsp, rbp                ; rsp <- aligned frame base
    pop         rsp                     ; rsp <- rsp saved in the prologue
                                        ;        (points at the saved rbp)
    pop         rbp
    ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
    align       32
439