;
; jccolext.asm - colorspace conversion (64-bit SSE2)
;
; Copyright (C) 2009, 2016, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler); it
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
15
16%include "jcolsamp.inc"
17
; --------------------------------------------------------------------------
;
; Convert some rows of samples to the output colorspace.
;
; GLOBAL(void)
; jsimd_rgb_ycc_convert_sse2(JDIMENSION img_width, JSAMPARRAY input_buf,
;                            JSAMPIMAGE output_buf, JDIMENSION output_row,
;                            int num_rows);
;
; For each of num_rows rows, one interleaved RGB row is read through
; input_buf[row] and one row is written to each of
; output_buf[0..2][output_row + row] (Y, Cb, Cr).
;
; Columns are handled SIZEOF_XMMWORD pixels per iteration:
;   * Input is read with unaligned loads (movdqu); a final partial group is
;     gathered piecewise so that no byte past the end of the row is read.
;   * Output is written with aligned full-register stores (movdqa), also for
;     a final partial group.  Every output row must therefore be aligned to
;     SIZEOF_XMMWORD and have room for img_width rounded up to a multiple of
;     SIZEOF_XMMWORD samples.  Samples stored past img_width are unspecified.
;
; The function returns immediately when img_width == 0 or num_rows <= 0.
;
; The argument registers below are set up by collect_args, which is defined
; outside this file; xmmA..xmmH are aliases for xmm0..xmm7 that are likewise
; defined outside this file.
;

; r10d = JDIMENSION img_width
; r11 = JSAMPARRAY input_buf
; r12 = JSAMPIMAGE output_buf
; r13d = JDIMENSION output_row
; r14d = int num_rows

; wk(i) addresses slot i of a WK_NUM-entry scratch array of xmmwords that
; lives directly below the aligned frame pointer (rbp).  WK_NUM is only
; expanded where wk() is used, so defining it after wk() is fine.
%define wk(i)   rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD  ; xmmword wk[WK_NUM]
%define WK_NUM  8

    align       32
    GLOBAL_FUNCTION(jsimd_rgb_ycc_convert_sse2)

EXTN(jsimd_rgb_ycc_convert_sse2):
    ; Build an XMMWORD-aligned frame.  The pre-alignment stack pointer is
    ; parked at [rbp] so the epilogue can undo the alignment with "pop rsp".
    push        rbp
    mov         rax, rsp                     ; rax = rsp after saving rbp
    sub         rsp, byte 4
    and         rsp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
    mov         [rsp], rax                   ; remember unaligned rsp
    mov         rbp, rsp                     ; rbp = aligned frame pointer
    lea         rsp, [wk(0)]                 ; reserve wk[0..WK_NUM-1]
    collect_args 5
    push        rbx

    mov         ecx, r10d               ; rcx = img_width (zero-extended)
    test        rcx, rcx
    jz          near .return            ; nothing to do for an empty row

    push        rcx                     ; rcx is reused for output_row below

    ; rdi/rbx/rdx = &output_buf[0..2][output_row]
    mov         rsi, r12
    mov         ecx, r13d
    mov         rdi, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY]
    mov         rbx, JSAMPARRAY [rsi+1*SIZEOF_JSAMPARRAY]
    mov         rdx, JSAMPARRAY [rsi+2*SIZEOF_JSAMPARRAY]
    lea         rdi, [rdi+rcx*SIZEOF_JSAMPROW]
    lea         rbx, [rbx+rcx*SIZEOF_JSAMPROW]
    lea         rdx, [rdx+rcx*SIZEOF_JSAMPROW]

    pop         rcx                     ; rcx = img_width again

    mov         rsi, r11                ; rsi = input_buf
    mov         eax, r14d               ; rax = num_rows (upper half zeroed)
    ; num_rows is a signed int, and the 32-bit mov above clears the upper
    ; half of rax, so the sign must be tested on eax.  Testing rax would let
    ; a negative count through as a huge positive row count.
    test        eax, eax
    jle         near .return
.rowloop:
    ; Save the per-row array cursors and the column count; the registers
    ; are reused for the row pointers and the running column counter.
    push        rdx
    push        rbx
    push        rdi
    push        rsi
    push        rcx                     ; col

    mov         rsi, JSAMPROW [rsi]     ; inptr
    mov         rdi, JSAMPROW [rdi]     ; outptr0
    mov         rbx, JSAMPROW [rbx]     ; outptr1
    mov         rdx, JSAMPROW [rdx]     ; outptr2

    cmp         rcx, byte SIZEOF_XMMWORD
    jae         near .columnloop        ; at least one full group available

%if RGB_PIXELSIZE == 3  ; ---------------

    ; Partial group (fewer than SIZEOF_XMMWORD pixels left).  rcx becomes
    ; the number of remaining input bytes; each set bit of that count
    ; selects one load of that size, taken from the end of the row toward
    ; the start, so only valid bytes are read.  rax and rdx are borrowed as
    ; scratch.  If the 1-byte step is skipped, rax still holds the row
    ; counter, so junk can enter xmmA -- but only in byte lanes above the
    ; last valid input byte.
.column_ld1:
    push        rax
    push        rdx
    lea         rcx, [rcx+rcx*2]        ; imul ecx,RGB_PIXELSIZE
    test        cl, SIZEOF_BYTE
    jz          short .column_ld2
    sub         rcx, byte SIZEOF_BYTE
    movzx       rax, byte [rsi+rcx]
.column_ld2:
    test        cl, SIZEOF_WORD
    jz          short .column_ld4
    sub         rcx, byte SIZEOF_WORD
    movzx       rdx, word [rsi+rcx]
    shl         rax, WORD_BIT
    or          rax, rdx
.column_ld4:
    movd        xmmA, eax
    pop         rdx
    pop         rax
    test        cl, SIZEOF_DWORD
    jz          short .column_ld8
    sub         rcx, byte SIZEOF_DWORD
    movd        xmmF, XMM_DWORD [rsi+rcx]
    pslldq      xmmA, SIZEOF_DWORD
    por         xmmA, xmmF
.column_ld8:
    test        cl, SIZEOF_MMWORD
    jz          short .column_ld16
    sub         rcx, byte SIZEOF_MMWORD
    movq        xmmB, XMM_MMWORD [rsi+rcx]
    pslldq      xmmA, SIZEOF_MMWORD
    por         xmmA, xmmB
.column_ld16:
    test        cl, SIZEOF_XMMWORD
    jz          short .column_ld32
    movdqa      xmmF, xmmA
    movdqu      xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    mov         rcx, SIZEOF_XMMWORD     ; makes this the last group of the row
    jmp         short .rgb_ycc_cnv
.column_ld32:
    test        cl, 2*SIZEOF_XMMWORD
    mov         rcx, SIZEOF_XMMWORD     ; mov leaves the flags from test intact
    jz          short .rgb_ycc_cnv
    movdqa      xmmB, xmmA
    movdqu      xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    movdqu      xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
    jmp         short .rgb_ycc_cnv

.columnloop:
    movdqu      xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    movdqu      xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
    movdqu      xmmB, XMMWORD [rsi+2*SIZEOF_XMMWORD]

.rgb_ycc_cnv:
    ; De-interleave the 3-byte pixels.  In the lane diagrams each entry is
    ; <component index><pixel index>.
    ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
    ; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
    ; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)

    movdqa      xmmG, xmmA
    pslldq      xmmA, 8     ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12)
    psrldq      xmmG, 8     ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --)

    punpckhbw   xmmA, xmmF  ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A)
    pslldq      xmmF, 8     ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27)

    punpcklbw   xmmG, xmmB  ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D)
    punpckhbw   xmmF, xmmB  ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F)

    movdqa      xmmD, xmmA
    pslldq      xmmA, 8     ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09)
    psrldq      xmmD, 8     ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --)

    punpckhbw   xmmA, xmmG  ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D)
    pslldq      xmmG, 8     ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B)

    punpcklbw   xmmD, xmmF  ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E)
    punpckhbw   xmmG, xmmF  ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F)

    movdqa      xmmE, xmmA
    pslldq      xmmA, 8     ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C)
    psrldq      xmmE, 8     ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --)

    punpckhbw   xmmA, xmmD  ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
    pslldq      xmmD, 8     ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D)

    punpcklbw   xmmE, xmmG  ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F)
    punpckhbw   xmmD, xmmG  ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F)

    ; Widen the bytes to words by interleaving with zero.
    pxor        xmmH, xmmH

    movdqa      xmmC, xmmA
    punpcklbw   xmmA, xmmH  ; xmmA=(00 02 04 06 08 0A 0C 0E)
    punpckhbw   xmmC, xmmH  ; xmmC=(10 12 14 16 18 1A 1C 1E)

    movdqa      xmmB, xmmE
    punpcklbw   xmmE, xmmH  ; xmmE=(20 22 24 26 28 2A 2C 2E)
    punpckhbw   xmmB, xmmH  ; xmmB=(01 03 05 07 09 0B 0D 0F)

    movdqa      xmmF, xmmD
    punpcklbw   xmmD, xmmH  ; xmmD=(11 13 15 17 19 1B 1D 1F)
    punpckhbw   xmmF, xmmH  ; xmmF=(21 23 25 27 29 2B 2D 2F)

%else  ; RGB_PIXELSIZE == 4 ; -----------

    ; Partial group (fewer than SIZEOF_XMMWORD pixels left).  rcx counts
    ; pixels; each set bit of the count selects one load of 1, 2, 4 or 8
    ; pixels, taken from the end of the row toward the start, so only valid
    ; pixels are read.  Lanes that are never loaded keep whatever the
    ; registers held before and only affect columns past the end of the row.
.column_ld1:
    test        cl, SIZEOF_XMMWORD/16
    jz          short .column_ld2
    sub         rcx, byte SIZEOF_XMMWORD/16
    movd        xmmA, XMM_DWORD [rsi+rcx*RGB_PIXELSIZE]
.column_ld2:
    test        cl, SIZEOF_XMMWORD/8
    jz          short .column_ld4
    sub         rcx, byte SIZEOF_XMMWORD/8
    movq        xmmE, XMM_MMWORD [rsi+rcx*RGB_PIXELSIZE]
    pslldq      xmmA, SIZEOF_MMWORD
    por         xmmA, xmmE
.column_ld4:
    test        cl, SIZEOF_XMMWORD/4
    jz          short .column_ld8
    sub         rcx, byte SIZEOF_XMMWORD/4
    movdqa      xmmE, xmmA
    movdqu      xmmA, XMMWORD [rsi+rcx*RGB_PIXELSIZE]
.column_ld8:
    test        cl, SIZEOF_XMMWORD/2
    mov         rcx, SIZEOF_XMMWORD     ; mov leaves the flags from test intact
    jz          short .rgb_ycc_cnv
    movdqa      xmmF, xmmA
    movdqa      xmmH, xmmE
    movdqu      xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    movdqu      xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
    jmp         short .rgb_ycc_cnv

.columnloop:
    movdqu      xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    movdqu      xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
    movdqu      xmmF, XMMWORD [rsi+2*SIZEOF_XMMWORD]
    movdqu      xmmH, XMMWORD [rsi+3*SIZEOF_XMMWORD]

.rgb_ycc_cnv:
    ; De-interleave the 4-byte pixels.  In the lane diagrams each entry is
    ; <component index><pixel index>.
    ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
    ; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
    ; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
    ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)

    movdqa      xmmD, xmmA
    punpcklbw   xmmA, xmmE      ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35)
    punpckhbw   xmmD, xmmE      ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37)

    movdqa      xmmC, xmmF
    punpcklbw   xmmF, xmmH      ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D)
    punpckhbw   xmmC, xmmH      ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F)

    movdqa      xmmB, xmmA
    punpcklwd   xmmA, xmmF      ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C)
    punpckhwd   xmmB, xmmF      ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D)

    movdqa      xmmG, xmmD
    punpcklwd   xmmD, xmmC      ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E)
    punpckhwd   xmmG, xmmC      ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F)

    movdqa      xmmE, xmmA
    punpcklbw   xmmA, xmmD      ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
    punpckhbw   xmmE, xmmD      ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E)

    movdqa      xmmH, xmmB
    punpcklbw   xmmB, xmmG      ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F)
    punpckhbw   xmmH, xmmG      ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F)

    ; Widen the bytes to words by interleaving with zero.
    pxor        xmmF, xmmF

    movdqa      xmmC, xmmA
    punpcklbw   xmmA, xmmF      ; xmmA=(00 02 04 06 08 0A 0C 0E)
    punpckhbw   xmmC, xmmF      ; xmmC=(10 12 14 16 18 1A 1C 1E)

    movdqa      xmmD, xmmB
    punpcklbw   xmmB, xmmF      ; xmmB=(01 03 05 07 09 0B 0D 0F)
    punpckhbw   xmmD, xmmF      ; xmmD=(11 13 15 17 19 1B 1D 1F)

    movdqa      xmmG, xmmE
    punpcklbw   xmmE, xmmF      ; xmmE=(20 22 24 26 28 2A 2C 2E)
    punpckhbw   xmmG, xmmF      ; xmmG=(30 32 34 36 38 3A 3C 3E)

    ; No spare zero register is left for xmmH: put its bytes in the high
    ; half of each word, then shift them down.  xmmF (zero) is consumed here.
    punpcklbw   xmmF, xmmH
    punpckhbw   xmmH, xmmH
    psrlw       xmmF, BYTE_BIT  ; xmmF=(21 23 25 27 29 2B 2D 2F)
    psrlw       xmmH, BYTE_BIT  ; xmmH=(31 33 35 37 39 3B 3D 3F)

%endif  ; RGB_PIXELSIZE ; ---------------

    ; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE
    ; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO

    ; (Original)
    ; Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
    ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
    ; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
    ;
    ; (This implementation)
    ; Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
    ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
    ; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
    ;
    ; The G term of Y is split so that every pmaddwd handles one pair of
    ; components: (R,G) products feed both Y and Cb, (B,G) products feed
    ; both Y and Cr.  Even and odd columns are processed separately as
    ; words and merged back into bytes just before each store.

    movdqa      XMMWORD [wk(0)], xmm0   ; wk(0)=RE
    movdqa      XMMWORD [wk(1)], xmm1   ; wk(1)=RO
    movdqa      XMMWORD [wk(2)], xmm4   ; wk(2)=BE
    movdqa      XMMWORD [wk(3)], xmm5   ; wk(3)=BO

    ; ---- odd columns: (R,G) products ----
    movdqa      xmm6, xmm1
    punpcklwd   xmm1, xmm3
    punpckhwd   xmm6, xmm3
    movdqa      xmm7, xmm1
    movdqa      xmm4, xmm6
    pmaddwd     xmm1, [rel PW_F0299_F0337]  ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337)
    pmaddwd     xmm6, [rel PW_F0299_F0337]  ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337)
    pmaddwd     xmm7, [rel PW_MF016_MF033]  ; xmm7=ROL*-FIX(0.168)+GOL*-FIX(0.331)
    pmaddwd     xmm4, [rel PW_MF016_MF033]  ; xmm4=ROH*-FIX(0.168)+GOH*-FIX(0.331)

    movdqa      XMMWORD [wk(4)], xmm1   ; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337)
    movdqa      XMMWORD [wk(5)], xmm6   ; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337)

    ; Interleaving zero (low word) with B (high word) gives B << 16 in each
    ; dword; shifting right by one leaves B << 15, i.e. B * 0.5 scaled by
    ; 2^16 (equal to B*FIX(0.500) provided SCALEBITS is 16 -- it is defined
    ; outside this file).
    pxor        xmm1, xmm1
    pxor        xmm6, xmm6
    punpcklwd   xmm1, xmm5              ; xmm1=BOL
    punpckhwd   xmm6, xmm5              ; xmm6=BOH
    psrld       xmm1, 1                 ; xmm1=BOL*FIX(0.500)
    psrld       xmm6, 1                 ; xmm6=BOH*FIX(0.500)

    movdqa      xmm5, [rel PD_ONEHALFM1_CJ]  ; xmm5=[PD_ONEHALFM1_CJ]

    paddd       xmm7, xmm1
    paddd       xmm4, xmm6
    paddd       xmm7, xmm5
    paddd       xmm4, xmm5
    psrld       xmm7, SCALEBITS         ; xmm7=CbOL
    psrld       xmm4, SCALEBITS         ; xmm4=CbOH
    packssdw    xmm7, xmm4              ; xmm7=CbO

    movdqa      xmm1, XMMWORD [wk(2)]   ; xmm1=BE

    ; ---- even columns: (R,G) products ----
    movdqa      xmm6, xmm0
    punpcklwd   xmm0, xmm2
    punpckhwd   xmm6, xmm2
    movdqa      xmm5, xmm0
    movdqa      xmm4, xmm6
    pmaddwd     xmm0, [rel PW_F0299_F0337]  ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337)
    pmaddwd     xmm6, [rel PW_F0299_F0337]  ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337)
    pmaddwd     xmm5, [rel PW_MF016_MF033]  ; xmm5=REL*-FIX(0.168)+GEL*-FIX(0.331)
    pmaddwd     xmm4, [rel PW_MF016_MF033]  ; xmm4=REH*-FIX(0.168)+GEH*-FIX(0.331)

    movdqa      XMMWORD [wk(6)], xmm0   ; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337)
    movdqa      XMMWORD [wk(7)], xmm6   ; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337)

    pxor        xmm0, xmm0
    pxor        xmm6, xmm6
    punpcklwd   xmm0, xmm1              ; xmm0=BEL
    punpckhwd   xmm6, xmm1              ; xmm6=BEH
    psrld       xmm0, 1                 ; xmm0=BEL*FIX(0.500)
    psrld       xmm6, 1                 ; xmm6=BEH*FIX(0.500)

    movdqa      xmm1, [rel PD_ONEHALFM1_CJ]  ; xmm1=[PD_ONEHALFM1_CJ]

    paddd       xmm5, xmm0
    paddd       xmm4, xmm6
    paddd       xmm5, xmm1
    paddd       xmm4, xmm1
    psrld       xmm5, SCALEBITS         ; xmm5=CbEL
    psrld       xmm4, SCALEBITS         ; xmm4=CbEH
    packssdw    xmm5, xmm4              ; xmm5=CbE

    ; Odd-column results move to the high byte of each word, even-column
    ; results stay in the low byte: the OR restores column order as bytes.
    psllw       xmm7, BYTE_BIT
    por         xmm5, xmm7              ; xmm5=Cb
    movdqa      XMMWORD [rbx], xmm5     ; Save Cb

    movdqa      xmm0, XMMWORD [wk(3)]   ; xmm0=BO
    movdqa      xmm6, XMMWORD [wk(2)]   ; xmm6=BE
    movdqa      xmm1, XMMWORD [wk(1)]   ; xmm1=RO

    ; ---- odd columns: (B,G) products ----
    movdqa      xmm4, xmm0
    punpcklwd   xmm0, xmm3
    punpckhwd   xmm4, xmm3
    movdqa      xmm7, xmm0
    movdqa      xmm5, xmm4
    pmaddwd     xmm0, [rel PW_F0114_F0250]  ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250)
    pmaddwd     xmm4, [rel PW_F0114_F0250]  ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250)
    pmaddwd     xmm7, [rel PW_MF008_MF041]  ; xmm7=BOL*-FIX(0.081)+GOL*-FIX(0.418)
    pmaddwd     xmm5, [rel PW_MF008_MF041]  ; xmm5=BOH*-FIX(0.081)+GOH*-FIX(0.418)

    movdqa      xmm3, [rel PD_ONEHALF]      ; xmm3=[PD_ONEHALF]

    paddd       xmm0, XMMWORD [wk(4)]
    paddd       xmm4, XMMWORD [wk(5)]
    paddd       xmm0, xmm3
    paddd       xmm4, xmm3
    psrld       xmm0, SCALEBITS         ; xmm0=YOL
    psrld       xmm4, SCALEBITS         ; xmm4=YOH
    packssdw    xmm0, xmm4              ; xmm0=YO

    pxor        xmm3, xmm3
    pxor        xmm4, xmm4
    punpcklwd   xmm3, xmm1              ; xmm3=ROL
    punpckhwd   xmm4, xmm1              ; xmm4=ROH
    psrld       xmm3, 1                 ; xmm3=ROL*FIX(0.500)
    psrld       xmm4, 1                 ; xmm4=ROH*FIX(0.500)

    movdqa      xmm1, [rel PD_ONEHALFM1_CJ]  ; xmm1=[PD_ONEHALFM1_CJ]

    paddd       xmm7, xmm3
    paddd       xmm5, xmm4
    paddd       xmm7, xmm1
    paddd       xmm5, xmm1
    psrld       xmm7, SCALEBITS         ; xmm7=CrOL
    psrld       xmm5, SCALEBITS         ; xmm5=CrOH
    packssdw    xmm7, xmm5              ; xmm7=CrO

    movdqa      xmm3, XMMWORD [wk(0)]   ; xmm3=RE

    ; ---- even columns: (B,G) products ----
    movdqa      xmm4, xmm6
    punpcklwd   xmm6, xmm2
    punpckhwd   xmm4, xmm2
    movdqa      xmm1, xmm6
    movdqa      xmm5, xmm4
    pmaddwd     xmm6, [rel PW_F0114_F0250]  ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250)
    pmaddwd     xmm4, [rel PW_F0114_F0250]  ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250)
    pmaddwd     xmm1, [rel PW_MF008_MF041]  ; xmm1=BEL*-FIX(0.081)+GEL*-FIX(0.418)
    pmaddwd     xmm5, [rel PW_MF008_MF041]  ; xmm5=BEH*-FIX(0.081)+GEH*-FIX(0.418)

    movdqa      xmm2, [rel PD_ONEHALF]      ; xmm2=[PD_ONEHALF]

    paddd       xmm6, XMMWORD [wk(6)]
    paddd       xmm4, XMMWORD [wk(7)]
    paddd       xmm6, xmm2
    paddd       xmm4, xmm2
    psrld       xmm6, SCALEBITS         ; xmm6=YEL
    psrld       xmm4, SCALEBITS         ; xmm4=YEH
    packssdw    xmm6, xmm4              ; xmm6=YE

    psllw       xmm0, BYTE_BIT
    por         xmm6, xmm0              ; xmm6=Y
    movdqa      XMMWORD [rdi], xmm6     ; Save Y

    pxor        xmm2, xmm2
    pxor        xmm4, xmm4
    punpcklwd   xmm2, xmm3              ; xmm2=REL
    punpckhwd   xmm4, xmm3              ; xmm4=REH
    psrld       xmm2, 1                 ; xmm2=REL*FIX(0.500)
    psrld       xmm4, 1                 ; xmm4=REH*FIX(0.500)

    movdqa      xmm0, [rel PD_ONEHALFM1_CJ]  ; xmm0=[PD_ONEHALFM1_CJ]

    paddd       xmm1, xmm2
    paddd       xmm5, xmm4
    paddd       xmm1, xmm0
    paddd       xmm5, xmm0
    psrld       xmm1, SCALEBITS         ; xmm1=CrEL
    psrld       xmm5, SCALEBITS         ; xmm5=CrEH
    packssdw    xmm1, xmm5              ; xmm1=CrE

    psllw       xmm7, BYTE_BIT
    por         xmm1, xmm7              ; xmm1=Cr
    movdqa      XMMWORD [rdx], xmm1     ; Save Cr

    ; Advance to the next group.  After a partial group rcx was forced to
    ; SIZEOF_XMMWORD, so the subtraction leaves zero and the row ends.
    sub         rcx, byte SIZEOF_XMMWORD
    add         rsi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; inptr
    add         rdi, byte SIZEOF_XMMWORD                ; outptr0
    add         rbx, byte SIZEOF_XMMWORD                ; outptr1
    add         rdx, byte SIZEOF_XMMWORD                ; outptr2
    cmp         rcx, byte SIZEOF_XMMWORD
    jae         near .columnloop
    test        rcx, rcx
    jnz         near .column_ld1        ; 1..SIZEOF_XMMWORD-1 pixels remain

    ; Restore the array cursors saved at the top of the row.
    pop         rcx                     ; col
    pop         rsi
    pop         rdi
    pop         rbx
    pop         rdx

    add         rsi, byte SIZEOF_JSAMPROW  ; input_buf
    add         rdi, byte SIZEOF_JSAMPROW
    add         rbx, byte SIZEOF_JSAMPROW
    add         rdx, byte SIZEOF_JSAMPROW
    dec         rax                        ; num_rows
    jg          near .rowloop

.return:
    pop         rbx
    uncollect_args 5
    mov         rsp, rbp                ; rsp <- aligned rbp
    pop         rsp                     ; rsp <- unaligned rsp saved at [rbp]
    pop         rbp
    ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
    align       32
484