• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1;
2; jccolext.asm - colorspace conversion (MMX)
3;
4; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
5; Copyright (C) 2016, D. R. Commander.
6;
7; Based on the x86 SIMD extension for IJG JPEG library
8; Copyright (C) 1999-2006, MIYASAKA Masaru.
9; For conditions of distribution and use, see copyright notice in jsimdext.inc
10;
11; This file should be assembled with NASM (Netwide Assembler); it
12; can *not* be assembled with Microsoft's MASM or any compatible
13; assembler (including Borland's Turbo Assembler).
14; NASM is available from http://nasm.sourceforge.net/ or
15; http://sourceforge.net/project/showfiles.php?group_id=6208
16
17%include "jcolsamp.inc"
18
19; --------------------------------------------------------------------------
20;
21; Convert some rows of samples to the output colorspace.
22;
23; GLOBAL(void)
24; jsimd_rgb_ycc_convert_mmx(JDIMENSION img_width, JSAMPARRAY input_buf,
25;                           JSAMPIMAGE output_buf, JDIMENSION output_row,
26;                           int num_rows);
27;
28
; Stack-argument accessors.  (b) is a register holding the stack pointer as
; it is right after the prologue's "push ebp" (the prologue copies it into
; eax), so the first argument is addressed as (b) + 8 and each following
; argument is 4 bytes further up.
29%define img_width(b)   (b) + 8          ; JDIMENSION img_width
30%define input_buf(b)   (b) + 12         ; JSAMPARRAY input_buf
31%define output_buf(b)  (b) + 16         ; JSAMPIMAGE output_buf
32%define output_row(b)  (b) + 20         ; JDIMENSION output_row
33%define num_rows(b)    (b) + 24         ; int num_rows
34
; Aligned-frame accessors, all relative to the realigned ebp that the
; prologue sets up:
;   original_ebp : the slot at [ebp+0]; the prologue stores the pre-alignment
;                  stack pointer there and the epilogue pops it back into esp
;   wk(i)        : i-th MMWORD scratch slot; wk(0) is the lowest address and
;                  wk(WK_NUM-1) ends exactly at ebp
;   gotptr       : pointer-sized slot immediately below wk(0)
; wk() mentions WK_NUM before its %define; this works because NASM expands
; single-line macros where they are used, not where they are defined.
35%define original_ebp   ebp + 0
36%define wk(i)          ebp - (WK_NUM - (i)) * SIZEOF_MMWORD
37                                        ; mmword wk[WK_NUM]
38%define WK_NUM         8
39%define gotptr         wk(0) - SIZEOF_POINTER  ; void * gotptr
40
; --------------------------------------------------------------------------
; jsimd_rgb_ycc_convert_mmx -- 32-bit x86 code; arguments are read from the
; stack through the img_width()/input_buf()/... accessors.
;
; Register roles once setup is complete (as established by the code below):
;   eax = remaining row count between rows; inside a row it is saved with
;         pushpic and reloaded from gotptr for the GOTOFF() constant operands
;   ecx = number of columns (pixels) still to convert in the current row
;   esi = cursor into the input row-pointer array / current input pointer
;   edi, ebx, edx = cursors into the row-pointer arrays of output components
;         0, 1 and 2 / current output pointers (receive Y, Cb and Cr)
;   wk(0)..wk(7) = MMWORD scratch slots in the aligned stack frame
;
; Each pass through .columnloop consumes RGB_PIXELSIZE*SIZEOF_MMWORD input
; bytes and stores one MMWORD to each of the three output rows.  When fewer
; than SIZEOF_MMWORD columns remain, .column_ld1 gathers the leftover input
; piecewise, forces ecx to SIZEOF_MMWORD and runs one final conversion pass,
; so a full MMWORD is still stored to every output row.
; NOTE(review): that presumably relies on the output rows being padded to a
; multiple of SIZEOF_MMWORD samples -- confirm against the caller.
;
; pushpic/poppic/movpic/get_GOT/GOTOFF/alignx and the mmA..mmH register
; names are not defined in this part of the file (presumably they come from
; the included headers -- verify there).
; --------------------------------------------------------------------------
41    align       32
42    GLOBAL_FUNCTION(jsimd_rgb_ycc_convert_mmx)
43
44EXTN(jsimd_rgb_ycc_convert_mmx):
; Prologue: build an MMWORD-aligned frame and remember how to undo it.
45    push        ebp
46    mov         eax, esp                    ; eax = original ebp
47    sub         esp, byte 4                 ; room for the value stored at [esp] below
48    and         esp, byte (-SIZEOF_MMWORD)  ; align to 64 bits
49    mov         [esp], eax                  ; kept at [ebp+0]; the epilogue pops it into esp
50    mov         ebp, esp                    ; ebp = aligned ebp
51    lea         esp, [wk(0)]                ; esp = just below the wk[] scratch slots
52    pushpic     eax                     ; make a room for GOT address
53    push        ebx
54;   push        ecx                     ; need not be preserved
55;   push        edx                     ; need not be preserved
56    push        esi
57    push        edi
58
59    get_GOT     ebx                     ; get GOT address
60    movpic      POINTER [gotptr], ebx   ; save GOT address
61
62    mov         ecx, JDIMENSION [img_width(eax)]  ; num_cols
63    test        ecx, ecx
64    jz          near .return            ; zero width: nothing to do (no MMX used yet)
65
66    push        ecx                     ; park num_cols while ecx holds output_row
67
; edi/ebx/edx = address of the output_row-th row pointer of components 0/1/2.
68    mov         esi, JSAMPIMAGE [output_buf(eax)]
69    mov         ecx, JDIMENSION [output_row(eax)]
70    mov         edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY]
71    mov         ebx, JSAMPARRAY [esi+1*SIZEOF_JSAMPARRAY]
72    mov         edx, JSAMPARRAY [esi+2*SIZEOF_JSAMPARRAY]
73    lea         edi, [edi+ecx*SIZEOF_JSAMPROW]
74    lea         ebx, [ebx+ecx*SIZEOF_JSAMPROW]
75    lea         edx, [edx+ecx*SIZEOF_JSAMPROW]
76
77    pop         ecx                     ; ecx = num_cols again
78
79    mov         esi, JSAMPARRAY [input_buf(eax)]
80    mov         eax, INT [num_rows(eax)]    ; last use of eax as the argument base
81    test        eax, eax
82    jle         near .return            ; signed test: skip when num_rows <= 0
83    alignx      16, 7
84.rowloop:
; Save the per-row state; the registers are reused as data pointers below.
85    pushpic     eax                     ; row counter; eax is reloaded from gotptr below
86    push        edx
87    push        ebx
88    push        edi
89    push        esi
90    push        ecx                     ; col
91
92    mov         esi, JSAMPROW [esi]     ; inptr
93    mov         edi, JSAMPROW [edi]     ; outptr0
94    mov         ebx, JSAMPROW [ebx]     ; outptr1
95    mov         edx, JSAMPROW [edx]     ; outptr2
96    movpic      eax, POINTER [gotptr]   ; load GOT address (eax)
97
98    cmp         ecx, byte SIZEOF_MMWORD
99    jae         short .columnloop       ; a full group of columns is available
100    alignx      16, 7
101
102%if RGB_PIXELSIZE == 3  ; ---------------
103
; Partial load for 3-byte pixels.  ecx becomes the leftover byte count and is
; consumed from the top down, one set bit of the count at a time, so the
; pieces end up in memory order in mmA (then mmG, then mmF).
104.column_ld1:
105    push        eax                     ; eax and edx serve as scratch while
106    push        edx                     ; the trailing bytes are assembled
107    lea         ecx, [ecx+ecx*2]        ; imul ecx,RGB_PIXELSIZE
108    test        cl, SIZEOF_BYTE
109    jz          short .column_ld2
110    sub         ecx, byte SIZEOF_BYTE
111    xor         eax, eax
112    mov         al, byte [esi+ecx]      ; eax = last leftover byte, zero-extended
113.column_ld2:
114    test        cl, SIZEOF_WORD
115    jz          short .column_ld4
116    sub         ecx, byte SIZEOF_WORD
117    xor         edx, edx
118    mov         dx, word [esi+ecx]      ; edx = next-lower leftover word
119    shl         eax, WORD_BIT           ; move what eax holds above the new word
120    or          eax, edx
121.column_ld4:
122    movd        mmA, eax                ; seed mmA with the assembled dword
123    pop         edx                     ; restore outptr2
124    pop         eax                     ; restore the value pushed at .column_ld1
125    test        cl, SIZEOF_DWORD
126    jz          short .column_ld8
127    sub         ecx, byte SIZEOF_DWORD
128    movd        mmG, dword [esi+ecx]    ; next-lower leftover dword
129    psllq       mmA, DWORD_BIT          ; earlier (higher-address) data moves up
130    por         mmA, mmG
131.column_ld8:
132    test        cl, SIZEOF_MMWORD
133    jz          short .column_ld16
134    movq        mmG, mmA                ; leftover data becomes the 2nd MMWORD
135    movq        mmA, MMWORD [esi+0*SIZEOF_MMWORD]
136    mov         ecx, SIZEOF_MMWORD      ; exactly one conversion pass remains
137    jmp         short .rgb_ycc_cnv
138.column_ld16:
139    test        cl, 2*SIZEOF_MMWORD
140    mov         ecx, SIZEOF_MMWORD      ; mov leaves the flags set by test intact
141    jz          short .rgb_ycc_cnv
142    movq        mmF, mmA                ; leftover data becomes the 3rd MMWORD
143    movq        mmA, MMWORD [esi+0*SIZEOF_MMWORD]
144    movq        mmG, MMWORD [esi+1*SIZEOF_MMWORD]
145    jmp         short .rgb_ycc_cnv
146    alignx      16, 7
147
; Full group: three MMWORDs of packed 3-byte pixels.
148.columnloop:
149    movq        mmA, MMWORD [esi+0*SIZEOF_MMWORD]
150    movq        mmG, MMWORD [esi+1*SIZEOF_MMWORD]
151    movq        mmF, MMWORD [esi+2*SIZEOF_MMWORD]
152
; De-interleave the packed pixels into per-component even/odd word vectors.
; In the layout comments below each two-digit entry is <component><pixel>.
153.rgb_ycc_cnv:
154    ; mmA=(00 10 20 01 11 21 02 12)
155    ; mmG=(22 03 13 23 04 14 24 05)
156    ; mmF=(15 25 06 16 26 07 17 27)
157
158    movq        mmD, mmA
159    psllq       mmA, 4*BYTE_BIT         ; mmA=(-- -- -- -- 00 10 20 01)
160    psrlq       mmD, 4*BYTE_BIT         ; mmD=(11 21 02 12 -- -- -- --)
161
162    punpckhbw   mmA, mmG                ; mmA=(00 04 10 14 20 24 01 05)
163    psllq       mmG, 4*BYTE_BIT         ; mmG=(-- -- -- -- 22 03 13 23)
164
165    punpcklbw   mmD, mmF                ; mmD=(11 15 21 25 02 06 12 16)
166    punpckhbw   mmG, mmF                ; mmG=(22 26 03 07 13 17 23 27)
167
168    movq        mmE, mmA
169    psllq       mmA, 4*BYTE_BIT         ; mmA=(-- -- -- -- 00 04 10 14)
170    psrlq       mmE, 4*BYTE_BIT         ; mmE=(20 24 01 05 -- -- -- --)
171
172    punpckhbw   mmA, mmD                ; mmA=(00 02 04 06 10 12 14 16)
173    psllq       mmD, 4*BYTE_BIT         ; mmD=(-- -- -- -- 11 15 21 25)
174
175    punpcklbw   mmE, mmG                ; mmE=(20 22 24 26 01 03 05 07)
176    punpckhbw   mmD, mmG                ; mmD=(11 13 15 17 21 23 25 27)
177
178    pxor        mmH, mmH                ; zero: unpacking against it widens bytes to words
179
180    movq        mmC, mmA
181    punpcklbw   mmA, mmH                ; mmA=(00 02 04 06)
182    punpckhbw   mmC, mmH                ; mmC=(10 12 14 16)
183
184    movq        mmB, mmE
185    punpcklbw   mmE, mmH                ; mmE=(20 22 24 26)
186    punpckhbw   mmB, mmH                ; mmB=(01 03 05 07)
187
188    movq        mmF, mmD
189    punpcklbw   mmD, mmH                ; mmD=(11 13 15 17)
190    punpckhbw   mmF, mmH                ; mmF=(21 23 25 27)
191
192%else  ; RGB_PIXELSIZE == 4 ; -----------
193
; Partial load for 4-byte pixels.  Here ecx stays a pixel count and is
; consumed from the top down; each piece is addressed as esi+ecx*RGB_PIXELSIZE.
194.column_ld1:
195    test        cl, SIZEOF_MMWORD/8
196    jz          short .column_ld2
197    sub         ecx, byte SIZEOF_MMWORD/8
198    movd        mmA, dword [esi+ecx*RGB_PIXELSIZE]  ; last leftover dword
199.column_ld2:
200    test        cl, SIZEOF_MMWORD/4
201    jz          short .column_ld4
202    sub         ecx, byte SIZEOF_MMWORD/4
203    movq        mmF, mmA                ; earlier piece becomes the 2nd MMWORD
204    movq        mmA, MMWORD [esi+ecx*RGB_PIXELSIZE]
205.column_ld4:
206    test        cl, SIZEOF_MMWORD/2
207    mov         ecx, SIZEOF_MMWORD      ; mov leaves the flags set by test intact
208    jz          short .rgb_ycc_cnv
209    movq        mmD, mmA                ; earlier pieces become the 3rd and
210    movq        mmC, mmF                ; 4th MMWORDs
211    movq        mmA, MMWORD [esi+0*SIZEOF_MMWORD]
212    movq        mmF, MMWORD [esi+1*SIZEOF_MMWORD]
213    jmp         short .rgb_ycc_cnv
214    alignx      16, 7
215
; Full group: four MMWORDs of packed 4-byte pixels.
216.columnloop:
217    movq        mmA, MMWORD [esi+0*SIZEOF_MMWORD]
218    movq        mmF, MMWORD [esi+1*SIZEOF_MMWORD]
219    movq        mmD, MMWORD [esi+2*SIZEOF_MMWORD]
220    movq        mmC, MMWORD [esi+3*SIZEOF_MMWORD]
221
; De-interleave the packed pixels into per-component even/odd word vectors.
; In the layout comments below each two-digit entry is <component><pixel>.
222.rgb_ycc_cnv:
223    ; mmA=(00 10 20 30 01 11 21 31)
224    ; mmF=(02 12 22 32 03 13 23 33)
225    ; mmD=(04 14 24 34 05 15 25 35)
226    ; mmC=(06 16 26 36 07 17 27 37)
227
228    movq        mmB, mmA
229    punpcklbw   mmA, mmF                ; mmA=(00 02 10 12 20 22 30 32)
230    punpckhbw   mmB, mmF                ; mmB=(01 03 11 13 21 23 31 33)
231
232    movq        mmG, mmD
233    punpcklbw   mmD, mmC                ; mmD=(04 06 14 16 24 26 34 36)
234    punpckhbw   mmG, mmC                ; mmG=(05 07 15 17 25 27 35 37)
235
236    movq        mmE, mmA
237    punpcklwd   mmA, mmD                ; mmA=(00 02 04 06 10 12 14 16)
238    punpckhwd   mmE, mmD                ; mmE=(20 22 24 26 30 32 34 36)
239
240    movq        mmH, mmB
241    punpcklwd   mmB, mmG                ; mmB=(01 03 05 07 11 13 15 17)
242    punpckhwd   mmH, mmG                ; mmH=(21 23 25 27 31 33 35 37)
243
244    pxor        mmF, mmF                ; zero: unpacking against it widens bytes to words
245
246    movq        mmC, mmA
247    punpcklbw   mmA, mmF                ; mmA=(00 02 04 06)
248    punpckhbw   mmC, mmF                ; mmC=(10 12 14 16)
249
250    movq        mmD, mmB
251    punpcklbw   mmB, mmF                ; mmB=(01 03 05 07)
252    punpckhbw   mmD, mmF                ; mmD=(11 13 15 17)
253
254    movq        mmG, mmE
255    punpcklbw   mmE, mmF                ; mmE=(20 22 24 26)
256    punpckhbw   mmG, mmF                ; mmG=(30 32 34 36)
257
; mmF is the zero register itself here, so the bytes are placed in the high
; byte of each word and then shifted down instead of unpacking against zero.
258    punpcklbw   mmF, mmH
259    punpckhbw   mmH, mmH
260    psrlw       mmF, BYTE_BIT           ; mmF=(21 23 25 27)
261    psrlw       mmH, BYTE_BIT           ; mmH=(31 33 35 37)
262
263%endif  ; RGB_PIXELSIZE ; ---------------
264
265    ; mm0=(R0 R2 R4 R6)=RE, mm2=(G0 G2 G4 G6)=GE, mm4=(B0 B2 B4 B6)=BE
266    ; mm1=(R1 R3 R5 R7)=RO, mm3=(G1 G3 G5 G7)=GO, mm5=(B1 B3 B5 B7)=BO
267
268    ; (Original)
269    ; Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
270    ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
271    ; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
272    ;
273    ; (This implementation)
274    ; Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
275    ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
276    ; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
277
; Spill R and B: all eight MMX registers are needed for the products below.
278    movq        MMWORD [wk(0)], mm0     ; wk(0)=RE
279    movq        MMWORD [wk(1)], mm1     ; wk(1)=RO
280    movq        MMWORD [wk(2)], mm4     ; wk(2)=BE
281    movq        MMWORD [wk(3)], mm5     ; wk(3)=BO
282
; Odd pixels: pair each R word with its G word so one pmaddwd yields R*c1+G*c2.
283    movq        mm6, mm1
284    punpcklwd   mm1, mm3                ; (RO,GO) word pairs, low half
285    punpckhwd   mm6, mm3                ; (RO,GO) word pairs, high half
286    movq        mm7, mm1
287    movq        mm4, mm6
288    pmaddwd     mm1, [GOTOFF(eax,PW_F0299_F0337)]  ; mm1=ROL*FIX(0.299)+GOL*FIX(0.337)
289    pmaddwd     mm6, [GOTOFF(eax,PW_F0299_F0337)]  ; mm6=ROH*FIX(0.299)+GOH*FIX(0.337)
290    pmaddwd     mm7, [GOTOFF(eax,PW_MF016_MF033)]  ; mm7=ROL*-FIX(0.168)+GOL*-FIX(0.331)
291    pmaddwd     mm4, [GOTOFF(eax,PW_MF016_MF033)]  ; mm4=ROH*-FIX(0.168)+GOH*-FIX(0.331)
292
293    movq        MMWORD [wk(4)], mm1     ; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337)
294    movq        MMWORD [wk(5)], mm6     ; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337)
295
; Unpacking B behind zero words puts it in the upper word of each dword;
; the shift right by one then halves that, with no multiply needed.
296    pxor        mm1, mm1
297    pxor        mm6, mm6
298    punpcklwd   mm1, mm5                ; mm1=BOL
299    punpckhwd   mm6, mm5                ; mm6=BOH
300    psrld       mm1, 1                  ; mm1=BOL*FIX(0.500)
301    psrld       mm6, 1                  ; mm6=BOH*FIX(0.500)
302
303    movq        mm5, [GOTOFF(eax,PD_ONEHALFM1_CJ)]  ; mm5=[PD_ONEHALFM1_CJ]
304
305    paddd       mm7, mm1
306    paddd       mm4, mm6
307    paddd       mm7, mm5
308    paddd       mm4, mm5
309    psrld       mm7, SCALEBITS          ; mm7=CbOL
310    psrld       mm4, SCALEBITS          ; mm4=CbOH
311    packssdw    mm7, mm4                ; mm7=CbO
312
313    movq        mm1, MMWORD [wk(2)]     ; mm1=BE
314
; Even pixels: same steps as above, using RE/GE/BE.
315    movq        mm6, mm0
316    punpcklwd   mm0, mm2                ; (RE,GE) word pairs, low half
317    punpckhwd   mm6, mm2                ; (RE,GE) word pairs, high half
318    movq        mm5, mm0
319    movq        mm4, mm6
320    pmaddwd     mm0, [GOTOFF(eax,PW_F0299_F0337)]  ; mm0=REL*FIX(0.299)+GEL*FIX(0.337)
321    pmaddwd     mm6, [GOTOFF(eax,PW_F0299_F0337)]  ; mm6=REH*FIX(0.299)+GEH*FIX(0.337)
322    pmaddwd     mm5, [GOTOFF(eax,PW_MF016_MF033)]  ; mm5=REL*-FIX(0.168)+GEL*-FIX(0.331)
323    pmaddwd     mm4, [GOTOFF(eax,PW_MF016_MF033)]  ; mm4=REH*-FIX(0.168)+GEH*-FIX(0.331)
324
325    movq        MMWORD [wk(6)], mm0     ; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337)
326    movq        MMWORD [wk(7)], mm6     ; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337)
327
328    pxor        mm0, mm0
329    pxor        mm6, mm6
330    punpcklwd   mm0, mm1                ; mm0=BEL
331    punpckhwd   mm6, mm1                ; mm6=BEH
332    psrld       mm0, 1                  ; mm0=BEL*FIX(0.500)
333    psrld       mm6, 1                  ; mm6=BEH*FIX(0.500)
334
335    movq        mm1, [GOTOFF(eax,PD_ONEHALFM1_CJ)]  ; mm1=[PD_ONEHALFM1_CJ]
336
337    paddd       mm5, mm0
338    paddd       mm4, mm6
339    paddd       mm5, mm1
340    paddd       mm4, mm1
341    psrld       mm5, SCALEBITS          ; mm5=CbEL
342    psrld       mm4, SCALEBITS          ; mm4=CbEH
343    packssdw    mm5, mm4                ; mm5=CbE
344
; Re-interleave: even results stay in the low byte of each word, odd results
; are moved to the high byte, giving bytes in pixel order.
345    psllw       mm7, BYTE_BIT
346    por         mm5, mm7                ; mm5=Cb
347    movq        MMWORD [ebx], mm5       ; Save Cb
348
349    movq        mm0, MMWORD [wk(3)]     ; mm0=BO
350    movq        mm6, MMWORD [wk(2)]     ; mm6=BE
351    movq        mm1, MMWORD [wk(1)]     ; mm1=RO
352
; Odd pixels: the B/G products complete Y (added to wk(4)/wk(5)) and start Cr.
353    movq        mm4, mm0
354    punpcklwd   mm0, mm3                ; (BO,GO) word pairs, low half
355    punpckhwd   mm4, mm3                ; (BO,GO) word pairs, high half
356    movq        mm7, mm0
357    movq        mm5, mm4
358    pmaddwd     mm0, [GOTOFF(eax,PW_F0114_F0250)]  ; mm0=BOL*FIX(0.114)+GOL*FIX(0.250)
359    pmaddwd     mm4, [GOTOFF(eax,PW_F0114_F0250)]  ; mm4=BOH*FIX(0.114)+GOH*FIX(0.250)
360    pmaddwd     mm7, [GOTOFF(eax,PW_MF008_MF041)]  ; mm7=BOL*-FIX(0.081)+GOL*-FIX(0.418)
361    pmaddwd     mm5, [GOTOFF(eax,PW_MF008_MF041)]  ; mm5=BOH*-FIX(0.081)+GOH*-FIX(0.418)
362
363    movq        mm3, [GOTOFF(eax,PD_ONEHALF)]  ; mm3=[PD_ONEHALF]
364
365    paddd       mm0, MMWORD [wk(4)]     ; add the R/G part of Y computed earlier
366    paddd       mm4, MMWORD [wk(5)]
367    paddd       mm0, mm3
368    paddd       mm4, mm3
369    psrld       mm0, SCALEBITS          ; mm0=YOL
370    psrld       mm4, SCALEBITS          ; mm4=YOH
371    packssdw    mm0, mm4                ; mm0=YO
372
373    pxor        mm3, mm3
374    pxor        mm4, mm4
375    punpcklwd   mm3, mm1                ; mm3=ROL
376    punpckhwd   mm4, mm1                ; mm4=ROH
377    psrld       mm3, 1                  ; mm3=ROL*FIX(0.500)
378    psrld       mm4, 1                  ; mm4=ROH*FIX(0.500)
379
380    movq        mm1, [GOTOFF(eax,PD_ONEHALFM1_CJ)]  ; mm1=[PD_ONEHALFM1_CJ]
381
382    paddd       mm7, mm3
383    paddd       mm5, mm4
384    paddd       mm7, mm1
385    paddd       mm5, mm1
386    psrld       mm7, SCALEBITS          ; mm7=CrOL
387    psrld       mm5, SCALEBITS          ; mm5=CrOH
388    packssdw    mm7, mm5                ; mm7=CrO
389
390    movq        mm3, MMWORD [wk(0)]     ; mm3=RE
391
; Even pixels: same steps, completing Y from wk(6)/wk(7) and starting Cr.
392    movq        mm4, mm6
393    punpcklwd   mm6, mm2                ; (BE,GE) word pairs, low half
394    punpckhwd   mm4, mm2                ; (BE,GE) word pairs, high half
395    movq        mm1, mm6
396    movq        mm5, mm4
397    pmaddwd     mm6, [GOTOFF(eax,PW_F0114_F0250)]  ; mm6=BEL*FIX(0.114)+GEL*FIX(0.250)
398    pmaddwd     mm4, [GOTOFF(eax,PW_F0114_F0250)]  ; mm4=BEH*FIX(0.114)+GEH*FIX(0.250)
399    pmaddwd     mm1, [GOTOFF(eax,PW_MF008_MF041)]  ; mm1=BEL*-FIX(0.081)+GEL*-FIX(0.418)
400    pmaddwd     mm5, [GOTOFF(eax,PW_MF008_MF041)]  ; mm5=BEH*-FIX(0.081)+GEH*-FIX(0.418)
401
402    movq        mm2, [GOTOFF(eax,PD_ONEHALF)]      ; mm2=[PD_ONEHALF]
403
404    paddd       mm6, MMWORD [wk(6)]     ; add the R/G part of Y computed earlier
405    paddd       mm4, MMWORD [wk(7)]
406    paddd       mm6, mm2
407    paddd       mm4, mm2
408    psrld       mm6, SCALEBITS          ; mm6=YEL
409    psrld       mm4, SCALEBITS          ; mm4=YEH
410    packssdw    mm6, mm4                ; mm6=YE
411
412    psllw       mm0, BYTE_BIT           ; odd Y results to the high byte of each word
413    por         mm6, mm0                ; mm6=Y
414    movq        MMWORD [edi], mm6       ; Save Y
415
416    pxor        mm2, mm2
417    pxor        mm4, mm4
418    punpcklwd   mm2, mm3                ; mm2=REL
419    punpckhwd   mm4, mm3                ; mm4=REH
420    psrld       mm2, 1                  ; mm2=REL*FIX(0.500)
421    psrld       mm4, 1                  ; mm4=REH*FIX(0.500)
422
423    movq        mm0, [GOTOFF(eax,PD_ONEHALFM1_CJ)]  ; mm0=[PD_ONEHALFM1_CJ]
424
425    paddd       mm1, mm2
426    paddd       mm5, mm4
427    paddd       mm1, mm0
428    paddd       mm5, mm0
429    psrld       mm1, SCALEBITS          ; mm1=CrEL
430    psrld       mm5, SCALEBITS          ; mm5=CrEH
431    packssdw    mm1, mm5                ; mm1=CrE
432
433    psllw       mm7, BYTE_BIT           ; odd Cr results to the high byte of each word
434    por         mm1, mm7                ; mm1=Cr
435    movq        MMWORD [edx], mm1       ; Save Cr
436
; Column bookkeeping: continue with full groups, then one partial pass if any
; columns are left over; the partial path sets ecx so that this falls through.
437    sub         ecx, byte SIZEOF_MMWORD
438    add         esi, byte RGB_PIXELSIZE*SIZEOF_MMWORD  ; inptr
439    add         edi, byte SIZEOF_MMWORD                ; outptr0
440    add         ebx, byte SIZEOF_MMWORD                ; outptr1
441    add         edx, byte SIZEOF_MMWORD                ; outptr2
442    cmp         ecx, byte SIZEOF_MMWORD
443    jae         near .columnloop        ; another full group remains
444    test        ecx, ecx
445    jnz         near .column_ld1        ; some columns left, but less than a full group
446
; Row done: restore the row-array cursors and column count saved at .rowloop.
447    pop         ecx                     ; col
448    pop         esi
449    pop         edi
450    pop         ebx
451    pop         edx
452    poppic      eax
453
454    add         esi, byte SIZEOF_JSAMPROW  ; input_buf
455    add         edi, byte SIZEOF_JSAMPROW
456    add         ebx, byte SIZEOF_JSAMPROW
457    add         edx, byte SIZEOF_JSAMPROW
458    dec         eax                        ; num_rows
459    jg          near .rowloop           ; loop while the remaining count is > 0
460
461    emms                                ; empty MMX state
462
; Also reached directly from the two early exits in the setup code; they skip
; emms, which is safe because no MMX instruction has executed on those paths.
463.return:
464    pop         edi
465    pop         esi
466;   pop         edx                     ; need not be preserved
467;   pop         ecx                     ; need not be preserved
468    pop         ebx
469    mov         esp, ebp                ; esp <- aligned ebp
470    pop         esp                     ; esp <- original ebp
471    pop         ebp
472    ret
473
474; For some reason, the OS X linker does not honor the request to align the
475; segment unless we do this.
476    align       32
477