• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1;
2; jcgryext.asm - grayscale colorspace conversion (MMX)
3;
4; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
5; Copyright (C) 2011, 2016, D. R. Commander.
6;
7; Based on the x86 SIMD extension for IJG JPEG library
8; Copyright (C) 1999-2006, MIYASAKA Masaru.
9; For conditions of distribution and use, see copyright notice in jsimdext.inc
10;
11; This file should be assembled with NASM (Netwide Assembler); it
12; can *not* be assembled with Microsoft's MASM or any compatible
13; assembler (including Borland's Turbo Assembler).
14; NASM is available from http://nasm.sourceforge.net/ or
15; http://sourceforge.net/project/showfiles.php?group_id=6208
16;
17; [TAB8]
18
19%include "jcolsamp.inc"
20
21; --------------------------------------------------------------------------
22;
23; Convert some rows of RGB samples to grayscale (writes output_buf[0] only).
24;
25; GLOBAL(void)
26; jsimd_rgb_gray_convert_mmx(JDIMENSION img_width, JSAMPARRAY input_buf,
27;                            JSAMPIMAGE output_buf, JDIMENSION output_row,
28;                            int num_rows);
29;
30
31%define img_width(b)   (b) + 8          ; JDIMENSION img_width  ((b)+0 = saved ebp, (b)+4 = return address)
32%define input_buf(b)   (b) + 12         ; JSAMPARRAY input_buf
33%define output_buf(b)  (b) + 16         ; JSAMPIMAGE output_buf
34%define output_row(b)  (b) + 20         ; JDIMENSION output_row
35%define num_rows(b)    (b) + 24         ; int num_rows
36; The locals below are addressed from the realigned ebp that the prologue sets up.
37%define original_ebp   ebp + 0          ; slot where the prologue stores the unaligned frame base
38%define wk(i)          ebp - (WK_NUM - (i)) * SIZEOF_MMWORD
39                                        ; mmword wk[WK_NUM], scratch array ending just below the aligned ebp
40%define WK_NUM         2                ; number of mmword scratch slots
41%define gotptr         wk(0) - SIZEOF_POINTER  ; void * gotptr (slot directly below wk(0))
42
43    align       32
44    GLOBAL_FUNCTION(jsimd_rgb_gray_convert_mmx)
45; Arguments are read from the caller's stack through the img_width()..num_rows() macros.
46EXTN(jsimd_rgb_gray_convert_mmx):
47    push        ebp                     ; save the caller's frame pointer
48    mov         eax, esp                    ; eax = original ebp (unaligned frame base: [eax]=saved ebp, [eax+8..]=args)
49    sub         esp, byte 4             ; keep the slot written below clear of the saved ebp
50    and         esp, byte (-SIZEOF_MMWORD)  ; align to 64 bits
51    mov         [esp], eax              ; store the unaligned frame base at the aligned address
52    mov         ebp, esp                    ; ebp = aligned ebp
53    lea         esp, [wk(0)]            ; esp = &wk(0): reserves the WK_NUM scratch mmwords
54    pushpic     eax                     ; make room for the GOT address (gotptr slot)
55    push        ebx                     ; ebx, esi and edi are saved here and restored at .return
56;   push        ecx                     ; need not be preserved
57;   push        edx                     ; need not be preserved
58    push        esi
59    push        edi
60
61    get_GOT     ebx                     ; get GOT address
62    movpic      POINTER [gotptr], ebx   ; save GOT address
63
64    mov         ecx, JDIMENSION [img_width(eax)]  ; num_cols
65    test        ecx, ecx
66    jz          near .return            ; zero-width image: nothing to convert
67
68    push        ecx                     ; keep num_cols while ecx is borrowed for output_row
69
70    mov         esi, JSAMPIMAGE [output_buf(eax)]  ; esi = output_buf
71    mov         ecx, JDIMENSION [output_row(eax)]  ; ecx = output_row
72    mov         edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY]  ; edi = output_buf[0]
73    lea         edi, [edi+ecx*SIZEOF_JSAMPROW]  ; edi = &output_buf[0][output_row]
74
75    pop         ecx                     ; ecx = num_cols again
76
77    mov         esi, JSAMPARRAY [input_buf(eax)]  ; esi = input_buf (first input row pointer)
78    mov         eax, INT [num_rows(eax)]  ; eax = num_rows (eax stops being the argument base here)
79    test        eax, eax
80    jle         near .return            ; nothing to do unless num_rows > 0
81    alignx      16, 7
82.rowloop:                               ; eax = rows left, ecx = num_cols, esi/edi -> current input/output row pointers
83    pushpic     eax                     ; preserve the row counter; eax is reloaded with the GOT address below (PIC macros are defined elsewhere)
84    push        edi                     ; row-pointer cursors and the column count are restored after the row
85    push        esi
86    push        ecx                     ; col
87
88    mov         esi, JSAMPROW [esi]     ; inptr
89    mov         edi, JSAMPROW [edi]     ; outptr0
90    movpic      eax, POINTER [gotptr]   ; load GOT address (eax)
91
92    cmp         ecx, byte SIZEOF_MMWORD
93    jae         short .columnloop       ; at least SIZEOF_MMWORD pixels: full-group path
94    alignx      16, 7
95; Otherwise fall through to .column_ld1, which assembles a partial pixel group piece by piece.
96%if RGB_PIXELSIZE == 3  ; ---------------
97; 3 bytes/pixel: ecx becomes the byte count of the tail; each bit of it selects one piece, loaded back to front.
98.column_ld1:
99    push        eax                     ; eax (GOT address) and edx are used as scratch below
100    push        edx
101    lea         ecx, [ecx+ecx*2]        ; imul ecx,RGB_PIXELSIZE
102    test        cl, SIZEOF_BYTE         ; odd number of bytes?
103    jz          short .column_ld2
104    sub         ecx, byte SIZEOF_BYTE   ; ecx = offset of the last byte
105    xor         eax, eax
106    mov         al, BYTE [esi+ecx]      ; eax = that byte, zero-extended
107.column_ld2:
108    test        cl, SIZEOF_WORD         ; a 2-byte piece left?
109    jz          short .column_ld4
110    sub         ecx, byte SIZEOF_WORD   ; ecx = offset of that piece
111    xor         edx, edx
112    mov         dx, WORD [esi+ecx]      ; edx = the 2-byte piece, zero-extended
113    shl         eax, WORD_BIT           ; whatever was loaded before goes above it
114    or          eax, edx
115.column_ld4:
116    movd        mmA, eax                ; mmA = the bytes gathered in eax
117    pop         edx
118    pop         eax                     ; eax = GOT address again
119    test        cl, SIZEOF_DWORD        ; a 4-byte piece left?
120    jz          short .column_ld8
121    sub         ecx, byte SIZEOF_DWORD  ; ecx = offset of that piece
122    movd        mmG, DWORD [esi+ecx]    ; mmG = the 4-byte piece
123    psllq       mmA, DWORD_BIT          ; later bytes move to the high dword
124    por         mmA, mmG                ; mmA = tail bytes in memory order
125.column_ld8:
126    test        cl, SIZEOF_MMWORD       ; does one full mmword precede the tail?
127    jz          short .column_ld16
128    movq        mmG, mmA                ; tail becomes the second mmword
129    movq        mmA, MMWORD [esi+0*SIZEOF_MMWORD]  ; first mmword comes straight from the row
130    mov         ecx, SIZEOF_MMWORD      ; count the partial group as a full one so the column loop ends after it
131    jmp         short .rgb_gray_cnv
132.column_ld16:
133    test        cl, 2*SIZEOF_MMWORD     ; do two full mmwords precede the tail?
134    mov         ecx, SIZEOF_MMWORD      ; as above (mov leaves the flags from test intact)
135    jz          short .rgb_gray_cnv     ; no: the tail in mmA is everything
136    movq        mmF, mmA                ; tail becomes the third mmword
137    movq        mmA, MMWORD [esi+0*SIZEOF_MMWORD]
138    movq        mmG, MMWORD [esi+1*SIZEOF_MMWORD]
139    jmp         short .rgb_gray_cnv
140    alignx      16, 7
141
142.columnloop:                            ; full group: three mmwords of packed 3-byte pixels
143    movq        mmA, MMWORD [esi+0*SIZEOF_MMWORD]
144    movq        mmG, MMWORD [esi+1*SIZEOF_MMWORD]
145    movq        mmF, MMWORD [esi+2*SIZEOF_MMWORD]
146
147.rgb_gray_cnv:
148    ; mmA=(00 10 20 01 11 21 02 12)
149    ; mmG=(22 03 13 23 04 14 24 05)
150    ; mmF=(15 25 06 16 26 07 17 27)
151; In these diagrams each pair "cp" is byte c of pixel p; the steps below regroup the bytes by component and pixel parity.
152    movq        mmD, mmA
153    psllq       mmA, 4*BYTE_BIT         ; mmA=(-- -- -- -- 00 10 20 01)
154    psrlq       mmD, 4*BYTE_BIT         ; mmD=(11 21 02 12 -- -- -- --)
155
156    punpckhbw   mmA, mmG                ; mmA=(00 04 10 14 20 24 01 05)
157    psllq       mmG, 4*BYTE_BIT         ; mmG=(-- -- -- -- 22 03 13 23)
158
159    punpcklbw   mmD, mmF                ; mmD=(11 15 21 25 02 06 12 16)
160    punpckhbw   mmG, mmF                ; mmG=(22 26 03 07 13 17 23 27)
161
162    movq        mmE, mmA
163    psllq       mmA, 4*BYTE_BIT         ; mmA=(-- -- -- -- 00 04 10 14)
164    psrlq       mmE, 4*BYTE_BIT         ; mmE=(20 24 01 05 -- -- -- --)
165
166    punpckhbw   mmA, mmD                ; mmA=(00 02 04 06 10 12 14 16)
167    psllq       mmD, 4*BYTE_BIT         ; mmD=(-- -- -- -- 11 15 21 25)
168
169    punpcklbw   mmE, mmG                ; mmE=(20 22 24 26 01 03 05 07)
170    punpckhbw   mmD, mmG                ; mmD=(11 13 15 17 21 23 25 27)
171
172    pxor        mmH, mmH                ; mmH = 0, used to widen bytes to words
173
174    movq        mmC, mmA
175    punpcklbw   mmA, mmH                ; mmA=(00 02 04 06)
176    punpckhbw   mmC, mmH                ; mmC=(10 12 14 16)
177
178    movq        mmB, mmE
179    punpcklbw   mmE, mmH                ; mmE=(20 22 24 26)
180    punpckhbw   mmB, mmH                ; mmB=(01 03 05 07)
181
182    movq        mmF, mmD
183    punpcklbw   mmD, mmH                ; mmD=(11 13 15 17)
184    punpckhbw   mmF, mmH                ; mmF=(21 23 25 27)
185
186%else  ; RGB_PIXELSIZE == 4 ; -----------
187; 4 bytes/pixel: ecx stays a pixel count; the tests pick off 1, then 2, then 4 trailing pixels.
188.column_ld1:
189    test        cl, SIZEOF_MMWORD/8     ; odd number of pixels?
190    jz          short .column_ld2
191    sub         ecx, byte SIZEOF_MMWORD/8
192    movd        mmA, DWORD [esi+ecx*RGB_PIXELSIZE]  ; mmA = the last pixel
193.column_ld2:
194    test        cl, SIZEOF_MMWORD/4     ; a 2-pixel piece left?
195    jz          short .column_ld4
196    sub         ecx, byte SIZEOF_MMWORD/4
197    movq        mmF, mmA                ; what was loaded before becomes the second mmword
198    movq        mmA, MMWORD [esi+ecx*RGB_PIXELSIZE]  ; mmA = the 2-pixel piece
199.column_ld4:
200    test        cl, SIZEOF_MMWORD/2     ; do 4 full pixels precede the tail?
201    mov         ecx, SIZEOF_MMWORD      ; count the partial group as a full one (mov leaves the flags from test intact)
202    jz          short .rgb_gray_cnv
203    movq        mmD, mmA                ; tail moves to the third and fourth mmwords
204    movq        mmC, mmF
205    movq        mmA, MMWORD [esi+0*SIZEOF_MMWORD]
206    movq        mmF, MMWORD [esi+1*SIZEOF_MMWORD]
207    jmp         short .rgb_gray_cnv
208    alignx      16, 7
209
210.columnloop:                            ; full group: four mmwords of packed 4-byte pixels
211    movq        mmA, MMWORD [esi+0*SIZEOF_MMWORD]
212    movq        mmF, MMWORD [esi+1*SIZEOF_MMWORD]
213    movq        mmD, MMWORD [esi+2*SIZEOF_MMWORD]
214    movq        mmC, MMWORD [esi+3*SIZEOF_MMWORD]
215
216.rgb_gray_cnv:
217    ; mmA=(00 10 20 30 01 11 21 31)
218    ; mmF=(02 12 22 32 03 13 23 33)
219    ; mmD=(04 14 24 34 05 15 25 35)
220    ; mmC=(06 16 26 36 07 17 27 37)
221; In these diagrams each pair "cp" is byte c of pixel p; the steps below regroup the bytes by component and pixel parity.
222    movq        mmB, mmA
223    punpcklbw   mmA, mmF                ; mmA=(00 02 10 12 20 22 30 32)
224    punpckhbw   mmB, mmF                ; mmB=(01 03 11 13 21 23 31 33)
225
226    movq        mmG, mmD
227    punpcklbw   mmD, mmC                ; mmD=(04 06 14 16 24 26 34 36)
228    punpckhbw   mmG, mmC                ; mmG=(05 07 15 17 25 27 35 37)
229
230    movq        mmE, mmA
231    punpcklwd   mmA, mmD                ; mmA=(00 02 04 06 10 12 14 16)
232    punpckhwd   mmE, mmD                ; mmE=(20 22 24 26 30 32 34 36)
233
234    movq        mmH, mmB
235    punpcklwd   mmB, mmG                ; mmB=(01 03 05 07 11 13 15 17)
236    punpckhwd   mmH, mmG                ; mmH=(21 23 25 27 31 33 35 37)
237
238    pxor        mmF, mmF                ; mmF = 0, used to widen bytes to words
239
240    movq        mmC, mmA
241    punpcklbw   mmA, mmF                ; mmA=(00 02 04 06)
242    punpckhbw   mmC, mmF                ; mmC=(10 12 14 16)
243
244    movq        mmD, mmB
245    punpcklbw   mmB, mmF                ; mmB=(01 03 05 07)
246    punpckhbw   mmD, mmF                ; mmD=(11 13 15 17)
247
248    movq        mmG, mmE
249    punpcklbw   mmE, mmF                ; mmE=(20 22 24 26)
250    punpckhbw   mmG, mmF                ; mmG=(30 32 34 36)
251
252    punpcklbw   mmF, mmH                ; low-half bytes of mmH land in the high byte of each word of mmF (mmF was 0)
253    punpckhbw   mmH, mmH                ; high-half bytes of mmH, each doubled into a word
254    psrlw       mmF, BYTE_BIT           ; mmF=(21 23 25 27)
255    psrlw       mmH, BYTE_BIT           ; mmH=(31 33 35 37)
256
257%endif  ; RGB_PIXELSIZE ; ---------------
258
259    ; mm0=(R0 R2 R4 R6)=RE, mm2=(G0 G2 G4 G6)=GE, mm4=(B0 B2 B4 B6)=BE
260    ; mm1=(R1 R3 R5 R7)=RO, mm3=(G1 G3 G5 G7)=GO, mm5=(B1 B3 B5 B7)=BO
261
262    ; (Original)
263    ; Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
264    ;
265    ; (This implementation)
266    ; Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
267
268    movq        mm6, mm1                ; mm6=RO (copy for the high half)
269    punpcklwd   mm1, mm3                ; mm1=(R1 G1 R3 G3)
270    punpckhwd   mm6, mm3                ; mm6=(R5 G5 R7 G7)
271    pmaddwd     mm1, [GOTOFF(eax,PW_F0299_F0337)]  ; mm1=ROL*FIX(0.299)+GOL*FIX(0.337)
272    pmaddwd     mm6, [GOTOFF(eax,PW_F0299_F0337)]  ; mm6=ROH*FIX(0.299)+GOH*FIX(0.337)
273
274    movq        mm7,  mm6               ; mm7=ROH*FIX(0.299)+GOH*FIX(0.337)
275
276    movq        mm6, mm0                ; mm6=RE (copy for the high half)
277    punpcklwd   mm0, mm2                ; mm0=(R0 G0 R2 G2)
278    punpckhwd   mm6, mm2                ; mm6=(R4 G4 R6 G6)
279    pmaddwd     mm0, [GOTOFF(eax,PW_F0299_F0337)]  ; mm0=REL*FIX(0.299)+GEL*FIX(0.337)
280    pmaddwd     mm6, [GOTOFF(eax,PW_F0299_F0337)]  ; mm6=REH*FIX(0.299)+GEH*FIX(0.337)
281
282    movq        MMWORD [wk(0)], mm0     ; wk(0)=REL*FIX(0.299)+GEL*FIX(0.337)
283    movq        MMWORD [wk(1)], mm6     ; wk(1)=REH*FIX(0.299)+GEH*FIX(0.337)
284
285    movq        mm0, mm5                ; mm0=BO
286    movq        mm6, mm4                ; mm6=BE
287
288    movq        mm4, mm0                ; mm4=BO (copy for the high half)
289    punpcklwd   mm0, mm3                ; mm0=(B1 G1 B3 G3)
290    punpckhwd   mm4, mm3                ; mm4=(B5 G5 B7 G7)
291    pmaddwd     mm0, [GOTOFF(eax,PW_F0114_F0250)]  ; mm0=BOL*FIX(0.114)+GOL*FIX(0.250)
292    pmaddwd     mm4, [GOTOFF(eax,PW_F0114_F0250)]  ; mm4=BOH*FIX(0.114)+GOH*FIX(0.250)
293
294    movq        mm3, [GOTOFF(eax,PD_ONEHALF)]  ; mm3=[PD_ONEHALF]
295
296    paddd       mm0, mm1                ; add the R/G partial sums for the odd pixels
297    paddd       mm4, mm7
298    paddd       mm0, mm3                ; add PD_ONEHALF before the shift (presumably the rounding term)
299    paddd       mm4, mm3
300    psrld       mm0, SCALEBITS          ; mm0=YOL
301    psrld       mm4, SCALEBITS          ; mm4=YOH
302    packssdw    mm0, mm4                ; mm0=YO
303
304    movq        mm4, mm6                ; mm4=BE (copy for the high half)
305    punpcklwd   mm6, mm2                ; mm6=(B0 G0 B2 G2)
306    punpckhwd   mm4, mm2                ; mm4=(B4 G4 B6 G6)
307    pmaddwd     mm6, [GOTOFF(eax,PW_F0114_F0250)]  ; mm6=BEL*FIX(0.114)+GEL*FIX(0.250)
308    pmaddwd     mm4, [GOTOFF(eax,PW_F0114_F0250)]  ; mm4=BEH*FIX(0.114)+GEH*FIX(0.250)
309
310    movq        mm2, [GOTOFF(eax,PD_ONEHALF)]      ; mm2=[PD_ONEHALF]
311
312    paddd       mm6, MMWORD [wk(0)]     ; add the R/G partial sums saved for the even pixels
313    paddd       mm4, MMWORD [wk(1)]
314    paddd       mm6, mm2                ; add PD_ONEHALF before the shift (presumably the rounding term)
315    paddd       mm4, mm2
316    psrld       mm6, SCALEBITS          ; mm6=YEL
317    psrld       mm4, SCALEBITS          ; mm4=YEH
318    packssdw    mm6, mm4                ; mm6=YE
319
320    psllw       mm0, BYTE_BIT           ; odd-pixel Y values move to the high byte of each word
321    por         mm6, mm0                ; mm6=Y
322    movq        MMWORD [edi], mm6       ; Save Y (always a full mmword, even for a partial group; presumably the row has room past img_width - confirm)
323
324    sub         ecx, byte SIZEOF_MMWORD  ; one group of SIZEOF_MMWORD pixels done
325    add         esi, byte RGB_PIXELSIZE*SIZEOF_MMWORD  ; inptr
326    add         edi, byte SIZEOF_MMWORD                ; outptr0
327    cmp         ecx, byte SIZEOF_MMWORD
328    jae         near .columnloop        ; another full group remains
329    test        ecx, ecx
330    jnz         near .column_ld1        ; leftover pixels: partial-load path
331
332    pop         ecx                     ; col
333    pop         esi
334    pop         edi
335    poppic      eax                     ; eax = rows left
336
337    add         esi, byte SIZEOF_JSAMPROW  ; input_buf
338    add         edi, byte SIZEOF_JSAMPROW  ; next output row pointer
339    dec         eax                        ; num_rows
340    jg          near .rowloop           ; loop while rows remain
341
342    emms                                ; empty MMX state
343
344.return:                                ; also reached by the early exits, before any MMX instruction has run
345    pop         edi
346    pop         esi
347;   pop         edx                     ; need not be preserved
348;   pop         ecx                     ; need not be preserved
349    pop         ebx
350    mov         esp, ebp                ; esp <- aligned ebp
351    pop         esp                     ; esp <- original ebp
352    pop         ebp                     ; restore the caller's frame pointer
353    ret
354
355; For some reason, the OS X linker does not honor the request to align the
356; segment unless we also pad to a 32-byte boundary here, after the function.
357    align       32
358