• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1;
2; jcgryext.asm - grayscale colorspace conversion (MMX)
3;
4; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
5; Copyright (C) 2011, D. R. Commander.
6;
7; Based on the x86 SIMD extension for IJG JPEG library
8; Copyright (C) 1999-2006, MIYASAKA Masaru.
9; For conditions of distribution and use, see copyright notice in jsimdext.inc
10;
11; This file should be assembled with NASM (Netwide Assembler),
12; can *not* be assembled with Microsoft's MASM or any compatible
13; assembler (including Borland's Turbo Assembler).
14; NASM is available from http://nasm.sourceforge.net/ or
15; http://sourceforge.net/project/showfiles.php?group_id=6208
16;
17; [TAB8]
18
19%include "jcolsamp.inc"
20
21; --------------------------------------------------------------------------
22; jsimd_rgb_gray_convert_mmx: RGB -> grayscale (Y) row conversion using MMX.
23; Convert some rows of samples to the output colorspace.
24; Only component plane 0 of output_buf is written, starting at row output_row.
25; GLOBAL(void)
26; jsimd_rgb_gray_convert_mmx (JDIMENSION img_width,
27;                             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
28;                             JDIMENSION output_row, int num_rows);
29; Returns without converting anything when img_width == 0 or num_rows <= 0.
30; Arguments are read from the caller's stack through the accessors below; (b) is the unaligned frame pointer.
31%define img_width(b)    (b)+8           ; JDIMENSION img_width
32%define input_buf(b)    (b)+12          ; JSAMPARRAY input_buf
33%define output_buf(b)   (b)+16          ; JSAMPIMAGE output_buf
34%define output_row(b)   (b)+20          ; JDIMENSION output_row
35%define num_rows(b)     (b)+24          ; int num_rows
36; Locals are addressed relative to the realigned ebp set up by the prologue.
37%define original_ebp    ebp+0           ; slot where the prologue stores the unaligned frame pointer
38%define wk(i)           ebp-(WK_NUM-(i))*SIZEOF_MMWORD  ; mmword wk[WK_NUM]
39%define WK_NUM          2               ; two scratch mmwords: wk(0) and wk(1)
40%define gotptr          wk(0)-SIZEOF_POINTER    ; void * gotptr
41
42        align   16
43        global  EXTN(jsimd_rgb_gray_convert_mmx)
44
45EXTN(jsimd_rgb_gray_convert_mmx):
46        push    ebp
47        mov     eax,esp                         ; eax = original ebp
48        sub     esp, byte 4                     ; room for the original_ebp slot
49        and     esp, byte (-SIZEOF_MMWORD)      ; align to 64 bits
50        mov     [esp],eax                       ; original_ebp = unaligned frame pointer
51        mov     ebp,esp                         ; ebp = aligned ebp
52        lea     esp, [wk(0)]                    ; esp = &wk(0): reserves wk[WK_NUM] below ebp
53        pushpic eax             ; make room for the GOT address (gotptr slot)
54        push    ebx                             ; ebx, esi, edi are restored at .return
55;       push    ecx             ; need not be preserved
56;       push    edx             ; need not be preserved
57        push    esi
58        push    edi
59
60        get_GOT ebx                     ; get GOT address
61        movpic  POINTER [gotptr], ebx   ; save GOT address
62
63        mov     ecx, JDIMENSION [img_width(eax)]        ; num_cols
64        test    ecx,ecx
65        jz      near .return                    ; zero-width image: nothing to convert
66
67        push    ecx                             ; park num_cols while ecx holds output_row
68
69        mov     esi, JSAMPIMAGE [output_buf(eax)]
70        mov     ecx, JDIMENSION [output_row(eax)]
71        mov     edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY]       ; edi = output_buf[0]
72        lea     edi, [edi+ecx*SIZEOF_JSAMPROW]                  ; edi = &output_buf[0][output_row]
73
74        pop     ecx                             ; ecx = num_cols again
75
76        mov     esi, JSAMPARRAY [input_buf(eax)]                ; esi = input_buf
77        mov     eax, INT [num_rows(eax)]                        ; eax = num_rows (argument base no longer needed)
78        test    eax,eax
79        jle     near .return                    ; no rows requested
80        alignx  16,7
81.rowloop:
82        pushpic eax                     ; keep the row counter; eax is reused by the movpic below
83        push    edi                     ; save position in the output row array
84        push    esi                     ; save position in the input row array
85        push    ecx                     ; col
86
87        mov     esi, JSAMPROW [esi]     ; inptr
88        mov     edi, JSAMPROW [edi]     ; outptr0
89        movpic  eax, POINTER [gotptr]   ; load GOT address (eax)
90
91        cmp     ecx, byte SIZEOF_MMWORD
92        jae     short .columnloop               ; at least one full group of SIZEOF_MMWORD columns
93        alignx  16,7
94; Fewer than SIZEOF_MMWORD columns remain: fall into the piecewise tail load.
95%if RGB_PIXELSIZE == 3 ; ---------------
96; Tail load: the ecx*3 remaining bytes are gathered from the highest offset down, one piece per set bit of the byte count.
97.column_ld1:
98        push    eax                             ; eax/edx are scratch for the byte and word pieces
99        push    edx
100        lea     ecx,[ecx+ecx*2]         ; imul ecx,RGB_PIXELSIZE
101        test    cl, SIZEOF_BYTE
102        jz      short .column_ld2       ; NOTE(review): eax is not cleared on this path; its stale bits land only in lanes past the valid bytes
103        sub     ecx, byte SIZEOF_BYTE
104        xor     eax,eax
105        mov     al, BYTE [esi+ecx]
106.column_ld2:
107        test    cl, SIZEOF_WORD
108        jz      short .column_ld4
109        sub     ecx, byte SIZEOF_WORD
110        xor     edx,edx
111        mov     dx, WORD [esi+ecx]
112        shl     eax, WORD_BIT                   ; byte piece moves above the word piece
113        or      eax,edx
114.column_ld4:
115        movd    mmA,eax                         ; mmA = assembled word/byte pieces
116        pop     edx
117        pop     eax                             ; restore eax (used by the GOTOFF operands below)
118        test    cl, SIZEOF_DWORD
119        jz      short .column_ld8
120        sub     ecx, byte SIZEOF_DWORD
121        movd    mmG, DWORD [esi+ecx]
122        psllq   mmA, DWORD_BIT                  ; earlier pieces move above the dword
123        por     mmA,mmG
124.column_ld8:
125        test    cl, SIZEOF_MMWORD
126        jz      short .column_ld16
127        movq    mmG,mmA                         ; partial pieces become the second mmword
128        movq    mmA, MMWORD [esi+0*SIZEOF_MMWORD]
129        mov     ecx, SIZEOF_MMWORD              ; count the tail as one final full group so the loop ends after it
130        jmp     short .rgb_gray_cnv
131.column_ld16:
132        test    cl, 2*SIZEOF_MMWORD
133        mov     ecx, SIZEOF_MMWORD              ; mov leaves the flags from test untouched for the jz below
134        jz      short .rgb_gray_cnv
135        movq    mmF,mmA                         ; partial pieces become the third mmword
136        movq    mmA, MMWORD [esi+0*SIZEOF_MMWORD]
137        movq    mmG, MMWORD [esi+1*SIZEOF_MMWORD]
138        jmp     short .rgb_gray_cnv
139        alignx  16,7
140; Full group: load 3*SIZEOF_MMWORD input bytes.
141.columnloop:
142        movq    mmA, MMWORD [esi+0*SIZEOF_MMWORD]
143        movq    mmG, MMWORD [esi+1*SIZEOF_MMWORD]
144        movq    mmF, MMWORD [esi+2*SIZEOF_MMWORD]
145; De-interleave the packed 3-byte pixels into per-component word vectors (lane maps are in the comments).
146.rgb_gray_cnv:
147        ; mmA=(00 10 20 01 11 21 02 12)
148        ; mmG=(22 03 13 23 04 14 24 05)
149        ; mmF=(15 25 06 16 26 07 17 27)
150
151        movq      mmD,mmA
152        psllq     mmA,4*BYTE_BIT        ; mmA=(-- -- -- -- 00 10 20 01)
153        psrlq     mmD,4*BYTE_BIT        ; mmD=(11 21 02 12 -- -- -- --)
154
155        punpckhbw mmA,mmG               ; mmA=(00 04 10 14 20 24 01 05)
156        psllq     mmG,4*BYTE_BIT        ; mmG=(-- -- -- -- 22 03 13 23)
157
158        punpcklbw mmD,mmF               ; mmD=(11 15 21 25 02 06 12 16)
159        punpckhbw mmG,mmF               ; mmG=(22 26 03 07 13 17 23 27)
160
161        movq      mmE,mmA
162        psllq     mmA,4*BYTE_BIT        ; mmA=(-- -- -- -- 00 04 10 14)
163        psrlq     mmE,4*BYTE_BIT        ; mmE=(20 24 01 05 -- -- -- --)
164
165        punpckhbw mmA,mmD               ; mmA=(00 02 04 06 10 12 14 16)
166        psllq     mmD,4*BYTE_BIT        ; mmD=(-- -- -- -- 11 15 21 25)
167
168        punpcklbw mmE,mmG               ; mmE=(20 22 24 26 01 03 05 07)
169        punpckhbw mmD,mmG               ; mmD=(11 13 15 17 21 23 25 27)
170
171        pxor      mmH,mmH               ; zero register used to widen bytes to words
172
173        movq      mmC,mmA
174        punpcklbw mmA,mmH               ; mmA=(00 02 04 06)
175        punpckhbw mmC,mmH               ; mmC=(10 12 14 16)
176
177        movq      mmB,mmE
178        punpcklbw mmE,mmH               ; mmE=(20 22 24 26)
179        punpckhbw mmB,mmH               ; mmB=(01 03 05 07)
180
181        movq      mmF,mmD
182        punpcklbw mmD,mmH               ; mmD=(11 13 15 17)
183        punpckhbw mmF,mmH               ; mmF=(21 23 25 27)
184
185%else ; RGB_PIXELSIZE == 4 ; -----------
186; Tail load: pieces are selected by the bits of the remaining column count, highest offset first.
187.column_ld1:
188        test    cl, SIZEOF_MMWORD/8
189        jz      short .column_ld2
190        sub     ecx, byte SIZEOF_MMWORD/8
191        movd    mmA, DWORD [esi+ecx*RGB_PIXELSIZE]      ; dword piece at the top of the remaining data
192.column_ld2:
193        test    cl, SIZEOF_MMWORD/4
194        jz      short .column_ld4
195        sub     ecx, byte SIZEOF_MMWORD/4
196        movq    mmF,mmA                                 ; dword piece moves up one register
197        movq    mmA, MMWORD [esi+ecx*RGB_PIXELSIZE]     ; mmword piece below it
198.column_ld4:
199        test    cl, SIZEOF_MMWORD/2
200        mov     ecx, SIZEOF_MMWORD              ; count the tail as one final full group; flags from test survive the mov
201        jz      short .rgb_gray_cnv
202        movq    mmD,mmA                         ; partial pieces become the third and fourth mmwords
203        movq    mmC,mmF
204        movq    mmA, MMWORD [esi+0*SIZEOF_MMWORD]
205        movq    mmF, MMWORD [esi+1*SIZEOF_MMWORD]
206        jmp     short .rgb_gray_cnv
207        alignx  16,7
208; Full group: load 4*SIZEOF_MMWORD input bytes.
209.columnloop:
210        movq    mmA, MMWORD [esi+0*SIZEOF_MMWORD]
211        movq    mmF, MMWORD [esi+1*SIZEOF_MMWORD]
212        movq    mmD, MMWORD [esi+2*SIZEOF_MMWORD]
213        movq    mmC, MMWORD [esi+3*SIZEOF_MMWORD]
214; De-interleave the packed 4-byte pixels into per-component word vectors (lane maps are in the comments).
215.rgb_gray_cnv:
216        ; mmA=(00 10 20 30 01 11 21 31)
217        ; mmF=(02 12 22 32 03 13 23 33)
218        ; mmD=(04 14 24 34 05 15 25 35)
219        ; mmC=(06 16 26 36 07 17 27 37)
220
221        movq      mmB,mmA
222        punpcklbw mmA,mmF               ; mmA=(00 02 10 12 20 22 30 32)
223        punpckhbw mmB,mmF               ; mmB=(01 03 11 13 21 23 31 33)
224
225        movq      mmG,mmD
226        punpcklbw mmD,mmC               ; mmD=(04 06 14 16 24 26 34 36)
227        punpckhbw mmG,mmC               ; mmG=(05 07 15 17 25 27 35 37)
228
229        movq      mmE,mmA
230        punpcklwd mmA,mmD               ; mmA=(00 02 04 06 10 12 14 16)
231        punpckhwd mmE,mmD               ; mmE=(20 22 24 26 30 32 34 36)
232
233        movq      mmH,mmB
234        punpcklwd mmB,mmG               ; mmB=(01 03 05 07 11 13 15 17)
235        punpckhwd mmH,mmG               ; mmH=(21 23 25 27 31 33 35 37)
236
237        pxor      mmF,mmF               ; zero register used to widen bytes to words
238
239        movq      mmC,mmA
240        punpcklbw mmA,mmF               ; mmA=(00 02 04 06)
241        punpckhbw mmC,mmF               ; mmC=(10 12 14 16)
242
243        movq      mmD,mmB
244        punpcklbw mmB,mmF               ; mmB=(01 03 05 07)
245        punpckhbw mmD,mmF               ; mmD=(11 13 15 17)
246
247        movq      mmG,mmE
248        punpcklbw mmE,mmF               ; mmE=(20 22 24 26)
249        punpckhbw mmG,mmF               ; mmG=(30 32 34 36)
250
251        punpcklbw mmF,mmH               ; mmF is zero here, so each data byte lands in the high byte of a word
252        punpckhbw mmH,mmH
253        psrlw     mmF,BYTE_BIT          ; mmF=(21 23 25 27)
254        psrlw     mmH,BYTE_BIT          ; mmH=(31 33 35 37)
255
256%endif ; RGB_PIXELSIZE ; ---------------
257; mmA..mmH are register aliases not defined in this section (presumably in jcolsamp.inc -- confirm); roles below are per the original notes.
258        ; mm0=(R0 R2 R4 R6)=RE, mm2=(G0 G2 G4 G6)=GE, mm4=(B0 B2 B4 B6)=BE
259        ; mm1=(R1 R3 R5 R7)=RO, mm3=(G1 G3 G5 G7)=GO, mm5=(B1 B3 B5 B7)=BO
260
261        ; (Original)
262        ; Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
263        ;
264        ; (This implementation)
265        ; Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
266; NOTE(review): G's weight is split into 0.337 + 0.250, presumably so each FIX() constant fits a signed word for pmaddwd -- confirm.
267        movq      mm6,mm1
268        punpcklwd mm1,mm3               ; interleave R/G words so pmaddwd yields R*c0+G*c1 per dword
269        punpckhwd mm6,mm3
270        pmaddwd   mm1,[GOTOFF(eax,PW_F0299_F0337)] ; mm1=ROL*FIX(0.299)+GOL*FIX(0.337)
271        pmaddwd   mm6,[GOTOFF(eax,PW_F0299_F0337)] ; mm6=ROH*FIX(0.299)+GOH*FIX(0.337)
272
273        movq      mm7, mm6      ; mm7=ROH*FIX(0.299)+GOH*FIX(0.337)
274
275        movq      mm6,mm0
276        punpcklwd mm0,mm2
277        punpckhwd mm6,mm2
278        pmaddwd   mm0,[GOTOFF(eax,PW_F0299_F0337)] ; mm0=REL*FIX(0.299)+GEL*FIX(0.337)
279        pmaddwd   mm6,[GOTOFF(eax,PW_F0299_F0337)] ; mm6=REH*FIX(0.299)+GEH*FIX(0.337)
280
281        movq      MMWORD [wk(0)], mm0   ; wk(0)=REL*FIX(0.299)+GEL*FIX(0.337)
282        movq      MMWORD [wk(1)], mm6   ; wk(1)=REH*FIX(0.299)+GEH*FIX(0.337)
283
284        movq      mm0, mm5      ; mm0=BO
285        movq      mm6, mm4      ; mm6=BE
286
287        movq      mm4,mm0
288        punpcklwd mm0,mm3
289        punpckhwd mm4,mm3
290        pmaddwd   mm0,[GOTOFF(eax,PW_F0114_F0250)] ; mm0=BOL*FIX(0.114)+GOL*FIX(0.250)
291        pmaddwd   mm4,[GOTOFF(eax,PW_F0114_F0250)] ; mm4=BOH*FIX(0.114)+GOH*FIX(0.250)
292
293        movq      mm3,[GOTOFF(eax,PD_ONEHALF)]  ; mm3=[PD_ONEHALF]
294
295        paddd     mm0, mm1              ; add the R/G partial sums for the odd columns
296        paddd     mm4, mm7
297        paddd     mm0,mm3               ; add the PD_ONEHALF constant before the shift
298        paddd     mm4,mm3
299        psrld     mm0,SCALEBITS         ; mm0=YOL
300        psrld     mm4,SCALEBITS         ; mm4=YOH
301        packssdw  mm0,mm4               ; mm0=YO
302
303        movq      mm4,mm6
304        punpcklwd mm6,mm2
305        punpckhwd mm4,mm2
306        pmaddwd   mm6,[GOTOFF(eax,PW_F0114_F0250)] ; mm6=BEL*FIX(0.114)+GEL*FIX(0.250)
307        pmaddwd   mm4,[GOTOFF(eax,PW_F0114_F0250)] ; mm4=BEH*FIX(0.114)+GEH*FIX(0.250)
308
309        movq      mm2,[GOTOFF(eax,PD_ONEHALF)]  ; mm2=[PD_ONEHALF]
310
311        paddd     mm6, MMWORD [wk(0)]   ; add the R/G partial sums saved for the even columns
312        paddd     mm4, MMWORD [wk(1)]
313        paddd     mm6,mm2
314        paddd     mm4,mm2
315        psrld     mm6,SCALEBITS         ; mm6=YEL
316        psrld     mm4,SCALEBITS         ; mm4=YEH
317        packssdw  mm6,mm4               ; mm6=YE
318
319        psllw     mm0,BYTE_BIT          ; odd-column Y into the high byte of each word
320        por       mm6,mm0               ; mm6=Y
321        movq      MMWORD [edi], mm6     ; Save Y
322
323        sub     ecx, byte SIZEOF_MMWORD                 ; columns left after this group
324        add     esi, byte RGB_PIXELSIZE*SIZEOF_MMWORD   ; inptr
325        add     edi, byte SIZEOF_MMWORD                 ; outptr0
326        cmp     ecx, byte SIZEOF_MMWORD
327        jae     near .columnloop                        ; another full group
328        test    ecx,ecx
329        jnz     near .column_ld1                        ; partial group left: piecewise tail load
330
331        pop     ecx                     ; col
332        pop     esi                     ; back to the input row array position
333        pop     edi                     ; back to the output row array position
334        poppic  eax                     ; counterpart of the pushpic at .rowloop (row counter)
335
336        add     esi, byte SIZEOF_JSAMPROW       ; input_buf
337        add     edi, byte SIZEOF_JSAMPROW       ; next output row pointer
338        dec     eax                             ; num_rows
339        jg      near .rowloop
340
341        emms            ; empty MMX state
342; Early exits jump here before any MMX instruction has executed, so they skip emms.
343.return:
344        pop     edi
345        pop     esi
346;       pop     edx             ; need not be preserved
347;       pop     ecx             ; need not be preserved
348        pop     ebx
349        mov     esp,ebp         ; esp <- aligned ebp
350        pop     esp             ; esp <- original ebp
351        pop     ebp
352        ret
353
354; Align the end of the code to a 16-byte boundary.  For some reason, the OS X
355; linker does not honor the request to align the segment unless we do this.
356        align   16
357