;
; jcgryext.asm - grayscale colorspace conversion (64-bit SSE2)
;
; x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; Copyright (C) 2011, D. R. Commander.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; [TAB8]

%include "jcolsamp.inc"

; --------------------------------------------------------------------------
;
; Convert some rows of samples to the output colorspace.
;
; GLOBAL(void)
; jsimd_rgb_gray_convert_sse2 (JDIMENSION img_width,
;                              JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
;                              JDIMENSION output_row, int num_rows);
;

; r10 = JDIMENSION img_width
; r11 = JSAMPARRAY input_buf
; r12 = JSAMPIMAGE output_buf
; r13 = JDIMENSION output_row
; r14 = int num_rows

; wk(i) addresses the i-th xmmword scratch slot in the frame; the slots sit
; immediately below rbp (wk(WK_NUM-1) ends at rbp, wk(0) is the lowest).
; WK_NUM is only expanded where wk() is used, so defining it after wk() is fine.
%define wk(i)           rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
%define WK_NUM          2

        align   16

        global  EXTN(jsimd_rgb_gray_convert_sse2)

; jsimd_rgb_gray_convert_sse2: luma (Y) conversion of interleaved RGB rows.
;
; Register roles after collect_args (collect_args/uncollect_args are defined
; outside this file; the mapping below is the one documented with the
; prototype):
;   r10d = img_width, r11 = input_buf, r12 = output_buf,
;   r13d = output_row, r14d = num_rows
; Inside the row loop:
;   rax = rows remaining, rcx = columns remaining,
;   rsi = input pointer,  rdi = output pointer
;
; Each pass of the column loop accounts for SIZEOF_XMMWORD pixels and stores
; one xmmword of Y with movdqa, so every output row must be xmmword-aligned
; and writable up to the next multiple of SIZEOF_XMMWORD columns.
; Returns immediately if img_width == 0 or num_rows <= 0.

EXTN(jsimd_rgb_gray_convert_sse2):
        push    rbp
        mov     rax,rsp                         ; rax = rsp right after saving rbp
        sub     rsp, byte 4
        and     rsp, byte (-SIZEOF_XMMWORD)     ; align to 128 bits
        mov     [rsp],rax                       ; keep the unaligned rsp for the epilogue
        mov     rbp,rsp                         ; rbp = aligned frame base
        lea     rsp, [wk(0)]                    ; reserve the wk[] scratch slots below rbp
        collect_args
        push    rbx

        mov     ecx, r10d                       ; rcx = img_width (zero-extended)
        test    rcx,rcx
        jz      near .return                    ; nothing to do for zero-width rows

        push    rcx                             ; rcx is borrowed for output_row below

        mov     rsi, r12
        mov     ecx, r13d
        mov     rdi, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY]       ; rdi = output_buf[0]
        lea     rdi, [rdi+rcx*SIZEOF_JSAMPROW]                  ; rdi = &output_buf[0][output_row]

        pop     rcx                             ; rcx = img_width again

        mov     rsi, r11
        mov     eax, r14d
        ; num_rows is a signed int.  The 32-bit mov above zero-extends into
        ; rax, so the sign must be tested on eax; testing rax would never see
        ; a negative count and the loop would run ~2^32 times.
        test    eax,eax
        jle     near .return
.rowloop:
        push    rdi
        push    rsi
        push    rcx                     ; col

        mov     rsi, JSAMPROW [rsi]     ; inptr
        mov     rdi, JSAMPROW [rdi]     ; outptr0

        cmp     rcx, byte SIZEOF_XMMWORD
        jae     near .columnloop

%if RGB_PIXELSIZE == 3 ; ---------------

        ; Partial-block load: fewer than SIZEOF_XMMWORD pixels remain.  The
        ; remaining byte count is consumed from its low bits upward so that
        ; only bytes inside the row are read.
        ; NOTE(review): rax is not cleared before use here; when the byte
        ; count is even, bits of the saved row counter are shifted into lanes
        ; past the last valid pixel.  Those lanes only feed output columns
        ; beyond img_width, but the values are not deterministic.
.column_ld1:
        push    rax
        push    rdx
        lea     rcx,[rcx+rcx*2]         ; imul ecx,RGB_PIXELSIZE
        test    cl, SIZEOF_BYTE
        jz      short .column_ld2
        sub     rcx, byte SIZEOF_BYTE
        movzx   rax, BYTE [rsi+rcx]
.column_ld2:
        test    cl, SIZEOF_WORD
        jz      short .column_ld4
        sub     rcx, byte SIZEOF_WORD
        movzx   rdx, WORD [rsi+rcx]
        shl     rax, WORD_BIT
        or      rax,rdx
.column_ld4:
        movd    xmmA,eax
        pop     rdx
        pop     rax
        test    cl, SIZEOF_DWORD
        jz      short .column_ld8
        sub     rcx, byte SIZEOF_DWORD
        movd    xmmF, XMM_DWORD [rsi+rcx]
        pslldq  xmmA, SIZEOF_DWORD
        por     xmmA,xmmF
.column_ld8:
        test    cl, SIZEOF_MMWORD
        jz      short .column_ld16
        sub     rcx, byte SIZEOF_MMWORD
        movq    xmmB, XMM_MMWORD [rsi+rcx]
        pslldq  xmmA, SIZEOF_MMWORD
        por     xmmA,xmmB
.column_ld16:
        test    cl, SIZEOF_XMMWORD
        jz      short .column_ld32
        movdqa  xmmF,xmmA
        movdqu  xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
        mov     rcx, SIZEOF_XMMWORD     ; treat the tail as one full block
        jmp     short .rgb_gray_cnv
.column_ld32:
        test    cl, 2*SIZEOF_XMMWORD
        mov     rcx, SIZEOF_XMMWORD     ; mov leaves the flags from test intact
        jz      short .rgb_gray_cnv
        movdqa  xmmB,xmmA
        movdqu  xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
        movdqu  xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
        jmp     short .rgb_gray_cnv

.columnloop:
        movdqu  xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
        movdqu  xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
        movdqu  xmmB, XMMWORD [rsi+2*SIZEOF_XMMWORD]

.rgb_gray_cnv:
        ; De-interleave 16 three-byte pixels into per-component even/odd
        ; word vectors.  In the lane diagrams the first digit is the
        ; component index and the second the pixel index.
        ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
        ; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
        ; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)

        movdqa    xmmG,xmmA
        pslldq    xmmA,8        ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12)
        psrldq    xmmG,8        ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --)

        punpckhbw xmmA,xmmF     ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A)
        pslldq    xmmF,8        ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27)

        punpcklbw xmmG,xmmB     ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D)
        punpckhbw xmmF,xmmB     ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F)

        movdqa    xmmD,xmmA
        pslldq    xmmA,8        ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09)
        psrldq    xmmD,8        ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --)

        punpckhbw xmmA,xmmG     ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D)
        pslldq    xmmG,8        ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B)

        punpcklbw xmmD,xmmF     ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E)
        punpckhbw xmmG,xmmF     ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F)

        movdqa    xmmE,xmmA
        pslldq    xmmA,8        ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C)
        psrldq    xmmE,8        ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --)

        punpckhbw xmmA,xmmD     ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
        pslldq    xmmD,8        ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D)

        punpcklbw xmmE,xmmG     ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F)
        punpckhbw xmmD,xmmG     ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F)

        pxor      xmmH,xmmH     ; zero register used to widen bytes to words

        movdqa    xmmC,xmmA
        punpcklbw xmmA,xmmH     ; xmmA=(00 02 04 06 08 0A 0C 0E)
        punpckhbw xmmC,xmmH     ; xmmC=(10 12 14 16 18 1A 1C 1E)

        movdqa    xmmB,xmmE
        punpcklbw xmmE,xmmH     ; xmmE=(20 22 24 26 28 2A 2C 2E)
        punpckhbw xmmB,xmmH     ; xmmB=(01 03 05 07 09 0B 0D 0F)

        movdqa    xmmF,xmmD
        punpcklbw xmmD,xmmH     ; xmmD=(11 13 15 17 19 1B 1D 1F)
        punpckhbw xmmF,xmmH     ; xmmF=(21 23 25 27 29 2B 2D 2F)

%else ; RGB_PIXELSIZE == 4 ; -----------

        ; Partial-block load: fewer than SIZEOF_XMMWORD pixels remain.  The
        ; remaining pixel count is consumed from its low bits upward so that
        ; only pixels inside the row are read.
.column_ld1:
        test    cl, SIZEOF_XMMWORD/16
        jz      short .column_ld2
        sub     rcx, byte SIZEOF_XMMWORD/16
        movd    xmmA, XMM_DWORD [rsi+rcx*RGB_PIXELSIZE]
.column_ld2:
        test    cl, SIZEOF_XMMWORD/8
        jz      short .column_ld4
        sub     rcx, byte SIZEOF_XMMWORD/8
        movq    xmmE, XMM_MMWORD [rsi+rcx*RGB_PIXELSIZE]
        pslldq  xmmA, SIZEOF_MMWORD
        por     xmmA,xmmE
.column_ld4:
        test    cl, SIZEOF_XMMWORD/4
        jz      short .column_ld8
        sub     rcx, byte SIZEOF_XMMWORD/4
        movdqa  xmmE,xmmA
        movdqu  xmmA, XMMWORD [rsi+rcx*RGB_PIXELSIZE]
.column_ld8:
        test    cl, SIZEOF_XMMWORD/2
        mov     rcx, SIZEOF_XMMWORD     ; mov leaves the flags from test intact
        jz      short .rgb_gray_cnv
        movdqa  xmmF,xmmA
        movdqa  xmmH,xmmE
        movdqu  xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
        movdqu  xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
        jmp     short .rgb_gray_cnv

.columnloop:
        movdqu  xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
        movdqu  xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
        movdqu  xmmF, XMMWORD [rsi+2*SIZEOF_XMMWORD]
        movdqu  xmmH, XMMWORD [rsi+3*SIZEOF_XMMWORD]

.rgb_gray_cnv:
        ; De-interleave 16 four-byte pixels into per-component even/odd
        ; word vectors.  In the lane diagrams the first digit is the
        ; component index and the second the pixel index.
        ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
        ; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
        ; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
        ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)

        movdqa    xmmD,xmmA
        punpcklbw xmmA,xmmE     ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35)
        punpckhbw xmmD,xmmE     ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37)

        movdqa    xmmC,xmmF
        punpcklbw xmmF,xmmH     ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D)
        punpckhbw xmmC,xmmH     ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F)

        movdqa    xmmB,xmmA
        punpcklwd xmmA,xmmF     ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C)
        punpckhwd xmmB,xmmF     ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D)

        movdqa    xmmG,xmmD
        punpcklwd xmmD,xmmC     ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E)
        punpckhwd xmmG,xmmC     ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F)

        movdqa    xmmE,xmmA
        punpcklbw xmmA,xmmD     ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
        punpckhbw xmmE,xmmD     ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E)

        movdqa    xmmH,xmmB
        punpcklbw xmmB,xmmG     ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F)
        punpckhbw xmmH,xmmG     ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F)

        pxor      xmmF,xmmF     ; zero register used to widen bytes to words

        movdqa    xmmC,xmmA
        punpcklbw xmmA,xmmF     ; xmmA=(00 02 04 06 08 0A 0C 0E)
        punpckhbw xmmC,xmmF     ; xmmC=(10 12 14 16 18 1A 1C 1E)

        movdqa    xmmD,xmmB
        punpcklbw xmmB,xmmF     ; xmmB=(01 03 05 07 09 0B 0D 0F)
        punpckhbw xmmD,xmmF     ; xmmD=(11 13 15 17 19 1B 1D 1F)

        movdqa    xmmG,xmmE
        punpcklbw xmmE,xmmF     ; xmmE=(20 22 24 26 28 2A 2C 2E)
        punpckhbw xmmG,xmmF     ; xmmG=(30 32 34 36 38 3A 3C 3E)

        ; No zero register is left for the last pair: interleave so each
        ; source byte lands in the high half of a word, then shift it down.
        punpcklbw xmmF,xmmH
        punpckhbw xmmH,xmmH
        psrlw     xmmF,BYTE_BIT ; xmmF=(21 23 25 27 29 2B 2D 2F)
        psrlw     xmmH,BYTE_BIT ; xmmH=(31 33 35 37 39 3B 3D 3F)

%endif ; RGB_PIXELSIZE ; ---------------

        ; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE
        ; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO

        ; (Original)
        ; Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
        ;
        ; (This implementation)
        ; Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
        ;
        ; The G term is split so that each pmaddwd handles one (R,G) or
        ; (B,G) word pair against a two-coefficient constant.

        movdqa    xmm6,xmm1
        punpcklwd xmm1,xmm3
        punpckhwd xmm6,xmm3
        pmaddwd   xmm1,[rel PW_F0299_F0337] ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337)
        pmaddwd   xmm6,[rel PW_F0299_F0337] ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337)

        movdqa    xmm7, xmm6    ; xmm7=ROH*FIX(0.299)+GOH*FIX(0.337)

        movdqa    xmm6,xmm0
        punpcklwd xmm0,xmm2
        punpckhwd xmm6,xmm2
        pmaddwd   xmm0,[rel PW_F0299_F0337] ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337)
        pmaddwd   xmm6,[rel PW_F0299_F0337] ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337)

        ; Spill the even-pixel partial sums; xmm0/xmm6 are reused below.
        movdqa    XMMWORD [wk(0)], xmm0 ; wk(0)=REL*FIX(0.299)+GEL*FIX(0.337)
        movdqa    XMMWORD [wk(1)], xmm6 ; wk(1)=REH*FIX(0.299)+GEH*FIX(0.337)

        movdqa    xmm0, xmm5    ; xmm0=BO
        movdqa    xmm6, xmm4    ; xmm6=BE

        movdqa    xmm4,xmm0
        punpcklwd xmm0,xmm3
        punpckhwd xmm4,xmm3
        pmaddwd   xmm0,[rel PW_F0114_F0250] ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250)
        pmaddwd   xmm4,[rel PW_F0114_F0250] ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250)

        movdqa    xmm3,[rel PD_ONEHALF] ; xmm3=[PD_ONEHALF]

        paddd     xmm0, xmm1
        paddd     xmm4, xmm7
        paddd     xmm0,xmm3             ; add rounding term before the shift
        paddd     xmm4,xmm3
        psrld     xmm0,SCALEBITS        ; xmm0=YOL
        psrld     xmm4,SCALEBITS        ; xmm4=YOH
        packssdw  xmm0,xmm4             ; xmm0=YO

        movdqa    xmm4,xmm6
        punpcklwd xmm6,xmm2
        punpckhwd xmm4,xmm2
        pmaddwd   xmm6,[rel PW_F0114_F0250] ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250)
        pmaddwd   xmm4,[rel PW_F0114_F0250] ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250)

        movdqa    xmm2,[rel PD_ONEHALF] ; xmm2=[PD_ONEHALF]

        paddd     xmm6, XMMWORD [wk(0)]
        paddd     xmm4, XMMWORD [wk(1)]
        paddd     xmm6,xmm2             ; add rounding term before the shift
        paddd     xmm4,xmm2
        psrld     xmm6,SCALEBITS        ; xmm6=YEL
        psrld     xmm4,SCALEBITS        ; xmm4=YEH
        packssdw  xmm6,xmm4             ; xmm6=YE

        ; Re-interleave: odd-pixel Y goes to the high byte of each word,
        ; even-pixel Y stays in the low byte.
        psllw     xmm0,BYTE_BIT
        por       xmm6,xmm0             ; xmm6=Y
        movdqa    XMMWORD [rdi], xmm6   ; Save Y

        sub     rcx, byte SIZEOF_XMMWORD
        add     rsi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; inptr
        add     rdi, byte SIZEOF_XMMWORD                ; outptr0
        cmp     rcx, byte SIZEOF_XMMWORD
        jae     near .columnloop
        test    rcx,rcx
        jnz     near .column_ld1        ; 1..SIZEOF_XMMWORD-1 columns left

        pop     rcx                     ; col
        pop     rsi
        pop     rdi

        add     rsi, byte SIZEOF_JSAMPROW       ; input_buf
        add     rdi, byte SIZEOF_JSAMPROW
        dec     rax                             ; num_rows
        jg      near .rowloop

.return:
        pop     rbx
        uncollect_args
        mov     rsp,rbp         ; rsp <- aligned rbp
        pop     rsp             ; rsp <- unaligned rsp saved at [rbp] in the prologue
        pop     rbp
        ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
        align   16
