;
; jcgryext.asm - grayscale colorspace conversion (64-bit SSE2)
;
; Copyright (C) 2011, 2016, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler); it can *not*
; be assembled with Microsoft's MASM or any compatible assembler (including
; Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208

%include "jcolsamp.inc"

; --------------------------------------------------------------------------
;
; Convert some rows of samples to the output colorspace.
;
; GLOBAL(void)
; jsimd_rgb_gray_convert_sse2(JDIMENSION img_width, JSAMPARRAY input_buf,
;                             JSAMPIMAGE output_buf, JDIMENSION output_row,
;                             int num_rows);
;
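; Note: this routine writes only the Y (luma) plane; num_rows rows of packed
; RGB samples from input_buf are converted and stored to output_buf[0],
; starting at row output_row.
;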

; r10d = JDIMENSION img_width
; r11 = JSAMPARRAY input_buf
; r12 = JSAMPIMAGE output_buf
; r13d = JDIMENSION output_row
; r14d = int num_rows
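;
; (collect_args, defined in jsimdext.inc, copies the first five arguments from
; the platform calling convention (System V or Microsoft x64) into r10-r14,
; which is why the register assignments above are ABI-independent.)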

%define wk(i)   rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD  ; xmmword wk[WK_NUM]
%define WK_NUM  2
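; wk(i) addresses one of WK_NUM 16-byte xmmword temporaries kept just below
; the aligned frame pointer; the prologue reserves this space by pointing rsp
; at wk(0).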

    align       32
    GLOBAL_FUNCTION(jsimd_rgb_gray_convert_sse2)

EXTN(jsimd_rgb_gray_convert_sse2):
    push        rbp
    mov         rax, rsp                     ; rax = original rbp
    sub         rsp, byte 4
    and         rsp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
    mov         [rsp], rax
    mov         rbp, rsp                     ; rbp = aligned rbp
    lea         rsp, [wk(0)]
    collect_args 5
    push        rbx

    mov         ecx, r10d               ; rcx = img_width
    test        rcx, rcx
    jz          near .return

    push        rcx

    mov         rsi, r12
    mov         ecx, r13d
    mov         rdi, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY]
    lea         rdi, [rdi+rcx*SIZEOF_JSAMPROW]  ; rdi = &output_buf[0][output_row]

    pop         rcx

    mov         rsi, r11                ; rsi = input_buf
    mov         eax, r14d               ; rax = num_rows
    test        rax, rax
    jle         near .return
.rowloop:
    push        rdi
    push        rsi
    push        rcx                     ; col

    mov         rsi, JSAMPROW [rsi]     ; inptr
    mov         rdi, JSAMPROW [rdi]     ; outptr0

    cmp         rcx, byte SIZEOF_XMMWORD
    jae         near .columnloop

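    ; The .column_ld* blocks handle a partial group of fewer than
    ; SIZEOF_XMMWORD pixels: the remaining samples are gathered back to front
    ; with progressively larger unaligned loads, selected by the bits of the
    ; remaining count, so that no load reads past the end of the row.  In the
    ; register diagrams below, each two-digit hex value is
    ; (component index within the pixel)(pixel index); "--" marks a
    ; don't-care byte.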
%if RGB_PIXELSIZE == 3  ; ---------------

.column_ld1:
    push        rax
    push        rdx
    lea         rcx, [rcx+rcx*2]        ; imul ecx,RGB_PIXELSIZE
    test        cl, SIZEOF_BYTE
    jz          short .column_ld2
    sub         rcx, byte SIZEOF_BYTE
    movzx       rax, byte [rsi+rcx]
.column_ld2:
    test        cl, SIZEOF_WORD
    jz          short .column_ld4
    sub         rcx, byte SIZEOF_WORD
    movzx       rdx, word [rsi+rcx]
    shl         rax, WORD_BIT
    or          rax, rdx
.column_ld4:
    movd        xmmA, eax
    pop         rdx
    pop         rax
    test        cl, SIZEOF_DWORD
    jz          short .column_ld8
    sub         rcx, byte SIZEOF_DWORD
    movd        xmmF, XMM_DWORD [rsi+rcx]
    pslldq      xmmA, SIZEOF_DWORD
    por         xmmA, xmmF
.column_ld8:
    test        cl, SIZEOF_MMWORD
    jz          short .column_ld16
    sub         rcx, byte SIZEOF_MMWORD
    movq        xmmB, XMM_MMWORD [rsi+rcx]
    pslldq      xmmA, SIZEOF_MMWORD
    por         xmmA, xmmB
.column_ld16:
    test        cl, SIZEOF_XMMWORD
    jz          short .column_ld32
    movdqa      xmmF, xmmA
    movdqu      xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    mov         rcx, SIZEOF_XMMWORD
    jmp         short .rgb_gray_cnv
.column_ld32:
    test        cl, 2*SIZEOF_XMMWORD
    mov         rcx, SIZEOF_XMMWORD
    jz          short .rgb_gray_cnv
    movdqa      xmmB, xmmA
    movdqu      xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    movdqu      xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
    jmp         short .rgb_gray_cnv

.columnloop:
    movdqu      xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    movdqu      xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
    movdqu      xmmB, XMMWORD [rsi+2*SIZEOF_XMMWORD]

.rgb_gray_cnv:
    ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
    ; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
    ; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)

    movdqa      xmmG, xmmA
    pslldq      xmmA, 8     ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12)
    psrldq      xmmG, 8     ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --)

    punpckhbw   xmmA, xmmF  ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A)
    pslldq      xmmF, 8     ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27)

    punpcklbw   xmmG, xmmB  ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D)
    punpckhbw   xmmF, xmmB  ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F)

    movdqa      xmmD, xmmA
    pslldq      xmmA, 8     ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09)
    psrldq      xmmD, 8     ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --)

    punpckhbw   xmmA, xmmG  ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D)
    pslldq      xmmG, 8     ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B)

    punpcklbw   xmmD, xmmF  ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E)
    punpckhbw   xmmG, xmmF  ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F)

    movdqa      xmmE, xmmA
    pslldq      xmmA, 8     ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C)
    psrldq      xmmE, 8     ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --)

    punpckhbw   xmmA, xmmD  ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
    pslldq      xmmD, 8     ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D)

    punpcklbw   xmmE, xmmG  ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F)
    punpckhbw   xmmD, xmmG  ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F)

    pxor        xmmH, xmmH

    movdqa      xmmC, xmmA
    punpcklbw   xmmA, xmmH  ; xmmA=(00 02 04 06 08 0A 0C 0E)
    punpckhbw   xmmC, xmmH  ; xmmC=(10 12 14 16 18 1A 1C 1E)

    movdqa      xmmB, xmmE
    punpcklbw   xmmE, xmmH  ; xmmE=(20 22 24 26 28 2A 2C 2E)
    punpckhbw   xmmB, xmmH  ; xmmB=(01 03 05 07 09 0B 0D 0F)

    movdqa      xmmF, xmmD
    punpcklbw   xmmD, xmmH  ; xmmD=(11 13 15 17 19 1B 1D 1F)
    punpckhbw   xmmF, xmmH  ; xmmF=(21 23 25 27 29 2B 2D 2F)

%else  ; RGB_PIXELSIZE == 4 ; -----------

.column_ld1:
    test        cl, SIZEOF_XMMWORD/16
    jz          short .column_ld2
    sub         rcx, byte SIZEOF_XMMWORD/16
    movd        xmmA, XMM_DWORD [rsi+rcx*RGB_PIXELSIZE]
.column_ld2:
    test        cl, SIZEOF_XMMWORD/8
    jz          short .column_ld4
    sub         rcx, byte SIZEOF_XMMWORD/8
    movq        xmmE, XMM_MMWORD [rsi+rcx*RGB_PIXELSIZE]
    pslldq      xmmA, SIZEOF_MMWORD
    por         xmmA, xmmE
.column_ld4:
    test        cl, SIZEOF_XMMWORD/4
    jz          short .column_ld8
    sub         rcx, byte SIZEOF_XMMWORD/4
    movdqa      xmmE, xmmA
    movdqu      xmmA, XMMWORD [rsi+rcx*RGB_PIXELSIZE]
.column_ld8:
    test        cl, SIZEOF_XMMWORD/2
    mov         rcx, SIZEOF_XMMWORD
    jz          short .rgb_gray_cnv
    movdqa      xmmF, xmmA
    movdqa      xmmH, xmmE
    movdqu      xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    movdqu      xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
    jmp         short .rgb_gray_cnv

.columnloop:
    movdqu      xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    movdqu      xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
    movdqu      xmmF, XMMWORD [rsi+2*SIZEOF_XMMWORD]
    movdqu      xmmH, XMMWORD [rsi+3*SIZEOF_XMMWORD]

.rgb_gray_cnv:
    ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
    ; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
    ; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
    ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)

    movdqa      xmmD, xmmA
    punpcklbw   xmmA, xmmE      ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35)
    punpckhbw   xmmD, xmmE      ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37)

    movdqa      xmmC, xmmF
    punpcklbw   xmmF, xmmH      ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D)
    punpckhbw   xmmC, xmmH      ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F)

    movdqa      xmmB, xmmA
    punpcklwd   xmmA, xmmF      ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C)
    punpckhwd   xmmB, xmmF      ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D)

    movdqa      xmmG, xmmD
    punpcklwd   xmmD, xmmC      ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E)
    punpckhwd   xmmG, xmmC      ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F)

    movdqa      xmmE, xmmA
    punpcklbw   xmmA, xmmD      ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
    punpckhbw   xmmE, xmmD      ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E)

    movdqa      xmmH, xmmB
    punpcklbw   xmmB, xmmG      ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F)
    punpckhbw   xmmH, xmmG      ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F)

    pxor        xmmF, xmmF

    movdqa      xmmC, xmmA
    punpcklbw   xmmA, xmmF      ; xmmA=(00 02 04 06 08 0A 0C 0E)
    punpckhbw   xmmC, xmmF      ; xmmC=(10 12 14 16 18 1A 1C 1E)

    movdqa      xmmD, xmmB
    punpcklbw   xmmB, xmmF      ; xmmB=(01 03 05 07 09 0B 0D 0F)
    punpckhbw   xmmD, xmmF      ; xmmD=(11 13 15 17 19 1B 1D 1F)

    movdqa      xmmG, xmmE
    punpcklbw   xmmE, xmmF      ; xmmE=(20 22 24 26 28 2A 2C 2E)
    punpckhbw   xmmG, xmmF      ; xmmG=(30 32 34 36 38 3A 3C 3E)

    punpcklbw   xmmF, xmmH
    punpckhbw   xmmH, xmmH
    psrlw       xmmF, BYTE_BIT  ; xmmF=(21 23 25 27 29 2B 2D 2F)
    psrlw       xmmH, BYTE_BIT  ; xmmH=(31 33 35 37 39 3B 3D 3F)

%endif  ; RGB_PIXELSIZE ; ---------------

    ; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE
    ; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO

    ; (Original)
    ; Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
    ;
    ; (This implementation)
    ; Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
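    ;
    ; (The 0.58700 G coefficient is split into 0.33700 + 0.25000 because the
    ; products are formed with pmaddwd on signed 16-bit words: assuming the
    ; usual SCALEBITS = 16 fixed-point scale, FIX(0.58700) = 38470 does not
    ; fit in a signed word, while FIX(0.33700) = 22086 and FIX(0.25000) =
    ; 16384 both do.  PD_ONEHALF is added below so that the final right shift
    ; by SCALEBITS rounds to nearest.)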

    movdqa      xmm6, xmm1
    punpcklwd   xmm1, xmm3
    punpckhwd   xmm6, xmm3
    pmaddwd     xmm1, [rel PW_F0299_F0337]  ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337)
    pmaddwd     xmm6, [rel PW_F0299_F0337]  ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337)

    movdqa      xmm7, xmm6              ; xmm7=ROH*FIX(0.299)+GOH*FIX(0.337)

    movdqa      xmm6, xmm0
    punpcklwd   xmm0, xmm2
    punpckhwd   xmm6, xmm2
    pmaddwd     xmm0, [rel PW_F0299_F0337]  ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337)
    pmaddwd     xmm6, [rel PW_F0299_F0337]  ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337)

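    ; Park the even-pixel partial sums in the wk() temporaries; all eight SSE2
    ; registers are needed for the B/G products below, so these two values are
    ; reloaded from memory once those products are ready.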
    movdqa      XMMWORD [wk(0)], xmm0   ; wk(0)=REL*FIX(0.299)+GEL*FIX(0.337)
    movdqa      XMMWORD [wk(1)], xmm6   ; wk(1)=REH*FIX(0.299)+GEH*FIX(0.337)

    movdqa      xmm0, xmm5              ; xmm0=BO
    movdqa      xmm6, xmm4              ; xmm6=BE

    movdqa      xmm4, xmm0
    punpcklwd   xmm0, xmm3
    punpckhwd   xmm4, xmm3
    pmaddwd     xmm0, [rel PW_F0114_F0250]  ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250)
    pmaddwd     xmm4, [rel PW_F0114_F0250]  ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250)

    movdqa      xmm3, [rel PD_ONEHALF]      ; xmm3=[PD_ONEHALF]

    paddd       xmm0, xmm1
    paddd       xmm4, xmm7
    paddd       xmm0, xmm3
    paddd       xmm4, xmm3
    psrld       xmm0, SCALEBITS         ; xmm0=YOL
    psrld       xmm4, SCALEBITS         ; xmm4=YOH
    packssdw    xmm0, xmm4              ; xmm0=YO

    movdqa      xmm4, xmm6
    punpcklwd   xmm6, xmm2
    punpckhwd   xmm4, xmm2
    pmaddwd     xmm6, [rel PW_F0114_F0250]  ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250)
    pmaddwd     xmm4, [rel PW_F0114_F0250]  ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250)

    movdqa      xmm2, [rel PD_ONEHALF]      ; xmm2=[PD_ONEHALF]

    paddd       xmm6, XMMWORD [wk(0)]
    paddd       xmm4, XMMWORD [wk(1)]
    paddd       xmm6, xmm2
    paddd       xmm4, xmm2
    psrld       xmm6, SCALEBITS         ; xmm6=YEL
    psrld       xmm4, SCALEBITS         ; xmm4=YEH
    packssdw    xmm6, xmm4              ; xmm6=YE

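    ; YE holds the even-numbered pixels' luma in the low byte of each word and
    ; YO the odd-numbered pixels'; shifting YO left by one byte and ORing the
    ; two interleaves them into 16 consecutive Y samples in pixel order.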
    psllw       xmm0, BYTE_BIT
    por         xmm6, xmm0              ; xmm6=Y
    movdqa      XMMWORD [rdi], xmm6     ; Save Y

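    ; Advance to the next 16-pixel column group; if fewer than SIZEOF_XMMWORD
    ; pixels (but more than zero) remain, finish the row through the partial
    ; load path at .column_ld1.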
    sub         rcx, byte SIZEOF_XMMWORD
    add         rsi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; inptr
    add         rdi, byte SIZEOF_XMMWORD                ; outptr0
    cmp         rcx, byte SIZEOF_XMMWORD
    jae         near .columnloop
    test        rcx, rcx
    jnz         near .column_ld1

    pop         rcx                     ; col
    pop         rsi
    pop         rdi

    add         rsi, byte SIZEOF_JSAMPROW  ; input_buf
    add         rdi, byte SIZEOF_JSAMPROW
    dec         rax                        ; num_rows
    jg          near .rowloop

.return:
    pop         rbx
    uncollect_args 5
    mov         rsp, rbp                ; rsp <- aligned rbp
    pop         rsp                     ; rsp <- original rbp
    pop         rbp
    ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
    align       32