/*
 * Loongson MMI optimizations for libjpeg-turbo
 *
 * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
 * Copyright (C) 2014-2015, 2019, D. R. Commander.  All Rights Reserved.
 * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing.
 *                          All Rights Reserved.
 * Authors:  ZhuChen     <zhuchen@loongson.cn>
 *           SunZhangzhi <sunzhangzhi-cq@loongson.cn>
 *           CaiWanwei   <caiwanwei@loongson.cn>
 *           ZhangLixia  <zhanglixia-hf@loongson.cn>
 *
 * Based on the x86 SIMD extension for IJG JPEG library
 * Copyright (C) 1999-2006, MIYASAKA Masaru.
 *
 * This software is provided 'as-is', without any express or implied
 * warranty.  In no event will the authors be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 *    claim that you wrote the original software. If you use this software
 *    in a product, an acknowledgment in the product documentation would be
 *    appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 *    misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
 */

/* This file is included by jccolor-mmi.c */

36 #if RGB_RED == 0
37 #define mmA  re
38 #define mmB  ro
39 #elif RGB_GREEN == 0
40 #define mmA  ge
41 #define mmB  go
42 #elif RGB_BLUE == 0
43 #define mmA  be
44 #define mmB  bo
45 #else
46 #define mmA  xe
47 #define mmB  xo
48 #endif
49 
50 #if RGB_RED == 1
51 #define mmC  re
52 #define mmD  ro
53 #elif RGB_GREEN == 1
54 #define mmC  ge
55 #define mmD  go
56 #elif RGB_BLUE == 1
57 #define mmC  be
58 #define mmD  bo
59 #else
60 #define mmC  xe
61 #define mmD  xo
62 #endif
63 
64 #if RGB_RED == 2
65 #define mmE  re
66 #define mmF  ro
67 #elif RGB_GREEN == 2
68 #define mmE  ge
69 #define mmF  go
70 #elif RGB_BLUE == 2
71 #define mmE  be
72 #define mmF  bo
73 #else
74 #define mmE  xe
75 #define mmF  xo
76 #endif
77 
78 #if RGB_RED == 3
79 #define mmG  re
80 #define mmH  ro
81 #elif RGB_GREEN == 3
82 #define mmG  ge
83 #define mmH  go
84 #elif RGB_BLUE == 3
85 #define mmG  be
86 #define mmH  bo
87 #else
88 #define mmG  xe
89 #define mmH  xo
90 #endif
jsimd_rgb_ycc_convert_mmi(JDIMENSION image_width,JSAMPARRAY input_buf,JSAMPIMAGE output_buf,JDIMENSION output_row,int num_rows)93 void jsimd_rgb_ycc_convert_mmi(JDIMENSION image_width, JSAMPARRAY input_buf,
94                                JSAMPIMAGE output_buf, JDIMENSION output_row,
95                                int num_rows)
96 {
97   JSAMPROW inptr, outptr0, outptr1, outptr2;
98   int num_cols, col;
99   __m64 re, ro, ge, go, be, bo, xe;
100 #if RGB_PIXELSIZE == 4
101   __m64 xo;
102 #endif
103   __m64 rgle, rghe, rglo, rgho, bgle, bghe, bglo, bgho;
104   __m64 ble, halfble, bhe, halfbhe, blo, halfblo, bho, halfbho;
105   __m64 rle, halfrle, rhe, halfrhe, rlo, halfrlo, rho, halfrho;
106   __m64 yle_rg, yhe_rg, yle_bg, yhe_bg, yle, yhe, ye;
107   __m64 ylo_rg, yho_rg, ylo_bg, yho_bg, ylo, yho, yo, y;
108   __m64 cble, cbhe, cbe, cblo, cbho, cbo, cb;
109   __m64 crle, crhe, cre, crlo, crho, cro, cr;
110 
111   while (--num_rows >= 0) {
112     inptr = *input_buf++;
113     outptr0 = output_buf[0][output_row];
114     outptr1 = output_buf[1][output_row];
115     outptr2 = output_buf[2][output_row];
116     output_row++;
117 
118     for (num_cols = image_width; num_cols > 0; num_cols -= 8,
119          outptr0 += 8, outptr1 += 8, outptr2 += 8) {
120 
121 #if RGB_PIXELSIZE == 3
122 
123       if (num_cols < 8) {
124         col = num_cols * 3;
125         asm(".set noreorder\r\n"
126 
127             "li       $8, 1\r\n"
128             "move     $9, %3\r\n"
129             "and      $10, $9, $8\r\n"
130             "beqz     $10, 1f\r\n"
131             "nop      \r\n"
132             "subu     $9, $9, 1\r\n"
133             "xor      $12, $12, $12\r\n"
134             "move     $13, %5\r\n"
135             PTR_ADDU  "$13, $13, $9\r\n"
136             "lbu      $12, 0($13)\r\n"
137 
138             "1:       \r\n"
139             "li       $8, 2\r\n"
140             "and      $10, $9, $8\r\n"
141             "beqz     $10, 2f\r\n"
142             "nop      \r\n"
143             "subu     $9, $9, 2\r\n"
144             "xor      $11, $11, $11\r\n"
145             "move     $13, %5\r\n"
146             PTR_ADDU  "$13, $13, $9\r\n"
147             "lhu      $11, 0($13)\r\n"
148             "sll      $12, $12, 16\r\n"
149             "or       $12, $12, $11\r\n"
150 
151             "2:       \r\n"
152             "dmtc1    $12, %0\r\n"
153             "li       $8, 4\r\n"
154             "and      $10, $9, $8\r\n"
155             "beqz     $10, 3f\r\n"
156             "nop      \r\n"
157             "subu     $9, $9, 4\r\n"
158             "move     $13, %5\r\n"
159             PTR_ADDU  "$13, $13, $9\r\n"
160             "lwu      $14, 0($13)\r\n"
161             "dmtc1    $14, %1\r\n"
162             "dsll32   $12, $12, 0\r\n"
163             "or       $12, $12, $14\r\n"
164             "dmtc1    $12, %0\r\n"
165 
166             "3:       \r\n"
167             "li       $8, 8\r\n"
168             "and      $10, $9, $8\r\n"
169             "beqz     $10, 4f\r\n"
170             "nop      \r\n"
171             "mov.s    %1, %0\r\n"
172             "ldc1     %0, 0(%5)\r\n"
173             "li       $9, 8\r\n"
174             "j        5f\r\n"
175             "nop      \r\n"
176 
177             "4:       \r\n"
178             "li       $8, 16\r\n"
179             "and      $10, $9, $8\r\n"
180             "beqz     $10, 5f\r\n"
181             "nop      \r\n"
182             "mov.s    %2, %0\r\n"
183             "ldc1     %0, 0(%5)\r\n"
184             "ldc1     %1, 8(%5)\r\n"
185 
186             "5:       \r\n"
187             "nop      \r\n"
188             ".set reorder\r\n"
189 
190             : "=f" (mmA), "=f" (mmG), "=f" (mmF)
191             : "r" (col), "r" (num_rows), "r" (inptr)
192             : "$f0", "$f2", "$f4", "$8", "$9", "$10", "$11", "$12", "$13",
193               "$14", "memory"
194            );
195       } else {
196         if (!(((long)inptr) & 7)) {
197           mmA = _mm_load_si64((__m64 *)&inptr[0]);
198           mmG = _mm_load_si64((__m64 *)&inptr[8]);
199           mmF = _mm_load_si64((__m64 *)&inptr[16]);
200         } else {
201           mmA = _mm_loadu_si64((__m64 *)&inptr[0]);
202           mmG = _mm_loadu_si64((__m64 *)&inptr[8]);
203           mmF = _mm_loadu_si64((__m64 *)&inptr[16]);
204         }
205         inptr += RGB_PIXELSIZE * 8;
206       }
207       mmD = _mm_srli_si64(mmA, 4 * BYTE_BIT);
208       mmA = _mm_slli_si64(mmA, 4 * BYTE_BIT);
209 
210       mmA = _mm_unpackhi_pi8(mmA, mmG);
211       mmG = _mm_slli_si64(mmG, 4 * BYTE_BIT);
212 
213       mmD = _mm_unpacklo_pi8(mmD, mmF);
214       mmG = _mm_unpackhi_pi8(mmG, mmF);
215 
216       mmE = _mm_srli_si64(mmA, 4 * BYTE_BIT);
217       mmA = _mm_slli_si64(mmA, 4 * BYTE_BIT);
218 
219       mmA = _mm_unpackhi_pi8(mmA, mmD);
220       mmD = _mm_slli_si64(mmD, 4 * BYTE_BIT);
221 
222       mmE = _mm_unpacklo_pi8(mmE, mmG);
223       mmD = _mm_unpackhi_pi8(mmD, mmG);
224       mmC = _mm_loadhi_pi8_f(mmA);
225       mmA = _mm_loadlo_pi8_f(mmA);
226 
227       mmB = _mm_loadhi_pi8_f(mmE);
228       mmE = _mm_loadlo_pi8_f(mmE);
229 
230       mmF = _mm_loadhi_pi8_f(mmD);
231       mmD = _mm_loadlo_pi8_f(mmD);
232 
233 #else  /* RGB_PIXELSIZE == 4 */
234 
235       if (num_cols < 8) {
236         col = num_cols;
237         asm(".set noreorder\r\n"
238 
239             "li       $8, 1\r\n"
240             "move     $9, %4\r\n"
241             "and      $10, $9, $8\r\n"
242             "beqz     $10, 1f\r\n"
243             "nop      \r\n"
244             "subu     $9, $9, 1\r\n"
245             PTR_SLL   "$11, $9, 2\r\n"
246             "move     $13, %5\r\n"
247             PTR_ADDU  "$13, $13, $11\r\n"
248             "lwc1     %0, 0($13)\r\n"
249 
250             "1:       \r\n"
251             "li       $8, 2\r\n"
252             "and      $10, $9, $8\r\n"
253             "beqz     $10, 2f\r\n"
254             "nop      \r\n"
255             "subu     $9, $9, 2\r\n"
256             PTR_SLL   "$11, $9, 2\r\n"
257             "move     $13, %5\r\n"
258             PTR_ADDU  "$13, $13, $11\r\n"
259             "mov.s    %1, %0\r\n"
260             "ldc1     %0, 0($13)\r\n"
261 
262             "2:       \r\n"
263             "li       $8, 4\r\n"
264             "and      $10, $9, $8\r\n"
265             "beqz     $10, 3f\r\n"
266             "nop      \r\n"
267             "mov.s    %2, %0\r\n"
268             "mov.s    %3, %1\r\n"
269             "ldc1     %0, 0(%5)\r\n"
270             "ldc1     %1, 8(%5)\r\n"
271 
272             "3:       \r\n"
273             "nop      \r\n"
274             ".set reorder\r\n"
275 
276             : "=f" (mmA), "=f" (mmF), "=f" (mmD), "=f" (mmC)
277             : "r" (col), "r" (inptr)
278             : "$f0", "$f2", "$8", "$9", "$10", "$11", "$13", "memory"
279            );
280       } else {
281         if (!(((long)inptr) & 7)) {
282           mmA = _mm_load_si64((__m64 *)&inptr[0]);
283           mmF = _mm_load_si64((__m64 *)&inptr[8]);
284           mmD = _mm_load_si64((__m64 *)&inptr[16]);
285           mmC = _mm_load_si64((__m64 *)&inptr[24]);
286         } else {
287           mmA = _mm_loadu_si64((__m64 *)&inptr[0]);
288           mmF = _mm_loadu_si64((__m64 *)&inptr[8]);
289           mmD = _mm_loadu_si64((__m64 *)&inptr[16]);
290           mmC = _mm_loadu_si64((__m64 *)&inptr[24]);
291         }
292         inptr += RGB_PIXELSIZE * 8;
293       }
294       mmB = _mm_unpackhi_pi8(mmA, mmF);
295       mmA = _mm_unpacklo_pi8(mmA, mmF);
296 
297       mmG = _mm_unpackhi_pi8(mmD, mmC);
298       mmD = _mm_unpacklo_pi8(mmD, mmC);
299 
300       mmE = _mm_unpackhi_pi16(mmA, mmD);
301       mmA = _mm_unpacklo_pi16(mmA, mmD);
302 
303       mmH = _mm_unpackhi_pi16(mmB, mmG);
304       mmB = _mm_unpacklo_pi16(mmB, mmG);
305 
306       mmC = _mm_loadhi_pi8_f(mmA);
307       mmA = _mm_loadlo_pi8_f(mmA);
308 
309       mmD = _mm_loadhi_pi8_f(mmB);
310       mmB = _mm_loadlo_pi8_f(mmB);
311 
312       mmG = _mm_loadhi_pi8_f(mmE);
313       mmE = _mm_loadlo_pi8_f(mmE);
314 
315       mmF = _mm_unpacklo_pi8(mmH, mmH);
316       mmH = _mm_unpackhi_pi8(mmH, mmH);
317       mmF = _mm_srli_pi16(mmF, BYTE_BIT);
318       mmH = _mm_srli_pi16(mmH, BYTE_BIT);
319 
320 #endif
321 
322       /* re=(R0 R2 R4 R6), ge=(G0 G2 G4 G6), be=(B0 B2 B4 B6)
323        * ro=(R1 R3 R5 R7), go=(G1 G3 G5 G7), bo=(B1 B3 B5 B7)
324        *
325        * (Original)
326        * Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
327        * Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
328        * Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
329        *
330        * (This implementation)
331        * Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
332        * Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
333        * Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
334        */
335 
336       rglo = _mm_unpacklo_pi16(ro, go);
337       rgho = _mm_unpackhi_pi16(ro, go);
338       ylo_rg = _mm_madd_pi16(rglo, PW_F0299_F0337);
339       yho_rg = _mm_madd_pi16(rgho, PW_F0299_F0337);
340       cblo = _mm_madd_pi16(rglo, PW_MF016_MF033);
341       cbho = _mm_madd_pi16(rgho, PW_MF016_MF033);
342 
343       blo = _mm_loadlo_pi16_f(bo);
344       bho = _mm_loadhi_pi16_f(bo);
345       halfblo = _mm_srli_pi32(blo, 1);
346       halfbho = _mm_srli_pi32(bho, 1);
347 
348       cblo = _mm_add_pi32(cblo, halfblo);
349       cbho = _mm_add_pi32(cbho, halfbho);
350       cblo = _mm_add_pi32(cblo, PD_ONEHALFM1_CJ);
351       cbho = _mm_add_pi32(cbho, PD_ONEHALFM1_CJ);
352       cblo = _mm_srli_pi32(cblo, SCALEBITS);
353       cbho = _mm_srli_pi32(cbho, SCALEBITS);
354       cbo = _mm_packs_pi32(cblo, cbho);
355 
356       rgle = _mm_unpacklo_pi16(re, ge);
357       rghe = _mm_unpackhi_pi16(re, ge);
358       yle_rg = _mm_madd_pi16(rgle, PW_F0299_F0337);
359       yhe_rg = _mm_madd_pi16(rghe, PW_F0299_F0337);
360       cble = _mm_madd_pi16(rgle, PW_MF016_MF033);
361       cbhe = _mm_madd_pi16(rghe, PW_MF016_MF033);
362 
363       ble = _mm_loadlo_pi16_f(be);
364       bhe = _mm_loadhi_pi16_f(be);
365       halfble = _mm_srli_pi32(ble, 1);
366       halfbhe = _mm_srli_pi32(bhe, 1);
367 
368       cble = _mm_add_pi32(cble, halfble);
369       cbhe = _mm_add_pi32(cbhe, halfbhe);
370       cble = _mm_add_pi32(cble, PD_ONEHALFM1_CJ);
371       cbhe = _mm_add_pi32(cbhe, PD_ONEHALFM1_CJ);
372       cble = _mm_srli_pi32(cble, SCALEBITS);
373       cbhe = _mm_srli_pi32(cbhe, SCALEBITS);
374       cbe = _mm_packs_pi32(cble, cbhe);
375 
376       cbo = _mm_slli_pi16(cbo, BYTE_BIT);
377       cb = _mm_or_si64(cbe, cbo);
378 
379       bglo = _mm_unpacklo_pi16(bo, go);
380       bgho = _mm_unpackhi_pi16(bo, go);
381       ylo_bg = _mm_madd_pi16(bglo, PW_F0114_F0250);
382       yho_bg = _mm_madd_pi16(bgho, PW_F0114_F0250);
383       crlo = _mm_madd_pi16(bglo, PW_MF008_MF041);
384       crho = _mm_madd_pi16(bgho, PW_MF008_MF041);
385 
386       ylo = _mm_add_pi32(ylo_bg, ylo_rg);
387       yho = _mm_add_pi32(yho_bg, yho_rg);
388       ylo = _mm_add_pi32(ylo, PD_ONEHALF);
389       yho = _mm_add_pi32(yho, PD_ONEHALF);
390       ylo = _mm_srli_pi32(ylo, SCALEBITS);
391       yho = _mm_srli_pi32(yho, SCALEBITS);
392       yo = _mm_packs_pi32(ylo, yho);
393 
394       rlo = _mm_loadlo_pi16_f(ro);
395       rho = _mm_loadhi_pi16_f(ro);
396       halfrlo = _mm_srli_pi32(rlo, 1);
397       halfrho = _mm_srli_pi32(rho, 1);
398 
399       crlo = _mm_add_pi32(crlo, halfrlo);
400       crho = _mm_add_pi32(crho, halfrho);
401       crlo = _mm_add_pi32(crlo, PD_ONEHALFM1_CJ);
402       crho = _mm_add_pi32(crho, PD_ONEHALFM1_CJ);
403       crlo = _mm_srli_pi32(crlo, SCALEBITS);
404       crho = _mm_srli_pi32(crho, SCALEBITS);
405       cro = _mm_packs_pi32(crlo, crho);
406 
407       bgle = _mm_unpacklo_pi16(be, ge);
408       bghe = _mm_unpackhi_pi16(be, ge);
409       yle_bg = _mm_madd_pi16(bgle, PW_F0114_F0250);
410       yhe_bg = _mm_madd_pi16(bghe, PW_F0114_F0250);
411       crle = _mm_madd_pi16(bgle, PW_MF008_MF041);
412       crhe = _mm_madd_pi16(bghe, PW_MF008_MF041);
413 
414       yle = _mm_add_pi32(yle_bg, yle_rg);
415       yhe = _mm_add_pi32(yhe_bg, yhe_rg);
416       yle = _mm_add_pi32(yle, PD_ONEHALF);
417       yhe = _mm_add_pi32(yhe, PD_ONEHALF);
418       yle = _mm_srli_pi32(yle, SCALEBITS);
419       yhe = _mm_srli_pi32(yhe, SCALEBITS);
420       ye = _mm_packs_pi32(yle, yhe);
421 
422       yo = _mm_slli_pi16(yo, BYTE_BIT);
423       y = _mm_or_si64(ye, yo);
424 
425       rle = _mm_loadlo_pi16_f(re);
426       rhe = _mm_loadhi_pi16_f(re);
427       halfrle = _mm_srli_pi32(rle, 1);
428       halfrhe = _mm_srli_pi32(rhe, 1);
429 
430       crle = _mm_add_pi32(crle, halfrle);
431       crhe = _mm_add_pi32(crhe, halfrhe);
432       crle = _mm_add_pi32(crle, PD_ONEHALFM1_CJ);
433       crhe = _mm_add_pi32(crhe, PD_ONEHALFM1_CJ);
434       crle = _mm_srli_pi32(crle, SCALEBITS);
435       crhe = _mm_srli_pi32(crhe, SCALEBITS);
436       cre = _mm_packs_pi32(crle, crhe);
437 
438       cro = _mm_slli_pi16(cro, BYTE_BIT);
439       cr = _mm_or_si64(cre, cro);
440 
441       _mm_store_si64((__m64 *)&outptr0[0], y);
442       _mm_store_si64((__m64 *)&outptr1[0], cb);
443       _mm_store_si64((__m64 *)&outptr2[0], cr);
444     }
445   }
446 }
448 #undef mmA
449 #undef mmB
450 #undef mmC
451 #undef mmD
452 #undef mmE
453 #undef mmF
454 #undef mmG
455 #undef mmH
456