1 /*
2 * Loongson MMI optimizations for libjpeg-turbo
3 *
4 * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
5 * Copyright (C) 2014-2015, 2019, D. R. Commander. All Rights Reserved.
6 * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing.
7 * All Rights Reserved.
8 * Authors: ZhuChen <zhuchen@loongson.cn>
9 * SunZhangzhi <sunzhangzhi-cq@loongson.cn>
10 * CaiWanwei <caiwanwei@loongson.cn>
11 * ZhangLixia <zhanglixia-hf@loongson.cn>
12 *
13 * Based on the x86 SIMD extension for IJG JPEG library
14 * Copyright (C) 1999-2006, MIYASAKA Masaru.
15 *
16 * This software is provided 'as-is', without any express or implied
17 * warranty. In no event will the authors be held liable for any damages
18 * arising from the use of this software.
19 *
20 * Permission is granted to anyone to use this software for any purpose,
21 * including commercial applications, and to alter it and redistribute it
22 * freely, subject to the following restrictions:
23 *
24 * 1. The origin of this software must not be misrepresented; you must not
25 * claim that you wrote the original software. If you use this software
26 * in a product, an acknowledgment in the product documentation would be
27 * appreciated but is not required.
28 * 2. Altered source versions must be plainly marked as such, and must not be
29 * misrepresented as being the original software.
30 * 3. This notice may not be removed or altered from any source distribution.
31 */
32
33 /* This file is included by jccolor-mmi.c */
34
35
/* Component-order abstraction: map the generic names mmA-mmH onto the
 * even/odd (e/o) red, green, blue, and filler (x) registers according to
 * the byte position each component occupies within a pixel.  RGB_RED,
 * RGB_GREEN, RGB_BLUE, and RGB_PIXELSIZE are defined by the including
 * file (jccolor-mmi.c) once per supported pixel format, so mmA/mmB alias
 * whichever component is stored at byte offset 0, mmC/mmD the component
 * at offset 1, and so on.  This lets the de-interleaving code below be
 * written once for every RGB byte order.
 */
#if RGB_RED == 0
#define mmA  re
#define mmB  ro
#elif RGB_GREEN == 0
#define mmA  ge
#define mmB  go
#elif RGB_BLUE == 0
#define mmA  be
#define mmB  bo
#else
#define mmA  xe
#define mmB  xo
#endif

#if RGB_RED == 1
#define mmC  re
#define mmD  ro
#elif RGB_GREEN == 1
#define mmC  ge
#define mmD  go
#elif RGB_BLUE == 1
#define mmC  be
#define mmD  bo
#else
#define mmC  xe
#define mmD  xo
#endif

#if RGB_RED == 2
#define mmE  re
#define mmF  ro
#elif RGB_GREEN == 2
#define mmE  ge
#define mmF  go
#elif RGB_BLUE == 2
#define mmE  be
#define mmF  bo
#else
#define mmE  xe
#define mmF  xo
#endif

#if RGB_RED == 3
#define mmG  re
#define mmH  ro
#elif RGB_GREEN == 3
#define mmG  ge
#define mmH  go
#elif RGB_BLUE == 3
#define mmG  be
#define mmH  bo
#else
#define mmG  xe
#define mmH  xo
#endif
91
92
/*
 * RGB -> YCbCr colorspace conversion, 8 pixels per inner-loop iteration,
 * using Loongson MMI (64-bit multimedia) intrinsics and MIPS inline asm.
 *
 * image_width  number of pixel columns in each row
 * input_buf    array of num_rows pointers to packed RGB input rows
 *              (advanced as rows are consumed)
 * output_buf   three component planes: [0] = Y, [1] = Cb, [2] = Cr
 * output_row   index of the first row to write in each output plane
 * num_rows     number of rows to convert
 *
 * NOTE(review): a short final group (num_cols < 8) still stores a full
 * 8 samples to each output row, so output rows are assumed to be padded
 * to a multiple of 8 samples -- confirm against the caller's buffer
 * allocation.
 */
void jsimd_rgb_ycc_convert_mmi(JDIMENSION image_width, JSAMPARRAY input_buf,
                               JSAMPIMAGE output_buf, JDIMENSION output_row,
                               int num_rows)
{
  JSAMPROW inptr, outptr0, outptr1, outptr2;
  int num_cols, col;
  /* Per-component sample registers: [r|g|b][e|o] hold the even-/odd-
     indexed samples of each component; xe (and xo for 4-byte pixels)
     absorb the unused filler channel via the mmA-mmH aliases. */
  __m64 re, ro, ge, go, be, bo, xe;
#if RGB_PIXELSIZE == 4
  __m64 xo;
#endif
  /* Intermediate fixed-point accumulators (l/h = low/high 32-bit halves,
     e/o = even/odd pixels). */
  __m64 rgle, rghe, rglo, rgho, bgle, bghe, bglo, bgho;
  __m64 ble, halfble, bhe, halfbhe, blo, halfblo, bho, halfbho;
  __m64 rle, halfrle, rhe, halfrhe, rlo, halfrlo, rho, halfrho;
  __m64 yle_rg, yhe_rg, yle_bg, yhe_bg, yle, yhe, ye;
  __m64 ylo_rg, yho_rg, ylo_bg, yho_bg, ylo, yho, yo, y;
  __m64 cble, cbhe, cbe, cblo, cbho, cbo, cb;
  __m64 crle, crhe, cre, crlo, crho, cro, cr;

  while (--num_rows >= 0) {
    inptr = *input_buf++;
    outptr0 = output_buf[0][output_row];
    outptr1 = output_buf[1][output_row];
    outptr2 = output_buf[2][output_row];
    output_row++;

    for (num_cols = image_width; num_cols > 0; num_cols -= 8,
         outptr0 += 8, outptr1 += 8, outptr2 += 8) {

#if RGB_PIXELSIZE == 3

      if (num_cols < 8) {
        /* Fewer than 8 pixels remain: gather the col = num_cols * 3 tail
           bytes piecewise (1-, 2-, 4-byte integer loads plus 8-/16-byte
           FP loads, selected by the bits of col) so we never read past
           the end of the input row.  $12 accumulates the highest-
           addressed bytes and is transferred to the MMI registers with
           dmtc1; mov.s shifts previously gathered doublewords up when a
           larger chunk is also present. */
        col = num_cols * 3;
        asm(".set noreorder\r\n"

            "li $8, 1\r\n"
            "move $9, %3\r\n"
            "and $10, $9, $8\r\n"
            "beqz $10, 1f\r\n"
            "nop \r\n"
            "subu $9, $9, 1\r\n"
            "xor $12, $12, $12\r\n"
            "move $13, %5\r\n"
            PTR_ADDU "$13, $13, $9\r\n"
            "lbu $12, 0($13)\r\n"

            "1: \r\n"
            "li $8, 2\r\n"
            "and $10, $9, $8\r\n"
            "beqz $10, 2f\r\n"
            "nop \r\n"
            "subu $9, $9, 2\r\n"
            "xor $11, $11, $11\r\n"
            "move $13, %5\r\n"
            PTR_ADDU "$13, $13, $9\r\n"
            "lhu $11, 0($13)\r\n"
            "sll $12, $12, 16\r\n"
            "or $12, $12, $11\r\n"

            "2: \r\n"
            "dmtc1 $12, %0\r\n"
            "li $8, 4\r\n"
            "and $10, $9, $8\r\n"
            "beqz $10, 3f\r\n"
            "nop \r\n"
            "subu $9, $9, 4\r\n"
            "move $13, %5\r\n"
            PTR_ADDU "$13, $13, $9\r\n"
            "lwu $14, 0($13)\r\n"
            "dmtc1 $14, %1\r\n"
            "dsll32 $12, $12, 0\r\n"
            "or $12, $12, $14\r\n"
            "dmtc1 $12, %0\r\n"

            "3: \r\n"
            "li $8, 8\r\n"
            "and $10, $9, $8\r\n"
            "beqz $10, 4f\r\n"
            "nop \r\n"
            "mov.s %1, %0\r\n"
            "ldc1 %0, 0(%5)\r\n"
            "li $9, 8\r\n"
            "j 5f\r\n"
            "nop \r\n"

            "4: \r\n"
            "li $8, 16\r\n"
            "and $10, $9, $8\r\n"
            "beqz $10, 5f\r\n"
            "nop \r\n"
            "mov.s %2, %0\r\n"
            "ldc1 %0, 0(%5)\r\n"
            "ldc1 %1, 8(%5)\r\n"

            "5: \r\n"
            "nop \r\n"
            ".set reorder\r\n"

            /* NOTE(review): input operand %4 (num_rows) is never
               referenced in the template above -- appears harmless, but
               confirm before removing it. */
            : "=f" (mmA), "=f" (mmG), "=f" (mmF)
            : "r" (col), "r" (num_rows), "r" (inptr)
            : "$f0", "$f2", "$f4", "$8", "$9", "$10", "$11", "$12", "$13",
              "$14", "memory"
           );
      } else {
        /* Fast path: 24 bytes = 8 whole pixels.  Use aligned loads when
           inptr is 8-byte aligned, unaligned loads otherwise. */
        if (!(((long)inptr) & 7)) {
          mmA = _mm_load_si64((__m64 *)&inptr[0]);
          mmG = _mm_load_si64((__m64 *)&inptr[8]);
          mmF = _mm_load_si64((__m64 *)&inptr[16]);
        } else {
          mmA = _mm_loadu_si64((__m64 *)&inptr[0]);
          mmG = _mm_loadu_si64((__m64 *)&inptr[8]);
          mmF = _mm_loadu_si64((__m64 *)&inptr[16]);
        }
        inptr += RGB_PIXELSIZE * 8;
      }
      /* Transpose the 24 interleaved RGB bytes in mmA/mmG/mmF into
         per-component even/odd vectors through a sequence of 4-byte
         shifts and byte unpacks.  The final *_f expansions widen each
         byte into a 16-bit lane (the _f forms appear to place the byte
         in the upper half of the lane -- confirm against
         loongson-mmintrin.h); the mmA-mmH aliases then name the results
         re/ro, ge/go, be/bo per the pixel's component order. */
      mmD = _mm_srli_si64(mmA, 4 * BYTE_BIT);
      mmA = _mm_slli_si64(mmA, 4 * BYTE_BIT);

      mmA = _mm_unpackhi_pi8(mmA, mmG);
      mmG = _mm_slli_si64(mmG, 4 * BYTE_BIT);

      mmD = _mm_unpacklo_pi8(mmD, mmF);
      mmG = _mm_unpackhi_pi8(mmG, mmF);

      mmE = _mm_srli_si64(mmA, 4 * BYTE_BIT);
      mmA = _mm_slli_si64(mmA, 4 * BYTE_BIT);

      mmA = _mm_unpackhi_pi8(mmA, mmD);
      mmD = _mm_slli_si64(mmD, 4 * BYTE_BIT);

      mmE = _mm_unpacklo_pi8(mmE, mmG);
      mmD = _mm_unpackhi_pi8(mmD, mmG);
      mmC = _mm_loadhi_pi8_f(mmA);
      mmA = _mm_loadlo_pi8_f(mmA);

      mmB = _mm_loadhi_pi8_f(mmE);
      mmE = _mm_loadlo_pi8_f(mmE);

      mmF = _mm_loadhi_pi8_f(mmD);
      mmD = _mm_loadlo_pi8_f(mmD);

#else  /* RGB_PIXELSIZE == 4 */

      if (num_cols < 8) {
        /* Fewer than 8 pixels remain: gather the tail 4 bytes (1 pixel),
           8 bytes (2 pixels), and/or 16 bytes (4 pixels) at a time,
           driven by the bits of num_cols, so we never read past the end
           of the input row. */
        col = num_cols;
        asm(".set noreorder\r\n"

            "li $8, 1\r\n"
            "move $9, %4\r\n"
            "and $10, $9, $8\r\n"
            "beqz $10, 1f\r\n"
            "nop \r\n"
            "subu $9, $9, 1\r\n"
            PTR_SLL "$11, $9, 2\r\n"
            "move $13, %5\r\n"
            PTR_ADDU "$13, $13, $11\r\n"
            "lwc1 %0, 0($13)\r\n"

            "1: \r\n"
            "li $8, 2\r\n"
            "and $10, $9, $8\r\n"
            "beqz $10, 2f\r\n"
            "nop \r\n"
            "subu $9, $9, 2\r\n"
            PTR_SLL "$11, $9, 2\r\n"
            "move $13, %5\r\n"
            PTR_ADDU "$13, $13, $11\r\n"
            "mov.s %1, %0\r\n"
            "ldc1 %0, 0($13)\r\n"

            "2: \r\n"
            "li $8, 4\r\n"
            "and $10, $9, $8\r\n"
            "beqz $10, 3f\r\n"
            "nop \r\n"
            "mov.s %2, %0\r\n"
            "mov.s %3, %1\r\n"
            "ldc1 %0, 0(%5)\r\n"
            "ldc1 %1, 8(%5)\r\n"

            "3: \r\n"
            "nop \r\n"
            ".set reorder\r\n"

            : "=f" (mmA), "=f" (mmF), "=f" (mmD), "=f" (mmC)
            : "r" (col), "r" (inptr)
            : "$f0", "$f2", "$8", "$9", "$10", "$11", "$13", "memory"
           );
      } else {
        /* Fast path: 32 bytes = 8 whole 4-byte pixels.  Use aligned
           loads when inptr is 8-byte aligned. */
        if (!(((long)inptr) & 7)) {
          mmA = _mm_load_si64((__m64 *)&inptr[0]);
          mmF = _mm_load_si64((__m64 *)&inptr[8]);
          mmD = _mm_load_si64((__m64 *)&inptr[16]);
          mmC = _mm_load_si64((__m64 *)&inptr[24]);
        } else {
          mmA = _mm_loadu_si64((__m64 *)&inptr[0]);
          mmF = _mm_loadu_si64((__m64 *)&inptr[8]);
          mmD = _mm_loadu_si64((__m64 *)&inptr[16]);
          mmC = _mm_loadu_si64((__m64 *)&inptr[24]);
        }
        inptr += RGB_PIXELSIZE * 8;
      }
      /* Transpose the 32 interleaved bytes into per-component even/odd
         vectors: byte unpacks pair pixels, word unpacks regroup by
         component, then the *_f expansions widen to 16-bit lanes (see
         the note in the 3-byte path about the _f lane placement).  The
         fourth (filler) channel ends up in mmF/mmH, which the self-
         unpack + 16-bit shift below reduces to zero-extended samples. */
      mmB = _mm_unpackhi_pi8(mmA, mmF);
      mmA = _mm_unpacklo_pi8(mmA, mmF);

      mmG = _mm_unpackhi_pi8(mmD, mmC);
      mmD = _mm_unpacklo_pi8(mmD, mmC);

      mmE = _mm_unpackhi_pi16(mmA, mmD);
      mmA = _mm_unpacklo_pi16(mmA, mmD);

      mmH = _mm_unpackhi_pi16(mmB, mmG);
      mmB = _mm_unpacklo_pi16(mmB, mmG);

      mmC = _mm_loadhi_pi8_f(mmA);
      mmA = _mm_loadlo_pi8_f(mmA);

      mmD = _mm_loadhi_pi8_f(mmB);
      mmB = _mm_loadlo_pi8_f(mmB);

      mmG = _mm_loadhi_pi8_f(mmE);
      mmE = _mm_loadlo_pi8_f(mmE);

      mmF = _mm_unpacklo_pi8(mmH, mmH);
      mmH = _mm_unpackhi_pi8(mmH, mmH);
      mmF = _mm_srli_pi16(mmF, BYTE_BIT);
      mmH = _mm_srli_pi16(mmH, BYTE_BIT);

#endif

      /* re=(R0 R2 R4 R6), ge=(G0 G2 G4 G6), be=(B0 B2 B4 B6)
       * ro=(R1 R3 R5 R7), go=(G1 G3 G5 G7), bo=(B1 B3 B5 B7)
       *
       * (Original)
       * Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
       * Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
       * Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
       *
       * (This implementation)
       * Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
       * Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
       * Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
       */

      /* Cb (odd pixels): madd computes -0.16874R - 0.33126G per 32-bit
         lane; the 0.50000 * B term is obtained by shifting the widened
         B samples right by one (assumes the _f load places each 16-bit
         sample in the upper half of its 32-bit lane, making >>1 equal
         0.5 * B in SCALEBITS fixed point -- TODO confirm).  PD_ONEHALFM1_CJ
         folds in rounding and the CENTERJSAMPLE offset. */
      rglo = _mm_unpacklo_pi16(ro, go);
      rgho = _mm_unpackhi_pi16(ro, go);
      ylo_rg = _mm_madd_pi16(rglo, PW_F0299_F0337);
      yho_rg = _mm_madd_pi16(rgho, PW_F0299_F0337);
      cblo = _mm_madd_pi16(rglo, PW_MF016_MF033);
      cbho = _mm_madd_pi16(rgho, PW_MF016_MF033);

      blo = _mm_loadlo_pi16_f(bo);
      bho = _mm_loadhi_pi16_f(bo);
      halfblo = _mm_srli_pi32(blo, 1);
      halfbho = _mm_srli_pi32(bho, 1);

      cblo = _mm_add_pi32(cblo, halfblo);
      cbho = _mm_add_pi32(cbho, halfbho);
      cblo = _mm_add_pi32(cblo, PD_ONEHALFM1_CJ);
      cbho = _mm_add_pi32(cbho, PD_ONEHALFM1_CJ);
      cblo = _mm_srli_pi32(cblo, SCALEBITS);
      cbho = _mm_srli_pi32(cbho, SCALEBITS);
      cbo = _mm_packs_pi32(cblo, cbho);

      /* Cb (even pixels): same computation on the even-indexed samples. */
      rgle = _mm_unpacklo_pi16(re, ge);
      rghe = _mm_unpackhi_pi16(re, ge);
      yle_rg = _mm_madd_pi16(rgle, PW_F0299_F0337);
      yhe_rg = _mm_madd_pi16(rghe, PW_F0299_F0337);
      cble = _mm_madd_pi16(rgle, PW_MF016_MF033);
      cbhe = _mm_madd_pi16(rghe, PW_MF016_MF033);

      ble = _mm_loadlo_pi16_f(be);
      bhe = _mm_loadhi_pi16_f(be);
      halfble = _mm_srli_pi32(ble, 1);
      halfbhe = _mm_srli_pi32(bhe, 1);

      cble = _mm_add_pi32(cble, halfble);
      cbhe = _mm_add_pi32(cbhe, halfbhe);
      cble = _mm_add_pi32(cble, PD_ONEHALFM1_CJ);
      cbhe = _mm_add_pi32(cbhe, PD_ONEHALFM1_CJ);
      cble = _mm_srli_pi32(cble, SCALEBITS);
      cbhe = _mm_srli_pi32(cbhe, SCALEBITS);
      cbe = _mm_packs_pi32(cble, cbhe);

      /* Interleave even/odd Cb bytes back into pixel order: each Cb
         value fits in 8 bits, so (odd << 8) | even packs two consecutive
         pixels per 16-bit lane. */
      cbo = _mm_slli_pi16(cbo, BYTE_BIT);
      cb = _mm_or_si64(cbe, cbo);

      /* Y (odd pixels): 0.11400B + 0.25000G from the B/G madd, added to
         the 0.29900R + 0.33700G partials computed above (0.337 + 0.25 =
         0.587, the standard green weight).  Cr partials -0.08131B -
         0.41869G are computed from the same unpacked pairs. */
      bglo = _mm_unpacklo_pi16(bo, go);
      bgho = _mm_unpackhi_pi16(bo, go);
      ylo_bg = _mm_madd_pi16(bglo, PW_F0114_F0250);
      yho_bg = _mm_madd_pi16(bgho, PW_F0114_F0250);
      crlo = _mm_madd_pi16(bglo, PW_MF008_MF041);
      crho = _mm_madd_pi16(bgho, PW_MF008_MF041);

      ylo = _mm_add_pi32(ylo_bg, ylo_rg);
      yho = _mm_add_pi32(yho_bg, yho_rg);
      ylo = _mm_add_pi32(ylo, PD_ONEHALF);
      yho = _mm_add_pi32(yho, PD_ONEHALF);
      ylo = _mm_srli_pi32(ylo, SCALEBITS);
      yho = _mm_srli_pi32(yho, SCALEBITS);
      yo = _mm_packs_pi32(ylo, yho);

      /* Cr (odd pixels): add the 0.50000 * R term (same >>1 trick as for
         Cb's blue term), round, scale, and pack. */
      rlo = _mm_loadlo_pi16_f(ro);
      rho = _mm_loadhi_pi16_f(ro);
      halfrlo = _mm_srli_pi32(rlo, 1);
      halfrho = _mm_srli_pi32(rho, 1);

      crlo = _mm_add_pi32(crlo, halfrlo);
      crho = _mm_add_pi32(crho, halfrho);
      crlo = _mm_add_pi32(crlo, PD_ONEHALFM1_CJ);
      crho = _mm_add_pi32(crho, PD_ONEHALFM1_CJ);
      crlo = _mm_srli_pi32(crlo, SCALEBITS);
      crho = _mm_srli_pi32(crho, SCALEBITS);
      cro = _mm_packs_pi32(crlo, crho);

      /* Y and Cr (even pixels): mirror of the odd-pixel computation. */
      bgle = _mm_unpacklo_pi16(be, ge);
      bghe = _mm_unpackhi_pi16(be, ge);
      yle_bg = _mm_madd_pi16(bgle, PW_F0114_F0250);
      yhe_bg = _mm_madd_pi16(bghe, PW_F0114_F0250);
      crle = _mm_madd_pi16(bgle, PW_MF008_MF041);
      crhe = _mm_madd_pi16(bghe, PW_MF008_MF041);

      yle = _mm_add_pi32(yle_bg, yle_rg);
      yhe = _mm_add_pi32(yhe_bg, yhe_rg);
      yle = _mm_add_pi32(yle, PD_ONEHALF);
      yhe = _mm_add_pi32(yhe, PD_ONEHALF);
      yle = _mm_srli_pi32(yle, SCALEBITS);
      yhe = _mm_srli_pi32(yhe, SCALEBITS);
      ye = _mm_packs_pi32(yle, yhe);

      /* Interleave even/odd Y bytes back into pixel order. */
      yo = _mm_slli_pi16(yo, BYTE_BIT);
      y = _mm_or_si64(ye, yo);

      rle = _mm_loadlo_pi16_f(re);
      rhe = _mm_loadhi_pi16_f(re);
      halfrle = _mm_srli_pi32(rle, 1);
      halfrhe = _mm_srli_pi32(rhe, 1);

      crle = _mm_add_pi32(crle, halfrle);
      crhe = _mm_add_pi32(crhe, halfrhe);
      crle = _mm_add_pi32(crle, PD_ONEHALFM1_CJ);
      crhe = _mm_add_pi32(crhe, PD_ONEHALFM1_CJ);
      crle = _mm_srli_pi32(crle, SCALEBITS);
      crhe = _mm_srli_pi32(crhe, SCALEBITS);
      cre = _mm_packs_pi32(crle, crhe);

      /* Interleave even/odd Cr bytes back into pixel order. */
      cro = _mm_slli_pi16(cro, BYTE_BIT);
      cr = _mm_or_si64(cre, cro);

      /* Store 8 Y, 8 Cb, and 8 Cr samples to their component rows.
         outptr0/1/2 advance by 8 each iteration (see the for header), so
         these stores are 8-byte aligned when the rows themselves are. */
      _mm_store_si64((__m64 *)&outptr0[0], y);
      _mm_store_si64((__m64 *)&outptr1[0], cb);
      _mm_store_si64((__m64 *)&outptr2[0], cr);
    }
  }
}
447
/* Remove the component-order aliases so jccolor-mmi.c can include this
   file again with a different RGB pixel layout. */
#undef mmA
#undef mmB
#undef mmC
#undef mmD
#undef mmE
#undef mmF
#undef mmG
#undef mmH
456