1 /*
2 * Loongson MMI optimizations for libjpeg-turbo
3 *
4 * Copyright (C) 2016-2017, Loongson Technology Corporation Limited, BeiJing.
5 * All Rights Reserved.
6 * Authors: ZhuChen <zhuchen@loongson.cn>
7 * CaiWanwei <caiwanwei@loongson.cn>
8 * SunZhangzhi <sunzhangzhi-cq@loongson.cn>
9 * Copyright (C) 2018, D. R. Commander. All Rights Reserved.
10 *
11 * Based on the x86 SIMD extension for IJG JPEG library
12 * Copyright (C) 1999-2006, MIYASAKA Masaru.
13 *
14 * This software is provided 'as-is', without any express or implied
15 * warranty. In no event will the authors be held liable for any damages
16 * arising from the use of this software.
17 *
18 * Permission is granted to anyone to use this software for any purpose,
19 * including commercial applications, and to alter it and redistribute it
20 * freely, subject to the following restrictions:
21 *
22 * 1. The origin of this software must not be misrepresented; you must not
23 * claim that you wrote the original software. If you use this software
24 * in a product, an acknowledgment in the product documentation would be
25 * appreciated but is not required.
26 * 2. Altered source versions must be plainly marked as such, and must not be
27 * misrepresented as being the original software.
28 * 3. This notice may not be removed or altered from any source distribution.
29 */
30
31 /* INTEGER QUANTIZATION AND SAMPLE CONVERSION */
32
33 #include "jsimd_mmi.h"
34
35
/*
 * DO_QUANT() quantizes one row of DCTSIZE coefficients, handled as two
 * 64-bit vectors of packed 16-bit lanes (elements 0-3 and 4-7).
 *
 * The macro is not self-contained.  The enclosing scope must provide:
 *   workspace   - pointer to the coefficients to quantize (read)
 *   divisors    - pointer to the divisor tables (read); the reciprocals are
 *                 read at offset DCTSIZE2 * 0, the corrections at
 *                 DCTSIZE2 * 1, and the scales at DCTSIZE2 * 2
 *   output_ptr  - pointer to the destination coefficients (written)
 *   __m64 temporaries mm0-mm7, corr0/corr1, recip0/recip1, scale0/scale1
 *
 * On exit, workspace, divisors, and output_ptr have each been advanced by
 * DCTSIZE elements so that the next invocation handles the next row.
 */
#define DO_QUANT() { \
  /* Fetch the row as two vectors of four coefficients each. */ \
  mm0 = _mm_load_si64((__m64 *)&workspace[0]); \
  mm1 = _mm_load_si64((__m64 *)&workspace[4]); \
  \
  /* Per-lane sign mask: -1 where the coefficient is negative, 0 otherwise. */ \
  mm2 = _mm_srai_pi16(mm0, (WORD_BIT - 1)); \
  mm3 = _mm_srai_pi16(mm1, (WORD_BIT - 1)); \
  \
  /* Branch-free absolute value: (val ^ mask) - mask. */ \
  mm0 = _mm_sub_pi16(_mm_xor_si64(mm0, mm2), mm2); \
  mm1 = _mm_sub_pi16(_mm_xor_si64(mm1, mm3), mm3); \
  \
  /* Add the correction (correction + roundfactor). */ \
  corr0 = _mm_load_si64((__m64 *)&divisors[DCTSIZE2 * 1]); \
  corr1 = _mm_load_si64((__m64 *)&divisors[DCTSIZE2 * 1 + 4]); \
  mm4 = _mm_add_pi16(mm0, corr0); \
  mm5 = _mm_add_pi16(mm1, corr1); \
  \
  /* Multiply by the reciprocal, keeping the high 16 bits of each product. \
   * The reciprocal always has its MSB set, i.e. it is negative as far as \
   * the multiply is concerned, so the multiplicand is always added back. \
   * The multiplicand itself is never negative here because its absolute \
   * value was taken above. \
   */ \
  recip0 = _mm_load_si64((__m64 *)&divisors[DCTSIZE2 * 0]); \
  recip1 = _mm_load_si64((__m64 *)&divisors[DCTSIZE2 * 0 + 4]); \
  mm4 = _mm_add_pi16(_mm_mulhi_pi16(mm4, recip0), mm4); \
  mm5 = _mm_add_pi16(_mm_mulhi_pi16(mm5, recip1), mm5); \
  \
  /* Multiply by the scale, again keeping the high 16 bits. */ \
  scale0 = _mm_load_si64((__m64 *)&divisors[DCTSIZE2 * 2]); \
  scale1 = _mm_load_si64((__m64 *)&divisors[DCTSIZE2 * 2 + 4]); \
  mm0 = _mm_mulhi_pi16(mm4, scale0); \
  mm1 = _mm_mulhi_pi16(mm5, scale1); \
  \
  /* Where the scale is negative, add the multiplicand back in. */ \
  mm6 = _mm_and_si64(_mm_srai_pi16(scale0, (WORD_BIT - 1)), mm4); \
  mm7 = _mm_and_si64(_mm_srai_pi16(scale1, (WORD_BIT - 1)), mm5); \
  mm0 = _mm_add_pi16(mm0, mm6); \
  mm1 = _mm_add_pi16(mm1, mm7); \
  \
  /* Where the multiplicand is negative, add the scale back in. */ \
  mm4 = _mm_and_si64(_mm_srai_pi16(mm4, (WORD_BIT - 1)), scale0); \
  mm5 = _mm_and_si64(_mm_srai_pi16(mm5, (WORD_BIT - 1)), scale1); \
  mm0 = _mm_add_pi16(mm0, mm4); \
  mm1 = _mm_add_pi16(mm1, mm5); \
  \
  /* Reapply the original sign: (val ^ mask) - mask. */ \
  mm0 = _mm_sub_pi16(_mm_xor_si64(mm0, mm2), mm2); \
  mm1 = _mm_sub_pi16(_mm_xor_si64(mm1, mm3), mm3); \
  \
  _mm_store_si64((__m64 *)&output_ptr[0], mm0); \
  _mm_store_si64((__m64 *)&output_ptr[4], mm1); \
  \
  /* Step every pointer to the next row. */ \
  workspace += DCTSIZE; \
  divisors += DCTSIZE; \
  output_ptr += DCTSIZE; \
}
113
114
/*
 * Quantize a block of DCT coefficients using Loongson MMI instructions.
 *
 * coef_block  destination for the quantized coefficients
 * divisors    divisor tables read by DO_QUANT(): reciprocals, corrections,
 *             and scales, each table starting DCTSIZE2 elements after the
 *             previous one
 * workspace   coefficients to quantize
 *
 * Each DO_QUANT() invocation handles one row of DCTSIZE coefficients and
 * advances workspace, divisors, and output_ptr to the next row, so the eight
 * invocations below process eight consecutive rows (a whole block, assuming
 * DCTSIZE is 8).
 */
void jsimd_quantize_mmi(JCOEFPTR coef_block, DCTELEM *divisors,
                        DCTELEM *workspace)
{
  JCOEFPTR output_ptr = coef_block;
  /* Scratch vectors used by DO_QUANT(), which relies on the enclosing scope
   * to declare them.
   */
  __m64 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7;
  __m64 corr0, corr1, recip0, recip1, scale0, scale1;

  DO_QUANT()
  DO_QUANT()
  DO_QUANT()
  DO_QUANT()
  DO_QUANT()
  DO_QUANT()
  DO_QUANT()
  DO_QUANT()
}
131