1 /*
2 * The copyright in this software is being made available under the 2-clauses
3 * BSD License, included below. This software may be subject to other third
4 * party and contributor rights, including patent rights, and no such rights
5 * are granted under this license.
6 *
7 * Copyright (c) 2002-2014, Universite catholique de Louvain (UCL), Belgium
8 * Copyright (c) 2002-2014, Professor Benoit Macq
9 * Copyright (c) 2001-2003, David Janssens
10 * Copyright (c) 2002-2003, Yannick Verschueren
11 * Copyright (c) 2003-2007, Francois-Olivier Devaux
12 * Copyright (c) 2003-2014, Antonin Descampe
13 * Copyright (c) 2005, Herve Drolon, FreeImage Team
14 * Copyright (c) 2008, 2011-2012, Centre National d'Etudes Spatiales (CNES), FR
15 * Copyright (c) 2012, CS Systemes d'Information, France
16 * All rights reserved.
17 *
18 * Redistribution and use in source and binary forms, with or without
19 * modification, are permitted provided that the following conditions
20 * are met:
21 * 1. Redistributions of source code must retain the above copyright
22 * notice, this list of conditions and the following disclaimer.
23 * 2. Redistributions in binary form must reproduce the above copyright
24 * notice, this list of conditions and the following disclaimer in the
25 * documentation and/or other materials provided with the distribution.
26 *
27 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS `AS IS'
28 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37 * POSSIBILITY OF SUCH DAMAGE.
38 */
39
40 #if defined(__SSE__) && !defined(_M_IX86) && !defined(__i386)
41 #define USE_SSE
42 #include <xmmintrin.h>
43 #endif
44 #if defined(__SSE2__) && !defined(_M_IX86) && !defined(__i386)
45 #define USE_SSE2
46 #include <emmintrin.h>
47 #endif
48 #if defined(__SSE4_1__) && !defined(_M_IX86) && !defined(__i386)
49 #include <smmintrin.h>
50 #endif
51
52 #include "opj_includes.h"
53
54 /* <summary> */
55 /* This table contains the norms of the basis function of the reversible MCT. */
56 /* </summary> */
57 static const OPJ_FLOAT64 opj_mct_norms[3] = { 1.732, .8292, .8292 };
58
59 /* <summary> */
60 /* This table contains the norms of the basis function of the irreversible MCT. */
61 /* </summary> */
62 static const OPJ_FLOAT64 opj_mct_norms_real[3] = { 1.732, 1.805, 1.573 };
63
opj_mct_get_mct_norms()64 const OPJ_FLOAT64 * opj_mct_get_mct_norms()
65 {
66 return opj_mct_norms;
67 }
68
opj_mct_get_mct_norms_real()69 const OPJ_FLOAT64 * opj_mct_get_mct_norms_real()
70 {
71 return opj_mct_norms_real;
72 }
73
74 /* <summary> */
75 /* Forward reversible MCT. */
76 /* </summary> */
77 #ifdef USE_SSE2
opj_mct_encode(OPJ_INT32 * OPJ_RESTRICT c0,OPJ_INT32 * OPJ_RESTRICT c1,OPJ_INT32 * OPJ_RESTRICT c2,OPJ_SIZE_T n)78 void opj_mct_encode(
79 OPJ_INT32* OPJ_RESTRICT c0,
80 OPJ_INT32* OPJ_RESTRICT c1,
81 OPJ_INT32* OPJ_RESTRICT c2,
82 OPJ_SIZE_T n)
83 {
84 OPJ_SIZE_T i;
85 const OPJ_SIZE_T len = n;
86 /* buffer are aligned on 16 bytes */
87 assert(((size_t)c0 & 0xf) == 0);
88 assert(((size_t)c1 & 0xf) == 0);
89 assert(((size_t)c2 & 0xf) == 0);
90
91 for (i = 0; i < (len & ~3U); i += 4) {
92 __m128i y, u, v;
93 __m128i r = _mm_load_si128((const __m128i *) & (c0[i]));
94 __m128i g = _mm_load_si128((const __m128i *) & (c1[i]));
95 __m128i b = _mm_load_si128((const __m128i *) & (c2[i]));
96 y = _mm_add_epi32(g, g);
97 y = _mm_add_epi32(y, b);
98 y = _mm_add_epi32(y, r);
99 y = _mm_srai_epi32(y, 2);
100 u = _mm_sub_epi32(b, g);
101 v = _mm_sub_epi32(r, g);
102 _mm_store_si128((__m128i *) & (c0[i]), y);
103 _mm_store_si128((__m128i *) & (c1[i]), u);
104 _mm_store_si128((__m128i *) & (c2[i]), v);
105 }
106
107 for (; i < len; ++i) {
108 OPJ_INT32 r = c0[i];
109 OPJ_INT32 g = c1[i];
110 OPJ_INT32 b = c2[i];
111 OPJ_INT32 y = (r + (g * 2) + b) >> 2;
112 OPJ_INT32 u = b - g;
113 OPJ_INT32 v = r - g;
114 c0[i] = y;
115 c1[i] = u;
116 c2[i] = v;
117 }
118 }
119 #else
opj_mct_encode(OPJ_INT32 * OPJ_RESTRICT c0,OPJ_INT32 * OPJ_RESTRICT c1,OPJ_INT32 * OPJ_RESTRICT c2,OPJ_SIZE_T n)120 void opj_mct_encode(
121 OPJ_INT32* OPJ_RESTRICT c0,
122 OPJ_INT32* OPJ_RESTRICT c1,
123 OPJ_INT32* OPJ_RESTRICT c2,
124 OPJ_SIZE_T n)
125 {
126 OPJ_SIZE_T i;
127 const OPJ_SIZE_T len = n;
128
129 for (i = 0; i < len; ++i) {
130 OPJ_INT32 r = c0[i];
131 OPJ_INT32 g = c1[i];
132 OPJ_INT32 b = c2[i];
133 OPJ_INT32 y = (r + (g * 2) + b) >> 2;
134 OPJ_INT32 u = b - g;
135 OPJ_INT32 v = r - g;
136 c0[i] = y;
137 c1[i] = u;
138 c2[i] = v;
139 }
140 }
141 #endif
142
143 /* <summary> */
144 /* Inverse reversible MCT. */
145 /* </summary> */
146 #ifdef USE_SSE2
opj_mct_decode(OPJ_INT32 * OPJ_RESTRICT c0,OPJ_INT32 * OPJ_RESTRICT c1,OPJ_INT32 * OPJ_RESTRICT c2,OPJ_SIZE_T n)147 void opj_mct_decode(
148 OPJ_INT32* OPJ_RESTRICT c0,
149 OPJ_INT32* OPJ_RESTRICT c1,
150 OPJ_INT32* OPJ_RESTRICT c2,
151 OPJ_SIZE_T n)
152 {
153 OPJ_SIZE_T i;
154 const OPJ_SIZE_T len = n;
155
156 for (i = 0; i < (len & ~3U); i += 4) {
157 __m128i r, g, b;
158 __m128i y = _mm_load_si128((const __m128i *) & (c0[i]));
159 __m128i u = _mm_load_si128((const __m128i *) & (c1[i]));
160 __m128i v = _mm_load_si128((const __m128i *) & (c2[i]));
161 g = y;
162 g = _mm_sub_epi32(g, _mm_srai_epi32(_mm_add_epi32(u, v), 2));
163 r = _mm_add_epi32(v, g);
164 b = _mm_add_epi32(u, g);
165 _mm_store_si128((__m128i *) & (c0[i]), r);
166 _mm_store_si128((__m128i *) & (c1[i]), g);
167 _mm_store_si128((__m128i *) & (c2[i]), b);
168 }
169 for (; i < len; ++i) {
170 OPJ_INT32 y = c0[i];
171 OPJ_INT32 u = c1[i];
172 OPJ_INT32 v = c2[i];
173 OPJ_INT32 g = y - ((u + v) >> 2);
174 OPJ_INT32 r = v + g;
175 OPJ_INT32 b = u + g;
176 c0[i] = r;
177 c1[i] = g;
178 c2[i] = b;
179 }
180 }
181 #else
opj_mct_decode(OPJ_INT32 * OPJ_RESTRICT c0,OPJ_INT32 * OPJ_RESTRICT c1,OPJ_INT32 * OPJ_RESTRICT c2,OPJ_SIZE_T n)182 void opj_mct_decode(
183 OPJ_INT32* OPJ_RESTRICT c0,
184 OPJ_INT32* OPJ_RESTRICT c1,
185 OPJ_INT32* OPJ_RESTRICT c2,
186 OPJ_SIZE_T n)
187 {
188 OPJ_SIZE_T i;
189 for (i = 0; i < n; ++i) {
190 OPJ_INT32 y = c0[i];
191 OPJ_INT32 u = c1[i];
192 OPJ_INT32 v = c2[i];
193 OPJ_INT32 g = y - ((u + v) >> 2);
194 OPJ_INT32 r = v + g;
195 OPJ_INT32 b = u + g;
196 c0[i] = r;
197 c1[i] = g;
198 c2[i] = b;
199 }
200 }
201 #endif
202
203 /* <summary> */
204 /* Get norm of basis function of reversible MCT. */
205 /* </summary> */
opj_mct_getnorm(OPJ_UINT32 compno)206 OPJ_FLOAT64 opj_mct_getnorm(OPJ_UINT32 compno)
207 {
208 return opj_mct_norms[compno];
209 }
210
211 /* <summary> */
212 /* Forward irreversible MCT. */
213 /* </summary> */
opj_mct_encode_real(OPJ_FLOAT32 * OPJ_RESTRICT c0,OPJ_FLOAT32 * OPJ_RESTRICT c1,OPJ_FLOAT32 * OPJ_RESTRICT c2,OPJ_SIZE_T n)214 void opj_mct_encode_real(
215 OPJ_FLOAT32* OPJ_RESTRICT c0,
216 OPJ_FLOAT32* OPJ_RESTRICT c1,
217 OPJ_FLOAT32* OPJ_RESTRICT c2,
218 OPJ_SIZE_T n)
219 {
220 OPJ_SIZE_T i;
221 #ifdef USE_SSE
222 const __m128 YR = _mm_set1_ps(0.299f);
223 const __m128 YG = _mm_set1_ps(0.587f);
224 const __m128 YB = _mm_set1_ps(0.114f);
225 const __m128 UR = _mm_set1_ps(-0.16875f);
226 const __m128 UG = _mm_set1_ps(-0.331260f);
227 const __m128 UB = _mm_set1_ps(0.5f);
228 const __m128 VR = _mm_set1_ps(0.5f);
229 const __m128 VG = _mm_set1_ps(-0.41869f);
230 const __m128 VB = _mm_set1_ps(-0.08131f);
231 for (i = 0; i < (n >> 3); i ++) {
232 __m128 r, g, b, y, u, v;
233
234 r = _mm_load_ps(c0);
235 g = _mm_load_ps(c1);
236 b = _mm_load_ps(c2);
237 y = _mm_add_ps(_mm_add_ps(_mm_mul_ps(r, YR), _mm_mul_ps(g, YG)),
238 _mm_mul_ps(b, YB));
239 u = _mm_add_ps(_mm_add_ps(_mm_mul_ps(r, UR), _mm_mul_ps(g, UG)),
240 _mm_mul_ps(b, UB));
241 v = _mm_add_ps(_mm_add_ps(_mm_mul_ps(r, VR), _mm_mul_ps(g, VG)),
242 _mm_mul_ps(b, VB));
243 _mm_store_ps(c0, y);
244 _mm_store_ps(c1, u);
245 _mm_store_ps(c2, v);
246 c0 += 4;
247 c1 += 4;
248 c2 += 4;
249
250 r = _mm_load_ps(c0);
251 g = _mm_load_ps(c1);
252 b = _mm_load_ps(c2);
253 y = _mm_add_ps(_mm_add_ps(_mm_mul_ps(r, YR), _mm_mul_ps(g, YG)),
254 _mm_mul_ps(b, YB));
255 u = _mm_add_ps(_mm_add_ps(_mm_mul_ps(r, UR), _mm_mul_ps(g, UG)),
256 _mm_mul_ps(b, UB));
257 v = _mm_add_ps(_mm_add_ps(_mm_mul_ps(r, VR), _mm_mul_ps(g, VG)),
258 _mm_mul_ps(b, VB));
259 _mm_store_ps(c0, y);
260 _mm_store_ps(c1, u);
261 _mm_store_ps(c2, v);
262 c0 += 4;
263 c1 += 4;
264 c2 += 4;
265 }
266 n &= 7;
267 #endif
268 for (i = 0; i < n; ++i) {
269 OPJ_FLOAT32 r = c0[i];
270 OPJ_FLOAT32 g = c1[i];
271 OPJ_FLOAT32 b = c2[i];
272 OPJ_FLOAT32 y = 0.299f * r + 0.587f * g + 0.114f * b;
273 OPJ_FLOAT32 u = -0.16875f * r - 0.331260f * g + 0.5f * b;
274 OPJ_FLOAT32 v = 0.5f * r - 0.41869f * g - 0.08131f * b;
275 c0[i] = y;
276 c1[i] = u;
277 c2[i] = v;
278 }
279 }
280
281 /* <summary> */
282 /* Inverse irreversible MCT. */
283 /* </summary> */
opj_mct_decode_real(OPJ_FLOAT32 * OPJ_RESTRICT c0,OPJ_FLOAT32 * OPJ_RESTRICT c1,OPJ_FLOAT32 * OPJ_RESTRICT c2,OPJ_SIZE_T n)284 void opj_mct_decode_real(
285 OPJ_FLOAT32* OPJ_RESTRICT c0,
286 OPJ_FLOAT32* OPJ_RESTRICT c1,
287 OPJ_FLOAT32* OPJ_RESTRICT c2,
288 OPJ_SIZE_T n)
289 {
290 OPJ_SIZE_T i;
291 #ifdef USE_SSE
292 __m128 vrv, vgu, vgv, vbu;
293 vrv = _mm_set1_ps(1.402f);
294 vgu = _mm_set1_ps(0.34413f);
295 vgv = _mm_set1_ps(0.71414f);
296 vbu = _mm_set1_ps(1.772f);
297 for (i = 0; i < (n >> 3); ++i) {
298 __m128 vy, vu, vv;
299 __m128 vr, vg, vb;
300
301 vy = _mm_load_ps(c0);
302 vu = _mm_load_ps(c1);
303 vv = _mm_load_ps(c2);
304 vr = _mm_add_ps(vy, _mm_mul_ps(vv, vrv));
305 vg = _mm_sub_ps(_mm_sub_ps(vy, _mm_mul_ps(vu, vgu)), _mm_mul_ps(vv, vgv));
306 vb = _mm_add_ps(vy, _mm_mul_ps(vu, vbu));
307 _mm_store_ps(c0, vr);
308 _mm_store_ps(c1, vg);
309 _mm_store_ps(c2, vb);
310 c0 += 4;
311 c1 += 4;
312 c2 += 4;
313
314 vy = _mm_load_ps(c0);
315 vu = _mm_load_ps(c1);
316 vv = _mm_load_ps(c2);
317 vr = _mm_add_ps(vy, _mm_mul_ps(vv, vrv));
318 vg = _mm_sub_ps(_mm_sub_ps(vy, _mm_mul_ps(vu, vgu)), _mm_mul_ps(vv, vgv));
319 vb = _mm_add_ps(vy, _mm_mul_ps(vu, vbu));
320 _mm_store_ps(c0, vr);
321 _mm_store_ps(c1, vg);
322 _mm_store_ps(c2, vb);
323 c0 += 4;
324 c1 += 4;
325 c2 += 4;
326 }
327 n &= 7;
328 #endif
329 for (i = 0; i < n; ++i) {
330 OPJ_FLOAT32 y = c0[i];
331 OPJ_FLOAT32 u = c1[i];
332 OPJ_FLOAT32 v = c2[i];
333 OPJ_FLOAT32 r = y + (v * 1.402f);
334 OPJ_FLOAT32 g = y - (u * 0.34413f) - (v * (0.71414f));
335 OPJ_FLOAT32 b = y + (u * 1.772f);
336 c0[i] = r;
337 c1[i] = g;
338 c2[i] = b;
339 }
340 }
341
342 /* <summary> */
343 /* Get norm of basis function of irreversible MCT. */
344 /* </summary> */
opj_mct_getnorm_real(OPJ_UINT32 compno)345 OPJ_FLOAT64 opj_mct_getnorm_real(OPJ_UINT32 compno)
346 {
347 return opj_mct_norms_real[compno];
348 }
349
350
opj_mct_encode_custom(OPJ_BYTE * pCodingdata,OPJ_SIZE_T n,OPJ_BYTE ** pData,OPJ_UINT32 pNbComp,OPJ_UINT32 isSigned)351 OPJ_BOOL opj_mct_encode_custom(
352 OPJ_BYTE * pCodingdata,
353 OPJ_SIZE_T n,
354 OPJ_BYTE ** pData,
355 OPJ_UINT32 pNbComp,
356 OPJ_UINT32 isSigned)
357 {
358 OPJ_FLOAT32 * lMct = (OPJ_FLOAT32 *) pCodingdata;
359 OPJ_SIZE_T i;
360 OPJ_UINT32 j;
361 OPJ_UINT32 k;
362 OPJ_UINT32 lNbMatCoeff = pNbComp * pNbComp;
363 OPJ_INT32 * lCurrentData = 00;
364 OPJ_INT32 * lCurrentMatrix = 00;
365 OPJ_INT32 ** lData = (OPJ_INT32 **) pData;
366 OPJ_UINT32 lMultiplicator = 1 << 13;
367 OPJ_INT32 * lMctPtr;
368
369 OPJ_ARG_NOT_USED(isSigned);
370
371 lCurrentData = (OPJ_INT32 *) opj_malloc((pNbComp + lNbMatCoeff) * sizeof(
372 OPJ_INT32));
373 if (! lCurrentData) {
374 return OPJ_FALSE;
375 }
376
377 lCurrentMatrix = lCurrentData + pNbComp;
378
379 for (i = 0; i < lNbMatCoeff; ++i) {
380 lCurrentMatrix[i] = (OPJ_INT32)(*(lMct++) * (OPJ_FLOAT32)lMultiplicator);
381 }
382
383 for (i = 0; i < n; ++i) {
384 lMctPtr = lCurrentMatrix;
385 for (j = 0; j < pNbComp; ++j) {
386 lCurrentData[j] = (*(lData[j]));
387 }
388
389 for (j = 0; j < pNbComp; ++j) {
390 *(lData[j]) = 0;
391 for (k = 0; k < pNbComp; ++k) {
392 *(lData[j]) += opj_int_fix_mul(*lMctPtr, lCurrentData[k]);
393 ++lMctPtr;
394 }
395
396 ++lData[j];
397 }
398 }
399
400 opj_free(lCurrentData);
401
402 return OPJ_TRUE;
403 }
404
opj_mct_decode_custom(OPJ_BYTE * pDecodingData,OPJ_SIZE_T n,OPJ_BYTE ** pData,OPJ_UINT32 pNbComp,OPJ_UINT32 isSigned)405 OPJ_BOOL opj_mct_decode_custom(
406 OPJ_BYTE * pDecodingData,
407 OPJ_SIZE_T n,
408 OPJ_BYTE ** pData,
409 OPJ_UINT32 pNbComp,
410 OPJ_UINT32 isSigned)
411 {
412 OPJ_FLOAT32 * lMct;
413 OPJ_SIZE_T i;
414 OPJ_UINT32 j;
415 OPJ_UINT32 k;
416
417 OPJ_FLOAT32 * lCurrentData = 00;
418 OPJ_FLOAT32 * lCurrentResult = 00;
419 OPJ_FLOAT32 ** lData = (OPJ_FLOAT32 **) pData;
420
421 OPJ_ARG_NOT_USED(isSigned);
422
423 lCurrentData = (OPJ_FLOAT32 *) opj_malloc(2 * pNbComp * sizeof(OPJ_FLOAT32));
424 if (! lCurrentData) {
425 return OPJ_FALSE;
426 }
427 lCurrentResult = lCurrentData + pNbComp;
428
429 for (i = 0; i < n; ++i) {
430 lMct = (OPJ_FLOAT32 *) pDecodingData;
431 for (j = 0; j < pNbComp; ++j) {
432 lCurrentData[j] = (OPJ_FLOAT32)(*(lData[j]));
433 }
434 for (j = 0; j < pNbComp; ++j) {
435 lCurrentResult[j] = 0;
436 for (k = 0; k < pNbComp; ++k) {
437 lCurrentResult[j] += *(lMct++) * lCurrentData[k];
438 }
439 *(lData[j]++) = (OPJ_FLOAT32)(lCurrentResult[j]);
440 }
441 }
442 opj_free(lCurrentData);
443 return OPJ_TRUE;
444 }
445
opj_calculate_norms(OPJ_FLOAT64 * pNorms,OPJ_UINT32 pNbComps,OPJ_FLOAT32 * pMatrix)446 void opj_calculate_norms(OPJ_FLOAT64 * pNorms,
447 OPJ_UINT32 pNbComps,
448 OPJ_FLOAT32 * pMatrix)
449 {
450 OPJ_UINT32 i, j, lIndex;
451 OPJ_FLOAT32 lCurrentValue;
452 OPJ_FLOAT64 * lNorms = (OPJ_FLOAT64 *) pNorms;
453 OPJ_FLOAT32 * lMatrix = (OPJ_FLOAT32 *) pMatrix;
454
455 for (i = 0; i < pNbComps; ++i) {
456 lNorms[i] = 0;
457 lIndex = i;
458
459 for (j = 0; j < pNbComps; ++j) {
460 lCurrentValue = lMatrix[lIndex];
461 lIndex += pNbComps;
462 lNorms[i] += (OPJ_FLOAT64) lCurrentValue * lCurrentValue;
463 }
464 lNorms[i] = sqrt(lNorms[i]);
465 }
466 }
467