1 /*
2 * The copyright in this software is being made available under the 2-clauses
3 * BSD License, included below. This software may be subject to other third
4 * party and contributor rights, including patent rights, and no such rights
5 * are granted under this license.
6 *
7 * Copyright (c) 2002-2014, Universite catholique de Louvain (UCL), Belgium
8 * Copyright (c) 2002-2014, Professor Benoit Macq
9 * Copyright (c) 2001-2003, David Janssens
10 * Copyright (c) 2002-2003, Yannick Verschueren
11 * Copyright (c) 2003-2007, Francois-Olivier Devaux
12 * Copyright (c) 2003-2014, Antonin Descampe
13 * Copyright (c) 2005, Herve Drolon, FreeImage Team
14 * Copyright (c) 2007, Jonathan Ballard <dzonatas@dzonux.net>
15 * Copyright (c) 2007, Callum Lerwick <seg@haxxed.com>
16 * Copyright (c) 2017, IntoPIX SA <support@intopix.com>
17 * All rights reserved.
18 *
19 * Redistribution and use in source and binary forms, with or without
20 * modification, are permitted provided that the following conditions
21 * are met:
22 * 1. Redistributions of source code must retain the above copyright
23 * notice, this list of conditions and the following disclaimer.
24 * 2. Redistributions in binary form must reproduce the above copyright
25 * notice, this list of conditions and the following disclaimer in the
26 * documentation and/or other materials provided with the distribution.
27 *
28 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS `AS IS'
29 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
30 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
31 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
32 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
33 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
34 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
35 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
36 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
37 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
38 * POSSIBILITY OF SUCH DAMAGE.
39 */
40
41 #include <assert.h>
42
43 #define OPJ_SKIP_POISON
44 #include "opj_includes.h"
45
46 #ifdef __SSE__
47 #include <xmmintrin.h>
48 #endif
49 #ifdef __SSE2__
50 #include <emmintrin.h>
51 #endif
52 #ifdef __SSSE3__
53 #include <tmmintrin.h>
54 #endif
55 #ifdef __AVX2__
56 #include <immintrin.h>
57 #endif
58
59 #if defined(__GNUC__)
60 #pragma GCC poison malloc calloc realloc free
61 #endif
62
63 /** @defgroup DWT DWT - Implementation of a discrete wavelet transform */
64 /*@{*/
65
66 #ifdef __AVX2__
67 /** Number of int32 values in a AVX2 register */
68 #define VREG_INT_COUNT 8
69 #else
70 /** Number of int32 values in a SSE2 register */
71 #define VREG_INT_COUNT 4
72 #endif
73
74 /** Number of columns that we can process in parallel in the vertical pass */
75 #define PARALLEL_COLS_53 (2*VREG_INT_COUNT)
76
77 /** @name Local data structures */
78 /*@{*/
79
80 typedef struct dwt_local {
81 OPJ_INT32* mem;
82 OPJ_SIZE_T mem_count;
83 OPJ_INT32 dn; /* number of elements in high pass band */
84 OPJ_INT32 sn; /* number of elements in low pass band */
85 OPJ_INT32 cas; /* 0 = start on even coord, 1 = start on odd coord */
86 } opj_dwt_t;
87
88 typedef union {
89 OPJ_FLOAT32 f[4];
90 } opj_v4_t;
91
92 typedef struct v4dwt_local {
93 opj_v4_t* wavelet ;
94 OPJ_INT32 dn ; /* number of elements in high pass band */
95 OPJ_INT32 sn ; /* number of elements in low pass band */
96 OPJ_INT32 cas ; /* 0 = start on even coord, 1 = start on odd coord */
97 OPJ_UINT32 win_l_x0; /* start coord in low pass band */
98 OPJ_UINT32 win_l_x1; /* end coord in low pass band */
99 OPJ_UINT32 win_h_x0; /* start coord in high pass band */
100 OPJ_UINT32 win_h_x1; /* end coord in high pass band */
101 } opj_v4dwt_t ;
102
103 static const OPJ_FLOAT32 opj_dwt_alpha = 1.586134342f; /* 12994 */
104 static const OPJ_FLOAT32 opj_dwt_beta = 0.052980118f; /* 434 */
105 static const OPJ_FLOAT32 opj_dwt_gamma = -0.882911075f; /* -7233 */
106 static const OPJ_FLOAT32 opj_dwt_delta = -0.443506852f; /* -3633 */
107
108 static const OPJ_FLOAT32 opj_K = 1.230174105f; /* 10078 */
109 static const OPJ_FLOAT32 opj_c13318 = 1.625732422f;
110
111 /*@}*/
112
113 /**
114 Virtual function type for wavelet transform in 1-D
115 */
116 typedef void (*DWT1DFN)(const opj_dwt_t* v);
117
118 /** @name Local static functions */
119 /*@{*/
120
121 /**
122 Forward lazy transform (horizontal)
123 */
124 static void opj_dwt_deinterleave_h(OPJ_INT32 *a, OPJ_INT32 *b, OPJ_INT32 dn,
125 OPJ_INT32 sn, OPJ_INT32 cas);
126 /**
127 Forward lazy transform (vertical)
128 */
129 static void opj_dwt_deinterleave_v(OPJ_INT32 *a, OPJ_INT32 *b, OPJ_INT32 dn,
130 OPJ_INT32 sn, OPJ_INT32 x, OPJ_INT32 cas);
131 /**
132 Forward 5-3 wavelet transform in 1-D
133 */
134 static void opj_dwt_encode_1(OPJ_INT32 *a, OPJ_SIZE_T a_count, OPJ_INT32 dn,
135 OPJ_INT32 sn, OPJ_INT32 cas);
136 /**
137 Forward 9-7 wavelet transform in 1-D
138 */
139 static void opj_dwt_encode_1_real(OPJ_INT32 *a, OPJ_SIZE_T a_count,
140 OPJ_INT32 dn, OPJ_INT32 sn, OPJ_INT32 cas);
141 /**
142 Explicit calculation of the Quantization Stepsizes
143 */
144 static void opj_dwt_encode_stepsize(OPJ_INT32 stepsize, OPJ_INT32 numbps,
145 opj_stepsize_t *bandno_stepsize);
146 /**
147 Inverse wavelet transform in 2-D.
148 */
149 static OPJ_BOOL opj_dwt_decode_tile(opj_thread_pool_t* tp,
150 const opj_tcd_tilecomp_t* tilec, OPJ_UINT32 i);
151
152 static OPJ_BOOL opj_dwt_decode_partial_tile(
153 opj_tcd_tilecomp_t* tilec,
154 OPJ_UINT32 numres);
155
156 static OPJ_BOOL opj_dwt_encode_procedure(const opj_tcd_tilecomp_t * tilec,
157 void(*p_function)(OPJ_INT32 *, OPJ_SIZE_T, OPJ_INT32, OPJ_INT32, OPJ_INT32));
158
159 static OPJ_UINT32 opj_dwt_max_resolution(opj_tcd_resolution_t* OPJ_RESTRICT r,
160 OPJ_UINT32 i);
161
162 /* <summary> */
163 /* Inverse 9-7 wavelet transform in 1-D. */
164 /* </summary> */
165 static void opj_v4dwt_decode(opj_v4dwt_t* OPJ_RESTRICT dwt);
166
167 static void opj_v4dwt_interleave_h(opj_v4dwt_t* OPJ_RESTRICT dwt,
168 OPJ_FLOAT32* OPJ_RESTRICT a,
169 OPJ_UINT32 width,
170 OPJ_UINT32 remaining_height);
171
172 static void opj_v4dwt_interleave_v(opj_v4dwt_t* OPJ_RESTRICT dwt,
173 OPJ_FLOAT32* OPJ_RESTRICT a,
174 OPJ_UINT32 width,
175 OPJ_UINT32 nb_elts_read);
176
177 #ifdef __SSE__
178 static void opj_v4dwt_decode_step1_sse(opj_v4_t* w,
179 OPJ_UINT32 start,
180 OPJ_UINT32 end,
181 const __m128 c);
182
183 static void opj_v4dwt_decode_step2_sse(opj_v4_t* l, opj_v4_t* w,
184 OPJ_UINT32 start,
185 OPJ_UINT32 end,
186 OPJ_UINT32 m, __m128 c);
187
188 #else
189 static void opj_v4dwt_decode_step1(opj_v4_t* w,
190 OPJ_UINT32 start,
191 OPJ_UINT32 end,
192 const OPJ_FLOAT32 c);
193
194 static void opj_v4dwt_decode_step2(opj_v4_t* l, opj_v4_t* w,
195 OPJ_UINT32 start,
196 OPJ_UINT32 end,
197 OPJ_UINT32 m,
198 OPJ_FLOAT32 c);
199
200 #endif
201
202 /*@}*/
203
204 /*@}*/
205
206 #define IDX_S(i) (i)*2
207 #define IDX_D(i) 1 + (i)* 2
208 #define UNDERFLOW_SN(i) ((i) >= sn&&sn>0)
209 #define UNDERFLOW_DN(i) ((i) >= dn&&dn>0)
210 #define OVERFLOW_S(i) (IDX_S(i) >= a_count)
211 #define OVERFLOW_D(i) (IDX_D(i) >= a_count)
212
213 #define OPJ_S(i) a[IDX_S(i)]
214 #define OPJ_D(i) a[IDX_D(i)]
215 #define OPJ_S_(i) ((i)<0 ? OPJ_S(0) : (UNDERFLOW_SN(i) ? OPJ_S(sn - 1) : OVERFLOW_S(i) ? OPJ_S(i - 1) : OPJ_S(i)))
216 #define OPJ_D_(i) ((i)<0 ? OPJ_D(0) : (UNDERFLOW_DN(i) ? OPJ_D(dn - 1) : OVERFLOW_D(i) ? OPJ_D(i - 1) : OPJ_D(i)))
217 /* new */
218 #define OPJ_SS_(i) ((i)<0 ? OPJ_S(0) : (UNDERFLOW_DN(i) ? OPJ_S(dn - 1) : OVERFLOW_S(i) ? OPJ_S(i - 1) : OPJ_S(i)))
219 #define OPJ_DD_(i) ((i)<0 ? OPJ_D(0) : (UNDERFLOW_SN(i) ? OPJ_D(sn - 1) : OVERFLOW_D(i) ? OPJ_D(i - 1) : OPJ_D(i)))
220
221 /* <summary> */
222 /* This table contains the norms of the 5-3 wavelets for different bands. */
223 /* </summary> */
224 /* FIXME! the array should really be extended up to 33 resolution levels */
225 /* See https://github.com/uclouvain/openjpeg/issues/493 */
226 static const OPJ_FLOAT64 opj_dwt_norms[4][10] = {
227 {1.000, 1.500, 2.750, 5.375, 10.68, 21.34, 42.67, 85.33, 170.7, 341.3},
228 {1.038, 1.592, 2.919, 5.703, 11.33, 22.64, 45.25, 90.48, 180.9},
229 {1.038, 1.592, 2.919, 5.703, 11.33, 22.64, 45.25, 90.48, 180.9},
230 {.7186, .9218, 1.586, 3.043, 6.019, 12.01, 24.00, 47.97, 95.93}
231 };
232
233 /* <summary> */
234 /* This table contains the norms of the 9-7 wavelets for different bands. */
235 /* </summary> */
236 /* FIXME! the array should really be extended up to 33 resolution levels */
237 /* See https://github.com/uclouvain/openjpeg/issues/493 */
238 static const OPJ_FLOAT64 opj_dwt_norms_real[4][10] = {
239 {1.000, 1.965, 4.177, 8.403, 16.90, 33.84, 67.69, 135.3, 270.6, 540.9},
240 {2.022, 3.989, 8.355, 17.04, 34.27, 68.63, 137.3, 274.6, 549.0},
241 {2.022, 3.989, 8.355, 17.04, 34.27, 68.63, 137.3, 274.6, 549.0},
242 {2.080, 3.865, 8.307, 17.18, 34.71, 69.59, 139.3, 278.6, 557.2}
243 };
244
245 /*
246 ==========================================================
247 local functions
248 ==========================================================
249 */
250
251 /* <summary> */
252 /* Forward lazy transform (horizontal). */
253 /* </summary> */
opj_dwt_deinterleave_h(OPJ_INT32 * a,OPJ_INT32 * b,OPJ_INT32 dn,OPJ_INT32 sn,OPJ_INT32 cas)254 static void opj_dwt_deinterleave_h(OPJ_INT32 *a, OPJ_INT32 *b, OPJ_INT32 dn,
255 OPJ_INT32 sn, OPJ_INT32 cas)
256 {
257 OPJ_INT32 i;
258 OPJ_INT32 * l_dest = b;
259 OPJ_INT32 * l_src = a + cas;
260
261 for (i = 0; i < sn; ++i) {
262 *l_dest++ = *l_src;
263 l_src += 2;
264 }
265
266 l_dest = b + sn;
267 l_src = a + 1 - cas;
268
269 for (i = 0; i < dn; ++i) {
270 *l_dest++ = *l_src;
271 l_src += 2;
272 }
273 }
274
275 /* <summary> */
276 /* Forward lazy transform (vertical). */
277 /* </summary> */
opj_dwt_deinterleave_v(OPJ_INT32 * a,OPJ_INT32 * b,OPJ_INT32 dn,OPJ_INT32 sn,OPJ_INT32 x,OPJ_INT32 cas)278 static void opj_dwt_deinterleave_v(OPJ_INT32 *a, OPJ_INT32 *b, OPJ_INT32 dn,
279 OPJ_INT32 sn, OPJ_INT32 x, OPJ_INT32 cas)
280 {
281 OPJ_INT32 i = sn;
282 OPJ_INT32 * l_dest = b;
283 OPJ_INT32 * l_src = a + cas;
284
285 while (i--) {
286 *l_dest = *l_src;
287 l_dest += x;
288 l_src += 2;
289 } /* b[i*x]=a[2*i+cas]; */
290
291 l_dest = b + (OPJ_SIZE_T)sn * (OPJ_SIZE_T)x;
292 l_src = a + 1 - cas;
293
294 i = dn;
295 while (i--) {
296 *l_dest = *l_src;
297 l_dest += x;
298 l_src += 2;
299 } /*b[(sn+i)*x]=a[(2*i+1-cas)];*/
300 }
301
302 #ifdef STANDARD_SLOW_VERSION
303 /* <summary> */
304 /* Inverse lazy transform (horizontal). */
305 /* </summary> */
opj_dwt_interleave_h(const opj_dwt_t * h,OPJ_INT32 * a)306 static void opj_dwt_interleave_h(const opj_dwt_t* h, OPJ_INT32 *a)
307 {
308 OPJ_INT32 *ai = a;
309 OPJ_INT32 *bi = h->mem + h->cas;
310 OPJ_INT32 i = h->sn;
311 while (i--) {
312 *bi = *(ai++);
313 bi += 2;
314 }
315 ai = a + h->sn;
316 bi = h->mem + 1 - h->cas;
317 i = h->dn ;
318 while (i--) {
319 *bi = *(ai++);
320 bi += 2;
321 }
322 }
323
324 /* <summary> */
325 /* Inverse lazy transform (vertical). */
326 /* </summary> */
opj_dwt_interleave_v(const opj_dwt_t * v,OPJ_INT32 * a,OPJ_INT32 x)327 static void opj_dwt_interleave_v(const opj_dwt_t* v, OPJ_INT32 *a, OPJ_INT32 x)
328 {
329 OPJ_INT32 *ai = a;
330 OPJ_INT32 *bi = v->mem + v->cas;
331 OPJ_INT32 i = v->sn;
332 while (i--) {
333 *bi = *ai;
334 bi += 2;
335 ai += x;
336 }
337 ai = a + (v->sn * (OPJ_SIZE_T)x);
338 bi = v->mem + 1 - v->cas;
339 i = v->dn ;
340 while (i--) {
341 *bi = *ai;
342 bi += 2;
343 ai += x;
344 }
345 }
346
347 #endif /* STANDARD_SLOW_VERSION */
348
349 /* <summary> */
350 /* Forward 5-3 wavelet transform in 1-D. */
351 /* </summary> */
opj_dwt_encode_1(OPJ_INT32 * a,OPJ_SIZE_T a_count,OPJ_INT32 dn,OPJ_INT32 sn,OPJ_INT32 cas)352 static void opj_dwt_encode_1(OPJ_INT32 *a, OPJ_SIZE_T a_count, OPJ_INT32 dn,
353 OPJ_INT32 sn, OPJ_INT32 cas)
354 {
355 OPJ_INT32 i;
356
357 if (!cas) {
358 if ((dn > 0) || (sn > 1)) { /* NEW : CASE ONE ELEMENT */
359 for (i = 0; i < dn; i++) {
360 OPJ_D(i) -= (OPJ_S_(i) + OPJ_S_(i + 1)) >> 1;
361 }
362 for (i = 0; i < sn; i++) {
363 OPJ_S(i) += (OPJ_D_(i - 1) + OPJ_D_(i) + 2) >> 2;
364 }
365 }
366 } else {
367 if (!sn && dn == 1) { /* NEW : CASE ONE ELEMENT */
368 OPJ_S(0) *= 2;
369 } else {
370 for (i = 0; i < dn; i++) {
371 OPJ_S(i) -= (OPJ_DD_(i) + OPJ_DD_(i - 1)) >> 1;
372 }
373 for (i = 0; i < sn; i++) {
374 OPJ_D(i) += (OPJ_SS_(i) + OPJ_SS_(i + 1) + 2) >> 2;
375 }
376 }
377 }
378 }
379
380 #ifdef STANDARD_SLOW_VERSION
381 /* <summary> */
382 /* Inverse 5-3 wavelet transform in 1-D. */
383 /* </summary> */
opj_dwt_decode_1_(OPJ_INT32 * a,OPJ_SIZE_T a_count,OPJ_INT32 dn,OPJ_INT32 sn,OPJ_INT32 cas)384 static void opj_dwt_decode_1_(OPJ_INT32 *a, OPJ_SIZE_T a_count, OPJ_INT32 dn,
385 OPJ_INT32 sn, OPJ_INT32 cas)
386 {
387 OPJ_INT32 i;
388
389 if (!cas) {
390 if ((dn > 0) || (sn > 1)) { /* NEW : CASE ONE ELEMENT */
391 for (i = 0; i < sn; i++) {
392 OPJ_S(i) -= (OPJ_D_(i - 1) + OPJ_D_(i) + 2) >> 2;
393 }
394 for (i = 0; i < dn; i++) {
395 OPJ_D(i) += (OPJ_S_(i) + OPJ_S_(i + 1)) >> 1;
396 }
397 }
398 } else {
399 if (!sn && dn == 1) { /* NEW : CASE ONE ELEMENT */
400 OPJ_S(0) /= 2;
401 } else {
402 for (i = 0; i < sn; i++) {
403 OPJ_D(i) -= (OPJ_SS_(i) + OPJ_SS_(i + 1) + 2) >> 2;
404 }
405 for (i = 0; i < dn; i++) {
406 OPJ_S(i) += (OPJ_DD_(i) + OPJ_DD_(i - 1)) >> 1;
407 }
408 }
409 }
410 }
411
opj_dwt_decode_1(const opj_dwt_t * v)412 static void opj_dwt_decode_1(const opj_dwt_t *v)
413 {
414 opj_dwt_decode_1_(v->mem, v->mem_count, v->dn, v->sn, v->cas);
415 }
416
417 #endif /* STANDARD_SLOW_VERSION */
418
419 #if !defined(STANDARD_SLOW_VERSION)
opj_idwt53_h_cas0(OPJ_INT32 * tmp,const OPJ_INT32 sn,const OPJ_INT32 len,OPJ_INT32 * tiledp)420 static void opj_idwt53_h_cas0(OPJ_INT32* tmp,
421 const OPJ_INT32 sn,
422 const OPJ_INT32 len,
423 OPJ_INT32* tiledp)
424 {
425 OPJ_INT32 i, j;
426 const OPJ_INT32* in_even = &tiledp[0];
427 const OPJ_INT32* in_odd = &tiledp[sn];
428
429 #ifdef TWO_PASS_VERSION
430 /* For documentation purpose: performs lifting in two iterations, */
431 /* but without explicit interleaving */
432
433 assert(len > 1);
434
435 /* Even */
436 tmp[0] = in_even[0] - ((in_odd[0] + 1) >> 1);
437 for (i = 2, j = 0; i <= len - 2; i += 2, j++) {
438 tmp[i] = in_even[j + 1] - ((in_odd[j] + in_odd[j + 1] + 2) >> 2);
439 }
440 if (len & 1) { /* if len is odd */
441 tmp[len - 1] = in_even[(len - 1) / 2] - ((in_odd[(len - 2) / 2] + 1) >> 1);
442 }
443
444 /* Odd */
445 for (i = 1, j = 0; i < len - 1; i += 2, j++) {
446 tmp[i] = in_odd[j] + ((tmp[i - 1] + tmp[i + 1]) >> 1);
447 }
448 if (!(len & 1)) { /* if len is even */
449 tmp[len - 1] = in_odd[(len - 1) / 2] + tmp[len - 2];
450 }
451 #else
452 OPJ_INT32 d1c, d1n, s1n, s0c, s0n;
453
454 assert(len > 1);
455
456 /* Improved version of the TWO_PASS_VERSION: */
457 /* Performs lifting in one single iteration. Saves memory */
458 /* accesses and explicit interleaving. */
459 s1n = in_even[0];
460 d1n = in_odd[0];
461 s0n = s1n - ((d1n + 1) >> 1);
462
463 for (i = 0, j = 1; i < (len - 3); i += 2, j++) {
464 d1c = d1n;
465 s0c = s0n;
466
467 s1n = in_even[j];
468 d1n = in_odd[j];
469
470 s0n = s1n - ((d1c + d1n + 2) >> 2);
471
472 tmp[i ] = s0c;
473 tmp[i + 1] = d1c + ((s0c + s0n) >> 1);
474 }
475
476 tmp[i] = s0n;
477
478 if (len & 1) {
479 tmp[len - 1] = in_even[(len - 1) / 2] - ((d1n + 1) >> 1);
480 tmp[len - 2] = d1n + ((s0n + tmp[len - 1]) >> 1);
481 } else {
482 tmp[len - 1] = d1n + s0n;
483 }
484 #endif
485 memcpy(tiledp, tmp, (OPJ_UINT32)len * sizeof(OPJ_INT32));
486 }
487
opj_idwt53_h_cas1(OPJ_INT32 * tmp,const OPJ_INT32 sn,const OPJ_INT32 len,OPJ_INT32 * tiledp)488 static void opj_idwt53_h_cas1(OPJ_INT32* tmp,
489 const OPJ_INT32 sn,
490 const OPJ_INT32 len,
491 OPJ_INT32* tiledp)
492 {
493 OPJ_INT32 i, j;
494 const OPJ_INT32* in_even = &tiledp[sn];
495 const OPJ_INT32* in_odd = &tiledp[0];
496
497 #ifdef TWO_PASS_VERSION
498 /* For documentation purpose: performs lifting in two iterations, */
499 /* but without explicit interleaving */
500
501 assert(len > 2);
502
503 /* Odd */
504 for (i = 1, j = 0; i < len - 1; i += 2, j++) {
505 tmp[i] = in_odd[j] - ((in_even[j] + in_even[j + 1] + 2) >> 2);
506 }
507 if (!(len & 1)) {
508 tmp[len - 1] = in_odd[len / 2 - 1] - ((in_even[len / 2 - 1] + 1) >> 1);
509 }
510
511 /* Even */
512 tmp[0] = in_even[0] + tmp[1];
513 for (i = 2, j = 1; i < len - 1; i += 2, j++) {
514 tmp[i] = in_even[j] + ((tmp[i + 1] + tmp[i - 1]) >> 1);
515 }
516 if (len & 1) {
517 tmp[len - 1] = in_even[len / 2] + tmp[len - 2];
518 }
519 #else
520 OPJ_INT32 s1, s2, dc, dn;
521
522 assert(len > 2);
523
524 /* Improved version of the TWO_PASS_VERSION: */
525 /* Performs lifting in one single iteration. Saves memory */
526 /* accesses and explicit interleaving. */
527
528 s1 = in_even[1];
529 dc = in_odd[0] - ((in_even[0] + s1 + 2) >> 2);
530 tmp[0] = in_even[0] + dc;
531
532 for (i = 1, j = 1; i < (len - 2 - !(len & 1)); i += 2, j++) {
533
534 s2 = in_even[j + 1];
535
536 dn = in_odd[j] - ((s1 + s2 + 2) >> 2);
537 tmp[i ] = dc;
538 tmp[i + 1] = s1 + ((dn + dc) >> 1);
539
540 dc = dn;
541 s1 = s2;
542 }
543
544 tmp[i] = dc;
545
546 if (!(len & 1)) {
547 dn = in_odd[len / 2 - 1] - ((s1 + 1) >> 1);
548 tmp[len - 2] = s1 + ((dn + dc) >> 1);
549 tmp[len - 1] = dn;
550 } else {
551 tmp[len - 1] = s1 + dc;
552 }
553 #endif
554 memcpy(tiledp, tmp, (OPJ_UINT32)len * sizeof(OPJ_INT32));
555 }
556
557
558 #endif /* !defined(STANDARD_SLOW_VERSION) */
559
560 /* <summary> */
561 /* Inverse 5-3 wavelet transform in 1-D for one row. */
562 /* </summary> */
563 /* Performs interleave, inverse wavelet transform and copy back to buffer */
opj_idwt53_h(const opj_dwt_t * dwt,OPJ_INT32 * tiledp)564 static void opj_idwt53_h(const opj_dwt_t *dwt,
565 OPJ_INT32* tiledp)
566 {
567 #ifdef STANDARD_SLOW_VERSION
568 /* For documentation purpose */
569 opj_dwt_interleave_h(dwt, tiledp);
570 opj_dwt_decode_1(dwt);
571 memcpy(tiledp, dwt->mem, (OPJ_UINT32)(dwt->sn + dwt->dn) * sizeof(OPJ_INT32));
572 #else
573 const OPJ_INT32 sn = dwt->sn;
574 const OPJ_INT32 len = sn + dwt->dn;
575 if (dwt->cas == 0) { /* Left-most sample is on even coordinate */
576 if (len > 1) {
577 opj_idwt53_h_cas0(dwt->mem, sn, len, tiledp);
578 } else {
579 /* Unmodified value */
580 }
581 } else { /* Left-most sample is on odd coordinate */
582 if (len == 1) {
583 tiledp[0] /= 2;
584 } else if (len == 2) {
585 OPJ_INT32* out = dwt->mem;
586 const OPJ_INT32* in_even = &tiledp[sn];
587 const OPJ_INT32* in_odd = &tiledp[0];
588 out[1] = in_odd[0] - ((in_even[0] + 1) >> 1);
589 out[0] = in_even[0] + out[1];
590 memcpy(tiledp, dwt->mem, (OPJ_UINT32)len * sizeof(OPJ_INT32));
591 } else if (len > 2) {
592 opj_idwt53_h_cas1(dwt->mem, sn, len, tiledp);
593 }
594 }
595 #endif
596 }
597
598 #if (defined(__SSE2__) || defined(__AVX2__)) && !defined(STANDARD_SLOW_VERSION)
599
600 /* Conveniency macros to improve the readabilty of the formulas */
601 #if __AVX2__
602 #define VREG __m256i
603 #define LOAD_CST(x) _mm256_set1_epi32(x)
604 #define LOAD(x) _mm256_load_si256((const VREG*)(x))
605 #define LOADU(x) _mm256_loadu_si256((const VREG*)(x))
606 #define STORE(x,y) _mm256_store_si256((VREG*)(x),(y))
607 #define STOREU(x,y) _mm256_storeu_si256((VREG*)(x),(y))
608 #define ADD(x,y) _mm256_add_epi32((x),(y))
609 #define SUB(x,y) _mm256_sub_epi32((x),(y))
610 #define SAR(x,y) _mm256_srai_epi32((x),(y))
611 #else
612 #define VREG __m128i
613 #define LOAD_CST(x) _mm_set1_epi32(x)
614 #define LOAD(x) _mm_load_si128((const VREG*)(x))
615 #define LOADU(x) _mm_loadu_si128((const VREG*)(x))
616 #define STORE(x,y) _mm_store_si128((VREG*)(x),(y))
617 #define STOREU(x,y) _mm_storeu_si128((VREG*)(x),(y))
618 #define ADD(x,y) _mm_add_epi32((x),(y))
619 #define SUB(x,y) _mm_sub_epi32((x),(y))
620 #define SAR(x,y) _mm_srai_epi32((x),(y))
621 #endif
622 #define ADD3(x,y,z) ADD(ADD(x,y),z)
623
624 static
opj_idwt53_v_final_memcpy(OPJ_INT32 * tiledp_col,const OPJ_INT32 * tmp,OPJ_INT32 len,OPJ_SIZE_T stride)625 void opj_idwt53_v_final_memcpy(OPJ_INT32* tiledp_col,
626 const OPJ_INT32* tmp,
627 OPJ_INT32 len,
628 OPJ_SIZE_T stride)
629 {
630 OPJ_INT32 i;
631 for (i = 0; i < len; ++i) {
632 /* A memcpy(&tiledp_col[i * stride + 0],
633 &tmp[PARALLEL_COLS_53 * i + 0],
634 PARALLEL_COLS_53 * sizeof(OPJ_INT32))
635 would do but would be a tiny bit slower.
636 We can take here advantage of our knowledge of alignment */
637 STOREU(&tiledp_col[(OPJ_SIZE_T)i * stride + 0],
638 LOAD(&tmp[PARALLEL_COLS_53 * i + 0]));
639 STOREU(&tiledp_col[(OPJ_SIZE_T)i * stride + VREG_INT_COUNT],
640 LOAD(&tmp[PARALLEL_COLS_53 * i + VREG_INT_COUNT]));
641 }
642 }
643
644 /** Vertical inverse 5x3 wavelet transform for 8 columns in SSE2, or
645 * 16 in AVX2, when top-most pixel is on even coordinate */
opj_idwt53_v_cas0_mcols_SSE2_OR_AVX2(OPJ_INT32 * tmp,const OPJ_INT32 sn,const OPJ_INT32 len,OPJ_INT32 * tiledp_col,const OPJ_SIZE_T stride)646 static void opj_idwt53_v_cas0_mcols_SSE2_OR_AVX2(
647 OPJ_INT32* tmp,
648 const OPJ_INT32 sn,
649 const OPJ_INT32 len,
650 OPJ_INT32* tiledp_col,
651 const OPJ_SIZE_T stride)
652 {
653 const OPJ_INT32* in_even = &tiledp_col[0];
654 const OPJ_INT32* in_odd = &tiledp_col[(OPJ_SIZE_T)sn * stride];
655
656 OPJ_INT32 i;
657 OPJ_SIZE_T j;
658 VREG d1c_0, d1n_0, s1n_0, s0c_0, s0n_0;
659 VREG d1c_1, d1n_1, s1n_1, s0c_1, s0n_1;
660 const VREG two = LOAD_CST(2);
661
662 assert(len > 1);
663 #if __AVX2__
664 assert(PARALLEL_COLS_53 == 16);
665 assert(VREG_INT_COUNT == 8);
666 #else
667 assert(PARALLEL_COLS_53 == 8);
668 assert(VREG_INT_COUNT == 4);
669 #endif
670
671 /* Note: loads of input even/odd values must be done in a unaligned */
672 /* fashion. But stores in tmp can be done with aligned store, since */
673 /* the temporary buffer is properly aligned */
674 assert((OPJ_SIZE_T)tmp % (sizeof(OPJ_INT32) * VREG_INT_COUNT) == 0);
675
676 s1n_0 = LOADU(in_even + 0);
677 s1n_1 = LOADU(in_even + VREG_INT_COUNT);
678 d1n_0 = LOADU(in_odd);
679 d1n_1 = LOADU(in_odd + VREG_INT_COUNT);
680
681 /* s0n = s1n - ((d1n + 1) >> 1); <==> */
682 /* s0n = s1n - ((d1n + d1n + 2) >> 2); */
683 s0n_0 = SUB(s1n_0, SAR(ADD3(d1n_0, d1n_0, two), 2));
684 s0n_1 = SUB(s1n_1, SAR(ADD3(d1n_1, d1n_1, two), 2));
685
686 for (i = 0, j = 1; i < (len - 3); i += 2, j++) {
687 d1c_0 = d1n_0;
688 s0c_0 = s0n_0;
689 d1c_1 = d1n_1;
690 s0c_1 = s0n_1;
691
692 s1n_0 = LOADU(in_even + j * stride);
693 s1n_1 = LOADU(in_even + j * stride + VREG_INT_COUNT);
694 d1n_0 = LOADU(in_odd + j * stride);
695 d1n_1 = LOADU(in_odd + j * stride + VREG_INT_COUNT);
696
697 /*s0n = s1n - ((d1c + d1n + 2) >> 2);*/
698 s0n_0 = SUB(s1n_0, SAR(ADD3(d1c_0, d1n_0, two), 2));
699 s0n_1 = SUB(s1n_1, SAR(ADD3(d1c_1, d1n_1, two), 2));
700
701 STORE(tmp + PARALLEL_COLS_53 * (i + 0), s0c_0);
702 STORE(tmp + PARALLEL_COLS_53 * (i + 0) + VREG_INT_COUNT, s0c_1);
703
704 /* d1c + ((s0c + s0n) >> 1) */
705 STORE(tmp + PARALLEL_COLS_53 * (i + 1) + 0,
706 ADD(d1c_0, SAR(ADD(s0c_0, s0n_0), 1)));
707 STORE(tmp + PARALLEL_COLS_53 * (i + 1) + VREG_INT_COUNT,
708 ADD(d1c_1, SAR(ADD(s0c_1, s0n_1), 1)));
709 }
710
711 STORE(tmp + PARALLEL_COLS_53 * (i + 0) + 0, s0n_0);
712 STORE(tmp + PARALLEL_COLS_53 * (i + 0) + VREG_INT_COUNT, s0n_1);
713
714 if (len & 1) {
715 VREG tmp_len_minus_1;
716 s1n_0 = LOADU(in_even + (OPJ_SIZE_T)((len - 1) / 2) * stride);
717 /* tmp_len_minus_1 = s1n - ((d1n + 1) >> 1); */
718 tmp_len_minus_1 = SUB(s1n_0, SAR(ADD3(d1n_0, d1n_0, two), 2));
719 STORE(tmp + PARALLEL_COLS_53 * (len - 1), tmp_len_minus_1);
720 /* d1n + ((s0n + tmp_len_minus_1) >> 1) */
721 STORE(tmp + PARALLEL_COLS_53 * (len - 2),
722 ADD(d1n_0, SAR(ADD(s0n_0, tmp_len_minus_1), 1)));
723
724 s1n_1 = LOADU(in_even + (OPJ_SIZE_T)((len - 1) / 2) * stride + VREG_INT_COUNT);
725 /* tmp_len_minus_1 = s1n - ((d1n + 1) >> 1); */
726 tmp_len_minus_1 = SUB(s1n_1, SAR(ADD3(d1n_1, d1n_1, two), 2));
727 STORE(tmp + PARALLEL_COLS_53 * (len - 1) + VREG_INT_COUNT,
728 tmp_len_minus_1);
729 /* d1n + ((s0n + tmp_len_minus_1) >> 1) */
730 STORE(tmp + PARALLEL_COLS_53 * (len - 2) + VREG_INT_COUNT,
731 ADD(d1n_1, SAR(ADD(s0n_1, tmp_len_minus_1), 1)));
732
733
734 } else {
735 STORE(tmp + PARALLEL_COLS_53 * (len - 1) + 0,
736 ADD(d1n_0, s0n_0));
737 STORE(tmp + PARALLEL_COLS_53 * (len - 1) + VREG_INT_COUNT,
738 ADD(d1n_1, s0n_1));
739 }
740
741 opj_idwt53_v_final_memcpy(tiledp_col, tmp, len, stride);
742 }
743
744
745 /** Vertical inverse 5x3 wavelet transform for 8 columns in SSE2, or
746 * 16 in AVX2, when top-most pixel is on odd coordinate */
opj_idwt53_v_cas1_mcols_SSE2_OR_AVX2(OPJ_INT32 * tmp,const OPJ_INT32 sn,const OPJ_INT32 len,OPJ_INT32 * tiledp_col,const OPJ_SIZE_T stride)747 static void opj_idwt53_v_cas1_mcols_SSE2_OR_AVX2(
748 OPJ_INT32* tmp,
749 const OPJ_INT32 sn,
750 const OPJ_INT32 len,
751 OPJ_INT32* tiledp_col,
752 const OPJ_SIZE_T stride)
753 {
754 OPJ_INT32 i;
755 OPJ_SIZE_T j;
756
757 VREG s1_0, s2_0, dc_0, dn_0;
758 VREG s1_1, s2_1, dc_1, dn_1;
759 const VREG two = LOAD_CST(2);
760
761 const OPJ_INT32* in_even = &tiledp_col[(OPJ_SIZE_T)sn * stride];
762 const OPJ_INT32* in_odd = &tiledp_col[0];
763
764 assert(len > 2);
765 #if __AVX2__
766 assert(PARALLEL_COLS_53 == 16);
767 assert(VREG_INT_COUNT == 8);
768 #else
769 assert(PARALLEL_COLS_53 == 8);
770 assert(VREG_INT_COUNT == 4);
771 #endif
772
773 /* Note: loads of input even/odd values must be done in a unaligned */
774 /* fashion. But stores in tmp can be done with aligned store, since */
775 /* the temporary buffer is properly aligned */
776 assert((OPJ_SIZE_T)tmp % (sizeof(OPJ_INT32) * VREG_INT_COUNT) == 0);
777
778 s1_0 = LOADU(in_even + stride);
779 /* in_odd[0] - ((in_even[0] + s1 + 2) >> 2); */
780 dc_0 = SUB(LOADU(in_odd + 0),
781 SAR(ADD3(LOADU(in_even + 0), s1_0, two), 2));
782 STORE(tmp + PARALLEL_COLS_53 * 0, ADD(LOADU(in_even + 0), dc_0));
783
784 s1_1 = LOADU(in_even + stride + VREG_INT_COUNT);
785 /* in_odd[0] - ((in_even[0] + s1 + 2) >> 2); */
786 dc_1 = SUB(LOADU(in_odd + VREG_INT_COUNT),
787 SAR(ADD3(LOADU(in_even + VREG_INT_COUNT), s1_1, two), 2));
788 STORE(tmp + PARALLEL_COLS_53 * 0 + VREG_INT_COUNT,
789 ADD(LOADU(in_even + VREG_INT_COUNT), dc_1));
790
791 for (i = 1, j = 1; i < (len - 2 - !(len & 1)); i += 2, j++) {
792
793 s2_0 = LOADU(in_even + (j + 1) * stride);
794 s2_1 = LOADU(in_even + (j + 1) * stride + VREG_INT_COUNT);
795
796 /* dn = in_odd[j * stride] - ((s1 + s2 + 2) >> 2); */
797 dn_0 = SUB(LOADU(in_odd + j * stride),
798 SAR(ADD3(s1_0, s2_0, two), 2));
799 dn_1 = SUB(LOADU(in_odd + j * stride + VREG_INT_COUNT),
800 SAR(ADD3(s1_1, s2_1, two), 2));
801
802 STORE(tmp + PARALLEL_COLS_53 * i, dc_0);
803 STORE(tmp + PARALLEL_COLS_53 * i + VREG_INT_COUNT, dc_1);
804
805 /* tmp[i + 1] = s1 + ((dn + dc) >> 1); */
806 STORE(tmp + PARALLEL_COLS_53 * (i + 1) + 0,
807 ADD(s1_0, SAR(ADD(dn_0, dc_0), 1)));
808 STORE(tmp + PARALLEL_COLS_53 * (i + 1) + VREG_INT_COUNT,
809 ADD(s1_1, SAR(ADD(dn_1, dc_1), 1)));
810
811 dc_0 = dn_0;
812 s1_0 = s2_0;
813 dc_1 = dn_1;
814 s1_1 = s2_1;
815 }
816 STORE(tmp + PARALLEL_COLS_53 * i, dc_0);
817 STORE(tmp + PARALLEL_COLS_53 * i + VREG_INT_COUNT, dc_1);
818
819 if (!(len & 1)) {
820 /*dn = in_odd[(len / 2 - 1) * stride] - ((s1 + 1) >> 1); */
821 dn_0 = SUB(LOADU(in_odd + (OPJ_SIZE_T)(len / 2 - 1) * stride),
822 SAR(ADD3(s1_0, s1_0, two), 2));
823 dn_1 = SUB(LOADU(in_odd + (OPJ_SIZE_T)(len / 2 - 1) * stride + VREG_INT_COUNT),
824 SAR(ADD3(s1_1, s1_1, two), 2));
825
826 /* tmp[len - 2] = s1 + ((dn + dc) >> 1); */
827 STORE(tmp + PARALLEL_COLS_53 * (len - 2) + 0,
828 ADD(s1_0, SAR(ADD(dn_0, dc_0), 1)));
829 STORE(tmp + PARALLEL_COLS_53 * (len - 2) + VREG_INT_COUNT,
830 ADD(s1_1, SAR(ADD(dn_1, dc_1), 1)));
831
832 STORE(tmp + PARALLEL_COLS_53 * (len - 1) + 0, dn_0);
833 STORE(tmp + PARALLEL_COLS_53 * (len - 1) + VREG_INT_COUNT, dn_1);
834 } else {
835 STORE(tmp + PARALLEL_COLS_53 * (len - 1) + 0, ADD(s1_0, dc_0));
836 STORE(tmp + PARALLEL_COLS_53 * (len - 1) + VREG_INT_COUNT,
837 ADD(s1_1, dc_1));
838 }
839
840 opj_idwt53_v_final_memcpy(tiledp_col, tmp, len, stride);
841 }
842
843 #undef VREG
844 #undef LOAD_CST
845 #undef LOADU
846 #undef LOAD
847 #undef STORE
848 #undef STOREU
849 #undef ADD
850 #undef ADD3
851 #undef SUB
852 #undef SAR
853
854 #endif /* (defined(__SSE2__) || defined(__AVX2__)) && !defined(STANDARD_SLOW_VERSION) */
855
856 #if !defined(STANDARD_SLOW_VERSION)
857 /** Vertical inverse 5x3 wavelet transform for one column, when top-most
858 * pixel is on even coordinate */
opj_idwt3_v_cas0(OPJ_INT32 * tmp,const OPJ_INT32 sn,const OPJ_INT32 len,OPJ_INT32 * tiledp_col,const OPJ_SIZE_T stride)859 static void opj_idwt3_v_cas0(OPJ_INT32* tmp,
860 const OPJ_INT32 sn,
861 const OPJ_INT32 len,
862 OPJ_INT32* tiledp_col,
863 const OPJ_SIZE_T stride)
864 {
865 OPJ_INT32 i, j;
866 OPJ_INT32 d1c, d1n, s1n, s0c, s0n;
867
868 assert(len > 1);
869
870 /* Performs lifting in one single iteration. Saves memory */
871 /* accesses and explicit interleaving. */
872
873 s1n = tiledp_col[0];
874 d1n = tiledp_col[(OPJ_SIZE_T)sn * stride];
875 s0n = s1n - ((d1n + 1) >> 1);
876
877 for (i = 0, j = 0; i < (len - 3); i += 2, j++) {
878 d1c = d1n;
879 s0c = s0n;
880
881 s1n = tiledp_col[(OPJ_SIZE_T)(j + 1) * stride];
882 d1n = tiledp_col[(OPJ_SIZE_T)(sn + j + 1) * stride];
883
884 s0n = s1n - ((d1c + d1n + 2) >> 2);
885
886 tmp[i ] = s0c;
887 tmp[i + 1] = d1c + ((s0c + s0n) >> 1);
888 }
889
890 tmp[i] = s0n;
891
892 if (len & 1) {
893 tmp[len - 1] =
894 tiledp_col[(OPJ_SIZE_T)((len - 1) / 2) * stride] -
895 ((d1n + 1) >> 1);
896 tmp[len - 2] = d1n + ((s0n + tmp[len - 1]) >> 1);
897 } else {
898 tmp[len - 1] = d1n + s0n;
899 }
900
901 for (i = 0; i < len; ++i) {
902 tiledp_col[(OPJ_SIZE_T)i * stride] = tmp[i];
903 }
904 }
905
906
907 /** Vertical inverse 5x3 wavelet transform for one column, when top-most
908 * pixel is on odd coordinate */
opj_idwt3_v_cas1(OPJ_INT32 * tmp,const OPJ_INT32 sn,const OPJ_INT32 len,OPJ_INT32 * tiledp_col,const OPJ_SIZE_T stride)909 static void opj_idwt3_v_cas1(OPJ_INT32* tmp,
910 const OPJ_INT32 sn,
911 const OPJ_INT32 len,
912 OPJ_INT32* tiledp_col,
913 const OPJ_SIZE_T stride)
914 {
915 OPJ_INT32 i, j;
916 OPJ_INT32 s1, s2, dc, dn;
917 const OPJ_INT32* in_even = &tiledp_col[(OPJ_SIZE_T)sn * stride];
918 const OPJ_INT32* in_odd = &tiledp_col[0];
919
920 assert(len > 2);
921
922 /* Performs lifting in one single iteration. Saves memory */
923 /* accesses and explicit interleaving. */
924
925 s1 = in_even[stride];
926 dc = in_odd[0] - ((in_even[0] + s1 + 2) >> 2);
927 tmp[0] = in_even[0] + dc;
928 for (i = 1, j = 1; i < (len - 2 - !(len & 1)); i += 2, j++) {
929
930 s2 = in_even[(OPJ_SIZE_T)(j + 1) * stride];
931
932 dn = in_odd[(OPJ_SIZE_T)j * stride] - ((s1 + s2 + 2) >> 2);
933 tmp[i ] = dc;
934 tmp[i + 1] = s1 + ((dn + dc) >> 1);
935
936 dc = dn;
937 s1 = s2;
938 }
939 tmp[i] = dc;
940 if (!(len & 1)) {
941 dn = in_odd[(OPJ_SIZE_T)(len / 2 - 1) * stride] - ((s1 + 1) >> 1);
942 tmp[len - 2] = s1 + ((dn + dc) >> 1);
943 tmp[len - 1] = dn;
944 } else {
945 tmp[len - 1] = s1 + dc;
946 }
947
948 for (i = 0; i < len; ++i) {
949 tiledp_col[(OPJ_SIZE_T)i * stride] = tmp[i];
950 }
951 }
952 #endif /* !defined(STANDARD_SLOW_VERSION) */
953
954 /* <summary> */
955 /* Inverse vertical 5-3 wavelet transform in 1-D for several columns. */
956 /* </summary> */
957 /* Performs interleave, inverse wavelet transform and copy back to buffer */
opj_idwt53_v(const opj_dwt_t * dwt,OPJ_INT32 * tiledp_col,OPJ_SIZE_T stride,OPJ_INT32 nb_cols)958 static void opj_idwt53_v(const opj_dwt_t *dwt,
959 OPJ_INT32* tiledp_col,
960 OPJ_SIZE_T stride,
961 OPJ_INT32 nb_cols)
962 {
963 #ifdef STANDARD_SLOW_VERSION
964 /* For documentation purpose */
965 OPJ_INT32 k, c;
966 for (c = 0; c < nb_cols; c ++) {
967 opj_dwt_interleave_v(dwt, tiledp_col + c, stride);
968 opj_dwt_decode_1(dwt);
969 for (k = 0; k < dwt->sn + dwt->dn; ++k) {
970 tiledp_col[c + k * stride] = dwt->mem[k];
971 }
972 }
973 #else
974 const OPJ_INT32 sn = dwt->sn;
975 const OPJ_INT32 len = sn + dwt->dn;
976 if (dwt->cas == 0) {
977 /* If len == 1, unmodified value */
978
979 #if (defined(__SSE2__) || defined(__AVX2__))
980 if (len > 1 && nb_cols == PARALLEL_COLS_53) {
981 /* Same as below general case, except that thanks to SSE2/AVX2 */
982 /* we can efficiently process 8/16 columns in parallel */
983 opj_idwt53_v_cas0_mcols_SSE2_OR_AVX2(dwt->mem, sn, len, tiledp_col, stride);
984 return;
985 }
986 #endif
987 if (len > 1) {
988 OPJ_INT32 c;
989 for (c = 0; c < nb_cols; c++, tiledp_col++) {
990 opj_idwt3_v_cas0(dwt->mem, sn, len, tiledp_col, stride);
991 }
992 return;
993 }
994 } else {
995 if (len == 1) {
996 OPJ_INT32 c;
997 for (c = 0; c < nb_cols; c++, tiledp_col++) {
998 tiledp_col[0] /= 2;
999 }
1000 return;
1001 }
1002
1003 if (len == 2) {
1004 OPJ_INT32 c;
1005 OPJ_INT32* out = dwt->mem;
1006 for (c = 0; c < nb_cols; c++, tiledp_col++) {
1007 OPJ_INT32 i;
1008 const OPJ_INT32* in_even = &tiledp_col[(OPJ_SIZE_T)sn * stride];
1009 const OPJ_INT32* in_odd = &tiledp_col[0];
1010
1011 out[1] = in_odd[0] - ((in_even[0] + 1) >> 1);
1012 out[0] = in_even[0] + out[1];
1013
1014 for (i = 0; i < len; ++i) {
1015 tiledp_col[(OPJ_SIZE_T)i * stride] = out[i];
1016 }
1017 }
1018
1019 return;
1020 }
1021
1022 #if (defined(__SSE2__) || defined(__AVX2__))
1023 if (len > 2 && nb_cols == PARALLEL_COLS_53) {
1024 /* Same as below general case, except that thanks to SSE2/AVX2 */
1025 /* we can efficiently process 8/16 columns in parallel */
1026 opj_idwt53_v_cas1_mcols_SSE2_OR_AVX2(dwt->mem, sn, len, tiledp_col, stride);
1027 return;
1028 }
1029 #endif
1030 if (len > 2) {
1031 OPJ_INT32 c;
1032 for (c = 0; c < nb_cols; c++, tiledp_col++) {
1033 opj_idwt3_v_cas1(dwt->mem, sn, len, tiledp_col, stride);
1034 }
1035 return;
1036 }
1037 }
1038 #endif
1039 }
1040
1041
1042 /* <summary> */
1043 /* Forward 9-7 wavelet transform in 1-D. */
1044 /* </summary> */
opj_dwt_encode_1_real(OPJ_INT32 * a,OPJ_SIZE_T a_count,OPJ_INT32 dn,OPJ_INT32 sn,OPJ_INT32 cas)1045 static void opj_dwt_encode_1_real(OPJ_INT32 *a, OPJ_SIZE_T a_count,
1046 OPJ_INT32 dn, OPJ_INT32 sn, OPJ_INT32 cas)
1047 {
1048 OPJ_INT32 i;
1049 if (!cas) {
1050 if ((dn > 0) || (sn > 1)) { /* NEW : CASE ONE ELEMENT */
1051 for (i = 0; i < dn; i++) {
1052 OPJ_D(i) -= opj_int_fix_mul(OPJ_S_(i) + OPJ_S_(i + 1), 12993);
1053 }
1054 for (i = 0; i < sn; i++) {
1055 OPJ_S(i) -= opj_int_fix_mul(OPJ_D_(i - 1) + OPJ_D_(i), 434);
1056 }
1057 for (i = 0; i < dn; i++) {
1058 OPJ_D(i) += opj_int_fix_mul(OPJ_S_(i) + OPJ_S_(i + 1), 7233);
1059 }
1060 for (i = 0; i < sn; i++) {
1061 OPJ_S(i) += opj_int_fix_mul(OPJ_D_(i - 1) + OPJ_D_(i), 3633);
1062 }
1063 for (i = 0; i < dn; i++) {
1064 OPJ_D(i) = opj_int_fix_mul(OPJ_D(i), 5038); /*5038 */
1065 }
1066 for (i = 0; i < sn; i++) {
1067 OPJ_S(i) = opj_int_fix_mul(OPJ_S(i), 6659); /*6660 */
1068 }
1069 }
1070 } else {
1071 if ((sn > 0) || (dn > 1)) { /* NEW : CASE ONE ELEMENT */
1072 for (i = 0; i < dn; i++) {
1073 OPJ_S(i) -= opj_int_fix_mul(OPJ_DD_(i) + OPJ_DD_(i - 1), 12993);
1074 }
1075 for (i = 0; i < sn; i++) {
1076 OPJ_D(i) -= opj_int_fix_mul(OPJ_SS_(i) + OPJ_SS_(i + 1), 434);
1077 }
1078 for (i = 0; i < dn; i++) {
1079 OPJ_S(i) += opj_int_fix_mul(OPJ_DD_(i) + OPJ_DD_(i - 1), 7233);
1080 }
1081 for (i = 0; i < sn; i++) {
1082 OPJ_D(i) += opj_int_fix_mul(OPJ_SS_(i) + OPJ_SS_(i + 1), 3633);
1083 }
1084 for (i = 0; i < dn; i++) {
1085 OPJ_S(i) = opj_int_fix_mul(OPJ_S(i), 5038); /*5038 */
1086 }
1087 for (i = 0; i < sn; i++) {
1088 OPJ_D(i) = opj_int_fix_mul(OPJ_D(i), 6659); /*6660 */
1089 }
1090 }
1091 }
1092 }
1093
opj_dwt_encode_stepsize(OPJ_INT32 stepsize,OPJ_INT32 numbps,opj_stepsize_t * bandno_stepsize)1094 static void opj_dwt_encode_stepsize(OPJ_INT32 stepsize, OPJ_INT32 numbps,
1095 opj_stepsize_t *bandno_stepsize)
1096 {
1097 OPJ_INT32 p, n;
1098 p = opj_int_floorlog2(stepsize) - 13;
1099 n = 11 - opj_int_floorlog2(stepsize);
1100 bandno_stepsize->mant = (n < 0 ? stepsize >> -n : stepsize << n) & 0x7ff;
1101 bandno_stepsize->expn = numbps - p;
1102 }
1103
1104 /*
1105 ==========================================================
1106 DWT interface
1107 ==========================================================
1108 */
1109
1110
1111 /* <summary> */
1112 /* Forward 5-3 wavelet transform in 2-D. */
1113 /* </summary> */
opj_dwt_encode_procedure(const opj_tcd_tilecomp_t * tilec,void (* p_function)(OPJ_INT32 *,OPJ_SIZE_T,OPJ_INT32,OPJ_INT32,OPJ_INT32))1114 static INLINE OPJ_BOOL opj_dwt_encode_procedure(const opj_tcd_tilecomp_t * tilec,
1115 void(*p_function)(OPJ_INT32 *, OPJ_SIZE_T, OPJ_INT32, OPJ_INT32, OPJ_INT32))
1116 {
1117 OPJ_INT32 i, j, k;
1118 OPJ_INT32 *a = 00;
1119 OPJ_INT32 *aj = 00;
1120 OPJ_INT32 *bj = 00;
1121 OPJ_INT32 w, l;
1122
1123 OPJ_INT32 rw; /* width of the resolution level computed */
1124 OPJ_INT32 rh; /* height of the resolution level computed */
1125 OPJ_SIZE_T l_data_count;
1126 OPJ_SIZE_T l_data_size;
1127
1128 opj_tcd_resolution_t * l_cur_res = 0;
1129 opj_tcd_resolution_t * l_last_res = 0;
1130
1131 w = tilec->x1 - tilec->x0;
1132 l = (OPJ_INT32)tilec->numresolutions - 1;
1133 a = tilec->data;
1134
1135 l_cur_res = tilec->resolutions + l;
1136 l_last_res = l_cur_res - 1;
1137
1138 l_data_count = opj_dwt_max_resolution(tilec->resolutions, tilec->numresolutions);
1139 /* overflow check */
1140 if (l_data_count > (SIZE_MAX / sizeof(OPJ_INT32))) {
1141 /* FIXME event manager error callback */
1142 return OPJ_FALSE;
1143 }
1144 l_data_size = l_data_count * sizeof(OPJ_INT32);
1145 bj = (OPJ_INT32*)opj_malloc(l_data_size);
1146 /* l_data_size is equal to 0 when numresolutions == 1 but bj is not used */
1147 /* in that case, so do not error out */
1148 if (l_data_size != 0 && ! bj) {
1149 return OPJ_FALSE;
1150 }
1151 i = l;
1152
1153 while (i--) {
1154 OPJ_INT32 rw1; /* width of the resolution level once lower than computed one */
1155 OPJ_INT32 rh1; /* height of the resolution level once lower than computed one */
1156 OPJ_INT32 cas_col; /* 0 = non inversion on horizontal filtering 1 = inversion between low-pass and high-pass filtering */
1157 OPJ_INT32 cas_row; /* 0 = non inversion on vertical filtering 1 = inversion between low-pass and high-pass filtering */
1158 OPJ_INT32 dn, sn;
1159
1160 rw = l_cur_res->x1 - l_cur_res->x0;
1161 rh = l_cur_res->y1 - l_cur_res->y0;
1162 rw1 = l_last_res->x1 - l_last_res->x0;
1163 rh1 = l_last_res->y1 - l_last_res->y0;
1164
1165 cas_row = l_cur_res->x0 & 1;
1166 cas_col = l_cur_res->y0 & 1;
1167
1168 sn = rh1;
1169 dn = rh - rh1;
1170 for (j = 0; j < rw; ++j) {
1171 aj = a + j;
1172 for (k = 0; k < rh; ++k) {
1173 bj[k] = aj[k * w];
1174 }
1175
1176 (*p_function) (bj, l_data_count, dn, sn, cas_col);
1177
1178 opj_dwt_deinterleave_v(bj, aj, dn, sn, w, cas_col);
1179 }
1180
1181 sn = rw1;
1182 dn = rw - rw1;
1183
1184 for (j = 0; j < rh; j++) {
1185 aj = a + j * w;
1186 for (k = 0; k < rw; k++) {
1187 bj[k] = aj[k];
1188 }
1189 (*p_function) (bj, l_data_count, dn, sn, cas_row);
1190 opj_dwt_deinterleave_h(bj, aj, dn, sn, cas_row);
1191 }
1192
1193 l_cur_res = l_last_res;
1194
1195 --l_last_res;
1196 }
1197
1198 opj_free(bj);
1199 return OPJ_TRUE;
1200 }
1201
1202 /* Forward 5-3 wavelet transform in 2-D. */
1203 /* </summary> */
opj_dwt_encode(opj_tcd_tilecomp_t * tilec)1204 OPJ_BOOL opj_dwt_encode(opj_tcd_tilecomp_t * tilec)
1205 {
1206 return opj_dwt_encode_procedure(tilec, opj_dwt_encode_1);
1207 }
1208
1209 /* <summary> */
1210 /* Inverse 5-3 wavelet transform in 2-D. */
1211 /* </summary> */
opj_dwt_decode(opj_tcd_t * p_tcd,opj_tcd_tilecomp_t * tilec,OPJ_UINT32 numres)1212 OPJ_BOOL opj_dwt_decode(opj_tcd_t *p_tcd, opj_tcd_tilecomp_t* tilec,
1213 OPJ_UINT32 numres)
1214 {
1215 if (p_tcd->whole_tile_decoding) {
1216 return opj_dwt_decode_tile(p_tcd->thread_pool, tilec, numres);
1217 } else {
1218 return opj_dwt_decode_partial_tile(tilec, numres);
1219 }
1220 }
1221
1222
1223 /* <summary> */
1224 /* Get gain of 5-3 wavelet transform. */
1225 /* </summary> */
opj_dwt_getgain(OPJ_UINT32 orient)1226 OPJ_UINT32 opj_dwt_getgain(OPJ_UINT32 orient)
1227 {
1228 if (orient == 0) {
1229 return 0;
1230 }
1231 if (orient == 1 || orient == 2) {
1232 return 1;
1233 }
1234 return 2;
1235 }
1236
1237 /* <summary> */
1238 /* Get norm of 5-3 wavelet. */
1239 /* </summary> */
opj_dwt_getnorm(OPJ_UINT32 level,OPJ_UINT32 orient)1240 OPJ_FLOAT64 opj_dwt_getnorm(OPJ_UINT32 level, OPJ_UINT32 orient)
1241 {
1242 /* FIXME ! This is just a band-aid to avoid a buffer overflow */
1243 /* but the array should really be extended up to 33 resolution levels */
1244 /* See https://github.com/uclouvain/openjpeg/issues/493 */
1245 if (orient == 0 && level >= 10) {
1246 level = 9;
1247 } else if (orient > 0 && level >= 9) {
1248 level = 8;
1249 }
1250 return opj_dwt_norms[orient][level];
1251 }
1252
1253 /* <summary> */
1254 /* Forward 9-7 wavelet transform in 2-D. */
1255 /* </summary> */
opj_dwt_encode_real(opj_tcd_tilecomp_t * tilec)1256 OPJ_BOOL opj_dwt_encode_real(opj_tcd_tilecomp_t * tilec)
1257 {
1258 return opj_dwt_encode_procedure(tilec, opj_dwt_encode_1_real);
1259 }
1260
1261 /* <summary> */
1262 /* Get gain of 9-7 wavelet transform. */
1263 /* </summary> */
opj_dwt_getgain_real(OPJ_UINT32 orient)1264 OPJ_UINT32 opj_dwt_getgain_real(OPJ_UINT32 orient)
1265 {
1266 (void)orient;
1267 return 0;
1268 }
1269
1270 /* <summary> */
1271 /* Get norm of 9-7 wavelet. */
1272 /* </summary> */
opj_dwt_getnorm_real(OPJ_UINT32 level,OPJ_UINT32 orient)1273 OPJ_FLOAT64 opj_dwt_getnorm_real(OPJ_UINT32 level, OPJ_UINT32 orient)
1274 {
1275 /* FIXME ! This is just a band-aid to avoid a buffer overflow */
1276 /* but the array should really be extended up to 33 resolution levels */
1277 /* See https://github.com/uclouvain/openjpeg/issues/493 */
1278 if (orient == 0 && level >= 10) {
1279 level = 9;
1280 } else if (orient > 0 && level >= 9) {
1281 level = 8;
1282 }
1283 return opj_dwt_norms_real[orient][level];
1284 }
1285
opj_dwt_calc_explicit_stepsizes(opj_tccp_t * tccp,OPJ_UINT32 prec)1286 void opj_dwt_calc_explicit_stepsizes(opj_tccp_t * tccp, OPJ_UINT32 prec)
1287 {
1288 OPJ_UINT32 numbands, bandno;
1289 numbands = 3 * tccp->numresolutions - 2;
1290 for (bandno = 0; bandno < numbands; bandno++) {
1291 OPJ_FLOAT64 stepsize;
1292 OPJ_UINT32 resno, level, orient, gain;
1293
1294 resno = (bandno == 0) ? 0 : ((bandno - 1) / 3 + 1);
1295 orient = (bandno == 0) ? 0 : ((bandno - 1) % 3 + 1);
1296 level = tccp->numresolutions - 1 - resno;
1297 gain = (tccp->qmfbid == 0) ? 0 : ((orient == 0) ? 0 : (((orient == 1) ||
1298 (orient == 2)) ? 1 : 2));
1299 if (tccp->qntsty == J2K_CCP_QNTSTY_NOQNT) {
1300 stepsize = 1.0;
1301 } else {
1302 OPJ_FLOAT64 norm = opj_dwt_norms_real[orient][level];
1303 stepsize = (1 << (gain)) / norm;
1304 }
1305 opj_dwt_encode_stepsize((OPJ_INT32) floor(stepsize * 8192.0),
1306 (OPJ_INT32)(prec + gain), &tccp->stepsizes[bandno]);
1307 }
1308 }
1309
1310 /* <summary> */
1311 /* Determine maximum computed resolution level for inverse wavelet transform */
1312 /* </summary> */
opj_dwt_max_resolution(opj_tcd_resolution_t * OPJ_RESTRICT r,OPJ_UINT32 i)1313 static OPJ_UINT32 opj_dwt_max_resolution(opj_tcd_resolution_t* OPJ_RESTRICT r,
1314 OPJ_UINT32 i)
1315 {
1316 OPJ_UINT32 mr = 0;
1317 OPJ_UINT32 w;
1318 while (--i) {
1319 ++r;
1320 if (mr < (w = (OPJ_UINT32)(r->x1 - r->x0))) {
1321 mr = w ;
1322 }
1323 if (mr < (w = (OPJ_UINT32)(r->y1 - r->y0))) {
1324 mr = w ;
1325 }
1326 }
1327 return mr ;
1328 }
1329
1330 typedef struct {
1331 opj_dwt_t h;
1332 OPJ_UINT32 rw;
1333 OPJ_UINT32 w;
1334 OPJ_INT32 * OPJ_RESTRICT tiledp;
1335 OPJ_UINT32 min_j;
1336 OPJ_UINT32 max_j;
1337 } opj_dwd_decode_h_job_t;
1338
opj_dwt_decode_h_func(void * user_data,opj_tls_t * tls)1339 static void opj_dwt_decode_h_func(void* user_data, opj_tls_t* tls)
1340 {
1341 OPJ_UINT32 j;
1342 opj_dwd_decode_h_job_t* job;
1343 (void)tls;
1344
1345 job = (opj_dwd_decode_h_job_t*)user_data;
1346 for (j = job->min_j; j < job->max_j; j++) {
1347 opj_idwt53_h(&job->h, &job->tiledp[j * job->w]);
1348 }
1349
1350 opj_aligned_free(job->h.mem);
1351 opj_free(job);
1352 }
1353
1354 typedef struct {
1355 opj_dwt_t v;
1356 OPJ_UINT32 rh;
1357 OPJ_UINT32 w;
1358 OPJ_INT32 * OPJ_RESTRICT tiledp;
1359 OPJ_UINT32 min_j;
1360 OPJ_UINT32 max_j;
1361 } opj_dwd_decode_v_job_t;
1362
opj_dwt_decode_v_func(void * user_data,opj_tls_t * tls)1363 static void opj_dwt_decode_v_func(void* user_data, opj_tls_t* tls)
1364 {
1365 OPJ_UINT32 j;
1366 opj_dwd_decode_v_job_t* job;
1367 (void)tls;
1368
1369 job = (opj_dwd_decode_v_job_t*)user_data;
1370 for (j = job->min_j; j + PARALLEL_COLS_53 <= job->max_j;
1371 j += PARALLEL_COLS_53) {
1372 opj_idwt53_v(&job->v, &job->tiledp[j], (OPJ_SIZE_T)job->w,
1373 PARALLEL_COLS_53);
1374 }
1375 if (j < job->max_j)
1376 opj_idwt53_v(&job->v, &job->tiledp[j], (OPJ_SIZE_T)job->w,
1377 (OPJ_INT32)(job->max_j - j));
1378
1379 opj_aligned_free(job->v.mem);
1380 opj_free(job);
1381 }
1382
1383
1384 /* <summary> */
1385 /* Inverse wavelet transform in 2-D. */
1386 /* </summary> */
opj_dwt_decode_tile(opj_thread_pool_t * tp,const opj_tcd_tilecomp_t * tilec,OPJ_UINT32 numres)1387 static OPJ_BOOL opj_dwt_decode_tile(opj_thread_pool_t* tp,
1388 const opj_tcd_tilecomp_t* tilec, OPJ_UINT32 numres)
1389 {
1390 opj_dwt_t h;
1391 opj_dwt_t v;
1392
1393 opj_tcd_resolution_t* tr = tilec->resolutions;
1394
1395 OPJ_UINT32 rw = (OPJ_UINT32)(tr->x1 -
1396 tr->x0); /* width of the resolution level computed */
1397 OPJ_UINT32 rh = (OPJ_UINT32)(tr->y1 -
1398 tr->y0); /* height of the resolution level computed */
1399
1400 OPJ_UINT32 w = (OPJ_UINT32)(tilec->resolutions[tilec->minimum_num_resolutions -
1401 1].x1 -
1402 tilec->resolutions[tilec->minimum_num_resolutions - 1].x0);
1403 OPJ_SIZE_T h_mem_size;
1404 int num_threads;
1405
1406 if (numres == 1U) {
1407 return OPJ_TRUE;
1408 }
1409 num_threads = opj_thread_pool_get_thread_count(tp);
1410 h.mem_count = opj_dwt_max_resolution(tr, numres);
1411 /* overflow check */
1412 if (h.mem_count > (SIZE_MAX / PARALLEL_COLS_53 / sizeof(OPJ_INT32))) {
1413 /* FIXME event manager error callback */
1414 return OPJ_FALSE;
1415 }
1416 /* We need PARALLEL_COLS_53 times the height of the array, */
1417 /* since for the vertical pass */
1418 /* we process PARALLEL_COLS_53 columns at a time */
1419 h_mem_size = h.mem_count * PARALLEL_COLS_53 * sizeof(OPJ_INT32);
1420 h.mem = (OPJ_INT32*)opj_aligned_32_malloc(h_mem_size);
1421 if (! h.mem) {
1422 /* FIXME event manager error callback */
1423 return OPJ_FALSE;
1424 }
1425
1426 v.mem_count = h.mem_count;
1427 v.mem = h.mem;
1428
1429 while (--numres) {
1430 OPJ_INT32 * OPJ_RESTRICT tiledp = tilec->data;
1431 OPJ_UINT32 j;
1432
1433 ++tr;
1434 h.sn = (OPJ_INT32)rw;
1435 v.sn = (OPJ_INT32)rh;
1436
1437 rw = (OPJ_UINT32)(tr->x1 - tr->x0);
1438 rh = (OPJ_UINT32)(tr->y1 - tr->y0);
1439
1440 h.dn = (OPJ_INT32)(rw - (OPJ_UINT32)h.sn);
1441 h.cas = tr->x0 % 2;
1442
1443 if (num_threads <= 1 || rh <= 1) {
1444 for (j = 0; j < rh; ++j) {
1445 opj_idwt53_h(&h, &tiledp[(OPJ_SIZE_T)j * w]);
1446 }
1447 } else {
1448 OPJ_UINT32 num_jobs = (OPJ_UINT32)num_threads;
1449 OPJ_UINT32 step_j;
1450
1451 if (rh < num_jobs) {
1452 num_jobs = rh;
1453 }
1454 step_j = (rh / num_jobs);
1455
1456 for (j = 0; j < num_jobs; j++) {
1457 opj_dwd_decode_h_job_t* job;
1458
1459 job = (opj_dwd_decode_h_job_t*) opj_malloc(sizeof(opj_dwd_decode_h_job_t));
1460 if (!job) {
1461 /* It would be nice to fallback to single thread case, but */
1462 /* unfortunately some jobs may be launched and have modified */
1463 /* tiledp, so it is not practical to recover from that error */
1464 /* FIXME event manager error callback */
1465 opj_thread_pool_wait_completion(tp, 0);
1466 opj_aligned_free(h.mem);
1467 return OPJ_FALSE;
1468 }
1469 job->h = h;
1470 job->rw = rw;
1471 job->w = w;
1472 job->tiledp = tiledp;
1473 job->min_j = j * step_j;
1474 job->max_j = (j + 1U) * step_j; /* this can overflow */
1475 if (j == (num_jobs - 1U)) { /* this will take care of the overflow */
1476 job->max_j = rh;
1477 }
1478 job->h.mem = (OPJ_INT32*)opj_aligned_32_malloc(h_mem_size);
1479 if (!job->h.mem) {
1480 /* FIXME event manager error callback */
1481 opj_thread_pool_wait_completion(tp, 0);
1482 opj_free(job);
1483 opj_aligned_free(h.mem);
1484 return OPJ_FALSE;
1485 }
1486 opj_thread_pool_submit_job(tp, opj_dwt_decode_h_func, job);
1487 }
1488 opj_thread_pool_wait_completion(tp, 0);
1489 }
1490
1491 v.dn = (OPJ_INT32)(rh - (OPJ_UINT32)v.sn);
1492 v.cas = tr->y0 % 2;
1493
1494 if (num_threads <= 1 || rw <= 1) {
1495 for (j = 0; j + PARALLEL_COLS_53 <= rw;
1496 j += PARALLEL_COLS_53) {
1497 opj_idwt53_v(&v, &tiledp[j], (OPJ_SIZE_T)w, PARALLEL_COLS_53);
1498 }
1499 if (j < rw) {
1500 opj_idwt53_v(&v, &tiledp[j], (OPJ_SIZE_T)w, (OPJ_INT32)(rw - j));
1501 }
1502 } else {
1503 OPJ_UINT32 num_jobs = (OPJ_UINT32)num_threads;
1504 OPJ_UINT32 step_j;
1505
1506 if (rw < num_jobs) {
1507 num_jobs = rw;
1508 }
1509 step_j = (rw / num_jobs);
1510
1511 for (j = 0; j < num_jobs; j++) {
1512 opj_dwd_decode_v_job_t* job;
1513
1514 job = (opj_dwd_decode_v_job_t*) opj_malloc(sizeof(opj_dwd_decode_v_job_t));
1515 if (!job) {
1516 /* It would be nice to fallback to single thread case, but */
1517 /* unfortunately some jobs may be launched and have modified */
1518 /* tiledp, so it is not practical to recover from that error */
1519 /* FIXME event manager error callback */
1520 opj_thread_pool_wait_completion(tp, 0);
1521 opj_aligned_free(v.mem);
1522 return OPJ_FALSE;
1523 }
1524 job->v = v;
1525 job->rh = rh;
1526 job->w = w;
1527 job->tiledp = tiledp;
1528 job->min_j = j * step_j;
1529 job->max_j = (j + 1U) * step_j; /* this can overflow */
1530 if (j == (num_jobs - 1U)) { /* this will take care of the overflow */
1531 job->max_j = rw;
1532 }
1533 job->v.mem = (OPJ_INT32*)opj_aligned_32_malloc(h_mem_size);
1534 if (!job->v.mem) {
1535 /* FIXME event manager error callback */
1536 opj_thread_pool_wait_completion(tp, 0);
1537 opj_free(job);
1538 opj_aligned_free(v.mem);
1539 return OPJ_FALSE;
1540 }
1541 opj_thread_pool_submit_job(tp, opj_dwt_decode_v_func, job);
1542 }
1543 opj_thread_pool_wait_completion(tp, 0);
1544 }
1545 }
1546 opj_aligned_free(h.mem);
1547 return OPJ_TRUE;
1548 }
1549
opj_dwt_interleave_partial_h(OPJ_INT32 * dest,OPJ_INT32 cas,opj_sparse_array_int32_t * sa,OPJ_UINT32 sa_line,OPJ_UINT32 sn,OPJ_UINT32 win_l_x0,OPJ_UINT32 win_l_x1,OPJ_UINT32 win_h_x0,OPJ_UINT32 win_h_x1)1550 static void opj_dwt_interleave_partial_h(OPJ_INT32 *dest,
1551 OPJ_INT32 cas,
1552 opj_sparse_array_int32_t* sa,
1553 OPJ_UINT32 sa_line,
1554 OPJ_UINT32 sn,
1555 OPJ_UINT32 win_l_x0,
1556 OPJ_UINT32 win_l_x1,
1557 OPJ_UINT32 win_h_x0,
1558 OPJ_UINT32 win_h_x1)
1559 {
1560 OPJ_BOOL ret;
1561 ret = opj_sparse_array_int32_read(sa,
1562 win_l_x0, sa_line,
1563 win_l_x1, sa_line + 1,
1564 dest + cas + 2 * win_l_x0,
1565 2, 0, OPJ_TRUE);
1566 assert(ret);
1567 ret = opj_sparse_array_int32_read(sa,
1568 sn + win_h_x0, sa_line,
1569 sn + win_h_x1, sa_line + 1,
1570 dest + 1 - cas + 2 * win_h_x0,
1571 2, 0, OPJ_TRUE);
1572 assert(ret);
1573 OPJ_UNUSED(ret);
1574 }
1575
1576
opj_dwt_interleave_partial_v(OPJ_INT32 * dest,OPJ_INT32 cas,opj_sparse_array_int32_t * sa,OPJ_UINT32 sa_col,OPJ_UINT32 nb_cols,OPJ_UINT32 sn,OPJ_UINT32 win_l_y0,OPJ_UINT32 win_l_y1,OPJ_UINT32 win_h_y0,OPJ_UINT32 win_h_y1)1577 static void opj_dwt_interleave_partial_v(OPJ_INT32 *dest,
1578 OPJ_INT32 cas,
1579 opj_sparse_array_int32_t* sa,
1580 OPJ_UINT32 sa_col,
1581 OPJ_UINT32 nb_cols,
1582 OPJ_UINT32 sn,
1583 OPJ_UINT32 win_l_y0,
1584 OPJ_UINT32 win_l_y1,
1585 OPJ_UINT32 win_h_y0,
1586 OPJ_UINT32 win_h_y1)
1587 {
1588 OPJ_BOOL ret;
1589 ret = opj_sparse_array_int32_read(sa,
1590 sa_col, win_l_y0,
1591 sa_col + nb_cols, win_l_y1,
1592 dest + cas * 4 + 2 * 4 * win_l_y0,
1593 1, 2 * 4, OPJ_TRUE);
1594 assert(ret);
1595 ret = opj_sparse_array_int32_read(sa,
1596 sa_col, sn + win_h_y0,
1597 sa_col + nb_cols, sn + win_h_y1,
1598 dest + (1 - cas) * 4 + 2 * 4 * win_h_y0,
1599 1, 2 * 4, OPJ_TRUE);
1600 assert(ret);
1601 OPJ_UNUSED(ret);
1602 }
1603
opj_dwt_decode_partial_1(OPJ_INT32 * a,OPJ_SIZE_T a_count,OPJ_INT32 dn,OPJ_INT32 sn,OPJ_INT32 cas,OPJ_INT32 win_l_x0,OPJ_INT32 win_l_x1,OPJ_INT32 win_h_x0,OPJ_INT32 win_h_x1)1604 static void opj_dwt_decode_partial_1(OPJ_INT32 *a, OPJ_SIZE_T a_count,
1605 OPJ_INT32 dn, OPJ_INT32 sn,
1606 OPJ_INT32 cas,
1607 OPJ_INT32 win_l_x0,
1608 OPJ_INT32 win_l_x1,
1609 OPJ_INT32 win_h_x0,
1610 OPJ_INT32 win_h_x1)
1611 {
1612 OPJ_INT32 i;
1613
1614 if (!cas) {
1615 if ((dn > 0) || (sn > 1)) { /* NEW : CASE ONE ELEMENT */
1616
1617 /* Naive version is :
1618 for (i = win_l_x0; i < i_max; i++) {
1619 OPJ_S(i) -= (OPJ_D_(i - 1) + OPJ_D_(i) + 2) >> 2;
1620 }
1621 for (i = win_h_x0; i < win_h_x1; i++) {
1622 OPJ_D(i) += (OPJ_S_(i) + OPJ_S_(i + 1)) >> 1;
1623 }
1624 but the compiler doesn't manage to unroll it to avoid bound
1625 checking in OPJ_S_ and OPJ_D_ macros
1626 */
1627
1628 i = win_l_x0;
1629 if (i < win_l_x1) {
1630 OPJ_INT32 i_max;
1631
1632 /* Left-most case */
1633 OPJ_S(i) -= (OPJ_D_(i - 1) + OPJ_D_(i) + 2) >> 2;
1634 i ++;
1635
1636 i_max = win_l_x1;
1637 if (i_max > dn) {
1638 i_max = dn;
1639 }
1640 for (; i < i_max; i++) {
1641 /* No bound checking */
1642 OPJ_S(i) -= (OPJ_D(i - 1) + OPJ_D(i) + 2) >> 2;
1643 }
1644 for (; i < win_l_x1; i++) {
1645 /* Right-most case */
1646 OPJ_S(i) -= (OPJ_D_(i - 1) + OPJ_D_(i) + 2) >> 2;
1647 }
1648 }
1649
1650 i = win_h_x0;
1651 if (i < win_h_x1) {
1652 OPJ_INT32 i_max = win_h_x1;
1653 if (i_max >= sn) {
1654 i_max = sn - 1;
1655 }
1656 for (; i < i_max; i++) {
1657 /* No bound checking */
1658 OPJ_D(i) += (OPJ_S(i) + OPJ_S(i + 1)) >> 1;
1659 }
1660 for (; i < win_h_x1; i++) {
1661 /* Right-most case */
1662 OPJ_D(i) += (OPJ_S_(i) + OPJ_S_(i + 1)) >> 1;
1663 }
1664 }
1665 }
1666 } else {
1667 if (!sn && dn == 1) { /* NEW : CASE ONE ELEMENT */
1668 OPJ_S(0) /= 2;
1669 } else {
1670 for (i = win_l_x0; i < win_l_x1; i++) {
1671 OPJ_D(i) -= (OPJ_SS_(i) + OPJ_SS_(i + 1) + 2) >> 2;
1672 }
1673 for (i = win_h_x0; i < win_h_x1; i++) {
1674 OPJ_S(i) += (OPJ_DD_(i) + OPJ_DD_(i - 1)) >> 1;
1675 }
1676 }
1677 }
1678 }
1679
1680 #define OPJ_S_off(i,off) a[(OPJ_UINT32)(i)*2*4+off]
1681 #define OPJ_D_off(i,off) a[(1+(OPJ_UINT32)(i)*2)*4+off]
1682 #define OPJ_S__off(i,off) ((i)<0?OPJ_S_off(0,off):((i)>=sn?OPJ_S_off(sn-1,off):OPJ_S_off(i,off)))
1683 #define OPJ_D__off(i,off) ((i)<0?OPJ_D_off(0,off):((i)>=dn?OPJ_D_off(dn-1,off):OPJ_D_off(i,off)))
1684 #define OPJ_SS__off(i,off) ((i)<0?OPJ_S_off(0,off):((i)>=dn?OPJ_S_off(dn-1,off):OPJ_S_off(i,off)))
1685 #define OPJ_DD__off(i,off) ((i)<0?OPJ_D_off(0,off):((i)>=sn?OPJ_D_off(sn-1,off):OPJ_D_off(i,off)))
1686
opj_dwt_decode_partial_1_parallel(OPJ_INT32 * a,OPJ_UINT32 nb_cols,OPJ_INT32 dn,OPJ_INT32 sn,OPJ_INT32 cas,OPJ_INT32 win_l_x0,OPJ_INT32 win_l_x1,OPJ_INT32 win_h_x0,OPJ_INT32 win_h_x1)1687 static void opj_dwt_decode_partial_1_parallel(OPJ_INT32 *a,
1688 OPJ_UINT32 nb_cols,
1689 OPJ_INT32 dn, OPJ_INT32 sn,
1690 OPJ_INT32 cas,
1691 OPJ_INT32 win_l_x0,
1692 OPJ_INT32 win_l_x1,
1693 OPJ_INT32 win_h_x0,
1694 OPJ_INT32 win_h_x1)
1695 {
1696 OPJ_INT32 i;
1697 OPJ_UINT32 off;
1698
1699 (void)nb_cols;
1700
1701 if (!cas) {
1702 if ((dn > 0) || (sn > 1)) { /* NEW : CASE ONE ELEMENT */
1703
1704 /* Naive version is :
1705 for (i = win_l_x0; i < i_max; i++) {
1706 OPJ_S(i) -= (OPJ_D_(i - 1) + OPJ_D_(i) + 2) >> 2;
1707 }
1708 for (i = win_h_x0; i < win_h_x1; i++) {
1709 OPJ_D(i) += (OPJ_S_(i) + OPJ_S_(i + 1)) >> 1;
1710 }
1711 but the compiler doesn't manage to unroll it to avoid bound
1712 checking in OPJ_S_ and OPJ_D_ macros
1713 */
1714
1715 i = win_l_x0;
1716 if (i < win_l_x1) {
1717 OPJ_INT32 i_max;
1718
1719 /* Left-most case */
1720 for (off = 0; off < 4; off++) {
1721 OPJ_S_off(i, off) -= (OPJ_D__off(i - 1, off) + OPJ_D__off(i, off) + 2) >> 2;
1722 }
1723 i ++;
1724
1725 i_max = win_l_x1;
1726 if (i_max > dn) {
1727 i_max = dn;
1728 }
1729
1730 #ifdef __SSE2__
1731 if (i + 1 < i_max) {
1732 const __m128i two = _mm_set1_epi32(2);
1733 __m128i Dm1 = _mm_load_si128((__m128i * const)(a + 4 + (i - 1) * 8));
1734 for (; i + 1 < i_max; i += 2) {
1735 /* No bound checking */
1736 __m128i S = _mm_load_si128((__m128i * const)(a + i * 8));
1737 __m128i D = _mm_load_si128((__m128i * const)(a + 4 + i * 8));
1738 __m128i S1 = _mm_load_si128((__m128i * const)(a + (i + 1) * 8));
1739 __m128i D1 = _mm_load_si128((__m128i * const)(a + 4 + (i + 1) * 8));
1740 S = _mm_sub_epi32(S,
1741 _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(Dm1, D), two), 2));
1742 S1 = _mm_sub_epi32(S1,
1743 _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(D, D1), two), 2));
1744 _mm_store_si128((__m128i*)(a + i * 8), S);
1745 _mm_store_si128((__m128i*)(a + (i + 1) * 8), S1);
1746 Dm1 = D1;
1747 }
1748 }
1749 #endif
1750
1751 for (; i < i_max; i++) {
1752 /* No bound checking */
1753 for (off = 0; off < 4; off++) {
1754 OPJ_S_off(i, off) -= (OPJ_D_off(i - 1, off) + OPJ_D_off(i, off) + 2) >> 2;
1755 }
1756 }
1757 for (; i < win_l_x1; i++) {
1758 /* Right-most case */
1759 for (off = 0; off < 4; off++) {
1760 OPJ_S_off(i, off) -= (OPJ_D__off(i - 1, off) + OPJ_D__off(i, off) + 2) >> 2;
1761 }
1762 }
1763 }
1764
1765 i = win_h_x0;
1766 if (i < win_h_x1) {
1767 OPJ_INT32 i_max = win_h_x1;
1768 if (i_max >= sn) {
1769 i_max = sn - 1;
1770 }
1771
1772 #ifdef __SSE2__
1773 if (i + 1 < i_max) {
1774 __m128i S = _mm_load_si128((__m128i * const)(a + i * 8));
1775 for (; i + 1 < i_max; i += 2) {
1776 /* No bound checking */
1777 __m128i D = _mm_load_si128((__m128i * const)(a + 4 + i * 8));
1778 __m128i S1 = _mm_load_si128((__m128i * const)(a + (i + 1) * 8));
1779 __m128i D1 = _mm_load_si128((__m128i * const)(a + 4 + (i + 1) * 8));
1780 __m128i S2 = _mm_load_si128((__m128i * const)(a + (i + 2) * 8));
1781 D = _mm_add_epi32(D, _mm_srai_epi32(_mm_add_epi32(S, S1), 1));
1782 D1 = _mm_add_epi32(D1, _mm_srai_epi32(_mm_add_epi32(S1, S2), 1));
1783 _mm_store_si128((__m128i*)(a + 4 + i * 8), D);
1784 _mm_store_si128((__m128i*)(a + 4 + (i + 1) * 8), D1);
1785 S = S2;
1786 }
1787 }
1788 #endif
1789
1790 for (; i < i_max; i++) {
1791 /* No bound checking */
1792 for (off = 0; off < 4; off++) {
1793 OPJ_D_off(i, off) += (OPJ_S_off(i, off) + OPJ_S_off(i + 1, off)) >> 1;
1794 }
1795 }
1796 for (; i < win_h_x1; i++) {
1797 /* Right-most case */
1798 for (off = 0; off < 4; off++) {
1799 OPJ_D_off(i, off) += (OPJ_S__off(i, off) + OPJ_S__off(i + 1, off)) >> 1;
1800 }
1801 }
1802 }
1803 }
1804 } else {
1805 if (!sn && dn == 1) { /* NEW : CASE ONE ELEMENT */
1806 for (off = 0; off < 4; off++) {
1807 OPJ_S_off(0, off) /= 2;
1808 }
1809 } else {
1810 for (i = win_l_x0; i < win_l_x1; i++) {
1811 for (off = 0; off < 4; off++) {
1812 OPJ_D_off(i, off) -= (OPJ_SS__off(i, off) + OPJ_SS__off(i + 1, off) + 2) >> 2;
1813 }
1814 }
1815 for (i = win_h_x0; i < win_h_x1; i++) {
1816 for (off = 0; off < 4; off++) {
1817 OPJ_S_off(i, off) += (OPJ_DD__off(i, off) + OPJ_DD__off(i - 1, off)) >> 1;
1818 }
1819 }
1820 }
1821 }
1822 }
1823
opj_dwt_get_band_coordinates(opj_tcd_tilecomp_t * tilec,OPJ_UINT32 resno,OPJ_UINT32 bandno,OPJ_UINT32 tcx0,OPJ_UINT32 tcy0,OPJ_UINT32 tcx1,OPJ_UINT32 tcy1,OPJ_UINT32 * tbx0,OPJ_UINT32 * tby0,OPJ_UINT32 * tbx1,OPJ_UINT32 * tby1)1824 static void opj_dwt_get_band_coordinates(opj_tcd_tilecomp_t* tilec,
1825 OPJ_UINT32 resno,
1826 OPJ_UINT32 bandno,
1827 OPJ_UINT32 tcx0,
1828 OPJ_UINT32 tcy0,
1829 OPJ_UINT32 tcx1,
1830 OPJ_UINT32 tcy1,
1831 OPJ_UINT32* tbx0,
1832 OPJ_UINT32* tby0,
1833 OPJ_UINT32* tbx1,
1834 OPJ_UINT32* tby1)
1835 {
1836 /* Compute number of decomposition for this band. See table F-1 */
1837 OPJ_UINT32 nb = (resno == 0) ?
1838 tilec->numresolutions - 1 :
1839 tilec->numresolutions - resno;
1840 /* Map above tile-based coordinates to sub-band-based coordinates per */
1841 /* equation B-15 of the standard */
1842 OPJ_UINT32 x0b = bandno & 1;
1843 OPJ_UINT32 y0b = bandno >> 1;
1844 if (tbx0) {
1845 *tbx0 = (nb == 0) ? tcx0 :
1846 (tcx0 <= (1U << (nb - 1)) * x0b) ? 0 :
1847 opj_uint_ceildivpow2(tcx0 - (1U << (nb - 1)) * x0b, nb);
1848 }
1849 if (tby0) {
1850 *tby0 = (nb == 0) ? tcy0 :
1851 (tcy0 <= (1U << (nb - 1)) * y0b) ? 0 :
1852 opj_uint_ceildivpow2(tcy0 - (1U << (nb - 1)) * y0b, nb);
1853 }
1854 if (tbx1) {
1855 *tbx1 = (nb == 0) ? tcx1 :
1856 (tcx1 <= (1U << (nb - 1)) * x0b) ? 0 :
1857 opj_uint_ceildivpow2(tcx1 - (1U << (nb - 1)) * x0b, nb);
1858 }
1859 if (tby1) {
1860 *tby1 = (nb == 0) ? tcy1 :
1861 (tcy1 <= (1U << (nb - 1)) * y0b) ? 0 :
1862 opj_uint_ceildivpow2(tcy1 - (1U << (nb - 1)) * y0b, nb);
1863 }
1864 }
1865
opj_dwt_segment_grow(OPJ_UINT32 filter_width,OPJ_UINT32 max_size,OPJ_UINT32 * start,OPJ_UINT32 * end)1866 static void opj_dwt_segment_grow(OPJ_UINT32 filter_width,
1867 OPJ_UINT32 max_size,
1868 OPJ_UINT32* start,
1869 OPJ_UINT32* end)
1870 {
1871 *start = opj_uint_subs(*start, filter_width);
1872 *end = opj_uint_adds(*end, filter_width);
1873 *end = opj_uint_min(*end, max_size);
1874 }
1875
1876
opj_dwt_init_sparse_array(opj_tcd_tilecomp_t * tilec,OPJ_UINT32 numres)1877 static opj_sparse_array_int32_t* opj_dwt_init_sparse_array(
1878 opj_tcd_tilecomp_t* tilec,
1879 OPJ_UINT32 numres)
1880 {
1881 opj_tcd_resolution_t* tr_max = &(tilec->resolutions[numres - 1]);
1882 OPJ_UINT32 w = (OPJ_UINT32)(tr_max->x1 - tr_max->x0);
1883 OPJ_UINT32 h = (OPJ_UINT32)(tr_max->y1 - tr_max->y0);
1884 OPJ_UINT32 resno, bandno, precno, cblkno;
1885 opj_sparse_array_int32_t* sa = opj_sparse_array_int32_create(
1886 w, h, opj_uint_min(w, 64), opj_uint_min(h, 64));
1887 if (sa == NULL) {
1888 return NULL;
1889 }
1890
1891 for (resno = 0; resno < numres; ++resno) {
1892 opj_tcd_resolution_t* res = &tilec->resolutions[resno];
1893
1894 for (bandno = 0; bandno < res->numbands; ++bandno) {
1895 opj_tcd_band_t* band = &res->bands[bandno];
1896
1897 for (precno = 0; precno < res->pw * res->ph; ++precno) {
1898 opj_tcd_precinct_t* precinct = &band->precincts[precno];
1899 for (cblkno = 0; cblkno < precinct->cw * precinct->ch; ++cblkno) {
1900 opj_tcd_cblk_dec_t* cblk = &precinct->cblks.dec[cblkno];
1901 if (cblk->decoded_data != NULL) {
1902 OPJ_UINT32 x = (OPJ_UINT32)(cblk->x0 - band->x0);
1903 OPJ_UINT32 y = (OPJ_UINT32)(cblk->y0 - band->y0);
1904 OPJ_UINT32 cblk_w = (OPJ_UINT32)(cblk->x1 - cblk->x0);
1905 OPJ_UINT32 cblk_h = (OPJ_UINT32)(cblk->y1 - cblk->y0);
1906
1907 if (band->bandno & 1) {
1908 opj_tcd_resolution_t* pres = &tilec->resolutions[resno - 1];
1909 x += (OPJ_UINT32)(pres->x1 - pres->x0);
1910 }
1911 if (band->bandno & 2) {
1912 opj_tcd_resolution_t* pres = &tilec->resolutions[resno - 1];
1913 y += (OPJ_UINT32)(pres->y1 - pres->y0);
1914 }
1915
1916 if (!opj_sparse_array_int32_write(sa, x, y,
1917 x + cblk_w, y + cblk_h,
1918 cblk->decoded_data,
1919 1, cblk_w, OPJ_TRUE)) {
1920 opj_sparse_array_int32_free(sa);
1921 return NULL;
1922 }
1923 }
1924 }
1925 }
1926 }
1927 }
1928
1929 return sa;
1930 }
1931
1932
opj_dwt_decode_partial_tile(opj_tcd_tilecomp_t * tilec,OPJ_UINT32 numres)1933 static OPJ_BOOL opj_dwt_decode_partial_tile(
1934 opj_tcd_tilecomp_t* tilec,
1935 OPJ_UINT32 numres)
1936 {
1937 opj_sparse_array_int32_t* sa;
1938 opj_dwt_t h;
1939 opj_dwt_t v;
1940 OPJ_UINT32 resno;
1941 /* This value matches the maximum left/right extension given in tables */
1942 /* F.2 and F.3 of the standard. */
1943 const OPJ_UINT32 filter_width = 2U;
1944
1945 opj_tcd_resolution_t* tr = tilec->resolutions;
1946 opj_tcd_resolution_t* tr_max = &(tilec->resolutions[numres - 1]);
1947
1948 OPJ_UINT32 rw = (OPJ_UINT32)(tr->x1 -
1949 tr->x0); /* width of the resolution level computed */
1950 OPJ_UINT32 rh = (OPJ_UINT32)(tr->y1 -
1951 tr->y0); /* height of the resolution level computed */
1952
1953 OPJ_SIZE_T h_mem_size;
1954
1955 /* Compute the intersection of the area of interest, expressed in tile coordinates */
1956 /* with the tile coordinates */
1957 OPJ_UINT32 win_tcx0 = tilec->win_x0;
1958 OPJ_UINT32 win_tcy0 = tilec->win_y0;
1959 OPJ_UINT32 win_tcx1 = tilec->win_x1;
1960 OPJ_UINT32 win_tcy1 = tilec->win_y1;
1961
1962 if (tr_max->x0 == tr_max->x1 || tr_max->y0 == tr_max->y1) {
1963 return OPJ_TRUE;
1964 }
1965
1966 sa = opj_dwt_init_sparse_array(tilec, numres);
1967 if (sa == NULL) {
1968 return OPJ_FALSE;
1969 }
1970
1971 if (numres == 1U) {
1972 OPJ_BOOL ret = opj_sparse_array_int32_read(sa,
1973 tr_max->win_x0 - (OPJ_UINT32)tr_max->x0,
1974 tr_max->win_y0 - (OPJ_UINT32)tr_max->y0,
1975 tr_max->win_x1 - (OPJ_UINT32)tr_max->x0,
1976 tr_max->win_y1 - (OPJ_UINT32)tr_max->y0,
1977 tilec->data_win,
1978 1, tr_max->win_x1 - tr_max->win_x0,
1979 OPJ_TRUE);
1980 assert(ret);
1981 OPJ_UNUSED(ret);
1982 opj_sparse_array_int32_free(sa);
1983 return OPJ_TRUE;
1984 }
1985 h.mem_count = opj_dwt_max_resolution(tr, numres);
1986 /* overflow check */
1987 /* in vertical pass, we process 4 columns at a time */
1988 if (h.mem_count > (SIZE_MAX / (4 * sizeof(OPJ_INT32)))) {
1989 /* FIXME event manager error callback */
1990 opj_sparse_array_int32_free(sa);
1991 return OPJ_FALSE;
1992 }
1993
1994 h_mem_size = h.mem_count * 4 * sizeof(OPJ_INT32);
1995 h.mem = (OPJ_INT32*)opj_aligned_32_malloc(h_mem_size);
1996 if (! h.mem) {
1997 /* FIXME event manager error callback */
1998 opj_sparse_array_int32_free(sa);
1999 return OPJ_FALSE;
2000 }
2001
2002 v.mem_count = h.mem_count;
2003 v.mem = h.mem;
2004
2005 for (resno = 1; resno < numres; resno ++) {
2006 OPJ_UINT32 i, j;
2007 /* Window of interest subband-based coordinates */
2008 OPJ_UINT32 win_ll_x0, win_ll_y0, win_ll_x1, win_ll_y1;
2009 OPJ_UINT32 win_hl_x0, win_hl_x1;
2010 OPJ_UINT32 win_lh_y0, win_lh_y1;
2011 /* Window of interest tile-resolution-based coordinates */
2012 OPJ_UINT32 win_tr_x0, win_tr_x1, win_tr_y0, win_tr_y1;
2013 /* Tile-resolution subband-based coordinates */
2014 OPJ_UINT32 tr_ll_x0, tr_ll_y0, tr_hl_x0, tr_lh_y0;
2015
2016 ++tr;
2017
2018 h.sn = (OPJ_INT32)rw;
2019 v.sn = (OPJ_INT32)rh;
2020
2021 rw = (OPJ_UINT32)(tr->x1 - tr->x0);
2022 rh = (OPJ_UINT32)(tr->y1 - tr->y0);
2023
2024 h.dn = (OPJ_INT32)(rw - (OPJ_UINT32)h.sn);
2025 h.cas = tr->x0 % 2;
2026
2027 v.dn = (OPJ_INT32)(rh - (OPJ_UINT32)v.sn);
2028 v.cas = tr->y0 % 2;
2029
2030 /* Get the subband coordinates for the window of interest */
2031 /* LL band */
2032 opj_dwt_get_band_coordinates(tilec, resno, 0,
2033 win_tcx0, win_tcy0, win_tcx1, win_tcy1,
2034 &win_ll_x0, &win_ll_y0,
2035 &win_ll_x1, &win_ll_y1);
2036
2037 /* HL band */
2038 opj_dwt_get_band_coordinates(tilec, resno, 1,
2039 win_tcx0, win_tcy0, win_tcx1, win_tcy1,
2040 &win_hl_x0, NULL, &win_hl_x1, NULL);
2041
2042 /* LH band */
2043 opj_dwt_get_band_coordinates(tilec, resno, 2,
2044 win_tcx0, win_tcy0, win_tcx1, win_tcy1,
2045 NULL, &win_lh_y0, NULL, &win_lh_y1);
2046
2047 /* Beware: band index for non-LL0 resolution are 0=HL, 1=LH and 2=HH */
2048 tr_ll_x0 = (OPJ_UINT32)tr->bands[1].x0;
2049 tr_ll_y0 = (OPJ_UINT32)tr->bands[0].y0;
2050 tr_hl_x0 = (OPJ_UINT32)tr->bands[0].x0;
2051 tr_lh_y0 = (OPJ_UINT32)tr->bands[1].y0;
2052
2053 /* Subtract the origin of the bands for this tile, to the subwindow */
2054 /* of interest band coordinates, so as to get them relative to the */
2055 /* tile */
2056 win_ll_x0 = opj_uint_subs(win_ll_x0, tr_ll_x0);
2057 win_ll_y0 = opj_uint_subs(win_ll_y0, tr_ll_y0);
2058 win_ll_x1 = opj_uint_subs(win_ll_x1, tr_ll_x0);
2059 win_ll_y1 = opj_uint_subs(win_ll_y1, tr_ll_y0);
2060 win_hl_x0 = opj_uint_subs(win_hl_x0, tr_hl_x0);
2061 win_hl_x1 = opj_uint_subs(win_hl_x1, tr_hl_x0);
2062 win_lh_y0 = opj_uint_subs(win_lh_y0, tr_lh_y0);
2063 win_lh_y1 = opj_uint_subs(win_lh_y1, tr_lh_y0);
2064
2065 opj_dwt_segment_grow(filter_width, (OPJ_UINT32)h.sn, &win_ll_x0, &win_ll_x1);
2066 opj_dwt_segment_grow(filter_width, (OPJ_UINT32)h.dn, &win_hl_x0, &win_hl_x1);
2067
2068 opj_dwt_segment_grow(filter_width, (OPJ_UINT32)v.sn, &win_ll_y0, &win_ll_y1);
2069 opj_dwt_segment_grow(filter_width, (OPJ_UINT32)v.dn, &win_lh_y0, &win_lh_y1);
2070
2071 /* Compute the tile-resolution-based coordinates for the window of interest */
2072 if (h.cas == 0) {
2073 win_tr_x0 = opj_uint_min(2 * win_ll_x0, 2 * win_hl_x0 + 1);
2074 win_tr_x1 = opj_uint_min(opj_uint_max(2 * win_ll_x1, 2 * win_hl_x1 + 1), rw);
2075 } else {
2076 win_tr_x0 = opj_uint_min(2 * win_hl_x0, 2 * win_ll_x0 + 1);
2077 win_tr_x1 = opj_uint_min(opj_uint_max(2 * win_hl_x1, 2 * win_ll_x1 + 1), rw);
2078 }
2079
2080 if (v.cas == 0) {
2081 win_tr_y0 = opj_uint_min(2 * win_ll_y0, 2 * win_lh_y0 + 1);
2082 win_tr_y1 = opj_uint_min(opj_uint_max(2 * win_ll_y1, 2 * win_lh_y1 + 1), rh);
2083 } else {
2084 win_tr_y0 = opj_uint_min(2 * win_lh_y0, 2 * win_ll_y0 + 1);
2085 win_tr_y1 = opj_uint_min(opj_uint_max(2 * win_lh_y1, 2 * win_ll_y1 + 1), rh);
2086 }
2087
2088 for (j = 0; j < rh; ++j) {
2089 if ((j >= win_ll_y0 && j < win_ll_y1) ||
2090 (j >= win_lh_y0 + (OPJ_UINT32)v.sn && j < win_lh_y1 + (OPJ_UINT32)v.sn)) {
2091
2092 /* Avoids dwt.c:1584:44 (in opj_dwt_decode_partial_1): runtime error: */
2093 /* signed integer overflow: -1094795586 + -1094795586 cannot be represented in type 'int' */
2094 /* on opj_decompress -i ../../openjpeg/MAPA.jp2 -o out.tif -d 0,0,256,256 */
2095 /* This is less extreme than memsetting the whole buffer to 0 */
2096 /* although we could potentially do better with better handling of edge conditions */
2097 if (win_tr_x1 >= 1 && win_tr_x1 < rw) {
2098 h.mem[win_tr_x1 - 1] = 0;
2099 }
2100 if (win_tr_x1 < rw) {
2101 h.mem[win_tr_x1] = 0;
2102 }
2103
2104 opj_dwt_interleave_partial_h(h.mem,
2105 h.cas,
2106 sa,
2107 j,
2108 (OPJ_UINT32)h.sn,
2109 win_ll_x0,
2110 win_ll_x1,
2111 win_hl_x0,
2112 win_hl_x1);
2113 opj_dwt_decode_partial_1(h.mem, h.mem_count, h.dn, h.sn, h.cas,
2114 (OPJ_INT32)win_ll_x0,
2115 (OPJ_INT32)win_ll_x1,
2116 (OPJ_INT32)win_hl_x0,
2117 (OPJ_INT32)win_hl_x1);
2118 if (!opj_sparse_array_int32_write(sa,
2119 win_tr_x0, j,
2120 win_tr_x1, j + 1,
2121 h.mem + win_tr_x0,
2122 1, 0, OPJ_TRUE)) {
2123 /* FIXME event manager error callback */
2124 opj_sparse_array_int32_free(sa);
2125 opj_aligned_free(h.mem);
2126 return OPJ_FALSE;
2127 }
2128 }
2129 }
2130
2131 for (i = win_tr_x0; i < win_tr_x1;) {
2132 OPJ_UINT32 nb_cols = opj_uint_min(4U, win_tr_x1 - i);
2133 opj_dwt_interleave_partial_v(v.mem,
2134 v.cas,
2135 sa,
2136 i,
2137 nb_cols,
2138 (OPJ_UINT32)v.sn,
2139 win_ll_y0,
2140 win_ll_y1,
2141 win_lh_y0,
2142 win_lh_y1);
2143 opj_dwt_decode_partial_1_parallel(v.mem, nb_cols, v.dn, v.sn, v.cas,
2144 (OPJ_INT32)win_ll_y0,
2145 (OPJ_INT32)win_ll_y1,
2146 (OPJ_INT32)win_lh_y0,
2147 (OPJ_INT32)win_lh_y1);
2148 if (!opj_sparse_array_int32_write(sa,
2149 i, win_tr_y0,
2150 i + nb_cols, win_tr_y1,
2151 v.mem + 4 * win_tr_y0,
2152 1, 4, OPJ_TRUE)) {
2153 /* FIXME event manager error callback */
2154 opj_sparse_array_int32_free(sa);
2155 opj_aligned_free(h.mem);
2156 return OPJ_FALSE;
2157 }
2158
2159 i += nb_cols;
2160 }
2161 }
2162 opj_aligned_free(h.mem);
2163
2164 {
2165 OPJ_BOOL ret = opj_sparse_array_int32_read(sa,
2166 tr_max->win_x0 - (OPJ_UINT32)tr_max->x0,
2167 tr_max->win_y0 - (OPJ_UINT32)tr_max->y0,
2168 tr_max->win_x1 - (OPJ_UINT32)tr_max->x0,
2169 tr_max->win_y1 - (OPJ_UINT32)tr_max->y0,
2170 tilec->data_win,
2171 1, tr_max->win_x1 - tr_max->win_x0,
2172 OPJ_TRUE);
2173 assert(ret);
2174 OPJ_UNUSED(ret);
2175 }
2176 opj_sparse_array_int32_free(sa);
2177 return OPJ_TRUE;
2178 }
2179
opj_v4dwt_interleave_h(opj_v4dwt_t * OPJ_RESTRICT dwt,OPJ_FLOAT32 * OPJ_RESTRICT a,OPJ_UINT32 width,OPJ_UINT32 remaining_height)2180 static void opj_v4dwt_interleave_h(opj_v4dwt_t* OPJ_RESTRICT dwt,
2181 OPJ_FLOAT32* OPJ_RESTRICT a,
2182 OPJ_UINT32 width,
2183 OPJ_UINT32 remaining_height)
2184 {
2185 OPJ_FLOAT32* OPJ_RESTRICT bi = (OPJ_FLOAT32*)(dwt->wavelet + dwt->cas);
2186 OPJ_UINT32 i, k;
2187 OPJ_UINT32 x0 = dwt->win_l_x0;
2188 OPJ_UINT32 x1 = dwt->win_l_x1;
2189
2190 for (k = 0; k < 2; ++k) {
2191 if (remaining_height >= 4 && ((OPJ_SIZE_T) a & 0x0f) == 0 &&
2192 ((OPJ_SIZE_T) bi & 0x0f) == 0 && (width & 0x0f) == 0) {
2193 /* Fast code path */
2194 for (i = x0; i < x1; ++i) {
2195 OPJ_UINT32 j = i;
2196 bi[i * 8 ] = a[j];
2197 j += width;
2198 bi[i * 8 + 1] = a[j];
2199 j += width;
2200 bi[i * 8 + 2] = a[j];
2201 j += width;
2202 bi[i * 8 + 3] = a[j];
2203 }
2204 } else {
2205 /* Slow code path */
2206 for (i = x0; i < x1; ++i) {
2207 OPJ_UINT32 j = i;
2208 bi[i * 8 ] = a[j];
2209 j += width;
2210 if (remaining_height == 1) {
2211 continue;
2212 }
2213 bi[i * 8 + 1] = a[j];
2214 j += width;
2215 if (remaining_height == 2) {
2216 continue;
2217 }
2218 bi[i * 8 + 2] = a[j];
2219 j += width;
2220 if (remaining_height == 3) {
2221 continue;
2222 }
2223 bi[i * 8 + 3] = a[j]; /* This one*/
2224 }
2225 }
2226
2227 bi = (OPJ_FLOAT32*)(dwt->wavelet + 1 - dwt->cas);
2228 a += dwt->sn;
2229 x0 = dwt->win_h_x0;
2230 x1 = dwt->win_h_x1;
2231 }
2232 }
2233
opj_v4dwt_interleave_partial_h(opj_v4dwt_t * dwt,opj_sparse_array_int32_t * sa,OPJ_UINT32 sa_line,OPJ_UINT32 remaining_height)2234 static void opj_v4dwt_interleave_partial_h(opj_v4dwt_t* dwt,
2235 opj_sparse_array_int32_t* sa,
2236 OPJ_UINT32 sa_line,
2237 OPJ_UINT32 remaining_height)
2238 {
2239 OPJ_UINT32 i;
2240 for (i = 0; i < remaining_height; i++) {
2241 OPJ_BOOL ret;
2242 ret = opj_sparse_array_int32_read(sa,
2243 dwt->win_l_x0, sa_line + i,
2244 dwt->win_l_x1, sa_line + i + 1,
2245 /* Nasty cast from float* to int32* */
2246 (OPJ_INT32*)(dwt->wavelet + dwt->cas + 2 * dwt->win_l_x0) + i,
2247 8, 0, OPJ_TRUE);
2248 assert(ret);
2249 ret = opj_sparse_array_int32_read(sa,
2250 (OPJ_UINT32)dwt->sn + dwt->win_h_x0, sa_line + i,
2251 (OPJ_UINT32)dwt->sn + dwt->win_h_x1, sa_line + i + 1,
2252 /* Nasty cast from float* to int32* */
2253 (OPJ_INT32*)(dwt->wavelet + 1 - dwt->cas + 2 * dwt->win_h_x0) + i,
2254 8, 0, OPJ_TRUE);
2255 assert(ret);
2256 OPJ_UNUSED(ret);
2257 }
2258 }
2259
opj_v4dwt_interleave_v(opj_v4dwt_t * OPJ_RESTRICT dwt,OPJ_FLOAT32 * OPJ_RESTRICT a,OPJ_UINT32 width,OPJ_UINT32 nb_elts_read)2260 static void opj_v4dwt_interleave_v(opj_v4dwt_t* OPJ_RESTRICT dwt,
2261 OPJ_FLOAT32* OPJ_RESTRICT a,
2262 OPJ_UINT32 width,
2263 OPJ_UINT32 nb_elts_read)
2264 {
2265 opj_v4_t* OPJ_RESTRICT bi = dwt->wavelet + dwt->cas;
2266 OPJ_UINT32 i;
2267
2268 for (i = dwt->win_l_x0; i < dwt->win_l_x1; ++i) {
2269 memcpy(&bi[i * 2], &a[i * (OPJ_SIZE_T)width],
2270 (OPJ_SIZE_T)nb_elts_read * sizeof(OPJ_FLOAT32));
2271 }
2272
2273 a += (OPJ_UINT32)dwt->sn * (OPJ_SIZE_T)width;
2274 bi = dwt->wavelet + 1 - dwt->cas;
2275
2276 for (i = dwt->win_h_x0; i < dwt->win_h_x1; ++i) {
2277 memcpy(&bi[i * 2], &a[i * (OPJ_SIZE_T)width],
2278 (OPJ_SIZE_T)nb_elts_read * sizeof(OPJ_FLOAT32));
2279 }
2280 }
2281
opj_v4dwt_interleave_partial_v(opj_v4dwt_t * OPJ_RESTRICT dwt,opj_sparse_array_int32_t * sa,OPJ_UINT32 sa_col,OPJ_UINT32 nb_elts_read)2282 static void opj_v4dwt_interleave_partial_v(opj_v4dwt_t* OPJ_RESTRICT dwt,
2283 opj_sparse_array_int32_t* sa,
2284 OPJ_UINT32 sa_col,
2285 OPJ_UINT32 nb_elts_read)
2286 {
2287 OPJ_BOOL ret;
2288 ret = opj_sparse_array_int32_read(sa,
2289 sa_col, dwt->win_l_x0,
2290 sa_col + nb_elts_read, dwt->win_l_x1,
2291 (OPJ_INT32*)(dwt->wavelet + dwt->cas + 2 * dwt->win_l_x0),
2292 1, 8, OPJ_TRUE);
2293 assert(ret);
2294 ret = opj_sparse_array_int32_read(sa,
2295 sa_col, (OPJ_UINT32)dwt->sn + dwt->win_h_x0,
2296 sa_col + nb_elts_read, (OPJ_UINT32)dwt->sn + dwt->win_h_x1,
2297 (OPJ_INT32*)(dwt->wavelet + 1 - dwt->cas + 2 * dwt->win_h_x0),
2298 1, 8, OPJ_TRUE);
2299 assert(ret);
2300 OPJ_UNUSED(ret);
2301 }
2302
2303 #ifdef __SSE__
2304
opj_v4dwt_decode_step1_sse(opj_v4_t * w,OPJ_UINT32 start,OPJ_UINT32 end,const __m128 c)2305 static void opj_v4dwt_decode_step1_sse(opj_v4_t* w,
2306 OPJ_UINT32 start,
2307 OPJ_UINT32 end,
2308 const __m128 c)
2309 {
2310 __m128* OPJ_RESTRICT vw = (__m128*) w;
2311 OPJ_UINT32 i;
2312 /* 4x unrolled loop */
2313 vw += 2 * start;
2314 for (i = start; i + 3 < end; i += 4, vw += 8) {
2315 __m128 xmm0 = _mm_mul_ps(vw[0], c);
2316 __m128 xmm2 = _mm_mul_ps(vw[2], c);
2317 __m128 xmm4 = _mm_mul_ps(vw[4], c);
2318 __m128 xmm6 = _mm_mul_ps(vw[6], c);
2319 vw[0] = xmm0;
2320 vw[2] = xmm2;
2321 vw[4] = xmm4;
2322 vw[6] = xmm6;
2323 }
2324 for (; i < end; ++i, vw += 2) {
2325 vw[0] = _mm_mul_ps(vw[0], c);
2326 }
2327 }
2328
opj_v4dwt_decode_step2_sse(opj_v4_t * l,opj_v4_t * w,OPJ_UINT32 start,OPJ_UINT32 end,OPJ_UINT32 m,__m128 c)2329 static void opj_v4dwt_decode_step2_sse(opj_v4_t* l, opj_v4_t* w,
2330 OPJ_UINT32 start,
2331 OPJ_UINT32 end,
2332 OPJ_UINT32 m,
2333 __m128 c)
2334 {
2335 __m128* OPJ_RESTRICT vl = (__m128*) l;
2336 __m128* OPJ_RESTRICT vw = (__m128*) w;
2337 OPJ_UINT32 i;
2338 OPJ_UINT32 imax = opj_uint_min(end, m);
2339 __m128 tmp1, tmp2, tmp3;
2340 if (start == 0) {
2341 tmp1 = vl[0];
2342 } else {
2343 vw += start * 2;
2344 tmp1 = vw[-3];
2345 }
2346
2347 i = start;
2348
2349 /* 4x loop unrolling */
2350 for (; i + 3 < imax; i += 4) {
2351 __m128 tmp4, tmp5, tmp6, tmp7, tmp8, tmp9;
2352 tmp2 = vw[-1];
2353 tmp3 = vw[ 0];
2354 tmp4 = vw[ 1];
2355 tmp5 = vw[ 2];
2356 tmp6 = vw[ 3];
2357 tmp7 = vw[ 4];
2358 tmp8 = vw[ 5];
2359 tmp9 = vw[ 6];
2360 vw[-1] = _mm_add_ps(tmp2, _mm_mul_ps(_mm_add_ps(tmp1, tmp3), c));
2361 vw[ 1] = _mm_add_ps(tmp4, _mm_mul_ps(_mm_add_ps(tmp3, tmp5), c));
2362 vw[ 3] = _mm_add_ps(tmp6, _mm_mul_ps(_mm_add_ps(tmp5, tmp7), c));
2363 vw[ 5] = _mm_add_ps(tmp8, _mm_mul_ps(_mm_add_ps(tmp7, tmp9), c));
2364 tmp1 = tmp9;
2365 vw += 8;
2366 }
2367
2368 for (; i < imax; ++i) {
2369 tmp2 = vw[-1];
2370 tmp3 = vw[ 0];
2371 vw[-1] = _mm_add_ps(tmp2, _mm_mul_ps(_mm_add_ps(tmp1, tmp3), c));
2372 tmp1 = tmp3;
2373 vw += 2;
2374 }
2375 if (m < end) {
2376 assert(m + 1 == end);
2377 c = _mm_add_ps(c, c);
2378 c = _mm_mul_ps(c, vw[-2]);
2379 vw[-1] = _mm_add_ps(vw[-1], c);
2380 }
2381 }
2382
2383 #else
2384
opj_v4dwt_decode_step1(opj_v4_t * w,OPJ_UINT32 start,OPJ_UINT32 end,const OPJ_FLOAT32 c)2385 static void opj_v4dwt_decode_step1(opj_v4_t* w,
2386 OPJ_UINT32 start,
2387 OPJ_UINT32 end,
2388 const OPJ_FLOAT32 c)
2389 {
2390 OPJ_FLOAT32* OPJ_RESTRICT fw = (OPJ_FLOAT32*) w;
2391 OPJ_UINT32 i;
2392 for (i = start; i < end; ++i) {
2393 OPJ_FLOAT32 tmp1 = fw[i * 8 ];
2394 OPJ_FLOAT32 tmp2 = fw[i * 8 + 1];
2395 OPJ_FLOAT32 tmp3 = fw[i * 8 + 2];
2396 OPJ_FLOAT32 tmp4 = fw[i * 8 + 3];
2397 fw[i * 8 ] = tmp1 * c;
2398 fw[i * 8 + 1] = tmp2 * c;
2399 fw[i * 8 + 2] = tmp3 * c;
2400 fw[i * 8 + 3] = tmp4 * c;
2401 }
2402 }
2403
opj_v4dwt_decode_step2(opj_v4_t * l,opj_v4_t * w,OPJ_UINT32 start,OPJ_UINT32 end,OPJ_UINT32 m,OPJ_FLOAT32 c)2404 static void opj_v4dwt_decode_step2(opj_v4_t* l, opj_v4_t* w,
2405 OPJ_UINT32 start,
2406 OPJ_UINT32 end,
2407 OPJ_UINT32 m,
2408 OPJ_FLOAT32 c)
2409 {
2410 OPJ_FLOAT32* fl = (OPJ_FLOAT32*) l;
2411 OPJ_FLOAT32* fw = (OPJ_FLOAT32*) w;
2412 OPJ_UINT32 i;
2413 OPJ_UINT32 imax = opj_uint_min(end, m);
2414 if (start > 0) {
2415 fw += 8 * start;
2416 fl = fw - 8;
2417 }
2418 for (i = start; i < imax; ++i) {
2419 OPJ_FLOAT32 tmp1_1 = fl[0];
2420 OPJ_FLOAT32 tmp1_2 = fl[1];
2421 OPJ_FLOAT32 tmp1_3 = fl[2];
2422 OPJ_FLOAT32 tmp1_4 = fl[3];
2423 OPJ_FLOAT32 tmp2_1 = fw[-4];
2424 OPJ_FLOAT32 tmp2_2 = fw[-3];
2425 OPJ_FLOAT32 tmp2_3 = fw[-2];
2426 OPJ_FLOAT32 tmp2_4 = fw[-1];
2427 OPJ_FLOAT32 tmp3_1 = fw[0];
2428 OPJ_FLOAT32 tmp3_2 = fw[1];
2429 OPJ_FLOAT32 tmp3_3 = fw[2];
2430 OPJ_FLOAT32 tmp3_4 = fw[3];
2431 fw[-4] = tmp2_1 + ((tmp1_1 + tmp3_1) * c);
2432 fw[-3] = tmp2_2 + ((tmp1_2 + tmp3_2) * c);
2433 fw[-2] = tmp2_3 + ((tmp1_3 + tmp3_3) * c);
2434 fw[-1] = tmp2_4 + ((tmp1_4 + tmp3_4) * c);
2435 fl = fw;
2436 fw += 8;
2437 }
2438 if (m < end) {
2439 assert(m + 1 == end);
2440 c += c;
2441 fw[-4] = fw[-4] + fl[0] * c;
2442 fw[-3] = fw[-3] + fl[1] * c;
2443 fw[-2] = fw[-2] + fl[2] * c;
2444 fw[-1] = fw[-1] + fl[3] * c;
2445 }
2446 }
2447
2448 #endif
2449
2450 /* <summary> */
2451 /* Inverse 9-7 wavelet transform in 1-D. */
2452 /* </summary> */
opj_v4dwt_decode(opj_v4dwt_t * OPJ_RESTRICT dwt)2453 static void opj_v4dwt_decode(opj_v4dwt_t* OPJ_RESTRICT dwt)
2454 {
2455 OPJ_INT32 a, b;
2456 if (dwt->cas == 0) {
2457 if (!((dwt->dn > 0) || (dwt->sn > 1))) {
2458 return;
2459 }
2460 a = 0;
2461 b = 1;
2462 } else {
2463 if (!((dwt->sn > 0) || (dwt->dn > 1))) {
2464 return;
2465 }
2466 a = 1;
2467 b = 0;
2468 }
2469 #ifdef __SSE__
2470 opj_v4dwt_decode_step1_sse(dwt->wavelet + a, dwt->win_l_x0, dwt->win_l_x1,
2471 _mm_set1_ps(opj_K));
2472 opj_v4dwt_decode_step1_sse(dwt->wavelet + b, dwt->win_h_x0, dwt->win_h_x1,
2473 _mm_set1_ps(opj_c13318));
2474 opj_v4dwt_decode_step2_sse(dwt->wavelet + b, dwt->wavelet + a + 1,
2475 dwt->win_l_x0, dwt->win_l_x1,
2476 (OPJ_UINT32)opj_int_min(dwt->sn, dwt->dn - a),
2477 _mm_set1_ps(opj_dwt_delta));
2478 opj_v4dwt_decode_step2_sse(dwt->wavelet + a, dwt->wavelet + b + 1,
2479 dwt->win_h_x0, dwt->win_h_x1,
2480 (OPJ_UINT32)opj_int_min(dwt->dn, dwt->sn - b),
2481 _mm_set1_ps(opj_dwt_gamma));
2482 opj_v4dwt_decode_step2_sse(dwt->wavelet + b, dwt->wavelet + a + 1,
2483 dwt->win_l_x0, dwt->win_l_x1,
2484 (OPJ_UINT32)opj_int_min(dwt->sn, dwt->dn - a),
2485 _mm_set1_ps(opj_dwt_beta));
2486 opj_v4dwt_decode_step2_sse(dwt->wavelet + a, dwt->wavelet + b + 1,
2487 dwt->win_h_x0, dwt->win_h_x1,
2488 (OPJ_UINT32)opj_int_min(dwt->dn, dwt->sn - b),
2489 _mm_set1_ps(opj_dwt_alpha));
2490 #else
2491 opj_v4dwt_decode_step1(dwt->wavelet + a, dwt->win_l_x0, dwt->win_l_x1,
2492 opj_K);
2493 opj_v4dwt_decode_step1(dwt->wavelet + b, dwt->win_h_x0, dwt->win_h_x1,
2494 opj_c13318);
2495 opj_v4dwt_decode_step2(dwt->wavelet + b, dwt->wavelet + a + 1,
2496 dwt->win_l_x0, dwt->win_l_x1,
2497 (OPJ_UINT32)opj_int_min(dwt->sn, dwt->dn - a),
2498 opj_dwt_delta);
2499 opj_v4dwt_decode_step2(dwt->wavelet + a, dwt->wavelet + b + 1,
2500 dwt->win_h_x0, dwt->win_h_x1,
2501 (OPJ_UINT32)opj_int_min(dwt->dn, dwt->sn - b),
2502 opj_dwt_gamma);
2503 opj_v4dwt_decode_step2(dwt->wavelet + b, dwt->wavelet + a + 1,
2504 dwt->win_l_x0, dwt->win_l_x1,
2505 (OPJ_UINT32)opj_int_min(dwt->sn, dwt->dn - a),
2506 opj_dwt_beta);
2507 opj_v4dwt_decode_step2(dwt->wavelet + a, dwt->wavelet + b + 1,
2508 dwt->win_h_x0, dwt->win_h_x1,
2509 (OPJ_UINT32)opj_int_min(dwt->dn, dwt->sn - b),
2510 opj_dwt_alpha);
2511 #endif
2512 }
2513
2514
2515 /* <summary> */
2516 /* Inverse 9-7 wavelet transform in 2-D. */
2517 /* </summary> */
2518 static
opj_dwt_decode_tile_97(opj_tcd_tilecomp_t * OPJ_RESTRICT tilec,OPJ_UINT32 numres)2519 OPJ_BOOL opj_dwt_decode_tile_97(opj_tcd_tilecomp_t* OPJ_RESTRICT tilec,
2520 OPJ_UINT32 numres)
2521 {
2522 opj_v4dwt_t h;
2523 opj_v4dwt_t v;
2524
2525 opj_tcd_resolution_t* res = tilec->resolutions;
2526
2527 OPJ_UINT32 rw = (OPJ_UINT32)(res->x1 -
2528 res->x0); /* width of the resolution level computed */
2529 OPJ_UINT32 rh = (OPJ_UINT32)(res->y1 -
2530 res->y0); /* height of the resolution level computed */
2531
2532 OPJ_UINT32 w = (OPJ_UINT32)(tilec->resolutions[tilec->minimum_num_resolutions -
2533 1].x1 -
2534 tilec->resolutions[tilec->minimum_num_resolutions - 1].x0);
2535
2536 OPJ_SIZE_T l_data_size;
2537
2538 l_data_size = opj_dwt_max_resolution(res, numres);
2539 /* overflow check */
2540 if (l_data_size > (SIZE_MAX - 5U)) {
2541 /* FIXME event manager error callback */
2542 return OPJ_FALSE;
2543 }
2544 l_data_size += 5U;
2545 /* overflow check */
2546 if (l_data_size > (SIZE_MAX / sizeof(opj_v4_t))) {
2547 /* FIXME event manager error callback */
2548 return OPJ_FALSE;
2549 }
2550 h.wavelet = (opj_v4_t*) opj_aligned_malloc(l_data_size * sizeof(opj_v4_t));
2551 if (!h.wavelet) {
2552 /* FIXME event manager error callback */
2553 return OPJ_FALSE;
2554 }
2555 v.wavelet = h.wavelet;
2556
2557 while (--numres) {
2558 OPJ_FLOAT32 * OPJ_RESTRICT aj = (OPJ_FLOAT32*) tilec->data;
2559 OPJ_UINT32 j;
2560
2561 h.sn = (OPJ_INT32)rw;
2562 v.sn = (OPJ_INT32)rh;
2563
2564 ++res;
2565
2566 rw = (OPJ_UINT32)(res->x1 -
2567 res->x0); /* width of the resolution level computed */
2568 rh = (OPJ_UINT32)(res->y1 -
2569 res->y0); /* height of the resolution level computed */
2570
2571 h.dn = (OPJ_INT32)(rw - (OPJ_UINT32)h.sn);
2572 h.cas = res->x0 % 2;
2573
2574 h.win_l_x0 = 0;
2575 h.win_l_x1 = (OPJ_UINT32)h.sn;
2576 h.win_h_x0 = 0;
2577 h.win_h_x1 = (OPJ_UINT32)h.dn;
2578 for (j = 0; j + 3 < rh; j += 4) {
2579 OPJ_UINT32 k;
2580 opj_v4dwt_interleave_h(&h, aj, w, rh - j);
2581 opj_v4dwt_decode(&h);
2582
2583 for (k = 0; k < rw; k++) {
2584 aj[k ] = h.wavelet[k].f[0];
2585 aj[k + (OPJ_SIZE_T)w ] = h.wavelet[k].f[1];
2586 aj[k + (OPJ_SIZE_T)w * 2] = h.wavelet[k].f[2];
2587 aj[k + (OPJ_SIZE_T)w * 3] = h.wavelet[k].f[3];
2588 }
2589
2590 aj += w * 4;
2591 }
2592
2593 if (j < rh) {
2594 OPJ_UINT32 k;
2595 opj_v4dwt_interleave_h(&h, aj, w, rh - j);
2596 opj_v4dwt_decode(&h);
2597 for (k = 0; k < rw; k++) {
2598 switch (rh - j) {
2599 case 3:
2600 aj[k + (OPJ_SIZE_T)w * 2] = h.wavelet[k].f[2];
2601 /* FALLTHRU */
2602 case 2:
2603 aj[k + (OPJ_SIZE_T)w ] = h.wavelet[k].f[1];
2604 /* FALLTHRU */
2605 case 1:
2606 aj[k] = h.wavelet[k].f[0];
2607 }
2608 }
2609 }
2610
2611 v.dn = (OPJ_INT32)(rh - (OPJ_UINT32)v.sn);
2612 v.cas = res->y0 % 2;
2613 v.win_l_x0 = 0;
2614 v.win_l_x1 = (OPJ_UINT32)v.sn;
2615 v.win_h_x0 = 0;
2616 v.win_h_x1 = (OPJ_UINT32)v.dn;
2617
2618 aj = (OPJ_FLOAT32*) tilec->data;
2619 for (j = rw; j > 3; j -= 4) {
2620 OPJ_UINT32 k;
2621
2622 opj_v4dwt_interleave_v(&v, aj, w, 4);
2623 opj_v4dwt_decode(&v);
2624
2625 for (k = 0; k < rh; ++k) {
2626 memcpy(&aj[k * (OPJ_SIZE_T)w], &v.wavelet[k], 4 * sizeof(OPJ_FLOAT32));
2627 }
2628 aj += 4;
2629 }
2630
2631 if (rw & 0x03) {
2632 OPJ_UINT32 k;
2633
2634 j = rw & 0x03;
2635
2636 opj_v4dwt_interleave_v(&v, aj, w, j);
2637 opj_v4dwt_decode(&v);
2638
2639 for (k = 0; k < rh; ++k) {
2640 memcpy(&aj[k * (OPJ_SIZE_T)w], &v.wavelet[k],
2641 (OPJ_SIZE_T)j * sizeof(OPJ_FLOAT32));
2642 }
2643 }
2644 }
2645
2646 opj_aligned_free(h.wavelet);
2647 return OPJ_TRUE;
2648 }
2649
2650 static
opj_dwt_decode_partial_97(opj_tcd_tilecomp_t * OPJ_RESTRICT tilec,OPJ_UINT32 numres)2651 OPJ_BOOL opj_dwt_decode_partial_97(opj_tcd_tilecomp_t* OPJ_RESTRICT tilec,
2652 OPJ_UINT32 numres)
2653 {
2654 opj_sparse_array_int32_t* sa;
2655 opj_v4dwt_t h;
2656 opj_v4dwt_t v;
2657 OPJ_UINT32 resno;
2658 /* This value matches the maximum left/right extension given in tables */
2659 /* F.2 and F.3 of the standard. Note: in opj_tcd_is_subband_area_of_interest() */
2660 /* we currently use 3. */
2661 const OPJ_UINT32 filter_width = 4U;
2662
2663 opj_tcd_resolution_t* tr = tilec->resolutions;
2664 opj_tcd_resolution_t* tr_max = &(tilec->resolutions[numres - 1]);
2665
2666 OPJ_UINT32 rw = (OPJ_UINT32)(tr->x1 -
2667 tr->x0); /* width of the resolution level computed */
2668 OPJ_UINT32 rh = (OPJ_UINT32)(tr->y1 -
2669 tr->y0); /* height of the resolution level computed */
2670
2671 OPJ_SIZE_T l_data_size;
2672
2673 /* Compute the intersection of the area of interest, expressed in tile coordinates */
2674 /* with the tile coordinates */
2675 OPJ_UINT32 win_tcx0 = tilec->win_x0;
2676 OPJ_UINT32 win_tcy0 = tilec->win_y0;
2677 OPJ_UINT32 win_tcx1 = tilec->win_x1;
2678 OPJ_UINT32 win_tcy1 = tilec->win_y1;
2679
2680 if (tr_max->x0 == tr_max->x1 || tr_max->y0 == tr_max->y1) {
2681 return OPJ_TRUE;
2682 }
2683
2684 sa = opj_dwt_init_sparse_array(tilec, numres);
2685 if (sa == NULL) {
2686 return OPJ_FALSE;
2687 }
2688
2689 if (numres == 1U) {
2690 OPJ_BOOL ret = opj_sparse_array_int32_read(sa,
2691 tr_max->win_x0 - (OPJ_UINT32)tr_max->x0,
2692 tr_max->win_y0 - (OPJ_UINT32)tr_max->y0,
2693 tr_max->win_x1 - (OPJ_UINT32)tr_max->x0,
2694 tr_max->win_y1 - (OPJ_UINT32)tr_max->y0,
2695 tilec->data_win,
2696 1, tr_max->win_x1 - tr_max->win_x0,
2697 OPJ_TRUE);
2698 assert(ret);
2699 OPJ_UNUSED(ret);
2700 opj_sparse_array_int32_free(sa);
2701 return OPJ_TRUE;
2702 }
2703
2704 l_data_size = opj_dwt_max_resolution(tr, numres);
2705 /* overflow check */
2706 if (l_data_size > (SIZE_MAX - 5U)) {
2707 /* FIXME event manager error callback */
2708 opj_sparse_array_int32_free(sa);
2709 return OPJ_FALSE;
2710 }
2711 l_data_size += 5U;
2712 /* overflow check */
2713 if (l_data_size > (SIZE_MAX / sizeof(opj_v4_t))) {
2714 /* FIXME event manager error callback */
2715 opj_sparse_array_int32_free(sa);
2716 return OPJ_FALSE;
2717 }
2718 h.wavelet = (opj_v4_t*) opj_aligned_malloc(l_data_size * sizeof(opj_v4_t));
2719 if (!h.wavelet) {
2720 /* FIXME event manager error callback */
2721 opj_sparse_array_int32_free(sa);
2722 return OPJ_FALSE;
2723 }
2724 v.wavelet = h.wavelet;
2725
2726 for (resno = 1; resno < numres; resno ++) {
2727 OPJ_UINT32 j;
2728 /* Window of interest subband-based coordinates */
2729 OPJ_UINT32 win_ll_x0, win_ll_y0, win_ll_x1, win_ll_y1;
2730 OPJ_UINT32 win_hl_x0, win_hl_x1;
2731 OPJ_UINT32 win_lh_y0, win_lh_y1;
2732 /* Window of interest tile-resolution-based coordinates */
2733 OPJ_UINT32 win_tr_x0, win_tr_x1, win_tr_y0, win_tr_y1;
2734 /* Tile-resolution subband-based coordinates */
2735 OPJ_UINT32 tr_ll_x0, tr_ll_y0, tr_hl_x0, tr_lh_y0;
2736
2737 ++tr;
2738
2739 h.sn = (OPJ_INT32)rw;
2740 v.sn = (OPJ_INT32)rh;
2741
2742 rw = (OPJ_UINT32)(tr->x1 - tr->x0);
2743 rh = (OPJ_UINT32)(tr->y1 - tr->y0);
2744
2745 h.dn = (OPJ_INT32)(rw - (OPJ_UINT32)h.sn);
2746 h.cas = tr->x0 % 2;
2747
2748 v.dn = (OPJ_INT32)(rh - (OPJ_UINT32)v.sn);
2749 v.cas = tr->y0 % 2;
2750
2751 /* Get the subband coordinates for the window of interest */
2752 /* LL band */
2753 opj_dwt_get_band_coordinates(tilec, resno, 0,
2754 win_tcx0, win_tcy0, win_tcx1, win_tcy1,
2755 &win_ll_x0, &win_ll_y0,
2756 &win_ll_x1, &win_ll_y1);
2757
2758 /* HL band */
2759 opj_dwt_get_band_coordinates(tilec, resno, 1,
2760 win_tcx0, win_tcy0, win_tcx1, win_tcy1,
2761 &win_hl_x0, NULL, &win_hl_x1, NULL);
2762
2763 /* LH band */
2764 opj_dwt_get_band_coordinates(tilec, resno, 2,
2765 win_tcx0, win_tcy0, win_tcx1, win_tcy1,
2766 NULL, &win_lh_y0, NULL, &win_lh_y1);
2767
2768 /* Beware: band index for non-LL0 resolution are 0=HL, 1=LH and 2=HH */
2769 tr_ll_x0 = (OPJ_UINT32)tr->bands[1].x0;
2770 tr_ll_y0 = (OPJ_UINT32)tr->bands[0].y0;
2771 tr_hl_x0 = (OPJ_UINT32)tr->bands[0].x0;
2772 tr_lh_y0 = (OPJ_UINT32)tr->bands[1].y0;
2773
2774 /* Subtract the origin of the bands for this tile, to the subwindow */
2775 /* of interest band coordinates, so as to get them relative to the */
2776 /* tile */
2777 win_ll_x0 = opj_uint_subs(win_ll_x0, tr_ll_x0);
2778 win_ll_y0 = opj_uint_subs(win_ll_y0, tr_ll_y0);
2779 win_ll_x1 = opj_uint_subs(win_ll_x1, tr_ll_x0);
2780 win_ll_y1 = opj_uint_subs(win_ll_y1, tr_ll_y0);
2781 win_hl_x0 = opj_uint_subs(win_hl_x0, tr_hl_x0);
2782 win_hl_x1 = opj_uint_subs(win_hl_x1, tr_hl_x0);
2783 win_lh_y0 = opj_uint_subs(win_lh_y0, tr_lh_y0);
2784 win_lh_y1 = opj_uint_subs(win_lh_y1, tr_lh_y0);
2785
2786 opj_dwt_segment_grow(filter_width, (OPJ_UINT32)h.sn, &win_ll_x0, &win_ll_x1);
2787 opj_dwt_segment_grow(filter_width, (OPJ_UINT32)h.dn, &win_hl_x0, &win_hl_x1);
2788
2789 opj_dwt_segment_grow(filter_width, (OPJ_UINT32)v.sn, &win_ll_y0, &win_ll_y1);
2790 opj_dwt_segment_grow(filter_width, (OPJ_UINT32)v.dn, &win_lh_y0, &win_lh_y1);
2791
2792 /* Compute the tile-resolution-based coordinates for the window of interest */
2793 if (h.cas == 0) {
2794 win_tr_x0 = opj_uint_min(2 * win_ll_x0, 2 * win_hl_x0 + 1);
2795 win_tr_x1 = opj_uint_min(opj_uint_max(2 * win_ll_x1, 2 * win_hl_x1 + 1), rw);
2796 } else {
2797 win_tr_x0 = opj_uint_min(2 * win_hl_x0, 2 * win_ll_x0 + 1);
2798 win_tr_x1 = opj_uint_min(opj_uint_max(2 * win_hl_x1, 2 * win_ll_x1 + 1), rw);
2799 }
2800
2801 if (v.cas == 0) {
2802 win_tr_y0 = opj_uint_min(2 * win_ll_y0, 2 * win_lh_y0 + 1);
2803 win_tr_y1 = opj_uint_min(opj_uint_max(2 * win_ll_y1, 2 * win_lh_y1 + 1), rh);
2804 } else {
2805 win_tr_y0 = opj_uint_min(2 * win_lh_y0, 2 * win_ll_y0 + 1);
2806 win_tr_y1 = opj_uint_min(opj_uint_max(2 * win_lh_y1, 2 * win_ll_y1 + 1), rh);
2807 }
2808
2809 h.win_l_x0 = win_ll_x0;
2810 h.win_l_x1 = win_ll_x1;
2811 h.win_h_x0 = win_hl_x0;
2812 h.win_h_x1 = win_hl_x1;
2813 for (j = 0; j + 3 < rh; j += 4) {
2814 if ((j + 3 >= win_ll_y0 && j < win_ll_y1) ||
2815 (j + 3 >= win_lh_y0 + (OPJ_UINT32)v.sn &&
2816 j < win_lh_y1 + (OPJ_UINT32)v.sn)) {
2817 opj_v4dwt_interleave_partial_h(&h, sa, j, opj_uint_min(4U, rh - j));
2818 opj_v4dwt_decode(&h);
2819 if (!opj_sparse_array_int32_write(sa,
2820 win_tr_x0, j,
2821 win_tr_x1, j + 4,
2822 (OPJ_INT32*)&h.wavelet[win_tr_x0].f[0],
2823 4, 1, OPJ_TRUE)) {
2824 /* FIXME event manager error callback */
2825 opj_sparse_array_int32_free(sa);
2826 opj_aligned_free(h.wavelet);
2827 return OPJ_FALSE;
2828 }
2829 }
2830 }
2831
2832 if (j < rh &&
2833 ((j + 3 >= win_ll_y0 && j < win_ll_y1) ||
2834 (j + 3 >= win_lh_y0 + (OPJ_UINT32)v.sn &&
2835 j < win_lh_y1 + (OPJ_UINT32)v.sn))) {
2836 opj_v4dwt_interleave_partial_h(&h, sa, j, rh - j);
2837 opj_v4dwt_decode(&h);
2838 if (!opj_sparse_array_int32_write(sa,
2839 win_tr_x0, j,
2840 win_tr_x1, rh,
2841 (OPJ_INT32*)&h.wavelet[win_tr_x0].f[0],
2842 4, 1, OPJ_TRUE)) {
2843 /* FIXME event manager error callback */
2844 opj_sparse_array_int32_free(sa);
2845 opj_aligned_free(h.wavelet);
2846 return OPJ_FALSE;
2847 }
2848 }
2849
2850 v.win_l_x0 = win_ll_y0;
2851 v.win_l_x1 = win_ll_y1;
2852 v.win_h_x0 = win_lh_y0;
2853 v.win_h_x1 = win_lh_y1;
2854 for (j = win_tr_x0; j < win_tr_x1; j += 4) {
2855 OPJ_UINT32 nb_elts = opj_uint_min(4U, win_tr_x1 - j);
2856
2857 opj_v4dwt_interleave_partial_v(&v, sa, j, nb_elts);
2858 opj_v4dwt_decode(&v);
2859
2860 if (!opj_sparse_array_int32_write(sa,
2861 j, win_tr_y0,
2862 j + nb_elts, win_tr_y1,
2863 (OPJ_INT32*)&h.wavelet[win_tr_y0].f[0],
2864 1, 4, OPJ_TRUE)) {
2865 /* FIXME event manager error callback */
2866 opj_sparse_array_int32_free(sa);
2867 opj_aligned_free(h.wavelet);
2868 return OPJ_FALSE;
2869 }
2870 }
2871 }
2872
2873 {
2874 OPJ_BOOL ret = opj_sparse_array_int32_read(sa,
2875 tr_max->win_x0 - (OPJ_UINT32)tr_max->x0,
2876 tr_max->win_y0 - (OPJ_UINT32)tr_max->y0,
2877 tr_max->win_x1 - (OPJ_UINT32)tr_max->x0,
2878 tr_max->win_y1 - (OPJ_UINT32)tr_max->y0,
2879 tilec->data_win,
2880 1, tr_max->win_x1 - tr_max->win_x0,
2881 OPJ_TRUE);
2882 assert(ret);
2883 OPJ_UNUSED(ret);
2884 }
2885 opj_sparse_array_int32_free(sa);
2886
2887 opj_aligned_free(h.wavelet);
2888 return OPJ_TRUE;
2889 }
2890
2891
opj_dwt_decode_real(opj_tcd_t * p_tcd,opj_tcd_tilecomp_t * OPJ_RESTRICT tilec,OPJ_UINT32 numres)2892 OPJ_BOOL opj_dwt_decode_real(opj_tcd_t *p_tcd,
2893 opj_tcd_tilecomp_t* OPJ_RESTRICT tilec,
2894 OPJ_UINT32 numres)
2895 {
2896 if (p_tcd->whole_tile_decoding) {
2897 return opj_dwt_decode_tile_97(tilec, numres);
2898 } else {
2899 return opj_dwt_decode_partial_97(tilec, numres);
2900 }
2901 }
2902