1 /*
2 * The copyright in this software is being made available under the 2-clauses
3 * BSD License, included below. This software may be subject to other third
4 * party and contributor rights, including patent rights, and no such rights
5 * are granted under this license.
6 *
7 * Copyright (c) 2002-2014, Universite catholique de Louvain (UCL), Belgium
8 * Copyright (c) 2002-2014, Professor Benoit Macq
9 * Copyright (c) 2001-2003, David Janssens
10 * Copyright (c) 2002-2003, Yannick Verschueren
11 * Copyright (c) 2003-2007, Francois-Olivier Devaux
12 * Copyright (c) 2003-2014, Antonin Descampe
13 * Copyright (c) 2005, Herve Drolon, FreeImage Team
14 * Copyright (c) 2007, Jonathan Ballard <dzonatas@dzonux.net>
15 * Copyright (c) 2007, Callum Lerwick <seg@haxxed.com>
16 * Copyright (c) 2017, IntoPIX SA <support@intopix.com>
17 * All rights reserved.
18 *
19 * Redistribution and use in source and binary forms, with or without
20 * modification, are permitted provided that the following conditions
21 * are met:
22 * 1. Redistributions of source code must retain the above copyright
23 * notice, this list of conditions and the following disclaimer.
24 * 2. Redistributions in binary form must reproduce the above copyright
25 * notice, this list of conditions and the following disclaimer in the
26 * documentation and/or other materials provided with the distribution.
27 *
28 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS `AS IS'
29 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
30 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
31 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
32 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
33 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
34 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
35 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
36 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
37 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
38 * POSSIBILITY OF SUCH DAMAGE.
39 */
40
41 #include <assert.h>
42
43 #define OPJ_SKIP_POISON
44 #include "opj_includes.h"
45
46 #ifdef __SSE__
47 #include <xmmintrin.h>
48 #endif
49 #ifdef __SSE2__
50 #include <emmintrin.h>
51 #endif
52 #ifdef __SSSE3__
53 #include <tmmintrin.h>
54 #endif
55 #ifdef __AVX2__
56 #include <immintrin.h>
57 #endif
58
59 #if defined(__GNUC__)
60 #pragma GCC poison malloc calloc realloc free
61 #endif
62
63 /** @defgroup DWT DWT - Implementation of a discrete wavelet transform */
64 /*@{*/
65
66 #ifdef __AVX2__
67 /** Number of int32 values in a AVX2 register */
68 #define VREG_INT_COUNT 8
69 #else
70 /** Number of int32 values in a SSE2 register */
71 #define VREG_INT_COUNT 4
72 #endif
73
74 /** Number of columns that we can process in parallel in the vertical pass */
75 #define PARALLEL_COLS_53 (2*VREG_INT_COUNT)
76
77 /** @name Local data structures */
78 /*@{*/
79
80 typedef struct dwt_local {
81 OPJ_INT32* mem;
82 OPJ_SIZE_T mem_count;
83 OPJ_INT32 dn; /* number of elements in high pass band */
84 OPJ_INT32 sn; /* number of elements in low pass band */
85 OPJ_INT32 cas; /* 0 = start on even coord, 1 = start on odd coord */
86 } opj_dwt_t;
87
88 #define NB_ELTS_V8 8
89
90 typedef union {
91 OPJ_FLOAT32 f[NB_ELTS_V8];
92 } opj_v8_t;
93
94 typedef struct v8dwt_local {
95 opj_v8_t* wavelet ;
96 OPJ_INT32 dn ; /* number of elements in high pass band */
97 OPJ_INT32 sn ; /* number of elements in low pass band */
98 OPJ_INT32 cas ; /* 0 = start on even coord, 1 = start on odd coord */
99 OPJ_UINT32 win_l_x0; /* start coord in low pass band */
100 OPJ_UINT32 win_l_x1; /* end coord in low pass band */
101 OPJ_UINT32 win_h_x0; /* start coord in high pass band */
102 OPJ_UINT32 win_h_x1; /* end coord in high pass band */
103 } opj_v8dwt_t ;
104
105 /* From table F.4 from the standard */
106 static const OPJ_FLOAT32 opj_dwt_alpha = -1.586134342f;
107 static const OPJ_FLOAT32 opj_dwt_beta = -0.052980118f;
108 static const OPJ_FLOAT32 opj_dwt_gamma = 0.882911075f;
109 static const OPJ_FLOAT32 opj_dwt_delta = 0.443506852f;
110
111 static const OPJ_FLOAT32 opj_K = 1.230174105f;
112 static const OPJ_FLOAT32 opj_invK = (OPJ_FLOAT32)(1.0 / 1.230174105);
113
114 /*@}*/
115
116 /** @name Local static functions */
117 /*@{*/
118
119 /**
120 Forward lazy transform (horizontal)
121 */
122 static void opj_dwt_deinterleave_h(const OPJ_INT32 * OPJ_RESTRICT a,
123 OPJ_INT32 * OPJ_RESTRICT b,
124 OPJ_INT32 dn,
125 OPJ_INT32 sn, OPJ_INT32 cas);
126
127 /**
128 Forward 9-7 wavelet transform in 1-D
129 */
130 static void opj_dwt_encode_1_real(void *a, OPJ_INT32 dn, OPJ_INT32 sn,
131 OPJ_INT32 cas);
132 /**
133 Explicit calculation of the Quantization Stepsizes
134 */
135 static void opj_dwt_encode_stepsize(OPJ_INT32 stepsize, OPJ_INT32 numbps,
136 opj_stepsize_t *bandno_stepsize);
137 /**
138 Inverse wavelet transform in 2-D.
139 */
140 static OPJ_BOOL opj_dwt_decode_tile(opj_thread_pool_t* tp,
141 const opj_tcd_tilecomp_t* tilec, OPJ_UINT32 i);
142
143 static OPJ_BOOL opj_dwt_decode_partial_tile(
144 opj_tcd_tilecomp_t* tilec,
145 OPJ_UINT32 numres);
146
147 /* Forward transform, for the vertical pass, processing cols columns */
148 /* where cols <= NB_ELTS_V8 */
149 /* Where void* is a OPJ_INT32* for 5x3 and OPJ_FLOAT32* for 9x7 */
150 typedef void (*opj_encode_and_deinterleave_v_fnptr_type)(
151 void *array,
152 void *tmp,
153 OPJ_UINT32 height,
154 OPJ_BOOL even,
155 OPJ_UINT32 stride_width,
156 OPJ_UINT32 cols);
157
158 /* Where void* is a OPJ_INT32* for 5x3 and OPJ_FLOAT32* for 9x7 */
159 typedef void (*opj_encode_and_deinterleave_h_one_row_fnptr_type)(
160 void *row,
161 void *tmp,
162 OPJ_UINT32 width,
163 OPJ_BOOL even);
164
165 static OPJ_BOOL opj_dwt_encode_procedure(opj_thread_pool_t* tp,
166 opj_tcd_tilecomp_t * tilec,
167 opj_encode_and_deinterleave_v_fnptr_type p_encode_and_deinterleave_v,
168 opj_encode_and_deinterleave_h_one_row_fnptr_type
169 p_encode_and_deinterleave_h_one_row);
170
171 static OPJ_UINT32 opj_dwt_max_resolution(opj_tcd_resolution_t* OPJ_RESTRICT r,
172 OPJ_UINT32 i);
173
174 /* <summary> */
175 /* Inverse 9-7 wavelet transform in 1-D. */
176 /* </summary> */
177
178 /*@}*/
179
180 /*@}*/
181
182 #define IDX_S(i) (i)*2
183 #define IDX_D(i) 1 + (i)* 2
184 #define UNDERFLOW_SN(i) ((i) >= sn&&sn>0)
185 #define UNDERFLOW_DN(i) ((i) >= dn&&dn>0)
186 #define OVERFLOW_S(i) (IDX_S(i) >= a_count)
187 #define OVERFLOW_D(i) (IDX_D(i) >= a_count)
188
189 #define OPJ_S(i) a[IDX_S(i)]
190 #define OPJ_D(i) a[IDX_D(i)]
191 #define OPJ_S_(i) ((i)<0 ? OPJ_S(0) : (UNDERFLOW_SN(i) ? OPJ_S(sn - 1) : OVERFLOW_S(i) ? OPJ_S(i - 1) : OPJ_S(i)))
192 #define OPJ_D_(i) ((i)<0 ? OPJ_D(0) : (UNDERFLOW_DN(i) ? OPJ_D(dn - 1) : OVERFLOW_D(i) ? OPJ_D(i - 1) : OPJ_D(i)))
193 /* new */
194 #define OPJ_SS_(i) ((i)<0 ? OPJ_S(0) : (UNDERFLOW_DN(i) ? OPJ_S(dn - 1) : OVERFLOW_S(i) ? OPJ_S(i - 1) : OPJ_S(i)))
195 #define OPJ_DD_(i) ((i)<0 ? OPJ_D(0) : (UNDERFLOW_SN(i) ? OPJ_D(sn - 1) : OVERFLOW_D(i) ? OPJ_D(i - 1) : OPJ_D(i)))
196
197 /* <summary> */
198 /* This table contains the norms of the 5-3 wavelets for different bands. */
199 /* </summary> */
200 /* FIXME! the array should really be extended up to 33 resolution levels */
201 /* See https://github.com/uclouvain/openjpeg/issues/493 */
202 static const OPJ_FLOAT64 opj_dwt_norms[4][10] = {
203 {1.000, 1.500, 2.750, 5.375, 10.68, 21.34, 42.67, 85.33, 170.7, 341.3},
204 {1.038, 1.592, 2.919, 5.703, 11.33, 22.64, 45.25, 90.48, 180.9},
205 {1.038, 1.592, 2.919, 5.703, 11.33, 22.64, 45.25, 90.48, 180.9},
206 {.7186, .9218, 1.586, 3.043, 6.019, 12.01, 24.00, 47.97, 95.93}
207 };
208
209 /* <summary> */
210 /* This table contains the norms of the 9-7 wavelets for different bands. */
211 /* </summary> */
212 /* FIXME! the array should really be extended up to 33 resolution levels */
213 /* See https://github.com/uclouvain/openjpeg/issues/493 */
214 static const OPJ_FLOAT64 opj_dwt_norms_real[4][10] = {
215 {1.000, 1.965, 4.177, 8.403, 16.90, 33.84, 67.69, 135.3, 270.6, 540.9},
216 {2.022, 3.989, 8.355, 17.04, 34.27, 68.63, 137.3, 274.6, 549.0},
217 {2.022, 3.989, 8.355, 17.04, 34.27, 68.63, 137.3, 274.6, 549.0},
218 {2.080, 3.865, 8.307, 17.18, 34.71, 69.59, 139.3, 278.6, 557.2}
219 };
220
221 /*
222 ==========================================================
223 local functions
224 ==========================================================
225 */
226
227 /* <summary> */
228 /* Forward lazy transform (horizontal). */
229 /* </summary> */
opj_dwt_deinterleave_h(const OPJ_INT32 * OPJ_RESTRICT a,OPJ_INT32 * OPJ_RESTRICT b,OPJ_INT32 dn,OPJ_INT32 sn,OPJ_INT32 cas)230 static void opj_dwt_deinterleave_h(const OPJ_INT32 * OPJ_RESTRICT a,
231 OPJ_INT32 * OPJ_RESTRICT b,
232 OPJ_INT32 dn,
233 OPJ_INT32 sn, OPJ_INT32 cas)
234 {
235 OPJ_INT32 i;
236 OPJ_INT32 * OPJ_RESTRICT l_dest = b;
237 const OPJ_INT32 * OPJ_RESTRICT l_src = a + cas;
238
239 for (i = 0; i < sn; ++i) {
240 *l_dest++ = *l_src;
241 l_src += 2;
242 }
243
244 l_dest = b + sn;
245 l_src = a + 1 - cas;
246
247 for (i = 0; i < dn; ++i) {
248 *l_dest++ = *l_src;
249 l_src += 2;
250 }
251 }
252
253 #ifdef STANDARD_SLOW_VERSION
254 /* <summary> */
255 /* Inverse lazy transform (horizontal). */
256 /* </summary> */
opj_dwt_interleave_h(const opj_dwt_t * h,OPJ_INT32 * a)257 static void opj_dwt_interleave_h(const opj_dwt_t* h, OPJ_INT32 *a)
258 {
259 const OPJ_INT32 *ai = a;
260 OPJ_INT32 *bi = h->mem + h->cas;
261 OPJ_INT32 i = h->sn;
262 while (i--) {
263 *bi = *(ai++);
264 bi += 2;
265 }
266 ai = a + h->sn;
267 bi = h->mem + 1 - h->cas;
268 i = h->dn ;
269 while (i--) {
270 *bi = *(ai++);
271 bi += 2;
272 }
273 }
274
275 /* <summary> */
276 /* Inverse lazy transform (vertical). */
277 /* </summary> */
opj_dwt_interleave_v(const opj_dwt_t * v,OPJ_INT32 * a,OPJ_INT32 x)278 static void opj_dwt_interleave_v(const opj_dwt_t* v, OPJ_INT32 *a, OPJ_INT32 x)
279 {
280 const OPJ_INT32 *ai = a;
281 OPJ_INT32 *bi = v->mem + v->cas;
282 OPJ_INT32 i = v->sn;
283 while (i--) {
284 *bi = *ai;
285 bi += 2;
286 ai += x;
287 }
288 ai = a + (v->sn * (OPJ_SIZE_T)x);
289 bi = v->mem + 1 - v->cas;
290 i = v->dn ;
291 while (i--) {
292 *bi = *ai;
293 bi += 2;
294 ai += x;
295 }
296 }
297
298 #endif /* STANDARD_SLOW_VERSION */
299
300 #ifdef STANDARD_SLOW_VERSION
301 /* <summary> */
302 /* Inverse 5-3 wavelet transform in 1-D. */
303 /* </summary> */
opj_dwt_decode_1_(OPJ_INT32 * a,OPJ_SIZE_T a_count,OPJ_INT32 dn,OPJ_INT32 sn,OPJ_INT32 cas)304 static void opj_dwt_decode_1_(OPJ_INT32 *a, OPJ_SIZE_T a_count, OPJ_INT32 dn,
305 OPJ_INT32 sn, OPJ_INT32 cas)
306 {
307 OPJ_INT32 i;
308
309 if (!cas) {
310 if ((dn > 0) || (sn > 1)) { /* NEW : CASE ONE ELEMENT */
311 for (i = 0; i < sn; i++) {
312 OPJ_S(i) -= (OPJ_D_(i - 1) + OPJ_D_(i) + 2) >> 2;
313 }
314 for (i = 0; i < dn; i++) {
315 OPJ_D(i) += (OPJ_S_(i) + OPJ_S_(i + 1)) >> 1;
316 }
317 }
318 } else {
319 if (!sn && dn == 1) { /* NEW : CASE ONE ELEMENT */
320 OPJ_S(0) /= 2;
321 } else {
322 for (i = 0; i < sn; i++) {
323 OPJ_D(i) -= (OPJ_SS_(i) + OPJ_SS_(i + 1) + 2) >> 2;
324 }
325 for (i = 0; i < dn; i++) {
326 OPJ_S(i) += (OPJ_DD_(i) + OPJ_DD_(i - 1)) >> 1;
327 }
328 }
329 }
330 }
331
opj_dwt_decode_1(const opj_dwt_t * v)332 static void opj_dwt_decode_1(const opj_dwt_t *v)
333 {
334 opj_dwt_decode_1_(v->mem, v->mem_count, v->dn, v->sn, v->cas);
335 }
336
337 #endif /* STANDARD_SLOW_VERSION */
338
339 #if !defined(STANDARD_SLOW_VERSION)
opj_idwt53_h_cas0(OPJ_INT32 * tmp,const OPJ_INT32 sn,const OPJ_INT32 len,OPJ_INT32 * tiledp)340 static void opj_idwt53_h_cas0(OPJ_INT32* tmp,
341 const OPJ_INT32 sn,
342 const OPJ_INT32 len,
343 OPJ_INT32* tiledp)
344 {
345 OPJ_INT32 i, j;
346 const OPJ_INT32* in_even = &tiledp[0];
347 const OPJ_INT32* in_odd = &tiledp[sn];
348
349 #ifdef TWO_PASS_VERSION
350 /* For documentation purpose: performs lifting in two iterations, */
351 /* but without explicit interleaving */
352
353 assert(len > 1);
354
355 /* Even */
356 tmp[0] = in_even[0] - ((in_odd[0] + 1) >> 1);
357 for (i = 2, j = 0; i <= len - 2; i += 2, j++) {
358 tmp[i] = in_even[j + 1] - ((in_odd[j] + in_odd[j + 1] + 2) >> 2);
359 }
360 if (len & 1) { /* if len is odd */
361 tmp[len - 1] = in_even[(len - 1) / 2] - ((in_odd[(len - 2) / 2] + 1) >> 1);
362 }
363
364 /* Odd */
365 for (i = 1, j = 0; i < len - 1; i += 2, j++) {
366 tmp[i] = in_odd[j] + ((tmp[i - 1] + tmp[i + 1]) >> 1);
367 }
368 if (!(len & 1)) { /* if len is even */
369 tmp[len - 1] = in_odd[(len - 1) / 2] + tmp[len - 2];
370 }
371 #else
372 OPJ_INT32 d1c, d1n, s1n, s0c, s0n;
373
374 assert(len > 1);
375
376 /* Improved version of the TWO_PASS_VERSION: */
377 /* Performs lifting in one single iteration. Saves memory */
378 /* accesses and explicit interleaving. */
379 s1n = in_even[0];
380 d1n = in_odd[0];
381 s0n = s1n - ((d1n + 1) >> 1);
382
383 for (i = 0, j = 1; i < (len - 3); i += 2, j++) {
384 d1c = d1n;
385 s0c = s0n;
386
387 s1n = in_even[j];
388 d1n = in_odd[j];
389
390 s0n = s1n - ((d1c + d1n + 2) >> 2);
391
392 tmp[i ] = s0c;
393 tmp[i + 1] = opj_int_add_no_overflow(d1c, opj_int_add_no_overflow(s0c,
394 s0n) >> 1);
395 }
396
397 tmp[i] = s0n;
398
399 if (len & 1) {
400 tmp[len - 1] = in_even[(len - 1) / 2] - ((d1n + 1) >> 1);
401 tmp[len - 2] = d1n + ((s0n + tmp[len - 1]) >> 1);
402 } else {
403 tmp[len - 1] = d1n + s0n;
404 }
405 #endif
406 memcpy(tiledp, tmp, (OPJ_UINT32)len * sizeof(OPJ_INT32));
407 }
408
opj_idwt53_h_cas1(OPJ_INT32 * tmp,const OPJ_INT32 sn,const OPJ_INT32 len,OPJ_INT32 * tiledp)409 static void opj_idwt53_h_cas1(OPJ_INT32* tmp,
410 const OPJ_INT32 sn,
411 const OPJ_INT32 len,
412 OPJ_INT32* tiledp)
413 {
414 OPJ_INT32 i, j;
415 const OPJ_INT32* in_even = &tiledp[sn];
416 const OPJ_INT32* in_odd = &tiledp[0];
417
418 #ifdef TWO_PASS_VERSION
419 /* For documentation purpose: performs lifting in two iterations, */
420 /* but without explicit interleaving */
421
422 assert(len > 2);
423
424 /* Odd */
425 for (i = 1, j = 0; i < len - 1; i += 2, j++) {
426 tmp[i] = in_odd[j] - ((in_even[j] + in_even[j + 1] + 2) >> 2);
427 }
428 if (!(len & 1)) {
429 tmp[len - 1] = in_odd[len / 2 - 1] - ((in_even[len / 2 - 1] + 1) >> 1);
430 }
431
432 /* Even */
433 tmp[0] = in_even[0] + tmp[1];
434 for (i = 2, j = 1; i < len - 1; i += 2, j++) {
435 tmp[i] = in_even[j] + ((tmp[i + 1] + tmp[i - 1]) >> 1);
436 }
437 if (len & 1) {
438 tmp[len - 1] = in_even[len / 2] + tmp[len - 2];
439 }
440 #else
441 OPJ_INT32 s1, s2, dc, dn;
442
443 assert(len > 2);
444
445 /* Improved version of the TWO_PASS_VERSION: */
446 /* Performs lifting in one single iteration. Saves memory */
447 /* accesses and explicit interleaving. */
448
449 s1 = in_even[1];
450 dc = in_odd[0] - ((in_even[0] + s1 + 2) >> 2);
451 tmp[0] = in_even[0] + dc;
452
453 for (i = 1, j = 1; i < (len - 2 - !(len & 1)); i += 2, j++) {
454
455 s2 = in_even[j + 1];
456
457 dn = in_odd[j] - ((s1 + s2 + 2) >> 2);
458 tmp[i ] = dc;
459 tmp[i + 1] = opj_int_add_no_overflow(s1, opj_int_add_no_overflow(dn, dc) >> 1);
460
461 dc = dn;
462 s1 = s2;
463 }
464
465 tmp[i] = dc;
466
467 if (!(len & 1)) {
468 dn = in_odd[len / 2 - 1] - ((s1 + 1) >> 1);
469 tmp[len - 2] = s1 + ((dn + dc) >> 1);
470 tmp[len - 1] = dn;
471 } else {
472 tmp[len - 1] = s1 + dc;
473 }
474 #endif
475 memcpy(tiledp, tmp, (OPJ_UINT32)len * sizeof(OPJ_INT32));
476 }
477
478
479 #endif /* !defined(STANDARD_SLOW_VERSION) */
480
481 /* <summary> */
482 /* Inverse 5-3 wavelet transform in 1-D for one row. */
483 /* </summary> */
484 /* Performs interleave, inverse wavelet transform and copy back to buffer */
opj_idwt53_h(const opj_dwt_t * dwt,OPJ_INT32 * tiledp)485 static void opj_idwt53_h(const opj_dwt_t *dwt,
486 OPJ_INT32* tiledp)
487 {
488 #ifdef STANDARD_SLOW_VERSION
489 /* For documentation purpose */
490 opj_dwt_interleave_h(dwt, tiledp);
491 opj_dwt_decode_1(dwt);
492 memcpy(tiledp, dwt->mem, (OPJ_UINT32)(dwt->sn + dwt->dn) * sizeof(OPJ_INT32));
493 #else
494 const OPJ_INT32 sn = dwt->sn;
495 const OPJ_INT32 len = sn + dwt->dn;
496 if (dwt->cas == 0) { /* Left-most sample is on even coordinate */
497 if (len > 1) {
498 opj_idwt53_h_cas0(dwt->mem, sn, len, tiledp);
499 } else {
500 /* Unmodified value */
501 }
502 } else { /* Left-most sample is on odd coordinate */
503 if (len == 1) {
504 tiledp[0] /= 2;
505 } else if (len == 2) {
506 OPJ_INT32* out = dwt->mem;
507 const OPJ_INT32* in_even = &tiledp[sn];
508 const OPJ_INT32* in_odd = &tiledp[0];
509 out[1] = in_odd[0] - ((in_even[0] + 1) >> 1);
510 out[0] = in_even[0] + out[1];
511 memcpy(tiledp, dwt->mem, (OPJ_UINT32)len * sizeof(OPJ_INT32));
512 } else if (len > 2) {
513 opj_idwt53_h_cas1(dwt->mem, sn, len, tiledp);
514 }
515 }
516 #endif
517 }
518
519 #if (defined(__SSE2__) || defined(__AVX2__)) && !defined(STANDARD_SLOW_VERSION)
520
521 /* Conveniency macros to improve the readability of the formulas */
522 #if __AVX2__
523 #define VREG __m256i
524 #define LOAD_CST(x) _mm256_set1_epi32(x)
525 #define LOAD(x) _mm256_load_si256((const VREG*)(x))
526 #define LOADU(x) _mm256_loadu_si256((const VREG*)(x))
527 #define STORE(x,y) _mm256_store_si256((VREG*)(x),(y))
528 #define STOREU(x,y) _mm256_storeu_si256((VREG*)(x),(y))
529 #define ADD(x,y) _mm256_add_epi32((x),(y))
530 #define SUB(x,y) _mm256_sub_epi32((x),(y))
531 #define SAR(x,y) _mm256_srai_epi32((x),(y))
532 #else
533 #define VREG __m128i
534 #define LOAD_CST(x) _mm_set1_epi32(x)
535 #define LOAD(x) _mm_load_si128((const VREG*)(x))
536 #define LOADU(x) _mm_loadu_si128((const VREG*)(x))
537 #define STORE(x,y) _mm_store_si128((VREG*)(x),(y))
538 #define STOREU(x,y) _mm_storeu_si128((VREG*)(x),(y))
539 #define ADD(x,y) _mm_add_epi32((x),(y))
540 #define SUB(x,y) _mm_sub_epi32((x),(y))
541 #define SAR(x,y) _mm_srai_epi32((x),(y))
542 #endif
543 #define ADD3(x,y,z) ADD(ADD(x,y),z)
544
545 static
opj_idwt53_v_final_memcpy(OPJ_INT32 * tiledp_col,const OPJ_INT32 * tmp,OPJ_INT32 len,OPJ_SIZE_T stride)546 void opj_idwt53_v_final_memcpy(OPJ_INT32* tiledp_col,
547 const OPJ_INT32* tmp,
548 OPJ_INT32 len,
549 OPJ_SIZE_T stride)
550 {
551 OPJ_INT32 i;
552 for (i = 0; i < len; ++i) {
553 /* A memcpy(&tiledp_col[i * stride + 0],
554 &tmp[PARALLEL_COLS_53 * i + 0],
555 PARALLEL_COLS_53 * sizeof(OPJ_INT32))
556 would do but would be a tiny bit slower.
557 We can take here advantage of our knowledge of alignment */
558 STOREU(&tiledp_col[(OPJ_SIZE_T)i * stride + 0],
559 LOAD(&tmp[PARALLEL_COLS_53 * i + 0]));
560 STOREU(&tiledp_col[(OPJ_SIZE_T)i * stride + VREG_INT_COUNT],
561 LOAD(&tmp[PARALLEL_COLS_53 * i + VREG_INT_COUNT]));
562 }
563 }
564
565 /** Vertical inverse 5x3 wavelet transform for 8 columns in SSE2, or
566 * 16 in AVX2, when top-most pixel is on even coordinate */
opj_idwt53_v_cas0_mcols_SSE2_OR_AVX2(OPJ_INT32 * tmp,const OPJ_INT32 sn,const OPJ_INT32 len,OPJ_INT32 * tiledp_col,const OPJ_SIZE_T stride)567 static void opj_idwt53_v_cas0_mcols_SSE2_OR_AVX2(
568 OPJ_INT32* tmp,
569 const OPJ_INT32 sn,
570 const OPJ_INT32 len,
571 OPJ_INT32* tiledp_col,
572 const OPJ_SIZE_T stride)
573 {
574 const OPJ_INT32* in_even = &tiledp_col[0];
575 const OPJ_INT32* in_odd = &tiledp_col[(OPJ_SIZE_T)sn * stride];
576
577 OPJ_INT32 i;
578 OPJ_SIZE_T j;
579 VREG d1c_0, d1n_0, s1n_0, s0c_0, s0n_0;
580 VREG d1c_1, d1n_1, s1n_1, s0c_1, s0n_1;
581 const VREG two = LOAD_CST(2);
582
583 assert(len > 1);
584 #if __AVX2__
585 assert(PARALLEL_COLS_53 == 16);
586 assert(VREG_INT_COUNT == 8);
587 #else
588 assert(PARALLEL_COLS_53 == 8);
589 assert(VREG_INT_COUNT == 4);
590 #endif
591
592 /* Note: loads of input even/odd values must be done in a unaligned */
593 /* fashion. But stores in tmp can be done with aligned store, since */
594 /* the temporary buffer is properly aligned */
595 assert((OPJ_SIZE_T)tmp % (sizeof(OPJ_INT32) * VREG_INT_COUNT) == 0);
596
597 s1n_0 = LOADU(in_even + 0);
598 s1n_1 = LOADU(in_even + VREG_INT_COUNT);
599 d1n_0 = LOADU(in_odd);
600 d1n_1 = LOADU(in_odd + VREG_INT_COUNT);
601
602 /* s0n = s1n - ((d1n + 1) >> 1); <==> */
603 /* s0n = s1n - ((d1n + d1n + 2) >> 2); */
604 s0n_0 = SUB(s1n_0, SAR(ADD3(d1n_0, d1n_0, two), 2));
605 s0n_1 = SUB(s1n_1, SAR(ADD3(d1n_1, d1n_1, two), 2));
606
607 for (i = 0, j = 1; i < (len - 3); i += 2, j++) {
608 d1c_0 = d1n_0;
609 s0c_0 = s0n_0;
610 d1c_1 = d1n_1;
611 s0c_1 = s0n_1;
612
613 s1n_0 = LOADU(in_even + j * stride);
614 s1n_1 = LOADU(in_even + j * stride + VREG_INT_COUNT);
615 d1n_0 = LOADU(in_odd + j * stride);
616 d1n_1 = LOADU(in_odd + j * stride + VREG_INT_COUNT);
617
618 /*s0n = s1n - ((d1c + d1n + 2) >> 2);*/
619 s0n_0 = SUB(s1n_0, SAR(ADD3(d1c_0, d1n_0, two), 2));
620 s0n_1 = SUB(s1n_1, SAR(ADD3(d1c_1, d1n_1, two), 2));
621
622 STORE(tmp + PARALLEL_COLS_53 * (i + 0), s0c_0);
623 STORE(tmp + PARALLEL_COLS_53 * (i + 0) + VREG_INT_COUNT, s0c_1);
624
625 /* d1c + ((s0c + s0n) >> 1) */
626 STORE(tmp + PARALLEL_COLS_53 * (i + 1) + 0,
627 ADD(d1c_0, SAR(ADD(s0c_0, s0n_0), 1)));
628 STORE(tmp + PARALLEL_COLS_53 * (i + 1) + VREG_INT_COUNT,
629 ADD(d1c_1, SAR(ADD(s0c_1, s0n_1), 1)));
630 }
631
632 STORE(tmp + PARALLEL_COLS_53 * (i + 0) + 0, s0n_0);
633 STORE(tmp + PARALLEL_COLS_53 * (i + 0) + VREG_INT_COUNT, s0n_1);
634
635 if (len & 1) {
636 VREG tmp_len_minus_1;
637 s1n_0 = LOADU(in_even + (OPJ_SIZE_T)((len - 1) / 2) * stride);
638 /* tmp_len_minus_1 = s1n - ((d1n + 1) >> 1); */
639 tmp_len_minus_1 = SUB(s1n_0, SAR(ADD3(d1n_0, d1n_0, two), 2));
640 STORE(tmp + PARALLEL_COLS_53 * (len - 1), tmp_len_minus_1);
641 /* d1n + ((s0n + tmp_len_minus_1) >> 1) */
642 STORE(tmp + PARALLEL_COLS_53 * (len - 2),
643 ADD(d1n_0, SAR(ADD(s0n_0, tmp_len_minus_1), 1)));
644
645 s1n_1 = LOADU(in_even + (OPJ_SIZE_T)((len - 1) / 2) * stride + VREG_INT_COUNT);
646 /* tmp_len_minus_1 = s1n - ((d1n + 1) >> 1); */
647 tmp_len_minus_1 = SUB(s1n_1, SAR(ADD3(d1n_1, d1n_1, two), 2));
648 STORE(tmp + PARALLEL_COLS_53 * (len - 1) + VREG_INT_COUNT,
649 tmp_len_minus_1);
650 /* d1n + ((s0n + tmp_len_minus_1) >> 1) */
651 STORE(tmp + PARALLEL_COLS_53 * (len - 2) + VREG_INT_COUNT,
652 ADD(d1n_1, SAR(ADD(s0n_1, tmp_len_minus_1), 1)));
653
654
655 } else {
656 STORE(tmp + PARALLEL_COLS_53 * (len - 1) + 0,
657 ADD(d1n_0, s0n_0));
658 STORE(tmp + PARALLEL_COLS_53 * (len - 1) + VREG_INT_COUNT,
659 ADD(d1n_1, s0n_1));
660 }
661
662 opj_idwt53_v_final_memcpy(tiledp_col, tmp, len, stride);
663 }
664
665
666 /** Vertical inverse 5x3 wavelet transform for 8 columns in SSE2, or
667 * 16 in AVX2, when top-most pixel is on odd coordinate */
opj_idwt53_v_cas1_mcols_SSE2_OR_AVX2(OPJ_INT32 * tmp,const OPJ_INT32 sn,const OPJ_INT32 len,OPJ_INT32 * tiledp_col,const OPJ_SIZE_T stride)668 static void opj_idwt53_v_cas1_mcols_SSE2_OR_AVX2(
669 OPJ_INT32* tmp,
670 const OPJ_INT32 sn,
671 const OPJ_INT32 len,
672 OPJ_INT32* tiledp_col,
673 const OPJ_SIZE_T stride)
674 {
675 OPJ_INT32 i;
676 OPJ_SIZE_T j;
677
678 VREG s1_0, s2_0, dc_0, dn_0;
679 VREG s1_1, s2_1, dc_1, dn_1;
680 const VREG two = LOAD_CST(2);
681
682 const OPJ_INT32* in_even = &tiledp_col[(OPJ_SIZE_T)sn * stride];
683 const OPJ_INT32* in_odd = &tiledp_col[0];
684
685 assert(len > 2);
686 #if __AVX2__
687 assert(PARALLEL_COLS_53 == 16);
688 assert(VREG_INT_COUNT == 8);
689 #else
690 assert(PARALLEL_COLS_53 == 8);
691 assert(VREG_INT_COUNT == 4);
692 #endif
693
694 /* Note: loads of input even/odd values must be done in a unaligned */
695 /* fashion. But stores in tmp can be done with aligned store, since */
696 /* the temporary buffer is properly aligned */
697 assert((OPJ_SIZE_T)tmp % (sizeof(OPJ_INT32) * VREG_INT_COUNT) == 0);
698
699 s1_0 = LOADU(in_even + stride);
700 /* in_odd[0] - ((in_even[0] + s1 + 2) >> 2); */
701 dc_0 = SUB(LOADU(in_odd + 0),
702 SAR(ADD3(LOADU(in_even + 0), s1_0, two), 2));
703 STORE(tmp + PARALLEL_COLS_53 * 0, ADD(LOADU(in_even + 0), dc_0));
704
705 s1_1 = LOADU(in_even + stride + VREG_INT_COUNT);
706 /* in_odd[0] - ((in_even[0] + s1 + 2) >> 2); */
707 dc_1 = SUB(LOADU(in_odd + VREG_INT_COUNT),
708 SAR(ADD3(LOADU(in_even + VREG_INT_COUNT), s1_1, two), 2));
709 STORE(tmp + PARALLEL_COLS_53 * 0 + VREG_INT_COUNT,
710 ADD(LOADU(in_even + VREG_INT_COUNT), dc_1));
711
712 for (i = 1, j = 1; i < (len - 2 - !(len & 1)); i += 2, j++) {
713
714 s2_0 = LOADU(in_even + (j + 1) * stride);
715 s2_1 = LOADU(in_even + (j + 1) * stride + VREG_INT_COUNT);
716
717 /* dn = in_odd[j * stride] - ((s1 + s2 + 2) >> 2); */
718 dn_0 = SUB(LOADU(in_odd + j * stride),
719 SAR(ADD3(s1_0, s2_0, two), 2));
720 dn_1 = SUB(LOADU(in_odd + j * stride + VREG_INT_COUNT),
721 SAR(ADD3(s1_1, s2_1, two), 2));
722
723 STORE(tmp + PARALLEL_COLS_53 * i, dc_0);
724 STORE(tmp + PARALLEL_COLS_53 * i + VREG_INT_COUNT, dc_1);
725
726 /* tmp[i + 1] = s1 + ((dn + dc) >> 1); */
727 STORE(tmp + PARALLEL_COLS_53 * (i + 1) + 0,
728 ADD(s1_0, SAR(ADD(dn_0, dc_0), 1)));
729 STORE(tmp + PARALLEL_COLS_53 * (i + 1) + VREG_INT_COUNT,
730 ADD(s1_1, SAR(ADD(dn_1, dc_1), 1)));
731
732 dc_0 = dn_0;
733 s1_0 = s2_0;
734 dc_1 = dn_1;
735 s1_1 = s2_1;
736 }
737 STORE(tmp + PARALLEL_COLS_53 * i, dc_0);
738 STORE(tmp + PARALLEL_COLS_53 * i + VREG_INT_COUNT, dc_1);
739
740 if (!(len & 1)) {
741 /*dn = in_odd[(len / 2 - 1) * stride] - ((s1 + 1) >> 1); */
742 dn_0 = SUB(LOADU(in_odd + (OPJ_SIZE_T)(len / 2 - 1) * stride),
743 SAR(ADD3(s1_0, s1_0, two), 2));
744 dn_1 = SUB(LOADU(in_odd + (OPJ_SIZE_T)(len / 2 - 1) * stride + VREG_INT_COUNT),
745 SAR(ADD3(s1_1, s1_1, two), 2));
746
747 /* tmp[len - 2] = s1 + ((dn + dc) >> 1); */
748 STORE(tmp + PARALLEL_COLS_53 * (len - 2) + 0,
749 ADD(s1_0, SAR(ADD(dn_0, dc_0), 1)));
750 STORE(tmp + PARALLEL_COLS_53 * (len - 2) + VREG_INT_COUNT,
751 ADD(s1_1, SAR(ADD(dn_1, dc_1), 1)));
752
753 STORE(tmp + PARALLEL_COLS_53 * (len - 1) + 0, dn_0);
754 STORE(tmp + PARALLEL_COLS_53 * (len - 1) + VREG_INT_COUNT, dn_1);
755 } else {
756 STORE(tmp + PARALLEL_COLS_53 * (len - 1) + 0, ADD(s1_0, dc_0));
757 STORE(tmp + PARALLEL_COLS_53 * (len - 1) + VREG_INT_COUNT,
758 ADD(s1_1, dc_1));
759 }
760
761 opj_idwt53_v_final_memcpy(tiledp_col, tmp, len, stride);
762 }
763
764 #undef VREG
765 #undef LOAD_CST
766 #undef LOADU
767 #undef LOAD
768 #undef STORE
769 #undef STOREU
770 #undef ADD
771 #undef ADD3
772 #undef SUB
773 #undef SAR
774
775 #endif /* (defined(__SSE2__) || defined(__AVX2__)) && !defined(STANDARD_SLOW_VERSION) */
776
777 #if !defined(STANDARD_SLOW_VERSION)
778 /** Vertical inverse 5x3 wavelet transform for one column, when top-most
779 * pixel is on even coordinate */
opj_idwt3_v_cas0(OPJ_INT32 * tmp,const OPJ_INT32 sn,const OPJ_INT32 len,OPJ_INT32 * tiledp_col,const OPJ_SIZE_T stride)780 static void opj_idwt3_v_cas0(OPJ_INT32* tmp,
781 const OPJ_INT32 sn,
782 const OPJ_INT32 len,
783 OPJ_INT32* tiledp_col,
784 const OPJ_SIZE_T stride)
785 {
786 OPJ_INT32 i, j;
787 OPJ_INT32 d1c, d1n, s1n, s0c, s0n;
788
789 assert(len > 1);
790
791 /* Performs lifting in one single iteration. Saves memory */
792 /* accesses and explicit interleaving. */
793
794 s1n = tiledp_col[0];
795 d1n = tiledp_col[(OPJ_SIZE_T)sn * stride];
796 s0n = s1n - ((d1n + 1) >> 1);
797
798 for (i = 0, j = 0; i < (len - 3); i += 2, j++) {
799 d1c = d1n;
800 s0c = s0n;
801
802 s1n = tiledp_col[(OPJ_SIZE_T)(j + 1) * stride];
803 d1n = tiledp_col[(OPJ_SIZE_T)(sn + j + 1) * stride];
804
805 s0n = opj_int_sub_no_overflow(s1n,
806 opj_int_add_no_overflow(opj_int_add_no_overflow(d1c, d1n), 2) >> 2);
807
808 tmp[i ] = s0c;
809 tmp[i + 1] = opj_int_add_no_overflow(d1c, opj_int_add_no_overflow(s0c,
810 s0n) >> 1);
811 }
812
813 tmp[i] = s0n;
814
815 if (len & 1) {
816 tmp[len - 1] =
817 tiledp_col[(OPJ_SIZE_T)((len - 1) / 2) * stride] -
818 ((d1n + 1) >> 1);
819 tmp[len - 2] = d1n + ((s0n + tmp[len - 1]) >> 1);
820 } else {
821 tmp[len - 1] = d1n + s0n;
822 }
823
824 for (i = 0; i < len; ++i) {
825 tiledp_col[(OPJ_SIZE_T)i * stride] = tmp[i];
826 }
827 }
828
829
830 /** Vertical inverse 5x3 wavelet transform for one column, when top-most
831 * pixel is on odd coordinate */
opj_idwt3_v_cas1(OPJ_INT32 * tmp,const OPJ_INT32 sn,const OPJ_INT32 len,OPJ_INT32 * tiledp_col,const OPJ_SIZE_T stride)832 static void opj_idwt3_v_cas1(OPJ_INT32* tmp,
833 const OPJ_INT32 sn,
834 const OPJ_INT32 len,
835 OPJ_INT32* tiledp_col,
836 const OPJ_SIZE_T stride)
837 {
838 OPJ_INT32 i, j;
839 OPJ_INT32 s1, s2, dc, dn;
840 const OPJ_INT32* in_even = &tiledp_col[(OPJ_SIZE_T)sn * stride];
841 const OPJ_INT32* in_odd = &tiledp_col[0];
842
843 assert(len > 2);
844
845 /* Performs lifting in one single iteration. Saves memory */
846 /* accesses and explicit interleaving. */
847
848 s1 = in_even[stride];
849 dc = in_odd[0] - ((in_even[0] + s1 + 2) >> 2);
850 tmp[0] = in_even[0] + dc;
851 for (i = 1, j = 1; i < (len - 2 - !(len & 1)); i += 2, j++) {
852
853 s2 = in_even[(OPJ_SIZE_T)(j + 1) * stride];
854
855 dn = in_odd[(OPJ_SIZE_T)j * stride] - ((s1 + s2 + 2) >> 2);
856 tmp[i ] = dc;
857 tmp[i + 1] = s1 + ((dn + dc) >> 1);
858
859 dc = dn;
860 s1 = s2;
861 }
862 tmp[i] = dc;
863 if (!(len & 1)) {
864 dn = in_odd[(OPJ_SIZE_T)(len / 2 - 1) * stride] - ((s1 + 1) >> 1);
865 tmp[len - 2] = s1 + ((dn + dc) >> 1);
866 tmp[len - 1] = dn;
867 } else {
868 tmp[len - 1] = s1 + dc;
869 }
870
871 for (i = 0; i < len; ++i) {
872 tiledp_col[(OPJ_SIZE_T)i * stride] = tmp[i];
873 }
874 }
875 #endif /* !defined(STANDARD_SLOW_VERSION) */
876
877 /* <summary> */
878 /* Inverse vertical 5-3 wavelet transform in 1-D for several columns. */
879 /* </summary> */
880 /* Performs interleave, inverse wavelet transform and copy back to buffer */
opj_idwt53_v(const opj_dwt_t * dwt,OPJ_INT32 * tiledp_col,OPJ_SIZE_T stride,OPJ_INT32 nb_cols)881 static void opj_idwt53_v(const opj_dwt_t *dwt,
882 OPJ_INT32* tiledp_col,
883 OPJ_SIZE_T stride,
884 OPJ_INT32 nb_cols)
885 {
886 #ifdef STANDARD_SLOW_VERSION
887 /* For documentation purpose */
888 OPJ_INT32 k, c;
889 for (c = 0; c < nb_cols; c ++) {
890 opj_dwt_interleave_v(dwt, tiledp_col + c, stride);
891 opj_dwt_decode_1(dwt);
892 for (k = 0; k < dwt->sn + dwt->dn; ++k) {
893 tiledp_col[c + k * stride] = dwt->mem[k];
894 }
895 }
896 #else
897 const OPJ_INT32 sn = dwt->sn;
898 const OPJ_INT32 len = sn + dwt->dn;
899 if (dwt->cas == 0) {
900 /* If len == 1, unmodified value */
901
902 #if (defined(__SSE2__) || defined(__AVX2__))
903 if (len > 1 && nb_cols == PARALLEL_COLS_53) {
904 /* Same as below general case, except that thanks to SSE2/AVX2 */
905 /* we can efficiently process 8/16 columns in parallel */
906 opj_idwt53_v_cas0_mcols_SSE2_OR_AVX2(dwt->mem, sn, len, tiledp_col, stride);
907 return;
908 }
909 #endif
910 if (len > 1) {
911 OPJ_INT32 c;
912 for (c = 0; c < nb_cols; c++, tiledp_col++) {
913 opj_idwt3_v_cas0(dwt->mem, sn, len, tiledp_col, stride);
914 }
915 return;
916 }
917 } else {
918 if (len == 1) {
919 OPJ_INT32 c;
920 for (c = 0; c < nb_cols; c++, tiledp_col++) {
921 tiledp_col[0] /= 2;
922 }
923 return;
924 }
925
926 if (len == 2) {
927 OPJ_INT32 c;
928 OPJ_INT32* out = dwt->mem;
929 for (c = 0; c < nb_cols; c++, tiledp_col++) {
930 OPJ_INT32 i;
931 const OPJ_INT32* in_even = &tiledp_col[(OPJ_SIZE_T)sn * stride];
932 const OPJ_INT32* in_odd = &tiledp_col[0];
933
934 out[1] = in_odd[0] - ((in_even[0] + 1) >> 1);
935 out[0] = in_even[0] + out[1];
936
937 for (i = 0; i < len; ++i) {
938 tiledp_col[(OPJ_SIZE_T)i * stride] = out[i];
939 }
940 }
941
942 return;
943 }
944
945 #if (defined(__SSE2__) || defined(__AVX2__))
946 if (len > 2 && nb_cols == PARALLEL_COLS_53) {
947 /* Same as below general case, except that thanks to SSE2/AVX2 */
948 /* we can efficiently process 8/16 columns in parallel */
949 opj_idwt53_v_cas1_mcols_SSE2_OR_AVX2(dwt->mem, sn, len, tiledp_col, stride);
950 return;
951 }
952 #endif
953 if (len > 2) {
954 OPJ_INT32 c;
955 for (c = 0; c < nb_cols; c++, tiledp_col++) {
956 opj_idwt3_v_cas1(dwt->mem, sn, len, tiledp_col, stride);
957 }
958 return;
959 }
960 }
961 #endif
962 }
963
964 #if 0
965 static void opj_dwt_encode_step1(OPJ_FLOAT32* fw,
966 OPJ_UINT32 end,
967 const OPJ_FLOAT32 c)
968 {
969 OPJ_UINT32 i = 0;
970 for (; i < end; ++i) {
971 fw[0] *= c;
972 fw += 2;
973 }
974 }
975 #else
opj_dwt_encode_step1_combined(OPJ_FLOAT32 * fw,OPJ_UINT32 iters_c1,OPJ_UINT32 iters_c2,const OPJ_FLOAT32 c1,const OPJ_FLOAT32 c2)976 static void opj_dwt_encode_step1_combined(OPJ_FLOAT32* fw,
977 OPJ_UINT32 iters_c1,
978 OPJ_UINT32 iters_c2,
979 const OPJ_FLOAT32 c1,
980 const OPJ_FLOAT32 c2)
981 {
982 OPJ_UINT32 i = 0;
983 const OPJ_UINT32 iters_common = opj_uint_min(iters_c1, iters_c2);
984 assert((((OPJ_SIZE_T)fw) & 0xf) == 0);
985 assert(opj_int_abs((OPJ_INT32)iters_c1 - (OPJ_INT32)iters_c2) <= 1);
986 for (; i + 3 < iters_common; i += 4) {
987 #ifdef __SSE__
988 const __m128 vcst = _mm_set_ps(c2, c1, c2, c1);
989 *(__m128*)fw = _mm_mul_ps(*(__m128*)fw, vcst);
990 *(__m128*)(fw + 4) = _mm_mul_ps(*(__m128*)(fw + 4), vcst);
991 #else
992 fw[0] *= c1;
993 fw[1] *= c2;
994 fw[2] *= c1;
995 fw[3] *= c2;
996 fw[4] *= c1;
997 fw[5] *= c2;
998 fw[6] *= c1;
999 fw[7] *= c2;
1000 #endif
1001 fw += 8;
1002 }
1003 for (; i < iters_common; i++) {
1004 fw[0] *= c1;
1005 fw[1] *= c2;
1006 fw += 2;
1007 }
1008 if (i < iters_c1) {
1009 fw[0] *= c1;
1010 } else if (i < iters_c2) {
1011 fw[1] *= c2;
1012 }
1013 }
1014
1015 #endif
1016
opj_dwt_encode_step2(OPJ_FLOAT32 * fl,OPJ_FLOAT32 * fw,OPJ_UINT32 end,OPJ_UINT32 m,OPJ_FLOAT32 c)1017 static void opj_dwt_encode_step2(OPJ_FLOAT32* fl, OPJ_FLOAT32* fw,
1018 OPJ_UINT32 end,
1019 OPJ_UINT32 m,
1020 OPJ_FLOAT32 c)
1021 {
1022 OPJ_UINT32 i;
1023 OPJ_UINT32 imax = opj_uint_min(end, m);
1024 if (imax > 0) {
1025 fw[-1] += (fl[0] + fw[0]) * c;
1026 fw += 2;
1027 i = 1;
1028 for (; i + 3 < imax; i += 4) {
1029 fw[-1] += (fw[-2] + fw[0]) * c;
1030 fw[1] += (fw[0] + fw[2]) * c;
1031 fw[3] += (fw[2] + fw[4]) * c;
1032 fw[5] += (fw[4] + fw[6]) * c;
1033 fw += 8;
1034 }
1035 for (; i < imax; ++i) {
1036 fw[-1] += (fw[-2] + fw[0]) * c;
1037 fw += 2;
1038 }
1039 }
1040 if (m < end) {
1041 assert(m + 1 == end);
1042 fw[-1] += (2 * fw[-2]) * c;
1043 }
1044 }
1045
opj_dwt_encode_1_real(void * aIn,OPJ_INT32 dn,OPJ_INT32 sn,OPJ_INT32 cas)1046 static void opj_dwt_encode_1_real(void *aIn, OPJ_INT32 dn, OPJ_INT32 sn,
1047 OPJ_INT32 cas)
1048 {
1049 OPJ_FLOAT32* w = (OPJ_FLOAT32*)aIn;
1050 OPJ_INT32 a, b;
1051 assert(dn + sn > 1);
1052 if (cas == 0) {
1053 a = 0;
1054 b = 1;
1055 } else {
1056 a = 1;
1057 b = 0;
1058 }
1059 opj_dwt_encode_step2(w + a, w + b + 1,
1060 (OPJ_UINT32)dn,
1061 (OPJ_UINT32)opj_int_min(dn, sn - b),
1062 opj_dwt_alpha);
1063 opj_dwt_encode_step2(w + b, w + a + 1,
1064 (OPJ_UINT32)sn,
1065 (OPJ_UINT32)opj_int_min(sn, dn - a),
1066 opj_dwt_beta);
1067 opj_dwt_encode_step2(w + a, w + b + 1,
1068 (OPJ_UINT32)dn,
1069 (OPJ_UINT32)opj_int_min(dn, sn - b),
1070 opj_dwt_gamma);
1071 opj_dwt_encode_step2(w + b, w + a + 1,
1072 (OPJ_UINT32)sn,
1073 (OPJ_UINT32)opj_int_min(sn, dn - a),
1074 opj_dwt_delta);
1075 #if 0
1076 opj_dwt_encode_step1(w + b, (OPJ_UINT32)dn,
1077 opj_K);
1078 opj_dwt_encode_step1(w + a, (OPJ_UINT32)sn,
1079 opj_invK);
1080 #else
1081 if (a == 0) {
1082 opj_dwt_encode_step1_combined(w,
1083 (OPJ_UINT32)sn,
1084 (OPJ_UINT32)dn,
1085 opj_invK,
1086 opj_K);
1087 } else {
1088 opj_dwt_encode_step1_combined(w,
1089 (OPJ_UINT32)dn,
1090 (OPJ_UINT32)sn,
1091 opj_K,
1092 opj_invK);
1093 }
1094 #endif
1095 }
1096
opj_dwt_encode_stepsize(OPJ_INT32 stepsize,OPJ_INT32 numbps,opj_stepsize_t * bandno_stepsize)1097 static void opj_dwt_encode_stepsize(OPJ_INT32 stepsize, OPJ_INT32 numbps,
1098 opj_stepsize_t *bandno_stepsize)
1099 {
1100 OPJ_INT32 p, n;
1101 p = opj_int_floorlog2(stepsize) - 13;
1102 n = 11 - opj_int_floorlog2(stepsize);
1103 bandno_stepsize->mant = (n < 0 ? stepsize >> -n : stepsize << n) & 0x7ff;
1104 bandno_stepsize->expn = numbps - p;
1105 }
1106
1107 /*
1108 ==========================================================
1109 DWT interface
1110 ==========================================================
1111 */
1112
1113 /** Process one line for the horizontal pass of the 5x3 forward transform */
1114 static
opj_dwt_encode_and_deinterleave_h_one_row(void * rowIn,void * tmpIn,OPJ_UINT32 width,OPJ_BOOL even)1115 void opj_dwt_encode_and_deinterleave_h_one_row(void* rowIn,
1116 void* tmpIn,
1117 OPJ_UINT32 width,
1118 OPJ_BOOL even)
1119 {
1120 OPJ_INT32* OPJ_RESTRICT row = (OPJ_INT32*)rowIn;
1121 OPJ_INT32* OPJ_RESTRICT tmp = (OPJ_INT32*)tmpIn;
1122 const OPJ_INT32 sn = (OPJ_INT32)((width + (even ? 1 : 0)) >> 1);
1123 const OPJ_INT32 dn = (OPJ_INT32)(width - (OPJ_UINT32)sn);
1124
1125 if (even) {
1126 if (width > 1) {
1127 OPJ_INT32 i;
1128 for (i = 0; i < sn - 1; i++) {
1129 tmp[sn + i] = row[2 * i + 1] - ((row[(i) * 2] + row[(i + 1) * 2]) >> 1);
1130 }
1131 if ((width % 2) == 0) {
1132 tmp[sn + i] = row[2 * i + 1] - row[(i) * 2];
1133 }
1134 row[0] += (tmp[sn] + tmp[sn] + 2) >> 2;
1135 for (i = 1; i < dn; i++) {
1136 row[i] = row[2 * i] + ((tmp[sn + (i - 1)] + tmp[sn + i] + 2) >> 2);
1137 }
1138 if ((width % 2) == 1) {
1139 row[i] = row[2 * i] + ((tmp[sn + (i - 1)] + tmp[sn + (i - 1)] + 2) >> 2);
1140 }
1141 memcpy(row + sn, tmp + sn, (OPJ_SIZE_T)dn * sizeof(OPJ_INT32));
1142 }
1143 } else {
1144 if (width == 1) {
1145 row[0] *= 2;
1146 } else {
1147 OPJ_INT32 i;
1148 tmp[sn + 0] = row[0] - row[1];
1149 for (i = 1; i < sn; i++) {
1150 tmp[sn + i] = row[2 * i] - ((row[2 * i + 1] + row[2 * (i - 1) + 1]) >> 1);
1151 }
1152 if ((width % 2) == 1) {
1153 tmp[sn + i] = row[2 * i] - row[2 * (i - 1) + 1];
1154 }
1155
1156 for (i = 0; i < dn - 1; i++) {
1157 row[i] = row[2 * i + 1] + ((tmp[sn + i] + tmp[sn + i + 1] + 2) >> 2);
1158 }
1159 if ((width % 2) == 0) {
1160 row[i] = row[2 * i + 1] + ((tmp[sn + i] + tmp[sn + i] + 2) >> 2);
1161 }
1162 memcpy(row + sn, tmp + sn, (OPJ_SIZE_T)dn * sizeof(OPJ_INT32));
1163 }
1164 }
1165 }
1166
1167 /** Process one line for the horizontal pass of the 9x7 forward transform */
1168 static
opj_dwt_encode_and_deinterleave_h_one_row_real(void * rowIn,void * tmpIn,OPJ_UINT32 width,OPJ_BOOL even)1169 void opj_dwt_encode_and_deinterleave_h_one_row_real(void* rowIn,
1170 void* tmpIn,
1171 OPJ_UINT32 width,
1172 OPJ_BOOL even)
1173 {
1174 OPJ_FLOAT32* OPJ_RESTRICT row = (OPJ_FLOAT32*)rowIn;
1175 OPJ_FLOAT32* OPJ_RESTRICT tmp = (OPJ_FLOAT32*)tmpIn;
1176 const OPJ_INT32 sn = (OPJ_INT32)((width + (even ? 1 : 0)) >> 1);
1177 const OPJ_INT32 dn = (OPJ_INT32)(width - (OPJ_UINT32)sn);
1178 if (width == 1) {
1179 return;
1180 }
1181 memcpy(tmp, row, width * sizeof(OPJ_FLOAT32));
1182 opj_dwt_encode_1_real(tmp, dn, sn, even ? 0 : 1);
1183 opj_dwt_deinterleave_h((OPJ_INT32 * OPJ_RESTRICT)tmp,
1184 (OPJ_INT32 * OPJ_RESTRICT)row,
1185 dn, sn, even ? 0 : 1);
1186 }
1187
1188 typedef struct {
1189 opj_dwt_t h;
1190 OPJ_UINT32 rw; /* Width of the resolution to process */
1191 OPJ_UINT32 w; /* Width of tiledp */
1192 OPJ_INT32 * OPJ_RESTRICT tiledp;
1193 OPJ_UINT32 min_j;
1194 OPJ_UINT32 max_j;
1195 opj_encode_and_deinterleave_h_one_row_fnptr_type p_function;
1196 } opj_dwt_encode_h_job_t;
1197
opj_dwt_encode_h_func(void * user_data,opj_tls_t * tls)1198 static void opj_dwt_encode_h_func(void* user_data, opj_tls_t* tls)
1199 {
1200 OPJ_UINT32 j;
1201 opj_dwt_encode_h_job_t* job;
1202 (void)tls;
1203
1204 job = (opj_dwt_encode_h_job_t*)user_data;
1205 for (j = job->min_j; j < job->max_j; j++) {
1206 OPJ_INT32* OPJ_RESTRICT aj = job->tiledp + j * job->w;
1207 (*job->p_function)(aj, job->h.mem, job->rw,
1208 job->h.cas == 0 ? OPJ_TRUE : OPJ_FALSE);
1209 }
1210
1211 opj_aligned_free(job->h.mem);
1212 opj_free(job);
1213 }
1214
1215 typedef struct {
1216 opj_dwt_t v;
1217 OPJ_UINT32 rh;
1218 OPJ_UINT32 w;
1219 OPJ_INT32 * OPJ_RESTRICT tiledp;
1220 OPJ_UINT32 min_j;
1221 OPJ_UINT32 max_j;
1222 opj_encode_and_deinterleave_v_fnptr_type p_encode_and_deinterleave_v;
1223 } opj_dwt_encode_v_job_t;
1224
opj_dwt_encode_v_func(void * user_data,opj_tls_t * tls)1225 static void opj_dwt_encode_v_func(void* user_data, opj_tls_t* tls)
1226 {
1227 OPJ_UINT32 j;
1228 opj_dwt_encode_v_job_t* job;
1229 (void)tls;
1230
1231 job = (opj_dwt_encode_v_job_t*)user_data;
1232 for (j = job->min_j; j + NB_ELTS_V8 - 1 < job->max_j; j += NB_ELTS_V8) {
1233 (*job->p_encode_and_deinterleave_v)(job->tiledp + j,
1234 job->v.mem,
1235 job->rh,
1236 job->v.cas == 0,
1237 job->w,
1238 NB_ELTS_V8);
1239 }
1240 if (j < job->max_j) {
1241 (*job->p_encode_and_deinterleave_v)(job->tiledp + j,
1242 job->v.mem,
1243 job->rh,
1244 job->v.cas == 0,
1245 job->w,
1246 job->max_j - j);
1247 }
1248
1249 opj_aligned_free(job->v.mem);
1250 opj_free(job);
1251 }
1252
1253 /** Fetch up to cols <= NB_ELTS_V8 for each line, and put them in tmpOut */
1254 /* that has a NB_ELTS_V8 interleave factor. */
opj_dwt_fetch_cols_vertical_pass(const void * arrayIn,void * tmpOut,OPJ_UINT32 height,OPJ_UINT32 stride_width,OPJ_UINT32 cols)1255 static void opj_dwt_fetch_cols_vertical_pass(const void *arrayIn,
1256 void *tmpOut,
1257 OPJ_UINT32 height,
1258 OPJ_UINT32 stride_width,
1259 OPJ_UINT32 cols)
1260 {
1261 const OPJ_INT32* OPJ_RESTRICT array = (const OPJ_INT32 * OPJ_RESTRICT)arrayIn;
1262 OPJ_INT32* OPJ_RESTRICT tmp = (OPJ_INT32 * OPJ_RESTRICT)tmpOut;
1263 if (cols == NB_ELTS_V8) {
1264 OPJ_UINT32 k;
1265 for (k = 0; k < height; ++k) {
1266 memcpy(tmp + NB_ELTS_V8 * k,
1267 array + k * stride_width,
1268 NB_ELTS_V8 * sizeof(OPJ_INT32));
1269 }
1270 } else {
1271 OPJ_UINT32 k;
1272 for (k = 0; k < height; ++k) {
1273 OPJ_UINT32 c;
1274 for (c = 0; c < cols; c++) {
1275 tmp[NB_ELTS_V8 * k + c] = array[c + k * stride_width];
1276 }
1277 for (; c < NB_ELTS_V8; c++) {
1278 tmp[NB_ELTS_V8 * k + c] = 0;
1279 }
1280 }
1281 }
1282 }
1283
1284 /* Deinterleave result of forward transform, where cols <= NB_ELTS_V8 */
1285 /* and src contains NB_ELTS_V8 consecutive values for up to NB_ELTS_V8 */
1286 /* columns. */
opj_dwt_deinterleave_v_cols(const OPJ_INT32 * OPJ_RESTRICT src,OPJ_INT32 * OPJ_RESTRICT dst,OPJ_INT32 dn,OPJ_INT32 sn,OPJ_UINT32 stride_width,OPJ_INT32 cas,OPJ_UINT32 cols)1287 static INLINE void opj_dwt_deinterleave_v_cols(
1288 const OPJ_INT32 * OPJ_RESTRICT src,
1289 OPJ_INT32 * OPJ_RESTRICT dst,
1290 OPJ_INT32 dn,
1291 OPJ_INT32 sn,
1292 OPJ_UINT32 stride_width,
1293 OPJ_INT32 cas,
1294 OPJ_UINT32 cols)
1295 {
1296 OPJ_INT32 k;
1297 OPJ_INT32 i = sn;
1298 OPJ_INT32 * OPJ_RESTRICT l_dest = dst;
1299 const OPJ_INT32 * OPJ_RESTRICT l_src = src + cas * NB_ELTS_V8;
1300 OPJ_UINT32 c;
1301
1302 for (k = 0; k < 2; k++) {
1303 while (i--) {
1304 if (cols == NB_ELTS_V8) {
1305 memcpy(l_dest, l_src, NB_ELTS_V8 * sizeof(OPJ_INT32));
1306 } else {
1307 c = 0;
1308 switch (cols) {
1309 case 7:
1310 l_dest[c] = l_src[c];
1311 c++; /* fallthru */
1312 case 6:
1313 l_dest[c] = l_src[c];
1314 c++; /* fallthru */
1315 case 5:
1316 l_dest[c] = l_src[c];
1317 c++; /* fallthru */
1318 case 4:
1319 l_dest[c] = l_src[c];
1320 c++; /* fallthru */
1321 case 3:
1322 l_dest[c] = l_src[c];
1323 c++; /* fallthru */
1324 case 2:
1325 l_dest[c] = l_src[c];
1326 c++; /* fallthru */
1327 default:
1328 l_dest[c] = l_src[c];
1329 break;
1330 }
1331 }
1332 l_dest += stride_width;
1333 l_src += 2 * NB_ELTS_V8;
1334 }
1335
1336 l_dest = dst + (OPJ_SIZE_T)sn * (OPJ_SIZE_T)stride_width;
1337 l_src = src + (1 - cas) * NB_ELTS_V8;
1338 i = dn;
1339 }
1340 }
1341
1342
1343 /* Forward 5-3 transform, for the vertical pass, processing cols columns */
1344 /* where cols <= NB_ELTS_V8 */
opj_dwt_encode_and_deinterleave_v(void * arrayIn,void * tmpIn,OPJ_UINT32 height,OPJ_BOOL even,OPJ_UINT32 stride_width,OPJ_UINT32 cols)1345 static void opj_dwt_encode_and_deinterleave_v(
1346 void *arrayIn,
1347 void *tmpIn,
1348 OPJ_UINT32 height,
1349 OPJ_BOOL even,
1350 OPJ_UINT32 stride_width,
1351 OPJ_UINT32 cols)
1352 {
1353 OPJ_INT32* OPJ_RESTRICT array = (OPJ_INT32 * OPJ_RESTRICT)arrayIn;
1354 OPJ_INT32* OPJ_RESTRICT tmp = (OPJ_INT32 * OPJ_RESTRICT)tmpIn;
1355 const OPJ_UINT32 sn = (height + (even ? 1 : 0)) >> 1;
1356 const OPJ_UINT32 dn = height - sn;
1357
1358 opj_dwt_fetch_cols_vertical_pass(arrayIn, tmpIn, height, stride_width, cols);
1359
1360 #define OPJ_Sc(i) tmp[(i)*2* NB_ELTS_V8 + c]
1361 #define OPJ_Dc(i) tmp[((1+(i)*2))* NB_ELTS_V8 + c]
1362
1363 #ifdef __SSE2__
1364 if (height == 1) {
1365 if (!even) {
1366 OPJ_UINT32 c;
1367 for (c = 0; c < NB_ELTS_V8; c++) {
1368 tmp[c] *= 2;
1369 }
1370 }
1371 } else if (even) {
1372 OPJ_UINT32 c;
1373 OPJ_UINT32 i;
1374 i = 0;
1375 if (i + 1 < sn) {
1376 __m128i xmm_Si_0 = *(const __m128i*)(tmp + 4 * 0);
1377 __m128i xmm_Si_1 = *(const __m128i*)(tmp + 4 * 1);
1378 for (; i + 1 < sn; i++) {
1379 __m128i xmm_Sip1_0 = *(const __m128i*)(tmp +
1380 (i + 1) * 2 * NB_ELTS_V8 + 4 * 0);
1381 __m128i xmm_Sip1_1 = *(const __m128i*)(tmp +
1382 (i + 1) * 2 * NB_ELTS_V8 + 4 * 1);
1383 __m128i xmm_Di_0 = *(const __m128i*)(tmp +
1384 (1 + i * 2) * NB_ELTS_V8 + 4 * 0);
1385 __m128i xmm_Di_1 = *(const __m128i*)(tmp +
1386 (1 + i * 2) * NB_ELTS_V8 + 4 * 1);
1387 xmm_Di_0 = _mm_sub_epi32(xmm_Di_0,
1388 _mm_srai_epi32(_mm_add_epi32(xmm_Si_0, xmm_Sip1_0), 1));
1389 xmm_Di_1 = _mm_sub_epi32(xmm_Di_1,
1390 _mm_srai_epi32(_mm_add_epi32(xmm_Si_1, xmm_Sip1_1), 1));
1391 *(__m128i*)(tmp + (1 + i * 2) * NB_ELTS_V8 + 4 * 0) = xmm_Di_0;
1392 *(__m128i*)(tmp + (1 + i * 2) * NB_ELTS_V8 + 4 * 1) = xmm_Di_1;
1393 xmm_Si_0 = xmm_Sip1_0;
1394 xmm_Si_1 = xmm_Sip1_1;
1395 }
1396 }
1397 if (((height) % 2) == 0) {
1398 for (c = 0; c < NB_ELTS_V8; c++) {
1399 OPJ_Dc(i) -= OPJ_Sc(i);
1400 }
1401 }
1402 for (c = 0; c < NB_ELTS_V8; c++) {
1403 OPJ_Sc(0) += (OPJ_Dc(0) + OPJ_Dc(0) + 2) >> 2;
1404 }
1405 i = 1;
1406 if (i < dn) {
1407 __m128i xmm_Dim1_0 = *(const __m128i*)(tmp + (1 +
1408 (i - 1) * 2) * NB_ELTS_V8 + 4 * 0);
1409 __m128i xmm_Dim1_1 = *(const __m128i*)(tmp + (1 +
1410 (i - 1) * 2) * NB_ELTS_V8 + 4 * 1);
1411 const __m128i xmm_two = _mm_set1_epi32(2);
1412 for (; i < dn; i++) {
1413 __m128i xmm_Di_0 = *(const __m128i*)(tmp +
1414 (1 + i * 2) * NB_ELTS_V8 + 4 * 0);
1415 __m128i xmm_Di_1 = *(const __m128i*)(tmp +
1416 (1 + i * 2) * NB_ELTS_V8 + 4 * 1);
1417 __m128i xmm_Si_0 = *(const __m128i*)(tmp +
1418 (i * 2) * NB_ELTS_V8 + 4 * 0);
1419 __m128i xmm_Si_1 = *(const __m128i*)(tmp +
1420 (i * 2) * NB_ELTS_V8 + 4 * 1);
1421 xmm_Si_0 = _mm_add_epi32(xmm_Si_0,
1422 _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(xmm_Dim1_0, xmm_Di_0), xmm_two), 2));
1423 xmm_Si_1 = _mm_add_epi32(xmm_Si_1,
1424 _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(xmm_Dim1_1, xmm_Di_1), xmm_two), 2));
1425 *(__m128i*)(tmp + (i * 2) * NB_ELTS_V8 + 4 * 0) = xmm_Si_0;
1426 *(__m128i*)(tmp + (i * 2) * NB_ELTS_V8 + 4 * 1) = xmm_Si_1;
1427 xmm_Dim1_0 = xmm_Di_0;
1428 xmm_Dim1_1 = xmm_Di_1;
1429 }
1430 }
1431 if (((height) % 2) == 1) {
1432 for (c = 0; c < NB_ELTS_V8; c++) {
1433 OPJ_Sc(i) += (OPJ_Dc(i - 1) + OPJ_Dc(i - 1) + 2) >> 2;
1434 }
1435 }
1436 } else {
1437 OPJ_UINT32 c;
1438 OPJ_UINT32 i;
1439 for (c = 0; c < NB_ELTS_V8; c++) {
1440 OPJ_Sc(0) -= OPJ_Dc(0);
1441 }
1442 i = 1;
1443 if (i < sn) {
1444 __m128i xmm_Dim1_0 = *(const __m128i*)(tmp + (1 +
1445 (i - 1) * 2) * NB_ELTS_V8 + 4 * 0);
1446 __m128i xmm_Dim1_1 = *(const __m128i*)(tmp + (1 +
1447 (i - 1) * 2) * NB_ELTS_V8 + 4 * 1);
1448 for (; i < sn; i++) {
1449 __m128i xmm_Di_0 = *(const __m128i*)(tmp +
1450 (1 + i * 2) * NB_ELTS_V8 + 4 * 0);
1451 __m128i xmm_Di_1 = *(const __m128i*)(tmp +
1452 (1 + i * 2) * NB_ELTS_V8 + 4 * 1);
1453 __m128i xmm_Si_0 = *(const __m128i*)(tmp +
1454 (i * 2) * NB_ELTS_V8 + 4 * 0);
1455 __m128i xmm_Si_1 = *(const __m128i*)(tmp +
1456 (i * 2) * NB_ELTS_V8 + 4 * 1);
1457 xmm_Si_0 = _mm_sub_epi32(xmm_Si_0,
1458 _mm_srai_epi32(_mm_add_epi32(xmm_Di_0, xmm_Dim1_0), 1));
1459 xmm_Si_1 = _mm_sub_epi32(xmm_Si_1,
1460 _mm_srai_epi32(_mm_add_epi32(xmm_Di_1, xmm_Dim1_1), 1));
1461 *(__m128i*)(tmp + (i * 2) * NB_ELTS_V8 + 4 * 0) = xmm_Si_0;
1462 *(__m128i*)(tmp + (i * 2) * NB_ELTS_V8 + 4 * 1) = xmm_Si_1;
1463 xmm_Dim1_0 = xmm_Di_0;
1464 xmm_Dim1_1 = xmm_Di_1;
1465 }
1466 }
1467 if (((height) % 2) == 1) {
1468 for (c = 0; c < NB_ELTS_V8; c++) {
1469 OPJ_Sc(i) -= OPJ_Dc(i - 1);
1470 }
1471 }
1472 i = 0;
1473 if (i + 1 < dn) {
1474 __m128i xmm_Si_0 = *((const __m128i*)(tmp + 4 * 0));
1475 __m128i xmm_Si_1 = *((const __m128i*)(tmp + 4 * 1));
1476 const __m128i xmm_two = _mm_set1_epi32(2);
1477 for (; i + 1 < dn; i++) {
1478 __m128i xmm_Sip1_0 = *(const __m128i*)(tmp +
1479 (i + 1) * 2 * NB_ELTS_V8 + 4 * 0);
1480 __m128i xmm_Sip1_1 = *(const __m128i*)(tmp +
1481 (i + 1) * 2 * NB_ELTS_V8 + 4 * 1);
1482 __m128i xmm_Di_0 = *(const __m128i*)(tmp +
1483 (1 + i * 2) * NB_ELTS_V8 + 4 * 0);
1484 __m128i xmm_Di_1 = *(const __m128i*)(tmp +
1485 (1 + i * 2) * NB_ELTS_V8 + 4 * 1);
1486 xmm_Di_0 = _mm_add_epi32(xmm_Di_0,
1487 _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(xmm_Si_0, xmm_Sip1_0), xmm_two), 2));
1488 xmm_Di_1 = _mm_add_epi32(xmm_Di_1,
1489 _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(xmm_Si_1, xmm_Sip1_1), xmm_two), 2));
1490 *(__m128i*)(tmp + (1 + i * 2) * NB_ELTS_V8 + 4 * 0) = xmm_Di_0;
1491 *(__m128i*)(tmp + (1 + i * 2) * NB_ELTS_V8 + 4 * 1) = xmm_Di_1;
1492 xmm_Si_0 = xmm_Sip1_0;
1493 xmm_Si_1 = xmm_Sip1_1;
1494 }
1495 }
1496 if (((height) % 2) == 0) {
1497 for (c = 0; c < NB_ELTS_V8; c++) {
1498 OPJ_Dc(i) += (OPJ_Sc(i) + OPJ_Sc(i) + 2) >> 2;
1499 }
1500 }
1501 }
1502 #else
1503 if (even) {
1504 OPJ_UINT32 c;
1505 if (height > 1) {
1506 OPJ_UINT32 i;
1507 for (i = 0; i + 1 < sn; i++) {
1508 for (c = 0; c < NB_ELTS_V8; c++) {
1509 OPJ_Dc(i) -= (OPJ_Sc(i) + OPJ_Sc(i + 1)) >> 1;
1510 }
1511 }
1512 if (((height) % 2) == 0) {
1513 for (c = 0; c < NB_ELTS_V8; c++) {
1514 OPJ_Dc(i) -= OPJ_Sc(i);
1515 }
1516 }
1517 for (c = 0; c < NB_ELTS_V8; c++) {
1518 OPJ_Sc(0) += (OPJ_Dc(0) + OPJ_Dc(0) + 2) >> 2;
1519 }
1520 for (i = 1; i < dn; i++) {
1521 for (c = 0; c < NB_ELTS_V8; c++) {
1522 OPJ_Sc(i) += (OPJ_Dc(i - 1) + OPJ_Dc(i) + 2) >> 2;
1523 }
1524 }
1525 if (((height) % 2) == 1) {
1526 for (c = 0; c < NB_ELTS_V8; c++) {
1527 OPJ_Sc(i) += (OPJ_Dc(i - 1) + OPJ_Dc(i - 1) + 2) >> 2;
1528 }
1529 }
1530 }
1531 } else {
1532 OPJ_UINT32 c;
1533 if (height == 1) {
1534 for (c = 0; c < NB_ELTS_V8; c++) {
1535 OPJ_Sc(0) *= 2;
1536 }
1537 } else {
1538 OPJ_UINT32 i;
1539 for (c = 0; c < NB_ELTS_V8; c++) {
1540 OPJ_Sc(0) -= OPJ_Dc(0);
1541 }
1542 for (i = 1; i < sn; i++) {
1543 for (c = 0; c < NB_ELTS_V8; c++) {
1544 OPJ_Sc(i) -= (OPJ_Dc(i) + OPJ_Dc(i - 1)) >> 1;
1545 }
1546 }
1547 if (((height) % 2) == 1) {
1548 for (c = 0; c < NB_ELTS_V8; c++) {
1549 OPJ_Sc(i) -= OPJ_Dc(i - 1);
1550 }
1551 }
1552 for (i = 0; i + 1 < dn; i++) {
1553 for (c = 0; c < NB_ELTS_V8; c++) {
1554 OPJ_Dc(i) += (OPJ_Sc(i) + OPJ_Sc(i + 1) + 2) >> 2;
1555 }
1556 }
1557 if (((height) % 2) == 0) {
1558 for (c = 0; c < NB_ELTS_V8; c++) {
1559 OPJ_Dc(i) += (OPJ_Sc(i) + OPJ_Sc(i) + 2) >> 2;
1560 }
1561 }
1562 }
1563 }
1564 #endif
1565
1566 if (cols == NB_ELTS_V8) {
1567 opj_dwt_deinterleave_v_cols(tmp, array, (OPJ_INT32)dn, (OPJ_INT32)sn,
1568 stride_width, even ? 0 : 1, NB_ELTS_V8);
1569 } else {
1570 opj_dwt_deinterleave_v_cols(tmp, array, (OPJ_INT32)dn, (OPJ_INT32)sn,
1571 stride_width, even ? 0 : 1, cols);
1572 }
1573 }
1574
opj_v8dwt_encode_step1(OPJ_FLOAT32 * fw,OPJ_UINT32 end,const OPJ_FLOAT32 cst)1575 static void opj_v8dwt_encode_step1(OPJ_FLOAT32* fw,
1576 OPJ_UINT32 end,
1577 const OPJ_FLOAT32 cst)
1578 {
1579 OPJ_UINT32 i;
1580 #ifdef __SSE__
1581 __m128* vw = (__m128*) fw;
1582 const __m128 vcst = _mm_set1_ps(cst);
1583 for (i = 0; i < end; ++i) {
1584 vw[0] = _mm_mul_ps(vw[0], vcst);
1585 vw[1] = _mm_mul_ps(vw[1], vcst);
1586 vw += 2 * (NB_ELTS_V8 * sizeof(OPJ_FLOAT32) / sizeof(__m128));
1587 }
1588 #else
1589 OPJ_UINT32 c;
1590 for (i = 0; i < end; ++i) {
1591 for (c = 0; c < NB_ELTS_V8; c++) {
1592 fw[i * 2 * NB_ELTS_V8 + c] *= cst;
1593 }
1594 }
1595 #endif
1596 }
1597
opj_v8dwt_encode_step2(OPJ_FLOAT32 * fl,OPJ_FLOAT32 * fw,OPJ_UINT32 end,OPJ_UINT32 m,OPJ_FLOAT32 cst)1598 static void opj_v8dwt_encode_step2(OPJ_FLOAT32* fl, OPJ_FLOAT32* fw,
1599 OPJ_UINT32 end,
1600 OPJ_UINT32 m,
1601 OPJ_FLOAT32 cst)
1602 {
1603 OPJ_UINT32 i;
1604 OPJ_UINT32 imax = opj_uint_min(end, m);
1605 #ifdef __SSE__
1606 __m128* vw = (__m128*) fw;
1607 __m128 vcst = _mm_set1_ps(cst);
1608 if (imax > 0) {
1609 __m128* vl = (__m128*) fl;
1610 vw[-2] = _mm_add_ps(vw[-2], _mm_mul_ps(_mm_add_ps(vl[0], vw[0]), vcst));
1611 vw[-1] = _mm_add_ps(vw[-1], _mm_mul_ps(_mm_add_ps(vl[1], vw[1]), vcst));
1612 vw += 2 * (NB_ELTS_V8 * sizeof(OPJ_FLOAT32) / sizeof(__m128));
1613 i = 1;
1614
1615 for (; i < imax; ++i) {
1616 vw[-2] = _mm_add_ps(vw[-2], _mm_mul_ps(_mm_add_ps(vw[-4], vw[0]), vcst));
1617 vw[-1] = _mm_add_ps(vw[-1], _mm_mul_ps(_mm_add_ps(vw[-3], vw[1]), vcst));
1618 vw += 2 * (NB_ELTS_V8 * sizeof(OPJ_FLOAT32) / sizeof(__m128));
1619 }
1620 }
1621 if (m < end) {
1622 assert(m + 1 == end);
1623 vcst = _mm_add_ps(vcst, vcst);
1624 vw[-2] = _mm_add_ps(vw[-2], _mm_mul_ps(vw[-4], vcst));
1625 vw[-1] = _mm_add_ps(vw[-1], _mm_mul_ps(vw[-3], vcst));
1626 }
1627 #else
1628 OPJ_INT32 c;
1629 if (imax > 0) {
1630 for (c = 0; c < NB_ELTS_V8; c++) {
1631 fw[-1 * NB_ELTS_V8 + c] += (fl[0 * NB_ELTS_V8 + c] + fw[0 * NB_ELTS_V8 + c]) *
1632 cst;
1633 }
1634 fw += 2 * NB_ELTS_V8;
1635 i = 1;
1636 for (; i < imax; ++i) {
1637 for (c = 0; c < NB_ELTS_V8; c++) {
1638 fw[-1 * NB_ELTS_V8 + c] += (fw[-2 * NB_ELTS_V8 + c] + fw[0 * NB_ELTS_V8 + c]) *
1639 cst;
1640 }
1641 fw += 2 * NB_ELTS_V8;
1642 }
1643 }
1644 if (m < end) {
1645 assert(m + 1 == end);
1646 for (c = 0; c < NB_ELTS_V8; c++) {
1647 fw[-1 * NB_ELTS_V8 + c] += (2 * fw[-2 * NB_ELTS_V8 + c]) * cst;
1648 }
1649 }
1650 #endif
1651 }
1652
1653 /* Forward 9-7 transform, for the vertical pass, processing cols columns */
1654 /* where cols <= NB_ELTS_V8 */
opj_dwt_encode_and_deinterleave_v_real(void * arrayIn,void * tmpIn,OPJ_UINT32 height,OPJ_BOOL even,OPJ_UINT32 stride_width,OPJ_UINT32 cols)1655 static void opj_dwt_encode_and_deinterleave_v_real(
1656 void *arrayIn,
1657 void *tmpIn,
1658 OPJ_UINT32 height,
1659 OPJ_BOOL even,
1660 OPJ_UINT32 stride_width,
1661 OPJ_UINT32 cols)
1662 {
1663 OPJ_FLOAT32* OPJ_RESTRICT array = (OPJ_FLOAT32 * OPJ_RESTRICT)arrayIn;
1664 OPJ_FLOAT32* OPJ_RESTRICT tmp = (OPJ_FLOAT32 * OPJ_RESTRICT)tmpIn;
1665 const OPJ_INT32 sn = (OPJ_INT32)((height + (even ? 1 : 0)) >> 1);
1666 const OPJ_INT32 dn = (OPJ_INT32)(height - (OPJ_UINT32)sn);
1667 OPJ_INT32 a, b;
1668
1669 if (height == 1) {
1670 return;
1671 }
1672
1673 opj_dwt_fetch_cols_vertical_pass(arrayIn, tmpIn, height, stride_width, cols);
1674
1675 if (even) {
1676 a = 0;
1677 b = 1;
1678 } else {
1679 a = 1;
1680 b = 0;
1681 }
1682 opj_v8dwt_encode_step2(tmp + a * NB_ELTS_V8,
1683 tmp + (b + 1) * NB_ELTS_V8,
1684 (OPJ_UINT32)dn,
1685 (OPJ_UINT32)opj_int_min(dn, sn - b),
1686 opj_dwt_alpha);
1687 opj_v8dwt_encode_step2(tmp + b * NB_ELTS_V8,
1688 tmp + (a + 1) * NB_ELTS_V8,
1689 (OPJ_UINT32)sn,
1690 (OPJ_UINT32)opj_int_min(sn, dn - a),
1691 opj_dwt_beta);
1692 opj_v8dwt_encode_step2(tmp + a * NB_ELTS_V8,
1693 tmp + (b + 1) * NB_ELTS_V8,
1694 (OPJ_UINT32)dn,
1695 (OPJ_UINT32)opj_int_min(dn, sn - b),
1696 opj_dwt_gamma);
1697 opj_v8dwt_encode_step2(tmp + b * NB_ELTS_V8,
1698 tmp + (a + 1) * NB_ELTS_V8,
1699 (OPJ_UINT32)sn,
1700 (OPJ_UINT32)opj_int_min(sn, dn - a),
1701 opj_dwt_delta);
1702 opj_v8dwt_encode_step1(tmp + b * NB_ELTS_V8, (OPJ_UINT32)dn,
1703 opj_K);
1704 opj_v8dwt_encode_step1(tmp + a * NB_ELTS_V8, (OPJ_UINT32)sn,
1705 opj_invK);
1706
1707
1708 if (cols == NB_ELTS_V8) {
1709 opj_dwt_deinterleave_v_cols((OPJ_INT32*)tmp,
1710 (OPJ_INT32*)array,
1711 (OPJ_INT32)dn, (OPJ_INT32)sn,
1712 stride_width, even ? 0 : 1, NB_ELTS_V8);
1713 } else {
1714 opj_dwt_deinterleave_v_cols((OPJ_INT32*)tmp,
1715 (OPJ_INT32*)array,
1716 (OPJ_INT32)dn, (OPJ_INT32)sn,
1717 stride_width, even ? 0 : 1, cols);
1718 }
1719 }
1720
1721
1722 /* <summary> */
1723 /* Forward 5-3 wavelet transform in 2-D. */
1724 /* </summary> */
opj_dwt_encode_procedure(opj_thread_pool_t * tp,opj_tcd_tilecomp_t * tilec,opj_encode_and_deinterleave_v_fnptr_type p_encode_and_deinterleave_v,opj_encode_and_deinterleave_h_one_row_fnptr_type p_encode_and_deinterleave_h_one_row)1725 static INLINE OPJ_BOOL opj_dwt_encode_procedure(opj_thread_pool_t* tp,
1726 opj_tcd_tilecomp_t * tilec,
1727 opj_encode_and_deinterleave_v_fnptr_type p_encode_and_deinterleave_v,
1728 opj_encode_and_deinterleave_h_one_row_fnptr_type
1729 p_encode_and_deinterleave_h_one_row)
1730 {
1731 OPJ_INT32 i;
1732 OPJ_INT32 *bj = 00;
1733 OPJ_UINT32 w;
1734 OPJ_INT32 l;
1735
1736 OPJ_SIZE_T l_data_size;
1737
1738 opj_tcd_resolution_t * l_cur_res = 0;
1739 opj_tcd_resolution_t * l_last_res = 0;
1740 const int num_threads = opj_thread_pool_get_thread_count(tp);
1741 OPJ_INT32 * OPJ_RESTRICT tiledp = tilec->data;
1742
1743 w = (OPJ_UINT32)(tilec->x1 - tilec->x0);
1744 l = (OPJ_INT32)tilec->numresolutions - 1;
1745
1746 l_cur_res = tilec->resolutions + l;
1747 l_last_res = l_cur_res - 1;
1748
1749 l_data_size = opj_dwt_max_resolution(tilec->resolutions, tilec->numresolutions);
1750 /* overflow check */
1751 if (l_data_size > (SIZE_MAX / (NB_ELTS_V8 * sizeof(OPJ_INT32)))) {
1752 /* FIXME event manager error callback */
1753 return OPJ_FALSE;
1754 }
1755 l_data_size *= NB_ELTS_V8 * sizeof(OPJ_INT32);
1756 bj = (OPJ_INT32*)opj_aligned_32_malloc(l_data_size);
1757 /* l_data_size is equal to 0 when numresolutions == 1 but bj is not used */
1758 /* in that case, so do not error out */
1759 if (l_data_size != 0 && ! bj) {
1760 return OPJ_FALSE;
1761 }
1762 i = l;
1763
1764 while (i--) {
1765 OPJ_UINT32 j;
1766 OPJ_UINT32 rw; /* width of the resolution level computed */
1767 OPJ_UINT32 rh; /* height of the resolution level computed */
1768 OPJ_UINT32
1769 rw1; /* width of the resolution level once lower than computed one */
1770 OPJ_UINT32
1771 rh1; /* height of the resolution level once lower than computed one */
1772 OPJ_INT32 cas_col; /* 0 = non inversion on horizontal filtering 1 = inversion between low-pass and high-pass filtering */
1773 OPJ_INT32 cas_row; /* 0 = non inversion on vertical filtering 1 = inversion between low-pass and high-pass filtering */
1774 OPJ_INT32 dn, sn;
1775
1776 rw = (OPJ_UINT32)(l_cur_res->x1 - l_cur_res->x0);
1777 rh = (OPJ_UINT32)(l_cur_res->y1 - l_cur_res->y0);
1778 rw1 = (OPJ_UINT32)(l_last_res->x1 - l_last_res->x0);
1779 rh1 = (OPJ_UINT32)(l_last_res->y1 - l_last_res->y0);
1780
1781 cas_row = l_cur_res->x0 & 1;
1782 cas_col = l_cur_res->y0 & 1;
1783
1784 sn = (OPJ_INT32)rh1;
1785 dn = (OPJ_INT32)(rh - rh1);
1786
1787 /* Perform vertical pass */
1788 if (num_threads <= 1 || rw < 2 * NB_ELTS_V8) {
1789 for (j = 0; j + NB_ELTS_V8 - 1 < rw; j += NB_ELTS_V8) {
1790 p_encode_and_deinterleave_v(tiledp + j,
1791 bj,
1792 rh,
1793 cas_col == 0,
1794 w,
1795 NB_ELTS_V8);
1796 }
1797 if (j < rw) {
1798 p_encode_and_deinterleave_v(tiledp + j,
1799 bj,
1800 rh,
1801 cas_col == 0,
1802 w,
1803 rw - j);
1804 }
1805 } else {
1806 OPJ_UINT32 num_jobs = (OPJ_UINT32)num_threads;
1807 OPJ_UINT32 step_j;
1808
1809 if (rw < num_jobs) {
1810 num_jobs = rw;
1811 }
1812 step_j = ((rw / num_jobs) / NB_ELTS_V8) * NB_ELTS_V8;
1813
1814 for (j = 0; j < num_jobs; j++) {
1815 opj_dwt_encode_v_job_t* job;
1816
1817 job = (opj_dwt_encode_v_job_t*) opj_malloc(sizeof(opj_dwt_encode_v_job_t));
1818 if (!job) {
1819 opj_thread_pool_wait_completion(tp, 0);
1820 opj_aligned_free(bj);
1821 return OPJ_FALSE;
1822 }
1823 job->v.mem = (OPJ_INT32*)opj_aligned_32_malloc(l_data_size);
1824 if (!job->v.mem) {
1825 opj_thread_pool_wait_completion(tp, 0);
1826 opj_free(job);
1827 opj_aligned_free(bj);
1828 return OPJ_FALSE;
1829 }
1830 job->v.dn = dn;
1831 job->v.sn = sn;
1832 job->v.cas = cas_col;
1833 job->rh = rh;
1834 job->w = w;
1835 job->tiledp = tiledp;
1836 job->min_j = j * step_j;
1837 job->max_j = (j + 1 == num_jobs) ? rw : (j + 1) * step_j;
1838 job->p_encode_and_deinterleave_v = p_encode_and_deinterleave_v;
1839 opj_thread_pool_submit_job(tp, opj_dwt_encode_v_func, job);
1840 }
1841 opj_thread_pool_wait_completion(tp, 0);
1842 }
1843
1844 sn = (OPJ_INT32)rw1;
1845 dn = (OPJ_INT32)(rw - rw1);
1846
1847 /* Perform horizontal pass */
1848 if (num_threads <= 1 || rh <= 1) {
1849 for (j = 0; j < rh; j++) {
1850 OPJ_INT32* OPJ_RESTRICT aj = tiledp + j * w;
1851 (*p_encode_and_deinterleave_h_one_row)(aj, bj, rw,
1852 cas_row == 0 ? OPJ_TRUE : OPJ_FALSE);
1853 }
1854 } else {
1855 OPJ_UINT32 num_jobs = (OPJ_UINT32)num_threads;
1856 OPJ_UINT32 step_j;
1857
1858 if (rh < num_jobs) {
1859 num_jobs = rh;
1860 }
1861 step_j = (rh / num_jobs);
1862
1863 for (j = 0; j < num_jobs; j++) {
1864 opj_dwt_encode_h_job_t* job;
1865
1866 job = (opj_dwt_encode_h_job_t*) opj_malloc(sizeof(opj_dwt_encode_h_job_t));
1867 if (!job) {
1868 opj_thread_pool_wait_completion(tp, 0);
1869 opj_aligned_free(bj);
1870 return OPJ_FALSE;
1871 }
1872 job->h.mem = (OPJ_INT32*)opj_aligned_32_malloc(l_data_size);
1873 if (!job->h.mem) {
1874 opj_thread_pool_wait_completion(tp, 0);
1875 opj_free(job);
1876 opj_aligned_free(bj);
1877 return OPJ_FALSE;
1878 }
1879 job->h.dn = dn;
1880 job->h.sn = sn;
1881 job->h.cas = cas_row;
1882 job->rw = rw;
1883 job->w = w;
1884 job->tiledp = tiledp;
1885 job->min_j = j * step_j;
1886 job->max_j = (j + 1U) * step_j; /* this can overflow */
1887 if (j == (num_jobs - 1U)) { /* this will take care of the overflow */
1888 job->max_j = rh;
1889 }
1890 job->p_function = p_encode_and_deinterleave_h_one_row;
1891 opj_thread_pool_submit_job(tp, opj_dwt_encode_h_func, job);
1892 }
1893 opj_thread_pool_wait_completion(tp, 0);
1894 }
1895
1896 l_cur_res = l_last_res;
1897
1898 --l_last_res;
1899 }
1900
1901 opj_aligned_free(bj);
1902 return OPJ_TRUE;
1903 }
1904
1905 /* Forward 5-3 wavelet transform in 2-D. */
1906 /* </summary> */
opj_dwt_encode(opj_tcd_t * p_tcd,opj_tcd_tilecomp_t * tilec)1907 OPJ_BOOL opj_dwt_encode(opj_tcd_t *p_tcd,
1908 opj_tcd_tilecomp_t * tilec)
1909 {
1910 return opj_dwt_encode_procedure(p_tcd->thread_pool, tilec,
1911 opj_dwt_encode_and_deinterleave_v,
1912 opj_dwt_encode_and_deinterleave_h_one_row);
1913 }
1914
1915 /* <summary> */
1916 /* Inverse 5-3 wavelet transform in 2-D. */
1917 /* </summary> */
opj_dwt_decode(opj_tcd_t * p_tcd,opj_tcd_tilecomp_t * tilec,OPJ_UINT32 numres)1918 OPJ_BOOL opj_dwt_decode(opj_tcd_t *p_tcd, opj_tcd_tilecomp_t* tilec,
1919 OPJ_UINT32 numres)
1920 {
1921 if (p_tcd->whole_tile_decoding) {
1922 return opj_dwt_decode_tile(p_tcd->thread_pool, tilec, numres);
1923 } else {
1924 return opj_dwt_decode_partial_tile(tilec, numres);
1925 }
1926 }
1927
1928 /* <summary> */
1929 /* Get norm of 5-3 wavelet. */
1930 /* </summary> */
opj_dwt_getnorm(OPJ_UINT32 level,OPJ_UINT32 orient)1931 OPJ_FLOAT64 opj_dwt_getnorm(OPJ_UINT32 level, OPJ_UINT32 orient)
1932 {
1933 /* FIXME ! This is just a band-aid to avoid a buffer overflow */
1934 /* but the array should really be extended up to 33 resolution levels */
1935 /* See https://github.com/uclouvain/openjpeg/issues/493 */
1936 if (orient == 0 && level >= 10) {
1937 level = 9;
1938 } else if (orient > 0 && level >= 9) {
1939 level = 8;
1940 }
1941 return opj_dwt_norms[orient][level];
1942 }
1943
1944 /* <summary> */
1945 /* Forward 9-7 wavelet transform in 2-D. */
1946 /* </summary> */
opj_dwt_encode_real(opj_tcd_t * p_tcd,opj_tcd_tilecomp_t * tilec)1947 OPJ_BOOL opj_dwt_encode_real(opj_tcd_t *p_tcd,
1948 opj_tcd_tilecomp_t * tilec)
1949 {
1950 return opj_dwt_encode_procedure(p_tcd->thread_pool, tilec,
1951 opj_dwt_encode_and_deinterleave_v_real,
1952 opj_dwt_encode_and_deinterleave_h_one_row_real);
1953 }
1954
1955 /* <summary> */
1956 /* Get norm of 9-7 wavelet. */
1957 /* </summary> */
opj_dwt_getnorm_real(OPJ_UINT32 level,OPJ_UINT32 orient)1958 OPJ_FLOAT64 opj_dwt_getnorm_real(OPJ_UINT32 level, OPJ_UINT32 orient)
1959 {
1960 /* FIXME ! This is just a band-aid to avoid a buffer overflow */
1961 /* but the array should really be extended up to 33 resolution levels */
1962 /* See https://github.com/uclouvain/openjpeg/issues/493 */
1963 if (orient == 0 && level >= 10) {
1964 level = 9;
1965 } else if (orient > 0 && level >= 9) {
1966 level = 8;
1967 }
1968 return opj_dwt_norms_real[orient][level];
1969 }
1970
opj_dwt_calc_explicit_stepsizes(opj_tccp_t * tccp,OPJ_UINT32 prec)1971 void opj_dwt_calc_explicit_stepsizes(opj_tccp_t * tccp, OPJ_UINT32 prec)
1972 {
1973 OPJ_UINT32 numbands, bandno;
1974 numbands = 3 * tccp->numresolutions - 2;
1975 for (bandno = 0; bandno < numbands; bandno++) {
1976 OPJ_FLOAT64 stepsize;
1977 OPJ_UINT32 resno, level, orient, gain;
1978
1979 resno = (bandno == 0) ? 0 : ((bandno - 1) / 3 + 1);
1980 orient = (bandno == 0) ? 0 : ((bandno - 1) % 3 + 1);
1981 level = tccp->numresolutions - 1 - resno;
1982 gain = (tccp->qmfbid == 0) ? 0 : ((orient == 0) ? 0 : (((orient == 1) ||
1983 (orient == 2)) ? 1 : 2));
1984 if (tccp->qntsty == J2K_CCP_QNTSTY_NOQNT) {
1985 stepsize = 1.0;
1986 } else {
1987 OPJ_FLOAT64 norm = opj_dwt_getnorm_real(level, orient);
1988 stepsize = (1 << (gain)) / norm;
1989 }
1990 opj_dwt_encode_stepsize((OPJ_INT32) floor(stepsize * 8192.0),
1991 (OPJ_INT32)(prec + gain), &tccp->stepsizes[bandno]);
1992 }
1993 }
1994
1995 /* <summary> */
1996 /* Determine maximum computed resolution level for inverse wavelet transform */
1997 /* </summary> */
opj_dwt_max_resolution(opj_tcd_resolution_t * OPJ_RESTRICT r,OPJ_UINT32 i)1998 static OPJ_UINT32 opj_dwt_max_resolution(opj_tcd_resolution_t* OPJ_RESTRICT r,
1999 OPJ_UINT32 i)
2000 {
2001 OPJ_UINT32 mr = 0;
2002 OPJ_UINT32 w;
2003 while (--i) {
2004 ++r;
2005 if (mr < (w = (OPJ_UINT32)(r->x1 - r->x0))) {
2006 mr = w ;
2007 }
2008 if (mr < (w = (OPJ_UINT32)(r->y1 - r->y0))) {
2009 mr = w ;
2010 }
2011 }
2012 return mr ;
2013 }
2014
2015 typedef struct {
2016 opj_dwt_t h;
2017 OPJ_UINT32 rw;
2018 OPJ_UINT32 w;
2019 OPJ_INT32 * OPJ_RESTRICT tiledp;
2020 OPJ_UINT32 min_j;
2021 OPJ_UINT32 max_j;
2022 } opj_dwt_decode_h_job_t;
2023
opj_dwt_decode_h_func(void * user_data,opj_tls_t * tls)2024 static void opj_dwt_decode_h_func(void* user_data, opj_tls_t* tls)
2025 {
2026 OPJ_UINT32 j;
2027 opj_dwt_decode_h_job_t* job;
2028 (void)tls;
2029
2030 job = (opj_dwt_decode_h_job_t*)user_data;
2031 for (j = job->min_j; j < job->max_j; j++) {
2032 opj_idwt53_h(&job->h, &job->tiledp[j * job->w]);
2033 }
2034
2035 opj_aligned_free(job->h.mem);
2036 opj_free(job);
2037 }
2038
2039 typedef struct {
2040 opj_dwt_t v;
2041 OPJ_UINT32 rh;
2042 OPJ_UINT32 w;
2043 OPJ_INT32 * OPJ_RESTRICT tiledp;
2044 OPJ_UINT32 min_j;
2045 OPJ_UINT32 max_j;
2046 } opj_dwt_decode_v_job_t;
2047
opj_dwt_decode_v_func(void * user_data,opj_tls_t * tls)2048 static void opj_dwt_decode_v_func(void* user_data, opj_tls_t* tls)
2049 {
2050 OPJ_UINT32 j;
2051 opj_dwt_decode_v_job_t* job;
2052 (void)tls;
2053
2054 job = (opj_dwt_decode_v_job_t*)user_data;
2055 for (j = job->min_j; j + PARALLEL_COLS_53 <= job->max_j;
2056 j += PARALLEL_COLS_53) {
2057 opj_idwt53_v(&job->v, &job->tiledp[j], (OPJ_SIZE_T)job->w,
2058 PARALLEL_COLS_53);
2059 }
2060 if (j < job->max_j)
2061 opj_idwt53_v(&job->v, &job->tiledp[j], (OPJ_SIZE_T)job->w,
2062 (OPJ_INT32)(job->max_j - j));
2063
2064 opj_aligned_free(job->v.mem);
2065 opj_free(job);
2066 }
2067
2068
2069 /* <summary> */
2070 /* Inverse wavelet transform in 2-D. */
2071 /* </summary> */
opj_dwt_decode_tile(opj_thread_pool_t * tp,const opj_tcd_tilecomp_t * tilec,OPJ_UINT32 numres)2072 static OPJ_BOOL opj_dwt_decode_tile(opj_thread_pool_t* tp,
2073 const opj_tcd_tilecomp_t* tilec, OPJ_UINT32 numres)
2074 {
2075 opj_dwt_t h;
2076 opj_dwt_t v;
2077
2078 opj_tcd_resolution_t* tr = tilec->resolutions;
2079
2080 OPJ_UINT32 rw = (OPJ_UINT32)(tr->x1 -
2081 tr->x0); /* width of the resolution level computed */
2082 OPJ_UINT32 rh = (OPJ_UINT32)(tr->y1 -
2083 tr->y0); /* height of the resolution level computed */
2084
2085 OPJ_UINT32 w = (OPJ_UINT32)(tilec->resolutions[tilec->minimum_num_resolutions -
2086 1].x1 -
2087 tilec->resolutions[tilec->minimum_num_resolutions - 1].x0);
2088 OPJ_SIZE_T h_mem_size;
2089 int num_threads;
2090
2091 if (numres == 1U) {
2092 return OPJ_TRUE;
2093 }
2094 num_threads = opj_thread_pool_get_thread_count(tp);
2095 h.mem_count = opj_dwt_max_resolution(tr, numres);
2096 /* overflow check */
2097 if (h.mem_count > (SIZE_MAX / PARALLEL_COLS_53 / sizeof(OPJ_INT32))) {
2098 /* FIXME event manager error callback */
2099 return OPJ_FALSE;
2100 }
2101 /* We need PARALLEL_COLS_53 times the height of the array, */
2102 /* since for the vertical pass */
2103 /* we process PARALLEL_COLS_53 columns at a time */
2104 h_mem_size = h.mem_count * PARALLEL_COLS_53 * sizeof(OPJ_INT32);
2105 h.mem = (OPJ_INT32*)opj_aligned_32_malloc(h_mem_size);
2106 if (! h.mem) {
2107 /* FIXME event manager error callback */
2108 return OPJ_FALSE;
2109 }
2110
2111 v.mem_count = h.mem_count;
2112 v.mem = h.mem;
2113
2114 while (--numres) {
2115 OPJ_INT32 * OPJ_RESTRICT tiledp = tilec->data;
2116 OPJ_UINT32 j;
2117
2118 ++tr;
2119 h.sn = (OPJ_INT32)rw;
2120 v.sn = (OPJ_INT32)rh;
2121
2122 rw = (OPJ_UINT32)(tr->x1 - tr->x0);
2123 rh = (OPJ_UINT32)(tr->y1 - tr->y0);
2124
2125 h.dn = (OPJ_INT32)(rw - (OPJ_UINT32)h.sn);
2126 h.cas = tr->x0 % 2;
2127
2128 if (num_threads <= 1 || rh <= 1) {
2129 for (j = 0; j < rh; ++j) {
2130 opj_idwt53_h(&h, &tiledp[(OPJ_SIZE_T)j * w]);
2131 }
2132 } else {
2133 OPJ_UINT32 num_jobs = (OPJ_UINT32)num_threads;
2134 OPJ_UINT32 step_j;
2135
2136 if (rh < num_jobs) {
2137 num_jobs = rh;
2138 }
2139 step_j = (rh / num_jobs);
2140
2141 for (j = 0; j < num_jobs; j++) {
2142 opj_dwt_decode_h_job_t* job;
2143
2144 job = (opj_dwt_decode_h_job_t*) opj_malloc(sizeof(opj_dwt_decode_h_job_t));
2145 if (!job) {
2146 /* It would be nice to fallback to single thread case, but */
2147 /* unfortunately some jobs may be launched and have modified */
2148 /* tiledp, so it is not practical to recover from that error */
2149 /* FIXME event manager error callback */
2150 opj_thread_pool_wait_completion(tp, 0);
2151 opj_aligned_free(h.mem);
2152 return OPJ_FALSE;
2153 }
2154 job->h = h;
2155 job->rw = rw;
2156 job->w = w;
2157 job->tiledp = tiledp;
2158 job->min_j = j * step_j;
2159 job->max_j = (j + 1U) * step_j; /* this can overflow */
2160 if (j == (num_jobs - 1U)) { /* this will take care of the overflow */
2161 job->max_j = rh;
2162 }
2163 job->h.mem = (OPJ_INT32*)opj_aligned_32_malloc(h_mem_size);
2164 if (!job->h.mem) {
2165 /* FIXME event manager error callback */
2166 opj_thread_pool_wait_completion(tp, 0);
2167 opj_free(job);
2168 opj_aligned_free(h.mem);
2169 return OPJ_FALSE;
2170 }
2171 opj_thread_pool_submit_job(tp, opj_dwt_decode_h_func, job);
2172 }
2173 opj_thread_pool_wait_completion(tp, 0);
2174 }
2175
2176 v.dn = (OPJ_INT32)(rh - (OPJ_UINT32)v.sn);
2177 v.cas = tr->y0 % 2;
2178
2179 if (num_threads <= 1 || rw <= 1) {
2180 for (j = 0; j + PARALLEL_COLS_53 <= rw;
2181 j += PARALLEL_COLS_53) {
2182 opj_idwt53_v(&v, &tiledp[j], (OPJ_SIZE_T)w, PARALLEL_COLS_53);
2183 }
2184 if (j < rw) {
2185 opj_idwt53_v(&v, &tiledp[j], (OPJ_SIZE_T)w, (OPJ_INT32)(rw - j));
2186 }
2187 } else {
2188 OPJ_UINT32 num_jobs = (OPJ_UINT32)num_threads;
2189 OPJ_UINT32 step_j;
2190
2191 if (rw < num_jobs) {
2192 num_jobs = rw;
2193 }
2194 step_j = (rw / num_jobs);
2195
2196 for (j = 0; j < num_jobs; j++) {
2197 opj_dwt_decode_v_job_t* job;
2198
2199 job = (opj_dwt_decode_v_job_t*) opj_malloc(sizeof(opj_dwt_decode_v_job_t));
2200 if (!job) {
2201 /* It would be nice to fallback to single thread case, but */
2202 /* unfortunately some jobs may be launched and have modified */
2203 /* tiledp, so it is not practical to recover from that error */
2204 /* FIXME event manager error callback */
2205 opj_thread_pool_wait_completion(tp, 0);
2206 opj_aligned_free(v.mem);
2207 return OPJ_FALSE;
2208 }
2209 job->v = v;
2210 job->rh = rh;
2211 job->w = w;
2212 job->tiledp = tiledp;
2213 job->min_j = j * step_j;
2214 job->max_j = (j + 1U) * step_j; /* this can overflow */
2215 if (j == (num_jobs - 1U)) { /* this will take care of the overflow */
2216 job->max_j = rw;
2217 }
2218 job->v.mem = (OPJ_INT32*)opj_aligned_32_malloc(h_mem_size);
2219 if (!job->v.mem) {
2220 /* FIXME event manager error callback */
2221 opj_thread_pool_wait_completion(tp, 0);
2222 opj_free(job);
2223 opj_aligned_free(v.mem);
2224 return OPJ_FALSE;
2225 }
2226 opj_thread_pool_submit_job(tp, opj_dwt_decode_v_func, job);
2227 }
2228 opj_thread_pool_wait_completion(tp, 0);
2229 }
2230 }
2231 opj_aligned_free(h.mem);
2232 return OPJ_TRUE;
2233 }
2234
opj_dwt_interleave_partial_h(OPJ_INT32 * dest,OPJ_INT32 cas,opj_sparse_array_int32_t * sa,OPJ_UINT32 sa_line,OPJ_UINT32 sn,OPJ_UINT32 win_l_x0,OPJ_UINT32 win_l_x1,OPJ_UINT32 win_h_x0,OPJ_UINT32 win_h_x1)2235 static void opj_dwt_interleave_partial_h(OPJ_INT32 *dest,
2236 OPJ_INT32 cas,
2237 opj_sparse_array_int32_t* sa,
2238 OPJ_UINT32 sa_line,
2239 OPJ_UINT32 sn,
2240 OPJ_UINT32 win_l_x0,
2241 OPJ_UINT32 win_l_x1,
2242 OPJ_UINT32 win_h_x0,
2243 OPJ_UINT32 win_h_x1)
2244 {
2245 OPJ_BOOL ret;
2246 ret = opj_sparse_array_int32_read(sa,
2247 win_l_x0, sa_line,
2248 win_l_x1, sa_line + 1,
2249 dest + cas + 2 * win_l_x0,
2250 2, 0, OPJ_TRUE);
2251 assert(ret);
2252 ret = opj_sparse_array_int32_read(sa,
2253 sn + win_h_x0, sa_line,
2254 sn + win_h_x1, sa_line + 1,
2255 dest + 1 - cas + 2 * win_h_x0,
2256 2, 0, OPJ_TRUE);
2257 assert(ret);
2258 OPJ_UNUSED(ret);
2259 }
2260
2261
opj_dwt_interleave_partial_v(OPJ_INT32 * dest,OPJ_INT32 cas,opj_sparse_array_int32_t * sa,OPJ_UINT32 sa_col,OPJ_UINT32 nb_cols,OPJ_UINT32 sn,OPJ_UINT32 win_l_y0,OPJ_UINT32 win_l_y1,OPJ_UINT32 win_h_y0,OPJ_UINT32 win_h_y1)2262 static void opj_dwt_interleave_partial_v(OPJ_INT32 *dest,
2263 OPJ_INT32 cas,
2264 opj_sparse_array_int32_t* sa,
2265 OPJ_UINT32 sa_col,
2266 OPJ_UINT32 nb_cols,
2267 OPJ_UINT32 sn,
2268 OPJ_UINT32 win_l_y0,
2269 OPJ_UINT32 win_l_y1,
2270 OPJ_UINT32 win_h_y0,
2271 OPJ_UINT32 win_h_y1)
2272 {
2273 OPJ_BOOL ret;
2274 ret = opj_sparse_array_int32_read(sa,
2275 sa_col, win_l_y0,
2276 sa_col + nb_cols, win_l_y1,
2277 dest + cas * 4 + 2 * 4 * win_l_y0,
2278 1, 2 * 4, OPJ_TRUE);
2279 assert(ret);
2280 ret = opj_sparse_array_int32_read(sa,
2281 sa_col, sn + win_h_y0,
2282 sa_col + nb_cols, sn + win_h_y1,
2283 dest + (1 - cas) * 4 + 2 * 4 * win_h_y0,
2284 1, 2 * 4, OPJ_TRUE);
2285 assert(ret);
2286 OPJ_UNUSED(ret);
2287 }
2288
opj_dwt_decode_partial_1(OPJ_INT32 * a,OPJ_SIZE_T a_count,OPJ_INT32 dn,OPJ_INT32 sn,OPJ_INT32 cas,OPJ_INT32 win_l_x0,OPJ_INT32 win_l_x1,OPJ_INT32 win_h_x0,OPJ_INT32 win_h_x1)2289 static void opj_dwt_decode_partial_1(OPJ_INT32 *a, OPJ_SIZE_T a_count,
2290 OPJ_INT32 dn, OPJ_INT32 sn,
2291 OPJ_INT32 cas,
2292 OPJ_INT32 win_l_x0,
2293 OPJ_INT32 win_l_x1,
2294 OPJ_INT32 win_h_x0,
2295 OPJ_INT32 win_h_x1)
2296 {
2297 OPJ_INT32 i;
2298
2299 if (!cas) {
2300 if ((dn > 0) || (sn > 1)) { /* NEW : CASE ONE ELEMENT */
2301
2302 /* Naive version is :
2303 for (i = win_l_x0; i < i_max; i++) {
2304 OPJ_S(i) -= (OPJ_D_(i - 1) + OPJ_D_(i) + 2) >> 2;
2305 }
2306 for (i = win_h_x0; i < win_h_x1; i++) {
2307 OPJ_D(i) += (OPJ_S_(i) + OPJ_S_(i + 1)) >> 1;
2308 }
2309 but the compiler doesn't manage to unroll it to avoid bound
2310 checking in OPJ_S_ and OPJ_D_ macros
2311 */
2312
2313 i = win_l_x0;
2314 if (i < win_l_x1) {
2315 OPJ_INT32 i_max;
2316
2317 /* Left-most case */
2318 OPJ_S(i) -= (OPJ_D_(i - 1) + OPJ_D_(i) + 2) >> 2;
2319 i ++;
2320
2321 i_max = win_l_x1;
2322 if (i_max > dn) {
2323 i_max = dn;
2324 }
2325 for (; i < i_max; i++) {
2326 /* No bound checking */
2327 OPJ_S(i) -= (OPJ_D(i - 1) + OPJ_D(i) + 2) >> 2;
2328 }
2329 for (; i < win_l_x1; i++) {
2330 /* Right-most case */
2331 OPJ_S(i) -= (OPJ_D_(i - 1) + OPJ_D_(i) + 2) >> 2;
2332 }
2333 }
2334
2335 i = win_h_x0;
2336 if (i < win_h_x1) {
2337 OPJ_INT32 i_max = win_h_x1;
2338 if (i_max >= sn) {
2339 i_max = sn - 1;
2340 }
2341 for (; i < i_max; i++) {
2342 /* No bound checking */
2343 OPJ_D(i) += (OPJ_S(i) + OPJ_S(i + 1)) >> 1;
2344 }
2345 for (; i < win_h_x1; i++) {
2346 /* Right-most case */
2347 OPJ_D(i) += (OPJ_S_(i) + OPJ_S_(i + 1)) >> 1;
2348 }
2349 }
2350 }
2351 } else {
2352 if (!sn && dn == 1) { /* NEW : CASE ONE ELEMENT */
2353 OPJ_S(0) /= 2;
2354 } else {
2355 for (i = win_l_x0; i < win_l_x1; i++) {
2356 OPJ_D(i) = opj_int_sub_no_overflow(OPJ_D(i),
2357 opj_int_add_no_overflow(opj_int_add_no_overflow(OPJ_SS_(i), OPJ_SS_(i + 1)),
2358 2) >> 2);
2359 }
2360 for (i = win_h_x0; i < win_h_x1; i++) {
2361 OPJ_S(i) = opj_int_add_no_overflow(OPJ_S(i),
2362 opj_int_add_no_overflow(OPJ_DD_(i), OPJ_DD_(i - 1)) >> 1);
2363 }
2364 }
2365 }
2366 }
2367
2368 #define OPJ_S_off(i,off) a[(OPJ_UINT32)(i)*2*4+off]
2369 #define OPJ_D_off(i,off) a[(1+(OPJ_UINT32)(i)*2)*4+off]
2370 #define OPJ_S__off(i,off) ((i)<0?OPJ_S_off(0,off):((i)>=sn?OPJ_S_off(sn-1,off):OPJ_S_off(i,off)))
2371 #define OPJ_D__off(i,off) ((i)<0?OPJ_D_off(0,off):((i)>=dn?OPJ_D_off(dn-1,off):OPJ_D_off(i,off)))
2372 #define OPJ_SS__off(i,off) ((i)<0?OPJ_S_off(0,off):((i)>=dn?OPJ_S_off(dn-1,off):OPJ_S_off(i,off)))
2373 #define OPJ_DD__off(i,off) ((i)<0?OPJ_D_off(0,off):((i)>=sn?OPJ_D_off(sn-1,off):OPJ_D_off(i,off)))
2374
opj_dwt_decode_partial_1_parallel(OPJ_INT32 * a,OPJ_UINT32 nb_cols,OPJ_INT32 dn,OPJ_INT32 sn,OPJ_INT32 cas,OPJ_INT32 win_l_x0,OPJ_INT32 win_l_x1,OPJ_INT32 win_h_x0,OPJ_INT32 win_h_x1)2375 static void opj_dwt_decode_partial_1_parallel(OPJ_INT32 *a,
2376 OPJ_UINT32 nb_cols,
2377 OPJ_INT32 dn, OPJ_INT32 sn,
2378 OPJ_INT32 cas,
2379 OPJ_INT32 win_l_x0,
2380 OPJ_INT32 win_l_x1,
2381 OPJ_INT32 win_h_x0,
2382 OPJ_INT32 win_h_x1)
2383 {
2384 OPJ_INT32 i;
2385 OPJ_UINT32 off;
2386
2387 (void)nb_cols;
2388
2389 if (!cas) {
2390 if ((dn > 0) || (sn > 1)) { /* NEW : CASE ONE ELEMENT */
2391
2392 /* Naive version is :
2393 for (i = win_l_x0; i < i_max; i++) {
2394 OPJ_S(i) -= (OPJ_D_(i - 1) + OPJ_D_(i) + 2) >> 2;
2395 }
2396 for (i = win_h_x0; i < win_h_x1; i++) {
2397 OPJ_D(i) += (OPJ_S_(i) + OPJ_S_(i + 1)) >> 1;
2398 }
2399 but the compiler doesn't manage to unroll it to avoid bound
2400 checking in OPJ_S_ and OPJ_D_ macros
2401 */
2402
2403 i = win_l_x0;
2404 if (i < win_l_x1) {
2405 OPJ_INT32 i_max;
2406
2407 /* Left-most case */
2408 for (off = 0; off < 4; off++) {
2409 OPJ_S_off(i, off) -= (OPJ_D__off(i - 1, off) + OPJ_D__off(i, off) + 2) >> 2;
2410 }
2411 i ++;
2412
2413 i_max = win_l_x1;
2414 if (i_max > dn) {
2415 i_max = dn;
2416 }
2417
2418 #ifdef __SSE2__
2419 if (i + 1 < i_max) {
2420 const __m128i two = _mm_set1_epi32(2);
2421 __m128i Dm1 = _mm_load_si128((__m128i * const)(a + 4 + (i - 1) * 8));
2422 for (; i + 1 < i_max; i += 2) {
2423 /* No bound checking */
2424 __m128i S = _mm_load_si128((__m128i * const)(a + i * 8));
2425 __m128i D = _mm_load_si128((__m128i * const)(a + 4 + i * 8));
2426 __m128i S1 = _mm_load_si128((__m128i * const)(a + (i + 1) * 8));
2427 __m128i D1 = _mm_load_si128((__m128i * const)(a + 4 + (i + 1) * 8));
2428 S = _mm_sub_epi32(S,
2429 _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(Dm1, D), two), 2));
2430 S1 = _mm_sub_epi32(S1,
2431 _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(D, D1), two), 2));
2432 _mm_store_si128((__m128i*)(a + i * 8), S);
2433 _mm_store_si128((__m128i*)(a + (i + 1) * 8), S1);
2434 Dm1 = D1;
2435 }
2436 }
2437 #endif
2438
2439 for (; i < i_max; i++) {
2440 /* No bound checking */
2441 for (off = 0; off < 4; off++) {
2442 OPJ_S_off(i, off) -= (OPJ_D_off(i - 1, off) + OPJ_D_off(i, off) + 2) >> 2;
2443 }
2444 }
2445 for (; i < win_l_x1; i++) {
2446 /* Right-most case */
2447 for (off = 0; off < 4; off++) {
2448 OPJ_S_off(i, off) -= (OPJ_D__off(i - 1, off) + OPJ_D__off(i, off) + 2) >> 2;
2449 }
2450 }
2451 }
2452
2453 i = win_h_x0;
2454 if (i < win_h_x1) {
2455 OPJ_INT32 i_max = win_h_x1;
2456 if (i_max >= sn) {
2457 i_max = sn - 1;
2458 }
2459
2460 #ifdef __SSE2__
2461 if (i + 1 < i_max) {
2462 __m128i S = _mm_load_si128((__m128i * const)(a + i * 8));
2463 for (; i + 1 < i_max; i += 2) {
2464 /* No bound checking */
2465 __m128i D = _mm_load_si128((__m128i * const)(a + 4 + i * 8));
2466 __m128i S1 = _mm_load_si128((__m128i * const)(a + (i + 1) * 8));
2467 __m128i D1 = _mm_load_si128((__m128i * const)(a + 4 + (i + 1) * 8));
2468 __m128i S2 = _mm_load_si128((__m128i * const)(a + (i + 2) * 8));
2469 D = _mm_add_epi32(D, _mm_srai_epi32(_mm_add_epi32(S, S1), 1));
2470 D1 = _mm_add_epi32(D1, _mm_srai_epi32(_mm_add_epi32(S1, S2), 1));
2471 _mm_store_si128((__m128i*)(a + 4 + i * 8), D);
2472 _mm_store_si128((__m128i*)(a + 4 + (i + 1) * 8), D1);
2473 S = S2;
2474 }
2475 }
2476 #endif
2477
2478 for (; i < i_max; i++) {
2479 /* No bound checking */
2480 for (off = 0; off < 4; off++) {
2481 OPJ_D_off(i, off) += (OPJ_S_off(i, off) + OPJ_S_off(i + 1, off)) >> 1;
2482 }
2483 }
2484 for (; i < win_h_x1; i++) {
2485 /* Right-most case */
2486 for (off = 0; off < 4; off++) {
2487 OPJ_D_off(i, off) += (OPJ_S__off(i, off) + OPJ_S__off(i + 1, off)) >> 1;
2488 }
2489 }
2490 }
2491 }
2492 } else {
2493 if (!sn && dn == 1) { /* NEW : CASE ONE ELEMENT */
2494 for (off = 0; off < 4; off++) {
2495 OPJ_S_off(0, off) /= 2;
2496 }
2497 } else {
2498 for (i = win_l_x0; i < win_l_x1; i++) {
2499 for (off = 0; off < 4; off++) {
2500 OPJ_D_off(i, off) = opj_int_sub_no_overflow(
2501 OPJ_D_off(i, off),
2502 opj_int_add_no_overflow(
2503 opj_int_add_no_overflow(OPJ_SS__off(i, off), OPJ_SS__off(i + 1, off)), 2) >> 2);
2504 }
2505 }
2506 for (i = win_h_x0; i < win_h_x1; i++) {
2507 for (off = 0; off < 4; off++) {
2508 OPJ_S_off(i, off) = opj_int_add_no_overflow(
2509 OPJ_S_off(i, off),
2510 opj_int_add_no_overflow(OPJ_DD__off(i, off), OPJ_DD__off(i - 1, off)) >> 1);
2511 }
2512 }
2513 }
2514 }
2515 }
2516
opj_dwt_get_band_coordinates(opj_tcd_tilecomp_t * tilec,OPJ_UINT32 resno,OPJ_UINT32 bandno,OPJ_UINT32 tcx0,OPJ_UINT32 tcy0,OPJ_UINT32 tcx1,OPJ_UINT32 tcy1,OPJ_UINT32 * tbx0,OPJ_UINT32 * tby0,OPJ_UINT32 * tbx1,OPJ_UINT32 * tby1)2517 static void opj_dwt_get_band_coordinates(opj_tcd_tilecomp_t* tilec,
2518 OPJ_UINT32 resno,
2519 OPJ_UINT32 bandno,
2520 OPJ_UINT32 tcx0,
2521 OPJ_UINT32 tcy0,
2522 OPJ_UINT32 tcx1,
2523 OPJ_UINT32 tcy1,
2524 OPJ_UINT32* tbx0,
2525 OPJ_UINT32* tby0,
2526 OPJ_UINT32* tbx1,
2527 OPJ_UINT32* tby1)
2528 {
2529 /* Compute number of decomposition for this band. See table F-1 */
2530 OPJ_UINT32 nb = (resno == 0) ?
2531 tilec->numresolutions - 1 :
2532 tilec->numresolutions - resno;
2533 /* Map above tile-based coordinates to sub-band-based coordinates per */
2534 /* equation B-15 of the standard */
2535 OPJ_UINT32 x0b = bandno & 1;
2536 OPJ_UINT32 y0b = bandno >> 1;
2537 if (tbx0) {
2538 *tbx0 = (nb == 0) ? tcx0 :
2539 (tcx0 <= (1U << (nb - 1)) * x0b) ? 0 :
2540 opj_uint_ceildivpow2(tcx0 - (1U << (nb - 1)) * x0b, nb);
2541 }
2542 if (tby0) {
2543 *tby0 = (nb == 0) ? tcy0 :
2544 (tcy0 <= (1U << (nb - 1)) * y0b) ? 0 :
2545 opj_uint_ceildivpow2(tcy0 - (1U << (nb - 1)) * y0b, nb);
2546 }
2547 if (tbx1) {
2548 *tbx1 = (nb == 0) ? tcx1 :
2549 (tcx1 <= (1U << (nb - 1)) * x0b) ? 0 :
2550 opj_uint_ceildivpow2(tcx1 - (1U << (nb - 1)) * x0b, nb);
2551 }
2552 if (tby1) {
2553 *tby1 = (nb == 0) ? tcy1 :
2554 (tcy1 <= (1U << (nb - 1)) * y0b) ? 0 :
2555 opj_uint_ceildivpow2(tcy1 - (1U << (nb - 1)) * y0b, nb);
2556 }
2557 }
2558
opj_dwt_segment_grow(OPJ_UINT32 filter_width,OPJ_UINT32 max_size,OPJ_UINT32 * start,OPJ_UINT32 * end)2559 static void opj_dwt_segment_grow(OPJ_UINT32 filter_width,
2560 OPJ_UINT32 max_size,
2561 OPJ_UINT32* start,
2562 OPJ_UINT32* end)
2563 {
2564 *start = opj_uint_subs(*start, filter_width);
2565 *end = opj_uint_adds(*end, filter_width);
2566 *end = opj_uint_min(*end, max_size);
2567 }
2568
2569
opj_dwt_init_sparse_array(opj_tcd_tilecomp_t * tilec,OPJ_UINT32 numres)2570 static opj_sparse_array_int32_t* opj_dwt_init_sparse_array(
2571 opj_tcd_tilecomp_t* tilec,
2572 OPJ_UINT32 numres)
2573 {
2574 opj_tcd_resolution_t* tr_max = &(tilec->resolutions[numres - 1]);
2575 OPJ_UINT32 w = (OPJ_UINT32)(tr_max->x1 - tr_max->x0);
2576 OPJ_UINT32 h = (OPJ_UINT32)(tr_max->y1 - tr_max->y0);
2577 OPJ_UINT32 resno, bandno, precno, cblkno;
2578 opj_sparse_array_int32_t* sa = opj_sparse_array_int32_create(
2579 w, h, opj_uint_min(w, 64), opj_uint_min(h, 64));
2580 if (sa == NULL) {
2581 return NULL;
2582 }
2583
2584 for (resno = 0; resno < numres; ++resno) {
2585 opj_tcd_resolution_t* res = &tilec->resolutions[resno];
2586
2587 for (bandno = 0; bandno < res->numbands; ++bandno) {
2588 opj_tcd_band_t* band = &res->bands[bandno];
2589
2590 for (precno = 0; precno < res->pw * res->ph; ++precno) {
2591 opj_tcd_precinct_t* precinct = &band->precincts[precno];
2592 for (cblkno = 0; cblkno < precinct->cw * precinct->ch; ++cblkno) {
2593 opj_tcd_cblk_dec_t* cblk = &precinct->cblks.dec[cblkno];
2594 if (cblk->decoded_data != NULL) {
2595 OPJ_UINT32 x = (OPJ_UINT32)(cblk->x0 - band->x0);
2596 OPJ_UINT32 y = (OPJ_UINT32)(cblk->y0 - band->y0);
2597 OPJ_UINT32 cblk_w = (OPJ_UINT32)(cblk->x1 - cblk->x0);
2598 OPJ_UINT32 cblk_h = (OPJ_UINT32)(cblk->y1 - cblk->y0);
2599
2600 if (band->bandno & 1) {
2601 opj_tcd_resolution_t* pres = &tilec->resolutions[resno - 1];
2602 x += (OPJ_UINT32)(pres->x1 - pres->x0);
2603 }
2604 if (band->bandno & 2) {
2605 opj_tcd_resolution_t* pres = &tilec->resolutions[resno - 1];
2606 y += (OPJ_UINT32)(pres->y1 - pres->y0);
2607 }
2608
2609 if (!opj_sparse_array_int32_write(sa, x, y,
2610 x + cblk_w, y + cblk_h,
2611 cblk->decoded_data,
2612 1, cblk_w, OPJ_TRUE)) {
2613 opj_sparse_array_int32_free(sa);
2614 return NULL;
2615 }
2616 }
2617 }
2618 }
2619 }
2620 }
2621
2622 return sa;
2623 }
2624
2625
opj_dwt_decode_partial_tile(opj_tcd_tilecomp_t * tilec,OPJ_UINT32 numres)2626 static OPJ_BOOL opj_dwt_decode_partial_tile(
2627 opj_tcd_tilecomp_t* tilec,
2628 OPJ_UINT32 numres)
2629 {
2630 opj_sparse_array_int32_t* sa;
2631 opj_dwt_t h;
2632 opj_dwt_t v;
2633 OPJ_UINT32 resno;
2634 /* This value matches the maximum left/right extension given in tables */
2635 /* F.2 and F.3 of the standard. */
2636 const OPJ_UINT32 filter_width = 2U;
2637
2638 opj_tcd_resolution_t* tr = tilec->resolutions;
2639 opj_tcd_resolution_t* tr_max = &(tilec->resolutions[numres - 1]);
2640
2641 OPJ_UINT32 rw = (OPJ_UINT32)(tr->x1 -
2642 tr->x0); /* width of the resolution level computed */
2643 OPJ_UINT32 rh = (OPJ_UINT32)(tr->y1 -
2644 tr->y0); /* height of the resolution level computed */
2645
2646 OPJ_SIZE_T h_mem_size;
2647
2648 /* Compute the intersection of the area of interest, expressed in tile coordinates */
2649 /* with the tile coordinates */
2650 OPJ_UINT32 win_tcx0 = tilec->win_x0;
2651 OPJ_UINT32 win_tcy0 = tilec->win_y0;
2652 OPJ_UINT32 win_tcx1 = tilec->win_x1;
2653 OPJ_UINT32 win_tcy1 = tilec->win_y1;
2654
2655 if (tr_max->x0 == tr_max->x1 || tr_max->y0 == tr_max->y1) {
2656 return OPJ_TRUE;
2657 }
2658
2659 sa = opj_dwt_init_sparse_array(tilec, numres);
2660 if (sa == NULL) {
2661 return OPJ_FALSE;
2662 }
2663
2664 if (numres == 1U) {
2665 OPJ_BOOL ret = opj_sparse_array_int32_read(sa,
2666 tr_max->win_x0 - (OPJ_UINT32)tr_max->x0,
2667 tr_max->win_y0 - (OPJ_UINT32)tr_max->y0,
2668 tr_max->win_x1 - (OPJ_UINT32)tr_max->x0,
2669 tr_max->win_y1 - (OPJ_UINT32)tr_max->y0,
2670 tilec->data_win,
2671 1, tr_max->win_x1 - tr_max->win_x0,
2672 OPJ_TRUE);
2673 assert(ret);
2674 OPJ_UNUSED(ret);
2675 opj_sparse_array_int32_free(sa);
2676 return OPJ_TRUE;
2677 }
2678 h.mem_count = opj_dwt_max_resolution(tr, numres);
2679 /* overflow check */
2680 /* in vertical pass, we process 4 columns at a time */
2681 if (h.mem_count > (SIZE_MAX / (4 * sizeof(OPJ_INT32)))) {
2682 /* FIXME event manager error callback */
2683 opj_sparse_array_int32_free(sa);
2684 return OPJ_FALSE;
2685 }
2686
2687 h_mem_size = h.mem_count * 4 * sizeof(OPJ_INT32);
2688 h.mem = (OPJ_INT32*)opj_aligned_32_malloc(h_mem_size);
2689 if (! h.mem) {
2690 /* FIXME event manager error callback */
2691 opj_sparse_array_int32_free(sa);
2692 return OPJ_FALSE;
2693 }
2694
2695 v.mem_count = h.mem_count;
2696 v.mem = h.mem;
2697
2698 for (resno = 1; resno < numres; resno ++) {
2699 OPJ_UINT32 i, j;
2700 /* Window of interest subband-based coordinates */
2701 OPJ_UINT32 win_ll_x0, win_ll_y0, win_ll_x1, win_ll_y1;
2702 OPJ_UINT32 win_hl_x0, win_hl_x1;
2703 OPJ_UINT32 win_lh_y0, win_lh_y1;
2704 /* Window of interest tile-resolution-based coordinates */
2705 OPJ_UINT32 win_tr_x0, win_tr_x1, win_tr_y0, win_tr_y1;
2706 /* Tile-resolution subband-based coordinates */
2707 OPJ_UINT32 tr_ll_x0, tr_ll_y0, tr_hl_x0, tr_lh_y0;
2708
2709 ++tr;
2710
2711 h.sn = (OPJ_INT32)rw;
2712 v.sn = (OPJ_INT32)rh;
2713
2714 rw = (OPJ_UINT32)(tr->x1 - tr->x0);
2715 rh = (OPJ_UINT32)(tr->y1 - tr->y0);
2716
2717 h.dn = (OPJ_INT32)(rw - (OPJ_UINT32)h.sn);
2718 h.cas = tr->x0 % 2;
2719
2720 v.dn = (OPJ_INT32)(rh - (OPJ_UINT32)v.sn);
2721 v.cas = tr->y0 % 2;
2722
2723 /* Get the subband coordinates for the window of interest */
2724 /* LL band */
2725 opj_dwt_get_band_coordinates(tilec, resno, 0,
2726 win_tcx0, win_tcy0, win_tcx1, win_tcy1,
2727 &win_ll_x0, &win_ll_y0,
2728 &win_ll_x1, &win_ll_y1);
2729
2730 /* HL band */
2731 opj_dwt_get_band_coordinates(tilec, resno, 1,
2732 win_tcx0, win_tcy0, win_tcx1, win_tcy1,
2733 &win_hl_x0, NULL, &win_hl_x1, NULL);
2734
2735 /* LH band */
2736 opj_dwt_get_band_coordinates(tilec, resno, 2,
2737 win_tcx0, win_tcy0, win_tcx1, win_tcy1,
2738 NULL, &win_lh_y0, NULL, &win_lh_y1);
2739
2740 /* Beware: band index for non-LL0 resolution are 0=HL, 1=LH and 2=HH */
2741 tr_ll_x0 = (OPJ_UINT32)tr->bands[1].x0;
2742 tr_ll_y0 = (OPJ_UINT32)tr->bands[0].y0;
2743 tr_hl_x0 = (OPJ_UINT32)tr->bands[0].x0;
2744 tr_lh_y0 = (OPJ_UINT32)tr->bands[1].y0;
2745
2746 /* Subtract the origin of the bands for this tile, to the subwindow */
2747 /* of interest band coordinates, so as to get them relative to the */
2748 /* tile */
2749 win_ll_x0 = opj_uint_subs(win_ll_x0, tr_ll_x0);
2750 win_ll_y0 = opj_uint_subs(win_ll_y0, tr_ll_y0);
2751 win_ll_x1 = opj_uint_subs(win_ll_x1, tr_ll_x0);
2752 win_ll_y1 = opj_uint_subs(win_ll_y1, tr_ll_y0);
2753 win_hl_x0 = opj_uint_subs(win_hl_x0, tr_hl_x0);
2754 win_hl_x1 = opj_uint_subs(win_hl_x1, tr_hl_x0);
2755 win_lh_y0 = opj_uint_subs(win_lh_y0, tr_lh_y0);
2756 win_lh_y1 = opj_uint_subs(win_lh_y1, tr_lh_y0);
2757
2758 opj_dwt_segment_grow(filter_width, (OPJ_UINT32)h.sn, &win_ll_x0, &win_ll_x1);
2759 opj_dwt_segment_grow(filter_width, (OPJ_UINT32)h.dn, &win_hl_x0, &win_hl_x1);
2760
2761 opj_dwt_segment_grow(filter_width, (OPJ_UINT32)v.sn, &win_ll_y0, &win_ll_y1);
2762 opj_dwt_segment_grow(filter_width, (OPJ_UINT32)v.dn, &win_lh_y0, &win_lh_y1);
2763
2764 /* Compute the tile-resolution-based coordinates for the window of interest */
2765 if (h.cas == 0) {
2766 win_tr_x0 = opj_uint_min(2 * win_ll_x0, 2 * win_hl_x0 + 1);
2767 win_tr_x1 = opj_uint_min(opj_uint_max(2 * win_ll_x1, 2 * win_hl_x1 + 1), rw);
2768 } else {
2769 win_tr_x0 = opj_uint_min(2 * win_hl_x0, 2 * win_ll_x0 + 1);
2770 win_tr_x1 = opj_uint_min(opj_uint_max(2 * win_hl_x1, 2 * win_ll_x1 + 1), rw);
2771 }
2772
2773 if (v.cas == 0) {
2774 win_tr_y0 = opj_uint_min(2 * win_ll_y0, 2 * win_lh_y0 + 1);
2775 win_tr_y1 = opj_uint_min(opj_uint_max(2 * win_ll_y1, 2 * win_lh_y1 + 1), rh);
2776 } else {
2777 win_tr_y0 = opj_uint_min(2 * win_lh_y0, 2 * win_ll_y0 + 1);
2778 win_tr_y1 = opj_uint_min(opj_uint_max(2 * win_lh_y1, 2 * win_ll_y1 + 1), rh);
2779 }
2780
2781 for (j = 0; j < rh; ++j) {
2782 if ((j >= win_ll_y0 && j < win_ll_y1) ||
2783 (j >= win_lh_y0 + (OPJ_UINT32)v.sn && j < win_lh_y1 + (OPJ_UINT32)v.sn)) {
2784
2785 /* Avoids dwt.c:1584:44 (in opj_dwt_decode_partial_1): runtime error: */
2786 /* signed integer overflow: -1094795586 + -1094795586 cannot be represented in type 'int' */
2787 /* on opj_decompress -i ../../openjpeg/MAPA.jp2 -o out.tif -d 0,0,256,256 */
2788 /* This is less extreme than memsetting the whole buffer to 0 */
2789 /* although we could potentially do better with better handling of edge conditions */
2790 if (win_tr_x1 >= 1 && win_tr_x1 < rw) {
2791 h.mem[win_tr_x1 - 1] = 0;
2792 }
2793 if (win_tr_x1 < rw) {
2794 h.mem[win_tr_x1] = 0;
2795 }
2796
2797 opj_dwt_interleave_partial_h(h.mem,
2798 h.cas,
2799 sa,
2800 j,
2801 (OPJ_UINT32)h.sn,
2802 win_ll_x0,
2803 win_ll_x1,
2804 win_hl_x0,
2805 win_hl_x1);
2806 opj_dwt_decode_partial_1(h.mem, h.mem_count, h.dn, h.sn, h.cas,
2807 (OPJ_INT32)win_ll_x0,
2808 (OPJ_INT32)win_ll_x1,
2809 (OPJ_INT32)win_hl_x0,
2810 (OPJ_INT32)win_hl_x1);
2811 if (!opj_sparse_array_int32_write(sa,
2812 win_tr_x0, j,
2813 win_tr_x1, j + 1,
2814 h.mem + win_tr_x0,
2815 1, 0, OPJ_TRUE)) {
2816 /* FIXME event manager error callback */
2817 opj_sparse_array_int32_free(sa);
2818 opj_aligned_free(h.mem);
2819 return OPJ_FALSE;
2820 }
2821 }
2822 }
2823
2824 for (i = win_tr_x0; i < win_tr_x1;) {
2825 OPJ_UINT32 nb_cols = opj_uint_min(4U, win_tr_x1 - i);
2826 opj_dwt_interleave_partial_v(v.mem,
2827 v.cas,
2828 sa,
2829 i,
2830 nb_cols,
2831 (OPJ_UINT32)v.sn,
2832 win_ll_y0,
2833 win_ll_y1,
2834 win_lh_y0,
2835 win_lh_y1);
2836 opj_dwt_decode_partial_1_parallel(v.mem, nb_cols, v.dn, v.sn, v.cas,
2837 (OPJ_INT32)win_ll_y0,
2838 (OPJ_INT32)win_ll_y1,
2839 (OPJ_INT32)win_lh_y0,
2840 (OPJ_INT32)win_lh_y1);
2841 if (!opj_sparse_array_int32_write(sa,
2842 i, win_tr_y0,
2843 i + nb_cols, win_tr_y1,
2844 v.mem + 4 * win_tr_y0,
2845 1, 4, OPJ_TRUE)) {
2846 /* FIXME event manager error callback */
2847 opj_sparse_array_int32_free(sa);
2848 opj_aligned_free(h.mem);
2849 return OPJ_FALSE;
2850 }
2851
2852 i += nb_cols;
2853 }
2854 }
2855 opj_aligned_free(h.mem);
2856
2857 {
2858 OPJ_BOOL ret = opj_sparse_array_int32_read(sa,
2859 tr_max->win_x0 - (OPJ_UINT32)tr_max->x0,
2860 tr_max->win_y0 - (OPJ_UINT32)tr_max->y0,
2861 tr_max->win_x1 - (OPJ_UINT32)tr_max->x0,
2862 tr_max->win_y1 - (OPJ_UINT32)tr_max->y0,
2863 tilec->data_win,
2864 1, tr_max->win_x1 - tr_max->win_x0,
2865 OPJ_TRUE);
2866 assert(ret);
2867 OPJ_UNUSED(ret);
2868 }
2869 opj_sparse_array_int32_free(sa);
2870 return OPJ_TRUE;
2871 }
2872
opj_v8dwt_interleave_h(opj_v8dwt_t * OPJ_RESTRICT dwt,OPJ_FLOAT32 * OPJ_RESTRICT a,OPJ_UINT32 width,OPJ_UINT32 remaining_height)2873 static void opj_v8dwt_interleave_h(opj_v8dwt_t* OPJ_RESTRICT dwt,
2874 OPJ_FLOAT32* OPJ_RESTRICT a,
2875 OPJ_UINT32 width,
2876 OPJ_UINT32 remaining_height)
2877 {
2878 OPJ_FLOAT32* OPJ_RESTRICT bi = (OPJ_FLOAT32*)(dwt->wavelet + dwt->cas);
2879 OPJ_UINT32 i, k;
2880 OPJ_UINT32 x0 = dwt->win_l_x0;
2881 OPJ_UINT32 x1 = dwt->win_l_x1;
2882
2883 for (k = 0; k < 2; ++k) {
2884 if (remaining_height >= NB_ELTS_V8 && ((OPJ_SIZE_T) a & 0x0f) == 0 &&
2885 ((OPJ_SIZE_T) bi & 0x0f) == 0) {
2886 /* Fast code path */
2887 for (i = x0; i < x1; ++i) {
2888 OPJ_UINT32 j = i;
2889 OPJ_FLOAT32* OPJ_RESTRICT dst = bi + i * 2 * NB_ELTS_V8;
2890 dst[0] = a[j];
2891 j += width;
2892 dst[1] = a[j];
2893 j += width;
2894 dst[2] = a[j];
2895 j += width;
2896 dst[3] = a[j];
2897 j += width;
2898 dst[4] = a[j];
2899 j += width;
2900 dst[5] = a[j];
2901 j += width;
2902 dst[6] = a[j];
2903 j += width;
2904 dst[7] = a[j];
2905 }
2906 } else {
2907 /* Slow code path */
2908 for (i = x0; i < x1; ++i) {
2909 OPJ_UINT32 j = i;
2910 OPJ_FLOAT32* OPJ_RESTRICT dst = bi + i * 2 * NB_ELTS_V8;
2911 dst[0] = a[j];
2912 j += width;
2913 if (remaining_height == 1) {
2914 continue;
2915 }
2916 dst[1] = a[j];
2917 j += width;
2918 if (remaining_height == 2) {
2919 continue;
2920 }
2921 dst[2] = a[j];
2922 j += width;
2923 if (remaining_height == 3) {
2924 continue;
2925 }
2926 dst[3] = a[j];
2927 j += width;
2928 if (remaining_height == 4) {
2929 continue;
2930 }
2931 dst[4] = a[j];
2932 j += width;
2933 if (remaining_height == 5) {
2934 continue;
2935 }
2936 dst[5] = a[j];
2937 j += width;
2938 if (remaining_height == 6) {
2939 continue;
2940 }
2941 dst[6] = a[j];
2942 j += width;
2943 if (remaining_height == 7) {
2944 continue;
2945 }
2946 dst[7] = a[j];
2947 }
2948 }
2949
2950 bi = (OPJ_FLOAT32*)(dwt->wavelet + 1 - dwt->cas);
2951 a += dwt->sn;
2952 x0 = dwt->win_h_x0;
2953 x1 = dwt->win_h_x1;
2954 }
2955 }
2956
opj_v8dwt_interleave_partial_h(opj_v8dwt_t * dwt,opj_sparse_array_int32_t * sa,OPJ_UINT32 sa_line,OPJ_UINT32 remaining_height)2957 static void opj_v8dwt_interleave_partial_h(opj_v8dwt_t* dwt,
2958 opj_sparse_array_int32_t* sa,
2959 OPJ_UINT32 sa_line,
2960 OPJ_UINT32 remaining_height)
2961 {
2962 OPJ_UINT32 i;
2963 for (i = 0; i < remaining_height; i++) {
2964 OPJ_BOOL ret;
2965 ret = opj_sparse_array_int32_read(sa,
2966 dwt->win_l_x0, sa_line + i,
2967 dwt->win_l_x1, sa_line + i + 1,
2968 /* Nasty cast from float* to int32* */
2969 (OPJ_INT32*)(dwt->wavelet + dwt->cas + 2 * dwt->win_l_x0) + i,
2970 2 * NB_ELTS_V8, 0, OPJ_TRUE);
2971 assert(ret);
2972 ret = opj_sparse_array_int32_read(sa,
2973 (OPJ_UINT32)dwt->sn + dwt->win_h_x0, sa_line + i,
2974 (OPJ_UINT32)dwt->sn + dwt->win_h_x1, sa_line + i + 1,
2975 /* Nasty cast from float* to int32* */
2976 (OPJ_INT32*)(dwt->wavelet + 1 - dwt->cas + 2 * dwt->win_h_x0) + i,
2977 2 * NB_ELTS_V8, 0, OPJ_TRUE);
2978 assert(ret);
2979 OPJ_UNUSED(ret);
2980 }
2981 }
2982
opj_v8dwt_interleave_v(opj_v8dwt_t * OPJ_RESTRICT dwt,OPJ_FLOAT32 * OPJ_RESTRICT a,OPJ_UINT32 width,OPJ_UINT32 nb_elts_read)2983 static INLINE void opj_v8dwt_interleave_v(opj_v8dwt_t* OPJ_RESTRICT dwt,
2984 OPJ_FLOAT32* OPJ_RESTRICT a,
2985 OPJ_UINT32 width,
2986 OPJ_UINT32 nb_elts_read)
2987 {
2988 opj_v8_t* OPJ_RESTRICT bi = dwt->wavelet + dwt->cas;
2989 OPJ_UINT32 i;
2990
2991 for (i = dwt->win_l_x0; i < dwt->win_l_x1; ++i) {
2992 memcpy(&bi[i * 2], &a[i * (OPJ_SIZE_T)width],
2993 (OPJ_SIZE_T)nb_elts_read * sizeof(OPJ_FLOAT32));
2994 }
2995
2996 a += (OPJ_UINT32)dwt->sn * (OPJ_SIZE_T)width;
2997 bi = dwt->wavelet + 1 - dwt->cas;
2998
2999 for (i = dwt->win_h_x0; i < dwt->win_h_x1; ++i) {
3000 memcpy(&bi[i * 2], &a[i * (OPJ_SIZE_T)width],
3001 (OPJ_SIZE_T)nb_elts_read * sizeof(OPJ_FLOAT32));
3002 }
3003 }
3004
opj_v8dwt_interleave_partial_v(opj_v8dwt_t * OPJ_RESTRICT dwt,opj_sparse_array_int32_t * sa,OPJ_UINT32 sa_col,OPJ_UINT32 nb_elts_read)3005 static void opj_v8dwt_interleave_partial_v(opj_v8dwt_t* OPJ_RESTRICT dwt,
3006 opj_sparse_array_int32_t* sa,
3007 OPJ_UINT32 sa_col,
3008 OPJ_UINT32 nb_elts_read)
3009 {
3010 OPJ_BOOL ret;
3011 ret = opj_sparse_array_int32_read(sa,
3012 sa_col, dwt->win_l_x0,
3013 sa_col + nb_elts_read, dwt->win_l_x1,
3014 (OPJ_INT32*)(dwt->wavelet + dwt->cas + 2 * dwt->win_l_x0),
3015 1, 2 * NB_ELTS_V8, OPJ_TRUE);
3016 assert(ret);
3017 ret = opj_sparse_array_int32_read(sa,
3018 sa_col, (OPJ_UINT32)dwt->sn + dwt->win_h_x0,
3019 sa_col + nb_elts_read, (OPJ_UINT32)dwt->sn + dwt->win_h_x1,
3020 (OPJ_INT32*)(dwt->wavelet + 1 - dwt->cas + 2 * dwt->win_h_x0),
3021 1, 2 * NB_ELTS_V8, OPJ_TRUE);
3022 assert(ret);
3023 OPJ_UNUSED(ret);
3024 }
3025
3026 #ifdef __SSE__
3027
opj_v8dwt_decode_step1_sse(opj_v8_t * w,OPJ_UINT32 start,OPJ_UINT32 end,const __m128 c)3028 static void opj_v8dwt_decode_step1_sse(opj_v8_t* w,
3029 OPJ_UINT32 start,
3030 OPJ_UINT32 end,
3031 const __m128 c)
3032 {
3033 __m128* OPJ_RESTRICT vw = (__m128*) w;
3034 OPJ_UINT32 i = start;
3035 /* To be adapted if NB_ELTS_V8 changes */
3036 vw += 4 * start;
3037 /* Note: attempt at loop unrolling x2 doesn't help */
3038 for (; i < end; ++i, vw += 4) {
3039 vw[0] = _mm_mul_ps(vw[0], c);
3040 vw[1] = _mm_mul_ps(vw[1], c);
3041 }
3042 }
3043
opj_v8dwt_decode_step2_sse(opj_v8_t * l,opj_v8_t * w,OPJ_UINT32 start,OPJ_UINT32 end,OPJ_UINT32 m,__m128 c)3044 static void opj_v8dwt_decode_step2_sse(opj_v8_t* l, opj_v8_t* w,
3045 OPJ_UINT32 start,
3046 OPJ_UINT32 end,
3047 OPJ_UINT32 m,
3048 __m128 c)
3049 {
3050 __m128* OPJ_RESTRICT vl = (__m128*) l;
3051 __m128* OPJ_RESTRICT vw = (__m128*) w;
3052 /* To be adapted if NB_ELTS_V8 changes */
3053 OPJ_UINT32 i;
3054 OPJ_UINT32 imax = opj_uint_min(end, m);
3055 if (start == 0) {
3056 if (imax >= 1) {
3057 vw[-2] = _mm_add_ps(vw[-2], _mm_mul_ps(_mm_add_ps(vl[0], vw[0]), c));
3058 vw[-1] = _mm_add_ps(vw[-1], _mm_mul_ps(_mm_add_ps(vl[1], vw[1]), c));
3059 vw += 4;
3060 start = 1;
3061 }
3062 } else {
3063 vw += start * 4;
3064 }
3065
3066 i = start;
3067 /* Note: attempt at loop unrolling x2 doesn't help */
3068 for (; i < imax; ++i) {
3069 vw[-2] = _mm_add_ps(vw[-2], _mm_mul_ps(_mm_add_ps(vw[-4], vw[0]), c));
3070 vw[-1] = _mm_add_ps(vw[-1], _mm_mul_ps(_mm_add_ps(vw[-3], vw[1]), c));
3071 vw += 4;
3072 }
3073 if (m < end) {
3074 assert(m + 1 == end);
3075 c = _mm_add_ps(c, c);
3076 vw[-2] = _mm_add_ps(vw[-2], _mm_mul_ps(c, vw[-4]));
3077 vw[-1] = _mm_add_ps(vw[-1], _mm_mul_ps(c, vw[-3]));
3078 }
3079 }
3080
3081 #else
3082
opj_v8dwt_decode_step1(opj_v8_t * w,OPJ_UINT32 start,OPJ_UINT32 end,const OPJ_FLOAT32 c)3083 static void opj_v8dwt_decode_step1(opj_v8_t* w,
3084 OPJ_UINT32 start,
3085 OPJ_UINT32 end,
3086 const OPJ_FLOAT32 c)
3087 {
3088 OPJ_FLOAT32* OPJ_RESTRICT fw = (OPJ_FLOAT32*) w;
3089 OPJ_UINT32 i;
3090 /* To be adapted if NB_ELTS_V8 changes */
3091 for (i = start; i < end; ++i) {
3092 fw[i * 2 * 8 ] = fw[i * 2 * 8 ] * c;
3093 fw[i * 2 * 8 + 1] = fw[i * 2 * 8 + 1] * c;
3094 fw[i * 2 * 8 + 2] = fw[i * 2 * 8 + 2] * c;
3095 fw[i * 2 * 8 + 3] = fw[i * 2 * 8 + 3] * c;
3096 fw[i * 2 * 8 + 4] = fw[i * 2 * 8 + 4] * c;
3097 fw[i * 2 * 8 + 5] = fw[i * 2 * 8 + 5] * c;
3098 fw[i * 2 * 8 + 6] = fw[i * 2 * 8 + 6] * c;
3099 fw[i * 2 * 8 + 7] = fw[i * 2 * 8 + 7] * c;
3100 }
3101 }
3102
opj_v8dwt_decode_step2(opj_v8_t * l,opj_v8_t * w,OPJ_UINT32 start,OPJ_UINT32 end,OPJ_UINT32 m,OPJ_FLOAT32 c)3103 static void opj_v8dwt_decode_step2(opj_v8_t* l, opj_v8_t* w,
3104 OPJ_UINT32 start,
3105 OPJ_UINT32 end,
3106 OPJ_UINT32 m,
3107 OPJ_FLOAT32 c)
3108 {
3109 OPJ_FLOAT32* fl = (OPJ_FLOAT32*) l;
3110 OPJ_FLOAT32* fw = (OPJ_FLOAT32*) w;
3111 OPJ_UINT32 i;
3112 OPJ_UINT32 imax = opj_uint_min(end, m);
3113 if (start > 0) {
3114 fw += 2 * NB_ELTS_V8 * start;
3115 fl = fw - 2 * NB_ELTS_V8;
3116 }
3117 /* To be adapted if NB_ELTS_V8 changes */
3118 for (i = start; i < imax; ++i) {
3119 fw[-8] = fw[-8] + ((fl[0] + fw[0]) * c);
3120 fw[-7] = fw[-7] + ((fl[1] + fw[1]) * c);
3121 fw[-6] = fw[-6] + ((fl[2] + fw[2]) * c);
3122 fw[-5] = fw[-5] + ((fl[3] + fw[3]) * c);
3123 fw[-4] = fw[-4] + ((fl[4] + fw[4]) * c);
3124 fw[-3] = fw[-3] + ((fl[5] + fw[5]) * c);
3125 fw[-2] = fw[-2] + ((fl[6] + fw[6]) * c);
3126 fw[-1] = fw[-1] + ((fl[7] + fw[7]) * c);
3127 fl = fw;
3128 fw += 2 * NB_ELTS_V8;
3129 }
3130 if (m < end) {
3131 assert(m + 1 == end);
3132 c += c;
3133 fw[-8] = fw[-8] + fl[0] * c;
3134 fw[-7] = fw[-7] + fl[1] * c;
3135 fw[-6] = fw[-6] + fl[2] * c;
3136 fw[-5] = fw[-5] + fl[3] * c;
3137 fw[-4] = fw[-4] + fl[4] * c;
3138 fw[-3] = fw[-3] + fl[5] * c;
3139 fw[-2] = fw[-2] + fl[6] * c;
3140 fw[-1] = fw[-1] + fl[7] * c;
3141 }
3142 }
3143
3144 #endif
3145
3146 /* <summary> */
3147 /* Inverse 9-7 wavelet transform in 1-D. */
3148 /* </summary> */
opj_v8dwt_decode(opj_v8dwt_t * OPJ_RESTRICT dwt)3149 static void opj_v8dwt_decode(opj_v8dwt_t* OPJ_RESTRICT dwt)
3150 {
3151 OPJ_INT32 a, b;
3152 /* BUG_WEIRD_TWO_INVK (look for this identifier in tcd.c) */
3153 /* Historic value for 2 / opj_invK */
3154 /* Normally, we should use invK, but if we do so, we have failures in the */
3155 /* conformance test, due to MSE and peak errors significantly higher than */
3156 /* accepted value */
3157 /* Due to using two_invK instead of invK, we have to compensate in tcd.c */
3158 /* the computation of the stepsize for the non LL subbands */
3159 const float two_invK = 1.625732422f;
3160 if (dwt->cas == 0) {
3161 if (!((dwt->dn > 0) || (dwt->sn > 1))) {
3162 return;
3163 }
3164 a = 0;
3165 b = 1;
3166 } else {
3167 if (!((dwt->sn > 0) || (dwt->dn > 1))) {
3168 return;
3169 }
3170 a = 1;
3171 b = 0;
3172 }
3173 #ifdef __SSE__
3174 opj_v8dwt_decode_step1_sse(dwt->wavelet + a, dwt->win_l_x0, dwt->win_l_x1,
3175 _mm_set1_ps(opj_K));
3176 opj_v8dwt_decode_step1_sse(dwt->wavelet + b, dwt->win_h_x0, dwt->win_h_x1,
3177 _mm_set1_ps(two_invK));
3178 opj_v8dwt_decode_step2_sse(dwt->wavelet + b, dwt->wavelet + a + 1,
3179 dwt->win_l_x0, dwt->win_l_x1,
3180 (OPJ_UINT32)opj_int_min(dwt->sn, dwt->dn - a),
3181 _mm_set1_ps(-opj_dwt_delta));
3182 opj_v8dwt_decode_step2_sse(dwt->wavelet + a, dwt->wavelet + b + 1,
3183 dwt->win_h_x0, dwt->win_h_x1,
3184 (OPJ_UINT32)opj_int_min(dwt->dn, dwt->sn - b),
3185 _mm_set1_ps(-opj_dwt_gamma));
3186 opj_v8dwt_decode_step2_sse(dwt->wavelet + b, dwt->wavelet + a + 1,
3187 dwt->win_l_x0, dwt->win_l_x1,
3188 (OPJ_UINT32)opj_int_min(dwt->sn, dwt->dn - a),
3189 _mm_set1_ps(-opj_dwt_beta));
3190 opj_v8dwt_decode_step2_sse(dwt->wavelet + a, dwt->wavelet + b + 1,
3191 dwt->win_h_x0, dwt->win_h_x1,
3192 (OPJ_UINT32)opj_int_min(dwt->dn, dwt->sn - b),
3193 _mm_set1_ps(-opj_dwt_alpha));
3194 #else
3195 opj_v8dwt_decode_step1(dwt->wavelet + a, dwt->win_l_x0, dwt->win_l_x1,
3196 opj_K);
3197 opj_v8dwt_decode_step1(dwt->wavelet + b, dwt->win_h_x0, dwt->win_h_x1,
3198 two_invK);
3199 opj_v8dwt_decode_step2(dwt->wavelet + b, dwt->wavelet + a + 1,
3200 dwt->win_l_x0, dwt->win_l_x1,
3201 (OPJ_UINT32)opj_int_min(dwt->sn, dwt->dn - a),
3202 -opj_dwt_delta);
3203 opj_v8dwt_decode_step2(dwt->wavelet + a, dwt->wavelet + b + 1,
3204 dwt->win_h_x0, dwt->win_h_x1,
3205 (OPJ_UINT32)opj_int_min(dwt->dn, dwt->sn - b),
3206 -opj_dwt_gamma);
3207 opj_v8dwt_decode_step2(dwt->wavelet + b, dwt->wavelet + a + 1,
3208 dwt->win_l_x0, dwt->win_l_x1,
3209 (OPJ_UINT32)opj_int_min(dwt->sn, dwt->dn - a),
3210 -opj_dwt_beta);
3211 opj_v8dwt_decode_step2(dwt->wavelet + a, dwt->wavelet + b + 1,
3212 dwt->win_h_x0, dwt->win_h_x1,
3213 (OPJ_UINT32)opj_int_min(dwt->dn, dwt->sn - b),
3214 -opj_dwt_alpha);
3215 #endif
3216 }
3217
3218 typedef struct {
3219 opj_v8dwt_t h;
3220 OPJ_UINT32 rw;
3221 OPJ_UINT32 w;
3222 OPJ_FLOAT32 * OPJ_RESTRICT aj;
3223 OPJ_UINT32 nb_rows;
3224 } opj_dwt97_decode_h_job_t;
3225
opj_dwt97_decode_h_func(void * user_data,opj_tls_t * tls)3226 static void opj_dwt97_decode_h_func(void* user_data, opj_tls_t* tls)
3227 {
3228 OPJ_UINT32 j;
3229 opj_dwt97_decode_h_job_t* job;
3230 OPJ_FLOAT32 * OPJ_RESTRICT aj;
3231 OPJ_UINT32 w;
3232 (void)tls;
3233
3234 job = (opj_dwt97_decode_h_job_t*)user_data;
3235 w = job->w;
3236
3237 assert((job->nb_rows % NB_ELTS_V8) == 0);
3238
3239 aj = job->aj;
3240 for (j = 0; j + NB_ELTS_V8 <= job->nb_rows; j += NB_ELTS_V8) {
3241 OPJ_UINT32 k;
3242 opj_v8dwt_interleave_h(&job->h, aj, job->w, NB_ELTS_V8);
3243 opj_v8dwt_decode(&job->h);
3244
3245 /* To be adapted if NB_ELTS_V8 changes */
3246 for (k = 0; k < job->rw; k++) {
3247 aj[k ] = job->h.wavelet[k].f[0];
3248 aj[k + (OPJ_SIZE_T)w ] = job->h.wavelet[k].f[1];
3249 aj[k + (OPJ_SIZE_T)w * 2] = job->h.wavelet[k].f[2];
3250 aj[k + (OPJ_SIZE_T)w * 3] = job->h.wavelet[k].f[3];
3251 }
3252 for (k = 0; k < job->rw; k++) {
3253 aj[k + (OPJ_SIZE_T)w * 4] = job->h.wavelet[k].f[4];
3254 aj[k + (OPJ_SIZE_T)w * 5] = job->h.wavelet[k].f[5];
3255 aj[k + (OPJ_SIZE_T)w * 6] = job->h.wavelet[k].f[6];
3256 aj[k + (OPJ_SIZE_T)w * 7] = job->h.wavelet[k].f[7];
3257 }
3258
3259 aj += w * NB_ELTS_V8;
3260 }
3261
3262 opj_aligned_free(job->h.wavelet);
3263 opj_free(job);
3264 }
3265
3266
3267 typedef struct {
3268 opj_v8dwt_t v;
3269 OPJ_UINT32 rh;
3270 OPJ_UINT32 w;
3271 OPJ_FLOAT32 * OPJ_RESTRICT aj;
3272 OPJ_UINT32 nb_columns;
3273 } opj_dwt97_decode_v_job_t;
3274
opj_dwt97_decode_v_func(void * user_data,opj_tls_t * tls)3275 static void opj_dwt97_decode_v_func(void* user_data, opj_tls_t* tls)
3276 {
3277 OPJ_UINT32 j;
3278 opj_dwt97_decode_v_job_t* job;
3279 OPJ_FLOAT32 * OPJ_RESTRICT aj;
3280 (void)tls;
3281
3282 job = (opj_dwt97_decode_v_job_t*)user_data;
3283
3284 assert((job->nb_columns % NB_ELTS_V8) == 0);
3285
3286 aj = job->aj;
3287 for (j = 0; j + NB_ELTS_V8 <= job->nb_columns; j += NB_ELTS_V8) {
3288 OPJ_UINT32 k;
3289
3290 opj_v8dwt_interleave_v(&job->v, aj, job->w, NB_ELTS_V8);
3291 opj_v8dwt_decode(&job->v);
3292
3293 for (k = 0; k < job->rh; ++k) {
3294 memcpy(&aj[k * (OPJ_SIZE_T)job->w], &job->v.wavelet[k],
3295 NB_ELTS_V8 * sizeof(OPJ_FLOAT32));
3296 }
3297 aj += NB_ELTS_V8;
3298 }
3299
3300 opj_aligned_free(job->v.wavelet);
3301 opj_free(job);
3302 }
3303
3304
3305 /* <summary> */
3306 /* Inverse 9-7 wavelet transform in 2-D. */
3307 /* </summary> */
3308 static
opj_dwt_decode_tile_97(opj_thread_pool_t * tp,opj_tcd_tilecomp_t * OPJ_RESTRICT tilec,OPJ_UINT32 numres)3309 OPJ_BOOL opj_dwt_decode_tile_97(opj_thread_pool_t* tp,
3310 opj_tcd_tilecomp_t* OPJ_RESTRICT tilec,
3311 OPJ_UINT32 numres)
3312 {
3313 opj_v8dwt_t h;
3314 opj_v8dwt_t v;
3315
3316 opj_tcd_resolution_t* res = tilec->resolutions;
3317
3318 OPJ_UINT32 rw = (OPJ_UINT32)(res->x1 -
3319 res->x0); /* width of the resolution level computed */
3320 OPJ_UINT32 rh = (OPJ_UINT32)(res->y1 -
3321 res->y0); /* height of the resolution level computed */
3322
3323 OPJ_UINT32 w = (OPJ_UINT32)(tilec->resolutions[tilec->minimum_num_resolutions -
3324 1].x1 -
3325 tilec->resolutions[tilec->minimum_num_resolutions - 1].x0);
3326
3327 OPJ_SIZE_T l_data_size;
3328 const int num_threads = opj_thread_pool_get_thread_count(tp);
3329
3330 if (numres == 1) {
3331 return OPJ_TRUE;
3332 }
3333
3334 l_data_size = opj_dwt_max_resolution(res, numres);
3335 /* overflow check */
3336 if (l_data_size > (SIZE_MAX / sizeof(opj_v8_t))) {
3337 /* FIXME event manager error callback */
3338 return OPJ_FALSE;
3339 }
3340 h.wavelet = (opj_v8_t*) opj_aligned_malloc(l_data_size * sizeof(opj_v8_t));
3341 if (!h.wavelet) {
3342 /* FIXME event manager error callback */
3343 return OPJ_FALSE;
3344 }
3345 v.wavelet = h.wavelet;
3346
3347 while (--numres) {
3348 OPJ_FLOAT32 * OPJ_RESTRICT aj = (OPJ_FLOAT32*) tilec->data;
3349 OPJ_UINT32 j;
3350
3351 h.sn = (OPJ_INT32)rw;
3352 v.sn = (OPJ_INT32)rh;
3353
3354 ++res;
3355
3356 rw = (OPJ_UINT32)(res->x1 -
3357 res->x0); /* width of the resolution level computed */
3358 rh = (OPJ_UINT32)(res->y1 -
3359 res->y0); /* height of the resolution level computed */
3360
3361 h.dn = (OPJ_INT32)(rw - (OPJ_UINT32)h.sn);
3362 h.cas = res->x0 % 2;
3363
3364 h.win_l_x0 = 0;
3365 h.win_l_x1 = (OPJ_UINT32)h.sn;
3366 h.win_h_x0 = 0;
3367 h.win_h_x1 = (OPJ_UINT32)h.dn;
3368
3369 if (num_threads <= 1 || rh < 2 * NB_ELTS_V8) {
3370 for (j = 0; j + (NB_ELTS_V8 - 1) < rh; j += NB_ELTS_V8) {
3371 OPJ_UINT32 k;
3372 opj_v8dwt_interleave_h(&h, aj, w, NB_ELTS_V8);
3373 opj_v8dwt_decode(&h);
3374
3375 /* To be adapted if NB_ELTS_V8 changes */
3376 for (k = 0; k < rw; k++) {
3377 aj[k ] = h.wavelet[k].f[0];
3378 aj[k + (OPJ_SIZE_T)w ] = h.wavelet[k].f[1];
3379 aj[k + (OPJ_SIZE_T)w * 2] = h.wavelet[k].f[2];
3380 aj[k + (OPJ_SIZE_T)w * 3] = h.wavelet[k].f[3];
3381 }
3382 for (k = 0; k < rw; k++) {
3383 aj[k + (OPJ_SIZE_T)w * 4] = h.wavelet[k].f[4];
3384 aj[k + (OPJ_SIZE_T)w * 5] = h.wavelet[k].f[5];
3385 aj[k + (OPJ_SIZE_T)w * 6] = h.wavelet[k].f[6];
3386 aj[k + (OPJ_SIZE_T)w * 7] = h.wavelet[k].f[7];
3387 }
3388
3389 aj += w * NB_ELTS_V8;
3390 }
3391 } else {
3392 OPJ_UINT32 num_jobs = (OPJ_UINT32)num_threads;
3393 OPJ_UINT32 step_j;
3394
3395 if ((rh / NB_ELTS_V8) < num_jobs) {
3396 num_jobs = rh / NB_ELTS_V8;
3397 }
3398 step_j = ((rh / num_jobs) / NB_ELTS_V8) * NB_ELTS_V8;
3399 for (j = 0; j < num_jobs; j++) {
3400 opj_dwt97_decode_h_job_t* job;
3401
3402 job = (opj_dwt97_decode_h_job_t*) opj_malloc(sizeof(opj_dwt97_decode_h_job_t));
3403 if (!job) {
3404 opj_thread_pool_wait_completion(tp, 0);
3405 opj_aligned_free(h.wavelet);
3406 return OPJ_FALSE;
3407 }
3408 job->h.wavelet = (opj_v8_t*)opj_aligned_malloc(l_data_size * sizeof(opj_v8_t));
3409 if (!job->h.wavelet) {
3410 opj_thread_pool_wait_completion(tp, 0);
3411 opj_free(job);
3412 opj_aligned_free(h.wavelet);
3413 return OPJ_FALSE;
3414 }
3415 job->h.dn = h.dn;
3416 job->h.sn = h.sn;
3417 job->h.cas = h.cas;
3418 job->h.win_l_x0 = h.win_l_x0;
3419 job->h.win_l_x1 = h.win_l_x1;
3420 job->h.win_h_x0 = h.win_h_x0;
3421 job->h.win_h_x1 = h.win_h_x1;
3422 job->rw = rw;
3423 job->w = w;
3424 job->aj = aj;
3425 job->nb_rows = (j + 1 == num_jobs) ? (rh & (OPJ_UINT32)~
3426 (NB_ELTS_V8 - 1)) - j * step_j : step_j;
3427 aj += w * job->nb_rows;
3428 opj_thread_pool_submit_job(tp, opj_dwt97_decode_h_func, job);
3429 }
3430 opj_thread_pool_wait_completion(tp, 0);
3431 j = rh & (OPJ_UINT32)~(NB_ELTS_V8 - 1);
3432 }
3433
3434 if (j < rh) {
3435 OPJ_UINT32 k;
3436 opj_v8dwt_interleave_h(&h, aj, w, rh - j);
3437 opj_v8dwt_decode(&h);
3438 for (k = 0; k < rw; k++) {
3439 OPJ_UINT32 l;
3440 for (l = 0; l < rh - j; l++) {
3441 aj[k + (OPJ_SIZE_T)w * l ] = h.wavelet[k].f[l];
3442 }
3443 }
3444 }
3445
3446 v.dn = (OPJ_INT32)(rh - (OPJ_UINT32)v.sn);
3447 v.cas = res->y0 % 2;
3448 v.win_l_x0 = 0;
3449 v.win_l_x1 = (OPJ_UINT32)v.sn;
3450 v.win_h_x0 = 0;
3451 v.win_h_x1 = (OPJ_UINT32)v.dn;
3452
3453 aj = (OPJ_FLOAT32*) tilec->data;
3454 if (num_threads <= 1 || rw < 2 * NB_ELTS_V8) {
3455 for (j = rw; j > (NB_ELTS_V8 - 1); j -= NB_ELTS_V8) {
3456 OPJ_UINT32 k;
3457
3458 opj_v8dwt_interleave_v(&v, aj, w, NB_ELTS_V8);
3459 opj_v8dwt_decode(&v);
3460
3461 for (k = 0; k < rh; ++k) {
3462 memcpy(&aj[k * (OPJ_SIZE_T)w], &v.wavelet[k], NB_ELTS_V8 * sizeof(OPJ_FLOAT32));
3463 }
3464 aj += NB_ELTS_V8;
3465 }
3466 } else {
3467 /* "bench_dwt -I" shows that scaling is poor, likely due to RAM
3468 transfer being the limiting factor. So limit the number of
3469 threads.
3470 */
3471 OPJ_UINT32 num_jobs = opj_uint_max((OPJ_UINT32)num_threads / 2, 2U);
3472 OPJ_UINT32 step_j;
3473
3474 if ((rw / NB_ELTS_V8) < num_jobs) {
3475 num_jobs = rw / NB_ELTS_V8;
3476 }
3477 step_j = ((rw / num_jobs) / NB_ELTS_V8) * NB_ELTS_V8;
3478 for (j = 0; j < num_jobs; j++) {
3479 opj_dwt97_decode_v_job_t* job;
3480
3481 job = (opj_dwt97_decode_v_job_t*) opj_malloc(sizeof(opj_dwt97_decode_v_job_t));
3482 if (!job) {
3483 opj_thread_pool_wait_completion(tp, 0);
3484 opj_aligned_free(h.wavelet);
3485 return OPJ_FALSE;
3486 }
3487 job->v.wavelet = (opj_v8_t*)opj_aligned_malloc(l_data_size * sizeof(opj_v8_t));
3488 if (!job->v.wavelet) {
3489 opj_thread_pool_wait_completion(tp, 0);
3490 opj_free(job);
3491 opj_aligned_free(h.wavelet);
3492 return OPJ_FALSE;
3493 }
3494 job->v.dn = v.dn;
3495 job->v.sn = v.sn;
3496 job->v.cas = v.cas;
3497 job->v.win_l_x0 = v.win_l_x0;
3498 job->v.win_l_x1 = v.win_l_x1;
3499 job->v.win_h_x0 = v.win_h_x0;
3500 job->v.win_h_x1 = v.win_h_x1;
3501 job->rh = rh;
3502 job->w = w;
3503 job->aj = aj;
3504 job->nb_columns = (j + 1 == num_jobs) ? (rw & (OPJ_UINT32)~
3505 (NB_ELTS_V8 - 1)) - j * step_j : step_j;
3506 aj += job->nb_columns;
3507 opj_thread_pool_submit_job(tp, opj_dwt97_decode_v_func, job);
3508 }
3509 opj_thread_pool_wait_completion(tp, 0);
3510 }
3511
3512 if (rw & (NB_ELTS_V8 - 1)) {
3513 OPJ_UINT32 k;
3514
3515 j = rw & (NB_ELTS_V8 - 1);
3516
3517 opj_v8dwt_interleave_v(&v, aj, w, j);
3518 opj_v8dwt_decode(&v);
3519
3520 for (k = 0; k < rh; ++k) {
3521 memcpy(&aj[k * (OPJ_SIZE_T)w], &v.wavelet[k],
3522 (OPJ_SIZE_T)j * sizeof(OPJ_FLOAT32));
3523 }
3524 }
3525 }
3526
3527 opj_aligned_free(h.wavelet);
3528 return OPJ_TRUE;
3529 }
3530
3531 static
opj_dwt_decode_partial_97(opj_tcd_tilecomp_t * OPJ_RESTRICT tilec,OPJ_UINT32 numres)3532 OPJ_BOOL opj_dwt_decode_partial_97(opj_tcd_tilecomp_t* OPJ_RESTRICT tilec,
3533 OPJ_UINT32 numres)
3534 {
3535 opj_sparse_array_int32_t* sa;
3536 opj_v8dwt_t h;
3537 opj_v8dwt_t v;
3538 OPJ_UINT32 resno;
3539 /* This value matches the maximum left/right extension given in tables */
3540 /* F.2 and F.3 of the standard. Note: in opj_tcd_is_subband_area_of_interest() */
3541 /* we currently use 3. */
3542 const OPJ_UINT32 filter_width = 4U;
3543
3544 opj_tcd_resolution_t* tr = tilec->resolutions;
3545 opj_tcd_resolution_t* tr_max = &(tilec->resolutions[numres - 1]);
3546
3547 OPJ_UINT32 rw = (OPJ_UINT32)(tr->x1 -
3548 tr->x0); /* width of the resolution level computed */
3549 OPJ_UINT32 rh = (OPJ_UINT32)(tr->y1 -
3550 tr->y0); /* height of the resolution level computed */
3551
3552 OPJ_SIZE_T l_data_size;
3553
3554 /* Compute the intersection of the area of interest, expressed in tile coordinates */
3555 /* with the tile coordinates */
3556 OPJ_UINT32 win_tcx0 = tilec->win_x0;
3557 OPJ_UINT32 win_tcy0 = tilec->win_y0;
3558 OPJ_UINT32 win_tcx1 = tilec->win_x1;
3559 OPJ_UINT32 win_tcy1 = tilec->win_y1;
3560
3561 if (tr_max->x0 == tr_max->x1 || tr_max->y0 == tr_max->y1) {
3562 return OPJ_TRUE;
3563 }
3564
3565 sa = opj_dwt_init_sparse_array(tilec, numres);
3566 if (sa == NULL) {
3567 return OPJ_FALSE;
3568 }
3569
3570 if (numres == 1U) {
3571 OPJ_BOOL ret = opj_sparse_array_int32_read(sa,
3572 tr_max->win_x0 - (OPJ_UINT32)tr_max->x0,
3573 tr_max->win_y0 - (OPJ_UINT32)tr_max->y0,
3574 tr_max->win_x1 - (OPJ_UINT32)tr_max->x0,
3575 tr_max->win_y1 - (OPJ_UINT32)tr_max->y0,
3576 tilec->data_win,
3577 1, tr_max->win_x1 - tr_max->win_x0,
3578 OPJ_TRUE);
3579 assert(ret);
3580 OPJ_UNUSED(ret);
3581 opj_sparse_array_int32_free(sa);
3582 return OPJ_TRUE;
3583 }
3584
3585 l_data_size = opj_dwt_max_resolution(tr, numres);
3586 /* overflow check */
3587 if (l_data_size > (SIZE_MAX / sizeof(opj_v8_t))) {
3588 /* FIXME event manager error callback */
3589 opj_sparse_array_int32_free(sa);
3590 return OPJ_FALSE;
3591 }
3592 h.wavelet = (opj_v8_t*) opj_aligned_malloc(l_data_size * sizeof(opj_v8_t));
3593 if (!h.wavelet) {
3594 /* FIXME event manager error callback */
3595 opj_sparse_array_int32_free(sa);
3596 return OPJ_FALSE;
3597 }
3598 v.wavelet = h.wavelet;
3599
3600 for (resno = 1; resno < numres; resno ++) {
3601 OPJ_UINT32 j;
3602 /* Window of interest subband-based coordinates */
3603 OPJ_UINT32 win_ll_x0, win_ll_y0, win_ll_x1, win_ll_y1;
3604 OPJ_UINT32 win_hl_x0, win_hl_x1;
3605 OPJ_UINT32 win_lh_y0, win_lh_y1;
3606 /* Window of interest tile-resolution-based coordinates */
3607 OPJ_UINT32 win_tr_x0, win_tr_x1, win_tr_y0, win_tr_y1;
3608 /* Tile-resolution subband-based coordinates */
3609 OPJ_UINT32 tr_ll_x0, tr_ll_y0, tr_hl_x0, tr_lh_y0;
3610
3611 ++tr;
3612
3613 h.sn = (OPJ_INT32)rw;
3614 v.sn = (OPJ_INT32)rh;
3615
3616 rw = (OPJ_UINT32)(tr->x1 - tr->x0);
3617 rh = (OPJ_UINT32)(tr->y1 - tr->y0);
3618
3619 h.dn = (OPJ_INT32)(rw - (OPJ_UINT32)h.sn);
3620 h.cas = tr->x0 % 2;
3621
3622 v.dn = (OPJ_INT32)(rh - (OPJ_UINT32)v.sn);
3623 v.cas = tr->y0 % 2;
3624
3625 /* Get the subband coordinates for the window of interest */
3626 /* LL band */
3627 opj_dwt_get_band_coordinates(tilec, resno, 0,
3628 win_tcx0, win_tcy0, win_tcx1, win_tcy1,
3629 &win_ll_x0, &win_ll_y0,
3630 &win_ll_x1, &win_ll_y1);
3631
3632 /* HL band */
3633 opj_dwt_get_band_coordinates(tilec, resno, 1,
3634 win_tcx0, win_tcy0, win_tcx1, win_tcy1,
3635 &win_hl_x0, NULL, &win_hl_x1, NULL);
3636
3637 /* LH band */
3638 opj_dwt_get_band_coordinates(tilec, resno, 2,
3639 win_tcx0, win_tcy0, win_tcx1, win_tcy1,
3640 NULL, &win_lh_y0, NULL, &win_lh_y1);
3641
3642 /* Beware: band index for non-LL0 resolution are 0=HL, 1=LH and 2=HH */
3643 tr_ll_x0 = (OPJ_UINT32)tr->bands[1].x0;
3644 tr_ll_y0 = (OPJ_UINT32)tr->bands[0].y0;
3645 tr_hl_x0 = (OPJ_UINT32)tr->bands[0].x0;
3646 tr_lh_y0 = (OPJ_UINT32)tr->bands[1].y0;
3647
3648 /* Subtract the origin of the bands for this tile, to the subwindow */
3649 /* of interest band coordinates, so as to get them relative to the */
3650 /* tile */
3651 win_ll_x0 = opj_uint_subs(win_ll_x0, tr_ll_x0);
3652 win_ll_y0 = opj_uint_subs(win_ll_y0, tr_ll_y0);
3653 win_ll_x1 = opj_uint_subs(win_ll_x1, tr_ll_x0);
3654 win_ll_y1 = opj_uint_subs(win_ll_y1, tr_ll_y0);
3655 win_hl_x0 = opj_uint_subs(win_hl_x0, tr_hl_x0);
3656 win_hl_x1 = opj_uint_subs(win_hl_x1, tr_hl_x0);
3657 win_lh_y0 = opj_uint_subs(win_lh_y0, tr_lh_y0);
3658 win_lh_y1 = opj_uint_subs(win_lh_y1, tr_lh_y0);
3659
3660 opj_dwt_segment_grow(filter_width, (OPJ_UINT32)h.sn, &win_ll_x0, &win_ll_x1);
3661 opj_dwt_segment_grow(filter_width, (OPJ_UINT32)h.dn, &win_hl_x0, &win_hl_x1);
3662
3663 opj_dwt_segment_grow(filter_width, (OPJ_UINT32)v.sn, &win_ll_y0, &win_ll_y1);
3664 opj_dwt_segment_grow(filter_width, (OPJ_UINT32)v.dn, &win_lh_y0, &win_lh_y1);
3665
3666 /* Compute the tile-resolution-based coordinates for the window of interest */
3667 if (h.cas == 0) {
3668 win_tr_x0 = opj_uint_min(2 * win_ll_x0, 2 * win_hl_x0 + 1);
3669 win_tr_x1 = opj_uint_min(opj_uint_max(2 * win_ll_x1, 2 * win_hl_x1 + 1), rw);
3670 } else {
3671 win_tr_x0 = opj_uint_min(2 * win_hl_x0, 2 * win_ll_x0 + 1);
3672 win_tr_x1 = opj_uint_min(opj_uint_max(2 * win_hl_x1, 2 * win_ll_x1 + 1), rw);
3673 }
3674
3675 if (v.cas == 0) {
3676 win_tr_y0 = opj_uint_min(2 * win_ll_y0, 2 * win_lh_y0 + 1);
3677 win_tr_y1 = opj_uint_min(opj_uint_max(2 * win_ll_y1, 2 * win_lh_y1 + 1), rh);
3678 } else {
3679 win_tr_y0 = opj_uint_min(2 * win_lh_y0, 2 * win_ll_y0 + 1);
3680 win_tr_y1 = opj_uint_min(opj_uint_max(2 * win_lh_y1, 2 * win_ll_y1 + 1), rh);
3681 }
3682
3683 h.win_l_x0 = win_ll_x0;
3684 h.win_l_x1 = win_ll_x1;
3685 h.win_h_x0 = win_hl_x0;
3686 h.win_h_x1 = win_hl_x1;
3687 for (j = 0; j + (NB_ELTS_V8 - 1) < rh; j += NB_ELTS_V8) {
3688 if ((j + (NB_ELTS_V8 - 1) >= win_ll_y0 && j < win_ll_y1) ||
3689 (j + (NB_ELTS_V8 - 1) >= win_lh_y0 + (OPJ_UINT32)v.sn &&
3690 j < win_lh_y1 + (OPJ_UINT32)v.sn)) {
3691 opj_v8dwt_interleave_partial_h(&h, sa, j, opj_uint_min(NB_ELTS_V8, rh - j));
3692 opj_v8dwt_decode(&h);
3693 if (!opj_sparse_array_int32_write(sa,
3694 win_tr_x0, j,
3695 win_tr_x1, j + NB_ELTS_V8,
3696 (OPJ_INT32*)&h.wavelet[win_tr_x0].f[0],
3697 NB_ELTS_V8, 1, OPJ_TRUE)) {
3698 /* FIXME event manager error callback */
3699 opj_sparse_array_int32_free(sa);
3700 opj_aligned_free(h.wavelet);
3701 return OPJ_FALSE;
3702 }
3703 }
3704 }
3705
3706 if (j < rh &&
3707 ((j + (NB_ELTS_V8 - 1) >= win_ll_y0 && j < win_ll_y1) ||
3708 (j + (NB_ELTS_V8 - 1) >= win_lh_y0 + (OPJ_UINT32)v.sn &&
3709 j < win_lh_y1 + (OPJ_UINT32)v.sn))) {
3710 opj_v8dwt_interleave_partial_h(&h, sa, j, rh - j);
3711 opj_v8dwt_decode(&h);
3712 if (!opj_sparse_array_int32_write(sa,
3713 win_tr_x0, j,
3714 win_tr_x1, rh,
3715 (OPJ_INT32*)&h.wavelet[win_tr_x0].f[0],
3716 NB_ELTS_V8, 1, OPJ_TRUE)) {
3717 /* FIXME event manager error callback */
3718 opj_sparse_array_int32_free(sa);
3719 opj_aligned_free(h.wavelet);
3720 return OPJ_FALSE;
3721 }
3722 }
3723
3724 v.win_l_x0 = win_ll_y0;
3725 v.win_l_x1 = win_ll_y1;
3726 v.win_h_x0 = win_lh_y0;
3727 v.win_h_x1 = win_lh_y1;
3728 for (j = win_tr_x0; j < win_tr_x1; j += NB_ELTS_V8) {
3729 OPJ_UINT32 nb_elts = opj_uint_min(NB_ELTS_V8, win_tr_x1 - j);
3730
3731 opj_v8dwt_interleave_partial_v(&v, sa, j, nb_elts);
3732 opj_v8dwt_decode(&v);
3733
3734 if (!opj_sparse_array_int32_write(sa,
3735 j, win_tr_y0,
3736 j + nb_elts, win_tr_y1,
3737 (OPJ_INT32*)&h.wavelet[win_tr_y0].f[0],
3738 1, NB_ELTS_V8, OPJ_TRUE)) {
3739 /* FIXME event manager error callback */
3740 opj_sparse_array_int32_free(sa);
3741 opj_aligned_free(h.wavelet);
3742 return OPJ_FALSE;
3743 }
3744 }
3745 }
3746
3747 {
3748 OPJ_BOOL ret = opj_sparse_array_int32_read(sa,
3749 tr_max->win_x0 - (OPJ_UINT32)tr_max->x0,
3750 tr_max->win_y0 - (OPJ_UINT32)tr_max->y0,
3751 tr_max->win_x1 - (OPJ_UINT32)tr_max->x0,
3752 tr_max->win_y1 - (OPJ_UINT32)tr_max->y0,
3753 tilec->data_win,
3754 1, tr_max->win_x1 - tr_max->win_x0,
3755 OPJ_TRUE);
3756 assert(ret);
3757 OPJ_UNUSED(ret);
3758 }
3759 opj_sparse_array_int32_free(sa);
3760
3761 opj_aligned_free(h.wavelet);
3762 return OPJ_TRUE;
3763 }
3764
3765
opj_dwt_decode_real(opj_tcd_t * p_tcd,opj_tcd_tilecomp_t * OPJ_RESTRICT tilec,OPJ_UINT32 numres)3766 OPJ_BOOL opj_dwt_decode_real(opj_tcd_t *p_tcd,
3767 opj_tcd_tilecomp_t* OPJ_RESTRICT tilec,
3768 OPJ_UINT32 numres)
3769 {
3770 if (p_tcd->whole_tile_decoding) {
3771 return opj_dwt_decode_tile_97(p_tcd->thread_pool, tilec, numres);
3772 } else {
3773 return opj_dwt_decode_partial_97(tilec, numres);
3774 }
3775 }
3776