// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2011-2022 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
// of the License at:
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations
// under the License.
// ----------------------------------------------------------------------------

#if !defined(ASTCENC_DECOMPRESS_ONLY)

/**
 * @brief Functions to calculate variance per component in an NxN footprint.
 *
 * We need N to be parametric, so the routine below uses summed area tables in order to execute in
 * O(1) time independent of how big N is.
 *
 * The addition uses a Brent-Kung-based parallel prefix adder. This uses the prefix tree to first
 * perform a binary reduction, and then distributes the results. This method means that there is no
 * serial dependency between a given element and the next one, and also significantly improves
 * numerical stability allowing us to use floats rather than doubles.
 */
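
/*
 * A quick sketch of why summed-area tables give O(1) kernel sums: if S is the inclusive prefix
 * sum over both axes, then for a window covering rows (y0, y1] and columns (x0, x1]:
 *
 *     sum = S(y1, x1) - S(y0, x1) - S(y1, x0) + S(y0, x0)
 *
 * which is a fixed number of lookups regardless of the window size. The zero border written into
 * the tables below provides the S(0, *) and S(*, 0) terms needed at the region edge, and the 3D
 * case subtracts a second such expression evaluated at the lower Z slice.
 */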

#include "astcenc_internal.h"

#include <cassert>

/**
 * @brief Generate a prefix-sum array using the Brent-Kung algorithm.
 *
 * This will take an input array of the form:
 *     v0, v1, v2, ...
 * ... and modify in-place to turn it into a prefix-sum array of the form:
 *     v0, v0+v1, v0+v1+v2, ...
 *
 * @param d        The array to prefix-sum.
 * @param items    The number of items in the array.
 * @param stride   The item spacing in the array; i.e. dense arrays should use 1.
 */
static void brent_kung_prefix_sum(
    vfloat4* d,
    size_t items,
    int stride
) {
    if (items < 2)
        return;

    size_t lc_stride = 2;
    size_t log2_stride = 1;

    // The reduction-tree loop
    do {
        size_t step = lc_stride >> 1;
        size_t start = lc_stride - 1;
        size_t iters = items >> log2_stride;

        vfloat4 *da = d + (start * stride);
        ptrdiff_t ofs = -static_cast<ptrdiff_t>(step * stride);
        size_t ofs_stride = stride << log2_stride;

        while (iters)
        {
            *da = *da + da[ofs];
            da += ofs_stride;
            iters--;
        }

        log2_stride += 1;
        lc_stride <<= 1;
    } while (lc_stride <= items);

    // The expansion-tree loop
    do {
        log2_stride -= 1;
        lc_stride >>= 1;

        size_t step = lc_stride >> 1;
        size_t start = step + lc_stride - 1;
        size_t iters = (items - step) >> log2_stride;

        vfloat4 *da = d + (start * stride);
        ptrdiff_t ofs = -static_cast<ptrdiff_t>(step * stride);
        size_t ofs_stride = stride << log2_stride;

        while (iters)
        {
            *da = *da + da[ofs];
            da += ofs_stride;
            iters--;
        }
    } while (lc_stride > 2);
}
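
/*
 * Worked example: running brent_kung_prefix_sum() over the four contiguous values { 1, 2, 3, 4 }
 * with a stride of 1 rewrites them in place as the running sums { 1, 3, 6, 10 }. The reduction
 * loop produces { 1, 3, 3, 10 } (pairwise and then full-array partial sums), and the expansion
 * loop fills in the remaining prefix at index 2. Passing yst or zst as the stride walks the same
 * algorithm down a column or through the slices of the padded work buffer, which is how the
 * summed-area tables below are built one axis at a time.
 */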

/**
 * @brief Compute averages for a pixel region.
 *
 * The routine computes the averages in a single pass, using a summed-area table to decouple the
 * running time from the averaging/variance kernel size.
 *
 * @param[out] ctx   The compressor context storing the output data.
 * @param      arg   The input parameter structure.
 */
static void compute_pixel_region_variance(
    astcenc_context& ctx,
    const pixel_region_args& arg
) {
    // Unpack the memory structure into local variables
    const astcenc_image* img = arg.img;
    astcenc_swizzle swz = arg.swz;
    bool have_z = arg.have_z;

    int size_x = arg.size_x;
    int size_y = arg.size_y;
    int size_z = arg.size_z;

    int offset_x = arg.offset_x;
    int offset_y = arg.offset_y;
    int offset_z = arg.offset_z;

    int alpha_kernel_radius = arg.alpha_kernel_radius;

    float* input_alpha_averages = ctx.input_alpha_averages;
    vfloat4* work_memory = arg.work_memory;

    // Compute memory sizes and dimensions that we need
    int kernel_radius = alpha_kernel_radius;
    int kerneldim = 2 * kernel_radius + 1;
    int kernel_radius_xy = kernel_radius;
    int kernel_radius_z = have_z ? kernel_radius : 0;

    int padsize_x = size_x + kerneldim;
    int padsize_y = size_y + kerneldim;
    int padsize_z = size_z + (have_z ? kerneldim : 0);
    int sizeprod = padsize_x * padsize_y * padsize_z;

    int zd_start = have_z ? 1 : 0;

    vfloat4 *varbuf1 = work_memory;
    vfloat4 *varbuf2 = work_memory + sizeprod;

    // Scaling factors to apply to Y and Z for accesses into the work buffers
    int yst = padsize_x;
    int zst = padsize_x * padsize_y;

    // Scaling factors to apply to Y and Z for accesses into result buffers
    int ydt = img->dim_x;
    int zdt = img->dim_x * img->dim_y;

    // Macros to act as accessor functions for the work-memory
    #define VARBUF1(z, y, x) varbuf1[z * zst + y * yst + x]
    #define VARBUF2(z, y, x) varbuf2[z * zst + y * yst + x]

    // Load N and N^2 values into the work buffers
    if (img->data_type == ASTCENC_TYPE_U8)
    {
        // Swizzle data structure 4 = ZERO, 5 = ONE
        uint8_t data[6];
        data[ASTCENC_SWZ_0] = 0;
        data[ASTCENC_SWZ_1] = 255;

        for (int z = zd_start; z < padsize_z; z++)
        {
            int z_src = (z - zd_start) + offset_z - kernel_radius_z;
            z_src = astc::clamp(z_src, 0, static_cast<int>(img->dim_z - 1));
            uint8_t* data8 = static_cast<uint8_t*>(img->data[z_src]);

            for (int y = 1; y < padsize_y; y++)
            {
                int y_src = (y - 1) + offset_y - kernel_radius_xy;
                y_src = astc::clamp(y_src, 0, static_cast<int>(img->dim_y - 1));

                for (int x = 1; x < padsize_x; x++)
                {
                    int x_src = (x - 1) + offset_x - kernel_radius_xy;
                    x_src = astc::clamp(x_src, 0, static_cast<int>(img->dim_x - 1));

                    data[0] = data8[(4 * img->dim_stride * y_src) + (4 * x_src    )];
                    data[1] = data8[(4 * img->dim_stride * y_src) + (4 * x_src + 1)];
                    data[2] = data8[(4 * img->dim_stride * y_src) + (4 * x_src + 2)];
                    data[3] = data8[(4 * img->dim_stride * y_src) + (4 * x_src + 3)];

                    uint8_t r = data[swz.r];
                    uint8_t g = data[swz.g];
                    uint8_t b = data[swz.b];
                    uint8_t a = data[swz.a];

                    vfloat4 d = vfloat4 (r * (1.0f / 255.0f),
                                         g * (1.0f / 255.0f),
                                         b * (1.0f / 255.0f),
                                         a * (1.0f / 255.0f));

                    VARBUF1(z, y, x) = d;
                    VARBUF2(z, y, x) = d * d;
                }
            }
        }
    }
    else if (img->data_type == ASTCENC_TYPE_F16)
    {
        // Swizzle data structure 4 = ZERO, 5 = ONE (in FP16)
        uint16_t data[6];
        data[ASTCENC_SWZ_0] = 0;
        data[ASTCENC_SWZ_1] = 0x3C00;

        for (int z = zd_start; z < padsize_z; z++)
        {
            int z_src = (z - zd_start) + offset_z - kernel_radius_z;
            z_src = astc::clamp(z_src, 0, static_cast<int>(img->dim_z - 1));
            uint16_t* data16 = static_cast<uint16_t*>(img->data[z_src]);

            for (int y = 1; y < padsize_y; y++)
            {
                int y_src = (y - 1) + offset_y - kernel_radius_xy;
                y_src = astc::clamp(y_src, 0, static_cast<int>(img->dim_y - 1));

                for (int x = 1; x < padsize_x; x++)
                {
                    int x_src = (x - 1) + offset_x - kernel_radius_xy;
                    x_src = astc::clamp(x_src, 0, static_cast<int>(img->dim_x - 1));

                    data[0] = data16[(4 * img->dim_x * y_src) + (4 * x_src    )];
                    data[1] = data16[(4 * img->dim_x * y_src) + (4 * x_src + 1)];
                    data[2] = data16[(4 * img->dim_x * y_src) + (4 * x_src + 2)];
                    data[3] = data16[(4 * img->dim_x * y_src) + (4 * x_src + 3)];

                    vint4 di(data[swz.r], data[swz.g], data[swz.b], data[swz.a]);
                    vfloat4 d = float16_to_float(di);

                    VARBUF1(z, y, x) = d;
                    VARBUF2(z, y, x) = d * d;
                }
            }
        }
    }
    else // if (img->data_type == ASTCENC_TYPE_F32)
    {
        assert(img->data_type == ASTCENC_TYPE_F32);

        // Swizzle data structure 4 = ZERO, 5 = ONE (in FP32)
        float data[6];
        data[ASTCENC_SWZ_0] = 0.0f;
        data[ASTCENC_SWZ_1] = 1.0f;

        for (int z = zd_start; z < padsize_z; z++)
        {
            int z_src = (z - zd_start) + offset_z - kernel_radius_z;
            z_src = astc::clamp(z_src, 0, static_cast<int>(img->dim_z - 1));
            float* data32 = static_cast<float*>(img->data[z_src]);

            for (int y = 1; y < padsize_y; y++)
            {
                int y_src = (y - 1) + offset_y - kernel_radius_xy;
                y_src = astc::clamp(y_src, 0, static_cast<int>(img->dim_y - 1));

                for (int x = 1; x < padsize_x; x++)
                {
                    int x_src = (x - 1) + offset_x - kernel_radius_xy;
                    x_src = astc::clamp(x_src, 0, static_cast<int>(img->dim_x - 1));

                    data[0] = data32[(4 * img->dim_x * y_src) + (4 * x_src    )];
                    data[1] = data32[(4 * img->dim_x * y_src) + (4 * x_src + 1)];
                    data[2] = data32[(4 * img->dim_x * y_src) + (4 * x_src + 2)];
                    data[3] = data32[(4 * img->dim_x * y_src) + (4 * x_src + 3)];

                    float r = data[swz.r];
                    float g = data[swz.g];
                    float b = data[swz.b];
                    float a = data[swz.a];

                    vfloat4 d(r, g, b, a);

                    VARBUF1(z, y, x) = d;
                    VARBUF2(z, y, x) = d * d;
                }
            }
        }
    }

    // Pad with an extra layer of 0s; this forms the edge of the SAT tables
    vfloat4 vbz = vfloat4::zero();
    for (int z = 0; z < padsize_z; z++)
    {
        for (int y = 0; y < padsize_y; y++)
        {
            VARBUF1(z, y, 0) = vbz;
            VARBUF2(z, y, 0) = vbz;
        }

        for (int x = 0; x < padsize_x; x++)
        {
            VARBUF1(z, 0, x) = vbz;
            VARBUF2(z, 0, x) = vbz;
        }
    }

    if (have_z)
    {
        for (int y = 0; y < padsize_y; y++)
        {
            for (int x = 0; x < padsize_x; x++)
            {
                VARBUF1(0, y, x) = vbz;
                VARBUF2(0, y, x) = vbz;
            }
        }
    }

    // Generate summed-area tables for N and N^2; this is done in-place, using
    // a Brent-Kung parallel-prefix based algorithm to minimize precision loss
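    // The construction is separable: the X pass leaves each row holding a running sum, the Y pass
    // then turns entry (y, x) into the sum of the rectangle [1..y] x [1..x], and the optional Z
    // pass extends that rectangle into a box for 3D images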
    for (int z = zd_start; z < padsize_z; z++)
    {
        for (int y = 1; y < padsize_y; y++)
        {
            brent_kung_prefix_sum(&(VARBUF1(z, y, 1)), padsize_x - 1, 1);
            brent_kung_prefix_sum(&(VARBUF2(z, y, 1)), padsize_x - 1, 1);
        }
    }

    for (int z = zd_start; z < padsize_z; z++)
    {
        for (int x = 1; x < padsize_x; x++)
        {
            brent_kung_prefix_sum(&(VARBUF1(z, 1, x)), padsize_y - 1, yst);
            brent_kung_prefix_sum(&(VARBUF2(z, 1, x)), padsize_y - 1, yst);
        }
    }

    if (have_z)
    {
        for (int y = 1; y < padsize_y; y++)
        {
            for (int x = 1; x < padsize_x; x++)
            {
                brent_kung_prefix_sum(&(VARBUF1(1, y, x)), padsize_z - 1, zst);
                brent_kung_prefix_sum(&(VARBUF2(1, y, x)), padsize_z - 1, zst);
            }
        }
    }

    // Compute a few constants used in the averaging calculation
    float alpha_kdim = static_cast<float>(2 * alpha_kernel_radius + 1);
    float alpha_rsamples;

    if (have_z)
    {
        alpha_rsamples = 1.0f / (alpha_kdim * alpha_kdim * alpha_kdim);
    }
    else
    {
        alpha_rsamples = 1.0f / (alpha_kdim * alpha_kdim);
    }

    // Use the summed-area tables to compute the alpha average for each neighborhood
    if (have_z)
    {
        for (int z = 0; z < size_z; z++)
        {
            int z_src = z + kernel_radius_z;
            int z_dst = z + offset_z;
            int z_low = z_src - alpha_kernel_radius;
            int z_high = z_src + alpha_kernel_radius + 1;

            for (int y = 0; y < size_y; y++)
            {
                int y_src = y + kernel_radius_xy;
                int y_dst = y + offset_y;
                int y_low = y_src - alpha_kernel_radius;
                int y_high = y_src + alpha_kernel_radius + 1;

                for (int x = 0; x < size_x; x++)
                {
                    int x_src = x + kernel_radius_xy;
                    int x_dst = x + offset_x;
                    int x_low = x_src - alpha_kernel_radius;
                    int x_high = x_src + alpha_kernel_radius + 1;

                    // Summed-area table lookups for alpha average
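                    // (3D inclusion-exclusion over the eight corners of the kernel box)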
                    float vasum = ( VARBUF1(z_high, y_low, x_low).lane<3>()
                                  - VARBUF1(z_high, y_low, x_high).lane<3>()
                                  - VARBUF1(z_high, y_high, x_low).lane<3>()
                                  + VARBUF1(z_high, y_high, x_high).lane<3>()) -
                                  ( VARBUF1(z_low, y_low, x_low).lane<3>()
                                  - VARBUF1(z_low, y_low, x_high).lane<3>()
                                  - VARBUF1(z_low, y_high, x_low).lane<3>()
                                  + VARBUF1(z_low, y_high, x_high).lane<3>());

                    int out_index = z_dst * zdt + y_dst * ydt + x_dst;
                    input_alpha_averages[out_index] = (vasum * alpha_rsamples);
                }
            }
        }
    }
    else
    {
        for (int y = 0; y < size_y; y++)
        {
            int y_src = y + kernel_radius_xy;
            int y_dst = y + offset_y;
            int y_low = y_src - alpha_kernel_radius;
            int y_high = y_src + alpha_kernel_radius + 1;

            for (int x = 0; x < size_x; x++)
            {
                int x_src = x + kernel_radius_xy;
                int x_dst = x + offset_x;
                int x_low = x_src - alpha_kernel_radius;
                int x_high = x_src + alpha_kernel_radius + 1;

                // Summed-area table lookups for alpha average
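                // (2D inclusion-exclusion over the four corners of the kernel window)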
                float vasum = VARBUF1(0, y_low, x_low).lane<3>()
                            - VARBUF1(0, y_low, x_high).lane<3>()
                            - VARBUF1(0, y_high, x_low).lane<3>()
                            + VARBUF1(0, y_high, x_high).lane<3>();

                int out_index = y_dst * ydt + x_dst;
                input_alpha_averages[out_index] = (vasum * alpha_rsamples);
            }
        }
    }
}

void compute_averages(
    astcenc_context& ctx,
    const avg_args &ag
) {
    pixel_region_args arg = ag.arg;
    arg.work_memory = new vfloat4[ag.work_memory_size];

    int size_x = ag.img_size_x;
    int size_y = ag.img_size_y;
    int size_z = ag.img_size_z;

    int step_xy = ag.blk_size_xy;
    int step_z = ag.blk_size_z;

    int y_tasks = (size_y + step_xy - 1) / step_xy;

    // All threads run this processing loop until there is no work remaining
    while (true)
    {
        unsigned int count;
        unsigned int base = ctx.manage_avg.get_task_assignment(16, count);
        if (!count)
        {
            break;
        }
        for (unsigned int i = base; i < base + count; i++)
        {
            // Decompose the flat task index into (z, y) block coordinates
            int z = (i / y_tasks) * step_z;
            int y = (i % y_tasks) * step_xy;

            arg.size_z = astc::min(step_z, size_z - z);
            arg.offset_z = z;

            arg.size_y = astc::min(step_xy, size_y - y);
            arg.offset_y = y;

            for (int x = 0; x < size_x; x += step_xy)
            {
                arg.size_x = astc::min(step_xy, size_x - x);
                arg.offset_x = x;
                compute_pixel_region_variance(ctx, arg);
            }
        }

        ctx.manage_avg.complete_task_assignment(count);
    }

    delete[] arg.work_memory;
}

/* See header for documentation. */
unsigned int init_compute_averages(
    const astcenc_image& img,
    unsigned int alpha_kernel_radius,
    const astcenc_swizzle& swz,
    avg_args& ag
) {
    unsigned int size_x = img.dim_x;
    unsigned int size_y = img.dim_y;
    unsigned int size_z = img.dim_z;

    // Compute maximum block size and from that the working memory buffer size
    unsigned int kernel_radius = alpha_kernel_radius;
    unsigned int kerneldim = 2 * kernel_radius + 1;

    bool have_z = (size_z > 1);
    unsigned int max_blk_size_xy = have_z ? 16 : 32;
    unsigned int max_blk_size_z = astc::min(size_z, have_z ? 16u : 1u);

    unsigned int max_padsize_xy = max_blk_size_xy + kerneldim;
    unsigned int max_padsize_z = max_blk_size_z + (have_z ? kerneldim : 0);

    // Perform block-wise averages calculations across the image
    // Initialize fields which are not populated until later
    ag.arg.size_x = 0;
    ag.arg.size_y = 0;
    ag.arg.size_z = 0;
    ag.arg.offset_x = 0;
    ag.arg.offset_y = 0;
    ag.arg.offset_z = 0;
    ag.arg.work_memory = nullptr;

    ag.arg.img = &img;
    ag.arg.swz = swz;
    ag.arg.have_z = have_z;
    ag.arg.alpha_kernel_radius = alpha_kernel_radius;

    ag.img_size_x = size_x;
    ag.img_size_y = size_y;
    ag.img_size_z = size_z;
    ag.blk_size_xy = max_blk_size_xy;
    ag.blk_size_z = max_blk_size_z;
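    // Two tables (N and N^2) are stored over the largest padded block footprint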
    ag.work_memory_size = 2 * max_padsize_xy * max_padsize_xy * max_padsize_z;

    // The parallel task count
    unsigned int z_tasks = (size_z + max_blk_size_z - 1) / max_blk_size_z;
    unsigned int y_tasks = (size_y + max_blk_size_xy - 1) / max_blk_size_xy;
    return z_tasks * y_tasks;
}

#endif