// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2011-2022 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
// of the License at:
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations
// under the License.
// ----------------------------------------------------------------------------

#if !defined(ASTCENC_DECOMPRESS_ONLY)

/**
 * @brief Functions to calculate variance per component in an NxN footprint.
 *
 * We need N to be parametric, so the routine below uses summed-area tables in order to execute in
 * O(1) time independent of how big N is.
 *
 * The addition uses a Brent-Kung-based parallel prefix adder. This uses the prefix tree to first
 * perform a binary reduction, and then distributes the results. This method means that there is no
 * serial dependency between a given element and the next one, and also significantly improves
 * numerical stability, allowing us to use floats rather than doubles.
 */
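
// As a concrete sketch of the O(1) box-sum idea (illustrative notation, not
// code from this file): writing sat(x, y) for the inclusive prefix sum of a 2D
// image, the sum over any box with corner (x0, y0) exclusive and (x1, y1)
// inclusive is
//
//     sat(x1, y1) - sat(x0, y1) - sat(x1, y0) + sat(x0, y0)
//
// which takes four lookups regardless of the box size. The lookups later in
// this file use exactly this pattern, extended to eight corners for 3D images.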

#include "astcenc_internal.h"

#include <cassert>

/**
 * @brief Generate a prefix-sum array using the Brent-Kung algorithm.
 *
 * This will take an input array of the form:
 *     v0, v1, v2, ...
 * ... and modify it in place to turn it into a prefix-sum array of the form:
 *     v0, v0+v1, v0+v1+v2, ...
 *
 * @param d      The array to prefix-sum.
 * @param items  The number of items in the array.
 * @param stride The item spacing in the array; i.e. dense arrays should use 1.
 */
static void brent_kung_prefix_sum(
	vfloat4* d,
	size_t items,
	int stride
) {
	if (items < 2)
		return;

	size_t lc_stride = 2;
	size_t log2_stride = 1;

	// The reduction-tree loop
	do {
		size_t step = lc_stride >> 1;
		size_t start = lc_stride - 1;
		size_t iters = items >> log2_stride;

		vfloat4 *da = d + (start * stride);
		ptrdiff_t ofs = -static_cast<ptrdiff_t>(step * stride);
		size_t ofs_stride = stride << log2_stride;

		while (iters)
		{
			*da = *da + da[ofs];
			da += ofs_stride;
			iters--;
		}

		log2_stride += 1;
		lc_stride <<= 1;
	} while (lc_stride <= items);

	// The expansion-tree loop
	do {
		log2_stride -= 1;
		lc_stride >>= 1;

		size_t step = lc_stride >> 1;
		size_t start = step + lc_stride - 1;
		size_t iters = (items - step) >> log2_stride;

		vfloat4 *da = d + (start * stride);
		ptrdiff_t ofs = -static_cast<ptrdiff_t>(step * stride);
		size_t ofs_stride = stride << log2_stride;

		while (iters)
		{
			*da = *da + da[ofs];
			da += ofs_stride;
			iters--;
		}
	} while (lc_stride > 2);
}
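
// Illustrative trace (treating each vfloat4 element as a single value): for
// eight dense elements a..h the passes above update the array as follows.
//
//     input:          a,   b,    c,    d,    e,    f,    g,    h
//     reduce, step 1: a, a+b,    c,  c+d,    e,  e+f,    g,  g+h
//     reduce, step 2: a, a+b,    c, a..d,    e,  e+f,    g, e..h
//     reduce, step 4: a, a+b,    c, a..d,    e,  e+f,    g, a..h
//     expand, step 2: a, a+b,    c, a..d,    e, a..f,    g, a..h
//     expand, step 1: a, a+b, a..c, a..d, a..e, a..f, a..g, a..h
//
// (The first expansion pass has no work to do at this size.) Every result is
// built from balanced partial sums rather than one long running total, which
// is what keeps 32-bit float precision adequate here.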

/**
 * @brief Compute averages for a pixel region.
 *
 * The routine computes the averages in a single pass, using a summed-area table to decouple the
 * running time from the averaging kernel size.
 *
 * @param[out] ctx   The compressor context storing the output data.
 * @param      arg   The input parameter structure.
 */
static void compute_pixel_region_variance(
	astcenc_context& ctx,
	const pixel_region_args& arg
) {
	// Unpack the memory structure into local variables
	const astcenc_image* img = arg.img;
	astcenc_swizzle swz = arg.swz;
	bool have_z = arg.have_z;

	int size_x = arg.size_x;
	int size_y = arg.size_y;
	int size_z = arg.size_z;

	int offset_x = arg.offset_x;
	int offset_y = arg.offset_y;
	int offset_z = arg.offset_z;

	int alpha_kernel_radius = arg.alpha_kernel_radius;

	float*   input_alpha_averages = ctx.input_alpha_averages;
	vfloat4* work_memory = arg.work_memory;

	// Compute memory sizes and dimensions that we need
	int kernel_radius = alpha_kernel_radius;
	int kerneldim = 2 * kernel_radius + 1;
	int kernel_radius_xy = kernel_radius;
	int kernel_radius_z = have_z ? kernel_radius : 0;

	int padsize_x = size_x + kerneldim;
	int padsize_y = size_y + kerneldim;
	int padsize_z = size_z + (have_z ? kerneldim : 0);
	int sizeprod = padsize_x * padsize_y * padsize_z;

	int zd_start = have_z ? 1 : 0;

	vfloat4 *varbuf1 = work_memory;
	vfloat4 *varbuf2 = work_memory + sizeprod;

	// Scaling factors to apply to Y and Z for accesses into the work buffers
	int yst = padsize_x;
	int zst = padsize_x * padsize_y;

	// Scaling factors to apply to Y and Z for accesses into result buffers
	int ydt = img->dim_x;
	int zdt = img->dim_x * img->dim_y;

	// Macros to act as accessor functions for the work-memory
	#define VARBUF1(z, y, x) varbuf1[z * zst + y * yst + x]
	#define VARBUF2(z, y, x) varbuf2[z * zst + y * yst + x]
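
	// For example, VARBUF1(z, y, x) expands to a plain row-major index:
	// varbuf1[z * (padsize_x * padsize_y) + y * padsize_x + x].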

	// Load N and N^2 values into the work buffers
	if (img->data_type == ASTCENC_TYPE_U8)
	{
		// Swizzle data structure 4 = ZERO, 5 = ONE
		uint8_t data[6];
		data[ASTCENC_SWZ_0] = 0;
		data[ASTCENC_SWZ_1] = 255;

		for (int z = zd_start; z < padsize_z; z++)
		{
			int z_src = (z - zd_start) + offset_z - kernel_radius_z;
			z_src = astc::clamp(z_src, 0, static_cast<int>(img->dim_z - 1));
			uint8_t* data8 = static_cast<uint8_t*>(img->data[z_src]);

			for (int y = 1; y < padsize_y; y++)
			{
				int y_src = (y - 1) + offset_y - kernel_radius_xy;
				y_src = astc::clamp(y_src, 0, static_cast<int>(img->dim_y - 1));

				for (int x = 1; x < padsize_x; x++)
				{
					int x_src = (x - 1) + offset_x - kernel_radius_xy;
					x_src = astc::clamp(x_src, 0, static_cast<int>(img->dim_x - 1));

					data[0] = data8[(4 * img->dim_stride * y_src) + (4 * x_src    )];
					data[1] = data8[(4 * img->dim_stride * y_src) + (4 * x_src + 1)];
					data[2] = data8[(4 * img->dim_stride * y_src) + (4 * x_src + 2)];
					data[3] = data8[(4 * img->dim_stride * y_src) + (4 * x_src + 3)];

					uint8_t r = data[swz.r];
					uint8_t g = data[swz.g];
					uint8_t b = data[swz.b];
					uint8_t a = data[swz.a];

					vfloat4 d = vfloat4 (r * (1.0f / 255.0f),
					                     g * (1.0f / 255.0f),
					                     b * (1.0f / 255.0f),
					                     a * (1.0f / 255.0f));

					VARBUF1(z, y, x) = d;
					VARBUF2(z, y, x) = d * d;
				}
			}
		}
	}
	else if (img->data_type == ASTCENC_TYPE_F16)
	{
		// Swizzle data structure 4 = ZERO, 5 = ONE (in FP16)
		uint16_t data[6];
		data[ASTCENC_SWZ_0] = 0;
		data[ASTCENC_SWZ_1] = 0x3C00;

		for (int z = zd_start; z < padsize_z; z++)
		{
			int z_src = (z - zd_start) + offset_z - kernel_radius_z;
			z_src = astc::clamp(z_src, 0, static_cast<int>(img->dim_z - 1));
			uint16_t* data16 = static_cast<uint16_t*>(img->data[z_src]);

			for (int y = 1; y < padsize_y; y++)
			{
				int y_src = (y - 1) + offset_y - kernel_radius_xy;
				y_src = astc::clamp(y_src, 0, static_cast<int>(img->dim_y - 1));

				for (int x = 1; x < padsize_x; x++)
				{
					int x_src = (x - 1) + offset_x - kernel_radius_xy;
					x_src = astc::clamp(x_src, 0, static_cast<int>(img->dim_x - 1));

					data[0] = data16[(4 * img->dim_x * y_src) + (4 * x_src    )];
					data[1] = data16[(4 * img->dim_x * y_src) + (4 * x_src + 1)];
					data[2] = data16[(4 * img->dim_x * y_src) + (4 * x_src + 2)];
					data[3] = data16[(4 * img->dim_x * y_src) + (4 * x_src + 3)];

					vint4 di(data[swz.r], data[swz.g], data[swz.b], data[swz.a]);
					vfloat4 d = float16_to_float(di);

					VARBUF1(z, y, x) = d;
					VARBUF2(z, y, x) = d * d;
				}
			}
		}
	}
	else // if (img->data_type == ASTCENC_TYPE_F32)
	{
		assert(img->data_type == ASTCENC_TYPE_F32);

		// Swizzle data structure 4 = ZERO, 5 = ONE (in FP32)
		float data[6];
		data[ASTCENC_SWZ_0] = 0.0f;
		data[ASTCENC_SWZ_1] = 1.0f;

		for (int z = zd_start; z < padsize_z; z++)
		{
			int z_src = (z - zd_start) + offset_z - kernel_radius_z;
			z_src = astc::clamp(z_src, 0, static_cast<int>(img->dim_z - 1));
			float* data32 = static_cast<float*>(img->data[z_src]);

			for (int y = 1; y < padsize_y; y++)
			{
				int y_src = (y - 1) + offset_y - kernel_radius_xy;
				y_src = astc::clamp(y_src, 0, static_cast<int>(img->dim_y - 1));

				for (int x = 1; x < padsize_x; x++)
				{
					int x_src = (x - 1) + offset_x - kernel_radius_xy;
					x_src = astc::clamp(x_src, 0, static_cast<int>(img->dim_x - 1));

					data[0] = data32[(4 * img->dim_x * y_src) + (4 * x_src    )];
					data[1] = data32[(4 * img->dim_x * y_src) + (4 * x_src + 1)];
					data[2] = data32[(4 * img->dim_x * y_src) + (4 * x_src + 2)];
					data[3] = data32[(4 * img->dim_x * y_src) + (4 * x_src + 3)];

					float r = data[swz.r];
					float g = data[swz.g];
					float b = data[swz.b];
					float a = data[swz.a];

					vfloat4 d(r, g, b, a);

					VARBUF1(z, y, x) = d;
					VARBUF2(z, y, x) = d * d;
				}
			}
		}
	}

	// Pad with an extra layer of 0s; this forms the edge of the SAT tables
	vfloat4 vbz = vfloat4::zero();
	for (int z = 0; z < padsize_z; z++)
	{
		for (int y = 0; y < padsize_y; y++)
		{
			VARBUF1(z, y, 0) = vbz;
			VARBUF2(z, y, 0) = vbz;
		}

		for (int x = 0; x < padsize_x; x++)
		{
			VARBUF1(z, 0, x) = vbz;
			VARBUF2(z, 0, x) = vbz;
		}
	}

	if (have_z)
	{
		for (int y = 0; y < padsize_y; y++)
		{
			for (int x = 0; x < padsize_x; x++)
			{
				VARBUF1(0, y, x) = vbz;
				VARBUF2(0, y, x) = vbz;
			}
		}
	}
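
	// Because of this zero border, the box-sum lookups below never need a bounds
	// check: the smallest indices they can read (x_low, y_low, and z_low when
	// have_z is set) land exactly on the zeroed column, row, or slice, which
	// contributes nothing to the sum.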

	// Generate summed-area tables for N and N^2; this is done in-place, using
	// a Brent-Kung parallel-prefix based algorithm to minimize precision loss
	for (int z = zd_start; z < padsize_z; z++)
	{
		for (int y = 1; y < padsize_y; y++)
		{
			brent_kung_prefix_sum(&(VARBUF1(z, y, 1)), padsize_x - 1, 1);
			brent_kung_prefix_sum(&(VARBUF2(z, y, 1)), padsize_x - 1, 1);
		}
	}

	for (int z = zd_start; z < padsize_z; z++)
	{
		for (int x = 1; x < padsize_x; x++)
		{
			brent_kung_prefix_sum(&(VARBUF1(z, 1, x)), padsize_y - 1, yst);
			brent_kung_prefix_sum(&(VARBUF2(z, 1, x)), padsize_y - 1, yst);
		}
	}

	if (have_z)
	{
		for (int y = 1; y < padsize_y; y++)
		{
			for (int x = 1; x < padsize_x; x++)
			{
				brent_kung_prefix_sum(&(VARBUF1(1, y, x)), padsize_z - 1, zst);
				brent_kung_prefix_sum(&(VARBUF2(1, y, x)), padsize_z - 1, zst);
			}
		}
	}
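
	// After these separable passes each entry holds the inclusive sum of every
	// value at indices <= (z, y, x), so varbuf1 and varbuf2 are now summed-area
	// tables of the values and of their squares respectively.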

	// Compute a few constants used in the average calculation
	float alpha_kdim = static_cast<float>(2 * alpha_kernel_radius + 1);
	float alpha_rsamples;

	if (have_z)
	{
		alpha_rsamples = 1.0f / (alpha_kdim * alpha_kdim * alpha_kdim);
	}
	else
	{
		alpha_rsamples = 1.0f / (alpha_kdim * alpha_kdim);
	}

	// Use the summed-area tables to compute the alpha average for each neighborhood
	if (have_z)
	{
		for (int z = 0; z < size_z; z++)
		{
			int z_src = z + kernel_radius_z;
			int z_dst = z + offset_z;
			int z_low  = z_src - alpha_kernel_radius;
			int z_high = z_src + alpha_kernel_radius + 1;

			for (int y = 0; y < size_y; y++)
			{
				int y_src = y + kernel_radius_xy;
				int y_dst = y + offset_y;
				int y_low  = y_src - alpha_kernel_radius;
				int y_high = y_src + alpha_kernel_radius + 1;

				for (int x = 0; x < size_x; x++)
				{
					int x_src = x + kernel_radius_xy;
					int x_dst = x + offset_x;
					int x_low  = x_src - alpha_kernel_radius;
					int x_high = x_src + alpha_kernel_radius + 1;

					// Summed-area table lookups for alpha average
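					// This is the 3D box sum: the four-corner 2D inclusion-exclusion
					// evaluated at z_high, minus the same expression evaluated at
					// z_low, which leaves only the kernel footprint between the two
					// z planes.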
					float vasum = (  VARBUF1(z_high, y_low,  x_low).lane<3>()
					               - VARBUF1(z_high, y_low,  x_high).lane<3>()
					               - VARBUF1(z_high, y_high, x_low).lane<3>()
					               + VARBUF1(z_high, y_high, x_high).lane<3>()) -
					              (  VARBUF1(z_low,  y_low,  x_low).lane<3>()
					               - VARBUF1(z_low,  y_low,  x_high).lane<3>()
					               - VARBUF1(z_low,  y_high, x_low).lane<3>()
					               + VARBUF1(z_low,  y_high, x_high).lane<3>());

					int out_index = z_dst * zdt + y_dst * ydt + x_dst;
					input_alpha_averages[out_index] = (vasum * alpha_rsamples);
				}
			}
		}
	}
	else
	{
		for (int y = 0; y < size_y; y++)
		{
			int y_src = y + kernel_radius_xy;
			int y_dst = y + offset_y;
			int y_low  = y_src - alpha_kernel_radius;
			int y_high = y_src + alpha_kernel_radius + 1;

			for (int x = 0; x < size_x; x++)
			{
				int x_src = x + kernel_radius_xy;
				int x_dst = x + offset_x;
				int x_low  = x_src - alpha_kernel_radius;
				int x_high = x_src + alpha_kernel_radius + 1;

				// Summed-area table lookups for alpha average
				float vasum = VARBUF1(0, y_low,  x_low).lane<3>()
				            - VARBUF1(0, y_low,  x_high).lane<3>()
				            - VARBUF1(0, y_high, x_low).lane<3>()
				            + VARBUF1(0, y_high, x_high).lane<3>();

				int out_index = y_dst * ydt + x_dst;
				input_alpha_averages[out_index] = (vasum * alpha_rsamples);
			}
		}
	}
}

void compute_averages(
	astcenc_context& ctx,
	const avg_args &ag
) {
	pixel_region_args arg = ag.arg;
	arg.work_memory = new vfloat4[ag.work_memory_size];

	int size_x = ag.img_size_x;
	int size_y = ag.img_size_y;
	int size_z = ag.img_size_z;

	int step_xy = ag.blk_size_xy;
	int step_z = ag.blk_size_z;

	int y_tasks = (size_y + step_xy - 1) / step_xy;

	// All threads run this processing loop until there is no work remaining
	while (true)
	{
		unsigned int count;
		unsigned int base = ctx.manage_avg.get_task_assignment(16, count);
		if (!count)
		{
			break;
		}

		for (unsigned int i = base; i < base + count; i++)
		{
			int z = (i / (y_tasks)) * step_z;
			int y = (i - (z * y_tasks)) * step_xy;

			arg.size_z = astc::min(step_z, size_z - z);
			arg.offset_z = z;

			arg.size_y = astc::min(step_xy, size_y - y);
			arg.offset_y = y;

			for (int x = 0; x < size_x; x += step_xy)
			{
				arg.size_x = astc::min(step_xy, size_x - x);
				arg.offset_x = x;
				compute_pixel_region_variance(ctx, arg);
			}
		}

		ctx.manage_avg.complete_task_assignment(count);
	}

	delete[] arg.work_memory;
}

/* See header for documentation. */
unsigned int init_compute_averages(
	const astcenc_image& img,
	unsigned int alpha_kernel_radius,
	const astcenc_swizzle& swz,
	avg_args& ag
) {
	unsigned int size_x = img.dim_x;
	unsigned int size_y = img.dim_y;
	unsigned int size_z = img.dim_z;

	// Compute maximum block size and from that the working memory buffer size
	unsigned int kernel_radius = alpha_kernel_radius;
	unsigned int kerneldim = 2 * kernel_radius + 1;

	bool have_z = (size_z > 1);
	unsigned int max_blk_size_xy = have_z ? 16 : 32;
	unsigned int max_blk_size_z = astc::min(size_z, have_z ? 16u : 1u);

	unsigned int max_padsize_xy = max_blk_size_xy + kerneldim;
	unsigned int max_padsize_z = max_blk_size_z + (have_z ? kerneldim : 0);

	// Perform block-wise average calculations across the image
	// Initialize fields which are not populated until later
	ag.arg.size_x = 0;
	ag.arg.size_y = 0;
	ag.arg.size_z = 0;
	ag.arg.offset_x = 0;
	ag.arg.offset_y = 0;
	ag.arg.offset_z = 0;
	ag.arg.work_memory = nullptr;

	ag.arg.img = &img;
	ag.arg.swz = swz;
	ag.arg.have_z = have_z;
	ag.arg.alpha_kernel_radius = alpha_kernel_radius;

	ag.img_size_x = size_x;
	ag.img_size_y = size_y;
	ag.img_size_z = size_z;
	ag.blk_size_xy = max_blk_size_xy;
	ag.blk_size_z = max_blk_size_z;
	ag.work_memory_size = 2 * max_padsize_xy * max_padsize_xy * max_padsize_z;

	// The parallel task count
	unsigned int z_tasks = (size_z + max_blk_size_z - 1) / max_blk_size_z;
	unsigned int y_tasks = (size_y + max_blk_size_xy - 1) / max_blk_size_xy;
	return z_tasks * y_tasks;
}
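
// Worked example (a sketch, assuming a 2D image and alpha_kernel_radius = 2):
// the kernel diameter is 5, the maximum block size is 32x32, the padded block
// is 37x37x1, and work_memory_size is 2 * 37 * 37 * 1 = 2738 vfloat4 entries,
// roughly 43 KB per call for the two working buffers.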

#endif