• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // SPDX-License-Identifier: Apache-2.0
2 // ----------------------------------------------------------------------------
3 // Copyright 2011-2024 Arm Limited
4 //
5 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
6 // use this file except in compliance with the License. You may obtain a copy
7 // of the License at:
8 //
9 //     http://www.apache.org/licenses/LICENSE-2.0
10 //
11 // Unless required by applicable law or agreed to in writing, software
12 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
13 // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
14 // License for the specific language governing permissions and limitations
15 // under the License.
16 // ----------------------------------------------------------------------------
17 
18 #if !defined(ASTCENC_DECOMPRESS_ONLY)
19 
20 /**
21  * @brief Functions to compress a symbolic block.
22  */
23 
24 #include "astcenc_internal.h"
25 #include "astcenc_diagnostic_trace.h"
26 
27 #include <cassert>
28 #ifdef ASTC_CUSTOMIZED_ENABLE
29 AstcCustomizedSoManager g_astcCustomizedSoManager;
30 #endif
31 
32 /**
33  * @brief Merge two planes of endpoints into a single vector.
34  *
35  * @param      ep_plane1          The endpoints for plane 1.
36  * @param      ep_plane2          The endpoints for plane 2.
37  * @param      component_plane2   The color component for plane 2.
38  * @param[out] result             The merged output.
39  */
merge_endpoints(const endpoints & ep_plane1,const endpoints & ep_plane2,unsigned int component_plane2,endpoints & result)40 static void merge_endpoints(
41 	const endpoints& ep_plane1,
42 	const endpoints& ep_plane2,
43 	unsigned int component_plane2,
44 	endpoints& result
45 ) {
46 	unsigned int partition_count = ep_plane1.partition_count;
47 	assert(partition_count == 1);
48 
49 	vmask4 sep_mask = vint4::lane_id() == vint4(component_plane2);
50 
51 	result.partition_count = partition_count;
52 	result.endpt0[0] = select(ep_plane1.endpt0[0], ep_plane2.endpt0[0], sep_mask);
53 	result.endpt1[0] = select(ep_plane1.endpt1[0], ep_plane2.endpt1[0], sep_mask);
54 }
55 
56 /**
57  * @brief Attempt to improve weights given a chosen configuration.
58  *
59  * Given a fixed weight grid decimation and weight value quantization, iterate over all weights (per
60  * partition and per plane) and attempt to improve image quality by moving each weight up by one or
61  * down by one quantization step.
62  *
63  * This is a specialized function which only supports operating on undecimated weight grids,
64  * therefore primarily improving the performance of 4x4 and 5x5 blocks where grid decimation
65  * is needed less often.
66  *
67  * @param      decode_mode   The decode mode (LDR, HDR).
68  * @param      bsd           The block size information.
69  * @param      blk           The image block color data to compress.
70  * @param[out] scb           The symbolic compressed block output.
71  */
realign_weights_undecimated(astcenc_profile decode_mode,const block_size_descriptor & bsd,const image_block & blk,symbolic_compressed_block & scb)72 static bool realign_weights_undecimated(
73 	astcenc_profile decode_mode,
74 	const block_size_descriptor& bsd,
75 	const image_block& blk,
76 	symbolic_compressed_block& scb
77 ) {
78 	// Get the partition descriptor
79 	unsigned int partition_count = scb.partition_count;
80 	const auto& pi = bsd.get_partition_info(partition_count, scb.partition_index);
81 
82 	// Get the quantization table
83 	const block_mode& bm = bsd.get_block_mode(scb.block_mode);
84 	unsigned int weight_quant_level = bm.quant_mode;
85 	const quant_and_transfer_table& qat = quant_and_xfer_tables[weight_quant_level];
86 
87 	unsigned int max_plane = bm.is_dual_plane;
88 	int plane2_component = scb.plane2_component;
89 	vmask4 plane_mask = vint4::lane_id() == vint4(plane2_component);
90 
91 	// Decode the color endpoints
92 	bool rgb_hdr;
93 	bool alpha_hdr;
94 	vint4 endpnt0[BLOCK_MAX_PARTITIONS];
95 	vint4 endpnt1[BLOCK_MAX_PARTITIONS];
96 	vfloat4 endpnt0f[BLOCK_MAX_PARTITIONS];
97 	vfloat4 offset[BLOCK_MAX_PARTITIONS];
98 
99 	promise(partition_count > 0);
100 
101 	for (unsigned int pa_idx = 0; pa_idx < partition_count; pa_idx++)
102 	{
103 		unpack_color_endpoints(decode_mode,
104 		                       scb.color_formats[pa_idx],
105 		                       scb.color_values[pa_idx],
106 		                       rgb_hdr, alpha_hdr,
107 		                       endpnt0[pa_idx],
108 		                       endpnt1[pa_idx]);
109 	}
110 
111 	uint8_t* dec_weights_uquant = scb.weights;
112 	bool adjustments = false;
113 
114 	// For each plane and partition ...
115 	for (unsigned int pl_idx = 0; pl_idx <= max_plane; pl_idx++)
116 	{
117 		for (unsigned int pa_idx = 0; pa_idx < partition_count; pa_idx++)
118 		{
119 			// Compute the endpoint delta for all components in current plane
120 			vint4 epd = endpnt1[pa_idx] - endpnt0[pa_idx];
121 			epd = select(epd, vint4::zero(), plane_mask);
122 
123 			endpnt0f[pa_idx] = int_to_float(endpnt0[pa_idx]);
124 			offset[pa_idx] = int_to_float(epd) * (1.0f / 64.0f);
125 		}
126 
127 		// For each weight compute previous, current, and next errors
128 		promise(bsd.texel_count > 0);
129 		for (unsigned int texel = 0; texel < bsd.texel_count; texel++)
130 		{
131 			int uqw = dec_weights_uquant[texel];
132 
133 			uint32_t prev_and_next = qat.prev_next_values[uqw];
134 			int uqw_down = prev_and_next & 0xFF;
135 			int uqw_up = (prev_and_next >> 8) & 0xFF;
136 
137 			// Interpolate the colors to create the diffs
138 			float weight_base = static_cast<float>(uqw);
139 			float weight_down = static_cast<float>(uqw_down - uqw);
140 			float weight_up = static_cast<float>(uqw_up - uqw);
141 
142 			unsigned int partition = pi.partition_of_texel[texel];
143 			vfloat4 color_offset = offset[partition];
144 			vfloat4 color_base   = endpnt0f[partition];
145 
146 			vfloat4 color = color_base + color_offset * weight_base;
147 			vfloat4 orig_color   = blk.texel(texel);
148 			vfloat4 error_weight = blk.channel_weight;
149 
150 			vfloat4 color_diff      = color - orig_color;
151 			vfloat4 color_diff_down = color_diff + color_offset * weight_down;
152 			vfloat4 color_diff_up   = color_diff + color_offset * weight_up;
153 
154 			float error_base = dot_s(color_diff      * color_diff,      error_weight);
155 			float error_down = dot_s(color_diff_down * color_diff_down, error_weight);
156 			float error_up   = dot_s(color_diff_up   * color_diff_up,   error_weight);
157 
158 			// Check if the prev or next error is better, and if so use it
159 			if ((error_up < error_base) && (error_up < error_down) && (uqw < 64))
160 			{
161 				dec_weights_uquant[texel] = static_cast<uint8_t>(uqw_up);
162 				adjustments = true;
163 			}
164 			else if ((error_down < error_base) && (uqw > 0))
165 			{
166 				dec_weights_uquant[texel] = static_cast<uint8_t>(uqw_down);
167 				adjustments = true;
168 			}
169 		}
170 
171 		// Prepare iteration for plane 2
172 		dec_weights_uquant += WEIGHTS_PLANE2_OFFSET;
173 		plane_mask = ~plane_mask;
174 	}
175 
176 	return adjustments;
177 }
178 
179 /**
180  * @brief Attempt to improve weights given a chosen configuration.
181  *
182  * Given a fixed weight grid decimation and weight value quantization, iterate over all weights (per
183  * partition and per plane) and attempt to improve image quality by moving each weight up by one or
184  * down by one quantization step.
185  *
186  * @param      decode_mode   The decode mode (LDR, HDR).
187  * @param      bsd           The block size information.
188  * @param      blk           The image block color data to compress.
189  * @param[out] scb           The symbolic compressed block output.
190  */
realign_weights_decimated(astcenc_profile decode_mode,const block_size_descriptor & bsd,const image_block & blk,symbolic_compressed_block & scb)191 static bool realign_weights_decimated(
192 	astcenc_profile decode_mode,
193 	const block_size_descriptor& bsd,
194 	const image_block& blk,
195 	symbolic_compressed_block& scb
196 ) {
197 	// Get the partition descriptor
198 	unsigned int partition_count = scb.partition_count;
199 	const auto& pi = bsd.get_partition_info(partition_count, scb.partition_index);
200 
201 	// Get the quantization table
202 	const block_mode& bm = bsd.get_block_mode(scb.block_mode);
203 	unsigned int weight_quant_level = bm.quant_mode;
204 	const quant_and_transfer_table& qat = quant_and_xfer_tables[weight_quant_level];
205 
206 	// Get the decimation table
207 	const decimation_info& di = bsd.get_decimation_info(bm.decimation_mode);
208 	unsigned int weight_count = di.weight_count;
209 	assert(weight_count != bsd.texel_count);
210 
211 	unsigned int max_plane = bm.is_dual_plane;
212 	int plane2_component = scb.plane2_component;
213 	vmask4 plane_mask = vint4::lane_id() == vint4(plane2_component);
214 
215 	// Decode the color endpoints
216 	bool rgb_hdr;
217 	bool alpha_hdr;
218 	vint4 endpnt0[BLOCK_MAX_PARTITIONS];
219 	vint4 endpnt1[BLOCK_MAX_PARTITIONS];
220 	vfloat4 endpnt0f[BLOCK_MAX_PARTITIONS];
221 	vfloat4 offset[BLOCK_MAX_PARTITIONS];
222 
223 	promise(partition_count > 0);
224 	promise(weight_count > 0);
225 
226 	for (unsigned int pa_idx = 0; pa_idx < partition_count; pa_idx++)
227 	{
228 		unpack_color_endpoints(decode_mode,
229 		                       scb.color_formats[pa_idx],
230 		                       scb.color_values[pa_idx],
231 		                       rgb_hdr, alpha_hdr,
232 		                       endpnt0[pa_idx],
233 		                       endpnt1[pa_idx]);
234 	}
235 
236 	uint8_t* dec_weights_uquant = scb.weights;
237 	bool adjustments = false;
238 
239 	// For each plane and partition ...
240 	for (unsigned int pl_idx = 0; pl_idx <= max_plane; pl_idx++)
241 	{
242 		for (unsigned int pa_idx = 0; pa_idx < partition_count; pa_idx++)
243 		{
244 			// Compute the endpoint delta for all components in current plane
245 			vint4 epd = endpnt1[pa_idx] - endpnt0[pa_idx];
246 			epd = select(epd, vint4::zero(), plane_mask);
247 
248 			endpnt0f[pa_idx] = int_to_float(endpnt0[pa_idx]);
249 			offset[pa_idx] = int_to_float(epd) * (1.0f / 64.0f);
250 		}
251 
252 		// Create an unquantized weight grid for this decimation level
253 		ASTCENC_ALIGNAS float uq_weightsf[BLOCK_MAX_WEIGHTS];
254 		for (unsigned int we_idx = 0; we_idx < weight_count; we_idx += ASTCENC_SIMD_WIDTH)
255 		{
256 			vint unquant_value(dec_weights_uquant + we_idx);
257 			vfloat unquant_valuef = int_to_float(unquant_value);
258 			storea(unquant_valuef, uq_weightsf + we_idx);
259 		}
260 
261 		// For each weight compute previous, current, and next errors
262 		for (unsigned int we_idx = 0; we_idx < weight_count; we_idx++)
263 		{
264 			int uqw = dec_weights_uquant[we_idx];
265 			uint32_t prev_and_next = qat.prev_next_values[uqw];
266 
267 			float uqw_base = uq_weightsf[we_idx];
268 			float uqw_down = static_cast<float>(prev_and_next & 0xFF);
269 			float uqw_up = static_cast<float>((prev_and_next >> 8) & 0xFF);
270 
271 			float uqw_diff_down = uqw_down - uqw_base;
272 			float uqw_diff_up = uqw_up - uqw_base;
273 
274 			vfloat4 error_basev = vfloat4::zero();
275 			vfloat4 error_downv = vfloat4::zero();
276 			vfloat4 error_upv = vfloat4::zero();
277 
278 			// Interpolate the colors to create the diffs
279 			unsigned int texels_to_evaluate = di.weight_texel_count[we_idx];
280 			promise(texels_to_evaluate > 0);
281 			for (unsigned int te_idx = 0; te_idx < texels_to_evaluate; te_idx++)
282 			{
283 				unsigned int texel = di.weight_texels_tr[te_idx][we_idx];
284 
285 				float tw_base = di.texel_contrib_for_weight[te_idx][we_idx];
286 
287 				float weight_base = (uq_weightsf[di.texel_weights_tr[0][texel]] * di.texel_weight_contribs_float_tr[0][texel]
288 				                   + uq_weightsf[di.texel_weights_tr[1][texel]] * di.texel_weight_contribs_float_tr[1][texel])
289 					              + (uq_weightsf[di.texel_weights_tr[2][texel]] * di.texel_weight_contribs_float_tr[2][texel]
290 				                   + uq_weightsf[di.texel_weights_tr[3][texel]] * di.texel_weight_contribs_float_tr[3][texel]);
291 
292 				// Ideally this is integer rounded, but IQ gain it isn't worth the overhead
293 				// float weight = astc::flt_rd(weight_base + 0.5f);
294 				// float weight_down = astc::flt_rd(weight_base + 0.5f + uqw_diff_down * tw_base) - weight;
295 				// float weight_up = astc::flt_rd(weight_base + 0.5f + uqw_diff_up * tw_base) - weight;
296 				float weight_down = weight_base + uqw_diff_down * tw_base - weight_base;
297 				float weight_up = weight_base + uqw_diff_up * tw_base - weight_base;
298 
299 				unsigned int partition = pi.partition_of_texel[texel];
300 				vfloat4 color_offset = offset[partition];
301 				vfloat4 color_base   = endpnt0f[partition];
302 
303 				vfloat4 color = color_base + color_offset * weight_base;
304 				vfloat4 orig_color = blk.texel(texel);
305 
306 				vfloat4 color_diff      = color - orig_color;
307 				vfloat4 color_down_diff = color_diff + color_offset * weight_down;
308 				vfloat4 color_up_diff   = color_diff + color_offset * weight_up;
309 
310 				error_basev += color_diff * color_diff;
311 				error_downv += color_down_diff * color_down_diff;
312 				error_upv   += color_up_diff * color_up_diff;
313 			}
314 
315 			vfloat4 error_weight = blk.channel_weight;
316 			float error_base = hadd_s(error_basev * error_weight);
317 			float error_down = hadd_s(error_downv * error_weight);
318 			float error_up   = hadd_s(error_upv   * error_weight);
319 
320 			// Check if the prev or next error is better, and if so use it
321 			if ((error_up < error_base) && (error_up < error_down) && (uqw < 64))
322 			{
323 				uq_weightsf[we_idx] = uqw_up;
324 				dec_weights_uquant[we_idx] = static_cast<uint8_t>(uqw_up);
325 				adjustments = true;
326 			}
327 			else if ((error_down < error_base) && (uqw > 0))
328 			{
329 				uq_weightsf[we_idx] = uqw_down;
330 				dec_weights_uquant[we_idx] = static_cast<uint8_t>(uqw_down);
331 				adjustments = true;
332 			}
333 		}
334 
335 		// Prepare iteration for plane 2
336 		dec_weights_uquant += WEIGHTS_PLANE2_OFFSET;
337 		plane_mask = ~plane_mask;
338 	}
339 
340 	return adjustments;
341 }
342 
/**
 * @brief Compress a block using a chosen partitioning and 1 plane of weights.
 *
 * The function computes ideal endpoints and weights, quantizes the weights for every usable
 * single-plane block mode, asks @c compute_ideal_endpoint_formats() for a list of candidate
 * encodings, and then trials each candidate with up to @c config.tune_refinement_limit
 * refinement iterations. A candidate is copied into @c scb whenever its error is lower than the
 * best error currently recorded for @c scb.
 *
 * @param         privateProfile            The quality profile, forwarded to the endpoint format
 *                                          selection and color packing. If it is
 *                                          @c HIGH_SPEED_PROFILE the candidate is stored without
 *                                          any error evaluation or weight realignment.
 * @param         config                    The compressor configuration.
 * @param         bsd                       The block size information.
 * @param         blk                       The image block color data to compress.
 * @param         only_always               True if we only use "always" percentile block modes.
 * @param         tune_errorval_threshold   The error value threshold; a stored candidate with an
 *                                          error below this ends the search early.
 * @param         partition_count           The partition count.
 * @param         partition_index           The partition index if @c partition_count is 2-4.
 * @param[in,out] scb                       The symbolic compressed block output. On entry its
 *                                          @c errorval is the error to beat.
 * @param[out]    tmpbuf                    Working buffers; this function writes the ideal
 *                                          endpoints and weights, the per-mode weight ranges,
 *                                          quantized weights, bit counts and errors into it.
 * @param         quant_limit               Upper limit for the weight quantization level; the
 *                                          effective limit is the lower of this and @c QUANT_32.
 *
 * @return The lowest error seen for any trialled candidate, whether or not it was stored to
 *         @c scb, or @c ERROR_CALC_DEFAULT if no error was evaluated.
 */
static float compress_symbolic_block_for_partition_1plane(
	QualityProfile privateProfile,
	const astcenc_config& config,
	const block_size_descriptor& bsd,
	const image_block& blk,
	bool only_always,
	float tune_errorval_threshold,
	unsigned int partition_count,
	unsigned int partition_index,
	symbolic_compressed_block& scb,
	compression_working_buffers& tmpbuf,
	int quant_limit
) {
	promise(partition_count > 0);
	promise(config.tune_candidate_limit > 0);
	promise(config.tune_refinement_limit > 0);

	// Never consider weight quantization above QUANT_32, whatever the caller allows
	int max_weight_quant = astc::min(static_cast<int>(QUANT_32), quant_limit);

	// Pick the error function; a dedicated variant is used for single partition blocks unless
	// the RGBM mapping flag is set
	auto compute_difference = &compute_symbolic_block_difference_1plane;
	if ((partition_count == 1) && !(config.flags & ASTCENC_FLG_MAP_RGBM))
	{
		compute_difference = &compute_symbolic_block_difference_1plane_1partition;
	}

	const auto& pi = bsd.get_partition_info(partition_count, partition_index);

	// Compute ideal weights and endpoint colors, with no quantization or decimation
	endpoints_and_weights& ei = tmpbuf.ei1;
	compute_ideal_colors_and_weights_1plane(blk, pi, ei);

	// Compute ideal weights and endpoint colors for every decimation
	float* dec_weights_ideal = tmpbuf.dec_weights_ideal;
	uint8_t* dec_weights_uquant = tmpbuf.dec_weights_uquant;

	// For each decimation mode, compute an ideal set of weights with no quantization
	unsigned int max_decimation_modes = only_always ? bsd.decimation_mode_count_always
	                                                : bsd.decimation_mode_count_selected;
	promise(max_decimation_modes > 0);
	for (unsigned int i = 0; i < max_decimation_modes; i++)
	{
		// Skip decimation modes that are not referenced for 1 plane at this quant limit
		const auto& dm = bsd.get_decimation_mode(i);
		if (!dm.is_ref_1plane(static_cast<quant_method>(max_weight_quant)))
		{
			continue;
		}

		const auto& di = bsd.get_decimation_info(i);

		// Each decimation mode owns a BLOCK_MAX_WEIGHTS sized slice of the ideal weight buffer
		compute_ideal_weights_for_decimation(
		    ei,
		    di,
		    dec_weights_ideal + i * BLOCK_MAX_WEIGHTS);
	}

	// Compute maximum colors for the endpoints and ideal weights, then for each endpoint and ideal
	// weight pair, compute the smallest weight that will result in a color value greater than 1
	vfloat4 min_ep(10.0f);
	for (unsigned int i = 0; i < partition_count; i++)
	{
		vfloat4 ep = (vfloat4(1.0f) - ei.ep.endpt0[i]) / (ei.ep.endpt1[i] - ei.ep.endpt0[i]);

		// Keep the smallest per-lane value seen so far, ignoring values of 0.5 or below
		vmask4 use_ep = (ep > vfloat4(0.5f)) & (ep < min_ep);
		min_ep = select(min_ep, ep, use_ep);
	}

	float min_wt_cutoff = hmin_s(min_ep);

	// For each mode, use the angular method to compute a shift
	compute_angular_endpoints_1plane(
	    only_always, bsd, dec_weights_ideal, max_weight_quant, tmpbuf);

	float* weight_low_value = tmpbuf.weight_low_value1;
	float* weight_high_value = tmpbuf.weight_high_value1;
	int8_t* qwt_bitcounts = tmpbuf.qwt_bitcounts;
	float* qwt_errors = tmpbuf.qwt_errors;

	// For each mode (which specifies a decimation and a quantization):
	//     * Compute number of bits needed for the quantized weights
	//     * Generate an optimized set of quantized weights
	//     * Compute quantization errors for the mode


	// Bit budget before the weight bits are subtracted, indexed by (partition_count - 1)
	static const int8_t free_bits_for_partition_count[4] {
		115 - 4, 111 - 4 - PARTITION_INDEX_BITS, 108 - 4 - PARTITION_INDEX_BITS, 105 - 4 - PARTITION_INDEX_BITS
	};

	unsigned int max_block_modes = only_always ? bsd.block_mode_count_1plane_always
	                                           : bsd.block_mode_count_1plane_selected;
	promise(max_block_modes > 0);
	for (unsigned int i = 0; i < max_block_modes; i++)
	{
		const block_mode& bm = bsd.block_modes[i];

		// Mode exceeds the weight quantization limit; give it a huge error, presumably so that
		// the endpoint format selection never picks it
		if (bm.quant_mode > max_weight_quant)
		{
			qwt_errors[i] = 1e38f;
			continue;
		}

		// No bits would be left once the weights are stored; flag the mode the same way
		assert(!bm.is_dual_plane);
		int bitcount = free_bits_for_partition_count[partition_count - 1] - bm.weight_bits;
		if (bitcount <= 0)
		{
			qwt_errors[i] = 1e38f;
			continue;
		}

		// Force the high weight value to 1.0 if it is more than 2% above the cutoff
		if (weight_high_value[i] > 1.02f * min_wt_cutoff)
		{
			weight_high_value[i] = 1.0f;
		}

		int decimation_mode = bm.decimation_mode;
		const auto& di = bsd.get_decimation_info(decimation_mode);

		qwt_bitcounts[i] = static_cast<int8_t>(bitcount);

		ASTCENC_ALIGNAS float dec_weights_uquantf[BLOCK_MAX_WEIGHTS];

		// Generate the optimized set of weights for the weight mode; ideal weights are read per
		// decimation mode, while the quantized weights are stored per block mode
		compute_quantized_weights_for_decimation(
		    di,
		    weight_low_value[i], weight_high_value[i],
		    dec_weights_ideal + BLOCK_MAX_WEIGHTS * decimation_mode,
		    dec_weights_uquantf,
		    dec_weights_uquant + BLOCK_MAX_WEIGHTS * i,
		    bm.get_weight_quant_mode());

		// Compute weight quantization errors for the block mode
		qwt_errors[i] = compute_error_of_weight_set_1plane(
		    ei,
		    di,
		    dec_weights_uquantf);
	}

	// Decide the optimal combination of color endpoint encodings and weight encodings
	uint8_t partition_format_specifiers[TUNE_MAX_TRIAL_CANDIDATES][BLOCK_MAX_PARTITIONS];
	int block_mode_index[TUNE_MAX_TRIAL_CANDIDATES];

	quant_method color_quant_level[TUNE_MAX_TRIAL_CANDIDATES];
	quant_method color_quant_level_mod[TUNE_MAX_TRIAL_CANDIDATES];

	unsigned int candidate_count = compute_ideal_endpoint_formats(
	    privateProfile,
	    pi, blk, ei.ep, qwt_bitcounts, qwt_errors,
	    config.tune_candidate_limit, 0, max_block_modes,
	    partition_format_specifiers, block_mode_index,
	    color_quant_level, color_quant_level_mod, tmpbuf);

	// Iterate over the N believed-to-be-best modes to find out which one is actually best
	// best_errorval_in_mode tracks every trial; best_errorval_in_scb tracks what is stored in scb
	float best_errorval_in_mode = ERROR_CALC_DEFAULT;
	float best_errorval_in_scb = scb.errorval;

	for (unsigned int i = 0; i < candidate_count; i++)
	{
		TRACE_NODE(node0, "candidate");

		const int bm_packed_index = block_mode_index[i];
		assert(bm_packed_index >= 0 && bm_packed_index < static_cast<int>(bsd.block_mode_count_1plane_selected));
		const block_mode& qw_bm = bsd.block_modes[bm_packed_index];

		int decimation_mode = qw_bm.decimation_mode;
		const auto& di = bsd.get_decimation_info(decimation_mode);
		promise(di.weight_count > 0);

		trace_add_data("weight_x", di.weight_x);
		trace_add_data("weight_y", di.weight_y);
		trace_add_data("weight_z", di.weight_z);
		trace_add_data("weight_quant", qw_bm.quant_mode);

		// Recompute the ideal color endpoints before storing them
		vfloat4 rgbs_colors[BLOCK_MAX_PARTITIONS];
		vfloat4 rgbo_colors[BLOCK_MAX_PARTITIONS];

		// Work on a scratch block and a private copy of the endpoints; scb is only written when
		// this candidate is accepted
		symbolic_compressed_block workscb;
		endpoints workep = ei.ep;

		// Seed the scratch block with the quantized weights computed for this block mode
		uint8_t* u8_weight_src = dec_weights_uquant + BLOCK_MAX_WEIGHTS * bm_packed_index;

		for (unsigned int j = 0; j < di.weight_count; j++)
		{
			workscb.weights[j] = u8_weight_src[j];
		}

		for (unsigned int l = 0; l < config.tune_refinement_limit; l++)
		{
			recompute_ideal_colors_1plane(
			    blk, pi, di, workscb.weights,
			    workep, rgbs_colors, rgbo_colors);

			// Quantize the chosen color, tracking if worth trying the mod value
			// (it is only worth trying if the mod quant level differs from the base level)
			bool all_same = color_quant_level[i] != color_quant_level_mod[i];
			for (unsigned int j = 0; j < partition_count; j++)
			{
				workscb.color_formats[j] = pack_color_endpoints(
				    privateProfile,
				    workep.endpt0[j],
				    workep.endpt1[j],
				    rgbs_colors[j],
				    rgbo_colors[j],
				    partition_format_specifiers[i][j],
				    workscb.color_values[j],
				    color_quant_level[i]);

				all_same = all_same && workscb.color_formats[j] == workscb.color_formats[0];
			}

			// If all the color endpoint modes are the same, we get a few more bits to store colors;
			// let's see if we can take advantage of this: requantize all the colors and see if the
			// endpoint modes remain the same.
			workscb.color_formats_matched = 0;
			if (partition_count >= 2 && all_same)
			{
				uint8_t colorvals[BLOCK_MAX_PARTITIONS][8];
				uint8_t color_formats_mod[BLOCK_MAX_PARTITIONS] { 0 };
				bool all_same_mod = true;
				for (unsigned int j = 0; j < partition_count; j++)
				{
					color_formats_mod[j] = pack_color_endpoints(
					    privateProfile,
					    workep.endpt0[j],
					    workep.endpt1[j],
					    rgbs_colors[j],
					    rgbo_colors[j],
					    partition_format_specifiers[i][j],
					    colorvals[j],
					    color_quant_level_mod[i]);

					// Early out as soon as it's no longer possible to use mod
					if (color_formats_mod[j] != color_formats_mod[0])
					{
						all_same_mod = false;
						break;
					}
				}

				if (all_same_mod)
				{
					workscb.color_formats_matched = 1;
					// NOTE(review): this copies all BLOCK_MAX_PARTITIONS rows, although only the
					// first partition_count rows of colorvals were written above
					for (unsigned int j = 0; j < BLOCK_MAX_PARTITIONS; j++)
					{
						for (unsigned int k = 0; k < 8; k++)
						{
							workscb.color_values[j][k] = colorvals[j][k];
						}

						workscb.color_formats[j] = color_formats_mod[j];
					}
				}
			}

			// Store header fields
			workscb.partition_count = static_cast<uint8_t>(partition_count);
			workscb.partition_index = static_cast<uint16_t>(partition_index);
			workscb.plane2_component = -1;
			workscb.quant_mode = workscb.color_formats_matched ? color_quant_level_mod[i] : color_quant_level[i];
			workscb.block_mode = qw_bm.mode_index;
			workscb.block_type = SYM_BTYPE_NONCONST;
			// High speed profile: store the candidate as-is with a zero error, skipping the
			// error evaluation and weight realignment below.
			// NOTE(review): the break only leaves the refinement loop, so any later candidate
			// overwrites scb again - confirm this is intended
			if (privateProfile == HIGH_SPEED_PROFILE)
			{
				workscb.errorval = 0;
				scb = workscb;
				break;
			}
			// Pre-realign test
			if (l == 0)
			{
				// A result of -ERROR_CALC_DEFAULT is treated as an error marker: the sign is
				// flipped and the block is tagged as an error block
				float errorval = compute_difference(config, bsd, workscb, blk);
				if (errorval == -ERROR_CALC_DEFAULT)
				{
					errorval = -errorval;
					workscb.block_type = SYM_BTYPE_ERROR;
				}

				trace_add_data("error_prerealign", errorval);
				best_errorval_in_mode = astc::min(errorval, best_errorval_in_mode);

				// Average refinement improvement is 3.5% per iteration (allow 4.5%), but the first
				// iteration can help more so we give it a extra 8% leeway. Use this knowledge to
				// drive a heuristic to skip blocks that are unlikely to catch up with the best
				// block we have already.
				unsigned int iters_remaining = config.tune_refinement_limit - l;
				float threshold = (0.045f * static_cast<float>(iters_remaining)) + 1.08f;
				if (errorval > (threshold * best_errorval_in_scb))
				{
					break;
				}

				if (errorval < best_errorval_in_scb)
				{
					best_errorval_in_scb = errorval;
					workscb.errorval = errorval;
					scb = workscb;

					if (errorval < tune_errorval_threshold)
					{
						// Skip remaining candidates - this is "good enough"
						i = candidate_count;
						break;
					}
				}
			}

			// Nudge the weights; the decimated and undecimated grids have dedicated routines
			bool adjustments;
			if (di.weight_count != bsd.texel_count)
			{
				adjustments = realign_weights_decimated(
					config.profile, bsd, blk, workscb);
			}
			else
			{
				adjustments = realign_weights_undecimated(
					config.profile, bsd, blk, workscb);
			}

			// Post-realign test
			float errorval = compute_difference(config, bsd, workscb, blk);
			if (errorval == -ERROR_CALC_DEFAULT)
			{
				errorval = -errorval;
				workscb.block_type = SYM_BTYPE_ERROR;
			}

			trace_add_data("error_postrealign", errorval);
			best_errorval_in_mode = astc::min(errorval, best_errorval_in_mode);

			// Average refinement improvement is 3.5% per iteration, so skip blocks that are
			// unlikely to catch up with the best block we have already. Assume a 4.5% per step to
			// give benefit of the doubt ...
			unsigned int iters_remaining = config.tune_refinement_limit - 1 - l;
			float threshold = (0.045f * static_cast<float>(iters_remaining)) + 1.0f;
			if (errorval > (threshold * best_errorval_in_scb))
			{
				break;
			}

			if (errorval < best_errorval_in_scb)
			{
				best_errorval_in_scb = errorval;
				workscb.errorval = errorval;
				scb = workscb;

				if (errorval < tune_errorval_threshold)
				{
					// Skip remaining candidates - this is "good enough"
					i = candidate_count;
					break;
				}
			}

			// No weight moved, so another iteration would repeat the same work
			if (!adjustments)
			{
				break;
			}
		}
	}

	return best_errorval_in_mode;
}
716 
717 /**
718  * @brief Compress a block using a chosen partitioning and 2 planes of weights.
719  *
720  * @param      config                    The compressor configuration.
721  * @param      bsd                       The block size information.
722  * @param      blk                       The image block color data to compress.
723  * @param      tune_errorval_threshold   The error value threshold.
724  * @param      plane2_component          The component index for the second plane of weights.
725  * @param[out] scb                       The symbolic compressed block output.
726  * @param[out] tmpbuf                    The quantized weights for plane 1.
727  */
compress_symbolic_block_for_partition_2planes(QualityProfile privateProfile,const astcenc_config & config,const block_size_descriptor & bsd,const image_block & blk,float tune_errorval_threshold,unsigned int plane2_component,symbolic_compressed_block & scb,compression_working_buffers & tmpbuf,int quant_limit)728 static float compress_symbolic_block_for_partition_2planes(
729 	QualityProfile privateProfile,
730 	const astcenc_config& config,
731 	const block_size_descriptor& bsd,
732 	const image_block& blk,
733 	float tune_errorval_threshold,
734 	unsigned int plane2_component,
735 	symbolic_compressed_block& scb,
736 	compression_working_buffers& tmpbuf,
737 	int quant_limit
738 ) {
739 	promise(config.tune_candidate_limit > 0);
740 	promise(config.tune_refinement_limit > 0);
741 	promise(bsd.decimation_mode_count_selected > 0);
742 
743 	int max_weight_quant = astc::min(static_cast<int>(QUANT_32), quant_limit);
744 
745 	// Compute ideal weights and endpoint colors, with no quantization or decimation
746 	endpoints_and_weights& ei1 = tmpbuf.ei1;
747 	endpoints_and_weights& ei2 = tmpbuf.ei2;
748 
749 	compute_ideal_colors_and_weights_2planes(bsd, blk, plane2_component, ei1, ei2);
750 
751 	// Compute ideal weights and endpoint colors for every decimation
752 	float* dec_weights_ideal = tmpbuf.dec_weights_ideal;
753 	uint8_t* dec_weights_uquant = tmpbuf.dec_weights_uquant;
754 
755 	// For each decimation mode, compute an ideal set of weights with no quantization
756 	for (unsigned int i = 0; i < bsd.decimation_mode_count_selected; i++)
757 	{
758 		const auto& dm = bsd.get_decimation_mode(i);
759 		if (!dm.is_ref_2plane(static_cast<quant_method>(max_weight_quant)))
760 		{
761 			continue;
762 		}
763 
764 		const auto& di = bsd.get_decimation_info(i);
765 
766 		compute_ideal_weights_for_decimation(
767 		    ei1,
768 		    di,
769 		    dec_weights_ideal + i * BLOCK_MAX_WEIGHTS);
770 
771 		compute_ideal_weights_for_decimation(
772 		    ei2,
773 		    di,
774 		    dec_weights_ideal + i * BLOCK_MAX_WEIGHTS + WEIGHTS_PLANE2_OFFSET);
775 	}
776 
777 	// Compute maximum colors for the endpoints and ideal weights, then for each endpoint and ideal
778 	// weight pair, compute the smallest weight that will result in a color value greater than 1
779 	vfloat4 min_ep1(10.0f);
780 	vfloat4 min_ep2(10.0f);
781 
782 	vfloat4 ep1 = (vfloat4(1.0f) - ei1.ep.endpt0[0]) / (ei1.ep.endpt1[0] - ei1.ep.endpt0[0]);
783 	vmask4 use_ep1 = (ep1 > vfloat4(0.5f)) & (ep1 < min_ep1);
784 	min_ep1 = select(min_ep1, ep1, use_ep1);
785 
786 	vfloat4 ep2 = (vfloat4(1.0f) - ei2.ep.endpt0[0]) / (ei2.ep.endpt1[0] - ei2.ep.endpt0[0]);
787 	vmask4 use_ep2 = (ep2 > vfloat4(0.5f)) & (ep2 < min_ep2);
788 	min_ep2 = select(min_ep2, ep2, use_ep2);
789 
790 	vfloat4 err_max(ERROR_CALC_DEFAULT);
791 	vmask4 err_mask = vint4::lane_id() == vint4(plane2_component);
792 
793 	// Set the plane2 component to max error in ep1
794 	min_ep1 = select(min_ep1, err_max, err_mask);
795 
796 	float min_wt_cutoff1 = hmin_s(min_ep1);
797 
798 	// Set the minwt2 to the plane2 component min in ep2
799 	float min_wt_cutoff2 = hmin_s(select(err_max, min_ep2, err_mask));
800 
801 	compute_angular_endpoints_2planes(
802 	    bsd, dec_weights_ideal, max_weight_quant, tmpbuf);
803 
804 	// For each mode (which specifies a decimation and a quantization):
805 	//     * Compute number of bits needed for the quantized weights
806 	//     * Generate an optimized set of quantized weights
807 	//     * Compute quantization errors for the mode
808 
809 	float* weight_low_value1 = tmpbuf.weight_low_value1;
810 	float* weight_high_value1 = tmpbuf.weight_high_value1;
811 	float* weight_low_value2 = tmpbuf.weight_low_value2;
812 	float* weight_high_value2 = tmpbuf.weight_high_value2;
813 
814 	int8_t* qwt_bitcounts = tmpbuf.qwt_bitcounts;
815 	float* qwt_errors = tmpbuf.qwt_errors;
816 
817 	unsigned int start_2plane = bsd.block_mode_count_1plane_selected;
818 	unsigned int end_2plane = bsd.block_mode_count_1plane_2plane_selected;
819 
820 	for (unsigned int i = start_2plane; i < end_2plane; i++)
821 	{
822 		const block_mode& bm = bsd.block_modes[i];
823 		assert(bm.is_dual_plane);
824 
825 		if (bm.quant_mode > max_weight_quant)
826 		{
827 			qwt_errors[i] = 1e38f;
828 			continue;
829 		}
830 
831 		qwt_bitcounts[i] = static_cast<int8_t>(109 - bm.weight_bits);
832 
833 		if (weight_high_value1[i] > 1.02f * min_wt_cutoff1)
834 		{
835 			weight_high_value1[i] = 1.0f;
836 		}
837 
838 		if (weight_high_value2[i] > 1.02f * min_wt_cutoff2)
839 		{
840 			weight_high_value2[i] = 1.0f;
841 		}
842 
843 		unsigned int decimation_mode = bm.decimation_mode;
844 		const auto& di = bsd.get_decimation_info(decimation_mode);
845 
846 		ASTCENC_ALIGNAS float dec_weights_uquantf[BLOCK_MAX_WEIGHTS];
847 
848 		// Generate the optimized set of weights for the mode
849 		compute_quantized_weights_for_decimation(
850 		    di,
851 		    weight_low_value1[i],
852 		    weight_high_value1[i],
853 		    dec_weights_ideal + BLOCK_MAX_WEIGHTS * decimation_mode,
854 		    dec_weights_uquantf,
855 		    dec_weights_uquant + BLOCK_MAX_WEIGHTS * i,
856 		    bm.get_weight_quant_mode());
857 
858 		compute_quantized_weights_for_decimation(
859 		    di,
860 		    weight_low_value2[i],
861 		    weight_high_value2[i],
862 		    dec_weights_ideal + BLOCK_MAX_WEIGHTS * decimation_mode + WEIGHTS_PLANE2_OFFSET,
863 		    dec_weights_uquantf + WEIGHTS_PLANE2_OFFSET,
864 		    dec_weights_uquant + BLOCK_MAX_WEIGHTS * i + WEIGHTS_PLANE2_OFFSET,
865 		    bm.get_weight_quant_mode());
866 
867 		// Compute weight quantization errors for the block mode
868 		qwt_errors[i] = compute_error_of_weight_set_2planes(
869 		    ei1,
870 		    ei2,
871 		    di,
872 		    dec_weights_uquantf,
873 		    dec_weights_uquantf + WEIGHTS_PLANE2_OFFSET);
874 	}
875 
876 	// Decide the optimal combination of color endpoint encodings and weight encodings
877 	uint8_t partition_format_specifiers[TUNE_MAX_TRIAL_CANDIDATES][BLOCK_MAX_PARTITIONS];
878 	int block_mode_index[TUNE_MAX_TRIAL_CANDIDATES];
879 
880 	quant_method color_quant_level[TUNE_MAX_TRIAL_CANDIDATES];
881 	quant_method color_quant_level_mod[TUNE_MAX_TRIAL_CANDIDATES];
882 
883 	endpoints epm;
884 	merge_endpoints(ei1.ep, ei2.ep, plane2_component, epm);
885 
886 	const auto& pi = bsd.get_partition_info(1, 0);
887 	unsigned int candidate_count = compute_ideal_endpoint_formats(
888 	    config.privateProfile,
889 	    pi, blk, epm, qwt_bitcounts, qwt_errors,
890 	    config.tune_candidate_limit,
891 		bsd.block_mode_count_1plane_selected, bsd.block_mode_count_1plane_2plane_selected,
892 	    partition_format_specifiers, block_mode_index,
893 	    color_quant_level, color_quant_level_mod, tmpbuf);
894 
895 	// Iterate over the N believed-to-be-best modes to find out which one is actually best
896 	float best_errorval_in_mode = ERROR_CALC_DEFAULT;
897 	float best_errorval_in_scb = scb.errorval;
898 
899 	for (unsigned int i = 0; i < candidate_count; i++)
900 	{
901 		TRACE_NODE(node0, "candidate");
902 
903 		const int bm_packed_index = block_mode_index[i];
904 		assert(bm_packed_index >= static_cast<int>(bsd.block_mode_count_1plane_selected) &&
905 		       bm_packed_index < static_cast<int>(bsd.block_mode_count_1plane_2plane_selected));
906 		const block_mode& qw_bm = bsd.block_modes[bm_packed_index];
907 
908 		int decimation_mode = qw_bm.decimation_mode;
909 		const auto& di = bsd.get_decimation_info(decimation_mode);
910 		promise(di.weight_count > 0);
911 
912 		trace_add_data("weight_x", di.weight_x);
913 		trace_add_data("weight_y", di.weight_y);
914 		trace_add_data("weight_z", di.weight_z);
915 		trace_add_data("weight_quant", qw_bm.quant_mode);
916 
917 		vfloat4 rgbs_color;
918 		vfloat4 rgbo_color;
919 
920 		symbolic_compressed_block workscb;
921 		endpoints workep = epm;
922 
923 		uint8_t* u8_weight1_src = dec_weights_uquant + BLOCK_MAX_WEIGHTS * bm_packed_index;
924 		uint8_t* u8_weight2_src = dec_weights_uquant + BLOCK_MAX_WEIGHTS * bm_packed_index + WEIGHTS_PLANE2_OFFSET;
925 
926 		for (int j = 0; j < di.weight_count; j++)
927 		{
928 			workscb.weights[j] = u8_weight1_src[j];
929 			workscb.weights[j + WEIGHTS_PLANE2_OFFSET] = u8_weight2_src[j];
930 		}
931 
932 		for (unsigned int l = 0; l < config.tune_refinement_limit; l++)
933 		{
934 			recompute_ideal_colors_2planes(
935 			    blk, bsd, di,
936 			    workscb.weights, workscb.weights + WEIGHTS_PLANE2_OFFSET,
937 			    workep, rgbs_color, rgbo_color, plane2_component);
938 
939 			// Quantize the chosen color
940 			workscb.color_formats[0] = pack_color_endpoints(
941 			                               privateProfile,
942 			                               workep.endpt0[0],
943 			                               workep.endpt1[0],
944 			                               rgbs_color, rgbo_color,
945 			                               partition_format_specifiers[i][0],
946 			                               workscb.color_values[0],
947 			                               color_quant_level[i]);
948 
949 			// Store header fields
950 			workscb.partition_count = 1;
951 			workscb.partition_index = 0;
952 			workscb.quant_mode = color_quant_level[i];
953 			workscb.color_formats_matched = 0;
954 			workscb.block_mode = qw_bm.mode_index;
955 			workscb.plane2_component = static_cast<int8_t>(plane2_component);
956 			workscb.block_type = SYM_BTYPE_NONCONST;
957 
958 			// Pre-realign test
959 			if (l == 0)
960 			{
961 				float errorval = compute_symbolic_block_difference_2plane(config, bsd, workscb, blk);
962 				if (errorval == -ERROR_CALC_DEFAULT)
963 				{
964 					errorval = -errorval;
965 					workscb.block_type = SYM_BTYPE_ERROR;
966 				}
967 
968 				trace_add_data("error_prerealign", errorval);
969 				best_errorval_in_mode = astc::min(errorval, best_errorval_in_mode);
970 
971 				// Average refinement improvement is 3.5% per iteration (allow 4.5%), but the first
972 				// iteration can help more so we give it a extra 8% leeway. Use this knowledge to
973 				// drive a heuristic to skip blocks that are unlikely to catch up with the best
974 				// block we have already.
975 				unsigned int iters_remaining = config.tune_refinement_limit - l;
976 				float threshold = (0.045f * static_cast<float>(iters_remaining)) + 1.08f;
977 				if (errorval > (threshold * best_errorval_in_scb))
978 				{
979 					break;
980 				}
981 
982 				if (errorval < best_errorval_in_scb)
983 				{
984 					best_errorval_in_scb = errorval;
985 					workscb.errorval = errorval;
986 					scb = workscb;
987 
988 					if (errorval < tune_errorval_threshold)
989 					{
990 						// Skip remaining candidates - this is "good enough"
991 						i = candidate_count;
992 						break;
993 					}
994 				}
995 			}
996 
997 			// Perform a final pass over the weights to try to improve them.
998 			bool adjustments;
999 			if (di.weight_count != bsd.texel_count)
1000 			{
1001 				adjustments = realign_weights_decimated(
1002 					config.profile, bsd, blk, workscb);
1003 			}
1004 			else
1005 			{
1006 				adjustments = realign_weights_undecimated(
1007 					config.profile, bsd, blk, workscb);
1008 			}
1009 
1010 			// Post-realign test
1011 			float errorval = compute_symbolic_block_difference_2plane(config, bsd, workscb, blk);
1012 			if (errorval == -ERROR_CALC_DEFAULT)
1013 			{
1014 				errorval = -errorval;
1015 				workscb.block_type = SYM_BTYPE_ERROR;
1016 			}
1017 
1018 			trace_add_data("error_postrealign", errorval);
1019 			best_errorval_in_mode = astc::min(errorval, best_errorval_in_mode);
1020 
1021 			// Average refinement improvement is 3.5% per iteration, so skip blocks that are
1022 			// unlikely to catch up with the best block we have already. Assume a 4.5% per step to
1023 			// give benefit of the doubt ...
1024 			unsigned int iters_remaining = config.tune_refinement_limit - 1 - l;
1025 			float threshold = (0.045f * static_cast<float>(iters_remaining)) + 1.0f;
1026 			if (errorval > (threshold * best_errorval_in_scb))
1027 			{
1028 				break;
1029 			}
1030 
1031 			if (errorval < best_errorval_in_scb)
1032 			{
1033 				best_errorval_in_scb = errorval;
1034 				workscb.errorval = errorval;
1035 				scb = workscb;
1036 
1037 				if (errorval < tune_errorval_threshold)
1038 				{
1039 					// Skip remaining candidates - this is "good enough"
1040 					i = candidate_count;
1041 					break;
1042 				}
1043 			}
1044 
1045 			if (!adjustments)
1046 			{
1047 				break;
1048 			}
1049 		}
1050 	}
1051 
1052 	return best_errorval_in_mode;
1053 }
1054 
1055 /**
1056  * @brief Determine the lowest cross-channel correlation factor.
1057  *
1058  * @param texels_per_block   The number of texels in a block.
1059  * @param blk                The image block color data to compress.
1060  *
1061  * @return Return the lowest correlation factor.
1062  */
prepare_block_statistics(int texels_per_block,const image_block & blk)1063 static float prepare_block_statistics(
1064 	int texels_per_block,
1065 	const image_block& blk
1066 ) {
1067 	// Compute covariance matrix, as a collection of 10 scalars that form the upper-triangular row
1068 	// of the matrix. The matrix is symmetric, so this is all we need for this use case.
1069 	float rs = 0.0f;
1070 	float gs = 0.0f;
1071 	float bs = 0.0f;
1072 	float as = 0.0f;
1073 	float rr_var = 0.0f;
1074 	float gg_var = 0.0f;
1075 	float bb_var = 0.0f;
1076 	float aa_var = 0.0f;
1077 	float rg_cov = 0.0f;
1078 	float rb_cov = 0.0f;
1079 	float ra_cov = 0.0f;
1080 	float gb_cov = 0.0f;
1081 	float ga_cov = 0.0f;
1082 	float ba_cov = 0.0f;
1083 
1084 	float weight_sum = 0.0f;
1085 
1086 	promise(texels_per_block > 0);
1087 	for (int i = 0; i < texels_per_block; i++)
1088 	{
1089 		float weight = hadd_s(blk.channel_weight) / 4.0f;
1090 		assert(weight >= 0.0f);
1091 		weight_sum += weight;
1092 
1093 		float r = blk.data_r[i];
1094 		float g = blk.data_g[i];
1095 		float b = blk.data_b[i];
1096 		float a = blk.data_a[i];
1097 
1098 		float rw = r * weight;
1099 		rs += rw;
1100 		rr_var += r * rw;
1101 		rg_cov += g * rw;
1102 		rb_cov += b * rw;
1103 		ra_cov += a * rw;
1104 
1105 		float gw = g * weight;
1106 		gs += gw;
1107 		gg_var += g * gw;
1108 		gb_cov += b * gw;
1109 		ga_cov += a * gw;
1110 
1111 		float bw = b * weight;
1112 		bs += bw;
1113 		bb_var += b * bw;
1114 		ba_cov += a * bw;
1115 
1116 		float aw = a * weight;
1117 		as += aw;
1118 		aa_var += a * aw;
1119 	}
1120 
1121 	float rpt = 1.0f / astc::max(weight_sum, 1e-7f);
1122 
1123 	rr_var -= rs * (rs * rpt);
1124 	rg_cov -= gs * (rs * rpt);
1125 	rb_cov -= bs * (rs * rpt);
1126 	ra_cov -= as * (rs * rpt);
1127 
1128 	gg_var -= gs * (gs * rpt);
1129 	gb_cov -= bs * (gs * rpt);
1130 	ga_cov -= as * (gs * rpt);
1131 
1132 	bb_var -= bs * (bs * rpt);
1133 	ba_cov -= as * (bs * rpt);
1134 
1135 	aa_var -= as * (as * rpt);
1136 
1137 	// These will give a NaN if a channel is constant - these are fixed up in the next step
1138 	rg_cov *= astc::rsqrt(rr_var * gg_var);
1139 	rb_cov *= astc::rsqrt(rr_var * bb_var);
1140 	ra_cov *= astc::rsqrt(rr_var * aa_var);
1141 	gb_cov *= astc::rsqrt(gg_var * bb_var);
1142 	ga_cov *= astc::rsqrt(gg_var * aa_var);
1143 	ba_cov *= astc::rsqrt(bb_var * aa_var);
1144 
1145 	if (astc::isnan(rg_cov)) rg_cov = 1.0f;
1146 	if (astc::isnan(rb_cov)) rb_cov = 1.0f;
1147 	if (astc::isnan(ra_cov)) ra_cov = 1.0f;
1148 	if (astc::isnan(gb_cov)) gb_cov = 1.0f;
1149 	if (astc::isnan(ga_cov)) ga_cov = 1.0f;
1150 	if (astc::isnan(ba_cov)) ba_cov = 1.0f;
1151 
1152 	float lowest_correlation = astc::min(fabsf(rg_cov),      fabsf(rb_cov));
1153 	lowest_correlation       = astc::min(lowest_correlation, fabsf(ra_cov));
1154 	lowest_correlation       = astc::min(lowest_correlation, fabsf(gb_cov));
1155 	lowest_correlation       = astc::min(lowest_correlation, fabsf(ga_cov));
1156 	lowest_correlation       = astc::min(lowest_correlation, fabsf(ba_cov));
1157 
1158 	// Diagnostic trace points
1159 	trace_add_data("min_r", blk.data_min.lane<0>());
1160 	trace_add_data("max_r", blk.data_max.lane<0>());
1161 	trace_add_data("min_g", blk.data_min.lane<1>());
1162 	trace_add_data("max_g", blk.data_max.lane<1>());
1163 	trace_add_data("min_b", blk.data_min.lane<2>());
1164 	trace_add_data("max_b", blk.data_max.lane<2>());
1165 	trace_add_data("min_a", blk.data_min.lane<3>());
1166 	trace_add_data("max_a", blk.data_max.lane<3>());
1167 	trace_add_data("cov_rg", fabsf(rg_cov));
1168 	trace_add_data("cov_rb", fabsf(rb_cov));
1169 	trace_add_data("cov_ra", fabsf(ra_cov));
1170 	trace_add_data("cov_gb", fabsf(gb_cov));
1171 	trace_add_data("cov_ga", fabsf(ga_cov));
1172 	trace_add_data("cov_ba", fabsf(ba_cov));
1173 
1174 	return lowest_correlation;
1175 }
1176 
1177 /* See header for documentation. */
compress_block(const astcenc_contexti & ctx,const image_block & blk,uint8_t pcb[16],compression_working_buffers & tmpbuf,bool calQualityEnable,int32_t * mseBlock[RGBA_COM])1178 void compress_block(
1179 	const astcenc_contexti& ctx,
1180 	const image_block& blk,
1181 	uint8_t pcb[16],
1182 #if QUALITY_CONTROL
1183 	compression_working_buffers& tmpbuf,
1184 	bool calQualityEnable,
1185 	int32_t *mseBlock[RGBA_COM]
1186 #else
1187 	compression_working_buffers& tmpbuf
1188 #endif
1189 	)
1190 {
1191 	astcenc_profile decode_mode = ctx.config.profile;
1192 	symbolic_compressed_block scb;
1193 	const block_size_descriptor& bsd = *ctx.bsd;
1194 	float lowest_correl;
1195 
1196 	TRACE_NODE(node0, "block");
1197 	trace_add_data("pos_x", blk.xpos);
1198 	trace_add_data("pos_y", blk.ypos);
1199 	trace_add_data("pos_z", blk.zpos);
1200 
1201 	// Set stricter block targets for luminance data as we have more bits to play with
1202 	bool block_is_l = blk.is_luminance();
1203 	float block_is_l_scale = block_is_l ? 1.0f / 1.5f : 1.0f;
1204 
1205 	// Set slightly stricter block targets for lumalpha data as we have more bits to play with
1206 	bool block_is_la = blk.is_luminancealpha();
1207 	float block_is_la_scale = block_is_la ? 1.0f / 1.05f : 1.0f;
1208 
1209 	bool block_skip_two_plane = false;
1210 	int max_partitions;
1211 	if (ctx.config.privateProfile == HIGH_SPEED_PROFILE)
1212 	{
1213 		max_partitions = 1;
1214 	}
1215 #ifdef ASTC_CUSTOMIZED_ENABLE
1216 	else if (ctx.config.privateProfile == CUSTOMIZED_PROFILE)
1217 	{
1218 		if (!g_astcCustomizedSoManager.LoadSutCustomizedSo() ||
1219 			g_astcCustomizedSoManager.customizedMaxPartitionsFunc_ == nullptr)
1220 		{
1221 			printf("astcenc customized so dlopen failed or customizedMaxPartitionsFunc_ is nullptr!\n");
1222 			return;
1223 		}
1224 		max_partitions = g_astcCustomizedSoManager.customizedMaxPartitionsFunc_();
1225 	}
1226 #endif
1227 	else
1228 	{
1229 		max_partitions = ctx.config.tune_partition_count_limit;
1230 	}
1231 
1232 	unsigned int requested_partition_indices[3] {
1233 		ctx.config.tune_2partition_index_limit,
1234 		ctx.config.tune_3partition_index_limit,
1235 		ctx.config.tune_4partition_index_limit
1236 	};
1237 
1238 	unsigned int requested_partition_trials[3] {
1239 		ctx.config.tune_2partitioning_candidate_limit,
1240 		ctx.config.tune_3partitioning_candidate_limit,
1241 		ctx.config.tune_4partitioning_candidate_limit
1242 	};
1243 
1244 #if defined(ASTCENC_DIAGNOSTICS)
1245 	// Do this early in diagnostic builds so we can dump uniform metrics
1246 	// for every block. Do it later in release builds to avoid redundant work!
1247 	float error_weight_sum = hadd_s(blk.channel_weight) * bsd.texel_count;
1248 	float error_threshold = ctx.config.tune_db_limit
1249 	                      * error_weight_sum
1250 	                      * block_is_l_scale
1251 	                      * block_is_la_scale;
1252 
1253 	lowest_correl = prepare_block_statistics(bsd.texel_count, blk);
1254 	trace_add_data("lowest_correl", lowest_correl);
1255 	trace_add_data("tune_error_threshold", error_threshold);
1256 #endif
1257 
1258 	// Detected a constant-color block
1259 	if (all(blk.data_min == blk.data_max))
1260 	{
1261 		TRACE_NODE(node1, "pass");
1262 		trace_add_data("partition_count", 0);
1263 		trace_add_data("plane_count", 1);
1264 
1265 		scb.partition_count = 0;
1266 
1267 		// Encode as FP16 if using HDR
1268 		if ((decode_mode == ASTCENC_PRF_HDR) ||
1269 		    (decode_mode == ASTCENC_PRF_HDR_RGB_LDR_A))
1270 		{
1271 			scb.block_type = SYM_BTYPE_CONST_F16;
1272 			vint4 color_f16 = float_to_float16(blk.origin_texel);
1273 			store(color_f16, scb.constant_color);
1274 		}
1275 		// Encode as UNORM16 if NOT using HDR
1276 		else
1277 		{
1278 			scb.block_type = SYM_BTYPE_CONST_U16;
1279 			vfloat4 color_f32 = clamp(0.0f, 1.0f, blk.origin_texel) * 65535.0f;
1280 			vint4 color_u16 = float_to_int_rtn(color_f32);
1281 			store(color_u16, scb.constant_color);
1282 		}
1283 
1284 		trace_add_data("exit", "quality hit");
1285 		if (ctx.config.privateProfile != HIGH_QUALITY_PROFILE)
1286 		{
1287 			scb.block_type = SYM_BTYPE_NONCONST;
1288 			scb.partition_count = 1;
1289 			scb.color_formats_matched = 0;
1290 			scb.plane2_component = -1;
1291 			if (ctx.config.privateProfile == HIGH_SPEED_PROFILE)
1292 			{
1293 				scb.block_mode = HIGH_SPEED_PROFILE_BLOCK_MODE;
1294 			}
1295 #ifdef ASTC_CUSTOMIZED_ENABLE
1296 			else if (ctx.config.privateProfile == CUSTOMIZED_PROFILE)
1297 			{
1298 				if (!g_astcCustomizedSoManager.LoadSutCustomizedSo() ||
1299 					g_astcCustomizedSoManager.customizedBlockModeFunc_ == nullptr)
1300 				{
1301 					printf("astcenc customized so dlopen failed or customizedBlockModeFunc_ is nullptr!\n");
1302 					return;
1303 				}
1304 				scb.block_mode = g_astcCustomizedSoManager.customizedBlockModeFunc_();
1305 			}
1306 #endif
1307 			scb.partition_index = 0;
1308 			scb.quant_mode = QUANT_256;
1309 			scb.color_formats[0] = 12; // color format is 12 when block mode is HIGH_SPEED_PROFILE_BLOCK_MODE
1310 			for (int w = 0; w < 16; w++) { // weights num is 16 when block mode is HIGH_SPEED_PROFILE_BLOCK_MODE
1311 				scb.weights[w] = 0;
1312 			}
1313 			for (unsigned int pixel = 0; pixel < BLOCK_MAX_COMPONENTS; pixel++) { // scb.constant_color[pixel] is 16 bit
1314 				scb.color_values[0][pixel << 1] = scb.constant_color[pixel] & BYTE_MASK; // low byte
1315 				scb.color_values[0][(pixel << 1) + 1] = (scb.constant_color[pixel] >> 8) & BYTE_MASK; // high byte
1316 			}
1317 		}
1318 		scb.privateProfile = ctx.config.privateProfile;
1319 		symbolic_to_physical(bsd, scb, pcb);
1320 #if QUALITY_CONTROL
1321 	if (calQualityEnable) {
1322 		*mseBlock[R_COM] = *mseBlock[G_COM] = *mseBlock[B_COM] = *mseBlock[A_COM] = 0;
1323 	}
1324 #endif
1325 		return;
1326 	}
1327 
1328 #if !defined(ASTCENC_DIAGNOSTICS)
1329 	float error_weight_sum = hadd_s(blk.channel_weight) * bsd.texel_count;
1330 	float error_threshold = ctx.config.tune_db_limit
1331 	                      * error_weight_sum
1332 	                      * block_is_l_scale
1333 	                      * block_is_la_scale;
1334 #endif
1335 
1336 	// Set SCB and mode errors to a very high error value
1337 	scb.errorval = ERROR_CALC_DEFAULT;
1338 	scb.block_type = SYM_BTYPE_ERROR;
1339 
1340 	float best_errorvals_for_pcount[BLOCK_MAX_PARTITIONS] {
1341 		ERROR_CALC_DEFAULT, ERROR_CALC_DEFAULT, ERROR_CALC_DEFAULT, ERROR_CALC_DEFAULT
1342 	};
1343 
1344 	float exit_thresholds_for_pcount[BLOCK_MAX_PARTITIONS] {
1345 		0.0f,
1346 		ctx.config.tune_2partition_early_out_limit_factor,
1347 		ctx.config.tune_3partition_early_out_limit_factor,
1348 		0.0f
1349 	};
1350 
1351 	// Trial using 1 plane of weights and 1 partition.
1352 
1353 	// Most of the time we test it twice, first with a mode cutoff of 0 and then with the specified
1354 	// mode cutoff. This causes an early-out that speeds up encoding of easy blocks. However, this
1355 	// optimization is disabled for 4x4 and 5x4 blocks where it nearly always slows down the
1356 	// compression and slightly reduces image quality.
1357 
1358 	float errorval_mult[2] {
1359 		1.0f / ctx.config.tune_mse_overshoot,
1360 		1.0f
1361 	};
1362 
1363 	static const float errorval_overshoot = 1.0f / ctx.config.tune_mse_overshoot;
1364 
1365 	// Only enable MODE0 fast path if enabled
1366 	// Never enable for 3D blocks as no "always" block modes are available
1367 	int start_trial = 1;
1368  	if ((ctx.config.tune_search_mode0_enable >= TUNE_MIN_SEARCH_MODE0) && (bsd.zdim == 1))
1369 	{
1370 		start_trial = 0;
1371 	}
1372 
1373 	int quant_limit = QUANT_32;
1374 	for (int i = start_trial; i < 2; i++)
1375 	{
1376 		TRACE_NODE(node1, "pass");
1377 		trace_add_data("partition_count", 1);
1378 		trace_add_data("plane_count", 1);
1379 		trace_add_data("search_mode", i);
1380 
1381 		float errorval = compress_symbolic_block_for_partition_1plane(
1382 		    ctx.config.privateProfile,
1383 		    ctx.config, bsd, blk, i == 0,
1384 		    error_threshold * errorval_mult[i] * errorval_overshoot,
1385 		    1, 0,  scb, tmpbuf, QUANT_32);
1386 
1387 		// Record the quant level so we can use the filter later searches
1388 		const auto& bm = bsd.get_block_mode(scb.block_mode);
1389 		quant_limit = bm.get_weight_quant_mode();
1390 
1391 		best_errorvals_for_pcount[0] = astc::min(best_errorvals_for_pcount[0], errorval);
1392 		if ((ctx.config.privateProfile == HIGH_SPEED_PROFILE) || (errorval < (error_threshold * errorval_mult[i])))
1393 		{
1394 			trace_add_data("exit", "quality hit");
1395 			goto END_OF_TESTS;
1396 		}
1397 	}
1398 
1399 #if !defined(ASTCENC_DIAGNOSTICS)
1400 	lowest_correl = prepare_block_statistics(bsd.texel_count, blk);
1401 #endif
1402 
1403 	block_skip_two_plane = lowest_correl > ctx.config.tune_2plane_early_out_limit_correlation;
1404 
1405 	// Test the four possible 1-partition, 2-planes modes. Do this in reverse, as
1406 	// alpha is the most likely to be non-correlated if it is present in the data.
1407 	for (int i = BLOCK_MAX_COMPONENTS - 1; i >= 0; i--)
1408 	{
1409 		if (ctx.config.privateProfile != HIGH_QUALITY_PROFILE)
1410 		{
1411 			break;
1412 		}
1413 		TRACE_NODE(node1, "pass");
1414 		trace_add_data("partition_count", 1);
1415 		trace_add_data("plane_count", 2);
1416 		trace_add_data("plane_component", i);
1417 
1418 		if (block_skip_two_plane)
1419 		{
1420 			trace_add_data("skip", "tune_2plane_early_out_limit_correlation");
1421 			continue;
1422 		}
1423 
1424 		if (blk.grayscale && i != 3)
1425 		{
1426 			trace_add_data("skip", "grayscale block");
1427 			continue;
1428 		}
1429 
1430 		if (blk.is_constant_channel(i))
1431 		{
1432 			trace_add_data("skip", "constant component");
1433 			continue;
1434 		}
1435 
1436 		float errorval = compress_symbolic_block_for_partition_2planes(
1437 		    ctx.config.privateProfile,
1438 		    ctx.config, bsd, blk, error_threshold * errorval_overshoot,
1439 		    i, scb, tmpbuf, quant_limit);
1440 
1441 		// If attempting two planes is much worse than the best one plane result
1442 		// then further two plane searches are unlikely to help so move on ...
1443 		if (errorval > (best_errorvals_for_pcount[0] * 1.85f))
1444 		{
1445 			break;
1446 		}
1447 
1448 		if (errorval < error_threshold)
1449 		{
1450 			trace_add_data("exit", "quality hit");
1451 			goto END_OF_TESTS;
1452 		}
1453 	}
1454 
1455 	// Find best blocks for 2, 3 and 4 partitions
1456 	for (int partition_count = 2; partition_count <= max_partitions; partition_count++)
1457 	{
1458 		unsigned int partition_indices[TUNE_MAX_PARTITIONING_CANDIDATES];
1459 
1460 		unsigned int requested_indices = requested_partition_indices[partition_count - 2];
1461 
1462 		unsigned int requested_trials = requested_partition_trials[partition_count - 2];
1463 		requested_trials = astc::min(requested_trials, requested_indices);
1464 
1465 		unsigned int actual_trials = find_best_partition_candidates(
1466 		    bsd, blk, partition_count, requested_indices, partition_indices, requested_trials);
1467 
1468 		float best_error_in_prev = best_errorvals_for_pcount[partition_count - 2];
1469 
1470 		for (unsigned int i = 0; i < actual_trials; i++)
1471 		{
1472 			TRACE_NODE(node1, "pass");
1473 			trace_add_data("partition_count", partition_count);
1474 			trace_add_data("partition_index", partition_indices[i]);
1475 			trace_add_data("plane_count", 1);
1476 			trace_add_data("search_mode", i);
1477 
1478 			float errorval = compress_symbolic_block_for_partition_1plane(
1479 			    ctx.config.privateProfile,
1480 			    ctx.config, bsd, blk, false,
1481 			    error_threshold * errorval_overshoot,
1482 			    partition_count, partition_indices[i],
1483 			    scb, tmpbuf, quant_limit);
1484 
1485 			best_errorvals_for_pcount[partition_count - 1] = astc::min(best_errorvals_for_pcount[partition_count - 1], errorval);
1486 
1487 			// If using N partitions doesn't improve much over using N-1 partitions then skip trying
1488 			// N+1. Error can dramatically improve if the data is correlated or non-correlated and
1489 			// aligns with a partitioning that suits that encoding, so for this inner loop check add
1490 			// a large error scale because the "other" trial could be a lot better.
1491 			float best_error = best_errorvals_for_pcount[partition_count - 1];
1492 			float best_error_scale = exit_thresholds_for_pcount[partition_count - 1] * 1.85f;
1493 			if (best_error > (best_error_in_prev * best_error_scale))
1494 			{
1495 				trace_add_data("skip", "tune_partition_early_out_limit_factor");
1496 				goto END_OF_TESTS;
1497 			}
1498 
1499 			if (errorval < error_threshold)
1500 			{
1501 				trace_add_data("exit", "quality hit");
1502 				goto END_OF_TESTS;
1503 			}
1504 		}
1505 
1506 		// If using N partitions doesn't improve much over using N-1 partitions then skip trying N+1
1507 		float best_error = best_errorvals_for_pcount[partition_count - 1];
1508 		float best_error_scale = exit_thresholds_for_pcount[partition_count - 1];
1509 		if (best_error > (best_error_in_prev * best_error_scale))
1510 		{
1511 			trace_add_data("skip", "tune_partition_early_out_limit_factor");
1512 			goto END_OF_TESTS;
1513 		}
1514 	}
1515 
1516 	trace_add_data("exit", "quality not hit");
1517 
1518 END_OF_TESTS:
1519 	// If we still have an error block then convert to something we can encode
1520 	// TODO: Do something more sensible here, such as average color block
1521 	if (scb.block_type == SYM_BTYPE_ERROR)
1522 	{
1523 #if defined(ASTCENC_DIAGNOSTICS)
1524 		static bool printed_once = false;
1525 		if (!printed_once)
1526 		{
1527 			printed_once = true;
1528 			printf("WARN: At least one block failed to find a valid encoding.\n"
1529 			       "      Try increasing compression quality settings.\n\n");
1530 		}
1531 #endif
1532 
1533 		scb.block_type = SYM_BTYPE_CONST_U16;
1534 		vfloat4 color_f32 = clamp(0.0f, 1.0f, blk.origin_texel) * 65535.0f;
1535 		vint4 color_u16 = float_to_int_rtn(color_f32);
1536 		store(color_u16, scb.constant_color);
1537 	}
1538 
1539 	// Compress to a physical block
1540 	scb.privateProfile = ctx.config.privateProfile;
1541 	symbolic_to_physical(bsd, scb, pcb);
1542 #if QUALITY_CONTROL
1543 	if (calQualityEnable) {
1544 		image_block decBlk = blk;
1545 		decompress_symbolic_block(ctx.config.profile, bsd, blk.xpos, blk.ypos, blk.zpos, scb, decBlk);
1546 		vint4 colorSumDiff = vint4::zero();
1547 		for (size_t ii = 0; ii < bsd.texel_count; ii++) {
1548 			vint4 colorRef = float_to_int_rtn(blk.texel(ii) * 255.0f / 65535.0f);
1549 			vint4 colorTest = float_to_int_rtn(min(decBlk.texel(ii), 1.0f) * 255.0f);
1550 			vint4 colorDiff = colorRef - colorTest;
1551 			colorSumDiff += colorDiff * colorDiff;
1552 		}
1553 		*mseBlock[R_COM] = colorSumDiff.lane<0>();
1554 		*mseBlock[G_COM] = colorSumDiff.lane<1>();
1555 		*mseBlock[B_COM] = colorSumDiff.lane<2>();
1556 		*mseBlock[A_COM] = colorSumDiff.lane<3>();
1557     }
1558 #endif
1559 }
1560 
1561 #endif
1562