• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // SPDX-License-Identifier: Apache-2.0
2 // ----------------------------------------------------------------------------
3 // Copyright 2011-2024 Arm Limited
4 //
5 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
6 // use this file except in compliance with the License. You may obtain a copy
7 // of the License at:
8 //
9 //     http://www.apache.org/licenses/LICENSE-2.0
10 //
11 // Unless required by applicable law or agreed to in writing, software
12 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
13 // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
14 // License for the specific language governing permissions and limitations
15 // under the License.
16 // ----------------------------------------------------------------------------
17 
18 #if !defined(ASTCENC_DECOMPRESS_ONLY)
19 
20 /**
21  * @brief Functions to compress a symbolic block.
22  */
23 
24 #include "astcenc_internal.h"
25 #include "astcenc_diagnostic_trace.h"
26 
27 #include <cassert>
28 #ifdef ASTC_CUSTOMIZED_ENABLE
29 AstcCustomizedSoManager g_astcCustomizedSoManager;
30 #endif
31 
32 /**
33  * @brief Merge two planes of endpoints into a single vector.
34  *
35  * @param      ep_plane1          The endpoints for plane 1.
36  * @param      ep_plane2          The endpoints for plane 2.
37  * @param      component_plane2   The color component for plane 2.
38  * @param[out] result             The merged output.
39  */
merge_endpoints(const endpoints & ep_plane1,const endpoints & ep_plane2,unsigned int component_plane2,endpoints & result)40 static void merge_endpoints(
41 	const endpoints& ep_plane1,
42 	const endpoints& ep_plane2,
43 	unsigned int component_plane2,
44 	endpoints& result
45 ) {
46 	unsigned int partition_count = ep_plane1.partition_count;
47 	assert(partition_count == 1);
48 
49 	vmask4 sep_mask = vint4::lane_id() == vint4(component_plane2);
50 
51 	result.partition_count = partition_count;
52 	result.endpt0[0] = select(ep_plane1.endpt0[0], ep_plane2.endpt0[0], sep_mask);
53 	result.endpt1[0] = select(ep_plane1.endpt1[0], ep_plane2.endpt1[0], sep_mask);
54 }
55 
56 /**
57  * @brief Attempt to improve weights given a chosen configuration.
58  *
59  * Given a fixed weight grid decimation and weight value quantization, iterate over all weights (per
60  * partition and per plane) and attempt to improve image quality by moving each weight up by one or
61  * down by one quantization step.
62  *
63  * This is a specialized function which only supports operating on undecimated weight grids,
64  * therefore primarily improving the performance of 4x4 and 5x5 blocks where grid decimation
65  * is needed less often.
66  *
67  * @param      decode_mode   The decode mode (LDR, HDR).
68  * @param      bsd           The block size information.
69  * @param      blk           The image block color data to compress.
70  * @param[out] scb           The symbolic compressed block output.
71  */
72 #if ASTCENC_NEON != 0
realign_weights_undecimated(astcenc_profile decode_mode,const block_size_descriptor & bsd,const image_block & blk,symbolic_compressed_block & scb)73 static bool realign_weights_undecimated(
74 	astcenc_profile decode_mode,
75 	const block_size_descriptor& bsd,
76 	const image_block& blk,
77 	symbolic_compressed_block& scb
78 ) {
79 	// Get the partition descriptor
80 	unsigned int partition_count = scb.partition_count;
81 	const auto& pi = bsd.get_partition_info(partition_count, scb.partition_index);
82 
83 	// Get the quantization table
84 	const block_mode& bm = bsd.get_block_mode(scb.block_mode);
85 	unsigned int weight_quant_level = bm.quant_mode;
86 	const quant_and_transfer_table& qat = quant_and_xfer_tables[weight_quant_level];
87 
88 	unsigned int max_plane = bm.is_dual_plane;
89 	int plane2_component = scb.plane2_component;
90 	vmask4 plane_mask = vint4::lane_id() == vint4(plane2_component);
91 
92 	// Decode the color endpoints
93 	bool rgb_hdr;
94 	bool alpha_hdr;
95 	vint4 endpnt0[BLOCK_MAX_PARTITIONS];
96 	vint4 endpnt1[BLOCK_MAX_PARTITIONS];
97 	vfloat4 endpnt0f[BLOCK_MAX_PARTITIONS];
98 	vfloat4 offset[BLOCK_MAX_PARTITIONS];
99 
100 	promise(partition_count > 0);
101 
102 	for (unsigned int pa_idx = 0; pa_idx < partition_count; pa_idx++)
103 	{
104 		unpack_color_endpoints(decode_mode,
105 		                       scb.color_formats[pa_idx],
106 		                       scb.color_values[pa_idx],
107 		                       rgb_hdr, alpha_hdr,
108 		                       endpnt0[pa_idx],
109 		                       endpnt1[pa_idx]);
110 	}
111 
112 	uint8_t* dec_weights_uquant = scb.weights;
113 	bool adjustments = false;
114 
115 	// For each plane and partition ...
116 	for (unsigned int pl_idx = 0; pl_idx <= max_plane; pl_idx++)
117 	{
118 		for (unsigned int pa_idx = 0; pa_idx < partition_count; pa_idx++)
119 		{
120 			// Compute the endpoint delta for all components in current plane
121 			vint4 epd = endpnt1[pa_idx] - endpnt0[pa_idx];
122 			epd = select(epd, vint4::zero(), plane_mask);
123 
124 			endpnt0f[pa_idx] = int_to_float(endpnt0[pa_idx]);
125 			offset[pa_idx] = int_to_float(epd) * (1.0f / 64.0f);
126 		}
127 
128 		// For each weight compute previous, current, and next errors
129 		promise(bsd.texel_count > 0);
130 
131 		unsigned int texel = 0;
132 		for (; texel + ASTCENC_SIMD_WIDTH <= bsd.texel_count; texel += ASTCENC_SIMD_WIDTH)
133 		{
134 			int uqw0 = dec_weights_uquant[texel];
135 			int uqw1 = dec_weights_uquant[texel + 1];
136 			int uqw2 = dec_weights_uquant[texel + 2];
137 			int uqw3 = dec_weights_uquant[texel + 3];
138 
139 			vint4 uqw_vec = vint4(uqw0, uqw1, uqw2, uqw3);
140 			vint4 prev_and_next_vec = vint4(qat.prev_next_values[uqw0], qat.prev_next_values[uqw1],
141 							qat.prev_next_values[uqw2], qat.prev_next_values[uqw3]);
142 
143 			vint4 mask = vint4(0xFF, 0xFF, 0xFF, 0xFF);
144 			vint4 uqw_down_vec = prev_and_next_vec & mask;
145 			vint4 uqw_up_vec = vint4(vshrq_n_s32(prev_and_next_vec.m, 8)) & mask;
146 
147 			vfloat4 weight_base_vec = int_to_float(uqw_vec);
148 			vfloat4 weight_down_vec = int_to_float(uqw_down_vec) - weight_base_vec;
149 			vfloat4 weight_up_vec = int_to_float(uqw_up_vec) - weight_base_vec;
150 
151 			unsigned int partition0 = pi.partition_of_texel[texel];
152 			unsigned int partition1 = pi.partition_of_texel[texel + 1];
153 			unsigned int partition2 = pi.partition_of_texel[texel + 2];
154 			unsigned int partition3 = pi.partition_of_texel[texel + 3];
155 
156 			vfloat4 color_offset0 = offset[partition0];
157 			vfloat4 color_offset1 = offset[partition1];
158 			vfloat4 color_offset2 = offset[partition2];
159 			vfloat4 color_offset3 = offset[partition3];
160 
161 			vfloat4 color_base0 = endpnt0f[partition0];
162 			vfloat4 color_base1 = endpnt0f[partition1];
163 			vfloat4 color_base2 = endpnt0f[partition2];
164 			vfloat4 color_base3 = endpnt0f[partition3];
165 
166 			vfloat4 color0 = color_base0 + color_offset0 * weight_base_vec.lane<0>();
167 			vfloat4 color1 = color_base1 + color_offset1 * weight_base_vec.lane<1>();
168 			vfloat4 color2 = color_base2 + color_offset2 * weight_base_vec.lane<2>();
169 			vfloat4 color3 = color_base3 + color_offset3 * weight_base_vec.lane<3>();
170 
171 			vfloat4 orig_color0 = blk.texel(texel);
172 			vfloat4 orig_color1 = blk.texel(texel + 1);
173 			vfloat4 orig_color2 = blk.texel(texel + 2);
174 			vfloat4 orig_color3 = blk.texel(texel + 3);
175 
176 			vfloat4 error_weight = blk.channel_weight;
177 
178 			vfloat4 color_diff0 = color0 - orig_color0;
179 			vfloat4 color_diff1 = color1 - orig_color1;
180 			vfloat4 color_diff2 = color2 - orig_color2;
181 			vfloat4 color_diff3 = color3 - orig_color3;
182 
183 			vfloat4 color_diff_down0 = color_diff0 + color_offset0 * weight_down_vec.lane<0>();
184 			vfloat4 color_diff_down1 = color_diff1 + color_offset1 * weight_down_vec.lane<1>();
185 			vfloat4 color_diff_down2 = color_diff2 + color_offset2 * weight_down_vec.lane<2>();
186 			vfloat4 color_diff_down3 = color_diff3 + color_offset3 * weight_down_vec.lane<3>();
187 
188 			vfloat4 color_diff_up0 = color_diff0 + color_offset0 * weight_up_vec.lane<0>();
189 			vfloat4 color_diff_up1 = color_diff1 + color_offset1 * weight_up_vec.lane<1>();
190 			vfloat4 color_diff_up2 = color_diff2 + color_offset2 * weight_up_vec.lane<2>();
191 			vfloat4 color_diff_up3 = color_diff3 + color_offset3 * weight_up_vec.lane<3>();
192 
193 			float error_base0 = dot_s(color_diff0 * color_diff0, error_weight);
194 			float error_base1 = dot_s(color_diff1 * color_diff1, error_weight);
195 			float error_base2 = dot_s(color_diff2 * color_diff2, error_weight);
196 			float error_base3 = dot_s(color_diff3 * color_diff3, error_weight);
197 
198 			float error_down0 = dot_s(color_diff_down0 * color_diff_down0, error_weight);
199 			float error_down1 = dot_s(color_diff_down1 * color_diff_down1, error_weight);
200 			float error_down2 = dot_s(color_diff_down2 * color_diff_down2, error_weight);
201 			float error_down3 = dot_s(color_diff_down3 * color_diff_down3, error_weight);
202 
203 			float error_up0 = dot_s(color_diff_up0 * color_diff_up0, error_weight);
204 			float error_up1 = dot_s(color_diff_up1 * color_diff_up1, error_weight);
205 			float error_up2 = dot_s(color_diff_up2 * color_diff_up2, error_weight);
206 			float error_up3 = dot_s(color_diff_up3 * color_diff_up3, error_weight);
207 
208 			vfloat4 error_base_vec = vfloat4(error_base0, error_base1, error_base2, error_base3);
209 			vfloat4 error_down_vec = vfloat4(error_down0, error_down1, error_down2, error_down3);
210 			vfloat4 error_up_vec = vfloat4(error_up0, error_up1, error_up2, error_up3);
211 
212 			vmask4 check_result_up = (error_up_vec < error_base_vec) &
213 			        (error_up_vec < error_down_vec) & (uqw_vec < vint4(64));
214 
215 			vmask4 check_result_down = (error_down_vec < error_base_vec) & (uqw_vec > vint4::zero());
216 			check_result_down = check_result_down & (~check_result_up);
217 
218 			if (popcount(check_result_up | check_result_down) != 0)
219 			{
220 				uqw_vec = select(uqw_vec, uqw_up_vec, check_result_up);
221 				uqw_vec = select(uqw_vec, uqw_down_vec, check_result_down);
222 
223 				dec_weights_uquant[texel] = uqw_vec.lane<0>();
224 				dec_weights_uquant[texel + 1] = uqw_vec.lane<1>();
225 				dec_weights_uquant[texel + 2] = uqw_vec.lane<2>();    // channel 2
226 				dec_weights_uquant[texel + 3] = uqw_vec.lane<3>();    // channel 3
227 				adjustments = true;
228 			}
229 		};
230 
231 		for (; texel < bsd.texel_count; texel++)
232 		{
233 			int uqw = dec_weights_uquant[texel];
234 
235 			uint32_t prev_and_next = qat.prev_next_values[uqw];
236 			int uqw_down = prev_and_next & 0xFF;
237 			int uqw_up = (prev_and_next >> 8) & 0xFF;
238 
239 			// Interpolate the colors to create the diffs
240 			float weight_base = static_cast<float>(uqw);
241 			float weight_down = static_cast<float>(uqw_down - uqw);
242 			float weight_up = static_cast<float>(uqw_up - uqw);
243 
244 			unsigned int partition = pi.partition_of_texel[texel];
245 			vfloat4 color_offset = offset[partition];
246 			vfloat4 color_base   = endpnt0f[partition];
247 
248 			vfloat4 color = color_base + color_offset * weight_base;
249 			vfloat4 orig_color   = blk.texel(texel);
250 			vfloat4 error_weight = blk.channel_weight;
251 
252 			vfloat4 color_diff      = color - orig_color;
253 			vfloat4 color_diff_down = color_diff + color_offset * weight_down;
254 			vfloat4 color_diff_up   = color_diff + color_offset * weight_up;
255 
256 			float error_base = dot_s(color_diff      * color_diff,      error_weight);
257 			float error_down = dot_s(color_diff_down * color_diff_down, error_weight);
258 			float error_up   = dot_s(color_diff_up   * color_diff_up,   error_weight);
259 
260 			// Check if the prev or next error is better, and if so use it
261 			if ((error_up < error_base) && (error_up < error_down) && (uqw < 64))
262 			{
263 				dec_weights_uquant[texel] = static_cast<uint8_t>(uqw_up);
264 				adjustments = true;
265 			}
266 			else if ((error_down < error_base) && (uqw > 0))
267 			{
268 				dec_weights_uquant[texel] = static_cast<uint8_t>(uqw_down);
269 				adjustments = true;
270 			}
271 		}
272 
273 		// Prepare iteration for plane 2
274 		dec_weights_uquant += WEIGHTS_PLANE2_OFFSET;
275 		plane_mask = ~plane_mask;
276 	}
277 	return adjustments;
278 }
279 #else
realign_weights_undecimated(astcenc_profile decode_mode,const block_size_descriptor & bsd,const image_block & blk,symbolic_compressed_block & scb)280 static bool realign_weights_undecimated(
281 	astcenc_profile decode_mode,
282 	const block_size_descriptor& bsd,
283 	const image_block& blk,
284 	symbolic_compressed_block& scb
285 ) {
286 	// Get the partition descriptor
287 	unsigned int partition_count = scb.partition_count;
288 	const auto& pi = bsd.get_partition_info(partition_count, scb.partition_index);
289 
290 	// Get the quantization table
291 	const block_mode& bm = bsd.get_block_mode(scb.block_mode);
292 	unsigned int weight_quant_level = bm.quant_mode;
293 	const quant_and_transfer_table& qat = quant_and_xfer_tables[weight_quant_level];
294 
295 	unsigned int max_plane = bm.is_dual_plane;
296 	int plane2_component = scb.plane2_component;
297 	vmask4 plane_mask = vint4::lane_id() == vint4(plane2_component);
298 
299 	// Decode the color endpoints
300 	bool rgb_hdr;
301 	bool alpha_hdr;
302 	vint4 endpnt0[BLOCK_MAX_PARTITIONS];
303 	vint4 endpnt1[BLOCK_MAX_PARTITIONS];
304 	vfloat4 endpnt0f[BLOCK_MAX_PARTITIONS];
305 	vfloat4 offset[BLOCK_MAX_PARTITIONS];
306 
307 	promise(partition_count > 0);
308 
309 	for (unsigned int pa_idx = 0; pa_idx < partition_count; pa_idx++)
310 	{
311 		unpack_color_endpoints(decode_mode,
312 		                       scb.color_formats[pa_idx],
313 		                       scb.color_values[pa_idx],
314 		                       rgb_hdr, alpha_hdr,
315 		                       endpnt0[pa_idx],
316 		                       endpnt1[pa_idx]);
317 	}
318 
319 	uint8_t* dec_weights_uquant = scb.weights;
320 	bool adjustments = false;
321 
322 	// For each plane and partition ...
323 	for (unsigned int pl_idx = 0; pl_idx <= max_plane; pl_idx++)
324 	{
325 		for (unsigned int pa_idx = 0; pa_idx < partition_count; pa_idx++)
326 		{
327 			// Compute the endpoint delta for all components in current plane
328 			vint4 epd = endpnt1[pa_idx] - endpnt0[pa_idx];
329 			epd = select(epd, vint4::zero(), plane_mask);
330 
331 			endpnt0f[pa_idx] = int_to_float(endpnt0[pa_idx]);
332 			offset[pa_idx] = int_to_float(epd) * (1.0f / 64.0f);
333 		}
334 
335 		// For each weight compute previous, current, and next errors
336 		promise(bsd.texel_count > 0);
337 		for (unsigned int texel = 0; texel < bsd.texel_count; texel++)
338 		{
339 			int uqw = dec_weights_uquant[texel];
340 
341 			uint32_t prev_and_next = qat.prev_next_values[uqw];
342 			int uqw_down = prev_and_next & 0xFF;
343 			int uqw_up = (prev_and_next >> 8) & 0xFF;
344 
345 			// Interpolate the colors to create the diffs
346 			float weight_base = static_cast<float>(uqw);
347 			float weight_down = static_cast<float>(uqw_down - uqw);
348 			float weight_up = static_cast<float>(uqw_up - uqw);
349 
350 			unsigned int partition = pi.partition_of_texel[texel];
351 			vfloat4 color_offset = offset[partition];
352 			vfloat4 color_base   = endpnt0f[partition];
353 
354 			vfloat4 color = color_base + color_offset * weight_base;
355 			vfloat4 orig_color   = blk.texel(texel);
356 			vfloat4 error_weight = blk.channel_weight;
357 
358 			vfloat4 color_diff      = color - orig_color;
359 			vfloat4 color_diff_down = color_diff + color_offset * weight_down;
360 			vfloat4 color_diff_up   = color_diff + color_offset * weight_up;
361 
362 			float error_base = dot_s(color_diff      * color_diff,      error_weight);
363 			float error_down = dot_s(color_diff_down * color_diff_down, error_weight);
364 			float error_up   = dot_s(color_diff_up   * color_diff_up,   error_weight);
365 
366 			// Check if the prev or next error is better, and if so use it
367 			if ((error_up < error_base) && (error_up < error_down) && (uqw < 64))
368 			{
369 				dec_weights_uquant[texel] = static_cast<uint8_t>(uqw_up);
370 				adjustments = true;
371 			}
372 			else if ((error_down < error_base) && (uqw > 0))
373 			{
374 				dec_weights_uquant[texel] = static_cast<uint8_t>(uqw_down);
375 				adjustments = true;
376 			}
377 		}
378 
379 		// Prepare iteration for plane 2
380 		dec_weights_uquant += WEIGHTS_PLANE2_OFFSET;
381 		plane_mask = ~plane_mask;
382 	}
383 
384 	return adjustments;
385 }
386 #endif
387 
388 /**
389  * @brief Attempt to improve weights given a chosen configuration.
390  *
391  * Given a fixed weight grid decimation and weight value quantization, iterate over all weights (per
392  * partition and per plane) and attempt to improve image quality by moving each weight up by one or
393  * down by one quantization step.
394  *
395  * @param      decode_mode   The decode mode (LDR, HDR).
396  * @param      bsd           The block size information.
397  * @param      blk           The image block color data to compress.
398  * @param[out] scb           The symbolic compressed block output.
399  */
realign_weights_decimated(astcenc_profile decode_mode,const block_size_descriptor & bsd,const image_block & blk,symbolic_compressed_block & scb)400 static bool realign_weights_decimated(
401 	astcenc_profile decode_mode,
402 	const block_size_descriptor& bsd,
403 	const image_block& blk,
404 	symbolic_compressed_block& scb
405 ) {
406 	// Get the partition descriptor
407 	unsigned int partition_count = scb.partition_count;
408 	const auto& pi = bsd.get_partition_info(partition_count, scb.partition_index);
409 
410 	// Get the quantization table
411 	const block_mode& bm = bsd.get_block_mode(scb.block_mode);
412 	unsigned int weight_quant_level = bm.quant_mode;
413 	const quant_and_transfer_table& qat = quant_and_xfer_tables[weight_quant_level];
414 
415 	// Get the decimation table
416 	const decimation_info& di = bsd.get_decimation_info(bm.decimation_mode);
417 	unsigned int weight_count = di.weight_count;
418 	assert(weight_count != bsd.texel_count);
419 
420 	unsigned int max_plane = bm.is_dual_plane;
421 	int plane2_component = scb.plane2_component;
422 	vmask4 plane_mask = vint4::lane_id() == vint4(plane2_component);
423 
424 	// Decode the color endpoints
425 	bool rgb_hdr;
426 	bool alpha_hdr;
427 	vint4 endpnt0[BLOCK_MAX_PARTITIONS];
428 	vint4 endpnt1[BLOCK_MAX_PARTITIONS];
429 	vfloat4 endpnt0f[BLOCK_MAX_PARTITIONS];
430 	vfloat4 offset[BLOCK_MAX_PARTITIONS];
431 
432 	promise(partition_count > 0);
433 	promise(weight_count > 0);
434 
435 	for (unsigned int pa_idx = 0; pa_idx < partition_count; pa_idx++)
436 	{
437 		unpack_color_endpoints(decode_mode,
438 		                       scb.color_formats[pa_idx],
439 		                       scb.color_values[pa_idx],
440 		                       rgb_hdr, alpha_hdr,
441 		                       endpnt0[pa_idx],
442 		                       endpnt1[pa_idx]);
443 	}
444 
445 	uint8_t* dec_weights_uquant = scb.weights;
446 	bool adjustments = false;
447 
448 	// For each plane and partition ...
449 	for (unsigned int pl_idx = 0; pl_idx <= max_plane; pl_idx++)
450 	{
451 		for (unsigned int pa_idx = 0; pa_idx < partition_count; pa_idx++)
452 		{
453 			// Compute the endpoint delta for all components in current plane
454 			vint4 epd = endpnt1[pa_idx] - endpnt0[pa_idx];
455 			epd = select(epd, vint4::zero(), plane_mask);
456 
457 			endpnt0f[pa_idx] = int_to_float(endpnt0[pa_idx]);
458 			offset[pa_idx] = int_to_float(epd) * (1.0f / 64.0f);
459 		}
460 
461 		// Create an unquantized weight grid for this decimation level
462 		ASTCENC_ALIGNAS float uq_weightsf[BLOCK_MAX_WEIGHTS];
463 		for (unsigned int we_idx = 0; we_idx < weight_count; we_idx += ASTCENC_SIMD_WIDTH)
464 		{
465 			vint unquant_value(dec_weights_uquant + we_idx);
466 			vfloat unquant_valuef = int_to_float(unquant_value);
467 			storea(unquant_valuef, uq_weightsf + we_idx);
468 		}
469 
470 		// For each weight compute previous, current, and next errors
471 		for (unsigned int we_idx = 0; we_idx < weight_count; we_idx++)
472 		{
473 			int uqw = dec_weights_uquant[we_idx];
474 			uint32_t prev_and_next = qat.prev_next_values[uqw];
475 
476 			float uqw_base = uq_weightsf[we_idx];
477 			float uqw_down = static_cast<float>(prev_and_next & 0xFF);
478 			float uqw_up = static_cast<float>((prev_and_next >> 8) & 0xFF);
479 
480 			float uqw_diff_down = uqw_down - uqw_base;
481 			float uqw_diff_up = uqw_up - uqw_base;
482 
483 			vfloat4 error_basev = vfloat4::zero();
484 			vfloat4 error_downv = vfloat4::zero();
485 			vfloat4 error_upv = vfloat4::zero();
486 
487 			// Interpolate the colors to create the diffs
488 			unsigned int texels_to_evaluate = di.weight_texel_count[we_idx];
489 			promise(texels_to_evaluate > 0);
490 			for (unsigned int te_idx = 0; te_idx < texels_to_evaluate; te_idx++)
491 			{
492 				unsigned int texel = di.weight_texels_tr[te_idx][we_idx];
493 
494 				float tw_base = di.texel_contrib_for_weight[te_idx][we_idx];
495 
496 				float weight_base = (uq_weightsf[di.texel_weights_tr[0][texel]] * di.texel_weight_contribs_float_tr[0][texel]
497 				                   + uq_weightsf[di.texel_weights_tr[1][texel]] * di.texel_weight_contribs_float_tr[1][texel])
498 					              + (uq_weightsf[di.texel_weights_tr[2][texel]] * di.texel_weight_contribs_float_tr[2][texel]
499 				                   + uq_weightsf[di.texel_weights_tr[3][texel]] * di.texel_weight_contribs_float_tr[3][texel]);
500 
501 				// Ideally this is integer rounded, but IQ gain it isn't worth the overhead
502 				// float weight = astc::flt_rd(weight_base + 0.5f);
503 				// float weight_down = astc::flt_rd(weight_base + 0.5f + uqw_diff_down * tw_base) - weight;
504 				// float weight_up = astc::flt_rd(weight_base + 0.5f + uqw_diff_up * tw_base) - weight;
505 				float weight_down = weight_base + uqw_diff_down * tw_base - weight_base;
506 				float weight_up = weight_base + uqw_diff_up * tw_base - weight_base;
507 
508 				unsigned int partition = pi.partition_of_texel[texel];
509 				vfloat4 color_offset = offset[partition];
510 				vfloat4 color_base   = endpnt0f[partition];
511 
512 				vfloat4 color = color_base + color_offset * weight_base;
513 				vfloat4 orig_color = blk.texel(texel);
514 
515 				vfloat4 color_diff      = color - orig_color;
516 				vfloat4 color_down_diff = color_diff + color_offset * weight_down;
517 				vfloat4 color_up_diff   = color_diff + color_offset * weight_up;
518 
519 				error_basev += color_diff * color_diff;
520 				error_downv += color_down_diff * color_down_diff;
521 				error_upv   += color_up_diff * color_up_diff;
522 			}
523 
524 			vfloat4 error_weight = blk.channel_weight;
525 			float error_base = hadd_s(error_basev * error_weight);
526 			float error_down = hadd_s(error_downv * error_weight);
527 			float error_up   = hadd_s(error_upv   * error_weight);
528 
529 			// Check if the prev or next error is better, and if so use it
530 			if ((error_up < error_base) && (error_up < error_down) && (uqw < 64))
531 			{
532 				uq_weightsf[we_idx] = uqw_up;
533 				dec_weights_uquant[we_idx] = static_cast<uint8_t>(uqw_up);
534 				adjustments = true;
535 			}
536 			else if ((error_down < error_base) && (uqw > 0))
537 			{
538 				uq_weightsf[we_idx] = uqw_down;
539 				dec_weights_uquant[we_idx] = static_cast<uint8_t>(uqw_down);
540 				adjustments = true;
541 			}
542 		}
543 
544 		// Prepare iteration for plane 2
545 		dec_weights_uquant += WEIGHTS_PLANE2_OFFSET;
546 		plane_mask = ~plane_mask;
547 	}
548 
549 	return adjustments;
550 }
551 
/**
 * @brief Compress a block using a chosen partitioning and 1 plane of weights.
 *
 * @param      privateProfile            The quality profile, forwarded to the endpoint helper functions.
 * @param      config                    The compressor configuration.
 * @param      bsd                       The block size information.
 * @param      blk                       The image block color data to compress.
 * @param      only_always               True if we only use "always" percentile block modes.
 * @param      tune_errorval_threshold   The error value threshold.
 * @param      partition_count           The partition count.
 * @param      partition_index           The partition index if @c partition_count is 2-4.
 * @param[out] scb                       The symbolic compressed block output.
 * @param[out] tmpbuf                    The working buffers used for intermediate results, including the quantized weights for plane 1.
 * @param      quant_limit               The upper limit for the weight quantization level; values above @c QUANT_32 are clamped to @c QUANT_32.
 */
compress_symbolic_block_for_partition_1plane(QualityProfile privateProfile,const astcenc_config & config,const block_size_descriptor & bsd,const image_block & blk,bool only_always,float tune_errorval_threshold,unsigned int partition_count,unsigned int partition_index,symbolic_compressed_block & scb,compression_working_buffers & tmpbuf,int quant_limit)565 static float compress_symbolic_block_for_partition_1plane(
566 	QualityProfile privateProfile,
567 	const astcenc_config& config,
568 	const block_size_descriptor& bsd,
569 	const image_block& blk,
570 	bool only_always,
571 	float tune_errorval_threshold,
572 	unsigned int partition_count,
573 	unsigned int partition_index,
574 	symbolic_compressed_block& scb,
575 	compression_working_buffers& tmpbuf,
576 	int quant_limit
577 ) {
578 	promise(partition_count > 0);
579 	promise(config.tune_candidate_limit > 0);
580 	promise(config.tune_refinement_limit > 0);
581 
582 	int max_weight_quant = astc::min(static_cast<int>(QUANT_32), quant_limit);
583 
584 	auto compute_difference = &compute_symbolic_block_difference_1plane;
585 	if ((partition_count == 1) && !(config.flags & ASTCENC_FLG_MAP_RGBM))
586 	{
587 		compute_difference = &compute_symbolic_block_difference_1plane_1partition;
588 	}
589 
590 	const auto& pi = bsd.get_partition_info(partition_count, partition_index);
591 
592 	// Compute ideal weights and endpoint colors, with no quantization or decimation
593 	endpoints_and_weights& ei = tmpbuf.ei1;
594 	compute_ideal_colors_and_weights_1plane(blk, pi, ei);
595 
596 	// Compute ideal weights and endpoint colors for every decimation
597 	float* dec_weights_ideal = tmpbuf.dec_weights_ideal;
598 	uint8_t* dec_weights_uquant = tmpbuf.dec_weights_uquant;
599 
600 	// For each decimation mode, compute an ideal set of weights with no quantization
601 	unsigned int max_decimation_modes = only_always ? bsd.decimation_mode_count_always
602 	                                                : bsd.decimation_mode_count_selected;
603 	promise(max_decimation_modes > 0);
604 	for (unsigned int i = 0; i < max_decimation_modes; i++)
605 	{
606 		const auto& dm = bsd.get_decimation_mode(i);
607 		if (!dm.is_ref_1plane(static_cast<quant_method>(max_weight_quant)))
608 		{
609 			continue;
610 		}
611 
612 		const auto& di = bsd.get_decimation_info(i);
613 
614 		compute_ideal_weights_for_decimation(
615 		    ei,
616 		    di,
617 		    dec_weights_ideal + i * BLOCK_MAX_WEIGHTS);
618 	}
619 
620 	// Compute maximum colors for the endpoints and ideal weights, then for each endpoint and ideal
621 	// weight pair, compute the smallest weight that will result in a color value greater than 1
622 	vfloat4 min_ep(10.0f);
623 	for (unsigned int i = 0; i < partition_count; i++)
624 	{
625 		vfloat4 ep = (vfloat4(1.0f) - ei.ep.endpt0[i]) / (ei.ep.endpt1[i] - ei.ep.endpt0[i]);
626 
627 		vmask4 use_ep = (ep > vfloat4(0.5f)) & (ep < min_ep);
628 		min_ep = select(min_ep, ep, use_ep);
629 	}
630 
631 	float min_wt_cutoff = hmin_s(min_ep);
632 
633 	// For each mode, use the angular method to compute a shift
634 	compute_angular_endpoints_1plane(
635 	    privateProfile, only_always, bsd, dec_weights_ideal, max_weight_quant, tmpbuf);
636 
637 	float* weight_low_value = tmpbuf.weight_low_value1;
638 	float* weight_high_value = tmpbuf.weight_high_value1;
639 	int8_t* qwt_bitcounts = tmpbuf.qwt_bitcounts;
640 	float* qwt_errors = tmpbuf.qwt_errors;
641 
642 	// For each mode (which specifies a decimation and a quantization):
643 	//     * Compute number of bits needed for the quantized weights
644 	//     * Generate an optimized set of quantized weights
645 	//     * Compute quantization errors for the mode
646 
647 
648 	static const int8_t free_bits_for_partition_count[4] {
649 		115 - 4, 111 - 4 - PARTITION_INDEX_BITS, 108 - 4 - PARTITION_INDEX_BITS, 105 - 4 - PARTITION_INDEX_BITS
650 	};
651 
652 	unsigned int max_block_modes = only_always ? bsd.block_mode_count_1plane_always
653 	                                           : bsd.block_mode_count_1plane_selected;
654 	promise(max_block_modes > 0);
655 	for (unsigned int i = 0; i < max_block_modes; i++)
656 	{
657 		const block_mode& bm = bsd.block_modes[i];
658 
659 		if (bm.quant_mode > max_weight_quant)
660 		{
661 			qwt_errors[i] = 1e38f;
662 			continue;
663 		}
664 
665 		assert(!bm.is_dual_plane);
666 		int bitcount = free_bits_for_partition_count[partition_count - 1] - bm.weight_bits;
667 		if (bitcount <= 0)
668 		{
669 			qwt_errors[i] = 1e38f;
670 			continue;
671 		}
672 
673 		if (weight_high_value[i] > 1.02f * min_wt_cutoff)
674 		{
675 			weight_high_value[i] = 1.0f;
676 		}
677 
678 		int decimation_mode = bm.decimation_mode;
679 		const auto& di = bsd.get_decimation_info(decimation_mode);
680 
681 		qwt_bitcounts[i] = static_cast<int8_t>(bitcount);
682 
683 		ASTCENC_ALIGNAS float dec_weights_uquantf[BLOCK_MAX_WEIGHTS];
684 
685 		// Generate the optimized set of weights for the weight mode
686 		compute_quantized_weights_for_decimation(
687 		    di,
688 		    weight_low_value[i], weight_high_value[i],
689 		    dec_weights_ideal + BLOCK_MAX_WEIGHTS * decimation_mode,
690 		    dec_weights_uquantf,
691 		    dec_weights_uquant + BLOCK_MAX_WEIGHTS * i,
692 		    bm.get_weight_quant_mode());
693 
694 		// Compute weight quantization errors for the block mode
695 		qwt_errors[i] = compute_error_of_weight_set_1plane(
696 		    ei,
697 		    di,
698 		    dec_weights_uquantf);
699 	}
700 
701 	// Decide the optimal combination of color endpoint encodings and weight encodings
702 	uint8_t partition_format_specifiers[TUNE_MAX_TRIAL_CANDIDATES][BLOCK_MAX_PARTITIONS];
703 	int block_mode_index[TUNE_MAX_TRIAL_CANDIDATES];
704 
705 	quant_method color_quant_level[TUNE_MAX_TRIAL_CANDIDATES];
706 	quant_method color_quant_level_mod[TUNE_MAX_TRIAL_CANDIDATES];
707 
708 	unsigned int candidate_count = compute_ideal_endpoint_formats(
709 	    privateProfile,
710 	    pi, blk, ei.ep, qwt_bitcounts, qwt_errors,
711 	    config.tune_candidate_limit, 0, max_block_modes,
712 	    partition_format_specifiers, block_mode_index,
713 	    color_quant_level, color_quant_level_mod, tmpbuf);
714 
715 	// Iterate over the N believed-to-be-best modes to find out which one is actually best
716 	float best_errorval_in_mode = ERROR_CALC_DEFAULT;
717 	float best_errorval_in_scb = scb.errorval;
718 
719 	for (unsigned int i = 0; i < candidate_count; i++)
720 	{
721 		TRACE_NODE(node0, "candidate");
722 
723 		const int bm_packed_index = block_mode_index[i];
724 		assert(bm_packed_index >= 0 && bm_packed_index < static_cast<int>(bsd.block_mode_count_1plane_selected));
725 		const block_mode& qw_bm = bsd.block_modes[bm_packed_index];
726 
727 		int decimation_mode = qw_bm.decimation_mode;
728 		const auto& di = bsd.get_decimation_info(decimation_mode);
729 		promise(di.weight_count > 0);
730 
731 		trace_add_data("weight_x", di.weight_x);
732 		trace_add_data("weight_y", di.weight_y);
733 		trace_add_data("weight_z", di.weight_z);
734 		trace_add_data("weight_quant", qw_bm.quant_mode);
735 
736 		// Recompute the ideal color endpoints before storing them
737 		vfloat4 rgbs_colors[BLOCK_MAX_PARTITIONS];
738 		vfloat4 rgbo_colors[BLOCK_MAX_PARTITIONS];
739 
740 		symbolic_compressed_block workscb;
741 		endpoints workep = ei.ep;
742 
743 		uint8_t* u8_weight_src = dec_weights_uquant + BLOCK_MAX_WEIGHTS * bm_packed_index;
744 
745 		for (unsigned int j = 0; j < di.weight_count; j++)
746 		{
747 			workscb.weights[j] = u8_weight_src[j];
748 		}
749 
750 		for (unsigned int l = 0; l < config.tune_refinement_limit; l++)
751 		{
752 			recompute_ideal_colors_1plane(
753 			    blk, pi, di, workscb.weights,
754 			    workep, rgbs_colors, rgbo_colors);
755 
756 			// Quantize the chosen color, tracking if worth trying the mod value
757 			bool all_same = color_quant_level[i] != color_quant_level_mod[i];
758 			for (unsigned int j = 0; j < partition_count; j++)
759 			{
760 				workscb.color_formats[j] = pack_color_endpoints(
761 				    privateProfile,
762 				    workep.endpt0[j],
763 				    workep.endpt1[j],
764 				    rgbs_colors[j],
765 				    rgbo_colors[j],
766 				    partition_format_specifiers[i][j],
767 				    workscb.color_values[j],
768 				    color_quant_level[i]);
769 
770 				all_same = all_same && workscb.color_formats[j] == workscb.color_formats[0];
771 			}
772 
773 			// If all the color endpoint modes are the same, we get a few more bits to store colors;
774 			// let's see if we can take advantage of this: requantize all the colors and see if the
775 			// endpoint modes remain the same.
776 			workscb.color_formats_matched = 0;
777 			if (partition_count >= 2 && all_same)
778 			{
779 				uint8_t colorvals[BLOCK_MAX_PARTITIONS][8];
780 				uint8_t color_formats_mod[BLOCK_MAX_PARTITIONS] { 0 };
781 				bool all_same_mod = true;
782 				for (unsigned int j = 0; j < partition_count; j++)
783 				{
784 					color_formats_mod[j] = pack_color_endpoints(
785 					    privateProfile,
786 					    workep.endpt0[j],
787 					    workep.endpt1[j],
788 					    rgbs_colors[j],
789 					    rgbo_colors[j],
790 					    partition_format_specifiers[i][j],
791 					    colorvals[j],
792 					    color_quant_level_mod[i]);
793 
794 					// Early out as soon as it's no longer possible to use mod
795 					if (color_formats_mod[j] != color_formats_mod[0])
796 					{
797 						all_same_mod = false;
798 						break;
799 					}
800 				}
801 
802 				if (all_same_mod)
803 				{
804 					workscb.color_formats_matched = 1;
805 					for (unsigned int j = 0; j < BLOCK_MAX_PARTITIONS; j++)
806 					{
807 						for (unsigned int k = 0; k < 8; k++)
808 						{
809 							workscb.color_values[j][k] = colorvals[j][k];
810 						}
811 
812 						workscb.color_formats[j] = color_formats_mod[j];
813 					}
814 				}
815 			}
816 
817 			// Store header fields
818 			workscb.partition_count = static_cast<uint8_t>(partition_count);
819 			workscb.partition_index = static_cast<uint16_t>(partition_index);
820 			workscb.plane2_component = -1;
821 			workscb.quant_mode = workscb.color_formats_matched ? color_quant_level_mod[i] : color_quant_level[i];
822 			workscb.block_mode = qw_bm.mode_index;
823 			workscb.block_type = SYM_BTYPE_NONCONST;
824 			if (privateProfile == HIGH_SPEED_PROFILE ||
825 				privateProfile == HIGH_SPEED_PROFILE_HIGHBITS)
826 			{
827 				workscb.errorval = 0;
828 				scb = workscb;
829 				break;
830 			}
831 			// Pre-realign test
832 			if (l == 0)
833 			{
834 				float errorval = compute_difference(config, bsd, workscb, blk);
835 				if (errorval == -ERROR_CALC_DEFAULT)
836 				{
837 					errorval = -errorval;
838 					workscb.block_type = SYM_BTYPE_ERROR;
839 				}
840 
841 				trace_add_data("error_prerealign", errorval);
842 				best_errorval_in_mode = astc::min(errorval, best_errorval_in_mode);
843 
				// Average refinement improvement is 3.5% per iteration (allow 4.5%), but the first
				// iteration can help more so we give it an extra 8% leeway. Use this knowledge to
				// drive a heuristic to skip blocks that are unlikely to catch up with the best
				// block we have already.
848 				unsigned int iters_remaining = config.tune_refinement_limit - l;
849 				float threshold = (0.045f * static_cast<float>(iters_remaining)) + 1.08f;
850 				if (errorval > (threshold * best_errorval_in_scb))
851 				{
852 					break;
853 				}
854 
855 				if (errorval < best_errorval_in_scb)
856 				{
857 					best_errorval_in_scb = errorval;
858 					workscb.errorval = errorval;
859 					scb = workscb;
860 
861 					if (errorval < tune_errorval_threshold)
862 					{
863 						// Skip remaining candidates - this is "good enough"
864 						i = candidate_count;
865 						break;
866 					}
867 				}
868 			}
869 
870 			bool adjustments;
871 			if (di.weight_count != bsd.texel_count)
872 			{
873 				adjustments = realign_weights_decimated(
874 					config.profile, bsd, blk, workscb);
875 			}
876 			else
877 			{
878 				adjustments = realign_weights_undecimated(
879 					config.profile, bsd, blk, workscb);
880 			}
881 
882 			// Post-realign test
883 			float errorval = compute_difference(config, bsd, workscb, blk);
884 			if (errorval == -ERROR_CALC_DEFAULT)
885 			{
886 				errorval = -errorval;
887 				workscb.block_type = SYM_BTYPE_ERROR;
888 			}
889 
890 			trace_add_data("error_postrealign", errorval);
891 			best_errorval_in_mode = astc::min(errorval, best_errorval_in_mode);
892 
893 			// Average refinement improvement is 3.5% per iteration, so skip blocks that are
894 			// unlikely to catch up with the best block we have already. Assume a 4.5% per step to
895 			// give benefit of the doubt ...
896 			unsigned int iters_remaining = config.tune_refinement_limit - 1 - l;
897 			float threshold = (0.045f * static_cast<float>(iters_remaining)) + 1.0f;
898 			if (errorval > (threshold * best_errorval_in_scb))
899 			{
900 				break;
901 			}
902 
903 			if (errorval < best_errorval_in_scb)
904 			{
905 				best_errorval_in_scb = errorval;
906 				workscb.errorval = errorval;
907 				scb = workscb;
908 
909 				if (errorval < tune_errorval_threshold)
910 				{
911 					// Skip remaining candidates - this is "good enough"
912 					i = candidate_count;
913 					break;
914 				}
915 			}
916 
917 			if (!adjustments)
918 			{
919 				break;
920 			}
921 		}
922 	}
923 
924 	return best_errorval_in_mode;
925 }
926 
927 /**
928  * @brief Compress a block using a chosen partitioning and 2 planes of weights.
929  *
930  * @param      config                    The compressor configuration.
931  * @param      bsd                       The block size information.
932  * @param      blk                       The image block color data to compress.
933  * @param      tune_errorval_threshold   The error value threshold.
934  * @param      plane2_component          The component index for the second plane of weights.
935  * @param[out] scb                       The symbolic compressed block output.
936  * @param[out] tmpbuf                    The quantized weights for plane 1.
937  */
compress_symbolic_block_for_partition_2planes(QualityProfile privateProfile,const astcenc_config & config,const block_size_descriptor & bsd,const image_block & blk,float tune_errorval_threshold,unsigned int plane2_component,symbolic_compressed_block & scb,compression_working_buffers & tmpbuf,int quant_limit)938 static float compress_symbolic_block_for_partition_2planes(
939 	QualityProfile privateProfile,
940 	const astcenc_config& config,
941 	const block_size_descriptor& bsd,
942 	const image_block& blk,
943 	float tune_errorval_threshold,
944 	unsigned int plane2_component,
945 	symbolic_compressed_block& scb,
946 	compression_working_buffers& tmpbuf,
947 	int quant_limit
948 ) {
949 	promise(config.tune_candidate_limit > 0);
950 	promise(config.tune_refinement_limit > 0);
951 	promise(bsd.decimation_mode_count_selected > 0);
952 
953 	int max_weight_quant = astc::min(static_cast<int>(QUANT_32), quant_limit);
954 
955 	// Compute ideal weights and endpoint colors, with no quantization or decimation
956 	endpoints_and_weights& ei1 = tmpbuf.ei1;
957 	endpoints_and_weights& ei2 = tmpbuf.ei2;
958 
959 	compute_ideal_colors_and_weights_2planes(bsd, blk, plane2_component, ei1, ei2);
960 
961 	// Compute ideal weights and endpoint colors for every decimation
962 	float* dec_weights_ideal = tmpbuf.dec_weights_ideal;
963 	uint8_t* dec_weights_uquant = tmpbuf.dec_weights_uquant;
964 
965 	// For each decimation mode, compute an ideal set of weights with no quantization
966 	for (unsigned int i = 0; i < bsd.decimation_mode_count_selected; i++)
967 	{
968 		const auto& dm = bsd.get_decimation_mode(i);
969 		if (!dm.is_ref_2plane(static_cast<quant_method>(max_weight_quant)))
970 		{
971 			continue;
972 		}
973 
974 		const auto& di = bsd.get_decimation_info(i);
975 
976 		compute_ideal_weights_for_decimation(
977 		    ei1,
978 		    di,
979 		    dec_weights_ideal + i * BLOCK_MAX_WEIGHTS);
980 
981 		compute_ideal_weights_for_decimation(
982 		    ei2,
983 		    di,
984 		    dec_weights_ideal + i * BLOCK_MAX_WEIGHTS + WEIGHTS_PLANE2_OFFSET);
985 	}
986 
987 	// Compute maximum colors for the endpoints and ideal weights, then for each endpoint and ideal
988 	// weight pair, compute the smallest weight that will result in a color value greater than 1
989 	vfloat4 min_ep1(10.0f);
990 	vfloat4 min_ep2(10.0f);
991 
992 	vfloat4 ep1 = (vfloat4(1.0f) - ei1.ep.endpt0[0]) / (ei1.ep.endpt1[0] - ei1.ep.endpt0[0]);
993 	vmask4 use_ep1 = (ep1 > vfloat4(0.5f)) & (ep1 < min_ep1);
994 	min_ep1 = select(min_ep1, ep1, use_ep1);
995 
996 	vfloat4 ep2 = (vfloat4(1.0f) - ei2.ep.endpt0[0]) / (ei2.ep.endpt1[0] - ei2.ep.endpt0[0]);
997 	vmask4 use_ep2 = (ep2 > vfloat4(0.5f)) & (ep2 < min_ep2);
998 	min_ep2 = select(min_ep2, ep2, use_ep2);
999 
1000 	vfloat4 err_max(ERROR_CALC_DEFAULT);
1001 	vmask4 err_mask = vint4::lane_id() == vint4(plane2_component);
1002 
1003 	// Set the plane2 component to max error in ep1
1004 	min_ep1 = select(min_ep1, err_max, err_mask);
1005 
1006 	float min_wt_cutoff1 = hmin_s(min_ep1);
1007 
1008 	// Set the minwt2 to the plane2 component min in ep2
1009 	float min_wt_cutoff2 = hmin_s(select(err_max, min_ep2, err_mask));
1010 
1011 	compute_angular_endpoints_2planes(
1012 	    privateProfile, bsd, dec_weights_ideal, max_weight_quant, tmpbuf);
1013 
1014 	// For each mode (which specifies a decimation and a quantization):
1015 	//     * Compute number of bits needed for the quantized weights
1016 	//     * Generate an optimized set of quantized weights
1017 	//     * Compute quantization errors for the mode
1018 
1019 	float* weight_low_value1 = tmpbuf.weight_low_value1;
1020 	float* weight_high_value1 = tmpbuf.weight_high_value1;
1021 	float* weight_low_value2 = tmpbuf.weight_low_value2;
1022 	float* weight_high_value2 = tmpbuf.weight_high_value2;
1023 
1024 	int8_t* qwt_bitcounts = tmpbuf.qwt_bitcounts;
1025 	float* qwt_errors = tmpbuf.qwt_errors;
1026 
1027 	unsigned int start_2plane = bsd.block_mode_count_1plane_selected;
1028 	unsigned int end_2plane = bsd.block_mode_count_1plane_2plane_selected;
1029 
1030 	for (unsigned int i = start_2plane; i < end_2plane; i++)
1031 	{
1032 		const block_mode& bm = bsd.block_modes[i];
1033 		assert(bm.is_dual_plane);
1034 
1035 		if (bm.quant_mode > max_weight_quant)
1036 		{
1037 			qwt_errors[i] = 1e38f;
1038 			continue;
1039 		}
1040 
1041 		qwt_bitcounts[i] = static_cast<int8_t>(109 - bm.weight_bits);
1042 
1043 		if (weight_high_value1[i] > 1.02f * min_wt_cutoff1)
1044 		{
1045 			weight_high_value1[i] = 1.0f;
1046 		}
1047 
1048 		if (weight_high_value2[i] > 1.02f * min_wt_cutoff2)
1049 		{
1050 			weight_high_value2[i] = 1.0f;
1051 		}
1052 
1053 		unsigned int decimation_mode = bm.decimation_mode;
1054 		const auto& di = bsd.get_decimation_info(decimation_mode);
1055 
1056 		ASTCENC_ALIGNAS float dec_weights_uquantf[BLOCK_MAX_WEIGHTS];
1057 
1058 		// Generate the optimized set of weights for the mode
1059 		compute_quantized_weights_for_decimation(
1060 		    di,
1061 		    weight_low_value1[i],
1062 		    weight_high_value1[i],
1063 		    dec_weights_ideal + BLOCK_MAX_WEIGHTS * decimation_mode,
1064 		    dec_weights_uquantf,
1065 		    dec_weights_uquant + BLOCK_MAX_WEIGHTS * i,
1066 		    bm.get_weight_quant_mode());
1067 
1068 		compute_quantized_weights_for_decimation(
1069 		    di,
1070 		    weight_low_value2[i],
1071 		    weight_high_value2[i],
1072 		    dec_weights_ideal + BLOCK_MAX_WEIGHTS * decimation_mode + WEIGHTS_PLANE2_OFFSET,
1073 		    dec_weights_uquantf + WEIGHTS_PLANE2_OFFSET,
1074 		    dec_weights_uquant + BLOCK_MAX_WEIGHTS * i + WEIGHTS_PLANE2_OFFSET,
1075 		    bm.get_weight_quant_mode());
1076 
1077 		// Compute weight quantization errors for the block mode
1078 		qwt_errors[i] = compute_error_of_weight_set_2planes(
1079 		    ei1,
1080 		    ei2,
1081 		    di,
1082 		    dec_weights_uquantf,
1083 		    dec_weights_uquantf + WEIGHTS_PLANE2_OFFSET);
1084 	}
1085 
1086 	// Decide the optimal combination of color endpoint encodings and weight encodings
1087 	uint8_t partition_format_specifiers[TUNE_MAX_TRIAL_CANDIDATES][BLOCK_MAX_PARTITIONS];
1088 	int block_mode_index[TUNE_MAX_TRIAL_CANDIDATES];
1089 
1090 	quant_method color_quant_level[TUNE_MAX_TRIAL_CANDIDATES];
1091 	quant_method color_quant_level_mod[TUNE_MAX_TRIAL_CANDIDATES];
1092 
1093 	endpoints epm;
1094 	merge_endpoints(ei1.ep, ei2.ep, plane2_component, epm);
1095 
1096 	const auto& pi = bsd.get_partition_info(1, 0);
1097 	unsigned int candidate_count = compute_ideal_endpoint_formats(
1098 	    config.privateProfile,
1099 	    pi, blk, epm, qwt_bitcounts, qwt_errors,
1100 	    config.tune_candidate_limit,
1101 		bsd.block_mode_count_1plane_selected, bsd.block_mode_count_1plane_2plane_selected,
1102 	    partition_format_specifiers, block_mode_index,
1103 	    color_quant_level, color_quant_level_mod, tmpbuf);
1104 
1105 	// Iterate over the N believed-to-be-best modes to find out which one is actually best
1106 	float best_errorval_in_mode = ERROR_CALC_DEFAULT;
1107 	float best_errorval_in_scb = scb.errorval;
1108 
1109 	for (unsigned int i = 0; i < candidate_count; i++)
1110 	{
1111 		TRACE_NODE(node0, "candidate");
1112 
1113 		const int bm_packed_index = block_mode_index[i];
1114 		assert(bm_packed_index >= static_cast<int>(bsd.block_mode_count_1plane_selected) &&
1115 		       bm_packed_index < static_cast<int>(bsd.block_mode_count_1plane_2plane_selected));
1116 		const block_mode& qw_bm = bsd.block_modes[bm_packed_index];
1117 
1118 		int decimation_mode = qw_bm.decimation_mode;
1119 		const auto& di = bsd.get_decimation_info(decimation_mode);
1120 		promise(di.weight_count > 0);
1121 
1122 		trace_add_data("weight_x", di.weight_x);
1123 		trace_add_data("weight_y", di.weight_y);
1124 		trace_add_data("weight_z", di.weight_z);
1125 		trace_add_data("weight_quant", qw_bm.quant_mode);
1126 
1127 		vfloat4 rgbs_color;
1128 		vfloat4 rgbo_color;
1129 
1130 		symbolic_compressed_block workscb;
1131 		endpoints workep = epm;
1132 
1133 		uint8_t* u8_weight1_src = dec_weights_uquant + BLOCK_MAX_WEIGHTS * bm_packed_index;
1134 		uint8_t* u8_weight2_src = dec_weights_uquant + BLOCK_MAX_WEIGHTS * bm_packed_index + WEIGHTS_PLANE2_OFFSET;
1135 
1136 		for (int j = 0; j < di.weight_count; j++)
1137 		{
1138 			workscb.weights[j] = u8_weight1_src[j];
1139 			workscb.weights[j + WEIGHTS_PLANE2_OFFSET] = u8_weight2_src[j];
1140 		}
1141 
1142 		for (unsigned int l = 0; l < config.tune_refinement_limit; l++)
1143 		{
1144 			recompute_ideal_colors_2planes(
1145 			    blk, bsd, di,
1146 			    workscb.weights, workscb.weights + WEIGHTS_PLANE2_OFFSET,
1147 			    workep, rgbs_color, rgbo_color, plane2_component);
1148 
1149 			// Quantize the chosen color
1150 			workscb.color_formats[0] = pack_color_endpoints(
1151 			                               privateProfile,
1152 			                               workep.endpt0[0],
1153 			                               workep.endpt1[0],
1154 			                               rgbs_color, rgbo_color,
1155 			                               partition_format_specifiers[i][0],
1156 			                               workscb.color_values[0],
1157 			                               color_quant_level[i]);
1158 
1159 			// Store header fields
1160 			workscb.partition_count = 1;
1161 			workscb.partition_index = 0;
1162 			workscb.quant_mode = color_quant_level[i];
1163 			workscb.color_formats_matched = 0;
1164 			workscb.block_mode = qw_bm.mode_index;
1165 			workscb.plane2_component = static_cast<int8_t>(plane2_component);
1166 			workscb.block_type = SYM_BTYPE_NONCONST;
1167 
1168 			// Pre-realign test
1169 			if (l == 0)
1170 			{
1171 				float errorval = compute_symbolic_block_difference_2plane(config, bsd, workscb, blk);
1172 				if (errorval == -ERROR_CALC_DEFAULT)
1173 				{
1174 					errorval = -errorval;
1175 					workscb.block_type = SYM_BTYPE_ERROR;
1176 				}
1177 
1178 				trace_add_data("error_prerealign", errorval);
1179 				best_errorval_in_mode = astc::min(errorval, best_errorval_in_mode);
1180 
1181 				// Average refinement improvement is 3.5% per iteration (allow 4.5%), but the first
1182 				// iteration can help more so we give it a extra 8% leeway. Use this knowledge to
1183 				// drive a heuristic to skip blocks that are unlikely to catch up with the best
1184 				// block we have already.
1185 				unsigned int iters_remaining = config.tune_refinement_limit - l;
1186 				float threshold = (0.045f * static_cast<float>(iters_remaining)) + 1.08f;
1187 				if (errorval > (threshold * best_errorval_in_scb))
1188 				{
1189 					break;
1190 				}
1191 
1192 				if (errorval < best_errorval_in_scb)
1193 				{
1194 					best_errorval_in_scb = errorval;
1195 					workscb.errorval = errorval;
1196 					scb = workscb;
1197 
1198 					if (errorval < tune_errorval_threshold)
1199 					{
1200 						// Skip remaining candidates - this is "good enough"
1201 						i = candidate_count;
1202 						break;
1203 					}
1204 				}
1205 			}
1206 
1207 			// Perform a final pass over the weights to try to improve them.
1208 			bool adjustments;
1209 			if (di.weight_count != bsd.texel_count)
1210 			{
1211 				adjustments = realign_weights_decimated(
1212 					config.profile, bsd, blk, workscb);
1213 			}
1214 			else
1215 			{
1216 				adjustments = realign_weights_undecimated(
1217 					config.profile, bsd, blk, workscb);
1218 			}
1219 
1220 			// Post-realign test
1221 			float errorval = compute_symbolic_block_difference_2plane(config, bsd, workscb, blk);
1222 			if (errorval == -ERROR_CALC_DEFAULT)
1223 			{
1224 				errorval = -errorval;
1225 				workscb.block_type = SYM_BTYPE_ERROR;
1226 			}
1227 
1228 			trace_add_data("error_postrealign", errorval);
1229 			best_errorval_in_mode = astc::min(errorval, best_errorval_in_mode);
1230 
1231 			// Average refinement improvement is 3.5% per iteration, so skip blocks that are
1232 			// unlikely to catch up with the best block we have already. Assume a 4.5% per step to
1233 			// give benefit of the doubt ...
1234 			unsigned int iters_remaining = config.tune_refinement_limit - 1 - l;
1235 			float threshold = (0.045f * static_cast<float>(iters_remaining)) + 1.0f;
1236 			if (errorval > (threshold * best_errorval_in_scb))
1237 			{
1238 				break;
1239 			}
1240 
1241 			if (errorval < best_errorval_in_scb)
1242 			{
1243 				best_errorval_in_scb = errorval;
1244 				workscb.errorval = errorval;
1245 				scb = workscb;
1246 
1247 				if (errorval < tune_errorval_threshold)
1248 				{
1249 					// Skip remaining candidates - this is "good enough"
1250 					i = candidate_count;
1251 					break;
1252 				}
1253 			}
1254 
1255 			if (!adjustments)
1256 			{
1257 				break;
1258 			}
1259 		}
1260 	}
1261 
1262 	return best_errorval_in_mode;
1263 }
1264 
1265 /**
1266  * @brief Determine the lowest cross-channel correlation factor.
1267  *
1268  * @param texels_per_block   The number of texels in a block.
1269  * @param blk                The image block color data to compress.
1270  *
1271  * @return Return the lowest correlation factor.
1272  */
prepare_block_statistics(int texels_per_block,const image_block & blk)1273 static float prepare_block_statistics(
1274 	int texels_per_block,
1275 	const image_block& blk
1276 ) {
1277 	// Compute covariance matrix, as a collection of 10 scalars that form the upper-triangular row
1278 	// of the matrix. The matrix is symmetric, so this is all we need for this use case.
1279 	float rs = 0.0f;
1280 	float gs = 0.0f;
1281 	float bs = 0.0f;
1282 	float as = 0.0f;
1283 	float rr_var = 0.0f;
1284 	float gg_var = 0.0f;
1285 	float bb_var = 0.0f;
1286 	float aa_var = 0.0f;
1287 	float rg_cov = 0.0f;
1288 	float rb_cov = 0.0f;
1289 	float ra_cov = 0.0f;
1290 	float gb_cov = 0.0f;
1291 	float ga_cov = 0.0f;
1292 	float ba_cov = 0.0f;
1293 
1294 	float weight_sum = 0.0f;
1295 
1296 	promise(texels_per_block > 0);
1297 	for (int i = 0; i < texels_per_block; i++)
1298 	{
1299 		float weight = hadd_s(blk.channel_weight) / 4.0f;
1300 		assert(weight >= 0.0f);
1301 		weight_sum += weight;
1302 
1303 		float r = blk.data_r[i];
1304 		float g = blk.data_g[i];
1305 		float b = blk.data_b[i];
1306 		float a = blk.data_a[i];
1307 
1308 		float rw = r * weight;
1309 		rs += rw;
1310 		rr_var += r * rw;
1311 		rg_cov += g * rw;
1312 		rb_cov += b * rw;
1313 		ra_cov += a * rw;
1314 
1315 		float gw = g * weight;
1316 		gs += gw;
1317 		gg_var += g * gw;
1318 		gb_cov += b * gw;
1319 		ga_cov += a * gw;
1320 
1321 		float bw = b * weight;
1322 		bs += bw;
1323 		bb_var += b * bw;
1324 		ba_cov += a * bw;
1325 
1326 		float aw = a * weight;
1327 		as += aw;
1328 		aa_var += a * aw;
1329 	}
1330 
1331 	float rpt = 1.0f / astc::max(weight_sum, 1e-7f);
1332 
1333 	rr_var -= rs * (rs * rpt);
1334 	rg_cov -= gs * (rs * rpt);
1335 	rb_cov -= bs * (rs * rpt);
1336 	ra_cov -= as * (rs * rpt);
1337 
1338 	gg_var -= gs * (gs * rpt);
1339 	gb_cov -= bs * (gs * rpt);
1340 	ga_cov -= as * (gs * rpt);
1341 
1342 	bb_var -= bs * (bs * rpt);
1343 	ba_cov -= as * (bs * rpt);
1344 
1345 	aa_var -= as * (as * rpt);
1346 
1347 	// These will give a NaN if a channel is constant - these are fixed up in the next step
1348 	rg_cov *= astc::rsqrt(rr_var * gg_var);
1349 	rb_cov *= astc::rsqrt(rr_var * bb_var);
1350 	ra_cov *= astc::rsqrt(rr_var * aa_var);
1351 	gb_cov *= astc::rsqrt(gg_var * bb_var);
1352 	ga_cov *= astc::rsqrt(gg_var * aa_var);
1353 	ba_cov *= astc::rsqrt(bb_var * aa_var);
1354 
1355 	if (astc::isnan(rg_cov)) rg_cov = 1.0f;
1356 	if (astc::isnan(rb_cov)) rb_cov = 1.0f;
1357 	if (astc::isnan(ra_cov)) ra_cov = 1.0f;
1358 	if (astc::isnan(gb_cov)) gb_cov = 1.0f;
1359 	if (astc::isnan(ga_cov)) ga_cov = 1.0f;
1360 	if (astc::isnan(ba_cov)) ba_cov = 1.0f;
1361 
1362 	float lowest_correlation = astc::min(fabsf(rg_cov),      fabsf(rb_cov));
1363 	lowest_correlation       = astc::min(lowest_correlation, fabsf(ra_cov));
1364 	lowest_correlation       = astc::min(lowest_correlation, fabsf(gb_cov));
1365 	lowest_correlation       = astc::min(lowest_correlation, fabsf(ga_cov));
1366 	lowest_correlation       = astc::min(lowest_correlation, fabsf(ba_cov));
1367 
1368 	// Diagnostic trace points
1369 	trace_add_data("min_r", blk.data_min.lane<0>());
1370 	trace_add_data("max_r", blk.data_max.lane<0>());
1371 	trace_add_data("min_g", blk.data_min.lane<1>());
1372 	trace_add_data("max_g", blk.data_max.lane<1>());
1373 	trace_add_data("min_b", blk.data_min.lane<2>());
1374 	trace_add_data("max_b", blk.data_max.lane<2>());
1375 	trace_add_data("min_a", blk.data_min.lane<3>());
1376 	trace_add_data("max_a", blk.data_max.lane<3>());
1377 	trace_add_data("cov_rg", fabsf(rg_cov));
1378 	trace_add_data("cov_rb", fabsf(rb_cov));
1379 	trace_add_data("cov_ra", fabsf(ra_cov));
1380 	trace_add_data("cov_gb", fabsf(gb_cov));
1381 	trace_add_data("cov_ga", fabsf(ga_cov));
1382 	trace_add_data("cov_ba", fabsf(ba_cov));
1383 
1384 	return lowest_correlation;
1385 }
1386 
1387 /* See header for documentation. */
compress_block(const astcenc_contexti & ctx,const image_block & blk,uint8_t pcb[16],compression_working_buffers & tmpbuf,bool calQualityEnable,int32_t * mseBlock[RGBA_COM])1388 void compress_block(
1389 	const astcenc_contexti& ctx,
1390 	const image_block& blk,
1391 	uint8_t pcb[16],
1392 #if QUALITY_CONTROL
1393 	compression_working_buffers& tmpbuf,
1394 	bool calQualityEnable,
1395 	int32_t *mseBlock[RGBA_COM]
1396 #else
1397 	compression_working_buffers& tmpbuf
1398 #endif
1399 	)
1400 {
1401 	astcenc_profile decode_mode = ctx.config.profile;
1402 	symbolic_compressed_block scb;
1403 	const block_size_descriptor& bsd = *ctx.bsd;
1404 	float lowest_correl;
1405 
1406 	TRACE_NODE(node0, "block");
1407 	trace_add_data("pos_x", blk.xpos);
1408 	trace_add_data("pos_y", blk.ypos);
1409 	trace_add_data("pos_z", blk.zpos);
1410 
1411 	// Set stricter block targets for luminance data as we have more bits to play with
1412 	bool block_is_l = blk.is_luminance();
1413 	float block_is_l_scale = block_is_l ? 1.0f / 1.5f : 1.0f;
1414 
1415 	// Set slightly stricter block targets for lumalpha data as we have more bits to play with
1416 	bool block_is_la = blk.is_luminancealpha();
1417 	float block_is_la_scale = block_is_la ? 1.0f / 1.05f : 1.0f;
1418 
1419 	bool block_skip_two_plane = false;
1420 	int max_partitions;
1421 	if (ctx.config.privateProfile == HIGH_SPEED_PROFILE ||
1422 		ctx.config.privateProfile == HIGH_SPEED_PROFILE_HIGHBITS)
1423 	{
1424 		max_partitions = 1;
1425 	}
1426 #ifdef ASTC_CUSTOMIZED_ENABLE
1427 	else if (ctx.config.privateProfile == CUSTOMIZED_PROFILE)
1428 	{
1429 		if (!g_astcCustomizedSoManager.LoadSutCustomizedSo() ||
1430 			g_astcCustomizedSoManager.customizedMaxPartitionsFunc_ == nullptr)
1431 		{
1432 			printf("astcenc customized so dlopen failed or customizedMaxPartitionsFunc_ is nullptr!\n");
1433 			return;
1434 		}
1435 		max_partitions = g_astcCustomizedSoManager.customizedMaxPartitionsFunc_();
1436 	}
1437 #endif
1438 	else
1439 	{
1440 		max_partitions = ctx.config.tune_partition_count_limit;
1441 	}
1442 
1443 	unsigned int requested_partition_indices[3] {
1444 		ctx.config.tune_2partition_index_limit,
1445 		ctx.config.tune_3partition_index_limit,
1446 		ctx.config.tune_4partition_index_limit
1447 	};
1448 
1449 	unsigned int requested_partition_trials[3] {
1450 		ctx.config.tune_2partitioning_candidate_limit,
1451 		ctx.config.tune_3partitioning_candidate_limit,
1452 		ctx.config.tune_4partitioning_candidate_limit
1453 	};
1454 
1455 #if defined(ASTCENC_DIAGNOSTICS)
1456 	// Do this early in diagnostic builds so we can dump uniform metrics
1457 	// for every block. Do it later in release builds to avoid redundant work!
1458 	float error_weight_sum = hadd_s(blk.channel_weight) * bsd.texel_count;
1459 	float error_threshold = ctx.config.tune_db_limit
1460 	                      * error_weight_sum
1461 	                      * block_is_l_scale
1462 	                      * block_is_la_scale;
1463 
1464 	lowest_correl = prepare_block_statistics(bsd.texel_count, blk);
1465 	trace_add_data("lowest_correl", lowest_correl);
1466 	trace_add_data("tune_error_threshold", error_threshold);
1467 #endif
1468 
1469 	// Detected a constant-color block
1470 	if (all(blk.data_min == blk.data_max))
1471 	{
1472 		TRACE_NODE(node1, "pass");
1473 		trace_add_data("partition_count", 0);
1474 		trace_add_data("plane_count", 1);
1475 
1476 		scb.partition_count = 0;
1477 
1478 		// Encode as FP16 if using HDR
1479 		if ((decode_mode == ASTCENC_PRF_HDR) ||
1480 		    (decode_mode == ASTCENC_PRF_HDR_RGB_LDR_A))
1481 		{
1482 			scb.block_type = SYM_BTYPE_CONST_F16;
1483 			vint4 color_f16 = float_to_float16(blk.origin_texel);
1484 			store(color_f16, scb.constant_color);
1485 		}
1486 		// Encode as UNORM16 if NOT using HDR
1487 		else
1488 		{
1489 			scb.block_type = SYM_BTYPE_CONST_U16;
1490 			vfloat4 color_f32 = clamp(0.0f, 1.0f, blk.origin_texel) * 65535.0f;
1491 			vint4 color_u16 = float_to_int_rtn(color_f32);
1492 			store(color_u16, scb.constant_color);
1493 		}
1494 
1495 		trace_add_data("exit", "quality hit");
1496 		if (ctx.config.privateProfile != HIGH_QUALITY_PROFILE &&
1497 			ctx.config.privateProfile != HIGH_SPEED_PROFILE_HIGHBITS)
1498 		{
1499 			scb.block_type = SYM_BTYPE_NONCONST;
1500 			scb.partition_count = 1;
1501 			scb.color_formats_matched = 0;
1502 			scb.plane2_component = -1;
1503 			if (ctx.config.privateProfile == HIGH_SPEED_PROFILE)
1504 			{
1505 				scb.block_mode = HIGH_SPEED_PROFILE_BLOCK_MODE;
1506 			}
1507 #ifdef ASTC_CUSTOMIZED_ENABLE
1508 			else if (ctx.config.privateProfile == CUSTOMIZED_PROFILE)
1509 			{
1510 				if (!g_astcCustomizedSoManager.LoadSutCustomizedSo() ||
1511 					g_astcCustomizedSoManager.customizedBlockModeFunc_ == nullptr)
1512 				{
1513 					printf("astcenc customized so dlopen failed or customizedBlockModeFunc_ is nullptr!\n");
1514 					return;
1515 				}
1516 				scb.block_mode = g_astcCustomizedSoManager.customizedBlockModeFunc_();
1517 			}
1518 #endif
1519 			scb.partition_index = 0;
1520 			scb.quant_mode = QUANT_256;
1521 			scb.color_formats[0] = 12; // color format is 12 when block mode is HIGH_SPEED_PROFILE_BLOCK_MODE
1522 			for (int w = 0; w < 16; w++) { // weights num is 16 when block mode is HIGH_SPEED_PROFILE_BLOCK_MODE
1523 				scb.weights[w] = 0;
1524 			}
1525 			for (unsigned int pixel = 0; pixel < BLOCK_MAX_COMPONENTS; pixel++) { // scb.constant_color[pixel] is 16 bit
1526 				scb.color_values[0][pixel << 1] = scb.constant_color[pixel] & BYTE_MASK; // low byte
1527 				scb.color_values[0][(pixel << 1) + 1] = (scb.constant_color[pixel] >> 8) & BYTE_MASK; // high byte
1528 			}
1529 		}
1530 		scb.privateProfile = ctx.config.privateProfile;
1531 		symbolic_to_physical(bsd, scb, pcb);
1532 #if QUALITY_CONTROL
1533 	if (calQualityEnable) {
1534 		*mseBlock[R_COM] = *mseBlock[G_COM] = *mseBlock[B_COM] = *mseBlock[A_COM] = 0;
1535 	}
1536 #endif
1537 		return;
1538 	}
1539 
1540 #if !defined(ASTCENC_DIAGNOSTICS)
1541 	float error_weight_sum = hadd_s(blk.channel_weight) * bsd.texel_count;
1542 	float error_threshold = ctx.config.tune_db_limit
1543 	                      * error_weight_sum
1544 	                      * block_is_l_scale
1545 	                      * block_is_la_scale;
1546 #endif
1547 
1548 	// Set SCB and mode errors to a very high error value
1549 	scb.errorval = ERROR_CALC_DEFAULT;
1550 	scb.block_type = SYM_BTYPE_ERROR;
1551 
1552 	float best_errorvals_for_pcount[BLOCK_MAX_PARTITIONS] {
1553 		ERROR_CALC_DEFAULT, ERROR_CALC_DEFAULT, ERROR_CALC_DEFAULT, ERROR_CALC_DEFAULT
1554 	};
1555 
1556 	float exit_thresholds_for_pcount[BLOCK_MAX_PARTITIONS] {
1557 		0.0f,
1558 		ctx.config.tune_2partition_early_out_limit_factor,
1559 		ctx.config.tune_3partition_early_out_limit_factor,
1560 		0.0f
1561 	};
1562 
1563 	// Trial using 1 plane of weights and 1 partition.
1564 
1565 	// Most of the time we test it twice, first with a mode cutoff of 0 and then with the specified
1566 	// mode cutoff. This causes an early-out that speeds up encoding of easy blocks. However, this
1567 	// optimization is disabled for 4x4 and 5x4 blocks where it nearly always slows down the
1568 	// compression and slightly reduces image quality.
1569 
1570 	float errorval_mult[2] {
1571 		1.0f / ctx.config.tune_mse_overshoot,
1572 		1.0f
1573 	};
1574 
1575 	static const float errorval_overshoot = 1.0f / ctx.config.tune_mse_overshoot;
1576 
1577 	// Only enable MODE0 fast path if enabled
1578 	// Never enable for 3D blocks as no "always" block modes are available
1579 	int start_trial = 1;
1580  	if ((ctx.config.tune_search_mode0_enable >= TUNE_MIN_SEARCH_MODE0) && (bsd.zdim == 1))
1581 	{
1582 		start_trial = 0;
1583 	}
1584 
1585 	int quant_limit = QUANT_32;
1586 	for (int i = start_trial; i < 2; i++)
1587 	{
1588 		TRACE_NODE(node1, "pass");
1589 		trace_add_data("partition_count", 1);
1590 		trace_add_data("plane_count", 1);
1591 		trace_add_data("search_mode", i);
1592 
1593 		float errorval = compress_symbolic_block_for_partition_1plane(
1594 		    ctx.config.privateProfile,
1595 		    ctx.config, bsd, blk, i == 0,
1596 		    error_threshold * errorval_mult[i] * errorval_overshoot,
1597 		    1, 0,  scb, tmpbuf, QUANT_32);
1598 
1599 		// Record the quant level so we can use it to filter later searches
1600 		const auto& bm = bsd.get_block_mode(scb.block_mode);
1601 		quant_limit = bm.get_weight_quant_mode();
1602 
1603 		best_errorvals_for_pcount[0] = astc::min(best_errorvals_for_pcount[0], errorval);
1604 		if ((ctx.config.privateProfile == HIGH_SPEED_PROFILE ||
1605 			ctx.config.privateProfile == HIGH_SPEED_PROFILE_HIGHBITS) ||
1606 			(errorval < (error_threshold * errorval_mult[i])))
1607 		{
1608 			trace_add_data("exit", "quality hit");
1609 			goto END_OF_TESTS;
1610 		}
1611 	}
1612 
1613 #if !defined(ASTCENC_DIAGNOSTICS)
1614 	lowest_correl = prepare_block_statistics(bsd.texel_count, blk);
1615 #endif
1616 
1617 	block_skip_two_plane = lowest_correl > ctx.config.tune_2plane_early_out_limit_correlation;
1618 
1619 	// Test the four possible 1-partition, 2-planes modes. Do this in reverse, as
1620 	// alpha is the most likely to be non-correlated if it is present in the data.
1621 	for (int i = BLOCK_MAX_COMPONENTS - 1; i >= 0; i--)
1622 	{
1623 		if (ctx.config.privateProfile != HIGH_QUALITY_PROFILE)
1624 		{
1625 			break;
1626 		}
1627 		TRACE_NODE(node1, "pass");
1628 		trace_add_data("partition_count", 1);
1629 		trace_add_data("plane_count", 2);
1630 		trace_add_data("plane_component", i);
1631 
1632 		if (block_skip_two_plane)
1633 		{
1634 			trace_add_data("skip", "tune_2plane_early_out_limit_correlation");
1635 			continue;
1636 		}
1637 
1638 		if (blk.grayscale && i != 3)
1639 		{
1640 			trace_add_data("skip", "grayscale block");
1641 			continue;
1642 		}
1643 
1644 		if (blk.is_constant_channel(i))
1645 		{
1646 			trace_add_data("skip", "constant component");
1647 			continue;
1648 		}
1649 
1650 		float errorval = compress_symbolic_block_for_partition_2planes(
1651 		    ctx.config.privateProfile,
1652 		    ctx.config, bsd, blk, error_threshold * errorval_overshoot,
1653 		    i, scb, tmpbuf, quant_limit);
1654 
1655 		// If attempting two planes is much worse than the best one plane result
1656 		// then further two plane searches are unlikely to help so move on ...
1657 		if (errorval > (best_errorvals_for_pcount[0] * 1.85f))
1658 		{
1659 			break;
1660 		}
1661 
1662 		if (errorval < error_threshold)
1663 		{
1664 			trace_add_data("exit", "quality hit");
1665 			goto END_OF_TESTS;
1666 		}
1667 	}
1668 
1669 	// Find best blocks for 2, 3 and 4 partitions
1670 	for (int partition_count = 2; partition_count <= max_partitions; partition_count++)
1671 	{
1672 		unsigned int partition_indices[TUNE_MAX_PARTITIONING_CANDIDATES];
1673 
1674 		unsigned int requested_indices = requested_partition_indices[partition_count - 2];
1675 
1676 		unsigned int requested_trials = requested_partition_trials[partition_count - 2];
1677 		requested_trials = astc::min(requested_trials, requested_indices);
1678 
1679 		unsigned int actual_trials = find_best_partition_candidates(
1680 		    bsd, blk, partition_count, requested_indices, partition_indices, requested_trials);
1681 
1682 		float best_error_in_prev = best_errorvals_for_pcount[partition_count - 2];
1683 
1684 		for (unsigned int i = 0; i < actual_trials; i++)
1685 		{
1686 			TRACE_NODE(node1, "pass");
1687 			trace_add_data("partition_count", partition_count);
1688 			trace_add_data("partition_index", partition_indices[i]);
1689 			trace_add_data("plane_count", 1);
1690 			trace_add_data("search_mode", i);
1691 
1692 			float errorval = compress_symbolic_block_for_partition_1plane(
1693 			    ctx.config.privateProfile,
1694 			    ctx.config, bsd, blk, false,
1695 			    error_threshold * errorval_overshoot,
1696 			    partition_count, partition_indices[i],
1697 			    scb, tmpbuf, quant_limit);
1698 
1699 			best_errorvals_for_pcount[partition_count - 1] = astc::min(best_errorvals_for_pcount[partition_count - 1], errorval);
1700 
1701 			// If using N partitions doesn't improve much over using N-1 partitions then skip trying
1702 			// N+1. Error can dramatically improve if the data is correlated or non-correlated and
1703 			// aligns with a partitioning that suits that encoding, so for this inner loop check add
1704 			// a large error scale because the "other" trial could be a lot better.
1705 			float best_error = best_errorvals_for_pcount[partition_count - 1];
1706 			float best_error_scale = exit_thresholds_for_pcount[partition_count - 1] * 1.85f;
1707 			if (best_error > (best_error_in_prev * best_error_scale))
1708 			{
1709 				trace_add_data("skip", "tune_partition_early_out_limit_factor");
1710 				goto END_OF_TESTS;
1711 			}
1712 
1713 			if (errorval < error_threshold)
1714 			{
1715 				trace_add_data("exit", "quality hit");
1716 				goto END_OF_TESTS;
1717 			}
1718 		}
1719 
1720 		// If using N partitions doesn't improve much over using N-1 partitions then skip trying N+1
1721 		float best_error = best_errorvals_for_pcount[partition_count - 1];
1722 		float best_error_scale = exit_thresholds_for_pcount[partition_count - 1];
1723 		if (best_error > (best_error_in_prev * best_error_scale))
1724 		{
1725 			trace_add_data("skip", "tune_partition_early_out_limit_factor");
1726 			goto END_OF_TESTS;
1727 		}
1728 	}
1729 
1730 	trace_add_data("exit", "quality not hit");
1731 
1732 END_OF_TESTS:
1733 	// If we still have an error block then convert to something we can encode
1734 	// TODO: Do something more sensible here, such as average color block
1735 	if (scb.block_type == SYM_BTYPE_ERROR)
1736 	{
1737 #if defined(ASTCENC_DIAGNOSTICS)
1738 		static bool printed_once = false;
1739 		if (!printed_once)
1740 		{
1741 			printed_once = true;
1742 			printf("WARN: At least one block failed to find a valid encoding.\n"
1743 			       "      Try increasing compression quality settings.\n\n");
1744 		}
1745 #endif
1746 
1747 		scb.block_type = SYM_BTYPE_CONST_U16;
1748 		vfloat4 color_f32 = clamp(0.0f, 1.0f, blk.origin_texel) * 65535.0f;
1749 		vint4 color_u16 = float_to_int_rtn(color_f32);
1750 		store(color_u16, scb.constant_color);
1751 	}
1752 
1753 	// Compress to a physical block
1754 	scb.privateProfile = ctx.config.privateProfile;
1755 	symbolic_to_physical(bsd, scb, pcb);
1756 #if QUALITY_CONTROL
1757 	if (calQualityEnable) {
1758 		image_block decBlk = blk;
1759 		decompress_symbolic_block(ctx.config.profile, bsd, blk.xpos, blk.ypos, blk.zpos, scb, decBlk);
1760 		vint4 colorSumDiff = vint4::zero();
1761 		for (size_t ii = 0; ii < bsd.texel_count; ii++) {
1762 			vint4 colorRef = float_to_int_rtn(blk.texel(ii) * 255.0f / 65535.0f);
1763 			vint4 colorTest = float_to_int_rtn(min(decBlk.texel(ii), 1.0f) * 255.0f);
1764 			vint4 colorDiff = colorRef - colorTest;
1765 			colorSumDiff += colorDiff * colorDiff;
1766 		}
1767 		*mseBlock[R_COM] = colorSumDiff.lane<0>();
1768 		*mseBlock[G_COM] = colorSumDiff.lane<1>();
1769 		*mseBlock[B_COM] = colorSumDiff.lane<2>();
1770 		*mseBlock[A_COM] = colorSumDiff.lane<3>();
1771     }
1772 #endif
1773 }
1774 
1775 #endif
1776