astc-encoder/Source/astcenc_compress_symbolic.cpp

// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2011-2024 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
// of the License at:
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations
// under the License.
// ----------------------------------------------------------------------------

#if !defined(ASTCENC_DECOMPRESS_ONLY)

/**
 * @brief Functions to compress a symbolic block.
 */

#include "astcenc_internal.h"
#include "astcenc_diagnostic_trace.h"

#include <cassert>
#ifdef ASTC_CUSTOMIZED_ENABLE
// Global manager object, only defined when ASTC_CUSTOMIZED_ENABLE is set at build time.
// NOTE(review): AstcCustomizedSoManager is declared outside this file; presumably it manages an
// optional customized shared library - confirm against its declaration.
AstcCustomizedSoManager g_astcCustomizedSoManager;
#endif

/**
 * @brief Merge two planes of endpoints into a single vector.
 *
 * @param      ep_plane1          The endpoints for plane 1.
 * @param      ep_plane2          The endpoints for plane 2.
 * @param      component_plane2   The color component for plane 2.
 * @param[out] result             The merged output.
 */
static void merge_endpoints(
	const endpoints& ep_plane1,
	const endpoints& ep_plane2,
	unsigned int component_plane2,
	endpoints& result
) {
	unsigned int partition_count = ep_plane1.partition_count;
	assert(partition_count == 1);

	vmask4 sep_mask = vint4::lane_id() == vint4(component_plane2);

	result.partition_count = partition_count;
	result.endpt0[0] = select(ep_plane1.endpt0[0], ep_plane2.endpt0[0], sep_mask);
	result.endpt1[0] = select(ep_plane1.endpt1[0], ep_plane2.endpt1[0], sep_mask);
}

/**
 * @brief Attempt to improve weights given a chosen configuration.
 *
 * Given a fixed weight grid decimation and weight value quantization, iterate over all weights (per
 * partition and per plane) and attempt to improve image quality by moving each weight up by one or
 * down by one quantization step.
 *
 * This is a specialized function which only supports operating on undecimated weight grids,
 * therefore primarily improving the performance of 4x4 and 5x5 blocks where grid decimation
 * is needed less often.
 *
 * @param         decode_mode   The decode mode (LDR, HDR).
 * @param         bsd           The block size information.
 * @param         blk           The image block color data to compress.
 * @param[in,out] scb           The symbolic compressed block; its weights are updated in place.
 *
 * @return @c true if at least one weight was changed, @c false otherwise.
 */
#if ASTCENC_NEON != 0
static bool realign_weights_undecimated(
	astcenc_profile decode_mode,
	const block_size_descriptor& bsd,
	const image_block& blk,
	symbolic_compressed_block& scb
) {
	// NEON build of the undecimated weight realignment: texels are processed four at a time, with
	// a scalar loop handling any remainder. Returns true if any weight in scb was changed.

	// The vector loop is hand-unrolled for exactly four texels per iteration (vint4 / vfloat4
	// lanes), so its stride is fixed at four instead of following ASTCENC_SIMD_WIDTH; a stride
	// wider than the unrolled body would leave texels unprocessed
	const unsigned int texels_per_iter = 4;

	// Get the partition descriptor
	unsigned int partition_count = scb.partition_count;
	const auto& pi = bsd.get_partition_info(partition_count, scb.partition_index);

	// Get the quantization table
	const block_mode& bm = bsd.get_block_mode(scb.block_mode);
	unsigned int weight_quant_level = bm.quant_mode;
	const quant_and_transfer_table& qat = quant_and_xfer_tables[weight_quant_level];

	// One pass per weight plane; plane_mask starts as the lane of the plane 2 component and is
	// inverted after the first pass
	unsigned int max_plane = bm.is_dual_plane;
	int plane2_component = scb.plane2_component;
	vmask4 plane_mask = vint4::lane_id() == vint4(plane2_component);

	// Decode the color endpoints
	bool rgb_hdr;
	bool alpha_hdr;
	vint4 endpnt0[BLOCK_MAX_PARTITIONS];
	vint4 endpnt1[BLOCK_MAX_PARTITIONS];
	vfloat4 endpnt0f[BLOCK_MAX_PARTITIONS];
	vfloat4 offset[BLOCK_MAX_PARTITIONS];

	promise(partition_count > 0);

	for (unsigned int pa_idx = 0; pa_idx < partition_count; pa_idx++)
	{
		unpack_color_endpoints(decode_mode,
		                       scb.color_formats[pa_idx],
		                       scb.color_values[pa_idx],
		                       rgb_hdr, alpha_hdr,
		                       endpnt0[pa_idx],
		                       endpnt1[pa_idx]);
	}

	uint8_t* dec_weights_uquant = scb.weights;
	bool adjustments = false;

	// For each plane and partition ...
	for (unsigned int pl_idx = 0; pl_idx <= max_plane; pl_idx++)
	{
		for (unsigned int pa_idx = 0; pa_idx < partition_count; pa_idx++)
		{
			// Compute the endpoint delta for all components in current plane; lanes selected by
			// plane_mask are zeroed so they do not react to weight changes in this pass
			vint4 epd = endpnt1[pa_idx] - endpnt0[pa_idx];
			epd = select(epd, vint4::zero(), plane_mask);

			// Color change per unit of weight (weights span 0 to 64)
			endpnt0f[pa_idx] = int_to_float(endpnt0[pa_idx]);
			offset[pa_idx] = int_to_float(epd) * (1.0f / 64.0f);
		}

		// For each weight compute previous, current, and next errors
		promise(bsd.texel_count > 0);

		// Vector path: four texels per iteration, one texel per lane
		unsigned int texel = 0;
		for (; texel + texels_per_iter <= bsd.texel_count; texel += texels_per_iter)
		{
			int uqw0 = dec_weights_uquant[texel];
			int uqw1 = dec_weights_uquant[texel + 1];
			int uqw2 = dec_weights_uquant[texel + 2];
			int uqw3 = dec_weights_uquant[texel + 3];

			vint4 uqw_vec = vint4(uqw0, uqw1, uqw2, uqw3);
			vint4 prev_and_next_vec = vint4(qat.prev_next_values[uqw0], qat.prev_next_values[uqw1],
							qat.prev_next_values[uqw2], qat.prev_next_values[uqw3]);

			// Each table entry packs the previous weight in bits 0-7 and the next in bits 8-15
			vint4 mask = vint4(0xFF, 0xFF, 0xFF, 0xFF);
			vint4 uqw_down_vec = prev_and_next_vec & mask;
			vint4 uqw_up_vec = vint4(vshrq_n_s32(prev_and_next_vec.m, 8)) & mask;

			// Weight deltas relative to the current weight
			vfloat4 weight_base_vec = int_to_float(uqw_vec);
			vfloat4 weight_down_vec = int_to_float(uqw_down_vec) - weight_base_vec;
			vfloat4 weight_up_vec = int_to_float(uqw_up_vec) - weight_base_vec;

			unsigned int partition0 = pi.partition_of_texel[texel];
			unsigned int partition1 = pi.partition_of_texel[texel + 1];
			unsigned int partition2 = pi.partition_of_texel[texel + 2];
			unsigned int partition3 = pi.partition_of_texel[texel + 3];

			vfloat4 color_offset0 = offset[partition0];
			vfloat4 color_offset1 = offset[partition1];
			vfloat4 color_offset2 = offset[partition2];
			vfloat4 color_offset3 = offset[partition3];

			vfloat4 color_base0 = endpnt0f[partition0];
			vfloat4 color_base1 = endpnt0f[partition1];
			vfloat4 color_base2 = endpnt0f[partition2];
			vfloat4 color_base3 = endpnt0f[partition3];

			// Interpolate the colors to create the diffs
			vfloat4 color0 = color_base0 + color_offset0 * weight_base_vec.lane<0>();
			vfloat4 color1 = color_base1 + color_offset1 * weight_base_vec.lane<1>();
			vfloat4 color2 = color_base2 + color_offset2 * weight_base_vec.lane<2>();
			vfloat4 color3 = color_base3 + color_offset3 * weight_base_vec.lane<3>();

			vfloat4 orig_color0 = blk.texel(texel);
			vfloat4 orig_color1 = blk.texel(texel + 1);
			vfloat4 orig_color2 = blk.texel(texel + 2);
			vfloat4 orig_color3 = blk.texel(texel + 3);

			vfloat4 error_weight = blk.channel_weight;

			vfloat4 color_diff0 = color0 - orig_color0;
			vfloat4 color_diff1 = color1 - orig_color1;
			vfloat4 color_diff2 = color2 - orig_color2;
			vfloat4 color_diff3 = color3 - orig_color3;

			vfloat4 color_diff_down0 = color_diff0 + color_offset0 * weight_down_vec.lane<0>();
			vfloat4 color_diff_down1 = color_diff1 + color_offset1 * weight_down_vec.lane<1>();
			vfloat4 color_diff_down2 = color_diff2 + color_offset2 * weight_down_vec.lane<2>();
			vfloat4 color_diff_down3 = color_diff3 + color_offset3 * weight_down_vec.lane<3>();

			vfloat4 color_diff_up0 = color_diff0 + color_offset0 * weight_up_vec.lane<0>();
			vfloat4 color_diff_up1 = color_diff1 + color_offset1 * weight_up_vec.lane<1>();
			vfloat4 color_diff_up2 = color_diff2 + color_offset2 * weight_up_vec.lane<2>();
			vfloat4 color_diff_up3 = color_diff3 + color_offset3 * weight_up_vec.lane<3>();

			// Channel-weighted squared error for the current, previous, and next weight
			float error_base0 = dot_s(color_diff0 * color_diff0, error_weight);
			float error_base1 = dot_s(color_diff1 * color_diff1, error_weight);
			float error_base2 = dot_s(color_diff2 * color_diff2, error_weight);
			float error_base3 = dot_s(color_diff3 * color_diff3, error_weight);

			float error_down0 = dot_s(color_diff_down0 * color_diff_down0, error_weight);
			float error_down1 = dot_s(color_diff_down1 * color_diff_down1, error_weight);
			float error_down2 = dot_s(color_diff_down2 * color_diff_down2, error_weight);
			float error_down3 = dot_s(color_diff_down3 * color_diff_down3, error_weight);

			float error_up0 = dot_s(color_diff_up0 * color_diff_up0, error_weight);
			float error_up1 = dot_s(color_diff_up1 * color_diff_up1, error_weight);
			float error_up2 = dot_s(color_diff_up2 * color_diff_up2, error_weight);
			float error_up3 = dot_s(color_diff_up3 * color_diff_up3, error_weight);

			vfloat4 error_base_vec = vfloat4(error_base0, error_base1, error_base2, error_base3);
			vfloat4 error_down_vec = vfloat4(error_down0, error_down1, error_down2, error_down3);
			vfloat4 error_up_vec = vfloat4(error_up0, error_up1, error_up2, error_up3);

			// Same decision as the scalar tail loop: moving up wins if it beats both alternatives,
			// otherwise moving down is taken if it beats the current weight
			vmask4 check_result_up = (error_up_vec < error_base_vec) &
			        (error_up_vec < error_down_vec) & (uqw_vec < vint4(64));

			vmask4 check_result_down = (error_down_vec < error_base_vec) & (uqw_vec > vint4::zero());
			check_result_down = check_result_down & (~check_result_up);

			if (popcount(check_result_up | check_result_down) != 0)
			{
				uqw_vec = select(uqw_vec, uqw_up_vec, check_result_up);
				uqw_vec = select(uqw_vec, uqw_down_vec, check_result_down);

				// Write back all four texel weights; unchanged lanes store their old value
				dec_weights_uquant[texel] = static_cast<uint8_t>(uqw_vec.lane<0>());
				dec_weights_uquant[texel + 1] = static_cast<uint8_t>(uqw_vec.lane<1>());
				dec_weights_uquant[texel + 2] = static_cast<uint8_t>(uqw_vec.lane<2>());
				dec_weights_uquant[texel + 3] = static_cast<uint8_t>(uqw_vec.lane<3>());
				adjustments = true;
			}
		}

		// Scalar tail: texels left over after the last full group of four
		for (; texel < bsd.texel_count; texel++)
		{
			int uqw = dec_weights_uquant[texel];

			uint32_t prev_and_next = qat.prev_next_values[uqw];
			int uqw_down = prev_and_next & 0xFF;
			int uqw_up = (prev_and_next >> 8) & 0xFF;

			// Interpolate the colors to create the diffs
			float weight_base = static_cast<float>(uqw);
			float weight_down = static_cast<float>(uqw_down - uqw);
			float weight_up = static_cast<float>(uqw_up - uqw);

			unsigned int partition = pi.partition_of_texel[texel];
			vfloat4 color_offset = offset[partition];
			vfloat4 color_base   = endpnt0f[partition];

			vfloat4 color = color_base + color_offset * weight_base;
			vfloat4 orig_color   = blk.texel(texel);
			vfloat4 error_weight = blk.channel_weight;

			vfloat4 color_diff      = color - orig_color;
			vfloat4 color_diff_down = color_diff + color_offset * weight_down;
			vfloat4 color_diff_up   = color_diff + color_offset * weight_up;

			float error_base = dot_s(color_diff      * color_diff,      error_weight);
			float error_down = dot_s(color_diff_down * color_diff_down, error_weight);
			float error_up   = dot_s(color_diff_up   * color_diff_up,   error_weight);

			// Check if the prev or next error is better, and if so use it
			if ((error_up < error_base) && (error_up < error_down) && (uqw < 64))
			{
				dec_weights_uquant[texel] = static_cast<uint8_t>(uqw_up);
				adjustments = true;
			}
			else if ((error_down < error_base) && (uqw > 0))
			{
				dec_weights_uquant[texel] = static_cast<uint8_t>(uqw_down);
				adjustments = true;
			}
		}

		// Prepare iteration for plane 2
		dec_weights_uquant += WEIGHTS_PLANE2_OFFSET;
		plane_mask = ~plane_mask;
	}
	return adjustments;
}
#else
static bool realign_weights_undecimated(
	astcenc_profile decode_mode,
	const block_size_descriptor& bsd,
	const image_block& blk,
	symbolic_compressed_block& scb
) {
	// Portable build of the undecimated weight realignment: each texel weight is tested one
	// quantization step down and one step up. Returns true if any weight in scb was changed.

	// Partitioning used by this block
	unsigned int part_count = scb.partition_count;
	const auto& part_info = bsd.get_partition_info(part_count, scb.partition_index);

	// Weight quantization transfer table for the block mode
	const block_mode& mode = bsd.get_block_mode(scb.block_mode);
	unsigned int quant_level = mode.quant_mode;
	const quant_and_transfer_table& xfer = quant_and_xfer_tables[quant_level];

	// One pass per weight plane; the mask starts as the lane of the plane 2 component
	unsigned int last_plane = mode.is_dual_plane;
	int second_plane_component = scb.plane2_component;
	vmask4 plane_mask = vint4::lane_id() == vint4(second_plane_component);

	// Unpacked integer endpoints, plus the per-pass float base color and per-weight color step
	bool rgb_hdr;
	bool alpha_hdr;
	vint4 ep_low[BLOCK_MAX_PARTITIONS];
	vint4 ep_high[BLOCK_MAX_PARTITIONS];
	vfloat4 base_color[BLOCK_MAX_PARTITIONS];
	vfloat4 step_color[BLOCK_MAX_PARTITIONS];

	promise(part_count > 0);

	for (unsigned int p = 0; p < part_count; p++)
	{
		unpack_color_endpoints(decode_mode,
		                       scb.color_formats[p],
		                       scb.color_values[p],
		                       rgb_hdr, alpha_hdr,
		                       ep_low[p],
		                       ep_high[p]);
	}

	uint8_t* plane_weights = scb.weights;
	bool changed = false;

	unsigned int plane_count = last_plane + 1;
	for (unsigned int plane = 0; plane < plane_count; plane++)
	{
		// Endpoint delta for this pass; lanes selected by the mask are zeroed
		for (unsigned int p = 0; p < part_count; p++)
		{
			vint4 delta = select(ep_high[p] - ep_low[p], vint4::zero(), plane_mask);

			base_color[p] = int_to_float(ep_low[p]);
			step_color[p] = int_to_float(delta) * (1.0f / 64.0f);
		}

		// Evaluate the current, previous, and next quantized weight for every texel
		promise(bsd.texel_count > 0);
		for (unsigned int tix = 0; tix < bsd.texel_count; tix++)
		{
			int cur_q = plane_weights[tix];

			// The table entry packs the previous weight in bits 0-7 and the next in bits 8-15
			uint32_t packed = xfer.prev_next_values[cur_q];
			int down_q = packed & 0xFF;
			int up_q = (packed >> 8) & 0xFF;

			float w_cur = static_cast<float>(cur_q);
			float w_step_down = static_cast<float>(down_q - cur_q);
			float w_step_up = static_cast<float>(up_q - cur_q);

			unsigned int part = part_info.partition_of_texel[tix];
			vfloat4 step = step_color[part];
			vfloat4 base = base_color[part];

			// Difference between the interpolated color and the source texel for each option
			vfloat4 interp = base + step * w_cur;
			vfloat4 source = blk.texel(tix);
			vfloat4 chan_weight = blk.channel_weight;

			vfloat4 diff_cur = interp - source;
			vfloat4 diff_down = diff_cur + step * w_step_down;
			vfloat4 diff_up = diff_cur + step * w_step_up;

			float err_cur = dot_s(diff_cur * diff_cur, chan_weight);
			float err_down = dot_s(diff_down * diff_down, chan_weight);
			float err_up = dot_s(diff_up * diff_up, chan_weight);

			// Moving up takes priority; moving down is only considered when up was rejected
			bool go_up = (cur_q < 64) && (err_up < err_cur) && (err_up < err_down);
			bool go_down = !go_up && (cur_q > 0) && (err_down < err_cur);

			if (go_up || go_down)
			{
				plane_weights[tix] = static_cast<uint8_t>(go_up ? up_q : down_q);
				changed = true;
			}
		}

		// Move on to the second plane's weights and flip the component mask
		plane_weights += WEIGHTS_PLANE2_OFFSET;
		plane_mask = ~plane_mask;
	}

	return changed;
}
#endif

/**
 * @brief Try to lower the block error by nudging each stored weight of a decimated grid.
 *
 * With the grid decimation and the weight quantization held fixed, every stored weight (for each
 * plane) is tested one quantization step down and one step up. The error is accumulated over all
 * texels that the weight contributes to, and the weight is replaced when a neighbour gives a
 * lower error.
 *
 * @param         decode_mode   The decode mode (LDR, HDR).
 * @param         bsd           The block size information.
 * @param         blk           The image block color data to compress.
 * @param[in,out] scb           The symbolic compressed block; its weights are updated in place.
 *
 * @return @c true if at least one weight was changed, @c false otherwise.
 */
static bool realign_weights_decimated(
	astcenc_profile decode_mode,
	const block_size_descriptor& bsd,
	const image_block& blk,
	symbolic_compressed_block& scb
) {
	// Partitioning used by this block
	unsigned int part_count = scb.partition_count;
	const auto& part_info = bsd.get_partition_info(part_count, scb.partition_index);

	// Weight quantization transfer table for the block mode
	const block_mode& mode = bsd.get_block_mode(scb.block_mode);
	unsigned int quant_level = mode.quant_mode;
	const quant_and_transfer_table& xfer = quant_and_xfer_tables[quant_level];

	// Decimation table; this function is only meant for grids that really are decimated
	const decimation_info& grid = bsd.get_decimation_info(mode.decimation_mode);
	unsigned int grid_weights = grid.weight_count;
	assert(grid_weights != bsd.texel_count);

	// One pass per weight plane; the mask starts as the lane of the plane 2 component
	unsigned int last_plane = mode.is_dual_plane;
	int second_plane_component = scb.plane2_component;
	vmask4 plane_mask = vint4::lane_id() == vint4(second_plane_component);

	// Unpacked integer endpoints, plus the per-pass float base color and per-weight color step
	bool rgb_hdr;
	bool alpha_hdr;
	vint4 ep_low[BLOCK_MAX_PARTITIONS];
	vint4 ep_high[BLOCK_MAX_PARTITIONS];
	vfloat4 base_color[BLOCK_MAX_PARTITIONS];
	vfloat4 step_color[BLOCK_MAX_PARTITIONS];

	promise(part_count > 0);
	promise(grid_weights > 0);

	for (unsigned int p = 0; p < part_count; p++)
	{
		unpack_color_endpoints(decode_mode,
		                       scb.color_formats[p],
		                       scb.color_values[p],
		                       rgb_hdr, alpha_hdr,
		                       ep_low[p],
		                       ep_high[p]);
	}

	uint8_t* plane_weights = scb.weights;
	bool changed = false;

	unsigned int plane_count = last_plane + 1;
	for (unsigned int plane = 0; plane < plane_count; plane++)
	{
		// Endpoint delta for this pass; lanes selected by the mask are zeroed
		for (unsigned int p = 0; p < part_count; p++)
		{
			vint4 delta = select(ep_high[p] - ep_low[p], vint4::zero(), plane_mask);

			base_color[p] = int_to_float(ep_low[p]);
			step_color[p] = int_to_float(delta) * (1.0f / 64.0f);
		}

		// Float copy of the stored weights for this plane, converted a SIMD vector at a time
		ASTCENC_ALIGNAS float grid_wf[BLOCK_MAX_WEIGHTS];
		for (unsigned int first = 0; first < grid_weights; first += ASTCENC_SIMD_WIDTH)
		{
			vint stored(plane_weights + first);
			vfloat storedf = int_to_float(stored);
			storea(storedf, grid_wf + first);
		}

		// Evaluate the current, previous, and next quantized value of every stored weight
		for (unsigned int w = 0; w < grid_weights; w++)
		{
			int cur_q = plane_weights[w];

			// The table entry packs the previous weight in bits 0-7 and the next in bits 8-15
			uint32_t packed = xfer.prev_next_values[cur_q];

			float cur_f = grid_wf[w];
			float down_f = static_cast<float>(packed & 0xFF);
			float up_f = static_cast<float>((packed >> 8) & 0xFF);

			float step_down = down_f - cur_f;
			float step_up = up_f - cur_f;

			vfloat4 sq_cur = vfloat4::zero();
			vfloat4 sq_down = vfloat4::zero();
			vfloat4 sq_up = vfloat4::zero();

			// Accumulate the squared differences over every texel this weight contributes to
			unsigned int touched = grid.weight_texel_count[w];
			promise(touched > 0);
			for (unsigned int t = 0; t < touched; t++)
			{
				unsigned int texel = grid.weight_texels_tr[t][w];

				// Contribution of this stored weight to the texel
				float contrib = grid.texel_contrib_for_weight[t][w];

				// Texel weight interpolated from up to four stored weights
				float w_interp = (grid_wf[grid.texel_weights_tr[0][texel]] * grid.texel_weight_contribs_float_tr[0][texel]
				                + grid_wf[grid.texel_weights_tr[1][texel]] * grid.texel_weight_contribs_float_tr[1][texel])
				               + (grid_wf[grid.texel_weights_tr[2][texel]] * grid.texel_weight_contribs_float_tr[2][texel]
				                + grid_wf[grid.texel_weights_tr[3][texel]] * grid.texel_weight_contribs_float_tr[3][texel]);

				// Ideally the interpolated weights would be integer rounded, but the image quality
				// gain isn't worth the overhead, so the deltas are evaluated unrounded. The
				// add-then-subtract form is kept so the floating-point result is unchanged.
				float w_down = w_interp + step_down * contrib - w_interp;
				float w_up = w_interp + step_up * contrib - w_interp;

				unsigned int part = part_info.partition_of_texel[texel];
				vfloat4 step = step_color[part];
				vfloat4 base = base_color[part];

				vfloat4 interp = base + step * w_interp;
				vfloat4 source = blk.texel(texel);

				vfloat4 diff_cur = interp - source;
				vfloat4 diff_down = diff_cur + step * w_down;
				vfloat4 diff_up = diff_cur + step * w_up;

				sq_cur += diff_cur * diff_cur;
				sq_down += diff_down * diff_down;
				sq_up += diff_up * diff_up;
			}

			// Apply the channel weighting and reduce to scalar errors
			vfloat4 chan_weight = blk.channel_weight;
			float err_cur = hadd_s(sq_cur * chan_weight);
			float err_down = hadd_s(sq_down * chan_weight);
			float err_up = hadd_s(sq_up * chan_weight);

			// Moving up takes priority; moving down is only considered when up was rejected
			bool go_up = (cur_q < 64) && (err_up < err_cur) && (err_up < err_down);
			bool go_down = !go_up && (cur_q > 0) && (err_down < err_cur);

			if (go_up || go_down)
			{
				// Update both copies so later weights are evaluated against the new value
				float chosen = go_up ? up_f : down_f;
				grid_wf[w] = chosen;
				plane_weights[w] = static_cast<uint8_t>(chosen);
				changed = true;
			}
		}

		// Move on to the second plane's weights and flip the component mask
		plane_weights += WEIGHTS_PLANE2_OFFSET;
		plane_mask = ~plane_mask;
	}

	return changed;
}

/**
 * @brief Compress a block using a chosen partitioning and 1 plane of weights.
 *
 * Candidate block modes are trialled in turn; whenever a candidate gives a lower error than the
 * error already stored in @c scb, it replaces the content of @c scb.
 *
 * @param      privateProfile            The quality profile; for the high-speed profiles each
 *                                       packed candidate is stored without computing its error.
 * @param      config                    The compressor configuration.
 * @param      bsd                       The block size information.
 * @param      blk                       The image block color data to compress.
 * @param      only_always               True if we only use "always" percentile block modes.
 * @param      tune_errorval_threshold   The error value threshold.
 * @param      partition_count           The partition count.
 * @param      partition_index           The partition index if @c partition_count is 2-4.
 * @param[out] scb                       The symbolic compressed block output.
 * @param[out] tmpbuf                    The working buffers used for intermediate results.
 * @param      quant_limit               The upper limit on the weight quantization level; the
 *                                       level actually used is also capped at @c QUANT_32.
 *
 * @return The lowest error computed for any candidate in this call, or @c ERROR_CALC_DEFAULT if
 *         no error was computed.
 */
static float compress_symbolic_block_for_partition_1plane(
	QualityProfile privateProfile,
	const astcenc_config& config,
	const block_size_descriptor& bsd,
	const image_block& blk,
	bool only_always,
	float tune_errorval_threshold,
	unsigned int partition_count,
	unsigned int partition_index,
	symbolic_compressed_block& scb,
	compression_working_buffers& tmpbuf,
	int quant_limit
) {
	promise(partition_count > 0);
	promise(config.tune_candidate_limit > 0);
	promise(config.tune_refinement_limit > 0);

	int max_weight_quant = astc::min(static_cast<int>(QUANT_32), quant_limit);

	// Use the specialized single-partition error function unless RGBM mapping is enabled
	auto compute_difference = &compute_symbolic_block_difference_1plane;
	if ((partition_count == 1) && !(config.flags & ASTCENC_FLG_MAP_RGBM))
	{
		compute_difference = &compute_symbolic_block_difference_1plane_1partition;
	}

	const auto& pi = bsd.get_partition_info(partition_count, partition_index);

	// Compute ideal weights and endpoint colors, with no quantization or decimation
	endpoints_and_weights& ei = tmpbuf.ei1;
	compute_ideal_colors_and_weights_1plane(blk, pi, ei);

	// Compute ideal weights and endpoint colors for every decimation
	float* dec_weights_ideal = tmpbuf.dec_weights_ideal;
	uint8_t* dec_weights_uquant = tmpbuf.dec_weights_uquant;

	// For each decimation mode, compute an ideal set of weights with no quantization
	unsigned int max_decimation_modes = only_always ? bsd.decimation_mode_count_always
	                                                : bsd.decimation_mode_count_selected;
	promise(max_decimation_modes > 0);
	for (unsigned int i = 0; i < max_decimation_modes; i++)
	{
		// Skip decimation modes that are not referenced by a 1 plane mode at this quant limit
		const auto& dm = bsd.get_decimation_mode(i);
		if (!dm.is_ref_1plane(static_cast<quant_method>(max_weight_quant)))
		{
			continue;
		}

		const auto& di = bsd.get_decimation_info(i);

		// Each decimation mode owns a BLOCK_MAX_WEIGHTS-sized slice of the ideal weight buffer
		compute_ideal_weights_for_decimation(
		    ei,
		    di,
		    dec_weights_ideal + i * BLOCK_MAX_WEIGHTS);
	}

	// Compute maximum colors for the endpoints and ideal weights, then for each endpoint and ideal
	// weight pair, compute the smallest weight that will result in a color value greater than 1
	vfloat4 min_ep(10.0f);
	for (unsigned int i = 0; i < partition_count; i++)
	{
		vfloat4 ep = (vfloat4(1.0f) - ei.ep.endpt0[i]) / (ei.ep.endpt1[i] - ei.ep.endpt0[i]);

		// Only values above 0.5 that lower the running minimum are kept
		vmask4 use_ep = (ep > vfloat4(0.5f)) & (ep < min_ep);
		min_ep = select(min_ep, ep, use_ep);
	}

	float min_wt_cutoff = hmin_s(min_ep);

	// For each mode, use the angular method to compute a shift
	compute_angular_endpoints_1plane(
	    privateProfile, only_always, bsd, dec_weights_ideal, max_weight_quant, tmpbuf);

	float* weight_low_value = tmpbuf.weight_low_value1;
	float* weight_high_value = tmpbuf.weight_high_value1;
	int8_t* qwt_bitcounts = tmpbuf.qwt_bitcounts;
	float* qwt_errors = tmpbuf.qwt_errors;

	// For each mode (which specifies a decimation and a quantization):
	//     * Compute number of bits needed for the quantized weights
	//     * Generate an optimized set of quantized weights
	//     * Compute quantization errors for the mode


	// Bit budget indexed by (partition_count - 1); a mode's weight bits are subtracted from this
	// to give the bit count stored in qwt_bitcounts
	static const int8_t free_bits_for_partition_count[4] {
		115 - 4, 111 - 4 - PARTITION_INDEX_BITS, 108 - 4 - PARTITION_INDEX_BITS, 105 - 4 - PARTITION_INDEX_BITS
	};

	unsigned int max_block_modes = only_always ? bsd.block_mode_count_1plane_always
	                                           : bsd.block_mode_count_1plane_selected;
	promise(max_block_modes > 0);
	for (unsigned int i = 0; i < max_block_modes; i++)
	{
		const block_mode& bm = bsd.block_modes[i];

		// Modes above the quantization limit are given a huge error so they are never selected
		if (bm.quant_mode > max_weight_quant)
		{
			qwt_errors[i] = 1e38f;
			continue;
		}

		assert(!bm.is_dual_plane);
		int bitcount = free_bits_for_partition_count[partition_count - 1] - bm.weight_bits;
		// Modes whose weights leave no bits over are given a huge error as well
		if (bitcount <= 0)
		{
			qwt_errors[i] = 1e38f;
			continue;
		}

		// Reset the high weight value to 1.0 when it is more than 2% above the cutoff
		if (weight_high_value[i] > 1.02f * min_wt_cutoff)
		{
			weight_high_value[i] = 1.0f;
		}

		int decimation_mode = bm.decimation_mode;
		const auto& di = bsd.get_decimation_info(decimation_mode);

		qwt_bitcounts[i] = static_cast<int8_t>(bitcount);

		ASTCENC_ALIGNAS float dec_weights_uquantf[BLOCK_MAX_WEIGHTS];

		// Generate the optimized set of weights for the weight mode
		compute_quantized_weights_for_decimation(
		    di,
		    weight_low_value[i], weight_high_value[i],
		    dec_weights_ideal + BLOCK_MAX_WEIGHTS * decimation_mode,
		    dec_weights_uquantf,
		    dec_weights_uquant + BLOCK_MAX_WEIGHTS * i,
		    bm.get_weight_quant_mode());

		// Compute weight quantization errors for the block mode
		qwt_errors[i] = compute_error_of_weight_set_1plane(
		    ei,
		    di,
		    dec_weights_uquantf);
	}

	// Decide the optimal combination of color endpoint encodings and weight encodings
	uint8_t partition_format_specifiers[TUNE_MAX_TRIAL_CANDIDATES][BLOCK_MAX_PARTITIONS];
	int block_mode_index[TUNE_MAX_TRIAL_CANDIDATES];

	quant_method color_quant_level[TUNE_MAX_TRIAL_CANDIDATES];
	quant_method color_quant_level_mod[TUNE_MAX_TRIAL_CANDIDATES];

	unsigned int candidate_count = compute_ideal_endpoint_formats(
	    privateProfile,
	    pi, blk, ei.ep, qwt_bitcounts, qwt_errors,
	    config.tune_candidate_limit, 0, max_block_modes,
	    partition_format_specifiers, block_mode_index,
	    color_quant_level, color_quant_level_mod, tmpbuf);

	// Iterate over the N believed-to-be-best modes to find out which one is actually best
	float best_errorval_in_mode = ERROR_CALC_DEFAULT;
	float best_errorval_in_scb = scb.errorval;

	for (unsigned int i = 0; i < candidate_count; i++)
	{
		TRACE_NODE(node0, "candidate");

		const int bm_packed_index = block_mode_index[i];
		assert(bm_packed_index >= 0 && bm_packed_index < static_cast<int>(bsd.block_mode_count_1plane_selected));
		const block_mode& qw_bm = bsd.block_modes[bm_packed_index];

		int decimation_mode = qw_bm.decimation_mode;
		const auto& di = bsd.get_decimation_info(decimation_mode);
		promise(di.weight_count > 0);

		trace_add_data("weight_x", di.weight_x);
		trace_add_data("weight_y", di.weight_y);
		trace_add_data("weight_z", di.weight_z);
		trace_add_data("weight_quant", qw_bm.quant_mode);

		// Recompute the ideal color endpoints before storing them
		vfloat4 rgbs_colors[BLOCK_MAX_PARTITIONS];
		vfloat4 rgbo_colors[BLOCK_MAX_PARTITIONS];

		// Work on a scratch block and a private copy of the endpoints for this candidate
		symbolic_compressed_block workscb;
		endpoints workep = ei.ep;

		// Seed the scratch block with the quantized weights computed for this block mode
		uint8_t* u8_weight_src = dec_weights_uquant + BLOCK_MAX_WEIGHTS * bm_packed_index;

		for (unsigned int j = 0; j < di.weight_count; j++)
		{
			workscb.weights[j] = u8_weight_src[j];
		}

		// Refinement loop: re-fit colors to the current weights, pack, test, then realign weights
		for (unsigned int l = 0; l < config.tune_refinement_limit; l++)
		{
			recompute_ideal_colors_1plane(
			    blk, pi, di, workscb.weights,
			    workep, rgbs_colors, rgbo_colors);

			// Quantize the chosen color, tracking if worth trying the mod value
			bool all_same = color_quant_level[i] != color_quant_level_mod[i];
			for (unsigned int j = 0; j < partition_count; j++)
			{
				workscb.color_formats[j] = pack_color_endpoints(
				    privateProfile,
				    workep.endpt0[j],
				    workep.endpt1[j],
				    rgbs_colors[j],
				    rgbo_colors[j],
				    partition_format_specifiers[i][j],
				    workscb.color_values[j],
				    color_quant_level[i]);

				all_same = all_same && workscb.color_formats[j] == workscb.color_formats[0];
			}

			// If all the color endpoint modes are the same, we get a few more bits to store colors;
			// let's see if we can take advantage of this: requantize all the colors and see if the
			// endpoint modes remain the same.
			workscb.color_formats_matched = 0;
			if (partition_count >= 2 && all_same)
			{
				uint8_t colorvals[BLOCK_MAX_PARTITIONS][8];
				uint8_t color_formats_mod[BLOCK_MAX_PARTITIONS] { 0 };
				bool all_same_mod = true;
				for (unsigned int j = 0; j < partition_count; j++)
				{
					color_formats_mod[j] = pack_color_endpoints(
					    privateProfile,
					    workep.endpt0[j],
					    workep.endpt1[j],
					    rgbs_colors[j],
					    rgbo_colors[j],
					    partition_format_specifiers[i][j],
					    colorvals[j],
					    color_quant_level_mod[i]);

					// Early out as soon as it's no longer possible to use mod
					if (color_formats_mod[j] != color_formats_mod[0])
					{
						all_same_mod = false;
						break;
					}
				}

				// The requantized formats still match, so adopt the requantized colors
				if (all_same_mod)
				{
					workscb.color_formats_matched = 1;
					for (unsigned int j = 0; j < BLOCK_MAX_PARTITIONS; j++)
					{
						for (unsigned int k = 0; k < 8; k++)
						{
							workscb.color_values[j][k] = colorvals[j][k];
						}

						workscb.color_formats[j] = color_formats_mod[j];
					}
				}
			}

			// Store header fields
			workscb.partition_count = static_cast<uint8_t>(partition_count);
			workscb.partition_index = static_cast<uint16_t>(partition_index);
			workscb.plane2_component = -1;
			workscb.quant_mode = workscb.color_formats_matched ? color_quant_level_mod[i] : color_quant_level[i];
			workscb.block_mode = qw_bm.mode_index;
			workscb.block_type = SYM_BTYPE_NONCONST;
			// High-speed profiles skip the error computation and refinement: the packed block is
			// stored with an error of zero and the refinement loop for this candidate ends
			if (privateProfile == HIGH_SPEED_PROFILE ||
				privateProfile == HIGH_SPEED_PROFILE_HIGHBITS)
			{
				workscb.errorval = 0;
				scb = workscb;
				break;
			}
			// Pre-realign test
			if (l == 0)
			{
				float errorval = compute_difference(config, bsd, workscb, blk);
				// A result of -ERROR_CALC_DEFAULT is handled as an error marker: the value is made
				// positive and the scratch block is tagged as an error block
				if (errorval == -ERROR_CALC_DEFAULT)
				{
					errorval = -errorval;
					workscb.block_type = SYM_BTYPE_ERROR;
				}

				trace_add_data("error_prerealign", errorval);
				best_errorval_in_mode = astc::min(errorval, best_errorval_in_mode);

				// Average refinement improvement is 3.5% per iteration (allow 4.5%), but the first
				// iteration can help more so we give it a extra 8% leeway. Use this knowledge to
				// drive a heuristic to skip blocks that are unlikely to catch up with the best
				// block we have already.
				unsigned int iters_remaining = config.tune_refinement_limit - l;
				float threshold = (0.045f * static_cast<float>(iters_remaining)) + 1.08f;
				if (errorval > (threshold * best_errorval_in_scb))
				{
					break;
				}

				// New best result: publish the scratch block to the output
				if (errorval < best_errorval_in_scb)
				{
					best_errorval_in_scb = errorval;
					workscb.errorval = errorval;
					scb = workscb;

					if (errorval < tune_errorval_threshold)
					{
						// Skip remaining candidates - this is "good enough"
						i = candidate_count;
						break;
					}
				}
			}

			// Realign the weights using the variant that matches the grid; a grid with one weight
			// per texel is undecimated
			bool adjustments;
			if (di.weight_count != bsd.texel_count)
			{
				adjustments = realign_weights_decimated(
					config.profile, bsd, blk, workscb);
			}
			else
			{
				adjustments = realign_weights_undecimated(
					config.profile, bsd, blk, workscb);
			}

			// Post-realign test
			float errorval = compute_difference(config, bsd, workscb, blk);
			// Same error marker handling as the pre-realign test
			if (errorval == -ERROR_CALC_DEFAULT)
			{
				errorval = -errorval;
				workscb.block_type = SYM_BTYPE_ERROR;
			}

			trace_add_data("error_postrealign", errorval);
			best_errorval_in_mode = astc::min(errorval, best_errorval_in_mode);

			// Average refinement improvement is 3.5% per iteration, so skip blocks that are
			// unlikely to catch up with the best block we have already. Assume a 4.5% per step to
			// give benefit of the doubt ...
			unsigned int iters_remaining = config.tune_refinement_limit - 1 - l;
			float threshold = (0.045f * static_cast<float>(iters_remaining)) + 1.0f;
			if (errorval > (threshold * best_errorval_in_scb))
			{
				break;
			}

			// New best result: publish the scratch block to the output
			if (errorval < best_errorval_in_scb)
			{
				best_errorval_in_scb = errorval;
				workscb.errorval = errorval;
				scb = workscb;

				if (errorval < tune_errorval_threshold)
				{
					// Skip remaining candidates - this is "good enough"
					i = candidate_count;
					break;
				}
			}

			// No weight moved, so another refinement pass would repeat the same work
			if (!adjustments)
			{
				break;
			}
		}
	}

	return best_errorval_in_mode;
}

/**
 * @brief Compress a block using a single partition and 2 planes of weights.
 *
 * The search computes ideal endpoints and weights for both planes, quantizes the weights for
 * every selected dual-plane block mode, asks @c compute_ideal_endpoint_formats() for a list of
 * candidate encodings, and then iteratively refines each candidate. Whenever a candidate has a
 * lower error than the best error recorded in @c scb it is copied into @c scb.
 *
 * @param         privateProfile            The private quality profile, forwarded to the angular
 *                                          endpoint search and to the color endpoint packing.
 * @param         config                    The compressor configuration.
 * @param         bsd                       The block size information.
 * @param         blk                       The image block color data to compress.
 * @param         tune_errorval_threshold   The error value threshold; a stored candidate below
 *                                          this ends the search early.
 * @param         plane2_component          The component index for the second plane of weights.
 * @param[in,out] scb                       The symbolic compressed block. @c scb.errorval is read
 *                                          on entry as the error to beat, and the block is only
 *                                          overwritten by a candidate with a lower error.
 * @param[out]    tmpbuf                    The working buffers used as scratch storage for ideal
 *                                          and quantized weights, weight ranges, and mode errors.
 * @param         quant_limit               The highest weight quant level to consider; values
 *                                          above QUANT_32 are clamped to QUANT_32.
 *
 * @return The lowest error value measured for any candidate tested by this call, or
 *         @c ERROR_CALC_DEFAULT if no candidate was tested.
 */
static float compress_symbolic_block_for_partition_2planes(
	QualityProfile privateProfile,
	const astcenc_config& config,
	const block_size_descriptor& bsd,
	const image_block& blk,
	float tune_errorval_threshold,
	unsigned int plane2_component,
	symbolic_compressed_block& scb,
	compression_working_buffers& tmpbuf,
	int quant_limit
) {
	promise(config.tune_candidate_limit > 0);
	promise(config.tune_refinement_limit > 0);
	promise(bsd.decimation_mode_count_selected > 0);

	// Never search above QUANT_32, and honor a tighter limit if the caller supplied one
	int max_weight_quant = astc::min(static_cast<int>(QUANT_32), quant_limit);

	// Compute ideal weights and endpoint colors, with no quantization or decimation
	endpoints_and_weights& ei1 = tmpbuf.ei1;
	endpoints_and_weights& ei2 = tmpbuf.ei2;

	compute_ideal_colors_and_weights_2planes(bsd, blk, plane2_component, ei1, ei2);

	// Compute ideal weights and endpoint colors for every decimation
	float* dec_weights_ideal = tmpbuf.dec_weights_ideal;
	uint8_t* dec_weights_uquant = tmpbuf.dec_weights_uquant;

	// For each decimation mode, compute an ideal set of weights with no quantization
	for (unsigned int i = 0; i < bsd.decimation_mode_count_selected; i++)
	{
		// Skip decimation modes that is_ref_2plane() rejects for this weight quant limit
		const auto& dm = bsd.get_decimation_mode(i);
		if (!dm.is_ref_2plane(static_cast<quant_method>(max_weight_quant)))
		{
			continue;
		}

		const auto& di = bsd.get_decimation_info(i);

		// Each decimation mode owns a BLOCK_MAX_WEIGHTS slot; plane 1 weights start at the slot
		// base and plane 2 weights start WEIGHTS_PLANE2_OFFSET entries later
		compute_ideal_weights_for_decimation(
		    ei1,
		    di,
		    dec_weights_ideal + i * BLOCK_MAX_WEIGHTS);

		compute_ideal_weights_for_decimation(
		    ei2,
		    di,
		    dec_weights_ideal + i * BLOCK_MAX_WEIGHTS + WEIGHTS_PLANE2_OFFSET);
	}

	// Compute maximum colors for the endpoints and ideal weights, then for each endpoint and ideal
	// weight pair, compute the smallest weight that will result in a color value greater than 1
	vfloat4 min_ep1(10.0f);
	vfloat4 min_ep2(10.0f);

	// Per lane: the weight at which the line from endpt0 to endpt1 reaches 1.0. Only values
	// above 0.5 and below the initial 10.0 replace the default.
	vfloat4 ep1 = (vfloat4(1.0f) - ei1.ep.endpt0[0]) / (ei1.ep.endpt1[0] - ei1.ep.endpt0[0]);
	vmask4 use_ep1 = (ep1 > vfloat4(0.5f)) & (ep1 < min_ep1);
	min_ep1 = select(min_ep1, ep1, use_ep1);

	vfloat4 ep2 = (vfloat4(1.0f) - ei2.ep.endpt0[0]) / (ei2.ep.endpt1[0] - ei2.ep.endpt0[0]);
	vmask4 use_ep2 = (ep2 > vfloat4(0.5f)) & (ep2 < min_ep2);
	min_ep2 = select(min_ep2, ep2, use_ep2);

	vfloat4 err_max(ERROR_CALC_DEFAULT);
	vmask4 err_mask = vint4::lane_id() == vint4(plane2_component);

	// Set the plane2 component to max error in ep1
	min_ep1 = select(min_ep1, err_max, err_mask);

	float min_wt_cutoff1 = hmin_s(min_ep1);

	// Set the minwt2 to the plane2 component min in ep2
	float min_wt_cutoff2 = hmin_s(select(err_max, min_ep2, err_mask));

	compute_angular_endpoints_2planes(
	    privateProfile, bsd, dec_weights_ideal, max_weight_quant, tmpbuf);

	// For each mode (which specifies a decimation and a quantization):
	//     * Compute number of bits needed for the quantized weights
	//     * Generate an optimized set of quantized weights
	//     * Compute quantization errors for the mode

	float* weight_low_value1 = tmpbuf.weight_low_value1;
	float* weight_high_value1 = tmpbuf.weight_high_value1;
	float* weight_low_value2 = tmpbuf.weight_low_value2;
	float* weight_high_value2 = tmpbuf.weight_high_value2;

	int8_t* qwt_bitcounts = tmpbuf.qwt_bitcounts;
	float* qwt_errors = tmpbuf.qwt_errors;

	// The dual-plane block modes are iterated as the index range that follows the 1 plane modes
	unsigned int start_2plane = bsd.block_mode_count_1plane_selected;
	unsigned int end_2plane = bsd.block_mode_count_1plane_2plane_selected;

	for (unsigned int i = start_2plane; i < end_2plane; i++)
	{
		const block_mode& bm = bsd.block_modes[i];
		assert(bm.is_dual_plane);

		// Modes above the quant limit are given a huge error and no weights are generated for
		// them; presumably this stops the format selection below from picking them
		if (bm.quant_mode > max_weight_quant)
		{
			qwt_errors[i] = 1e38f;
			continue;
		}

		// NOTE(review): 109 looks like the bit budget left for a 1 partition dual-plane block
		// before the weight bits are subtracted - confirm against the format specification
		qwt_bitcounts[i] = static_cast<int8_t>(109 - bm.weight_bits);

		// Snap the high weight value to 1.0 if it is more than 2% above the plane's cutoff
		if (weight_high_value1[i] > 1.02f * min_wt_cutoff1)
		{
			weight_high_value1[i] = 1.0f;
		}

		if (weight_high_value2[i] > 1.02f * min_wt_cutoff2)
		{
			weight_high_value2[i] = 1.0f;
		}

		unsigned int decimation_mode = bm.decimation_mode;
		const auto& di = bsd.get_decimation_info(decimation_mode);

		// Float copy of the quantized weights for both planes, used only for the error estimate
		ASTCENC_ALIGNAS float dec_weights_uquantf[BLOCK_MAX_WEIGHTS];

		// Generate the optimized set of weights for the mode
		compute_quantized_weights_for_decimation(
		    di,
		    weight_low_value1[i],
		    weight_high_value1[i],
		    dec_weights_ideal + BLOCK_MAX_WEIGHTS * decimation_mode,
		    dec_weights_uquantf,
		    dec_weights_uquant + BLOCK_MAX_WEIGHTS * i,
		    bm.get_weight_quant_mode());

		compute_quantized_weights_for_decimation(
		    di,
		    weight_low_value2[i],
		    weight_high_value2[i],
		    dec_weights_ideal + BLOCK_MAX_WEIGHTS * decimation_mode + WEIGHTS_PLANE2_OFFSET,
		    dec_weights_uquantf + WEIGHTS_PLANE2_OFFSET,
		    dec_weights_uquant + BLOCK_MAX_WEIGHTS * i + WEIGHTS_PLANE2_OFFSET,
		    bm.get_weight_quant_mode());

		// Compute weight quantization errors for the block mode
		qwt_errors[i] = compute_error_of_weight_set_2planes(
		    ei1,
		    ei2,
		    di,
		    dec_weights_uquantf,
		    dec_weights_uquantf + WEIGHTS_PLANE2_OFFSET);
	}

	// Decide the optimal combination of color endpoint encodings and weight encodings
	uint8_t partition_format_specifiers[TUNE_MAX_TRIAL_CANDIDATES][BLOCK_MAX_PARTITIONS];
	int block_mode_index[TUNE_MAX_TRIAL_CANDIDATES];

	quant_method color_quant_level[TUNE_MAX_TRIAL_CANDIDATES];
	quant_method color_quant_level_mod[TUNE_MAX_TRIAL_CANDIDATES];

	// Build a single endpoint pair that takes the plane 2 component from ei2 and the rest from ei1
	endpoints epm;
	merge_endpoints(ei1.ep, ei2.ep, plane2_component, epm);

	// NOTE(review): this call passes config.privateProfile while the other calls in this function
	// pass the privateProfile parameter. The visible caller passes the same value for both, but
	// confirm which is intended.
	const auto& pi = bsd.get_partition_info(1, 0);
	unsigned int candidate_count = compute_ideal_endpoint_formats(
	    config.privateProfile,
	    pi, blk, epm, qwt_bitcounts, qwt_errors,
	    config.tune_candidate_limit,
		bsd.block_mode_count_1plane_selected, bsd.block_mode_count_1plane_2plane_selected,
	    partition_format_specifiers, block_mode_index,
	    color_quant_level, color_quant_level_mod, tmpbuf);

	// Iterate over the N believed-to-be-best modes to find out which one is actually best
	float best_errorval_in_mode = ERROR_CALC_DEFAULT;
	float best_errorval_in_scb = scb.errorval;

	for (unsigned int i = 0; i < candidate_count; i++)
	{
		TRACE_NODE(node0, "candidate");

		const int bm_packed_index = block_mode_index[i];
		assert(bm_packed_index >= static_cast<int>(bsd.block_mode_count_1plane_selected) &&
		       bm_packed_index < static_cast<int>(bsd.block_mode_count_1plane_2plane_selected));
		const block_mode& qw_bm = bsd.block_modes[bm_packed_index];

		int decimation_mode = qw_bm.decimation_mode;
		const auto& di = bsd.get_decimation_info(decimation_mode);
		promise(di.weight_count > 0);

		trace_add_data("weight_x", di.weight_x);
		trace_add_data("weight_y", di.weight_y);
		trace_add_data("weight_z", di.weight_z);
		trace_add_data("weight_quant", qw_bm.quant_mode);

		vfloat4 rgbs_color;
		vfloat4 rgbo_color;

		// Work on a private copy so that scb is only touched when this candidate wins
		symbolic_compressed_block workscb;
		endpoints workep = epm;

		uint8_t* u8_weight1_src = dec_weights_uquant + BLOCK_MAX_WEIGHTS * bm_packed_index;
		uint8_t* u8_weight2_src = dec_weights_uquant + BLOCK_MAX_WEIGHTS * bm_packed_index + WEIGHTS_PLANE2_OFFSET;

		// Seed the candidate with the quantized weights generated for its block mode
		for (int j = 0; j < di.weight_count; j++)
		{
			workscb.weights[j] = u8_weight1_src[j];
			workscb.weights[j + WEIGHTS_PLANE2_OFFSET] = u8_weight2_src[j];
		}

		for (unsigned int l = 0; l < config.tune_refinement_limit; l++)
		{
			recompute_ideal_colors_2planes(
			    blk, bsd, di,
			    workscb.weights, workscb.weights + WEIGHTS_PLANE2_OFFSET,
			    workep, rgbs_color, rgbo_color, plane2_component);

			// Quantize the chosen color
			workscb.color_formats[0] = pack_color_endpoints(
			                               privateProfile,
			                               workep.endpt0[0],
			                               workep.endpt1[0],
			                               rgbs_color, rgbo_color,
			                               partition_format_specifiers[i][0],
			                               workscb.color_values[0],
			                               color_quant_level[i]);

			// Store header fields
			workscb.partition_count = 1;
			workscb.partition_index = 0;
			workscb.quant_mode = color_quant_level[i];
			workscb.color_formats_matched = 0;
			workscb.block_mode = qw_bm.mode_index;
			workscb.plane2_component = static_cast<int8_t>(plane2_component);
			workscb.block_type = SYM_BTYPE_NONCONST;

			// Pre-realign test
			if (l == 0)
			{
				// A result of -ERROR_CALC_DEFAULT is treated as a failed block: the sign is
				// flipped so it compares as a very large error and the block is marked as an error
				float errorval = compute_symbolic_block_difference_2plane(config, bsd, workscb, blk);
				if (errorval == -ERROR_CALC_DEFAULT)
				{
					errorval = -errorval;
					workscb.block_type = SYM_BTYPE_ERROR;
				}

				trace_add_data("error_prerealign", errorval);
				best_errorval_in_mode = astc::min(errorval, best_errorval_in_mode);

				// Average refinement improvement is 3.5% per iteration (allow 4.5%), but the first
				// iteration can help more so we give it a extra 8% leeway. Use this knowledge to
				// drive a heuristic to skip blocks that are unlikely to catch up with the best
				// block we have already.
				unsigned int iters_remaining = config.tune_refinement_limit - l;
				float threshold = (0.045f * static_cast<float>(iters_remaining)) + 1.08f;
				if (errorval > (threshold * best_errorval_in_scb))
				{
					break;
				}

				if (errorval < best_errorval_in_scb)
				{
					best_errorval_in_scb = errorval;
					workscb.errorval = errorval;
					scb = workscb;

					if (errorval < tune_errorval_threshold)
					{
						// Skip remaining candidates - this is "good enough"
						// Forcing i to the end makes the outer loop exit after this break
						i = candidate_count;
						break;
					}
				}
			}

			// Perform a final pass over the weights to try to improve them.
			bool adjustments;
			if (di.weight_count != bsd.texel_count)
			{
				adjustments = realign_weights_decimated(
					config.profile, bsd, blk, workscb);
			}
			else
			{
				adjustments = realign_weights_undecimated(
					config.profile, bsd, blk, workscb);
			}

			// Post-realign test
			float errorval = compute_symbolic_block_difference_2plane(config, bsd, workscb, blk);
			if (errorval == -ERROR_CALC_DEFAULT)
			{
				errorval = -errorval;
				workscb.block_type = SYM_BTYPE_ERROR;
			}

			trace_add_data("error_postrealign", errorval);
			best_errorval_in_mode = astc::min(errorval, best_errorval_in_mode);

			// Average refinement improvement is 3.5% per iteration, so skip blocks that are
			// unlikely to catch up with the best block we have already. Assume a 4.5% per step to
			// give benefit of the doubt ...
			unsigned int iters_remaining = config.tune_refinement_limit - 1 - l;
			float threshold = (0.045f * static_cast<float>(iters_remaining)) + 1.0f;
			if (errorval > (threshold * best_errorval_in_scb))
			{
				break;
			}

			if (errorval < best_errorval_in_scb)
			{
				best_errorval_in_scb = errorval;
				workscb.errorval = errorval;
				scb = workscb;

				if (errorval < tune_errorval_threshold)
				{
					// Skip remaining candidates - this is "good enough"
					// Forcing i to the end makes the outer loop exit after this break
					i = candidate_count;
					break;
				}
			}

			// Stop refining once a realign pass reports that it changed nothing
			if (!adjustments)
			{
				break;
			}
		}
	}

	return best_errorval_in_mode;
}

/**
 * @brief Find the weakest correlation between any two color channels of a block.
 *
 * A weighted covariance matrix of the four channels is accumulated over the texels, each of the
 * six off-diagonal terms is normalized into a correlation factor, and the smallest magnitude is
 * returned. A factor that evaluates to NaN, which happens when a channel is constant, is treated
 * as fully correlated.
 *
 * @param texels_per_block   The number of texels in a block.
 * @param blk                The image block color data to compress.
 *
 * @return Return the lowest correlation factor.
 */
static float prepare_block_statistics(
	int texels_per_block,
	const image_block& blk
) {
	promise(texels_per_block > 0);

	// Every texel carries the same weight: the mean of the four channel weights
	const float texel_weight = hadd_s(blk.channel_weight) / 4.0f;
	assert(texel_weight >= 0.0f);

	// Weighted sums for the upper triangle of the symmetric covariance matrix: four channel sums,
	// four variances, and six covariances
	float total_weight = 0.0f;

	float sum_r = 0.0f;
	float sum_g = 0.0f;
	float sum_b = 0.0f;
	float sum_a = 0.0f;

	float var_rr = 0.0f;
	float var_gg = 0.0f;
	float var_bb = 0.0f;
	float var_aa = 0.0f;

	float cov_rg = 0.0f;
	float cov_rb = 0.0f;
	float cov_ra = 0.0f;
	float cov_gb = 0.0f;
	float cov_ga = 0.0f;
	float cov_ba = 0.0f;

	for (int t = 0; t < texels_per_block; t++)
	{
		total_weight += texel_weight;

		const float r = blk.data_r[t];
		const float g = blk.data_g[t];
		const float b = blk.data_b[t];
		const float a = blk.data_a[t];

		const float wr = r * texel_weight;
		const float wg = g * texel_weight;
		const float wb = b * texel_weight;
		const float wa = a * texel_weight;

		sum_r += wr;
		sum_g += wg;
		sum_b += wb;
		sum_a += wa;

		var_rr += r * wr;
		var_gg += g * wg;
		var_bb += b * wb;
		var_aa += a * wa;

		cov_rg += g * wr;
		cov_rb += b * wr;
		cov_ra += a * wr;
		cov_gb += b * wg;
		cov_ga += a * wg;
		cov_ba += a * wb;
	}

	// Remove the mean contribution from every second order term
	const float inv_weight = 1.0f / astc::max(total_weight, 1e-7f);
	const float mean_r = sum_r * inv_weight;
	const float mean_g = sum_g * inv_weight;
	const float mean_b = sum_b * inv_weight;
	const float mean_a = sum_a * inv_weight;

	var_rr -= sum_r * mean_r;
	cov_rg -= sum_g * mean_r;
	cov_rb -= sum_b * mean_r;
	cov_ra -= sum_a * mean_r;

	var_gg -= sum_g * mean_g;
	cov_gb -= sum_b * mean_g;
	cov_ga -= sum_a * mean_g;

	var_bb -= sum_b * mean_b;
	cov_ba -= sum_a * mean_b;

	var_aa -= sum_a * mean_a;

	// Normalize a covariance into an absolute correlation factor. The scaling gives a NaN if a
	// channel is constant, in which case the pair is reported as fully correlated.
	auto abs_correlation = [](float cov, float var_p, float var_q) -> float
	{
		float correl = cov * astc::rsqrt(var_p * var_q);
		if (astc::isnan(correl))
		{
			correl = 1.0f;
		}

		return fabsf(correl);
	};

	// Pair order is: rg, rb, ra, gb, ga, ba
	const float correl_abs[6] {
		abs_correlation(cov_rg, var_rr, var_gg),
		abs_correlation(cov_rb, var_rr, var_bb),
		abs_correlation(cov_ra, var_rr, var_aa),
		abs_correlation(cov_gb, var_gg, var_bb),
		abs_correlation(cov_ga, var_gg, var_aa),
		abs_correlation(cov_ba, var_bb, var_aa)
	};

	float lowest_correlation = correl_abs[0];
	for (int p = 1; p < 6; p++)
	{
		lowest_correlation = astc::min(lowest_correlation, correl_abs[p]);
	}

	// Diagnostic trace points
	trace_add_data("min_r", blk.data_min.lane<0>());
	trace_add_data("max_r", blk.data_max.lane<0>());
	trace_add_data("min_g", blk.data_min.lane<1>());
	trace_add_data("max_g", blk.data_max.lane<1>());
	trace_add_data("min_b", blk.data_min.lane<2>());
	trace_add_data("max_b", blk.data_max.lane<2>());
	trace_add_data("min_a", blk.data_min.lane<3>());
	trace_add_data("max_a", blk.data_max.lane<3>());
	trace_add_data("cov_rg", correl_abs[0]);
	trace_add_data("cov_rb", correl_abs[1]);
	trace_add_data("cov_ra", correl_abs[2]);
	trace_add_data("cov_gb", correl_abs[3]);
	trace_add_data("cov_ga", correl_abs[4]);
	trace_add_data("cov_ba", correl_abs[5]);

	return lowest_correlation;
}

/*
 * See header for documentation.
 *
 * Implementation overview:
 *
 *   1. A block whose minimum and maximum colors are equal is encoded directly as a constant
 *      color block and the function returns early. For every private profile other than
 *      HIGH_QUALITY_PROFILE and HIGH_SPEED_PROFILE_HIGHBITS the constant color is re-expressed
 *      as a 1 partition block with all weights zero instead.
 *   2. 1 partition, 1 plane encodings are tried, optionally starting with the "mode0" fast pass.
 *      The high speed profiles stop after the first of these passes.
 *   3. 1 partition, 2 plane encodings are tried for each component, HIGH_QUALITY_PROFILE only.
 *   4. 2 to max_partitions partition, 1 plane encodings are tried.
 *   5. If no valid encoding was found the block falls back to a constant color, and the
 *      resulting symbolic block is packed into @c pcb.
 *
 * Each search stage exits to the packing step as soon as the error is below the block's target.
 * When QUALITY_CONTROL is enabled and @c calQualityEnable is set, the per-component sum of
 * squared 8-bit differences between the source and the decoded block is written to @c mseBlock.
 */
void compress_block(
	const astcenc_contexti& ctx,
	const image_block& blk,
	uint8_t pcb[16],
#if QUALITY_CONTROL
	compression_working_buffers& tmpbuf,
	bool calQualityEnable,
	int32_t *mseBlock[RGBA_COM]
#else
	compression_working_buffers& tmpbuf
#endif
	)
{
	astcenc_profile decode_mode = ctx.config.profile;
	symbolic_compressed_block scb;
	const block_size_descriptor& bsd = *ctx.bsd;
	float lowest_correl;

	TRACE_NODE(node0, "block");
	trace_add_data("pos_x", blk.xpos);
	trace_add_data("pos_y", blk.ypos);
	trace_add_data("pos_z", blk.zpos);

	// Set stricter block targets for luminance data as we have more bits to play with
	bool block_is_l = blk.is_luminance();
	float block_is_l_scale = block_is_l ? 1.0f / 1.5f : 1.0f;

	// Set slightly stricter block targets for lumalpha data as we have more bits to play with
	bool block_is_la = blk.is_luminancealpha();
	float block_is_la_scale = block_is_la ? 1.0f / 1.05f : 1.0f;

	// Choose the highest partition count to search; the high speed profiles only use 1 partition
	bool block_skip_two_plane = false;
	int max_partitions;
	if (ctx.config.privateProfile == HIGH_SPEED_PROFILE ||
		ctx.config.privateProfile == HIGH_SPEED_PROFILE_HIGHBITS)
	{
		max_partitions = 1;
	}
#ifdef ASTC_CUSTOMIZED_ENABLE
	else if (ctx.config.privateProfile == CUSTOMIZED_PROFILE)
	{
		if (!g_astcCustomizedSoManager.LoadSutCustomizedSo() ||
			g_astcCustomizedSoManager.customizedMaxPartitionsFunc_ == nullptr)
		{
			printf("astcenc customized so dlopen failed or customizedMaxPartitionsFunc_ is nullptr!\n");
			return;
		}
		max_partitions = g_astcCustomizedSoManager.customizedMaxPartitionsFunc_();
	}
#endif
	else
	{
		max_partitions = ctx.config.tune_partition_count_limit;
	}

	// Per partition count search limits, indexed by (partition_count - 2)
	unsigned int requested_partition_indices[3] {
		ctx.config.tune_2partition_index_limit,
		ctx.config.tune_3partition_index_limit,
		ctx.config.tune_4partition_index_limit
	};

	unsigned int requested_partition_trials[3] {
		ctx.config.tune_2partitioning_candidate_limit,
		ctx.config.tune_3partitioning_candidate_limit,
		ctx.config.tune_4partitioning_candidate_limit
	};

#if defined(ASTCENC_DIAGNOSTICS)
	// Do this early in diagnostic builds so we can dump uniform metrics
	// for every block. Do it later in release builds to avoid redundant work!
	float error_weight_sum = hadd_s(blk.channel_weight) * bsd.texel_count;
	float error_threshold = ctx.config.tune_db_limit
	                      * error_weight_sum
	                      * block_is_l_scale
	                      * block_is_la_scale;

	lowest_correl = prepare_block_statistics(bsd.texel_count, blk);
	trace_add_data("lowest_correl", lowest_correl);
	trace_add_data("tune_error_threshold", error_threshold);
#endif

	// Detected a constant-color block
	if (all(blk.data_min == blk.data_max))
	{
		TRACE_NODE(node1, "pass");
		trace_add_data("partition_count", 0);
		trace_add_data("plane_count", 1);

		scb.partition_count = 0;

		// Encode as FP16 if using HDR
		if ((decode_mode == ASTCENC_PRF_HDR) ||
		    (decode_mode == ASTCENC_PRF_HDR_RGB_LDR_A))
		{
			scb.block_type = SYM_BTYPE_CONST_F16;
			vint4 color_f16 = float_to_float16(blk.origin_texel);
			store(color_f16, scb.constant_color);
		}
		// Encode as UNORM16 if NOT using HDR
		else
		{
			scb.block_type = SYM_BTYPE_CONST_U16;
			vfloat4 color_f32 = clamp(0.0f, 1.0f, blk.origin_texel) * 65535.0f;
			vint4 color_u16 = float_to_int_rtn(color_f32);
			store(color_u16, scb.constant_color);
		}

		trace_add_data("exit", "quality hit");

		// These profiles re-express the constant color as a 1 partition block with all weights
		// zero, using the 16-bit constant color values as the endpoint color bytes
		if (ctx.config.privateProfile != HIGH_QUALITY_PROFILE &&
			ctx.config.privateProfile != HIGH_SPEED_PROFILE_HIGHBITS)
		{
			scb.block_type = SYM_BTYPE_NONCONST;
			scb.partition_count = 1;
			scb.color_formats_matched = 0;
			scb.plane2_component = -1;
			if (ctx.config.privateProfile == HIGH_SPEED_PROFILE)
			{
				scb.block_mode = HIGH_SPEED_PROFILE_BLOCK_MODE;
			}
#ifdef ASTC_CUSTOMIZED_ENABLE
			else if (ctx.config.privateProfile == CUSTOMIZED_PROFILE)
			{
				if (!g_astcCustomizedSoManager.LoadSutCustomizedSo() ||
					g_astcCustomizedSoManager.customizedBlockModeFunc_ == nullptr)
				{
					printf("astcenc customized so dlopen failed or customizedBlockModeFunc_ is nullptr!\n");
					return;
				}
				scb.block_mode = g_astcCustomizedSoManager.customizedBlockModeFunc_();
			}
#endif
			scb.partition_index = 0;
			scb.quant_mode = QUANT_256;
			scb.color_formats[0] = 12; // color format is 12 when block mode is HIGH_SPEED_PROFILE_BLOCK_MODE
			for (int w = 0; w < 16; w++) { // weights num is 16 when block mode is HIGH_SPEED_PROFILE_BLOCK_MODE
				scb.weights[w] = 0;
			}
			for (unsigned int pixel = 0; pixel < BLOCK_MAX_COMPONENTS; pixel++) { // scb.constant_color[pixel] is 16 bit
				scb.color_values[0][pixel << 1] = scb.constant_color[pixel] & BYTE_MASK; // low byte
				scb.color_values[0][(pixel << 1) + 1] = (scb.constant_color[pixel] >> 8) & BYTE_MASK; // high byte
			}
		}
		scb.privateProfile = ctx.config.privateProfile;
		symbolic_to_physical(bsd, scb, pcb);
#if QUALITY_CONTROL
		// A constant color block is reported with zero error
		if (calQualityEnable) {
			*mseBlock[R_COM] = *mseBlock[G_COM] = *mseBlock[B_COM] = *mseBlock[A_COM] = 0;
		}
#endif
		return;
	}

#if !defined(ASTCENC_DIAGNOSTICS)
	float error_weight_sum = hadd_s(blk.channel_weight) * bsd.texel_count;
	float error_threshold = ctx.config.tune_db_limit
	                      * error_weight_sum
	                      * block_is_l_scale
	                      * block_is_la_scale;
#endif

	// Set SCB and mode errors to a very high error value
	scb.errorval = ERROR_CALC_DEFAULT;
	scb.block_type = SYM_BTYPE_ERROR;

	float best_errorvals_for_pcount[BLOCK_MAX_PARTITIONS] {
		ERROR_CALC_DEFAULT, ERROR_CALC_DEFAULT, ERROR_CALC_DEFAULT, ERROR_CALC_DEFAULT
	};

	float exit_thresholds_for_pcount[BLOCK_MAX_PARTITIONS] {
		0.0f,
		ctx.config.tune_2partition_early_out_limit_factor,
		ctx.config.tune_3partition_early_out_limit_factor,
		0.0f
	};

	// Trial using 1 plane of weights and 1 partition.

	// Most of the time we test it twice, first with a mode cutoff of 0 and then with the specified
	// mode cutoff. This causes an early-out that speeds up encoding of easy blocks. However, this
	// optimization is disabled for 4x4 and 5x4 blocks where it nearly always slows down the
	// compression and slightly reduces image quality.

	float errorval_mult[2] {
		1.0f / ctx.config.tune_mse_overshoot,
		1.0f
	};

	// This must be evaluated on every call. A function-local static would be initialized once,
	// from whichever context compressed the first block, and then reused by every later context
	// even if it was configured with a different overshoot.
	const float errorval_overshoot = 1.0f / ctx.config.tune_mse_overshoot;

	// Only enable MODE0 fast path if enabled
	// Never enable for 3D blocks as no "always" block modes are available
	int start_trial = 1;
	if ((ctx.config.tune_search_mode0_enable >= TUNE_MIN_SEARCH_MODE0) && (bsd.zdim == 1))
	{
		start_trial = 0;
	}

	int quant_limit = QUANT_32;
	for (int i = start_trial; i < 2; i++)
	{
		TRACE_NODE(node1, "pass");
		trace_add_data("partition_count", 1);
		trace_add_data("plane_count", 1);
		trace_add_data("search_mode", i);

		float errorval = compress_symbolic_block_for_partition_1plane(
		    ctx.config.privateProfile,
		    ctx.config, bsd, blk, i == 0,
		    error_threshold * errorval_mult[i] * errorval_overshoot,
		    1, 0,  scb, tmpbuf, QUANT_32);

		// Record the quant level so we can use the filter later searches. As far as the visible
		// search code shows, scb is only written when a candidate is accepted, so while it is
		// still the error block its block mode may never have been set; keep the previous
		// limit instead of indexing the block mode table with it.
		if (scb.block_type != SYM_BTYPE_ERROR)
		{
			const auto& bm = bsd.get_block_mode(scb.block_mode);
			quant_limit = bm.get_weight_quant_mode();
		}

		best_errorvals_for_pcount[0] = astc::min(best_errorvals_for_pcount[0], errorval);
		if ((ctx.config.privateProfile == HIGH_SPEED_PROFILE ||
			ctx.config.privateProfile == HIGH_SPEED_PROFILE_HIGHBITS) ||
			(errorval < (error_threshold * errorval_mult[i])))
		{
			trace_add_data("exit", "quality hit");
			goto END_OF_TESTS;
		}
	}

#if !defined(ASTCENC_DIAGNOSTICS)
	lowest_correl = prepare_block_statistics(bsd.texel_count, blk);
#endif

	block_skip_two_plane = lowest_correl > ctx.config.tune_2plane_early_out_limit_correlation;

	// Test the four possible 1-partition, 2-planes modes. Do this in reverse, as
	// alpha is the most likely to be non-correlated if it is present in the data.
	for (int i = BLOCK_MAX_COMPONENTS - 1; i >= 0; i--)
	{
		// Two plane encodings are only searched by the high quality profile
		if (ctx.config.privateProfile != HIGH_QUALITY_PROFILE)
		{
			break;
		}
		TRACE_NODE(node1, "pass");
		trace_add_data("partition_count", 1);
		trace_add_data("plane_count", 2);
		trace_add_data("plane_component", i);

		if (block_skip_two_plane)
		{
			trace_add_data("skip", "tune_2plane_early_out_limit_correlation");
			continue;
		}

		if (blk.grayscale && i != 3)
		{
			trace_add_data("skip", "grayscale block");
			continue;
		}

		if (blk.is_constant_channel(i))
		{
			trace_add_data("skip", "constant component");
			continue;
		}

		float errorval = compress_symbolic_block_for_partition_2planes(
		    ctx.config.privateProfile,
		    ctx.config, bsd, blk, error_threshold * errorval_overshoot,
		    i, scb, tmpbuf, quant_limit);

		// If attempting two planes is much worse than the best one plane result
		// then further two plane searches are unlikely to help so move on ...
		if (errorval > (best_errorvals_for_pcount[0] * 1.85f))
		{
			break;
		}

		if (errorval < error_threshold)
		{
			trace_add_data("exit", "quality hit");
			goto END_OF_TESTS;
		}
	}

	// Find best blocks for 2, 3 and 4 partitions
	for (int partition_count = 2; partition_count <= max_partitions; partition_count++)
	{
		unsigned int partition_indices[TUNE_MAX_PARTITIONING_CANDIDATES];

		unsigned int requested_indices = requested_partition_indices[partition_count - 2];

		unsigned int requested_trials = requested_partition_trials[partition_count - 2];
		requested_trials = astc::min(requested_trials, requested_indices);

		unsigned int actual_trials = find_best_partition_candidates(
		    bsd, blk, partition_count, requested_indices, partition_indices, requested_trials);

		float best_error_in_prev = best_errorvals_for_pcount[partition_count - 2];

		for (unsigned int i = 0; i < actual_trials; i++)
		{
			TRACE_NODE(node1, "pass");
			trace_add_data("partition_count", partition_count);
			trace_add_data("partition_index", partition_indices[i]);
			trace_add_data("plane_count", 1);
			trace_add_data("search_mode", i);

			float errorval = compress_symbolic_block_for_partition_1plane(
			    ctx.config.privateProfile,
			    ctx.config, bsd, blk, false,
			    error_threshold * errorval_overshoot,
			    partition_count, partition_indices[i],
			    scb, tmpbuf, quant_limit);

			best_errorvals_for_pcount[partition_count - 1] = astc::min(best_errorvals_for_pcount[partition_count - 1], errorval);

			// If using N partitions doesn't improve much over using N-1 partitions then skip trying
			// N+1. Error can dramatically improve if the data is correlated or non-correlated and
			// aligns with a partitioning that suits that encoding, so for this inner loop check add
			// a large error scale because the "other" trial could be a lot better.
			float best_error = best_errorvals_for_pcount[partition_count - 1];
			float best_error_scale = exit_thresholds_for_pcount[partition_count - 1] * 1.85f;
			if (best_error > (best_error_in_prev * best_error_scale))
			{
				trace_add_data("skip", "tune_partition_early_out_limit_factor");
				goto END_OF_TESTS;
			}

			if (errorval < error_threshold)
			{
				trace_add_data("exit", "quality hit");
				goto END_OF_TESTS;
			}
		}

		// If using N partitions doesn't improve much over using N-1 partitions then skip trying N+1
		float best_error = best_errorvals_for_pcount[partition_count - 1];
		float best_error_scale = exit_thresholds_for_pcount[partition_count - 1];
		if (best_error > (best_error_in_prev * best_error_scale))
		{
			trace_add_data("skip", "tune_partition_early_out_limit_factor");
			goto END_OF_TESTS;
		}
	}

	trace_add_data("exit", "quality not hit");

END_OF_TESTS:
	// If we still have an error block then convert to something we can encode
	// TODO: Do something more sensible here, such as average color block
	if (scb.block_type == SYM_BTYPE_ERROR)
	{
#if defined(ASTCENC_DIAGNOSTICS)
		static bool printed_once = false;
		if (!printed_once)
		{
			printed_once = true;
			printf("WARN: At least one block failed to find a valid encoding.\n"
			       "      Try increasing compression quality settings.\n\n");
		}
#endif

		scb.block_type = SYM_BTYPE_CONST_U16;
		vfloat4 color_f32 = clamp(0.0f, 1.0f, blk.origin_texel) * 65535.0f;
		vint4 color_u16 = float_to_int_rtn(color_f32);
		store(color_u16, scb.constant_color);
	}

	// Compress to a physical block
	scb.privateProfile = ctx.config.privateProfile;
	symbolic_to_physical(bsd, scb, pcb);
#if QUALITY_CONTROL
	// Decode the block that was just chosen and accumulate, per component, the squared
	// difference between the source and decoded texels after both are scaled to 8 bits
	if (calQualityEnable) {
		image_block decBlk = blk;
		decompress_symbolic_block(ctx.config.profile, bsd, blk.xpos, blk.ypos, blk.zpos, scb, decBlk);
		vint4 colorSumDiff = vint4::zero();
		for (size_t ii = 0; ii < bsd.texel_count; ii++) {
			vint4 colorRef = float_to_int_rtn(blk.texel(ii) * 255.0f / 65535.0f);
			vint4 colorTest = float_to_int_rtn(min(decBlk.texel(ii), 1.0f) * 255.0f);
			vint4 colorDiff = colorRef - colorTest;
			colorSumDiff += colorDiff * colorDiff;
		}
		*mseBlock[R_COM] = colorSumDiff.lane<0>();
		*mseBlock[G_COM] = colorSumDiff.lane<1>();
		*mseBlock[B_COM] = colorSumDiff.lane<2>();
		*mseBlock[A_COM] = colorSumDiff.lane<3>();
	}
#endif
}

#endif