• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // SPDX-License-Identifier: Apache-2.0
2 // ----------------------------------------------------------------------------
3 // Copyright 2011-2022 Arm Limited
4 //
5 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
6 // use this file except in compliance with the License. You may obtain a copy
7 // of the License at:
8 //
9 //     http://www.apache.org/licenses/LICENSE-2.0
10 //
11 // Unless required by applicable law or agreed to in writing, software
12 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
13 // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
14 // License for the specific language governing permissions and limitations
15 // under the License.
16 // ----------------------------------------------------------------------------
17 
18 /**
19  * @brief Functions to decompress a symbolic block.
20  */
21 
22 #include "astcenc_internal.h"
23 
24 #include <stdio.h>
25 #include <assert.h>
26 
27 /**
28  * @brief Compute a vector of texel weights by interpolating the decimated weight grid.
29  *
30  * @param base_texel_index   The first texel to get; N (SIMD width) consecutive texels are loaded.
31  * @param di                 The weight grid decimation to use.
32  * @param weights            The raw weights.
33  *
34  * @return The undecimated weight for N (SIMD width) texels.
35  */
compute_value_of_texel_weight_int_vla(int base_texel_index,const decimation_info & di,const int * weights)36 static vint compute_value_of_texel_weight_int_vla(
37 	int base_texel_index,
38 	const decimation_info& di,
39 	const int* weights
40 ) {
41 	vint summed_value(8);
42 	vint weight_count(di.texel_weight_count + base_texel_index);
43 	int max_weight_count = hmax(weight_count).lane<0>();
44 
45 	promise(max_weight_count > 0);
46 	for (int i = 0; i < max_weight_count; i++)
47 	{
48 		vint texel_weights(di.texel_weights_4t[i] + base_texel_index);
49 		vint texel_weights_int(di.texel_weights_int_4t[i] + base_texel_index);
50 
51 		summed_value += gatheri(weights, texel_weights) * texel_weights_int;
52 	}
53 
54 	return lsr<4>(summed_value);
55 }
56 
57 /**
58  * @brief Compute the integer linear interpolation of two color endpoints.
59  *
60  * @param decode_mode   The ASTC profile (linear or sRGB)
61  * @param color0        The endpoint0 color.
62  * @param color1        The endpoint1 color.
63  * @param weights        The interpolation weight (between 0 and 64).
64  *
65  * @return The interpolated color.
66  */
lerp_color_int(astcenc_profile decode_mode,vint4 color0,vint4 color1,vint4 weights)67 static vint4 lerp_color_int(
68 	astcenc_profile decode_mode,
69 	vint4 color0,
70 	vint4 color1,
71 	vint4 weights
72 ) {
73 	vint4 weight1 = weights;
74 	vint4 weight0 = vint4(64) - weight1;
75 
76 	if (decode_mode == ASTCENC_PRF_LDR_SRGB)
77 	{
78 		color0 = asr<8>(color0);
79 		color1 = asr<8>(color1);
80 	}
81 
82 	vint4 color = (color0 * weight0) + (color1 * weight1) + vint4(32);
83 	color = asr<6>(color);
84 
85 	if (decode_mode == ASTCENC_PRF_LDR_SRGB)
86 	{
87 		color = color * vint4(257);
88 	}
89 
90 	return color;
91 }
92 
93 
94 /**
95  * @brief Convert integer color value into a float value for the decoder.
96  *
97  * @param data       The integer color value post-interpolation.
98  * @param lns_mask   If set treat lane as HDR (LNS) else LDR (unorm16).
99  *
100  * @return The float color value.
101  */
decode_texel(vint4 data,vmask4 lns_mask)102 static inline vfloat4 decode_texel(
103 	vint4 data,
104 	vmask4 lns_mask
105 ) {
106 	vint4 color_lns = vint4::zero();
107 	vint4 color_unorm = vint4::zero();
108 
109 	if (any(lns_mask))
110 	{
111 		color_lns = lns_to_sf16(data);
112 	}
113 
114 	if (!all(lns_mask))
115 	{
116 		color_unorm = unorm16_to_sf16(data);
117 	}
118 
119 	// Pick components and then convert to FP16
120 	vint4 datai = select(color_unorm, color_lns, lns_mask);
121 	return float16_to_float(datai);
122 }
123 
124 /* See header for documentation. */
unpack_weights(const block_size_descriptor & bsd,const symbolic_compressed_block & scb,const decimation_info & di,bool is_dual_plane,quant_method quant_level,int weights_plane1[BLOCK_MAX_TEXELS],int weights_plane2[BLOCK_MAX_TEXELS])125 void unpack_weights(
126 	const block_size_descriptor& bsd,
127 	const symbolic_compressed_block& scb,
128 	const decimation_info& di,
129 	bool is_dual_plane,
130 	quant_method quant_level,
131 	int weights_plane1[BLOCK_MAX_TEXELS],
132 	int weights_plane2[BLOCK_MAX_TEXELS]
133 ) {
134 	// First, unquantize the weights ...
135 	int uq_plane1_weights[BLOCK_MAX_WEIGHTS];
136 	int uq_plane2_weights[BLOCK_MAX_WEIGHTS];
137 	unsigned int weight_count = di.weight_count;
138 
139 	const quantization_and_transfer_table *qat = &(quant_and_xfer_tables[quant_level]);
140 
141 	// Second, undecimate the weights ...
142 	// Safe to overshoot as all arrays are allocated to full size
143 	if (!is_dual_plane)
144 	{
145 		for (unsigned int i = 0; i < weight_count; i++)
146 		{
147 			uq_plane1_weights[i] = qat->unquantized_value[scb.weights[i]];
148 		}
149 
150 		for (unsigned int i = 0; i < bsd.texel_count; i += ASTCENC_SIMD_WIDTH)
151 		{
152 			store(compute_value_of_texel_weight_int_vla(i, di, uq_plane1_weights), weights_plane1 + i);
153 		}
154 	}
155 	else
156 	{
157 		for (unsigned int i = 0; i < weight_count; i++)
158 		{
159 			uq_plane1_weights[i] = qat->unquantized_value[scb.weights[i]];
160 			uq_plane2_weights[i] = qat->unquantized_value[scb.weights[i + WEIGHTS_PLANE2_OFFSET]];
161 		}
162 
163 		for (unsigned int i = 0; i < bsd.texel_count; i += ASTCENC_SIMD_WIDTH)
164 		{
165 			store(compute_value_of_texel_weight_int_vla(i, di, uq_plane1_weights), weights_plane1 + i);
166 			store(compute_value_of_texel_weight_int_vla(i, di, uq_plane2_weights), weights_plane2 + i);
167 		}
168 	}
169 }
170 
171 /**
172  * @brief Return an FP32 NaN value for use in error colors.
173  *
174  * This NaN encoding will turn into 0xFFFF when converted to an FP16 NaN.
175  *
176  * @return The float color value.
177  */
error_color_nan()178 static float error_color_nan()
179 {
180 	if32 v;
181 	v.u = 0xFFFFE000U;
182 	return v.f;
183 }
184 
185 /* See header for documentation. */
decompress_symbolic_block(astcenc_profile decode_mode,const block_size_descriptor & bsd,int xpos,int ypos,int zpos,const symbolic_compressed_block & scb,image_block & blk)186 void decompress_symbolic_block(
187 	astcenc_profile decode_mode,
188 	const block_size_descriptor& bsd,
189 	int xpos,
190 	int ypos,
191 	int zpos,
192 	const symbolic_compressed_block& scb,
193 	image_block& blk
194 ) {
195 	blk.xpos = xpos;
196 	blk.ypos = ypos;
197 	blk.zpos = zpos;
198 
199 	blk.data_min = vfloat4::zero();
200 	blk.data_mean = vfloat4::zero();
201 	blk.data_max = vfloat4::zero();
202 	blk.grayscale = false;
203 
204 	// If we detected an error-block, blow up immediately.
205 	if (scb.block_type == SYM_BTYPE_ERROR)
206 	{
207 		for (unsigned int i = 0; i < bsd.texel_count; i++)
208 		{
209 			blk.data_r[i] = error_color_nan();
210 			blk.data_g[i] = error_color_nan();
211 			blk.data_b[i] = error_color_nan();
212 			blk.data_a[i] = error_color_nan();
213 			blk.rgb_lns[i] = 0;
214 			blk.alpha_lns[i] = 0;
215 		}
216 
217 		return;
218 	}
219 
220 	if ((scb.block_type == SYM_BTYPE_CONST_F16) ||
221 	    (scb.block_type == SYM_BTYPE_CONST_U16))
222 	{
223 		vfloat4 color;
224 		uint8_t use_lns = 0;
225 
226 		// UNORM16 constant color block
227 		if (scb.block_type == SYM_BTYPE_CONST_U16)
228 		{
229 			vint4 colori(scb.constant_color);
230 
231 			// For sRGB decoding a real decoder would just use the top 8 bits for color conversion.
232 			// We don't color convert, so rescale the top 8 bits into the full 16 bit dynamic range.
233 			if (decode_mode == ASTCENC_PRF_LDR_SRGB)
234 			{
235 				colori = asr<8>(colori) * 257;
236 			}
237 
238 			vint4 colorf16 = unorm16_to_sf16(colori);
239 			color = float16_to_float(colorf16);
240 		}
241 		// FLOAT16 constant color block
242 		else
243 		{
244 			switch (decode_mode)
245 			{
246 			case ASTCENC_PRF_LDR_SRGB:
247 			case ASTCENC_PRF_LDR:
248 				color = vfloat4(error_color_nan());
249 				break;
250 			case ASTCENC_PRF_HDR_RGB_LDR_A:
251 			case ASTCENC_PRF_HDR:
252 				// Constant-color block; unpack from FP16 to FP32.
253 				color = float16_to_float(vint4(scb.constant_color));
254 				use_lns = 1;
255 				break;
256 			}
257 		}
258 
259 		for (unsigned int i = 0; i < bsd.texel_count; i++)
260 		{
261 			blk.data_r[i] = color.lane<0>();
262 			blk.data_g[i] = color.lane<1>();
263 			blk.data_b[i] = color.lane<2>();
264 			blk.data_a[i] = color.lane<3>();
265 			blk.rgb_lns[i] = use_lns;
266 			blk.alpha_lns[i] = use_lns;
267 		}
268 
269 		return;
270 	}
271 
272 	// Get the appropriate partition-table entry
273 	int partition_count = scb.partition_count;
274 	const auto& pi = bsd.get_partition_info(partition_count, scb.partition_index);
275 
276 	// Get the appropriate block descriptors
277 	const auto& bm = bsd.get_block_mode(scb.block_mode);
278 	const auto& di = bsd.get_decimation_info(bm.decimation_mode);
279 
280 	int is_dual_plane = bm.is_dual_plane;
281 
282 	// Unquantize and undecimate the weights
283 	int plane1_weights[BLOCK_MAX_TEXELS];
284 	int plane2_weights[BLOCK_MAX_TEXELS];
285 	unpack_weights(bsd, scb, di, is_dual_plane, bm.get_weight_quant_mode(), plane1_weights, plane2_weights);
286 
287 	// Now that we have endpoint colors and weights, we can unpack texel colors
288 	int plane2_component = is_dual_plane ? scb.plane2_component : -1;
289 	vmask4 plane2_mask = vint4::lane_id() == vint4(plane2_component);
290 
291 	for (int i = 0; i < partition_count; i++)
292 	{
293 		// Decode the color endpoints for this partition
294 		vint4 ep0;
295 		vint4 ep1;
296 		bool rgb_lns;
297 		bool a_lns;
298 
299 		unpack_color_endpoints(decode_mode,
300 		                       scb.color_formats[i],
301 		                       scb.get_color_quant_mode(),
302 		                       scb.color_values[i],
303 		                       rgb_lns, a_lns,
304 		                       ep0, ep1);
305 
306 		vmask4 lns_mask(rgb_lns, rgb_lns, rgb_lns, a_lns);
307 
308 		int texel_count = pi.partition_texel_count[i];
309 		for (int j = 0; j < texel_count; j++)
310 		{
311 			int tix = pi.texels_of_partition[i][j];
312 			vint4 weight = select(vint4(plane1_weights[tix]), vint4(plane2_weights[tix]), plane2_mask);
313 			vint4 color = lerp_color_int(decode_mode, ep0, ep1, weight);
314 			vfloat4 colorf = decode_texel(color, lns_mask);
315 
316 			blk.data_r[tix] = colorf.lane<0>();
317 			blk.data_g[tix] = colorf.lane<1>();
318 			blk.data_b[tix] = colorf.lane<2>();
319 			blk.data_a[tix] = colorf.lane<3>();
320 		}
321 	}
322 }
323 
324 #if !defined(ASTCENC_DECOMPRESS_ONLY)
325 
326 /* See header for documentation. */
compute_symbolic_block_difference_2plane(const astcenc_config & config,const block_size_descriptor & bsd,const symbolic_compressed_block & scb,const image_block & blk)327 float compute_symbolic_block_difference_2plane(
328 	const astcenc_config& config,
329 	const block_size_descriptor& bsd,
330 	const symbolic_compressed_block& scb,
331 	const image_block& blk
332 ) {
333 	// If we detected an error-block, blow up immediately.
334 	if (scb.block_type == SYM_BTYPE_ERROR)
335 	{
336 		return ERROR_CALC_DEFAULT;
337 	}
338 
339 	assert(scb.block_mode >= 0);
340 	assert(scb.partition_count == 1);
341 	assert(bsd.get_block_mode(scb.block_mode).is_dual_plane == 1);
342 
343 	// Get the appropriate block descriptor
344 	const block_mode& bm = bsd.get_block_mode(scb.block_mode);
345 	const decimation_info& di = bsd.get_decimation_info(bm.decimation_mode);
346 
347 	// Unquantize and undecimate the weights
348 	int plane1_weights[BLOCK_MAX_TEXELS];
349 	int plane2_weights[BLOCK_MAX_TEXELS];
350 	unpack_weights(bsd, scb, di, true, bm.get_weight_quant_mode(), plane1_weights, plane2_weights);
351 
352 	vmask4 plane2_mask = vint4::lane_id() == vint4(scb.plane2_component);
353 
354 	vfloat4 summa = vfloat4::zero();
355 
356 	// Decode the color endpoints for this partition
357 	vint4 ep0;
358 	vint4 ep1;
359 	bool rgb_lns;
360 	bool a_lns;
361 
362 	unpack_color_endpoints(config.profile,
363 	                       scb.color_formats[0],
364 	                       scb.get_color_quant_mode(),
365 	                       scb.color_values[0],
366 	                       rgb_lns, a_lns,
367 	                       ep0, ep1);
368 
369 	// Unpack and compute error for each texel in the partition
370 	unsigned int texel_count = bsd.texel_count;
371 	for (unsigned int i = 0; i < texel_count; i++)
372 	{
373 		vint4 weight = select(vint4(plane1_weights[i]), vint4(plane2_weights[i]), plane2_mask);
374 		vint4 colori = lerp_color_int(config.profile, ep0, ep1, weight);
375 
376 		vfloat4 color = int_to_float(colori);
377 		vfloat4 oldColor = blk.texel(i);
378 
379 		// Compare error using a perceptual decode metric for RGBM textures
380 		if (config.flags & ASTCENC_FLG_MAP_RGBM)
381 		{
382 			// Fail encodings that result in zero weight M pixels. Note that this can cause
383 			// "interesting" artifacts if we reject all useful encodings - we typically get max
384 			// brightness encodings instead which look just as bad. We recommend users apply a
385 			// bias to their stored M value, limiting the lower value to 16 or 32 to avoid
386 			// getting small M values post-quantization, but we can't prove it would never
387 			// happen, especially at low bit rates ...
388 			if (color.lane<3>() == 0.0f)
389 			{
390 				return -ERROR_CALC_DEFAULT;
391 			}
392 
393 			// Compute error based on decoded RGBM color
394 			color = vfloat4(
395 				color.lane<0>() * color.lane<3>() * config.rgbm_m_scale,
396 				color.lane<1>() * color.lane<3>() * config.rgbm_m_scale,
397 				color.lane<2>() * color.lane<3>() * config.rgbm_m_scale,
398 				1.0f
399 			);
400 
401 			oldColor = vfloat4(
402 				oldColor.lane<0>() * oldColor.lane<3>() * config.rgbm_m_scale,
403 				oldColor.lane<1>() * oldColor.lane<3>() * config.rgbm_m_scale,
404 				oldColor.lane<2>() * oldColor.lane<3>() * config.rgbm_m_scale,
405 				1.0f
406 			);
407 		}
408 
409 		vfloat4 error = oldColor - color;
410 		error = min(abs(error), 1e15f);
411 		error = error * error;
412 
413 		summa += min(dot(error, blk.channel_weight), ERROR_CALC_DEFAULT);
414 	}
415 
416 	return summa.lane<0>();
417 }
418 
419 /* See header for documentation. */
compute_symbolic_block_difference_1plane(const astcenc_config & config,const block_size_descriptor & bsd,const symbolic_compressed_block & scb,const image_block & blk)420 float compute_symbolic_block_difference_1plane(
421 	const astcenc_config& config,
422 	const block_size_descriptor& bsd,
423 	const symbolic_compressed_block& scb,
424 	const image_block& blk
425 ) {
426 	assert(bsd.get_block_mode(scb.block_mode).is_dual_plane == 0);
427 
428 	// If we detected an error-block, blow up immediately.
429 	if (scb.block_type == SYM_BTYPE_ERROR)
430 	{
431 		return ERROR_CALC_DEFAULT;
432 	}
433 
434 	assert(scb.block_mode >= 0);
435 
436 	// Get the appropriate partition-table entry
437 	unsigned int partition_count = scb.partition_count;
438 	const auto& pi = bsd.get_partition_info(partition_count, scb.partition_index);
439 
440 	// Get the appropriate block descriptor
441 	const block_mode& bm = bsd.get_block_mode(scb.block_mode);
442 	const decimation_info& di = bsd.get_decimation_info(bm.decimation_mode);
443 
444 	// Unquantize and undecimate the weights
445 	int plane1_weights[BLOCK_MAX_TEXELS];
446 	unpack_weights(bsd, scb, di, false, bm.get_weight_quant_mode(), plane1_weights, nullptr);
447 
448 	vfloat4 summa = vfloat4::zero();
449 	for (unsigned int i = 0; i < partition_count; i++)
450 	{
451 		// Decode the color endpoints for this partition
452 		vint4 ep0;
453 		vint4 ep1;
454 		bool rgb_lns;
455 		bool a_lns;
456 
457 		unpack_color_endpoints(config.profile,
458 		                       scb.color_formats[i],
459 		                       scb.get_color_quant_mode(),
460 		                       scb.color_values[i],
461 		                       rgb_lns, a_lns,
462 		                       ep0, ep1);
463 
464 		// Unpack and compute error for each texel in the partition
465 		unsigned int texel_count = pi.partition_texel_count[i];
466 		for (unsigned int j = 0; j < texel_count; j++)
467 		{
468 			unsigned int tix = pi.texels_of_partition[i][j];
469 			vint4 colori = lerp_color_int(config.profile, ep0, ep1,
470 			                              vint4(plane1_weights[tix]));
471 
472 			vfloat4 color = int_to_float(colori);
473 			vfloat4 oldColor = blk.texel(tix);
474 
475 			// Compare error using a perceptual decode metric for RGBM textures
476 			if (config.flags & ASTCENC_FLG_MAP_RGBM)
477 			{
478 				// Fail encodings that result in zero weight M pixels. Note that this can cause
479 				// "interesting" artifacts if we reject all useful encodings - we typically get max
480 				// brightness encodings instead which look just as bad. We recommend users apply a
481 				// bias to their stored M value, limiting the lower value to 16 or 32 to avoid
482 				// getting small M values post-quantization, but we can't prove it would never
483 				// happen, especially at low bit rates ...
484 				if (color.lane<3>() == 0.0f)
485 				{
486 					return -ERROR_CALC_DEFAULT;
487 				}
488 
489 				// Compute error based on decoded RGBM color
490 				color = vfloat4(
491 					color.lane<0>() * color.lane<3>() * config.rgbm_m_scale,
492 					color.lane<1>() * color.lane<3>() * config.rgbm_m_scale,
493 					color.lane<2>() * color.lane<3>() * config.rgbm_m_scale,
494 					1.0f
495 				);
496 
497 				oldColor = vfloat4(
498 					oldColor.lane<0>() * oldColor.lane<3>() * config.rgbm_m_scale,
499 					oldColor.lane<1>() * oldColor.lane<3>() * config.rgbm_m_scale,
500 					oldColor.lane<2>() * oldColor.lane<3>() * config.rgbm_m_scale,
501 					1.0f
502 				);
503 			}
504 
505 			vfloat4 error = oldColor - color;
506 			error = min(abs(error), 1e15f);
507 			error = error * error;
508 
509 			summa += min(dot(error, blk.channel_weight), ERROR_CALC_DEFAULT);
510 		}
511 	}
512 
513 	return summa.lane<0>();
514 }
515 
516 /* See header for documentation. */
compute_symbolic_block_difference_1plane_1partition(const astcenc_config & config,const block_size_descriptor & bsd,const symbolic_compressed_block & scb,const image_block & blk)517 float compute_symbolic_block_difference_1plane_1partition(
518 	const astcenc_config& config,
519 	const block_size_descriptor& bsd,
520 	const symbolic_compressed_block& scb,
521 	const image_block& blk
522 ) {
523 	// If we detected an error-block, blow up immediately.
524 	if (scb.block_type == SYM_BTYPE_ERROR)
525 	{
526 		return ERROR_CALC_DEFAULT;
527 	}
528 
529 	assert(scb.block_mode >= 0);
530 	assert(bsd.get_partition_info(scb.partition_count, scb.partition_index).partition_count == 1);
531 
532 	// Get the appropriate block descriptor
533 	const block_mode& bm = bsd.get_block_mode(scb.block_mode);
534 	const decimation_info& di = bsd.get_decimation_info(bm.decimation_mode);
535 
536 	// Unquantize and undecimate the weights
537 	alignas(ASTCENC_VECALIGN) int plane1_weights[BLOCK_MAX_TEXELS];
538 	unpack_weights(bsd, scb, di, false, bm.get_weight_quant_mode(), plane1_weights, nullptr);
539 
540 	// Decode the color endpoints for this partition
541 	vint4 ep0;
542 	vint4 ep1;
543 	bool rgb_lns;
544 	bool a_lns;
545 
546 	unpack_color_endpoints(config.profile,
547 	                       scb.color_formats[0],
548 	                       scb.get_color_quant_mode(),
549 	                       scb.color_values[0],
550 	                       rgb_lns, a_lns,
551 	                       ep0, ep1);
552 
553 
554 	// Pre-shift sRGB so things round correctly
555 	if (config.profile == ASTCENC_PRF_LDR_SRGB)
556 	{
557 		ep0 = asr<8>(ep0);
558 		ep1 = asr<8>(ep1);
559 	}
560 
561 	// Unpack and compute error for each texel in the partition
562 	vfloatacc summav = vfloatacc::zero();
563 
564 	vint lane_id = vint::lane_id();
565 	vint srgb_scale(config.profile == ASTCENC_PRF_LDR_SRGB ? 257 : 1);
566 
567 	unsigned int texel_count = bsd.texel_count;
568 	for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
569 	{
570 		// Compute EP1 contribution
571 		vint weight1 = vint::loada(plane1_weights + i);
572 		vint ep1_r = vint(ep1.lane<0>()) * weight1;
573 		vint ep1_g = vint(ep1.lane<1>()) * weight1;
574 		vint ep1_b = vint(ep1.lane<2>()) * weight1;
575 		vint ep1_a = vint(ep1.lane<3>()) * weight1;
576 
577 		// Compute EP0 contribution
578 		vint weight0 = vint(64) - weight1;
579 		vint ep0_r = vint(ep0.lane<0>()) * weight0;
580 		vint ep0_g = vint(ep0.lane<1>()) * weight0;
581 		vint ep0_b = vint(ep0.lane<2>()) * weight0;
582 		vint ep0_a = vint(ep0.lane<3>()) * weight0;
583 
584 		// Shift so things round correctly
585 		vint colori_r = asr<6>(ep0_r + ep1_r + vint(32)) * srgb_scale;
586 		vint colori_g = asr<6>(ep0_g + ep1_g + vint(32)) * srgb_scale;
587 		vint colori_b = asr<6>(ep0_b + ep1_b + vint(32)) * srgb_scale;
588 		vint colori_a = asr<6>(ep0_a + ep1_a + vint(32)) * srgb_scale;
589 
590 		// Compute color diff
591 		vfloat color_r = int_to_float(colori_r);
592 		vfloat color_g = int_to_float(colori_g);
593 		vfloat color_b = int_to_float(colori_b);
594 		vfloat color_a = int_to_float(colori_a);
595 
596 		vfloat color_orig_r = loada(blk.data_r + i);
597 		vfloat color_orig_g = loada(blk.data_g + i);
598 		vfloat color_orig_b = loada(blk.data_b + i);
599 		vfloat color_orig_a = loada(blk.data_a + i);
600 
601 		vfloat color_error_r = min(abs(color_orig_r - color_r), vfloat(1e15f));
602 		vfloat color_error_g = min(abs(color_orig_g - color_g), vfloat(1e15f));
603 		vfloat color_error_b = min(abs(color_orig_b - color_b), vfloat(1e15f));
604 		vfloat color_error_a = min(abs(color_orig_a - color_a), vfloat(1e15f));
605 
606 		// Compute squared error metric
607 		color_error_r = color_error_r * color_error_r;
608 		color_error_g = color_error_g * color_error_g;
609 		color_error_b = color_error_b * color_error_b;
610 		color_error_a = color_error_a * color_error_a;
611 
612 		vfloat metric = color_error_r * blk.channel_weight.lane<0>()
613 		              + color_error_g * blk.channel_weight.lane<1>()
614 		              + color_error_b * blk.channel_weight.lane<2>()
615 		              + color_error_a * blk.channel_weight.lane<3>();
616 
617 		// Mask off bad lanes
618 		vmask mask = lane_id < vint(texel_count);
619 		lane_id += vint(ASTCENC_SIMD_WIDTH);
620 		haccumulate(summav, metric, mask);
621 	}
622 
623 	return hadd_s(summav);
624 }
625 
626 #endif
627