• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // SPDX-License-Identifier: Apache-2.0
2 // ----------------------------------------------------------------------------
3 // Copyright 2011-2022 Arm Limited
4 //
5 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
6 // use this file except in compliance with the License. You may obtain a copy
7 // of the License at:
8 //
9 //     http://www.apache.org/licenses/LICENSE-2.0
10 //
11 // Unless required by applicable law or agreed to in writing, software
12 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
13 // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
14 // License for the specific language governing permissions and limitations
15 // under the License.
16 // ----------------------------------------------------------------------------
17 
18 /**
19  * @brief Functions for the library entrypoint.
20  */
21 
22 #include <array>
23 #include <cstring>
24 #include <new>
25 
26 #include "astcenc.h"
27 #include "astcenc_internal.h"
28 #include "astcenc_diagnostic_trace.h"
29 
/**
 * @brief One row of quality tuning parameter values for a quality preset.
 *
 * See the @c astcenc_config structure for detailed parameter documentation.
 *
 * Note that the mse_overshoot entries are scaling factors relative to the base MSE to hit db_limit.
 * A 20% overshoot is harder to hit for a higher base db_limit, so we may actually use lower ratios
 * for the more thorough search presets because the underlying db_limit is so much higher.
 *
 * The field order is significant: the preset tables use positional aggregate initialization.
 */
struct astcenc_preset_config
{
	/** @brief The quality level this row of settings corresponds to. */
	float quality;

	/** @brief Preset value for the config field of the same name. */
	unsigned int tune_partition_count_limit;

	/** @brief Preset value for the config field of the same name. */
	unsigned int tune_partition_index_limit;

	/** @brief Preset value for the config field of the same name. */
	unsigned int tune_block_mode_limit;

	/** @brief Preset value for the config field of the same name. */
	unsigned int tune_refinement_limit;

	/** @brief Preset value for the config field of the same name. */
	unsigned int tune_candidate_limit;

	/** @brief First base term used to derive the config dB limit from the block texel count. */
	float tune_db_limit_a_base;

	/** @brief Second base term used to derive the config dB limit from the block texel count. */
	float tune_db_limit_b_base;

	/** @brief Preset value for the config field of the same name. */
	float tune_mode0_mse_overshoot;

	/** @brief Preset value for the config field of the same name. */
	float tune_refinement_mse_overshoot;

	/** @brief Preset value for the config field of the same name. */
	float tune_2_partition_early_out_limit_factor;

	/** @brief Preset value for the config field of the same name. */
	float tune_3_partition_early_out_limit_factor;

	/** @brief Preset value for the config field of the same name. */
	float tune_2_plane_early_out_limit_correlation;

	/** @brief Preset value for the config field of the same name. */
	unsigned int tune_low_weight_count_limit;
};
56 
57 
58 /**
59  * @brief The static quality presets that are built-in for high bandwidth
60  * presets (x < 25 texels per block).
61  */
62 static const std::array<astcenc_preset_config, 5> preset_configs_high {{
63 	{
64 		ASTCENC_PRE_FASTEST,
65 		2, 8, 42, 2, 2, 85.2f, 63.2f, 3.5f, 3.5f, 1.0f, 1.0f, 0.5f, 25
66 	}, {
67 		ASTCENC_PRE_FAST,
68 		3, 12, 55, 3, 3, 85.2f, 63.2f, 3.5f, 3.5f, 1.0f, 1.1f, 0.65f, 20
69 	}, {
70 		ASTCENC_PRE_MEDIUM,
71 		4, 26, 76, 3, 3 , 95.0f, 70.0f, 2.5f, 2.5f, 1.2f, 1.25f, 0.85f, 16
72 	}, {
73 		ASTCENC_PRE_THOROUGH,
74 		4, 76, 93, 4, 4, 105.0f, 77.0f, 10.0f, 10.0f, 2.5f, 1.25f, 0.95f, 12
75 	}, {
76 		ASTCENC_PRE_EXHAUSTIVE,
77 		4, 1024, 100, 4, 4, 200.0f, 200.0f, 10.0f, 10.0f, 10.0f, 10.0f, 0.99f, 0
78 	}
79 }};
80 
81 
82 /**
83  * @brief The static quality presets that are built-in for medium bandwidth
84  * presets (25 <= x < 64 texels per block).
85  */
86 static const std::array<astcenc_preset_config, 5> preset_configs_mid {{
87 	{
88 		ASTCENC_PRE_FASTEST,
89 		2, 8, 40, 2, 2, 85.2f, 63.2f, 3.5f, 3.5f, 1.0f, 1.0f, 0.5f, 20
90 	}, {
91 		ASTCENC_PRE_FAST,
92 		3, 12, 55, 3, 3, 85.2f, 63.2f, 3.5f, 3.5f, 1.0f, 1.1f, 0.5f, 16
93 	}, {
94 		ASTCENC_PRE_MEDIUM,
95 		4, 26, 76, 3, 3, 95.0f, 70.0f, 3.0f, 3.0f, 1.2f, 1.25f, 0.75f, 14
96 	}, {
97 		ASTCENC_PRE_THOROUGH,
98 		4, 76, 93, 4, 4, 105.0f, 77.0f, 10.0f, 10.0f, 2.5f, 1.25f, 0.95f, 10
99 	}, {
100 		ASTCENC_PRE_EXHAUSTIVE,
101 		4, 1024, 100, 4, 4, 200.0f, 200.0f, 10.0f, 10.0f, 10.0f, 10.0f, 0.99f, 0
102 	}
103 }};
104 
105 
106 /**
107  * @brief The static quality presets that are built-in for low bandwidth
108  * presets (64 <= x texels per block).
109  */
110 static const std::array<astcenc_preset_config, 5> preset_configs_low {{
111 	{
112 		ASTCENC_PRE_FASTEST,
113 		2, 6, 38, 2, 2, 85.0f, 63.0f, 3.5f, 3.5f, 1.0f, 1.0f, 0.5f, 20
114 	}, {
115 		ASTCENC_PRE_FAST,
116 		3, 10, 53, 3, 3, 85.0f, 63.0f, 3.5f, 3.5f, 1.0f, 1.1f, 0.5f, 16
117 	}, {
118 		ASTCENC_PRE_MEDIUM,
119 		3, 26, 76, 3, 3, 95.0f, 70.0f, 3.5f, 3.5f, 1.2f, 1.25f, 0.65f, 12
120 	}, {
121 		ASTCENC_PRE_THOROUGH,
122 		4, 75, 92, 4, 4, 105.0f, 77.0f, 10.0f, 10.0f, 2.5f, 1.25f, 0.85f, 10
123 	}, {
124 		ASTCENC_PRE_EXHAUSTIVE,
125 		4, 1024, 100, 4, 4, 200.0f, 200.0f, 10.0f, 10.0f, 10.0f, 10.0f, 0.99f, 0
126 	}
127 }};
128 
129 /**
130  * @brief Validate CPU floating point meets assumptions made in the codec.
131  *
132  * The codec is written with the assumption that a float threaded through the @c if32 union will be
133  * stored and reloaded as a 32-bit IEEE-754 float with round-to-nearest rounding. This is always the
134  * case in an IEEE-754 compliant system, however not every system or compilation mode is actually
135  * IEEE-754 compliant. This normally fails if the code is compiled with fast math enabled.
136  *
137  * @return Return @c ASTCENC_SUCCESS if validated, otherwise an error on failure.
138  */
validate_cpu_float()139 static astcenc_error validate_cpu_float()
140 {
141 	if32 p;
142 	volatile float xprec_testval = 2.51f;
143 	p.f = xprec_testval + 12582912.0f;
144 	float q = p.f - 12582912.0f;
145 
146 	if (q != 3.0f)
147 	{
148 		return ASTCENC_ERR_BAD_CPU_FLOAT;
149 	}
150 
151 	return ASTCENC_SUCCESS;
152 }
153 
154 /**
155  * @brief Validate CPU ISA support meets the requirements of this build of the library.
156  *
157  * Each library build is statically compiled for a particular set of CPU ISA features, such as the
158  * SIMD support or other ISA extensions such as POPCNT. This function checks that the host CPU
159  * actually supports everything this build needs.
160  *
161  * @return Return @c ASTCENC_SUCCESS if validated, otherwise an error on failure.
162  */
validate_cpu_isa()163 static astcenc_error validate_cpu_isa()
164 {
165 	#if ASTCENC_SSE >= 41
166 		if (!cpu_supports_sse41())
167 		{
168 			return ASTCENC_ERR_BAD_CPU_ISA;
169 		}
170 	#endif
171 
172 	#if ASTCENC_POPCNT >= 1
173 		if (!cpu_supports_popcnt())
174 		{
175 			return ASTCENC_ERR_BAD_CPU_ISA;
176 		}
177 	#endif
178 
179 	#if ASTCENC_F16C >= 1
180 		if (!cpu_supports_f16c())
181 		{
182 			return ASTCENC_ERR_BAD_CPU_ISA;
183 		}
184 	#endif
185 
186 	#if ASTCENC_AVX >= 2
187 		if (!cpu_supports_avx2())
188 		{
189 			return ASTCENC_ERR_BAD_CPU_ISA;
190 		}
191 	#endif
192 
193 	return ASTCENC_SUCCESS;
194 }
195 
196 /**
197  * @brief Validate config profile.
198  *
199  * @param profile   The profile to check.
200  *
201  * @return Return @c ASTCENC_SUCCESS if validated, otherwise an error on failure.
202  */
validate_profile(astcenc_profile profile)203 static astcenc_error validate_profile(
204 	astcenc_profile profile
205 ) {
206 	// Values in this enum are from an external user, so not guaranteed to be
207 	// bounded to the enum values
208 	switch (static_cast<int>(profile))
209 	{
210 	case ASTCENC_PRF_LDR_SRGB:
211 	case ASTCENC_PRF_LDR:
212 	case ASTCENC_PRF_HDR_RGB_LDR_A:
213 	case ASTCENC_PRF_HDR:
214 		return ASTCENC_SUCCESS;
215 	default:
216 		return ASTCENC_ERR_BAD_PROFILE;
217 	}
218 }
219 
220 /**
221  * @brief Validate block size.
222  *
223  * @param block_x   The block x dimensions.
224  * @param block_y   The block y dimensions.
225  * @param block_z   The block z dimensions.
226  *
227  * @return Return @c ASTCENC_SUCCESS if validated, otherwise an error on failure.
228  */
validate_block_size(unsigned int block_x,unsigned int block_y,unsigned int block_z)229 static astcenc_error validate_block_size(
230 	unsigned int block_x,
231 	unsigned int block_y,
232 	unsigned int block_z
233 ) {
234 	// Test if this is a legal block size at all
235 	bool is_legal = (((block_z <= 1) && is_legal_2d_block_size(block_x, block_y)) ||
236 	                 ((block_z >= 2) && is_legal_3d_block_size(block_x, block_y, block_z)));
237 	if (!is_legal)
238 	{
239 		return ASTCENC_ERR_BAD_BLOCK_SIZE;
240 	}
241 
242 	// Test if this build has sufficient capacity for this block size
243 	bool have_capacity = (block_x * block_y * block_z) <= BLOCK_MAX_TEXELS;
244 	if (!have_capacity)
245 	{
246 		return ASTCENC_ERR_NOT_IMPLEMENTED;
247 	}
248 
249 	return ASTCENC_SUCCESS;
250 }
251 
252 /**
253  * @brief Validate flags.
254  *
255  * @param flags   The flags to check.
256  *
257  * @return Return @c ASTCENC_SUCCESS if validated, otherwise an error on failure.
258  */
validate_flags(unsigned int flags)259 static astcenc_error validate_flags(
260 	unsigned int flags
261 ) {
262 	// Flags field must not contain any unknown flag bits
263 	unsigned int exMask = ~ASTCENC_ALL_FLAGS;
264 	if (popcount(flags & exMask) != 0)
265 	{
266 		return ASTCENC_ERR_BAD_FLAGS;
267 	}
268 
269 	// Flags field must only contain at most a single map type
270 	exMask = ASTCENC_FLG_MAP_MASK
271 	       | ASTCENC_FLG_MAP_NORMAL
272 	       | ASTCENC_FLG_MAP_RGBM;
273 	if (popcount(flags & exMask) > 1)
274 	{
275 		return ASTCENC_ERR_BAD_FLAGS;
276 	}
277 
278 	return ASTCENC_SUCCESS;
279 }
280 
281 #if !defined(ASTCENC_DECOMPRESS_ONLY)
282 
283 /**
284  * @brief Validate single channel compression swizzle.
285  *
286  * @param swizzle   The swizzle to check.
287  *
288  * @return Return @c ASTCENC_SUCCESS if validated, otherwise an error on failure.
289  */
validate_compression_swz(astcenc_swz swizzle)290 static astcenc_error validate_compression_swz(
291 	astcenc_swz swizzle
292 ) {
293 	// Not all enum values are handled; SWZ_Z is invalid for compression
294 	switch (static_cast<int>(swizzle))
295 	{
296 	case ASTCENC_SWZ_R:
297 	case ASTCENC_SWZ_G:
298 	case ASTCENC_SWZ_B:
299 	case ASTCENC_SWZ_A:
300 	case ASTCENC_SWZ_0:
301 	case ASTCENC_SWZ_1:
302 		return ASTCENC_SUCCESS;
303 	default:
304 		return ASTCENC_ERR_BAD_SWIZZLE;
305 	}
306 }
307 
308 /**
309  * @brief Validate overall compression swizzle.
310  *
311  * @param swizzle   The swizzle to check.
312  *
313  * @return Return @c ASTCENC_SUCCESS if validated, otherwise an error on failure.
314  */
validate_compression_swizzle(const astcenc_swizzle & swizzle)315 static astcenc_error validate_compression_swizzle(
316 	const astcenc_swizzle& swizzle
317 ) {
318 	if (validate_compression_swz(swizzle.r) ||
319 	    validate_compression_swz(swizzle.g) ||
320 	    validate_compression_swz(swizzle.b) ||
321 	    validate_compression_swz(swizzle.a))
322 	{
323 		return ASTCENC_ERR_BAD_SWIZZLE;
324 	}
325 
326 	return ASTCENC_SUCCESS;
327 }
328 #endif
329 
330 /**
331  * @brief Validate single channel decompression swizzle.
332  *
333  * @param swizzle   The swizzle to check.
334  *
335  * @return Return @c ASTCENC_SUCCESS if validated, otherwise an error on failure.
336  */
validate_decompression_swz(astcenc_swz swizzle)337 static astcenc_error validate_decompression_swz(
338 	astcenc_swz swizzle
339 ) {
340 	// Values in this enum are from an external user, so not guaranteed to be
341 	// bounded to the enum values
342 	switch (static_cast<int>(swizzle))
343 	{
344 	case ASTCENC_SWZ_R:
345 	case ASTCENC_SWZ_G:
346 	case ASTCENC_SWZ_B:
347 	case ASTCENC_SWZ_A:
348 	case ASTCENC_SWZ_0:
349 	case ASTCENC_SWZ_1:
350 	case ASTCENC_SWZ_Z:
351 		return ASTCENC_SUCCESS;
352 	default:
353 		return ASTCENC_ERR_BAD_SWIZZLE;
354 	}
355 }
356 
357 /**
358  * @brief Validate overall decompression swizzle.
359  *
360  * @param swizzle   The swizzle to check.
361  *
362  * @return Return @c ASTCENC_SUCCESS if validated, otherwise an error on failure.
363  */
validate_decompression_swizzle(const astcenc_swizzle & swizzle)364 static astcenc_error validate_decompression_swizzle(
365 	const astcenc_swizzle& swizzle
366 ) {
367 	if (validate_decompression_swz(swizzle.r) ||
368 	    validate_decompression_swz(swizzle.g) ||
369 	    validate_decompression_swz(swizzle.b) ||
370 	    validate_decompression_swz(swizzle.a))
371 	{
372 		return ASTCENC_ERR_BAD_SWIZZLE;
373 	}
374 
375 	return ASTCENC_SUCCESS;
376 }
377 
378 /**
379  * Validate that an incoming configuration is in-spec.
380  *
381  * This function can respond in two ways:
382  *
383  *   * Numerical inputs that have valid ranges are clamped to those valid ranges. No error is thrown
384  *     for out-of-range inputs in this case.
385  *   * Numerical inputs and logic inputs are are logically invalid and which make no sense
386  *     algorithmically will return an error.
387  *
388  * @param[in,out] config   The input compressor configuration.
389  *
390  * @return Return @c ASTCENC_SUCCESS if validated, otherwise an error on failure.
391  */
validate_config(astcenc_config & config)392 static astcenc_error validate_config(
393 	astcenc_config &config
394 ) {
395 	astcenc_error status;
396 
397 	status = validate_profile(config.profile);
398 	if (status != ASTCENC_SUCCESS)
399 	{
400 		return status;
401 	}
402 
403 	status = validate_flags(config.flags);
404 	if (status != ASTCENC_SUCCESS)
405 	{
406 		return status;
407 	}
408 
409 	status = validate_block_size(config.block_x, config.block_y, config.block_z);
410 	if (status != ASTCENC_SUCCESS)
411 	{
412 		return status;
413 	}
414 
415 #if defined(ASTCENC_DECOMPRESS_ONLY)
416 	// Decompress-only builds only support decompress-only contexts
417 	if (!(config.flags & ASTCENC_FLG_DECOMPRESS_ONLY))
418 	{
419 		return ASTCENC_ERR_BAD_PARAM;
420 	}
421 #endif
422 
423 	config.rgbm_m_scale = astc::max(config.rgbm_m_scale, 1.0f);
424 
425 	config.tune_partition_count_limit = astc::clamp(config.tune_partition_count_limit, 1u, 4u);
426 	config.tune_partition_index_limit = astc::clamp(config.tune_partition_index_limit, 1u, BLOCK_MAX_PARTITIONINGS);
427 	config.tune_block_mode_limit = astc::clamp(config.tune_block_mode_limit, 1u, 100u);
428 	config.tune_refinement_limit = astc::max(config.tune_refinement_limit, 1u);
429 	config.tune_candidate_limit = astc::clamp(config.tune_candidate_limit, 1u, TUNE_MAX_TRIAL_CANDIDATES);
430 	config.tune_db_limit = astc::max(config.tune_db_limit, 0.0f);
431 	config.tune_mode0_mse_overshoot = astc::max(config.tune_mode0_mse_overshoot, 1.0f);
432 	config.tune_refinement_mse_overshoot = astc::max(config.tune_refinement_mse_overshoot, 1.0f);
433 	config.tune_2_partition_early_out_limit_factor = astc::max(config.tune_2_partition_early_out_limit_factor, 0.0f);
434 	config.tune_3_partition_early_out_limit_factor = astc::max(config.tune_3_partition_early_out_limit_factor, 0.0f);
435 	config.tune_2_plane_early_out_limit_correlation = astc::max(config.tune_2_plane_early_out_limit_correlation, 0.0f);
436 
437 	// Specifying a zero weight color component is not allowed; force to small value
438 	float max_weight = astc::max(astc::max(config.cw_r_weight, config.cw_g_weight),
439 	                             astc::max(config.cw_b_weight, config.cw_a_weight));
440 	if (max_weight > 0.0f)
441 	{
442 		max_weight /= 1000.0f;
443 		config.cw_r_weight = astc::max(config.cw_r_weight, max_weight);
444 		config.cw_g_weight = astc::max(config.cw_g_weight, max_weight);
445 		config.cw_b_weight = astc::max(config.cw_b_weight, max_weight);
446 		config.cw_a_weight = astc::max(config.cw_a_weight, max_weight);
447 	}
448 	// If all color components error weights are zero then return an error
449 	else
450 	{
451 		return ASTCENC_ERR_BAD_PARAM;
452 	}
453 
454 	return ASTCENC_SUCCESS;
455 }
456 
457 /* See header for documentation. */
astcenc_config_init(astcenc_profile profile,unsigned int block_x,unsigned int block_y,unsigned int block_z,float quality,unsigned int flags,astcenc_config * configp)458 astcenc_error astcenc_config_init(
459 	astcenc_profile profile,
460 	unsigned int block_x,
461 	unsigned int block_y,
462 	unsigned int block_z,
463 	float quality,
464 	unsigned int flags,
465 	astcenc_config* configp
466 ) {
467 	astcenc_error status;
468 	astcenc_config& config = *configp;
469 
470 	// Zero init all config fields; although most of will be over written
471 	std::memset(&config, 0, sizeof(config));
472 
473 	// Process the block size
474 	block_z = astc::max(block_z, 1u); // For 2D blocks Z==0 is accepted, but convert to 1
475 	status = validate_block_size(block_x, block_y, block_z);
476 	if (status != ASTCENC_SUCCESS)
477 	{
478 		return status;
479 	}
480 
481 	config.block_x = block_x;
482 	config.block_y = block_y;
483 	config.block_z = block_z;
484 
485 	float texels = static_cast<float>(block_x * block_y * block_z);
486 	float ltexels = logf(texels) / logf(10.0f);
487 
488 	// Process the performance quality level or preset; note that this must be done before we
489 	// process any additional settings, such as color profile and flags, which may replace some of
490 	// these settings with more use case tuned values
491 	if (quality < ASTCENC_PRE_FASTEST ||
492 	    quality > ASTCENC_PRE_EXHAUSTIVE)
493 	{
494 		return ASTCENC_ERR_BAD_QUALITY;
495 	}
496 
497 	static const std::array<astcenc_preset_config, 5>* preset_configs;
498 	int texels_int = block_x * block_y * block_z;
499 	if (texels_int < 25)
500 	{
501 		preset_configs = &preset_configs_high;
502 	}
503 	else if (texels_int < 64)
504 	{
505 		preset_configs = &preset_configs_mid;
506 	}
507 	else
508 	{
509 		preset_configs = &preset_configs_low;
510 	}
511 
512 	// Determine which preset to use, or which pair to interpolate
513 	size_t start;
514 	size_t end;
515 	for (end = 0; end < preset_configs->size(); end++)
516 	{
517 		if ((*preset_configs)[end].quality >= quality)
518 		{
519 			break;
520 		}
521 	}
522 
523 	start = end == 0 ? 0 : end - 1;
524 
525 	// Start and end node are the same - so just transfer the values.
526 	if (start == end)
527 	{
528 		config.tune_partition_count_limit = (*preset_configs)[start].tune_partition_count_limit;
529 		config.tune_partition_index_limit = (*preset_configs)[start].tune_partition_index_limit;
530 		config.tune_block_mode_limit = (*preset_configs)[start].tune_block_mode_limit;
531 		config.tune_refinement_limit = (*preset_configs)[start].tune_refinement_limit;
532 		config.tune_candidate_limit = astc::min((*preset_configs)[start].tune_candidate_limit,
533 		                                        TUNE_MAX_TRIAL_CANDIDATES);
534 		config.tune_db_limit = astc::max((*preset_configs)[start].tune_db_limit_a_base - 35 * ltexels,
535 		                                 (*preset_configs)[start].tune_db_limit_b_base - 19 * ltexels);
536 
537 		config.tune_mode0_mse_overshoot = (*preset_configs)[start].tune_mode0_mse_overshoot;
538 		config.tune_refinement_mse_overshoot = (*preset_configs)[start].tune_refinement_mse_overshoot;
539 
540 		config.tune_2_partition_early_out_limit_factor = (*preset_configs)[start].tune_2_partition_early_out_limit_factor;
541 		config.tune_3_partition_early_out_limit_factor =(*preset_configs)[start].tune_3_partition_early_out_limit_factor;
542 		config.tune_2_plane_early_out_limit_correlation = (*preset_configs)[start].tune_2_plane_early_out_limit_correlation;
543 		config.tune_low_weight_count_limit = (*preset_configs)[start].tune_low_weight_count_limit;
544 	}
545 	// Start and end node are not the same - so interpolate between them
546 	else
547 	{
548 		auto& node_a = (*preset_configs)[start];
549 		auto& node_b = (*preset_configs)[end];
550 
551 		float wt_range = node_b.quality - node_a.quality;
552 		assert(wt_range > 0);
553 
554 		// Compute interpolation factors
555 		float wt_node_a = (node_b.quality - quality) / wt_range;
556 		float wt_node_b = (quality - node_a.quality) / wt_range;
557 
558 		#define LERP(param) ((node_a.param * wt_node_a) + (node_b.param * wt_node_b))
559 		#define LERPI(param) astc::flt2int_rtn(\
560 		                         (static_cast<float>(node_a.param) * wt_node_a) + \
561 		                         (static_cast<float>(node_b.param) * wt_node_b))
562 		#define LERPUI(param) static_cast<unsigned int>(LERPI(param))
563 
564 		config.tune_partition_count_limit = LERPI(tune_partition_count_limit);
565 		config.tune_partition_index_limit = LERPI(tune_partition_index_limit);
566 		config.tune_block_mode_limit = LERPI(tune_block_mode_limit);
567 		config.tune_refinement_limit = LERPI(tune_refinement_limit);
568 		config.tune_candidate_limit = astc::min(LERPUI(tune_candidate_limit),
569 		                                        TUNE_MAX_TRIAL_CANDIDATES);
570 		config.tune_db_limit = astc::max(LERP(tune_db_limit_a_base) - 35 * ltexels,
571 		                                 LERP(tune_db_limit_b_base) - 19 * ltexels);
572 
573 		config.tune_mode0_mse_overshoot = LERP(tune_mode0_mse_overshoot);
574 		config.tune_refinement_mse_overshoot = LERP(tune_refinement_mse_overshoot);
575 
576 		config.tune_2_partition_early_out_limit_factor = LERP(tune_2_partition_early_out_limit_factor);
577 		config.tune_3_partition_early_out_limit_factor = LERP(tune_3_partition_early_out_limit_factor);
578 		config.tune_2_plane_early_out_limit_correlation = LERP(tune_2_plane_early_out_limit_correlation);
579 		config.tune_low_weight_count_limit = LERPI(tune_low_weight_count_limit);
580 		#undef LERP
581 		#undef LERPI
582 		#undef LERPUI
583 	}
584 
585 	// Set heuristics to the defaults for each color profile
586 	config.cw_r_weight = 1.0f;
587 	config.cw_g_weight = 1.0f;
588 	config.cw_b_weight = 1.0f;
589 	config.cw_a_weight = 1.0f;
590 
591 	config.a_scale_radius = 0;
592 
593 	config.rgbm_m_scale = 0.0f;
594 
595 	config.profile = profile;
596 
597 	// Values in this enum are from an external user, so not guaranteed to be
598 	// bounded to the enum values
599 	switch (static_cast<int>(profile))
600 	{
601 	case ASTCENC_PRF_LDR:
602 	case ASTCENC_PRF_LDR_SRGB:
603 		break;
604 	case ASTCENC_PRF_HDR_RGB_LDR_A:
605 	case ASTCENC_PRF_HDR:
606 		config.tune_db_limit = 999.0f;
607 		break;
608 	default:
609 		return ASTCENC_ERR_BAD_PROFILE;
610 	}
611 
612 	// Flags field must not contain any unknown flag bits
613 	status = validate_flags(flags);
614 	if (status != ASTCENC_SUCCESS)
615 	{
616 		return status;
617 	}
618 
619 	if (flags & ASTCENC_FLG_MAP_NORMAL)
620 	{
621 		// Normal map encoding uses L+A blocks, so allow one more partitioning
622 		// than normal. We need need fewer bits for endpoints, so more likely
623 		// to be able to use more partitions than an RGB/RGBA block
624 		config.tune_partition_count_limit = astc::min(config.tune_partition_count_limit + 1u, 4u);
625 
626 		config.cw_g_weight = 0.0f;
627 		config.cw_b_weight = 0.0f;
628 		config.tune_2_partition_early_out_limit_factor *= 1.5f;
629 		config.tune_3_partition_early_out_limit_factor *= 1.5f;
630 		config.tune_2_plane_early_out_limit_correlation = 0.99f;
631 
632 		// Normals are prone to blocking artifacts on smooth curves
633 		// so force compressor to try harder here ...
634 		config.tune_db_limit *= 1.03f;
635 	}
636 	else if (flags & ASTCENC_FLG_MAP_MASK)
637 	{
638 		// Masks are prone to blocking artifacts on mask edges
639 		// so force compressor to try harder here ...
640 		config.tune_db_limit *= 1.03f;
641 	}
642 	else if (flags & ASTCENC_FLG_MAP_RGBM)
643 	{
644 		config.rgbm_m_scale = 5.0f;
645 		config.cw_a_weight = 2.0f * config.rgbm_m_scale;
646 	}
647 	else // (This is color data)
648 	{
649 		// This is a very basic perceptual metric for RGB color data, which weights error
650 		// significance by the perceptual luminance contribution of each color channel. For
651 		// luminance the usual weights to compute luminance from a linear RGB value are as
652 		// follows:
653 		//
654 		//     l = r * 0.3 + g * 0.59 + b * 0.11
655 		//
656 		// ... but we scale these up to keep a better balance between color and alpha. Note
657 		// that if the content is using alpha we'd recommend using the -a option to weight
658 		// the color conribution by the alpha transparency.
659 		if (flags & ASTCENC_FLG_USE_PERCEPTUAL)
660 		{
661 			config.cw_r_weight = 0.30f * 2.25f;
662 			config.cw_g_weight = 0.59f * 2.25f;
663 			config.cw_b_weight = 0.11f * 2.25f;
664 		}
665 	}
666 	config.flags = flags;
667 
668 	return ASTCENC_SUCCESS;
669 }
670 
671 /* See header for documentation. */
astcenc_context_alloc(const astcenc_config * configp,unsigned int thread_count,astcenc_context ** context)672 astcenc_error astcenc_context_alloc(
673 	const astcenc_config* configp,
674 	unsigned int thread_count,
675 	astcenc_context** context
676 ) {
677 	astcenc_error status;
678 	const astcenc_config& config = *configp;
679 
680 	status = validate_cpu_float();
681 	if (status != ASTCENC_SUCCESS)
682 	{
683 		return status;
684 	}
685 
686 	status = validate_cpu_isa();
687 	if (status != ASTCENC_SUCCESS)
688 	{
689 		return status;
690 	}
691 
692 	if (thread_count == 0)
693 	{
694 		return ASTCENC_ERR_BAD_PARAM;
695 	}
696 
697 #if defined(ASTCENC_DIAGNOSTICS)
698 	// Force single threaded compressor use in diagnostic mode.
699 	if (thread_count != 1)
700 	{
701 		return ASTCENC_ERR_BAD_PARAM;
702 	}
703 #endif
704 
705 	astcenc_context* ctx = new astcenc_context;
706 	ctx->thread_count = thread_count;
707 	ctx->config = config;
708 	ctx->working_buffers = nullptr;
709 
710 	// These are allocated per-compress, as they depend on image size
711 	ctx->input_alpha_averages = nullptr;
712 
713 	// Copy the config first and validate the copy (we may modify it)
714 	status = validate_config(ctx->config);
715 	if (status != ASTCENC_SUCCESS)
716 	{
717 		delete ctx;
718 		return status;
719 	}
720 
721 	ctx->bsd = aligned_malloc<block_size_descriptor>(sizeof(block_size_descriptor), ASTCENC_VECALIGN);
722 	bool can_omit_modes = config.flags & ASTCENC_FLG_SELF_DECOMPRESS_ONLY;
723 	init_block_size_descriptor(ctx->config.privateProfile, config.block_x, config.block_y, config.block_z,
724 	                           can_omit_modes,
725 	                           config.tune_partition_count_limit,
726 	                           static_cast<float>(config.tune_block_mode_limit) / 100.0f,
727 	                           *ctx->bsd);
728 
729 #if !defined(ASTCENC_DECOMPRESS_ONLY)
730 	// Do setup only needed by compression
731 	if (!(status & ASTCENC_FLG_DECOMPRESS_ONLY))
732 	{
733 		// Turn a dB limit into a per-texel error for faster use later
734 		if ((ctx->config.profile == ASTCENC_PRF_LDR) || (ctx->config.profile == ASTCENC_PRF_LDR_SRGB))
735 		{
736 			ctx->config.tune_db_limit = astc::pow(0.1f, ctx->config.tune_db_limit * 0.1f) * 65535.0f * 65535.0f;
737 		}
738 		else
739 		{
740 			ctx->config.tune_db_limit = 0.0f;
741 		}
742 
743 		size_t worksize = sizeof(compression_working_buffers) * thread_count;
744 		ctx->working_buffers = aligned_malloc<compression_working_buffers>(worksize, ASTCENC_VECALIGN);
745 		static_assert((sizeof(compression_working_buffers) % ASTCENC_VECALIGN) == 0,
746 		              "compression_working_buffers size must be multiple of vector alignment");
747 		if (!ctx->working_buffers)
748 		{
749 			aligned_free<block_size_descriptor>(ctx->bsd);
750 			delete ctx;
751 			*context = nullptr;
752 			return ASTCENC_ERR_OUT_OF_MEM;
753 		}
754 	}
755 #endif
756 
757 #if defined(ASTCENC_DIAGNOSTICS)
758 	ctx->trace_log = new TraceLog(ctx->config.trace_file_path);
759 	if (!ctx->trace_log->m_file)
760 	{
761 		return ASTCENC_ERR_DTRACE_FAILURE;
762 	}
763 
764 	trace_add_data("block_x", config.block_x);
765 	trace_add_data("block_y", config.block_y);
766 	trace_add_data("block_z", config.block_z);
767 #endif
768 
769 	*context = ctx;
770 
771 #if !defined(ASTCENC_DECOMPRESS_ONLY)
772 	prepare_angular_tables();
773 #endif
774 
775 	return ASTCENC_SUCCESS;
776 }
777 
778 /* See header dor documentation. */
astcenc_context_free(astcenc_context * ctx)779 void astcenc_context_free(
780 	astcenc_context* ctx
781 ) {
782 	if (ctx)
783 	{
784 		aligned_free<compression_working_buffers>(ctx->working_buffers);
785 		aligned_free<block_size_descriptor>(ctx->bsd);
786 #if defined(ASTCENC_DIAGNOSTICS)
787 		delete ctx->trace_log;
788 #endif
789 		delete ctx;
790 	}
791 }
792 
793 #if !defined(ASTCENC_DECOMPRESS_ONLY)
794 
795 /**
796  * @brief Compress an image, after any preflight has completed.
797  *
798  * @param[out] ctx            The compressor context.
799  * @param      thread_index   The thread index.
800  * @param      image          The intput image.
801  * @param      swizzle        The input swizzle.
802  * @param[out] buffer         The output array for the compressed data.
803  */
compress_image(astcenc_context & ctx,unsigned int thread_index,const astcenc_image & image,const astcenc_swizzle & swizzle,uint8_t * buffer,bool calQualityEnable,int32_t * mse[RGBA_COM])804 static void compress_image(
805 	astcenc_context& ctx,
806 	unsigned int thread_index,
807 	const astcenc_image& image,
808 	const astcenc_swizzle& swizzle,
809 #if QUALITY_CONTROL
810 	uint8_t* buffer,
811 	bool calQualityEnable,
812 	int32_t *mse[RGBA_COM]
813 #else
814 	uint8_t* buffer
815 #endif
816 ) {
817 	const block_size_descriptor& bsd = *ctx.bsd;
818 	astcenc_profile decode_mode = ctx.config.profile;
819 
820 	image_block blk;
821 
822 	int block_x = bsd.xdim;
823 	int block_y = bsd.ydim;
824 	int block_z = bsd.zdim;
825 	blk.texel_count = block_x * block_y * block_z;
826 
827 	int dim_x = image.dim_x;
828 	int dim_y = image.dim_y;
829 	int dim_z = image.dim_z;
830 
831 	int xblocks = (dim_x + block_x - 1) / block_x;
832 	int yblocks = (dim_y + block_y - 1) / block_y;
833 	int zblocks = (dim_z + block_z - 1) / block_z;
834 	int block_count = zblocks * yblocks * xblocks;
835 
836 	int row_blocks = xblocks;
837 	int plane_blocks = xblocks * yblocks;
838 
839 	// Populate the block channel weights
840 	blk.channel_weight = vfloat4(ctx.config.cw_r_weight,
841 	                             ctx.config.cw_g_weight,
842 	                             ctx.config.cw_b_weight,
843 	                             ctx.config.cw_a_weight);
844 
845 	// Use preallocated scratch buffer
846 	auto& temp_buffers = ctx.working_buffers[thread_index];
847 
848 	// Only the first thread actually runs the initializer
849 	ctx.manage_compress.init(block_count);
850 
851 
852 	// Determine if we can use an optimized load function
853 	bool needs_swz = (swizzle.r != ASTCENC_SWZ_R) || (swizzle.g != ASTCENC_SWZ_G) ||
854 	                 (swizzle.b != ASTCENC_SWZ_B) || (swizzle.a != ASTCENC_SWZ_A);
855 
856 	bool needs_hdr = (decode_mode == ASTCENC_PRF_HDR) ||
857 	                 (decode_mode == ASTCENC_PRF_HDR_RGB_LDR_A);
858 
859 	bool use_fast_load = !needs_swz && !needs_hdr &&
860 	                     block_z == 1 && image.data_type == ASTCENC_TYPE_U8;
861 
862 	auto load_func = fetch_image_block;
863 	if (use_fast_load)
864 	{
865 		load_func = fetch_image_block_fast_ldr;
866 	}
867 
868 	// All threads run this processing loop until there is no work remaining
869 	while (true)
870 	{
871 		unsigned int count;
872 		unsigned int base = ctx.manage_compress.get_task_assignment(16, count);
873 		if (!count)
874 		{
875 			break;
876 		}
877 
878 		for (unsigned int i = base; i < base + count; i++)
879 		{
880 			// Decode i into x, y, z block indices
881 			int z = i / plane_blocks;
882 			unsigned int rem = i - (z * plane_blocks);
883 			int y = rem / row_blocks;
884 			int x = rem - (y * row_blocks);
885 
886 			// Test if we can apply some basic alpha-scale RDO
887 			bool use_full_block = true;
888 			if (ctx.config.a_scale_radius != 0 && block_z == 1)
889 			{
890 				int start_x = x * block_x;
891 				int end_x = astc::min(dim_x, start_x + block_x);
892 
893 				int start_y = y * block_y;
894 				int end_y = astc::min(dim_y, start_y + block_y);
895 
896 				// SATs accumulate error, so don't test exactly zero. Test for
897 				// less than 1 alpha in the expanded block footprint that
898 				// includes the alpha radius.
899 				int x_footprint = block_x + 2 * (ctx.config.a_scale_radius - 1);
900 
901 				int y_footprint = block_y + 2 * (ctx.config.a_scale_radius - 1);
902 
903 				float footprint = static_cast<float>(x_footprint * y_footprint);
904 				float threshold = 0.9f / (255.0f * footprint);
905 
906 				// Do we have any alpha values?
907 				use_full_block = false;
908 				for (int ay = start_y; ay < end_y; ay++)
909 				{
910 					for (int ax = start_x; ax < end_x; ax++)
911 					{
912 						float a_avg = ctx.input_alpha_averages[ay * dim_x + ax];
913 						if (a_avg > threshold)
914 						{
915 							use_full_block = true;
916 							ax = end_x;
917 							ay = end_y;
918 						}
919 					}
920 				}
921 			}
922 
923 			// Fetch the full block for compression
924 			if (use_full_block)
925 			{
926 				load_func(decode_mode, image, blk, bsd, x * block_x, y * block_y, z * block_z, swizzle);
927 			}
928 			// Apply alpha scale RDO - substitute constant color block
929 			else
930 			{
931 				blk.origin_texel = vfloat4::zero();
932 				blk.data_min = vfloat4::zero();
933 				blk.data_mean = vfloat4::zero();
934 				blk.data_max = vfloat4::zero();
935 				blk.grayscale = true;
936 			}
937 
938 			int offset = ((z * yblocks + y) * xblocks + x) * 16;
939 			uint8_t *bp = buffer + offset;
940 			physical_compressed_block* pcb = reinterpret_cast<physical_compressed_block*>(bp);
941 #if QUALITY_CONTROL
942 			int32_t *mseBlock[RGBA_COM] = {nullptr, nullptr, nullptr, nullptr};
943 			if (calQualityEnable) {
944 				int offset = (z * yblocks + y) * xblocks + x;
945 				mseBlock[R_COM] = mse[R_COM] + offset;
946 				mseBlock[G_COM] = mse[G_COM] + offset;
947 				mseBlock[B_COM] = mse[B_COM] + offset;
948 				mseBlock[A_COM] = mse[A_COM] + offset;
949 			}
950 			compress_block(ctx, blk, *pcb, temp_buffers, calQualityEnable, mseBlock);
951 #else
952 			compress_block(ctx, blk, *pcb, temp_buffers);
953 #endif
954 		}
955 
956 		ctx.manage_compress.complete_task_assignment(count);
957 	}
958 }
959 
960 #endif
961 
962 /* See header for documentation. */
astcenc_compress_image(astcenc_context * ctx,astcenc_image * imagep,const astcenc_swizzle * swizzle,uint8_t * data_out,size_t data_len,bool calQualityEnable,int32_t * mse[RGBA_COM],unsigned int thread_index)963 astcenc_error astcenc_compress_image(
964 	astcenc_context* ctx,
965 	astcenc_image* imagep,
966 	const astcenc_swizzle* swizzle,
967 	uint8_t* data_out,
968 	size_t data_len,
969 #if QUALITY_CONTROL
970 	bool calQualityEnable,
971 	int32_t *mse[RGBA_COM],
972 #endif
973 	unsigned int thread_index
974 ) {
975 #if defined(ASTCENC_DECOMPRESS_ONLY)
976 	(void)ctx;
977 	(void)imagep;
978 	(void)swizzle;
979 	(void)data_out;
980 	(void)data_len;
981 	(void)thread_index;
982 	return ASTCENC_ERR_BAD_CONTEXT;
983 #else
984 	astcenc_error status;
985 	astcenc_image& image = *imagep;
986 
987 	if (ctx->config.flags & ASTCENC_FLG_DECOMPRESS_ONLY)
988 	{
989 		return ASTCENC_ERR_BAD_CONTEXT;
990 	}
991 
992 	status = validate_compression_swizzle(*swizzle);
993 	if (status != ASTCENC_SUCCESS)
994 	{
995 		return status;
996 	}
997 
998 	if (thread_index >= ctx->thread_count)
999 	{
1000 		return ASTCENC_ERR_BAD_PARAM;
1001 	}
1002 
1003 	unsigned int block_x = ctx->config.block_x;
1004 	unsigned int block_y = ctx->config.block_y;
1005 	unsigned int block_z = ctx->config.block_z;
1006 
1007 	unsigned int xblocks = (image.dim_x + block_x - 1) / block_x;
1008 	unsigned int yblocks = (image.dim_y + block_y - 1) / block_y;
1009 	unsigned int zblocks = (image.dim_z + block_z - 1) / block_z;
1010 
1011 	// Check we have enough output space (16 bytes per block)
1012 	size_t size_needed = xblocks * yblocks * zblocks * 16;
1013 	if (data_len < size_needed)
1014 	{
1015 		return ASTCENC_ERR_OUT_OF_MEM;
1016 	}
1017 
1018 	// If context thread count is one then implicitly reset
1019 	if (ctx->thread_count == 1)
1020 	{
1021 		astcenc_compress_reset(ctx);
1022 	}
1023 
1024 	if (ctx->config.a_scale_radius != 0)
1025 	{
1026 		// First thread to enter will do setup, other threads will subsequently
1027 		// enter the critical section but simply skip over the initialization
1028 		auto init_avg = [ctx, &image, swizzle]() {
1029 			// Perform memory allocations for the destination buffers
1030 			size_t texel_count = image.dim_x * image.dim_y * image.dim_z;
1031 			ctx->input_alpha_averages = new float[texel_count];
1032 
1033 			return init_compute_averages(
1034 				image, ctx->config.a_scale_radius, *swizzle,
1035 				ctx->avg_preprocess_args);
1036 		};
1037 
1038 		// Only the first thread actually runs the initializer
1039 		ctx->manage_avg.init(init_avg);
1040 
1041 		// All threads will enter this function and dynamically grab work
1042 		compute_averages(*ctx, ctx->avg_preprocess_args);
1043 	}
1044 
1045 	// Wait for compute_averages to complete before compressing
1046 	ctx->manage_avg.wait();
1047 #if QUALITY_CONTROL
1048 	compress_image(*ctx, thread_index, image, *swizzle, data_out, calQualityEnable, mse);
1049 #else
1050 	compress_image(*ctx, thread_index, image, *swizzle, data_out);
1051 #endif
1052 	// Wait for compress to complete before freeing memory
1053 	ctx->manage_compress.wait();
1054 
1055 	auto term_compress = [ctx]() {
1056 		delete[] ctx->input_alpha_averages;
1057 		ctx->input_alpha_averages = nullptr;
1058 	};
1059 
1060 	// Only the first thread to arrive actually runs the term
1061 	ctx->manage_compress.term(term_compress);
1062 
1063 	return ASTCENC_SUCCESS;
1064 #endif
1065 }
1066 
1067 /* See header for documentation. */
astcenc_compress_reset(astcenc_context * ctx)1068 astcenc_error astcenc_compress_reset(
1069 	astcenc_context* ctx
1070 ) {
1071 #if defined(ASTCENC_DECOMPRESS_ONLY)
1072 	(void)ctx;
1073 	return ASTCENC_ERR_BAD_CONTEXT;
1074 #else
1075 	if (ctx->config.flags & ASTCENC_FLG_DECOMPRESS_ONLY)
1076 	{
1077 		return ASTCENC_ERR_BAD_CONTEXT;
1078 	}
1079 
1080 	ctx->manage_avg.reset();
1081 	ctx->manage_compress.reset();
1082 	return ASTCENC_SUCCESS;
1083 #endif
1084 }
1085 
1086 /* See header for documentation. */
astcenc_decompress_image(astcenc_context * ctx,const uint8_t * data,size_t data_len,astcenc_image * image_outp,const astcenc_swizzle * swizzle,unsigned int thread_index)1087 astcenc_error astcenc_decompress_image(
1088 	astcenc_context* ctx,
1089 	const uint8_t* data,
1090 	size_t data_len,
1091 	astcenc_image* image_outp,
1092 	const astcenc_swizzle* swizzle,
1093 	unsigned int thread_index
1094 ) {
1095 	astcenc_error status;
1096 	astcenc_image& image_out = *image_outp;
1097 
1098 	// Today this doesn't matter (working set on stack) but might in future ...
1099 	if (thread_index >= ctx->thread_count)
1100 	{
1101 		return ASTCENC_ERR_BAD_PARAM;
1102 	}
1103 
1104 	status = validate_decompression_swizzle(*swizzle);
1105 	if (status != ASTCENC_SUCCESS)
1106 	{
1107 		return status;
1108 	}
1109 
1110 	unsigned int block_x = ctx->config.block_x;
1111 	unsigned int block_y = ctx->config.block_y;
1112 	unsigned int block_z = ctx->config.block_z;
1113 
1114 	unsigned int xblocks = (image_out.dim_x + block_x - 1) / block_x;
1115 	unsigned int yblocks = (image_out.dim_y + block_y - 1) / block_y;
1116 	unsigned int zblocks = (image_out.dim_z + block_z - 1) / block_z;
1117 
1118 	int row_blocks = xblocks;
1119 	int plane_blocks = xblocks * yblocks;
1120 
1121 	// Check we have enough output space (16 bytes per block)
1122 	size_t size_needed = xblocks * yblocks * zblocks * 16;
1123 	if (data_len < size_needed)
1124 	{
1125 		return ASTCENC_ERR_OUT_OF_MEM;
1126 	}
1127 
1128 	image_block blk;
1129 	blk.texel_count = block_x * block_y * block_z;
1130 
1131 	// If context thread count is one then implicitly reset
1132 	if (ctx->thread_count == 1)
1133 	{
1134 		astcenc_decompress_reset(ctx);
1135 	}
1136 
1137 	// Only the first thread actually runs the initializer
1138 	ctx->manage_decompress.init(zblocks * yblocks * xblocks);
1139 
1140 	// All threads run this processing loop until there is no work remaining
1141 	while (true)
1142 	{
1143 		unsigned int count;
1144 		unsigned int base = ctx->manage_decompress.get_task_assignment(128, count);
1145 		if (!count)
1146 		{
1147 			break;
1148 		}
1149 
1150 		for (unsigned int i = base; i < base + count; i++)
1151 		{
1152 			// Decode i into x, y, z block indices
1153 			int z = i / plane_blocks;
1154 			unsigned int rem = i - (z * plane_blocks);
1155 			int y = rem / row_blocks;
1156 			int x = rem - (y * row_blocks);
1157 
1158 			unsigned int offset = (((z * yblocks + y) * xblocks) + x) * 16;
1159 			const uint8_t* bp = data + offset;
1160 
1161 			const physical_compressed_block& pcb = *reinterpret_cast<const physical_compressed_block*>(bp);
1162 			symbolic_compressed_block scb;
1163 
1164 			physical_to_symbolic(*ctx->bsd, pcb, scb);
1165 
1166 			decompress_symbolic_block(ctx->config.profile, *ctx->bsd,
1167 			                          x * block_x, y * block_y, z * block_z,
1168 			                          scb, blk);
1169 
1170 			write_image_block(image_out, blk, *ctx->bsd,
1171 			                  x * block_x, y * block_y, z * block_z, *swizzle);
1172 		}
1173 
1174 		ctx->manage_decompress.complete_task_assignment(count);
1175 	}
1176 
1177 	return ASTCENC_SUCCESS;
1178 }
1179 
1180 /* See header for documentation. */
astcenc_decompress_reset(astcenc_context * ctx)1181 astcenc_error astcenc_decompress_reset(
1182 	astcenc_context* ctx
1183 ) {
1184 	ctx->manage_decompress.reset();
1185 	return ASTCENC_SUCCESS;
1186 }
1187 
1188 /* See header for documentation. */
astcenc_get_block_info(astcenc_context * ctx,const uint8_t data[16],astcenc_block_info * info)1189 astcenc_error astcenc_get_block_info(
1190 	astcenc_context* ctx,
1191 	const uint8_t data[16],
1192 	astcenc_block_info* info
1193 ) {
1194 #if defined(ASTCENC_DECOMPRESS_ONLY)
1195 	(void)ctx;
1196 	(void)data;
1197 	(void)info;
1198 	return ASTCENC_ERR_BAD_CONTEXT;
1199 #else
1200 	// Decode the compressed data into a symbolic form
1201 	const physical_compressed_block&pcb = *reinterpret_cast<const physical_compressed_block*>(data);
1202 	symbolic_compressed_block scb;
1203 	physical_to_symbolic(*ctx->bsd, pcb, scb);
1204 
1205 	// Fetch the appropriate partition and decimation tables
1206 	block_size_descriptor& bsd = *ctx->bsd;
1207 
1208 	// Start from a clean slate
1209 	memset(info, 0, sizeof(*info));
1210 
1211 	// Basic info we can always populate
1212 	info->profile = ctx->config.profile;
1213 
1214 	info->block_x = ctx->config.block_x;
1215 	info->block_y = ctx->config.block_y;
1216 	info->block_z = ctx->config.block_z;
1217 	info->texel_count = bsd.texel_count;
1218 
1219 	// Check for error blocks first
1220 	info->is_error_block = scb.block_type == SYM_BTYPE_ERROR;
1221 	if (info->is_error_block)
1222 	{
1223 		return ASTCENC_SUCCESS;
1224 	}
1225 
1226 	// Check for constant color blocks second
1227 	info->is_constant_block = scb.block_type == SYM_BTYPE_CONST_F16 ||
1228 	                          scb.block_type == SYM_BTYPE_CONST_U16;
1229 	if (info->is_constant_block)
1230 	{
1231 		return ASTCENC_SUCCESS;
1232 	}
1233 
1234 	// Otherwise handle a full block ; known to be valid after conditions above have been checked
1235 	int partition_count = scb.partition_count;
1236 	const auto& pi = bsd.get_partition_info(partition_count, scb.partition_index);
1237 
1238 	const block_mode& bm = bsd.get_block_mode(scb.block_mode);
1239 	const decimation_info& di = bsd.get_decimation_info(bm.decimation_mode);
1240 
1241 	info->weight_x = di.weight_x;
1242 	info->weight_y = di.weight_y;
1243 	info->weight_z = di.weight_z;
1244 
1245 	info->is_dual_plane_block = bm.is_dual_plane != 0;
1246 
1247 	info->partition_count = scb.partition_count;
1248 	info->partition_index = scb.partition_index;
1249 	info->dual_plane_component = scb.plane2_component;
1250 
1251 	info->color_level_count = get_quant_level(scb.get_color_quant_mode());
1252 	info->weight_level_count = get_quant_level(bm.get_weight_quant_mode());
1253 
1254 	// Unpack color endpoints for each active partition
1255 	for (unsigned int i = 0; i < scb.partition_count; i++)
1256 	{
1257 		bool rgb_hdr;
1258 		bool a_hdr;
1259 		vint4 endpnt[2];
1260 
1261 		unpack_color_endpoints(ctx->config.profile,
1262 		                       scb.color_formats[i],
1263 		                       scb.get_color_quant_mode(),
1264 		                       scb.color_values[i],
1265 		                       rgb_hdr, a_hdr,
1266 		                       endpnt[0], endpnt[1]);
1267 
1268 		// Store the color endpoint mode info
1269 		info->color_endpoint_modes[i] = scb.color_formats[i];
1270 		info->is_hdr_block = info->is_hdr_block || rgb_hdr || a_hdr;
1271 
1272 		// Store the unpacked and decoded color endpoint
1273 		vmask4 hdr_mask(rgb_hdr, rgb_hdr, rgb_hdr, a_hdr);
1274 		for (int j = 0; j < 2; j++)
1275 		{
1276 			vint4 color_lns = lns_to_sf16(endpnt[j]);
1277 			vint4 color_unorm = unorm16_to_sf16(endpnt[j]);
1278 			vint4 datai = select(color_unorm, color_lns, hdr_mask);
1279 			store(float16_to_float(datai), info->color_endpoints[i][j]);
1280 		}
1281 	}
1282 
1283 	// Unpack weights for each texel
1284 	int weight_plane1[BLOCK_MAX_TEXELS];
1285 	int weight_plane2[BLOCK_MAX_TEXELS];
1286 
1287 	unpack_weights(bsd, scb, di, bm.is_dual_plane, bm.get_weight_quant_mode(), weight_plane1, weight_plane2);
1288 	for (unsigned int i = 0; i < bsd.texel_count; i++)
1289 	{
1290 		info->weight_values_plane1[i] = static_cast<float>(weight_plane1[i]) * (1.0f / WEIGHTS_TEXEL_SUM);
1291 		if (info->is_dual_plane_block)
1292 		{
1293 			info->weight_values_plane2[i] = static_cast<float>(weight_plane2[i]) * (1.0f / WEIGHTS_TEXEL_SUM);
1294 		}
1295 	}
1296 
1297 	// Unpack partition assignments for each texel
1298 	for (unsigned int i = 0; i < bsd.texel_count; i++)
1299 	{
1300 		info->partition_assignment[i] = pi.partition_of_texel[i];
1301 	}
1302 
1303 	return ASTCENC_SUCCESS;
1304 #endif
1305 }
1306 
1307 /* See header for documentation. */
astcenc_get_error_string(astcenc_error status)1308 const char* astcenc_get_error_string(
1309 	astcenc_error status
1310 ) {
1311 	// Values in this enum are from an external user, so not guaranteed to be
1312 	// bounded to the enum values
1313 	switch (static_cast<int>(status))
1314 	{
1315 	case ASTCENC_SUCCESS:
1316 		return "ASTCENC_SUCCESS";
1317 	case ASTCENC_ERR_OUT_OF_MEM:
1318 		return "ASTCENC_ERR_OUT_OF_MEM";
1319 	case ASTCENC_ERR_BAD_CPU_FLOAT:
1320 		return "ASTCENC_ERR_BAD_CPU_FLOAT";
1321 	case ASTCENC_ERR_BAD_CPU_ISA:
1322 		return "ASTCENC_ERR_BAD_CPU_ISA";
1323 	case ASTCENC_ERR_BAD_PARAM:
1324 		return "ASTCENC_ERR_BAD_PARAM";
1325 	case ASTCENC_ERR_BAD_BLOCK_SIZE:
1326 		return "ASTCENC_ERR_BAD_BLOCK_SIZE";
1327 	case ASTCENC_ERR_BAD_PROFILE:
1328 		return "ASTCENC_ERR_BAD_PROFILE";
1329 	case ASTCENC_ERR_BAD_QUALITY:
1330 		return "ASTCENC_ERR_BAD_QUALITY";
1331 	case ASTCENC_ERR_BAD_FLAGS:
1332 		return "ASTCENC_ERR_BAD_FLAGS";
1333 	case ASTCENC_ERR_BAD_SWIZZLE:
1334 		return "ASTCENC_ERR_BAD_SWIZZLE";
1335 	case ASTCENC_ERR_BAD_CONTEXT:
1336 		return "ASTCENC_ERR_BAD_CONTEXT";
1337 	case ASTCENC_ERR_NOT_IMPLEMENTED:
1338 		return "ASTCENC_ERR_NOT_IMPLEMENTED";
1339 #if defined(ASTCENC_DIAGNOSTICS)
1340 	case ASTCENC_ERR_DTRACE_FAILURE:
1341 		return "ASTCENC_ERR_DTRACE_FAILURE";
1342 #endif
1343 	default:
1344 		return nullptr;
1345 	}
1346 }
1347