• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // SPDX-License-Identifier: Apache-2.0
2 // ----------------------------------------------------------------------------
3 // Copyright 2011-2024 Arm Limited
4 //
5 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
6 // use this file except in compliance with the License. You may obtain a copy
7 // of the License at:
8 //
9 //     http://www.apache.org/licenses/LICENSE-2.0
10 //
11 // Unless required by applicable law or agreed to in writing, software
12 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
13 // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
14 // License for the specific language governing permissions and limitations
15 // under the License.
16 // ----------------------------------------------------------------------------
17 
18 /**
19  * @brief Functions for the library entrypoint.
20  */
21 
22 #include <array>
23 #include <cstring>
24 #include <new>
25 
26 #include "astcenc.h"
27 #include "astcenc_internal_entry.h"
28 #include "astcenc_diagnostic_trace.h"
29 
// RGBA data storage format description (memory layout / bit allocation).
// NOTE(review): the 10/20/30 shift values look like the bit offsets of the G, B and A fields in a
// packed 32-bit texel with 10-bit R/G/B fields - confirm against the code using these constants.
constexpr uint8_t COMPONENT_NUM = 4;
constexpr uint8_t COMP_G_SHIFT_POSITION = 10;
constexpr uint8_t COMP_B_SHIFT_POSITION = 20;
constexpr uint8_t COMP_A_SHIFT_POSITION = 30;
35 
/**
 * @brief Record of the quality tuning parameter values.
 *
 * See the @c astcenc_config structure for detailed parameter documentation.
 *
 * Note that the mse_overshoot entries are scaling factors relative to the base MSE to hit db_limit.
 * A 20% overshoot is harder to hit for a higher base db_limit, so we may actually use lower ratios
 * for the more thorough search presets because the underlying db_limit is so much higher.
 *
 * The preset tables in this file use positional aggregate initialization, so the member order
 * here must stay in step with the column order of those tables.
 */
struct astcenc_preset_config
{
	/** @brief The quality level of this preset; used as the search and interpolation key. */
	float quality;

	// The members below supply the value of the same-named astcenc_config field
	unsigned int tune_partition_count_limit;
	unsigned int tune_2partition_index_limit;
	unsigned int tune_3partition_index_limit;
	unsigned int tune_4partition_index_limit;
	unsigned int tune_block_mode_limit;
	unsigned int tune_refinement_limit;
	unsigned int tune_candidate_limit;
	unsigned int tune_2partitioning_candidate_limit;
	unsigned int tune_3partitioning_candidate_limit;
	unsigned int tune_4partitioning_candidate_limit;

	// The two dB limit bases are each reduced by a multiple of log10(texels per block), and the
	// larger result becomes astcenc_config::tune_db_limit
	float tune_db_limit_a_base;
	float tune_db_limit_b_base;

	// The members below supply the value of the same-named astcenc_config field
	float tune_mse_overshoot;
	float tune_2partition_early_out_limit_factor;
	float tune_3partition_early_out_limit_factor;
	float tune_2plane_early_out_limit_correlation;
	float tune_search_mode0_enable;
};
66 
/**
 * @brief The static presets for high bandwidth encodings (x < 25 texels per block).
 *
 * Each entry is the preset quality level followed by the remaining @c astcenc_preset_config
 * members in declaration order. Entries are sorted by ascending quality level.
 */
static const std::array<astcenc_preset_config, 6> preset_configs_high {{
	{
		ASTCENC_PRE_FASTEST,
		2, 10, 6, 4, 43, 2, 2, 2, 2, 2, 85.2f, 63.2f, 3.5f, 1.0f, 1.0f, 0.85f, 0.0f
	}, {
		ASTCENC_PRE_FAST,
		3, 18, 10, 8, 55, 3, 3, 2, 2, 2, 85.2f, 63.2f, 3.5f, 1.0f, 1.0f, 0.90f, 0.0f
	}, {
		ASTCENC_PRE_MEDIUM,
		4, 34, 28, 16, 77, 3, 3, 2, 2, 2, 95.0f, 70.0f, 2.5f, 1.1f, 1.05f, 0.95f, 0.0f
	}, {
		ASTCENC_PRE_THOROUGH,
		4, 82, 60, 30, 94, 4, 4, 3, 2, 2, 105.0f, 77.0f, 10.0f, 1.35f, 1.15f, 0.97f, 0.0f
	}, {
		ASTCENC_PRE_VERYTHOROUGH,
		4, 256, 128, 64, 98, 4, 6, 8, 6, 4, 200.0f, 200.0f, 10.0f, 1.6f, 1.4f, 0.98f, 0.0f
	}, {
		ASTCENC_PRE_EXHAUSTIVE,
		4, 512, 512, 512, 100, 4, 8, 8, 8, 8, 200.0f, 200.0f, 10.0f, 2.0f, 2.0f, 0.99f, 0.0f
	}
}};
91 
/**
 * @brief The static presets for medium bandwidth encodings (25 <= x < 64 texels per block).
 *
 * Each entry is the preset quality level followed by the remaining @c astcenc_preset_config
 * members in declaration order. Entries are sorted by ascending quality level.
 */
static const std::array<astcenc_preset_config, 6> preset_configs_mid {{
	{
		ASTCENC_PRE_FASTEST,
		2, 10, 6, 4, 43, 2, 2, 2, 2, 2, 85.2f, 63.2f, 3.5f, 1.0f, 1.0f, 0.80f, 1.0f
	}, {
		ASTCENC_PRE_FAST,
		3, 18, 12, 10, 55, 3, 3, 2, 2, 2, 85.2f, 63.2f, 3.5f, 1.0f, 1.0f, 0.85f, 1.0f
	}, {
		ASTCENC_PRE_MEDIUM,
		3, 34, 28, 16, 77, 3, 3, 2, 2, 2, 95.0f, 70.0f, 3.0f, 1.1f, 1.05f, 0.90f, 1.0f
	}, {
		ASTCENC_PRE_THOROUGH,
		4, 82, 60, 30, 94, 4, 4, 3, 2, 2, 105.0f, 77.0f, 10.0f, 1.4f, 1.2f, 0.95f, 0.0f
	}, {
		ASTCENC_PRE_VERYTHOROUGH,
		4, 256, 128, 64, 98, 4, 6, 8, 6, 3, 200.0f, 200.0f, 10.0f, 1.6f, 1.4f, 0.98f, 0.0f
	}, {
		ASTCENC_PRE_EXHAUSTIVE,
		4, 256, 256, 256, 100, 4, 8, 8, 8, 8, 200.0f, 200.0f, 10.0f, 2.0f, 2.0f, 0.99f, 0.0f
	}
}};
116 
/**
 * @brief The static presets for low bandwidth encodings (64 <= x texels per block).
 *
 * Each entry is the preset quality level followed by the remaining @c astcenc_preset_config
 * members in declaration order. Entries are sorted by ascending quality level.
 */
static const std::array<astcenc_preset_config, 6> preset_configs_low {{
	{
		ASTCENC_PRE_FASTEST,
		2, 10, 6, 4, 40, 2, 2, 2, 2, 2, 85.0f, 63.0f, 3.5f, 1.0f, 1.0f, 0.80f, 1.0f
	}, {
		ASTCENC_PRE_FAST,
		2, 18, 12, 10, 55, 3, 3, 2, 2, 2, 85.0f, 63.0f, 3.5f, 1.0f, 1.0f, 0.85f, 1.0f
	}, {
		ASTCENC_PRE_MEDIUM,
		3, 34, 28, 16, 77, 3, 3, 2, 2, 2, 95.0f, 70.0f, 3.5f, 1.1f, 1.05f, 0.90f, 1.0f
	}, {
		ASTCENC_PRE_THOROUGH,
		4, 82, 60, 30, 93, 4, 4, 3, 2, 2, 105.0f, 77.0f, 10.0f, 1.3f, 1.2f, 0.97f, 1.0f
	}, {
		ASTCENC_PRE_VERYTHOROUGH,
		4, 256, 128, 64, 98, 4, 6, 8, 5, 2, 200.0f, 200.0f, 10.0f, 1.6f, 1.4f, 0.98f, 1.0f
	}, {
		ASTCENC_PRE_EXHAUSTIVE,
		4, 256, 256, 256, 100, 4, 8, 8, 8, 8, 200.0f, 200.0f, 10.0f, 2.0f, 2.0f, 0.99f, 1.0f
	}
}};
141 
142 /**
143  * @brief Validate CPU floating point meets assumptions made in the codec.
144  *
145  * The codec is written with the assumption that a float threaded through the @c if32 union will be
146  * stored and reloaded as a 32-bit IEEE-754 float with round-to-nearest rounding. This is always the
147  * case in an IEEE-754 compliant system, however not every system or compilation mode is actually
148  * IEEE-754 compliant. This normally fails if the code is compiled with fast math enabled.
149  *
150  * @return Return @c ASTCENC_SUCCESS if validated, otherwise an error on failure.
151  */
validate_cpu_float()152 static astcenc_error validate_cpu_float()
153 {
154 	if32 p;
155 	volatile float xprec_testval = 2.51f;
156 	p.f = xprec_testval + 12582912.0f;
157 	float q = p.f - 12582912.0f;
158 
159 	if (q != 3.0f)
160 	{
161 		return ASTCENC_ERR_BAD_CPU_FLOAT;
162 	}
163 
164 	return ASTCENC_SUCCESS;
165 }
166 
167 /**
168  * @brief Validate config profile.
169  *
170  * @param profile   The profile to check.
171  *
172  * @return Return @c ASTCENC_SUCCESS if validated, otherwise an error on failure.
173  */
validate_profile(astcenc_profile profile)174 static astcenc_error validate_profile(
175 	astcenc_profile profile
176 ) {
177 	// Values in this enum are from an external user, so not guaranteed to be
178 	// bounded to the enum values
179 	switch (static_cast<int>(profile))
180 	{
181 	case ASTCENC_PRF_LDR_SRGB:
182 	case ASTCENC_PRF_LDR:
183 	case ASTCENC_PRF_HDR_RGB_LDR_A:
184 	case ASTCENC_PRF_HDR:
185 		return ASTCENC_SUCCESS;
186 	default:
187 		return ASTCENC_ERR_BAD_PROFILE;
188 	}
189 }
190 
191 /**
192  * @brief Validate block size.
193  *
194  * @param block_x   The block x dimensions.
195  * @param block_y   The block y dimensions.
196  * @param block_z   The block z dimensions.
197  *
198  * @return Return @c ASTCENC_SUCCESS if validated, otherwise an error on failure.
199  */
validate_block_size(unsigned int block_x,unsigned int block_y,unsigned int block_z)200 static astcenc_error validate_block_size(
201 	unsigned int block_x,
202 	unsigned int block_y,
203 	unsigned int block_z
204 ) {
205 	// Test if this is a legal block size at all
206 	bool is_legal = (((block_z <= 1) && is_legal_2d_block_size(block_x, block_y)) ||
207 	                 ((block_z >= 2) && is_legal_3d_block_size(block_x, block_y, block_z)));
208 	if (!is_legal)
209 	{
210 		return ASTCENC_ERR_BAD_BLOCK_SIZE;
211 	}
212 
213 	// Test if this build has sufficient capacity for this block size
214 	bool have_capacity = (block_x * block_y * block_z) <= BLOCK_MAX_TEXELS;
215 	if (!have_capacity)
216 	{
217 		return ASTCENC_ERR_NOT_IMPLEMENTED;
218 	}
219 
220 	return ASTCENC_SUCCESS;
221 }
222 
223 /**
224  * @brief Validate flags.
225  *
226  * @param profile   The profile to check.
227  * @param flags     The flags to check.
228  *
229  * @return Return @c ASTCENC_SUCCESS if validated, otherwise an error on failure.
230  */
validate_flags(astcenc_profile profile,unsigned int flags)231 static astcenc_error validate_flags(
232 	astcenc_profile profile,
233 	unsigned int flags
234 ) {
235 	// Flags field must not contain any unknown flag bits
236 	unsigned int exMask = ~ASTCENC_ALL_FLAGS;
237 	if (popcount(flags & exMask) != 0)
238 	{
239 		return ASTCENC_ERR_BAD_FLAGS;
240 	}
241 
242 	// Flags field must only contain at most a single map type
243 	exMask = ASTCENC_FLG_MAP_NORMAL
244 	       | ASTCENC_FLG_MAP_RGBM;
245 	if (popcount(flags & exMask) > 1)
246 	{
247 		return ASTCENC_ERR_BAD_FLAGS;
248 	}
249 
250 	// Decode_unorm8 must only be used with an LDR profile
251 	bool is_unorm8 = flags & ASTCENC_FLG_USE_DECODE_UNORM8;
252 	bool is_hdr = (profile == ASTCENC_PRF_HDR) || (profile == ASTCENC_PRF_HDR_RGB_LDR_A);
253 	if (is_unorm8 && is_hdr)
254 	{
255 		return ASTCENC_ERR_BAD_DECODE_MODE;
256 	}
257 
258 	return ASTCENC_SUCCESS;
259 }
260 
261 #if !defined(ASTCENC_DECOMPRESS_ONLY)
262 
263 /**
264  * @brief Validate single channel compression swizzle.
265  *
266  * @param swizzle   The swizzle to check.
267  *
268  * @return Return @c ASTCENC_SUCCESS if validated, otherwise an error on failure.
269  */
validate_compression_swz(astcenc_swz swizzle)270 static astcenc_error validate_compression_swz(
271 	astcenc_swz swizzle
272 ) {
273 	// Not all enum values are handled; SWZ_Z is invalid for compression
274 	switch (static_cast<int>(swizzle))
275 	{
276 	case ASTCENC_SWZ_R:
277 	case ASTCENC_SWZ_G:
278 	case ASTCENC_SWZ_B:
279 	case ASTCENC_SWZ_A:
280 	case ASTCENC_SWZ_0:
281 	case ASTCENC_SWZ_1:
282 		return ASTCENC_SUCCESS;
283 	default:
284 		return ASTCENC_ERR_BAD_SWIZZLE;
285 	}
286 }
287 
288 /**
289  * @brief Validate overall compression swizzle.
290  *
291  * @param swizzle   The swizzle to check.
292  *
293  * @return Return @c ASTCENC_SUCCESS if validated, otherwise an error on failure.
294  */
validate_compression_swizzle(const astcenc_swizzle & swizzle)295 static astcenc_error validate_compression_swizzle(
296 	const astcenc_swizzle& swizzle
297 ) {
298 	if (validate_compression_swz(swizzle.r) ||
299 	    validate_compression_swz(swizzle.g) ||
300 	    validate_compression_swz(swizzle.b) ||
301 	    validate_compression_swz(swizzle.a))
302 	{
303 		return ASTCENC_ERR_BAD_SWIZZLE;
304 	}
305 
306 	return ASTCENC_SUCCESS;
307 }
308 #endif
309 
310 /**
311  * @brief Validate single channel decompression swizzle.
312  *
313  * @param swizzle   The swizzle to check.
314  *
315  * @return Return @c ASTCENC_SUCCESS if validated, otherwise an error on failure.
316  */
validate_decompression_swz(astcenc_swz swizzle)317 static astcenc_error validate_decompression_swz(
318 	astcenc_swz swizzle
319 ) {
320 	// Values in this enum are from an external user, so not guaranteed to be
321 	// bounded to the enum values
322 	switch (static_cast<int>(swizzle))
323 	{
324 	case ASTCENC_SWZ_R:
325 	case ASTCENC_SWZ_G:
326 	case ASTCENC_SWZ_B:
327 	case ASTCENC_SWZ_A:
328 	case ASTCENC_SWZ_0:
329 	case ASTCENC_SWZ_1:
330 	case ASTCENC_SWZ_Z:
331 		return ASTCENC_SUCCESS;
332 	default:
333 		return ASTCENC_ERR_BAD_SWIZZLE;
334 	}
335 }
336 
337 /**
338  * @brief Validate overall decompression swizzle.
339  *
340  * @param swizzle   The swizzle to check.
341  *
342  * @return Return @c ASTCENC_SUCCESS if validated, otherwise an error on failure.
343  */
validate_decompression_swizzle(const astcenc_swizzle & swizzle)344 static astcenc_error validate_decompression_swizzle(
345 	const astcenc_swizzle& swizzle
346 ) {
347 	if (validate_decompression_swz(swizzle.r) ||
348 	    validate_decompression_swz(swizzle.g) ||
349 	    validate_decompression_swz(swizzle.b) ||
350 	    validate_decompression_swz(swizzle.a))
351 	{
352 		return ASTCENC_ERR_BAD_SWIZZLE;
353 	}
354 
355 	return ASTCENC_SUCCESS;
356 }
357 
358 /**
359  * Validate that an incoming configuration is in-spec.
360  *
361  * This function can respond in two ways:
362  *
363  *   * Numerical inputs that have valid ranges are clamped to those valid ranges. No error is thrown
364  *     for out-of-range inputs in this case.
365  *   * Numerical inputs and logic inputs are are logically invalid and which make no sense
366  *     algorithmically will return an error.
367  *
368  * @param[in,out] config   The input compressor configuration.
369  *
370  * @return Return @c ASTCENC_SUCCESS if validated, otherwise an error on failure.
371  */
validate_config(astcenc_config & config)372 static astcenc_error validate_config(
373 	astcenc_config &config
374 ) {
375 	astcenc_error status;
376 
377 	status = validate_profile(config.profile);
378 	if (status != ASTCENC_SUCCESS)
379 	{
380 		return status;
381 	}
382 
383 	status = validate_flags(config.profile, config.flags);
384 	if (status != ASTCENC_SUCCESS)
385 	{
386 		return status;
387 	}
388 
389 	status = validate_block_size(config.block_x, config.block_y, config.block_z);
390 	if (status != ASTCENC_SUCCESS)
391 	{
392 		return status;
393 	}
394 
395 #if defined(ASTCENC_DECOMPRESS_ONLY)
396 	// Decompress-only builds only support decompress-only contexts
397 	if (!(config.flags & ASTCENC_FLG_DECOMPRESS_ONLY))
398 	{
399 		return ASTCENC_ERR_BAD_PARAM;
400 	}
401 #endif
402 
403 	config.rgbm_m_scale = astc::max(config.rgbm_m_scale, 1.0f);
404 
405 	config.tune_partition_count_limit = astc::clamp(config.tune_partition_count_limit, 1u, 4u);
406 	config.tune_2partition_index_limit = astc::clamp(config.tune_2partition_index_limit, 1u, BLOCK_MAX_PARTITIONINGS);
407 	config.tune_3partition_index_limit = astc::clamp(config.tune_3partition_index_limit, 1u, BLOCK_MAX_PARTITIONINGS);
408 	config.tune_4partition_index_limit = astc::clamp(config.tune_4partition_index_limit, 1u, BLOCK_MAX_PARTITIONINGS);
409 	config.tune_block_mode_limit = astc::clamp(config.tune_block_mode_limit, 1u, 100u);
410 	config.tune_refinement_limit = astc::max(config.tune_refinement_limit, 1u);
411 	config.tune_candidate_limit = astc::clamp(config.tune_candidate_limit, 1u, TUNE_MAX_TRIAL_CANDIDATES);
412 	config.tune_2partitioning_candidate_limit = astc::clamp(config.tune_2partitioning_candidate_limit, 1u, TUNE_MAX_PARTITIONING_CANDIDATES);
413 	config.tune_3partitioning_candidate_limit = astc::clamp(config.tune_3partitioning_candidate_limit, 1u, TUNE_MAX_PARTITIONING_CANDIDATES);
414 	config.tune_4partitioning_candidate_limit = astc::clamp(config.tune_4partitioning_candidate_limit, 1u, TUNE_MAX_PARTITIONING_CANDIDATES);
415 	config.tune_db_limit = astc::max(config.tune_db_limit, 0.0f);
416 	config.tune_mse_overshoot = astc::max(config.tune_mse_overshoot, 1.0f);
417 	config.tune_2partition_early_out_limit_factor = astc::max(config.tune_2partition_early_out_limit_factor, 0.0f);
418 	config.tune_3partition_early_out_limit_factor = astc::max(config.tune_3partition_early_out_limit_factor, 0.0f);
419 	config.tune_2plane_early_out_limit_correlation = astc::max(config.tune_2plane_early_out_limit_correlation, 0.0f);
420 
421 	// Specifying a zero weight color component is not allowed; force to small value
422 	float max_weight = astc::max(astc::max(config.cw_r_weight, config.cw_g_weight),
423 	                             astc::max(config.cw_b_weight, config.cw_a_weight));
424 	if (max_weight > 0.0f)
425 	{
426 		max_weight /= 1000.0f;
427 		config.cw_r_weight = astc::max(config.cw_r_weight, max_weight);
428 		config.cw_g_weight = astc::max(config.cw_g_weight, max_weight);
429 		config.cw_b_weight = astc::max(config.cw_b_weight, max_weight);
430 		config.cw_a_weight = astc::max(config.cw_a_weight, max_weight);
431 	}
432 	// If all color components error weights are zero then return an error
433 	else
434 	{
435 		return ASTCENC_ERR_BAD_PARAM;
436 	}
437 
438 	return ASTCENC_SUCCESS;
439 }
440 
441 /* See header for documentation. */
astcenc_config_init(astcenc_profile profile,unsigned int block_x,unsigned int block_y,unsigned int block_z,float quality,unsigned int flags,astcenc_config * configp)442 astcenc_error astcenc_config_init(
443 	astcenc_profile profile,
444 	unsigned int block_x,
445 	unsigned int block_y,
446 	unsigned int block_z,
447 	float quality,
448 	unsigned int flags,
449 	astcenc_config* configp
450 ) {
451 	astcenc_error status;
452 
453 	status = validate_cpu_float();
454 	if (status != ASTCENC_SUCCESS)
455 	{
456 		return status;
457 	}
458 
459 	// Zero init all config fields; although most of will be over written
460 	astcenc_config& config = *configp;
461 	std::memset(&config, 0, sizeof(config));
462 
463 	// Process the block size
464 	block_z = astc::max(block_z, 1u); // For 2D blocks Z==0 is accepted, but convert to 1
465 	status = validate_block_size(block_x, block_y, block_z);
466 	if (status != ASTCENC_SUCCESS)
467 	{
468 		return status;
469 	}
470 
471 	config.block_x = block_x;
472 	config.block_y = block_y;
473 	config.block_z = block_z;
474 
475 	float texels = static_cast<float>(block_x * block_y * block_z);
476 	float ltexels = logf(texels) / logf(10.0f);
477 
478 	// Process the performance quality level or preset; note that this must be done before we
479 	// process any additional settings, such as color profile and flags, which may replace some of
480 	// these settings with more use case tuned values
481 	if (quality < ASTCENC_PRE_FASTEST ||
482 	    quality > ASTCENC_PRE_EXHAUSTIVE)
483 	{
484 		return ASTCENC_ERR_BAD_QUALITY;
485 	}
486 
487 	static const std::array<astcenc_preset_config, 6>* preset_configs;
488 	int texels_int = block_x * block_y * block_z;
489 	if (texels_int < 25)
490 	{
491 		preset_configs = &preset_configs_high;
492 	}
493 	else if (texels_int < 64)
494 	{
495 		preset_configs = &preset_configs_mid;
496 	}
497 	else
498 	{
499 		preset_configs = &preset_configs_low;
500 	}
501 
502 	// Determine which preset to use, or which pair to interpolate
503 	size_t start;
504 	size_t end;
505 	for (end = 0; end < preset_configs->size(); end++)
506 	{
507 		if ((*preset_configs)[end].quality >= quality)
508 		{
509 			break;
510 		}
511 	}
512 
513 	start = end == 0 ? 0 : end - 1;
514 
515 	// Start and end node are the same - so just transfer the values.
516 	if (start == end)
517 	{
518 		config.tune_partition_count_limit = (*preset_configs)[start].tune_partition_count_limit;
519 		config.tune_2partition_index_limit = (*preset_configs)[start].tune_2partition_index_limit;
520 		config.tune_3partition_index_limit = (*preset_configs)[start].tune_3partition_index_limit;
521 		config.tune_4partition_index_limit = (*preset_configs)[start].tune_4partition_index_limit;
522 		config.tune_block_mode_limit = (*preset_configs)[start].tune_block_mode_limit;
523 		config.tune_refinement_limit = (*preset_configs)[start].tune_refinement_limit;
524 		config.tune_candidate_limit = (*preset_configs)[start].tune_candidate_limit;
525 		config.tune_2partitioning_candidate_limit = (*preset_configs)[start].tune_2partitioning_candidate_limit;
526 		config.tune_3partitioning_candidate_limit = (*preset_configs)[start].tune_3partitioning_candidate_limit;
527 		config.tune_4partitioning_candidate_limit = (*preset_configs)[start].tune_4partitioning_candidate_limit;
528 		config.tune_db_limit = astc::max((*preset_configs)[start].tune_db_limit_a_base - 35 * ltexels,
529 		                                 (*preset_configs)[start].tune_db_limit_b_base - 19 * ltexels);
530 
531 		config.tune_mse_overshoot = (*preset_configs)[start].tune_mse_overshoot;
532 
533 		config.tune_2partition_early_out_limit_factor = (*preset_configs)[start].tune_2partition_early_out_limit_factor;
534 		config.tune_3partition_early_out_limit_factor = (*preset_configs)[start].tune_3partition_early_out_limit_factor;
535 		config.tune_2plane_early_out_limit_correlation = (*preset_configs)[start].tune_2plane_early_out_limit_correlation;
536 		config.tune_search_mode0_enable = (*preset_configs)[start].tune_search_mode0_enable;
537 	}
538 	// Start and end node are not the same - so interpolate between them
539 	else
540 	{
541 		auto& node_a = (*preset_configs)[start];
542 		auto& node_b = (*preset_configs)[end];
543 
544 		float wt_range = node_b.quality - node_a.quality;
545 		assert(wt_range > 0);
546 
547 		// Compute interpolation factors
548 		float wt_node_a = (node_b.quality - quality) / wt_range;
549 		float wt_node_b = (quality - node_a.quality) / wt_range;
550 
551 		#define LERP(param) ((node_a.param * wt_node_a) + (node_b.param * wt_node_b))
552 		#define LERPI(param) astc::flt2int_rtn(\
553 		                         (static_cast<float>(node_a.param) * wt_node_a) + \
554 		                         (static_cast<float>(node_b.param) * wt_node_b))
555 		#define LERPUI(param) static_cast<unsigned int>(LERPI(param))
556 
557 		config.tune_partition_count_limit = LERPI(tune_partition_count_limit);
558 		config.tune_2partition_index_limit = LERPI(tune_2partition_index_limit);
559 		config.tune_3partition_index_limit = LERPI(tune_3partition_index_limit);
560 		config.tune_4partition_index_limit = LERPI(tune_4partition_index_limit);
561 		config.tune_block_mode_limit = LERPI(tune_block_mode_limit);
562 		config.tune_refinement_limit = LERPI(tune_refinement_limit);
563 		config.tune_candidate_limit = LERPUI(tune_candidate_limit);
564 		config.tune_2partitioning_candidate_limit = LERPUI(tune_2partitioning_candidate_limit);
565 		config.tune_3partitioning_candidate_limit = LERPUI(tune_3partitioning_candidate_limit);
566 		config.tune_4partitioning_candidate_limit = LERPUI(tune_4partitioning_candidate_limit);
567 		config.tune_db_limit = astc::max(LERP(tune_db_limit_a_base) - 35 * ltexels,
568 		                                 LERP(tune_db_limit_b_base) - 19 * ltexels);
569 
570 		config.tune_mse_overshoot = LERP(tune_mse_overshoot);
571 
572 		config.tune_2partition_early_out_limit_factor = LERP(tune_2partition_early_out_limit_factor);
573 		config.tune_3partition_early_out_limit_factor = LERP(tune_3partition_early_out_limit_factor);
574 		config.tune_2plane_early_out_limit_correlation = LERP(tune_2plane_early_out_limit_correlation);
575 		config.tune_search_mode0_enable = LERP(tune_search_mode0_enable);
576 		#undef LERP
577 		#undef LERPI
578 		#undef LERPUI
579 	}
580 
581 	// Set heuristics to the defaults for each color profile
582 	config.cw_r_weight = 1.0f;
583 	config.cw_g_weight = 1.0f;
584 	config.cw_b_weight = 1.0f;
585 	config.cw_a_weight = 1.0f;
586 
587 	config.a_scale_radius = 0;
588 
589 	config.rgbm_m_scale = 0.0f;
590 
591 	config.profile = profile;
592 
593 	// Values in this enum are from an external user, so not guaranteed to be
594 	// bounded to the enum values
595 	switch (static_cast<int>(profile))
596 	{
597 	case ASTCENC_PRF_LDR:
598 	case ASTCENC_PRF_LDR_SRGB:
599 		break;
600 	case ASTCENC_PRF_HDR_RGB_LDR_A:
601 	case ASTCENC_PRF_HDR:
602 		config.tune_db_limit = 999.0f;
603 		config.tune_search_mode0_enable = 0.0f;
604 		break;
605 	default:
606 		return ASTCENC_ERR_BAD_PROFILE;
607 	}
608 
609 	// Flags field must not contain any unknown flag bits
610 	status = validate_flags(profile, flags);
611 	if (status != ASTCENC_SUCCESS)
612 	{
613 		return status;
614 	}
615 
616 	if (flags & ASTCENC_FLG_MAP_NORMAL)
617 	{
618 		// Normal map encoding uses L+A blocks, so allow one more partitioning
619 		// than normal. We need need fewer bits for endpoints, so more likely
620 		// to be able to use more partitions than an RGB/RGBA block
621 		config.tune_partition_count_limit = astc::min(config.tune_partition_count_limit + 1u, 4u);
622 
623 		config.cw_g_weight = 0.0f;
624 		config.cw_b_weight = 0.0f;
625 		config.tune_2partition_early_out_limit_factor *= 1.5f;
626 		config.tune_3partition_early_out_limit_factor *= 1.5f;
627 		config.tune_2plane_early_out_limit_correlation = 0.99f;
628 
629 		// Normals are prone to blocking artifacts on smooth curves
630 		// so force compressor to try harder here ...
631 		config.tune_db_limit *= 1.03f;
632 	}
633 	else if (flags & ASTCENC_FLG_MAP_RGBM)
634 	{
635 		config.rgbm_m_scale = 5.0f;
636 		config.cw_a_weight = 2.0f * config.rgbm_m_scale;
637 	}
638 	else // (This is color data)
639 	{
640 		// This is a very basic perceptual metric for RGB color data, which weights error
641 		// significance by the perceptual luminance contribution of each color channel. For
642 		// luminance the usual weights to compute luminance from a linear RGB value are as
643 		// follows:
644 		//
645 		//     l = r * 0.3 + g * 0.59 + b * 0.11
646 		//
647 		// ... but we scale these up to keep a better balance between color and alpha. Note
648 		// that if the content is using alpha we'd recommend using the -a option to weight
649 		// the color contribution by the alpha transparency.
650 		if (flags & ASTCENC_FLG_USE_PERCEPTUAL)
651 		{
652 			config.cw_r_weight = 0.30f * 2.25f;
653 			config.cw_g_weight = 0.59f * 2.25f;
654 			config.cw_b_weight = 0.11f * 2.25f;
655 		}
656 	}
657 	config.flags = flags;
658 
659 	return ASTCENC_SUCCESS;
660 }
661 
662 /* See header for documentation. */
astcenc_context_alloc(const astcenc_config * configp,unsigned int thread_count,astcenc_context ** context)663 astcenc_error astcenc_context_alloc(
664 	const astcenc_config* configp,
665 	unsigned int thread_count,
666 	astcenc_context** context
667 ) {
668 	astcenc_error status;
669 	const astcenc_config& config = *configp;
670 
671 	status = validate_cpu_float();
672 	if (status != ASTCENC_SUCCESS)
673 	{
674 		return status;
675 	}
676 
677 	if (thread_count == 0)
678 	{
679 		return ASTCENC_ERR_BAD_PARAM;
680 	}
681 
682 #if defined(ASTCENC_DIAGNOSTICS)
683 	// Force single threaded compressor use in diagnostic mode.
684 	if (thread_count != 1)
685 	{
686 		return ASTCENC_ERR_BAD_PARAM;
687 	}
688 #endif
689 
690 #ifndef ASTC_CUSTOMIZED_ENABLE
691 	if (config.privateProfile == CUSTOMIZED_PROFILE)
692 	{
693 		return ASTCENC_ERR_BAD_PARAM;
694 	}
695 #endif
696 
697 	astcenc_context* ctxo = new astcenc_context;
698 	astcenc_contexti* ctx = &ctxo->context;
699 	ctx->thread_count = thread_count;
700 	ctx->config = config;
701 	ctx->working_buffers = nullptr;
702 
703 	// These are allocated per-compress, as they depend on image size
704 	ctx->input_alpha_averages = nullptr;
705 
706 	// Copy the config first and validate the copy (we may modify it)
707 	status = validate_config(ctx->config);
708 	if (status != ASTCENC_SUCCESS)
709 	{
710 		delete ctxo;
711 		return status;
712 	}
713 
714 	ctx->bsd = aligned_malloc<block_size_descriptor>(sizeof(block_size_descriptor), ASTCENC_VECALIGN);
715 	if (!ctx->bsd)
716 	{
717 		delete ctxo;
718 		return ASTCENC_ERR_OUT_OF_MEM;
719 	}
720 
721 	bool can_omit_modes = static_cast<bool>(config.flags & ASTCENC_FLG_SELF_DECOMPRESS_ONLY);
722 #ifdef ASTC_CUSTOMIZED_ENABLE
723 	if (!init_block_size_descriptor(ctx->config.privateProfile, config.block_x, config.block_y, config.block_z,
724 	                           can_omit_modes,
725 	                           config.tune_partition_count_limit,
726 	                           static_cast<float>(config.tune_block_mode_limit) / 100.0f,
727 	                           *ctx->bsd))
728 	{
729 		aligned_free<block_size_descriptor>(ctx->bsd);
730 		delete ctxo;
731 		*context = nullptr;
732 		return ASTCENC_ERR_DLOPEN_FAILED;
733 	}
734 #else
735 	init_block_size_descriptor(ctx->config.privateProfile, config.block_x, config.block_y, config.block_z,
736 	                           can_omit_modes,
737 	                           config.tune_partition_count_limit,
738 	                           static_cast<float>(config.tune_block_mode_limit) / 100.0f,
739 	                           *ctx->bsd);
740 #endif
741 
742 #if !defined(ASTCENC_DECOMPRESS_ONLY)
743 	// Do setup only needed by compression
744 	if (!(ctx->config.flags & ASTCENC_FLG_DECOMPRESS_ONLY))
745 	{
746 		// Turn a dB limit into a per-texel error for faster use later
747 		if ((ctx->config.profile == ASTCENC_PRF_LDR) || (ctx->config.profile == ASTCENC_PRF_LDR_SRGB))
748 		{
749 			ctx->config.tune_db_limit = astc::pow(0.1f, ctx->config.tune_db_limit * 0.1f) * 65535.0f * 65535.0f;
750 		}
751 		else
752 		{
753 			ctx->config.tune_db_limit = 0.0f;
754 		}
755 
756 		size_t worksize = sizeof(compression_working_buffers) * thread_count;
757 		ctx->working_buffers = aligned_malloc<compression_working_buffers>(worksize, ASTCENC_VECALIGN);
758 		static_assert((ASTCENC_VECALIGN == 0) || ((sizeof(compression_working_buffers) % ASTCENC_VECALIGN) == 0),
759 		              "compression_working_buffers size must be multiple of vector alignment");
760 		if (!ctx->working_buffers)
761 		{
762 			aligned_free<block_size_descriptor>(ctx->bsd);
763 			delete ctxo;
764 			*context = nullptr;
765 			return ASTCENC_ERR_OUT_OF_MEM;
766 		}
767 	}
768 #endif
769 
770 #if defined(ASTCENC_DIAGNOSTICS)
771 	ctx->trace_log = new TraceLog(ctx->config.trace_file_path);
772 	if (!ctx->trace_log->m_file)
773 	{
774 		return ASTCENC_ERR_DTRACE_FAILURE;
775 	}
776 
777 	trace_add_data("block_x", config.block_x);
778 	trace_add_data("block_y", config.block_y);
779 	trace_add_data("block_z", config.block_z);
780 #endif
781 
782 	*context = ctxo;
783 
784 #if !defined(ASTCENC_DECOMPRESS_ONLY)
785 	prepare_angular_tables();
786 #endif
787 
788 	return ASTCENC_SUCCESS;
789 }
790 
791 /* See header dor documentation. */
astcenc_context_free(astcenc_context * ctxo)792 void astcenc_context_free(
793 	astcenc_context* ctxo
794 ) {
795 	if (ctxo)
796 	{
797 		astcenc_contexti* ctx = &ctxo->context;
798 		if (ctx->working_buffers)
799 		{
800 			aligned_free<compression_working_buffers>(ctx->working_buffers);
801 		}
802 		else
803 		{
804 			printf("ctx->working_buffers is nullptr !!\n");
805 		}
806 		if (ctx->bsd)
807 		{
808 			aligned_free<block_size_descriptor>(ctx->bsd);
809 		}
810 		else
811 		{
812 			printf("ctx->bsd is nullptr !!\n");
813 		}
814 #if defined(ASTCENC_DIAGNOSTICS)
815 		delete ctx->trace_log;
816 #endif
817 		delete ctxo;
818 	}
819 }
820 
821 #if !defined(ASTCENC_DECOMPRESS_ONLY)
822 
823 /**
824  * @brief Compress an image, after any preflight has completed.
825  *
826  * @param[out] ctxo           The compressor context.
827  * @param      thread_index   The thread index.
828  * @param      image          The intput image.
829  * @param      swizzle        The input swizzle.
830  * @param[out] buffer         The output array for the compressed data.
831  */
compress_image(astcenc_context & ctxo,unsigned int thread_index,const astcenc_image & image,const astcenc_swizzle & swizzle,uint8_t * buffer,bool calQualityEnable,int32_t * mse[RGBA_COM])832 static void compress_image(
833 	astcenc_context& ctxo,
834 	unsigned int thread_index,
835 	const astcenc_image& image,
836 	const astcenc_swizzle& swizzle,
837 #if QUALITY_CONTROL
838 	uint8_t* buffer,
839 	bool calQualityEnable,
840 	int32_t *mse[RGBA_COM]
841 #else
842 	uint8_t* buffer
843 #endif
844 ) {
845 	astcenc_contexti& ctx = ctxo.context;
846 	const block_size_descriptor& bsd = *ctx.bsd;
847 	astcenc_profile decode_mode = ctx.config.profile;
848 
849 	image_block blk;
850 
851 	int block_x = bsd.xdim;
852 	int block_y = bsd.ydim;
853 	int block_z = bsd.zdim;
854 	blk.texel_count = static_cast<uint8_t>(block_x * block_y * block_z);
855 
856 	int dim_x = image.dim_x;
857 	int dim_y = image.dim_y;
858 	int dim_z = image.dim_z;
859 
860 	int xblocks = (dim_x + block_x - 1) / block_x;
861 	int yblocks = (dim_y + block_y - 1) / block_y;
862 	int zblocks = (dim_z + block_z - 1) / block_z;
863 	int block_count = zblocks * yblocks * xblocks;
864 
865 	int row_blocks = xblocks;
866 	int plane_blocks = xblocks * yblocks;
867 
868 	blk.decode_unorm8 = ctxo.context.config.flags & ASTCENC_FLG_USE_DECODE_UNORM8;
869 
870 	// Populate the block channel weights
871 	blk.channel_weight = vfloat4(ctx.config.cw_r_weight,
872 	                             ctx.config.cw_g_weight,
873 	                             ctx.config.cw_b_weight,
874 	                             ctx.config.cw_a_weight);
875 
876 	// Use preallocated scratch buffer
877 	auto& temp_buffers = ctx.working_buffers[thread_index];
878 
879 	// Only the first thread actually runs the initializer
880 	ctxo.manage_compress.init(block_count, ctx.config.progress_callback);
881 
882 	// Determine if we can use an optimized load function
883 	bool needs_swz = (swizzle.r != ASTCENC_SWZ_R) || (swizzle.g != ASTCENC_SWZ_G) ||
884 	                 (swizzle.b != ASTCENC_SWZ_B) || (swizzle.a != ASTCENC_SWZ_A);
885 
886 	bool needs_hdr = (decode_mode == ASTCENC_PRF_HDR) ||
887 	                 (decode_mode == ASTCENC_PRF_HDR_RGB_LDR_A);
888 
889 	bool use_fast_load = !needs_swz && !needs_hdr &&
890 	                     block_z == 1 && image.data_type == ASTCENC_TYPE_U8;
891 
892 	auto load_func = load_image_block;
893 	if (use_fast_load)
894 	{
895 		load_func = load_image_block_fast_ldr;
896 	}
897 
898 	// All threads run this processing loop until there is no work remaining
899 	while (true)
900 	{
901 		unsigned int count;
902 		unsigned int base = ctxo.manage_compress.get_task_assignment(16, count);
903 		if (!count)
904 		{
905 			break;
906 		}
907 
908 		for (unsigned int i = base; i < base + count; i++)
909 		{
910 			// Decode i into x, y, z block indices
911 			int z = i / plane_blocks;
912 			unsigned int rem = i - (z * plane_blocks);
913 			int y = rem / row_blocks;
914 			int x = rem - (y * row_blocks);
915 
916 			// Test if we can apply some basic alpha-scale RDO
917 			bool use_full_block = true;
918 			if (ctx.config.a_scale_radius != 0 && block_z == 1)
919 			{
920 				int start_x = x * block_x;
921 				int end_x = astc::min(dim_x, start_x + block_x);
922 
923 				int start_y = y * block_y;
924 				int end_y = astc::min(dim_y, start_y + block_y);
925 
926 				// SATs accumulate error, so don't test exactly zero. Test for
927 				// less than 1 alpha in the expanded block footprint that
928 				// includes the alpha radius.
929 				int x_footprint = block_x + 2 * (ctx.config.a_scale_radius - 1);
930 
931 				int y_footprint = block_y + 2 * (ctx.config.a_scale_radius - 1);
932 
933 				float footprint = static_cast<float>(x_footprint * y_footprint);
934 				float threshold = 0.9f / (255.0f * footprint);
935 
936 				// Do we have any alpha values?
937 				use_full_block = false;
938 				for (int ay = start_y; ay < end_y; ay++)
939 				{
940 					for (int ax = start_x; ax < end_x; ax++)
941 					{
942 						float a_avg = ctx.input_alpha_averages[ay * dim_x + ax];
943 						if (a_avg > threshold)
944 						{
945 							use_full_block = true;
946 							ax = end_x;
947 							ay = end_y;
948 						}
949 					}
950 				}
951 			}
952 
953 			// Fetch the full block for compression
954 			if (use_full_block)
955 			{
956 				load_func(decode_mode, image, blk, bsd, x * block_x, y * block_y, z * block_z, swizzle);
957 
958 				// Scale RGB error contribution by the maximum alpha in the block
959 				// This encourages preserving alpha accuracy in regions with high
960 				// transparency, and can buy up to 0.5 dB PSNR.
961 				if (ctx.config.flags & ASTCENC_FLG_USE_ALPHA_WEIGHT)
962 				{
963 					float alpha_scale = blk.data_max.lane<3>() * (1.0f / 65535.0f);
964 					blk.channel_weight = vfloat4(ctx.config.cw_r_weight * alpha_scale,
965 					                             ctx.config.cw_g_weight * alpha_scale,
966 					                             ctx.config.cw_b_weight * alpha_scale,
967 					                             ctx.config.cw_a_weight);
968 				}
969 			}
970 			// Apply alpha scale RDO - substitute constant color block
971 			else
972 			{
973 				blk.origin_texel = vfloat4::zero();
974 				blk.data_min = vfloat4::zero();
975 				blk.data_mean = vfloat4::zero();
976 				blk.data_max = vfloat4::zero();
977 				blk.grayscale = true;
978 			}
979 
980 			int offset = ((z * yblocks + y) * xblocks + x) * 16;
981 			uint8_t *bp = buffer + offset;
982 #if QUALITY_CONTROL
983 			int32_t *mseBlock[RGBA_COM] = {nullptr, nullptr, nullptr, nullptr};
984 			if (calQualityEnable) {
985 				offset = (z * yblocks + y) * xblocks + x;
986 				mseBlock[R_COM] = mse[R_COM] + offset;
987 				mseBlock[G_COM] = mse[G_COM] + offset;
988 				mseBlock[B_COM] = mse[B_COM] + offset;
989 				mseBlock[A_COM] = mse[A_COM] + offset;
990 			}
991 			compress_block(ctx, blk, bp, temp_buffers, calQualityEnable, mseBlock);
992 #else
993 			compress_block(ctx, blk, bp, temp_buffers);
994 #endif
995 		}
996 
997 		ctxo.manage_compress.complete_task_assignment(count);
998 	}
999 }
1000 
1001 /**
1002  * @brief Compute regional averages in an image.
1003  *
1004  * This function can be called by multiple threads, but only after a single
1005  * thread calls the setup function @c init_compute_averages().
1006  *
1007  * Results are written back into @c img->input_alpha_averages.
1008  *
1009  * @param[out] ctx   The context.
1010  * @param      ag    The average and variance arguments created during setup.
1011  */
compute_averages(astcenc_context & ctx,const avg_args & ag)1012 static void compute_averages(
1013 	astcenc_context& ctx,
1014 	const avg_args &ag
1015 ) {
1016 	pixel_region_args arg = ag.arg;
1017 	arg.work_memory = new vfloat4[ag.work_memory_size];
1018 
1019 	int size_x = ag.img_size_x;
1020 	int size_y = ag.img_size_y;
1021 	int size_z = ag.img_size_z;
1022 
1023 	int step_xy = ag.blk_size_xy;
1024 	int step_z = ag.blk_size_z;
1025 
1026 	int y_tasks = (size_y + step_xy - 1) / step_xy;
1027 
1028 	// All threads run this processing loop until there is no work remaining
1029 	while (true)
1030 	{
1031 		unsigned int count;
1032 		unsigned int base = ctx.manage_avg.get_task_assignment(16, count);
1033 		if (!count)
1034 		{
1035 			break;
1036 		}
1037 
1038 		for (unsigned int i = base; i < base + count; i++)
1039 		{
1040 			int z = (i / (y_tasks)) * step_z;
1041 			int y = (i - (z * y_tasks)) * step_xy;
1042 
1043 			arg.size_z = astc::min(step_z, size_z - z);
1044 			arg.offset_z = z;
1045 
1046 			arg.size_y = astc::min(step_xy, size_y - y);
1047 			arg.offset_y = y;
1048 
1049 			for (int x = 0; x < size_x; x += step_xy)
1050 			{
1051 				arg.size_x = astc::min(step_xy, size_x - x);
1052 				arg.offset_x = x;
1053 				compute_pixel_region_variance(ctx.context, arg);
1054 			}
1055 		}
1056 
1057 		ctx.manage_avg.complete_task_assignment(count);
1058 	}
1059 
1060 	delete[] arg.work_memory;
1061 }
1062 
1063 #endif
1064 
free_image_inside(astcenc_image * img)1065 static void free_image_inside(astcenc_image* img)
1066 {
1067 	if (img->data != nullptr)
1068 	{
1069 		for (unsigned int z = 0; z < img->dim_z; z++)
1070 		{
1071 			delete[] reinterpret_cast<char *>(img->data[z]);
1072 			img->data[z] = nullptr;
1073 		}
1074 	}
1075 	delete[] img->data;
1076 	img->data = nullptr;
1077 }
1078 
convert_rgba10_to_float16(astcenc_image * imgRGBA,astcenc_image * image)1079 static void convert_rgba10_to_float16(astcenc_image* imgRGBA, astcenc_image* image)
1080 {
1081 	uint32_t* src = static_cast<uint32_t *>(image->data[0]);
1082 	uint16_t* dst = static_cast<uint16_t *>(imgRGBA->data[0]);
1083 	for (unsigned int y = 0; y < image->dim_y; y++)
1084 	{
1085 		for (unsigned int x = 0; x < image->dim_x; x++)
1086 		{
1087 			uint32_t data_rgba = src[image->dim_stride * y + x];
1088 			uint16_t data_r = data_rgba & 0x3FF;
1089 			uint16_t data_g = (data_rgba >> COMP_G_SHIFT_POSITION) & 0x3FF;
1090 			uint16_t data_b = (data_rgba >> COMP_B_SHIFT_POSITION) & 0x3FF;
1091 			uint16_t data_a = (data_rgba >> COMP_A_SHIFT_POSITION) & 0x3;
1092 			vint4 colorf16 = float_to_float16(vfloat4(data_r / 1023.0, // 1023.0: 10bit to 0-1
1093 							data_g / 1023.0,
1094 							data_b / 1023.0,
1095 							data_a / 3.0)); // 3.0: 2bit to 0-1
1096 			dst[(COMPONENT_NUM * image->dim_x * y) + (COMPONENT_NUM * x)] =
1097 				static_cast<uint16_t>(colorf16.lane<0>()); // 0: R
1098 			dst[(COMPONENT_NUM * image->dim_x * y) + (COMPONENT_NUM * x + 1)] = // 1: G
1099 				static_cast<uint16_t>(colorf16.lane<1>()); // 1: G
1100 			dst[(COMPONENT_NUM * image->dim_x * y) + (COMPONENT_NUM * x + 2)] = // 2: B
1101 				static_cast<uint16_t>(colorf16.lane<2>()); // 2: B
1102 			dst[(COMPONENT_NUM * image->dim_x * y) + (COMPONENT_NUM * x + 3)] = // 3: A
1103 				static_cast<uint16_t>(colorf16.lane<3>()); // 3: A
1104 		}
1105 	}
1106 }
1107 
1108 /* See header for documentation. */
astcenc_compress_image(astcenc_context * ctxo,astcenc_image * imagep,const astcenc_swizzle * swizzle,uint8_t * data_out,size_t data_len,bool calQualityEnable,int32_t * mse[RGBA_COM],unsigned int thread_index)1109 astcenc_error astcenc_compress_image(
1110 	astcenc_context* ctxo,
1111 	astcenc_image* imagep,
1112 	const astcenc_swizzle* swizzle,
1113 	uint8_t* data_out,
1114 	size_t data_len,
1115 #if QUALITY_CONTROL
1116 	bool calQualityEnable,
1117 	int32_t *mse[RGBA_COM],
1118 #endif
1119 	unsigned int thread_index
1120 ) {
1121 #if defined(ASTCENC_DECOMPRESS_ONLY)
1122 	(void)ctxo;
1123 	(void)imagep;
1124 	(void)swizzle;
1125 	(void)data_out;
1126 	(void)data_len;
1127 	(void)thread_index;
1128 	return ASTCENC_ERR_BAD_CONTEXT;
1129 #else
1130 	astcenc_contexti* ctx = &ctxo->context;
1131 	astcenc_error status;
1132 	astcenc_image* image = imagep;
1133 
1134 	astcenc_image imgRGBA = {};
1135 	imgRGBA.data = nullptr;
1136 	if (image->data_type == ASTCENC_TYPE_RGBA1010102)
1137 	{
1138 		imgRGBA.dim_x = image->dim_x;
1139 		imgRGBA.dim_y = image->dim_y;
1140 		imgRGBA.dim_stride = imgRGBA.dim_x;
1141 		imgRGBA.dim_z = 1;
1142 		imgRGBA.data_type = ASTCENC_TYPE_F16;
1143 		imgRGBA.data = new(std::nothrow) void* [imgRGBA.dim_z];
1144 		if (imgRGBA.data == nullptr)
1145 		{
1146 			return ASTCENC_ERR_OUT_OF_MEM;
1147 		}
1148 		imgRGBA.data[0] = new(std::nothrow)
1149 			uint16_t[imgRGBA.dim_x * imgRGBA.dim_y * COMPONENT_NUM];
1150 		if (imgRGBA.data[0] == nullptr)
1151 		{
1152 			free_image_inside(&imgRGBA);
1153 			return ASTCENC_ERR_OUT_OF_MEM;
1154 		}
1155 		convert_rgba10_to_float16(&imgRGBA, imagep);
1156 		image = &imgRGBA;
1157 	}
1158 
1159 	if (ctx->config.flags & ASTCENC_FLG_DECOMPRESS_ONLY)
1160 	{
1161 		free_image_inside(&imgRGBA);
1162 		return ASTCENC_ERR_BAD_CONTEXT;
1163 	}
1164 
1165 	status = validate_compression_swizzle(*swizzle);
1166 	if (status != ASTCENC_SUCCESS)
1167 	{
1168 		free_image_inside(&imgRGBA);
1169 		return status;
1170 	}
1171 
1172 	if (thread_index >= ctx->thread_count)
1173 	{
1174 		free_image_inside(&imgRGBA);
1175 		return ASTCENC_ERR_BAD_PARAM;
1176 	}
1177 
1178 	unsigned int block_x = ctx->config.block_x;
1179 	unsigned int block_y = ctx->config.block_y;
1180 	unsigned int block_z = ctx->config.block_z;
1181 
1182 	unsigned int xblocks = (image->dim_x + block_x - 1) / block_x;
1183 	unsigned int yblocks = (image->dim_y + block_y - 1) / block_y;
1184 	unsigned int zblocks = (image->dim_z + block_z - 1) / block_z;
1185 
1186 	// Check we have enough output space (16 bytes per block)
1187 	size_t size_needed = xblocks * yblocks * zblocks * 16;
1188 	if (data_len < size_needed)
1189 	{
1190 		free_image_inside(&imgRGBA);
1191 		return ASTCENC_ERR_OUT_OF_MEM;
1192 	}
1193 
1194 	// If context thread count is one then implicitly reset
1195 	if (ctx->thread_count == 1)
1196 	{
1197 		astcenc_compress_reset(ctxo);
1198 	}
1199 
1200 	if (ctx->config.a_scale_radius != 0)
1201 	{
1202 		image = imagep;
1203 		free_image_inside(&imgRGBA);
1204 		// First thread to enter will do setup, other threads will subsequently
1205 		// enter the critical section but simply skip over the initialization
1206 		auto init_avg = [ctx, &image, swizzle]() {
1207 			// Perform memory allocations for the destination buffers
1208 			size_t texel_count = image->dim_x * image->dim_y * image->dim_z;
1209 			ctx->input_alpha_averages = new float[texel_count];
1210 
1211 			return init_compute_averages(
1212 				*image, ctx->config.a_scale_radius, *swizzle,
1213 				ctx->avg_preprocess_args);
1214 		};
1215 
1216 		// Only the first thread actually runs the initializer
1217 		ctxo->manage_avg.init(init_avg);
1218 
1219 		// All threads will enter this function and dynamically grab work
1220 		compute_averages(*ctxo, ctx->avg_preprocess_args);
1221 	}
1222 
1223 	// Wait for compute_averages to complete before compressing
1224 	ctxo->manage_avg.wait();
1225 #if QUALITY_CONTROL
1226 	compress_image(*ctxo, thread_index, *image, *swizzle, data_out, calQualityEnable, mse);
1227 #else
1228 	compress_image(*ctxo, thread_index, *image, *swizzle, data_out);
1229 #endif
1230 	// Wait for compress to complete before freeing memory
1231 	ctxo->manage_compress.wait();
1232 
1233 	auto term_compress = [ctx]() {
1234 		delete[] ctx->input_alpha_averages;
1235 		ctx->input_alpha_averages = nullptr;
1236 	};
1237 
1238 	// Only the first thread to arrive actually runs the term
1239 	ctxo->manage_compress.term(term_compress);
1240 	free_image_inside(&imgRGBA);
1241 	return ASTCENC_SUCCESS;
1242 #endif
1243 }
1244 
1245 /* See header for documentation. */
astcenc_compress_reset(astcenc_context * ctxo)1246 astcenc_error astcenc_compress_reset(
1247 	astcenc_context* ctxo
1248 ) {
1249 #if defined(ASTCENC_DECOMPRESS_ONLY)
1250 	(void)ctxo;
1251 	return ASTCENC_ERR_BAD_CONTEXT;
1252 #else
1253 	astcenc_contexti* ctx = &ctxo->context;
1254 	if (ctx->config.flags & ASTCENC_FLG_DECOMPRESS_ONLY)
1255 	{
1256 		return ASTCENC_ERR_BAD_CONTEXT;
1257 	}
1258 
1259 	ctxo->manage_avg.reset();
1260 	ctxo->manage_compress.reset();
1261 	return ASTCENC_SUCCESS;
1262 #endif
1263 }
1264 
1265 /* See header for documentation. */
astcenc_decompress_image(astcenc_context * ctxo,const uint8_t * data,size_t data_len,astcenc_image * image_outp,const astcenc_swizzle * swizzle,unsigned int thread_index)1266 astcenc_error astcenc_decompress_image(
1267 	astcenc_context* ctxo,
1268 	const uint8_t* data,
1269 	size_t data_len,
1270 	astcenc_image* image_outp,
1271 	const astcenc_swizzle* swizzle,
1272 	unsigned int thread_index
1273 ) {
1274 	astcenc_error status;
1275 	astcenc_image& image_out = *image_outp;
1276 	astcenc_contexti* ctx = &ctxo->context;
1277 
1278 	// Today this doesn't matter (working set on stack) but might in future ...
1279 	if (thread_index >= ctx->thread_count)
1280 	{
1281 		return ASTCENC_ERR_BAD_PARAM;
1282 	}
1283 
1284 	status = validate_decompression_swizzle(*swizzle);
1285 	if (status != ASTCENC_SUCCESS)
1286 	{
1287 		return status;
1288 	}
1289 
1290 	unsigned int block_x = ctx->config.block_x;
1291 	unsigned int block_y = ctx->config.block_y;
1292 	unsigned int block_z = ctx->config.block_z;
1293 
1294 	unsigned int xblocks = (image_out.dim_x + block_x - 1) / block_x;
1295 	unsigned int yblocks = (image_out.dim_y + block_y - 1) / block_y;
1296 	unsigned int zblocks = (image_out.dim_z + block_z - 1) / block_z;
1297 	unsigned int block_count = zblocks * yblocks * xblocks;
1298 
1299 	int row_blocks = xblocks;
1300 	int plane_blocks = xblocks * yblocks;
1301 
1302 	// Check we have enough output space (16 bytes per block)
1303 	size_t size_needed = xblocks * yblocks * zblocks * 16;
1304 	if (data_len < size_needed)
1305 	{
1306 		return ASTCENC_ERR_OUT_OF_MEM;
1307 	}
1308 
1309 	image_block blk;
1310 	blk.texel_count = static_cast<uint8_t>(block_x * block_y * block_z);
1311 
1312 	// Decode mode inferred from the output data type
1313 	blk.decode_unorm8 = image_out.data_type == ASTCENC_TYPE_U8;
1314 
1315 	// If context thread count is one then implicitly reset
1316 	if (ctx->thread_count == 1)
1317 	{
1318 		astcenc_decompress_reset(ctxo);
1319 	}
1320 
1321 	// Only the first thread actually runs the initializer
1322 	ctxo->manage_decompress.init(block_count, nullptr);
1323 
1324 	// All threads run this processing loop until there is no work remaining
1325 	while (true)
1326 	{
1327 		unsigned int count;
1328 		unsigned int base = ctxo->manage_decompress.get_task_assignment(128, count);
1329 		if (!count)
1330 		{
1331 			break;
1332 		}
1333 
1334 		for (unsigned int i = base; i < base + count; i++)
1335 		{
1336 			// Decode i into x, y, z block indices
1337 			int z = i / plane_blocks;
1338 			unsigned int rem = i - (z * plane_blocks);
1339 			int y = rem / row_blocks;
1340 			int x = rem - (y * row_blocks);
1341 
1342 			unsigned int offset = (((z * yblocks + y) * xblocks) + x) * 16;
1343 			const uint8_t* bp = data + offset;
1344 
1345 			symbolic_compressed_block scb;
1346 
1347 			physical_to_symbolic(*ctx->bsd, bp, scb);
1348 
1349 			decompress_symbolic_block(ctx->config.profile, *ctx->bsd,
1350 			                          x * block_x, y * block_y, z * block_z,
1351 			                          scb, blk);
1352 
1353 			store_image_block(image_out, blk, *ctx->bsd,
1354 			                  x * block_x, y * block_y, z * block_z, *swizzle);
1355 		}
1356 
1357 		ctxo->manage_decompress.complete_task_assignment(count);
1358 	}
1359 
1360 	return ASTCENC_SUCCESS;
1361 }
1362 
1363 /* See header for documentation. */
astcenc_decompress_reset(astcenc_context * ctxo)1364 astcenc_error astcenc_decompress_reset(
1365 	astcenc_context* ctxo
1366 ) {
1367 	ctxo->manage_decompress.reset();
1368 	return ASTCENC_SUCCESS;
1369 }
1370 
1371 /* See header for documentation. */
astcenc_get_block_info(astcenc_context * ctxo,const uint8_t data[16],astcenc_block_info * info)1372 astcenc_error astcenc_get_block_info(
1373 	astcenc_context* ctxo,
1374 	const uint8_t data[16],
1375 	astcenc_block_info* info
1376 ) {
1377 #if defined(ASTCENC_DECOMPRESS_ONLY)
1378 	(void)ctxo;
1379 	(void)data;
1380 	(void)info;
1381 	return ASTCENC_ERR_BAD_CONTEXT;
1382 #else
1383 	astcenc_contexti* ctx = &ctxo->context;
1384 
1385 	// Decode the compressed data into a symbolic form
1386 	symbolic_compressed_block scb;
1387 	physical_to_symbolic(*ctx->bsd, data, scb);
1388 
1389 	// Fetch the appropriate partition and decimation tables
1390 	block_size_descriptor& bsd = *ctx->bsd;
1391 
1392 	// Start from a clean slate
1393 	memset(info, 0, sizeof(*info));
1394 
1395 	// Basic info we can always populate
1396 	info->profile = ctx->config.profile;
1397 
1398 	info->block_x = ctx->config.block_x;
1399 	info->block_y = ctx->config.block_y;
1400 	info->block_z = ctx->config.block_z;
1401 	info->texel_count = bsd.texel_count;
1402 
1403 	// Check for error blocks first
1404 	info->is_error_block = scb.block_type == SYM_BTYPE_ERROR;
1405 	if (info->is_error_block)
1406 	{
1407 		return ASTCENC_SUCCESS;
1408 	}
1409 
1410 	// Check for constant color blocks second
1411 	info->is_constant_block = scb.block_type == SYM_BTYPE_CONST_F16 ||
1412 	                          scb.block_type == SYM_BTYPE_CONST_U16;
1413 	if (info->is_constant_block)
1414 	{
1415 		return ASTCENC_SUCCESS;
1416 	}
1417 
1418 	// Otherwise handle a full block ; known to be valid after conditions above have been checked
1419 	int partition_count = scb.partition_count;
1420 	const auto& pi = bsd.get_partition_info(partition_count, scb.partition_index);
1421 
1422 	const block_mode& bm = bsd.get_block_mode(scb.block_mode);
1423 	const decimation_info& di = bsd.get_decimation_info(bm.decimation_mode);
1424 
1425 	info->weight_x = di.weight_x;
1426 	info->weight_y = di.weight_y;
1427 	info->weight_z = di.weight_z;
1428 
1429 	info->is_dual_plane_block = bm.is_dual_plane != 0;
1430 
1431 	info->partition_count = scb.partition_count;
1432 	info->partition_index = scb.partition_index;
1433 	info->dual_plane_component = scb.plane2_component;
1434 
1435 	info->color_level_count = get_quant_level(scb.get_color_quant_mode());
1436 	info->weight_level_count = get_quant_level(bm.get_weight_quant_mode());
1437 
1438 	// Unpack color endpoints for each active partition
1439 	for (unsigned int i = 0; i < scb.partition_count; i++)
1440 	{
1441 		bool rgb_hdr;
1442 		bool a_hdr;
1443 		vint4 endpnt[2];
1444 
1445 		unpack_color_endpoints(ctx->config.profile,
1446 		                       scb.color_formats[i],
1447 		                       scb.color_values[i],
1448 		                       rgb_hdr, a_hdr,
1449 		                       endpnt[0], endpnt[1]);
1450 
1451 		// Store the color endpoint mode info
1452 		info->color_endpoint_modes[i] = scb.color_formats[i];
1453 		info->is_hdr_block = info->is_hdr_block || rgb_hdr || a_hdr;
1454 
1455 		// Store the unpacked and decoded color endpoint
1456 		vmask4 hdr_mask(rgb_hdr, rgb_hdr, rgb_hdr, a_hdr);
1457 		for (int j = 0; j < 2; j++)
1458 		{
1459 			vint4 color_lns = lns_to_sf16(endpnt[j]);
1460 			vint4 color_unorm = unorm16_to_sf16(endpnt[j]);
1461 			vint4 datai = select(color_unorm, color_lns, hdr_mask);
1462 			store(float16_to_float(datai), info->color_endpoints[i][j]);
1463 		}
1464 	}
1465 
1466 	// Unpack weights for each texel
1467 	int weight_plane1[BLOCK_MAX_TEXELS];
1468 	int weight_plane2[BLOCK_MAX_TEXELS];
1469 
1470 	unpack_weights(bsd, scb, di, bm.is_dual_plane, weight_plane1, weight_plane2);
1471 	for (unsigned int i = 0; i < bsd.texel_count; i++)
1472 	{
1473 		info->weight_values_plane1[i] = static_cast<float>(weight_plane1[i]) * (1.0f / WEIGHTS_TEXEL_SUM);
1474 		if (info->is_dual_plane_block)
1475 		{
1476 			info->weight_values_plane2[i] = static_cast<float>(weight_plane2[i]) * (1.0f / WEIGHTS_TEXEL_SUM);
1477 		}
1478 	}
1479 
1480 	// Unpack partition assignments for each texel
1481 	for (unsigned int i = 0; i < bsd.texel_count; i++)
1482 	{
1483 		info->partition_assignment[i] = pi.partition_of_texel[i];
1484 	}
1485 
1486 	return ASTCENC_SUCCESS;
1487 #endif
1488 }
1489 
1490 /* See header for documentation. */
astcenc_get_error_string(astcenc_error status)1491 const char* astcenc_get_error_string(
1492 	astcenc_error status
1493 ) {
1494 	// Values in this enum are from an external user, so not guaranteed to be
1495 	// bounded to the enum values
1496 	switch (static_cast<int>(status))
1497 	{
1498 	case ASTCENC_SUCCESS:
1499 		return "ASTCENC_SUCCESS";
1500 	case ASTCENC_ERR_OUT_OF_MEM:
1501 		return "ASTCENC_ERR_OUT_OF_MEM";
1502 	case ASTCENC_ERR_BAD_CPU_FLOAT:
1503 		return "ASTCENC_ERR_BAD_CPU_FLOAT";
1504 	case ASTCENC_ERR_BAD_PARAM:
1505 		return "ASTCENC_ERR_BAD_PARAM";
1506 	case ASTCENC_ERR_BAD_BLOCK_SIZE:
1507 		return "ASTCENC_ERR_BAD_BLOCK_SIZE";
1508 	case ASTCENC_ERR_BAD_PROFILE:
1509 		return "ASTCENC_ERR_BAD_PROFILE";
1510 	case ASTCENC_ERR_BAD_QUALITY:
1511 		return "ASTCENC_ERR_BAD_QUALITY";
1512 	case ASTCENC_ERR_BAD_FLAGS:
1513 		return "ASTCENC_ERR_BAD_FLAGS";
1514 	case ASTCENC_ERR_BAD_SWIZZLE:
1515 		return "ASTCENC_ERR_BAD_SWIZZLE";
1516 	case ASTCENC_ERR_BAD_CONTEXT:
1517 		return "ASTCENC_ERR_BAD_CONTEXT";
1518 	case ASTCENC_ERR_NOT_IMPLEMENTED:
1519 		return "ASTCENC_ERR_NOT_IMPLEMENTED";
1520 	case ASTCENC_ERR_BAD_DECODE_MODE:
1521 		return "ASTCENC_ERR_BAD_DECODE_MODE";
1522 #if defined(ASTCENC_DIAGNOSTICS)
1523 	case ASTCENC_ERR_DTRACE_FAILURE:
1524 		return "ASTCENC_ERR_DTRACE_FAILURE";
1525 #endif
1526 	default:
1527 		return nullptr;
1528 	}
1529 }
1530