1 // SPDX-License-Identifier: Apache-2.0
2 // ----------------------------------------------------------------------------
3 // Copyright 2011-2024 Arm Limited
4 //
5 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
6 // use this file except in compliance with the License. You may obtain a copy
7 // of the License at:
8 //
9 // http://www.apache.org/licenses/LICENSE-2.0
10 //
11 // Unless required by applicable law or agreed to in writing, software
12 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
13 // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
14 // License for the specific language governing permissions and limitations
15 // under the License.
16 // ----------------------------------------------------------------------------
17
18 /**
19 * @brief Functions for the library entrypoint.
20 */
21
22 #include <array>
23 #include <cstring>
24 #include <new>
25
26 #include "astcenc.h"
27 #include "astcenc_internal_entry.h"
28 #include "astcenc_diagnostic_trace.h"
29
// Constants describing how RGBA data is stored (memory layout / bit allocation).
// NOTE(review): the shift values look like bit offsets for a packed layout with 10 bits per
// color channel (G at bit 10, B at bit 20, A at bit 30) - confirm against the code using them.
constexpr uint8_t COMPONENT_NUM{4};
constexpr uint8_t COMP_G_SHIFT_POSITION{10};
constexpr uint8_t COMP_B_SHIFT_POSITION{20};
constexpr uint8_t COMP_A_SHIFT_POSITION{30};
35
/**
 * @brief One row of quality tuning parameter values for a preset.
 *
 * See the @c astcenc_config structure for detailed parameter documentation.
 *
 * The mse_overshoot entries are scaling factors relative to the base MSE needed to hit db_limit.
 * A 20% overshoot is harder to hit for a higher base db_limit, so lower ratios may be used for
 * the more thorough search presets because the underlying db_limit is so much higher.
 *
 * The field order matters: the preset tables fill this structure by positional aggregate
 * initialization.
 */
struct astcenc_preset_config
{
    /** @brief The quality level that this row of settings corresponds to. */
    float quality;

    // Integer search limits; each is the preset value for the astcenc_config field of the
    // same name
    unsigned int tune_partition_count_limit;
    unsigned int tune_2partition_index_limit;
    unsigned int tune_3partition_index_limit;
    unsigned int tune_4partition_index_limit;
    unsigned int tune_block_mode_limit;
    unsigned int tune_refinement_limit;
    unsigned int tune_candidate_limit;
    unsigned int tune_2partitioning_candidate_limit;
    unsigned int tune_3partitioning_candidate_limit;
    unsigned int tune_4partitioning_candidate_limit;

    // Base values used to derive astcenc_config::tune_db_limit. The config initializer takes
    // the larger of (a_base - 35 * log10(texels)) and (b_base - 19 * log10(texels)).
    float tune_db_limit_a_base;
    float tune_db_limit_b_base;

    // Floating-point tuning values; each is the preset value for the astcenc_config field of
    // the same name
    float tune_mse_overshoot;
    float tune_2partition_early_out_limit_factor;
    float tune_3partition_early_out_limit_factor;
    float tune_2plane_early_out_limit_correlation;

    /** @brief Stored as a float; it is interpolated between presets like the values above. */
    float tune_search_mode0_enable;
};
66
67 /**
68 * @brief The static presets for high bandwidth encodings (x < 25 texels per block).
69 */
70 static const std::array<astcenc_preset_config, 6> preset_configs_high {{
71 {
72 ASTCENC_PRE_FASTEST,
73 2, 10, 6, 4, 43, 2, 2, 2, 2, 2, 85.2f, 63.2f, 3.5f, 1.0f, 1.0f, 0.85f, 0.0f
74 }, {
75 ASTCENC_PRE_FAST,
76 3, 18, 10, 8, 55, 3, 3, 2, 2, 2, 85.2f, 63.2f, 3.5f, 1.0f, 1.0f, 0.90f, 0.0f
77 }, {
78 ASTCENC_PRE_MEDIUM,
79 4, 34, 28, 16, 77, 3, 3, 2, 2, 2, 95.0f, 70.0f, 2.5f, 1.1f, 1.05f, 0.95f, 0.0f
80 }, {
81 ASTCENC_PRE_THOROUGH,
82 4, 82, 60, 30, 94, 4, 4, 3, 2, 2, 105.0f, 77.0f, 10.0f, 1.35f, 1.15f, 0.97f, 0.0f
83 }, {
84 ASTCENC_PRE_VERYTHOROUGH,
85 4, 256, 128, 64, 98, 4, 6, 8, 6, 4, 200.0f, 200.0f, 10.0f, 1.6f, 1.4f, 0.98f, 0.0f
86 }, {
87 ASTCENC_PRE_EXHAUSTIVE,
88 4, 512, 512, 512, 100, 4, 8, 8, 8, 8, 200.0f, 200.0f, 10.0f, 2.0f, 2.0f, 0.99f, 0.0f
89 }
90 }};
91
92 /**
93 * @brief The static presets for medium bandwidth encodings (25 <= x < 64 texels per block).
94 */
95 static const std::array<astcenc_preset_config, 6> preset_configs_mid {{
96 {
97 ASTCENC_PRE_FASTEST,
98 2, 10, 6, 4, 43, 2, 2, 2, 2, 2, 85.2f, 63.2f, 3.5f, 1.0f, 1.0f, 0.80f, 1.0f
99 }, {
100 ASTCENC_PRE_FAST,
101 3, 18, 12, 10, 55, 3, 3, 2, 2, 2, 85.2f, 63.2f, 3.5f, 1.0f, 1.0f, 0.85f, 1.0f
102 }, {
103 ASTCENC_PRE_MEDIUM,
104 3, 34, 28, 16, 77, 3, 3, 2, 2, 2, 95.0f, 70.0f, 3.0f, 1.1f, 1.05f, 0.90f, 1.0f
105 }, {
106 ASTCENC_PRE_THOROUGH,
107 4, 82, 60, 30, 94, 4, 4, 3, 2, 2, 105.0f, 77.0f, 10.0f, 1.4f, 1.2f, 0.95f, 0.0f
108 }, {
109 ASTCENC_PRE_VERYTHOROUGH,
110 4, 256, 128, 64, 98, 4, 6, 8, 6, 3, 200.0f, 200.0f, 10.0f, 1.6f, 1.4f, 0.98f, 0.0f
111 }, {
112 ASTCENC_PRE_EXHAUSTIVE,
113 4, 256, 256, 256, 100, 4, 8, 8, 8, 8, 200.0f, 200.0f, 10.0f, 2.0f, 2.0f, 0.99f, 0.0f
114 }
115 }};
116
117 /**
118 * @brief The static presets for low bandwidth encodings (64 <= x texels per block).
119 */
120 static const std::array<astcenc_preset_config, 6> preset_configs_low {{
121 {
122 ASTCENC_PRE_FASTEST,
123 2, 10, 6, 4, 40, 2, 2, 2, 2, 2, 85.0f, 63.0f, 3.5f, 1.0f, 1.0f, 0.80f, 1.0f
124 }, {
125 ASTCENC_PRE_FAST,
126 2, 18, 12, 10, 55, 3, 3, 2, 2, 2, 85.0f, 63.0f, 3.5f, 1.0f, 1.0f, 0.85f, 1.0f
127 }, {
128 ASTCENC_PRE_MEDIUM,
129 3, 34, 28, 16, 77, 3, 3, 2, 2, 2, 95.0f, 70.0f, 3.5f, 1.1f, 1.05f, 0.90f, 1.0f
130 }, {
131 ASTCENC_PRE_THOROUGH,
132 4, 82, 60, 30, 93, 4, 4, 3, 2, 2, 105.0f, 77.0f, 10.0f, 1.3f, 1.2f, 0.97f, 1.0f
133 }, {
134 ASTCENC_PRE_VERYTHOROUGH,
135 4, 256, 128, 64, 98, 4, 6, 8, 5, 2, 200.0f, 200.0f, 10.0f, 1.6f, 1.4f, 0.98f, 1.0f
136 }, {
137 ASTCENC_PRE_EXHAUSTIVE,
138 4, 256, 256, 256, 100, 4, 8, 8, 8, 8, 200.0f, 200.0f, 10.0f, 2.0f, 2.0f, 0.99f, 1.0f
139 }
140 }};
141
142 /**
143 * @brief Validate CPU floating point meets assumptions made in the codec.
144 *
145 * The codec is written with the assumption that a float threaded through the @c if32 union will be
146 * stored and reloaded as a 32-bit IEEE-754 float with round-to-nearest rounding. This is always the
147 * case in an IEEE-754 compliant system, however not every system or compilation mode is actually
148 * IEEE-754 compliant. This normally fails if the code is compiled with fast math enabled.
149 *
150 * @return Return @c ASTCENC_SUCCESS if validated, otherwise an error on failure.
151 */
validate_cpu_float()152 static astcenc_error validate_cpu_float()
153 {
154 if32 p;
155 volatile float xprec_testval = 2.51f;
156 p.f = xprec_testval + 12582912.0f;
157 float q = p.f - 12582912.0f;
158
159 if (q != 3.0f)
160 {
161 return ASTCENC_ERR_BAD_CPU_FLOAT;
162 }
163
164 return ASTCENC_SUCCESS;
165 }
166
167 /**
168 * @brief Validate config profile.
169 *
170 * @param profile The profile to check.
171 *
172 * @return Return @c ASTCENC_SUCCESS if validated, otherwise an error on failure.
173 */
validate_profile(astcenc_profile profile)174 static astcenc_error validate_profile(
175 astcenc_profile profile
176 ) {
177 // Values in this enum are from an external user, so not guaranteed to be
178 // bounded to the enum values
179 switch (static_cast<int>(profile))
180 {
181 case ASTCENC_PRF_LDR_SRGB:
182 case ASTCENC_PRF_LDR:
183 case ASTCENC_PRF_HDR_RGB_LDR_A:
184 case ASTCENC_PRF_HDR:
185 return ASTCENC_SUCCESS;
186 default:
187 return ASTCENC_ERR_BAD_PROFILE;
188 }
189 }
190
191 /**
192 * @brief Validate block size.
193 *
194 * @param block_x The block x dimensions.
195 * @param block_y The block y dimensions.
196 * @param block_z The block z dimensions.
197 *
198 * @return Return @c ASTCENC_SUCCESS if validated, otherwise an error on failure.
199 */
validate_block_size(unsigned int block_x,unsigned int block_y,unsigned int block_z)200 static astcenc_error validate_block_size(
201 unsigned int block_x,
202 unsigned int block_y,
203 unsigned int block_z
204 ) {
205 // Test if this is a legal block size at all
206 bool is_legal = (((block_z <= 1) && is_legal_2d_block_size(block_x, block_y)) ||
207 ((block_z >= 2) && is_legal_3d_block_size(block_x, block_y, block_z)));
208 if (!is_legal)
209 {
210 return ASTCENC_ERR_BAD_BLOCK_SIZE;
211 }
212
213 // Test if this build has sufficient capacity for this block size
214 bool have_capacity = (block_x * block_y * block_z) <= BLOCK_MAX_TEXELS;
215 if (!have_capacity)
216 {
217 return ASTCENC_ERR_NOT_IMPLEMENTED;
218 }
219
220 return ASTCENC_SUCCESS;
221 }
222
223 /**
224 * @brief Validate flags.
225 *
226 * @param profile The profile to check.
227 * @param flags The flags to check.
228 *
229 * @return Return @c ASTCENC_SUCCESS if validated, otherwise an error on failure.
230 */
validate_flags(astcenc_profile profile,unsigned int flags)231 static astcenc_error validate_flags(
232 astcenc_profile profile,
233 unsigned int flags
234 ) {
235 // Flags field must not contain any unknown flag bits
236 unsigned int exMask = ~ASTCENC_ALL_FLAGS;
237 if (popcount(flags & exMask) != 0)
238 {
239 return ASTCENC_ERR_BAD_FLAGS;
240 }
241
242 // Flags field must only contain at most a single map type
243 exMask = ASTCENC_FLG_MAP_NORMAL
244 | ASTCENC_FLG_MAP_RGBM;
245 if (popcount(flags & exMask) > 1)
246 {
247 return ASTCENC_ERR_BAD_FLAGS;
248 }
249
250 // Decode_unorm8 must only be used with an LDR profile
251 bool is_unorm8 = flags & ASTCENC_FLG_USE_DECODE_UNORM8;
252 bool is_hdr = (profile == ASTCENC_PRF_HDR) || (profile == ASTCENC_PRF_HDR_RGB_LDR_A);
253 if (is_unorm8 && is_hdr)
254 {
255 return ASTCENC_ERR_BAD_DECODE_MODE;
256 }
257
258 return ASTCENC_SUCCESS;
259 }
260
261 #if !defined(ASTCENC_DECOMPRESS_ONLY)
262
263 /**
264 * @brief Validate single channel compression swizzle.
265 *
266 * @param swizzle The swizzle to check.
267 *
268 * @return Return @c ASTCENC_SUCCESS if validated, otherwise an error on failure.
269 */
validate_compression_swz(astcenc_swz swizzle)270 static astcenc_error validate_compression_swz(
271 astcenc_swz swizzle
272 ) {
273 // Not all enum values are handled; SWZ_Z is invalid for compression
274 switch (static_cast<int>(swizzle))
275 {
276 case ASTCENC_SWZ_R:
277 case ASTCENC_SWZ_G:
278 case ASTCENC_SWZ_B:
279 case ASTCENC_SWZ_A:
280 case ASTCENC_SWZ_0:
281 case ASTCENC_SWZ_1:
282 return ASTCENC_SUCCESS;
283 default:
284 return ASTCENC_ERR_BAD_SWIZZLE;
285 }
286 }
287
288 /**
289 * @brief Validate overall compression swizzle.
290 *
291 * @param swizzle The swizzle to check.
292 *
293 * @return Return @c ASTCENC_SUCCESS if validated, otherwise an error on failure.
294 */
validate_compression_swizzle(const astcenc_swizzle & swizzle)295 static astcenc_error validate_compression_swizzle(
296 const astcenc_swizzle& swizzle
297 ) {
298 if (validate_compression_swz(swizzle.r) ||
299 validate_compression_swz(swizzle.g) ||
300 validate_compression_swz(swizzle.b) ||
301 validate_compression_swz(swizzle.a))
302 {
303 return ASTCENC_ERR_BAD_SWIZZLE;
304 }
305
306 return ASTCENC_SUCCESS;
307 }
308 #endif
309
310 /**
311 * @brief Validate single channel decompression swizzle.
312 *
313 * @param swizzle The swizzle to check.
314 *
315 * @return Return @c ASTCENC_SUCCESS if validated, otherwise an error on failure.
316 */
validate_decompression_swz(astcenc_swz swizzle)317 static astcenc_error validate_decompression_swz(
318 astcenc_swz swizzle
319 ) {
320 // Values in this enum are from an external user, so not guaranteed to be
321 // bounded to the enum values
322 switch (static_cast<int>(swizzle))
323 {
324 case ASTCENC_SWZ_R:
325 case ASTCENC_SWZ_G:
326 case ASTCENC_SWZ_B:
327 case ASTCENC_SWZ_A:
328 case ASTCENC_SWZ_0:
329 case ASTCENC_SWZ_1:
330 case ASTCENC_SWZ_Z:
331 return ASTCENC_SUCCESS;
332 default:
333 return ASTCENC_ERR_BAD_SWIZZLE;
334 }
335 }
336
337 /**
338 * @brief Validate overall decompression swizzle.
339 *
340 * @param swizzle The swizzle to check.
341 *
342 * @return Return @c ASTCENC_SUCCESS if validated, otherwise an error on failure.
343 */
validate_decompression_swizzle(const astcenc_swizzle & swizzle)344 static astcenc_error validate_decompression_swizzle(
345 const astcenc_swizzle& swizzle
346 ) {
347 if (validate_decompression_swz(swizzle.r) ||
348 validate_decompression_swz(swizzle.g) ||
349 validate_decompression_swz(swizzle.b) ||
350 validate_decompression_swz(swizzle.a))
351 {
352 return ASTCENC_ERR_BAD_SWIZZLE;
353 }
354
355 return ASTCENC_SUCCESS;
356 }
357
358 /**
359 * Validate that an incoming configuration is in-spec.
360 *
361 * This function can respond in two ways:
362 *
363 * * Numerical inputs that have valid ranges are clamped to those valid ranges. No error is thrown
364 * for out-of-range inputs in this case.
365 * * Numerical inputs and logic inputs are are logically invalid and which make no sense
366 * algorithmically will return an error.
367 *
368 * @param[in,out] config The input compressor configuration.
369 *
370 * @return Return @c ASTCENC_SUCCESS if validated, otherwise an error on failure.
371 */
validate_config(astcenc_config & config)372 static astcenc_error validate_config(
373 astcenc_config &config
374 ) {
375 astcenc_error status;
376
377 status = validate_profile(config.profile);
378 if (status != ASTCENC_SUCCESS)
379 {
380 return status;
381 }
382
383 status = validate_flags(config.profile, config.flags);
384 if (status != ASTCENC_SUCCESS)
385 {
386 return status;
387 }
388
389 status = validate_block_size(config.block_x, config.block_y, config.block_z);
390 if (status != ASTCENC_SUCCESS)
391 {
392 return status;
393 }
394
395 #if defined(ASTCENC_DECOMPRESS_ONLY)
396 // Decompress-only builds only support decompress-only contexts
397 if (!(config.flags & ASTCENC_FLG_DECOMPRESS_ONLY))
398 {
399 return ASTCENC_ERR_BAD_PARAM;
400 }
401 #endif
402
403 config.rgbm_m_scale = astc::max(config.rgbm_m_scale, 1.0f);
404
405 config.tune_partition_count_limit = astc::clamp(config.tune_partition_count_limit, 1u, 4u);
406 config.tune_2partition_index_limit = astc::clamp(config.tune_2partition_index_limit, 1u, BLOCK_MAX_PARTITIONINGS);
407 config.tune_3partition_index_limit = astc::clamp(config.tune_3partition_index_limit, 1u, BLOCK_MAX_PARTITIONINGS);
408 config.tune_4partition_index_limit = astc::clamp(config.tune_4partition_index_limit, 1u, BLOCK_MAX_PARTITIONINGS);
409 config.tune_block_mode_limit = astc::clamp(config.tune_block_mode_limit, 1u, 100u);
410 config.tune_refinement_limit = astc::max(config.tune_refinement_limit, 1u);
411 config.tune_candidate_limit = astc::clamp(config.tune_candidate_limit, 1u, TUNE_MAX_TRIAL_CANDIDATES);
412 config.tune_2partitioning_candidate_limit = astc::clamp(config.tune_2partitioning_candidate_limit, 1u, TUNE_MAX_PARTITIONING_CANDIDATES);
413 config.tune_3partitioning_candidate_limit = astc::clamp(config.tune_3partitioning_candidate_limit, 1u, TUNE_MAX_PARTITIONING_CANDIDATES);
414 config.tune_4partitioning_candidate_limit = astc::clamp(config.tune_4partitioning_candidate_limit, 1u, TUNE_MAX_PARTITIONING_CANDIDATES);
415 config.tune_db_limit = astc::max(config.tune_db_limit, 0.0f);
416 config.tune_mse_overshoot = astc::max(config.tune_mse_overshoot, 1.0f);
417 config.tune_2partition_early_out_limit_factor = astc::max(config.tune_2partition_early_out_limit_factor, 0.0f);
418 config.tune_3partition_early_out_limit_factor = astc::max(config.tune_3partition_early_out_limit_factor, 0.0f);
419 config.tune_2plane_early_out_limit_correlation = astc::max(config.tune_2plane_early_out_limit_correlation, 0.0f);
420
421 // Specifying a zero weight color component is not allowed; force to small value
422 float max_weight = astc::max(astc::max(config.cw_r_weight, config.cw_g_weight),
423 astc::max(config.cw_b_weight, config.cw_a_weight));
424 if (max_weight > 0.0f)
425 {
426 max_weight /= 1000.0f;
427 config.cw_r_weight = astc::max(config.cw_r_weight, max_weight);
428 config.cw_g_weight = astc::max(config.cw_g_weight, max_weight);
429 config.cw_b_weight = astc::max(config.cw_b_weight, max_weight);
430 config.cw_a_weight = astc::max(config.cw_a_weight, max_weight);
431 }
432 // If all color components error weights are zero then return an error
433 else
434 {
435 return ASTCENC_ERR_BAD_PARAM;
436 }
437
438 return ASTCENC_SUCCESS;
439 }
440
441 /* See header for documentation. */
astcenc_config_init(astcenc_profile profile,unsigned int block_x,unsigned int block_y,unsigned int block_z,float quality,unsigned int flags,astcenc_config * configp)442 astcenc_error astcenc_config_init(
443 astcenc_profile profile,
444 unsigned int block_x,
445 unsigned int block_y,
446 unsigned int block_z,
447 float quality,
448 unsigned int flags,
449 astcenc_config* configp
450 ) {
451 astcenc_error status;
452
453 status = validate_cpu_float();
454 if (status != ASTCENC_SUCCESS)
455 {
456 return status;
457 }
458
459 // Zero init all config fields; although most of will be over written
460 astcenc_config& config = *configp;
461 std::memset(&config, 0, sizeof(config));
462
463 // Process the block size
464 block_z = astc::max(block_z, 1u); // For 2D blocks Z==0 is accepted, but convert to 1
465 status = validate_block_size(block_x, block_y, block_z);
466 if (status != ASTCENC_SUCCESS)
467 {
468 return status;
469 }
470
471 config.block_x = block_x;
472 config.block_y = block_y;
473 config.block_z = block_z;
474
475 float texels = static_cast<float>(block_x * block_y * block_z);
476 float ltexels = logf(texels) / logf(10.0f);
477
478 // Process the performance quality level or preset; note that this must be done before we
479 // process any additional settings, such as color profile and flags, which may replace some of
480 // these settings with more use case tuned values
481 if (quality < ASTCENC_PRE_FASTEST ||
482 quality > ASTCENC_PRE_EXHAUSTIVE)
483 {
484 return ASTCENC_ERR_BAD_QUALITY;
485 }
486
487 static const std::array<astcenc_preset_config, 6>* preset_configs;
488 int texels_int = block_x * block_y * block_z;
489 if (texels_int < 25)
490 {
491 preset_configs = &preset_configs_high;
492 }
493 else if (texels_int < 64)
494 {
495 preset_configs = &preset_configs_mid;
496 }
497 else
498 {
499 preset_configs = &preset_configs_low;
500 }
501
502 // Determine which preset to use, or which pair to interpolate
503 size_t start;
504 size_t end;
505 for (end = 0; end < preset_configs->size(); end++)
506 {
507 if ((*preset_configs)[end].quality >= quality)
508 {
509 break;
510 }
511 }
512
513 start = end == 0 ? 0 : end - 1;
514
515 // Start and end node are the same - so just transfer the values.
516 if (start == end)
517 {
518 config.tune_partition_count_limit = (*preset_configs)[start].tune_partition_count_limit;
519 config.tune_2partition_index_limit = (*preset_configs)[start].tune_2partition_index_limit;
520 config.tune_3partition_index_limit = (*preset_configs)[start].tune_3partition_index_limit;
521 config.tune_4partition_index_limit = (*preset_configs)[start].tune_4partition_index_limit;
522 config.tune_block_mode_limit = (*preset_configs)[start].tune_block_mode_limit;
523 config.tune_refinement_limit = (*preset_configs)[start].tune_refinement_limit;
524 config.tune_candidate_limit = (*preset_configs)[start].tune_candidate_limit;
525 config.tune_2partitioning_candidate_limit = (*preset_configs)[start].tune_2partitioning_candidate_limit;
526 config.tune_3partitioning_candidate_limit = (*preset_configs)[start].tune_3partitioning_candidate_limit;
527 config.tune_4partitioning_candidate_limit = (*preset_configs)[start].tune_4partitioning_candidate_limit;
528 config.tune_db_limit = astc::max((*preset_configs)[start].tune_db_limit_a_base - 35 * ltexels,
529 (*preset_configs)[start].tune_db_limit_b_base - 19 * ltexels);
530
531 config.tune_mse_overshoot = (*preset_configs)[start].tune_mse_overshoot;
532
533 config.tune_2partition_early_out_limit_factor = (*preset_configs)[start].tune_2partition_early_out_limit_factor;
534 config.tune_3partition_early_out_limit_factor = (*preset_configs)[start].tune_3partition_early_out_limit_factor;
535 config.tune_2plane_early_out_limit_correlation = (*preset_configs)[start].tune_2plane_early_out_limit_correlation;
536 config.tune_search_mode0_enable = (*preset_configs)[start].tune_search_mode0_enable;
537 }
538 // Start and end node are not the same - so interpolate between them
539 else
540 {
541 auto& node_a = (*preset_configs)[start];
542 auto& node_b = (*preset_configs)[end];
543
544 float wt_range = node_b.quality - node_a.quality;
545 assert(wt_range > 0);
546
547 // Compute interpolation factors
548 float wt_node_a = (node_b.quality - quality) / wt_range;
549 float wt_node_b = (quality - node_a.quality) / wt_range;
550
551 #define LERP(param) ((node_a.param * wt_node_a) + (node_b.param * wt_node_b))
552 #define LERPI(param) astc::flt2int_rtn(\
553 (static_cast<float>(node_a.param) * wt_node_a) + \
554 (static_cast<float>(node_b.param) * wt_node_b))
555 #define LERPUI(param) static_cast<unsigned int>(LERPI(param))
556
557 config.tune_partition_count_limit = LERPI(tune_partition_count_limit);
558 config.tune_2partition_index_limit = LERPI(tune_2partition_index_limit);
559 config.tune_3partition_index_limit = LERPI(tune_3partition_index_limit);
560 config.tune_4partition_index_limit = LERPI(tune_4partition_index_limit);
561 config.tune_block_mode_limit = LERPI(tune_block_mode_limit);
562 config.tune_refinement_limit = LERPI(tune_refinement_limit);
563 config.tune_candidate_limit = LERPUI(tune_candidate_limit);
564 config.tune_2partitioning_candidate_limit = LERPUI(tune_2partitioning_candidate_limit);
565 config.tune_3partitioning_candidate_limit = LERPUI(tune_3partitioning_candidate_limit);
566 config.tune_4partitioning_candidate_limit = LERPUI(tune_4partitioning_candidate_limit);
567 config.tune_db_limit = astc::max(LERP(tune_db_limit_a_base) - 35 * ltexels,
568 LERP(tune_db_limit_b_base) - 19 * ltexels);
569
570 config.tune_mse_overshoot = LERP(tune_mse_overshoot);
571
572 config.tune_2partition_early_out_limit_factor = LERP(tune_2partition_early_out_limit_factor);
573 config.tune_3partition_early_out_limit_factor = LERP(tune_3partition_early_out_limit_factor);
574 config.tune_2plane_early_out_limit_correlation = LERP(tune_2plane_early_out_limit_correlation);
575 config.tune_search_mode0_enable = LERP(tune_search_mode0_enable);
576 #undef LERP
577 #undef LERPI
578 #undef LERPUI
579 }
580
581 // Set heuristics to the defaults for each color profile
582 config.cw_r_weight = 1.0f;
583 config.cw_g_weight = 1.0f;
584 config.cw_b_weight = 1.0f;
585 config.cw_a_weight = 1.0f;
586
587 config.a_scale_radius = 0;
588
589 config.rgbm_m_scale = 0.0f;
590
591 config.profile = profile;
592
593 // Values in this enum are from an external user, so not guaranteed to be
594 // bounded to the enum values
595 switch (static_cast<int>(profile))
596 {
597 case ASTCENC_PRF_LDR:
598 case ASTCENC_PRF_LDR_SRGB:
599 break;
600 case ASTCENC_PRF_HDR_RGB_LDR_A:
601 case ASTCENC_PRF_HDR:
602 config.tune_db_limit = 999.0f;
603 config.tune_search_mode0_enable = 0.0f;
604 break;
605 default:
606 return ASTCENC_ERR_BAD_PROFILE;
607 }
608
609 // Flags field must not contain any unknown flag bits
610 status = validate_flags(profile, flags);
611 if (status != ASTCENC_SUCCESS)
612 {
613 return status;
614 }
615
616 if (flags & ASTCENC_FLG_MAP_NORMAL)
617 {
618 // Normal map encoding uses L+A blocks, so allow one more partitioning
619 // than normal. We need need fewer bits for endpoints, so more likely
620 // to be able to use more partitions than an RGB/RGBA block
621 config.tune_partition_count_limit = astc::min(config.tune_partition_count_limit + 1u, 4u);
622
623 config.cw_g_weight = 0.0f;
624 config.cw_b_weight = 0.0f;
625 config.tune_2partition_early_out_limit_factor *= 1.5f;
626 config.tune_3partition_early_out_limit_factor *= 1.5f;
627 config.tune_2plane_early_out_limit_correlation = 0.99f;
628
629 // Normals are prone to blocking artifacts on smooth curves
630 // so force compressor to try harder here ...
631 config.tune_db_limit *= 1.03f;
632 }
633 else if (flags & ASTCENC_FLG_MAP_RGBM)
634 {
635 config.rgbm_m_scale = 5.0f;
636 config.cw_a_weight = 2.0f * config.rgbm_m_scale;
637 }
638 else // (This is color data)
639 {
640 // This is a very basic perceptual metric for RGB color data, which weights error
641 // significance by the perceptual luminance contribution of each color channel. For
642 // luminance the usual weights to compute luminance from a linear RGB value are as
643 // follows:
644 //
645 // l = r * 0.3 + g * 0.59 + b * 0.11
646 //
647 // ... but we scale these up to keep a better balance between color and alpha. Note
648 // that if the content is using alpha we'd recommend using the -a option to weight
649 // the color contribution by the alpha transparency.
650 if (flags & ASTCENC_FLG_USE_PERCEPTUAL)
651 {
652 config.cw_r_weight = 0.30f * 2.25f;
653 config.cw_g_weight = 0.59f * 2.25f;
654 config.cw_b_weight = 0.11f * 2.25f;
655 }
656 }
657 config.flags = flags;
658
659 return ASTCENC_SUCCESS;
660 }
661
662 /* See header for documentation. */
astcenc_context_alloc(const astcenc_config * configp,unsigned int thread_count,astcenc_context ** context)663 astcenc_error astcenc_context_alloc(
664 const astcenc_config* configp,
665 unsigned int thread_count,
666 astcenc_context** context
667 ) {
668 astcenc_error status;
669 const astcenc_config& config = *configp;
670
671 status = validate_cpu_float();
672 if (status != ASTCENC_SUCCESS)
673 {
674 return status;
675 }
676
677 if (thread_count == 0)
678 {
679 return ASTCENC_ERR_BAD_PARAM;
680 }
681
682 #if defined(ASTCENC_DIAGNOSTICS)
683 // Force single threaded compressor use in diagnostic mode.
684 if (thread_count != 1)
685 {
686 return ASTCENC_ERR_BAD_PARAM;
687 }
688 #endif
689
690 #ifndef ASTC_CUSTOMIZED_ENABLE
691 if (config.privateProfile == CUSTOMIZED_PROFILE)
692 {
693 return ASTCENC_ERR_BAD_PARAM;
694 }
695 #endif
696
697 astcenc_context* ctxo = new astcenc_context;
698 astcenc_contexti* ctx = &ctxo->context;
699 ctx->thread_count = thread_count;
700 ctx->config = config;
701 ctx->working_buffers = nullptr;
702
703 // These are allocated per-compress, as they depend on image size
704 ctx->input_alpha_averages = nullptr;
705
706 // Copy the config first and validate the copy (we may modify it)
707 status = validate_config(ctx->config);
708 if (status != ASTCENC_SUCCESS)
709 {
710 delete ctxo;
711 return status;
712 }
713
714 ctx->bsd = aligned_malloc<block_size_descriptor>(sizeof(block_size_descriptor), ASTCENC_VECALIGN);
715 if (!ctx->bsd)
716 {
717 delete ctxo;
718 return ASTCENC_ERR_OUT_OF_MEM;
719 }
720
721 bool can_omit_modes = static_cast<bool>(config.flags & ASTCENC_FLG_SELF_DECOMPRESS_ONLY);
722 #ifdef ASTC_CUSTOMIZED_ENABLE
723 if (!init_block_size_descriptor(ctx->config.privateProfile, config.block_x, config.block_y, config.block_z,
724 can_omit_modes,
725 config.tune_partition_count_limit,
726 static_cast<float>(config.tune_block_mode_limit) / 100.0f,
727 *ctx->bsd))
728 {
729 aligned_free<block_size_descriptor>(ctx->bsd);
730 delete ctxo;
731 *context = nullptr;
732 return ASTCENC_ERR_DLOPEN_FAILED;
733 }
734 #else
735 init_block_size_descriptor(ctx->config.privateProfile, config.block_x, config.block_y, config.block_z,
736 can_omit_modes,
737 config.tune_partition_count_limit,
738 static_cast<float>(config.tune_block_mode_limit) / 100.0f,
739 *ctx->bsd);
740 #endif
741
742 #if !defined(ASTCENC_DECOMPRESS_ONLY)
743 // Do setup only needed by compression
744 if (!(ctx->config.flags & ASTCENC_FLG_DECOMPRESS_ONLY))
745 {
746 // Turn a dB limit into a per-texel error for faster use later
747 if ((ctx->config.profile == ASTCENC_PRF_LDR) || (ctx->config.profile == ASTCENC_PRF_LDR_SRGB))
748 {
749 ctx->config.tune_db_limit = astc::pow(0.1f, ctx->config.tune_db_limit * 0.1f) * 65535.0f * 65535.0f;
750 }
751 else
752 {
753 ctx->config.tune_db_limit = 0.0f;
754 }
755
756 size_t worksize = sizeof(compression_working_buffers) * thread_count;
757 ctx->working_buffers = aligned_malloc<compression_working_buffers>(worksize, ASTCENC_VECALIGN);
758 static_assert((ASTCENC_VECALIGN == 0) || ((sizeof(compression_working_buffers) % ASTCENC_VECALIGN) == 0),
759 "compression_working_buffers size must be multiple of vector alignment");
760 if (!ctx->working_buffers)
761 {
762 aligned_free<block_size_descriptor>(ctx->bsd);
763 delete ctxo;
764 *context = nullptr;
765 return ASTCENC_ERR_OUT_OF_MEM;
766 }
767 }
768 #endif
769
770 #if defined(ASTCENC_DIAGNOSTICS)
771 ctx->trace_log = new TraceLog(ctx->config.trace_file_path);
772 if (!ctx->trace_log->m_file)
773 {
774 return ASTCENC_ERR_DTRACE_FAILURE;
775 }
776
777 trace_add_data("block_x", config.block_x);
778 trace_add_data("block_y", config.block_y);
779 trace_add_data("block_z", config.block_z);
780 #endif
781
782 *context = ctxo;
783
784 #if !defined(ASTCENC_DECOMPRESS_ONLY)
785 prepare_angular_tables();
786 #endif
787
788 return ASTCENC_SUCCESS;
789 }
790
791 /* See header dor documentation. */
astcenc_context_free(astcenc_context * ctxo)792 void astcenc_context_free(
793 astcenc_context* ctxo
794 ) {
795 if (ctxo)
796 {
797 astcenc_contexti* ctx = &ctxo->context;
798 if (ctx->working_buffers)
799 {
800 aligned_free<compression_working_buffers>(ctx->working_buffers);
801 }
802 else
803 {
804 printf("ctx->working_buffers is nullptr !!\n");
805 }
806 if (ctx->bsd)
807 {
808 aligned_free<block_size_descriptor>(ctx->bsd);
809 }
810 else
811 {
812 printf("ctx->bsd is nullptr !!\n");
813 }
814 #if defined(ASTCENC_DIAGNOSTICS)
815 delete ctx->trace_log;
816 #endif
817 delete ctxo;
818 }
819 }
820
821 #if !defined(ASTCENC_DECOMPRESS_ONLY)
822
823 /**
824 * @brief Compress an image, after any preflight has completed.
825 *
826 * @param[out] ctxo The compressor context.
827 * @param thread_index The thread index.
 * @param image The input image.
829 * @param swizzle The input swizzle.
830 * @param[out] buffer The output array for the compressed data.
831 */
compress_image(astcenc_context & ctxo,unsigned int thread_index,const astcenc_image & image,const astcenc_swizzle & swizzle,uint8_t * buffer,bool calQualityEnable,int32_t * mse[RGBA_COM])832 static void compress_image(
833 astcenc_context& ctxo,
834 unsigned int thread_index,
835 const astcenc_image& image,
836 const astcenc_swizzle& swizzle,
837 #if QUALITY_CONTROL
838 uint8_t* buffer,
839 bool calQualityEnable,
840 int32_t *mse[RGBA_COM]
841 #else
842 uint8_t* buffer
843 #endif
844 ) {
845 astcenc_contexti& ctx = ctxo.context;
846 const block_size_descriptor& bsd = *ctx.bsd;
847 astcenc_profile decode_mode = ctx.config.profile;
848
849 image_block blk;
850
851 int block_x = bsd.xdim;
852 int block_y = bsd.ydim;
853 int block_z = bsd.zdim;
854 blk.texel_count = static_cast<uint8_t>(block_x * block_y * block_z);
855
856 int dim_x = image.dim_x;
857 int dim_y = image.dim_y;
858 int dim_z = image.dim_z;
859
860 int xblocks = (dim_x + block_x - 1) / block_x;
861 int yblocks = (dim_y + block_y - 1) / block_y;
862 int zblocks = (dim_z + block_z - 1) / block_z;
863 int block_count = zblocks * yblocks * xblocks;
864
865 int row_blocks = xblocks;
866 int plane_blocks = xblocks * yblocks;
867
868 blk.decode_unorm8 = ctxo.context.config.flags & ASTCENC_FLG_USE_DECODE_UNORM8;
869
870 // Populate the block channel weights
871 blk.channel_weight = vfloat4(ctx.config.cw_r_weight,
872 ctx.config.cw_g_weight,
873 ctx.config.cw_b_weight,
874 ctx.config.cw_a_weight);
875
876 // Use preallocated scratch buffer
877 auto& temp_buffers = ctx.working_buffers[thread_index];
878
879 // Only the first thread actually runs the initializer
880 ctxo.manage_compress.init(block_count, ctx.config.progress_callback);
881
882 // Determine if we can use an optimized load function
883 bool needs_swz = (swizzle.r != ASTCENC_SWZ_R) || (swizzle.g != ASTCENC_SWZ_G) ||
884 (swizzle.b != ASTCENC_SWZ_B) || (swizzle.a != ASTCENC_SWZ_A);
885
886 bool needs_hdr = (decode_mode == ASTCENC_PRF_HDR) ||
887 (decode_mode == ASTCENC_PRF_HDR_RGB_LDR_A);
888
889 bool use_fast_load = !needs_swz && !needs_hdr &&
890 block_z == 1 && image.data_type == ASTCENC_TYPE_U8;
891
892 auto load_func = load_image_block;
893 if (use_fast_load)
894 {
895 load_func = load_image_block_fast_ldr;
896 }
897
898 // All threads run this processing loop until there is no work remaining
899 while (true)
900 {
901 unsigned int count;
902 unsigned int base = ctxo.manage_compress.get_task_assignment(16, count);
903 if (!count)
904 {
905 break;
906 }
907
908 for (unsigned int i = base; i < base + count; i++)
909 {
910 // Decode i into x, y, z block indices
911 int z = i / plane_blocks;
912 unsigned int rem = i - (z * plane_blocks);
913 int y = rem / row_blocks;
914 int x = rem - (y * row_blocks);
915
916 // Test if we can apply some basic alpha-scale RDO
917 bool use_full_block = true;
918 if (ctx.config.a_scale_radius != 0 && block_z == 1)
919 {
920 int start_x = x * block_x;
921 int end_x = astc::min(dim_x, start_x + block_x);
922
923 int start_y = y * block_y;
924 int end_y = astc::min(dim_y, start_y + block_y);
925
926 // SATs accumulate error, so don't test exactly zero. Test for
927 // less than 1 alpha in the expanded block footprint that
928 // includes the alpha radius.
929 int x_footprint = block_x + 2 * (ctx.config.a_scale_radius - 1);
930
931 int y_footprint = block_y + 2 * (ctx.config.a_scale_radius - 1);
932
933 float footprint = static_cast<float>(x_footprint * y_footprint);
934 float threshold = 0.9f / (255.0f * footprint);
935
936 // Do we have any alpha values?
937 use_full_block = false;
938 for (int ay = start_y; ay < end_y; ay++)
939 {
940 for (int ax = start_x; ax < end_x; ax++)
941 {
942 float a_avg = ctx.input_alpha_averages[ay * dim_x + ax];
943 if (a_avg > threshold)
944 {
945 use_full_block = true;
946 ax = end_x;
947 ay = end_y;
948 }
949 }
950 }
951 }
952
953 // Fetch the full block for compression
954 if (use_full_block)
955 {
956 load_func(decode_mode, image, blk, bsd, x * block_x, y * block_y, z * block_z, swizzle);
957
958 // Scale RGB error contribution by the maximum alpha in the block
959 // This encourages preserving alpha accuracy in regions with high
960 // transparency, and can buy up to 0.5 dB PSNR.
961 if (ctx.config.flags & ASTCENC_FLG_USE_ALPHA_WEIGHT)
962 {
963 float alpha_scale = blk.data_max.lane<3>() * (1.0f / 65535.0f);
964 blk.channel_weight = vfloat4(ctx.config.cw_r_weight * alpha_scale,
965 ctx.config.cw_g_weight * alpha_scale,
966 ctx.config.cw_b_weight * alpha_scale,
967 ctx.config.cw_a_weight);
968 }
969 }
970 // Apply alpha scale RDO - substitute constant color block
971 else
972 {
973 blk.origin_texel = vfloat4::zero();
974 blk.data_min = vfloat4::zero();
975 blk.data_mean = vfloat4::zero();
976 blk.data_max = vfloat4::zero();
977 blk.grayscale = true;
978 }
979
980 int offset = ((z * yblocks + y) * xblocks + x) * 16;
981 uint8_t *bp = buffer + offset;
982 #if QUALITY_CONTROL
983 int32_t *mseBlock[RGBA_COM] = {nullptr, nullptr, nullptr, nullptr};
984 if (calQualityEnable) {
985 offset = (z * yblocks + y) * xblocks + x;
986 mseBlock[R_COM] = mse[R_COM] + offset;
987 mseBlock[G_COM] = mse[G_COM] + offset;
988 mseBlock[B_COM] = mse[B_COM] + offset;
989 mseBlock[A_COM] = mse[A_COM] + offset;
990 }
991 compress_block(ctx, blk, bp, temp_buffers, calQualityEnable, mseBlock);
992 #else
993 compress_block(ctx, blk, bp, temp_buffers);
994 #endif
995 }
996
997 ctxo.manage_compress.complete_task_assignment(count);
998 }
999 }
1000
1001 /**
1002 * @brief Compute regional averages in an image.
1003 *
1004 * This function can be called by multiple threads, but only after a single
1005 * thread calls the setup function @c init_compute_averages().
1006 *
1007 * Results are written back into @c img->input_alpha_averages.
1008 *
1009 * @param[out] ctx The context.
1010 * @param ag The average and variance arguments created during setup.
1011 */
compute_averages(astcenc_context & ctx,const avg_args & ag)1012 static void compute_averages(
1013 astcenc_context& ctx,
1014 const avg_args &ag
1015 ) {
1016 pixel_region_args arg = ag.arg;
1017 arg.work_memory = new vfloat4[ag.work_memory_size];
1018
1019 int size_x = ag.img_size_x;
1020 int size_y = ag.img_size_y;
1021 int size_z = ag.img_size_z;
1022
1023 int step_xy = ag.blk_size_xy;
1024 int step_z = ag.blk_size_z;
1025
1026 int y_tasks = (size_y + step_xy - 1) / step_xy;
1027
1028 // All threads run this processing loop until there is no work remaining
1029 while (true)
1030 {
1031 unsigned int count;
1032 unsigned int base = ctx.manage_avg.get_task_assignment(16, count);
1033 if (!count)
1034 {
1035 break;
1036 }
1037
1038 for (unsigned int i = base; i < base + count; i++)
1039 {
1040 int z = (i / (y_tasks)) * step_z;
1041 int y = (i - (z * y_tasks)) * step_xy;
1042
1043 arg.size_z = astc::min(step_z, size_z - z);
1044 arg.offset_z = z;
1045
1046 arg.size_y = astc::min(step_xy, size_y - y);
1047 arg.offset_y = y;
1048
1049 for (int x = 0; x < size_x; x += step_xy)
1050 {
1051 arg.size_x = astc::min(step_xy, size_x - x);
1052 arg.offset_x = x;
1053 compute_pixel_region_variance(ctx.context, arg);
1054 }
1055 }
1056
1057 ctx.manage_avg.complete_task_assignment(count);
1058 }
1059
1060 delete[] arg.work_memory;
1061 }
1062
1063 #endif
1064
free_image_inside(astcenc_image * img)1065 static void free_image_inside(astcenc_image* img)
1066 {
1067 if (img->data != nullptr)
1068 {
1069 for (unsigned int z = 0; z < img->dim_z; z++)
1070 {
1071 delete[] reinterpret_cast<char *>(img->data[z]);
1072 img->data[z] = nullptr;
1073 }
1074 }
1075 delete[] img->data;
1076 img->data = nullptr;
1077 }
1078
convert_rgba10_to_float16(astcenc_image * imgRGBA,astcenc_image * image)1079 static void convert_rgba10_to_float16(astcenc_image* imgRGBA, astcenc_image* image)
1080 {
1081 uint32_t* src = static_cast<uint32_t *>(image->data[0]);
1082 uint16_t* dst = static_cast<uint16_t *>(imgRGBA->data[0]);
1083 for (unsigned int y = 0; y < image->dim_y; y++)
1084 {
1085 for (unsigned int x = 0; x < image->dim_x; x++)
1086 {
1087 uint32_t data_rgba = src[image->dim_stride * y + x];
1088 uint16_t data_r = data_rgba & 0x3FF;
1089 uint16_t data_g = (data_rgba >> COMP_G_SHIFT_POSITION) & 0x3FF;
1090 uint16_t data_b = (data_rgba >> COMP_B_SHIFT_POSITION) & 0x3FF;
1091 uint16_t data_a = (data_rgba >> COMP_A_SHIFT_POSITION) & 0x3;
1092 vint4 colorf16 = float_to_float16(vfloat4(data_r / 1023.0, // 1023.0: 10bit to 0-1
1093 data_g / 1023.0,
1094 data_b / 1023.0,
1095 data_a / 3.0)); // 3.0: 2bit to 0-1
1096 dst[(COMPONENT_NUM * image->dim_x * y) + (COMPONENT_NUM * x)] =
1097 static_cast<uint16_t>(colorf16.lane<0>()); // 0: R
1098 dst[(COMPONENT_NUM * image->dim_x * y) + (COMPONENT_NUM * x + 1)] = // 1: G
1099 static_cast<uint16_t>(colorf16.lane<1>()); // 1: G
1100 dst[(COMPONENT_NUM * image->dim_x * y) + (COMPONENT_NUM * x + 2)] = // 2: B
1101 static_cast<uint16_t>(colorf16.lane<2>()); // 2: B
1102 dst[(COMPONENT_NUM * image->dim_x * y) + (COMPONENT_NUM * x + 3)] = // 3: A
1103 static_cast<uint16_t>(colorf16.lane<3>()); // 3: A
1104 }
1105 }
1106 }
1107
1108 /* See header for documentation. */
astcenc_compress_image(astcenc_context * ctxo,astcenc_image * imagep,const astcenc_swizzle * swizzle,uint8_t * data_out,size_t data_len,bool calQualityEnable,int32_t * mse[RGBA_COM],unsigned int thread_index)1109 astcenc_error astcenc_compress_image(
1110 astcenc_context* ctxo,
1111 astcenc_image* imagep,
1112 const astcenc_swizzle* swizzle,
1113 uint8_t* data_out,
1114 size_t data_len,
1115 #if QUALITY_CONTROL
1116 bool calQualityEnable,
1117 int32_t *mse[RGBA_COM],
1118 #endif
1119 unsigned int thread_index
1120 ) {
1121 #if defined(ASTCENC_DECOMPRESS_ONLY)
1122 (void)ctxo;
1123 (void)imagep;
1124 (void)swizzle;
1125 (void)data_out;
1126 (void)data_len;
1127 (void)thread_index;
1128 return ASTCENC_ERR_BAD_CONTEXT;
1129 #else
1130 astcenc_contexti* ctx = &ctxo->context;
1131 astcenc_error status;
1132 astcenc_image* image = imagep;
1133
1134 astcenc_image imgRGBA = {};
1135 imgRGBA.data = nullptr;
1136 if (image->data_type == ASTCENC_TYPE_RGBA1010102)
1137 {
1138 imgRGBA.dim_x = image->dim_x;
1139 imgRGBA.dim_y = image->dim_y;
1140 imgRGBA.dim_stride = imgRGBA.dim_x;
1141 imgRGBA.dim_z = 1;
1142 imgRGBA.data_type = ASTCENC_TYPE_F16;
1143 imgRGBA.data = new(std::nothrow) void* [imgRGBA.dim_z];
1144 if (imgRGBA.data == nullptr)
1145 {
1146 return ASTCENC_ERR_OUT_OF_MEM;
1147 }
1148 imgRGBA.data[0] = new(std::nothrow)
1149 uint16_t[imgRGBA.dim_x * imgRGBA.dim_y * COMPONENT_NUM];
1150 if (imgRGBA.data[0] == nullptr)
1151 {
1152 free_image_inside(&imgRGBA);
1153 return ASTCENC_ERR_OUT_OF_MEM;
1154 }
1155 convert_rgba10_to_float16(&imgRGBA, imagep);
1156 image = &imgRGBA;
1157 }
1158
1159 if (ctx->config.flags & ASTCENC_FLG_DECOMPRESS_ONLY)
1160 {
1161 free_image_inside(&imgRGBA);
1162 return ASTCENC_ERR_BAD_CONTEXT;
1163 }
1164
1165 status = validate_compression_swizzle(*swizzle);
1166 if (status != ASTCENC_SUCCESS)
1167 {
1168 free_image_inside(&imgRGBA);
1169 return status;
1170 }
1171
1172 if (thread_index >= ctx->thread_count)
1173 {
1174 free_image_inside(&imgRGBA);
1175 return ASTCENC_ERR_BAD_PARAM;
1176 }
1177
1178 unsigned int block_x = ctx->config.block_x;
1179 unsigned int block_y = ctx->config.block_y;
1180 unsigned int block_z = ctx->config.block_z;
1181
1182 unsigned int xblocks = (image->dim_x + block_x - 1) / block_x;
1183 unsigned int yblocks = (image->dim_y + block_y - 1) / block_y;
1184 unsigned int zblocks = (image->dim_z + block_z - 1) / block_z;
1185
1186 // Check we have enough output space (16 bytes per block)
1187 size_t size_needed = xblocks * yblocks * zblocks * 16;
1188 if (data_len < size_needed)
1189 {
1190 free_image_inside(&imgRGBA);
1191 return ASTCENC_ERR_OUT_OF_MEM;
1192 }
1193
1194 // If context thread count is one then implicitly reset
1195 if (ctx->thread_count == 1)
1196 {
1197 astcenc_compress_reset(ctxo);
1198 }
1199
1200 if (ctx->config.a_scale_radius != 0)
1201 {
1202 image = imagep;
1203 free_image_inside(&imgRGBA);
1204 // First thread to enter will do setup, other threads will subsequently
1205 // enter the critical section but simply skip over the initialization
1206 auto init_avg = [ctx, &image, swizzle]() {
1207 // Perform memory allocations for the destination buffers
1208 size_t texel_count = image->dim_x * image->dim_y * image->dim_z;
1209 ctx->input_alpha_averages = new float[texel_count];
1210
1211 return init_compute_averages(
1212 *image, ctx->config.a_scale_radius, *swizzle,
1213 ctx->avg_preprocess_args);
1214 };
1215
1216 // Only the first thread actually runs the initializer
1217 ctxo->manage_avg.init(init_avg);
1218
1219 // All threads will enter this function and dynamically grab work
1220 compute_averages(*ctxo, ctx->avg_preprocess_args);
1221 }
1222
1223 // Wait for compute_averages to complete before compressing
1224 ctxo->manage_avg.wait();
1225 #if QUALITY_CONTROL
1226 compress_image(*ctxo, thread_index, *image, *swizzle, data_out, calQualityEnable, mse);
1227 #else
1228 compress_image(*ctxo, thread_index, *image, *swizzle, data_out);
1229 #endif
1230 // Wait for compress to complete before freeing memory
1231 ctxo->manage_compress.wait();
1232
1233 auto term_compress = [ctx]() {
1234 delete[] ctx->input_alpha_averages;
1235 ctx->input_alpha_averages = nullptr;
1236 };
1237
1238 // Only the first thread to arrive actually runs the term
1239 ctxo->manage_compress.term(term_compress);
1240 free_image_inside(&imgRGBA);
1241 return ASTCENC_SUCCESS;
1242 #endif
1243 }
1244
1245 /* See header for documentation. */
astcenc_compress_reset(astcenc_context * ctxo)1246 astcenc_error astcenc_compress_reset(
1247 astcenc_context* ctxo
1248 ) {
1249 #if defined(ASTCENC_DECOMPRESS_ONLY)
1250 (void)ctxo;
1251 return ASTCENC_ERR_BAD_CONTEXT;
1252 #else
1253 astcenc_contexti* ctx = &ctxo->context;
1254 if (ctx->config.flags & ASTCENC_FLG_DECOMPRESS_ONLY)
1255 {
1256 return ASTCENC_ERR_BAD_CONTEXT;
1257 }
1258
1259 ctxo->manage_avg.reset();
1260 ctxo->manage_compress.reset();
1261 return ASTCENC_SUCCESS;
1262 #endif
1263 }
1264
1265 /* See header for documentation. */
astcenc_decompress_image(astcenc_context * ctxo,const uint8_t * data,size_t data_len,astcenc_image * image_outp,const astcenc_swizzle * swizzle,unsigned int thread_index)1266 astcenc_error astcenc_decompress_image(
1267 astcenc_context* ctxo,
1268 const uint8_t* data,
1269 size_t data_len,
1270 astcenc_image* image_outp,
1271 const astcenc_swizzle* swizzle,
1272 unsigned int thread_index
1273 ) {
1274 astcenc_error status;
1275 astcenc_image& image_out = *image_outp;
1276 astcenc_contexti* ctx = &ctxo->context;
1277
1278 // Today this doesn't matter (working set on stack) but might in future ...
1279 if (thread_index >= ctx->thread_count)
1280 {
1281 return ASTCENC_ERR_BAD_PARAM;
1282 }
1283
1284 status = validate_decompression_swizzle(*swizzle);
1285 if (status != ASTCENC_SUCCESS)
1286 {
1287 return status;
1288 }
1289
1290 unsigned int block_x = ctx->config.block_x;
1291 unsigned int block_y = ctx->config.block_y;
1292 unsigned int block_z = ctx->config.block_z;
1293
1294 unsigned int xblocks = (image_out.dim_x + block_x - 1) / block_x;
1295 unsigned int yblocks = (image_out.dim_y + block_y - 1) / block_y;
1296 unsigned int zblocks = (image_out.dim_z + block_z - 1) / block_z;
1297 unsigned int block_count = zblocks * yblocks * xblocks;
1298
1299 int row_blocks = xblocks;
1300 int plane_blocks = xblocks * yblocks;
1301
1302 // Check we have enough output space (16 bytes per block)
1303 size_t size_needed = xblocks * yblocks * zblocks * 16;
1304 if (data_len < size_needed)
1305 {
1306 return ASTCENC_ERR_OUT_OF_MEM;
1307 }
1308
1309 image_block blk;
1310 blk.texel_count = static_cast<uint8_t>(block_x * block_y * block_z);
1311
1312 // Decode mode inferred from the output data type
1313 blk.decode_unorm8 = image_out.data_type == ASTCENC_TYPE_U8;
1314
1315 // If context thread count is one then implicitly reset
1316 if (ctx->thread_count == 1)
1317 {
1318 astcenc_decompress_reset(ctxo);
1319 }
1320
1321 // Only the first thread actually runs the initializer
1322 ctxo->manage_decompress.init(block_count, nullptr);
1323
1324 // All threads run this processing loop until there is no work remaining
1325 while (true)
1326 {
1327 unsigned int count;
1328 unsigned int base = ctxo->manage_decompress.get_task_assignment(128, count);
1329 if (!count)
1330 {
1331 break;
1332 }
1333
1334 for (unsigned int i = base; i < base + count; i++)
1335 {
1336 // Decode i into x, y, z block indices
1337 int z = i / plane_blocks;
1338 unsigned int rem = i - (z * plane_blocks);
1339 int y = rem / row_blocks;
1340 int x = rem - (y * row_blocks);
1341
1342 unsigned int offset = (((z * yblocks + y) * xblocks) + x) * 16;
1343 const uint8_t* bp = data + offset;
1344
1345 symbolic_compressed_block scb;
1346
1347 physical_to_symbolic(*ctx->bsd, bp, scb);
1348
1349 decompress_symbolic_block(ctx->config.profile, *ctx->bsd,
1350 x * block_x, y * block_y, z * block_z,
1351 scb, blk);
1352
1353 store_image_block(image_out, blk, *ctx->bsd,
1354 x * block_x, y * block_y, z * block_z, *swizzle);
1355 }
1356
1357 ctxo->manage_decompress.complete_task_assignment(count);
1358 }
1359
1360 return ASTCENC_SUCCESS;
1361 }
1362
1363 /* See header for documentation. */
astcenc_decompress_reset(astcenc_context * ctxo)1364 astcenc_error astcenc_decompress_reset(
1365 astcenc_context* ctxo
1366 ) {
1367 ctxo->manage_decompress.reset();
1368 return ASTCENC_SUCCESS;
1369 }
1370
1371 /* See header for documentation. */
astcenc_get_block_info(astcenc_context * ctxo,const uint8_t data[16],astcenc_block_info * info)1372 astcenc_error astcenc_get_block_info(
1373 astcenc_context* ctxo,
1374 const uint8_t data[16],
1375 astcenc_block_info* info
1376 ) {
1377 #if defined(ASTCENC_DECOMPRESS_ONLY)
1378 (void)ctxo;
1379 (void)data;
1380 (void)info;
1381 return ASTCENC_ERR_BAD_CONTEXT;
1382 #else
1383 astcenc_contexti* ctx = &ctxo->context;
1384
1385 // Decode the compressed data into a symbolic form
1386 symbolic_compressed_block scb;
1387 physical_to_symbolic(*ctx->bsd, data, scb);
1388
1389 // Fetch the appropriate partition and decimation tables
1390 block_size_descriptor& bsd = *ctx->bsd;
1391
1392 // Start from a clean slate
1393 memset(info, 0, sizeof(*info));
1394
1395 // Basic info we can always populate
1396 info->profile = ctx->config.profile;
1397
1398 info->block_x = ctx->config.block_x;
1399 info->block_y = ctx->config.block_y;
1400 info->block_z = ctx->config.block_z;
1401 info->texel_count = bsd.texel_count;
1402
1403 // Check for error blocks first
1404 info->is_error_block = scb.block_type == SYM_BTYPE_ERROR;
1405 if (info->is_error_block)
1406 {
1407 return ASTCENC_SUCCESS;
1408 }
1409
1410 // Check for constant color blocks second
1411 info->is_constant_block = scb.block_type == SYM_BTYPE_CONST_F16 ||
1412 scb.block_type == SYM_BTYPE_CONST_U16;
1413 if (info->is_constant_block)
1414 {
1415 return ASTCENC_SUCCESS;
1416 }
1417
1418 // Otherwise handle a full block ; known to be valid after conditions above have been checked
1419 int partition_count = scb.partition_count;
1420 const auto& pi = bsd.get_partition_info(partition_count, scb.partition_index);
1421
1422 const block_mode& bm = bsd.get_block_mode(scb.block_mode);
1423 const decimation_info& di = bsd.get_decimation_info(bm.decimation_mode);
1424
1425 info->weight_x = di.weight_x;
1426 info->weight_y = di.weight_y;
1427 info->weight_z = di.weight_z;
1428
1429 info->is_dual_plane_block = bm.is_dual_plane != 0;
1430
1431 info->partition_count = scb.partition_count;
1432 info->partition_index = scb.partition_index;
1433 info->dual_plane_component = scb.plane2_component;
1434
1435 info->color_level_count = get_quant_level(scb.get_color_quant_mode());
1436 info->weight_level_count = get_quant_level(bm.get_weight_quant_mode());
1437
1438 // Unpack color endpoints for each active partition
1439 for (unsigned int i = 0; i < scb.partition_count; i++)
1440 {
1441 bool rgb_hdr;
1442 bool a_hdr;
1443 vint4 endpnt[2];
1444
1445 unpack_color_endpoints(ctx->config.profile,
1446 scb.color_formats[i],
1447 scb.color_values[i],
1448 rgb_hdr, a_hdr,
1449 endpnt[0], endpnt[1]);
1450
1451 // Store the color endpoint mode info
1452 info->color_endpoint_modes[i] = scb.color_formats[i];
1453 info->is_hdr_block = info->is_hdr_block || rgb_hdr || a_hdr;
1454
1455 // Store the unpacked and decoded color endpoint
1456 vmask4 hdr_mask(rgb_hdr, rgb_hdr, rgb_hdr, a_hdr);
1457 for (int j = 0; j < 2; j++)
1458 {
1459 vint4 color_lns = lns_to_sf16(endpnt[j]);
1460 vint4 color_unorm = unorm16_to_sf16(endpnt[j]);
1461 vint4 datai = select(color_unorm, color_lns, hdr_mask);
1462 store(float16_to_float(datai), info->color_endpoints[i][j]);
1463 }
1464 }
1465
1466 // Unpack weights for each texel
1467 int weight_plane1[BLOCK_MAX_TEXELS];
1468 int weight_plane2[BLOCK_MAX_TEXELS];
1469
1470 unpack_weights(bsd, scb, di, bm.is_dual_plane, weight_plane1, weight_plane2);
1471 for (unsigned int i = 0; i < bsd.texel_count; i++)
1472 {
1473 info->weight_values_plane1[i] = static_cast<float>(weight_plane1[i]) * (1.0f / WEIGHTS_TEXEL_SUM);
1474 if (info->is_dual_plane_block)
1475 {
1476 info->weight_values_plane2[i] = static_cast<float>(weight_plane2[i]) * (1.0f / WEIGHTS_TEXEL_SUM);
1477 }
1478 }
1479
1480 // Unpack partition assignments for each texel
1481 for (unsigned int i = 0; i < bsd.texel_count; i++)
1482 {
1483 info->partition_assignment[i] = pi.partition_of_texel[i];
1484 }
1485
1486 return ASTCENC_SUCCESS;
1487 #endif
1488 }
1489
1490 /* See header for documentation. */
astcenc_get_error_string(astcenc_error status)1491 const char* astcenc_get_error_string(
1492 astcenc_error status
1493 ) {
1494 // Values in this enum are from an external user, so not guaranteed to be
1495 // bounded to the enum values
1496 switch (static_cast<int>(status))
1497 {
1498 case ASTCENC_SUCCESS:
1499 return "ASTCENC_SUCCESS";
1500 case ASTCENC_ERR_OUT_OF_MEM:
1501 return "ASTCENC_ERR_OUT_OF_MEM";
1502 case ASTCENC_ERR_BAD_CPU_FLOAT:
1503 return "ASTCENC_ERR_BAD_CPU_FLOAT";
1504 case ASTCENC_ERR_BAD_PARAM:
1505 return "ASTCENC_ERR_BAD_PARAM";
1506 case ASTCENC_ERR_BAD_BLOCK_SIZE:
1507 return "ASTCENC_ERR_BAD_BLOCK_SIZE";
1508 case ASTCENC_ERR_BAD_PROFILE:
1509 return "ASTCENC_ERR_BAD_PROFILE";
1510 case ASTCENC_ERR_BAD_QUALITY:
1511 return "ASTCENC_ERR_BAD_QUALITY";
1512 case ASTCENC_ERR_BAD_FLAGS:
1513 return "ASTCENC_ERR_BAD_FLAGS";
1514 case ASTCENC_ERR_BAD_SWIZZLE:
1515 return "ASTCENC_ERR_BAD_SWIZZLE";
1516 case ASTCENC_ERR_BAD_CONTEXT:
1517 return "ASTCENC_ERR_BAD_CONTEXT";
1518 case ASTCENC_ERR_NOT_IMPLEMENTED:
1519 return "ASTCENC_ERR_NOT_IMPLEMENTED";
1520 case ASTCENC_ERR_BAD_DECODE_MODE:
1521 return "ASTCENC_ERR_BAD_DECODE_MODE";
1522 #if defined(ASTCENC_DIAGNOSTICS)
1523 case ASTCENC_ERR_DTRACE_FAILURE:
1524 return "ASTCENC_ERR_DTRACE_FAILURE";
1525 #endif
1526 default:
1527 return nullptr;
1528 }
1529 }
1530