1 // SPDX-License-Identifier: Apache-2.0
2 // ----------------------------------------------------------------------------
3 // Copyright 2011-2024 Arm Limited
4 //
5 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
6 // use this file except in compliance with the License. You may obtain a copy
7 // of the License at:
8 //
9 // http://www.apache.org/licenses/LICENSE-2.0
10 //
11 // Unless required by applicable law or agreed to in writing, software
12 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
13 // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
14 // License for the specific language governing permissions and limitations
15 // under the License.
16 // ----------------------------------------------------------------------------
17
18 #if !defined(ASTCENC_DECOMPRESS_ONLY)
19
20 /**
21 * @brief Functions to compress a symbolic block.
22 */
23
24 #include "astcenc_internal.h"
25 #include "astcenc_diagnostic_trace.h"
26
27 #include <cassert>
28 #ifdef ASTC_CUSTOMIZED_ENABLE
29 AstcCustomizedSoManager g_astcCustomizedSoManager;
30 #endif
31
32 /**
33 * @brief Merge two planes of endpoints into a single vector.
34 *
35 * @param ep_plane1 The endpoints for plane 1.
36 * @param ep_plane2 The endpoints for plane 2.
37 * @param component_plane2 The color component for plane 2.
38 * @param[out] result The merged output.
39 */
merge_endpoints(const endpoints & ep_plane1,const endpoints & ep_plane2,unsigned int component_plane2,endpoints & result)40 static void merge_endpoints(
41 const endpoints& ep_plane1,
42 const endpoints& ep_plane2,
43 unsigned int component_plane2,
44 endpoints& result
45 ) {
46 unsigned int partition_count = ep_plane1.partition_count;
47 assert(partition_count == 1);
48
49 vmask4 sep_mask = vint4::lane_id() == vint4(component_plane2);
50
51 result.partition_count = partition_count;
52 result.endpt0[0] = select(ep_plane1.endpt0[0], ep_plane2.endpt0[0], sep_mask);
53 result.endpt1[0] = select(ep_plane1.endpt1[0], ep_plane2.endpt1[0], sep_mask);
54 }
55
56 /**
57 * @brief Attempt to improve weights given a chosen configuration.
58 *
59 * Given a fixed weight grid decimation and weight value quantization, iterate over all weights (per
60 * partition and per plane) and attempt to improve image quality by moving each weight up by one or
61 * down by one quantization step.
62 *
63 * This is a specialized function which only supports operating on undecimated weight grids,
64 * therefore primarily improving the performance of 4x4 and 5x5 blocks where grid decimation
65 * is needed less often.
66 *
67 * @param decode_mode The decode mode (LDR, HDR).
68 * @param bsd The block size information.
69 * @param blk The image block color data to compress.
70 * @param[out] scb The symbolic compressed block output.
71 */
realign_weights_undecimated(astcenc_profile decode_mode,const block_size_descriptor & bsd,const image_block & blk,symbolic_compressed_block & scb)72 static bool realign_weights_undecimated(
73 astcenc_profile decode_mode,
74 const block_size_descriptor& bsd,
75 const image_block& blk,
76 symbolic_compressed_block& scb
77 ) {
78 // Get the partition descriptor
79 unsigned int partition_count = scb.partition_count;
80 const auto& pi = bsd.get_partition_info(partition_count, scb.partition_index);
81
82 // Get the quantization table
83 const block_mode& bm = bsd.get_block_mode(scb.block_mode);
84 unsigned int weight_quant_level = bm.quant_mode;
85 const quant_and_transfer_table& qat = quant_and_xfer_tables[weight_quant_level];
86
87 unsigned int max_plane = bm.is_dual_plane;
88 int plane2_component = scb.plane2_component;
89 vmask4 plane_mask = vint4::lane_id() == vint4(plane2_component);
90
91 // Decode the color endpoints
92 bool rgb_hdr;
93 bool alpha_hdr;
94 vint4 endpnt0[BLOCK_MAX_PARTITIONS];
95 vint4 endpnt1[BLOCK_MAX_PARTITIONS];
96 vfloat4 endpnt0f[BLOCK_MAX_PARTITIONS];
97 vfloat4 offset[BLOCK_MAX_PARTITIONS];
98
99 promise(partition_count > 0);
100
101 for (unsigned int pa_idx = 0; pa_idx < partition_count; pa_idx++)
102 {
103 unpack_color_endpoints(decode_mode,
104 scb.color_formats[pa_idx],
105 scb.color_values[pa_idx],
106 rgb_hdr, alpha_hdr,
107 endpnt0[pa_idx],
108 endpnt1[pa_idx]);
109 }
110
111 uint8_t* dec_weights_uquant = scb.weights;
112 bool adjustments = false;
113
114 // For each plane and partition ...
115 for (unsigned int pl_idx = 0; pl_idx <= max_plane; pl_idx++)
116 {
117 for (unsigned int pa_idx = 0; pa_idx < partition_count; pa_idx++)
118 {
119 // Compute the endpoint delta for all components in current plane
120 vint4 epd = endpnt1[pa_idx] - endpnt0[pa_idx];
121 epd = select(epd, vint4::zero(), plane_mask);
122
123 endpnt0f[pa_idx] = int_to_float(endpnt0[pa_idx]);
124 offset[pa_idx] = int_to_float(epd) * (1.0f / 64.0f);
125 }
126
127 // For each weight compute previous, current, and next errors
128 promise(bsd.texel_count > 0);
129 for (unsigned int texel = 0; texel < bsd.texel_count; texel++)
130 {
131 int uqw = dec_weights_uquant[texel];
132
133 uint32_t prev_and_next = qat.prev_next_values[uqw];
134 int uqw_down = prev_and_next & 0xFF;
135 int uqw_up = (prev_and_next >> 8) & 0xFF;
136
137 // Interpolate the colors to create the diffs
138 float weight_base = static_cast<float>(uqw);
139 float weight_down = static_cast<float>(uqw_down - uqw);
140 float weight_up = static_cast<float>(uqw_up - uqw);
141
142 unsigned int partition = pi.partition_of_texel[texel];
143 vfloat4 color_offset = offset[partition];
144 vfloat4 color_base = endpnt0f[partition];
145
146 vfloat4 color = color_base + color_offset * weight_base;
147 vfloat4 orig_color = blk.texel(texel);
148 vfloat4 error_weight = blk.channel_weight;
149
150 vfloat4 color_diff = color - orig_color;
151 vfloat4 color_diff_down = color_diff + color_offset * weight_down;
152 vfloat4 color_diff_up = color_diff + color_offset * weight_up;
153
154 float error_base = dot_s(color_diff * color_diff, error_weight);
155 float error_down = dot_s(color_diff_down * color_diff_down, error_weight);
156 float error_up = dot_s(color_diff_up * color_diff_up, error_weight);
157
158 // Check if the prev or next error is better, and if so use it
159 if ((error_up < error_base) && (error_up < error_down) && (uqw < 64))
160 {
161 dec_weights_uquant[texel] = static_cast<uint8_t>(uqw_up);
162 adjustments = true;
163 }
164 else if ((error_down < error_base) && (uqw > 0))
165 {
166 dec_weights_uquant[texel] = static_cast<uint8_t>(uqw_down);
167 adjustments = true;
168 }
169 }
170
171 // Prepare iteration for plane 2
172 dec_weights_uquant += WEIGHTS_PLANE2_OFFSET;
173 plane_mask = ~plane_mask;
174 }
175
176 return adjustments;
177 }
178
179 /**
180 * @brief Attempt to improve weights given a chosen configuration.
181 *
182 * Given a fixed weight grid decimation and weight value quantization, iterate over all weights (per
183 * partition and per plane) and attempt to improve image quality by moving each weight up by one or
184 * down by one quantization step.
185 *
186 * @param decode_mode The decode mode (LDR, HDR).
187 * @param bsd The block size information.
188 * @param blk The image block color data to compress.
189 * @param[out] scb The symbolic compressed block output.
190 */
realign_weights_decimated(astcenc_profile decode_mode,const block_size_descriptor & bsd,const image_block & blk,symbolic_compressed_block & scb)191 static bool realign_weights_decimated(
192 astcenc_profile decode_mode,
193 const block_size_descriptor& bsd,
194 const image_block& blk,
195 symbolic_compressed_block& scb
196 ) {
197 // Get the partition descriptor
198 unsigned int partition_count = scb.partition_count;
199 const auto& pi = bsd.get_partition_info(partition_count, scb.partition_index);
200
201 // Get the quantization table
202 const block_mode& bm = bsd.get_block_mode(scb.block_mode);
203 unsigned int weight_quant_level = bm.quant_mode;
204 const quant_and_transfer_table& qat = quant_and_xfer_tables[weight_quant_level];
205
206 // Get the decimation table
207 const decimation_info& di = bsd.get_decimation_info(bm.decimation_mode);
208 unsigned int weight_count = di.weight_count;
209 assert(weight_count != bsd.texel_count);
210
211 unsigned int max_plane = bm.is_dual_plane;
212 int plane2_component = scb.plane2_component;
213 vmask4 plane_mask = vint4::lane_id() == vint4(plane2_component);
214
215 // Decode the color endpoints
216 bool rgb_hdr;
217 bool alpha_hdr;
218 vint4 endpnt0[BLOCK_MAX_PARTITIONS];
219 vint4 endpnt1[BLOCK_MAX_PARTITIONS];
220 vfloat4 endpnt0f[BLOCK_MAX_PARTITIONS];
221 vfloat4 offset[BLOCK_MAX_PARTITIONS];
222
223 promise(partition_count > 0);
224 promise(weight_count > 0);
225
226 for (unsigned int pa_idx = 0; pa_idx < partition_count; pa_idx++)
227 {
228 unpack_color_endpoints(decode_mode,
229 scb.color_formats[pa_idx],
230 scb.color_values[pa_idx],
231 rgb_hdr, alpha_hdr,
232 endpnt0[pa_idx],
233 endpnt1[pa_idx]);
234 }
235
236 uint8_t* dec_weights_uquant = scb.weights;
237 bool adjustments = false;
238
239 // For each plane and partition ...
240 for (unsigned int pl_idx = 0; pl_idx <= max_plane; pl_idx++)
241 {
242 for (unsigned int pa_idx = 0; pa_idx < partition_count; pa_idx++)
243 {
244 // Compute the endpoint delta for all components in current plane
245 vint4 epd = endpnt1[pa_idx] - endpnt0[pa_idx];
246 epd = select(epd, vint4::zero(), plane_mask);
247
248 endpnt0f[pa_idx] = int_to_float(endpnt0[pa_idx]);
249 offset[pa_idx] = int_to_float(epd) * (1.0f / 64.0f);
250 }
251
252 // Create an unquantized weight grid for this decimation level
253 ASTCENC_ALIGNAS float uq_weightsf[BLOCK_MAX_WEIGHTS];
254 for (unsigned int we_idx = 0; we_idx < weight_count; we_idx += ASTCENC_SIMD_WIDTH)
255 {
256 vint unquant_value(dec_weights_uquant + we_idx);
257 vfloat unquant_valuef = int_to_float(unquant_value);
258 storea(unquant_valuef, uq_weightsf + we_idx);
259 }
260
261 // For each weight compute previous, current, and next errors
262 for (unsigned int we_idx = 0; we_idx < weight_count; we_idx++)
263 {
264 int uqw = dec_weights_uquant[we_idx];
265 uint32_t prev_and_next = qat.prev_next_values[uqw];
266
267 float uqw_base = uq_weightsf[we_idx];
268 float uqw_down = static_cast<float>(prev_and_next & 0xFF);
269 float uqw_up = static_cast<float>((prev_and_next >> 8) & 0xFF);
270
271 float uqw_diff_down = uqw_down - uqw_base;
272 float uqw_diff_up = uqw_up - uqw_base;
273
274 vfloat4 error_basev = vfloat4::zero();
275 vfloat4 error_downv = vfloat4::zero();
276 vfloat4 error_upv = vfloat4::zero();
277
278 // Interpolate the colors to create the diffs
279 unsigned int texels_to_evaluate = di.weight_texel_count[we_idx];
280 promise(texels_to_evaluate > 0);
281 for (unsigned int te_idx = 0; te_idx < texels_to_evaluate; te_idx++)
282 {
283 unsigned int texel = di.weight_texels_tr[te_idx][we_idx];
284
285 float tw_base = di.texel_contrib_for_weight[te_idx][we_idx];
286
287 float weight_base = (uq_weightsf[di.texel_weights_tr[0][texel]] * di.texel_weight_contribs_float_tr[0][texel]
288 + uq_weightsf[di.texel_weights_tr[1][texel]] * di.texel_weight_contribs_float_tr[1][texel])
289 + (uq_weightsf[di.texel_weights_tr[2][texel]] * di.texel_weight_contribs_float_tr[2][texel]
290 + uq_weightsf[di.texel_weights_tr[3][texel]] * di.texel_weight_contribs_float_tr[3][texel]);
291
292 // Ideally this is integer rounded, but IQ gain it isn't worth the overhead
293 // float weight = astc::flt_rd(weight_base + 0.5f);
294 // float weight_down = astc::flt_rd(weight_base + 0.5f + uqw_diff_down * tw_base) - weight;
295 // float weight_up = astc::flt_rd(weight_base + 0.5f + uqw_diff_up * tw_base) - weight;
296 float weight_down = weight_base + uqw_diff_down * tw_base - weight_base;
297 float weight_up = weight_base + uqw_diff_up * tw_base - weight_base;
298
299 unsigned int partition = pi.partition_of_texel[texel];
300 vfloat4 color_offset = offset[partition];
301 vfloat4 color_base = endpnt0f[partition];
302
303 vfloat4 color = color_base + color_offset * weight_base;
304 vfloat4 orig_color = blk.texel(texel);
305
306 vfloat4 color_diff = color - orig_color;
307 vfloat4 color_down_diff = color_diff + color_offset * weight_down;
308 vfloat4 color_up_diff = color_diff + color_offset * weight_up;
309
310 error_basev += color_diff * color_diff;
311 error_downv += color_down_diff * color_down_diff;
312 error_upv += color_up_diff * color_up_diff;
313 }
314
315 vfloat4 error_weight = blk.channel_weight;
316 float error_base = hadd_s(error_basev * error_weight);
317 float error_down = hadd_s(error_downv * error_weight);
318 float error_up = hadd_s(error_upv * error_weight);
319
320 // Check if the prev or next error is better, and if so use it
321 if ((error_up < error_base) && (error_up < error_down) && (uqw < 64))
322 {
323 uq_weightsf[we_idx] = uqw_up;
324 dec_weights_uquant[we_idx] = static_cast<uint8_t>(uqw_up);
325 adjustments = true;
326 }
327 else if ((error_down < error_base) && (uqw > 0))
328 {
329 uq_weightsf[we_idx] = uqw_down;
330 dec_weights_uquant[we_idx] = static_cast<uint8_t>(uqw_down);
331 adjustments = true;
332 }
333 }
334
335 // Prepare iteration for plane 2
336 dec_weights_uquant += WEIGHTS_PLANE2_OFFSET;
337 plane_mask = ~plane_mask;
338 }
339
340 return adjustments;
341 }
342
343 /**
344 * @brief Compress a block using a chosen partitioning and 1 plane of weights.
345 *
346 * @param config The compressor configuration.
347 * @param bsd The block size information.
348 * @param blk The image block color data to compress.
349 * @param only_always True if we only use "always" percentile block modes.
350 * @param tune_errorval_threshold The error value threshold.
351 * @param partition_count The partition count.
352 * @param partition_index The partition index if @c partition_count is 2-4.
353 * @param[out] scb The symbolic compressed block output.
354 * @param[out] tmpbuf The quantized weights for plane 1.
355 */
compress_symbolic_block_for_partition_1plane(QualityProfile privateProfile,const astcenc_config & config,const block_size_descriptor & bsd,const image_block & blk,bool only_always,float tune_errorval_threshold,unsigned int partition_count,unsigned int partition_index,symbolic_compressed_block & scb,compression_working_buffers & tmpbuf,int quant_limit)356 static float compress_symbolic_block_for_partition_1plane(
357 QualityProfile privateProfile,
358 const astcenc_config& config,
359 const block_size_descriptor& bsd,
360 const image_block& blk,
361 bool only_always,
362 float tune_errorval_threshold,
363 unsigned int partition_count,
364 unsigned int partition_index,
365 symbolic_compressed_block& scb,
366 compression_working_buffers& tmpbuf,
367 int quant_limit
368 ) {
369 promise(partition_count > 0);
370 promise(config.tune_candidate_limit > 0);
371 promise(config.tune_refinement_limit > 0);
372
373 int max_weight_quant = astc::min(static_cast<int>(QUANT_32), quant_limit);
374
375 auto compute_difference = &compute_symbolic_block_difference_1plane;
376 if ((partition_count == 1) && !(config.flags & ASTCENC_FLG_MAP_RGBM))
377 {
378 compute_difference = &compute_symbolic_block_difference_1plane_1partition;
379 }
380
381 const auto& pi = bsd.get_partition_info(partition_count, partition_index);
382
383 // Compute ideal weights and endpoint colors, with no quantization or decimation
384 endpoints_and_weights& ei = tmpbuf.ei1;
385 compute_ideal_colors_and_weights_1plane(blk, pi, ei);
386
387 // Compute ideal weights and endpoint colors for every decimation
388 float* dec_weights_ideal = tmpbuf.dec_weights_ideal;
389 uint8_t* dec_weights_uquant = tmpbuf.dec_weights_uquant;
390
391 // For each decimation mode, compute an ideal set of weights with no quantization
392 unsigned int max_decimation_modes = only_always ? bsd.decimation_mode_count_always
393 : bsd.decimation_mode_count_selected;
394 promise(max_decimation_modes > 0);
395 for (unsigned int i = 0; i < max_decimation_modes; i++)
396 {
397 const auto& dm = bsd.get_decimation_mode(i);
398 if (!dm.is_ref_1plane(static_cast<quant_method>(max_weight_quant)))
399 {
400 continue;
401 }
402
403 const auto& di = bsd.get_decimation_info(i);
404
405 compute_ideal_weights_for_decimation(
406 ei,
407 di,
408 dec_weights_ideal + i * BLOCK_MAX_WEIGHTS);
409 }
410
411 // Compute maximum colors for the endpoints and ideal weights, then for each endpoint and ideal
412 // weight pair, compute the smallest weight that will result in a color value greater than 1
413 vfloat4 min_ep(10.0f);
414 for (unsigned int i = 0; i < partition_count; i++)
415 {
416 vfloat4 ep = (vfloat4(1.0f) - ei.ep.endpt0[i]) / (ei.ep.endpt1[i] - ei.ep.endpt0[i]);
417
418 vmask4 use_ep = (ep > vfloat4(0.5f)) & (ep < min_ep);
419 min_ep = select(min_ep, ep, use_ep);
420 }
421
422 float min_wt_cutoff = hmin_s(min_ep);
423
424 // For each mode, use the angular method to compute a shift
425 compute_angular_endpoints_1plane(
426 only_always, bsd, dec_weights_ideal, max_weight_quant, tmpbuf);
427
428 float* weight_low_value = tmpbuf.weight_low_value1;
429 float* weight_high_value = tmpbuf.weight_high_value1;
430 int8_t* qwt_bitcounts = tmpbuf.qwt_bitcounts;
431 float* qwt_errors = tmpbuf.qwt_errors;
432
433 // For each mode (which specifies a decimation and a quantization):
434 // * Compute number of bits needed for the quantized weights
435 // * Generate an optimized set of quantized weights
436 // * Compute quantization errors for the mode
437
438
439 static const int8_t free_bits_for_partition_count[4] {
440 115 - 4, 111 - 4 - PARTITION_INDEX_BITS, 108 - 4 - PARTITION_INDEX_BITS, 105 - 4 - PARTITION_INDEX_BITS
441 };
442
443 unsigned int max_block_modes = only_always ? bsd.block_mode_count_1plane_always
444 : bsd.block_mode_count_1plane_selected;
445 promise(max_block_modes > 0);
446 for (unsigned int i = 0; i < max_block_modes; i++)
447 {
448 const block_mode& bm = bsd.block_modes[i];
449
450 if (bm.quant_mode > max_weight_quant)
451 {
452 qwt_errors[i] = 1e38f;
453 continue;
454 }
455
456 assert(!bm.is_dual_plane);
457 int bitcount = free_bits_for_partition_count[partition_count - 1] - bm.weight_bits;
458 if (bitcount <= 0)
459 {
460 qwt_errors[i] = 1e38f;
461 continue;
462 }
463
464 if (weight_high_value[i] > 1.02f * min_wt_cutoff)
465 {
466 weight_high_value[i] = 1.0f;
467 }
468
469 int decimation_mode = bm.decimation_mode;
470 const auto& di = bsd.get_decimation_info(decimation_mode);
471
472 qwt_bitcounts[i] = static_cast<int8_t>(bitcount);
473
474 ASTCENC_ALIGNAS float dec_weights_uquantf[BLOCK_MAX_WEIGHTS];
475
476 // Generate the optimized set of weights for the weight mode
477 compute_quantized_weights_for_decimation(
478 di,
479 weight_low_value[i], weight_high_value[i],
480 dec_weights_ideal + BLOCK_MAX_WEIGHTS * decimation_mode,
481 dec_weights_uquantf,
482 dec_weights_uquant + BLOCK_MAX_WEIGHTS * i,
483 bm.get_weight_quant_mode());
484
485 // Compute weight quantization errors for the block mode
486 qwt_errors[i] = compute_error_of_weight_set_1plane(
487 ei,
488 di,
489 dec_weights_uquantf);
490 }
491
492 // Decide the optimal combination of color endpoint encodings and weight encodings
493 uint8_t partition_format_specifiers[TUNE_MAX_TRIAL_CANDIDATES][BLOCK_MAX_PARTITIONS];
494 int block_mode_index[TUNE_MAX_TRIAL_CANDIDATES];
495
496 quant_method color_quant_level[TUNE_MAX_TRIAL_CANDIDATES];
497 quant_method color_quant_level_mod[TUNE_MAX_TRIAL_CANDIDATES];
498
499 unsigned int candidate_count = compute_ideal_endpoint_formats(
500 privateProfile,
501 pi, blk, ei.ep, qwt_bitcounts, qwt_errors,
502 config.tune_candidate_limit, 0, max_block_modes,
503 partition_format_specifiers, block_mode_index,
504 color_quant_level, color_quant_level_mod, tmpbuf);
505
506 // Iterate over the N believed-to-be-best modes to find out which one is actually best
507 float best_errorval_in_mode = ERROR_CALC_DEFAULT;
508 float best_errorval_in_scb = scb.errorval;
509
510 for (unsigned int i = 0; i < candidate_count; i++)
511 {
512 TRACE_NODE(node0, "candidate");
513
514 const int bm_packed_index = block_mode_index[i];
515 assert(bm_packed_index >= 0 && bm_packed_index < static_cast<int>(bsd.block_mode_count_1plane_selected));
516 const block_mode& qw_bm = bsd.block_modes[bm_packed_index];
517
518 int decimation_mode = qw_bm.decimation_mode;
519 const auto& di = bsd.get_decimation_info(decimation_mode);
520 promise(di.weight_count > 0);
521
522 trace_add_data("weight_x", di.weight_x);
523 trace_add_data("weight_y", di.weight_y);
524 trace_add_data("weight_z", di.weight_z);
525 trace_add_data("weight_quant", qw_bm.quant_mode);
526
527 // Recompute the ideal color endpoints before storing them
528 vfloat4 rgbs_colors[BLOCK_MAX_PARTITIONS];
529 vfloat4 rgbo_colors[BLOCK_MAX_PARTITIONS];
530
531 symbolic_compressed_block workscb;
532 endpoints workep = ei.ep;
533
534 uint8_t* u8_weight_src = dec_weights_uquant + BLOCK_MAX_WEIGHTS * bm_packed_index;
535
536 for (unsigned int j = 0; j < di.weight_count; j++)
537 {
538 workscb.weights[j] = u8_weight_src[j];
539 }
540
541 for (unsigned int l = 0; l < config.tune_refinement_limit; l++)
542 {
543 recompute_ideal_colors_1plane(
544 blk, pi, di, workscb.weights,
545 workep, rgbs_colors, rgbo_colors);
546
547 // Quantize the chosen color, tracking if worth trying the mod value
548 bool all_same = color_quant_level[i] != color_quant_level_mod[i];
549 for (unsigned int j = 0; j < partition_count; j++)
550 {
551 workscb.color_formats[j] = pack_color_endpoints(
552 privateProfile,
553 workep.endpt0[j],
554 workep.endpt1[j],
555 rgbs_colors[j],
556 rgbo_colors[j],
557 partition_format_specifiers[i][j],
558 workscb.color_values[j],
559 color_quant_level[i]);
560
561 all_same = all_same && workscb.color_formats[j] == workscb.color_formats[0];
562 }
563
564 // If all the color endpoint modes are the same, we get a few more bits to store colors;
565 // let's see if we can take advantage of this: requantize all the colors and see if the
566 // endpoint modes remain the same.
567 workscb.color_formats_matched = 0;
568 if (partition_count >= 2 && all_same)
569 {
570 uint8_t colorvals[BLOCK_MAX_PARTITIONS][8];
571 uint8_t color_formats_mod[BLOCK_MAX_PARTITIONS] { 0 };
572 bool all_same_mod = true;
573 for (unsigned int j = 0; j < partition_count; j++)
574 {
575 color_formats_mod[j] = pack_color_endpoints(
576 privateProfile,
577 workep.endpt0[j],
578 workep.endpt1[j],
579 rgbs_colors[j],
580 rgbo_colors[j],
581 partition_format_specifiers[i][j],
582 colorvals[j],
583 color_quant_level_mod[i]);
584
585 // Early out as soon as it's no longer possible to use mod
586 if (color_formats_mod[j] != color_formats_mod[0])
587 {
588 all_same_mod = false;
589 break;
590 }
591 }
592
593 if (all_same_mod)
594 {
595 workscb.color_formats_matched = 1;
596 for (unsigned int j = 0; j < BLOCK_MAX_PARTITIONS; j++)
597 {
598 for (unsigned int k = 0; k < 8; k++)
599 {
600 workscb.color_values[j][k] = colorvals[j][k];
601 }
602
603 workscb.color_formats[j] = color_formats_mod[j];
604 }
605 }
606 }
607
608 // Store header fields
609 workscb.partition_count = static_cast<uint8_t>(partition_count);
610 workscb.partition_index = static_cast<uint16_t>(partition_index);
611 workscb.plane2_component = -1;
612 workscb.quant_mode = workscb.color_formats_matched ? color_quant_level_mod[i] : color_quant_level[i];
613 workscb.block_mode = qw_bm.mode_index;
614 workscb.block_type = SYM_BTYPE_NONCONST;
615 if (privateProfile == HIGH_SPEED_PROFILE)
616 {
617 workscb.errorval = 0;
618 scb = workscb;
619 break;
620 }
621 // Pre-realign test
622 if (l == 0)
623 {
624 float errorval = compute_difference(config, bsd, workscb, blk);
625 if (errorval == -ERROR_CALC_DEFAULT)
626 {
627 errorval = -errorval;
628 workscb.block_type = SYM_BTYPE_ERROR;
629 }
630
631 trace_add_data("error_prerealign", errorval);
632 best_errorval_in_mode = astc::min(errorval, best_errorval_in_mode);
633
634 // Average refinement improvement is 3.5% per iteration (allow 4.5%), but the first
635 // iteration can help more so we give it a extra 8% leeway. Use this knowledge to
636 // drive a heuristic to skip blocks that are unlikely to catch up with the best
637 // block we have already.
638 unsigned int iters_remaining = config.tune_refinement_limit - l;
639 float threshold = (0.045f * static_cast<float>(iters_remaining)) + 1.08f;
640 if (errorval > (threshold * best_errorval_in_scb))
641 {
642 break;
643 }
644
645 if (errorval < best_errorval_in_scb)
646 {
647 best_errorval_in_scb = errorval;
648 workscb.errorval = errorval;
649 scb = workscb;
650
651 if (errorval < tune_errorval_threshold)
652 {
653 // Skip remaining candidates - this is "good enough"
654 i = candidate_count;
655 break;
656 }
657 }
658 }
659
660 bool adjustments;
661 if (di.weight_count != bsd.texel_count)
662 {
663 adjustments = realign_weights_decimated(
664 config.profile, bsd, blk, workscb);
665 }
666 else
667 {
668 adjustments = realign_weights_undecimated(
669 config.profile, bsd, blk, workscb);
670 }
671
672 // Post-realign test
673 float errorval = compute_difference(config, bsd, workscb, blk);
674 if (errorval == -ERROR_CALC_DEFAULT)
675 {
676 errorval = -errorval;
677 workscb.block_type = SYM_BTYPE_ERROR;
678 }
679
680 trace_add_data("error_postrealign", errorval);
681 best_errorval_in_mode = astc::min(errorval, best_errorval_in_mode);
682
683 // Average refinement improvement is 3.5% per iteration, so skip blocks that are
684 // unlikely to catch up with the best block we have already. Assume a 4.5% per step to
685 // give benefit of the doubt ...
686 unsigned int iters_remaining = config.tune_refinement_limit - 1 - l;
687 float threshold = (0.045f * static_cast<float>(iters_remaining)) + 1.0f;
688 if (errorval > (threshold * best_errorval_in_scb))
689 {
690 break;
691 }
692
693 if (errorval < best_errorval_in_scb)
694 {
695 best_errorval_in_scb = errorval;
696 workscb.errorval = errorval;
697 scb = workscb;
698
699 if (errorval < tune_errorval_threshold)
700 {
701 // Skip remaining candidates - this is "good enough"
702 i = candidate_count;
703 break;
704 }
705 }
706
707 if (!adjustments)
708 {
709 break;
710 }
711 }
712 }
713
714 return best_errorval_in_mode;
715 }
716
717 /**
718 * @brief Compress a block using a chosen partitioning and 2 planes of weights.
719 *
720 * @param config The compressor configuration.
721 * @param bsd The block size information.
722 * @param blk The image block color data to compress.
723 * @param tune_errorval_threshold The error value threshold.
724 * @param plane2_component The component index for the second plane of weights.
725 * @param[out] scb The symbolic compressed block output.
726 * @param[out] tmpbuf The quantized weights for plane 1.
727 */
compress_symbolic_block_for_partition_2planes(QualityProfile privateProfile,const astcenc_config & config,const block_size_descriptor & bsd,const image_block & blk,float tune_errorval_threshold,unsigned int plane2_component,symbolic_compressed_block & scb,compression_working_buffers & tmpbuf,int quant_limit)728 static float compress_symbolic_block_for_partition_2planes(
729 QualityProfile privateProfile,
730 const astcenc_config& config,
731 const block_size_descriptor& bsd,
732 const image_block& blk,
733 float tune_errorval_threshold,
734 unsigned int plane2_component,
735 symbolic_compressed_block& scb,
736 compression_working_buffers& tmpbuf,
737 int quant_limit
738 ) {
739 promise(config.tune_candidate_limit > 0);
740 promise(config.tune_refinement_limit > 0);
741 promise(bsd.decimation_mode_count_selected > 0);
742
743 int max_weight_quant = astc::min(static_cast<int>(QUANT_32), quant_limit);
744
745 // Compute ideal weights and endpoint colors, with no quantization or decimation
746 endpoints_and_weights& ei1 = tmpbuf.ei1;
747 endpoints_and_weights& ei2 = tmpbuf.ei2;
748
749 compute_ideal_colors_and_weights_2planes(bsd, blk, plane2_component, ei1, ei2);
750
751 // Compute ideal weights and endpoint colors for every decimation
752 float* dec_weights_ideal = tmpbuf.dec_weights_ideal;
753 uint8_t* dec_weights_uquant = tmpbuf.dec_weights_uquant;
754
755 // For each decimation mode, compute an ideal set of weights with no quantization
756 for (unsigned int i = 0; i < bsd.decimation_mode_count_selected; i++)
757 {
758 const auto& dm = bsd.get_decimation_mode(i);
759 if (!dm.is_ref_2plane(static_cast<quant_method>(max_weight_quant)))
760 {
761 continue;
762 }
763
764 const auto& di = bsd.get_decimation_info(i);
765
766 compute_ideal_weights_for_decimation(
767 ei1,
768 di,
769 dec_weights_ideal + i * BLOCK_MAX_WEIGHTS);
770
771 compute_ideal_weights_for_decimation(
772 ei2,
773 di,
774 dec_weights_ideal + i * BLOCK_MAX_WEIGHTS + WEIGHTS_PLANE2_OFFSET);
775 }
776
777 // Compute maximum colors for the endpoints and ideal weights, then for each endpoint and ideal
778 // weight pair, compute the smallest weight that will result in a color value greater than 1
779 vfloat4 min_ep1(10.0f);
780 vfloat4 min_ep2(10.0f);
781
782 vfloat4 ep1 = (vfloat4(1.0f) - ei1.ep.endpt0[0]) / (ei1.ep.endpt1[0] - ei1.ep.endpt0[0]);
783 vmask4 use_ep1 = (ep1 > vfloat4(0.5f)) & (ep1 < min_ep1);
784 min_ep1 = select(min_ep1, ep1, use_ep1);
785
786 vfloat4 ep2 = (vfloat4(1.0f) - ei2.ep.endpt0[0]) / (ei2.ep.endpt1[0] - ei2.ep.endpt0[0]);
787 vmask4 use_ep2 = (ep2 > vfloat4(0.5f)) & (ep2 < min_ep2);
788 min_ep2 = select(min_ep2, ep2, use_ep2);
789
790 vfloat4 err_max(ERROR_CALC_DEFAULT);
791 vmask4 err_mask = vint4::lane_id() == vint4(plane2_component);
792
793 // Set the plane2 component to max error in ep1
794 min_ep1 = select(min_ep1, err_max, err_mask);
795
796 float min_wt_cutoff1 = hmin_s(min_ep1);
797
798 // Set the minwt2 to the plane2 component min in ep2
799 float min_wt_cutoff2 = hmin_s(select(err_max, min_ep2, err_mask));
800
801 compute_angular_endpoints_2planes(
802 bsd, dec_weights_ideal, max_weight_quant, tmpbuf);
803
804 // For each mode (which specifies a decimation and a quantization):
805 // * Compute number of bits needed for the quantized weights
806 // * Generate an optimized set of quantized weights
807 // * Compute quantization errors for the mode
808
809 float* weight_low_value1 = tmpbuf.weight_low_value1;
810 float* weight_high_value1 = tmpbuf.weight_high_value1;
811 float* weight_low_value2 = tmpbuf.weight_low_value2;
812 float* weight_high_value2 = tmpbuf.weight_high_value2;
813
814 int8_t* qwt_bitcounts = tmpbuf.qwt_bitcounts;
815 float* qwt_errors = tmpbuf.qwt_errors;
816
817 unsigned int start_2plane = bsd.block_mode_count_1plane_selected;
818 unsigned int end_2plane = bsd.block_mode_count_1plane_2plane_selected;
819
820 for (unsigned int i = start_2plane; i < end_2plane; i++)
821 {
822 const block_mode& bm = bsd.block_modes[i];
823 assert(bm.is_dual_plane);
824
825 if (bm.quant_mode > max_weight_quant)
826 {
827 qwt_errors[i] = 1e38f;
828 continue;
829 }
830
831 qwt_bitcounts[i] = static_cast<int8_t>(109 - bm.weight_bits);
832
833 if (weight_high_value1[i] > 1.02f * min_wt_cutoff1)
834 {
835 weight_high_value1[i] = 1.0f;
836 }
837
838 if (weight_high_value2[i] > 1.02f * min_wt_cutoff2)
839 {
840 weight_high_value2[i] = 1.0f;
841 }
842
843 unsigned int decimation_mode = bm.decimation_mode;
844 const auto& di = bsd.get_decimation_info(decimation_mode);
845
846 ASTCENC_ALIGNAS float dec_weights_uquantf[BLOCK_MAX_WEIGHTS];
847
848 // Generate the optimized set of weights for the mode
849 compute_quantized_weights_for_decimation(
850 di,
851 weight_low_value1[i],
852 weight_high_value1[i],
853 dec_weights_ideal + BLOCK_MAX_WEIGHTS * decimation_mode,
854 dec_weights_uquantf,
855 dec_weights_uquant + BLOCK_MAX_WEIGHTS * i,
856 bm.get_weight_quant_mode());
857
858 compute_quantized_weights_for_decimation(
859 di,
860 weight_low_value2[i],
861 weight_high_value2[i],
862 dec_weights_ideal + BLOCK_MAX_WEIGHTS * decimation_mode + WEIGHTS_PLANE2_OFFSET,
863 dec_weights_uquantf + WEIGHTS_PLANE2_OFFSET,
864 dec_weights_uquant + BLOCK_MAX_WEIGHTS * i + WEIGHTS_PLANE2_OFFSET,
865 bm.get_weight_quant_mode());
866
867 // Compute weight quantization errors for the block mode
868 qwt_errors[i] = compute_error_of_weight_set_2planes(
869 ei1,
870 ei2,
871 di,
872 dec_weights_uquantf,
873 dec_weights_uquantf + WEIGHTS_PLANE2_OFFSET);
874 }
875
876 // Decide the optimal combination of color endpoint encodings and weight encodings
877 uint8_t partition_format_specifiers[TUNE_MAX_TRIAL_CANDIDATES][BLOCK_MAX_PARTITIONS];
878 int block_mode_index[TUNE_MAX_TRIAL_CANDIDATES];
879
880 quant_method color_quant_level[TUNE_MAX_TRIAL_CANDIDATES];
881 quant_method color_quant_level_mod[TUNE_MAX_TRIAL_CANDIDATES];
882
883 endpoints epm;
884 merge_endpoints(ei1.ep, ei2.ep, plane2_component, epm);
885
886 const auto& pi = bsd.get_partition_info(1, 0);
887 unsigned int candidate_count = compute_ideal_endpoint_formats(
888 config.privateProfile,
889 pi, blk, epm, qwt_bitcounts, qwt_errors,
890 config.tune_candidate_limit,
891 bsd.block_mode_count_1plane_selected, bsd.block_mode_count_1plane_2plane_selected,
892 partition_format_specifiers, block_mode_index,
893 color_quant_level, color_quant_level_mod, tmpbuf);
894
895 // Iterate over the N believed-to-be-best modes to find out which one is actually best
896 float best_errorval_in_mode = ERROR_CALC_DEFAULT;
897 float best_errorval_in_scb = scb.errorval;
898
899 for (unsigned int i = 0; i < candidate_count; i++)
900 {
901 TRACE_NODE(node0, "candidate");
902
903 const int bm_packed_index = block_mode_index[i];
904 assert(bm_packed_index >= static_cast<int>(bsd.block_mode_count_1plane_selected) &&
905 bm_packed_index < static_cast<int>(bsd.block_mode_count_1plane_2plane_selected));
906 const block_mode& qw_bm = bsd.block_modes[bm_packed_index];
907
908 int decimation_mode = qw_bm.decimation_mode;
909 const auto& di = bsd.get_decimation_info(decimation_mode);
910 promise(di.weight_count > 0);
911
912 trace_add_data("weight_x", di.weight_x);
913 trace_add_data("weight_y", di.weight_y);
914 trace_add_data("weight_z", di.weight_z);
915 trace_add_data("weight_quant", qw_bm.quant_mode);
916
917 vfloat4 rgbs_color;
918 vfloat4 rgbo_color;
919
920 symbolic_compressed_block workscb;
921 endpoints workep = epm;
922
923 uint8_t* u8_weight1_src = dec_weights_uquant + BLOCK_MAX_WEIGHTS * bm_packed_index;
924 uint8_t* u8_weight2_src = dec_weights_uquant + BLOCK_MAX_WEIGHTS * bm_packed_index + WEIGHTS_PLANE2_OFFSET;
925
926 for (int j = 0; j < di.weight_count; j++)
927 {
928 workscb.weights[j] = u8_weight1_src[j];
929 workscb.weights[j + WEIGHTS_PLANE2_OFFSET] = u8_weight2_src[j];
930 }
931
932 for (unsigned int l = 0; l < config.tune_refinement_limit; l++)
933 {
934 recompute_ideal_colors_2planes(
935 blk, bsd, di,
936 workscb.weights, workscb.weights + WEIGHTS_PLANE2_OFFSET,
937 workep, rgbs_color, rgbo_color, plane2_component);
938
939 // Quantize the chosen color
940 workscb.color_formats[0] = pack_color_endpoints(
941 privateProfile,
942 workep.endpt0[0],
943 workep.endpt1[0],
944 rgbs_color, rgbo_color,
945 partition_format_specifiers[i][0],
946 workscb.color_values[0],
947 color_quant_level[i]);
948
949 // Store header fields
950 workscb.partition_count = 1;
951 workscb.partition_index = 0;
952 workscb.quant_mode = color_quant_level[i];
953 workscb.color_formats_matched = 0;
954 workscb.block_mode = qw_bm.mode_index;
955 workscb.plane2_component = static_cast<int8_t>(plane2_component);
956 workscb.block_type = SYM_BTYPE_NONCONST;
957
958 // Pre-realign test
959 if (l == 0)
960 {
961 float errorval = compute_symbolic_block_difference_2plane(config, bsd, workscb, blk);
962 if (errorval == -ERROR_CALC_DEFAULT)
963 {
964 errorval = -errorval;
965 workscb.block_type = SYM_BTYPE_ERROR;
966 }
967
968 trace_add_data("error_prerealign", errorval);
969 best_errorval_in_mode = astc::min(errorval, best_errorval_in_mode);
970
971 // Average refinement improvement is 3.5% per iteration (allow 4.5%), but the first
972 // iteration can help more so we give it a extra 8% leeway. Use this knowledge to
973 // drive a heuristic to skip blocks that are unlikely to catch up with the best
974 // block we have already.
975 unsigned int iters_remaining = config.tune_refinement_limit - l;
976 float threshold = (0.045f * static_cast<float>(iters_remaining)) + 1.08f;
977 if (errorval > (threshold * best_errorval_in_scb))
978 {
979 break;
980 }
981
982 if (errorval < best_errorval_in_scb)
983 {
984 best_errorval_in_scb = errorval;
985 workscb.errorval = errorval;
986 scb = workscb;
987
988 if (errorval < tune_errorval_threshold)
989 {
990 // Skip remaining candidates - this is "good enough"
991 i = candidate_count;
992 break;
993 }
994 }
995 }
996
997 // Perform a final pass over the weights to try to improve them.
998 bool adjustments;
999 if (di.weight_count != bsd.texel_count)
1000 {
1001 adjustments = realign_weights_decimated(
1002 config.profile, bsd, blk, workscb);
1003 }
1004 else
1005 {
1006 adjustments = realign_weights_undecimated(
1007 config.profile, bsd, blk, workscb);
1008 }
1009
1010 // Post-realign test
1011 float errorval = compute_symbolic_block_difference_2plane(config, bsd, workscb, blk);
1012 if (errorval == -ERROR_CALC_DEFAULT)
1013 {
1014 errorval = -errorval;
1015 workscb.block_type = SYM_BTYPE_ERROR;
1016 }
1017
1018 trace_add_data("error_postrealign", errorval);
1019 best_errorval_in_mode = astc::min(errorval, best_errorval_in_mode);
1020
1021 // Average refinement improvement is 3.5% per iteration, so skip blocks that are
1022 // unlikely to catch up with the best block we have already. Assume a 4.5% per step to
1023 // give benefit of the doubt ...
1024 unsigned int iters_remaining = config.tune_refinement_limit - 1 - l;
1025 float threshold = (0.045f * static_cast<float>(iters_remaining)) + 1.0f;
1026 if (errorval > (threshold * best_errorval_in_scb))
1027 {
1028 break;
1029 }
1030
1031 if (errorval < best_errorval_in_scb)
1032 {
1033 best_errorval_in_scb = errorval;
1034 workscb.errorval = errorval;
1035 scb = workscb;
1036
1037 if (errorval < tune_errorval_threshold)
1038 {
1039 // Skip remaining candidates - this is "good enough"
1040 i = candidate_count;
1041 break;
1042 }
1043 }
1044
1045 if (!adjustments)
1046 {
1047 break;
1048 }
1049 }
1050 }
1051
1052 return best_errorval_in_mode;
1053 }
1054
1055 /**
1056 * @brief Determine the lowest cross-channel correlation factor.
1057 *
1058 * @param texels_per_block The number of texels in a block.
1059 * @param blk The image block color data to compress.
1060 *
1061 * @return Return the lowest correlation factor.
1062 */
prepare_block_statistics(int texels_per_block,const image_block & blk)1063 static float prepare_block_statistics(
1064 int texels_per_block,
1065 const image_block& blk
1066 ) {
1067 // Compute covariance matrix, as a collection of 10 scalars that form the upper-triangular row
1068 // of the matrix. The matrix is symmetric, so this is all we need for this use case.
1069 float rs = 0.0f;
1070 float gs = 0.0f;
1071 float bs = 0.0f;
1072 float as = 0.0f;
1073 float rr_var = 0.0f;
1074 float gg_var = 0.0f;
1075 float bb_var = 0.0f;
1076 float aa_var = 0.0f;
1077 float rg_cov = 0.0f;
1078 float rb_cov = 0.0f;
1079 float ra_cov = 0.0f;
1080 float gb_cov = 0.0f;
1081 float ga_cov = 0.0f;
1082 float ba_cov = 0.0f;
1083
1084 float weight_sum = 0.0f;
1085
1086 promise(texels_per_block > 0);
1087 for (int i = 0; i < texels_per_block; i++)
1088 {
1089 float weight = hadd_s(blk.channel_weight) / 4.0f;
1090 assert(weight >= 0.0f);
1091 weight_sum += weight;
1092
1093 float r = blk.data_r[i];
1094 float g = blk.data_g[i];
1095 float b = blk.data_b[i];
1096 float a = blk.data_a[i];
1097
1098 float rw = r * weight;
1099 rs += rw;
1100 rr_var += r * rw;
1101 rg_cov += g * rw;
1102 rb_cov += b * rw;
1103 ra_cov += a * rw;
1104
1105 float gw = g * weight;
1106 gs += gw;
1107 gg_var += g * gw;
1108 gb_cov += b * gw;
1109 ga_cov += a * gw;
1110
1111 float bw = b * weight;
1112 bs += bw;
1113 bb_var += b * bw;
1114 ba_cov += a * bw;
1115
1116 float aw = a * weight;
1117 as += aw;
1118 aa_var += a * aw;
1119 }
1120
1121 float rpt = 1.0f / astc::max(weight_sum, 1e-7f);
1122
1123 rr_var -= rs * (rs * rpt);
1124 rg_cov -= gs * (rs * rpt);
1125 rb_cov -= bs * (rs * rpt);
1126 ra_cov -= as * (rs * rpt);
1127
1128 gg_var -= gs * (gs * rpt);
1129 gb_cov -= bs * (gs * rpt);
1130 ga_cov -= as * (gs * rpt);
1131
1132 bb_var -= bs * (bs * rpt);
1133 ba_cov -= as * (bs * rpt);
1134
1135 aa_var -= as * (as * rpt);
1136
1137 // These will give a NaN if a channel is constant - these are fixed up in the next step
1138 rg_cov *= astc::rsqrt(rr_var * gg_var);
1139 rb_cov *= astc::rsqrt(rr_var * bb_var);
1140 ra_cov *= astc::rsqrt(rr_var * aa_var);
1141 gb_cov *= astc::rsqrt(gg_var * bb_var);
1142 ga_cov *= astc::rsqrt(gg_var * aa_var);
1143 ba_cov *= astc::rsqrt(bb_var * aa_var);
1144
1145 if (astc::isnan(rg_cov)) rg_cov = 1.0f;
1146 if (astc::isnan(rb_cov)) rb_cov = 1.0f;
1147 if (astc::isnan(ra_cov)) ra_cov = 1.0f;
1148 if (astc::isnan(gb_cov)) gb_cov = 1.0f;
1149 if (astc::isnan(ga_cov)) ga_cov = 1.0f;
1150 if (astc::isnan(ba_cov)) ba_cov = 1.0f;
1151
1152 float lowest_correlation = astc::min(fabsf(rg_cov), fabsf(rb_cov));
1153 lowest_correlation = astc::min(lowest_correlation, fabsf(ra_cov));
1154 lowest_correlation = astc::min(lowest_correlation, fabsf(gb_cov));
1155 lowest_correlation = astc::min(lowest_correlation, fabsf(ga_cov));
1156 lowest_correlation = astc::min(lowest_correlation, fabsf(ba_cov));
1157
1158 // Diagnostic trace points
1159 trace_add_data("min_r", blk.data_min.lane<0>());
1160 trace_add_data("max_r", blk.data_max.lane<0>());
1161 trace_add_data("min_g", blk.data_min.lane<1>());
1162 trace_add_data("max_g", blk.data_max.lane<1>());
1163 trace_add_data("min_b", blk.data_min.lane<2>());
1164 trace_add_data("max_b", blk.data_max.lane<2>());
1165 trace_add_data("min_a", blk.data_min.lane<3>());
1166 trace_add_data("max_a", blk.data_max.lane<3>());
1167 trace_add_data("cov_rg", fabsf(rg_cov));
1168 trace_add_data("cov_rb", fabsf(rb_cov));
1169 trace_add_data("cov_ra", fabsf(ra_cov));
1170 trace_add_data("cov_gb", fabsf(gb_cov));
1171 trace_add_data("cov_ga", fabsf(ga_cov));
1172 trace_add_data("cov_ba", fabsf(ba_cov));
1173
1174 return lowest_correlation;
1175 }
1176
1177 /* See header for documentation. */
compress_block(const astcenc_contexti & ctx,const image_block & blk,uint8_t pcb[16],compression_working_buffers & tmpbuf,bool calQualityEnable,int32_t * mseBlock[RGBA_COM])1178 void compress_block(
1179 const astcenc_contexti& ctx,
1180 const image_block& blk,
1181 uint8_t pcb[16],
1182 #if QUALITY_CONTROL
1183 compression_working_buffers& tmpbuf,
1184 bool calQualityEnable,
1185 int32_t *mseBlock[RGBA_COM]
1186 #else
1187 compression_working_buffers& tmpbuf
1188 #endif
1189 )
1190 {
1191 astcenc_profile decode_mode = ctx.config.profile;
1192 symbolic_compressed_block scb;
1193 const block_size_descriptor& bsd = *ctx.bsd;
1194 float lowest_correl;
1195
1196 TRACE_NODE(node0, "block");
1197 trace_add_data("pos_x", blk.xpos);
1198 trace_add_data("pos_y", blk.ypos);
1199 trace_add_data("pos_z", blk.zpos);
1200
1201 // Set stricter block targets for luminance data as we have more bits to play with
1202 bool block_is_l = blk.is_luminance();
1203 float block_is_l_scale = block_is_l ? 1.0f / 1.5f : 1.0f;
1204
1205 // Set slightly stricter block targets for lumalpha data as we have more bits to play with
1206 bool block_is_la = blk.is_luminancealpha();
1207 float block_is_la_scale = block_is_la ? 1.0f / 1.05f : 1.0f;
1208
1209 bool block_skip_two_plane = false;
1210 int max_partitions;
1211 if (ctx.config.privateProfile == HIGH_SPEED_PROFILE)
1212 {
1213 max_partitions = 1;
1214 }
1215 #ifdef ASTC_CUSTOMIZED_ENABLE
1216 else if (ctx.config.privateProfile == CUSTOMIZED_PROFILE)
1217 {
1218 if (!g_astcCustomizedSoManager.LoadSutCustomizedSo() ||
1219 g_astcCustomizedSoManager.customizedMaxPartitionsFunc_ == nullptr)
1220 {
1221 printf("astcenc customized so dlopen failed or customizedMaxPartitionsFunc_ is nullptr!\n");
1222 return;
1223 }
1224 max_partitions = g_astcCustomizedSoManager.customizedMaxPartitionsFunc_();
1225 }
1226 #endif
1227 else
1228 {
1229 max_partitions = ctx.config.tune_partition_count_limit;
1230 }
1231
1232 unsigned int requested_partition_indices[3] {
1233 ctx.config.tune_2partition_index_limit,
1234 ctx.config.tune_3partition_index_limit,
1235 ctx.config.tune_4partition_index_limit
1236 };
1237
1238 unsigned int requested_partition_trials[3] {
1239 ctx.config.tune_2partitioning_candidate_limit,
1240 ctx.config.tune_3partitioning_candidate_limit,
1241 ctx.config.tune_4partitioning_candidate_limit
1242 };
1243
1244 #if defined(ASTCENC_DIAGNOSTICS)
1245 // Do this early in diagnostic builds so we can dump uniform metrics
1246 // for every block. Do it later in release builds to avoid redundant work!
1247 float error_weight_sum = hadd_s(blk.channel_weight) * bsd.texel_count;
1248 float error_threshold = ctx.config.tune_db_limit
1249 * error_weight_sum
1250 * block_is_l_scale
1251 * block_is_la_scale;
1252
1253 lowest_correl = prepare_block_statistics(bsd.texel_count, blk);
1254 trace_add_data("lowest_correl", lowest_correl);
1255 trace_add_data("tune_error_threshold", error_threshold);
1256 #endif
1257
1258 // Detected a constant-color block
1259 if (all(blk.data_min == blk.data_max))
1260 {
1261 TRACE_NODE(node1, "pass");
1262 trace_add_data("partition_count", 0);
1263 trace_add_data("plane_count", 1);
1264
1265 scb.partition_count = 0;
1266
1267 // Encode as FP16 if using HDR
1268 if ((decode_mode == ASTCENC_PRF_HDR) ||
1269 (decode_mode == ASTCENC_PRF_HDR_RGB_LDR_A))
1270 {
1271 scb.block_type = SYM_BTYPE_CONST_F16;
1272 vint4 color_f16 = float_to_float16(blk.origin_texel);
1273 store(color_f16, scb.constant_color);
1274 }
1275 // Encode as UNORM16 if NOT using HDR
1276 else
1277 {
1278 scb.block_type = SYM_BTYPE_CONST_U16;
1279 vfloat4 color_f32 = clamp(0.0f, 1.0f, blk.origin_texel) * 65535.0f;
1280 vint4 color_u16 = float_to_int_rtn(color_f32);
1281 store(color_u16, scb.constant_color);
1282 }
1283
1284 trace_add_data("exit", "quality hit");
1285 if (ctx.config.privateProfile != HIGH_QUALITY_PROFILE)
1286 {
1287 scb.block_type = SYM_BTYPE_NONCONST;
1288 scb.partition_count = 1;
1289 scb.color_formats_matched = 0;
1290 scb.plane2_component = -1;
1291 if (ctx.config.privateProfile == HIGH_SPEED_PROFILE)
1292 {
1293 scb.block_mode = HIGH_SPEED_PROFILE_BLOCK_MODE;
1294 }
1295 #ifdef ASTC_CUSTOMIZED_ENABLE
1296 else if (ctx.config.privateProfile == CUSTOMIZED_PROFILE)
1297 {
1298 if (!g_astcCustomizedSoManager.LoadSutCustomizedSo() ||
1299 g_astcCustomizedSoManager.customizedBlockModeFunc_ == nullptr)
1300 {
1301 printf("astcenc customized so dlopen failed or customizedBlockModeFunc_ is nullptr!\n");
1302 return;
1303 }
1304 scb.block_mode = g_astcCustomizedSoManager.customizedBlockModeFunc_();
1305 }
1306 #endif
1307 scb.partition_index = 0;
1308 scb.quant_mode = QUANT_256;
1309 scb.color_formats[0] = 12; // color format is 12 when block mode is HIGH_SPEED_PROFILE_BLOCK_MODE
1310 for (int w = 0; w < 16; w++) { // weights num is 16 when block mode is HIGH_SPEED_PROFILE_BLOCK_MODE
1311 scb.weights[w] = 0;
1312 }
1313 for (unsigned int pixel = 0; pixel < BLOCK_MAX_COMPONENTS; pixel++) { // scb.constant_color[pixel] is 16 bit
1314 scb.color_values[0][pixel << 1] = scb.constant_color[pixel] & BYTE_MASK; // low byte
1315 scb.color_values[0][(pixel << 1) + 1] = (scb.constant_color[pixel] >> 8) & BYTE_MASK; // high byte
1316 }
1317 }
1318 scb.privateProfile = ctx.config.privateProfile;
1319 symbolic_to_physical(bsd, scb, pcb);
1320 #if QUALITY_CONTROL
1321 if (calQualityEnable) {
1322 *mseBlock[R_COM] = *mseBlock[G_COM] = *mseBlock[B_COM] = *mseBlock[A_COM] = 0;
1323 }
1324 #endif
1325 return;
1326 }
1327
1328 #if !defined(ASTCENC_DIAGNOSTICS)
1329 float error_weight_sum = hadd_s(blk.channel_weight) * bsd.texel_count;
1330 float error_threshold = ctx.config.tune_db_limit
1331 * error_weight_sum
1332 * block_is_l_scale
1333 * block_is_la_scale;
1334 #endif
1335
1336 // Set SCB and mode errors to a very high error value
1337 scb.errorval = ERROR_CALC_DEFAULT;
1338 scb.block_type = SYM_BTYPE_ERROR;
1339
1340 float best_errorvals_for_pcount[BLOCK_MAX_PARTITIONS] {
1341 ERROR_CALC_DEFAULT, ERROR_CALC_DEFAULT, ERROR_CALC_DEFAULT, ERROR_CALC_DEFAULT
1342 };
1343
1344 float exit_thresholds_for_pcount[BLOCK_MAX_PARTITIONS] {
1345 0.0f,
1346 ctx.config.tune_2partition_early_out_limit_factor,
1347 ctx.config.tune_3partition_early_out_limit_factor,
1348 0.0f
1349 };
1350
1351 // Trial using 1 plane of weights and 1 partition.
1352
1353 // Most of the time we test it twice, first with a mode cutoff of 0 and then with the specified
1354 // mode cutoff. This causes an early-out that speeds up encoding of easy blocks. However, this
1355 // optimization is disabled for 4x4 and 5x4 blocks where it nearly always slows down the
1356 // compression and slightly reduces image quality.
1357
1358 float errorval_mult[2] {
1359 1.0f / ctx.config.tune_mse_overshoot,
1360 1.0f
1361 };
1362
1363 static const float errorval_overshoot = 1.0f / ctx.config.tune_mse_overshoot;
1364
1365 // Only enable MODE0 fast path if enabled
1366 // Never enable for 3D blocks as no "always" block modes are available
1367 int start_trial = 1;
1368 if ((ctx.config.tune_search_mode0_enable >= TUNE_MIN_SEARCH_MODE0) && (bsd.zdim == 1))
1369 {
1370 start_trial = 0;
1371 }
1372
1373 int quant_limit = QUANT_32;
1374 for (int i = start_trial; i < 2; i++)
1375 {
1376 TRACE_NODE(node1, "pass");
1377 trace_add_data("partition_count", 1);
1378 trace_add_data("plane_count", 1);
1379 trace_add_data("search_mode", i);
1380
1381 float errorval = compress_symbolic_block_for_partition_1plane(
1382 ctx.config.privateProfile,
1383 ctx.config, bsd, blk, i == 0,
1384 error_threshold * errorval_mult[i] * errorval_overshoot,
1385 1, 0, scb, tmpbuf, QUANT_32);
1386
1387 // Record the quant level so we can use the filter later searches
1388 const auto& bm = bsd.get_block_mode(scb.block_mode);
1389 quant_limit = bm.get_weight_quant_mode();
1390
1391 best_errorvals_for_pcount[0] = astc::min(best_errorvals_for_pcount[0], errorval);
1392 if ((ctx.config.privateProfile == HIGH_SPEED_PROFILE) || (errorval < (error_threshold * errorval_mult[i])))
1393 {
1394 trace_add_data("exit", "quality hit");
1395 goto END_OF_TESTS;
1396 }
1397 }
1398
1399 #if !defined(ASTCENC_DIAGNOSTICS)
1400 lowest_correl = prepare_block_statistics(bsd.texel_count, blk);
1401 #endif
1402
1403 block_skip_two_plane = lowest_correl > ctx.config.tune_2plane_early_out_limit_correlation;
1404
1405 // Test the four possible 1-partition, 2-planes modes. Do this in reverse, as
1406 // alpha is the most likely to be non-correlated if it is present in the data.
1407 for (int i = BLOCK_MAX_COMPONENTS - 1; i >= 0; i--)
1408 {
1409 if (ctx.config.privateProfile != HIGH_QUALITY_PROFILE)
1410 {
1411 break;
1412 }
1413 TRACE_NODE(node1, "pass");
1414 trace_add_data("partition_count", 1);
1415 trace_add_data("plane_count", 2);
1416 trace_add_data("plane_component", i);
1417
1418 if (block_skip_two_plane)
1419 {
1420 trace_add_data("skip", "tune_2plane_early_out_limit_correlation");
1421 continue;
1422 }
1423
1424 if (blk.grayscale && i != 3)
1425 {
1426 trace_add_data("skip", "grayscale block");
1427 continue;
1428 }
1429
1430 if (blk.is_constant_channel(i))
1431 {
1432 trace_add_data("skip", "constant component");
1433 continue;
1434 }
1435
1436 float errorval = compress_symbolic_block_for_partition_2planes(
1437 ctx.config.privateProfile,
1438 ctx.config, bsd, blk, error_threshold * errorval_overshoot,
1439 i, scb, tmpbuf, quant_limit);
1440
1441 // If attempting two planes is much worse than the best one plane result
1442 // then further two plane searches are unlikely to help so move on ...
1443 if (errorval > (best_errorvals_for_pcount[0] * 1.85f))
1444 {
1445 break;
1446 }
1447
1448 if (errorval < error_threshold)
1449 {
1450 trace_add_data("exit", "quality hit");
1451 goto END_OF_TESTS;
1452 }
1453 }
1454
1455 // Find best blocks for 2, 3 and 4 partitions
1456 for (int partition_count = 2; partition_count <= max_partitions; partition_count++)
1457 {
1458 unsigned int partition_indices[TUNE_MAX_PARTITIONING_CANDIDATES];
1459
1460 unsigned int requested_indices = requested_partition_indices[partition_count - 2];
1461
1462 unsigned int requested_trials = requested_partition_trials[partition_count - 2];
1463 requested_trials = astc::min(requested_trials, requested_indices);
1464
1465 unsigned int actual_trials = find_best_partition_candidates(
1466 bsd, blk, partition_count, requested_indices, partition_indices, requested_trials);
1467
1468 float best_error_in_prev = best_errorvals_for_pcount[partition_count - 2];
1469
1470 for (unsigned int i = 0; i < actual_trials; i++)
1471 {
1472 TRACE_NODE(node1, "pass");
1473 trace_add_data("partition_count", partition_count);
1474 trace_add_data("partition_index", partition_indices[i]);
1475 trace_add_data("plane_count", 1);
1476 trace_add_data("search_mode", i);
1477
1478 float errorval = compress_symbolic_block_for_partition_1plane(
1479 ctx.config.privateProfile,
1480 ctx.config, bsd, blk, false,
1481 error_threshold * errorval_overshoot,
1482 partition_count, partition_indices[i],
1483 scb, tmpbuf, quant_limit);
1484
1485 best_errorvals_for_pcount[partition_count - 1] = astc::min(best_errorvals_for_pcount[partition_count - 1], errorval);
1486
1487 // If using N partitions doesn't improve much over using N-1 partitions then skip trying
1488 // N+1. Error can dramatically improve if the data is correlated or non-correlated and
1489 // aligns with a partitioning that suits that encoding, so for this inner loop check add
1490 // a large error scale because the "other" trial could be a lot better.
1491 float best_error = best_errorvals_for_pcount[partition_count - 1];
1492 float best_error_scale = exit_thresholds_for_pcount[partition_count - 1] * 1.85f;
1493 if (best_error > (best_error_in_prev * best_error_scale))
1494 {
1495 trace_add_data("skip", "tune_partition_early_out_limit_factor");
1496 goto END_OF_TESTS;
1497 }
1498
1499 if (errorval < error_threshold)
1500 {
1501 trace_add_data("exit", "quality hit");
1502 goto END_OF_TESTS;
1503 }
1504 }
1505
1506 // If using N partitions doesn't improve much over using N-1 partitions then skip trying N+1
1507 float best_error = best_errorvals_for_pcount[partition_count - 1];
1508 float best_error_scale = exit_thresholds_for_pcount[partition_count - 1];
1509 if (best_error > (best_error_in_prev * best_error_scale))
1510 {
1511 trace_add_data("skip", "tune_partition_early_out_limit_factor");
1512 goto END_OF_TESTS;
1513 }
1514 }
1515
1516 trace_add_data("exit", "quality not hit");
1517
1518 END_OF_TESTS:
1519 // If we still have an error block then convert to something we can encode
1520 // TODO: Do something more sensible here, such as average color block
1521 if (scb.block_type == SYM_BTYPE_ERROR)
1522 {
1523 #if defined(ASTCENC_DIAGNOSTICS)
1524 static bool printed_once = false;
1525 if (!printed_once)
1526 {
1527 printed_once = true;
1528 printf("WARN: At least one block failed to find a valid encoding.\n"
1529 " Try increasing compression quality settings.\n\n");
1530 }
1531 #endif
1532
1533 scb.block_type = SYM_BTYPE_CONST_U16;
1534 vfloat4 color_f32 = clamp(0.0f, 1.0f, blk.origin_texel) * 65535.0f;
1535 vint4 color_u16 = float_to_int_rtn(color_f32);
1536 store(color_u16, scb.constant_color);
1537 }
1538
1539 // Compress to a physical block
1540 scb.privateProfile = ctx.config.privateProfile;
1541 symbolic_to_physical(bsd, scb, pcb);
1542 #if QUALITY_CONTROL
1543 if (calQualityEnable) {
1544 image_block decBlk = blk;
1545 decompress_symbolic_block(ctx.config.profile, bsd, blk.xpos, blk.ypos, blk.zpos, scb, decBlk);
1546 vint4 colorSumDiff = vint4::zero();
1547 for (size_t ii = 0; ii < bsd.texel_count; ii++) {
1548 vint4 colorRef = float_to_int_rtn(blk.texel(ii) * 255.0f / 65535.0f);
1549 vint4 colorTest = float_to_int_rtn(min(decBlk.texel(ii), 1.0f) * 255.0f);
1550 vint4 colorDiff = colorRef - colorTest;
1551 colorSumDiff += colorDiff * colorDiff;
1552 }
1553 *mseBlock[R_COM] = colorSumDiff.lane<0>();
1554 *mseBlock[G_COM] = colorSumDiff.lane<1>();
1555 *mseBlock[B_COM] = colorSumDiff.lane<2>();
1556 *mseBlock[A_COM] = colorSumDiff.lane<3>();
1557 }
1558 #endif
1559 }
1560
1561 #endif
1562