1 // SPDX-License-Identifier: Apache-2.0
2 // ----------------------------------------------------------------------------
3 // Copyright 2011-2022 Arm Limited
4 //
5 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
6 // use this file except in compliance with the License. You may obtain a copy
7 // of the License at:
8 //
9 // http://www.apache.org/licenses/LICENSE-2.0
10 //
11 // Unless required by applicable law or agreed to in writing, software
12 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
13 // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
14 // License for the specific language governing permissions and limitations
15 // under the License.
16 // ----------------------------------------------------------------------------
17
18 #if !defined(ASTCENC_DECOMPRESS_ONLY)
19
20 /**
21 * @brief Functions for finding best endpoint format.
22 *
23 * We assume there are two independent sources of error in any given partition:
24 *
25 * - Encoding choice errors
26 * - Quantization errors
27 *
28 * Encoding choice errors are caused by encoder decisions. For example:
29 *
30 * - Using luminance instead of separate RGB components.
31 * - Using a constant 1.0 alpha instead of storing an alpha component.
32 * - Using RGB+scale instead of storing two full RGB endpoints.
33 *
34 * Quantization errors occur due to the limited precision we use for storage. These errors generally
35 * scale with quantization level, but are not actually independent of color encoding. In particular:
36 *
37 * - If we can use offset encoding then quantization error is halved.
38 * - If we can use blue-contraction then quantization error for RG is halved.
39 * - If we use HDR endpoints the quantization error is higher.
40 *
41 * Apart from these effects, we assume the error is proportional to the quantization step size.
42 */
43
44
45 #include "astcenc_internal.h"
46 #include "astcenc_vecmathlib.h"
47
48 #include <assert.h>
49
50 /**
51 * @brief Compute the errors of the endpoint line options for one partition.
52 *
53 * Uncorrelated data assumes storing completely independent RGBA channels for each endpoint. Same
54 * chroma data assumes storing RGBA endpoints which pass though the origin (LDR only). RGBL data
55 * assumes storing RGB + lumashift (HDR only). Luminance error assumes storing RGB channels as a
56 * single value.
57 *
58 *
59 * @param pi The partition info data.
60 * @param partition_index The partition index to compule the error for.
61 * @param blk The image block.
62 * @param uncor_pline The endpoint line assuming uncorrelated endpoints.
63 * @param[out] uncor_err The computed error for the uncorrelated endpoint line.
64 * @param samec_pline The endpoint line assuming the same chroma for both endpoints.
65 * @param[out] samec_err The computed error for the uncorrelated endpoint line.
66 * @param rgbl_pline The endpoint line assuming RGB + lumashift data.
67 * @param[out] rgbl_err The computed error for the RGB + lumashift endpoint line.
68 * @param l_pline The endpoint line assuming luminance data.
69 * @param[out] l_err The computed error for the luminance endpoint line.
70 * @param[out] a_drop_err The computed error for dropping the alpha component.
71 */
compute_error_squared_rgb_single_partition(const partition_info & pi,int partition_index,const image_block & blk,const processed_line3 & uncor_pline,float & uncor_err,const processed_line3 & samec_pline,float & samec_err,const processed_line3 & rgbl_pline,float & rgbl_err,const processed_line3 & l_pline,float & l_err,float & a_drop_err)72 static void compute_error_squared_rgb_single_partition(
73 const partition_info& pi,
74 int partition_index,
75 const image_block& blk,
76 const processed_line3& uncor_pline,
77 float& uncor_err,
78 const processed_line3& samec_pline,
79 float& samec_err,
80 const processed_line3& rgbl_pline,
81 float& rgbl_err,
82 const processed_line3& l_pline,
83 float& l_err,
84 float& a_drop_err
85 ) {
86 vfloat4 ews = blk.channel_weight;
87
88 unsigned int texel_count = pi.partition_texel_count[partition_index];
89 const uint8_t* texel_indexes = pi.texels_of_partition[partition_index];
90 promise(texel_count > 0);
91
92 vfloatacc a_drop_errv = vfloatacc::zero();
93 vfloat default_a(blk.get_default_alpha());
94
95 vfloatacc uncor_errv = vfloatacc::zero();
96 vfloat uncor_bs0(uncor_pline.bs.lane<0>());
97 vfloat uncor_bs1(uncor_pline.bs.lane<1>());
98 vfloat uncor_bs2(uncor_pline.bs.lane<2>());
99
100 vfloat uncor_amod0(uncor_pline.amod.lane<0>());
101 vfloat uncor_amod1(uncor_pline.amod.lane<1>());
102 vfloat uncor_amod2(uncor_pline.amod.lane<2>());
103
104 vfloatacc samec_errv = vfloatacc::zero();
105 vfloat samec_bs0(samec_pline.bs.lane<0>());
106 vfloat samec_bs1(samec_pline.bs.lane<1>());
107 vfloat samec_bs2(samec_pline.bs.lane<2>());
108
109 vfloatacc rgbl_errv = vfloatacc::zero();
110 vfloat rgbl_bs0(rgbl_pline.bs.lane<0>());
111 vfloat rgbl_bs1(rgbl_pline.bs.lane<1>());
112 vfloat rgbl_bs2(rgbl_pline.bs.lane<2>());
113
114 vfloat rgbl_amod0(rgbl_pline.amod.lane<0>());
115 vfloat rgbl_amod1(rgbl_pline.amod.lane<1>());
116 vfloat rgbl_amod2(rgbl_pline.amod.lane<2>());
117
118 vfloatacc l_errv = vfloatacc::zero();
119 vfloat l_bs0(l_pline.bs.lane<0>());
120 vfloat l_bs1(l_pline.bs.lane<1>());
121 vfloat l_bs2(l_pline.bs.lane<2>());
122
123 vint lane_ids = vint::lane_id();
124 for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
125 {
126 vint tix(texel_indexes + i);
127
128 vmask mask = lane_ids < vint(texel_count);
129 lane_ids += vint(ASTCENC_SIMD_WIDTH);
130
131 // Compute the error that arises from just ditching alpha
132 vfloat data_a = gatherf(blk.data_a, tix);
133 vfloat alpha_diff = data_a - default_a;
134 alpha_diff = alpha_diff * alpha_diff;
135
136 haccumulate(a_drop_errv, alpha_diff, mask);
137
138 vfloat data_r = gatherf(blk.data_r, tix);
139 vfloat data_g = gatherf(blk.data_g, tix);
140 vfloat data_b = gatherf(blk.data_b, tix);
141
142 // Compute uncorrelated error
143 vfloat param = data_r * uncor_bs0
144 + data_g * uncor_bs1
145 + data_b * uncor_bs2;
146
147 vfloat dist0 = (uncor_amod0 + param * uncor_bs0) - data_r;
148 vfloat dist1 = (uncor_amod1 + param * uncor_bs1) - data_g;
149 vfloat dist2 = (uncor_amod2 + param * uncor_bs2) - data_b;
150
151 vfloat error = dist0 * dist0 * ews.lane<0>()
152 + dist1 * dist1 * ews.lane<1>()
153 + dist2 * dist2 * ews.lane<2>();
154
155 haccumulate(uncor_errv, error, mask);
156
157 // Compute same chroma error - no "amod", its always zero
158 param = data_r * samec_bs0
159 + data_g * samec_bs1
160 + data_b * samec_bs2;
161
162 dist0 = (param * samec_bs0) - data_r;
163 dist1 = (param * samec_bs1) - data_g;
164 dist2 = (param * samec_bs2) - data_b;
165
166 error = dist0 * dist0 * ews.lane<0>()
167 + dist1 * dist1 * ews.lane<1>()
168 + dist2 * dist2 * ews.lane<2>();
169
170 haccumulate(samec_errv, error, mask);
171
172 // Compute rgbl error
173 param = data_r * rgbl_bs0
174 + data_g * rgbl_bs1
175 + data_b * rgbl_bs2;
176
177 dist0 = (rgbl_amod0 + param * rgbl_bs0) - data_r;
178 dist1 = (rgbl_amod1 + param * rgbl_bs1) - data_g;
179 dist2 = (rgbl_amod2 + param * rgbl_bs2) - data_b;
180
181 error = dist0 * dist0 * ews.lane<0>()
182 + dist1 * dist1 * ews.lane<1>()
183 + dist2 * dist2 * ews.lane<2>();
184
185 haccumulate(rgbl_errv, error, mask);
186
187 // Compute luma error - no "amod", its always zero
188 param = data_r * l_bs0
189 + data_g * l_bs1
190 + data_b * l_bs2;
191
192 dist0 = (param * l_bs0) - data_r;
193 dist1 = (param * l_bs1) - data_g;
194 dist2 = (param * l_bs2) - data_b;
195
196 error = dist0 * dist0 * ews.lane<0>()
197 + dist1 * dist1 * ews.lane<1>()
198 + dist2 * dist2 * ews.lane<2>();
199
200 haccumulate(l_errv, error, mask);
201 }
202
203 a_drop_err = hadd_s(a_drop_errv) * ews.lane<3>();
204 uncor_err = hadd_s(uncor_errv);
205 samec_err = hadd_s(samec_errv);
206 rgbl_err = hadd_s(rgbl_errv);
207 l_err = hadd_s(l_errv);
208 }
209
210 /**
211 * @brief For a given set of input colors and partitioning determine endpoint encode errors.
212 *
213 * This function determines the color error that results from RGB-scale encoding (LDR only),
214 * RGB-lumashift encoding (HDR only), luminance-encoding, and alpha drop. Also determines whether
215 * the endpoints are eligible for offset encoding or blue-contraction
216 *
217 * @param blk The image block.
218 * @param pi The partition info data.
219 * @param ep The idealized endpoints.
220 * @param[out] eci The resulting encoding choice error metrics.
221 */
compute_encoding_choice_errors(const image_block & blk,const partition_info & pi,const endpoints & ep,encoding_choice_errors eci[BLOCK_MAX_PARTITIONS])222 static void compute_encoding_choice_errors(
223 const image_block& blk,
224 const partition_info& pi,
225 const endpoints& ep,
226 encoding_choice_errors eci[BLOCK_MAX_PARTITIONS])
227 {
228 int partition_count = pi.partition_count;
229 promise(partition_count > 0);
230
231 partition_metrics pms[BLOCK_MAX_PARTITIONS];
232
233 compute_avgs_and_dirs_3_comp_rgb(pi, blk, pms);
234
235 for (int i = 0; i < partition_count; i++)
236 {
237 partition_metrics& pm = pms[i];
238
239 line3 uncor_rgb_lines;
240 line3 samec_rgb_lines; // for LDR-RGB-scale
241 line3 rgb_luma_lines; // for HDR-RGB-scale
242
243 processed_line3 uncor_rgb_plines;
244 processed_line3 samec_rgb_plines;
245 processed_line3 rgb_luma_plines;
246 processed_line3 luminance_plines;
247
248 float uncorr_rgb_error;
249 float samechroma_rgb_error;
250 float rgb_luma_error;
251 float luminance_rgb_error;
252 float alpha_drop_error;
253
254 uncor_rgb_lines.a = pm.avg;
255 uncor_rgb_lines.b = normalize_safe(pm.dir, unit3());
256
257 samec_rgb_lines.a = vfloat4::zero();
258 samec_rgb_lines.b = normalize_safe(pm.avg, unit3());
259
260 rgb_luma_lines.a = pm.avg;
261 rgb_luma_lines.b = unit3();
262
263 uncor_rgb_plines.amod = uncor_rgb_lines.a - uncor_rgb_lines.b * dot3(uncor_rgb_lines.a, uncor_rgb_lines.b);
264 uncor_rgb_plines.bs = uncor_rgb_lines.b;
265
266 // Same chroma always goes though zero, so this is simpler than the others
267 samec_rgb_plines.amod = vfloat4::zero();
268 samec_rgb_plines.bs = samec_rgb_lines.b;
269
270 rgb_luma_plines.amod = rgb_luma_lines.a - rgb_luma_lines.b * dot3(rgb_luma_lines.a, rgb_luma_lines.b);
271 rgb_luma_plines.bs = rgb_luma_lines.b;
272
273 // Luminance always goes though zero, so this is simpler than the others
274 luminance_plines.amod = vfloat4::zero();
275 luminance_plines.bs = unit3();
276
277 compute_error_squared_rgb_single_partition(
278 pi, i, blk,
279 uncor_rgb_plines, uncorr_rgb_error,
280 samec_rgb_plines, samechroma_rgb_error,
281 rgb_luma_plines, rgb_luma_error,
282 luminance_plines, luminance_rgb_error,
283 alpha_drop_error);
284
285 // Determine if we can offset encode RGB lanes
286 vfloat4 endpt0 = ep.endpt0[i];
287 vfloat4 endpt1 = ep.endpt1[i];
288 vfloat4 endpt_diff = abs(endpt1 - endpt0);
289 vmask4 endpt_can_offset = endpt_diff < vfloat4(0.12f * 65535.0f);
290 bool can_offset_encode = (mask(endpt_can_offset) & 0x7) == 0x7;
291
292 // Determine if we can blue contract encode RGB lanes
293 vfloat4 endpt_diff_bc(
294 endpt0.lane<0>() + (endpt0.lane<0>() - endpt0.lane<2>()),
295 endpt1.lane<0>() + (endpt1.lane<0>() - endpt1.lane<2>()),
296 endpt0.lane<1>() + (endpt0.lane<1>() - endpt0.lane<2>()),
297 endpt1.lane<1>() + (endpt1.lane<1>() - endpt1.lane<2>())
298 );
299
300 vmask4 endpt_can_bc_lo = endpt_diff_bc > vfloat4(0.01f * 65535.0f);
301 vmask4 endpt_can_bc_hi = endpt_diff_bc < vfloat4(0.99f * 65535.0f);
302 bool can_blue_contract = (mask(endpt_can_bc_lo & endpt_can_bc_hi) & 0x7) == 0x7;
303
304 // Store out the settings
305 eci[i].rgb_scale_error = (samechroma_rgb_error - uncorr_rgb_error) * 0.7f; // empirical
306 eci[i].rgb_luma_error = (rgb_luma_error - uncorr_rgb_error) * 1.5f; // wild guess
307 eci[i].luminance_error = (luminance_rgb_error - uncorr_rgb_error) * 3.0f; // empirical
308 eci[i].alpha_drop_error = alpha_drop_error * 3.0f;
309 eci[i].can_offset_encode = can_offset_encode;
310 eci[i].can_blue_contract = can_blue_contract;
311 }
312 }
313
314 /**
315 * @brief For a given partition compute the error for every endpoint integer count and quant level.
316 *
317 * @param encode_hdr_rgb @c true if using HDR for RGB, @c false for LDR.
318 * @param encode_hdr_alpha @c true if using HDR for alpha, @c false for LDR.
319 * @param partition_index The partition index.
320 * @param pi The partition info.
321 * @param eci The encoding choice error metrics.
322 * @param ep The idealized endpoints.
323 * @param error_weight The resulting encoding choice error metrics.
324 * @param[out] best_error The best error for each integer count and quant level.
325 * @param[out] format_of_choice The preferred endpoint format for each integer count and quant level.
326 */
compute_color_error_for_every_integer_count_and_quant_level(bool encode_hdr_rgb,bool encode_hdr_alpha,int partition_index,const partition_info & pi,const encoding_choice_errors & eci,const endpoints & ep,vfloat4 error_weight,float best_error[21][4],int format_of_choice[21][4])327 static void compute_color_error_for_every_integer_count_and_quant_level(
328 bool encode_hdr_rgb,
329 bool encode_hdr_alpha,
330 int partition_index,
331 const partition_info& pi,
332 const encoding_choice_errors& eci,
333 const endpoints& ep,
334 vfloat4 error_weight,
335 float best_error[21][4],
336 int format_of_choice[21][4]
337 ) {
338 int partition_size = pi.partition_texel_count[partition_index];
339
340 static const float baseline_quant_error[21] {
341 (65536.0f * 65536.0f / 18.0f), // 2 values, 1 step
342 (65536.0f * 65536.0f / 18.0f) / (2 * 2), // 3 values, 2 steps
343 (65536.0f * 65536.0f / 18.0f) / (3 * 3), // 4 values, 3 steps
344 (65536.0f * 65536.0f / 18.0f) / (4 * 4), // 5 values
345 (65536.0f * 65536.0f / 18.0f) / (5 * 5),
346 (65536.0f * 65536.0f / 18.0f) / (7 * 7),
347 (65536.0f * 65536.0f / 18.0f) / (9 * 9),
348 (65536.0f * 65536.0f / 18.0f) / (11 * 11),
349 (65536.0f * 65536.0f / 18.0f) / (15 * 15),
350 (65536.0f * 65536.0f / 18.0f) / (19 * 19),
351 (65536.0f * 65536.0f / 18.0f) / (23 * 23),
352 (65536.0f * 65536.0f / 18.0f) / (31 * 31),
353 (65536.0f * 65536.0f / 18.0f) / (39 * 39),
354 (65536.0f * 65536.0f / 18.0f) / (47 * 47),
355 (65536.0f * 65536.0f / 18.0f) / (63 * 63),
356 (65536.0f * 65536.0f / 18.0f) / (79 * 79),
357 (65536.0f * 65536.0f / 18.0f) / (95 * 95),
358 (65536.0f * 65536.0f / 18.0f) / (127 * 127),
359 (65536.0f * 65536.0f / 18.0f) / (159 * 159),
360 (65536.0f * 65536.0f / 18.0f) / (191 * 191),
361 (65536.0f * 65536.0f / 18.0f) / (255 * 255)
362 };
363
364 vfloat4 ep0 = ep.endpt0[partition_index];
365 vfloat4 ep1 = ep.endpt1[partition_index];
366
367 float ep1_min = hmin_rgb_s(ep1);
368 ep1_min = astc::max(ep1_min, 0.0f);
369
370 float error_weight_rgbsum = hadd_rgb_s(error_weight);
371
372 float range_upper_limit_rgb = encode_hdr_rgb ? 61440.0f : 65535.0f;
373 float range_upper_limit_alpha = encode_hdr_alpha ? 61440.0f : 65535.0f;
374
375 // It is possible to get endpoint colors significantly outside [0,upper-limit] even if the
376 // input data are safely contained in [0,upper-limit]; we need to add an error term for this
377 vfloat4 offset(range_upper_limit_rgb, range_upper_limit_rgb, range_upper_limit_rgb, range_upper_limit_alpha);
378 vfloat4 ep0_range_error_high = max(ep0 - offset, 0.0f);
379 vfloat4 ep1_range_error_high = max(ep1 - offset, 0.0f);
380
381 vfloat4 ep0_range_error_low = min(ep0, 0.0f);
382 vfloat4 ep1_range_error_low = min(ep1, 0.0f);
383
384 vfloat4 sum_range_error =
385 (ep0_range_error_low * ep0_range_error_low) +
386 (ep1_range_error_low * ep1_range_error_low) +
387 (ep0_range_error_high * ep0_range_error_high) +
388 (ep1_range_error_high * ep1_range_error_high);
389
390 float rgb_range_error = dot3_s(sum_range_error, error_weight)
391 * 0.5f * static_cast<float>(partition_size);
392 float alpha_range_error = sum_range_error.lane<3>() * error_weight.lane<3>()
393 * 0.5f * static_cast<float>(partition_size);
394
395 if (encode_hdr_rgb)
396 {
397
398 // Collect some statistics
399 float af, cf;
400 if (ep1.lane<0>() > ep1.lane<1>() && ep1.lane<0>() > ep1.lane<2>())
401 {
402 af = ep1.lane<0>();
403 cf = ep1.lane<0>() - ep0.lane<0>();
404 }
405 else if (ep1.lane<1>() > ep1.lane<2>())
406 {
407 af = ep1.lane<1>();
408 cf = ep1.lane<1>() - ep0.lane<1>();
409 }
410 else
411 {
412 af = ep1.lane<2>();
413 cf = ep1.lane<2>() - ep0.lane<2>();
414 }
415
416 // Estimate of color-component spread in high endpoint color
417 float bf = af - ep1_min;
418 vfloat4 prd = (ep1 - vfloat4(cf)).swz<0, 1, 2>();
419 vfloat4 pdif = prd - ep0.swz<0, 1, 2>();
420 // Estimate of color-component spread in low endpoint color
421 float df = hmax_s(abs(pdif));
422
423 int b = static_cast<int>(bf);
424 int c = static_cast<int>(cf);
425 int d = static_cast<int>(df);
426
427 // Determine which one of the 6 submodes is likely to be used in case of an RGBO-mode
428 int rgbo_mode = 5; // 7 bits per component
429 // mode 4: 8 7 6
430 if (b < 32768 && c < 16384)
431 {
432 rgbo_mode = 4;
433 }
434
435 // mode 3: 9 6 7
436 if (b < 8192 && c < 16384)
437 {
438 rgbo_mode = 3;
439 }
440
441 // mode 2: 10 5 8
442 if (b < 2048 && c < 16384)
443 {
444 rgbo_mode = 2;
445 }
446
447 // mode 1: 11 6 5
448 if (b < 2048 && c < 1024)
449 {
450 rgbo_mode = 1;
451 }
452
453 // mode 0: 11 5 7
454 if (b < 1024 && c < 4096)
455 {
456 rgbo_mode = 0;
457 }
458
459 // Determine which one of the 9 submodes is likely to be used in case of an RGB-mode.
460 int rgb_mode = 8; // 8 bits per component, except 7 bits for blue
461
462 // mode 0: 9 7 6 7
463 if (b < 16384 && c < 8192 && d < 8192)
464 {
465 rgb_mode = 0;
466 }
467
468 // mode 1: 9 8 6 6
469 if (b < 32768 && c < 8192 && d < 4096)
470 {
471 rgb_mode = 1;
472 }
473
474 // mode 2: 10 6 7 7
475 if (b < 4096 && c < 8192 && d < 4096)
476 {
477 rgb_mode = 2;
478 }
479
480 // mode 3: 10 7 7 6
481 if (b < 8192 && c < 8192 && d < 2048)
482 {
483 rgb_mode = 3;
484 }
485
486 // mode 4: 11 8 6 5
487 if (b < 8192 && c < 2048 && d < 512)
488 {
489 rgb_mode = 4;
490 }
491
492 // mode 5: 11 6 8 6
493 if (b < 2048 && c < 8192 && d < 1024)
494 {
495 rgb_mode = 5;
496 }
497
498 // mode 6: 12 7 7 5
499 if (b < 2048 && c < 2048 && d < 256)
500 {
501 rgb_mode = 6;
502 }
503
504 // mode 7: 12 6 7 6
505 if (b < 1024 && c < 2048 && d < 512)
506 {
507 rgb_mode = 7;
508 }
509
510 static const float rgbo_error_scales[6] { 4.0f, 4.0f, 16.0f, 64.0f, 256.0f, 1024.0f };
511 static const float rgb_error_scales[9] { 64.0f, 64.0f, 16.0f, 16.0f, 4.0f, 4.0f, 1.0f, 1.0f, 384.0f };
512
513 float mode7mult = rgbo_error_scales[rgbo_mode] * 0.0015f; // Empirically determined ....
514 float mode11mult = rgb_error_scales[rgb_mode] * 0.010f; // Empirically determined ....
515
516
517 float lum_high = hadd_rgb_s(ep1) * (1.0f / 3.0f);
518 float lum_low = hadd_rgb_s(ep0) * (1.0f / 3.0f);
519 float lumdif = lum_high - lum_low;
520 float mode23mult = lumdif < 960 ? 4.0f : lumdif < 3968 ? 16.0f : 128.0f;
521
522 mode23mult *= 0.0005f; // Empirically determined ....
523
524 // Pick among the available HDR endpoint modes
525 for (int i = QUANT_2; i < QUANT_16; i++)
526 {
527 best_error[i][3] = ERROR_CALC_DEFAULT;
528 best_error[i][2] = ERROR_CALC_DEFAULT;
529 best_error[i][1] = ERROR_CALC_DEFAULT;
530 best_error[i][0] = ERROR_CALC_DEFAULT;
531
532 format_of_choice[i][3] = encode_hdr_alpha ? FMT_HDR_RGBA : FMT_HDR_RGB_LDR_ALPHA;
533 format_of_choice[i][2] = FMT_HDR_RGB;
534 format_of_choice[i][1] = FMT_HDR_RGB_SCALE;
535 format_of_choice[i][0] = FMT_HDR_LUMINANCE_LARGE_RANGE;
536 }
537
538 for (int i = QUANT_16; i <= QUANT_256; i++)
539 {
540 // The base_quant_error should depend on the scale-factor that would be used during
541 // actual encode of the color value
542
543 float base_quant_error = baseline_quant_error[i] * static_cast<float>(partition_size);
544 float rgb_quantization_error = error_weight_rgbsum * base_quant_error * 2.0f;
545 float alpha_quantization_error = error_weight.lane<3>() * base_quant_error * 2.0f;
546 float rgba_quantization_error = rgb_quantization_error + alpha_quantization_error;
547
548 // For 8 integers, we have two encodings: one with HDR A and another one with LDR A
549
550 float full_hdr_rgba_error = rgba_quantization_error + rgb_range_error + alpha_range_error;
551 best_error[i][3] = full_hdr_rgba_error;
552 format_of_choice[i][3] = encode_hdr_alpha ? FMT_HDR_RGBA : FMT_HDR_RGB_LDR_ALPHA;
553
554 // For 6 integers, we have one HDR-RGB encoding
555 float full_hdr_rgb_error = (rgb_quantization_error * mode11mult) + rgb_range_error + eci.alpha_drop_error;
556 best_error[i][2] = full_hdr_rgb_error;
557 format_of_choice[i][2] = FMT_HDR_RGB;
558
559 // For 4 integers, we have one HDR-RGB-Scale encoding
560 float hdr_rgb_scale_error = (rgb_quantization_error * mode7mult) + rgb_range_error + eci.alpha_drop_error + eci.rgb_luma_error;
561
562 best_error[i][1] = hdr_rgb_scale_error;
563 format_of_choice[i][1] = FMT_HDR_RGB_SCALE;
564
565 // For 2 integers, we assume luminance-with-large-range
566 float hdr_luminance_error = (rgb_quantization_error * mode23mult) + rgb_range_error + eci.alpha_drop_error + eci.luminance_error;
567 best_error[i][0] = hdr_luminance_error;
568 format_of_choice[i][0] = FMT_HDR_LUMINANCE_LARGE_RANGE;
569 }
570 }
571 else
572 {
573 for (int i = QUANT_2; i < QUANT_6; i++)
574 {
575 best_error[i][3] = ERROR_CALC_DEFAULT;
576 best_error[i][2] = ERROR_CALC_DEFAULT;
577 best_error[i][1] = ERROR_CALC_DEFAULT;
578 best_error[i][0] = ERROR_CALC_DEFAULT;
579
580 format_of_choice[i][3] = FMT_RGBA;
581 format_of_choice[i][2] = FMT_RGB;
582 format_of_choice[i][1] = FMT_RGB_SCALE;
583 format_of_choice[i][0] = FMT_LUMINANCE;
584 }
585
586 float base_quant_error_rgb = error_weight_rgbsum * static_cast<float>(partition_size);
587 float base_quant_error_a = error_weight.lane<3>() * static_cast<float>(partition_size);
588 float base_quant_error_rgba = base_quant_error_rgb + base_quant_error_a;
589
590 float error_scale_bc_rgba = eci.can_blue_contract ? 0.625f : 1.0f;
591 float error_scale_oe_rgba = eci.can_offset_encode ? 0.5f : 1.0f;
592
593 float error_scale_bc_rgb = eci.can_blue_contract ? 0.5f : 1.0f;
594 float error_scale_oe_rgb = eci.can_offset_encode ? 0.25f : 1.0f;
595
596 // Pick among the available LDR endpoint modes
597 for (int i = QUANT_6; i <= QUANT_256; i++)
598 {
599 // Offset encoding not possible at higher quant levels
600 if (i >= QUANT_192)
601 {
602 error_scale_oe_rgba = 1.0f;
603 error_scale_oe_rgb = 1.0f;
604 }
605
606 float base_quant_error = baseline_quant_error[i];
607 float quant_error_rgb = base_quant_error_rgb * base_quant_error;
608 float quant_error_rgba = base_quant_error_rgba * base_quant_error;
609
610 // 8 integers can encode as RGBA+RGBA
611 float full_ldr_rgba_error = quant_error_rgba
612 * error_scale_bc_rgba
613 * error_scale_oe_rgba
614 + rgb_range_error
615 + alpha_range_error;
616
617 best_error[i][3] = full_ldr_rgba_error;
618 format_of_choice[i][3] = FMT_RGBA;
619
620 // 6 integers can encode as RGB+RGB or RGBS+AA
621 float full_ldr_rgb_error = quant_error_rgb
622 * error_scale_bc_rgb
623 * error_scale_oe_rgb
624 + rgb_range_error
625 + eci.alpha_drop_error;
626
627 float rgbs_alpha_error = quant_error_rgba
628 + eci.rgb_scale_error
629 + rgb_range_error
630 + alpha_range_error;
631
632 if (rgbs_alpha_error < full_ldr_rgb_error)
633 {
634 best_error[i][2] = rgbs_alpha_error;
635 format_of_choice[i][2] = FMT_RGB_SCALE_ALPHA;
636 }
637 else
638 {
639 best_error[i][2] = full_ldr_rgb_error;
640 format_of_choice[i][2] = FMT_RGB;
641 }
642
643 // 4 integers can encode as RGBS or LA+LA
644 float ldr_rgbs_error = quant_error_rgb
645 + rgb_range_error
646 + eci.alpha_drop_error
647 + eci.rgb_scale_error;
648
649 float lum_alpha_error = quant_error_rgba
650 + rgb_range_error
651 + alpha_range_error
652 + eci.luminance_error;
653
654 if (ldr_rgbs_error < lum_alpha_error)
655 {
656 best_error[i][1] = ldr_rgbs_error;
657 format_of_choice[i][1] = FMT_RGB_SCALE;
658 }
659 else
660 {
661 best_error[i][1] = lum_alpha_error;
662 format_of_choice[i][1] = FMT_LUMINANCE_ALPHA;
663 }
664
665 // 2 integers can encode as L+L
666 float luminance_error = quant_error_rgb
667 + rgb_range_error
668 + eci.alpha_drop_error
669 + eci.luminance_error;
670
671 best_error[i][0] = luminance_error;
672 format_of_choice[i][0] = FMT_LUMINANCE;
673 }
674 }
675 }
676
677 /**
678 * @brief For one partition compute the best format and quantization for a given bit count.
679 *
680 * @param best_combined_error The best error for each quant level and integer count.
681 * @param best_combined_format The best format for each quant level and integer count.
682 * @param bits_available The number of bits available for encoding.
683 * @param[out] best_quant_level The output best color quant level.
684 * @param[out] best_format The output best color format.
685 *
686 * @return The output error for the best pairing.
687 */
one_partition_find_best_combination_for_bitcount(QualityProfile privateProfile,const float best_combined_error[21][4],const int best_combined_format[21][4],int bits_available,quant_method & best_quant_level,int & best_format)688 static float one_partition_find_best_combination_for_bitcount(
689 QualityProfile privateProfile,
690 const float best_combined_error[21][4],
691 const int best_combined_format[21][4],
692 int bits_available,
693 quant_method& best_quant_level,
694 int& best_format
695 ) {
696 int best_integer_count = 0;
697 float best_integer_count_error = ERROR_CALC_DEFAULT;
698
699 for (int integer_count = 1; integer_count <= 4; integer_count++)
700 {
701 if (privateProfile == HIGH_SPEED_PROFILE)
702 {
703 integer_count = 4; // constant 4 bit count for HIGH_SPEED_PROFILE mode
704 }
705 // Compute the quantization level for a given number of integers and a given number of bits
706 int quant_level = quant_mode_table[integer_count][bits_available];
707
708 // Don't have enough bits to represent a given endpoint format at all!
709 if (quant_level < QUANT_6)
710 {
711 continue;
712 }
713
714 float integer_count_error = best_combined_error[quant_level][integer_count - 1];
715 if (integer_count_error < best_integer_count_error)
716 {
717 best_integer_count_error = integer_count_error;
718 best_integer_count = integer_count - 1;
719 }
720 }
721
722 int ql = quant_mode_table[best_integer_count + 1][bits_available];
723
724 best_quant_level = static_cast<quant_method>(ql);
725 if (privateProfile == HIGH_SPEED_PROFILE) // keep openSource code style
726 {
727 best_format = FMT_RGBA;
728 }
729 else
730 {
731 best_format = FMT_LUMINANCE;
732
733 if (ql >= QUANT_6)
734 {
735 best_format = best_combined_format[ql][best_integer_count];
736 }
737 }
738
739 return best_integer_count_error;
740 }
741
742 /**
743 * @brief For 2 partitions compute the best format combinations for every pair of quant mode and integer count.
744 *
745 * @param best_error The best error for a single endpoint quant level and integer count.
746 * @param best_format The best format for a single endpoint quant level and integer count.
747 * @param[out] best_combined_error The best combined error pairings for the 2 partitions.
748 * @param[out] best_combined_format The best combined format pairings for the 2 partitions.
749 */
two_partitions_find_best_combination_for_every_quantization_and_integer_count(const float best_error[2][21][4],const int best_format[2][21][4],float best_combined_error[21][7],int best_combined_format[21][7][2])750 static void two_partitions_find_best_combination_for_every_quantization_and_integer_count(
751 const float best_error[2][21][4], // indexed by (partition, quant-level, integer-pair-count-minus-1)
752 const int best_format[2][21][4],
753 float best_combined_error[21][7], // indexed by (quant-level, integer-pair-count-minus-2)
754 int best_combined_format[21][7][2]
755 ) {
756 for (int i = QUANT_2; i <= QUANT_256; i++)
757 {
758 for (int j = 0; j < 7; j++)
759 {
760 best_combined_error[i][j] = ERROR_CALC_DEFAULT;
761 }
762 }
763
764 for (int quant = QUANT_6; quant <= QUANT_256; quant++)
765 {
766 for (int i = 0; i < 4; i++) // integer-count for first endpoint-pair
767 {
768 for (int j = 0; j < 4; j++) // integer-count for second endpoint-pair
769 {
770 int low2 = astc::min(i, j);
771 int high2 = astc::max(i, j);
772 if ((high2 - low2) > 1)
773 {
774 continue;
775 }
776
777 int intcnt = i + j;
778 float errorterm = astc::min(best_error[0][quant][i] + best_error[1][quant][j], 1e10f);
779 if (errorterm <= best_combined_error[quant][intcnt])
780 {
781 best_combined_error[quant][intcnt] = errorterm;
782 best_combined_format[quant][intcnt][0] = best_format[0][quant][i];
783 best_combined_format[quant][intcnt][1] = best_format[1][quant][j];
784 }
785 }
786 }
787 }
788 }
789
790 /**
791 * @brief For 2 partitions compute the best format and quantization for a given bit count.
792 *
793 * @param best_combined_error The best error for each quant level and integer count.
794 * @param best_combined_format The best format for each quant level and integer count.
795 * @param bits_available The number of bits available for encoding.
796 * @param[out] best_quant_level The output best color quant level.
797 * @param[out] best_quant_level_mod The output best color quant level assuming two more bits are available.
798 * @param[out] best_formats The output best color formats.
799 *
800 * @return The output error for the best pairing.
801 */
two_partitions_find_best_combination_for_bitcount(float best_combined_error[21][7],int best_combined_format[21][7][2],int bits_available,quant_method & best_quant_level,quant_method & best_quant_level_mod,int * best_formats)802 static float two_partitions_find_best_combination_for_bitcount(
803 float best_combined_error[21][7],
804 int best_combined_format[21][7][2],
805 int bits_available,
806 quant_method& best_quant_level,
807 quant_method& best_quant_level_mod,
808 int* best_formats
809 ) {
810 int best_integer_count = 0;
811 float best_integer_count_error = ERROR_CALC_DEFAULT;
812
813 for (int integer_count = 2; integer_count <= 8; integer_count++)
814 {
815 // Compute the quantization level for a given number of integers and a given number of bits
816 int quant_level = quant_mode_table[integer_count][bits_available];
817
818 // Don't have enough bits to represent a given endpoint format at all!
819 if (quant_level < QUANT_6)
820 {
821 break;
822 }
823
824 float integer_count_error = best_combined_error[quant_level][integer_count - 2];
825 if (integer_count_error < best_integer_count_error)
826 {
827 best_integer_count_error = integer_count_error;
828 best_integer_count = integer_count;
829 }
830 }
831
832 int ql = quant_mode_table[best_integer_count][bits_available];
833 int ql_mod = quant_mode_table[best_integer_count][bits_available + 2];
834
835 best_quant_level = static_cast<quant_method>(ql);
836 best_quant_level_mod = static_cast<quant_method>(ql_mod);
837
838 if (ql >= QUANT_6)
839 {
840 for (int i = 0; i < 2; i++)
841 {
842 best_formats[i] = best_combined_format[ql][best_integer_count - 2][i];
843 }
844 }
845 else
846 {
847 for (int i = 0; i < 2; i++)
848 {
849 best_formats[i] = FMT_LUMINANCE;
850 }
851 }
852
853 return best_integer_count_error;
854 }
855
856 /**
857 * @brief For 3 partitions compute the best format combinations for every pair of quant mode and integer count.
858 *
859 * @param best_error The best error for a single endpoint quant level and integer count.
860 * @param best_format The best format for a single endpoint quant level and integer count.
861 * @param[out] best_combined_error The best combined error pairings for the 3 partitions.
862 * @param[out] best_combined_format The best combined format pairings for the 3 partitions.
863 */
three_partitions_find_best_combination_for_every_quantization_and_integer_count(const float best_error[3][21][4],const int best_format[3][21][4],float best_combined_error[21][10],int best_combined_format[21][10][3])864 static void three_partitions_find_best_combination_for_every_quantization_and_integer_count(
865 const float best_error[3][21][4], // indexed by (partition, quant-level, integer-count)
866 const int best_format[3][21][4],
867 float best_combined_error[21][10],
868 int best_combined_format[21][10][3]
869 ) {
870 for (int i = QUANT_2; i <= QUANT_256; i++)
871 {
872 for (int j = 0; j < 10; j++)
873 {
874 best_combined_error[i][j] = ERROR_CALC_DEFAULT;
875 }
876 }
877
878 for (int quant = QUANT_6; quant <= QUANT_256; quant++)
879 {
880 for (int i = 0; i < 4; i++) // integer-count for first endpoint-pair
881 {
882 for (int j = 0; j < 4; j++) // integer-count for second endpoint-pair
883 {
884 int low2 = astc::min(i, j);
885 int high2 = astc::max(i, j);
886 if ((high2 - low2) > 1)
887 {
888 continue;
889 }
890
891 for (int k = 0; k < 4; k++) // integer-count for third endpoint-pair
892 {
893 int low3 = astc::min(k, low2);
894 int high3 = astc::max(k, high2);
895 if ((high3 - low3) > 1)
896 {
897 continue;
898 }
899
900 int intcnt = i + j + k;
901 float errorterm = astc::min(best_error[0][quant][i] + best_error[1][quant][j] + best_error[2][quant][k], 1e10f);
902 if (errorterm <= best_combined_error[quant][intcnt])
903 {
904 best_combined_error[quant][intcnt] = errorterm;
905 best_combined_format[quant][intcnt][0] = best_format[0][quant][i];
906 best_combined_format[quant][intcnt][1] = best_format[1][quant][j];
907 best_combined_format[quant][intcnt][2] = best_format[2][quant][k];
908 }
909 }
910 }
911 }
912 }
913 }
914
915 /**
916 * @brief For 3 partitions compute the best format and quantization for a given bit count.
917 *
918 * @param best_combined_error The best error for each quant level and integer count.
919 * @param best_combined_format The best format for each quant level and integer count.
920 * @param bits_available The number of bits available for encoding.
921 * @param[out] best_quant_level The output best color quant level.
922 * @param[out] best_quant_level_mod The output best color quant level assuming two more bits are available.
923 * @param[out] best_formats The output best color formats.
924 *
925 * @return The output error for the best pairing.
926 */
three_partitions_find_best_combination_for_bitcount(const float best_combined_error[21][10],const int best_combined_format[21][10][3],int bits_available,quant_method & best_quant_level,quant_method & best_quant_level_mod,int * best_formats)927 static float three_partitions_find_best_combination_for_bitcount(
928 const float best_combined_error[21][10],
929 const int best_combined_format[21][10][3],
930 int bits_available,
931 quant_method& best_quant_level,
932 quant_method& best_quant_level_mod,
933 int* best_formats
934 ) {
935 int best_integer_count = 0;
936 float best_integer_count_error = ERROR_CALC_DEFAULT;
937
938 for (int integer_count = 3; integer_count <= 9; integer_count++)
939 {
940 // Compute the quantization level for a given number of integers and a given number of bits
941 int quant_level = quant_mode_table[integer_count][bits_available];
942
943 // Don't have enough bits to represent a given endpoint format at all!
944 if (quant_level < QUANT_6)
945 {
946 break;
947 }
948
949 float integer_count_error = best_combined_error[quant_level][integer_count - 3];
950 if (integer_count_error < best_integer_count_error)
951 {
952 best_integer_count_error = integer_count_error;
953 best_integer_count = integer_count;
954 }
955 }
956
957 int ql = quant_mode_table[best_integer_count][bits_available];
958 int ql_mod = quant_mode_table[best_integer_count][bits_available + 5];
959
960 best_quant_level = static_cast<quant_method>(ql);
961 best_quant_level_mod = static_cast<quant_method>(ql_mod);
962
963 if (ql >= QUANT_6)
964 {
965 for (int i = 0; i < 3; i++)
966 {
967 best_formats[i] = best_combined_format[ql][best_integer_count - 3][i];
968 }
969 }
970 else
971 {
972 for (int i = 0; i < 3; i++)
973 {
974 best_formats[i] = FMT_LUMINANCE;
975 }
976 }
977
978 return best_integer_count_error;
979 }
980
981 /**
982 * @brief For 4 partitions compute the best format combinations for every pair of quant mode and integer count.
983 *
984 * @param best_error The best error for a single endpoint quant level and integer count.
985 * @param best_format The best format for a single endpoint quant level and integer count.
986 * @param[out] best_combined_error The best combined error pairings for the 4 partitions.
987 * @param[out] best_combined_format The best combined format pairings for the 4 partitions.
988 */
four_partitions_find_best_combination_for_every_quantization_and_integer_count(const float best_error[4][21][4],const int best_format[4][21][4],float best_combined_error[21][13],int best_combined_format[21][13][4])989 static void four_partitions_find_best_combination_for_every_quantization_and_integer_count(
990 const float best_error[4][21][4], // indexed by (partition, quant-level, integer-count)
991 const int best_format[4][21][4],
992 float best_combined_error[21][13],
993 int best_combined_format[21][13][4]
994 ) {
995 for (int i = QUANT_2; i <= QUANT_256; i++)
996 {
997 for (int j = 0; j < 13; j++)
998 {
999 best_combined_error[i][j] = ERROR_CALC_DEFAULT;
1000 }
1001 }
1002
1003 for (int quant = QUANT_6; quant <= QUANT_256; quant++)
1004 {
1005 for (int i = 0; i < 4; i++) // integer-count for first endpoint-pair
1006 {
1007 for (int j = 0; j < 4; j++) // integer-count for second endpoint-pair
1008 {
1009 int low2 = astc::min(i, j);
1010 int high2 = astc::max(i, j);
1011 if ((high2 - low2) > 1)
1012 {
1013 continue;
1014 }
1015
1016 for (int k = 0; k < 4; k++) // integer-count for third endpoint-pair
1017 {
1018 int low3 = astc::min(k, low2);
1019 int high3 = astc::max(k, high2);
1020 if ((high3 - low3) > 1)
1021 {
1022 continue;
1023 }
1024
1025 for (int l = 0; l < 4; l++) // integer-count for fourth endpoint-pair
1026 {
1027 int low4 = astc::min(l, low3);
1028 int high4 = astc::max(l, high3);
1029 if ((high4 - low4) > 1)
1030 {
1031 continue;
1032 }
1033
1034 int intcnt = i + j + k + l;
1035 float errorterm = astc::min(best_error[0][quant][i] + best_error[1][quant][j] + best_error[2][quant][k] + best_error[3][quant][l], 1e10f);
1036 if (errorterm <= best_combined_error[quant][intcnt])
1037 {
1038 best_combined_error[quant][intcnt] = errorterm;
1039 best_combined_format[quant][intcnt][0] = best_format[0][quant][i];
1040 best_combined_format[quant][intcnt][1] = best_format[1][quant][j];
1041 best_combined_format[quant][intcnt][2] = best_format[2][quant][k];
1042 best_combined_format[quant][intcnt][3] = best_format[3][quant][l];
1043 }
1044 }
1045 }
1046 }
1047 }
1048 }
1049 }
1050
1051 /**
1052 * @brief For 4 partitions compute the best format and quantization for a given bit count.
1053 *
1054 * @param best_combined_error The best error for each quant level and integer count.
1055 * @param best_combined_format The best format for each quant level and integer count.
1056 * @param bits_available The number of bits available for encoding.
1057 * @param[out] best_quant_level The output best color quant level.
1058 * @param[out] best_quant_level_mod The output best color quant level assuming two more bits are available.
1059 * @param[out] best_formats The output best color formats.
1060 *
1061 * @return best_error The output error for the best pairing.
1062 */
four_partitions_find_best_combination_for_bitcount(const float best_combined_error[21][13],const int best_combined_format[21][13][4],int bits_available,quant_method & best_quant_level,quant_method & best_quant_level_mod,int * best_formats)1063 static float four_partitions_find_best_combination_for_bitcount(
1064 const float best_combined_error[21][13],
1065 const int best_combined_format[21][13][4],
1066 int bits_available,
1067 quant_method& best_quant_level,
1068 quant_method& best_quant_level_mod,
1069 int* best_formats
1070 ) {
1071 int best_integer_count = 0;
1072 float best_integer_count_error = ERROR_CALC_DEFAULT;
1073
1074 for (int integer_count = 4; integer_count <= 9; integer_count++)
1075 {
1076 // Compute the quantization level for a given number of integers and a given number of bits
1077 int quant_level = quant_mode_table[integer_count][bits_available];
1078
1079 // Don't have enough bits to represent a given endpoint format at all!
1080 if (quant_level < QUANT_6)
1081 {
1082 break;
1083 }
1084
1085 float integer_count_error = best_combined_error[quant_level][integer_count - 4];
1086 if (integer_count_error < best_integer_count_error)
1087 {
1088 best_integer_count_error = integer_count_error;
1089 best_integer_count = integer_count;
1090 }
1091 }
1092
1093 int ql = quant_mode_table[best_integer_count][bits_available];
1094 int ql_mod = quant_mode_table[best_integer_count][bits_available + 8];
1095
1096 best_quant_level = static_cast<quant_method>(ql);
1097 best_quant_level_mod = static_cast<quant_method>(ql_mod);
1098
1099 if (ql >= QUANT_6)
1100 {
1101 for (int i = 0; i < 4; i++)
1102 {
1103 best_formats[i] = best_combined_format[ql][best_integer_count - 4][i];
1104 }
1105 }
1106 else
1107 {
1108 for (int i = 0; i < 4; i++)
1109 {
1110 best_formats[i] = FMT_LUMINANCE;
1111 }
1112 }
1113
1114 return best_integer_count_error;
1115 }
1116
1117 /* See header for documentation. */
compute_ideal_endpoint_formats(QualityProfile privateProfile,const partition_info & pi,const image_block & blk,const endpoints & ep,const int * qwt_bitcounts,const float * qwt_errors,unsigned int tune_candidate_limit,unsigned int start_block_mode,unsigned int end_block_mode,int partition_format_specifiers[TUNE_MAX_TRIAL_CANDIDATES][BLOCK_MAX_PARTITIONS],int block_mode[TUNE_MAX_TRIAL_CANDIDATES],quant_method quant_level[TUNE_MAX_TRIAL_CANDIDATES],quant_method quant_level_mod[TUNE_MAX_TRIAL_CANDIDATES],compression_working_buffers & tmpbuf)1118 unsigned int compute_ideal_endpoint_formats(
1119 QualityProfile privateProfile,
1120 const partition_info& pi,
1121 const image_block& blk,
1122 const endpoints& ep,
1123 // bitcounts and errors computed for the various quantization methods
1124 const int* qwt_bitcounts,
1125 const float* qwt_errors,
1126 unsigned int tune_candidate_limit,
1127 unsigned int start_block_mode,
1128 unsigned int end_block_mode,
1129 // output data
1130 int partition_format_specifiers[TUNE_MAX_TRIAL_CANDIDATES][BLOCK_MAX_PARTITIONS],
1131 int block_mode[TUNE_MAX_TRIAL_CANDIDATES],
1132 quant_method quant_level[TUNE_MAX_TRIAL_CANDIDATES],
1133 quant_method quant_level_mod[TUNE_MAX_TRIAL_CANDIDATES],
1134 compression_working_buffers& tmpbuf
1135 ) {
1136 int partition_count = pi.partition_count;
1137
1138 promise(partition_count > 0);
1139
1140 int encode_hdr_rgb = blk.rgb_lns[0];
1141 int encode_hdr_alpha = blk.alpha_lns[0];
1142
1143 // Compute the errors that result from various encoding choices (such as using luminance instead
1144 // of RGB, discarding Alpha, using RGB-scale in place of two separate RGB endpoints and so on)
1145 encoding_choice_errors eci[BLOCK_MAX_PARTITIONS];
1146 compute_encoding_choice_errors(blk, pi, ep, eci);
1147
1148 float best_error[BLOCK_MAX_PARTITIONS][21][4];
1149 int format_of_choice[BLOCK_MAX_PARTITIONS][21][4];
1150 for (int i = 0; i < partition_count; i++)
1151 {
1152 compute_color_error_for_every_integer_count_and_quant_level(
1153 encode_hdr_rgb, encode_hdr_alpha, i,
1154 pi, eci[i], ep, blk.channel_weight, best_error[i],
1155 format_of_choice[i]);
1156 }
1157
1158 float* errors_of_best_combination = tmpbuf.errors_of_best_combination;
1159 quant_method* best_quant_levels = tmpbuf.best_quant_levels;
1160 quant_method* best_quant_levels_mod = tmpbuf.best_quant_levels_mod;
1161 int (&best_ep_formats)[WEIGHTS_MAX_BLOCK_MODES][BLOCK_MAX_PARTITIONS] = tmpbuf.best_ep_formats;
1162
1163 // Ensure that the "overstep" of the last iteration in the vectorized loop will contain data
1164 // that will never be picked as best candidate
1165 const unsigned int packed_end_block_mode = round_up_to_simd_multiple_vla(end_block_mode);
1166
1167 // TODO: Can we avoid this?
1168 for (unsigned int i = 0; i < start_block_mode; i++)
1169 {
1170 errors_of_best_combination[i] = ERROR_CALC_DEFAULT;
1171 best_quant_levels[i] = QUANT_2;
1172 best_quant_levels_mod[i] = QUANT_2;
1173 }
1174
1175 for (unsigned int i = end_block_mode; i < packed_end_block_mode; i++)
1176 {
1177 errors_of_best_combination[i] = ERROR_CALC_DEFAULT;
1178 best_quant_levels[i] = QUANT_2;
1179 best_quant_levels_mod[i] = QUANT_2;
1180 }
1181
1182 // Track a scalar best to avoid expensive search at least once ...
1183 float error_of_best_combination = ERROR_CALC_DEFAULT;
1184 int index_of_best_combination = -1;
1185
1186 // The block contains 1 partition
1187 if (partition_count == 1)
1188 {
1189 for (unsigned int i = start_block_mode; i < end_block_mode; ++i)
1190 {
1191 if (qwt_errors[i] >= ERROR_CALC_DEFAULT)
1192 {
1193 errors_of_best_combination[i] = ERROR_CALC_DEFAULT;
1194 continue;
1195 }
1196
1197 float error_of_best = one_partition_find_best_combination_for_bitcount(
1198 privateProfile,
1199 best_error[0], format_of_choice[0], qwt_bitcounts[i],
1200 best_quant_levels[i], best_ep_formats[i][0]);
1201
1202 float total_error = error_of_best + qwt_errors[i];
1203 errors_of_best_combination[i] = total_error;
1204 best_quant_levels_mod[i] = best_quant_levels[i];
1205
1206 if (total_error < error_of_best_combination)
1207 {
1208 error_of_best_combination = total_error;
1209 index_of_best_combination = i;
1210 }
1211 }
1212 }
1213 // The block contains 2 partitions
1214 else if (partition_count == 2)
1215 {
1216 float combined_best_error[21][7];
1217 int formats_of_choice[21][7][2];
1218
1219 two_partitions_find_best_combination_for_every_quantization_and_integer_count(
1220 best_error, format_of_choice, combined_best_error, formats_of_choice);
1221
1222 assert(start_block_mode == 0);
1223 for (unsigned int i = 0; i < end_block_mode; ++i)
1224 {
1225 if (qwt_errors[i] >= ERROR_CALC_DEFAULT)
1226 {
1227 errors_of_best_combination[i] = ERROR_CALC_DEFAULT;
1228 continue;
1229 }
1230
1231 float error_of_best = two_partitions_find_best_combination_for_bitcount(
1232 combined_best_error, formats_of_choice, qwt_bitcounts[i],
1233 best_quant_levels[i], best_quant_levels_mod[i],
1234 best_ep_formats[i]);
1235
1236 float total_error = error_of_best + qwt_errors[i];
1237 errors_of_best_combination[i] = total_error;
1238
1239 if (total_error < error_of_best_combination)
1240 {
1241 error_of_best_combination = total_error;
1242 index_of_best_combination = i;
1243 }
1244 }
1245 }
1246 // The block contains 3 partitions
1247 else if (partition_count == 3)
1248 {
1249 float combined_best_error[21][10];
1250 int formats_of_choice[21][10][3];
1251
1252 three_partitions_find_best_combination_for_every_quantization_and_integer_count(
1253 best_error, format_of_choice, combined_best_error, formats_of_choice);
1254
1255 assert(start_block_mode == 0);
1256 for (unsigned int i = 0; i < end_block_mode; ++i)
1257 {
1258 if (qwt_errors[i] >= ERROR_CALC_DEFAULT)
1259 {
1260 errors_of_best_combination[i] = ERROR_CALC_DEFAULT;
1261 continue;
1262 }
1263
1264 float error_of_best = three_partitions_find_best_combination_for_bitcount(
1265 combined_best_error, formats_of_choice, qwt_bitcounts[i],
1266 best_quant_levels[i], best_quant_levels_mod[i],
1267 best_ep_formats[i]);
1268
1269 float total_error = error_of_best + qwt_errors[i];
1270 errors_of_best_combination[i] = total_error;
1271
1272 if (total_error < error_of_best_combination)
1273 {
1274 error_of_best_combination = total_error;
1275 index_of_best_combination = i;
1276 }
1277 }
1278 }
1279 // The block contains 4 partitions
1280 else // if (partition_count == 4)
1281 {
1282 assert(partition_count == 4);
1283 float combined_best_error[21][13];
1284 int formats_of_choice[21][13][4];
1285
1286 four_partitions_find_best_combination_for_every_quantization_and_integer_count(
1287 best_error, format_of_choice, combined_best_error, formats_of_choice);
1288
1289 assert(start_block_mode == 0);
1290 for (unsigned int i = 0; i < end_block_mode; ++i)
1291 {
1292 if (qwt_errors[i] >= ERROR_CALC_DEFAULT)
1293 {
1294 errors_of_best_combination[i] = ERROR_CALC_DEFAULT;
1295 continue;
1296 }
1297
1298 float error_of_best = four_partitions_find_best_combination_for_bitcount(
1299 combined_best_error, formats_of_choice, qwt_bitcounts[i],
1300 best_quant_levels[i], best_quant_levels_mod[i],
1301 best_ep_formats[i]);
1302
1303 float total_error = error_of_best + qwt_errors[i];
1304 errors_of_best_combination[i] = total_error;
1305
1306 if (total_error < error_of_best_combination)
1307 {
1308 error_of_best_combination = total_error;
1309 index_of_best_combination = i;
1310 }
1311 }
1312 }
1313
1314 int best_error_weights[TUNE_MAX_TRIAL_CANDIDATES];
1315
1316 // Fast path the first result and avoid the list search for trial 0
1317 best_error_weights[0] = index_of_best_combination;
1318 if (index_of_best_combination >= 0)
1319 {
1320 errors_of_best_combination[index_of_best_combination] = ERROR_CALC_DEFAULT;
1321 }
1322
1323 // Search the remaining results and pick the best candidate modes for trial 1+
1324 for (unsigned int i = 1; i < tune_candidate_limit; i++)
1325 {
1326 vint vbest_error_index(-1);
1327 vfloat vbest_ep_error(ERROR_CALC_DEFAULT);
1328
1329 start_block_mode = round_down_to_simd_multiple_vla(start_block_mode);
1330 vint lane_ids = vint::lane_id() + vint(start_block_mode);
1331 for (unsigned int j = start_block_mode; j < end_block_mode; j += ASTCENC_SIMD_WIDTH)
1332 {
1333 vfloat err = vfloat(&errors_of_best_combination[j]);
1334 vmask mask1 = err < vbest_ep_error;
1335 vmask mask2 = vint(reinterpret_cast<int*>(best_quant_levels + j)) > vint(4);
1336 vmask mask = mask1 & mask2;
1337 vbest_ep_error = select(vbest_ep_error, err, mask);
1338 vbest_error_index = select(vbest_error_index, lane_ids, mask);
1339 lane_ids += vint(ASTCENC_SIMD_WIDTH);
1340 }
1341
1342 // Pick best mode from the SIMD result, using lowest matching index to ensure invariance
1343 vmask lanes_min_error = vbest_ep_error == hmin(vbest_ep_error);
1344 vbest_error_index = select(vint(0x7FFFFFFF), vbest_error_index, lanes_min_error);
1345 vbest_error_index = hmin(vbest_error_index);
1346 int best_error_index = vbest_error_index.lane<0>();
1347
1348 best_error_weights[i] = best_error_index;
1349
1350 // Max the error for this candidate so we don't pick it again
1351 if (best_error_index >= 0)
1352 {
1353 errors_of_best_combination[best_error_index] = ERROR_CALC_DEFAULT;
1354 }
1355 // Early-out if no more candidates are valid
1356 else
1357 {
1358 break;
1359 }
1360 }
1361
1362 for (unsigned int i = 0; i < tune_candidate_limit; i++)
1363 {
1364 if (best_error_weights[i] < 0)
1365 {
1366 return i;
1367 }
1368
1369 block_mode[i] = best_error_weights[i];
1370
1371 quant_level[i] = best_quant_levels[best_error_weights[i]];
1372 quant_level_mod[i] = best_quant_levels_mod[best_error_weights[i]];
1373
1374 assert(quant_level[i] >= QUANT_6 && quant_level[i] <= QUANT_256);
1375 assert(quant_level_mod[i] >= QUANT_6 && quant_level_mod[i] <= QUANT_256);
1376
1377 for (int j = 0; j < partition_count; j++)
1378 {
1379 partition_format_specifiers[i][j] = best_ep_formats[best_error_weights[i]][j];
1380 }
1381 }
1382
1383 return tune_candidate_limit;
1384 }
1385
1386 #endif
1387