1 // SPDX-License-Identifier: Apache-2.0
2 // ----------------------------------------------------------------------------
3 // Copyright 2011-2022 Arm Limited
4 //
5 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
6 // use this file except in compliance with the License. You may obtain a copy
7 // of the License at:
8 //
9 // http://www.apache.org/licenses/LICENSE-2.0
10 //
11 // Unless required by applicable law or agreed to in writing, software
12 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
13 // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
14 // License for the specific language governing permissions and limitations
15 // under the License.
16 // ----------------------------------------------------------------------------
17
18 #if !defined(ASTCENC_DECOMPRESS_ONLY)
19
20 /**
21 * @brief Functions for computing color endpoints and texel weights.
22 */
23
24 #include <cassert>
25
26 #include "astcenc_internal.h"
27 #include "astcenc_vecmathlib.h"
28
29 /**
30 * @brief Compute the ideal endpoints and weights for 1 color component.
31 *
32 * @param blk The image block color data to compress.
33 * @param pi The partition info for the current trial.
34 * @param[out] ei The computed ideal endpoints and weights.
35 * @param component The color component to compute.
36 */
compute_ideal_colors_and_weights_1_comp(const image_block & blk,const partition_info & pi,endpoints_and_weights & ei,unsigned int component)37 static void compute_ideal_colors_and_weights_1_comp(
38 const image_block& blk,
39 const partition_info& pi,
40 endpoints_and_weights& ei,
41 unsigned int component
42 ) {
43 unsigned int partition_count = pi.partition_count;
44 ei.ep.partition_count = partition_count;
45 promise(partition_count > 0);
46
47 unsigned int texel_count = blk.texel_count;
48 promise(texel_count > 0);
49
50 float error_weight;
51 const float* data_vr = nullptr;
52
53 assert(component < BLOCK_MAX_COMPONENTS);
54 switch (component)
55 {
56 case 0:
57 error_weight = blk.channel_weight.lane<0>();
58 data_vr = blk.data_r;
59 break;
60 case 1:
61 error_weight = blk.channel_weight.lane<1>();
62 data_vr = blk.data_g;
63 break;
64 case 2:
65 error_weight = blk.channel_weight.lane<2>();
66 data_vr = blk.data_b;
67 break;
68 default:
69 assert(component == 3);
70 error_weight = blk.channel_weight.lane<3>();
71 data_vr = blk.data_a;
72 break;
73 }
74
75 vmask4 sep_mask = vint4::lane_id() == vint4(component);
76 bool is_constant_wes { true };
77 float partition0_len_sq { 0.0f };
78
79 for (unsigned int i = 0; i < partition_count; i++)
80 {
81 float lowvalue { 1e10f };
82 float highvalue { -1e10f };
83
84 unsigned int partition_texel_count = pi.partition_texel_count[i];
85 for (unsigned int j = 0; j < partition_texel_count; j++)
86 {
87 unsigned int tix = pi.texels_of_partition[i][j];
88 float value = data_vr[tix];
89 lowvalue = astc::min(value, lowvalue);
90 highvalue = astc::max(value, highvalue);
91 }
92
93 if (highvalue < lowvalue)
94 {
95 lowvalue = 0.0f;
96 highvalue = 1e-7f;
97 }
98
99 float length = highvalue - lowvalue;
100 float length_squared = length * length;
101 float scale = 1.0f / length;
102
103 if (i == 0)
104 {
105 partition0_len_sq = length_squared;
106 }
107 else
108 {
109 is_constant_wes = is_constant_wes && length_squared == partition0_len_sq;
110 }
111
112 for (unsigned int j = 0; j < partition_texel_count; j++)
113 {
114 unsigned int tix = pi.texels_of_partition[i][j];
115 float value = (data_vr[tix] - lowvalue) * scale;
116 value = astc::clamp1f(value);
117
118 ei.weights[tix] = value;
119 ei.weight_error_scale[tix] = length_squared * error_weight;
120 assert(!astc::isnan(ei.weight_error_scale[tix]));
121 }
122
123 ei.ep.endpt0[i] = select(blk.data_min, vfloat4(lowvalue), sep_mask);
124 ei.ep.endpt1[i] = select(blk.data_max, vfloat4(highvalue), sep_mask);
125 }
126
127 // Zero initialize any SIMD over-fetch
128 unsigned int texel_count_simd = round_up_to_simd_multiple_vla(texel_count);
129 for (unsigned int i = texel_count; i < texel_count_simd; i++)
130 {
131 ei.weights[i] = 0.0f;
132 ei.weight_error_scale[i] = 0.0f;
133 }
134
135 ei.is_constant_weight_error_scale = is_constant_wes;
136 }
137
138 /**
139 * @brief Compute the ideal endpoints and weights for 2 color components.
140 *
141 * @param blk The image block color data to compress.
142 * @param pi The partition info for the current trial.
143 * @param[out] ei The computed ideal endpoints and weights.
144 * @param component1 The first color component to compute.
145 * @param component2 The second color component to compute.
146 */
compute_ideal_colors_and_weights_2_comp(const image_block & blk,const partition_info & pi,endpoints_and_weights & ei,int component1,int component2)147 static void compute_ideal_colors_and_weights_2_comp(
148 const image_block& blk,
149 const partition_info& pi,
150 endpoints_and_weights& ei,
151 int component1,
152 int component2
153 ) {
154 unsigned int partition_count = pi.partition_count;
155 ei.ep.partition_count = partition_count;
156 promise(partition_count > 0);
157
158 unsigned int texel_count = blk.texel_count;
159 promise(texel_count > 0);
160
161 partition_metrics pms[BLOCK_MAX_PARTITIONS];
162
163 float error_weight;
164 const float* data_vr = nullptr;
165 const float* data_vg = nullptr;
166
167 if (component1 == 0 && component2 == 1)
168 {
169 error_weight = hadd_s(blk.channel_weight.swz<0, 1>()) / 2.0f;
170
171 data_vr = blk.data_r;
172 data_vg = blk.data_g;
173 }
174 else if (component1 == 0 && component2 == 2)
175 {
176 error_weight = hadd_s(blk.channel_weight.swz<0, 2>()) / 2.0f;
177
178 data_vr = blk.data_r;
179 data_vg = blk.data_b;
180 }
181 else // (component1 == 1 && component2 == 2)
182 {
183 assert(component1 == 1 && component2 == 2);
184
185 error_weight = hadd_s(blk.channel_weight.swz<1, 2>()) / 2.0f;
186
187 data_vr = blk.data_g;
188 data_vg = blk.data_b;
189 }
190
191 compute_avgs_and_dirs_2_comp(pi, blk, component1, component2, pms);
192
193 bool is_constant_wes { true };
194 float partition0_len_sq { 0.0f };
195
196 vmask4 comp1_mask = vint4::lane_id() == vint4(component1);
197 vmask4 comp2_mask = vint4::lane_id() == vint4(component2);
198
199 for (unsigned int i = 0; i < partition_count; i++)
200 {
201 vfloat4 dir = pms[i].dir.swz<0, 1>();
202 if (hadd_s(dir) < 0.0f)
203 {
204 dir = vfloat4::zero() - dir;
205 }
206
207 line2 line { pms[i].avg.swz<0, 1>(), normalize_safe(dir, unit2()) };
208 float lowparam { 1e10f };
209 float highparam { -1e10f };
210
211 unsigned int partition_texel_count = pi.partition_texel_count[i];
212 for (unsigned int j = 0; j < partition_texel_count; j++)
213 {
214 unsigned int tix = pi.texels_of_partition[i][j];
215 vfloat4 point = vfloat2(data_vr[tix], data_vg[tix]);
216 float param = dot_s(point - line.a, line.b);
217 ei.weights[tix] = param;
218
219 lowparam = astc::min(param, lowparam);
220 highparam = astc::max(param, highparam);
221 }
222
223 // It is possible for a uniform-color partition to produce length=0;
224 // this causes NaN issues so set to small value to avoid this problem
225 if (highparam < lowparam)
226 {
227 lowparam = 0.0f;
228 highparam = 1e-7f;
229 }
230
231 float length = highparam - lowparam;
232 float length_squared = length * length;
233 float scale = 1.0f / length;
234
235 if (i == 0)
236 {
237 partition0_len_sq = length_squared;
238 }
239 else
240 {
241 is_constant_wes = is_constant_wes && length_squared == partition0_len_sq;
242 }
243
244 for (unsigned int j = 0; j < partition_texel_count; j++)
245 {
246 unsigned int tix = pi.texels_of_partition[i][j];
247 float idx = (ei.weights[tix] - lowparam) * scale;
248 idx = astc::clamp1f(idx);
249
250 ei.weights[tix] = idx;
251 ei.weight_error_scale[tix] = length_squared * error_weight;
252 assert(!astc::isnan(ei.weight_error_scale[tix]));
253 }
254
255 vfloat4 lowvalue = line.a + line.b * lowparam;
256 vfloat4 highvalue = line.a + line.b * highparam;
257
258 vfloat4 ep0 = select(blk.data_min, vfloat4(lowvalue.lane<0>()), comp1_mask);
259 vfloat4 ep1 = select(blk.data_max, vfloat4(highvalue.lane<0>()), comp1_mask);
260
261 ei.ep.endpt0[i] = select(ep0, vfloat4(lowvalue.lane<1>()), comp2_mask);
262 ei.ep.endpt1[i] = select(ep1, vfloat4(highvalue.lane<1>()), comp2_mask);
263 }
264
265 // Zero initialize any SIMD over-fetch
266 unsigned int texel_count_simd = round_up_to_simd_multiple_vla(texel_count);
267 for (unsigned int i = texel_count; i < texel_count_simd; i++)
268 {
269 ei.weights[i] = 0.0f;
270 ei.weight_error_scale[i] = 0.0f;
271 }
272
273 ei.is_constant_weight_error_scale = is_constant_wes;
274 }
275
276 /**
277 * @brief Compute the ideal endpoints and weights for 3 color components.
278 *
279 * @param blk The image block color data to compress.
280 * @param pi The partition info for the current trial.
281 * @param[out] ei The computed ideal endpoints and weights.
282 * @param omitted_component The color component excluded from the calculation.
283 */
compute_ideal_colors_and_weights_3_comp(const image_block & blk,const partition_info & pi,endpoints_and_weights & ei,unsigned int omitted_component)284 static void compute_ideal_colors_and_weights_3_comp(
285 const image_block& blk,
286 const partition_info& pi,
287 endpoints_and_weights& ei,
288 unsigned int omitted_component
289 ) {
290 unsigned int partition_count = pi.partition_count;
291 ei.ep.partition_count = partition_count;
292 promise(partition_count > 0);
293
294 unsigned int texel_count = blk.texel_count;
295 promise(texel_count > 0);
296
297 partition_metrics pms[BLOCK_MAX_PARTITIONS];
298
299 float error_weight;
300 const float* data_vr = nullptr;
301 const float* data_vg = nullptr;
302 const float* data_vb = nullptr;
303 if (omitted_component == 0)
304 {
305 error_weight = hadd_s(blk.channel_weight.swz<0, 1, 2>());
306 data_vr = blk.data_g;
307 data_vg = blk.data_b;
308 data_vb = blk.data_a;
309 }
310 else if (omitted_component == 1)
311 {
312 error_weight = hadd_s(blk.channel_weight.swz<0, 2, 3>());
313 data_vr = blk.data_r;
314 data_vg = blk.data_b;
315 data_vb = blk.data_a;
316 }
317 else if (omitted_component == 2)
318 {
319 error_weight = hadd_s(blk.channel_weight.swz<0, 1, 3>());
320 data_vr = blk.data_r;
321 data_vg = blk.data_g;
322 data_vb = blk.data_a;
323 }
324 else
325 {
326 assert(omitted_component == 3);
327
328 error_weight = hadd_s(blk.channel_weight.swz<0, 1, 2>());
329 data_vr = blk.data_r;
330 data_vg = blk.data_g;
331 data_vb = blk.data_b;
332 }
333
334 error_weight = error_weight * (1.0f / 3.0f);
335
336 if (omitted_component == 3)
337 {
338 compute_avgs_and_dirs_3_comp_rgb(pi, blk, pms);
339 }
340 else
341 {
342 compute_avgs_and_dirs_3_comp(pi, blk, omitted_component, pms);
343 }
344
345 bool is_constant_wes { true };
346 float partition0_len_sq { 0.0f };
347
348 for (unsigned int i = 0; i < partition_count; i++)
349 {
350 vfloat4 dir = pms[i].dir;
351 if (hadd_rgb_s(dir) < 0.0f)
352 {
353 dir = vfloat4::zero() - dir;
354 }
355
356 line3 line { pms[i].avg, normalize_safe(dir, unit3()) };
357 float lowparam { 1e10f };
358 float highparam { -1e10f };
359
360 unsigned int partition_texel_count = pi.partition_texel_count[i];
361 for (unsigned int j = 0; j < partition_texel_count; j++)
362 {
363 unsigned int tix = pi.texels_of_partition[i][j];
364 vfloat4 point = vfloat3(data_vr[tix], data_vg[tix], data_vb[tix]);
365 float param = dot3_s(point - line.a, line.b);
366 ei.weights[tix] = param;
367
368 lowparam = astc::min(param, lowparam);
369 highparam = astc::max(param, highparam);
370 }
371
372 // It is possible for a uniform-color partition to produce length=0;
373 // this causes NaN issues so set to small value to avoid this problem
374 if (highparam < lowparam)
375 {
376 lowparam = 0.0f;
377 highparam = 1e-7f;
378 }
379
380 float length = highparam - lowparam;
381 float length_squared = length * length;
382 float scale = 1.0f / length;
383
384 if (i == 0)
385 {
386 partition0_len_sq = length_squared;
387 }
388 else
389 {
390 is_constant_wes = is_constant_wes && length_squared == partition0_len_sq;
391 }
392
393 for (unsigned int j = 0; j < partition_texel_count; j++)
394 {
395 unsigned int tix = pi.texels_of_partition[i][j];
396 float idx = (ei.weights[tix] - lowparam) * scale;
397 idx = astc::clamp1f(idx);
398
399 ei.weights[tix] = idx;
400 ei.weight_error_scale[tix] = length_squared * error_weight;
401 assert(!astc::isnan(ei.weight_error_scale[tix]));
402 }
403
404 vfloat4 ep0 = line.a + line.b * lowparam;
405 vfloat4 ep1 = line.a + line.b * highparam;
406
407 vfloat4 bmin = blk.data_min;
408 vfloat4 bmax = blk.data_max;
409
410 assert(omitted_component < BLOCK_MAX_COMPONENTS);
411 switch (omitted_component)
412 {
413 case 0:
414 ei.ep.endpt0[i] = vfloat4(bmin.lane<0>(), ep0.lane<0>(), ep0.lane<1>(), ep0.lane<2>());
415 ei.ep.endpt1[i] = vfloat4(bmax.lane<0>(), ep1.lane<0>(), ep1.lane<1>(), ep1.lane<2>());
416 break;
417 case 1:
418 ei.ep.endpt0[i] = vfloat4(ep0.lane<0>(), bmin.lane<1>(), ep0.lane<1>(), ep0.lane<2>());
419 ei.ep.endpt1[i] = vfloat4(ep1.lane<0>(), bmax.lane<1>(), ep1.lane<1>(), ep1.lane<2>());
420 break;
421 case 2:
422 ei.ep.endpt0[i] = vfloat4(ep0.lane<0>(), ep0.lane<1>(), bmin.lane<2>(), ep0.lane<2>());
423 ei.ep.endpt1[i] = vfloat4(ep1.lane<0>(), ep1.lane<1>(), bmax.lane<2>(), ep1.lane<2>());
424 break;
425 default:
426 ei.ep.endpt0[i] = vfloat4(ep0.lane<0>(), ep0.lane<1>(), ep0.lane<2>(), bmin.lane<3>());
427 ei.ep.endpt1[i] = vfloat4(ep1.lane<0>(), ep1.lane<1>(), ep1.lane<2>(), bmax.lane<3>());
428 break;
429 }
430 }
431
432 // Zero initialize any SIMD over-fetch
433 unsigned int texel_count_simd = round_up_to_simd_multiple_vla(texel_count);
434 for (unsigned int i = texel_count; i < texel_count_simd; i++)
435 {
436 ei.weights[i] = 0.0f;
437 ei.weight_error_scale[i] = 0.0f;
438 }
439
440 ei.is_constant_weight_error_scale = is_constant_wes;
441 }
442
443 /**
444 * @brief Compute the ideal endpoints and weights for 4 color components.
445 *
446 * @param blk The image block color data to compress.
447 * @param pi The partition info for the current trial.
448 * @param[out] ei The computed ideal endpoints and weights.
449 */
compute_ideal_colors_and_weights_4_comp(const image_block & blk,const partition_info & pi,endpoints_and_weights & ei)450 static void compute_ideal_colors_and_weights_4_comp(
451 const image_block& blk,
452 const partition_info& pi,
453 endpoints_and_weights& ei
454 ) {
455 const float error_weight = hadd_s(blk.channel_weight) / 4.0f;
456
457 unsigned int partition_count = pi.partition_count;
458
459 unsigned int texel_count = blk.texel_count;
460 promise(texel_count > 0);
461 promise(partition_count > 0);
462
463 partition_metrics pms[BLOCK_MAX_PARTITIONS];
464
465 compute_avgs_and_dirs_4_comp(pi, blk, pms);
466
467 bool is_constant_wes { true };
468 float partition0_len_sq { 0.0f };
469
470 for (unsigned int i = 0; i < partition_count; i++)
471 {
472 vfloat4 dir = pms[i].dir;
473 if (hadd_rgb_s(dir) < 0.0f)
474 {
475 dir = vfloat4::zero() - dir;
476 }
477
478 line4 line { pms[i].avg, normalize_safe(dir, unit4()) };
479 float lowparam { 1e10f };
480 float highparam { -1e10f };
481
482 unsigned int partition_texel_count = pi.partition_texel_count[i];
483 for (unsigned int j = 0; j < partition_texel_count; j++)
484 {
485 unsigned int tix = pi.texels_of_partition[i][j];
486 vfloat4 point = blk.texel(tix);
487 float param = dot_s(point - line.a, line.b);
488 ei.weights[tix] = param;
489
490 lowparam = astc::min(param, lowparam);
491 highparam = astc::max(param, highparam);
492 }
493
494 // It is possible for a uniform-color partition to produce length=0;
495 // this causes NaN issues so set to small value to avoid this problem
496 if (highparam < lowparam)
497 {
498 lowparam = 0.0f;
499 highparam = 1e-7f;
500 }
501
502 float length = highparam - lowparam;
503 float length_squared = length * length;
504 float scale = 1.0f / length;
505
506 if (i == 0)
507 {
508 partition0_len_sq = length_squared;
509 }
510 else
511 {
512 is_constant_wes = is_constant_wes && length_squared == partition0_len_sq;
513 }
514
515 ei.ep.endpt0[i] = line.a + line.b * lowparam;
516 ei.ep.endpt1[i] = line.a + line.b * highparam;
517
518 for (unsigned int j = 0; j < partition_texel_count; j++)
519 {
520 unsigned int tix = pi.texels_of_partition[i][j];
521 float idx = (ei.weights[tix] - lowparam) * scale;
522 idx = astc::clamp1f(idx);
523
524 ei.weights[tix] = idx;
525 ei.weight_error_scale[tix] = length_squared * error_weight;
526 assert(!astc::isnan(ei.weight_error_scale[tix]));
527 }
528 }
529
530 // Zero initialize any SIMD over-fetch
531 unsigned int texel_count_simd = round_up_to_simd_multiple_vla(texel_count);
532 for (unsigned int i = texel_count; i < texel_count_simd; i++)
533 {
534 ei.weights[i] = 0.0f;
535 ei.weight_error_scale[i] = 0.0f;
536 }
537
538 ei.is_constant_weight_error_scale = is_constant_wes;
539 }
540
541 /* See header for documentation. */
compute_ideal_colors_and_weights_1plane(const image_block & blk,const partition_info & pi,endpoints_and_weights & ei)542 void compute_ideal_colors_and_weights_1plane(
543 const image_block& blk,
544 const partition_info& pi,
545 endpoints_and_weights& ei
546 ) {
547 bool uses_alpha = !blk.is_constant_channel(3);
548
549 if (uses_alpha)
550 {
551 compute_ideal_colors_and_weights_4_comp(blk, pi, ei);
552 }
553 else
554 {
555 compute_ideal_colors_and_weights_3_comp(blk, pi, ei, 3);
556 }
557 }
558
559 /* See header for documentation. */
compute_ideal_colors_and_weights_2planes(const block_size_descriptor & bsd,const image_block & blk,unsigned int plane2_component,endpoints_and_weights & ei1,endpoints_and_weights & ei2)560 void compute_ideal_colors_and_weights_2planes(
561 const block_size_descriptor& bsd,
562 const image_block& blk,
563 unsigned int plane2_component,
564 endpoints_and_weights& ei1,
565 endpoints_and_weights& ei2
566 ) {
567 const auto& pi = bsd.get_partition_info(1, 0);
568 bool uses_alpha = !blk.is_constant_channel(3);
569
570 assert(plane2_component < BLOCK_MAX_COMPONENTS);
571 switch (plane2_component)
572 {
573 case 0: // Separate weights for red
574 if (uses_alpha)
575 {
576 compute_ideal_colors_and_weights_3_comp(blk, pi, ei1, 0);
577 }
578 else
579 {
580 compute_ideal_colors_and_weights_2_comp(blk, pi, ei1, 1, 2);
581 }
582 compute_ideal_colors_and_weights_1_comp(blk, pi, ei2, 0);
583 break;
584
585 case 1: // Separate weights for green
586 if (uses_alpha)
587 {
588 compute_ideal_colors_and_weights_3_comp(blk, pi, ei1, 1);
589 }
590 else
591 {
592 compute_ideal_colors_and_weights_2_comp(blk, pi, ei1, 0, 2);
593 }
594 compute_ideal_colors_and_weights_1_comp(blk, pi, ei2, 1);
595 break;
596
597 case 2: // Separate weights for blue
598 if (uses_alpha)
599 {
600 compute_ideal_colors_and_weights_3_comp(blk, pi, ei1, 2);
601 }
602 else
603 {
604 compute_ideal_colors_and_weights_2_comp(blk, pi, ei1, 0, 1);
605 }
606 compute_ideal_colors_and_weights_1_comp(blk, pi, ei2, 2);
607 break;
608
609 default: // Separate weights for alpha
610 assert(uses_alpha);
611 compute_ideal_colors_and_weights_3_comp(blk, pi, ei1, 3);
612 compute_ideal_colors_and_weights_1_comp(blk, pi, ei2, 3);
613 break;
614 }
615 }
616
617 /* See header for documentation. */
compute_error_of_weight_set_1plane(const endpoints_and_weights & eai,const decimation_info & di,const float * dec_weight_quant_uvalue)618 float compute_error_of_weight_set_1plane(
619 const endpoints_and_weights& eai,
620 const decimation_info& di,
621 const float* dec_weight_quant_uvalue
622 ) {
623 vfloatacc error_summav = vfloatacc::zero();
624 float error_summa = 0.0f;
625 unsigned int texel_count = di.texel_count;
626
627 // Process SIMD-width chunks, safe to over-fetch - the extra space is zero initialized
628 if (di.max_texel_weight_count > 2)
629 {
630 for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
631 {
632 // Compute the bilinear interpolation of the decimated weight grid
633 vfloat current_values = bilinear_infill_vla(di, dec_weight_quant_uvalue, i);
634
635 // Compute the error between the computed value and the ideal weight
636 vfloat actual_values = loada(eai.weights + i);
637 vfloat diff = current_values - actual_values;
638 vfloat significance = loada(eai.weight_error_scale + i);
639 vfloat error = diff * diff * significance;
640
641 haccumulate(error_summav, error);
642 }
643 }
644 else if (di.max_texel_weight_count > 1)
645 {
646 for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
647 {
648 // Compute the bilinear interpolation of the decimated weight grid
649 vfloat current_values = bilinear_infill_vla_2(di, dec_weight_quant_uvalue, i);
650
651 // Compute the error between the computed value and the ideal weight
652 vfloat actual_values = loada(eai.weights + i);
653 vfloat diff = current_values - actual_values;
654 vfloat significance = loada(eai.weight_error_scale + i);
655 vfloat error = diff * diff * significance;
656
657 haccumulate(error_summav, error);
658 }
659 }
660 else
661 {
662 for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
663 {
664 // Load the weight set directly, without interpolation
665 vfloat current_values = loada(dec_weight_quant_uvalue + i);
666
667 // Compute the error between the computed value and the ideal weight
668 vfloat actual_values = loada(eai.weights + i);
669 vfloat diff = current_values - actual_values;
670 vfloat significance = loada(eai.weight_error_scale + i);
671 vfloat error = diff * diff * significance;
672
673 haccumulate(error_summav, error);
674 }
675 }
676
677 // Resolve the final scalar accumulator sum
678 return error_summa = hadd_s(error_summav);
679 }
680
681 /* See header for documentation. */
compute_error_of_weight_set_2planes(const endpoints_and_weights & eai1,const endpoints_and_weights & eai2,const decimation_info & di,const float * dec_weight_quant_uvalue_plane1,const float * dec_weight_quant_uvalue_plane2)682 float compute_error_of_weight_set_2planes(
683 const endpoints_and_weights& eai1,
684 const endpoints_and_weights& eai2,
685 const decimation_info& di,
686 const float* dec_weight_quant_uvalue_plane1,
687 const float* dec_weight_quant_uvalue_plane2
688 ) {
689 vfloatacc error_summav = vfloatacc::zero();
690 unsigned int texel_count = di.texel_count;
691
692 // Process SIMD-width chunks, safe to over-fetch - the extra space is zero initialized
693 if (di.max_texel_weight_count > 2)
694 {
695 for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
696 {
697 // Plane 1
698 // Compute the bilinear interpolation of the decimated weight grid
699 vfloat current_values1 = bilinear_infill_vla(di, dec_weight_quant_uvalue_plane1, i);
700
701 // Compute the error between the computed value and the ideal weight
702 vfloat actual_values1 = loada(eai1.weights + i);
703 vfloat diff = current_values1 - actual_values1;
704 vfloat error1 = diff * diff * loada(eai1.weight_error_scale + i);
705
706 // Plane 2
707 // Compute the bilinear interpolation of the decimated weight grid
708 vfloat current_values2 = bilinear_infill_vla(di, dec_weight_quant_uvalue_plane2, i);
709
710 // Compute the error between the computed value and the ideal weight
711 vfloat actual_values2 = loada(eai2.weights + i);
712 diff = current_values2 - actual_values2;
713 vfloat error2 = diff * diff * loada(eai2.weight_error_scale + i);
714
715 haccumulate(error_summav, error1 + error2);
716 }
717 }
718 else if (di.max_texel_weight_count > 1)
719 {
720 for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
721 {
722 // Plane 1
723 // Compute the bilinear interpolation of the decimated weight grid
724 vfloat current_values1 = bilinear_infill_vla_2(di, dec_weight_quant_uvalue_plane1, i);
725
726 // Compute the error between the computed value and the ideal weight
727 vfloat actual_values1 = loada(eai1.weights + i);
728 vfloat diff = current_values1 - actual_values1;
729 vfloat error1 = diff * diff * loada(eai1.weight_error_scale + i);
730
731 // Plane 2
732 // Compute the bilinear interpolation of the decimated weight grid
733 vfloat current_values2 = bilinear_infill_vla_2(di, dec_weight_quant_uvalue_plane2, i);
734
735 // Compute the error between the computed value and the ideal weight
736 vfloat actual_values2 = loada(eai2.weights + i);
737 diff = current_values2 - actual_values2;
738 vfloat error2 = diff * diff * loada(eai2.weight_error_scale + i);
739
740 haccumulate(error_summav, error1 + error2);
741 }
742 }
743 else
744 {
745 for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
746 {
747 // Plane 1
748 // Load the weight set directly, without interpolation
749 vfloat current_values1 = loada(dec_weight_quant_uvalue_plane1 + i);
750
751 // Compute the error between the computed value and the ideal weight
752 vfloat actual_values1 = loada(eai1.weights + i);
753 vfloat diff = current_values1 - actual_values1;
754 vfloat error1 = diff * diff * loada(eai1.weight_error_scale + i);
755
756 // Plane 2
757 // Load the weight set directly, without interpolation
758 vfloat current_values2 = loada(dec_weight_quant_uvalue_plane2 + i);
759
760 // Compute the error between the computed value and the ideal weight
761 vfloat actual_values2 = loada(eai2.weights + i);
762 diff = current_values2 - actual_values2;
763 vfloat error2 = diff * diff * loada(eai2.weight_error_scale + i);
764
765 haccumulate(error_summav, error1 + error2);
766 }
767 }
768
769 // Resolve the final scalar accumulator sum
770 return hadd_s(error_summav);
771 }
772
773 /* See header for documentation. */
compute_ideal_weights_for_decimation(const endpoints_and_weights & eai_in,endpoints_and_weights & eai_out,const decimation_info & di,float * dec_weight_ideal_value)774 void compute_ideal_weights_for_decimation(
775 const endpoints_and_weights& eai_in,
776 endpoints_and_weights& eai_out,
777 const decimation_info& di,
778 float* dec_weight_ideal_value
779 ) {
780 unsigned int texel_count = di.texel_count;
781 unsigned int weight_count = di.weight_count;
782 bool is_direct = texel_count == weight_count;
783 promise(texel_count > 0);
784 promise(weight_count > 0);
785
786 // This function includes a copy of the epw from eai_in to eai_out. We do it here because we
787 // want to load the data anyway, so we can avoid loading it from memory twice.
788 eai_out.ep = eai_in.ep;
789 eai_out.is_constant_weight_error_scale = eai_in.is_constant_weight_error_scale;
790
791 // Ensure that the end of the output arrays that are used for SIMD paths later are filled so we
792 // can safely run SIMD elsewhere without a loop tail. Note that this is always safe as weight
793 // arrays always contain space for 64 elements
794 unsigned int prev_weight_count_simd = round_down_to_simd_multiple_vla(weight_count - 1);
795 storea(vfloat::zero(), dec_weight_ideal_value + prev_weight_count_simd);
796
797 // If we have a 1:1 mapping just shortcut the computation - clone the weights into both the
798 // weight set and the output epw copy.
799
800 // Transfer enough to also copy zero initialized SIMD over-fetch region
801 unsigned int texel_count_simd = round_up_to_simd_multiple_vla(texel_count);
802 for (unsigned int i = 0; i < texel_count_simd; i += ASTCENC_SIMD_WIDTH)
803 {
804 vfloat weight(eai_in.weights + i);
805 vfloat weight_error_scale(eai_in.weight_error_scale + i);
806
807 storea(weight, eai_out.weights + i);
808 storea(weight_error_scale, eai_out.weight_error_scale + i);
809
810 // Direct 1:1 weight mapping, so clone weights directly
811 // TODO: Can we just avoid the copy for direct cases?
812 if (is_direct)
813 {
814 storea(weight, dec_weight_ideal_value + i);
815 }
816 }
817
818 if (is_direct)
819 {
820 return;
821 }
822
823 // Otherwise compute an estimate and perform single refinement iteration
824 alignas(ASTCENC_VECALIGN) float infilled_weights[BLOCK_MAX_TEXELS];
825
826 // Compute an initial average for each decimated weight
827 bool constant_wes = eai_in.is_constant_weight_error_scale;
828 vfloat weight_error_scale(eai_in.weight_error_scale[0]);
829
830 // This overshoots - this is OK as we initialize the array tails in the
831 // decimation table structures to safe values ...
832 for (unsigned int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH)
833 {
834 // Start with a small value to avoid div-by-zero later
835 vfloat weight_weight(1e-10f);
836 vfloat initial_weight = vfloat::zero();
837
838 // Accumulate error weighting of all the texels using this weight
839 vint weight_texel_count(di.weight_texel_count + i);
840 unsigned int max_texel_count = hmax(weight_texel_count).lane<0>();
841 promise(max_texel_count > 0);
842
843 for (unsigned int j = 0; j < max_texel_count; j++)
844 {
845 vint texel(di.weight_texel[j] + i);
846 vfloat weight = loada(di.weights_flt[j] + i);
847
848 if (!constant_wes)
849 {
850 weight_error_scale = gatherf(eai_in.weight_error_scale, texel);
851 }
852
853 vfloat contrib_weight = weight * weight_error_scale;
854
855 weight_weight += contrib_weight;
856 initial_weight += gatherf(eai_in.weights, texel) * contrib_weight;
857 }
858
859 storea(initial_weight / weight_weight, dec_weight_ideal_value + i);
860 }
861
862 // Populate the interpolated weight grid based on the initital average
863 // Process SIMD-width texel coordinates at at time while we can. Safe to
864 // over-process full SIMD vectors - the tail is zeroed.
865 if (di.max_texel_weight_count <= 2)
866 {
867 for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
868 {
869 vfloat weight = bilinear_infill_vla_2(di, dec_weight_ideal_value, i);
870 storea(weight, infilled_weights + i);
871 }
872 }
873 else
874 {
875 for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
876 {
877 vfloat weight = bilinear_infill_vla(di, dec_weight_ideal_value, i);
878 storea(weight, infilled_weights + i);
879 }
880 }
881
882 // Perform a single iteration of refinement
883 // Empirically determined step size; larger values don't help but smaller drops image quality
884 constexpr float stepsize = 0.25f;
885 constexpr float chd_scale = -WEIGHTS_TEXEL_SUM;
886
887 for (unsigned int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH)
888 {
889 vfloat weight_val = loada(dec_weight_ideal_value + i);
890
891 // Accumulate error weighting of all the texels using this weight
892 // Start with a small value to avoid div-by-zero later
893 vfloat error_change0(1e-10f);
894 vfloat error_change1(0.0f);
895
896 // Accumulate error weighting of all the texels using this weight
897 vint weight_texel_count(di.weight_texel_count + i);
898 unsigned int max_texel_count = hmax(weight_texel_count).lane<0>();
899 promise(max_texel_count > 0);
900
901 for (unsigned int j = 0; j < max_texel_count; j++)
902 {
903 vint texel(di.weight_texel[j] + i);
904 vfloat contrib_weight = loada(di.weights_flt[j] + i);
905
906 if (!constant_wes)
907 {
908 weight_error_scale = gatherf(eai_in.weight_error_scale, texel);
909 }
910
911 vfloat scale = weight_error_scale * contrib_weight;
912 vfloat old_weight = gatherf(infilled_weights, texel);
913 vfloat ideal_weight = gatherf(eai_in.weights, texel);
914
915 error_change0 += contrib_weight * scale;
916 error_change1 += (old_weight - ideal_weight) * scale;
917 }
918
919 vfloat step = (error_change1 * chd_scale) / error_change0;
920 step = clamp(-stepsize, stepsize, step);
921
922 // Update the weight; note this can store negative values.
923 storea(weight_val + step, dec_weight_ideal_value + i);
924 }
925 }
926
927 /* See header for documentation. */
compute_quantized_weights_for_decimation(const decimation_info & di,float low_bound,float high_bound,const float * dec_weight_ideal_value,float * weight_set_out,uint8_t * quantized_weight_set,quant_method quant_level)928 void compute_quantized_weights_for_decimation(
929 const decimation_info& di,
930 float low_bound,
931 float high_bound,
932 const float* dec_weight_ideal_value,
933 float* weight_set_out,
934 uint8_t* quantized_weight_set,
935 quant_method quant_level
936 ) {
937 int weight_count = di.weight_count;
938 promise(weight_count > 0);
939 const quantization_and_transfer_table *qat = &(quant_and_xfer_tables[quant_level]);
940
941 // The available quant levels, stored with a minus 1 bias
942 static const float quant_levels_m1[12] {
943 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 7.0f, 9.0f, 11.0f, 15.0f, 19.0f, 23.0f, 31.0f
944 };
945
946 float quant_level_m1 = quant_levels_m1[quant_level];
947
948 // Quantize the weight set using both the specified low/high bounds and standard 0..1 bounds
949
950 // TODO: Oddity to investigate; triggered by test in issue #265.
951 if (high_bound < low_bound)
952 {
953 low_bound = 0.0f;
954 high_bound = 1.0f;
955 }
956
957 float rscale = high_bound - low_bound;
958 float scale = 1.0f / rscale;
959
960 float scaled_low_bound = low_bound * scale;
961 rscale *= 1.0f / 64.0f;
962
963 vfloat scalev(scale);
964 vfloat scaled_low_boundv(scaled_low_bound);
965 vfloat quant_level_m1v(quant_level_m1);
966 vfloat rscalev(rscale);
967 vfloat low_boundv(low_bound);
968
969 // This runs to the rounded-up SIMD size, which is safe as the loop tail is filled with known
970 // safe data in compute_ideal_weights_for_decimation and arrays are always 64 elements
971 for (int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH)
972 {
973 vfloat ix = loada(&dec_weight_ideal_value[i]) * scalev - scaled_low_boundv;
974 ix = clampzo(ix);
975
976 // Look up the two closest indexes and return the one that was closest
977 vfloat ix1 = ix * quant_level_m1v;
978
979 vint weightl = float_to_int(ix1);
980 vint weighth = weightl + vint(1);
981
982 vfloat ixl = gatherf(qat->unquantized_value_unsc, weightl);
983 vfloat ixh = gatherf(qat->unquantized_value_unsc, weighth);
984
985 vmask mask = (ixl + ixh) < (vfloat(128.0f) * ix);
986 vint weight = select(weightl, weighth, mask);
987 ixl = select(ixl, ixh, mask);
988
989 // Invert the weight-scaling that was done initially
990 storea(ixl * rscalev + low_boundv, &weight_set_out[i]);
991 vint scm = gatheri(qat->scramble_map, weight);
992 vint scn = pack_low_bytes(scm);
993 store_nbytes(scn, &quantized_weight_set[i]);
994 }
995 }
996
997 /**
998 * @brief Compute the RGB + offset for a HDR endpoint mode #7.
999 *
1000 * Since the matrix needed has a regular structure we can simplify the inverse calculation. This
1001 * gives us ~24 multiplications vs. 96 for a generic inverse.
1002 *
1003 * mat[0] = vfloat4(rgba_ws.x, 0.0f, 0.0f, wght_ws.x);
1004 * mat[1] = vfloat4( 0.0f, rgba_ws.y, 0.0f, wght_ws.y);
1005 * mat[2] = vfloat4( 0.0f, 0.0f, rgba_ws.z, wght_ws.z);
1006 * mat[3] = vfloat4(wght_ws.x, wght_ws.y, wght_ws.z, psum);
1007 * mat = invert(mat);
1008 *
1009 * @param rgba_weight_sum Sum of partition component error weights.
1010 * @param weight_weight_sum Sum of partition component error weights * texel weight.
1011 * @param rgbq_sum Sum of partition component error weights * texel weight * color data.
1012 * @param psum Sum of RGB color weights * texel weight^2.
1013 */
compute_rgbo_vector(vfloat4 rgba_weight_sum,vfloat4 weight_weight_sum,vfloat4 rgbq_sum,float psum)1014 static inline vfloat4 compute_rgbo_vector(
1015 vfloat4 rgba_weight_sum,
1016 vfloat4 weight_weight_sum,
1017 vfloat4 rgbq_sum,
1018 float psum
1019 ) {
1020 float X = rgba_weight_sum.lane<0>();
1021 float Y = rgba_weight_sum.lane<1>();
1022 float Z = rgba_weight_sum.lane<2>();
1023 float P = weight_weight_sum.lane<0>();
1024 float Q = weight_weight_sum.lane<1>();
1025 float R = weight_weight_sum.lane<2>();
1026 float S = psum;
1027
1028 float PP = P * P;
1029 float QQ = Q * Q;
1030 float RR = R * R;
1031
1032 float SZmRR = S * Z - RR;
1033 float DT = SZmRR * Y - Z * QQ;
1034 float YP = Y * P;
1035 float QX = Q * X;
1036 float YX = Y * X;
1037 float mZYP = -Z * YP;
1038 float mZQX = -Z * QX;
1039 float mRYX = -R * YX;
1040 float ZQP = Z * Q * P;
1041 float RYP = R * YP;
1042 float RQX = R * QX;
1043
1044 // Compute the reciprocal of matrix determinant
1045 float rdet = 1.0f / (DT * X + mZYP * P);
1046
1047 // Actually compute the adjugate, and then apply 1/det separately
1048 vfloat4 mat0(DT, ZQP, RYP, mZYP);
1049 vfloat4 mat1(ZQP, SZmRR * X - Z * PP, RQX, mZQX);
1050 vfloat4 mat2(RYP, RQX, (S * Y - QQ) * X - Y * PP, mRYX);
1051 vfloat4 mat3(mZYP, mZQX, mRYX, Z * YX);
1052 vfloat4 vect = rgbq_sum * rdet;
1053
1054 return vfloat4(dot_s(mat0, vect),
1055 dot_s(mat1, vect),
1056 dot_s(mat2, vect),
1057 dot_s(mat3, vect));
1058 }
1059
1060 /* See header for documentation. */
recompute_ideal_colors_1plane(const image_block & blk,const partition_info & pi,const decimation_info & di,int weight_quant_mode,const uint8_t * dec_weights_quant_pvalue,endpoints & ep,vfloat4 rgbs_vectors[BLOCK_MAX_PARTITIONS],vfloat4 rgbo_vectors[BLOCK_MAX_PARTITIONS])1061 void recompute_ideal_colors_1plane(
1062 const image_block& blk,
1063 const partition_info& pi,
1064 const decimation_info& di,
1065 int weight_quant_mode,
1066 const uint8_t* dec_weights_quant_pvalue,
1067 endpoints& ep,
1068 vfloat4 rgbs_vectors[BLOCK_MAX_PARTITIONS],
1069 vfloat4 rgbo_vectors[BLOCK_MAX_PARTITIONS]
1070 ) {
1071 unsigned int weight_count = di.weight_count;
1072 unsigned int total_texel_count = blk.texel_count;
1073 unsigned int partition_count = pi.partition_count;
1074
1075 promise(weight_count > 0);
1076 promise(total_texel_count > 0);
1077 promise(partition_count > 0);
1078
1079 const quantization_and_transfer_table& qat = quant_and_xfer_tables[weight_quant_mode];
1080
1081 float dec_weight[BLOCK_MAX_WEIGHTS];
1082 for (unsigned int i = 0; i < weight_count; i++)
1083 {
1084 dec_weight[i] = qat.unquantized_value[dec_weights_quant_pvalue[i]] * (1.0f / 64.0f);
1085 }
1086
1087 alignas(ASTCENC_VECALIGN) float undec_weight[BLOCK_MAX_TEXELS];
1088 float* undec_weight_ref;
1089 if (di.max_texel_weight_count == 1)
1090 {
1091 undec_weight_ref = dec_weight;
1092 }
1093 else if (di.max_texel_weight_count <= 2)
1094 {
1095 for (unsigned int i = 0; i < total_texel_count; i += ASTCENC_SIMD_WIDTH)
1096 {
1097 vfloat weight = bilinear_infill_vla_2(di, dec_weight, i);
1098 storea(weight, undec_weight + i);
1099 }
1100
1101 undec_weight_ref = undec_weight;
1102 }
1103 else
1104 {
1105 for (unsigned int i = 0; i < total_texel_count; i += ASTCENC_SIMD_WIDTH)
1106 {
1107 vfloat weight = bilinear_infill_vla(di, dec_weight, i);
1108 storea(weight, undec_weight + i);
1109 }
1110
1111 undec_weight_ref = undec_weight;
1112 }
1113
1114 vfloat4 rgba_sum(blk.data_mean * static_cast<float>(blk.texel_count));
1115
1116 for (unsigned int i = 0; i < partition_count; i++)
1117 {
1118 unsigned int texel_count = pi.partition_texel_count[i];
1119 const uint8_t *texel_indexes = pi.texels_of_partition[i];
1120
1121 // Only compute a partition mean if more than one partition
1122 if (partition_count > 1)
1123 {
1124 rgba_sum = vfloat4(1e-17f);
1125 promise(texel_count > 0);
1126 for (unsigned int j = 0; j < texel_count; j++)
1127 {
1128 unsigned int tix = texel_indexes[j];
1129 rgba_sum += blk.texel(tix);
1130 }
1131 }
1132
1133 rgba_sum = rgba_sum * blk.channel_weight;
1134 vfloat4 rgba_weight_sum = max(blk.channel_weight * static_cast<float>(texel_count), 1e-17f);
1135 vfloat4 scale_dir = normalize((rgba_sum / rgba_weight_sum).swz<0, 1, 2>());
1136
1137 float scale_max = 0.0f;
1138 float scale_min = 1e10f;
1139
1140 float wmin1 = 1.0f;
1141 float wmax1 = 0.0f;
1142
1143 float left_sum_s = 0.0f;
1144 float middle_sum_s = 0.0f;
1145 float right_sum_s = 0.0f;
1146
1147 vfloat4 color_vec_x = vfloat4::zero();
1148 vfloat4 color_vec_y = vfloat4::zero();
1149
1150 vfloat4 scale_vec = vfloat4::zero();
1151
1152 float weight_weight_sum_s = 1e-17f;
1153
1154 vfloat4 color_weight = blk.channel_weight;
1155 float ls_weight = hadd_rgb_s(color_weight);
1156
1157 for (unsigned int j = 0; j < texel_count; j++)
1158 {
1159 unsigned int tix = texel_indexes[j];
1160
1161 vfloat4 rgba = blk.texel(tix);
1162
1163 float idx0 = undec_weight_ref[tix];
1164
1165 float om_idx0 = 1.0f - idx0;
1166 wmin1 = astc::min(idx0, wmin1);
1167 wmax1 = astc::max(idx0, wmax1);
1168
1169 float scale = dot3_s(scale_dir, rgba);
1170 scale_min = astc::min(scale, scale_min);
1171 scale_max = astc::max(scale, scale_max);
1172
1173 left_sum_s += om_idx0 * om_idx0;
1174 middle_sum_s += om_idx0 * idx0;
1175 right_sum_s += idx0 * idx0;
1176 weight_weight_sum_s += idx0;
1177
1178 vfloat4 color_idx(idx0);
1179 vfloat4 cwprod = rgba;
1180 vfloat4 cwiprod = cwprod * color_idx;
1181
1182 color_vec_y += cwiprod;
1183 color_vec_x += cwprod - cwiprod;
1184
1185 scale_vec += vfloat2(om_idx0, idx0) * (scale * ls_weight);
1186 }
1187
1188 vfloat4 left_sum = vfloat4(left_sum_s) * color_weight;
1189 vfloat4 middle_sum = vfloat4(middle_sum_s) * color_weight;
1190 vfloat4 right_sum = vfloat4(right_sum_s) * color_weight;
1191 vfloat4 lmrs_sum = vfloat3(left_sum_s, middle_sum_s, right_sum_s) * ls_weight;
1192
1193 vfloat4 weight_weight_sum = vfloat4(weight_weight_sum_s) * color_weight;
1194 float psum = right_sum_s * hadd_rgb_s(color_weight);
1195
1196 color_vec_x = color_vec_x * color_weight;
1197 color_vec_y = color_vec_y * color_weight;
1198
1199 // Initialize the luminance and scale vectors with a reasonable default
1200 float scalediv = scale_min * (1.0f / astc::max(scale_max, 1e-10f));
1201 scalediv = astc::clamp1f(scalediv);
1202
1203 vfloat4 sds = scale_dir * scale_max;
1204
1205 rgbs_vectors[i] = vfloat4(sds.lane<0>(), sds.lane<1>(), sds.lane<2>(), scalediv);
1206
1207 if (wmin1 >= wmax1 * 0.999f)
1208 {
1209 // If all weights in the partition were equal, then just take average of all colors in
1210 // the partition and use that as both endpoint colors
1211 vfloat4 avg = (color_vec_x + color_vec_y) / rgba_weight_sum;
1212
1213 vmask4 notnan_mask = avg == avg;
1214 ep.endpt0[i] = select(ep.endpt0[i], avg, notnan_mask);
1215 ep.endpt1[i] = select(ep.endpt1[i], avg, notnan_mask);
1216
1217 rgbs_vectors[i] = vfloat4(sds.lane<0>(), sds.lane<1>(), sds.lane<2>(), 1.0f);
1218 }
1219 else
1220 {
1221 // Otherwise, complete the analytic calculation of ideal-endpoint-values for the given
1222 // set of texel weights and pixel colors
1223 vfloat4 color_det1 = (left_sum * right_sum) - (middle_sum * middle_sum);
1224 vfloat4 color_rdet1 = 1.0f / color_det1;
1225
1226 float ls_det1 = (lmrs_sum.lane<0>() * lmrs_sum.lane<2>()) - (lmrs_sum.lane<1>() * lmrs_sum.lane<1>());
1227 float ls_rdet1 = 1.0f / ls_det1;
1228
1229 vfloat4 color_mss1 = (left_sum * left_sum)
1230 + (2.0f * middle_sum * middle_sum)
1231 + (right_sum * right_sum);
1232
1233 float ls_mss1 = (lmrs_sum.lane<0>() * lmrs_sum.lane<0>())
1234 + (2.0f * lmrs_sum.lane<1>() * lmrs_sum.lane<1>())
1235 + (lmrs_sum.lane<2>() * lmrs_sum.lane<2>());
1236
1237 vfloat4 ep0 = (right_sum * color_vec_x - middle_sum * color_vec_y) * color_rdet1;
1238 vfloat4 ep1 = (left_sum * color_vec_y - middle_sum * color_vec_x) * color_rdet1;
1239
1240 vmask4 det_mask = abs(color_det1) > (color_mss1 * 1e-4f);
1241 vmask4 notnan_mask = (ep0 == ep0) & (ep1 == ep1);
1242 vmask4 full_mask = det_mask & notnan_mask;
1243
1244 ep.endpt0[i] = select(ep.endpt0[i], ep0, full_mask);
1245 ep.endpt1[i] = select(ep.endpt1[i], ep1, full_mask);
1246
1247 float scale_ep0 = (lmrs_sum.lane<2>() * scale_vec.lane<0>() - lmrs_sum.lane<1>() * scale_vec.lane<1>()) * ls_rdet1;
1248 float scale_ep1 = (lmrs_sum.lane<0>() * scale_vec.lane<1>() - lmrs_sum.lane<1>() * scale_vec.lane<0>()) * ls_rdet1;
1249
1250 if (fabsf(ls_det1) > (ls_mss1 * 1e-4f) && scale_ep0 == scale_ep0 && scale_ep1 == scale_ep1 && scale_ep0 < scale_ep1)
1251 {
1252 float scalediv2 = scale_ep0 * (1.0f / scale_ep1);
1253 vfloat4 sdsm = scale_dir * scale_ep1;
1254 rgbs_vectors[i] = vfloat4(sdsm.lane<0>(), sdsm.lane<1>(), sdsm.lane<2>(), scalediv2);
1255 }
1256 }
1257
1258 // Calculations specific to mode #7, the HDR RGB-scale mode
1259 vfloat4 rgbq_sum = color_vec_x + color_vec_y;
1260 rgbq_sum.set_lane<3>(hadd_rgb_s(color_vec_y));
1261
1262 vfloat4 rgbovec = compute_rgbo_vector(rgba_weight_sum, weight_weight_sum, rgbq_sum, psum);
1263 rgbo_vectors[i] = rgbovec;
1264
1265 // We can get a failure due to the use of a singular (non-invertible) matrix
1266 // If it failed, compute rgbo_vectors[] with a different method ...
1267 if (astc::isnan(dot_s(rgbovec, rgbovec)))
1268 {
1269 vfloat4 v0 = ep.endpt0[i];
1270 vfloat4 v1 = ep.endpt1[i];
1271
1272 float avgdif = hadd_rgb_s(v1 - v0) * (1.0f / 3.0f);
1273 avgdif = astc::max(avgdif, 0.0f);
1274
1275 vfloat4 avg = (v0 + v1) * 0.5f;
1276 vfloat4 ep0 = avg - vfloat4(avgdif) * 0.5f;
1277 rgbo_vectors[i] = vfloat4(ep0.lane<0>(), ep0.lane<1>(), ep0.lane<2>(), avgdif);
1278 }
1279 }
1280 }
1281
1282 /* See header for documentation. */
recompute_ideal_colors_2planes(const image_block & blk,const block_size_descriptor & bsd,const decimation_info & di,int weight_quant_mode,const uint8_t * dec_weights_quant_pvalue_plane1,const uint8_t * dec_weights_quant_pvalue_plane2,endpoints & ep,vfloat4 & rgbs_vector,vfloat4 & rgbo_vector,int plane2_component)1283 void recompute_ideal_colors_2planes(
1284 const image_block& blk,
1285 const block_size_descriptor& bsd,
1286 const decimation_info& di,
1287 int weight_quant_mode,
1288 const uint8_t* dec_weights_quant_pvalue_plane1,
1289 const uint8_t* dec_weights_quant_pvalue_plane2,
1290 endpoints& ep,
1291 vfloat4& rgbs_vector,
1292 vfloat4& rgbo_vector,
1293 int plane2_component
1294 ) {
1295 unsigned int weight_count = di.weight_count;
1296 unsigned int total_texel_count = blk.texel_count;
1297
1298 promise(total_texel_count > 0);
1299 promise(weight_count > 0);
1300
1301 const quantization_and_transfer_table *qat = &(quant_and_xfer_tables[weight_quant_mode]);
1302
1303 float dec_weight_plane1[BLOCK_MAX_WEIGHTS_2PLANE];
1304 float dec_weight_plane2[BLOCK_MAX_WEIGHTS_2PLANE];
1305
1306 assert(weight_count <= BLOCK_MAX_WEIGHTS_2PLANE);
1307 for (unsigned int i = 0; i < weight_count; i++)
1308 {
1309 dec_weight_plane1[i] = qat->unquantized_value[dec_weights_quant_pvalue_plane1[i]] * (1.0f / 64.0f);
1310 dec_weight_plane2[i] = qat->unquantized_value[dec_weights_quant_pvalue_plane2[i]] * (1.0f / 64.0f);
1311 }
1312
1313 alignas(ASTCENC_VECALIGN) float undec_weight_plane1[BLOCK_MAX_TEXELS];
1314 alignas(ASTCENC_VECALIGN) float undec_weight_plane2[BLOCK_MAX_TEXELS];
1315
1316 float* undec_weight_plane1_ref;
1317 float* undec_weight_plane2_ref;
1318
1319 if (di.max_texel_weight_count == 1)
1320 {
1321 undec_weight_plane1_ref = dec_weight_plane1;
1322 undec_weight_plane2_ref = dec_weight_plane2;
1323 }
1324 else if (di.max_texel_weight_count <= 2)
1325 {
1326 for (unsigned int i = 0; i < total_texel_count; i += ASTCENC_SIMD_WIDTH)
1327 {
1328 vfloat weight = bilinear_infill_vla_2(di, dec_weight_plane1, i);
1329 storea(weight, undec_weight_plane1 + i);
1330
1331 weight = bilinear_infill_vla_2(di, dec_weight_plane2, i);
1332 storea(weight, undec_weight_plane2 + i);
1333 }
1334
1335 undec_weight_plane1_ref = undec_weight_plane1;
1336 undec_weight_plane2_ref = undec_weight_plane2;
1337 }
1338 else
1339 {
1340 for (unsigned int i = 0; i < total_texel_count; i += ASTCENC_SIMD_WIDTH)
1341 {
1342 vfloat weight = bilinear_infill_vla(di, dec_weight_plane1, i);
1343 storea(weight, undec_weight_plane1 + i);
1344
1345 weight = bilinear_infill_vla(di, dec_weight_plane2, i);
1346 storea(weight, undec_weight_plane2 + i);
1347 }
1348
1349 undec_weight_plane1_ref = undec_weight_plane1;
1350 undec_weight_plane2_ref = undec_weight_plane2;
1351 }
1352
1353 unsigned int texel_count = bsd.texel_count;
1354 vfloat4 rgba_weight_sum = max(blk.channel_weight * static_cast<float>(texel_count), 1e-17f);
1355 vfloat4 scale_dir = normalize(blk.data_mean.swz<0, 1, 2>());
1356
1357 float scale_max = 0.0f;
1358 float scale_min = 1e10f;
1359
1360 float wmin1 = 1.0f;
1361 float wmax1 = 0.0f;
1362
1363 float wmin2 = 1.0f;
1364 float wmax2 = 0.0f;
1365
1366 float left1_sum_s = 0.0f;
1367 float middle1_sum_s = 0.0f;
1368 float right1_sum_s = 0.0f;
1369
1370 float left2_sum_s = 0.0f;
1371 float middle2_sum_s = 0.0f;
1372 float right2_sum_s = 0.0f;
1373
1374 vfloat4 color_vec_x = vfloat4::zero();
1375 vfloat4 color_vec_y = vfloat4::zero();
1376
1377 vfloat4 scale_vec = vfloat4::zero();
1378
1379 vfloat4 weight_weight_sum = vfloat4(1e-17f);
1380
1381 vmask4 p2_mask = vint4::lane_id() == vint4(plane2_component);
1382 vfloat4 color_weight = blk.channel_weight;
1383 float ls_weight = hadd_rgb_s(color_weight);
1384
1385 for (unsigned int j = 0; j < texel_count; j++)
1386 {
1387 vfloat4 rgba = blk.texel(j);
1388
1389 float idx0 = undec_weight_plane1_ref[j];
1390
1391 float om_idx0 = 1.0f - idx0;
1392 wmin1 = astc::min(idx0, wmin1);
1393 wmax1 = astc::max(idx0, wmax1);
1394
1395 float scale = dot3_s(scale_dir, rgba);
1396 scale_min = astc::min(scale, scale_min);
1397 scale_max = astc::max(scale, scale_max);
1398
1399 left1_sum_s += om_idx0 * om_idx0;
1400 middle1_sum_s += om_idx0 * idx0;
1401 right1_sum_s += idx0 * idx0;
1402
1403 float idx1 = undec_weight_plane2_ref[j];
1404
1405 float om_idx1 = 1.0f - idx1;
1406 wmin2 = astc::min(idx1, wmin2);
1407 wmax2 = astc::max(idx1, wmax2);
1408
1409 left2_sum_s += om_idx1 * om_idx1;
1410 middle2_sum_s += om_idx1 * idx1;
1411 right2_sum_s += idx1 * idx1;
1412
1413 vfloat4 color_idx = select(vfloat4(idx0), vfloat4(idx1), p2_mask);
1414
1415 vfloat4 cwprod = rgba;
1416 vfloat4 cwiprod = cwprod * color_idx;
1417
1418 color_vec_y += cwiprod;
1419 color_vec_x += cwprod - cwiprod;
1420
1421 scale_vec += vfloat2(om_idx0, idx0) * (ls_weight * scale);
1422 weight_weight_sum += (color_weight * color_idx);
1423 }
1424
1425 vfloat4 left1_sum = vfloat4(left1_sum_s) * color_weight;
1426 vfloat4 middle1_sum = vfloat4(middle1_sum_s) * color_weight;
1427 vfloat4 right1_sum = vfloat4(right1_sum_s) * color_weight;
1428 vfloat4 lmrs_sum = vfloat3(left1_sum_s, middle1_sum_s, right1_sum_s) * ls_weight;
1429
1430 vfloat4 left2_sum = vfloat4(left2_sum_s) * color_weight;
1431 vfloat4 middle2_sum = vfloat4(middle2_sum_s) * color_weight;
1432 vfloat4 right2_sum = vfloat4(right2_sum_s) * color_weight;
1433
1434 float psum = dot3_s(select(right1_sum, right2_sum, p2_mask), color_weight);
1435
1436 color_vec_x = color_vec_x * color_weight;
1437 color_vec_y = color_vec_y * color_weight;
1438
1439 // Initialize the luminance and scale vectors with a reasonable default
1440 float scalediv = scale_min * (1.0f / astc::max(scale_max, 1e-10f));
1441 scalediv = astc::clamp1f(scalediv);
1442
1443 vfloat4 sds = scale_dir * scale_max;
1444
1445 rgbs_vector = vfloat4(sds.lane<0>(), sds.lane<1>(), sds.lane<2>(), scalediv);
1446
1447 if (wmin1 >= wmax1 * 0.999f)
1448 {
1449 // If all weights in the partition were equal, then just take average of all colors in
1450 // the partition and use that as both endpoint colors
1451 vfloat4 avg = (color_vec_x + color_vec_y) / rgba_weight_sum;
1452
1453 vmask4 p1_mask = vint4::lane_id() != vint4(plane2_component);
1454 vmask4 notnan_mask = avg == avg;
1455 vmask4 full_mask = p1_mask & notnan_mask;
1456
1457 ep.endpt0[0] = select(ep.endpt0[0], avg, full_mask);
1458 ep.endpt1[0] = select(ep.endpt1[0], avg, full_mask);
1459
1460 rgbs_vector = vfloat4(sds.lane<0>(), sds.lane<1>(), sds.lane<2>(), 1.0f);
1461 }
1462 else
1463 {
1464 // Otherwise, complete the analytic calculation of ideal-endpoint-values for the given
1465 // set of texel weights and pixel colors
1466 vfloat4 color_det1 = (left1_sum * right1_sum) - (middle1_sum * middle1_sum);
1467 vfloat4 color_rdet1 = 1.0f / color_det1;
1468
1469 float ls_det1 = (lmrs_sum.lane<0>() * lmrs_sum.lane<2>()) - (lmrs_sum.lane<1>() * lmrs_sum.lane<1>());
1470 float ls_rdet1 = 1.0f / ls_det1;
1471
1472 vfloat4 color_mss1 = (left1_sum * left1_sum)
1473 + (2.0f * middle1_sum * middle1_sum)
1474 + (right1_sum * right1_sum);
1475
1476 float ls_mss1 = (lmrs_sum.lane<0>() * lmrs_sum.lane<0>())
1477 + (2.0f * lmrs_sum.lane<1>() * lmrs_sum.lane<1>())
1478 + (lmrs_sum.lane<2>() * lmrs_sum.lane<2>());
1479
1480 vfloat4 ep0 = (right1_sum * color_vec_x - middle1_sum * color_vec_y) * color_rdet1;
1481 vfloat4 ep1 = (left1_sum * color_vec_y - middle1_sum * color_vec_x) * color_rdet1;
1482
1483 float scale_ep0 = (lmrs_sum.lane<2>() * scale_vec.lane<0>() - lmrs_sum.lane<1>() * scale_vec.lane<1>()) * ls_rdet1;
1484 float scale_ep1 = (lmrs_sum.lane<0>() * scale_vec.lane<1>() - lmrs_sum.lane<1>() * scale_vec.lane<0>()) * ls_rdet1;
1485
1486 vmask4 p1_mask = vint4::lane_id() != vint4(plane2_component);
1487 vmask4 det_mask = abs(color_det1) > (color_mss1 * 1e-4f);
1488 vmask4 notnan_mask = (ep0 == ep0) & (ep1 == ep1);
1489 vmask4 full_mask = p1_mask & det_mask & notnan_mask;
1490
1491 ep.endpt0[0] = select(ep.endpt0[0], ep0, full_mask);
1492 ep.endpt1[0] = select(ep.endpt1[0], ep1, full_mask);
1493
1494 if (fabsf(ls_det1) > (ls_mss1 * 1e-4f) && scale_ep0 == scale_ep0 && scale_ep1 == scale_ep1 && scale_ep0 < scale_ep1)
1495 {
1496 float scalediv2 = scale_ep0 * (1.0f / scale_ep1);
1497 vfloat4 sdsm = scale_dir * scale_ep1;
1498 rgbs_vector = vfloat4(sdsm.lane<0>(), sdsm.lane<1>(), sdsm.lane<2>(), scalediv2);
1499 }
1500 }
1501
1502 if (wmin2 >= wmax2 * 0.999f)
1503 {
1504 // If all weights in the partition were equal, then just take average of all colors in
1505 // the partition and use that as both endpoint colors
1506 vfloat4 avg = (color_vec_x + color_vec_y) / rgba_weight_sum;
1507
1508 vmask4 notnan_mask = avg == avg;
1509 vmask4 full_mask = p2_mask & notnan_mask;
1510
1511 ep.endpt0[0] = select(ep.endpt0[0], avg, full_mask);
1512 ep.endpt1[0] = select(ep.endpt1[0], avg, full_mask);
1513 }
1514 else
1515 {
1516 // Otherwise, complete the analytic calculation of ideal-endpoint-values for the given
1517 // set of texel weights and pixel colors
1518 vfloat4 color_det2 = (left2_sum * right2_sum) - (middle2_sum * middle2_sum);
1519 vfloat4 color_rdet2 = 1.0f / color_det2;
1520
1521 vfloat4 color_mss2 = (left2_sum * left2_sum)
1522 + (2.0f * middle2_sum * middle2_sum)
1523 + (right2_sum * right2_sum);
1524
1525 vfloat4 ep0 = (right2_sum * color_vec_x - middle2_sum * color_vec_y) * color_rdet2;
1526 vfloat4 ep1 = (left2_sum * color_vec_y - middle2_sum * color_vec_x) * color_rdet2;
1527
1528 vmask4 det_mask = abs(color_det2) > (color_mss2 * 1e-4f);
1529 vmask4 notnan_mask = (ep0 == ep0) & (ep1 == ep1);
1530 vmask4 full_mask = p2_mask & det_mask & notnan_mask;
1531
1532 ep.endpt0[0] = select(ep.endpt0[0], ep0, full_mask);
1533 ep.endpt1[0] = select(ep.endpt1[0], ep1, full_mask);
1534 }
1535
1536 // Calculations specific to mode #7, the HDR RGB-scale mode
1537 vfloat4 rgbq_sum = color_vec_x + color_vec_y;
1538 rgbq_sum.set_lane<3>(hadd_rgb_s(color_vec_y));
1539
1540 rgbo_vector = compute_rgbo_vector(rgba_weight_sum, weight_weight_sum, rgbq_sum, psum);
1541
1542 // We can get a failure due to the use of a singular (non-invertible) matrix
1543 // If it failed, compute rgbo_vectors[] with a different method ...
1544 if (astc::isnan(dot_s(rgbo_vector, rgbo_vector)))
1545 {
1546 vfloat4 v0 = ep.endpt0[0];
1547 vfloat4 v1 = ep.endpt1[0];
1548
1549 float avgdif = hadd_rgb_s(v1 - v0) * (1.0f / 3.0f);
1550 avgdif = astc::max(avgdif, 0.0f);
1551
1552 vfloat4 avg = (v0 + v1) * 0.5f;
1553 vfloat4 ep0 = avg - vfloat4(avgdif) * 0.5f;
1554
1555 rgbo_vector = vfloat4(ep0.lane<0>(), ep0.lane<1>(), ep0.lane<2>(), avgdif);
1556 }
1557 }
1558
1559 #endif
1560