1 // SPDX-License-Identifier: Apache-2.0
2 // ----------------------------------------------------------------------------
3 // Copyright 2011-2024 Arm Limited
4 //
5 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
6 // use this file except in compliance with the License. You may obtain a copy
7 // of the License at:
8 //
9 // http://www.apache.org/licenses/LICENSE-2.0
10 //
11 // Unless required by applicable law or agreed to in writing, software
12 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
13 // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
14 // License for the specific language governing permissions and limitations
15 // under the License.
16 // ----------------------------------------------------------------------------
17
18 #if !defined(ASTCENC_DECOMPRESS_ONLY)
19
20 /**
21 * @brief Functions for computing color endpoints and texel weights.
22 */
23
24 #include <cassert>
25
26 #include "astcenc_internal.h"
27 #include "astcenc_vecmathlib.h"
28
29 /**
30 * @brief Compute the infilled weight for N texel indices in a decimated grid.
31 *
32 * @param di The weight grid decimation to use.
33 * @param weights The decimated weight values to use.
34 * @param index The first texel index to interpolate.
35 *
36 * @return The interpolated weight for the given set of SIMD_WIDTH texels.
37 */
bilinear_infill_vla(const decimation_info & di,const float * weights,unsigned int index)38 static vfloat bilinear_infill_vla(
39 const decimation_info& di,
40 const float* weights,
41 unsigned int index
42 ) {
43 // Load the bilinear filter texel weight indexes in the decimated grid
44 vint weight_idx0 = vint(di.texel_weights_tr[0] + index);
45 vint weight_idx1 = vint(di.texel_weights_tr[1] + index);
46 vint weight_idx2 = vint(di.texel_weights_tr[2] + index);
47 vint weight_idx3 = vint(di.texel_weights_tr[3] + index);
48
49 // Load the bilinear filter weights from the decimated grid
50 vfloat weight_val0 = gatherf(weights, weight_idx0);
51 vfloat weight_val1 = gatherf(weights, weight_idx1);
52 vfloat weight_val2 = gatherf(weights, weight_idx2);
53 vfloat weight_val3 = gatherf(weights, weight_idx3);
54
55 // Load the weight contribution factors for each decimated weight
56 vfloat tex_weight_float0 = loada(di.texel_weight_contribs_float_tr[0] + index);
57 vfloat tex_weight_float1 = loada(di.texel_weight_contribs_float_tr[1] + index);
58 vfloat tex_weight_float2 = loada(di.texel_weight_contribs_float_tr[2] + index);
59 vfloat tex_weight_float3 = loada(di.texel_weight_contribs_float_tr[3] + index);
60
61 // Compute the bilinear interpolation to generate the per-texel weight
62 return (weight_val0 * tex_weight_float0 + weight_val1 * tex_weight_float1) +
63 (weight_val2 * tex_weight_float2 + weight_val3 * tex_weight_float3);
64 }
65
66 /**
67 * @brief Compute the infilled weight for N texel indices in a decimated grid.
68 *
69 * This is specialized version which computes only two weights per texel for
70 * encodings that are only decimated in a single axis.
71 *
72 * @param di The weight grid decimation to use.
73 * @param weights The decimated weight values to use.
74 * @param index The first texel index to interpolate.
75 *
76 * @return The interpolated weight for the given set of SIMD_WIDTH texels.
77 */
bilinear_infill_vla_2(const decimation_info & di,const float * weights,unsigned int index)78 static vfloat bilinear_infill_vla_2(
79 const decimation_info& di,
80 const float* weights,
81 unsigned int index
82 ) {
83 // Load the bilinear filter texel weight indexes in the decimated grid
84 vint weight_idx0 = vint(di.texel_weights_tr[0] + index);
85 vint weight_idx1 = vint(di.texel_weights_tr[1] + index);
86
87 // Load the bilinear filter weights from the decimated grid
88 vfloat weight_val0 = gatherf(weights, weight_idx0);
89 vfloat weight_val1 = gatherf(weights, weight_idx1);
90
91 // Load the weight contribution factors for each decimated weight
92 vfloat tex_weight_float0 = loada(di.texel_weight_contribs_float_tr[0] + index);
93 vfloat tex_weight_float1 = loada(di.texel_weight_contribs_float_tr[1] + index);
94
95 // Compute the bilinear interpolation to generate the per-texel weight
96 return (weight_val0 * tex_weight_float0 + weight_val1 * tex_weight_float1);
97 }
98
99 /**
100 * @brief Compute the ideal endpoints and weights for 1 color component.
101 *
102 * @param blk The image block color data to compress.
103 * @param pi The partition info for the current trial.
104 * @param[out] ei The computed ideal endpoints and weights.
105 * @param component The color component to compute.
106 */
compute_ideal_colors_and_weights_1_comp(const image_block & blk,const partition_info & pi,endpoints_and_weights & ei,unsigned int component)107 static void compute_ideal_colors_and_weights_1_comp(
108 const image_block& blk,
109 const partition_info& pi,
110 endpoints_and_weights& ei,
111 unsigned int component
112 ) {
113 unsigned int partition_count = pi.partition_count;
114 ei.ep.partition_count = partition_count;
115 promise(partition_count > 0);
116
117 unsigned int texel_count = blk.texel_count;
118 promise(texel_count > 0);
119
120 float error_weight;
121 const float* data_vr = nullptr;
122
123 assert(component < BLOCK_MAX_COMPONENTS);
124 switch (component)
125 {
126 case 0:
127 error_weight = blk.channel_weight.lane<0>();
128 data_vr = blk.data_r;
129 break;
130 case 1:
131 error_weight = blk.channel_weight.lane<1>();
132 data_vr = blk.data_g;
133 break;
134 case 2:
135 error_weight = blk.channel_weight.lane<2>();
136 data_vr = blk.data_b;
137 break;
138 default:
139 assert(component == 3);
140 error_weight = blk.channel_weight.lane<3>();
141 data_vr = blk.data_a;
142 break;
143 }
144
145 vmask4 sep_mask = vint4::lane_id() == vint4(component);
146 bool is_constant_wes { true };
147 float partition0_len_sq { 0.0f };
148
149 for (unsigned int i = 0; i < partition_count; i++)
150 {
151 float lowvalue { 1e10f };
152 float highvalue { -1e10f };
153
154 unsigned int partition_texel_count = pi.partition_texel_count[i];
155 for (unsigned int j = 0; j < partition_texel_count; j++)
156 {
157 unsigned int tix = pi.texels_of_partition[i][j];
158 float value = data_vr[tix];
159 lowvalue = astc::min(value, lowvalue);
160 highvalue = astc::max(value, highvalue);
161 }
162
163 if (highvalue <= lowvalue)
164 {
165 lowvalue = 0.0f;
166 highvalue = 1e-7f;
167 }
168
169 float length = highvalue - lowvalue;
170 float length_squared = length * length;
171 float scale = 1.0f / length;
172
173 if (i == 0)
174 {
175 partition0_len_sq = length_squared;
176 }
177 else
178 {
179 is_constant_wes = is_constant_wes && length_squared == partition0_len_sq;
180 }
181
182 for (unsigned int j = 0; j < partition_texel_count; j++)
183 {
184 unsigned int tix = pi.texels_of_partition[i][j];
185 float value = (data_vr[tix] - lowvalue) * scale;
186 value = astc::clamp1f(value);
187
188 ei.weights[tix] = value;
189 ei.weight_error_scale[tix] = length_squared * error_weight;
190 assert(!astc::isnan(ei.weight_error_scale[tix]));
191 }
192
193 ei.ep.endpt0[i] = select(blk.data_min, vfloat4(lowvalue), sep_mask);
194 ei.ep.endpt1[i] = select(blk.data_max, vfloat4(highvalue), sep_mask);
195 }
196
197 // Zero initialize any SIMD over-fetch
198 unsigned int texel_count_simd = round_up_to_simd_multiple_vla(texel_count);
199 for (unsigned int i = texel_count; i < texel_count_simd; i++)
200 {
201 ei.weights[i] = 0.0f;
202 ei.weight_error_scale[i] = 0.0f;
203 }
204
205 ei.is_constant_weight_error_scale = is_constant_wes;
206 }
207
208 /**
209 * @brief Compute the ideal endpoints and weights for 2 color components.
210 *
211 * @param blk The image block color data to compress.
212 * @param pi The partition info for the current trial.
213 * @param[out] ei The computed ideal endpoints and weights.
214 * @param component1 The first color component to compute.
215 * @param component2 The second color component to compute.
216 */
compute_ideal_colors_and_weights_2_comp(const image_block & blk,const partition_info & pi,endpoints_and_weights & ei,int component1,int component2)217 static void compute_ideal_colors_and_weights_2_comp(
218 const image_block& blk,
219 const partition_info& pi,
220 endpoints_and_weights& ei,
221 int component1,
222 int component2
223 ) {
224 unsigned int partition_count = pi.partition_count;
225 ei.ep.partition_count = partition_count;
226 promise(partition_count > 0);
227
228 unsigned int texel_count = blk.texel_count;
229 promise(texel_count > 0);
230
231 partition_metrics pms[BLOCK_MAX_PARTITIONS];
232
233 float error_weight;
234 const float* data_vr = nullptr;
235 const float* data_vg = nullptr;
236
237 if (component1 == 0 && component2 == 1)
238 {
239 error_weight = hadd_s(blk.channel_weight.swz<0, 1>()) / 2.0f;
240
241 data_vr = blk.data_r;
242 data_vg = blk.data_g;
243 }
244 else if (component1 == 0 && component2 == 2)
245 {
246 error_weight = hadd_s(blk.channel_weight.swz<0, 2>()) / 2.0f;
247
248 data_vr = blk.data_r;
249 data_vg = blk.data_b;
250 }
251 else // (component1 == 1 && component2 == 2)
252 {
253 assert(component1 == 1 && component2 == 2);
254
255 error_weight = hadd_s(blk.channel_weight.swz<1, 2>()) / 2.0f;
256
257 data_vr = blk.data_g;
258 data_vg = blk.data_b;
259 }
260
261 compute_avgs_and_dirs_2_comp(pi, blk, component1, component2, pms);
262
263 bool is_constant_wes { true };
264 float partition0_len_sq { 0.0f };
265
266 vmask4 comp1_mask = vint4::lane_id() == vint4(component1);
267 vmask4 comp2_mask = vint4::lane_id() == vint4(component2);
268
269 for (unsigned int i = 0; i < partition_count; i++)
270 {
271 vfloat4 dir = pms[i].dir;
272 if (hadd_s(dir) < 0.0f)
273 {
274 dir = vfloat4::zero() - dir;
275 }
276
277 line2 line { pms[i].avg, normalize_safe(dir, unit2()) };
278 float lowparam { 1e10f };
279 float highparam { -1e10f };
280
281 unsigned int partition_texel_count = pi.partition_texel_count[i];
282 for (unsigned int j = 0; j < partition_texel_count; j++)
283 {
284 unsigned int tix = pi.texels_of_partition[i][j];
285 vfloat4 point = vfloat2(data_vr[tix], data_vg[tix]);
286 float param = dot_s(point - line.a, line.b);
287 ei.weights[tix] = param;
288
289 lowparam = astc::min(param, lowparam);
290 highparam = astc::max(param, highparam);
291 }
292
293 // It is possible for a uniform-color partition to produce length=0;
294 // this causes NaN issues so set to small value to avoid this problem
295 if (highparam <= lowparam)
296 {
297 lowparam = 0.0f;
298 highparam = 1e-7f;
299 }
300
301 float length = highparam - lowparam;
302 float length_squared = length * length;
303 float scale = 1.0f / length;
304
305 if (i == 0)
306 {
307 partition0_len_sq = length_squared;
308 }
309 else
310 {
311 is_constant_wes = is_constant_wes && length_squared == partition0_len_sq;
312 }
313
314 for (unsigned int j = 0; j < partition_texel_count; j++)
315 {
316 unsigned int tix = pi.texels_of_partition[i][j];
317 float idx = (ei.weights[tix] - lowparam) * scale;
318 idx = astc::clamp1f(idx);
319
320 ei.weights[tix] = idx;
321 ei.weight_error_scale[tix] = length_squared * error_weight;
322 assert(!astc::isnan(ei.weight_error_scale[tix]));
323 }
324
325 vfloat4 lowvalue = line.a + line.b * lowparam;
326 vfloat4 highvalue = line.a + line.b * highparam;
327
328 vfloat4 ep0 = select(blk.data_min, vfloat4(lowvalue.lane<0>()), comp1_mask);
329 vfloat4 ep1 = select(blk.data_max, vfloat4(highvalue.lane<0>()), comp1_mask);
330
331 ei.ep.endpt0[i] = select(ep0, vfloat4(lowvalue.lane<1>()), comp2_mask);
332 ei.ep.endpt1[i] = select(ep1, vfloat4(highvalue.lane<1>()), comp2_mask);
333 }
334
335 // Zero initialize any SIMD over-fetch
336 unsigned int texel_count_simd = round_up_to_simd_multiple_vla(texel_count);
337 for (unsigned int i = texel_count; i < texel_count_simd; i++)
338 {
339 ei.weights[i] = 0.0f;
340 ei.weight_error_scale[i] = 0.0f;
341 }
342
343 ei.is_constant_weight_error_scale = is_constant_wes;
344 }
345
346 /**
347 * @brief Compute the ideal endpoints and weights for 3 color components.
348 *
349 * @param blk The image block color data to compress.
350 * @param pi The partition info for the current trial.
351 * @param[out] ei The computed ideal endpoints and weights.
352 * @param omitted_component The color component excluded from the calculation.
353 */
compute_ideal_colors_and_weights_3_comp(const image_block & blk,const partition_info & pi,endpoints_and_weights & ei,unsigned int omitted_component)354 static void compute_ideal_colors_and_weights_3_comp(
355 const image_block& blk,
356 const partition_info& pi,
357 endpoints_and_weights& ei,
358 unsigned int omitted_component
359 ) {
360 unsigned int partition_count = pi.partition_count;
361 ei.ep.partition_count = partition_count;
362 promise(partition_count > 0);
363
364 unsigned int texel_count = blk.texel_count;
365 promise(texel_count > 0);
366
367 partition_metrics *pms = reinterpret_cast<partition_metrics *>(&blk.pms[0]);
368
369 float error_weight;
370 const float* data_vr = nullptr;
371 const float* data_vg = nullptr;
372 const float* data_vb = nullptr;
373 if (omitted_component == 0)
374 {
375 error_weight = hadd_s(blk.channel_weight.swz<1, 2, 3>());
376 data_vr = blk.data_g;
377 data_vg = blk.data_b;
378 data_vb = blk.data_a;
379 }
380 else if (omitted_component == 1)
381 {
382 error_weight = hadd_s(blk.channel_weight.swz<0, 2, 3>());
383 data_vr = blk.data_r;
384 data_vg = blk.data_b;
385 data_vb = blk.data_a;
386 }
387 else if (omitted_component == 2)
388 {
389 error_weight = hadd_s(blk.channel_weight.swz<0, 1, 3>());
390 data_vr = blk.data_r;
391 data_vg = blk.data_g;
392 data_vb = blk.data_a;
393 }
394 else
395 {
396 assert(omitted_component == 3);
397
398 error_weight = hadd_s(blk.channel_weight.swz<0, 1, 2>());
399 data_vr = blk.data_r;
400 data_vg = blk.data_g;
401 data_vb = blk.data_b;
402 }
403
404 error_weight = error_weight * (1.0f / 3.0f);
405
406 if (omitted_component == 3)
407 {
408 compute_avgs_and_dirs_3_comp_rgb(pi, blk, pms);
409 }
410 else
411 {
412 compute_avgs_and_dirs_3_comp(pi, blk, omitted_component, pms);
413 }
414
415 bool is_constant_wes { true };
416 float partition0_len_sq { 0.0f };
417
418 for (unsigned int i = 0; i < partition_count; i++)
419 {
420 vfloat4 dir = pms[i].dir;
421 if (hadd_rgb_s(dir) < 0.0f)
422 {
423 dir = vfloat4::zero() - dir;
424 }
425
426 line3 line { pms[i].avg, normalize_safe(dir, unit3()) };
427 float lowparam { 1e10f };
428 float highparam { -1e10f };
429
430 unsigned int partition_texel_count = pi.partition_texel_count[i];
431
432 vfloat4 lowparam_vec = vfloat4(1e10f, 1e10f, 1e10f, 1e10f);
433 vfloat4 highparam_vec = vfloat4(-1e10f, -1e10f, -1e10f, -1e10f);
434
435 unsigned int j = 0;
436 for (; j + ASTCENC_SIMD_WIDTH <= partition_texel_count; j += ASTCENC_SIMD_WIDTH)
437 {
438 unsigned int tix0 = pi.texels_of_partition[i][j];
439 unsigned int tix1 = pi.texels_of_partition[i][j + 1];
440 unsigned int tix2 = pi.texels_of_partition[i][j + 2];
441 unsigned int tix3 = pi.texels_of_partition[i][j + 3];
442
443 vfloat4 points0 = vfloat4(data_vr[tix0], data_vg[tix0], data_vb[tix0], 0.0f);
444 vfloat4 points1 = vfloat4(data_vr[tix1], data_vg[tix1], data_vb[tix1], 0.0f);
445 vfloat4 points2 = vfloat4(data_vr[tix2], data_vg[tix2], data_vb[tix2], 0.0f);
446 vfloat4 points3 = vfloat4(data_vr[tix3], data_vg[tix3], data_vb[tix3], 0.0f);
447
448 vfloat4 sub_v0 = points0 - line.a;
449 vfloat4 sub_v1 = points1 - line.a;
450 vfloat4 sub_v2 = points2 - line.a;
451 vfloat4 sub_v3 = points3 - line.a;
452
453 vfloat4 params0 = sub_v0 * line.b;
454 vfloat4 params1 = sub_v1 * line.b;
455 vfloat4 params2 = sub_v2 * line.b;
456 vfloat4 params3 = sub_v3 * line.b;
457
458 float param0 = hadd_rgba_s(params0);
459 float param1 = hadd_rgba_s(params1);
460 float param2 = hadd_rgba_s(params2);
461 float param3 = hadd_rgba_s(params3);
462
463 ei.weights[tix0] = param0;
464 ei.weights[tix1] = param1;
465 ei.weights[tix2] = param2;
466 ei.weights[tix3] = param3;
467
468 vfloat4 params_vec = vfloat4(param0, param1, param2, param3);
469 lowparam_vec = min(params_vec, lowparam_vec);
470 highparam_vec = max(params_vec, highparam_vec);
471 }
472
473 lowparam = hmin_s(vfloat4(lowparam_vec));
474 highparam = hmax_s(vfloat4(highparam_vec));
475
476 for (; j < partition_texel_count; j++)
477 {
478 unsigned int tix = pi.texels_of_partition[i][j];
479 vfloat4 point = vfloat3(data_vr[tix], data_vg[tix], data_vb[tix]);
480 float param = dot3_s(point - line.a, line.b);
481 ei.weights[tix] = param;
482
483 lowparam = astc::min(param, lowparam);
484 highparam = astc::max(param, highparam);
485 }
486
487 // It is possible for a uniform-color partition to produce length=0;
488 // this causes NaN issues so set to small value to avoid this problem
489 if (highparam <= lowparam)
490 {
491 lowparam = 0.0f;
492 highparam = 1e-7f;
493 }
494
495 float length = highparam - lowparam;
496 float length_squared = length * length;
497 float scale = 1.0f / length;
498
499 if (i == 0)
500 {
501 partition0_len_sq = length_squared;
502 }
503 else
504 {
505 is_constant_wes = is_constant_wes && length_squared == partition0_len_sq;
506 }
507
508 for (j = 0; j < partition_texel_count; j++)
509 {
510 unsigned int tix = pi.texels_of_partition[i][j];
511 float idx = (ei.weights[tix] - lowparam) * scale;
512 idx = astc::clamp1f(idx);
513
514 ei.weights[tix] = idx;
515 ei.weight_error_scale[tix] = length_squared * error_weight;
516 assert(!astc::isnan(ei.weight_error_scale[tix]));
517 }
518
519 vfloat4 ep0 = line.a + line.b * lowparam;
520 vfloat4 ep1 = line.a + line.b * highparam;
521
522 vfloat4 bmin = blk.data_min;
523 vfloat4 bmax = blk.data_max;
524
525 assert(omitted_component < BLOCK_MAX_COMPONENTS);
526 switch (omitted_component)
527 {
528 case 0:
529 ei.ep.endpt0[i] = vfloat4(bmin.lane<0>(), ep0.lane<0>(), ep0.lane<1>(), ep0.lane<2>());
530 ei.ep.endpt1[i] = vfloat4(bmax.lane<0>(), ep1.lane<0>(), ep1.lane<1>(), ep1.lane<2>());
531 break;
532 case 1:
533 ei.ep.endpt0[i] = vfloat4(ep0.lane<0>(), bmin.lane<1>(), ep0.lane<1>(), ep0.lane<2>());
534 ei.ep.endpt1[i] = vfloat4(ep1.lane<0>(), bmax.lane<1>(), ep1.lane<1>(), ep1.lane<2>());
535 break;
536 case 2:
537 ei.ep.endpt0[i] = vfloat4(ep0.lane<0>(), ep0.lane<1>(), bmin.lane<2>(), ep0.lane<2>());
538 ei.ep.endpt1[i] = vfloat4(ep1.lane<0>(), ep1.lane<1>(), bmax.lane<2>(), ep1.lane<2>());
539 break;
540 default:
541 ei.ep.endpt0[i] = vfloat4(ep0.lane<0>(), ep0.lane<1>(), ep0.lane<2>(), bmin.lane<3>());
542 ei.ep.endpt1[i] = vfloat4(ep1.lane<0>(), ep1.lane<1>(), ep1.lane<2>(), bmax.lane<3>());
543 break;
544 }
545 }
546
547 // Zero initialize any SIMD over-fetch
548 unsigned int texel_count_simd = round_up_to_simd_multiple_vla(texel_count);
549 for (unsigned int i = texel_count; i < texel_count_simd; i++)
550 {
551 ei.weights[i] = 0.0f;
552 ei.weight_error_scale[i] = 0.0f;
553 }
554
555 ei.is_constant_weight_error_scale = is_constant_wes;
556 }
557
558 /**
559 * @brief Compute the ideal endpoints and weights for 4 color components.
560 *
561 * @param blk The image block color data to compress.
562 * @param pi The partition info for the current trial.
563 * @param[out] ei The computed ideal endpoints and weights.
564 */
compute_ideal_colors_and_weights_4_comp(const image_block & blk,const partition_info & pi,endpoints_and_weights & ei)565 static void compute_ideal_colors_and_weights_4_comp(
566 const image_block& blk,
567 const partition_info& pi,
568 endpoints_and_weights& ei
569 ) {
570 const float error_weight = hadd_s(blk.channel_weight) / 4.0f;
571
572 unsigned int partition_count = pi.partition_count;
573
574 unsigned int texel_count = blk.texel_count;
575 promise(texel_count > 0);
576 promise(partition_count > 0);
577
578 partition_metrics pms[BLOCK_MAX_PARTITIONS];
579
580 compute_avgs_and_dirs_4_comp(pi, blk, pms);
581
582 bool is_constant_wes { true };
583 float partition0_len_sq { 0.0f };
584
585 for (unsigned int i = 0; i < partition_count; i++)
586 {
587 vfloat4 dir = pms[i].dir;
588 if (hadd_rgb_s(dir) < 0.0f)
589 {
590 dir = vfloat4::zero() - dir;
591 }
592
593 line4 line { pms[i].avg, normalize_safe(dir, unit4()) };
594 float lowparam { 1e10f };
595 float highparam { -1e10f };
596
597 unsigned int partition_texel_count = pi.partition_texel_count[i];
598 for (unsigned int j = 0; j < partition_texel_count; j++)
599 {
600 unsigned int tix = pi.texels_of_partition[i][j];
601 vfloat4 point = blk.texel(tix);
602 float param = dot_s(point - line.a, line.b);
603 ei.weights[tix] = param;
604
605 lowparam = astc::min(param, lowparam);
606 highparam = astc::max(param, highparam);
607 }
608
609 // It is possible for a uniform-color partition to produce length=0;
610 // this causes NaN issues so set to small value to avoid this problem
611 if (highparam <= lowparam)
612 {
613 lowparam = 0.0f;
614 highparam = 1e-7f;
615 }
616
617 float length = highparam - lowparam;
618 float length_squared = length * length;
619 float scale = 1.0f / length;
620
621 if (i == 0)
622 {
623 partition0_len_sq = length_squared;
624 }
625 else
626 {
627 is_constant_wes = is_constant_wes && length_squared == partition0_len_sq;
628 }
629
630 ei.ep.endpt0[i] = line.a + line.b * lowparam;
631 ei.ep.endpt1[i] = line.a + line.b * highparam;
632
633 for (unsigned int j = 0; j < partition_texel_count; j++)
634 {
635 unsigned int tix = pi.texels_of_partition[i][j];
636 float idx = (ei.weights[tix] - lowparam) * scale;
637 idx = astc::clamp1f(idx);
638
639 ei.weights[tix] = idx;
640 ei.weight_error_scale[tix] = length_squared * error_weight;
641 assert(!astc::isnan(ei.weight_error_scale[tix]));
642 }
643 }
644
645 // Zero initialize any SIMD over-fetch
646 unsigned int texel_count_simd = round_up_to_simd_multiple_vla(texel_count);
647 for (unsigned int i = texel_count; i < texel_count_simd; i++)
648 {
649 ei.weights[i] = 0.0f;
650 ei.weight_error_scale[i] = 0.0f;
651 }
652
653 ei.is_constant_weight_error_scale = is_constant_wes;
654 }
655
656 /* See header for documentation. */
compute_ideal_colors_and_weights_1plane(const image_block & blk,const partition_info & pi,endpoints_and_weights & ei)657 void compute_ideal_colors_and_weights_1plane(
658 const image_block& blk,
659 const partition_info& pi,
660 endpoints_and_weights& ei
661 ) {
662 bool uses_alpha = !blk.is_constant_channel(3);
663
664 if (uses_alpha)
665 {
666 compute_ideal_colors_and_weights_4_comp(blk, pi, ei);
667 }
668 else
669 {
670 compute_ideal_colors_and_weights_3_comp(blk, pi, ei, 3);
671 }
672 }
673
674 /* See header for documentation. */
compute_ideal_colors_and_weights_2planes(const block_size_descriptor & bsd,const image_block & blk,unsigned int plane2_component,endpoints_and_weights & ei1,endpoints_and_weights & ei2)675 void compute_ideal_colors_and_weights_2planes(
676 const block_size_descriptor& bsd,
677 const image_block& blk,
678 unsigned int plane2_component,
679 endpoints_and_weights& ei1,
680 endpoints_and_weights& ei2
681 ) {
682 const auto& pi = bsd.get_partition_info(1, 0);
683 bool uses_alpha = !blk.is_constant_channel(3);
684
685 assert(plane2_component < BLOCK_MAX_COMPONENTS);
686 switch (plane2_component)
687 {
688 case 0: // Separate weights for red
689 if (uses_alpha)
690 {
691 compute_ideal_colors_and_weights_3_comp(blk, pi, ei1, 0);
692 }
693 else
694 {
695 compute_ideal_colors_and_weights_2_comp(blk, pi, ei1, 1, 2);
696 }
697 compute_ideal_colors_and_weights_1_comp(blk, pi, ei2, 0);
698 break;
699
700 case 1: // Separate weights for green
701 if (uses_alpha)
702 {
703 compute_ideal_colors_and_weights_3_comp(blk, pi, ei1, 1);
704 }
705 else
706 {
707 compute_ideal_colors_and_weights_2_comp(blk, pi, ei1, 0, 2);
708 }
709 compute_ideal_colors_and_weights_1_comp(blk, pi, ei2, 1);
710 break;
711
712 case 2: // Separate weights for blue
713 if (uses_alpha)
714 {
715 compute_ideal_colors_and_weights_3_comp(blk, pi, ei1, 2);
716 }
717 else
718 {
719 compute_ideal_colors_and_weights_2_comp(blk, pi, ei1, 0, 1);
720 }
721 compute_ideal_colors_and_weights_1_comp(blk, pi, ei2, 2);
722 break;
723
724 default: // Separate weights for alpha
725 assert(uses_alpha);
726 compute_ideal_colors_and_weights_3_comp(blk, pi, ei1, 3);
727 compute_ideal_colors_and_weights_1_comp(blk, pi, ei2, 3);
728 break;
729 }
730 }
731
732 /* See header for documentation. */
compute_error_of_weight_set_1plane(const endpoints_and_weights & eai,const decimation_info & di,const float * dec_weight_quant_uvalue)733 float compute_error_of_weight_set_1plane(
734 const endpoints_and_weights& eai,
735 const decimation_info& di,
736 const float* dec_weight_quant_uvalue
737 ) {
738 vfloatacc error_summav = vfloatacc::zero();
739 unsigned int texel_count = di.texel_count;
740 promise(texel_count > 0);
741
742 // Process SIMD-width chunks, safe to over-fetch - the extra space is zero initialized
743 if (di.max_texel_weight_count > 2)
744 {
745 for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
746 {
747 // Compute the bilinear interpolation of the decimated weight grid
748 vfloat current_values = bilinear_infill_vla(di, dec_weight_quant_uvalue, i);
749
750 // Compute the error between the computed value and the ideal weight
751 vfloat actual_values = loada(eai.weights + i);
752 vfloat diff = current_values - actual_values;
753 vfloat significance = loada(eai.weight_error_scale + i);
754 vfloat error = diff * diff * significance;
755
756 haccumulate(error_summav, error);
757 }
758 }
759 else if (di.max_texel_weight_count > 1)
760 {
761 for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
762 {
763 // Compute the bilinear interpolation of the decimated weight grid
764 vfloat current_values = bilinear_infill_vla_2(di, dec_weight_quant_uvalue, i);
765
766 // Compute the error between the computed value and the ideal weight
767 vfloat actual_values = loada(eai.weights + i);
768 vfloat diff = current_values - actual_values;
769 vfloat significance = loada(eai.weight_error_scale + i);
770 vfloat error = diff * diff * significance;
771
772 haccumulate(error_summav, error);
773 }
774 }
775 else
776 {
777 for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
778 {
779 // Load the weight set directly, without interpolation
780 vfloat current_values = loada(dec_weight_quant_uvalue + i);
781
782 // Compute the error between the computed value and the ideal weight
783 vfloat actual_values = loada(eai.weights + i);
784 vfloat diff = current_values - actual_values;
785 vfloat significance = loada(eai.weight_error_scale + i);
786 vfloat error = diff * diff * significance;
787
788 haccumulate(error_summav, error);
789 }
790 }
791
792 // Resolve the final scalar accumulator sum
793 return hadd_s(error_summav);
794 }
795
796 /* See header for documentation. */
compute_error_of_weight_set_2planes(const endpoints_and_weights & eai1,const endpoints_and_weights & eai2,const decimation_info & di,const float * dec_weight_quant_uvalue_plane1,const float * dec_weight_quant_uvalue_plane2)797 float compute_error_of_weight_set_2planes(
798 const endpoints_and_weights& eai1,
799 const endpoints_and_weights& eai2,
800 const decimation_info& di,
801 const float* dec_weight_quant_uvalue_plane1,
802 const float* dec_weight_quant_uvalue_plane2
803 ) {
804 vfloatacc error_summav = vfloatacc::zero();
805 unsigned int texel_count = di.texel_count;
806 promise(texel_count > 0);
807
808 // Process SIMD-width chunks, safe to over-fetch - the extra space is zero initialized
809 if (di.max_texel_weight_count > 2)
810 {
811 for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
812 {
813 // Plane 1
814 // Compute the bilinear interpolation of the decimated weight grid
815 vfloat current_values1 = bilinear_infill_vla(di, dec_weight_quant_uvalue_plane1, i);
816
817 // Compute the error between the computed value and the ideal weight
818 vfloat actual_values1 = loada(eai1.weights + i);
819 vfloat diff = current_values1 - actual_values1;
820 vfloat error1 = diff * diff * loada(eai1.weight_error_scale + i);
821
822 // Plane 2
823 // Compute the bilinear interpolation of the decimated weight grid
824 vfloat current_values2 = bilinear_infill_vla(di, dec_weight_quant_uvalue_plane2, i);
825
826 // Compute the error between the computed value and the ideal weight
827 vfloat actual_values2 = loada(eai2.weights + i);
828 diff = current_values2 - actual_values2;
829 vfloat error2 = diff * diff * loada(eai2.weight_error_scale + i);
830
831 haccumulate(error_summav, error1 + error2);
832 }
833 }
834 else if (di.max_texel_weight_count > 1)
835 {
836 for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
837 {
838 // Plane 1
839 // Compute the bilinear interpolation of the decimated weight grid
840 vfloat current_values1 = bilinear_infill_vla_2(di, dec_weight_quant_uvalue_plane1, i);
841
842 // Compute the error between the computed value and the ideal weight
843 vfloat actual_values1 = loada(eai1.weights + i);
844 vfloat diff = current_values1 - actual_values1;
845 vfloat error1 = diff * diff * loada(eai1.weight_error_scale + i);
846
847 // Plane 2
848 // Compute the bilinear interpolation of the decimated weight grid
849 vfloat current_values2 = bilinear_infill_vla_2(di, dec_weight_quant_uvalue_plane2, i);
850
851 // Compute the error between the computed value and the ideal weight
852 vfloat actual_values2 = loada(eai2.weights + i);
853 diff = current_values2 - actual_values2;
854 vfloat error2 = diff * diff * loada(eai2.weight_error_scale + i);
855
856 haccumulate(error_summav, error1 + error2);
857 }
858 }
859 else
860 {
861 for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
862 {
863 // Plane 1
864 // Load the weight set directly, without interpolation
865 vfloat current_values1 = loada(dec_weight_quant_uvalue_plane1 + i);
866
867 // Compute the error between the computed value and the ideal weight
868 vfloat actual_values1 = loada(eai1.weights + i);
869 vfloat diff = current_values1 - actual_values1;
870 vfloat error1 = diff * diff * loada(eai1.weight_error_scale + i);
871
872 // Plane 2
873 // Load the weight set directly, without interpolation
874 vfloat current_values2 = loada(dec_weight_quant_uvalue_plane2 + i);
875
876 // Compute the error between the computed value and the ideal weight
877 vfloat actual_values2 = loada(eai2.weights + i);
878 diff = current_values2 - actual_values2;
879 vfloat error2 = diff * diff * loada(eai2.weight_error_scale + i);
880
881 haccumulate(error_summav, error1 + error2);
882 }
883 }
884
885 // Resolve the final scalar accumulator sum
886 return hadd_s(error_summav);
887 }
888
889 /* See header for documentation. */
compute_ideal_weights_for_decimation(const endpoints_and_weights & ei,const decimation_info & di,float * dec_weight_ideal_value)890 void compute_ideal_weights_for_decimation(
891 const endpoints_and_weights& ei,
892 const decimation_info& di,
893 float* dec_weight_ideal_value
894 ) {
895 unsigned int texel_count = di.texel_count;
896 unsigned int weight_count = di.weight_count;
897 bool is_direct = texel_count == weight_count;
898 promise(texel_count > 0);
899 promise(weight_count > 0);
900
901 // Ensure that the end of the output arrays that are used for SIMD paths later are filled so we
902 // can safely run SIMD elsewhere without a loop tail. Note that this is always safe as weight
903 // arrays always contain space for 64 elements
904 unsigned int prev_weight_count_simd = round_down_to_simd_multiple_vla(weight_count - 1);
905 storea(vfloat::zero(), dec_weight_ideal_value + prev_weight_count_simd);
906
907 // If we have a 1:1 mapping just shortcut the computation. Transfer enough to also copy the
908 // zero-initialized SIMD over-fetch region
909 if (is_direct)
910 {
911 for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
912 {
913 vfloat weight(ei.weights + i);
914 storea(weight, dec_weight_ideal_value + i);
915 }
916
917 return;
918 }
919
920 // Otherwise compute an estimate and perform single refinement iteration
921 ASTCENC_ALIGNAS float infilled_weights[BLOCK_MAX_TEXELS];
922
923 // Compute an initial average for each decimated weight
924 bool constant_wes = ei.is_constant_weight_error_scale;
925 vfloat weight_error_scale(ei.weight_error_scale[0]);
926
927 // This overshoots - this is OK as we initialize the array tails in the
928 // decimation table structures to safe values ...
929 for (unsigned int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH)
930 {
931 // Start with a small value to avoid div-by-zero later
932 vfloat weight_weight(1e-10f);
933 vfloat initial_weight = vfloat::zero();
934
935 // Accumulate error weighting of all the texels using this weight
936 vint weight_texel_count(di.weight_texel_count + i);
937 unsigned int max_texel_count = hmax(weight_texel_count).lane<0>();
938 promise(max_texel_count > 0);
939
940 for (unsigned int j = 0; j < max_texel_count; j++)
941 {
942 #ifdef ASTCENC_USE_COMMON_GATHERF
943 const uint8_t* texel = di.weight_texels_tr[j] + i;
944 #else
945 vint texel(di.weight_texels_tr[j] + i);
946 #endif
947 vfloat weight = loada(di.weights_texel_contribs_tr[j] + i);
948
949 if (!constant_wes)
950 {
951 weight_error_scale = gatherf(ei.weight_error_scale, texel);
952 }
953
954 vfloat contrib_weight = weight * weight_error_scale;
955
956 weight_weight += contrib_weight;
957 initial_weight += gatherf(ei.weights, texel) * contrib_weight;
958 }
959
960 storea(initial_weight / weight_weight, dec_weight_ideal_value + i);
961 }
962
963 // Populate the interpolated weight grid based on the initial average
964 // Process SIMD-width texel coordinates at at time while we can. Safe to
965 // over-process full SIMD vectors - the tail is zeroed.
966 if (di.max_texel_weight_count <= 2)
967 {
968 for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
969 {
970 vfloat weight = bilinear_infill_vla_2(di, dec_weight_ideal_value, i);
971 storea(weight, infilled_weights + i);
972 }
973 }
974 else
975 {
976 for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
977 {
978 vfloat weight = bilinear_infill_vla(di, dec_weight_ideal_value, i);
979 storea(weight, infilled_weights + i);
980 }
981 }
982
983 // Perform a single iteration of refinement
984 // Empirically determined step size; larger values don't help but smaller drops image quality
985 constexpr float stepsize = 0.25f;
986 constexpr float chd_scale = -WEIGHTS_TEXEL_SUM;
987
988 for (unsigned int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH)
989 {
990 vfloat weight_val = loada(dec_weight_ideal_value + i);
991
992 // Accumulate error weighting of all the texels using this weight
993 // Start with a small value to avoid div-by-zero later
994 vfloat error_change0(1e-10f);
995 vfloat error_change1(0.0f);
996
997 // Accumulate error weighting of all the texels using this weight
998 vint weight_texel_count(di.weight_texel_count + i);
999 unsigned int max_texel_count = hmax(weight_texel_count).lane<0>();
1000 promise(max_texel_count > 0);
1001
1002 for (unsigned int j = 0; j < max_texel_count; j++)
1003 {
1004 #ifdef ASTCENC_USE_COMMON_GATHERF
1005 const uint8_t* texel = di.weight_texels_tr[j] + i;
1006 #else
1007 vint texel(di.weight_texels_tr[j] + i);
1008 #endif
1009 vfloat contrib_weight = loada(di.weights_texel_contribs_tr[j] + i);
1010
1011 if (!constant_wes)
1012 {
1013 weight_error_scale = gatherf(ei.weight_error_scale, texel);
1014 }
1015
1016 vfloat scale = weight_error_scale * contrib_weight;
1017 vfloat old_weight = gatherf(infilled_weights, texel);
1018 vfloat ideal_weight = gatherf(ei.weights, texel);
1019
1020 error_change0 += contrib_weight * scale;
1021 error_change1 += (old_weight - ideal_weight) * scale;
1022 }
1023
1024 vfloat step = (error_change1 * chd_scale) / error_change0;
1025 step = clamp(-stepsize, stepsize, step);
1026
1027 // Update the weight; note this can store negative values
1028 storea(weight_val + step, dec_weight_ideal_value + i);
1029 }
1030 }
1031
1032 /* See header for documentation. */
compute_quantized_weights_for_decimation(const decimation_info & di,float low_bound,float high_bound,const float * dec_weight_ideal_value,float * weight_set_out,uint8_t * quantized_weight_set,quant_method quant_level)1033 void compute_quantized_weights_for_decimation(
1034 const decimation_info& di,
1035 float low_bound,
1036 float high_bound,
1037 const float* dec_weight_ideal_value,
1038 float* weight_set_out,
1039 uint8_t* quantized_weight_set,
1040 quant_method quant_level
1041 ) {
1042 int weight_count = di.weight_count;
1043 promise(weight_count > 0);
1044 const quant_and_transfer_table& qat = quant_and_xfer_tables[quant_level];
1045
1046 // The available quant levels, stored with a minus 1 bias
1047 static const float quant_levels_m1[12] {
1048 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 7.0f, 9.0f, 11.0f, 15.0f, 19.0f, 23.0f, 31.0f
1049 };
1050
1051 vint steps_m1(get_quant_level(quant_level) - 1);
1052 float quant_level_m1 = quant_levels_m1[quant_level];
1053
1054 // Quantize the weight set using both the specified low/high bounds and standard 0..1 bounds
1055
1056 // TODO: Oddity to investigate; triggered by test in issue #265.
1057 if (high_bound <= low_bound)
1058 {
1059 low_bound = 0.0f;
1060 high_bound = 1.0f;
1061 }
1062
1063 float rscale = high_bound - low_bound;
1064 float scale = 1.0f / rscale;
1065
1066 float scaled_low_bound = low_bound * scale;
1067 rscale *= 1.0f / 64.0f;
1068
1069 vfloat scalev(scale);
1070 vfloat scaled_low_boundv(scaled_low_bound);
1071 vfloat quant_level_m1v(quant_level_m1);
1072 vfloat rscalev(rscale);
1073 vfloat low_boundv(low_bound);
1074
1075 // This runs to the rounded-up SIMD size, which is safe as the loop tail is filled with known
1076 // safe data in compute_ideal_weights_for_decimation and arrays are always 64 elements
1077 if (get_quant_level(quant_level) <= 16)
1078 {
1079 vint4 tab0 = vint4::load(qat.quant_to_unquant);
1080 vint tab0p;
1081 vtable_prepare(tab0, tab0p);
1082
1083 for (int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH)
1084 {
1085 vfloat ix = loada(dec_weight_ideal_value + i) * scalev - scaled_low_boundv;
1086 ix = clampzo(ix);
1087
1088 // Look up the two closest indexes and return the one that was closest
1089 vfloat ix1 = ix * quant_level_m1v;
1090
1091 vint weightl = float_to_int(ix1);
1092 vint weighth = min(weightl + vint(1), steps_m1);
1093
1094 vint ixli = vtable_8bt_32bi(tab0p, weightl);
1095 vint ixhi = vtable_8bt_32bi(tab0p, weighth);
1096
1097 vmask mask = int_to_float(ixli + ixhi) < (vfloat(128.0f) * ix);
1098 vint weight = select(ixli, ixhi, mask);
1099 vfloat ixl = int_to_float(weight);
1100
1101 // Invert the weight-scaling that was done initially
1102 storea(ixl * rscalev + low_boundv, weight_set_out + i);
1103 vint scn = pack_low_bytes(weight);
1104 store_nbytes(scn, quantized_weight_set + i);
1105 }
1106 }
1107 else
1108 {
1109 vint4 tab0 = vint4::load(qat.quant_to_unquant + 0);
1110 vint4 tab1 = vint4::load(qat.quant_to_unquant + 16);
1111 vint tab0p, tab1p;
1112 vtable_prepare(tab0, tab1, tab0p, tab1p);
1113
1114 for (int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH)
1115 {
1116 vfloat ix = loada(dec_weight_ideal_value + i) * scalev - scaled_low_boundv;
1117 ix = clampzo(ix);
1118
1119 // Look up the two closest indexes and return the one that was closest
1120 vfloat ix1 = ix * quant_level_m1v;
1121
1122 vint weightl = float_to_int(ix1);
1123 vint weighth = min(weightl + vint(1), steps_m1);
1124
1125 vint ixli = vtable_8bt_32bi(tab0p, tab1p, weightl);
1126 vint ixhi = vtable_8bt_32bi(tab0p, tab1p, weighth);
1127
1128 vmask mask = int_to_float(ixli + ixhi) < (vfloat(128.0f) * ix);
1129 vint weight = select(ixli, ixhi, mask);
1130 vfloat ixl = int_to_float(weight);
1131
1132 // Invert the weight-scaling that was done initially
1133 storea(ixl * rscalev + low_boundv, weight_set_out + i);
1134 vint scn = pack_low_bytes(weight);
1135 store_nbytes(scn, quantized_weight_set + i);
1136 }
1137 }
1138 }
1139
1140 /**
1141 * @brief Compute the RGB + offset for a HDR endpoint mode #7.
1142 *
1143 * Since the matrix needed has a regular structure we can simplify the inverse calculation. This
1144 * gives us ~24 multiplications vs. 96 for a generic inverse.
1145 *
1146 * mat[0] = vfloat4(rgba_ws.x, 0.0f, 0.0f, wght_ws.x);
1147 * mat[1] = vfloat4( 0.0f, rgba_ws.y, 0.0f, wght_ws.y);
1148 * mat[2] = vfloat4( 0.0f, 0.0f, rgba_ws.z, wght_ws.z);
1149 * mat[3] = vfloat4(wght_ws.x, wght_ws.y, wght_ws.z, psum);
1150 * mat = invert(mat);
1151 *
1152 * @param rgba_weight_sum Sum of partition component error weights.
1153 * @param weight_weight_sum Sum of partition component error weights * texel weight.
1154 * @param rgbq_sum Sum of partition component error weights * texel weight * color data.
1155 * @param psum Sum of RGB color weights * texel weight^2.
1156 */
compute_rgbo_vector(vfloat4 rgba_weight_sum,vfloat4 weight_weight_sum,vfloat4 rgbq_sum,float psum)1157 static inline vfloat4 compute_rgbo_vector(
1158 vfloat4 rgba_weight_sum,
1159 vfloat4 weight_weight_sum,
1160 vfloat4 rgbq_sum,
1161 float psum
1162 ) {
1163 float X = rgba_weight_sum.lane<0>();
1164 float Y = rgba_weight_sum.lane<1>();
1165 float Z = rgba_weight_sum.lane<2>();
1166 float P = weight_weight_sum.lane<0>();
1167 float Q = weight_weight_sum.lane<1>();
1168 float R = weight_weight_sum.lane<2>();
1169 float S = psum;
1170
1171 float PP = P * P;
1172 float QQ = Q * Q;
1173 float RR = R * R;
1174
1175 float SZmRR = S * Z - RR;
1176 float DT = SZmRR * Y - Z * QQ;
1177 float YP = Y * P;
1178 float QX = Q * X;
1179 float YX = Y * X;
1180 float mZYP = -Z * YP;
1181 float mZQX = -Z * QX;
1182 float mRYX = -R * YX;
1183 float ZQP = Z * Q * P;
1184 float RYP = R * YP;
1185 float RQX = R * QX;
1186
1187 // Compute the reciprocal of matrix determinant
1188 float rdet = 1.0f / (DT * X + mZYP * P);
1189
1190 // Actually compute the adjugate, and then apply 1/det separately
1191 vfloat4 mat0(DT, ZQP, RYP, mZYP);
1192 vfloat4 mat1(ZQP, SZmRR * X - Z * PP, RQX, mZQX);
1193 vfloat4 mat2(RYP, RQX, (S * Y - QQ) * X - Y * PP, mRYX);
1194 vfloat4 mat3(mZYP, mZQX, mRYX, Z * YX);
1195 vfloat4 vect = rgbq_sum * rdet;
1196
1197 return vfloat4(dot_s(mat0, vect),
1198 dot_s(mat1, vect),
1199 dot_s(mat2, vect),
1200 dot_s(mat3, vect));
1201 }
1202
1203 /* See header for documentation. */
recompute_ideal_colors_1plane(const image_block & blk,const partition_info & pi,const decimation_info & di,const uint8_t * dec_weights_uquant,endpoints & ep,vfloat4 rgbs_vectors[BLOCK_MAX_PARTITIONS],vfloat4 rgbo_vectors[BLOCK_MAX_PARTITIONS])1204 void recompute_ideal_colors_1plane(
1205 const image_block& blk,
1206 const partition_info& pi,
1207 const decimation_info& di,
1208 const uint8_t* dec_weights_uquant,
1209 endpoints& ep,
1210 vfloat4 rgbs_vectors[BLOCK_MAX_PARTITIONS],
1211 vfloat4 rgbo_vectors[BLOCK_MAX_PARTITIONS]
1212 ) {
1213 unsigned int weight_count = di.weight_count;
1214 unsigned int total_texel_count = blk.texel_count;
1215 unsigned int partition_count = pi.partition_count;
1216
1217 promise(weight_count > 0);
1218 promise(total_texel_count > 0);
1219 promise(partition_count > 0);
1220
1221 ASTCENC_ALIGNAS float dec_weight[BLOCK_MAX_WEIGHTS];
1222 for (unsigned int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH)
1223 {
1224 vint unquant_value(dec_weights_uquant + i);
1225 vfloat unquant_valuef = int_to_float(unquant_value) * vfloat(1.0f / 64.0f);
1226 storea(unquant_valuef, dec_weight + i);
1227 }
1228
1229 ASTCENC_ALIGNAS float undec_weight[BLOCK_MAX_TEXELS];
1230 float* undec_weight_ref;
1231 if (di.max_texel_weight_count == 1)
1232 {
1233 undec_weight_ref = dec_weight;
1234 }
1235 else if (di.max_texel_weight_count <= 2)
1236 {
1237 for (unsigned int i = 0; i < total_texel_count; i += ASTCENC_SIMD_WIDTH)
1238 {
1239 vfloat weight = bilinear_infill_vla_2(di, dec_weight, i);
1240 storea(weight, undec_weight + i);
1241 }
1242
1243 undec_weight_ref = undec_weight;
1244 }
1245 else
1246 {
1247 for (unsigned int i = 0; i < total_texel_count; i += ASTCENC_SIMD_WIDTH)
1248 {
1249 vfloat weight = bilinear_infill_vla(di, dec_weight, i);
1250 storea(weight, undec_weight + i);
1251 }
1252
1253 undec_weight_ref = undec_weight;
1254 }
1255
1256 vfloat4 rgba_sum(blk.data_mean * static_cast<float>(blk.texel_count));
1257
1258 for (unsigned int i = 0; i < partition_count; i++)
1259 {
1260 unsigned int texel_count = pi.partition_texel_count[i];
1261 const uint8_t *texel_indexes = pi.texels_of_partition[i];
1262
1263 // Only compute a partition mean if more than one partition
1264 if (partition_count > 1)
1265 {
1266 rgba_sum = vfloat4::zero();
1267 promise(texel_count > 0);
1268 for (unsigned int j = 0; j < texel_count; j++)
1269 {
1270 unsigned int tix = texel_indexes[j];
1271 rgba_sum += blk.texel(tix);
1272 }
1273 }
1274
1275 rgba_sum = rgba_sum * blk.channel_weight;
1276 vfloat4 rgba_weight_sum = max(blk.channel_weight * static_cast<float>(texel_count), 1e-17f);
1277 vfloat4 scale_dir = normalize((rgba_sum / rgba_weight_sum).swz<0, 1, 2>());
1278
1279 float scale_max = 0.0f;
1280 float scale_min = 1e10f;
1281
1282 float wmin1 = 1.0f;
1283 float wmax1 = 0.0f;
1284
1285 float left_sum_s = 0.0f;
1286 float middle_sum_s = 0.0f;
1287 float right_sum_s = 0.0f;
1288
1289 vfloat4 color_vec_x = vfloat4::zero();
1290 vfloat4 color_vec_y = vfloat4::zero();
1291
1292 vfloat4 scale_vec = vfloat4::zero();
1293
1294 float weight_weight_sum_s = 1e-17f;
1295
1296 vfloat4 color_weight = blk.channel_weight;
1297 float ls_weight = hadd_rgb_s(color_weight);
1298
1299 for (unsigned int j = 0; j < texel_count; j++)
1300 {
1301 unsigned int tix = texel_indexes[j];
1302 vfloat4 rgba = blk.texel(tix);
1303
1304 float idx0 = undec_weight_ref[tix];
1305
1306 float om_idx0 = 1.0f - idx0;
1307 wmin1 = astc::min(idx0, wmin1);
1308 wmax1 = astc::max(idx0, wmax1);
1309
1310 float scale = dot3_s(scale_dir, rgba);
1311 scale_min = astc::min(scale, scale_min);
1312 scale_max = astc::max(scale, scale_max);
1313
1314 left_sum_s += om_idx0 * om_idx0;
1315 middle_sum_s += om_idx0 * idx0;
1316 right_sum_s += idx0 * idx0;
1317 weight_weight_sum_s += idx0;
1318
1319 vfloat4 color_idx(idx0);
1320 vfloat4 cwprod = rgba;
1321 vfloat4 cwiprod = cwprod * color_idx;
1322
1323 color_vec_y += cwiprod;
1324 color_vec_x += cwprod - cwiprod;
1325
1326 scale_vec += vfloat2(om_idx0, idx0) * (scale * ls_weight);
1327 }
1328
1329 vfloat4 left_sum = vfloat4(left_sum_s) * color_weight;
1330 vfloat4 middle_sum = vfloat4(middle_sum_s) * color_weight;
1331 vfloat4 right_sum = vfloat4(right_sum_s) * color_weight;
1332 vfloat4 lmrs_sum = vfloat3(left_sum_s, middle_sum_s, right_sum_s) * ls_weight;
1333
1334 color_vec_x = color_vec_x * color_weight;
1335 color_vec_y = color_vec_y * color_weight;
1336
1337 // Initialize the luminance and scale vectors with a reasonable default
1338 float scalediv = scale_min / astc::max(scale_max, 1e-10f);
1339 scalediv = astc::clamp1f(scalediv);
1340
1341 vfloat4 sds = scale_dir * scale_max;
1342
1343 rgbs_vectors[i] = vfloat4(sds.lane<0>(), sds.lane<1>(), sds.lane<2>(), scalediv);
1344
1345 if (wmin1 >= wmax1 * 0.999f)
1346 {
1347 // If all weights in the partition were equal, then just take average of all colors in
1348 // the partition and use that as both endpoint colors
1349 vfloat4 avg = (color_vec_x + color_vec_y) / rgba_weight_sum;
1350
1351 vmask4 notnan_mask = avg == avg;
1352 ep.endpt0[i] = select(ep.endpt0[i], avg, notnan_mask);
1353 ep.endpt1[i] = select(ep.endpt1[i], avg, notnan_mask);
1354
1355 rgbs_vectors[i] = vfloat4(sds.lane<0>(), sds.lane<1>(), sds.lane<2>(), 1.0f);
1356 }
1357 else
1358 {
1359 // Otherwise, complete the analytic calculation of ideal-endpoint-values for the given
1360 // set of texel weights and pixel colors
1361 vfloat4 color_det1 = (left_sum * right_sum) - (middle_sum * middle_sum);
1362 vfloat4 color_rdet1 = 1.0f / color_det1;
1363
1364 float ls_det1 = (lmrs_sum.lane<0>() * lmrs_sum.lane<2>()) - (lmrs_sum.lane<1>() * lmrs_sum.lane<1>());
1365 float ls_rdet1 = 1.0f / ls_det1;
1366
1367 vfloat4 color_mss1 = (left_sum * left_sum)
1368 + (2.0f * middle_sum * middle_sum)
1369 + (right_sum * right_sum);
1370
1371 float ls_mss1 = (lmrs_sum.lane<0>() * lmrs_sum.lane<0>())
1372 + (2.0f * lmrs_sum.lane<1>() * lmrs_sum.lane<1>())
1373 + (lmrs_sum.lane<2>() * lmrs_sum.lane<2>());
1374
1375 vfloat4 ep0 = (right_sum * color_vec_x - middle_sum * color_vec_y) * color_rdet1;
1376 vfloat4 ep1 = (left_sum * color_vec_y - middle_sum * color_vec_x) * color_rdet1;
1377
1378 vmask4 det_mask = abs(color_det1) > (color_mss1 * 1e-4f);
1379 vmask4 notnan_mask = (ep0 == ep0) & (ep1 == ep1);
1380 vmask4 full_mask = det_mask & notnan_mask;
1381
1382 ep.endpt0[i] = select(ep.endpt0[i], ep0, full_mask);
1383 ep.endpt1[i] = select(ep.endpt1[i], ep1, full_mask);
1384
1385 float scale_ep0 = (lmrs_sum.lane<2>() * scale_vec.lane<0>() - lmrs_sum.lane<1>() * scale_vec.lane<1>()) * ls_rdet1;
1386 float scale_ep1 = (lmrs_sum.lane<0>() * scale_vec.lane<1>() - lmrs_sum.lane<1>() * scale_vec.lane<0>()) * ls_rdet1;
1387
1388 if (fabsf(ls_det1) > (ls_mss1 * 1e-4f) && scale_ep0 == scale_ep0 && scale_ep1 == scale_ep1 && scale_ep0 < scale_ep1)
1389 {
1390 float scalediv2 = scale_ep0 / scale_ep1;
1391 vfloat4 sdsm = scale_dir * scale_ep1;
1392 rgbs_vectors[i] = vfloat4(sdsm.lane<0>(), sdsm.lane<1>(), sdsm.lane<2>(), scalediv2);
1393 }
1394 }
1395
1396 // Calculations specific to mode #7, the HDR RGB-scale mode - skip if known LDR
1397 if (blk.rgb_lns[0] || blk.alpha_lns[0])
1398 {
1399 vfloat4 weight_weight_sum = vfloat4(weight_weight_sum_s) * color_weight;
1400 float psum = right_sum_s * hadd_rgb_s(color_weight);
1401
1402 vfloat4 rgbq_sum = color_vec_x + color_vec_y;
1403 rgbq_sum.set_lane<3>(hadd_rgb_s(color_vec_y));
1404
1405 vfloat4 rgbovec = compute_rgbo_vector(rgba_weight_sum, weight_weight_sum, rgbq_sum, psum);
1406 rgbo_vectors[i] = rgbovec;
1407
1408 // We can get a failure due to the use of a singular (non-invertible) matrix
1409 // If it failed, compute rgbo_vectors[] with a different method ...
1410 if (astc::isnan(dot_s(rgbovec, rgbovec)))
1411 {
1412 vfloat4 v0 = ep.endpt0[i];
1413 vfloat4 v1 = ep.endpt1[i];
1414
1415 float avgdif = hadd_rgb_s(v1 - v0) * (1.0f / 3.0f);
1416 avgdif = astc::max(avgdif, 0.0f);
1417
1418 vfloat4 avg = (v0 + v1) * 0.5f;
1419 vfloat4 ep0 = avg - vfloat4(avgdif) * 0.5f;
1420 rgbo_vectors[i] = vfloat4(ep0.lane<0>(), ep0.lane<1>(), ep0.lane<2>(), avgdif);
1421 }
1422 }
1423 }
1424 }
1425
1426 /* See header for documentation. */
recompute_ideal_colors_2planes(const image_block & blk,const block_size_descriptor & bsd,const decimation_info & di,const uint8_t * dec_weights_uquant_plane1,const uint8_t * dec_weights_uquant_plane2,endpoints & ep,vfloat4 & rgbs_vector,vfloat4 & rgbo_vector,int plane2_component)1427 void recompute_ideal_colors_2planes(
1428 const image_block& blk,
1429 const block_size_descriptor& bsd,
1430 const decimation_info& di,
1431 const uint8_t* dec_weights_uquant_plane1,
1432 const uint8_t* dec_weights_uquant_plane2,
1433 endpoints& ep,
1434 vfloat4& rgbs_vector,
1435 vfloat4& rgbo_vector,
1436 int plane2_component
1437 ) {
1438 unsigned int weight_count = di.weight_count;
1439 unsigned int total_texel_count = blk.texel_count;
1440
1441 promise(total_texel_count > 0);
1442 promise(weight_count > 0);
1443
1444 ASTCENC_ALIGNAS float dec_weight_plane1[BLOCK_MAX_WEIGHTS_2PLANE];
1445 ASTCENC_ALIGNAS float dec_weight_plane2[BLOCK_MAX_WEIGHTS_2PLANE];
1446
1447 assert(weight_count <= BLOCK_MAX_WEIGHTS_2PLANE);
1448
1449 for (unsigned int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH)
1450 {
1451 vint unquant_value1(dec_weights_uquant_plane1 + i);
1452 vfloat unquant_value1f = int_to_float(unquant_value1) * vfloat(1.0f / 64.0f);
1453 storea(unquant_value1f, dec_weight_plane1 + i);
1454
1455 vint unquant_value2(dec_weights_uquant_plane2 + i);
1456 vfloat unquant_value2f = int_to_float(unquant_value2) * vfloat(1.0f / 64.0f);
1457 storea(unquant_value2f, dec_weight_plane2 + i);
1458 }
1459
1460 ASTCENC_ALIGNAS float undec_weight_plane1[BLOCK_MAX_TEXELS];
1461 ASTCENC_ALIGNAS float undec_weight_plane2[BLOCK_MAX_TEXELS];
1462
1463 float* undec_weight_plane1_ref;
1464 float* undec_weight_plane2_ref;
1465
1466 if (di.max_texel_weight_count == 1)
1467 {
1468 undec_weight_plane1_ref = dec_weight_plane1;
1469 undec_weight_plane2_ref = dec_weight_plane2;
1470 }
1471 else if (di.max_texel_weight_count <= 2)
1472 {
1473 for (unsigned int i = 0; i < total_texel_count; i += ASTCENC_SIMD_WIDTH)
1474 {
1475 vfloat weight = bilinear_infill_vla_2(di, dec_weight_plane1, i);
1476 storea(weight, undec_weight_plane1 + i);
1477
1478 weight = bilinear_infill_vla_2(di, dec_weight_plane2, i);
1479 storea(weight, undec_weight_plane2 + i);
1480 }
1481
1482 undec_weight_plane1_ref = undec_weight_plane1;
1483 undec_weight_plane2_ref = undec_weight_plane2;
1484 }
1485 else
1486 {
1487 for (unsigned int i = 0; i < total_texel_count; i += ASTCENC_SIMD_WIDTH)
1488 {
1489 vfloat weight = bilinear_infill_vla(di, dec_weight_plane1, i);
1490 storea(weight, undec_weight_plane1 + i);
1491
1492 weight = bilinear_infill_vla(di, dec_weight_plane2, i);
1493 storea(weight, undec_weight_plane2 + i);
1494 }
1495
1496 undec_weight_plane1_ref = undec_weight_plane1;
1497 undec_weight_plane2_ref = undec_weight_plane2;
1498 }
1499
1500 unsigned int texel_count = bsd.texel_count;
1501 vfloat4 rgba_weight_sum = max(blk.channel_weight * static_cast<float>(texel_count), 1e-17f);
1502 vfloat4 scale_dir = normalize(blk.data_mean.swz<0, 1, 2>());
1503
1504 float scale_max = 0.0f;
1505 float scale_min = 1e10f;
1506
1507 float wmin1 = 1.0f;
1508 float wmax1 = 0.0f;
1509
1510 float wmin2 = 1.0f;
1511 float wmax2 = 0.0f;
1512
1513 float left1_sum_s = 0.0f;
1514 float middle1_sum_s = 0.0f;
1515 float right1_sum_s = 0.0f;
1516
1517 float left2_sum_s = 0.0f;
1518 float middle2_sum_s = 0.0f;
1519 float right2_sum_s = 0.0f;
1520
1521 vfloat4 color_vec_x = vfloat4::zero();
1522 vfloat4 color_vec_y = vfloat4::zero();
1523
1524 vfloat4 scale_vec = vfloat4::zero();
1525
1526 vfloat4 weight_weight_sum = vfloat4(1e-17f);
1527
1528 vmask4 p2_mask = vint4::lane_id() == vint4(plane2_component);
1529 vfloat4 color_weight = blk.channel_weight;
1530 float ls_weight = hadd_rgb_s(color_weight);
1531
1532 for (unsigned int j = 0; j < texel_count; j++)
1533 {
1534 vfloat4 rgba = blk.texel(j);
1535
1536 float idx0 = undec_weight_plane1_ref[j];
1537
1538 float om_idx0 = 1.0f - idx0;
1539 wmin1 = astc::min(idx0, wmin1);
1540 wmax1 = astc::max(idx0, wmax1);
1541
1542 float scale = dot3_s(scale_dir, rgba);
1543 scale_min = astc::min(scale, scale_min);
1544 scale_max = astc::max(scale, scale_max);
1545
1546 left1_sum_s += om_idx0 * om_idx0;
1547 middle1_sum_s += om_idx0 * idx0;
1548 right1_sum_s += idx0 * idx0;
1549
1550 float idx1 = undec_weight_plane2_ref[j];
1551
1552 float om_idx1 = 1.0f - idx1;
1553 wmin2 = astc::min(idx1, wmin2);
1554 wmax2 = astc::max(idx1, wmax2);
1555
1556 left2_sum_s += om_idx1 * om_idx1;
1557 middle2_sum_s += om_idx1 * idx1;
1558 right2_sum_s += idx1 * idx1;
1559
1560 vfloat4 color_idx = select(vfloat4(idx0), vfloat4(idx1), p2_mask);
1561
1562 vfloat4 cwprod = rgba;
1563 vfloat4 cwiprod = cwprod * color_idx;
1564
1565 color_vec_y += cwiprod;
1566 color_vec_x += cwprod - cwiprod;
1567
1568 scale_vec += vfloat2(om_idx0, idx0) * (ls_weight * scale);
1569 weight_weight_sum += color_idx;
1570 }
1571
1572 vfloat4 left1_sum = vfloat4(left1_sum_s) * color_weight;
1573 vfloat4 middle1_sum = vfloat4(middle1_sum_s) * color_weight;
1574 vfloat4 right1_sum = vfloat4(right1_sum_s) * color_weight;
1575 vfloat4 lmrs_sum = vfloat3(left1_sum_s, middle1_sum_s, right1_sum_s) * ls_weight;
1576
1577 vfloat4 left2_sum = vfloat4(left2_sum_s) * color_weight;
1578 vfloat4 middle2_sum = vfloat4(middle2_sum_s) * color_weight;
1579 vfloat4 right2_sum = vfloat4(right2_sum_s) * color_weight;
1580
1581 color_vec_x = color_vec_x * color_weight;
1582 color_vec_y = color_vec_y * color_weight;
1583
1584 // Initialize the luminance and scale vectors with a reasonable default
1585 float scalediv = scale_min / astc::max(scale_max, 1e-10f);
1586 scalediv = astc::clamp1f(scalediv);
1587
1588 vfloat4 sds = scale_dir * scale_max;
1589
1590 rgbs_vector = vfloat4(sds.lane<0>(), sds.lane<1>(), sds.lane<2>(), scalediv);
1591
1592 if (wmin1 >= wmax1 * 0.999f)
1593 {
1594 // If all weights in the partition were equal, then just take average of all colors in
1595 // the partition and use that as both endpoint colors
1596 vfloat4 avg = (color_vec_x + color_vec_y) / rgba_weight_sum;
1597
1598 vmask4 p1_mask = vint4::lane_id() != vint4(plane2_component);
1599 vmask4 notnan_mask = avg == avg;
1600 vmask4 full_mask = p1_mask & notnan_mask;
1601
1602 ep.endpt0[0] = select(ep.endpt0[0], avg, full_mask);
1603 ep.endpt1[0] = select(ep.endpt1[0], avg, full_mask);
1604
1605 rgbs_vector = vfloat4(sds.lane<0>(), sds.lane<1>(), sds.lane<2>(), 1.0f);
1606 }
1607 else
1608 {
1609 // Otherwise, complete the analytic calculation of ideal-endpoint-values for the given
1610 // set of texel weights and pixel colors
1611 vfloat4 color_det1 = (left1_sum * right1_sum) - (middle1_sum * middle1_sum);
1612 vfloat4 color_rdet1 = 1.0f / color_det1;
1613
1614 float ls_det1 = (lmrs_sum.lane<0>() * lmrs_sum.lane<2>()) - (lmrs_sum.lane<1>() * lmrs_sum.lane<1>());
1615 float ls_rdet1 = 1.0f / ls_det1;
1616
1617 vfloat4 color_mss1 = (left1_sum * left1_sum)
1618 + (2.0f * middle1_sum * middle1_sum)
1619 + (right1_sum * right1_sum);
1620
1621 float ls_mss1 = (lmrs_sum.lane<0>() * lmrs_sum.lane<0>())
1622 + (2.0f * lmrs_sum.lane<1>() * lmrs_sum.lane<1>())
1623 + (lmrs_sum.lane<2>() * lmrs_sum.lane<2>());
1624
1625 vfloat4 ep0 = (right1_sum * color_vec_x - middle1_sum * color_vec_y) * color_rdet1;
1626 vfloat4 ep1 = (left1_sum * color_vec_y - middle1_sum * color_vec_x) * color_rdet1;
1627
1628 float scale_ep0 = (lmrs_sum.lane<2>() * scale_vec.lane<0>() - lmrs_sum.lane<1>() * scale_vec.lane<1>()) * ls_rdet1;
1629 float scale_ep1 = (lmrs_sum.lane<0>() * scale_vec.lane<1>() - lmrs_sum.lane<1>() * scale_vec.lane<0>()) * ls_rdet1;
1630
1631 vmask4 p1_mask = vint4::lane_id() != vint4(plane2_component);
1632 vmask4 det_mask = abs(color_det1) > (color_mss1 * 1e-4f);
1633 vmask4 notnan_mask = (ep0 == ep0) & (ep1 == ep1);
1634 vmask4 full_mask = p1_mask & det_mask & notnan_mask;
1635
1636 ep.endpt0[0] = select(ep.endpt0[0], ep0, full_mask);
1637 ep.endpt1[0] = select(ep.endpt1[0], ep1, full_mask);
1638
1639 if (fabsf(ls_det1) > (ls_mss1 * 1e-4f) && scale_ep0 == scale_ep0 && scale_ep1 == scale_ep1 && scale_ep0 < scale_ep1)
1640 {
1641 float scalediv2 = scale_ep0 / scale_ep1;
1642 vfloat4 sdsm = scale_dir * scale_ep1;
1643 rgbs_vector = vfloat4(sdsm.lane<0>(), sdsm.lane<1>(), sdsm.lane<2>(), scalediv2);
1644 }
1645 }
1646
1647 if (wmin2 >= wmax2 * 0.999f)
1648 {
1649 // If all weights in the partition were equal, then just take average of all colors in
1650 // the partition and use that as both endpoint colors
1651 vfloat4 avg = (color_vec_x + color_vec_y) / rgba_weight_sum;
1652
1653 vmask4 notnan_mask = avg == avg;
1654 vmask4 full_mask = p2_mask & notnan_mask;
1655
1656 ep.endpt0[0] = select(ep.endpt0[0], avg, full_mask);
1657 ep.endpt1[0] = select(ep.endpt1[0], avg, full_mask);
1658 }
1659 else
1660 {
1661 // Otherwise, complete the analytic calculation of ideal-endpoint-values for the given
1662 // set of texel weights and pixel colors
1663 vfloat4 color_det2 = (left2_sum * right2_sum) - (middle2_sum * middle2_sum);
1664 vfloat4 color_rdet2 = 1.0f / color_det2;
1665
1666 vfloat4 color_mss2 = (left2_sum * left2_sum)
1667 + (2.0f * middle2_sum * middle2_sum)
1668 + (right2_sum * right2_sum);
1669
1670 vfloat4 ep0 = (right2_sum * color_vec_x - middle2_sum * color_vec_y) * color_rdet2;
1671 vfloat4 ep1 = (left2_sum * color_vec_y - middle2_sum * color_vec_x) * color_rdet2;
1672
1673 vmask4 det_mask = abs(color_det2) > (color_mss2 * 1e-4f);
1674 vmask4 notnan_mask = (ep0 == ep0) & (ep1 == ep1);
1675 vmask4 full_mask = p2_mask & det_mask & notnan_mask;
1676
1677 ep.endpt0[0] = select(ep.endpt0[0], ep0, full_mask);
1678 ep.endpt1[0] = select(ep.endpt1[0], ep1, full_mask);
1679 }
1680
1681 // Calculations specific to mode #7, the HDR RGB-scale mode - skip if known LDR
1682 if (blk.rgb_lns[0] || blk.alpha_lns[0])
1683 {
1684 weight_weight_sum = weight_weight_sum * color_weight;
1685 float psum = dot3_s(select(right1_sum, right2_sum, p2_mask), color_weight);
1686
1687 vfloat4 rgbq_sum = color_vec_x + color_vec_y;
1688 rgbq_sum.set_lane<3>(hadd_rgb_s(color_vec_y));
1689
1690 rgbo_vector = compute_rgbo_vector(rgba_weight_sum, weight_weight_sum, rgbq_sum, psum);
1691
1692 // We can get a failure due to the use of a singular (non-invertible) matrix
1693 // If it failed, compute rgbo_vectors[] with a different method ...
1694 if (astc::isnan(dot_s(rgbo_vector, rgbo_vector)))
1695 {
1696 vfloat4 v0 = ep.endpt0[0];
1697 vfloat4 v1 = ep.endpt1[0];
1698
1699 float avgdif = hadd_rgb_s(v1 - v0) * (1.0f / 3.0f);
1700 avgdif = astc::max(avgdif, 0.0f);
1701
1702 vfloat4 avg = (v0 + v1) * 0.5f;
1703 vfloat4 ep0 = avg - vfloat4(avgdif) * 0.5f;
1704
1705 rgbo_vector = vfloat4(ep0.lane<0>(), ep0.lane<1>(), ep0.lane<2>(), avgdif);
1706 }
1707 }
1708 }
1709
1710 #endif
1711