• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // SPDX-License-Identifier: Apache-2.0
2 // ----------------------------------------------------------------------------
3 // Copyright 2011-2022 Arm Limited
4 //
5 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
6 // use this file except in compliance with the License. You may obtain a copy
7 // of the License at:
8 //
9 //     http://www.apache.org/licenses/LICENSE-2.0
10 //
11 // Unless required by applicable law or agreed to in writing, software
12 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
13 // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
14 // License for the specific language governing permissions and limitations
15 // under the License.
16 // ----------------------------------------------------------------------------
17 
18 #if !defined(ASTCENC_DECOMPRESS_ONLY)
19 
20 /**
21  * @brief Functions for computing color endpoints and texel weights.
22  */
23 
24 #include <cassert>
25 
26 #include "astcenc_internal.h"
27 #include "astcenc_vecmathlib.h"
28 
29 /**
30  * @brief Compute the ideal endpoints and weights for 1 color component.
31  *
32  * @param      blk         The image block color data to compress.
33  * @param      pi          The partition info for the current trial.
34  * @param[out] ei          The computed ideal endpoints and weights.
35  * @param      component   The color component to compute.
36  */
compute_ideal_colors_and_weights_1_comp(const image_block & blk,const partition_info & pi,endpoints_and_weights & ei,unsigned int component)37 static void compute_ideal_colors_and_weights_1_comp(
38 	const image_block& blk,
39 	const partition_info& pi,
40 	endpoints_and_weights& ei,
41 	unsigned int component
42 ) {
43 	unsigned int partition_count = pi.partition_count;
44 	ei.ep.partition_count = partition_count;
45 	promise(partition_count > 0);
46 
47 	unsigned int texel_count = blk.texel_count;
48 	promise(texel_count > 0);
49 
50 	float error_weight;
51 	const float* data_vr = nullptr;
52 
53 	assert(component < BLOCK_MAX_COMPONENTS);
54 	switch (component)
55 	{
56 	case 0:
57 		error_weight = blk.channel_weight.lane<0>();
58 		data_vr = blk.data_r;
59 		break;
60 	case 1:
61 		error_weight = blk.channel_weight.lane<1>();
62 		data_vr = blk.data_g;
63 		break;
64 	case 2:
65 		error_weight = blk.channel_weight.lane<2>();
66 		data_vr = blk.data_b;
67 		break;
68 	default:
69 		assert(component == 3);
70 		error_weight = blk.channel_weight.lane<3>();
71 		data_vr = blk.data_a;
72 		break;
73 	}
74 
75 	vmask4 sep_mask = vint4::lane_id() == vint4(component);
76 	bool is_constant_wes { true };
77 	float partition0_len_sq { 0.0f };
78 
79 	for (unsigned int i = 0; i < partition_count; i++)
80 	{
81 		float lowvalue { 1e10f };
82 		float highvalue { -1e10f };
83 
84 		unsigned int partition_texel_count = pi.partition_texel_count[i];
85 		for (unsigned int j = 0; j < partition_texel_count; j++)
86 		{
87 			unsigned int tix = pi.texels_of_partition[i][j];
88 			float value = data_vr[tix];
89 			lowvalue = astc::min(value, lowvalue);
90 			highvalue = astc::max(value, highvalue);
91 		}
92 
93 		if (highvalue < lowvalue)
94 		{
95 			lowvalue = 0.0f;
96 			highvalue = 1e-7f;
97 		}
98 
99 		float length = highvalue - lowvalue;
100 		float length_squared = length * length;
101 		float scale = 1.0f / length;
102 
103 		if (i == 0)
104 		{
105 			partition0_len_sq = length_squared;
106 		}
107 		else
108 		{
109 			is_constant_wes = is_constant_wes && length_squared == partition0_len_sq;
110 		}
111 
112 		for (unsigned int j = 0; j < partition_texel_count; j++)
113 		{
114 			unsigned int tix = pi.texels_of_partition[i][j];
115 			float value = (data_vr[tix] - lowvalue) * scale;
116 			value = astc::clamp1f(value);
117 
118 			ei.weights[tix] = value;
119 			ei.weight_error_scale[tix] = length_squared * error_weight;
120 			assert(!astc::isnan(ei.weight_error_scale[tix]));
121 		}
122 
123 		ei.ep.endpt0[i] = select(blk.data_min, vfloat4(lowvalue), sep_mask);
124 		ei.ep.endpt1[i] = select(blk.data_max, vfloat4(highvalue), sep_mask);
125 	}
126 
127 	// Zero initialize any SIMD over-fetch
128 	unsigned int texel_count_simd = round_up_to_simd_multiple_vla(texel_count);
129 	for (unsigned int i = texel_count; i < texel_count_simd; i++)
130 	{
131 		ei.weights[i] = 0.0f;
132 		ei.weight_error_scale[i] = 0.0f;
133 	}
134 
135 	ei.is_constant_weight_error_scale = is_constant_wes;
136 }
137 
138 /**
139  * @brief Compute the ideal endpoints and weights for 2 color components.
140  *
141  * @param      blk          The image block color data to compress.
142  * @param      pi           The partition info for the current trial.
143  * @param[out] ei           The computed ideal endpoints and weights.
144  * @param      component1   The first color component to compute.
145  * @param      component2   The second color component to compute.
146  */
compute_ideal_colors_and_weights_2_comp(const image_block & blk,const partition_info & pi,endpoints_and_weights & ei,int component1,int component2)147 static void compute_ideal_colors_and_weights_2_comp(
148 	const image_block& blk,
149 	const partition_info& pi,
150 	endpoints_and_weights& ei,
151 	int component1,
152 	int component2
153 ) {
154 	unsigned int partition_count = pi.partition_count;
155 	ei.ep.partition_count = partition_count;
156 	promise(partition_count > 0);
157 
158 	unsigned int texel_count = blk.texel_count;
159 	promise(texel_count > 0);
160 
161 	partition_metrics pms[BLOCK_MAX_PARTITIONS];
162 
163 	float error_weight;
164 	const float* data_vr = nullptr;
165 	const float* data_vg = nullptr;
166 
167 	if (component1 == 0 && component2 == 1)
168 	{
169 		error_weight = hadd_s(blk.channel_weight.swz<0, 1>()) / 2.0f;
170 
171 		data_vr = blk.data_r;
172 		data_vg = blk.data_g;
173 	}
174 	else if (component1 == 0 && component2 == 2)
175 	{
176 		error_weight = hadd_s(blk.channel_weight.swz<0, 2>()) / 2.0f;
177 
178 		data_vr = blk.data_r;
179 		data_vg = blk.data_b;
180 	}
181 	else // (component1 == 1 && component2 == 2)
182 	{
183 		assert(component1 == 1 && component2 == 2);
184 
185 		error_weight = hadd_s(blk.channel_weight.swz<1, 2>()) / 2.0f;
186 
187 		data_vr = blk.data_g;
188 		data_vg = blk.data_b;
189 	}
190 
191 	compute_avgs_and_dirs_2_comp(pi, blk, component1, component2, pms);
192 
193 	bool is_constant_wes { true };
194 	float partition0_len_sq { 0.0f };
195 
196 	vmask4 comp1_mask = vint4::lane_id() == vint4(component1);
197 	vmask4 comp2_mask = vint4::lane_id() == vint4(component2);
198 
199 	for (unsigned int i = 0; i < partition_count; i++)
200 	{
201 		vfloat4 dir = pms[i].dir.swz<0, 1>();
202 		if (hadd_s(dir) < 0.0f)
203 		{
204 			dir = vfloat4::zero() - dir;
205 		}
206 
207 		line2 line { pms[i].avg.swz<0, 1>(), normalize_safe(dir, unit2()) };
208 		float lowparam { 1e10f };
209 		float highparam { -1e10f };
210 
211 		unsigned int partition_texel_count = pi.partition_texel_count[i];
212 		for (unsigned int j = 0; j < partition_texel_count; j++)
213 		{
214 			unsigned int tix = pi.texels_of_partition[i][j];
215 			vfloat4 point = vfloat2(data_vr[tix], data_vg[tix]);
216 			float param = dot_s(point - line.a, line.b);
217 			ei.weights[tix] = param;
218 
219 			lowparam = astc::min(param, lowparam);
220 			highparam = astc::max(param, highparam);
221 		}
222 
223 		// It is possible for a uniform-color partition to produce length=0;
224 		// this causes NaN issues so set to small value to avoid this problem
225 		if (highparam < lowparam)
226 		{
227 			lowparam = 0.0f;
228 			highparam = 1e-7f;
229 		}
230 
231 		float length = highparam - lowparam;
232 		float length_squared = length * length;
233 		float scale = 1.0f / length;
234 
235 		if (i == 0)
236 		{
237 			partition0_len_sq = length_squared;
238 		}
239 		else
240 		{
241 			is_constant_wes = is_constant_wes && length_squared == partition0_len_sq;
242 		}
243 
244 		for (unsigned int j = 0; j < partition_texel_count; j++)
245 		{
246 			unsigned int tix = pi.texels_of_partition[i][j];
247 			float idx = (ei.weights[tix] - lowparam) * scale;
248 			idx = astc::clamp1f(idx);
249 
250 			ei.weights[tix] = idx;
251 			ei.weight_error_scale[tix] = length_squared * error_weight;
252 			assert(!astc::isnan(ei.weight_error_scale[tix]));
253 		}
254 
255 		vfloat4 lowvalue = line.a + line.b * lowparam;
256 		vfloat4 highvalue = line.a + line.b * highparam;
257 
258 		vfloat4 ep0 = select(blk.data_min, vfloat4(lowvalue.lane<0>()), comp1_mask);
259 		vfloat4 ep1 = select(blk.data_max, vfloat4(highvalue.lane<0>()), comp1_mask);
260 
261 		ei.ep.endpt0[i] = select(ep0, vfloat4(lowvalue.lane<1>()), comp2_mask);
262 		ei.ep.endpt1[i] = select(ep1, vfloat4(highvalue.lane<1>()), comp2_mask);
263 	}
264 
265 	// Zero initialize any SIMD over-fetch
266 	unsigned int texel_count_simd = round_up_to_simd_multiple_vla(texel_count);
267 	for (unsigned int i = texel_count; i < texel_count_simd; i++)
268 	{
269 		ei.weights[i] = 0.0f;
270 		ei.weight_error_scale[i] = 0.0f;
271 	}
272 
273 	ei.is_constant_weight_error_scale = is_constant_wes;
274 }
275 
276 /**
277  * @brief Compute the ideal endpoints and weights for 3 color components.
278  *
279  * @param      blk                 The image block color data to compress.
280  * @param      pi                  The partition info for the current trial.
281  * @param[out] ei                  The computed ideal endpoints and weights.
282  * @param      omitted_component   The color component excluded from the calculation.
283  */
compute_ideal_colors_and_weights_3_comp(const image_block & blk,const partition_info & pi,endpoints_and_weights & ei,unsigned int omitted_component)284 static void compute_ideal_colors_and_weights_3_comp(
285 	const image_block& blk,
286 	const partition_info& pi,
287 	endpoints_and_weights& ei,
288 	unsigned int omitted_component
289 ) {
290 	unsigned int partition_count = pi.partition_count;
291 	ei.ep.partition_count = partition_count;
292 	promise(partition_count > 0);
293 
294 	unsigned int texel_count = blk.texel_count;
295 	promise(texel_count > 0);
296 
297 	partition_metrics pms[BLOCK_MAX_PARTITIONS];
298 
299 	float error_weight;
300 	const float* data_vr = nullptr;
301 	const float* data_vg = nullptr;
302 	const float* data_vb = nullptr;
303 	if (omitted_component == 0)
304 	{
305 		error_weight = hadd_s(blk.channel_weight.swz<0, 1, 2>());
306 		data_vr = blk.data_g;
307 		data_vg = blk.data_b;
308 		data_vb = blk.data_a;
309 	}
310 	else if (omitted_component == 1)
311 	{
312 		error_weight = hadd_s(blk.channel_weight.swz<0, 2, 3>());
313 		data_vr = blk.data_r;
314 		data_vg = blk.data_b;
315 		data_vb = blk.data_a;
316 	}
317 	else if (omitted_component == 2)
318 	{
319 		error_weight = hadd_s(blk.channel_weight.swz<0, 1, 3>());
320 		data_vr = blk.data_r;
321 		data_vg = blk.data_g;
322 		data_vb = blk.data_a;
323 	}
324 	else
325 	{
326 		assert(omitted_component == 3);
327 
328 		error_weight = hadd_s(blk.channel_weight.swz<0, 1, 2>());
329 		data_vr = blk.data_r;
330 		data_vg = blk.data_g;
331 		data_vb = blk.data_b;
332 	}
333 
334 	error_weight = error_weight * (1.0f / 3.0f);
335 
336 	if (omitted_component == 3)
337 	{
338 		compute_avgs_and_dirs_3_comp_rgb(pi, blk, pms);
339 	}
340 	else
341 	{
342 		compute_avgs_and_dirs_3_comp(pi, blk, omitted_component, pms);
343 	}
344 
345 	bool is_constant_wes { true };
346 	float partition0_len_sq { 0.0f };
347 
348 	for (unsigned int i = 0; i < partition_count; i++)
349 	{
350 		vfloat4 dir = pms[i].dir;
351 		if (hadd_rgb_s(dir) < 0.0f)
352 		{
353 			dir = vfloat4::zero() - dir;
354 		}
355 
356 		line3 line { pms[i].avg, normalize_safe(dir, unit3()) };
357 		float lowparam { 1e10f };
358 		float highparam { -1e10f };
359 
360 		unsigned int partition_texel_count = pi.partition_texel_count[i];
361 		for (unsigned int j = 0; j < partition_texel_count; j++)
362 		{
363 			unsigned int tix = pi.texels_of_partition[i][j];
364 			vfloat4 point = vfloat3(data_vr[tix], data_vg[tix], data_vb[tix]);
365 			float param = dot3_s(point - line.a, line.b);
366 			ei.weights[tix] = param;
367 
368 			lowparam = astc::min(param, lowparam);
369 			highparam = astc::max(param, highparam);
370 		}
371 
372 		// It is possible for a uniform-color partition to produce length=0;
373 		// this causes NaN issues so set to small value to avoid this problem
374 		if (highparam < lowparam)
375 		{
376 			lowparam = 0.0f;
377 			highparam = 1e-7f;
378 		}
379 
380 		float length = highparam - lowparam;
381 		float length_squared = length * length;
382 		float scale = 1.0f / length;
383 
384 		if (i == 0)
385 		{
386 			partition0_len_sq = length_squared;
387 		}
388 		else
389 		{
390 			is_constant_wes = is_constant_wes && length_squared == partition0_len_sq;
391 		}
392 
393 		for (unsigned int j = 0; j < partition_texel_count; j++)
394 		{
395 			unsigned int tix = pi.texels_of_partition[i][j];
396 			float idx = (ei.weights[tix] - lowparam) * scale;
397 			idx = astc::clamp1f(idx);
398 
399 			ei.weights[tix] = idx;
400 			ei.weight_error_scale[tix] = length_squared * error_weight;
401 			assert(!astc::isnan(ei.weight_error_scale[tix]));
402 		}
403 
404 		vfloat4 ep0 = line.a + line.b * lowparam;
405 		vfloat4 ep1 = line.a + line.b * highparam;
406 
407 		vfloat4 bmin = blk.data_min;
408 		vfloat4 bmax = blk.data_max;
409 
410 		assert(omitted_component < BLOCK_MAX_COMPONENTS);
411 		switch (omitted_component)
412 		{
413 			case 0:
414 				ei.ep.endpt0[i] = vfloat4(bmin.lane<0>(), ep0.lane<0>(), ep0.lane<1>(), ep0.lane<2>());
415 				ei.ep.endpt1[i] = vfloat4(bmax.lane<0>(), ep1.lane<0>(), ep1.lane<1>(), ep1.lane<2>());
416 				break;
417 			case 1:
418 				ei.ep.endpt0[i] = vfloat4(ep0.lane<0>(), bmin.lane<1>(), ep0.lane<1>(), ep0.lane<2>());
419 				ei.ep.endpt1[i] = vfloat4(ep1.lane<0>(), bmax.lane<1>(), ep1.lane<1>(), ep1.lane<2>());
420 				break;
421 			case 2:
422 				ei.ep.endpt0[i] = vfloat4(ep0.lane<0>(), ep0.lane<1>(), bmin.lane<2>(), ep0.lane<2>());
423 				ei.ep.endpt1[i] = vfloat4(ep1.lane<0>(), ep1.lane<1>(), bmax.lane<2>(), ep1.lane<2>());
424 				break;
425 			default:
426 				ei.ep.endpt0[i] = vfloat4(ep0.lane<0>(), ep0.lane<1>(), ep0.lane<2>(), bmin.lane<3>());
427 				ei.ep.endpt1[i] = vfloat4(ep1.lane<0>(), ep1.lane<1>(), ep1.lane<2>(), bmax.lane<3>());
428 				break;
429 		}
430 	}
431 
432 	// Zero initialize any SIMD over-fetch
433 	unsigned int texel_count_simd = round_up_to_simd_multiple_vla(texel_count);
434 	for (unsigned int i = texel_count; i < texel_count_simd; i++)
435 	{
436 		ei.weights[i] = 0.0f;
437 		ei.weight_error_scale[i] = 0.0f;
438 	}
439 
440 	ei.is_constant_weight_error_scale = is_constant_wes;
441 }
442 
443 /**
444  * @brief Compute the ideal endpoints and weights for 4 color components.
445  *
446  * @param      blk   The image block color data to compress.
447  * @param      pi    The partition info for the current trial.
448  * @param[out] ei    The computed ideal endpoints and weights.
449  */
compute_ideal_colors_and_weights_4_comp(const image_block & blk,const partition_info & pi,endpoints_and_weights & ei)450 static void compute_ideal_colors_and_weights_4_comp(
451 	const image_block& blk,
452 	const partition_info& pi,
453 	endpoints_and_weights& ei
454 ) {
455 	const float error_weight = hadd_s(blk.channel_weight) / 4.0f;
456 
457 	unsigned int partition_count = pi.partition_count;
458 
459 	unsigned int texel_count = blk.texel_count;
460 	promise(texel_count > 0);
461 	promise(partition_count > 0);
462 
463 	partition_metrics pms[BLOCK_MAX_PARTITIONS];
464 
465 	compute_avgs_and_dirs_4_comp(pi, blk, pms);
466 
467 	bool is_constant_wes { true };
468 	float partition0_len_sq { 0.0f };
469 
470 	for (unsigned int i = 0; i < partition_count; i++)
471 	{
472 		vfloat4 dir = pms[i].dir;
473 		if (hadd_rgb_s(dir) < 0.0f)
474 		{
475 			dir = vfloat4::zero() - dir;
476 		}
477 
478 		line4 line { pms[i].avg, normalize_safe(dir, unit4()) };
479 		float lowparam { 1e10f };
480 		float highparam { -1e10f };
481 
482 		unsigned int partition_texel_count = pi.partition_texel_count[i];
483 		for (unsigned int j = 0; j < partition_texel_count; j++)
484 		{
485 			unsigned int tix = pi.texels_of_partition[i][j];
486 			vfloat4 point = blk.texel(tix);
487 			float param = dot_s(point - line.a, line.b);
488 			ei.weights[tix] = param;
489 
490 			lowparam = astc::min(param, lowparam);
491 			highparam = astc::max(param, highparam);
492 		}
493 
494 		// It is possible for a uniform-color partition to produce length=0;
495 		// this causes NaN issues so set to small value to avoid this problem
496 		if (highparam < lowparam)
497 		{
498 			lowparam = 0.0f;
499 			highparam = 1e-7f;
500 		}
501 
502 		float length = highparam - lowparam;
503 		float length_squared = length * length;
504 		float scale = 1.0f / length;
505 
506 		if (i == 0)
507 		{
508 			partition0_len_sq = length_squared;
509 		}
510 		else
511 		{
512 			is_constant_wes = is_constant_wes && length_squared == partition0_len_sq;
513 		}
514 
515 		ei.ep.endpt0[i] = line.a + line.b * lowparam;
516 		ei.ep.endpt1[i] = line.a + line.b * highparam;
517 
518 		for (unsigned int j = 0; j < partition_texel_count; j++)
519 		{
520 			unsigned int tix = pi.texels_of_partition[i][j];
521 			float idx = (ei.weights[tix] - lowparam) * scale;
522 			idx = astc::clamp1f(idx);
523 
524 			ei.weights[tix] = idx;
525 			ei.weight_error_scale[tix] = length_squared * error_weight;
526 			assert(!astc::isnan(ei.weight_error_scale[tix]));
527 		}
528 	}
529 
530 	// Zero initialize any SIMD over-fetch
531 	unsigned int texel_count_simd = round_up_to_simd_multiple_vla(texel_count);
532 	for (unsigned int i = texel_count; i < texel_count_simd; i++)
533 	{
534 		ei.weights[i] = 0.0f;
535 		ei.weight_error_scale[i] = 0.0f;
536 	}
537 
538 	ei.is_constant_weight_error_scale = is_constant_wes;
539 }
540 
541 /* See header for documentation. */
compute_ideal_colors_and_weights_1plane(const image_block & blk,const partition_info & pi,endpoints_and_weights & ei)542 void compute_ideal_colors_and_weights_1plane(
543 	const image_block& blk,
544 	const partition_info& pi,
545 	endpoints_and_weights& ei
546 ) {
547 	bool uses_alpha = !blk.is_constant_channel(3);
548 
549 	if (uses_alpha)
550 	{
551 		compute_ideal_colors_and_weights_4_comp(blk, pi, ei);
552 	}
553 	else
554 	{
555 		compute_ideal_colors_and_weights_3_comp(blk, pi, ei, 3);
556 	}
557 }
558 
559 /* See header for documentation. */
compute_ideal_colors_and_weights_2planes(const block_size_descriptor & bsd,const image_block & blk,unsigned int plane2_component,endpoints_and_weights & ei1,endpoints_and_weights & ei2)560 void compute_ideal_colors_and_weights_2planes(
561 	const block_size_descriptor& bsd,
562 	const image_block& blk,
563 	unsigned int plane2_component,
564 	endpoints_and_weights& ei1,
565 	endpoints_and_weights& ei2
566 ) {
567 	const auto& pi = bsd.get_partition_info(1, 0);
568 	bool uses_alpha = !blk.is_constant_channel(3);
569 
570 	assert(plane2_component < BLOCK_MAX_COMPONENTS);
571 	switch (plane2_component)
572 	{
573 	case 0: // Separate weights for red
574 		if (uses_alpha)
575 		{
576 			compute_ideal_colors_and_weights_3_comp(blk, pi, ei1, 0);
577 		}
578 		else
579 		{
580 			compute_ideal_colors_and_weights_2_comp(blk, pi, ei1, 1, 2);
581 		}
582 		compute_ideal_colors_and_weights_1_comp(blk, pi, ei2, 0);
583 		break;
584 
585 	case 1: // Separate weights for green
586 		if (uses_alpha)
587 		{
588 			compute_ideal_colors_and_weights_3_comp(blk, pi, ei1, 1);
589 		}
590 		else
591 		{
592 			compute_ideal_colors_and_weights_2_comp(blk, pi, ei1, 0, 2);
593 		}
594 		compute_ideal_colors_and_weights_1_comp(blk, pi, ei2, 1);
595 		break;
596 
597 	case 2: // Separate weights for blue
598 		if (uses_alpha)
599 		{
600 			compute_ideal_colors_and_weights_3_comp(blk, pi, ei1, 2);
601 		}
602 		else
603 		{
604 			compute_ideal_colors_and_weights_2_comp(blk, pi, ei1, 0, 1);
605 		}
606 		compute_ideal_colors_and_weights_1_comp(blk, pi, ei2, 2);
607 		break;
608 
609 	default: // Separate weights for alpha
610 		assert(uses_alpha);
611 		compute_ideal_colors_and_weights_3_comp(blk, pi, ei1, 3);
612 		compute_ideal_colors_and_weights_1_comp(blk, pi, ei2, 3);
613 		break;
614 	}
615 }
616 
617 /* See header for documentation. */
compute_error_of_weight_set_1plane(const endpoints_and_weights & eai,const decimation_info & di,const float * dec_weight_quant_uvalue)618 float compute_error_of_weight_set_1plane(
619 	const endpoints_and_weights& eai,
620 	const decimation_info& di,
621 	const float* dec_weight_quant_uvalue
622 ) {
623 	vfloatacc error_summav = vfloatacc::zero();
624 	float error_summa = 0.0f;
625 	unsigned int texel_count = di.texel_count;
626 
627 	// Process SIMD-width chunks, safe to over-fetch - the extra space is zero initialized
628 	if (di.max_texel_weight_count > 2)
629 	{
630 		for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
631 		{
632 			// Compute the bilinear interpolation of the decimated weight grid
633 			vfloat current_values = bilinear_infill_vla(di, dec_weight_quant_uvalue, i);
634 
635 			// Compute the error between the computed value and the ideal weight
636 			vfloat actual_values = loada(eai.weights + i);
637 			vfloat diff = current_values - actual_values;
638 			vfloat significance = loada(eai.weight_error_scale + i);
639 			vfloat error = diff * diff * significance;
640 
641 			haccumulate(error_summav, error);
642 		}
643 	}
644 	else if (di.max_texel_weight_count > 1)
645 	{
646 		for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
647 		{
648 			// Compute the bilinear interpolation of the decimated weight grid
649 			vfloat current_values = bilinear_infill_vla_2(di, dec_weight_quant_uvalue, i);
650 
651 			// Compute the error between the computed value and the ideal weight
652 			vfloat actual_values = loada(eai.weights + i);
653 			vfloat diff = current_values - actual_values;
654 			vfloat significance = loada(eai.weight_error_scale + i);
655 			vfloat error = diff * diff * significance;
656 
657 			haccumulate(error_summav, error);
658 		}
659 	}
660 	else
661 	{
662 		for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
663 		{
664 			// Load the weight set directly, without interpolation
665 			vfloat current_values = loada(dec_weight_quant_uvalue + i);
666 
667 			// Compute the error between the computed value and the ideal weight
668 			vfloat actual_values = loada(eai.weights + i);
669 			vfloat diff = current_values - actual_values;
670 			vfloat significance = loada(eai.weight_error_scale + i);
671 			vfloat error = diff * diff * significance;
672 
673 			haccumulate(error_summav, error);
674 		}
675 	}
676 
677 	// Resolve the final scalar accumulator sum
678 	return error_summa = hadd_s(error_summav);
679 }
680 
681 /* See header for documentation. */
compute_error_of_weight_set_2planes(const endpoints_and_weights & eai1,const endpoints_and_weights & eai2,const decimation_info & di,const float * dec_weight_quant_uvalue_plane1,const float * dec_weight_quant_uvalue_plane2)682 float compute_error_of_weight_set_2planes(
683 	const endpoints_and_weights& eai1,
684 	const endpoints_and_weights& eai2,
685 	const decimation_info& di,
686 	const float* dec_weight_quant_uvalue_plane1,
687 	const float* dec_weight_quant_uvalue_plane2
688 ) {
689 	vfloatacc error_summav = vfloatacc::zero();
690 	unsigned int texel_count = di.texel_count;
691 
692 	// Process SIMD-width chunks, safe to over-fetch - the extra space is zero initialized
693 	if (di.max_texel_weight_count > 2)
694 	{
695 		for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
696 		{
697 			// Plane 1
698 			// Compute the bilinear interpolation of the decimated weight grid
699 			vfloat current_values1 = bilinear_infill_vla(di, dec_weight_quant_uvalue_plane1, i);
700 
701 			// Compute the error between the computed value and the ideal weight
702 			vfloat actual_values1 = loada(eai1.weights + i);
703 			vfloat diff = current_values1 - actual_values1;
704 			vfloat error1 = diff * diff * loada(eai1.weight_error_scale + i);
705 
706 			// Plane 2
707 			// Compute the bilinear interpolation of the decimated weight grid
708 			vfloat current_values2 = bilinear_infill_vla(di, dec_weight_quant_uvalue_plane2, i);
709 
710 			// Compute the error between the computed value and the ideal weight
711 			vfloat actual_values2 = loada(eai2.weights + i);
712 			diff = current_values2 - actual_values2;
713 			vfloat error2 = diff * diff * loada(eai2.weight_error_scale + i);
714 
715 			haccumulate(error_summav, error1 + error2);
716 		}
717 	}
718 	else if (di.max_texel_weight_count > 1)
719 	{
720 		for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
721 		{
722 			// Plane 1
723 			// Compute the bilinear interpolation of the decimated weight grid
724 			vfloat current_values1 = bilinear_infill_vla_2(di, dec_weight_quant_uvalue_plane1, i);
725 
726 			// Compute the error between the computed value and the ideal weight
727 			vfloat actual_values1 = loada(eai1.weights + i);
728 			vfloat diff = current_values1 - actual_values1;
729 			vfloat error1 = diff * diff * loada(eai1.weight_error_scale + i);
730 
731 			// Plane 2
732 			// Compute the bilinear interpolation of the decimated weight grid
733 			vfloat current_values2 = bilinear_infill_vla_2(di, dec_weight_quant_uvalue_plane2, i);
734 
735 			// Compute the error between the computed value and the ideal weight
736 			vfloat actual_values2 = loada(eai2.weights + i);
737 			diff = current_values2 - actual_values2;
738 			vfloat error2 = diff * diff * loada(eai2.weight_error_scale + i);
739 
740 			haccumulate(error_summav, error1 + error2);
741 		}
742 	}
743 	else
744 	{
745 		for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
746 		{
747 			// Plane 1
748 			// Load the weight set directly, without interpolation
749 			vfloat current_values1 = loada(dec_weight_quant_uvalue_plane1 + i);
750 
751 			// Compute the error between the computed value and the ideal weight
752 			vfloat actual_values1 = loada(eai1.weights + i);
753 			vfloat diff = current_values1 - actual_values1;
754 			vfloat error1 = diff * diff * loada(eai1.weight_error_scale + i);
755 
756 			// Plane 2
757 			// Load the weight set directly, without interpolation
758 			vfloat current_values2 = loada(dec_weight_quant_uvalue_plane2 + i);
759 
760 			// Compute the error between the computed value and the ideal weight
761 			vfloat actual_values2 = loada(eai2.weights + i);
762 			diff = current_values2 - actual_values2;
763 			vfloat error2 = diff * diff * loada(eai2.weight_error_scale + i);
764 
765 			haccumulate(error_summav, error1 + error2);
766 		}
767 	}
768 
769 	// Resolve the final scalar accumulator sum
770 	return hadd_s(error_summav);
771 }
772 
773 /* See header for documentation. */
compute_ideal_weights_for_decimation(const endpoints_and_weights & eai_in,endpoints_and_weights & eai_out,const decimation_info & di,float * dec_weight_ideal_value)774 void compute_ideal_weights_for_decimation(
775 	const endpoints_and_weights& eai_in,
776 	endpoints_and_weights& eai_out,
777 	const decimation_info& di,
778 	float* dec_weight_ideal_value
779 ) {
780 	unsigned int texel_count = di.texel_count;
781 	unsigned int weight_count = di.weight_count;
782 	bool is_direct = texel_count == weight_count;
783 	promise(texel_count > 0);
784 	promise(weight_count > 0);
785 
786 	// This function includes a copy of the epw from eai_in to eai_out. We do it here because we
787 	// want to load the data anyway, so we can avoid loading it from memory twice.
788 	eai_out.ep = eai_in.ep;
789 	eai_out.is_constant_weight_error_scale = eai_in.is_constant_weight_error_scale;
790 
791 	// Ensure that the end of the output arrays that are used for SIMD paths later are filled so we
792 	// can safely run SIMD elsewhere without a loop tail. Note that this is always safe as weight
793 	// arrays always contain space for 64 elements
794 	unsigned int prev_weight_count_simd = round_down_to_simd_multiple_vla(weight_count - 1);
795 	storea(vfloat::zero(), dec_weight_ideal_value + prev_weight_count_simd);
796 
797 	// If we have a 1:1 mapping just shortcut the computation - clone the weights into both the
798 	// weight set and the output epw copy.
799 
800 	// Transfer enough to also copy zero initialized SIMD over-fetch region
801 	unsigned int texel_count_simd = round_up_to_simd_multiple_vla(texel_count);
802 	for (unsigned int i = 0; i < texel_count_simd; i += ASTCENC_SIMD_WIDTH)
803 	{
804 		vfloat weight(eai_in.weights + i);
805 		vfloat weight_error_scale(eai_in.weight_error_scale + i);
806 
807 		storea(weight, eai_out.weights + i);
808 		storea(weight_error_scale, eai_out.weight_error_scale + i);
809 
810 		// Direct 1:1 weight mapping, so clone weights directly
811 		// TODO: Can we just avoid the copy for direct cases?
812 		if (is_direct)
813 		{
814 			storea(weight, dec_weight_ideal_value + i);
815 		}
816 	}
817 
818 	if (is_direct)
819 	{
820 		return;
821 	}
822 
823 	// Otherwise compute an estimate and perform single refinement iteration
824 	alignas(ASTCENC_VECALIGN) float infilled_weights[BLOCK_MAX_TEXELS];
825 
826 	// Compute an initial average for each decimated weight
827 	bool constant_wes = eai_in.is_constant_weight_error_scale;
828 	vfloat weight_error_scale(eai_in.weight_error_scale[0]);
829 
830 	// This overshoots - this is OK as we initialize the array tails in the
831 	// decimation table structures to safe values ...
832 	for (unsigned int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH)
833 	{
834 		// Start with a small value to avoid div-by-zero later
835 		vfloat weight_weight(1e-10f);
836 		vfloat initial_weight = vfloat::zero();
837 
838 		// Accumulate error weighting of all the texels using this weight
839 		vint weight_texel_count(di.weight_texel_count + i);
840 		unsigned int max_texel_count = hmax(weight_texel_count).lane<0>();
841 		promise(max_texel_count > 0);
842 
843 		for (unsigned int j = 0; j < max_texel_count; j++)
844 		{
845 			vint texel(di.weight_texel[j] + i);
846 			vfloat weight = loada(di.weights_flt[j] + i);
847 
848 			if (!constant_wes)
849 			{
850 				weight_error_scale = gatherf(eai_in.weight_error_scale, texel);
851 			}
852 
853 			vfloat contrib_weight = weight * weight_error_scale;
854 
855 			weight_weight += contrib_weight;
856 			initial_weight += gatherf(eai_in.weights, texel) * contrib_weight;
857 		}
858 
859 		storea(initial_weight / weight_weight, dec_weight_ideal_value + i);
860 	}
861 
862 	// Populate the interpolated weight grid based on the initital average
863 	// Process SIMD-width texel coordinates at at time while we can. Safe to
864 	// over-process full SIMD vectors - the tail is zeroed.
865 	if (di.max_texel_weight_count <= 2)
866 	{
867 		for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
868 		{
869 			vfloat weight = bilinear_infill_vla_2(di, dec_weight_ideal_value, i);
870 			storea(weight, infilled_weights + i);
871 		}
872 	}
873 	else
874 	{
875 		for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
876 		{
877 			vfloat weight = bilinear_infill_vla(di, dec_weight_ideal_value, i);
878 			storea(weight, infilled_weights + i);
879 		}
880 	}
881 
882 	// Perform a single iteration of refinement
883 	// Empirically determined step size; larger values don't help but smaller drops image quality
884 	constexpr float stepsize = 0.25f;
885 	constexpr float chd_scale = -WEIGHTS_TEXEL_SUM;
886 
887 	for (unsigned int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH)
888 	{
889 		vfloat weight_val = loada(dec_weight_ideal_value + i);
890 
891 		// Accumulate error weighting of all the texels using this weight
892 		// Start with a small value to avoid div-by-zero later
893 		vfloat error_change0(1e-10f);
894 		vfloat error_change1(0.0f);
895 
896 		// Accumulate error weighting of all the texels using this weight
897 		vint weight_texel_count(di.weight_texel_count + i);
898 		unsigned int max_texel_count = hmax(weight_texel_count).lane<0>();
899 		promise(max_texel_count > 0);
900 
901 		for (unsigned int j = 0; j < max_texel_count; j++)
902 		{
903 			vint texel(di.weight_texel[j] + i);
904 			vfloat contrib_weight = loada(di.weights_flt[j] + i);
905 
906 			if (!constant_wes)
907 			{
908  				weight_error_scale = gatherf(eai_in.weight_error_scale, texel);
909 			}
910 
911 			vfloat scale = weight_error_scale * contrib_weight;
912 			vfloat old_weight = gatherf(infilled_weights, texel);
913 			vfloat ideal_weight = gatherf(eai_in.weights, texel);
914 
915 			error_change0 += contrib_weight * scale;
916 			error_change1 += (old_weight - ideal_weight) * scale;
917 		}
918 
919 		vfloat step = (error_change1 * chd_scale) / error_change0;
920 		step = clamp(-stepsize, stepsize, step);
921 
922 		// Update the weight; note this can store negative values.
923 		storea(weight_val + step, dec_weight_ideal_value + i);
924 	}
925 }
926 
927 /* See header for documentation. */
compute_quantized_weights_for_decimation(const decimation_info & di,float low_bound,float high_bound,const float * dec_weight_ideal_value,float * weight_set_out,uint8_t * quantized_weight_set,quant_method quant_level)928 void compute_quantized_weights_for_decimation(
929 	const decimation_info& di,
930 	float low_bound,
931 	float high_bound,
932 	const float* dec_weight_ideal_value,
933 	float* weight_set_out,
934 	uint8_t* quantized_weight_set,
935 	quant_method quant_level
936 ) {
937 	int weight_count = di.weight_count;
938 	promise(weight_count > 0);
939 	const quantization_and_transfer_table *qat = &(quant_and_xfer_tables[quant_level]);
940 
941 	// The available quant levels, stored with a minus 1 bias
942 	static const float quant_levels_m1[12] {
943 		1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 7.0f, 9.0f, 11.0f, 15.0f, 19.0f, 23.0f, 31.0f
944 	};
945 
946 	float quant_level_m1 = quant_levels_m1[quant_level];
947 
948 	// Quantize the weight set using both the specified low/high bounds and standard 0..1 bounds
949 
950 	// TODO: Oddity to investigate; triggered by test in issue #265.
951 	if (high_bound < low_bound)
952 	{
953 		low_bound = 0.0f;
954 		high_bound = 1.0f;
955 	}
956 
957 	float rscale = high_bound - low_bound;
958 	float scale = 1.0f / rscale;
959 
960 	float scaled_low_bound = low_bound * scale;
961 	rscale *= 1.0f / 64.0f;
962 
963 	vfloat scalev(scale);
964 	vfloat scaled_low_boundv(scaled_low_bound);
965 	vfloat quant_level_m1v(quant_level_m1);
966 	vfloat rscalev(rscale);
967 	vfloat low_boundv(low_bound);
968 
969 	// This runs to the rounded-up SIMD size, which is safe as the loop tail is filled with known
970 	// safe data in compute_ideal_weights_for_decimation and arrays are always 64 elements
971 	for (int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH)
972 	{
973 		vfloat ix = loada(&dec_weight_ideal_value[i]) * scalev - scaled_low_boundv;
974 		ix = clampzo(ix);
975 
976 		// Look up the two closest indexes and return the one that was closest
977 		vfloat ix1 = ix * quant_level_m1v;
978 
979 		vint weightl = float_to_int(ix1);
980 		vint weighth = weightl + vint(1);
981 
982 		vfloat ixl = gatherf(qat->unquantized_value_unsc, weightl);
983 		vfloat ixh = gatherf(qat->unquantized_value_unsc, weighth);
984 
985 		vmask mask = (ixl + ixh) < (vfloat(128.0f) * ix);
986 		vint weight = select(weightl, weighth, mask);
987 		ixl = select(ixl, ixh, mask);
988 
989 		// Invert the weight-scaling that was done initially
990 		storea(ixl * rscalev + low_boundv, &weight_set_out[i]);
991 		vint scm = gatheri(qat->scramble_map, weight);
992 		vint scn = pack_low_bytes(scm);
993 		store_nbytes(scn, &quantized_weight_set[i]);
994 	}
995 }
996 
997 /**
998  * @brief Compute the RGB + offset for a HDR endpoint mode #7.
999  *
1000  * Since the matrix needed has a regular structure we can simplify the inverse calculation. This
1001  * gives us ~24 multiplications vs. 96 for a generic inverse.
1002  *
1003  *  mat[0] = vfloat4(rgba_ws.x,      0.0f,      0.0f, wght_ws.x);
1004  *  mat[1] = vfloat4(     0.0f, rgba_ws.y,      0.0f, wght_ws.y);
1005  *  mat[2] = vfloat4(     0.0f,      0.0f, rgba_ws.z, wght_ws.z);
1006  *  mat[3] = vfloat4(wght_ws.x, wght_ws.y, wght_ws.z,      psum);
1007  *  mat = invert(mat);
1008  *
1009  * @param rgba_weight_sum     Sum of partition component error weights.
1010  * @param weight_weight_sum   Sum of partition component error weights * texel weight.
1011  * @param rgbq_sum            Sum of partition component error weights * texel weight * color data.
1012  * @param psum                Sum of RGB color weights * texel weight^2.
1013  */
compute_rgbo_vector(vfloat4 rgba_weight_sum,vfloat4 weight_weight_sum,vfloat4 rgbq_sum,float psum)1014 static inline vfloat4 compute_rgbo_vector(
1015 	vfloat4 rgba_weight_sum,
1016 	vfloat4 weight_weight_sum,
1017 	vfloat4 rgbq_sum,
1018 	float psum
1019 ) {
1020 	float X = rgba_weight_sum.lane<0>();
1021 	float Y = rgba_weight_sum.lane<1>();
1022 	float Z = rgba_weight_sum.lane<2>();
1023 	float P = weight_weight_sum.lane<0>();
1024 	float Q = weight_weight_sum.lane<1>();
1025 	float R = weight_weight_sum.lane<2>();
1026 	float S = psum;
1027 
1028 	float PP = P * P;
1029 	float QQ = Q * Q;
1030 	float RR = R * R;
1031 
1032 	float SZmRR = S * Z - RR;
1033 	float DT = SZmRR * Y - Z * QQ;
1034 	float YP = Y * P;
1035 	float QX = Q * X;
1036 	float YX = Y * X;
1037 	float mZYP = -Z * YP;
1038 	float mZQX = -Z * QX;
1039 	float mRYX = -R * YX;
1040 	float ZQP = Z * Q * P;
1041 	float RYP = R * YP;
1042 	float RQX = R * QX;
1043 
1044 	// Compute the reciprocal of matrix determinant
1045 	float rdet = 1.0f / (DT * X + mZYP * P);
1046 
1047 	// Actually compute the adjugate, and then apply 1/det separately
1048 	vfloat4 mat0(DT, ZQP, RYP, mZYP);
1049 	vfloat4 mat1(ZQP, SZmRR * X - Z * PP, RQX, mZQX);
1050 	vfloat4 mat2(RYP, RQX, (S * Y - QQ) * X - Y * PP, mRYX);
1051 	vfloat4 mat3(mZYP, mZQX, mRYX, Z * YX);
1052 	vfloat4 vect = rgbq_sum * rdet;
1053 
1054 	return vfloat4(dot_s(mat0, vect),
1055 	               dot_s(mat1, vect),
1056 	               dot_s(mat2, vect),
1057 	               dot_s(mat3, vect));
1058 }
1059 
1060 /* See header for documentation. */
recompute_ideal_colors_1plane(const image_block & blk,const partition_info & pi,const decimation_info & di,int weight_quant_mode,const uint8_t * dec_weights_quant_pvalue,endpoints & ep,vfloat4 rgbs_vectors[BLOCK_MAX_PARTITIONS],vfloat4 rgbo_vectors[BLOCK_MAX_PARTITIONS])1061 void recompute_ideal_colors_1plane(
1062 	const image_block& blk,
1063 	const partition_info& pi,
1064 	const decimation_info& di,
1065 	int weight_quant_mode,
1066 	const uint8_t* dec_weights_quant_pvalue,
1067 	endpoints& ep,
1068 	vfloat4 rgbs_vectors[BLOCK_MAX_PARTITIONS],
1069 	vfloat4 rgbo_vectors[BLOCK_MAX_PARTITIONS]
1070 ) {
1071 	unsigned int weight_count = di.weight_count;
1072 	unsigned int total_texel_count = blk.texel_count;
1073 	unsigned int partition_count = pi.partition_count;
1074 
1075 	promise(weight_count > 0);
1076 	promise(total_texel_count > 0);
1077 	promise(partition_count > 0);
1078 
1079 	const quantization_and_transfer_table& qat = quant_and_xfer_tables[weight_quant_mode];
1080 
1081 	float dec_weight[BLOCK_MAX_WEIGHTS];
1082 	for (unsigned int i = 0; i < weight_count; i++)
1083 	{
1084 		dec_weight[i] = qat.unquantized_value[dec_weights_quant_pvalue[i]] * (1.0f / 64.0f);
1085 	}
1086 
1087 	alignas(ASTCENC_VECALIGN) float undec_weight[BLOCK_MAX_TEXELS];
1088 	float* undec_weight_ref;
1089 	if (di.max_texel_weight_count == 1)
1090 	{
1091 		undec_weight_ref = dec_weight;
1092 	}
1093 	else if (di.max_texel_weight_count <= 2)
1094 	{
1095 		for (unsigned int i = 0; i < total_texel_count; i += ASTCENC_SIMD_WIDTH)
1096 		{
1097 			vfloat weight = bilinear_infill_vla_2(di, dec_weight, i);
1098 			storea(weight, undec_weight + i);
1099 		}
1100 
1101 		undec_weight_ref = undec_weight;
1102 	}
1103 	else
1104 	{
1105 		for (unsigned int i = 0; i < total_texel_count; i += ASTCENC_SIMD_WIDTH)
1106 		{
1107 			vfloat weight = bilinear_infill_vla(di, dec_weight, i);
1108 			storea(weight, undec_weight + i);
1109 		}
1110 
1111 		undec_weight_ref = undec_weight;
1112 	}
1113 
1114 	vfloat4 rgba_sum(blk.data_mean * static_cast<float>(blk.texel_count));
1115 
1116 	for (unsigned int i = 0; i < partition_count; i++)
1117 	{
1118 		unsigned int texel_count = pi.partition_texel_count[i];
1119 		const uint8_t *texel_indexes = pi.texels_of_partition[i];
1120 
1121 		// Only compute a partition mean if more than one partition
1122 		if (partition_count > 1)
1123 		{
1124 			rgba_sum = vfloat4(1e-17f);
1125 			promise(texel_count > 0);
1126 			for (unsigned int j = 0; j < texel_count; j++)
1127 			{
1128 				unsigned int tix = texel_indexes[j];
1129 				rgba_sum += blk.texel(tix);
1130 			}
1131 		}
1132 
1133 		rgba_sum = rgba_sum * blk.channel_weight;
1134 		vfloat4 rgba_weight_sum = max(blk.channel_weight * static_cast<float>(texel_count), 1e-17f);
1135 		vfloat4 scale_dir = normalize((rgba_sum / rgba_weight_sum).swz<0, 1, 2>());
1136 
1137 		float scale_max = 0.0f;
1138 		float scale_min = 1e10f;
1139 
1140 		float wmin1 = 1.0f;
1141 		float wmax1 = 0.0f;
1142 
1143 		float left_sum_s = 0.0f;
1144 		float middle_sum_s = 0.0f;
1145 		float right_sum_s = 0.0f;
1146 
1147 		vfloat4 color_vec_x = vfloat4::zero();
1148 		vfloat4 color_vec_y = vfloat4::zero();
1149 
1150 		vfloat4 scale_vec = vfloat4::zero();
1151 
1152 		float weight_weight_sum_s = 1e-17f;
1153 
1154 		vfloat4 color_weight = blk.channel_weight;
1155 		float ls_weight = hadd_rgb_s(color_weight);
1156 
1157 		for (unsigned int j = 0; j < texel_count; j++)
1158 		{
1159 			unsigned int tix = texel_indexes[j];
1160 
1161 			vfloat4 rgba = blk.texel(tix);
1162 
1163 			float idx0 = undec_weight_ref[tix];
1164 
1165 			float om_idx0 = 1.0f - idx0;
1166 			wmin1 = astc::min(idx0, wmin1);
1167 			wmax1 = astc::max(idx0, wmax1);
1168 
1169 			float scale = dot3_s(scale_dir, rgba);
1170 			scale_min = astc::min(scale, scale_min);
1171 			scale_max = astc::max(scale, scale_max);
1172 
1173 			left_sum_s   += om_idx0 * om_idx0;
1174 			middle_sum_s += om_idx0 * idx0;
1175 			right_sum_s  += idx0 * idx0;
1176 			weight_weight_sum_s += idx0;
1177 
1178 			vfloat4 color_idx(idx0);
1179 			vfloat4 cwprod = rgba;
1180 			vfloat4 cwiprod = cwprod * color_idx;
1181 
1182 			color_vec_y += cwiprod;
1183 			color_vec_x += cwprod - cwiprod;
1184 
1185 			scale_vec += vfloat2(om_idx0, idx0) * (scale * ls_weight);
1186 		}
1187 
1188 		vfloat4 left_sum   = vfloat4(left_sum_s) * color_weight;
1189 		vfloat4 middle_sum = vfloat4(middle_sum_s) * color_weight;
1190 		vfloat4 right_sum  = vfloat4(right_sum_s) * color_weight;
1191 		vfloat4 lmrs_sum   = vfloat3(left_sum_s, middle_sum_s, right_sum_s) * ls_weight;
1192 
1193 		vfloat4 weight_weight_sum = vfloat4(weight_weight_sum_s) * color_weight;
1194 		float psum = right_sum_s * hadd_rgb_s(color_weight);
1195 
1196 		color_vec_x = color_vec_x * color_weight;
1197 		color_vec_y = color_vec_y * color_weight;
1198 
1199 		// Initialize the luminance and scale vectors with a reasonable default
1200 		float scalediv = scale_min * (1.0f / astc::max(scale_max, 1e-10f));
1201 		scalediv = astc::clamp1f(scalediv);
1202 
1203 		vfloat4 sds = scale_dir * scale_max;
1204 
1205 		rgbs_vectors[i] = vfloat4(sds.lane<0>(), sds.lane<1>(), sds.lane<2>(), scalediv);
1206 
1207 		if (wmin1 >= wmax1 * 0.999f)
1208 		{
1209 			// If all weights in the partition were equal, then just take average of all colors in
1210 			// the partition and use that as both endpoint colors
1211 			vfloat4 avg = (color_vec_x + color_vec_y) / rgba_weight_sum;
1212 
1213 			vmask4 notnan_mask = avg == avg;
1214 			ep.endpt0[i] = select(ep.endpt0[i], avg, notnan_mask);
1215 			ep.endpt1[i] = select(ep.endpt1[i], avg, notnan_mask);
1216 
1217 			rgbs_vectors[i] = vfloat4(sds.lane<0>(), sds.lane<1>(), sds.lane<2>(), 1.0f);
1218 		}
1219 		else
1220 		{
1221 			// Otherwise, complete the analytic calculation of ideal-endpoint-values for the given
1222 			// set of texel weights and pixel colors
1223 			vfloat4 color_det1 = (left_sum * right_sum) - (middle_sum * middle_sum);
1224 			vfloat4 color_rdet1 = 1.0f / color_det1;
1225 
1226 			float ls_det1  = (lmrs_sum.lane<0>() * lmrs_sum.lane<2>()) - (lmrs_sum.lane<1>() * lmrs_sum.lane<1>());
1227 			float ls_rdet1 = 1.0f / ls_det1;
1228 
1229 			vfloat4 color_mss1 = (left_sum * left_sum)
1230 			                   + (2.0f * middle_sum * middle_sum)
1231 			                   + (right_sum * right_sum);
1232 
1233 			float ls_mss1 = (lmrs_sum.lane<0>() * lmrs_sum.lane<0>())
1234 			              + (2.0f * lmrs_sum.lane<1>() * lmrs_sum.lane<1>())
1235 			              + (lmrs_sum.lane<2>() * lmrs_sum.lane<2>());
1236 
1237 			vfloat4 ep0 = (right_sum * color_vec_x - middle_sum * color_vec_y) * color_rdet1;
1238 			vfloat4 ep1 = (left_sum * color_vec_y - middle_sum * color_vec_x) * color_rdet1;
1239 
1240 			vmask4 det_mask = abs(color_det1) > (color_mss1 * 1e-4f);
1241 			vmask4 notnan_mask = (ep0 == ep0) & (ep1 == ep1);
1242 			vmask4 full_mask = det_mask & notnan_mask;
1243 
1244 			ep.endpt0[i] = select(ep.endpt0[i], ep0, full_mask);
1245 			ep.endpt1[i] = select(ep.endpt1[i], ep1, full_mask);
1246 
1247 			float scale_ep0 = (lmrs_sum.lane<2>() * scale_vec.lane<0>() - lmrs_sum.lane<1>() * scale_vec.lane<1>()) * ls_rdet1;
1248 			float scale_ep1 = (lmrs_sum.lane<0>() * scale_vec.lane<1>() - lmrs_sum.lane<1>() * scale_vec.lane<0>()) * ls_rdet1;
1249 
1250 			if (fabsf(ls_det1) > (ls_mss1 * 1e-4f) && scale_ep0 == scale_ep0 && scale_ep1 == scale_ep1 && scale_ep0 < scale_ep1)
1251 			{
1252 				float scalediv2 = scale_ep0 * (1.0f / scale_ep1);
1253 				vfloat4 sdsm = scale_dir * scale_ep1;
1254 				rgbs_vectors[i] = vfloat4(sdsm.lane<0>(), sdsm.lane<1>(), sdsm.lane<2>(), scalediv2);
1255 			}
1256 		}
1257 
1258 		// Calculations specific to mode #7, the HDR RGB-scale mode
1259 		vfloat4 rgbq_sum = color_vec_x + color_vec_y;
1260 		rgbq_sum.set_lane<3>(hadd_rgb_s(color_vec_y));
1261 
1262 		vfloat4 rgbovec = compute_rgbo_vector(rgba_weight_sum, weight_weight_sum, rgbq_sum, psum);
1263 		rgbo_vectors[i] = rgbovec;
1264 
1265 		// We can get a failure due to the use of a singular (non-invertible) matrix
1266 		// If it failed, compute rgbo_vectors[] with a different method ...
1267 		if (astc::isnan(dot_s(rgbovec, rgbovec)))
1268 		{
1269 			vfloat4 v0 = ep.endpt0[i];
1270 			vfloat4 v1 = ep.endpt1[i];
1271 
1272 			float avgdif = hadd_rgb_s(v1 - v0) * (1.0f / 3.0f);
1273 			avgdif = astc::max(avgdif, 0.0f);
1274 
1275 			vfloat4 avg = (v0 + v1) * 0.5f;
1276 			vfloat4 ep0 = avg - vfloat4(avgdif) * 0.5f;
1277 			rgbo_vectors[i] = vfloat4(ep0.lane<0>(), ep0.lane<1>(), ep0.lane<2>(), avgdif);
1278 		}
1279 	}
1280 }
1281 
1282 /* See header for documentation. */
recompute_ideal_colors_2planes(const image_block & blk,const block_size_descriptor & bsd,const decimation_info & di,int weight_quant_mode,const uint8_t * dec_weights_quant_pvalue_plane1,const uint8_t * dec_weights_quant_pvalue_plane2,endpoints & ep,vfloat4 & rgbs_vector,vfloat4 & rgbo_vector,int plane2_component)1283 void recompute_ideal_colors_2planes(
1284 	const image_block& blk,
1285 	const block_size_descriptor& bsd,
1286 	const decimation_info& di,
1287 	int weight_quant_mode,
1288 	const uint8_t* dec_weights_quant_pvalue_plane1,
1289 	const uint8_t* dec_weights_quant_pvalue_plane2,
1290 	endpoints& ep,
1291 	vfloat4& rgbs_vector,
1292 	vfloat4& rgbo_vector,
1293 	int plane2_component
1294 ) {
1295 	unsigned int weight_count = di.weight_count;
1296 	unsigned int total_texel_count = blk.texel_count;
1297 
1298 	promise(total_texel_count > 0);
1299 	promise(weight_count > 0);
1300 
1301 	const quantization_and_transfer_table *qat = &(quant_and_xfer_tables[weight_quant_mode]);
1302 
1303 	float dec_weight_plane1[BLOCK_MAX_WEIGHTS_2PLANE];
1304 	float dec_weight_plane2[BLOCK_MAX_WEIGHTS_2PLANE];
1305 
1306 	assert(weight_count <= BLOCK_MAX_WEIGHTS_2PLANE);
1307 	for (unsigned int i = 0; i < weight_count; i++)
1308 	{
1309 		dec_weight_plane1[i] = qat->unquantized_value[dec_weights_quant_pvalue_plane1[i]] * (1.0f / 64.0f);
1310 		dec_weight_plane2[i] = qat->unquantized_value[dec_weights_quant_pvalue_plane2[i]] * (1.0f / 64.0f);
1311 	}
1312 
1313 	alignas(ASTCENC_VECALIGN) float undec_weight_plane1[BLOCK_MAX_TEXELS];
1314 	alignas(ASTCENC_VECALIGN) float undec_weight_plane2[BLOCK_MAX_TEXELS];
1315 
1316 	float* undec_weight_plane1_ref;
1317 	float* undec_weight_plane2_ref;
1318 
1319 	if (di.max_texel_weight_count == 1)
1320 	{
1321 		undec_weight_plane1_ref = dec_weight_plane1;
1322 		undec_weight_plane2_ref = dec_weight_plane2;
1323 	}
1324 	else if (di.max_texel_weight_count <= 2)
1325 	{
1326 		for (unsigned int i = 0; i < total_texel_count; i += ASTCENC_SIMD_WIDTH)
1327 		{
1328 			vfloat weight = bilinear_infill_vla_2(di, dec_weight_plane1, i);
1329 			storea(weight, undec_weight_plane1 + i);
1330 
1331 			weight = bilinear_infill_vla_2(di, dec_weight_plane2, i);
1332 			storea(weight, undec_weight_plane2 + i);
1333 		}
1334 
1335 		undec_weight_plane1_ref = undec_weight_plane1;
1336 		undec_weight_plane2_ref = undec_weight_plane2;
1337 	}
1338 	else
1339 	{
1340 		for (unsigned int i = 0; i < total_texel_count; i += ASTCENC_SIMD_WIDTH)
1341 		{
1342 			vfloat weight = bilinear_infill_vla(di, dec_weight_plane1, i);
1343 			storea(weight, undec_weight_plane1 + i);
1344 
1345 			weight = bilinear_infill_vla(di, dec_weight_plane2, i);
1346 			storea(weight, undec_weight_plane2 + i);
1347 		}
1348 
1349 		undec_weight_plane1_ref = undec_weight_plane1;
1350 		undec_weight_plane2_ref = undec_weight_plane2;
1351 	}
1352 
1353 	unsigned int texel_count = bsd.texel_count;
1354 	vfloat4 rgba_weight_sum = max(blk.channel_weight * static_cast<float>(texel_count), 1e-17f);
1355 	vfloat4 scale_dir = normalize(blk.data_mean.swz<0, 1, 2>());
1356 
1357 	float scale_max = 0.0f;
1358 	float scale_min = 1e10f;
1359 
1360 	float wmin1 = 1.0f;
1361 	float wmax1 = 0.0f;
1362 
1363 	float wmin2 = 1.0f;
1364 	float wmax2 = 0.0f;
1365 
1366 	float left1_sum_s = 0.0f;
1367 	float middle1_sum_s = 0.0f;
1368 	float right1_sum_s = 0.0f;
1369 
1370 	float left2_sum_s = 0.0f;
1371 	float middle2_sum_s = 0.0f;
1372 	float right2_sum_s = 0.0f;
1373 
1374 	vfloat4 color_vec_x = vfloat4::zero();
1375 	vfloat4 color_vec_y = vfloat4::zero();
1376 
1377 	vfloat4 scale_vec = vfloat4::zero();
1378 
1379 	vfloat4 weight_weight_sum = vfloat4(1e-17f);
1380 
1381 	vmask4 p2_mask = vint4::lane_id() == vint4(plane2_component);
1382 	vfloat4 color_weight = blk.channel_weight;
1383 	float ls_weight = hadd_rgb_s(color_weight);
1384 
1385 	for (unsigned int j = 0; j < texel_count; j++)
1386 	{
1387 		vfloat4 rgba = blk.texel(j);
1388 
1389 		float idx0 = undec_weight_plane1_ref[j];
1390 
1391 		float om_idx0 = 1.0f - idx0;
1392 		wmin1 = astc::min(idx0, wmin1);
1393 		wmax1 = astc::max(idx0, wmax1);
1394 
1395 		float scale = dot3_s(scale_dir, rgba);
1396 		scale_min = astc::min(scale, scale_min);
1397 		scale_max = astc::max(scale, scale_max);
1398 
1399 		left1_sum_s   += om_idx0 * om_idx0;
1400 		middle1_sum_s += om_idx0 * idx0;
1401 		right1_sum_s  += idx0 * idx0;
1402 
1403 		float idx1 = undec_weight_plane2_ref[j];
1404 
1405 		float om_idx1 = 1.0f - idx1;
1406 		wmin2 = astc::min(idx1, wmin2);
1407 		wmax2 = astc::max(idx1, wmax2);
1408 
1409 		left2_sum_s   += om_idx1 * om_idx1;
1410 		middle2_sum_s += om_idx1 * idx1;
1411 		right2_sum_s  += idx1 * idx1;
1412 
1413 		vfloat4 color_idx = select(vfloat4(idx0), vfloat4(idx1), p2_mask);
1414 
1415 		vfloat4 cwprod = rgba;
1416 		vfloat4 cwiprod = cwprod * color_idx;
1417 
1418 		color_vec_y += cwiprod;
1419 		color_vec_x += cwprod - cwiprod;
1420 
1421 		scale_vec += vfloat2(om_idx0, idx0) * (ls_weight * scale);
1422 		weight_weight_sum += (color_weight * color_idx);
1423 	}
1424 
1425 	vfloat4 left1_sum   = vfloat4(left1_sum_s) * color_weight;
1426 	vfloat4 middle1_sum = vfloat4(middle1_sum_s) * color_weight;
1427 	vfloat4 right1_sum  = vfloat4(right1_sum_s) * color_weight;
1428 	vfloat4 lmrs_sum    = vfloat3(left1_sum_s, middle1_sum_s, right1_sum_s) * ls_weight;
1429 
1430 	vfloat4 left2_sum   = vfloat4(left2_sum_s) * color_weight;
1431 	vfloat4 middle2_sum = vfloat4(middle2_sum_s) * color_weight;
1432 	vfloat4 right2_sum  = vfloat4(right2_sum_s) * color_weight;
1433 
1434 	float psum = dot3_s(select(right1_sum, right2_sum, p2_mask), color_weight);
1435 
1436 	color_vec_x = color_vec_x * color_weight;
1437 	color_vec_y = color_vec_y * color_weight;
1438 
1439 	// Initialize the luminance and scale vectors with a reasonable default
1440 	float scalediv = scale_min * (1.0f / astc::max(scale_max, 1e-10f));
1441 	scalediv = astc::clamp1f(scalediv);
1442 
1443 	vfloat4 sds = scale_dir * scale_max;
1444 
1445 	rgbs_vector = vfloat4(sds.lane<0>(), sds.lane<1>(), sds.lane<2>(), scalediv);
1446 
1447 	if (wmin1 >= wmax1 * 0.999f)
1448 	{
1449 		// If all weights in the partition were equal, then just take average of all colors in
1450 		// the partition and use that as both endpoint colors
1451 		vfloat4 avg = (color_vec_x + color_vec_y) / rgba_weight_sum;
1452 
1453 		vmask4 p1_mask = vint4::lane_id() != vint4(plane2_component);
1454 		vmask4 notnan_mask = avg == avg;
1455 		vmask4 full_mask = p1_mask & notnan_mask;
1456 
1457 		ep.endpt0[0] = select(ep.endpt0[0], avg, full_mask);
1458 		ep.endpt1[0] = select(ep.endpt1[0], avg, full_mask);
1459 
1460 		rgbs_vector = vfloat4(sds.lane<0>(), sds.lane<1>(), sds.lane<2>(), 1.0f);
1461 	}
1462 	else
1463 	{
1464 		// Otherwise, complete the analytic calculation of ideal-endpoint-values for the given
1465 		// set of texel weights and pixel colors
1466 		vfloat4 color_det1 = (left1_sum * right1_sum) - (middle1_sum * middle1_sum);
1467 		vfloat4 color_rdet1 = 1.0f / color_det1;
1468 
1469 		float ls_det1  = (lmrs_sum.lane<0>() * lmrs_sum.lane<2>()) - (lmrs_sum.lane<1>() * lmrs_sum.lane<1>());
1470 		float ls_rdet1 = 1.0f / ls_det1;
1471 
1472 		vfloat4 color_mss1 = (left1_sum * left1_sum)
1473 		                   + (2.0f * middle1_sum * middle1_sum)
1474 		                   + (right1_sum * right1_sum);
1475 
1476 		float ls_mss1 = (lmrs_sum.lane<0>() * lmrs_sum.lane<0>())
1477 		              + (2.0f * lmrs_sum.lane<1>() * lmrs_sum.lane<1>())
1478 		              + (lmrs_sum.lane<2>() * lmrs_sum.lane<2>());
1479 
1480 		vfloat4 ep0 = (right1_sum * color_vec_x - middle1_sum * color_vec_y) * color_rdet1;
1481 		vfloat4 ep1 = (left1_sum * color_vec_y - middle1_sum * color_vec_x) * color_rdet1;
1482 
1483 		float scale_ep0 = (lmrs_sum.lane<2>() * scale_vec.lane<0>() - lmrs_sum.lane<1>() * scale_vec.lane<1>()) * ls_rdet1;
1484 		float scale_ep1 = (lmrs_sum.lane<0>() * scale_vec.lane<1>() - lmrs_sum.lane<1>() * scale_vec.lane<0>()) * ls_rdet1;
1485 
1486 		vmask4 p1_mask = vint4::lane_id() != vint4(plane2_component);
1487 		vmask4 det_mask = abs(color_det1) > (color_mss1 * 1e-4f);
1488 		vmask4 notnan_mask = (ep0 == ep0) & (ep1 == ep1);
1489 		vmask4 full_mask = p1_mask & det_mask & notnan_mask;
1490 
1491 		ep.endpt0[0] = select(ep.endpt0[0], ep0, full_mask);
1492 		ep.endpt1[0] = select(ep.endpt1[0], ep1, full_mask);
1493 
1494 		if (fabsf(ls_det1) > (ls_mss1 * 1e-4f) && scale_ep0 == scale_ep0 && scale_ep1 == scale_ep1 && scale_ep0 < scale_ep1)
1495 		{
1496 			float scalediv2 = scale_ep0 * (1.0f / scale_ep1);
1497 			vfloat4 sdsm = scale_dir * scale_ep1;
1498 			rgbs_vector = vfloat4(sdsm.lane<0>(), sdsm.lane<1>(), sdsm.lane<2>(), scalediv2);
1499 		}
1500 	}
1501 
1502 	if (wmin2 >= wmax2 * 0.999f)
1503 	{
1504 		// If all weights in the partition were equal, then just take average of all colors in
1505 		// the partition and use that as both endpoint colors
1506 		vfloat4 avg = (color_vec_x + color_vec_y) / rgba_weight_sum;
1507 
1508 		vmask4 notnan_mask = avg == avg;
1509 		vmask4 full_mask = p2_mask & notnan_mask;
1510 
1511 		ep.endpt0[0] = select(ep.endpt0[0], avg, full_mask);
1512 		ep.endpt1[0] = select(ep.endpt1[0], avg, full_mask);
1513 	}
1514 	else
1515 	{
1516 		// Otherwise, complete the analytic calculation of ideal-endpoint-values for the given
1517 		// set of texel weights and pixel colors
1518 		vfloat4 color_det2 = (left2_sum * right2_sum) - (middle2_sum * middle2_sum);
1519 		vfloat4 color_rdet2 = 1.0f / color_det2;
1520 
1521 		vfloat4 color_mss2 = (left2_sum * left2_sum)
1522 		                   + (2.0f * middle2_sum * middle2_sum)
1523 		                   + (right2_sum * right2_sum);
1524 
1525 		vfloat4 ep0 = (right2_sum * color_vec_x - middle2_sum * color_vec_y) * color_rdet2;
1526 		vfloat4 ep1 = (left2_sum * color_vec_y - middle2_sum * color_vec_x) * color_rdet2;
1527 
1528 		vmask4 det_mask = abs(color_det2) > (color_mss2 * 1e-4f);
1529 		vmask4 notnan_mask = (ep0 == ep0) & (ep1 == ep1);
1530 		vmask4 full_mask = p2_mask & det_mask & notnan_mask;
1531 
1532 		ep.endpt0[0] = select(ep.endpt0[0], ep0, full_mask);
1533 		ep.endpt1[0] = select(ep.endpt1[0], ep1, full_mask);
1534 	}
1535 
1536 	// Calculations specific to mode #7, the HDR RGB-scale mode
1537 	vfloat4 rgbq_sum = color_vec_x + color_vec_y;
1538 	rgbq_sum.set_lane<3>(hadd_rgb_s(color_vec_y));
1539 
1540 	rgbo_vector = compute_rgbo_vector(rgba_weight_sum, weight_weight_sum, rgbq_sum, psum);
1541 
1542 	// We can get a failure due to the use of a singular (non-invertible) matrix
1543 	// If it failed, compute rgbo_vectors[] with a different method ...
1544 	if (astc::isnan(dot_s(rgbo_vector, rgbo_vector)))
1545 	{
1546 		vfloat4 v0 = ep.endpt0[0];
1547 		vfloat4 v1 = ep.endpt1[0];
1548 
1549 		float avgdif = hadd_rgb_s(v1 - v0) * (1.0f / 3.0f);
1550 		avgdif = astc::max(avgdif, 0.0f);
1551 
1552 		vfloat4 avg = (v0 + v1) * 0.5f;
1553 		vfloat4 ep0 = avg - vfloat4(avgdif) * 0.5f;
1554 
1555 		rgbo_vector = vfloat4(ep0.lane<0>(), ep0.lane<1>(), ep0.lane<2>(), avgdif);
1556 	}
1557 }
1558 
1559 #endif
1560