• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // SPDX-License-Identifier: Apache-2.0
2 // ----------------------------------------------------------------------------
3 // Copyright 2011-2022 Arm Limited
4 //
5 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
6 // use this file except in compliance with the License. You may obtain a copy
7 // of the License at:
8 //
9 //     http://www.apache.org/licenses/LICENSE-2.0
10 //
11 // Unless required by applicable law or agreed to in writing, software
12 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
13 // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
14 // License for the specific language governing permissions and limitations
15 // under the License.
16 // ----------------------------------------------------------------------------
17 
18 /**
19  * @brief Functions for finding dominant direction of a set of colors.
20  */
21 #if !defined(ASTCENC_DECOMPRESS_ONLY)
22 
23 #include "astcenc_internal.h"
24 
25 #include <cassert>
26 
27 /**
28  * @brief Compute the average RGB color of each partition.
29  *
30  * The algorithm here uses a vectorized sequential scan and per-partition
31  * color accumulators, using select() to mask texel lanes in other partitions.
32  *
33  * We only accumulate sums for N-1 partitions during the scan; the value for
34  * the last partition can be computed given that we know the block-wide average
35  * already.
36  *
37  * Because of this we could reduce the loop iteration count so it "just" spans
38  * the max texel index needed for the N-1 partitions, which could need fewer
39  * iterations than the full block texel count. However, this makes the loop
40  * count erratic and causes more branch mispredictions so is a net loss.
41  *
42  * @param      pi         The partitioning to use.
43  * @param      blk        The block data to process.
44  * @param[out] averages   The output averages. Unused partition indices will
45  *                        not be initialized, and lane<3> will be zero.
46  */
compute_partition_averages_rgb(const partition_info & pi,const image_block & blk,vfloat4 averages[BLOCK_MAX_PARTITIONS])47 static void compute_partition_averages_rgb(
48 	const partition_info& pi,
49 	const image_block& blk,
50 	vfloat4 averages[BLOCK_MAX_PARTITIONS]
51 ) {
52 	unsigned int partition_count = pi.partition_count;
53 	unsigned int texel_count = blk.texel_count;
54 	promise(texel_count > 0);
55 
56 	// For 1 partition just use the precomputed mean
57 	if (partition_count == 1)
58 	{
59 		averages[0] = blk.data_mean.swz<0, 1, 2>();
60 	}
61 	// For 2 partitions scan results for partition 0, compute partition 1
62 	else if (partition_count == 2)
63 	{
64 		vfloatacc pp_avg_rgb[3] {};
65 
66 		vint lane_id = vint::lane_id();
67 		for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
68 		{
69 			vint texel_partition(pi.partition_of_texel + i);
70 
71 			vmask lane_mask = lane_id < vint(texel_count);
72 			lane_id += vint(ASTCENC_SIMD_WIDTH);
73 
74 			vmask p0_mask = lane_mask & (texel_partition == vint(0));
75 
76 			vfloat data_r = loada(blk.data_r + i);
77 			haccumulate(pp_avg_rgb[0], data_r, p0_mask);
78 
79 			vfloat data_g = loada(blk.data_g + i);
80 			haccumulate(pp_avg_rgb[1], data_g, p0_mask);
81 
82 			vfloat data_b = loada(blk.data_b + i);
83 			haccumulate(pp_avg_rgb[2], data_b, p0_mask);
84 		}
85 
86 		vfloat4 block_total = blk.data_mean.swz<0, 1, 2>() * static_cast<float>(blk.texel_count);
87 
88 		vfloat4 p0_total = vfloat3(hadd_s(pp_avg_rgb[0]),
89 		                           hadd_s(pp_avg_rgb[1]),
90 		                           hadd_s(pp_avg_rgb[2]));
91 
92 		vfloat4 p1_total = block_total - p0_total;
93 
94 		averages[0] = p0_total / static_cast<float>(pi.partition_texel_count[0]);
95 		averages[1] = p1_total / static_cast<float>(pi.partition_texel_count[1]);
96 	}
97 	// For 3 partitions scan results for partition 0/1, compute partition 2
98 	else if (partition_count == 3)
99 	{
100 		vfloatacc pp_avg_rgb[2][3] {};
101 
102 		vint lane_id = vint::lane_id();
103 		for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
104 		{
105 			vint texel_partition(pi.partition_of_texel + i);
106 
107 			vmask lane_mask = lane_id < vint(texel_count);
108 			lane_id += vint(ASTCENC_SIMD_WIDTH);
109 
110 			vmask p0_mask = lane_mask & (texel_partition == vint(0));
111 			vmask p1_mask = lane_mask & (texel_partition == vint(1));
112 
113 			vfloat data_r = loada(blk.data_r + i);
114 			haccumulate(pp_avg_rgb[0][0], data_r, p0_mask);
115 			haccumulate(pp_avg_rgb[1][0], data_r, p1_mask);
116 
117 			vfloat data_g = loada(blk.data_g + i);
118 			haccumulate(pp_avg_rgb[0][1], data_g, p0_mask);
119 			haccumulate(pp_avg_rgb[1][1], data_g, p1_mask);
120 
121 			vfloat data_b = loada(blk.data_b + i);
122 			haccumulate(pp_avg_rgb[0][2], data_b, p0_mask);
123 			haccumulate(pp_avg_rgb[1][2], data_b, p1_mask);
124 		}
125 
126 		vfloat4 block_total = blk.data_mean.swz<0, 1, 2>() * static_cast<float>(blk.texel_count);
127 
128 		vfloat4 p0_total = vfloat3(hadd_s(pp_avg_rgb[0][0]),
129 		                           hadd_s(pp_avg_rgb[0][1]),
130 		                           hadd_s(pp_avg_rgb[0][2]));
131 
132 		vfloat4 p1_total = vfloat3(hadd_s(pp_avg_rgb[1][0]),
133 		                           hadd_s(pp_avg_rgb[1][1]),
134 		                           hadd_s(pp_avg_rgb[1][2]));
135 
136 		vfloat4 p2_total = block_total - p0_total - p1_total;
137 
138 		averages[0] = p0_total / static_cast<float>(pi.partition_texel_count[0]);
139 		averages[1] = p1_total / static_cast<float>(pi.partition_texel_count[1]);
140 		averages[2] = p2_total / static_cast<float>(pi.partition_texel_count[2]);
141 	}
142 	else
143 	{
144 		// For 4 partitions scan results for partition 0/1/2, compute partition 3
145 		vfloatacc pp_avg_rgb[3][3] {};
146 
147 		vint lane_id = vint::lane_id();
148 		for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
149 		{
150 			vint texel_partition(pi.partition_of_texel + i);
151 
152 			vmask lane_mask = lane_id < vint(texel_count);
153 			lane_id += vint(ASTCENC_SIMD_WIDTH);
154 
155 			vmask p0_mask = lane_mask & (texel_partition == vint(0));
156 			vmask p1_mask = lane_mask & (texel_partition == vint(1));
157 			vmask p2_mask = lane_mask & (texel_partition == vint(2));
158 
159 			vfloat data_r = loada(blk.data_r + i);
160 			haccumulate(pp_avg_rgb[0][0], data_r, p0_mask);
161 			haccumulate(pp_avg_rgb[1][0], data_r, p1_mask);
162 			haccumulate(pp_avg_rgb[2][0], data_r, p2_mask);
163 
164 			vfloat data_g = loada(blk.data_g + i);
165 			haccumulate(pp_avg_rgb[0][1], data_g, p0_mask);
166 			haccumulate(pp_avg_rgb[1][1], data_g, p1_mask);
167 			haccumulate(pp_avg_rgb[2][1], data_g, p2_mask);
168 
169 			vfloat data_b = loada(blk.data_b + i);
170 			haccumulate(pp_avg_rgb[0][2], data_b, p0_mask);
171 			haccumulate(pp_avg_rgb[1][2], data_b, p1_mask);
172 			haccumulate(pp_avg_rgb[2][2], data_b, p2_mask);
173 		}
174 
175 		vfloat4 block_total = blk.data_mean.swz<0, 1, 2>() * static_cast<float>(blk.texel_count);
176 
177 		vfloat4 p0_total = vfloat3(hadd_s(pp_avg_rgb[0][0]),
178 		                           hadd_s(pp_avg_rgb[0][1]),
179 		                           hadd_s(pp_avg_rgb[0][2]));
180 
181 		vfloat4 p1_total = vfloat3(hadd_s(pp_avg_rgb[1][0]),
182 		                           hadd_s(pp_avg_rgb[1][1]),
183 		                           hadd_s(pp_avg_rgb[1][2]));
184 
185 		vfloat4 p2_total = vfloat3(hadd_s(pp_avg_rgb[2][0]),
186 		                           hadd_s(pp_avg_rgb[2][1]),
187 		                           hadd_s(pp_avg_rgb[2][2]));
188 
189 		vfloat4 p3_total = block_total - p0_total - p1_total- p2_total;
190 
191 		averages[0] = p0_total / static_cast<float>(pi.partition_texel_count[0]);
192 		averages[1] = p1_total / static_cast<float>(pi.partition_texel_count[1]);
193 		averages[2] = p2_total / static_cast<float>(pi.partition_texel_count[2]);
194 		averages[3] = p3_total / static_cast<float>(pi.partition_texel_count[3]);
195 	}
196 }
197 
198 /**
199  * @brief Compute the average RGBA color of each partition.
200  *
201  * The algorithm here uses a vectorized sequential scan and per-partition
202  * color accumulators, using select() to mask texel lanes in other partitions.
203  *
204  * We only accumulate sums for N-1 partitions during the scan; the value for
205  * the last partition can be computed given that we know the block-wide average
206  * already.
207  *
208  * Because of this we could reduce the loop iteration count so it "just" spans
209  * the max texel index needed for the N-1 partitions, which could need fewer
210  * iterations than the full block texel count. However, this makes the loop
211  * count erratic and causes more branch mispredictions so is a net loss.
212  *
213  * @param      pi         The partitioning to use.
214  * @param      blk        The block data to process.
215  * @param[out] averages   The output averages. Unused partition indices will
216  *                        not be initialized.
217  */
compute_partition_averages_rgba(const partition_info & pi,const image_block & blk,vfloat4 averages[BLOCK_MAX_PARTITIONS])218 static void compute_partition_averages_rgba(
219 	const partition_info& pi,
220 	const image_block& blk,
221 	vfloat4 averages[BLOCK_MAX_PARTITIONS]
222 ) {
223 	unsigned int partition_count = pi.partition_count;
224 	unsigned int texel_count = blk.texel_count;
225 	promise(texel_count > 0);
226 
227 	// For 1 partition just use the precomputed mean
228 	if (partition_count == 1)
229 	{
230 		averages[0] = blk.data_mean;
231 	}
232 	// For 2 partitions scan results for partition 0, compute partition 1
233 	else if (partition_count == 2)
234 	{
235 		vfloat4 pp_avg_rgba[4] {};
236 
237 		vint lane_id = vint::lane_id();
238 		for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
239 		{
240 			vint texel_partition(pi.partition_of_texel + i);
241 
242 			vmask lane_mask = lane_id < vint(texel_count);
243 			lane_id += vint(ASTCENC_SIMD_WIDTH);
244 
245 			vmask p0_mask = lane_mask & (texel_partition == vint(0));
246 
247 			vfloat data_r = loada(blk.data_r + i);
248 			haccumulate(pp_avg_rgba[0], data_r, p0_mask);
249 
250 			vfloat data_g = loada(blk.data_g + i);
251 			haccumulate(pp_avg_rgba[1], data_g, p0_mask);
252 
253 			vfloat data_b = loada(blk.data_b + i);
254 			haccumulate(pp_avg_rgba[2], data_b, p0_mask);
255 
256 			vfloat data_a = loada(blk.data_a + i);
257 			haccumulate(pp_avg_rgba[3], data_a, p0_mask);
258 		}
259 
260 		vfloat4 block_total = blk.data_mean * static_cast<float>(blk.texel_count);
261 
262 		vfloat4 p0_total = vfloat4(hadd_s(pp_avg_rgba[0]),
263 		                           hadd_s(pp_avg_rgba[1]),
264 		                           hadd_s(pp_avg_rgba[2]),
265 		                           hadd_s(pp_avg_rgba[3]));
266 
267 		vfloat4 p1_total = block_total - p0_total;
268 
269 		averages[0] = p0_total / static_cast<float>(pi.partition_texel_count[0]);
270 		averages[1] = p1_total / static_cast<float>(pi.partition_texel_count[1]);
271 	}
272 	// For 3 partitions scan results for partition 0/1, compute partition 2
273 	else if (partition_count == 3)
274 	{
275 		vfloat4 pp_avg_rgba[2][4] {};
276 
277 		vint lane_id = vint::lane_id();
278 		for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
279 		{
280 			vint texel_partition(pi.partition_of_texel + i);
281 
282 			vmask lane_mask = lane_id < vint(texel_count);
283 			lane_id += vint(ASTCENC_SIMD_WIDTH);
284 
285 			vmask p0_mask = lane_mask & (texel_partition == vint(0));
286 			vmask p1_mask = lane_mask & (texel_partition == vint(1));
287 
288 			vfloat data_r = loada(blk.data_r + i);
289 			haccumulate(pp_avg_rgba[0][0], data_r, p0_mask);
290 			haccumulate(pp_avg_rgba[1][0], data_r, p1_mask);
291 
292 			vfloat data_g = loada(blk.data_g + i);
293 			haccumulate(pp_avg_rgba[0][1], data_g, p0_mask);
294 			haccumulate(pp_avg_rgba[1][1], data_g, p1_mask);
295 
296 			vfloat data_b = loada(blk.data_b + i);
297 			haccumulate(pp_avg_rgba[0][2], data_b, p0_mask);
298 			haccumulate(pp_avg_rgba[1][2], data_b, p1_mask);
299 
300 			vfloat data_a = loada(blk.data_a + i);
301 			haccumulate(pp_avg_rgba[0][3], data_a, p0_mask);
302 			haccumulate(pp_avg_rgba[1][3], data_a, p1_mask);
303 		}
304 
305 		vfloat4 block_total = blk.data_mean * static_cast<float>(blk.texel_count);
306 
307 		vfloat4 p0_total = vfloat4(hadd_s(pp_avg_rgba[0][0]),
308 		                           hadd_s(pp_avg_rgba[0][1]),
309 		                           hadd_s(pp_avg_rgba[0][2]),
310 		                           hadd_s(pp_avg_rgba[0][3]));
311 
312 		vfloat4 p1_total = vfloat4(hadd_s(pp_avg_rgba[1][0]),
313 		                           hadd_s(pp_avg_rgba[1][1]),
314 		                           hadd_s(pp_avg_rgba[1][2]),
315 		                           hadd_s(pp_avg_rgba[1][3]));
316 
317 		vfloat4 p2_total = block_total - p0_total - p1_total;
318 
319 		averages[0] = p0_total / static_cast<float>(pi.partition_texel_count[0]);
320 		averages[1] = p1_total / static_cast<float>(pi.partition_texel_count[1]);
321 		averages[2] = p2_total / static_cast<float>(pi.partition_texel_count[2]);
322 	}
323 	else
324 	{
325 		// For 4 partitions scan results for partition 0/1/2, compute partition 3
326 		vfloat4 pp_avg_rgba[3][4] {};
327 
328 		vint lane_id = vint::lane_id();
329 		for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
330 		{
331 			vint texel_partition(pi.partition_of_texel + i);
332 
333 			vmask lane_mask = lane_id < vint(texel_count);
334 			lane_id += vint(ASTCENC_SIMD_WIDTH);
335 
336 			vmask p0_mask = lane_mask & (texel_partition == vint(0));
337 			vmask p1_mask = lane_mask & (texel_partition == vint(1));
338 			vmask p2_mask = lane_mask & (texel_partition == vint(2));
339 
340 			vfloat data_r = loada(blk.data_r + i);
341 			haccumulate(pp_avg_rgba[0][0], data_r, p0_mask);
342 			haccumulate(pp_avg_rgba[1][0], data_r, p1_mask);
343 			haccumulate(pp_avg_rgba[2][0], data_r, p2_mask);
344 
345 			vfloat data_g = loada(blk.data_g + i);
346 			haccumulate(pp_avg_rgba[0][1], data_g, p0_mask);
347 			haccumulate(pp_avg_rgba[1][1], data_g, p1_mask);
348 			haccumulate(pp_avg_rgba[2][1], data_g, p2_mask);
349 
350 			vfloat data_b = loada(blk.data_b + i);
351 			haccumulate(pp_avg_rgba[0][2], data_b, p0_mask);
352 			haccumulate(pp_avg_rgba[1][2], data_b, p1_mask);
353 			haccumulate(pp_avg_rgba[2][2], data_b, p2_mask);
354 
355 			vfloat data_a = loada(blk.data_a + i);
356 			haccumulate(pp_avg_rgba[0][3], data_a, p0_mask);
357 			haccumulate(pp_avg_rgba[1][3], data_a, p1_mask);
358 			haccumulate(pp_avg_rgba[2][3], data_a, p2_mask);
359 		}
360 
361 		vfloat4 block_total = blk.data_mean * static_cast<float>(blk.texel_count);
362 
363 		vfloat4 p0_total = vfloat4(hadd_s(pp_avg_rgba[0][0]),
364 		                           hadd_s(pp_avg_rgba[0][1]),
365 		                           hadd_s(pp_avg_rgba[0][2]),
366 		                           hadd_s(pp_avg_rgba[0][3]));
367 
368 		vfloat4 p1_total = vfloat4(hadd_s(pp_avg_rgba[1][0]),
369 		                           hadd_s(pp_avg_rgba[1][1]),
370 		                           hadd_s(pp_avg_rgba[1][2]),
371 		                           hadd_s(pp_avg_rgba[1][3]));
372 
373 		vfloat4 p2_total = vfloat4(hadd_s(pp_avg_rgba[2][0]),
374 		                           hadd_s(pp_avg_rgba[2][1]),
375 		                           hadd_s(pp_avg_rgba[2][2]),
376 		                           hadd_s(pp_avg_rgba[2][3]));
377 
378 		vfloat4 p3_total = block_total - p0_total - p1_total- p2_total;
379 
380 		averages[0] = p0_total / static_cast<float>(pi.partition_texel_count[0]);
381 		averages[1] = p1_total / static_cast<float>(pi.partition_texel_count[1]);
382 		averages[2] = p2_total / static_cast<float>(pi.partition_texel_count[2]);
383 		averages[3] = p3_total / static_cast<float>(pi.partition_texel_count[3]);
384 	}
385 }
386 
387 /* See header for documentation. */
compute_avgs_and_dirs_4_comp(const partition_info & pi,const image_block & blk,partition_metrics pm[BLOCK_MAX_PARTITIONS])388 void compute_avgs_and_dirs_4_comp(
389 	const partition_info& pi,
390 	const image_block& blk,
391 	partition_metrics pm[BLOCK_MAX_PARTITIONS]
392 ) {
393 	float texel_weight = hadd_s(blk.channel_weight) / 4.0f;
394 
395 	int partition_count = pi.partition_count;
396 	promise(partition_count > 0);
397 
398 	// Pre-compute partition_averages
399 	vfloat4 partition_averages[BLOCK_MAX_PARTITIONS];
400 	compute_partition_averages_rgba(pi, blk, partition_averages);
401 
402 	for (int partition = 0; partition < partition_count; partition++)
403 	{
404 		const uint8_t *texel_indexes = pi.texels_of_partition[partition];
405 		unsigned int texel_count = pi.partition_texel_count[partition];
406 		promise(texel_count > 0);
407 
408 		vfloat4 average = partition_averages[partition];
409 		pm[partition].avg = average;
410 
411 		vfloat4 sum_xp = vfloat4::zero();
412 		vfloat4 sum_yp = vfloat4::zero();
413 		vfloat4 sum_zp = vfloat4::zero();
414 		vfloat4 sum_wp = vfloat4::zero();
415 
416 		for (unsigned int i = 0; i < texel_count; i++)
417 		{
418 			unsigned int iwt = texel_indexes[i];
419 			vfloat4 texel_datum = blk.texel(iwt);
420 			texel_datum = texel_datum - average;
421 
422 			vfloat4 zero = vfloat4::zero();
423 
424 			vmask4 tdm0 = texel_datum.swz<0,0,0,0>() > zero;
425 			sum_xp += select(zero, texel_datum, tdm0);
426 
427 			vmask4 tdm1 = texel_datum.swz<1,1,1,1>() > zero;
428 			sum_yp += select(zero, texel_datum, tdm1);
429 
430 			vmask4 tdm2 = texel_datum.swz<2,2,2,2>() > zero;
431 			sum_zp += select(zero, texel_datum, tdm2);
432 
433 			vmask4 tdm3 = texel_datum.swz<3,3,3,3>() > zero;
434 			sum_wp += select(zero, texel_datum, tdm3);
435 		}
436 
437 		sum_xp = sum_xp * texel_weight;
438 		sum_yp = sum_yp * texel_weight;
439 		sum_zp = sum_zp * texel_weight;
440 		sum_wp = sum_wp * texel_weight;
441 
442 		vfloat4 prod_xp = dot(sum_xp, sum_xp);
443 		vfloat4 prod_yp = dot(sum_yp, sum_yp);
444 		vfloat4 prod_zp = dot(sum_zp, sum_zp);
445 		vfloat4 prod_wp = dot(sum_wp, sum_wp);
446 
447 		vfloat4 best_vector = sum_xp;
448 		vfloat4 best_sum = prod_xp;
449 
450 		vmask4 mask = prod_yp > best_sum;
451 		best_vector = select(best_vector, sum_yp, mask);
452 		best_sum = select(best_sum, prod_yp, mask);
453 
454 		mask = prod_zp > best_sum;
455 		best_vector = select(best_vector, sum_zp, mask);
456 		best_sum = select(best_sum, prod_zp, mask);
457 
458 		mask = prod_wp > best_sum;
459 		best_vector = select(best_vector, sum_wp, mask);
460 
461 		pm[partition].dir = best_vector;
462 	}
463 }
464 
465 /* See header for documentation. */
compute_avgs_and_dirs_3_comp(const partition_info & pi,const image_block & blk,unsigned int omitted_component,partition_metrics pm[BLOCK_MAX_PARTITIONS])466 void compute_avgs_and_dirs_3_comp(
467 	const partition_info& pi,
468 	const image_block& blk,
469 	unsigned int omitted_component,
470 	partition_metrics pm[BLOCK_MAX_PARTITIONS]
471 ) {
472 	// Pre-compute partition_averages
473 	vfloat4 partition_averages[BLOCK_MAX_PARTITIONS];
474 	compute_partition_averages_rgba(pi, blk, partition_averages);
475 
476 	float texel_weight = hadd_s(blk.channel_weight.swz<0, 1, 2>());
477 
478 	const float* data_vr = blk.data_r;
479 	const float* data_vg = blk.data_g;
480 	const float* data_vb = blk.data_b;
481 
482 	// TODO: Data-driven permute would be useful to avoid this ...
483 	if (omitted_component == 0)
484 	{
485 		texel_weight = hadd_s(blk.channel_weight.swz<1, 2, 3>());
486 
487 		partition_averages[0] = partition_averages[0].swz<1, 2, 3>();
488 		partition_averages[1] = partition_averages[1].swz<1, 2, 3>();
489 		partition_averages[2] = partition_averages[2].swz<1, 2, 3>();
490 		partition_averages[3] = partition_averages[3].swz<1, 2, 3>();
491 
492 		data_vr = blk.data_g;
493 		data_vg = blk.data_b;
494 		data_vb = blk.data_a;
495 	}
496 	else if (omitted_component == 1)
497 	{
498 		texel_weight = hadd_s(blk.channel_weight.swz<0, 2, 3>());
499 
500 		partition_averages[0] = partition_averages[0].swz<0, 2, 3>();
501 		partition_averages[1] = partition_averages[1].swz<0, 2, 3>();
502 		partition_averages[2] = partition_averages[2].swz<0, 2, 3>();
503 		partition_averages[3] = partition_averages[3].swz<0, 2, 3>();
504 
505 		data_vg = blk.data_b;
506 		data_vb = blk.data_a;
507 	}
508 	else if (omitted_component == 2)
509 	{
510 		texel_weight = hadd_s(blk.channel_weight.swz<0, 1, 3>());
511 
512 		partition_averages[0] = partition_averages[0].swz<0, 1, 3>();
513 		partition_averages[1] = partition_averages[1].swz<0, 1, 3>();
514 		partition_averages[2] = partition_averages[2].swz<0, 1, 3>();
515 		partition_averages[3] = partition_averages[3].swz<0, 1, 3>();
516 
517 		data_vb = blk.data_a;
518 	}
519 	else
520 	{
521 		partition_averages[0] = partition_averages[0].swz<0, 1, 2>();
522 		partition_averages[1] = partition_averages[1].swz<0, 1, 2>();
523 		partition_averages[2] = partition_averages[2].swz<0, 1, 2>();
524 		partition_averages[3] = partition_averages[3].swz<0, 1, 2>();
525 	}
526 
527  	texel_weight = texel_weight * (1.0f / 3.0f);
528 
529 	unsigned int partition_count = pi.partition_count;
530 	promise(partition_count > 0);
531 
532 	for (unsigned int partition = 0; partition < partition_count; partition++)
533 	{
534 		const uint8_t *texel_indexes = pi.texels_of_partition[partition];
535 		unsigned int texel_count = pi.partition_texel_count[partition];
536 		promise(texel_count > 0);
537 
538 		vfloat4 average = partition_averages[partition];
539 		pm[partition].avg = average;
540 
541 		vfloat4 sum_xp = vfloat4::zero();
542 		vfloat4 sum_yp = vfloat4::zero();
543 		vfloat4 sum_zp = vfloat4::zero();
544 
545 		for (unsigned int i = 0; i < texel_count; i++)
546 		{
547 			unsigned int iwt = texel_indexes[i];
548 
549 			vfloat4 texel_datum = vfloat3(data_vr[iwt],
550 			                              data_vg[iwt],
551 			                              data_vb[iwt]);
552 			texel_datum = texel_datum - average;
553 
554 			vfloat4 zero = vfloat4::zero();
555 
556 			vmask4 tdm0 = texel_datum.swz<0,0,0,0>() > zero;
557 			sum_xp += select(zero, texel_datum, tdm0);
558 
559 			vmask4 tdm1 = texel_datum.swz<1,1,1,1>() > zero;
560 			sum_yp += select(zero, texel_datum, tdm1);
561 
562 			vmask4 tdm2 = texel_datum.swz<2,2,2,2>() > zero;
563 			sum_zp += select(zero, texel_datum, tdm2);
564 		}
565 
566 		sum_xp = sum_xp * texel_weight;
567 		sum_yp = sum_yp * texel_weight;
568 		sum_zp = sum_zp * texel_weight;
569 
570 		vfloat4 prod_xp = dot(sum_xp, sum_xp);
571 		vfloat4 prod_yp = dot(sum_yp, sum_yp);
572 		vfloat4 prod_zp = dot(sum_zp, sum_zp);
573 
574 		vfloat4 best_vector = sum_xp;
575 		vfloat4 best_sum = prod_xp;
576 
577 		vmask4 mask = prod_yp > best_sum;
578 		best_vector = select(best_vector, sum_yp, mask);
579 		best_sum = select(best_sum, prod_yp, mask);
580 
581 		mask = prod_zp > best_sum;
582 		best_vector = select(best_vector, sum_zp, mask);
583 
584 		pm[partition].dir = best_vector;
585 	}
586 }
587 
588 /* See header for documentation. */
compute_avgs_and_dirs_3_comp_rgb(const partition_info & pi,const image_block & blk,partition_metrics pm[BLOCK_MAX_PARTITIONS])589 void compute_avgs_and_dirs_3_comp_rgb(
590 	const partition_info& pi,
591 	const image_block& blk,
592 	partition_metrics pm[BLOCK_MAX_PARTITIONS]
593 ) {
594 	float texel_weight = hadd_s(blk.channel_weight.swz<0, 1, 2>()) * (1.0f / 3.0f);
595 
596 	unsigned int partition_count = pi.partition_count;
597 	promise(partition_count > 0);
598 
599 	// Pre-compute partition_averages
600 	vfloat4 partition_averages[BLOCK_MAX_PARTITIONS];
601 	compute_partition_averages_rgb(pi, blk, partition_averages);
602 
603 	for (unsigned int partition = 0; partition < partition_count; partition++)
604 	{
605 		const uint8_t *texel_indexes = pi.texels_of_partition[partition];
606 		unsigned int texel_count = pi.partition_texel_count[partition];
607 		promise(texel_count > 0);
608 
609 		vfloat4 average = partition_averages[partition];
610 		pm[partition].avg = average;
611 
612 		vfloat4 sum_xp = vfloat4::zero();
613 		vfloat4 sum_yp = vfloat4::zero();
614 		vfloat4 sum_zp = vfloat4::zero();
615 
616 		for (unsigned int i = 0; i < texel_count; i++)
617 		{
618 			unsigned int iwt = texel_indexes[i];
619 
620 			vfloat4 texel_datum = blk.texel3(iwt);
621 			texel_datum = texel_datum - average;
622 
623 			vfloat4 zero = vfloat4::zero();
624 
625 			vmask4 tdm0 = texel_datum.swz<0,0,0,0>() > zero;
626 			sum_xp += select(zero, texel_datum, tdm0);
627 
628 			vmask4 tdm1 = texel_datum.swz<1,1,1,1>() > zero;
629 			sum_yp += select(zero, texel_datum, tdm1);
630 
631 			vmask4 tdm2 = texel_datum.swz<2,2,2,2>() > zero;
632 			sum_zp += select(zero, texel_datum, tdm2);
633 		}
634 
635 		sum_xp = sum_xp * texel_weight;
636 		sum_yp = sum_yp * texel_weight;
637 		sum_zp = sum_zp * texel_weight;
638 
639 		vfloat4 prod_xp = dot(sum_xp, sum_xp);
640 		vfloat4 prod_yp = dot(sum_yp, sum_yp);
641 		vfloat4 prod_zp = dot(sum_zp, sum_zp);
642 
643 		vfloat4 best_vector = sum_xp;
644 		vfloat4 best_sum = prod_xp;
645 
646 		vmask4 mask = prod_yp > best_sum;
647 		best_vector = select(best_vector, sum_yp, mask);
648 		best_sum = select(best_sum, prod_yp, mask);
649 
650 		mask = prod_zp > best_sum;
651 		best_vector = select(best_vector, sum_zp, mask);
652 
653 		pm[partition].dir = best_vector;
654 	}
655 }
656 
657 /* See header for documentation. */
compute_avgs_and_dirs_2_comp(const partition_info & pt,const image_block & blk,unsigned int component1,unsigned int component2,partition_metrics pm[BLOCK_MAX_PARTITIONS])658 void compute_avgs_and_dirs_2_comp(
659 	const partition_info& pt,
660 	const image_block& blk,
661 	unsigned int component1,
662 	unsigned int component2,
663 	partition_metrics pm[BLOCK_MAX_PARTITIONS]
664 ) {
665 	float texel_weight;
666 	vfloat4 average;
667 
668 	const float* data_vr = nullptr;
669 	const float* data_vg = nullptr;
670 
671 	if (component1 == 0 && component2 == 1)
672 	{
673 		texel_weight = hadd_s(blk.channel_weight.swz<0, 1>()) / 2.0f;
674 		average = blk.data_mean.swz<0, 1>();
675 
676 		data_vr = blk.data_r;
677 		data_vg = blk.data_g;
678 	}
679 	else if (component1 == 0 && component2 == 2)
680 	{
681 		texel_weight = hadd_s(blk.channel_weight.swz<0, 2>()) / 2.0f;
682 		average = blk.data_mean.swz<0, 2>();
683 
684 		data_vr = blk.data_r;
685 		data_vg = blk.data_b;
686 	}
687 	else // (component1 == 1 && component2 == 2)
688 	{
689 		assert(component1 == 1 && component2 == 2);
690 
691 		texel_weight = hadd_s(blk.channel_weight.swz<1, 2>()) / 2.0f;
692 		average = blk.data_mean.swz<1, 2>();
693 
694 		data_vr = blk.data_g;
695 		data_vg = blk.data_b;
696 	}
697 
698 	unsigned int partition_count = pt.partition_count;
699 	promise(partition_count > 0);
700 
701 	for (unsigned int partition = 0; partition < partition_count; partition++)
702 	{
703 		const uint8_t *texel_indexes = pt.texels_of_partition[partition];
704 		unsigned int texel_count = pt.partition_texel_count[partition];
705 		promise(texel_count > 0);
706 
707 		// Only compute a partition mean if more than one partition
708 		if (partition_count > 1)
709 		{
710 			average = vfloat4::zero();
711 			for (unsigned int i = 0; i < texel_count; i++)
712 			{
713 				unsigned int iwt = texel_indexes[i];
714 				average += vfloat2(data_vr[iwt], data_vg[iwt]);
715 			}
716 
717 			average = average * (1.0f / static_cast<float>(texel_count));
718 		}
719 
720 		pm[partition].avg = average;
721 
722 		vfloat4 sum_xp = vfloat4::zero();
723 		vfloat4 sum_yp = vfloat4::zero();
724 
725 		for (unsigned int i = 0; i < texel_count; i++)
726 		{
727 			unsigned int iwt = texel_indexes[i];
728 			vfloat4 texel_datum = vfloat2(data_vr[iwt], data_vg[iwt]);
729 			texel_datum = texel_datum - average;
730 
731 			vfloat4 zero = vfloat4::zero();
732 
733 			vmask4 tdm0 = texel_datum.swz<0,0,0,0>() > zero;
734 			sum_xp += select(zero, texel_datum, tdm0);
735 
736 			vmask4 tdm1 = texel_datum.swz<1,1,1,1>() > zero;
737 			sum_yp += select(zero, texel_datum, tdm1);
738 		}
739 
740 		sum_xp = sum_xp * texel_weight;
741 		sum_yp = sum_yp * texel_weight;
742 
743 		vfloat4 prod_xp = dot(sum_xp, sum_xp);
744 		vfloat4 prod_yp = dot(sum_yp, sum_yp);
745 
746 		vfloat4 best_vector = sum_xp;
747 		vfloat4 best_sum = prod_xp;
748 
749 		vmask4 mask = prod_yp > best_sum;
750 		best_vector = select(best_vector, sum_yp, mask);
751 
752 		pm[partition].dir = best_vector;
753 	}
754 }
755 
756 /* See header for documentation. */
compute_error_squared_rgba(const partition_info & pi,const image_block & blk,const processed_line4 uncor_plines[BLOCK_MAX_PARTITIONS],const processed_line4 samec_plines[BLOCK_MAX_PARTITIONS],float uncor_lengths[BLOCK_MAX_PARTITIONS],float samec_lengths[BLOCK_MAX_PARTITIONS],float & uncor_error,float & samec_error)757 void compute_error_squared_rgba(
758 	const partition_info& pi,
759 	const image_block& blk,
760 	const processed_line4 uncor_plines[BLOCK_MAX_PARTITIONS],
761 	const processed_line4 samec_plines[BLOCK_MAX_PARTITIONS],
762 	float uncor_lengths[BLOCK_MAX_PARTITIONS],
763 	float samec_lengths[BLOCK_MAX_PARTITIONS],
764 	float& uncor_error,
765 	float& samec_error
766 ) {
767 	unsigned int partition_count = pi.partition_count;
768 	promise(partition_count > 0);
769 
770 	vfloatacc uncor_errorsumv = vfloatacc::zero();
771 	vfloatacc samec_errorsumv = vfloatacc::zero();
772 
773 	for (unsigned int partition = 0; partition < partition_count; partition++)
774 	{
775 		const uint8_t *texel_indexes = pi.texels_of_partition[partition];
776 
777 		float uncor_loparam = 1e10f;
778 		float uncor_hiparam = -1e10f;
779 
780 		float samec_loparam = 1e10f;
781 		float samec_hiparam = -1e10f;
782 
783 		processed_line4 l_uncor = uncor_plines[partition];
784 		processed_line4 l_samec = samec_plines[partition];
785 
786 		unsigned int texel_count = pi.partition_texel_count[partition];
787 		promise(texel_count > 0);
788 
789 		// Vectorize some useful scalar inputs
790 		vfloat l_uncor_bs0(l_uncor.bs.lane<0>());
791 		vfloat l_uncor_bs1(l_uncor.bs.lane<1>());
792 		vfloat l_uncor_bs2(l_uncor.bs.lane<2>());
793 		vfloat l_uncor_bs3(l_uncor.bs.lane<3>());
794 
795 		vfloat l_uncor_amod0(l_uncor.amod.lane<0>());
796 		vfloat l_uncor_amod1(l_uncor.amod.lane<1>());
797 		vfloat l_uncor_amod2(l_uncor.amod.lane<2>());
798 		vfloat l_uncor_amod3(l_uncor.amod.lane<3>());
799 
800 		vfloat l_samec_bs0(l_samec.bs.lane<0>());
801 		vfloat l_samec_bs1(l_samec.bs.lane<1>());
802 		vfloat l_samec_bs2(l_samec.bs.lane<2>());
803 		vfloat l_samec_bs3(l_samec.bs.lane<3>());
804 
805 		assert(all(l_samec.amod == vfloat4(0.0f)));
806 
807 		vfloat uncor_loparamv(1e10f);
808 		vfloat uncor_hiparamv(-1e10f);
809 
810 		vfloat samec_loparamv(1e10f);
811 		vfloat samec_hiparamv(-1e10f);
812 
813 		vfloat ew_r(blk.channel_weight.lane<0>());
814 		vfloat ew_g(blk.channel_weight.lane<1>());
815 		vfloat ew_b(blk.channel_weight.lane<2>());
816 		vfloat ew_a(blk.channel_weight.lane<3>());
817 
818 		// This implementation over-shoots, but this is safe as we initialize the texel_indexes
819 		// array to extend the last value. This means min/max are not impacted, but we need to mask
820 		// out the dummy values when we compute the line weighting.
821 		vint lane_ids = vint::lane_id();
822 		for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
823 		{
824 			vmask mask = lane_ids < vint(texel_count);
825 			vint texel_idxs(&(texel_indexes[i]));
826 
827 			vfloat data_r = gatherf(blk.data_r, texel_idxs);
828 			vfloat data_g = gatherf(blk.data_g, texel_idxs);
829 			vfloat data_b = gatherf(blk.data_b, texel_idxs);
830 			vfloat data_a = gatherf(blk.data_a, texel_idxs);
831 
832 			vfloat uncor_param  = (data_r * l_uncor_bs0)
833 			                    + (data_g * l_uncor_bs1)
834 			                    + (data_b * l_uncor_bs2)
835 			                    + (data_a * l_uncor_bs3);
836 
837 			uncor_loparamv = min(uncor_param, uncor_loparamv);
838 			uncor_hiparamv = max(uncor_param, uncor_hiparamv);
839 
840 			vfloat uncor_dist0 = (l_uncor_amod0 - data_r)
841 			                   + (uncor_param * l_uncor_bs0);
842 			vfloat uncor_dist1 = (l_uncor_amod1 - data_g)
843 			                   + (uncor_param * l_uncor_bs1);
844 			vfloat uncor_dist2 = (l_uncor_amod2 - data_b)
845 			                   + (uncor_param * l_uncor_bs2);
846 			vfloat uncor_dist3 = (l_uncor_amod3 - data_a)
847 			                   + (uncor_param * l_uncor_bs3);
848 
849 			vfloat uncor_err = (ew_r * uncor_dist0 * uncor_dist0)
850 			                 + (ew_g * uncor_dist1 * uncor_dist1)
851 			                 + (ew_b * uncor_dist2 * uncor_dist2)
852 			                 + (ew_a * uncor_dist3 * uncor_dist3);
853 
854 			haccumulate(uncor_errorsumv, uncor_err, mask);
855 
856 			// Process samechroma data
857 			vfloat samec_param = (data_r * l_samec_bs0)
858 			                   + (data_g * l_samec_bs1)
859 			                   + (data_b * l_samec_bs2)
860 			                   + (data_a * l_samec_bs3);
861 
862 			samec_loparamv = min(samec_param, samec_loparamv);
863 			samec_hiparamv = max(samec_param, samec_hiparamv);
864 
865 			vfloat samec_dist0 = samec_param * l_samec_bs0 - data_r;
866 			vfloat samec_dist1 = samec_param * l_samec_bs1 - data_g;
867 			vfloat samec_dist2 = samec_param * l_samec_bs2 - data_b;
868 			vfloat samec_dist3 = samec_param * l_samec_bs3 - data_a;
869 
870 			vfloat samec_err = (ew_r * samec_dist0 * samec_dist0)
871 			                 + (ew_g * samec_dist1 * samec_dist1)
872 			                 + (ew_b * samec_dist2 * samec_dist2)
873 			                 + (ew_a * samec_dist3 * samec_dist3);
874 
875 			haccumulate(samec_errorsumv, samec_err, mask);
876 
877 			lane_ids += vint(ASTCENC_SIMD_WIDTH);
878 		}
879 
880 		uncor_loparam = hmin_s(uncor_loparamv);
881 		uncor_hiparam = hmax_s(uncor_hiparamv);
882 
883 		samec_loparam = hmin_s(samec_loparamv);
884 		samec_hiparam = hmax_s(samec_hiparamv);
885 
886 		float uncor_linelen = uncor_hiparam - uncor_loparam;
887 		float samec_linelen = samec_hiparam - samec_loparam;
888 
889 		// Turn very small numbers and NaNs into a small number
890 		uncor_lengths[partition] = astc::max(uncor_linelen, 1e-7f);
891 		samec_lengths[partition] = astc::max(samec_linelen, 1e-7f);
892 	}
893 
894 	uncor_error = hadd_s(uncor_errorsumv);
895 	samec_error = hadd_s(samec_errorsumv);
896 }
897 
898 /* See header for documentation. */
compute_error_squared_rgb(const partition_info & pi,const image_block & blk,partition_lines3 plines[BLOCK_MAX_PARTITIONS],float & uncor_error,float & samec_error)899 void compute_error_squared_rgb(
900 	const partition_info& pi,
901 	const image_block& blk,
902 	partition_lines3 plines[BLOCK_MAX_PARTITIONS],
903 	float& uncor_error,
904 	float& samec_error
905 ) {
906 	unsigned int partition_count = pi.partition_count;
907 	promise(partition_count > 0);
908 
909 	vfloatacc uncor_errorsumv = vfloatacc::zero();
910 	vfloatacc samec_errorsumv = vfloatacc::zero();
911 
912 	for (unsigned int partition = 0; partition < partition_count; partition++)
913 	{
914 		partition_lines3& pl = plines[partition];
915 		const uint8_t *texel_indexes = pi.texels_of_partition[partition];
916 		unsigned int texel_count = pi.partition_texel_count[partition];
917 		promise(texel_count > 0);
918 
919 		float uncor_loparam = 1e10f;
920 		float uncor_hiparam = -1e10f;
921 
922 		float samec_loparam = 1e10f;
923 		float samec_hiparam = -1e10f;
924 
925 		processed_line3 l_uncor = pl.uncor_pline;
926 		processed_line3 l_samec = pl.samec_pline;
927 
928 		// This implementation is an example vectorization of this function.
929 		// It works for - the codec is a 2-4% faster than not vectorizing - but
930 		// the benefit is limited by the use of gathers and register pressure
931 
932 		// Vectorize some useful scalar inputs
933 		vfloat l_uncor_bs0(l_uncor.bs.lane<0>());
934 		vfloat l_uncor_bs1(l_uncor.bs.lane<1>());
935 		vfloat l_uncor_bs2(l_uncor.bs.lane<2>());
936 
937 		vfloat l_uncor_amod0(l_uncor.amod.lane<0>());
938 		vfloat l_uncor_amod1(l_uncor.amod.lane<1>());
939 		vfloat l_uncor_amod2(l_uncor.amod.lane<2>());
940 
941 		vfloat l_samec_bs0(l_samec.bs.lane<0>());
942 		vfloat l_samec_bs1(l_samec.bs.lane<1>());
943 		vfloat l_samec_bs2(l_samec.bs.lane<2>());
944 
945 		assert(all(l_samec.amod == vfloat4(0.0f)));
946 
947 		vfloat uncor_loparamv(1e10f);
948 		vfloat uncor_hiparamv(-1e10f);
949 
950 		vfloat samec_loparamv(1e10f);
951 		vfloat samec_hiparamv(-1e10f);
952 
953 		vfloat ew_r(blk.channel_weight.lane<0>());
954 		vfloat ew_g(blk.channel_weight.lane<1>());
955 		vfloat ew_b(blk.channel_weight.lane<2>());
956 
957 		// This implementation over-shoots, but this is safe as we initialize the weights array
958 		// to extend the last value. This means min/max are not impacted, but we need to mask
959 		// out the dummy values when we compute the line weighting.
960 		vint lane_ids = vint::lane_id();
961 		for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
962 		{
963 			vmask mask = lane_ids < vint(texel_count);
964 			vint texel_idxs(&(texel_indexes[i]));
965 
966 			vfloat data_r = gatherf(blk.data_r, texel_idxs);
967 			vfloat data_g = gatherf(blk.data_g, texel_idxs);
968 			vfloat data_b = gatherf(blk.data_b, texel_idxs);
969 
970 			vfloat uncor_param  = (data_r * l_uncor_bs0)
971 			                    + (data_g * l_uncor_bs1)
972 			                    + (data_b * l_uncor_bs2);
973 
974 			uncor_loparamv = min(uncor_param, uncor_loparamv);
975 			uncor_hiparamv = max(uncor_param, uncor_hiparamv);
976 
977 			vfloat uncor_dist0 = (l_uncor_amod0 - data_r)
978 			                   + (uncor_param * l_uncor_bs0);
979 			vfloat uncor_dist1 = (l_uncor_amod1 - data_g)
980 			                   + (uncor_param * l_uncor_bs1);
981 			vfloat uncor_dist2 = (l_uncor_amod2 - data_b)
982 			                   + (uncor_param * l_uncor_bs2);
983 
984 			vfloat uncor_err = (ew_r * uncor_dist0 * uncor_dist0)
985 			                 + (ew_g * uncor_dist1 * uncor_dist1)
986 			                 + (ew_b * uncor_dist2 * uncor_dist2);
987 
988 			haccumulate(uncor_errorsumv, uncor_err, mask);
989 
990 			// Process samechroma data
991 			vfloat samec_param = (data_r * l_samec_bs0)
992 			                   + (data_g * l_samec_bs1)
993 			                   + (data_b * l_samec_bs2);
994 
995 			samec_loparamv = min(samec_param, samec_loparamv);
996 			samec_hiparamv = max(samec_param, samec_hiparamv);
997 
998 			vfloat samec_dist0 = samec_param * l_samec_bs0 - data_r;
999 			vfloat samec_dist1 = samec_param * l_samec_bs1 - data_g;
1000 			vfloat samec_dist2 = samec_param * l_samec_bs2 - data_b;
1001 
1002 			vfloat samec_err = (ew_r * samec_dist0 * samec_dist0)
1003 			                 + (ew_g * samec_dist1 * samec_dist1)
1004 			                 + (ew_b * samec_dist2 * samec_dist2);
1005 
1006 			haccumulate(samec_errorsumv, samec_err, mask);
1007 
1008 			lane_ids += vint(ASTCENC_SIMD_WIDTH);
1009 		}
1010 
1011 		uncor_loparam = hmin_s(uncor_loparamv);
1012 		uncor_hiparam = hmax_s(uncor_hiparamv);
1013 
1014 		samec_loparam = hmin_s(samec_loparamv);
1015 		samec_hiparam = hmax_s(samec_hiparamv);
1016 
1017 		float uncor_linelen = uncor_hiparam - uncor_loparam;
1018 		float samec_linelen = samec_hiparam - samec_loparam;
1019 
1020 		// Turn very small numbers and NaNs into a small number
1021 		pl.uncor_line_len = astc::max(uncor_linelen, 1e-7f);
1022 		pl.samec_line_len = astc::max(samec_linelen, 1e-7f);
1023 	}
1024 
1025 	uncor_error = hadd_s(uncor_errorsumv);
1026 	samec_error = hadd_s(samec_errorsumv);
1027 }
1028 
1029 #endif
1030