1 // SPDX-License-Identifier: Apache-2.0
2 // ----------------------------------------------------------------------------
3 // Copyright 2011-2022 Arm Limited
4 //
5 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
6 // use this file except in compliance with the License. You may obtain a copy
7 // of the License at:
8 //
9 // http://www.apache.org/licenses/LICENSE-2.0
10 //
11 // Unless required by applicable law or agreed to in writing, software
12 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
13 // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
14 // License for the specific language governing permissions and limitations
15 // under the License.
16 // ----------------------------------------------------------------------------
17
18 /**
19 * @brief Functions for finding dominant direction of a set of colors.
20 */
21 #if !defined(ASTCENC_DECOMPRESS_ONLY)
22
23 #include "astcenc_internal.h"
24
25 #include <cassert>
26
27 /**
28 * @brief Compute the average RGB color of each partition.
29 *
30 * The algorithm here uses a vectorized sequential scan and per-partition
31 * color accumulators, using select() to mask texel lanes in other partitions.
32 *
33 * We only accumulate sums for N-1 partitions during the scan; the value for
34 * the last partition can be computed given that we know the block-wide average
35 * already.
36 *
37 * Because of this we could reduce the loop iteration count so it "just" spans
38 * the max texel index needed for the N-1 partitions, which could need fewer
39 * iterations than the full block texel count. However, this makes the loop
40 * count erratic and causes more branch mispredictions so is a net loss.
41 *
42 * @param pi The partitioning to use.
43 * @param blk The block data to process.
44 * @param[out] averages The output averages. Unused partition indices will
45 * not be initialized, and lane<3> will be zero.
46 */
compute_partition_averages_rgb(const partition_info & pi,const image_block & blk,vfloat4 averages[BLOCK_MAX_PARTITIONS])47 static void compute_partition_averages_rgb(
48 const partition_info& pi,
49 const image_block& blk,
50 vfloat4 averages[BLOCK_MAX_PARTITIONS]
51 ) {
52 unsigned int partition_count = pi.partition_count;
53 unsigned int texel_count = blk.texel_count;
54 promise(texel_count > 0);
55
56 // For 1 partition just use the precomputed mean
57 if (partition_count == 1)
58 {
59 averages[0] = blk.data_mean.swz<0, 1, 2>();
60 }
61 // For 2 partitions scan results for partition 0, compute partition 1
62 else if (partition_count == 2)
63 {
64 vfloatacc pp_avg_rgb[3] {};
65
66 vint lane_id = vint::lane_id();
67 for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
68 {
69 vint texel_partition(pi.partition_of_texel + i);
70
71 vmask lane_mask = lane_id < vint(texel_count);
72 lane_id += vint(ASTCENC_SIMD_WIDTH);
73
74 vmask p0_mask = lane_mask & (texel_partition == vint(0));
75
76 vfloat data_r = loada(blk.data_r + i);
77 haccumulate(pp_avg_rgb[0], data_r, p0_mask);
78
79 vfloat data_g = loada(blk.data_g + i);
80 haccumulate(pp_avg_rgb[1], data_g, p0_mask);
81
82 vfloat data_b = loada(blk.data_b + i);
83 haccumulate(pp_avg_rgb[2], data_b, p0_mask);
84 }
85
86 vfloat4 block_total = blk.data_mean.swz<0, 1, 2>() * static_cast<float>(blk.texel_count);
87
88 vfloat4 p0_total = vfloat3(hadd_s(pp_avg_rgb[0]),
89 hadd_s(pp_avg_rgb[1]),
90 hadd_s(pp_avg_rgb[2]));
91
92 vfloat4 p1_total = block_total - p0_total;
93
94 averages[0] = p0_total / static_cast<float>(pi.partition_texel_count[0]);
95 averages[1] = p1_total / static_cast<float>(pi.partition_texel_count[1]);
96 }
97 // For 3 partitions scan results for partition 0/1, compute partition 2
98 else if (partition_count == 3)
99 {
100 vfloatacc pp_avg_rgb[2][3] {};
101
102 vint lane_id = vint::lane_id();
103 for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
104 {
105 vint texel_partition(pi.partition_of_texel + i);
106
107 vmask lane_mask = lane_id < vint(texel_count);
108 lane_id += vint(ASTCENC_SIMD_WIDTH);
109
110 vmask p0_mask = lane_mask & (texel_partition == vint(0));
111 vmask p1_mask = lane_mask & (texel_partition == vint(1));
112
113 vfloat data_r = loada(blk.data_r + i);
114 haccumulate(pp_avg_rgb[0][0], data_r, p0_mask);
115 haccumulate(pp_avg_rgb[1][0], data_r, p1_mask);
116
117 vfloat data_g = loada(blk.data_g + i);
118 haccumulate(pp_avg_rgb[0][1], data_g, p0_mask);
119 haccumulate(pp_avg_rgb[1][1], data_g, p1_mask);
120
121 vfloat data_b = loada(blk.data_b + i);
122 haccumulate(pp_avg_rgb[0][2], data_b, p0_mask);
123 haccumulate(pp_avg_rgb[1][2], data_b, p1_mask);
124 }
125
126 vfloat4 block_total = blk.data_mean.swz<0, 1, 2>() * static_cast<float>(blk.texel_count);
127
128 vfloat4 p0_total = vfloat3(hadd_s(pp_avg_rgb[0][0]),
129 hadd_s(pp_avg_rgb[0][1]),
130 hadd_s(pp_avg_rgb[0][2]));
131
132 vfloat4 p1_total = vfloat3(hadd_s(pp_avg_rgb[1][0]),
133 hadd_s(pp_avg_rgb[1][1]),
134 hadd_s(pp_avg_rgb[1][2]));
135
136 vfloat4 p2_total = block_total - p0_total - p1_total;
137
138 averages[0] = p0_total / static_cast<float>(pi.partition_texel_count[0]);
139 averages[1] = p1_total / static_cast<float>(pi.partition_texel_count[1]);
140 averages[2] = p2_total / static_cast<float>(pi.partition_texel_count[2]);
141 }
142 else
143 {
144 // For 4 partitions scan results for partition 0/1/2, compute partition 3
145 vfloatacc pp_avg_rgb[3][3] {};
146
147 vint lane_id = vint::lane_id();
148 for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
149 {
150 vint texel_partition(pi.partition_of_texel + i);
151
152 vmask lane_mask = lane_id < vint(texel_count);
153 lane_id += vint(ASTCENC_SIMD_WIDTH);
154
155 vmask p0_mask = lane_mask & (texel_partition == vint(0));
156 vmask p1_mask = lane_mask & (texel_partition == vint(1));
157 vmask p2_mask = lane_mask & (texel_partition == vint(2));
158
159 vfloat data_r = loada(blk.data_r + i);
160 haccumulate(pp_avg_rgb[0][0], data_r, p0_mask);
161 haccumulate(pp_avg_rgb[1][0], data_r, p1_mask);
162 haccumulate(pp_avg_rgb[2][0], data_r, p2_mask);
163
164 vfloat data_g = loada(blk.data_g + i);
165 haccumulate(pp_avg_rgb[0][1], data_g, p0_mask);
166 haccumulate(pp_avg_rgb[1][1], data_g, p1_mask);
167 haccumulate(pp_avg_rgb[2][1], data_g, p2_mask);
168
169 vfloat data_b = loada(blk.data_b + i);
170 haccumulate(pp_avg_rgb[0][2], data_b, p0_mask);
171 haccumulate(pp_avg_rgb[1][2], data_b, p1_mask);
172 haccumulate(pp_avg_rgb[2][2], data_b, p2_mask);
173 }
174
175 vfloat4 block_total = blk.data_mean.swz<0, 1, 2>() * static_cast<float>(blk.texel_count);
176
177 vfloat4 p0_total = vfloat3(hadd_s(pp_avg_rgb[0][0]),
178 hadd_s(pp_avg_rgb[0][1]),
179 hadd_s(pp_avg_rgb[0][2]));
180
181 vfloat4 p1_total = vfloat3(hadd_s(pp_avg_rgb[1][0]),
182 hadd_s(pp_avg_rgb[1][1]),
183 hadd_s(pp_avg_rgb[1][2]));
184
185 vfloat4 p2_total = vfloat3(hadd_s(pp_avg_rgb[2][0]),
186 hadd_s(pp_avg_rgb[2][1]),
187 hadd_s(pp_avg_rgb[2][2]));
188
189 vfloat4 p3_total = block_total - p0_total - p1_total- p2_total;
190
191 averages[0] = p0_total / static_cast<float>(pi.partition_texel_count[0]);
192 averages[1] = p1_total / static_cast<float>(pi.partition_texel_count[1]);
193 averages[2] = p2_total / static_cast<float>(pi.partition_texel_count[2]);
194 averages[3] = p3_total / static_cast<float>(pi.partition_texel_count[3]);
195 }
196 }
197
198 /**
199 * @brief Compute the average RGBA color of each partition.
200 *
201 * The algorithm here uses a vectorized sequential scan and per-partition
202 * color accumulators, using select() to mask texel lanes in other partitions.
203 *
204 * We only accumulate sums for N-1 partitions during the scan; the value for
205 * the last partition can be computed given that we know the block-wide average
206 * already.
207 *
208 * Because of this we could reduce the loop iteration count so it "just" spans
209 * the max texel index needed for the N-1 partitions, which could need fewer
210 * iterations than the full block texel count. However, this makes the loop
211 * count erratic and causes more branch mispredictions so is a net loss.
212 *
213 * @param pi The partitioning to use.
214 * @param blk The block data to process.
215 * @param[out] averages The output averages. Unused partition indices will
216 * not be initialized.
217 */
compute_partition_averages_rgba(const partition_info & pi,const image_block & blk,vfloat4 averages[BLOCK_MAX_PARTITIONS])218 static void compute_partition_averages_rgba(
219 const partition_info& pi,
220 const image_block& blk,
221 vfloat4 averages[BLOCK_MAX_PARTITIONS]
222 ) {
223 unsigned int partition_count = pi.partition_count;
224 unsigned int texel_count = blk.texel_count;
225 promise(texel_count > 0);
226
227 // For 1 partition just use the precomputed mean
228 if (partition_count == 1)
229 {
230 averages[0] = blk.data_mean;
231 }
232 // For 2 partitions scan results for partition 0, compute partition 1
233 else if (partition_count == 2)
234 {
235 vfloat4 pp_avg_rgba[4] {};
236
237 vint lane_id = vint::lane_id();
238 for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
239 {
240 vint texel_partition(pi.partition_of_texel + i);
241
242 vmask lane_mask = lane_id < vint(texel_count);
243 lane_id += vint(ASTCENC_SIMD_WIDTH);
244
245 vmask p0_mask = lane_mask & (texel_partition == vint(0));
246
247 vfloat data_r = loada(blk.data_r + i);
248 haccumulate(pp_avg_rgba[0], data_r, p0_mask);
249
250 vfloat data_g = loada(blk.data_g + i);
251 haccumulate(pp_avg_rgba[1], data_g, p0_mask);
252
253 vfloat data_b = loada(blk.data_b + i);
254 haccumulate(pp_avg_rgba[2], data_b, p0_mask);
255
256 vfloat data_a = loada(blk.data_a + i);
257 haccumulate(pp_avg_rgba[3], data_a, p0_mask);
258 }
259
260 vfloat4 block_total = blk.data_mean * static_cast<float>(blk.texel_count);
261
262 vfloat4 p0_total = vfloat4(hadd_s(pp_avg_rgba[0]),
263 hadd_s(pp_avg_rgba[1]),
264 hadd_s(pp_avg_rgba[2]),
265 hadd_s(pp_avg_rgba[3]));
266
267 vfloat4 p1_total = block_total - p0_total;
268
269 averages[0] = p0_total / static_cast<float>(pi.partition_texel_count[0]);
270 averages[1] = p1_total / static_cast<float>(pi.partition_texel_count[1]);
271 }
272 // For 3 partitions scan results for partition 0/1, compute partition 2
273 else if (partition_count == 3)
274 {
275 vfloat4 pp_avg_rgba[2][4] {};
276
277 vint lane_id = vint::lane_id();
278 for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
279 {
280 vint texel_partition(pi.partition_of_texel + i);
281
282 vmask lane_mask = lane_id < vint(texel_count);
283 lane_id += vint(ASTCENC_SIMD_WIDTH);
284
285 vmask p0_mask = lane_mask & (texel_partition == vint(0));
286 vmask p1_mask = lane_mask & (texel_partition == vint(1));
287
288 vfloat data_r = loada(blk.data_r + i);
289 haccumulate(pp_avg_rgba[0][0], data_r, p0_mask);
290 haccumulate(pp_avg_rgba[1][0], data_r, p1_mask);
291
292 vfloat data_g = loada(blk.data_g + i);
293 haccumulate(pp_avg_rgba[0][1], data_g, p0_mask);
294 haccumulate(pp_avg_rgba[1][1], data_g, p1_mask);
295
296 vfloat data_b = loada(blk.data_b + i);
297 haccumulate(pp_avg_rgba[0][2], data_b, p0_mask);
298 haccumulate(pp_avg_rgba[1][2], data_b, p1_mask);
299
300 vfloat data_a = loada(blk.data_a + i);
301 haccumulate(pp_avg_rgba[0][3], data_a, p0_mask);
302 haccumulate(pp_avg_rgba[1][3], data_a, p1_mask);
303 }
304
305 vfloat4 block_total = blk.data_mean * static_cast<float>(blk.texel_count);
306
307 vfloat4 p0_total = vfloat4(hadd_s(pp_avg_rgba[0][0]),
308 hadd_s(pp_avg_rgba[0][1]),
309 hadd_s(pp_avg_rgba[0][2]),
310 hadd_s(pp_avg_rgba[0][3]));
311
312 vfloat4 p1_total = vfloat4(hadd_s(pp_avg_rgba[1][0]),
313 hadd_s(pp_avg_rgba[1][1]),
314 hadd_s(pp_avg_rgba[1][2]),
315 hadd_s(pp_avg_rgba[1][3]));
316
317 vfloat4 p2_total = block_total - p0_total - p1_total;
318
319 averages[0] = p0_total / static_cast<float>(pi.partition_texel_count[0]);
320 averages[1] = p1_total / static_cast<float>(pi.partition_texel_count[1]);
321 averages[2] = p2_total / static_cast<float>(pi.partition_texel_count[2]);
322 }
323 else
324 {
325 // For 4 partitions scan results for partition 0/1/2, compute partition 3
326 vfloat4 pp_avg_rgba[3][4] {};
327
328 vint lane_id = vint::lane_id();
329 for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
330 {
331 vint texel_partition(pi.partition_of_texel + i);
332
333 vmask lane_mask = lane_id < vint(texel_count);
334 lane_id += vint(ASTCENC_SIMD_WIDTH);
335
336 vmask p0_mask = lane_mask & (texel_partition == vint(0));
337 vmask p1_mask = lane_mask & (texel_partition == vint(1));
338 vmask p2_mask = lane_mask & (texel_partition == vint(2));
339
340 vfloat data_r = loada(blk.data_r + i);
341 haccumulate(pp_avg_rgba[0][0], data_r, p0_mask);
342 haccumulate(pp_avg_rgba[1][0], data_r, p1_mask);
343 haccumulate(pp_avg_rgba[2][0], data_r, p2_mask);
344
345 vfloat data_g = loada(blk.data_g + i);
346 haccumulate(pp_avg_rgba[0][1], data_g, p0_mask);
347 haccumulate(pp_avg_rgba[1][1], data_g, p1_mask);
348 haccumulate(pp_avg_rgba[2][1], data_g, p2_mask);
349
350 vfloat data_b = loada(blk.data_b + i);
351 haccumulate(pp_avg_rgba[0][2], data_b, p0_mask);
352 haccumulate(pp_avg_rgba[1][2], data_b, p1_mask);
353 haccumulate(pp_avg_rgba[2][2], data_b, p2_mask);
354
355 vfloat data_a = loada(blk.data_a + i);
356 haccumulate(pp_avg_rgba[0][3], data_a, p0_mask);
357 haccumulate(pp_avg_rgba[1][3], data_a, p1_mask);
358 haccumulate(pp_avg_rgba[2][3], data_a, p2_mask);
359 }
360
361 vfloat4 block_total = blk.data_mean * static_cast<float>(blk.texel_count);
362
363 vfloat4 p0_total = vfloat4(hadd_s(pp_avg_rgba[0][0]),
364 hadd_s(pp_avg_rgba[0][1]),
365 hadd_s(pp_avg_rgba[0][2]),
366 hadd_s(pp_avg_rgba[0][3]));
367
368 vfloat4 p1_total = vfloat4(hadd_s(pp_avg_rgba[1][0]),
369 hadd_s(pp_avg_rgba[1][1]),
370 hadd_s(pp_avg_rgba[1][2]),
371 hadd_s(pp_avg_rgba[1][3]));
372
373 vfloat4 p2_total = vfloat4(hadd_s(pp_avg_rgba[2][0]),
374 hadd_s(pp_avg_rgba[2][1]),
375 hadd_s(pp_avg_rgba[2][2]),
376 hadd_s(pp_avg_rgba[2][3]));
377
378 vfloat4 p3_total = block_total - p0_total - p1_total- p2_total;
379
380 averages[0] = p0_total / static_cast<float>(pi.partition_texel_count[0]);
381 averages[1] = p1_total / static_cast<float>(pi.partition_texel_count[1]);
382 averages[2] = p2_total / static_cast<float>(pi.partition_texel_count[2]);
383 averages[3] = p3_total / static_cast<float>(pi.partition_texel_count[3]);
384 }
385 }
386
387 /* See header for documentation. */
compute_avgs_and_dirs_4_comp(const partition_info & pi,const image_block & blk,partition_metrics pm[BLOCK_MAX_PARTITIONS])388 void compute_avgs_and_dirs_4_comp(
389 const partition_info& pi,
390 const image_block& blk,
391 partition_metrics pm[BLOCK_MAX_PARTITIONS]
392 ) {
393 float texel_weight = hadd_s(blk.channel_weight) / 4.0f;
394
395 int partition_count = pi.partition_count;
396 promise(partition_count > 0);
397
398 // Pre-compute partition_averages
399 vfloat4 partition_averages[BLOCK_MAX_PARTITIONS];
400 compute_partition_averages_rgba(pi, blk, partition_averages);
401
402 for (int partition = 0; partition < partition_count; partition++)
403 {
404 const uint8_t *texel_indexes = pi.texels_of_partition[partition];
405 unsigned int texel_count = pi.partition_texel_count[partition];
406 promise(texel_count > 0);
407
408 vfloat4 average = partition_averages[partition];
409 pm[partition].avg = average;
410
411 vfloat4 sum_xp = vfloat4::zero();
412 vfloat4 sum_yp = vfloat4::zero();
413 vfloat4 sum_zp = vfloat4::zero();
414 vfloat4 sum_wp = vfloat4::zero();
415
416 for (unsigned int i = 0; i < texel_count; i++)
417 {
418 unsigned int iwt = texel_indexes[i];
419 vfloat4 texel_datum = blk.texel(iwt);
420 texel_datum = texel_datum - average;
421
422 vfloat4 zero = vfloat4::zero();
423
424 vmask4 tdm0 = texel_datum.swz<0,0,0,0>() > zero;
425 sum_xp += select(zero, texel_datum, tdm0);
426
427 vmask4 tdm1 = texel_datum.swz<1,1,1,1>() > zero;
428 sum_yp += select(zero, texel_datum, tdm1);
429
430 vmask4 tdm2 = texel_datum.swz<2,2,2,2>() > zero;
431 sum_zp += select(zero, texel_datum, tdm2);
432
433 vmask4 tdm3 = texel_datum.swz<3,3,3,3>() > zero;
434 sum_wp += select(zero, texel_datum, tdm3);
435 }
436
437 sum_xp = sum_xp * texel_weight;
438 sum_yp = sum_yp * texel_weight;
439 sum_zp = sum_zp * texel_weight;
440 sum_wp = sum_wp * texel_weight;
441
442 vfloat4 prod_xp = dot(sum_xp, sum_xp);
443 vfloat4 prod_yp = dot(sum_yp, sum_yp);
444 vfloat4 prod_zp = dot(sum_zp, sum_zp);
445 vfloat4 prod_wp = dot(sum_wp, sum_wp);
446
447 vfloat4 best_vector = sum_xp;
448 vfloat4 best_sum = prod_xp;
449
450 vmask4 mask = prod_yp > best_sum;
451 best_vector = select(best_vector, sum_yp, mask);
452 best_sum = select(best_sum, prod_yp, mask);
453
454 mask = prod_zp > best_sum;
455 best_vector = select(best_vector, sum_zp, mask);
456 best_sum = select(best_sum, prod_zp, mask);
457
458 mask = prod_wp > best_sum;
459 best_vector = select(best_vector, sum_wp, mask);
460
461 pm[partition].dir = best_vector;
462 }
463 }
464
465 /* See header for documentation. */
compute_avgs_and_dirs_3_comp(const partition_info & pi,const image_block & blk,unsigned int omitted_component,partition_metrics pm[BLOCK_MAX_PARTITIONS])466 void compute_avgs_and_dirs_3_comp(
467 const partition_info& pi,
468 const image_block& blk,
469 unsigned int omitted_component,
470 partition_metrics pm[BLOCK_MAX_PARTITIONS]
471 ) {
472 // Pre-compute partition_averages
473 vfloat4 partition_averages[BLOCK_MAX_PARTITIONS];
474 compute_partition_averages_rgba(pi, blk, partition_averages);
475
476 float texel_weight = hadd_s(blk.channel_weight.swz<0, 1, 2>());
477
478 const float* data_vr = blk.data_r;
479 const float* data_vg = blk.data_g;
480 const float* data_vb = blk.data_b;
481
482 // TODO: Data-driven permute would be useful to avoid this ...
483 if (omitted_component == 0)
484 {
485 texel_weight = hadd_s(blk.channel_weight.swz<1, 2, 3>());
486
487 partition_averages[0] = partition_averages[0].swz<1, 2, 3>();
488 partition_averages[1] = partition_averages[1].swz<1, 2, 3>();
489 partition_averages[2] = partition_averages[2].swz<1, 2, 3>();
490 partition_averages[3] = partition_averages[3].swz<1, 2, 3>();
491
492 data_vr = blk.data_g;
493 data_vg = blk.data_b;
494 data_vb = blk.data_a;
495 }
496 else if (omitted_component == 1)
497 {
498 texel_weight = hadd_s(blk.channel_weight.swz<0, 2, 3>());
499
500 partition_averages[0] = partition_averages[0].swz<0, 2, 3>();
501 partition_averages[1] = partition_averages[1].swz<0, 2, 3>();
502 partition_averages[2] = partition_averages[2].swz<0, 2, 3>();
503 partition_averages[3] = partition_averages[3].swz<0, 2, 3>();
504
505 data_vg = blk.data_b;
506 data_vb = blk.data_a;
507 }
508 else if (omitted_component == 2)
509 {
510 texel_weight = hadd_s(blk.channel_weight.swz<0, 1, 3>());
511
512 partition_averages[0] = partition_averages[0].swz<0, 1, 3>();
513 partition_averages[1] = partition_averages[1].swz<0, 1, 3>();
514 partition_averages[2] = partition_averages[2].swz<0, 1, 3>();
515 partition_averages[3] = partition_averages[3].swz<0, 1, 3>();
516
517 data_vb = blk.data_a;
518 }
519 else
520 {
521 partition_averages[0] = partition_averages[0].swz<0, 1, 2>();
522 partition_averages[1] = partition_averages[1].swz<0, 1, 2>();
523 partition_averages[2] = partition_averages[2].swz<0, 1, 2>();
524 partition_averages[3] = partition_averages[3].swz<0, 1, 2>();
525 }
526
527 texel_weight = texel_weight * (1.0f / 3.0f);
528
529 unsigned int partition_count = pi.partition_count;
530 promise(partition_count > 0);
531
532 for (unsigned int partition = 0; partition < partition_count; partition++)
533 {
534 const uint8_t *texel_indexes = pi.texels_of_partition[partition];
535 unsigned int texel_count = pi.partition_texel_count[partition];
536 promise(texel_count > 0);
537
538 vfloat4 average = partition_averages[partition];
539 pm[partition].avg = average;
540
541 vfloat4 sum_xp = vfloat4::zero();
542 vfloat4 sum_yp = vfloat4::zero();
543 vfloat4 sum_zp = vfloat4::zero();
544
545 for (unsigned int i = 0; i < texel_count; i++)
546 {
547 unsigned int iwt = texel_indexes[i];
548
549 vfloat4 texel_datum = vfloat3(data_vr[iwt],
550 data_vg[iwt],
551 data_vb[iwt]);
552 texel_datum = texel_datum - average;
553
554 vfloat4 zero = vfloat4::zero();
555
556 vmask4 tdm0 = texel_datum.swz<0,0,0,0>() > zero;
557 sum_xp += select(zero, texel_datum, tdm0);
558
559 vmask4 tdm1 = texel_datum.swz<1,1,1,1>() > zero;
560 sum_yp += select(zero, texel_datum, tdm1);
561
562 vmask4 tdm2 = texel_datum.swz<2,2,2,2>() > zero;
563 sum_zp += select(zero, texel_datum, tdm2);
564 }
565
566 sum_xp = sum_xp * texel_weight;
567 sum_yp = sum_yp * texel_weight;
568 sum_zp = sum_zp * texel_weight;
569
570 vfloat4 prod_xp = dot(sum_xp, sum_xp);
571 vfloat4 prod_yp = dot(sum_yp, sum_yp);
572 vfloat4 prod_zp = dot(sum_zp, sum_zp);
573
574 vfloat4 best_vector = sum_xp;
575 vfloat4 best_sum = prod_xp;
576
577 vmask4 mask = prod_yp > best_sum;
578 best_vector = select(best_vector, sum_yp, mask);
579 best_sum = select(best_sum, prod_yp, mask);
580
581 mask = prod_zp > best_sum;
582 best_vector = select(best_vector, sum_zp, mask);
583
584 pm[partition].dir = best_vector;
585 }
586 }
587
588 /* See header for documentation. */
compute_avgs_and_dirs_3_comp_rgb(const partition_info & pi,const image_block & blk,partition_metrics pm[BLOCK_MAX_PARTITIONS])589 void compute_avgs_and_dirs_3_comp_rgb(
590 const partition_info& pi,
591 const image_block& blk,
592 partition_metrics pm[BLOCK_MAX_PARTITIONS]
593 ) {
594 float texel_weight = hadd_s(blk.channel_weight.swz<0, 1, 2>()) * (1.0f / 3.0f);
595
596 unsigned int partition_count = pi.partition_count;
597 promise(partition_count > 0);
598
599 // Pre-compute partition_averages
600 vfloat4 partition_averages[BLOCK_MAX_PARTITIONS];
601 compute_partition_averages_rgb(pi, blk, partition_averages);
602
603 for (unsigned int partition = 0; partition < partition_count; partition++)
604 {
605 const uint8_t *texel_indexes = pi.texels_of_partition[partition];
606 unsigned int texel_count = pi.partition_texel_count[partition];
607 promise(texel_count > 0);
608
609 vfloat4 average = partition_averages[partition];
610 pm[partition].avg = average;
611
612 vfloat4 sum_xp = vfloat4::zero();
613 vfloat4 sum_yp = vfloat4::zero();
614 vfloat4 sum_zp = vfloat4::zero();
615
616 for (unsigned int i = 0; i < texel_count; i++)
617 {
618 unsigned int iwt = texel_indexes[i];
619
620 vfloat4 texel_datum = blk.texel3(iwt);
621 texel_datum = texel_datum - average;
622
623 vfloat4 zero = vfloat4::zero();
624
625 vmask4 tdm0 = texel_datum.swz<0,0,0,0>() > zero;
626 sum_xp += select(zero, texel_datum, tdm0);
627
628 vmask4 tdm1 = texel_datum.swz<1,1,1,1>() > zero;
629 sum_yp += select(zero, texel_datum, tdm1);
630
631 vmask4 tdm2 = texel_datum.swz<2,2,2,2>() > zero;
632 sum_zp += select(zero, texel_datum, tdm2);
633 }
634
635 sum_xp = sum_xp * texel_weight;
636 sum_yp = sum_yp * texel_weight;
637 sum_zp = sum_zp * texel_weight;
638
639 vfloat4 prod_xp = dot(sum_xp, sum_xp);
640 vfloat4 prod_yp = dot(sum_yp, sum_yp);
641 vfloat4 prod_zp = dot(sum_zp, sum_zp);
642
643 vfloat4 best_vector = sum_xp;
644 vfloat4 best_sum = prod_xp;
645
646 vmask4 mask = prod_yp > best_sum;
647 best_vector = select(best_vector, sum_yp, mask);
648 best_sum = select(best_sum, prod_yp, mask);
649
650 mask = prod_zp > best_sum;
651 best_vector = select(best_vector, sum_zp, mask);
652
653 pm[partition].dir = best_vector;
654 }
655 }
656
657 /* See header for documentation. */
compute_avgs_and_dirs_2_comp(const partition_info & pt,const image_block & blk,unsigned int component1,unsigned int component2,partition_metrics pm[BLOCK_MAX_PARTITIONS])658 void compute_avgs_and_dirs_2_comp(
659 const partition_info& pt,
660 const image_block& blk,
661 unsigned int component1,
662 unsigned int component2,
663 partition_metrics pm[BLOCK_MAX_PARTITIONS]
664 ) {
665 float texel_weight;
666 vfloat4 average;
667
668 const float* data_vr = nullptr;
669 const float* data_vg = nullptr;
670
671 if (component1 == 0 && component2 == 1)
672 {
673 texel_weight = hadd_s(blk.channel_weight.swz<0, 1>()) / 2.0f;
674 average = blk.data_mean.swz<0, 1>();
675
676 data_vr = blk.data_r;
677 data_vg = blk.data_g;
678 }
679 else if (component1 == 0 && component2 == 2)
680 {
681 texel_weight = hadd_s(blk.channel_weight.swz<0, 2>()) / 2.0f;
682 average = blk.data_mean.swz<0, 2>();
683
684 data_vr = blk.data_r;
685 data_vg = blk.data_b;
686 }
687 else // (component1 == 1 && component2 == 2)
688 {
689 assert(component1 == 1 && component2 == 2);
690
691 texel_weight = hadd_s(blk.channel_weight.swz<1, 2>()) / 2.0f;
692 average = blk.data_mean.swz<1, 2>();
693
694 data_vr = blk.data_g;
695 data_vg = blk.data_b;
696 }
697
698 unsigned int partition_count = pt.partition_count;
699 promise(partition_count > 0);
700
701 for (unsigned int partition = 0; partition < partition_count; partition++)
702 {
703 const uint8_t *texel_indexes = pt.texels_of_partition[partition];
704 unsigned int texel_count = pt.partition_texel_count[partition];
705 promise(texel_count > 0);
706
707 // Only compute a partition mean if more than one partition
708 if (partition_count > 1)
709 {
710 average = vfloat4::zero();
711 for (unsigned int i = 0; i < texel_count; i++)
712 {
713 unsigned int iwt = texel_indexes[i];
714 average += vfloat2(data_vr[iwt], data_vg[iwt]);
715 }
716
717 average = average * (1.0f / static_cast<float>(texel_count));
718 }
719
720 pm[partition].avg = average;
721
722 vfloat4 sum_xp = vfloat4::zero();
723 vfloat4 sum_yp = vfloat4::zero();
724
725 for (unsigned int i = 0; i < texel_count; i++)
726 {
727 unsigned int iwt = texel_indexes[i];
728 vfloat4 texel_datum = vfloat2(data_vr[iwt], data_vg[iwt]);
729 texel_datum = texel_datum - average;
730
731 vfloat4 zero = vfloat4::zero();
732
733 vmask4 tdm0 = texel_datum.swz<0,0,0,0>() > zero;
734 sum_xp += select(zero, texel_datum, tdm0);
735
736 vmask4 tdm1 = texel_datum.swz<1,1,1,1>() > zero;
737 sum_yp += select(zero, texel_datum, tdm1);
738 }
739
740 sum_xp = sum_xp * texel_weight;
741 sum_yp = sum_yp * texel_weight;
742
743 vfloat4 prod_xp = dot(sum_xp, sum_xp);
744 vfloat4 prod_yp = dot(sum_yp, sum_yp);
745
746 vfloat4 best_vector = sum_xp;
747 vfloat4 best_sum = prod_xp;
748
749 vmask4 mask = prod_yp > best_sum;
750 best_vector = select(best_vector, sum_yp, mask);
751
752 pm[partition].dir = best_vector;
753 }
754 }
755
756 /* See header for documentation. */
compute_error_squared_rgba(const partition_info & pi,const image_block & blk,const processed_line4 uncor_plines[BLOCK_MAX_PARTITIONS],const processed_line4 samec_plines[BLOCK_MAX_PARTITIONS],float uncor_lengths[BLOCK_MAX_PARTITIONS],float samec_lengths[BLOCK_MAX_PARTITIONS],float & uncor_error,float & samec_error)757 void compute_error_squared_rgba(
758 const partition_info& pi,
759 const image_block& blk,
760 const processed_line4 uncor_plines[BLOCK_MAX_PARTITIONS],
761 const processed_line4 samec_plines[BLOCK_MAX_PARTITIONS],
762 float uncor_lengths[BLOCK_MAX_PARTITIONS],
763 float samec_lengths[BLOCK_MAX_PARTITIONS],
764 float& uncor_error,
765 float& samec_error
766 ) {
767 unsigned int partition_count = pi.partition_count;
768 promise(partition_count > 0);
769
770 vfloatacc uncor_errorsumv = vfloatacc::zero();
771 vfloatacc samec_errorsumv = vfloatacc::zero();
772
773 for (unsigned int partition = 0; partition < partition_count; partition++)
774 {
775 const uint8_t *texel_indexes = pi.texels_of_partition[partition];
776
777 float uncor_loparam = 1e10f;
778 float uncor_hiparam = -1e10f;
779
780 float samec_loparam = 1e10f;
781 float samec_hiparam = -1e10f;
782
783 processed_line4 l_uncor = uncor_plines[partition];
784 processed_line4 l_samec = samec_plines[partition];
785
786 unsigned int texel_count = pi.partition_texel_count[partition];
787 promise(texel_count > 0);
788
789 // Vectorize some useful scalar inputs
790 vfloat l_uncor_bs0(l_uncor.bs.lane<0>());
791 vfloat l_uncor_bs1(l_uncor.bs.lane<1>());
792 vfloat l_uncor_bs2(l_uncor.bs.lane<2>());
793 vfloat l_uncor_bs3(l_uncor.bs.lane<3>());
794
795 vfloat l_uncor_amod0(l_uncor.amod.lane<0>());
796 vfloat l_uncor_amod1(l_uncor.amod.lane<1>());
797 vfloat l_uncor_amod2(l_uncor.amod.lane<2>());
798 vfloat l_uncor_amod3(l_uncor.amod.lane<3>());
799
800 vfloat l_samec_bs0(l_samec.bs.lane<0>());
801 vfloat l_samec_bs1(l_samec.bs.lane<1>());
802 vfloat l_samec_bs2(l_samec.bs.lane<2>());
803 vfloat l_samec_bs3(l_samec.bs.lane<3>());
804
805 assert(all(l_samec.amod == vfloat4(0.0f)));
806
807 vfloat uncor_loparamv(1e10f);
808 vfloat uncor_hiparamv(-1e10f);
809
810 vfloat samec_loparamv(1e10f);
811 vfloat samec_hiparamv(-1e10f);
812
813 vfloat ew_r(blk.channel_weight.lane<0>());
814 vfloat ew_g(blk.channel_weight.lane<1>());
815 vfloat ew_b(blk.channel_weight.lane<2>());
816 vfloat ew_a(blk.channel_weight.lane<3>());
817
818 // This implementation over-shoots, but this is safe as we initialize the texel_indexes
819 // array to extend the last value. This means min/max are not impacted, but we need to mask
820 // out the dummy values when we compute the line weighting.
821 vint lane_ids = vint::lane_id();
822 for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
823 {
824 vmask mask = lane_ids < vint(texel_count);
825 vint texel_idxs(&(texel_indexes[i]));
826
827 vfloat data_r = gatherf(blk.data_r, texel_idxs);
828 vfloat data_g = gatherf(blk.data_g, texel_idxs);
829 vfloat data_b = gatherf(blk.data_b, texel_idxs);
830 vfloat data_a = gatherf(blk.data_a, texel_idxs);
831
832 vfloat uncor_param = (data_r * l_uncor_bs0)
833 + (data_g * l_uncor_bs1)
834 + (data_b * l_uncor_bs2)
835 + (data_a * l_uncor_bs3);
836
837 uncor_loparamv = min(uncor_param, uncor_loparamv);
838 uncor_hiparamv = max(uncor_param, uncor_hiparamv);
839
840 vfloat uncor_dist0 = (l_uncor_amod0 - data_r)
841 + (uncor_param * l_uncor_bs0);
842 vfloat uncor_dist1 = (l_uncor_amod1 - data_g)
843 + (uncor_param * l_uncor_bs1);
844 vfloat uncor_dist2 = (l_uncor_amod2 - data_b)
845 + (uncor_param * l_uncor_bs2);
846 vfloat uncor_dist3 = (l_uncor_amod3 - data_a)
847 + (uncor_param * l_uncor_bs3);
848
849 vfloat uncor_err = (ew_r * uncor_dist0 * uncor_dist0)
850 + (ew_g * uncor_dist1 * uncor_dist1)
851 + (ew_b * uncor_dist2 * uncor_dist2)
852 + (ew_a * uncor_dist3 * uncor_dist3);
853
854 haccumulate(uncor_errorsumv, uncor_err, mask);
855
856 // Process samechroma data
857 vfloat samec_param = (data_r * l_samec_bs0)
858 + (data_g * l_samec_bs1)
859 + (data_b * l_samec_bs2)
860 + (data_a * l_samec_bs3);
861
862 samec_loparamv = min(samec_param, samec_loparamv);
863 samec_hiparamv = max(samec_param, samec_hiparamv);
864
865 vfloat samec_dist0 = samec_param * l_samec_bs0 - data_r;
866 vfloat samec_dist1 = samec_param * l_samec_bs1 - data_g;
867 vfloat samec_dist2 = samec_param * l_samec_bs2 - data_b;
868 vfloat samec_dist3 = samec_param * l_samec_bs3 - data_a;
869
870 vfloat samec_err = (ew_r * samec_dist0 * samec_dist0)
871 + (ew_g * samec_dist1 * samec_dist1)
872 + (ew_b * samec_dist2 * samec_dist2)
873 + (ew_a * samec_dist3 * samec_dist3);
874
875 haccumulate(samec_errorsumv, samec_err, mask);
876
877 lane_ids += vint(ASTCENC_SIMD_WIDTH);
878 }
879
880 uncor_loparam = hmin_s(uncor_loparamv);
881 uncor_hiparam = hmax_s(uncor_hiparamv);
882
883 samec_loparam = hmin_s(samec_loparamv);
884 samec_hiparam = hmax_s(samec_hiparamv);
885
886 float uncor_linelen = uncor_hiparam - uncor_loparam;
887 float samec_linelen = samec_hiparam - samec_loparam;
888
889 // Turn very small numbers and NaNs into a small number
890 uncor_lengths[partition] = astc::max(uncor_linelen, 1e-7f);
891 samec_lengths[partition] = astc::max(samec_linelen, 1e-7f);
892 }
893
894 uncor_error = hadd_s(uncor_errorsumv);
895 samec_error = hadd_s(samec_errorsumv);
896 }
897
898 /* See header for documentation. */
compute_error_squared_rgb(const partition_info & pi,const image_block & blk,partition_lines3 plines[BLOCK_MAX_PARTITIONS],float & uncor_error,float & samec_error)899 void compute_error_squared_rgb(
900 const partition_info& pi,
901 const image_block& blk,
902 partition_lines3 plines[BLOCK_MAX_PARTITIONS],
903 float& uncor_error,
904 float& samec_error
905 ) {
906 unsigned int partition_count = pi.partition_count;
907 promise(partition_count > 0);
908
909 vfloatacc uncor_errorsumv = vfloatacc::zero();
910 vfloatacc samec_errorsumv = vfloatacc::zero();
911
912 for (unsigned int partition = 0; partition < partition_count; partition++)
913 {
914 partition_lines3& pl = plines[partition];
915 const uint8_t *texel_indexes = pi.texels_of_partition[partition];
916 unsigned int texel_count = pi.partition_texel_count[partition];
917 promise(texel_count > 0);
918
919 float uncor_loparam = 1e10f;
920 float uncor_hiparam = -1e10f;
921
922 float samec_loparam = 1e10f;
923 float samec_hiparam = -1e10f;
924
925 processed_line3 l_uncor = pl.uncor_pline;
926 processed_line3 l_samec = pl.samec_pline;
927
928 // This implementation is an example vectorization of this function.
929 // It works for - the codec is a 2-4% faster than not vectorizing - but
930 // the benefit is limited by the use of gathers and register pressure
931
932 // Vectorize some useful scalar inputs
933 vfloat l_uncor_bs0(l_uncor.bs.lane<0>());
934 vfloat l_uncor_bs1(l_uncor.bs.lane<1>());
935 vfloat l_uncor_bs2(l_uncor.bs.lane<2>());
936
937 vfloat l_uncor_amod0(l_uncor.amod.lane<0>());
938 vfloat l_uncor_amod1(l_uncor.amod.lane<1>());
939 vfloat l_uncor_amod2(l_uncor.amod.lane<2>());
940
941 vfloat l_samec_bs0(l_samec.bs.lane<0>());
942 vfloat l_samec_bs1(l_samec.bs.lane<1>());
943 vfloat l_samec_bs2(l_samec.bs.lane<2>());
944
945 assert(all(l_samec.amod == vfloat4(0.0f)));
946
947 vfloat uncor_loparamv(1e10f);
948 vfloat uncor_hiparamv(-1e10f);
949
950 vfloat samec_loparamv(1e10f);
951 vfloat samec_hiparamv(-1e10f);
952
953 vfloat ew_r(blk.channel_weight.lane<0>());
954 vfloat ew_g(blk.channel_weight.lane<1>());
955 vfloat ew_b(blk.channel_weight.lane<2>());
956
957 // This implementation over-shoots, but this is safe as we initialize the weights array
958 // to extend the last value. This means min/max are not impacted, but we need to mask
959 // out the dummy values when we compute the line weighting.
960 vint lane_ids = vint::lane_id();
961 for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
962 {
963 vmask mask = lane_ids < vint(texel_count);
964 vint texel_idxs(&(texel_indexes[i]));
965
966 vfloat data_r = gatherf(blk.data_r, texel_idxs);
967 vfloat data_g = gatherf(blk.data_g, texel_idxs);
968 vfloat data_b = gatherf(blk.data_b, texel_idxs);
969
970 vfloat uncor_param = (data_r * l_uncor_bs0)
971 + (data_g * l_uncor_bs1)
972 + (data_b * l_uncor_bs2);
973
974 uncor_loparamv = min(uncor_param, uncor_loparamv);
975 uncor_hiparamv = max(uncor_param, uncor_hiparamv);
976
977 vfloat uncor_dist0 = (l_uncor_amod0 - data_r)
978 + (uncor_param * l_uncor_bs0);
979 vfloat uncor_dist1 = (l_uncor_amod1 - data_g)
980 + (uncor_param * l_uncor_bs1);
981 vfloat uncor_dist2 = (l_uncor_amod2 - data_b)
982 + (uncor_param * l_uncor_bs2);
983
984 vfloat uncor_err = (ew_r * uncor_dist0 * uncor_dist0)
985 + (ew_g * uncor_dist1 * uncor_dist1)
986 + (ew_b * uncor_dist2 * uncor_dist2);
987
988 haccumulate(uncor_errorsumv, uncor_err, mask);
989
990 // Process samechroma data
991 vfloat samec_param = (data_r * l_samec_bs0)
992 + (data_g * l_samec_bs1)
993 + (data_b * l_samec_bs2);
994
995 samec_loparamv = min(samec_param, samec_loparamv);
996 samec_hiparamv = max(samec_param, samec_hiparamv);
997
998 vfloat samec_dist0 = samec_param * l_samec_bs0 - data_r;
999 vfloat samec_dist1 = samec_param * l_samec_bs1 - data_g;
1000 vfloat samec_dist2 = samec_param * l_samec_bs2 - data_b;
1001
1002 vfloat samec_err = (ew_r * samec_dist0 * samec_dist0)
1003 + (ew_g * samec_dist1 * samec_dist1)
1004 + (ew_b * samec_dist2 * samec_dist2);
1005
1006 haccumulate(samec_errorsumv, samec_err, mask);
1007
1008 lane_ids += vint(ASTCENC_SIMD_WIDTH);
1009 }
1010
1011 uncor_loparam = hmin_s(uncor_loparamv);
1012 uncor_hiparam = hmax_s(uncor_hiparamv);
1013
1014 samec_loparam = hmin_s(samec_loparamv);
1015 samec_hiparam = hmax_s(samec_hiparamv);
1016
1017 float uncor_linelen = uncor_hiparam - uncor_loparam;
1018 float samec_linelen = samec_hiparam - samec_loparam;
1019
1020 // Turn very small numbers and NaNs into a small number
1021 pl.uncor_line_len = astc::max(uncor_linelen, 1e-7f);
1022 pl.samec_line_len = astc::max(samec_linelen, 1e-7f);
1023 }
1024
1025 uncor_error = hadd_s(uncor_errorsumv);
1026 samec_error = hadd_s(samec_errorsumv);
1027 }
1028
1029 #endif
1030