• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // SPDX-License-Identifier: Apache-2.0
2 // ----------------------------------------------------------------------------
3 // Copyright 2011-2023 Arm Limited
4 //
5 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
6 // use this file except in compliance with the License. You may obtain a copy
7 // of the License at:
8 //
9 //     http://www.apache.org/licenses/LICENSE-2.0
10 //
11 // Unless required by applicable law or agreed to in writing, software
12 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
13 // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
14 // License for the specific language governing permissions and limitations
15 // under the License.
16 // ----------------------------------------------------------------------------
17 
18 #if !defined(ASTCENC_DECOMPRESS_ONLY)
19 
20 /**
21  * @brief Functions for finding best partition for a block.
22  *
23  * The partition search operates in two stages. The first pass uses kmeans clustering to group
24  * texels into an ideal partitioning for the requested partition count, and then compares that
25  * against the 1024 partitionings generated by the ASTC partition hash function. The generated
26  * partitions are then ranked by the number of texels in the wrong partition, compared to the ideal
27  * clustering. All 1024 partitions are tested for similarity and ranked, apart from duplicates and
28  * partitionings that actually generate fewer than the requested partition count, but only the top
29  * N candidates are actually put through a more detailed search. N is determined by the compressor
30  * quality preset.
31  *
32  * For the detailed search, each candidate is checked against two possible encoding methods:
33  *
34  *   - The best partitioning assuming different chroma colors (RGB + RGB or RGB + delta endpoints).
35  *   - The best partitioning assuming same chroma colors (RGB + scale endpoints).
36  *
 * This is implemented by computing the mean color and dominant direction for each
38  * partition. This defines two lines, both of which go through the mean color value.
39  *
40  * - One line has a direction defined by the dominant direction; this is used to assess the error
41  *   from using an uncorrelated color representation.
42  * - The other line goes through (0,0,0,1) and is used to assess the error from using a same chroma
43  *   (RGB + scale) color representation.
44  *
45  * The best candidate is selected by computing the squared-errors that result from using these
46  * lines for endpoint selection.
47  */
48 
49 #include <limits>
50 #include "astcenc_internal.h"
51 
52 /**
53  * @brief Pick some initial kmeans cluster centers.
54  *
55  * @param      blk               The image block color data to compress.
56  * @param      texel_count       The number of texels in the block.
57  * @param      partition_count   The number of partitions in the block.
58  * @param[out] cluster_centers   The initial partition cluster center colors.
59  */
kmeans_init(const image_block & blk,unsigned int texel_count,unsigned int partition_count,vfloat4 cluster_centers[BLOCK_MAX_PARTITIONS])60 static void kmeans_init(
61 	const image_block& blk,
62 	unsigned int texel_count,
63 	unsigned int partition_count,
64 	vfloat4 cluster_centers[BLOCK_MAX_PARTITIONS]
65 ) {
66 	promise(texel_count > 0);
67 	promise(partition_count > 0);
68 
69 	unsigned int clusters_selected = 0;
70 	float distances[BLOCK_MAX_TEXELS];
71 
72 	// Pick a random sample as first cluster center; 145897 from random.org
73 	unsigned int sample = 145897 % texel_count;
74 	vfloat4 center_color = blk.texel(sample);
75 	cluster_centers[clusters_selected] = center_color;
76 	clusters_selected++;
77 
78 	// Compute the distance to the first cluster center
79 	float distance_sum = 0.0f;
80 	for (unsigned int i = 0; i < texel_count; i++)
81 	{
82 		vfloat4 color = blk.texel(i);
83 		vfloat4 diff = color - center_color;
84 		float distance = dot_s(diff * diff, blk.channel_weight);
85 		distance_sum += distance;
86 		distances[i] = distance;
87 	}
88 
89 	// More numbers from random.org for weighted-random center selection
90 	const float cluster_cutoffs[9] {
91 		0.626220f, 0.932770f, 0.275454f,
92 		0.318558f, 0.240113f, 0.009190f,
93 		0.347661f, 0.731960f, 0.156391f
94 	};
95 
96 	unsigned int cutoff = (clusters_selected - 1) + 3 * (partition_count - 2);
97 
98 	// Pick the remaining samples as needed
99 	while (true)
100 	{
101 		// Pick the next center in a weighted-random fashion.
102 		float summa = 0.0f;
103 		float distance_cutoff = distance_sum * cluster_cutoffs[cutoff++];
104 		for (sample = 0; sample < texel_count; sample++)
105 		{
106 			summa += distances[sample];
107 			if (summa >= distance_cutoff)
108 			{
109 				break;
110 			}
111 		}
112 
113 		// Clamp to a valid range and store the selected cluster center
114 		sample = astc::min(sample, texel_count - 1);
115 
116 		center_color = blk.texel(sample);
117 		cluster_centers[clusters_selected++] = center_color;
118 		if (clusters_selected >= partition_count)
119 		{
120 			break;
121 		}
122 
123 		// Compute the distance to the new cluster center, keep the min dist
124 		distance_sum = 0.0f;
125 		for (unsigned int i = 0; i < texel_count; i++)
126 		{
127 			vfloat4 color = blk.texel(i);
128 			vfloat4 diff = color - center_color;
129 			float distance = dot_s(diff * diff, blk.channel_weight);
130 			distance = astc::min(distance, distances[i]);
131 			distance_sum += distance;
132 			distances[i] = distance;
133 		}
134 	}
135 }
136 
137 /**
138  * @brief Assign texels to clusters, based on a set of chosen center points.
139  *
140  * @param      blk                  The image block color data to compress.
141  * @param      texel_count          The number of texels in the block.
142  * @param      partition_count      The number of partitions in the block.
143  * @param      cluster_centers      The partition cluster center colors.
144  * @param[out] partition_of_texel   The partition assigned for each texel.
145  */
kmeans_assign(const image_block & blk,unsigned int texel_count,unsigned int partition_count,const vfloat4 cluster_centers[BLOCK_MAX_PARTITIONS],uint8_t partition_of_texel[BLOCK_MAX_TEXELS])146 static void kmeans_assign(
147 	const image_block& blk,
148 	unsigned int texel_count,
149 	unsigned int partition_count,
150 	const vfloat4 cluster_centers[BLOCK_MAX_PARTITIONS],
151 	uint8_t partition_of_texel[BLOCK_MAX_TEXELS]
152 ) {
153 	promise(texel_count > 0);
154 	promise(partition_count > 0);
155 
156 	uint8_t partition_texel_count[BLOCK_MAX_PARTITIONS] { 0 };
157 
158 	// Find the best partition for every texel
159 	for (unsigned int i = 0; i < texel_count; i++)
160 	{
161 		float best_distance = std::numeric_limits<float>::max();
162 		unsigned int best_partition = 0;
163 
164 		vfloat4 color = blk.texel(i);
165 		for (unsigned int j = 0; j < partition_count; j++)
166 		{
167 			vfloat4 diff = color - cluster_centers[j];
168 			float distance = dot_s(diff * diff, blk.channel_weight);
169 			if (distance < best_distance)
170 			{
171 				best_distance = distance;
172 				best_partition = j;
173 			}
174 		}
175 
176 		partition_of_texel[i] = static_cast<uint8_t>(best_partition);
177 		partition_texel_count[best_partition]++;
178 	}
179 
180 	// It is possible to get a situation where a partition ends up without any texels. In this case,
181 	// assign texel N to partition N. This is silly, but ensures that every partition retains at
182 	// least one texel. Reassigning a texel in this manner may cause another partition to go empty,
183 	// so if we actually did a reassignment, run the whole loop over again.
184 	bool problem_case;
185 	do
186 	{
187 		problem_case = false;
188 		for (unsigned int i = 0; i < partition_count; i++)
189 		{
190 			if (partition_texel_count[i] == 0)
191 			{
192 				partition_texel_count[partition_of_texel[i]]--;
193 				partition_texel_count[i]++;
194 				partition_of_texel[i] = static_cast<uint8_t>(i);
195 				problem_case = true;
196 			}
197 		}
198 	} while (problem_case);
199 }
200 
201 /**
202  * @brief Compute new cluster centers based on their center of gravity.
203  *
204  * @param       blk                  The image block color data to compress.
205  * @param       texel_count          The number of texels in the block.
206  * @param       partition_count      The number of partitions in the block.
207  * @param[out]  cluster_centers      The new cluster center colors.
208  * @param       partition_of_texel   The partition assigned for each texel.
209  */
kmeans_update(const image_block & blk,unsigned int texel_count,unsigned int partition_count,vfloat4 cluster_centers[BLOCK_MAX_PARTITIONS],const uint8_t partition_of_texel[BLOCK_MAX_TEXELS])210 static void kmeans_update(
211 	const image_block& blk,
212 	unsigned int texel_count,
213 	unsigned int partition_count,
214 	vfloat4 cluster_centers[BLOCK_MAX_PARTITIONS],
215 	const uint8_t partition_of_texel[BLOCK_MAX_TEXELS]
216 ) {
217 	promise(texel_count > 0);
218 	promise(partition_count > 0);
219 
220 	vfloat4 color_sum[BLOCK_MAX_PARTITIONS] {
221 		vfloat4::zero(),
222 		vfloat4::zero(),
223 		vfloat4::zero(),
224 		vfloat4::zero()
225 	};
226 
227 	uint8_t partition_texel_count[BLOCK_MAX_PARTITIONS] { 0 };
228 
229 	// Find the center-of-gravity in each cluster
230 	for (unsigned int i = 0; i < texel_count; i++)
231 	{
232 		uint8_t partition = partition_of_texel[i];
233 		color_sum[partition] += blk.texel(i);
234 		partition_texel_count[partition]++;
235 	}
236 
237 	// Set the center of gravity to be the new cluster center
238 	for (unsigned int i = 0; i < partition_count; i++)
239 	{
240 		float scale = 1.0f / static_cast<float>(partition_texel_count[i]);
241 		cluster_centers[i] = color_sum[i] * scale;
242 	}
243 }
244 
245 /**
246  * @brief Compute bit-mismatch for partitioning in 2-partition mode.
247  *
248  * @param a   The texel assignment bitvector for the block.
249  * @param b   The texel assignment bitvector for the partition table.
250  *
251  * @return    The number of bit mismatches.
252  */
253 #if ASTCENC_NEON != 0
partition_mismatch2(const uint64_t a[2],const uint64_t b[2])254 static inline uint8_t partition_mismatch2(
255 	const uint64_t a[2],
256 	const uint64_t b[2]
257 ) {
258 	uint64x2_t a01 = vld1q_u64(a);
259 	uint64x2_t b01 = vld1q_u64(b);
260 	uint64x2_t b10 = vextq_u64(b01, b01, 1);
261 	uint8_t c1 = popcount(veorq_u64(a01, b01));
262 	uint8_t c2 = popcount(veorq_u64(a01, b10));
263 	return static_cast<uint8_t>(astc::min(c1, c2) / 2);    // 2 is the number of partitions
264 }
265 #else
partition_mismatch2(const uint64_t a[2],const uint64_t b[2])266 static inline uint8_t partition_mismatch2(
267 	const uint64_t a[2],
268 	const uint64_t b[2]
269 ) {
270 	int v1 = popcount(a[0] ^ b[0]) + popcount(a[1] ^ b[1]);
271 	int v2 = popcount(a[0] ^ b[1]) + popcount(a[1] ^ b[0]);
272 
273 	// Divide by 2 because XOR always counts errors twice, once when missing
274 	// in the expected position, and again when present in the wrong partition
275 	return static_cast<uint8_t>(astc::min(v1, v2) / 2);
276 }
277 #endif
278 
279 /**
280  * @brief Compute bit-mismatch for partitioning in 3-partition mode.
281  *
282  * @param a   The texel assignment bitvector for the block.
283  * @param b   The texel assignment bitvector for the partition table.
284  *
285  * @return    The number of bit mismatches.
286  */
partition_mismatch3(const uint64_t a[3],const uint64_t b[3])287 static inline uint8_t partition_mismatch3(
288 	const uint64_t a[3],
289 	const uint64_t b[3]
290 ) {
291 	int p00 = popcount(a[0] ^ b[0]);
292 	int p01 = popcount(a[0] ^ b[1]);
293 	int p02 = popcount(a[0] ^ b[2]);
294 
295 	int p10 = popcount(a[1] ^ b[0]);
296 	int p11 = popcount(a[1] ^ b[1]);
297 	int p12 = popcount(a[1] ^ b[2]);
298 
299 	int p20 = popcount(a[2] ^ b[0]);
300 	int p21 = popcount(a[2] ^ b[1]);
301 	int p22 = popcount(a[2] ^ b[2]);
302 
303 	int s0 = p11 + p22;
304 	int s1 = p12 + p21;
305 	int v0 = astc::min(s0, s1) + p00;
306 
307 	int s2 = p10 + p22;
308 	int s3 = p12 + p20;
309 	int v1 = astc::min(s2, s3) + p01;
310 
311 	int s4 = p10 + p21;
312 	int s5 = p11 + p20;
313 	int v2 = astc::min(s4, s5) + p02;
314 
315 	// Divide by 2 because XOR always counts errors twice, once when missing
316 	// in the expected position, and again when present in the wrong partition
317 	return static_cast<uint8_t>(astc::min(v0, v1, v2) / 2);
318 }
319 
320 /**
321  * @brief Compute bit-mismatch for partitioning in 4-partition mode.
322  *
323  * @param a   The texel assignment bitvector for the block.
324  * @param b   The texel assignment bitvector for the partition table.
325  *
326  * @return    The number of bit mismatches.
327  */
partition_mismatch4(const uint64_t a[4],const uint64_t b[4])328 static inline uint8_t partition_mismatch4(
329 	const uint64_t a[4],
330 	const uint64_t b[4]
331 ) {
332 	int p00 = popcount(a[0] ^ b[0]);
333 	int p01 = popcount(a[0] ^ b[1]);
334 	int p02 = popcount(a[0] ^ b[2]);
335 	int p03 = popcount(a[0] ^ b[3]);
336 
337 	int p10 = popcount(a[1] ^ b[0]);
338 	int p11 = popcount(a[1] ^ b[1]);
339 	int p12 = popcount(a[1] ^ b[2]);
340 	int p13 = popcount(a[1] ^ b[3]);
341 
342 	int p20 = popcount(a[2] ^ b[0]);
343 	int p21 = popcount(a[2] ^ b[1]);
344 	int p22 = popcount(a[2] ^ b[2]);
345 	int p23 = popcount(a[2] ^ b[3]);
346 
347 	int p30 = popcount(a[3] ^ b[0]);
348 	int p31 = popcount(a[3] ^ b[1]);
349 	int p32 = popcount(a[3] ^ b[2]);
350 	int p33 = popcount(a[3] ^ b[3]);
351 
352 	int mx23 = astc::min(p22 + p33, p23 + p32);
353 	int mx13 = astc::min(p21 + p33, p23 + p31);
354 	int mx12 = astc::min(p21 + p32, p22 + p31);
355 	int mx03 = astc::min(p20 + p33, p23 + p30);
356 	int mx02 = astc::min(p20 + p32, p22 + p30);
357 	int mx01 = astc::min(p21 + p30, p20 + p31);
358 
359 	int v0 = p00 + astc::min(p11 + mx23, p12 + mx13, p13 + mx12);
360 	int v1 = p01 + astc::min(p10 + mx23, p12 + mx03, p13 + mx02);
361 	int v2 = p02 + astc::min(p11 + mx03, p10 + mx13, p13 + mx01);
362 	int v3 = p03 + astc::min(p11 + mx02, p12 + mx01, p10 + mx12);
363 
364 	// Divide by 2 because XOR always counts errors twice, once when missing
365 	// in the expected position, and again when present in the wrong partition
366 	return static_cast<uint8_t>(astc::min(v0, v1, v2, v3) / 2);
367 }
368 
// Function pointer type taking two bitvector arrays and returning an unsigned int.
// NOTE(review): not referenced anywhere else in the visible file, and the partition_mismatchN
// helpers return uint8_t so they do not match this signature - confirm whether it is still needed.
using mismatch_dispatch = unsigned int (*)(const uint64_t*, const uint64_t*);
370 
371 /**
372  * @brief Count the partition table mismatches vs the data clustering.
373  *
374  * @param      bsd               The block size information.
375  * @param      partition_count   The number of partitions in the block.
376  * @param      bitmaps           The block texel partition assignment patterns.
377  * @param[out] mismatch_counts   The array storing per partitioning mismatch counts.
378  */
count_partition_mismatch_bits(const block_size_descriptor & bsd,unsigned int partition_count,const uint64_t bitmaps[BLOCK_MAX_PARTITIONS],uint8_t mismatch_counts[BLOCK_MAX_PARTITIONINGS])379 static void count_partition_mismatch_bits(
380 	const block_size_descriptor& bsd,
381 	unsigned int partition_count,
382 	const uint64_t bitmaps[BLOCK_MAX_PARTITIONS],
383 	uint8_t mismatch_counts[BLOCK_MAX_PARTITIONINGS]
384 ) {
385 	unsigned int active_count = bsd.partitioning_count_selected[partition_count - 1];
386 	promise(active_count > 0);
387 
388 	if (partition_count == 2)
389 	{
390 		for (unsigned int i = 0; i < active_count; i++)
391 		{
392 			mismatch_counts[i] = partition_mismatch2(bitmaps, bsd.coverage_bitmaps_2[i]);
393 			assert(mismatch_counts[i] < BLOCK_MAX_KMEANS_TEXELS);
394 			assert(mismatch_counts[i] < bsd.texel_count);
395 		}
396 	}
397 	else if (partition_count == 3)
398 	{
399 		for (unsigned int i = 0; i < active_count; i++)
400 		{
401 			mismatch_counts[i] = partition_mismatch3(bitmaps, bsd.coverage_bitmaps_3[i]);
402 			assert(mismatch_counts[i] < BLOCK_MAX_KMEANS_TEXELS);
403 			assert(mismatch_counts[i] < bsd.texel_count);
404 		}
405 	}
406 	else
407 	{
408 		for (unsigned int i = 0; i < active_count; i++)
409 		{
410 			mismatch_counts[i] = partition_mismatch4(bitmaps, bsd.coverage_bitmaps_4[i]);
411 			assert(mismatch_counts[i] < BLOCK_MAX_KMEANS_TEXELS);
412 			assert(mismatch_counts[i] < bsd.texel_count);
413 		}
414 	}
415 }
416 
417 /**
418  * @brief Use counting sort on the mismatch array to sort partition candidates.
419  *
420  * @param      partitioning_count   The number of packed partitionings.
421  * @param      mismatch_count       Partitioning mismatch counts, in index order.
422  * @param[out] partition_ordering   Partition index values, in mismatch order.
423  *
424  * @return The number of active partitions in this selection.
425  */
get_partition_ordering_by_mismatch_bits(unsigned int texel_count,unsigned int partitioning_count,const uint8_t mismatch_count[BLOCK_MAX_PARTITIONINGS],uint16_t partition_ordering[BLOCK_MAX_PARTITIONINGS])426 static unsigned int get_partition_ordering_by_mismatch_bits(
427 	unsigned int texel_count,
428 	unsigned int partitioning_count,
429 	const uint8_t mismatch_count[BLOCK_MAX_PARTITIONINGS],
430 	uint16_t partition_ordering[BLOCK_MAX_PARTITIONINGS]
431 ) {
432 	promise(partitioning_count > 0);
433 	uint16_t mscount[BLOCK_MAX_KMEANS_TEXELS] { 0 };
434 
435 	// Create the histogram of mismatch counts
436 	for (unsigned int i = 0; i < partitioning_count; i++)
437 	{
438 		mscount[mismatch_count[i]]++;
439 	}
440 
441 	// Create a running sum from the histogram array
442 	// Cells store previous values only; i.e. exclude self after sum
443 	unsigned int sum = 0;
444 	for (unsigned int i = 0; i < texel_count; i++)
445 	{
446 		uint16_t cnt = mscount[i];
447 		mscount[i] = sum;
448 		sum += cnt;
449 	}
450 
451 	// Use the running sum as the index, incrementing after read to allow
452 	// sequential entries with the same count
453 	for (unsigned int i = 0; i < partitioning_count; i++)
454 	{
455 		unsigned int idx = mscount[mismatch_count[i]]++;
456 		partition_ordering[idx] = static_cast<uint16_t>(i);
457 	}
458 
459 	return partitioning_count;
460 }
461 
462 /**
463  * @brief Use k-means clustering to compute a partition ordering for a block..
464  *
465  * @param      bsd                  The block size information.
466  * @param      blk                  The image block color data to compress.
467  * @param      partition_count      The desired number of partitions in the block.
468  * @param[out] partition_ordering   The list of recommended partition indices, in priority order.
469  *
470  * @return The number of active partitionings in this selection.
471  */
compute_kmeans_partition_ordering(const block_size_descriptor & bsd,const image_block & blk,unsigned int partition_count,uint16_t partition_ordering[BLOCK_MAX_PARTITIONINGS])472 static unsigned int compute_kmeans_partition_ordering(
473 	const block_size_descriptor& bsd,
474 	const image_block& blk,
475 	unsigned int partition_count,
476 	uint16_t partition_ordering[BLOCK_MAX_PARTITIONINGS]
477 ) {
478 	vfloat4 cluster_centers[BLOCK_MAX_PARTITIONS];
479 	uint8_t texel_partitions[BLOCK_MAX_TEXELS];
480 
481 	// Use three passes of k-means clustering to partition the block data
482 	for (unsigned int i = 0; i < 3; i++)
483 	{
484 		if (i == 0)
485 		{
486 			kmeans_init(blk, bsd.texel_count, partition_count, cluster_centers);
487 		}
488 		else
489 		{
490 			kmeans_update(blk, bsd.texel_count, partition_count, cluster_centers, texel_partitions);
491 		}
492 
493 		kmeans_assign(blk, bsd.texel_count, partition_count, cluster_centers, texel_partitions);
494 	}
495 
496 	// Construct the block bitmaps of texel assignments to each partition
497 	uint64_t bitmaps[BLOCK_MAX_PARTITIONS] { 0 };
498 	unsigned int texels_to_process = astc::min(bsd.texel_count, BLOCK_MAX_KMEANS_TEXELS);
499 	promise(texels_to_process > 0);
500 	for (unsigned int i = 0; i < texels_to_process; i++)
501 	{
502 		unsigned int idx = bsd.kmeans_texels[i];
503 		bitmaps[texel_partitions[idx]] |= 1ULL << i;
504 	}
505 
506 	// Count the mismatch between the block and the format's partition tables
507 	uint8_t mismatch_counts[BLOCK_MAX_PARTITIONINGS];
508 	count_partition_mismatch_bits(bsd, partition_count, bitmaps, mismatch_counts);
509 
510 	// Sort the partitions based on the number of mismatched bits
511 	return get_partition_ordering_by_mismatch_bits(
512 	    texels_to_process,
513 	    bsd.partitioning_count_selected[partition_count - 1],
514 	    mismatch_counts, partition_ordering);
515 }
516 
517 /**
518  * @brief Insert a partitioning into an order list of results, sorted by error.
519  *
520  * @param      max_values      The max number of entries in the best result arrays.
521  * @param      this_error      The error of the new entry.
522  * @param      this_partition  The partition ID of the new entry.
523  * @param[out] best_errors     The array of best error values.
524  * @param[out] best_partitions The array of best partition values.
525  */
insert_result(unsigned int max_values,float this_error,unsigned int this_partition,float * best_errors,unsigned int * best_partitions)526 static void insert_result(
527 	unsigned int max_values,
528 	float this_error,
529 	unsigned int this_partition,
530 	float* best_errors,
531 	unsigned int* best_partitions)
532 {
533 	promise(max_values > 0);
534 
535 	// Don't bother searching if the current worst error beats the new error
536 	if (this_error >= best_errors[max_values - 1])
537 	{
538 		return;
539 	}
540 
541 	// Else insert into the list in error-order
542 	for (unsigned int i = 0; i < max_values; i++)
543 	{
544 		// Existing result is better - move on ...
545 		if (this_error > best_errors[i])
546 		{
547 			continue;
548 		}
549 
550 		// Move existing results down one
551 		for (unsigned int j = max_values - 1; j > i; j--)
552 		{
553 			best_errors[j] = best_errors[j - 1];
554 			best_partitions[j] = best_partitions[j - 1];
555 		}
556 
557 		// Insert new result
558 		best_errors[i] = this_error;
559 		best_partitions[i] = this_partition;
560 		break;
561 	}
562 }
563 
564 /* See header for documentation. */
find_best_partition_candidates(const block_size_descriptor & bsd,const image_block & blk,unsigned int partition_count,unsigned int partition_search_limit,unsigned int best_partitions[TUNE_MAX_PARTITIONING_CANDIDATES],unsigned int requested_candidates)565 unsigned int find_best_partition_candidates(
566 	const block_size_descriptor& bsd,
567 	const image_block& blk,
568 	unsigned int partition_count,
569 	unsigned int partition_search_limit,
570 	unsigned int best_partitions[TUNE_MAX_PARTITIONING_CANDIDATES],
571 	unsigned int requested_candidates
572 ) {
573 	// Constant used to estimate quantization error for a given partitioning; the optimal value for
574 	// this depends on bitrate. These values have been determined empirically.
575 	unsigned int texels_per_block = bsd.texel_count;
576 	float weight_imprecision_estim = 0.055f;
577 	if (texels_per_block <= 20)
578 	{
579 		weight_imprecision_estim = 0.03f;
580 	}
581 	else if (texels_per_block <= 31)
582 	{
583 		weight_imprecision_estim = 0.04f;
584 	}
585 	else if (texels_per_block <= 41)
586 	{
587 		weight_imprecision_estim = 0.05f;
588 	}
589 
590 	promise(partition_count > 0);
591 	promise(partition_search_limit > 0);
592 
593 	weight_imprecision_estim = weight_imprecision_estim * weight_imprecision_estim;
594 
595 	uint16_t partition_sequence[BLOCK_MAX_PARTITIONINGS];
596 	unsigned int sequence_len = compute_kmeans_partition_ordering(bsd, blk, partition_count, partition_sequence);
597 	partition_search_limit = astc::min(partition_search_limit, sequence_len);
598 	requested_candidates = astc::min(partition_search_limit, requested_candidates);
599 
600 	bool uses_alpha = !blk.is_constant_channel(3);
601 
602 	// Partitioning errors assuming uncorrelated-chrominance endpoints
603 	float uncor_best_errors[TUNE_MAX_PARTITIONING_CANDIDATES];
604 	unsigned int uncor_best_partitions[TUNE_MAX_PARTITIONING_CANDIDATES];
605 
606 	// Partitioning errors assuming same-chrominance endpoints
607 	float samec_best_errors[TUNE_MAX_PARTITIONING_CANDIDATES];
608 	unsigned int samec_best_partitions[TUNE_MAX_PARTITIONING_CANDIDATES];
609 
610 	for (unsigned int i = 0; i < requested_candidates; i++)
611 	{
612 		uncor_best_errors[i] = ERROR_CALC_DEFAULT;
613 		samec_best_errors[i] = ERROR_CALC_DEFAULT;
614 	}
615 
616 	if (uses_alpha)
617 	{
618 		for (unsigned int i = 0; i < partition_search_limit; i++)
619 		{
620 			unsigned int partition = partition_sequence[i];
621 			const auto& pi = bsd.get_raw_partition_info(partition_count, partition);
622 
623 			// Compute weighting to give to each component in each partition
624 			partition_metrics pms[BLOCK_MAX_PARTITIONS];
625 
626 			compute_avgs_and_dirs_4_comp(pi, blk, pms);
627 
628 			line4 uncor_lines[BLOCK_MAX_PARTITIONS];
629 			line4 samec_lines[BLOCK_MAX_PARTITIONS];
630 
631 			processed_line4 uncor_plines[BLOCK_MAX_PARTITIONS];
632 			processed_line4 samec_plines[BLOCK_MAX_PARTITIONS];
633 
634 			float line_lengths[BLOCK_MAX_PARTITIONS];
635 
636 			for (unsigned int j = 0; j < partition_count; j++)
637 			{
638 				partition_metrics& pm = pms[j];
639 
640 				uncor_lines[j].a = pm.avg;
641 				uncor_lines[j].b = normalize_safe(pm.dir, unit4());
642 
643 				uncor_plines[j].amod = uncor_lines[j].a - uncor_lines[j].b * dot(uncor_lines[j].a, uncor_lines[j].b);
644 				uncor_plines[j].bs = uncor_lines[j].b;
645 
646 				samec_lines[j].a = vfloat4::zero();
647 				samec_lines[j].b = normalize_safe(pm.avg, unit4());
648 
649 				samec_plines[j].amod = vfloat4::zero();
650 				samec_plines[j].bs = samec_lines[j].b;
651 			}
652 
653 			float uncor_error = 0.0f;
654 			float samec_error = 0.0f;
655 
656 			compute_error_squared_rgba(pi,
657 			                           blk,
658 			                           uncor_plines,
659 			                           samec_plines,
660 			                           line_lengths,
661 			                           uncor_error,
662 			                           samec_error);
663 
664 			// Compute an estimate of error introduced by weight quantization imprecision.
665 			// This error is computed as follows, for each partition
666 			//     1: compute the principal-axis vector (full length) in error-space
667 			//     2: convert the principal-axis vector to regular RGB-space
668 			//     3: scale the vector by a constant that estimates average quantization error
669 			//     4: for each texel, square the vector, then do a dot-product with the texel's
670 			//        error weight; sum up the results across all texels.
671 			//     4(optimized): square the vector once, then do a dot-product with the average
672 			//        texel error, then multiply by the number of texels.
673 
674 			for (unsigned int j = 0; j < partition_count; j++)
675 			{
676 				float tpp = static_cast<float>(pi.partition_texel_count[j]);
677 				vfloat4 error_weights(tpp * weight_imprecision_estim);
678 
679 				vfloat4 uncor_vector = uncor_lines[j].b * line_lengths[j];
680 				vfloat4 samec_vector = samec_lines[j].b * line_lengths[j];
681 
682 				uncor_error += dot_s(uncor_vector * uncor_vector, error_weights);
683 				samec_error += dot_s(samec_vector * samec_vector, error_weights);
684 			}
685 
686 			insert_result(requested_candidates, uncor_error, partition, uncor_best_errors, uncor_best_partitions);
687 			insert_result(requested_candidates, samec_error, partition, samec_best_errors, samec_best_partitions);
688 		}
689 	}
690 	else
691 	{
692 		for (unsigned int i = 0; i < partition_search_limit; i++)
693 		{
694 			unsigned int partition = partition_sequence[i];
695 			const auto& pi = bsd.get_raw_partition_info(partition_count, partition);
696 
697 			// Compute weighting to give to each component in each partition
698 			partition_metrics pms[BLOCK_MAX_PARTITIONS];
699 			compute_avgs_and_dirs_3_comp_rgb(pi, blk, pms);
700 
701 			partition_lines3 plines[BLOCK_MAX_PARTITIONS];
702 
703 			for (unsigned int j = 0; j < partition_count; j++)
704 			{
705 				partition_metrics& pm = pms[j];
706 				partition_lines3& pl = plines[j];
707 
708 				pl.uncor_line.a = pm.avg;
709 				pl.uncor_line.b = normalize_safe(pm.dir, unit3());
710 
711 				pl.samec_line.a = vfloat4::zero();
712 				pl.samec_line.b = normalize_safe(pm.avg, unit3());
713 
714 				pl.uncor_pline.amod = pl.uncor_line.a - pl.uncor_line.b * dot3(pl.uncor_line.a, pl.uncor_line.b);
715 				pl.uncor_pline.bs   = pl.uncor_line.b;
716 
717 				pl.samec_pline.amod = vfloat4::zero();
718 				pl.samec_pline.bs   = pl.samec_line.b;
719 			}
720 
721 			float uncor_error = 0.0f;
722 			float samec_error = 0.0f;
723 
724 			compute_error_squared_rgb(pi,
725 			                          blk,
726 			                          plines,
727 			                          uncor_error,
728 			                          samec_error);
729 
730 			// Compute an estimate of error introduced by weight quantization imprecision.
731 			// This error is computed as follows, for each partition
732 			//     1: compute the principal-axis vector (full length) in error-space
733 			//     2: convert the principal-axis vector to regular RGB-space
734 			//     3: scale the vector by a constant that estimates average quantization error
735 			//     4: for each texel, square the vector, then do a dot-product with the texel's
736 			//        error weight; sum up the results across all texels.
737 			//     4(optimized): square the vector once, then do a dot-product with the average
738 			//        texel error, then multiply by the number of texels.
739 
740 			for (unsigned int j = 0; j < partition_count; j++)
741 			{
742 				partition_lines3& pl = plines[j];
743 
744 				float tpp = static_cast<float>(pi.partition_texel_count[j]);
745 				vfloat4 error_weights(tpp * weight_imprecision_estim);
746 
747 				vfloat4 uncor_vector = pl.uncor_line.b * pl.line_length;
748 				vfloat4 samec_vector = pl.samec_line.b * pl.line_length;
749 
750 				uncor_error += dot3_s(uncor_vector * uncor_vector, error_weights);
751 				samec_error += dot3_s(samec_vector * samec_vector, error_weights);
752 			}
753 
754 			insert_result(requested_candidates, uncor_error, partition, uncor_best_errors, uncor_best_partitions);
755 			insert_result(requested_candidates, samec_error, partition, samec_best_errors, samec_best_partitions);
756 		}
757 	}
758 
759 	unsigned int interleave[2 * TUNE_MAX_PARTITIONING_CANDIDATES];
760 	for (unsigned int i = 0; i < requested_candidates; i++)
761 	{
762 		interleave[2 * i] = bsd.get_raw_partition_info(partition_count, uncor_best_partitions[i]).partition_index;
763 		interleave[2 * i + 1] = bsd.get_raw_partition_info(partition_count, samec_best_partitions[i]).partition_index;
764 	}
765 
766 	uint64_t bitmasks[1024/64] { 0 };
767 	unsigned int emitted = 0;
768 
769 	// Deduplicate the first "requested" entries
770 	for (unsigned int i = 0; i < requested_candidates * 2;  i++)
771 	{
772 		unsigned int partition = interleave[i];
773 
774 		unsigned int word = partition / 64;
775 		unsigned int bit = partition % 64;
776 
777 		bool written = bitmasks[word] & (1ull << bit);
778 
779 		if (!written)
780 		{
781 			best_partitions[emitted] = partition;
782 			bitmasks[word] |= 1ull << bit;
783 			emitted++;
784 
785 			if (emitted == requested_candidates)
786 			{
787 				break;
788 			}
789 		}
790 	}
791 
792 	return emitted;
793 }
794 
795 #endif
796