• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // SPDX-License-Identifier: Apache-2.0
2 // ----------------------------------------------------------------------------
3 // Copyright 2011-2022 Arm Limited
4 //
5 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
6 // use this file except in compliance with the License. You may obtain a copy
7 // of the License at:
8 //
9 //     http://www.apache.org/licenses/LICENSE-2.0
10 //
11 // Unless required by applicable law or agreed to in writing, software
12 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
13 // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
14 // License for the specific language governing permissions and limitations
15 // under the License.
16 // ----------------------------------------------------------------------------
17 
18 /**
19  * @brief Functions and data declarations.
20  */
21 
22 #ifndef ASTCENC_INTERNAL_INCLUDED
23 #define ASTCENC_INTERNAL_INCLUDED
24 
25 #include <algorithm>
26 #include <atomic>
27 #include <cstddef>
28 #include <cstdint>
29 #include <cstdio>
30 #include <cstdlib>
31 #include <condition_variable>
32 #include <functional>
33 #include <mutex>
34 #include <type_traits>
35 
36 #include "astcenc.h"
37 #include "astcenc_mathlib.h"
38 #include "astcenc_vecmathlib.h"
39 
/**
 * @brief Make a promise to the compiler's optimizer.
 *
 * A promise is an expression that the optimizer can assume is true, which helps it generate faster
 * code. Common use cases for this are to promise that a for loop will iterate more than once, or
 * that the loop iteration count is a multiple of a vector length, which avoids pre-loop checks and
 * can avoid loop tails if loops are unrolled by the auto-vectorizer.
 *
 * Release builds (@c NDEBUG defined) hand the condition to the optimizer using whichever compiler
 * builtin is available, or expand to a no-op if none is. Debug builds check the condition with
 * @c assert() instead.
 *
 * Every variant expands to exactly one statement once the caller adds the trailing semicolon, so
 * @c promise(x); is safe as the body of an unbraced @c if that has an @c else branch.
 *
 * NOTE(review): @c assert is not provided by an include in this header directly; presumably it
 * arrives via one of the project headers included above - confirm.
 */
#if defined(NDEBUG)
	#if !defined(__clang__) && defined(_MSC_VER)
		#define promise(cond) __assume(cond)
	#elif defined(__clang__)
		#if __has_builtin(__builtin_assume)
			#define promise(cond) __builtin_assume(cond)
		#elif __has_builtin(__builtin_unreachable)
			// do/while(0) keeps the expansion a single statement (no dangling-else hazard)
			#define promise(cond) do { if (!(cond)) { __builtin_unreachable(); } } while (0)
		#else
			// No usable builtin; expand to a harmless expression rather than to nothing
			#define promise(cond) static_cast<void>(0)
		#endif
	#else // Assume GCC
		// do/while(0) keeps the expansion a single statement (no dangling-else hazard)
		#define promise(cond) do { if (!(cond)) { __builtin_unreachable(); } } while (0)
	#endif
#else
	#define promise(cond) assert(cond)
#endif
65 
66 /* ============================================================================
67   Constants
68 ============================================================================ */
#if !defined(ASTCENC_BLOCK_MAX_TEXELS)
	// Default build supports everything up to a 3D 6x6x6 block
	#define ASTCENC_BLOCK_MAX_TEXELS 216
#endif

/** @brief The largest texel count a single block can hold (a 6x6x6 block by default). */
static constexpr unsigned int BLOCK_MAX_TEXELS {ASTCENC_BLOCK_MAX_TEXELS};

/** @brief The largest number of color components a block can hold. */
static constexpr unsigned int BLOCK_MAX_COMPONENTS {4};

/** @brief The largest number of partitions a block can hold. */
static constexpr unsigned int BLOCK_MAX_PARTITIONS {4};

/** @brief The number of partitionings, per partition count, supported by the ASTC format. */
static constexpr unsigned int BLOCK_MAX_PARTITIONINGS {1024};

/** @brief The largest number of texels used for k-means clustering during partition selection. */
static constexpr uint8_t BLOCK_MAX_KMEANS_TEXELS {64};

/** @brief The largest number of weights a block can hold. */
static constexpr unsigned int BLOCK_MAX_WEIGHTS {64};

/** @brief The largest number of weights per plane when a block uses 2 planes. */
static constexpr unsigned int BLOCK_MAX_WEIGHTS_2PLANE {BLOCK_MAX_WEIGHTS / 2};

/** @brief The smallest number of weight bits a candidate encoding must encode. */
static constexpr unsigned int BLOCK_MIN_WEIGHT_BITS {24};

/** @brief The largest number of weight bits a candidate encoding can encode. */
static constexpr unsigned int BLOCK_MAX_WEIGHT_BITS {96};

/** @brief Sentinel marking a bad (unused) block mode in the remap array. */
static constexpr uint16_t BLOCK_BAD_BLOCK_MODE {0xFFFFu};

/** @brief Sentinel marking a bad (unused) partitioning in the remap array. */
static constexpr uint16_t BLOCK_BAD_PARTITIONING {0xFFFFu};

/** @brief The number of partition index bits supported by the ASTC format. */
static constexpr unsigned int PARTITION_INDEX_BITS {10};

/** @brief Where the plane 2 weights start inside shared weight arrays. */
static constexpr unsigned int WEIGHTS_PLANE2_OFFSET {BLOCK_MAX_WEIGHTS_2PLANE};

/** @brief The sum of the quantized weights for one texel. */
static constexpr float WEIGHTS_TEXEL_SUM {16.0f};

/** @brief The number of block modes supported by the ASTC format. */
static constexpr unsigned int WEIGHTS_MAX_BLOCK_MODES {2048};

/** @brief The number of weight grid decimation modes supported by the ASTC format. */
static constexpr unsigned int WEIGHTS_MAX_DECIMATION_MODES {87};

/** @brief The deliberately huge error value used to initialize error trackers. */
static constexpr float ERROR_CALC_DEFAULT {1e30f};

/**
 * @brief The smallest texel count for which a block may use the one partition fast path.
 *
 * With this value the 4x4 and 5x4 block sizes are excluded.
 */
static constexpr unsigned int TUNE_MIN_TEXELS_MODE0_FASTPATH {24};

/**
 * @brief The largest number of candidate encodings tested for each encoding mode.
 *
 * The compression quality preset can lower this dynamically.
 */
static constexpr unsigned int TUNE_MAX_TRIAL_CANDIDATES {4};
137 
138 
139 static_assert((BLOCK_MAX_TEXELS % ASTCENC_SIMD_WIDTH) == 0,
140               "BLOCK_MAX_TEXELS must be multiple of ASTCENC_SIMD_WIDTH");
141 
142 static_assert((BLOCK_MAX_WEIGHTS % ASTCENC_SIMD_WIDTH) == 0,
143               "BLOCK_MAX_WEIGHTS must be multiple of ASTCENC_SIMD_WIDTH");
144 
145 static_assert((WEIGHTS_MAX_BLOCK_MODES % ASTCENC_SIMD_WIDTH) == 0,
146               "WEIGHTS_MAX_BLOCK_MODES must be multiple of ASTCENC_SIMD_WIDTH");
147 
148 
149 /* ============================================================================
150   Parallel execution control
151 ============================================================================ */
152 
/**
 * @brief A simple counter-based manager for parallel task execution.
 *
 * Task processing for one stage consists of:
 *
 *     * A single-threaded init step.
 *     * A multi-threaded processing step.
 *     * A condition variable so threads can wait for processing to complete.
 *     * A single-threaded term step.
 *
 * There is no main thread in the thread pool; the init step is run by whichever thread enters the
 * critical section first.
 *
 * The processing step hands out task tickets to threads on demand, so threads may each execute a
 * different number of tasks depending on how expensive those tasks are. The task queue and the
 * task tickets are just counters; the caller must map these integers onto an actual processing
 * partition in its own problem domain.
 *
 * The wait is needed to ensure processing has finished before a worker thread moves on to the next
 * stage of the pipeline. A worker may leave the processing loop because there are no new tasks to
 * give it while other workers are still busy; calling @c wait() ensures that all other workers
 * have finished before the thread proceeds.
 *
 * The basic usage model:
 *
 *     // --------- From single-threaded code ---------
 *
 *     // Reset the tracker state
 *     manager->reset();
 *
 *     // --------- From multi-threaded code ---------
 *
 *     // Run the stage init; only the first thread actually runs the lambda
 *     manager->init(<lambda>);
 *
 *     unsigned int task_count;
 *     do
 *     {
 *         // Request a task assignment
 *         unsigned int base_index = manager->get_task_assignment(<granule>, task_count);
 *
 *         // Process any tasks we were given (task_count <= granule size)
 *         if (task_count)
 *         {
 *             // Run the user task processing code for N tasks here
 *             ...
 *
 *             // Flag these tasks as complete
 *             manager->complete_task_assignment(task_count);
 *         }
 *     } while (task_count);
 *
 *     // Wait for all threads to complete tasks before progressing
 *     manager->wait();
 *
 *     // Run the stage term; only the first thread actually runs the lambda
 *     manager->term(<lambda>);
 */
class ParallelManager
{
private:
	/** @brief Lock guarding the critical sections and the condition variable. */
	std::mutex m_lock;

	/** @brief True once the stage init() step has run. */
	bool m_init_done;

	/** @brief True once the stage term() step has run. */
	bool m_term_done;

	/** @brief Condition variable signalled when stage processing completes. */
	std::condition_variable m_complete;

	/** @brief Number of tasks handed out so far; they may not have finished yet. */
	std::atomic<unsigned int> m_start_count;

	/** @brief Number of tasks reported as finished. */
	unsigned int m_done_count;

	/** @brief Total number of tasks in the stage. */
	unsigned int m_task_count;

public:
	/** @brief Create a new ParallelManager in the reset state. */
	ParallelManager()
	{
		reset();
	}

	/**
	 * @brief Reset the tracker ready for a new processing batch.
	 *
	 * Call this from single-threaded code before the multi-threaded processing operations start;
	 * the fields are cleared without taking the lock.
	 */
	void reset()
	{
		m_task_count = 0;
		m_done_count = 0;
		m_start_count.store(0);
		m_term_done = false;
		m_init_done = false;
	}

	/**
	 * @brief Trigger the pipeline stage init step.
	 *
	 * Safe to call from multi-threaded code. The first thread to arrive runs the initialization;
	 * other threads block on the lock until it has completed and then return without running it.
	 *
	 * @param init_func   Callable which executes the stage initialization. It must return the
	 *                    total number of tasks in the stage.
	 */
	void init(std::function<unsigned int(void)> init_func)
	{
		std::lock_guard<std::mutex> guard(m_lock);
		if (m_init_done)
		{
			return;
		}

		m_task_count = init_func();
		m_init_done = true;
	}

	/**
	 * @brief Trigger the pipeline stage init step.
	 *
	 * Safe to call from multi-threaded code. The first thread to arrive sets the task count; later
	 * callers block on the lock and then return without changing it.
	 *
	 * @param task_count   Total number of tasks needing processing.
	 */
	void init(unsigned int task_count)
	{
		std::lock_guard<std::mutex> guard(m_lock);
		if (m_init_done)
		{
			return;
		}

		m_task_count = task_count;
		m_init_done = true;
	}

	/**
	 * @brief Request a task assignment.
	 *
	 * Hand up to @c granule tasks to the caller for processing.
	 *
	 * @param      granule   Maximum number of tasks that can be assigned.
	 * @param[out] count     Actual number of tasks assigned, or zero if no tasks were assigned.
	 *
	 * @return Task index of the first assigned task; assigned tasks increment from this.
	 */
	unsigned int get_task_assignment(unsigned int granule, unsigned int& count)
	{
		// Claim a ticket range; the counter keeps advancing even once the queue is exhausted
		const unsigned int first = m_start_count.fetch_add(granule, std::memory_order_relaxed);
		if (first < m_task_count)
		{
			// The final assignment may be shorter than a full granule
			const unsigned int remaining = m_task_count - first;
			count = (remaining < granule) ? remaining : granule;
			return first;
		}

		count = 0;
		return 0;
	}

	/**
	 * @brief Complete a task assignment.
	 *
	 * Record @c count tasks as complete. If that finishes the stage, every thread blocked in
	 * @c wait() is notified.
	 *
	 * @param count   The number of completed tasks.
	 */
	void complete_task_assignment(unsigned int count)
	{
		// Note: m_done_count cannot use an atomic without the mutex; that would race between the
		// update here and the wait() in other threads
		bool stage_complete;
		{
			std::lock_guard<std::mutex> guard(m_lock);
			m_done_count += count;
			stage_complete = (m_done_count == m_task_count);
		}

		// Notify only after the lock has been released
		if (stage_complete)
		{
			m_complete.notify_all();
		}
	}

	/**
	 * @brief Block until every task in the stage has been reported complete.
	 */
	void wait()
	{
		std::unique_lock<std::mutex> lck(m_lock);
		while (m_done_count != m_task_count)
		{
			m_complete.wait(lck);
		}
	}

	/**
	 * @brief Trigger the pipeline stage term step.
	 *
	 * Safe to call from multi-threaded code. The first thread to arrive runs the termination;
	 * later callers return without running it. The caller must have called @c wait() before
	 * calling this function to ensure that processing is complete.
	 *
	 * @param term_func   Callable which executes the stage termination.
	 */
	void term(std::function<void(void)> term_func)
	{
		std::lock_guard<std::mutex> guard(m_lock);
		if (m_term_done)
		{
			return;
		}

		term_func();
		m_term_done = true;
	}
};
365 
366 /* ============================================================================
367   Commonly used data structures
368 ============================================================================ */
369 
/**
 * @brief The ASTC endpoint formats.
 *
 * The numeric values are written directly into the encoded format, so they must not be
 * rearranged or renumbered.
 */
enum endpoint_formats
{
	FMT_LUMINANCE                 =  0,
	FMT_LUMINANCE_DELTA           =  1,
	FMT_HDR_LUMINANCE_LARGE_RANGE =  2,
	FMT_HDR_LUMINANCE_SMALL_RANGE =  3,
	FMT_LUMINANCE_ALPHA           =  4,
	FMT_LUMINANCE_ALPHA_DELTA     =  5,
	FMT_RGB_SCALE                 =  6,
	FMT_HDR_RGB_SCALE             =  7,
	FMT_RGB                       =  8,
	FMT_RGB_DELTA                 =  9,
	FMT_RGB_SCALE_ALPHA           = 10,
	FMT_HDR_RGB                   = 11,
	FMT_RGBA                      = 12,
	FMT_RGBA_DELTA                = 13,
	FMT_HDR_RGB_LDR_ALPHA         = 14,
	FMT_HDR_RGBA                  = 15
};
394 
/**
 * @brief The ASTC quantization methods.
 *
 * The numeric values are written directly into the encoded format, so they must not be
 * rearranged or renumbered.
 */
enum quant_method
{
	QUANT_2   =  0,
	QUANT_3   =  1,
	QUANT_4   =  2,
	QUANT_5   =  3,
	QUANT_6   =  4,
	QUANT_8   =  5,
	QUANT_10  =  6,
	QUANT_12  =  7,
	QUANT_16  =  8,
	QUANT_20  =  9,
	QUANT_24  = 10,
	QUANT_32  = 11,
	QUANT_40  = 12,
	QUANT_48  = 13,
	QUANT_64  = 14,
	QUANT_80  = 15,
	QUANT_96  = 16,
	QUANT_128 = 17,
	QUANT_160 = 18,
	QUANT_192 = 19,
	QUANT_256 = 20
};

/**
 * @brief The number of levels used by an ASTC quantization method.
 *
 * @param method   The quantization method
 *
 * @return   The number of levels used by @c method, or zero if @c method is not one of the
 *           enumerators above.
 */
static inline unsigned int get_quant_level(quant_method method)
{
	// Level counts in enumerator order: entry N belongs to the method whose value is N
	static constexpr unsigned int level_counts[] {
		  2,   3,   4,   5,   6,   8,  10,
		 12,  16,  20,  24,  32,  40,  48,
		 64,  80,  96, 128, 160, 192, 256
	};

	constexpr unsigned int entry_count = sizeof(level_counts) / sizeof(level_counts[0]);
	static_assert(entry_count == static_cast<unsigned int>(QUANT_256) + 1,
	              "level_counts must have one entry per quant_method");

	// Anything outside the enum (including negative values, which wrap to large indices) gets 0
	const unsigned int entry = static_cast<unsigned int>(method);
	return (entry < entry_count) ? level_counts[entry] : 0u;
}
462 
463 /**
464  * @brief Computed metrics about a partition in a block.
465  */
466 struct partition_metrics
467 {
468 	/** @brief The error-weighted average color in the partition. */
469 	vfloat4 avg;
470 
471 	/** @brief The dominant error-weighted direction in the partition. */
472 	vfloat4 dir;
473 };
474 
475 /**
476  * @brief Computed lines for a a three component analysis.
477  */
478 struct partition_lines3
479 {
480 	/** @brief Line for uncorrelated chroma. */
481 	line3 uncor_line;
482 
483 	/** @brief Line for correlated chroma, passing though the origin. */
484 	line3 samec_line;
485 
486 	/** @brief Postprocessed line for uncorrelated chroma. */
487 	processed_line3 uncor_pline;
488 
489 	/** @brief Postprocessed line for correlated chroma, passing though the origin. */
490 	processed_line3 samec_pline;
491 
492 	/** @brief The length of the line for uncorrelated chroma. */
493 	float uncor_line_len;
494 
495 	/** @brief The length of the line for correlated chroma. */
496 	float samec_line_len;
497 };
498 
499 /**
500  * @brief The partition information for a single partition.
501  *
502  * ASTC has a total of 1024 candidate partitions for each of 2/3/4 partition counts, although this
503  * 1024 includes seeds that generate duplicates of other seeds and seeds that generate completely
504  * empty partitions. These are both valid encodings, but astcenc will skip both during compression
505  * as they are not useful.
506  */
507 struct partition_info
508 {
509 	/** @brief The number of partitions in this partitioning. */
510 	uint16_t partition_count;
511 
512 	/** @brief The index (seed) of this partitioning. */
513 	uint16_t partition_index;
514 
515 	/**
516 	 * @brief The number of texels in each partition.
517 	 *
518 	 * Note that some seeds result in zero texels assigned to a partition are valid, but are skipped
519 	 * by this compressor as there is no point spending bits encoding an unused color endpoint.
520 	 */
521 	uint8_t partition_texel_count[BLOCK_MAX_PARTITIONS];
522 
523 	/** @brief The partition of each texel in the block. */
524 	uint8_t partition_of_texel[BLOCK_MAX_TEXELS];
525 
526 	/** @brief The list of texels in each partition. */
527 	uint8_t texels_of_partition[BLOCK_MAX_PARTITIONS][BLOCK_MAX_TEXELS];
528 };
529 
530 /**
531  * @brief The weight grid information for a single decimation pattern.
532  *
533  * ASTC can store one weight per texel, but is also capable of storing lower resoution weight grids
534  * that are interpolated during decompression to assign a with to a texel. Storing fewer weights
535  * can free up a substantial amount of bits that we can then spend on more useful things, such as
536  * more accurate endpoints and weights, or additional partitions.
537  *
538  * This data structure is used to store information about a single weight grid decimation pattern,
539  * for a single block size.
540  */
541 struct decimation_info
542 {
543 	/** @brief The total number of texels in the block. */
544 	uint8_t texel_count;
545 
546 	/** @brief The maximum number of stored weights that contribute to each texel, between 1 and 4. */
547 	uint8_t max_texel_weight_count;
548 
549 	/** @brief The total number of weights stored. */
550 	uint8_t weight_count;
551 
552 	/** @brief The number of stored weights in the X dimension. */
553 	uint8_t weight_x;
554 
555 	/** @brief The number of stored weights in the Y dimension. */
556 	uint8_t weight_y;
557 
558 	/** @brief The number of stored weights in the Z dimension. */
559 	uint8_t weight_z;
560 
561 	/** @brief The number of stored weights that contribute to each texel, between 1 and 4. */
562 	uint8_t texel_weight_count[BLOCK_MAX_TEXELS];
563 
564 	/** @brief The weight index of the N weights that need to be interpolated for each texel. */
565 	uint8_t texel_weights_4t[4][BLOCK_MAX_TEXELS];
566 
567 	/** @brief The bilinear interpolation weighting of the N input weights for each texel, between 0 and 16. */
568 	uint8_t texel_weights_int_4t[4][BLOCK_MAX_TEXELS];
569 
570 	/** @brief The bilinear interpolation weighting of the N input weights for each texel, between 0 and 1. */
571 	alignas(ASTCENC_VECALIGN) float texel_weights_float_4t[4][BLOCK_MAX_TEXELS];
572 
573 	/** @brief The number of texels that each stored weight contributes to. */
574 	uint8_t weight_texel_count[BLOCK_MAX_WEIGHTS];
575 
576 	/** @brief The list of weights that contribute to each texel. */
577 	uint8_t weight_texel[BLOCK_MAX_TEXELS][BLOCK_MAX_WEIGHTS];
578 
579 	/** @brief The list of weight indices that contribute to each texel. */
580 	alignas(ASTCENC_VECALIGN) float weights_flt[BLOCK_MAX_TEXELS][BLOCK_MAX_WEIGHTS];
581 
582 	/**
583 	 * @brief Folded structure for faster access:
584 	 *     texel_weights_texel[i][j][.] = texel_weights[.][weight_texel[i][j]]
585 	 */
586 	uint8_t texel_weights_texel[BLOCK_MAX_WEIGHTS][BLOCK_MAX_TEXELS][4];
587 
588 	/**
589 	 * @brief Folded structure for faster access:
590 	 *     texel_weights_float_texel[i][j][.] = texel_weights_float[.][weight_texel[i][j]]
591 	 */
592 	float texel_weights_float_texel[BLOCK_MAX_WEIGHTS][BLOCK_MAX_TEXELS][4];
593 };
594 
595 /**
596  * @brief Metadata for single block mode for a specific block size.
597  */
598 struct block_mode
599 {
600 	/** @brief The block mode index in the ASTC encoded form. */
601 	uint16_t mode_index;
602 
603 	/** @brief The decimation mode index in the compressor reindexed list. */
604 	uint8_t decimation_mode;
605 
606 	/** @brief The weight quantization used by this block mode. */
607 	uint8_t quant_mode;
608 
609 	/** @brief The weight quantization used by this block mode. */
610 	uint8_t weight_bits;
611 
612 	/** @brief Is a dual weight plane used by this block mode? */
613 	uint8_t is_dual_plane : 1;
614 
615 	/**
616 	 * @brief Get the weight quantization used by this block mode.
617 	 *
618 	 * @return The quantization level.
619 	 */
get_weight_quant_modeblock_mode620 	inline quant_method get_weight_quant_mode() const
621 	{
622 		return static_cast<quant_method>(this->quant_mode);
623 	}
624 };
625 
/**
 * @brief Metadata for a single decimation mode for a specific block size.
 */
struct decimation_mode
{
	/** @brief The maximum weight precision with 1 plane, or -1 if 1 plane is not supported. */
	int8_t maxprec_1plane;

	/** @brief The maximum weight precision with 2 planes, or -1 if 2 planes are not supported. */
	int8_t maxprec_2planes;

	/** @brief Whether an active 1 plane mode actually references this decimation mode. */
	uint8_t ref_1_plane;

	/** @brief Whether an active 2 plane mode actually references this decimation mode. */
	uint8_t ref_2_planes;
};
643 
644 /**
645  * @brief Data tables for a single block size.
646  *
647  * The decimation tables store the information to apply weight grid dimension reductions. We only
648  * store the decimation modes that are actually needed by the current context; many of the possible
649  * modes will be unused (too many weights for the current block size or disabled by heuristics). The
650  * actual number of weights stored is @c decimation_mode_count, and the @c decimation_modes and
651  * @c decimation_tables arrays store the active modes contiguously at the start of the array. These
652  * entries are not stored in any particuar order.
653  *
654  * The block mode tables store the unpacked block mode settings. Block modes are stored in the
655  * compressed block as an 11 bit field, but for any given block size and set of compressor
656  * heuristics, only a subset of the block modes will be used. The actual number of block modes
657  * stored is indicated in @c block_mode_count, and the @c block_modes array store the active modes
658  * contiguously at the start of the array. These entries are stored in incrementing "packed" value
659  * order, which doesn't mean much once unpacked. To allow decompressors to reference the packed data
660  * efficiently the @c block_mode_packed_index array stores the mapping between physical ID and the
661  * actual remapped array index.
662  */
663 struct block_size_descriptor
664 {
665 	/** @brief The block X dimension, in texels. */
666 	uint8_t xdim;
667 
668 	/** @brief The block Y dimension, in texels. */
669 	uint8_t ydim;
670 
671 	/** @brief The block Z dimension, in texels. */
672 	uint8_t zdim;
673 
674 	/** @brief The block total texel count. */
675 	uint8_t texel_count;
676 
677 	/**
678 	 * @brief The number of stored decimation modes which are "always" modes.
679 	 *
680 	 * Always modes are stored at the start of the decimation_modes list.
681 	 */
682 	unsigned int decimation_mode_count_always;
683 
684 	/** @brief The number of stored decimation modes for selected encodings. */
685 	unsigned int decimation_mode_count_selected;
686 
687 	/** @brief The number of stored decimation modes for any encoding. */
688 	unsigned int decimation_mode_count_all;
689 
690 	/**
691 	 * @brief The number of stored block modes which are "always" modes.
692 	 *
693 	 * Always modes are stored at the start of the block_modes list.
694 	 */
695 	unsigned int block_mode_count_1plane_always;
696 
697 	/** @brief The number of stored block modes for active 1 plane encodings. */
698 	unsigned int block_mode_count_1plane_selected;
699 
700 	/** @brief The number of stored block modes for active 1 and 2 plane encodings. */
701 	unsigned int block_mode_count_1plane_2plane_selected;
702 
703 	/** @brief The number of stored block modes for any encoding. */
704 	unsigned int block_mode_count_all;
705 
706 	/** @brief The number of selected partitionings for 1/2/3/4 partitionings. */
707 	unsigned int partitioning_count_selected[BLOCK_MAX_PARTITIONS];
708 
709 	/** @brief The number of partitionings for 1/2/3/4 partitionings. */
710 	unsigned int partitioning_count_all[BLOCK_MAX_PARTITIONS];
711 
712 	/** @brief The active decimation modes, stored in low indices. */
713 	decimation_mode decimation_modes[WEIGHTS_MAX_DECIMATION_MODES];
714 
715 	/** @brief The active decimation tables, stored in low indices. */
716 	alignas(ASTCENC_VECALIGN) decimation_info decimation_tables[WEIGHTS_MAX_DECIMATION_MODES];
717 
718 	/** @brief The packed block mode array index, or @c BLOCK_BAD_BLOCK_MODE if not active. */
719 	uint16_t block_mode_packed_index[WEIGHTS_MAX_BLOCK_MODES];
720 
721 	/** @brief The active block modes, stored in low indices. */
722 	block_mode block_modes[WEIGHTS_MAX_BLOCK_MODES];
723 
724 	/** @brief The active partition tables, stored in low indices per-count. */
725 	partition_info partitionings[(3 * BLOCK_MAX_PARTITIONINGS) + 1];
726 
727 	/**
728 	 * @brief The packed partition table array index, or @c BLOCK_BAD_PARTITIONING if not active.
729 	 *
730 	 * Indexed by partition_count - 2, containing 2, 3 and 4 partitions.
731 	 */
732 	uint16_t partitioning_packed_index[3][BLOCK_MAX_PARTITIONINGS];
733 
734 	/** @brief The active texels for k-means partition selection. */
735 	uint8_t kmeans_texels[BLOCK_MAX_KMEANS_TEXELS];
736 
737 	/**
738 	 * @brief Is 0 if this 2-partition is valid for compression 255 otherwise.
739 	 *
740 	 * Indexed by remapped index, not physical index.
741 	 */
742 	uint8_t partitioning_valid_2[BLOCK_MAX_PARTITIONINGS];
743 
744 	/**
745 	 * @brief The canonical 2-partition coverage pattern used during block partition search.
746 	 *
747 	 * Indexed by remapped index, not physical index.
748 	 */
749 	uint64_t coverage_bitmaps_2[BLOCK_MAX_PARTITIONINGS][2];
750 
751 	/**
752 	 * @brief Is 0 if this 3-partition is valid for compression 255 otherwise.
753 	 *
754 	 * Indexed by remapped index, not physical index.
755 	 */
756 	uint8_t partitioning_valid_3[BLOCK_MAX_PARTITIONINGS];
757 
758 	/**
759 	 * @brief The canonical 3-partition coverage pattern used during block partition search.
760 	 *
761 	 * Indexed by remapped index, not physical index.
762 	 */
763 	uint64_t coverage_bitmaps_3[BLOCK_MAX_PARTITIONINGS][3];
764 
765 	/**
766 	 * @brief Is 0 if this 4-partition is valid for compression 255 otherwise.
767 	 *
768 	 * Indexed by remapped index, not physical index.
769 	 */
770 	uint8_t partitioning_valid_4[BLOCK_MAX_PARTITIONINGS];
771 
772 	/**
773 	 * @brief The canonical 4-partition coverage pattern used during block partition search.
774 	 *
775 	 * Indexed by remapped index, not physical index.
776 	 */
777 	uint64_t coverage_bitmaps_4[BLOCK_MAX_PARTITIONINGS][4];
778 
779 	/**
780 	 * @brief Get the block mode structure for index @c block_mode.
781 	 *
782 	 * This function can only return block modes that are enabled by the current compressor config.
783 	 * Decompression from an arbitrary source should not use this without first checking that the
784 	 * packed block mode index is not @c BLOCK_BAD_BLOCK_MODE.
785 	 *
786 	 * @param block_mode   The packed block mode index.
787 	 *
788 	 * @return The block mode structure.
789 	 */
get_block_modeblock_size_descriptor790 	const block_mode& get_block_mode(unsigned int block_mode) const
791 	{
792 		unsigned int packed_index = this->block_mode_packed_index[block_mode];
793 		assert(packed_index != BLOCK_BAD_BLOCK_MODE && packed_index < this->block_mode_count_all);
794 		return this->block_modes[packed_index];
795 	}
796 
797 	/**
798 	 * @brief Get the decimation mode structure for index @c decimation_mode.
799 	 *
800 	 * This function can only return decimation modes that are enabled by the current compressor
801 	 * config. The mode array is stored packed, but this is only ever indexed by the packed index
802 	 * stored in the @c block_mode and never exists in an unpacked form.
803 	 *
804 	 * @param decimation_mode   The packed decimation mode index.
805 	 *
806 	 * @return The decimation mode structure.
807 	 */
get_decimation_modeblock_size_descriptor808 	const decimation_mode& get_decimation_mode(unsigned int decimation_mode) const
809 	{
810 		return this->decimation_modes[decimation_mode];
811 	}
812 
813 	/**
814 	 * @brief Get the decimation info structure for index @c decimation_mode.
815 	 *
816 	 * This function can only return decimation modes that are enabled by the current compressor
817 	 * config. The mode array is stored packed, but this is only ever indexed by the packed index
818 	 * stored in the @c block_mode and never exists in an unpacked form.
819 	 *
820 	 * @param decimation_mode   The packed decimation mode index.
821 	 *
822 	 * @return The decimation info structure.
823 	 */
get_decimation_infoblock_size_descriptor824 	const decimation_info& get_decimation_info(unsigned int decimation_mode) const
825 	{
826 		return this->decimation_tables[decimation_mode];
827 	}
828 
829 	/**
830 	 * @brief Get the partition info table for a given partition count.
831 	 *
832 	 * @param partition_count   The number of partitions we want the table for.
833 	 *
834 	 * @return The pointer to the table of 1024 entries (for 2/3/4 parts) or 1 entry (for 1 part).
835 	 */
get_partition_tableblock_size_descriptor836 	const partition_info* get_partition_table(unsigned int partition_count) const
837 	{
838 		if (partition_count == 1)
839 		{
840 			partition_count = 5;
841 		}
842 		unsigned int index = (partition_count - 2) * BLOCK_MAX_PARTITIONINGS;
843 		return this->partitionings + index;
844 	}
845 
846 	/**
847 	 * @brief Get the partition info structure for a given partition count and seed.
848 	 *
849 	 * @param partition_count   The number of partitions we want the info for.
850 	 * @param index             The partition seed (between 0 and 1023).
851 	 *
852 	 * @return The partition info structure.
853 	 */
get_partition_infoblock_size_descriptor854 	const partition_info& get_partition_info(unsigned int partition_count, unsigned int index) const
855 	{
856 		unsigned int packed_index = 0;
857 		if (partition_count >= 2)
858 		{
859 			packed_index = this->partitioning_packed_index[partition_count - 2][index];
860 		}
861 
862 		assert(packed_index != BLOCK_BAD_PARTITIONING && packed_index < this->partitioning_count_all[partition_count - 1]);
863 		auto& result = get_partition_table(partition_count)[packed_index];
864 		assert(index == result.partition_index);
865 		return result;
866 	}
867 
868 	/**
869 	 * @brief Get the partition info structure for a given partition count and seed.
870 	 *
871 	 * @param partition_count   The number of partitions we want the info for.
872 	 * @param packed_index      The raw array offset.
873 	 *
874 	 * @return The partition info structure.
875 	 */
get_raw_partition_infoblock_size_descriptor876 	const partition_info& get_raw_partition_info(unsigned int partition_count, unsigned int packed_index) const
877 	{
878 		assert(packed_index != BLOCK_BAD_PARTITIONING && packed_index < this->partitioning_count_all[partition_count - 1]);
879 		auto& result = get_partition_table(partition_count)[packed_index];
880 		return result;
881 	}
882 };
883 
884 /**
885  * @brief The image data for a single block.
886  *
887  * The @c data_[rgba] fields store the image data in an encoded SoA float form designed for easy
888  * vectorization. Input data is converted to float and stored as values between 0 and 65535. LDR
889  * data is stored as direct UNORM data, HDR data is stored as LNS data.
890  *
891  * The @c rgb_lns and @c alpha_lns fields that assigned a per-texel use of HDR are only used during
892  * decompression. The current compressor will always use HDR endpoint formats when in HDR mode.
893  */
894 struct image_block
895 {
896 	/** @brief The input (compress) or output (decompress) data for the red color component. */
897 	alignas(ASTCENC_VECALIGN) float data_r[BLOCK_MAX_TEXELS];
898 
899 	/** @brief The input (compress) or output (decompress) data for the green color component. */
900 	alignas(ASTCENC_VECALIGN) float data_g[BLOCK_MAX_TEXELS];
901 
902 	/** @brief The input (compress) or output (decompress) data for the blue color component. */
903 	alignas(ASTCENC_VECALIGN) float data_b[BLOCK_MAX_TEXELS];
904 
905 	/** @brief The input (compress) or output (decompress) data for the alpha color component. */
906 	alignas(ASTCENC_VECALIGN) float data_a[BLOCK_MAX_TEXELS];
907 
908 	/** @brief The number of texels in the block. */
909 	uint8_t texel_count;
910 
911 	/** @brief The original data for texel 0 for constant color block encoding. */
912 	vfloat4 origin_texel;
913 
914 	/** @brief The min component value of all texels in the block. */
915 	vfloat4 data_min;
916 
917 	/** @brief The mean component value of all texels in the block. */
918 	vfloat4 data_mean;
919 
920 	/** @brief The max component value of all texels in the block. */
921 	vfloat4 data_max;
922 
923 	/** @brief The relative error significance of the color channels. */
924 	vfloat4 channel_weight;
925 
926 	/** @brief Is this grayscale block where R == G == B for all texels? */
927 	bool grayscale;
928 
929 	/** @brief Set to 1 if a texel is using HDR RGB endpoints (decompression only). */
930 	uint8_t rgb_lns[BLOCK_MAX_TEXELS];
931 
932 	/** @brief Set to 1 if a texel is using HDR alpha endpoints (decompression only). */
933 	uint8_t alpha_lns[BLOCK_MAX_TEXELS];
934 
935 	/** @brief The X position of this block in the input or output image. */
936 	unsigned int xpos;
937 
938 	/** @brief The Y position of this block in the input or output image. */
939 	unsigned int ypos;
940 
941 	/** @brief The Z position of this block in the input or output image. */
942 	unsigned int zpos;
943 
944 	/**
945 	 * @brief Get an RGBA texel value from the data.
946 	 *
947 	 * @param index   The texel index.
948 	 *
949 	 * @return The texel in RGBA component ordering.
950 	 */
texelimage_block951 	inline vfloat4 texel(unsigned int index) const
952 	{
953 		return vfloat4(data_r[index],
954 		               data_g[index],
955 		               data_b[index],
956 		               data_a[index]);
957 	}
958 
959 	/**
960 	 * @brief Get an RGB texel value from the data.
961 	 *
962 	 * @param index   The texel index.
963 	 *
964 	 * @return The texel in RGB0 component ordering.
965 	 */
texel3image_block966 	inline vfloat4 texel3(unsigned int index) const
967 	{
968 		return vfloat3(data_r[index],
969 		               data_g[index],
970 		               data_b[index]);
971 	}
972 
973 	/**
974 	 * @brief Get the default alpha value for endpoints that don't store it.
975 	 *
976 	 * The default depends on whether the alpha endpoint is LDR or HDR.
977 	 *
978 	 * @return The alpha value in the scaled range used by the compressor.
979 	 */
get_default_alphaimage_block980 	inline float get_default_alpha() const
981 	{
982 		return this->alpha_lns[0] ? static_cast<float>(0x7800) : static_cast<float>(0xFFFF);
983 	}
984 
985 	/**
986 	 * @brief Test if a single color channel is constant across the block.
987 	 *
988 	 * Constant color channels are easier to compress as interpolating between two identical colors
989 	 * always returns the same value, irrespective of the weight used. They therefore can be ignored
990 	 * for the purposes of weight selection and use of a second weight plane.
991 	 *
992 	 * @return @c true if the channel is constant across the block, @c false otherwise.
993 	 */
is_constant_channelimage_block994 	inline bool is_constant_channel(int channel) const
995 	{
996 		vmask4 lane_mask = vint4::lane_id() == vint4(channel);
997 		vmask4 color_mask = this->data_min == this->data_max;
998 		return any(lane_mask & color_mask);
999 	}
1000 
1001 	/**
1002 	 * @brief Test if this block is a luminance block with constant 1.0 alpha.
1003 	 *
1004 	 * @return @c true if the block is a luminance block , @c false otherwise.
1005 	 */
is_luminanceimage_block1006 	inline bool is_luminance() const
1007 	{
1008 		float default_alpha = this->get_default_alpha();
1009 		bool alpha1 = (this->data_min.lane<3>() == default_alpha) &&
1010 		              (this->data_max.lane<3>() == default_alpha);
1011 		return this->grayscale && alpha1;
1012 	}
1013 
1014 	/**
1015 	 * @brief Test if this block is a luminance block with variable alpha.
1016 	 *
1017 	 * @return @c true if the block is a luminance + alpha block , @c false otherwise.
1018 	 */
is_luminancealphaimage_block1019 	inline bool is_luminancealpha() const
1020 	{
1021 		float default_alpha = this->get_default_alpha();
1022 		bool alpha1 = (this->data_min.lane<3>() == default_alpha) &&
1023 		              (this->data_max.lane<3>() == default_alpha);
1024 		return this->grayscale && !alpha1;
1025 	}
1026 };
1027 
1028 /**
1029  * @brief Data structure storing the color endpoints for a block.
1030  */
1031 struct endpoints
1032 {
1033 	/** @brief The number of partition endpoints stored. */
1034 	unsigned int partition_count;
1035 
1036 	/** @brief The colors for endpoint 0. */
1037 	vfloat4 endpt0[BLOCK_MAX_PARTITIONS];
1038 
1039 	/** @brief The colors for endpoint 1. */
1040 	vfloat4 endpt1[BLOCK_MAX_PARTITIONS];
1041 };
1042 
1043 /**
1044  * @brief Data structure storing the color endpoints and weights.
1045  */
1046 struct endpoints_and_weights
1047 {
1048 	/** @brief True if all active values in weight_error_scale are the same. */
1049 	bool is_constant_weight_error_scale;
1050 
1051 	/** @brief The color endpoints. */
1052 	endpoints ep;
1053 
1054 	/** @brief The ideal weight for each texel; may be undecimated or decimated. */
1055 	alignas(ASTCENC_VECALIGN) float weights[BLOCK_MAX_TEXELS];
1056 
1057 	/** @brief The ideal weight error scaling for each texel; may be undecimated or decimated. */
1058 	alignas(ASTCENC_VECALIGN) float weight_error_scale[BLOCK_MAX_TEXELS];
1059 };
1060 
/**
 * @brief Utility storing estimated errors from choosing particular endpoint encodings.
 */
struct encoding_choice_errors
{
	/** @brief Error of using LDR RGB-scale instead of complete endpoints. */
	float rgb_scale_error;

	/** @brief Error of using HDR RGB-scale instead of complete endpoints. */
	float rgb_luma_error;

	/** @brief Error of using luminance instead of RGB. */
	float luminance_error;

	/** @brief Error of discarding alpha and using a constant 1.0 alpha. */
	float alpha_drop_error;

	/** @brief Can we use delta offset encoding? */
	bool can_offset_encode;

	/** @brief Can we use blue contraction encoding? */
	bool can_blue_contract;
};
1084 
1085 /**
1086  * @brief Preallocated working buffers, allocated per thread during context creation.
1087  */
1088 struct alignas(ASTCENC_VECALIGN) compression_working_buffers
1089 {
1090 	/** @brief Ideal endpoints and weights for plane 1. */
1091 	endpoints_and_weights ei1;
1092 
1093 	/** @brief Ideal endpoints and weights for plane 2. */
1094 	endpoints_and_weights ei2;
1095 
1096 	/** @brief Ideal decimated endpoints and weights for plane 1. */
1097 	endpoints_and_weights eix1[WEIGHTS_MAX_DECIMATION_MODES];
1098 
1099 	/** @brief Ideal decimated endpoints and weights for plane 2. */
1100 	endpoints_and_weights eix2[WEIGHTS_MAX_DECIMATION_MODES];
1101 
1102 	/**
1103 	 * @brief Decimated ideal weight values.
1104 	 *
1105 	 * For two plane encodings, second plane weights start at @c WEIGHTS_PLANE2_OFFSET offsets.
1106 	 */
1107 	alignas(ASTCENC_VECALIGN) float dec_weights_ideal_value[WEIGHTS_MAX_DECIMATION_MODES * BLOCK_MAX_WEIGHTS];
1108 
1109 	/**
1110 	 * @brief Decimated and quantized weight values stored in the unpacked quantized weight range.
1111 	 *
1112 	 * For two plane encodings, second plane weights start at @c WEIGHTS_PLANE2_OFFSET offsets.
1113 	 */
1114 	alignas(ASTCENC_VECALIGN) float dec_weights_quant_uvalue[WEIGHTS_MAX_BLOCK_MODES * BLOCK_MAX_WEIGHTS];
1115 
1116 	/**
1117 	 * @brief Decimated and quantized weight values stored in the packed quantized weight range.
1118 	 *
1119 	 * For two plane encodings, second plane weights start at @c WEIGHTS_PLANE2_OFFSET offsets.
1120 	 */
1121 	alignas(ASTCENC_VECALIGN) uint8_t dec_weights_quant_pvalue[WEIGHTS_MAX_BLOCK_MODES * BLOCK_MAX_WEIGHTS];
1122 
1123 	/** @brief Error of the best encoding combination for each block mode. */
1124 	alignas(ASTCENC_VECALIGN) float errors_of_best_combination[WEIGHTS_MAX_BLOCK_MODES];
1125 
1126 	/** @brief The best color quant for each block mode. */
1127 	alignas(ASTCENC_VECALIGN) quant_method best_quant_levels[WEIGHTS_MAX_BLOCK_MODES];
1128 
1129 	/** @brief The best color quant for each block mode if modes are the same and we have spare bits. */
1130 	quant_method best_quant_levels_mod[WEIGHTS_MAX_BLOCK_MODES];
1131 
1132 	/** @brief The best endpoint format for each partition. */
1133 	int best_ep_formats[WEIGHTS_MAX_BLOCK_MODES][BLOCK_MAX_PARTITIONS];
1134 
1135 	/** @brief The total bit storage needed for quantized weights for each block mode. */
1136 	int qwt_bitcounts[WEIGHTS_MAX_BLOCK_MODES];
1137 
1138 	/** @brief The cumulative error for quantized weights for each block mode. */
1139 	float qwt_errors[WEIGHTS_MAX_BLOCK_MODES];
1140 
1141 	/** @brief The low weight value in plane 1 for each block mode. */
1142 	float weight_low_value1[WEIGHTS_MAX_BLOCK_MODES];
1143 
1144 	/** @brief The high weight value in plane 1 for each block mode. */
1145 	float weight_high_value1[WEIGHTS_MAX_BLOCK_MODES];
1146 
1147 	/** @brief The low weight value in plane 1 for each quant level and decimation mode. */
1148 	float weight_low_values1[WEIGHTS_MAX_DECIMATION_MODES][12];
1149 
1150 	/** @brief The high weight value in plane 1 for each quant level and decimation mode. */
1151 	float weight_high_values1[WEIGHTS_MAX_DECIMATION_MODES][12];
1152 
1153 	/** @brief The low weight value in plane 2 for each block mode. */
1154 	float weight_low_value2[WEIGHTS_MAX_BLOCK_MODES];
1155 
1156 	/** @brief The high weight value in plane 2 for each block mode. */
1157 	float weight_high_value2[WEIGHTS_MAX_BLOCK_MODES];
1158 
1159 	/** @brief The low weight value in plane 2 for each quant level and decimation mode. */
1160 	float weight_low_values2[WEIGHTS_MAX_DECIMATION_MODES][12];
1161 
1162 	/** @brief The high weight value in plane 2 for each quant level and decimation mode. */
1163 	float weight_high_values2[WEIGHTS_MAX_DECIMATION_MODES][12];
1164 };
1165 
1166 struct dt_init_working_buffers
1167 {
1168 	uint8_t weight_count_of_texel[BLOCK_MAX_TEXELS];
1169 	uint8_t grid_weights_of_texel[BLOCK_MAX_TEXELS][4];
1170 	uint8_t weights_of_texel[BLOCK_MAX_TEXELS][4];
1171 
1172 	uint8_t texel_count_of_weight[BLOCK_MAX_WEIGHTS];
1173 	uint8_t texels_of_weight[BLOCK_MAX_WEIGHTS][BLOCK_MAX_TEXELS];
1174 	uint8_t texel_weights_of_weight[BLOCK_MAX_WEIGHTS][BLOCK_MAX_TEXELS];
1175 };
1176 
1177 /**
1178  * @brief Weight quantization transfer table.
1179  *
1180  * ASTC can store texel weights at many quantization levels, so for performance we store essential
1181  * information about each level as a precomputed data structure. Unquantized weights are integers
1182  * or floats in the range [0, 64].
1183  *
1184  * This structure provides a table, used to estimate the closest quantized weight for a given
1185  * floating-point weight. For each quantized weight, the corresponding unquantized values. For each
1186  * quantized weight, a previous-value and a next-value.
1187 */
1188 struct quantization_and_transfer_table
1189 {
1190 	/** @brief The quantization level used */
1191 	quant_method method;
1192 
1193 	/** @brief The unscrambled unquantized value. */
1194 	float unquantized_value_unsc[33];
1195 
1196 	/** @brief The scrambling order: value[map[i]] == value_unsc[i] */
1197 	int32_t scramble_map[32];
1198 
1199 	/** @brief The scrambled unquantized values. */
1200 	uint8_t unquantized_value[32];
1201 
1202 	/**
1203 	 * @brief A table of previous-and-next weights, indexed by the current unquantized value.
1204 	 *  * bits 7:0 = previous-index, unquantized
1205 	 *  * bits 15:8 = next-index, unquantized
1206 	 *  * bits 23:16 = previous-index, quantized
1207 	 *  * bits 31:24 = next-index, quantized
1208 	 */
1209 	uint32_t prev_next_values[65];
1210 };
1211 
1212 
1213 /** @brief The precomputed quant and transfer table. */
1214 extern const quantization_and_transfer_table quant_and_xfer_tables[12];
1215 
/** @brief The block is an error block, and will return error color or NaN. */
static constexpr uint8_t SYM_BTYPE_ERROR { 0 };

/** @brief The block is a constant color block using FP16 colors. */
static constexpr uint8_t SYM_BTYPE_CONST_F16 { 1 };

/** @brief The block is a constant color block using UNORM16 colors. */
static constexpr uint8_t SYM_BTYPE_CONST_U16 { 2 };

/** @brief The block is a normal non-constant color block. */
static constexpr uint8_t SYM_BTYPE_NONCONST { 3 };
1227 
1228 /**
1229  * @brief A symbolic representation of a compressed block.
1230  *
1231  * The symbolic representation stores the unpacked content of a single
1232  * @c physical_compressed_block, in a form which is much easier to access for
1233  * the rest of the compressor code.
1234  */
1235 struct symbolic_compressed_block
1236 {
1237 	/** @brief The block type, one of the @c SYM_BTYPE_* constants. */
1238 	uint8_t block_type;
1239 
1240 	/** @brief The number of partitions; valid for @c NONCONST blocks. */
1241 	uint8_t partition_count;
1242 
1243 	/** @brief Non-zero if the color formats matched; valid for @c NONCONST blocks. */
1244 	uint8_t color_formats_matched;
1245 
1246 	/** @brief The plane 2 color component, or -1 if single plane; valid for @c NONCONST blocks. */
1247 	// Try unsigned sentintel to avoid signext on load
1248 	int8_t plane2_component;
1249 
1250 	/** @brief The block mode; valid for @c NONCONST blocks. */
1251 	uint16_t block_mode;
1252 
1253 	/** @brief The partition index; valid for @c NONCONST blocks if 2 or more partitions. */
1254 	uint16_t partition_index;
1255 
1256 	/** @brief The endpoint color formats for each partition; valid for @c NONCONST blocks. */
1257 	uint8_t color_formats[BLOCK_MAX_PARTITIONS];
1258 
1259 	/** @brief The endpoint color quant mode; valid for @c NONCONST blocks. */
1260 	quant_method quant_mode;
1261 
1262 	/** @brief The error of the current encoding; valid for @c NONCONST blocks. */
1263 	float errorval;
1264 
1265 	// We can't have both of these at the same time
1266 	union {
1267 		/** @brief The constant color; valid for @c CONST blocks. */
1268 		int constant_color[BLOCK_MAX_COMPONENTS];
1269 
1270 		/** @brief The quantized endpoint color pairs; valid for @c NONCONST blocks. */
1271 		uint8_t color_values[BLOCK_MAX_PARTITIONS][8];
1272 	};
1273 
1274 	/** @brief The quantized and decimated weights.
1275 	 *
1276 	 * If dual plane, the second plane starts at @c weights[WEIGHTS_PLANE2_OFFSET].
1277 	 */
1278 	uint8_t weights[BLOCK_MAX_WEIGHTS];
1279 
1280 	/**
1281 	 * @brief Get the weight quantization used by this block mode.
1282 	 *
1283 	 * @return The quantization level.
1284 	 */
get_color_quant_modesymbolic_compressed_block1285 	inline quant_method get_color_quant_mode() const
1286 	{
1287 		return this->quant_mode;
1288 	}
1289 	QualityProfile privateProfile;
1290 };
1291 
/**
 * @brief A physical representation of a compressed block.
 *
 * The physical representation stores the raw bytes of the format in memory.
 */
struct physical_compressed_block
{
	/** @brief The ASTC encoded data for a single block (16 bytes). */
	uint8_t data[16];
};
1302 
1303 
1304 /**
1305  * @brief Parameter structure for @c compute_pixel_region_variance().
1306  *
1307  * This function takes a structure to avoid spilling arguments to the stack on every function
1308  * invocation, as there are a lot of parameters.
1309  */
1310 struct pixel_region_args
1311 {
1312 	/** @brief The image to analyze. */
1313 	const astcenc_image* img;
1314 
1315 	/** @brief The component swizzle pattern. */
1316 	astcenc_swizzle swz;
1317 
1318 	/** @brief Should the algorithm bother with Z axis processing? */
1319 	bool have_z;
1320 
1321 	/** @brief The kernel radius for alpha processing. */
1322 	unsigned int alpha_kernel_radius;
1323 
1324 	/** @brief The X dimension of the working data to process. */
1325 	unsigned int size_x;
1326 
1327 	/** @brief The Y dimension of the working data to process. */
1328 	unsigned int size_y;
1329 
1330 	/** @brief The Z dimension of the working data to process. */
1331 	unsigned int size_z;
1332 
1333 	/** @brief The X position of first src and dst data in the data set. */
1334 	unsigned int offset_x;
1335 
1336 	/** @brief The Y position of first src and dst data in the data set. */
1337 	unsigned int offset_y;
1338 
1339 	/** @brief The Z position of first src and dst data in the data set. */
1340 	unsigned int offset_z;
1341 
1342 	/** @brief The working memory buffer. */
1343 	vfloat4 *work_memory;
1344 };
1345 
1346 /**
1347  * @brief Parameter structure for @c compute_averages_proc().
1348  */
1349 struct avg_args
1350 {
1351 	/** @brief The arguments for the nested variance computation. */
1352 	pixel_region_args arg;
1353 
1354 	// The above has a reference to the image altread?
1355 	/** @brief The image Stride dimensions. */
1356 	unsigned int img_size_stride;
1357 
1358 	/** @brief The image X dimensions. */
1359 	unsigned int img_size_x;
1360 
1361 	/** @brief The image Y dimensions. */
1362 	unsigned int img_size_y;
1363 
1364 	/** @brief The image Z dimensions. */
1365 	unsigned int img_size_z;
1366 
1367 	/** @brief The maximum working block dimensions in X and Y dimensions. */
1368 	unsigned int blk_size_xy;
1369 
1370 	/** @brief The maximum working block dimensions in Z dimensions. */
1371 	unsigned int blk_size_z;
1372 
1373 	/** @brief The working block memory size. */
1374 	unsigned int work_memory_size;
1375 };
1376 
#if defined(ASTCENC_DIAGNOSTICS)
/* Forward declaration only; see the astcenc_diagnostic_trace header for details. */
class TraceLog;
#endif
1381 
1382 /**
1383  * @brief The astcenc compression context.
1384  */
1385 struct astcenc_context
1386 {
1387 	/** @brief The configuration this context was created with. */
1388 	astcenc_config config;
1389 
1390 	/** @brief The thread count supported by this context. */
1391 	unsigned int thread_count;
1392 
1393 	/** @brief The block size descriptor this context was created with. */
1394 	block_size_descriptor* bsd;
1395 
1396 	/*
1397 	 * Fields below here are not needed in a decompress-only build, but some remain as they are
1398 	 * small and it avoids littering the code with #ifdefs. The most significant contributors to
1399 	 * large structure size are omitted.
1400 	 */
1401 
1402 	/** @brief The input image alpha channel averages table, may be @c nullptr if not needed. */
1403 	float *input_alpha_averages;
1404 
1405 	/** @brief The scratch working buffers, one per thread (see @c thread_count). */
1406 	compression_working_buffers* working_buffers;
1407 
1408 #if !defined(ASTCENC_DECOMPRESS_ONLY)
1409 	/** @brief The pixel region and variance worker arguments. */
1410 	avg_args avg_preprocess_args;
1411 
1412 	/** @brief The parallel manager for averages computation. */
1413 	ParallelManager manage_avg;
1414 
1415 	/** @brief The parallel manager for compression. */
1416 	ParallelManager manage_compress;
1417 #endif
1418 
1419 	/** @brief The parallel manager for decompression. */
1420 	ParallelManager manage_decompress;
1421 
1422 #if defined(ASTCENC_DIAGNOSTICS)
1423 	/**
1424 	 * @brief The diagnostic trace logger.
1425 	 *
1426 	 * Note that this is a singleton, so can only be used in single threaded mode. It only exists
1427 	 * here so we have a reference to close the file at the end of the capture.
1428 	 */
1429 	TraceLog* trace_log;
1430 #endif
1431 };
1432 
1433 /* ============================================================================
1434   Functionality for managing block sizes and partition tables.
1435 ============================================================================ */
1436 
1437 /**
1438  * @brief Populate the block size descriptor for the target block size.
1439  *
1440  * This will also initialize the partition table metadata, which is stored as part of the BSD
1441  * structure.
1442  *
1443  * @param      x_texels                 The number of texels in the block X dimension.
1444  * @param      y_texels                 The number of texels in the block Y dimension.
1445  * @param      z_texels                 The number of texels in the block Z dimension.
1446  * @param      can_omit_modes           Can we discard modes and partitionings that astcenc won't use?
1447  * @param      partition_count_cutoff   The partition count cutoff to use, if we can omit partitionings.
1448  * @param      mode_cutoff              The block mode percentile cutoff [0-1].
1449  * @param[out] bsd                      The descriptor to initialize.
1450  */
1451 void init_block_size_descriptor(
1452 	QualityProfile privateProfile,
1453 	unsigned int x_texels,
1454 	unsigned int y_texels,
1455 	unsigned int z_texels,
1456 	bool can_omit_modes,
1457 	unsigned int partition_count_cutoff,
1458 	float mode_cutoff,
1459 	block_size_descriptor& bsd);
1460 
1461 /**
1462  * @brief Populate the partition tables for the target block size.
1463  *
1464  * Note the @c bsd descriptor must be initialized by calling @c init_block_size_descriptor() before
1465  * calling this function.
1466  *
1467  * @param[out] bsd                      The block size information structure to populate.
1468  * @param      can_omit_partitionings   True if we can we drop partitionings that astcenc won't use.
1469  * @param      partition_count_cutoff   The partition count cutoff to use, if we can omit partitionings.
1470  */
1471 void init_partition_tables(
1472 	block_size_descriptor& bsd,
1473 	bool can_omit_partitionings,
1474 	unsigned int partition_count_cutoff);
1475 
/**
 * @brief Get the percentile table for 2D block modes.
 *
 * This is an empirically determined prioritization of which block modes to use in the search in
 * terms of their centile (lower centiles = more useful).
 *
 * Returns a dynamically allocated array; caller must free with delete[].
 *
 * @param xdim   The block x size.
 * @param ydim   The block y size.
 *
 * @return The unpacked table.
 */
const float *get_2d_percentile_table(
	unsigned int xdim,
	unsigned int ydim);
1492 
/**
 * @brief Query if a 2D block size is legal.
 *
 * @param xdim   The block x size.
 * @param ydim   The block y size.
 *
 * @return True if legal, false otherwise.
 */
bool is_legal_2d_block_size(
	unsigned int xdim,
	unsigned int ydim);
1501 
/**
 * @brief Query if a 3D block size is legal.
 *
 * @param xdim   The block x size.
 * @param ydim   The block y size.
 * @param zdim   The block z size.
 *
 * @return True if legal, false otherwise.
 */
bool is_legal_3d_block_size(
	unsigned int xdim,
	unsigned int ydim,
	unsigned int zdim);
1511 
1512 /* ============================================================================
1513   Functionality for managing BISE quantization and unquantization.
1514 ============================================================================ */
1515 
/**
 * @brief The precomputed table for quantizing color values.
 *
 * Returned value is in the ASTC BISE scrambled order.
 *
 * Indexed by [quant_mode - 4][data_value].
 */
extern const uint8_t color_quant_tables[17][256];
1524 
/**
 * @brief The precomputed table for unquantizing color values.
 *
 * Returned value is in the ASTC BISE scrambled order.
 *
 * Indexed by [quant_mode - 4][data_value].
 */
extern const uint8_t color_unquant_tables[17][256];
1533 
/**
 * @brief The precomputed quant mode storage table.
 *
 * Indexing by [integercount/2][bits] gives us the quantization level for a given integer count and
 * number of compressed storage bits. Returns -1 for cases where the requested integer count cannot
 * ever fit in the supplied storage size.
 */
extern const int8_t quant_mode_table[10][128];
1542 
1543 /**
1544  * @brief Encode a packed string using BISE.
1545  *
1546  * Note that BISE can return strings that are not a whole number of bytes in length, and ASTC can
1547  * start storing strings in a block at arbitrary bit offsets in the encoded data.
1548  *
1549  * @param         quant_level      The BISE alphabet size.
1550  * @param         character_count  The number of characters in the string.
1551  * @param         input_data       The unpacked string, one byte per character.
1552  * @param[in,out] output_data      The output packed string.
1553  * @param         bit_offset       The starting offset in the output storage.
1554  */
1555 void encode_ise(
1556 	quant_method quant_level,
1557 	unsigned int character_count,
1558 	const uint8_t* input_data,
1559 	uint8_t* output_data,
1560 	unsigned int bit_offset);
1561 
1562 /**
1563  * @brief Decode a packed string using BISE.
1564  *
1565  * Note that BISE input strings are not a whole number of bytes in length, and ASTC can start
1566  * strings at arbitrary bit offsets in the encoded data.
1567  *
1568  * @param         quant_level      The BISE alphabet size.
1569  * @param         character_count  The number of characters in the string.
1570  * @param         input_data       The packed string.
1571  * @param[in,out] output_data      The output storage, one byte per character.
1572  * @param         bit_offset       The starting offset in the output storage.
1573  */
1574 void decode_ise(
1575 	quant_method quant_level,
1576 	unsigned int character_count,
1577 	const uint8_t* input_data,
1578 	uint8_t* output_data,
1579 	unsigned int bit_offset);
1580 
1581 /**
1582  * @brief Return the number of bits needed to encode an ISE sequence.
1583  *
1584  * This implementation assumes that the @c quant level is untrusted, given it may come from random
1585  * data being decompressed, so we return an arbitrary unencodable size if that is the case.
1586  *
1587  * @param character_count   The number of items in the sequence.
1588  * @param quant_level       The desired quantization level.
1589  *
1590  * @return The number of bits needed to encode the BISE string.
1591  */
1592 unsigned int get_ise_sequence_bitcount(
1593 	unsigned int character_count,
1594 	quant_method quant_level);
1595 
1596 /* ============================================================================
1597   Functionality for managing color partitioning.
1598 ============================================================================ */
1599 
1600 /**
1601  * @brief Compute averages and dominant directions for each partition in a 2 component texture.
1602  *
1603  * @param      pi           The partition info for the current trial.
1604  * @param      blk          The image block color data to be compressed.
1605  * @param      component1   The first component included in the analysis.
1606  * @param      component2   The second component included in the analysis.
1607  * @param[out] pm           The output partition metrics.
1608  *                          - Only pi.partition_count array entries actually get initialized.
1609  *                          - Direction vectors @c pm.dir are not normalized.
1610  */
1611 void compute_avgs_and_dirs_2_comp(
1612 	const partition_info& pi,
1613 	const image_block& blk,
1614 	unsigned int component1,
1615 	unsigned int component2,
1616 	partition_metrics pm[BLOCK_MAX_PARTITIONS]);
1617 
1618 /**
1619  * @brief Compute averages and dominant directions for each partition in a 3 component texture.
1620  *
1621  * @param      pi                  The partition info for the current trial.
1622  * @param      blk                 The image block color data to be compressed.
1623  * @param      omitted_component   The component excluded from the analysis.
1624  * @param[out] pm                  The output partition metrics.
1625  *                                 - Only pi.partition_count array entries actually get initialized.
1626  *                                 - Direction vectors @c pm.dir are not normalized.
1627  */
1628 void compute_avgs_and_dirs_3_comp(
1629 	const partition_info& pi,
1630 	const image_block& blk,
1631 	unsigned int omitted_component,
1632 	partition_metrics pm[BLOCK_MAX_PARTITIONS]);
1633 
1634 /**
1635  * @brief Compute averages and dominant directions for each partition in a 3 component texture.
1636  *
1637  * This is a specialization of @c compute_avgs_and_dirs_3_comp where the omitted component is
1638  * always alpha, a common case during partition search.
1639  *
1640  * @param      pi                  The partition info for the current trial.
1641  * @param      blk                 The image block color data to be compressed.
1642  * @param[out] pm                  The output partition metrics.
1643  *                                 - Only pi.partition_count array entries actually get initialized.
1644  *                                 - Direction vectors @c pm.dir are not normalized.
1645  */
1646 void compute_avgs_and_dirs_3_comp_rgb(
1647 	const partition_info& pi,
1648 	const image_block& blk,
1649 	partition_metrics pm[BLOCK_MAX_PARTITIONS]);
1650 
1651 /**
1652  * @brief Compute averages and dominant directions for each partition in a 4 component texture.
1653  *
1654  * @param      pi    The partition info for the current trial.
1655  * @param      blk   The image block color data to be compressed.
1656  * @param[out] pm    The output partition metrics.
1657  *                   - Only pi.partition_count array entries actually get initialized.
1658  *                   - Direction vectors @c pm.dir are not normalized.
1659  */
1660 void compute_avgs_and_dirs_4_comp(
1661 	const partition_info& pi,
1662 	const image_block& blk,
1663 	partition_metrics pm[BLOCK_MAX_PARTITIONS]);
1664 
1665 /**
1666  * @brief Compute the RGB error for uncorrelated and same chroma projections.
1667  *
1668  * The output of compute averages and dirs is post processed to define two lines, both of which go
1669  * through the mean-color-value.  One line has a direction defined by the dominant direction; this
1670  * is used to assess the error from using an uncorrelated color representation. The other line goes
1671  * through (0,0,0) and is used to assess the error from using an RGBS color representation.
1672  *
1673  * This function computes the squared error when using these two representations.
1674  *
1675  * @param         pi              The partition info for the current trial.
1676  * @param         blk             The image block color data to be compressed.
1677  * @param[in,out] plines          Processed line inputs, and line length outputs.
1678  * @param[out]    uncor_error     The cumulative error for using the uncorrelated line.
1679  * @param[out]    samec_error     The cumulative error for using the same chroma line.
1680  */
1681 void compute_error_squared_rgb(
1682 	const partition_info& pi,
1683 	const image_block& blk,
1684 	partition_lines3 plines[BLOCK_MAX_PARTITIONS],
1685 	float& uncor_error,
1686 	float& samec_error);
1687 
1688 /**
1689  * @brief Compute the RGBA error for uncorrelated and same chroma projections.
1690  *
1691  * The output of compute averages and dirs is post processed to define two lines, both of which go
1692  * through the mean-color-value.  One line has a direction defined by the dominant direction; this
1693  * is used to assess the error from using an uncorrelated color representation. The other line goes
1694  * through (0,0,0,1) and is used to assess the error from using an RGBS color representation.
1695  *
1696  * This function computes the squared error when using these two representations.
1697  *
1698  * @param      pi              The partition info for the current trial.
1699  * @param      blk             The image block color data to be compressed.
1700  * @param      uncor_plines    Processed uncorrelated partition lines for each partition.
1701  * @param      samec_plines    Processed same chroma partition lines for each partition.
1702  * @param[out] uncor_lengths   The length of each components deviation from the line.
1703  * @param[out] samec_lengths   The length of each components deviation from the line.
1704  * @param[out] uncor_error     The cumulative error for using the uncorrelated line.
1705  * @param[out] samec_error     The cumulative error for using the same chroma line.
1706  */
1707 void compute_error_squared_rgba(
1708 	const partition_info& pi,
1709 	const image_block& blk,
1710 	const processed_line4 uncor_plines[BLOCK_MAX_PARTITIONS],
1711 	const processed_line4 samec_plines[BLOCK_MAX_PARTITIONS],
1712 	float uncor_lengths[BLOCK_MAX_PARTITIONS],
1713 	float samec_lengths[BLOCK_MAX_PARTITIONS],
1714 	float& uncor_error,
1715 	float& samec_error);
1716 
1717 /**
1718  * @brief Find the best set of partitions to trial for a given block.
1719  *
1720  * On return the @c best_partitions list will contain the two best partition
1721  * candidates; one assuming data has uncorrelated chroma and one assuming the
1722  * data has corelated chroma. The best candidate is returned first in the list.
1723  *
1724  * @param      bsd                        The block size information.
1725  * @param      blk                        The image block color data to compress.
1726  * @param      partition_count            The number of partitions in the block.
1727  * @param      partition_search_limit     The number of candidate partition encodings to trial.
1728  * @param[out] best_partitions            The best partition candidates.
1729  */
1730 void find_best_partition_candidates(
1731 	const block_size_descriptor& bsd,
1732 	const image_block& blk,
1733 	unsigned int partition_count,
1734 	unsigned int partition_search_limit,
1735 	unsigned int best_partitions[2]);
1736 
1737 /* ============================================================================
1738   Functionality for managing images and image related data.
1739 ============================================================================ */
1740 
1741 /**
1742  * @brief Setup computation of regional averages in an image.
1743  *
1744  * This must be done by only a single thread per image, before any thread calls
1745  * @c compute_averages().
1746  *
1747  * Results are written back into @c img->input_alpha_averages.
1748  *
1749  * @param      img                     The input image data, also holds output data.
1750  * @param      alpha_kernel_radius     The kernel radius (in pixels) for alpha mods.
1751  * @param      swz                     Input data component swizzle.
1752  * @param[out] ag                      The average variance arguments to init.
1753  *
1754  * @return The number of tasks in the processing stage.
1755  */
1756 unsigned int init_compute_averages(
1757 	const astcenc_image& img,
1758 	unsigned int alpha_kernel_radius,
1759 	const astcenc_swizzle& swz,
1760 	avg_args& ag);
1761 
1762 /**
1763  * @brief Compute regional averages in an image.
1764  *
1765  * This function can be called by multiple threads, but only after a single
1766  * thread calls the setup function @c init_compute_averages().
1767  *
1768  * Results are written back into @c img->input_alpha_averages.
1769  *
1770  * @param[out] ctx   The context.
1771  * @param      ag    The average and variance arguments created during setup.
1772  */
1773 void compute_averages(
1774 	astcenc_context& ctx,
1775 	const avg_args& ag);
1776 
1777 /**
1778  * @brief Fetch a single image block from the input image.
1779  *
1780  * @param      decode_mode   The compression color profile.
1781  * @param      img           The input image data.
1782  * @param[out] blk           The image block to populate.
1783  * @param      bsd           The block size information.
1784  * @param      xpos          The block X coordinate in the input image.
1785  * @param      ypos          The block Y coordinate in the input image.
1786  * @param      zpos          The block Z coordinate in the input image.
1787  * @param      swz           The swizzle to apply on load.
1788  */
1789 void fetch_image_block(
1790 	astcenc_profile decode_mode,
1791 	const astcenc_image& img,
1792 	image_block& blk,
1793 	const block_size_descriptor& bsd,
1794 	unsigned int xpos,
1795 	unsigned int ypos,
1796 	unsigned int zpos,
1797 	const astcenc_swizzle& swz);
1798 
1799 /**
1800  * @brief Fetch a single image block from the input image.
1801  *
1802  * This specialized variant can be used only if the block is 2D LDR U8 data,
1803  * with no swizzle.
1804  *
1805  * @param      decode_mode   The compression color profile.
1806  * @param      img           The input image data.
1807  * @param[out] blk           The image block to populate.
1808  * @param      bsd           The block size information.
1809  * @param      xpos          The block X coordinate in the input image.
1810  * @param      ypos          The block Y coordinate in the input image.
1811  * @param      zpos          The block Z coordinate in the input image.
1812  * @param      swz           The swizzle to apply on load.
1813  */
1814 void fetch_image_block_fast_ldr(
1815 	astcenc_profile decode_mode,
1816 	const astcenc_image& img,
1817 	image_block& blk,
1818 	const block_size_descriptor& bsd,
1819 	unsigned int xpos,
1820 	unsigned int ypos,
1821 	unsigned int zpos,
1822 	const astcenc_swizzle& swz);
1823 
1824 /**
1825  * @brief Write a single image block from the output image.
1826  *
1827  * @param[out] img           The input image data.
1828  * @param      blk           The image block to populate.
1829  * @param      bsd           The block size information.
1830  * @param      xpos          The block X coordinate in the input image.
1831  * @param      ypos          The block Y coordinate in the input image.
1832  * @param      zpos          The block Z coordinate in the input image.
1833  * @param      swz           The swizzle to apply on store.
1834  */
1835 void write_image_block(
1836 	astcenc_image& img,
1837 	const image_block& blk,
1838 	const block_size_descriptor& bsd,
1839 	unsigned int xpos,
1840 	unsigned int ypos,
1841 	unsigned int zpos,
1842 	const astcenc_swizzle& swz);
1843 
1844 /* ============================================================================
1845   Functionality for computing endpoint colors and weights for a block.
1846 ============================================================================ */
1847 
1848 /**
1849  * @brief Compute ideal endpoint colors and weights for 1 plane of weights.
1850  *
1851  * The ideal endpoints define a color line for the partition. For each texel the ideal weight
1852  * defines an exact position on the partition color line. We can then use these to assess the error
1853  * introduced by removing and quantizing the weight grid.
1854  *
1855  * @param      blk   The image block color data to compress.
1856  * @param      pi    The partition info for the current trial.
1857  * @param[out] ei    The endpoint and weight values.
1858  */
1859 void compute_ideal_colors_and_weights_1plane(
1860 	const image_block& blk,
1861 	const partition_info& pi,
1862 	endpoints_and_weights& ei);
1863 
1864 /**
1865  * @brief Compute ideal endpoint colors and weights for 2 planes of weights.
1866  *
1867  * The ideal endpoints define a color line for the partition. For each texel the ideal weight
1868  * defines an exact position on the partition color line. We can then use these to assess the error
1869  * introduced by removing and quantizing the weight grid.
1870  *
1871  * @param      bsd                The block size information.
1872  * @param      blk                The image block color data to compress.
1873  * @param      plane2_component   The component assigned to plane 2.
1874  * @param[out] ei1                The endpoint and weight values for plane 1.
1875  * @param[out] ei2                The endpoint and weight values for plane 2.
1876  */
1877 void compute_ideal_colors_and_weights_2planes(
1878 	const block_size_descriptor& bsd,
1879 	const image_block& blk,
1880 	unsigned int plane2_component,
1881 	endpoints_and_weights& ei1,
1882 	endpoints_and_weights& ei2);
1883 
1884 /**
1885  * @brief Compute the optimal unquantized weights for a decimation table.
1886  *
1887  * After computing ideal weights for the case for a complete weight grid, we we want to compute the
1888  * ideal weights for the case where weights exist only for some texels. We do this with a
1889  * steepest-descent grid solver which works as follows:
1890  *
1891  * First, for each actual weight, perform a weighted averaging of the texels affected by the weight.
1892  * Then, set step size to <some initial value> and attempt one step towards the original ideal
1893  * weight if it helps to reduce error.
1894  *
1895  * @param      eai_in                   The non-decimated endpoints and weights.
1896  * @param      eai_out                  A copy of eai_in we can modify later for refinement.
1897  * @param      di                       The selected weight decimation.
1898  * @param[out] dec_weight_ideal_value   The ideal values for the decimated weight set.
1899  */
1900 void compute_ideal_weights_for_decimation(
1901 	const endpoints_and_weights& eai_in,
1902 	endpoints_and_weights& eai_out,
1903 	const decimation_info& di,
1904 	float* dec_weight_ideal_value);
1905 
1906 /**
1907  * @brief Compute the optimal quantized weights for a decimation table.
1908  *
1909  * We test the two closest weight indices in the allowed quantization range and keep the weight that
1910  * is the closest match.
1911  *
1912  * @param      di                        The selected weight decimation.
1913  * @param      low_bound                 The lowest weight allowed.
1914  * @param      high_bound                The highest weight allowed.
1915  * @param      dec_weight_ideal_value    The ideal weight set.
1916  * @param[out] dec_weight_quant_uvalue   The output quantized weight as a float.
1917  * @param[out] dec_weight_quant_pvalue   The output quantized weight as encoded int.
1918  * @param      quant_level               The desired weight quant level.
1919  */
1920 void compute_quantized_weights_for_decimation(
1921 	const decimation_info& di,
1922 	float low_bound,
1923 	float high_bound,
1924 	const float* dec_weight_ideal_value,
1925 	float* dec_weight_quant_uvalue,
1926 	uint8_t* dec_weight_quant_pvalue,
1927 	quant_method quant_level);
1928 
1929 /**
1930  * @brief Compute the infilled weight for a texel index in a decimated grid.
1931  *
1932  * @param di        The weight grid decimation to use.
1933  * @param weights   The decimated weight values to use.
1934  * @param index     The texel index to interpolate.
1935  *
1936  * @return The interpolated weight for the given texel.
1937  */
bilinear_infill(const decimation_info & di,const float * weights,unsigned int index)1938 static inline float bilinear_infill(
1939 	const decimation_info& di,
1940 	const float* weights,
1941 	unsigned int index
1942 ) {
1943 	return (weights[di.texel_weights_4t[0][index]] * di.texel_weights_float_4t[0][index] +
1944 	        weights[di.texel_weights_4t[1][index]] * di.texel_weights_float_4t[1][index]) +
1945 	       (weights[di.texel_weights_4t[2][index]] * di.texel_weights_float_4t[2][index] +
1946 	        weights[di.texel_weights_4t[3][index]] * di.texel_weights_float_4t[3][index]);
1947 }
1948 
1949 /**
1950  * @brief Compute the infilled weight for a texel index in a decimated grid.
1951  *
1952  * This is specialized version which computes only two weights per texel for
1953  * encodings that are only decimated in a single axis.
1954  *
1955  * @param di        The weight grid decimation to use.
1956  * @param weights   The decimated weight values to use.
1957  * @param index     The texel index to interpolate.
1958  *
1959  * @return The interpolated weight for the given texel.
1960  */
bilinear_infill_2(const decimation_info & di,const float * weights,unsigned int index)1961 static inline float bilinear_infill_2(
1962 	const decimation_info& di,
1963 	const float* weights,
1964 	unsigned int index
1965 ) {
1966 	return (weights[di.texel_weights_4t[0][index]] * di.texel_weights_float_4t[0][index] +
1967 	        weights[di.texel_weights_4t[1][index]] * di.texel_weights_float_4t[1][index]);
1968 }
1969 
1970 
1971 /**
1972  * @brief Compute the infilled weight for N texel indices in a decimated grid.
1973  *
1974  * @param di        The weight grid decimation to use.
1975  * @param weights   The decimated weight values to use.
1976  * @param index     The first texel index to interpolate.
1977  *
1978  * @return The interpolated weight for the given set of SIMD_WIDTH texels.
1979  */
bilinear_infill_vla(const decimation_info & di,const float * weights,unsigned int index)1980 static inline vfloat bilinear_infill_vla(
1981 	const decimation_info& di,
1982 	const float* weights,
1983 	unsigned int index
1984 ) {
1985 	// Load the bilinear filter texel weight indexes in the decimated grid
1986 	vint weight_idx0 = vint(di.texel_weights_4t[0] + index);
1987 	vint weight_idx1 = vint(di.texel_weights_4t[1] + index);
1988 	vint weight_idx2 = vint(di.texel_weights_4t[2] + index);
1989 	vint weight_idx3 = vint(di.texel_weights_4t[3] + index);
1990 
1991 	// Load the bilinear filter weights from the decimated grid
1992 	vfloat weight_val0 = gatherf(weights, weight_idx0);
1993 	vfloat weight_val1 = gatherf(weights, weight_idx1);
1994 	vfloat weight_val2 = gatherf(weights, weight_idx2);
1995 	vfloat weight_val3 = gatherf(weights, weight_idx3);
1996 
1997 	// Load the weight contribution factors for each decimated weight
1998 	vfloat tex_weight_float0 = loada(di.texel_weights_float_4t[0] + index);
1999 	vfloat tex_weight_float1 = loada(di.texel_weights_float_4t[1] + index);
2000 	vfloat tex_weight_float2 = loada(di.texel_weights_float_4t[2] + index);
2001 	vfloat tex_weight_float3 = loada(di.texel_weights_float_4t[3] + index);
2002 
2003 	// Compute the bilinear interpolation to generate the per-texel weight
2004 	return (weight_val0 * tex_weight_float0 + weight_val1 * tex_weight_float1) +
2005 	       (weight_val2 * tex_weight_float2 + weight_val3 * tex_weight_float3);
2006 }
2007 
2008 /**
2009  * @brief Compute the infilled weight for N texel indices in a decimated grid.
2010  *
2011  * This is specialized version which computes only two weights per texel for
2012  * encodings that are only decimated in a single axis.
2013  *
2014  * @param di        The weight grid decimation to use.
2015  * @param weights   The decimated weight values to use.
2016  * @param index     The first texel index to interpolate.
2017  *
2018  * @return The interpolated weight for the given set of SIMD_WIDTH texels.
2019  */
bilinear_infill_vla_2(const decimation_info & di,const float * weights,unsigned int index)2020 static inline vfloat bilinear_infill_vla_2(
2021 	const decimation_info& di,
2022 	const float* weights,
2023 	unsigned int index
2024 ) {
2025 	// Load the bilinear filter texel weight indexes in the decimated grid
2026 	vint weight_idx0 = vint(di.texel_weights_4t[0] + index);
2027 	vint weight_idx1 = vint(di.texel_weights_4t[1] + index);
2028 
2029 	// Load the bilinear filter weights from the decimated grid
2030 	vfloat weight_val0 = gatherf(weights, weight_idx0);
2031 	vfloat weight_val1 = gatherf(weights, weight_idx1);
2032 
2033 	// Load the weight contribution factors for each decimated weight
2034 	vfloat tex_weight_float0 = loada(di.texel_weights_float_4t[0] + index);
2035 	vfloat tex_weight_float1 = loada(di.texel_weights_float_4t[1] + index);
2036 
2037 	// Compute the bilinear interpolation to generate the per-texel weight
2038 	return (weight_val0 * tex_weight_float0 + weight_val1 * tex_weight_float1);
2039 }
2040 
2041 /**
2042  * @brief Compute the error of a decimated weight set for 1 plane.
2043  *
2044  * After computing ideal weights for the case with one weight per texel, we want to compute the
2045  * error for decimated weight grids where weights are stored at a lower resolution. This function
2046  * computes the error of the reduced grid, compared to the full grid.
2047  *
2048  * @param eai                       The ideal weights for the full grid.
2049  * @param di                        The selected weight decimation.
2050  * @param dec_weight_quant_uvalue   The quantized weights for the decimated grid.
2051  *
2052  * @return The accumulated error.
2053  */
2054 float compute_error_of_weight_set_1plane(
2055 	const endpoints_and_weights& eai,
2056 	const decimation_info& di,
2057 	const float* dec_weight_quant_uvalue);
2058 
2059 /**
2060  * @brief Compute the error of a decimated weight set for 2 planes.
2061  *
2062  * After computing ideal weights for the case with one weight per texel, we want to compute the
2063  * error for decimated weight grids where weights are stored at a lower resolution. This function
2064  * computes the error of the reduced grid, compared to the full grid.
2065  *
2066  * @param eai1                             The ideal weights for the full grid and plane 1.
2067  * @param eai2                             The ideal weights for the full grid and plane 2.
2068  * @param di                               The selected weight decimation.
2069  * @param dec_weight_quant_uvalue_plane1   The quantized weights for the decimated grid plane 1.
2070  * @param dec_weight_quant_uvalue_plane2   The quantized weights for the decimated grid plane 2.
2071  *
2072  * @return The accumulated error.
2073  */
2074 float compute_error_of_weight_set_2planes(
2075 	const endpoints_and_weights& eai1,
2076 	const endpoints_and_weights& eai2,
2077 	const decimation_info& di,
2078 	const float* dec_weight_quant_uvalue_plane1,
2079 	const float* dec_weight_quant_uvalue_plane2);
2080 
2081 /**
2082  * @brief Pack a single pair of color endpoints as effectively as possible.
2083  *
2084  * The user requests a base color endpoint mode in @c format, but the quantizer may choose a
2085  * delta-based representation. It will report back the format variant it actually used.
2086  *
2087  * @param      color0       The input unquantized color0 endpoint for absolute endpoint pairs.
2088  * @param      color1       The input unquantized color1 endpoint for absolute endpoint pairs.
2089  * @param      rgbs_color   The input unquantized RGBS variant endpoint for same chroma endpoints.
2090  * @param      rgbo_color   The input unquantized RGBS variant endpoint for HDR endpoints..
2091  * @param      format       The desired base format.
2092  * @param[out] output       The output storage for the quantized colors/
2093  * @param      quant_level  The quantization level requested.
2094  *
2095  * @return The actual endpoint mode used.
2096  */
2097 uint8_t pack_color_endpoints(
2098 	QualityProfile privateProfile,
2099 	vfloat4 color0,
2100 	vfloat4 color1,
2101 	vfloat4 rgbs_color,
2102 	vfloat4 rgbo_color,
2103 	int format,
2104 	uint8_t* output,
2105 	quant_method quant_level);
2106 
2107 /**
2108  * @brief Unpack a single pair of encoded and quantized color endpoints.
2109  *
2110  * @param      decode_mode   The decode mode (LDR, HDR).
2111  * @param      format        The color endpoint mode used.
2112  * @param      quant_level   The quantization level used.
2113  * @param      input         The raw array of encoded input integers. The length of this array
2114  *                           depends on @c format; it can be safely assumed to be large enough.
2115  * @param[out] rgb_hdr       Is the endpoint using HDR for the RGB channels?
2116  * @param[out] alpha_hdr     Is the endpoint using HDR for the A channel?
2117  * @param[out] output0       The output color for endpoint 0.
2118  * @param[out] output1       The output color for endpoint 1.
2119  */
2120 void unpack_color_endpoints(
2121 	astcenc_profile decode_mode,
2122 	int format,
2123 	quant_method quant_level,
2124 	const uint8_t* input,
2125 	bool& rgb_hdr,
2126 	bool& alpha_hdr,
2127 	vint4& output0,
2128 	vint4& output1);
2129 
2130 /**
2131  * @brief Unpack a set of quantized and decimated weights.
2132  *
2133  * @param      bsd              The block size information.
2134  * @param      scb              The symbolic compressed encoding.
2135  * @param      di               The weight grid decimation table.
2136  * @param      is_dual_plane    @c true if this is a dual plane block, @c false otherwise.
2137  * @param      quant_level      The weight quantization level.
2138  * @param[out] weights_plane1   The output array for storing the plane 1 weights.
2139  * @param[out] weights_plane2   The output array for storing the plane 2 weights.
2140  */
2141 void unpack_weights(
2142 	const block_size_descriptor& bsd,
2143 	const symbolic_compressed_block& scb,
2144 	const decimation_info& di,
2145 	bool is_dual_plane,
2146 	quant_method quant_level,
2147 	int weights_plane1[BLOCK_MAX_TEXELS],
2148 	int weights_plane2[BLOCK_MAX_TEXELS]);
2149 
2150 /**
2151  * @brief Identify, for each mode, which set of color endpoint produces the best result.
2152  *
2153  * Returns the best @c tune_candidate_limit best looking modes, along with the ideal color encoding
2154  * combination for each. The modified quantization level can be used when all formats are the same,
2155  * as this frees up two additional bits of storage.
2156  *
2157  * @param      pi                            The partition info for the current trial.
2158  * @param      blk                           The image block color data to compress.
2159  * @param      ep                            The ideal endpoints.
2160  * @param      qwt_bitcounts                 Bit counts for different quantization methods.
2161  * @param      qwt_errors                    Errors for different quantization methods.
2162  * @param      tune_candidate_limit          The max number of candidates to return, may be less.
2163  * @param      start_block_mode              The first block mode to inspect.
2164  * @param      end_block_mode                The last block mode to inspect.
2165  * @param[out] partition_format_specifiers   The best formats per partition.
2166  * @param[out] block_mode                    The best packed block mode indexes.
2167  * @param[out] quant_level                   The best color quant level.
2168  * @param[out] quant_level_mod               The best color quant level if endpoints are the same.
2169  * @param[out] tmpbuf                        Preallocated scratch buffers for the compressor.
2170  *
2171  * @return The actual number of candidate matches returned.
2172  */
2173 unsigned int compute_ideal_endpoint_formats(
2174 	QualityProfile privateProfile,
2175 	const partition_info& pi,
2176 	const image_block& blk,
2177 	const endpoints& ep,
2178 	const int* qwt_bitcounts,
2179 	const float* qwt_errors,
2180 	unsigned int tune_candidate_limit,
2181 	unsigned int start_block_mode,
2182 	unsigned int end_block_mode,
2183 	int partition_format_specifiers[TUNE_MAX_TRIAL_CANDIDATES][BLOCK_MAX_PARTITIONS],
2184 	int block_mode[TUNE_MAX_TRIAL_CANDIDATES],
2185 	quant_method quant_level[TUNE_MAX_TRIAL_CANDIDATES],
2186 	quant_method quant_level_mod[TUNE_MAX_TRIAL_CANDIDATES],
2187 	compression_working_buffers& tmpbuf);
2188 
2189 /**
2190  * @brief For a given 1 plane weight set recompute the endpoint colors.
2191  *
2192  * As we quantize and decimate weights the optimal endpoint colors may change slightly, so we must
2193  * recompute the ideal colors for a specific weight set.
2194  *
2195  * @param         blk                        The image block color data to compress.
2196  * @param         pi                         The partition info for the current trial.
2197  * @param         di                         The weight grid decimation table.
2198  * @param         weight_quant_mode          The weight grid quantization level.
2199  * @param         dec_weights_quant_pvalue   The quantized weight set.
2200  * @param[in,out] ep                         The color endpoints (modifed in place).
2201  * @param[out]    rgbs_vectors               The RGB+scale vectors for LDR blocks.
2202  * @param[out]    rgbo_vectors               The RGB+offset vectors for HDR blocks.
2203  */
2204 void recompute_ideal_colors_1plane(
2205 	const image_block& blk,
2206 	const partition_info& pi,
2207 	const decimation_info& di,
2208 	int weight_quant_mode,
2209 	const uint8_t* dec_weights_quant_pvalue,
2210 	endpoints& ep,
2211 	vfloat4 rgbs_vectors[BLOCK_MAX_PARTITIONS],
2212 	vfloat4 rgbo_vectors[BLOCK_MAX_PARTITIONS]);
2213 
2214 /**
2215  * @brief For a given 2 plane weight set recompute the endpoint colors.
2216  *
2217  * As we quantize and decimate weights the optimal endpoint colors may change slightly, so we must
2218  * recompute the ideal colors for a specific weight set.
2219  *
2220  * @param         blk                               The image block color data to compress.
2221  * @param         bsd                               The block_size descriptor.
2222  * @param         di                                The weight grid decimation table.
2223  * @param         weight_quant_mode                 The weight grid quantization level.
2224  * @param         dec_weights_quant_pvalue_plane1   The quantized weight set for plane 1.
2225  * @param         dec_weights_quant_pvalue_plane2   The quantized weight set for plane 2.
2226  * @param[in,out] ep                                The color endpoints (modifed in place).
2227  * @param[out]    rgbs_vector                       The RGB+scale color for LDR blocks.
2228  * @param[out]    rgbo_vector                       The RGB+offset color for HDR blocks.
2229  * @param         plane2_component                  The component assigned to plane 2.
2230  */
2231 void recompute_ideal_colors_2planes(
2232 	const image_block& blk,
2233 	const block_size_descriptor& bsd,
2234 	const decimation_info& di,
2235 	int weight_quant_mode,
2236 	const uint8_t* dec_weights_quant_pvalue_plane1,
2237 	const uint8_t* dec_weights_quant_pvalue_plane2,
2238 	endpoints& ep,
2239 	vfloat4& rgbs_vector,
2240 	vfloat4& rgbo_vector,
2241 	int plane2_component);
2242 
/**
 * @brief Expand the angular tables needed for the alternative to PCA that we use.
 */
void prepare_angular_tables();
2247 
2248 /**
2249  * @brief Compute the angular endpoints for one plane for each block mode.
2250  *
2251  * @param      tune_low_weight_limit     Weight count cutoff below which we use simpler searches.
2252  * @param      only_always               Only consider block modes that are always enabled.
2253  * @param      bsd                       The block size descriptor for the current trial.
2254  * @param      dec_weight_ideal_value    The ideal decimated unquantized weight values.
2255  * @param[out] tmpbuf                    Preallocated scratch buffers for the compressor.
2256  */
2257 void compute_angular_endpoints_1plane(
2258 	unsigned int tune_low_weight_limit,
2259 	bool only_always,
2260 	const block_size_descriptor& bsd,
2261 	const float* dec_weight_ideal_value,
2262 	compression_working_buffers& tmpbuf);
2263 
2264 /**
2265  * @brief Compute the angular endpoints for two planes for each block mode.
2266  *
2267  * @param      tune_low_weight_limit     Weight count cutoff below which we use simpler searches.
2268  * @param      bsd                       The block size descriptor for the current trial.
2269  * @param      dec_weight_ideal_value    The ideal decimated unquantized weight values.
2270  * @param[out] tmpbuf                    Preallocated scratch buffers for the compressor.
2271  */
2272 void compute_angular_endpoints_2planes(
2273 	unsigned int tune_low_weight_limit,
2274 	const block_size_descriptor& bsd,
2275 	const float* dec_weight_ideal_value,
2276 	compression_working_buffers& tmpbuf);
2277 
2278 /* ============================================================================
2279   Functionality for high level compression and decompression access.
2280 ============================================================================ */
2281 
2282 /**
2283  * @brief Compress an image block into a physical block.
2284  *
2285  * @param      ctx      The compressor context and configuration.
2286  * @param      blk      The image block color data to compress.
2287  * @param[out] pcb      The physical compressed block output.
2288  * @param[out] tmpbuf   Preallocated scratch buffers for the compressor.
2289  */
2290 void compress_block(
2291 	const astcenc_context& ctx,
2292 	const image_block& blk,
2293 	physical_compressed_block& pcb,
2294 #if QUALITY_CONTROL
2295 	compression_working_buffers& tmpbuf,
2296 	bool calQualityEnable,
2297 	int32_t *mseBlock[RGBA_COM]
2298 #else
2299     compression_working_buffers& tmpbuf
2300 #endif
2301 	);
2302 
2303 /**
2304  * @brief Decompress a symbolic block into an image block.
2305  *
2306  * @param      decode_mode   The decode mode (LDR, HDR, etc).
2307  * @param      bsd           The block size information.
2308  * @param      xpos          The X coordinate of the block in the overall image.
2309  * @param      ypos          The Y coordinate of the block in the overall image.
2310  * @param      zpos          The Z coordinate of the block in the overall image.
      * @param      scb           The symbolic compressed block to decompress.
2311  * @param[out] blk           The decompressed image block color data.
2312  */
2313 void decompress_symbolic_block(
2314 	astcenc_profile decode_mode,
2315 	const block_size_descriptor& bsd,
2316 	int xpos,
2317 	int ypos,
2318 	int zpos,
2319 	const symbolic_compressed_block& scb,
2320 	image_block& blk);
2321 
2322 /**
2323  * @brief Compute the error between a symbolic block and the original input data.
2324  *
2325  * This function is specialized for 2 plane and 1 partition search.
2326  *
2327  * In RGBM mode this will reject blocks that attempt to encode a zero M value.
2328  *
2329  * @param config   The compressor config.
2330  * @param bsd      The block size information.
2331  * @param scb      The symbolic compressed encoding.
2332  * @param blk      The original image block color data.
2333  *
2334  * @return The computed error, or a negative value if the encoding should be
2335  *         rejected for any reason.
2336  */
2337 float compute_symbolic_block_difference_2plane(
2338 	const astcenc_config& config,
2339 	const block_size_descriptor& bsd,
2340 	const symbolic_compressed_block& scb,
2341 	const image_block& blk);
2342 
2343 /**
2344  * @brief Compute the error between a symbolic block and the original input data.
2345  *
2346  * This function is specialized for 1 plane and N partition search.
2347  *
2348  * In RGBM mode this will reject blocks that attempt to encode a zero M value.
2349  *
2350  * @param config   The compressor config.
2351  * @param bsd      The block size information.
2352  * @param scb      The symbolic compressed encoding.
2353  * @param blk      The original image block color data.
2354  *
2355  * @return The computed error, or a negative value if the encoding should be
2356  *         rejected for any reason.
2357  */
2358 float compute_symbolic_block_difference_1plane(
2359 	const astcenc_config& config,
2360 	const block_size_descriptor& bsd,
2361 	const symbolic_compressed_block& scb,
2362 	const image_block& blk);
2363 
2364 /**
2365  * @brief Compute the error between a symbolic block and the original input data.
2366  *
2367  * This function is specialized for 1 plane and 1 partition search.
2368  *
2369  * In RGBM mode this will reject blocks that attempt to encode a zero M value.
2370  *
2371  * @param config   The compressor config.
2372  * @param bsd      The block size information.
2373  * @param scb      The symbolic compressed encoding.
2374  * @param blk      The original image block color data.
2375  *
2376  * @return The computed error, or a negative value if the encoding should be
2377  *         rejected for any reason.
2378  */
2379 float compute_symbolic_block_difference_1plane_1partition(
2380 	const astcenc_config& config,
2381 	const block_size_descriptor& bsd,
2382 	const symbolic_compressed_block& scb,
2383 	const image_block& blk);
2384 
2385 /**
2386  * @brief Convert a symbolic representation into a binary physical encoding.
2387  *
2388  * It is assumed that the symbolic encoding is valid and encodable, or
2389  * previously flagged as an error block if an error color is to be encoded.
2390  *
2391  * @param      bsd   The block size information.
2392  * @param      scb   The symbolic representation.
2393  * @param[out] pcb   The binary encoded data.
2394  */
2395 void symbolic_to_physical(
2396 	const block_size_descriptor& bsd,
2397 	const symbolic_compressed_block& scb,
2398 	physical_compressed_block& pcb);
2399 
2400 /**
2401  * @brief Convert a binary physical encoding into a symbolic representation.
2402  *
2403  * This function can cope with arbitrary input data; the output block will be
2404  * flagged as an error block if the encoding is invalid.
2405  *
2406  * @param      bsd   The block size information.
2407  * @param      pcb   The binary encoded data.
2408  * @param[out] scb   The output symbolic representation.
2409  */
2410 void physical_to_symbolic(
2411 	const block_size_descriptor& bsd,
2412 	const physical_compressed_block& pcb,
2413 	symbolic_compressed_block& scb);
2414 
2415 /* ============================================================================
2416 Platform-specific functions.
2417 ============================================================================ */
2418 /**
2419  * @brief Detect at run-time whether the host CPU supports the POPCNT extension.
2420  *
2421  * @return @c true if supported, @c false if not.
2422  */
2423 bool cpu_supports_popcnt();
2424 
2425 /**
2426  * @brief Detect at run-time whether the host CPU supports the F16C extension.
2427  *
2428  * @return @c true if supported, @c false if not.
2429  */
2430 bool cpu_supports_f16c();
2431 
2432 /**
2433  * @brief Detect at run-time whether the host CPU supports the SSE 4.1 extension.
2434  *
2435  * @return @c true if supported, @c false if not.
2436  */
2437 bool cpu_supports_sse41();
2438 
2439 /**
2440  * @brief Detect at run-time whether the host CPU supports the AVX 2 extension.
2441  *
2442  * @return @c true if supported, @c false if not.
2443  */
2444 bool cpu_supports_avx2();
2445 
/**
 * @brief Allocate an aligned memory buffer.
 *
 * Allocated memory must be freed by aligned_free.
 *
 * On Windows the buffer comes from _aligned_malloc; elsewhere it comes from posix_memalign.
 * posix_memalign only accepts alignments that are a multiple of sizeof(void*), so smaller
 * requests are rounded up to that minimum. For any power-of-two request the rounded-up
 * alignment still satisfies the alignment the caller asked for.
 *
 * @param size    The desired buffer size.
 * @param align   The desired buffer alignment; must be 2^N.
 *
 * @return The memory buffer pointer or nullptr on allocation failure.
 */
template<typename T>
T* aligned_malloc(size_t size, size_t align)
{
	void* ptr = nullptr;
	int error = 0;

#if defined(_WIN32)
	ptr = _aligned_malloc(size, align);
#else
	// posix_memalign fails with EINVAL for alignments smaller than a pointer
	if (align < sizeof(void*))
	{
		align = sizeof(void*);
	}

	error = posix_memalign(&ptr, align, size);
#endif

	if (error || (!ptr))
	{
		return nullptr;
	}

	return static_cast<T*>(ptr);
}
2475 
/**
 * @brief Free an aligned memory buffer.
 *
 * Releases the buffer through the platform routine that pairs with the allocator used by
 * aligned_malloc: _aligned_free on Windows, free elsewhere.
 *
 * @param ptr   The buffer to free.
 */
template<typename T>
void aligned_free(T* ptr)
{
	// An object pointer converts to void* without needing reinterpret_cast
#if defined(_WIN32)
	_aligned_free(static_cast<void*>(ptr));
#else
	free(static_cast<void*>(ptr));
#endif
}
2490 
2491 #endif
2492