• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
/*
 * Copyright 2015 Philip Taylor <philip@zaynar.co.uk>
 * Copyright 2018 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */
24 
/**
 * \file texcompress_astc.c
 *
 * Decompression code for GL_KHR_texture_compression_astc_ldr, which is just
 * ASTC 2D LDR.
 *
 * The ASTC 2D LDR decoder (without the sRGB part) was copied from the OASTC
 * library written by Philip Taylor. I added sRGB support and adjusted it for
 * Mesa. - Marek
 */
35 
#include "texcompress_astc.h"
#include "macros.h"
#include "util/half_float.h"
#include <assert.h>
#include <stdarg.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <cstdlib>  // for abort() on windows
41 
/* Debug switches, off by default.  VERBOSE_DECODE makes the decoder print the
 * bit fields it parses; VERBOSE_WRITE makes OutputBitVector print every write.
 * Both traces go to stdout via printf().
 */
static bool VERBOSE_DECODE = false;
static bool VERBOSE_WRITE = false;
44 
/**
 * Scope for the status codes returned by the block decoding functions.
 *
 * The class only exists to namespace the enum; values are referred to as
 * decode_error::ok, decode_error::reserved_block_mode_1, and so on.
 */
class decode_error
{
public:
   enum type {
      /* The block was decoded successfully. */
      ok,
      /* A void-extent block had its dynamic-range (HDR) bit set. */
      unsupported_hdr_void_extent,
      /* Block mode matched the "xx111xxxx00" reserved pattern. */
      reserved_block_mode_1,
      /* Block mode matched the "xxxxxxx0000" reserved pattern. */
      reserved_block_mode_2,
      dual_plane_and_too_many_partitions,
      /* A void-extent block had min >= max in s or t. */
      invalid_range_in_void_extent,
      weight_grid_exceeds_block_size,
      invalid_colour_endpoints_size,
      invalid_colour_endpoints_count,
      invalid_weight_bits,
      invalid_num_weights,
   };
};
62 
63 
/**
 * One quantisation range for colour endpoint values.
 *
 * In every entry of cem_ranges, max == 3^t * 5^q * 2^b - 1.
 */
struct cem_range {
   uint8_t max;      /* largest value representable in this range */
   uint8_t t, q, b;  /* trit flag, quint flag, number of plain bits */
};
68 
69 /* Based on the Color Unquantization Parameters table,
70  * plus the bit-only representations, sorted by increasing size
71  */
72 static cem_range cem_ranges[] = {
73    { 5, 1, 0, 1 },
74    { 7, 0, 0, 3 },
75    { 9, 0, 1, 1 },
76    { 11, 1, 0, 2 },
77    { 15, 0, 0, 4 },
78    { 19, 0, 1, 2 },
79    { 23, 1, 0, 3 },
80    { 31, 0, 0, 5 },
81    { 39, 0, 1, 3 },
82    { 47, 1, 0, 4 },
83    { 63, 0, 0, 6 },
84    { 79, 0, 1, 4 },
85    { 95, 1, 0, 5 },
86    { 127, 0, 0, 7 },
87    { 159, 0, 1, 5 },
88    { 191, 1, 0, 6 },
89    { 255, 0, 0, 8 },
90 };
91 
/* Concatenate single-bit values into one integer, first argument being the
 * most significant bit.  Arguments are expected to be 0 or 1; wider values
 * would overlap their neighbours.
 */
#define CAT_BITS_2(a, b)          ( ((a) << 1) | (b) )
#define CAT_BITS_3(a, b, c)       ( ((a) << 2) | ((b) << 1) | (c) )
#define CAT_BITS_4(a, b, c, d)    ( ((a) << 3) | ((b) << 2) | ((c) << 1) | (d) )
#define CAT_BITS_5(a, b, c, d, e) ( ((a) << 4) | ((b) << 3) | ((c) << 2) | ((d) << 1) | (e) )
96 
/**
 * Unpack 5n+8 bits from 'in' into 5 output values.
 * If n <= 4 then T should be uint32_t, else it must be uint64_t.
 *
 * Each output is a trit (0..2) placed above n plain low bits:
 * out[i] = (trit_i << n) | m_i.  The five trits are stored jointly as an
 * 8-bit code whose bits are interleaved with the m_i fields inside 'in'.
 *
 * \param n    number of plain bits per value, 0..6
 * \param in   the packed bits, bit 0 being the first bit of the block
 * \param out  receives exactly 5 values
 */
template <typename T>
static void unpack_trit_block(int n, T in, uint8_t *out)
{
   assert(n >= 0 && n <= 6); /* else output will overflow uint8_t */

   /* Gather the eight scattered code bits; bit i of 'tcode' is code bit i. */
   const unsigned tcode =
      ((unsigned)((in >> (n)) & 0x1) << 0) |
      ((unsigned)((in >> (n+1)) & 0x1) << 1) |
      ((unsigned)((in >> (2*n+2)) & 0x1) << 2) |
      ((unsigned)((in >> (2*n+3)) & 0x1) << 3) |
      ((unsigned)((in >> (3*n+4)) & 0x1) << 4) |
      ((unsigned)((in >> (4*n+5)) & 0x1) << 5) |
      ((unsigned)((in >> (4*n+6)) & 0x1) << 6) |
      ((unsigned)((in >> (5*n+7)) & 0x1) << 7);

   /* The plain low bits of each of the five values. */
   const uint8_t mmask = (1 << n) - 1;
   const uint8_t m0 = (in >> (0)) & mmask;
   const uint8_t m1 = (in >> (n+2)) & mmask;
   const uint8_t m2 = (in >> (2*n+4)) & mmask;
   const uint8_t m3 = (in >> (3*n+5)) & mmask;
   const uint8_t m4 = (in >> (4*n+7)) & mmask;

   /* First stage: split off t4 and t3, leaving a 5-bit code C. */
   unsigned C;
   uint8_t t4, t3, t2, t1, t0;
   if (((tcode >> 2) & 0x7) == 0x7) {
      /* code bits 4..2 all set: C is code bits 7..5 followed by bits 1..0 */
      C = ((tcode >> 5) << 2) | (tcode & 0x3);
      t4 = t3 = 2;
   } else {
      C = tcode & 0x1f;
      if (((tcode >> 5) & 0x3) == 0x3) {
         t4 = 2;
         t3 = (tcode >> 7) & 0x1;
      } else {
         t4 = (tcode >> 7) & 0x1;
         t3 = (tcode >> 5) & 0x3;
      }
   }

   /* Second stage: decode t2, t1 and t0 from C. */
   if ((C & 0x3) == 0x3) {
      const unsigned C3 = (C >> 3) & 0x1;
      const unsigned C2 = (C >> 2) & 0x1;
      t2 = 2;
      t1 = (C >> 4) & 0x1;
      t0 = (C3 << 1) | (C2 & ~C3);
   } else if (((C >> 2) & 0x3) == 0x3) {
      t2 = 2;
      t1 = 2;
      t0 = C & 0x3;
   } else {
      const unsigned C1 = (C >> 1) & 0x1;
      const unsigned C0 = (C >> 0) & 0x1;
      t2 = (C >> 4) & 0x1;
      t1 = (C >> 2) & 0x3;
      t0 = (C1 << 1) | (C0 & ~C1);
   }

   out[0] = (t0 << n) | m0;
   out[1] = (t1 << n) | m1;
   out[2] = (t2 << n) | m2;
   out[3] = (t3 << n) | m3;
   out[4] = (t4 << n) | m4;
}
161 
/**
 * Unpack 3n+7 bits from 'in' into 3 output values.
 *
 * Each output is a quint (0..4) placed above n plain low bits:
 * out[i] = (quint_i << n) | m_i.  The three quints are stored jointly as a
 * 7-bit code whose bits are interleaved with the m_i fields inside 'in'.
 *
 * \param n    number of plain bits per value, 0..5
 * \param in   the packed bits, bit 0 being the first bit of the block
 * \param out  receives exactly 3 values
 */
static void unpack_quint_block(int n, uint32_t in, uint8_t *out)
{
   assert(n >= 0 && n <= 5); /* else output will overflow uint8_t */

   /* Gather the seven scattered code bits; bit i of 'qcode' is code bit i. */
   const unsigned qcode =
      (((in >> (n)) & 0x1) << 0) |
      (((in >> (n+1)) & 0x1) << 1) |
      (((in >> (n+2)) & 0x1) << 2) |
      (((in >> (2*n+3)) & 0x1) << 3) |
      (((in >> (2*n+4)) & 0x1) << 4) |
      (((in >> (3*n+5)) & 0x1) << 5) |
      (((in >> (3*n+6)) & 0x1) << 6);

   /* The plain low bits of each of the three values. */
   const uint8_t mmask = (1 << n) - 1;
   const uint8_t m0 = (in >> (0)) & mmask;
   const uint8_t m1 = (in >> (n+3)) & mmask;
   const uint8_t m2 = (in >> (2*n+5)) & mmask;

   const unsigned bit0 = qcode & 0x1;
   const unsigned bits_2_1 = (qcode >> 1) & 0x3;
   const unsigned bits_4_3 = (qcode >> 3) & 0x3;
   const unsigned bits_6_5 = (qcode >> 5) & 0x3;

   uint8_t q2, q1, q0;
   if (bits_2_1 == 0x3 && bits_6_5 == 0x0) {
      /* q2 is { bit0, bit4 & ~bit0, bit3 & ~bit0 }: 4 when bit0 is set,
       * otherwise code bits 4..3.
       */
      q2 = bit0 ? 4 : bits_4_3;
      q1 = 4;
      q0 = 4;
   } else {
      unsigned C;
      if (bits_2_1 == 0x3) {
         q2 = 4;
         /* C is { bits 4..3, inverted bits 6..5, bit 0 } */
         C = (bits_4_3 << 3) | ((~bits_6_5 & 0x3) << 1) | bit0;
      } else {
         q2 = bits_6_5;
         C = qcode & 0x1f;
      }
      if ((C & 0x7) == 0x5) {
         q1 = 4;
         q0 = (C >> 3) & 0x3;
      } else {
         q1 = (C >> 3) & 0x3;
         q0 = C & 0x7;
      }
   }

   out[0] = (q0 << n) | m0;
   out[1] = (q1 << n) | m1;
   out[2] = (q2 << n) | m2;
}
207 
208 
/**
 * Four 8-bit components, used by this file for decoded colour endpoints.
 */
struct uint8x4_t
{
   uint8_t v[4];

   /* Zero-fill so a default-constructed value never holds indeterminate
    * bytes.
    */
   uint8x4_t()
   {
      v[0] = v[1] = v[2] = v[3] = 0;
   }

   /* Build from four components that the caller guarantees are in 0..255. */
   uint8x4_t(int a, int b, int c, int d)
   {
      assert(0 <= a && a <= 255);
      assert(0 <= b && b <= 255);
      assert(0 <= c && c <= 255);
      assert(0 <= d && d <= 255);
      v[0] = a;
      v[1] = b;
      v[2] = c;
      v[3] = d;
   }

   /* Build from four components, saturating each one to 0..255. */
   static uint8x4_t clamped(int a, int b, int c, int d)
   {
      uint8x4_t r;
      r.v[0] = clamp_to_u8(a);
      r.v[1] = clamp_to_u8(b);
      r.v[2] = clamp_to_u8(c);
      r.v[3] = clamp_to_u8(d);
      return r;
   }

private:
   /* Saturate x to the range of uint8_t. */
   static uint8_t clamp_to_u8(int x)
   {
      return x < 0 ? 0 : (x > 255 ? 255 : x);
   }
};
237 
blue_contract(int r,int g,int b,int a)238 static uint8x4_t blue_contract(int r, int g, int b, int a)
239 {
240    return uint8x4_t((r+b) >> 1, (g+b) >> 1, b, a);
241 }
242 
blue_contract_clamped(int r,int g,int b,int a)243 static uint8x4_t blue_contract_clamped(int r, int g, int b, int a)
244 {
245    return uint8x4_t::clamped((r+b) >> 1, (g+b) >> 1, b, a);
246 }
247 
/**
 * Move the top bit of the 8-bit value 'a' into the top bit of 'b', and turn
 * the remainder of 'a' into a signed offset.
 *
 * On return 'b' holds (old b >> 1) with bit 7 taken from the old 'a', and
 * 'a' holds bits 6..1 of its old value interpreted as a 6-bit two's
 * complement number, i.e. a value in -32..31.
 */
static void bit_transfer_signed(int &a, int &b)
{
   b >>= 1;
   b |= a & 0x80;
   a >>= 1;
   a &= 0x3f;
   /* sign-extend from bit 5 */
   if (a & 0x20)
      a -= 0x40;
}
257 
/**
 * 32-bit integer mixing function used by select_partition() to turn a
 * partition seed into pseudo-random bits.  All arithmetic wraps modulo 2^32.
 * The exact sequence of operations is significant and must not be reordered.
 */
static uint32_t hash52(uint32_t p)
{
   p ^= p >> 15;
   p -= p << 17;
   p += p << 7;
   p += p << 4;
   p ^= p >> 5;
   p += p << 16;
   p ^= p >> 7;
   p ^= p >> 3;
   p ^= p << 6;
   p ^= p >> 17;
   return p;
}
272 
select_partition(int seed,int x,int y,int z,int partitioncount,int small_block)273 static int select_partition(int seed, int x, int y, int z, int partitioncount,
274                             int small_block)
275 {
276    if (small_block) {
277       x <<= 1;
278       y <<= 1;
279       z <<= 1;
280    }
281    seed += (partitioncount - 1) * 1024;
282    uint32_t rnum = hash52(seed);
283    uint8_t seed1 = rnum & 0xF;
284    uint8_t seed2 = (rnum >> 4) & 0xF;
285    uint8_t seed3 = (rnum >> 8) & 0xF;
286    uint8_t seed4 = (rnum >> 12) & 0xF;
287    uint8_t seed5 = (rnum >> 16) & 0xF;
288    uint8_t seed6 = (rnum >> 20) & 0xF;
289    uint8_t seed7 = (rnum >> 24) & 0xF;
290    uint8_t seed8 = (rnum >> 28) & 0xF;
291    uint8_t seed9 = (rnum >> 18) & 0xF;
292    uint8_t seed10 = (rnum >> 22) & 0xF;
293    uint8_t seed11 = (rnum >> 26) & 0xF;
294    uint8_t seed12 = ((rnum >> 30) | (rnum << 2)) & 0xF;
295 
296    seed1 *= seed1;
297    seed2 *= seed2;
298    seed3 *= seed3;
299    seed4 *= seed4;
300    seed5 *= seed5;
301    seed6 *= seed6;
302    seed7 *= seed7;
303    seed8 *= seed8;
304    seed9 *= seed9;
305    seed10 *= seed10;
306    seed11 *= seed11;
307    seed12 *= seed12;
308 
309    int sh1, sh2, sh3;
310    if (seed & 1) {
311       sh1 = (seed & 2 ? 4 : 5);
312       sh2 = (partitioncount == 3 ? 6 : 5);
313    } else {
314       sh1 = (partitioncount == 3 ? 6 : 5);
315       sh2 = (seed & 2 ? 4 : 5);
316    }
317    sh3 = (seed & 0x10) ? sh1 : sh2;
318 
319    seed1 >>= sh1;
320    seed2 >>= sh2;
321    seed3 >>= sh1;
322    seed4 >>= sh2;
323    seed5 >>= sh1;
324    seed6 >>= sh2;
325    seed7 >>= sh1;
326    seed8 >>= sh2;
327    seed9 >>= sh3;
328    seed10 >>= sh3;
329    seed11 >>= sh3;
330    seed12 >>= sh3;
331 
332    int a = seed1 * x + seed2 * y + seed11 * z + (rnum >> 14);
333    int b = seed3 * x + seed4 * y + seed12 * z + (rnum >> 10);
334    int c = seed5 * x + seed6 * y + seed9 * z + (rnum >> 6);
335    int d = seed7 * x + seed8 * y + seed10 * z + (rnum >> 2);
336 
337    a &= 0x3F;
338    b &= 0x3F;
339    c &= 0x3F;
340    d &= 0x3F;
341 
342    if (partitioncount < 4)
343       d = 0;
344    if (partitioncount < 3)
345       c = 0;
346 
347    if (a >= b && a >= c && a >= d)
348       return 0;
349    else if (b >= c && b >= d)
350       return 1;
351    else if (c >= d)
352       return 2;
353    else
354       return 3;
355 }
356 
357 
/**
 * Read-only view of a 128-bit block as four 32-bit words.  Bit 0 of the
 * block is bit 0 of data[0]; bit 127 is bit 31 of data[3].
 */
struct InputBitVector
{
   uint32_t data[4];

   /**
    * Debug helper: print the block as 128 characters, most significant bit
    * first, showing '0'/'1' for the 'count' bits starting at 'offset' and
    * '.' for every other bit, followed by a printf-style message and a
    * newline.
    */
   void printf_bits(int offset, int count, const char *fmt = "", ...)
   {
      /* The loop below indexes out[127 - idx]; anything outside the block
       * would write outside the buffer.
       */
      assert(offset >= 0 && count >= 0 && offset + count <= 128);

      char out[129];
      memset(out, '.', 128);
      out[128] = '\0';
      int idx = offset;
      for (int i = 0; i < count; ++i) {
         out[127 - idx] = ((data[idx >> 5] >> (idx & 31)) & 1) ? '1' : '0';
         ++idx;
      }
      printf("%s ", out);
      va_list ap;
      va_start(ap, fmt);
      vprintf(fmt, ap);
      va_end(ap);
      printf("\n");
   }

   /**
    * Return 'count' bits (0..31) starting at bit 'offset', in the low bits
    * of the result.  Bits beyond the end of the block read as 0.
    */
   uint32_t get_bits(int offset, int count)
   {
      assert(count >= 0 && count < 32);
      assert(offset >= 0);

      uint32_t out = 0;
      if (offset >= 0 && offset < 128) {
         const int word = offset >> 5;
         const int shift = offset & 31;

         out = data[word] >> shift;
         /* Pull the upper part from the next word.  Skipped when shift == 0
          * because a shift by 32 is undefined and nothing is missing anyway.
          */
         if (shift != 0 && word + 1 < 4)
            out |= data[word + 1] << (32 - shift);
      }

      /* Unsigned literal: (1 << 31) - 1 would overflow a signed int. */
      out &= (1u << count) - 1;
      return out;
   }

   /**
    * Return 'count' bits (0..63) starting at bit 'offset', in the low bits
    * of the result.  Bits beyond the end of the block read as 0.
    */
   uint64_t get_bits64(int offset, int count)
   {
      assert(count >= 0 && count < 64);
      assert(offset >= 0);

      uint64_t out = 0;
      if (offset >= 0 && offset < 128) {
         const int word = offset >> 5;
         const int shift = offset & 31;

         out = data[word] >> shift;
         /* Append following words until 64 bits are covered or the block
          * ends; 'filled' is the number of result bits already supplied.
          */
         int filled = 32 - shift;
         for (int w = word + 1; w < 4 && filled < 64; ++w, filled += 32)
            out |= (uint64_t)data[w] << filled;
      }

      out &= ((uint64_t)1 << count) - 1;
      return out;
   }

   /**
    * Return the 'count' bits that end just below bit 'offset' (bits
    * offset-count .. offset-1) in reversed order, so that bit offset-1 of
    * the block becomes bit 0 of the result.
    */
   uint32_t get_bits_rev(int offset, int count)
   {
      assert(offset >= count);
      uint32_t tmp = get_bits(offset - count, count);
      uint32_t out = 0;
      for (int i = 0; i < count; ++i)
         out |= ((tmp >> i) & 1) << (count - 1 - i);
      return out;
   }
};
444 
445 struct OutputBitVector
446 {
447    uint32_t data[4];
448    int offset;
449 
OutputBitVectorOutputBitVector450    OutputBitVector()
451       : offset(0)
452    {
453       memset(data, 0, sizeof(data));
454    }
455 
appendOutputBitVector456    void append(uint32_t value, int size)
457    {
458       if (VERBOSE_WRITE)
459          printf("append offset=%d size=%d values=0x%x\n", offset, size, value);
460 
461       assert(offset + size <= 128);
462 
463       assert(size <= 32);
464       if (size < 32)
465          assert((value >> size) == 0);
466 
467       while (size) {
468          int c = MIN2(size, 32 - (offset & 31));
469          data[offset >> 5] |= (value << (offset & 31));
470          offset += c;
471          size -= c;
472          value >>= c;
473       }
474    }
475 
append64OutputBitVector476    void append64(uint64_t value, int size)
477    {
478       if (VERBOSE_WRITE)
479          printf("append offset=%d size=%d values=0x%llx\n", offset, size, (unsigned long long)value);
480 
481       assert(offset + size <= 128);
482 
483       assert(size <= 64);
484       if (size < 64)
485          assert((value >> size) == 0);
486 
487       while (size) {
488          int c = MIN2(size, 32 - (offset & 31));
489          data[offset >> 5] |= (value << (offset & 31));
490          offset += c;
491          size -= c;
492          value >>= c;
493       }
494    }
495 
appendOutputBitVector496    void append(OutputBitVector &v, int size)
497    {
498       if (VERBOSE_WRITE)
499          printf("append vector offset=%d size=%d\n", offset, size);
500 
501       assert(offset + size <= 128);
502       int i = 0;
503       while (size >= 32) {
504          append(v.data[i++], 32);
505          size -= 32;
506       }
507       if (size > 0)
508          append(v.data[i] & ((1 << size) - 1), size);
509    }
510 
append_endOutputBitVector511    void append_end(OutputBitVector &v, int size)
512    {
513       for (int i = 0; i < size; ++i)
514          data[(127 - i) >> 5] |= ((v.data[i >> 5] >> (i & 31)) & 1) << ((127 - i) & 31);
515    }
516 
517    /* Insert the given number of '1' bits. (We could use 0s instead, but 1s are
518     * more likely to flush out bugs where we accidentally read undefined bits.)
519     */
skipOutputBitVector520    void skip(int size)
521    {
522       if (VERBOSE_WRITE)
523          printf("skip offset=%d size=%d\n", offset, size);
524 
525       assert(offset + size <= 128);
526       while (size >= 32) {
527          append(0xffffffff, 32);
528          size -= 32;
529       }
530       if (size > 0)
531          append(0xffffffff >> (32 - size), size);
532    }
533 };
534 
535 
536 class Decoder
537 {
538 public:
Decoder(int block_w,int block_h,int block_d,bool srgb,bool output_unorm8)539    Decoder(int block_w, int block_h, int block_d, bool srgb, bool output_unorm8)
540       : block_w(block_w), block_h(block_h), block_d(block_d), srgb(srgb),
541         output_unorm8(output_unorm8) {}
542 
543    decode_error::type decode(const uint8_t *in, uint16_t *output) const;
544 
545    int block_w, block_h, block_d;
546    bool srgb, output_unorm8;
547 };
548 
549 struct Block
550 {
551    bool is_error;
552    bool bogus_colour_endpoints;
553    bool bogus_weights;
554 
555    int high_prec;
556    int dual_plane;
557    int colour_component_selector;
558    int wt_range;
559    int wt_w, wt_h, wt_d;
560    int num_parts;
561    int partition_index;
562 
563    bool is_void_extent;
564    int void_extent_d;
565    int void_extent_min_s;
566    int void_extent_max_s;
567    int void_extent_min_t;
568    int void_extent_max_t;
569    uint16_t void_extent_colour_r;
570    uint16_t void_extent_colour_g;
571    uint16_t void_extent_colour_b;
572    uint16_t void_extent_colour_a;
573 
574    bool is_multi_cem;
575    int num_extra_cem_bits;
576    int colour_endpoint_data_offset;
577    int extra_cem_bits;
578    int cem_base_class;
579    int cems[4];
580 
581    int num_cem_values;
582 
583    /* Calculated by unpack_weights(): */
584    uint8_t weights_quant[64 + 4]; /* max 64 values, plus padding for overflows in trit parsing */
585 
586    /* Calculated by unquantise_weights(): */
587    uint8_t weights[64 + 18]; /* max 64 values, plus padding for the infill interpolation */
588 
589    /* Calculated by unpack_colour_endpoints(): */
590    uint8_t colour_endpoints_quant[18 + 4]; /* max 18 values, plus padding for overflows in trit parsing */
591 
592    /* Calculated by unquantise_colour_endpoints(): */
593    uint8_t colour_endpoints[18];
594 
595    /* Calculated by calculate_from_weights(): */
596    int wt_trits;
597    int wt_quints;
598    int wt_bits;
599    int wt_max;
600    int num_weights;
601    int weight_bits;
602 
603    /* Calculated by calculate_remaining_bits(): */
604    int remaining_bits;
605 
606    /* Calculated by calculate_colour_endpoints_size(): */
607    int colour_endpoint_bits;
608    int ce_max;
609    int ce_trits;
610    int ce_quints;
611    int ce_bits;
612 
613    /* Calculated by compute_infill_weights(); */
614    uint8_t infill_weights[2][216]; /* large enough for 6x6x6 */
615 
616    /* Calculated by decode_colour_endpoints(); */
617    uint8x4_t endpoints_decoded[2][4];
618 
619    void calculate_from_weights();
620    void calculate_remaining_bits();
621    decode_error::type calculate_colour_endpoints_size();
622 
623    void unquantise_weights();
624    void unquantise_colour_endpoints();
625 
626    decode_error::type decode(const Decoder &decoder, InputBitVector in);
627 
628    decode_error::type decode_block_mode(InputBitVector in);
629    decode_error::type decode_void_extent(InputBitVector in);
630    void decode_cem(InputBitVector in);
631    void unpack_colour_endpoints(InputBitVector in);
632    void decode_colour_endpoints();
633    void unpack_weights(InputBitVector in);
634    void compute_infill_weights(int block_w, int block_h, int block_d);
635 
636    void write_decoded(const Decoder &decoder, uint16_t *output);
637 };
638 
639 
decode(const uint8_t * in,uint16_t * output) const640 decode_error::type Decoder::decode(const uint8_t *in, uint16_t *output) const
641 {
642    Block blk;
643    InputBitVector in_vec;
644    memcpy(&in_vec.data, in, 16);
645    decode_error::type err = blk.decode(*this, in_vec);
646    if (err == decode_error::ok) {
647       blk.write_decoded(*this, output);
648    } else {
649       /* Fill output with the error colour */
650       for (int i = 0; i < block_w * block_h * block_d; ++i) {
651          if (output_unorm8) {
652             output[i*4+0] = 0xff;
653             output[i*4+1] = 0;
654             output[i*4+2] = 0xff;
655             output[i*4+3] = 0xff;
656          } else {
657             assert(!srgb); /* srgb must use unorm8 */
658 
659             output[i*4+0] = FP16_ONE;
660             output[i*4+1] = FP16_ZERO;
661             output[i*4+2] = FP16_ONE;
662             output[i*4+3] = FP16_ONE;
663          }
664       }
665    }
666    return err;
667 }
668 
669 
decode_void_extent(InputBitVector block)670 decode_error::type Block::decode_void_extent(InputBitVector block)
671 {
672    /* TODO: 3D */
673 
674    is_void_extent = true;
675    void_extent_d = block.get_bits(9, 1);
676    void_extent_min_s = block.get_bits(12, 13);
677    void_extent_max_s = block.get_bits(25, 13);
678    void_extent_min_t = block.get_bits(38, 13);
679    void_extent_max_t = block.get_bits(51, 13);
680    void_extent_colour_r = block.get_bits(64, 16);
681    void_extent_colour_g = block.get_bits(80, 16);
682    void_extent_colour_b = block.get_bits(96, 16);
683    void_extent_colour_a = block.get_bits(112, 16);
684 
685    /* TODO: maybe we should do something useful with the extent coordinates? */
686 
687    if (void_extent_d) {
688       return decode_error::unsupported_hdr_void_extent;
689    }
690 
691    if (void_extent_min_s == 0x1fff && void_extent_max_s == 0x1fff
692        && void_extent_min_t == 0x1fff && void_extent_max_t == 0x1fff) {
693 
694       /* No extents */
695 
696    } else {
697 
698       /* Check for illegal encoding */
699       if (void_extent_min_s >= void_extent_max_s || void_extent_min_t >= void_extent_max_t) {
700          return decode_error::invalid_range_in_void_extent;
701       }
702    }
703 
704    return decode_error::ok;
705 }
706 
decode_block_mode(InputBitVector in)707 decode_error::type Block::decode_block_mode(InputBitVector in)
708 {
709    dual_plane = in.get_bits(10, 1);
710    high_prec = in.get_bits(9, 1);
711 
712    if (in.get_bits(0, 2) != 0x0) {
713       wt_range = (in.get_bits(0, 2) << 1) | in.get_bits(4, 1);
714       int a = in.get_bits(5, 2);
715       int b = in.get_bits(7, 2);
716       switch (in.get_bits(2, 2)) {
717       case 0x0:
718          if (VERBOSE_DECODE)
719             in.printf_bits(0, 11, "DHBBAAR00RR");
720          wt_w = b + 4;
721          wt_h = a + 2;
722          break;
723       case 0x1:
724          if (VERBOSE_DECODE)
725             in.printf_bits(0, 11, "DHBBAAR01RR");
726          wt_w = b + 8;
727          wt_h = a + 2;
728          break;
729       case 0x2:
730          if (VERBOSE_DECODE)
731             in.printf_bits(0, 11, "DHBBAAR10RR");
732          wt_w = a + 2;
733          wt_h = b + 8;
734          break;
735       case 0x3:
736          if ((b & 0x2) == 0) {
737             if (VERBOSE_DECODE)
738                in.printf_bits(0, 11, "DH0BAAR11RR");
739             wt_w = a + 2;
740             wt_h = b + 6;
741          } else {
742             if (VERBOSE_DECODE)
743                in.printf_bits(0, 11, "DH1BAAR11RR");
744             wt_w = (b & 0x1) + 2;
745             wt_h = a + 2;
746          }
747          break;
748       }
749    } else {
750       if (in.get_bits(6, 3) == 0x7) {
751          if (in.get_bits(0, 9) == 0x1fc) {
752             if (VERBOSE_DECODE)
753                in.printf_bits(0, 11, "xx111111100 (void extent)");
754             return decode_void_extent(in);
755          } else {
756             if (VERBOSE_DECODE)
757                in.printf_bits(0, 11, "xx111xxxx00");
758             return decode_error::reserved_block_mode_1;
759          }
760       }
761       if (in.get_bits(0, 4) == 0x0) {
762          if (VERBOSE_DECODE)
763             in.printf_bits(0, 11, "xxxxxxx0000");
764          return decode_error::reserved_block_mode_2;
765       }
766 
767       wt_range = in.get_bits(1, 3) | in.get_bits(4, 1);
768       int a = in.get_bits(5, 2);
769       int b;
770 
771       switch (in.get_bits(7, 2)) {
772       case 0x0:
773          if (VERBOSE_DECODE)
774             in.printf_bits(0, 11, "DH00AARRR00");
775          wt_w = 12;
776          wt_h = a + 2;
777          break;
778       case 0x1:
779          if (VERBOSE_DECODE)
780             in.printf_bits(0, 11, "DH01AARRR00");
781          wt_w = a + 2;
782          wt_h = 12;
783          break;
784       case 0x3:
785          if (in.get_bits(5, 1) == 0) {
786             if (VERBOSE_DECODE)
787                in.printf_bits(0, 11, "DH1100RRR00");
788             wt_w = 6;
789             wt_h = 10;
790          } else {
791             if (VERBOSE_DECODE)
792                in.printf_bits(0, 11, "DH1101RRR00");
793             wt_w = 10;
794             wt_h = 6;
795          }
796          break;
797       case 0x2:
798          if (VERBOSE_DECODE)
799             in.printf_bits(0, 11, "BB10AARRR00");
800          b = in.get_bits(9, 2);
801          wt_w = a + 6;
802          wt_h = b + 6;
803          dual_plane = 0;
804          high_prec = 0;
805          break;
806       }
807    }
808    return decode_error::ok;
809 }
810 
decode_cem(InputBitVector in)811 void Block::decode_cem(InputBitVector in)
812 {
813    cems[0] = cems[1] = cems[2] = cems[3] = -1;
814 
815    num_extra_cem_bits = 0;
816    extra_cem_bits = 0;
817 
818    if (num_parts > 1) {
819 
820       partition_index = in.get_bits(13, 10);
821       if (VERBOSE_DECODE)
822          in.printf_bits(13, 10, "partition ID (%d)", partition_index);
823 
824       uint32_t cem = in.get_bits(23, 6);
825 
826       if ((cem & 0x3) == 0x0) {
827          cem >>= 2;
828          cem_base_class = cem >> 2;
829          is_multi_cem = false;
830 
831          for (int i = 0; i < num_parts; ++i)
832             cems[i] = cem;
833 
834          if (VERBOSE_DECODE)
835             in.printf_bits(23, 6, "CEM (single, %d)", cem);
836       } else {
837 
838          cem_base_class = (cem & 0x3) - 1;
839          is_multi_cem = true;
840 
841          if (VERBOSE_DECODE)
842             in.printf_bits(23, 6, "CEM (multi, base class %d)", cem_base_class);
843 
844          int offset = 128 - weight_bits;
845 
846          if (num_parts == 2) {
847             if (VERBOSE_DECODE) {
848                in.printf_bits(25, 4, "M0M0 C1 C0");
849                in.printf_bits(offset - 2, 2, "M1M1");
850             }
851 
852             uint32_t c0 = in.get_bits(25, 1);
853             uint32_t c1 = in.get_bits(26, 1);
854 
855             extra_cem_bits = c0 + c1;
856 
857             num_extra_cem_bits = 2;
858 
859             uint32_t m0 = in.get_bits(27, 2);
860             uint32_t m1 = in.get_bits(offset - 2, 2);
861 
862             cems[0] = ((cem_base_class + c0) << 2) | m0;
863             cems[1] = ((cem_base_class + c1) << 2) | m1;
864 
865          } else if (num_parts == 3) {
866             if (VERBOSE_DECODE) {
867                in.printf_bits(25, 4, "M0 C2 C1 C0");
868                in.printf_bits(offset - 5, 5, "M2M2 M1M1 M0");
869             }
870 
871             uint32_t c0 = in.get_bits(25, 1);
872             uint32_t c1 = in.get_bits(26, 1);
873             uint32_t c2 = in.get_bits(27, 1);
874 
875             extra_cem_bits = c0 + c1 + c2;
876 
877             num_extra_cem_bits = 5;
878 
879             uint32_t m0 = in.get_bits(28, 1) | (in.get_bits(128 - weight_bits - 5, 1) << 1);
880             uint32_t m1 = in.get_bits(offset - 4, 2);
881             uint32_t m2 = in.get_bits(offset - 2, 2);
882 
883             cems[0] = ((cem_base_class + c0) << 2) | m0;
884             cems[1] = ((cem_base_class + c1) << 2) | m1;
885             cems[2] = ((cem_base_class + c2) << 2) | m2;
886 
887          } else if (num_parts == 4) {
888             if (VERBOSE_DECODE) {
889                in.printf_bits(25, 4, "C3 C2 C1 C0");
890                in.printf_bits(offset - 8, 8, "M3M3 M2M2 M1M1 M0M0");
891             }
892 
893             uint32_t c0 = in.get_bits(25, 1);
894             uint32_t c1 = in.get_bits(26, 1);
895             uint32_t c2 = in.get_bits(27, 1);
896             uint32_t c3 = in.get_bits(28, 1);
897 
898             extra_cem_bits = c0 + c1 + c2 + c3;
899 
900             num_extra_cem_bits = 8;
901 
902             uint32_t m0 = in.get_bits(offset - 8, 2);
903             uint32_t m1 = in.get_bits(offset - 6, 2);
904             uint32_t m2 = in.get_bits(offset - 4, 2);
905             uint32_t m3 = in.get_bits(offset - 2, 2);
906 
907             cems[0] = ((cem_base_class + c0) << 2) | m0;
908             cems[1] = ((cem_base_class + c1) << 2) | m1;
909             cems[2] = ((cem_base_class + c2) << 2) | m2;
910             cems[3] = ((cem_base_class + c3) << 2) | m3;
911          } else {
912             unreachable("");
913          }
914       }
915 
916       colour_endpoint_data_offset = 29;
917 
918    } else {
919       uint32_t cem = in.get_bits(13, 4);
920 
921       cem_base_class = cem >> 2;
922       is_multi_cem = false;
923 
924       cems[0] = cem;
925 
926       partition_index = -1;
927 
928       if (VERBOSE_DECODE)
929          in.printf_bits(13, 4, "CEM = %d (class %d)", cem, cem_base_class);
930 
931       colour_endpoint_data_offset = 17;
932    }
933 }
934 
unpack_colour_endpoints(InputBitVector in)935 void Block::unpack_colour_endpoints(InputBitVector in)
936 {
937    if (ce_trits) {
938       int offset = colour_endpoint_data_offset;
939       int bits_left = colour_endpoint_bits;
940       for (int i = 0; i < num_cem_values; i += 5) {
941          int bits_to_read = MIN2(bits_left, 8 + ce_bits * 5);
942          /* If ce_trits then ce_bits <= 6, so bits_to_read <= 38 and we have to use uint64_t */
943          uint64_t raw = in.get_bits64(offset, bits_to_read);
944          unpack_trit_block(ce_bits, raw, &colour_endpoints_quant[i]);
945 
946          if (VERBOSE_DECODE)
947             in.printf_bits(offset, bits_to_read,
948                            "trits [%d,%d,%d,%d,%d]",
949                            colour_endpoints_quant[i+0], colour_endpoints_quant[i+1],
950                   colour_endpoints_quant[i+2], colour_endpoints_quant[i+3],
951                   colour_endpoints_quant[i+4]);
952 
953          offset += 8 + ce_bits * 5;
954          bits_left -= 8 + ce_bits * 5;
955       }
956    } else if (ce_quints) {
957       int offset = colour_endpoint_data_offset;
958       int bits_left = colour_endpoint_bits;
959       for (int i = 0; i < num_cem_values; i += 3) {
960          int bits_to_read = MIN2(bits_left, 7 + ce_bits * 3);
961          /* If ce_quints then ce_bits <= 5, so bits_to_read <= 22 and we can use uint32_t */
962          uint32_t raw = in.get_bits(offset, bits_to_read);
963          unpack_quint_block(ce_bits, raw, &colour_endpoints_quant[i]);
964 
965          if (VERBOSE_DECODE)
966             in.printf_bits(offset, bits_to_read,
967                            "quints [%d,%d,%d]",
968                            colour_endpoints_quant[i], colour_endpoints_quant[i+1], colour_endpoints_quant[i+2]);
969 
970          offset += 7 + ce_bits * 3;
971          bits_left -= 7 + ce_bits * 3;
972       }
973    } else {
974       assert((colour_endpoint_bits % ce_bits) == 0);
975       int offset = colour_endpoint_data_offset;
976       for (int i = 0; i < num_cem_values; i++) {
977          colour_endpoints_quant[i] = in.get_bits(offset, ce_bits);
978 
979          if (VERBOSE_DECODE)
980             in.printf_bits(offset, ce_bits, "bits [%d]", colour_endpoints_quant[i]);
981 
982          offset += ce_bits;
983       }
984    }
985 }
986 
decode_colour_endpoints()987 void Block::decode_colour_endpoints()
988 {
989    int cem_values_idx = 0;
990    for (int part = 0; part < num_parts; ++part) {
991       uint8_t *v = &colour_endpoints[cem_values_idx];
992       int v0 = v[0];
993       int v1 = v[1];
994       int v2 = v[2];
995       int v3 = v[3];
996       int v4 = v[4];
997       int v5 = v[5];
998       int v6 = v[6];
999       int v7 = v[7];
1000       cem_values_idx += ((cems[part] >> 2) + 1) * 2;
1001 
1002       uint8x4_t e0, e1;
1003       int s0, s1, L0, L1;
1004 
1005       switch (cems[part])
1006       {
1007       case 0:
1008          e0 = uint8x4_t(v0, v0, v0, 0xff);
1009          e1 = uint8x4_t(v1, v1, v1, 0xff);
1010          break;
1011       case 1:
1012          L0 = (v0 >> 2) | (v1 & 0xc0);
1013          L1 = L0 + (v1 & 0x3f);
1014          if (L1 > 0xff)
1015             L1 = 0xff;
1016          e0 = uint8x4_t(L0, L0, L0, 0xff);
1017          e1 = uint8x4_t(L1, L1, L1, 0xff);
1018          break;
1019       case 4:
1020          e0 = uint8x4_t(v0, v0, v0, v2);
1021          e1 = uint8x4_t(v1, v1, v1, v3);
1022          break;
1023       case 5:
1024          bit_transfer_signed(v1, v0);
1025          bit_transfer_signed(v3, v2);
1026          e0 = uint8x4_t(v0, v0, v0, v2);
1027          e1 = uint8x4_t::clamped(v0+v1, v0+v1, v0+v1, v2+v3);
1028          break;
1029       case 6:
1030          e0 = uint8x4_t(v0*v3 >> 8, v1*v3 >> 8, v2*v3 >> 8, 0xff);
1031          e1 = uint8x4_t(v0, v1, v2, 0xff);
1032          break;
1033       case 8:
1034          s0 = v0 + v2 + v4;
1035          s1 = v1 + v3 + v5;
1036          if (s1 >= s0) {
1037             e0 = uint8x4_t(v0, v2, v4, 0xff);
1038             e1 = uint8x4_t(v1, v3, v5, 0xff);
1039          } else {
1040             e0 = blue_contract(v1, v3, v5, 0xff);
1041             e1 = blue_contract(v0, v2, v4, 0xff);
1042          }
1043          break;
1044       case 9:
1045          bit_transfer_signed(v1, v0);
1046          bit_transfer_signed(v3, v2);
1047          bit_transfer_signed(v5, v4);
1048          if (v1 + v3 + v5 >= 0) {
1049             e0 = uint8x4_t(v0, v2, v4, 0xff);
1050             e1 = uint8x4_t::clamped(v0+v1, v2+v3, v4+v5, 0xff);
1051          } else {
1052             e0 = blue_contract_clamped(v0+v1, v2+v3, v4+v5, 0xff);
1053             e1 = blue_contract(v0, v2, v4, 0xff);
1054          }
1055          break;
1056       case 10:
1057          e0 = uint8x4_t(v0*v3 >> 8, v1*v3 >> 8, v2*v3 >> 8, v4);
1058          e1 = uint8x4_t(v0, v1, v2, v5);
1059          break;
1060       case 12:
1061          s0 = v0 + v2 + v4;
1062          s1 = v1 + v3 + v5;
1063          if (s1 >= s0) {
1064             e0 = uint8x4_t(v0, v2, v4, v6);
1065             e1 = uint8x4_t(v1, v3, v5, v7);
1066          } else {
1067             e0 = blue_contract(v1, v3, v5, v7);
1068             e1 = blue_contract(v0, v2, v4, v6);
1069          }
1070          break;
1071       case 13:
1072          bit_transfer_signed(v1, v0);
1073          bit_transfer_signed(v3, v2);
1074          bit_transfer_signed(v5, v4);
1075          bit_transfer_signed(v7, v6);
1076          if (v1 + v3 + v5 >= 0) {
1077             e0 = uint8x4_t(v0, v2, v4, v6);
1078             e1 = uint8x4_t::clamped(v0+v1, v2+v3, v4+v5, v6+v7);
1079          } else {
1080             e0 = blue_contract_clamped(v0+v1, v2+v3, v4+v5, v6+v7);
1081             e1 = blue_contract(v0, v2, v4, v6);
1082          }
1083          break;
1084       default:
1085          /* HDR endpoints not supported; return error colour */
1086          e0 = uint8x4_t(255, 0, 255, 255);
1087          e1 = uint8x4_t(255, 0, 255, 255);
1088          break;
1089       }
1090 
1091       endpoints_decoded[0][part] = e0;
1092       endpoints_decoded[1][part] = e1;
1093 
1094       if (VERBOSE_DECODE) {
1095          printf("cems[%d]=%d v=[", part, cems[part]);
1096          for (int i = 0; i < (cems[part] >> 2) + 1; ++i) {
1097             if (i)
1098                printf(", ");
1099             printf("%3d", v[i]);
1100          }
1101          printf("] e0=[%3d,%4d,%4d,%4d] e1=[%3d,%4d,%4d,%4d]\n",
1102                 e0.v[0], e0.v[1], e0.v[2], e0.v[3],
1103                e1.v[0], e1.v[1], e1.v[2], e1.v[3]);
1104       }
1105    }
1106 }
1107 
unpack_weights(InputBitVector in)1108 void Block::unpack_weights(InputBitVector in)
1109 {
1110    if (wt_trits) {
1111       int offset = 128;
1112       int bits_left = weight_bits;
1113       for (int i = 0; i < num_weights; i += 5) {
1114          int bits_to_read = MIN2(bits_left, 8 + 5*wt_bits);
1115          /* If wt_trits then wt_bits <= 3, so bits_to_read <= 23 and we can use uint32_t */
1116          uint32_t raw = in.get_bits_rev(offset, bits_to_read);
1117          unpack_trit_block(wt_bits, raw, &weights_quant[i]);
1118 
1119          if (VERBOSE_DECODE)
1120             in.printf_bits(offset - bits_to_read, bits_to_read, "weight trits [%d,%d,%d,%d,%d]",
1121                            weights_quant[i+0], weights_quant[i+1],
1122                   weights_quant[i+2], weights_quant[i+3],
1123                   weights_quant[i+4]);
1124 
1125          offset -= 8 + wt_bits * 5;
1126          bits_left -= 8 + wt_bits * 5;
1127       }
1128 
1129    } else if (wt_quints) {
1130 
1131       int offset = 128;
1132       int bits_left = weight_bits;
1133       for (int i = 0; i < num_weights; i += 3) {
1134          int bits_to_read = MIN2(bits_left, 7 + 3*wt_bits);
1135          /* If wt_quints then wt_bits <= 2, so bits_to_read <= 13 and we can use uint32_t */
1136          uint32_t raw = in.get_bits_rev(offset, bits_to_read);
1137          unpack_quint_block(wt_bits, raw, &weights_quant[i]);
1138 
1139          if (VERBOSE_DECODE)
1140             in.printf_bits(offset - bits_to_read, bits_to_read, "weight quints [%d,%d,%d]",
1141                            weights_quant[i], weights_quant[i+1], weights_quant[i+2]);
1142 
1143          offset -= 7 + wt_bits * 3;
1144          bits_left -= 7 + wt_bits * 3;
1145       }
1146 
1147    } else {
1148       int offset = 128;
1149       assert((weight_bits % wt_bits) == 0);
1150       for (int i = 0; i < num_weights; ++i) {
1151          weights_quant[i] = in.get_bits_rev(offset, wt_bits);
1152 
1153          if (VERBOSE_DECODE)
1154             in.printf_bits(offset - wt_bits, wt_bits, "weight bits [%d]", weights_quant[i]);
1155 
1156          offset -= wt_bits;
1157       }
1158    }
1159 }
1160 
unquantise_weights()1161 void Block::unquantise_weights()
1162 {
1163    assert(num_weights <= (int)ARRAY_SIZE(weights_quant));
1164    assert(num_weights <= (int)ARRAY_SIZE(weights));
1165 
1166    memset(weights, 0, sizeof(weights));
1167 
1168    for (int i = 0; i < num_weights; ++i) {
1169 
1170       uint8_t v = weights_quant[i];
1171       uint8_t w;
1172 
1173       if (wt_trits) {
1174 
1175          if (wt_bits == 0) {
1176             w = v * 32;
1177          } else {
1178             uint8_t A, B, C, D;
1179             A = (v & 0x1) ? 0x7F : 0x00;
1180             switch (wt_bits) {
1181             case 1:
1182                B = 0;
1183                C = 50;
1184                D = v >> 1;
1185                break;
1186             case 2:
1187                B = (v & 0x2) ? 0x45 : 0x00;
1188                C = 23;
1189                D = v >> 2;
1190                break;
1191             case 3:
1192                B = ((v & 0x6) >> 1) | ((v & 0x6) << 4);
1193                C = 11;
1194                D = v >> 3;
1195                break;
1196             default:
1197                unreachable("");
1198             }
1199             uint16_t T = D * C + B;
1200             T = T ^ A;
1201             T = (A & 0x20) | (T >> 2);
1202             assert(T < 64);
1203             if (T > 32)
1204                T++;
1205             w = T;
1206          }
1207 
1208       } else if (wt_quints) {
1209 
1210          if (wt_bits == 0) {
1211             w = v * 16;
1212          } else {
1213             uint8_t A, B, C, D;
1214             A = (v & 0x1) ? 0x7F : 0x00;
1215             switch (wt_bits) {
1216             case 1:
1217                B = 0;
1218                C = 28;
1219                D = v >> 1;
1220                break;
1221             case 2:
1222                B = (v & 0x2) ? 0x42 : 0x00;
1223                C = 13;
1224                D = v >> 2;
1225                break;
1226             default:
1227                unreachable("");
1228             }
1229             uint16_t T = D * C + B;
1230             T = T ^ A;
1231             T = (A & 0x20) | (T >> 2);
1232             assert(T < 64);
1233             if (T > 32)
1234                T++;
1235             w = T;
1236          }
1237          weights[i] = w;
1238 
1239       } else {
1240 
1241          switch (wt_bits) {
1242          case 1: w = v ? 0x3F : 0x00; break;
1243          case 2: w = v | (v << 2) | (v << 4); break;
1244          case 3: w = v | (v << 3); break;
1245          case 4: w = (v >> 2) | (v << 2); break;
1246          case 5: w = (v >> 4) | (v << 1); break;
1247          default: unreachable("");
1248          }
1249          assert(w < 64);
1250          if (w > 32)
1251             w++;
1252       }
1253       weights[i] = w;
1254    }
1255 }
1256 
compute_infill_weights(int block_w,int block_h,int block_d)1257 void Block::compute_infill_weights(int block_w, int block_h, int block_d)
1258 {
1259    int Ds = block_w <= 1 ? 0 : (1024 + block_w / 2) / (block_w - 1);
1260    int Dt = block_h <= 1 ? 0 : (1024 + block_h / 2) / (block_h - 1);
1261    int Dr = block_d <= 1 ? 0 : (1024 + block_d / 2) / (block_d - 1);
1262    for (int r = 0; r < block_d; ++r) {
1263       for (int t = 0; t < block_h; ++t) {
1264          for (int s = 0; s < block_w; ++s) {
1265             int cs = Ds * s;
1266             int ct = Dt * t;
1267             int cr = Dr * r;
1268             int gs = (cs * (wt_w - 1) + 32) >> 6;
1269             int gt = (ct * (wt_h - 1) + 32) >> 6;
1270             int gr = (cr * (wt_d - 1) + 32) >> 6;
1271             assert(gs >= 0 && gs <= 176);
1272             assert(gt >= 0 && gt <= 176);
1273             assert(gr >= 0 && gr <= 176);
1274             int js = gs >> 4;
1275             int fs = gs & 0xf;
1276             int jt = gt >> 4;
1277             int ft = gt & 0xf;
1278             int jr = gr >> 4;
1279             int fr = gr & 0xf;
1280 
1281             /* TODO: 3D */
1282             (void)jr;
1283             (void)fr;
1284 
1285             int w11 = (fs * ft + 8) >> 4;
1286             int w10 = ft - w11;
1287             int w01 = fs - w11;
1288             int w00 = 16 - fs - ft + w11;
1289 
1290             if (dual_plane) {
1291                int p00, p01, p10, p11, i0, i1;
1292                int v0 = js + jt * wt_w;
1293                p00 = weights[(v0) * 2];
1294                p01 = weights[(v0 + 1) * 2];
1295                p10 = weights[(v0 + wt_w) * 2];
1296                p11 = weights[(v0 + wt_w + 1) * 2];
1297                i0 = (p00*w00 + p01*w01 + p10*w10 + p11*w11 + 8) >> 4;
1298                p00 = weights[(v0) * 2 + 1];
1299                p01 = weights[(v0 + 1) * 2 + 1];
1300                p10 = weights[(v0 + wt_w) * 2 + 1];
1301                p11 = weights[(v0 + wt_w + 1) * 2 + 1];
1302                assert((v0 + wt_w + 1) * 2 + 1 < (int)ARRAY_SIZE(weights));
1303                i1 = (p00*w00 + p01*w01 + p10*w10 + p11*w11 + 8) >> 4;
1304                assert(0 <= i0 && i0 <= 64);
1305                infill_weights[0][s + t*block_w + r*block_w*block_h] = i0;
1306                infill_weights[1][s + t*block_w + r*block_w*block_h] = i1;
1307             } else {
1308                int p00, p01, p10, p11, i;
1309                int v0 = js + jt * wt_w;
1310                p00 = weights[v0];
1311                p01 = weights[v0 + 1];
1312                p10 = weights[v0 + wt_w];
1313                p11 = weights[v0 + wt_w + 1];
1314                assert(v0 + wt_w + 1 < (int)ARRAY_SIZE(weights));
1315                i = (p00*w00 + p01*w01 + p10*w10 + p11*w11 + 8) >> 4;
1316                assert(0 <= i && i <= 64);
1317                infill_weights[0][s + t*block_w + r*block_w*block_h] = i;
1318             }
1319          }
1320       }
1321    }
1322 }
1323 
unquantise_colour_endpoints()1324 void Block::unquantise_colour_endpoints()
1325 {
1326    assert(num_cem_values <= (int)ARRAY_SIZE(colour_endpoints_quant));
1327    assert(num_cem_values <= (int)ARRAY_SIZE(colour_endpoints));
1328 
1329    for (int i = 0; i < num_cem_values; ++i) {
1330       uint8_t v = colour_endpoints_quant[i];
1331 
1332       if (ce_trits) {
1333          uint16_t A, B, C, D;
1334          uint16_t t;
1335          A = (v & 0x1) ? 0x1FF : 0x000;
1336          switch (ce_bits) {
1337          case 1:
1338             B = 0;
1339             C = 204;
1340             D = v >> 1;
1341             break;
1342          case 2:
1343             B = (v & 0x2) ? 0x116 : 0x000;
1344             C = 93;
1345             D = v >> 2;
1346             break;
1347          case 3:
1348             t = ((v >> 1) & 0x3);
1349             B = t | (t << 2) | (t << 7);
1350             C = 44;
1351             D = v >> 3;
1352             break;
1353          case 4:
1354             t = ((v >> 1) & 0x7);
1355             B = t | (t << 6);
1356             C = 22;
1357             D = v >> 4;
1358             break;
1359          case 5:
1360             t = ((v >> 1) & 0xF);
1361             B = (t >> 2) | (t << 5);
1362             C = 11;
1363             D = v >> 5;
1364             break;
1365          case 6:
1366             B = ((v & 0x3E) << 3) | ((v >> 5) & 0x1);
1367             C = 5;
1368             D = v >> 6;
1369             break;
1370          default:
1371             unreachable("");
1372          }
1373          uint16_t T = D * C + B;
1374          T = T ^ A;
1375          T = (A & 0x80) | (T >> 2);
1376          assert(T < 256);
1377          colour_endpoints[i] = T;
1378       } else if (ce_quints) {
1379          uint16_t A, B, C, D;
1380          uint16_t t;
1381          A = (v & 0x1) ? 0x1FF : 0x000;
1382          switch (ce_bits) {
1383          case 1:
1384             B = 0;
1385             C = 113;
1386             D = v >> 1;
1387             break;
1388          case 2:
1389             B = (v & 0x2) ? 0x10C : 0x000;
1390             C = 54;
1391             D = v >> 2;
1392             break;
1393          case 3:
1394             t = ((v >> 1) & 0x3);
1395             B = (t >> 1) | (t << 1) | (t << 7);
1396             C = 26;
1397             D = v >> 3;
1398             break;
1399          case 4:
1400             t = ((v >> 1) & 0x7);
1401             B = (t >> 1) | (t << 6);
1402             C = 13;
1403             D = v >> 4;
1404             break;
1405          case 5:
1406             t = ((v >> 1) & 0xF);
1407             B = (t >> 4) | (t << 5);
1408             C = 6;
1409             D = v >> 5;
1410             break;
1411          default:
1412             unreachable("");
1413          }
1414          uint16_t T = D * C + B;
1415          T = T ^ A;
1416          T = (A & 0x80) | (T >> 2);
1417          assert(T < 256);
1418          colour_endpoints[i] = T;
1419       } else {
1420          switch (ce_bits) {
1421          case 1: v = v ? 0xFF : 0x00; break;
1422          case 2: v = (v << 6) | (v << 4) | (v << 2) | v; break;
1423          case 3: v = (v << 5) | (v << 2) | (v >> 1); break;
1424          case 4: v = (v << 4) | v; break;
1425          case 5: v = (v << 3) | (v >> 2); break;
1426          case 6: v = (v << 2) | (v >> 4); break;
1427          case 7: v = (v << 1) | (v >> 6); break;
1428          case 8: break;
1429          default: unreachable("");
1430          }
1431          colour_endpoints[i] = v;
1432       }
1433    }
1434 }
1435 
decode(const Decoder & decoder,InputBitVector in)1436 decode_error::type Block::decode(const Decoder &decoder, InputBitVector in)
1437 {
1438    decode_error::type err;
1439 
1440    is_error = false;
1441    bogus_colour_endpoints = false;
1442    bogus_weights = false;
1443    is_void_extent = false;
1444 
1445    wt_d = 1;
1446    /* TODO: 3D */
1447 
1448    /* TODO: test for all the illegal encodings */
1449 
1450    if (VERBOSE_DECODE)
1451       in.printf_bits(0, 128);
1452 
1453    err = decode_block_mode(in);
1454    if (err != decode_error::ok)
1455       return err;
1456 
1457    if (is_void_extent)
1458       return decode_error::ok;
1459 
1460    /* TODO: 3D */
1461 
1462    calculate_from_weights();
1463 
1464    if (VERBOSE_DECODE)
1465       printf("weights_grid=%dx%dx%d dual_plane=%d num_weights=%d high_prec=%d r=%d range=0..%d (%dt %dq %db) weight_bits=%d\n",
1466              wt_w, wt_h, wt_d, dual_plane, num_weights, high_prec, wt_range, wt_max, wt_trits, wt_quints, wt_bits, weight_bits);
1467 
1468    if (wt_w > decoder.block_w || wt_h > decoder.block_h || wt_d > decoder.block_d)
1469       return decode_error::weight_grid_exceeds_block_size;
1470 
1471    num_parts = in.get_bits(11, 2) + 1;
1472 
1473    if (VERBOSE_DECODE)
1474       in.printf_bits(11, 2, "partitions = %d", num_parts);
1475 
1476    if (dual_plane && num_parts > 3)
1477       return decode_error::dual_plane_and_too_many_partitions;
1478 
1479    decode_cem(in);
1480 
1481    if (VERBOSE_DECODE)
1482       printf("cem=[%d,%d,%d,%d] base_cem_class=%d\n", cems[0], cems[1], cems[2], cems[3], cem_base_class);
1483 
1484    int num_cem_pairs = (cem_base_class + 1) * num_parts + extra_cem_bits;
1485    num_cem_values = num_cem_pairs * 2;
1486 
1487    calculate_remaining_bits();
1488    err = calculate_colour_endpoints_size();
1489    if (err != decode_error::ok)
1490       return err;
1491 
1492    if (VERBOSE_DECODE)
1493       in.printf_bits(colour_endpoint_data_offset, colour_endpoint_bits,
1494                      "endpoint data (%d bits, %d vals, %dt %dq %db)",
1495                      colour_endpoint_bits, num_cem_values, ce_trits, ce_quints, ce_bits);
1496 
1497    unpack_colour_endpoints(in);
1498 
1499    if (VERBOSE_DECODE) {
1500       printf("cem values raw =[");
1501       for (int i = 0; i < num_cem_values; i++) {
1502          if (i)
1503             printf(", ");
1504          printf("%3d", colour_endpoints_quant[i]);
1505       }
1506       printf("]\n");
1507    }
1508 
1509    if (num_cem_values > 18)
1510       return decode_error::invalid_colour_endpoints_count;
1511 
1512    unquantise_colour_endpoints();
1513 
1514    if (VERBOSE_DECODE) {
1515       printf("cem values norm=[");
1516       for (int i = 0; i < num_cem_values; i++) {
1517          if (i)
1518             printf(", ");
1519          printf("%3d", colour_endpoints[i]);
1520       }
1521       printf("]\n");
1522    }
1523 
1524    decode_colour_endpoints();
1525 
1526    if (dual_plane) {
1527       int ccs_offset = 128 - weight_bits - num_extra_cem_bits - 2;
1528       colour_component_selector = in.get_bits(ccs_offset, 2);
1529 
1530       if (VERBOSE_DECODE)
1531          in.printf_bits(ccs_offset, 2, "colour component selector = %d", colour_component_selector);
1532    } else {
1533       colour_component_selector = 0;
1534    }
1535 
1536 
1537    if (VERBOSE_DECODE)
1538       in.printf_bits(128 - weight_bits, weight_bits, "weights (%d bits)", weight_bits);
1539 
1540    if (num_weights > 64)
1541       return decode_error::invalid_num_weights;
1542 
1543    if (weight_bits < 24 || weight_bits > 96)
1544       return decode_error::invalid_weight_bits;
1545 
1546    unpack_weights(in);
1547 
1548    unquantise_weights();
1549 
1550    if (VERBOSE_DECODE) {
1551       printf("weights=[");
1552       for (int i = 0; i < num_weights; ++i) {
1553          if (i)
1554             printf(", ");
1555          printf("%d", weights[i]);
1556       }
1557       printf("]\n");
1558 
1559       for (int plane = 0; plane <= dual_plane; ++plane) {
1560          printf("weights (plane %d):\n", plane);
1561          int i = 0;
1562          (void)i;
1563 
1564          for (int r = 0; r < wt_d; ++r) {
1565             for (int t = 0; t < wt_h; ++t) {
1566                for (int s = 0; s < wt_w; ++s) {
1567                   printf("%3d", weights[i++ * (1 + dual_plane) + plane]);
1568                }
1569                printf("\n");
1570             }
1571             if (r < wt_d - 1)
1572                printf("\n");
1573          }
1574       }
1575    }
1576 
1577    compute_infill_weights(decoder.block_w, decoder.block_h, decoder.block_d);
1578 
1579    if (VERBOSE_DECODE) {
1580       for (int plane = 0; plane <= dual_plane; ++plane) {
1581          printf("infilled weights (plane %d):\n", plane);
1582          int i = 0;
1583          (void)i;
1584 
1585          for (int r = 0; r < decoder.block_d; ++r) {
1586             for (int t = 0; t < decoder.block_h; ++t) {
1587                for (int s = 0; s < decoder.block_w; ++s) {
1588                   printf("%3d", infill_weights[plane][i++]);
1589                }
1590                printf("\n");
1591             }
1592             if (r < decoder.block_d - 1)
1593                printf("\n");
1594          }
1595       }
1596    }
1597    if (VERBOSE_DECODE)
1598       printf("\n");
1599 
1600    return decode_error::ok;
1601 }
1602 
write_decoded(const Decoder & decoder,uint16_t * output)1603 void Block::write_decoded(const Decoder &decoder, uint16_t *output)
1604 {
1605    /* sRGB can only be stored as unorm8. */
1606    assert(!decoder.srgb || decoder.output_unorm8);
1607 
1608    if (is_void_extent) {
1609       for (int idx = 0; idx < decoder.block_w*decoder.block_h*decoder.block_d; ++idx) {
1610          if (decoder.output_unorm8) {
1611             output[idx*4+0] = void_extent_colour_r >> 8;
1612             output[idx*4+1] = void_extent_colour_g >> 8;
1613             output[idx*4+2] = void_extent_colour_b >> 8;
1614             output[idx*4+3] = void_extent_colour_a >> 8;
1615          } else {
1616             /* Store the color as FP16. */
1617             output[idx*4+0] = _mesa_uint16_div_64k_to_half(void_extent_colour_r);
1618             output[idx*4+1] = _mesa_uint16_div_64k_to_half(void_extent_colour_g);
1619             output[idx*4+2] = _mesa_uint16_div_64k_to_half(void_extent_colour_b);
1620             output[idx*4+3] = _mesa_uint16_div_64k_to_half(void_extent_colour_a);
1621          }
1622       }
1623       return;
1624    }
1625 
1626    int small_block = (decoder.block_w * decoder.block_h * decoder.block_d) < 31;
1627 
1628    int idx = 0;
1629    for (int z = 0; z < decoder.block_d; ++z) {
1630       for (int y = 0; y < decoder.block_h; ++y) {
1631          for (int x = 0; x < decoder.block_w; ++x) {
1632 
1633             int partition;
1634             if (num_parts > 1) {
1635                partition = select_partition(partition_index, x, y, z, num_parts, small_block);
1636                assert(partition < num_parts);
1637             } else {
1638                partition = 0;
1639             }
1640 
1641             /* TODO: HDR */
1642 
1643             uint8x4_t e0 = endpoints_decoded[0][partition];
1644             uint8x4_t e1 = endpoints_decoded[1][partition];
1645             uint16_t c0[4], c1[4];
1646 
1647             /* Expand to 16 bits. */
1648             if (decoder.srgb) {
1649                c0[0] = (uint16_t)((e0.v[0] << 8) | 0x80);
1650                c0[1] = (uint16_t)((e0.v[1] << 8) | 0x80);
1651                c0[2] = (uint16_t)((e0.v[2] << 8) | 0x80);
1652                c0[3] = (uint16_t)((e0.v[3] << 8) | 0x80);
1653 
1654                c1[0] = (uint16_t)((e1.v[0] << 8) | 0x80);
1655                c1[1] = (uint16_t)((e1.v[1] << 8) | 0x80);
1656                c1[2] = (uint16_t)((e1.v[2] << 8) | 0x80);
1657                c1[3] = (uint16_t)((e1.v[3] << 8) | 0x80);
1658             } else {
1659                c0[0] = (uint16_t)((e0.v[0] << 8) | e0.v[0]);
1660                c0[1] = (uint16_t)((e0.v[1] << 8) | e0.v[1]);
1661                c0[2] = (uint16_t)((e0.v[2] << 8) | e0.v[2]);
1662                c0[3] = (uint16_t)((e0.v[3] << 8) | e0.v[3]);
1663 
1664                c1[0] = (uint16_t)((e1.v[0] << 8) | e1.v[0]);
1665                c1[1] = (uint16_t)((e1.v[1] << 8) | e1.v[1]);
1666                c1[2] = (uint16_t)((e1.v[2] << 8) | e1.v[2]);
1667                c1[3] = (uint16_t)((e1.v[3] << 8) | e1.v[3]);
1668             }
1669 
1670             int w[4];
1671             if (dual_plane) {
1672                int w0 = infill_weights[0][idx];
1673                int w1 = infill_weights[1][idx];
1674                w[0] = w[1] = w[2] = w[3] = w0;
1675                w[colour_component_selector] = w1;
1676             } else {
1677                int w0 = infill_weights[0][idx];
1678                w[0] = w[1] = w[2] = w[3] = w0;
1679             }
1680 
1681             /* Interpolate to produce UNORM16, applying weights. */
1682             uint16_t c[4] = {
1683                (uint16_t)((c0[0] * (64 - w[0]) + c1[0] * w[0] + 32) >> 6),
1684                (uint16_t)((c0[1] * (64 - w[1]) + c1[1] * w[1] + 32) >> 6),
1685                (uint16_t)((c0[2] * (64 - w[2]) + c1[2] * w[2] + 32) >> 6),
1686                (uint16_t)((c0[3] * (64 - w[3]) + c1[3] * w[3] + 32) >> 6),
1687             };
1688 
1689             if (decoder.output_unorm8) {
1690                output[idx*4+0] = c[0] >> 8;
1691                output[idx*4+1] = c[1] >> 8;
1692                output[idx*4+2] = c[2] >> 8;
1693                output[idx*4+3] = c[3] >> 8;
1694             } else {
1695                /* Store the color as FP16. */
1696                output[idx*4+0] = c[0] == 65535 ? FP16_ONE : _mesa_uint16_div_64k_to_half(c[0]);
1697                output[idx*4+1] = c[1] == 65535 ? FP16_ONE : _mesa_uint16_div_64k_to_half(c[1]);
1698                output[idx*4+2] = c[2] == 65535 ? FP16_ONE : _mesa_uint16_div_64k_to_half(c[2]);
1699                output[idx*4+3] = c[3] == 65535 ? FP16_ONE : _mesa_uint16_div_64k_to_half(c[3]);
1700             }
1701 
1702             idx++;
1703          }
1704       }
1705    }
1706 }
1707 
calculate_from_weights()1708 void Block::calculate_from_weights()
1709 {
1710    wt_trits = 0;
1711    wt_quints = 0;
1712    wt_bits = 0;
1713    switch (high_prec) {
1714    case 0:
1715       switch (wt_range) {
1716       case 0x2: wt_max = 1; wt_bits = 1; break;
1717       case 0x3: wt_max = 2; wt_trits = 1; break;
1718       case 0x4: wt_max = 3; wt_bits = 2; break;
1719       case 0x5: wt_max = 4; wt_quints = 1; break;
1720       case 0x6: wt_max = 5; wt_trits = 1; wt_bits = 1; break;
1721       case 0x7: wt_max = 7; wt_bits = 3; break;
1722       default: abort();
1723       }
1724       break;
1725    case 1:
1726       switch (wt_range) {
1727       case 0x2: wt_max = 9; wt_quints = 1; wt_bits = 1; break;
1728       case 0x3: wt_max = 11; wt_trits = 1; wt_bits = 2; break;
1729       case 0x4: wt_max = 15; wt_bits = 4; break;
1730       case 0x5: wt_max = 19; wt_quints = 1; wt_bits = 2; break;
1731       case 0x6: wt_max = 23; wt_trits = 1; wt_bits = 3; break;
1732       case 0x7: wt_max = 31; wt_bits = 5; break;
1733       default: abort();
1734       }
1735       break;
1736    }
1737 
1738    assert(wt_trits || wt_quints || wt_bits);
1739 
1740    num_weights = wt_w * wt_h * wt_d;
1741 
1742    if (dual_plane)
1743       num_weights *= 2;
1744 
1745    weight_bits =
1746          (num_weights * 8 * wt_trits + 4) / 5
1747          + (num_weights * 7 * wt_quints + 2) / 3
1748          +  num_weights * wt_bits;
1749 }
1750 
calculate_remaining_bits()1751 void Block::calculate_remaining_bits()
1752 {
1753    int config_bits;
1754    if (num_parts > 1) {
1755       if (!is_multi_cem)
1756          config_bits = 29;
1757       else
1758          config_bits = 25 + 3 * num_parts;
1759    } else {
1760       config_bits = 17;
1761    }
1762 
1763    if (dual_plane)
1764       config_bits += 2;
1765 
1766    remaining_bits = 128 - config_bits - weight_bits;
1767 }
1768 
calculate_colour_endpoints_size()1769 decode_error::type Block::calculate_colour_endpoints_size()
1770 {
1771    /* Specified as illegal */
1772    if (remaining_bits < (13 * num_cem_values + 4) / 5) {
1773       colour_endpoint_bits = ce_max = ce_trits = ce_quints = ce_bits = 0;
1774       return decode_error::invalid_colour_endpoints_size;
1775    }
1776 
1777    /* Find the largest cem_ranges that fits within remaining_bits */
1778    for (int i = ARRAY_SIZE(cem_ranges)-1; i >= 0; --i) {
1779       int cem_bits;
1780       cem_bits = (num_cem_values * 8 * cem_ranges[i].t + 4) / 5
1781                  + (num_cem_values * 7 * cem_ranges[i].q + 2) / 3
1782                  +  num_cem_values * cem_ranges[i].b;
1783 
1784       if (cem_bits <= remaining_bits)
1785       {
1786          colour_endpoint_bits = cem_bits;
1787          ce_max = cem_ranges[i].max;
1788          ce_trits = cem_ranges[i].t;
1789          ce_quints = cem_ranges[i].q;
1790          ce_bits = cem_ranges[i].b;
1791          return decode_error::ok;
1792       }
1793    }
1794 
1795    assert(0);
1796    return decode_error::invalid_colour_endpoints_size;
1797 }
1798 
1799 /**
1800  * Decode ASTC 2D LDR texture data.
1801  *
1802  * \param src_width in pixels
1803  * \param src_height in pixels
1804  * \param dst_stride in bytes
1805  */
1806 extern "C" void
_mesa_unpack_astc_2d_ldr(uint8_t * dst_row,unsigned dst_stride,const uint8_t * src_row,unsigned src_stride,unsigned src_width,unsigned src_height,mesa_format format)1807 _mesa_unpack_astc_2d_ldr(uint8_t *dst_row,
1808                          unsigned dst_stride,
1809                          const uint8_t *src_row,
1810                          unsigned src_stride,
1811                          unsigned src_width,
1812                          unsigned src_height,
1813                          mesa_format format)
1814 {
1815    assert(_mesa_is_format_astc_2d(format));
1816    bool srgb = _mesa_is_format_srgb(format);
1817 
1818    unsigned blk_w, blk_h;
1819    _mesa_get_format_block_size(format, &blk_w, &blk_h);
1820 
1821    const unsigned block_size = 16;
1822    unsigned x_blocks = (src_width + blk_w - 1) / blk_w;
1823    unsigned y_blocks = (src_height + blk_h - 1) / blk_h;
1824 
1825    Decoder dec(blk_w, blk_h, 1, srgb, true);
1826 
1827    for (unsigned y = 0; y < y_blocks; ++y) {
1828       for (unsigned x = 0; x < x_blocks; ++x) {
1829          /* Same size as the largest block. */
1830          uint16_t block_out[12 * 12 * 4];
1831 
1832          dec.decode(src_row + x * block_size, block_out);
1833 
1834          /* This can be smaller with NPOT dimensions. */
1835          unsigned dst_blk_w = MIN2(blk_w, src_width  - x*blk_w);
1836          unsigned dst_blk_h = MIN2(blk_h, src_height - y*blk_h);
1837 
1838          for (unsigned sub_y = 0; sub_y < dst_blk_h; ++sub_y) {
1839             for (unsigned sub_x = 0; sub_x < dst_blk_w; ++sub_x) {
1840                uint8_t *dst = dst_row + sub_y * dst_stride +
1841                               (x * blk_w + sub_x) * 4;
1842                const uint16_t *src = &block_out[(sub_y * blk_w + sub_x) * 4];
1843 
1844                dst[0] = src[0];
1845                dst[1] = src[1];
1846                dst[2] = src[2];
1847                dst[3] = src[3];
1848             }
1849          }
1850       }
1851       src_row += src_stride;
1852       dst_row += dst_stride * blk_h;
1853    }
1854 }
1855