1 /**************************************************************************
2 *
3 * Copyright (C) 1999-2008 Brian Paul All Rights Reserved.
4 * Copyright (c) 2008 VMware, Inc.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the "Software"),
8 * to deal in the Software without restriction, including without limitation
9 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10 * and/or sell copies of the Software, and to permit persons to whom the
11 * Software is furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included
14 * in all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
17 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
20 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
21 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
22 * OTHER DEALINGS IN THE SOFTWARE.
23 *
24 **************************************************************************/
25
26 #include "util/format/u_format.h"
27 #include "util/format/u_format_fxt1.h"
28 #include "util/format/u_format_pack.h"
29 #include "util/format_srgb.h"
30 #include "util/u_math.h"
31
/* Index of each colour component inside a texel stored as uint8_t[4]. */
#define RCOMP 0
#define GCOMP 1
#define BCOMP 2
#define ACOMP 3

/* NOTE(review): presumably the size in bytes of one compressed 8x4 block
 * (the encoder below emits four 32-bit words per block) -- confirm against
 * the users of this constant.
 */
#define FXT1_BLOCK_SIZE 16

/*
 * Compress a width x height image with `comps` (3 or 4) bytes per texel
 * from `source` into FXT1 blocks at `dest`.  Strides are in bytes.
 */
static void
fxt1_encode(uint32_t width, uint32_t height, int32_t comps,
            const void *source, int32_t srcRowStride,
            void *dest, int32_t destRowStride);

/*
 * Decoder entry point; defined later in this file.
 * NOTE(review): presumably decodes the single texel (i, j) of `texture`
 * into rgba[4] -- confirm against the definition.
 */
static void
fxt1_decode_1(const void *texture, int32_t stride,
              int32_t i, int32_t j, uint8_t *rgba);
47
/***************************************************************************\
 * FXT1 encoder
 *
 * The encoder was built by reversing the decoder,
 * and is vaguely based on Texus2 by 3dfx. Note that this code
 * is merely a proof of concept, since it is highly unoptimized;
 * moreover, it is sub-optimal due to the initial conditions passed
 * to Lloyd's algorithm (the interpolation modes are even worse).
\***************************************************************************/
57
58
#define MAX_COMP 4 /* ever needed maximum number of components in texel */
#define MAX_VECT 4 /* ever needed maximum number of base vectors to find */
#define N_TEXELS 32 /* number of texels in a block (always 32) */
#define LL_N_REP 50 /* number of iterations in lloyd's vq */
#define LL_RMS_D 10 /* fault tolerance (maximum delta) */
#define LL_RMS_E 255 /* fault tolerance (maximum error) */
#define ALPHA_TS 2 /* alpha threshold: (255 - ALPHA_TS) deemed opaque */

/* All-zero reference word used by ISTBLACK(). */
static const uint32_t zero = 0;

/* True when the four bytes at `v` (one RGBA texel) are all zero, i.e. the
 * texel is transparent black.  `v` must be an lvalue of at least 4 bytes.
 */
#define ISTBLACK(v) (memcmp(&(v), &zero, sizeof(zero)) == 0)
68
/*
 * Define a 64-bit unsigned integer type and macros.
 *
 * FX64_MOV32(a, b)  load the 32-bit value b into a (upper half cleared)
 * FX64_OR32(a, b)   OR the 32-bit value b into the low half of a
 * FX64_SHL(a, c)    shift a left by c bits (0 <= c < 64)
 *
 * The native branch is the one compiled; the struct-based emulation is kept
 * for compilers without a 64-bit integer type.
 */
#if 1

#define FX64_NATIVE 1

typedef uint64_t Fx64;

#define FX64_MOV32(a, b) ((a) = (b))
#define FX64_OR32(a, b)  ((a) |= (b))
#define FX64_SHL(a, c)   ((a) <<= (c))

#else

#define FX64_NATIVE 0

typedef struct {
   uint32_t lo, hi;
} Fx64;

/* Clear .hi as well: FX64_SHL reads it, so it must not stay uninitialised. */
#define FX64_MOV32(a, b) ((a).hi = 0, (a).lo = (b))
#define FX64_OR32(a, b)  ((a).lo |= (b))

/* A zero shift is skipped: "lo >> (32 - 0)" would be undefined. */
#define FX64_SHL(a, c)                                         \
   do {                                                        \
      if ((c) >= 32) {                                         \
         (a).hi = (a).lo << ((c) - 32);                        \
         (a).lo = 0;                                           \
      } else if ((c) > 0) {                                    \
         (a).hi = ((a).hi << (c)) | ((a).lo >> (32 - (c)));    \
         (a).lo <<= (c);                                       \
      }                                                        \
   } while (0)

#endif
105
106
#define F(i) (float)1 /* can be used to obtain an oblong metric: 0.30 / 0.59 / 0.11 */
#define SAFECDOT 1 /* for paranoids */

/*
 * Build the projection used to map a texel onto the line V0 -> V1.
 *
 *   NV  highest index on the line (V0 maps to 0, V1 maps to NV)
 *   NC  number of components to use
 *   IV  float[NC] output: scaled direction vector
 *   B   float output: offset (includes the +0.5 rounding term)
 *   V0, V1  endpoint vectors (indexable, NC entries)
 *
 * Requires an integer variable named `i` in the enclosing scope.
 * When both endpoints coincide the scale is forced to zero so that every
 * texel projects to index 0 instead of producing NaN.
 */
#define MAKEIVEC(NV, NC, IV, B, V0, V1)                        \
   do {                                                        \
      /* compute interpolation vector */                       \
      float d2 = 0.0F;                                         \
      float rd2;                                               \
                                                               \
      for (i = 0; i < (NC); i++) {                             \
         (IV)[i] = ((V1)[i] - (V0)[i]) * F(i);                 \
         d2 += (IV)[i] * (IV)[i];                              \
      }                                                        \
      rd2 = (d2 > 0.0F) ? (float)(NV) / d2 : 0.0F;             \
      (B) = 0;                                                 \
      for (i = 0; i < (NC); i++) {                             \
         (IV)[i] *= F(i);                                      \
         (B) -= (IV)[i] * (V0)[i];                             \
         (IV)[i] *= rd2;                                       \
      }                                                        \
      (B) = (B) * rd2 + 0.5f;                                  \
   } while (0)

/*
 * Project vector V with the IV/B pair produced by MAKEIVEC and store the
 * resulting index in TEXEL, clamped to [0, NV] when SAFECDOT is set.
 * Requires an integer variable named `i` in the enclosing scope.
 */
#define CALCCDOT(TEXEL, NV, NC, IV, B, V)                      \
   do {                                                        \
      float dot = 0.0F;                                        \
      for (i = 0; i < (NC); i++) {                             \
         dot += (V)[i] * (IV)[i];                              \
      }                                                        \
      (TEXEL) = (int32_t)(dot + (B));                          \
      if (SAFECDOT) {                                          \
         if ((TEXEL) < 0) {                                    \
            (TEXEL) = 0;                                       \
         } else if ((TEXEL) > (NV)) {                          \
            (TEXEL) = (NV);                                    \
         }                                                     \
      }                                                        \
   } while (0)
145
146
147 static int32_t
fxt1_bestcol(float vec[][MAX_COMP],int32_t nv,uint8_t input[MAX_COMP],int32_t nc)148 fxt1_bestcol (float vec[][MAX_COMP], int32_t nv,
149 uint8_t input[MAX_COMP], int32_t nc)
150 {
151 int32_t i, j, best = -1;
152 float err = 1e9; /* big enough */
153
154 for (j = 0; j < nv; j++) {
155 float e = 0.0F;
156 for (i = 0; i < nc; i++) {
157 e += (vec[j][i] - input[i]) * (vec[j][i] - input[i]);
158 }
159 if (e < err) {
160 err = e;
161 best = j;
162 }
163 }
164
165 return best;
166 }
167
168
169 static int32_t
fxt1_worst(float vec[MAX_COMP],uint8_t input[N_TEXELS][MAX_COMP],int32_t nc,int32_t n)170 fxt1_worst (float vec[MAX_COMP],
171 uint8_t input[N_TEXELS][MAX_COMP], int32_t nc, int32_t n)
172 {
173 int32_t i, k, worst = -1;
174 float err = -1.0F; /* small enough */
175
176 for (k = 0; k < n; k++) {
177 float e = 0.0F;
178 for (i = 0; i < nc; i++) {
179 e += (vec[i] - input[k][i]) * (vec[i] - input[k][i]);
180 }
181 if (e > err) {
182 err = e;
183 worst = k;
184 }
185 }
186
187 return worst;
188 }
189
190
191 static int32_t
fxt1_variance(uint8_t input[N_TEXELS/2][MAX_COMP],int32_t nc)192 fxt1_variance (uint8_t input[N_TEXELS / 2][MAX_COMP], int32_t nc)
193 {
194 const int n = N_TEXELS / 2;
195 int32_t i, k, best = 0;
196 int32_t sx, sx2;
197 double var, maxvar = -1; /* small enough */
198 double teenth = 1.0 / n;
199
200 for (i = 0; i < nc; i++) {
201 sx = sx2 = 0;
202 for (k = 0; k < n; k++) {
203 int32_t t = input[k][i];
204 sx += t;
205 sx2 += t * t;
206 }
207 var = sx2 * teenth - sx * sx * teenth * teenth;
208 if (maxvar < var) {
209 maxvar = var;
210 best = i;
211 }
212 }
213
214 return best;
215 }
216
217
218 static int32_t
fxt1_choose(float vec[][MAX_COMP],int32_t nv,uint8_t input[N_TEXELS][MAX_COMP],int32_t nc,int32_t n)219 fxt1_choose (float vec[][MAX_COMP], int32_t nv,
220 uint8_t input[N_TEXELS][MAX_COMP], int32_t nc, int32_t n)
221 {
222 #if 0
223 /* Choose colors from a grid.
224 */
225 int32_t i, j;
226
227 for (j = 0; j < nv; j++) {
228 int32_t m = j * (n - 1) / (nv - 1);
229 for (i = 0; i < nc; i++) {
230 vec[j][i] = input[m][i];
231 }
232 }
233 #else
234 /* Our solution here is to find the darkest and brightest colors in
235 * the 8x4 tile and use those as the two representative colors.
236 * There are probably better algorithms to use (histogram-based).
237 */
238 int32_t i, j, k;
239 int32_t minSum = 2000; /* big enough */
240 int32_t maxSum = -1; /* small enough */
241 int32_t minCol = 0; /* phoudoin: silent compiler! */
242 int32_t maxCol = 0; /* phoudoin: silent compiler! */
243
244 struct {
245 int32_t flag;
246 int32_t key;
247 int32_t freq;
248 int32_t idx;
249 } hist[N_TEXELS];
250 int32_t lenh = 0;
251
252 memset(hist, 0, sizeof(hist));
253
254 for (k = 0; k < n; k++) {
255 int32_t l;
256 int32_t key = 0;
257 int32_t sum = 0;
258 for (i = 0; i < nc; i++) {
259 key <<= 8;
260 key |= input[k][i];
261 sum += input[k][i];
262 }
263 for (l = 0; l < n; l++) {
264 if (!hist[l].flag) {
265 /* alloc new slot */
266 hist[l].flag = !0;
267 hist[l].key = key;
268 hist[l].freq = 1;
269 hist[l].idx = k;
270 lenh = l + 1;
271 break;
272 } else if (hist[l].key == key) {
273 hist[l].freq++;
274 break;
275 }
276 }
277 if (minSum > sum) {
278 minSum = sum;
279 minCol = k;
280 }
281 if (maxSum < sum) {
282 maxSum = sum;
283 maxCol = k;
284 }
285 }
286
287 if (lenh <= nv) {
288 for (j = 0; j < lenh; j++) {
289 for (i = 0; i < nc; i++) {
290 vec[j][i] = (float)input[hist[j].idx][i];
291 }
292 }
293 for (; j < nv; j++) {
294 for (i = 0; i < nc; i++) {
295 vec[j][i] = vec[0][i];
296 }
297 }
298 return 0;
299 }
300
301 for (j = 0; j < nv; j++) {
302 for (i = 0; i < nc; i++) {
303 vec[j][i] = ((nv - 1 - j) * input[minCol][i] + j * input[maxCol][i] + (nv - 1) / 2) / (float)(nv - 1);
304 }
305 }
306 #endif
307
308 return !0;
309 }
310
311
312 static int32_t
fxt1_lloyd(float vec[][MAX_COMP],int32_t nv,uint8_t input[N_TEXELS][MAX_COMP],int32_t nc,int32_t n)313 fxt1_lloyd (float vec[][MAX_COMP], int32_t nv,
314 uint8_t input[N_TEXELS][MAX_COMP], int32_t nc, int32_t n)
315 {
316 /* Use the generalized lloyd's algorithm for VQ:
317 * find 4 color vectors.
318 *
319 * for each sample color
320 * sort to nearest vector.
321 *
322 * replace each vector with the centroid of its matching colors.
323 *
324 * repeat until RMS doesn't improve.
325 *
326 * if a color vector has no samples, or becomes the same as another
327 * vector, replace it with the color which is farthest from a sample.
328 *
329 * vec[][MAX_COMP] initial vectors and resulting colors
330 * nv number of resulting colors required
331 * input[N_TEXELS][MAX_COMP] input texels
332 * nc number of components in input / vec
333 * n number of input samples
334 */
335
336 int32_t sum[MAX_VECT][MAX_COMP]; /* used to accumulate closest texels */
337 int32_t cnt[MAX_VECT]; /* how many times a certain vector was chosen */
338 float error, lasterror = 1e9;
339
340 int32_t i, j, k, rep;
341
342 /* the quantizer */
343 for (rep = 0; rep < LL_N_REP; rep++) {
344 /* reset sums & counters */
345 for (j = 0; j < nv; j++) {
346 for (i = 0; i < nc; i++) {
347 sum[j][i] = 0;
348 }
349 cnt[j] = 0;
350 }
351 error = 0;
352
353 /* scan whole block */
354 for (k = 0; k < n; k++) {
355 #if 1
356 int32_t best = -1;
357 float err = 1e9; /* big enough */
358 /* determine best vector */
359 for (j = 0; j < nv; j++) {
360 float e = (vec[j][0] - input[k][0]) * (vec[j][0] - input[k][0]) +
361 (vec[j][1] - input[k][1]) * (vec[j][1] - input[k][1]) +
362 (vec[j][2] - input[k][2]) * (vec[j][2] - input[k][2]);
363 if (nc == 4) {
364 e += (vec[j][3] - input[k][3]) * (vec[j][3] - input[k][3]);
365 }
366 if (e < err) {
367 err = e;
368 best = j;
369 }
370 }
371 #else
372 int32_t best = fxt1_bestcol(vec, nv, input[k], nc, &err);
373 #endif
374 assert(best >= 0);
375 /* add in closest color */
376 for (i = 0; i < nc; i++) {
377 sum[best][i] += input[k][i];
378 }
379 /* mark this vector as used */
380 cnt[best]++;
381 /* accumulate error */
382 error += err;
383 }
384
385 /* check RMS */
386 if ((error < LL_RMS_E) ||
387 ((error < lasterror) && ((lasterror - error) < LL_RMS_D))) {
388 return !0; /* good match */
389 }
390 lasterror = error;
391
392 /* move each vector to the barycenter of its closest colors */
393 for (j = 0; j < nv; j++) {
394 if (cnt[j]) {
395 float div = 1.0F / cnt[j];
396 for (i = 0; i < nc; i++) {
397 vec[j][i] = div * sum[j][i];
398 }
399 } else {
400 /* this vec has no samples or is identical with a previous vec */
401 int32_t worst = fxt1_worst(vec[j], input, nc, n);
402 for (i = 0; i < nc; i++) {
403 vec[j][i] = input[worst][i];
404 }
405 }
406 }
407 }
408
409 return 0; /* could not converge fast enough */
410 }
411
412
413 static void
fxt1_quantize_CHROMA(uint32_t * cc,uint8_t input[N_TEXELS][MAX_COMP])414 fxt1_quantize_CHROMA (uint32_t *cc,
415 uint8_t input[N_TEXELS][MAX_COMP])
416 {
417 const int32_t n_vect = 4; /* 4 base vectors to find */
418 const int32_t n_comp = 3; /* 3 components: R, G, B */
419 float vec[MAX_VECT][MAX_COMP];
420 int32_t i, j, k;
421 Fx64 hi; /* high quadword */
422 uint32_t lohi, lolo; /* low quadword: hi dword, lo dword */
423
424 if (fxt1_choose(vec, n_vect, input, n_comp, N_TEXELS) != 0) {
425 fxt1_lloyd(vec, n_vect, input, n_comp, N_TEXELS);
426 }
427
428 FX64_MOV32(hi, 4); /* cc-chroma = "010" + unused bit */
429 for (j = n_vect - 1; j >= 0; j--) {
430 for (i = 0; i < n_comp; i++) {
431 /* add in colors */
432 FX64_SHL(hi, 5);
433 FX64_OR32(hi, (uint32_t)(vec[j][i] / 8.0F));
434 }
435 }
436 ((Fx64 *)cc)[1] = hi;
437
438 lohi = lolo = 0;
439 /* right microtile */
440 for (k = N_TEXELS - 1; k >= N_TEXELS/2; k--) {
441 lohi <<= 2;
442 lohi |= fxt1_bestcol(vec, n_vect, input[k], n_comp);
443 }
444 /* left microtile */
445 for (; k >= 0; k--) {
446 lolo <<= 2;
447 lolo |= fxt1_bestcol(vec, n_vect, input[k], n_comp);
448 }
449 cc[1] = lohi;
450 cc[0] = lolo;
451 }
452
453
454 static void
fxt1_quantize_ALPHA0(uint32_t * cc,uint8_t input[N_TEXELS][MAX_COMP],uint8_t reord[N_TEXELS][MAX_COMP],int32_t n)455 fxt1_quantize_ALPHA0 (uint32_t *cc,
456 uint8_t input[N_TEXELS][MAX_COMP],
457 uint8_t reord[N_TEXELS][MAX_COMP], int32_t n)
458 {
459 const int32_t n_vect = 3; /* 3 base vectors to find */
460 const int32_t n_comp = 4; /* 4 components: R, G, B, A */
461 float vec[MAX_VECT][MAX_COMP];
462 int32_t i, j, k;
463 Fx64 hi; /* high quadword */
464 uint32_t lohi, lolo; /* low quadword: hi dword, lo dword */
465
466 /* the last vector indicates zero */
467 for (i = 0; i < n_comp; i++) {
468 vec[n_vect][i] = 0;
469 }
470
471 /* the first n texels in reord are guaranteed to be non-zero */
472 if (fxt1_choose(vec, n_vect, reord, n_comp, n) != 0) {
473 fxt1_lloyd(vec, n_vect, reord, n_comp, n);
474 }
475
476 FX64_MOV32(hi, 6); /* alpha = "011" + lerp = 0 */
477 for (j = n_vect - 1; j >= 0; j--) {
478 /* add in alphas */
479 FX64_SHL(hi, 5);
480 FX64_OR32(hi, (uint32_t)(vec[j][ACOMP] / 8.0F));
481 }
482 for (j = n_vect - 1; j >= 0; j--) {
483 for (i = 0; i < n_comp - 1; i++) {
484 /* add in colors */
485 FX64_SHL(hi, 5);
486 FX64_OR32(hi, (uint32_t)(vec[j][i] / 8.0F));
487 }
488 }
489 ((Fx64 *)cc)[1] = hi;
490
491 lohi = lolo = 0;
492 /* right microtile */
493 for (k = N_TEXELS - 1; k >= N_TEXELS/2; k--) {
494 lohi <<= 2;
495 lohi |= fxt1_bestcol(vec, n_vect + 1, input[k], n_comp);
496 }
497 /* left microtile */
498 for (; k >= 0; k--) {
499 lolo <<= 2;
500 lolo |= fxt1_bestcol(vec, n_vect + 1, input[k], n_comp);
501 }
502 cc[1] = lohi;
503 cc[0] = lolo;
504 }
505
506
507 static void
fxt1_quantize_ALPHA1(uint32_t * cc,uint8_t input[N_TEXELS][MAX_COMP])508 fxt1_quantize_ALPHA1 (uint32_t *cc,
509 uint8_t input[N_TEXELS][MAX_COMP])
510 {
511 const int32_t n_vect = 3; /* highest vector number in each microtile */
512 const int32_t n_comp = 4; /* 4 components: R, G, B, A */
513 float vec[1 + 1 + 1][MAX_COMP]; /* 1.5 extrema for each sub-block */
514 float b, iv[MAX_COMP]; /* interpolation vector */
515 int32_t i, j, k;
516 Fx64 hi; /* high quadword */
517 uint32_t lohi, lolo; /* low quadword: hi dword, lo dword */
518
519 int32_t minSum;
520 int32_t maxSum;
521 int32_t minColL = 0, maxColL = 0;
522 int32_t minColR = 0, maxColR = 0;
523 int32_t sumL = 0, sumR = 0;
524 int32_t nn_comp;
525 /* Our solution here is to find the darkest and brightest colors in
526 * the 4x4 tile and use those as the two representative colors.
527 * There are probably better algorithms to use (histogram-based).
528 */
529 nn_comp = n_comp;
530 while ((minColL == maxColL) && nn_comp) {
531 minSum = 2000; /* big enough */
532 maxSum = -1; /* small enough */
533 for (k = 0; k < N_TEXELS / 2; k++) {
534 int32_t sum = 0;
535 for (i = 0; i < nn_comp; i++) {
536 sum += input[k][i];
537 }
538 if (minSum > sum) {
539 minSum = sum;
540 minColL = k;
541 }
542 if (maxSum < sum) {
543 maxSum = sum;
544 maxColL = k;
545 }
546 sumL += sum;
547 }
548
549 nn_comp--;
550 }
551
552 nn_comp = n_comp;
553 while ((minColR == maxColR) && nn_comp) {
554 minSum = 2000; /* big enough */
555 maxSum = -1; /* small enough */
556 for (k = N_TEXELS / 2; k < N_TEXELS; k++) {
557 int32_t sum = 0;
558 for (i = 0; i < nn_comp; i++) {
559 sum += input[k][i];
560 }
561 if (minSum > sum) {
562 minSum = sum;
563 minColR = k;
564 }
565 if (maxSum < sum) {
566 maxSum = sum;
567 maxColR = k;
568 }
569 sumR += sum;
570 }
571
572 nn_comp--;
573 }
574
575 /* choose the common vector (yuck!) */
576 {
577 int32_t j1, j2;
578 int32_t v1 = 0, v2 = 0;
579 float err = 1e9; /* big enough */
580 float tv[2 * 2][MAX_COMP]; /* 2 extrema for each sub-block */
581 for (i = 0; i < n_comp; i++) {
582 tv[0][i] = input[minColL][i];
583 tv[1][i] = input[maxColL][i];
584 tv[2][i] = input[minColR][i];
585 tv[3][i] = input[maxColR][i];
586 }
587 for (j1 = 0; j1 < 2; j1++) {
588 for (j2 = 2; j2 < 4; j2++) {
589 float e = 0.0F;
590 for (i = 0; i < n_comp; i++) {
591 e += (tv[j1][i] - tv[j2][i]) * (tv[j1][i] - tv[j2][i]);
592 }
593 if (e < err) {
594 err = e;
595 v1 = j1;
596 v2 = j2;
597 }
598 }
599 }
600 for (i = 0; i < n_comp; i++) {
601 vec[0][i] = tv[1 - v1][i];
602 vec[1][i] = (tv[v1][i] * sumL + tv[v2][i] * sumR) / (sumL + sumR);
603 vec[2][i] = tv[5 - v2][i];
604 }
605 }
606
607 /* left microtile */
608 cc[0] = 0;
609 if (minColL != maxColL) {
610 /* compute interpolation vector */
611 MAKEIVEC(n_vect, n_comp, iv, b, vec[0], vec[1]);
612
613 /* add in texels */
614 lolo = 0;
615 for (k = N_TEXELS / 2 - 1; k >= 0; k--) {
616 int32_t texel;
617 /* interpolate color */
618 CALCCDOT(texel, n_vect, n_comp, iv, b, input[k]);
619 /* add in texel */
620 lolo <<= 2;
621 lolo |= texel;
622 }
623
624 cc[0] = lolo;
625 }
626
627 /* right microtile */
628 cc[1] = 0;
629 if (minColR != maxColR) {
630 /* compute interpolation vector */
631 MAKEIVEC(n_vect, n_comp, iv, b, vec[2], vec[1]);
632
633 /* add in texels */
634 lohi = 0;
635 for (k = N_TEXELS - 1; k >= N_TEXELS / 2; k--) {
636 int32_t texel;
637 /* interpolate color */
638 CALCCDOT(texel, n_vect, n_comp, iv, b, input[k]);
639 /* add in texel */
640 lohi <<= 2;
641 lohi |= texel;
642 }
643
644 cc[1] = lohi;
645 }
646
647 FX64_MOV32(hi, 7); /* alpha = "011" + lerp = 1 */
648 for (j = n_vect - 1; j >= 0; j--) {
649 /* add in alphas */
650 FX64_SHL(hi, 5);
651 FX64_OR32(hi, (uint32_t)(vec[j][ACOMP] / 8.0F));
652 }
653 for (j = n_vect - 1; j >= 0; j--) {
654 for (i = 0; i < n_comp - 1; i++) {
655 /* add in colors */
656 FX64_SHL(hi, 5);
657 FX64_OR32(hi, (uint32_t)(vec[j][i] / 8.0F));
658 }
659 }
660 ((Fx64 *)cc)[1] = hi;
661 }
662
663
664 static void
fxt1_quantize_HI(uint32_t * cc,uint8_t input[N_TEXELS][MAX_COMP],uint8_t reord[N_TEXELS][MAX_COMP],int32_t n)665 fxt1_quantize_HI (uint32_t *cc,
666 uint8_t input[N_TEXELS][MAX_COMP],
667 uint8_t reord[N_TEXELS][MAX_COMP], int32_t n)
668 {
669 const int32_t n_vect = 6; /* highest vector number */
670 const int32_t n_comp = 3; /* 3 components: R, G, B */
671 float b = 0.0F; /* phoudoin: silent compiler! */
672 float iv[MAX_COMP]; /* interpolation vector */
673 int32_t i, k;
674 uint32_t hihi; /* high quadword: hi dword */
675
676 int32_t minSum = 2000; /* big enough */
677 int32_t maxSum = -1; /* small enough */
678 int32_t minCol = 0; /* phoudoin: silent compiler! */
679 int32_t maxCol = 0; /* phoudoin: silent compiler! */
680
681 /* Our solution here is to find the darkest and brightest colors in
682 * the 8x4 tile and use those as the two representative colors.
683 * There are probably better algorithms to use (histogram-based).
684 */
685 for (k = 0; k < n; k++) {
686 int32_t sum = 0;
687 for (i = 0; i < n_comp; i++) {
688 sum += reord[k][i];
689 }
690 if (minSum > sum) {
691 minSum = sum;
692 minCol = k;
693 }
694 if (maxSum < sum) {
695 maxSum = sum;
696 maxCol = k;
697 }
698 }
699
700 hihi = 0; /* cc-hi = "00" */
701 for (i = 0; i < n_comp; i++) {
702 /* add in colors */
703 hihi <<= 5;
704 hihi |= reord[maxCol][i] >> 3;
705 }
706 for (i = 0; i < n_comp; i++) {
707 /* add in colors */
708 hihi <<= 5;
709 hihi |= reord[minCol][i] >> 3;
710 }
711 cc[3] = hihi;
712 cc[0] = cc[1] = cc[2] = 0;
713
714 /* compute interpolation vector */
715 if (minCol != maxCol) {
716 MAKEIVEC(n_vect, n_comp, iv, b, reord[minCol], reord[maxCol]);
717 }
718
719 /* add in texels */
720 for (k = N_TEXELS - 1; k >= 0; k--) {
721 int32_t t = k * 3;
722 uint32_t *kk = (uint32_t *)((char *)cc + t / 8);
723 int32_t texel = n_vect + 1; /* transparent black */
724
725 if (!ISTBLACK(input[k])) {
726 if (minCol != maxCol) {
727 /* interpolate color */
728 CALCCDOT(texel, n_vect, n_comp, iv, b, input[k]);
729 /* add in texel */
730 kk[0] |= texel << (t & 7);
731 }
732 } else {
733 /* add in texel */
734 kk[0] |= texel << (t & 7);
735 }
736 }
737 }
738
739
740 static void
fxt1_quantize_MIXED1(uint32_t * cc,uint8_t input[N_TEXELS][MAX_COMP])741 fxt1_quantize_MIXED1 (uint32_t *cc,
742 uint8_t input[N_TEXELS][MAX_COMP])
743 {
744 const int32_t n_vect = 2; /* highest vector number in each microtile */
745 const int32_t n_comp = 3; /* 3 components: R, G, B */
746 uint8_t vec[2 * 2][MAX_COMP]; /* 2 extrema for each sub-block */
747 float b, iv[MAX_COMP]; /* interpolation vector */
748 int32_t i, j, k;
749 Fx64 hi; /* high quadword */
750 uint32_t lohi, lolo; /* low quadword: hi dword, lo dword */
751
752 int32_t minSum;
753 int32_t maxSum;
754 int32_t minColL = 0, maxColL = -1;
755 int32_t minColR = 0, maxColR = -1;
756
757 /* Our solution here is to find the darkest and brightest colors in
758 * the 4x4 tile and use those as the two representative colors.
759 * There are probably better algorithms to use (histogram-based).
760 */
761 minSum = 2000; /* big enough */
762 maxSum = -1; /* small enough */
763 for (k = 0; k < N_TEXELS / 2; k++) {
764 if (!ISTBLACK(input[k])) {
765 int32_t sum = 0;
766 for (i = 0; i < n_comp; i++) {
767 sum += input[k][i];
768 }
769 if (minSum > sum) {
770 minSum = sum;
771 minColL = k;
772 }
773 if (maxSum < sum) {
774 maxSum = sum;
775 maxColL = k;
776 }
777 }
778 }
779 minSum = 2000; /* big enough */
780 maxSum = -1; /* small enough */
781 for (; k < N_TEXELS; k++) {
782 if (!ISTBLACK(input[k])) {
783 int32_t sum = 0;
784 for (i = 0; i < n_comp; i++) {
785 sum += input[k][i];
786 }
787 if (minSum > sum) {
788 minSum = sum;
789 minColR = k;
790 }
791 if (maxSum < sum) {
792 maxSum = sum;
793 maxColR = k;
794 }
795 }
796 }
797
798 /* left microtile */
799 if (maxColL == -1) {
800 /* all transparent black */
801 cc[0] = ~0u;
802 for (i = 0; i < n_comp; i++) {
803 vec[0][i] = 0;
804 vec[1][i] = 0;
805 }
806 } else {
807 cc[0] = 0;
808 for (i = 0; i < n_comp; i++) {
809 vec[0][i] = input[minColL][i];
810 vec[1][i] = input[maxColL][i];
811 }
812 if (minColL != maxColL) {
813 /* compute interpolation vector */
814 MAKEIVEC(n_vect, n_comp, iv, b, vec[0], vec[1]);
815
816 /* add in texels */
817 lolo = 0;
818 for (k = N_TEXELS / 2 - 1; k >= 0; k--) {
819 int32_t texel = n_vect + 1; /* transparent black */
820 if (!ISTBLACK(input[k])) {
821 /* interpolate color */
822 CALCCDOT(texel, n_vect, n_comp, iv, b, input[k]);
823 }
824 /* add in texel */
825 lolo <<= 2;
826 lolo |= texel;
827 }
828 cc[0] = lolo;
829 }
830 }
831
832 /* right microtile */
833 if (maxColR == -1) {
834 /* all transparent black */
835 cc[1] = ~0u;
836 for (i = 0; i < n_comp; i++) {
837 vec[2][i] = 0;
838 vec[3][i] = 0;
839 }
840 } else {
841 cc[1] = 0;
842 for (i = 0; i < n_comp; i++) {
843 vec[2][i] = input[minColR][i];
844 vec[3][i] = input[maxColR][i];
845 }
846 if (minColR != maxColR) {
847 /* compute interpolation vector */
848 MAKEIVEC(n_vect, n_comp, iv, b, vec[2], vec[3]);
849
850 /* add in texels */
851 lohi = 0;
852 for (k = N_TEXELS - 1; k >= N_TEXELS / 2; k--) {
853 int32_t texel = n_vect + 1; /* transparent black */
854 if (!ISTBLACK(input[k])) {
855 /* interpolate color */
856 CALCCDOT(texel, n_vect, n_comp, iv, b, input[k]);
857 }
858 /* add in texel */
859 lohi <<= 2;
860 lohi |= texel;
861 }
862 cc[1] = lohi;
863 }
864 }
865
866 FX64_MOV32(hi, 9 | (vec[3][GCOMP] & 4) | ((vec[1][GCOMP] >> 1) & 2)); /* chroma = "1" */
867 for (j = 2 * 2 - 1; j >= 0; j--) {
868 for (i = 0; i < n_comp; i++) {
869 /* add in colors */
870 FX64_SHL(hi, 5);
871 FX64_OR32(hi, vec[j][i] >> 3);
872 }
873 }
874 ((Fx64 *)cc)[1] = hi;
875 }
876
877
878 static void
fxt1_quantize_MIXED0(uint32_t * cc,uint8_t input[N_TEXELS][MAX_COMP])879 fxt1_quantize_MIXED0 (uint32_t *cc,
880 uint8_t input[N_TEXELS][MAX_COMP])
881 {
882 const int32_t n_vect = 3; /* highest vector number in each microtile */
883 const int32_t n_comp = 3; /* 3 components: R, G, B */
884 uint8_t vec[2 * 2][MAX_COMP]; /* 2 extrema for each sub-block */
885 float b, iv[MAX_COMP]; /* interpolation vector */
886 int32_t i, j, k;
887 Fx64 hi; /* high quadword */
888 uint32_t lohi, lolo; /* low quadword: hi dword, lo dword */
889
890 int32_t minColL = 0, maxColL = 0;
891 int32_t minColR = 0, maxColR = 0;
892 #if 0
893 int32_t minSum;
894 int32_t maxSum;
895
896 /* Our solution here is to find the darkest and brightest colors in
897 * the 4x4 tile and use those as the two representative colors.
898 * There are probably better algorithms to use (histogram-based).
899 */
900 minSum = 2000; /* big enough */
901 maxSum = -1; /* small enough */
902 for (k = 0; k < N_TEXELS / 2; k++) {
903 int32_t sum = 0;
904 for (i = 0; i < n_comp; i++) {
905 sum += input[k][i];
906 }
907 if (minSum > sum) {
908 minSum = sum;
909 minColL = k;
910 }
911 if (maxSum < sum) {
912 maxSum = sum;
913 maxColL = k;
914 }
915 }
916 minSum = 2000; /* big enough */
917 maxSum = -1; /* small enough */
918 for (; k < N_TEXELS; k++) {
919 int32_t sum = 0;
920 for (i = 0; i < n_comp; i++) {
921 sum += input[k][i];
922 }
923 if (minSum > sum) {
924 minSum = sum;
925 minColR = k;
926 }
927 if (maxSum < sum) {
928 maxSum = sum;
929 maxColR = k;
930 }
931 }
932 #else
933 int32_t minVal;
934 int32_t maxVal;
935 int32_t maxVarL = fxt1_variance(input, n_comp);
936 int32_t maxVarR = fxt1_variance(&input[N_TEXELS / 2], n_comp);
937
938 /* Scan the channel with max variance for lo & hi
939 * and use those as the two representative colors.
940 */
941 minVal = 2000; /* big enough */
942 maxVal = -1; /* small enough */
943 for (k = 0; k < N_TEXELS / 2; k++) {
944 int32_t t = input[k][maxVarL];
945 if (minVal > t) {
946 minVal = t;
947 minColL = k;
948 }
949 if (maxVal < t) {
950 maxVal = t;
951 maxColL = k;
952 }
953 }
954 minVal = 2000; /* big enough */
955 maxVal = -1; /* small enough */
956 for (; k < N_TEXELS; k++) {
957 int32_t t = input[k][maxVarR];
958 if (minVal > t) {
959 minVal = t;
960 minColR = k;
961 }
962 if (maxVal < t) {
963 maxVal = t;
964 maxColR = k;
965 }
966 }
967 #endif
968
969 /* left microtile */
970 cc[0] = 0;
971 for (i = 0; i < n_comp; i++) {
972 vec[0][i] = input[minColL][i];
973 vec[1][i] = input[maxColL][i];
974 }
975 if (minColL != maxColL) {
976 /* compute interpolation vector */
977 MAKEIVEC(n_vect, n_comp, iv, b, vec[0], vec[1]);
978
979 /* add in texels */
980 lolo = 0;
981 for (k = N_TEXELS / 2 - 1; k >= 0; k--) {
982 int32_t texel;
983 /* interpolate color */
984 CALCCDOT(texel, n_vect, n_comp, iv, b, input[k]);
985 /* add in texel */
986 lolo <<= 2;
987 lolo |= texel;
988 }
989
990 /* funky encoding for LSB of green */
991 if ((int32_t)((lolo >> 1) & 1) != (((vec[1][GCOMP] ^ vec[0][GCOMP]) >> 2) & 1)) {
992 for (i = 0; i < n_comp; i++) {
993 vec[1][i] = input[minColL][i];
994 vec[0][i] = input[maxColL][i];
995 }
996 lolo = ~lolo;
997 }
998
999 cc[0] = lolo;
1000 }
1001
1002 /* right microtile */
1003 cc[1] = 0;
1004 for (i = 0; i < n_comp; i++) {
1005 vec[2][i] = input[minColR][i];
1006 vec[3][i] = input[maxColR][i];
1007 }
1008 if (minColR != maxColR) {
1009 /* compute interpolation vector */
1010 MAKEIVEC(n_vect, n_comp, iv, b, vec[2], vec[3]);
1011
1012 /* add in texels */
1013 lohi = 0;
1014 for (k = N_TEXELS - 1; k >= N_TEXELS / 2; k--) {
1015 int32_t texel;
1016 /* interpolate color */
1017 CALCCDOT(texel, n_vect, n_comp, iv, b, input[k]);
1018 /* add in texel */
1019 lohi <<= 2;
1020 lohi |= texel;
1021 }
1022
1023 /* funky encoding for LSB of green */
1024 if ((int32_t)((lohi >> 1) & 1) != (((vec[3][GCOMP] ^ vec[2][GCOMP]) >> 2) & 1)) {
1025 for (i = 0; i < n_comp; i++) {
1026 vec[3][i] = input[minColR][i];
1027 vec[2][i] = input[maxColR][i];
1028 }
1029 lohi = ~lohi;
1030 }
1031
1032 cc[1] = lohi;
1033 }
1034
1035 FX64_MOV32(hi, 8 | (vec[3][GCOMP] & 4) | ((vec[1][GCOMP] >> 1) & 2)); /* chroma = "1" */
1036 for (j = 2 * 2 - 1; j >= 0; j--) {
1037 for (i = 0; i < n_comp; i++) {
1038 /* add in colors */
1039 FX64_SHL(hi, 5);
1040 FX64_OR32(hi, vec[j][i] >> 3);
1041 }
1042 }
1043 ((Fx64 *)cc)[1] = hi;
1044 }
1045
1046
1047 static void
fxt1_quantize(uint32_t * cc,const uint8_t * lines[],int32_t comps)1048 fxt1_quantize (uint32_t *cc, const uint8_t *lines[], int32_t comps)
1049 {
1050 int32_t trualpha;
1051 uint8_t reord[N_TEXELS][MAX_COMP];
1052
1053 uint8_t input[N_TEXELS][MAX_COMP];
1054 int32_t i, k, l;
1055
1056 if (comps == 3) {
1057 /* make the whole block opaque */
1058 memset(input, -1, sizeof(input));
1059 }
1060
1061 /* 8 texels each line */
1062 for (l = 0; l < 4; l++) {
1063 for (k = 0; k < 4; k++) {
1064 for (i = 0; i < comps; i++) {
1065 input[k + l * 4][i] = *lines[l]++;
1066 }
1067 }
1068 for (; k < 8; k++) {
1069 for (i = 0; i < comps; i++) {
1070 input[k + l * 4 + 12][i] = *lines[l]++;
1071 }
1072 }
1073 }
1074
1075 /* block layout:
1076 * 00, 01, 02, 03, 08, 09, 0a, 0b
1077 * 10, 11, 12, 13, 18, 19, 1a, 1b
1078 * 04, 05, 06, 07, 0c, 0d, 0e, 0f
1079 * 14, 15, 16, 17, 1c, 1d, 1e, 1f
1080 */
1081
1082 /* [dBorca]
1083 * stupidity flows forth from this
1084 */
1085 l = N_TEXELS;
1086 trualpha = 0;
1087 if (comps == 4) {
1088 /* skip all transparent black texels */
1089 l = 0;
1090 for (k = 0; k < N_TEXELS; k++) {
1091 /* test all components against 0 */
1092 if (!ISTBLACK(input[k])) {
1093 /* texel is not transparent black */
1094 memcpy(reord[l], input[k], 4);
1095 if (reord[l][ACOMP] < (255 - ALPHA_TS)) {
1096 /* non-opaque texel */
1097 trualpha = !0;
1098 }
1099 l++;
1100 }
1101 }
1102 }
1103
1104 #if 0
1105 if (trualpha) {
1106 fxt1_quantize_ALPHA0(cc, input, reord, l);
1107 } else if (l == 0) {
1108 cc[0] = cc[1] = cc[2] = -1;
1109 cc[3] = 0;
1110 } else if (l < N_TEXELS) {
1111 fxt1_quantize_HI(cc, input, reord, l);
1112 } else {
1113 fxt1_quantize_CHROMA(cc, input);
1114 }
1115 (void)fxt1_quantize_ALPHA1;
1116 (void)fxt1_quantize_MIXED1;
1117 (void)fxt1_quantize_MIXED0;
1118 #else
1119 if (trualpha) {
1120 fxt1_quantize_ALPHA1(cc, input);
1121 } else if (l == 0) {
1122 cc[0] = cc[1] = cc[2] = ~0u;
1123 cc[3] = 0;
1124 } else if (l < N_TEXELS) {
1125 fxt1_quantize_MIXED1(cc, input);
1126 } else {
1127 fxt1_quantize_MIXED0(cc, input);
1128 }
1129 (void)fxt1_quantize_ALPHA0;
1130 (void)fxt1_quantize_HI;
1131 (void)fxt1_quantize_CHROMA;
1132 #endif
1133 }
1134
1135
1136
/**
 * Upscale an image by replication, not (typical) stretching.
 * We use this when the image width or height is less than a
 * certain size (4, 8) and we need to upscale an image.
 *
 * The source is tiled: destination texel (x, y) is a copy of source texel
 * (x % inWidth, y % inHeight).
 *
 * \param inWidth, inHeight    source size in texels
 * \param outWidth, outHeight  destination size in texels (>= source size)
 * \param comps                bytes per texel
 * \param src                  source image
 * \param srcRowStride         bytes between source rows
 * \param dest                 tightly packed destination image of
 *                             outWidth * outHeight * comps bytes
 */
static void
upscale_teximage2d(int32_t inWidth, int32_t inHeight,
                   int32_t outWidth, int32_t outHeight,
                   int32_t comps, const uint8_t *src, int32_t srcRowStride,
                   uint8_t *dest)
{
   int32_t i, j, k;

   assert(outWidth >= inWidth);
   assert(outHeight >= inHeight);

   for (i = 0; i < outHeight; i++) {
      /* Row offsets are computed in wide types so that large images do not
       * overflow 32-bit arithmetic.
       */
      const uint8_t *srcRow = src + (int64_t)(i % inHeight) * srcRowStride;
      uint8_t *dstRow = dest + (size_t)i * (size_t)outWidth * (size_t)comps;

      for (j = 0; j < outWidth; j++) {
         const uint8_t *srcTexel = srcRow + (j % inWidth) * comps;
         uint8_t *dstTexel = dstRow + j * comps;

         for (k = 0; k < comps; k++) {
            dstTexel[k] = srcTexel[k];
         }
      }
   }
}
1169
1170
/**
 * Compress an image into FXT1 blocks.
 *
 * If width is not a multiple of 8 or height not a multiple of 4, the image
 * is first replicated into a temporary padded copy.  If that allocation
 * fails the function returns without writing anything to dest.
 *
 * \param width, height   image size in texels
 * \param comps           bytes per texel: 3 (RGB) or 4 (RGBA)
 * \param source          source texels
 * \param srcRowStride    bytes between source rows
 * \param dest            output buffer, 16 bytes per 8x4 block
 * \param destRowStride   bytes between rows of blocks in dest
 */
static void
fxt1_encode(uint32_t width, uint32_t height, int32_t comps,
            const void *source, int32_t srcRowStride,
            void *dest, int32_t destRowStride)
{
   uint32_t x, y;
   const uint8_t *data;
   uint32_t *encoded = (uint32_t *)dest;
   void *newSource = NULL;

   assert(comps == 3 || comps == 4);

   /* Replicate image if width is not M8 or height is not M4 */
   if ((width & 7) | (height & 3)) {
      int32_t newWidth = (width + 7) & ~7;
      int32_t newHeight = (height + 3) & ~3;
      /* widen before multiplying so the byte count cannot overflow int */
      newSource = malloc((size_t)comps * (size_t)newWidth * (size_t)newHeight);
      if (!newSource)
         return;
      upscale_teximage2d(width, height, newWidth, newHeight,
                         comps, (const uint8_t *) source,
                         srcRowStride, (uint8_t *) newSource);
      source = newSource;
      width = newWidth;
      height = newHeight;
      srcRowStride = comps * newWidth;
   }

   data = (const uint8_t *) source;
   /* A row of blocks takes width * 2 bytes (16 bytes per 8 texels); turn
    * the stride into the number of 32-bit words to skip after each row.
    */
   destRowStride = (destRowStride - width * 2) / 4;
   for (y = 0; y < height; y += 4) {
      uint32_t offs = 0 + (y + 0) * srcRowStride;
      for (x = 0; x < width; x += 8) {
         const uint8_t *lines[4];
         lines[0] = &data[offs];
         lines[1] = lines[0] + srcRowStride;
         lines[2] = lines[1] + srcRowStride;
         lines[3] = lines[2] + srcRowStride;
         offs += 8 * comps;
         fxt1_quantize(encoded, lines, comps);
         /* 128 bits per 8x4 block */
         encoded += 4;
      }
      encoded += destRowStride;
   }

   free(newSource);
}
1219
1220
1221 /***************************************************************************\
1222 * FXT1 decoder
1223 *
1224 * The decoder is based on GL_3DFX_texture_compression_FXT1
1225 * specification and serves as a concept for the encoder.
1226 \***************************************************************************/
1227
1228
/* lookup table for scaling 5 bit colors up to 8 bits
 *
 * Indexed by the 5-bit value (0..31); entry i equals i * 255 / 31
 * rounded to the nearest integer, so 0 maps to 0 and 31 maps to 255.
 */
static const uint8_t _rgb_scale_5[] = {
   0,   8,   16,  25,  33,  41,  49,  58,
   66,  74,  82,  90,  99,  107, 115, 123,
   132, 140, 148, 156, 165, 173, 181, 189,
   197, 206, 214, 222, 230, 239, 247, 255
};
1236
/* lookup table for scaling 6 bit colors up to 8 bits
 *
 * Indexed by the 6-bit value (0..63); entry i equals i * 255 / 63
 * rounded to the nearest integer, so 0 maps to 0 and 63 maps to 255.
 */
static const uint8_t _rgb_scale_6[] = {
   0,   4,   8,   12,  16,  20,  24,  28,
   32,  36,  40,  45,  49,  53,  57,  61,
   65,  69,  73,  77,  81,  85,  89,  93,
   97,  101, 105, 109, 113, 117, 121, 125,
   130, 134, 138, 142, 146, 150, 154, 158,
   162, 166, 170, 174, 178, 182, 186, 190,
   194, 198, 202, 206, 210, 215, 219, 223,
   227, 231, 235, 239, 243, 247, 251, 255
};
1248
1249
/*
 * Helpers for picking bit fields out of a 128-bit block and expanding them.
 *
 * CC_SEL(cc, which)   - treat cc as an array of 32-bit words and return the
 *                       word containing bit 'which', shifted right so that
 *                       bit 'which' ends up in bit 0.  The result is NOT
 *                       masked; callers mask it (UP5/UP6 do so themselves).
 * UP5(c)              - expand the low 5 bits of c to 8 bits.
 * UP6(c, b)           - expand the 6-bit value made of the low 5 bits of c
 *                       (high part) and the low bit of b (LSB) to 8 bits.
 * LERP(n, t, c0, c1)  - integer interpolation between c0 (t == 0) and
 *                       c1 (t == n) in n steps, rounded to nearest.
 *                       Arguments are evaluated more than once, so they
 *                       must not have side effects.
 *
 * Every expansion is fully parenthesized so the macros can be used safely
 * inside larger expressions.
 */
#define CC_SEL(cc, which) (((const uint32_t *)(cc))[(which) / 32] >> ((which) & 31))
#define UP5(c) (_rgb_scale_5[(c) & 31])
#define UP6(c, b) (_rgb_scale_6[(((c) & 31) << 1) | ((b) & 1)])
#define LERP(n, t, c0, c1) ((((n) - (t)) * (c0) + (t) * (c1) + (n) / 2) / (n))
1254
1255
1256 static void
fxt1_decode_1HI(const uint8_t * code,int32_t t,uint8_t * rgba)1257 fxt1_decode_1HI (const uint8_t *code, int32_t t, uint8_t *rgba)
1258 {
1259 const uint32_t *cc;
1260
1261 t *= 3;
1262 cc = (const uint32_t *)(code + t / 8);
1263 t = (cc[0] >> (t & 7)) & 7;
1264
1265 if (t == 7) {
1266 rgba[RCOMP] = rgba[GCOMP] = rgba[BCOMP] = rgba[ACOMP] = 0;
1267 } else {
1268 uint8_t r, g, b;
1269 cc = (const uint32_t *)(code + 12);
1270 if (t == 0) {
1271 b = UP5(CC_SEL(cc, 0));
1272 g = UP5(CC_SEL(cc, 5));
1273 r = UP5(CC_SEL(cc, 10));
1274 } else if (t == 6) {
1275 b = UP5(CC_SEL(cc, 15));
1276 g = UP5(CC_SEL(cc, 20));
1277 r = UP5(CC_SEL(cc, 25));
1278 } else {
1279 b = LERP(6, t, UP5(CC_SEL(cc, 0)), UP5(CC_SEL(cc, 15)));
1280 g = LERP(6, t, UP5(CC_SEL(cc, 5)), UP5(CC_SEL(cc, 20)));
1281 r = LERP(6, t, UP5(CC_SEL(cc, 10)), UP5(CC_SEL(cc, 25)));
1282 }
1283 rgba[RCOMP] = r;
1284 rgba[GCOMP] = g;
1285 rgba[BCOMP] = b;
1286 rgba[ACOMP] = 255;
1287 }
1288 }
1289
1290
1291 static void
fxt1_decode_1CHROMA(const uint8_t * code,int32_t t,uint8_t * rgba)1292 fxt1_decode_1CHROMA (const uint8_t *code, int32_t t, uint8_t *rgba)
1293 {
1294 const uint32_t *cc;
1295 uint32_t kk;
1296
1297 cc = (const uint32_t *)code;
1298 if (t & 16) {
1299 cc++;
1300 t &= 15;
1301 }
1302 t = (cc[0] >> (t * 2)) & 3;
1303
1304 t *= 15;
1305 cc = (const uint32_t *)(code + 8 + t / 8);
1306 kk = cc[0] >> (t & 7);
1307 rgba[BCOMP] = UP5(kk);
1308 rgba[GCOMP] = UP5(kk >> 5);
1309 rgba[RCOMP] = UP5(kk >> 10);
1310 rgba[ACOMP] = 255;
1311 }
1312
1313
/*
 * Decode one texel of a block handled by the "mixed" decoder.
 *
 * code points at the 16-byte block, t is the texel index inside the block
 * (0..31) and rgba receives the four 8-bit components.
 *
 * Texels 0..15 use colours 0 and 1 and the selectors in word 0; texels
 * 16..31 use colours 2 and 3 and the selectors in word 1.  Bit 124 (the
 * "alpha" flag) chooses between two schemes:
 *   - flag set:   selectors 0/1/2 give colour A, the average of both, and
 *                 colour B; selector 3 gives all zeroes (alpha 0 too).
 *   - flag clear: selectors 0..3 interpolate from colour A to colour B in
 *                 thirds.
 * Alpha is 255 for every texel except the all-zero case.
 *
 * Colour fields are stored unmasked in col[][]; UP5()/UP6() mask them.
 */
static void
fxt1_decode_1MIXED (const uint8_t *code, int32_t t, uint8_t *rgba)
{
   const uint32_t *cc;
   uint32_t col[2][3];
   /* glsb: extra low bit for green; selb: a selector bit that is XORed
    * into glsb for the first colour's green in the flag-clear scheme */
   int32_t glsb, selb;

   cc = (const uint32_t *)code;
   if (t & 16) {
      /* texels 16..31: 2-bit selector comes from word 1 */
      t &= 15;
      t = (cc[1] >> (t * 2)) & 3;
      /* col 2 */
      /* blue starts at bit 94 (byte 11, bit 6) and straddles words 2 and 3,
       * hence the word read at a byte offset.
       * NOTE(review): presumably relies on little-endian layout - confirm */
      col[0][BCOMP] = (*(const uint32_t *)(code + 11)) >> 6;
      col[0][GCOMP] = CC_SEL(cc, 99);
      col[0][RCOMP] = CC_SEL(cc, 104);
      /* col 3 */
      col[1][BCOMP] = CC_SEL(cc, 109);
      col[1][GCOMP] = CC_SEL(cc, 114);
      col[1][RCOMP] = CC_SEL(cc, 119);
      glsb = CC_SEL(cc, 126);
      selb = CC_SEL(cc, 33);
   } else {
      /* texels 0..15: 2-bit selector comes from word 0 */
      t = (cc[0] >> (t * 2)) & 3;
      /* col 0 */
      col[0][BCOMP] = CC_SEL(cc, 64);
      col[0][GCOMP] = CC_SEL(cc, 69);
      col[0][RCOMP] = CC_SEL(cc, 74);
      /* col 1 */
      col[1][BCOMP] = CC_SEL(cc, 79);
      col[1][GCOMP] = CC_SEL(cc, 84);
      col[1][RCOMP] = CC_SEL(cc, 89);
      glsb = CC_SEL(cc, 125);
      selb = CC_SEL(cc, 1);
   }

   if (CC_SEL(cc, 124) & 1) {
      /* alpha[0] == 1 */

      if (t == 3) {
         /* zero */
         rgba[RCOMP] = rgba[BCOMP] = rgba[GCOMP] = rgba[ACOMP] = 0;
      } else {
         uint8_t r, g, b;
         if (t == 0) {
            /* first colour, plain 5:5:5 */
            b = UP5(col[0][BCOMP]);
            g = UP5(col[0][GCOMP]);
            r = UP5(col[0][RCOMP]);
         } else if (t == 2) {
            /* second colour, green widened to 6 bits with glsb */
            b = UP5(col[1][BCOMP]);
            g = UP6(col[1][GCOMP], glsb);
            r = UP5(col[1][RCOMP]);
         } else {
            /* t == 1: average of the two colours above */
            b = (UP5(col[0][BCOMP]) + UP5(col[1][BCOMP])) / 2;
            g = (UP5(col[0][GCOMP]) + UP6(col[1][GCOMP], glsb)) / 2;
            r = (UP5(col[0][RCOMP]) + UP5(col[1][RCOMP])) / 2;
         }
         rgba[RCOMP] = r;
         rgba[GCOMP] = g;
         rgba[BCOMP] = b;
         rgba[ACOMP] = 255;
      }
   } else {
      /* alpha[0] == 0 */
      uint8_t r, g, b;
      if (t == 0) {
         /* first colour, green LSB is glsb XOR selb */
         b = UP5(col[0][BCOMP]);
         g = UP6(col[0][GCOMP], glsb ^ selb);
         r = UP5(col[0][RCOMP]);
      } else if (t == 3) {
         /* second colour, green LSB is glsb */
         b = UP5(col[1][BCOMP]);
         g = UP6(col[1][GCOMP], glsb);
         r = UP5(col[1][RCOMP]);
      } else {
         /* t == 1 or 2: t / 3 of the way from the first to the second */
         b = LERP(3, t, UP5(col[0][BCOMP]), UP5(col[1][BCOMP]));
         g = LERP(3, t, UP6(col[0][GCOMP], glsb ^ selb),
                        UP6(col[1][GCOMP], glsb));
         r = LERP(3, t, UP5(col[0][RCOMP]), UP5(col[1][RCOMP]));
      }
      rgba[RCOMP] = r;
      rgba[GCOMP] = g;
      rgba[BCOMP] = b;
      rgba[ACOMP] = 255;
   }
}
1398
1399
/*
 * Decode one texel of a block handled by the "alpha" decoder.
 *
 * code points at the 16-byte block, t is the texel index inside the block
 * (0..31) and rgba receives the four 8-bit components.
 *
 * Bit 124 (the "lerp" flag) chooses between two schemes:
 *   - lerp set:   the 2-bit selector interpolates, in thirds, from a first
 *                 colour (colour 0 for texels 0..15, colour 2 for texels
 *                 16..31) to colour 1.  Alpha is interpolated the same way.
 *   - lerp clear: selectors 0..2 pick one of three 5:5:5:5 colours directly;
 *                 selector 3 yields all zeroes.
 */
static void
fxt1_decode_1ALPHA (const uint8_t *code, int32_t t, uint8_t *rgba)
{
   const uint32_t *cc;
   uint8_t r, g, b, a;

   cc = (const uint32_t *)code;
   if (CC_SEL(cc, 124) & 1) {
      /* lerp == 1 */
      /* first interpolation endpoint; fields are unmasked, UP5() masks */
      uint32_t col0[4];

      if (t & 16) {
         /* texels 16..31: selector comes from word 1 */
         t &= 15;
         t = (cc[1] >> (t * 2)) & 3;
         /* col 2 */
         /* blue starts at bit 94 (byte 11, bit 6) and straddles words 2
          * and 3, hence the word read at a byte offset.
          * NOTE(review): presumably relies on little-endian layout - confirm */
         col0[BCOMP] = (*(const uint32_t *)(code + 11)) >> 6;
         col0[GCOMP] = CC_SEL(cc, 99);
         col0[RCOMP] = CC_SEL(cc, 104);
         col0[ACOMP] = CC_SEL(cc, 119);
      } else {
         /* texels 0..15: selector comes from word 0 */
         t = (cc[0] >> (t * 2)) & 3;
         /* col 0 */
         col0[BCOMP] = CC_SEL(cc, 64);
         col0[GCOMP] = CC_SEL(cc, 69);
         col0[RCOMP] = CC_SEL(cc, 74);
         col0[ACOMP] = CC_SEL(cc, 109);
      }

      if (t == 0) {
         /* first endpoint */
         b = UP5(col0[BCOMP]);
         g = UP5(col0[GCOMP]);
         r = UP5(col0[RCOMP]);
         a = UP5(col0[ACOMP]);
      } else if (t == 3) {
         /* second endpoint: colour 1 (bits 79/84/89) with alpha at bit 114 */
         b = UP5(CC_SEL(cc, 79));
         g = UP5(CC_SEL(cc, 84));
         r = UP5(CC_SEL(cc, 89));
         a = UP5(CC_SEL(cc, 114));
      } else {
         /* t == 1 or 2: t / 3 of the way between the endpoints */
         b = LERP(3, t, UP5(col0[BCOMP]), UP5(CC_SEL(cc, 79)));
         g = LERP(3, t, UP5(col0[GCOMP]), UP5(CC_SEL(cc, 84)));
         r = LERP(3, t, UP5(col0[RCOMP]), UP5(CC_SEL(cc, 89)));
         a = LERP(3, t, UP5(col0[ACOMP]), UP5(CC_SEL(cc, 114)));
      }
   } else {
      /* lerp == 0 */

      /* fetch the 2-bit selector: word 0 for texels 0..15, word 1 otherwise */
      if (t & 16) {
         cc++;
         t &= 15;
      }
      t = (cc[0] >> (t * 2)) & 3;

      if (t == 3) {
         /* zero */
         r = g = b = a = 0;
      } else {
         uint32_t kk;
         cc = (const uint32_t *)code;
         /* alpha of colour t sits at bit 13 + 5 * t of the last word */
         a = UP5(cc[3] >> (t * 5 + 13));
         /* colour t starts at bit 64 + 15 * t; read a word at the byte
          * holding its first bit and shift the leftover bits away */
         t *= 15;
         cc = (const uint32_t *)(code + 8 + t / 8);
         kk = cc[0] >> (t & 7);
         b = UP5(kk);
         g = UP5(kk >> 5);
         r = UP5(kk >> 10);
      }
   }
   rgba[RCOMP] = r;
   rgba[GCOMP] = g;
   rgba[BCOMP] = b;
   rgba[ACOMP] = a;
}
1473
1474
1475 static void
fxt1_decode_1(const void * texture,int32_t stride,int32_t i,int32_t j,uint8_t * rgba)1476 fxt1_decode_1 (const void *texture, int32_t stride, /* in pixels */
1477 int32_t i, int32_t j, uint8_t *rgba)
1478 {
1479 static void (*decode_1[]) (const uint8_t *, int32_t, uint8_t *) = {
1480 fxt1_decode_1HI, /* cc-high = "00?" */
1481 fxt1_decode_1HI, /* cc-high = "00?" */
1482 fxt1_decode_1CHROMA, /* cc-chroma = "010" */
1483 fxt1_decode_1ALPHA, /* alpha = "011" */
1484 fxt1_decode_1MIXED, /* mixed = "1??" */
1485 fxt1_decode_1MIXED, /* mixed = "1??" */
1486 fxt1_decode_1MIXED, /* mixed = "1??" */
1487 fxt1_decode_1MIXED /* mixed = "1??" */
1488 };
1489
1490 const uint8_t *code = (const uint8_t *)texture +
1491 ((j / 4) * (stride / 8) + (i / 8)) * 16;
1492 int32_t mode = CC_SEL(code, 125);
1493 int32_t t = i & 7;
1494
1495 if (t & 4) {
1496 t += 12;
1497 }
1498 t += (j & 3) * 4;
1499
1500 decode_1[mode](code, t, rgba);
1501 }
1502
1503 /*
1504 * Pixel fetch within a block.
1505 */
1506
/*
 * Fetch texel (i, j) of a single FXT1 RGB block as 8-bit RGBA.
 *
 * src points at the block; i and j are the texel coordinates inside it.
 * The RGB format carries no alpha, so the decoded alpha is discarded and
 * dst[3] is forced to fully opaque, matching what the float fetch and the
 * block unpack functions do for the RGB format.
 */
void
util_format_fxt1_rgb_fetch_rgba_8unorm(uint8_t *restrict dst, const uint8_t *restrict src, unsigned i, unsigned j)
{
   fxt1_decode_1(src, 0, i, j, dst);
   dst[3] = 0xff;
}
1512
/*
 * Fetch texel (i, j) of a single FXT1 RGBA block as 8-bit RGBA.
 *
 * src points at the block; i and j are the texel coordinates inside it.
 * The decoded alpha is kept as is, matching what the float fetch and the
 * block unpack functions do for the RGBA format.
 */
void
util_format_fxt1_rgba_fetch_rgba_8unorm(uint8_t *restrict dst, const uint8_t *restrict src, unsigned i, unsigned j)
{
   fxt1_decode_1(src, 0, i, j, dst);
}
1519
/*
 * Fetch texel (i, j) of a single FXT1 RGB block as four floats.
 *
 * in_dst is written as float[4]; the colour channels are converted with
 * ubyte_to_float() and alpha is always 1.0.
 */
void
util_format_fxt1_rgb_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, unsigned i, unsigned j)
{
   uint8_t texel[4];
   float *out = in_dst;

   fxt1_decode_1(src, 0, i, j, texel);

   for (unsigned c = 0; c < 3; c++) {
      out[c] = ubyte_to_float(texel[c]);
   }
   /* RGB format: decoded alpha is ignored */
   out[3] = 1.0f;
}
1531
/*
 * Fetch texel (i, j) of a single FXT1 RGBA block as four floats.
 *
 * in_dst is written as float[4]; all four decoded channels, alpha included,
 * are converted with ubyte_to_float().
 */
void
util_format_fxt1_rgba_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, unsigned i, unsigned j)
{
   uint8_t texel[4];
   float *out = in_dst;

   fxt1_decode_1(src, 0, i, j, texel);

   for (unsigned c = 0; c < 4; c++) {
      out[c] = ubyte_to_float(texel[c]);
   }
}
1543
1544 /*
1545 * Block decompression.
1546 */
1547
1548 static inline void
util_format_fxtn_rgb_unpack_rgba_8unorm(uint8_t * restrict dst_row,unsigned dst_stride,const uint8_t * restrict src_row,unsigned src_stride,unsigned width,unsigned height,boolean rgba)1549 util_format_fxtn_rgb_unpack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride,
1550 const uint8_t *restrict src_row, unsigned src_stride,
1551 unsigned width, unsigned height,
1552 boolean rgba)
1553 {
1554 const unsigned bw = 8, bh = 4, comps = 4;
1555 unsigned x, y, i, j;
1556 for (y = 0; y < height; y += bh) {
1557 const uint8_t *src = src_row;
1558 for (x = 0; x < width; x += bw) {
1559 for (j = 0; j < bh; ++j) {
1560 for (i = 0; i < bw; ++i) {
1561 uint8_t *dst = dst_row + (y + j) * dst_stride / sizeof(*dst_row) + (x + i) * comps;
1562 fxt1_decode_1(src, 0, i, j, dst);
1563 if (!rgba)
1564 dst[3] = 0xff;
1565 }
1566 }
1567 src += FXT1_BLOCK_SIZE;
1568 }
1569 src_row += src_stride;
1570 }
1571 }
1572
/*
 * Decompress an FXT1 RGB image into 8-bit RGBA.
 * Delegates to the shared unpacker with the rgba flag off.
 */
void
util_format_fxt1_rgb_unpack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride,
                                        const uint8_t *restrict src_row, unsigned src_stride,
                                        unsigned width, unsigned height)
{
   util_format_fxtn_rgb_unpack_rgba_8unorm(dst_row, dst_stride, src_row, src_stride,
                                           width, height, false /* rgba */);
}
1583
/*
 * Decompress an FXT1 RGBA image into 8-bit RGBA.
 * Delegates to the shared unpacker with the rgba flag on.
 */
void
util_format_fxt1_rgba_unpack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride,
                                         const uint8_t *restrict src_row, unsigned src_stride,
                                         unsigned width, unsigned height)
{
   util_format_fxtn_rgb_unpack_rgba_8unorm(dst_row, dst_stride, src_row, src_stride,
                                           width, height, true /* rgba */);
}
1594
1595 static inline void
util_format_fxtn_rgb_unpack_rgba_float(float * dst_row,unsigned dst_stride,const uint8_t * restrict src_row,unsigned src_stride,unsigned width,unsigned height,boolean rgba)1596 util_format_fxtn_rgb_unpack_rgba_float(float *dst_row, unsigned dst_stride,
1597 const uint8_t *restrict src_row, unsigned src_stride,
1598 unsigned width, unsigned height,
1599 boolean rgba)
1600 {
1601 const unsigned bw = 8, bh = 4, comps = 4;
1602 unsigned x, y, i, j;
1603 for (y = 0; y < height; y += 4) {
1604 const uint8_t *src = src_row;
1605 for (x = 0; x < width; x += 8) {
1606 for (j = 0; j < bh; ++j) {
1607 for (i = 0; i < bw; ++i) {
1608 float *dst = dst_row + (y + j)*dst_stride/sizeof(*dst_row) + (x + i) * comps;
1609 uint8_t tmp[4];
1610 fxt1_decode_1(src, 0, i, j, tmp);
1611 dst[0] = ubyte_to_float(tmp[0]);
1612 dst[1] = ubyte_to_float(tmp[1]);
1613 dst[2] = ubyte_to_float(tmp[2]);
1614 if (rgba)
1615 dst[3] = ubyte_to_float(tmp[3]);
1616 else
1617 dst[3] = 1.0;
1618 }
1619 }
1620 src += FXT1_BLOCK_SIZE;
1621 }
1622 src_row += src_stride;
1623 }
1624 }
1625
/*
 * Decompress an FXT1 RGB image into float RGBA.
 * Delegates to the shared float unpacker with the rgba flag off.
 */
void
util_format_fxt1_rgb_unpack_rgba_float(void *restrict dst_row, unsigned dst_stride,
                                       const uint8_t *restrict src_row, unsigned src_stride,
                                       unsigned width, unsigned height)
{
   util_format_fxtn_rgb_unpack_rgba_float(dst_row, dst_stride, src_row, src_stride,
                                          width, height, false /* rgba */);
}
1636
/*
 * Decompress an FXT1 RGBA image into float RGBA.
 * Delegates to the shared float unpacker with the rgba flag on.
 */
void
util_format_fxt1_rgba_unpack_rgba_float(void *restrict dst_row, unsigned dst_stride,
                                        const uint8_t *restrict src_row, unsigned src_stride,
                                        unsigned width, unsigned height)
{
   util_format_fxtn_rgb_unpack_rgba_float(dst_row, dst_stride, src_row, src_stride,
                                          width, height, true /* rgba */);
}
1647
1648 /*
1649 * Block compression.
1650 */
1651
/*
 * Compress an 8-bit RGBA image into FXT1 RGB.
 *
 * The encoder wants tightly packed 24bpp RGB for this format, so the
 * first three bytes of every source pixel are copied into a temporary
 * image, which is then encoded.  If the temporary image cannot be
 * allocated, nothing is written to dst_row.
 *
 * dst_stride and src_stride are passed on / applied as byte strides.
 */
void
util_format_fxt1_rgb_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride,
                                      const uint8_t *restrict src, unsigned src_stride,
                                      unsigned width, unsigned height)
{
   const int rgb_stride = width * 3;
   uint8_t *rgb = malloc(height * rgb_stride);

   if (rgb == NULL)
      return;

   /* strip the fourth byte of every pixel; output rows are contiguous */
   const uint8_t *in_row = src;
   uint8_t *out = rgb;
   for (unsigned y = 0; y < height; y++) {
      const uint8_t *in = in_row;
      for (unsigned x = 0; x < width; x++) {
         *out++ = in[0];
         *out++ = in[1];
         *out++ = in[2];
         in += 4;
      }
      in_row += src_stride;
   }

   fxt1_encode(width, height, 3, rgb, rgb_stride, dst_row, dst_stride);

   free(rgb);
}
1677
/*
 * Compress an 8-bit RGBA image into FXT1 RGBA.
 * The source already has four bytes per pixel, so it is handed straight to
 * the encoder.
 */
void
util_format_fxt1_rgba_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride,
                                       const uint8_t *restrict src, unsigned src_stride,
                                       unsigned width, unsigned height)
{
   fxt1_encode(width, height, 4 /* comps */,
               src, src_stride,
               dst_row, dst_stride);
}
1685
/*
 * Compress a float RGBA image into FXT1 RGB.
 *
 * The floats are first packed into a temporary, tightly packed
 * 4-bytes-per-pixel image, which is then run through the 8-bit RGB packer.
 * If the temporary image cannot be allocated, nothing is written to dst_row.
 */
void
util_format_fxt1_rgb_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride,
                                     const float *restrict src, unsigned src_stride,
                                     unsigned width, unsigned height)
{
   const int bytes_per_row = width * 4;
   uint8_t *rgba8 = malloc(height * bytes_per_row);

   if (rgba8 != NULL) {
      /* float -> 8 bits per channel */
      util_format_r8g8b8a8_unorm_pack_rgba_float(rgba8, bytes_per_row,
                                                 src, src_stride,
                                                 width, height);

      /* 8 bits per channel -> FXT1 */
      util_format_fxt1_rgb_pack_rgba_8unorm(dst_row, dst_stride,
                                            rgba8, bytes_per_row,
                                            width, height);

      free(rgba8);
   }
}
1706
/*
 * Compress a float RGBA image into FXT1 RGBA.
 *
 * The floats are first packed into a temporary, tightly packed
 * 4-bytes-per-pixel image, which is then run through the 8-bit RGBA packer.
 * If the temporary image cannot be allocated, nothing is written to dst_row.
 */
void
util_format_fxt1_rgba_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride,
                                      const float *restrict src, unsigned src_stride,
                                      unsigned width, unsigned height)
{
   const int bytes_per_row = width * 4;
   uint8_t *rgba8 = malloc(height * bytes_per_row);

   if (rgba8 != NULL) {
      /* float -> 8 bits per channel */
      util_format_r8g8b8a8_unorm_pack_rgba_float(rgba8, bytes_per_row,
                                                 src, src_stride,
                                                 width, height);

      /* 8 bits per channel -> FXT1 */
      util_format_fxt1_rgba_pack_rgba_8unorm(dst_row, dst_stride,
                                             rgba8, bytes_per_row,
                                             width, height);

      free(rgba8);
   }
}
1727