• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (c) 2011-2013 Luc Verhaegen <libv@skynet.be>
3  * Copyright (c) 2018 Alyssa Rosenzweig <alyssa@rosenzweig.io>
4  * Copyright (c) 2018 Vasily Khoruzhick <anarsoul@gmail.com>
5  * Copyright (c) 2019 Collabora, Ltd.
6  *
7  * Permission is hereby granted, free of charge, to any person obtaining a
8  * copy of this software and associated documentation files (the "Software"),
9  * to deal in the Software without restriction, including without limitation
10  * the rights to use, copy, modify, merge, publish, distribute, sub license,
11  * and/or sell copies of the Software, and to permit persons to whom the
12  * Software is furnished to do so, subject to the following conditions:
13  *
14  * The above copyright notice and this permission notice (including the
15  * next paragraph) shall be included in all copies or substantial portions
16  * of the Software.
17  *
18  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20  * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
21  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24  * DEALINGS IN THE SOFTWARE.
25  *
26  */
27 
28 #include "pan_tiling.h"
29 #include <stdbool.h>
30 #include "util/macros.h"
31 #include "util/bitscan.h"
32 
33 /*
34  * This file implements software encode/decode of u-interleaved textures.
35  * See docs/drivers/panfrost.rst for details on the format.
36  *
37  * The tricky bit is ordering along the space-filling curve:
38  *
39  *    | y3 | (x3 ^ y3) | y2 | (y2 ^ x2) | y1 | (y1 ^ x1) | y0 | (y0 ^ x0) |
40  *
41  * While interleaving bits is trivial in hardware, it is nontrivial in software.
42  * The trick is to divide the pattern up:
43  *
44  *    | y3 | y3 | y2 | y2 | y1 | y1 | y0 | y0 |
45  *  ^ |  0 | x3 |  0 | x2 |  0 | x1 |  0 | x0 |
46  *
47  * That is, duplicate the bits of the Y and space out the bits of the X. The top
48  * line is a function only of Y, so it can be calculated once per row and stored
49  * in a register. The bottom line is simply X with the bits spaced out. Spacing
50  * out the X is easy enough with a LUT, or by subtracting+ANDing the mask
51  * pattern (abusing carry bits).
52  *
53  */
54 
55 /* Given the lower 4-bits of the Y coordinate, we would like to
56  * duplicate every bit over. So instead of 0b1010, we would like
57  * 0b11001100. The idea is that for the bits in the solely Y place, we
58  * get a Y place, and the bits in the XOR place *also* get a Y. */
59 
60 const uint32_t bit_duplication[16] = {
61    0b00000000,
62    0b00000011,
63    0b00001100,
64    0b00001111,
65    0b00110000,
66    0b00110011,
67    0b00111100,
68    0b00111111,
69    0b11000000,
70    0b11000011,
71    0b11001100,
72    0b11001111,
73    0b11110000,
74    0b11110011,
75    0b11111100,
76    0b11111111,
77 };
78 
79 /* Space the bits out of a 4-bit nibble */
80 
81 const unsigned space_4[16] = {
82    0b0000000,
83    0b0000001,
84    0b0000100,
85    0b0000101,
86    0b0010000,
87    0b0010001,
88    0b0010100,
89    0b0010101,
90    0b1000000,
91    0b1000001,
92    0b1000100,
93    0b1000101,
94    0b1010000,
95    0b1010001,
96    0b1010100,
97    0b1010101
98 };
99 
100 /* The scheme uses 16x16 tiles */
101 
102 #define TILE_WIDTH 16
103 #define TILE_HEIGHT 16
104 #define PIXELS_PER_TILE (TILE_WIDTH * TILE_HEIGHT)
105 
106 /* We need a 128-bit type for idiomatically tiling bpp128 formats. The type must
107  * only support copies and sizeof, so emulating with a packed structure works
108  * well enough, but if there's a native 128-bit type we may we well prefer
109  * that. */
110 
111 #ifdef __SIZEOF_INT128__
112 typedef __uint128_t pan_uint128_t;
113 #else
114 typedef struct {
115   uint64_t lo;
116   uint64_t hi;
117 } __attribute__((packed)) pan_uint128_t;
118 #endif
119 
120 typedef struct {
121   uint16_t lo;
122   uint8_t hi;
123 } __attribute__((packed)) pan_uint24_t;
124 
125 typedef struct {
126   uint32_t lo;
127   uint16_t hi;
128 } __attribute__((packed)) pan_uint48_t;
129 
130 typedef struct {
131   uint64_t lo;
132   uint32_t hi;
133 } __attribute__((packed)) pan_uint96_t;
134 
135 /* Optimized routine to tile an aligned (w & 0xF == 0) texture. Explanation:
136  *
137  * dest_start precomputes the offset to the beginning of the first horizontal
138  * tile we're writing to, knowing that x is 16-aligned. Tiles themselves are
139  * stored linearly, so we get the X tile number by shifting and then multiply
140  * by the bytes per tile .
141  *
142  * We iterate across the pixels we're trying to store in source-order. For each
143  * row in the destination image, we figure out which row of 16x16 block we're
144  * in, by slicing off the lower 4-bits (block_y).
145  *
146  * dest then precomputes the location of the top-left corner of the block the
147  * row starts in. In pixel coordinates (where the origin is the top-left),
148  * (block_y, 0) is the top-left corner of the leftmost tile in this row.  While
149  * pixels are reordered within a block, the blocks themselves are stored
150  * linearly, so multiplying block_y by the pixel stride of the destination
151  * image equals the byte offset of that top-left corner of the block this row
152  * is in.
153  *
154  * On the other hand, the source is linear so we compute the locations of the
155  * start and end of the row in the source by a simple linear addressing.
156  *
157  * For indexing within the tile, we need to XOR with the [y3 y3 y2 y2 y1 y1 y0
158  * y0] value. Since this is constant across a row, we look it up per-row and
159  * store in expanded_y.
160  *
161  * Finally, we iterate each row in source order. In the outer loop, we iterate
162  * each 16 pixel tile. Within each tile, we iterate the 16 pixels (this should
163  * be unrolled), calculating the index within the tile and writing.
164  */
165 
166 #define TILED_ACCESS_TYPE(pixel_t, shift) \
167 static ALWAYS_INLINE void \
168 panfrost_access_tiled_image_##pixel_t \
169                               (void *dst, void *src, \
170                                uint16_t sx, uint16_t sy, \
171                                uint16_t w, uint16_t h, \
172                                uint32_t dst_stride, \
173                                uint32_t src_stride, \
174                                bool is_store) \
175 { \
176    uint8_t *dest_start = dst + ((sx >> 4) * PIXELS_PER_TILE * sizeof(pixel_t)); \
177    for (int y = sy, src_y = 0; src_y < h; ++y, ++src_y) { \
178       uint8_t *dest = (uint8_t *) (dest_start + ((y >> 4) * dst_stride)); \
179       pixel_t *source = src + (src_y * src_stride); \
180       pixel_t *source_end = source + w; \
181       unsigned expanded_y = bit_duplication[y & 0xF] << shift; \
182       for (; source < source_end; dest += (PIXELS_PER_TILE << shift)) { \
183          for (uint8_t i = 0; i < 16; ++i) { \
184             unsigned index = expanded_y ^ (space_4[i] << shift); \
185             if (is_store) \
186                 *((pixel_t *) (dest + index)) = *(source++); \
187             else \
188                 *(source++) = *((pixel_t *) (dest + index)); \
189          } \
190       } \
191    } \
192 } \
193 
194 TILED_ACCESS_TYPE(uint8_t, 0);
195 TILED_ACCESS_TYPE(uint16_t, 1);
196 TILED_ACCESS_TYPE(uint32_t, 2);
197 TILED_ACCESS_TYPE(uint64_t, 3);
198 TILED_ACCESS_TYPE(pan_uint128_t, 4);
199 
200 #define TILED_UNALIGNED_TYPE(pixel_t, is_store, tile_shift) { \
201    const unsigned mask = (1 << tile_shift) - 1; \
202    for (int y = sy, src_y = 0; src_y < h; ++y, ++src_y) { \
203       unsigned block_start_s = (y >> tile_shift) * dst_stride; \
204       unsigned source_start = src_y * src_stride; \
205       unsigned expanded_y = bit_duplication[y & mask]; \
206  \
207       for (int x = sx, src_x = 0; src_x < w; ++x, ++src_x) { \
208          unsigned block_x_s = (x >> tile_shift) * (1 << (tile_shift * 2)); \
209          unsigned index = expanded_y ^ space_4[x & mask]; \
210          uint8_t *source = src + source_start + sizeof(pixel_t) * src_x; \
211          uint8_t *dest = dst + block_start_s + sizeof(pixel_t) * (block_x_s + index); \
212  \
213          pixel_t *outp = (pixel_t *) (is_store ? dest : source); \
214          pixel_t *inp = (pixel_t *) (is_store ? source : dest); \
215          *outp = *inp; \
216       } \
217    } \
218 }
219 
220 #define TILED_UNALIGNED_TYPES(store, shift) { \
221    if (bpp == 8) \
222       TILED_UNALIGNED_TYPE(uint8_t, store, shift) \
223    else if (bpp == 16) \
224       TILED_UNALIGNED_TYPE(uint16_t, store, shift) \
225    else if (bpp == 24) \
226       TILED_UNALIGNED_TYPE(pan_uint24_t, store, shift) \
227    else if (bpp == 32) \
228       TILED_UNALIGNED_TYPE(uint32_t, store, shift) \
229    else if (bpp == 48) \
230       TILED_UNALIGNED_TYPE(pan_uint48_t, store, shift) \
231    else if (bpp == 64) \
232       TILED_UNALIGNED_TYPE(uint64_t, store, shift) \
233    else if (bpp == 96) \
234       TILED_UNALIGNED_TYPE(pan_uint96_t, store, shift) \
235    else if (bpp == 128) \
236       TILED_UNALIGNED_TYPE(pan_uint128_t, store, shift) \
237 }
238 
239 /*
240  * Perform a generic access to a tiled image with a given format. This works
241  * even for block-compressed images on entire blocks at a time. sx/sy/w/h are
242  * specified in pixels, not blocks, but our internal routines work in blocks,
243  * so we divide here. Alignment is assumed.
244  */
245 static void
panfrost_access_tiled_image_generic(void * dst,void * src,unsigned sx,unsigned sy,unsigned w,unsigned h,uint32_t dst_stride,uint32_t src_stride,const struct util_format_description * desc,bool _is_store)246 panfrost_access_tiled_image_generic(void *dst, void *src,
247                                unsigned sx, unsigned sy,
248                                unsigned w, unsigned h,
249                                uint32_t dst_stride,
250                                uint32_t src_stride,
251                                const struct util_format_description *desc,
252                                bool _is_store)
253 {
254    unsigned bpp = desc->block.bits;
255 
256    /* Convert units */
257    sx /= desc->block.width;
258    sy /= desc->block.height;
259    w = DIV_ROUND_UP(w, desc->block.width);
260    h = DIV_ROUND_UP(h, desc->block.height);
261 
262    if (desc->block.width > 1) {
263       if (_is_store)
264          TILED_UNALIGNED_TYPES(true, 2)
265       else
266          TILED_UNALIGNED_TYPES(false, 2)
267    } else {
268       if (_is_store)
269          TILED_UNALIGNED_TYPES(true, 4)
270       else
271          TILED_UNALIGNED_TYPES(false, 4)
272    }
273 }
274 
275 #define OFFSET(src, _x, _y) (void *) ((uint8_t *) src + ((_y) - orig_y) * src_stride + (((_x) - orig_x) * (bpp / 8)))
276 
277 static ALWAYS_INLINE void
panfrost_access_tiled_image(void * dst,void * src,unsigned x,unsigned y,unsigned w,unsigned h,uint32_t dst_stride,uint32_t src_stride,enum pipe_format format,bool is_store)278 panfrost_access_tiled_image(void *dst, void *src,
279                            unsigned x, unsigned y,
280                            unsigned w, unsigned h,
281                            uint32_t dst_stride,
282                            uint32_t src_stride,
283                            enum pipe_format format,
284                            bool is_store)
285 {
286    const struct util_format_description *desc = util_format_description(format);
287    unsigned bpp = desc->block.bits;
288 
289    /* Our optimized routines cannot handle unaligned blocks (without depending
290     * on platform-specific behaviour), and there is no good reason to do so. If
291     * these assertions fail, there is either a driver bug or a non-portable unit
292     * test.
293     */
294    assert((dst_stride % (bpp / 8)) == 0 && "unaligned destination stride");
295    assert((src_stride % (bpp / 8)) == 0 && "unaligned source stride");
296 
297    if (desc->block.width > 1 || !util_is_power_of_two_nonzero(desc->block.bits)) {
298       panfrost_access_tiled_image_generic(dst, (void *) src,
299             x, y, w, h,
300             dst_stride, src_stride, desc, is_store);
301 
302       return;
303    }
304 
305    unsigned first_full_tile_x = DIV_ROUND_UP(x, TILE_WIDTH) * TILE_WIDTH;
306    unsigned first_full_tile_y = DIV_ROUND_UP(y, TILE_HEIGHT) * TILE_HEIGHT;
307    unsigned last_full_tile_x = ((x + w) / TILE_WIDTH) * TILE_WIDTH;
308    unsigned last_full_tile_y = ((y + h) / TILE_HEIGHT) * TILE_HEIGHT;
309 
310    /* First, tile the top portion */
311 
312    unsigned orig_x = x, orig_y = y;
313 
314    if (first_full_tile_y != y) {
315       unsigned dist = MIN2(first_full_tile_y - y, h);
316 
317       panfrost_access_tiled_image_generic(dst, OFFSET(src, x, y),
318             x, y, w, dist,
319             dst_stride, src_stride, desc, is_store);
320 
321       if (dist == h)
322          return;
323 
324       y += dist;
325       h -= dist;
326    }
327 
328    /* Next, the bottom portion */
329    if (last_full_tile_y != (y + h)) {
330       unsigned dist = (y + h) - last_full_tile_y;
331 
332       panfrost_access_tiled_image_generic(dst, OFFSET(src, x, last_full_tile_y),
333             x, last_full_tile_y, w, dist,
334             dst_stride, src_stride, desc, is_store);
335 
336       h -= dist;
337    }
338 
339    /* The left portion */
340    if (first_full_tile_x != x) {
341       unsigned dist = MIN2(first_full_tile_x - x, w);
342 
343       panfrost_access_tiled_image_generic(dst, OFFSET(src, x, y),
344             x, y, dist, h,
345             dst_stride, src_stride, desc, is_store);
346 
347       if (dist == w)
348          return;
349 
350       x += dist;
351       w -= dist;
352    }
353 
354    /* Finally, the right portion */
355    if (last_full_tile_x != (x + w)) {
356       unsigned dist = (x + w) - last_full_tile_x;
357 
358       panfrost_access_tiled_image_generic(dst, OFFSET(src, last_full_tile_x, y),
359             last_full_tile_x, y, dist, h,
360             dst_stride, src_stride, desc, is_store);
361 
362       w -= dist;
363    }
364 
365    if (bpp == 8)
366       panfrost_access_tiled_image_uint8_t(dst,  OFFSET(src, x, y), x, y, w, h, dst_stride, src_stride, is_store);
367    else if (bpp == 16)
368       panfrost_access_tiled_image_uint16_t(dst, OFFSET(src, x, y), x, y, w, h, dst_stride, src_stride, is_store);
369    else if (bpp == 32)
370       panfrost_access_tiled_image_uint32_t(dst, OFFSET(src, x, y), x, y, w, h, dst_stride, src_stride, is_store);
371    else if (bpp == 64)
372       panfrost_access_tiled_image_uint64_t(dst, OFFSET(src, x, y), x, y, w, h, dst_stride, src_stride, is_store);
373    else if (bpp == 128)
374       panfrost_access_tiled_image_pan_uint128_t(dst, OFFSET(src, x, y), x, y, w, h, dst_stride, src_stride, is_store);
375 }
376 
377 /**
378  * Access a tiled image (load or store). Note: the region of interest (x, y, w,
379  * h) is specified in pixels, not blocks. It is expected that these quantities
380  * are aligned to the block size.
381  */
382 void
panfrost_store_tiled_image(void * dst,const void * src,unsigned x,unsigned y,unsigned w,unsigned h,uint32_t dst_stride,uint32_t src_stride,enum pipe_format format)383 panfrost_store_tiled_image(void *dst, const void *src,
384                            unsigned x, unsigned y,
385                            unsigned w, unsigned h,
386                            uint32_t dst_stride,
387                            uint32_t src_stride,
388                            enum pipe_format format)
389 {
390     panfrost_access_tiled_image(dst, (void *) src,
391         x, y, w, h,
392         dst_stride, src_stride, format, true);
393 }
394 
395 void
panfrost_load_tiled_image(void * dst,const void * src,unsigned x,unsigned y,unsigned w,unsigned h,uint32_t dst_stride,uint32_t src_stride,enum pipe_format format)396 panfrost_load_tiled_image(void *dst, const void *src,
397                            unsigned x, unsigned y,
398                            unsigned w, unsigned h,
399                            uint32_t dst_stride,
400                            uint32_t src_stride,
401                            enum pipe_format format)
402 {
403    panfrost_access_tiled_image((void *) src, dst,
404        x, y, w, h,
405        src_stride, dst_stride, format, false);
406 }
407