1 /*
2  * Copyright (c) 2023 Valve Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21  * SOFTWARE.
22  */
23 
24 #include "freedreno_layout.h"
25 
26 #if DETECT_ARCH_AARCH64
27 #include <arm_neon.h>
28 #endif
29 
30 /* The tiling scheme on Qualcomm consists of four levels:
31  *
32  * 1. The UBWC block. Normally these use a compressed encoding format with the
33  *    compressed size stored in the corresponding metadata byte. However for
34  *    uncompressed blocks, or blocks in a texture where UBWC is disabled, the
35  *    pixels within the block are stored using a straightforward
36  *    coordinate-interleaving scheme:
37  *
38  *    b7 b6 b5 b4 b3 b2 b1 b0
39  *    -----------------------
40  *    y2 x4 x3 x2 y1 x1 y0 x0
41  *
42  *    Pixel contents are always stored linearly, only the pixel offset is
43  *    swizzled. UBWC blocks for most formats are smaller than 256 pixels and
44  *    only use the first xN and yN, ignoring the higher bits.
45  *
46  *    There is a special case for single-sampled R8G8 formats, where the 32x8
47  *    block is stored as a 32x8 R8 tile where the left half stores the R
48  *    components for each pixel and the right half stores the G components.
49  *    However non-compressed tiled R8G8 textures are not supported so we
50  *    ignore it here.
51  *
52  * 2. The 256 byte tile. Most UBWC blocks are 256 bytes already, but UBWC
53  *    blocks for some smaller formats are only 128 bytes, so 2x1 or 2x2 blocks
54  *    are combined to get a 256 byte tile. This can also be thought of as
55  *    re-adding bits that were dropped in the coordinate-interleaving scheme
56  *    above, and we take advantage of this to fold this level into the
57  *    previous one as we don't care about compression.
58  *
59  * 3. The 2K macrotile. This consists of 2x4 tiles, with a complicated
60  *    xor-based bank swizzling scheme. There are two possible modes, chosen by
61  *    the "macrotile mode" in RBBM_NC_MODE_CNTL. For formats with cpp of 16 or
62  *    greater, both modes are identical and the scheme is this:
63  *
64  *    b0 = x0 ^ y1
65  *    b1 = x0 ^ y1 ^ y0
66  *    b2 = x0 ^ y0
67  *
68  *    For all formats with a cpp less than 16, additional higher-order bits
69  *    are xor'ed into the upper 2 offset bits depending on the macrotile mode.
70  *    In "4 channel" mode:
71  *
72  *    b1' = b1 ^ x1
73  *
74  *    and in "8 channel" mode:
75  *
76  *    b1' = b1 ^ x1
77  *    b2' = b2 ^ x2 ^ y2
78  *
79  *    The macrotile stride is always a multiple of 2, so that pairs of 2K
80  *    macrotiles can be considered 4K macrotiles with one additional offset
81  *    bit:
82  *
83  *    b3 = x1
84  *
85  *    This is closer to the hardware representation as the base address is
86  *    aligned to 4K. However for our purposes this can be folded into
87  *    the next level:
88  *
89  * 4. Swizzled macrotile offset. The macrotile offset is usually linear,
90  *    however with strides that are aligned to the number of DDR banks this
91  *    can result in bank conflicts between vertically adjacent macrotiles that
92  *    map to the same bank. This is mitigated by xor'ing up to 3 bits of the
93  *    y offset into x based on how aligned the stride is before computing the
94  *    offset, or equivalently xor'ing them into the final offset. The
95  *    alignment is based on a value called the "highest bank bit" that is
96  *    programmed by the kernel based on the memory bank configuration.
97  *
98  *    The kernel also chooses which bits of y to xor in, which are called
99  *    "bank swizzle levels." The naming is weird, because the lowest level,
100  *    level 1, actually involves the highest bit of y:
101  *    - "Level 1 bank swizzling" swizzles bit 2 of the macrotile y offset into
102  *    the highest bank bit plus 1 when the stride between macrotiles (in
103  *    bytes) is a multiple of 2^{hbb + 2} where hbb is the highest bank bit.
104  *    - "Level 2 bank swizzling" swizzles bit 0 of the macrotile y offset into
105  *    the highest bank bit minus 1 when the stride is a multiple of 2^{hbb}.
106  *    - "Level 3 bank swizzling" swizzles bit 1 of the macrotile y offset into
107  *    the highest bank bit when the stride is a multiple of 2^{hbb + 1},
108  *
109  *    Level 1 bank swizzling is only enabled in UBWC 1.0 mode. Levels 2 and 3
110  *    can be selectively disabled starting with UBWC 4.0.
111  *
112  * This implementation uses ideas from
113  * https://fgiesen.wordpress.com/2011/01/17/texture-tiling-and-swizzling/.
114  * Steps 1 and 2 map straightforwardly to the ideas explained there, but step
115  * 3 is very different. Luckily the offset of a block can still be split into
116  * a combination of values depending only on x and y, however they may be
117  * overlapping and instead of adding them together we have to xor them
118  * together.
119  *
120  * We choose the size of the innermost loop to be the size of a block, which
121  * is 256 bytes and therefore larger than strictly necessary, for two reasons:
122  * it simplifies the code a bit by not having to keep track of separate block
123  * sizes and "inner" block sizes, and in some cases a cacheline-sized inner
124  * tile wouldn't be wide enough to use ldp to get the fastest-possible 32 byte
125  * load.
126  */
127 
128 #define USE_SLOW_PATH 0
129 
130 static uint32_t
get_pixel_offset(uint32_t x,uint32_t y)131 get_pixel_offset(uint32_t x, uint32_t y)
132 {
133    return
134       (x & 1) |
135       (y & 1) << 1 |
136       ((x & 2) >> 1) << 2 |
137       ((y & 2) >> 1) << 3 |
138       ((x & 0x1c) >> 2) << 4 |
139       ((y & 4) >> 2) << 7;
140 }
141 
142 /* Take the x and y block coordinates and return two masks which when combined
143  * give us the block offset in bytes.  This includes the block offset within a
144  * macrotile and the macrotile x offset, which is swizzled based on the
145  * highest bank bit and enabled levels, but not the macrotile y offset which
146  * has to be added separately.
147  *
148  * This partially depends on the macrotile mode and block_x_xormask is called
149  * in the hot path, so we have to templatize it.
150  */
151 
152 template<enum fdl_macrotile_mode macrotile_mode>
153 static uint32_t
154 block_x_xormask(uint32_t x, uint32_t cpp);
155 
156 template<>
157 uint32_t
block_x_xormask(uint32_t x,uint32_t cpp)158 block_x_xormask<FDL_MACROTILE_4_CHANNEL>(uint32_t x, uint32_t cpp)
159 {
160    return (((x & 1) * 0b111) ^ (cpp < 16 ? (x & 0b010) : 0) ^ ((x >> 1) << 3)) << 8;
161 }
162 
163 template<>
164 uint32_t
block_x_xormask(uint32_t x,uint32_t cpp)165 block_x_xormask<FDL_MACROTILE_8_CHANNEL>(uint32_t x, uint32_t cpp)
166 {
167    return (((x & 1) * 0b111) ^ (cpp < 16 ? (x & 0b110) : 0) ^ ((x >> 1) << 3)) << 8;
168 }
169 
170 template<enum fdl_macrotile_mode macrotile_mode>
171 static uint32_t
172 block_y_xormask(uint32_t y, uint32_t cpp, uint32_t bank_mask, uint32_t bank_shift);
173 
174 template<>
175 uint32_t
block_y_xormask(uint32_t y,uint32_t cpp,uint32_t bank_mask,uint32_t bank_shift)176 block_y_xormask<FDL_MACROTILE_4_CHANNEL>(uint32_t y, uint32_t cpp,
177                                          uint32_t bank_mask,
178                                          uint32_t bank_shift)
179 {
180    return ((((y & 1) * 0b110) ^ (((y >> 1) & 1) * 0b011)) << 8) |
181       ((y & bank_mask) << bank_shift);
182 }
183 
184 template<>
185 uint32_t
block_y_xormask(uint32_t y,uint32_t cpp,uint32_t bank_mask,uint32_t bank_shift)186 block_y_xormask<FDL_MACROTILE_8_CHANNEL>(uint32_t y, uint32_t cpp,
187                                          uint32_t bank_mask,
188                                          uint32_t bank_shift)
189 {
190    return ((((y & 1) * 0b110) ^ (((y >> 1) & 1) * 0b011) ^
191             (cpp < 16 ? (y & 0b100) : 0)) << 8) |
192       ((y & bank_mask) << bank_shift);
193 }
194 
195 /* Figure out how y is swizzled into x based on the UBWC config and block
196  * stride and return values to be plugged into block_y_xormask().
197  */
198 
199 static uint32_t
get_bank_mask(uint32_t block_stride,uint32_t cpp,const struct fdl_ubwc_config * config)200 get_bank_mask(uint32_t block_stride, uint32_t cpp,
201               const struct fdl_ubwc_config *config)
202 {
203    /* For some reason, for cpp=1 (or R8G8 media formats) the alignment
204     * required is doubled.
205     */
206    unsigned offset = cpp == 1 ? 1 : 0;
207    uint32_t mask = 0;
208    if ((config->bank_swizzle_levels & 0x2) &&
209        (block_stride & ((1u << (config->highest_bank_bit - 10 + offset)) - 1)) == 0)
210       mask |= 0b100;
211    if ((config->bank_swizzle_levels & 0x4) &&
212        (block_stride & ((1u << (config->highest_bank_bit - 9 + offset)) - 1)) == 0)
213       mask |= 0b1000;
214    if ((config->bank_swizzle_levels & 0x1) &&
215        (block_stride & ((1u << (config->highest_bank_bit - 8 + offset)) - 1)) == 0)
216       mask |= 0b10000;
217    return mask;
218 }
219 
220 static uint32_t
get_bank_shift(const struct fdl_ubwc_config * config)221 get_bank_shift(const struct fdl_ubwc_config *config)
222 {
223    return config->highest_bank_bit - 3;
224 }
225 
226 #if USE_SLOW_PATH
227 static uint32_t
get_block_offset(uint32_t x,uint32_t y,unsigned block_stride,unsigned cpp,const struct fdl_ubwc_config * config)228 get_block_offset(uint32_t x, uint32_t y, unsigned block_stride, unsigned cpp,
229                  const struct fdl_ubwc_config *config)
230 {
231    uint32_t bank_mask = get_bank_mask(block_stride, cpp, config);
232    unsigned bank_shift = get_bank_shift(config);
233    uint32_t x_mask, y_mask;
234    if (config->macrotile_mode == FDL_MACROTILE_4_CHANNEL) {
235       x_mask = block_x_xormask<FDL_MACROTILE_4_CHANNEL>(x, cpp);
236       y_mask = block_y_xormask<FDL_MACROTILE_4_CHANNEL>(y, cpp, bank_mask,
237                                                         bank_shift);
238    } else {
239       x_mask = block_x_xormask<FDL_MACROTILE_8_CHANNEL>(x, cpp);
240       y_mask = block_y_xormask<FDL_MACROTILE_8_CHANNEL>(y, cpp, bank_mask,
241                                                         bank_shift);
242    }
243    uint32_t macrotile_y = y >> 2;
244    uint32_t macrotile_stride = block_stride / 2;
245    return ((x_mask ^ y_mask) >> 8) + ((macrotile_y * macrotile_stride) << 3);
246 }
247 #endif
248 
249 static void
get_block_size(unsigned cpp,uint32_t * block_width,uint32_t * block_height)250 get_block_size(unsigned cpp, uint32_t *block_width,
251                uint32_t *block_height)
252 {
253    switch (cpp) {
254    case 1:
255       *block_width = 32;
256       *block_height = 8;
257       break;
258    case 2:
259       *block_width = 32;
260       *block_height = 4;
261       break;
262    case 4:
263       *block_width = 16;
264       *block_height = 4;
265       break;
266    case 8:
267       *block_width = 8;
268       *block_height = 4;
269       break;
270    case 16:
271       *block_width = 4;
272       *block_height = 4;
273       break;
274    default:
275       unreachable("unknown cpp");
276    }
277 }
278 
279 enum copy_dir {
280    LINEAR_TO_TILED,
281    TILED_TO_LINEAR,
282 };
283 
284 template<unsigned cpp, enum copy_dir direction,
285    enum fdl_macrotile_mode macrotile_mode>
286 static void
memcpy_small(uint32_t x_start,uint32_t y_start,uint32_t width,uint32_t height,char * tiled,char * linear,uint32_t linear_pitch,uint32_t block_stride,const struct fdl_ubwc_config * config)287 memcpy_small(uint32_t x_start, uint32_t y_start,
288              uint32_t width, uint32_t height,
289              char *tiled, char *linear,
290              uint32_t linear_pitch, uint32_t block_stride,
291              const struct fdl_ubwc_config *config)
292 {
293    unsigned block_width, block_height;
294    get_block_size(cpp, &block_width, &block_height);
295    const uint32_t block_size = 256;
296 
297    uint32_t bank_mask = get_bank_mask(block_stride, cpp, config);
298    uint32_t bank_shift = get_bank_shift(config);
299    uint32_t x_mask = (get_pixel_offset(~0u, 0)) & (block_size / cpp - 1);
300    uint32_t y_mask = (get_pixel_offset(0, ~0u)) & (block_size / cpp - 1);
301 
302    /* The pitch between vertically adjacent 2K macrotiles. */
303    uint32_t macrotile_pitch = (block_stride / 2) * 2048;
304 
305    uint32_t x_block_start = x_start / block_width;
306    uint32_t y_block_start = y_start / block_height;
307 
308    tiled += (y_block_start >> 2) * macrotile_pitch;
309 
310    uint32_t x_pixel_start = get_pixel_offset(x_start % block_width, 0);
311    uint32_t y_pixel_start = get_pixel_offset(0, y_start % block_height);
312 
313    uint32_t y_block = y_block_start;
314    uint32_t y_pixel = y_pixel_start;
315    uint32_t y_xormask =
316       block_y_xormask<macrotile_mode>(y_block, cpp, bank_mask, bank_shift);
317    for (uint32_t y = 0; y < height; y++) {
318       uint32_t x_block = x_block_start;
319       uint32_t x_pixel = x_pixel_start;
320       uint32_t block_offset =
321          block_x_xormask<macrotile_mode>(x_block, cpp) ^ y_xormask;
322 
323       char *tiled_line = tiled + y_pixel * cpp;
324       char *linear_pixel = linear;
325 
326       for (uint32_t x = 0; x < width; x++) {
327          char *tiled_pixel = tiled_line + x_pixel * cpp + block_offset;
328 
329          if (direction == LINEAR_TO_TILED)
330             memcpy(tiled_pixel, linear_pixel, cpp);
331          else
332             memcpy(linear_pixel, tiled_pixel, cpp);
333 
334          x_pixel = (x_pixel - x_mask) & x_mask;
335          linear_pixel += cpp;
336 
337          if (x_pixel == 0) {
338             x_block++;
339             block_offset =
340                block_x_xormask<macrotile_mode>(x_block, cpp) ^ y_xormask;
341          }
342       }
343 
344       y_pixel = (y_pixel - y_mask) & y_mask;
345       if (y_pixel == 0) {
346          y_block++;
347          y_xormask =
348             block_y_xormask<macrotile_mode>(y_block, cpp, bank_mask, bank_shift);
349          if ((y_block & 3) == 0) {
350             tiled += macrotile_pitch;
351          }
352       }
353 
354       linear += linear_pitch;
355    }
356 }
357 
358 typedef void (*copy_fn)(char *tiled, char *linear, uint32_t linear_pitch);
359 
360 typedef uint8_t pixel8_t __attribute__((vector_size(8), aligned(8)));
361 typedef uint8_t pixel8a1_t __attribute__((vector_size(8), aligned(1)));
362 typedef uint8_t pixel16_t __attribute__((vector_size(16), aligned(16)));
363 typedef uint8_t pixel16a1_t __attribute__((vector_size(16), aligned(1)));
364 
365 /* We use memcpy_small as a fallback for copying a tile when there isn't
366  * optimized assembly, which requires a config, but because we're just copying
367  * a tile it doesn't matter which config we pass. Just pass an arbitrary valid
368  * config.
369  */
370 static const struct fdl_ubwc_config dummy_config = {
371    .highest_bank_bit = 13,
372 };
373 
374 /* We use handwritten assembly for the smaller cpp's because gcc is too dumb
375  * to register allocate the vector registers without inserting extra moves,
376  * and it can't use the post-increment register mode so it emits too many add
377  * instructions. This means a ~10% performance regression compared to the
378  * hand-written assembly in the cpp=4 case.
379  */
380 
381 static void
linear_to_tiled_1cpp(char * _tiled,char * _linear,uint32_t linear_pitch)382 linear_to_tiled_1cpp(char *_tiled, char *_linear, uint32_t linear_pitch)
383 {
384 #if DETECT_ARCH_AARCH64
385    uint32_t *tiled = (uint32_t *)_tiled;
386    for (unsigned y = 0; y < 2; y++, _linear += 4 * linear_pitch) {
387       uint16x8_t *linear0 = (uint16x8_t *)_linear;
388       uint16x8_t *linear1 = (uint16x8_t *)(_linear + linear_pitch);
389       uint16x8_t *linear2 = (uint16x8_t *)(_linear + 2 * linear_pitch);
390       uint16x8_t *linear3 = (uint16x8_t *)(_linear + 3 * linear_pitch);
391       asm volatile(
392           "ldp q0, q4, [%2]\n"
393           "ldp q1, q5, [%3]\n"
394           "ldp q2, q6, [%4]\n"
395           "ldp q3, q7, [%5]\n"
396           "zip1 v8.8h, v0.8h, v1.8h\n"
397           "zip1 v9.8h, v2.8h, v3.8h\n"
398           "zip2 v10.8h, v0.8h, v1.8h\n"
399           "zip2 v11.8h, v2.8h, v3.8h\n"
400           "zip1 v12.8h, v4.8h, v5.8h\n"
401           "zip1 v13.8h, v6.8h, v7.8h\n"
402           "zip2 v14.8h, v4.8h, v5.8h\n"
403           "zip2 v15.8h, v6.8h, v7.8h\n"
404           "st2 {v8.2d, v9.2d}, [%0], #32\n"
405           "st2 {v10.2d, v11.2d}, [%0], #32\n"
406           "st2 {v12.2d, v13.2d}, [%0], #32\n"
407           "st2 {v14.2d, v15.2d}, [%0], #32\n"
408           : "=r"(tiled)
409           : "0"(tiled), "r"(linear0), "r"(linear1), "r"(linear2), "r"(linear3)
410           : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
411             "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15");
412    }
413 #else
414    memcpy_small<1, LINEAR_TO_TILED, FDL_MACROTILE_4_CHANNEL>(
415       0, 0, 32, 8, _tiled, _linear, linear_pitch, 0, &dummy_config);
416 #endif
417 }
418 
419 static void
tiled_to_linear_1cpp(char * _tiled,char * _linear,uint32_t linear_pitch)420 tiled_to_linear_1cpp(char *_tiled, char *_linear, uint32_t linear_pitch)
421 {
422 #if DETECT_ARCH_AARCH64
423    uint32_t *tiled = (uint32_t *)_tiled;
424    for (unsigned y = 0; y < 2; y++, _linear += 4 * linear_pitch) {
425       uint16x8_t *linear0 = (uint16x8_t *)_linear;
426       uint16x8_t *linear1 = (uint16x8_t *)(_linear + linear_pitch);
427       uint16x8_t *linear2 = (uint16x8_t *)(_linear + 2 * linear_pitch);
428       uint16x8_t *linear3 = (uint16x8_t *)(_linear + 3 * linear_pitch);
429       asm volatile(
430           "ld2 {v8.2d, v9.2d}, [%0], #32\n"
431           "ld2 {v10.2d, v11.2d}, [%0], #32\n"
432           "ld2 {v12.2d, v13.2d}, [%0], #32\n"
433           "ld2 {v14.2d, v15.2d}, [%0], #32\n"
434           "uzp1 v0.8h, v8.8h, v10.8h\n"
435           "uzp2 v1.8h, v8.8h, v10.8h\n"
436           "uzp1 v2.8h, v9.8h, v11.8h\n"
437           "uzp2 v3.8h, v9.8h, v11.8h\n"
438           "uzp1 v4.8h, v12.8h, v14.8h\n"
439           "uzp2 v5.8h, v12.8h, v14.8h\n"
440           "uzp1 v6.8h, v13.8h, v15.8h\n"
441           "uzp2 v7.8h, v13.8h, v15.8h\n"
442           "stp q0, q4, [%2]\n"
443           "stp q1, q5, [%3]\n"
444           "stp q2, q6, [%4]\n"
445           "stp q3, q7, [%5]\n"
446           : "=r"(tiled)
447           : "0"(tiled), "r"(linear0), "r"(linear1), "r"(linear2), "r"(linear3)
448           : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
449             "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15");
450    }
451 #else
452    memcpy_small<1, TILED_TO_LINEAR, FDL_MACROTILE_4_CHANNEL>(
453       0, 0, 32, 8, _tiled, _linear, linear_pitch, 0, &dummy_config);
454 #endif
455 }
456 
457 static void
linear_to_tiled_2cpp(char * _tiled,char * _linear,uint32_t linear_pitch)458 linear_to_tiled_2cpp(char *_tiled, char *_linear, uint32_t linear_pitch)
459 {
460 #if DETECT_ARCH_AARCH64
461    uint32_t *tiled = (uint32_t *)_tiled;
462    for (unsigned x = 0; x < 2; x++, _linear += 32) {
463       uint32x4_t *linear0 = (uint32x4_t *)_linear;
464       uint32x4_t *linear1 = (uint32x4_t *)(_linear + linear_pitch);
465       uint32x4_t *linear2 = (uint32x4_t *)(_linear + 2 * linear_pitch);
466       uint32x4_t *linear3 = (uint32x4_t *)(_linear + 3 * linear_pitch);
467       asm volatile(
468           "ldp q0, q4, [%2]\n"
469           "ldp q1, q5, [%3]\n"
470           "ldp q2, q6, [%4]\n"
471           "ldp q3, q7, [%5]\n"
472           "zip1 v8.4s, v0.4s, v1.4s\n"
473           "zip1 v9.4s, v2.4s, v3.4s\n"
474           "zip2 v10.4s, v0.4s, v1.4s\n"
475           "zip2 v11.4s, v2.4s, v3.4s\n"
476           "zip1 v12.4s, v4.4s, v5.4s\n"
477           "zip1 v13.4s, v6.4s, v7.4s\n"
478           "zip2 v14.4s, v4.4s, v5.4s\n"
479           "zip2 v15.4s, v6.4s, v7.4s\n"
480           "stp q8, q9, [%0], #32\n"
481           "stp q10, q11, [%0], #32\n"
482           "stp q12, q13, [%0], #32\n"
483           "stp q14, q15, [%0], #32\n"
484           : "=r"(tiled)
485           : "0"(tiled), "r"(linear0), "r"(linear1), "r"(linear2), "r"(linear3)
486           : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
487             "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15");
488    }
489 #else
490    memcpy_small<2, LINEAR_TO_TILED, FDL_MACROTILE_4_CHANNEL>(
491       0, 0, 32, 4, _tiled, _linear, linear_pitch, 0, &dummy_config);
492 #endif
493 }
494 
495 static void
tiled_to_linear_2cpp(char * _tiled,char * _linear,uint32_t linear_pitch)496 tiled_to_linear_2cpp(char *_tiled, char *_linear, uint32_t linear_pitch)
497 {
498 #if DETECT_ARCH_AARCH64
499    uint32_t *tiled = (uint32_t *)_tiled;
500    for (unsigned x = 0; x < 2; x++, _linear += 32) {
501       uint32x4_t *linear0 = (uint32x4_t *)_linear;
502       uint32x4_t *linear1 = (uint32x4_t *)(_linear + linear_pitch);
503       uint32x4_t *linear2 = (uint32x4_t *)(_linear + 2 * linear_pitch);
504       uint32x4_t *linear3 = (uint32x4_t *)(_linear + 3 * linear_pitch);
505       asm volatile(
506           "ldp q8, q9, [%0], #32\n"
507           "ldp q10, q11, [%0], #32\n"
508           "ldp q12, q13, [%0], #32\n"
509           "ldp q14, q15, [%0], #32\n"
510           "uzp1 v0.4s, v8.4s, v10.4s\n"
511           "uzp2 v1.4s, v8.4s, v10.4s\n"
512           "uzp1 v2.4s, v9.4s, v11.4s\n"
513           "uzp2 v3.4s, v9.4s, v11.4s\n"
514           "uzp1 v4.4s, v12.4s, v14.4s\n"
515           "uzp2 v5.4s, v12.4s, v14.4s\n"
516           "uzp1 v6.4s, v13.4s, v15.4s\n"
517           "uzp2 v7.4s, v13.4s, v15.4s\n"
518           "stp q0, q4, [%2]\n"
519           "stp q1, q5, [%3]\n"
520           "stp q2, q6, [%4]\n"
521           "stp q3, q7, [%5]\n"
522           : "=r"(tiled)
523           : "0"(tiled), "r"(linear0), "r"(linear1), "r"(linear2), "r"(linear3)
524           : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
525             "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15");
526    }
527 #else
528    memcpy_small<2, LINEAR_TO_TILED, FDL_MACROTILE_4_CHANNEL>(
529       0, 0, 32, 4, _tiled, _linear, linear_pitch, 0, &dummy_config);
530 #endif
531 }
532 
533 static void
linear_to_tiled_4cpp(char * _tiled,char * _linear,uint32_t linear_pitch)534 linear_to_tiled_4cpp(char *_tiled, char *_linear, uint32_t linear_pitch)
535 {
536 #if DETECT_ARCH_AARCH64
537    uint64_t *tiled = (uint64_t *)_tiled;
538    uint64x2_t *linear0 = (uint64x2_t *)_linear;
539    uint64x2_t *linear1 = (uint64x2_t *)(_linear + linear_pitch);
540    uint64x2_t *linear2 = (uint64x2_t *)(_linear + 2 * linear_pitch);
541    uint64x2_t *linear3 = (uint64x2_t *)(_linear + 3 * linear_pitch);
542 
543    asm volatile(
544        "ldp q0, q4, [%2]\n"
545        "ldp q1, q5, [%3]\n"
546        "ldp q2, q6, [%4]\n"
547        "ldp q3, q7, [%5]\n"
548        "ldp q8, q12, [%2, #32]\n"
549        "ldp q9, q13, [%3, #32]\n"
550        "ldp q10, q14, [%4, #32]\n"
551        "ldp q11, q15, [%5, #32]\n"
552        "st2 {v0.2d, v1.2d}, [%0], #32\n"
553        "st2 {v2.2d, v3.2d}, [%0], #32\n"
554        "st2 {v4.2d, v5.2d}, [%0], #32\n"
555        "st2 {v6.2d, v7.2d}, [%0], #32\n"
556        "st2 {v8.2d, v9.2d}, [%0], #32\n"
557        "st2 {v10.2d, v11.2d}, [%0], #32\n"
558        "st2 {v12.2d, v13.2d}, [%0], #32\n"
559        "st2 {v14.2d, v15.2d}, [%0], #32\n"
560        : "=r"(tiled)
561        : "0"(tiled), "r"(linear0), "r"(linear1), "r"(linear2), "r"(linear3)
562        : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
563          "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15");
564 #else
565    pixel8_t *tiled = (pixel8_t *)_tiled;
566    for (unsigned x = 0; x < 4; x++, _linear += 4 * 4, tiled += 8) {
567       pixel8a1_t *linear0 = (pixel8a1_t *)_linear;
568       pixel8a1_t *linear1 = (pixel8a1_t *)(_linear + linear_pitch);
569       pixel8a1_t *linear2 = (pixel8a1_t *)(_linear + 2 * linear_pitch);
570       pixel8a1_t *linear3 = (pixel8a1_t *)(_linear + 3 * linear_pitch);
571       pixel8_t p000 = linear0[0];
572       pixel8_t p100 = linear0[1];
573       pixel8_t p001 = linear1[0];
574       pixel8_t p101 = linear1[1];
575       pixel8_t p010 = linear2[0];
576       pixel8_t p110 = linear2[1];
577       pixel8_t p011 = linear3[0];
578       pixel8_t p111 = linear3[1];
579       tiled[0] = p000;
580       tiled[1] = p001;
581       tiled[2] = p100;
582       tiled[3] = p101;
583       tiled[4] = p010;
584       tiled[5] = p011;
585       tiled[6] = p110;
586       tiled[7] = p111;
587    }
588 #endif
589 }
590 
591 static void
tiled_to_linear_4cpp(char * _tiled,char * _linear,uint32_t linear_pitch)592 tiled_to_linear_4cpp(char *_tiled, char *_linear, uint32_t linear_pitch)
593 {
594 #if DETECT_ARCH_AARCH64
595    uint64_t *tiled = (uint64_t *)_tiled;
596    uint64x2_t *linear0 = (uint64x2_t *)_linear;
597    uint64x2_t *linear1 = (uint64x2_t *)(_linear + linear_pitch);
598    uint64x2_t *linear2 = (uint64x2_t *)(_linear + 2 * linear_pitch);
599    uint64x2_t *linear3 = (uint64x2_t *)(_linear + 3 * linear_pitch);
600 
601    asm volatile(
602        "ld2 {v0.2d, v1.2d}, [%0], #32\n"
603        "ld2 {v2.2d, v3.2d}, [%0], #32\n"
604        "ld2 {v4.2d, v5.2d}, [%0], #32\n"
605        "ld2 {v6.2d, v7.2d}, [%0], #32\n"
606        "ld2 {v8.2d, v9.2d}, [%0], #32\n"
607        "ld2 {v10.2d, v11.2d}, [%0], #32\n"
608        "ld2 {v12.2d, v13.2d}, [%0], #32\n"
609        "ld2 {v14.2d, v15.2d}, [%0], #32\n"
610        "stp q0, q4, [%2]\n"
611        "stp q1, q5, [%3]\n"
612        "stp q2, q6, [%4]\n"
613        "stp q3, q7, [%5]\n"
614        "stp q8, q12, [%2, #32]\n"
615        "stp q9, q13, [%3, #32]\n"
616        "stp q10, q14, [%4, #32]\n"
617        "stp q11, q15, [%5, #32]\n"
618        : "=r"(tiled)
619        : "0"(tiled), "r"(linear0), "r"(linear1), "r"(linear2), "r"(linear3)
620        : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
621          "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15");
622 #else
623    pixel8_t *tiled = (pixel8_t *)_tiled;
624    for (unsigned x = 0; x < 4; x++, _linear += 4 * 4, tiled += 8) {
625       pixel8a1_t *linear0 = (pixel8a1_t *)_linear;
626       pixel8a1_t *linear1 = (pixel8a1_t *)(_linear + linear_pitch);
627       pixel8a1_t *linear2 = (pixel8a1_t *)(_linear + 2 * linear_pitch);
628       pixel8a1_t *linear3 = (pixel8a1_t *)(_linear + 3 * linear_pitch);
629       pixel8_t p000 = tiled[0];
630       pixel8_t p001 = tiled[1];
631       pixel8_t p100 = tiled[2];
632       pixel8_t p101 = tiled[3];
633       pixel8_t p010 = tiled[4];
634       pixel8_t p011 = tiled[5];
635       pixel8_t p110 = tiled[6];
636       pixel8_t p111 = tiled[7];
637       linear0[0] = p000;
638       linear0[1] = p100;
639       linear1[0] = p001;
640       linear1[1] = p101;
641       linear2[0] = p010;
642       linear2[1] = p110;
643       linear3[0] = p011;
644       linear3[1] = p111;
645    }
646 #endif
647 }
648 
649 static void
linear_to_tiled_8cpp(char * _tiled,char * _linear,uint32_t linear_pitch)650 linear_to_tiled_8cpp(char *_tiled, char *_linear, uint32_t linear_pitch)
651 {
652    pixel16_t *tiled = (pixel16_t *)_tiled;
653 
654    for (unsigned x = 0; x < 2; x++, _linear += 4 * 8) {
655       for (unsigned y = 0; y < 2; y++, tiled += 4) {
656          pixel16a1_t *linear0 = (pixel16a1_t *)(_linear + 2 * y * linear_pitch);
657          pixel16a1_t *linear1 = (pixel16a1_t *)(_linear + (2 * y + 1) * linear_pitch);
658          pixel16_t p00 = linear0[0];
659          pixel16_t p10 = linear0[1];
660          pixel16_t p01 = linear1[0];
661          pixel16_t p11 = linear1[1];
662          tiled[0] = p00;
663          tiled[1] = p01;
664          tiled[2] = p10;
665          tiled[3] = p11;
666       }
667    }
668 }
669 
670 static void
tiled_to_linear_8cpp(char * _tiled,char * _linear,uint32_t linear_pitch)671 tiled_to_linear_8cpp(char *_tiled, char *_linear, uint32_t linear_pitch)
672 {
673    pixel16_t *tiled = (pixel16_t *)_tiled;
674 
675    for (unsigned x = 0; x < 2; x++, _linear += 4 * 8) {
676       for (unsigned y = 0; y < 2; y++, tiled += 4) {
677          pixel16a1_t *linear0 = (pixel16a1_t *)(_linear + 2 * y * linear_pitch);
678          pixel16a1_t *linear1 = (pixel16a1_t *)(_linear + (2 * y + 1) * linear_pitch);
679          pixel16_t p00 = tiled[0];
680          pixel16_t p01 = tiled[1];
681          pixel16_t p10 = tiled[2];
682          pixel16_t p11 = tiled[3];
683          linear0[0] = p00;
684          linear0[1] = p10;
685          linear1[0] = p01;
686          linear1[1] = p11;
687       }
688    }
689 }
690 
691 static void
linear_to_tiled_16cpp(char * _tiled,char * _linear,uint32_t linear_pitch)692 linear_to_tiled_16cpp(char *_tiled, char *_linear, uint32_t linear_pitch)
693 {
694    pixel16_t *tiled = (pixel16_t *)_tiled;
695 
696    for (unsigned y = 0; y < 2; y++, _linear += 2 * linear_pitch) {
697       for (unsigned x = 0; x < 2; x++, tiled += 4) {
698          pixel16a1_t *linear0 = (pixel16a1_t *)(_linear + 2 * 16 * x);
699          pixel16a1_t *linear1 = (pixel16a1_t *)(_linear + linear_pitch + 2 * 16 * x);
700          pixel16_t p00 = linear0[0];
701          pixel16_t p10 = linear0[1];
702          pixel16_t p01 = linear1[0];
703          pixel16_t p11 = linear1[1];
704          tiled[0] = p00;
705          tiled[1] = p10;
706          tiled[2] = p01;
707          tiled[3] = p11;
708       }
709    }
710 }
711 
712 static void
tiled_to_linear_16cpp(char * _tiled,char * _linear,uint32_t linear_pitch)713 tiled_to_linear_16cpp(char *_tiled, char *_linear, uint32_t linear_pitch)
714 {
715    pixel16_t *tiled = (pixel16_t *)_tiled;
716 
717    for (unsigned y = 0; y < 2; y++, _linear += 2 * linear_pitch) {
718       for (unsigned x = 0; x < 2; x++, tiled += 4) {
719          pixel16a1_t *linear0 = (pixel16a1_t *)(_linear + 2 * 16 * x);
720          pixel16a1_t *linear1 = (pixel16a1_t *)(_linear + linear_pitch + 2 * 16 * x);
721          pixel16_t p00 = tiled[0];
722          pixel16_t p10 = tiled[1];
723          pixel16_t p01 = tiled[2];
724          pixel16_t p11 = tiled[3];
725          linear0[0] = p00;
726          linear0[1] = p10;
727          linear1[0] = p01;
728          linear1[1] = p11;
729       }
730    }
731 }
732 
733 template<unsigned cpp, enum copy_dir direction, copy_fn copy_block,
734    enum fdl_macrotile_mode macrotile_mode>
735 static void
memcpy_large(uint32_t x_start,uint32_t y_start,uint32_t width,uint32_t height,char * tiled,char * linear,uint32_t linear_pitch,uint32_t block_stride,const fdl_ubwc_config * config)736 memcpy_large(uint32_t x_start, uint32_t y_start,
737              uint32_t width, uint32_t height,
738              char *tiled, char *linear,
739              uint32_t linear_pitch, uint32_t block_stride,
740              const fdl_ubwc_config *config)
741 {
742    unsigned block_width, block_height;
743    get_block_size(cpp, &block_width, &block_height);
744 
745    /* The region to copy is divided into 9 parts:
746     *
747     *              x_start x_aligned_start    x_aligned_end x_end
748     *
749     *          y_start /--------------------------------------\
750     *                  |         |                  |         |
751     *  y_aligned_start |--------------------------------------|
752     *                  |         |                  |         |
753     *                  |         |   aligned area   |         |
754     *                  |         |                  |         |
755     *    y_aligned_end |--------------------------------------|
756     *                  |         |                  |         |
757     *            y_end \--------------------------------------/
758     *
759     * The aligned area consists of aligned blocks that we can use our
760     * optimized copy function on, but the rest consists of misaligned pieces
761     * of blocks.
762     */
763 
764    uint32_t x_end = x_start + width;
765    uint32_t x_aligned_start = align(x_start, block_width);
766    uint32_t x_aligned_end = ROUND_DOWN_TO(x_end, block_width);
767 
768    uint32_t y_end = y_start + height;
769    uint32_t y_aligned_start = align(y_start, block_height);
770    uint32_t y_aligned_end = ROUND_DOWN_TO(y_end, block_height);
771 
772    /* If we don't cover any full tiles, use the small loop */
773    if (x_aligned_end <= x_aligned_start || y_aligned_end <= y_aligned_start) {
774       memcpy_small<cpp, direction, macrotile_mode>(
775          x_start, y_start, width, height, tiled, linear, linear_pitch,
776          block_stride, config);
777       return;
778    }
779 
780    /* Handle the top third */
781    if (y_start != y_aligned_start) {
782       memcpy_small<cpp, direction, macrotile_mode>(
783          x_start, y_start, width, y_aligned_start - y_start, tiled, linear,
784          linear_pitch, block_stride, config);
785       linear += (y_aligned_start - y_start) * linear_pitch;
786    }
787 
788    /* Handle left of the aligned block */
789    char *linear_aligned = linear;
790    if (x_start != x_aligned_start) {
791       memcpy_small<cpp, direction, macrotile_mode>(
792          x_start, y_aligned_start, x_aligned_start - x_start,
793          y_aligned_end - y_aligned_start, tiled, linear, linear_pitch,
794          block_stride, config);
795       linear_aligned = linear + (x_aligned_start - x_start) * cpp;
796    }
797 
798    /* Handle the main part */
799    uint32_t macrotile_pitch = (block_stride / 2) * 2048;
800    uint32_t bank_mask = get_bank_mask(block_stride, cpp, config);
801    uint32_t bank_shift = get_bank_shift(config);
802    char *tiled_aligned =
803       tiled + macrotile_pitch * (y_aligned_start / (block_height * 4));
804 
805    for (unsigned y_block = y_aligned_start / block_height;
806         y_block < y_aligned_end / block_height;) {
807       uint32_t y_xormask =
808          block_y_xormask<macrotile_mode>(y_block, cpp, bank_mask, bank_shift);
809       char *linear_block = linear_aligned;
810 
811       for (unsigned x_block = x_aligned_start / block_width;
812            x_block < x_aligned_end / block_width; x_block++) {
813          uint32_t block_offset =
814             block_x_xormask<macrotile_mode>(x_block, cpp) ^ y_xormask;
815          copy_block(tiled_aligned + block_offset, linear_block, linear_pitch);
816          linear_block += block_width * cpp;
817       }
818 
819       linear_aligned += block_height * linear_pitch;
820 
821       y_block++;
822       if ((y_block & 3) == 0)
823          tiled_aligned += macrotile_pitch;
824    }
825 
826    /* Handle right of the aligned block */
827    if (x_end != x_aligned_end) {
828       char *linear_end =
829          linear + (x_aligned_end - x_start) * cpp;
830       memcpy_small<cpp, direction, macrotile_mode>(
831          x_aligned_end, y_aligned_start, x_end - x_aligned_end,
832          y_aligned_end - y_aligned_start, tiled, linear_end, linear_pitch,
833          block_stride, config);
834    }
835 
836    /* Handle the bottom third */
837    linear += (y_aligned_end - y_aligned_start) * linear_pitch;
838    if (y_end != y_aligned_end) {
839       memcpy_small<cpp, direction, macrotile_mode>(
840          x_start, y_aligned_end, width, y_end - y_aligned_end,
841          tiled, linear, linear_pitch, block_stride,
842          config);
843    }
844 }
845 
846 void
fdl6_memcpy_linear_to_tiled(uint32_t x_start,uint32_t y_start,uint32_t width,uint32_t height,char * dst,const char * src,const struct fdl_layout * dst_layout,unsigned dst_miplevel,uint32_t src_pitch,const struct fdl_ubwc_config * config)847 fdl6_memcpy_linear_to_tiled(uint32_t x_start, uint32_t y_start,
848                             uint32_t width, uint32_t height,
849                             char *dst, const char *src,
850                             const struct fdl_layout *dst_layout,
851                             unsigned dst_miplevel,
852                             uint32_t src_pitch,
853                             const struct fdl_ubwc_config *config)
854 {
855    unsigned block_width, block_height;
856    uint32_t cpp = dst_layout->cpp;
857    get_block_size(cpp, &block_width, &block_height);
858    uint32_t block_stride =
859       fdl_pitch(dst_layout, dst_miplevel) / (block_width * dst_layout->cpp);
860    uint32_t block_size = 256;
861    assert(block_size == block_width * block_height * dst_layout->cpp);
862    assert(config->macrotile_mode != FDL_MACROTILE_INVALID);
863 
864 #if USE_SLOW_PATH
865    for (uint32_t y = 0; y < height; y++) {
866       uint32_t y_block = (y + y_start) / block_height;
867       uint32_t y_pixel = (y + y_start) % block_height;
868       for (uint32_t x = 0; x < width; x++) {
869          uint32_t x_block = (x + x_start) / block_width;
870          uint32_t x_pixel = (x + x_start) % block_width;
871 
872          uint32_t block_offset =
873             get_block_offset(x_block, y_block, block_stride, cpp,
874                              config);
875          uint32_t pixel_offset = get_pixel_offset(x_pixel, y_pixel);
876 
877          memcpy(dst + block_size * block_offset + cpp * pixel_offset,
878                 src + y * src_pitch + x * cpp, cpp);
879       }
880    }
881 #else
882    switch (cpp) {
883 #define CASE(case_cpp)                                                        \
884    case case_cpp:                                                             \
885       if (config->macrotile_mode == FDL_MACROTILE_4_CHANNEL) {                \
886          memcpy_large<case_cpp, LINEAR_TO_TILED,                              \
887             linear_to_tiled_##case_cpp##cpp, FDL_MACROTILE_4_CHANNEL>(        \
888             x_start, y_start, width, height, dst, (char *)src, src_pitch,     \
889             block_stride, config);                                            \
890       } else {                                                                \
891          memcpy_large<case_cpp, LINEAR_TO_TILED,                              \
892             linear_to_tiled_##case_cpp##cpp,  FDL_MACROTILE_8_CHANNEL>(       \
893             x_start, y_start, width, height, dst, (char *)src, src_pitch,     \
894             block_stride, config);                                            \
895       }                                                                       \
896       break;
897    CASE(1)
898    CASE(2)
899    CASE(4)
900    CASE(8)
901    CASE(16)
902 #undef CASE
903    default:
904       unreachable("unknown cpp");
905    }
906 #endif
907 }
908 
909 void
fdl6_memcpy_tiled_to_linear(uint32_t x_start,uint32_t y_start,uint32_t width,uint32_t height,char * dst,const char * src,const struct fdl_layout * src_layout,unsigned src_miplevel,uint32_t dst_pitch,const struct fdl_ubwc_config * config)910 fdl6_memcpy_tiled_to_linear(uint32_t x_start, uint32_t y_start,
911                             uint32_t width, uint32_t height,
912                             char *dst, const char *src,
913                             const struct fdl_layout *src_layout,
914                             unsigned src_miplevel,
915                             uint32_t dst_pitch,
916                             const struct fdl_ubwc_config *config)
917 {
918    unsigned block_width, block_height;
919    unsigned cpp = src_layout->cpp;
920    get_block_size(cpp, &block_width, &block_height);
921    uint32_t block_stride =
922       fdl_pitch(src_layout, src_miplevel) / (block_width * src_layout->cpp);
923    uint32_t block_size = 256;
924    assert(block_size == block_width * block_height * src_layout->cpp);
925    assert(config->macrotile_mode != FDL_MACROTILE_INVALID);
926 
927 #if USE_SLOW_PATH
928    for (uint32_t y = 0; y < height; y++) {
929       uint32_t y_block = (y + y_start) / block_height;
930       uint32_t y_pixel = (y + y_start) % block_height;
931       for (uint32_t x = 0; x < width; x++) {
932          uint32_t x_block = (x + x_start) / block_width;
933          uint32_t x_pixel = (x + x_start) % block_width;
934 
935 	 uint32_t block_offset =
936             get_block_offset(x_block, y_block, block_stride, src_layout->cpp,
937                              config);
938 	 uint32_t pixel_offset = get_pixel_offset(x_pixel, y_pixel);
939 
940          memcpy(dst + y * dst_pitch + x * src_layout->cpp,
941                 src + block_size * block_offset + src_layout->cpp * pixel_offset,
942                 src_layout->cpp);
943       }
944    }
945 #else
946    switch (cpp) {
947 #define CASE(case_cpp)                                                        \
948    case case_cpp:                                                             \
949       if (config->macrotile_mode == FDL_MACROTILE_4_CHANNEL) {                \
950          memcpy_large<case_cpp, TILED_TO_LINEAR,                              \
951             tiled_to_linear_##case_cpp##cpp, FDL_MACROTILE_4_CHANNEL>(        \
952             x_start, y_start, width, height, (char *)src, dst, dst_pitch,     \
953             block_stride, config);                                            \
954       } else {                                                                \
955          memcpy_large<case_cpp, TILED_TO_LINEAR,                              \
956             tiled_to_linear_##case_cpp##cpp, FDL_MACROTILE_8_CHANNEL>(        \
957             x_start, y_start, width, height, (char *)src, dst, dst_pitch,     \
958             block_stride, config);                                            \
959       }                                                                       \
960       break;
961    CASE(1)
962    CASE(2)
963    CASE(4)
964    CASE(8)
965    CASE(16)
966 #undef CASE
967    default:
968       unreachable("unknown cpp");
969    }
970 #endif
971 }
972