/*
 * Copyright (c) 2023 Valve Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "freedreno_layout.h"

#if DETECT_ARCH_AARCH64
#include <arm_neon.h>
#endif

/* The tiling scheme on Qualcomm consists of four levels:
 *
 * 1. The UBWC block. Normally these use a compressed encoding format with
 *    the compressed size stored in the corresponding metadata byte. However
 *    for uncompressed blocks, or blocks in a texture where UBWC is disabled,
 *    the pixels within the block are stored using a straightforward
 *    coordinate-interleaving scheme:
 *
 *       b7 b6 b5 b4 b3 b2 b1 b0
 *       -----------------------
 *       y2 x4 x3 x2 y1 x1 y0 x0
 *
 *    Pixel contents are always stored linearly, only the pixel offset is
 *    swizzled. UBWC blocks for most formats are smaller than 256 pixels and
 *    only use the first xN and yN, ignoring the higher bits.
 *
 *    There is a special case for single-sampled R8G8 formats, where the 32x8
 *    block is stored as a 32x8 R8 tile where the left half stores the R
 *    components for each pixel and the right half stores the G components.
 *    However, non-compressed tiled R8G8 textures are not supported, so we
 *    ignore it here.
 *
 * 2. The 256 byte tile. Most UBWC blocks are 256 bytes already, but UBWC
 *    blocks for some smaller formats are only 128 bytes, so 2x1 or 2x2
 *    blocks are combined to get a 256 byte tile. This can also be thought of
 *    as re-adding bits that were dropped in the coordinate-interleaving
 *    scheme above, and we take advantage of this to fold this level into the
 *    previous one as we don't care about compression.
 *
 * 3. The 2K macrotile. This consists of 2x4 tiles, with a complicated
 *    xor-based bank swizzling scheme. There are two possible modes, chosen
 *    by the "macrotile mode" in RBBM_NC_MODE_CNTL. For formats with cpp of
 *    16 or greater, both modes are identical and the scheme is this (a short
 *    worked example follows this comment):
 *
 *       b0 = x0 ^ y1
 *       b1 = x0 ^ y1 ^ y0
 *       b2 = x0 ^ y0
 *
 *    For all formats with a cpp less than 16, additional higher-order bits
 *    are xor'ed into the upper 2 offset bits depending on the macrotile
 *    mode. In "4 channel" mode:
 *
 *       b1' = b1 ^ x1
 *
 *    and in "8 channel" mode:
 *
 *       b1' = b1 ^ x1
 *       b2' = b2 ^ x2 ^ y2
 *
 *    The macrotile stride is always a multiple of 2, so that pairs of 2K
 *    macrotiles can be considered 4K macrotiles with one additional offset
 *    bit:
 *
 *       b3 = x1
 *
 *    This is closer to the hardware representation as the base address is
 *    aligned to 4K. However for our purposes this can be folded into the
 *    next level:
 *
 * 4. Swizzled macrotile offset. The macrotile offset is usually linear,
 *    however with strides that are aligned to the number of DDR banks this
 *    can result in bank conflicts between vertically adjacent macrotiles
 *    that map to the same bank. This is mitigated by xor'ing up to 3 bits of
 *    the y offset into x, based on how aligned the stride is, before
 *    computing the offset, or equivalently by xor'ing them into the final
 *    offset. The alignment is based on a value called the "highest bank
 *    bit" that is programmed by the kernel based on the memory bank
 *    configuration.
 *
 *    The kernel also chooses which bits of y to xor in, which are called
 *    "bank swizzle levels." The naming is weird, because the lowest level,
 *    level 1, actually involves the highest bit of y:
 *    - "Level 1 bank swizzling" swizzles bit 2 of the macrotile y offset
 *      into the highest bank bit plus 1 when the stride between macrotiles
 *      (in bytes) is a multiple of 2^{hbb + 2}, where hbb is the highest
 *      bank bit.
 *    - "Level 2 bank swizzling" swizzles bit 0 of the macrotile y offset
 *      into the highest bank bit minus 1 when the stride is a multiple of
 *      2^{hbb}.
 *    - "Level 3 bank swizzling" swizzles bit 1 of the macrotile y offset
 *      into the highest bank bit when the stride is a multiple of
 *      2^{hbb + 1}.
 *
 *    Level 1 bank swizzling is only enabled in UBWC 1.0 mode. Levels 2 and 3
 *    can be selectively disabled starting with UBWC 4.0.
 *
 * This implementation uses ideas from
 * https://fgiesen.wordpress.com/2011/01/17/texture-tiling-and-swizzling/.
 * Steps 1 and 2 map straightforwardly to the ideas explained there, but step
 * 3 is very different. Luckily the offset of a block can still be split into
 * a combination of values depending only on x and y, however they may be
 * overlapping and instead of adding them together we have to xor them
 * together.
 *
 * We choose the size of the innermost loop to be the size of a block, which
 * is 256 bytes and therefore larger than strictly necessary, for two
 * reasons: it simplifies the code a bit by not having to keep track of
 * separate block sizes and "inner" block sizes, and in some cases a
 * cacheline-sized inner tile wouldn't be wide enough to use ldp to get the
 * fastest-possible 32 byte load.
 */
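/* As a rough worked example of the level-3 scheme above: for a format with
 * cpp >= 16, the 256-byte block at block coordinates (x, y) = (3, 2) within
 * a macrotile has x0 = 1, y0 = 0, y1 = 1, so
 *
 *    b0 = x0 ^ y1      = 0
 *    b1 = x0 ^ y1 ^ y0 = 0
 *    b2 = x0 ^ y0      = 1
 *
 * i.e. it is tile index 0b100 = 4 of the 8 tiles in its 2K macrotile.
 * block_x_xormask()/block_y_xormask() below compute the same bits, split
 * into an x-only and a y-only contribution that are xor'ed together.
 */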

#define USE_SLOW_PATH 0

static uint32_t
get_pixel_offset(uint32_t x, uint32_t y)
{
   return
      (x & 1) |
      (y & 1) << 1 |
      ((x & 2) >> 1) << 2 |
      ((y & 2) >> 1) << 3 |
      ((x & 0x1c) >> 2) << 4 |
      ((y & 4) >> 2) << 7;
}
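/* For example (matching the bit table at the top of this file),
 * get_pixel_offset(5, 3): x = 0b101, y = 0b011, so x0 = 1, x1 = 0, x2 = 1,
 * y0 = 1, y1 = 1, giving the interleaved offset 0b00011011 = 27 pixels into
 * the block.
 */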

/* Take the x and y block coordinates and return two masks which when
 * combined give us the block offset in bytes. This includes the block
 * offset within a macrotile and the macrotile x offset, which is swizzled
 * based on the highest bank bit and enabled levels, but not the macrotile y
 * offset which has to be added separately. (A worked check that these masks
 * reproduce the xor scheme described above follows the specializations
 * below.)
 *
 * This partially depends on the macrotile mode and block_x_xormask is
 * called in the hot path, so we have to templatize it.
 */

template<enum fdl_macrotile_mode macrotile_mode>
static uint32_t
block_x_xormask(uint32_t x, uint32_t cpp);

template<>
uint32_t
block_x_xormask<FDL_MACROTILE_4_CHANNEL>(uint32_t x, uint32_t cpp)
{
   return (((x & 1) * 0b111) ^ (cpp < 16 ? (x & 0b010) : 0) ^ ((x >> 1) << 3)) << 8;
}

template<>
uint32_t
block_x_xormask<FDL_MACROTILE_8_CHANNEL>(uint32_t x, uint32_t cpp)
{
   return (((x & 1) * 0b111) ^ (cpp < 16 ? (x & 0b110) : 0) ^ ((x >> 1) << 3)) << 8;
}

template<enum fdl_macrotile_mode macrotile_mode>
static uint32_t
block_y_xormask(uint32_t y, uint32_t cpp, uint32_t bank_mask, uint32_t bank_shift);

template<>
uint32_t
block_y_xormask<FDL_MACROTILE_4_CHANNEL>(uint32_t y, uint32_t cpp,
                                         uint32_t bank_mask,
                                         uint32_t bank_shift)
{
   return ((((y & 1) * 0b110) ^ (((y >> 1) & 1) * 0b011)) << 8) |
          ((y & bank_mask) << bank_shift);
}

template<>
uint32_t
block_y_xormask<FDL_MACROTILE_8_CHANNEL>(uint32_t y, uint32_t cpp,
                                         uint32_t bank_mask,
                                         uint32_t bank_shift)
{
   return ((((y & 1) * 0b110) ^ (((y >> 1) & 1) * 0b011) ^
            (cpp < 16 ? (y & 0b100) : 0)) << 8) |
          ((y & bank_mask) << bank_shift);
}
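/* Sanity check against the scheme described at the top of the file, for
 * 4-channel mode and cpp >= 16: block_x_xormask() contributes x0 to bits
 * 0-2 (and x >> 1 to bits 3 and up), while block_y_xormask() contributes y0
 * to bits 1-2 and y1 to bits 0-1. Xor'ing the two therefore gives
 *
 *    b0 = x0 ^ y1
 *    b1 = x0 ^ y0 ^ y1
 *    b2 = x0 ^ y0
 *
 * which matches the 2K macrotile swizzle above. The bank_mask/bank_shift
 * term adds the level-4 bank swizzle bits on top.
 */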

/* Figure out how y is swizzled into x based on the UBWC config and block
 * stride and return values to be plugged into block_y_xormask().
 */

static uint32_t
get_bank_mask(uint32_t block_stride, uint32_t cpp,
              const struct fdl_ubwc_config *config)
{
   /* For some reason, for cpp=1 (or R8G8 media formats) the alignment
    * required is doubled.
    */
   unsigned offset = cpp == 1 ? 1 : 0;
   uint32_t mask = 0;
   if ((config->bank_swizzle_levels & 0x2) &&
       (block_stride & ((1u << (config->highest_bank_bit - 10 + offset)) - 1)) == 0)
      mask |= 0b100;
   if ((config->bank_swizzle_levels & 0x4) &&
       (block_stride & ((1u << (config->highest_bank_bit - 9 + offset)) - 1)) == 0)
      mask |= 0b1000;
   if ((config->bank_swizzle_levels & 0x1) &&
       (block_stride & ((1u << (config->highest_bank_bit - 8 + offset)) - 1)) == 0)
      mask |= 0b10000;
   return mask;
}

static uint32_t
get_bank_shift(const struct fdl_ubwc_config *config)
{
   return config->highest_bank_bit - 3;
}
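/* As an illustration with hypothetical values: for a config with
 * highest_bank_bit = 15 and bank_swizzle_levels = 0b110 (levels 2 and 3
 * enabled), a cpp=4 image whose macrotile rows are 2^16 bytes apart has
 * block_stride = 2^16 / 1024 = 64, so get_bank_mask() returns 0b1100 and
 * get_bank_shift() returns 12: block y bit 2 (macrotile y bit 0) lands at
 * byte offset bit 14 (hbb - 1) and block y bit 3 (macrotile y bit 1) at
 * bit 15 (hbb), matching the level 2 and level 3 descriptions above.
 */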

#if USE_SLOW_PATH
static uint32_t
get_block_offset(uint32_t x, uint32_t y, unsigned block_stride, unsigned cpp,
                 const struct fdl_ubwc_config *config)
{
   uint32_t bank_mask = get_bank_mask(block_stride, cpp, config);
   unsigned bank_shift = get_bank_shift(config);
   uint32_t x_mask, y_mask;
   if (config->macrotile_mode == FDL_MACROTILE_4_CHANNEL) {
      x_mask = block_x_xormask<FDL_MACROTILE_4_CHANNEL>(x, cpp);
      y_mask = block_y_xormask<FDL_MACROTILE_4_CHANNEL>(y, cpp, bank_mask,
                                                        bank_shift);
   } else {
      x_mask = block_x_xormask<FDL_MACROTILE_8_CHANNEL>(x, cpp);
      y_mask = block_y_xormask<FDL_MACROTILE_8_CHANNEL>(y, cpp, bank_mask,
                                                        bank_shift);
   }
   uint32_t macrotile_y = y >> 2;
   uint32_t macrotile_stride = block_stride / 2;
   return ((x_mask ^ y_mask) >> 8) + ((macrotile_y * macrotile_stride) << 3);
}
#endif

static void
get_block_size(unsigned cpp, uint32_t *block_width,
               uint32_t *block_height)
{
   switch (cpp) {
   case 1:
      *block_width = 32;
      *block_height = 8;
      break;
   case 2:
      *block_width = 32;
      *block_height = 4;
      break;
   case 4:
      *block_width = 16;
      *block_height = 4;
      break;
   case 8:
      *block_width = 8;
      *block_height = 4;
      break;
   case 16:
      *block_width = 4;
      *block_height = 4;
      break;
   default:
      unreachable("unknown cpp");
   }
}
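/* In every case block_width * block_height * cpp == 256, i.e. these are
 * just the pixel dimensions of the 256-byte tile for each format size
 * (asserted in the entry points below).
 */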

enum copy_dir {
   LINEAR_TO_TILED,
   TILED_TO_LINEAR,
};

template<unsigned cpp, enum copy_dir direction,
         enum fdl_macrotile_mode macrotile_mode>
static void
memcpy_small(uint32_t x_start, uint32_t y_start,
             uint32_t width, uint32_t height,
             char *tiled, char *linear,
             uint32_t linear_pitch, uint32_t block_stride,
             const struct fdl_ubwc_config *config)
{
   unsigned block_width, block_height;
   get_block_size(cpp, &block_width, &block_height);
   const uint32_t block_size = 256;

   uint32_t bank_mask = get_bank_mask(block_stride, cpp, config);
   uint32_t bank_shift = get_bank_shift(config);
   uint32_t x_mask = (get_pixel_offset(~0u, 0)) & (block_size / cpp - 1);
   uint32_t y_mask = (get_pixel_offset(0, ~0u)) & (block_size / cpp - 1);
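   /* x_mask/y_mask select the offset bits that depend on x and y
    * respectively. Stepping "x_pixel = (x_pixel - x_mask) & x_mask" below
    * advances x by one pixel while leaving the y-derived bits alone (the
    * subtraction borrows through the bits not in the mask), following the
    * fgiesen article referenced at the top of the file. For example, with
    * cpp=4 the x-derived bits are 0b110101 and, starting from 0, the
    * sequence is 0b000001, 0b000100, 0b000101, 0b010000, ...
    */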

   /* The pitch between vertically adjacent 2K macrotiles. */
   uint32_t macrotile_pitch = (block_stride / 2) * 2048;

   uint32_t x_block_start = x_start / block_width;
   uint32_t y_block_start = y_start / block_height;

   tiled += (y_block_start >> 2) * macrotile_pitch;

   uint32_t x_pixel_start = get_pixel_offset(x_start % block_width, 0);
   uint32_t y_pixel_start = get_pixel_offset(0, y_start % block_height);

   uint32_t y_block = y_block_start;
   uint32_t y_pixel = y_pixel_start;
   uint32_t y_xormask =
      block_y_xormask<macrotile_mode>(y_block, cpp, bank_mask, bank_shift);
   for (uint32_t y = 0; y < height; y++) {
      uint32_t x_block = x_block_start;
      uint32_t x_pixel = x_pixel_start;
      uint32_t block_offset =
         block_x_xormask<macrotile_mode>(x_block, cpp) ^ y_xormask;

      char *tiled_line = tiled + y_pixel * cpp;
      char *linear_pixel = linear;

      for (uint32_t x = 0; x < width; x++) {
         char *tiled_pixel = tiled_line + x_pixel * cpp + block_offset;

         if (direction == LINEAR_TO_TILED)
            memcpy(tiled_pixel, linear_pixel, cpp);
         else
            memcpy(linear_pixel, tiled_pixel, cpp);

         x_pixel = (x_pixel - x_mask) & x_mask;
         linear_pixel += cpp;

         if (x_pixel == 0) {
            x_block++;
            block_offset =
               block_x_xormask<macrotile_mode>(x_block, cpp) ^ y_xormask;
         }
      }

      y_pixel = (y_pixel - y_mask) & y_mask;
      if (y_pixel == 0) {
         y_block++;
         y_xormask =
            block_y_xormask<macrotile_mode>(y_block, cpp, bank_mask, bank_shift);
         if ((y_block & 3) == 0) {
            tiled += macrotile_pitch;
         }
      }

      linear += linear_pitch;
   }
}

typedef void (*copy_fn)(char *tiled, char *linear, uint32_t linear_pitch);

typedef uint8_t pixel8_t __attribute__((vector_size(8), aligned(8)));
typedef uint8_t pixel8a1_t __attribute__((vector_size(8), aligned(1)));
typedef uint8_t pixel16_t __attribute__((vector_size(16), aligned(16)));
typedef uint8_t pixel16a1_t __attribute__((vector_size(16), aligned(1)));

/* We use memcpy_small as a fallback for copying a tile when there isn't
 * optimized assembly, which requires a config, but because we're just
 * copying a tile it doesn't matter which config we pass. Just pass an
 * arbitrary valid config.
 */
static const struct fdl_ubwc_config dummy_config = {
   .highest_bank_bit = 13,
};

/* We use handwritten assembly for the smaller cpps because gcc is too dumb
 * to register-allocate the vector registers without inserting extra moves,
 * and it can't use the post-increment addressing mode, so it emits too many
 * add instructions. This makes for a ~10% performance regression compared
 * to the hand-written assembly in the cpp=4 case.
 */

static void
linear_to_tiled_1cpp(char *_tiled, char *_linear, uint32_t linear_pitch)
{
#if DETECT_ARCH_AARCH64
   uint32_t *tiled = (uint32_t *)_tiled;
   for (unsigned y = 0; y < 2; y++, _linear += 4 * linear_pitch) {
      uint16x8_t *linear0 = (uint16x8_t *)_linear;
      uint16x8_t *linear1 = (uint16x8_t *)(_linear + linear_pitch);
      uint16x8_t *linear2 = (uint16x8_t *)(_linear + 2 * linear_pitch);
      uint16x8_t *linear3 = (uint16x8_t *)(_linear + 3 * linear_pitch);
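      /* Each zip1/zip2 interleaves 16-bit (2-pixel) chunks from two adjacent
       * rows, matching the x0/y0 interleave of the cpp=1 pixel offset; the
       * st2 stores then interleave 8-byte halves of the row-0/1 and row-2/3
       * results so that y1 ends up above x1, walking the tile in offset
       * order.
       */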
      asm volatile(
         "ldp q0, q4, [%2]\n"
         "ldp q1, q5, [%3]\n"
         "ldp q2, q6, [%4]\n"
         "ldp q3, q7, [%5]\n"
         "zip1 v8.8h, v0.8h, v1.8h\n"
         "zip1 v9.8h, v2.8h, v3.8h\n"
         "zip2 v10.8h, v0.8h, v1.8h\n"
         "zip2 v11.8h, v2.8h, v3.8h\n"
         "zip1 v12.8h, v4.8h, v5.8h\n"
         "zip1 v13.8h, v6.8h, v7.8h\n"
         "zip2 v14.8h, v4.8h, v5.8h\n"
         "zip2 v15.8h, v6.8h, v7.8h\n"
         "st2 {v8.2d, v9.2d}, [%0], #32\n"
         "st2 {v10.2d, v11.2d}, [%0], #32\n"
         "st2 {v12.2d, v13.2d}, [%0], #32\n"
         "st2 {v14.2d, v15.2d}, [%0], #32\n"
         : "=r"(tiled)
         : "0"(tiled), "r"(linear0), "r"(linear1), "r"(linear2), "r"(linear3)
         : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
           "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15");
   }
#else
   memcpy_small<1, LINEAR_TO_TILED, FDL_MACROTILE_4_CHANNEL>(
      0, 0, 32, 8, _tiled, _linear, linear_pitch, 0, &dummy_config);
#endif
}

static void
tiled_to_linear_1cpp(char *_tiled, char *_linear, uint32_t linear_pitch)
{
#if DETECT_ARCH_AARCH64
   uint32_t *tiled = (uint32_t *)_tiled;
   for (unsigned y = 0; y < 2; y++, _linear += 4 * linear_pitch) {
      uint16x8_t *linear0 = (uint16x8_t *)_linear;
      uint16x8_t *linear1 = (uint16x8_t *)(_linear + linear_pitch);
      uint16x8_t *linear2 = (uint16x8_t *)(_linear + 2 * linear_pitch);
      uint16x8_t *linear3 = (uint16x8_t *)(_linear + 3 * linear_pitch);
      asm volatile(
         "ld2 {v8.2d, v9.2d}, [%0], #32\n"
         "ld2 {v10.2d, v11.2d}, [%0], #32\n"
         "ld2 {v12.2d, v13.2d}, [%0], #32\n"
         "ld2 {v14.2d, v15.2d}, [%0], #32\n"
         "uzp1 v0.8h, v8.8h, v10.8h\n"
         "uzp2 v1.8h, v8.8h, v10.8h\n"
         "uzp1 v2.8h, v9.8h, v11.8h\n"
         "uzp2 v3.8h, v9.8h, v11.8h\n"
         "uzp1 v4.8h, v12.8h, v14.8h\n"
         "uzp2 v5.8h, v12.8h, v14.8h\n"
         "uzp1 v6.8h, v13.8h, v15.8h\n"
         "uzp2 v7.8h, v13.8h, v15.8h\n"
         "stp q0, q4, [%2]\n"
         "stp q1, q5, [%3]\n"
         "stp q2, q6, [%4]\n"
         "stp q3, q7, [%5]\n"
         : "=r"(tiled)
         : "0"(tiled), "r"(linear0), "r"(linear1), "r"(linear2), "r"(linear3)
         : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
           "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15");
   }
#else
   memcpy_small<1, TILED_TO_LINEAR, FDL_MACROTILE_4_CHANNEL>(
      0, 0, 32, 8, _tiled, _linear, linear_pitch, 0, &dummy_config);
#endif
}

static void
linear_to_tiled_2cpp(char *_tiled, char *_linear, uint32_t linear_pitch)
{
#if DETECT_ARCH_AARCH64
   uint32_t *tiled = (uint32_t *)_tiled;
   for (unsigned x = 0; x < 2; x++, _linear += 32) {
      uint32x4_t *linear0 = (uint32x4_t *)_linear;
      uint32x4_t *linear1 = (uint32x4_t *)(_linear + linear_pitch);
      uint32x4_t *linear2 = (uint32x4_t *)(_linear + 2 * linear_pitch);
      uint32x4_t *linear3 = (uint32x4_t *)(_linear + 3 * linear_pitch);
      asm volatile(
         "ldp q0, q4, [%2]\n"
         "ldp q1, q5, [%3]\n"
         "ldp q2, q6, [%4]\n"
         "ldp q3, q7, [%5]\n"
         "zip1 v8.4s, v0.4s, v1.4s\n"
         "zip1 v9.4s, v2.4s, v3.4s\n"
         "zip2 v10.4s, v0.4s, v1.4s\n"
         "zip2 v11.4s, v2.4s, v3.4s\n"
         "zip1 v12.4s, v4.4s, v5.4s\n"
         "zip1 v13.4s, v6.4s, v7.4s\n"
         "zip2 v14.4s, v4.4s, v5.4s\n"
         "zip2 v15.4s, v6.4s, v7.4s\n"
         "stp q8, q9, [%0], #32\n"
         "stp q10, q11, [%0], #32\n"
         "stp q12, q13, [%0], #32\n"
         "stp q14, q15, [%0], #32\n"
         : "=r"(tiled)
         : "0"(tiled), "r"(linear0), "r"(linear1), "r"(linear2), "r"(linear3)
         : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
           "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15");
   }
#else
   memcpy_small<2, LINEAR_TO_TILED, FDL_MACROTILE_4_CHANNEL>(
      0, 0, 32, 4, _tiled, _linear, linear_pitch, 0, &dummy_config);
#endif
}

static void
tiled_to_linear_2cpp(char *_tiled, char *_linear, uint32_t linear_pitch)
{
#if DETECT_ARCH_AARCH64
   uint32_t *tiled = (uint32_t *)_tiled;
   for (unsigned x = 0; x < 2; x++, _linear += 32) {
      uint32x4_t *linear0 = (uint32x4_t *)_linear;
      uint32x4_t *linear1 = (uint32x4_t *)(_linear + linear_pitch);
      uint32x4_t *linear2 = (uint32x4_t *)(_linear + 2 * linear_pitch);
      uint32x4_t *linear3 = (uint32x4_t *)(_linear + 3 * linear_pitch);
      asm volatile(
         "ldp q8, q9, [%0], #32\n"
         "ldp q10, q11, [%0], #32\n"
         "ldp q12, q13, [%0], #32\n"
         "ldp q14, q15, [%0], #32\n"
         "uzp1 v0.4s, v8.4s, v10.4s\n"
         "uzp2 v1.4s, v8.4s, v10.4s\n"
         "uzp1 v2.4s, v9.4s, v11.4s\n"
         "uzp2 v3.4s, v9.4s, v11.4s\n"
         "uzp1 v4.4s, v12.4s, v14.4s\n"
         "uzp2 v5.4s, v12.4s, v14.4s\n"
         "uzp1 v6.4s, v13.4s, v15.4s\n"
         "uzp2 v7.4s, v13.4s, v15.4s\n"
         "stp q0, q4, [%2]\n"
         "stp q1, q5, [%3]\n"
         "stp q2, q6, [%4]\n"
         "stp q3, q7, [%5]\n"
         : "=r"(tiled)
         : "0"(tiled), "r"(linear0), "r"(linear1), "r"(linear2), "r"(linear3)
         : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
           "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15");
   }
#else
   memcpy_small<2, TILED_TO_LINEAR, FDL_MACROTILE_4_CHANNEL>(
      0, 0, 32, 4, _tiled, _linear, linear_pitch, 0, &dummy_config);
#endif
}

static void
linear_to_tiled_4cpp(char *_tiled, char *_linear, uint32_t linear_pitch)
{
#if DETECT_ARCH_AARCH64
   uint64_t *tiled = (uint64_t *)_tiled;
   uint64x2_t *linear0 = (uint64x2_t *)_linear;
   uint64x2_t *linear1 = (uint64x2_t *)(_linear + linear_pitch);
   uint64x2_t *linear2 = (uint64x2_t *)(_linear + 2 * linear_pitch);
   uint64x2_t *linear3 = (uint64x2_t *)(_linear + 3 * linear_pitch);

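   /* For cpp=4, 8 bytes already hold two horizontally adjacent pixels, so no
    * in-register shuffling is needed: st2 on 64-bit lanes interleaves the
    * 2-pixel chunks of two adjacent rows directly into tiled order.
    */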
   asm volatile(
      "ldp q0, q4, [%2]\n"
      "ldp q1, q5, [%3]\n"
      "ldp q2, q6, [%4]\n"
      "ldp q3, q7, [%5]\n"
      "ldp q8, q12, [%2, #32]\n"
      "ldp q9, q13, [%3, #32]\n"
      "ldp q10, q14, [%4, #32]\n"
      "ldp q11, q15, [%5, #32]\n"
      "st2 {v0.2d, v1.2d}, [%0], #32\n"
      "st2 {v2.2d, v3.2d}, [%0], #32\n"
      "st2 {v4.2d, v5.2d}, [%0], #32\n"
      "st2 {v6.2d, v7.2d}, [%0], #32\n"
      "st2 {v8.2d, v9.2d}, [%0], #32\n"
      "st2 {v10.2d, v11.2d}, [%0], #32\n"
      "st2 {v12.2d, v13.2d}, [%0], #32\n"
      "st2 {v14.2d, v15.2d}, [%0], #32\n"
      : "=r"(tiled)
      : "0"(tiled), "r"(linear0), "r"(linear1), "r"(linear2), "r"(linear3)
      : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
        "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15");
#else
   pixel8_t *tiled = (pixel8_t *)_tiled;
   for (unsigned x = 0; x < 4; x++, _linear += 4 * 4, tiled += 8) {
      pixel8a1_t *linear0 = (pixel8a1_t *)_linear;
      pixel8a1_t *linear1 = (pixel8a1_t *)(_linear + linear_pitch);
      pixel8a1_t *linear2 = (pixel8a1_t *)(_linear + 2 * linear_pitch);
      pixel8a1_t *linear3 = (pixel8a1_t *)(_linear + 3 * linear_pitch);
      pixel8_t p000 = linear0[0];
      pixel8_t p100 = linear0[1];
      pixel8_t p001 = linear1[0];
      pixel8_t p101 = linear1[1];
      pixel8_t p010 = linear2[0];
      pixel8_t p110 = linear2[1];
      pixel8_t p011 = linear3[0];
      pixel8_t p111 = linear3[1];
      tiled[0] = p000;
      tiled[1] = p001;
      tiled[2] = p100;
      tiled[3] = p101;
      tiled[4] = p010;
      tiled[5] = p011;
      tiled[6] = p110;
      tiled[7] = p111;
   }
#endif
}

static void
tiled_to_linear_4cpp(char *_tiled, char *_linear, uint32_t linear_pitch)
{
#if DETECT_ARCH_AARCH64
   uint64_t *tiled = (uint64_t *)_tiled;
   uint64x2_t *linear0 = (uint64x2_t *)_linear;
   uint64x2_t *linear1 = (uint64x2_t *)(_linear + linear_pitch);
   uint64x2_t *linear2 = (uint64x2_t *)(_linear + 2 * linear_pitch);
   uint64x2_t *linear3 = (uint64x2_t *)(_linear + 3 * linear_pitch);

   asm volatile(
      "ld2 {v0.2d, v1.2d}, [%0], #32\n"
      "ld2 {v2.2d, v3.2d}, [%0], #32\n"
      "ld2 {v4.2d, v5.2d}, [%0], #32\n"
      "ld2 {v6.2d, v7.2d}, [%0], #32\n"
      "ld2 {v8.2d, v9.2d}, [%0], #32\n"
      "ld2 {v10.2d, v11.2d}, [%0], #32\n"
      "ld2 {v12.2d, v13.2d}, [%0], #32\n"
      "ld2 {v14.2d, v15.2d}, [%0], #32\n"
      "stp q0, q4, [%2]\n"
      "stp q1, q5, [%3]\n"
      "stp q2, q6, [%4]\n"
      "stp q3, q7, [%5]\n"
      "stp q8, q12, [%2, #32]\n"
      "stp q9, q13, [%3, #32]\n"
      "stp q10, q14, [%4, #32]\n"
      "stp q11, q15, [%5, #32]\n"
      : "=r"(tiled)
      : "0"(tiled), "r"(linear0), "r"(linear1), "r"(linear2), "r"(linear3)
      : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
        "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15");
#else
   pixel8_t *tiled = (pixel8_t *)_tiled;
   for (unsigned x = 0; x < 4; x++, _linear += 4 * 4, tiled += 8) {
      pixel8a1_t *linear0 = (pixel8a1_t *)_linear;
      pixel8a1_t *linear1 = (pixel8a1_t *)(_linear + linear_pitch);
      pixel8a1_t *linear2 = (pixel8a1_t *)(_linear + 2 * linear_pitch);
      pixel8a1_t *linear3 = (pixel8a1_t *)(_linear + 3 * linear_pitch);
      pixel8_t p000 = tiled[0];
      pixel8_t p001 = tiled[1];
      pixel8_t p100 = tiled[2];
      pixel8_t p101 = tiled[3];
      pixel8_t p010 = tiled[4];
      pixel8_t p011 = tiled[5];
      pixel8_t p110 = tiled[6];
      pixel8_t p111 = tiled[7];
      linear0[0] = p000;
      linear0[1] = p100;
      linear1[0] = p001;
      linear1[1] = p101;
      linear2[0] = p010;
      linear2[1] = p110;
      linear3[0] = p011;
      linear3[1] = p111;
   }
#endif
}

static void
linear_to_tiled_8cpp(char *_tiled, char *_linear, uint32_t linear_pitch)
{
   pixel16_t *tiled = (pixel16_t *)_tiled;

   for (unsigned x = 0; x < 2; x++, _linear += 4 * 8) {
      for (unsigned y = 0; y < 2; y++, tiled += 4) {
         pixel16a1_t *linear0 = (pixel16a1_t *)(_linear + 2 * y * linear_pitch);
         pixel16a1_t *linear1 = (pixel16a1_t *)(_linear + (2 * y + 1) * linear_pitch);
         pixel16_t p00 = linear0[0];
         pixel16_t p10 = linear0[1];
         pixel16_t p01 = linear1[0];
         pixel16_t p11 = linear1[1];
         tiled[0] = p00;
         tiled[1] = p01;
         tiled[2] = p10;
         tiled[3] = p11;
      }
   }
}

static void
tiled_to_linear_8cpp(char *_tiled, char *_linear, uint32_t linear_pitch)
{
   pixel16_t *tiled = (pixel16_t *)_tiled;

   for (unsigned x = 0; x < 2; x++, _linear += 4 * 8) {
      for (unsigned y = 0; y < 2; y++, tiled += 4) {
         pixel16a1_t *linear0 = (pixel16a1_t *)(_linear + 2 * y * linear_pitch);
         pixel16a1_t *linear1 = (pixel16a1_t *)(_linear + (2 * y + 1) * linear_pitch);
         pixel16_t p00 = tiled[0];
         pixel16_t p01 = tiled[1];
         pixel16_t p10 = tiled[2];
         pixel16_t p11 = tiled[3];
         linear0[0] = p00;
         linear0[1] = p10;
         linear1[0] = p01;
         linear1[1] = p11;
      }
   }
}

static void
linear_to_tiled_16cpp(char *_tiled, char *_linear, uint32_t linear_pitch)
{
   pixel16_t *tiled = (pixel16_t *)_tiled;

   for (unsigned y = 0; y < 2; y++, _linear += 2 * linear_pitch) {
      for (unsigned x = 0; x < 2; x++, tiled += 4) {
         pixel16a1_t *linear0 = (pixel16a1_t *)(_linear + 2 * 16 * x);
         pixel16a1_t *linear1 = (pixel16a1_t *)(_linear + linear_pitch + 2 * 16 * x);
         pixel16_t p00 = linear0[0];
         pixel16_t p10 = linear0[1];
         pixel16_t p01 = linear1[0];
         pixel16_t p11 = linear1[1];
         tiled[0] = p00;
         tiled[1] = p10;
         tiled[2] = p01;
         tiled[3] = p11;
      }
   }
}

static void
tiled_to_linear_16cpp(char *_tiled, char *_linear, uint32_t linear_pitch)
{
   pixel16_t *tiled = (pixel16_t *)_tiled;

   for (unsigned y = 0; y < 2; y++, _linear += 2 * linear_pitch) {
      for (unsigned x = 0; x < 2; x++, tiled += 4) {
         pixel16a1_t *linear0 = (pixel16a1_t *)(_linear + 2 * 16 * x);
         pixel16a1_t *linear1 = (pixel16a1_t *)(_linear + linear_pitch + 2 * 16 * x);
         pixel16_t p00 = tiled[0];
         pixel16_t p10 = tiled[1];
         pixel16_t p01 = tiled[2];
         pixel16_t p11 = tiled[3];
         linear0[0] = p00;
         linear0[1] = p10;
         linear1[0] = p01;
         linear1[1] = p11;
      }
   }
}

template<unsigned cpp, enum copy_dir direction, copy_fn copy_block,
         enum fdl_macrotile_mode macrotile_mode>
static void
memcpy_large(uint32_t x_start, uint32_t y_start,
             uint32_t width, uint32_t height,
             char *tiled, char *linear,
             uint32_t linear_pitch, uint32_t block_stride,
             const fdl_ubwc_config *config)
{
   unsigned block_width, block_height;
   get_block_size(cpp, &block_width, &block_height);

   /* The region to copy is divided into 9 parts:
    *
    *    x_start   x_aligned_start        x_aligned_end   x_end
    *
    * y_start          /--------------------------------------\
    *                  |        |                    |        |
    * y_aligned_start  |--------------------------------------|
    *                  |        |                    |        |
    *                  |        |    aligned area    |        |
    *                  |        |                    |        |
    * y_aligned_end    |--------------------------------------|
    *                  |        |                    |        |
    * y_end            \--------------------------------------/
    *
    * The aligned area consists of aligned blocks that we can use our
    * optimized copy function on, but the rest consists of misaligned pieces
    * of blocks.
    */
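   /* As an illustration: for cpp=4 (16x4 blocks), copying a 100x30 region
    * starting at (x, y) = (10, 5) gives x_aligned_start = 16,
    * x_aligned_end = ROUND_DOWN_TO(110, 16) = 96, y_aligned_start = 8 and
    * y_aligned_end = ROUND_DOWN_TO(35, 4) = 32, so the blocks covering
    * x = 16..95, y = 8..31 go through the fast per-block copy and the
    * remaining border strips go through memcpy_small().
    */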

   uint32_t x_end = x_start + width;
   uint32_t x_aligned_start = align(x_start, block_width);
   uint32_t x_aligned_end = ROUND_DOWN_TO(x_end, block_width);

   uint32_t y_end = y_start + height;
   uint32_t y_aligned_start = align(y_start, block_height);
   uint32_t y_aligned_end = ROUND_DOWN_TO(y_end, block_height);

   /* If we don't cover any full tiles, use the small loop */
   if (x_aligned_end <= x_aligned_start || y_aligned_end <= y_aligned_start) {
      memcpy_small<cpp, direction, macrotile_mode>(
         x_start, y_start, width, height, tiled, linear, linear_pitch,
         block_stride, config);
      return;
   }

   /* Handle the top third */
   if (y_start != y_aligned_start) {
      memcpy_small<cpp, direction, macrotile_mode>(
         x_start, y_start, width, y_aligned_start - y_start, tiled, linear,
         linear_pitch, block_stride, config);
      linear += (y_aligned_start - y_start) * linear_pitch;
   }

   /* Handle left of the aligned block */
   char *linear_aligned = linear;
   if (x_start != x_aligned_start) {
      memcpy_small<cpp, direction, macrotile_mode>(
         x_start, y_aligned_start, x_aligned_start - x_start,
         y_aligned_end - y_aligned_start, tiled, linear, linear_pitch,
         block_stride, config);
      linear_aligned = linear + (x_aligned_start - x_start) * cpp;
   }

   /* Handle the main part */
   uint32_t macrotile_pitch = (block_stride / 2) * 2048;
   uint32_t bank_mask = get_bank_mask(block_stride, cpp, config);
   uint32_t bank_shift = get_bank_shift(config);
   char *tiled_aligned =
      tiled + macrotile_pitch * (y_aligned_start / (block_height * 4));

   for (unsigned y_block = y_aligned_start / block_height;
        y_block < y_aligned_end / block_height;) {
      uint32_t y_xormask =
         block_y_xormask<macrotile_mode>(y_block, cpp, bank_mask, bank_shift);
      char *linear_block = linear_aligned;

      for (unsigned x_block = x_aligned_start / block_width;
           x_block < x_aligned_end / block_width; x_block++) {
         uint32_t block_offset =
            block_x_xormask<macrotile_mode>(x_block, cpp) ^ y_xormask;
         copy_block(tiled_aligned + block_offset, linear_block, linear_pitch);
         linear_block += block_width * cpp;
      }

      linear_aligned += block_height * linear_pitch;

      y_block++;
      if ((y_block & 3) == 0)
         tiled_aligned += macrotile_pitch;
   }

   /* Handle right of the aligned block */
   if (x_end != x_aligned_end) {
      char *linear_end =
         linear + (x_aligned_end - x_start) * cpp;
      memcpy_small<cpp, direction, macrotile_mode>(
         x_aligned_end, y_aligned_start, x_end - x_aligned_end,
         y_aligned_end - y_aligned_start, tiled, linear_end, linear_pitch,
         block_stride, config);
   }

   /* Handle the bottom third */
   linear += (y_aligned_end - y_aligned_start) * linear_pitch;
   if (y_end != y_aligned_end) {
      memcpy_small<cpp, direction, macrotile_mode>(
         x_start, y_aligned_end, width, y_end - y_aligned_end,
         tiled, linear, linear_pitch, block_stride,
         config);
   }
}

void
fdl6_memcpy_linear_to_tiled(uint32_t x_start, uint32_t y_start,
                            uint32_t width, uint32_t height,
                            char *dst, const char *src,
                            const struct fdl_layout *dst_layout,
                            unsigned dst_miplevel,
                            uint32_t src_pitch,
                            const struct fdl_ubwc_config *config)
{
   unsigned block_width, block_height;
   uint32_t cpp = dst_layout->cpp;
   get_block_size(cpp, &block_width, &block_height);
   uint32_t block_stride =
      fdl_pitch(dst_layout, dst_miplevel) / (block_width * dst_layout->cpp);
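   /* For example, a cpp=4 mip level whose rows are 1024 bytes apart uses
    * 16x4-pixel blocks, so block_stride = 1024 / (16 * 4) = 16 blocks (8
    * macrotiles) per block row.
    */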
   uint32_t block_size = 256;
   assert(block_size == block_width * block_height * dst_layout->cpp);
   assert(config->macrotile_mode != FDL_MACROTILE_INVALID);

#if USE_SLOW_PATH
   for (uint32_t y = 0; y < height; y++) {
      uint32_t y_block = (y + y_start) / block_height;
      uint32_t y_pixel = (y + y_start) % block_height;
      for (uint32_t x = 0; x < width; x++) {
         uint32_t x_block = (x + x_start) / block_width;
         uint32_t x_pixel = (x + x_start) % block_width;

         uint32_t block_offset =
            get_block_offset(x_block, y_block, block_stride, cpp,
                             config);
         uint32_t pixel_offset = get_pixel_offset(x_pixel, y_pixel);

         memcpy(dst + block_size * block_offset + cpp * pixel_offset,
                src + y * src_pitch + x * cpp, cpp);
      }
   }
#else
   switch (cpp) {
#define CASE(case_cpp)                                                       \
   case case_cpp:                                                            \
      if (config->macrotile_mode == FDL_MACROTILE_4_CHANNEL) {               \
         memcpy_large<case_cpp, LINEAR_TO_TILED,                             \
            linear_to_tiled_##case_cpp##cpp, FDL_MACROTILE_4_CHANNEL>(       \
            x_start, y_start, width, height, dst, (char *)src, src_pitch,    \
            block_stride, config);                                           \
      } else {                                                               \
         memcpy_large<case_cpp, LINEAR_TO_TILED,                             \
            linear_to_tiled_##case_cpp##cpp, FDL_MACROTILE_8_CHANNEL>(       \
            x_start, y_start, width, height, dst, (char *)src, src_pitch,    \
            block_stride, config);                                           \
      }                                                                      \
      break;
   CASE(1)
   CASE(2)
   CASE(4)
   CASE(8)
   CASE(16)
#undef CASE
   default:
      unreachable("unknown cpp");
   }
#endif
}

void
fdl6_memcpy_tiled_to_linear(uint32_t x_start, uint32_t y_start,
                            uint32_t width, uint32_t height,
                            char *dst, const char *src,
                            const struct fdl_layout *src_layout,
                            unsigned src_miplevel,
                            uint32_t dst_pitch,
                            const struct fdl_ubwc_config *config)
{
   unsigned block_width, block_height;
   unsigned cpp = src_layout->cpp;
   get_block_size(cpp, &block_width, &block_height);
   uint32_t block_stride =
      fdl_pitch(src_layout, src_miplevel) / (block_width * src_layout->cpp);
   uint32_t block_size = 256;
   assert(block_size == block_width * block_height * src_layout->cpp);
   assert(config->macrotile_mode != FDL_MACROTILE_INVALID);

#if USE_SLOW_PATH
   for (uint32_t y = 0; y < height; y++) {
      uint32_t y_block = (y + y_start) / block_height;
      uint32_t y_pixel = (y + y_start) % block_height;
      for (uint32_t x = 0; x < width; x++) {
         uint32_t x_block = (x + x_start) / block_width;
         uint32_t x_pixel = (x + x_start) % block_width;

         uint32_t block_offset =
            get_block_offset(x_block, y_block, block_stride, src_layout->cpp,
                             config);
         uint32_t pixel_offset = get_pixel_offset(x_pixel, y_pixel);

         memcpy(dst + y * dst_pitch + x * src_layout->cpp,
                src + block_size * block_offset + src_layout->cpp * pixel_offset,
                src_layout->cpp);
      }
   }
#else
   switch (cpp) {
#define CASE(case_cpp)                                                       \
   case case_cpp:                                                            \
      if (config->macrotile_mode == FDL_MACROTILE_4_CHANNEL) {               \
         memcpy_large<case_cpp, TILED_TO_LINEAR,                             \
            tiled_to_linear_##case_cpp##cpp, FDL_MACROTILE_4_CHANNEL>(       \
            x_start, y_start, width, height, (char *)src, dst, dst_pitch,    \
            block_stride, config);                                           \
      } else {                                                               \
         memcpy_large<case_cpp, TILED_TO_LINEAR,                             \
            tiled_to_linear_##case_cpp##cpp, FDL_MACROTILE_8_CHANNEL>(       \
            x_start, y_start, width, height, (char *)src, dst, dst_pitch,    \
            block_stride, config);                                           \
      }                                                                      \
      break;
   CASE(1)
   CASE(2)
   CASE(4)
   CASE(8)
   CASE(16)
#undef CASE
   default:
      unreachable("unknown cpp");
   }
#endif
}