1 /*
2 * Mesa 3-D graphics library
3 *
4 * Copyright 2012 Intel Corporation
5 * Copyright 2013 Google
6 *
7 * Permission is hereby granted, free of charge, to any person obtaining a
8 * copy of this software and associated documentation files (the
9 * "Software"), to deal in the Software without restriction, including
10 * without limitation the rights to use, copy, modify, merge, publish,
11 * distribute, sublicense, and/or sell copies of the Software, and to
12 * permit persons to whom the Software is furnished to do so, subject to
13 * the following conditions:
14 *
15 * The above copyright notice and this permission notice (including the
16 * next paragraph) shall be included in all copies or substantial portions
17 * of the Software.
18 *
19 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
20 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
21 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
22 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
23 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
24 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
25 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 *
27 * Authors:
28 * Chad Versace <chad.versace@linux.intel.com>
29 * Frank Henigman <fjhenigman@google.com>
30 */
31
32 #include <string.h>
33
34 #include "util/macros.h"
35 #include "util/u_math.h"
36 #include "util/rounding.h"
37 #include "isl_priv.h"
38
39 #if defined(__SSSE3__)
40 #include <tmmintrin.h>
41 #elif defined(__SSE2__)
42 #include <emmintrin.h>
43 #endif
44
45 #define FILE_DEBUG_FLAG DEBUG_TEXTURE
46
47 #define ALIGN_DOWN(a, b) ROUND_DOWN_TO(a, b)
48 #define ALIGN_UP(a, b) ALIGN(a, b)
49
50 /* Tile dimensions. Width and span are in bytes, height is in pixels (i.e.
51 * unitless). A "span" is the greatest number of bytes we can copy from linear
52 * to tiled without needing to calculate a new destination address.
53 */
54 static const uint32_t xtile_width = 512;
55 static const uint32_t xtile_height = 8;
56 static const uint32_t xtile_span = 64;
57 static const uint32_t ytile_width = 128;
58 static const uint32_t ytile_height = 32;
59 static const uint32_t ytile_span = 16;
60 static const uint32_t wtile_width = 64;
61 static const uint32_t wtile_height = 64;
62 static const uint32_t wtile_span = 2;
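/* Sanity check on the numbers above: each layout covers one 4KB tile page,
 * X: 512 B x 8 rows, Y: 128 B x 32 rows, W: 64 B x 64 rows.
 */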
63
64 static inline uint32_t
65 ror(uint32_t n, uint32_t d)
66 {
67 return (n >> d) | (n << (32 - d));
68 }
69
70 // bswap32 already exists as a macro on some platforms (FreeBSD)
71 #ifndef bswap32
72 static inline uint32_t
73 bswap32(uint32_t n)
74 {
75 #if defined(HAVE___BUILTIN_BSWAP32)
76 return __builtin_bswap32(n);
77 #else
78 return (n >> 24) |
79 ((n >> 8) & 0x0000ff00) |
80 ((n << 8) & 0x00ff0000) |
81 (n << 24);
82 #endif
83 }
84 #endif
85
86 /**
87 * Copy RGBA to BGRA - swap R and B.
88 */
89 static inline void *
90 rgba8_copy(void *dst, const void *src, size_t bytes)
91 {
92 uint32_t *d = dst;
93 uint32_t const *s = src;
94
95 assert(bytes % 4 == 0);
96
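   /* Worked example (little-endian): an RGBA pixel stored in memory as the
    * bytes R,G,B,A loads as the uint32_t 0xAABBGGRR; bswap32() turns that
    * into 0xRRGGBBAA and ror(..., 8) into 0xAARRGGBB, which stores back as
    * the bytes B,G,R,A -- i.e. BGRA.
    */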
97 while (bytes >= 4) {
98 *d = ror(bswap32(*s), 8);
99 d += 1;
100 s += 1;
101 bytes -= 4;
102 }
103 return dst;
104 }
105
106 #define wtile_block_id(x, y) \
107 (((((x) >> 3) & 0x7) << 3) | \
108 (((y) >> 3) & 0x7))
109
110 #define wtile_block_offset(x, y) \
111 ((((y) & 4) << 3) + \
112 (((y) & 2) << 2) + \
113 (((y) & 1) << 1) + \
114 (((x) & 4) << 2) + \
115 (((x) & 2) << 1) + \
116 (((x) & 1) << 0))
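/* For example, the byte at surface coordinate (x, y) = (5, 3) falls in
 * block 0 (wtile_block_id(5, 3) == 0) at byte 27 of that 64B block:
 * wtile_block_offset() interleaves the low coordinate bits as
 * y2 x2 y1 x1 y0 x0, here 0b011011 == 27.
 */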
117
118 /**
119 * Copy from linear into a W tile block.
120 *
121 * @dst is a pointer to a block in a W tile, @src is a pointer to the linear
122 * data, coordinates are relative to the surface (not the tile).
123 */
124 static inline void
125 wtile_block_copy_from_linear(void *dst, const void *src,
126 unsigned x0, unsigned x1,
127 unsigned y0, unsigned y1,
128 unsigned src_pitch)
129 {
130 uint8_t *dst_data = dst + wtile_block_id(x0, y0) * 64;
131 const uint8_t *src_data = src;
132
133 for (unsigned y = y0; y < y1; y++)
134 for (unsigned x = x0; x < x1; x++)
135 dst_data[wtile_block_offset(x, y)] = src_data[y * src_pitch + x];
136 }
137
138 /**
139 * Copy from linear into a full W tile block.
140 *
141 * @dst is a pointer to a block in a W tile, @src is a pointer to the linear
142 * data.
143 */
144 static inline void
145 wtile_block_full_copy_from_linear(void *dst, const void *src,
146 unsigned x0, unsigned y0,
147 unsigned src_pitch)
148 {
149 uint16_t *dst_data = dst + wtile_block_id(x0, y0) * 64;
150 const uint8_t *src_data = src;
151
152 /*
153 * The layout of a block is a series of 2-byte elements.
154 * _________________________________
155 * |B00|B01|B04|B05|B16|B17|B20|B21|
156 * |B02|B03|B06|B07|B18|B19|B22|B23|
157 * |B08|B09|B12|B13|B24|B25|B28|B29|
158 * |B10|B11|B14|B15|B26|B27|B30|B31|
159 * |B32|B33|B36|B37|B48|B49|B52|B53|
160 * |B34|B35|B38|B39|B50|B51|B54|B55|
161 * |B40|B41|B44|B45|B56|B57|B60|B61|
162 * |B42|B43|B46|B47|B58|B59|B62|B63|
163 * ---------------------------------
164 */
165
166 #define src_lin(bx, by) \
167 (*((const uint16_t *)(src_data + (y0 + by) * src_pitch + x0 + bx * 2)))
168
169 dst_data[0] = src_lin(0, 0);
170 dst_data[1] = src_lin(0, 1);
171 dst_data[2] = src_lin(1, 0);
172 dst_data[3] = src_lin(1, 1);
173 dst_data[4] = src_lin(0, 2);
174 dst_data[5] = src_lin(0, 3);
175 dst_data[6] = src_lin(1, 2);
176 dst_data[7] = src_lin(1, 3);
177
178 dst_data[8] = src_lin(2, 0);
179 dst_data[9] = src_lin(2, 1);
180 dst_data[10] = src_lin(3, 0);
181 dst_data[11] = src_lin(3, 1);
182 dst_data[12] = src_lin(2, 2);
183 dst_data[13] = src_lin(2, 3);
184 dst_data[14] = src_lin(3, 2);
185 dst_data[15] = src_lin(3, 3);
186
187 dst_data[16] = src_lin(0, 4);
188 dst_data[17] = src_lin(0, 5);
189 dst_data[18] = src_lin(1, 4);
190 dst_data[19] = src_lin(1, 5);
191 dst_data[20] = src_lin(0, 6);
192 dst_data[21] = src_lin(0, 7);
193 dst_data[22] = src_lin(1, 6);
194 dst_data[23] = src_lin(1, 7);
195
196 dst_data[24] = src_lin(2, 4);
197 dst_data[25] = src_lin(2, 5);
198 dst_data[26] = src_lin(3, 4);
199 dst_data[27] = src_lin(3, 5);
200 dst_data[28] = src_lin(2, 6);
201 dst_data[29] = src_lin(2, 7);
202 dst_data[30] = src_lin(3, 6);
203 dst_data[31] = src_lin(3, 7);
204
205 #undef src_lin
206 }
207
208 /**
209 * Copy from W tile block into linear.
210 *
211 * @dst is a pointer to the linear data, @src is a pointer to a block in the W
212 * tile.
213 */
214 static inline void
215 wtile_block_copy_to_linear(void *dst, const void *src,
216 unsigned x0, unsigned x1,
217 unsigned y0, unsigned y1,
218 unsigned dst_pitch)
219 {
220 uint8_t *dst_data = dst;
221 const uint8_t *src_data = src + wtile_block_id(x0, y0) * 64;
222
223 for (unsigned y = y0; y < y1; y++)
224 for (unsigned x = x0; x < x1; x++)
225 dst_data[y * dst_pitch + x] = src_data[wtile_block_offset(x, y)];
226 }
227
228 /**
229 * Copy to linear from a full W tile block.
230 *
231 * @dst is a pointer to the linear data, @src is a pointer to a block in a W
232 * tile.
233 */
234 static inline void
235 wtile_block_full_copy_to_linear(void *dst, const void *src,
236 unsigned x0, unsigned y0,
237 unsigned dst_pitch)
238 {
239 uint8_t *dst_data = dst;
240 const uint16_t *src_data = src + wtile_block_id(x0, y0) * 64;
241
242 /*
243 * The layout of a block is a series of 2-byte elements.
244 * _________________________________
245 * |B00|B01|B04|B05|B16|B17|B20|B21|
246 * |B02|B03|B06|B07|B18|B19|B22|B23|
247 * |B08|B09|B12|B13|B24|B25|B28|B29|
248 * |B10|B11|B14|B15|B26|B27|B30|B31|
249 * |B32|B33|B36|B37|B48|B49|B52|B53|
250 * |B34|B35|B38|B39|B50|B51|B54|B55|
251 * |B40|B41|B44|B45|B56|B57|B60|B61|
252 * |B42|B43|B46|B47|B58|B59|B62|B63|
253 * ---------------------------------
254 */
255
256 #define dst_lin(bx, by) \
257 (*((uint16_t *)(dst_data + (y0 + by) * dst_pitch + x0 + bx * 2)))
258
259 dst_lin(0, 0) = src_data[0];
260 dst_lin(0, 1) = src_data[1];
261 dst_lin(1, 0) = src_data[2];
262 dst_lin(1, 1) = src_data[3];
263 dst_lin(0, 2) = src_data[4];
264 dst_lin(0, 3) = src_data[5];
265 dst_lin(1, 2) = src_data[6];
266 dst_lin(1, 3) = src_data[7];
267
268 dst_lin(2, 0) = src_data[8];
269 dst_lin(2, 1) = src_data[9];
270 dst_lin(3, 0) = src_data[10];
271 dst_lin(3, 1) = src_data[11];
272 dst_lin(2, 2) = src_data[12];
273 dst_lin(2, 3) = src_data[13];
274 dst_lin(3, 2) = src_data[14];
275 dst_lin(3, 3) = src_data[15];
276
277 dst_lin(0, 4) = src_data[16];
278 dst_lin(0, 5) = src_data[17];
279 dst_lin(1, 4) = src_data[18];
280 dst_lin(1, 5) = src_data[19];
281 dst_lin(0, 6) = src_data[20];
282 dst_lin(0, 7) = src_data[21];
283 dst_lin(1, 6) = src_data[22];
284 dst_lin(1, 7) = src_data[23];
285
286 dst_lin(2, 4) = src_data[24];
287 dst_lin(2, 5) = src_data[25];
288 dst_lin(3, 4) = src_data[26];
289 dst_lin(3, 5) = src_data[27];
290 dst_lin(2, 6) = src_data[28];
291 dst_lin(2, 7) = src_data[29];
292 dst_lin(3, 6) = src_data[30];
293 dst_lin(3, 7) = src_data[31];
294
295 #undef dst_lin
296 }
297
298 #ifdef __SSSE3__
299 static const uint8_t rgba8_permutation[16] =
300 { 2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15 };
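/* Each group of four indices swaps bytes 0 and 2 of a pixel (R <-> B) and
 * leaves bytes 1 and 3 (G and A) in place, so the pshufb below turns four
 * RGBA pixels into BGRA in one shot.
 */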
301
302 static inline void
303 rgba8_copy_16_aligned_dst(void *dst, const void *src)
304 {
305 _mm_store_si128(dst,
306 _mm_shuffle_epi8(_mm_loadu_si128(src),
307 *(__m128i *)rgba8_permutation));
308 }
309
310 static inline void
311 rgba8_copy_16_aligned_src(void *dst, const void *src)
312 {
313 _mm_storeu_si128(dst,
314 _mm_shuffle_epi8(_mm_load_si128(src),
315 *(__m128i *)rgba8_permutation));
316 }
317
318 #elif defined(__SSE2__)
319 static inline void
320 rgba8_copy_16_aligned_dst(void *dst, const void *src)
321 {
322 __m128i srcreg, dstreg, agmask, ag, rb, br;
323
324 agmask = _mm_set1_epi32(0xFF00FF00);
325 srcreg = _mm_loadu_si128((__m128i *)src);
326
327 rb = _mm_andnot_si128(agmask, srcreg);
328 ag = _mm_and_si128(agmask, srcreg);
329 br = _mm_shufflehi_epi16(_mm_shufflelo_epi16(rb, _MM_SHUFFLE(2, 3, 0, 1)),
330 _MM_SHUFFLE(2, 3, 0, 1));
331 dstreg = _mm_or_si128(ag, br);
332
333 _mm_store_si128((__m128i *)dst, dstreg);
334 }
335
336 static inline void
337 rgba8_copy_16_aligned_src(void *dst, const void *src)
338 {
339 __m128i srcreg, dstreg, agmask, ag, rb, br;
340
341 agmask = _mm_set1_epi32(0xFF00FF00);
342 srcreg = _mm_load_si128((__m128i *)src);
343
344 rb = _mm_andnot_si128(agmask, srcreg);
345 ag = _mm_and_si128(agmask, srcreg);
346 br = _mm_shufflehi_epi16(_mm_shufflelo_epi16(rb, _MM_SHUFFLE(2, 3, 0, 1)),
347 _MM_SHUFFLE(2, 3, 0, 1));
348 dstreg = _mm_or_si128(ag, br);
349
350 _mm_storeu_si128((__m128i *)dst, dstreg);
351 }
352 #endif
353
354 /**
355 * Copy RGBA to BGRA - swap R and B, with the destination 16-byte aligned.
356 */
357 static inline void *
358 rgba8_copy_aligned_dst(void *dst, const void *src, size_t bytes)
359 {
360 assert(bytes == 0 || !(((uintptr_t)dst) & 0xf));
361
362 #if defined(__SSSE3__) || defined(__SSE2__)
363 if (bytes == 64) {
364 rgba8_copy_16_aligned_dst(dst + 0, src + 0);
365 rgba8_copy_16_aligned_dst(dst + 16, src + 16);
366 rgba8_copy_16_aligned_dst(dst + 32, src + 32);
367 rgba8_copy_16_aligned_dst(dst + 48, src + 48);
368 return dst;
369 }
370
371 while (bytes >= 16) {
372 rgba8_copy_16_aligned_dst(dst, src);
373 src += 16;
374 dst += 16;
375 bytes -= 16;
376 }
377 #endif
378
379 rgba8_copy(dst, src, bytes);
380
381 return dst;
382 }
383
384 /**
385 * Copy RGBA to BGRA - swap R and B, with the source 16-byte aligned.
386 */
387 static inline void *
388 rgba8_copy_aligned_src(void *dst, const void *src, size_t bytes)
389 {
390 assert(bytes == 0 || !(((uintptr_t)src) & 0xf));
391
392 #if defined(__SSSE3__) || defined(__SSE2__)
393 if (bytes == 64) {
394 rgba8_copy_16_aligned_src(dst + 0, src + 0);
395 rgba8_copy_16_aligned_src(dst + 16, src + 16);
396 rgba8_copy_16_aligned_src(dst + 32, src + 32);
397 rgba8_copy_16_aligned_src(dst + 48, src + 48);
398 return dst;
399 }
400
401 while (bytes >= 16) {
402 rgba8_copy_16_aligned_src(dst, src);
403 src += 16;
404 dst += 16;
405 bytes -= 16;
406 }
407 #endif
408
409 rgba8_copy(dst, src, bytes);
410
411 return dst;
412 }
413
414 /**
415 * Each row from y0 to y1 is copied in three parts: [x0,x1), [x1,x2), [x2,x3).
416 * These ranges are in bytes, i.e. pixels * bytes-per-pixel.
417 * The first and last ranges must be shorter than a "span" (the longest linear
418 * stretch within a tile) and the middle must equal a whole number of spans.
419 * Ranges may be empty. The region copied must land entirely within one tile.
420 * 'dst' is the start of the tile and 'src' is the corresponding
421 * address to copy from, though copying begins at (x0, y0).
422 * To enable swizzling 'swizzle_bit' must be 1<<6, otherwise zero.
423 * Swizzling flips bit 6 in the copy destination offset, when certain other
424 * bits are set in it.
425 */
426 typedef void (*tile_copy_fn)(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
427 uint32_t y0, uint32_t y1,
428 char *dst, const char *src,
429 int32_t linear_pitch,
430 uint32_t swizzle_bit,
431 isl_memcpy_type copy_type);
432
433 /**
434 * Copy texture data from linear to X tile layout.
435 *
436 * \copydoc tile_copy_fn
437 *
438 * The mem_copy parameters allow the user to specify an alternative mem_copy
439 * function that, for instance, may do RGBA -> BGRA swizzling. The first
440 * function must handle any memory alignment while the second function must
441 * only handle 16-byte alignment in whichever side (source or destination) is
442 * tiled.
443 */
444 static inline void
445 linear_to_xtiled(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
446 uint32_t y0, uint32_t y1,
447 char *dst, const char *src,
448 int32_t src_pitch,
449 uint32_t swizzle_bit,
450 isl_mem_copy_fn mem_copy,
451 isl_mem_copy_fn mem_copy_align16)
452 {
453 /* The copy destination offset for each range copied is the sum of
454 * an X offset 'x0' or 'xo' and a Y offset 'yo.'
455 */
456 uint32_t xo, yo;
457
458 src += (ptrdiff_t)y0 * src_pitch;
459
460 for (yo = y0 * xtile_width; yo < y1 * xtile_width; yo += xtile_width) {
461 /* Bits 9 and 10 of the copy destination offset control swizzling.
462 * Only 'yo' contributes to those bits in the total offset,
463 * so calculate 'swizzle' just once per row.
464 * Move bits 9 and 10 three and four places respectively down
465 * to bit 6 and xor them.
466 */
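      /* For example, with swizzle_bit == 1<<6 and yo == 0x200 (bit 9 set,
       * bit 10 clear): (0x200 >> 3) ^ (0x200 >> 4) == 0x40 ^ 0x20 == 0x60,
       * which masked with swizzle_bit leaves 0x40, so bit 6 of the
       * destination offset is flipped.  When bits 9 and 10 are equal, the
       * two shifted copies cancel in bit 6 and nothing is flipped.
       */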
467 uint32_t swizzle = ((yo >> 3) ^ (yo >> 4)) & swizzle_bit;
468
469 mem_copy(dst + ((x0 + yo) ^ swizzle), src + x0, x1 - x0);
470
471 for (xo = x1; xo < x2; xo += xtile_span) {
472 mem_copy_align16(dst + ((xo + yo) ^ swizzle), src + xo, xtile_span);
473 }
474
475 mem_copy_align16(dst + ((xo + yo) ^ swizzle), src + x2, x3 - x2);
476
477 src += src_pitch;
478 }
479 }
480
481 /**
482 * Copy texture data from linear to Y tile layout.
483 *
484 * \copydoc tile_copy_fn
485 */
486 static inline void
487 linear_to_ytiled(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
488 uint32_t y0, uint32_t y3,
489 char *dst, const char *src,
490 int32_t src_pitch,
491 uint32_t swizzle_bit,
492 isl_mem_copy_fn mem_copy,
493 isl_mem_copy_fn mem_copy_align16)
494 {
495 /* Y tiles consist of columns that are 'ytile_span' wide (and the same height
496 * as the tile). Thus the destination offset for (x,y) is the sum of:
497 * (x % column_width) // position within column
498 * (x / column_width) * bytes_per_column // column number * bytes per column
499 * y * column_width
500 *
501 * The copy destination offset for each range copied is the sum of
502 * an X offset 'xo0' or 'xo' and a Y offset 'yo.'
503 */
504 const uint32_t column_width = ytile_span;
505 const uint32_t bytes_per_column = column_width * ytile_height;
506
507 uint32_t y1 = MIN2(y3, ALIGN_UP(y0, 4));
508 uint32_t y2 = MAX2(y1, ALIGN_DOWN(y3, 4));
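   /* For example, y0 = 2 and y3 = 15 give y1 = 4 and y2 = 12: rows [2, 4)
    * and [12, 15) are copied one row at a time below, while rows [4, 12)
    * are copied four rows (one full 64B chunk of each column) at a time.
    */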
509
510 uint32_t xo0 = (x0 % ytile_span) + (x0 / ytile_span) * bytes_per_column;
511 uint32_t xo1 = (x1 % ytile_span) + (x1 / ytile_span) * bytes_per_column;
512
513 /* Bit 9 of the destination offset controls swizzling.
514 * Only the X offset contributes to bit 9 of the total offset,
515 * so swizzle can be calculated in advance for these X positions.
516 * Move bit 9 three places down to bit 6.
517 */
518 uint32_t swizzle0 = (xo0 >> 3) & swizzle_bit;
519 uint32_t swizzle1 = (xo1 >> 3) & swizzle_bit;
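   /* For example, with ytile_span = 16 and bytes_per_column = 512, x0 = 21
    * gives xo0 = 21 % 16 + (21 / 16) * 512 = 517.  Bit 9 of 517 is set, so
    * with bit-6 swizzling enabled swizzle0 = (517 >> 3) & (1 << 6) = 0x40.
    */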
520
521 uint32_t x, yo;
522
523 src += (ptrdiff_t)y0 * src_pitch;
524
525 if (y0 != y1) {
526 for (yo = y0 * column_width; yo < y1 * column_width; yo += column_width) {
527 uint32_t xo = xo1;
528 uint32_t swizzle = swizzle1;
529
530 mem_copy(dst + ((xo0 + yo) ^ swizzle0), src + x0, x1 - x0);
531
532 /* Step by spans/columns. As it happens, the swizzle bit flips
533 * at each step so we don't need to calculate it explicitly.
534 */
535 for (x = x1; x < x2; x += ytile_span) {
536 mem_copy_align16(dst + ((xo + yo) ^ swizzle), src + x, ytile_span);
537 xo += bytes_per_column;
538 swizzle ^= swizzle_bit;
539 }
540
541 mem_copy_align16(dst + ((xo + yo) ^ swizzle), src + x2, x3 - x2);
542
543 src += src_pitch;
544 }
545 }
546
547 for (yo = y1 * column_width; yo < y2 * column_width; yo += 4 * column_width) {
548 uint32_t xo = xo1;
549 uint32_t swizzle = swizzle1;
550
551 if (x0 != x1) {
552 mem_copy(dst + ((xo0 + yo + 0 * column_width) ^ swizzle0), src + x0 + 0 * src_pitch, x1 - x0);
553 mem_copy(dst + ((xo0 + yo + 1 * column_width) ^ swizzle0), src + x0 + 1 * src_pitch, x1 - x0);
554 mem_copy(dst + ((xo0 + yo + 2 * column_width) ^ swizzle0), src + x0 + 2 * src_pitch, x1 - x0);
555 mem_copy(dst + ((xo0 + yo + 3 * column_width) ^ swizzle0), src + x0 + 3 * src_pitch, x1 - x0);
556 }
557
558 /* Step by spans/columns. As it happens, the swizzle bit flips
559 * at each step so we don't need to calculate it explicitly.
560 */
561 for (x = x1; x < x2; x += ytile_span) {
562 mem_copy_align16(dst + ((xo + yo + 0 * column_width) ^ swizzle), src + x + 0 * src_pitch, ytile_span);
563 mem_copy_align16(dst + ((xo + yo + 1 * column_width) ^ swizzle), src + x + 1 * src_pitch, ytile_span);
564 mem_copy_align16(dst + ((xo + yo + 2 * column_width) ^ swizzle), src + x + 2 * src_pitch, ytile_span);
565 mem_copy_align16(dst + ((xo + yo + 3 * column_width) ^ swizzle), src + x + 3 * src_pitch, ytile_span);
566 xo += bytes_per_column;
567 swizzle ^= swizzle_bit;
568 }
569
570 if (x2 != x3) {
571 mem_copy_align16(dst + ((xo + yo + 0 * column_width) ^ swizzle), src + x2 + 0 * src_pitch, x3 - x2);
572 mem_copy_align16(dst + ((xo + yo + 1 * column_width) ^ swizzle), src + x2 + 1 * src_pitch, x3 - x2);
573 mem_copy_align16(dst + ((xo + yo + 2 * column_width) ^ swizzle), src + x2 + 2 * src_pitch, x3 - x2);
574 mem_copy_align16(dst + ((xo + yo + 3 * column_width) ^ swizzle), src + x2 + 3 * src_pitch, x3 - x2);
575 }
576
577 src += 4 * src_pitch;
578 }
579
580 if (y2 != y3) {
581 for (yo = y2 * column_width; yo < y3 * column_width; yo += column_width) {
582 uint32_t xo = xo1;
583 uint32_t swizzle = swizzle1;
584
585 mem_copy(dst + ((xo0 + yo) ^ swizzle0), src + x0, x1 - x0);
586
587 /* Step by spans/columns. As it happens, the swizzle bit flips
588 * at each step so we don't need to calculate it explicitly.
589 */
590 for (x = x1; x < x2; x += ytile_span) {
591 mem_copy_align16(dst + ((xo + yo) ^ swizzle), src + x, ytile_span);
592 xo += bytes_per_column;
593 swizzle ^= swizzle_bit;
594 }
595
596 mem_copy_align16(dst + ((xo + yo) ^ swizzle), src + x2, x3 - x2);
597
598 src += src_pitch;
599 }
600 }
601 }
602
603 /**
604 * Copy texture data from linear to Tile-4 layout.
605 *
606 * \copydoc tile_copy_fn
607 */
608 static inline void
609 linear_to_tile4(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
610 uint32_t y0, uint32_t y3,
611 char *dst, const char *src,
612 int32_t src_pitch,
613 uint32_t swizzle_bit,
614 isl_mem_copy_fn mem_copy,
615 isl_mem_copy_fn mem_copy_align16)
616 {
617 /* Tile 4 consists of columns that are 'ytile_span' wide; each 64B tile
618 * block consists of 4 rows of Y-tile ordered data.
619 * Each 512B block within a 4kB tile contains 8 such 64B blocks.
620 *
621 * To calculate the tiled offset, we need to identify:
622 * Block X and Block Y offset at each 512B block boundary in X and Y
623 * direction.
624 *
625 * A Tile4 has the following layout :
626 *
627 * |<------------- 128 B-------------------|
628 * _________________________________________
629 * 512B blk(Blk0)^| 0 | 1 | 2 | 3 | 8 | 9 | 10 | 11 | ^ 512B blk(Blk1)
630 * (cell 0..7)) v| 4 | 5 | 6 | 7 | 12 | 13 | 14 | 15 | v (cell 8..15))
631 * -----------------------------------------
632 * | 16 | 17 | 18 | 19 | 24 | 25 | 26 | 27 |
633 * | 20 | 21 | 22 | 23 | 28 | 29 | 30 | 31 |
634 * -----------------------------------------
635 * | 32 | 33 | 34 | 35 | 40 | 41 | 42 | 43 |
636 * | 36 | 37 | 38 | 39 | 44 | 45 | 46 | 47 |
637 * -----------------------------------------
638 * | 48 | 49 | 50 | 51 | 56 | 57 | 58 | 59 |
639 * | 52 | 53 | 54 | 55 | 60 | 61 | 62 | 63 |
640 * -----------------------------------------
641 *
642 * The tile is divided into 512B blocks [Blk0..Blk7], themselves made of 2
643 * rows of 256B sub-blocks.
644 *
645 * Each sub-block is composed of 4 64B elements[cell(0)-cell(3)] (a cell
646 * in the figure above).
647 *
648 * Each 64B cell represents 4 rows of data.[cell(0), cell(1), .., cell(63)]
649 *
650 *
651 * Block X - Adds 256B to offset when we encounter block boundary in
652 * X direction.(Ex: Blk 0 --> Blk 1(BlkX_off = 256))
653 * Block Y - Adds 512B to offset when we encounter block boundary in
654 * Y direction.(Ex: Blk 0 --> Blk 3(BlkY_off = 512))
655 *
656 * (x / ytile_span) * cacheline_size_B //Byte offset in the X dir of
657 * the containing 64B block
658 * x % ytile_span //Byte offset in X dir within a 64B block/cacheline
659 *
660 * (y % 4) * 16 // Byte offset of the Y dir within a 64B block/cacheline
661 * (y / 4) * 256// Byte offset of the Y dir within 512B block after 1 row
662 * of 64B blocks/cachelines
663 *
664 * The copy destination offset for each range copied is the sum of
665 * Block X offset 'BlkX_off', Block Y offset 'BlkY_off', X offset 'xo'
666 * and a Y offset 'yo.'
667 */
668 const uint32_t column_width = ytile_span;
669 const uint32_t tile4_blkh = 4;
670
671 assert(ytile_span * tile4_blkh == 64);
672 const uint32_t cacheline_size_B = 64;
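   /* For example, (x, y) = (20, 6) maps to
    *    (20 / 16) * 64 + 20 % 16 + (6 / 4) * 256 + (6 % 4) * 16
    *    = 64 + 4 + 256 + 32 = 356,
    * i.e. byte 36 of cell 5 in the diagram above (BlkX_off and BlkY_off are
    * both 0 this close to the tile origin).
    */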
673
674 /* Find intermediate Y offsets that are aligned to a 64B element
675 * (4 rows), so that we can do fully 64B memcpys on those.
676 */
677 uint32_t y1 = MIN2(y3, ALIGN_UP(y0, 4));
678 uint32_t y2 = MAX2(y1, ALIGN_DOWN(y3, 4));
679
680 /* xsb0 and xsb1 are the byte offset within a 256B sub block for x0 and x1 */
681 uint32_t xsb0 = (x0 % ytile_span) + (x0 / ytile_span) * cacheline_size_B;
682 uint32_t xsb1 = (x1 % ytile_span) + (x1 / ytile_span) * cacheline_size_B;
683
684 uint32_t Blkxsb0_off = ALIGN_DOWN(xsb0, 256);
685 uint32_t Blky0_off = (y0 / 8) * 512;
686
687 uint32_t BlkX_off, BlkY_off;
688
689 uint32_t x, yo, Y0, Y2;
690
691 /* Y0 determines the initial byte offset in the Y direction */
692 Y0 = (y0 / 4) * 256 + (y0 % 4) * ytile_span;
693
694 /* Y2 determines the byte offset required for reaching y2 if y2 doesn't map
695 * exactly to 512B block boundary
696 */
697 Y2 = y2 * 4 * column_width;
698
699 src += (ptrdiff_t)y0 * src_pitch;
700
701 /* To maximize memcpy speed, we do the copy in 3 parts :
702 * - copy the first lines that are not aligned to the 64B cell's height (4 rows)
703 * - copy the lines that are aligned to 64B cell's height
704 * - copy the remaining lines not making up for a full 64B cell's height
705 */
706 if (y0 != y1) {
707 for (yo = Y0; yo < Y0 + (y1 - y0) * column_width; yo += column_width) {
708 uint32_t xo = xsb1;
709
710 if (x0 != x1)
711 mem_copy(dst + (Blky0_off + Blkxsb0_off) + (xsb0 + yo), src + x0, x1 - x0);
712
713 for (x = x1; x < x2; x += ytile_span) {
714 BlkX_off = ALIGN_DOWN(xo, 256);
715
716 mem_copy_align16(dst + (Blky0_off + BlkX_off) + (xo + yo), src + x, ytile_span);
717 xo += cacheline_size_B;
718 }
719
720 if (x3 != x2) {
721 BlkX_off = ALIGN_DOWN(xo, 256);
722 mem_copy_align16(dst + (Blky0_off + BlkX_off) + (xo + yo), src + x2, x3 - x2);
723 }
724
725 src += src_pitch;
726 }
727 }
728
729 for (yo = y1 * 4 * column_width; yo < y2 * 4 * column_width; yo += 16 * column_width) {
730 uint32_t xo = xsb1;
731 BlkY_off = ALIGN_DOWN(yo, 512);
732
733 if (x0 != x1) {
734 mem_copy(dst + (BlkY_off + Blkxsb0_off) + (xsb0 + yo + 0 * column_width),
735 src + x0 + 0 * src_pitch, x1 - x0);
736 mem_copy(dst + (BlkY_off + Blkxsb0_off) + (xsb0 + yo + 1 * column_width),
737 src + x0 + 1 * src_pitch, x1 - x0);
738 mem_copy(dst + (BlkY_off + Blkxsb0_off) + (xsb0 + yo + 2 * column_width),
739 src + x0 + 2 * src_pitch, x1 - x0);
740 mem_copy(dst + (BlkY_off + Blkxsb0_off) + (xsb0 + yo + 3 * column_width),
741 src + x0 + 3 * src_pitch, x1 - x0);
742 }
743
744 for (x = x1; x < x2; x += ytile_span) {
745 BlkX_off = ALIGN_DOWN(xo, 256);
746
747 mem_copy_align16(dst + (BlkY_off + BlkX_off) + (xo + yo + 0 * column_width),
748 src + x + 0 * src_pitch, ytile_span);
749 mem_copy_align16(dst + (BlkY_off + BlkX_off) + (xo + yo + 1 * column_width),
750 src + x + 1 * src_pitch, ytile_span);
751 mem_copy_align16(dst + (BlkY_off + BlkX_off) + (xo + yo + 2 * column_width),
752 src + x + 2 * src_pitch, ytile_span);
753 mem_copy_align16(dst + (BlkY_off + BlkX_off) + (xo + yo + 3 * column_width),
754 src + x + 3 * src_pitch, ytile_span);
755
756 xo += cacheline_size_B;
757 }
758
759 if (x2 != x3) {
760 BlkX_off = ALIGN_DOWN(xo, 256);
761
762 mem_copy(dst + (BlkY_off + BlkX_off) + (xo + yo + 0 * column_width),
763 src + x2 + 0 * src_pitch, x3 - x2);
764 mem_copy(dst + (BlkY_off + BlkX_off) + (xo + yo + 1 * column_width),
765 src + x2 + 1 * src_pitch, x3 - x2);
766 mem_copy(dst + (BlkY_off + BlkX_off) + (xo + yo + 2 * column_width),
767 src + x2 + 2 * src_pitch, x3 - x2);
768 mem_copy(dst + (BlkY_off + BlkX_off) + (xo + yo + 3 * column_width),
769 src + x2 + 3 * src_pitch, x3 - x2);
770 }
771
772 src += 4 * src_pitch;
773 }
774
775 if (y2 != y3) {
776 for (yo = Y2; yo < Y2 + (y3 - y2) * column_width; yo += column_width) {
777 uint32_t xo = xsb1;
778 BlkY_off = ALIGN_DOWN(yo, 512);
779
780 if (x0 != x1)
781 mem_copy(dst + (BlkY_off + Blkxsb0_off) + (xsb0 + yo), src + x0, x1 - x0);
782
783 for (x = x1; x < x2; x += ytile_span) {
784 BlkX_off = ALIGN_DOWN(xo, 256);
785
786 mem_copy_align16(dst + (BlkY_off + BlkX_off) + (xo + yo), src + x, ytile_span);
787 xo += cacheline_size_B;
788 }
789
790 if (x3 != x2) {
791 BlkX_off = ALIGN_DOWN(xo, 256);
792 mem_copy_align16(dst + (BlkY_off + BlkX_off) + (xo + yo), src + x2, x3 - x2);
793 }
794
795 src += src_pitch;
796 }
797 }
798 }
799
800 /**
801 * Copy texture data from linear to W tile layout.
802 *
803 * \copydoc tile_copy_fn
804 */
805 static inline void
806 linear_to_wtiled(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
807 uint32_t y0, uint32_t y3,
808 char *dst, const char *src, int32_t src_pitch)
809 {
810 /*
811 * The layout is a series of blocks of 64B each.
812 * ________________________________________________
813 * |blk00|blk08|blk16|blk24|blk32|blk40|blk48|blk56|
814 * |blk01|blk09|blk17|blk25|blk33|blk41|blk49|blk57|
815 * |blk02|blk10|blk18|blk26|blk34|blk42|blk50|blk58|
816 * |blk03|blk11|blk19|blk27|blk35|blk43|blk51|blk59|
817 * |blk04|blk12|blk20|blk28|blk36|blk44|blk52|blk60|
818 * |blk05|blk13|blk21|blk29|blk37|blk45|blk53|blk61|
819 * |blk06|blk14|blk22|blk30|blk38|blk46|blk54|blk62|
820 * |blk07|blk15|blk23|blk31|blk39|blk47|blk55|blk63|
821 * ------------------------------------------------
822 */
823
824 /* Find intermediate Y offsets that are aligned to a 64B element (8 rows).
825 */
826 uint32_t y1 = MIN2(y3, ALIGN_UP(y0, 8));
827 uint32_t y2 = MAX2(y1, ALIGN_DOWN(y3, 8));
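   /* For example, y0 = 3 and y3 = 29 give y1 = 8 and y2 = 24: rows [3, 8)
    * and [24, 29) take the partial-block paths below, while rows [8, 24)
    * are handled with full 64B block copies.
    */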
828
829 uint32_t xo, yo;
830
831 /* If the y0 coordinate is not aligned to a block, do partial copies into
832 * the top row of blocks (0, 8, 16, 24, 32, 40, 48 & 56).
833 */
834 if (y0 != y1) {
835 if (x0 != x1)
836 wtile_block_copy_from_linear(dst, src, x0, x1, y0, y1, src_pitch);
837 for (xo = x1; xo < x2; xo += 8)
838 wtile_block_copy_from_linear(dst, src, xo, xo + 8, y0, y1, src_pitch);
839 if (x2 != x3)
840 wtile_block_copy_from_linear(dst, src, x2, x3, y0, y1, src_pitch);
841 }
842
843 for (yo = y1; yo < y2; yo += 8) {
844 /* Do partial copies into blocks [1, 6] if x0 is not aligned to a block. */
845 if (x0 != x1) {
846 wtile_block_copy_from_linear(dst, src,
847 x0, x1, yo, yo + 8, src_pitch);
848 }
849 /* Full block copies on the inside. */
850 for (xo = x1; xo < x2; xo += 8)
851 wtile_block_full_copy_from_linear(dst, src, xo, yo, src_pitch);
852 /* Do partial copies into blocks [57, 62] if x3 is not aligned to a block.
853 */
854 if (x2 != x3) {
855 wtile_block_copy_from_linear(dst, src,
856 x2, x3, yo, yo + 8, src_pitch);
857 }
858 }
859
860 /* If the y3 coordinate is not aligned to a block, do partial copies into
861 * the bottom row of blocks (7, 15, 23, 31, 39, 47, 55 & 63).
862 */
863 if (y2 != y3) {
864 if (x0 != x1)
865 wtile_block_copy_from_linear(dst, src, x0, x1, y2, y3, src_pitch);
866 for (xo = x1; xo < x2; xo += 8)
867 wtile_block_copy_from_linear(dst, src, xo, xo + 8, y2, y3, src_pitch);
868 if (x2 != x3)
869 wtile_block_copy_from_linear(dst, src, x2, x3, y2, y3, src_pitch);
870 }
871 }
872
873 /**
874 * Copy texture data from X tile layout to linear.
875 *
876 * \copydoc tile_copy_fn
877 */
878 static inline void
879 xtiled_to_linear(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
880 uint32_t y0, uint32_t y1,
881 char *dst, const char *src,
882 int32_t dst_pitch,
883 uint32_t swizzle_bit,
884 isl_mem_copy_fn mem_copy,
885 isl_mem_copy_fn mem_copy_align16)
886 {
887 /* The copy destination offset for each range copied is the sum of
888 * an X offset 'x0' or 'xo' and a Y offset 'yo.'
889 */
890 uint32_t xo, yo;
891
892 dst += (ptrdiff_t)y0 * dst_pitch;
893
894 for (yo = y0 * xtile_width; yo < y1 * xtile_width; yo += xtile_width) {
895 /* Bits 9 and 10 of the copy destination offset control swizzling.
896 * Only 'yo' contributes to those bits in the total offset,
897 * so calculate 'swizzle' just once per row.
898 * Move bits 9 and 10 three and four places respectively down
899 * to bit 6 and xor them.
900 */
901 uint32_t swizzle = ((yo >> 3) ^ (yo >> 4)) & swizzle_bit;
902
903 mem_copy(dst + x0, src + ((x0 + yo) ^ swizzle), x1 - x0);
904
905 for (xo = x1; xo < x2; xo += xtile_span) {
906 mem_copy_align16(dst + xo, src + ((xo + yo) ^ swizzle), xtile_span);
907 }
908
909 mem_copy_align16(dst + x2, src + ((xo + yo) ^ swizzle), x3 - x2);
910
911 dst += dst_pitch;
912 }
913 }
914
915 /**
916 * Copy texture data from Y tile layout to linear.
917 *
918 * \copydoc tile_copy_fn
919 */
920 static inline void
921 ytiled_to_linear(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
922 uint32_t y0, uint32_t y3,
923 char *dst, const char *src,
924 int32_t dst_pitch,
925 uint32_t swizzle_bit,
926 isl_mem_copy_fn mem_copy,
927 isl_mem_copy_fn mem_copy_align16)
928 {
929 /* Y tiles consist of columns that are 'ytile_span' wide (and the same height
930 * as the tile). Thus the destination offset for (x,y) is the sum of:
931 * (x % column_width) // position within column
932 * (x / column_width) * bytes_per_column // column number * bytes per column
933 * y * column_width
934 *
935 * The copy destination offset for each range copied is the sum of
936 * an X offset 'xo0' or 'xo' and a Y offset 'yo.'
937 */
938 const uint32_t column_width = ytile_span;
939 const uint32_t bytes_per_column = column_width * ytile_height;
940
941 uint32_t y1 = MIN2(y3, ALIGN_UP(y0, 4));
942 uint32_t y2 = MAX2(y1, ALIGN_DOWN(y3, 4));
943
944 uint32_t xo0 = (x0 % ytile_span) + (x0 / ytile_span) * bytes_per_column;
945 uint32_t xo1 = (x1 % ytile_span) + (x1 / ytile_span) * bytes_per_column;
946
947 /* Bit 9 of the destination offset controls swizzling.
948 * Only the X offset contributes to bit 9 of the total offset,
949 * so swizzle can be calculated in advance for these X positions.
950 * Move bit 9 three places down to bit 6.
951 */
952 uint32_t swizzle0 = (xo0 >> 3) & swizzle_bit;
953 uint32_t swizzle1 = (xo1 >> 3) & swizzle_bit;
954
955 uint32_t x, yo;
956
957 dst += (ptrdiff_t)y0 * dst_pitch;
958
959 if (y0 != y1) {
960 for (yo = y0 * column_width; yo < y1 * column_width; yo += column_width) {
961 uint32_t xo = xo1;
962 uint32_t swizzle = swizzle1;
963
964 mem_copy(dst + x0, src + ((xo0 + yo) ^ swizzle0), x1 - x0);
965
966 /* Step by spans/columns. As it happens, the swizzle bit flips
967 * at each step so we don't need to calculate it explicitly.
968 */
969 for (x = x1; x < x2; x += ytile_span) {
970 mem_copy_align16(dst + x, src + ((xo + yo) ^ swizzle), ytile_span);
971 xo += bytes_per_column;
972 swizzle ^= swizzle_bit;
973 }
974
975 mem_copy_align16(dst + x2, src + ((xo + yo) ^ swizzle), x3 - x2);
976
977 dst += dst_pitch;
978 }
979 }
980
981 for (yo = y1 * column_width; yo < y2 * column_width; yo += 4 * column_width) {
982 uint32_t xo = xo1;
983 uint32_t swizzle = swizzle1;
984
985 if (x0 != x1) {
986 mem_copy(dst + x0 + 0 * dst_pitch, src + ((xo0 + yo + 0 * column_width) ^ swizzle0), x1 - x0);
987 mem_copy(dst + x0 + 1 * dst_pitch, src + ((xo0 + yo + 1 * column_width) ^ swizzle0), x1 - x0);
988 mem_copy(dst + x0 + 2 * dst_pitch, src + ((xo0 + yo + 2 * column_width) ^ swizzle0), x1 - x0);
989 mem_copy(dst + x0 + 3 * dst_pitch, src + ((xo0 + yo + 3 * column_width) ^ swizzle0), x1 - x0);
990 }
991
992 /* Step by spans/columns. As it happens, the swizzle bit flips
993 * at each step so we don't need to calculate it explicitly.
994 */
995 for (x = x1; x < x2; x += ytile_span) {
996 mem_copy_align16(dst + x + 0 * dst_pitch, src + ((xo + yo + 0 * column_width) ^ swizzle), ytile_span);
997 mem_copy_align16(dst + x + 1 * dst_pitch, src + ((xo + yo + 1 * column_width) ^ swizzle), ytile_span);
998 mem_copy_align16(dst + x + 2 * dst_pitch, src + ((xo + yo + 2 * column_width) ^ swizzle), ytile_span);
999 mem_copy_align16(dst + x + 3 * dst_pitch, src + ((xo + yo + 3 * column_width) ^ swizzle), ytile_span);
1000 xo += bytes_per_column;
1001 swizzle ^= swizzle_bit;
1002 }
1003
1004 if (x2 != x3) {
1005 mem_copy_align16(dst + x2 + 0 * dst_pitch, src + ((xo + yo + 0 * column_width) ^ swizzle), x3 - x2);
1006 mem_copy_align16(dst + x2 + 1 * dst_pitch, src + ((xo + yo + 1 * column_width) ^ swizzle), x3 - x2);
1007 mem_copy_align16(dst + x2 + 2 * dst_pitch, src + ((xo + yo + 2 * column_width) ^ swizzle), x3 - x2);
1008 mem_copy_align16(dst + x2 + 3 * dst_pitch, src + ((xo + yo + 3 * column_width) ^ swizzle), x3 - x2);
1009 }
1010
1011 dst += 4 * dst_pitch;
1012 }
1013
1014 if (y2 != y3) {
1015 for (yo = y2 * column_width; yo < y3 * column_width; yo += column_width) {
1016 uint32_t xo = xo1;
1017 uint32_t swizzle = swizzle1;
1018
1019 mem_copy(dst + x0, src + ((xo0 + yo) ^ swizzle0), x1 - x0);
1020
1021 /* Step by spans/columns. As it happens, the swizzle bit flips
1022 * at each step so we don't need to calculate it explicitly.
1023 */
1024 for (x = x1; x < x2; x += ytile_span) {
1025 mem_copy_align16(dst + x, src + ((xo + yo) ^ swizzle), ytile_span);
1026 xo += bytes_per_column;
1027 swizzle ^= swizzle_bit;
1028 }
1029
1030 mem_copy_align16(dst + x2, src + ((xo + yo) ^ swizzle), x3 - x2);
1031
1032 dst += dst_pitch;
1033 }
1034 }
1035 }
1036
1037
1038 /**
1039 * Copy texture data from Tile-4 layout to linear.
1040 *
1041 * \copydoc tile_copy_fn
1042 */
1043 static inline void
1044 tile4_to_linear(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
1045 uint32_t y0, uint32_t y3,
1046 char *dst, const char *src,
1047 int32_t dst_pitch,
1048 uint32_t swizzle_bit,
1049 isl_mem_copy_fn mem_copy,
1050 isl_mem_copy_fn mem_copy_align16)
1051 {
1052
1053 /* Tile 4 consists of columns that are 'ytile_span' wide; each 64B tile block
1054 * consists of 4 rows of Y-tile ordered data.
1055 * Each 512B block within a 4kB tile contains 8 such 64B blocks.
1056 *
1057 * To calculate the tiled offset, we need to identify:
1058 * Block X and Block Y offset at each 512B block boundary in X and Y direction.
1059 *
1060 * Refer to the Tile4 layout diagram in linear_to_tile4() function.
1061 *
1062 * The tile is divided into 512B blocks [Blk0..Blk7], themselves made of 2
1063 * rows of 256B sub-blocks
1064 *
1065 * Each sub-block is composed of 4 64B elements[cell(0)-cell(3)].
1066 *
1067 * Each 64B cell represents 4 rows of data.[cell(0), cell(1), .., cell(63)]
1068 *
1069 *
1070 * Block X - Adds 256B to offset when we encounter block boundary in
1071 * X direction.(Ex: Blk 0 --> Blk 1(BlkX_off = 256))
1072 * Block Y - Adds 512B to offset when we encounter block boundary in
1073 * Y direction.(Ex: Blk 0 --> Blk 3(BlkY_off = 512))
1074 *
1075 * (x / ytile_span) * cacheline_size_B //Byte offset in the X dir of the
1076 * containing 64B block
1077 * x % ytile_span //Byte offset in X dir within a 64B block/cacheline
1078 *
1079 * (y % 4) * 16 // Byte offset of the Y dir within a 64B block/cacheline
1080 * (y / 4) * 256// Byte offset of the Y dir within 512B block after 1 row
1081 * of 64B blocks/cachelines
1082 *
1083 * The copy destination offset for each range copied is the sum of
1084 * Block X offset 'BlkX_off', Block Y offset 'BlkY_off', X offset 'xo'
1085 * and a Y offset 'yo.'
1086 */
1087
1088 const uint32_t column_width = ytile_span;
1089 const uint32_t tile4_blkh = 4;
1090
1091 assert(ytile_span * tile4_blkh == 64);
1092 const uint32_t cacheline_size_B = 64;
1093
1094 /* Find intermediate Y offsets that are aligned to a 64B element
1095 * (4 rows), so that we can do fully 64B memcpys on those.
1096 */
1097 uint32_t y1 = MIN2(y3, ALIGN_UP(y0, 4));
1098 uint32_t y2 = MAX2(y1, ALIGN_DOWN(y3, 4));
1099
1100 /* xsb0 and xsb1 are the byte offset within a 256B sub block for x0 and x1 */
1101 uint32_t xsb0 = (x0 % ytile_span) + (x0 / ytile_span) * cacheline_size_B;
1102 uint32_t xsb1 = (x1 % ytile_span) + (x1 / ytile_span) * cacheline_size_B;
1103
1104 uint32_t Blkxsb0_off = ALIGN_DOWN(xsb0, 256);
1105 uint32_t Blky0_off = (y0 / 8) * 512;
1106
1107 uint32_t BlkX_off, BlkY_off;
1108
1109 uint32_t x, yo, Y0, Y2;
1110
1111 /* Y0 determines the initial byte offset in the Y direction */
1112 Y0 = (y0 / 4) * 256 + (y0 % 4) * 16;
1113
1114 /* Y2 determines the byte offset required for reaching y2 if y2 doesn't map
1115 * exactly to 512B block boundary
1116 */
1117 Y2 = y2 * 4 * column_width;
1118
1119 dst += (ptrdiff_t)y0 * dst_pitch;
1120
1121 /* To maximize memcpy speed, we do the copy in 3 parts :
1122 * - copy the first lines that are not aligned to the 64B cell's height (4 rows)
1123 * - copy the lines that are aligned to 64B cell's height
1124 * - copy the remaining lines not making up for a full 64B cell's height
1125 */
1126 if (y0 != y1) {
1127 for (yo = Y0; yo < Y0 + (y1 - y0) * column_width; yo += column_width) {
1128 uint32_t xo = xsb1;
1129
1130 if (x0 != x1)
1131 mem_copy(dst + x0, src + (Blky0_off + Blkxsb0_off) + (xsb0 + yo), x1 - x0);
1132
1133 for (x = x1; x < x2; x += ytile_span) {
1134 BlkX_off = ALIGN_DOWN(xo, 256);
1135
1136 mem_copy_align16(dst + x, src + (Blky0_off + BlkX_off) + (xo + yo), ytile_span);
1137 xo += cacheline_size_B;
1138 }
1139
1140 if (x3 != x2) {
1141 BlkX_off = ALIGN_DOWN(xo, 256);
1142 mem_copy_align16(dst + x2, src + (Blky0_off + BlkX_off) + (xo + yo), x3 - x2);
1143 }
1144
1145 dst += dst_pitch;
1146 }
1147 }
1148
1149 for (yo = y1 * 4 * column_width; yo < y2 * 4 * column_width; yo += 16 * column_width) {
1150 uint32_t xo = xsb1;
1151 BlkY_off = ALIGN_DOWN(yo, 512);
1152
1153 if (x0 != x1) {
1154 mem_copy(dst + x0 + 0 * dst_pitch,
1155 src + (BlkY_off + Blkxsb0_off) + (xsb0 + yo + 0 * column_width),
1156 x1 - x0);
1157 mem_copy(dst + x0 + 1 * dst_pitch,
1158 src + (BlkY_off + Blkxsb0_off) + (xsb0 + yo + 1 * column_width),
1159 x1 - x0);
1160 mem_copy(dst + x0 + 2 * dst_pitch,
1161 src + (BlkY_off + Blkxsb0_off) + (xsb0 + yo + 2 * column_width),
1162 x1 - x0);
1163 mem_copy(dst + x0 + 3 * dst_pitch,
1164 src + (BlkY_off + Blkxsb0_off) + (xsb0 + yo + 3 * column_width),
1165 x1 - x0);
1166 }
1167
1168 for (x = x1; x < x2; x += ytile_span) {
1169 BlkX_off = ALIGN_DOWN(xo, 256);
1170
1171 mem_copy_align16(dst + x + 0 * dst_pitch,
1172 src + (BlkY_off + BlkX_off) + (xo + yo + 0 * column_width),
1173 ytile_span);
1174 mem_copy_align16(dst + x + 1 * dst_pitch,
1175 src + (BlkY_off + BlkX_off) + (xo + yo + 1 * column_width),
1176 ytile_span);
1177 mem_copy_align16(dst + x + 2 * dst_pitch,
1178 src + (BlkY_off + BlkX_off) + (xo + yo + 2 * column_width),
1179 ytile_span);
1180 mem_copy_align16(dst + x + 3 * dst_pitch,
1181 src + (BlkY_off + BlkX_off) + (xo + yo + 3 * column_width),
1182 ytile_span);
1183
1184 xo += cacheline_size_B;
1185 }
1186
1187 if (x2 != x3) {
1188 BlkX_off = ALIGN_DOWN(xo, 256);
1189
1190 mem_copy(dst + x2 + 0 * dst_pitch,
1191 src + (BlkY_off + BlkX_off) + (xo + yo + 0 * column_width),
1192 x3 - x2);
1193 mem_copy(dst + x2 + 1 * dst_pitch,
1194 src + (BlkY_off + BlkX_off) + (xo + yo + 1 * column_width),
1195 x3 - x2);
1196 mem_copy(dst + x2 + 2 * dst_pitch,
1197 src + (BlkY_off + BlkX_off) + (xo + yo + 2 * column_width),
1198 x3 - x2);
1199 mem_copy(dst + x2 + 3 * dst_pitch,
1200 src + (BlkY_off + BlkX_off) + (xo + yo + 3 * column_width),
1201 x3 - x2);
1202 }
1203
1204 dst += 4 * dst_pitch;
1205 }
1206
1207 if (y2 != y3) {
1208 for (yo = Y2; yo < Y2 + (y3 - y2) * column_width; yo += column_width) {
1209 uint32_t xo = xsb1;
1210 BlkY_off = ALIGN_DOWN(yo, 512);
1211
1212 if (x0 != x1)
1213 mem_copy(dst + x0, src + (BlkY_off + Blkxsb0_off) + (xsb0 + yo), x1 - x0);
1214
1215 for (x = x1; x < x2; x += ytile_span) {
1216 BlkX_off = ALIGN_DOWN(xo, 256);
1217
1218 mem_copy_align16(dst + x, src + (BlkY_off + BlkX_off) + (xo + yo), ytile_span);
1219 xo += cacheline_size_B;
1220 }
1221
1222 if (x3 != x2) {
1223 BlkX_off = ALIGN_DOWN(xo, 256);
1224 mem_copy_align16(dst + x2, src + (BlkY_off + BlkX_off) + (xo + yo), x3 - x2);
1225 }
1226
1227 dst += dst_pitch;
1228 }
1229 }
1230 }
1231
1232 /**
1233 * Copy texture data from W tile layout to linear.
1234 *
1235 * \copydoc tile_copy_fn
1236 */
1237 static inline void
1238 wtiled_to_linear(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
1239 uint32_t y0, uint32_t y3,
1240 char *dst, const char *src,
1241 int32_t dst_pitch)
1242 {
1243 /*
1244 * The layout is a series of blocks of 64B each.
1245 * ________________________________________________
1246 * |blk00|blk08|blk16|blk24|blk32|blk40|blk48|blk56|
1247 * |blk01|blk09|blk17|blk25|blk33|blk41|blk49|blk57|
1248 * |blk02|blk10|blk18|blk26|blk34|blk42|blk50|blk58|
1249 * |blk03|blk11|blk19|blk27|blk35|blk43|blk51|blk59|
1250 * |blk04|blk12|blk20|blk28|blk36|blk44|blk52|blk60|
1251 * |blk05|blk13|blk21|blk29|blk37|blk45|blk53|blk61|
1252 * |blk06|blk14|blk22|blk30|blk38|blk46|blk54|blk62|
1253 * |blk07|blk15|blk23|blk31|blk39|blk47|blk55|blk63|
1254 * ------------------------------------------------
1255 */
1256
1257 /* Find intermediate Y offsets that are aligned to a 64B element (8 rows).
1258 */
1259 uint32_t y1 = MIN2(y3, ALIGN_UP(y0, 8));
1260 uint32_t y2 = MAX2(y1, ALIGN_DOWN(y3, 8));
1261
1262 uint32_t xo, yo;
1263
1264 /* If the y0 coordinate is not aligned to a block, do partial copies into
1265 * the top row of blocks (0, 8, 16, 24, 32, 40, 48 & 56).
1266 */
1267 if (y0 != y1) {
1268 if (x0 != x1)
1269 wtile_block_copy_to_linear(dst, src, x0, x1, y0, y1, dst_pitch);
1270 for (xo = x1; xo < x2; xo += 8)
1271 wtile_block_copy_to_linear(dst, src, xo, xo + 8, y0, y1, dst_pitch);
1272 if (x2 != x3)
1273 wtile_block_copy_to_linear(dst, src, x2, x3, y0, y1, dst_pitch);
1274 }
1275
1276 for (yo = y1; yo < y2; yo += 8) {
1277 /* Do partial copies into blocks [1, 6] if x0 is not aligned to a block. */
1278 if (x0 != x1)
1279 wtile_block_copy_to_linear(dst, src, x0, x1, yo, yo + 8, dst_pitch);
1280 /* Full block copies on the inside. */
1281 for (xo = x1; xo < x2; xo += 8)
1282 wtile_block_full_copy_to_linear(dst, src, xo, yo, dst_pitch);
1283 /* Do partial copies into blocks [57, 62] if x3 is not aligned to a block.
1284 */
1285 if (x2 != x3)
1286 wtile_block_copy_to_linear(dst, src, x2, x3, yo, yo + 8, dst_pitch);
1287 }
1288
1289 /* If the y3 coordinate is not aligned to a block, do partial copies into
1290 * the bottom row of blocks (7, 15, 23, 31, 39, 47, 55 & 63).
1291 */
1292 if (y2 != y3) {
1293 if (x0 != x1)
1294 wtile_block_copy_to_linear(dst, src, x0, x1, y2, y3, dst_pitch);
1295 for (xo = x1; xo < x2; xo += 8) {
1296 wtile_block_copy_to_linear(dst, src,
1297 xo, MIN2(xo + 8, x3), y2, y3, dst_pitch);
1298 }
1299 if (x2 != x3)
1300 wtile_block_copy_to_linear(dst, src, x2, x3, y2, y3, dst_pitch);
1301 }
1302 }
1303
1304 #if defined(INLINE_SSE41)
1305 static ALWAYS_INLINE void *
1306 _memcpy_streaming_load(void *dest, const void *src, size_t count)
1307 {
1308 if (count == 16) {
1309 __m128i val = _mm_stream_load_si128((__m128i *)src);
1310 _mm_storeu_si128((__m128i *)dest, val);
1311 return dest;
1312 } else if (count == 64) {
1313 __m128i val0 = _mm_stream_load_si128(((__m128i *)src) + 0);
1314 __m128i val1 = _mm_stream_load_si128(((__m128i *)src) + 1);
1315 __m128i val2 = _mm_stream_load_si128(((__m128i *)src) + 2);
1316 __m128i val3 = _mm_stream_load_si128(((__m128i *)src) + 3);
1317 _mm_storeu_si128(((__m128i *)dest) + 0, val0);
1318 _mm_storeu_si128(((__m128i *)dest) + 1, val1);
1319 _mm_storeu_si128(((__m128i *)dest) + 2, val2);
1320 _mm_storeu_si128(((__m128i *)dest) + 3, val3);
1321 return dest;
1322 } else {
1323 assert(count < 64); /* and (count < 16) for ytiled */
1324 return memcpy(dest, src, count);
1325 }
1326 }
1327 #endif
1328
1329 static isl_mem_copy_fn
1330 choose_copy_function(isl_memcpy_type copy_type)
1331 {
1332 switch(copy_type) {
1333 case ISL_MEMCPY:
1334 return memcpy;
1335 case ISL_MEMCPY_BGRA8:
1336 return rgba8_copy;
1337 case ISL_MEMCPY_STREAMING_LOAD:
1338 #if defined(INLINE_SSE41)
1339 return _memcpy_streaming_load;
1340 #else
1341 unreachable("ISL_MEMCOPY_STREAMING_LOAD requires sse4.1");
1342 #endif
1343 case ISL_MEMCPY_INVALID:
1344 unreachable("invalid copy_type");
1345 }
1346 unreachable("unhandled copy_type");
1347 return NULL;
1348 }
1349
1350 /**
1351 * Copy texture data from linear to X tile layout, faster.
1352 *
1353 * Same as \ref linear_to_xtiled but faster, because it passes constant
1354 * parameters for common cases, allowing the compiler to inline code
1355 * optimized for those cases.
1356 *
1357 * \copydoc tile_copy_fn
1358 */
1359 static FLATTEN void
1360 linear_to_xtiled_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
1361 uint32_t y0, uint32_t y1,
1362 char *dst, const char *src,
1363 int32_t src_pitch,
1364 uint32_t swizzle_bit,
1365 isl_memcpy_type copy_type)
1366 {
1367 isl_mem_copy_fn mem_copy = choose_copy_function(copy_type);
1368
1369 if (x0 == 0 && x3 == xtile_width && y0 == 0 && y1 == xtile_height) {
1370 if (mem_copy == memcpy)
1371 return linear_to_xtiled(0, 0, xtile_width, xtile_width, 0, xtile_height,
1372 dst, src, src_pitch, swizzle_bit, memcpy, memcpy);
1373 else if (mem_copy == rgba8_copy)
1374 return linear_to_xtiled(0, 0, xtile_width, xtile_width, 0, xtile_height,
1375 dst, src, src_pitch, swizzle_bit,
1376 rgba8_copy, rgba8_copy_aligned_dst);
1377 else
1378 unreachable("not reached");
1379 } else {
1380 if (mem_copy == memcpy)
1381 return linear_to_xtiled(x0, x1, x2, x3, y0, y1,
1382 dst, src, src_pitch, swizzle_bit,
1383 memcpy, memcpy);
1384 else if (mem_copy == rgba8_copy)
1385 return linear_to_xtiled(x0, x1, x2, x3, y0, y1,
1386 dst, src, src_pitch, swizzle_bit,
1387 rgba8_copy, rgba8_copy_aligned_dst);
1388 else
1389 unreachable("not reached");
1390 }
1391 }
1392
1393 /**
1394 * Copy texture data from linear to Y tile layout, faster.
1395 *
1396 * Same as \ref linear_to_ytiled but faster, because it passes constant
1397 * parameters for common cases, allowing the compiler to inline code
1398 * optimized for those cases.
1399 *
1400 * \copydoc tile_copy_fn
1401 */
1402 static FLATTEN void
1403 linear_to_ytiled_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
1404 uint32_t y0, uint32_t y1,
1405 char *dst, const char *src,
1406 int32_t src_pitch,
1407 uint32_t swizzle_bit,
1408 isl_memcpy_type copy_type)
1409 {
1410 isl_mem_copy_fn mem_copy = choose_copy_function(copy_type);
1411
1412 if (x0 == 0 && x3 == ytile_width && y0 == 0 && y1 == ytile_height) {
1413 if (mem_copy == memcpy)
1414 return linear_to_ytiled(0, 0, ytile_width, ytile_width, 0, ytile_height,
1415 dst, src, src_pitch, swizzle_bit, memcpy, memcpy);
1416 else if (mem_copy == rgba8_copy)
1417 return linear_to_ytiled(0, 0, ytile_width, ytile_width, 0, ytile_height,
1418 dst, src, src_pitch, swizzle_bit,
1419 rgba8_copy, rgba8_copy_aligned_dst);
1420 else
1421 unreachable("not reached");
1422 } else {
1423 if (mem_copy == memcpy)
1424 return linear_to_ytiled(x0, x1, x2, x3, y0, y1,
1425 dst, src, src_pitch, swizzle_bit, memcpy, memcpy);
1426 else if (mem_copy == rgba8_copy)
1427 return linear_to_ytiled(x0, x1, x2, x3, y0, y1,
1428 dst, src, src_pitch, swizzle_bit,
1429 rgba8_copy, rgba8_copy_aligned_dst);
1430 else
1431 unreachable("not reached");
1432 }
1433 }
1434
1435 /**
1436 * Copy texture data from linear to tile 4 layout, faster.
1437 *
1438 * Same as \ref linear_to_tile4 but faster, because it passes constant
1439 * parameters for common cases, allowing the compiler to inline code
1440 * optimized for those cases.
1441 *
1442 * \copydoc tile_copy_fn
1443 */
1444 static FLATTEN void
1445 linear_to_tile4_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
1446 uint32_t y0, uint32_t y1,
1447 char *dst, const char *src,
1448 int32_t src_pitch,
1449 uint32_t swizzle_bit,
1450 isl_memcpy_type copy_type)
1451 {
1452 isl_mem_copy_fn mem_copy = choose_copy_function(copy_type);
1453 assert(swizzle_bit == 0);
1454
1455 if (x0 == 0 && x3 == ytile_width && y0 == 0 && y1 == ytile_height) {
1456 if (mem_copy == memcpy)
1457 return linear_to_tile4(0, 0, ytile_width, ytile_width, 0, ytile_height,
1458 dst, src, src_pitch, swizzle_bit, memcpy, memcpy);
1459 else if (mem_copy == rgba8_copy)
1460 return linear_to_tile4(0, 0, ytile_width, ytile_width, 0, ytile_height,
1461 dst, src, src_pitch, swizzle_bit,
1462 rgba8_copy, rgba8_copy_aligned_dst);
1463 else
1464 unreachable("not reached");
1465 } else {
1466 if (mem_copy == memcpy)
1467 return linear_to_tile4(x0, x1, x2, x3, y0, y1,
1468 dst, src, src_pitch, swizzle_bit, memcpy, memcpy);
1469 else if (mem_copy == rgba8_copy)
1470 return linear_to_tile4(x0, x1, x2, x3, y0, y1,
1471 dst, src, src_pitch, swizzle_bit,
1472 rgba8_copy, rgba8_copy_aligned_dst);
1473 else
1474 unreachable("not reached");
1475 }
1476 }
1477
1478 /**
1479 * Copy texture data from linear to tile W layout, faster.
1480 *
1481 * Same as \ref linear_to_wtiled but faster, because it passes constant
1482 * parameters for common cases, allowing the compiler to inline code
1483 * optimized for those cases.
1484 *
1485 * \copydoc tile_copy_fn
1486 */
1487 static FLATTEN void
1488 linear_to_wtiled_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
1489 uint32_t y0, uint32_t y1,
1490 char *dst, const char *src,
1491 int32_t src_pitch,
1492 uint32_t swizzle_bit,
1493 isl_memcpy_type copy_type)
1494 {
1495 assert(swizzle_bit == 0);
1496 if (x0 == 0 && x3 == wtile_width && y0 == 0 && y1 == wtile_height) {
1497 return linear_to_wtiled(0, 0,
1498 wtile_width, wtile_width,
1499 0, wtile_height,
1500 dst, src, src_pitch);
1501 } else {
1502 return linear_to_wtiled(x0, x1, x2, x3, y0, y1,
1503 dst, src, src_pitch);
1504 }
1505 }
1506
1507 /**
1508 * Copy texture data from X tile layout to linear, faster.
1509 *
1510 * Same as \ref xtiled_to_linear but faster, because it passes constant
1511 * parameters for common cases, allowing the compiler to inline code
1512 * optimized for those cases.
1513 *
1514 * \copydoc tile_copy_fn
1515 */
1516 static FLATTEN void
1517 xtiled_to_linear_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
1518 uint32_t y0, uint32_t y1,
1519 char *dst, const char *src,
1520 int32_t dst_pitch,
1521 uint32_t swizzle_bit,
1522 isl_memcpy_type copy_type)
1523 {
1524 isl_mem_copy_fn mem_copy = choose_copy_function(copy_type);
1525
1526 if (x0 == 0 && x3 == xtile_width && y0 == 0 && y1 == xtile_height) {
1527 if (mem_copy == memcpy)
1528 return xtiled_to_linear(0, 0, xtile_width, xtile_width, 0, xtile_height,
1529 dst, src, dst_pitch, swizzle_bit, memcpy, memcpy);
1530 else if (mem_copy == rgba8_copy)
1531 return xtiled_to_linear(0, 0, xtile_width, xtile_width, 0, xtile_height,
1532 dst, src, dst_pitch, swizzle_bit,
1533 rgba8_copy, rgba8_copy_aligned_src);
1534 #if defined(INLINE_SSE41)
1535 else if (mem_copy == _memcpy_streaming_load)
1536 return xtiled_to_linear(0, 0, xtile_width, xtile_width, 0, xtile_height,
1537 dst, src, dst_pitch, swizzle_bit,
1538 memcpy, _memcpy_streaming_load);
1539 #endif
1540 else
1541 unreachable("not reached");
1542 } else {
1543 if (mem_copy == memcpy)
1544 return xtiled_to_linear(x0, x1, x2, x3, y0, y1,
1545 dst, src, dst_pitch, swizzle_bit, memcpy, memcpy);
1546 else if (mem_copy == rgba8_copy)
1547 return xtiled_to_linear(x0, x1, x2, x3, y0, y1,
1548 dst, src, dst_pitch, swizzle_bit,
1549 rgba8_copy, rgba8_copy_aligned_src);
1550 #if defined(INLINE_SSE41)
1551 else if (mem_copy == _memcpy_streaming_load)
1552 return xtiled_to_linear(x0, x1, x2, x3, y0, y1,
1553 dst, src, dst_pitch, swizzle_bit,
1554 memcpy, _memcpy_streaming_load);
1555 #endif
1556 else
1557 unreachable("not reached");
1558 }
1559 }
1560
1561 /**
1562 * Copy texture data from Y tile layout to linear, faster.
1563 *
1564 * Same as \ref ytiled_to_linear but faster, because it passes constant
1565 * parameters for common cases, allowing the compiler to inline code
1566 * optimized for those cases.
1567 *
1568 * \copydoc tile_copy_fn
1569 */
1570 static FLATTEN void
1571 ytiled_to_linear_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
1572 uint32_t y0, uint32_t y1,
1573 char *dst, const char *src,
1574 int32_t dst_pitch,
1575 uint32_t swizzle_bit,
1576 isl_memcpy_type copy_type)
1577 {
1578 isl_mem_copy_fn mem_copy = choose_copy_function(copy_type);
1579
1580 if (x0 == 0 && x3 == ytile_width && y0 == 0 && y1 == ytile_height) {
1581 if (mem_copy == memcpy)
1582 return ytiled_to_linear(0, 0, ytile_width, ytile_width, 0, ytile_height,
1583 dst, src, dst_pitch, swizzle_bit, memcpy, memcpy);
1584 else if (mem_copy == rgba8_copy)
1585 return ytiled_to_linear(0, 0, ytile_width, ytile_width, 0, ytile_height,
1586 dst, src, dst_pitch, swizzle_bit,
1587 rgba8_copy, rgba8_copy_aligned_src);
1588 #if defined(INLINE_SSE41)
1589 else if (copy_type == ISL_MEMCPY_STREAMING_LOAD)
1590 return ytiled_to_linear(0, 0, ytile_width, ytile_width, 0, ytile_height,
1591 dst, src, dst_pitch, swizzle_bit,
1592 memcpy, _memcpy_streaming_load);
1593 #endif
1594 else
1595 unreachable("not reached");
1596 } else {
1597 if (mem_copy == memcpy)
1598 return ytiled_to_linear(x0, x1, x2, x3, y0, y1,
1599 dst, src, dst_pitch, swizzle_bit, memcpy, memcpy);
1600 else if (mem_copy == rgba8_copy)
1601 return ytiled_to_linear(x0, x1, x2, x3, y0, y1,
1602 dst, src, dst_pitch, swizzle_bit,
1603 rgba8_copy, rgba8_copy_aligned_src);
1604 #if defined(INLINE_SSE41)
1605 else if (copy_type == ISL_MEMCPY_STREAMING_LOAD)
1606 return ytiled_to_linear(x0, x1, x2, x3, y0, y1,
1607 dst, src, dst_pitch, swizzle_bit,
1608 memcpy, _memcpy_streaming_load);
1609 #endif
1610 else
1611 unreachable("not reached");
1612 }
1613 }
1614
1615 /**
1616 * Copy texture data from tile4 layout to linear, faster.
1617 *
1618 * Same as \ref tile4_to_linear but faster, because it passes constant
1619 * parameters for common cases, allowing the compiler to inline code
1620 * optimized for those cases.
1621 *
1622 * \copydoc tile_copy_fn
1623 */
1624 static FLATTEN void
1625 tile4_to_linear_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
1626 uint32_t y0, uint32_t y1,
1627 char *dst, const char *src,
1628 int32_t dst_pitch,
1629 uint32_t swizzle_bit,
1630 isl_memcpy_type copy_type)
1631 {
1632 isl_mem_copy_fn mem_copy = choose_copy_function(copy_type);
1633 assert(swizzle_bit == 0);
1634
1635 if (x0 == 0 && x3 == ytile_width && y0 == 0 && y1 == ytile_height) {
1636 if (mem_copy == memcpy)
1637 return tile4_to_linear(0, 0, ytile_width, ytile_width, 0, ytile_height,
1638 dst, src, dst_pitch, swizzle_bit, memcpy, memcpy);
1639 else if (mem_copy == rgba8_copy)
1640 return tile4_to_linear(0, 0, ytile_width, ytile_width, 0, ytile_height,
1641 dst, src, dst_pitch, swizzle_bit,
1642 rgba8_copy, rgba8_copy_aligned_src);
1643 #if defined(INLINE_SSE41)
1644 else if (copy_type == ISL_MEMCPY_STREAMING_LOAD)
1645 return tile4_to_linear(0, 0, ytile_width, ytile_width, 0, ytile_height,
1646 dst, src, dst_pitch, swizzle_bit,
1647 memcpy, _memcpy_streaming_load);
1648 #endif
1649 else
1650 unreachable("not reached");
1651 } else {
1652 if (mem_copy == memcpy)
1653 return tile4_to_linear(x0, x1, x2, x3, y0, y1,
1654 dst, src, dst_pitch, swizzle_bit, memcpy, memcpy);
1655 else if (mem_copy == rgba8_copy)
1656 return tile4_to_linear(x0, x1, x2, x3, y0, y1,
1657 dst, src, dst_pitch, swizzle_bit,
1658 rgba8_copy, rgba8_copy_aligned_src);
1659 #if defined(INLINE_SSE41)
1660 else if (copy_type == ISL_MEMCPY_STREAMING_LOAD)
1661 return tile4_to_linear(x0, x1, x2, x3, y0, y1,
1662 dst, src, dst_pitch, swizzle_bit,
1663 memcpy, _memcpy_streaming_load);
1664 #endif
1665 else
1666 unreachable("not reached");
1667 }
1668 }
1669
1670 /**
1671 * Copy texture data from tileW layout to linear, faster.
1672 *
1673 * Same as \ref wtiled_to_linear but faster, because it passes constant
1674 * parameters for common cases, allowing the compiler to inline code
1675 * optimized for those cases.
1676 *
1677 * \copydoc tile_copy_fn
1678 */
1679 static FLATTEN void
1680 wtiled_to_linear_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
1681 uint32_t y0, uint32_t y1,
1682 char *dst, const char *src,
1683 int32_t dst_pitch,
1684 uint32_t swizzle_bit,
1685 isl_memcpy_type copy_type)
1686 {
1687 assert(swizzle_bit == 0);
1688
1689 if (x0 == 0 && x3 == wtile_width && y0 == 0 && y1 == wtile_height) {
1690 return wtiled_to_linear(0, 0,
1691 wtile_width, wtile_width,
1692 0, wtile_height,
1693 dst, src, dst_pitch);
1694 } else {
1695 return wtiled_to_linear(x0, x1, x2, x3, y0, y1,
1696 dst, src, dst_pitch);
1697 }
1698 }
1699
1700 /**
1701 * Copy from linear to tiled texture.
1702 *
1703 * Divide the region given by X range [xt1, xt2) and Y range [yt1, yt2) into
1704 * pieces that do not cross tile boundaries and copy each piece with a tile
1705 * copy function (\ref tile_copy_fn).
1706 * The X range is in bytes, i.e. pixels * bytes-per-pixel.
1707 * The Y range is in pixels (i.e. unitless).
1708 * 'dst' is the address of (0, 0) in the destination tiled texture.
1709 * 'src' is the address of (xt1, yt1) in the source linear texture.
1710 */
1711 static void
1712 linear_to_tiled(uint32_t xt1, uint32_t xt2,
1713 uint32_t yt1, uint32_t yt2,
1714 char *dst, const char *src,
1715 uint32_t dst_pitch, int32_t src_pitch,
1716 bool has_swizzling,
1717 enum isl_tiling tiling,
1718 isl_memcpy_type copy_type)
1719 {
1720 tile_copy_fn tile_copy;
1721 uint32_t xt0, xt3;
1722 uint32_t yt0, yt3;
1723 uint32_t xt, yt;
1724 uint32_t tw, th, xt_sub_range_alignment;
1725 uint32_t swizzle_bit = has_swizzling ? 1<<6 : 0;
1726
1727 if (tiling == ISL_TILING_X) {
1728 tw = xtile_width;
1729 th = xtile_height;
1730 xt_sub_range_alignment = xtile_span;
1731 tile_copy = linear_to_xtiled_faster;
1732 } else if (tiling == ISL_TILING_Y0) {
1733 tw = ytile_width;
1734 th = ytile_height;
1735 xt_sub_range_alignment = ytile_span;
1736 tile_copy = linear_to_ytiled_faster;
1737 } else if (tiling == ISL_TILING_4) {
1738 tw = ytile_width;
1739 th = ytile_height;
1740 xt_sub_range_alignment = ytile_span;
1741 tile_copy = linear_to_tile4_faster;
1742 } else if (tiling == ISL_TILING_W) {
1743 tw = wtile_width;
1744 th = wtile_height;
1745 /* The copy function works on whole W-Tile blocks (8x8 bytes) wherever
1746 * possible. The width of a W-Tile block is four W-Tile spans.
1747 */
1748 xt_sub_range_alignment = wtile_span * 4;
1749 tile_copy = linear_to_wtiled_faster;
1750 /* TileW is a special case with doubled physical tile width due to HW
1751 * programming requirements (see isl_tiling_get_info() in
1752 * src/intel/isl/isl.c)
1753 */
1754 dst_pitch /= 2;
1755 } else {
1756 unreachable("unsupported tiling");
1757 }
1758
1759 /* Round out to tile boundaries. */
1760 xt0 = ALIGN_DOWN(xt1, tw);
1761 xt3 = ALIGN_UP (xt2, tw);
1762 yt0 = ALIGN_DOWN(yt1, th);
1763 yt3 = ALIGN_UP (yt2, th);
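
   /* Worked example with illustrative numbers (not taken from any caller):
    * for Y tiling, tw = 128 and th = 32, so a byte range xt1 = 100,
    * xt2 = 300 rounds out to xt0 = 0, xt3 = 384, and a row range yt1 = 40,
    * yt2 = 50 rounds out to yt0 = 32, yt3 = 64.
    */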
1764
1765 /* Loop over all tiles to which we have something to copy.
1766 * 'xt' and 'yt' are the origin of the destination tile, whether copying
1767 * a full or partial tile.
1768 * tile_copy() copies one tile or partial tile.
1769 * Looping x inside y is the faster memory access pattern.
1770 */
1771 for (yt = yt0; yt < yt3; yt += th) {
1772 for (xt = xt0; xt < xt3; xt += tw) {
1773 /* The area to update is [x0,x3) x [y0,y1).
1774 * May not want the whole tile, hence the min and max.
1775 */
1776 uint32_t x0 = MAX2(xt1, xt);
1777 uint32_t y0 = MAX2(yt1, yt);
1778 uint32_t x3 = MIN2(xt2, xt + tw);
1779 uint32_t y1 = MIN2(yt2, yt + th);
1780
1781 /* [x0,x3) is split into [x0,x1), [x1,x2), [x2,x3) such that the
1782 * middle interval is the longest xt_sub_range_alignment-aligned part.
1783 * The sub-ranges could be empty.
1784 */
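      /* For example (hypothetical values): with a 16-byte alignment,
       * x0 = 5 and x3 = 98 give x1 = 16 and x2 = 96, splitting the copy
       * into 11 + 80 + 2 bytes. With x0 = 5 and x3 = 11, x1 would land
       * past x3, so x1 and x2 both collapse to 11 and only the unaligned
       * leading interval [5,11) is non-empty.
       */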
1785 uint32_t x1, x2;
1786 x1 = ALIGN_UP(x0, xt_sub_range_alignment);
1787 if (x1 > x3)
1788 x1 = x2 = x3;
1789 else
1790 x2 = ALIGN_DOWN(x3, xt_sub_range_alignment);
1791
1792 assert(x0 <= x1 && x1 <= x2 && x2 <= x3);
1793 assert(x1 - x0 < xt_sub_range_alignment &&
1794 x3 - x2 < xt_sub_range_alignment);
1795 assert(x3 - x0 <= tw);
1796 assert((x2 - x1) % xt_sub_range_alignment == 0);
1797
1798 /* Translate by (xt,yt) for single-tile copier. */
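      /* A tile of tw bytes x th rows occupies tw * th contiguous bytes in
       * the tiled destination, so the tile whose leftmost byte column is xt
       * starts at byte offset xt * th within its row of tiles, and that row
       * of tiles starts at yt * dst_pitch. The linear source pointer is
       * relative to (xt1, yt1), hence the xt - xt1 and yt - yt1 translation.
       */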
1799 tile_copy(x0-xt, x1-xt, x2-xt, x3-xt,
1800 y0-yt, y1-yt,
1801 dst + (ptrdiff_t)xt * th + (ptrdiff_t)yt * dst_pitch,
1802 src + (ptrdiff_t)xt - xt1 + ((ptrdiff_t)yt - yt1) * src_pitch,
1803 src_pitch,
1804 swizzle_bit,
1805 copy_type);
1806 }
1807 }
1808 }
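
/* Illustrative sketch (not part of the build; the buffer names and pitch
 * below are hypothetical): uploading a 16x16 texel sub-rectangle at texel
 * (x, y) of an RGBA8 image (4 bytes per texel) from a tightly packed linear
 * staging buffer into a Y-tiled mapping could look like:
 *
 *    linear_to_tiled(x * 4, (x + 16) * 4,       // X range in bytes
 *                    y, y + 16,                 // Y range in rows
 *                    tiled_map,                 // (0, 0) of the tiled surface
 *                    staging,                   // (xt1, yt1) of the linear data
 *                    tiled_row_pitch_B,         // dst pitch in bytes
 *                    16 * 4,                    // src pitch in bytes
 *                    false, ISL_TILING_Y0, ISL_MEMCPY);
 */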
1809
1810 /**
1811 * Copy from tiled to linear texture.
1812 *
1813 * Divide the region given by X range [xt1, xt2) and Y range [yt1, yt2) into
1814 * pieces that do not cross tile boundaries and copy each piece with a tile
1815 * copy function (\ref tile_copy_fn).
1816 * The X range is in bytes, i.e. pixels * bytes-per-pixel.
1817 * The Y range is in pixels (i.e. unitless).
1818 * 'dst' is the address of (xt1, yt1) in the destination linear texture.
1819 * 'src' is the address of (0, 0) in the source tiled texture.
1820 */
1821 static void
1822 tiled_to_linear(uint32_t xt1, uint32_t xt2,
1823 uint32_t yt1, uint32_t yt2,
1824 char *dst, const char *src,
1825 int32_t dst_pitch, uint32_t src_pitch,
1826 bool has_swizzling,
1827 enum isl_tiling tiling,
1828 isl_memcpy_type copy_type)
1829 {
1830 tile_copy_fn tile_copy;
1831 uint32_t xt0, xt3;
1832 uint32_t yt0, yt3;
1833 uint32_t xt, yt;
1834 uint32_t tw, th, xt_sub_range_alignment;
1835 uint32_t swizzle_bit = has_swizzling ? 1<<6 : 0;
1836
1837 if (tiling == ISL_TILING_X) {
1838 tw = xtile_width;
1839 th = xtile_height;
1840 xt_sub_range_alignment = xtile_span;
1841 tile_copy = xtiled_to_linear_faster;
1842 } else if (tiling == ISL_TILING_Y0) {
1843 tw = ytile_width;
1844 th = ytile_height;
1845 xt_sub_range_alignment = ytile_span;
1846 tile_copy = ytiled_to_linear_faster;
1847 } else if (tiling == ISL_TILING_4) {
1848 tw = ytile_width;
1849 th = ytile_height;
1850 xt_sub_range_alignment = ytile_span;
1851 tile_copy = tile4_to_linear_faster;
1852 } else if (tiling == ISL_TILING_W) {
1853 tw = wtile_width;
1854 th = wtile_height;
1855 /* The copy function works on whole W-Tile blocks (8x8 bytes) wherever
1856 * possible. The width of a W-Tile block is four W-Tile spans.
1857 */
1858 xt_sub_range_alignment = wtile_span * 4;
1859 tile_copy = wtiled_to_linear_faster;
1860 /* TileW is a special case with doubled physical tile width due to HW
1861 * programming requirements (see isl_tiling_get_info() in
1862 * src/intel/isl/isl.c)
1863 */
1864 src_pitch /= 2;
1865 } else {
1866 unreachable("unsupported tiling");
1867 }
1868
1869 #if defined(INLINE_SSE41)
1870 if (copy_type == ISL_MEMCPY_STREAMING_LOAD) {
1871 /* The hidden cacheline sized register used by movntdqa can apparently
1872 * give you stale data, so do an mfence to invalidate it.
1873 */
1874 _mm_mfence();
1875 }
1876 #endif
1877
1878 /* Round out to tile boundaries. */
1879 xt0 = ALIGN_DOWN(xt1, tw);
1880 xt3 = ALIGN_UP (xt2, tw);
1881 yt0 = ALIGN_DOWN(yt1, th);
1882 yt3 = ALIGN_UP (yt2, th);
1883
1884 /* Loop over all tiles to which we have something to copy.
1885 * 'xt' and 'yt' are the origin of the source tile, whether copying
1886 * a full or partial tile.
1887 * tile_copy() copies one tile or partial tile.
1888 * Looping x inside y is the faster memory access pattern.
1889 */
1890 for (yt = yt0; yt < yt3; yt += th) {
1891 for (xt = xt0; xt < xt3; xt += tw) {
1892 /* The area to update is [x0,x3) x [y0,y1).
1893 * May not want the whole tile, hence the min and max.
1894 */
1895 uint32_t x0 = MAX2(xt1, xt);
1896 uint32_t y0 = MAX2(yt1, yt);
1897 uint32_t x3 = MIN2(xt2, xt + tw);
1898 uint32_t y1 = MIN2(yt2, yt + th);
1899
1900 /* [x0,x3) is split into [x0,x1), [x1,x2), [x2,x3) such that the
1901 * middle interval is the longest xt_sub_range_alignment aligned
1902 * part. The sub-ranges could be empty.
1903 */
1904 uint32_t x1, x2;
1905 x1 = ALIGN_UP(x0, xt_sub_range_alignment);
1906 if (x1 > x3)
1907 x1 = x2 = x3;
1908 else
1909 x2 = ALIGN_DOWN(x3, xt_sub_range_alignment);
1910
1911 assert(x0 <= x1 && x1 <= x2 && x2 <= x3);
1912 assert(x1 - x0 < xt_sub_range_alignment &&
1913 x3 - x2 < xt_sub_range_alignment);
1914 assert(x3 - x0 <= tw);
1915 assert((x2 - x1) % xt_sub_range_alignment == 0);
1916
1917 /* Translate by (xt,yt) for single-tile copier. */
1918 tile_copy(x0-xt, x1-xt, x2-xt, x3-xt,
1919 y0-yt, y1-yt,
1920 dst + (ptrdiff_t)xt - xt1 + ((ptrdiff_t)yt - yt1) * dst_pitch,
1921 src + (ptrdiff_t)xt * th + (ptrdiff_t)yt * src_pitch,
1922 dst_pitch,
1923 swizzle_bit,
1924 copy_type);
1925 }
1926 }
1927 }
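
/* Illustrative sketch (not part of the build; the buffer names and pitch
 * below are hypothetical): reading a 16x16 texel RGBA8 sub-rectangle back
 * out of an X-tiled mapping into a tightly packed linear buffer could look
 * like:
 *
 *    tiled_to_linear(x * 4, (x + 16) * 4,       // X range in bytes
 *                    y, y + 16,                 // Y range in rows
 *                    readback,                  // (xt1, yt1) of the linear data
 *                    tiled_map,                 // (0, 0) of the tiled surface
 *                    16 * 4,                    // dst pitch in bytes
 *                    tiled_row_pitch_B,         // src pitch in bytes
 *                    false, ISL_TILING_X, ISL_MEMCPY);
 */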
1928