/*
 * Mesa 3-D graphics library
 *
 * Copyright 2012 Intel Corporation
 * Copyright 2013 Google
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors:
 *    Chad Versace <chad.versace@linux.intel.com>
 *    Frank Henigman <fjhenigman@google.com>
 */

#include <string.h>

#include "util/macros.h"
#include "util/u_math.h"
#include "util/rounding.h"
#include "isl_priv.h"

#if defined(__SSSE3__)
#include <tmmintrin.h>
#elif defined(__SSE2__)
#include <emmintrin.h>
#endif

#define FILE_DEBUG_FLAG DEBUG_TEXTURE

#define ALIGN_DOWN(a, b) ROUND_DOWN_TO(a, b)
#define ALIGN_UP(a, b) ALIGN(a, b)

/* Tile dimensions.  Width and span are in bytes, height is in pixels (i.e.
 * unitless).  A "span" is the greatest number of bytes we can copy from
 * linear to tiled without needing to calculate a new destination address.
 */
static const uint32_t xtile_width = 512;
static const uint32_t xtile_height = 8;
static const uint32_t xtile_span = 64;
static const uint32_t ytile_width = 128;
static const uint32_t ytile_height = 32;
static const uint32_t ytile_span = 16;
static const uint32_t wtile_width = 64;
static const uint32_t wtile_height = 64;
static const uint32_t wtile_span = 2;
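
/* Note: with these dimensions each tile covers 4096 bytes:
 * X tile: 512 B x 8 rows, Y tile: 128 B x 32 rows, W tile: 64 B x 64 rows.
 */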

static inline uint32_t
ror(uint32_t n, uint32_t d)
{
   return (n >> d) | (n << (32 - d));
}

// bswap32 already exists as a macro on some platforms (FreeBSD)
#ifndef bswap32
static inline uint32_t
bswap32(uint32_t n)
{
#if defined(HAVE___BUILTIN_BSWAP32)
   return __builtin_bswap32(n);
#else
   return (n >> 24) |
          ((n >> 8) & 0x0000ff00) |
          ((n << 8) & 0x00ff0000) |
          (n << 24);
#endif
}
#endif

/**
 * Copy RGBA to BGRA - swap R and B.
 */
static inline void *
rgba8_copy(void *dst, const void *src, size_t bytes)
{
   uint32_t *d = dst;
   uint32_t const *s = src;

   assert(bytes % 4 == 0);

   while (bytes >= 4) {
      *d = ror(bswap32(*s), 8);
      d += 1;
      s += 1;
      bytes -= 4;
   }
   return dst;
}

#define wtile_block_id(x, y)                    \
   (((((x) >> 3) & 0x7) << 3) |                 \
    (((y) >> 3) & 0x7))

#define wtile_block_offset(x, y)                \
   ((((y) & 4) << 3) +                          \
    (((y) & 2) << 2) +                          \
    (((y) & 1) << 1) +                          \
    (((x) & 4) << 2) +                          \
    (((x) & 2) << 1) +                          \
    (((x) & 1) << 0))
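
/* Worked example (illustration, not from the original source): for surface
 * coordinate (x, y) = (11, 5), wtile_block_id(11, 5) = ((1 & 7) << 3) | 0 = 8
 * and wtile_block_offset(11, 5) = 32 + 0 + 2 + 0 + 4 + 1 = 39, i.e. the byte
 * lands at offset 39 within the tile's block 8.
 */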

/**
 * Copy from linear into a W tile block.
 *
 * @dst is a pointer to a block in a W tile, @src is a pointer to the linear
 * data, coordinates are relative to the surface (not the tile).
 */
static inline void
wtile_block_copy_from_linear(void *dst, const void *src,
                             unsigned x0, unsigned x1,
                             unsigned y0, unsigned y1,
                             unsigned src_pitch)
{
   uint8_t *dst_data = dst + wtile_block_id(x0, y0) * 64;
   const uint8_t *src_data = src;

   for (unsigned y = y0; y < y1; y++)
      for (unsigned x = x0; x < x1; x++)
         dst_data[wtile_block_offset(x, y)] = src_data[y * src_pitch + x];
}

/**
 * Copy from linear into a full W tile block.
 *
 * @dst is a pointer to a block in a W tile, @src is a pointer to the linear
 * data.
 */
static inline void
wtile_block_full_copy_from_linear(void *dst, const void *src,
                                  unsigned x0, unsigned y0,
                                  unsigned src_pitch)
{
   uint16_t *dst_data = dst + wtile_block_id(x0, y0) * 64;
   const uint8_t *src_data = src;

   /*
    * The layout of a block is a series of 2-byte elements (pairs of
    * consecutive bytes):
    * _________________________________
    * |B00|B01|B04|B05|B16|B17|B20|B21|
    * |B02|B03|B06|B07|B18|B19|B22|B23|
    * |B08|B09|B12|B13|B24|B25|B28|B29|
    * |B10|B11|B14|B15|B26|B27|B30|B31|
    * |B32|B33|B36|B37|B48|B49|B52|B53|
    * |B34|B35|B38|B39|B50|B51|B54|B55|
    * |B40|B41|B44|B45|B56|B57|B60|B61|
    * |B42|B43|B46|B47|B58|B59|B62|B63|
    * ---------------------------------
    */

#define src_lin(bx, by) \
   (*((const uint16_t *)(src_data + (y0 + by) * src_pitch + x0 + bx * 2)))

   dst_data[0]  = src_lin(0, 0);
   dst_data[1]  = src_lin(0, 1);
   dst_data[2]  = src_lin(1, 0);
   dst_data[3]  = src_lin(1, 1);
   dst_data[4]  = src_lin(0, 2);
   dst_data[5]  = src_lin(0, 3);
   dst_data[6]  = src_lin(1, 2);
   dst_data[7]  = src_lin(1, 3);

   dst_data[8]  = src_lin(2, 0);
   dst_data[9]  = src_lin(2, 1);
   dst_data[10] = src_lin(3, 0);
   dst_data[11] = src_lin(3, 1);
   dst_data[12] = src_lin(2, 2);
   dst_data[13] = src_lin(2, 3);
   dst_data[14] = src_lin(3, 2);
   dst_data[15] = src_lin(3, 3);

   dst_data[16] = src_lin(0, 4);
   dst_data[17] = src_lin(0, 5);
   dst_data[18] = src_lin(1, 4);
   dst_data[19] = src_lin(1, 5);
   dst_data[20] = src_lin(0, 6);
   dst_data[21] = src_lin(0, 7);
   dst_data[22] = src_lin(1, 6);
   dst_data[23] = src_lin(1, 7);

   dst_data[24] = src_lin(2, 4);
   dst_data[25] = src_lin(2, 5);
   dst_data[26] = src_lin(3, 4);
   dst_data[27] = src_lin(3, 5);
   dst_data[28] = src_lin(2, 6);
   dst_data[29] = src_lin(2, 7);
   dst_data[30] = src_lin(3, 6);
   dst_data[31] = src_lin(3, 7);

#undef src_lin
}

/**
 * Copy from W tile block into linear.
 *
 * @dst is a pointer to the linear data, @src is a pointer to a block in the W
 * tile.
 */
static inline void
wtile_block_copy_to_linear(void *dst, const void *src,
                           unsigned x0, unsigned x1,
                           unsigned y0, unsigned y1,
                           unsigned dst_pitch)
{
   uint8_t *dst_data = dst;
   const uint8_t *src_data = src + wtile_block_id(x0, y0) * 64;

   for (unsigned y = y0; y < y1; y++)
      for (unsigned x = x0; x < x1; x++)
         dst_data[y * dst_pitch + x] = src_data[wtile_block_offset(x, y)];
}

/**
 * Copy to linear from a full W tile block.
 *
 * @dst is a pointer to the linear data, @src is a pointer to a block in a W
 * tile.
 */
static inline void
wtile_block_full_copy_to_linear(void *dst, const void *src,
                                unsigned x0, unsigned y0,
                                unsigned dst_pitch)
{
   uint8_t *dst_data = dst;
   const uint16_t *src_data = src + wtile_block_id(x0, y0) * 64;

   /*
    * The layout of a block is a series of 2-byte elements (pairs of
    * consecutive bytes):
    * _________________________________
    * |B00|B01|B04|B05|B16|B17|B20|B21|
    * |B02|B03|B06|B07|B18|B19|B22|B23|
    * |B08|B09|B12|B13|B24|B25|B28|B29|
    * |B10|B11|B14|B15|B26|B27|B30|B31|
    * |B32|B33|B36|B37|B48|B49|B52|B53|
    * |B34|B35|B38|B39|B50|B51|B54|B55|
    * |B40|B41|B44|B45|B56|B57|B60|B61|
    * |B42|B43|B46|B47|B58|B59|B62|B63|
    * ---------------------------------
    */

#define dst_lin(bx, by) \
   (*((uint16_t *)(dst_data + (y0 + by) * dst_pitch + x0 + bx * 2)))

   dst_lin(0, 0) = src_data[0];
   dst_lin(0, 1) = src_data[1];
   dst_lin(1, 0) = src_data[2];
   dst_lin(1, 1) = src_data[3];
   dst_lin(0, 2) = src_data[4];
   dst_lin(0, 3) = src_data[5];
   dst_lin(1, 2) = src_data[6];
   dst_lin(1, 3) = src_data[7];

   dst_lin(2, 0) = src_data[8];
   dst_lin(2, 1) = src_data[9];
   dst_lin(3, 0) = src_data[10];
   dst_lin(3, 1) = src_data[11];
   dst_lin(2, 2) = src_data[12];
   dst_lin(2, 3) = src_data[13];
   dst_lin(3, 2) = src_data[14];
   dst_lin(3, 3) = src_data[15];

   dst_lin(0, 4) = src_data[16];
   dst_lin(0, 5) = src_data[17];
   dst_lin(1, 4) = src_data[18];
   dst_lin(1, 5) = src_data[19];
   dst_lin(0, 6) = src_data[20];
   dst_lin(0, 7) = src_data[21];
   dst_lin(1, 6) = src_data[22];
   dst_lin(1, 7) = src_data[23];

   dst_lin(2, 4) = src_data[24];
   dst_lin(2, 5) = src_data[25];
   dst_lin(3, 4) = src_data[26];
   dst_lin(3, 5) = src_data[27];
   dst_lin(2, 6) = src_data[28];
   dst_lin(2, 7) = src_data[29];
   dst_lin(3, 6) = src_data[30];
   dst_lin(3, 7) = src_data[31];

#undef dst_lin
}

#ifdef __SSSE3__
static const uint8_t rgba8_permutation[16] =
   { 2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15 };
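/* The shuffle picks byte 2 of each 4-byte pixel first: it swaps bytes 0 and 2
 * (R and B) and leaves bytes 1 and 3 (G and A) in place.
 */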

static inline void
rgba8_copy_16_aligned_dst(void *dst, const void *src)
{
   _mm_store_si128(dst,
                   _mm_shuffle_epi8(_mm_loadu_si128(src),
                                    *(__m128i *)rgba8_permutation));
}

static inline void
rgba8_copy_16_aligned_src(void *dst, const void *src)
{
   _mm_storeu_si128(dst,
                    _mm_shuffle_epi8(_mm_load_si128(src),
                                     *(__m128i *)rgba8_permutation));
}

#elif defined(__SSE2__)
static inline void
rgba8_copy_16_aligned_dst(void *dst, const void *src)
{
   __m128i srcreg, dstreg, agmask, ag, rb, br;

   agmask = _mm_set1_epi32(0xFF00FF00);
   srcreg = _mm_loadu_si128((__m128i *)src);

   rb = _mm_andnot_si128(agmask, srcreg);
   ag = _mm_and_si128(agmask, srcreg);
   br = _mm_shufflehi_epi16(_mm_shufflelo_epi16(rb, _MM_SHUFFLE(2, 3, 0, 1)),
                            _MM_SHUFFLE(2, 3, 0, 1));
   dstreg = _mm_or_si128(ag, br);

   _mm_store_si128((__m128i *)dst, dstreg);
}

static inline void
rgba8_copy_16_aligned_src(void *dst, const void *src)
{
   __m128i srcreg, dstreg, agmask, ag, rb, br;

   agmask = _mm_set1_epi32(0xFF00FF00);
   srcreg = _mm_load_si128((__m128i *)src);

   rb = _mm_andnot_si128(agmask, srcreg);
   ag = _mm_and_si128(agmask, srcreg);
   br = _mm_shufflehi_epi16(_mm_shufflelo_epi16(rb, _MM_SHUFFLE(2, 3, 0, 1)),
                            _MM_SHUFFLE(2, 3, 0, 1));
   dstreg = _mm_or_si128(ag, br);

   _mm_storeu_si128((__m128i *)dst, dstreg);
}
#endif

/**
 * Copy RGBA to BGRA - swap R and B, with the destination 16-byte aligned.
 */
static inline void *
rgba8_copy_aligned_dst(void *dst, const void *src, size_t bytes)
{
   assert(bytes == 0 || !(((uintptr_t)dst) & 0xf));

#if defined(__SSSE3__) || defined(__SSE2__)
   if (bytes == 64) {
      rgba8_copy_16_aligned_dst(dst +  0, src +  0);
      rgba8_copy_16_aligned_dst(dst + 16, src + 16);
      rgba8_copy_16_aligned_dst(dst + 32, src + 32);
      rgba8_copy_16_aligned_dst(dst + 48, src + 48);
      return dst;
   }

   while (bytes >= 16) {
      rgba8_copy_16_aligned_dst(dst, src);
      src += 16;
      dst += 16;
      bytes -= 16;
   }
#endif

   rgba8_copy(dst, src, bytes);

   return dst;
}

/**
 * Copy RGBA to BGRA - swap R and B, with the source 16-byte aligned.
 */
static inline void *
rgba8_copy_aligned_src(void *dst, const void *src, size_t bytes)
{
   assert(bytes == 0 || !(((uintptr_t)src) & 0xf));

#if defined(__SSSE3__) || defined(__SSE2__)
   if (bytes == 64) {
      rgba8_copy_16_aligned_src(dst +  0, src +  0);
      rgba8_copy_16_aligned_src(dst + 16, src + 16);
      rgba8_copy_16_aligned_src(dst + 32, src + 32);
      rgba8_copy_16_aligned_src(dst + 48, src + 48);
      return dst;
   }

   while (bytes >= 16) {
      rgba8_copy_16_aligned_src(dst, src);
      src += 16;
      dst += 16;
      bytes -= 16;
   }
#endif

   rgba8_copy(dst, src, bytes);

   return dst;
}

/**
 * Each row from y0 to y1 is copied in three parts: [x0,x1), [x1,x2), [x2,x3).
 * These ranges are in bytes, i.e. pixels * bytes-per-pixel.
 * The first and last ranges must be shorter than a "span" (the longest linear
 * stretch within a tile) and the middle must equal a whole number of spans.
 * Ranges may be empty.  The region copied must land entirely within one tile.
 * 'dst' is the start of the tile and 'src' is the corresponding
 * address to copy from, though copying begins at (x0, y0).
 * To enable swizzling 'swizzle_bit' must be 1<<6, otherwise zero.
 * Swizzling flips bit 6 in the copy destination offset, when certain other
 * bits are set in it.
 */
typedef void (*tile_copy_fn)(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
                             uint32_t y0, uint32_t y1,
                             char *dst, const char *src,
                             int32_t linear_pitch,
                             uint32_t swizzle_bit,
                             isl_memcpy_type copy_type);
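
/* Note: callers are expected to split [x0, x3) so that the middle range is
 * span-aligned; roughly x1 = ALIGN_UP(x0, span) and x2 = ALIGN_DOWN(x3, span),
 * both clamped to [x0, x3] (an assumption about the callers, which are not
 * shown in this file).
 */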

/**
 * Copy texture data from linear to X tile layout.
 *
 * \copydoc tile_copy_fn
 *
 * The mem_copy parameters allow the user to specify an alternative mem_copy
 * function that, for instance, may do RGBA -> BGRA swizzling.  The first
 * function must handle any memory alignment while the second function must
 * only handle 16-byte alignment in whichever side (source or destination) is
 * tiled.
 */
static inline void
linear_to_xtiled(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
                 uint32_t y0, uint32_t y1,
                 char *dst, const char *src,
                 int32_t src_pitch,
                 uint32_t swizzle_bit,
                 isl_mem_copy_fn mem_copy,
                 isl_mem_copy_fn mem_copy_align16)
{
   /* The copy destination offset for each range copied is the sum of
    * an X offset 'x0' or 'xo' and a Y offset 'yo.'
    */
   uint32_t xo, yo;

   src += (ptrdiff_t)y0 * src_pitch;

   for (yo = y0 * xtile_width; yo < y1 * xtile_width; yo += xtile_width) {
      /* Bits 9 and 10 of the copy destination offset control swizzling.
       * Only 'yo' contributes to those bits in the total offset,
       * so calculate 'swizzle' just once per row.
       * Move bits 9 and 10 three and four places respectively down
       * to bit 6 and xor them.
       */
      uint32_t swizzle = ((yo >> 3) ^ (yo >> 4)) & swizzle_bit;
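      /* For instance (illustration, not from the original source): with
       * swizzle_bit = 1 << 6 and yo = 0x200 (bit 9 set),
       * ((0x200 >> 3) ^ (0x200 >> 4)) & 0x40 = (0x40 ^ 0x20) & 0x40 = 0x40,
       * so bit 6 of the offset gets flipped.
       */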

      mem_copy(dst + ((x0 + yo) ^ swizzle), src + x0, x1 - x0);

      for (xo = x1; xo < x2; xo += xtile_span) {
         mem_copy_align16(dst + ((xo + yo) ^ swizzle), src + xo, xtile_span);
      }

      mem_copy_align16(dst + ((xo + yo) ^ swizzle), src + x2, x3 - x2);

      src += src_pitch;
   }
}

/**
 * Copy texture data from linear to Y tile layout.
 *
 * \copydoc tile_copy_fn
 */
static inline void
linear_to_ytiled(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
                 uint32_t y0, uint32_t y3,
                 char *dst, const char *src,
                 int32_t src_pitch,
                 uint32_t swizzle_bit,
                 isl_mem_copy_fn mem_copy,
                 isl_mem_copy_fn mem_copy_align16)
{
   /* Y tiles consist of columns that are 'ytile_span' wide (and the same height
    * as the tile).  Thus the destination offset for (x,y) is the sum of:
    *   (x % column_width)                    // position within column
    *   (x / column_width) * bytes_per_column // column number * bytes per column
    *   y * column_width
    *
    * The copy destination offset for each range copied is the sum of
    * an X offset 'xo0' or 'xo' and a Y offset 'yo.'
    */
   const uint32_t column_width = ytile_span;
   const uint32_t bytes_per_column = column_width * ytile_height;
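
   /* Worked example (illustration, not from the original source): with
    * ytile_span = 16 and ytile_height = 32, bytes_per_column = 512, so
    * x = 35 contributes (35 % 16) + (35 / 16) * 512 = 3 + 1024 = 1027 to the
    * tiled offset, plus y * 16 for the row.
    */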

   uint32_t y1 = MIN2(y3, ALIGN_UP(y0, 4));
   uint32_t y2 = MAX2(y1, ALIGN_DOWN(y3, 4));

   uint32_t xo0 = (x0 % ytile_span) + (x0 / ytile_span) * bytes_per_column;
   uint32_t xo1 = (x1 % ytile_span) + (x1 / ytile_span) * bytes_per_column;

   /* Bit 9 of the destination offset controls swizzling.
    * Only the X offset contributes to bit 9 of the total offset,
    * so swizzle can be calculated in advance for these X positions.
    * Move bit 9 three places down to bit 6.
    */
   uint32_t swizzle0 = (xo0 >> 3) & swizzle_bit;
   uint32_t swizzle1 = (xo1 >> 3) & swizzle_bit;

   uint32_t x, yo;

   src += (ptrdiff_t)y0 * src_pitch;

   if (y0 != y1) {
      for (yo = y0 * column_width; yo < y1 * column_width; yo += column_width) {
         uint32_t xo = xo1;
         uint32_t swizzle = swizzle1;

         mem_copy(dst + ((xo0 + yo) ^ swizzle0), src + x0, x1 - x0);

         /* Step by spans/columns.  As it happens, the swizzle bit flips
          * at each step so we don't need to calculate it explicitly.
          */
         for (x = x1; x < x2; x += ytile_span) {
            mem_copy_align16(dst + ((xo + yo) ^ swizzle), src + x, ytile_span);
            xo += bytes_per_column;
            swizzle ^= swizzle_bit;
         }

         mem_copy_align16(dst + ((xo + yo) ^ swizzle), src + x2, x3 - x2);

         src += src_pitch;
      }
   }

   for (yo = y1 * column_width; yo < y2 * column_width; yo += 4 * column_width) {
      uint32_t xo = xo1;
      uint32_t swizzle = swizzle1;

      if (x0 != x1) {
         mem_copy(dst + ((xo0 + yo + 0 * column_width) ^ swizzle0), src + x0 + 0 * src_pitch, x1 - x0);
         mem_copy(dst + ((xo0 + yo + 1 * column_width) ^ swizzle0), src + x0 + 1 * src_pitch, x1 - x0);
         mem_copy(dst + ((xo0 + yo + 2 * column_width) ^ swizzle0), src + x0 + 2 * src_pitch, x1 - x0);
         mem_copy(dst + ((xo0 + yo + 3 * column_width) ^ swizzle0), src + x0 + 3 * src_pitch, x1 - x0);
      }

      /* Step by spans/columns.  As it happens, the swizzle bit flips
       * at each step so we don't need to calculate it explicitly.
       */
      for (x = x1; x < x2; x += ytile_span) {
         mem_copy_align16(dst + ((xo + yo + 0 * column_width) ^ swizzle), src + x + 0 * src_pitch, ytile_span);
         mem_copy_align16(dst + ((xo + yo + 1 * column_width) ^ swizzle), src + x + 1 * src_pitch, ytile_span);
         mem_copy_align16(dst + ((xo + yo + 2 * column_width) ^ swizzle), src + x + 2 * src_pitch, ytile_span);
         mem_copy_align16(dst + ((xo + yo + 3 * column_width) ^ swizzle), src + x + 3 * src_pitch, ytile_span);
         xo += bytes_per_column;
         swizzle ^= swizzle_bit;
      }

      if (x2 != x3) {
         mem_copy_align16(dst + ((xo + yo + 0 * column_width) ^ swizzle), src + x2 + 0 * src_pitch, x3 - x2);
         mem_copy_align16(dst + ((xo + yo + 1 * column_width) ^ swizzle), src + x2 + 1 * src_pitch, x3 - x2);
         mem_copy_align16(dst + ((xo + yo + 2 * column_width) ^ swizzle), src + x2 + 2 * src_pitch, x3 - x2);
         mem_copy_align16(dst + ((xo + yo + 3 * column_width) ^ swizzle), src + x2 + 3 * src_pitch, x3 - x2);
      }

      src += 4 * src_pitch;
   }

   if (y2 != y3) {
      for (yo = y2 * column_width; yo < y3 * column_width; yo += column_width) {
         uint32_t xo = xo1;
         uint32_t swizzle = swizzle1;

         mem_copy(dst + ((xo0 + yo) ^ swizzle0), src + x0, x1 - x0);

         /* Step by spans/columns.  As it happens, the swizzle bit flips
          * at each step so we don't need to calculate it explicitly.
          */
         for (x = x1; x < x2; x += ytile_span) {
            mem_copy_align16(dst + ((xo + yo) ^ swizzle), src + x, ytile_span);
            xo += bytes_per_column;
            swizzle ^= swizzle_bit;
         }

         mem_copy_align16(dst + ((xo + yo) ^ swizzle), src + x2, x3 - x2);

         src += src_pitch;
      }
   }
}

/**
 * Copy texture data from linear to Tile-4 layout.
 *
 * \copydoc tile_copy_fn
 */
static inline void
linear_to_tile4(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
                uint32_t y0, uint32_t y3,
                char *dst, const char *src,
                int32_t src_pitch,
                uint32_t swizzle_bit,
                isl_mem_copy_fn mem_copy,
                isl_mem_copy_fn mem_copy_align16)
{
   /* Tile-4 consists of columns that are 'ytile_span' wide; each 64B tile
    * block consists of 4 rows of Y-tile ordered data.
    * Each 512B block within a 4kB tile contains 8 such blocks.
    *
    * To calculate the tiled offset, we need to identify the Block X and
    * Block Y offsets at each 512B block boundary in the X and Y directions.
    *
    * A Tile-4 has the following layout:
    *
    *                |<------------- 128 B-------------------|
    *                _________________________________________
    * 512B blk(Blk0)^|  0 |  1 |  2 |  3 |  8 |  9 | 10 | 11 | ^ 512B blk(Blk1)
    * (cell 0..7))  v|  4 |  5 |  6 |  7 | 12 | 13 | 14 | 15 | v (cell 8..15))
    *                -----------------------------------------
    *                | 16 | 17 | 18 | 19 | 24 | 25 | 26 | 27 |
    *                | 20 | 21 | 22 | 23 | 28 | 29 | 30 | 31 |
    *                -----------------------------------------
    *                | 32 | 33 | 34 | 35 | 40 | 41 | 42 | 43 |
    *                | 36 | 37 | 38 | 39 | 44 | 45 | 46 | 47 |
    *                -----------------------------------------
    *                | 48 | 49 | 50 | 51 | 56 | 57 | 58 | 59 |
    *                | 52 | 53 | 54 | 55 | 60 | 61 | 62 | 63 |
    *                -----------------------------------------
    *
    * The tile is divided into 512B blocks [Blk0..Blk7], themselves made of 2
    * rows of 256B sub-blocks.
    *
    * Each sub-block is composed of 4 64B elements [cell(0)-cell(3)] (a cell
    * in the figure above).
    *
    * Each 64B cell represents 4 rows of data [cell(0), cell(1), .., cell(63)].
    *
    *
    *   Block X - Adds 256B to the offset when we cross a block boundary in
    *             the X direction.  (Ex: Blk 0 --> Blk 1 (BlkX_off = 256))
    *   Block Y - Adds 512B to the offset when we cross a block boundary in
    *             the Y direction.  (Ex: Blk 0 --> Blk 3 (BlkY_off = 512))
    *
    *   (x / ytile_span) * cacheline_size_B // Byte offset in the X dir of
    *                                          the containing 64B block
    *   x % ytile_span // Byte offset in X dir within a 64B block/cacheline
    *
    *   (y % 4) * 16  // Byte offset of the Y dir within a 64B block/cacheline
    *   (y / 4) * 256 // Byte offset of the Y dir within 512B block after 1 row
    *                    of 64B blocks/cachelines
    *
    * The copy destination offset for each range copied is the sum of
    * the Block X offset 'BlkX_off', the Block Y offset 'BlkY_off', an X
    * offset 'xo' and a Y offset 'yo.'
    */
   const uint32_t column_width = ytile_span;
   const uint32_t tile4_blkh = 4;

   assert(ytile_span * tile4_blkh == 64);
   const uint32_t cacheline_size_B = 64;
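
   /* Worked example (illustration, not from the original source): for
    * x = 20, y = 6 with ytile_span = 16, the X part of the offset is
    * (20 % 16) + (20 / 16) * 64 = 4 + 64 = 68 and the Y part is
    * (6 / 4) * 256 + (6 % 4) * 16 = 256 + 32 = 288.
    */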

   /* Find intermediate Y offsets that are aligned to a 64B element
    * (4 rows), so that we can do fully 64B memcpys on those.
    */
   uint32_t y1 = MIN2(y3, ALIGN_UP(y0, 4));
   uint32_t y2 = MAX2(y1, ALIGN_DOWN(y3, 4));

   /* xsb0 and xsb1 are the byte offset within a 256B sub block for x0 and x1 */
   uint32_t xsb0 = (x0 % ytile_span) + (x0 / ytile_span) * cacheline_size_B;
   uint32_t xsb1 = (x1 % ytile_span) + (x1 / ytile_span) * cacheline_size_B;

   uint32_t Blkxsb0_off = ALIGN_DOWN(xsb0, 256);
   uint32_t Blky0_off = (y0 / 8) * 512;

   uint32_t BlkX_off, BlkY_off;

   uint32_t x, yo, Y0, Y2;

   /* Y0 determines the initial byte offset in the Y direction */
   Y0 = (y0 / 4) * 256 + (y0 % 4) * ytile_span;

   /* Y2 determines the byte offset required for reaching y2 if y2 doesn't map
    * exactly to 512B block boundary
    */
   Y2 = y2 * 4 * column_width;

   src += (ptrdiff_t)y0 * src_pitch;

   /* To maximize memcpy speed, we do the copy in 3 parts:
    *   - copy the first lines that are not aligned to the 64B cell's height (4 rows)
    *   - copy the lines that are aligned to the 64B cell's height
    *   - copy the remaining lines that don't make up a full 64B cell's height
    */
   if (y0 != y1) {
      for (yo = Y0; yo < Y0 + (y1 - y0) * column_width; yo += column_width) {
         uint32_t xo = xsb1;

         if (x0 != x1)
            mem_copy(dst + (Blky0_off + Blkxsb0_off) + (xsb0 + yo), src + x0, x1 - x0);

         for (x = x1; x < x2; x += ytile_span) {
            BlkX_off = ALIGN_DOWN(xo, 256);

            mem_copy_align16(dst + (Blky0_off + BlkX_off) + (xo + yo), src + x, ytile_span);
            xo += cacheline_size_B;
         }

         if (x3 != x2) {
            BlkX_off = ALIGN_DOWN(xo, 256);
            mem_copy_align16(dst + (Blky0_off + BlkX_off) + (xo + yo), src + x2, x3 - x2);
         }

         src += src_pitch;
      }
   }

   for (yo = y1 * 4 * column_width; yo < y2 * 4 * column_width; yo += 16 * column_width) {
      uint32_t xo = xsb1;
      BlkY_off = ALIGN_DOWN(yo, 512);

      if (x0 != x1) {
         mem_copy(dst + (BlkY_off + Blkxsb0_off) + (xsb0 + yo + 0 * column_width),
                  src + x0 + 0 * src_pitch, x1 - x0);
         mem_copy(dst + (BlkY_off + Blkxsb0_off) + (xsb0 + yo + 1 * column_width),
                  src + x0 + 1 * src_pitch, x1 - x0);
         mem_copy(dst + (BlkY_off + Blkxsb0_off) + (xsb0 + yo + 2 * column_width),
                  src + x0 + 2 * src_pitch, x1 - x0);
         mem_copy(dst + (BlkY_off + Blkxsb0_off) + (xsb0 + yo + 3 * column_width),
                  src + x0 + 3 * src_pitch, x1 - x0);
      }

      for (x = x1; x < x2; x += ytile_span) {
         BlkX_off = ALIGN_DOWN(xo, 256);

         mem_copy_align16(dst + (BlkY_off + BlkX_off) + (xo + yo + 0 * column_width),
                          src + x + 0 * src_pitch, ytile_span);
         mem_copy_align16(dst + (BlkY_off + BlkX_off) + (xo + yo + 1 * column_width),
                          src + x + 1 * src_pitch, ytile_span);
         mem_copy_align16(dst + (BlkY_off + BlkX_off) + (xo + yo + 2 * column_width),
                          src + x + 2 * src_pitch, ytile_span);
         mem_copy_align16(dst + (BlkY_off + BlkX_off) + (xo + yo + 3 * column_width),
                          src + x + 3 * src_pitch, ytile_span);

         xo += cacheline_size_B;
      }

      if (x2 != x3) {
         BlkX_off = ALIGN_DOWN(xo, 256);

         mem_copy(dst + (BlkY_off + BlkX_off) + (xo + yo + 0 * column_width),
                  src + x2 + 0 * src_pitch, x3 - x2);
         mem_copy(dst + (BlkY_off + BlkX_off) + (xo + yo + 1 * column_width),
                  src + x2 + 1 * src_pitch, x3 - x2);
         mem_copy(dst + (BlkY_off + BlkX_off) + (xo + yo + 2 * column_width),
                  src + x2 + 2 * src_pitch, x3 - x2);
         mem_copy(dst + (BlkY_off + BlkX_off) + (xo + yo + 3 * column_width),
                  src + x2 + 3 * src_pitch, x3 - x2);
      }

      src += 4 * src_pitch;
   }

   if (y2 != y3) {
      for (yo = Y2; yo < Y2 + (y3 - y2) * column_width; yo += column_width) {
         uint32_t xo = xsb1;
         BlkY_off = ALIGN_DOWN(yo, 512);

         if (x0 != x1)
            mem_copy(dst + (BlkY_off + Blkxsb0_off) + (xsb0 + yo), src + x0, x1 - x0);

         for (x = x1; x < x2; x += ytile_span) {
            BlkX_off = ALIGN_DOWN(xo, 256);

            mem_copy_align16(dst + (BlkY_off + BlkX_off) + (xo + yo), src + x, ytile_span);
            xo += cacheline_size_B;
         }

         if (x3 != x2) {
            BlkX_off = ALIGN_DOWN(xo, 256);
            mem_copy_align16(dst + (BlkY_off + BlkX_off) + (xo + yo), src + x2, x3 - x2);
         }

         src += src_pitch;
      }
   }
}

/**
 * Copy texture data from linear to W tile layout.
 *
 * \copydoc tile_copy_fn
 */
static inline void
linear_to_wtiled(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
                 uint32_t y0, uint32_t y3,
                 char *dst, const char *src, int32_t src_pitch)
{
   /*
    * The layout is a series of blocks of 64B each.
    * _________________________________________________
    * |blk00|blk08|blk16|blk24|blk32|blk40|blk48|blk56|
    * |blk01|blk09|blk17|blk25|blk33|blk41|blk49|blk57|
    * |blk02|blk10|blk18|blk26|blk34|blk42|blk50|blk58|
    * |blk03|blk11|blk19|blk27|blk35|blk43|blk51|blk59|
    * |blk04|blk12|blk20|blk28|blk36|blk44|blk52|blk60|
    * |blk05|blk13|blk21|blk29|blk37|blk45|blk53|blk61|
    * |blk06|blk14|blk22|blk30|blk38|blk46|blk54|blk62|
    * |blk07|blk15|blk23|blk31|blk39|blk47|blk55|blk63|
    * -------------------------------------------------
    */

   /* Find intermediate Y offsets that are aligned to a 64B element (8 rows).
    */
   uint32_t y1 = MIN2(y3, ALIGN_UP(y0, 8));
   uint32_t y2 = MAX2(y1, ALIGN_DOWN(y3, 8));
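
   /* E.g. with y0 = 3 and y3 = 21: y1 = 8 and y2 = 16, so rows [3,8) and
    * [16,21) take the partial-block paths and rows [8,16) the full-block
    * path below.
    */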

   uint32_t xo, yo;

   /* If the y0 coordinate is not aligned to a block, do partial copies into
    * the top row of blocks (0, 8, 16, 24, 32, 40, 48 & 56).
    */
   if (y0 != y1) {
      if (x0 != x1)
         wtile_block_copy_from_linear(dst, src, x0, x1, y0, y1, src_pitch);
      for (xo = x1; xo < x2; xo += 8)
         wtile_block_copy_from_linear(dst, src, xo, xo + 8, y0, y1, src_pitch);
      if (x2 != x3)
         wtile_block_copy_from_linear(dst, src, x2, x3, y0, y1, src_pitch);
   }

   for (yo = y1; yo < y2; yo += 8) {
      /* Do partial copies into the first column of blocks if x0 is not
       * aligned to a block.
       */
      if (x0 != x1) {
         wtile_block_copy_from_linear(dst, src,
                                      x0, x1, yo, yo + 8, src_pitch);
      }
      /* Full block copies on the inside. */
      for (xo = x1; xo < x2; xo += 8)
         wtile_block_full_copy_from_linear(dst, src, xo, yo, src_pitch);
      /* Do partial copies into the last column of blocks if x3 is not
       * aligned to a block.
       */
      if (x2 != x3) {
         wtile_block_copy_from_linear(dst, src,
                                      x2, x3, yo, yo + 8, src_pitch);
      }
   }

   /* If the y3 coordinate is not aligned to a block, do partial copies into
    * the bottom row of blocks (7, 15, 23, 31, 39, 47, 55 & 63).
    */
   if (y2 != y3) {
      if (x0 != x1)
         wtile_block_copy_from_linear(dst, src, x0, x1, y2, y3, src_pitch);
      for (xo = x1; xo < x2; xo += 8)
         wtile_block_copy_from_linear(dst, src, xo, xo + 8, y2, y3, src_pitch);
      if (x2 != x3)
         wtile_block_copy_from_linear(dst, src, x2, x3, y2, y3, src_pitch);
   }
}

/**
 * Copy texture data from X tile layout to linear.
 *
 * \copydoc tile_copy_fn
 */
static inline void
xtiled_to_linear(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
                 uint32_t y0, uint32_t y1,
                 char *dst, const char *src,
                 int32_t dst_pitch,
                 uint32_t swizzle_bit,
                 isl_mem_copy_fn mem_copy,
                 isl_mem_copy_fn mem_copy_align16)
{
   /* The copy source offset for each range copied is the sum of
    * an X offset 'x0' or 'xo' and a Y offset 'yo.'
    */
   uint32_t xo, yo;

   dst += (ptrdiff_t)y0 * dst_pitch;

   for (yo = y0 * xtile_width; yo < y1 * xtile_width; yo += xtile_width) {
      /* Bits 9 and 10 of the copy source offset control swizzling.
       * Only 'yo' contributes to those bits in the total offset,
       * so calculate 'swizzle' just once per row.
       * Move bits 9 and 10 three and four places respectively down
       * to bit 6 and xor them.
       */
      uint32_t swizzle = ((yo >> 3) ^ (yo >> 4)) & swizzle_bit;

      mem_copy(dst + x0, src + ((x0 + yo) ^ swizzle), x1 - x0);

      for (xo = x1; xo < x2; xo += xtile_span) {
         mem_copy_align16(dst + xo, src + ((xo + yo) ^ swizzle), xtile_span);
      }

      mem_copy_align16(dst + x2, src + ((xo + yo) ^ swizzle), x3 - x2);

      dst += dst_pitch;
   }
}

/**
 * Copy texture data from Y tile layout to linear.
 *
 * \copydoc tile_copy_fn
 */
static inline void
ytiled_to_linear(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
                 uint32_t y0, uint32_t y3,
                 char *dst, const char *src,
                 int32_t dst_pitch,
                 uint32_t swizzle_bit,
                 isl_mem_copy_fn mem_copy,
                 isl_mem_copy_fn mem_copy_align16)
{
   /* Y tiles consist of columns that are 'ytile_span' wide (and the same height
    * as the tile).  Thus the source offset for (x,y) is the sum of:
    *   (x % column_width)                    // position within column
    *   (x / column_width) * bytes_per_column // column number * bytes per column
    *   y * column_width
    *
    * The copy source offset for each range copied is the sum of
    * an X offset 'xo0' or 'xo' and a Y offset 'yo.'
    */
   const uint32_t column_width = ytile_span;
   const uint32_t bytes_per_column = column_width * ytile_height;

   uint32_t y1 = MIN2(y3, ALIGN_UP(y0, 4));
   uint32_t y2 = MAX2(y1, ALIGN_DOWN(y3, 4));

   uint32_t xo0 = (x0 % ytile_span) + (x0 / ytile_span) * bytes_per_column;
   uint32_t xo1 = (x1 % ytile_span) + (x1 / ytile_span) * bytes_per_column;

   /* Bit 9 of the source offset controls swizzling.
    * Only the X offset contributes to bit 9 of the total offset,
    * so swizzle can be calculated in advance for these X positions.
    * Move bit 9 three places down to bit 6.
    */
   uint32_t swizzle0 = (xo0 >> 3) & swizzle_bit;
   uint32_t swizzle1 = (xo1 >> 3) & swizzle_bit;

   uint32_t x, yo;

   dst += (ptrdiff_t)y0 * dst_pitch;

   if (y0 != y1) {
      for (yo = y0 * column_width; yo < y1 * column_width; yo += column_width) {
         uint32_t xo = xo1;
         uint32_t swizzle = swizzle1;

         mem_copy(dst + x0, src + ((xo0 + yo) ^ swizzle0), x1 - x0);

         /* Step by spans/columns.  As it happens, the swizzle bit flips
          * at each step so we don't need to calculate it explicitly.
          */
         for (x = x1; x < x2; x += ytile_span) {
            mem_copy_align16(dst + x, src + ((xo + yo) ^ swizzle), ytile_span);
            xo += bytes_per_column;
            swizzle ^= swizzle_bit;
         }

         mem_copy_align16(dst + x2, src + ((xo + yo) ^ swizzle), x3 - x2);

         dst += dst_pitch;
      }
   }

   for (yo = y1 * column_width; yo < y2 * column_width; yo += 4 * column_width) {
      uint32_t xo = xo1;
      uint32_t swizzle = swizzle1;

      if (x0 != x1) {
         mem_copy(dst + x0 + 0 * dst_pitch, src + ((xo0 + yo + 0 * column_width) ^ swizzle0), x1 - x0);
         mem_copy(dst + x0 + 1 * dst_pitch, src + ((xo0 + yo + 1 * column_width) ^ swizzle0), x1 - x0);
         mem_copy(dst + x0 + 2 * dst_pitch, src + ((xo0 + yo + 2 * column_width) ^ swizzle0), x1 - x0);
         mem_copy(dst + x0 + 3 * dst_pitch, src + ((xo0 + yo + 3 * column_width) ^ swizzle0), x1 - x0);
      }

      /* Step by spans/columns.  As it happens, the swizzle bit flips
       * at each step so we don't need to calculate it explicitly.
       */
      for (x = x1; x < x2; x += ytile_span) {
         mem_copy_align16(dst + x + 0 * dst_pitch, src + ((xo + yo + 0 * column_width) ^ swizzle), ytile_span);
         mem_copy_align16(dst + x + 1 * dst_pitch, src + ((xo + yo + 1 * column_width) ^ swizzle), ytile_span);
         mem_copy_align16(dst + x + 2 * dst_pitch, src + ((xo + yo + 2 * column_width) ^ swizzle), ytile_span);
         mem_copy_align16(dst + x + 3 * dst_pitch, src + ((xo + yo + 3 * column_width) ^ swizzle), ytile_span);
         xo += bytes_per_column;
         swizzle ^= swizzle_bit;
      }

      if (x2 != x3) {
         mem_copy_align16(dst + x2 + 0 * dst_pitch, src + ((xo + yo + 0 * column_width) ^ swizzle), x3 - x2);
         mem_copy_align16(dst + x2 + 1 * dst_pitch, src + ((xo + yo + 1 * column_width) ^ swizzle), x3 - x2);
         mem_copy_align16(dst + x2 + 2 * dst_pitch, src + ((xo + yo + 2 * column_width) ^ swizzle), x3 - x2);
         mem_copy_align16(dst + x2 + 3 * dst_pitch, src + ((xo + yo + 3 * column_width) ^ swizzle), x3 - x2);
      }

      dst += 4 * dst_pitch;
   }

   if (y2 != y3) {
      for (yo = y2 * column_width; yo < y3 * column_width; yo += column_width) {
         uint32_t xo = xo1;
         uint32_t swizzle = swizzle1;

         mem_copy(dst + x0, src + ((xo0 + yo) ^ swizzle0), x1 - x0);

         /* Step by spans/columns.  As it happens, the swizzle bit flips
          * at each step so we don't need to calculate it explicitly.
          */
         for (x = x1; x < x2; x += ytile_span) {
            mem_copy_align16(dst + x, src + ((xo + yo) ^ swizzle), ytile_span);
            xo += bytes_per_column;
            swizzle ^= swizzle_bit;
         }

         mem_copy_align16(dst + x2, src + ((xo + yo) ^ swizzle), x3 - x2);

         dst += dst_pitch;
      }
   }
}


/**
 * Copy texture data from Tile-4 layout to linear.
 *
 * \copydoc tile_copy_fn
 */
static inline void
tile4_to_linear(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
                uint32_t y0, uint32_t y3,
                char *dst, const char *src,
                int32_t dst_pitch,
                uint32_t swizzle_bit,
                isl_mem_copy_fn mem_copy,
                isl_mem_copy_fn mem_copy_align16)
{

   /* Tile-4 consists of columns that are 'ytile_span' wide; each 64B tile
    * block consists of 4 rows of Y-tile ordered data.
    * Each 512B block within a 4kB tile contains 8 such blocks.
    *
    * To calculate the tiled offset, we need to identify the Block X and
    * Block Y offsets at each 512B block boundary in the X and Y directions.
    *
    * Refer to the Tile-4 layout diagram in the linear_to_tile4() function.
    *
    * The tile is divided into 512B blocks [Blk0..Blk7], themselves made of 2
    * rows of 256B sub-blocks.
    *
    * Each sub-block is composed of 4 64B elements [cell(0)-cell(3)].
    *
    * Each 64B cell represents 4 rows of data [cell(0), cell(1), .., cell(63)].
    *
    *
    *   Block X - Adds 256B to the offset when we cross a block boundary in
    *             the X direction.  (Ex: Blk 0 --> Blk 1 (BlkX_off = 256))
    *   Block Y - Adds 512B to the offset when we cross a block boundary in
    *             the Y direction.  (Ex: Blk 0 --> Blk 3 (BlkY_off = 512))
    *
    *   (x / ytile_span) * cacheline_size_B // Byte offset in the X dir of the
    *                                          containing 64B block
    *   x % ytile_span // Byte offset in X dir within a 64B block/cacheline
    *
    *   (y % 4) * 16  // Byte offset of the Y dir within a 64B block/cacheline
    *   (y / 4) * 256 // Byte offset of the Y dir within 512B block after 1 row
    *                    of 64B blocks/cachelines
    *
    * The copy source offset for each range copied is the sum of
    * the Block X offset 'BlkX_off', the Block Y offset 'BlkY_off', an X
    * offset 'xo' and a Y offset 'yo.'
    */

   const uint32_t column_width = ytile_span;
   const uint32_t tile4_blkh = 4;

   assert(ytile_span * tile4_blkh == 64);
   const uint32_t cacheline_size_B = 64;

   /* Find intermediate Y offsets that are aligned to a 64B element
    * (4 rows), so that we can do fully 64B memcpys on those.
    */
   uint32_t y1 = MIN2(y3, ALIGN_UP(y0, 4));
   uint32_t y2 = MAX2(y1, ALIGN_DOWN(y3, 4));

   /* xsb0 and xsb1 are the byte offset within a 256B sub block for x0 and x1 */
   uint32_t xsb0 = (x0 % ytile_span) + (x0 / ytile_span) * cacheline_size_B;
   uint32_t xsb1 = (x1 % ytile_span) + (x1 / ytile_span) * cacheline_size_B;

   uint32_t Blkxsb0_off = ALIGN_DOWN(xsb0, 256);
   uint32_t Blky0_off = (y0 / 8) * 512;

   uint32_t BlkX_off, BlkY_off;

   uint32_t x, yo, Y0, Y2;

   /* Y0 determines the initial byte offset in the Y direction */
   Y0 = (y0 / 4) * 256 + (y0 % 4) * 16;

   /* Y2 determines the byte offset required for reaching y2 if y2 doesn't map
    * exactly to 512B block boundary
    */
   Y2 = y2 * 4 * column_width;

   dst += (ptrdiff_t)y0 * dst_pitch;

   /* To maximize memcpy speed, we do the copy in 3 parts:
    *   - copy the first lines that are not aligned to the 64B cell's height (4 rows)
    *   - copy the lines that are aligned to the 64B cell's height
    *   - copy the remaining lines that don't make up a full 64B cell's height
    */
   if (y0 != y1) {
      for (yo = Y0; yo < Y0 + (y1 - y0) * column_width; yo += column_width) {
         uint32_t xo = xsb1;

         if (x0 != x1)
            mem_copy(dst + x0, src + (Blky0_off + Blkxsb0_off) + (xsb0 + yo), x1 - x0);

         for (x = x1; x < x2; x += ytile_span) {
            BlkX_off = ALIGN_DOWN(xo, 256);

            mem_copy_align16(dst + x, src + (Blky0_off + BlkX_off) + (xo + yo), ytile_span);
            xo += cacheline_size_B;
         }

         if (x3 != x2) {
            BlkX_off = ALIGN_DOWN(xo, 256);
            mem_copy_align16(dst + x2, src + (Blky0_off + BlkX_off) + (xo + yo), x3 - x2);
         }

         dst += dst_pitch;
      }
   }

   for (yo = y1 * 4 * column_width; yo < y2 * 4 * column_width; yo += 16 * column_width) {
      uint32_t xo = xsb1;
      BlkY_off = ALIGN_DOWN(yo, 512);

      if (x0 != x1) {
         mem_copy(dst + x0 + 0 * dst_pitch,
                  src + (BlkY_off + Blkxsb0_off) + (xsb0 + yo + 0 * column_width),
                  x1 - x0);
         mem_copy(dst + x0 + 1 * dst_pitch,
                  src + (BlkY_off + Blkxsb0_off) + (xsb0 + yo + 1 * column_width),
                  x1 - x0);
         mem_copy(dst + x0 + 2 * dst_pitch,
                  src + (BlkY_off + Blkxsb0_off) + (xsb0 + yo + 2 * column_width),
                  x1 - x0);
         mem_copy(dst + x0 + 3 * dst_pitch,
                  src + (BlkY_off + Blkxsb0_off) + (xsb0 + yo + 3 * column_width),
                  x1 - x0);
      }

      for (x = x1; x < x2; x += ytile_span) {
         BlkX_off = ALIGN_DOWN(xo, 256);

         mem_copy_align16(dst + x + 0 * dst_pitch,
                          src + (BlkY_off + BlkX_off) + (xo + yo + 0 * column_width),
                          ytile_span);
         mem_copy_align16(dst + x + 1 * dst_pitch,
                          src + (BlkY_off + BlkX_off) + (xo + yo + 1 * column_width),
                          ytile_span);
         mem_copy_align16(dst + x + 2 * dst_pitch,
                          src + (BlkY_off + BlkX_off) + (xo + yo + 2 * column_width),
                          ytile_span);
         mem_copy_align16(dst + x + 3 * dst_pitch,
                          src + (BlkY_off + BlkX_off) + (xo + yo + 3 * column_width),
                          ytile_span);

         xo += cacheline_size_B;
      }

      if (x2 != x3) {
         BlkX_off = ALIGN_DOWN(xo, 256);

         mem_copy(dst + x2 + 0 * dst_pitch,
                  src + (BlkY_off + BlkX_off) + (xo + yo + 0 * column_width),
                  x3 - x2);
         mem_copy(dst + x2 + 1 * dst_pitch,
                  src + (BlkY_off + BlkX_off) + (xo + yo + 1 * column_width),
                  x3 - x2);
         mem_copy(dst + x2 + 2 * dst_pitch,
                  src + (BlkY_off + BlkX_off) + (xo + yo + 2 * column_width),
                  x3 - x2);
         mem_copy(dst + x2 + 3 * dst_pitch,
                  src + (BlkY_off + BlkX_off) + (xo + yo + 3 * column_width),
                  x3 - x2);
      }

      dst += 4 * dst_pitch;
   }

   if (y2 != y3) {
      for (yo = Y2; yo < Y2 + (y3 - y2) * column_width; yo += column_width) {
         uint32_t xo = xsb1;
         BlkY_off = ALIGN_DOWN(yo, 512);

         if (x0 != x1)
            mem_copy(dst + x0, src + (BlkY_off + Blkxsb0_off) + (xsb0 + yo), x1 - x0);

         for (x = x1; x < x2; x += ytile_span) {
            BlkX_off = ALIGN_DOWN(xo, 256);

            mem_copy_align16(dst + x, src + (BlkY_off + BlkX_off) + (xo + yo), ytile_span);
            xo += cacheline_size_B;
         }

         if (x3 != x2) {
            BlkX_off = ALIGN_DOWN(xo, 256);
            mem_copy_align16(dst + x2, src + (BlkY_off + BlkX_off) + (xo + yo), x3 - x2);
         }

         dst += dst_pitch;
      }
   }
}

/**
 * Copy texture data from W tile layout to linear.
 *
 * \copydoc tile_copy_fn
 */
static inline void
wtiled_to_linear(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
                 uint32_t y0, uint32_t y3,
                 char *dst, const char *src,
                 int32_t dst_pitch)
{
   /*
    * The layout is a series of blocks of 64B each.
    * _________________________________________________
    * |blk00|blk08|blk16|blk24|blk32|blk40|blk48|blk56|
    * |blk01|blk09|blk17|blk25|blk33|blk41|blk49|blk57|
    * |blk02|blk10|blk18|blk26|blk34|blk42|blk50|blk58|
    * |blk03|blk11|blk19|blk27|blk35|blk43|blk51|blk59|
    * |blk04|blk12|blk20|blk28|blk36|blk44|blk52|blk60|
    * |blk05|blk13|blk21|blk29|blk37|blk45|blk53|blk61|
    * |blk06|blk14|blk22|blk30|blk38|blk46|blk54|blk62|
    * |blk07|blk15|blk23|blk31|blk39|blk47|blk55|blk63|
    * -------------------------------------------------
    */

   /* Find intermediate Y offsets that are aligned to a 64B element (8 rows).
    */
   uint32_t y1 = MIN2(y3, ALIGN_UP(y0, 8));
   uint32_t y2 = MAX2(y1, ALIGN_DOWN(y3, 8));

   uint32_t xo, yo;

   /* If the y0 coordinate is not aligned to a block, do partial copies from
    * the top row of blocks (0, 8, 16, 24, 32, 40, 48 & 56).
    */
   if (y0 != y1) {
      if (x0 != x1)
         wtile_block_copy_to_linear(dst, src, x0, x1, y0, y1, dst_pitch);
      for (xo = x1; xo < x2; xo += 8)
         wtile_block_copy_to_linear(dst, src, xo, xo + 8, y0, y1, dst_pitch);
      if (x2 != x3)
         wtile_block_copy_to_linear(dst, src, x2, x3, y0, y1, dst_pitch);
   }

   for (yo = y1; yo < y2; yo += 8) {
      /* Do partial copies from the first column of blocks if x0 is not
       * aligned to a block.
       */
      if (x0 != x1)
         wtile_block_copy_to_linear(dst, src, x0, x1, yo, yo + 8, dst_pitch);
      /* Full block copies on the inside. */
      for (xo = x1; xo < x2; xo += 8)
         wtile_block_full_copy_to_linear(dst, src, xo, yo, dst_pitch);
      /* Do partial copies from the last column of blocks if x3 is not
       * aligned to a block.
       */
      if (x2 != x3)
         wtile_block_copy_to_linear(dst, src, x2, x3, yo, yo + 8, dst_pitch);
   }

   /* If the y3 coordinate is not aligned to a block, do partial copies from
    * the bottom row of blocks (7, 15, 23, 31, 39, 47, 55 & 63).
    */
   if (y2 != y3) {
      if (x0 != x1)
         wtile_block_copy_to_linear(dst, src, x0, x1, y2, y3, dst_pitch);
      for (xo = x1; xo < x2; xo += 8) {
         wtile_block_copy_to_linear(dst, src,
                                    xo, MIN2(xo + 8, x3), y2, y3, dst_pitch);
      }
      if (x2 != x3)
         wtile_block_copy_to_linear(dst, src, x2, x3, y2, y3, dst_pitch);
   }
}

#if defined(INLINE_SSE41)
static ALWAYS_INLINE void *
_memcpy_streaming_load(void *dest, const void *src, size_t count)
{
   if (count == 16) {
      __m128i val = _mm_stream_load_si128((__m128i *)src);
      _mm_storeu_si128((__m128i *)dest, val);
      return dest;
   } else if (count == 64) {
      __m128i val0 = _mm_stream_load_si128(((__m128i *)src) + 0);
      __m128i val1 = _mm_stream_load_si128(((__m128i *)src) + 1);
      __m128i val2 = _mm_stream_load_si128(((__m128i *)src) + 2);
      __m128i val3 = _mm_stream_load_si128(((__m128i *)src) + 3);
      _mm_storeu_si128(((__m128i *)dest) + 0, val0);
      _mm_storeu_si128(((__m128i *)dest) + 1, val1);
      _mm_storeu_si128(((__m128i *)dest) + 2, val2);
      _mm_storeu_si128(((__m128i *)dest) + 3, val3);
      return dest;
   } else {
      assert(count < 64); /* and (count < 16) for ytiled */
      return memcpy(dest, src, count);
   }
}
#endif
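
/* Note: _mm_stream_load_si128 (SSE4.1 MOVNTDQA) requires a 16-byte aligned
 * source and is intended for reading from write-combining memory, which is
 * why the odd-sized tail above falls back to a plain memcpy.
 */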

static isl_mem_copy_fn
choose_copy_function(isl_memcpy_type copy_type)
{
   switch (copy_type) {
   case ISL_MEMCPY:
      return memcpy;
   case ISL_MEMCPY_BGRA8:
      return rgba8_copy;
   case ISL_MEMCPY_STREAMING_LOAD:
#if defined(INLINE_SSE41)
      return _memcpy_streaming_load;
#else
      unreachable("ISL_MEMCPY_STREAMING_LOAD requires sse4.1");
#endif
   case ISL_MEMCPY_INVALID:
      unreachable("invalid copy_type");
   }
   unreachable("unhandled copy_type");
   return NULL;
}

/**
 * Copy texture data from linear to X tile layout, faster.
 *
 * Same as \ref linear_to_xtiled but faster, because it passes constant
 * parameters for common cases, allowing the compiler to inline code
 * optimized for those cases.
 *
 * \copydoc tile_copy_fn
 */
static FLATTEN void
linear_to_xtiled_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
                        uint32_t y0, uint32_t y1,
                        char *dst, const char *src,
                        int32_t src_pitch,
                        uint32_t swizzle_bit,
                        isl_memcpy_type copy_type)
{
   isl_mem_copy_fn mem_copy = choose_copy_function(copy_type);

   if (x0 == 0 && x3 == xtile_width && y0 == 0 && y1 == xtile_height) {
      if (mem_copy == memcpy)
         return linear_to_xtiled(0, 0, xtile_width, xtile_width, 0, xtile_height,
                                 dst, src, src_pitch, swizzle_bit, memcpy, memcpy);
      else if (mem_copy == rgba8_copy)
         return linear_to_xtiled(0, 0, xtile_width, xtile_width, 0, xtile_height,
                                 dst, src, src_pitch, swizzle_bit,
                                 rgba8_copy, rgba8_copy_aligned_dst);
      else
         unreachable("not reached");
   } else {
      if (mem_copy == memcpy)
         return linear_to_xtiled(x0, x1, x2, x3, y0, y1,
                                 dst, src, src_pitch, swizzle_bit,
                                 memcpy, memcpy);
      else if (mem_copy == rgba8_copy)
         return linear_to_xtiled(x0, x1, x2, x3, y0, y1,
                                 dst, src, src_pitch, swizzle_bit,
                                 rgba8_copy, rgba8_copy_aligned_dst);
      else
         unreachable("not reached");
   }
}
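
/* The whole-tile branch above matters because literal bounds
 * (0, 0, tw, tw, 0, th) let the compiler prove that the unaligned edge
 * loops in the inlined copier are empty and give the middle loop a constant
 * trip count, so it can be unrolled and vectorized.  The same pattern
 * repeats in every *_faster wrapper below.
 */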

/**
 * Copy texture data from linear to Y tile layout, faster.
 *
 * Same as \ref linear_to_ytiled but faster, because it passes constant
 * parameters for common cases, allowing the compiler to inline code
 * optimized for those cases.
 *
 * \copydoc tile_copy_fn
 */
static FLATTEN void
linear_to_ytiled_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
                        uint32_t y0, uint32_t y1,
                        char *dst, const char *src,
                        int32_t src_pitch,
                        uint32_t swizzle_bit,
                        isl_memcpy_type copy_type)
{
   isl_mem_copy_fn mem_copy = choose_copy_function(copy_type);

   if (x0 == 0 && x3 == ytile_width && y0 == 0 && y1 == ytile_height) {
      if (mem_copy == memcpy)
         return linear_to_ytiled(0, 0, ytile_width, ytile_width, 0, ytile_height,
                                 dst, src, src_pitch, swizzle_bit, memcpy, memcpy);
      else if (mem_copy == rgba8_copy)
         return linear_to_ytiled(0, 0, ytile_width, ytile_width, 0, ytile_height,
                                 dst, src, src_pitch, swizzle_bit,
                                 rgba8_copy, rgba8_copy_aligned_dst);
      else
         unreachable("not reached");
   } else {
      if (mem_copy == memcpy)
         return linear_to_ytiled(x0, x1, x2, x3, y0, y1,
                                 dst, src, src_pitch, swizzle_bit, memcpy, memcpy);
      else if (mem_copy == rgba8_copy)
         return linear_to_ytiled(x0, x1, x2, x3, y0, y1,
                                 dst, src, src_pitch, swizzle_bit,
                                 rgba8_copy, rgba8_copy_aligned_dst);
      else
         unreachable("not reached");
   }
}

/**
 * Copy texture data from linear to tile4 layout, faster.
 *
 * Same as \ref linear_to_tile4 but faster, because it passes constant
 * parameters for common cases, allowing the compiler to inline code
 * optimized for those cases.
 *
 * \copydoc tile_copy_fn
 */
static FLATTEN void
linear_to_tile4_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
                       uint32_t y0, uint32_t y1,
                       char *dst, const char *src,
                       int32_t src_pitch,
                       uint32_t swizzle_bit,
                       isl_memcpy_type copy_type)
{
   isl_mem_copy_fn mem_copy = choose_copy_function(copy_type);
   assert(swizzle_bit == 0);

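   /* Tile4 reuses the Y-tile footprint: a 128-byte-wide, 32-row tile copied
    * in 16-byte spans, hence the ytile_* constants in the bounds checks
    * below.
    */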
   if (x0 == 0 && x3 == ytile_width && y0 == 0 && y1 == ytile_height) {
      if (mem_copy == memcpy)
         return linear_to_tile4(0, 0, ytile_width, ytile_width, 0, ytile_height,
                                dst, src, src_pitch, swizzle_bit, memcpy, memcpy);
      else if (mem_copy == rgba8_copy)
         return linear_to_tile4(0, 0, ytile_width, ytile_width, 0, ytile_height,
                                dst, src, src_pitch, swizzle_bit,
                                rgba8_copy, rgba8_copy_aligned_dst);
      else
         unreachable("not reached");
   } else {
      if (mem_copy == memcpy)
         return linear_to_tile4(x0, x1, x2, x3, y0, y1,
                                dst, src, src_pitch, swizzle_bit, memcpy, memcpy);
      else if (mem_copy == rgba8_copy)
         return linear_to_tile4(x0, x1, x2, x3, y0, y1,
                                dst, src, src_pitch, swizzle_bit,
                                rgba8_copy, rgba8_copy_aligned_dst);
      else
         unreachable("not reached");
   }
}

/**
 * Copy texture data from linear to tile W layout, faster.
 *
 * Same as \ref linear_to_wtiled but faster, because it passes constant
 * parameters for common cases, allowing the compiler to inline code
 * optimized for those cases.
 *
 * \copydoc tile_copy_fn
 */
static FLATTEN void
linear_to_wtiled_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
                        uint32_t y0, uint32_t y1,
                        char *dst, const char *src,
                        int32_t src_pitch,
                        uint32_t swizzle_bit,
                        isl_memcpy_type copy_type)
{
   assert(swizzle_bit == 0);
   if (x0 == 0 && x3 == wtile_width && y0 == 0 && y1 == wtile_height) {
      return linear_to_wtiled(0, 0,
                              wtile_width, wtile_width,
                              0, wtile_height,
                              dst, src, src_pitch);
   } else {
      return linear_to_wtiled(x0, x1, x2, x3, y0, y1,
                              dst, src, src_pitch);
   }
}
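
/* W tiling is used for stencil data at one byte per pixel, so there is no
 * BGRA-swizzling variant here and copy_type is intentionally unused; the
 * W-tile block copiers always move plain bytes.
 */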

/**
 * Copy texture data from X tile layout to linear, faster.
 *
 * Same as \ref xtiled_to_linear but faster, because it passes constant
 * parameters for common cases, allowing the compiler to inline code
 * optimized for those cases.
 *
 * \copydoc tile_copy_fn
 */
static FLATTEN void
xtiled_to_linear_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
                        uint32_t y0, uint32_t y1,
                        char *dst, const char *src,
                        int32_t dst_pitch,
                        uint32_t swizzle_bit,
                        isl_memcpy_type copy_type)
{
   isl_mem_copy_fn mem_copy = choose_copy_function(copy_type);

   if (x0 == 0 && x3 == xtile_width && y0 == 0 && y1 == xtile_height) {
      if (mem_copy == memcpy)
         return xtiled_to_linear(0, 0, xtile_width, xtile_width, 0, xtile_height,
                                 dst, src, dst_pitch, swizzle_bit, memcpy, memcpy);
      else if (mem_copy == rgba8_copy)
         return xtiled_to_linear(0, 0, xtile_width, xtile_width, 0, xtile_height,
                                 dst, src, dst_pitch, swizzle_bit,
                                 rgba8_copy, rgba8_copy_aligned_src);
#if defined(INLINE_SSE41)
      else if (mem_copy == _memcpy_streaming_load)
         return xtiled_to_linear(0, 0, xtile_width, xtile_width, 0, xtile_height,
                                 dst, src, dst_pitch, swizzle_bit,
                                 memcpy, _memcpy_streaming_load);
#endif
      else
         unreachable("not reached");
   } else {
      if (mem_copy == memcpy)
         return xtiled_to_linear(x0, x1, x2, x3, y0, y1,
                                 dst, src, dst_pitch, swizzle_bit, memcpy, memcpy);
      else if (mem_copy == rgba8_copy)
         return xtiled_to_linear(x0, x1, x2, x3, y0, y1,
                                 dst, src, dst_pitch, swizzle_bit,
                                 rgba8_copy, rgba8_copy_aligned_src);
#if defined(INLINE_SSE41)
      else if (mem_copy == _memcpy_streaming_load)
         return xtiled_to_linear(x0, x1, x2, x3, y0, y1,
                                 dst, src, dst_pitch, swizzle_bit,
                                 memcpy, _memcpy_streaming_load);
#endif
      else
         unreachable("not reached");
   }
}

/**
 * Copy texture data from Y tile layout to linear, faster.
 *
 * Same as \ref ytiled_to_linear but faster, because it passes constant
 * parameters for common cases, allowing the compiler to inline code
 * optimized for those cases.
 *
 * \copydoc tile_copy_fn
 */
static FLATTEN void
ytiled_to_linear_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
                        uint32_t y0, uint32_t y1,
                        char *dst, const char *src,
                        int32_t dst_pitch,
                        uint32_t swizzle_bit,
                        isl_memcpy_type copy_type)
{
   isl_mem_copy_fn mem_copy = choose_copy_function(copy_type);

   if (x0 == 0 && x3 == ytile_width && y0 == 0 && y1 == ytile_height) {
      if (mem_copy == memcpy)
         return ytiled_to_linear(0, 0, ytile_width, ytile_width, 0, ytile_height,
                                 dst, src, dst_pitch, swizzle_bit, memcpy, memcpy);
      else if (mem_copy == rgba8_copy)
         return ytiled_to_linear(0, 0, ytile_width, ytile_width, 0, ytile_height,
                                 dst, src, dst_pitch, swizzle_bit,
                                 rgba8_copy, rgba8_copy_aligned_src);
#if defined(INLINE_SSE41)
      else if (copy_type == ISL_MEMCPY_STREAMING_LOAD)
         return ytiled_to_linear(0, 0, ytile_width, ytile_width, 0, ytile_height,
                                 dst, src, dst_pitch, swizzle_bit,
                                 memcpy, _memcpy_streaming_load);
#endif
      else
         unreachable("not reached");
   } else {
      if (mem_copy == memcpy)
         return ytiled_to_linear(x0, x1, x2, x3, y0, y1,
                                 dst, src, dst_pitch, swizzle_bit, memcpy, memcpy);
      else if (mem_copy == rgba8_copy)
         return ytiled_to_linear(x0, x1, x2, x3, y0, y1,
                                 dst, src, dst_pitch, swizzle_bit,
                                 rgba8_copy, rgba8_copy_aligned_src);
#if defined(INLINE_SSE41)
      else if (copy_type == ISL_MEMCPY_STREAMING_LOAD)
         return ytiled_to_linear(x0, x1, x2, x3, y0, y1,
                                 dst, src, dst_pitch, swizzle_bit,
                                 memcpy, _memcpy_streaming_load);
#endif
      else
         unreachable("not reached");
   }
}

/**
 * Copy texture data from tile4 layout to linear, faster.
 *
 * Same as \ref tile4_to_linear but faster, because it passes constant
 * parameters for common cases, allowing the compiler to inline code
 * optimized for those cases.
 *
 * \copydoc tile_copy_fn
 */
static FLATTEN void
tile4_to_linear_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
                       uint32_t y0, uint32_t y1,
                       char *dst, const char *src,
                       int32_t dst_pitch,
                       uint32_t swizzle_bit,
                       isl_memcpy_type copy_type)
{
   isl_mem_copy_fn mem_copy = choose_copy_function(copy_type);
   assert(swizzle_bit == 0);

   if (x0 == 0 && x3 == ytile_width && y0 == 0 && y1 == ytile_height) {
      if (mem_copy == memcpy)
         return tile4_to_linear(0, 0, ytile_width, ytile_width, 0, ytile_height,
                                dst, src, dst_pitch, swizzle_bit, memcpy, memcpy);
      else if (mem_copy == rgba8_copy)
         return tile4_to_linear(0, 0, ytile_width, ytile_width, 0, ytile_height,
                                dst, src, dst_pitch, swizzle_bit,
                                rgba8_copy, rgba8_copy_aligned_src);
#if defined(INLINE_SSE41)
      else if (copy_type == ISL_MEMCPY_STREAMING_LOAD)
         return tile4_to_linear(0, 0, ytile_width, ytile_width, 0, ytile_height,
                                dst, src, dst_pitch, swizzle_bit,
                                memcpy, _memcpy_streaming_load);
#endif
      else
         unreachable("not reached");
   } else {
      if (mem_copy == memcpy)
         return tile4_to_linear(x0, x1, x2, x3, y0, y1,
                                dst, src, dst_pitch, swizzle_bit, memcpy, memcpy);
      else if (mem_copy == rgba8_copy)
         return tile4_to_linear(x0, x1, x2, x3, y0, y1,
                                dst, src, dst_pitch, swizzle_bit,
                                rgba8_copy, rgba8_copy_aligned_src);
#if defined(INLINE_SSE41)
      else if (copy_type == ISL_MEMCPY_STREAMING_LOAD)
         return tile4_to_linear(x0, x1, x2, x3, y0, y1,
                                dst, src, dst_pitch, swizzle_bit,
                                memcpy, _memcpy_streaming_load);
#endif
      else
         unreachable("not reached");
   }
}

/**
 * Copy texture data from tileW layout to linear, faster.
 *
 * Same as \ref wtiled_to_linear but faster, because it passes constant
 * parameters for common cases, allowing the compiler to inline code
 * optimized for those cases.
 *
 * \copydoc tile_copy_fn
 */
static FLATTEN void
wtiled_to_linear_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
                        uint32_t y0, uint32_t y1,
                        char *dst, const char *src,
                        int32_t dst_pitch,
                        uint32_t swizzle_bit,
                        isl_memcpy_type copy_type)
{
   assert(swizzle_bit == 0);

   if (x0 == 0 && x3 == wtile_width && y0 == 0 && y1 == wtile_height) {
      return wtiled_to_linear(0, 0,
                              wtile_width, wtile_width,
                              0, wtile_height,
                              dst, src, dst_pitch);
   } else {
      return wtiled_to_linear(x0, x1, x2, x3, y0, y1,
                              dst, src, dst_pitch);
   }
}

/**
 * Copy from linear to tiled texture.
 *
 * Divide the region given by X range [xt1, xt2) and Y range [yt1, yt2) into
 * pieces that do not cross tile boundaries and copy each piece with a tile
 * copy function (\ref tile_copy_fn).
 * The X range is in bytes, i.e. pixels * bytes-per-pixel.
 * The Y range is in pixels (i.e. unitless).
 * 'dst' is the address of (0, 0) in the destination tiled texture.
 * 'src' is the address of (xt1, yt1) in the source linear texture.
 */
static void
linear_to_tiled(uint32_t xt1, uint32_t xt2,
                uint32_t yt1, uint32_t yt2,
                char *dst, const char *src,
                uint32_t dst_pitch, int32_t src_pitch,
                bool has_swizzling,
                enum isl_tiling tiling,
                isl_memcpy_type copy_type)
{
   tile_copy_fn tile_copy;
   uint32_t xt0, xt3;
   uint32_t yt0, yt3;
   uint32_t xt, yt;
   uint32_t tw, th, xt_sub_range_alignment;
   uint32_t swizzle_bit = has_swizzling ? 1<<6 : 0;

   if (tiling == ISL_TILING_X) {
      tw = xtile_width;
      th = xtile_height;
      xt_sub_range_alignment = xtile_span;
      tile_copy = linear_to_xtiled_faster;
   } else if (tiling == ISL_TILING_Y0) {
      tw = ytile_width;
      th = ytile_height;
      xt_sub_range_alignment = ytile_span;
      tile_copy = linear_to_ytiled_faster;
   } else if (tiling == ISL_TILING_4) {
      tw = ytile_width;
      th = ytile_height;
      xt_sub_range_alignment = ytile_span;
      tile_copy = linear_to_tile4_faster;
   } else if (tiling == ISL_TILING_W) {
      tw = wtile_width;
      th = wtile_height;
      /* The copy function prioritizes W-Tile blocks. The width of a W-Tile
       * block is four W-Tile spans.
       */
      xt_sub_range_alignment = wtile_span * 4;
      tile_copy = linear_to_wtiled_faster;
      /* TileW is a special case with doubled physical tile width due to HW
       * programming requirements (see isl_tiling_get_info() in
       * src/intel/isl/isl.c).
       */
      dst_pitch /= 2;
   } else {
      unreachable("unsupported tiling");
   }

   /* Round out to tile boundaries. */
   xt0 = ALIGN_DOWN(xt1, tw);
   xt3 = ALIGN_UP  (xt2, tw);
   yt0 = ALIGN_DOWN(yt1, th);
   yt3 = ALIGN_UP  (yt2, th);
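
   /* Worked example (illustrative): for X tiling (tw = 512, th = 8), a
    * region with xt1 = 100, xt2 = 1000, yt1 = 3, yt2 = 10 rounds out to
    * xt0 = 0, xt3 = 1024, yt0 = 0, yt3 = 16, i.e. a 2x2 grid of tiles,
    * each of which is only partially covered.
    */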

   /* Loop over all tiles to which we have something to copy.
    * 'xt' and 'yt' are the origin of the destination tile, whether copying
    * a full or partial tile.
    * tile_copy() copies one tile or partial tile.
    * Looping x inside y is the faster memory access pattern.
    */
   for (yt = yt0; yt < yt3; yt += th) {
      for (xt = xt0; xt < xt3; xt += tw) {
         /* The area to update is [x0,x3) x [y0,y1).
          * May not want the whole tile, hence the min and max.
          */
         uint32_t x0 = MAX2(xt1, xt);
         uint32_t y0 = MAX2(yt1, yt);
         uint32_t x3 = MIN2(xt2, xt + tw);
         uint32_t y1 = MIN2(yt2, yt + th);

         /* [x0,x3) is split into [x0,x1), [x1,x2), [x2,x3) such that
          * the middle interval is the longest span-aligned part.
          * The sub-ranges could be empty.
          */
         uint32_t x1, x2;
         x1 = ALIGN_UP(x0, xt_sub_range_alignment);
         if (x1 > x3)
            x1 = x2 = x3;
         else
            x2 = ALIGN_DOWN(x3, xt_sub_range_alignment);

         assert(x0 <= x1 && x1 <= x2 && x2 <= x3);
         assert(x1 - x0 < xt_sub_range_alignment &&
                x3 - x2 < xt_sub_range_alignment);
         assert(x3 - x0 <= tw);
         assert((x2 - x1) % xt_sub_range_alignment == 0);
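
         /* Illustrative split: with xt_sub_range_alignment = 64 (X tiling),
          * x0 = 100 and x3 = 512 give x1 = 128 and x2 = 512, so the aligned
          * middle [128,512) is copied span by span and only the [100,128)
          * edge needs an unaligned copy.
          */

         /* Address math for the call below: a full tile occupies tw*th
          * contiguous bytes and xt/yt are tile-aligned, so tile (xt, yt)
          * starts at byte offset xt*th + yt*dst_pitch in the tiled buffer,
          * while the matching linear data starts at offset
          * (xt - xt1) + (yt - yt1)*src_pitch from 'src'.
          */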

         /* Translate by (xt,yt) for single-tile copier. */
         tile_copy(x0-xt, x1-xt, x2-xt, x3-xt,
                   y0-yt, y1-yt,
                   dst + (ptrdiff_t)xt * th  +  (ptrdiff_t)yt        * dst_pitch,
                   src + (ptrdiff_t)xt - xt1 + ((ptrdiff_t)yt - yt1) * src_pitch,
                   src_pitch,
                   swizzle_bit,
                   copy_type);
      }
   }
}
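
/* Hedged usage sketch (hypothetical caller, not part of this file):
 * uploading a 16x16-pixel region of a 4-byte-per-pixel Y-tiled surface
 * starting at pixel (8, 8) could look like
 *
 *    linear_to_tiled(8 * 4, (8 + 16) * 4,   // X range in bytes
 *                    8, 8 + 16,             // Y range in rows
 *                    tiled_base, linear_src,
 *                    tiled_pitch, linear_pitch,
 *                    false, ISL_TILING_Y0, ISL_MEMCPY);
 *
 * where tiled_base is the address of pixel (0, 0) in the tiled surface and
 * linear_src is the address of the region's first pixel in the linear copy.
 */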

/**
 * Copy from tiled to linear texture.
 *
 * Divide the region given by X range [xt1, xt2) and Y range [yt1, yt2) into
 * pieces that do not cross tile boundaries and copy each piece with a tile
 * copy function (\ref tile_copy_fn).
 * The X range is in bytes, i.e. pixels * bytes-per-pixel.
 * The Y range is in pixels (i.e. unitless).
 * 'dst' is the address of (xt1, yt1) in the destination linear texture.
 * 'src' is the address of (0, 0) in the source tiled texture.
 */
static void
tiled_to_linear(uint32_t xt1, uint32_t xt2,
                uint32_t yt1, uint32_t yt2,
                char *dst, const char *src,
                int32_t dst_pitch, uint32_t src_pitch,
                bool has_swizzling,
                enum isl_tiling tiling,
                isl_memcpy_type copy_type)
{
   tile_copy_fn tile_copy;
   uint32_t xt0, xt3;
   uint32_t yt0, yt3;
   uint32_t xt, yt;
   uint32_t tw, th, xt_sub_range_alignment;
   uint32_t swizzle_bit = has_swizzling ? 1<<6 : 0;

   if (tiling == ISL_TILING_X) {
      tw = xtile_width;
      th = xtile_height;
      xt_sub_range_alignment = xtile_span;
      tile_copy = xtiled_to_linear_faster;
   } else if (tiling == ISL_TILING_Y0) {
      tw = ytile_width;
      th = ytile_height;
      xt_sub_range_alignment = ytile_span;
      tile_copy = ytiled_to_linear_faster;
   } else if (tiling == ISL_TILING_4) {
      tw = ytile_width;
      th = ytile_height;
      xt_sub_range_alignment = ytile_span;
      tile_copy = tile4_to_linear_faster;
   } else if (tiling == ISL_TILING_W) {
      tw = wtile_width;
      th = wtile_height;
      /* The copy function prioritizes W-Tile blocks. The width of a W-Tile
       * block is four W-Tile spans.
       */
      xt_sub_range_alignment = wtile_span * 4;
      tile_copy = wtiled_to_linear_faster;
      /* TileW is a special case with doubled physical tile width due to HW
       * programming requirements (see isl_tiling_get_info() in
       * src/intel/isl/isl.c).
       */
      src_pitch /= 2;
   } else {
      unreachable("unsupported tiling");
   }

#if defined(INLINE_SSE41)
   if (copy_type == ISL_MEMCPY_STREAMING_LOAD) {
      /* The hidden cacheline-sized register used by movntdqa can apparently
       * give you stale data, so do an mfence to invalidate it.
       */
      _mm_mfence();
   }
#endif

   /* Round out to tile boundaries. */
   xt0 = ALIGN_DOWN(xt1, tw);
   xt3 = ALIGN_UP  (xt2, tw);
   yt0 = ALIGN_DOWN(yt1, th);
   yt3 = ALIGN_UP  (yt2, th);

   /* Loop over all tiles from which we have something to copy.
    * 'xt' and 'yt' are the origin of the source tile, whether copying
    * a full or partial tile.
    * tile_copy() copies one tile or partial tile.
    * Looping x inside y is the faster memory access pattern.
    */
   for (yt = yt0; yt < yt3; yt += th) {
      for (xt = xt0; xt < xt3; xt += tw) {
         /* The area to update is [x0,x3) x [y0,y1).
          * May not want the whole tile, hence the min and max.
          */
         uint32_t x0 = MAX2(xt1, xt);
         uint32_t y0 = MAX2(yt1, yt);
         uint32_t x3 = MIN2(xt2, xt + tw);
         uint32_t y1 = MIN2(yt2, yt + th);

         /* [x0,x3) is split into [x0,x1), [x1,x2), [x2,x3) such that the
          * middle interval is the longest xt_sub_range_alignment aligned
          * part. The sub-ranges could be empty.
          */
         uint32_t x1, x2;
         x1 = ALIGN_UP(x0, xt_sub_range_alignment);
         if (x1 > x3)
            x1 = x2 = x3;
         else
            x2 = ALIGN_DOWN(x3, xt_sub_range_alignment);

         assert(x0 <= x1 && x1 <= x2 && x2 <= x3);
         assert(x1 - x0 < xt_sub_range_alignment &&
                x3 - x2 < xt_sub_range_alignment);
         assert(x3 - x0 <= tw);
         assert((x2 - x1) % xt_sub_range_alignment == 0);

         /* Translate by (xt,yt) for single-tile copier. */
         tile_copy(x0-xt, x1-xt, x2-xt, x3-xt,
                   y0-yt, y1-yt,
                   dst + (ptrdiff_t)xt - xt1 + ((ptrdiff_t)yt - yt1) * dst_pitch,
                   src + (ptrdiff_t)xt * th  +  (ptrdiff_t)yt        * src_pitch,
                   dst_pitch,
                   swizzle_bit,
                   copy_type);
      }
   }
}
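
/* Sketch of the readback direction (hypothetical caller): detiling is where
 * ISL_MEMCPY_STREAMING_LOAD pays off, because the tiled source typically
 * lives in write-combined memory that is slow to read with ordinary loads:
 *
 *    tiled_to_linear(0, width_bytes, 0, height,
 *                    linear_dst, tiled_base,
 *                    linear_pitch, tiled_pitch,
 *                    false, ISL_TILING_4, ISL_MEMCPY_STREAMING_LOAD);
 *
 * The _mm_mfence() above is issued by tiled_to_linear() itself before any
 * streaming loads.
 */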