/*
 * Mesa 3-D graphics library
 *
 * Copyright 2012 Intel Corporation
 * Copyright 2013 Google
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors:
 *    Chad Versace <chad.versace@linux.intel.com>
 *    Frank Henigman <fjhenigman@google.com>
 */

#include <string.h>

#include "util/macros.h"

#include "brw_context.h"
#include "intel_tiled_memcpy.h"

#if defined(__SSSE3__)
#include <tmmintrin.h>
#elif defined(__SSE2__)
#include <emmintrin.h>
#endif

#define FILE_DEBUG_FLAG DEBUG_TEXTURE

#define ALIGN_DOWN(a, b) ROUND_DOWN_TO(a, b)
#define ALIGN_UP(a, b) ALIGN(a, b)
/* Tile dimensions.  Width and span are in bytes, height is in rows.
 * A "span" is the maximum number of bytes we can copy from linear
 * to tiled without needing to calculate a new destination address.
 */
static const uint32_t xtile_width = 512;
static const uint32_t xtile_height = 8;
static const uint32_t xtile_span = 64;
static const uint32_t ytile_width = 128;
static const uint32_t ytile_height = 32;
static const uint32_t ytile_span = 16;
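
/* Both layouts tile the same amount of memory: an X tile is
 * 512 B x 8 rows = 4096 B and a Y tile is 128 B x 32 rows = 4096 B,
 * i.e. one 4 KB page each.  An X tile row is 8 spans laid end to end
 * (512 = 8 * 64), while a Y tile is 8 columns of 16-byte spans
 * (128 = 8 * 16).
 */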

static inline uint32_t
ror(uint32_t n, uint32_t d)
{
   return (n >> d) | (n << (32 - d));
}

static inline uint32_t
bswap32(uint32_t n)
{
#if defined(HAVE___BUILTIN_BSWAP32)
   return __builtin_bswap32(n);
#else
   return (n >> 24) |
          ((n >> 8) & 0x0000ff00) |
          ((n << 8) & 0x00ff0000) |
          (n << 24);
#endif
}

/**
 * Copy RGBA to BGRA - swap R and B.
 */
static inline void *
rgba8_copy(void *dst, const void *src, size_t bytes)
{
   uint32_t *d = dst;
   uint32_t const *s = src;

   assert(bytes % 4 == 0);

   while (bytes >= 4) {
      *d = ror(bswap32(*s), 8);
      d += 1;
      s += 1;
      bytes -= 4;
   }
   return dst;
}
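
/* A worked example of the bswap + rotate trick above: on a little-endian
 * CPU the RGBA byte sequence R,G,B,A loads as n = 0xAABBGGRR.  Then
 * bswap32(n) = 0xRRGGBBAA and ror(..., 8) = 0xAARRGGBB, which stores back
 * as the byte sequence B,G,R,A - exactly the BGRA ordering we want.
 */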

#ifdef __SSSE3__
static const uint8_t rgba8_permutation[16] =
   { 2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15 };

static inline void
rgba8_copy_16_aligned_dst(void *dst, const void *src)
{
   _mm_store_si128(dst,
                   _mm_shuffle_epi8(_mm_loadu_si128(src),
                                    *(__m128i *)rgba8_permutation));
}

static inline void
rgba8_copy_16_aligned_src(void *dst, const void *src)
{
   _mm_storeu_si128(dst,
                    _mm_shuffle_epi8(_mm_load_si128(src),
                                     *(__m128i *)rgba8_permutation));
}

#elif defined(__SSE2__)
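/* SSE2 lacks SSSE3's pshufb byte shuffle, so the fallback below swaps R
 * and B a different way: mask each pixel into its A/G bytes (0xFF00FF00),
 * which stay in place, and its R/B bytes, then swap the 16-bit lanes
 * holding R and B with a lo/hi word shuffle and OR the two halves back
 * together.
 */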
static inline void
rgba8_copy_16_aligned_dst(void *dst, const void *src)
{
   __m128i srcreg, dstreg, agmask, ag, rb, br;

   agmask = _mm_set1_epi32(0xFF00FF00);
   srcreg = _mm_loadu_si128((__m128i *)src);

   rb = _mm_andnot_si128(agmask, srcreg);
   ag = _mm_and_si128(agmask, srcreg);
   br = _mm_shufflehi_epi16(_mm_shufflelo_epi16(rb, _MM_SHUFFLE(2, 3, 0, 1)),
                            _MM_SHUFFLE(2, 3, 0, 1));
   dstreg = _mm_or_si128(ag, br);

   _mm_store_si128((__m128i *)dst, dstreg);
}

static inline void
rgba8_copy_16_aligned_src(void *dst, const void *src)
{
   __m128i srcreg, dstreg, agmask, ag, rb, br;

   agmask = _mm_set1_epi32(0xFF00FF00);
   srcreg = _mm_load_si128((__m128i *)src);

   rb = _mm_andnot_si128(agmask, srcreg);
   ag = _mm_and_si128(agmask, srcreg);
   br = _mm_shufflehi_epi16(_mm_shufflelo_epi16(rb, _MM_SHUFFLE(2, 3, 0, 1)),
                            _MM_SHUFFLE(2, 3, 0, 1));
   dstreg = _mm_or_si128(ag, br);

   _mm_storeu_si128((__m128i *)dst, dstreg);
}
#endif

/**
 * Copy RGBA to BGRA - swap R and B, with the destination 16-byte aligned.
 */
static inline void *
rgba8_copy_aligned_dst(void *dst, const void *src, size_t bytes)
{
   assert(bytes == 0 || !(((uintptr_t)dst) & 0xf));

#if defined(__SSSE3__) || defined(__SSE2__)
   if (bytes == 64) {
      rgba8_copy_16_aligned_dst(dst + 0, src + 0);
      rgba8_copy_16_aligned_dst(dst + 16, src + 16);
      rgba8_copy_16_aligned_dst(dst + 32, src + 32);
      rgba8_copy_16_aligned_dst(dst + 48, src + 48);
      return dst;
   }

   while (bytes >= 16) {
      rgba8_copy_16_aligned_dst(dst, src);
      src += 16;
      dst += 16;
      bytes -= 16;
   }
#endif

   rgba8_copy(dst, src, bytes);

   return dst;
}

/**
 * Copy RGBA to BGRA - swap R and B, with the source 16-byte aligned.
 */
static inline void *
rgba8_copy_aligned_src(void *dst, const void *src, size_t bytes)
{
   assert(bytes == 0 || !(((uintptr_t)src) & 0xf));

#if defined(__SSSE3__) || defined(__SSE2__)
   if (bytes == 64) {
      rgba8_copy_16_aligned_src(dst + 0, src + 0);
      rgba8_copy_16_aligned_src(dst + 16, src + 16);
      rgba8_copy_16_aligned_src(dst + 32, src + 32);
      rgba8_copy_16_aligned_src(dst + 48, src + 48);
      return dst;
   }

   while (bytes >= 16) {
      rgba8_copy_16_aligned_src(dst, src);
      src += 16;
      dst += 16;
      bytes -= 16;
   }
#endif

   rgba8_copy(dst, src, bytes);

   return dst;
}

/**
 * Each row from y0 to y1 is copied in three parts: [x0,x1), [x1,x2), [x2,x3).
 * These ranges are in bytes, i.e. pixels * bytes-per-pixel.
 * The first and last ranges must be shorter than a "span" (the longest
 * linear stretch within a tile), and the middle range must be a whole number
 * of spans.  Ranges may be empty.  The region copied must land entirely
 * within one tile.
 * 'dst' is the start of the tile and 'src' is the corresponding
 * address to copy from, though copying begins at (x0, y0).
 * To enable swizzling, 'swizzle_bit' must be 1<<6, otherwise zero.
 * Swizzling flips bit 6 of the offset within the tiled buffer when certain
 * other bits of that offset are set.
 */
typedef void (*tile_copy_fn)(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
                             uint32_t y0, uint32_t y1,
                             char *dst, const char *src,
                             int32_t linear_pitch,
                             uint32_t swizzle_bit,
                             mem_copy_fn mem_copy);
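
/* For instance (an illustrative case, not taken from the callers below):
 * copying byte range [10, 300) of a row within a Y tile, whose span is 16
 * bytes, splits as x0 = 10, x1 = ALIGN_UP(10, 16) = 16,
 * x2 = ALIGN_DOWN(300, 16) = 288, x3 = 300: a 6-byte head, seventeen full
 * 16-byte spans, and a 12-byte tail.
 */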

/**
 * Copy texture data from linear to X tile layout.
 *
 * \copydoc tile_copy_fn
 *
 * The mem_copy parameters allow the user to specify alternative mem_copy
 * functions that, for instance, may do RGBA -> BGRA swizzling.  The first
 * function must handle any memory alignment, while the second is only
 * called with 16-byte-aligned addresses on whichever side (source or
 * destination) is tiled.
 */
static inline void
linear_to_xtiled(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
                 uint32_t y0, uint32_t y1,
                 char *dst, const char *src,
                 int32_t src_pitch,
                 uint32_t swizzle_bit,
                 mem_copy_fn mem_copy,
                 mem_copy_fn mem_copy_align16)
{
   /* The copy destination offset for each range copied is the sum of
    * an X offset 'x0' or 'xo' and a Y offset 'yo.'
    */
   uint32_t xo, yo;

   src += (ptrdiff_t)y0 * src_pitch;

   for (yo = y0 * xtile_width; yo < y1 * xtile_width; yo += xtile_width) {
      /* Bits 9 and 10 of the copy destination offset control swizzling.
       * Only 'yo' contributes to those bits in the total offset,
       * so calculate 'swizzle' just once per row.
       * Move bits 9 and 10 three and four places respectively down
       * to bit 6 and xor them.
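       * E.g. yo = 0x200 (bit 9 set): ((0x200 >> 3) ^ (0x200 >> 4)) & 0x40
       * = (0x40 ^ 0x20) & 0x40 = 0x40, so bit 6 of the offset gets flipped.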
       */
      uint32_t swizzle = ((yo >> 3) ^ (yo >> 4)) & swizzle_bit;

      mem_copy(dst + ((x0 + yo) ^ swizzle), src + x0, x1 - x0);

      for (xo = x1; xo < x2; xo += xtile_span) {
         mem_copy_align16(dst + ((xo + yo) ^ swizzle), src + xo, xtile_span);
      }

      mem_copy_align16(dst + ((xo + yo) ^ swizzle), src + x2, x3 - x2);

      src += src_pitch;
   }
}

/**
 * Copy texture data from linear to Y tile layout.
 *
 * \copydoc tile_copy_fn
 */
static inline void
linear_to_ytiled(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
                 uint32_t y0, uint32_t y1,
                 char *dst, const char *src,
                 int32_t src_pitch,
                 uint32_t swizzle_bit,
                 mem_copy_fn mem_copy,
                 mem_copy_fn mem_copy_align16)
{
   /* Y tiles consist of columns that are 'ytile_span' wide (and the same
    * height as the tile).  Thus the destination offset for (x,y) is the
    * sum of:
    *   (x % column_width)                    // position within column
    *   (x / column_width) * bytes_per_column // column number * bytes per column
    *   y * column_width
    *
    * The copy destination offset for each range copied is the sum of
    * an X offset 'xo0' or 'xo' and a Y offset 'yo.'
    */
   const uint32_t column_width = ytile_span;
   const uint32_t bytes_per_column = column_width * ytile_height;
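
   /* As a concrete illustration of the formula above: with column_width = 16
    * and bytes_per_column = 16 * 32 = 512, the byte at (x,y) = (35, 2) lands
    * at offset (35 % 16) + (35 / 16) * 512 + 2 * 16 = 3 + 1024 + 32 = 1059.
    */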

   uint32_t xo0 = (x0 % ytile_span) + (x0 / ytile_span) * bytes_per_column;
   uint32_t xo1 = (x1 % ytile_span) + (x1 / ytile_span) * bytes_per_column;

   /* Bit 9 of the destination offset controls swizzling.
    * Only the X offset contributes to bit 9 of the total offset,
    * so swizzle can be calculated in advance for these X positions.
    * Move bit 9 three places down to bit 6.
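    * E.g. xo = 0x200 (bit 9 set): (0x200 >> 3) & 0x40 = 0x40, flipping bit 6.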
    */
   uint32_t swizzle0 = (xo0 >> 3) & swizzle_bit;
   uint32_t swizzle1 = (xo1 >> 3) & swizzle_bit;

   uint32_t x, yo;

   src += (ptrdiff_t)y0 * src_pitch;

   for (yo = y0 * column_width; yo < y1 * column_width; yo += column_width) {
      uint32_t xo = xo1;
      uint32_t swizzle = swizzle1;

      mem_copy(dst + ((xo0 + yo) ^ swizzle0), src + x0, x1 - x0);

      /* Step by spans/columns.  As it happens, the swizzle bit flips
       * at each step so we don't need to calculate it explicitly.
       */
      for (x = x1; x < x2; x += ytile_span) {
         mem_copy_align16(dst + ((xo + yo) ^ swizzle), src + x, ytile_span);
         xo += bytes_per_column;
         swizzle ^= swizzle_bit;
      }

      mem_copy_align16(dst + ((xo + yo) ^ swizzle), src + x2, x3 - x2);

      src += src_pitch;
   }
}

/**
 * Copy texture data from X tile layout to linear.
 *
 * \copydoc tile_copy_fn
 */
static inline void
xtiled_to_linear(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
                 uint32_t y0, uint32_t y1,
                 char *dst, const char *src,
                 int32_t dst_pitch,
                 uint32_t swizzle_bit,
                 mem_copy_fn mem_copy,
                 mem_copy_fn mem_copy_align16)
{
   /* The copy source offset for each range copied is the sum of
    * an X offset 'x0' or 'xo' and a Y offset 'yo.'
    */
   uint32_t xo, yo;

   dst += (ptrdiff_t)y0 * dst_pitch;

   for (yo = y0 * xtile_width; yo < y1 * xtile_width; yo += xtile_width) {
      /* Bits 9 and 10 of the copy source offset control swizzling.
       * Only 'yo' contributes to those bits in the total offset,
       * so calculate 'swizzle' just once per row.
       * Move bits 9 and 10 three and four places respectively down
       * to bit 6 and xor them.
       */
      uint32_t swizzle = ((yo >> 3) ^ (yo >> 4)) & swizzle_bit;

      mem_copy(dst + x0, src + ((x0 + yo) ^ swizzle), x1 - x0);

      for (xo = x1; xo < x2; xo += xtile_span) {
         mem_copy_align16(dst + xo, src + ((xo + yo) ^ swizzle), xtile_span);
      }

      mem_copy_align16(dst + x2, src + ((xo + yo) ^ swizzle), x3 - x2);

      dst += dst_pitch;
   }
}

/**
 * Copy texture data from Y tile layout to linear.
 *
 * \copydoc tile_copy_fn
 */
static inline void
ytiled_to_linear(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
                 uint32_t y0, uint32_t y1,
                 char *dst, const char *src,
                 int32_t dst_pitch,
                 uint32_t swizzle_bit,
                 mem_copy_fn mem_copy,
                 mem_copy_fn mem_copy_align16)
{
   /* Y tiles consist of columns that are 'ytile_span' wide (and the same
    * height as the tile).  Thus the source offset for (x,y) is the sum of:
    *   (x % column_width)                    // position within column
    *   (x / column_width) * bytes_per_column // column number * bytes per column
    *   y * column_width
    *
    * The copy source offset for each range copied is the sum of
    * an X offset 'xo0' or 'xo' and a Y offset 'yo.'
    */
   const uint32_t column_width = ytile_span;
   const uint32_t bytes_per_column = column_width * ytile_height;

   uint32_t xo0 = (x0 % ytile_span) + (x0 / ytile_span) * bytes_per_column;
   uint32_t xo1 = (x1 % ytile_span) + (x1 / ytile_span) * bytes_per_column;

   /* Bit 9 of the source offset controls swizzling.
    * Only the X offset contributes to bit 9 of the total offset,
    * so swizzle can be calculated in advance for these X positions.
    * Move bit 9 three places down to bit 6.
    */
   uint32_t swizzle0 = (xo0 >> 3) & swizzle_bit;
   uint32_t swizzle1 = (xo1 >> 3) & swizzle_bit;

   uint32_t x, yo;

   dst += (ptrdiff_t)y0 * dst_pitch;

   for (yo = y0 * column_width; yo < y1 * column_width; yo += column_width) {
      uint32_t xo = xo1;
      uint32_t swizzle = swizzle1;

      mem_copy(dst + x0, src + ((xo0 + yo) ^ swizzle0), x1 - x0);

      /* Step by spans/columns.  As it happens, the swizzle bit flips
       * at each step so we don't need to calculate it explicitly.
       */
      for (x = x1; x < x2; x += ytile_span) {
         mem_copy_align16(dst + x, src + ((xo + yo) ^ swizzle), ytile_span);
         xo += bytes_per_column;
         swizzle ^= swizzle_bit;
      }

      mem_copy_align16(dst + x2, src + ((xo + yo) ^ swizzle), x3 - x2);

      dst += dst_pitch;
   }
}


/**
 * Copy texture data from linear to X tile layout, faster.
 *
 * Same as \ref linear_to_xtiled but faster, because it passes constant
 * parameters for common cases, allowing the compiler to inline code
 * optimized for those cases.
 *
 * \copydoc tile_copy_fn
 */
static FLATTEN void
linear_to_xtiled_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
                        uint32_t y0, uint32_t y1,
                        char *dst, const char *src,
                        int32_t src_pitch,
                        uint32_t swizzle_bit,
                        mem_copy_fn mem_copy)
{
   if (x0 == 0 && x3 == xtile_width && y0 == 0 && y1 == xtile_height) {
      if (mem_copy == memcpy)
         return linear_to_xtiled(0, 0, xtile_width, xtile_width, 0, xtile_height,
                                 dst, src, src_pitch, swizzle_bit, memcpy, memcpy);
      else if (mem_copy == rgba8_copy)
         return linear_to_xtiled(0, 0, xtile_width, xtile_width, 0, xtile_height,
                                 dst, src, src_pitch, swizzle_bit,
                                 rgba8_copy, rgba8_copy_aligned_dst);
      else
         unreachable("not reached");
   } else {
      if (mem_copy == memcpy)
         return linear_to_xtiled(x0, x1, x2, x3, y0, y1,
                                 dst, src, src_pitch, swizzle_bit,
                                 memcpy, memcpy);
      else if (mem_copy == rgba8_copy)
         return linear_to_xtiled(x0, x1, x2, x3, y0, y1,
                                 dst, src, src_pitch, swizzle_bit,
                                 rgba8_copy, rgba8_copy_aligned_dst);
      else
         unreachable("not reached");
   }
   linear_to_xtiled(x0, x1, x2, x3, y0, y1,
                    dst, src, src_pitch, swizzle_bit, mem_copy, mem_copy);
}

/**
 * Copy texture data from linear to Y tile layout, faster.
 *
 * Same as \ref linear_to_ytiled but faster, because it passes constant
 * parameters for common cases, allowing the compiler to inline code
 * optimized for those cases.
 *
 * \copydoc tile_copy_fn
 */
static FLATTEN void
linear_to_ytiled_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
                        uint32_t y0, uint32_t y1,
                        char *dst, const char *src,
                        int32_t src_pitch,
                        uint32_t swizzle_bit,
                        mem_copy_fn mem_copy)
{
   if (x0 == 0 && x3 == ytile_width && y0 == 0 && y1 == ytile_height) {
      if (mem_copy == memcpy)
         return linear_to_ytiled(0, 0, ytile_width, ytile_width, 0, ytile_height,
                                 dst, src, src_pitch, swizzle_bit, memcpy, memcpy);
      else if (mem_copy == rgba8_copy)
         return linear_to_ytiled(0, 0, ytile_width, ytile_width, 0, ytile_height,
                                 dst, src, src_pitch, swizzle_bit,
                                 rgba8_copy, rgba8_copy_aligned_dst);
      else
         unreachable("not reached");
   } else {
      if (mem_copy == memcpy)
         return linear_to_ytiled(x0, x1, x2, x3, y0, y1,
                                 dst, src, src_pitch, swizzle_bit, memcpy, memcpy);
      else if (mem_copy == rgba8_copy)
         return linear_to_ytiled(x0, x1, x2, x3, y0, y1,
                                 dst, src, src_pitch, swizzle_bit,
                                 rgba8_copy, rgba8_copy_aligned_dst);
      else
         unreachable("not reached");
   }
   linear_to_ytiled(x0, x1, x2, x3, y0, y1,
                    dst, src, src_pitch, swizzle_bit, mem_copy, mem_copy);
}

/**
 * Copy texture data from X tile layout to linear, faster.
 *
 * Same as \ref xtiled_to_linear but faster, because it passes constant
 * parameters for common cases, allowing the compiler to inline code
 * optimized for those cases.
 *
 * \copydoc tile_copy_fn
 */
static FLATTEN void
xtiled_to_linear_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
                        uint32_t y0, uint32_t y1,
                        char *dst, const char *src,
                        int32_t dst_pitch,
                        uint32_t swizzle_bit,
                        mem_copy_fn mem_copy)
{
   if (x0 == 0 && x3 == xtile_width && y0 == 0 && y1 == xtile_height) {
      if (mem_copy == memcpy)
         return xtiled_to_linear(0, 0, xtile_width, xtile_width, 0, xtile_height,
                                 dst, src, dst_pitch, swizzle_bit, memcpy, memcpy);
      else if (mem_copy == rgba8_copy)
         return xtiled_to_linear(0, 0, xtile_width, xtile_width, 0, xtile_height,
                                 dst, src, dst_pitch, swizzle_bit,
                                 rgba8_copy, rgba8_copy_aligned_src);
      else
         unreachable("not reached");
   } else {
      if (mem_copy == memcpy)
         return xtiled_to_linear(x0, x1, x2, x3, y0, y1,
                                 dst, src, dst_pitch, swizzle_bit, memcpy, memcpy);
      else if (mem_copy == rgba8_copy)
         return xtiled_to_linear(x0, x1, x2, x3, y0, y1,
                                 dst, src, dst_pitch, swizzle_bit,
                                 rgba8_copy, rgba8_copy_aligned_src);
      else
         unreachable("not reached");
   }
   xtiled_to_linear(x0, x1, x2, x3, y0, y1,
                    dst, src, dst_pitch, swizzle_bit, mem_copy, mem_copy);
}

/**
 * Copy texture data from Y tile layout to linear, faster.
 *
 * Same as \ref ytiled_to_linear but faster, because it passes constant
 * parameters for common cases, allowing the compiler to inline code
 * optimized for those cases.
 *
 * \copydoc tile_copy_fn
 */
static FLATTEN void
ytiled_to_linear_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
                        uint32_t y0, uint32_t y1,
                        char *dst, const char *src,
                        int32_t dst_pitch,
                        uint32_t swizzle_bit,
                        mem_copy_fn mem_copy)
{
   if (x0 == 0 && x3 == ytile_width && y0 == 0 && y1 == ytile_height) {
      if (mem_copy == memcpy)
         return ytiled_to_linear(0, 0, ytile_width, ytile_width, 0, ytile_height,
                                 dst, src, dst_pitch, swizzle_bit, memcpy, memcpy);
      else if (mem_copy == rgba8_copy)
         return ytiled_to_linear(0, 0, ytile_width, ytile_width, 0, ytile_height,
                                 dst, src, dst_pitch, swizzle_bit,
                                 rgba8_copy, rgba8_copy_aligned_src);
      else
         unreachable("not reached");
   } else {
      if (mem_copy == memcpy)
         return ytiled_to_linear(x0, x1, x2, x3, y0, y1,
                                 dst, src, dst_pitch, swizzle_bit, memcpy, memcpy);
      else if (mem_copy == rgba8_copy)
         return ytiled_to_linear(x0, x1, x2, x3, y0, y1,
                                 dst, src, dst_pitch, swizzle_bit,
                                 rgba8_copy, rgba8_copy_aligned_src);
      else
         unreachable("not reached");
   }
   ytiled_to_linear(x0, x1, x2, x3, y0, y1,
                    dst, src, dst_pitch, swizzle_bit, mem_copy, mem_copy);
}

/**
 * Copy from linear to tiled texture.
 *
 * Divide the region given by X range [xt1, xt2) and Y range [yt1, yt2) into
 * pieces that do not cross tile boundaries and copy each piece with a tile
 * copy function (\ref tile_copy_fn).
 * The X range is in bytes, i.e. pixels * bytes-per-pixel.
 * The Y range is in pixels (i.e. unitless).
 * 'dst' is the start of the tiled texture and 'src' is the corresponding
 * address to copy from, though copying begins at (xt1, yt1).
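 *
 * A sketch of a typical call (buffer names here are illustrative, not from
 * this file): uploading a 64x64-pixel region of 4-byte pixels at pixel
 * coordinates (px, py) into a swizzled Y-tiled buffer:
 * \code
 *    linear_to_tiled(px * 4, (px + 64) * 4,   // X range in bytes
 *                    py, py + 64,             // Y range in pixels
 *                    tiled_map,               // start of the tiled texture
 *                    pixels - px * 4 - (ptrdiff_t)py * src_pitch,
 *                    dst_pitch, src_pitch,
 *                    true, I915_TILING_Y, memcpy);
 * \endcode
 * where 'pixels' points at the first pixel to copy, so that
 * src + xt1 + yt1 * src_pitch lands on it.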
 */
void
linear_to_tiled(uint32_t xt1, uint32_t xt2,
                uint32_t yt1, uint32_t yt2,
                char *dst, const char *src,
                uint32_t dst_pitch, int32_t src_pitch,
                bool has_swizzling,
                uint32_t tiling,
                mem_copy_fn mem_copy)
{
   tile_copy_fn tile_copy;
   uint32_t xt0, xt3;
   uint32_t yt0, yt3;
   uint32_t xt, yt;
   uint32_t tw, th, span;
   uint32_t swizzle_bit = has_swizzling ? 1<<6 : 0;

   if (tiling == I915_TILING_X) {
      tw = xtile_width;
      th = xtile_height;
      span = xtile_span;
      tile_copy = linear_to_xtiled_faster;
   } else if (tiling == I915_TILING_Y) {
      tw = ytile_width;
      th = ytile_height;
      span = ytile_span;
      tile_copy = linear_to_ytiled_faster;
   } else {
      unreachable("unsupported tiling");
   }

   /* Round out to tile boundaries. */
   xt0 = ALIGN_DOWN(xt1, tw);
   xt3 = ALIGN_UP (xt2, tw);
   yt0 = ALIGN_DOWN(yt1, th);
   yt3 = ALIGN_UP (yt2, th);

   /* Loop over all tiles to which we have something to copy.
    * 'xt' and 'yt' are the origin of the destination tile, whether copying
    * a full or partial tile.
    * tile_copy() copies one tile or partial tile.
    * Looping x inside y is the faster memory access pattern.
    */
   for (yt = yt0; yt < yt3; yt += th) {
      for (xt = xt0; xt < xt3; xt += tw) {
         /* The area to update is [x0,x3) x [y0,y1).
          * May not want the whole tile, hence the min and max.
          */
         uint32_t x0 = MAX2(xt1, xt);
         uint32_t y0 = MAX2(yt1, yt);
         uint32_t x3 = MIN2(xt2, xt + tw);
         uint32_t y1 = MIN2(yt2, yt + th);

         /* [x0,x3) is split into [x0,x1), [x1,x2), [x2,x3) such that
          * the middle interval is the longest span-aligned part.
          * The sub-ranges could be empty.
          */
         uint32_t x1, x2;
         x1 = ALIGN_UP(x0, span);
         if (x1 > x3)
            x1 = x2 = x3;
         else
            x2 = ALIGN_DOWN(x3, span);

         assert(x0 <= x1 && x1 <= x2 && x2 <= x3);
         assert(x1 - x0 < span && x3 - x2 < span);
         assert(x3 - x0 <= tw);
         assert((x2 - x1) % span == 0);

         /* Translate by (xt,yt) for single-tile copier. */
         tile_copy(x0-xt, x1-xt, x2-xt, x3-xt,
                   y0-yt, y1-yt,
                   dst + (ptrdiff_t) xt * th + (ptrdiff_t) yt * dst_pitch,
                   src + (ptrdiff_t) xt      + (ptrdiff_t) yt * src_pitch,
                   src_pitch,
                   swizzle_bit,
                   mem_copy);
      }
   }
}

/**
 * Copy from tiled to linear texture.
 *
 * Divide the region given by X range [xt1, xt2) and Y range [yt1, yt2) into
 * pieces that do not cross tile boundaries and copy each piece with a tile
 * copy function (\ref tile_copy_fn).
 * The X range is in bytes, i.e. pixels * bytes-per-pixel.
 * The Y range is in pixels (i.e. unitless).
 * 'src' is the start of the tiled texture and 'dst' is the corresponding
 * address to copy to, though copying begins at (xt1, yt1).
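 *
 * A sketch of the reverse of the example above (names again illustrative):
 * reading a 64x64-pixel region of 4-byte pixels back from a swizzled
 * Y-tiled buffer:
 * \code
 *    tiled_to_linear(px * 4, (px + 64) * 4,   // X range in bytes
 *                    py, py + 64,             // Y range in pixels
 *                    pixels - px * 4 - (ptrdiff_t)py * dst_pitch,
 *                    tiled_map,               // start of the tiled texture
 *                    dst_pitch, src_pitch,
 *                    true, I915_TILING_Y, memcpy);
 * \endcode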
 */
void
tiled_to_linear(uint32_t xt1, uint32_t xt2,
                uint32_t yt1, uint32_t yt2,
                char *dst, const char *src,
                int32_t dst_pitch, uint32_t src_pitch,
                bool has_swizzling,
                uint32_t tiling,
                mem_copy_fn mem_copy)
{
   tile_copy_fn tile_copy;
   uint32_t xt0, xt3;
   uint32_t yt0, yt3;
   uint32_t xt, yt;
   uint32_t tw, th, span;
   uint32_t swizzle_bit = has_swizzling ? 1<<6 : 0;

   if (tiling == I915_TILING_X) {
      tw = xtile_width;
      th = xtile_height;
      span = xtile_span;
      tile_copy = xtiled_to_linear_faster;
   } else if (tiling == I915_TILING_Y) {
      tw = ytile_width;
      th = ytile_height;
      span = ytile_span;
      tile_copy = ytiled_to_linear_faster;
   } else {
      unreachable("unsupported tiling");
   }

   /* Round out to tile boundaries. */
   xt0 = ALIGN_DOWN(xt1, tw);
   xt3 = ALIGN_UP (xt2, tw);
   yt0 = ALIGN_DOWN(yt1, th);
   yt3 = ALIGN_UP (yt2, th);

   /* Loop over all tiles from which we have something to copy.
    * 'xt' and 'yt' are the origin of the source tile, whether copying
    * a full or partial tile.
    * tile_copy() copies one tile or partial tile.
    * Looping x inside y is the faster memory access pattern.
    */
   for (yt = yt0; yt < yt3; yt += th) {
      for (xt = xt0; xt < xt3; xt += tw) {
         /* The area to update is [x0,x3) x [y0,y1).
          * May not want the whole tile, hence the min and max.
          */
         uint32_t x0 = MAX2(xt1, xt);
         uint32_t y0 = MAX2(yt1, yt);
         uint32_t x3 = MIN2(xt2, xt + tw);
         uint32_t y1 = MIN2(yt2, yt + th);

         /* [x0,x3) is split into [x0,x1), [x1,x2), [x2,x3) such that
          * the middle interval is the longest span-aligned part.
          * The sub-ranges could be empty.
          */
         uint32_t x1, x2;
         x1 = ALIGN_UP(x0, span);
         if (x1 > x3)
            x1 = x2 = x3;
         else
            x2 = ALIGN_DOWN(x3, span);

         assert(x0 <= x1 && x1 <= x2 && x2 <= x3);
         assert(x1 - x0 < span && x3 - x2 < span);
         assert(x3 - x0 <= tw);
         assert((x2 - x1) % span == 0);

         /* Translate by (xt,yt) for single-tile copier. */
         tile_copy(x0-xt, x1-xt, x2-xt, x3-xt,
                   y0-yt, y1-yt,
                   dst + (ptrdiff_t) xt      + (ptrdiff_t) yt * dst_pitch,
                   src + (ptrdiff_t) xt * th + (ptrdiff_t) yt * src_pitch,
                   dst_pitch,
                   swizzle_bit,
                   mem_copy);
      }
   }
}


/**
 * Determine which copy function to use for the given format combination
 *
 * The only two possible copy functions which are ever returned are a
 * direct memcpy and a RGBA <-> BGRA copy function.  Since RGBA -> BGRA and
 * BGRA -> RGBA are exactly the same operation (and memcpy is obviously
 * symmetric), it doesn't matter whether the copy is from the tiled image
 * to the untiled or vice versa.  The copy function required is the same in
 * either case so this function can be used.
 *
 * \param[in]  tiledFormat The format of the tiled image
 * \param[in]  format The GL format of the client data
 * \param[in]  type The GL type of the client data
 * \param[out] mem_copy Will be set to one of either the standard
 *                      library's memcpy or a different copy function
 *                      that performs an RGBA to BGRA conversion
 * \param[out] cpp Number of bytes per pixel
 *
 * \return true if the format and type combination are valid
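 *
 * A sketch of a typical use (the format combination shown is one this
 * function accepts; the variable names are illustrative):
 * \code
 *    mem_copy_fn copy;
 *    uint32_t cpp;
 *    if (intel_get_memcpy(MESA_FORMAT_B8G8R8A8_UNORM, GL_RGBA,
 *                         GL_UNSIGNED_BYTE, &copy, &cpp)) {
 *       // copy == rgba8_copy and cpp == 4 for this combination
 *    }
 * \endcode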
 */
bool intel_get_memcpy(mesa_format tiledFormat, GLenum format,
                      GLenum type, mem_copy_fn *mem_copy, uint32_t *cpp)
{
   /* Start from NULL so the final check below doesn't read an
    * uninitialized value when no format case matches.
    */
   *mem_copy = NULL;

   if (type == GL_UNSIGNED_INT_8_8_8_8_REV &&
       !(format == GL_RGBA || format == GL_BGRA))
      return false; /* Invalid type/format combination */

   if ((tiledFormat == MESA_FORMAT_L_UNORM8 && format == GL_LUMINANCE) ||
       (tiledFormat == MESA_FORMAT_A_UNORM8 && format == GL_ALPHA)) {
      *cpp = 1;
      *mem_copy = memcpy;
   } else if ((tiledFormat == MESA_FORMAT_B8G8R8A8_UNORM) ||
              (tiledFormat == MESA_FORMAT_B8G8R8X8_UNORM) ||
              (tiledFormat == MESA_FORMAT_B8G8R8A8_SRGB) ||
              (tiledFormat == MESA_FORMAT_B8G8R8X8_SRGB)) {
      *cpp = 4;
      if (format == GL_BGRA) {
         *mem_copy = memcpy;
      } else if (format == GL_RGBA) {
         *mem_copy = rgba8_copy;
      }
   } else if ((tiledFormat == MESA_FORMAT_R8G8B8A8_UNORM) ||
              (tiledFormat == MESA_FORMAT_R8G8B8X8_UNORM) ||
              (tiledFormat == MESA_FORMAT_R8G8B8A8_SRGB) ||
              (tiledFormat == MESA_FORMAT_R8G8B8X8_SRGB)) {
      *cpp = 4;
      if (format == GL_BGRA) {
         /* Copying from RGBA to BGRA is the same as BGRA to RGBA so we can
          * use the same function.
          */
         *mem_copy = rgba8_copy;
      } else if (format == GL_RGBA) {
         *mem_copy = memcpy;
      }
   }

   if (!(*mem_copy))
      return false;

   return true;
}