1 /**************************************************************************
2 *
3 * Copyright 2009 VMware, Inc.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28
29 #include "pipe/p_defines.h"
30
31 #include "util/u_format.h"
32 #include "util/u_memory.h"
33 #include "util/u_string.h"
34 #include "util/u_math.h"
35
36 #include "lp_bld_type.h"
37 #include "lp_bld_const.h"
38 #include "lp_bld_conv.h"
39 #include "lp_bld_swizzle.h"
40 #include "lp_bld_gather.h"
41 #include "lp_bld_debug.h"
42 #include "lp_bld_format.h"
43 #include "lp_bld_arit.h"
44 #include "lp_bld_pack.h"
45
46
47 static void
convert_to_soa(struct gallivm_state * gallivm,LLVMValueRef src_aos[LP_MAX_VECTOR_WIDTH/32],LLVMValueRef dst_soa[4],const struct lp_type soa_type)48 convert_to_soa(struct gallivm_state *gallivm,
49 LLVMValueRef src_aos[LP_MAX_VECTOR_WIDTH / 32],
50 LLVMValueRef dst_soa[4],
51 const struct lp_type soa_type)
52 {
53 unsigned j, k;
54 struct lp_type aos_channel_type = soa_type;
55
56 LLVMValueRef aos_channels[4];
57 unsigned pixels_per_channel = soa_type.length / 4;
58
59 debug_assert((soa_type.length % 4) == 0);
60
61 aos_channel_type.length >>= 1;
62
63 for (j = 0; j < 4; ++j) {
64 LLVMValueRef channel[LP_MAX_VECTOR_LENGTH] = { 0 };
65
66 assert(pixels_per_channel <= LP_MAX_VECTOR_LENGTH);
67
68 for (k = 0; k < pixels_per_channel; ++k) {
69 channel[k] = src_aos[j + 4 * k];
70 }
71
72 aos_channels[j] = lp_build_concat(gallivm, channel, aos_channel_type, pixels_per_channel);
73 }
74
75 lp_build_transpose_aos(gallivm, soa_type, aos_channels, dst_soa);
76 }
77
78
79 void
lp_build_format_swizzle_soa(const struct util_format_description * format_desc,struct lp_build_context * bld,const LLVMValueRef * unswizzled,LLVMValueRef swizzled_out[4])80 lp_build_format_swizzle_soa(const struct util_format_description *format_desc,
81 struct lp_build_context *bld,
82 const LLVMValueRef *unswizzled,
83 LLVMValueRef swizzled_out[4])
84 {
85 if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) {
86 enum pipe_swizzle swizzle;
87 LLVMValueRef depth_or_stencil;
88
89 if (util_format_has_stencil(format_desc) &&
90 !util_format_has_depth(format_desc)) {
91 assert(!bld->type.floating);
92 swizzle = format_desc->swizzle[1];
93 }
94 else {
95 assert(bld->type.floating);
96 swizzle = format_desc->swizzle[0];
97 }
98 /*
99 * Return zzz1 or sss1 for depth-stencil formats here.
100 * Correct swizzling will be handled by apply_sampler_swizzle() later.
101 */
102 depth_or_stencil = lp_build_swizzle_soa_channel(bld, unswizzled, swizzle);
103
104 swizzled_out[2] = swizzled_out[1] = swizzled_out[0] = depth_or_stencil;
105 swizzled_out[3] = bld->one;
106 }
107 else {
108 unsigned chan;
109 for (chan = 0; chan < 4; ++chan) {
110 enum pipe_swizzle swizzle = format_desc->swizzle[chan];
111 swizzled_out[chan] = lp_build_swizzle_soa_channel(bld, unswizzled, swizzle);
112 }
113 }
114 }
115
116
117
118 static LLVMValueRef
lp_build_extract_soa_chan(struct lp_build_context * bld,unsigned blockbits,boolean srgb_chan,struct util_format_channel_description chan_desc,LLVMValueRef packed)119 lp_build_extract_soa_chan(struct lp_build_context *bld,
120 unsigned blockbits,
121 boolean srgb_chan,
122 struct util_format_channel_description chan_desc,
123 LLVMValueRef packed)
124 {
125 struct gallivm_state *gallivm = bld->gallivm;
126 LLVMBuilderRef builder = gallivm->builder;
127 struct lp_type type = bld->type;
128 LLVMValueRef input = packed;
129 const unsigned width = chan_desc.size;
130 const unsigned start = chan_desc.shift;
131 const unsigned stop = start + width;
132
133 /* Decode the input vector component */
134
135 switch(chan_desc.type) {
136 case UTIL_FORMAT_TYPE_VOID:
137 input = bld->undef;
138 break;
139
140 case UTIL_FORMAT_TYPE_UNSIGNED:
141 /*
142 * Align the LSB
143 */
144 if (start) {
145 input = LLVMBuildLShr(builder, input,
146 lp_build_const_int_vec(gallivm, type, start), "");
147 }
148
149 /*
150 * Zero the MSBs
151 */
152 if (stop < blockbits) {
153 unsigned mask = ((unsigned long long)1 << width) - 1;
154 input = LLVMBuildAnd(builder, input,
155 lp_build_const_int_vec(gallivm, type, mask), "");
156 }
157
158 /*
159 * Type conversion
160 */
161 if (type.floating) {
162 if (srgb_chan) {
163 struct lp_type conv_type = lp_uint_type(type);
164 input = lp_build_srgb_to_linear(gallivm, conv_type, width, input);
165 }
166 else {
167 if(chan_desc.normalized)
168 input = lp_build_unsigned_norm_to_float(gallivm, width, type, input);
169 else
170 input = LLVMBuildSIToFP(builder, input, bld->vec_type, "");
171 }
172 }
173 else if (chan_desc.pure_integer) {
174 /* Nothing to do */
175 } else {
176 /* FIXME */
177 assert(0);
178 }
179 break;
180
181 case UTIL_FORMAT_TYPE_SIGNED:
182 /*
183 * Align the sign bit first.
184 */
185 if (stop < type.width) {
186 unsigned bits = type.width - stop;
187 LLVMValueRef bits_val = lp_build_const_int_vec(gallivm, type, bits);
188 input = LLVMBuildShl(builder, input, bits_val, "");
189 }
190
191 /*
192 * Align the LSB (with an arithmetic shift to preserve the sign)
193 */
194 if (chan_desc.size < type.width) {
195 unsigned bits = type.width - chan_desc.size;
196 LLVMValueRef bits_val = lp_build_const_int_vec(gallivm, type, bits);
197 input = LLVMBuildAShr(builder, input, bits_val, "");
198 }
199
200 /*
201 * Type conversion
202 */
203 if (type.floating) {
204 input = LLVMBuildSIToFP(builder, input, bld->vec_type, "");
205 if (chan_desc.normalized) {
206 double scale = 1.0 / ((1 << (chan_desc.size - 1)) - 1);
207 LLVMValueRef scale_val = lp_build_const_vec(gallivm, type, scale);
208 input = LLVMBuildFMul(builder, input, scale_val, "");
209 /*
210 * The formula above will produce value below -1.0 for most negative
211 * value but everything seems happy with that hence disable for now.
212 */
213 if (0)
214 input = lp_build_max(bld, input,
215 lp_build_const_vec(gallivm, type, -1.0f));
216 }
217 }
218 else if (chan_desc.pure_integer) {
219 /* Nothing to do */
220 } else {
221 /* FIXME */
222 assert(0);
223 }
224 break;
225
226 case UTIL_FORMAT_TYPE_FLOAT:
227 if (type.floating) {
228 if (chan_desc.size == 16) {
229 struct lp_type f16i_type = type;
230 f16i_type.width /= 2;
231 f16i_type.floating = 0;
232 if (start) {
233 input = LLVMBuildLShr(builder, input,
234 lp_build_const_int_vec(gallivm, type, start), "");
235 }
236 input = LLVMBuildTrunc(builder, input,
237 lp_build_vec_type(gallivm, f16i_type), "");
238 input = lp_build_half_to_float(gallivm, input);
239 } else {
240 assert(start == 0);
241 assert(stop == 32);
242 assert(type.width == 32);
243 }
244 input = LLVMBuildBitCast(builder, input, bld->vec_type, "");
245 }
246 else {
247 /* FIXME */
248 assert(0);
249 input = bld->undef;
250 }
251 break;
252
253 case UTIL_FORMAT_TYPE_FIXED:
254 if (type.floating) {
255 double scale = 1.0 / ((1 << (chan_desc.size/2)) - 1);
256 LLVMValueRef scale_val = lp_build_const_vec(gallivm, type, scale);
257 input = LLVMBuildSIToFP(builder, input, bld->vec_type, "");
258 input = LLVMBuildFMul(builder, input, scale_val, "");
259 }
260 else {
261 /* FIXME */
262 assert(0);
263 input = bld->undef;
264 }
265 break;
266
267 default:
268 assert(0);
269 input = bld->undef;
270 break;
271 }
272
273 return input;
274 }
275
276
277 /**
278 * Unpack several pixels in SoA.
279 *
280 * It takes a vector of packed pixels:
281 *
282 * packed = {P0, P1, P2, P3, ..., Pn}
283 *
284 * And will produce four vectors:
285 *
286 * red = {R0, R1, R2, R3, ..., Rn}
287 * green = {G0, G1, G2, G3, ..., Gn}
288 * blue = {B0, B1, B2, B3, ..., Bn}
289 * alpha = {A0, A1, A2, A3, ..., An}
290 *
291 * It requires that a packed pixel fits into an element of the output
292 * channels. The common case is when converting pixel with a depth of 32 bit or
293 * less into floats.
294 *
295 * \param format_desc the format of the 'packed' incoming pixel vector
296 * \param type the desired type for rgba_out (type.length = n, above)
297 * \param packed the incoming vector of packed pixels
298 * \param rgba_out returns the SoA R,G,B,A vectors
299 */
300 void
lp_build_unpack_rgba_soa(struct gallivm_state * gallivm,const struct util_format_description * format_desc,struct lp_type type,LLVMValueRef packed,LLVMValueRef rgba_out[4])301 lp_build_unpack_rgba_soa(struct gallivm_state *gallivm,
302 const struct util_format_description *format_desc,
303 struct lp_type type,
304 LLVMValueRef packed,
305 LLVMValueRef rgba_out[4])
306 {
307 struct lp_build_context bld;
308 LLVMValueRef inputs[4];
309 unsigned chan;
310
311 assert(format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN);
312 assert(format_desc->block.width == 1);
313 assert(format_desc->block.height == 1);
314 assert(format_desc->block.bits <= type.width);
315 /* FIXME: Support more output types */
316 assert(type.width == 32);
317
318 lp_build_context_init(&bld, gallivm, type);
319
320 /* Decode the input vector components */
321 for (chan = 0; chan < format_desc->nr_channels; ++chan) {
322 struct util_format_channel_description chan_desc = format_desc->channel[chan];
323 boolean srgb_chan = FALSE;
324
325 if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB &&
326 format_desc->swizzle[3] != chan) {
327 srgb_chan = TRUE;
328 }
329
330 inputs[chan] = lp_build_extract_soa_chan(&bld,
331 format_desc->block.bits,
332 srgb_chan,
333 chan_desc,
334 packed);
335 }
336
337 lp_build_format_swizzle_soa(format_desc, &bld, inputs, rgba_out);
338 }
339
340
341 /**
342 * Convert a vector of rgba8 values into 32bit wide SoA vectors.
343 *
344 * \param dst_type The desired return type. For pure integer formats
345 * this should be a 32bit wide int or uint vector type,
346 * otherwise a float vector type.
347 *
348 * \param packed The rgba8 values to pack.
349 *
350 * \param rgba The 4 SoA return vectors.
351 */
352 void
lp_build_rgba8_to_fi32_soa(struct gallivm_state * gallivm,struct lp_type dst_type,LLVMValueRef packed,LLVMValueRef * rgba)353 lp_build_rgba8_to_fi32_soa(struct gallivm_state *gallivm,
354 struct lp_type dst_type,
355 LLVMValueRef packed,
356 LLVMValueRef *rgba)
357 {
358 LLVMBuilderRef builder = gallivm->builder;
359 LLVMValueRef mask = lp_build_const_int_vec(gallivm, dst_type, 0xff);
360 unsigned chan;
361
362 /* XXX technically shouldn't use that for uint dst_type */
363 packed = LLVMBuildBitCast(builder, packed,
364 lp_build_int_vec_type(gallivm, dst_type), "");
365
366 /* Decode the input vector components */
367 for (chan = 0; chan < 4; ++chan) {
368 #ifdef PIPE_ARCH_LITTLE_ENDIAN
369 unsigned start = chan*8;
370 #else
371 unsigned start = (3-chan)*8;
372 #endif
373 unsigned stop = start + 8;
374 LLVMValueRef input;
375
376 input = packed;
377
378 if (start)
379 input = LLVMBuildLShr(builder, input,
380 lp_build_const_int_vec(gallivm, dst_type, start), "");
381
382 if (stop < 32)
383 input = LLVMBuildAnd(builder, input, mask, "");
384
385 if (dst_type.floating)
386 input = lp_build_unsigned_norm_to_float(gallivm, 8, dst_type, input);
387
388 rgba[chan] = input;
389 }
390 }
391
392
393
394 /**
395 * Fetch a texels from a texture, returning them in SoA layout.
396 *
397 * \param type the desired return type for 'rgba'. The vector length
398 * is the number of texels to fetch
399 * \param aligned if the offset is guaranteed to be aligned to element width
400 *
401 * \param base_ptr points to the base of the texture mip tree.
402 * \param offset offset to start of the texture image block. For non-
403 * compressed formats, this simply is an offset to the texel.
404 * For compressed formats, it is an offset to the start of the
405 * compressed data block.
406 *
407 * \param i, j the sub-block pixel coordinates. For non-compressed formats
408 * these will always be (0,0). For compressed formats, i will
409 * be in [0, block_width-1] and j will be in [0, block_height-1].
410 * \param cache optional value pointing to a lp_build_format_cache structure
411 */
412 void
lp_build_fetch_rgba_soa(struct gallivm_state * gallivm,const struct util_format_description * format_desc,struct lp_type type,boolean aligned,LLVMValueRef base_ptr,LLVMValueRef offset,LLVMValueRef i,LLVMValueRef j,LLVMValueRef cache,LLVMValueRef rgba_out[4])413 lp_build_fetch_rgba_soa(struct gallivm_state *gallivm,
414 const struct util_format_description *format_desc,
415 struct lp_type type,
416 boolean aligned,
417 LLVMValueRef base_ptr,
418 LLVMValueRef offset,
419 LLVMValueRef i,
420 LLVMValueRef j,
421 LLVMValueRef cache,
422 LLVMValueRef rgba_out[4])
423 {
424 LLVMBuilderRef builder = gallivm->builder;
425 enum pipe_format format = format_desc->format;
426 struct lp_type fetch_type;
427
428 if (format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN &&
429 (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB ||
430 format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB ||
431 format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) &&
432 format_desc->block.width == 1 &&
433 format_desc->block.height == 1 &&
434 format_desc->block.bits <= type.width &&
435 (format_desc->channel[0].type != UTIL_FORMAT_TYPE_FLOAT ||
436 format_desc->channel[0].size == 32 ||
437 format_desc->channel[0].size == 16))
438 {
439 /*
440 * The packed pixel fits into an element of the destination format. Put
441 * the packed pixels into a vector and extract each component for all
442 * vector elements in parallel.
443 */
444
445 LLVMValueRef packed;
446
447 /*
448 * gather the texels from the texture
449 * Ex: packed = {XYZW, XYZW, XYZW, XYZW}
450 */
451 assert(format_desc->block.bits <= type.width);
452 fetch_type = lp_type_uint(type.width);
453 packed = lp_build_gather(gallivm,
454 type.length,
455 format_desc->block.bits,
456 fetch_type,
457 aligned,
458 base_ptr, offset, FALSE);
459
460 /*
461 * convert texels to float rgba
462 */
463 lp_build_unpack_rgba_soa(gallivm,
464 format_desc,
465 type,
466 packed, rgba_out);
467 return;
468 }
469
470
471 if (format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN &&
472 (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB) &&
473 format_desc->block.width == 1 &&
474 format_desc->block.height == 1 &&
475 format_desc->block.bits > type.width &&
476 ((format_desc->block.bits <= type.width * type.length &&
477 format_desc->channel[0].size <= type.width) ||
478 (format_desc->channel[0].size == 64 &&
479 format_desc->channel[0].type == UTIL_FORMAT_TYPE_FLOAT &&
480 type.floating)))
481 {
482 /*
483 * Similar to above, but the packed pixel is larger than what fits
484 * into an element of the destination format. The packed pixels will be
485 * shuffled into SoA vectors appropriately, and then the extraction will
486 * be done in parallel as much as possible.
487 * Good for 16xn (n > 2) and 32xn (n > 1) formats, care is taken so
488 * the gathered vectors can be shuffled easily (even with avx).
489 * 64xn float -> 32xn float is handled too but it's a bit special as
490 * it does the conversion pre-shuffle.
491 */
492
493 LLVMValueRef packed[4], dst[4], output[4], shuffles[LP_MAX_VECTOR_WIDTH/32];
494 struct lp_type fetch_type, gather_type = type;
495 unsigned num_gather, fetch_width, i, j;
496 struct lp_build_context bld;
497 boolean fp64 = format_desc->channel[0].size == 64;
498
499 lp_build_context_init(&bld, gallivm, type);
500
501 assert(type.width == 32);
502 assert(format_desc->block.bits > type.width);
503
504 /*
505 * First, figure out fetch order.
506 */
507 fetch_width = util_next_power_of_two(format_desc->block.bits);
508 /*
509 * fp64 are treated like fp32 except we fetch twice wide values
510 * (as we shuffle after trunc). The shuffles for that work out
511 * mostly fine (slightly suboptimal for 4-wide, perfect for AVX)
512 * albeit we miss the potential opportunity for hw gather (as it
513 * only handles native size).
514 */
515 num_gather = fetch_width / type.width;
516 gather_type.width *= num_gather;
517 if (fp64) {
518 num_gather /= 2;
519 }
520 gather_type.length /= num_gather;
521
522 for (i = 0; i < num_gather; i++) {
523 LLVMValueRef offsetr, shuf_vec;
524 if(num_gather == 4) {
525 for (j = 0; j < gather_type.length; j++) {
526 unsigned idx = i + 4*j;
527 shuffles[j] = lp_build_const_int32(gallivm, idx);
528 }
529 shuf_vec = LLVMConstVector(shuffles, gather_type.length);
530 offsetr = LLVMBuildShuffleVector(builder, offset, offset, shuf_vec, "");
531
532 }
533 else if (num_gather == 2) {
534 assert(num_gather == 2);
535 for (j = 0; j < gather_type.length; j++) {
536 unsigned idx = i*2 + (j%2) + (j/2)*4;
537 shuffles[j] = lp_build_const_int32(gallivm, idx);
538 }
539 shuf_vec = LLVMConstVector(shuffles, gather_type.length);
540 offsetr = LLVMBuildShuffleVector(builder, offset, offset, shuf_vec, "");
541 }
542 else {
543 assert(num_gather == 1);
544 offsetr = offset;
545 }
546 if (gather_type.length == 1) {
547 LLVMValueRef zero = lp_build_const_int32(gallivm, 0);
548 offsetr = LLVMBuildExtractElement(builder, offsetr, zero, "");
549 }
550
551 /*
552 * Determine whether to use float or int loads. This is mostly
553 * to outsmart the (stupid) llvm int/float shuffle logic, we
554 * don't really care much if the data is floats or ints...
555 * But llvm will refuse to use single float shuffle with int data
556 * and instead use 3 int shuffles instead, the code looks atrocious.
557 * (Note bitcasts often won't help, as llvm is too smart to be
558 * fooled by that.)
559 * Nobody cares about simd float<->int domain transition penalties,
560 * which usually don't even exist for shuffles anyway.
561 * With 4x32bit (and 3x32bit) fetch, we use float vec (the data is
562 * going into transpose, which is unpacks, so doesn't really matter
563 * much).
564 * With 2x32bit or 4x16bit fetch, we use float vec, since those
565 * go into the weird channel separation shuffle. With floats,
566 * this is (with 128bit vectors):
567 * - 2 movq, 2 movhpd, 2 shufps
568 * With ints it would be:
569 * - 4 movq, 2 punpcklqdq, 4 pshufd, 2 blendw
570 * I've seen texture functions increase in code size by 15% just due
571 * to that (there's lots of such fetches in them...)
572 * (We could chose a different gather order to improve this somewhat
573 * for the int path, but it would basically just drop the blends,
574 * so the float path with this order really is optimal.)
575 * Albeit it is tricky sometimes llvm doesn't ignore the float->int
576 * casts so must avoid them until we're done with the float shuffle...
577 * 3x16bit formats (the same is also true for 3x8) are pretty bad but
578 * there's nothing we can do about them (we could overallocate by
579 * those couple bytes and use unaligned but pot sized load).
580 * Note that this is very much x86 specific. I don't know if this
581 * affect other archs at all.
582 */
583 if (num_gather > 1) {
584 /*
585 * We always want some float type here (with x86)
586 * due to shuffles being float ones afterwards (albeit for
587 * the num_gather == 4 case int should work fine too
588 * (unless there's some problems with avx but not avx2).
589 */
590 if (format_desc->channel[0].size == 64) {
591 fetch_type = lp_type_float_vec(64, gather_type.width);
592 } else {
593 fetch_type = lp_type_int_vec(32, gather_type.width);
594 }
595 }
596 else {
597 /* type doesn't matter much */
598 if (format_desc->channel[0].type == UTIL_FORMAT_TYPE_FLOAT &&
599 (format_desc->channel[0].size == 32 ||
600 format_desc->channel[0].size == 64)) {
601 fetch_type = lp_type_float(gather_type.width);
602 } else {
603 fetch_type = lp_type_uint(gather_type.width);
604 }
605 }
606
607 /* Now finally gather the values */
608 packed[i] = lp_build_gather(gallivm, gather_type.length,
609 format_desc->block.bits,
610 fetch_type, aligned,
611 base_ptr, offsetr, FALSE);
612 if (fp64) {
613 struct lp_type conv_type = type;
614 conv_type.width *= 2;
615 packed[i] = LLVMBuildBitCast(builder, packed[i],
616 lp_build_vec_type(gallivm, conv_type), "");
617 packed[i] = LLVMBuildFPTrunc(builder, packed[i], bld.vec_type, "");
618 }
619 }
620
621 /* shuffle the gathered values to SoA */
622 if (num_gather == 2) {
623 for (i = 0; i < num_gather; i++) {
624 for (j = 0; j < type.length; j++) {
625 unsigned idx = (j%2)*2 + (j/4)*4 + i;
626 if ((j/2)%2)
627 idx += type.length;
628 shuffles[j] = lp_build_const_int32(gallivm, idx);
629 }
630 dst[i] = LLVMBuildShuffleVector(builder, packed[0], packed[1],
631 LLVMConstVector(shuffles, type.length), "");
632 }
633 }
634 else if (num_gather == 4) {
635 lp_build_transpose_aos(gallivm, lp_int_type(type), packed, dst);
636 }
637 else {
638 assert(num_gather == 1);
639 dst[0] = packed[0];
640 }
641
642 /*
643 * And finally unpack exactly as above, except that
644 * chan shift is adjusted and the right vector selected.
645 */
646 if (!fp64) {
647 for (i = 0; i < num_gather; i++) {
648 dst[i] = LLVMBuildBitCast(builder, dst[i], bld.int_vec_type, "");
649 }
650 for (i = 0; i < format_desc->nr_channels; i++) {
651 struct util_format_channel_description chan_desc = format_desc->channel[i];
652 unsigned blockbits = type.width;
653 unsigned vec_nr;
654
655 #ifdef PIPE_ARCH_BIG_ENDIAN
656 vec_nr = (format_desc->block.bits - (chan_desc.shift + chan_desc.size)) / type.width;
657 #else
658 vec_nr = chan_desc.shift / type.width;
659 #endif
660 chan_desc.shift %= type.width;
661
662 output[i] = lp_build_extract_soa_chan(&bld,
663 blockbits,
664 FALSE,
665 chan_desc,
666 dst[vec_nr]);
667 }
668 }
669 else {
670 for (i = 0; i < format_desc->nr_channels; i++) {
671 output[i] = dst[i];
672 }
673 }
674
675 lp_build_format_swizzle_soa(format_desc, &bld, output, rgba_out);
676 return;
677 }
678
679 if (format == PIPE_FORMAT_R11G11B10_FLOAT ||
680 format == PIPE_FORMAT_R9G9B9E5_FLOAT) {
681 /*
682 * similar conceptually to above but requiring special
683 * AoS packed -> SoA float conversion code.
684 */
685 LLVMValueRef packed;
686 struct lp_type fetch_type = lp_type_uint(type.width);
687
688 assert(type.floating);
689 assert(type.width == 32);
690
691 packed = lp_build_gather(gallivm, type.length,
692 format_desc->block.bits,
693 fetch_type, aligned,
694 base_ptr, offset, FALSE);
695 if (format == PIPE_FORMAT_R11G11B10_FLOAT) {
696 lp_build_r11g11b10_to_float(gallivm, packed, rgba_out);
697 }
698 else {
699 lp_build_rgb9e5_to_float(gallivm, packed, rgba_out);
700 }
701 return;
702 }
703
704 if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS &&
705 format_desc->block.bits == 64) {
706 /*
707 * special case the format is 64 bits but we only require
708 * 32bit (or 8bit) from each block.
709 */
710 LLVMValueRef packed;
711 struct lp_type fetch_type = lp_type_uint(type.width);
712
713 if (format == PIPE_FORMAT_X32_S8X24_UINT) {
714 /*
715 * for stencil simply fix up offsets - could in fact change
716 * base_ptr instead even outside the shader.
717 */
718 unsigned mask = (1 << 8) - 1;
719 LLVMValueRef s_offset = lp_build_const_int_vec(gallivm, type, 4);
720 offset = LLVMBuildAdd(builder, offset, s_offset, "");
721 packed = lp_build_gather(gallivm, type.length, 32, fetch_type,
722 aligned, base_ptr, offset, FALSE);
723 packed = LLVMBuildAnd(builder, packed,
724 lp_build_const_int_vec(gallivm, type, mask), "");
725 }
726 else {
727 assert (format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT);
728 packed = lp_build_gather(gallivm, type.length, 32, fetch_type,
729 aligned, base_ptr, offset, TRUE);
730 packed = LLVMBuildBitCast(builder, packed,
731 lp_build_vec_type(gallivm, type), "");
732 }
733 /* for consistency with lp_build_unpack_rgba_soa() return sss1 or zzz1 */
734 rgba_out[0] = rgba_out[1] = rgba_out[2] = packed;
735 rgba_out[3] = lp_build_const_vec(gallivm, type, 1.0f);
736 return;
737 }
738
739 /*
740 * Try calling lp_build_fetch_rgba_aos for all pixels.
741 * Should only really hit subsampled, compressed
742 * (for s3tc srgb too, for rgtc the unorm ones only) by now.
743 * (This is invalid for plain 8unorm formats because we're lazy with
744 * the swizzle since some results would arrive swizzled, some not.)
745 */
746
747 if ((format_desc->layout != UTIL_FORMAT_LAYOUT_PLAIN) &&
748 (util_format_fits_8unorm(format_desc) ||
749 format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC) &&
750 type.floating && type.width == 32 &&
751 (type.length == 1 || (type.length % 4 == 0))) {
752 struct lp_type tmp_type;
753 struct lp_build_context bld;
754 LLVMValueRef packed, rgba[4];
755 const struct util_format_description *flinear_desc;
756 const struct util_format_description *frgba8_desc;
757 unsigned chan;
758
759 lp_build_context_init(&bld, gallivm, type);
760
761 /*
762 * Make sure the conversion in aos really only does convert to rgba8
763 * and not anything more (so use linear format, adjust type).
764 */
765 flinear_desc = util_format_description(util_format_linear(format));
766 memset(&tmp_type, 0, sizeof tmp_type);
767 tmp_type.width = 8;
768 tmp_type.length = type.length * 4;
769 tmp_type.norm = TRUE;
770
771 packed = lp_build_fetch_rgba_aos(gallivm, flinear_desc, tmp_type,
772 aligned, base_ptr, offset, i, j, cache);
773 packed = LLVMBuildBitCast(builder, packed, bld.int_vec_type, "");
774
775 /*
776 * The values are now packed so they match ordinary (srgb) RGBA8 format,
777 * hence need to use matching format for unpack.
778 */
779 frgba8_desc = util_format_description(PIPE_FORMAT_R8G8B8A8_UNORM);
780 if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB) {
781 assert(format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC);
782 frgba8_desc = util_format_description(PIPE_FORMAT_R8G8B8A8_SRGB);
783 }
784 lp_build_unpack_rgba_soa(gallivm,
785 frgba8_desc,
786 type,
787 packed, rgba);
788
789 /*
790 * We converted 4 channels. Make sure llvm can drop unneeded ones
791 * (luckily the rgba order is fixed, only LA needs special case).
792 */
793 for (chan = 0; chan < 4; chan++) {
794 enum pipe_swizzle swizzle = format_desc->swizzle[chan];
795 if (chan == 3 && util_format_is_luminance_alpha(format)) {
796 swizzle = PIPE_SWIZZLE_W;
797 }
798 rgba_out[chan] = lp_build_swizzle_soa_channel(&bld, rgba, swizzle);
799 }
800 return;
801 }
802
803
804 /*
805 * Fallback to calling lp_build_fetch_rgba_aos for each pixel.
806 *
807 * This is not the most efficient way of fetching pixels, as we
808 * miss some opportunities to do vectorization, but this is
809 * convenient for formats or scenarios for which there was no
810 * opportunity or incentive to optimize.
811 *
812 * We do NOT want to end up here, this typically is quite terrible,
813 * in particular if the formats have less than 4 channels.
814 *
815 * Right now, this should only be hit for:
816 * - RGTC snorm formats
817 * (those miss fast fetch functions hence they are terrible anyway)
818 */
819
820 {
821 unsigned k;
822 struct lp_type tmp_type;
823 LLVMValueRef aos_fetch[LP_MAX_VECTOR_WIDTH / 32];
824
825 if (gallivm_debug & GALLIVM_DEBUG_PERF) {
826 debug_printf("%s: AoS fetch fallback for %s\n",
827 __FUNCTION__, format_desc->short_name);
828 }
829
830 tmp_type = type;
831 tmp_type.length = 4;
832
833 /*
834 * Note that vector transpose can be worse compared to insert/extract
835 * for aos->soa conversion (for formats with 1 or 2 channels). However,
836 * we should try to avoid getting here for just about all formats, so
837 * don't bother.
838 */
839
840 /* loop over number of pixels */
841 for(k = 0; k < type.length; ++k) {
842 LLVMValueRef index = lp_build_const_int32(gallivm, k);
843 LLVMValueRef offset_elem;
844 LLVMValueRef i_elem, j_elem;
845
846 offset_elem = LLVMBuildExtractElement(builder, offset,
847 index, "");
848
849 i_elem = LLVMBuildExtractElement(builder, i, index, "");
850 j_elem = LLVMBuildExtractElement(builder, j, index, "");
851
852 /* Get a single float[4]={R,G,B,A} pixel */
853 aos_fetch[k] = lp_build_fetch_rgba_aos(gallivm, format_desc, tmp_type,
854 aligned, base_ptr, offset_elem,
855 i_elem, j_elem, cache);
856
857 }
858 convert_to_soa(gallivm, aos_fetch, rgba_out, type);
859 }
860 }
861