/**************************************************************************
 *
 * Copyright 2009 VMware, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 **************************************************************************/


#include "pipe/p_defines.h"

#include "util/format/u_format.h"
#include "util/u_memory.h"
#include "util/u_string.h"
#include "util/u_math.h"

#include "lp_bld_type.h"
#include "lp_bld_const.h"
#include "lp_bld_conv.h"
#include "lp_bld_swizzle.h"
#include "lp_bld_gather.h"
#include "lp_bld_debug.h"
#include "lp_bld_format.h"
#include "lp_bld_arit.h"
#include "lp_bld_pack.h"
#include "lp_bld_flow.h"
#include "lp_bld_printf.h"
#include "lp_bld_intr.h"

static void
convert_to_soa(struct gallivm_state *gallivm,
               LLVMValueRef src_aos[LP_MAX_VECTOR_WIDTH / 32],
               LLVMValueRef dst_soa[4],
               const struct lp_type soa_type)
{
   unsigned j, k;
   struct lp_type aos_channel_type = soa_type;

   LLVMValueRef aos_channels[4];
   unsigned pixels_per_channel = soa_type.length / 4;

   debug_assert((soa_type.length % 4) == 0);

   aos_channel_type.length >>= 1;
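
   /*
    * Each src_aos[k] holds one AoS-fetched pixel (its 4 channels); e.g.
    * with soa_type.length == 8 there are 2 pixels per channel slot, so
    * slot j below concatenates src_aos[j] and src_aos[j + 4] before the
    * final transpose turns the result into per-channel SoA vectors.
    */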

   for (j = 0; j < 4; ++j) {
      LLVMValueRef channel[LP_MAX_VECTOR_LENGTH] = { 0 };

      assert(pixels_per_channel <= LP_MAX_VECTOR_LENGTH);

      for (k = 0; k < pixels_per_channel; ++k) {
         channel[k] = src_aos[j + 4 * k];
      }

      aos_channels[j] = lp_build_concat(gallivm, channel, aos_channel_type, pixels_per_channel);
   }

   lp_build_transpose_aos(gallivm, soa_type, aos_channels, dst_soa);
}


void
lp_build_format_swizzle_soa(const struct util_format_description *format_desc,
                            struct lp_build_context *bld,
                            const LLVMValueRef *unswizzled,
                            LLVMValueRef swizzled_out[4])
{
   if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) {
      enum pipe_swizzle swizzle;
      LLVMValueRef depth_or_stencil;

      if (util_format_has_stencil(format_desc) &&
          !util_format_has_depth(format_desc)) {
         assert(!bld->type.floating);
         swizzle = format_desc->swizzle[1];
      }
      else {
         assert(bld->type.floating);
         swizzle = format_desc->swizzle[0];
      }
      /*
       * Return zzz1 or sss1 for depth-stencil formats here.
       * Correct swizzling will be handled by apply_sampler_swizzle() later.
       */
      depth_or_stencil = lp_build_swizzle_soa_channel(bld, unswizzled, swizzle);

      swizzled_out[2] = swizzled_out[1] = swizzled_out[0] = depth_or_stencil;
      swizzled_out[3] = bld->one;
   }
   else {
      unsigned chan;
      for (chan = 0; chan < 4; ++chan) {
         enum pipe_swizzle swizzle = format_desc->swizzle[chan];
         swizzled_out[chan] = lp_build_swizzle_soa_channel(bld, unswizzled, swizzle);
      }
   }
}



static LLVMValueRef
lp_build_extract_soa_chan(struct lp_build_context *bld,
                          unsigned blockbits,
                          boolean srgb_chan,
                          struct util_format_channel_description chan_desc,
                          LLVMValueRef packed)
{
   struct gallivm_state *gallivm = bld->gallivm;
   LLVMBuilderRef builder = gallivm->builder;
   struct lp_type type = bld->type;
   LLVMValueRef input = packed;
   const unsigned width = chan_desc.size;
   const unsigned start = chan_desc.shift;
   const unsigned stop = start + width;
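
   /*
    * Worked example (hypothetical): a 6-bit unorm channel at bit 5 of a
    * 16-bit block, like the green channel of B5G6R5_UNORM: start == 5,
    * width == 6, stop == 11, so the UNSIGNED path below shifts right by 5,
    * masks with 0x3f, then maps [0,63] onto [0.0,1.0] with
    * lp_build_unsigned_norm_to_float().
    */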

   /* Decode the input vector component */

   switch(chan_desc.type) {
   case UTIL_FORMAT_TYPE_VOID:
      input = bld->undef;
      break;

   case UTIL_FORMAT_TYPE_UNSIGNED:
      /*
       * Align the LSB
       */
      if (start) {
         input = LLVMBuildLShr(builder, input,
                               lp_build_const_int_vec(gallivm, type, start), "");
      }

      /*
       * Zero the MSBs
       */
      if (stop < blockbits) {
         unsigned mask = ((unsigned long long)1 << width) - 1;
         input = LLVMBuildAnd(builder, input,
                              lp_build_const_int_vec(gallivm, type, mask), "");
      }

      /*
       * Type conversion
       */
      if (type.floating) {
         if (srgb_chan) {
            struct lp_type conv_type = lp_uint_type(type);
            input = lp_build_srgb_to_linear(gallivm, conv_type, width, input);
         }
         else {
            if (chan_desc.normalized)
               input = lp_build_unsigned_norm_to_float(gallivm, width, type, input);
            else
               input = LLVMBuildUIToFP(builder, input, bld->vec_type, "");
         }
      }
      else if (chan_desc.pure_integer) {
         /* Nothing to do */
      } else {
         /* FIXME */
         assert(0);
      }
      break;

   case UTIL_FORMAT_TYPE_SIGNED:
      /*
       * Align the sign bit first.
       */
      if (stop < type.width) {
         unsigned bits = type.width - stop;
         LLVMValueRef bits_val = lp_build_const_int_vec(gallivm, type, bits);
         input = LLVMBuildShl(builder, input, bits_val, "");
      }

      /*
       * Align the LSB (with an arithmetic shift to preserve the sign)
       */
      if (chan_desc.size < type.width) {
         unsigned bits = type.width - chan_desc.size;
         LLVMValueRef bits_val = lp_build_const_int_vec(gallivm, type, bits);
         input = LLVMBuildAShr(builder, input, bits_val, "");
      }
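
      /*
       * Example (hypothetical): a 10-bit signed channel at shift 10 of a
       * 32-bit block, like the green channel of a 10_10_10_2 format:
       * stop == 20, so the left shift by 12 moves the channel's sign bit
       * into bit 31, and the arithmetic right shift by 22 then yields the
       * sign-extended value aligned at bit 0.
       */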

      /*
       * Type conversion
       */
      if (type.floating) {
         input = LLVMBuildSIToFP(builder, input, bld->vec_type, "");
         if (chan_desc.normalized) {
            double scale = 1.0 / ((1 << (chan_desc.size - 1)) - 1);
            LLVMValueRef scale_val = lp_build_const_vec(gallivm, type, scale);
            input = LLVMBuildFMul(builder, input, scale_val, "");
            /*
             * The formula above will produce a value below -1.0 for the
             * most negative input (e.g. -512 * 1/511 for a 10-bit snorm
             * channel); compliance requires clamping it. See
             * GTF-GL45.gtf33.GL3Tests.vertex_type_2_10_10_10_rev.vertex_type_2_10_10_10_rev_conversion.
             */
            input = lp_build_max(bld, input,
                                 lp_build_const_vec(gallivm, type, -1.0f));
         }
      }
      else if (chan_desc.pure_integer) {
         /* Nothing to do */
      } else {
         /* FIXME */
         assert(0);
      }
      break;

   case UTIL_FORMAT_TYPE_FLOAT:
      if (type.floating) {
         if (chan_desc.size == 16) {
            struct lp_type f16i_type = type;
            f16i_type.width /= 2;
            f16i_type.floating = 0;
            if (start) {
               input = LLVMBuildLShr(builder, input,
                                     lp_build_const_int_vec(gallivm, type, start), "");
            }
            input = LLVMBuildTrunc(builder, input,
                                   lp_build_vec_type(gallivm, f16i_type), "");
            input = lp_build_half_to_float(gallivm, input);
         } else {
            assert(start == 0);
            assert(stop == 32);
            assert(type.width == 32);
         }
         input = LLVMBuildBitCast(builder, input, bld->vec_type, "");
      }
      else {
         /* FIXME */
         assert(0);
         input = bld->undef;
      }
      break;

   case UTIL_FORMAT_TYPE_FIXED:
      if (type.floating) {
         double scale = 1.0 / ((1 << (chan_desc.size/2)) - 1);
         LLVMValueRef scale_val = lp_build_const_vec(gallivm, type, scale);
         input = LLVMBuildSIToFP(builder, input, bld->vec_type, "");
         input = LLVMBuildFMul(builder, input, scale_val, "");
      }
      else {
         /* FIXME */
         assert(0);
         input = bld->undef;
      }
      break;

   default:
      assert(0);
      input = bld->undef;
      break;
   }

   return input;
}


/**
 * Unpack several pixels in SoA.
 *
 * It takes a vector of packed pixels:
 *
 *   packed = {P0, P1, P2, P3, ..., Pn}
 *
 * And will produce four vectors:
 *
 *   red    = {R0, R1, R2, R3, ..., Rn}
 *   green  = {G0, G1, G2, G3, ..., Gn}
 *   blue   = {B0, B1, B2, B3, ..., Bn}
 *   alpha  = {A0, A1, A2, A3, ..., An}
 *
 * It requires that a packed pixel fits into an element of the output
 * channels. The common case is converting pixels with a depth of 32 bits
 * or less into floats.
 *
 * \param format_desc  the format of the 'packed' incoming pixel vector
 * \param type         the desired type for rgba_out (type.length = n, above)
 * \param packed       the incoming vector of packed pixels
 * \param rgba_out     returns the SoA R,G,B,A vectors
 */
void
lp_build_unpack_rgba_soa(struct gallivm_state *gallivm,
                         const struct util_format_description *format_desc,
                         struct lp_type type,
                         LLVMValueRef packed,
                         LLVMValueRef rgba_out[4])
{
   struct lp_build_context bld;
   LLVMValueRef inputs[4];
   unsigned chan;

   assert(format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN);
   assert(format_desc->block.width == 1);
   assert(format_desc->block.height == 1);
   assert(format_desc->block.bits <= type.width);
   /* FIXME: Support more output types */
   assert(type.width == 32);

   lp_build_context_init(&bld, gallivm, type);

   /* Decode the input vector components */
   for (chan = 0; chan < format_desc->nr_channels; ++chan) {
      struct util_format_channel_description chan_desc = format_desc->channel[chan];
      boolean srgb_chan = FALSE;

      if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB &&
          format_desc->swizzle[3] != chan) {
         srgb_chan = TRUE;
      }

      inputs[chan] = lp_build_extract_soa_chan(&bld,
                                               format_desc->block.bits,
                                               srgb_chan,
                                               chan_desc,
                                               packed);
   }

   lp_build_format_swizzle_soa(format_desc, &bld, inputs, rgba_out);
}
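
/*
 * A hypothetical usage sketch: unpacking 8 RGBA8 pixels (one packed pixel
 * per 32-bit vector element) into 8-wide float SoA vectors:
 *
 *    struct lp_type f32x8 = lp_type_float_vec(32, 256);
 *    LLVMValueRef rgba[4];
 *    lp_build_unpack_rgba_soa(gallivm,
 *                             util_format_description(PIPE_FORMAT_R8G8B8A8_UNORM),
 *                             f32x8, packed, rgba);
 */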


/**
 * Convert a vector of rgba8 values into 32bit wide SoA vectors.
 *
 * \param dst_type  The desired return type. For pure integer formats
 *                  this should be a 32bit wide int or uint vector type,
 *                  otherwise a float vector type.
 *
 * \param packed    The packed rgba8 values to unpack.
 *
 * \param rgba      The 4 SoA return vectors.
 */
void
lp_build_rgba8_to_fi32_soa(struct gallivm_state *gallivm,
                           struct lp_type dst_type,
                           LLVMValueRef packed,
                           LLVMValueRef *rgba)
{
   LLVMBuilderRef builder = gallivm->builder;
   LLVMValueRef mask = lp_build_const_int_vec(gallivm, dst_type, 0xff);
   unsigned chan;

   /* XXX technically shouldn't use that for uint dst_type */
   packed = LLVMBuildBitCast(builder, packed,
                             lp_build_int_vec_type(gallivm, dst_type), "");

   /* Decode the input vector components */
   for (chan = 0; chan < 4; ++chan) {
#if UTIL_ARCH_LITTLE_ENDIAN
      unsigned start = chan*8;
#else
      unsigned start = (3-chan)*8;
#endif
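      /* e.g. on little endian, chan 1 (green) is bits 8..15, start == 8 */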
      unsigned stop = start + 8;
      LLVMValueRef input;

      input = packed;

      if (start)
         input = LLVMBuildLShr(builder, input,
                               lp_build_const_int_vec(gallivm, dst_type, start), "");

      if (stop < 32)
         input = LLVMBuildAnd(builder, input, mask, "");

      if (dst_type.floating)
         input = lp_build_unsigned_norm_to_float(gallivm, 8, dst_type, input);

      rgba[chan] = input;
   }
}



/**
 * Fetch texels from a texture, returning them in SoA layout.
 *
 * \param type  the desired return type for 'rgba_out'. The vector length
 *              is the number of texels to fetch
 * \param aligned  if the offset is guaranteed to be aligned to element width
 *
 * \param base_ptr  points to the base of the texture mip tree.
 * \param offset    offset to start of the texture image block. For non-
 *                  compressed formats, this simply is an offset to the texel.
 *                  For compressed formats, it is an offset to the start of the
 *                  compressed data block.
 *
 * \param i, j  the sub-block pixel coordinates. For non-compressed formats
 *              these will always be (0,0). For compressed formats, i will
 *              be in [0, block_width-1] and j will be in [0, block_height-1].
 * \param cache  optional value pointing to a lp_build_format_cache structure
 */
void
lp_build_fetch_rgba_soa(struct gallivm_state *gallivm,
                        const struct util_format_description *format_desc,
                        struct lp_type type,
                        boolean aligned,
                        LLVMValueRef base_ptr,
                        LLVMValueRef offset,
                        LLVMValueRef i,
                        LLVMValueRef j,
                        LLVMValueRef cache,
                        LLVMValueRef rgba_out[4])
{
   LLVMBuilderRef builder = gallivm->builder;
   enum pipe_format format = format_desc->format;
   struct lp_type fetch_type;

   if (format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN &&
       (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB ||
        format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB ||
        format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) &&
       format_desc->block.width == 1 &&
       format_desc->block.height == 1 &&
       format_desc->block.bits <= type.width &&
       (format_desc->channel[0].type != UTIL_FORMAT_TYPE_FLOAT ||
        format_desc->channel[0].size == 32 ||
        format_desc->channel[0].size == 16))
   {
      /*
       * The packed pixel fits into an element of the destination format. Put
       * the packed pixels into a vector and extract each component for all
       * vector elements in parallel.
       */

      LLVMValueRef packed;

      /*
       * gather the texels from the texture
       * Ex: packed = {XYZW, XYZW, XYZW, XYZW}
       */
      assert(format_desc->block.bits <= type.width);
      fetch_type = lp_type_uint(type.width);
      packed = lp_build_gather(gallivm,
                               type.length,
                               format_desc->block.bits,
                               fetch_type,
                               aligned,
                               base_ptr, offset, FALSE);

      /*
       * convert texels to float rgba
       */
      lp_build_unpack_rgba_soa(gallivm,
                               format_desc,
                               type,
                               packed, rgba_out);
      return;
   }


   if (format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN &&
       (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB) &&
       format_desc->block.width == 1 &&
       format_desc->block.height == 1 &&
       format_desc->block.bits > type.width &&
       ((format_desc->block.bits <= type.width * type.length &&
         format_desc->channel[0].size <= type.width) ||
        (format_desc->channel[0].size == 64 &&
         format_desc->channel[0].type == UTIL_FORMAT_TYPE_FLOAT &&
         type.floating)))
   {
      /*
       * Similar to above, but the packed pixel is larger than what fits
       * into an element of the destination format. The packed pixels will be
       * shuffled into SoA vectors appropriately, and then the extraction will
       * be done in parallel as much as possible.
       * Good for 16xn (n > 2) and 32xn (n > 1) formats, care is taken so
       * the gathered vectors can be shuffled easily (even with avx).
       * 64xn float -> 32xn float is handled too but it's a bit special as
       * it does the conversion pre-shuffle.
       */

      LLVMValueRef packed[4], dst[4], output[4], shuffles[LP_MAX_VECTOR_WIDTH/32];
      struct lp_type fetch_type, gather_type = type;
      unsigned num_gather, fetch_width, i, j;
      struct lp_build_context bld;
      boolean fp64 = format_desc->channel[0].size == 64;

      lp_build_context_init(&bld, gallivm, type);

      assert(type.width == 32);
      assert(format_desc->block.bits > type.width);

      /*
       * First, figure out fetch order.
       */
      fetch_width = util_next_power_of_two(format_desc->block.bits);
      /*
       * fp64 are treated like fp32 except we fetch twice wide values
       * (as we shuffle after trunc). The shuffles for that work out
       * mostly fine (slightly suboptimal for 4-wide, perfect for AVX)
       * albeit we miss the potential opportunity for hw gather (as it
       * only handles native size).
       */
      num_gather = fetch_width / type.width;
      gather_type.width *= num_gather;
      if (fp64) {
         num_gather /= 2;
      }
      gather_type.length /= num_gather;

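      /*
       * Example (hypothetical): PIPE_FORMAT_R32G32B32A32_FLOAT fetched into
       * an 8-wide 32-bit type: block.bits == 128, so fetch_width == 128 and
       * num_gather == 4; gather_type becomes 2 elements of 128 bits each,
       * i.e. each of the 4 gathers below loads 2 whole texels.
       */
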
      for (i = 0; i < num_gather; i++) {
         LLVMValueRef offsetr, shuf_vec;
         if (num_gather == 4) {
            for (j = 0; j < gather_type.length; j++) {
               unsigned idx = i + 4*j;
               shuffles[j] = lp_build_const_int32(gallivm, idx);
            }
            shuf_vec = LLVMConstVector(shuffles, gather_type.length);
            offsetr = LLVMBuildShuffleVector(builder, offset, offset, shuf_vec, "");
         }
         else if (num_gather == 2) {
            for (j = 0; j < gather_type.length; j++) {
               unsigned idx = i*2 + (j%2) + (j/2)*4;
               shuffles[j] = lp_build_const_int32(gallivm, idx);
            }
            shuf_vec = LLVMConstVector(shuffles, gather_type.length);
            offsetr = LLVMBuildShuffleVector(builder, offset, offset, shuf_vec, "");
         }
         else {
            assert(num_gather == 1);
            offsetr = offset;
         }
         if (gather_type.length == 1) {
            LLVMValueRef zero = lp_build_const_int32(gallivm, 0);
            offsetr = LLVMBuildExtractElement(builder, offsetr, zero, "");
         }

         /*
          * Determine whether to use float or int loads. This is mostly
          * to outsmart the (stupid) llvm int/float shuffle logic; we
          * don't really care much whether the data is floats or ints...
          * But llvm will refuse to use a single float shuffle with int data
          * and emit 3 int shuffles instead, which looks atrocious.
          * (Note bitcasts often won't help, as llvm is too smart to be
          * fooled by that.)
          * Nobody cares about simd float<->int domain transition penalties,
          * which usually don't even exist for shuffles anyway.
          * With a 4x32bit (and 3x32bit) fetch, we use a float vec (the data
          * goes into a transpose, which is unpacks, so it doesn't really
          * matter much).
          * With a 2x32bit or 4x16bit fetch, we use a float vec, since those
          * go into the weird channel separation shuffle. With floats,
          * this is (with 128bit vectors):
          * - 2 movq, 2 movhpd, 2 shufps
          * With ints it would be:
          * - 4 movq, 2 punpcklqdq, 4 pshufd, 2 blendw
          * I've seen texture functions increase in code size by 15% just due
          * to that (there's lots of such fetches in them...)
          * (We could choose a different gather order to improve this somewhat
          * for the int path, but it would basically just drop the blends,
          * so the float path with this order really is optimal.)
          * It is tricky though: sometimes llvm doesn't ignore the float->int
          * casts, so they must be avoided until we're done with the float
          * shuffles...
          * 3x16bit formats (the same is true for 3x8) are pretty bad, but
          * there's nothing we can do about them (we could overallocate by
          * those couple of bytes and use an unaligned but pot-sized load).
          * Note that this is very much x86 specific. I don't know whether it
          * affects other archs at all.
          */
         if (num_gather > 1) {
            /*
             * We always want some float type here (with x86)
             * due to shuffles being float ones afterwards (albeit for
             * the num_gather == 4 case int should work fine too
             * (unless there's some problems with avx but not avx2).
             */
            if (format_desc->channel[0].size == 64) {
               fetch_type = lp_type_float_vec(64, gather_type.width);
            } else {
               fetch_type = lp_type_int_vec(32, gather_type.width);
            }
         }
         else {
            /* type doesn't matter much */
            if (format_desc->channel[0].type == UTIL_FORMAT_TYPE_FLOAT &&
                (format_desc->channel[0].size == 32 ||
                 format_desc->channel[0].size == 64)) {
               fetch_type = lp_type_float(gather_type.width);
            } else {
               fetch_type = lp_type_uint(gather_type.width);
            }
         }

         /* Now finally gather the values */
         packed[i] = lp_build_gather(gallivm, gather_type.length,
                                     format_desc->block.bits,
                                     fetch_type, aligned,
                                     base_ptr, offsetr, FALSE);
         if (fp64) {
            struct lp_type conv_type = type;
            conv_type.width *= 2;
            packed[i] = LLVMBuildBitCast(builder, packed[i],
                                         lp_build_vec_type(gallivm, conv_type), "");
            packed[i] = LLVMBuildFPTrunc(builder, packed[i], bld.vec_type, "");
         }
      }

      /* shuffle the gathered values to SoA */
      if (num_gather == 2) {
         for (i = 0; i < num_gather; i++) {
            for (j = 0; j < type.length; j++) {
               unsigned idx = (j%2)*2 + (j/4)*4 + i;
               if ((j/2)%2)
                  idx += type.length;
               shuffles[j] = lp_build_const_int32(gallivm, idx);
            }
            dst[i] = LLVMBuildShuffleVector(builder, packed[0], packed[1],
                                            LLVMConstVector(shuffles, type.length), "");
         }
      }
      else if (num_gather == 4) {
         lp_build_transpose_aos(gallivm, lp_int_type(type), packed, dst);
      }
      else {
         assert(num_gather == 1);
         dst[0] = packed[0];
      }
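
      /*
       * At this point dst[n] should hold bits [32*n, 32*n + 32) of every
       * gathered pixel, in pixel order; the per-channel extraction below
       * relies on this when it picks dst[vec_nr].
       */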

      /*
       * And finally unpack exactly as above, except that
       * chan shift is adjusted and the right vector selected.
       */
      if (!fp64) {
         for (i = 0; i < num_gather; i++) {
            dst[i] = LLVMBuildBitCast(builder, dst[i], bld.int_vec_type, "");
         }
         for (i = 0; i < format_desc->nr_channels; i++) {
            struct util_format_channel_description chan_desc = format_desc->channel[i];
            unsigned blockbits = type.width;
            unsigned vec_nr;

#if UTIL_ARCH_BIG_ENDIAN
            vec_nr = (format_desc->block.bits - (chan_desc.shift + chan_desc.size)) / type.width;
#else
            vec_nr = chan_desc.shift / type.width;
#endif
            chan_desc.shift %= type.width;

            output[i] = lp_build_extract_soa_chan(&bld,
                                                  blockbits,
                                                  FALSE,
                                                  chan_desc,
                                                  dst[vec_nr]);
         }
      }
      else {
         for (i = 0; i < format_desc->nr_channels; i++) {
            output[i] = dst[i];
         }
      }

      lp_build_format_swizzle_soa(format_desc, &bld, output, rgba_out);
      return;
   }

   if (format == PIPE_FORMAT_R11G11B10_FLOAT ||
       format == PIPE_FORMAT_R9G9B9E5_FLOAT) {
      /*
       * similar conceptually to above but requiring special
       * AoS packed -> SoA float conversion code.
       */
      LLVMValueRef packed;
      struct lp_type fetch_type = lp_type_uint(type.width);

      assert(type.floating);
      assert(type.width == 32);

      packed = lp_build_gather(gallivm, type.length,
                               format_desc->block.bits,
                               fetch_type, aligned,
                               base_ptr, offset, FALSE);
      if (format == PIPE_FORMAT_R11G11B10_FLOAT) {
         lp_build_r11g11b10_to_float(gallivm, packed, rgba_out);
      }
      else {
         lp_build_rgb9e5_to_float(gallivm, packed, rgba_out);
      }
      return;
   }

   if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS &&
       format_desc->block.bits == 64) {
      /*
       * Special case: the format is 64 bits per block, but we only need
       * 32 bits (or 8 bits) from each block.
       */
      LLVMValueRef packed;
      struct lp_type fetch_type = lp_type_uint(type.width);

      if (format == PIPE_FORMAT_X32_S8X24_UINT) {
         /*
          * for stencil simply fix up offsets - could in fact change
          * base_ptr instead even outside the shader.
          */
         unsigned mask = (1 << 8) - 1;
         LLVMValueRef s_offset = lp_build_const_int_vec(gallivm, type, 4);
         offset = LLVMBuildAdd(builder, offset, s_offset, "");
         packed = lp_build_gather(gallivm, type.length, 32, fetch_type,
                                  aligned, base_ptr, offset, FALSE);
         packed = LLVMBuildAnd(builder, packed,
                               lp_build_const_int_vec(gallivm, type, mask), "");
      }
      else {
         assert(format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT);
         packed = lp_build_gather(gallivm, type.length, 32, fetch_type,
                                  aligned, base_ptr, offset, TRUE);
         packed = LLVMBuildBitCast(builder, packed,
                                   lp_build_vec_type(gallivm, type), "");
      }
      /* for consistency with lp_build_unpack_rgba_soa() return sss1 or zzz1 */
      rgba_out[0] = rgba_out[1] = rgba_out[2] = packed;
      rgba_out[3] = lp_build_const_vec(gallivm, type, 1.0f);
      return;
   }

   /*
    * Try calling lp_build_fetch_rgba_aos for all pixels at once.
    * This should only really be hit for subsampled and compressed formats
    * (s3tc, including its srgb variants, and rgtc).
    * (This is invalid for plain 8unorm formats because we're lazy with
    * the swizzle, since some results would arrive swizzled and some not.)
    */

   if ((format_desc->layout != UTIL_FORMAT_LAYOUT_PLAIN) &&
       (util_format_fits_8unorm(format_desc) ||
        format_desc->layout == UTIL_FORMAT_LAYOUT_RGTC ||
        format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC) &&
       type.floating && type.width == 32 &&
       (type.length == 1 || (type.length % 4 == 0))) {
      struct lp_type tmp_type;
      struct lp_build_context bld;
      LLVMValueRef packed, rgba[4];
      const struct util_format_description *flinear_desc;
      const struct util_format_description *frgba8_desc;
      unsigned chan;
      bool is_signed = (format_desc->format == PIPE_FORMAT_RGTC1_SNORM ||
                        format_desc->format == PIPE_FORMAT_RGTC2_SNORM ||
                        format_desc->format == PIPE_FORMAT_LATC1_SNORM ||
                        format_desc->format == PIPE_FORMAT_LATC2_SNORM);

      lp_build_context_init(&bld, gallivm, type);

      /*
       * Make sure the conversion in aos really only does convert to rgba8
       * and not anything more (so use linear format, adjust type).
       */
      flinear_desc = util_format_description(util_format_linear(format));
      memset(&tmp_type, 0, sizeof tmp_type);
      tmp_type.width = 8;
      tmp_type.length = type.length * 4;
      tmp_type.norm = TRUE;
      tmp_type.sign = is_signed;
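
      /*
       * E.g. for type.length == 8 this asks the AoS path for a 32 x i8
       * vector, i.e. 8 packed RGBA8 (or snorm) pixels, which the SoA
       * unpack below then decodes.
       */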

      packed = lp_build_fetch_rgba_aos(gallivm, flinear_desc, tmp_type,
                                       aligned, base_ptr, offset, i, j, cache);
      packed = LLVMBuildBitCast(builder, packed, bld.int_vec_type, "");

      /*
       * The values are now packed so they match ordinary (srgb) RGBA8 format,
       * hence need to use matching format for unpack.
       */
      frgba8_desc = util_format_description(is_signed ? PIPE_FORMAT_R8G8B8A8_SNORM : PIPE_FORMAT_R8G8B8A8_UNORM);
      if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB) {
         assert(format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC);
         frgba8_desc = util_format_description(PIPE_FORMAT_R8G8B8A8_SRGB);
      }
      lp_build_unpack_rgba_soa(gallivm,
                               frgba8_desc,
                               type,
                               packed, rgba);

      /*
       * We converted 4 channels. Make sure llvm can drop unneeded ones
       * (luckily the rgba order is fixed, only LA needs special case).
       */
      for (chan = 0; chan < 4; chan++) {
         enum pipe_swizzle swizzle = format_desc->swizzle[chan];
         if (chan == 3 && util_format_is_luminance_alpha(format)) {
            swizzle = PIPE_SWIZZLE_W;
         }
         rgba_out[chan] = lp_build_swizzle_soa_channel(&bld, rgba, swizzle);
      }
      return;
   }


   /*
    * Fallback to calling lp_build_fetch_rgba_aos for each pixel.
    *
    * This is not the most efficient way of fetching pixels, as we
    * miss some opportunities to do vectorization, but this is
    * convenient for formats or scenarios for which there was no
    * opportunity or incentive to optimize.
    *
    * We do NOT want to end up here, this typically is quite terrible,
    * in particular if the formats have less than 4 channels.
    *
    * Right now, this should only be hit for:
    * - ETC formats
    *   (those miss fast fetch functions hence they are terrible anyway)
    */

   {
      unsigned k;
      struct lp_type tmp_type;
      LLVMValueRef aos_fetch[LP_MAX_VECTOR_WIDTH / 32];

      if (gallivm_debug & GALLIVM_DEBUG_PERF) {
         debug_printf("%s: AoS fetch fallback for %s\n",
                      __FUNCTION__, format_desc->short_name);
      }

      tmp_type = type;
      tmp_type.length = 4;

      if (type.length == 1) {
         LLVMValueRef fetch = lp_build_fetch_rgba_aos(gallivm, format_desc, tmp_type,
                                                      aligned, base_ptr, offset,
                                                      i, j, cache);

         for (k = 0; k < 4; k++)
            rgba_out[k] = LLVMBuildExtractElement(gallivm->builder, fetch, lp_build_const_int32(gallivm, k), "");
         return;
      }

      /*
       * Note that vector transpose can be worse compared to insert/extract
       * for aos->soa conversion (for formats with 1 or 2 channels). However,
       * we should try to avoid getting here for just about all formats, so
       * don't bother.
       */

      /* loop over number of pixels */
      for (k = 0; k < type.length; ++k) {
         LLVMValueRef index = lp_build_const_int32(gallivm, k);
         LLVMValueRef offset_elem;
         LLVMValueRef i_elem, j_elem;

         offset_elem = LLVMBuildExtractElement(builder, offset,
                                               index, "");

         i_elem = LLVMBuildExtractElement(builder, i, index, "");
         j_elem = LLVMBuildExtractElement(builder, j, index, "");

         /* Get a single float[4]={R,G,B,A} pixel */
         aos_fetch[k] = lp_build_fetch_rgba_aos(gallivm, format_desc, tmp_type,
                                                aligned, base_ptr, offset_elem,
                                                i_elem, j_elem, cache);
      }
      convert_to_soa(gallivm, aos_fetch, rgba_out, type);
   }
}

static void
lp_build_insert_soa_chan(struct lp_build_context *bld,
                         unsigned blockbits,
                         struct util_format_channel_description chan_desc,
                         LLVMValueRef *output,
                         LLVMValueRef rgba)
{
   struct gallivm_state *gallivm = bld->gallivm;
   LLVMBuilderRef builder = gallivm->builder;
   struct lp_type type = bld->type;
   const unsigned width = chan_desc.size;
   const unsigned start = chan_desc.shift;
   const uint32_t chan_mask = (1ULL << width) - 1;
   ASSERTED const unsigned stop = start + width;
   LLVMValueRef chan = NULL;
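
   /*
    * Sketch (hypothetical values): packing a float into a 10-bit unorm
    * channel at shift 20 means clamp to [0,1], scale by 1023 (rounding),
    * shift left by 20, and OR the result into the block accumulated in
    * *output.
    */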
   switch(chan_desc.type) {
   case UTIL_FORMAT_TYPE_UNSIGNED:

      if (chan_desc.pure_integer) {
         chan = LLVMBuildBitCast(builder, rgba, bld->int_vec_type, "");
         LLVMValueRef mask_val = lp_build_const_int_vec(gallivm, type, chan_mask);
         LLVMValueRef mask = LLVMBuildICmp(builder, LLVMIntUGT, chan, mask_val, "");
         chan = LLVMBuildSelect(builder, mask, mask_val, chan, "");
      }
      else if (type.floating) {
         if (chan_desc.normalized) {
            rgba = lp_build_clamp(bld, rgba, bld->zero, bld->one);
            chan = lp_build_clamped_float_to_unsigned_norm(gallivm, type, width, rgba);
         } else
            /* FPToSI must produce an integer vector type */
            chan = LLVMBuildFPToSI(builder, rgba, bld->int_vec_type, "");
      }
      if (start)
         chan = LLVMBuildShl(builder, chan,
                             lp_build_const_int_vec(gallivm, type, start), "");
      if (!*output)
         *output = chan;
      else
         *output = LLVMBuildOr(builder, *output, chan, "");
      break;
   case UTIL_FORMAT_TYPE_SIGNED:
      if (chan_desc.pure_integer) {
         chan = LLVMBuildBitCast(builder, rgba, bld->int_vec_type, "");
         chan = LLVMBuildAnd(builder, chan, lp_build_const_int_vec(gallivm, type, chan_mask), "");
      } else if (type.floating) {
         if (chan_desc.normalized) {
            char intrin[32];
            double scale = ((1 << (chan_desc.size - 1)) - 1);
            LLVMValueRef scale_val = lp_build_const_vec(gallivm, type, scale);
            rgba = lp_build_clamp(bld, rgba, lp_build_negate(bld, bld->one), bld->one);
            rgba = LLVMBuildFMul(builder, rgba, scale_val, "");
            lp_format_intrinsic(intrin, sizeof intrin, "llvm.rint", bld->vec_type);
            rgba = lp_build_intrinsic_unary(builder, intrin, bld->vec_type, rgba);
         }
         chan = LLVMBuildFPToSI(builder, rgba, bld->int_vec_type, "");
         chan = LLVMBuildAnd(builder, chan, lp_build_const_int_vec(gallivm, type, chan_mask), "");
      }
      if (start)
         chan = LLVMBuildShl(builder, chan,
                             lp_build_const_int_vec(gallivm, type, start), "");
      if (!*output)
         *output = chan;
      else
         *output = LLVMBuildOr(builder, *output, chan, "");
      break;
   case UTIL_FORMAT_TYPE_FLOAT:
      if (type.floating) {
         if (chan_desc.size == 16) {
            chan = lp_build_float_to_half(gallivm, rgba);
            chan = LLVMBuildZExt(builder, chan, bld->int_vec_type, "");
            if (start)
               chan = LLVMBuildShl(builder, chan,
                                   lp_build_const_int_vec(gallivm, type, start), "");
            if (!*output)
               *output = chan;
            else
               *output = LLVMBuildOr(builder, *output, chan, "");
         } else {
            assert(start == 0);
            assert(stop == 32);
            assert(type.width == 32);
            *output = LLVMBuildBitCast(builder, rgba, bld->int_vec_type, "");
         }
      } else
         assert(0);
      break;
   default:
      assert(0);
      *output = bld->undef;
   }
}

static void
lp_build_pack_rgba_soa(struct gallivm_state *gallivm,
                       const struct util_format_description *format_desc,
                       struct lp_type type,
                       const LLVMValueRef rgba_in[4],
                       LLVMValueRef *packed)
{
   unsigned chan;
   struct lp_build_context bld;
   assert(format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN);
   assert(format_desc->block.width == 1);
   assert(format_desc->block.height == 1);
   assert(format_desc->block.bits <= type.width);
   /* FIXME: Support more output types */
   assert(type.width == 32);

   lp_build_context_init(&bld, gallivm, type);
   for (chan = 0; chan < format_desc->nr_channels; ++chan) {
      struct util_format_channel_description chan_desc = format_desc->channel[chan];

      lp_build_insert_soa_chan(&bld, format_desc->block.bits,
                               chan_desc,
                               packed,
                               rgba_in[chan]);
   }
}

void
lp_build_store_rgba_soa(struct gallivm_state *gallivm,
                        const struct util_format_description *format_desc,
                        struct lp_type type,
                        LLVMValueRef exec_mask,
                        LLVMValueRef base_ptr,
                        LLVMValueRef offset,
                        LLVMValueRef out_of_bounds,
                        const LLVMValueRef rgba_in[4])
{
   enum pipe_format format = format_desc->format;
   LLVMValueRef packed[4];
   unsigned num_stores = 0;

   memset(packed, 0, sizeof(LLVMValueRef) * 4);
   if (format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN &&
       format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB &&
       format_desc->block.width == 1 &&
       format_desc->block.height == 1 &&
       format_desc->block.bits <= type.width &&
       (format_desc->channel[0].type != UTIL_FORMAT_TYPE_FLOAT ||
        format_desc->channel[0].size == 32 ||
        format_desc->channel[0].size == 16))
   {
      lp_build_pack_rgba_soa(gallivm, format_desc, type, rgba_in, &packed[0]);

      num_stores = 1;
   } else if (format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN &&
              (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB) &&
              format_desc->block.width == 1 &&
              format_desc->block.height == 1 &&
              format_desc->block.bits > type.width &&
              ((format_desc->block.bits <= type.width * type.length &&
                format_desc->channel[0].size <= type.width) ||
               (format_desc->channel[0].size == 64 &&
                format_desc->channel[0].type == UTIL_FORMAT_TYPE_FLOAT &&
                type.floating)))
   {
      /*
       * Similar to above, but the packed pixel is larger than what fits
       * into an element of the destination format. Each channel is
       * inserted into its part of the packed pixel in parallel, and the
       * result is written out as several type.width-sized words.
       * Good for 16xn (n > 2) and 32xn (n > 1) formats.
       */
      struct lp_build_context bld;

      lp_build_context_init(&bld, gallivm, type);
      assert(type.width == 32);
      assert(format_desc->block.bits > type.width);

      unsigned store_width = util_next_power_of_two(format_desc->block.bits);
      num_stores = store_width / type.width;
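      /*
       * Example (hypothetical): a 64-bit block such as R16G16B16A16_FLOAT
       * with a 32-bit type gives num_stores == 2; the loop below ORs each
       * channel into packed[chan_desc.shift / 32].
       */
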
      for (unsigned i = 0; i < format_desc->nr_channels; i++) {
         struct util_format_channel_description chan_desc = format_desc->channel[i];
         unsigned blockbits = type.width;
         unsigned vec_nr;

         vec_nr = chan_desc.shift / type.width;
         chan_desc.shift %= type.width;

         lp_build_insert_soa_chan(&bld, blockbits,
                                  chan_desc,
                                  &packed[vec_nr],
                                  rgba_in[i]);
      }

      assert(num_stores == 4 || num_stores == 2);
      /* we can transpose and store at the same time */
   } else if (format == PIPE_FORMAT_R11G11B10_FLOAT) {
      packed[0] = lp_build_float_to_r11g11b10(gallivm, rgba_in);
      num_stores = 1;
   } else
      assert(0);

   assert(exec_mask);

   LLVMTypeRef int32_ptr_type = LLVMPointerType(LLVMInt32TypeInContext(gallivm->context), 0);
   LLVMTypeRef int16_ptr_type = LLVMPointerType(LLVMInt16TypeInContext(gallivm->context), 0);
   LLVMTypeRef int8_ptr_type = LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0);

   LLVMValueRef should_store_mask = LLVMBuildAnd(gallivm->builder, exec_mask, LLVMBuildNot(gallivm->builder, out_of_bounds, ""), "store_mask");
   should_store_mask = LLVMBuildICmp(gallivm->builder, LLVMIntNE, should_store_mask, lp_build_const_int_vec(gallivm, type, 0), "");
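
   /*
    * The masked store is scalarized: loop over the vector lanes and, for
    * each lane whose exec/bounds bit is set, extract that lane's packed
    * word and offset and emit an ordinary scalar store.
    */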
   for (unsigned i = 0; i < num_stores; i++) {
      struct lp_build_loop_state loop_state;

      LLVMValueRef store_offset = LLVMBuildAdd(gallivm->builder, offset, lp_build_const_int_vec(gallivm, type, i * 4), "");
      store_offset = LLVMBuildGEP(gallivm->builder, base_ptr, &store_offset, 1, "");

      lp_build_loop_begin(&loop_state, gallivm, lp_build_const_int32(gallivm, 0));

      struct lp_build_if_state ifthen;
      LLVMValueRef cond = LLVMBuildExtractElement(gallivm->builder, should_store_mask, loop_state.counter, "");
      lp_build_if(&ifthen, gallivm, cond);

      LLVMValueRef data = LLVMBuildExtractElement(gallivm->builder, packed[i], loop_state.counter, "");
      LLVMValueRef this_offset = LLVMBuildExtractElement(gallivm->builder, store_offset, loop_state.counter, "");

      if (format_desc->block.bits == 8) {
         this_offset = LLVMBuildBitCast(gallivm->builder, this_offset, int8_ptr_type, "");
         data = LLVMBuildTrunc(gallivm->builder, data, LLVMInt8TypeInContext(gallivm->context), "");
      } else if (format_desc->block.bits == 16) {
         this_offset = LLVMBuildBitCast(gallivm->builder, this_offset, int16_ptr_type, "");
         data = LLVMBuildTrunc(gallivm->builder, data, LLVMInt16TypeInContext(gallivm->context), "");
      } else
         this_offset = LLVMBuildBitCast(gallivm->builder, this_offset, int32_ptr_type, "");
      LLVMBuildStore(gallivm->builder, data, this_offset);
      lp_build_endif(&ifthen);
      lp_build_loop_end_cond(&loop_state, lp_build_const_int32(gallivm, type.length),
                             NULL, LLVMIntUGE);
   }
}