1 /**************************************************************************
2 *
3 * Copyright 2010 VMware, Inc.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
17 * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
18 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
19 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
20 * USE OR OTHER DEALINGS IN THE SOFTWARE.
21 *
22 * The above copyright notice and this permission notice (including the
23 * next paragraph) shall be included in all copies or substantial portions
24 * of the Software.
25 *
26 **************************************************************************/
27
28
29 #include "util/u_debug.h"
30 #include "util/u_cpu_detect.h"
31 #include "util/u_math.h"
32 #include "lp_bld_debug.h"
33 #include "lp_bld_const.h"
34 #include "lp_bld_format.h"
35 #include "lp_bld_gather.h"
36 #include "lp_bld_swizzle.h"
37 #include "lp_bld_type.h"
38 #include "lp_bld_init.h"
39 #include "lp_bld_intr.h"
40 #include "lp_bld_pack.h"
41
42
43 /**
44 * Get the pointer to one element from scatter positions in memory.
45 *
46 * @sa lp_build_gather()
47 */
48 LLVMValueRef
lp_build_gather_elem_ptr(struct gallivm_state * gallivm,unsigned length,LLVMValueRef base_ptr,LLVMValueRef offsets,unsigned i)49 lp_build_gather_elem_ptr(struct gallivm_state *gallivm,
50 unsigned length,
51 LLVMValueRef base_ptr,
52 LLVMValueRef offsets,
53 unsigned i)
54 {
55 LLVMValueRef offset;
56 LLVMValueRef ptr;
57
58 assert(LLVMTypeOf(base_ptr) == LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0));
59
60 if (length == 1) {
61 assert(i == 0);
62 offset = offsets;
63 } else {
64 LLVMValueRef index = lp_build_const_int32(gallivm, i);
65 offset = LLVMBuildExtractElement(gallivm->builder, offsets, index, "");
66 }
67
68 ptr = LLVMBuildGEP(gallivm->builder, base_ptr, &offset, 1, "");
69
70 return ptr;
71 }
72
73
74 /**
75 * Gather one element from scatter positions in memory.
76 *
77 * @sa lp_build_gather()
78 */
79 LLVMValueRef
lp_build_gather_elem(struct gallivm_state * gallivm,unsigned length,unsigned src_width,unsigned dst_width,boolean aligned,LLVMValueRef base_ptr,LLVMValueRef offsets,unsigned i,boolean vector_justify)80 lp_build_gather_elem(struct gallivm_state *gallivm,
81 unsigned length,
82 unsigned src_width,
83 unsigned dst_width,
84 boolean aligned,
85 LLVMValueRef base_ptr,
86 LLVMValueRef offsets,
87 unsigned i,
88 boolean vector_justify)
89 {
90 LLVMTypeRef src_type = LLVMIntTypeInContext(gallivm->context, src_width);
91 LLVMTypeRef src_ptr_type = LLVMPointerType(src_type, 0);
92 LLVMTypeRef dst_elem_type = LLVMIntTypeInContext(gallivm->context, dst_width);
93 LLVMValueRef ptr;
94 LLVMValueRef res;
95
96 assert(LLVMTypeOf(base_ptr) == LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0));
97
98 ptr = lp_build_gather_elem_ptr(gallivm, length, base_ptr, offsets, i);
99 ptr = LLVMBuildBitCast(gallivm->builder, ptr, src_ptr_type, "");
100 res = LLVMBuildLoad(gallivm->builder, ptr, "");
101
102 /* XXX
103 * On some archs we probably really want to avoid having to deal
104 * with alignments lower than 4 bytes (if fetch size is a power of
105 * two >= 32). On x86 it doesn't matter, however.
106 * We should be able to guarantee full alignment for any kind of texture
107 * fetch (except ARB_texture_buffer_range, oops), but not vertex fetch
108 * (there's PIPE_CAP_VERTEX_BUFFER_OFFSET_4BYTE_ALIGNED_ONLY and friends
109 * but I don't think that's quite what we wanted).
110 * For ARB_texture_buffer_range, PIPE_CAP_TEXTURE_BUFFER_OFFSET_ALIGNMENT
111 * looks like a good fit, but it seems this cap bit (and OpenGL) aren't
112 * enforcing what we want (which is what d3d10 does, the offset needs to
113 * be aligned to element size, but GL has bytes regardless of element
114 * size which would only leave us with minimum alignment restriction of 16
115 * which doesn't make much sense if the type isn't 4x32bit). Due to
116 * translation of offsets to first_elem in sampler_views it actually seems
117 * gallium could not do anything else except 16 no matter what...
118 */
119 if (!aligned) {
120 LLVMSetAlignment(res, 1);
121 } else if (!util_is_power_of_two(src_width)) {
122 /*
123 * Full alignment is impossible, assume the caller really meant
124 * the individual elements were aligned (e.g. 3x32bit format).
125 * And yes the generated code may otherwise crash, llvm will
126 * really assume 128bit alignment with a 96bit fetch (I suppose
127 * that makes sense as it can just assume the upper 32bit to be
128 * whatever).
129 * Maybe the caller should be able to explicitly set this, but
130 * this should cover all the 3-channel formats.
131 */
132 if (((src_width / 24) * 24 == src_width) &&
133 util_is_power_of_two(src_width / 24)) {
134 LLVMSetAlignment(res, src_width / 24);
135 } else {
136 LLVMSetAlignment(res, 1);
137 }
138 }
139
140 assert(src_width <= dst_width);
141 if (src_width < dst_width) {
142 res = LLVMBuildZExt(gallivm->builder, res, dst_elem_type, "");
143 if (vector_justify) {
144 #ifdef PIPE_ARCH_BIG_ENDIAN
145 res = LLVMBuildShl(gallivm->builder, res,
146 LLVMConstInt(dst_elem_type, dst_width - src_width, 0), "");
147 #endif
148 }
149 }
150
151 return res;
152 }
153
154
155 /**
156 * Gather one element from scatter positions in memory.
157 * Nearly the same as above, however the individual elements
158 * may be vectors themselves, and fetches may be float type.
159 * Can also do pad vector instead of ZExt.
160 *
161 * @sa lp_build_gather()
162 */
163 static LLVMValueRef
lp_build_gather_elem_vec(struct gallivm_state * gallivm,unsigned length,unsigned src_width,LLVMTypeRef src_type,struct lp_type dst_type,boolean aligned,LLVMValueRef base_ptr,LLVMValueRef offsets,unsigned i,boolean vector_justify)164 lp_build_gather_elem_vec(struct gallivm_state *gallivm,
165 unsigned length,
166 unsigned src_width,
167 LLVMTypeRef src_type,
168 struct lp_type dst_type,
169 boolean aligned,
170 LLVMValueRef base_ptr,
171 LLVMValueRef offsets,
172 unsigned i,
173 boolean vector_justify)
174 {
175 LLVMValueRef ptr, res;
176 LLVMTypeRef src_ptr_type = LLVMPointerType(src_type, 0);
177 assert(LLVMTypeOf(base_ptr) == LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0));
178
179 ptr = lp_build_gather_elem_ptr(gallivm, length, base_ptr, offsets, i);
180 ptr = LLVMBuildBitCast(gallivm->builder, ptr, src_ptr_type, "");
181 res = LLVMBuildLoad(gallivm->builder, ptr, "");
182
183 /* XXX
184 * On some archs we probably really want to avoid having to deal
185 * with alignments lower than 4 bytes (if fetch size is a power of
186 * two >= 32). On x86 it doesn't matter, however.
187 * We should be able to guarantee full alignment for any kind of texture
188 * fetch (except ARB_texture_buffer_range, oops), but not vertex fetch
189 * (there's PIPE_CAP_VERTEX_BUFFER_OFFSET_4BYTE_ALIGNED_ONLY and friends
190 * but I don't think that's quite what we wanted).
191 * For ARB_texture_buffer_range, PIPE_CAP_TEXTURE_BUFFER_OFFSET_ALIGNMENT
192 * looks like a good fit, but it seems this cap bit (and OpenGL) aren't
193 * enforcing what we want (which is what d3d10 does, the offset needs to
194 * be aligned to element size, but GL has bytes regardless of element
195 * size which would only leave us with minimum alignment restriction of 16
196 * which doesn't make much sense if the type isn't 4x32bit). Due to
197 * translation of offsets to first_elem in sampler_views it actually seems
198 * gallium could not do anything else except 16 no matter what...
199 */
200 if (!aligned) {
201 LLVMSetAlignment(res, 1);
202 } else if (!util_is_power_of_two(src_width)) {
203 /*
204 * Full alignment is impossible, assume the caller really meant
205 * the individual elements were aligned (e.g. 3x32bit format).
206 * And yes the generated code may otherwise crash, llvm will
207 * really assume 128bit alignment with a 96bit fetch (I suppose
208 * that makes sense as it can just assume the upper 32bit to be
209 * whatever).
210 * Maybe the caller should be able to explicitly set this, but
211 * this should cover all the 3-channel formats.
212 */
213 if (((src_width / 24) * 24 == src_width) &&
214 util_is_power_of_two(src_width / 24)) {
215 LLVMSetAlignment(res, src_width / 24);
216 } else {
217 LLVMSetAlignment(res, 1);
218 }
219 }
220
221 assert(src_width <= dst_type.width * dst_type.length);
222 if (src_width < dst_type.width * dst_type.length) {
223 if (dst_type.length > 1) {
224 res = lp_build_pad_vector(gallivm, res, dst_type.length);
225 /*
226 * vector_justify hopefully a non-issue since we only deal
227 * with src_width >= 32 here?
228 */
229 } else {
230 LLVMTypeRef dst_elem_type = lp_build_vec_type(gallivm, dst_type);
231
232 /*
233 * Only valid if src_ptr_type is int type...
234 */
235 res = LLVMBuildZExt(gallivm->builder, res, dst_elem_type, "");
236
237 if (vector_justify) {
238 #ifdef PIPE_ARCH_BIG_ENDIAN
239 res = LLVMBuildShl(gallivm->builder, res,
240 LLVMConstInt(dst_elem_type,
241 dst_type.width - src_width, 0), "");
242 #endif
243 }
244 }
245 }
246 return res;
247 }
248
249
250
251
252 static LLVMValueRef
lp_build_gather_avx2(struct gallivm_state * gallivm,unsigned length,unsigned src_width,struct lp_type dst_type,LLVMValueRef base_ptr,LLVMValueRef offsets)253 lp_build_gather_avx2(struct gallivm_state *gallivm,
254 unsigned length,
255 unsigned src_width,
256 struct lp_type dst_type,
257 LLVMValueRef base_ptr,
258 LLVMValueRef offsets)
259 {
260 LLVMBuilderRef builder = gallivm->builder;
261 LLVMTypeRef src_type, src_vec_type;
262 LLVMValueRef res;
263 struct lp_type res_type = dst_type;
264 res_type.length *= length;
265
266 if (dst_type.floating) {
267 src_type = src_width == 64 ? LLVMDoubleTypeInContext(gallivm->context) :
268 LLVMFloatTypeInContext(gallivm->context);
269 } else {
270 src_type = LLVMIntTypeInContext(gallivm->context, src_width);
271 }
272 src_vec_type = LLVMVectorType(src_type, length);
273
274 /* XXX should allow hw scaling (can handle i8, i16, i32, i64 for x86) */
275 assert(LLVMTypeOf(base_ptr) == LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0));
276
277 if (0) {
278 /*
279 * XXX: This will cause LLVM pre 3.7 to hang; it works on LLVM 3.8 but
280 * will not use the AVX2 gather instrinsics (even with llvm 4.0), at
281 * least with Haswell. See
282 * http://lists.llvm.org/pipermail/llvm-dev/2016-January/094448.html
283 * And the generated code doing the emulation is quite a bit worse
284 * than what we get by doing it ourselves too.
285 */
286 LLVMTypeRef i32_type = LLVMIntTypeInContext(gallivm->context, 32);
287 LLVMTypeRef i32_vec_type = LLVMVectorType(i32_type, length);
288 LLVMTypeRef i1_type = LLVMIntTypeInContext(gallivm->context, 1);
289 LLVMTypeRef i1_vec_type = LLVMVectorType(i1_type, length);
290 LLVMTypeRef src_ptr_type = LLVMPointerType(src_type, 0);
291 LLVMValueRef src_ptr;
292
293 base_ptr = LLVMBuildBitCast(builder, base_ptr, src_ptr_type, "");
294
295 /* Rescale offsets from bytes to elements */
296 LLVMValueRef scale = LLVMConstInt(i32_type, src_width/8, 0);
297 scale = lp_build_broadcast(gallivm, i32_vec_type, scale);
298 assert(LLVMTypeOf(offsets) == i32_vec_type);
299 offsets = LLVMBuildSDiv(builder, offsets, scale, "");
300
301 src_ptr = LLVMBuildGEP(builder, base_ptr, &offsets, 1, "vector-gep");
302
303 char intrinsic[64];
304 util_snprintf(intrinsic, sizeof intrinsic, "llvm.masked.gather.v%u%s%u",
305 length, dst_type.floating ? "f" : "i", src_width);
306 LLVMValueRef alignment = LLVMConstInt(i32_type, src_width/8, 0);
307 LLVMValueRef mask = LLVMConstAllOnes(i1_vec_type);
308 LLVMValueRef passthru = LLVMGetUndef(src_vec_type);
309
310 LLVMValueRef args[] = { src_ptr, alignment, mask, passthru };
311
312 res = lp_build_intrinsic(builder, intrinsic, src_vec_type, args, 4, 0);
313 } else {
314 LLVMTypeRef i8_type = LLVMIntTypeInContext(gallivm->context, 8);
315 const char *intrinsic = NULL;
316 unsigned l_idx = 0;
317
318 assert(src_width == 32 || src_width == 64);
319 if (src_width == 32) {
320 assert(length == 4 || length == 8);
321 } else {
322 assert(length == 2 || length == 4);
323 }
324
325 static const char *intrinsics[2][2][2] = {
326
327 {{"llvm.x86.avx2.gather.d.d",
328 "llvm.x86.avx2.gather.d.d.256"},
329 {"llvm.x86.avx2.gather.d.q",
330 "llvm.x86.avx2.gather.d.q.256"}},
331
332 {{"llvm.x86.avx2.gather.d.ps",
333 "llvm.x86.avx2.gather.d.ps.256"},
334 {"llvm.x86.avx2.gather.d.pd",
335 "llvm.x86.avx2.gather.d.pd.256"}},
336 };
337
338 if ((src_width == 32 && length == 8) ||
339 (src_width == 64 && length == 4)) {
340 l_idx = 1;
341 }
342 intrinsic = intrinsics[dst_type.floating][src_width == 64][l_idx];
343
344 LLVMValueRef passthru = LLVMGetUndef(src_vec_type);
345 LLVMValueRef mask = LLVMConstAllOnes(src_vec_type);
346 mask = LLVMConstBitCast(mask, src_vec_type);
347 LLVMValueRef scale = LLVMConstInt(i8_type, 1, 0);
348
349 LLVMValueRef args[] = { passthru, base_ptr, offsets, mask, scale };
350
351 res = lp_build_intrinsic(builder, intrinsic, src_vec_type, args, 5, 0);
352 }
353 res = LLVMBuildBitCast(builder, res, lp_build_vec_type(gallivm, res_type), "");
354
355 return res;
356 }
357
358
359 /**
360 * Gather elements from scatter positions in memory into a single vector.
361 * Use for fetching texels from a texture.
362 * For SSE, typical values are length=4, src_width=32, dst_width=32.
363 *
364 * When src_width < dst_width, the return value can be justified in
365 * one of two ways:
366 * "integer justification" is used when the caller treats the destination
367 * as a packed integer bitmask, as described by the channels' "shift" and
368 * "width" fields;
369 * "vector justification" is used when the caller casts the destination
370 * to a vector and needs channel X to be in vector element 0.
371 *
372 * @param length length of the offsets
373 * @param src_width src element width in bits
374 * @param dst_type result element type (src will be expanded to fit,
375 * but truncation is not allowed)
376 * (this may be a vector, must be pot sized)
377 * @param aligned whether the data is guaranteed to be aligned (to src_width)
378 * @param base_ptr base pointer, needs to be a i8 pointer type.
379 * @param offsets vector with offsets
380 * @param vector_justify select vector rather than integer justification
381 */
382 LLVMValueRef
lp_build_gather(struct gallivm_state * gallivm,unsigned length,unsigned src_width,struct lp_type dst_type,boolean aligned,LLVMValueRef base_ptr,LLVMValueRef offsets,boolean vector_justify)383 lp_build_gather(struct gallivm_state *gallivm,
384 unsigned length,
385 unsigned src_width,
386 struct lp_type dst_type,
387 boolean aligned,
388 LLVMValueRef base_ptr,
389 LLVMValueRef offsets,
390 boolean vector_justify)
391 {
392 LLVMValueRef res;
393 boolean need_expansion = src_width < dst_type.width * dst_type.length;
394 boolean vec_fetch;
395 struct lp_type fetch_type, fetch_dst_type;
396 LLVMTypeRef src_type;
397
398 assert(src_width <= dst_type.width * dst_type.length);
399
400 /*
401 * This is quite a mess...
402 * Figure out if the fetch should be done as:
403 * a) scalar or vector
404 * b) float or int
405 *
406 * As an example, for a 96bit fetch expanded into 4x32bit, it is better
407 * to use (3x32bit) vector type (then pad the vector). Otherwise, the
408 * zext will cause extra instructions.
409 * However, the same isn't true for 3x16bit (the codegen for that is
410 * completely worthless on x86 simd, and for 3x8bit is is way worse
411 * still, don't try that... (To get really good code out of llvm for
412 * these cases, the only way is to decompose the fetches manually
413 * into 1x32bit/1x16bit, or 1x16/1x8bit respectively, although the latter
414 * case requires sse41, otherwise simple scalar zext is way better.
415 * But probably not important enough, so don't bother.)
416 * Also, we try to honor the floating bit of destination (but isn't
417 * possible if caller asks for instance for 2x32bit dst_type with
418 * 48bit fetch - the idea would be to use 3x16bit fetch, pad and
419 * cast to 2x32f type, so the fetch is always int and on top of that
420 * we avoid the vec pad and use scalar zext due the above mentioned
421 * issue).
422 * Note this is optimized for x86 sse2 and up backend. Could be tweaked
423 * for other archs if necessary...
424 */
425 if (((src_width % 32) == 0) && ((src_width % dst_type.width) == 0) &&
426 (dst_type.length > 1)) {
427 /* use vector fetch (if dst_type is vector) */
428 vec_fetch = TRUE;
429 if (dst_type.floating) {
430 fetch_type = lp_type_float_vec(dst_type.width, src_width);
431 } else {
432 fetch_type = lp_type_int_vec(dst_type.width, src_width);
433 }
434 /* intentionally not using lp_build_vec_type here */
435 src_type = LLVMVectorType(lp_build_elem_type(gallivm, fetch_type),
436 fetch_type.length);
437 fetch_dst_type = fetch_type;
438 fetch_dst_type.length = dst_type.length;
439 } else {
440 /* use scalar fetch */
441 vec_fetch = FALSE;
442 if (dst_type.floating && ((src_width == 32) || (src_width == 64))) {
443 fetch_type = lp_type_float(src_width);
444 } else {
445 fetch_type = lp_type_int(src_width);
446 }
447 src_type = lp_build_vec_type(gallivm, fetch_type);
448 fetch_dst_type = fetch_type;
449 fetch_dst_type.width = dst_type.width * dst_type.length;
450 }
451
452 if (length == 1) {
453 /* Scalar */
454 res = lp_build_gather_elem_vec(gallivm, length,
455 src_width, src_type, fetch_dst_type,
456 aligned, base_ptr, offsets, 0,
457 vector_justify);
458 return LLVMBuildBitCast(gallivm->builder, res,
459 lp_build_vec_type(gallivm, dst_type), "");
460 /*
461 * Excluding expansion from these paths because if you need it for
462 * 32bit/64bit fetches you're doing it wrong (this is gather, not
463 * conversion) and it would be awkward for floats.
464 */
465 } else if (util_cpu_caps.has_avx2 && !need_expansion &&
466 src_width == 32 && (length == 4 || length == 8)) {
467 return lp_build_gather_avx2(gallivm, length, src_width, dst_type,
468 base_ptr, offsets);
469 /*
470 * This looks bad on paper wrt throughtput/latency on Haswell.
471 * Even on Broadwell it doesn't look stellar.
472 * Albeit no measurements were done (but tested to work).
473 * Should definitely enable on Skylake.
474 * (In general, should be more of a win if the fetch is 256bit wide -
475 * this is true for the 32bit case above too.)
476 */
477 } else if (0 && util_cpu_caps.has_avx2 && !need_expansion &&
478 src_width == 64 && (length == 2 || length == 4)) {
479 return lp_build_gather_avx2(gallivm, length, src_width, dst_type,
480 base_ptr, offsets);
481 } else {
482 /* Vector */
483
484 LLVMValueRef elems[LP_MAX_VECTOR_WIDTH / 8];
485 unsigned i;
486 boolean vec_zext = FALSE;
487 struct lp_type res_type, gather_res_type;
488 LLVMTypeRef res_t, gather_res_t;
489
490 res_type = fetch_dst_type;
491 res_type.length *= length;
492 gather_res_type = res_type;
493
494 if (src_width == 16 && dst_type.width == 32 && dst_type.length == 1) {
495 /*
496 * Note that llvm is never able to optimize zext/insert combos
497 * directly (i.e. zero the simd reg, then place the elements into
498 * the appropriate place directly). (I think this has to do with
499 * scalar/vector transition.) And scalar 16->32bit zext simd loads
500 * aren't possible (instead loading to scalar reg first).
501 * No idea about other archs...
502 * We could do this manually, but instead we just use a vector
503 * zext, which is simple enough (and, in fact, llvm might optimize
504 * this away).
505 * (We're not trying that with other bit widths as that might not be
506 * easier, in particular with 8 bit values at least with only sse2.)
507 */
508 assert(vec_fetch == FALSE);
509 gather_res_type.width /= 2;
510 fetch_dst_type = fetch_type;
511 src_type = lp_build_vec_type(gallivm, fetch_type);
512 vec_zext = TRUE;
513 }
514 res_t = lp_build_vec_type(gallivm, res_type);
515 gather_res_t = lp_build_vec_type(gallivm, gather_res_type);
516 res = LLVMGetUndef(gather_res_t);
517 for (i = 0; i < length; ++i) {
518 LLVMValueRef index = lp_build_const_int32(gallivm, i);
519 elems[i] = lp_build_gather_elem_vec(gallivm, length,
520 src_width, src_type, fetch_dst_type,
521 aligned, base_ptr, offsets, i,
522 vector_justify);
523 if (!vec_fetch) {
524 res = LLVMBuildInsertElement(gallivm->builder, res, elems[i], index, "");
525 }
526 }
527 if (vec_zext) {
528 res = LLVMBuildZExt(gallivm->builder, res, res_t, "");
529 if (vector_justify) {
530 #ifdef PIPE_ARCH_BIG_ENDIAN
531 unsigned sv = dst_type.width - src_width;
532 res = LLVMBuildShl(gallivm->builder, res,
533 lp_build_const_int_vec(gallivm, res_type, sv), "");
534 #endif
535 }
536 }
537 if (vec_fetch) {
538 /*
539 * Do bitcast now otherwise llvm might get some funny ideas wrt
540 * float/int types...
541 */
542 for (i = 0; i < length; i++) {
543 elems[i] = LLVMBuildBitCast(gallivm->builder, elems[i],
544 lp_build_vec_type(gallivm, dst_type), "");
545 }
546 res = lp_build_concat(gallivm, elems, dst_type, length);
547 } else {
548 struct lp_type really_final_type = dst_type;
549 assert(res_type.length * res_type.width ==
550 dst_type.length * dst_type.width * length);
551 really_final_type.length *= length;
552 res = LLVMBuildBitCast(gallivm->builder, res,
553 lp_build_vec_type(gallivm, really_final_type), "");
554 }
555 }
556
557 return res;
558 }
559
560 LLVMValueRef
lp_build_gather_values(struct gallivm_state * gallivm,LLVMValueRef * values,unsigned value_count)561 lp_build_gather_values(struct gallivm_state * gallivm,
562 LLVMValueRef * values,
563 unsigned value_count)
564 {
565 LLVMTypeRef vec_type = LLVMVectorType(LLVMTypeOf(values[0]), value_count);
566 LLVMBuilderRef builder = gallivm->builder;
567 LLVMValueRef vec = LLVMGetUndef(vec_type);
568 unsigned i;
569
570 for (i = 0; i < value_count; i++) {
571 LLVMValueRef index = lp_build_const_int32(gallivm, i);
572 vec = LLVMBuildInsertElement(builder, vec, values[i], index, "");
573 }
574 return vec;
575 }
576