/**************************************************************************
 *
 * Copyright 2010 VMware, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 **************************************************************************/


#include "util/u_debug.h"
#include "util/u_cpu_detect.h"
#include "util/u_math.h"
#include "lp_bld_debug.h"
#include "lp_bld_const.h"
#include "lp_bld_format.h"
#include "lp_bld_gather.h"
#include "lp_bld_swizzle.h"
#include "lp_bld_type.h"
#include "lp_bld_init.h"
#include "lp_bld_intr.h"
#include "lp_bld_pack.h"


/**
 * Get the pointer to one element from scatter positions in memory.
 *
 * @sa lp_build_gather()
 */
LLVMValueRef
lp_build_gather_elem_ptr(struct gallivm_state *gallivm,
                         unsigned length,
                         LLVMValueRef base_ptr,
                         LLVMValueRef offsets,
                         unsigned i)
{
   LLVMValueRef offset;
   LLVMValueRef ptr;

   ASSERTED LLVMTypeRef element_type = LLVMInt8TypeInContext(gallivm->context);
   assert(LLVMTypeOf(base_ptr) == LLVMPointerType(element_type, 0));

   if (length == 1) {
      assert(i == 0);
      offset = offsets;
   } else {
      LLVMValueRef index = lp_build_const_int32(gallivm, i);
      offset = LLVMBuildExtractElement(gallivm->builder, offsets, index, "");
   }

   ptr = LLVMBuildGEP2(gallivm->builder, element_type, base_ptr, &offset, 1, "");

   return ptr;
}


/**
 * Gather one element from scatter positions in memory.
 *
 * @sa lp_build_gather()
 */
LLVMValueRef
lp_build_gather_elem(struct gallivm_state *gallivm,
                     unsigned length,
                     unsigned src_width,
                     unsigned dst_width,
                     boolean aligned,
                     LLVMValueRef base_ptr,
                     LLVMValueRef offsets,
                     unsigned i,
                     boolean vector_justify)
{
   LLVMTypeRef src_type = LLVMIntTypeInContext(gallivm->context, src_width);
   LLVMTypeRef dst_elem_type = LLVMIntTypeInContext(gallivm->context, dst_width);
   LLVMValueRef ptr;
   LLVMValueRef res;

   assert(LLVMTypeOf(base_ptr) == LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0));

   ptr = lp_build_gather_elem_ptr(gallivm, length, base_ptr, offsets, i);
   ptr = LLVMBuildBitCast(gallivm->builder, ptr, LLVMPointerType(src_type, 0), "");
   res = LLVMBuildLoad2(gallivm->builder, src_type, ptr, "");

   /* XXX
    * On some archs we probably really want to avoid having to deal
    * with alignments lower than 4 bytes (if fetch size is a power of
    * two >= 32). On x86 it doesn't matter, however.
    * We should be able to guarantee full alignment for any kind of texture
    * fetch (except ARB_texture_buffer_range, oops), but not vertex fetch
    * (there's PIPE_CAP_VERTEX_BUFFER_OFFSET_4BYTE_ALIGNED_ONLY and friends
    * but I don't think that's quite what we wanted).
    * For ARB_texture_buffer_range, PIPE_CAP_TEXTURE_BUFFER_OFFSET_ALIGNMENT
    * looks like a good fit, but it seems this cap bit (and OpenGL) aren't
    * enforcing what we want (which is what d3d10 does, the offset needs to
    * be aligned to element size, but GL has bytes regardless of element
    * size which would only leave us with minimum alignment restriction of 16
    * which doesn't make much sense if the type isn't 4x32bit). Due to
    * translation of offsets to first_elem in sampler_views it actually seems
    * gallium could not do anything else except 16 no matter what...
    */
   if (!aligned) {
      LLVMSetAlignment(res, 1);
   } else if (!util_is_power_of_two_or_zero(src_width)) {
      /*
       * Full alignment is impossible, assume the caller really meant
       * the individual elements were aligned (e.g. 3x32bit format).
       * And yes the generated code may otherwise crash, llvm will
       * really assume 128bit alignment with a 96bit fetch (I suppose
       * that makes sense as it can just assume the upper 32bit to be
       * whatever).
       * Maybe the caller should be able to explicitly set this, but
       * this should cover all the 3-channel formats.
       */
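      /* E.g. a 3x32bit (96bit) fetch: src_width / 24 = 4, i.e. 4-byte (one element) alignment. */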
      if (((src_width / 24) * 24 == src_width) &&
           util_is_power_of_two_or_zero(src_width / 24)) {
          LLVMSetAlignment(res, src_width / 24);
      } else {
         LLVMSetAlignment(res, 1);
      }
   }

   assert(src_width <= dst_width);
   if (src_width < dst_width) {
      res = LLVMBuildZExt(gallivm->builder, res, dst_elem_type, "");
      if (vector_justify) {
#if UTIL_ARCH_BIG_ENDIAN
         res = LLVMBuildShl(gallivm->builder, res,
                            LLVMConstInt(dst_elem_type, dst_width - src_width, 0), "");
#endif
      }
   }

   return res;
}


/**
 * Gather one element from scatter positions in memory.
 * Nearly the same as above, however the individual elements
 * may be vectors themselves, and fetches may be of float type.
 * Can also pad the vector instead of doing a ZExt.
 *
 * @sa lp_build_gather()
 */
static LLVMValueRef
lp_build_gather_elem_vec(struct gallivm_state *gallivm,
                         unsigned length,
                         unsigned src_width,
                         LLVMTypeRef src_type,
                         struct lp_type dst_type,
                         boolean aligned,
                         LLVMValueRef base_ptr,
                         LLVMValueRef offsets,
                         unsigned i,
                         boolean vector_justify)
{
   LLVMValueRef ptr, res;
   assert(LLVMTypeOf(base_ptr) == LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0));

   ptr = lp_build_gather_elem_ptr(gallivm, length, base_ptr, offsets, i);
   ptr = LLVMBuildBitCast(gallivm->builder, ptr, LLVMPointerType(src_type, 0), "");
   res = LLVMBuildLoad2(gallivm->builder, src_type, ptr, "");

   /* XXX
    * On some archs we probably really want to avoid having to deal
    * with alignments lower than 4 bytes (if fetch size is a power of
    * two >= 32). On x86 it doesn't matter, however.
    * We should be able to guarantee full alignment for any kind of texture
    * fetch (except ARB_texture_buffer_range, oops), but not vertex fetch
    * (there's PIPE_CAP_VERTEX_BUFFER_OFFSET_4BYTE_ALIGNED_ONLY and friends
    * but I don't think that's quite what we wanted).
    * For ARB_texture_buffer_range, PIPE_CAP_TEXTURE_BUFFER_OFFSET_ALIGNMENT
    * looks like a good fit, but it seems this cap bit (and OpenGL) aren't
    * enforcing what we want (which is what d3d10 does, the offset needs to
    * be aligned to element size, but GL has bytes regardless of element
    * size which would only leave us with minimum alignment restriction of 16
    * which doesn't make much sense if the type isn't 4x32bit). Due to
    * translation of offsets to first_elem in sampler_views it actually seems
    * gallium could not do anything else except 16 no matter what...
    */
   if (!aligned) {
      LLVMSetAlignment(res, 1);
   } else if (!util_is_power_of_two_or_zero(src_width)) {
      /*
       * Full alignment is impossible, assume the caller really meant
       * the individual elements were aligned (e.g. 3x32bit format).
       * And yes the generated code may otherwise crash, llvm will
       * really assume 128bit alignment with a 96bit fetch (I suppose
       * that makes sense as it can just assume the upper 32bit to be
       * whatever).
       * Maybe the caller should be able to explicitly set this, but
       * this should cover all the 3-channel formats.
       */
      if (((src_width / 24) * 24 == src_width) &&
           util_is_power_of_two_or_zero(src_width / 24)) {
          LLVMSetAlignment(res, src_width / 24);
      } else {
         LLVMSetAlignment(res, 1);
      }
   }

   assert(src_width <= dst_type.width * dst_type.length);
   if (src_width < dst_type.width * dst_type.length) {
      if (dst_type.length > 1) {
         res = lp_build_pad_vector(gallivm, res, dst_type.length);
         /*
          * vector_justify hopefully a non-issue since we only deal
          * with src_width >= 32 here?
          */
      } else {
         LLVMTypeRef dst_elem_type = lp_build_vec_type(gallivm, dst_type);

         /*
          * Only valid if src_ptr_type is int type...
          */
         res = LLVMBuildZExt(gallivm->builder, res, dst_elem_type, "");

#if UTIL_ARCH_BIG_ENDIAN
         if (vector_justify) {
            res = LLVMBuildShl(gallivm->builder, res,
                               LLVMConstInt(dst_elem_type,
                                            dst_type.width - src_width, 0), "");
         }
         if (src_width == 48) {
            /* Load 3x16 bit vector.
             * The sequence of loads on big-endian hardware proceeds as follows.
             * 16-bit fields are denoted by X, Y, Z, and 0.  In memory, the sequence
             * of three fields appears in the order X, Y, Z.
             *
             * Load 32-bit word: 0.0.X.Y
             * Load 16-bit halfword: 0.0.0.Z
             * Rotate left: 0.X.Y.0
             * Bitwise OR: 0.X.Y.Z
             *
             * The order in which we need the fields in the result is 0.Z.Y.X,
             * the same as on little-endian; permute 16-bit fields accordingly
             * within 64-bit register:
             */
            LLVMValueRef shuffles[4] = {
               lp_build_const_int32(gallivm, 2),
               lp_build_const_int32(gallivm, 1),
               lp_build_const_int32(gallivm, 0),
               lp_build_const_int32(gallivm, 3),
            };
            res = LLVMBuildBitCast(gallivm->builder, res,
                                   lp_build_vec_type(gallivm, lp_type_uint_vec(16, 4*16)), "");
            res = LLVMBuildShuffleVector(gallivm->builder, res, res, LLVMConstVector(shuffles, 4), "");
            res = LLVMBuildBitCast(gallivm->builder, res, dst_elem_type, "");
         }
#endif
      }
   }
   return res;
}




static LLVMValueRef
lp_build_gather_avx2(struct gallivm_state *gallivm,
                     unsigned length,
                     unsigned src_width,
                     struct lp_type dst_type,
                     LLVMValueRef base_ptr,
                     LLVMValueRef offsets)
{
   LLVMBuilderRef builder = gallivm->builder;
   LLVMTypeRef src_type, src_vec_type;
   LLVMValueRef res;
   struct lp_type res_type = dst_type;
   res_type.length *= length;

   if (dst_type.floating) {
      src_type = src_width == 64 ? LLVMDoubleTypeInContext(gallivm->context) :
                                   LLVMFloatTypeInContext(gallivm->context);
   } else {
      src_type = LLVMIntTypeInContext(gallivm->context, src_width);
   }
   src_vec_type = LLVMVectorType(src_type, length);

   /* XXX should allow hw scaling (can handle i8, i16, i32, i64 for x86) */
   assert(LLVMTypeOf(base_ptr) == LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0));

   if (0) {
      /*
       * XXX: This will cause LLVM pre 3.7 to hang; it works on LLVM 3.8 but
       * will not use the AVX2 gather intrinsics (even with llvm 4.0), at
       * least with Haswell. See
       * http://lists.llvm.org/pipermail/llvm-dev/2016-January/094448.html
       * And the generated code doing the emulation is quite a bit worse
       * than what we get by doing it ourselves too.
       */
      LLVMTypeRef i32_type = LLVMIntTypeInContext(gallivm->context, 32);
      LLVMTypeRef i32_vec_type = LLVMVectorType(i32_type, length);
      LLVMTypeRef i1_type = LLVMIntTypeInContext(gallivm->context, 1);
      LLVMTypeRef i1_vec_type = LLVMVectorType(i1_type, length);
      LLVMTypeRef src_ptr_type = LLVMPointerType(src_type, 0);
      LLVMValueRef src_ptr;

      base_ptr = LLVMBuildBitCast(builder, base_ptr, src_ptr_type, "");

      /* Rescale offsets from bytes to elements */
      LLVMValueRef scale = LLVMConstInt(i32_type, src_width/8, 0);
      scale = lp_build_broadcast(gallivm, i32_vec_type, scale);
      assert(LLVMTypeOf(offsets) == i32_vec_type);
      offsets = LLVMBuildSDiv(builder, offsets, scale, "");

      src_ptr = LLVMBuildGEP2(builder, src_type, base_ptr, &offsets, 1, "vector-gep");

      char intrinsic[64];
      snprintf(intrinsic, sizeof intrinsic, "llvm.masked.gather.v%u%s%u",
               length, dst_type.floating ? "f" : "i", src_width);
      LLVMValueRef alignment = LLVMConstInt(i32_type, src_width/8, 0);
      LLVMValueRef mask = LLVMConstAllOnes(i1_vec_type);
      LLVMValueRef passthru = LLVMGetUndef(src_vec_type);

      LLVMValueRef args[] = { src_ptr, alignment, mask, passthru };

      res = lp_build_intrinsic(builder, intrinsic, src_vec_type, args, 4, 0);
   } else {
      LLVMTypeRef i8_type = LLVMIntTypeInContext(gallivm->context, 8);
      const char *intrinsic = NULL;
      unsigned l_idx = 0;

      assert(src_width == 32 || src_width == 64);
      if (src_width == 32) {
         assert(length == 4 || length == 8);
      } else {
         assert(length == 2 || length == 4);
      }

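      /* Table indexed as [dst_type.floating][src_width == 64][256-bit variant]. */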
      static const char *intrinsics[2][2][2] = {

         {{"llvm.x86.avx2.gather.d.d",
           "llvm.x86.avx2.gather.d.d.256"},
          {"llvm.x86.avx2.gather.d.q",
           "llvm.x86.avx2.gather.d.q.256"}},

         {{"llvm.x86.avx2.gather.d.ps",
           "llvm.x86.avx2.gather.d.ps.256"},
          {"llvm.x86.avx2.gather.d.pd",
           "llvm.x86.avx2.gather.d.pd.256"}},
      };

      if ((src_width == 32 && length == 8) ||
          (src_width == 64 && length == 4)) {
         l_idx = 1;
      }
      intrinsic = intrinsics[dst_type.floating][src_width == 64][l_idx];

      LLVMValueRef passthru = LLVMGetUndef(src_vec_type);
      LLVMValueRef mask = LLVMConstAllOnes(src_vec_type);
      mask = LLVMConstBitCast(mask, src_vec_type);
      LLVMValueRef scale = LLVMConstInt(i8_type, 1, 0);

      LLVMValueRef args[] = { passthru, base_ptr, offsets, mask, scale };

      res = lp_build_intrinsic(builder, intrinsic, src_vec_type, args, 5, 0);
   }
   res = LLVMBuildBitCast(builder, res, lp_build_vec_type(gallivm, res_type), "");

   return res;
}


/**
 * Gather elements from scatter positions in memory into a single vector.
 * Use for fetching texels from a texture.
 * For SSE, typical values are length=4, src_width=32, dst_width=32.
 *
 * When src_width < dst_width, the return value can be justified in
 * one of two ways:
 * "integer justification" is used when the caller treats the destination
 * as a packed integer bitmask, as described by the channels' "shift" and
 * "width" fields;
 * "vector justification" is used when the caller casts the destination
 * to a vector and needs channel X to be in vector element 0.
 *
 * @param length length of the offsets
 * @param src_width src element width in bits
 * @param dst_type result element type (src will be expanded to fit,
 *        but truncation is not allowed)
 *        (this may be a vector, must be pot sized)
 * @param aligned whether the data is guaranteed to be aligned (to src_width)
 * @param base_ptr base pointer, needs to be an i8 pointer type.
 * @param offsets vector with offsets
 * @param vector_justify select vector rather than integer justification
 */
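/*
 * Illustrative use only: a caller such as a texel fetch path would typically
 * do something like the following, assuming "gallivm", "base_ptr" (an i8
 * pointer) and "offsets" (a 4 x i32 vector of byte offsets) already exist:
 *
 *    struct lp_type dst_type = lp_type_int(32);
 *    LLVMValueRef texels = lp_build_gather(gallivm, 4, 32, dst_type,
 *                                          TRUE, base_ptr, offsets, FALSE);
 *
 * which yields a 4 x i32 vector holding the four gathered 32-bit values.
 */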
LLVMValueRef
lp_build_gather(struct gallivm_state *gallivm,
                unsigned length,
                unsigned src_width,
                struct lp_type dst_type,
                boolean aligned,
                LLVMValueRef base_ptr,
                LLVMValueRef offsets,
                boolean vector_justify)
{
   LLVMValueRef res;
   boolean need_expansion = src_width < dst_type.width * dst_type.length;
   boolean vec_fetch;
   struct lp_type fetch_type, fetch_dst_type;
   LLVMTypeRef src_type;

   assert(src_width <= dst_type.width * dst_type.length);

   /*
    * This is quite a mess...
    * Figure out if the fetch should be done as:
    * a) scalar or vector
    * b) float or int
    *
    * As an example, for a 96bit fetch expanded into 4x32bit, it is better
    * to use (3x32bit) vector type (then pad the vector). Otherwise, the
    * zext will cause extra instructions.
    * However, the same isn't true for 3x16bit (the codegen for that is
    * completely worthless on x86 simd, and for 3x8bit it is way worse
    * still, don't try that... (To get really good code out of llvm for
    * these cases, the only way is to decompose the fetches manually
    * into 1x32bit/1x16bit, or 1x16/1x8bit respectively, although the latter
    * case requires sse41, otherwise simple scalar zext is way better.
    * But probably not important enough, so don't bother.)
    * Also, we try to honor the floating bit of destination (but it isn't
    * possible if the caller asks for instance for a 2x32bit dst_type with a
    * 48bit fetch - the idea would be to use 3x16bit fetch, pad and
    * cast to 2x32f type, so the fetch is always int and on top of that
    * we avoid the vec pad and use scalar zext due to the above mentioned
    * issue).
    * Note this is optimized for x86 sse2 and up backend. Could be tweaked
    * for other archs if necessary...
    */
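   /*
    * E.g. a 96bit fetch into 4x32bit float takes the vector path below
    * (3 x float fetch, padded to 4 elements), while a 16bit fetch into a
    * scalar 32bit dst takes the scalar path (i16 fetch widened to 32bit).
    */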
   if (((src_width % 32) == 0) && ((src_width % dst_type.width) == 0) &&
       (dst_type.length > 1)) {
      /* use vector fetch (if dst_type is vector) */
      vec_fetch = TRUE;
      if (dst_type.floating) {
         fetch_type = lp_type_float_vec(dst_type.width, src_width);
      } else {
         fetch_type = lp_type_int_vec(dst_type.width, src_width);
      }
      /* intentionally not using lp_build_vec_type here */
      src_type = LLVMVectorType(lp_build_elem_type(gallivm, fetch_type),
                                fetch_type.length);
      fetch_dst_type = fetch_type;
      fetch_dst_type.length = dst_type.length;
   } else {
      /* use scalar fetch */
      vec_fetch = FALSE;
      if (dst_type.floating && ((src_width == 32) || (src_width == 64))) {
         fetch_type = lp_type_float(src_width);
      } else {
         fetch_type = lp_type_int(src_width);
      }
      src_type = lp_build_vec_type(gallivm, fetch_type);
      fetch_dst_type = fetch_type;
      fetch_dst_type.width = dst_type.width * dst_type.length;
   }

   if (length == 1) {
      /* Scalar */
      res = lp_build_gather_elem_vec(gallivm, length,
                                     src_width, src_type, fetch_dst_type,
                                     aligned, base_ptr, offsets, 0,
                                     vector_justify);
      return LLVMBuildBitCast(gallivm->builder, res,
                              lp_build_vec_type(gallivm, dst_type), "");
      /*
       * Excluding expansion from these paths because if you need it for
       * 32bit/64bit fetches you're doing it wrong (this is gather, not
       * conversion) and it would be awkward for floats.
       */
   } else if (util_get_cpu_caps()->has_avx2 && !need_expansion &&
              src_width == 32 && (length == 4 || length == 8)) {
      return lp_build_gather_avx2(gallivm, length, src_width, dst_type,
                                  base_ptr, offsets);
   /*
    * This looks bad on paper wrt throughput/latency on Haswell.
    * Even on Broadwell it doesn't look stellar.
    * No measurements were done, though it was tested to work.
    * Should definitely enable on Skylake.
    * (In general, should be more of a win if the fetch is 256bit wide -
    * this is true for the 32bit case above too.)
    */
   } else if (0 && util_get_cpu_caps()->has_avx2 && !need_expansion &&
              src_width == 64 && (length == 2 || length == 4)) {
      return lp_build_gather_avx2(gallivm, length, src_width, dst_type,
                                  base_ptr, offsets);
   } else {
      /* Vector */

      LLVMValueRef elems[LP_MAX_VECTOR_WIDTH / 8];
      unsigned i;
      boolean vec_zext = FALSE;
      struct lp_type res_type, gather_res_type;
      LLVMTypeRef res_t, gather_res_t;

      res_type = fetch_dst_type;
      res_type.length *= length;
      gather_res_type = res_type;

      if (src_width == 16 && dst_type.width == 32 && dst_type.length == 1) {
         /*
          * Note that llvm is never able to optimize zext/insert combos
          * directly (i.e. zero the simd reg, then place the elements into
          * the appropriate place directly). (I think this has to do with
          * scalar/vector transition.) And scalar 16->32bit zext simd loads
          * aren't possible (instead loading to scalar reg first).
          * No idea about other archs...
          * We could do this manually, but instead we just use a vector
          * zext, which is simple enough (and, in fact, llvm might optimize
          * this away).
          * (We're not trying that with other bit widths as that might not be
          * easier, in particular with 8 bit values at least with only sse2.)
          */
         assert(vec_fetch == FALSE);
         gather_res_type.width /= 2;
         fetch_dst_type = fetch_type;
         src_type = lp_build_vec_type(gallivm, fetch_type);
         vec_zext = TRUE;
      }
      res_t = lp_build_vec_type(gallivm, res_type);
      gather_res_t = lp_build_vec_type(gallivm, gather_res_type);
      res = LLVMGetUndef(gather_res_t);
      for (i = 0; i < length; ++i) {
         LLVMValueRef index = lp_build_const_int32(gallivm, i);
         elems[i] = lp_build_gather_elem_vec(gallivm, length,
                                             src_width, src_type, fetch_dst_type,
                                             aligned, base_ptr, offsets, i,
                                             vector_justify);
         if (!vec_fetch) {
            res = LLVMBuildInsertElement(gallivm->builder, res, elems[i], index, "");
         }
      }
      if (vec_zext) {
         res = LLVMBuildZExt(gallivm->builder, res, res_t, "");
         if (vector_justify) {
#if UTIL_ARCH_BIG_ENDIAN
            unsigned sv = dst_type.width - src_width;
            res = LLVMBuildShl(gallivm->builder, res,
                               lp_build_const_int_vec(gallivm, res_type, sv), "");
#endif
         }
      }
      if (vec_fetch) {
         /*
          * Do bitcast now otherwise llvm might get some funny ideas wrt
          * float/int types...
          */
         for (i = 0; i < length; i++) {
            elems[i] = LLVMBuildBitCast(gallivm->builder, elems[i],
                                        lp_build_vec_type(gallivm, dst_type), "");
         }
         res = lp_build_concat(gallivm, elems, dst_type, length);
      } else {
         struct lp_type really_final_type = dst_type;
         assert(res_type.length * res_type.width ==
                dst_type.length * dst_type.width * length);
         really_final_type.length *= length;
         res = LLVMBuildBitCast(gallivm->builder, res,
                                lp_build_vec_type(gallivm, really_final_type), "");
      }
   }

   return res;
}

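/**
 * Assemble @value_count scalar LLVM values (all of the same type) into a
 * single vector, by inserting them one by one.
 */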
LLVMValueRef
lp_build_gather_values(struct gallivm_state * gallivm,
                       LLVMValueRef * values,
                       unsigned value_count)
{
   LLVMTypeRef vec_type = LLVMVectorType(LLVMTypeOf(values[0]), value_count);
   LLVMBuilderRef builder = gallivm->builder;
   LLVMValueRef vec = LLVMGetUndef(vec_type);
   unsigned i;

   for (i = 0; i < value_count; i++) {
      LLVMValueRef index = lp_build_const_int32(gallivm, i);
      vec = LLVMBuildInsertElement(builder, vec, values[i], index, "");
   }
   return vec;
}