/**************************************************************************
 *
 * Copyright 2009 VMware, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 **************************************************************************/


/**
 * @file
 * Helper functions for packing/unpacking.
 *
 * Packing/unpacking is necessary for conversion between types of different
 * bit width.
 *
 * They are also commonly used when a computation needs higher
 * precision for the intermediate values. For example, if one needs the
 * function:
 *
 *   c = compute(a, b);
 *
 * to use more precision for intermediate results, then one should implement
 * it as:
 *
 *   LLVMValueRef
 *   compute(struct gallivm_state *gallivm, struct lp_type type,
 *           LLVMValueRef a, LLVMValueRef b)
 *   {
 *      struct lp_type wide_type = lp_wider_type(type);
 *      LLVMValueRef al, ah, bl, bh, cl, ch, c;
 *
 *      lp_build_unpack2(gallivm, type, wide_type, a, &al, &ah);
 *      lp_build_unpack2(gallivm, type, wide_type, b, &bl, &bh);
 *
 *      cl = compute_half(al, bl);
 *      ch = compute_half(ah, bh);
 *
 *      c = lp_build_pack2(gallivm, wide_type, type, cl, ch);
 *
 *      return c;
 *   }
 *
 * where compute_half() would do the computation for half the elements with
 * twice the precision.
 *
 * @author Jose Fonseca <jfonseca@vmware.com>
 */


#include "util/u_debug.h"
#include "util/u_math.h"
#include "util/u_cpu_detect.h"
#include "util/u_memory.h"

#include "lp_bld_type.h"
#include "lp_bld_const.h"
#include "lp_bld_init.h"
#include "lp_bld_intr.h"
#include "lp_bld_arit.h"
#include "lp_bld_pack.h"
#include "lp_bld_swizzle.h"


/**
 * Build shuffle vectors that match PUNPCKLxx and PUNPCKHxx instructions.
 */
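/*
 * Illustrative values (derived from the loop below): for n = 4, lo_hi = 0
 * builds the shuffle {0, 4, 1, 5} (matching PUNPCKLDQ on two 4x32 inputs),
 * and lo_hi = 1 builds {2, 6, 3, 7} (PUNPCKHDQ).
 */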
static LLVMValueRef
lp_build_const_unpack_shuffle(struct gallivm_state *gallivm,
                              unsigned n, unsigned lo_hi)
{
   LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
   unsigned i, j;

   assert(n <= LP_MAX_VECTOR_LENGTH);
   assert(lo_hi < 2);

   /* TODO: cache results in a static table */

   for(i = 0, j = lo_hi*n/2; i < n; i += 2, ++j) {
      elems[i + 0] = lp_build_const_int32(gallivm, 0 + j);
      elems[i + 1] = lp_build_const_int32(gallivm, n + j);
   }

   return LLVMConstVector(elems, n);
}

/**
 * Similar to lp_build_const_unpack_shuffle but for the special AVX 256-bit
 * unpack. See the comment above lp_build_interleave2_half for more details.
 */
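/*
 * Illustrative values (derived from the loop below): for n = 8, lo_hi = 0
 * builds the shuffle {0, 8, 1, 9, 4, 12, 5, 13}, i.e. the low halves of
 * each 128-bit lane are interleaved; lo_hi = 1 builds
 * {2, 10, 3, 11, 6, 14, 7, 15}.
 */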
static LLVMValueRef
lp_build_const_unpack_shuffle_half(struct gallivm_state *gallivm,
                                   unsigned n, unsigned lo_hi)
{
   LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
   unsigned i, j;

   assert(n <= LP_MAX_VECTOR_LENGTH);
   assert(lo_hi < 2);

   for (i = 0, j = lo_hi*(n/4); i < n; i += 2, ++j) {
      if (i == (n / 2))
         j += n / 4;

      elems[i + 0] = lp_build_const_int32(gallivm, 0 + j);
      elems[i + 1] = lp_build_const_int32(gallivm, n + j);
   }

   return LLVMConstVector(elems, n);
}

/**
 * Build shuffle vectors that match PACKxx (SSE) instructions or
 * VPERM (Altivec).
 */
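/*
 * Illustrative values (derived from the loop below): for n = 8 this builds
 * {0, 2, 4, 6, 8, 10, 12, 14} on little-endian (keeping the low half of
 * each wide element) and {1, 3, 5, 7, 9, 11, 13, 15} on big-endian.
 */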
static LLVMValueRef
lp_build_const_pack_shuffle(struct gallivm_state *gallivm, unsigned n)
{
   LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
   unsigned i;

   assert(n <= LP_MAX_VECTOR_LENGTH);

   for(i = 0; i < n; ++i)
#ifdef PIPE_ARCH_LITTLE_ENDIAN
      elems[i] = lp_build_const_int32(gallivm, 2*i);
#else
      elems[i] = lp_build_const_int32(gallivm, 2*i+1);
#endif

   return LLVMConstVector(elems, n);
}

/**
 * Return a vector with elements src[start:start+size].
 * Most useful for getting half the values out of a 256-bit sized vector;
 * other ranges may cause data rearrangement.
 */
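/*
 * Usage sketch (v is a placeholder name for an 8-element source vector):
 *
 *    lo = lp_build_extract_range(gallivm, v, 0, 4);   // elements 0..3
 *    hi = lp_build_extract_range(gallivm, v, 4, 4);   // elements 4..7
 */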
LLVMValueRef
lp_build_extract_range(struct gallivm_state *gallivm,
                       LLVMValueRef src,
                       unsigned start,
                       unsigned size)
{
   LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
   unsigned i;

   assert(size <= ARRAY_SIZE(elems));

   for (i = 0; i < size; ++i)
      elems[i] = lp_build_const_int32(gallivm, i + start);

   if (size == 1) {
      return LLVMBuildExtractElement(gallivm->builder, src, elems[0], "");
   }
   else {
      return LLVMBuildShuffleVector(gallivm->builder, src, src,
                                    LLVMConstVector(elems, size), "");
   }
}

/**
 * Concatenates several vectors (of the same type) into a larger one.
 * The number of vectors must be a power of 2.
 * Most useful for building up a 256-bit sized vector out of two 128-bit ones.
 */
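/*
 * Usage sketch (lo, hi and type4x32 are placeholder names for two 4x32
 * vectors and their lp_type):
 *
 *    LLVMValueRef src[2] = { lo, hi };
 *    v8 = lp_build_concat(gallivm, src, type4x32, 2);   // 8x32 result
 */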
LLVMValueRef
lp_build_concat(struct gallivm_state *gallivm,
                LLVMValueRef src[],
                struct lp_type src_type,
                unsigned num_vectors)
{
   unsigned new_length, i;
   LLVMValueRef tmp[LP_MAX_VECTOR_LENGTH/2];
   LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH];

   assert(src_type.length * num_vectors <= ARRAY_SIZE(shuffles));
   assert(util_is_power_of_two(num_vectors));

   new_length = src_type.length;

   for (i = 0; i < num_vectors; i++)
      tmp[i] = src[i];

   while (num_vectors > 1) {
      num_vectors >>= 1;
      new_length <<= 1;
      for (i = 0; i < new_length; i++) {
         shuffles[i] = lp_build_const_int32(gallivm, i);
      }
      for (i = 0; i < num_vectors; i++) {
         tmp[i] = LLVMBuildShuffleVector(gallivm->builder, tmp[i*2], tmp[i*2 + 1],
                                         LLVMConstVector(shuffles, new_length), "");
      }
   }

   return tmp[0];
}


/**
 * Combines vectors to reduce from num_srcs to num_dsts.
 * Returns the number of src vectors concatenated in a single dst.
 *
 * num_srcs must be exactly divisible by num_dsts.
 *
 * e.g. For num_srcs = 4 and src = [x, y, z, w]
 *          num_dsts = 1  dst = [xyzw]    return = 4
 *          num_dsts = 2  dst = [xy, zw]  return = 2
 */
int
lp_build_concat_n(struct gallivm_state *gallivm,
                  struct lp_type src_type,
                  LLVMValueRef *src,
                  unsigned num_srcs,
                  LLVMValueRef *dst,
                  unsigned num_dsts)
{
   int size = num_srcs / num_dsts;
   unsigned i;

   assert(num_srcs >= num_dsts);
   assert((num_srcs % size) == 0);

   if (num_srcs == num_dsts) {
      for (i = 0; i < num_dsts; ++i) {
         dst[i] = src[i];
      }
      return 1;
   }

   for (i = 0; i < num_dsts; ++i) {
      dst[i] = lp_build_concat(gallivm, &src[i * size], src_type, size);
   }

   return size;
}


/**
 * Un-interleave a vector.
 * This will return a vector consisting of every second element
 * (depending on lo_hi, beginning at 0 or 1).
 * The returned vector has half as many elements as the source, and thus
 * half its total bit width (the element width is unchanged).
 */
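/*
 * Illustrative values (derived from the loop below): for num_elems = 8,
 * lo_hi = 0 selects elements {0, 2, 4, 6} and lo_hi = 1 selects
 * {1, 3, 5, 7}.
 */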
LLVMValueRef
lp_build_uninterleave1(struct gallivm_state *gallivm,
                       unsigned num_elems,
                       LLVMValueRef a,
                       unsigned lo_hi)
{
   LLVMValueRef shuffle, elems[LP_MAX_VECTOR_LENGTH];
   unsigned i;
   assert(num_elems <= LP_MAX_VECTOR_LENGTH);

   for (i = 0; i < num_elems / 2; ++i)
      elems[i] = lp_build_const_int32(gallivm, 2*i + lo_hi);

   shuffle = LLVMConstVector(elems, num_elems / 2);

   return LLVMBuildShuffleVector(gallivm->builder, a, a, shuffle, "");
}


/**
 * Interleave vector elements.
 *
 * Matches the PUNPCKLxx and PUNPCKHxx SSE instructions
 * (but not for 256-bit AVX vectors).
 */
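/*
 * Illustrative values: for a = a0 a1 a2 a3 and b = b0 b1 b2 b3,
 * lo_hi = 0 yields a0 b0 a1 b1 and lo_hi = 1 yields a2 b2 a3 b3.
 */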
LLVMValueRef
lp_build_interleave2(struct gallivm_state *gallivm,
                     struct lp_type type,
                     LLVMValueRef a,
                     LLVMValueRef b,
                     unsigned lo_hi)
{
   LLVMValueRef shuffle;

   if (type.length == 2 && type.width == 128 && util_cpu_caps.has_avx) {
      /*
       * XXX: This is a workaround for an llvm code generation deficiency.
       * Strangely enough, while this needs vinsertf128/vextractf128
       * instructions (hence a natural match when using 2x128bit vectors) the
       * "normal" unpack shuffle generates code ranging from atrocious (llvm
       * 3.1) to terrible (llvm 3.2, 3.3). So use some different shuffles
       * instead (the exact shuffles don't seem to matter, as long as they
       * avoid 128-bit wide elements; both 8x32 and 4x64 work).
       */
      struct lp_type tmp_type = type;
      LLVMValueRef srchalf[2], tmpdst;
      tmp_type.length = 4;
      tmp_type.width = 64;
      a = LLVMBuildBitCast(gallivm->builder, a, lp_build_vec_type(gallivm, tmp_type), "");
      b = LLVMBuildBitCast(gallivm->builder, b, lp_build_vec_type(gallivm, tmp_type), "");
      srchalf[0] = lp_build_extract_range(gallivm, a, lo_hi * 2, 2);
      srchalf[1] = lp_build_extract_range(gallivm, b, lo_hi * 2, 2);
      tmp_type.length = 2;
      tmpdst = lp_build_concat(gallivm, srchalf, tmp_type, 2);
      return LLVMBuildBitCast(gallivm->builder, tmpdst, lp_build_vec_type(gallivm, type), "");
   }

   shuffle = lp_build_const_unpack_shuffle(gallivm, type.length, lo_hi);

   return LLVMBuildShuffleVector(gallivm->builder, a, b, shuffle, "");
}

/**
 * Interleave vector elements, but for 256-bit vectors treat the inputs as
 * two concatenated 128-bit vectors and interleave those.
 *
 * This differs from lp_build_interleave2, which would produce (for lo):
 * a0 b0 a1 b1 a2 b2 a3 b3 -- a pattern that does not compile into an AVX
 * unpack instruction.
 *
 * An example: interleave 8x float with 8x float on an AVX 256-bit unpack:
 *   a0 a1 a2 a3 a4 a5 a6 a7 <-> b0 b1 b2 b3 b4 b5 b6 b7
 *
 * Equivalent to interleaving 2x 128-bit vectors:
 *   a0 a1 a2 a3 <-> b0 b1 b2 b3 concatenated with a4 a5 a6 a7 <-> b4 b5 b6 b7
 *
 * So interleave-lo would result in:
 *   a0 b0 a1 b1 a4 b4 a5 b5
 *
 * And interleave-hi would result in:
 *   a2 b2 a3 b3 a6 b6 a7 b7
 */
LLVMValueRef
lp_build_interleave2_half(struct gallivm_state *gallivm,
                          struct lp_type type,
                          LLVMValueRef a,
                          LLVMValueRef b,
                          unsigned lo_hi)
{
   if (type.length * type.width == 256) {
      LLVMValueRef shuffle = lp_build_const_unpack_shuffle_half(gallivm, type.length, lo_hi);
      return LLVMBuildShuffleVector(gallivm->builder, a, b, shuffle, "");
   } else {
      return lp_build_interleave2(gallivm, type, a, b, lo_hi);
   }
}


/**
 * Double the bit width.
 *
 * This only changes the number of bits in which the values are represented,
 * not the values themselves.
 */
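/*
 * Usage sketch (v, lo, hi, type8x16 and type4x32 are placeholder names;
 * the types are an 8x16 source and a 4x32 destination lp_type):
 *
 *    lp_build_unpack2(gallivm, type8x16, type4x32, v, &lo, &hi);
 *
 * lo and hi each hold four 32-bit values, sign- or zero-extended
 * depending on the types' sign flags.
 */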
void
lp_build_unpack2(struct gallivm_state *gallivm,
                 struct lp_type src_type,
                 struct lp_type dst_type,
                 LLVMValueRef src,
                 LLVMValueRef *dst_lo,
                 LLVMValueRef *dst_hi)
{
   LLVMBuilderRef builder = gallivm->builder;
   LLVMValueRef msb;
   LLVMTypeRef dst_vec_type;

   assert(!src_type.floating);
   assert(!dst_type.floating);
   assert(dst_type.width == src_type.width * 2);
   assert(dst_type.length * 2 == src_type.length);

   if(dst_type.sign && src_type.sign) {
      /* Replicate the sign bit in the most significant bits */
      msb = LLVMBuildAShr(builder, src, lp_build_const_int_vec(gallivm, src_type, src_type.width - 1), "");
   }
   else
      /* Most significant bits always zero */
      msb = lp_build_zero(gallivm, src_type);

   /* Interleave bits */
#ifdef PIPE_ARCH_LITTLE_ENDIAN
   *dst_lo = lp_build_interleave2(gallivm, src_type, src, msb, 0);
   *dst_hi = lp_build_interleave2(gallivm, src_type, src, msb, 1);

#else
   *dst_lo = lp_build_interleave2(gallivm, src_type, msb, src, 0);
   *dst_hi = lp_build_interleave2(gallivm, src_type, msb, src, 1);
#endif

   /* Cast the result into the new type (twice as wide) */

   dst_vec_type = lp_build_vec_type(gallivm, dst_type);

   *dst_lo = LLVMBuildBitCast(builder, *dst_lo, dst_vec_type, "");
   *dst_hi = LLVMBuildBitCast(builder, *dst_hi, dst_vec_type, "");
}


/**
 * Double the bit width, with an order that fits the CPU nicely.
 *
 * This only changes the number of bits in which the values are represented,
 * not the values themselves.
 *
 * The order of the results is not guaranteed, other than that it will match
 * the corresponding lp_build_pack2_native call.
 */
void
lp_build_unpack2_native(struct gallivm_state *gallivm,
                        struct lp_type src_type,
                        struct lp_type dst_type,
                        LLVMValueRef src,
                        LLVMValueRef *dst_lo,
                        LLVMValueRef *dst_hi)
{
   LLVMBuilderRef builder = gallivm->builder;
   LLVMValueRef msb;
   LLVMTypeRef dst_vec_type;

   assert(!src_type.floating);
   assert(!dst_type.floating);
   assert(dst_type.width == src_type.width * 2);
   assert(dst_type.length * 2 == src_type.length);

   if(dst_type.sign && src_type.sign) {
      /* Replicate the sign bit in the most significant bits */
      msb = LLVMBuildAShr(builder, src,
               lp_build_const_int_vec(gallivm, src_type, src_type.width - 1), "");
   }
   else
      /* Most significant bits always zero */
      msb = lp_build_zero(gallivm, src_type);

   /* Interleave bits */
#ifdef PIPE_ARCH_LITTLE_ENDIAN
   if (src_type.length * src_type.width == 256 && util_cpu_caps.has_avx2) {
      *dst_lo = lp_build_interleave2_half(gallivm, src_type, src, msb, 0);
      *dst_hi = lp_build_interleave2_half(gallivm, src_type, src, msb, 1);
   } else {
      *dst_lo = lp_build_interleave2(gallivm, src_type, src, msb, 0);
      *dst_hi = lp_build_interleave2(gallivm, src_type, src, msb, 1);
   }
#else
   *dst_lo = lp_build_interleave2(gallivm, src_type, msb, src, 0);
   *dst_hi = lp_build_interleave2(gallivm, src_type, msb, src, 1);
#endif

   /* Cast the result into the new type (twice as wide) */

   dst_vec_type = lp_build_vec_type(gallivm, dst_type);

   *dst_lo = LLVMBuildBitCast(builder, *dst_lo, dst_vec_type, "");
   *dst_hi = LLVMBuildBitCast(builder, *dst_hi, dst_vec_type, "");
}


/**
 * Expand the bit width.
 *
 * This only changes the number of bits in which the values are represented,
 * not the values themselves.
 */
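/*
 * Usage sketch (v, type16x8 and type4x32 are placeholder names; the types
 * are a 16x8 source and a 4x32 destination lp_type):
 *
 *    LLVMValueRef dst[4];
 *    lp_build_unpack(gallivm, type16x8, type4x32, v, dst, 4);
 *
 * Each pass of the loop doubles the width, so 8 -> 16 -> 32 takes two
 * lp_build_unpack2 rounds and produces four 4x32 vectors.
 */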
void
lp_build_unpack(struct gallivm_state *gallivm,
                struct lp_type src_type,
                struct lp_type dst_type,
                LLVMValueRef src,
                LLVMValueRef *dst, unsigned num_dsts)
{
   unsigned num_tmps;
   unsigned i;

   /* Register width must remain constant */
   assert(src_type.width * src_type.length == dst_type.width * dst_type.length);

   /* We must not lose or gain channels. Only precision */
   assert(src_type.length == dst_type.length * num_dsts);

   num_tmps = 1;
   dst[0] = src;

   while(src_type.width < dst_type.width) {
      struct lp_type tmp_type = src_type;

      tmp_type.width *= 2;
      tmp_type.length /= 2;

      for(i = num_tmps; i--; ) {
         lp_build_unpack2(gallivm, src_type, tmp_type, dst[i], &dst[2*i + 0],
                          &dst[2*i + 1]);
      }

      src_type = tmp_type;

      num_tmps *= 2;
   }

   assert(num_tmps == num_dsts);
}


/**
 * Non-interleaved pack.
 *
 * This will move values as:
 *         (LSB)                     (MSB)
 *   lo =   l0 __ l1 __ l2 __..  __ ln __
 *   hi =   h0 __ h1 __ h2 __..  __ hn __
 *   res =  l0 l1 l2 .. ln h0 h1 h2 .. hn
 *
 * This only changes the number of bits in which the values are represented,
 * not the values themselves.
 *
 * It is assumed the values are already clamped into the destination type
 * range. Values outside that range will produce undefined results. Use
 * lp_build_packs2 instead if the values may be out of range.
 */
LLVMValueRef
lp_build_pack2(struct gallivm_state *gallivm,
               struct lp_type src_type,
               struct lp_type dst_type,
               LLVMValueRef lo,
               LLVMValueRef hi)
{
   LLVMBuilderRef builder = gallivm->builder;
   LLVMTypeRef dst_vec_type = lp_build_vec_type(gallivm, dst_type);
   LLVMValueRef shuffle;
   LLVMValueRef res = NULL;
   struct lp_type intr_type = dst_type;

   assert(!src_type.floating);
   assert(!dst_type.floating);
   assert(src_type.width == dst_type.width * 2);
   assert(src_type.length * 2 == dst_type.length);

   /* Check for special cases first */
   if ((util_cpu_caps.has_sse2 || util_cpu_caps.has_altivec) &&
        src_type.width * src_type.length >= 128) {
      const char *intrinsic = NULL;
      boolean swap_intrinsic_operands = FALSE;

      switch(src_type.width) {
      case 32:
         if (util_cpu_caps.has_sse2) {
           if (dst_type.sign) {
              intrinsic = "llvm.x86.sse2.packssdw.128";
           } else {
              if (util_cpu_caps.has_sse4_1) {
                 intrinsic = "llvm.x86.sse41.packusdw";
              }
           }
         } else if (util_cpu_caps.has_altivec) {
            if (dst_type.sign) {
               intrinsic = "llvm.ppc.altivec.vpkswss";
            } else {
               intrinsic = "llvm.ppc.altivec.vpkuwus";
            }
#ifdef PIPE_ARCH_LITTLE_ENDIAN
            swap_intrinsic_operands = TRUE;
#endif
         }
         break;
      case 16:
         if (dst_type.sign) {
            if (util_cpu_caps.has_sse2) {
               intrinsic = "llvm.x86.sse2.packsswb.128";
            } else if (util_cpu_caps.has_altivec) {
               intrinsic = "llvm.ppc.altivec.vpkshss";
#ifdef PIPE_ARCH_LITTLE_ENDIAN
               swap_intrinsic_operands = TRUE;
#endif
            }
         } else {
            if (util_cpu_caps.has_sse2) {
               intrinsic = "llvm.x86.sse2.packuswb.128";
            } else if (util_cpu_caps.has_altivec) {
               intrinsic = "llvm.ppc.altivec.vpkshus";
#ifdef PIPE_ARCH_LITTLE_ENDIAN
               swap_intrinsic_operands = TRUE;
#endif
            }
         }
         break;
      /* default uses generic shuffle below */
      }
      if (intrinsic) {
         if (src_type.width * src_type.length == 128) {
            LLVMTypeRef intr_vec_type = lp_build_vec_type(gallivm, intr_type);
            if (swap_intrinsic_operands) {
               res = lp_build_intrinsic_binary(builder, intrinsic, intr_vec_type, hi, lo);
            } else {
               res = lp_build_intrinsic_binary(builder, intrinsic, intr_vec_type, lo, hi);
            }
            if (dst_vec_type != intr_vec_type) {
               res = LLVMBuildBitCast(builder, res, dst_vec_type, "");
            }
         }
         else {
            int num_split = src_type.width * src_type.length / 128;
            int i;
            int nlen = 128 / src_type.width;
            int lo_off = swap_intrinsic_operands ? nlen : 0;
            int hi_off = swap_intrinsic_operands ? 0 : nlen;
            struct lp_type ndst_type = lp_type_unorm(dst_type.width, 128);
            struct lp_type nintr_type = lp_type_unorm(intr_type.width, 128);
            LLVMValueRef tmpres[LP_MAX_VECTOR_WIDTH / 128];
            LLVMValueRef tmplo, tmphi;
            LLVMTypeRef ndst_vec_type = lp_build_vec_type(gallivm, ndst_type);
            LLVMTypeRef nintr_vec_type = lp_build_vec_type(gallivm, nintr_type);

            assert(num_split <= LP_MAX_VECTOR_WIDTH / 128);

            for (i = 0; i < num_split / 2; i++) {
               tmplo = lp_build_extract_range(gallivm,
                                              lo, i*nlen*2 + lo_off, nlen);
               tmphi = lp_build_extract_range(gallivm,
                                              lo, i*nlen*2 + hi_off, nlen);
               tmpres[i] = lp_build_intrinsic_binary(builder, intrinsic,
                                                     nintr_vec_type, tmplo, tmphi);
               if (ndst_vec_type != nintr_vec_type) {
                  tmpres[i] = LLVMBuildBitCast(builder, tmpres[i], ndst_vec_type, "");
               }
            }
            for (i = 0; i < num_split / 2; i++) {
               tmplo = lp_build_extract_range(gallivm,
                                              hi, i*nlen*2 + lo_off, nlen);
               tmphi = lp_build_extract_range(gallivm,
                                              hi, i*nlen*2 + hi_off, nlen);
               tmpres[i+num_split/2] = lp_build_intrinsic_binary(builder, intrinsic,
                                                                 nintr_vec_type,
                                                                 tmplo, tmphi);
               if (ndst_vec_type != nintr_vec_type) {
                  tmpres[i+num_split/2] = LLVMBuildBitCast(builder, tmpres[i+num_split/2],
                                                           ndst_vec_type, "");
               }
            }
            res = lp_build_concat(gallivm, tmpres, ndst_type, num_split);
         }
         return res;
      }
   }

   /* generic shuffle */
   lo = LLVMBuildBitCast(builder, lo, dst_vec_type, "");
   hi = LLVMBuildBitCast(builder, hi, dst_vec_type, "");

   shuffle = lp_build_const_pack_shuffle(gallivm, dst_type.length);

   res = LLVMBuildShuffleVector(builder, lo, hi, shuffle, "");

   return res;
}


/**
 * Non-interleaved native pack.
 *
 * Similar to lp_build_pack2, but the ordering of values is not
 * guaranteed, other than that it will match lp_build_unpack2_native.
 *
 * In particular, with avx2, the lower and upper 128 bits of the vectors will
 * be packed independently, so that (with 32bit->16bit values)
 *         (LSB)                                       (MSB)
 *   lo =   l0 __ l1 __ l2 __ l3 __ l4 __ l5 __ l6 __ l7 __
 *   hi =   h0 __ h1 __ h2 __ h3 __ h4 __ h5 __ h6 __ h7 __
 *   res =  l0 l1 l2 l3 h0 h1 h2 h3 l4 l5 l6 l7 h4 h5 h6 h7
 *
 * This only changes the number of bits in which the values are represented,
 * not the values themselves.
 *
 * It is assumed the values are already clamped into the destination type
 * range. Values outside that range will produce undefined results.
 */
LLVMValueRef
lp_build_pack2_native(struct gallivm_state *gallivm,
                      struct lp_type src_type,
                      struct lp_type dst_type,
                      LLVMValueRef lo,
                      LLVMValueRef hi)
{
   LLVMBuilderRef builder = gallivm->builder;
   struct lp_type intr_type = dst_type;
   const char *intrinsic = NULL;

   assert(!src_type.floating);
   assert(!dst_type.floating);
   assert(src_type.width == dst_type.width * 2);
   assert(src_type.length * 2 == dst_type.length);

   /* At this point the only special case is avx2 */
   if (src_type.length * src_type.width == 256 &&
       util_cpu_caps.has_avx2) {
      switch(src_type.width) {
      case 32:
         if (dst_type.sign) {
            intrinsic = "llvm.x86.avx2.packssdw";
         } else {
            intrinsic = "llvm.x86.avx2.packusdw";
         }
         break;
      case 16:
         if (dst_type.sign) {
            intrinsic = "llvm.x86.avx2.packsswb";
         } else {
            intrinsic = "llvm.x86.avx2.packuswb";
         }
         break;
      }
   }
   if (intrinsic) {
      LLVMTypeRef intr_vec_type = lp_build_vec_type(gallivm, intr_type);
      return lp_build_intrinsic_binary(builder, intrinsic, intr_vec_type,
                                       lo, hi);
   }
   else {
      return lp_build_pack2(gallivm, src_type, dst_type, lo, hi);
   }
}

/**
 * Non-interleaved pack and saturate.
 *
 * Same as lp_build_pack2, but will saturate values so that they fit into the
 * destination type.
 */
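/*
 * Note (illustrative): the SSE/Altivec pack instructions saturate signed
 * inputs in hardware, e.g. signed 32-bit to signed 16-bit clamps to
 * [-32768, 32767]. The generic clamp in the function below only enforces
 * the upper bound (see the FIXME there).
 */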
LLVMValueRef
lp_build_packs2(struct gallivm_state *gallivm,
                struct lp_type src_type,
                struct lp_type dst_type,
                LLVMValueRef lo,
                LLVMValueRef hi)
{
   boolean clamp;

   assert(!src_type.floating);
   assert(!dst_type.floating);
   assert(src_type.sign == dst_type.sign);
   assert(src_type.width == dst_type.width * 2);
   assert(src_type.length * 2 == dst_type.length);

   clamp = TRUE;

   /* All X86 SSE non-interleaved pack instructions take signed inputs and
    * saturate them, so no need to clamp for those cases. */
   if(util_cpu_caps.has_sse2 &&
      src_type.width * src_type.length >= 128 &&
      src_type.sign &&
      (src_type.width == 32 || src_type.width == 16))
      clamp = FALSE;

   if(clamp) {
      struct lp_build_context bld;
      unsigned dst_bits = dst_type.sign ? dst_type.width - 1 : dst_type.width;
      LLVMValueRef dst_max = lp_build_const_int_vec(gallivm, src_type,
                                ((unsigned long long)1 << dst_bits) - 1);
      lp_build_context_init(&bld, gallivm, src_type);
      lo = lp_build_min(&bld, lo, dst_max);
      hi = lp_build_min(&bld, hi, dst_max);
      /* FIXME: What about the lower bound? */
   }

   return lp_build_pack2(gallivm, src_type, dst_type, lo, hi);
}


/**
 * Truncate the bit width.
 *
 * TODO: Handle saturation consistently.
 */
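/*
 * Usage sketch (a..d, type4x32 and type16x8 are placeholder names; the
 * inputs are four 4x32 vectors holding already-clamped values):
 *
 *    LLVMValueRef src[4] = { a, b, c, d };
 *    res = lp_build_pack(gallivm, type4x32, type16x8, TRUE, src, 4);
 *
 * Two halving steps (32 -> 16 -> 8) reduce the four inputs to a single
 * 16x8 vector.
 */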
LLVMValueRef
lp_build_pack(struct gallivm_state *gallivm,
              struct lp_type src_type,
              struct lp_type dst_type,
              boolean clamped,
              const LLVMValueRef *src, unsigned num_srcs)
{
   LLVMValueRef (*pack2)(struct gallivm_state *gallivm,
                         struct lp_type src_type,
                         struct lp_type dst_type,
                         LLVMValueRef lo,
                         LLVMValueRef hi);
   LLVMValueRef tmp[LP_MAX_VECTOR_LENGTH];
   unsigned i;

   /* Register width must remain constant */
   assert(src_type.width * src_type.length == dst_type.width * dst_type.length);

   /* We must not lose or gain channels. Only precision */
   assert(src_type.length * num_srcs == dst_type.length);

   if(clamped)
      pack2 = &lp_build_pack2;
   else
      pack2 = &lp_build_packs2;

   for(i = 0; i < num_srcs; ++i)
      tmp[i] = src[i];

   while(src_type.width > dst_type.width) {
      struct lp_type tmp_type = src_type;

      tmp_type.width /= 2;
      tmp_type.length *= 2;

      /* Take the sign changes into consideration only in the last step */
      if(tmp_type.width == dst_type.width)
         tmp_type.sign = dst_type.sign;

      num_srcs /= 2;

      for(i = 0; i < num_srcs; ++i)
         tmp[i] = pack2(gallivm, src_type, tmp_type,
                        tmp[2*i + 0], tmp[2*i + 1]);

      src_type = tmp_type;
   }

   assert(num_srcs == 1);

   return tmp[0];
}


/**
 * Truncate or expand the bitwidth.
 *
 * NOTE: Getting the right sign flags is crucial here, as we employ some
 * intrinsics that do saturation.
 */
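/*
 * Usage sketch (a, b, type4x32 and type8x16 are placeholder names;
 * truncating two 4x32 vectors into one 8x16 vector):
 *
 *    LLVMValueRef src[2] = { a, b }, dst[1];
 *    lp_build_resize(gallivm, type4x32, type8x16, src, 2, dst, 1);
 */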
void
lp_build_resize(struct gallivm_state *gallivm,
                struct lp_type src_type,
                struct lp_type dst_type,
                const LLVMValueRef *src, unsigned num_srcs,
                LLVMValueRef *dst, unsigned num_dsts)
{
   LLVMBuilderRef builder = gallivm->builder;
   LLVMValueRef tmp[LP_MAX_VECTOR_LENGTH];
   unsigned i;

   /*
    * We don't support float <-> int conversion here. That must be done
    * before/after calling this function.
    */
   assert(src_type.floating == dst_type.floating);

   /*
    * We don't support double <-> float conversion yet, although it could be
    * added with little effort.
    */
   assert((!src_type.floating && !dst_type.floating) ||
          src_type.width == dst_type.width);

   /* We must not lose or gain channels. Only precision */
   assert(src_type.length * num_srcs == dst_type.length * num_dsts);

   assert(src_type.length <= LP_MAX_VECTOR_LENGTH);
   assert(dst_type.length <= LP_MAX_VECTOR_LENGTH);
   assert(num_srcs <= LP_MAX_VECTOR_LENGTH);
   assert(num_dsts <= LP_MAX_VECTOR_LENGTH);

   if (src_type.width > dst_type.width) {
      /*
       * Truncate bit width.
       */

      /* Conversion must be M:1 */
      assert(num_dsts == 1);

      if (src_type.width * src_type.length == dst_type.width * dst_type.length) {
        /*
         * Register width remains constant -- use vector packing intrinsics
         */
         tmp[0] = lp_build_pack(gallivm, src_type, dst_type, TRUE, src, num_srcs);
      }
      else {
         if (src_type.width / dst_type.width > num_srcs) {
            /*
            * First change src vectors size (with shuffle) so they have the
            * same size as the destination vector, then pack normally.
            * Note: cannot use cast/extract because llvm generates atrocious code.
            */
            unsigned size_ratio = (src_type.width * src_type.length) /
                                  (dst_type.length * dst_type.width);
            unsigned new_length = src_type.length / size_ratio;

            for (i = 0; i < size_ratio * num_srcs; i++) {
               unsigned start_index = (i % size_ratio) * new_length;
               tmp[i] = lp_build_extract_range(gallivm, src[i / size_ratio],
                                               start_index, new_length);
            }
            num_srcs *= size_ratio;
            src_type.length = new_length;
            tmp[0] = lp_build_pack(gallivm, src_type, dst_type, TRUE, tmp, num_srcs);
         }
         else {
            /*
             * Truncate bit width but expand vector size - first pack
             * then expand simply because this should be more AVX-friendly
             * for the cases we probably hit.
             */
            unsigned size_ratio = (dst_type.width * dst_type.length) /
                                  (src_type.length * src_type.width);
            unsigned num_pack_srcs = num_srcs / size_ratio;
            dst_type.length = dst_type.length / size_ratio;

            for (i = 0; i < size_ratio; i++) {
               tmp[i] = lp_build_pack(gallivm, src_type, dst_type, TRUE,
                                      &src[i*num_pack_srcs], num_pack_srcs);
            }
            tmp[0] = lp_build_concat(gallivm, tmp, dst_type, size_ratio);
         }
      }
   }
   else if (src_type.width < dst_type.width) {
      /*
       * Expand bit width.
       */

      /* Conversion must be 1:N */
      assert(num_srcs == 1);

      if (src_type.width * src_type.length == dst_type.width * dst_type.length) {
         /*
          * Register width remains constant -- use vector unpack intrinsics
          */
         lp_build_unpack(gallivm, src_type, dst_type, src[0], tmp, num_dsts);
      }
      else {
         /*
          * Do it element-wise.
          */
         assert(src_type.length * num_srcs == dst_type.length * num_dsts);

         for (i = 0; i < num_dsts; i++) {
            tmp[i] = lp_build_undef(gallivm, dst_type);
         }

         for (i = 0; i < src_type.length; ++i) {
            unsigned j = i / dst_type.length;
            LLVMValueRef srcindex = lp_build_const_int32(gallivm, i);
            LLVMValueRef dstindex = lp_build_const_int32(gallivm, i % dst_type.length);
            LLVMValueRef val = LLVMBuildExtractElement(builder, src[0], srcindex, "");

            if (src_type.sign && dst_type.sign) {
               val = LLVMBuildSExt(builder, val, lp_build_elem_type(gallivm, dst_type), "");
            } else {
               val = LLVMBuildZExt(builder, val, lp_build_elem_type(gallivm, dst_type), "");
            }
            tmp[j] = LLVMBuildInsertElement(builder, tmp[j], val, dstindex, "");
         }
      }
   }
   else {
      /*
       * No-op
       */

      /* "Conversion" must be N:N */
      assert(num_srcs == num_dsts);

      for(i = 0; i < num_dsts; ++i)
         tmp[i] = src[i];
   }

   for(i = 0; i < num_dsts; ++i)
      dst[i] = tmp[i];
}


/**
 * Expands the src vector from src.length elements to dst_length elements.
 */
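/*
 * Illustrative: padding a 4-element vector to length 8 keeps elements
 * 0..3 and fills elements 4..7 with undefined values.
 */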
LLVMValueRef
lp_build_pad_vector(struct gallivm_state *gallivm,
                    LLVMValueRef src,
                    unsigned dst_length)
{
   LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
   LLVMValueRef undef;
   LLVMTypeRef type;
   unsigned i, src_length;

   type = LLVMTypeOf(src);

   if (LLVMGetTypeKind(type) != LLVMVectorTypeKind) {
      /* Can't use ShuffleVector on a non-vector type */
      undef = LLVMGetUndef(LLVMVectorType(type, dst_length));
      return LLVMBuildInsertElement(gallivm->builder, undef, src, lp_build_const_int32(gallivm, 0), "");
   }

   undef      = LLVMGetUndef(type);
   src_length = LLVMGetVectorSize(type);

   assert(dst_length <= ARRAY_SIZE(elems));
   assert(dst_length >= src_length);

   if (src_length == dst_length)
      return src;

   /* All elements from the src vector */
   for (i = 0; i < src_length; ++i)
      elems[i] = lp_build_const_int32(gallivm, i);

   /* Fill the remaining space with undef (index src_length selects from
    * the undef operand of the shuffle below) */
   for (i = src_length; i < dst_length; ++i)
      elems[i] = lp_build_const_int32(gallivm, src_length);

   /* Combine the two vectors */
   return LLVMBuildShuffleVector(gallivm->builder, src, undef, LLVMConstVector(elems, dst_length), "");
}