1 /**************************************************************************
2  *
3  * Copyright 2009 VMware, Inc.
4  * All Rights Reserved.
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a
7  * copy of this software and associated documentation files (the
8  * "Software"), to deal in the Software without restriction, including
9  * without limitation the rights to use, copy, modify, merge, publish,
10  * distribute, sub license, and/or sell copies of the Software, and to
11  * permit persons to whom the Software is furnished to do so, subject to
12  * the following conditions:
13  *
14  * The above copyright notice and this permission notice (including the
15  * next paragraph) shall be included in all copies or substantial portions
16  * of the Software.
17  *
18  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21  * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22  * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25  *
26  **************************************************************************/
27 
28 
29 /**
30  * @file
31  * Helper functions for type conversions.
32  *
33  * We want to use the fastest type for a given computation whenever feasible.
34  * The other side of this is that we need to be able to convert between several
35  * types accurately and efficiently.
36  *
37  * Conversion between types of different bit width is quite complex.
38  *
39  * There are a few invariants to remember in type conversions:
40  *
41  * - register width must remain constant:
42  *
43  *     src_type.width * src_type.length == dst_type.width * dst_type.length
44  *
45  * - total number of elements must remain constant:
46  *
47  *     src_type.length * num_srcs == dst_type.length * num_dsts
48  *
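 * For example, converting 4 vectors of 4 x float32 into a single vector of
 * 16 x unorm8 satisfies both: 32 * 4 == 8 * 16 and 4 * 4 == 16 * 1.
 *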
49  * It is not always possible to do the conversion both accurately and
50  * efficiently, usually due to lack of adequate machine instructions. In these
51  * cases it is important not to take shortcuts and sacrifice accuracy, as
52  * these functions can be used anywhere. In the future we might have a
53  * precision parameter to gauge the accuracy vs. efficiency compromise,
54  * but for now, if the data conversion between two stages happens to be the
55  * bottleneck, it is most likely best to avoid converting at all and run
56  * both stages with the same type.
57  *
58  * Make sure to run lp_test_conv unit test after any change to this file.
59  *
60  * @author Jose Fonseca <jfonseca@vmware.com>
61  */
62 
63 
64 #include "util/u_debug.h"
65 #include "util/u_math.h"
66 #include "util/half_float.h"
67 #include "util/u_cpu_detect.h"
68 
69 #include "lp_bld_type.h"
70 #include "lp_bld_const.h"
71 #include "lp_bld_arit.h"
72 #include "lp_bld_bitarit.h"
73 #include "lp_bld_pack.h"
74 #include "lp_bld_conv.h"
75 #include "lp_bld_logic.h"
76 #include "lp_bld_intr.h"
77 #include "lp_bld_printf.h"
78 #include "lp_bld_format.h"
79 
80 
81 
82 /**
83  * Converts int16 half-float to float32
84  * Note this can be performed in 1 instruction if vcvtph2ps exists (f16c/cvt16)
85  * [llvm.x86.vcvtph2ps / _mm_cvtph_ps]
86  *
87  * @param src           value to convert
88  *
89  */
90 LLVMValueRef
91 lp_build_half_to_float(struct gallivm_state *gallivm,
92                        LLVMValueRef src)
93 {
94    LLVMBuilderRef builder = gallivm->builder;
95    LLVMTypeRef src_type = LLVMTypeOf(src);
96    unsigned src_length = LLVMGetTypeKind(src_type) == LLVMVectorTypeKind ?
97                             LLVMGetVectorSize(src_type) : 1;
98 
99    struct lp_type f32_type = lp_type_float_vec(32, 32 * src_length);
100    struct lp_type i32_type = lp_type_int_vec(32, 32 * src_length);
101    LLVMTypeRef int_vec_type = lp_build_vec_type(gallivm, i32_type);
102    LLVMValueRef h;
103 
104    if (util_cpu_caps.has_f16c &&
105        (src_length == 4 || src_length == 8)) {
106       if (LLVM_VERSION_MAJOR < 11) {
107          const char *intrinsic = NULL;
108          if (src_length == 4) {
109             src = lp_build_pad_vector(gallivm, src, 8);
110             intrinsic = "llvm.x86.vcvtph2ps.128";
111          }
112          else {
113             intrinsic = "llvm.x86.vcvtph2ps.256";
114          }
115          return lp_build_intrinsic_unary(builder, intrinsic,
116                                          lp_build_vec_type(gallivm, f32_type), src);
117       } else {
118          /*
119           * XXX: could probably use on other archs as well.
120           * But if the cpu doesn't support it natively it looks like the backends still
121           * can't lower it and will try to call out to external libraries, which will crash.
122           */
123          /*
124           * XXX: lp_build_vec_type() would use int16 vector. Probably need to revisit
125           * this at some point.
126           */
127          src = LLVMBuildBitCast(builder, src,
128                                 LLVMVectorType(LLVMHalfTypeInContext(gallivm->context), src_length), "");
129          return LLVMBuildFPExt(builder, src, lp_build_vec_type(gallivm, f32_type), "");
130       }
131    }
132 
133    h = LLVMBuildZExt(builder, src, int_vec_type, "");
134    return lp_build_smallfloat_to_float(gallivm, f32_type, h, 10, 5, 0, true);
135 }
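
#if 0
/*
 * For reference only: a scalar sketch of what the non-f16c fallback above
 * computes (a half float has 1 sign, 5 exponent and 10 mantissa bits).
 * The name half_to_float_ref is hypothetical and not part of gallivm;
 * assumes IEEE-754 binary32 and <stdint.h>.
 */
static inline float
half_to_float_ref(uint16_t h)
{
   union { uint32_t u; float f; } magic   = { (254u - 15u) << 23 };  /* 2^112 */
   union { uint32_t u; float f; } inf_nan = { (127u + 16u) << 23 };  /* 2^16  */
   union { uint32_t u; float f; } o;

   o.u = (uint32_t)(h & 0x7fff) << 13;  /* move exponent/mantissa bits in place */
   o.f *= magic.f;                      /* rebias the exponent (15 -> 127) */
   if (o.f >= inf_nan.f)
      o.u |= 255u << 23;                /* re-create Inf/NaN */
   o.u |= (uint32_t)(h & 0x8000) << 16; /* copy the sign bit */
   return o.f;
}
#endif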
136 
137 
138 /**
139  * Converts float32 to int16 half-float
140  * Note this can be performed in 1 instruction if vcvtps2ph exists (f16c/cvt16)
141  * [llvm.x86.vcvtps2ph / _mm_cvtps_ph]
142  *
143  * @param src           value to convert
144  *
145  * Convert float32 to half floats, preserving Infs and NaNs,
146  * with rounding towards zero (trunc).
147  * XXX: For GL, would prefer rounding towards nearest(-even).
148  */
149 LLVMValueRef
150 lp_build_float_to_half(struct gallivm_state *gallivm,
151                        LLVMValueRef src)
152 {
153    LLVMBuilderRef builder = gallivm->builder;
154    LLVMTypeRef f32_vec_type = LLVMTypeOf(src);
155    unsigned length = LLVMGetTypeKind(f32_vec_type) == LLVMVectorTypeKind
156                    ? LLVMGetVectorSize(f32_vec_type) : 1;
157    struct lp_type i32_type = lp_type_int_vec(32, 32 * length);
158    struct lp_type i16_type = lp_type_int_vec(16, 16 * length);
159    LLVMValueRef result;
160 
161    /*
162     * Note: Newer llvm versions (3.6 or so) support fptrunc to 16 bits
163     * directly, without any (x86 or generic) intrinsics.
164     * However, the rounding mode cannot be specified (and is undefined;
165     * in practice on x86 it seems to do nearest-even, but that may
166     * depend on instruction set support), so it is essentially
167     * useless.
168     */
169 
170    if (util_cpu_caps.has_f16c &&
171        (length == 4 || length == 8)) {
172       struct lp_type i168_type = lp_type_int_vec(16, 16 * 8);
173       unsigned mode = 3; /* same as LP_BUILD_ROUND_TRUNCATE */
174       LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context);
175       const char *intrinsic = NULL;
176       if (length == 4) {
177          intrinsic = "llvm.x86.vcvtps2ph.128";
178       }
179       else {
180          intrinsic = "llvm.x86.vcvtps2ph.256";
181       }
182       result = lp_build_intrinsic_binary(builder, intrinsic,
183                                          lp_build_vec_type(gallivm, i168_type),
184                                          src, LLVMConstInt(i32t, mode, 0));
185       if (length == 4) {
186          result = lp_build_extract_range(gallivm, result, 0, 4);
187       }
188    }
189 
190    else {
191       result = lp_build_float_to_smallfloat(gallivm, i32_type, src, 10, 5, 0, true);
192       /* Convert int32 vector to int16 vector by trunc (might generate bad code) */
193       result = LLVMBuildTrunc(builder, result, lp_build_vec_type(gallivm, i16_type), "");
194    }
195 
196    /*
197     * Debugging code.
198     */
199    if (0) {
200      LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context);
201      LLVMTypeRef i16t = LLVMInt16TypeInContext(gallivm->context);
202      LLVMTypeRef f32t = LLVMFloatTypeInContext(gallivm->context);
203      LLVMValueRef ref_result = LLVMGetUndef(LLVMVectorType(i16t, length));
204      unsigned i;
205 
206      LLVMTypeRef func_type = LLVMFunctionType(i16t, &f32t, 1, 0);
207      LLVMValueRef func = lp_build_const_int_pointer(gallivm, func_to_pointer((func_pointer)_mesa_float_to_half));
208      func = LLVMBuildBitCast(builder, func, LLVMPointerType(func_type, 0), "_mesa_float_to_half");
209 
210      for (i = 0; i < length; ++i) {
211         LLVMValueRef index = LLVMConstInt(i32t, i, 0);
212         LLVMValueRef f32 = LLVMBuildExtractElement(builder, src, index, "");
213 #if 0
214         /*
215          * XXX: not really supported by backends.
216          * Even if they would now, rounding mode cannot be specified and
217          * is undefined.
218          */
219         LLVMValueRef f16 = lp_build_intrinsic_unary(builder, "llvm.convert.to.fp16", i16t, f32);
220 #else
221         LLVMValueRef f16 = LLVMBuildCall(builder, func, &f32, 1, "");
222 #endif
223         ref_result = LLVMBuildInsertElement(builder, ref_result, f16, index, "");
224      }
225 
226      lp_build_print_value(gallivm, "src  = ", src);
227      lp_build_print_value(gallivm, "llvm = ", result);
228      lp_build_print_value(gallivm, "util = ", ref_result);
229      lp_build_printf(gallivm, "\n");
230   }
231 
232    return result;
233 }
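
#if 0
/*
 * For reference only: a scalar sketch with the same round-towards-zero
 * behavior as the non-f16c path above, keeping Infs and NaNs. The name
 * float_to_half_rtz_ref is hypothetical and not part of gallivm; assumes
 * IEEE-754 binary32 and <stdint.h>.
 */
static inline uint16_t
float_to_half_rtz_ref(float f)
{
   union { uint32_t u; float f; } in;
   uint32_t sign, exp, mant;

   in.f = f;
   sign = (in.u >> 16) & 0x8000;
   exp  = (in.u >> 23) & 0xff;
   mant = in.u & 0x7fffff;

   if (exp == 0xff)                    /* Inf or NaN */
      return sign | 0x7c00 | (mant ? 0x200 | (mant >> 13) : 0);
   if (exp > 127 + 15)                 /* too large: truncates to max finite */
      return sign | 0x7bff;
   if (exp < 127 - 24)                 /* too small: truncates to (signed) zero */
      return sign;
   if (exp < 127 - 14) {               /* half denormal range */
      mant |= 0x800000;                /* make the implicit one explicit */
      return sign | (mant >> (13 + (127 - 14) - exp));
   }
   return sign | ((exp - 127 + 15) << 10) | (mant >> 13);
}
#endif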
234 
235 
236 /**
237  * Special case for converting clamped IEEE-754 floats to unsigned norms.
238  *
239  * The mathematical voodoo below may seem excessive but it is actually
240  * paramount we do it this way for several reasons. First, there is no
241  * single-precision FP to unsigned integer conversion instruction in SSE.
242  * Second, even if there were, since the FP mantissa takes only a fraction of
243  * the register bits, the typical scale-and-cast approach would require double
244  * precision for accurate results, and therefore half the throughput.
245  *
246  * Although the result values can be scaled to an arbitrary bit width specified
247  * by dst_width, the actual result type will have the same width as the source.
248  *
249  * Ex: src = { float, float, float, float }
250  * return { i32, i32, i32, i32 } where each value is in [0, 2^dst_width-1].
251  */
252 LLVMValueRef
253 lp_build_clamped_float_to_unsigned_norm(struct gallivm_state *gallivm,
254                                         struct lp_type src_type,
255                                         unsigned dst_width,
256                                         LLVMValueRef src)
257 {
258    LLVMBuilderRef builder = gallivm->builder;
259    LLVMTypeRef int_vec_type = lp_build_int_vec_type(gallivm, src_type);
260    LLVMValueRef res;
261    unsigned mantissa;
262 
263    assert(src_type.floating);
264    assert(dst_width <= src_type.width);
265    src_type.sign = FALSE;
266 
267    mantissa = lp_mantissa(src_type);
268 
269    if (dst_width <= mantissa) {
270       /*
271        * Apply magic coefficients that will make the desired result appear
272        * in the least significant bits of the mantissa, with correct rounding.
273        *
274        * This only works if the destination width fits in the mantissa.
275        */
276 
277       unsigned long long ubound;
278       unsigned long long mask;
279       double scale;
280       double bias;
281 
282       ubound = (1ULL << dst_width);
283       mask = ubound - 1;
284       scale = (double)mask/ubound;
285       bias = (double)(1ULL << (mantissa - dst_width));
286 
287       res = LLVMBuildFMul(builder, src, lp_build_const_vec(gallivm, src_type, scale), "");
288       /* instead of fadd/and could (with sse2) just use lp_build_iround */
289       res = LLVMBuildFAdd(builder, res, lp_build_const_vec(gallivm, src_type, bias), "");
290       res = LLVMBuildBitCast(builder, res, int_vec_type, "");
291       res = LLVMBuildAnd(builder, res,
292                          lp_build_const_int_vec(gallivm, src_type, mask), "");
293    }
294    else if (dst_width == (mantissa + 1)) {
295       /*
296        * The destination width matches exactly what can be represented in
297        * floating point (i.e., mantissa + 1 bits). Even so, correct rounding
298        * still needs to be applied (only for numbers in [0.5, 1.0] would
299        * conversion using truncation after scaling be sufficient).
300        */
301       double scale;
302       struct lp_build_context uf32_bld;
303 
304       lp_build_context_init(&uf32_bld, gallivm, src_type);
305       scale = (double)((1ULL << dst_width) - 1);
306 
307       res = LLVMBuildFMul(builder, src,
308                           lp_build_const_vec(gallivm, src_type, scale), "");
309       res = lp_build_iround(&uf32_bld, res);
310    }
311    else {
312       /*
313        * The destination exceeds what can be represented in floating point.
314        * So multiply by the largest power of two we can get away with, and
315        * then subtract the most significant bit to rescale to normalized values.
316        *
317        * The largest power-of-two factor we can get away with is
318        * (1 << (src_type.width - 1)), because we need to use a signed
319        * conversion. In theory it should be (1 << (src_type.width - 2)), but
320        * IEEE 754 rules state INT_MIN should be returned by FPToSI, which is
321        * the correct result for values near 1.0!
322        *
323        * This means we get (src_type.width - 1) correct bits for values near 0.0,
324        * and (mantissa + 1) correct bits for values near 1.0. Equally or more
325        * important, we also get exact results for 0.0 and 1.0.
326        */
327 
328       unsigned n = MIN2(src_type.width - 1u, dst_width);
329 
330       double scale = (double)(1ULL << n);
331       unsigned lshift = dst_width - n;
332       unsigned rshift = n;
333       LLVMValueRef lshifted;
334       LLVMValueRef rshifted;
335 
336       res = LLVMBuildFMul(builder, src,
337                           lp_build_const_vec(gallivm, src_type, scale), "");
338       if (!src_type.sign && src_type.width == 32)
339          res = LLVMBuildFPToUI(builder, res, int_vec_type, "");
340       else
341          res = LLVMBuildFPToSI(builder, res, int_vec_type, "");
342 
343       /*
344        * Align the most significant bit to its final place.
345        *
346        * This will cause 1.0 to overflow to 0, but the later adjustment will
347        * get it right.
348        */
349       if (lshift) {
350          lshifted = LLVMBuildShl(builder, res,
351                                  lp_build_const_int_vec(gallivm, src_type,
352                                                         lshift), "");
353       } else {
354          lshifted = res;
355       }
356 
357       /*
358        * Align the most significant bit to the right.
359        */
360       rshifted =  LLVMBuildLShr(builder, res,
361                                 lp_build_const_int_vec(gallivm, src_type, rshift),
362                                 "");
363 
364       /*
365        * Subtract the MSB (shifted down to the LSB position), thereby
366        * rescaling from (1 << dst_width) to ((1 << dst_width) - 1).
367        */
368 
369       res = LLVMBuildSub(builder, lshifted, rshifted, "");
370    }
371 
372    return res;
373 }
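
#if 0
/*
 * For reference only: a scalar sketch of the dst_width <= mantissa path
 * above. Scaling by mask/ubound and adding 2^(mantissa - dst_width) makes
 * the correctly rounded result appear in the low bits of the mantissa, so
 * a bitcast plus mask extracts it without any float->int conversion
 * instruction. The name clamped_float_to_unorm_ref is hypothetical and not
 * part of gallivm; assumes IEEE-754 binary32, x in [0, 1] and <stdint.h>.
 */
static inline uint32_t
clamped_float_to_unorm_ref(float x, unsigned dst_width)
{
   const unsigned mantissa = 23;
   uint64_t ubound = 1ull << dst_width;
   uint64_t mask = ubound - 1;
   float scale = (float)((double)mask / (double)ubound);
   float bias = (float)(1ull << (mantissa - dst_width));
   union { float f; uint32_t u; } v;

   assert(dst_width <= mantissa);

   v.f = x * scale + bias;   /* the result now sits in the low mantissa bits */
   return v.u & (uint32_t)mask;
}
#endif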
374 
375 
376 /**
377  * Inverse of lp_build_clamped_float_to_unsigned_norm above.
378  * Ex: src = { i32, i32, i32, i32 } with values in range [0, 2^src_width-1]
379  * return {float, float, float, float} with values in range [0, 1].
380  */
381 LLVMValueRef
382 lp_build_unsigned_norm_to_float(struct gallivm_state *gallivm,
383                                 unsigned src_width,
384                                 struct lp_type dst_type,
385                                 LLVMValueRef src)
386 {
387    LLVMBuilderRef builder = gallivm->builder;
388    LLVMTypeRef vec_type = lp_build_vec_type(gallivm, dst_type);
389    LLVMTypeRef int_vec_type = lp_build_int_vec_type(gallivm, dst_type);
390    LLVMValueRef bias_;
391    LLVMValueRef res;
392    unsigned mantissa;
393    unsigned n;
394    unsigned long long ubound;
395    unsigned long long mask;
396    double scale;
397    double bias;
398 
399    assert(dst_type.floating);
400 
401    mantissa = lp_mantissa(dst_type);
402 
403    if (src_width <= (mantissa + 1)) {
404       /*
405        * The source width fits within what can be represented in floating
406        * point (i.e., mantissa + 1 bits). So do a straight multiplication
407        * followed by casting. No further rounding is necessary.
408        */
409 
410       scale = 1.0/(double)((1ULL << src_width) - 1);
411       res = LLVMBuildSIToFP(builder, src, vec_type, "");
412       res = LLVMBuildFMul(builder, res,
413                           lp_build_const_vec(gallivm, dst_type, scale), "");
414       return res;
415    }
416    else {
417       /*
418        * The source width exceeds what can be represented in floating
419        * point. So truncate the incoming values.
420        */
421 
422       n = MIN2(mantissa, src_width);
423 
424       ubound = ((unsigned long long)1 << n);
425       mask = ubound - 1;
426       scale = (double)ubound/mask;
427       bias = (double)((unsigned long long)1 << (mantissa - n));
428 
429       res = src;
430 
431       if (src_width > mantissa) {
432          int shift = src_width - mantissa;
433          res = LLVMBuildLShr(builder, res,
434                              lp_build_const_int_vec(gallivm, dst_type, shift), "");
435       }
436 
437       bias_ = lp_build_const_vec(gallivm, dst_type, bias);
438 
439       res = LLVMBuildOr(builder,
440                         res,
441                         LLVMBuildBitCast(builder, bias_, int_vec_type, ""), "");
442 
443       res = LLVMBuildBitCast(builder, res, vec_type, "");
444 
445       res = LLVMBuildFSub(builder, res, bias_, "");
446       res = LLVMBuildFMul(builder, res, lp_build_const_vec(gallivm, dst_type, scale), "");
447    }
448 
449    return res;
450 }
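
#if 0
/*
 * For reference only: a scalar sketch of the wide-source path above for a
 * float32 destination. OR-ing the (truncated) integer bits into the mantissa
 * of the constant 2^(mantissa - n) and subtracting that constant converts
 * without an int->float instruction; the final multiply rescales to [0, 1].
 * The name unorm_to_float_ref is hypothetical and not part of gallivm;
 * assumes IEEE-754 binary32 and <stdint.h>.
 */
static inline float
unorm_to_float_ref(uint32_t x, unsigned src_width)
{
   const unsigned mantissa = 23;
   unsigned n = MIN2(mantissa, src_width);
   uint64_t ubound = 1ull << n;
   uint64_t mask = ubound - 1;
   float scale = (float)((double)ubound / (double)mask);
   union { float f; uint32_t u; } bias, v;

   bias.f = (float)(1ull << (mantissa - n));

   if (src_width > mantissa)
      x >>= src_width - mantissa;   /* truncate bits the mantissa cannot hold */

   v.u = x | bias.u;                /* place the bits in the mantissa */
   return (v.f - bias.f) * scale;
}
#endif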
451 
452 
453 /**
454  * Pick a suitable num_dsts for lp_build_conv to ensure optimal cases are used.
455  *
456  * Returns the number of dst vectors created from the src vectors.
457  */
458 int lp_build_conv_auto(struct gallivm_state *gallivm,
459                        struct lp_type src_type,
460                        struct lp_type* dst_type,
461                        const LLVMValueRef *src,
462                        unsigned num_srcs,
463                        LLVMValueRef *dst)
464 {
465    unsigned i;
466    int num_dsts = num_srcs;
467 
468    if (src_type.floating == dst_type->floating &&
469        src_type.width == dst_type->width &&
470        src_type.length == dst_type->length &&
471        src_type.fixed == dst_type->fixed &&
472        src_type.norm == dst_type->norm &&
473        src_type.sign == dst_type->sign)
474       return num_dsts;
475 
476    /* Special case 4x4x32 -> 1x16x8 or 2x8x32 -> 1x16x8
477     */
478    if (src_type.norm     == 0 &&
479        src_type.width    == 32 &&
480        src_type.fixed    == 0 &&
481 
482        dst_type->floating == 0 &&
483        dst_type->fixed    == 0 &&
484        dst_type->width    == 8 &&
485 
486        ((src_type.floating == 1 && src_type.sign == 1 && dst_type->norm == 1) ||
487         (src_type.floating == 0 && dst_type->floating == 0 &&
488          src_type.sign == dst_type->sign && dst_type->norm == 0))) {
489 
490       /* Special case 4x4x32 --> 1x16x8 */
491       if (src_type.length == 4 &&
492             (util_cpu_caps.has_sse2 || util_cpu_caps.has_altivec))
493       {
494          num_dsts = (num_srcs + 3) / 4;
495          dst_type->length = num_srcs * 4 >= 16 ? 16 : num_srcs * 4;
496 
497          lp_build_conv(gallivm, src_type, *dst_type, src, num_srcs, dst, num_dsts);
498          return num_dsts;
499       }
500 
501       /* Special case 2x8x32 --> 1x16x8 */
502       if (src_type.length == 8 &&
503           util_cpu_caps.has_avx)
504       {
505          num_dsts = (num_srcs + 1) / 2;
506          dst_type->length = num_srcs * 8 >= 16 ? 16 : num_srcs * 8;
507 
508          lp_build_conv(gallivm, src_type, *dst_type, src, num_srcs, dst, num_dsts);
509          return num_dsts;
510       }
511    }
512 
513    /* lp_build_resize does not support M:N */
514    if (src_type.width == dst_type->width) {
515       lp_build_conv(gallivm, src_type, *dst_type, src, num_srcs, dst, num_dsts);
516    } else {
517       /*
518        * If dst_width is 16 bits and src_width 32 and the dst vector size
519        * 64bit, try feeding 2 vectors at once so pack intrinsics can be used.
520        * (For AVX, this isn't needed, since we usually get 256bit src and
521        * 128bit dst vectors which work ok. If we do AVX2 pack this should
522        * be extended, but we'd need to be able to tell the conversion code
523        * about pack ordering first.)
524        */
525       unsigned ratio = 1;
526       if (src_type.width == 2 * dst_type->width &&
527           src_type.length == dst_type->length &&
528           dst_type->floating == 0 && (num_srcs % 2 == 0) &&
529           dst_type->width * dst_type->length == 64) {
530          ratio = 2;
531          num_dsts /= 2;
532          dst_type->length *= 2;
533       }
534       for (i = 0; i < num_dsts; i++) {
535          lp_build_conv(gallivm, src_type, *dst_type, &src[i*ratio], ratio, &dst[i], 1);
536       }
537    }
538 
539    return num_dsts;
540 }
541 
542 
543 /**
544  * Generic type conversion.
545  *
546  * TODO: Take a precision argument, or even better, add a new precision member
547  * to the lp_type union.
548  */
549 void
550 lp_build_conv(struct gallivm_state *gallivm,
551               struct lp_type src_type,
552               struct lp_type dst_type,
553               const LLVMValueRef *src, unsigned num_srcs,
554               LLVMValueRef *dst, unsigned num_dsts)
555 {
556    LLVMBuilderRef builder = gallivm->builder;
557    struct lp_type tmp_type;
558    LLVMValueRef tmp[LP_MAX_VECTOR_LENGTH];
559    unsigned num_tmps;
560    unsigned i;
561 
562    /* We must not lose or gain channels. Only precision may change. */
563    assert(src_type.length * num_srcs == dst_type.length * num_dsts);
564 
565    assert(src_type.length <= LP_MAX_VECTOR_LENGTH);
566    assert(dst_type.length <= LP_MAX_VECTOR_LENGTH);
567    assert(num_srcs <= LP_MAX_VECTOR_LENGTH);
568    assert(num_dsts <= LP_MAX_VECTOR_LENGTH);
569 
570    tmp_type = src_type;
571    for(i = 0; i < num_srcs; ++i) {
572       assert(lp_check_value(src_type, src[i]));
573       tmp[i] = src[i];
574    }
575    num_tmps = num_srcs;
576 
577 
578    /*
579     * Special case 4x4x32 --> 1x16x8, 2x4x32 -> 1x8x8, 1x4x32 -> 1x4x8
580     * Only float -> s/unorm8 and (u)int32->(u)int8.
581     * XXX: This should cover all interesting backend cases for 8 bit,
582     * but the same strategy should be used if dst is 16 bit.
583     */
584    if (src_type.norm     == 0 &&
585        src_type.width    == 32 &&
586        src_type.length   == 4 &&
587        src_type.fixed    == 0 &&
588 
589        dst_type.floating == 0 &&
590        dst_type.fixed    == 0 &&
591        dst_type.width    == 8 &&
592 
593        ((src_type.floating == 1 && src_type.sign == 1 && dst_type.norm == 1) ||
594         (src_type.floating == 0 && dst_type.floating == 0 &&
595          src_type.sign == dst_type.sign && dst_type.norm == 0)) &&
596 
597        ((dst_type.length == 16 && 4 * num_dsts == num_srcs) ||
598         (num_dsts == 1 && dst_type.length * num_srcs == 16 && num_srcs != 3)) &&
599 
600        (util_cpu_caps.has_sse2 || util_cpu_caps.has_altivec))
601    {
602       struct lp_build_context bld;
603       struct lp_type int16_type, int32_type;
604       struct lp_type dst_type_ext = dst_type;
605       LLVMValueRef const_scale;
606       unsigned i, j;
607 
608       lp_build_context_init(&bld, gallivm, src_type);
609 
610       dst_type_ext.length = 16;
611       int16_type = int32_type = dst_type_ext;
612 
613       int16_type.width *= 2;
614       int16_type.length /= 2;
615       int16_type.sign = 1;
616 
617       int32_type.width *= 4;
618       int32_type.length /= 4;
619       int32_type.sign = 1;
620 
621       const_scale = lp_build_const_vec(gallivm, src_type, lp_const_scale(dst_type));
622 
623       for (i = 0; i < num_dsts; ++i, src += 4) {
624          LLVMValueRef lo, hi;
625 
626          if (src_type.floating) {
627             for (j = 0; j < dst_type.length / 4; ++j) {
628                /*
629                 * XXX This is not actually fully correct. The float to int
630                 * conversion will produce 0x80000000 value for everything
631                 * out of range and NaNs (on x86, llvm.x86.sse2.cvtps2dq).
632                 * Hence, NaNs and negatives will get clamped just fine to zero
633                 * (relying on clamping pack behavior) when converting to unorm,
634                 * however too large values (both finite and infinite) will also
635                 * end up as zero, not 255.
636                 * For snorm, for now we'll keep bug compatibility with generic
637                 * conversion path (meaning too large values are fine, but
638                 * NaNs get converted to -128 (purely by luck, as we don't
639                 * specify nan behavior for the max there) instead of 0).
640                 *
641                 * dEQP has GLES31 tests that expect +inf -> 255.0.
642                 */
643                if (dst_type.sign) {
644                   tmp[j] = lp_build_min(&bld, bld.one, src[j]);
645 
646                }
647                else {
648                   if (1) {
649                      tmp[j] = lp_build_min_ext(&bld, bld.one, src[j],
650                                                GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN);
651                   }
652                   else tmp[j] = src[j];
653                }
654                tmp[j] = LLVMBuildFMul(builder, tmp[j], const_scale, "");
655                tmp[j] = lp_build_iround(&bld, tmp[j]);
656             }
657          } else {
658             for (j = 0; j < dst_type.length / 4; ++j) {
659                if (!dst_type.sign) {
660                   /*
661                    * Pack clamp is always signed->unsigned (or signed->signed).
662                    * Hence need min.
663                    */
664                   LLVMValueRef const_max;
665                   const_max = lp_build_const_int_vec(gallivm, src_type, 255);
666                   tmp[j] = lp_build_min(&bld, src[j], const_max);
667                } else {
668                   tmp[j] = src[j];
669                }
670             }
671          }
672 
673          if (num_srcs == 1) {
674             tmp[1] = tmp[0];
675          }
676 
677          /* relying on clamping behavior of sse2 intrinsics here */
678          lo = lp_build_pack2(gallivm, int32_type, int16_type, tmp[0], tmp[1]);
679 
680          if (num_srcs < 4) {
681             hi = lo;
682          }
683          else {
684             hi = lp_build_pack2(gallivm, int32_type, int16_type, tmp[2], tmp[3]);
685          }
686          dst[i] = lp_build_pack2(gallivm, int16_type, dst_type_ext, lo, hi);
687       }
688       if (num_srcs < 4) {
689          dst[0] = lp_build_extract_range(gallivm, dst[0], 0, dst_type.length);
690       }
691 
692       return;
693    }
694 
695    /* Special case 2x8x32 --> 1x16x8, 1x8x32 ->1x8x8
696     */
697    else if (src_type.norm     == 0 &&
698        src_type.width    == 32 &&
699        src_type.length   == 8 &&
700        src_type.fixed    == 0 &&
701 
702        dst_type.floating == 0 &&
703        dst_type.fixed    == 0 &&
704        dst_type.width    == 8 &&
705 
706        ((src_type.floating == 1 && src_type.sign == 1 && dst_type.norm == 1) ||
707         (src_type.floating == 0 && dst_type.floating == 0 &&
708          src_type.sign == dst_type.sign && dst_type.norm == 0)) &&
709 
710       ((dst_type.length == 16 && 2 * num_dsts == num_srcs) ||
711        (num_dsts == 1 && dst_type.length * num_srcs == 8)) &&
712 
713       util_cpu_caps.has_avx) {
714 
715       struct lp_build_context bld;
716       struct lp_type int16_type, int32_type;
717       struct lp_type dst_type_ext = dst_type;
718       LLVMValueRef const_scale;
719       unsigned i;
720 
721       lp_build_context_init(&bld, gallivm, src_type);
722 
723       dst_type_ext.length = 16;
724       int16_type = int32_type = dst_type_ext;
725 
726       int16_type.width *= 2;
727       int16_type.length /= 2;
728       int16_type.sign = 1;
729 
730       int32_type.width *= 4;
731       int32_type.length /= 4;
732       int32_type.sign = 1;
733 
734       const_scale = lp_build_const_vec(gallivm, src_type, lp_const_scale(dst_type));
735 
736       for (i = 0; i < num_dsts; ++i, src += 2) {
737          unsigned j;
738          for (j = 0; j < (num_srcs == 1 ? 1 : 2); j++) {
739             LLVMValueRef lo, hi, a;
740 
741             a = src[j];
742             if (src_type.floating) {
743                if (dst_type.sign) {
744                   a = lp_build_min(&bld, bld.one, a);
745 
746                }
747                else {
748                   if (1) {
749                      a = lp_build_min_ext(&bld, bld.one, a,
750                                           GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN);
751                   }
752                }
753                a = LLVMBuildFMul(builder, a, const_scale, "");
754                a = lp_build_iround(&bld, a);
755             } else {
756                if (!dst_type.sign) {
757                   LLVMValueRef const_max;
758                   const_max = lp_build_const_int_vec(gallivm, src_type, 255);
759                   a = lp_build_min(&bld, a, const_max);
760                }
761             }
762             lo = lp_build_extract_range(gallivm, a, 0, 4);
763             hi = lp_build_extract_range(gallivm, a, 4, 4);
764             /* relying on clamping behavior of sse2 intrinsics here */
765             tmp[j] = lp_build_pack2(gallivm, int32_type, int16_type, lo, hi);
766          }
767 
768          if (num_srcs == 1) {
769             tmp[1] = tmp[0];
770          }
771          dst[i] = lp_build_pack2(gallivm, int16_type, dst_type_ext, tmp[0], tmp[1]);
772       }
773 
774       if (num_srcs == 1) {
775          dst[0] = lp_build_extract_range(gallivm, dst[0], 0, dst_type.length);
776       }
777 
778       return;
779    }
780 
781    /* Special case -> 16bit half-float
782     */
783    else if (dst_type.floating && dst_type.width == 16)
784    {
785       /* Only support src as 32bit float currently */
786       assert(src_type.floating && src_type.width == 32);
787 
788       for(i = 0; i < num_tmps; ++i)
789          dst[i] = lp_build_float_to_half(gallivm, tmp[i]);
790 
791       return;
792    }
793 
794    /* Pre convert half-floats to floats
795     */
796    else if (src_type.floating && src_type.width == 16)
797    {
798       for(i = 0; i < num_tmps; ++i)
799          tmp[i] = lp_build_half_to_float(gallivm, tmp[i]);
800 
801       tmp_type.width = 32;
802    }
803 
804    /*
805     * Clamp if necessary
806     */
807 
808    if(memcmp(&src_type, &dst_type, sizeof src_type) != 0) {
809       struct lp_build_context bld;
810       double src_min = lp_const_min(src_type);
811       double dst_min = lp_const_min(dst_type);
812       double src_max = lp_const_max(src_type);
813       double dst_max = lp_const_max(dst_type);
814       LLVMValueRef thres;
815 
816       lp_build_context_init(&bld, gallivm, tmp_type);
817 
818       if(src_min < dst_min) {
819          if(dst_min == 0.0)
820             thres = bld.zero;
821          else
822             thres = lp_build_const_vec(gallivm, src_type, dst_min);
823          for(i = 0; i < num_tmps; ++i)
824             tmp[i] = lp_build_max(&bld, tmp[i], thres);
825       }
826 
827       if(src_max > dst_max) {
828          if(dst_max == 1.0)
829             thres = bld.one;
830          else
831             thres = lp_build_const_vec(gallivm, src_type, dst_max);
832          for(i = 0; i < num_tmps; ++i)
833             tmp[i] = lp_build_min(&bld, tmp[i], thres);
834       }
835    }
836 
837    /*
838     * Scale to the narrowest range
839     */
840 
841    if(dst_type.floating) {
842       /* Nothing to do */
843    }
844    else if(tmp_type.floating) {
845       if(!dst_type.fixed && !dst_type.sign && dst_type.norm) {
846          for(i = 0; i < num_tmps; ++i) {
847             tmp[i] = lp_build_clamped_float_to_unsigned_norm(gallivm,
848                                                              tmp_type,
849                                                              dst_type.width,
850                                                              tmp[i]);
851          }
852          tmp_type.floating = FALSE;
853       }
854       else {
855          double dst_scale = lp_const_scale(dst_type);
856 
857          if (dst_scale != 1.0) {
858             LLVMValueRef scale = lp_build_const_vec(gallivm, tmp_type, dst_scale);
859             for(i = 0; i < num_tmps; ++i)
860                tmp[i] = LLVMBuildFMul(builder, tmp[i], scale, "");
861          }
862 
863          /*
864           * these functions will use fptosi in some form which won't work
865           * with 32bit uint dst. Causes lp_test_conv failures though.
866           */
867          if (0)
868             assert(dst_type.sign || dst_type.width < 32);
869 
870          if (dst_type.sign && dst_type.norm && !dst_type.fixed) {
871             struct lp_build_context bld;
872 
873             lp_build_context_init(&bld, gallivm, tmp_type);
874             for(i = 0; i < num_tmps; ++i) {
875                tmp[i] = lp_build_iround(&bld, tmp[i]);
876             }
877             tmp_type.floating = FALSE;
878          }
879          else {
880             LLVMTypeRef tmp_vec_type;
881 
882             tmp_type.floating = FALSE;
883             tmp_vec_type = lp_build_vec_type(gallivm, tmp_type);
884             for(i = 0; i < num_tmps; ++i) {
885 #if 0
886                if(dst_type.sign)
887                   tmp[i] = LLVMBuildFPToSI(builder, tmp[i], tmp_vec_type, "");
888                else
889                   tmp[i] = LLVMBuildFPToUI(builder, tmp[i], tmp_vec_type, "");
890 #else
891               /* FIXME: there is no SSE counterpart for LLVMBuildFPToUI */
892                tmp[i] = LLVMBuildFPToSI(builder, tmp[i], tmp_vec_type, "");
893 #endif
894             }
895          }
896       }
897    }
898    else {
899       unsigned src_shift = lp_const_shift(src_type);
900       unsigned dst_shift = lp_const_shift(dst_type);
901       unsigned src_offset = lp_const_offset(src_type);
902       unsigned dst_offset = lp_const_offset(dst_type);
903       struct lp_build_context bld;
904       lp_build_context_init(&bld, gallivm, tmp_type);
905 
906       /* Compensate for different offsets */
907       /* sscaled -> unorm and similar would cause negative shift count, skip */
908       if (dst_offset > src_offset && src_type.width > dst_type.width && src_shift > 0) {
909          for (i = 0; i < num_tmps; ++i) {
910             LLVMValueRef shifted;
911 
912             shifted = lp_build_shr_imm(&bld, tmp[i], src_shift - 1);
913             tmp[i] = LLVMBuildSub(builder, tmp[i], shifted, "");
914          }
915       }
916 
917       if(src_shift > dst_shift) {
918          for(i = 0; i < num_tmps; ++i)
919             tmp[i] = lp_build_shr_imm(&bld, tmp[i], src_shift - dst_shift);
920       }
921    }
922 
923    /*
924     * Truncate or expand bit width
925     *
926     * No data conversion should happen here, although the sign bits are
927     * crucial to avoid bad clamping.
928     */
929 
930    {
931       struct lp_type new_type;
932 
933       new_type = tmp_type;
934       new_type.sign   = dst_type.sign;
935       new_type.width  = dst_type.width;
936       new_type.length = dst_type.length;
937 
938       /*
939        * Note that resize when using packs can sometimes get min/max
940        * clamping for free. Should be able to exploit this...
941        */
942       lp_build_resize(gallivm, tmp_type, new_type, tmp, num_srcs, tmp, num_dsts);
943 
944       tmp_type = new_type;
945       num_tmps = num_dsts;
946    }
947 
948    /*
949     * Scale to the widest range
950     */
951 
952    if(src_type.floating) {
953       /* Nothing to do */
954    }
955    else if(!src_type.floating && dst_type.floating) {
956       if(!src_type.fixed && !src_type.sign && src_type.norm) {
957          for(i = 0; i < num_tmps; ++i) {
958             tmp[i] = lp_build_unsigned_norm_to_float(gallivm,
959                                                      src_type.width,
960                                                      dst_type,
961                                                      tmp[i]);
962          }
963          tmp_type.floating = TRUE;
964       }
965       else {
966          double src_scale = lp_const_scale(src_type);
967          LLVMTypeRef tmp_vec_type;
968 
969          /* Use an equally sized float type for intermediate computations */
970          tmp_type.floating = TRUE;
971          tmp_type.sign = TRUE;
972          tmp_vec_type = lp_build_vec_type(gallivm, tmp_type);
973          for(i = 0; i < num_tmps; ++i) {
974 #if 0
975             if(dst_type.sign)
976                tmp[i] = LLVMBuildSIToFP(builder, tmp[i], tmp_vec_type, "");
977             else
978                tmp[i] = LLVMBuildUIToFP(builder, tmp[i], tmp_vec_type, "");
979 #else
980             /* FIXME: there is no SSE counterpart for LLVMBuildUIToFP */
981             tmp[i] = LLVMBuildSIToFP(builder, tmp[i], tmp_vec_type, "");
982 #endif
983           }
984 
985           if (src_scale != 1.0) {
986              LLVMValueRef scale = lp_build_const_vec(gallivm, tmp_type, 1.0/src_scale);
987              for(i = 0; i < num_tmps; ++i)
988                 tmp[i] = LLVMBuildFMul(builder, tmp[i], scale, "");
989           }
990 
991           /* the formula above produces values below -1.0 for the most negative
992            * value, but everything seems happy with that, hence disabled for now */
993           if (0 && !src_type.fixed && src_type.norm && src_type.sign) {
994              struct lp_build_context bld;
995 
996              lp_build_context_init(&bld, gallivm, dst_type);
997              for(i = 0; i < num_tmps; ++i) {
998                 tmp[i] = lp_build_max(&bld, tmp[i],
999                                       lp_build_const_vec(gallivm, dst_type, -1.0f));
1000              }
1001           }
1002       }
1003     }
1004     else {
1005        unsigned src_shift = lp_const_shift(src_type);
1006        unsigned dst_shift = lp_const_shift(dst_type);
1007        unsigned src_offset = lp_const_offset(src_type);
1008        unsigned dst_offset = lp_const_offset(dst_type);
1009        struct lp_build_context bld;
1010        lp_build_context_init(&bld, gallivm, tmp_type);
1011 
1012        if (src_shift < dst_shift) {
1013           LLVMValueRef pre_shift[LP_MAX_VECTOR_LENGTH];
1014 
1015           if (dst_shift - src_shift < dst_type.width) {
1016              for (i = 0; i < num_tmps; ++i) {
1017                 pre_shift[i] = tmp[i];
1018                 tmp[i] = lp_build_shl_imm(&bld, tmp[i], dst_shift - src_shift);
1019              }
1020           }
1021           else {
1022              /*
1023               * This happens for things like sscaled -> unorm conversions. Shift
1024               * counts equal to bit width cause undefined results, so hack around it.
1025               */
1026              for (i = 0; i < num_tmps; ++i) {
1027                 pre_shift[i] = tmp[i];
1028                 tmp[i] = lp_build_zero(gallivm, dst_type);
1029              }
1030           }
1031 
1032           /* Compensate for different offsets */
1033           if (dst_offset > src_offset) {
1034              for (i = 0; i < num_tmps; ++i) {
1035                 tmp[i] = LLVMBuildSub(builder, tmp[i], pre_shift[i], "");
1036              }
1037           }
1038        }
1039     }
1040 
1041    for(i = 0; i < num_dsts; ++i) {
1042       dst[i] = tmp[i];
1043       assert(lp_check_value(dst_type, dst[i]));
1044    }
1045 }
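
#if 0
/*
 * Usage sketch (illustrative only): convert 4 vectors of 4 x float32 in
 * [0, 1] into one vector of 16 x unorm8, i.e. the 4x4x32 --> 1x16x8 special
 * case above (on sse2/altivec). The helper name convert_unorm8_example is
 * hypothetical and not part of gallivm.
 */
static void
convert_unorm8_example(struct gallivm_state *gallivm,
                       LLVMValueRef src[4], LLVMValueRef *dst)
{
   struct lp_type src_type = lp_type_float_vec(32, 128);  /* 4 x float32 */
   struct lp_type dst_type = lp_type_int_vec(8, 128);     /* 16 x int8... */

   dst_type.sign = FALSE;                                 /* ...made unorm8 */
   dst_type.norm = TRUE;

   /* 4 * 4 source channels == 16 * 1 destination channels */
   lp_build_conv(gallivm, src_type, dst_type, src, 4, dst, 1);
}
#endif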
1046 
1047 
1048 /**
1049  * Bit mask conversion.
1050  *
1051  * This will convert the integer masks that match the given types.
1052  *
1053  * The mask values should be 0 or -1, i.e., all bits either set to zero or one.
1054  * Any other value will likely cause unpredictable results.
1055  *
1056  * This is basically a very trimmed down version of lp_build_conv.
1057  */
1058 void
1059 lp_build_conv_mask(struct gallivm_state *gallivm,
1060                    struct lp_type src_type,
1061                    struct lp_type dst_type,
1062                    const LLVMValueRef *src, unsigned num_srcs,
1063                    LLVMValueRef *dst, unsigned num_dsts)
1064 {
1065 
1066    /* We must not lose or gain channels. Only precision may change. */
1067    assert(src_type.length * num_srcs == dst_type.length * num_dsts);
1068 
1069    /*
1070     * Drop the floating/fixed/norm qualifiers and treat the masks as plain
1071     * signed integers.
1072     * We assume all values are 0 or -1.
1073     */
1074 
1075    src_type.floating = FALSE;
1076    src_type.fixed = FALSE;
1077    src_type.sign = TRUE;
1078    src_type.norm = FALSE;
1079 
1080    dst_type.floating = FALSE;
1081    dst_type.fixed = FALSE;
1082    dst_type.sign = TRUE;
1083    dst_type.norm = FALSE;
1084 
1085    /*
1086     * Truncate or expand bit width
1087     */
1088 
1089    lp_build_resize(gallivm, src_type, dst_type, src, num_srcs, dst, num_dsts);
1090 }
1091