/**************************************************************************
 *
 * Copyright 2013 VMware, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 **************************************************************************/


/**
 * @file
 * Format conversion code for "special" float formats.
 *
 * @author Roland Scheidegger <sroland@vmware.com>
 */
35 
36 
37 #include "util/u_debug.h"
38 
39 #include "lp_bld_type.h"
40 #include "lp_bld_const.h"
41 #include "lp_bld_arit.h"
42 #include "lp_bld_bitarit.h"
43 #include "lp_bld_logic.h"
44 #include "lp_bld_format.h"
45 
46 
47 /**
48  * Convert float32 to a float-like value with less exponent and mantissa
49  * bits. The mantissa is still biased, and the mantissa still has an implied 1,
50  * and there may be a sign bit.
51  *
52  * @param src             (vector) float value to convert
53  * @param mantissa_bits   the number of mantissa bits
54  * @param exponent_bits   the number of exponent bits
55  * @param mantissa_start  the start position of the small float in result value
56  * @param has_sign        if the small float has a sign bit
57  *
58  * This implements round-towards-zero (trunc) hence too large numbers get
59  * converted to largest representable number, not infinity.
60  * Small numbers may get converted to denorms, depending on normal
61  * float denorm handling of the cpu.
62  * Note that compared to the references, below, we skip any rounding bias
63  * since we do rounding towards zero - OpenGL allows rounding towards zero
64  * (though not preferred) and DX10 even seems to require it.
65  * Note that this will pack mantissa, exponent and sign bit (if any) together,
66  * and shift the result to mantissa_start.
67  *
68  * ref http://fgiesen.wordpress.com/2012/03/28/half-to-float-done-quic/
69  * ref https://gist.github.com/rygorous/2156668
70  */
71 LLVMValueRef
lp_build_float_to_smallfloat(struct gallivm_state * gallivm,struct lp_type i32_type,LLVMValueRef src,unsigned mantissa_bits,unsigned exponent_bits,unsigned mantissa_start,boolean has_sign)72 lp_build_float_to_smallfloat(struct gallivm_state *gallivm,
73                              struct lp_type i32_type,
74                              LLVMValueRef src,
75                              unsigned mantissa_bits,
76                              unsigned exponent_bits,
77                              unsigned mantissa_start,
78                              boolean has_sign)
79 {
80    LLVMBuilderRef builder = gallivm->builder;
81    LLVMValueRef i32_floatexpmask, i32_smallexpmask, magic, normal;
82    LLVMValueRef rescale_src, i32_roundmask, small_max;
83    LLVMValueRef i32_qnanbit, shift, res;
84    LLVMValueRef is_nan_or_inf, nan_or_inf, mask, i32_src;
85    struct lp_type f32_type = lp_type_float_vec(32, 32 * i32_type.length);
86    struct lp_build_context f32_bld, i32_bld;
87    LLVMValueRef zero = lp_build_const_vec(gallivm, f32_type, 0.0f);
88    unsigned exponent_start = mantissa_start + mantissa_bits;
89    boolean always_preserve_nans = true;
90    boolean maybe_correct_denorm_rounding = true;
91 
92    lp_build_context_init(&f32_bld, gallivm, f32_type);
93    lp_build_context_init(&i32_bld, gallivm, i32_type);
94 
95    i32_smallexpmask = lp_build_const_int_vec(gallivm, i32_type,
96                                              ((1 << exponent_bits) - 1) << 23);
97    i32_floatexpmask = lp_build_const_int_vec(gallivm, i32_type, 0xff << 23);
98 
99    i32_src = LLVMBuildBitCast(builder, src, i32_bld.vec_type, "");
100 
101    if (has_sign) {
102       rescale_src = src;
103    }
104    else {
105       /* clamp to pos range (can still have sign bit if NaN or negative zero) */
106       rescale_src = lp_build_max(&f32_bld, zero, src);
107    }
108    rescale_src = LLVMBuildBitCast(builder, rescale_src, i32_bld.vec_type, "");
109 
110    /* "ordinary" number */
111    /*
112     * get rid of excess mantissa bits and sign bit
113     * This is only really needed for correct rounding of denorms I think
114     * but only if we use the preserve NaN path does using
115     * src_abs instead save us any instruction.
116     */
117    if (maybe_correct_denorm_rounding || !always_preserve_nans) {
118       i32_roundmask = lp_build_const_int_vec(gallivm, i32_type,
119                                              ~((1 << (23 - mantissa_bits)) - 1) &
120                                              0x7fffffff);
121       rescale_src = LLVMBuildBitCast(builder, rescale_src, i32_bld.vec_type, "");
122       rescale_src = lp_build_and(&i32_bld, rescale_src, i32_roundmask);
123       rescale_src = LLVMBuildBitCast(builder, rescale_src, f32_bld.vec_type, "");
124    }
125    else {
126       rescale_src = lp_build_abs(&f32_bld, src);
127    }
128 
129    /* bias exponent (and denormalize if necessary) */
130    magic = lp_build_const_int_vec(gallivm, i32_type,
131                                   ((1 << (exponent_bits - 1)) - 1) << 23);
132    magic = LLVMBuildBitCast(builder, magic, f32_bld.vec_type, "");
133    normal = lp_build_mul(&f32_bld, rescale_src, magic);
134 
135    /* clamp to max value - largest non-infinity number */
136    small_max = lp_build_const_int_vec(gallivm, i32_type,
137                                       (((1 << exponent_bits) - 2) << 23) |
138                                       (((1 << mantissa_bits) - 1) << (23 - mantissa_bits)));
139    small_max = LLVMBuildBitCast(builder, small_max, f32_bld.vec_type, "");
140    normal = lp_build_min(&f32_bld, normal, small_max);
141    normal = LLVMBuildBitCast(builder, normal, i32_bld.vec_type, "");
142 
143    /*
144     * handle nan/inf cases
145     * a little bit tricky since -Inf -> 0, +Inf -> +Inf, +-Nan -> +Nan
146     * (for no sign) else ->Inf -> ->Inf too.
147     * could use explicit "unordered" comparison checking for NaNs
148     * which might save us from calculating src_abs too.
149     * (Cannot actually save the comparison since we need to distinguish
150     * Inf and NaN cases anyway, but it would be better for AVX.)
151     */
152    if (always_preserve_nans) {
153       LLVMValueRef infcheck_src, is_inf, is_nan;
154       LLVMValueRef src_abs = lp_build_abs(&f32_bld, src);
155       src_abs = LLVMBuildBitCast(builder, src_abs, i32_bld.vec_type, "");
156 
157       if (has_sign) {
158          infcheck_src = src_abs;
159       }
160       else {
161          infcheck_src = i32_src;
162       }
163       is_nan = lp_build_compare(gallivm, i32_type, PIPE_FUNC_GREATER,
164                                 src_abs, i32_floatexpmask);
165       is_inf = lp_build_compare(gallivm, i32_type, PIPE_FUNC_EQUAL,
166                                 infcheck_src, i32_floatexpmask);
167       is_nan_or_inf = lp_build_or(&i32_bld, is_nan, is_inf);
168       /* could also set more mantissa bits but need at least the highest mantissa bit */
169       i32_qnanbit = lp_build_const_vec(gallivm, i32_type, 1 << 22);
170       /* combine maxexp with qnanbit */
171       nan_or_inf = lp_build_or(&i32_bld, i32_smallexpmask,
172                                lp_build_and(&i32_bld, is_nan, i32_qnanbit));
173    }
174    else {
175       /*
176        * A couple simplifications, with mostly 2 drawbacks (so disabled):
177        * - it will promote some SNaNs (those which only had bits set
178        * in the mantissa part which got chopped off) to +-Infinity.
179        * (Those bits get chopped off anyway later so can as well use
180        * rescale_src instead of src_abs here saving the calculation of that.)
181        * - for no sign case, it relies on the max() being used for rescale_src
182        * to give back the NaN (which is NOT ieee754r behavior, but should work
183        * with sse2 on a full moon (rather if I got the operand order right) -
184        * we _don't_ have well-defined behavior specified with min/max wrt NaNs,
185        * however, and if it gets converted to cmp/select it may not work (we
186        * don't really have specified behavior for cmp wrt NaNs neither).
187        */
188       rescale_src = LLVMBuildBitCast(builder, rescale_src, i32_bld.vec_type, "");
189       is_nan_or_inf = lp_build_compare(gallivm, i32_type, PIPE_FUNC_GEQUAL,
190                                        rescale_src, i32_floatexpmask);
191       /* note this will introduce excess exponent bits */
192       nan_or_inf = rescale_src;
193    }
194    res = lp_build_select(&i32_bld, is_nan_or_inf, nan_or_inf, normal);
195 
196    if (mantissa_start > 0 || !always_preserve_nans) {
197       /* mask off excess bits */
198       unsigned maskbits = (1 << (mantissa_bits + exponent_bits)) - 1;
199       mask = lp_build_const_int_vec(gallivm, i32_type,
200                                     maskbits << (23 - mantissa_bits));
201       res = lp_build_and(&i32_bld, res, mask);
202    }
203 
204    /* add back sign bit at right position */
205    if (has_sign) {
206       LLVMValueRef sign;
207       struct lp_type u32_type = lp_type_uint_vec(32, 32 * i32_type.length);
208       struct lp_build_context u32_bld;
209       lp_build_context_init(&u32_bld, gallivm, u32_type);
210 
211       mask = lp_build_const_int_vec(gallivm, i32_type, 0x80000000);
212       shift = lp_build_const_int_vec(gallivm, i32_type, 8 - exponent_bits);
213       sign = lp_build_and(&i32_bld, mask, i32_src);
214       sign = lp_build_shr(&u32_bld, sign, shift);
215       res = lp_build_or(&i32_bld, sign, res);
216    }
217 
218    /* shift to final position */
219    if (exponent_start < 23) {
220       shift = lp_build_const_int_vec(gallivm, i32_type, 23 - exponent_start);
221       res = lp_build_shr(&i32_bld, res, shift);
222    }
223    else {
224       shift = lp_build_const_int_vec(gallivm, i32_type, exponent_start - 23);
225       res = lp_build_shl(&i32_bld, res, shift);
226    }
227    return res;
228 }
229 
230 
231 /**
232  * Convert rgba float SoA values to packed r11g11b10 values.
233  *
234  * @param src   SoA float (vector) values to convert.
235  */
236 LLVMValueRef
lp_build_float_to_r11g11b10(struct gallivm_state * gallivm,const LLVMValueRef * src)237 lp_build_float_to_r11g11b10(struct gallivm_state *gallivm,
238                             const LLVMValueRef *src)
239 {
240    LLVMValueRef dst, rcomp, bcomp, gcomp;
241    struct lp_build_context i32_bld;
242    LLVMTypeRef src_type = LLVMTypeOf(*src);
243    unsigned src_length = LLVMGetTypeKind(src_type) == LLVMVectorTypeKind ?
244                             LLVMGetVectorSize(src_type) : 1;
245    struct lp_type i32_type = lp_type_int_vec(32, 32 * src_length);
246 
247    lp_build_context_init(&i32_bld, gallivm, i32_type);
248 
249    /* "rescale" and put in right position */
250    rcomp = lp_build_float_to_smallfloat(gallivm, i32_type, src[0], 6, 5, 0, false);
251    gcomp = lp_build_float_to_smallfloat(gallivm, i32_type, src[1], 6, 5, 11, false);
252    bcomp = lp_build_float_to_smallfloat(gallivm, i32_type, src[2], 5, 5, 22, false);
253 
254    /* combine the values */
255    dst = lp_build_or(&i32_bld, rcomp, gcomp);
256    return lp_build_or(&i32_bld, dst, bcomp);
257 }
258 
259 
260 /**
261  * Convert a float-like value with less exponent and mantissa
262  * bits than a normal float32 to a float32. The mantissa of
263  * the source value is assumed to have an implied 1, and the exponent
264  * is biased. There may be a sign bit.
265  * The source value to extract must be in a 32bit int (bits not part of
266  * the value to convert will be masked off).
267  * This works for things like 11-bit floats or half-floats,
268  * mantissa, exponent (and sign if present) must be packed
269  * the same as they are in a ordinary float.
270  *
271  * @param src             (vector) value to convert
272  * @param mantissa_bits   the number of mantissa bits
273  * @param exponent_bits   the number of exponent bits
274  * @param mantissa_start  the bit start position of the packed component
275  * @param has_sign        if the small float has a sign bit
276  *
277  * ref http://fgiesen.wordpress.com/2012/03/28/half-to-float-done-quic/
278  * ref https://gist.github.com/rygorous/2156668
279  */
280 LLVMValueRef
lp_build_smallfloat_to_float(struct gallivm_state * gallivm,struct lp_type f32_type,LLVMValueRef src,unsigned mantissa_bits,unsigned exponent_bits,unsigned mantissa_start,boolean has_sign)281 lp_build_smallfloat_to_float(struct gallivm_state *gallivm,
282                              struct lp_type f32_type,
283                              LLVMValueRef src,
284                              unsigned mantissa_bits,
285                              unsigned exponent_bits,
286                              unsigned mantissa_start,
287                              boolean has_sign)
288 {
289    LLVMBuilderRef builder = gallivm->builder;
290    LLVMValueRef smallexpmask, i32_floatexpmask, magic;
291    LLVMValueRef wasinfnan, tmp, res, shift, maskabs, srcabs, sign;
292    unsigned exponent_start = mantissa_start + mantissa_bits;
293    struct lp_type i32_type = lp_type_int_vec(32, 32 * f32_type.length);
294    struct lp_build_context f32_bld, i32_bld;
295 
296    lp_build_context_init(&f32_bld, gallivm, f32_type);
297    lp_build_context_init(&i32_bld, gallivm, i32_type);
298 
299    /* extract the component to "float position" */
300    if (exponent_start < 23) {
301       shift = lp_build_const_int_vec(gallivm, i32_type, 23 - exponent_start);
302       src = lp_build_shl(&i32_bld, src, shift);
303    }
304    else {
305       shift = lp_build_const_int_vec(gallivm, i32_type, exponent_start - 23);
306       src = lp_build_shr(&i32_bld, src, shift);
307    }
308    maskabs = lp_build_const_int_vec(gallivm, i32_type,
309                                     ((1 << (mantissa_bits + exponent_bits)) - 1)
310                                     << (23 - mantissa_bits));
311    srcabs = lp_build_and(&i32_bld, src, maskabs);
312 
313    /* now do the actual scaling */
314    smallexpmask = lp_build_const_int_vec(gallivm, i32_type,
315                                          ((1 << exponent_bits) - 1) << 23);
316    i32_floatexpmask = lp_build_const_int_vec(gallivm, i32_type, 0xff << 23);
317 
318    if (0) {
319      /*
320       * Note that this code path, while simpler, will convert small
321       * float denorms to floats according to current cpu denorm mode, if
322       * denorms are disabled it will flush them to zero!
323       * If cpu denorms are enabled, it should be faster though as long as
324       * there's no denorms in the inputs, but if there are actually denorms
325       * it's likely to be an order of magnitude slower (on x86 cpus).
326       */
327 
328       srcabs = LLVMBuildBitCast(builder, srcabs, f32_bld.vec_type, "");
329 
330       /*
331        * magic number has exponent new exp bias + (new exp bias - old exp bias),
332        * mantissa is 0.
333        */
334       magic = lp_build_const_int_vec(gallivm, i32_type,
335                                      (255 - (1 << (exponent_bits - 1))) << 23);
336       magic = LLVMBuildBitCast(builder, magic, f32_bld.vec_type, "");
337 
338       /* adjust exponent and fix denorms */
339       res = lp_build_mul(&f32_bld, srcabs, magic);
340 
341       /*
342        * if exp was max (== NaN or Inf) set new exp to max (keep mantissa),
343        * so a simple "or" will do (because exp adjust will leave mantissa intact)
344        */
345       /* use float compare (better for AVX 8-wide / no AVX2 but else should use int) */
346       smallexpmask = LLVMBuildBitCast(builder, smallexpmask, f32_bld.vec_type, "");
347       wasinfnan = lp_build_compare(gallivm, f32_type, PIPE_FUNC_GEQUAL, srcabs, smallexpmask);
348       res = LLVMBuildBitCast(builder, res, i32_bld.vec_type, "");
349       tmp = lp_build_and(&i32_bld, i32_floatexpmask, wasinfnan);
350       res = lp_build_or(&i32_bld, tmp, res);
351    }
352 
353    else {
354       LLVMValueRef exp_one, isdenorm, denorm, normal, exp_adj;
355 
356       /* denorm (or zero) if exponent is zero */
357       exp_one = lp_build_const_int_vec(gallivm, i32_type, 1 << 23);
358       isdenorm = lp_build_cmp(&i32_bld, PIPE_FUNC_LESS, srcabs, exp_one);
359 
360       /* inf or nan if exponent is max */
361       wasinfnan = lp_build_cmp(&i32_bld, PIPE_FUNC_GEQUAL, srcabs, smallexpmask);
362 
363       /* for denormal (or zero), add (== or) magic exp to mantissa (== srcabs) (as int)
364        * then subtract it (as float).
365        * Another option would be to just do inttofp then do a rescale mul.
366        */
367       magic = lp_build_const_int_vec(gallivm, i32_type,
368                                      (127 - ((1 << (exponent_bits - 1)) - 2)) << 23);
369       denorm = lp_build_or(&i32_bld, srcabs, magic);
370       denorm = LLVMBuildBitCast(builder, denorm, f32_bld.vec_type, "");
371       denorm = lp_build_sub(&f32_bld, denorm,
372                             LLVMBuildBitCast(builder, magic, f32_bld.vec_type, ""));
373       denorm = LLVMBuildBitCast(builder, denorm, i32_bld.vec_type, "");
374 
375       /* for normals, Infs, Nans fix up exponent */
376       exp_adj = lp_build_const_int_vec(gallivm, i32_type,
377                                       (127 - ((1 << (exponent_bits - 1)) - 1)) << 23);
378       normal = lp_build_add(&i32_bld, srcabs, exp_adj);
379       tmp = lp_build_and(&i32_bld, wasinfnan, i32_floatexpmask);
380       normal = lp_build_or(&i32_bld, tmp, normal);
381 
382       res = lp_build_select(&i32_bld, isdenorm, denorm, normal);
383    }
384 
385    if (has_sign) {
386       LLVMValueRef signmask = lp_build_const_int_vec(gallivm, i32_type, 0x80000000);
387       shift = lp_build_const_int_vec(gallivm, i32_type, 8 - exponent_bits);
388       sign = lp_build_shl(&i32_bld, src, shift);
389       sign = lp_build_and(&i32_bld, signmask, sign);
390       res = lp_build_or(&i32_bld, res, sign);
391    }
392 
393    return LLVMBuildBitCast(builder, res, f32_bld.vec_type, "");
394 }
395 
396 
397 /**
398  * Convert packed float format (r11g11b10) value(s) to rgba float SoA values.
399  *
400  * @param src   packed AoS r11g11b10 values (as (vector) int32)
401  * @param dst   pointer to the SoA result values
402  */
403 void
lp_build_r11g11b10_to_float(struct gallivm_state * gallivm,LLVMValueRef src,LLVMValueRef * dst)404 lp_build_r11g11b10_to_float(struct gallivm_state *gallivm,
405                             LLVMValueRef src,
406                             LLVMValueRef *dst)
407 {
408    LLVMTypeRef src_type = LLVMTypeOf(src);
409    unsigned src_length = LLVMGetTypeKind(src_type) == LLVMVectorTypeKind ?
410                             LLVMGetVectorSize(src_type) : 1;
411    struct lp_type f32_type = lp_type_float_vec(32, 32 * src_length);
412 
413    dst[0] = lp_build_smallfloat_to_float(gallivm, f32_type, src, 6, 5, 0, false);
414    dst[1] = lp_build_smallfloat_to_float(gallivm, f32_type, src, 6, 5, 11, false);
415    dst[2] = lp_build_smallfloat_to_float(gallivm, f32_type, src, 5, 5, 22, false);
416 
417    /* Just set alpha to one */
418    dst[3] = lp_build_one(gallivm, f32_type);
419 }
420 
421 
422 static LLVMValueRef
lp_build_rgb9_to_float_helper(struct gallivm_state * gallivm,struct lp_type f32_type,LLVMValueRef src,LLVMValueRef scale,unsigned mantissa_start)423 lp_build_rgb9_to_float_helper(struct gallivm_state *gallivm,
424                               struct lp_type f32_type,
425                               LLVMValueRef src,
426                               LLVMValueRef scale,
427                               unsigned mantissa_start)
428 {
429    LLVMValueRef shift, mask;
430 
431    struct lp_type i32_type = lp_type_int_vec(32, 32 * f32_type.length);
432    struct lp_build_context i32_bld, f32_bld;
433 
434    lp_build_context_init(&i32_bld, gallivm, i32_type);
435    lp_build_context_init(&f32_bld, gallivm, f32_type);
436 
437    /*
438     * This is much easier as other weirdo float formats, since
439     * there's no sign, no Inf/NaN, and there's nothing special
440     * required for normals/denormals neither (as without the implied one
441     * for the mantissa for other formats, everything looks like a denormal).
442     * So just do (float)comp_bits * scale
443     */
444    shift = lp_build_const_int_vec(gallivm, i32_type, mantissa_start);
445    mask = lp_build_const_int_vec(gallivm, i32_type, 0x1ff);
446    src = lp_build_shr(&i32_bld, src, shift);
447    src = lp_build_and(&i32_bld, src, mask);
448    src = lp_build_int_to_float(&f32_bld, src);
449    return lp_build_mul(&f32_bld, src, scale);
450 }
451 
452 
453 /**
454  * Convert shared exponent format (rgb9e5) value(s) to rgba float SoA values.
455  *
456  * @param src   packed AoS rgb9e5 values (as (vector) int32)
457  * @param dst   pointer to the SoA result values
458  */
459 void
lp_build_rgb9e5_to_float(struct gallivm_state * gallivm,LLVMValueRef src,LLVMValueRef * dst)460 lp_build_rgb9e5_to_float(struct gallivm_state *gallivm,
461                          LLVMValueRef src,
462                          LLVMValueRef *dst)
463 {
464    LLVMBuilderRef builder = gallivm->builder;
465    LLVMTypeRef src_type = LLVMTypeOf(src);
466    LLVMValueRef shift, scale, bias, exp;
467    unsigned src_length = LLVMGetTypeKind(src_type) == LLVMVectorTypeKind ?
468                             LLVMGetVectorSize(src_type) : 1;
469    struct lp_type i32_type = lp_type_int_vec(32, 32 * src_length);
470    struct lp_type u32_type = lp_type_uint_vec(32, 32 * src_length);
471    struct lp_type f32_type = lp_type_float_vec(32, 32 * src_length);
472    struct lp_build_context i32_bld, u32_bld, f32_bld;
473 
474    lp_build_context_init(&i32_bld, gallivm, i32_type);
475    lp_build_context_init(&u32_bld, gallivm, u32_type);
476    lp_build_context_init(&f32_bld, gallivm, f32_type);
477 
478    /* extract exponent */
479    shift = lp_build_const_int_vec(gallivm, i32_type, 27);
480    /* this shift needs to be unsigned otherwise need mask */
481    exp = lp_build_shr(&u32_bld, src, shift);
482 
483    /*
484     * scale factor is 2 ^ (exp - bias)
485     * (and additionally corrected here for the mantissa bits)
486     * not using shift because
487     * a) don't have vector shift in a lot of cases
488     * b) shift direction changes hence need 2 shifts + conditional
489     *    (or rotate instruction which is even more rare (for instance XOP))
490     * so use whacky float 2 ^ function instead manipulating exponent
491     * (saves us the float conversion at the end too)
492     */
493    bias = lp_build_const_int_vec(gallivm, i32_type, 127 - (15 + 9));
494    scale = lp_build_add(&i32_bld, exp, bias);
495    shift = lp_build_const_int_vec(gallivm, i32_type, 23);
496    scale = lp_build_shl(&i32_bld, scale, shift);
497    scale = LLVMBuildBitCast(builder, scale, f32_bld.vec_type, "");
498 
499    dst[0] = lp_build_rgb9_to_float_helper(gallivm, f32_type, src, scale, 0);
500    dst[1] = lp_build_rgb9_to_float_helper(gallivm, f32_type, src, scale, 9);
501    dst[2] = lp_build_rgb9_to_float_helper(gallivm, f32_type, src, scale, 18);
502 
503    /* Just set alpha to one */
504    dst[3] = f32_bld.one;
505 }
506