• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright © 2012 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21  * DEALINGS IN THE SOFTWARE.
22  */
23 
24 #include "ir.h"
25 #include "ir_builder.h"
26 #include "ir_optimization.h"
27 #include "ir_rvalue_visitor.h"
28 
/**
 * Bit flags describing which pack/unpack built-ins should be lowered.
 *
 * The values are distinct single bits so they can be OR'ed together into a
 * mask. The two LOWER_PACK_USE_* flags do not name a built-in; they choose
 * between alternative instruction sequences inside the lowering code.
 */
enum lower_packing_builtins_op {
   LOWER_PACK_UNPACK_NONE               = 0,

   /* snorm 2x16 */
   LOWER_PACK_SNORM_2x16                = 1 << 0,
   LOWER_UNPACK_SNORM_2x16              = 1 << 1,

   /* unorm 2x16 */
   LOWER_PACK_UNORM_2x16                = 1 << 2,
   LOWER_UNPACK_UNORM_2x16              = 1 << 3,

   /* half 2x16 */
   LOWER_PACK_HALF_2x16                 = 1 << 4,
   LOWER_UNPACK_HALF_2x16               = 1 << 5,

   /* snorm 4x8 */
   LOWER_PACK_SNORM_4x8                 = 1 << 6,
   LOWER_UNPACK_SNORM_4x8               = 1 << 7,

   /* unorm 4x8 */
   LOWER_PACK_UNORM_4x8                 = 1 << 8,
   LOWER_UNPACK_UNORM_4x8               = 1 << 9,

   /* Emit bitfield_insert / bitfield_extract instead of shift-and-mask. */
   LOWER_PACK_USE_BFI                   = 1 << 10,
   LOWER_PACK_USE_BFE                   = 1 << 11,
};
50 
51 namespace {
52 
53 using namespace ir_builder;
54 
/**
 * A visitor that lowers built-in floating-point pack/unpack expressions
 * such as packSnorm2x16.
 */
59 class lower_packing_builtins_visitor : public ir_rvalue_visitor {
60 public:
61    /**
62     * \param op_mask is a bitmask of `enum lower_packing_builtins_op`
63     */
lower_packing_builtins_visitor(int op_mask)64    explicit lower_packing_builtins_visitor(int op_mask)
65       : op_mask(op_mask),
66         progress(false)
67    {
68       factory.instructions = &factory_instructions;
69    }
70 
/**
 * Destroy the visitor. By now every instruction emitted through the factory
 * must already have left the private list.
 */
virtual ~lower_packing_builtins_visitor()
{
   assert(factory_instructions.is_empty());
}
75 
get_progress()76    bool get_progress() { return progress; }
77 
handle_rvalue(ir_rvalue ** rvalue)78    void handle_rvalue(ir_rvalue **rvalue)
79    {
80       if (!*rvalue)
81 	 return;
82 
83       ir_expression *expr = (*rvalue)->as_expression();
84       if (!expr)
85 	 return;
86 
87       enum lower_packing_builtins_op lowering_op =
88          choose_lowering_op(expr->operation);
89 
90       if (lowering_op == LOWER_PACK_UNPACK_NONE)
91          return;
92 
93       setup_factory(ralloc_parent(expr));
94 
95       ir_rvalue *op0 = expr->operands[0];
96       ralloc_steal(factory.mem_ctx, op0);
97 
98       switch (lowering_op) {
99       case LOWER_PACK_SNORM_2x16:
100          *rvalue = lower_pack_snorm_2x16(op0);
101          break;
102       case LOWER_PACK_SNORM_4x8:
103          *rvalue = lower_pack_snorm_4x8(op0);
104          break;
105       case LOWER_PACK_UNORM_2x16:
106          *rvalue = lower_pack_unorm_2x16(op0);
107          break;
108       case LOWER_PACK_UNORM_4x8:
109          *rvalue = lower_pack_unorm_4x8(op0);
110          break;
111       case LOWER_PACK_HALF_2x16:
112          *rvalue = lower_pack_half_2x16(op0);
113          break;
114       case LOWER_UNPACK_SNORM_2x16:
115          *rvalue = lower_unpack_snorm_2x16(op0);
116          break;
117       case LOWER_UNPACK_SNORM_4x8:
118          *rvalue = lower_unpack_snorm_4x8(op0);
119          break;
120       case LOWER_UNPACK_UNORM_2x16:
121          *rvalue = lower_unpack_unorm_2x16(op0);
122          break;
123       case LOWER_UNPACK_UNORM_4x8:
124          *rvalue = lower_unpack_unorm_4x8(op0);
125          break;
126       case LOWER_UNPACK_HALF_2x16:
127          *rvalue = lower_unpack_half_2x16(op0);
128          break;
129       case LOWER_PACK_UNPACK_NONE:
130       case LOWER_PACK_USE_BFI:
131       case LOWER_PACK_USE_BFE:
132          assert(!"not reached");
133          break;
134       }
135 
136       teardown_factory();
137       progress = true;
138    }
139 
140 private:
141    const int op_mask;
142    bool progress;
143    ir_factory factory;
144    exec_list factory_instructions;
145 
146    /**
147     * Determine the needed lowering operation by filtering \a expr_op
148     * through \ref op_mask.
149     */
150    enum lower_packing_builtins_op
choose_lowering_op(ir_expression_operation expr_op)151    choose_lowering_op(ir_expression_operation expr_op)
152    {
153       /* C++ regards int and enum as fundamentally different types.
154        * So, we can't simply return from each case; we must cast the return
155        * value.
156        */
157       int result;
158 
159       switch (expr_op) {
160       case ir_unop_pack_snorm_2x16:
161          result = op_mask & LOWER_PACK_SNORM_2x16;
162          break;
163       case ir_unop_pack_snorm_4x8:
164          result = op_mask & LOWER_PACK_SNORM_4x8;
165          break;
166       case ir_unop_pack_unorm_2x16:
167          result = op_mask & LOWER_PACK_UNORM_2x16;
168          break;
169       case ir_unop_pack_unorm_4x8:
170          result = op_mask & LOWER_PACK_UNORM_4x8;
171          break;
172       case ir_unop_pack_half_2x16:
173          result = op_mask & LOWER_PACK_HALF_2x16;
174          break;
175       case ir_unop_unpack_snorm_2x16:
176          result = op_mask & LOWER_UNPACK_SNORM_2x16;
177          break;
178       case ir_unop_unpack_snorm_4x8:
179          result = op_mask & LOWER_UNPACK_SNORM_4x8;
180          break;
181       case ir_unop_unpack_unorm_2x16:
182          result = op_mask & LOWER_UNPACK_UNORM_2x16;
183          break;
184       case ir_unop_unpack_unorm_4x8:
185          result = op_mask & LOWER_UNPACK_UNORM_4x8;
186          break;
187       case ir_unop_unpack_half_2x16:
188          result = op_mask & LOWER_UNPACK_HALF_2x16;
189          break;
190       default:
191          result = LOWER_PACK_UNPACK_NONE;
192          break;
193       }
194 
195       return static_cast<enum lower_packing_builtins_op>(result);
196    }
197 
198    void
setup_factory(void * mem_ctx)199    setup_factory(void *mem_ctx)
200    {
201       assert(factory.mem_ctx == NULL);
202       assert(factory.instructions->is_empty());
203 
204       factory.mem_ctx = mem_ctx;
205    }
206 
207    void
teardown_factory()208    teardown_factory()
209    {
210       base_ir->insert_before(factory.instructions);
211       assert(factory.instructions->is_empty());
212       factory.mem_ctx = NULL;
213    }
214 
215    template <typename T>
216    ir_constant*
constant(T x)217    constant(T x)
218    {
219       return factory.constant(x);
220    }
221 
222    /**
223     * \brief Pack two uint16's into a single uint32.
224     *
225     * Interpret the given uvec2 as a uint16 pair. Pack the pair into a uint32
226     * where the least significant bits specify the first element of the pair.
227     * Return the uint32.
228     */
229    ir_rvalue*
pack_uvec2_to_uint(ir_rvalue * uvec2_rval)230    pack_uvec2_to_uint(ir_rvalue *uvec2_rval)
231    {
232       assert(uvec2_rval->type == &glsl_type_builtin_uvec2);
233 
234       /* uvec2 u = UVEC2_RVAL; */
235       ir_variable *u = factory.make_temp(&glsl_type_builtin_uvec2,
236                                          "tmp_pack_uvec2_to_uint");
237       factory.emit(assign(u, uvec2_rval));
238 
239       if (op_mask & LOWER_PACK_USE_BFI) {
240          return bitfield_insert(bit_and(swizzle_x(u), constant(0xffffu)),
241                                 swizzle_y(u),
242                                 constant(16u),
243                                 constant(16u));
244       }
245 
246       /* return (u.y << 16) | (u.x & 0xffff); */
247       return bit_or(lshift(swizzle_y(u), constant(16u)),
248                     bit_and(swizzle_x(u), constant(0xffffu)));
249    }
250 
251    /**
252     * \brief Pack four uint8's into a single uint32.
253     *
254     * Interpret the given uvec4 as a uint32 4-typle. Pack the 4-tuple into a
255     * uint32 where the least significant bits specify the first element of the
256     * 4-tuple. Return the uint32.
257     */
258    ir_rvalue*
pack_uvec4_to_uint(ir_rvalue * uvec4_rval)259    pack_uvec4_to_uint(ir_rvalue *uvec4_rval)
260    {
261       assert(uvec4_rval->type == &glsl_type_builtin_uvec4);
262 
263       ir_variable *u = factory.make_temp(&glsl_type_builtin_uvec4,
264                                          "tmp_pack_uvec4_to_uint");
265 
266       if (op_mask & LOWER_PACK_USE_BFI) {
267          /* uvec4 u = UVEC4_RVAL; */
268          factory.emit(assign(u, uvec4_rval));
269 
270          return bitfield_insert(bitfield_insert(
271                                    bitfield_insert(
272                                       bit_and(swizzle_x(u), constant(0xffu)),
273                                       swizzle_y(u), constant(8u), constant(8u)),
274                                    swizzle_z(u), constant(16u), constant(8u)),
275                                 swizzle_w(u), constant(24u), constant(8u));
276       }
277 
278       /* uvec4 u = UVEC4_RVAL & 0xff */
279       factory.emit(assign(u, bit_and(uvec4_rval, constant(0xffu))));
280 
281       /* return (u.w << 24) | (u.z << 16) | (u.y << 8) | u.x; */
282       return bit_or(bit_or(lshift(swizzle_w(u), constant(24u)),
283                            lshift(swizzle_z(u), constant(16u))),
284                     bit_or(lshift(swizzle_y(u), constant(8u)),
285                            swizzle_x(u)));
286    }
287 
288    /**
289     * \brief Unpack a uint32 into two uint16's.
290     *
291     * Interpret the given uint32 as a uint16 pair where the uint32's least
292     * significant bits specify the pair's first element. Return the uint16
293     * pair as a uvec2.
294     */
295    ir_rvalue*
unpack_uint_to_uvec2(ir_rvalue * uint_rval)296    unpack_uint_to_uvec2(ir_rvalue *uint_rval)
297    {
298       assert(uint_rval->type == &glsl_type_builtin_uint);
299 
300       /* uint u = UINT_RVAL; */
301       ir_variable *u = factory.make_temp(&glsl_type_builtin_uint,
302                                           "tmp_unpack_uint_to_uvec2_u");
303       factory.emit(assign(u, uint_rval));
304 
305       /* uvec2 u2; */
306       ir_variable *u2 = factory.make_temp(&glsl_type_builtin_uvec2,
307                                            "tmp_unpack_uint_to_uvec2_u2");
308 
309       /* u2.x = u & 0xffffu; */
310       factory.emit(assign(u2, bit_and(u, constant(0xffffu)), WRITEMASK_X));
311 
312       /* u2.y = u >> 16u; */
313       factory.emit(assign(u2, rshift(u, constant(16u)), WRITEMASK_Y));
314 
315       return deref(u2).val;
316    }
317 
318    /**
319     * \brief Unpack a uint32 into two int16's.
320     *
321     * Specifically each 16-bit value is sign-extended to the full width of an
322     * int32 on return.
323     */
324    ir_rvalue *
unpack_uint_to_ivec2(ir_rvalue * uint_rval)325    unpack_uint_to_ivec2(ir_rvalue *uint_rval)
326    {
327       assert(uint_rval->type == &glsl_type_builtin_uint);
328 
329       if (!(op_mask & LOWER_PACK_USE_BFE)) {
330          return rshift(lshift(u2i(unpack_uint_to_uvec2(uint_rval)),
331                               constant(16u)),
332                        constant(16u));
333       }
334 
335       ir_variable *i = factory.make_temp(&glsl_type_builtin_int,
336                                          "tmp_unpack_uint_to_ivec2_i");
337       factory.emit(assign(i, u2i(uint_rval)));
338 
339       /* ivec2 i2; */
340       ir_variable *i2 = factory.make_temp(&glsl_type_builtin_ivec2,
341                                           "tmp_unpack_uint_to_ivec2_i2");
342 
343       factory.emit(assign(i2, bitfield_extract(i, constant(0), constant(16)),
344                           WRITEMASK_X));
345       factory.emit(assign(i2, bitfield_extract(i, constant(16), constant(16)),
346                           WRITEMASK_Y));
347 
348       return deref(i2).val;
349    }
350 
351    /**
352     * \brief Unpack a uint32 into four uint8's.
353     *
354     * Interpret the given uint32 as a uint8 4-tuple where the uint32's least
355     * significant bits specify the 4-tuple's first element. Return the uint8
356     * 4-tuple as a uvec4.
357     */
358    ir_rvalue*
unpack_uint_to_uvec4(ir_rvalue * uint_rval)359    unpack_uint_to_uvec4(ir_rvalue *uint_rval)
360    {
361       assert(uint_rval->type == &glsl_type_builtin_uint);
362 
363       /* uint u = UINT_RVAL; */
364       ir_variable *u = factory.make_temp(&glsl_type_builtin_uint,
365                                           "tmp_unpack_uint_to_uvec4_u");
366       factory.emit(assign(u, uint_rval));
367 
368       /* uvec4 u4; */
369       ir_variable *u4 = factory.make_temp(&glsl_type_builtin_uvec4,
370                                            "tmp_unpack_uint_to_uvec4_u4");
371 
372       /* u4.x = u & 0xffu; */
373       factory.emit(assign(u4, bit_and(u, constant(0xffu)), WRITEMASK_X));
374 
375       if (op_mask & LOWER_PACK_USE_BFE) {
376          /* u4.y = bitfield_extract(u, 8, 8); */
377          factory.emit(assign(u4, bitfield_extract(u, constant(8u), constant(8u)),
378                              WRITEMASK_Y));
379 
380          /* u4.z = bitfield_extract(u, 16, 8); */
381          factory.emit(assign(u4, bitfield_extract(u, constant(16u), constant(8u)),
382                              WRITEMASK_Z));
383       } else {
384          /* u4.y = (u >> 8u) & 0xffu; */
385          factory.emit(assign(u4, bit_and(rshift(u, constant(8u)),
386                                          constant(0xffu)), WRITEMASK_Y));
387 
388          /* u4.z = (u >> 16u) & 0xffu; */
389          factory.emit(assign(u4, bit_and(rshift(u, constant(16u)),
390                                          constant(0xffu)), WRITEMASK_Z));
391       }
392 
393       /* u4.w = (u >> 24u) */
394       factory.emit(assign(u4, rshift(u, constant(24u)), WRITEMASK_W));
395 
396       return deref(u4).val;
397    }
398 
399    /**
400     * \brief Unpack a uint32 into four int8's.
401     *
402     * Specifically each 8-bit value is sign-extended to the full width of an
403     * int32 on return.
404     */
405    ir_rvalue *
unpack_uint_to_ivec4(ir_rvalue * uint_rval)406    unpack_uint_to_ivec4(ir_rvalue *uint_rval)
407    {
408       assert(uint_rval->type == &glsl_type_builtin_uint);
409 
410       if (!(op_mask & LOWER_PACK_USE_BFE)) {
411          return rshift(lshift(u2i(unpack_uint_to_uvec4(uint_rval)),
412                               constant(24u)),
413                        constant(24u));
414       }
415 
416       ir_variable *i = factory.make_temp(&glsl_type_builtin_int,
417                                          "tmp_unpack_uint_to_ivec4_i");
418       factory.emit(assign(i, u2i(uint_rval)));
419 
420       /* ivec4 i4; */
421       ir_variable *i4 = factory.make_temp(&glsl_type_builtin_ivec4,
422                                           "tmp_unpack_uint_to_ivec4_i4");
423 
424       factory.emit(assign(i4, bitfield_extract(i, constant(0), constant(8)),
425                           WRITEMASK_X));
426       factory.emit(assign(i4, bitfield_extract(i, constant(8), constant(8)),
427                           WRITEMASK_Y));
428       factory.emit(assign(i4, bitfield_extract(i, constant(16), constant(8)),
429                           WRITEMASK_Z));
430       factory.emit(assign(i4, bitfield_extract(i, constant(24), constant(8)),
431                           WRITEMASK_W));
432 
433       return deref(i4).val;
434    }
435 
436    /**
437     * \brief Lower a packSnorm2x16 expression.
438     *
439     * \param vec2_rval is packSnorm2x16's input
440     * \return packSnorm2x16's output as a uint rvalue
441     */
442    ir_rvalue*
lower_pack_snorm_2x16(ir_rvalue * vec2_rval)443    lower_pack_snorm_2x16(ir_rvalue *vec2_rval)
444    {
445       /* From page 88 (94 of pdf) of the GLSL ES 3.00 spec:
446        *
447        *    highp uint packSnorm2x16(vec2 v)
448        *    --------------------------------
449        *    First, converts each component of the normalized floating-point value
450        *    v into 16-bit integer values. Then, the results are packed into the
451        *    returned 32-bit unsigned integer.
452        *
453        *    The conversion for component c of v to fixed point is done as
454        *    follows:
455        *
456        *       packSnorm2x16: round(clamp(c, -1, +1) * 32767.0)
457        *
458        *    The first component of the vector will be written to the least
459        *    significant bits of the output; the last component will be written to
460        *    the most significant bits.
461        *
462        * This function generates IR that approximates the following pseudo-GLSL:
463        *
464        *     return pack_uvec2_to_uint(
465        *         uvec2(ivec2(
466        *           round(clamp(VEC2_RVALUE, -1.0f, 1.0f) * 32767.0f))));
467        *
468        * It is necessary to first convert the vec2 to ivec2 rather than directly
469        * converting vec2 to uvec2 because the latter conversion is undefined.
470        * From page 56 (62 of pdf) of the GLSL ES 3.00 spec: "It is undefined to
471        * convert a negative floating point value to an uint".
472        */
473       assert(vec2_rval->type == &glsl_type_builtin_vec2);
474 
475       ir_rvalue *result = pack_uvec2_to_uint(
476             i2u(f2i(round_even(mul(clamp(vec2_rval,
477                                          constant(-1.0f),
478                                          constant(1.0f)),
479                                    constant(32767.0f))))));
480 
481       assert(result->type == &glsl_type_builtin_uint);
482       return result;
483    }
484 
485    /**
486     * \brief Lower a packSnorm4x8 expression.
487     *
488     * \param vec4_rval is packSnorm4x8's input
489     * \return packSnorm4x8's output as a uint rvalue
490     */
491    ir_rvalue*
lower_pack_snorm_4x8(ir_rvalue * vec4_rval)492    lower_pack_snorm_4x8(ir_rvalue *vec4_rval)
493    {
494       /* From page 137 (143 of pdf) of the GLSL 4.30 spec:
495        *
496        *    highp uint packSnorm4x8(vec4 v)
497        *    -------------------------------
498        *    First, converts each component of the normalized floating-point value
499        *    v into 8-bit integer values. Then, the results are packed into the
500        *    returned 32-bit unsigned integer.
501        *
502        *    The conversion for component c of v to fixed point is done as
503        *    follows:
504        *
505        *       packSnorm4x8: round(clamp(c, -1, +1) * 127.0)
506        *
507        *    The first component of the vector will be written to the least
508        *    significant bits of the output; the last component will be written to
509        *    the most significant bits.
510        *
511        * This function generates IR that approximates the following pseudo-GLSL:
512        *
513        *     return pack_uvec4_to_uint(
514        *         uvec4(ivec4(
515        *           round(clamp(VEC4_RVALUE, -1.0f, 1.0f) * 127.0f))));
516        *
517        * It is necessary to first convert the vec4 to ivec4 rather than directly
518        * converting vec4 to uvec4 because the latter conversion is undefined.
519        * From page 87 (93 of pdf) of the GLSL 4.30 spec: "It is undefined to
520        * convert a negative floating point value to an uint".
521        */
522       assert(vec4_rval->type == &glsl_type_builtin_vec4);
523 
524       ir_rvalue *result = pack_uvec4_to_uint(
525             i2u(f2i(round_even(mul(clamp(vec4_rval,
526                                          constant(-1.0f),
527                                          constant(1.0f)),
528                                    constant(127.0f))))));
529 
530       assert(result->type == &glsl_type_builtin_uint);
531       return result;
532    }
533 
534    /**
535     * \brief Lower an unpackSnorm2x16 expression.
536     *
537     * \param uint_rval is unpackSnorm2x16's input
538     * \return unpackSnorm2x16's output as a vec2 rvalue
539     */
540    ir_rvalue*
lower_unpack_snorm_2x16(ir_rvalue * uint_rval)541    lower_unpack_snorm_2x16(ir_rvalue *uint_rval)
542    {
543       /* From page 88 (94 of pdf) of the GLSL ES 3.00 spec:
544        *
545        *    highp vec2 unpackSnorm2x16 (highp uint p)
546        *    -----------------------------------------
547        *    First, unpacks a single 32-bit unsigned integer p into a pair of
548        *    16-bit unsigned integers. Then, each component is converted to
549        *    a normalized floating-point value to generate the returned
550        *    two-component vector.
551        *
552        *    The conversion for unpacked fixed-point value f to floating point is
553        *    done as follows:
554        *
555        *       unpackSnorm2x16: clamp(f / 32767.0, -1,+1)
556        *
557        *    The first component of the returned vector will be extracted from the
558        *    least significant bits of the input; the last component will be
559        *    extracted from the most significant bits.
560        *
561        * This function generates IR that approximates the following pseudo-GLSL:
562        *
563        *    return clamp(
564        *       ((ivec2(unpack_uint_to_uvec2(UINT_RVALUE)) << 16) >> 16) / 32767.0f,
565        *       -1.0f, 1.0f);
566        *
567        * The above IR may appear unnecessarily complex, but the intermediate
568        * conversion to ivec2 and the bit shifts are necessary to correctly unpack
569        * negative floats.
570        *
571        * To see why, consider packing and then unpacking vec2(-1.0, 0.0).
572        * packSnorm2x16 encodes -1.0 as the int16 0xffff. During unpacking, we
573        * place that int16 into an int32, which results in the *positive* integer
574        * 0x0000ffff.  The int16's sign bit becomes, in the int32, the rather
575        * unimportant bit 16. We must now extend the int16's sign bit into bits
576        * 17-32, which is accomplished by left-shifting then right-shifting.
577        */
578 
579       assert(uint_rval->type == &glsl_type_builtin_uint);
580 
581       ir_rvalue *result =
582         clamp(div(i2f(unpack_uint_to_ivec2(uint_rval)),
583                   constant(32767.0f)),
584               constant(-1.0f),
585               constant(1.0f));
586 
587       assert(result->type == &glsl_type_builtin_vec2);
588       return result;
589    }
590 
591    /**
592     * \brief Lower an unpackSnorm4x8 expression.
593     *
594     * \param uint_rval is unpackSnorm4x8's input
595     * \return unpackSnorm4x8's output as a vec4 rvalue
596     */
597    ir_rvalue*
lower_unpack_snorm_4x8(ir_rvalue * uint_rval)598    lower_unpack_snorm_4x8(ir_rvalue *uint_rval)
599    {
600       /* From page 137 (143 of pdf) of the GLSL 4.30 spec:
601        *
602        *    highp vec4 unpackSnorm4x8 (highp uint p)
603        *    ----------------------------------------
604        *    First, unpacks a single 32-bit unsigned integer p into four
605        *    8-bit unsigned integers. Then, each component is converted to
606        *    a normalized floating-point value to generate the returned
607        *    four-component vector.
608        *
609        *    The conversion for unpacked fixed-point value f to floating point is
610        *    done as follows:
611        *
612        *       unpackSnorm4x8: clamp(f / 127.0, -1, +1)
613        *
614        *    The first component of the returned vector will be extracted from the
615        *    least significant bits of the input; the last component will be
616        *    extracted from the most significant bits.
617        *
618        * This function generates IR that approximates the following pseudo-GLSL:
619        *
620        *    return clamp(
621        *       ((ivec4(unpack_uint_to_uvec4(UINT_RVALUE)) << 24) >> 24) / 127.0f,
622        *       -1.0f, 1.0f);
623        *
624        * The above IR may appear unnecessarily complex, but the intermediate
625        * conversion to ivec4 and the bit shifts are necessary to correctly unpack
626        * negative floats.
627        *
628        * To see why, consider packing and then unpacking vec4(-1.0, 0.0, 0.0,
629        * 0.0). packSnorm4x8 encodes -1.0 as the int8 0xff. During unpacking, we
630        * place that int8 into an int32, which results in the *positive* integer
631        * 0x000000ff.  The int8's sign bit becomes, in the int32, the rather
632        * unimportant bit 8. We must now extend the int8's sign bit into bits
633        * 9-32, which is accomplished by left-shifting then right-shifting.
634        */
635 
636       assert(uint_rval->type == &glsl_type_builtin_uint);
637 
638       ir_rvalue *result =
639         clamp(div(i2f(unpack_uint_to_ivec4(uint_rval)),
640                   constant(127.0f)),
641               constant(-1.0f),
642               constant(1.0f));
643 
644       assert(result->type == &glsl_type_builtin_vec4);
645       return result;
646    }
647 
648    /**
649     * \brief Lower a packUnorm2x16 expression.
650     *
651     * \param vec2_rval is packUnorm2x16's input
652     * \return packUnorm2x16's output as a uint rvalue
653     */
654    ir_rvalue*
lower_pack_unorm_2x16(ir_rvalue * vec2_rval)655    lower_pack_unorm_2x16(ir_rvalue *vec2_rval)
656    {
657       /* From page 88 (94 of pdf) of the GLSL ES 3.00 spec:
658        *
659        *    highp uint packUnorm2x16 (vec2 v)
660        *    ---------------------------------
661        *    First, converts each component of the normalized floating-point value
662        *    v into 16-bit integer values. Then, the results are packed into the
663        *    returned 32-bit unsigned integer.
664        *
665        *    The conversion for component c of v to fixed point is done as
666        *    follows:
667        *
668        *       packUnorm2x16: round(clamp(c, 0, +1) * 65535.0)
669        *
670        *    The first component of the vector will be written to the least
671        *    significant bits of the output; the last component will be written to
672        *    the most significant bits.
673        *
674        * This function generates IR that approximates the following pseudo-GLSL:
675        *
676        *     return pack_uvec2_to_uint(uvec2(
677        *                round(clamp(VEC2_RVALUE, 0.0f, 1.0f) * 65535.0f)));
678        *
679        * Here it is safe to directly convert the vec2 to uvec2 because the vec2
680        * has been clamped to a non-negative range.
681        */
682 
683       assert(vec2_rval->type == &glsl_type_builtin_vec2);
684 
685       ir_rvalue *result = pack_uvec2_to_uint(
686          f2u(round_even(mul(saturate(vec2_rval), constant(65535.0f)))));
687 
688       assert(result->type == &glsl_type_builtin_uint);
689       return result;
690    }
691 
692    /**
693     * \brief Lower a packUnorm4x8 expression.
694     *
695     * \param vec4_rval is packUnorm4x8's input
696     * \return packUnorm4x8's output as a uint rvalue
697     */
698    ir_rvalue*
lower_pack_unorm_4x8(ir_rvalue * vec4_rval)699    lower_pack_unorm_4x8(ir_rvalue *vec4_rval)
700    {
701       /* From page 137 (143 of pdf) of the GLSL 4.30 spec:
702        *
703        *    highp uint packUnorm4x8 (vec4 v)
704        *    --------------------------------
705        *    First, converts each component of the normalized floating-point value
706        *    v into 8-bit integer values. Then, the results are packed into the
707        *    returned 32-bit unsigned integer.
708        *
709        *    The conversion for component c of v to fixed point is done as
710        *    follows:
711        *
712        *       packUnorm4x8: round(clamp(c, 0, +1) * 255.0)
713        *
714        *    The first component of the vector will be written to the least
715        *    significant bits of the output; the last component will be written to
716        *    the most significant bits.
717        *
718        * This function generates IR that approximates the following pseudo-GLSL:
719        *
720        *     return pack_uvec4_to_uint(uvec4(
721        *                round(clamp(VEC2_RVALUE, 0.0f, 1.0f) * 255.0f)));
722        *
723        * Here it is safe to directly convert the vec4 to uvec4 because the vec4
724        * has been clamped to a non-negative range.
725        */
726 
727       assert(vec4_rval->type == &glsl_type_builtin_vec4);
728 
729       ir_rvalue *result = pack_uvec4_to_uint(
730          f2u(round_even(mul(saturate(vec4_rval), constant(255.0f)))));
731 
732       assert(result->type == &glsl_type_builtin_uint);
733       return result;
734    }
735 
736    /**
737     * \brief Lower an unpackUnorm2x16 expression.
738     *
739     * \param uint_rval is unpackUnorm2x16's input
740     * \return unpackUnorm2x16's output as a vec2 rvalue
741     */
742    ir_rvalue*
lower_unpack_unorm_2x16(ir_rvalue * uint_rval)743    lower_unpack_unorm_2x16(ir_rvalue *uint_rval)
744    {
745       /* From page 89 (95 of pdf) of the GLSL ES 3.00 spec:
746        *
747        *    highp vec2 unpackUnorm2x16 (highp uint p)
748        *    -----------------------------------------
749        *    First, unpacks a single 32-bit unsigned integer p into a pair of
750        *    16-bit unsigned integers. Then, each component is converted to
751        *    a normalized floating-point value to generate the returned
752        *    two-component vector.
753        *
754        *    The conversion for unpacked fixed-point value f to floating point is
755        *    done as follows:
756        *
757        *       unpackUnorm2x16: f / 65535.0
758        *
759        *    The first component of the returned vector will be extracted from the
760        *    least significant bits of the input; the last component will be
761        *    extracted from the most significant bits.
762        *
763        * This function generates IR that approximates the following pseudo-GLSL:
764        *
765        *     return vec2(unpack_uint_to_uvec2(UINT_RVALUE)) / 65535.0;
766        */
767 
768       assert(uint_rval->type == &glsl_type_builtin_uint);
769 
770       ir_rvalue *result = div(u2f(unpack_uint_to_uvec2(uint_rval)),
771                               constant(65535.0f));
772 
773       assert(result->type == &glsl_type_builtin_vec2);
774       return result;
775    }
776 
777    /**
778     * \brief Lower an unpackUnorm4x8 expression.
779     *
780     * \param uint_rval is unpackUnorm4x8's input
781     * \return unpackUnorm4x8's output as a vec4 rvalue
782     */
783    ir_rvalue*
lower_unpack_unorm_4x8(ir_rvalue * uint_rval)784    lower_unpack_unorm_4x8(ir_rvalue *uint_rval)
785    {
786       /* From page 137 (143 of pdf) of the GLSL 4.30 spec:
787        *
788        *    highp vec4 unpackUnorm4x8 (highp uint p)
789        *    ----------------------------------------
790        *    First, unpacks a single 32-bit unsigned integer p into four
791        *    8-bit unsigned integers. Then, each component is converted to
792        *    a normalized floating-point value to generate the returned
793        *    two-component vector.
794        *
795        *    The conversion for unpacked fixed-point value f to floating point is
796        *    done as follows:
797        *
798        *       unpackUnorm4x8: f / 255.0
799        *
800        *    The first component of the returned vector will be extracted from the
801        *    least significant bits of the input; the last component will be
802        *    extracted from the most significant bits.
803        *
804        * This function generates IR that approximates the following pseudo-GLSL:
805        *
806        *     return vec4(unpack_uint_to_uvec4(UINT_RVALUE)) / 255.0;
807        */
808 
809       assert(uint_rval->type == &glsl_type_builtin_uint);
810 
811       ir_rvalue *result = div(u2f(unpack_uint_to_uvec4(uint_rval)),
812                               constant(255.0f));
813 
814       assert(result->type == &glsl_type_builtin_vec4);
815       return result;
816    }
817 
818    /**
819     * \brief Lower the component-wise calculation of packHalf2x16.
820     *
821     * \param f_rval is one component of packHalf2x16's input
822     * \param e_rval is the unshifted exponent bits of f_rval
823     * \param m_rval is the unshifted mantissa bits of f_rval
824     *
825     * \return a uint rvalue that encodes a float16 in its lower 16 bits
826     */
827    ir_rvalue*
pack_half_1x16_nosign(ir_rvalue * f_rval,ir_rvalue * e_rval,ir_rvalue * m_rval)828    pack_half_1x16_nosign(ir_rvalue *f_rval,
829                          ir_rvalue *e_rval,
830                          ir_rvalue *m_rval)
831    {
832       assert(e_rval->type == &glsl_type_builtin_uint);
833       assert(m_rval->type == &glsl_type_builtin_uint);
834 
835       /* uint u16; */
836       ir_variable *u16 = factory.make_temp(&glsl_type_builtin_uint,
837                                            "tmp_pack_half_1x16_u16");
838 
839       /* float f = FLOAT_RVAL; */
840       ir_variable *f = factory.make_temp(&glsl_type_builtin_float,
841                                           "tmp_pack_half_1x16_f");
842       factory.emit(assign(f, f_rval));
843 
844       /* uint e = E_RVAL; */
845       ir_variable *e = factory.make_temp(&glsl_type_builtin_uint,
846                                           "tmp_pack_half_1x16_e");
847       factory.emit(assign(e, e_rval));
848 
849       /* uint m = M_RVAL; */
850       ir_variable *m = factory.make_temp(&glsl_type_builtin_uint,
851                                           "tmp_pack_half_1x16_m");
852       factory.emit(assign(m, m_rval));
853 
854       /* Preliminaries
855        * -------------
856        *
857        * For a float16, the bit layout is:
858        *
859        *   sign:     15
860        *   exponent: 10:14
861        *   mantissa: 0:9
862        *
863        * Let f16 be a float16 value. The sign, exponent, and mantissa
864        * determine its value thus:
865        *
866        *   if e16 = 0 and m16 = 0, then zero:       (-1)^s16 * 0                               (1)
867        *   if e16 = 0 and m16!= 0, then subnormal:  (-1)^s16 * 2^(e16 - 14) * (m16 / 2^10)     (2)
868        *   if 0 < e16 < 31, then normal:            (-1)^s16 * 2^(e16 - 15) * (1 + m16 / 2^10) (3)
869        *   if e16 = 31 and m16 = 0, then infinite:  (-1)^s16 * inf                             (4)
870        *   if e16 = 31 and m16 != 0, then           NaN                                        (5)
871        *
872        * where 0 <= m16 < 2^10.
873        *
874        * For a float32, the bit layout is:
875        *
876        *   sign:     31
877        *   exponent: 23:30
878        *   mantissa: 0:22
879        *
880        * Let f32 be a float32 value. The sign, exponent, and mantissa
881        * determine its value thus:
882        *
883        *   if e32 = 0 and m32 = 0, then zero:        (-1)^s * 0                                (10)
884        *   if e32 = 0 and m32 != 0, then subnormal:  (-1)^s * 2^(e32 - 126) * (m32 / 2^23)     (11)
885        *   if 0 < e32 < 255, then normal:            (-1)^s * 2^(e32 - 127) * (1 + m32 / 2^23) (12)
886        *   if e32 = 255 and m32 = 0, then infinite:  (-1)^s * inf                              (13)
887        *   if e32 = 255 and m32 != 0, then           NaN                                       (14)
888        *
889        * where 0 <= m32 < 2^23.
890        *
891        * The minimum and maximum normal float16 values are
892        *
893        *   min_norm16 = 2^(1 - 15) * (1 + 0 / 2^10) = 2^(-14)   (20)
894        *   max_norm16 = 2^(30 - 15) * (1 + 1023 / 2^10)         (21)
895        *
896        * The step at max_norm16 is
897        *
898        *   max_step16 = 2^5                                     (22)
899        *
900        * Observe that the float16 boundary values in equations 20-21 lie in the
901        * range of normal float32 values.
902        *
903        *
904        * Rounding Behavior
905        * -----------------
906        * Not all float32 values can be exactly represented as a float16. We
907        * round all such intermediate float32 values to the nearest float16; if
908        * the float32 is exactly between to float16 values, we round to the one
909        * with an even mantissa. This rounding behavior has several benefits:
910        *
911        *   - It has no sign bias.
912        *
913        *   - It reproduces the behavior of real hardware: opcode F32TO16 in Intel's
914        *     GPU ISA.
915        *
916        *   - By reproducing the behavior of the GPU (at least on Intel hardware),
917        *     compile-time evaluation of constant packHalf2x16 GLSL expressions will
918        *     result in the same value as if the expression were executed on the
919        *     GPU.
920        *
921        * Calculation
922        * -----------
923        * Our task is to compute s16, e16, m16 given f32.  Since this function
924        * ignores the sign bit, assume that s32 = s16 = 0.  There are several
925        * cases consider.
926        */
927 
928       factory.emit(
929 
930          /* Case 1) f32 is NaN
931           *
932           *   The resultant f16 will also be NaN.
933           */
934 
935          /* if (e32 == 255 && m32 != 0) { */
936          if_tree(logic_and(equal(e, constant(0xffu << 23u)),
937                            logic_not(equal(m, constant(0u)))),
938 
939             assign(u16, constant(0x7fffu)),
940 
941          /* Case 2) f32 lies in the range [0, min_norm16).
942           *
943           *   The resultant float16 will be either zero, subnormal, or normal.
944           *
945           *   Solving
946           *
947           *     f32 = min_norm16       (30)
948           *
949           *   gives
950           *
951           *     e32 = 113 and m32 = 0  (31)
952           *
953           *   Therefore this case occurs if and only if
954           *
955           *     e32 < 113              (32)
956           */
957 
958          /* } else if (e32 < 113) { */
959          if_tree(less(e, constant(113u << 23u)),
960 
961             /* u16 = uint(round_to_even(abs(f32) * float(1u << 24u))); */
962             assign(u16, f2u(round_even(mul(expr(ir_unop_abs, f),
963                                            constant((float) (1 << 24)))))),
964 
965          /* Case 3) f32 lies in the range
966           *         [min_norm16, max_norm16 + max_step16).
967           *
968           *   The resultant float16 will be either normal or infinite.
969           *
970           *   Solving
971           *
972           *     f32 = max_norm16 + max_step16           (40)
973           *         = 2^15 * (1 + 1023 / 2^10) + 2^5    (41)
974           *         = 2^16                              (42)
975           *   gives
976           *
977           *     e32 = 143 and m32 = 0                   (43)
978           *
979           *   We already solved the boundary condition f32 = min_norm16 above
980           *   in equation 31. Therefore this case occurs if and only if
981           *
982           *     113 <= e32 and e32 < 143
983           */
984 
985          /* } else if (e32 < 143) { */
986          if_tree(less(e, constant(143u << 23u)),
987 
988             /* The addition below handles the case where the mantissa rounds
989              * up to 1024 and bumps the exponent.
990              *
991              * u16 = ((e - (112u << 23u)) >> 13u)
992              *     + round_to_even((float(m) / (1u << 13u));
993              */
994             assign(u16, add(rshift(sub(e, constant(112u << 23u)),
995                                    constant(13u)),
996                             f2u(round_even(
997                                   div(u2f(m), constant((float) (1 << 13))))))),
998 
999          /* Case 4) f32 lies in the range [max_norm16 + max_step16, inf].
1000           *
1001           *   The resultant float16 will be infinite.
1002           *
1003           *   The cases above caught all float32 values in the range
1004           *   [0, max_norm16 + max_step16), so this is the fall-through case.
1005           */
1006 
1007          /* } else { */
1008 
1009             assign(u16, constant(31u << 10u))))));
1010 
1011          /* } */
1012 
1013        return deref(u16).val;
1014    }
1015 
1016    /**
1017     * \brief Lower a packHalf2x16 expression.
1018     *
1019     * \param vec2_rval is packHalf2x16's input
1020     * \return packHalf2x16's output as a uint rvalue
1021     */
1022    ir_rvalue*
lower_pack_half_2x16(ir_rvalue * vec2_rval)1023    lower_pack_half_2x16(ir_rvalue *vec2_rval)
1024    {
1025       /* From page 89 (95 of pdf) of the GLSL ES 3.00 spec:
1026        *
1027        *    highp uint packHalf2x16 (mediump vec2 v)
1028        *    ----------------------------------------
1029        *    Returns an unsigned integer obtained by converting the components of
1030        *    a two-component floating-point vector to the 16-bit floating-point
1031        *    representation found in the OpenGL ES Specification, and then packing
1032        *    these two 16-bit integers into a 32-bit unsigned integer.
1033        *
1034        *    The first vector component specifies the 16 least- significant bits
1035        *    of the result; the second component specifies the 16 most-significant
1036        *    bits.
1037        */
1038 
1039       assert(vec2_rval->type == &glsl_type_builtin_vec2);
1040 
1041       /* vec2 f = VEC2_RVAL; */
1042       ir_variable *f = factory.make_temp(&glsl_type_builtin_vec2,
1043                                          "tmp_pack_half_2x16_f");
1044       factory.emit(assign(f, vec2_rval));
1045 
1046       /* uvec2 f32 = bitcast_f2u(f); */
1047       ir_variable *f32 = factory.make_temp(&glsl_type_builtin_uvec2,
1048                                             "tmp_pack_half_2x16_f32");
1049       factory.emit(assign(f32, expr(ir_unop_bitcast_f2u, f)));
1050 
1051       /* uvec2 f16; */
1052       ir_variable *f16 = factory.make_temp(&glsl_type_builtin_uvec2,
1053                                         "tmp_pack_half_2x16_f16");
1054 
1055       /* Get f32's unshifted exponent bits.
1056        *
1057        *   uvec2 e = f32 & 0x7f800000u;
1058        */
1059       ir_variable *e = factory.make_temp(&glsl_type_builtin_uvec2,
1060                                           "tmp_pack_half_2x16_e");
1061       factory.emit(assign(e, bit_and(f32, constant(0x7f800000u))));
1062 
1063       /* Get f32's unshifted mantissa bits.
1064        *
1065        *   uvec2 m = f32 & 0x007fffffu;
1066        */
1067       ir_variable *m = factory.make_temp(&glsl_type_builtin_uvec2,
1068                                           "tmp_pack_half_2x16_m");
1069       factory.emit(assign(m, bit_and(f32, constant(0x007fffffu))));
1070 
1071       /* Set f16's exponent and mantissa bits.
1072        *
1073        *   f16.x = pack_half_1x16_nosign(e.x, m.x);
1074        *   f16.y = pack_half_1y16_nosign(e.y, m.y);
1075        */
1076       factory.emit(assign(f16, pack_half_1x16_nosign(swizzle_x(f),
1077                                                      swizzle_x(e),
1078                                                      swizzle_x(m)),
1079                            WRITEMASK_X));
1080       factory.emit(assign(f16, pack_half_1x16_nosign(swizzle_y(f),
1081                                                      swizzle_y(e),
1082                                                      swizzle_y(m)),
1083                            WRITEMASK_Y));
1084 
1085       /* Set f16's sign bits.
1086        *
1087        *   f16 |= (f32 & (1u << 31u) >> 16u;
1088        */
1089       factory.emit(
1090          assign(f16, bit_or(f16,
1091                             rshift(bit_and(f32, constant(1u << 31u)),
1092                                    constant(16u)))));
1093 
1094 
1095       /* return (f16.y << 16u) | f16.x; */
1096       ir_rvalue *result = bit_or(lshift(swizzle_y(f16),
1097                                         constant(16u)),
1098                                  swizzle_x(f16));
1099 
1100       assert(result->type == &glsl_type_builtin_uint);
1101       return result;
1102    }
1103 
1104    /**
1105     * \brief Lower the component-wise calculation of unpackHalf2x16.
1106     *
1107     * Given a uint that encodes a float16 in its lower 16 bits, this function
1108     * returns a uint that encodes a float32 with the same value. The sign bit
1109     * of the float16 is ignored.
1110     *
1111     * \param e_rval is the unshifted exponent bits of a float16
1112     * \param m_rval is the unshifted mantissa bits of a float16
1113     * \return a uint rvalue that encodes a float32
1114     */
1115    ir_rvalue*
unpack_half_1x16_nosign(ir_rvalue * e_rval,ir_rvalue * m_rval)1116    unpack_half_1x16_nosign(ir_rvalue *e_rval, ir_rvalue *m_rval)
1117    {
1118       assert(e_rval->type == &glsl_type_builtin_uint);
1119       assert(m_rval->type == &glsl_type_builtin_uint);
1120 
1121       /* uint u32; */
1122       ir_variable *u32 = factory.make_temp(&glsl_type_builtin_uint,
1123                                            "tmp_unpack_half_1x16_u32");
1124 
1125       /* uint e = E_RVAL; */
1126       ir_variable *e = factory.make_temp(&glsl_type_builtin_uint,
1127                                           "tmp_unpack_half_1x16_e");
1128       factory.emit(assign(e, e_rval));
1129 
1130       /* uint m = M_RVAL; */
1131       ir_variable *m = factory.make_temp(&glsl_type_builtin_uint,
1132                                           "tmp_unpack_half_1x16_m");
1133       factory.emit(assign(m, m_rval));
1134 
1135       /* Preliminaries
1136        * -------------
1137        *
1138        * For a float16, the bit layout is:
1139        *
1140        *   sign:     15
1141        *   exponent: 10:14
1142        *   mantissa: 0:9
1143        *
1144        * Let f16 be a float16 value. The sign, exponent, and mantissa
1145        * determine its value thus:
1146        *
1147        *   if e16 = 0 and m16 = 0, then zero:       (-1)^s16 * 0                               (1)
1148        *   if e16 = 0 and m16!= 0, then subnormal:  (-1)^s16 * 2^(e16 - 14) * (m16 / 2^10)     (2)
1149        *   if 0 < e16 < 31, then normal:            (-1)^s16 * 2^(e16 - 15) * (1 + m16 / 2^10) (3)
1150        *   if e16 = 31 and m16 = 0, then infinite:  (-1)^s16 * inf                             (4)
1151        *   if e16 = 31 and m16 != 0, then           NaN                                        (5)
1152        *
1153        * where 0 <= m16 < 2^10.
1154        *
1155        * For a float32, the bit layout is:
1156        *
1157        *   sign: 31
1158        *   exponent: 23:30
1159        *   mantissa: 0:22
1160        *
1161        * Let f32 be a float32 value. The sign, exponent, and mantissa
1162        * determine its value thus:
1163        *
1164        *   if e32 = 0 and m32 = 0, then zero:        (-1)^s * 0                                (10)
1165        *   if e32 = 0 and m32 != 0, then subnormal:  (-1)^s * 2^(e32 - 126) * (m32 / 2^23)     (11)
1166        *   if 0 < e32 < 255, then normal:            (-1)^s * 2^(e32 - 127) * (1 + m32 / 2^23) (12)
1167        *   if e32 = 255 and m32 = 0, then infinite:  (-1)^s * inf                              (13)
1168        *   if e32 = 255 and m32 != 0, then           NaN                                       (14)
1169        *
1170        * where 0 <= m32 < 2^23.
1171        *
1172        * Calculation
1173        * -----------
1174        * Our task is to compute s32, e32, m32 given f16.  Since this function
1175        * ignores the sign bit, assume that s32 = s16 = 0.  There are several
1176        * cases consider.
1177        */
1178 
1179       factory.emit(
1180 
1181          /* Case 1) f16 is zero or subnormal.
1182           *
1183           *   The simplest method of calcuating f32 in this case is
1184           *
1185           *     f32 = f16                       (20)
1186           *         = 2^(-14) * (m16 / 2^10)    (21)
1187           *         = m16 / 2^(-24)             (22)
1188           */
1189 
1190          /* if (e16 == 0) { */
1191          if_tree(equal(e, constant(0u)),
1192 
1193             /* u32 = bitcast_f2u(float(m) / float(1 << 24)); */
1194             assign(u32, expr(ir_unop_bitcast_f2u,
1195                                 div(u2f(m), constant((float)(1 << 24))))),
1196 
1197          /* Case 2) f16 is normal.
1198           *
1199           *   The equation
1200           *
1201           *     f32 = f16                              (30)
1202           *     2^(e32 - 127) * (1 + m32 / 2^23) =     (31)
1203           *       2^(e16 - 15) * (1 + m16 / 2^10)
1204           *
1205           *   can be decomposed into two
1206           *
1207           *     2^(e32 - 127) = 2^(e16 - 15)           (32)
1208           *     1 + m32 / 2^23 = 1 + m16 / 2^10        (33)
1209           *
1210           *   which solve to
1211           *
1212           *     e32 = e16 + 112                        (34)
1213           *     m32 = m16 * 2^13                       (35)
1214           */
1215 
1216          /* } else if (e16 < 31)) { */
1217          if_tree(less(e, constant(31u << 10u)),
1218 
1219               /* u32 = ((e + (112 << 10)) | m) << 13;
1220                */
1221               assign(u32, lshift(bit_or(add(e, constant(112u << 10u)), m),
1222                                  constant(13u))),
1223 
1224 
1225          /* Case 3) f16 is infinite. */
1226          if_tree(equal(m, constant(0u)),
1227 
1228                  assign(u32, constant(255u << 23u)),
1229 
1230          /* Case 4) f16 is NaN. */
1231          /* } else { */
1232 
1233             assign(u32, constant(0x7fffffffu))))));
1234 
1235          /* } */
1236 
1237       return deref(u32).val;
1238    }
1239 
1240    /**
1241     * \brief Lower an unpackHalf2x16 expression.
1242     *
1243     * \param uint_rval is unpackHalf2x16's input
1244     * \return unpackHalf2x16's output as a vec2 rvalue
1245     */
1246    ir_rvalue*
lower_unpack_half_2x16(ir_rvalue * uint_rval)1247    lower_unpack_half_2x16(ir_rvalue *uint_rval)
1248    {
1249       /* From page 89 (95 of pdf) of the GLSL ES 3.00 spec:
1250        *
1251        *    mediump vec2 unpackHalf2x16 (highp uint v)
1252        *    ------------------------------------------
1253        *    Returns a two-component floating-point vector with components
1254        *    obtained by unpacking a 32-bit unsigned integer into a pair of 16-bit
1255        *    values, interpreting those values as 16-bit floating-point numbers
1256        *    according to the OpenGL ES Specification, and converting them to
1257        *    32-bit floating-point values.
1258        *
1259        *    The first component of the vector is obtained from the
1260        *    16 least-significant bits of v; the second component is obtained
1261        *    from the 16 most-significant bits of v.
1262        */
1263       assert(uint_rval->type == &glsl_type_builtin_uint);
1264 
1265       /* uint u = RVALUE;
1266        * uvec2 f16 = uvec2(u.x & 0xffff, u.y >> 16);
1267        */
1268       ir_variable *f16 = factory.make_temp(&glsl_type_builtin_uvec2,
1269                                             "tmp_unpack_half_2x16_f16");
1270       factory.emit(assign(f16, unpack_uint_to_uvec2(uint_rval)));
1271 
1272       /* uvec2 f32; */
1273       ir_variable *f32 = factory.make_temp(&glsl_type_builtin_uvec2,
1274                                             "tmp_unpack_half_2x16_f32");
1275 
1276       /* Get f16's unshifted exponent bits.
1277        *
1278        *    uvec2 e = f16 & 0x7c00u;
1279        */
1280       ir_variable *e = factory.make_temp(&glsl_type_builtin_uvec2,
1281                                           "tmp_unpack_half_2x16_e");
1282       factory.emit(assign(e, bit_and(f16, constant(0x7c00u))));
1283 
1284       /* Get f16's unshifted mantissa bits.
1285        *
1286        *    uvec2 m = f16 & 0x03ffu;
1287        */
1288       ir_variable *m = factory.make_temp(&glsl_type_builtin_uvec2,
1289                                           "tmp_unpack_half_2x16_m");
1290       factory.emit(assign(m, bit_and(f16, constant(0x03ffu))));
1291 
1292       /* Set f32's exponent and mantissa bits.
1293        *
1294        *   f32.x = unpack_half_1x16_nosign(e.x, m.x);
1295        *   f32.y = unpack_half_1x16_nosign(e.y, m.y);
1296        */
1297       factory.emit(assign(f32, unpack_half_1x16_nosign(swizzle_x(e),
1298                                                        swizzle_x(m)),
1299                            WRITEMASK_X));
1300       factory.emit(assign(f32, unpack_half_1x16_nosign(swizzle_y(e),
1301                                                        swizzle_y(m)),
1302                            WRITEMASK_Y));
1303 
1304       /* Set f32's sign bit.
1305        *
1306        *    f32 |= (f16 & 0x8000u) << 16u;
1307        */
1308       factory.emit(assign(f32, bit_or(f32,
1309                                        lshift(bit_and(f16,
1310                                                       constant(0x8000u)),
1311                                               constant(16u)))));
1312 
1313       /* return bitcast_u2f(f32); */
1314       ir_rvalue *result = expr(ir_unop_bitcast_u2f, f32);
1315       assert(result->type == &glsl_type_builtin_vec2);
1316       return result;
1317    }
1318 };
1319 
1320 } // namespace anonymous
1321 
1322 /**
1323  * \brief Lower the builtin packing functions.
1324  */
1325 bool
lower_packing_builtins(exec_list * instructions,bool has_shading_language_packing,bool has_gpu_shader5,bool has_half_float_packing)1326 lower_packing_builtins(exec_list *instructions,
1327                        bool has_shading_language_packing,
1328                        bool has_gpu_shader5,
1329                        bool has_half_float_packing)
1330 {
1331    if (!has_shading_language_packing)
1332       return false;
1333 
1334    int op_mask = LOWER_PACK_SNORM_2x16 |
1335                  LOWER_UNPACK_SNORM_2x16 |
1336                  LOWER_PACK_UNORM_2x16 |
1337                  LOWER_UNPACK_UNORM_2x16 |
1338                  LOWER_PACK_SNORM_4x8 |
1339                  LOWER_UNPACK_SNORM_4x8 |
1340                  LOWER_UNPACK_UNORM_4x8 |
1341                  LOWER_PACK_UNORM_4x8;
1342 
1343    if (has_gpu_shader5)
1344       op_mask |= LOWER_PACK_USE_BFI | LOWER_PACK_USE_BFE;
1345 
1346    if (!has_half_float_packing)
1347       op_mask |= LOWER_PACK_HALF_2x16 | LOWER_UNPACK_HALF_2x16;
1348 
1349    lower_packing_builtins_visitor v(op_mask);
1350    visit_list_elements(&v, instructions, true);
1351    return v.get_progress();
1352 }
1353