1 /*
2 * Copyright © 2012 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21 * DEALINGS IN THE SOFTWARE.
22 */
23
24 #include "ir.h"
25 #include "ir_builder.h"
26 #include "ir_optimization.h"
27 #include "ir_rvalue_visitor.h"
28
29 namespace {
30
31 using namespace ir_builder;
32
33 /**
34 * A visitor that lowers built-in floating-point pack/unpack expressions
35  * such as packSnorm2x16.
36 */
37 class lower_packing_builtins_visitor : public ir_rvalue_visitor {
38 public:
39 /**
40 * \param op_mask is a bitmask of `enum lower_packing_builtins_op`
41 */
42    explicit lower_packing_builtins_visitor(int op_mask)
43 : op_mask(op_mask),
44 progress(false)
45 {
46 factory.instructions = &factory_instructions;
47 }
48
49    virtual ~lower_packing_builtins_visitor()
50 {
51 assert(factory_instructions.is_empty());
52 }
53
54    bool get_progress() { return progress; }
55
56    void handle_rvalue(ir_rvalue **rvalue)
57 {
58 if (!*rvalue)
59 return;
60
61 ir_expression *expr = (*rvalue)->as_expression();
62 if (!expr)
63 return;
64
65 enum lower_packing_builtins_op lowering_op =
66 choose_lowering_op(expr->operation);
67
68 if (lowering_op == LOWER_PACK_UNPACK_NONE)
69 return;
70
71 setup_factory(ralloc_parent(expr));
72
73 ir_rvalue *op0 = expr->operands[0];
74 ralloc_steal(factory.mem_ctx, op0);
75
76 switch (lowering_op) {
77 case LOWER_PACK_SNORM_2x16:
78 *rvalue = lower_pack_snorm_2x16(op0);
79 break;
80 case LOWER_PACK_SNORM_4x8:
81 *rvalue = lower_pack_snorm_4x8(op0);
82 break;
83 case LOWER_PACK_UNORM_2x16:
84 *rvalue = lower_pack_unorm_2x16(op0);
85 break;
86 case LOWER_PACK_UNORM_4x8:
87 *rvalue = lower_pack_unorm_4x8(op0);
88 break;
89 case LOWER_PACK_HALF_2x16:
90 *rvalue = lower_pack_half_2x16(op0);
91 break;
92 case LOWER_UNPACK_SNORM_2x16:
93 *rvalue = lower_unpack_snorm_2x16(op0);
94 break;
95 case LOWER_UNPACK_SNORM_4x8:
96 *rvalue = lower_unpack_snorm_4x8(op0);
97 break;
98 case LOWER_UNPACK_UNORM_2x16:
99 *rvalue = lower_unpack_unorm_2x16(op0);
100 break;
101 case LOWER_UNPACK_UNORM_4x8:
102 *rvalue = lower_unpack_unorm_4x8(op0);
103 break;
104 case LOWER_UNPACK_HALF_2x16:
105 *rvalue = lower_unpack_half_2x16(op0);
106 break;
107 case LOWER_PACK_UNPACK_NONE:
108 case LOWER_PACK_USE_BFI:
109 case LOWER_PACK_USE_BFE:
110 assert(!"not reached");
111 break;
112 }
113
114 teardown_factory();
115 progress = true;
116 }
117
118 private:
119 const int op_mask;
120 bool progress;
121 ir_factory factory;
122 exec_list factory_instructions;
123
124 /**
125 * Determine the needed lowering operation by filtering \a expr_op
126 * through \ref op_mask.
127 */
128 enum lower_packing_builtins_op
129    choose_lowering_op(ir_expression_operation expr_op)
130 {
131 /* C++ regards int and enum as fundamentally different types.
132 * So, we can't simply return from each case; we must cast the return
133 * value.
134 */
135 int result;
136
137 switch (expr_op) {
138 case ir_unop_pack_snorm_2x16:
139 result = op_mask & LOWER_PACK_SNORM_2x16;
140 break;
141 case ir_unop_pack_snorm_4x8:
142 result = op_mask & LOWER_PACK_SNORM_4x8;
143 break;
144 case ir_unop_pack_unorm_2x16:
145 result = op_mask & LOWER_PACK_UNORM_2x16;
146 break;
147 case ir_unop_pack_unorm_4x8:
148 result = op_mask & LOWER_PACK_UNORM_4x8;
149 break;
150 case ir_unop_pack_half_2x16:
151 result = op_mask & LOWER_PACK_HALF_2x16;
152 break;
153 case ir_unop_unpack_snorm_2x16:
154 result = op_mask & LOWER_UNPACK_SNORM_2x16;
155 break;
156 case ir_unop_unpack_snorm_4x8:
157 result = op_mask & LOWER_UNPACK_SNORM_4x8;
158 break;
159 case ir_unop_unpack_unorm_2x16:
160 result = op_mask & LOWER_UNPACK_UNORM_2x16;
161 break;
162 case ir_unop_unpack_unorm_4x8:
163 result = op_mask & LOWER_UNPACK_UNORM_4x8;
164 break;
165 case ir_unop_unpack_half_2x16:
166 result = op_mask & LOWER_UNPACK_HALF_2x16;
167 break;
168 default:
169 result = LOWER_PACK_UNPACK_NONE;
170 break;
171 }
172
173 return static_cast<enum lower_packing_builtins_op>(result);
174 }
175
176 void
177    setup_factory(void *mem_ctx)
178 {
179 assert(factory.mem_ctx == NULL);
180 assert(factory.instructions->is_empty());
181
182 factory.mem_ctx = mem_ctx;
183 }
184
185 void
186    teardown_factory()
187 {
188 base_ir->insert_before(factory.instructions);
189 assert(factory.instructions->is_empty());
190 factory.mem_ctx = NULL;
191 }
192
193 template <typename T>
194 ir_constant*
195    constant(T x)
196 {
197 return factory.constant(x);
198 }
199
200 /**
201 * \brief Pack two uint16's into a single uint32.
202 *
203 * Interpret the given uvec2 as a uint16 pair. Pack the pair into a uint32
204 * where the least significant bits specify the first element of the pair.
205 * Return the uint32.
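 *
 * For example, uvec2(0x1234u, 0xabcdu) packs to 0xabcd1234u.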
206 */
207 ir_rvalue*
208    pack_uvec2_to_uint(ir_rvalue *uvec2_rval)
209 {
210 assert(uvec2_rval->type == glsl_type::uvec2_type);
211
212 /* uvec2 u = UVEC2_RVAL; */
213 ir_variable *u = factory.make_temp(glsl_type::uvec2_type,
214 "tmp_pack_uvec2_to_uint");
215 factory.emit(assign(u, uvec2_rval));
216
217 if (op_mask & LOWER_PACK_USE_BFI) {
218 return bitfield_insert(bit_and(swizzle_x(u), constant(0xffffu)),
219 swizzle_y(u),
220 constant(16u),
221 constant(16u));
222 }
223
224 /* return (u.y << 16) | (u.x & 0xffff); */
225 return bit_or(lshift(swizzle_y(u), constant(16u)),
226 bit_and(swizzle_x(u), constant(0xffffu)));
227 }
228
229 /**
230 * \brief Pack four uint8's into a single uint32.
231 *
232  * Interpret the given uvec4 as a uint8 4-tuple. Pack the 4-tuple into a
233 * uint32 where the least significant bits specify the first element of the
234 * 4-tuple. Return the uint32.
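 *
 * For example, uvec4(0x11u, 0x22u, 0x33u, 0x44u) packs to 0x44332211u.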
235 */
236 ir_rvalue*
237    pack_uvec4_to_uint(ir_rvalue *uvec4_rval)
238 {
239 assert(uvec4_rval->type == glsl_type::uvec4_type);
240
241 ir_variable *u = factory.make_temp(glsl_type::uvec4_type,
242 "tmp_pack_uvec4_to_uint");
243
244 if (op_mask & LOWER_PACK_USE_BFI) {
245 /* uvec4 u = UVEC4_RVAL; */
246 factory.emit(assign(u, uvec4_rval));
247
248 return bitfield_insert(bitfield_insert(
249 bitfield_insert(
250 bit_and(swizzle_x(u), constant(0xffu)),
251 swizzle_y(u), constant(8u), constant(8u)),
252 swizzle_z(u), constant(16u), constant(8u)),
253 swizzle_w(u), constant(24u), constant(8u));
254 }
255
256 /* uvec4 u = UVEC4_RVAL & 0xff */
257 factory.emit(assign(u, bit_and(uvec4_rval, constant(0xffu))));
258
259 /* return (u.w << 24) | (u.z << 16) | (u.y << 8) | u.x; */
260 return bit_or(bit_or(lshift(swizzle_w(u), constant(24u)),
261 lshift(swizzle_z(u), constant(16u))),
262 bit_or(lshift(swizzle_y(u), constant(8u)),
263 swizzle_x(u)));
264 }
265
266 /**
267 * \brief Unpack a uint32 into two uint16's.
268 *
269 * Interpret the given uint32 as a uint16 pair where the uint32's least
270 * significant bits specify the pair's first element. Return the uint16
271 * pair as a uvec2.
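 *
 * For example, 0xabcd1234u unpacks to uvec2(0x1234u, 0xabcdu).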
272 */
273 ir_rvalue*
274    unpack_uint_to_uvec2(ir_rvalue *uint_rval)
275 {
276 assert(uint_rval->type == glsl_type::uint_type);
277
278 /* uint u = UINT_RVAL; */
279 ir_variable *u = factory.make_temp(glsl_type::uint_type,
280 "tmp_unpack_uint_to_uvec2_u");
281 factory.emit(assign(u, uint_rval));
282
283 /* uvec2 u2; */
284 ir_variable *u2 = factory.make_temp(glsl_type::uvec2_type,
285 "tmp_unpack_uint_to_uvec2_u2");
286
287 /* u2.x = u & 0xffffu; */
288 factory.emit(assign(u2, bit_and(u, constant(0xffffu)), WRITEMASK_X));
289
290 /* u2.y = u >> 16u; */
291 factory.emit(assign(u2, rshift(u, constant(16u)), WRITEMASK_Y));
292
293 return deref(u2).val;
294 }
295
296 /**
297 * \brief Unpack a uint32 into two int16's.
298 *
299 * Specifically each 16-bit value is sign-extended to the full width of an
300 * int32 on return.
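 *
 * For example, a low half of 0x8001u is returned as -32767, not 32769;
 * bit 15 of each half is propagated into the upper bits of the int32.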
301 */
302 ir_rvalue *
303    unpack_uint_to_ivec2(ir_rvalue *uint_rval)
304 {
305 assert(uint_rval->type == glsl_type::uint_type);
306
307 if (!(op_mask & LOWER_PACK_USE_BFE)) {
308 return rshift(lshift(u2i(unpack_uint_to_uvec2(uint_rval)),
309 constant(16u)),
310 constant(16u));
311 }
312
313 ir_variable *i = factory.make_temp(glsl_type::int_type,
314 "tmp_unpack_uint_to_ivec2_i");
315 factory.emit(assign(i, u2i(uint_rval)));
316
317 /* ivec2 i2; */
318 ir_variable *i2 = factory.make_temp(glsl_type::ivec2_type,
319 "tmp_unpack_uint_to_ivec2_i2");
320
321 factory.emit(assign(i2, bitfield_extract(i, constant(0), constant(16)),
322 WRITEMASK_X));
323 factory.emit(assign(i2, bitfield_extract(i, constant(16), constant(16)),
324 WRITEMASK_Y));
325
326 return deref(i2).val;
327 }
328
329 /**
330 * \brief Unpack a uint32 into four uint8's.
331 *
332 * Interpret the given uint32 as a uint8 4-tuple where the uint32's least
333 * significant bits specify the 4-tuple's first element. Return the uint8
334 * 4-tuple as a uvec4.
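 *
 * For example, 0x44332211u unpacks to uvec4(0x11u, 0x22u, 0x33u, 0x44u).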
335 */
336 ir_rvalue*
337    unpack_uint_to_uvec4(ir_rvalue *uint_rval)
338 {
339 assert(uint_rval->type == glsl_type::uint_type);
340
341 /* uint u = UINT_RVAL; */
342 ir_variable *u = factory.make_temp(glsl_type::uint_type,
343 "tmp_unpack_uint_to_uvec4_u");
344 factory.emit(assign(u, uint_rval));
345
346 /* uvec4 u4; */
347 ir_variable *u4 = factory.make_temp(glsl_type::uvec4_type,
348 "tmp_unpack_uint_to_uvec4_u4");
349
350 /* u4.x = u & 0xffu; */
351 factory.emit(assign(u4, bit_and(u, constant(0xffu)), WRITEMASK_X));
352
353 if (op_mask & LOWER_PACK_USE_BFE) {
354 /* u4.y = bitfield_extract(u, 8, 8); */
355 factory.emit(assign(u4, bitfield_extract(u, constant(8u), constant(8u)),
356 WRITEMASK_Y));
357
358 /* u4.z = bitfield_extract(u, 16, 8); */
359 factory.emit(assign(u4, bitfield_extract(u, constant(16u), constant(8u)),
360 WRITEMASK_Z));
361 } else {
362 /* u4.y = (u >> 8u) & 0xffu; */
363 factory.emit(assign(u4, bit_and(rshift(u, constant(8u)),
364 constant(0xffu)), WRITEMASK_Y));
365
366 /* u4.z = (u >> 16u) & 0xffu; */
367 factory.emit(assign(u4, bit_and(rshift(u, constant(16u)),
368 constant(0xffu)), WRITEMASK_Z));
369 }
370
371 /* u4.w = (u >> 24u) */
372 factory.emit(assign(u4, rshift(u, constant(24u)), WRITEMASK_W));
373
374 return deref(u4).val;
375 }
376
377 /**
378 * \brief Unpack a uint32 into four int8's.
379 *
380 * Specifically each 8-bit value is sign-extended to the full width of an
381 * int32 on return.
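 *
 * For example, a byte of 0xffu is returned as -1 and a byte of 0x80u as
 * -128; bit 7 of each byte is propagated into the upper bits of the int32.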
382 */
383 ir_rvalue *
384    unpack_uint_to_ivec4(ir_rvalue *uint_rval)
385 {
386 assert(uint_rval->type == glsl_type::uint_type);
387
388 if (!(op_mask & LOWER_PACK_USE_BFE)) {
389 return rshift(lshift(u2i(unpack_uint_to_uvec4(uint_rval)),
390 constant(24u)),
391 constant(24u));
392 }
393
394 ir_variable *i = factory.make_temp(glsl_type::int_type,
395 "tmp_unpack_uint_to_ivec4_i");
396 factory.emit(assign(i, u2i(uint_rval)));
397
398 /* ivec4 i4; */
399 ir_variable *i4 = factory.make_temp(glsl_type::ivec4_type,
400 "tmp_unpack_uint_to_ivec4_i4");
401
402 factory.emit(assign(i4, bitfield_extract(i, constant(0), constant(8)),
403 WRITEMASK_X));
404 factory.emit(assign(i4, bitfield_extract(i, constant(8), constant(8)),
405 WRITEMASK_Y));
406 factory.emit(assign(i4, bitfield_extract(i, constant(16), constant(8)),
407 WRITEMASK_Z));
408 factory.emit(assign(i4, bitfield_extract(i, constant(24), constant(8)),
409 WRITEMASK_W));
410
411 return deref(i4).val;
412 }
413
414 /**
415 * \brief Lower a packSnorm2x16 expression.
416 *
417 * \param vec2_rval is packSnorm2x16's input
418 * \return packSnorm2x16's output as a uint rvalue
419 */
420 ir_rvalue*
421    lower_pack_snorm_2x16(ir_rvalue *vec2_rval)
422 {
423 /* From page 88 (94 of pdf) of the GLSL ES 3.00 spec:
424 *
425 * highp uint packSnorm2x16(vec2 v)
426 * --------------------------------
427 * First, converts each component of the normalized floating-point value
428 * v into 16-bit integer values. Then, the results are packed into the
429 * returned 32-bit unsigned integer.
430 *
431 * The conversion for component c of v to fixed point is done as
432 * follows:
433 *
434 * packSnorm2x16: round(clamp(c, -1, +1) * 32767.0)
435 *
436 * The first component of the vector will be written to the least
437 * significant bits of the output; the last component will be written to
438 * the most significant bits.
439 *
440 * This function generates IR that approximates the following pseudo-GLSL:
441 *
442 * return pack_uvec2_to_uint(
443 * uvec2(ivec2(
444 * round(clamp(VEC2_RVALUE, -1.0f, 1.0f) * 32767.0f))));
445 *
446 * It is necessary to first convert the vec2 to ivec2 rather than directly
447 * converting vec2 to uvec2 because the latter conversion is undefined.
448 * From page 56 (62 of pdf) of the GLSL ES 3.00 spec: "It is undefined to
449 * convert a negative floating point value to an uint".
450 */
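/* Worked example (not from the spec): packSnorm2x16(vec2(-1.0, 0.5)) yields
 * round(-1.0 * 32767.0) = -32767 -> 0x8001u in the low half and
 * round(0.5 * 32767.0) = 16384 -> 0x4000u in the high half, so the packed
 * result is 0x40008001u.
 */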
451 assert(vec2_rval->type == glsl_type::vec2_type);
452
453 ir_rvalue *result = pack_uvec2_to_uint(
454 i2u(f2i(round_even(mul(clamp(vec2_rval,
455 constant(-1.0f),
456 constant(1.0f)),
457 constant(32767.0f))))));
458
459 assert(result->type == glsl_type::uint_type);
460 return result;
461 }
462
463 /**
464 * \brief Lower a packSnorm4x8 expression.
465 *
466 * \param vec4_rval is packSnorm4x8's input
467 * \return packSnorm4x8's output as a uint rvalue
468 */
469 ir_rvalue*
470    lower_pack_snorm_4x8(ir_rvalue *vec4_rval)
471 {
472 /* From page 137 (143 of pdf) of the GLSL 4.30 spec:
473 *
474 * highp uint packSnorm4x8(vec4 v)
475 * -------------------------------
476 * First, converts each component of the normalized floating-point value
477 * v into 8-bit integer values. Then, the results are packed into the
478 * returned 32-bit unsigned integer.
479 *
480 * The conversion for component c of v to fixed point is done as
481 * follows:
482 *
483 * packSnorm4x8: round(clamp(c, -1, +1) * 127.0)
484 *
485 * The first component of the vector will be written to the least
486 * significant bits of the output; the last component will be written to
487 * the most significant bits.
488 *
489 * This function generates IR that approximates the following pseudo-GLSL:
490 *
491 * return pack_uvec4_to_uint(
492 * uvec4(ivec4(
493 * round(clamp(VEC4_RVALUE, -1.0f, 1.0f) * 127.0f))));
494 *
495 * It is necessary to first convert the vec4 to ivec4 rather than directly
496 * converting vec4 to uvec4 because the latter conversion is undefined.
497 * From page 87 (93 of pdf) of the GLSL 4.30 spec: "It is undefined to
498 * convert a negative floating point value to an uint".
499 */
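/* Worked example (not from the spec): packSnorm4x8(vec4(-1.0, 0.0, 0.5, 1.0))
 * yields the bytes 0x81u, 0x00u, 0x40u and 0x7fu from round(c * 127.0),
 * packed from least to most significant byte as 0x7f400081u.
 */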
500 assert(vec4_rval->type == glsl_type::vec4_type);
501
502 ir_rvalue *result = pack_uvec4_to_uint(
503 i2u(f2i(round_even(mul(clamp(vec4_rval,
504 constant(-1.0f),
505 constant(1.0f)),
506 constant(127.0f))))));
507
508 assert(result->type == glsl_type::uint_type);
509 return result;
510 }
511
512 /**
513 * \brief Lower an unpackSnorm2x16 expression.
514 *
515 * \param uint_rval is unpackSnorm2x16's input
516 * \return unpackSnorm2x16's output as a vec2 rvalue
517 */
518 ir_rvalue*
519    lower_unpack_snorm_2x16(ir_rvalue *uint_rval)
520 {
521 /* From page 88 (94 of pdf) of the GLSL ES 3.00 spec:
522 *
523 * highp vec2 unpackSnorm2x16 (highp uint p)
524 * -----------------------------------------
525 * First, unpacks a single 32-bit unsigned integer p into a pair of
526 * 16-bit unsigned integers. Then, each component is converted to
527 * a normalized floating-point value to generate the returned
528 * two-component vector.
529 *
530 * The conversion for unpacked fixed-point value f to floating point is
531 * done as follows:
532 *
533 * unpackSnorm2x16: clamp(f / 32767.0, -1,+1)
534 *
535 * The first component of the returned vector will be extracted from the
536 * least significant bits of the input; the last component will be
537 * extracted from the most significant bits.
538 *
539 * This function generates IR that approximates the following pseudo-GLSL:
540 *
541 * return clamp(
542 * ((ivec2(unpack_uint_to_uvec2(UINT_RVALUE)) << 16) >> 16) / 32767.0f,
543 * -1.0f, 1.0f);
544 *
545 * The above IR may appear unnecessarily complex, but the intermediate
546 * conversion to ivec2 and the bit shifts are necessary to correctly unpack
547 * negative floats.
548 *
549 * To see why, consider packing and then unpacking vec2(-1.0, 0.0).
550 * packSnorm2x16 encodes -1.0 as the int16 0xffff. During unpacking, we
551 * place that int16 into an int32, which results in the *positive* integer
552 * 0x0000ffff. The int16's sign bit becomes, in the int32, the rather
553 * unimportant bit 16. We must now extend the int16's sign bit into bits
554 * 17-32, which is accomplished by left-shifting then right-shifting.
555 */
556
557 assert(uint_rval->type == glsl_type::uint_type);
558
559 ir_rvalue *result =
560 clamp(div(i2f(unpack_uint_to_ivec2(uint_rval)),
561 constant(32767.0f)),
562 constant(-1.0f),
563 constant(1.0f));
564
565 assert(result->type == glsl_type::vec2_type);
566 return result;
567 }
568
569 /**
570 * \brief Lower an unpackSnorm4x8 expression.
571 *
572 * \param uint_rval is unpackSnorm4x8's input
573 * \return unpackSnorm4x8's output as a vec4 rvalue
574 */
575 ir_rvalue*
576    lower_unpack_snorm_4x8(ir_rvalue *uint_rval)
577 {
578 /* From page 137 (143 of pdf) of the GLSL 4.30 spec:
579 *
580 * highp vec4 unpackSnorm4x8 (highp uint p)
581 * ----------------------------------------
582 * First, unpacks a single 32-bit unsigned integer p into four
583 * 8-bit unsigned integers. Then, each component is converted to
584 * a normalized floating-point value to generate the returned
585 * four-component vector.
586 *
587 * The conversion for unpacked fixed-point value f to floating point is
588 * done as follows:
589 *
590 * unpackSnorm4x8: clamp(f / 127.0, -1, +1)
591 *
592 * The first component of the returned vector will be extracted from the
593 * least significant bits of the input; the last component will be
594 * extracted from the most significant bits.
595 *
596 * This function generates IR that approximates the following pseudo-GLSL:
597 *
598 * return clamp(
599 * ((ivec4(unpack_uint_to_uvec4(UINT_RVALUE)) << 24) >> 24) / 127.0f,
600 * -1.0f, 1.0f);
601 *
602 * The above IR may appear unnecessarily complex, but the intermediate
603 * conversion to ivec4 and the bit shifts are necessary to correctly unpack
604 * negative floats.
605 *
606 * To see why, consider packing and then unpacking vec4(-1.0, 0.0, 0.0,
607 * 0.0). packSnorm4x8 encodes -1.0 as the int8 0xff. During unpacking, we
608 * place that int8 into an int32, which results in the *positive* integer
609 * 0x000000ff. The int8's sign bit becomes, in the int32, the rather
610 * unimportant bit 8. We must now extend the int8's sign bit into bits
611 * 9-32, which is accomplished by left-shifting then right-shifting.
612 */
613
614 assert(uint_rval->type == glsl_type::uint_type);
615
616 ir_rvalue *result =
617 clamp(div(i2f(unpack_uint_to_ivec4(uint_rval)),
618 constant(127.0f)),
619 constant(-1.0f),
620 constant(1.0f));
621
622 assert(result->type == glsl_type::vec4_type);
623 return result;
624 }
625
626 /**
627 * \brief Lower a packUnorm2x16 expression.
628 *
629 * \param vec2_rval is packUnorm2x16's input
630 * \return packUnorm2x16's output as a uint rvalue
631 */
632 ir_rvalue*
633    lower_pack_unorm_2x16(ir_rvalue *vec2_rval)
634 {
635 /* From page 88 (94 of pdf) of the GLSL ES 3.00 spec:
636 *
637 * highp uint packUnorm2x16 (vec2 v)
638 * ---------------------------------
639 * First, converts each component of the normalized floating-point value
640 * v into 16-bit integer values. Then, the results are packed into the
641 * returned 32-bit unsigned integer.
642 *
643 * The conversion for component c of v to fixed point is done as
644 * follows:
645 *
646 * packUnorm2x16: round(clamp(c, 0, +1) * 65535.0)
647 *
648 * The first component of the vector will be written to the least
649 * significant bits of the output; the last component will be written to
650 * the most significant bits.
651 *
652 * This function generates IR that approximates the following pseudo-GLSL:
653 *
654 * return pack_uvec2_to_uint(uvec2(
655 * round(clamp(VEC2_RVALUE, 0.0f, 1.0f) * 65535.0f)));
656 *
657 * Here it is safe to directly convert the vec2 to uvec2 because the vec2
658 * has been clamped to a non-negative range.
659 */
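/* Worked example (not from the spec): packUnorm2x16(vec2(0.5, 1.0)) yields
 * round(0.5 * 65535.0) = 32768 -> 0x8000u in the low half and 65535 ->
 * 0xffffu in the high half, i.e. 0xffff8000u.
 */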
660
661 assert(vec2_rval->type == glsl_type::vec2_type);
662
663 ir_rvalue *result = pack_uvec2_to_uint(
664 f2u(round_even(mul(saturate(vec2_rval), constant(65535.0f)))));
665
666 assert(result->type == glsl_type::uint_type);
667 return result;
668 }
669
670 /**
671 * \brief Lower a packUnorm4x8 expression.
672 *
673 * \param vec4_rval is packUnorm4x8's input
674 * \return packUnorm4x8's output as a uint rvalue
675 */
676 ir_rvalue*
677    lower_pack_unorm_4x8(ir_rvalue *vec4_rval)
678 {
679 /* From page 137 (143 of pdf) of the GLSL 4.30 spec:
680 *
681 * highp uint packUnorm4x8 (vec4 v)
682 * --------------------------------
683 * First, converts each component of the normalized floating-point value
684 * v into 8-bit integer values. Then, the results are packed into the
685 * returned 32-bit unsigned integer.
686 *
687 * The conversion for component c of v to fixed point is done as
688 * follows:
689 *
690 * packUnorm4x8: round(clamp(c, 0, +1) * 255.0)
691 *
692 * The first component of the vector will be written to the least
693 * significant bits of the output; the last component will be written to
694 * the most significant bits.
695 *
696 * This function generates IR that approximates the following pseudo-GLSL:
697 *
698 * return pack_uvec4_to_uint(uvec4(
699     *      round(clamp(VEC4_RVALUE, 0.0f, 1.0f) * 255.0f)));
700 *
701 * Here it is safe to directly convert the vec4 to uvec4 because the vec4
702 * has been clamped to a non-negative range.
703 */
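/* Worked example (not from the spec): packUnorm4x8(vec4(1.0, 0.0, 0.5, 1.0))
 * yields the bytes 0xffu, 0x00u, 0x80u and 0xffu, packed from least to most
 * significant byte as 0xff8000ffu.
 */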
704
705 assert(vec4_rval->type == glsl_type::vec4_type);
706
707 ir_rvalue *result = pack_uvec4_to_uint(
708 f2u(round_even(mul(saturate(vec4_rval), constant(255.0f)))));
709
710 assert(result->type == glsl_type::uint_type);
711 return result;
712 }
713
714 /**
715 * \brief Lower an unpackUnorm2x16 expression.
716 *
717 * \param uint_rval is unpackUnorm2x16's input
718 * \return unpackUnorm2x16's output as a vec2 rvalue
719 */
720 ir_rvalue*
721    lower_unpack_unorm_2x16(ir_rvalue *uint_rval)
722 {
723 /* From page 89 (95 of pdf) of the GLSL ES 3.00 spec:
724 *
725 * highp vec2 unpackUnorm2x16 (highp uint p)
726 * -----------------------------------------
727 * First, unpacks a single 32-bit unsigned integer p into a pair of
728 * 16-bit unsigned integers. Then, each component is converted to
729 * a normalized floating-point value to generate the returned
730 * two-component vector.
731 *
732 * The conversion for unpacked fixed-point value f to floating point is
733 * done as follows:
734 *
735 * unpackUnorm2x16: f / 65535.0
736 *
737 * The first component of the returned vector will be extracted from the
738 * least significant bits of the input; the last component will be
739 * extracted from the most significant bits.
740 *
741 * This function generates IR that approximates the following pseudo-GLSL:
742 *
743 * return vec2(unpack_uint_to_uvec2(UINT_RVALUE)) / 65535.0;
744 */
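/* Worked example (not from the spec): unpackUnorm2x16(0xffff8000u) yields
 * vec2(32768.0 / 65535.0, 1.0), i.e. approximately vec2(0.50001, 1.0).
 */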
745
746 assert(uint_rval->type == glsl_type::uint_type);
747
748 ir_rvalue *result = div(u2f(unpack_uint_to_uvec2(uint_rval)),
749 constant(65535.0f));
750
751 assert(result->type == glsl_type::vec2_type);
752 return result;
753 }
754
755 /**
756 * \brief Lower an unpackUnorm4x8 expression.
757 *
758 * \param uint_rval is unpackUnorm4x8's input
759 * \return unpackUnorm4x8's output as a vec4 rvalue
760 */
761 ir_rvalue*
762    lower_unpack_unorm_4x8(ir_rvalue *uint_rval)
763 {
764 /* From page 137 (143 of pdf) of the GLSL 4.30 spec:
765 *
766 * highp vec4 unpackUnorm4x8 (highp uint p)
767 * ----------------------------------------
768 * First, unpacks a single 32-bit unsigned integer p into four
769 * 8-bit unsigned integers. Then, each component is converted to
770 * a normalized floating-point value to generate the returned
771     *    four-component vector.
772 *
773 * The conversion for unpacked fixed-point value f to floating point is
774 * done as follows:
775 *
776 * unpackUnorm4x8: f / 255.0
777 *
778 * The first component of the returned vector will be extracted from the
779 * least significant bits of the input; the last component will be
780 * extracted from the most significant bits.
781 *
782 * This function generates IR that approximates the following pseudo-GLSL:
783 *
784 * return vec4(unpack_uint_to_uvec4(UINT_RVALUE)) / 255.0;
785 */
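/* Worked example (not from the spec): unpackUnorm4x8(0xff0000ffu) yields
 * vec4(1.0, 0.0, 0.0, 1.0).
 */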
786
787 assert(uint_rval->type == glsl_type::uint_type);
788
789 ir_rvalue *result = div(u2f(unpack_uint_to_uvec4(uint_rval)),
790 constant(255.0f));
791
792 assert(result->type == glsl_type::vec4_type);
793 return result;
794 }
795
796 /**
797 * \brief Lower the component-wise calculation of packHalf2x16.
798 *
799  * \param f_rval is one component of packHalf2x16's input
800 * \param e_rval is the unshifted exponent bits of f_rval
801 * \param m_rval is the unshifted mantissa bits of f_rval
802 *
803 * \return a uint rvalue that encodes a float16 in its lower 16 bits
804 */
805 ir_rvalue*
806    pack_half_1x16_nosign(ir_rvalue *f_rval,
807 ir_rvalue *e_rval,
808 ir_rvalue *m_rval)
809 {
810 assert(e_rval->type == glsl_type::uint_type);
811 assert(m_rval->type == glsl_type::uint_type);
812
813 /* uint u16; */
814 ir_variable *u16 = factory.make_temp(glsl_type::uint_type,
815 "tmp_pack_half_1x16_u16");
816
817 /* float f = FLOAT_RVAL; */
818 ir_variable *f = factory.make_temp(glsl_type::float_type,
819 "tmp_pack_half_1x16_f");
820 factory.emit(assign(f, f_rval));
821
822 /* uint e = E_RVAL; */
823 ir_variable *e = factory.make_temp(glsl_type::uint_type,
824 "tmp_pack_half_1x16_e");
825 factory.emit(assign(e, e_rval));
826
827 /* uint m = M_RVAL; */
828 ir_variable *m = factory.make_temp(glsl_type::uint_type,
829 "tmp_pack_half_1x16_m");
830 factory.emit(assign(m, m_rval));
831
832 /* Preliminaries
833 * -------------
834 *
835 * For a float16, the bit layout is:
836 *
837 * sign: 15
838 * exponent: 10:14
839 * mantissa: 0:9
840 *
841 * Let f16 be a float16 value. The sign, exponent, and mantissa
842 * determine its value thus:
843 *
844 * if e16 = 0 and m16 = 0, then zero: (-1)^s16 * 0 (1)
845     *   if e16 = 0 and m16 != 0, then subnormal:  (-1)^s16 * 2^(e16 - 14) * (m16 / 2^10)   (2)
846 * if 0 < e16 < 31, then normal: (-1)^s16 * 2^(e16 - 15) * (1 + m16 / 2^10) (3)
847 * if e16 = 31 and m16 = 0, then infinite: (-1)^s16 * inf (4)
848 * if e16 = 31 and m16 != 0, then NaN (5)
849 *
850 * where 0 <= m16 < 2^10.
851 *
852 * For a float32, the bit layout is:
853 *
854 * sign: 31
855 * exponent: 23:30
856 * mantissa: 0:22
857 *
858 * Let f32 be a float32 value. The sign, exponent, and mantissa
859 * determine its value thus:
860 *
861 * if e32 = 0 and m32 = 0, then zero: (-1)^s * 0 (10)
862 * if e32 = 0 and m32 != 0, then subnormal: (-1)^s * 2^(e32 - 126) * (m32 / 2^23) (11)
863 * if 0 < e32 < 255, then normal: (-1)^s * 2^(e32 - 127) * (1 + m32 / 2^23) (12)
864 * if e32 = 255 and m32 = 0, then infinite: (-1)^s * inf (13)
865 * if e32 = 255 and m32 != 0, then NaN (14)
866 *
867 * where 0 <= m32 < 2^23.
868 *
869 * The minimum and maximum normal float16 values are
870 *
871 * min_norm16 = 2^(1 - 15) * (1 + 0 / 2^10) = 2^(-14) (20)
872 * max_norm16 = 2^(30 - 15) * (1 + 1023 / 2^10) (21)
873 *
874 * The step at max_norm16 is
875 *
876 * max_step16 = 2^5 (22)
877 *
878 * Observe that the float16 boundary values in equations 20-21 lie in the
879 * range of normal float32 values.
880 *
881 *
882 * Rounding Behavior
883 * -----------------
884 * Not all float32 values can be exactly represented as a float16. We
885 * round all such intermediate float32 values to the nearest float16; if
886     * the float32 is exactly between two float16 values, we round to the one
887 * with an even mantissa. This rounding behavior has several benefits:
888 *
889 * - It has no sign bias.
890 *
891 * - It reproduces the behavior of real hardware: opcode F32TO16 in Intel's
892 * GPU ISA.
893 *
894 * - By reproducing the behavior of the GPU (at least on Intel hardware),
895 * compile-time evaluation of constant packHalf2x16 GLSL expressions will
896 * result in the same value as if the expression were executed on the
897 * GPU.
898 *
899 * Calculation
900 * -----------
901 * Our task is to compute s16, e16, m16 given f32. Since this function
902 * ignores the sign bit, assume that s32 = s16 = 0. There are several
903     * cases to consider.
904 */
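/* Worked example (not part of the original comment): for f32 = 1.0f the bits
 * are 0x3f800000u, so e = 127u << 23u and m = 0u. Since 113 <= 127 < 143,
 * case 3 below applies and u16 = ((e - (112u << 23u)) >> 13u) + 0
 * = 15u << 10u = 0x3c00u, the float16 encoding of 1.0.
 */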
905
906 factory.emit(
907
908 /* Case 1) f32 is NaN
909 *
910 * The resultant f16 will also be NaN.
911 */
912
913 /* if (e32 == 255 && m32 != 0) { */
914 if_tree(logic_and(equal(e, constant(0xffu << 23u)),
915 logic_not(equal(m, constant(0u)))),
916
917 assign(u16, constant(0x7fffu)),
918
919 /* Case 2) f32 lies in the range [0, min_norm16).
920 *
921 * The resultant float16 will be either zero, subnormal, or normal.
922 *
923 * Solving
924 *
925 * f32 = min_norm16 (30)
926 *
927 * gives
928 *
929 * e32 = 113 and m32 = 0 (31)
930 *
931 * Therefore this case occurs if and only if
932 *
933 * e32 < 113 (32)
934 */
935
936 /* } else if (e32 < 113) { */
937 if_tree(less(e, constant(113u << 23u)),
938
939 /* u16 = uint(round_to_even(abs(f32) * float(1u << 24u))); */
940 assign(u16, f2u(round_even(mul(expr(ir_unop_abs, f),
941 constant((float) (1 << 24)))))),
942
943 /* Case 3) f32 lies in the range
944 * [min_norm16, max_norm16 + max_step16).
945 *
946 * The resultant float16 will be either normal or infinite.
947 *
948 * Solving
949 *
950 * f32 = max_norm16 + max_step16 (40)
951 * = 2^15 * (1 + 1023 / 2^10) + 2^5 (41)
952 * = 2^16 (42)
953 * gives
954 *
955 * e32 = 143 and m32 = 0 (43)
956 *
957 * We already solved the boundary condition f32 = min_norm16 above
958 * in equation 31. Therefore this case occurs if and only if
959 *
960 * 113 <= e32 and e32 < 143
961 */
962
963 /* } else if (e32 < 143) { */
964 if_tree(less(e, constant(143u << 23u)),
965
966 /* The addition below handles the case where the mantissa rounds
967 * up to 1024 and bumps the exponent.
968 *
969 * u16 = ((e - (112u << 23u)) >> 13u)
970           *   u16 = ((e - (112u << 23u)) >> 13u)
              *       + round_to_even(float(m) / float(1u << 13u));
971 */
972 assign(u16, add(rshift(sub(e, constant(112u << 23u)),
973 constant(13u)),
974 f2u(round_even(
975 div(u2f(m), constant((float) (1 << 13))))))),
976
977 /* Case 4) f32 lies in the range [max_norm16 + max_step16, inf].
978 *
979 * The resultant float16 will be infinite.
980 *
981 * The cases above caught all float32 values in the range
982 * [0, max_norm16 + max_step16), so this is the fall-through case.
983 */
984
985 /* } else { */
986
987 assign(u16, constant(31u << 10u))))));
988
989 /* } */
990
991 return deref(u16).val;
992 }
993
994 /**
995 * \brief Lower a packHalf2x16 expression.
996 *
997 * \param vec2_rval is packHalf2x16's input
998 * \return packHalf2x16's output as a uint rvalue
999 */
1000 ir_rvalue*
1001    lower_pack_half_2x16(ir_rvalue *vec2_rval)
1002 {
1003 /* From page 89 (95 of pdf) of the GLSL ES 3.00 spec:
1004 *
1005 * highp uint packHalf2x16 (mediump vec2 v)
1006 * ----------------------------------------
1007 * Returns an unsigned integer obtained by converting the components of
1008 * a two-component floating-point vector to the 16-bit floating-point
1009 * representation found in the OpenGL ES Specification, and then packing
1010 * these two 16-bit integers into a 32-bit unsigned integer.
1011 *
1012     *    The first vector component specifies the 16 least-significant bits
1013 * of the result; the second component specifies the 16 most-significant
1014 * bits.
1015 */
1016
1017 assert(vec2_rval->type == glsl_type::vec2_type);
1018
1019 /* vec2 f = VEC2_RVAL; */
1020 ir_variable *f = factory.make_temp(glsl_type::vec2_type,
1021 "tmp_pack_half_2x16_f");
1022 factory.emit(assign(f, vec2_rval));
1023
1024 /* uvec2 f32 = bitcast_f2u(f); */
1025 ir_variable *f32 = factory.make_temp(glsl_type::uvec2_type,
1026 "tmp_pack_half_2x16_f32");
1027 factory.emit(assign(f32, expr(ir_unop_bitcast_f2u, f)));
1028
1029 /* uvec2 f16; */
1030 ir_variable *f16 = factory.make_temp(glsl_type::uvec2_type,
1031 "tmp_pack_half_2x16_f16");
1032
1033 /* Get f32's unshifted exponent bits.
1034 *
1035 * uvec2 e = f32 & 0x7f800000u;
1036 */
1037 ir_variable *e = factory.make_temp(glsl_type::uvec2_type,
1038 "tmp_pack_half_2x16_e");
1039 factory.emit(assign(e, bit_and(f32, constant(0x7f800000u))));
1040
1041 /* Get f32's unshifted mantissa bits.
1042 *
1043 * uvec2 m = f32 & 0x007fffffu;
1044 */
1045 ir_variable *m = factory.make_temp(glsl_type::uvec2_type,
1046 "tmp_pack_half_2x16_m");
1047 factory.emit(assign(m, bit_and(f32, constant(0x007fffffu))));
1048
1049 /* Set f16's exponent and mantissa bits.
1050 *
1051     *    f16.x = pack_half_1x16_nosign(f.x, e.x, m.x);
1052     *    f16.y = pack_half_1x16_nosign(f.y, e.y, m.y);
1053 */
1054 factory.emit(assign(f16, pack_half_1x16_nosign(swizzle_x(f),
1055 swizzle_x(e),
1056 swizzle_x(m)),
1057 WRITEMASK_X));
1058 factory.emit(assign(f16, pack_half_1x16_nosign(swizzle_y(f),
1059 swizzle_y(e),
1060 swizzle_y(m)),
1061 WRITEMASK_Y));
1062
1063 /* Set f16's sign bits.
1064 *
1065     *   f16 |= (f32 & (1u << 31u)) >> 16u;
1066 */
1067 factory.emit(
1068 assign(f16, bit_or(f16,
1069 rshift(bit_and(f32, constant(1u << 31u)),
1070 constant(16u)))));
1071
1072
1073 /* return (f16.y << 16u) | f16.x; */
1074 ir_rvalue *result = bit_or(lshift(swizzle_y(f16),
1075 constant(16u)),
1076 swizzle_x(f16));
1077
1078 assert(result->type == glsl_type::uint_type);
1079 return result;
1080 }
1081
1082 /**
1083 * \brief Lower the component-wise calculation of unpackHalf2x16.
1084 *
1085 * Given a uint that encodes a float16 in its lower 16 bits, this function
1086 * returns a uint that encodes a float32 with the same value. The sign bit
1087 * of the float16 is ignored.
1088 *
1089 * \param e_rval is the unshifted exponent bits of a float16
1090 * \param m_rval is the unshifted mantissa bits of a float16
1091  * \return a uint rvalue that encodes a float32
1092 */
1093 ir_rvalue*
1094    unpack_half_1x16_nosign(ir_rvalue *e_rval, ir_rvalue *m_rval)
1095 {
1096 assert(e_rval->type == glsl_type::uint_type);
1097 assert(m_rval->type == glsl_type::uint_type);
1098
1099 /* uint u32; */
1100 ir_variable *u32 = factory.make_temp(glsl_type::uint_type,
1101 "tmp_unpack_half_1x16_u32");
1102
1103 /* uint e = E_RVAL; */
1104 ir_variable *e = factory.make_temp(glsl_type::uint_type,
1105 "tmp_unpack_half_1x16_e");
1106 factory.emit(assign(e, e_rval));
1107
1108 /* uint m = M_RVAL; */
1109 ir_variable *m = factory.make_temp(glsl_type::uint_type,
1110 "tmp_unpack_half_1x16_m");
1111 factory.emit(assign(m, m_rval));
1112
1113 /* Preliminaries
1114 * -------------
1115 *
1116 * For a float16, the bit layout is:
1117 *
1118 * sign: 15
1119 * exponent: 10:14
1120 * mantissa: 0:9
1121 *
1122 * Let f16 be a float16 value. The sign, exponent, and mantissa
1123 * determine its value thus:
1124 *
1125 * if e16 = 0 and m16 = 0, then zero: (-1)^s16 * 0 (1)
1126     *   if e16 = 0 and m16 != 0, then subnormal:  (-1)^s16 * 2^(e16 - 14) * (m16 / 2^10)   (2)
1127 * if 0 < e16 < 31, then normal: (-1)^s16 * 2^(e16 - 15) * (1 + m16 / 2^10) (3)
1128 * if e16 = 31 and m16 = 0, then infinite: (-1)^s16 * inf (4)
1129 * if e16 = 31 and m16 != 0, then NaN (5)
1130 *
1131 * where 0 <= m16 < 2^10.
1132 *
1133 * For a float32, the bit layout is:
1134 *
1135 * sign: 31
1136 * exponent: 23:30
1137 * mantissa: 0:22
1138 *
1139 * Let f32 be a float32 value. The sign, exponent, and mantissa
1140 * determine its value thus:
1141 *
1142 * if e32 = 0 and m32 = 0, then zero: (-1)^s * 0 (10)
1143 * if e32 = 0 and m32 != 0, then subnormal: (-1)^s * 2^(e32 - 126) * (m32 / 2^23) (11)
1144 * if 0 < e32 < 255, then normal: (-1)^s * 2^(e32 - 127) * (1 + m32 / 2^23) (12)
1145 * if e32 = 255 and m32 = 0, then infinite: (-1)^s * inf (13)
1146 * if e32 = 255 and m32 != 0, then NaN (14)
1147 *
1148 * where 0 <= m32 < 2^23.
1149 *
1150 * Calculation
1151 * -----------
1152 * Our task is to compute s32, e32, m32 given f16. Since this function
1153 * ignores the sign bit, assume that s32 = s16 = 0. There are several
1154     * cases to consider.
1155 */
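/* Worked example (not part of the original comment): for f16 = 1.0 the bits
 * are 0x3c00u, so e = 15u << 10u and m = 0u. Case 2 below applies and
 * u32 = ((e + (112u << 10u)) | 0u) << 13u = 127u << 23u = 0x3f800000u,
 * the float32 encoding of 1.0.
 */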
1156
1157 factory.emit(
1158
1159 /* Case 1) f16 is zero or subnormal.
1160 *
1161        * The simplest method of calculating f32 in this case is
1162 *
1163 * f32 = f16 (20)
1164 * = 2^(-14) * (m16 / 2^10) (21)
1165        *        = m16 / 2^24                    (22)
1166 */
1167
1168 /* if (e16 == 0) { */
1169 if_tree(equal(e, constant(0u)),
1170
1171 /* u32 = bitcast_f2u(float(m) / float(1 << 24)); */
1172 assign(u32, expr(ir_unop_bitcast_f2u,
1173 div(u2f(m), constant((float)(1 << 24))))),
1174
1175 /* Case 2) f16 is normal.
1176 *
1177 * The equation
1178 *
1179 * f32 = f16 (30)
1180 * 2^(e32 - 127) * (1 + m32 / 2^23) = (31)
1181 * 2^(e16 - 15) * (1 + m16 / 2^10)
1182 *
1183 * can be decomposed into two
1184 *
1185 * 2^(e32 - 127) = 2^(e16 - 15) (32)
1186 * 1 + m32 / 2^23 = 1 + m16 / 2^10 (33)
1187 *
1188 * which solve to
1189 *
1190 * e32 = e16 + 112 (34)
1191 * m32 = m16 * 2^13 (35)
1192 */
1193
1194        /* } else if (e16 < 31) { */
1195 if_tree(less(e, constant(31u << 10u)),
1196
1197 /* u32 = ((e + (112 << 10)) | m) << 13;
1198 */
1199 assign(u32, lshift(bit_or(add(e, constant(112u << 10u)), m),
1200 constant(13u))),
1201
1202
1203 /* Case 3) f16 is infinite. */
1204 if_tree(equal(m, constant(0u)),
1205
1206 assign(u32, constant(255u << 23u)),
1207
1208 /* Case 4) f16 is NaN. */
1209 /* } else { */
1210
1211 assign(u32, constant(0x7fffffffu))))));
1212
1213 /* } */
1214
1215 return deref(u32).val;
1216 }
1217
1218 /**
1219 * \brief Lower an unpackHalf2x16 expression.
1220 *
1221 * \param uint_rval is unpackHalf2x16's input
1222 * \return unpackHalf2x16's output as a vec2 rvalue
1223 */
1224 ir_rvalue*
1225    lower_unpack_half_2x16(ir_rvalue *uint_rval)
1226 {
1227 /* From page 89 (95 of pdf) of the GLSL ES 3.00 spec:
1228 *
1229 * mediump vec2 unpackHalf2x16 (highp uint v)
1230 * ------------------------------------------
1231 * Returns a two-component floating-point vector with components
1232 * obtained by unpacking a 32-bit unsigned integer into a pair of 16-bit
1233 * values, interpreting those values as 16-bit floating-point numbers
1234 * according to the OpenGL ES Specification, and converting them to
1235 * 32-bit floating-point values.
1236 *
1237 * The first component of the vector is obtained from the
1238 * 16 least-significant bits of v; the second component is obtained
1239 * from the 16 most-significant bits of v.
1240 */
1241 assert(uint_rval->type == glsl_type::uint_type);
1242
1243 /* uint u = RVALUE;
1244      * uvec2 f16 = uvec2(u & 0xffffu, u >> 16u);
1245 */
1246 ir_variable *f16 = factory.make_temp(glsl_type::uvec2_type,
1247 "tmp_unpack_half_2x16_f16");
1248 factory.emit(assign(f16, unpack_uint_to_uvec2(uint_rval)));
1249
1250 /* uvec2 f32; */
1251 ir_variable *f32 = factory.make_temp(glsl_type::uvec2_type,
1252 "tmp_unpack_half_2x16_f32");
1253
1254 /* Get f16's unshifted exponent bits.
1255 *
1256 * uvec2 e = f16 & 0x7c00u;
1257 */
1258 ir_variable *e = factory.make_temp(glsl_type::uvec2_type,
1259 "tmp_unpack_half_2x16_e");
1260 factory.emit(assign(e, bit_and(f16, constant(0x7c00u))));
1261
1262 /* Get f16's unshifted mantissa bits.
1263 *
1264 * uvec2 m = f16 & 0x03ffu;
1265 */
1266 ir_variable *m = factory.make_temp(glsl_type::uvec2_type,
1267 "tmp_unpack_half_2x16_m");
1268 factory.emit(assign(m, bit_and(f16, constant(0x03ffu))));
1269
1270 /* Set f32's exponent and mantissa bits.
1271 *
1272 * f32.x = unpack_half_1x16_nosign(e.x, m.x);
1273 * f32.y = unpack_half_1x16_nosign(e.y, m.y);
1274 */
1275 factory.emit(assign(f32, unpack_half_1x16_nosign(swizzle_x(e),
1276 swizzle_x(m)),
1277 WRITEMASK_X));
1278 factory.emit(assign(f32, unpack_half_1x16_nosign(swizzle_y(e),
1279 swizzle_y(m)),
1280 WRITEMASK_Y));
1281
1282 /* Set f32's sign bit.
1283 *
1284 * f32 |= (f16 & 0x8000u) << 16u;
1285 */
1286 factory.emit(assign(f32, bit_or(f32,
1287 lshift(bit_and(f16,
1288 constant(0x8000u)),
1289 constant(16u)))));
1290
1291 /* return bitcast_u2f(f32); */
1292 ir_rvalue *result = expr(ir_unop_bitcast_u2f, f32);
1293 assert(result->type == glsl_type::vec2_type);
1294 return result;
1295 }
1296 };
1297
1298 } // namespace anonymous
1299
1300 /**
1301 * \brief Lower the builtin packing functions.
1302 *
1303 * \param op_mask is a bitmask of `enum lower_packing_builtins_op`.
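 *
 * For example (illustrative), a driver that only needs the snorm2x16
 * built-ins lowered might call:
 *
 *    lower_packing_builtins(instructions,
 *                           LOWER_PACK_SNORM_2x16 | LOWER_UNPACK_SNORM_2x16);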
1304 */
1305 bool
1306 lower_packing_builtins(exec_list *instructions, int op_mask)
1307 {
1308 lower_packing_builtins_visitor v(op_mask);
1309 visit_list_elements(&v, instructions, true);
1310 return v.get_progress();
1311 }
1312