1 /*
2 * Copyright © 2012 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21 * DEALINGS IN THE SOFTWARE.
22 */
23
24 #include "ir.h"
25 #include "ir_builder.h"
26 #include "ir_optimization.h"
27 #include "ir_rvalue_visitor.h"
28
/**
 * Bitmask selecting which pack/unpack built-ins are lowered, plus flags
 * choosing which hardware-style instructions the lowered IR may use.
 */
enum lower_packing_builtins_op {
   LOWER_PACK_UNPACK_NONE = 0x0000,

   LOWER_PACK_SNORM_2x16 = 0x0001,
   LOWER_UNPACK_SNORM_2x16 = 0x0002,

   LOWER_PACK_UNORM_2x16 = 0x0004,
   LOWER_UNPACK_UNORM_2x16 = 0x0008,

   LOWER_PACK_HALF_2x16 = 0x0010,
   LOWER_UNPACK_HALF_2x16 = 0x0020,

   LOWER_PACK_SNORM_4x8 = 0x0040,
   LOWER_UNPACK_SNORM_4x8 = 0x0080,

   LOWER_PACK_UNORM_4x8 = 0x0100,
   LOWER_UNPACK_UNORM_4x8 = 0x0200,

   /* Modifier flags, not operations: when set, the lowered IR uses
    * bitfield_insert / bitfield_extract instead of shift-and-mask
    * sequences (see pack_uvec2_to_uint and friends below).
    */
   LOWER_PACK_USE_BFI = 0x0400,
   LOWER_PACK_USE_BFE = 0x0800,
};
50
51 namespace {
52
53 using namespace ir_builder;
54
55 /**
56 * A visitor that lowers built-in floating-point pack/unpack expressions
57 * such packSnorm2x16.
58 */
59 class lower_packing_builtins_visitor : public ir_rvalue_visitor {
60 public:
61 /**
62 * \param op_mask is a bitmask of `enum lower_packing_builtins_op`
63 */
lower_packing_builtins_visitor(int op_mask)64 explicit lower_packing_builtins_visitor(int op_mask)
65 : op_mask(op_mask),
66 progress(false)
67 {
68 factory.instructions = &factory_instructions;
69 }
70
   virtual ~lower_packing_builtins_visitor()
   {
      /* Every generated instruction must already have been spliced into the
       * IR by teardown_factory(); otherwise instructions would leak. */
      assert(factory_instructions.is_empty());
   }
75
get_progress()76 bool get_progress() { return progress; }
77
   /**
    * If *rvalue is a pack/unpack expression selected by \ref op_mask,
    * replace it with equivalent lowered IR and record progress.
    */
   void handle_rvalue(ir_rvalue **rvalue)
   {
      if (!*rvalue)
         return;

      ir_expression *expr = (*rvalue)->as_expression();
      if (!expr)
         return;

      enum lower_packing_builtins_op lowering_op =
         choose_lowering_op(expr->operation);

      if (lowering_op == LOWER_PACK_UNPACK_NONE)
         return;

      setup_factory(ralloc_parent(expr));

      /* All pack/unpack built-ins are unary.  Reparent the operand onto the
       * factory's memory context, since the lowered IR replaces `expr`. */
      ir_rvalue *op0 = expr->operands[0];
      ralloc_steal(factory.mem_ctx, op0);

      switch (lowering_op) {
      case LOWER_PACK_SNORM_2x16:
         *rvalue = lower_pack_snorm_2x16(op0);
         break;
      case LOWER_PACK_SNORM_4x8:
         *rvalue = lower_pack_snorm_4x8(op0);
         break;
      case LOWER_PACK_UNORM_2x16:
         *rvalue = lower_pack_unorm_2x16(op0);
         break;
      case LOWER_PACK_UNORM_4x8:
         *rvalue = lower_pack_unorm_4x8(op0);
         break;
      case LOWER_PACK_HALF_2x16:
         *rvalue = lower_pack_half_2x16(op0);
         break;
      case LOWER_UNPACK_SNORM_2x16:
         *rvalue = lower_unpack_snorm_2x16(op0);
         break;
      case LOWER_UNPACK_SNORM_4x8:
         *rvalue = lower_unpack_snorm_4x8(op0);
         break;
      case LOWER_UNPACK_UNORM_2x16:
         *rvalue = lower_unpack_unorm_2x16(op0);
         break;
      case LOWER_UNPACK_UNORM_4x8:
         *rvalue = lower_unpack_unorm_4x8(op0);
         break;
      case LOWER_UNPACK_HALF_2x16:
         *rvalue = lower_unpack_half_2x16(op0);
         break;
      case LOWER_PACK_UNPACK_NONE:
      case LOWER_PACK_USE_BFI:
      case LOWER_PACK_USE_BFE:
         /* choose_lowering_op() never returns these values. */
         assert(!"not reached");
         break;
      }

      teardown_factory();
      progress = true;
   }
139
private:
   /** Bitmask of `enum lower_packing_builtins_op` selecting what to lower. */
   const int op_mask;
   /** Set once any expression has been lowered. */
   bool progress;
   /** Builder; emits into \ref factory_instructions. */
   ir_factory factory;
   exec_list factory_instructions;
145
146 /**
147 * Determine the needed lowering operation by filtering \a expr_op
148 * through \ref op_mask.
149 */
150 enum lower_packing_builtins_op
choose_lowering_op(ir_expression_operation expr_op)151 choose_lowering_op(ir_expression_operation expr_op)
152 {
153 /* C++ regards int and enum as fundamentally different types.
154 * So, we can't simply return from each case; we must cast the return
155 * value.
156 */
157 int result;
158
159 switch (expr_op) {
160 case ir_unop_pack_snorm_2x16:
161 result = op_mask & LOWER_PACK_SNORM_2x16;
162 break;
163 case ir_unop_pack_snorm_4x8:
164 result = op_mask & LOWER_PACK_SNORM_4x8;
165 break;
166 case ir_unop_pack_unorm_2x16:
167 result = op_mask & LOWER_PACK_UNORM_2x16;
168 break;
169 case ir_unop_pack_unorm_4x8:
170 result = op_mask & LOWER_PACK_UNORM_4x8;
171 break;
172 case ir_unop_pack_half_2x16:
173 result = op_mask & LOWER_PACK_HALF_2x16;
174 break;
175 case ir_unop_unpack_snorm_2x16:
176 result = op_mask & LOWER_UNPACK_SNORM_2x16;
177 break;
178 case ir_unop_unpack_snorm_4x8:
179 result = op_mask & LOWER_UNPACK_SNORM_4x8;
180 break;
181 case ir_unop_unpack_unorm_2x16:
182 result = op_mask & LOWER_UNPACK_UNORM_2x16;
183 break;
184 case ir_unop_unpack_unorm_4x8:
185 result = op_mask & LOWER_UNPACK_UNORM_4x8;
186 break;
187 case ir_unop_unpack_half_2x16:
188 result = op_mask & LOWER_UNPACK_HALF_2x16;
189 break;
190 default:
191 result = LOWER_PACK_UNPACK_NONE;
192 break;
193 }
194
195 return static_cast<enum lower_packing_builtins_op>(result);
196 }
197
   /**
    * Point the factory at \a mem_ctx before lowering one expression.
    * Must be balanced by a teardown_factory() call.
    */
   void
   setup_factory(void *mem_ctx)
   {
      /* A NULL mem_ctx and an empty instruction list indicate that the
       * previous lowering was fully torn down. */
      assert(factory.mem_ctx == NULL);
      assert(factory.instructions->is_empty());

      factory.mem_ctx = mem_ctx;
   }
206
   /**
    * Splice the generated instructions into the IR before the current
    * instruction, then reset the factory for the next expression.
    */
   void
   teardown_factory()
   {
      /* base_ir is the instruction currently being visited (provided by
       * ir_rvalue_visitor); insert_before() empties factory.instructions. */
      base_ir->insert_before(factory.instructions);
      assert(factory.instructions->is_empty());
      factory.mem_ctx = NULL;
   }
214
   /** Convenience wrapper: build an ir_constant on the factory's context. */
   template <typename T>
   ir_constant*
   constant(T x)
   {
      return factory.constant(x);
   }
221
222 /**
223 * \brief Pack two uint16's into a single uint32.
224 *
225 * Interpret the given uvec2 as a uint16 pair. Pack the pair into a uint32
226 * where the least significant bits specify the first element of the pair.
227 * Return the uint32.
228 */
229 ir_rvalue*
pack_uvec2_to_uint(ir_rvalue * uvec2_rval)230 pack_uvec2_to_uint(ir_rvalue *uvec2_rval)
231 {
232 assert(uvec2_rval->type == &glsl_type_builtin_uvec2);
233
234 /* uvec2 u = UVEC2_RVAL; */
235 ir_variable *u = factory.make_temp(&glsl_type_builtin_uvec2,
236 "tmp_pack_uvec2_to_uint");
237 factory.emit(assign(u, uvec2_rval));
238
239 if (op_mask & LOWER_PACK_USE_BFI) {
240 return bitfield_insert(bit_and(swizzle_x(u), constant(0xffffu)),
241 swizzle_y(u),
242 constant(16u),
243 constant(16u));
244 }
245
246 /* return (u.y << 16) | (u.x & 0xffff); */
247 return bit_or(lshift(swizzle_y(u), constant(16u)),
248 bit_and(swizzle_x(u), constant(0xffffu)));
249 }
250
251 /**
252 * \brief Pack four uint8's into a single uint32.
253 *
254 * Interpret the given uvec4 as a uint32 4-typle. Pack the 4-tuple into a
255 * uint32 where the least significant bits specify the first element of the
256 * 4-tuple. Return the uint32.
257 */
258 ir_rvalue*
pack_uvec4_to_uint(ir_rvalue * uvec4_rval)259 pack_uvec4_to_uint(ir_rvalue *uvec4_rval)
260 {
261 assert(uvec4_rval->type == &glsl_type_builtin_uvec4);
262
263 ir_variable *u = factory.make_temp(&glsl_type_builtin_uvec4,
264 "tmp_pack_uvec4_to_uint");
265
266 if (op_mask & LOWER_PACK_USE_BFI) {
267 /* uvec4 u = UVEC4_RVAL; */
268 factory.emit(assign(u, uvec4_rval));
269
270 return bitfield_insert(bitfield_insert(
271 bitfield_insert(
272 bit_and(swizzle_x(u), constant(0xffu)),
273 swizzle_y(u), constant(8u), constant(8u)),
274 swizzle_z(u), constant(16u), constant(8u)),
275 swizzle_w(u), constant(24u), constant(8u));
276 }
277
278 /* uvec4 u = UVEC4_RVAL & 0xff */
279 factory.emit(assign(u, bit_and(uvec4_rval, constant(0xffu))));
280
281 /* return (u.w << 24) | (u.z << 16) | (u.y << 8) | u.x; */
282 return bit_or(bit_or(lshift(swizzle_w(u), constant(24u)),
283 lshift(swizzle_z(u), constant(16u))),
284 bit_or(lshift(swizzle_y(u), constant(8u)),
285 swizzle_x(u)));
286 }
287
288 /**
289 * \brief Unpack a uint32 into two uint16's.
290 *
291 * Interpret the given uint32 as a uint16 pair where the uint32's least
292 * significant bits specify the pair's first element. Return the uint16
293 * pair as a uvec2.
294 */
295 ir_rvalue*
unpack_uint_to_uvec2(ir_rvalue * uint_rval)296 unpack_uint_to_uvec2(ir_rvalue *uint_rval)
297 {
298 assert(uint_rval->type == &glsl_type_builtin_uint);
299
300 /* uint u = UINT_RVAL; */
301 ir_variable *u = factory.make_temp(&glsl_type_builtin_uint,
302 "tmp_unpack_uint_to_uvec2_u");
303 factory.emit(assign(u, uint_rval));
304
305 /* uvec2 u2; */
306 ir_variable *u2 = factory.make_temp(&glsl_type_builtin_uvec2,
307 "tmp_unpack_uint_to_uvec2_u2");
308
309 /* u2.x = u & 0xffffu; */
310 factory.emit(assign(u2, bit_and(u, constant(0xffffu)), WRITEMASK_X));
311
312 /* u2.y = u >> 16u; */
313 factory.emit(assign(u2, rshift(u, constant(16u)), WRITEMASK_Y));
314
315 return deref(u2).val;
316 }
317
318 /**
319 * \brief Unpack a uint32 into two int16's.
320 *
321 * Specifically each 16-bit value is sign-extended to the full width of an
322 * int32 on return.
323 */
324 ir_rvalue *
unpack_uint_to_ivec2(ir_rvalue * uint_rval)325 unpack_uint_to_ivec2(ir_rvalue *uint_rval)
326 {
327 assert(uint_rval->type == &glsl_type_builtin_uint);
328
329 if (!(op_mask & LOWER_PACK_USE_BFE)) {
330 return rshift(lshift(u2i(unpack_uint_to_uvec2(uint_rval)),
331 constant(16u)),
332 constant(16u));
333 }
334
335 ir_variable *i = factory.make_temp(&glsl_type_builtin_int,
336 "tmp_unpack_uint_to_ivec2_i");
337 factory.emit(assign(i, u2i(uint_rval)));
338
339 /* ivec2 i2; */
340 ir_variable *i2 = factory.make_temp(&glsl_type_builtin_ivec2,
341 "tmp_unpack_uint_to_ivec2_i2");
342
343 factory.emit(assign(i2, bitfield_extract(i, constant(0), constant(16)),
344 WRITEMASK_X));
345 factory.emit(assign(i2, bitfield_extract(i, constant(16), constant(16)),
346 WRITEMASK_Y));
347
348 return deref(i2).val;
349 }
350
351 /**
352 * \brief Unpack a uint32 into four uint8's.
353 *
354 * Interpret the given uint32 as a uint8 4-tuple where the uint32's least
355 * significant bits specify the 4-tuple's first element. Return the uint8
356 * 4-tuple as a uvec4.
357 */
358 ir_rvalue*
unpack_uint_to_uvec4(ir_rvalue * uint_rval)359 unpack_uint_to_uvec4(ir_rvalue *uint_rval)
360 {
361 assert(uint_rval->type == &glsl_type_builtin_uint);
362
363 /* uint u = UINT_RVAL; */
364 ir_variable *u = factory.make_temp(&glsl_type_builtin_uint,
365 "tmp_unpack_uint_to_uvec4_u");
366 factory.emit(assign(u, uint_rval));
367
368 /* uvec4 u4; */
369 ir_variable *u4 = factory.make_temp(&glsl_type_builtin_uvec4,
370 "tmp_unpack_uint_to_uvec4_u4");
371
372 /* u4.x = u & 0xffu; */
373 factory.emit(assign(u4, bit_and(u, constant(0xffu)), WRITEMASK_X));
374
375 if (op_mask & LOWER_PACK_USE_BFE) {
376 /* u4.y = bitfield_extract(u, 8, 8); */
377 factory.emit(assign(u4, bitfield_extract(u, constant(8u), constant(8u)),
378 WRITEMASK_Y));
379
380 /* u4.z = bitfield_extract(u, 16, 8); */
381 factory.emit(assign(u4, bitfield_extract(u, constant(16u), constant(8u)),
382 WRITEMASK_Z));
383 } else {
384 /* u4.y = (u >> 8u) & 0xffu; */
385 factory.emit(assign(u4, bit_and(rshift(u, constant(8u)),
386 constant(0xffu)), WRITEMASK_Y));
387
388 /* u4.z = (u >> 16u) & 0xffu; */
389 factory.emit(assign(u4, bit_and(rshift(u, constant(16u)),
390 constant(0xffu)), WRITEMASK_Z));
391 }
392
393 /* u4.w = (u >> 24u) */
394 factory.emit(assign(u4, rshift(u, constant(24u)), WRITEMASK_W));
395
396 return deref(u4).val;
397 }
398
399 /**
400 * \brief Unpack a uint32 into four int8's.
401 *
402 * Specifically each 8-bit value is sign-extended to the full width of an
403 * int32 on return.
404 */
405 ir_rvalue *
unpack_uint_to_ivec4(ir_rvalue * uint_rval)406 unpack_uint_to_ivec4(ir_rvalue *uint_rval)
407 {
408 assert(uint_rval->type == &glsl_type_builtin_uint);
409
410 if (!(op_mask & LOWER_PACK_USE_BFE)) {
411 return rshift(lshift(u2i(unpack_uint_to_uvec4(uint_rval)),
412 constant(24u)),
413 constant(24u));
414 }
415
416 ir_variable *i = factory.make_temp(&glsl_type_builtin_int,
417 "tmp_unpack_uint_to_ivec4_i");
418 factory.emit(assign(i, u2i(uint_rval)));
419
420 /* ivec4 i4; */
421 ir_variable *i4 = factory.make_temp(&glsl_type_builtin_ivec4,
422 "tmp_unpack_uint_to_ivec4_i4");
423
424 factory.emit(assign(i4, bitfield_extract(i, constant(0), constant(8)),
425 WRITEMASK_X));
426 factory.emit(assign(i4, bitfield_extract(i, constant(8), constant(8)),
427 WRITEMASK_Y));
428 factory.emit(assign(i4, bitfield_extract(i, constant(16), constant(8)),
429 WRITEMASK_Z));
430 factory.emit(assign(i4, bitfield_extract(i, constant(24), constant(8)),
431 WRITEMASK_W));
432
433 return deref(i4).val;
434 }
435
436 /**
437 * \brief Lower a packSnorm2x16 expression.
438 *
439 * \param vec2_rval is packSnorm2x16's input
440 * \return packSnorm2x16's output as a uint rvalue
441 */
442 ir_rvalue*
lower_pack_snorm_2x16(ir_rvalue * vec2_rval)443 lower_pack_snorm_2x16(ir_rvalue *vec2_rval)
444 {
445 /* From page 88 (94 of pdf) of the GLSL ES 3.00 spec:
446 *
447 * highp uint packSnorm2x16(vec2 v)
448 * --------------------------------
449 * First, converts each component of the normalized floating-point value
450 * v into 16-bit integer values. Then, the results are packed into the
451 * returned 32-bit unsigned integer.
452 *
453 * The conversion for component c of v to fixed point is done as
454 * follows:
455 *
456 * packSnorm2x16: round(clamp(c, -1, +1) * 32767.0)
457 *
458 * The first component of the vector will be written to the least
459 * significant bits of the output; the last component will be written to
460 * the most significant bits.
461 *
462 * This function generates IR that approximates the following pseudo-GLSL:
463 *
464 * return pack_uvec2_to_uint(
465 * uvec2(ivec2(
466 * round(clamp(VEC2_RVALUE, -1.0f, 1.0f) * 32767.0f))));
467 *
468 * It is necessary to first convert the vec2 to ivec2 rather than directly
469 * converting vec2 to uvec2 because the latter conversion is undefined.
470 * From page 56 (62 of pdf) of the GLSL ES 3.00 spec: "It is undefined to
471 * convert a negative floating point value to an uint".
472 */
473 assert(vec2_rval->type == &glsl_type_builtin_vec2);
474
475 ir_rvalue *result = pack_uvec2_to_uint(
476 i2u(f2i(round_even(mul(clamp(vec2_rval,
477 constant(-1.0f),
478 constant(1.0f)),
479 constant(32767.0f))))));
480
481 assert(result->type == &glsl_type_builtin_uint);
482 return result;
483 }
484
485 /**
486 * \brief Lower a packSnorm4x8 expression.
487 *
488 * \param vec4_rval is packSnorm4x8's input
489 * \return packSnorm4x8's output as a uint rvalue
490 */
491 ir_rvalue*
lower_pack_snorm_4x8(ir_rvalue * vec4_rval)492 lower_pack_snorm_4x8(ir_rvalue *vec4_rval)
493 {
494 /* From page 137 (143 of pdf) of the GLSL 4.30 spec:
495 *
496 * highp uint packSnorm4x8(vec4 v)
497 * -------------------------------
498 * First, converts each component of the normalized floating-point value
499 * v into 8-bit integer values. Then, the results are packed into the
500 * returned 32-bit unsigned integer.
501 *
502 * The conversion for component c of v to fixed point is done as
503 * follows:
504 *
505 * packSnorm4x8: round(clamp(c, -1, +1) * 127.0)
506 *
507 * The first component of the vector will be written to the least
508 * significant bits of the output; the last component will be written to
509 * the most significant bits.
510 *
511 * This function generates IR that approximates the following pseudo-GLSL:
512 *
513 * return pack_uvec4_to_uint(
514 * uvec4(ivec4(
515 * round(clamp(VEC4_RVALUE, -1.0f, 1.0f) * 127.0f))));
516 *
517 * It is necessary to first convert the vec4 to ivec4 rather than directly
518 * converting vec4 to uvec4 because the latter conversion is undefined.
519 * From page 87 (93 of pdf) of the GLSL 4.30 spec: "It is undefined to
520 * convert a negative floating point value to an uint".
521 */
522 assert(vec4_rval->type == &glsl_type_builtin_vec4);
523
524 ir_rvalue *result = pack_uvec4_to_uint(
525 i2u(f2i(round_even(mul(clamp(vec4_rval,
526 constant(-1.0f),
527 constant(1.0f)),
528 constant(127.0f))))));
529
530 assert(result->type == &glsl_type_builtin_uint);
531 return result;
532 }
533
534 /**
535 * \brief Lower an unpackSnorm2x16 expression.
536 *
537 * \param uint_rval is unpackSnorm2x16's input
538 * \return unpackSnorm2x16's output as a vec2 rvalue
539 */
540 ir_rvalue*
lower_unpack_snorm_2x16(ir_rvalue * uint_rval)541 lower_unpack_snorm_2x16(ir_rvalue *uint_rval)
542 {
543 /* From page 88 (94 of pdf) of the GLSL ES 3.00 spec:
544 *
545 * highp vec2 unpackSnorm2x16 (highp uint p)
546 * -----------------------------------------
547 * First, unpacks a single 32-bit unsigned integer p into a pair of
548 * 16-bit unsigned integers. Then, each component is converted to
549 * a normalized floating-point value to generate the returned
550 * two-component vector.
551 *
552 * The conversion for unpacked fixed-point value f to floating point is
553 * done as follows:
554 *
555 * unpackSnorm2x16: clamp(f / 32767.0, -1,+1)
556 *
557 * The first component of the returned vector will be extracted from the
558 * least significant bits of the input; the last component will be
559 * extracted from the most significant bits.
560 *
561 * This function generates IR that approximates the following pseudo-GLSL:
562 *
563 * return clamp(
564 * ((ivec2(unpack_uint_to_uvec2(UINT_RVALUE)) << 16) >> 16) / 32767.0f,
565 * -1.0f, 1.0f);
566 *
567 * The above IR may appear unnecessarily complex, but the intermediate
568 * conversion to ivec2 and the bit shifts are necessary to correctly unpack
569 * negative floats.
570 *
571 * To see why, consider packing and then unpacking vec2(-1.0, 0.0).
572 * packSnorm2x16 encodes -1.0 as the int16 0xffff. During unpacking, we
573 * place that int16 into an int32, which results in the *positive* integer
574 * 0x0000ffff. The int16's sign bit becomes, in the int32, the rather
575 * unimportant bit 16. We must now extend the int16's sign bit into bits
576 * 17-32, which is accomplished by left-shifting then right-shifting.
577 */
578
579 assert(uint_rval->type == &glsl_type_builtin_uint);
580
581 ir_rvalue *result =
582 clamp(div(i2f(unpack_uint_to_ivec2(uint_rval)),
583 constant(32767.0f)),
584 constant(-1.0f),
585 constant(1.0f));
586
587 assert(result->type == &glsl_type_builtin_vec2);
588 return result;
589 }
590
591 /**
592 * \brief Lower an unpackSnorm4x8 expression.
593 *
594 * \param uint_rval is unpackSnorm4x8's input
595 * \return unpackSnorm4x8's output as a vec4 rvalue
596 */
597 ir_rvalue*
lower_unpack_snorm_4x8(ir_rvalue * uint_rval)598 lower_unpack_snorm_4x8(ir_rvalue *uint_rval)
599 {
600 /* From page 137 (143 of pdf) of the GLSL 4.30 spec:
601 *
602 * highp vec4 unpackSnorm4x8 (highp uint p)
603 * ----------------------------------------
604 * First, unpacks a single 32-bit unsigned integer p into four
605 * 8-bit unsigned integers. Then, each component is converted to
606 * a normalized floating-point value to generate the returned
607 * four-component vector.
608 *
609 * The conversion for unpacked fixed-point value f to floating point is
610 * done as follows:
611 *
612 * unpackSnorm4x8: clamp(f / 127.0, -1, +1)
613 *
614 * The first component of the returned vector will be extracted from the
615 * least significant bits of the input; the last component will be
616 * extracted from the most significant bits.
617 *
618 * This function generates IR that approximates the following pseudo-GLSL:
619 *
620 * return clamp(
621 * ((ivec4(unpack_uint_to_uvec4(UINT_RVALUE)) << 24) >> 24) / 127.0f,
622 * -1.0f, 1.0f);
623 *
624 * The above IR may appear unnecessarily complex, but the intermediate
625 * conversion to ivec4 and the bit shifts are necessary to correctly unpack
626 * negative floats.
627 *
628 * To see why, consider packing and then unpacking vec4(-1.0, 0.0, 0.0,
629 * 0.0). packSnorm4x8 encodes -1.0 as the int8 0xff. During unpacking, we
630 * place that int8 into an int32, which results in the *positive* integer
631 * 0x000000ff. The int8's sign bit becomes, in the int32, the rather
632 * unimportant bit 8. We must now extend the int8's sign bit into bits
633 * 9-32, which is accomplished by left-shifting then right-shifting.
634 */
635
636 assert(uint_rval->type == &glsl_type_builtin_uint);
637
638 ir_rvalue *result =
639 clamp(div(i2f(unpack_uint_to_ivec4(uint_rval)),
640 constant(127.0f)),
641 constant(-1.0f),
642 constant(1.0f));
643
644 assert(result->type == &glsl_type_builtin_vec4);
645 return result;
646 }
647
648 /**
649 * \brief Lower a packUnorm2x16 expression.
650 *
651 * \param vec2_rval is packUnorm2x16's input
652 * \return packUnorm2x16's output as a uint rvalue
653 */
654 ir_rvalue*
lower_pack_unorm_2x16(ir_rvalue * vec2_rval)655 lower_pack_unorm_2x16(ir_rvalue *vec2_rval)
656 {
657 /* From page 88 (94 of pdf) of the GLSL ES 3.00 spec:
658 *
659 * highp uint packUnorm2x16 (vec2 v)
660 * ---------------------------------
661 * First, converts each component of the normalized floating-point value
662 * v into 16-bit integer values. Then, the results are packed into the
663 * returned 32-bit unsigned integer.
664 *
665 * The conversion for component c of v to fixed point is done as
666 * follows:
667 *
668 * packUnorm2x16: round(clamp(c, 0, +1) * 65535.0)
669 *
670 * The first component of the vector will be written to the least
671 * significant bits of the output; the last component will be written to
672 * the most significant bits.
673 *
674 * This function generates IR that approximates the following pseudo-GLSL:
675 *
676 * return pack_uvec2_to_uint(uvec2(
677 * round(clamp(VEC2_RVALUE, 0.0f, 1.0f) * 65535.0f)));
678 *
679 * Here it is safe to directly convert the vec2 to uvec2 because the vec2
680 * has been clamped to a non-negative range.
681 */
682
683 assert(vec2_rval->type == &glsl_type_builtin_vec2);
684
685 ir_rvalue *result = pack_uvec2_to_uint(
686 f2u(round_even(mul(saturate(vec2_rval), constant(65535.0f)))));
687
688 assert(result->type == &glsl_type_builtin_uint);
689 return result;
690 }
691
692 /**
693 * \brief Lower a packUnorm4x8 expression.
694 *
695 * \param vec4_rval is packUnorm4x8's input
696 * \return packUnorm4x8's output as a uint rvalue
697 */
698 ir_rvalue*
lower_pack_unorm_4x8(ir_rvalue * vec4_rval)699 lower_pack_unorm_4x8(ir_rvalue *vec4_rval)
700 {
701 /* From page 137 (143 of pdf) of the GLSL 4.30 spec:
702 *
703 * highp uint packUnorm4x8 (vec4 v)
704 * --------------------------------
705 * First, converts each component of the normalized floating-point value
706 * v into 8-bit integer values. Then, the results are packed into the
707 * returned 32-bit unsigned integer.
708 *
709 * The conversion for component c of v to fixed point is done as
710 * follows:
711 *
712 * packUnorm4x8: round(clamp(c, 0, +1) * 255.0)
713 *
714 * The first component of the vector will be written to the least
715 * significant bits of the output; the last component will be written to
716 * the most significant bits.
717 *
718 * This function generates IR that approximates the following pseudo-GLSL:
719 *
720 * return pack_uvec4_to_uint(uvec4(
721 * round(clamp(VEC2_RVALUE, 0.0f, 1.0f) * 255.0f)));
722 *
723 * Here it is safe to directly convert the vec4 to uvec4 because the vec4
724 * has been clamped to a non-negative range.
725 */
726
727 assert(vec4_rval->type == &glsl_type_builtin_vec4);
728
729 ir_rvalue *result = pack_uvec4_to_uint(
730 f2u(round_even(mul(saturate(vec4_rval), constant(255.0f)))));
731
732 assert(result->type == &glsl_type_builtin_uint);
733 return result;
734 }
735
736 /**
737 * \brief Lower an unpackUnorm2x16 expression.
738 *
739 * \param uint_rval is unpackUnorm2x16's input
740 * \return unpackUnorm2x16's output as a vec2 rvalue
741 */
742 ir_rvalue*
lower_unpack_unorm_2x16(ir_rvalue * uint_rval)743 lower_unpack_unorm_2x16(ir_rvalue *uint_rval)
744 {
745 /* From page 89 (95 of pdf) of the GLSL ES 3.00 spec:
746 *
747 * highp vec2 unpackUnorm2x16 (highp uint p)
748 * -----------------------------------------
749 * First, unpacks a single 32-bit unsigned integer p into a pair of
750 * 16-bit unsigned integers. Then, each component is converted to
751 * a normalized floating-point value to generate the returned
752 * two-component vector.
753 *
754 * The conversion for unpacked fixed-point value f to floating point is
755 * done as follows:
756 *
757 * unpackUnorm2x16: f / 65535.0
758 *
759 * The first component of the returned vector will be extracted from the
760 * least significant bits of the input; the last component will be
761 * extracted from the most significant bits.
762 *
763 * This function generates IR that approximates the following pseudo-GLSL:
764 *
765 * return vec2(unpack_uint_to_uvec2(UINT_RVALUE)) / 65535.0;
766 */
767
768 assert(uint_rval->type == &glsl_type_builtin_uint);
769
770 ir_rvalue *result = div(u2f(unpack_uint_to_uvec2(uint_rval)),
771 constant(65535.0f));
772
773 assert(result->type == &glsl_type_builtin_vec2);
774 return result;
775 }
776
777 /**
778 * \brief Lower an unpackUnorm4x8 expression.
779 *
780 * \param uint_rval is unpackUnorm4x8's input
781 * \return unpackUnorm4x8's output as a vec4 rvalue
782 */
783 ir_rvalue*
lower_unpack_unorm_4x8(ir_rvalue * uint_rval)784 lower_unpack_unorm_4x8(ir_rvalue *uint_rval)
785 {
786 /* From page 137 (143 of pdf) of the GLSL 4.30 spec:
787 *
788 * highp vec4 unpackUnorm4x8 (highp uint p)
789 * ----------------------------------------
790 * First, unpacks a single 32-bit unsigned integer p into four
791 * 8-bit unsigned integers. Then, each component is converted to
792 * a normalized floating-point value to generate the returned
793 * two-component vector.
794 *
795 * The conversion for unpacked fixed-point value f to floating point is
796 * done as follows:
797 *
798 * unpackUnorm4x8: f / 255.0
799 *
800 * The first component of the returned vector will be extracted from the
801 * least significant bits of the input; the last component will be
802 * extracted from the most significant bits.
803 *
804 * This function generates IR that approximates the following pseudo-GLSL:
805 *
806 * return vec4(unpack_uint_to_uvec4(UINT_RVALUE)) / 255.0;
807 */
808
809 assert(uint_rval->type == &glsl_type_builtin_uint);
810
811 ir_rvalue *result = div(u2f(unpack_uint_to_uvec4(uint_rval)),
812 constant(255.0f));
813
814 assert(result->type == &glsl_type_builtin_vec4);
815 return result;
816 }
817
818 /**
819 * \brief Lower the component-wise calculation of packHalf2x16.
820 *
821 * \param f_rval is one component of packHafl2x16's input
822 * \param e_rval is the unshifted exponent bits of f_rval
823 * \param m_rval is the unshifted mantissa bits of f_rval
824 *
825 * \return a uint rvalue that encodes a float16 in its lower 16 bits
826 */
827 ir_rvalue*
pack_half_1x16_nosign(ir_rvalue * f_rval,ir_rvalue * e_rval,ir_rvalue * m_rval)828 pack_half_1x16_nosign(ir_rvalue *f_rval,
829 ir_rvalue *e_rval,
830 ir_rvalue *m_rval)
831 {
832 assert(e_rval->type == &glsl_type_builtin_uint);
833 assert(m_rval->type == &glsl_type_builtin_uint);
834
835 /* uint u16; */
836 ir_variable *u16 = factory.make_temp(&glsl_type_builtin_uint,
837 "tmp_pack_half_1x16_u16");
838
839 /* float f = FLOAT_RVAL; */
840 ir_variable *f = factory.make_temp(&glsl_type_builtin_float,
841 "tmp_pack_half_1x16_f");
842 factory.emit(assign(f, f_rval));
843
844 /* uint e = E_RVAL; */
845 ir_variable *e = factory.make_temp(&glsl_type_builtin_uint,
846 "tmp_pack_half_1x16_e");
847 factory.emit(assign(e, e_rval));
848
849 /* uint m = M_RVAL; */
850 ir_variable *m = factory.make_temp(&glsl_type_builtin_uint,
851 "tmp_pack_half_1x16_m");
852 factory.emit(assign(m, m_rval));
853
854 /* Preliminaries
855 * -------------
856 *
857 * For a float16, the bit layout is:
858 *
859 * sign: 15
860 * exponent: 10:14
861 * mantissa: 0:9
862 *
863 * Let f16 be a float16 value. The sign, exponent, and mantissa
864 * determine its value thus:
865 *
866 * if e16 = 0 and m16 = 0, then zero: (-1)^s16 * 0 (1)
867 * if e16 = 0 and m16!= 0, then subnormal: (-1)^s16 * 2^(e16 - 14) * (m16 / 2^10) (2)
868 * if 0 < e16 < 31, then normal: (-1)^s16 * 2^(e16 - 15) * (1 + m16 / 2^10) (3)
869 * if e16 = 31 and m16 = 0, then infinite: (-1)^s16 * inf (4)
870 * if e16 = 31 and m16 != 0, then NaN (5)
871 *
872 * where 0 <= m16 < 2^10.
873 *
874 * For a float32, the bit layout is:
875 *
876 * sign: 31
877 * exponent: 23:30
878 * mantissa: 0:22
879 *
880 * Let f32 be a float32 value. The sign, exponent, and mantissa
881 * determine its value thus:
882 *
883 * if e32 = 0 and m32 = 0, then zero: (-1)^s * 0 (10)
884 * if e32 = 0 and m32 != 0, then subnormal: (-1)^s * 2^(e32 - 126) * (m32 / 2^23) (11)
885 * if 0 < e32 < 255, then normal: (-1)^s * 2^(e32 - 127) * (1 + m32 / 2^23) (12)
886 * if e32 = 255 and m32 = 0, then infinite: (-1)^s * inf (13)
887 * if e32 = 255 and m32 != 0, then NaN (14)
888 *
889 * where 0 <= m32 < 2^23.
890 *
891 * The minimum and maximum normal float16 values are
892 *
893 * min_norm16 = 2^(1 - 15) * (1 + 0 / 2^10) = 2^(-14) (20)
894 * max_norm16 = 2^(30 - 15) * (1 + 1023 / 2^10) (21)
895 *
896 * The step at max_norm16 is
897 *
898 * max_step16 = 2^5 (22)
899 *
900 * Observe that the float16 boundary values in equations 20-21 lie in the
901 * range of normal float32 values.
902 *
903 *
904 * Rounding Behavior
905 * -----------------
906 * Not all float32 values can be exactly represented as a float16. We
907 * round all such intermediate float32 values to the nearest float16; if
908 * the float32 is exactly between to float16 values, we round to the one
909 * with an even mantissa. This rounding behavior has several benefits:
910 *
911 * - It has no sign bias.
912 *
913 * - It reproduces the behavior of real hardware: opcode F32TO16 in Intel's
914 * GPU ISA.
915 *
916 * - By reproducing the behavior of the GPU (at least on Intel hardware),
917 * compile-time evaluation of constant packHalf2x16 GLSL expressions will
918 * result in the same value as if the expression were executed on the
919 * GPU.
920 *
921 * Calculation
922 * -----------
923 * Our task is to compute s16, e16, m16 given f32. Since this function
924 * ignores the sign bit, assume that s32 = s16 = 0. There are several
925 * cases consider.
926 */
927
928 factory.emit(
929
930 /* Case 1) f32 is NaN
931 *
932 * The resultant f16 will also be NaN.
933 */
934
935 /* if (e32 == 255 && m32 != 0) { */
936 if_tree(logic_and(equal(e, constant(0xffu << 23u)),
937 logic_not(equal(m, constant(0u)))),
938
939 assign(u16, constant(0x7fffu)),
940
941 /* Case 2) f32 lies in the range [0, min_norm16).
942 *
943 * The resultant float16 will be either zero, subnormal, or normal.
944 *
945 * Solving
946 *
947 * f32 = min_norm16 (30)
948 *
949 * gives
950 *
951 * e32 = 113 and m32 = 0 (31)
952 *
953 * Therefore this case occurs if and only if
954 *
955 * e32 < 113 (32)
956 */
957
958 /* } else if (e32 < 113) { */
959 if_tree(less(e, constant(113u << 23u)),
960
961 /* u16 = uint(round_to_even(abs(f32) * float(1u << 24u))); */
962 assign(u16, f2u(round_even(mul(expr(ir_unop_abs, f),
963 constant((float) (1 << 24)))))),
964
965 /* Case 3) f32 lies in the range
966 * [min_norm16, max_norm16 + max_step16).
967 *
968 * The resultant float16 will be either normal or infinite.
969 *
970 * Solving
971 *
972 * f32 = max_norm16 + max_step16 (40)
973 * = 2^15 * (1 + 1023 / 2^10) + 2^5 (41)
974 * = 2^16 (42)
975 * gives
976 *
977 * e32 = 143 and m32 = 0 (43)
978 *
979 * We already solved the boundary condition f32 = min_norm16 above
980 * in equation 31. Therefore this case occurs if and only if
981 *
982 * 113 <= e32 and e32 < 143
983 */
984
985 /* } else if (e32 < 143) { */
986 if_tree(less(e, constant(143u << 23u)),
987
988 /* The addition below handles the case where the mantissa rounds
989 * up to 1024 and bumps the exponent.
990 *
 * u16 = ((e - (112u << 23u)) >> 13u)
 *     + round_to_even(float(m) / float(1u << 13u));
993 */
994 assign(u16, add(rshift(sub(e, constant(112u << 23u)),
995 constant(13u)),
996 f2u(round_even(
997 div(u2f(m), constant((float) (1 << 13))))))),
998
999 /* Case 4) f32 lies in the range [max_norm16 + max_step16, inf].
1000 *
1001 * The resultant float16 will be infinite.
1002 *
1003 * The cases above caught all float32 values in the range
1004 * [0, max_norm16 + max_step16), so this is the fall-through case.
1005 */
1006
1007 /* } else { */
1008
1009 assign(u16, constant(31u << 10u))))));
1010
1011 /* } */
1012
1013 return deref(u16).val;
1014 }
1015
/**
 * \brief Lower a packHalf2x16 expression.
 *
 * \param vec2_rval is packHalf2x16's input
 * \return packHalf2x16's output as a uint rvalue
 */
ir_rvalue*
lower_pack_half_2x16(ir_rvalue *vec2_rval)
{
   /* From page 89 (95 of pdf) of the GLSL ES 3.00 spec:
    *
    *    highp uint packHalf2x16 (mediump vec2 v)
    *    ----------------------------------------
    *    Returns an unsigned integer obtained by converting the components of
    *    a two-component floating-point vector to the 16-bit floating-point
    *    representation found in the OpenGL ES Specification, and then packing
    *    these two 16-bit integers into a 32-bit unsigned integer.
    *
    *    The first vector component specifies the 16 least-significant bits
    *    of the result; the second component specifies the 16 most-significant
    *    bits.
    */

   assert(vec2_rval->type == &glsl_type_builtin_vec2);

   /* Copy the input into a temporary so each component can be swizzled
    * below without re-evaluating the rvalue.
    *
    *    vec2 f = VEC2_RVAL;
    */
   ir_variable *f = factory.make_temp(&glsl_type_builtin_vec2,
                                      "tmp_pack_half_2x16_f");
   factory.emit(assign(f, vec2_rval));

   /* Reinterpret the float32 bit patterns as uints so the exponent and
    * mantissa fields can be masked out.
    *
    *    uvec2 f32 = bitcast_f2u(f);
    */
   ir_variable *f32 = factory.make_temp(&glsl_type_builtin_uvec2,
                                        "tmp_pack_half_2x16_f32");
   factory.emit(assign(f32, expr(ir_unop_bitcast_f2u, f)));

   /* Result of the per-component conversion; each component holds a
    * float16 in its low 16 bits.
    *
    *    uvec2 f16;
    */
   ir_variable *f16 = factory.make_temp(&glsl_type_builtin_uvec2,
                                        "tmp_pack_half_2x16_f16");

   /* Get f32's unshifted exponent bits.
    *
    *    uvec2 e = f32 & 0x7f800000u;
    */
   ir_variable *e = factory.make_temp(&glsl_type_builtin_uvec2,
                                      "tmp_pack_half_2x16_e");
   factory.emit(assign(e, bit_and(f32, constant(0x7f800000u))));

   /* Get f32's unshifted mantissa bits.
    *
    *    uvec2 m = f32 & 0x007fffffu;
    */
   ir_variable *m = factory.make_temp(&glsl_type_builtin_uvec2,
                                      "tmp_pack_half_2x16_m");
   factory.emit(assign(m, bit_and(f32, constant(0x007fffffu))));

   /* Set f16's exponent and mantissa bits, one component at a time.
    * Note that the helper also receives the original float component,
    * which it uses for the round-to-even conversion.
    *
    *    f16.x = pack_half_1x16_nosign(f.x, e.x, m.x);
    *    f16.y = pack_half_1x16_nosign(f.y, e.y, m.y);
    */
   factory.emit(assign(f16, pack_half_1x16_nosign(swizzle_x(f),
                                                  swizzle_x(e),
                                                  swizzle_x(m)),
                       WRITEMASK_X));
   factory.emit(assign(f16, pack_half_1x16_nosign(swizzle_y(f),
                                                  swizzle_y(e),
                                                  swizzle_y(m)),
                       WRITEMASK_Y));

   /* Set f16's sign bits (the float32 sign bit at position 31 maps to the
    * float16 sign bit at position 15).
    *
    *    f16 |= (f32 & (1u << 31u)) >> 16u;
    */
   factory.emit(
      assign(f16, bit_or(f16,
                         rshift(bit_and(f32, constant(1u << 31u)),
                                constant(16u)))));


   /* Pack the two half values into one uint, first component in the low
    * 16 bits as the spec requires.
    *
    *    return (f16.y << 16u) | f16.x;
    */
   ir_rvalue *result = bit_or(lshift(swizzle_y(f16),
                                     constant(16u)),
                              swizzle_x(f16));

   assert(result->type == &glsl_type_builtin_uint);
   return result;
}
1103
/**
 * \brief Lower the component-wise calculation of unpackHalf2x16.
 *
 * Given a uint that encodes a float16 in its lower 16 bits, this function
 * returns a uint that encodes a float32 with the same value. The sign bit
 * of the float16 is ignored.
 *
 * \param e_rval is the unshifted exponent bits of a float16
 * \param m_rval is the unshifted mantissa bits of a float16
 * \return a uint rvalue that encodes a float32
 */
ir_rvalue*
unpack_half_1x16_nosign(ir_rvalue *e_rval, ir_rvalue *m_rval)
{
   assert(e_rval->type == &glsl_type_builtin_uint);
   assert(m_rval->type == &glsl_type_builtin_uint);

   /* Holds the resultant float32 bit pattern.
    *
    *    uint u32;
    */
   ir_variable *u32 = factory.make_temp(&glsl_type_builtin_uint,
                                        "tmp_unpack_half_1x16_u32");

   /* uint e = E_RVAL; */
   ir_variable *e = factory.make_temp(&glsl_type_builtin_uint,
                                      "tmp_unpack_half_1x16_e");
   factory.emit(assign(e, e_rval));

   /* uint m = M_RVAL; */
   ir_variable *m = factory.make_temp(&glsl_type_builtin_uint,
                                      "tmp_unpack_half_1x16_m");
   factory.emit(assign(m, m_rval));

   /* Preliminaries
    * -------------
    *
    * For a float16, the bit layout is:
    *
    *   sign:     15
    *   exponent: 10:14
    *   mantissa: 0:9
    *
    * Let f16 be a float16 value. The sign, exponent, and mantissa
    * determine its value thus:
    *
    *   if e16 = 0 and m16 = 0, then zero:       (-1)^s16 * 0                               (1)
    *   if e16 = 0 and m16 != 0, then subnormal: (-1)^s16 * 2^(e16 - 14) * (m16 / 2^10)     (2)
    *   if 0 < e16 < 31, then normal:            (-1)^s16 * 2^(e16 - 15) * (1 + m16 / 2^10) (3)
    *   if e16 = 31 and m16 = 0, then infinite:  (-1)^s16 * inf                             (4)
    *   if e16 = 31 and m16 != 0, then NaN                                                  (5)
    *
    * where 0 <= m16 < 2^10.
    *
    * For a float32, the bit layout is:
    *
    *   sign:     31
    *   exponent: 23:30
    *   mantissa: 0:22
    *
    * Let f32 be a float32 value. The sign, exponent, and mantissa
    * determine its value thus:
    *
    *   if e32 = 0 and m32 = 0, then zero:       (-1)^s * 0                                 (10)
    *   if e32 = 0 and m32 != 0, then subnormal: (-1)^s * 2^(e32 - 126) * (m32 / 2^23)      (11)
    *   if 0 < e32 < 255, then normal:           (-1)^s * 2^(e32 - 127) * (1 + m32 / 2^23)  (12)
    *   if e32 = 255 and m32 = 0, then infinite: (-1)^s * inf                               (13)
    *   if e32 = 255 and m32 != 0, then NaN                                                 (14)
    *
    * where 0 <= m32 < 2^23.
    *
    * Calculation
    * -----------
    * Our task is to compute s32, e32, m32 given f16. Since this function
    * ignores the sign bit, assume that s32 = s16 = 0. There are several
    * cases to consider.
    */

   factory.emit(

      /* Case 1) f16 is zero or subnormal.
       *
       * The simplest method of calculating f32 in this case is
       *
       *   f32 = f16                      (20)
       *       = 2^(-14) * (m16 / 2^10)   (21)
       *       = m16 / 2^24               (22)
       */

      /* if (e16 == 0) { */
      if_tree(equal(e, constant(0u)),

         /* u32 = bitcast_f2u(float(m) / float(1 << 24)); */
         assign(u32, expr(ir_unop_bitcast_f2u,
                          div(u2f(m), constant((float)(1 << 24))))),

      /* Case 2) f16 is normal.
       *
       * The equation
       *
       *   f32 = f16                            (30)
       *   2^(e32 - 127) * (1 + m32 / 2^23) =   (31)
       *     2^(e16 - 15) * (1 + m16 / 2^10)
       *
       * can be decomposed into two
       *
       *   2^(e32 - 127) = 2^(e16 - 15)         (32)
       *   1 + m32 / 2^23 = 1 + m16 / 2^10      (33)
       *
       * which solve to
       *
       *   e32 = e16 + 112                      (34)
       *   m32 = m16 * 2^13                     (35)
       *
       * Note that e below holds the *unshifted* exponent bits (e16 << 10),
       * so adding (112 << 10) and left-shifting the whole field by 13
       * implements equations 34 and 35 at once.
       */

      /* } else if (e16 < 31) { */
      if_tree(less(e, constant(31u << 10u)),

         /* u32 = ((e + (112 << 10)) | m) << 13;
          */
         assign(u32, lshift(bit_or(add(e, constant(112u << 10u)), m),
                            constant(13u))),


      /* Case 3) f16 is infinite: all exponent bits set, zero mantissa. */
      if_tree(equal(m, constant(0u)),

         assign(u32, constant(255u << 23u)),

      /* Case 4) f16 is NaN: emit a canonical float32 NaN. */
      /* } else { */

         assign(u32, constant(0x7fffffffu))))));

   /* } */

   return deref(u32).val;
}
1239
/**
 * \brief Lower an unpackHalf2x16 expression.
 *
 * \param uint_rval is unpackHalf2x16's input
 * \return unpackHalf2x16's output as a vec2 rvalue
 */
ir_rvalue*
lower_unpack_half_2x16(ir_rvalue *uint_rval)
{
   /* From page 89 (95 of pdf) of the GLSL ES 3.00 spec:
    *
    *    mediump vec2 unpackHalf2x16 (highp uint v)
    *    ------------------------------------------
    *    Returns a two-component floating-point vector with components
    *    obtained by unpacking a 32-bit unsigned integer into a pair of 16-bit
    *    values, interpreting those values as 16-bit floating-point numbers
    *    according to the OpenGL ES Specification, and converting them to
    *    32-bit floating-point values.
    *
    *    The first component of the vector is obtained from the
    *    16 least-significant bits of v; the second component is obtained
    *    from the 16 most-significant bits of v.
    */
   assert(uint_rval->type == &glsl_type_builtin_uint);

   /* Split the input into its two 16-bit halves; note that u is a scalar:
    *
    *    uint u = RVALUE;
    *    uvec2 f16 = uvec2(u & 0xffffu, u >> 16u);
    */
   ir_variable *f16 = factory.make_temp(&glsl_type_builtin_uvec2,
                                        "tmp_unpack_half_2x16_f16");
   factory.emit(assign(f16, unpack_uint_to_uvec2(uint_rval)));

   /* Holds the resultant float32 bit patterns.
    *
    *    uvec2 f32;
    */
   ir_variable *f32 = factory.make_temp(&glsl_type_builtin_uvec2,
                                        "tmp_unpack_half_2x16_f32");

   /* Get f16's unshifted exponent bits.
    *
    *    uvec2 e = f16 & 0x7c00u;
    */
   ir_variable *e = factory.make_temp(&glsl_type_builtin_uvec2,
                                      "tmp_unpack_half_2x16_e");
   factory.emit(assign(e, bit_and(f16, constant(0x7c00u))));

   /* Get f16's unshifted mantissa bits.
    *
    *    uvec2 m = f16 & 0x03ffu;
    */
   ir_variable *m = factory.make_temp(&glsl_type_builtin_uvec2,
                                      "tmp_unpack_half_2x16_m");
   factory.emit(assign(m, bit_and(f16, constant(0x03ffu))));

   /* Set f32's exponent and mantissa bits, one component at a time.
    *
    *    f32.x = unpack_half_1x16_nosign(e.x, m.x);
    *    f32.y = unpack_half_1x16_nosign(e.y, m.y);
    */
   factory.emit(assign(f32, unpack_half_1x16_nosign(swizzle_x(e),
                                                    swizzle_x(m)),
                       WRITEMASK_X));
   factory.emit(assign(f32, unpack_half_1x16_nosign(swizzle_y(e),
                                                    swizzle_y(m)),
                       WRITEMASK_Y));

   /* Set f32's sign bit (the float16 sign bit at position 15 maps to the
    * float32 sign bit at position 31).
    *
    *    f32 |= (f16 & 0x8000u) << 16u;
    */
   factory.emit(assign(f32, bit_or(f32,
                                   lshift(bit_and(f16,
                                                  constant(0x8000u)),
                                          constant(16u)))));

   /* return bitcast_u2f(f32); */
   ir_rvalue *result = expr(ir_unop_bitcast_u2f, f32);
   assert(result->type == &glsl_type_builtin_vec2);
   return result;
}
1318 };
1319
1320 } // namespace anonymous
1321
1322 /**
1323 * \brief Lower the builtin packing functions.
1324 */
1325 bool
lower_packing_builtins(exec_list * instructions,bool has_shading_language_packing,bool has_gpu_shader5,bool has_half_float_packing)1326 lower_packing_builtins(exec_list *instructions,
1327 bool has_shading_language_packing,
1328 bool has_gpu_shader5,
1329 bool has_half_float_packing)
1330 {
1331 if (!has_shading_language_packing)
1332 return false;
1333
1334 int op_mask = LOWER_PACK_SNORM_2x16 |
1335 LOWER_UNPACK_SNORM_2x16 |
1336 LOWER_PACK_UNORM_2x16 |
1337 LOWER_UNPACK_UNORM_2x16 |
1338 LOWER_PACK_SNORM_4x8 |
1339 LOWER_UNPACK_SNORM_4x8 |
1340 LOWER_UNPACK_UNORM_4x8 |
1341 LOWER_PACK_UNORM_4x8;
1342
1343 if (has_gpu_shader5)
1344 op_mask |= LOWER_PACK_USE_BFI | LOWER_PACK_USE_BFE;
1345
1346 if (!has_half_float_packing)
1347 op_mask |= LOWER_PACK_HALF_2x16 | LOWER_UNPACK_HALF_2x16;
1348
1349 lower_packing_builtins_visitor v(op_mask);
1350 visit_list_elements(&v, instructions, true);
1351 return v.get_progress();
1352 }
1353