• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1import re
2from nir_opcodes import opcodes
3from nir_opcodes import type_has_size, type_size, type_sizes, type_base_type
4
def type_add_size(type_, size):
    """Return ``type_`` qualified with a bit size.

    A type for which ``type_has_size`` is true is handed back unchanged;
    any other type gets ``size`` appended as a decimal suffix.
    """
    return type_ if type_has_size(type_) else type_ + str(size)
9
def op_bit_sizes(op):
    """Return the sorted bit sizes shared by all unsized operands of ``op``.

    The output type and every input type that has no explicit size each
    contribute the set given by ``type_sizes``; the result is the sorted
    intersection of those sets.  ``None`` is returned when every operand
    type already carries a size.
    """
    operand_types = [op.output_type] + list(op.input_types)
    candidate_sets = [set(type_sizes(operand))
                      for operand in operand_types
                      if not type_has_size(operand)]
    if not candidate_sets:
        return None
    return sorted(set.intersection(*candidate_sets))
23
def get_const_field(type_):
    """Return the constant-value member name used to access ``type_``.

    The template uses the result as the field selected on ``_src`` and
    ``_dst_val`` entries:

    * 1-bit types use ``b``;
    * booleans use the signed-integer field of the same width;
    * ``float16`` uses ``u16``;
    * anything else uses the first letter of its base type followed by
      its size.
    """
    bits = type_size(type_)
    if bits == 1:
        return 'b'

    base = type_base_type(type_)
    if base == 'bool':
        return 'i' + str(bits)
    if type_ == "float16":
        return "u16"
    return base[0] + str(bits)
33
34template = """\
35/*
36 * Copyright (C) 2014 Intel Corporation
37 *
38 * Permission is hereby granted, free of charge, to any person obtaining a
39 * copy of this software and associated documentation files (the "Software"),
40 * to deal in the Software without restriction, including without limitation
41 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
42 * and/or sell copies of the Software, and to permit persons to whom the
43 * Software is furnished to do so, subject to the following conditions:
44 *
45 * The above copyright notice and this permission notice (including the next
46 * paragraph) shall be included in all copies or substantial portions of the
47 * Software.
48 *
49 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
50 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
51 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
52 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
53 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
54 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
55 * IN THE SOFTWARE.
56 */
57
58#include <math.h>
59#include "util/rounding.h" /* for _mesa_roundeven */
60#include "util/half_float.h"
61#include "util/double.h"
62#include "util/softfloat.h"
63#include "util/bigmath.h"
64#include "util/format/format_utils.h"
65#include "util/format_r11g11b10f.h"
66#include "util/u_math.h"
67#include "nir_constant_expressions.h"
68
69/**
70 * \brief Checks if the provided value is a denorm and flushes it to zero.
71 */
72static void
73constant_denorm_flush_to_zero(nir_const_value *value, unsigned bit_size)
74{
75    switch(bit_size) {
76    case 64:
77        if (0 == (value->u64 & 0x7ff0000000000000))
78            value->u64 &= 0x8000000000000000;
79        break;
80    case 32:
81        if (0 == (value->u32 & 0x7f800000))
82            value->u32 &= 0x80000000;
83        break;
84    case 16:
85        if (0 == (value->u16 & 0x7c00))
86            value->u16 &= 0x8000;
87    }
88}
89
/**
 * Convert one float component to the 8-bit encoding of packSnorm4x8.
 */
static uint8_t
pack_snorm_1x8(float x)
{
    /* Section 8.4 of the GLSL 4.30 spec gives the per-component conversion
     * for packSnorm4x8 as:
     *
     *      round(clamp(c, -1, +1) * 127.0)
     *
     * The rounded value goes through int before uint8_t, because casting a
     * negative float to a uint is undefined.
     */
   const float clamped = CLAMP(x, -1.0f, +1.0f);
   const int fixed = (int) _mesa_roundevenf(clamped * 127.0f);

   return (uint8_t) fixed;
}
111
/**
 * Convert one float component to the 16-bit encoding of packSnorm2x16.
 */
static uint16_t
pack_snorm_1x16(float x)
{
    /* Section 8.4 of the GLSL ES 3.00 spec gives the per-component
     * conversion for packSnorm2x16 as:
     *
     *      round(clamp(c, -1, +1) * 32767.0)
     *
     * The rounded value goes through int before uint16_t, because casting
     * a negative float to a uint is undefined.
     */
   const float clamped = CLAMP(x, -1.0f, +1.0f);
   const int fixed = (int) _mesa_roundevenf(clamped * 32767.0f);

   return (uint16_t) fixed;
}
133
/**
 * Convert one 8-bit component of unpackSnorm4x8 back to a float.
 */
static float
unpack_snorm_1x8(uint8_t u)
{
    /* Section 8.4 of the GLSL 4.30 spec gives the conversion of the
     * unpacked fixed-point value f for unpackSnorm4x8 as:
     *
     *       clamp(f / 127.0, -1, +1)
     *
     * The byte is reinterpreted as signed before the division.
     */
   const int8_t fixed = (int8_t) u;
   const float scaled = fixed / 127.0f;

   return CLAMP(scaled, -1.0f, +1.0f);
}
151
/**
 * Convert one 16-bit component of unpackSnorm2x16 back to a float.
 */
static float
unpack_snorm_1x16(uint16_t u)
{
    /* Section 8.4 of the GLSL ES 3.00 spec gives the conversion of the
     * unpacked fixed-point value f for unpackSnorm2x16 as:
     *
     *       clamp(f / 32767.0, -1, +1)
     *
     * The word is reinterpreted as signed before the division.
     */
   const int16_t fixed = (int16_t) u;
   const float scaled = fixed / 32767.0f;

   return CLAMP(scaled, -1.0f, +1.0f);
}
169
/**
 * Convert one float component to the 8-bit encoding of packUnorm4x8.
 */
static uint8_t
pack_unorm_1x8(float x)
{
    /* Section 8.4 of the GLSL 4.30 spec gives the per-component conversion
     * for packUnorm4x8 as:
     *
     *       round(clamp(c, 0, +1) * 255.0)
     */
   const float clamped = CLAMP(x, 0.0f, 1.0f);
   const int fixed = (int) _mesa_roundevenf(clamped * 255.0f);

   return (uint8_t) fixed;
}
188
/**
 * Convert one float component to the 16-bit encoding of packUnorm2x16.
 */
static uint16_t
pack_unorm_1x16(float x)
{
    /* Section 8.4 of the GLSL ES 3.00 spec gives the per-component
     * conversion for packUnorm2x16 as:
     *
     *       round(clamp(c, 0, +1) * 65535.0)
     */
   const float clamped = CLAMP(x, 0.0f, 1.0f);
   const int fixed = (int) _mesa_roundevenf(clamped * 65535.0f);

   return (uint16_t) fixed;
}
207
/**
 * Convert one 8-bit component of unpackUnorm4x8 back to a float.
 */
static float
unpack_unorm_1x8(uint8_t u)
{
    /* Section 8.4 of the GLSL 4.30 spec gives the conversion of the
     * unpacked fixed-point value f for unpackUnorm4x8 as:
     *
     *       f / 255.0
     */
   const float numerator = (float) u;

   return numerator / 255.0f;
}
225
/**
 * Convert one 16-bit component of unpackUnorm2x16 back to a float.
 */
static float
unpack_unorm_1x16(uint16_t u)
{
    /* Section 8.4 of the GLSL ES 3.00 spec gives the conversion of the
     * unpacked fixed-point value f for unpackUnorm2x16 as:
     *
     *       f / 65535.0
     */
   const float numerator = (float) u;

   return numerator / 65535.0f;
}
243
/**
 * Convert one float component to the half-float bits of packHalf2x16,
 * using _mesa_float_to_half.
 */
static uint16_t
pack_half_1x16(float x)
{
   const uint16_t half_bits = _mesa_float_to_half(x);

   return half_bits;
}
252
/**
 * Convert one float component to the half-float bits of packHalf2x16,
 * RTZ mode (uses _mesa_float_to_float16_rtz).
 */
static uint16_t
pack_half_1x16_rtz(float x)
{
   const uint16_t half_bits = _mesa_float_to_float16_rtz(x);

   return half_bits;
}
261
/**
 * Evaluate one component of unpackHalf2x16, first replacing an input whose
 * exponent bits are all zero by a zero of the same sign.
 */
static float
unpack_half_1x16_flush_to_zero(uint16_t u)
{
   /* Keep only the sign bit when the exponent field (0x7c00) is empty. */
   const uint16_t bits = (u & 0x7c00) != 0 ? u : (u & 0x8000);

   return _mesa_half_to_float(bits);
}
272
/**
 * Convert the half-float bits of one unpackHalf2x16 component to a float.
 */
static float
unpack_half_1x16(uint16_t u)
{
   const float value = _mesa_half_to_float(u);

   return value;
}
281
282/* Broadcom v3d specific instructions */
/**
 * Packs three half floats, supplied in two 32-bit words, into an
 * r11g11b10f:
 *
 * dst[10:0]  = float16_to_float11 (src0[15:0])
 * dst[21:11] = float16_to_float11 (src0[31:16])
 * dst[31:22] = float16_to_float10 (src1[15:0])
 *
 * The upper half of src1 is not used.
 */
static uint32_t pack_32_to_r11g11b10_v3d(const uint32_t src0,
                                         const uint32_t src1)
{
   float rgb[3];

   rgb[0] = unpack_half_1x16(src0 & 0xffff);
   rgb[1] = unpack_half_1x16(src0 >> 16);
   rgb[2] = unpack_half_1x16(src1 & 0xffff);

   return float3_to_r11g11b10f(rgb);
}
301
/**
  * The three helpers below are thin wrappers over pack_snorm_1x8,
  * pack_snorm_1x16 and pack_unorm_1x16: they take the raw bits of a half
  * float (uint16_t) or of a 32-bit float (uint32_t) instead of a float value.
  */
/* Half-float bits -> float -> 8-bit snorm. */
static inline uint8_t _mesa_half_to_snorm8(uint16_t val)
{
   const float as_float = _mesa_half_to_float(val);

   return pack_snorm_1x8(as_float);
}
310
311static uint16_t _mesa_float_to_snorm16(uint32_t val)
312{
313   union fi aux;
314   aux.ui = val;
315   return pack_snorm_1x16(aux.f);
316}
317
318static uint16_t _mesa_float_to_unorm16(uint32_t val)
319{
320   union fi aux;
321   aux.ui = val;
322   return pack_unorm_1x16(aux.f);
323}
324
/* Convert the float held in f32 (via uif) to half-float bits. */
static inline uint32_t float_pack16_v3d(uint32_t f32)
{
   const float value = uif(f32);

   return _mesa_float_to_half(value);
}
329
/* Convert half-float bits to a float and return it through fui. */
static inline uint32_t float_unpack16_v3d(uint32_t f16)
{
   const float value = _mesa_half_to_float(f16);

   return fui(value);
}
334
/* Pack a into the low 16 bits and b into the high 16 bits, as halves. */
static inline uint32_t vfpack_v3d(uint32_t a, uint32_t b)
{
   const uint32_t low_half = float_pack16_v3d(a);
   const uint32_t high_half = float_pack16_v3d(b);

   return (high_half << 16) | low_half;
}
339
/* Apply SATURATE to each of the two half floats packed in a. */
static inline uint32_t vfsat_v3d(uint32_t a)
{
   const float low_value = _mesa_half_to_float(a & 0xffff);
   const float high_value = _mesa_half_to_float(a >> 16);

   const uint32_t low = fui(SATURATE(low_value));
   const uint32_t high = fui(SATURATE(high_value));

   return vfpack_v3d(low, high);
}
347
/* Multiply two floats that are passed and returned through uif/fui. */
static inline uint32_t fmul_v3d(uint32_t a, uint32_t b)
{
   const float product = uif(a) * uif(b);

   return fui(product);
}
352
/* Multiply the two packed half floats of a and b lane by lane. */
static uint32_t vfmul_v3d(uint32_t a, uint32_t b)
{
   const uint32_t a_low = float_unpack16_v3d(a & 0xffff);
   const uint32_t b_low = float_unpack16_v3d(b & 0xffff);
   const uint32_t a_high = float_unpack16_v3d(a >> 16);
   const uint32_t b_high = float_unpack16_v3d(b >> 16);

   const uint32_t low = fmul_v3d(a_low, b_low);
   const uint32_t high = fmul_v3d(a_high, b_high);

   return vfpack_v3d(low, high);
}
362
/* Convert 2x16-bit floating point to 2x10-bit unorm */
static uint32_t pack_2x16_to_unorm_2x10(uint32_t src0)
{
   const uint32_t saturated = vfsat_v3d(src0);

   return vfmul_v3d(saturated, 0x03ff03ff);
}
368
/*
 * Convert 2x16-bit floating point to one 2-bit and one
 * 10-bit unorm
 */
static uint32_t pack_2x16_to_unorm_10_2(uint32_t src0)
{
   const uint32_t saturated = vfsat_v3d(src0);

   return vfmul_v3d(saturated, 0x000303ff);
}
377
/*
 * Add to src2 the absolute difference of each of the four byte pairs of
 * src0 (reference) and src1, skipping pairs whose reference byte is zero.
 */
static uint32_t
msad(uint32_t src0, uint32_t src1, uint32_t src2) {
   uint32_t sum = src2;
   for (unsigned shift = 0; shift < 32; shift += 8) {
      const uint8_t ref = src0 >> shift;
      const uint8_t src = src1 >> shift;
      if (ref == 0)
         continue;
      sum += MAX2(ref, src) - MIN2(ref, src);
   }
   return sum;
}
389
/* Some typed vector structures to make things like src0.y work */
typedef int8_t int1_t;
typedef uint8_t uint1_t;
typedef float float16_t;
typedef float float32_t;
typedef double float64_t;
typedef bool bool1_t;
typedef bool bool8_t;
typedef bool bool16_t;
typedef bool bool32_t;
typedef bool bool64_t;
## Emit one 16-member struct per (base type, bit size) pair; the members
## are named x, y, z, w and then e through p.
% for type in ["float", "int", "uint", "bool"]:
% for width in type_sizes(type):
struct ${type}${width}_vec {
% for component in "xyzwefghijklmnop":
   ${type}${width}_t ${component};
% endfor
};
% endfor
% endfor
423
<%def name="evaluate_op(op, bit_size, execution_mode)">
   ## Emit the C statements that evaluate ``op`` for one ``bit_size``.
   ##
   ## Unsized operand types are first qualified with ``bit_size``.  Inputs
   ## with a fixed size become ``srcN`` vector structs; per-component inputs
   ## become scalar ``srcN`` variables inside a loop over num_components.
   ## The result of ``op.const_expr`` is then stored into ``_dst_val`` with
   ## the conversion its type needs (1-bit truncation, 0/-1 booleans,
   ## float16 rounding), and float results other than fquantize2f16 are
   ## flushed to zero when the execution mode asks for it.
   <%
   output_type = type_add_size(op.output_type, bit_size)
   input_types = [type_add_size(type_, bit_size) for type_ in op.input_types]
   %>

   ## For each non-per-component input, create a variable srcN that
   ## contains x, y, z, and w elements which are filled in with the
   ## appropriately-typed values.
   % for j in range(op.num_inputs):
      % if op.input_sizes[j] == 0:
         <% continue %>
      % elif "src" + str(j) not in op.const_expr:
         ## Avoid unused variable warnings
         <% continue %>
      %endif

      const struct ${input_types[j]}_vec src${j} = {
      % for k in range(op.input_sizes[j]):
         % if input_types[j] == "int1":
             /* 1-bit integers use a 0/-1 convention */
             -(int1_t)_src[${j}][${k}].b,
         % elif input_types[j] == "float16":
            _mesa_half_to_float(_src[${j}][${k}].u16),
         % else:
            _src[${j}][${k}].${get_const_field(input_types[j])},
         % endif
      % endfor
      % for k in range(op.input_sizes[j], 16):
         0,
      % endfor
      };
   % endfor

   % if op.output_size == 0:
      ## For per-component instructions, we need to iterate over the
      ## components and apply the constant expression one component
      ## at a time.
      for (unsigned _i = 0; _i < num_components; _i++) {
         ## For each per-component input, create a variable srcN that
         ## contains the value of the current (_i'th) component.
         % for j in range(op.num_inputs):
            % if op.input_sizes[j] != 0:
               <% continue %>
            % elif "src" + str(j) not in op.const_expr:
               ## Avoid unused variable warnings
               <% continue %>
            % elif input_types[j] == "int1":
               /* 1-bit integers use a 0/-1 convention */
               const int1_t src${j} = -(int1_t)_src[${j}][_i].b;
            % elif input_types[j] == "float16":
               const float src${j} =
                  _mesa_half_to_float(_src[${j}][_i].u16);
            % else:
               const ${input_types[j]}_t src${j} =
                  _src[${j}][_i].${get_const_field(input_types[j])};
            % endif
         % endfor

         ## Create an appropriately-typed variable dst and assign the
         ## result of the const_expr to it.  If const_expr already contains
         ## writes to dst, just include const_expr directly.
         % if "dst" in op.const_expr:
            ${output_type}_t dst;

            ${op.const_expr}
         % else:
            ${output_type}_t dst = ${op.const_expr};
         % endif

         ## Store the current component of the actual destination to the
         ## value of dst.
         % if output_type == "int1" or output_type == "uint1":
            /* 1-bit integers get truncated */
            _dst_val[_i].b = dst & 1;
         % elif output_type.startswith("bool"):
            ## Sanitize the C value to a proper NIR 0/-1 bool
            _dst_val[_i].${get_const_field(output_type)} = -(int)dst;
         % elif output_type == "float16":
            if (nir_is_rounding_mode_rtz(execution_mode, 16)) {
               _dst_val[_i].u16 = _mesa_float_to_float16_rtz(dst);
            } else {
               _dst_val[_i].u16 = _mesa_float_to_float16_rtne(dst);
            }
         % else:
            _dst_val[_i].${get_const_field(output_type)} = dst;
         % endif

         % if op.name != "fquantize2f16" and type_base_type(output_type) == "float":
            % if type_has_size(output_type):
               if (nir_is_denorm_flush_to_zero(execution_mode, ${type_size(output_type)})) {
                  constant_denorm_flush_to_zero(&_dst_val[_i], ${type_size(output_type)});
               }
            % else:
               ## The loop variable of the enclosing C loop is _i.
               if (nir_is_denorm_flush_to_zero(execution_mode, ${bit_size})) {
                  constant_denorm_flush_to_zero(&_dst_val[_i], bit_size);
               }
            %endif
         % endif
      }
   % else:
      ## In the non-per-component case, create a struct dst with
      ## appropriately-typed elements x, y, z, and w and assign the result
      ## of the const_expr to all components of dst, or include the
      ## const_expr directly if it writes to dst already.
      struct ${output_type}_vec dst;

      % if "dst" in op.const_expr:
         ${op.const_expr}
      % else:
         ## Splat the value to all components.  This way expressions which
         ## write the same value to all components don't need to explicitly
         ## write to dest.
         dst.x = dst.y = dst.z = dst.w = ${op.const_expr};
      % endif

      ## For each component in the destination, copy the value of dst to
      ## the actual destination.
      % for k in range(op.output_size):
         % if output_type == "int1" or output_type == "uint1":
            /* 1-bit integers get truncated */
            _dst_val[${k}].b = dst.${"xyzwefghijklmnop"[k]} & 1;
         % elif output_type.startswith("bool"):
            ## Sanitize the C value to a proper NIR 0/-1 bool
            _dst_val[${k}].${get_const_field(output_type)} = -(int)dst.${"xyzwefghijklmnop"[k]};
         % elif output_type == "float16":
            if (nir_is_rounding_mode_rtz(execution_mode, 16)) {
               _dst_val[${k}].u16 = _mesa_float_to_float16_rtz(dst.${"xyzwefghijklmnop"[k]});
            } else {
               _dst_val[${k}].u16 = _mesa_float_to_float16_rtne(dst.${"xyzwefghijklmnop"[k]});
            }
         % else:
            _dst_val[${k}].${get_const_field(output_type)} = dst.${"xyzwefghijklmnop"[k]};
         % endif

         % if op.name != "fquantize2f16" and type_base_type(output_type) == "float":
            % if type_has_size(output_type):
               if (nir_is_denorm_flush_to_zero(execution_mode, ${type_size(output_type)})) {
                  constant_denorm_flush_to_zero(&_dst_val[${k}], ${type_size(output_type)});
               }
            % else:
               if (nir_is_denorm_flush_to_zero(execution_mode, ${bit_size})) {
                  constant_denorm_flush_to_zero(&_dst_val[${k}], bit_size);
               }
            % endif
         % endif
      % endfor
   % endif
</%def>
573
## Emit one static evaluate_<opcode>() function per entry of ``opcodes``,
## in sorted name order.
% for name, op in sorted(opcodes.items()):
% if op.name == "fsat":
#if defined(_MSC_VER) && (defined(_M_ARM64) || defined(_M_ARM64EC))
#pragma optimize("", off) /* Temporary work-around for MSVC compiler bug, present in VS2019 16.9.2 */
#endif
% endif
static void
evaluate_${name}(nir_const_value *_dst_val,
                 UNUSED unsigned num_components,
                 ${"UNUSED" if op_bit_sizes(op) is None else ""} unsigned bit_size,
                 UNUSED nir_const_value **_src,
                 UNUSED unsigned execution_mode)
{
   ## Ops with at least one unsized operand type get a switch with one case
   ## per bit size returned by op_bit_sizes(); fully sized ops get a single
   ## body, rendered with a bit size of 0.
   % if op_bit_sizes(op) is not None:
      switch (bit_size) {
      % for bit_size in op_bit_sizes(op):
      case ${bit_size}: {
         ${evaluate_op(op, bit_size, execution_mode)}
         break;
      }
      % endfor

      default:
         unreachable("unknown bit width");
      }
   % else:
      ${evaluate_op(op, 0, execution_mode)}
   % endif
}
## Turn optimization back on after the MSVC work-around for fsat.
% if op.name == "fsat":
#if defined(_MSC_VER) && (defined(_M_ARM64) || defined(_M_ARM64EC))
#pragma optimize("", on) /* Temporary work-around for MSVC compiler bug, present in VS2019 16.9.2 */
#endif
% endif
% endfor
609
## Public entry point of the generated file: dispatch on the opcode to the
## matching evaluate_<opcode>() function, forwarding all arguments.
void
nir_eval_const_opcode(nir_op op, nir_const_value *dest,
                      unsigned num_components, unsigned bit_width,
                      nir_const_value **src,
                      unsigned float_controls_execution_mode)
{
   switch (op) {
## One case per opcode, in sorted name order.
% for name in sorted(opcodes.keys()):
   case nir_op_${name}:
      evaluate_${name}(dest, num_components, bit_width, src, float_controls_execution_mode);
      return;
% endfor
   default:
      unreachable("shouldn't get here");
   }
}"""
626
627from mako.template import Template
628
# Names made available to the Mako template: the opcode table plus the
# type helpers the template calls while rendering.
render_context = {
    "opcodes": opcodes,
    "type_sizes": type_sizes,
    "type_base_type": type_base_type,
    "type_size": type_size,
    "type_has_size": type_has_size,
    "type_add_size": type_add_size,
    "op_bit_sizes": op_bit_sizes,
    "get_const_field": get_const_field,
}

# Render the template and write the generated C source to stdout.
print(Template(template).render(**render_context))
636