• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1<!--
2  Copyright (C) 2021 Collabora Ltd.
3
4  Permission is hereby granted, free of charge, to any person obtaining a
5  copy of this software and associated documentation files (the "Software"),
6  to deal in the Software without restriction, including without limitation
7  the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  and/or sell copies of the Software, and to permit persons to whom the
9  Software is furnished to do so, subject to the following conditions:
10
11  The above copyright notice and this permission notice (including the next
12  paragraph) shall be included in all copies or substantial portions of the
13  Software.
14
15  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21  SOFTWARE.
22-->
23
24<valhall>
25  <lut name="Immediates">
26    <desc>
27      These immediates are accessible in (almost) any instruction, provided the
28      immediate mode is kept to the default. They optimize for the most common
29      immediate values; any immediate listed here may be used without taking up
30      a uniform slot or a register. Most integer instructions can access
31      separate half-words and individual bytes via swizzles on the source.
32    </desc>
33    <constant desc="Zero">0x00000000</constant>
34    <constant desc="All ones; integer $-1$">0xFFFFFFFF</constant>
35    <constant desc="Maximum integer; floating-point NaN">0x7FFFFFFF</constant>
36    <constant desc="Integers $(-2, -3, -4, -6)$">0xFAFCFDFE</constant>
37    <constant desc="16-bit integer $2^8$">0x01000000</constant>
38    <constant desc="Multiples of 16 $(0, 32, 0, 128)$">0x80002000</constant>
39    <constant desc="Multiples of 16 $(48, 80, 96, 112)$">0x70605030</constant>
40    <constant desc="Multiples of 16 $(144, 160, 176, 192)$">0xC0B0A090</constant>
41    <constant desc="Integers $(0, 1, 2, 3)$">0x03020100</constant>
42    <constant desc="Integers $(4, 5, 6, 7)$">0x07060504</constant>
43    <constant desc="Integers $(8, 9, 10, 11)$">0x0B0A0908</constant>
44    <constant desc="Integers $(12, 13, 14, 15)$">0x0F0E0D0C</constant>
45    <constant desc="Integers $(16, 17, 18, 19)$">0x13121110</constant>
46    <constant desc="Integers $(20, 21, 22, 23)$">0x17161514</constant>
47    <constant desc="Integers $(24, 25, 26, 27)$">0x1B1A1918</constant>
48    <constant desc="Integers $(28, 29, 30, 31)$">0x1F1E1D1C</constant>
49    <constant desc="Float $1.0$">0x3F800000</constant>
50    <constant desc="Float $0.1$">0x3DCCCCCD</constant>
51    <constant desc="Float $1 / \pi$">0x3EA2F983</constant>
52    <constant desc="Float $\log(2)$">0x3F317218</constant>
53    <constant desc="Float $\pi$">0x40490FDB</constant>
54    <constant desc="Float $0.0$">0x00000000</constant>
55    <constant desc="Float $65535.0 = 2^{16} - 1$">0x477FFF00</constant>
56    <constant desc="Half-float $(255.0, 256.0) = (2^8 - 1, 2^8)$">0x5C005BF8</constant>
57    <constant desc="Half-float $0.1 = 1 / 10$">0x2E660000</constant>
58    <constant desc="Half-float $0.25 = 2^{-2}$">0x34000000</constant>
59    <constant desc="Half-float $0.5 = 2^{-1}$">0x38000000</constant>
60    <constant desc="Half-float $1.0 = 2^0$">0x3C000000</constant>
61    <constant desc="Half-float $2.0 = 2^1$">0x40000000</constant>
62    <constant desc="Half-float $4.0 = 2^2$">0x44000000</constant>
63    <constant desc="Half-float $8.0 = 2^3$">0x48000000</constant>
64    <constant desc="Half-float $\pi$">0x42480000</constant>
65  </lut>
66
67  <enum name="Flow">
68    <desc>
69      Every Valhall instruction can wait on dependency
70      slots. A few special flows are available, specified in the instruction
71      metadata from this enum. The `wait0126` flow is required to wait on
72      dependency slot #6 and should be set on the instruction immediately
73      preceding `ATEST`. The `wait` flow should be set for barriers.
74      The `discard` flow only applies to fragment shaders and is used to
75      terminate helper invocations; it should be set as early as possible after
76      helper invocations are no longer needed as determined by data flow
77      analysis. The `end` flow is used to terminate the shader, although it
78      may be overloaded by the `BLEND` instruction.
79
80      The `reconverge` flow is required on any instruction immediately
81      preceding a possible change to the mask of active threads in a subgroup.
82      This includes all divergent branches, but it also includes the final
83      instruction at the end of any basic block where the immediate successor
84      (fallthrough) is the target of a divergent branch.
85    </desc>
86    <value name="None" default="true">none</value>
87    <value name="Wait on slot 0">wait0</value>
88    <value name="Wait on slot 1">wait1</value>
89    <value name="Wait on slots 0, 1">wait01</value>
90    <value name="Wait on slot 2">wait2</value>
91    <value name="Wait on slots 0, 2">wait02</value>
92    <value name="Wait on slots 1, 2">wait12</value>
93    <value name="Wait on slots 0, 1, 2">wait012</value>
94    <value name="Wait on slots 0, 1, 2, 6">wait0126</value>
95    <value name="Wait on slots 0, 1, 2, 6, 7">wait</value>
96    <value name="Perform branch reconverge">reconverge</value>
97    <reserved/>
98    <reserved/>
99    <value name="Terminate discarded threads">discard</value>
100    <reserved/>
101    <value name="Return from shader">end</value>
102  </enum>
103
104  <enum name="FAU special page 0">
105    <desc>
106      Situated between the immediates hard-coded in the hardware and the
107      uniforms defined purely in software, Valhall has some special
108      "constants" passing through data structures. These are encoded like the
109      table of immediates, as if special constant $i$ were lookup table entry
110      $32 + i$.
111    </desc>
112    <reserved/>
113    <reserved/>
114    <value desc="Warp ID and warps/core - 1">warp_id</value>
115    <reserved/>
116    <value desc="Bounding box maximum X/Y">framebuffer_size</value>
117    <value desc="ATEST datum">atest_datum</value>
118    <value desc="Sample positions">sample</value>
119    <reserved/>
120    <value desc="Blend descriptor 0">blend_descriptor_0</value>
121    <value desc="Blend descriptor 1">blend_descriptor_1</value>
122    <value desc="Blend descriptor 2">blend_descriptor_2</value>
123    <value desc="Blend descriptor 3">blend_descriptor_3</value>
124    <value desc="Blend descriptor 4">blend_descriptor_4</value>
125    <value desc="Blend descriptor 5">blend_descriptor_5</value>
126    <value desc="Blend descriptor 6">blend_descriptor_6</value>
127    <value desc="Blend descriptor 7">blend_descriptor_7</value>
128  </enum>
129
130  <enum name="FAU special page 1">
131    <desc>
132      Situated between the immediates hard-coded in the hardware and the
133      uniforms defined purely in software, Valhall has some special
134      "constants" passing through data structures. These are encoded like the
135      table of immediates, as if special constant $i$ were lookup table entry
136      $32 + i$.
137    </desc>
138    <reserved/>
139    <value desc="Thread local storage base pointer">thread_local_pointer</value>
140    <reserved/>
141    <value desc="Workgroup local storage base pointer">workgroup_local_pointer</value>
142    <reserved/>
143    <reserved/>
144    <reserved/>
145    <value desc="Shader resource table base pointer">resource_table_pointer</value>
146    <reserved/>
147    <reserved/>
148    <reserved/>
149    <reserved/>
150    <reserved/>
151    <reserved/>
152    <reserved/>
153    <reserved/>
154  </enum>
155
156  <enum name="FAU special page 3">
157    <desc>
158      Situated between the immediates hard-coded in the hardware and the
159      uniforms defined purely in software, Valhall has some special
160      "constants" passing through data structures. These are encoded like the
161      table of immediates, as if special constant $i$ were lookup table entry
162      $32 + i$.
163    </desc>
164    <reserved/>
165    <value desc="Lane ID">lane_id</value>
166    <reserved/>
167    <value desc="Core ID">core_id</value>
168    <reserved/>
169    <reserved/>
170    <reserved/>
171    <reserved/>
172    <reserved/>
173    <reserved/>
174    <reserved/>
175    <reserved/>
176    <reserved/>
177    <reserved/>
178    <reserved/>
179    <value desc="Program counter">program_counter</value>
180  </enum>
181
182  <enum name="Swizzles (8-bit)">
183    <value default="true">b0123</value>
184    <value>b3210</value>
185    <value>b0101</value>
186    <value>b2323</value>
187    <value>b0000</value>
188    <value>b1111</value>
189    <value>b2222</value>
190    <value>b3333</value>
191    <value>b2301</value>
192    <value>b1032</value>
193    <value>b0011</value>
194    <value>b2233</value>
195    <reserved/>
196    <reserved/>
197    <reserved/>
198    <reserved/>
199  </enum>
200
201  <enum name="Lanes (8-bit)">
202    <desc>Used to select the 2 bytes for shifts of 16-bit vectors</desc>
203    <reserved/>
204    <reserved/>
205    <reserved/>
206    <reserved/>
207    <value>b00</value>
208    <value>b11</value>
209    <value>b22</value>
210    <value>b33</value>
211    <reserved/>
212    <reserved/>
213    <reserved/>
214    <reserved/>
215    <reserved/>
216    <reserved/>
217    <reserved/>
218    <reserved/>
219  </enum>
220
221  <enum name="Half-swizzles (8-bit)">
222    <desc>
223      Used to select the 2 bytes to convert for conversions from 8-bit vectors
224      to 16-bit vectors
225    </desc>
226    <value>b00</value>
227    <value>b10</value>
228    <value>b20</value>
229    <value>b30</value>
230    <value>b01</value>
231    <value>b11</value>
232    <value>b21</value>
233    <value>b31</value>
234    <value>b02</value>
235    <value>b12</value>
236    <value>b22</value>
237    <value>b32</value>
238    <value>b03</value>
239    <value>b13</value>
240    <value>b23</value>
241    <value>b33</value>
242  </enum>
243
244  <enum name="Swizzles (16-bit)">
       <!-- Fourteen named swizzles followed by two reserved entries; `h01` is
            the default. NOTE(review): the trailing `a,b` comments below do not
            obviously correspond to the value on the same line (for example
            `b20` is annotated `1,1` and `b13` is annotated `0,1`); confirm what
            they denote before relying on them. -->
245    <value>h00</value> <!-- 0,2 -->
246    <value>h10</value>
247    <value default="true">h01</value>
248    <value>h11</value>
249    <value>b00</value> <!-- 0,0 -->
250    <value>b20</value> <!-- 1,1 -->
251    <value>b02</value> <!-- 2,2 -->
252    <value>b22</value> <!-- 3,3 -->
253    <value>b11</value>
254    <value>b31</value>
255    <value>b13</value> <!-- 0,1 -->
256    <value>b33</value> <!-- 2,3 -->
257    <value>b01</value>
258    <value>b23</value>
259    <reserved/>
260    <reserved/>
261  </enum>
262
263  <enum name="Swizzles (32-bit)">
264    <value default="true">none</value>
265    <reserved/>
266    <value>h0</value>
267    <value>h1</value>
268    <value>b0</value>
269    <value>b1</value>
270    <value>b2</value>
271    <value>b3</value>
272  </enum>
273
274  <enum name="Swizzles (64-bit)">
275    <value default="true">none</value>
276    <reserved/>
277    <value>h0</value>
278    <value>h1</value>
279    <value>b0</value>
280    <value>b1</value>
281    <value>b2</value>
282    <value>b3</value>
283    <value>w0</value>
284    <reserved/>
285    <reserved/>
286    <reserved/>
287    <reserved/>
288    <reserved/>
289    <reserved/>
290    <reserved/>
291  </enum>
292
293  <enum name="Lane (8-bit)" implied="true">
294    <value>b0</value>
295    <value>b1</value>
296    <value>b2</value>
297    <value>b3</value>
298  </enum>
299
300  <enum name="Combine">
301    <desc>
302      Used for the lane select of `BRANCHZ`. To use an 8-bit condition, a
303      separate `ICMP` is required to cast to 16-bit.
304    </desc>
305    <value default="true">none</value>
306    <value>h0</value>
307    <value>h1</value>
308    <value>and</value>
309    <value>lowbits</value>
310  </enum>
311
312  <enum name="Lane (16-bit)" implied="true">
313    <value>h0</value>
314    <value>h1</value>
315  </enum>
316
317  <enum name="Load lane (8-bit)">
318    <value default="true">b0</value>
319    <value>b1</value>
320    <value>b2</value>
321    <value>b3</value>
322    <value desc="Zero-extend to 16-bit, low-half">h0</value>
323    <value desc="Zero-extend to 16-bit, high-half">h1</value>
324    <value desc="Zero-extend to 32-bit">w0</value>
325    <value desc="Zero-extend to 64-bit">d0</value>
326  </enum>
327
328  <enum name="Load lane (16-bit)">
329    <value desc="Low half" default="true">h0</value>
330    <value desc="High half">h1</value>
331    <value desc="Zero-extend to 32-bit">w0</value>
332    <value desc="Zero-extend to 64-bit">d0</value>
333    <reserved/>
334    <reserved/>
335    <reserved/>
336    <reserved/>
337  </enum>
338
339  <enum name="Load lane (24-bit)" implied="true">
       <!-- The only named entry is `identity` (the default, listed first); the
            remaining six entries are reserved. NOTE(review): this table has 7
            entries, whereas the other `Load lane` tables visible in this file
            have 8, and it is the only one of them marked implied="true".
            Confirm that both differences are intentional. -->
340    <value default="true">identity</value>
341    <reserved/>
342    <reserved/>
343    <reserved/>
344    <reserved/>
345    <reserved/>
346    <reserved/>
347  </enum>
348
349  <enum name="Load lane (32-bit)">
350    <value default="true">w0</value>
351    <value desc="Zero-extend to 64-bit">d0</value>
352    <reserved/>
353    <reserved/>
354    <reserved/>
355    <reserved/>
356    <reserved/>
357    <reserved/>
358  </enum>
359
360  <enum name="Load lane (48-bit)">
361    <reserved/>
362    <reserved/>
363    <reserved/>
364    <reserved/>
365    <value default="true">identity</value>
366    <reserved/>
367    <reserved/>
368    <reserved/>
369  </enum>
370
371  <enum name="Load lane (64-bit)">
372    <reserved/>
373    <reserved/>
374    <reserved/>
375    <reserved/>
376    <reserved/>
377    <reserved/>
378    <reserved/>
379    <value default="true">identity</value>
380  </enum>
381
382  <enum name="Load lane (96-bit)">
383    <reserved/>
384    <reserved/>
385    <reserved/>
386    <reserved/>
387    <reserved/>
388    <reserved/>
389    <value default="true">identity</value>
390    <reserved/>
391  </enum>
392
393  <enum name="Load lane (128-bit)">
394    <reserved/>
395    <reserved/>
396    <reserved/>
397    <reserved/>
398    <reserved/>
399    <reserved/>
400    <reserved/>
401    <value default="true">identity</value>
402  </enum>
403
404  <enum name="Round mode">
405    <desc>Corresponds to IEEE 754 rounding modes</desc>
406    <value desc="Round to nearest even" default="true">rte</value>
407    <value desc="Round to positive infinity">rtp</value>
408    <value desc="Round to negative infinity">rtn</value>
409    <value desc="Round to zero">rtz</value>
410  </enum>
411
412  <enum name="Result type">
413    <desc>
414      Comparison instructions like `FCMP` return a boolean but may encode this
415      boolean in a variety of ways. `i1` gives an OpenGL style `0/1` boolean.
416      `m1` gives a Direct3D style `0/~0` boolean. `f1` gives a floating-point
417      `0.0f / 1.0f` boolean. Switching between these modes is useful to fold a
418      boolean type convert into a comparison. `u1` is used internally to
419      implement 64-bit comparisons.
420    </desc>
421    <value desc="Integer 1">i1</value>
422    <value desc="Float 1">f1</value>
423    <value desc="Minus 1">m1</value>
424    <value desc="Low half of 64-bit compare">u1</value>
425  </enum>
426
427  <enum name="Widen">
428    <value default="true">none</value>
429    <value>h0</value>
430    <value>h1</value>
431    <reserved/>
432    <reserved/>
433    <reserved/>
434    <reserved/>
435    <reserved/>
436  </enum>
437
438  <enum name="Clamp">
439    <desc>
440      Clamp applied to the destination of a floating-point instruction. Note the
441      clamps may be decomposed as two independent bits for `clamp_0_inf` and
442      `clamp_m1_1`, with `clamp_0_1` arising as the composition of `clamp_0_inf`
443      and `clamp_m1_1` in either order.
444
445      Clamps are implemented per the SPIR-V specification:
446
447      $$\text{clamp} \; (x, \ell, h) = \min( \max( x, \ell ), h)$$
448
449      The min/max functions return the other operand if one operand is NaN, and
450      compare $-0 &lt; +0$. That means the following identities hold for Valhall
451      clamps:
452
453      \begin{align*}
454        \text{clamp}(-0.0, 0.0, 1.0) &amp; = +0.0 \\
455        \text{clamp}(-\text{NaN}, 0.0, 1.0) &amp; = +0.0 \\
456        \text{clamp}(\text{NaN}, 0.0, 1.0) &amp; = +0.0 \\
457        &amp; \\
458        \text{clamp}(-0.0, -1.0, 1.0) &amp; = -0.0 \\
459        \text{clamp}(\text{NaN}, -1.0, 1.0) &amp; = -1.0 \\
460        \text{clamp}(-\text{NaN}, -1.0, 1.0) &amp; = -1.0 \\
461        &amp; \\
462        \max(\text{NaN}, 0.0) &amp; = +0.0 \\
463        \max(-\text{NaN}, 0.0) &amp; = +0.0 \\
464        \max(-0.0, 0.0) &amp; = +0.0 \\
465      \end{align*}
466
467      This behaviour is consistent with the FMin/FMax/FClamp and
468      NMin/NMax/NClamp rules prescribed by SPIR-V and governed by IEEE-754. As
469      a consequence, substituting these clamps for equivalent minimum/maximum
470      expressions is legal even with strict floating point rules.
471    </desc>
472    <value default="true" desc="Identity">none</value>
473    <value desc="Clamp positive">clamp_0_inf</value>
474    <value desc="Clamp to $[-1, 1]$">clamp_m1_1</value>
475    <value desc="Clamp to $[0, 1]$">clamp_0_1</value>
476  </enum>
477
478  <enum name="Condition">
479    <desc>
480      Condition code. Type must be inferred from the instruction. IEEE 754 total
481      ordering only applies to floating point compares. "Not equal" and "greater
482      than or less than" are distinguished by NaN behaviour conforming to
483      the IEEE 754 specification.
484    </desc>
485    <value desc="Equal">eq</value>
486    <value desc="Greater than">gt</value>
487    <value desc="Greater than or equal">ge</value>
488    <value desc="Not equal">ne</value>
489    <value desc="Less than">lt</value>
490    <value desc="Less than or equal">le</value>
491    <value desc="Greater than or less than">gtlt</value>
492    <value desc="Totally ordered">total</value>
493  </enum>
494
495  <enum name="Dimension">
496    <desc>Texture dimension.</desc>
497    <value desc="1D or buffer">1d</value>
498    <value desc="2D or 2D array">2d</value>
499    <value desc="3D or 3D array">3d</value>
500    <value desc="Cube map or cube map array">cube</value>
501  </enum>
502
503  <enum name="LOD mode">
504    <desc>Level-of-detail selection mode in a texture instruction.</desc>
505    <value desc="Set to zero">zero</value>
506    <value desc="Computed based on neighboring fragments">computed</value>
507    <reserved/>
508    <reserved/>
509    <value desc="Explicitly specified in a register">explicit</value>
510    <value desc="Computed based on neighboring fragments added with bias in a register">computed_bias</value>
511    <value desc="Derived from a gradient descriptor in registers">grdesc</value>
512    <reserved/>
513  </enum>
514
515  <enum name="Register format">
516    <desc>Format of data loaded to / stored from registers for general memory access.</desc>
517    <value desc="32-bit type based on descriptor format">auto</value>
518    <reserved/>
519    <value desc="32-bit floats">f32</value>
520    <value desc="16-bit floats">f16</value>
521    <value desc="32-bit signed integers">s32</value>
522    <value desc="16-bit signed integers">s16</value>
523    <value desc="32-bit unsigned integers">u32</value>
524    <value desc="16-bit unsigned integers">u16</value>
525  </enum>
526
527  <enum name="Staging register count" implied="true">
528    <value>sr0</value>
529    <value>sr1</value>
530    <value>sr2</value>
531    <value>sr3</value>
532    <value>sr4</value>
533    <value>sr5</value>
534    <value>sr6</value>
535    <value>sr7</value>
536  </enum>
537
538  <enum name="Staging register write count" implied="true">
539    <value>write1</value>
540    <value>write2</value>
541    <value>write3</value>
542    <value>write4</value>
543    <value>write5</value>
544    <value>write6</value>
545    <value>write7</value>
546    <value>write8</value>
547  </enum>
548
549  <enum name="Write mask">
550    <reserved/>
551    <value>r</value>
552    <value>g</value>
553    <value>rg</value>
554    <value>b</value>
555    <value>rb</value>
556    <value>gb</value>
557    <value>rgb</value>
558    <value>a</value>
559    <value>ra</value>
560    <value>ga</value>
561    <value>rga</value>
562    <value>ba</value>
563    <value>rba</value>
564    <value>gba</value>
565    <value default="true">rgba</value>
566  </enum>
567
568  <enum name="Fetch component">
569    <value desc="Red">gather4_r</value>
570    <value desc="Green">gather4_g</value>
571    <value desc="Blue">gather4_b</value>
572    <value desc="Alpha">gather4_a</value>
573  </enum>
574
575  <enum name="Register type">
576    <desc>Unsized type, part of a register format.</desc>
577    <reserved/>
578    <value name="Float">f</value>
579    <value name="Unsigned">u</value>
580    <value name="Signed">s</value>
581  </enum>
582
583  <enum name="Register width">
584    <desc>Untyped size, part of a register format.</desc>
585    <value>16</value>
586    <value>32</value>
587  </enum>
588
589  <enum name="Varying texture register width">
590    <desc>
591      Size of results for varying texture instructions. For dual 16-bit results
592      use "16-bit".
593    </desc>
594    <value desc="16-bit">16</value>
595    <value desc="32-bit">32</value>
596    <value desc="16-bit, 32-bit">16.32</value>
597    <value desc="32-bit, 32-bit">32.32</value>
598  </enum>
599
600  <enum name="Vector size">
601    <desc>Number of channels loaded/stored for general memory access.</desc>
602    <value default="true" desc="Scalar">none</value>
603    <value desc="2 channels">v2</value>
604    <value desc="3 channels">v3</value>
605    <value desc="4 channels">v4</value>
606  </enum>
607
608  <enum name="Slot">
609    <desc>
610      Dependency slot set on a message-passing instruction that writes to
611      registers. Before reading the destination, a future instruction must wait
612      on the specified slot. Slot #7 is for `BARRIER` instructions only.
613    </desc>
614    <value desc="Slot #0">slot0</value>
615    <value desc="Slot #1">slot1</value>
616    <value desc="Slot #2">slot2</value>
617    <reserved/>
618    <reserved/>
619    <reserved/>
620    <reserved/>
621    <value desc="Slot #7">slot7</value>
622  </enum>
623
624  <enum name="Memory access">
625    <desc>Memory access hint for a `LOAD` or `STORE` instruction.</desc>
626    <value desc="No hint (global)" default="true">none</value>
627    <value desc="Internally streaming (position output)">istream</value>
628    <value desc="Externally streaming (varying output)">estream</value>
629    <value desc="Force access in discarded threads (thread local storage)">force</value>
630  </enum>
631
632  <enum name="Subgroup size">
633    <desc>
634      Selects the effective subgroup size for subgroup operations. The hardware
635      warps are sixteen threads on Valhall, but subdividing a warp may be useful
636      for API requirements. In particular, derivatives may be calculated with
637      quads (four threads).
638    </desc>
639    <value desc="Two threads">subgroup2</value>
640    <value desc="Four threads">subgroup4</value>
641    <value desc="Eight threads">subgroup8</value>
642    <value desc="Sixteen threads" default="true">subgroup16</value>
643  </enum>
644
645  <enum name="Lane operation">
646    <desc>
647      Acts as a modifier on the lane specifier for a `CLPER` instruction. The
648      `accumulate` mode is required for efficient subgroup reductions.
649    </desc>
650    <value name="No operation" default="true">none</value>
651    <value name="Exclusive-or">xor</value>
652    <value name="Accumulate">accumulate</value>
653    <value name="Shift">shift</value>
654  </enum>
655
656  <enum name="Inactive result">
657    <desc>
658      Accesses to inactive lanes (due to divergence) in a subgroup are generally
659      undefined in APIs. However, the results of permuting with an inactive lane
660      with `CLPER.i32` are well-defined in Valhall: they return one of the
661      following values, as specified in the `CLPER.i32` instructions. Sometimes
662      certain values enable small optimizations.
663    </desc>
664    <value name="0x00000000" default="true">zero</value>
665    <value name="0xFFFFFFFF">umax</value>
666    <value name="0x00000001">i1</value>
667    <value name="0x00010001">v2i1</value>
668    <value name="0x80000000">smin</value>
669    <value name="0x7FFFFFFF">smax</value>
670    <value name="0x80008000">v2smin</value>
671    <value name="0x7FFF7FFF">v2smax</value>
672    <value name="0x80808080">v4smin</value>
673    <value name="0x7F7F7F7F">v4smax</value>
674    <value name="0x3F800000">f1</value>
675    <value name="0x3C003C00">v2f1</value>
676    <value name="0xFF800000">infn</value>
677    <value name="0x7F800000">inf</value>
678    <value name="0xFC00FC00">v2infn</value>
679    <value name="0x7C007C00">v2inf</value>
680  </enum>
681
682  <enum name="Mux">
683    <desc>
684      Condition to use for a `MUX` instruction. `neg` checks the sign bit,
685      `int_zero` compares to `0x00000000`, `fp_zero` compares to $\pm 0.0$ as
686      an IEEE 754 float, and `bit` checks each bit separately. The `bit` mode
687      acts like an imaginary `CSEL.v32u1` instruction, and implements
688      `bitselect()` in OpenCL.
689    </desc>
690    <value desc="Negative">neg</value>
691    <value desc="Integer zero" default="true">int_zero</value>
692    <value desc="Floating point zero">fp_zero</value>
693    <value desc="Bitwise">bit</value>
694  </enum>
695
696  <enum name="Sample mode">
697    <desc>
698      Varying interpolation mode, for choosing the correct sample to
699      interpolate at, allowing the `sample` and `centroid` qualifiers to be
700      implemented, as well as the `interpolateAt*` functions.
701    </desc>
702    <value desc="Center">center</value>
703    <value desc="Centroid">centroid</value>
704    <value desc="Sample">sample</value>
705    <value desc="Explicit">explicit</value>
706  </enum>
707
708  <enum name="Update mode">
709    <desc>
710      The Valhall GPU maintains hidden state when interpolating varyings, to
711      allow reusing sample location calculations. The update mode of a varying
712      load controls this hidden state.
713    </desc>
714    <value desc="Store interpolation position">store</value>
715    <value desc="Retrieve interpolation position">retrieve</value>
716    <reserved/>
717    <value desc="Clobber saved position">clobber</value>
718  </enum>
719
720  <enum name="Sample and update mode">
721    <desc>
722      For fused varying/texture instructions, only the following specific
723      combinations of sample and update modes are permitted.
724    </desc>
725    <value desc="Center, store">center_store</value>
726    <value desc="Centroid, store">centroid_store</value>
727    <value desc="Sample, store">sample_store</value>
728    <value desc="Explicit, store">explicit_store</value>
729    <value desc="Center, clobber">center_clobber</value>
730    <reserved/>
731    <value desc="Sample, clobber">sample_clobber</value>
732    <value desc="Retrieve previous state">retrieve</value>
733  </enum>
734
735  <enum name="Source format">
736    <desc>
737      In-memory format of varyings.
738
739      Note: src_flat32 is only valid with 32-bit varying instructions and
740      src_flat16 is only valid with 16-bit varying instructions.
741    </desc>
742    <value desc="Uninterpreted 32-bit values">src_flat32</value>
743    <value desc="Uninterpreted 16-bit values">src_flat16</value>
744    <value desc="Interpolated 32-bit floats">src_f32</value>
745    <value desc="Interpolated 16-bit floats">src_f16</value>
746  </enum>
747
748  <enum name="Atomic operation">
749    <desc>
750      Operation performed in a general computational atomic instruction.
751    </desc>
752    <reserved/>
753    <reserved/>
754    <value desc="Add">aadd</value>
755    <reserved/>
756    <reserved/>
757    <reserved/>
758    <reserved/>
759    <reserved/>
760    <value desc="Signed minimum">asmin</value>
761    <value desc="Signed maximum">asmax</value>
762    <value desc="Unsigned minimum">aumin</value>
763    <value desc="Unsigned maximum">aumax</value>
764    <value desc="Bitwise and">aand</value>
765    <value desc="Bitwise or">aor</value>
766    <value desc="Bitwise exclusive-or">axor</value>
767    <value desc="Exchange (must return the value)">axchg</value>
768  </enum>
769
770  <enum name="Atomic operation with 1">
771    <desc>
772      Operation performed in a computational atomic-with-1 instruction.
773    </desc>
774    <value desc="Increment">ainc</value>
775    <value desc="Decrement">adec</value>
776    <value desc="Unsigned maximum with 1">aumax1</value>
777    <value desc="Signed maximum with 1">asmax1</value>
778    <value desc="Set bottom bit">aor1</value>
779  </enum>
780
781  <enum name="NaN mode">
782    <desc>
783      Flush specific NaN values in FLUSH.f32 and FLUSH.v2f16. flush_nan flushes
784      all NaN values to zero. quiet_nan flushes signaling NaNs to quiet NaNs.
785    </desc>
786    <value desc="None">none</value>
787    <value desc="Flush NaN">flush_nan</value>
788    <value desc="Quiet NaN">quiet_nan</value>
789  </enum>
790
791  <!-- note that the `unused="true"` annotation here just means that this
792       particular entry is unused by the compiler. This may be because the
793       instruction isn't generated yet, but it may also be because there
794       is a duplicate instruction in the Bifrost or pseudo XML files
795  -->
796  <ins name="NOP" title="No operation" dests="0" opcode="0x00" unused="true" unit="CVT">
797    <desc>
798      Do nothing. Useful at the start of a block for waiting on slots required
799      by the first actual instruction of the block, to reconcile dependencies
800      after a branch. Also useful as the sole instruction of an empty shader.
801    </desc>
802  </ins>
803
804  <ins name="BRANCHZ" title="Compare to zero and branch" dests="0" opcode="0x1F" unused="true" unit="CVT">
805    <desc>
806      Branches to a specified relative offset if its source is nonzero (default)
807      or if its source is zero (if `.eq` is set). The offset is 27-bits and
808      sign-extended, giving an effective range of ±26-bits. The offset is
809      specified in units of instructions, relative to the *next* instruction.
810      Positive offsets may be interpreted as "number of instructions to skip".
811      Since Valhall instructions are 8 bytes, this operates as:
812
813      $$PC := \begin{cases} PC + 8 \cdot (\text{offset} \; + 1) &amp; \text{if} \;
814      \text{src} \stackrel{?}{=} 0 \\ PC + 8 &amp; \text{otherwise} \end{cases}$$
815
816      Used with comparison instructions to implement control flow. Tie the
817      source to a nonzero constant to implement a jump. May introduce
818      divergence, so generally requires `.reconverge` flow control.
819    </desc>
820    <src combine="true">Value to compare against zero</src>
821    <imm name="offset" start="8" size="27" signed="true"/>
822    <conservative/>
823    <va_mod name="eq" start="36" size="1"/>
824  </ins>
825
826  <ins name="DISCARD.f32" title="Discard fragment" dests="0" opcode="0x20" unused="true" unit="CVT"> <!-- Conditional fragment kill; writes no destination. -->
827    <desc>
828      Evaluates the given condition, and if it passes, discards the current
829      fragment and terminates the thread. Only valid in a **fragment** shader.
830    </desc>
831    <cmp/> <!-- comparison applied to the two sources below -->
832    <src absneg="true" swizzle="true">Left value to compare</src>
833    <src absneg="true" swizzle="true">Right value to compare</src>
834  </ins>
835
836  <ins name="BRANCHZI" title="Compare to zero and branch indirect" opcode="0x2F" dests="0" last="true" unit="CVT"> <!-- Indirect form of BRANCHZ: the target comes from a source instead of an immediate. -->
837    <desc>
838      Jump to an indirectly specified (absolute or relative) address. Used to
839      jump to blend shaders at the end of a fragment shader.
840    </desc>
841    <src combine="true">Value to compare against zero</src>
842    <src>Branch target</src> <!-- presumably read as absolute or relative according to the absolute modifier below; confirm -->
843    <conservative/>
844    <va_mod name="eq" start="36" size="1"/> <!-- bit 36, same position as on BRANCHZ -->
845    <va_mod name="absolute" start="40" size="1"/> <!-- bit 40 -->
846  </ins>
847
848  <ins name="BARRIER" title="Execution and memory barrier" opcode="0x45" unused="true" unit="NONE"> <!-- No sources or staging registers; only a slot is selected. -->
849    <desc>
850      General-purpose barrier. Must use slot #7. Must be paired with a
851      `.wait` flow on the instruction.
852    </desc>
853    <slot/> <!-- the desc above requires this to be slot #7 -->
854  </ins>
855
856  <group name="CSEL" title="Floating-point conditional select" dests="1" unused="true" unit="CVT"> <!-- f32 and v2f16 variants share the four-source layout below. -->
857    <ins name="CSEL.f32" opcode="0x154"/>
858    <ins name="CSEL.v2f16" opcode="0x155"/>
859    <desc>
860      Evaluates the given condition and outputs either the true source or the
861      false source.
862    </desc>
863    <cmp/> <!-- condition applied to the first two sources -->
864    <src float="true">Left value to compare</src>
865    <src float="true">Right value to compare</src>
866    <src float="true">Return value if true</src>
867    <src float="true">Return value if false</src>
868  </group>
869
870  <group name="CSEL" title="Integer conditional select" dests="1" unused="true" unit="CVT"> <!-- Unsigned (0x150/0x151) and signed (0x158/0x159) variants; same operand layout as the float group. -->
871    <ins name="CSEL.u32" opcode="0x150"/>
872    <ins name="CSEL.v2u16" opcode="0x151"/>
873    <ins name="CSEL.s32" opcode="0x158"/>
874    <ins name="CSEL.v2s16" opcode="0x159"/>
875    <desc>
876      Evaluates the given condition and outputs either the true source or the
877      false source.
878
879      Valhall lacks integer minimum/maximum instructions. `CSEL` instructions
880      with tied operands form the canonical implementations of these
881      instructions. Similarly, the integer $\text{sign}$ function is canonically
882      implemented with a pair of `CSEL` instructions.
883    </desc>
884    <cmp/> <!-- condition applied to the first two sources -->
885    <src>Left value to compare</src>
886    <src>Right value to compare</src>
887    <src>Return value if true</src>
888    <src>Return value if false</src>
889  </group>
890
891  <ins name="LD_VAR_SPECIAL" title="Load special varying" opcode="0x56" unused="true" unit="V"> <!-- Selects one of the special varyings through the 4-bit index immediate. -->
892    <sr write="true"/> <!-- result goes to staging registers -->
893    <sr_count/>
894    <vecsize/>
895    <regfmt/>
896    <sample/>
897    <update/>
898    <slot/>
899    <src/> <!-- NOTE(review): source is undocumented in this file -->
900    <imm name="index" start="12" size="4"/> <!-- 0 for pointx, 1 for pointy, 2 for fragw, 3 for fragz -->
901  </ins>
902
903  <group name="LD_VAR_BUF_IMM" title="Load immediate varying" message="varying" unit="V"> <!-- Hardware-buffer varying load with the index given as an immediate. -->
904    <desc>Interpolates a given varying from hardware buffer</desc>
905    <ins name="LD_VAR_BUF_IMM.f32" opcode="0x5C"/>
906    <ins name="LD_VAR_BUF_IMM.f16" opcode="0x5D"/>
907    <slot/>
908    <vecsize/>
909    <source_format/>
910    <regfmt pseudo="true"/>
911    <sample/>
912    <update/>
913    <sr write="true"/>
914    <sr_count count="format"/>
915    <src/> <!-- NOTE(review): source is undocumented in this file -->
916    <imm name="index" start="16" size="8"/> <!-- 8-bit immediate at bits 16..23 -->
917  </group>
918
919  <group name="LD_VAR_BUF" title="Load indirect varying" message="varying" unit="V"> <!-- Same modifiers as LD_VAR_BUF_IMM, but with a second source in place of the index immediate. -->
920    <desc>Interpolates a given varying from hardware buffer</desc>
921    <ins name="LD_VAR_BUF.f32" opcode="0x6C"/>
922    <ins name="LD_VAR_BUF.f16" opcode="0x6D"/>
923    <slot/>
924    <vecsize/>
925    <source_format/>
926    <regfmt pseudo="true"/>
927    <sample/>
928    <update/>
929    <sr write="true"/>
930    <sr_count count="format"/>
931    <src/> <!-- NOTE(review): both sources are undocumented here -->
932    <src/> <!-- presumably the indirect varying index; confirm -->
933  </group>
934
935  <ins name="LD_VAR" title="Load indirect varying" unused="true" unit="V" opcode="0x64"> <!-- Software-buffer varying load; index and table come from the second source. -->
936    <desc>Interpolates a given varying from a software buffer</desc>
937    <slot/>
938    <vecsize/>
939    <regfmt/>
940    <sample/>
941    <update/>
942    <sr write="true"/>
943    <sr_count/>
944    <src/> <!-- NOTE(review): first source is undocumented in this file -->
945    <src>Varying index and table</src>
946  </ins>
947
948  <ins name="LD_VAR_IMM" title="Load immediate varying" unused="true" unit="V" opcode="0x54"> <!-- Immediate form of LD_VAR: table and index are encoded in the instruction. -->
949    <desc>Interpolates a given varying from a software buffer</desc>
950    <slot/>
951    <vecsize/>
952    <regfmt/>
953    <sample/>
954    <update/>
955    <sr write="true"/>
956    <sr_count/>
957    <src/> <!-- NOTE(review): source is undocumented in this file -->
958    <imm name="table" start="8" size="4"/> <!-- bits 8..11 -->
959    <imm name="index" start="12" size="8"/> <!-- bits 12..19 -->
960  </ins>
961
962  <ins name="LD_VAR_FLAT" title="Load indirect flat varying" unused="true" unit="V" opcode="0x55"> <!-- Flat counterpart of LD_VAR: the desc says "fetches" rather than "interpolates", and there are no sample/update modifiers. -->
963    <desc>Fetches a given varying from a software buffer</desc>
964    <slot/>
965    <vecsize/>
966    <regfmt/>
967    <sr write="true"/> <!-- result goes to staging registers -->
968    <sr_count/>
969    <src>Varying index and table</src>
970  </ins>
971
972  <ins name="LD_VAR_FLAT_IMM" title="Load immediate flat varying" unused="true" unit="V" opcode="0x41"> <!-- Immediate form of the flat varying load: no sources, table and index are encoded in the instruction. -->
973    <desc>Fetches a given varying from a software buffer</desc>
974    <slot/>
975    <vecsize/>
976    <regfmt/>
977    <sr write="true"/> <!-- result goes to staging registers -->
978    <sr_count/>
979    <imm name="table" start="8" size="4"/> <!-- bits 8..11 -->
980    <imm name="index" start="12" size="8"/> <!-- bits 12..19 -->
981  </ins>
982
983  <ins name="LD_ATTR_IMM" title="Load immediate attribute" opcode="0x66" opcode2="0" unused="true" unit="LS"> <!-- Shares opcode 0x66 with LD_TEX_IMM; opcode2 tells them apart. -->
984    <desc>
985      Load `vecsize` components from the attribute descriptor at entry `index`
986      of resource table `table` at index (vertex ID, instance ID), converting
987      to the specified register format.
988    </desc>
989    <sr_count/>
990    <vecsize/>
991    <regfmt/>
992    <slot/>
993    <va_mod name="descriptor_type" start="128" size="1" implied="true"/> <!-- implied modifier; start="128" lies beyond the 8-byte instruction word, so presumably never encoded (confirm) -->
994    <sr write="true"/>
995    <src>Vertex ID</src>
996    <src>Instance ID</src>
997    <imm name="index" start="20" size="4"/> <!-- bits 20..23 -->
998    <imm name="table" start="16" size="4"/> <!-- bits 16..19 -->
999  </ins>
1000
1001  <ins name="LD_ATTR" title="Load indirect attribute" opcode="0x76" opcode2="0" unused="true" unit="LS"> <!-- Shares opcode 0x76 with LD_TEX; opcode2 tells them apart. -->
1002    <desc>
1003      Load `vecsize` components from the attribute descriptor at the specified
1004      location at index (vertex ID, instance ID), converting
1005      to the specified register format.
1006
1007      The index must not diverge within a warp.
1008    </desc>
1009    <sr_count/>
1010    <vecsize/>
1011    <regfmt/>
1012    <slot/>
1013    <va_mod name="descriptor_type" start="128" size="1" implied="true"/> <!-- implied modifier; start="128" lies beyond the 8-byte instruction word, so presumably never encoded (confirm) -->
1014    <sr write="true"/>
1015    <src>Vertex ID</src>
1016    <src>Instance ID</src>
1017    <src>Index and table</src> <!-- replaces the index/table immediates of LD_ATTR_IMM -->
1018  </ins>
1019
1020  <ins name="LD_TEX_IMM" title="Load immediate texture" opcode="0x66" opcode2="1" message="attribute" unit="LS"> <!-- Shares opcode 0x66 with LD_ATTR_IMM; opcode2="1" selects the texture form. -->
1021    <desc>
1022      Load `vecsize` components from the texture descriptor at entry `index`
1023      of resource table `table`, converting
1024      to the specified register format.
1025    </desc>
1026    <sr_count count="format"/>
1027    <vecsize/>
1028    <regfmt/>
1029    <slot/>
1030    <va_mod name="descriptor_type" start="128" size="1" implied="true"/> <!-- implied modifier; start="128" lies beyond the 8-byte instruction word, so presumably never encoded (confirm) -->
1031    <sr write="true"/>
1032    <src>X/Y coordinates (16:16)</src>
1033    <src>Z/W coordinates (16:16)</src>
1034    <imm name="index" ir_name="texture_index" start="20" size="4"/> <!-- bits 20..23 -->
1035    <imm name="table" ir_name="" start="16" size="4"/> <!-- bits 16..19; ir_name is deliberately left empty here -->
1036  </ins>
1037
1038  <ins name="LD_TEX" title="Load indirect texture" message="attribute" opcode="0x76" opcode2="1" unit="LS"> <!-- Shares opcode 0x76 with LD_ATTR; opcode2="1" selects the texture form. -->
1039    <desc>
1040      Load `vecsize` components from the texture descriptor at the specified
1041      location at the given coordinates, converting
1042      to the specified register format.
1043    </desc>
1044    <sr_count count="format"/>
1045    <vecsize/>
1046    <regfmt/>
1047    <slot/>
1048    <va_mod name="descriptor_type" start="128" size="1" implied="true"/> <!-- implied modifier; start="128" lies beyond the 8-byte instruction word, so presumably never encoded (confirm) -->
1049    <sr write="true"/>
1050    <src>X/Y coordinates (16:16)</src>
1051    <src>Z/W coordinates (16:16)</src>
1052    <src>Index and table</src> <!-- replaces the index/table immediates of LD_TEX_IMM -->
1053  </ins>
1054
1055  <ins name="LEA_ATTR_IMM" title="Load effective address of attribute" opcode="0x67" opcode2="0" unused="true" unit="LS"> <!-- Shares opcode 0x67 with LEA_TEX_IMM; opcode2="0" selects the attribute form. -->
1056    <desc>
1057      Load the effective address of an attribute specified with the
1058      given immediate index. Returns three staging registers: the low/high
1059      32-bits of the address and the internal conversion descriptor.
1060    </desc>
1061    <slot/>
1062    <sr_count/>
1063    <va_mod name="descriptor_type" start="128" size="1" implied="true"/> <!-- implied modifier; start="128" lies beyond the 8-byte instruction word, so presumably never encoded (confirm) -->
1064    <sr write="true"/>
1065    <src>Vertex index</src>
1066    <src>Instance index</src>
1067    <imm name="table" start="16" size="4"/> <!-- bits 16..19 -->
1068    <imm name="index" start="20" size="4"/> <!-- bits 20..23 -->
1069  </ins>
1070
1071  <ins name="LEA_ATTR" title="Load effective address of attribute" opcode="0x77" opcode2="0" unused="true" unit="LS"> <!-- Shares opcode 0x77 with LEA_TEX; opcode2="0" selects the attribute form. -->
1072    <desc>
1073      Load the effective address of an attribute specified with the
1074      given index. Returns three staging registers: the low/high
1075      32-bits of the address and the internal conversion descriptor.
1076    </desc>
1077    <vecsize/>
1078    <slot/>
1079    <sr_count/>
1080    <va_mod name="descriptor_type" start="128" size="1" implied="true"/> <!-- implied modifier; start="128" lies beyond the 8-byte instruction word, so presumably never encoded (confirm) -->
1081    <sr write="true"/>
1082    <src>Vertex index</src>
1083    <src>Instance index</src>
1084    <src>Attribute index and table</src> <!-- replaces the table/index immediates of LEA_ATTR_IMM -->
1085  </ins>
1086
1087  <ins name="LEA_TEX_IMM" title="Load effective address of image texel" opcode="0x67" opcode2="1" unused="true" unit="LS"> <!-- Shares opcode 0x67 with LEA_ATTR_IMM; opcode2="1" selects the image form. -->
1088    <desc>
1089      Load the effective address of a texel from the image specified with the
1090      given immediate index. Returns three staging registers: the low/high
1091      32-bits of the address and the internal conversion descriptor. The format
1092      of the internal conversion descriptor is compatible with Bifrost but
1093      omits the register format, as this is specified with the ST_CVT
1094      instruction on Valhall.
1095
1096      Coordinates are specified as 16-bit integers, packed into 32-bit sources.
1097    </desc>
1098    <slot/>
1099    <sr_count/>
1100    <va_mod name="descriptor_type" start="128" size="1" implied="true"/> <!-- implied modifier; start="128" lies beyond the 8-byte instruction word, so presumably never encoded (confirm) -->
1101    <sr write="true"/>
1102    <src>X/Y coordinates (16:16)</src>
1103    <src>Z/W coordinates (16:16)</src>
1104    <imm name="table" start="16" size="4"/> <!-- bits 16..19 -->
1105    <imm name="index" start="20" size="4"/> <!-- bits 20..23 -->
1106  </ins>
1107
1108  <ins name="LEA_TEX" title="Load effective address of image texel" opcode="0x77" opcode2="1" unused="true" unit="LS"> <!-- Shares opcode 0x77 with LEA_ATTR; opcode2="1" selects the image form. -->
1109    <desc>
1110      Load the effective address of a texel from the image specified with the
1111      given index. Returns three staging registers: the low/high
1112      32-bits of the address and the internal conversion descriptor. The format
1113      of the internal conversion descriptor is compatible with Bifrost but
1114      omits the register format, as this is specified with the ST_CVT
1115      instruction on Valhall.
1116
1117      Coordinates are specified as 16-bit integers, packed into 32-bit sources.
1118    </desc>
1119    <vecsize/>
1120    <slot/>
1121    <sr_count/>
1122    <va_mod name="descriptor_type" start="128" size="1" implied="true"/> <!-- implied modifier; start="128" lies beyond the 8-byte instruction word, so presumably never encoded (confirm) -->
1123    <sr write="true"/>
1124    <src>X/Y coordinates (16:16)</src> <!-- a full 32-bit source holding two packed 16-bit coordinates, like the Z/W source -->
1125    <src>Z/W coordinates (16:16)</src>
1126    <src>Index and table</src>
1127  </ins>
1128
1129  <ins name="LD_BUFFER.i8" title="Global memory load" message="load" opcode="0x6a" opcode2="0" unit="LS"> <!-- 8-bit variant of LD_BUFFER (opcode2 0); the result fits one staging register. -->
1130    <desc>
1131      Loads a buffer descriptor. If bits 25...31 of the mode descriptor are
1132      all-ones, load from the buffer descriptors in the table indexed by the
1133      bottom byte of the mode descriptor. If they are all zeroes, load the
1134      contents of the buffer in the first table indexed by the bottom byte of
1135      the mode descriptor.
1136    </desc>
1137    <sr write="true"/>
1138    <sr_count count="1"/>
1139    <va_mod name="load_lane_8_bit" start="36" size="3"/> <!-- bits 36..38 -->
1140    <va_mod name="unsigned" start="39" size="1"/> <!-- bit 39 -->
1141    <slot/>
1142    <src size="32">Byte offset</src> <!-- 32-bit offset into the buffer, not a 64-bit address as on LOAD -->
1143    <src size="32">Mode descriptor</src>
1144  </ins>
1145
1146  <ins name="LD_BUFFER.i16" title="Global memory load" message="load" opcode="0x6a" opcode2="1" unit="LS"> <!-- 16-bit variant of LD_BUFFER (opcode2 1); the result fits one staging register. -->
1147    <desc>
1148      Loads a buffer descriptor. If bits 25...31 of the mode descriptor are
1149      all-ones, load from the buffer descriptors in the table indexed by the
1150      bottom byte of the mode descriptor. If they are all zeroes, load the
1151      contents of the buffer in the first table indexed by the bottom byte of
1152      the mode descriptor.
1153    </desc>
1154    <sr write="true"/>
1155    <sr_count count="1"/>
1156    <va_mod name="load_lane_16_bit" start="36" size="3"/> <!-- bits 36..38 -->
1157    <va_mod name="unsigned" start="39" size="1"/> <!-- bit 39 -->
1158    <slot/>
1159    <src size="32">Byte offset</src>
1160    <src size="32">Mode descriptor</src>
1161  </ins>
1162
1163  <ins name="LD_BUFFER.i24" title="Global memory load" message="load" opcode="0x6a" opcode2="2" unit="LS"> <!-- 24-bit variant of LD_BUFFER (opcode2 2); the result fits one staging register. -->
1164    <desc>
1165      Loads a buffer descriptor. If bits 25...31 of the mode descriptor are
1166      all-ones, load from the buffer descriptors in the table indexed by the
1167      bottom byte of the mode descriptor. If they are all zeroes, load the
1168      contents of the buffer in the first table indexed by the bottom byte of
1169      the mode descriptor.
1170    </desc>
1171    <sr write="true"/>
1172    <sr_count count="1"/>
1173    <va_mod name="load_lane_24_bit" start="36" size="3"/> <!-- bits 36..38 -->
1174    <va_mod name="unsigned" start="39" size="1"/> <!-- bit 39 -->
1175    <slot/>
1176    <src size="32">Byte offset</src>
1177    <src size="32">Mode descriptor</src>
1178  </ins>
1179
1180  <ins name="LD_BUFFER.i32" title="Global memory load" message="load" opcode="0x6a" opcode2="3" unit="LS"> <!-- 32-bit variant of LD_BUFFER (opcode2 3); the result fills one staging register. -->
1181    <desc>
1182      Loads a buffer descriptor. If bits 25...31 of the mode descriptor are
1183      all-ones, load from the buffer descriptors in the table indexed by the
1184      bottom byte of the mode descriptor. If they are all zeroes, load the
1185      contents of the buffer in the first table indexed by the bottom byte of
1186      the mode descriptor.
1187    </desc>
1188    <sr write="true"/>
1189    <sr_count count="1"/>
1190    <va_mod name="load_lane_32_bit" start="36" size="3"/> <!-- bits 36..38 -->
1191    <va_mod name="unsigned" start="39" size="1"/> <!-- bit 39 -->
1192    <slot/>
1193    <src size="32">Byte offset</src>
1194    <src size="32">Mode descriptor</src>
1195  </ins>
1196
1197  <ins name="LD_BUFFER.i48" title="Global memory load" message="load" opcode="0x6a" opcode2="4" unit="LS"> <!-- 48-bit variant of LD_BUFFER (opcode2 4); the result spans two staging registers. -->
1198    <desc>
1199      Loads a buffer descriptor. If bits 25...31 of the mode descriptor are
1200      all-ones, load from the buffer descriptors in the table indexed by the
1201      bottom byte of the mode descriptor. If they are all zeroes, load the
1202      contents of the buffer in the first table indexed by the bottom byte of
1203      the mode descriptor.
1204    </desc>
1205    <sr write="true"/>
1206    <sr_count count="2"/>
1207    <va_mod name="load_lane_48_bit" start="36" size="3"/> <!-- bits 36..38 -->
1208    <va_mod name="unsigned" start="39" size="1"/> <!-- bit 39 -->
1209    <slot/>
1210    <src size="32">Byte offset</src>
1211    <src size="32">Mode descriptor</src>
1212  </ins>
1213
1214  <ins name="LD_BUFFER.i64" title="Global memory load" message="load" opcode="0x6a" opcode2="5" unit="LS"> <!-- 64-bit variant of LD_BUFFER (opcode2 5); the result spans two staging registers. -->
1215    <desc>
1216      Loads a buffer descriptor. If bits 25...31 of the mode descriptor are
1217      all-ones, load from the buffer descriptors in the table indexed by the
1218      bottom byte of the mode descriptor. If they are all zeroes, load the
1219      contents of the buffer in the first table indexed by the bottom byte of
1220      the mode descriptor.
1221    </desc>
1222    <sr write="true"/>
1223    <sr_count count="2"/>
1224    <va_mod name="load_lane_64_bit" start="36" size="3"/> <!-- bits 36..38 -->
1225    <va_mod name="unsigned" start="39" size="1"/> <!-- bit 39 -->
1226    <slot/>
1227    <src size="32">Byte offset</src>
1228    <src size="32">Mode descriptor</src>
1229  </ins>
1230
1231  <ins name="LD_BUFFER.i96" title="Global memory load" message="load" opcode="0x6a" opcode2="6" unit="LS"> <!-- 96-bit variant of LD_BUFFER (opcode2 6); the result spans three staging registers. -->
1232    <desc>
1233      Loads a buffer descriptor. If bits 25...31 of the mode descriptor are
1234      all-ones, load from the buffer descriptors in the table indexed by the
1235      bottom byte of the mode descriptor. If they are all zeroes, load the
1236      contents of the buffer in the first table indexed by the bottom byte of
1237      the mode descriptor.
1238    </desc>
1239    <sr write="true"/>
1240    <sr_count count="3"/>
1241    <va_mod name="load_lane_96_bit" start="36" size="3"/> <!-- bits 36..38 -->
1242    <va_mod name="unsigned" start="39" size="1"/> <!-- bit 39 -->
1243    <slot/>
1244    <src size="32">Byte offset</src>
1245    <src size="32">Mode descriptor</src>
1246  </ins>
1247
1248  <ins name="LD_BUFFER.i128" title="Global memory load" message="load" opcode="0x6a" opcode2="7" unit="LS"> <!-- 128-bit variant of LD_BUFFER (opcode2 7); the result spans four staging registers. -->
1249    <desc>
1250      Loads a buffer descriptor. If bits 25...31 of the mode descriptor are
1251      all-ones, load from the buffer descriptors in the table indexed by the
1252      bottom byte of the mode descriptor. If they are all zeroes, load the
1253      contents of the buffer in the first table indexed by the bottom byte of
1254      the mode descriptor.
1255    </desc>
1256    <sr write="true"/>
1257    <sr_count count="4"/>
1258    <va_mod name="load_lane_128_bit" start="36" size="3"/> <!-- bits 36..38 -->
1259    <va_mod name="unsigned" start="39" size="1"/> <!-- bit 39 -->
1260    <slot/>
1261    <src size="32">Byte offset</src>
1262    <src size="32">Mode descriptor</src>
1263  </ins>
1264
1265  <ins name="LEA_BUF_IMM" title="Load buffer effective address" message="attribute" opcode="0x5E" unit="LS"> <!-- Writes two staging registers (count="2"). -->
1266    <desc>
1267      Load effective address of a buffer with an immediate offset added.
1268    </desc>
1269    <sr write="true"/>
1270    <sr_count count="2"/>
1271    <slot/>
1272    <imm name="table" ir_name="" start="8" size="4"/> <!-- bits 8..11 -->
1273    <imm name="index" ir_name="" start="12" size="8"/> <!-- bits 12..19 -->
1274    <src>Linear ID</src>
1275  </ins>
1276
1277  <ins name="LOAD.i8" title="Global memory load" opcode="0x60" opcode2="0" unused="true" unit="LS"> <!-- 8-bit variant of LOAD (opcode2 0). -->
1278    <desc>Loads from main memory</desc>
1279    <sr write="true"/>
1280    <memory_access/>
1281    <sr_count/>
1282    <va_mod name="load_lane_8_bit" start="36" size="3"/> <!-- bits 36..38 -->
1283    <va_mod name="unsigned" start="39" size="1"/> <!-- bit 39 -->
1284    <slot/>
1285    <src size="64">Address to load from after adding offset</src>
1286    <imm name="offset" start="8" size="16" signed="true"/> <!-- signed 16-bit immediate at bits 8..23, added to the 64-bit address -->
1287  </ins>
1288
1289  <ins name="LOAD.i16" title="Global memory load" opcode="0x60" opcode2="1" unused="true" unit="LS"> <!-- 16-bit variant of LOAD (opcode2 1). -->
1290    <desc>Loads from main memory</desc>
1291    <sr write="true"/>
1292    <memory_access/>
1293    <sr_count/>
1294    <va_mod name="load_lane_16_bit" start="36" size="3"/> <!-- bits 36..38 -->
1295    <va_mod name="unsigned" start="39" size="1"/> <!-- bit 39 -->
1296    <slot/>
1297    <src size="64">Address to load from after adding offset</src>
1298    <imm name="offset" start="8" size="16" signed="true"/> <!-- signed 16-bit immediate at bits 8..23, added to the 64-bit address -->
1299  </ins>
1300
1301  <ins name="LOAD.i24" title="Global memory load" opcode="0x60" opcode2="2" unused="true" unit="LS"> <!-- 24-bit variant of LOAD (opcode2 2). -->
1302    <desc>Loads from main memory</desc>
1303    <sr write="true"/>
1304    <memory_access/>
1305    <sr_count/>
1306    <va_mod name="load_lane_24_bit" start="36" size="3"/> <!-- bits 36..38 -->
1307    <va_mod name="unsigned" start="39" size="1"/> <!-- bit 39 -->
1308    <slot/>
1309    <src size="64">Address to load from after adding offset</src>
1310    <imm name="offset" start="8" size="16" signed="true"/> <!-- signed 16-bit immediate at bits 8..23, added to the 64-bit address -->
1311  </ins>
1312
1313  <ins name="LOAD.i32" title="Global memory load" opcode="0x60" opcode2="3" unused="true" unit="LS"> <!-- 32-bit variant of LOAD (opcode2 3). -->
1314    <desc>Loads from main memory</desc>
1315    <sr write="true"/>
1316    <memory_access/>
1317    <sr_count/>
1318    <va_mod name="load_lane_32_bit" start="36" size="3"/> <!-- bits 36..38 -->
1319    <va_mod name="unsigned" start="39" size="1"/> <!-- bit 39 -->
1320    <slot/>
1321    <src size="64">Address to load from after adding offset</src>
1322    <imm name="offset" start="8" size="16" signed="true"/> <!-- signed 16-bit immediate at bits 8..23, added to the 64-bit address -->
1323  </ins>
1324
1325  <ins name="LOAD.i48" title="Global memory load" opcode="0x60" opcode2="4" unused="true" unit="LS"> <!-- 48-bit variant of LOAD (opcode2 4). -->
1326    <desc>Loads from main memory</desc>
1327    <sr write="true"/>
1328    <memory_access/>
1329    <sr_count/>
1330    <va_mod name="load_lane_48_bit" start="36" size="3"/> <!-- bits 36..38 -->
1331    <va_mod name="unsigned" start="39" size="1"/> <!-- bit 39 -->
1332    <slot/>
1333    <src size="64">Address to load from after adding offset</src>
1334    <imm name="offset" start="8" size="16" signed="true"/> <!-- signed 16-bit immediate at bits 8..23, added to the 64-bit address -->
1335  </ins>
1336
1337  <ins name="LOAD.i64" title="Global memory load" opcode="0x60" opcode2="5" unused="true" unit="LS"> <!-- 64-bit variant of LOAD (opcode2 5). -->
1338    <desc>Loads from main memory</desc>
1339    <sr write="true"/>
1340    <memory_access/>
1341    <sr_count/>
1342    <va_mod name="load_lane_64_bit" start="36" size="3"/> <!-- bits 36..38 -->
1343    <va_mod name="unsigned" start="39" size="1"/> <!-- bit 39 -->
1344    <slot/>
1345    <src size="64">Address to load from after adding offset</src>
1346    <imm name="offset" start="8" size="16" signed="true"/> <!-- signed 16-bit immediate at bits 8..23, added to the 64-bit address -->
1347  </ins>
1348
1349  <ins name="LOAD.i96" title="Global memory load" opcode="0x60" opcode2="6" unused="true" unit="LS"> <!-- 96-bit variant of LOAD (opcode2 6). -->
1350    <desc>Loads from main memory</desc>
1351    <sr write="true"/>
1352    <memory_access/>
1353    <sr_count/>
1354    <va_mod name="load_lane_96_bit" start="36" size="3"/> <!-- bits 36..38 -->
1355    <va_mod name="unsigned" start="39" size="1"/> <!-- bit 39 -->
1356    <slot/>
1357    <src size="64">Address to load from after adding offset</src>
1358    <imm name="offset" start="8" size="16" signed="true"/> <!-- signed 16-bit immediate at bits 8..23, added to the 64-bit address -->
1359  </ins>
1360
1361  <ins name="LOAD.i128" title="Global memory load" opcode="0x60" opcode2="7" unused="true" unit="LS"> <!-- 128-bit variant of LOAD (opcode2 7). -->
1362    <desc>Loads from main memory</desc>
1363    <sr write="true"/>
1364    <memory_access/>
1365    <sr_count/>
1366    <va_mod name="load_lane_128_bit" start="36" size="3"/> <!-- bits 36..38 -->
1367    <va_mod name="unsigned" start="39" size="1"/> <!-- bit 39 -->
1368    <slot/>
1369    <src size="64">Address to load from after adding offset</src>
1370    <imm name="offset" start="8" size="16" signed="true"/> <!-- signed 16-bit immediate at bits 8..23, added to the 64-bit address -->
1371  </ins>
1372
1373  <group name="STORE" title="Global memory store" opcode="0x61" unused="true" unit="LS"> <!-- One opcode for every width; opcode2 0x0..0x7 follows the same i8..i128 order as LOAD. -->
1374    <desc>Stores to main memory</desc>
1375    <sr read="true"/> <!-- data to store is read from staging registers -->
1376    <ins name="STORE.i8" opcode2="0x0"/>
1377    <ins name="STORE.i16" opcode2="0x1"/>
1378    <ins name="STORE.i24" opcode2="0x2"/>
1379    <ins name="STORE.i32" opcode2="0x3"/>
1380    <ins name="STORE.i48" opcode2="0x4"/>
1381    <ins name="STORE.i64" opcode2="0x5"/>
1382    <ins name="STORE.i96" opcode2="0x6"/>
1383    <ins name="STORE.i128" opcode2="0x7"/>
1384    <sr_count/>
1385    <memory_access/>
1386    <slot/>
1387    <src size="64">Address to store to after adding offset</src>
1388    <imm name="offset" start="8" size="16" signed="true"/> <!-- signed 16-bit immediate at bits 8..23 -->
1389  </group>
1390
1391  <ins name="LEA_BUFFER" title="Load buffer effective address" message="attribute" opcode="0x6B" unit="LS"> <!-- Writes two staging registers (count="2"). -->
1392    <desc>
1393      Load effective address of a simple buffer with an offset added.
1394    </desc>
1395    <sr write="true"/>
1396    <sr_count count="2"/>
1397    <slot/>
1398    <src>Offset</src>
1399    <src>Index</src> <!-- NOTE(review): presumably the buffer index; the desc does not say -->
1400  </ins>
1401
1402  <ins name="ST_CVT" title="Store with conversion" opcode="0x71" unused="true" unit="LS"> <!-- Converting store; consumes the descriptor produced by the LEA_* instructions per the desc. -->
1403    <desc>
1404      Store to memory with data conversion. The address to store to is given in
1405      the first source, which must be a 64-bit register (a pair of 32-bit
1406      registers). The other source is the conversion descriptor used for the store.
1407
1408      Used with LEA_TEX_IMM to implement image stores.
1409    </desc>
1410    <slot/>
1411    <va_mod name="memory_access" start="37" size="3"/> <!-- NOTE(review): spelled out here as an explicit va_mod at bits 37..39, whereas LOAD/STORE use the memory_access element; confirm the position and width -->
1412    <vecsize/>
1413    <regfmt/>
1414    <sr read="true"/> <!-- data to store is read from staging registers -->
1415    <sr_count/>
1416    <src size="64">64-bit address to store to</src>
1417    <imm name="offset" start="8" size="8"/> <!-- unsigned 8-bit immediate at bits 8..15 (narrower than the LOAD/STORE offset) -->
1418    <src>Internal conversion descriptor</src>
1419  </ins>
1420
1421  <ins name="LD_TILE" title="Load from tilebuffer" opcode="0x78" unused="true" unit="NONE"> <!-- Tilebuffer read; same three sources as ST_TILE. -->
1422    <desc>
1423      Loads a given render target, specified in the pixel indices descriptor, at
1424      a given location and sample, and converts it to the format specified in the
1425      internal conversion descriptor. Used to implement EXT_framebuffer_fetch
1426      and internally in blend shaders.
1427    </desc>
1428    <sr write="true"/> <!-- loaded value goes to staging registers -->
1429    <sr_count/>
1430    <vecsize/>
1431    <regfmt/>
1432    <slot/>
1433    <src>Pixel indices descriptor</src>
1434    <src>Coverage mask</src>
1435    <src>Conversion descriptor</src>
1436  </ins>
1437
1438  <ins name="ST_TILE" title="Store to tilebuffer" opcode="0x79" unused="true" unit="NONE"> <!-- Tilebuffer write; same three sources as LD_TILE. -->
1439    <desc>
1440      Store to given render target, specified in the pixel indices descriptor, at
1441      a given location and sample, and convert to the format specified in the
1442      internal conversion descriptor. Used internally in blend shaders.
1443    </desc>
1444    <sr read="true"/> <!-- value to store is read from staging registers -->
1445    <sr_count/>
1446    <vecsize/>
1447    <regfmt/>
1448    <slot/>
1449    <src>Pixel indices descriptor</src>
1450    <src>Coverage mask</src>
1451    <src>Conversion descriptor</src>
1452  </ins>
1453
1454  <ins name="BLEND" title="Blend render target" opcode="0x7F" unused="true" unit="NONE"> <!-- Reads the colour from staging registers; also acts as an implicit branch on the fixed-function path (see desc). -->
1455    <desc>
1456      Blends a given render target. This loads the API-specified blend state for
1457      the render target from the first source. Blend descriptors are available
1458      as special immediates. It then reads the colour to be blended from the
1459      first staging register, with the specified vector size and register format
1460      as desired. The resulting coverage mask is stored to the second set of
1461      staging registers.
1462
1463      In the fixed-function path, `BLEND` sends the colour to the blender to be
1464      written to the tilebuffer. Then, if the instruction's flow control
1465      specifies termination, the fragment program is ended. If it does not
1466      specify termination, `BLEND` acts as a relative branch, branching with the
1467      offset specified as `target`. This allows the subsequent instructions to
1468      be skipped when fixed-function blending is used. Note this implicit branch
1469      can never introduce divergence, so `.reconverge` is not required.
1470
1471      In the blend shader path, `BLEND` ignores the specified flow control and
1472      does not branch to the specified offset. Instead, execution continues
1473      normally with the next instruction. The compiler should insert code for
1474      calling a blend shader after the `BLEND` instruction unless it is known
1475      that a blend shader will never be required.
1476
1477      The indirection is required to support both fixed-function and blend
1478      shaders efficiently and without shader variants.
1479    </desc>
1480    <sr read="true"/> <!-- NOTE(review): only a read staging register is declared, while the desc also mentions a written coverage mask; confirm -->
1481    <src size="64">Blend descriptor</src>
1482    <src>Sample coverage</src>
1483    <imm name="target" start="8" size="8"/> <!-- 8-bit branch offset at bits 8..15, used on the fixed-function path -->
1484    <slot/>
1485    <sr_count/>
1486    <vecsize/>
1487    <regfmt/>
1488  </ins>
1489
1490  <ins name="ATEST" title="Alpha test" opcode="0x7D" unused="true" unit="NONE"> <!-- Takes a coverage mask in and writes the updated mask to a staging register. -->
1491    <desc>
1492      Does alpha-to-coverage testing, updating the sample coverage mask. ATEST
1493      does not do an implicit discard. It should be executed before the first
1494      ZS_EMIT or BLEND instruction.
1495    </desc>
1496    <sr write="true">Updated coverage mask</sr>
1497    <src>Input coverage mask</src>
1498    <src swizzle="true">Alpha value (render target 0)</src>
1499    <src/> <!-- NOTE(review): third source is undocumented in this file -->
1500    <sr_count/>
1501  </ins>
1502
1503  <ins name="ZS_EMIT" title="Depth/stencil write" opcode="0x7E" unused="true" unit="NONE"> <!-- The z and stencil modifiers choose which of the two values is written. -->
1504    <desc>
1505      Programmatically writes out depth, stencil, or both, depending on which
1506      modifiers are set. Used to implement gl_FragDepth and gl_FragStencilRefARB.
1507    </desc>
1508    <va_mod name="z" start="25" size="1"/> <!-- bit 25: write depth -->
1509    <va_mod name="stencil" start="24" size="1"/> <!-- bit 24: write stencil -->
1510    <sr write="true">Updated coverage mask</sr>
1511    <src>Depth value</src>
1512    <src>Stencil value</src>
1513    <src>Input coverage mask</src>
1514    <sr_count/>
1515    <slot/>
1516  </ins>
1517
1518  <group name="CONVERT" title="Data conversions" dests="1" opcode="0x90" unused="true" unit="CVT"> <!-- Same-width int-to-float conversions; each unsigned opcode2 is its signed counterpart plus 0x10. -->
1519    <desc>
1520      Performs the given data conversion. Note that floating-point rounding is
1521      handled via the same hardware and therefore shares an encoding. Round mode
1522      is specified where it makes sense.
1523    </desc>
1524
1525    <ins name="V2S16_TO_V2F16" opcode2="0x7"/>
1526
1527    <ins name="S32_TO_F32" opcode2="0x9"/>
1528
1529    <ins name="V2U16_TO_V2F16" opcode2="0x17"/>
1530
1531    <ins name="U32_TO_F32" opcode2="0x19"/>
1532
1533    <roundmode/>
1534    <src widen="true">Value to convert</src>
1535  </group>
1536
1537  <group name="CONVERT" title="16->32 integer data conversions" dests="1" opcode="0x90" unused="true" unit="CVT"> <!-- Widening conversions from a 16-bit integer source; no round mode is declared. -->
1538    <desc>
1539      Performs the given data conversion.
1540    </desc>
1541
1542    <ins name="S16_TO_S32" opcode2="0x4"/>
1543    <ins name="S16_TO_F32" opcode2="0x5"/>
1544    <ins name="U16_TO_U32" opcode2="0x14"/>
1545    <ins name="U16_TO_F32" opcode2="0x15"/>
1546
1547    <src swizzle="true" size="16">Value to convert</src>
1548  </group>
1549
1550  <group name="CONVERT" title="Float-to-int data conversions" dests="1" opcode="0x90" unused="true" unit="CVT"> <!-- 32-bit float source; the 16-bit float sources are in the next group of the same title. -->
1551    <desc>Performs the given data conversion.</desc>
1552    <ins name="F32_TO_S32" opcode2="0xC"/>
1553    <ins name="F32_TO_U32" opcode2="0x1C"/>
1554    <roundmode/>
1555    <src absneg="true">Value to convert</src>
1556  </group>
1557
1558  <group name="CONVERT" title="Float-to-int data conversions" dests="1" opcode="0x90" unused="true" unit="CVT"> <!-- 16-bit float sources (size="16"); shares its title with the 32-bit group above. -->
1559    <desc>Performs the given data conversion.</desc>
1560    <ins name="V2F16_TO_V2S16" opcode2="0xE"/>
1561    <ins name="V2F16_TO_V2U16" opcode2="0x1E"/>
1562    <ins name="F16_TO_S32" opcode2="0xA"/>
1563    <ins name="F16_TO_U32" opcode2="0x1A"/>
1564    <roundmode/>
1565    <src swizzle="true" absneg="true" size="16">Value to convert</src>
1566  </group>
1567
1568  <ins name="F16_TO_F32" title="16-bit float to 32-bit float conversion" dests="1" opcode="0x90" opcode2="0xB" unused="true" unit="CVT"> <!-- Standalone entry on the shared conversion opcode 0x90. -->
1569    <desc>Converts up with the specified round mode.</desc>
1570    <roundmode/>
1571    <src lane="28" size="16" absneg="true">Value to convert</src> <!-- lane selector field starts at bit 28 -->
1572  </ins>
1573
1574  <group name="CONVERT" title="8-bit to 32-bit data conversions" dests="1" opcode="0x90" unused="true" unit="CVT"> <!-- Each unsigned opcode2 is its signed counterpart plus 0x10. -->
1575    <desc>
1576      Performs the given data conversion.
1577    </desc>
1578
1579    <ins name="S8_TO_S32" opcode2="0x0"/>
1580    <ins name="S8_TO_F32" opcode2="0x1"/>
1581
1582    <ins name="U8_TO_U32" opcode2="0x10"/>
1583    <ins name="U8_TO_F32" opcode2="0x11"/>
1584
1585    <src lane="28" size="8">Value to convert</src> <!-- lane selector field starts at bit 28 -->
1586  </group>
1587
1588  <group name="CONVERT" title="8-bit to 16-bit data conversions" dests="1" opcode="0x90" unused="true" unit="CVT"> <!-- Two-lane forms; each unsigned opcode2 is its signed counterpart plus 0x10. -->
1589    <desc>
1590      Performs the given data conversion.
1591    </desc>
1592
1593    <ins name="V2S8_TO_V2S16" opcode2="0x2"/>
1594    <ins name="V2S8_TO_V2F16" opcode2="0x3"/>
1595
1596    <ins name="V2U8_TO_V2U16" opcode2="0x12"/>
1597    <ins name="V2U8_TO_V2F16" opcode2="0x13"/>
1598
1599    <src halfswizzle="true" size="8">Value to convert</src>
1600  </group>
1601
1602  <group name="FROUND" title="Floating-point rounding" dests="1" opcode="0x90" unused="true" unit="CVT"> <!-- Rounding shares opcode 0x90 with the CONVERT groups. -->
1603    <desc>
1604      Performs the given rounding, using the convert unit.
1605    </desc>
1606
1607    <ins name="FROUND.f32" opcode2="0xD"/>
1608    <ins name="FROUND.v2f16" opcode2="0xF"/>
1609
1610    <roundmode/> <!-- selects which rounding is performed -->
1611    <src swizzle="true" absneg="true">Value to convert</src>
1612  </group>
1613
1614  <ins name="MOV.i32" title="Register move" dests="1" opcode="0x91" opcode2="0x0" unused="true" unit="CVT"> <!-- First entry of the opcode-0x91 family (opcode2 0x0). -->
1615    <desc>Canonical register-to-register move.</desc>
1616    <src/> <!-- value to copy; no modifiers declared -->
1617  </ins>
1618
1619  <ins name="CLZ.u32" title="Count leading zeroes" dests="1" opcode="0x91" opcode2="0x4" unused="true" unit="CVT">
1620    <desc>
1621      Used as a primitive for various bitwise operations.
1622    </desc>
1623    <src/>
1624  </ins>
1625
1626  <ins name="CLZ.v2u16" title="Count leading zeroes" dests="1" opcode="0x91" opcode2="0x5" unused="true" unit="CVT">
1627    <desc>
1628      Used as a primitive for various bitwise operations.
1629    </desc>
1630    <src swizzle="true"/>
1631  </ins>
1632
1633  <ins name="CLZ.v4u8" title="Count leading zeroes" dests="1" opcode="0x91" opcode2="0x6" unused="true" unit="CVT">
1634    <desc>
1635      Used as a primitive for various bitwise operations.
1636    </desc>
1637    <src/>
1638  </ins>
1639
1640  <ins name="IABS.s32" title="Absolute value" dests="1" opcode="0x91" opcode2="0x8" unused="true" unit="CVT">
1641    <desc>
1642      64-bit abs may be constructed in 4 instructions (5 clocks) by checking the
1643      sign with `ICMP.s32.lt.m1 hi, 0` and negating based on the result with
1644      `IADD.s64` and `LSHIFT_XOR.i32` on each half.
1645    </desc>
1646    <src widen="true"/>
1647  </ins>
1648
1649  <ins name="IABS.v2s16" title="Absolute value" dests="1" opcode="0x91" opcode2="0x9" unused="true" unit="CVT">
1650    <src widen="true"/>
1651  </ins>
1652
1653  <ins name="IABS.v4s8" title="Absolute value" dests="1" opcode="0x91" opcode2="0xa" unused="true" unit="CVT">
1654    <src/>
1655  </ins>
1656
1657  <ins name="POPCOUNT.i32" title="Population count" dests="1" opcode="0x91" opcode2="0xC" unused="true" unit="SFU">
1658    <desc>
1659      Only available as 32-bit. Smaller bitsizes require explicit conversions.
1660      64-bit popcount may be constructed in 3 clocks by separate 32-bit
1661      popcounts of each half and a 32-bit add, which is guaranteed not to
1662      overflow.
1663    </desc>
1664    <src/>
1665  </ins>
1666
1667  <ins name="BITREV.i32" title="Bitwise reverse" dests="1" opcode="0x91" opcode2="0xD" unused="true" unit="SFU">
1668    <desc>
1669      Only available as 32-bit. Other bitsizes may be derived with swizzles.
1670    </desc>
1671    <src/>
1672  </ins>
1673
1674  <ins name="NOT_OLD.i32" title="Bitwise complement" dests="1" opcode="0x91" opcode2="0xE" unused="true" unit="SFU">
1675    <desc>
1676      For fully featured bitwise operation, see the shift opcodes.
1677    </desc>
1678    <src/>
1679  </ins>
1680
1681  <ins name="NOT_OLD.i64" title="Bitwise complement" dests="1" opcode="0x191" opcode2="0xE" unused="true" unit="SFU">
1682    <desc>
1683      For fully featured bitwise operation, see the shift opcodes.
1684    </desc>
1685    <src/>
1686  </ins>
1687
1688  <ins name="WMASK" title="Warp mask" dests="1" opcode="0x95" unused="true" unit="CVT">
1689    <desc>
1690      Returns the mask of lanes ever active within the warp (subgroup), such
1691      that the source is nonzero. The number of work-items in a subgroup is
1692      given as the popcount of this value with a nonzero input.
1693
1694      An `all()` subgroup operation may be constructed as `WMASK` of the input
1695      compared for equality with `WMASK` of a nonzero value.
1696
1697      An `any()` subgroup operation may be constructed as `WMASK` of the input
1698      compared against zero.
1699    </desc>
1700    <src/>
1701    <subgroup/>
1702  </ins>
1703
1704  <group name="FLUSH" title="Flush floats" dests="1" opcode="0x98" unit="CVT">
1705    <ins name="FLUSH.f32" opcode2="0"/>
1706    <ins name="FLUSH.v2f16" opcode2="1"/>
1707    <desc>
1708      Flush special float values. The ftz modifier flushes subnormal values to
1709      zero. The flush_inf modifier flushes +inf to the maximum finite value, and
1710      -inf to the minimum finite value. nan_mode may flush either all NaN values
1711      to zero or signaling NaNs to quiet NaNs depending on the mode.
1712    </desc>
1713    <va_mod name="nan_mode" start="8" size="2"/>
1714    <va_mod name="ftz" start="10" size="1"/>
1715    <va_mod name="flush_inf" start="11" size="1"/>
1716    <src float="true" absneg="true" swizzle="true"/>
1717  </group>
1718
1719  <group name="FREXP" title="Fraction/exponent extract" dests="1" opcode="0x99" unused="true" unit="CVT">
1720    <ins name="FREXPM.f32" opcode2="0"/>
1721    <ins name="FREXPM.v2f16" opcode2="1"/>
1722    <ins name="FREXPE.f32" opcode2="2"/>
1723    <ins name="FREXPE.v2f16" opcode2="3"/>
1724    <desc>
1725      Breaks up the floating-point input into its fractional (mantissa) and
1726      exponent parts. By default, this is compatible with the `frexp()` function
1727      in APIs. With the log/sqrt modifiers, the floating point format is
1728      adjusted to be compatible with Valhall's argument reduction for logarithm
1729      and square root computation respectively.
1730    </desc>
1731    <va_mod name="sqrt" start="24" size="1"/>
1732    <va_mod name="log" start="25" size="1"/>
1733    <src float="true" swizzle="true"/>
1734  </group>
1735
1736  <group name="SFU" title="Special function unit" dests="1" opcode="0x9C" unused="true" unit="SFU">
1737    <ins name="FRCP.f32" opcode2="0"/>
1738    <ins name="FRCP.f16" opcode2="1"/>
1739    <ins name="FRSQ.f32" opcode2="2"/>
1740    <ins name="FRSQ.f16" opcode2="3"/>
1741    <ins name="FLOGD.f32" opcode2="8"/>
1742    <ins name="FPCLASS.f32" opcode2="10"/>
1743    <ins name="FPCLASS.f16" opcode2="11"/>
1744    <ins name="FLOG_TABLE.f32" opcode2="12"/>
1745    <ins name="FRCP_APPROX.f32" opcode2="14"/>
1746    <ins name="FRSQ_APPROX.f32" opcode2="15"/>
1747    <desc>
1748      Performs a given special function. The floating-point reciprocal (`FRCP`)
1749      and reciprocal square root (`FRSQ`) instructions may be freely used as-is.
1750      The logarithm instruction (`FLOGD.f32`) requires an argument
1751      reduction. See the transcendentals section for more information. Like the
1752      Bifrost op, `FRSQ_APPROX.f32` does an implicit `FREXPM.f32.sqrt` on the
1753      source.
1754    </desc>
1755    <src float="true" swizzle="true" absneg="true"/>
1756  </group>
1757
1758  <group name="SFU" title="Special function unit" dests="1" opcode="0x9C" unused="true" unit="SFU">
1759    <ins name="FSIN_TABLE.u6" opcode2="4"/>
1760    <ins name="FCOS_TABLE.u6" opcode2="5"/>
1761    <ins name="FSINCOS_OFFSET.u6" opcode2="6"/>
1762    <ins name="FEXP_TABLE.u4" opcode2="13"/>
1763    <desc>
1764      Performs a given special function. The trigonometric tables
1765      (`FSIN_TABLE.u6` and `FCOS_TABLE.u6`) are crude, requiring both an
1766      argument reduction and postprocessing.
1767    </desc>
1768    <src/>
1769  </group>
1770
1771  <group name="FADD" title="Floating-point add" dests="1" opcode2="0" unused="true" unit="FMA">
1772    <ins name="FADD.f32" opcode="0xA4"/>
1773    <ins name="FADD.v2f16" opcode="0xA5"/>
1774    <desc>$A + B$</desc>
1775    <clamp/>
1776    <src absneg="true" swizzle="true">A</src>
1777    <src absneg="true" swizzle="true">B</src>
1778  </group>
1779
1780  <group name="FMIN" title="Floating-point minimum" dests="1" opcode2="2" unused="true" unit="CVT">
1781    <ins name="FMIN.f32" opcode="0xA4"/>
1782    <ins name="FMIN.v2f16" opcode="0xA5"/>
1783    <desc>$\min \{ A, B \}$</desc>
1784    <clamp/>
1785    <src absneg="true" swizzle="true">A</src>
1786    <src absneg="true" swizzle="true">B</src>
1787  </group>
1788
1789  <group name="FMAX" title="Floating-point maximum" dests="1" opcode2="3" unused="true" unit="CVT">
1790    <ins name="FMAX.f32" opcode="0xA4"/>
1791    <ins name="FMAX.v2f16" opcode="0xA5"/>
1792    <desc>$\max \{ A, B \}$</desc>
1793    <clamp/>
1794    <src absneg="true" swizzle="true">A</src>
1795    <src absneg="true" swizzle="true">B</src>
1796  </group>
1797
1798  <group name="V2F32_TO_V2F16" title="Vectorized floating-point conversion" dests="1" opcode2="4" unused="true" unit="CVT">
1799    <ins name="V2F32_TO_V2F16" opcode="0xA5"/>
1800    <desc>
1801      Given a pair of 32-bit floats, output a pair of 16-bit floats packed into
1802      a 32-bit destination.
1803    </desc>
1804    <clamp/>
1805    <roundmode/>
1806    <src absneg="true">A</src>
1807    <src absneg="true">B</src>
1808  </group>
1809
1810  <group name="LDEXP" title="Floating-point rescaling" dests="1" opcode2="6" unused="true" unit="FMA">
1811    <ins name="LDEXP.f32" opcode="0xA4"/>
1812    <ins name="LDEXP.v2f16" opcode="0xA5"/>
1813    <desc>
1814      Computes $A \cdot 2^B$ by adding B to the exponent of A. Used to calculate
1815      various special functions, particularly base-2 exponents. Special case
1816      handling differs from an actual floating-point multiply, so this should
1817      not be used outside fixed instruction sequences.
1818    </desc>
1819    <src absneg="true">A</src>
1820    <src/>
1821    <roundmode/> <!-- Also has rtna -->
1822    <!-- Also has infinity handling for arctan -->
1823  </group>
1824
1825  <ins name="FEXP.f32" title="Floating-point exponent" dests="1" opcode="0xA4" opcode2="8" unused="true" unit="SFU">
1826    <desc>
1827      Calculates the base-2 exponent of an argument specified as an 8:24
1828      fixed-point. The original argument is passed as well for correct handling
1829      of special cases.
1830    </desc>
1831    <clamp/>
1832    <src>Input as 8:24 fixed-point</src>
1833    <src absneg="true">Input as 32-bit float</src>
1834  </ins>
1835
1836  <ins name="FADD_LSCALE.f32" title="Floating-point add with logarithm scale" dests="1" opcode="0xA4" opcode2="9" unused="true" unit="FMA">
1837    <desc>
1838      Performs a floating-point addition specialized for logarithm computation.
1839    </desc>
1840    <clamp/>
1841    <src absneg="true">A</src>
1842    <src absneg="true">B</src>
1843  </ins>
1844
1845  <ins name="FATAN_ASSIST.f32" title="ATAN calculation helper" dests="1" opcode="0xA4" opcode2="14" unused="true" unit="SFU">
1846    <desc>
1847      Used for `atan2()` implementation. Destination is two 16-bit
1848      values (int and float) for the first form, and a single 32-bit float when
1849      `.second` is set (indicating the FATAN_TABLE.f32 instruction).
1850    </desc>
1851    <va_mod name="second" start="24" size="1"/>
1852    <src>A</src>
1853    <src>B</src>
1854  </ins>
1855
1856  <group name="IADD" title="Integer addition" dests="1" opcode2="0" unused="true" unit="CVT">
1857    <desc>
1858      $A + B$ with optional saturation.
1859
1860      As Valhall lacks swizzle instructions, `IADD.v2i16` with zero is the
1861      canonical lowering for swizzles.
1862    </desc> <!-- NOTE(review): the description names `IADD.v2i16`, but the 16-bit variants defined in this group are `IADD.v2u16` and `IADD.v2s16`; confirm which spelling is intended. -->
1863    <ins name="IADD.u32" opcode="0xA0"/>
1864    <ins name="IADD.v2u16" opcode="0xA1"/>
1865    <ins name="IADD.v4u8" opcode="0xA2"/>
1866    <ins name="IADD.s32" opcode="0xA8"/>
1867    <ins name="IADD.v2s16" opcode="0xA9"/>
1868    <ins name="IADD.v4s8" opcode="0xAA"/>
1869    <ins name="IADD.u64" opcode="0x1A3"/>
1870    <ins name="IADD.s64" opcode="0x1AB"/>
1871    <!-- <ins name="IADD.s32" opcode="0x1A0"/> -->
1872    <src widen="true">A</src>
1873    <src widen="true">B</src>
1874    <saturate/>
1875  </group>
1876
1877  <ins name="MKVEC.v2i16" title="Make 16-bit vector" dests="1" opcode="0xA1" opcode2="0x5" unused="true" unit="CVT">
1878    <desc>Calculates $A | (B \ll 16)$. Used to implement `(ushort2)(A, B)`</desc>
1879    <src swizzle="true">A</src>
1880    <src swizzle="true">B</src>
1881  </ins>
1882
1883  <group name="ISUB" title="Integer subtract" dests="1" opcode2="1" unused="true" unit="CVT">
1884    <ins name="ISUB.u32" opcode="0xA0"/>
1885    <ins name="ISUB.v2u16" opcode="0xA1"/>
1886    <ins name="ISUB.v4u8" opcode="0xA2"/>
1887    <ins name="ISUB.s32" opcode="0xA8"/>
1888    <ins name="ISUB.v2s16" opcode="0xA9"/>
1889    <ins name="ISUB.v4s8" opcode="0xAA"/>
1890    <ins name="ISUB.u64" opcode="0x1A3"/>
1891    <ins name="ISUB.s64" opcode="0x1AB"/>
1892    <desc>$A - B$ with optional saturation</desc>
1893    <src widen="true">A</src>
1894    <src widen="true">B</src>
1895    <saturate/>
1896  </group>
1897
1898  <group name="SEG_ADD" title="Segment addition" dests="1" opcode2="6" unused="true" unit="CVT">
1899    <desc>
1900      Similar to SHADDX, but especially used for loading offsets into
1901      WLS. Usually this is only required for atomic operations, which cannot
1902      directly use wls_pointer as an address.
1903
1904      .neg indicates SEG_SUB instead.
1905    </desc>
1906    <ins name="SEG_ADD.u64" opcode="0x1A3"/>
1907    <va_mod name="neg" start="38" size="1"/>
1908    <va_mod name="preserve_null" start="39" size="1"/>
1909    <src>A</src>
1910    <src widen="true">B</src>
1911  </group>
1912
1913  <group name="SHADDX" title="Shift, extend, and 64-bit add" dests="1" opcode2="7" unused="true" unit="CVT">
1914    <desc>
1915      Sign or zero extend B to 64-bits, left-shift by `shift`, and add the
1916      64-bit value A. These instructions accelerate address arithmetic, but may
1917      be used in full generality for 64-bit integer arithmetic.
1918    </desc>
1919    <ins name="SHADDX.u64" opcode="0x1A3"/>
1920    <ins name="SHADDX.s64" opcode="0x1AB"/>
1921    <imm name="shift" start="20" size="3"/>
1922    <src>A</src>
1923    <src widen="true">B</src>
1924  </group>
1925
1926  <group name="IMUL" title="Integer multiply" dests="1" opcode2="0x0A" unused="true" unit="SFU">
1927    <ins name="IMUL.i32" opcode="0xA0"/>
1928    <ins name="IMUL.v2i16" opcode="0xA1"/>
1929    <ins name="IMUL.v4i8" opcode="0xA2"/>
1930    <ins name="IMUL.s32" opcode="0xA8"/>
1931    <ins name="IMUL.v2s16" opcode="0xA9"/>
1932    <ins name="IMUL.v4s8" opcode="0xAA"/>
1933    <ins name="IMULD.u64" opcode="0x1A3"/>
1934    <!-- <ins name="IMUL.s32" opcode="0x1A0"/> -->
1935    <desc>
1936      $A \cdot B$ with optional saturation. Note the multipliers can only handle up to
1937      32-bit by 32-bit multiplies. The 64-bit "multiply" acts like IMUL.u32 but
1938      additionally writes the high half of the product to the high half of the
1939      64-bit destination. Along with IADD.u32 and IADD.u64, this allows the
1940      construction of a 64-bit multiply in 5 instructions (6 clocks).
1941    </desc>
1942    <src widen="true">A</src>
1943    <src widen="true">B</src>
1944    <saturate/>
1945  </group>
1946
1947  <group name="HADD" title="Integer half-add" dests="1" opcode2="0x0B" unused="true" unit="CVT">
1948    <ins name="HADD.u32" opcode="0xA0"/>
1949    <ins name="HADD.v2u16" opcode="0xA1"/>
1950    <ins name="HADD.v4u8" opcode="0xA2"/>
1951    <ins name="HADD.s32" opcode="0xA8"/>
1952    <ins name="HADD.v2s16" opcode="0xA9"/>
1953    <ins name="HADD.v4s8" opcode="0xAA"/>
1954    <va_mod name="rhadd" start="30" size="1"/>
1955    <src widen="true">A</src>
1956    <src widen="true">B</src>
1957    <desc>
1958      $(A + B) \gg 1$ without intermediate overflow, corresponding to `hadd()` in
1959      OpenCL. With the `.rhadd` modifier set, it instead calculates
1960      $(A + B + 1) \gg 1$ corresponding to `rhadd()` in OpenCL.
1961    </desc>
1962  </group>
1963
1964  <group name="CLPER" title="Cross-lane permute" dests="1" opcode2="0xF" unused="true" unit="SFU">
1965    <ins name="CLPER.i32" opcode="0xA0"/>
1966    <ins name="CLPER.v2u16" opcode="0xA1"/>
1967    <ins name="CLPER.v4u8" opcode="0xA2"/>
1968    <ins name="CLPER.s32" opcode="0xA8"/>
1969    <ins name="CLPER.v2s16" opcode="0xA9"/>
1970    <ins name="CLPER.v4s8" opcode="0xAA"/>
1971    <ins name="CLPER.u64" opcode="0x1A3"/>
1972    <ins name="CLPER.s64" opcode="0x1AB"/>
1973    <!-- <ins name="CLPER.s32" opcode="0x1A0"/> -->
1974    <desc>
1975      Selects the value of A in the subgroup lane given by B. This implements
1976      subgroup broadcasts. It may be used as a primitive for screen space
1977      derivatives in fragment shaders.
1978    </desc>
1979    <src>A</src>
1980    <src lanes="true" size="8">B</src>
1981    <subgroup/>
1982    <lane_op/>
1983    <inactive_result/>
1984  </group>
1985
1986  <group name="FMA" title="Fused floating-point multiply add" dests="1" unused="true" unit="FMA">
1987    <ins name="FMA.f32" opcode="0xB2"/>
1988    <ins name="FMA.v2f16" opcode="0xB3"/>
1989    <desc>$A \cdot B + C$</desc>
1990    <clamp/>
1991    <src absneg="true" swizzle="true">A</src>
1992    <src absneg="true" swizzle="true">B</src>
1993    <src absneg="true" swizzle="true">C</src>
1994  </group>
1995
1996  <group name="LSHIFT_AND" title="Left shift and bitwise AND" dests="1" opcode2="0x100" unused="true" unit="SFU">
1997    <ins name="LSHIFT_AND.i32" opcode="0xB4"/>
1998    <ins name="LSHIFT_AND.v2i16" opcode="0xB5"/>
1999    <ins name="LSHIFT_AND.v4i8" opcode="0xB6"/>
2000    <ins name="LSHIFT_AND.i64" opcode="0x1B7"/>
2001    <va_mod name="left" start="128" size="1" implied="true"/>
2002    <desc>
2003      Left shifts its first source by a specified amount and bitwise ANDs it with the
2004      second source, optionally inverting the second source or the result.
2005    </desc>
2006    <not_result/>
2007    <src widen="true">A</src>
2008    <src lanes="true" size="8">shift</src>
2009    <src not="true">B</src>
2010  </group>
2011
2012  <group name="RSHIFT_AND" title="Right shift and bitwise AND" dests="1" opcode2="0x000" unused="true" unit="SFU">
2013    <ins name="RSHIFT_AND.i32" opcode="0xB4"/>
2014    <ins name="RSHIFT_AND.v2i16" opcode="0xB5"/>
2015    <ins name="RSHIFT_AND.v4i8" opcode="0xB6"/>
2016    <ins name="RSHIFT_AND.i64" opcode="0x1B7"/>
2017    <va_mod name="left" start="128" size="1" implied="true"/>
2018    <desc>
2019      Right shifts its first source by a specified amount and bitwise ANDs it with the
2020      second source, optionally inverting the second source or the result. If
2021      `signed` is set, the hardware performs an arithmetic right shift; otherwise,
2022      it performs an unsigned right shift.
2023    </desc>
2024    <va_mod name="signed" start="34" size="1"/>
2025    <not_result/>
2026    <src widen="true">A</src>
2027    <src lanes="true" size="8">shift</src>
2028    <src not="true">B</src>
2029  </group>
2030
2031  <group name="LSHIFT_OR" title="Left shift and bitwise OR" dests="1" opcode2="0x101" unused="true" unit="SFU">
2032    <ins name="LSHIFT_OR.i32" opcode="0xB4"/>
2033    <ins name="LSHIFT_OR.v2i16" opcode="0xB5"/>
2034    <ins name="LSHIFT_OR.v4i8" opcode="0xB6"/>
2035    <ins name="LSHIFT_OR.i64" opcode="0x1B7"/>
2036    <va_mod name="left" start="128" size="1" implied="true"/>
2037    <desc>
2038      Left shifts its first source by a specified amount and bitwise ORs it with the
2039      second source, optionally inverting the second source or the result.
2040    </desc>
2041    <not_result/>
2042    <src widen="true">A</src>
2043    <src lanes="true" size="8">shift</src>
2044    <src not="true">B</src>
2045  </group>
2046
2047  <group name="RSHIFT_OR" title="Right shift and bitwise OR" dests="1" opcode2="0x001" unused="true" unit="SFU">
2048    <ins name="RSHIFT_OR.i32" opcode="0xB4"/>
2049    <ins name="RSHIFT_OR.v2i16" opcode="0xB5"/>
2050    <ins name="RSHIFT_OR.v4i8" opcode="0xB6"/>
2051    <ins name="RSHIFT_OR.i64" opcode="0x1B7"/>
2052    <va_mod name="left" start="128" size="1" implied="true"/>
2053    <desc>
2054      Right shifts its first source by a specified amount and bitwise ORs it with the
2055      second source, optionally inverting the second source or the result. If
2056      `signed` is set, the hardware performs an arithmetic right shift; otherwise,
2057      it performs an unsigned right shift.
2058    </desc>
2059    <va_mod name="signed" start="34" size="1"/>
2060    <not_result/>
2061    <src widen="true">A</src>
2062    <src lanes="true" size="8">shift</src>
2063    <src not="true">B</src>
2064  </group>
2065
2066  <group name="LSHIFT_XOR" title="Left shift and bitwise XOR" dests="1" opcode2="0x102" unused="true" unit="SFU">
2067    <ins name="LSHIFT_XOR.i32" opcode="0xB4"/>
2068    <ins name="LSHIFT_XOR.v2i16" opcode="0xB5"/>
2069    <ins name="LSHIFT_XOR.v4i8" opcode="0xB6"/>
2070    <ins name="LSHIFT_XOR.i64" opcode="0x1B7"/>
2071    <va_mod name="left" start="128" size="1" implied="true"/>
2072    <desc>
2073      Left shifts its first source by a specified amount and bitwise XORs it with the
2074      second source, optionally inverting the second source or the result.
2075    </desc>
2076    <not_result/>
2077    <src widen="true">A</src>
2078    <src lanes="true" size="8">shift</src>
2079    <src not="true">B</src>
2080  </group>
2081
2082  <group name="RSHIFT_XOR" title="Right shift and bitwise XOR" dests="1" opcode2="0x002" unused="true" unit="SFU">
2083    <ins name="RSHIFT_XOR.i32" opcode="0xB4"/>
2084    <ins name="RSHIFT_XOR.v2i16" opcode="0xB5"/>
2085    <ins name="RSHIFT_XOR.v4i8" opcode="0xB6"/>
2086    <ins name="RSHIFT_XOR.i64" opcode="0x1B7"/>
2087    <va_mod name="left" start="128" size="1" implied="true"/>
2088    <desc>
2089      Right shifts its first source by a specified amount and bitwise XORs it with the
2090      second source, optionally inverting the second source or the result. If
2091      `signed` is set, the hardware performs an arithmetic right shift; otherwise,
2092      it performs an unsigned right shift.
2093    </desc>
2094    <va_mod name="signed" start="34" size="1"/>
2095    <not_result/>
2096    <src widen="true">A</src>
2097    <src lanes="true" size="8">shift</src>
2098    <src not="true">B</src>
2099  </group>
2100
2101  <ins name="MUX.i32" title="Mux" dests="1" opcode="0xB8" unused="true" unit="SFU">
2102    <desc>
2103      Mux between A and B based on the provided mask. The condition specified
2104      as the `mux` modifier is evaluated on the mask. If true, `A` is chosen,
2105      else `B` is chosen. The `bit` modifier acts bitwise, equivalent to
2106      `bitselect()` in OpenCL, so `MUX.i32.bit A, B, mask` calculates
2107      `(A &amp; mask) | (B &amp; ~mask)`.
2108    </desc>
2109    <va_mod name="mux" start="32" size="2"/>
2110    <src>A</src>
2111    <src>B</src>
2112    <src>Mask</src>
2113  </ins>
2114
2115  <ins name="MUX.v2i16" title="Mux" dests="1" opcode="0xB9" unused="true" unit="SFU">
2116    <desc>
2117      Mux between A and B based on the provided mask. The condition specified
2118      as the `mux` modifier is evaluated on the mask. If true, `A` is chosen,
2119      else `B` is chosen. The `bit` modifier acts bitwise, equivalent to
2120      `bitselect()` in OpenCL, so `MUX.v2i16.bit A, B, mask` calculates
2121      `(A &amp; mask) | (B &amp; ~mask)`.
2122    </desc>
2123    <va_mod name="mux" start="32" size="2"/>
2124    <src swizzle="true">A</src>
2125    <src swizzle="true">B</src>
2126    <src swizzle="true">Mask</src>
2127  </ins>
2128
2129  <ins name="MUX.v4i8" title="Mux" dests="1" opcode="0xBA" unused="true" unit="SFU">
2130    <desc>
2131      Mux between A and B based on the provided mask. The condition specified
2132      as the `mux` modifier is evaluated on the mask. If true, `A` is chosen,
2133      else `B` is chosen. The `bit` modifier acts bitwise, equivalent to
2134      `bitselect()` in OpenCL, so `MUX.v4i8.bit A, B, mask` calculates
2135      `(A &amp; mask) | (B &amp; ~mask)`.
2136    </desc>
2137    <va_mod name="mux" start="32" size="2"/>
2138    <src>A</src>
2139    <src>B</src>
2140    <src>Mask</src>
2141  </ins>
2142
2143  <ins name="CUBE_SSEL" title="Cube S-coordinate select" dests="1" opcode="0xBC" opcode2="0" unused="true" unit="SFU">
2144    <desc>During a cube map transform, select the S coordinate given a selected face.</desc>
2145    <src absneg="true">Z coordinate as 32-bit floating point</src>
2146    <src absneg="true">X coordinate as 32-bit floating point</src>
2147    <src>Cube face index</src>
2148  </ins>
2149
2150  <ins name="CUBE_TSEL" title="Cube T-coordinate select" dests="1" opcode="0xBC" opcode2="1" unused="true" unit="SFU">
2151    <desc>During a cube map transform, select the T coordinate given a selected face.</desc>
2152    <src absneg="true">Y coordinate as 32-bit floating point</src>
2153    <src absneg="true">Z coordinate as 32-bit floating point</src>
2154    <src>Cube face index</src>
2155  </ins>
2156
2157  <ins name="MKVEC.v2i8" title="Make 8-bit vector" dests="1" opcode="0xBD" unit="CVT">
2158    <desc>
2159      Calculates $A | (B \ll 8) | (CD \ll 16)$ for 8-bit A and B and 16-bit CD.
2160
2161      To implement `(uchar4) (A, B, C, D)` in full generality, use the sequence
2162      `MKVEC.v2i8 CD, C, D, #0; MKVEC.v2i8 out, A, B, CD`
2163
2164      `MKVEC.v2i8` also allows zero extending arbitrary 8-bit lanes. For
2165      example, to extend `r0.b3` to `r1`, use `MKVEC.v2i8 r1, r0.b3, 0x0.b0, 0x0`.
2166    </desc>
2167    <src lane="true">A</src>
2168    <src lane="true">B</src>
2169    <src>CD</src>
2170  </ins>
2171
2172  <ins name="CUBEFACE1" title="Cube map transform step 1" dests="1" opcode="0xC0" unused="true" unit="SFU">
2173    <desc>Select the maximum absolute value of its arguments.</desc>
2174    <src absneg="true">X coordinate as 32-bit floating point</src>
2175    <src absneg="true">Y coordinate as 32-bit floating point</src>
2176    <src absneg="true">Z coordinate as 32-bit floating point</src>
2177  </ins>
2178
2179  <ins name="CUBEFACE2_V9" title="Cube map transform step 2" dests="1" opcode="0xC1" unit="SFU">
2180    <desc>Select the cube face index corresponding to the arguments.</desc>
2181    <src absneg="true">X coordinate as 32-bit floating point</src>
2182    <src absneg="true">Y coordinate as 32-bit floating point</src>
2183    <src absneg="true">Z coordinate as 32-bit floating point</src>
2184  </ins>
2185
2186  <group name="IDP" title="8-bit dot product" dests="1" opcode="0xC2" unused="true" unit="FMA">
2187    <desc>
2188      8-bit integer dot product between 4 channel vectors, intended for machine
2189      learning. Available in both unsigned and signed variants, controlling
2190      sign-extension/zero-extension behaviour to the final 32-bit destination.
2191      Saturation is available. Corresponds to the `cl_arm_integer_dot_product_*`
2192      family of OpenCL extensions. Not for actual use, just for completeness.
2193      Instead, use your platform's neural accelerator.
2194
2195      For $A, B \in \{ 0, \ldots, 255 \}^4$ and $\text{Accumulator} \in
2196      \mathbb{Z}$, calculates $(A \cdot B) + \text{Accumulator}$ and optionally
2197      saturates.
2198    </desc>
2199    <ins name="IDP.v4s8" opcode2="0"/>
2200    <ins name="IDP.v4u8" opcode2="1"/>
2201    <src>A</src>
2202    <src>B</src>
2203    <src>Accumulator</src>
2204    <saturate/>
2205  </group>
2206
2207  <group name="ICMP_OR" title="Unsigned integer compare" dests="1" unit="CVT" opcode2="0">
2208    <desc>
2209      Evaluates the given condition, do a logical or with the condition in
2210      the result source, and return in the given result type (integer
2211      one, integer minus one, or floating-point one). The third source is useful
2212      for chaining together conditions without intermediate bitwise arithmetic;
2213      when this is not desired, tie it to zero.
2214    </desc>
2215    <ins name="ICMP_OR.u32" opcode="0xF0"/>
2216    <ins name="ICMP_OR.v2u16" opcode="0xF1"/>
2217    <ins name="ICMP_OR.v4u8" opcode="0xF2"/>
2218    <cmp int_only="true"/>
2219    <result_type/>
2220    <src widen="true">A</src>
2221    <src widen="true">B</src>
2222    <src>C</src>
2223  </group>
2224
2225  <group name="ICMP_AND" title="Unsigned integer compare" dests="1" unit="CVT" opcode2="1">
2226    <desc>
2227      Evaluates the given condition, do a logical and with the condition in
2228      the result source, and return in the given result type (integer
2229      one, integer minus one, or floating-point one). The third source is useful
2230      for chaining together conditions without intermediate bitwise arithmetic.
2231    </desc>
2232    <ins name="ICMP_AND.u32" opcode="0xF0"/>
2233    <ins name="ICMP_AND.v2u16" opcode="0xF1"/>
2234    <ins name="ICMP_AND.v4u8" opcode="0xF2"/>
2235    <cmp int_only="true"/>
2236    <result_type/>
2237    <src widen="true">A</src>
2238    <src widen="true">B</src>
2239    <src>C</src>
2240  </group>
2241
2242  <group name="FCMP_OR" title="Floating-point compare" dests="1" unit="CVT" opcode2="0">
2243    <desc>
2244      Evaluates the given condition, do a logical or with the condition in
2245      the result source, and return in the given result type (integer
2246      one, integer minus one, or floating-point one). The third source is useful
2247      for chaining together conditions without intermediate bitwise arithmetic;
2248      when this is not desired, tie it to zero.
2249    </desc>
2250    <ins name="FCMP_OR.f32" opcode="0xF4"/>
2251    <ins name="FCMP_OR.v2f16" opcode="0xF5"/>
2252    <cmp/>
2253    <result_type/>
2254    <src absneg="true" swizzle="true">A</src>
2255    <src absneg="true" swizzle="true">B</src>
2256    <src>C</src>
2257  </group>
2258
2259  <group name="FCMP_AND" title="Floating-point compare" dests="1" unit="CVT" opcode2="1">
2260    <desc>
2261      Evaluates the given condition, do a logical and with the condition in
2262      the result source, and return in the given result type (integer
2263      one, integer minus one, or floating-point one). The third source is useful
2264      for chaining together conditions without intermediate bitwise arithmetic.
2265    </desc>
2266    <ins name="FCMP_AND.f32" opcode="0xF4"/>
2267    <ins name="FCMP_AND.v2f16" opcode="0xF5"/>
2268    <cmp/>
2269    <result_type/>
2270    <src absneg="true" swizzle="true">A</src>
2271    <src absneg="true" swizzle="true">B</src>
2272    <src>C</src>
2273  </group>
2274
2275  <group name="ICMP_OR" title="Signed integer compare" dests="1" unit="CVT" opcode2="0">
2276    <desc>
2277      Evaluates the given condition, do a logical or with the condition in
2278      the result source, and return in the given result type (integer
2279      one, integer minus one, or floating-point one). The third source is useful
2280      for chaining together conditions without intermediate bitwise arithmetic.
2281    </desc>
2282    <ins name="ICMP_OR.s32" opcode="0xF8"/>
2283    <ins name="ICMP_OR.v2s16" opcode="0xF9"/>
2284    <ins name="ICMP_OR.v4s8" opcode="0xFA"/>
2285    <cmp int_only="true"/>
2286    <result_type/>
2287    <src widen="true">A</src>
2288    <src widen="true">B</src>
2289    <src>C</src>
2290  </group>
2291
2292  <group name="ICMP_AND" title="Signed integer compare" dests="1" unit="CVT" opcode2="1">
2293    <desc>
2294      Evaluates the given condition, do a logical and with the condition in
2295      the result source, and return in the given result type (integer
2296      one, integer minus one, or floating-point one). The third source is useful
2297      for chaining together conditions without intermediate bitwise arithmetic.
2298    </desc>
2299    <ins name="ICMP_AND.s32" opcode="0xF8"/>
2300    <ins name="ICMP_AND.v2s16" opcode="0xF9"/>
2301    <ins name="ICMP_AND.v4s8" opcode="0xFA"/>
2302    <cmp int_only="true"/>
2303    <result_type/>
2304    <src widen="true">A</src>
2305    <src widen="true">B</src>
2306    <src>C</src>
2307  </group>
2308
2309  <group name="ICMP_MULTI" title="Integer compare" dests="1" unit="CVT" opcode2="2">
2310    <desc>
2311      Evaluates the given condition, do a logical and/or with the condition in
2312      the result source, and return in the given result type (integer
2313      one, integer minus one, or floating-point one). The third source is useful
2314      for chaining together conditions without intermediate bitwise arithmetic;
2315      when this is not desired, tie it to zero and use the OR combine mode (do
2316      not set the `.and` modifier).
2317
2318      Used to construct signed 64-bit compares
2319      in 1 `ICMP.u32` and 1 `ICMP.s32` instruction, in conjunction with the `u1`
2320      result type on the low half, the `m1` result type on the high half, and
2321      the result of the low half comparison passed as the third source.
2322    </desc> <!-- NOTE(review): the description refers to `ICMP.u32`, `ICMP.s32` and an `.and` modifier, whereas this group defines `ICMP_MULTI.u32` and `ICMP_MULTI.s32` and declares no `and` modifier of its own; presumably the text predates a rename. Confirm and update. -->
2323    <ins name="ICMP_MULTI.u32" opcode="0xF0"/>
2324    <ins name="ICMP_MULTI.s32" opcode="0xF8"/>
2325    <cmp int_only="true"/>
2326    <result_type/>
2327    <src widen="true">A</src>
2328    <src widen="true">B</src>
2329    <src>C</src>
2330  </group>
2331
2332  <ins name="IADD_IMM.i32" title="Integer addition with immediate" dests="1" opcode="0x110" unit="CVT">
        <!-- IADD_IMM.v2i16 and IADD_IMM.v4i8 declare the same single source and
             the same 32-bit "constant" immediate; the three entries differ in
             name, opcode and description only. -->
2333    <desc>
2334      Adds an arbitrary 32-bit immediate embedded within the instruction stream.
2335      If no modifiers are required, this is preferred to `IADD.i32` with a
2336      constant accessed as a uniform. However, if the constant is available
2337      inline, `IADD.i32` is preferred.
2338
2339      `IADD_IMM.i32` with the source tied to zero is the canonical immediate move.
2340    </desc>
2341    <src>A</src>
2342    <imm name="constant" ir_name="index" start="8" size="32"/>
2343  </ins>
2344
2345  <ins name="IADD_IMM.v2i16" title="Integer addition with immediate" dests="1" opcode="0x111" unit="CVT">
        <!-- The immediate field is still declared as one 32-bit value
             (size="32"); per the description it holds the pair of 16-bit
             constants. Same source and immediate layout as IADD_IMM.i32. -->
2346    <desc>
2347      Adds an arbitrary pair of 16-bit immediates embedded within the
2348      instruction stream. If no modifiers are required, this is preferred to
2349      `IADD.v2i16` with a constant accessed as a uniform. However, if the
2350      constant is available inline, `IADD.v2i16` is preferred. Adding only a
2351      single 16-bit constant requires replication of the constant.
2352    </desc>
2353    <src>A</src>
2354    <imm name="constant" ir_name="index" start="8" size="32"/>
2355  </ins>
2356
2357  <ins name="IADD_IMM.v4i8" title="Integer addition with immediate" dests="1" opcode="0x112" unit="CVT">
        <!-- The immediate field is still declared as one 32-bit value
             (size="32"); per the description it holds the four 8-bit
             constants. Same source and immediate layout as IADD_IMM.i32. -->
2358    <desc>
2359      Adds an arbitrary quad of 8-bit immediates embedded within the
2360      instruction stream. If no modifiers are required, this is preferred to
2361      `IADD.v4i8` with a constant accessed as a uniform. However, if the
2362      constant is available inline, `IADD.v4i8` is preferred. Adding only a
2363      single 8-bit constant requires replication of the constant.
2364    </desc>
2365    <src>A</src>
2366    <imm name="constant" ir_name="index" start="8" size="32"/>
2367  </ins>
2368
2369  <ins name="FADD_IMM.f32" title="Floating-point addition with immediate" dests="1" opcode="0x114" unit="FMA">
        <!-- Same source and immediate layout as the IADD_IMM entries, but on
             the FMA unit. -->
        <!-- NOTE(review): FADD_IMM.v2f16 tags its source float="true" while this
             entry does not; confirm whether the difference is intentional. -->
2370    <desc>
2371      Adds an arbitrary 32-bit immediate embedded within the instruction stream.
2372      If no modifiers are required, this is preferred to `FADD.f32` with a
2373      constant accessed as a uniform. However, if the constant is available
2374      inline, `FADD.f32` is preferred.
2375    </desc>
2376    <src>A</src>
2377    <imm name="constant" ir_name="index" start="8" size="32"/>
2378  </ins>
2379
2380  <ins name="FADD_IMM.v2f16" title="Floating-point addition with immediate" dests="1" opcode="0x115" unit="FMA">
        <!-- The immediate field is declared as one 32-bit value (size="32");
             per the description it holds the pair of 16-bit constants. -->
        <!-- NOTE(review): the source here is tagged float="true", unlike the
             source of FADD_IMM.f32; confirm whether the difference is
             intentional. -->
2381    <desc>
2382      Adds an arbitrary pair of 16-bit immediates embedded within the
2383      instruction stream. If no modifiers are required, this is preferred to
2384      `FADD.v2f16` with a constant accessed as a uniform. However, if the
2385      constant is available inline, `FADD.v2f16` is preferred. Adding only a
2386      single 16-bit constant requires replication of the constant.
2387    </desc>
2388    <src float="true">A</src>
2389    <imm name="constant" ir_name="index" start="8" size="32"/>
2390  </ins>
2391
2392  <ins name="ATOM1_RETURN.i32" title="Atomic operations on memory with 1" opcode="0x69" opcode2="3" unused="true" unit="LS">
        <!-- Has the same opcode (0x69) as ATOM1_RETURN.i64; the two entries
             differ in opcode2 (3 here, 5 there). -->
2393    <slot/>
2394    <sr_count/>
2395    <atom_opc_1/>
2396    <va_mod name="memory_width" start="128" size="1" implied="true"/>
2397
2398    <!-- Optional for ATOM1.i32, in which case sr_count must be 0 -->
2399    <sr write="true"/>
2400    <src size="64">64-bit address to operate on</src>
2401    <imm name="offset" start="8" size="8"/>
2402  </ins>
2403
2404  <ins name="ATOM1_RETURN.i64" title="Atomic operations on memory with 1" opcode="0x69" opcode2="5" unused="true" unit="LS">
        <!-- Has the same opcode (0x69) as ATOM1_RETURN.i32; the two entries
             differ in opcode2 (5 here, 3 there). -->
2405    <slot/>
2406    <sr_count/>
2407    <atom_opc_1/>
2408    <va_mod name="memory_width" start="128" size="1" implied="true"/>
2409
2410    <!-- Optional for ATOM1.i64, in which case sr_count must be 0 -->
2411    <sr write="true"/>
2412    <src size="64">64-bit address to operate on</src>
2413    <imm name="offset" start="8" size="8"/>
2414  </ins>
2415
2416  <ins name="ATOM.i32" title="Atomic operations on memory" opcode="0x68" opcode2="3" unused="true" unit="LS">
        <!-- Has the same opcode (0x68) as ATOM.i64; the two entries differ in
             opcode2 (3 here, 5 there). Declares only a read staging register
             and no dests attribute. -->
2417    <slot/>
2418    <sr_count/>
2419    <atom_opc/>
2420    <va_mod name="memory_width" start="128" size="1" implied="true"/>
2421
2422    <sr read="true"/>
2423    <src size="64">64-bit address to operate on</src>
2424    <imm name="offset" start="8" size="8"/>
2425  </ins>
2426
2427  <ins name="ATOM.i64" title="Atomic operations on memory" opcode="0x68" opcode2="5" unused="true" unit="LS">
        <!-- Has the same opcode (0x68) as ATOM.i32; the two entries differ in
             opcode2 (5 here, 3 there). Declares only a read staging register
             and no dests attribute. -->
2428    <slot/>
2429    <sr_count/>
2430    <atom_opc/>
2431    <va_mod name="memory_width" start="128" size="1" implied="true"/>
2432
2433    <sr read="true"/>
2434    <src size="64">64-bit address to operate on</src>
2435    <imm name="offset" start="8" size="8"/>
2436  </ins>
2437
2438  <ins name="ATOM_RETURN.i32" title="Atomic operations on memory" opcode="0x120" opcode2="3" unused="true" unit="LS">
        <!-- Has the same opcode (0x120) as ATOM_RETURN.i64; the two entries
             differ in opcode2 (3 here, 5 there). Compared with ATOM.i32 this
             entry adds sr_write_count, the compare modifier and a written
             staging register. -->
2439    <slot/>
2440    <sr_count/>
2441    <sr_write_count/>
2442
2443    <!-- Only valid with .xchg to implement ACMPXCHG -->
2444    <va_mod name="compare" start="26" size="1"/>
2445
2446    <atom_opc/>
2447    <va_mod name="memory_width" start="128" size="1" implied="true"/>
2448
2449    <sr write="true" flags="false"/>
2450    <sr read="true" flags="rw"/>
2451    <src size="64">64-bit address to operate on</src>
2452    <imm name="offset" start="8" size="8"/>
2453  </ins>
2454
2455  <ins name="ATOM_RETURN.i64" title="Atomic operations on memory" opcode="0x120" opcode2="5" unused="true" unit="LS">
        <!-- Has the same opcode (0x120) as ATOM_RETURN.i32; the two entries
             differ in opcode2 (5 here, 3 there). -->
2456    <slot/>
2457    <sr_count/>
2458    <sr_write_count/>
        <!-- ATOM_RETURN.i32 annotates this same modifier as only valid with
             .xchg to implement ACMPXCHG; presumably that also holds here.
             TODO confirm. -->
2459    <va_mod name="compare" start="26" size="1"/>
2460    <atom_opc/>
2461    <va_mod name="memory_width" start="128" size="1" implied="true"/>
2462
2463    <sr write="true" flags="false"/>
2464    <sr read="true" flags="rw"/>
2465    <src size="64">64-bit address to operate on</src>
2466    <imm name="offset" start="8" size="8"/>
2467  </ins>
2468
2469  <ins name="TEX_FETCH" title="Texel fetch" opcode="0x125" message="tex" unit="T">
        <!-- The regfmt modifier, the second source and the sr_count immediate
             are all flagged pseudo="true"; the second source is described as a
             dummy for the IR. -->
2470    <desc>Unfiltered texturing instruction.</desc>
2471    <slot/>
2472    <skip/>
2473    <register_type/>
2474    <register_width/>
2475    <write_mask/>
2476    <dimension/>
2477    <wide_indices/>
2478    <array_enable/>
2479    <texel_offset/>
2480    <regfmt pseudo="true"/>
2481
2482    <!-- Leave secondary_register_width as 0 -->
2483    <sr_count/>
2484    <sr_write_count/>
2485
2486    <sr write="true" flags="false"/>
2487    <sr read="true" flags="false"/>
2488    <src size="64">Image to read from</src>
2489    <src pseudo="true">Dummy for IR</src>
2490    <immediate name="sr_count" size="4" pseudo="true"/>
2491  </ins>
2492
2493  <ins name="TEX_SINGLE" title="Texture load" opcode="0x128" message="tex" unit="T">
        <!-- Declares the same fields as TEX_FETCH plus the shadow and lod_mode
             modifiers. -->
2494    <desc>Ordinary texturing instruction using a sampler.</desc>
2495    <slot/>
2496    <skip/>
2497    <register_type/>
2498    <register_width/>
2499    <write_mask/>
2500    <dimension/>
2501    <wide_indices/>
2502    <array_enable/>
2503    <texel_offset/>
2504    <regfmt pseudo="true"/>
2505    <shadow/>
2506    <lod_mode/>
2507
2508    <!-- Leave secondary_register_width as 0 -->
2509    <sr_count/>
2510    <sr_write_count/>
2511
2512    <sr write="true" flags="false"/>
2513    <sr read="true" flags="false"/>
2514    <src size="64">Image to read from</src>
2515    <src pseudo="true">Dummy for IR</src>
2516    <immediate name="sr_count" size="4" pseudo="true"/>
2517  </ins>
2518
2519  <ins name="TEX_GATHER" title="Texel gather" opcode="0x129" message="tex" unit="T">
        <!-- Declares the same fields as TEX_FETCH plus the integer_coordinates,
             fetch_component and shadow modifiers. -->
2520    <desc>Texture gather instruction.</desc>
2521    <slot/>
2522    <skip/>
2523    <register_type/>
2524    <register_width/>
2525    <write_mask/>
2526    <dimension/>
2527    <wide_indices/>
2528    <array_enable/>
2529    <texel_offset/>
2530    <integer_coordinates/>
2531    <fetch_component/>
2532    <regfmt pseudo="true"/>
2533    <shadow/>
2534
2535    <!-- Leave secondary_register_width as 0 -->
        <!-- NOTE(review): TEX_FETCH, TEX_SINGLE and TEX_GRADIENT write a bare
             sr_count element; only this entry adds count="sr_count". Confirm
             whether the attribute is required here or missing there. -->
2536    <sr_count count="sr_count"/>
2537    <sr_write_count/>
2538
2539    <sr write="true" flags="false"/>
2540    <sr read="true" flags="false"/>
2541    <src size="64">Image to read from</src>
2542    <src pseudo="true">Dummy source for IR</src>
2543    <immediate name="sr_count" size="4" pseudo="true"/>
2544  </ins>
2545
2546  <ins name="TEX_GRADIENT" title="Texture gradient" opcode="0x12A" message="tex" unit="T">
        <!-- Unlike TEX_FETCH, TEX_SINGLE and TEX_GATHER, this entry lists no
             array_enable, texel_offset or regfmt modifier; it lists
             force_delta_enable, lod_bias_disable, lod_clamp_disable and
             derivative_enable instead. -->
2547    <desc>Texture sample with explicit gradient.</desc>
2548    <slot/>
2549    <skip/>
2550    <register_type/>
2551    <register_width/>
2552    <write_mask/>
2553    <dimension/>
2554    <wide_indices/>
2555    <force_delta_enable/>
2556    <lod_bias_disable/>
2557    <lod_clamp_disable/>
2558    <derivative_enable/>
2559
2560    <sr_count/>
2561    <sr_write_count/>
2562
2563    <sr write="true" flags="false"/>
2564    <sr read="true" flags="false"/>
2565    <src size="64">Image to read from</src>
2566    <src pseudo="true">Dummy for IR</src>
2567    <immediate name="sr_count" size="4" pseudo="true"/>
2568  </ins>
2569
2570  <ins name="TEX_DUAL" title="Dual texture" opcode="0x12F" unused="true" unit="T">
        <!-- The only TEX_* entry that lists secondary_register_width. It is
             flagged unused="true" and, unlike the other TEX_* entries, has no
             message attribute, no pseudo source and no pseudo immediate. -->
2571    <desc>Pair of texture instructions.</desc>
2572    <slot/>
2573    <skip/>
2574    <register_type/>
2575    <register_width/>
2576    <secondary_register_width/>
2577    <write_mask/>
2578    <dimension/>
2579    <wide_indices/>
2580    <array_enable/>
2581    <texel_offset/>
2582    <shadow/>
2583    <lod_mode/>
2584
2585    <sr_count/>
2586    <sr_write_count/>
2587
2588    <sr write="true" flags="false"/>
2589    <sr read="true" flags="false"/>
2590    <src size="64">Image to read from</src>
2591  </ins>
2592
2593  <ins name="VAR_TEX_BUF_SINGLE" title="Fused varying-texturing" opcode="0x130" unused="true" unit="VT">
        <!-- Field list is identical to VAR_TEX_SINGLE (opcode 0x138); the two
             entries differ in name, opcode and the varying load named in the
             description. -->
2594    <desc>
2595      Only works for FP32 varyings. Performance characteristics are similar
2596      to LD_VAR_BUF_IMM_F32.v2.f32 followed by TEX, using both V and T units.
2597    </desc>
2598    <slot/>
2599    <skip/>
2600    <sample_and_update/>
2601    <register_type/>
2602    <vartex_register_width/>
2603    <dimension/>
2604    <array_enable/>
2605    <shadow/>
2606    <lod_mode/>
2607
2608    <sr_write_count/>
2609
2610    <sr write="true"/>
2611    <src size="64">Image to read from</src>
2612    <src>Varying offset</src>
2613  </ins>
2614
2615  <ins name="VAR_TEX_BUF_GATHER" title="Fused varying-texturing" opcode="0x131" unused="true" unit="VT">
        <!-- Field list is identical to VAR_TEX_GATHER (opcode 0x139). Compared
             with VAR_TEX_BUF_SINGLE it lists integer_coordinates and
             fetch_component and omits lod_mode. -->
2616    <desc>
2617      Only works for FP32 varyings. Performance characteristics are similar
2618      to LD_VAR_BUF_IMM_F32.v2.f32 followed by TEX, using both V and T units.
2619    </desc>
2620    <slot/>
2621    <skip/>
2622    <sample_and_update/>
2623    <register_type/>
2624    <vartex_register_width/>
2625    <dimension/>
2626    <array_enable/>
2627    <integer_coordinates/>
2628    <fetch_component/>
2629    <shadow/>
2630
2631    <sr_write_count/>
2632
2633    <sr write="true"/>
2634    <src size="64">Image to read from</src>
2635    <src>Varying offset</src>
2636  </ins>
2637
2638  <ins name="VAR_TEX_BUF_GRADIENT" title="Fused varying-texturing" opcode="0x132" unused="true" unit="VT">
        <!-- Field list is identical to VAR_TEX_GRADIENT (opcode 0x13A). Compared
             with VAR_TEX_BUF_SINGLE it lists lod_bias_disable and
             lod_clamp_disable in place of lod_mode. -->
2639    <desc>
2640      Only works for FP32 varyings. Performance characteristics are similar
2641      to LD_VAR_BUF_IMM_F32.v2.f32 followed by TEX, using both V and T units.
2642    </desc>
2643    <slot/>
2644    <skip/>
2645    <sample_and_update/>
2646    <register_type/>
2647    <vartex_register_width/>
2648    <dimension/>
2649    <array_enable/>
2650    <shadow/>
2651    <lod_bias_disable/>
2652    <lod_clamp_disable/>
2653
2654    <sr_write_count/>
2655
2656    <sr write="true"/>
2657    <src size="64">Image to read from</src>
2658    <src>Varying offset</src>
2659  </ins>
2660
2661  <ins name="VAR_TEX_BUF_DUAL" title="Fused varying-texturing" opcode="0x137" unused="true" unit="VT">
        <!-- Field list is identical to VAR_TEX_BUF_SINGLE and to VAR_TEX_DUAL
             (opcode 0x13F); the description names TEX_DUAL as the texturing
             half. -->
2662    <desc>
2663      Only works for FP32 varyings. Performance characteristics are similar
2664      to LD_VAR_BUF_IMM_F32.v2.f32 followed by TEX_DUAL, using both V and T units.
2665    </desc>
2666    <slot/>
2667    <skip/>
2668    <sample_and_update/>
2669    <register_type/>
2670    <vartex_register_width/>
2671    <dimension/>
2672    <array_enable/>
2673    <shadow/>
2674    <lod_mode/>
2675
2676    <sr_write_count/>
2677
2678    <sr write="true"/>
2679    <src size="64">Image to read from</src>
2680    <src>Varying offset</src>
2681  </ins>
2682
2683  <ins name="VAR_TEX_SINGLE" title="Fused varying-texturing" opcode="0x138" unused="true" unit="VT">
        <!-- Field list is identical to VAR_TEX_BUF_SINGLE (opcode 0x130); the
             two entries differ in name, opcode and the varying load named in
             the description. -->
2684    <desc>
2685      Only works for FP32 varyings. Performance characteristics are similar
2686      to LD_VAR_IMM_F32.v2.f32 followed by TEX, using both V and T units.
2687    </desc>
2688    <slot/>
2689    <skip/>
2690    <sample_and_update/>
2691    <register_type/>
2692    <vartex_register_width/>
2693    <dimension/>
2694    <array_enable/>
2695    <shadow/>
2696    <lod_mode/>
2697
2698    <sr_write_count/>
2699
2700    <sr write="true"/>
2701    <src size="64">Image to read from</src>
2702    <src>Varying offset</src>
2703  </ins>
2704
2705  <ins name="VAR_TEX_GATHER" title="Fused varying-texturing" opcode="0x139" unused="true" unit="VT">
        <!-- Field list is identical to VAR_TEX_BUF_GATHER (opcode 0x131).
             Compared with VAR_TEX_SINGLE it lists integer_coordinates and
             fetch_component and omits lod_mode. -->
2706    <desc>
2707      Only works for FP32 varyings. Performance characteristics are similar
2708      to LD_VAR_IMM_F32.v2.f32 followed by TEX, using both V and T units.
2709    </desc>
2710    <slot/>
2711    <skip/>
2712    <sample_and_update/>
2713    <register_type/>
2714    <vartex_register_width/>
2715    <dimension/>
2716    <array_enable/>
2717    <integer_coordinates/>
2718    <fetch_component/>
2719    <shadow/>
2720
2721    <sr_write_count/>
2722
2723    <sr write="true"/>
2724    <src size="64">Image to read from</src>
2725    <src>Varying offset</src>
2726  </ins>
2727
2728  <ins name="VAR_TEX_GRADIENT" title="Fused varying-texturing" opcode="0x13A" unused="true" unit="VT">
        <!-- Field list is identical to VAR_TEX_BUF_GRADIENT (opcode 0x132).
             Compared with VAR_TEX_SINGLE it lists lod_bias_disable and
             lod_clamp_disable in place of lod_mode. -->
2729    <desc>
2730      Only works for FP32 varyings. Performance characteristics are similar
2731      to LD_VAR_IMM_F32.v2.f32 followed by TEX, using both V and T units.
2732    </desc>
2733    <slot/>
2734    <skip/>
2735    <sample_and_update/>
2736    <register_type/>
2737    <vartex_register_width/>
2738    <dimension/>
2739    <array_enable/>
2740    <shadow/>
2741    <lod_bias_disable/>
2742    <lod_clamp_disable/>
2743
2744    <sr_write_count/>
2745
2746    <sr write="true"/>
2747    <src size="64">Image to read from</src>
2748    <src>Varying offset</src>
2749  </ins>
2750
2751  <ins name="VAR_TEX_DUAL" title="Fused varying-texturing" opcode="0x13F" unused="true" unit="VT">
        <!-- Field list is identical to VAR_TEX_SINGLE and to VAR_TEX_BUF_DUAL
             (opcode 0x137); the description names TEX_DUAL as the texturing
             half. -->
2752    <desc>
2753      Only works for FP32 varyings. Performance characteristics are similar
2754      to LD_VAR_IMM_F32.v2.f32 followed by TEX_DUAL, using both V and T units.
2755    </desc>
2756    <slot/>
2757    <skip/>
2758    <sample_and_update/>
2759    <register_type/>
2760    <vartex_register_width/>
2761    <dimension/>
2762    <array_enable/>
2763    <shadow/>
2764    <lod_mode/>
2765
2766    <sr_write_count/>
2767
2768    <sr write="true"/>
2769    <src size="64">Image to read from</src>
2770    <src>Varying offset</src>
2771  </ins>
2772
2773  <ins name="FMA_RSCALE.f32" title="Fused floating-point multiply add with exponent bias" dests="1" opcode="0x160" unused="true" unit="FMA">
        <!-- Sources A, B and C accept absneg; the exponent bias D is declared
             without modifiers. -->
2774    <desc>
2775      First calculates $A \cdot B + C$ and then biases the exponent by D. Used in
2776      special transcendental function sequences. It should not be used for
2777      general code as its special case handling differs from two back-to-back
2778      `FMA.f32` operations. Equivalent to `FMA.f32` back-to-back with
2779      `LDEXP.f32`.
2780    </desc>
2781    <clamp/>
2782    <src absneg="true">A</src>
2783    <src absneg="true">B</src>
2784    <src absneg="true">C</src>
2785    <src>D</src>
2786  </ins>
2787
2788  <ins name="FMA_RSCALE_N.f32" title="Fused floating-point multiply add with exponent bias and zero override" dests="1" opcode="0x161" unused="true" unit="FMA">
        <!-- Same modifier and source list as FMA_RSCALE.f32; the zero-override
             behaviour is specified only in the description below. -->
2789    <desc>
2790      First calculates $A \cdot B + C$ and then biases the exponent by D. If $A
2791      = 0$ or $B = 0$, the multiply $A \cdot B$ is treated as zero even if an
2792      ordinary multiply would return NaN. Used in special transcendental
2793      function sequences. It should not be used for general code as its special
2794      case handling differs from two back-to-back `FMA.f32` operations.
2795      Equivalent to `FMA.f32` back-to-back with `LDEXP.f32`.
2796    </desc>
2797    <clamp/>
2798    <src absneg="true">A</src>
2799    <src absneg="true">B</src>
2800    <src absneg="true">C</src>
2801    <src>D</src>
2802  </ins>
2803
2804  <ins name="FMA_RSCALE_LEFT.f32" title="Fused floating-point multiply add with exponent bias and asymmetric zero handling" dests="1" opcode="0x162" unused="true" unit="FMA">
        <!-- Same modifier and source list as FMA_RSCALE.f32; the asymmetric
             zero handling is specified only in the description below. -->
2805    <desc>
2806      First calculates $A \cdot B + C$ and then biases the exponent by D. If $A
2807      = 0$ or $B = 0$, the multiply is treated as $A$ even if an
2808      ordinary multiply would return NaN. Used in special transcendental
2809      function sequences. It should not be used for general code as its special
2810      case handling differs from two back-to-back `FMA.f32` operations.
2811      Equivalent to `FMA.f32` back-to-back with `LDEXP.f32`.
2812    </desc>
2813    <clamp/>
2814    <src absneg="true">A</src>
2815    <src absneg="true">B</src>
2816    <src absneg="true">C</src>
2817    <src>D</src>
2818  </ins>
2819
2820  <ins name="FMA_RSCALE_SCALE16.f32" title="Fused floating-point multiply add with 16-bit exponent bias" dests="1" opcode="0x163" unused="true" unit="FMA">
        <!-- Same modifier and source list as FMA_RSCALE.f32; the 16-bit
             interpretation of D is specified only in the description below. -->
2821    <desc>
2822      First calculates $A \cdot B + C$ and then biases the exponent by D,
2823      interpreted as a 16-bit value. Used in special transcendental function
2824      sequences. It should not be used for general code as its special case
2825      handling differs from two back-to-back `FMA.f32` operations. Equivalent
2826      to `FMA.f32` back-to-back with `LDEXP.f32`.
2827    </desc>
2828    <clamp/>
2829    <src absneg="true">A</src>
2830    <src absneg="true">B</src>
2831    <src absneg="true">C</src>
2832    <src>D</src>
2833  </ins>
2834
2835</valhall>
2836