// Copyright © 2022 Collabora, Ltd.
// SPDX-License-Identifier: MIT

#![allow(non_upper_case_globals)]

use crate::api::GetDebugFlags;
use crate::api::DEBUG;
use crate::builder::*;
use crate::ir::*;
use crate::sph::{OutputTopology, PixelImap};

use nak_bindings::*;

use compiler::bindings::*;
use compiler::cfg::CFGBuilder;
use compiler::nir::*;
use compiler::nir_instr_printer::NirInstrPrinter;
use std::cmp::max;
use std::collections::{HashMap, HashSet};
use std::ops::Index;

fn init_info_from_nir(nak: &nak_compiler, nir: &nir_shader) -> ShaderInfo {
    ShaderInfo {
        num_gprs: 0,
        num_instrs: 0,
        num_control_barriers: 0,
        slm_size: nir.scratch_size,
        max_crs_depth: 0,
        uses_global_mem: false,
        writes_global_mem: false,
        // TODO: handle this.
        uses_fp64: false,
        stage: match nir.info.stage() {
            MESA_SHADER_COMPUTE => {
                ShaderStageInfo::Compute(ComputeShaderInfo {
                    local_size: [
                        nir.info.workgroup_size[0],
                        nir.info.workgroup_size[1],
                        nir.info.workgroup_size[2],
                    ],
                    smem_size: nir.info.shared_size.try_into().unwrap(),
                })
            }
            MESA_SHADER_VERTEX => ShaderStageInfo::Vertex,
            MESA_SHADER_FRAGMENT => {
                let info_fs = unsafe { &nir.info.__bindgen_anon_1.fs };
                ShaderStageInfo::Fragment(FragmentShaderInfo {
                    uses_kill: false,
                    does_interlock: false,
                    post_depth_coverage: info_fs.post_depth_coverage(),
                    early_fragment_tests: info_fs.early_fragment_tests(),
                    uses_sample_shading: info_fs.uses_sample_shading(),
                })
            }
            MESA_SHADER_GEOMETRY => {
                let info_gs = unsafe { &nir.info.__bindgen_anon_1.gs };
                let output_topology = match info_gs.output_primitive {
                    MESA_PRIM_POINTS => OutputTopology::PointList,
                    MESA_PRIM_LINE_STRIP => OutputTopology::LineStrip,
                    MESA_PRIM_TRIANGLE_STRIP => OutputTopology::TriangleStrip,
                    _ => panic!(
                        "Invalid GS output primitive {}",
                        info_gs.output_primitive
                    ),
                };

                ShaderStageInfo::Geometry(GeometryShaderInfo {
                    // TODO: Should be set if VK_NV_geometry_shader_passthrough is in use.
                    passthrough_enable: false,
                    stream_out_mask: info_gs.active_stream_mask(),
                    threads_per_input_primitive: info_gs.invocations,
                    output_topology: output_topology,
                    max_output_vertex_count: info_gs.vertices_out,
                })
            }
            MESA_SHADER_TESS_CTRL => {
                let info_tess = unsafe { &nir.info.__bindgen_anon_1.tess };
                ShaderStageInfo::TessellationInit(TessellationInitShaderInfo {
                    per_patch_attribute_count: 6,
                    threads_per_patch: info_tess.tcs_vertices_out,
                })
            }
            MESA_SHADER_TESS_EVAL => {
                let info_tess = unsafe { &nir.info.__bindgen_anon_1.tess };
                ShaderStageInfo::Tessellation(TessellationShaderInfo {
                    domain: match info_tess._primitive_mode {
                        TESS_PRIMITIVE_TRIANGLES => {
                            TessellationDomain::Triangle
                        }
                        TESS_PRIMITIVE_QUADS => TessellationDomain::Quad,
                        TESS_PRIMITIVE_ISOLINES => TessellationDomain::Isoline,
                        _ => panic!("Invalid tess_primitive_mode"),
                    },
                    spacing: match info_tess.spacing() {
                        TESS_SPACING_EQUAL => TessellationSpacing::Integer,
                        TESS_SPACING_FRACTIONAL_ODD => {
                            TessellationSpacing::FractionalOdd
                        }
                        TESS_SPACING_FRACTIONAL_EVEN => {
                            TessellationSpacing::FractionalEven
                        }
                        _ => panic!("Invalid gl_tess_spacing"),
                    },
                    primitives: if info_tess.point_mode() {
                        TessellationPrimitives::Points
                    } else if info_tess._primitive_mode
                        == TESS_PRIMITIVE_ISOLINES
                    {
                        TessellationPrimitives::Lines
                    } else if info_tess.ccw() {
                        TessellationPrimitives::TrianglesCCW
                    } else {
                        TessellationPrimitives::TrianglesCW
                    },
                })
            }
            _ => panic!("Unknown shader stage"),
        },
        io: match nir.info.stage() {
            MESA_SHADER_COMPUTE => ShaderIoInfo::None,
            MESA_SHADER_FRAGMENT => ShaderIoInfo::Fragment(FragmentIoInfo {
                sysvals_in: SysValInfo {
                    // Required on fragment shaders, otherwise it causes a trap.
                    ab: 1 << 31,
                    c: 0,
                },
                sysvals_in_d: [PixelImap::Unused; 8],
                attr_in: [PixelImap::Unused; 128],
                barycentric_attr_in: [0; 4],
                reads_sample_mask: false,
                writes_color: 0,
                writes_sample_mask: false,
                writes_depth: false,
            }),
            MESA_SHADER_VERTEX
            | MESA_SHADER_GEOMETRY
            | MESA_SHADER_TESS_CTRL
            | MESA_SHADER_TESS_EVAL => {
                let num_clip = nir.info.clip_distance_array_size();
                let num_cull = nir.info.cull_distance_array_size();
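                // Clip distances occupy the low bits of the enable mask and
                // cull distances sit immediately above them.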
                let clip_enable = (1_u32 << num_clip) - 1;
                let cull_enable = ((1_u32 << num_cull) - 1) << num_clip;

                ShaderIoInfo::Vtg(VtgIoInfo {
                    sysvals_in: SysValInfo::default(),
                    sysvals_in_d: 0,
                    sysvals_out: SysValInfo::default(),
                    sysvals_out_d: 0,
                    attr_in: [0; 4],
                    attr_out: [0; 4],

                    // TODO: figure out how to fill this.
                    store_req_start: u8::MAX,
                    store_req_end: 0,

                    clip_enable: clip_enable.try_into().unwrap(),
                    cull_enable: cull_enable.try_into().unwrap(),
                    xfb: if nir.xfb_info.is_null() {
                        None
                    } else {
                        Some(Box::new(unsafe {
                            nak_xfb_from_nir(nak, nir.xfb_info)
                        }))
                    },
                })
            }
            _ => panic!("Unknown shader stage"),
        },
    }
}

fn alloc_ssa_for_nir(b: &mut impl SSABuilder, ssa: &nir_def) -> Vec<SSAValue> {
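    // Booleans live in predicate registers, one per component.  Everything
    // else gets packed into as many 32-bit GPRs as the data requires.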
    let (file, comps) = if ssa.bit_size == 1 {
        (RegFile::Pred, ssa.num_components)
    } else {
        let bits = ssa.bit_size * ssa.num_components;
        (RegFile::GPR, bits.div_ceil(32))
    };

    let mut vec = Vec::new();
    for _ in 0..comps {
        vec.push(b.alloc_ssa(file, 1)[0]);
    }
    vec
}

struct PhiAllocMap<'a> {
    alloc: &'a mut PhiAllocator,
    map: HashMap<(u32, u8), u32>,
}

impl<'a> PhiAllocMap<'a> {
    fn new(alloc: &'a mut PhiAllocator) -> PhiAllocMap<'a> {
        PhiAllocMap {
            alloc: alloc,
            map: HashMap::new(),
        }
    }

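    // Hands out one phi ID per (NIR def index, vector component) pair,
    // allocating a fresh ID from the PhiAllocator on first use.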
    fn get_phi_id(&mut self, phi: &nir_phi_instr, comp: u8) -> u32 {
        *self
            .map
            .entry((phi.def.index, comp))
            .or_insert_with(|| self.alloc.alloc())
    }
}

struct PerSizeFloatControls {
    pub ftz: bool,
    pub rnd_mode: FRndMode,
}

struct ShaderFloatControls {
    pub fp16: PerSizeFloatControls,
    pub fp32: PerSizeFloatControls,
    pub fp64: PerSizeFloatControls,
}

impl Default for ShaderFloatControls {
    fn default() -> Self {
        Self {
            fp16: PerSizeFloatControls {
                ftz: false,
                rnd_mode: FRndMode::NearestEven,
            },
            fp32: PerSizeFloatControls {
                ftz: true, // Default FTZ on fp32
                rnd_mode: FRndMode::NearestEven,
            },
            fp64: PerSizeFloatControls {
                ftz: false,
                rnd_mode: FRndMode::NearestEven,
            },
        }
    }
}

impl ShaderFloatControls {
    fn from_nir(nir: &nir_shader) -> ShaderFloatControls {
        let nir_fc = nir.info.float_controls_execution_mode;
        let mut fc: ShaderFloatControls = Default::default();

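        // The NIR float-controls bits are independent per bit size.  When
        // neither the PRESERVE nor the FLUSH_TO_ZERO bit is set for a given
        // size, the defaults from Default::default() above are kept.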
        if (nir_fc & FLOAT_CONTROLS_DENORM_PRESERVE_FP16) != 0 {
            fc.fp16.ftz = false;
        } else if (nir_fc & FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP16) != 0 {
            fc.fp16.ftz = true;
        }
        if (nir_fc & FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP16) != 0 {
            fc.fp16.rnd_mode = FRndMode::NearestEven;
        } else if (nir_fc & FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16) != 0 {
            fc.fp16.rnd_mode = FRndMode::Zero;
        }

        if (nir_fc & FLOAT_CONTROLS_DENORM_PRESERVE_FP32) != 0 {
            fc.fp32.ftz = false;
        } else if (nir_fc & FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP32) != 0 {
            fc.fp32.ftz = true;
        }
        if (nir_fc & FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP32) != 0 {
            fc.fp32.rnd_mode = FRndMode::NearestEven;
        } else if (nir_fc & FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP32) != 0 {
            fc.fp32.rnd_mode = FRndMode::Zero;
        }

        if (nir_fc & FLOAT_CONTROLS_DENORM_PRESERVE_FP64) != 0 {
            fc.fp64.ftz = false;
        } else if (nir_fc & FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP64) != 0 {
            fc.fp64.ftz = true;
        }
        if (nir_fc & FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP64) != 0 {
            fc.fp64.rnd_mode = FRndMode::NearestEven;
        } else if (nir_fc & FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64) != 0 {
            fc.fp64.rnd_mode = FRndMode::Zero;
        }

        fc
    }
}

impl Index<FloatType> for ShaderFloatControls {
    type Output = PerSizeFloatControls;

    fn index(&self, idx: FloatType) -> &PerSizeFloatControls {
        match idx {
            FloatType::F16 => &self.fp16,
            FloatType::F32 => &self.fp32,
            FloatType::F64 => &self.fp64,
        }
    }
}

#[derive(Clone, Copy, Eq, Hash, PartialEq)]
enum SyncType {
    Sync,
    Brk,
    Cont,
}

struct ShaderFromNir<'a> {
    nir: &'a nir_shader,
    sm: &'a dyn ShaderModel,
    info: ShaderInfo,
    float_ctl: ShaderFloatControls,
    cfg: CFGBuilder<u32, BasicBlock>,
    label_alloc: LabelAllocator,
    block_label: HashMap<u32, Label>,
    bar_label: HashMap<u32, Label>,
    sync_blocks: HashSet<u32>,
    crs: Vec<(u32, SyncType)>,
    fs_out_regs: [SSAValue; 34],
    end_block_id: u32,
    ssa_map: HashMap<u32, Vec<SSAValue>>,
    saturated: HashSet<*const nir_def>,
    nir_instr_printer: NirInstrPrinter,
}

impl<'a> ShaderFromNir<'a> {
    fn new(
        nak: &nak_compiler,
        nir: &'a nir_shader,
        sm: &'a dyn ShaderModel,
    ) -> Self {
        Self {
            nir: nir,
            sm: sm,
            info: init_info_from_nir(nak, nir),
            float_ctl: ShaderFloatControls::from_nir(nir),
            cfg: CFGBuilder::new(),
            label_alloc: LabelAllocator::new(),
            block_label: HashMap::new(),
            bar_label: HashMap::new(),
            sync_blocks: HashSet::new(),
            crs: Vec::new(),
            fs_out_regs: [SSAValue::NONE; 34],
            end_block_id: 0,
            ssa_map: HashMap::new(),
            saturated: HashSet::new(),
            nir_instr_printer: NirInstrPrinter::new().unwrap(),
        }
    }

    fn get_block_label(&mut self, block: &nir_block) -> Label {
        *self
            .block_label
            .entry(block.index)
            .or_insert_with(|| self.label_alloc.alloc())
    }

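    // crs models the control re-convergence stack (the SSY/BRK/CONT stack on
    // pre-Volta hardware).  Each push records a block we will pop back to,
    // and max_crs_depth tracks the deepest the stack ever gets.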
    fn push_crs(&mut self, target: &nir_block, sync_type: SyncType) {
        self.sync_blocks.insert(target.index);
        self.crs.push((target.index, sync_type));
        let crs_depth = u32::try_from(self.crs.len()).unwrap();
        self.info.max_crs_depth = max(self.info.max_crs_depth, crs_depth);
    }

    fn pop_crs(&mut self, target: &nir_block, sync_type: SyncType) {
        if let Some((top_index, top_sync_type)) = self.crs.pop() {
            assert!(top_index == target.index);
            assert!(top_sync_type == sync_type);
        } else {
            panic!("Tried to pop an empty stack");
        }
    }

    fn peek_crs(&self, target: &nir_block) -> Option<SyncType> {
        for (i, (index, sync_type)) in self.crs.iter().enumerate().rev() {
            if *index != target.index {
                continue;
            }

            match sync_type {
                SyncType::Sync => {
                    // Sync must always be top-of-stack
                    assert!(i == self.crs.len() - 1);
                }
                SyncType::Brk => {
                    // Brk cannot skip over another Brk
                    for (_, inner_sync) in &self.crs[(i + 1)..] {
                        assert!(*inner_sync != SyncType::Brk);
                    }
                }
                SyncType::Cont => {
                    // Cont can only skip over Sync
                    for (_, inner_sync) in &self.crs[(i + 1)..] {
                        assert!(*inner_sync == SyncType::Sync);
                    }
                }
            }

            return Some(*sync_type);
        }

        assert!(!self.sync_blocks.contains(&target.index));
        None
    }

    fn get_ssa(&mut self, ssa: &nir_def) -> &[SSAValue] {
        self.ssa_map.get(&ssa.index).unwrap()
    }

    fn set_ssa(&mut self, def: &nir_def, vec: Vec<SSAValue>) {
        if def.bit_size == 1 {
            for s in &vec {
                assert!(s.is_predicate());
            }
        } else {
            for s in &vec {
                assert!(!s.is_predicate());
            }
            let bits =
                usize::from(def.bit_size) * usize::from(def.num_components);
            assert!(vec.len() == bits.div_ceil(32));
        }
        self.ssa_map
            .entry(def.index)
            .and_modify(|_| panic!("Cannot set an SSA def twice"))
            .or_insert(vec);
    }

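    // Returns the 32-bit SSA value(s) holding NIR component c along with the
    // byte offset of that component within the dword (non-zero only in the
    // 8- and 16-bit cases).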
    fn get_ssa_comp(&mut self, def: &nir_def, c: u8) -> (SSARef, u8) {
        let vec = self.get_ssa(def);
        match def.bit_size {
            1 => (vec[usize::from(c)].into(), 0),
            8 => (vec[usize::from(c / 4)].into(), c % 4),
            16 => (vec[usize::from(c / 2)].into(), (c * 2) % 4),
            32 => (vec[usize::from(c)].into(), 0),
            64 => {
                let comps =
                    [vec[usize::from(c) * 2 + 0], vec[usize::from(c) * 2 + 1]];
                (comps.into(), 0)
            }
            _ => panic!("Unsupported bit size: {}", def.bit_size),
        }
    }

    fn get_ssa_ref(&mut self, src: &nir_src) -> SSARef {
        SSARef::try_from(self.get_ssa(src.as_def())).unwrap()
    }

    fn get_src(&mut self, src: &nir_src) -> Src {
        self.get_ssa_ref(src).into()
    }

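    // Splits an I/O address into an SSA base plus a constant offset small
    // enough for an imm_bits-bit immediate.  A null base means the address
    // folded to a constant, in which case Zero is returned as the base.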
    fn get_io_addr_offset(
        &mut self,
        addr: &nir_src,
        imm_bits: u8,
    ) -> (Src, i32) {
        let addr = addr.as_def();
        let addr_offset = unsafe {
            nak_get_io_addr_offset(addr as *const _ as *mut _, imm_bits)
        };

        if let Some(base_def) = std::ptr::NonNull::new(addr_offset.base.def) {
            let base_def = unsafe { base_def.as_ref() };
            let base_comp = u8::try_from(addr_offset.base.comp).unwrap();
            let (base, _) = self.get_ssa_comp(base_def, base_comp);
            (base.into(), addr_offset.offset)
        } else {
            (SrcRef::Zero.into(), addr_offset.offset)
        }
    }

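    // Constant-buffer offsets only get a 16-bit immediate.  If the constant
    // part doesn't fit, fall back to the whole address in a register with a
    // zero immediate.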
    fn get_cbuf_addr_offset(&mut self, addr: &nir_src) -> (Src, u16) {
        let (off, off_imm) = self.get_io_addr_offset(addr, 16);
        if let Ok(off_imm_u16) = u16::try_from(off_imm) {
            (off, off_imm_u16)
        } else {
            (self.get_src(addr), 0)
        }
    }

    fn set_dst(&mut self, def: &nir_def, ssa: SSARef) {
        self.set_ssa(def, (*ssa).into());
    }

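    // If every use of this def is an fsat, fold the saturate into the
    // producing ALU op and remember the def so the fsat itself can later be
    // turned into a plain copy (see nir_op_fsat below).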
    fn try_saturate_alu_dst(&mut self, def: &nir_def) -> bool {
        if def.all_uses_are_fsat() {
            self.saturated.insert(def as *const _);
            true
        } else {
            false
        }
    }

    fn alu_src_is_saturated(&self, src: &nir_alu_src) -> bool {
        self.saturated
            .get(&(src.src.as_def() as *const _))
            .is_some()
    }

    fn parse_alu(&mut self, b: &mut impl SSABuilder, alu: &nir_alu_instr) {
        // Handle vectors and pack ops as a special case since they're the only
        // ALU ops that can produce more than 16B. They are also the only ALU
        // ops which we allow to consume small (8 and 16-bit) vector data
        // scattered across multiple dwords.
        match alu.op {
            nir_op_mov
            | nir_op_pack_32_4x8_split
            | nir_op_pack_32_2x16_split
            | nir_op_pack_64_2x32_split
            | nir_op_vec2
            | nir_op_vec3
            | nir_op_vec4
            | nir_op_vec5
            | nir_op_vec8
            | nir_op_vec16 => {
                let src_bit_size = alu.get_src(0).src.bit_size();
                let bits = usize::from(alu.def.num_components)
                    * usize::from(alu.def.bit_size);
                // Collect the sources into a vec with src_bit_size per SSA
                // value in the vec.  This implicitly makes 64-bit sources look
                // like two 32-bit values.
                let mut srcs = Vec::new();
                if alu.op == nir_op_mov {
                    let src = alu.get_src(0);
                    for c in 0..alu.def.num_components {
                        let s = src.swizzle[usize::from(c)];
                        let (src, byte) =
                            self.get_ssa_comp(src.src.as_def(), s);
                        for ssa in src.iter() {
                            srcs.push((*ssa, byte));
                        }
                    }
                } else {
                    for src in alu.srcs_as_slice().iter() {
                        let s = src.swizzle[0];
                        let (src, byte) =
                            self.get_ssa_comp(src.src.as_def(), s);
                        for ssa in src.iter() {
                            srcs.push((*ssa, byte));
                        }
                    }
                }

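                // 32- and 64-bit data is already dword-aligned, but 8- and
                // 16-bit components have to be repacked into dwords with
                // byte-select (PRMT) ops.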
                let mut comps = Vec::new();
                match src_bit_size {
                    1 | 32 | 64 => {
                        for (ssa, _) in srcs {
                            comps.push(ssa);
                        }
                    }
                    8 => {
                        for dc in 0..bits.div_ceil(32) {
                            let mut psrc = [None; 4];
                            let mut psel = [0_u8; 4];

                            for b in 0..4 {
                                let sc = dc * 4 + b;
                                if sc < srcs.len() {
                                    let (ssa, byte) = srcs[sc];
                                    // Deduplicate psrc entries
                                    for i in 0..4_u8 {
                                        let psrc_i = &mut psrc[usize::from(i)];
                                        if psrc_i.is_none() {
                                            *psrc_i = Some(ssa.into());
                                        } else if *psrc_i
                                            != Some(Src::from(ssa))
                                        {
                                            continue;
                                        }
                                        psel[b] = i * 4 + byte;
                                        break;
                                    }
                                }
                            }

                            let psrc = {
                                let mut res = [Src::new_zero(); 4];

                                for (idx, src) in psrc.iter().enumerate() {
                                    if let Some(src) = src {
                                        res[idx] = *src;
                                    }
                                }

                                res
                            };

                            comps.push(b.prmt4(psrc, psel)[0]);
                        }
                    }
                    16 => {
                        for dc in 0..bits.div_ceil(32) {
                            let mut psrc = [Src::new_zero(); 2];
                            let mut psel = [0_u8; 4];

                            for w in 0..2 {
                                let sc = dc * 2 + w;
                                if sc < srcs.len() {
                                    let (ssa, byte) = srcs[sc];
                                    psrc[w] = ssa.into();
                                    psel[w * 2 + 0] = (w as u8 * 4) + byte;
                                    psel[w * 2 + 1] = (w as u8 * 4) + byte + 1;
                                }
                            }
                            comps.push(b.prmt(psrc[0], psrc[1], psel)[0]);
                        }
                    }
                    _ => panic!("Unknown bit size: {src_bit_size}"),
                }

                self.set_ssa(&alu.def, comps);
                return;
            }
            _ => (),
        }

        let nir_srcs = alu.srcs_as_slice();
        let mut srcs: Vec<Src> = Vec::new();
        for (i, alu_src) in nir_srcs.iter().enumerate() {
            let bit_size = alu_src.src.bit_size();
            let comps = alu.src_components(i.try_into().unwrap());
            let ssa = self.get_ssa(alu_src.src.as_def());

            match bit_size {
                1 => {
                    assert!(comps == 1);
                    let s = usize::from(alu_src.swizzle[0]);
                    srcs.push(ssa[s].into());
                }
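                // Gather the swizzled 8/16-bit components into one dword by
                // building a PRMT byte-select from the source dwords.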
                8 | 16 => {
                    let num_bytes = usize::from(comps * (bit_size / 8));
                    assert!(num_bytes <= 4);

                    let mut bytes = [0_u8; 4];
                    for c in 0..usize::from(comps) {
                        let cs = alu_src.swizzle[c];
                        if bit_size == 8 {
                            bytes[c] = cs;
                        } else {
                            bytes[c * 2 + 0] = cs * 2 + 0;
                            bytes[c * 2 + 1] = cs * 2 + 1;
                        }
                    }

                    let mut prmt_srcs = [Src::new_zero(); 4];
                    let mut prmt = [0_u8; 4];
                    for b in 0..num_bytes {
                        for (ds, s) in prmt_srcs.iter_mut().enumerate() {
                            let dw = ssa[usize::from(bytes[b] / 4)];
                            if s.is_zero() {
                                *s = dw.into();
                            } else if *s != Src::from(dw) {
                                continue;
                            }
                            prmt[usize::from(b)] =
                                (ds as u8) * 4 + (bytes[b] % 4);
                            break;
                        }
                    }

                    srcs.push(b.prmt4(prmt_srcs, prmt).into());
                }
                32 => {
                    assert!(comps == 1);
                    let s = usize::from(alu_src.swizzle[0]);
                    srcs.push(ssa[s].into());
                }
                64 => {
                    assert!(comps == 1);
                    let s = usize::from(alu_src.swizzle[0]);
                    srcs.push([ssa[s * 2], ssa[s * 2 + 1]].into());
                }
                _ => panic!("Invalid bit size: {bit_size}"),
            }
        }

        // Restricts an F16v2 source to just x if the ALU op is single-component. This
        // must only be called for per-component sources (see nir_op_info::output_sizes
        // for more details).
        let restrict_f16v2_src = |mut src: Src| {
            if alu.def.num_components == 1 {
                src.src_swizzle = SrcSwizzle::Xx;
            }
            src
        };

        let dst: SSARef = match alu.op {
            nir_op_b2b1 => {
                assert!(alu.get_src(0).bit_size() == 32);
                b.isetp(IntCmpType::I32, IntCmpOp::Ne, srcs[0], 0.into())
            }
            nir_op_b2b32 | nir_op_b2i8 | nir_op_b2i16 | nir_op_b2i32 => {
                b.sel(srcs[0].bnot(), 0.into(), 1.into())
            }
            nir_op_b2i64 => {
                let lo = b.sel(srcs[0].bnot(), 0.into(), 1.into());
                let hi = b.copy(0.into());
                [lo[0], hi[0]].into()
            }
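            // 0x3c00 is 1.0 in fp16.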
            nir_op_b2f16 => b.sel(srcs[0].bnot(), 0.into(), 0x3c00.into()),
            nir_op_b2f32 => {
                b.sel(srcs[0].bnot(), 0.0_f32.into(), 1.0_f32.into())
            }
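            // 0x3ff00000 is the high dword of 1.0 f64; the low dword is zero.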
            nir_op_b2f64 => {
                let lo = b.copy(0.into());
                let hi = b.sel(srcs[0].bnot(), 0.into(), 0x3ff00000.into());
                [lo[0], hi[0]].into()
            }
            nir_op_bcsel => b.sel(srcs[0], srcs[1], srcs[2]),
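            // bfm builds a mask of srcs[0] bits starting at bit srcs[1],
            // which maps directly onto BMSK.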
            nir_op_bfm => {
                let dst = b.alloc_ssa(RegFile::GPR, 1);
                b.push_op(OpBMsk {
                    dst: dst.into(),
                    pos: srcs[1],
                    width: srcs[0],
                    wrap: true,
                });
                dst
            }
            nir_op_bit_count => {
                let dst = b.alloc_ssa(RegFile::GPR, 1);
                b.push_op(OpPopC {
                    dst: dst.into(),
                    src: srcs[0],
                });
                dst
            }
            nir_op_bitfield_reverse => b.brev(srcs[0]),
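            // BFE wants the bit position in byte 0 and the width in byte 1 of
            // its range operand, so pack srcs[1] (offset) and srcs[2] (bits)
            // together with an indexed PRMT.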
            nir_op_ibitfield_extract | nir_op_ubitfield_extract => {
                let range = b.alloc_ssa(RegFile::GPR, 1);
                b.push_op(OpPrmt {
                    dst: range.into(),
                    srcs: [srcs[1], srcs[2]],
                    sel: 0x0040.into(),
                    mode: PrmtMode::Index,
                });

                let dst = b.alloc_ssa(RegFile::GPR, 1);
                b.push_op(OpBfe {
                    dst: dst.into(),
                    base: srcs[0],
                    signed: !matches!(alu.op, nir_op_ubitfield_extract),
                    range: range.into(),
                    reverse: false,
                });
                dst
            }
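            // PRMT does the extraction here: selector 4 picks byte 0 of the
            // second source (zero), and setting bit 3 of a selector
            // replicates the sign (MSB) of the selected byte, which provides
            // the sign extension for the _i8/_i16 variants.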
            nir_op_extract_u8 | nir_op_extract_i8 | nir_op_extract_u16
            | nir_op_extract_i16 => {
                let src1 = alu.get_src(1);
                let elem = src1.src.comp_as_uint(src1.swizzle[0]).unwrap();
                let elem = u8::try_from(elem).unwrap();

                match alu.op {
                    nir_op_extract_u8 => {
                        assert!(elem < 4);
                        let byte = elem;
                        let zero = 4;
                        b.prmt(srcs[0], 0.into(), [byte, zero, zero, zero])
                    }
                    nir_op_extract_i8 => {
                        assert!(elem < 4);
                        let byte = elem;
                        let sign = byte | 0x8;
                        b.prmt(srcs[0], 0.into(), [byte, sign, sign, sign])
                    }
                    nir_op_extract_u16 => {
                        assert!(elem < 2);
                        let byte = elem * 2;
                        let zero = 4;
                        b.prmt(srcs[0], 0.into(), [byte, byte + 1, zero, zero])
                    }
                    nir_op_extract_i16 => {
                        assert!(elem < 2);
                        let byte = elem * 2;
                        let sign = (byte + 1) | 0x8;
                        b.prmt(srcs[0], 0.into(), [byte, byte + 1, sign, sign])
                    }
                    _ => panic!("Unknown extract op: {}", alu.op),
                }
            }
            nir_op_f2f16 | nir_op_f2f16_rtne | nir_op_f2f16_rtz
            | nir_op_f2f32 | nir_op_f2f64 => {
                let src_bits = alu.get_src(0).src.bit_size();
                let dst_bits = alu.def.bit_size();
                let src_type = FloatType::from_bits(src_bits.into());
                let dst_type = FloatType::from_bits(dst_bits.into());

                let dst = b.alloc_ssa(RegFile::GPR, dst_bits.div_ceil(32));
                b.push_op(OpF2F {
                    dst: dst.into(),
                    src: srcs[0],
                    src_type: FloatType::from_bits(src_bits.into()),
                    dst_type: dst_type,
                    rnd_mode: match alu.op {
                        nir_op_f2f16_rtne => FRndMode::NearestEven,
                        nir_op_f2f16_rtz => FRndMode::Zero,
                        _ => self.float_ctl[dst_type].rnd_mode,
                    },
                    ftz: if src_bits < dst_bits {
                        self.float_ctl[src_type].ftz
                    } else {
                        self.float_ctl[dst_type].ftz
                    },
                    high: false,
                    integer_rnd: false,
                });
                dst
            }
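            // There is no find-lowest-set instruction, so reverse the bits
            // and use FLO (find leading one) with return_shift_amount to turn
            // the highest-set-bit search into a lowest-set-bit one.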
            nir_op_find_lsb => {
                let rev = b.brev(srcs[0]);
                let dst = b.alloc_ssa(RegFile::GPR, 1);
                b.push_op(OpFlo {
                    dst: dst.into(),
                    src: rev.into(),
                    signed: false,
                    return_shift_amount: true,
                });
                dst
            }
            nir_op_f2i8 | nir_op_f2i16 | nir_op_f2i32 | nir_op_f2i64
            | nir_op_f2u8 | nir_op_f2u16 | nir_op_f2u32 | nir_op_f2u64 => {
                let src_bits = usize::from(alu.get_src(0).bit_size());
                let dst_bits = alu.def.bit_size();
                let src_type = FloatType::from_bits(src_bits);
                let dst = b.alloc_ssa(RegFile::GPR, dst_bits.div_ceil(32));
                let dst_is_signed = alu.info().output_type & 2 != 0;
                let dst_type =
                    IntType::from_bits(dst_bits.into(), dst_is_signed);
                if b.sm() < 70 && dst_bits == 8 {
                    // F2I doesn't support 8-bit destinations pre-Volta
                    let tmp = b.alloc_ssa(RegFile::GPR, 1);
                    let tmp_type = IntType::from_bits(32, dst_is_signed);
                    b.push_op(OpF2I {
                        dst: tmp.into(),
                        src: srcs[0],
                        src_type,
                        dst_type: tmp_type,
                        rnd_mode: FRndMode::Zero,
                        ftz: self.float_ctl[src_type].ftz,
                    });
                    b.push_op(OpI2I {
                        dst: dst.into(),
                        src: tmp.into(),
                        src_type: tmp_type,
                        dst_type,
                        saturate: true,
                        abs: false,
                        neg: false,
                    });
                } else {
                    b.push_op(OpF2I {
                        dst: dst.into(),
                        src: srcs[0],
                        src_type,
                        dst_type,
                        rnd_mode: FRndMode::Zero,
                        ftz: self.float_ctl[src_type].ftz,
                    });
                }
                dst
            }
            nir_op_fabs | nir_op_fadd | nir_op_fneg => {
                let (x, y) = match alu.op {
                    nir_op_fabs => (Src::new_zero().fneg(), srcs[0].fabs()),
                    nir_op_fadd => (srcs[0], srcs[1]),
                    nir_op_fneg => (Src::new_zero().fneg(), srcs[0].fneg()),
                    _ => panic!("Unhandled case"),
                };
                let ftype = FloatType::from_bits(alu.def.bit_size().into());
                let dst;
                if alu.def.bit_size() == 64 {
                    dst = b.alloc_ssa(RegFile::GPR, 2);
                    b.push_op(OpDAdd {
                        dst: dst.into(),
                        srcs: [x, y],
                        rnd_mode: self.float_ctl[ftype].rnd_mode,
                    });
                } else if alu.def.bit_size() == 32 {
                    dst = b.alloc_ssa(RegFile::GPR, 1);
                    b.push_op(OpFAdd {
                        dst: dst.into(),
                        srcs: [x, y],
                        saturate: self.try_saturate_alu_dst(&alu.def),
                        rnd_mode: self.float_ctl[ftype].rnd_mode,
                        ftz: self.float_ctl[ftype].ftz,
                    });
                } else if alu.def.bit_size() == 16 {
                    assert!(
                        self.float_ctl[ftype].rnd_mode == FRndMode::NearestEven
                    );

                    dst = b.alloc_ssa(RegFile::GPR, 1);
                    b.push_op(OpHAdd2 {
                        dst: dst.into(),
                        srcs: [restrict_f16v2_src(x), restrict_f16v2_src(y)],
                        saturate: self.try_saturate_alu_dst(&alu.def),
                        ftz: self.float_ctl[ftype].ftz,
                        f32: false,
                    });
                } else {
                    panic!("Unsupported float type: f{}", alu.def.bit_size());
                }
                dst
            }
            nir_op_fceil | nir_op_ffloor | nir_op_fround_even
            | nir_op_ftrunc => {
                let dst = b.alloc_ssa(RegFile::GPR, 1);
                let ty = FloatType::from_bits(alu.def.bit_size().into());
                let rnd_mode = match alu.op {
                    nir_op_fceil => FRndMode::PosInf,
                    nir_op_ffloor => FRndMode::NegInf,
                    nir_op_ftrunc => FRndMode::Zero,
                    nir_op_fround_even => FRndMode::NearestEven,
                    _ => unreachable!(),
                };
                let ftz = self.float_ctl[ty].ftz;
                if b.sm() >= 70 {
                    assert!(
                        alu.def.bit_size() == 32 || alu.def.bit_size() == 16
                    );
                    b.push_op(OpFRnd {
                        dst: dst.into(),
                        src: srcs[0],
                        src_type: ty,
                        dst_type: ty,
                        rnd_mode,
                        ftz,
                    });
                } else {
                    assert!(alu.def.bit_size() == 32);
                    b.push_op(OpF2F {
                        dst: dst.into(),
                        src: srcs[0],
                        src_type: ty,
                        dst_type: ty,
                        rnd_mode,
                        ftz,
                        integer_rnd: true,
                        high: false,
                    });
                }
                dst
            }
            nir_op_fcos => b.fcos(srcs[0]),
            nir_op_feq | nir_op_fge | nir_op_flt | nir_op_fneu => {
                let src_type =
                    FloatType::from_bits(alu.get_src(0).bit_size().into());
                let cmp_op = match alu.op {
                    nir_op_feq => FloatCmpOp::OrdEq,
                    nir_op_fge => FloatCmpOp::OrdGe,
                    nir_op_flt => FloatCmpOp::OrdLt,
                    nir_op_fneu => FloatCmpOp::UnordNe,
                    _ => panic!("Unsupported float comparison"),
                };

                let dst = b.alloc_ssa(RegFile::Pred, alu.def.num_components);
                if alu.get_src(0).bit_size() == 64 {
                    assert!(alu.def.num_components == 1);
                    b.push_op(OpDSetP {
                        dst: dst.into(),
                        set_op: PredSetOp::And,
                        cmp_op: cmp_op,
                        srcs: [srcs[0], srcs[1]],
                        accum: SrcRef::True.into(),
                    });
                } else if alu.get_src(0).bit_size() == 32 {
                    assert!(alu.def.num_components == 1);
                    b.push_op(OpFSetP {
                        dst: dst.into(),
                        set_op: PredSetOp::And,
                        cmp_op: cmp_op,
                        srcs: [srcs[0], srcs[1]],
                        accum: SrcRef::True.into(),
                        ftz: self.float_ctl[src_type].ftz,
                    });
                } else if alu.get_src(0).bit_size() == 16 {
                    assert!(
                        alu.def.num_components == 1
                            || alu.def.num_components == 2
                    );

                    let dsts = if alu.def.num_components == 2 {
                        [dst[0].into(), dst[1].into()]
                    } else {
                        [dst[0].into(), Dst::None]
                    };

                    b.push_op(OpHSetP2 {
                        dsts,
                        set_op: PredSetOp::And,
                        cmp_op: cmp_op,
                        srcs: [
                            restrict_f16v2_src(srcs[0]),
                            restrict_f16v2_src(srcs[1]),
                        ],
                        accum: SrcRef::True.into(),
                        ftz: self.float_ctl[src_type].ftz,
                        horizontal: false,
                    });
                } else {
                    panic!(
                        "Unsupported float type: f{}",
                        alu.get_src(0).bit_size()
                    );
                }
                dst
            }
            nir_op_fexp2 => b.fexp2(srcs[0]),
            nir_op_ffma => {
                let ftype = FloatType::from_bits(alu.def.bit_size().into());
                let dst;
                if alu.def.bit_size() == 64 {
                    debug_assert!(!self.float_ctl[ftype].ftz);
                    dst = b.alloc_ssa(RegFile::GPR, 2);
                    b.push_op(OpDFma {
                        dst: dst.into(),
                        srcs: [srcs[0], srcs[1], srcs[2]],
                        rnd_mode: self.float_ctl[ftype].rnd_mode,
                    });
                } else if alu.def.bit_size() == 32 {
                    dst = b.alloc_ssa(RegFile::GPR, 1);
                    b.push_op(OpFFma {
                        dst: dst.into(),
                        srcs: [srcs[0], srcs[1], srcs[2]],
                        saturate: self.try_saturate_alu_dst(&alu.def),
                        rnd_mode: self.float_ctl[ftype].rnd_mode,
                        // The hardware doesn't like FTZ+DNZ and DNZ implies FTZ
                        // anyway so only set one of the two bits.
                        ftz: self.float_ctl[ftype].ftz,
                        dnz: false,
                    });
                } else if alu.def.bit_size() == 16 {
                    assert!(
                        self.float_ctl[ftype].rnd_mode == FRndMode::NearestEven
                    );

                    dst = b.alloc_ssa(RegFile::GPR, 1);
                    b.push_op(OpHFma2 {
                        dst: dst.into(),
                        srcs: [
                            restrict_f16v2_src(srcs[0]),
                            restrict_f16v2_src(srcs[1]),
                            restrict_f16v2_src(srcs[2]),
                        ],
                        saturate: self.try_saturate_alu_dst(&alu.def),
                        ftz: self.float_ctl[ftype].ftz,
                        dnz: false,
                        f32: false,
                    });
                } else {
                    panic!("Unsupported float type: f{}", alu.def.bit_size());
                }
                dst
            }
            nir_op_ffmaz => {
                assert!(alu.def.bit_size() == 32);
                // DNZ implies FTZ so we need FTZ set or this is invalid
                assert!(self.float_ctl.fp32.ftz);
                let dst = b.alloc_ssa(RegFile::GPR, 1);
                b.push_op(OpFFma {
                    dst: dst.into(),
                    srcs: [srcs[0], srcs[1], srcs[2]],
                    saturate: self.try_saturate_alu_dst(&alu.def),
                    rnd_mode: self.float_ctl.fp32.rnd_mode,
                    // The hardware doesn't like FTZ+DNZ and DNZ implies FTZ
                    // anyway so only set one of the two bits.
                    ftz: false,
                    dnz: true,
                });
                dst
            }
            nir_op_flog2 => {
                assert!(alu.def.bit_size() == 32);
                b.mufu(MuFuOp::Log2, srcs[0])
            }
            nir_op_fmax | nir_op_fmin => {
                let dst;
                if alu.def.bit_size() == 64 {
                    dst = b.alloc_ssa(RegFile::GPR, 2);
                    b.push_op(OpDMnMx {
                        dst: dst.into(),
                        srcs: [srcs[0], srcs[1]],
                        min: (alu.op == nir_op_fmin).into(),
                    });
                } else if alu.def.bit_size() == 32 {
                    dst = b.alloc_ssa(RegFile::GPR, 1);
                    b.push_op(OpFMnMx {
                        dst: dst.into(),
                        srcs: [srcs[0], srcs[1]],
                        min: (alu.op == nir_op_fmin).into(),
                        ftz: self.float_ctl.fp32.ftz,
                    });
                } else if alu.def.bit_size() == 16 {
                    dst = b.alloc_ssa(RegFile::GPR, 1);
                    b.push_op(OpHMnMx2 {
                        dst: dst.into(),
                        srcs: [
                            restrict_f16v2_src(srcs[0]),
                            restrict_f16v2_src(srcs[1]),
                        ],
                        min: (alu.op == nir_op_fmin).into(),
                        ftz: self.float_ctl.fp16.ftz,
                    });
                } else {
                    panic!("Unsupported float type: f{}", alu.def.bit_size());
                }
                dst
            }
            nir_op_fmul => {
                let ftype = FloatType::from_bits(alu.def.bit_size().into());
                let dst;
                if alu.def.bit_size() == 64 {
                    debug_assert!(!self.float_ctl[ftype].ftz);
                    dst = b.alloc_ssa(RegFile::GPR, 2);
                    b.push_op(OpDMul {
                        dst: dst.into(),
                        srcs: [srcs[0], srcs[1]],
                        rnd_mode: self.float_ctl[ftype].rnd_mode,
                    });
                } else if alu.def.bit_size() == 32 {
                    dst = b.alloc_ssa(RegFile::GPR, 1);
                    b.push_op(OpFMul {
                        dst: dst.into(),
                        srcs: [srcs[0], srcs[1]],
                        saturate: self.try_saturate_alu_dst(&alu.def),
                        rnd_mode: self.float_ctl[ftype].rnd_mode,
                        ftz: self.float_ctl[ftype].ftz,
                        dnz: false,
                    });
                } else if alu.def.bit_size() == 16 {
                    assert!(
                        self.float_ctl[ftype].rnd_mode == FRndMode::NearestEven
                    );

                    dst = b.alloc_ssa(RegFile::GPR, 1);
                    b.push_op(OpHMul2 {
                        dst: dst.into(),
                        srcs: [
                            restrict_f16v2_src(srcs[0]),
                            restrict_f16v2_src(srcs[1]),
                        ],
                        saturate: self.try_saturate_alu_dst(&alu.def),
                        ftz: self.float_ctl[ftype].ftz,
                        dnz: false,
                    });
                } else {
                    panic!("Unsupported float type: f{}", alu.def.bit_size());
                }
                dst
            }
            nir_op_fmulz => {
                assert!(alu.def.bit_size() == 32);
                // DNZ implies FTZ so we need FTZ set or this is invalid
                assert!(self.float_ctl.fp32.ftz);
                let dst = b.alloc_ssa(RegFile::GPR, 1);
                b.push_op(OpFMul {
                    dst: dst.into(),
                    srcs: [srcs[0], srcs[1]],
                    saturate: self.try_saturate_alu_dst(&alu.def),
                    rnd_mode: self.float_ctl.fp32.rnd_mode,
                    // The hardware doesn't like FTZ+DNZ and DNZ implies FTZ
                    // anyway so only set one of the two bits.
                    ftz: false,
                    dnz: true,
                });
                dst
            }
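            // fquantize2f16: round to fp16 and convert straight back, so the
            // result keeps f32 encoding but only fp16 precision and range.
            // The pre-Volta path below compares |x| against 0x38800000
            // (2^-14, the smallest normal fp16 value) to flush what would be
            // an fp16 denorm to a correctly signed zero.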
            nir_op_fquantize2f16 => {
                let tmp = b.alloc_ssa(RegFile::GPR, 1);
                b.push_op(OpF2F {
                    dst: tmp.into(),
                    src: srcs[0],
                    src_type: FloatType::F32,
                    dst_type: FloatType::F16,
                    rnd_mode: FRndMode::NearestEven,
                    ftz: true,
                    high: false,
                    integer_rnd: false,
                });
                assert!(alu.def.bit_size() == 32);
                let dst = b.alloc_ssa(RegFile::GPR, 1);
                b.push_op(OpF2F {
                    dst: dst.into(),
                    src: tmp.into(),
                    src_type: FloatType::F16,
                    dst_type: FloatType::F32,
                    rnd_mode: FRndMode::NearestEven,
                    ftz: true,
                    high: false,
                    integer_rnd: false,
                });
                if b.sm() < 70 {
                    // Pre-Volta, F2F.ftz doesn't flush denorms so we need to do
                    // that manually
                    let denorm = b.fsetp(
                        FloatCmpOp::OrdLt,
                        srcs[0].fabs(),
                        0x38800000.into(),
                    );
                    // Get the correctly signed zero
                    let zero =
                        b.lop2(LogicOp2::And, srcs[0], 0x80000000.into());
                    b.sel(denorm.into(), zero.into(), dst.into())
                } else {
                    dst
                }
            }
            nir_op_frcp => {
                assert!(alu.def.bit_size() == 32);
                b.mufu(MuFuOp::Rcp, srcs[0])
            }
            nir_op_frsq => {
                assert!(alu.def.bit_size() == 32);
                b.mufu(MuFuOp::Rsq, srcs[0])
            }
            nir_op_fsat => {
                let ftype = FloatType::from_bits(alu.def.bit_size().into());

                if self.alu_src_is_saturated(&alu.srcs_as_slice()[0]) {
                    b.copy(srcs[0])
                } else if alu.def.bit_size() == 32 {
                    let dst = b.alloc_ssa(RegFile::GPR, 1);
                    b.push_op(OpFAdd {
                        dst: dst.into(),
                        srcs: [srcs[0], 0.into()],
                        saturate: true,
                        rnd_mode: self.float_ctl[ftype].rnd_mode,
                        ftz: self.float_ctl[ftype].ftz,
                    });
                    dst
                } else if alu.def.bit_size() == 16 {
                    assert!(
                        self.float_ctl[ftype].rnd_mode == FRndMode::NearestEven
                    );

                    let dst = b.alloc_ssa(RegFile::GPR, 1);
                    b.push_op(OpHAdd2 {
                        dst: dst.into(),
                        srcs: [restrict_f16v2_src(srcs[0]), 0.into()],
                        saturate: true,
                        ftz: self.float_ctl[ftype].ftz,
                        f32: false,
                    });
                    dst
                } else {
                    panic!("Unsupported float type: f{}", alu.def.bit_size());
                }
            }
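            // fsign(x) = (x > 0) - (x < 0).  For f64 only the high dword
            // carries the sign and exponent (0x3ff00000 / 0xbff00000 are the
            // high dwords of 1.0 and -1.0), so the low dword is just zero.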
1243             nir_op_fsign => {
1244                 if alu.def.bit_size() == 64 {
1245                     let lz = b.dsetp(FloatCmpOp::OrdLt, srcs[0], 0.into());
1246                     let gz = b.dsetp(FloatCmpOp::OrdGt, srcs[0], 0.into());
1247                     let hi = b.sel(lz.into(), 0xbff00000.into(), 0.into());
1248                     let hi = b.sel(gz.into(), 0x3ff00000.into(), hi.into());
1249                     let lo = b.copy(0.into());
1250                     [lo[0], hi[0]].into()
1251                 } else if alu.def.bit_size() == 32 {
1252                     let lz = b.fset(FloatCmpOp::OrdLt, srcs[0], 0.into());
1253                     let gz = b.fset(FloatCmpOp::OrdGt, srcs[0], 0.into());
1254                     b.fadd(gz.into(), Src::from(lz).fneg())
1255                 } else if alu.def.bit_size() == 16 {
1256                     let x = restrict_f16v2_src(srcs[0]);
1257 
1258                     let lz = restrict_f16v2_src(
1259                         b.hset2(FloatCmpOp::OrdLt, x, 0.into()).into(),
1260                     );
1261                     let gz = restrict_f16v2_src(
1262                         b.hset2(FloatCmpOp::OrdGt, x, 0.into()).into(),
1263                     );
1264 
1265                     b.hadd2(gz, lz.fneg())
1266                 } else {
1267                     panic!("Unsupported float type: f{}", alu.def.bit_size());
1268                 }
1269             }
1270             nir_op_fsin => b.fsin(srcs[0]),
1271             nir_op_fsqrt => b.mufu(MuFuOp::Sqrt, srcs[0]),
1272             nir_op_i2f16 | nir_op_i2f32 | nir_op_i2f64 => {
1273                 let src_bits = alu.get_src(0).src.bit_size();
1274                 let dst_bits = alu.def.bit_size();
1275                 let dst_type = FloatType::from_bits(dst_bits.into());
1276                 let dst = b.alloc_ssa(RegFile::GPR, dst_bits.div_ceil(32));
1277                 b.push_op(OpI2F {
1278                     dst: dst.into(),
1279                     src: srcs[0],
1280                     dst_type: dst_type,
1281                     src_type: IntType::from_bits(src_bits.into(), true),
1282                     rnd_mode: self.float_ctl[dst_type].rnd_mode,
1283                 });
1284                 dst
1285             }
1286             nir_op_i2i8 | nir_op_i2i16 | nir_op_i2i32 | nir_op_i2i64
1287             | nir_op_u2u8 | nir_op_u2u16 | nir_op_u2u32 | nir_op_u2u64 => {
1288                 let src_bits = alu.get_src(0).src.bit_size();
1289                 let dst_bits = alu.def.bit_size();
1290 
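                     // Integer up/down-conversions are byte permutes. Each
                     // PRMT selector picks one byte of {srcs[0], srcs[1]};
                     // setting bit 3 (0x8) replicates that byte's sign bit,
                     // which yields sign-extension. Zero-extension instead
                     // selects byte 4 from a zero second source.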
1291                 let mut prmt = [0_u8; 8];
1292                 match alu.op {
1293                     nir_op_i2i8 | nir_op_i2i16 | nir_op_i2i32
1294                     | nir_op_i2i64 => {
1295                         let sign = ((src_bits / 8) - 1) | 0x8;
1296                         for i in 0..8 {
1297                             if i < (src_bits / 8) {
1298                                 prmt[usize::from(i)] = i;
1299                             } else {
1300                                 prmt[usize::from(i)] = sign;
1301                             }
1302                         }
1303                     }
1304                     nir_op_u2u8 | nir_op_u2u16 | nir_op_u2u32
1305                     | nir_op_u2u64 => {
1306                         for i in 0..8 {
1307                             if i < (src_bits / 8) {
1308                                 prmt[usize::from(i)] = i;
1309                             } else {
1310                                 prmt[usize::from(i)] = 4;
1311                             }
1312                         }
1313                     }
1314                     _ => panic!("Invalid integer conversion: {}", alu.op),
1315                 }
1316                 let prmt_lo: [u8; 4] = prmt[0..4].try_into().unwrap();
1317                 let prmt_hi: [u8; 4] = prmt[4..8].try_into().unwrap();
1318 
1319                 let src = srcs[0].as_ssa().unwrap();
1320                 if src_bits == 64 {
1321                     if dst_bits == 64 {
1322                         *src
1323                     } else {
1324                         b.prmt(src[0].into(), src[1].into(), prmt_lo)
1325                     }
1326                 } else {
1327                     if dst_bits == 64 {
1328                         let lo = b.prmt(src[0].into(), 0.into(), prmt_lo);
1329                         let hi = b.prmt(src[0].into(), 0.into(), prmt_hi);
1330                         [lo[0], hi[0]].into()
1331                     } else {
1332                         b.prmt(src[0].into(), 0.into(), prmt_lo)
1333                     }
1334                 }
1335             }
1336             nir_op_iabs => b.iabs(srcs[0]),
1337             nir_op_iadd => match alu.def.bit_size() {
1338                 32 => b.iadd(srcs[0], srcs[1], 0.into()),
1339                 64 => b.iadd64(srcs[0], srcs[1], 0.into()),
1340                 x => panic!("unsupported bit size for nir_op_iadd: {x}"),
1341             },
1342             nir_op_iadd3 => match alu.def.bit_size() {
1343                 32 => b.iadd(srcs[0], srcs[1], srcs[2]),
1344                 64 => b.iadd64(srcs[0], srcs[1], srcs[2]),
1345                 x => panic!("unsupported bit size for nir_op_iadd3: {x}"),
1346             },
1347             nir_op_iand => b.lop2(LogicOp2::And, srcs[0], srcs[1]),
1348             nir_op_ieq => {
1349                 if alu.get_src(0).bit_size() == 1 {
1350                     b.lop2(LogicOp2::Xor, srcs[0], srcs[1].bnot())
1351                 } else if alu.get_src(0).bit_size() == 64 {
1352                     b.isetp64(IntCmpType::I32, IntCmpOp::Eq, srcs[0], srcs[1])
1353                 } else {
1354                     assert!(alu.get_src(0).bit_size() == 32);
1355                     b.isetp(IntCmpType::I32, IntCmpOp::Eq, srcs[0], srcs[1])
1356                 }
1357             }
1358             nir_op_ifind_msb | nir_op_ifind_msb_rev | nir_op_ufind_msb
1359             | nir_op_ufind_msb_rev => {
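                     // All four variants map to FLO (find leading one); the
                     // _rev forms request the shift-amount flavor of the
                     // result instead of the bit index.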
1360                 let dst = b.alloc_ssa(RegFile::GPR, 1);
1361                 b.push_op(OpFlo {
1362                     dst: dst.into(),
1363                     src: srcs[0],
1364                     signed: match alu.op {
1365                         nir_op_ifind_msb | nir_op_ifind_msb_rev => true,
1366                         nir_op_ufind_msb | nir_op_ufind_msb_rev => false,
1367                         _ => panic!("Not a find_msb op"),
1368                     },
1369                     return_shift_amount: match alu.op {
1370                         nir_op_ifind_msb | nir_op_ufind_msb => false,
1371                         nir_op_ifind_msb_rev | nir_op_ufind_msb_rev => true,
1372                         _ => panic!("Not a find_msb op"),
1373                     },
1374                 });
1375                 dst
1376             }
1377             nir_op_ige | nir_op_ilt | nir_op_uge | nir_op_ult => {
1378                 let x = *srcs[0].as_ssa().unwrap();
1379                 let y = *srcs[1].as_ssa().unwrap();
1380                 let (cmp_type, cmp_op) = match alu.op {
1381                     nir_op_ige => (IntCmpType::I32, IntCmpOp::Ge),
1382                     nir_op_ilt => (IntCmpType::I32, IntCmpOp::Lt),
1383                     nir_op_uge => (IntCmpType::U32, IntCmpOp::Ge),
1384                     nir_op_ult => (IntCmpType::U32, IntCmpOp::Lt),
1385                     _ => panic!("Not an integer comparison"),
1386                 };
1387                 if alu.get_src(0).bit_size() == 64 {
1388                     b.isetp64(cmp_type, cmp_op, x.into(), y.into())
1389                 } else {
1390                     assert!(alu.get_src(0).bit_size() == 32);
1391                     b.isetp(cmp_type, cmp_op, x.into(), y.into())
1392                 }
1393             }
1394             nir_op_imad => {
1395                 assert!(alu.def.bit_size() == 32);
1396                 let dst = b.alloc_ssa(RegFile::GPR, 1);
1397                 b.push_op(OpIMad {
1398                     dst: dst.into(),
1399                     srcs: [srcs[0], srcs[1], srcs[2]],
1400                     signed: false,
1401                 });
1402                 dst
1403             }
1404             nir_op_imax | nir_op_imin | nir_op_umax | nir_op_umin => {
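                     // IMNMX computes both min and max; its predicate source
                     // selects which one is returned (true = min).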
1405                 let (tp, min) = match alu.op {
1406                     nir_op_imax => (IntCmpType::I32, SrcRef::False),
1407                     nir_op_imin => (IntCmpType::I32, SrcRef::True),
1408                     nir_op_umax => (IntCmpType::U32, SrcRef::False),
1409                     nir_op_umin => (IntCmpType::U32, SrcRef::True),
1410                     _ => panic!("Not an integer min/max"),
1411                 };
1412                 assert!(alu.def.bit_size() == 32);
1413                 b.imnmx(tp, srcs[0], srcs[1], min.into())
1414             }
1415             nir_op_imul => {
1416                 assert!(alu.def.bit_size() == 32);
1417                 b.imul(srcs[0], srcs[1])
1418             }
1419             nir_op_imul_2x32_64 | nir_op_umul_2x32_64 => {
1420                 let signed = alu.op == nir_op_imul_2x32_64;
1421                 b.imul_2x32_64(srcs[0], srcs[1], signed)
1422             }
1423             nir_op_imul_high | nir_op_umul_high => {
1424                 let signed = alu.op == nir_op_imul_high;
1425                 let dst64 = b.imul_2x32_64(srcs[0], srcs[1], signed);
1426                 dst64[1].into()
1427             }
1428             nir_op_ine => {
1429                 if alu.get_src(0).bit_size() == 1 {
1430                     b.lop2(LogicOp2::Xor, srcs[0], srcs[1])
1431                 } else if alu.get_src(0).bit_size() == 64 {
1432                     b.isetp64(IntCmpType::I32, IntCmpOp::Ne, srcs[0], srcs[1])
1433                 } else {
1434                     assert!(alu.get_src(0).bit_size() == 32);
1435                     b.isetp(IntCmpType::I32, IntCmpOp::Ne, srcs[0], srcs[1])
1436                 }
1437             }
1438             nir_op_ineg => {
1439                 if alu.def.bit_size() == 64 {
1440                     b.ineg64(srcs[0])
1441                 } else {
1442                     assert!(alu.def.bit_size() == 32);
1443                     b.ineg(srcs[0])
1444                 }
1445             }
1446             nir_op_inot => {
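                     // NOT is LOP2 "pass B" with the not-modifier set on the
                     // source.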
1447                 if alu.def.bit_size() == 1 {
1448                     b.lop2(LogicOp2::PassB, true.into(), srcs[0].bnot())
1449                 } else {
1450                     assert!(alu.def.bit_size() == 32);
1451                     b.lop2(LogicOp2::PassB, 0.into(), srcs[0].bnot())
1452                 }
1453             }
1454             nir_op_ior => b.lop2(LogicOp2::Or, srcs[0], srcs[1]),
1455             nir_op_ishl => {
1456                 if alu.def.bit_size() == 64 {
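                         // Fold an immediate shift count to a u32 constant;
                         // the same pattern repeats for ishr and ushr below.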
1457                     let shift = if let Some(s) = nir_srcs[1].comp_as_uint(0) {
1458                         (s as u32).into()
1459                     } else {
1460                         srcs[1]
1461                     };
1462                     b.shl64(srcs[0], shift)
1463                 } else {
1464                     assert!(alu.def.bit_size() == 32);
1465                     b.shl(srcs[0], srcs[1])
1466                 }
1467             }
1468             nir_op_ishr => {
1469                 if alu.def.bit_size() == 64 {
1470                     let shift = if let Some(s) = nir_srcs[1].comp_as_uint(0) {
1471                         (s as u32).into()
1472                     } else {
1473                         srcs[1]
1474                     };
1475                     b.shr64(srcs[0], shift, true)
1476                 } else {
1477                     assert!(alu.def.bit_size() == 32);
1478                     b.shr(srcs[0], srcs[1], true)
1479                 }
1480             }
1481             nir_op_isub => match alu.def.bit_size() {
1482                 32 => b.iadd(srcs[0], srcs[1].ineg(), 0.into()),
1483                 64 => b.iadd64(srcs[0], srcs[1].ineg(), 0.into()),
1484                 x => panic!("unsupported bit size for nir_op_isub: {x}"),
1485             },
1486             nir_op_ixor => b.lop2(LogicOp2::Xor, srcs[0], srcs[1]),
1487             nir_op_pack_half_2x16_split | nir_op_pack_half_2x16_rtz_split => {
1488                 assert!(alu.get_src(0).bit_size() == 32);
1489 
1490                 let rnd_mode = match alu.op {
1491                     nir_op_pack_half_2x16_split => FRndMode::NearestEven,
1492                     nir_op_pack_half_2x16_rtz_split => FRndMode::Zero,
1493                     _ => panic!("Unhandled fp16 pack op"),
1494                 };
1495 
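                     // SM86+ has F2FP, which converts and packs both halves
                     // in one instruction; older SMs convert each half with
                     // F2F and merge the low 16 bits of each with PRMT.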
1496                 if self.sm.sm() >= 86 {
1497                     let result: SSARef = b.alloc_ssa(RegFile::GPR, 1);
1498                     b.push_op(OpF2FP {
1499                         dst: result.into(),
1500                         srcs: [srcs[1], srcs[0]],
1501                         rnd_mode: rnd_mode,
1502                     });
1503 
1504                     result
1505                 } else {
1506                     let low = b.alloc_ssa(RegFile::GPR, 1);
1507                     let high = b.alloc_ssa(RegFile::GPR, 1);
1508 
1509                     b.push_op(OpF2F {
1510                         dst: low.into(),
1511                         src: srcs[0],
1512                         src_type: FloatType::F32,
1513                         dst_type: FloatType::F16,
1514                         rnd_mode: rnd_mode,
1515                         ftz: false,
1516                         high: false,
1517                         integer_rnd: false,
1518                     });
1519 
1520                     let src_bits = usize::from(alu.get_src(1).bit_size());
1521                     let src_type = FloatType::from_bits(src_bits);
1522                     assert!(matches!(src_type, FloatType::F32));
1523                     b.push_op(OpF2F {
1524                         dst: high.into(),
1525                         src: srcs[1],
1526                         src_type: FloatType::F32,
1527                         dst_type: FloatType::F16,
1528                         rnd_mode: rnd_mode,
1529                         ftz: false,
1530                         high: false,
1531                         integer_rnd: false,
1532                     });
1533 
1534                     b.prmt(low.into(), high.into(), [0, 1, 4, 5])
1535                 }
1536             }
1537             nir_op_prmt_nv => {
1538                 let dst = b.alloc_ssa(RegFile::GPR, 1);
1539                 b.push_op(OpPrmt {
1540                     dst: dst.into(),
1541                     srcs: [srcs[1], srcs[2]],
1542                     sel: srcs[0],
1543                     mode: PrmtMode::Index,
1544                 });
1545                 dst
1546             }
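                 // The 4x8 dot products all map to IDP4; only the per-source
                 // signedness differs.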
1547             nir_op_sdot_4x8_iadd => {
1548                 let dst = b.alloc_ssa(RegFile::GPR, 1);
1549                 b.push_op(OpIDp4 {
1550                     dst: dst.into(),
1551                     src_types: [IntType::I8, IntType::I8],
1552                     srcs: [srcs[0], srcs[1], srcs[2]],
1553                 });
1554                 dst
1555             }
1556             nir_op_sudot_4x8_iadd => {
1557                 let dst = b.alloc_ssa(RegFile::GPR, 1);
1558                 b.push_op(OpIDp4 {
1559                     dst: dst.into(),
1560                     src_types: [IntType::I8, IntType::U8],
1561                     srcs: [srcs[0], srcs[1], srcs[2]],
1562                 });
1563                 dst
1564             }
1565             nir_op_udot_4x8_uadd => {
1566                 let dst = b.alloc_ssa(RegFile::GPR, 1);
1567                 b.push_op(OpIDp4 {
1568                     dst: dst.into(),
1569                     src_types: [IntType::U8, IntType::U8],
1570                     srcs: [srcs[0], srcs[1], srcs[2]],
1571                 });
1572                 dst
1573             }
1574             nir_op_u2f16 | nir_op_u2f32 | nir_op_u2f64 => {
1575                 let src_bits = alu.get_src(0).src.bit_size();
1576                 let dst_bits = alu.def.bit_size();
1577                 let dst_type = FloatType::from_bits(dst_bits.into());
1578                 let dst = b.alloc_ssa(RegFile::GPR, dst_bits.div_ceil(32));
1579                 b.push_op(OpI2F {
1580                     dst: dst.into(),
1581                     src: srcs[0],
1582                     dst_type: dst_type,
1583                     src_type: IntType::from_bits(src_bits.into(), false),
1584                     rnd_mode: self.float_ctl[dst_type].rnd_mode,
1585                 });
1586                 dst
1587             }
1588             nir_op_uadd_sat => {
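                     // IADD3 writes its carry-out to a predicate and IADD3.X
                     // chains it into the high word; on overflow, every
                     // result word is replaced with u32::MAX.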
1589                 let x = srcs[0].as_ssa().unwrap();
1590                 let y = srcs[1].as_ssa().unwrap();
1591                 let sum_lo = b.alloc_ssa(RegFile::GPR, 1);
1592                 let ovf_lo = b.alloc_ssa(RegFile::Pred, 1);
1593                 b.push_op(OpIAdd3 {
1594                     dst: sum_lo.into(),
1595                     overflow: [ovf_lo.into(), Dst::None],
1596                     srcs: [0.into(), x[0].into(), y[0].into()],
1597                 });
1598                 if alu.def.bit_size() == 64 {
1599                     let sum_hi = b.alloc_ssa(RegFile::GPR, 1);
1600                     let ovf_hi = b.alloc_ssa(RegFile::Pred, 1);
1601                     b.push_op(OpIAdd3X {
1602                         dst: sum_hi.into(),
1603                         overflow: [ovf_hi.into(), Dst::None],
1604                         srcs: [0.into(), x[1].into(), y[1].into()],
1605                         carry: [ovf_lo.into(), false.into()],
1606                     });
1607                     let lo =
1608                         b.sel(ovf_hi.into(), u32::MAX.into(), sum_lo.into());
1609                     let hi =
1610                         b.sel(ovf_hi.into(), u32::MAX.into(), sum_hi.into());
1611                     [lo[0], hi[0]].into()
1612                 } else {
1613                     assert!(alu.def.bit_size() == 32);
1614                     b.sel(ovf_lo.into(), u32::MAX.into(), sum_lo.into())
1615                 }
1616             }
1617             nir_op_usub_sat => {
1618                 let x = srcs[0].as_ssa().unwrap();
1619                 let y = srcs[1].as_ssa().unwrap();
1620                 let sum_lo = b.alloc_ssa(RegFile::GPR, 1);
1621                 let ovf_lo = b.alloc_ssa(RegFile::Pred, 1);
1622                 // The result of OpIAdd3X is the 33-bit value
1623                 //
1624                 //  s|o = x + !y + 1
1625                 //
1626                 // The overflow bit of this result is true if and only if the
1627                 // subtract did NOT overflow.
1628                 b.push_op(OpIAdd3 {
1629                     dst: sum_lo.into(),
1630                     overflow: [ovf_lo.into(), Dst::None],
1631                     srcs: [0.into(), x[0].into(), Src::from(y[0]).ineg()],
1632                 });
1633                 if alu.def.bit_size() == 64 {
1634                     let sum_hi = b.alloc_ssa(RegFile::GPR, 1);
1635                     let ovf_hi = b.alloc_ssa(RegFile::Pred, 1);
1636                     b.push_op(OpIAdd3X {
1637                         dst: sum_hi.into(),
1638                         overflow: [ovf_hi.into(), Dst::None],
1639                         srcs: [0.into(), x[1].into(), Src::from(y[1]).bnot()],
1640                         carry: [ovf_lo.into(), false.into()],
1641                     });
1642                     let lo = b.sel(ovf_hi.into(), sum_lo.into(), 0.into());
1643                     let hi = b.sel(ovf_hi.into(), sum_hi.into(), 0.into());
1644                     [lo[0], hi[0]].into()
1645                 } else {
1646                     assert!(alu.def.bit_size() == 32);
1647                     b.sel(ovf_lo.into(), sum_lo.into(), 0.into())
1648                 }
1649             }
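                 // unpack_32_2x16: PRMT extracts the requested half and fills
                 // the high bytes from the zero second source.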
1650             nir_op_unpack_32_2x16_split_x => {
1651                 b.prmt(srcs[0], 0.into(), [0, 1, 4, 4])
1652             }
1653             nir_op_unpack_32_2x16_split_y => {
1654                 b.prmt(srcs[0], 0.into(), [2, 3, 4, 4])
1655             }
1656             nir_op_unpack_64_2x32_split_x => {
1657                 let src0_x = srcs[0].as_ssa().unwrap()[0];
1658                 b.copy(src0_x.into())
1659             }
1660             nir_op_unpack_64_2x32_split_y => {
1661                 let src0_y = srcs[0].as_ssa().unwrap()[1];
1662                 b.copy(src0_y.into())
1663             }
1664             nir_op_unpack_half_2x16_split_x
1665             | nir_op_unpack_half_2x16_split_y => {
1666                 assert!(alu.def.bit_size() == 32);
1667                 let dst = b.alloc_ssa(RegFile::GPR, 1);
1668 
1669                 b.push_op(OpF2F {
1670                     dst: dst[0].into(),
1671                     src: srcs[0],
1672                     src_type: FloatType::F16,
1673                     dst_type: FloatType::F32,
1674                     rnd_mode: FRndMode::NearestEven,
1675                     ftz: false,
1676                     high: alu.op == nir_op_unpack_half_2x16_split_y,
1677                     integer_rnd: false,
1678                 });
1679 
1680                 dst
1681             }
1682             nir_op_ushr => {
1683                 if alu.def.bit_size() == 64 {
1684                     let shift = if let Some(s) = nir_srcs[1].comp_as_uint(0) {
1685                         (s as u32).into()
1686                     } else {
1687                         srcs[1]
1688                     };
1689                     b.shr64(srcs[0], shift, false)
1690                 } else {
1691                     assert!(alu.def.bit_size() == 32);
1692                     b.shr(srcs[0], srcs[1], false)
1693                 }
1694             }
1695             _ => panic!("Unsupported ALU instruction: {}", alu.info().name()),
1696         };
1697         self.set_dst(&alu.def, dst);
1698     }
1699 
1700     fn parse_tex(&mut self, b: &mut impl SSABuilder, tex: &nir_tex_instr) {
1701         let dim = match tex.sampler_dim {
1702             GLSL_SAMPLER_DIM_1D => {
1703                 if tex.is_array {
1704                     TexDim::Array1D
1705                 } else {
1706                     TexDim::_1D
1707                 }
1708             }
1709             GLSL_SAMPLER_DIM_2D => {
1710                 if tex.is_array {
1711                     TexDim::Array2D
1712                 } else {
1713                     TexDim::_2D
1714                 }
1715             }
1716             GLSL_SAMPLER_DIM_3D => {
1717                 assert!(!tex.is_array);
1718                 TexDim::_3D
1719             }
1720             GLSL_SAMPLER_DIM_CUBE => {
1721                 if tex.is_array {
1722                     TexDim::ArrayCube
1723                 } else {
1724                     TexDim::Cube
1725                 }
1726             }
1727             GLSL_SAMPLER_DIM_BUF => TexDim::_1D,
1728             GLSL_SAMPLER_DIM_MS => {
1729                 if tex.is_array {
1730                     TexDim::Array2D
1731                 } else {
1732                     TexDim::_2D
1733                 }
1734             }
1735             _ => panic!("Unsupported texture dimension: {}", tex.sampler_dim),
1736         };
1737 
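             // By this point, an earlier NIR lowering pass has packed all
             // texture sources into at most two backend source vectors.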
1738         let srcs = tex.srcs_as_slice();
1739         assert!(srcs[0].src_type == nir_tex_src_backend1);
1740         if srcs.len() > 1 {
1741             assert!(srcs.len() == 2);
1742             assert!(srcs[1].src_type == nir_tex_src_backend2);
1743         }
1744 
1745         let flags: nak_nir_tex_flags =
1746             unsafe { std::mem::transmute_copy(&tex.backend_flags) };
1747 
1748         let mask = tex.def.components_read();
1749         let mut mask = u8::try_from(mask).unwrap();
1750         if flags.is_sparse() {
1751             mask &= !(1 << (tex.def.num_components() - 1));
1752             if mask == 0 {
1753                 // This can happen if only the sparse predicate is used.  In
1754                 // that case, we need at least one result register.
1755                 mask = 1;
1756             }
1757         } else {
1758             debug_assert!(mask != 0);
1759         }
1760 
1761         let dst_comps = u8::try_from(mask.count_ones()).unwrap();
1762         let dst = b.alloc_ssa(RegFile::GPR, dst_comps);
1763 
1764         // On Volta and later, the destination is split in two
1765         let mut dsts = [Dst::None; 2];
1766         if dst_comps > 2 && b.sm() >= 70 {
1767             dsts[0] = SSARef::try_from(&dst[0..2]).unwrap().into();
1768             dsts[1] = SSARef::try_from(&dst[2..]).unwrap().into();
1769         } else {
1770             dsts[0] = dst.into();
1771         }
1772 
1773         let fault = if flags.is_sparse() {
1774             b.alloc_ssa(RegFile::Pred, 1).into()
1775         } else {
1776             Dst::None
1777         };
1778 
1779         if tex.op == nir_texop_hdr_dim_nv {
1780             let src = self.get_src(&srcs[0].src);
1781             assert!(fault.is_none());
1782             b.push_op(OpTxq {
1783                 dsts: dsts,
1784                 src: src,
1785                 query: TexQuery::Dimension,
1786                 mask: mask,
1787             });
1788         } else if tex.op == nir_texop_tex_type_nv {
1789             let src = self.get_src(&srcs[0].src);
1790             assert!(fault.is_none());
1791             b.push_op(OpTxq {
1792                 dsts: dsts,
1793                 src: src,
1794                 query: TexQuery::TextureType,
1795                 mask: mask,
1796             });
1797         } else {
1798             let lod_mode = match flags.lod_mode() {
1799                 NAK_NIR_LOD_MODE_AUTO => TexLodMode::Auto,
1800                 NAK_NIR_LOD_MODE_ZERO => TexLodMode::Zero,
1801                 NAK_NIR_LOD_MODE_BIAS => TexLodMode::Bias,
1802                 NAK_NIR_LOD_MODE_LOD => TexLodMode::Lod,
1803                 NAK_NIR_LOD_MODE_CLAMP => TexLodMode::Clamp,
1804                 NAK_NIR_LOD_MODE_BIAS_CLAMP => TexLodMode::BiasClamp,
1805                 _ => panic!("Invalid LOD mode"),
1806             };
1807 
1808             let offset_mode = match flags.offset_mode() {
1809                 NAK_NIR_OFFSET_MODE_NONE => Tld4OffsetMode::None,
1810                 NAK_NIR_OFFSET_MODE_AOFFI => Tld4OffsetMode::AddOffI,
1811                 NAK_NIR_OFFSET_MODE_PER_PX => Tld4OffsetMode::PerPx,
1812                 _ => panic!("Invalid offset mode"),
1813             };
1814 
1815             let srcs = [self.get_src(&srcs[0].src), self.get_src(&srcs[1].src)];
1816 
1817             if tex.op == nir_texop_txd {
1818                 assert!(lod_mode == TexLodMode::Auto);
1819                 assert!(offset_mode != Tld4OffsetMode::PerPx);
1820                 assert!(!flags.has_z_cmpr());
1821                 b.push_op(OpTxd {
1822                     dsts: dsts,
1823                     fault,
1824                     srcs: srcs,
1825                     dim: dim,
1826                     offset: offset_mode == Tld4OffsetMode::AddOffI,
1827                     mask: mask,
1828                 });
1829             } else if tex.op == nir_texop_lod {
1830                 assert!(offset_mode == Tld4OffsetMode::None);
1831                 b.push_op(OpTmml {
1832                     dsts: dsts,
1833                     srcs: srcs,
1834                     dim: dim,
1835                     mask: mask,
1836                 });
1837             } else if tex.op == nir_texop_txf || tex.op == nir_texop_txf_ms {
1838                 assert!(offset_mode != Tld4OffsetMode::PerPx);
1839                 b.push_op(OpTld {
1840                     dsts: dsts,
1841                     fault,
1842                     srcs: srcs,
1843                     dim: dim,
1844                     lod_mode: lod_mode,
1845                     is_ms: tex.op == nir_texop_txf_ms,
1846                     offset: offset_mode == Tld4OffsetMode::AddOffI,
1847                     mask: mask,
1848                 });
1849             } else if tex.op == nir_texop_tg4 {
1850                 b.push_op(OpTld4 {
1851                     dsts: dsts,
1852                     fault,
1853                     srcs: srcs,
1854                     dim: dim,
1855                     comp: tex.component().try_into().unwrap(),
1856                     offset_mode: offset_mode,
1857                     z_cmpr: flags.has_z_cmpr(),
1858                     mask: mask,
1859                 });
1860             } else {
1861                 assert!(offset_mode != Tld4OffsetMode::PerPx);
1862                 b.push_op(OpTex {
1863                     dsts: dsts,
1864                     fault,
1865                     srcs: srcs,
1866                     dim: dim,
1867                     lod_mode: lod_mode,
1868                     z_cmpr: flags.has_z_cmpr(),
1869                     offset: offset_mode == Tld4OffsetMode::AddOffI,
1870                     mask: mask,
1871                 });
1872             }
1873         }
1874 
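             // Reassemble the NIR destination vector: components the
             // instruction didn't produce read as zero, and the sparse
             // component becomes a 0/1 value derived from the fault
             // predicate.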
1875         let mut di = 0_usize;
1876         let mut nir_dst = Vec::new();
1877         for i in 0..tex.def.num_components() {
1878             if flags.is_sparse() && i == tex.def.num_components() - 1 {
1879                 let Dst::SSA(fault) = fault else {
1880                     panic!("No fault value for sparse op");
1881                 };
1882                 nir_dst.push(b.sel(fault.into(), 0.into(), 1.into())[0]);
1883             } else if mask & (1 << i) == 0 {
1884                 nir_dst.push(b.copy(0.into())[0]);
1885             } else {
1886                 nir_dst.push(dst[di]);
1887                 di += 1;
1888             }
1889         }
1890         self.set_ssa(tex.def.as_def(), nir_dst);
1891     }
1892 
1893     fn get_atomic_type(&self, intrin: &nir_intrinsic_instr) -> AtomType {
1894         let bit_size = intrin.def.bit_size();
1895         match intrin.atomic_op() {
1896             nir_atomic_op_iadd => AtomType::U(bit_size),
1897             nir_atomic_op_imin => AtomType::I(bit_size),
1898             nir_atomic_op_umin => AtomType::U(bit_size),
1899             nir_atomic_op_imax => AtomType::I(bit_size),
1900             nir_atomic_op_umax => AtomType::U(bit_size),
1901             nir_atomic_op_iand => AtomType::U(bit_size),
1902             nir_atomic_op_ior => AtomType::U(bit_size),
1903             nir_atomic_op_ixor => AtomType::U(bit_size),
1904             nir_atomic_op_xchg => AtomType::U(bit_size),
1905             nir_atomic_op_fadd => AtomType::F(bit_size),
1906             nir_atomic_op_fmin => AtomType::F(bit_size),
1907             nir_atomic_op_fmax => AtomType::F(bit_size),
1908             nir_atomic_op_cmpxchg => AtomType::U(bit_size),
1909             _ => panic!("Unsupported NIR atomic op"),
1910         }
1911     }
1912 
1913     fn get_atomic_op(
1914         &self,
1915         intrin: &nir_intrinsic_instr,
1916         cmp_src: AtomCmpSrc,
1917     ) -> AtomOp {
1918         match intrin.atomic_op() {
1919             nir_atomic_op_iadd => AtomOp::Add,
1920             nir_atomic_op_imin => AtomOp::Min,
1921             nir_atomic_op_umin => AtomOp::Min,
1922             nir_atomic_op_imax => AtomOp::Max,
1923             nir_atomic_op_umax => AtomOp::Max,
1924             nir_atomic_op_iand => AtomOp::And,
1925             nir_atomic_op_ior => AtomOp::Or,
1926             nir_atomic_op_ixor => AtomOp::Xor,
1927             nir_atomic_op_xchg => AtomOp::Exch,
1928             nir_atomic_op_fadd => AtomOp::Add,
1929             nir_atomic_op_fmin => AtomOp::Min,
1930             nir_atomic_op_fmax => AtomOp::Max,
1931             nir_atomic_op_cmpxchg => AtomOp::CmpExch(cmp_src),
1932             _ => panic!("Unsupported NIR atomic op"),
1933         }
1934     }
1935 
1936     fn get_eviction_priority(
1937         &mut self,
1938         access: gl_access_qualifier,
1939     ) -> MemEvictionPriority {
1940         if self.sm.sm() >= 70 && access & ACCESS_NON_TEMPORAL != 0 {
1941             MemEvictionPriority::First
1942         } else {
1943             MemEvictionPriority::Normal
1944         }
1945     }
1946 
1947     fn get_image_dim(&mut self, intrin: &nir_intrinsic_instr) -> ImageDim {
1948         let is_array = intrin.image_array();
1949         let image_dim = intrin.image_dim();
1950         match image_dim {
1951             GLSL_SAMPLER_DIM_1D => {
1952                 if is_array {
1953                     ImageDim::_1DArray
1954                 } else {
1955                     ImageDim::_1D
1956                 }
1957             }
1958             GLSL_SAMPLER_DIM_2D => {
1959                 if is_array {
1960                     ImageDim::_2DArray
1961                 } else {
1962                     ImageDim::_2D
1963                 }
1964             }
1965             GLSL_SAMPLER_DIM_3D => {
1966                 assert!(!is_array);
1967                 ImageDim::_3D
1968             }
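                 // Cube images are addressed as 2D arrays of six faces.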
1969             GLSL_SAMPLER_DIM_CUBE => ImageDim::_2DArray,
1970             GLSL_SAMPLER_DIM_BUF => {
1971                 assert!(!is_array);
1972                 ImageDim::_1DBuffer
1973             }
1974             _ => panic!("Unsupported image dimension: {}", image_dim),
1975         }
1976     }
1977 
1978     fn get_image_coord(
1979         &mut self,
1980         intrin: &nir_intrinsic_instr,
1981         dim: ImageDim,
1982     ) -> Src {
1983         let vec = self.get_ssa(intrin.get_src(1).as_def());
1984         // let sample = self.get_src(&srcs[2]);
1985         let comps = usize::from(dim.coord_comps());
1986         SSARef::try_from(&vec[0..comps]).unwrap().into()
1987     }
1988 
1989     fn parse_intrinsic(
1990         &mut self,
1991         b: &mut impl SSABuilder,
1992         intrin: &nir_intrinsic_instr,
1993     ) {
1994         let srcs = intrin.srcs_as_slice();
1995         match intrin.intrinsic {
1996             nir_intrinsic_al2p_nv => {
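                     // AL2P converts a logical attribute address to a
                     // physical one, which makes indirect attribute access
                     // possible.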
1997                 let offset = self.get_src(&srcs[0]);
1998                 let addr = u16::try_from(intrin.base()).unwrap();
1999 
2000                 let flags = intrin.flags();
2001                 let flags: nak_nir_attr_io_flags =
2002                     unsafe { std::mem::transmute_copy(&flags) };
2003 
2004                 let access = AttrAccess {
2005                     addr: addr,
2006                     comps: 1,
2007                     patch: flags.patch(),
2008                     output: flags.output(),
2009                     phys: false,
2010                 };
2011 
2012                 let dst = b.alloc_ssa(RegFile::GPR, 1);
2013                 b.push_op(OpAL2P {
2014                     dst: dst.into(),
2015                     offset: offset,
2016                     access: access,
2017                 });
2018                 self.set_dst(&intrin.def, dst);
2019             }
2020             nir_intrinsic_ald_nv | nir_intrinsic_ast_nv => {
2021                 let addr = u16::try_from(intrin.base()).unwrap();
2022                 let base = u16::try_from(intrin.range_base()).unwrap();
2023                 let range = u16::try_from(intrin.range()).unwrap();
2024                 let range = base..(base + range);
2025 
2026                 let flags = intrin.flags();
2027                 let flags: nak_nir_attr_io_flags =
2028                     unsafe { std::mem::transmute_copy(&flags) };
2029                 assert!(!flags.patch() || !flags.phys());
2030 
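                     // Record which attributes are read and written so the
                     // shader header can be filled in later.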
2031                 if let ShaderIoInfo::Vtg(io) = &mut self.info.io {
2032                     if flags.patch() {
2033                         match &mut self.info.stage {
2034                             ShaderStageInfo::TessellationInit(stage) => {
2035                                 assert!(flags.output());
2036                                 stage.per_patch_attribute_count = max(
2037                                     stage.per_patch_attribute_count,
2038                                     (range.end / 4).try_into().unwrap(),
2039                                 );
2040                             }
2041                             ShaderStageInfo::Tessellation(_) => (),
2042                             _ => panic!("Patch I/O not supported"),
2043                         }
2044                     } else {
2045                         if flags.output() {
2046                             if intrin.intrinsic == nir_intrinsic_ast_nv {
2047                                 io.mark_store_req(range.clone());
2048                             }
2049                             io.mark_attrs_written(range);
2050                         } else {
2051                             io.mark_attrs_read(range);
2052                         }
2053                     }
2054                 } else {
2055                     panic!("Must be a VTG stage");
2056                 }
2057 
2058                 let access = AttrAccess {
2059                     addr: addr,
2060                     comps: intrin.num_components,
2061                     patch: flags.patch(),
2062                     output: flags.output(),
2063                     phys: flags.phys(),
2064                 };
2065 
2066                 if intrin.intrinsic == nir_intrinsic_ald_nv {
2067                     let vtx = self.get_src(&srcs[0]);
2068                     let offset = self.get_src(&srcs[1]);
2069 
2070                     assert!(intrin.def.bit_size() == 32);
2071                     let dst = b.alloc_ssa(RegFile::GPR, access.comps);
2072                     b.push_op(OpALd {
2073                         dst: dst.into(),
2074                         vtx: vtx,
2075                         offset: offset,
2076                         access: access,
2077                     });
2078                     self.set_dst(&intrin.def, dst);
2079                 } else if intrin.intrinsic == nir_intrinsic_ast_nv {
2080                     assert!(srcs[0].bit_size() == 32);
2081                     let data = self.get_src(&srcs[0]);
2082                     let vtx = self.get_src(&srcs[1]);
2083                     let offset = self.get_src(&srcs[2]);
2084 
2085                     b.push_op(OpASt {
2086                         data: data,
2087                         vtx: vtx,
2088                         offset: offset,
2089                         access: access,
2090                     });
2091                 } else {
2092                     panic!("Invalid VTG I/O intrinsic");
2093                 }
2094             }
2095             nir_intrinsic_as_uniform => {
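                     // Copy each component into the uniform register file
                     // with R2UR so warp-uniform values can live in UGPRs.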
2096                 let src = self.get_ssa(srcs[0].as_def());
2097                 let mut dst = Vec::new();
2098                 for comp in src {
2099                     let u = b.alloc_ssa(RegFile::UGPR, 1);
2100                     b.push_op(OpR2UR {
2101                         src: [*comp].into(),
2102                         dst: u.into(),
2103                     });
2104                     dst.push(u[0]);
2105                 }
2106                 self.set_ssa(&intrin.def, dst);
2107             }
2108             nir_intrinsic_ddx
2109             | nir_intrinsic_ddx_coarse
2110             | nir_intrinsic_ddx_fine => {
2111                 // TODO: Real coarse derivatives
2112 
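                     // SHFL.BFLY with XOR mask 1 swaps the two horizontal
                     // pixels of each 2x2 quad; FSWZADD then subtracts in the
                     // right order per quad lane.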
2113                 assert!(intrin.def.bit_size() == 32);
2114                 let ftype = FloatType::F32;
2115                 let scratch = b.alloc_ssa(RegFile::GPR, 1);
2116 
2117                 b.push_op(OpShfl {
2118                     dst: scratch[0].into(),
2119                     in_bounds: Dst::None,
2120                     src: self.get_src(&srcs[0]),
2121                     lane: 1_u32.into(),
2122                     c: (0x3_u32 | 0x1c_u32 << 8).into(),
2123                     op: ShflOp::Bfly,
2124                 });
2125 
2126                 let dst = b.alloc_ssa(RegFile::GPR, 1);
2127 
2128                 b.push_op(OpFSwzAdd {
2129                     dst: dst[0].into(),
2130                     srcs: [scratch[0].into(), self.get_src(&srcs[0])],
2131                     ops: [
2132                         FSwzAddOp::SubLeft,
2133                         FSwzAddOp::SubRight,
2134                         FSwzAddOp::SubLeft,
2135                         FSwzAddOp::SubRight,
2136                     ],
2137                     rnd_mode: self.float_ctl[ftype].rnd_mode,
2138                     ftz: self.float_ctl[ftype].ftz,
2139                 });
2140 
2141                 self.set_dst(&intrin.def, dst);
2142             }
2143             nir_intrinsic_ddy
2144             | nir_intrinsic_ddy_coarse
2145             | nir_intrinsic_ddy_fine => {
2146                 // TODO: Real coarse derivatives
2147 
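                     // Same as ddx, except that an XOR mask of 2 swaps the
                     // vertical neighbors of each quad.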
2148                 assert!(intrin.def.bit_size() == 32);
2149                 let ftype = FloatType::F32;
2150                 let scratch = b.alloc_ssa(RegFile::GPR, 1);
2151 
2152                 b.push_op(OpShfl {
2153                     dst: scratch[0].into(),
2154                     in_bounds: Dst::None,
2155                     src: self.get_src(&srcs[0]),
2156                     lane: 2_u32.into(),
2157                     c: (0x3_u32 | 0x1c_u32 << 8).into(),
2158                     op: ShflOp::Bfly,
2159                 });
2160 
2161                 let dst = b.alloc_ssa(RegFile::GPR, 1);
2162 
2163                 b.push_op(OpFSwzAdd {
2164                     dst: dst[0].into(),
2165                     srcs: [scratch[0].into(), self.get_src(&srcs[0])],
2166                     ops: [
2167                         FSwzAddOp::SubLeft,
2168                         FSwzAddOp::SubLeft,
2169                         FSwzAddOp::SubRight,
2170                         FSwzAddOp::SubRight,
2171                     ],
2172                     rnd_mode: self.float_ctl[ftype].rnd_mode,
2173                     ftz: self.float_ctl[ftype].ftz,
2174                 });
2175 
2176                 self.set_dst(&intrin.def, dst);
2177             }
2178             nir_intrinsic_ballot => {
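                     // ballot maps to VOTE.ANY: keep the per-lane ballot mask
                     // and discard the scalar vote result.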
2179                 assert!(srcs[0].bit_size() == 1);
2180                 let src = self.get_src(&srcs[0]);
2181 
2182                 assert!(intrin.def.bit_size() == 32);
2183                 let dst = b.alloc_ssa(RegFile::GPR, 1);
2184 
2185                 b.push_op(OpVote {
2186                     op: VoteOp::Any,
2187                     ballot: dst.into(),
2188                     vote: Dst::None,
2189                     pred: src,
2190                 });
2191                 self.set_dst(&intrin.def, dst);
2192             }
2193             nir_intrinsic_bar_break_nv => {
2194                 let src = self.get_src(&srcs[0]);
2195                 let bar_in = b.bmov_to_bar(src);
2196                 let cond = self.get_src(&srcs[1]);
2197 
2198                 let bar_out = b.alloc_ssa(RegFile::Bar, 1);
2199                 b.push_op(OpBreak {
2200                     bar_out: bar_out.into(),
2201                     bar_in: bar_in.into(),
2202                     cond: cond.into(),
2203                 });
2204 
2205                 self.set_dst(&intrin.def, b.bmov_to_gpr(bar_out.into()));
2206             }
2207             nir_intrinsic_bar_set_nv => {
2208                 let label = self.label_alloc.alloc();
2209                 let old = self.bar_label.insert(intrin.def.index, label);
2210                 assert!(old.is_none());
2211 
2212                 let bar_clear = b.alloc_ssa(RegFile::Bar, 1);
2213                 b.push_op(OpBClear {
2214                     dst: bar_clear.into(),
2215                 });
2216 
2217                 let bar_out = b.alloc_ssa(RegFile::Bar, 1);
2218                 b.push_op(OpBSSy {
2219                     bar_out: bar_out.into(),
2220                     bar_in: bar_clear.into(),
2221                     cond: SrcRef::True.into(),
2222                     target: label,
2223                 });
2224 
2225                 self.set_dst(&intrin.def, b.bmov_to_gpr(bar_out.into()));
2226             }
2227             nir_intrinsic_bar_sync_nv => {
2228                 let src = self.get_src(&srcs[0]);
2229 
2230                 let bar = b.bmov_to_bar(src);
2231                 b.push_op(OpBSync {
2232                     bar: bar.into(),
2233                     cond: SrcRef::True.into(),
2234                 });
2235 
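                     // If we know which bar_set produced this barrier, emit a
                     // labeled NOP as the reconvergence target of the
                     // matching BSSY.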
2236                 let bar_set_idx = &srcs[1].as_def().index;
2237                 if let Some(label) = self.bar_label.get(bar_set_idx) {
2238                     b.push_op(OpNop {
2239                         label: Some(*label),
2240                     });
2241                 }
2242             }
2243             nir_intrinsic_bindless_image_atomic
2244             | nir_intrinsic_bindless_image_atomic_swap => {
2245                 let handle = self.get_src(&srcs[0]);
2246                 let dim = self.get_image_dim(intrin);
2247                 let coord = self.get_image_coord(intrin, dim);
2248                 // let sample = self.get_src(&srcs[2]);
2249                 let atom_type = self.get_atomic_type(intrin);
2250                 let atom_op = self.get_atomic_op(intrin, AtomCmpSrc::Packed);
2251 
2252                 assert!(
2253                     intrin.def.bit_size() == 32 || intrin.def.bit_size() == 64
2254                 );
2255                 assert!(intrin.def.num_components() == 1);
2256                 let dst = b.alloc_ssa(RegFile::GPR, intrin.def.bit_size() / 32);
2257 
2258                 let data = if intrin.intrinsic
2259                     == nir_intrinsic_bindless_image_atomic_swap
2260                 {
2261                     if intrin.def.bit_size() == 64 {
2262                         SSARef::from([
2263                             self.get_ssa(srcs[3].as_def())[0],
2264                             self.get_ssa(srcs[3].as_def())[1],
2265                             self.get_ssa(srcs[4].as_def())[0],
2266                             self.get_ssa(srcs[4].as_def())[1],
2267                         ])
2268                         .into()
2269                     } else {
2270                         SSARef::from([
2271                             self.get_ssa(srcs[3].as_def())[0],
2272                             self.get_ssa(srcs[4].as_def())[0],
2273                         ])
2274                         .into()
2275                     }
2276                 } else {
2277                     self.get_src(&srcs[3])
2278                 };
2279 
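                     // If the atomic's result is never read, the destination
                     // can be dropped and, on SM70+, the reduction form used
                     // instead.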
2280                 let is_reduction =
2281                     atom_op.is_reduction() && intrin.def.components_read() == 0;
2282 
2283                 b.push_op(OpSuAtom {
2284                     dst: if self.sm.sm() >= 70 && is_reduction {
2285                         Dst::None
2286                     } else {
2287                         dst.into()
2288                     },
2289                     fault: Dst::None,
2290                     handle: handle,
2291                     coord: coord,
2292                     data: data,
2293                     atom_op: atom_op,
2294                     atom_type: atom_type,
2295                     image_dim: dim,
2296                     mem_order: MemOrder::Strong(MemScope::System),
2297                     mem_eviction_priority: self
2298                         .get_eviction_priority(intrin.access()),
2299                 });
2300                 self.set_dst(&intrin.def, dst);
2301             }
2302             nir_intrinsic_bindless_image_load => {
2303                 let handle = self.get_src(&srcs[0]);
2304                 let dim = self.get_image_dim(intrin);
2305                 let coord = self.get_image_coord(intrin, dim);
2306                 // let sample = self.get_src(&srcs[2]);
2307 
2308                 let mem_order = if (intrin.access() & ACCESS_CAN_REORDER) != 0 {
2309                     if self.sm.sm() >= 80 {
2310                         MemOrder::Constant
2311                     } else {
2312                         MemOrder::Weak
2313                     }
2314                 } else {
2315                     MemOrder::Strong(MemScope::System)
2316                 };
2317 
2318                 let comps = intrin.num_components;
2319                 assert!(intrin.def.bit_size() == 32);
2320                 assert!(comps == 1 || comps == 2 || comps == 4);
2321 
2322                 let dst = b.alloc_ssa(RegFile::GPR, comps);
2323 
2324                 b.push_op(OpSuLd {
2325                     dst: dst.into(),
2326                     fault: Dst::None,
2327                     image_dim: dim,
2328                     mem_order,
2329                     mem_eviction_priority: self
2330                         .get_eviction_priority(intrin.access()),
2331                     mask: (1 << comps) - 1,
2332                     handle: handle,
2333                     coord: coord,
2334                 });
2335                 self.set_dst(&intrin.def, dst);
2336             }
2337             nir_intrinsic_bindless_image_sparse_load => {
2338                 let handle = self.get_src(&srcs[0]);
2339                 let dim = self.get_image_dim(intrin);
2340                 let coord = self.get_image_coord(intrin, dim);
2341                 // let sample = self.get_src(&srcs[2]);
2342 
2343                 let mem_order = if (intrin.access() & ACCESS_CAN_REORDER) != 0 {
2344                     if self.sm.sm() >= 80 {
2345                         MemOrder::Constant
2346                     } else {
2347                         MemOrder::Weak
2348                     }
2349                 } else {
2350                     MemOrder::Strong(MemScope::System)
2351                 };
2352 
2353                 let comps = intrin.num_components;
2354                 assert!(intrin.def.bit_size() == 32);
2355                 assert!(comps == 5);
2356 
2357                 let dst = b.alloc_ssa(RegFile::GPR, comps - 1);
2358                 let fault = b.alloc_ssa(RegFile::Pred, 1);
2359 
2360                 b.push_op(OpSuLd {
2361                     dst: dst.into(),
2362                     fault: fault.into(),
2363                     image_dim: dim,
2364                     mem_order,
2365                     mem_eviction_priority: self
2366                         .get_eviction_priority(intrin.access()),
2367                     mask: (1 << (comps - 1)) - 1,
2368                     handle: handle,
2369                     coord: coord,
2370                 });
2371 
2372                 let mut final_dst = Vec::new();
2373                 for i in 0..usize::from(comps) - 1 {
2374                     final_dst.push(dst[i]);
2375                 }
2376                 final_dst.push(b.sel(fault.into(), 0.into(), 1.into())[0]);
2377 
2378                 self.set_ssa(&intrin.def, final_dst);
2379             }
2380             nir_intrinsic_bindless_image_store => {
2381                 let handle = self.get_src(&srcs[0]);
2382                 let dim = self.get_image_dim(intrin);
2383                 let coord = self.get_image_coord(intrin, dim);
2384                 // let sample = self.get_src(&srcs[2]);
2385                 let data = self.get_src(&srcs[3]);
2386 
2387                 let comps = intrin.num_components;
2388                 assert!(srcs[3].bit_size() == 32);
2389                 assert!(comps == 1 || comps == 2 || comps == 4);
2390 
2391                 b.push_op(OpSuSt {
2392                     image_dim: dim,
2393                     mem_order: MemOrder::Strong(MemScope::System),
2394                     mem_eviction_priority: self
2395                         .get_eviction_priority(intrin.access()),
2396                     mask: (1 << comps) - 1,
2397                     handle: handle,
2398                     coord: coord,
2399                     data: data,
2400                 });
2401             }
2402             nir_intrinsic_copy_fs_outputs_nv => {
2403                 let ShaderIoInfo::Fragment(info) = &mut self.info.io else {
2404                     panic!(
2405                         "copy_fs_outputs_nv is only allowed in fragment shaders"
2406                     );
2407                 };
2408 
2409                 for i in 0..32 {
2410                     if !self.fs_out_regs[i].is_none() {
2411                         info.writes_color |= 1 << i;
2412                     }
2413                 }
2414                 let mask_idx = (NAK_FS_OUT_SAMPLE_MASK / 4) as usize;
2415                 info.writes_sample_mask = !self.fs_out_regs[mask_idx].is_none();
2416                 let depth_idx = (NAK_FS_OUT_DEPTH / 4) as usize;
2417                 info.writes_depth = !self.fs_out_regs[depth_idx].is_none();
2418 
2419                 let mut srcs = Vec::new();
2420                 for i in 0..8 {
2421                     // Even though the mask is per-component, the actual output
2422                     // space is per-output vec4s.
2423                     if info.writes_color & (0xf << (i * 4)) != 0 {
2424                         for c in 0..4 {
2425                             let reg = self.fs_out_regs[i * 4 + c];
2426                             if reg.is_none() {
2427                                 srcs.push(b.undef().into());
2428                             } else {
2429                                 srcs.push(reg.into());
2430                             }
2431                         }
2432                     }
2433                 }
2434 
2435                 // These always come together for some reason
2436                 if info.writes_sample_mask || info.writes_depth {
2437                     if info.writes_sample_mask {
2438                         srcs.push(self.fs_out_regs[mask_idx].into());
2439                     } else {
2440                         srcs.push(b.undef().into());
2441                     }
2442                     if info.writes_depth {
2443                         srcs.push(self.fs_out_regs[depth_idx].into());
2444                     }
2445                 }
2446 
2447                 b.push_op(OpRegOut { srcs: srcs });
2448             }
2449             nir_intrinsic_demote => {
2450                 if let ShaderStageInfo::Fragment(info) = &mut self.info.stage {
2451                     info.uses_kill = true;
2452                 } else {
2453                     panic!("OpKill is only available in fragment shaders");
2454                 }
2455                 b.push_op(OpKill {});
2456             }
2457             nir_intrinsic_demote_if => {
2458                 if let ShaderStageInfo::Fragment(info) = &mut self.info.stage {
2459                     info.uses_kill = true;
2460                 } else {
2461                     panic!("OpKill is only available in fragment shaders");
2462                 }
2463                 let cond = self.get_ssa(srcs[0].as_def())[0];
2464                 b.predicate(cond.into()).push_op(OpKill {});
2465             }
2466             nir_intrinsic_global_atomic => {
2467                 let bit_size = intrin.def.bit_size();
2468                 let (addr, offset) = self.get_io_addr_offset(&srcs[0], 24);
2469                 let data = self.get_src(&srcs[1]);
2470                 let atom_type = self.get_atomic_type(intrin);
2471                 let atom_op = self.get_atomic_op(intrin, AtomCmpSrc::Separate);
2472 
2473                 assert!(intrin.def.num_components() == 1);
2474                 let dst = b.alloc_ssa(RegFile::GPR, bit_size.div_ceil(32));
2475 
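                     // A never-read result lets this be encoded as a
                     // reduction (no return value) rather than a full atomic.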
2476                 let is_reduction =
2477                     atom_op.is_reduction() && intrin.def.components_read() == 0;
2478 
2479                 b.push_op(OpAtom {
2480                     dst: if is_reduction { Dst::None } else { dst.into() },
2481                     addr: addr,
2482                     cmpr: 0.into(),
2483                     data: data,
2484                     atom_op: atom_op,
2485                     atom_type: atom_type,
2486                     addr_offset: offset,
2487                     mem_space: MemSpace::Global(MemAddrType::A64),
2488                     mem_order: MemOrder::Strong(MemScope::System),
2489                     mem_eviction_priority: MemEvictionPriority::Normal, // Note: no intrinsic access
2490                 });
2491                 self.set_dst(&intrin.def, dst);
2492             }
2493             nir_intrinsic_global_atomic_swap => {
2494                 assert!(intrin.atomic_op() == nir_atomic_op_cmpxchg);
2495                 let bit_size = intrin.def.bit_size();
2496                 let (addr, offset) = self.get_io_addr_offset(&srcs[0], 24);
2497                 let cmpr = self.get_src(&srcs[1]);
2498                 let data = self.get_src(&srcs[2]);
2499                 let atom_type = AtomType::U(bit_size);
2500 
2501                 assert!(intrin.def.num_components() == 1);
2502                 let dst = b.alloc_ssa(RegFile::GPR, bit_size.div_ceil(32));
2503 
2504                 b.push_op(OpAtom {
2505                     dst: dst.into(),
2506                     addr: addr,
2507                     cmpr: cmpr,
2508                     data: data,
2509                     atom_op: AtomOp::CmpExch(AtomCmpSrc::Separate),
2510                     atom_type: atom_type,
2511                     addr_offset: offset,
2512                     mem_space: MemSpace::Global(MemAddrType::A64),
2513                     mem_order: MemOrder::Strong(MemScope::System),
2514                     mem_eviction_priority: MemEvictionPriority::Normal, // Note: no intrinsic access
                });
                self.set_dst(&intrin.def, dst);
            }
            nir_intrinsic_ipa_nv => {
                let addr = u16::try_from(intrin.base()).unwrap();

                let flags = intrin.flags();
                let flags: nak_nir_ipa_flags =
                    unsafe { std::mem::transmute_copy(&flags) };

                let mode = match flags.interp_mode() {
                    NAK_INTERP_MODE_PERSPECTIVE => PixelImap::Perspective,
                    NAK_INTERP_MODE_SCREEN_LINEAR => PixelImap::ScreenLinear,
                    NAK_INTERP_MODE_CONSTANT => PixelImap::Constant,
                    _ => panic!("Unsupported interp mode"),
                };

                let freq = match flags.interp_freq() {
                    NAK_INTERP_FREQ_PASS => InterpFreq::Pass,
                    NAK_INTERP_FREQ_PASS_MUL_W => InterpFreq::PassMulW,
                    NAK_INTERP_FREQ_CONSTANT => InterpFreq::Constant,
                    NAK_INTERP_FREQ_STATE => InterpFreq::State,
                    _ => panic!("Invalid interp freq"),
                };

                let loc = match flags.interp_loc() {
                    NAK_INTERP_LOC_DEFAULT => InterpLoc::Default,
                    NAK_INTERP_LOC_CENTROID => InterpLoc::Centroid,
                    NAK_INTERP_LOC_OFFSET => InterpLoc::Offset,
                    _ => panic!("Invalid interp loc"),
                };

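                // Src 0 carries 1/W for PassMulW interpolation (presumably so
                // the hardware can apply the perspective correction) and
                // src 1 carries the sample offset for offset interpolation.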
                let inv_w = if freq == InterpFreq::PassMulW {
                    self.get_src(&srcs[0])
                } else {
                    0.into()
                };

                let offset = if loc == InterpLoc::Offset {
                    self.get_src(&srcs[1])
                } else {
                    0.into()
                };

                let ShaderIoInfo::Fragment(io) = &mut self.info.io else {
                    panic!("OpIpa is only used for fragment shaders");
                };

                io.mark_attr_read(addr, mode);

                let dst = b.alloc_ssa(RegFile::GPR, 1);
                b.push_op(OpIpa {
                    dst: dst.into(),
                    addr: addr,
                    freq: freq,
                    loc: loc,
                    inv_w: inv_w,
                    offset: offset,
                });
                self.set_dst(&intrin.def, dst);
            }
            nir_intrinsic_isberd_nv => {
                let dst = b.alloc_ssa(RegFile::GPR, 1);
                b.push_op(OpIsberd {
                    dst: dst.into(),
                    idx: self.get_src(&srcs[0]),
                });
                self.set_dst(&intrin.def, dst);
            }
            nir_intrinsic_load_barycentric_at_offset_nv => (),
            nir_intrinsic_load_barycentric_centroid => (),
            nir_intrinsic_load_barycentric_pixel => (),
            nir_intrinsic_load_barycentric_sample => (),
            nir_intrinsic_load_global | nir_intrinsic_load_global_constant => {
                let size_B =
                    (intrin.def.bit_size() / 8) * intrin.def.num_components();
                assert!(u32::from(size_B) <= intrin.align());
                let order = if intrin.intrinsic
                    == nir_intrinsic_load_global_constant
                    || (intrin.access() & ACCESS_CAN_REORDER) != 0
                {
                    MemOrder::Constant
                } else {
                    MemOrder::Strong(MemScope::System)
                };
                let access = MemAccess {
                    mem_type: MemType::from_size(size_B, false),
                    space: MemSpace::Global(MemAddrType::A64),
                    order: order,
                    eviction_priority: self
                        .get_eviction_priority(intrin.access()),
                };
                let (addr, offset) = self.get_io_addr_offset(&srcs[0], 24);
                let dst = b.alloc_ssa(RegFile::GPR, size_B.div_ceil(4));

                b.push_op(OpLd {
                    dst: dst.into(),
                    addr: addr,
                    offset: offset,
                    access: access,
                });
                self.set_dst(&intrin.def, dst);
            }
            nir_intrinsic_ldtram_nv => {
                let ShaderIoInfo::Fragment(io) = &mut self.info.io else {
                    panic!("ldtram_nv is only used for fragment shaders");
                };

                assert!(
                    intrin.def.bit_size() == 32
                        && intrin.def.num_components == 2
                );

                let flags = intrin.flags();
                let use_c = flags != 0;

                let addr = u16::try_from(intrin.base()).unwrap();

                io.mark_barycentric_attr_in(addr);

                let dst = b.alloc_ssa(RegFile::GPR, 2);
                b.push_op(OpLdTram {
                    dst: dst.into(),
                    addr,
                    use_c,
                });
                self.set_dst(&intrin.def, dst);
            }
            nir_intrinsic_load_sample_id => {
                let dst = b.alloc_ssa(RegFile::GPR, 1);
                b.push_op(OpPixLd {
                    dst: dst.into(),
                    val: PixVal::MyIndex,
                });
                self.set_dst(&intrin.def, dst);
            }
            nir_intrinsic_load_sample_mask_in => {
                if let ShaderIoInfo::Fragment(info) = &mut self.info.io {
                    info.reads_sample_mask = true;
                } else {
                    panic!(
                        "sample_mask_in is only available in fragment shaders"
                    );
                }

                let dst = b.alloc_ssa(RegFile::GPR, 1);
                b.push_op(OpPixLd {
                    dst: dst.into(),
                    val: PixVal::CovMask,
                });
                self.set_dst(&intrin.def, dst);
            }
            nir_intrinsic_load_tess_coord_xy => {
                // Loading gl_TessCoord in tessellation evaluation shaders is
                // weird.  It's treated as a per-vertex output which is indexed
                // by LANEID.
                match &self.info.stage {
                    ShaderStageInfo::Tessellation(_) => (),
                    _ => panic!(
                        "load_tess_coord is only available in tessellation \
                         shaders"
                    ),
                };

                assert!(intrin.def.bit_size() == 32);
                assert!(intrin.def.num_components() == 2);

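                // System register 0 is the lane ID, so vtx holds this
                // invocation's lane index for the attribute read below.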
                let vtx = b.alloc_ssa(RegFile::GPR, 1);
                b.push_op(OpS2R {
                    dst: vtx.into(),
                    idx: 0,
                });

                let access = AttrAccess {
                    addr: NAK_ATTR_TESS_COORD,
                    comps: 2,
                    patch: false,
                    output: true,
                    phys: false,
                };

                // This is recorded as a patch output in parse_shader() because
                // the hardware requires it be in the SPH, whether we use it or
                // not.

                let dst = b.alloc_ssa(RegFile::GPR, access.comps);
                b.push_op(OpALd {
                    dst: dst.into(),
                    vtx: vtx.into(),
                    offset: 0.into(),
                    access: access,
                });
                self.set_dst(&intrin.def, dst);
            }
            nir_intrinsic_load_scratch => {
                let size_B =
                    (intrin.def.bit_size() / 8) * intrin.def.num_components();
                assert!(u32::from(size_B) <= intrin.align());
                let access = MemAccess {
                    mem_type: MemType::from_size(size_B, false),
                    space: MemSpace::Local,
                    order: MemOrder::Strong(MemScope::CTA),
                    eviction_priority: MemEvictionPriority::Normal,
                };
                let (addr, offset) = self.get_io_addr_offset(&srcs[0], 24);
                let dst = b.alloc_ssa(RegFile::GPR, size_B.div_ceil(4));

                b.push_op(OpLd {
                    dst: dst.into(),
                    addr: addr,
                    offset: offset,
                    access: access,
                });
                self.set_dst(&intrin.def, dst);
            }
            nir_intrinsic_load_shared => {
                let size_B =
                    (intrin.def.bit_size() / 8) * intrin.def.num_components();
                assert!(u32::from(size_B) <= intrin.align());
                let access = MemAccess {
                    mem_type: MemType::from_size(size_B, false),
                    space: MemSpace::Shared,
                    order: MemOrder::Strong(MemScope::CTA),
                    eviction_priority: MemEvictionPriority::Normal,
                };
                let (addr, offset) = self.get_io_addr_offset(&srcs[0], 24);
                let offset = offset + intrin.base();
                let dst = b.alloc_ssa(RegFile::GPR, size_B.div_ceil(4));

                b.push_op(OpLd {
                    dst: dst.into(),
                    addr: addr,
                    offset: offset,
                    access: access,
                });
                self.set_dst(&intrin.def, dst);
            }
            nir_intrinsic_load_sysval_nv => {
                let idx = u8::try_from(intrin.base()).unwrap();
                debug_assert!(intrin.def.num_components == 1);
                debug_assert!(
                    intrin.def.bit_size == 32 || intrin.def.bit_size == 64
                );
                let comps = intrin.def.bit_size / 32;
                let dst = b.alloc_ssa(RegFile::GPR, comps);
                if idx == NAK_SV_CLOCK || idx == NAK_SV_CLOCK + 1 {
                    debug_assert!(idx + comps <= NAK_SV_CLOCK + 2);
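                    // SR_CLOCK is a 64-bit counter; CS2R is used here since,
                    // unlike the plain S2R below, it can read more than 32
                    // bits at a time.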
                    b.push_op(OpCS2R {
                        dst: dst.into(),
                        idx: idx,
                    });
                } else {
                    debug_assert!(intrin.def.bit_size == 32);
                    b.push_op(OpS2R {
                        dst: dst.into(),
                        idx: idx,
                    });
                }
                self.set_dst(&intrin.def, dst);
            }
            nir_intrinsic_ldc_nv => {
                let size_B =
                    (intrin.def.bit_size() / 8) * intrin.def.num_components();
                let idx = &srcs[0];

                let (off, off_imm) = self.get_cbuf_addr_offset(&srcs[1]);

                let dst = b.alloc_ssa(RegFile::GPR, size_B.div_ceil(4));

                if let Some(idx_imm) = idx.as_uint() {
                    let idx_imm: u8 = idx_imm.try_into().unwrap();
                    let cb = CBufRef {
                        buf: CBuf::Binding(idx_imm),
                        offset: off_imm,
                    };
                    if off.is_zero() {
                        for (i, comp) in dst.iter().enumerate() {
                            let i = u16::try_from(i).unwrap();
                            b.copy_to((*comp).into(), cb.offset(i * 4).into());
                        }
                    } else {
                        b.push_op(OpLdc {
                            dst: dst.into(),
                            cb: cb.into(),
                            offset: off,
                            mode: LdcMode::Indexed,
                            mem_type: MemType::from_size(size_B, false),
                        });
                    }
                } else {
                    // In the IndexedSegmented mode, the hardware computes the
                    // actual index and offset as follows:
                    //
                    //    idx = imm_idx + reg[31:16]
                    //    offset = imm_offset + reg[15:0]
                    //    ldc c[idx][offset]
                    //
                    // So pack the index and offset accordingly.
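                    // For example (hypothetical values): with imm_idx = 1,
                    // imm_offset = 0x20, and reg = 0x0002_0010, the hardware
                    // loads c[1 + 2][0x20 + 0x10] = c[3][0x30].  The prmt
                    // below builds reg by packing the low half of idx into
                    // bits [31:16] and the low half of off into bits [15:0].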
                    let idx = self.get_src(idx);
                    let off_idx = b.prmt(off, idx, [0, 1, 4, 5]);
                    let cb = CBufRef {
                        buf: CBuf::Binding(0),
                        offset: off_imm,
                    };
                    b.push_op(OpLdc {
                        dst: dst.into(),
                        cb: cb.into(),
                        offset: off_idx.into(),
                        mode: LdcMode::IndexedSegmented,
                        mem_type: MemType::from_size(size_B, false),
                    });
                }
                self.set_dst(&intrin.def, dst);
            }
            nir_intrinsic_ldcx_nv => {
                let size_B =
                    (intrin.def.bit_size() / 8) * intrin.def.num_components();

                let handle = self.get_ssa_ref(&srcs[0]);
                let (off, off_imm) = self.get_cbuf_addr_offset(&srcs[1]);

                let cb = CBufRef {
                    buf: CBuf::BindlessSSA(handle),
                    offset: off_imm,
                };

                let dst = b.alloc_ssa(RegFile::GPR, size_B.div_ceil(4));
                if off.is_zero() {
                    for (i, comp) in dst.iter().enumerate() {
                        let i = u16::try_from(i).unwrap();
                        b.copy_to((*comp).into(), cb.offset(i * 4).into());
                    }
                } else {
                    b.push_op(OpLdc {
                        dst: dst.into(),
                        cb: cb.into(),
                        offset: off,
                        mode: LdcMode::Indexed,
                        mem_type: MemType::from_size(size_B, false),
                    });
                }
                self.set_dst(&intrin.def, dst);
            }
            nir_intrinsic_pin_cx_handle_nv => {
                let handle = self.get_ssa_ref(&srcs[0]);
                b.push_op(OpPin {
                    src: handle.into(),
                    dst: handle.into(),
                });
            }
            nir_intrinsic_unpin_cx_handle_nv => {
                let handle = self.get_ssa_ref(&srcs[0]);
                b.push_op(OpUnpin {
                    src: handle.into(),
                    dst: handle.into(),
                });
            }
            nir_intrinsic_barrier => {
                let modes = intrin.memory_modes();
                let semantics = intrin.memory_semantics();
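                // For instance, a GLSL barrier() in a compute shader usually
                // shows up here with SCOPE_WORKGROUP execution scope plus
                // acquire/release semantics on shared memory.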
                if (modes & nir_var_mem_global) != 0
                    && (semantics & NIR_MEMORY_RELEASE) != 0
                {
                    // Pre-Volta doesn't have WBAll but it also seems that we
                    // don't need it.
                    if self.sm.sm() >= 70 {
                        b.push_op(OpCCtl {
                            op: CCtlOp::WBAll,
                            mem_space: MemSpace::Global(MemAddrType::A64),
                            addr: 0.into(),
                            addr_offset: 0,
                        });
                    }
                }
                match intrin.execution_scope() {
                    SCOPE_NONE => (),
                    SCOPE_WORKGROUP => {
                        assert!(
                            self.nir.info.stage() == MESA_SHADER_COMPUTE
                                || self.nir.info.stage() == MESA_SHADER_KERNEL
                        );
                        self.info.num_control_barriers = 1;
                        b.push_op(OpBar {});
                    }
                    _ => panic!("Unhandled execution scope"),
                }
                if intrin.memory_scope() != SCOPE_NONE {
                    let mem_scope = match intrin.memory_scope() {
                        SCOPE_INVOCATION | SCOPE_SUBGROUP => MemScope::CTA,
                        SCOPE_WORKGROUP | SCOPE_QUEUE_FAMILY | SCOPE_DEVICE => {
                            MemScope::GPU
                        }
                        _ => panic!("Unhandled memory scope"),
                    };
                    b.push_op(OpMemBar { scope: mem_scope });
                }
                if (modes & nir_var_mem_global) != 0
                    && (semantics & NIR_MEMORY_ACQUIRE) != 0
                {
                    b.push_op(OpCCtl {
                        op: CCtlOp::IVAll,
                        mem_space: MemSpace::Global(MemAddrType::A64),
                        addr: 0.into(),
                        addr_offset: 0,
                    });
                }
            }
            nir_intrinsic_quad_broadcast
            | nir_intrinsic_read_invocation
            | nir_intrinsic_shuffle
            | nir_intrinsic_shuffle_down
            | nir_intrinsic_shuffle_up
            | nir_intrinsic_shuffle_xor => {
                assert!(srcs[0].bit_size() == 32);
                assert!(srcs[0].num_components() == 1);
                let data = self.get_src(&srcs[0]);

                assert!(srcs[1].bit_size() == 32);
                let idx = self.get_src(&srcs[1]);

                assert!(intrin.def.bit_size() == 32);
                let dst = b.alloc_ssa(RegFile::GPR, 1);

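                // The c source follows the usual SHFL encoding: bits [4:0]
                // are the clamp lane and bits [12:8] the segment mask.
                // 0x1c_03 (mask 0x1c, clamp 3) confines quad_broadcast to a
                // 4-lane quad while 0x1f allows indexing the whole 32-lane
                // warp.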
                b.push_op(OpShfl {
                    dst: dst.into(),
                    in_bounds: Dst::None,
                    src: data,
                    lane: idx,
                    c: match intrin.intrinsic {
                        nir_intrinsic_quad_broadcast => 0x1c_03.into(),
                        nir_intrinsic_shuffle_up => 0.into(),
                        _ => 0x1f.into(),
                    },
                    op: match intrin.intrinsic {
                        nir_intrinsic_shuffle_down => ShflOp::Down,
                        nir_intrinsic_shuffle_up => ShflOp::Up,
                        nir_intrinsic_shuffle_xor => ShflOp::Bfly,
                        _ => ShflOp::Idx,
                    },
                });
                self.set_dst(&intrin.def, dst);
            }
            nir_intrinsic_quad_swap_horizontal
            | nir_intrinsic_quad_swap_vertical
            | nir_intrinsic_quad_swap_diagonal => {
                assert!(srcs[0].bit_size() == 32);
                assert!(srcs[0].num_components() == 1);
                let data = self.get_src(&srcs[0]);

                assert!(intrin.def.bit_size() == 32);
                let dst = b.alloc_ssa(RegFile::GPR, 1);
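                // A butterfly shuffle XORs the lane index: within a quad,
                // lane ^ 1 is the horizontal neighbor, lane ^ 2 the vertical
                // one, and lane ^ 3 the diagonal one.  0x1c_03 again limits
                // the shuffle to the quad.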
                b.push_op(OpShfl {
                    dst: dst.into(),
                    in_bounds: Dst::None,
                    src: data,
                    lane: match intrin.intrinsic {
                        nir_intrinsic_quad_swap_horizontal => 1_u32.into(),
                        nir_intrinsic_quad_swap_vertical => 2_u32.into(),
                        nir_intrinsic_quad_swap_diagonal => 3_u32.into(),
                        op => panic!("Unknown quad intrinsic {}", op),
                    },
                    c: 0x1c_03.into(),
                    op: ShflOp::Bfly,
                });
                self.set_dst(&intrin.def, dst);
            }
            nir_intrinsic_shared_atomic => {
                let bit_size = intrin.def.bit_size();
                let (addr, offset) = self.get_io_addr_offset(&srcs[0], 24);
                let data = self.get_src(&srcs[1]);
                let atom_type = self.get_atomic_type(intrin);
                let atom_op = self.get_atomic_op(intrin, AtomCmpSrc::Separate);

                assert!(intrin.def.num_components() == 1);
                let dst = b.alloc_ssa(RegFile::GPR, bit_size.div_ceil(32));

                b.push_op(OpAtom {
                    dst: dst.into(),
                    addr: addr,
                    cmpr: 0.into(),
                    data: data,
                    atom_op: atom_op,
                    atom_type: atom_type,
                    addr_offset: offset,
                    mem_space: MemSpace::Shared,
                    mem_order: MemOrder::Strong(MemScope::CTA),
                    mem_eviction_priority: MemEvictionPriority::Normal,
                });
                self.set_dst(&intrin.def, dst);
            }
            nir_intrinsic_shared_atomic_swap => {
                assert!(intrin.atomic_op() == nir_atomic_op_cmpxchg);
                let bit_size = intrin.def.bit_size();
                let (addr, offset) = self.get_io_addr_offset(&srcs[0], 24);
                let cmpr = self.get_src(&srcs[1]);
                let data = self.get_src(&srcs[2]);
                let atom_type = AtomType::U(bit_size);

                assert!(intrin.def.num_components() == 1);
                let dst = b.alloc_ssa(RegFile::GPR, bit_size.div_ceil(32));

                b.push_op(OpAtom {
                    dst: dst.into(),
                    addr: addr,
                    cmpr: cmpr,
                    data: data,
                    atom_op: AtomOp::CmpExch(AtomCmpSrc::Separate),
                    atom_type: atom_type,
                    addr_offset: offset,
                    mem_space: MemSpace::Shared,
                    mem_order: MemOrder::Strong(MemScope::CTA),
                    mem_eviction_priority: MemEvictionPriority::Normal,
                });
                self.set_dst(&intrin.def, dst);
            }
            nir_intrinsic_ssa_bar_nv => {
                let src = self.get_src(&srcs[0]);
                b.push_op(OpSrcBar { src });
            }
            nir_intrinsic_store_global => {
                let data = self.get_src(&srcs[0]);
                let size_B =
                    (srcs[0].bit_size() / 8) * srcs[0].num_components();
                assert!(u32::from(size_B) <= intrin.align());
                let access = MemAccess {
                    mem_type: MemType::from_size(size_B, false),
                    space: MemSpace::Global(MemAddrType::A64),
                    order: MemOrder::Strong(MemScope::System),
                    eviction_priority: self
                        .get_eviction_priority(intrin.access()),
                };
                let (addr, offset) = self.get_io_addr_offset(&srcs[1], 24);

                b.push_op(OpSt {
                    addr: addr,
                    data: data,
                    offset: offset,
                    access: access,
                });
            }
            nir_intrinsic_fs_out_nv => {
                let data = self.get_ssa(srcs[0].as_def());
                assert!(data.len() == 1);
                let data = data[0];

                let addr = u16::try_from(intrin.base()).unwrap();
                assert!(addr % 4 == 0);

                self.fs_out_regs[usize::from(addr / 4)] = data;
            }
            nir_intrinsic_store_scratch => {
                let data = self.get_src(&srcs[0]);
                let size_B =
                    (srcs[0].bit_size() / 8) * srcs[0].num_components();
                assert!(u32::from(size_B) <= intrin.align());
                let access = MemAccess {
                    mem_type: MemType::from_size(size_B, false),
                    space: MemSpace::Local,
                    order: MemOrder::Strong(MemScope::CTA),
                    eviction_priority: MemEvictionPriority::Normal,
                };
                let (addr, offset) = self.get_io_addr_offset(&srcs[1], 24);

                b.push_op(OpSt {
                    addr: addr,
                    data: data,
                    offset: offset,
                    access: access,
                });
            }
            nir_intrinsic_store_shared => {
                let data = self.get_src(&srcs[0]);
                let size_B =
                    (srcs[0].bit_size() / 8) * srcs[0].num_components();
                assert!(u32::from(size_B) <= intrin.align());
                let access = MemAccess {
                    mem_type: MemType::from_size(size_B, false),
                    space: MemSpace::Shared,
                    order: MemOrder::Strong(MemScope::CTA),
                    eviction_priority: MemEvictionPriority::Normal,
                };
                let (addr, offset) = self.get_io_addr_offset(&srcs[1], 24);
                let offset = offset + intrin.base();

                b.push_op(OpSt {
                    addr: addr,
                    data: data,
                    offset: offset,
                    access: access,
                });
            }
            nir_intrinsic_emit_vertex_nv | nir_intrinsic_end_primitive_nv => {
                assert!(intrin.def.bit_size() == 32);
                assert!(intrin.def.num_components() == 1);

                let dst = b.alloc_ssa(RegFile::GPR, 1);
                let handle = self.get_src(&srcs[0]);
                let stream_id = intrin.stream_id();

                b.push_op(OpOut {
                    dst: dst.into(),
                    handle: handle,
                    stream: stream_id.into(),
                    out_type: if intrin.intrinsic
                        == nir_intrinsic_emit_vertex_nv
                    {
                        OutType::Emit
                    } else {
                        OutType::Cut
                    },
                });
                self.set_dst(&intrin.def, dst);
            }

            nir_intrinsic_final_primitive_nv => {
                let handle = self.get_src(&srcs[0]);

                if self.sm.sm() >= 70 {
                    b.push_op(OpOutFinal { handle: handle });
                } else {
                    b.push_op(OpRegOut { srcs: vec![handle] });
                }
            }
            nir_intrinsic_vote_all
            | nir_intrinsic_vote_any
            | nir_intrinsic_vote_ieq => {
                assert!(srcs[0].bit_size() == 1);
                let src = self.get_src(&srcs[0]);

                assert!(intrin.def.bit_size() == 1);
                let dst = b.alloc_ssa(RegFile::Pred, 1);

                b.push_op(OpVote {
                    op: match intrin.intrinsic {
                        nir_intrinsic_vote_all => VoteOp::All,
                        nir_intrinsic_vote_any => VoteOp::Any,
                        nir_intrinsic_vote_ieq => VoteOp::Eq,
                        _ => panic!("Unknown vote intrinsic"),
                    },
                    ballot: Dst::None,
                    vote: dst.into(),
                    pred: src,
                });
                self.set_dst(&intrin.def, dst);
            }
            nir_intrinsic_is_sparse_texels_resident => {
                let src = self.get_src(&srcs[0]);
                let dst = b.isetp(IntCmpType::I32, IntCmpOp::Ne, src, 0.into());
                self.set_dst(&intrin.def, dst);
            }
            _ => panic!(
                "Unsupported intrinsic instruction: {}",
                intrin.info().name()
            ),
        }
    }

    fn parse_load_const(
        &mut self,
        b: &mut impl SSABuilder,
        load_const: &nir_load_const_instr,
    ) {
        let values = &load_const.values();

        let mut dst = Vec::new();
        match load_const.def.bit_size {
            1 => {
                for c in 0..load_const.def.num_components {
                    let imm_b1 = unsafe { values[usize::from(c)].b };
                    dst.push(b.copy(imm_b1.into())[0]);
                }
            }
            8 => {
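                // Pack up to four 8-bit components into each 32-bit
                // immediate, little-endian: e.g. the (hypothetical)
                // components [0x11, 0x22, 0x33] become 0x0033_2211.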
                for dw in 0..load_const.def.num_components.div_ceil(4) {
                    let mut imm_u32 = 0;
                    for b in 0..4 {
                        let c = dw * 4 + b;
                        if c < load_const.def.num_components {
                            let imm_u8 = unsafe { values[usize::from(c)].u8_ };
                            imm_u32 |= u32::from(imm_u8) << (b * 8);
                        }
                    }
                    dst.push(b.copy(imm_u32.into())[0]);
                }
            }
            16 => {
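                // Same idea as the 8-bit case: two 16-bit components per
                // 32-bit immediate, with the even component in the low half.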
                for dw in 0..load_const.def.num_components.div_ceil(2) {
                    let mut imm_u32 = 0;
                    for w in 0..2 {
                        let c = dw * 2 + w;
                        if c < load_const.def.num_components {
                            let imm_u16 =
                                unsafe { values[usize::from(c)].u16_ };
                            imm_u32 |= u32::from(imm_u16) << (w * 16);
                        }
                    }
                    dst.push(b.copy(imm_u32.into())[0]);
                }
            }
            32 => {
                for c in 0..load_const.def.num_components {
                    let imm_u32 = unsafe { values[usize::from(c)].u32_ };
                    dst.push(b.copy(imm_u32.into())[0]);
                }
            }
            64 => {
                for c in 0..load_const.def.num_components {
                    let imm_u64 = unsafe { values[usize::from(c)].u64_ };
                    dst.push(b.copy((imm_u64 as u32).into())[0]);
                    dst.push(b.copy(((imm_u64 >> 32) as u32).into())[0]);
                }
            }
            _ => panic!("Unknown bit size: {}", load_const.def.bit_size),
        }

        self.set_ssa(&load_const.def, dst);
    }

    fn parse_undef(
        &mut self,
        b: &mut impl SSABuilder,
        undef: &nir_undef_instr,
    ) {
        let dst = alloc_ssa_for_nir(b, &undef.def);
        for c in &dst {
            b.push_op(OpUndef { dst: (*c).into() });
        }
        self.set_ssa(&undef.def, dst);
    }

    fn emit_jump(
        &mut self,
        b: &mut impl SSABuilder,
        nb: &nir_block,
        target: &nir_block,
    ) {
        if target.index == self.end_block_id {
            b.push_op(OpExit {});
        } else {
            self.cfg.add_edge(nb.index, target.index);
            let target_label = self.get_block_label(target);

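            // On pre-Volta hardware, structured control flow reconverges via
            // the CRS stack: if this edge was announced with SSY, PBK, or
            // PCNT (pushed in parse_block), it must be taken with SYNC, BRK,
            // or CONT respectively instead of a plain branch.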
            match self.peek_crs(target) {
                Some(SyncType::Sync) => {
                    b.push_op(OpSync {
                        target: target_label,
                    });
                }
                Some(SyncType::Brk) => {
                    b.push_op(OpBrk {
                        target: target_label,
                    });
                }
                Some(SyncType::Cont) => {
                    b.push_op(OpCont {
                        target: target_label,
                    });
                }
                None => {
                    b.push_op(OpBra {
                        target: target_label,
                    });
                }
            }
        }
    }

    fn emit_pred_jump(
        &mut self,
        b: &mut impl SSABuilder,
        nb: &nir_block,
        pred: Pred,
        target: &nir_block,
        fallthrough: &nir_block,
    ) {
        // The fall-through edge has to come first.
        self.cfg.add_edge(nb.index, fallthrough.index);
        let op = if target.index == self.end_block_id {
            Op::Exit(OpExit {})
        } else {
            self.cfg.add_edge(nb.index, target.index);
            Op::Bra(OpBra {
                target: self.get_block_label(target),
            })
        };
        b.predicate(pred).push_op(op);
    }

    fn parse_block(
        &mut self,
        ssa_alloc: &mut SSAValueAllocator,
        phi_map: &mut PhiAllocMap,
        nb: &nir_block,
    ) {
        let sm = self.sm;
        let mut b = SSAInstrBuilder::new(sm, ssa_alloc);

        if self.sm.sm() >= 70 && nb.index == 0 && self.nir.info.shared_size > 0
        {
            // The blob seems to always do a BSYNC before accessing shared
            // memory.  Perhaps this is to ensure that our allocation is
            // actually available and not in use by another thread?
            let label = self.label_alloc.alloc();
            let bar_clear = b.alloc_ssa(RegFile::Bar, 1);

            b.push_op(OpBClear {
                dst: bar_clear.into(),
            });

            let bar = b.alloc_ssa(RegFile::Bar, 1);
            b.push_op(OpBSSy {
                bar_out: bar.into(),
                bar_in: bar_clear.into(),
                cond: SrcRef::True.into(),
                target: label,
            });

            b.push_op(OpBSync {
                bar: bar.into(),
                cond: SrcRef::True.into(),
            });

            b.push_op(OpNop { label: Some(label) });
        }

        let mut phi = OpPhiDsts::new();
        for ni in nb.iter_instr_list() {
            let Some(np) = ni.as_phi() else {
                break;
            };

            if DEBUG.annotate() {
                let annotation = self
                    .nir_instr_printer
                    .instr_to_string(ni)
                    .unwrap()
                    .split_whitespace()
                    .collect::<Vec<_>>()
                    .join(" ");
                b.push_op(OpAnnotate {
                    annotation: format!("generated by \"{}\"", annotation),
                });
            }

            let uniform = !nb.divergent
                && self.sm.sm() >= 75
                && !DEBUG.no_ugpr()
                && !np.def.divergent;

            // This should be ensured by nak_nir_lower_cf().
            if uniform {
                for ps in np.iter_srcs() {
                    assert!(!ps.pred().divergent);
                }
            }

            let mut b = UniformBuilder::new(&mut b, uniform);
            let dst = alloc_ssa_for_nir(&mut b, np.def.as_def());
            for i in 0..dst.len() {
                let phi_id = phi_map.get_phi_id(np, i.try_into().unwrap());
                phi.dsts.push(phi_id, dst[i].into());
            }
            self.set_ssa(np.def.as_def(), dst);
        }

        if !phi.dsts.is_empty() {
            b.push_op(phi);
        }

        if self.sm.sm() < 75 && nb.cf_node.prev().is_none() {
            if let Some(_) = nb.parent().as_loop() {
                b.push_op(OpPCnt {
                    target: self.get_block_label(nb),
                });
                self.push_crs(nb, SyncType::Cont);
            }
        }

        let mut goto = None;
        for ni in nb.iter_instr_list() {
            if DEBUG.annotate() && ni.type_ != nir_instr_type_phi {
                let annotation = self
                    .nir_instr_printer
                    .instr_to_string(ni)
                    .unwrap()
                    .split_whitespace()
                    .collect::<Vec<_>>()
                    .join(" ");
                b.push_op(OpAnnotate {
                    annotation: format!("generated by \"{}\"", annotation),
                });
            }

            let uniform = !nb.divergent
                && self.sm.sm() >= 75
                && !DEBUG.no_ugpr()
                && ni.def().is_some_and(|d| !d.divergent);
            let mut b = UniformBuilder::new(&mut b, uniform);

            match ni.type_ {
                nir_instr_type_alu => {
                    self.parse_alu(&mut b, ni.as_alu().unwrap())
                }
                nir_instr_type_jump => {
                    let jump = ni.as_jump().unwrap();
                    if jump.type_ == nir_jump_goto
                        || jump.type_ == nir_jump_goto_if
                    {
                        goto = Some(jump);
                    }
                }
                nir_instr_type_tex => {
                    self.parse_tex(&mut b, ni.as_tex().unwrap())
                }
                nir_instr_type_intrinsic => {
                    self.parse_intrinsic(&mut b, ni.as_intrinsic().unwrap())
                }
                nir_instr_type_load_const => {
                    self.parse_load_const(&mut b, ni.as_load_const().unwrap())
                }
                nir_instr_type_undef => {
                    self.parse_undef(&mut b, ni.as_undef().unwrap())
                }
                nir_instr_type_phi => (),
                _ => panic!("Unsupported instruction type"),
            }
        }

        if self.sm.sm() < 70 {
            if let Some(ni) = nb.following_if() {
                let fb = ni.following_block();
                b.push_op(OpSSy {
                    target: self.get_block_label(fb),
                });
                self.push_crs(fb, SyncType::Sync);
            } else if let Some(nl) = nb.following_loop() {
                let fb = nl.following_block();
                b.push_op(OpPBk {
                    target: self.get_block_label(fb),
                });
                self.push_crs(fb, SyncType::Brk);
            }
        }

        let succ = nb.successors();
        for sb in succ {
            let sb = match sb {
                Some(b) => b,
                None => continue,
            };

            let mut phi = OpPhiSrcs::new();

            for ni in sb.iter_instr_list() {
                let Some(np) = ni.as_phi() else {
                    break;
                };

                if DEBUG.annotate() {
                    let annotation = self
                        .nir_instr_printer
                        .instr_to_string(ni)
                        .unwrap()
                        .split_whitespace()
                        .collect::<Vec<_>>()
                        .join(" ");
                    b.push_op(OpAnnotate {
                        annotation: format!("generated by \"{}\"", annotation),
                    });
                }

                for ps in np.iter_srcs() {
                    if ps.pred().index == nb.index {
                        let src = *self.get_src(&ps.src).as_ssa().unwrap();
                        for (i, src) in src.iter().enumerate() {
                            let phi_id =
                                phi_map.get_phi_id(np, i.try_into().unwrap());
                            phi.srcs.push(phi_id, (*src).into());
                        }
                        break;
                    }
                }
            }

            if !phi.srcs.is_empty() {
                b.push_op(phi);
            }
        }

        if let Some(goto) = goto {
            let target = goto.target().unwrap();
            if goto.type_ == nir_jump_goto {
                self.emit_jump(&mut b, nb, target);
            } else {
                let cond = self.get_ssa(goto.condition.as_def())[0];
                let else_target = goto.else_target().unwrap();

                // Next block in the NIR CF list
                let next_block = nb.cf_node.next().unwrap().as_block().unwrap();

                if else_target as *const _ == next_block as *const _ {
                    self.emit_pred_jump(
                        &mut b,
                        nb,
                        // else_target is the fall-through block, so branch to
                        // the goto target when the condition holds
                        cond.into(),
                        target,
                        else_target,
                    );
                } else if target as *const _ == next_block as *const _ {
                    self.emit_pred_jump(
                        &mut b,
                        nb,
                        Pred::from(cond).bnot(),
                        else_target,
                        target,
                    );
                } else {
                    panic!(
                        "One of the two goto targets must be the next block in \
                            the NIR CF list"
                    );
                }
            }
        } else {
            if let Some(ni) = nb.following_if() {
                let cond = self.get_ssa(ni.condition.as_def())[0];
                self.emit_pred_jump(
                    &mut b,
                    nb,
                    // Branch to the else block when the condition is false;
                    // fall through to the then block
                    Pred::from(cond).bnot(),
                    ni.first_else_block(),
                    ni.first_then_block(),
                );
            } else {
                assert!(succ[1].is_none());
                let s0 = succ[0].unwrap();
                self.emit_jump(&mut b, nb, s0);
            }
        }

        let bb = BasicBlock {
            label: self.get_block_label(nb),
            uniform: !nb.divergent,
            instrs: b.as_vec(),
        };
        self.cfg.add_node(nb.index, bb);
    }

    fn parse_if(
        &mut self,
        ssa_alloc: &mut SSAValueAllocator,
        phi_map: &mut PhiAllocMap,
        ni: &nir_if,
    ) {
        self.parse_cf_list(ssa_alloc, phi_map, ni.iter_then_list());
        self.parse_cf_list(ssa_alloc, phi_map, ni.iter_else_list());

        if self.sm.sm() < 70 {
            let next_block = ni.cf_node.next().unwrap().as_block().unwrap();
            self.pop_crs(next_block, SyncType::Sync);
        }
    }

    fn parse_loop(
        &mut self,
        ssa_alloc: &mut SSAValueAllocator,
        phi_map: &mut PhiAllocMap,
        nl: &nir_loop,
    ) {
        self.parse_cf_list(ssa_alloc, phi_map, nl.iter_body());

        if self.sm.sm() < 70 {
            let header = nl.iter_body().next().unwrap().as_block().unwrap();
            self.pop_crs(header, SyncType::Cont);
            let next_block = nl.cf_node.next().unwrap().as_block().unwrap();
            self.pop_crs(next_block, SyncType::Brk);
        }
    }

    fn parse_cf_list(
        &mut self,
        ssa_alloc: &mut SSAValueAllocator,
        phi_map: &mut PhiAllocMap,
        list: ExecListIter<nir_cf_node>,
    ) {
        for node in list {
            match node.type_ {
                nir_cf_node_block => {
                    let nb = node.as_block().unwrap();
                    self.parse_block(ssa_alloc, phi_map, nb);
                }
                nir_cf_node_if => {
                    let ni = node.as_if().unwrap();
                    self.parse_if(ssa_alloc, phi_map, ni);
                }
                nir_cf_node_loop => {
                    let nl = node.as_loop().unwrap();
                    self.parse_loop(ssa_alloc, phi_map, nl);
                }
                _ => panic!("Invalid inner CF node type"),
            }
        }
    }

    pub fn parse_function_impl(&mut self, nfi: &nir_function_impl) -> Function {
        let mut ssa_alloc = SSAValueAllocator::new();
        let end_nb = nfi.end_block();
        self.end_block_id = end_nb.index;

        let mut phi_alloc = PhiAllocator::new();
        let mut phi_map = PhiAllocMap::new(&mut phi_alloc);

        self.parse_cf_list(&mut ssa_alloc, &mut phi_map, nfi.iter_body());

        let cfg = std::mem::take(&mut self.cfg).as_cfg();
        assert!(cfg.len() > 0);
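        // Sanity check: a block that falls through must be laid out
        // immediately before its fall-through successor.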
        for i in 0..cfg.len() {
            if cfg[i].falls_through() {
                assert!(cfg.succ_indices(i)[0] == i + 1);
            }
        }

        let mut f = Function {
            ssa_alloc: ssa_alloc,
            phi_alloc: phi_alloc,
            blocks: cfg,
        };
        f.repair_ssa();
        f
    }

    pub fn parse_shader(mut self) -> Shader<'a> {
        let mut functions = Vec::new();
        for nf in self.nir.iter_functions() {
            if let Some(nfi) = nf.get_impl() {
                let f = self.parse_function_impl(nfi);
                functions.push(f);
            }
        }

        // Tessellation evaluation shaders MUST claim to read gl_TessCoord or
        // the hardware will throw an SPH error.
        if matches!(self.info.stage, ShaderStageInfo::Tessellation(_)) {
            match &mut self.info.io {
                ShaderIoInfo::Vtg(io) => {
                    let tc = NAK_ATTR_TESS_COORD;
                    io.mark_attrs_written(tc..(tc + 8));
                }
                _ => panic!("Tessellation must have ShaderIoInfo::Vtg"),
            }
        }

        Shader {
            sm: self.sm,
            info: self.info,
            functions: functions,
        }
    }
}

pub fn nak_shader_from_nir<'a>(
    nak: &nak_compiler,
    ns: &'a nir_shader,
    sm: &'a dyn ShaderModel,
) -> Shader<'a> {
    ShaderFromNir::new(nak, ns, sm).parse_shader()
}