• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright © 2022 Collabora, Ltd.
2 // SPDX-License-Identifier: MIT
3 
4 #![allow(non_upper_case_globals)]
5 
6 use crate::api::GetDebugFlags;
7 use crate::api::DEBUG;
8 use crate::cfg::CFGBuilder;
9 use crate::ir::*;
10 use crate::nir::*;
11 use crate::nir_instr_printer::NirInstrPrinter;
12 use crate::sph::{OutputTopology, PixelImap};
13 
14 use nak_bindings::*;
15 
16 use std::cmp::max;
17 use std::collections::{HashMap, HashSet};
18 use std::ops::Index;
19 
init_info_from_nir(nir: &nir_shader, sm: u8) -> ShaderInfo20 fn init_info_from_nir(nir: &nir_shader, sm: u8) -> ShaderInfo {
21     ShaderInfo {
22         sm: sm,
23         num_gprs: 0,
24         num_barriers: 0,
25         slm_size: nir.scratch_size,
26         uses_global_mem: false,
27         writes_global_mem: false,
28         // TODO: handle this.
29         uses_fp64: false,
30         stage: match nir.info.stage() {
31             MESA_SHADER_COMPUTE => {
32                 ShaderStageInfo::Compute(ComputeShaderInfo {
33                     local_size: [
34                         nir.info.workgroup_size[0],
35                         nir.info.workgroup_size[1],
36                         nir.info.workgroup_size[2],
37                     ],
38                     smem_size: nir.info.shared_size.try_into().unwrap(),
39                 })
40             }
41             MESA_SHADER_VERTEX => ShaderStageInfo::Vertex,
42             MESA_SHADER_FRAGMENT => ShaderStageInfo::Fragment,
43             MESA_SHADER_GEOMETRY => {
44                 let info_gs = unsafe { &nir.info.__bindgen_anon_1.gs };
45                 let output_topology = match info_gs.output_primitive {
46                     MESA_PRIM_POINTS => OutputTopology::PointList,
47                     MESA_PRIM_LINE_STRIP => OutputTopology::LineStrip,
48                     MESA_PRIM_TRIANGLE_STRIP => OutputTopology::TriangleStrip,
49                     _ => panic!(
50                         "Invalid GS input primitive {}",
51                         info_gs.input_primitive
52                     ),
53                 };
54 
55                 ShaderStageInfo::Geometry(GeometryShaderInfo {
56                     // TODO: Should be set if VK_NV_geometry_shader_passthrough is in use.
57                     passthrough_enable: false,
58                     stream_out_mask: info_gs.active_stream_mask(),
59                     threads_per_input_primitive: info_gs.invocations,
60                     output_topology: output_topology,
61                     max_output_vertex_count: info_gs.vertices_out,
62                 })
63             }
64             MESA_SHADER_TESS_CTRL => {
65                 let info_tess = unsafe { &nir.info.__bindgen_anon_1.tess };
66                 ShaderStageInfo::TessellationInit(TessellationInitShaderInfo {
67                     per_patch_attribute_count: 6,
68                     threads_per_patch: info_tess.tcs_vertices_out,
69                 })
70             }
71             MESA_SHADER_TESS_EVAL => ShaderStageInfo::Tessellation,
72             _ => panic!("Unknown shader stage"),
73         },
74         io: match nir.info.stage() {
75             MESA_SHADER_COMPUTE => ShaderIoInfo::None,
76             MESA_SHADER_FRAGMENT => ShaderIoInfo::Fragment(FragmentIoInfo {
77                 sysvals_in: SysValInfo {
78                     // Required on fragment shaders, otherwise it cause a trap.
79                     ab: 1 << 31,
80                     c: 0,
81                 },
82                 sysvals_in_d: [PixelImap::Unused; 8],
83                 attr_in: [PixelImap::Unused; 128],
84                 barycentric_attr_in: [0; 4],
85                 reads_sample_mask: false,
86                 uses_kill: false,
87                 writes_color: 0,
88                 writes_sample_mask: false,
89                 writes_depth: false,
90                 // TODO: Should be set if interlocks are in use. (VK_EXT_fragment_shader_interlock)
91                 does_interlock: false,
92             }),
93             MESA_SHADER_VERTEX
94             | MESA_SHADER_GEOMETRY
95             | MESA_SHADER_TESS_CTRL
96             | MESA_SHADER_TESS_EVAL => ShaderIoInfo::Vtg(VtgIoInfo {
97                 sysvals_in: SysValInfo::default(),
98                 sysvals_in_d: 0,
99                 sysvals_out: SysValInfo::default(),
100                 sysvals_out_d: 0,
101                 attr_in: [0; 4],
102                 attr_out: [0; 4],
103 
104                 // TODO: figure out how to fill this.
105                 store_req_start: u8::MAX,
106                 store_req_end: 0,
107             }),
108             _ => panic!("Unknown shader stage"),
109         },
110     }
111 }
112 
alloc_ssa_for_nir(b: &mut impl SSABuilder, ssa: &nir_def) -> Vec<SSAValue>113 fn alloc_ssa_for_nir(b: &mut impl SSABuilder, ssa: &nir_def) -> Vec<SSAValue> {
114     let (file, comps) = if ssa.bit_size == 1 {
115         (RegFile::Pred, ssa.num_components)
116     } else {
117         let bits = ssa.bit_size * ssa.num_components;
118         (RegFile::GPR, bits.div_ceil(32))
119     };
120 
121     let mut vec = Vec::new();
122     for _ in 0..comps {
123         vec.push(b.alloc_ssa(file, 1)[0]);
124     }
125     vec
126 }
127 
/// Maps a (NIR phi def index, vector component) pair to an allocated phi
/// ID, drawing a fresh ID from `alloc` the first time a pair is seen.
struct PhiAllocMap<'a> {
    alloc: &'a mut PhiAllocator,
    map: HashMap<(u32, u8), u32>,
}
132 
133 impl<'a> PhiAllocMap<'a> {
new(alloc: &'a mut PhiAllocator) -> PhiAllocMap<'a>134     fn new(alloc: &'a mut PhiAllocator) -> PhiAllocMap<'a> {
135         PhiAllocMap {
136             alloc: alloc,
137             map: HashMap::new(),
138         }
139     }
140 
get_phi_id(&mut self, phi: &nir_phi_instr, comp: u8) -> u32141     fn get_phi_id(&mut self, phi: &nir_phi_instr, comp: u8) -> u32 {
142         *self
143             .map
144             .entry((phi.def.index, comp))
145             .or_insert_with(|| self.alloc.alloc())
146     }
147 }
148 
/// Float execution controls for a single floating-point width.
struct PerSizeFloatControls {
    // Flush denormals to zero.
    pub ftz: bool,
    // Rounding mode used for this width.
    pub rnd_mode: FRndMode,
}
153 
/// Per-width float controls for a whole shader, derived from NIR's
/// float-controls execution mode (see `from_nir` below).
struct ShaderFloatControls {
    pub fp16: PerSizeFloatControls,
    pub fp32: PerSizeFloatControls,
    pub fp64: PerSizeFloatControls,
}
159 
160 impl Default for ShaderFloatControls {
default() -> Self161     fn default() -> Self {
162         Self {
163             fp16: PerSizeFloatControls {
164                 ftz: false,
165                 rnd_mode: FRndMode::NearestEven,
166             },
167             fp32: PerSizeFloatControls {
168                 ftz: true, // Default FTZ on fp32
169                 rnd_mode: FRndMode::NearestEven,
170             },
171             fp64: PerSizeFloatControls {
172                 ftz: false,
173                 rnd_mode: FRndMode::NearestEven,
174             },
175         }
176     }
177 }
178 
179 impl ShaderFloatControls {
from_nir(nir: &nir_shader) -> ShaderFloatControls180     fn from_nir(nir: &nir_shader) -> ShaderFloatControls {
181         let nir_fc = nir.info.float_controls_execution_mode;
182         let mut fc: ShaderFloatControls = Default::default();
183 
184         if (nir_fc & FLOAT_CONTROLS_DENORM_PRESERVE_FP16) != 0 {
185             fc.fp16.ftz = false;
186         } else if (nir_fc & FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP16) != 0 {
187             fc.fp16.ftz = true;
188         }
189         if (nir_fc & FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP16) != 0 {
190             fc.fp16.rnd_mode = FRndMode::NearestEven;
191         } else if (nir_fc & FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16) != 0 {
192             fc.fp16.rnd_mode = FRndMode::Zero;
193         }
194 
195         if (nir_fc & FLOAT_CONTROLS_DENORM_PRESERVE_FP32) != 0 {
196             fc.fp32.ftz = false;
197         } else if (nir_fc & FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP32) != 0 {
198             fc.fp32.ftz = true;
199         }
200         if (nir_fc & FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP32) != 0 {
201             fc.fp32.rnd_mode = FRndMode::NearestEven;
202         } else if (nir_fc & FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP32) != 0 {
203             fc.fp32.rnd_mode = FRndMode::Zero;
204         }
205 
206         if (nir_fc & FLOAT_CONTROLS_DENORM_PRESERVE_FP64) != 0 {
207             fc.fp64.ftz = false;
208         } else if (nir_fc & FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP64) != 0 {
209             fc.fp64.ftz = true;
210         }
211         if (nir_fc & FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP64) != 0 {
212             fc.fp64.rnd_mode = FRndMode::NearestEven;
213         } else if (nir_fc & FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64) != 0 {
214             fc.fp64.rnd_mode = FRndMode::Zero;
215         }
216 
217         fc
218     }
219 }
220 
impl Index<FloatType> for ShaderFloatControls {
    type Output = PerSizeFloatControls;

    /// Looks up the controls for a float width, enabling
    /// `float_ctl[FloatType::F32]`-style access.
    fn index(&self, idx: FloatType) -> &PerSizeFloatControls {
        match idx {
            FloatType::F16 => &self.fp16,
            FloatType::F32 => &self.fp32,
            FloatType::F64 => &self.fp64,
        }
    }
}
232 
/// State for translating a single NIR shader into NAK IR.
struct ShaderFromNir<'a> {
    nir: &'a nir_shader,
    info: ShaderInfo,
    float_ctl: ShaderFloatControls,
    cfg: CFGBuilder<u32, BasicBlock>,
    label_alloc: LabelAllocator,
    // NIR block index -> NAK label (see get_block_label).
    block_label: HashMap<u32, Label>,
    bar_label: HashMap<u32, Label>,
    // NOTE(review): presumably fragment-shader output register slots,
    // given the name and FragmentIoInfo above — confirm against the FS
    // store-handling code.
    fs_out_regs: [SSAValue; 34],
    end_block_id: u32,
    // NIR def index -> the SSA values backing it (see set_ssa/get_ssa).
    ssa_map: HashMap<u32, Vec<SSAValue>>,
    // Defs whose every use is an fsat, marked by try_saturate_alu_dst so
    // saturation is folded into the producing instruction.
    saturated: HashSet<*const nir_def>,
    nir_instr_printer: NirInstrPrinter,
}
247 
248 impl<'a> ShaderFromNir<'a> {
new(nir: &'a nir_shader, sm: u8) -> Self249     fn new(nir: &'a nir_shader, sm: u8) -> Self {
250         Self {
251             nir: nir,
252             info: init_info_from_nir(nir, sm),
253             float_ctl: ShaderFloatControls::from_nir(nir),
254             cfg: CFGBuilder::new(),
255             label_alloc: LabelAllocator::new(),
256             block_label: HashMap::new(),
257             bar_label: HashMap::new(),
258             fs_out_regs: [SSAValue::NONE; 34],
259             end_block_id: 0,
260             ssa_map: HashMap::new(),
261             saturated: HashSet::new(),
262             nir_instr_printer: NirInstrPrinter::new(),
263         }
264     }
265 
    /// Returns the label for a NIR block, allocating one on first use so
    /// branches can reference blocks not yet emitted.
    fn get_block_label(&mut self, block: &nir_block) -> Label {
        *self
            .block_label
            .entry(block.index)
            .or_insert_with(|| self.label_alloc.alloc())
    }
272 
get_ssa(&mut self, ssa: &nir_def) -> &[SSAValue]273     fn get_ssa(&mut self, ssa: &nir_def) -> &[SSAValue] {
274         self.ssa_map.get(&ssa.index).unwrap()
275     }
276 
set_ssa(&mut self, def: &nir_def, vec: Vec<SSAValue>)277     fn set_ssa(&mut self, def: &nir_def, vec: Vec<SSAValue>) {
278         if def.bit_size == 1 {
279             for s in &vec {
280                 assert!(s.is_predicate());
281             }
282         } else {
283             for s in &vec {
284                 assert!(!s.is_predicate());
285             }
286             let bits =
287                 usize::from(def.bit_size) * usize::from(def.num_components);
288             assert!(vec.len() == bits.div_ceil(32));
289         }
290         self.ssa_map
291             .entry(def.index)
292             .and_modify(|_| panic!("Cannot set an SSA def twice"))
293             .or_insert(vec);
294     }
295 
    /// Returns the 32-bit SSA value(s) holding component `c` of `def`,
    /// along with the byte offset of that component within the dword.
    ///
    /// 8- and 16-bit components are packed several to a dword, so the
    /// returned byte offset may be non-zero; a 64-bit component spans a
    /// pair of consecutive dwords (returned as a 2-wide SSARef).
    fn get_ssa_comp(&mut self, def: &nir_def, c: u8) -> (SSARef, u8) {
        let vec = self.get_ssa(def);
        match def.bit_size {
            1 => (vec[usize::from(c)].into(), 0),
            8 => (vec[usize::from(c / 4)].into(), c % 4),
            16 => (vec[usize::from(c / 2)].into(), (c * 2) % 4),
            32 => (vec[usize::from(c)].into(), 0),
            64 => {
                let comps =
                    [vec[usize::from(c) * 2 + 0], vec[usize::from(c) * 2 + 1]];
                (comps.into(), 0)
            }
            _ => panic!("Unsupported bit size: {}", def.bit_size),
        }
    }
311 
get_src(&mut self, src: &nir_src) -> Src312     fn get_src(&mut self, src: &nir_src) -> Src {
313         SSARef::try_from(self.get_ssa(src.as_def())).unwrap().into()
314     }
315 
    /// Splits an I/O address into a base `Src` plus a constant offset that
    /// fits in `imm_bits` bits of instruction immediate.
    ///
    /// Delegates the pattern-matching of the NIR address expression to
    /// `nak_get_io_addr_offset`; when no SSA base remains (the address is
    /// entirely constant), the base is zero.
    fn get_io_addr_offset(
        &mut self,
        addr: &nir_src,
        imm_bits: u8,
    ) -> (Src, i32) {
        let addr = addr.as_def();
        // SAFETY: the cast only adds mutability for the C API signature;
        // NOTE(review): assumes the callee does not mutate the def — confirm
        // against nak_get_io_addr_offset.
        let addr_offset = unsafe {
            nak_get_io_addr_offset(addr as *const _ as *mut _, imm_bits)
        };

        if let Some(base_def) = std::ptr::NonNull::new(addr_offset.base.def) {
            // SAFETY: the pointer was just checked to be non-null and comes
            // from the NIR shader we are translating.
            let base_def = unsafe { base_def.as_ref() };
            let base_comp = u8::try_from(addr_offset.base.comp).unwrap();
            let (base, _) = self.get_ssa_comp(base_def, base_comp);
            (base.into(), addr_offset.offset)
        } else {
            // Fully-constant address: no SSA base, just the offset.
            (SrcRef::Zero.into(), addr_offset.offset)
        }
    }
335 
    /// Registers the values of `ssa` as the SSA backing of `def`;
    /// convenience wrapper around `set_ssa`.
    fn set_dst(&mut self, def: &nir_def, ssa: SSARef) {
        self.set_ssa(def, (*ssa).into());
    }
339 
try_saturate_alu_dst(&mut self, def: &nir_def) -> bool340     fn try_saturate_alu_dst(&mut self, def: &nir_def) -> bool {
341         if def.all_uses_are_fsat() {
342             self.saturated.insert(def as *const _);
343             true
344         } else {
345             false
346         }
347     }
348 
alu_src_is_saturated(&self, src: &nir_alu_src) -> bool349     fn alu_src_is_saturated(&self, src: &nir_alu_src) -> bool {
350         self.saturated.get(&(src.as_def() as *const _)).is_some()
351     }
352 
parse_alu(&mut self, b: &mut impl SSABuilder, alu: &nir_alu_instr)353     fn parse_alu(&mut self, b: &mut impl SSABuilder, alu: &nir_alu_instr) {
354         // Handle vectors and pack ops as a special case since they're the only
355         // ALU ops that can produce more than 16B. They are also the only ALU
356         // ops which we allow to consume small (8 and 16-bit) vector data
357         // scattered across multiple dwords
358         match alu.op {
359             nir_op_mov
360             | nir_op_pack_32_4x8_split
361             | nir_op_pack_32_2x16_split
362             | nir_op_pack_64_2x32_split
363             | nir_op_vec2
364             | nir_op_vec3
365             | nir_op_vec4
366             | nir_op_vec5
367             | nir_op_vec8
368             | nir_op_vec16 => {
369                 let src_bit_size = alu.get_src(0).src.bit_size();
370                 let bits = usize::from(alu.def.num_components)
371                     * usize::from(alu.def.bit_size);
372 
373                 // Collect the sources into a vec with src_bit_size per SSA
374                 // value in the vec.  This implicitly makes 64-bit sources look
375                 // like two 32-bit values
376                 let mut srcs = Vec::new();
377                 if alu.op == nir_op_mov {
378                     let src = alu.get_src(0);
379                     for c in 0..alu.def.num_components {
380                         let s = src.swizzle[usize::from(c)];
381                         let (src, byte) =
382                             self.get_ssa_comp(src.src.as_def(), s);
383                         for ssa in src.iter() {
384                             srcs.push((*ssa, byte));
385                         }
386                     }
387                 } else {
388                     for src in alu.srcs_as_slice().iter() {
389                         let s = src.swizzle[0];
390                         let (src, byte) =
391                             self.get_ssa_comp(src.src.as_def(), s);
392                         for ssa in src.iter() {
393                             srcs.push((*ssa, byte));
394                         }
395                     }
396                 }
397 
398                 let mut comps = Vec::new();
399                 match src_bit_size {
400                     1 | 32 | 64 => {
401                         for (ssa, _) in srcs {
402                             comps.push(ssa);
403                         }
404                     }
405                     8 => {
406                         for dc in 0..bits.div_ceil(32) {
407                             let mut psrc = [Src::new_zero(); 4];
408                             let mut psel = [0_u8; 4];
409 
410                             for b in 0..4 {
411                                 let sc = dc * 4 + b;
412                                 if sc < srcs.len() {
413                                     let (ssa, byte) = srcs[sc];
414                                     for i in 0..4_u8 {
415                                         let psrc_i = &mut psrc[usize::from(i)];
416                                         if *psrc_i == Src::new_zero() {
417                                             *psrc_i = ssa.into();
418                                         } else if *psrc_i != Src::from(ssa) {
419                                             continue;
420                                         }
421                                         psel[b] = i * 4 + byte;
422                                     }
423                                 }
424                             }
425                             comps.push(b.prmt4(psrc, psel)[0]);
426                         }
427                     }
428                     16 => {
429                         for dc in 0..bits.div_ceil(32) {
430                             let mut psrc = [Src::new_zero(); 2];
431                             let mut psel = [0_u8; 4];
432 
433                             for w in 0..2 {
434                                 let sc = dc * 2 + w;
435                                 if sc < srcs.len() {
436                                     let (ssa, byte) = srcs[sc];
437                                     let w_u8 = u8::try_from(w).unwrap();
438                                     psrc[w] = ssa.into();
439                                     psel[w * 2 + 0] = (w_u8 * 4) + byte;
440                                     psel[w * 2 + 1] = (w_u8 * 4) + byte + 1;
441                                 }
442                             }
443                             comps.push(b.prmt(psrc[0], psrc[1], psel)[0]);
444                         }
445                     }
446                     _ => panic!("Unknown bit size: {src_bit_size}"),
447                 }
448 
449                 self.set_ssa(&alu.def, comps);
450                 return;
451             }
452             _ => (),
453         }
454 
455         let mut srcs: Vec<Src> = Vec::new();
456         for (i, alu_src) in alu.srcs_as_slice().iter().enumerate() {
457             let bit_size = alu_src.src.bit_size();
458             let comps = alu.src_components(i.try_into().unwrap());
459             let ssa = self.get_ssa(alu_src.src.as_def());
460 
461             match bit_size {
462                 1 => {
463                     assert!(comps == 1);
464                     let s = usize::from(alu_src.swizzle[0]);
465                     srcs.push(ssa[s].into());
466                 }
467                 8 => {
468                     assert!(comps <= 4);
469                     let s = alu_src.swizzle[0];
470                     let dw = ssa[usize::from(s / 4)];
471 
472                     let mut prmt = [4_u8; 4];
473                     for c in 0..comps {
474                         let cs = alu_src.swizzle[usize::from(c)];
475                         assert!(s / 4 == cs / 4);
476                         prmt[usize::from(c)] = cs;
477                     }
478                     srcs.push(b.prmt(dw.into(), 0.into(), prmt).into());
479                 }
480                 16 => {
481                     assert!(comps <= 2);
482                     let s = alu_src.swizzle[0];
483                     let dw = ssa[usize::from(s / 2)];
484 
485                     let mut prmt = [0_u8; 4];
486                     for c in 0..comps {
487                         let cs = alu_src.swizzle[usize::from(c)];
488                         assert!(s / 2 == cs / 2);
489                         prmt[usize::from(c) * 2 + 0] = cs * 2 + 0;
490                         prmt[usize::from(c) * 2 + 1] = cs * 2 + 1;
491                     }
492                     // TODO: Some ops can handle swizzles
493                     srcs.push(b.prmt(dw.into(), 0.into(), prmt).into());
494                 }
495                 32 => {
496                     assert!(comps == 1);
497                     let s = usize::from(alu_src.swizzle[0]);
498                     srcs.push(ssa[s].into());
499                 }
500                 64 => {
501                     assert!(comps == 1);
502                     let s = usize::from(alu_src.swizzle[0]);
503                     srcs.push([ssa[s * 2], ssa[s * 2 + 1]].into());
504                 }
505                 _ => panic!("Invalid bit size: {bit_size}"),
506             }
507         }
508 
509         let dst: SSARef = match alu.op {
510             nir_op_b2b1 => {
511                 assert!(alu.get_src(0).bit_size() == 32);
512                 b.isetp(IntCmpType::I32, IntCmpOp::Ne, srcs[0], 0.into())
513             }
514             nir_op_b2b32 | nir_op_b2i8 | nir_op_b2i16 | nir_op_b2i32 => {
515                 b.sel(srcs[0].bnot(), 0.into(), 1.into())
516             }
517             nir_op_b2i64 => {
518                 let lo = b.sel(srcs[0].bnot(), 0.into(), 1.into());
519                 let hi = b.copy(0.into());
520                 [lo[0], hi[0]].into()
521             }
522             nir_op_b2f32 => {
523                 b.sel(srcs[0].bnot(), 0.0_f32.into(), 1.0_f32.into())
524             }
525             nir_op_b2f64 => {
526                 let lo = b.copy(0.into());
527                 let hi = b.sel(srcs[0].bnot(), 0.into(), 0x3ff00000.into());
528                 [lo[0], hi[0]].into()
529             }
530             nir_op_bcsel => b.sel(srcs[0], srcs[1], srcs[2]),
531             nir_op_bfm => {
532                 let dst = b.alloc_ssa(RegFile::GPR, 1);
533                 b.push_op(OpBMsk {
534                     dst: dst.into(),
535                     pos: srcs[1],
536                     width: srcs[0],
537                     wrap: true,
538                 });
539                 dst
540             }
541             nir_op_bit_count => {
542                 let dst = b.alloc_ssa(RegFile::GPR, 1);
543                 b.push_op(OpPopC {
544                     dst: dst.into(),
545                     src: srcs[0],
546                 });
547                 dst
548             }
549             nir_op_bitfield_reverse => {
550                 let dst = b.alloc_ssa(RegFile::GPR, 1);
551                 if self.info.sm >= 70 {
552                     b.push_op(OpBRev {
553                         dst: dst.into(),
554                         src: srcs[0],
555                     });
556                 } else {
557                     // No BREV in Maxwell
558                     b.push_op(OpBfe {
559                         dst: dst.into(),
560                         base: srcs[0],
561                         signed: false,
562                         range: Src::new_imm_u32(0x2000),
563                         reverse: true,
564                     });
565                 }
566                 dst
567             }
568             nir_op_ibitfield_extract | nir_op_ubitfield_extract => {
569                 let range = b.alloc_ssa(RegFile::GPR, 1);
570                 b.push_op(OpPrmt {
571                     dst: range.into(),
572                     srcs: [srcs[1], srcs[2]],
573                     sel: 0x0040.into(),
574                     mode: PrmtMode::Index,
575                 });
576 
577                 let dst = b.alloc_ssa(RegFile::GPR, 1);
578                 b.push_op(OpBfe {
579                     dst: dst.into(),
580                     base: srcs[0],
581                     signed: !matches!(alu.op, nir_op_ubitfield_extract),
582                     range: range.into(),
583                     reverse: false,
584                 });
585                 dst
586             }
587             nir_op_extract_u8 | nir_op_extract_i8 | nir_op_extract_u16
588             | nir_op_extract_i16 => {
589                 let src1 = alu.get_src(1);
590                 let elem = src1.src.comp_as_uint(src1.swizzle[0]).unwrap();
591                 let elem = u8::try_from(elem).unwrap();
592 
593                 match alu.op {
594                     nir_op_extract_u8 => {
595                         assert!(elem < 4);
596                         let byte = elem;
597                         let zero = 4;
598                         b.prmt(srcs[0], 0.into(), [byte, zero, zero, zero])
599                     }
600                     nir_op_extract_i8 => {
601                         assert!(elem < 4);
602                         let byte = elem;
603                         let sign = byte | 0x8;
604                         b.prmt(srcs[0], 0.into(), [byte, sign, sign, sign])
605                     }
606                     nir_op_extract_u16 => {
607                         assert!(elem < 2);
608                         let byte = elem * 2;
609                         let zero = 4;
610                         b.prmt(srcs[0], 0.into(), [byte, byte + 1, zero, zero])
611                     }
612                     nir_op_extract_i16 => {
613                         assert!(elem < 2);
614                         let byte = elem * 2;
615                         let sign = (byte + 1) | 0x8;
616                         b.prmt(srcs[0], 0.into(), [byte, byte + 1, sign, sign])
617                     }
618                     _ => panic!("Unknown extract op: {}", alu.op),
619                 }
620             }
621             nir_op_f2f16 | nir_op_f2f16_rtne | nir_op_f2f16_rtz
622             | nir_op_f2f32 | nir_op_f2f64 => {
623                 let src_bits = alu.get_src(0).src.bit_size();
624                 let dst_bits = alu.def.bit_size();
625                 let src_type = FloatType::from_bits(src_bits.into());
626                 let dst_type = FloatType::from_bits(dst_bits.into());
627 
628                 let dst = b.alloc_ssa(RegFile::GPR, dst_bits.div_ceil(32));
629                 b.push_op(OpF2F {
630                     dst: dst.into(),
631                     src: srcs[0],
632                     src_type: FloatType::from_bits(src_bits.into()),
633                     dst_type: dst_type,
634                     rnd_mode: match alu.op {
635                         nir_op_f2f16_rtne => FRndMode::NearestEven,
636                         nir_op_f2f16_rtz => FRndMode::Zero,
637                         _ => self.float_ctl[dst_type].rnd_mode,
638                     },
639                     ftz: if src_bits < dst_bits {
640                         self.float_ctl[src_type].ftz
641                     } else {
642                         self.float_ctl[dst_type].ftz
643                     },
644                     high: false,
645                     integer_rnd: false,
646                 });
647                 dst
648             }
649             nir_op_find_lsb => {
650                 let tmp = b.alloc_ssa(RegFile::GPR, 1);
651                 b.push_op(OpBRev {
652                     dst: tmp.into(),
653                     src: srcs[0],
654                 });
655                 let dst = b.alloc_ssa(RegFile::GPR, 1);
656                 b.push_op(OpFlo {
657                     dst: dst.into(),
658                     src: tmp.into(),
659                     signed: alu.op == nir_op_ifind_msb,
660                     return_shift_amount: true,
661                 });
662                 dst
663             }
664             nir_op_f2i8 | nir_op_f2i16 | nir_op_f2i32 | nir_op_f2i64
665             | nir_op_f2u8 | nir_op_f2u16 | nir_op_f2u32 | nir_op_f2u64 => {
666                 let src_bits = usize::from(alu.get_src(0).bit_size());
667                 let dst_bits = alu.def.bit_size();
668                 let src_type = FloatType::from_bits(src_bits);
669                 let dst = b.alloc_ssa(RegFile::GPR, dst_bits.div_ceil(32));
670                 let dst_is_signed = alu.info().output_type & 2 != 0;
671                 b.push_op(OpF2I {
672                     dst: dst.into(),
673                     src: srcs[0],
674                     src_type: src_type,
675                     dst_type: IntType::from_bits(
676                         dst_bits.into(),
677                         dst_is_signed,
678                     ),
679                     rnd_mode: FRndMode::Zero,
680                     ftz: self.float_ctl[src_type].ftz,
681                 });
682                 dst
683             }
684             nir_op_fabs | nir_op_fadd | nir_op_fneg => {
685                 let (x, y) = match alu.op {
686                     nir_op_fabs => (srcs[0].fabs(), 0.0_f32.into()),
687                     nir_op_fadd => (srcs[0], srcs[1]),
688                     nir_op_fneg => (Src::new_zero().fneg(), srcs[0].fneg()),
689                     _ => panic!("Unhandled case"),
690                 };
691                 let ftype = FloatType::from_bits(alu.def.bit_size().into());
692                 let dst;
693                 if alu.def.bit_size() == 64 {
694                     dst = b.alloc_ssa(RegFile::GPR, 2);
695                     b.push_op(OpDAdd {
696                         dst: dst.into(),
697                         srcs: [x, y],
698                         rnd_mode: self.float_ctl[ftype].rnd_mode,
699                     });
700                 } else if alu.def.bit_size() == 32 {
701                     dst = b.alloc_ssa(RegFile::GPR, 1);
702                     b.push_op(OpFAdd {
703                         dst: dst.into(),
704                         srcs: [x, y],
705                         saturate: self.try_saturate_alu_dst(&alu.def),
706                         rnd_mode: self.float_ctl[ftype].rnd_mode,
707                         ftz: self.float_ctl[ftype].ftz,
708                     });
709                 } else {
710                     panic!("Unsupported float type: f{}", alu.def.bit_size());
711                 }
712                 dst
713             }
714             nir_op_fceil | nir_op_ffloor | nir_op_fround_even
715             | nir_op_ftrunc => {
716                 assert!(alu.def.bit_size() == 32);
717                 let dst = b.alloc_ssa(RegFile::GPR, 1);
718                 let ty = FloatType::from_bits(alu.def.bit_size().into());
719                 let rnd_mode = match alu.op {
720                     nir_op_fceil => FRndMode::PosInf,
721                     nir_op_ffloor => FRndMode::NegInf,
722                     nir_op_ftrunc => FRndMode::Zero,
723                     nir_op_fround_even => FRndMode::NearestEven,
724                     _ => unreachable!(),
725                 };
726                 let ftz = self.float_ctl[ty].ftz;
727                 if b.sm() >= 70 {
728                     b.push_op(OpFRnd {
729                         dst: dst.into(),
730                         src: srcs[0],
731                         src_type: ty,
732                         dst_type: ty,
733                         rnd_mode,
734                         ftz,
735                     });
736                 } else {
737                     b.push_op(OpF2F {
738                         dst: dst.into(),
739                         src: srcs[0],
740                         src_type: ty,
741                         dst_type: ty,
742                         rnd_mode,
743                         ftz,
744                         integer_rnd: true,
745                         high: false,
746                     });
747                 }
748                 dst
749             }
750             nir_op_fcos => b.fcos(srcs[0]),
751             nir_op_feq | nir_op_fge | nir_op_flt | nir_op_fneu => {
752                 let src_type =
753                     FloatType::from_bits(alu.get_src(0).bit_size().into());
754                 let cmp_op = match alu.op {
755                     nir_op_feq => FloatCmpOp::OrdEq,
756                     nir_op_fge => FloatCmpOp::OrdGe,
757                     nir_op_flt => FloatCmpOp::OrdLt,
758                     nir_op_fneu => FloatCmpOp::UnordNe,
759                     _ => panic!("Usupported float comparison"),
760                 };
761 
762                 let dst = b.alloc_ssa(RegFile::Pred, 1);
763                 if alu.get_src(0).bit_size() == 64 {
764                     b.push_op(OpDSetP {
765                         dst: dst.into(),
766                         set_op: PredSetOp::And,
767                         cmp_op: cmp_op,
768                         srcs: [srcs[0], srcs[1]],
769                         accum: SrcRef::True.into(),
770                     });
771                 } else if alu.get_src(0).bit_size() == 32 {
772                     b.push_op(OpFSetP {
773                         dst: dst.into(),
774                         set_op: PredSetOp::And,
775                         cmp_op: cmp_op,
776                         srcs: [srcs[0], srcs[1]],
777                         accum: SrcRef::True.into(),
778                         ftz: self.float_ctl[src_type].ftz,
779                     });
780                 } else {
781                     panic!(
782                         "Unsupported float type: f{}",
783                         alu.get_src(0).bit_size()
784                     );
785                 }
786                 dst
787             }
788             nir_op_fexp2 => b.fexp2(srcs[0]),
789             nir_op_ffma => {
790                 let ftype = FloatType::from_bits(alu.def.bit_size().into());
791                 let dst;
792                 if alu.def.bit_size() == 64 {
793                     debug_assert!(!self.float_ctl[ftype].ftz);
794                     dst = b.alloc_ssa(RegFile::GPR, 2);
795                     b.push_op(OpDFma {
796                         dst: dst.into(),
797                         srcs: [srcs[0], srcs[1], srcs[2]],
798                         rnd_mode: self.float_ctl[ftype].rnd_mode,
799                     });
800                 } else if alu.def.bit_size() == 32 {
801                     dst = b.alloc_ssa(RegFile::GPR, 1);
802                     b.push_op(OpFFma {
803                         dst: dst.into(),
804                         srcs: [srcs[0], srcs[1], srcs[2]],
805                         saturate: self.try_saturate_alu_dst(&alu.def),
806                         rnd_mode: self.float_ctl[ftype].rnd_mode,
807                         // The hardware doesn't like FTZ+DNZ and DNZ implies FTZ
808                         // anyway so only set one of the two bits.
809                         ftz: self.float_ctl[ftype].ftz,
810                         dnz: false,
811                     });
812                 } else {
813                     panic!("Unsupported float type: f{}", alu.def.bit_size());
814                 }
815                 dst
816             }
817             nir_op_ffmaz => {
818                 assert!(alu.def.bit_size() == 32);
819                 // DNZ implies FTZ so we need FTZ set or this is invalid
820                 assert!(self.float_ctl.fp32.ftz);
821                 let dst = b.alloc_ssa(RegFile::GPR, 1);
822                 b.push_op(OpFFma {
823                     dst: dst.into(),
824                     srcs: [srcs[0], srcs[1], srcs[2]],
825                     saturate: self.try_saturate_alu_dst(&alu.def),
826                     rnd_mode: self.float_ctl.fp32.rnd_mode,
827                     // The hardware doesn't like FTZ+DNZ and DNZ implies FTZ
828                     // anyway so only set one of the two bits.
829                     ftz: false,
830                     dnz: true,
831                 });
832                 dst
833             }
834             nir_op_flog2 => {
835                 assert!(alu.def.bit_size() == 32);
836                 b.mufu(MuFuOp::Log2, srcs[0])
837             }
838             nir_op_fmax | nir_op_fmin => {
839                 let dst;
840                 if alu.def.bit_size() == 64 {
841                     dst = b.alloc_ssa(RegFile::GPR, 2);
842                     b.push_op(OpDMnMx {
843                         dst: dst.into(),
844                         srcs: [srcs[0], srcs[1]],
845                         min: (alu.op == nir_op_fmin).into(),
846                     });
847                 } else if alu.def.bit_size() == 32 {
848                     dst = b.alloc_ssa(RegFile::GPR, 1);
849                     b.push_op(OpFMnMx {
850                         dst: dst.into(),
851                         srcs: [srcs[0], srcs[1]],
852                         min: (alu.op == nir_op_fmin).into(),
853                         ftz: self.float_ctl.fp32.ftz,
854                     });
855                 } else {
856                     panic!("Unsupported float type: f{}", alu.def.bit_size());
857                 }
858                 dst
859             }
860             nir_op_fmul => {
861                 let ftype = FloatType::from_bits(alu.def.bit_size().into());
862                 let dst;
863                 if alu.def.bit_size() == 64 {
864                     debug_assert!(!self.float_ctl[ftype].ftz);
865                     dst = b.alloc_ssa(RegFile::GPR, 2);
866                     b.push_op(OpDMul {
867                         dst: dst.into(),
868                         srcs: [srcs[0], srcs[1]],
869                         rnd_mode: self.float_ctl[ftype].rnd_mode,
870                     });
871                 } else if alu.def.bit_size() == 32 {
872                     dst = b.alloc_ssa(RegFile::GPR, 1);
873                     b.push_op(OpFMul {
874                         dst: dst.into(),
875                         srcs: [srcs[0], srcs[1]],
876                         saturate: self.try_saturate_alu_dst(&alu.def),
877                         rnd_mode: self.float_ctl[ftype].rnd_mode,
878                         ftz: self.float_ctl[ftype].ftz,
879                         dnz: false,
880                     });
881                 } else {
882                     panic!("Unsupported float type: f{}", alu.def.bit_size());
883                 }
884                 dst
885             }
886             nir_op_fmulz => {
887                 assert!(alu.def.bit_size() == 32);
888                 // DNZ implies FTZ so we need FTZ set or this is invalid
889                 assert!(self.float_ctl.fp32.ftz);
890                 let dst = b.alloc_ssa(RegFile::GPR, 1);
891                 b.push_op(OpFMul {
892                     dst: dst.into(),
893                     srcs: [srcs[0], srcs[1]],
894                     saturate: self.try_saturate_alu_dst(&alu.def),
895                     rnd_mode: self.float_ctl.fp32.rnd_mode,
896                     // The hardware doesn't like FTZ+DNZ and DNZ implies FTZ
897                     // anyway so only set one of the two bits.
898                     ftz: false,
899                     dnz: true,
900                 });
901                 dst
902             }
903             nir_op_fquantize2f16 => {
904                 let tmp = b.alloc_ssa(RegFile::GPR, 1);
905                 b.push_op(OpF2F {
906                     dst: tmp.into(),
907                     src: srcs[0],
908                     src_type: FloatType::F32,
909                     dst_type: FloatType::F16,
910                     rnd_mode: FRndMode::NearestEven,
911                     ftz: true,
912                     high: false,
913                     integer_rnd: false,
914                 });
915                 assert!(alu.def.bit_size() == 32);
916                 let dst = b.alloc_ssa(RegFile::GPR, 1);
917                 b.push_op(OpF2F {
918                     dst: dst.into(),
919                     src: tmp.into(),
920                     src_type: FloatType::F16,
921                     dst_type: FloatType::F32,
922                     rnd_mode: FRndMode::NearestEven,
923                     ftz: true,
924                     high: false,
925                     integer_rnd: false,
926                 });
927                 dst
928             }
929             nir_op_frcp => {
930                 assert!(alu.def.bit_size() == 32);
931                 b.mufu(MuFuOp::Rcp, srcs[0])
932             }
933             nir_op_frsq => {
934                 assert!(alu.def.bit_size() == 32);
935                 b.mufu(MuFuOp::Rsq, srcs[0])
936             }
937             nir_op_fsat => {
938                 assert!(alu.def.bit_size() == 32);
939                 if self.alu_src_is_saturated(&alu.srcs_as_slice()[0]) {
940                     b.copy(srcs[0])
941                 } else {
942                     let ftype = FloatType::from_bits(alu.def.bit_size().into());
943                     let dst = b.alloc_ssa(RegFile::GPR, 1);
944                     b.push_op(OpFAdd {
945                         dst: dst.into(),
946                         srcs: [srcs[0], 0.into()],
947                         saturate: true,
948                         rnd_mode: self.float_ctl[ftype].rnd_mode,
949                         ftz: self.float_ctl[ftype].ftz,
950                     });
951                     dst
952                 }
953             }
954             nir_op_fsign => {
955                 if alu.def.bit_size() == 64 {
956                     let lz = b.dsetp(FloatCmpOp::OrdLt, srcs[0], 0.into());
957                     let gz = b.dsetp(FloatCmpOp::OrdGt, srcs[0], 0.into());
958                     let hi = b.sel(lz.into(), 0xbff00000.into(), 0.into());
959                     let hi = b.sel(gz.into(), 0x3ff00000.into(), hi.into());
960                     let lo = b.copy(0.into());
961                     [lo[0], hi[0]].into()
962                 } else if alu.def.bit_size() == 32 {
963                     let lz = b.fset(FloatCmpOp::OrdLt, srcs[0], 0.into());
964                     let gz = b.fset(FloatCmpOp::OrdGt, srcs[0], 0.into());
965                     b.fadd(gz.into(), Src::from(lz).fneg())
966                 } else {
967                     panic!("Unsupported float type: f{}", alu.def.bit_size());
968                 }
969             }
970             nir_op_fsin => b.fsin(srcs[0]),
971             nir_op_fsqrt => b.mufu(MuFuOp::Sqrt, srcs[0]),
972             nir_op_i2f16 | nir_op_i2f32 | nir_op_i2f64 => {
973                 let src_bits = alu.get_src(0).src.bit_size();
974                 let dst_bits = alu.def.bit_size();
975                 let dst_type = FloatType::from_bits(dst_bits.into());
976                 let dst = b.alloc_ssa(RegFile::GPR, dst_bits.div_ceil(32));
977                 b.push_op(OpI2F {
978                     dst: dst.into(),
979                     src: srcs[0],
980                     dst_type: dst_type,
981                     src_type: IntType::from_bits(src_bits.into(), true),
982                     rnd_mode: self.float_ctl[dst_type].rnd_mode,
983                 });
984                 dst
985             }
986             nir_op_i2i8 | nir_op_i2i16 | nir_op_i2i32 | nir_op_i2i64
987             | nir_op_u2u8 | nir_op_u2u16 | nir_op_u2u32 | nir_op_u2u64 => {
988                 let src_bits = alu.get_src(0).src.bit_size();
989                 let dst_bits = alu.def.bit_size();
990 
991                 let mut prmt = [0_u8; 8];
992                 match alu.op {
993                     nir_op_i2i8 | nir_op_i2i16 | nir_op_i2i32
994                     | nir_op_i2i64 => {
995                         let sign = ((src_bits / 8) - 1) | 0x8;
996                         for i in 0..8 {
997                             if i < (src_bits / 8) {
998                                 prmt[usize::from(i)] = i;
999                             } else {
1000                                 prmt[usize::from(i)] = sign;
1001                             }
1002                         }
1003                     }
1004                     nir_op_u2u8 | nir_op_u2u16 | nir_op_u2u32
1005                     | nir_op_u2u64 => {
1006                         for i in 0..8 {
1007                             if i < (src_bits / 8) {
1008                                 prmt[usize::from(i)] = i;
1009                             } else {
1010                                 prmt[usize::from(i)] = 4;
1011                             }
1012                         }
1013                     }
1014                     _ => panic!("Invalid integer conversion: {}", alu.op),
1015                 }
1016                 let prmt_lo: [u8; 4] = prmt[0..4].try_into().unwrap();
1017                 let prmt_hi: [u8; 4] = prmt[4..8].try_into().unwrap();
1018 
1019                 let src = srcs[0].as_ssa().unwrap();
1020                 if src_bits == 64 {
1021                     if dst_bits == 64 {
1022                         *src
1023                     } else {
1024                         b.prmt(src[0].into(), src[1].into(), prmt_lo)
1025                     }
1026                 } else {
1027                     if dst_bits == 64 {
1028                         let lo = b.prmt(src[0].into(), 0.into(), prmt_lo);
1029                         let hi = b.prmt(src[0].into(), 0.into(), prmt_hi);
1030                         [lo[0], hi[0]].into()
1031                     } else {
1032                         b.prmt(src[0].into(), 0.into(), prmt_lo)
1033                     }
1034                 }
1035             }
1036             nir_op_iabs => b.iabs(srcs[0]),
1037             nir_op_iadd => match alu.def.bit_size {
1038                 32 => b.iadd(srcs[0], srcs[1], 0.into()),
1039                 64 => b.iadd64(srcs[0], srcs[1], 0.into()),
1040                 x => panic!("unsupported bit size for nir_op_iadd: {x}"),
1041             },
1042             nir_op_iadd3 => match alu.def.bit_size {
1043                 32 => b.iadd(srcs[0], srcs[1], srcs[2]),
1044                 64 => b.iadd64(srcs[0], srcs[1], srcs[2]),
1045                 x => panic!("unsupported bit size for nir_op_iadd3: {x}"),
1046             },
1047             nir_op_iand => b.lop2(LogicOp2::And, srcs[0], srcs[1]),
1048             nir_op_ieq => {
1049                 if alu.get_src(0).bit_size() == 1 {
1050                     b.lop2(LogicOp2::Xor, srcs[0], srcs[1].bnot())
1051                 } else if alu.get_src(0).bit_size() == 64 {
1052                     b.isetp64(IntCmpType::I32, IntCmpOp::Eq, srcs[0], srcs[1])
1053                 } else {
1054                     assert!(alu.get_src(0).bit_size() == 32);
1055                     b.isetp(IntCmpType::I32, IntCmpOp::Eq, srcs[0], srcs[1])
1056                 }
1057             }
1058             nir_op_ifind_msb | nir_op_ifind_msb_rev | nir_op_ufind_msb
1059             | nir_op_ufind_msb_rev => {
1060                 let dst = b.alloc_ssa(RegFile::GPR, 1);
1061                 b.push_op(OpFlo {
1062                     dst: dst.into(),
1063                     src: srcs[0],
1064                     signed: match alu.op {
1065                         nir_op_ifind_msb | nir_op_ifind_msb_rev => true,
1066                         nir_op_ufind_msb | nir_op_ufind_msb_rev => false,
1067                         _ => panic!("Not a find_msb op"),
1068                     },
1069                     return_shift_amount: match alu.op {
1070                         nir_op_ifind_msb | nir_op_ufind_msb => false,
1071                         nir_op_ifind_msb_rev | nir_op_ufind_msb_rev => true,
1072                         _ => panic!("Not a find_msb op"),
1073                     },
1074                 });
1075                 dst
1076             }
1077             nir_op_ige | nir_op_ilt | nir_op_uge | nir_op_ult => {
1078                 let x = *srcs[0].as_ssa().unwrap();
1079                 let y = *srcs[1].as_ssa().unwrap();
1080                 let (cmp_type, cmp_op) = match alu.op {
1081                     nir_op_ige => (IntCmpType::I32, IntCmpOp::Ge),
1082                     nir_op_ilt => (IntCmpType::I32, IntCmpOp::Lt),
1083                     nir_op_uge => (IntCmpType::U32, IntCmpOp::Ge),
1084                     nir_op_ult => (IntCmpType::U32, IntCmpOp::Lt),
1085                     _ => panic!("Not an integer comparison"),
1086                 };
1087                 if alu.get_src(0).bit_size() == 64 {
1088                     b.isetp64(cmp_type, cmp_op, x.into(), y.into())
1089                 } else {
1090                     assert!(alu.get_src(0).bit_size() == 32);
1091                     b.isetp(cmp_type, cmp_op, x.into(), y.into())
1092                 }
1093             }
1094             nir_op_imad => {
1095                 assert!(alu.def.bit_size() == 32);
1096                 let dst = b.alloc_ssa(RegFile::GPR, 1);
1097                 b.push_op(OpIMad {
1098                     dst: dst.into(),
1099                     srcs: [srcs[0], srcs[1], srcs[2]],
1100                     signed: false,
1101                 });
1102                 dst
1103             }
1104             nir_op_imax | nir_op_imin | nir_op_umax | nir_op_umin => {
1105                 let (tp, min) = match alu.op {
1106                     nir_op_imax => (IntCmpType::I32, SrcRef::False),
1107                     nir_op_imin => (IntCmpType::I32, SrcRef::True),
1108                     nir_op_umax => (IntCmpType::U32, SrcRef::False),
1109                     nir_op_umin => (IntCmpType::U32, SrcRef::True),
1110                     _ => panic!("Not an integer min/max"),
1111                 };
1112                 assert!(alu.def.bit_size() == 32);
1113                 b.imnmx(tp, srcs[0], srcs[1], min.into())
1114             }
1115             nir_op_imul => {
1116                 assert!(alu.def.bit_size() == 32);
1117                 b.imul(srcs[0], srcs[1])
1118             }
1119             nir_op_imul_2x32_64 | nir_op_umul_2x32_64 => {
1120                 let signed = alu.op == nir_op_imul_2x32_64;
1121                 b.imul_2x32_64(srcs[0], srcs[1], signed)
1122             }
1123             nir_op_imul_high | nir_op_umul_high => {
1124                 let signed = alu.op == nir_op_imul_high;
1125                 let dst64 = b.imul_2x32_64(srcs[0], srcs[1], signed);
1126                 dst64[1].into()
1127             }
1128             nir_op_ine => {
1129                 if alu.get_src(0).bit_size() == 1 {
1130                     b.lop2(LogicOp2::Xor, srcs[0], srcs[1])
1131                 } else if alu.get_src(0).bit_size() == 64 {
1132                     b.isetp64(IntCmpType::I32, IntCmpOp::Ne, srcs[0], srcs[1])
1133                 } else {
1134                     assert!(alu.get_src(0).bit_size() == 32);
1135                     b.isetp(IntCmpType::I32, IntCmpOp::Ne, srcs[0], srcs[1])
1136                 }
1137             }
1138             nir_op_ineg => {
1139                 if alu.def.bit_size == 64 {
1140                     let x = srcs[0].as_ssa().unwrap();
1141                     let sum = b.alloc_ssa(RegFile::GPR, 2);
1142                     let carry = b.alloc_ssa(RegFile::Pred, 1);
1143                     b.push_op(OpIAdd3 {
1144                         dst: sum[0].into(),
1145                         overflow: [carry.into(), Dst::None],
1146                         srcs: [0.into(), Src::from(x[0]).ineg(), 0.into()],
1147                     });
1148                     b.push_op(OpIAdd3X {
1149                         dst: sum[1].into(),
1150                         overflow: [Dst::None, Dst::None],
1151                         srcs: [0.into(), Src::from(x[1]).bnot(), 0.into()],
1152                         carry: [carry.into(), SrcRef::False.into()],
1153                     });
1154                     sum
1155                 } else {
1156                     assert!(alu.def.bit_size() == 32);
1157                     b.ineg(srcs[0])
1158                 }
1159             }
1160             nir_op_inot => {
1161                 if alu.def.bit_size() == 1 {
1162                     b.lop2(LogicOp2::PassB, true.into(), srcs[0].bnot())
1163                 } else {
1164                     assert!(alu.def.bit_size() == 32);
1165                     b.lop2(LogicOp2::PassB, 0.into(), srcs[0].bnot())
1166                 }
1167             }
1168             nir_op_ior => b.lop2(LogicOp2::Or, srcs[0], srcs[1]),
1169             nir_op_ishl => {
1170                 let x = *srcs[0].as_ssa().unwrap();
1171                 let shift = srcs[1];
1172                 if alu.def.bit_size() == 64 {
1173                     // For 64-bit shifts, we have to use clamp mode so we need
1174                     // to mask the shift in order satisfy NIR semantics.
1175                     let shift = b.lop2(LogicOp2::And, shift, 0x3f.into());
1176                     let dst = b.alloc_ssa(RegFile::GPR, 2);
1177                     b.push_op(OpShf {
1178                         dst: dst[0].into(),
1179                         low: 0.into(),
1180                         high: x[0].into(),
1181                         shift: shift.into(),
1182                         right: false,
1183                         wrap: false,
1184                         data_type: IntType::U32,
1185                         dst_high: true,
1186                     });
1187                     b.push_op(OpShf {
1188                         dst: dst[1].into(),
1189                         low: x[0].into(),
1190                         high: x[1].into(),
1191                         shift: shift.into(),
1192                         right: false,
1193                         wrap: false,
1194                         data_type: IntType::U64,
1195                         dst_high: true,
1196                     });
1197                     dst
1198                 } else {
1199                     assert!(alu.def.bit_size() == 32);
1200                     b.shl(srcs[0], srcs[1])
1201                 }
1202             }
1203             nir_op_ishr => {
1204                 let x = *srcs[0].as_ssa().unwrap();
1205                 let shift = srcs[1];
1206                 if alu.def.bit_size() == 64 {
1207                     // For 64-bit shifts, we have to use clamp mode so we need
1208                     // to mask the shift in order satisfy NIR semantics.
1209                     let shift = b.lop2(LogicOp2::And, shift, 0x3f.into());
1210                     let dst = b.alloc_ssa(RegFile::GPR, 2);
1211                     b.push_op(OpShf {
1212                         dst: dst[0].into(),
1213                         low: x[0].into(),
1214                         high: x[1].into(),
1215                         shift: shift.into(),
1216                         right: true,
1217                         wrap: false,
1218                         data_type: IntType::I64,
1219                         dst_high: false,
1220                     });
1221                     b.push_op(OpShf {
1222                         dst: dst[1].into(),
1223                         low: x[0].into(),
1224                         high: x[1].into(),
1225                         shift: shift.into(),
1226                         right: true,
1227                         wrap: false,
1228                         data_type: IntType::I32,
1229                         dst_high: true,
1230                     });
1231                     dst
1232                 } else {
1233                     assert!(alu.def.bit_size() == 32);
1234                     b.shr(srcs[0], srcs[1], true)
1235                 }
1236             }
1237             nir_op_ixor => b.lop2(LogicOp2::Xor, srcs[0], srcs[1]),
1238             nir_op_pack_half_2x16_split | nir_op_pack_half_2x16_rtz_split => {
1239                 assert!(alu.get_src(0).bit_size() == 32);
1240                 let low = b.alloc_ssa(RegFile::GPR, 1);
1241                 let high = b.alloc_ssa(RegFile::GPR, 1);
1242 
1243                 let rnd_mode = match alu.op {
1244                     nir_op_pack_half_2x16_split => FRndMode::NearestEven,
1245                     nir_op_pack_half_2x16_rtz_split => FRndMode::Zero,
1246                     _ => panic!("Unhandled fp16 pack op"),
1247                 };
1248 
1249                 b.push_op(OpF2F {
1250                     dst: low.into(),
1251                     src: srcs[0],
1252                     src_type: FloatType::F32,
1253                     dst_type: FloatType::F16,
1254                     rnd_mode: rnd_mode,
1255                     ftz: false,
1256                     high: false,
1257                     integer_rnd: false,
1258                 });
1259 
1260                 let src_bits = usize::from(alu.get_src(1).bit_size());
1261                 let src_type = FloatType::from_bits(src_bits);
1262                 assert!(matches!(src_type, FloatType::F32));
1263                 b.push_op(OpF2F {
1264                     dst: high.into(),
1265                     src: srcs[1],
1266                     src_type: FloatType::F32,
1267                     dst_type: FloatType::F16,
1268                     rnd_mode: rnd_mode,
1269                     ftz: false,
1270                     high: false,
1271                     integer_rnd: false,
1272                 });
1273 
1274                 b.prmt(low.into(), high.into(), [0, 1, 4, 5])
1275             }
1276             nir_op_sdot_4x8_iadd => {
1277                 let dst = b.alloc_ssa(RegFile::GPR, 1);
1278                 b.push_op(OpIDp4 {
1279                     dst: dst.into(),
1280                     src_types: [IntType::I8, IntType::I8],
1281                     srcs: [srcs[0], srcs[1], srcs[2]],
1282                 });
1283                 dst
1284             }
1285             nir_op_sudot_4x8_iadd => {
1286                 let dst = b.alloc_ssa(RegFile::GPR, 1);
1287                 b.push_op(OpIDp4 {
1288                     dst: dst.into(),
1289                     src_types: [IntType::I8, IntType::U8],
1290                     srcs: [srcs[0], srcs[1], srcs[2]],
1291                 });
1292                 dst
1293             }
1294             nir_op_udot_4x8_uadd => {
1295                 let dst = b.alloc_ssa(RegFile::GPR, 1);
1296                 b.push_op(OpIDp4 {
1297                     dst: dst.into(),
1298                     src_types: [IntType::U8, IntType::U8],
1299                     srcs: [srcs[0], srcs[1], srcs[2]],
1300                 });
1301                 dst
1302             }
1303             nir_op_u2f16 | nir_op_u2f32 | nir_op_u2f64 => {
1304                 let src_bits = alu.get_src(0).src.bit_size();
1305                 let dst_bits = alu.def.bit_size();
1306                 let dst_type = FloatType::from_bits(dst_bits.into());
1307                 let dst = b.alloc_ssa(RegFile::GPR, dst_bits.div_ceil(32));
1308                 b.push_op(OpI2F {
1309                     dst: dst.into(),
1310                     src: srcs[0],
1311                     dst_type: dst_type,
1312                     src_type: IntType::from_bits(src_bits.into(), false),
1313                     rnd_mode: self.float_ctl[dst_type].rnd_mode,
1314                 });
1315                 dst
1316             }
1317             nir_op_uadd_sat => {
1318                 let x = srcs[0].as_ssa().unwrap();
1319                 let y = srcs[1].as_ssa().unwrap();
1320                 let sum_lo = b.alloc_ssa(RegFile::GPR, 1);
1321                 let ovf_lo = b.alloc_ssa(RegFile::Pred, 1);
1322                 b.push_op(OpIAdd3 {
1323                     dst: sum_lo.into(),
1324                     overflow: [ovf_lo.into(), Dst::None],
1325                     srcs: [0.into(), x[0].into(), y[0].into()],
1326                 });
1327                 if alu.def.bit_size() == 64 {
1328                     let sum_hi = b.alloc_ssa(RegFile::GPR, 1);
1329                     let ovf_hi = b.alloc_ssa(RegFile::Pred, 1);
1330                     b.push_op(OpIAdd3X {
1331                         dst: sum_hi.into(),
1332                         overflow: [ovf_hi.into(), Dst::None],
1333                         srcs: [0.into(), x[1].into(), y[1].into()],
1334                         carry: [ovf_lo.into(), false.into()],
1335                     });
1336                     let lo =
1337                         b.sel(ovf_hi.into(), u32::MAX.into(), sum_lo.into());
1338                     let hi =
1339                         b.sel(ovf_hi.into(), u32::MAX.into(), sum_hi.into());
1340                     [lo[0], hi[0]].into()
1341                 } else {
1342                     assert!(alu.def.bit_size() == 32);
1343                     b.sel(ovf_lo.into(), u32::MAX.into(), sum_lo.into())
1344                 }
1345             }
1346             nir_op_usub_sat => {
1347                 let x = srcs[0].as_ssa().unwrap();
1348                 let y = srcs[1].as_ssa().unwrap();
1349                 let sum_lo = b.alloc_ssa(RegFile::GPR, 1);
1350                 let ovf_lo = b.alloc_ssa(RegFile::Pred, 1);
1351                 // The result of OpIAdd3X is the 33-bit value
1352                 //
1353                 //  s|o = x + !y + 1
1354                 //
1355                 // The overflow bit of this result is true if and only if the
1356                 // subtract did NOT overflow.
1357                 b.push_op(OpIAdd3 {
1358                     dst: sum_lo.into(),
1359                     overflow: [ovf_lo.into(), Dst::None],
1360                     srcs: [0.into(), x[0].into(), Src::from(y[0]).ineg()],
1361                 });
1362                 if alu.def.bit_size() == 64 {
1363                     let sum_hi = b.alloc_ssa(RegFile::GPR, 1);
1364                     let ovf_hi = b.alloc_ssa(RegFile::Pred, 1);
1365                     b.push_op(OpIAdd3X {
1366                         dst: sum_hi.into(),
1367                         overflow: [ovf_hi.into(), Dst::None],
1368                         srcs: [0.into(), x[1].into(), Src::from(y[1]).bnot()],
1369                         carry: [ovf_lo.into(), false.into()],
1370                     });
1371                     let lo = b.sel(ovf_hi.into(), sum_lo.into(), 0.into());
1372                     let hi = b.sel(ovf_hi.into(), sum_hi.into(), 0.into());
1373                     [lo[0], hi[0]].into()
1374                 } else {
1375                     assert!(alu.def.bit_size() == 32);
1376                     b.sel(ovf_lo.into(), sum_lo.into(), 0.into())
1377                 }
1378             }
1379             nir_op_unpack_32_2x16_split_x => {
1380                 b.prmt(srcs[0], 0.into(), [0, 1, 4, 4])
1381             }
1382             nir_op_unpack_32_2x16_split_y => {
1383                 b.prmt(srcs[0], 0.into(), [2, 3, 4, 4])
1384             }
1385             nir_op_unpack_64_2x32_split_x => {
1386                 let src0_x = srcs[0].as_ssa().unwrap()[0];
1387                 b.copy(src0_x.into())
1388             }
1389             nir_op_unpack_64_2x32_split_y => {
1390                 let src0_y = srcs[0].as_ssa().unwrap()[1];
1391                 b.copy(src0_y.into())
1392             }
1393             nir_op_unpack_half_2x16_split_x
1394             | nir_op_unpack_half_2x16_split_y => {
1395                 assert!(alu.def.bit_size() == 32);
1396                 let dst = b.alloc_ssa(RegFile::GPR, 1);
1397 
1398                 b.push_op(OpF2F {
1399                     dst: dst[0].into(),
1400                     src: srcs[0],
1401                     src_type: FloatType::F16,
1402                     dst_type: FloatType::F32,
1403                     rnd_mode: FRndMode::NearestEven,
1404                     ftz: false,
1405                     high: alu.op == nir_op_unpack_half_2x16_split_y,
1406                     integer_rnd: false,
1407                 });
1408 
1409                 dst
1410             }
1411             nir_op_ushr => {
1412                 let x = *srcs[0].as_ssa().unwrap();
1413                 let shift = srcs[1];
1414                 if alu.def.bit_size() == 64 {
1415                     // For 64-bit shifts, we have to use clamp mode so we need
1416                     // to mask the shift in order satisfy NIR semantics.
1417                     let shift = b.lop2(LogicOp2::And, shift, 0x3f.into());
1418                     let dst = b.alloc_ssa(RegFile::GPR, 2);
1419                     b.push_op(OpShf {
1420                         dst: dst[0].into(),
1421                         low: x[0].into(),
1422                         high: x[1].into(),
1423                         shift: shift.into(),
1424                         right: true,
1425                         wrap: false,
1426                         data_type: IntType::U64,
1427                         dst_high: false,
1428                     });
1429                     b.push_op(OpShf {
1430                         dst: dst[1].into(),
1431                         low: x[0].into(),
1432                         high: x[1].into(),
1433                         shift: shift.into(),
1434                         right: true,
1435                         wrap: false,
1436                         data_type: IntType::U32,
1437                         dst_high: true,
1438                     });
1439                     dst
1440                 } else {
1441                     assert!(alu.def.bit_size() == 32);
1442                     b.shr(srcs[0], srcs[1], false)
1443                 }
1444             }
1445             nir_op_fddx | nir_op_fddx_coarse | nir_op_fddx_fine => {
1446                 // TODO: Real coarse derivatives
1447 
1448                 assert!(alu.def.bit_size() == 32);
1449                 let ftype = FloatType::F32;
1450                 let scratch = b.alloc_ssa(RegFile::GPR, 1);
1451 
1452                 b.push_op(OpShfl {
1453                     dst: scratch[0].into(),
1454                     in_bounds: Dst::None,
1455                     src: srcs[0],
1456                     lane: 1_u32.into(),
1457                     c: (0x3_u32 | 0x1c_u32 << 8).into(),
1458                     op: ShflOp::Bfly,
1459                 });
1460 
1461                 let dst = b.alloc_ssa(RegFile::GPR, 1);
1462 
1463                 b.push_op(OpFSwzAdd {
1464                     dst: dst[0].into(),
1465                     srcs: [scratch[0].into(), srcs[0]],
1466                     ops: [
1467                         FSwzAddOp::SubLeft,
1468                         FSwzAddOp::SubRight,
1469                         FSwzAddOp::SubLeft,
1470                         FSwzAddOp::SubRight,
1471                     ],
1472                     rnd_mode: self.float_ctl[ftype].rnd_mode,
1473                     ftz: self.float_ctl[ftype].ftz,
1474                 });
1475 
1476                 dst
1477             }
1478             nir_op_fddy | nir_op_fddy_coarse | nir_op_fddy_fine => {
1479                 // TODO: Real coarse derivatives
1480 
1481                 assert!(alu.def.bit_size() == 32);
1482                 let ftype = FloatType::F32;
1483                 let scratch = b.alloc_ssa(RegFile::GPR, 1);
1484 
1485                 b.push_op(OpShfl {
1486                     dst: scratch[0].into(),
1487                     in_bounds: Dst::None,
1488                     src: srcs[0],
1489                     lane: 2_u32.into(),
1490                     c: (0x3_u32 | 0x1c_u32 << 8).into(),
1491                     op: ShflOp::Bfly,
1492                 });
1493 
1494                 let dst = b.alloc_ssa(RegFile::GPR, 1);
1495 
1496                 b.push_op(OpFSwzAdd {
1497                     dst: dst[0].into(),
1498                     srcs: [scratch[0].into(), srcs[0]],
1499                     ops: [
1500                         FSwzAddOp::SubLeft,
1501                         FSwzAddOp::SubLeft,
1502                         FSwzAddOp::SubRight,
1503                         FSwzAddOp::SubRight,
1504                     ],
1505                     rnd_mode: self.float_ctl[ftype].rnd_mode,
1506                     ftz: self.float_ctl[ftype].ftz,
1507                 });
1508 
1509                 dst
1510             }
1511             _ => panic!("Unsupported ALU instruction: {}", alu.info().name()),
1512         };
1513         self.set_dst(&alu.def, dst);
1514     }
1515 
    /// Translates a NIR jump instruction.
    ///
    /// All control flow is reconstructed when the CFG is built, so jump
    /// instructions require no code generation here.
    fn parse_jump(&mut self, _b: &mut impl SSABuilder, _jump: &nir_jump_instr) {
        // Nothing to do
    }
1519 
    /// Translates a NIR texture instruction into the matching NAK texture
    /// op: OpTxq for header queries, OpTxd for explicit derivatives, OpTmml
    /// for LOD queries, OpTld for texel fetches, OpTld4 for gathers, and
    /// OpTex for everything else.
    ///
    /// By this point, NIR lowering has packed all texture sources into one
    /// or two backend source vectors (nir_tex_src_backend1/2) and encoded
    /// per-op details (LOD mode, offset mode, shadow compare) into
    /// tex.backend_flags.
    fn parse_tex(&mut self, b: &mut impl SSABuilder, tex: &nir_tex_instr) {
        // Map the GLSL sampler dimension onto the hardware texture
        // dimension.
        let dim = match tex.sampler_dim {
            GLSL_SAMPLER_DIM_1D => {
                if tex.is_array {
                    TexDim::Array1D
                } else {
                    TexDim::_1D
                }
            }
            GLSL_SAMPLER_DIM_2D => {
                if tex.is_array {
                    TexDim::Array2D
                } else {
                    TexDim::_2D
                }
            }
            GLSL_SAMPLER_DIM_3D => {
                // There is no such thing as an arrayed 3D texture
                assert!(!tex.is_array);
                TexDim::_3D
            }
            GLSL_SAMPLER_DIM_CUBE => {
                if tex.is_array {
                    TexDim::ArrayCube
                } else {
                    TexDim::Cube
                }
            }
            // Buffer textures are accessed as 1D
            GLSL_SAMPLER_DIM_BUF => TexDim::_1D,
            // Multisampled textures use 2D coordinates; the sample index
            // travels in the backend source vector.
            GLSL_SAMPLER_DIM_MS => {
                if tex.is_array {
                    TexDim::Array2D
                } else {
                    TexDim::_2D
                }
            }
            _ => panic!("Unsupported texture dimension: {}", tex.sampler_dim),
        };

        // Lowering guarantees at most two sources: backend1 and, optionally,
        // backend2.
        let srcs = tex.srcs_as_slice();
        assert!(srcs[0].src_type == nir_tex_src_backend1);
        if srcs.len() > 1 {
            assert!(srcs.len() == 2);
            assert!(srcs[1].src_type == nir_tex_src_backend2);
        }

        // SAFETY: backend_flags is written by the NAK NIR lowering pass and
        // is assumed to share the bit layout of nak_nir_tex_flags.
        let flags: nak_nir_tex_flags =
            unsafe { std::mem::transmute_copy(&tex.backend_flags) };

        // Only allocate destination registers for components that are
        // actually read; mask is the per-component read bitmask.
        let mask = tex.def.components_read();
        let mask = u8::try_from(mask).unwrap();

        let dst_comps = u8::try_from(mask.count_ones()).unwrap();
        let dst = b.alloc_ssa(RegFile::GPR, dst_comps);

        // On Volta and later, the destination is split in two
        let mut dsts = [Dst::None; 2];
        if dst_comps > 2 && b.sm() >= 70 {
            dsts[0] = SSARef::try_from(&dst[0..2]).unwrap().into();
            dsts[1] = SSARef::try_from(&dst[2..]).unwrap().into();
        } else {
            dsts[0] = dst.into();
        }

        if tex.op == nir_texop_hdr_dim_nv {
            // Texture header query: image dimensions
            let src = self.get_src(&srcs[0].src);
            b.push_op(OpTxq {
                dsts: dsts,
                src: src,
                query: TexQuery::Dimension,
                mask: mask,
            });
        } else if tex.op == nir_texop_tex_type_nv {
            // Texture header query: texture type
            let src = self.get_src(&srcs[0].src);
            b.push_op(OpTxq {
                dsts: dsts,
                src: src,
                query: TexQuery::TextureType,
                mask: mask,
            });
        } else {
            // Decode the LOD and offset modes packed into the backend flags
            let lod_mode = match flags.lod_mode() {
                NAK_NIR_LOD_MODE_AUTO => TexLodMode::Auto,
                NAK_NIR_LOD_MODE_ZERO => TexLodMode::Zero,
                NAK_NIR_LOD_MODE_BIAS => TexLodMode::Bias,
                NAK_NIR_LOD_MODE_LOD => TexLodMode::Lod,
                NAK_NIR_LOD_MODE_CLAMP => TexLodMode::Clamp,
                NAK_NIR_LOD_MODE_BIAS_CLAMP => TexLodMode::BiasClamp,
                _ => panic!("Invalid LOD mode"),
            };

            let offset_mode = match flags.offset_mode() {
                NAK_NIR_OFFSET_MODE_NONE => Tld4OffsetMode::None,
                NAK_NIR_OFFSET_MODE_AOFFI => Tld4OffsetMode::AddOffI,
                NAK_NIR_OFFSET_MODE_PER_PX => Tld4OffsetMode::PerPx,
                _ => panic!("Invalid offset mode"),
            };

            // NOTE(review): srcs[1] is accessed unconditionally here even
            // though the asserts above allow a single-source instruction —
            // presumably lowering always provides backend2 for these ops;
            // confirm against the lowering pass.
            let srcs = [self.get_src(&srcs[0].src), self.get_src(&srcs[1].src)];

            if tex.op == nir_texop_txd {
                // Explicit derivatives; no LOD mode, per-pixel offsets, or
                // shadow compare are supported on this op.
                assert!(lod_mode == TexLodMode::Auto);
                assert!(offset_mode != Tld4OffsetMode::PerPx);
                assert!(!flags.has_z_cmpr());
                b.push_op(OpTxd {
                    dsts: dsts,
                    resident: Dst::None,
                    srcs: srcs,
                    dim: dim,
                    offset: offset_mode == Tld4OffsetMode::AddOffI,
                    mask: mask,
                });
            } else if tex.op == nir_texop_lod {
                // LOD query
                assert!(offset_mode == Tld4OffsetMode::None);
                b.push_op(OpTmml {
                    dsts: dsts,
                    srcs: srcs,
                    dim: dim,
                    mask: mask,
                });
            } else if tex.op == nir_texop_txf || tex.op == nir_texop_txf_ms {
                // Texel fetch (optionally multisampled)
                assert!(offset_mode != Tld4OffsetMode::PerPx);
                b.push_op(OpTld {
                    dsts: dsts,
                    resident: Dst::None,
                    srcs: srcs,
                    dim: dim,
                    lod_mode: lod_mode,
                    is_ms: tex.op == nir_texop_txf_ms,
                    offset: offset_mode == Tld4OffsetMode::AddOffI,
                    mask: mask,
                });
            } else if tex.op == nir_texop_tg4 {
                // Gather4: the only op that supports per-pixel offsets
                b.push_op(OpTld4 {
                    dsts: dsts,
                    resident: Dst::None,
                    srcs: srcs,
                    dim: dim,
                    comp: tex.component().try_into().unwrap(),
                    offset_mode: offset_mode,
                    z_cmpr: flags.has_z_cmpr(),
                    mask: mask,
                });
            } else {
                // Plain sampling (tex/txb/txl and friends)
                assert!(offset_mode != Tld4OffsetMode::PerPx);
                b.push_op(OpTex {
                    dsts: dsts,
                    resident: Dst::None,
                    srcs: srcs,
                    dim: dim,
                    lod_mode: lod_mode,
                    z_cmpr: flags.has_z_cmpr(),
                    offset: offset_mode == Tld4OffsetMode::AddOffI,
                    mask: mask,
                });
            }
        }

        // Scatter the packed destination back onto the full NIR destination
        // vector, filling components the shader never reads with zero.
        let mut di = 0_usize;
        let mut nir_dst = Vec::new();
        for i in 0..tex.def.num_components() {
            if mask & (1 << i) == 0 {
                nir_dst.push(b.copy(0.into())[0]);
            } else {
                nir_dst.push(dst[di]);
                di += 1;
            }
        }
        self.set_ssa(tex.def.as_def(), nir_dst);
    }
1689 
get_atomic_type(&self, intrin: &nir_intrinsic_instr) -> AtomType1690     fn get_atomic_type(&self, intrin: &nir_intrinsic_instr) -> AtomType {
1691         let bit_size = intrin.def.bit_size();
1692         match intrin.atomic_op() {
1693             nir_atomic_op_iadd => AtomType::U(bit_size),
1694             nir_atomic_op_imin => AtomType::I(bit_size),
1695             nir_atomic_op_umin => AtomType::U(bit_size),
1696             nir_atomic_op_imax => AtomType::I(bit_size),
1697             nir_atomic_op_umax => AtomType::U(bit_size),
1698             nir_atomic_op_iand => AtomType::U(bit_size),
1699             nir_atomic_op_ior => AtomType::U(bit_size),
1700             nir_atomic_op_ixor => AtomType::U(bit_size),
1701             nir_atomic_op_xchg => AtomType::U(bit_size),
1702             nir_atomic_op_fadd => AtomType::F(bit_size),
1703             nir_atomic_op_fmin => AtomType::F(bit_size),
1704             nir_atomic_op_fmax => AtomType::F(bit_size),
1705             nir_atomic_op_cmpxchg => AtomType::U(bit_size),
1706             _ => panic!("Unsupported NIR atomic op"),
1707         }
1708     }
1709 
get_atomic_op(&self, intrin: &nir_intrinsic_instr) -> AtomOp1710     fn get_atomic_op(&self, intrin: &nir_intrinsic_instr) -> AtomOp {
1711         match intrin.atomic_op() {
1712             nir_atomic_op_iadd => AtomOp::Add,
1713             nir_atomic_op_imin => AtomOp::Min,
1714             nir_atomic_op_umin => AtomOp::Min,
1715             nir_atomic_op_imax => AtomOp::Max,
1716             nir_atomic_op_umax => AtomOp::Max,
1717             nir_atomic_op_iand => AtomOp::And,
1718             nir_atomic_op_ior => AtomOp::Or,
1719             nir_atomic_op_ixor => AtomOp::Xor,
1720             nir_atomic_op_xchg => AtomOp::Exch,
1721             nir_atomic_op_fadd => AtomOp::Add,
1722             nir_atomic_op_fmin => AtomOp::Min,
1723             nir_atomic_op_fmax => AtomOp::Max,
1724             nir_atomic_op_cmpxchg => AtomOp::CmpExch,
1725             _ => panic!("Unsupported NIR atomic op"),
1726         }
1727     }
1728 
get_eviction_priority( &mut self, access: gl_access_qualifier, ) -> MemEvictionPriority1729     fn get_eviction_priority(
1730         &mut self,
1731         access: gl_access_qualifier,
1732     ) -> MemEvictionPriority {
1733         if self.info.sm >= 70 && access & ACCESS_NON_TEMPORAL != 0 {
1734             MemEvictionPriority::First
1735         } else {
1736             MemEvictionPriority::Normal
1737         }
1738     }
1739 
get_image_dim(&mut self, intrin: &nir_intrinsic_instr) -> ImageDim1740     fn get_image_dim(&mut self, intrin: &nir_intrinsic_instr) -> ImageDim {
1741         let is_array = intrin.image_array();
1742         let image_dim = intrin.image_dim();
1743         match intrin.image_dim() {
1744             GLSL_SAMPLER_DIM_1D => {
1745                 if is_array {
1746                     ImageDim::_1DArray
1747                 } else {
1748                     ImageDim::_1D
1749                 }
1750             }
1751             GLSL_SAMPLER_DIM_2D => {
1752                 if is_array {
1753                     ImageDim::_2DArray
1754                 } else {
1755                     ImageDim::_2D
1756                 }
1757             }
1758             GLSL_SAMPLER_DIM_3D => {
1759                 assert!(!is_array);
1760                 ImageDim::_3D
1761             }
1762             GLSL_SAMPLER_DIM_CUBE => ImageDim::_2DArray,
1763             GLSL_SAMPLER_DIM_BUF => {
1764                 assert!(!is_array);
1765                 ImageDim::_1DBuffer
1766             }
1767             _ => panic!("Unsupported image dimension: {}", image_dim),
1768         }
1769     }
1770 
get_image_coord( &mut self, intrin: &nir_intrinsic_instr, dim: ImageDim, ) -> Src1771     fn get_image_coord(
1772         &mut self,
1773         intrin: &nir_intrinsic_instr,
1774         dim: ImageDim,
1775     ) -> Src {
1776         let vec = self.get_ssa(intrin.get_src(1).as_def());
1777         // let sample = self.get_src(&srcs[2]);
1778         let comps = usize::from(dim.coord_comps());
1779         SSARef::try_from(&vec[0..comps]).unwrap().into()
1780     }
1781 
parse_intrinsic( &mut self, b: &mut impl SSABuilder, intrin: &nir_intrinsic_instr, )1782     fn parse_intrinsic(
1783         &mut self,
1784         b: &mut impl SSABuilder,
1785         intrin: &nir_intrinsic_instr,
1786     ) {
1787         let srcs = intrin.srcs_as_slice();
1788         match intrin.intrinsic {
1789             nir_intrinsic_al2p_nv => {
1790                 let offset = self.get_src(&srcs[0]);
1791                 let addr = u16::try_from(intrin.base()).unwrap();
1792 
1793                 let flags = intrin.flags();
1794                 let flags: nak_nir_attr_io_flags =
1795                     unsafe { std::mem::transmute_copy(&flags) };
1796 
1797                 let access = AttrAccess {
1798                     addr: addr,
1799                     comps: 1,
1800                     patch: flags.patch(),
1801                     output: flags.output(),
1802                     phys: false,
1803                 };
1804 
1805                 let dst = b.alloc_ssa(RegFile::GPR, 1);
1806                 b.push_op(OpAL2P {
1807                     dst: dst.into(),
1808                     offset: offset,
1809                     access: access,
1810                 });
1811                 self.set_dst(&intrin.def, dst);
1812             }
1813             nir_intrinsic_ald_nv | nir_intrinsic_ast_nv => {
1814                 let addr = u16::try_from(intrin.base()).unwrap();
1815                 let base = u16::try_from(intrin.range_base()).unwrap();
1816                 let range = u16::try_from(intrin.range()).unwrap();
1817                 let range = base..(base + range);
1818 
1819                 let flags = intrin.flags();
1820                 let flags: nak_nir_attr_io_flags =
1821                     unsafe { std::mem::transmute_copy(&flags) };
1822                 assert!(!flags.patch() || !flags.phys());
1823 
1824                 if let ShaderIoInfo::Vtg(io) = &mut self.info.io {
1825                     if flags.patch() {
1826                         match &mut self.info.stage {
1827                             ShaderStageInfo::TessellationInit(stage) => {
1828                                 assert!(flags.output());
1829                                 stage.per_patch_attribute_count = max(
1830                                     stage.per_patch_attribute_count,
1831                                     (range.end / 4).try_into().unwrap(),
1832                                 );
1833                             }
1834                             ShaderStageInfo::Tessellation => (),
1835                             _ => panic!("Patch I/O not supported"),
1836                         }
1837                     } else {
1838                         if flags.output() {
1839                             if intrin.intrinsic == nir_intrinsic_ast_nv {
1840                                 io.mark_store_req(range.clone());
1841                             }
1842                             io.mark_attrs_written(range);
1843                         } else {
1844                             io.mark_attrs_read(range);
1845                         }
1846                     }
1847                 } else {
1848                     panic!("Must be a VTG stage");
1849                 }
1850 
1851                 let access = AttrAccess {
1852                     addr: addr,
1853                     comps: intrin.num_components,
1854                     patch: flags.patch(),
1855                     output: flags.output(),
1856                     phys: flags.phys(),
1857                 };
1858 
1859                 if intrin.intrinsic == nir_intrinsic_ald_nv {
1860                     let vtx = self.get_src(&srcs[0]);
1861                     let offset = self.get_src(&srcs[1]);
1862 
1863                     assert!(intrin.def.bit_size() == 32);
1864                     let dst = b.alloc_ssa(RegFile::GPR, access.comps);
1865                     b.push_op(OpALd {
1866                         dst: dst.into(),
1867                         vtx: vtx,
1868                         offset: offset,
1869                         access: access,
1870                     });
1871                     self.set_dst(&intrin.def, dst);
1872                 } else if intrin.intrinsic == nir_intrinsic_ast_nv {
1873                     assert!(srcs[0].bit_size() == 32);
1874                     let data = self.get_src(&srcs[0]);
1875                     let vtx = self.get_src(&srcs[1]);
1876                     let offset = self.get_src(&srcs[2]);
1877 
1878                     b.push_op(OpASt {
1879                         data: data,
1880                         vtx: vtx,
1881                         offset: offset,
1882                         access: access,
1883                     });
1884                 } else {
1885                     panic!("Invalid VTG I/O intrinsic");
1886                 }
1887             }
1888             nir_intrinsic_ballot => {
1889                 assert!(srcs[0].bit_size() == 1);
1890                 let src = self.get_src(&srcs[0]);
1891 
1892                 assert!(intrin.def.bit_size() == 32);
1893                 let dst = b.alloc_ssa(RegFile::GPR, 1);
1894 
1895                 b.push_op(OpVote {
1896                     op: VoteOp::Any,
1897                     ballot: dst.into(),
1898                     vote: Dst::None,
1899                     pred: src,
1900                 });
1901                 self.set_dst(&intrin.def, dst);
1902             }
1903             nir_intrinsic_bar_break_nv => {
1904                 let src = self.get_src(&srcs[0]);
1905                 let bar_in = b.bmov_to_bar(src);
1906 
1907                 let bar_out = b.alloc_ssa(RegFile::Bar, 1);
1908                 b.push_op(OpBreak {
1909                     bar_out: bar_out.into(),
1910                     bar_in: bar_in.into(),
1911                     cond: SrcRef::True.into(),
1912                 });
1913 
1914                 self.set_dst(&intrin.def, b.bmov_to_gpr(bar_out.into()));
1915             }
1916             nir_intrinsic_bar_set_nv => {
1917                 let label = self.label_alloc.alloc();
1918                 let old = self.bar_label.insert(intrin.def.index, label);
1919                 assert!(old.is_none());
1920 
1921                 let bar_clear = b.alloc_ssa(RegFile::Bar, 1);
1922                 b.push_op(OpBClear {
1923                     dst: bar_clear.into(),
1924                 });
1925 
1926                 let bar_out = b.alloc_ssa(RegFile::Bar, 1);
1927                 b.push_op(OpBSSy {
1928                     bar_out: bar_out.into(),
1929                     bar_in: bar_clear.into(),
1930                     cond: SrcRef::True.into(),
1931                     target: label,
1932                 });
1933 
1934                 self.set_dst(&intrin.def, b.bmov_to_gpr(bar_out.into()));
1935             }
1936             nir_intrinsic_bar_sync_nv => {
1937                 let src = self.get_src(&srcs[0]);
1938 
1939                 let bar = b.bmov_to_bar(src);
1940                 b.push_op(OpBSync {
1941                     bar: bar.into(),
1942                     cond: SrcRef::True.into(),
1943                 });
1944 
1945                 let bar_set_idx = &srcs[1].as_def().index;
1946                 if let Some(label) = self.bar_label.get(bar_set_idx) {
1947                     b.push_op(OpNop {
1948                         label: Some(*label),
1949                     });
1950                 }
1951             }
1952             nir_intrinsic_bindless_image_atomic
1953             | nir_intrinsic_bindless_image_atomic_swap => {
1954                 let handle = self.get_src(&srcs[0]);
1955                 let dim = self.get_image_dim(intrin);
1956                 let coord = self.get_image_coord(intrin, dim);
1957                 // let sample = self.get_src(&srcs[2]);
1958                 let atom_type = self.get_atomic_type(intrin);
1959                 let atom_op = self.get_atomic_op(intrin);
1960 
1961                 assert!(
1962                     intrin.def.bit_size() == 32 || intrin.def.bit_size() == 64
1963                 );
1964                 assert!(intrin.def.num_components() == 1);
1965                 let dst = b.alloc_ssa(RegFile::GPR, intrin.def.bit_size() / 32);
1966 
1967                 let data = if intrin.intrinsic
1968                     == nir_intrinsic_bindless_image_atomic_swap
1969                 {
1970                     if intrin.def.bit_size() == 64 {
1971                         SSARef::from([
1972                             self.get_ssa(srcs[3].as_def())[0],
1973                             self.get_ssa(srcs[3].as_def())[1],
1974                             self.get_ssa(srcs[4].as_def())[0],
1975                             self.get_ssa(srcs[4].as_def())[1],
1976                         ])
1977                         .into()
1978                     } else {
1979                         SSARef::from([
1980                             self.get_ssa(srcs[3].as_def())[0],
1981                             self.get_ssa(srcs[4].as_def())[0],
1982                         ])
1983                         .into()
1984                     }
1985                 } else {
1986                     self.get_src(&srcs[3])
1987                 };
1988 
1989                 b.push_op(OpSuAtom {
1990                     dst: dst.into(),
1991                     resident: Dst::None,
1992                     handle: handle,
1993                     coord: coord,
1994                     data: data,
1995                     atom_op: atom_op,
1996                     atom_type: atom_type,
1997                     image_dim: dim,
1998                     mem_order: MemOrder::Strong(MemScope::System),
1999                     mem_eviction_priority: self
2000                         .get_eviction_priority(intrin.access()),
2001                 });
2002                 self.set_dst(&intrin.def, dst);
2003             }
2004             nir_intrinsic_bindless_image_load => {
2005                 let handle = self.get_src(&srcs[0]);
2006                 let dim = self.get_image_dim(intrin);
2007                 let coord = self.get_image_coord(intrin, dim);
2008                 // let sample = self.get_src(&srcs[2]);
2009 
2010                 let comps = intrin.num_components;
2011                 assert!(intrin.def.bit_size() == 32);
2012                 assert!(comps == 1 || comps == 2 || comps == 4);
2013 
2014                 let dst = b.alloc_ssa(RegFile::GPR, comps);
2015 
2016                 b.push_op(OpSuLd {
2017                     dst: dst.into(),
2018                     resident: Dst::None,
2019                     image_dim: dim,
2020                     mem_order: MemOrder::Strong(MemScope::System),
2021                     mem_eviction_priority: self
2022                         .get_eviction_priority(intrin.access()),
2023                     mask: (1 << comps) - 1,
2024                     handle: handle,
2025                     coord: coord,
2026                 });
2027                 self.set_dst(&intrin.def, dst);
2028             }
2029             nir_intrinsic_bindless_image_store => {
2030                 let handle = self.get_src(&srcs[0]);
2031                 let dim = self.get_image_dim(intrin);
2032                 let coord = self.get_image_coord(intrin, dim);
2033                 // let sample = self.get_src(&srcs[2]);
2034                 let data = self.get_src(&srcs[3]);
2035 
2036                 let comps = intrin.num_components;
2037                 assert!(srcs[3].bit_size() == 32);
2038                 assert!(comps == 1 || comps == 2 || comps == 4);
2039 
2040                 b.push_op(OpSuSt {
2041                     image_dim: dim,
2042                     mem_order: MemOrder::Strong(MemScope::System),
2043                     mem_eviction_priority: self
2044                         .get_eviction_priority(intrin.access()),
2045                     mask: (1 << comps) - 1,
2046                     handle: handle,
2047                     coord: coord,
2048                     data: data,
2049                 });
2050             }
2051             nir_intrinsic_demote
2052             | nir_intrinsic_discard
2053             | nir_intrinsic_terminate => {
2054                 if let ShaderIoInfo::Fragment(info) = &mut self.info.io {
2055                     info.uses_kill = true;
2056                 } else {
2057                     panic!("OpKill is only available in fragment shaders");
2058                 }
2059                 b.push_op(OpKill {});
2060 
2061                 if intrin.intrinsic == nir_intrinsic_terminate {
2062                     b.push_op(OpExit {});
2063                 }
2064             }
2065             nir_intrinsic_demote_if
2066             | nir_intrinsic_discard_if
2067             | nir_intrinsic_terminate_if => {
2068                 if let ShaderIoInfo::Fragment(info) = &mut self.info.io {
2069                     info.uses_kill = true;
2070                 } else {
2071                     panic!("OpKill is only available in fragment shaders");
2072                 }
2073                 let cond = self.get_ssa(srcs[0].as_def())[0];
2074                 b.predicate(cond.into()).push_op(OpKill {});
2075 
2076                 if intrin.intrinsic == nir_intrinsic_terminate_if {
2077                     b.predicate(cond.into()).push_op(OpExit {});
2078                 }
2079             }
2080             nir_intrinsic_global_atomic => {
2081                 let bit_size = intrin.def.bit_size();
2082                 let (addr, offset) = self.get_io_addr_offset(&srcs[0], 24);
2083                 let data = self.get_src(&srcs[1]);
2084                 let atom_type = self.get_atomic_type(intrin);
2085                 let atom_op = self.get_atomic_op(intrin);
2086 
2087                 assert!(intrin.def.num_components() == 1);
2088                 let dst = b.alloc_ssa(RegFile::GPR, bit_size.div_ceil(32));
2089 
2090                 b.push_op(OpAtom {
2091                     dst: dst.into(),
2092                     addr: addr,
2093                     cmpr: 0.into(),
2094                     data: data,
2095                     atom_op: atom_op,
2096                     atom_type: atom_type,
2097                     addr_offset: offset,
2098                     mem_space: MemSpace::Global(MemAddrType::A64),
2099                     mem_order: MemOrder::Strong(MemScope::System),
2100                     mem_eviction_priority: MemEvictionPriority::Normal, // Note: no intrinic access
2101                 });
2102                 self.set_dst(&intrin.def, dst);
2103             }
2104             nir_intrinsic_global_atomic_swap => {
2105                 assert!(intrin.atomic_op() == nir_atomic_op_cmpxchg);
2106                 let bit_size = intrin.def.bit_size();
2107                 let (addr, offset) = self.get_io_addr_offset(&srcs[0], 24);
2108                 let cmpr = self.get_src(&srcs[1]);
2109                 let data = self.get_src(&srcs[2]);
2110                 let atom_type = AtomType::U(bit_size);
2111 
2112                 assert!(intrin.def.num_components() == 1);
2113                 let dst = b.alloc_ssa(RegFile::GPR, bit_size.div_ceil(32));
2114 
2115                 b.push_op(OpAtom {
2116                     dst: dst.into(),
2117                     addr: addr,
2118                     cmpr: cmpr,
2119                     data: data,
2120                     atom_op: AtomOp::CmpExch,
2121                     atom_type: atom_type,
2122                     addr_offset: offset,
2123                     mem_space: MemSpace::Global(MemAddrType::A64),
2124                     mem_order: MemOrder::Strong(MemScope::System),
2125                     mem_eviction_priority: MemEvictionPriority::Normal, // Note: no intrinic access
2126                 });
2127                 self.set_dst(&intrin.def, dst);
2128             }
2129             nir_intrinsic_ipa_nv => {
2130                 let addr = u16::try_from(intrin.base()).unwrap();
2131 
2132                 let flags = intrin.flags();
2133                 let flags: nak_nir_ipa_flags =
2134                     unsafe { std::mem::transmute_copy(&flags) };
2135 
2136                 let mode = match flags.interp_mode() {
2137                     NAK_INTERP_MODE_PERSPECTIVE => PixelImap::Perspective,
2138                     NAK_INTERP_MODE_SCREEN_LINEAR => PixelImap::ScreenLinear,
2139                     NAK_INTERP_MODE_CONSTANT => PixelImap::Constant,
2140                     _ => panic!("Unsupported interp mode"),
2141                 };
2142 
2143                 let freq = match flags.interp_freq() {
2144                     NAK_INTERP_FREQ_PASS => InterpFreq::Pass,
2145                     NAK_INTERP_FREQ_PASS_MUL_W => InterpFreq::PassMulW,
2146                     NAK_INTERP_FREQ_CONSTANT => InterpFreq::Constant,
2147                     NAK_INTERP_FREQ_STATE => InterpFreq::State,
2148                     _ => panic!("Invalid interp freq"),
2149                 };
2150 
2151                 let loc = match flags.interp_loc() {
2152                     NAK_INTERP_LOC_DEFAULT => InterpLoc::Default,
2153                     NAK_INTERP_LOC_CENTROID => InterpLoc::Centroid,
2154                     NAK_INTERP_LOC_OFFSET => InterpLoc::Offset,
2155                     _ => panic!("Invalid interp loc"),
2156                 };
2157 
2158                 let inv_w = if freq == InterpFreq::PassMulW {
2159                     self.get_src(&srcs[0])
2160                 } else {
2161                     0.into()
2162                 };
2163 
2164                 let offset = if loc == InterpLoc::Offset {
2165                     self.get_src(&srcs[1])
2166                 } else {
2167                     0.into()
2168                 };
2169 
2170                 let ShaderIoInfo::Fragment(io) = &mut self.info.io else {
2171                     panic!("OpIpa is only used for fragment shaders");
2172                 };
2173 
2174                 io.mark_attr_read(addr, mode);
2175 
2176                 let dst = b.alloc_ssa(RegFile::GPR, 1);
2177                 b.push_op(OpIpa {
2178                     dst: dst.into(),
2179                     addr: addr,
2180                     freq: freq,
2181                     loc: loc,
2182                     inv_w: inv_w,
2183                     offset: offset,
2184                 });
2185                 self.set_dst(&intrin.def, dst);
2186             }
2187             nir_intrinsic_isberd_nv => {
2188                 let dst = b.alloc_ssa(RegFile::GPR, 1);
2189                 b.push_op(OpIsberd {
2190                     dst: dst.into(),
2191                     idx: self.get_src(&srcs[0]),
2192                 });
2193                 self.set_dst(&intrin.def, dst);
2194             }
2195             nir_intrinsic_load_barycentric_at_offset_nv => (),
2196             nir_intrinsic_load_barycentric_centroid => (),
2197             nir_intrinsic_load_barycentric_pixel => (),
2198             nir_intrinsic_load_barycentric_sample => (),
2199             nir_intrinsic_load_global | nir_intrinsic_load_global_constant => {
2200                 let size_B =
2201                     (intrin.def.bit_size() / 8) * intrin.def.num_components();
2202                 assert!(u32::from(size_B) <= intrin.align());
2203                 let order =
2204                     if intrin.intrinsic == nir_intrinsic_load_global_constant {
2205                         MemOrder::Constant
2206                     } else {
2207                         MemOrder::Strong(MemScope::System)
2208                     };
2209                 let access = MemAccess {
2210                     mem_type: MemType::from_size(size_B, false),
2211                     space: MemSpace::Global(MemAddrType::A64),
2212                     order: order,
2213                     eviction_priority: self
2214                         .get_eviction_priority(intrin.access()),
2215                 };
2216                 let (addr, offset) = self.get_io_addr_offset(&srcs[0], 32);
2217                 let dst = b.alloc_ssa(RegFile::GPR, size_B.div_ceil(4));
2218 
2219                 b.push_op(OpLd {
2220                     dst: dst.into(),
2221                     addr: addr,
2222                     offset: offset,
2223                     access: access,
2224                 });
2225                 self.set_dst(&intrin.def, dst);
2226             }
2227             nir_intrinsic_ldtram_nv => {
2228                 let ShaderIoInfo::Fragment(io) = &mut self.info.io else {
2229                     panic!("ldtram_nv is only used for fragment shaders");
2230                 };
2231 
2232                 assert!(
2233                     intrin.def.bit_size() == 32
2234                         && intrin.def.num_components == 2
2235                 );
2236 
2237                 let flags = intrin.flags();
2238                 let use_c = flags != 0;
2239 
2240                 let addr = u16::try_from(intrin.base()).unwrap();
2241 
2242                 io.mark_barycentric_attr_in(addr);
2243 
2244                 let dst = b.alloc_ssa(RegFile::GPR, 2);
2245                 b.push_op(OpLdTram {
2246                     dst: dst.into(),
2247                     addr,
2248                     use_c,
2249                 });
2250                 self.set_dst(&intrin.def, dst);
2251             }
2252             nir_intrinsic_load_sample_id => {
2253                 let dst = b.alloc_ssa(RegFile::GPR, 1);
2254                 b.push_op(OpPixLd {
2255                     dst: dst.into(),
2256                     val: PixVal::MyIndex,
2257                 });
2258                 self.set_dst(&intrin.def, dst);
2259             }
2260             nir_intrinsic_load_sample_mask_in => {
2261                 if let ShaderIoInfo::Fragment(info) = &mut self.info.io {
2262                     info.reads_sample_mask = true;
2263                 } else {
2264                     panic!(
2265                         "sample_mask_in is only available in fragment shaders"
2266                     );
2267                 }
2268 
2269                 let dst = b.alloc_ssa(RegFile::GPR, 1);
2270                 b.push_op(OpPixLd {
2271                     dst: dst.into(),
2272                     val: PixVal::CovMask,
2273                 });
2274                 self.set_dst(&intrin.def, dst);
2275             }
2276             nir_intrinsic_load_tess_coord_xy => {
2277                 // Loading gl_TessCoord in tessellation evaluation shaders is
2278                 // weird.  It's treated as a per-vertex output which is indexed
2279                 // by LANEID.
2280                 match &self.info.stage {
2281                     ShaderStageInfo::Tessellation => (),
2282                     _ => panic!(
2283                         "load_tess_coord is only available in tessellation \
2284                          shaders"
2285                     ),
2286                 };
2287 
2288                 assert!(intrin.def.bit_size() == 32);
2289                 assert!(intrin.def.num_components() == 2);
2290 
2291                 let vtx = b.alloc_ssa(RegFile::GPR, 1);
2292                 b.push_op(OpS2R {
2293                     dst: vtx.into(),
2294                     idx: 0,
2295                 });
2296 
2297                 let access = AttrAccess {
2298                     addr: NAK_ATTR_TESS_COORD,
2299                     comps: 2,
2300                     patch: false,
2301                     output: true,
2302                     phys: false,
2303                 };
2304 
2305                 // This is recorded as a patch output in parse_shader() because
2306                 // the hardware requires it be in the SPH, whether we use it or
2307                 // not.
2308 
2309                 let dst = b.alloc_ssa(RegFile::GPR, access.comps);
2310                 b.push_op(OpALd {
2311                     dst: dst.into(),
2312                     vtx: vtx.into(),
2313                     offset: 0.into(),
2314                     access: access,
2315                 });
2316                 self.set_dst(&intrin.def, dst);
2317             }
2318             nir_intrinsic_load_scratch => {
2319                 let size_B =
2320                     (intrin.def.bit_size() / 8) * intrin.def.num_components();
2321                 assert!(u32::from(size_B) <= intrin.align());
2322                 let access = MemAccess {
2323                     mem_type: MemType::from_size(size_B, false),
2324                     space: MemSpace::Local,
2325                     order: MemOrder::Strong(MemScope::CTA),
2326                     eviction_priority: MemEvictionPriority::Normal,
2327                 };
2328                 let (addr, offset) = self.get_io_addr_offset(&srcs[0], 24);
2329                 let dst = b.alloc_ssa(RegFile::GPR, size_B.div_ceil(4));
2330 
2331                 b.push_op(OpLd {
2332                     dst: dst.into(),
2333                     addr: addr,
2334                     offset: offset,
2335                     access: access,
2336                 });
2337                 self.set_dst(&intrin.def, dst);
2338             }
2339             nir_intrinsic_load_shared => {
2340                 let size_B =
2341                     (intrin.def.bit_size() / 8) * intrin.def.num_components();
2342                 assert!(u32::from(size_B) <= intrin.align());
2343                 let access = MemAccess {
2344                     mem_type: MemType::from_size(size_B, false),
2345                     space: MemSpace::Shared,
2346                     order: MemOrder::Strong(MemScope::CTA),
2347                     eviction_priority: MemEvictionPriority::Normal,
2348                 };
2349                 let (addr, offset) = self.get_io_addr_offset(&srcs[0], 24);
2350                 let offset = offset + intrin.base();
2351                 let dst = b.alloc_ssa(RegFile::GPR, size_B.div_ceil(4));
2352 
2353                 b.push_op(OpLd {
2354                     dst: dst.into(),
2355                     addr: addr,
2356                     offset: offset,
2357                     access: access,
2358                 });
2359                 self.set_dst(&intrin.def, dst);
2360             }
2361             nir_intrinsic_load_sysval_nv => {
2362                 let idx = u8::try_from(intrin.base()).unwrap();
2363                 debug_assert!(intrin.def.num_components == 1);
2364                 debug_assert!(
2365                     intrin.def.bit_size == 32 || intrin.def.bit_size == 64
2366                 );
2367                 let comps = intrin.def.bit_size / 32;
2368                 let dst = b.alloc_ssa(RegFile::GPR, comps);
2369                 if idx == NAK_SV_CLOCK || idx == NAK_SV_CLOCK + 1 {
2370                     debug_assert!(idx + comps <= NAK_SV_CLOCK + 2);
2371                     b.push_op(OpCS2R {
2372                         dst: dst.into(),
2373                         idx: idx,
2374                     });
2375                 } else {
2376                     debug_assert!(intrin.def.bit_size == 32);
2377                     b.push_op(OpS2R {
2378                         dst: dst.into(),
2379                         idx: idx,
2380                     });
2381                 }
2382                 self.set_dst(&intrin.def, dst);
2383             }
2384             nir_intrinsic_load_ubo => {
2385                 let size_B =
2386                     (intrin.def.bit_size() / 8) * intrin.def.num_components();
2387                 let idx = srcs[0];
2388 
2389                 let (off, off_imm) = self.get_io_addr_offset(&srcs[1], 16);
2390                 let (off, off_imm) =
2391                     if let Ok(off_imm_u16) = u16::try_from(off_imm) {
2392                         (off, off_imm_u16)
2393                     } else {
2394                         (self.get_src(&srcs[1]), 0)
2395                     };
2396 
2397                 let dst = b.alloc_ssa(RegFile::GPR, size_B.div_ceil(4));
2398 
2399                 if let Some(idx_imm) = idx.as_uint() {
2400                     let idx_imm: u8 = idx_imm.try_into().unwrap();
2401                     let cb = CBufRef {
2402                         buf: CBuf::Binding(idx_imm),
2403                         offset: off_imm,
2404                     };
2405                     if off.is_zero() {
2406                         for (i, comp) in dst.iter().enumerate() {
2407                             let i = u16::try_from(i).unwrap();
2408                             b.copy_to((*comp).into(), cb.offset(i * 4).into());
2409                         }
2410                     } else {
2411                         b.push_op(OpLdc {
2412                             dst: dst.into(),
2413                             cb: cb.into(),
2414                             offset: off,
2415                             mem_type: MemType::from_size(size_B, false),
2416                         });
2417                     }
2418                 } else {
2419                     panic!("Indirect UBO indices not yet supported");
2420                 }
2421                 self.set_dst(&intrin.def, dst);
2422             }
2423             nir_intrinsic_barrier => {
2424                 let modes = intrin.memory_modes();
2425                 let semantics = intrin.memory_semantics();
2426                 if (modes & nir_var_mem_global) != 0
2427                     && (semantics & NIR_MEMORY_RELEASE) != 0
2428                 {
2429                     b.push_op(OpCCtl {
2430                         op: CCtlOp::WBAll,
2431                         mem_space: MemSpace::Global(MemAddrType::A64),
2432                         addr: 0.into(),
2433                         addr_offset: 0,
2434                     });
2435                 }
2436                 match intrin.execution_scope() {
2437                     SCOPE_NONE => (),
2438                     SCOPE_WORKGROUP => {
2439                         assert!(
2440                             self.nir.info.stage() == MESA_SHADER_COMPUTE
2441                                 || self.nir.info.stage() == MESA_SHADER_KERNEL
2442                         );
2443                         self.info.num_barriers = 1;
2444                         b.push_op(OpBar {});
2445                     }
2446                     _ => panic!("Unhandled execution scope"),
2447                 }
2448                 if intrin.memory_scope() != SCOPE_NONE {
2449                     let mem_scope = match intrin.memory_scope() {
2450                         SCOPE_INVOCATION | SCOPE_SUBGROUP => MemScope::CTA,
2451                         SCOPE_WORKGROUP | SCOPE_QUEUE_FAMILY | SCOPE_DEVICE => {
2452                             MemScope::GPU
2453                         }
2454                         _ => panic!("Unhandled memory scope"),
2455                     };
2456                     b.push_op(OpMemBar { scope: mem_scope });
2457                 }
2458                 if (modes & nir_var_mem_global) != 0
2459                     && (semantics & NIR_MEMORY_ACQUIRE) != 0
2460                 {
2461                     b.push_op(OpCCtl {
2462                         op: CCtlOp::IVAll,
2463                         mem_space: MemSpace::Global(MemAddrType::A64),
2464                         addr: 0.into(),
2465                         addr_offset: 0,
2466                     });
2467                 }
2468             }
2469             nir_intrinsic_quad_broadcast
2470             | nir_intrinsic_read_invocation
2471             | nir_intrinsic_shuffle
2472             | nir_intrinsic_shuffle_down
2473             | nir_intrinsic_shuffle_up
2474             | nir_intrinsic_shuffle_xor => {
2475                 assert!(srcs[0].bit_size() == 32);
2476                 assert!(srcs[0].num_components() == 1);
2477                 let data = self.get_src(&srcs[0]);
2478 
2479                 assert!(srcs[1].bit_size() == 32);
2480                 let idx = self.get_src(&srcs[1]);
2481 
2482                 assert!(intrin.def.bit_size() == 32);
2483                 let dst = b.alloc_ssa(RegFile::GPR, 1);
2484 
2485                 b.push_op(OpShfl {
2486                     dst: dst.into(),
2487                     in_bounds: Dst::None,
2488                     src: data,
2489                     lane: idx,
2490                     c: match intrin.intrinsic {
2491                         nir_intrinsic_quad_broadcast => 0x1c_03.into(),
2492                         nir_intrinsic_shuffle_up => 0.into(),
2493                         _ => 0x1f.into(),
2494                     },
2495                     op: match intrin.intrinsic {
2496                         nir_intrinsic_shuffle_down => ShflOp::Down,
2497                         nir_intrinsic_shuffle_up => ShflOp::Up,
2498                         nir_intrinsic_shuffle_xor => ShflOp::Bfly,
2499                         _ => ShflOp::Idx,
2500                     },
2501                 });
2502                 self.set_dst(&intrin.def, dst);
2503             }
2504             nir_intrinsic_quad_swap_horizontal
2505             | nir_intrinsic_quad_swap_vertical
2506             | nir_intrinsic_quad_swap_diagonal => {
2507                 assert!(srcs[0].bit_size() == 32);
2508                 assert!(srcs[0].num_components() == 1);
2509                 let data = self.get_src(&srcs[0]);
2510 
2511                 assert!(intrin.def.bit_size() == 32);
2512                 let dst = b.alloc_ssa(RegFile::GPR, 1);
2513                 b.push_op(OpShfl {
2514                     dst: dst.into(),
2515                     in_bounds: Dst::None,
2516                     src: data,
2517                     lane: match intrin.intrinsic {
2518                         nir_intrinsic_quad_swap_horizontal => 1_u32.into(),
2519                         nir_intrinsic_quad_swap_vertical => 2_u32.into(),
2520                         nir_intrinsic_quad_swap_diagonal => 3_u32.into(),
2521                         op => panic!("Unknown quad intrinsic {}", op),
2522                     },
2523                     c: 0x1c_03.into(),
2524                     op: ShflOp::Bfly,
2525                 });
2526                 self.set_dst(&intrin.def, dst);
2527             }
2528             nir_intrinsic_shared_atomic => {
2529                 let bit_size = intrin.def.bit_size();
2530                 let (addr, offset) = self.get_io_addr_offset(&srcs[0], 24);
2531                 let data = self.get_src(&srcs[1]);
2532                 let atom_type = self.get_atomic_type(intrin);
2533                 let atom_op = self.get_atomic_op(intrin);
2534 
2535                 assert!(intrin.def.num_components() == 1);
2536                 let dst = b.alloc_ssa(RegFile::GPR, bit_size.div_ceil(32));
2537 
2538                 b.push_op(OpAtom {
2539                     dst: dst.into(),
2540                     addr: addr,
2541                     cmpr: 0.into(),
2542                     data: data,
2543                     atom_op: atom_op,
2544                     atom_type: atom_type,
2545                     addr_offset: offset,
2546                     mem_space: MemSpace::Shared,
2547                     mem_order: MemOrder::Strong(MemScope::CTA),
2548                     mem_eviction_priority: MemEvictionPriority::Normal,
2549                 });
2550                 self.set_dst(&intrin.def, dst);
2551             }
2552             nir_intrinsic_shared_atomic_swap => {
2553                 assert!(intrin.atomic_op() == nir_atomic_op_cmpxchg);
2554                 let bit_size = intrin.def.bit_size();
2555                 let (addr, offset) = self.get_io_addr_offset(&srcs[0], 24);
2556                 let cmpr = self.get_src(&srcs[1]);
2557                 let data = self.get_src(&srcs[2]);
2558                 let atom_type = AtomType::U(bit_size);
2559 
2560                 assert!(intrin.def.num_components() == 1);
2561                 let dst = b.alloc_ssa(RegFile::GPR, bit_size.div_ceil(32));
2562 
2563                 b.push_op(OpAtom {
2564                     dst: dst.into(),
2565                     addr: addr,
2566                     cmpr: cmpr,
2567                     data: data,
2568                     atom_op: AtomOp::CmpExch,
2569                     atom_type: atom_type,
2570                     addr_offset: offset,
2571                     mem_space: MemSpace::Shared,
2572                     mem_order: MemOrder::Strong(MemScope::CTA),
2573                     mem_eviction_priority: MemEvictionPriority::Normal,
2574                 });
2575                 self.set_dst(&intrin.def, dst);
2576             }
2577             nir_intrinsic_ssa_bar_nv => {
2578                 let src = self.get_src(&srcs[0]);
2579                 b.push_op(OpSrcBar { src });
2580             }
2581             nir_intrinsic_store_global => {
2582                 let data = self.get_src(&srcs[0]);
2583                 let size_B =
2584                     (srcs[0].bit_size() / 8) * srcs[0].num_components();
2585                 assert!(u32::from(size_B) <= intrin.align());
2586                 let access = MemAccess {
2587                     mem_type: MemType::from_size(size_B, false),
2588                     space: MemSpace::Global(MemAddrType::A64),
2589                     order: MemOrder::Strong(MemScope::System),
2590                     eviction_priority: self
2591                         .get_eviction_priority(intrin.access()),
2592                 };
2593                 let (addr, offset) = self.get_io_addr_offset(&srcs[1], 32);
2594 
2595                 b.push_op(OpSt {
2596                     addr: addr,
2597                     data: data,
2598                     offset: offset,
2599                     access: access,
2600                 });
2601             }
2602             nir_intrinsic_store_output => {
2603                 let ShaderIoInfo::Fragment(_) = &mut self.info.io else {
2604                     panic!("load_input is only used for fragment shaders");
2605                 };
2606                 let data = self.get_src(&srcs[0]);
2607 
2608                 let addr = u16::try_from(intrin.base()).unwrap()
2609                     + u16::try_from(srcs[1].as_uint().unwrap()).unwrap()
2610                     + 4 * u16::try_from(intrin.component()).unwrap();
2611                 assert!(addr % 4 == 0);
2612 
2613                 for c in 0..usize::from(intrin.num_components) {
2614                     let idx = usize::from(addr / 4) + c;
2615                     self.fs_out_regs[idx] = data.as_ssa().unwrap()[c];
2616                 }
2617             }
2618             nir_intrinsic_store_scratch => {
2619                 let data = self.get_src(&srcs[0]);
2620                 let size_B =
2621                     (srcs[0].bit_size() / 8) * srcs[0].num_components();
2622                 assert!(u32::from(size_B) <= intrin.align());
2623                 let access = MemAccess {
2624                     mem_type: MemType::from_size(size_B, false),
2625                     space: MemSpace::Local,
2626                     order: MemOrder::Strong(MemScope::CTA),
2627                     eviction_priority: MemEvictionPriority::Normal,
2628                 };
2629                 let (addr, offset) = self.get_io_addr_offset(&srcs[1], 24);
2630 
2631                 b.push_op(OpSt {
2632                     addr: addr,
2633                     data: data,
2634                     offset: offset,
2635                     access: access,
2636                 });
2637             }
2638             nir_intrinsic_store_shared => {
2639                 let data = self.get_src(&srcs[0]);
2640                 let size_B =
2641                     (srcs[0].bit_size() / 8) * srcs[0].num_components();
2642                 assert!(u32::from(size_B) <= intrin.align());
2643                 let access = MemAccess {
2644                     mem_type: MemType::from_size(size_B, false),
2645                     space: MemSpace::Shared,
2646                     order: MemOrder::Strong(MemScope::CTA),
2647                     eviction_priority: MemEvictionPriority::Normal,
2648                 };
2649                 let (addr, offset) = self.get_io_addr_offset(&srcs[1], 24);
2650                 let offset = offset + intrin.base();
2651 
2652                 b.push_op(OpSt {
2653                     addr: addr,
2654                     data: data,
2655                     offset: offset,
2656                     access: access,
2657                 });
2658             }
2659             nir_intrinsic_emit_vertex_nv | nir_intrinsic_end_primitive_nv => {
2660                 assert!(intrin.def.bit_size() == 32);
2661                 assert!(intrin.def.num_components() == 1);
2662 
2663                 let dst = b.alloc_ssa(RegFile::GPR, 1);
2664                 let handle = self.get_src(&srcs[0]);
2665                 let stream_id = intrin.stream_id();
2666 
2667                 b.push_op(OpOut {
2668                     dst: dst.into(),
2669                     handle: handle,
2670                     stream: stream_id.into(),
2671                     out_type: if intrin.intrinsic
2672                         == nir_intrinsic_emit_vertex_nv
2673                     {
2674                         OutType::Emit
2675                     } else {
2676                         OutType::Cut
2677                     },
2678                 });
2679                 self.set_dst(&intrin.def, dst);
2680             }
2681 
2682             nir_intrinsic_final_primitive_nv => {
2683                 let handle = self.get_src(&srcs[0]);
2684 
2685                 if self.info.sm >= 70 {
2686                     b.push_op(OpOutFinal { handle: handle });
2687                 }
2688             }
2689             nir_intrinsic_vote_all
2690             | nir_intrinsic_vote_any
2691             | nir_intrinsic_vote_ieq => {
2692                 assert!(srcs[0].bit_size() == 1);
2693                 let src = self.get_src(&srcs[0]);
2694 
2695                 assert!(intrin.def.bit_size() == 1);
2696                 let dst = b.alloc_ssa(RegFile::Pred, 1);
2697 
2698                 b.push_op(OpVote {
2699                     op: match intrin.intrinsic {
2700                         nir_intrinsic_vote_all => VoteOp::All,
2701                         nir_intrinsic_vote_any => VoteOp::Any,
2702                         nir_intrinsic_vote_ieq => VoteOp::Eq,
2703                         _ => panic!("Unknown vote intrinsic"),
2704                     },
2705                     ballot: Dst::None,
2706                     vote: dst.into(),
2707                     pred: src,
2708                 });
2709                 self.set_dst(&intrin.def, dst);
2710             }
2711             _ => panic!(
2712                 "Unsupported intrinsic instruction: {}",
2713                 intrin.info().name()
2714             ),
2715         }
2716     }
2717 
parse_load_const( &mut self, b: &mut impl SSABuilder, load_const: &nir_load_const_instr, )2718     fn parse_load_const(
2719         &mut self,
2720         b: &mut impl SSABuilder,
2721         load_const: &nir_load_const_instr,
2722     ) {
2723         let values = &load_const.values();
2724 
2725         let mut dst = Vec::new();
2726         match load_const.def.bit_size {
2727             1 => {
2728                 for c in 0..load_const.def.num_components {
2729                     let imm_b1 = unsafe { values[usize::from(c)].b };
2730                     dst.push(b.copy(imm_b1.into())[0]);
2731                 }
2732             }
2733             8 => {
2734                 for dw in 0..load_const.def.num_components.div_ceil(4) {
2735                     let mut imm_u32 = 0;
2736                     for b in 0..4 {
2737                         let c = dw * 4 + b;
2738                         if c < load_const.def.num_components {
2739                             let imm_u8 = unsafe { values[usize::from(c)].u8_ };
2740                             imm_u32 |= u32::from(imm_u8) << b * 8;
2741                         }
2742                     }
2743                     dst.push(b.copy(imm_u32.into())[0]);
2744                 }
2745             }
2746             16 => {
2747                 for dw in 0..load_const.def.num_components.div_ceil(2) {
2748                     let mut imm_u32 = 0;
2749                     for w in 0..2 {
2750                         let c = dw * 2 + w;
2751                         if c < load_const.def.num_components {
2752                             let imm_u16 =
2753                                 unsafe { values[usize::from(c)].u16_ };
2754                             imm_u32 |= u32::from(imm_u16) << w * 16;
2755                         }
2756                     }
2757                     dst.push(b.copy(imm_u32.into())[0]);
2758                 }
2759             }
2760             32 => {
2761                 for c in 0..load_const.def.num_components {
2762                     let imm_u32 = unsafe { values[usize::from(c)].u32_ };
2763                     dst.push(b.copy(imm_u32.into())[0]);
2764                 }
2765             }
2766             64 => {
2767                 for c in 0..load_const.def.num_components {
2768                     let imm_u64 = unsafe { values[c as usize].u64_ };
2769                     dst.push(b.copy((imm_u64 as u32).into())[0]);
2770                     dst.push(b.copy(((imm_u64 >> 32) as u32).into())[0]);
2771                 }
2772             }
2773             _ => panic!("Unknown bit size: {}", load_const.def.bit_size),
2774         }
2775 
2776         self.set_ssa(&load_const.def, dst);
2777     }
2778 
parse_undef( &mut self, b: &mut impl SSABuilder, undef: &nir_undef_instr, )2779     fn parse_undef(
2780         &mut self,
2781         b: &mut impl SSABuilder,
2782         undef: &nir_undef_instr,
2783     ) {
2784         let dst = alloc_ssa_for_nir(b, &undef.def);
2785         for c in &dst {
2786             b.push_op(OpUndef { dst: (*c).into() });
2787         }
2788         self.set_ssa(&undef.def, dst);
2789     }
2790 
store_fs_outputs(&mut self, b: &mut impl SSABuilder)2791     fn store_fs_outputs(&mut self, b: &mut impl SSABuilder) {
2792         let ShaderIoInfo::Fragment(info) = &mut self.info.io else {
2793             return;
2794         };
2795 
2796         for i in 0..32 {
2797             // Assume that colors have to come a vec4 at a time
2798             if !self.fs_out_regs[i].is_none() {
2799                 info.writes_color |= 0xf << (i & !3)
2800             }
2801         }
2802         let mask_idx = (NAK_FS_OUT_SAMPLE_MASK / 4) as usize;
2803         info.writes_sample_mask = !self.fs_out_regs[mask_idx].is_none();
2804         let depth_idx = (NAK_FS_OUT_DEPTH / 4) as usize;
2805         info.writes_depth = !self.fs_out_regs[depth_idx].is_none();
2806 
2807         let mut srcs = Vec::new();
2808         for i in 0..32 {
2809             if info.writes_color & (1 << i) != 0 {
2810                 if self.fs_out_regs[i].is_none() {
2811                     srcs.push(0.into());
2812                 } else {
2813                     srcs.push(self.fs_out_regs[i].into());
2814                 }
2815             }
2816         }
2817 
2818         // These always come together for some reason
2819         if info.writes_sample_mask || info.writes_depth {
2820             if info.writes_sample_mask {
2821                 srcs.push(self.fs_out_regs[mask_idx].into());
2822             } else {
2823                 srcs.push(0.into());
2824             }
2825             if info.writes_depth {
2826                 // Saturate depth writes.
2827                 //
2828                 // TODO: This seems wrong in light of unrestricted depth but
2829                 // it's needed to pass CTS tests for now.
2830                 let depth = self.fs_out_regs[depth_idx];
2831                 let sat_depth = b.alloc_ssa(RegFile::GPR, 1);
2832                 b.push_op(OpFAdd {
2833                     dst: sat_depth.into(),
2834                     srcs: [depth.into(), 0.into()],
2835                     saturate: true,
2836                     rnd_mode: FRndMode::NearestEven,
2837                     ftz: false,
2838                 });
2839                 srcs.push(sat_depth.into());
2840             }
2841         }
2842 
2843         b.push_op(OpFSOut { srcs: srcs });
2844     }
2845 
    /// Translates one NIR basic block into a NAK `BasicBlock` and registers
    /// it (plus its outgoing edges) in `self.cfg`.
    ///
    /// Order is significant throughout: phi destinations are emitted first,
    /// then the block's instructions, then phi sources for successors, and
    /// finally the terminating branch/exit.  Fall-through CFG edges must be
    /// added before non-fall-through ones.
    fn parse_block(
        &mut self,
        ssa_alloc: &mut SSAValueAllocator,
        phi_map: &mut PhiAllocMap,
        nb: &nir_block,
    ) {
        let mut b = SSAInstrBuilder::new(self.info.sm, ssa_alloc);

        // Only in the shader's first block, and only when shared memory is
        // actually used.
        if nb.index == 0 && self.nir.info.shared_size > 0 {
            // The blob seems to always do a BSYNC before accessing shared
            // memory.  Perhaps this is to ensure that our allocation is
            // actually available and not in use by another thread?
            let label = self.label_alloc.alloc();
            let bar_clear = b.alloc_ssa(RegFile::Bar, 1);

            b.push_op(OpBClear {
                dst: bar_clear.into(),
            });

            let bar = b.alloc_ssa(RegFile::Bar, 1);
            b.push_op(OpBSSy {
                bar_out: bar.into(),
                bar_in: bar_clear.into(),
                cond: SrcRef::True.into(),
                target: label,
            });

            b.push_op(OpBSync {
                bar: bar.into(),
                cond: SrcRef::True.into(),
            });

            // The label the BSSY targets; placed immediately after the sync.
            b.push_op(OpNop { label: Some(label) });
        }

        // Gather all phi destinations into a single OpPhiDsts at the top of
        // the block.  NIR keeps phis at the head of the instruction list, so
        // we can stop at the first non-phi.
        let mut phi = OpPhiDsts::new();
        for ni in nb.iter_instr_list() {
            if ni.type_ == nir_instr_type_phi {
                let np = ni.as_phi().unwrap();
                let dst = alloc_ssa_for_nir(&mut b, np.def.as_def());
                // One phi id per SSA component of the NIR def.
                for (i, dst) in dst.iter().enumerate() {
                    let phi_id = phi_map.get_phi_id(np, i.try_into().unwrap());
                    phi.dsts.push(phi_id, (*dst).into());
                }
                self.set_ssa(np.def.as_def(), dst);
            } else {
                break;
            }
        }

        if !phi.dsts.is_empty() {
            b.push_op(phi);
        }

        // Main translation loop: dispatch each NIR instruction to its parser.
        for ni in nb.iter_instr_list() {
            if DEBUG.annotate() {
                // Collapse the printed NIR instruction onto a single line so
                // it fits in one annotation.
                let annotation = self
                    .nir_instr_printer
                    .instr_to_string(ni)
                    .split_whitespace()
                    .collect::<Vec<_>>()
                    .join(" ");
                b.push_op(OpAnnotate {
                    annotation: format!("generated by \"{}\"", annotation,),
                });
            }

            match ni.type_ {
                nir_instr_type_alu => {
                    self.parse_alu(&mut b, ni.as_alu().unwrap())
                }
                nir_instr_type_jump => {
                    self.parse_jump(&mut b, ni.as_jump().unwrap())
                }
                nir_instr_type_tex => {
                    self.parse_tex(&mut b, ni.as_tex().unwrap())
                }
                nir_instr_type_intrinsic => {
                    self.parse_intrinsic(&mut b, ni.as_intrinsic().unwrap())
                }
                nir_instr_type_load_const => {
                    self.parse_load_const(&mut b, ni.as_load_const().unwrap())
                }
                nir_instr_type_undef => {
                    self.parse_undef(&mut b, ni.as_undef().unwrap())
                }
                // Phis were already handled above.
                nir_instr_type_phi => (),
                _ => panic!("Unsupported instruction type"),
            }
        }

        // Emit phi sources for each successor block: for every phi in the
        // successor, find the source coming from this block and record it.
        let succ = nb.successors();
        for sb in succ {
            let sb = match sb {
                Some(b) => b,
                None => continue,
            };

            let mut phi = OpPhiSrcs::new();

            for i in sb.iter_instr_list() {
                // Phis are at the head of the list; stop at the first non-phi.
                let np = match i.as_phi() {
                    Some(phi) => phi,
                    None => break,
                };

                for ps in np.iter_srcs() {
                    if ps.pred().index == nb.index {
                        let src = *self.get_src(&ps.src).as_ssa().unwrap();
                        for (i, src) in src.iter().enumerate() {
                            let phi_id =
                                phi_map.get_phi_id(np, i.try_into().unwrap());
                            phi.srcs.push(phi_id, (*src).into());
                        }
                        // Each phi has at most one source from this block.
                        break;
                    }
                }
            }

            if !phi.srcs.is_empty() {
                b.push_op(phi);
            }
        }

        if let Some(ni) = nb.following_if() {
            // The fall-through edge has to come first
            self.cfg.add_edge(nb.index, ni.first_then_block().index);
            self.cfg.add_edge(nb.index, ni.first_else_block().index);

            let mut bra = Instr::new_boxed(OpBra {
                target: self.get_block_label(ni.first_else_block()),
            });

            let cond = self.get_ssa(ni.condition.as_def())[0];
            bra.pred = cond.into();
            // This is the branch to jump to the else
            bra.pred.pred_inv = true;

            b.push_instr(bra);
        } else {
            // No if follows, so there is exactly one successor.
            assert!(succ[1].is_none());
            let s0 = succ[0].unwrap();
            if s0.index == self.end_block_id {
                // This block exits the shader: flush FS outputs (no-op for
                // non-fragment stages) and exit.
                self.store_fs_outputs(&mut b);
                b.push_op(OpExit {});
            } else {
                self.cfg.add_edge(nb.index, s0.index);
                b.push_op(OpBra {
                    target: self.get_block_label(s0),
                });
            }
        }

        let mut bb = BasicBlock::new(self.get_block_label(nb));
        bb.instrs.append(&mut b.as_vec());
        self.cfg.add_node(nb.index, bb);
    }
3003 
parse_if( &mut self, ssa_alloc: &mut SSAValueAllocator, phi_map: &mut PhiAllocMap, ni: &nir_if, )3004     fn parse_if(
3005         &mut self,
3006         ssa_alloc: &mut SSAValueAllocator,
3007         phi_map: &mut PhiAllocMap,
3008         ni: &nir_if,
3009     ) {
3010         self.parse_cf_list(ssa_alloc, phi_map, ni.iter_then_list());
3011         self.parse_cf_list(ssa_alloc, phi_map, ni.iter_else_list());
3012     }
3013 
parse_loop( &mut self, ssa_alloc: &mut SSAValueAllocator, phi_map: &mut PhiAllocMap, nl: &nir_loop, )3014     fn parse_loop(
3015         &mut self,
3016         ssa_alloc: &mut SSAValueAllocator,
3017         phi_map: &mut PhiAllocMap,
3018         nl: &nir_loop,
3019     ) {
3020         self.parse_cf_list(ssa_alloc, phi_map, nl.iter_body());
3021     }
3022 
parse_cf_list( &mut self, ssa_alloc: &mut SSAValueAllocator, phi_map: &mut PhiAllocMap, list: ExecListIter<nir_cf_node>, )3023     fn parse_cf_list(
3024         &mut self,
3025         ssa_alloc: &mut SSAValueAllocator,
3026         phi_map: &mut PhiAllocMap,
3027         list: ExecListIter<nir_cf_node>,
3028     ) {
3029         for node in list {
3030             match node.type_ {
3031                 nir_cf_node_block => {
3032                     let nb = node.as_block().unwrap();
3033                     self.parse_block(ssa_alloc, phi_map, nb);
3034                 }
3035                 nir_cf_node_if => {
3036                     let ni = node.as_if().unwrap();
3037                     self.parse_if(ssa_alloc, phi_map, ni);
3038                 }
3039                 nir_cf_node_loop => {
3040                     let nl = node.as_loop().unwrap();
3041                     self.parse_loop(ssa_alloc, phi_map, nl);
3042                 }
3043                 _ => panic!("Invalid inner CF node type"),
3044             }
3045         }
3046     }
3047 
parse_function_impl(&mut self, nfi: &nir_function_impl) -> Function3048     pub fn parse_function_impl(&mut self, nfi: &nir_function_impl) -> Function {
3049         let mut ssa_alloc = SSAValueAllocator::new();
3050         self.end_block_id = nfi.end_block().index;
3051 
3052         let mut phi_alloc = PhiAllocator::new();
3053         let mut phi_map = PhiAllocMap::new(&mut phi_alloc);
3054 
3055         self.parse_cf_list(&mut ssa_alloc, &mut phi_map, nfi.iter_body());
3056 
3057         let cfg = std::mem::take(&mut self.cfg).as_cfg();
3058         assert!(cfg.len() > 0);
3059         for i in 0..cfg.len() {
3060             if cfg[i].falls_through() {
3061                 assert!(cfg.succ_indices(i)[0] == i + 1);
3062             }
3063         }
3064 
3065         Function {
3066             ssa_alloc: ssa_alloc,
3067             phi_alloc: phi_alloc,
3068             blocks: cfg,
3069         }
3070     }
3071 
parse_shader(mut self) -> Shader3072     pub fn parse_shader(mut self) -> Shader {
3073         let mut functions = Vec::new();
3074         for nf in self.nir.iter_functions() {
3075             if let Some(nfi) = nf.get_impl() {
3076                 let f = self.parse_function_impl(nfi);
3077                 functions.push(f);
3078             }
3079         }
3080 
3081         // Tessellation evaluation shaders MUST claim to read gl_TessCoord or
3082         // the hardware will throw an SPH error.
3083         if matches!(self.info.stage, ShaderStageInfo::Tessellation) {
3084             match &mut self.info.io {
3085                 ShaderIoInfo::Vtg(io) => {
3086                     let tc = NAK_ATTR_TESS_COORD;
3087                     io.mark_attrs_written(tc..(tc + 8));
3088                 }
3089                 _ => panic!("Tessellation must have ShaderIoInfo::Vtg"),
3090             }
3091         }
3092 
3093         Shader {
3094             info: self.info,
3095             functions: functions,
3096         }
3097     }
3098 }
3099 
nak_shader_from_nir(ns: &nir_shader, sm: u8) -> Shader3100 pub fn nak_shader_from_nir(ns: &nir_shader, sm: u8) -> Shader {
3101     ShaderFromNir::new(ns, sm).parse_shader()
3102 }
3103