1 // Copyright © 2022 Collabora, Ltd.
2 // SPDX-License-Identifier: MIT
3
4 #![allow(non_upper_case_globals)]
5
6 use crate::api::GetDebugFlags;
7 use crate::api::DEBUG;
8 use crate::builder::*;
9 use crate::ir::*;
10 use crate::sph::{OutputTopology, PixelImap};
11
12 use nak_bindings::*;
13
14 use compiler::bindings::*;
15 use compiler::cfg::CFGBuilder;
16 use compiler::nir::*;
17 use compiler::nir_instr_printer::NirInstrPrinter;
18 use std::cmp::max;
19 use std::collections::{HashMap, HashSet};
20 use std::ops::Index;
21
22 fn init_info_from_nir(nak: &nak_compiler, nir: &nir_shader) -> ShaderInfo {
23 ShaderInfo {
24 num_gprs: 0,
25 num_instrs: 0,
26 num_control_barriers: 0,
27 slm_size: nir.scratch_size,
28 max_crs_depth: 0,
29 uses_global_mem: false,
30 writes_global_mem: false,
31 // TODO: handle this.
32 uses_fp64: false,
33 stage: match nir.info.stage() {
34 MESA_SHADER_COMPUTE => {
35 ShaderStageInfo::Compute(ComputeShaderInfo {
36 local_size: [
37 nir.info.workgroup_size[0],
38 nir.info.workgroup_size[1],
39 nir.info.workgroup_size[2],
40 ],
41 smem_size: nir.info.shared_size.try_into().unwrap(),
42 })
43 }
44 MESA_SHADER_VERTEX => ShaderStageInfo::Vertex,
45 MESA_SHADER_FRAGMENT => {
46 let info_fs = unsafe { &nir.info.__bindgen_anon_1.fs };
47 ShaderStageInfo::Fragment(FragmentShaderInfo {
48 uses_kill: false,
49 does_interlock: false,
50 post_depth_coverage: info_fs.post_depth_coverage(),
51 early_fragment_tests: info_fs.early_fragment_tests(),
52 uses_sample_shading: info_fs.uses_sample_shading(),
53 })
54 }
55 MESA_SHADER_GEOMETRY => {
56 let info_gs = unsafe { &nir.info.__bindgen_anon_1.gs };
57 let output_topology = match info_gs.output_primitive {
58 MESA_PRIM_POINTS => OutputTopology::PointList,
59 MESA_PRIM_LINE_STRIP => OutputTopology::LineStrip,
60 MESA_PRIM_TRIANGLE_STRIP => OutputTopology::TriangleStrip,
61 _ => panic!(
62 "Invalid GS input primitive {}",
63 info_gs.input_primitive
64 ),
65 };
66
67 ShaderStageInfo::Geometry(GeometryShaderInfo {
68 // TODO: Should be set if VK_NV_geometry_shader_passthrough is in use.
69 passthrough_enable: false,
70 stream_out_mask: info_gs.active_stream_mask(),
71 threads_per_input_primitive: info_gs.invocations,
72 output_topology: output_topology,
73 max_output_vertex_count: info_gs.vertices_out,
74 })
75 }
76 MESA_SHADER_TESS_CTRL => {
77 let info_tess = unsafe { &nir.info.__bindgen_anon_1.tess };
78 ShaderStageInfo::TessellationInit(TessellationInitShaderInfo {
79 per_patch_attribute_count: 6,
80 threads_per_patch: info_tess.tcs_vertices_out,
81 })
82 }
83 MESA_SHADER_TESS_EVAL => {
84 let info_tess = unsafe { &nir.info.__bindgen_anon_1.tess };
85 ShaderStageInfo::Tessellation(TessellationShaderInfo {
86 domain: match info_tess._primitive_mode {
87 TESS_PRIMITIVE_TRIANGLES => {
88 TessellationDomain::Triangle
89 }
90 TESS_PRIMITIVE_QUADS => TessellationDomain::Quad,
91 TESS_PRIMITIVE_ISOLINES => TessellationDomain::Isoline,
92 _ => panic!("Invalid tess_primitive_mode"),
93 },
94 spacing: match info_tess.spacing() {
95 TESS_SPACING_EQUAL => TessellationSpacing::Integer,
96 TESS_SPACING_FRACTIONAL_ODD => {
97 TessellationSpacing::FractionalOdd
98 }
99 TESS_SPACING_FRACTIONAL_EVEN => {
100 TessellationSpacing::FractionalEven
101 }
102 _ => panic!("Invalid gl_tess_spacing"),
103 },
104 primitives: if info_tess.point_mode() {
105 TessellationPrimitives::Points
106 } else if info_tess._primitive_mode
107 == TESS_PRIMITIVE_ISOLINES
108 {
109 TessellationPrimitives::Lines
110 } else if info_tess.ccw() {
111 TessellationPrimitives::TrianglesCCW
112 } else {
113 TessellationPrimitives::TrianglesCW
114 },
115 })
116 }
117 _ => panic!("Unknown shader stage"),
118 },
119 io: match nir.info.stage() {
120 MESA_SHADER_COMPUTE => ShaderIoInfo::None,
121 MESA_SHADER_FRAGMENT => ShaderIoInfo::Fragment(FragmentIoInfo {
122 sysvals_in: SysValInfo {
123 // Required on fragment shaders, otherwise it causes a trap.
124 ab: 1 << 31,
125 c: 0,
126 },
127 sysvals_in_d: [PixelImap::Unused; 8],
128 attr_in: [PixelImap::Unused; 128],
129 barycentric_attr_in: [0; 4],
130 reads_sample_mask: false,
131 writes_color: 0,
132 writes_sample_mask: false,
133 writes_depth: false,
134 }),
135 MESA_SHADER_VERTEX
136 | MESA_SHADER_GEOMETRY
137 | MESA_SHADER_TESS_CTRL
138 | MESA_SHADER_TESS_EVAL => {
139 let num_clip = nir.info.clip_distance_array_size();
140 let num_cull = nir.info.cull_distance_array_size();
141 let clip_enable = (1_u32 << num_clip) - 1;
142 let cull_enable = ((1_u32 << num_cull) - 1) << num_clip;
143
144 ShaderIoInfo::Vtg(VtgIoInfo {
145 sysvals_in: SysValInfo::default(),
146 sysvals_in_d: 0,
147 sysvals_out: SysValInfo::default(),
148 sysvals_out_d: 0,
149 attr_in: [0; 4],
150 attr_out: [0; 4],
151
152 // TODO: figure out how to fill this.
153 store_req_start: u8::MAX,
154 store_req_end: 0,
155
156 clip_enable: clip_enable.try_into().unwrap(),
157 cull_enable: cull_enable.try_into().unwrap(),
158 xfb: if nir.xfb_info.is_null() {
159 None
160 } else {
161 Some(Box::new(unsafe {
162 nak_xfb_from_nir(nak, nir.xfb_info)
163 }))
164 },
165 })
166 }
167 _ => panic!("Unknown shader stage"),
168 },
169 }
170 }
171
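// Allocates the SSA values backing a NIR def: one predicate per component for
// 1-bit booleans, otherwise one 32-bit GPR per dword of packed data (so a
// 16-bit vec4 takes two GPRs and a 64-bit scalar takes two).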
172 fn alloc_ssa_for_nir(b: &mut impl SSABuilder, ssa: &nir_def) -> Vec<SSAValue> {
173 let (file, comps) = if ssa.bit_size == 1 {
174 (RegFile::Pred, ssa.num_components)
175 } else {
176 let bits = ssa.bit_size * ssa.num_components;
177 (RegFile::GPR, bits.div_ceil(32))
178 };
179
180 let mut vec = Vec::new();
181 for _ in 0..comps {
182 vec.push(b.alloc_ssa(file, 1)[0]);
183 }
184 vec
185 }
186
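// Maps a (NIR phi def index, component) pair to a phi ID so that the phi
// sources emitted in every predecessor and the phi destination in the merge
// block all agree on the same ID.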
187 struct PhiAllocMap<'a> {
188 alloc: &'a mut PhiAllocator,
189 map: HashMap<(u32, u8), u32>,
190 }
191
192 impl<'a> PhiAllocMap<'a> {
193 fn new(alloc: &'a mut PhiAllocator) -> PhiAllocMap<'a> {
194 PhiAllocMap {
195 alloc: alloc,
196 map: HashMap::new(),
197 }
198 }
199
200 fn get_phi_id(&mut self, phi: &nir_phi_instr, comp: u8) -> u32 {
201 *self
202 .map
203 .entry((phi.def.index, comp))
204 .or_insert_with(|| self.alloc.alloc())
205 }
206 }
207
208 struct PerSizeFloatControls {
209 pub ftz: bool,
210 pub rnd_mode: FRndMode,
211 }
212
213 struct ShaderFloatControls {
214 pub fp16: PerSizeFloatControls,
215 pub fp32: PerSizeFloatControls,
216 pub fp64: PerSizeFloatControls,
217 }
218
219 impl Default for ShaderFloatControls {
220 fn default() -> Self {
221 Self {
222 fp16: PerSizeFloatControls {
223 ftz: false,
224 rnd_mode: FRndMode::NearestEven,
225 },
226 fp32: PerSizeFloatControls {
227 ftz: true, // Default FTZ on fp32
228 rnd_mode: FRndMode::NearestEven,
229 },
230 fp64: PerSizeFloatControls {
231 ftz: false,
232 rnd_mode: FRndMode::NearestEven,
233 },
234 }
235 }
236 }
237
238 impl ShaderFloatControls {
239 fn from_nir(nir: &nir_shader) -> ShaderFloatControls {
240 let nir_fc = nir.info.float_controls_execution_mode;
241 let mut fc: ShaderFloatControls = Default::default();
242
243 if (nir_fc & FLOAT_CONTROLS_DENORM_PRESERVE_FP16) != 0 {
244 fc.fp16.ftz = false;
245 } else if (nir_fc & FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP16) != 0 {
246 fc.fp16.ftz = true;
247 }
248 if (nir_fc & FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP16) != 0 {
249 fc.fp16.rnd_mode = FRndMode::NearestEven;
250 } else if (nir_fc & FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16) != 0 {
251 fc.fp16.rnd_mode = FRndMode::Zero;
252 }
253
254 if (nir_fc & FLOAT_CONTROLS_DENORM_PRESERVE_FP32) != 0 {
255 fc.fp32.ftz = false;
256 } else if (nir_fc & FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP32) != 0 {
257 fc.fp32.ftz = true;
258 }
259 if (nir_fc & FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP32) != 0 {
260 fc.fp32.rnd_mode = FRndMode::NearestEven;
261 } else if (nir_fc & FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP32) != 0 {
262 fc.fp32.rnd_mode = FRndMode::Zero;
263 }
264
265 if (nir_fc & FLOAT_CONTROLS_DENORM_PRESERVE_FP64) != 0 {
266 fc.fp64.ftz = false;
267 } else if (nir_fc & FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP64) != 0 {
268 fc.fp64.ftz = true;
269 }
270 if (nir_fc & FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP64) != 0 {
271 fc.fp64.rnd_mode = FRndMode::NearestEven;
272 } else if (nir_fc & FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64) != 0 {
273 fc.fp64.rnd_mode = FRndMode::Zero;
274 }
275
276 fc
277 }
278 }
279
280 impl Index<FloatType> for ShaderFloatControls {
281 type Output = PerSizeFloatControls;
282
283 fn index(&self, idx: FloatType) -> &PerSizeFloatControls {
284 match idx {
285 FloatType::F16 => &self.fp16,
286 FloatType::F32 => &self.fp32,
287 FloatType::F64 => &self.fp64,
288 }
289 }
290 }
291
292 #[derive(Clone, Copy, Eq, Hash, PartialEq)]
293 enum SyncType {
294 Sync,
295 Brk,
296 Cont,
297 }
298
299 struct ShaderFromNir<'a> {
300 nir: &'a nir_shader,
301 sm: &'a dyn ShaderModel,
302 info: ShaderInfo,
303 float_ctl: ShaderFloatControls,
304 cfg: CFGBuilder<u32, BasicBlock>,
305 label_alloc: LabelAllocator,
306 block_label: HashMap<u32, Label>,
307 bar_label: HashMap<u32, Label>,
308 sync_blocks: HashSet<u32>,
309 crs: Vec<(u32, SyncType)>,
310 fs_out_regs: [SSAValue; 34],
311 end_block_id: u32,
312 ssa_map: HashMap<u32, Vec<SSAValue>>,
313 saturated: HashSet<*const nir_def>,
314 nir_instr_printer: NirInstrPrinter,
315 }
316
317 impl<'a> ShaderFromNir<'a> {
318 fn new(
319 nak: &nak_compiler,
320 nir: &'a nir_shader,
321 sm: &'a dyn ShaderModel,
322 ) -> Self {
323 Self {
324 nir: nir,
325 sm: sm,
326 info: init_info_from_nir(nak, nir),
327 float_ctl: ShaderFloatControls::from_nir(nir),
328 cfg: CFGBuilder::new(),
329 label_alloc: LabelAllocator::new(),
330 block_label: HashMap::new(),
331 bar_label: HashMap::new(),
332 sync_blocks: HashSet::new(),
333 crs: Vec::new(),
334 fs_out_regs: [SSAValue::NONE; 34],
335 end_block_id: 0,
336 ssa_map: HashMap::new(),
337 saturated: HashSet::new(),
338 nir_instr_printer: NirInstrPrinter::new().unwrap(),
339 }
340 }
341
342 fn get_block_label(&mut self, block: &nir_block) -> Label {
343 *self
344 .block_label
345 .entry(block.index)
346 .or_insert_with(|| self.label_alloc.alloc())
347 }
348
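// sync_blocks and crs together model the control re-convergence (CRS) stack:
// every block that is the target of a sync/brk/cont gets pushed here, and
// max_crs_depth records the deepest nesting seen so it can be reported in the
// shader info.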
349 fn push_crs(&mut self, target: &nir_block, sync_type: SyncType) {
350 self.sync_blocks.insert(target.index);
351 self.crs.push((target.index, sync_type));
352 let crs_depth = u32::try_from(self.crs.len()).unwrap();
353 self.info.max_crs_depth = max(self.info.max_crs_depth, crs_depth);
354 }
355
356 fn pop_crs(&mut self, target: &nir_block, sync_type: SyncType) {
357 if let Some((top_index, top_sync_type)) = self.crs.pop() {
358 assert!(top_index == target.index);
359 assert!(top_sync_type == sync_type);
360 } else {
361 panic!("Tried to pop an empty stack");
362 }
363 }
364
365 fn peek_crs(&self, target: &nir_block) -> Option<SyncType> {
366 for (i, (index, sync_type)) in self.crs.iter().enumerate().rev() {
367 if *index != target.index {
368 continue;
369 }
370
371 match sync_type {
372 SyncType::Sync => {
373 // Sync must always be top-of-stack
374 assert!(i == self.crs.len() - 1);
375 }
376 SyncType::Brk => {
377 // Brk cannot skip over another Brk
378 for (_, inner_sync) in &self.crs[(i + 1)..] {
379 assert!(*inner_sync != SyncType::Brk);
380 }
381 }
382 SyncType::Cont => {
383 // Cont can only skip over Sync
384 for (_, inner_sync) in &self.crs[(i + 1)..] {
385 assert!(*inner_sync == SyncType::Sync);
386 }
387 }
388 }
389
390 return Some(*sync_type);
391 }
392
393 assert!(!self.sync_blocks.contains(&target.index));
394 None
395 }
396
397 fn get_ssa(&mut self, ssa: &nir_def) -> &[SSAValue] {
398 self.ssa_map.get(&ssa.index).unwrap()
399 }
400
401 fn set_ssa(&mut self, def: &nir_def, vec: Vec<SSAValue>) {
402 if def.bit_size == 1 {
403 for s in &vec {
404 assert!(s.is_predicate());
405 }
406 } else {
407 for s in &vec {
408 assert!(!s.is_predicate());
409 }
410 let bits =
411 usize::from(def.bit_size) * usize::from(def.num_components);
412 assert!(vec.len() == bits.div_ceil(32));
413 }
414 self.ssa_map
415 .entry(def.index)
416 .and_modify(|_| panic!("Cannot set an SSA def twice"))
417 .or_insert(vec);
418 }
419
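// Returns the 32-bit SSA value holding component c of a NIR def together with
// the byte offset of that component within the dword: 8-bit components are
// packed four per GPR and 16-bit components two per GPR, while each 64-bit
// component spans a pair of GPRs.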
420 fn get_ssa_comp(&mut self, def: &nir_def, c: u8) -> (SSARef, u8) {
421 let vec = self.get_ssa(def);
422 match def.bit_size {
423 1 => (vec[usize::from(c)].into(), 0),
424 8 => (vec[usize::from(c / 4)].into(), c % 4),
425 16 => (vec[usize::from(c / 2)].into(), (c * 2) % 4),
426 32 => (vec[usize::from(c)].into(), 0),
427 64 => {
428 let comps =
429 [vec[usize::from(c) * 2 + 0], vec[usize::from(c) * 2 + 1]];
430 (comps.into(), 0)
431 }
432 _ => panic!("Unsupported bit size: {}", def.bit_size),
433 }
434 }
435
436 fn get_ssa_ref(&mut self, src: &nir_src) -> SSARef {
437 SSARef::try_from(self.get_ssa(src.as_def())).unwrap()
438 }
439
440 fn get_src(&mut self, src: &nir_src) -> Src {
441 self.get_ssa_ref(src).into()
442 }
443
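// Splits an I/O address into an SSA base plus an immediate offset. The helper
// nak_get_io_addr_offset() is expected to fold any constant part that fits in
// imm_bits into the returned offset; if the whole address is constant, the
// base comes back as zero.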
444 fn get_io_addr_offset(
445 &mut self,
446 addr: &nir_src,
447 imm_bits: u8,
448 ) -> (Src, i32) {
449 let addr = addr.as_def();
450 let addr_offset = unsafe {
451 nak_get_io_addr_offset(addr as *const _ as *mut _, imm_bits)
452 };
453
454 if let Some(base_def) = std::ptr::NonNull::new(addr_offset.base.def) {
455 let base_def = unsafe { base_def.as_ref() };
456 let base_comp = u8::try_from(addr_offset.base.comp).unwrap();
457 let (base, _) = self.get_ssa_comp(base_def, base_comp);
458 (base.into(), addr_offset.offset)
459 } else {
460 (SrcRef::Zero.into(), addr_offset.offset)
461 }
462 }
463
464 fn get_cbuf_addr_offset(&mut self, addr: &nir_src) -> (Src, u16) {
465 let (off, off_imm) = self.get_io_addr_offset(addr, 16);
466 if let Ok(off_imm_u16) = u16::try_from(off_imm) {
467 (off, off_imm_u16)
468 } else {
469 (self.get_src(addr), 0)
470 }
471 }
472
473 fn set_dst(&mut self, def: &nir_def, ssa: SSARef) {
474 self.set_ssa(def, (*ssa).into());
475 }
476
477 fn try_saturate_alu_dst(&mut self, def: &nir_def) -> bool {
478 if def.all_uses_are_fsat() {
479 self.saturated.insert(def as *const _);
480 true
481 } else {
482 false
483 }
484 }
485
486 fn alu_src_is_saturated(&self, src: &nir_alu_src) -> bool {
487 self.saturated
488 .get(&(src.src.as_def() as *const _))
489 .is_some()
490 }
491
492 fn parse_alu(&mut self, b: &mut impl SSABuilder, alu: &nir_alu_instr) {
493 // Handle vectors and pack ops as a special case since they're the only
494 // ALU ops that can produce more than 16B. They are also the only ALU
495 // ops which we allow to consume small (8 and 16-bit) vector data
496 // scattered across multiple dwords
497 match alu.op {
498 nir_op_mov
499 | nir_op_pack_32_4x8_split
500 | nir_op_pack_32_2x16_split
501 | nir_op_pack_64_2x32_split
502 | nir_op_vec2
503 | nir_op_vec3
504 | nir_op_vec4
505 | nir_op_vec5
506 | nir_op_vec8
507 | nir_op_vec16 => {
508 let src_bit_size = alu.get_src(0).src.bit_size();
509 let bits = usize::from(alu.def.num_components)
510 * usize::from(alu.def.bit_size);
511
512 // Collect the sources into a vec with src_bit_size per SSA
513 // value in the vec. This implicitly makes 64-bit sources look
514 // like two 32-bit values
515 let mut srcs = Vec::new();
516 if alu.op == nir_op_mov {
517 let src = alu.get_src(0);
518 for c in 0..alu.def.num_components {
519 let s = src.swizzle[usize::from(c)];
520 let (src, byte) =
521 self.get_ssa_comp(src.src.as_def(), s);
522 for ssa in src.iter() {
523 srcs.push((*ssa, byte));
524 }
525 }
526 } else {
527 for src in alu.srcs_as_slice().iter() {
528 let s = src.swizzle[0];
529 let (src, byte) =
530 self.get_ssa_comp(src.src.as_def(), s);
531 for ssa in src.iter() {
532 srcs.push((*ssa, byte));
533 }
534 }
535 }
536
537 let mut comps = Vec::new();
538 match src_bit_size {
539 1 | 32 | 64 => {
540 for (ssa, _) in srcs {
541 comps.push(ssa);
542 }
543 }
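// For 8- and 16-bit sources, each destination dword is assembled from the
// scattered source bytes with PRMT, whose selector picks byte indices out of
// the (deduplicated) source registers.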
544 8 => {
545 for dc in 0..bits.div_ceil(32) {
546 let mut psrc = [None; 4];
547 let mut psel = [0_u8; 4];
548
549 for b in 0..4 {
550 let sc = dc * 4 + b;
551 if sc < srcs.len() {
552 let (ssa, byte) = srcs[sc];
553 // Deduplicate psrc entries
554 for i in 0..4_u8 {
555 let psrc_i = &mut psrc[usize::from(i)];
556 if psrc_i.is_none() {
557 *psrc_i = Some(ssa.into());
558 } else if *psrc_i
559 != Some(Src::from(ssa))
560 {
561 continue;
562 }
563 psel[b] = i * 4 + byte;
564 break;
565 }
566 }
567 }
568
569 let psrc = {
570 let mut res = [Src::new_zero(); 4];
571
572 for (idx, src) in psrc.iter().enumerate() {
573 if let Some(src) = src {
574 res[idx] = *src;
575 }
576 }
577
578 res
579 };
580
581 comps.push(b.prmt4(psrc, psel)[0]);
582 }
583 }
584 16 => {
585 for dc in 0..bits.div_ceil(32) {
586 let mut psrc = [Src::new_zero(); 2];
587 let mut psel = [0_u8; 4];
588
589 for w in 0..2 {
590 let sc = dc * 2 + w;
591 if sc < srcs.len() {
592 let (ssa, byte) = srcs[sc];
593 psrc[w] = ssa.into();
594 psel[w * 2 + 0] = (w as u8 * 4) + byte;
595 psel[w * 2 + 1] = (w as u8 * 4) + byte + 1;
596 }
597 }
598 comps.push(b.prmt(psrc[0], psrc[1], psel)[0]);
599 }
600 }
601 _ => panic!("Unknown bit size: {src_bit_size}"),
602 }
603
604 self.set_ssa(&alu.def, comps);
605 return;
606 }
607 _ => (),
608 }
609
610 let nir_srcs = alu.srcs_as_slice();
611 let mut srcs: Vec<Src> = Vec::new();
612 for (i, alu_src) in nir_srcs.iter().enumerate() {
613 let bit_size = alu_src.src.bit_size();
614 let comps = alu.src_components(i.try_into().unwrap());
615 let ssa = self.get_ssa(alu_src.src.as_def());
616
617 match bit_size {
618 1 => {
619 assert!(comps == 1);
620 let s = usize::from(alu_src.swizzle[0]);
621 srcs.push(ssa[s].into());
622 }
623 8 | 16 => {
624 let num_bytes = usize::from(comps * (bit_size / 8));
625 assert!(num_bytes <= 4);
626
627 let mut bytes = [0_u8; 4];
628 for c in 0..usize::from(comps) {
629 let cs = alu_src.swizzle[c];
630 if bit_size == 8 {
631 bytes[c] = cs;
632 } else {
633 bytes[c * 2 + 0] = cs * 2 + 0;
634 bytes[c * 2 + 1] = cs * 2 + 1;
635 }
636 }
637
638 let mut prmt_srcs = [Src::new_zero(); 4];
639 let mut prmt = [0_u8; 4];
640 for b in 0..num_bytes {
641 for (ds, s) in prmt_srcs.iter_mut().enumerate() {
642 let dw = ssa[usize::from(bytes[b] / 4)];
643 if s.is_zero() {
644 *s = dw.into();
645 } else if *s != Src::from(dw) {
646 continue;
647 }
648 prmt[usize::from(b)] =
649 (ds as u8) * 4 + (bytes[b] % 4);
650 break;
651 }
652 }
653
654 srcs.push(b.prmt4(prmt_srcs, prmt).into());
655 }
656 32 => {
657 assert!(comps == 1);
658 let s = usize::from(alu_src.swizzle[0]);
659 srcs.push(ssa[s].into());
660 }
661 64 => {
662 assert!(comps == 1);
663 let s = usize::from(alu_src.swizzle[0]);
664 srcs.push([ssa[s * 2], ssa[s * 2 + 1]].into());
665 }
666 _ => panic!("Invalid bit size: {bit_size}"),
667 }
668 }
669
670 // Restricts an F16v2 source to just x if the ALU op is single-component. This
671 // must only be called for per-component sources (see nir_op_info::output_sizes
672 // for more details).
673 let restrict_f16v2_src = |mut src: Src| {
674 if alu.def.num_components == 1 {
675 src.src_swizzle = SrcSwizzle::Xx;
676 }
677 src
678 };
679
680 let dst: SSARef = match alu.op {
681 nir_op_b2b1 => {
682 assert!(alu.get_src(0).bit_size() == 32);
683 b.isetp(IntCmpType::I32, IntCmpOp::Ne, srcs[0], 0.into())
684 }
685 nir_op_b2b32 | nir_op_b2i8 | nir_op_b2i16 | nir_op_b2i32 => {
686 b.sel(srcs[0].bnot(), 0.into(), 1.into())
687 }
688 nir_op_b2i64 => {
689 let lo = b.sel(srcs[0].bnot(), 0.into(), 1.into());
690 let hi = b.copy(0.into());
691 [lo[0], hi[0]].into()
692 }
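// 0x3c00 is 1.0 encoded as fp16; below, 0x3ff00000 is the high dword of 1.0
// encoded as fp64.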
693 nir_op_b2f16 => b.sel(srcs[0].bnot(), 0.into(), 0x3c00.into()),
694 nir_op_b2f32 => {
695 b.sel(srcs[0].bnot(), 0.0_f32.into(), 1.0_f32.into())
696 }
697 nir_op_b2f64 => {
698 let lo = b.copy(0.into());
699 let hi = b.sel(srcs[0].bnot(), 0.into(), 0x3ff00000.into());
700 [lo[0], hi[0]].into()
701 }
702 nir_op_bcsel => b.sel(srcs[0], srcs[1], srcs[2]),
703 nir_op_bfm => {
704 let dst = b.alloc_ssa(RegFile::GPR, 1);
705 b.push_op(OpBMsk {
706 dst: dst.into(),
707 pos: srcs[1],
708 width: srcs[0],
709 wrap: true,
710 });
711 dst
712 }
713 nir_op_bit_count => {
714 let dst = b.alloc_ssa(RegFile::GPR, 1);
715 b.push_op(OpPopC {
716 dst: dst.into(),
717 src: srcs[0],
718 });
719 dst
720 }
721 nir_op_bitfield_reverse => b.brev(srcs[0]),
722 nir_op_ibitfield_extract | nir_op_ubitfield_extract => {
723 let range = b.alloc_ssa(RegFile::GPR, 1);
724 b.push_op(OpPrmt {
725 dst: range.into(),
726 srcs: [srcs[1], srcs[2]],
727 sel: 0x0040.into(),
728 mode: PrmtMode::Index,
729 });
730
731 let dst = b.alloc_ssa(RegFile::GPR, 1);
732 b.push_op(OpBfe {
733 dst: dst.into(),
734 base: srcs[0],
735 signed: !matches!(alu.op, nir_op_ubitfield_extract),
736 range: range.into(),
737 reverse: false,
738 });
739 dst
740 }
741 nir_op_extract_u8 | nir_op_extract_i8 | nir_op_extract_u16
742 | nir_op_extract_i16 => {
743 let src1 = alu.get_src(1);
744 let elem = src1.src.comp_as_uint(src1.swizzle[0]).unwrap();
745 let elem = u8::try_from(elem).unwrap();
746
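// These extracts are implemented with PRMT: selector values 0-3 pick a byte
// from the source, 4 picks a byte of the zero second operand, and setting
// bit 3 of a selector (| 0x8) replicates the selected byte's most-significant
// bit, which gives the sign extension for the signed variants.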
747 match alu.op {
748 nir_op_extract_u8 => {
749 assert!(elem < 4);
750 let byte = elem;
751 let zero = 4;
752 b.prmt(srcs[0], 0.into(), [byte, zero, zero, zero])
753 }
754 nir_op_extract_i8 => {
755 assert!(elem < 4);
756 let byte = elem;
757 let sign = byte | 0x8;
758 b.prmt(srcs[0], 0.into(), [byte, sign, sign, sign])
759 }
760 nir_op_extract_u16 => {
761 assert!(elem < 2);
762 let byte = elem * 2;
763 let zero = 4;
764 b.prmt(srcs[0], 0.into(), [byte, byte + 1, zero, zero])
765 }
766 nir_op_extract_i16 => {
767 assert!(elem < 2);
768 let byte = elem * 2;
769 let sign = (byte + 1) | 0x8;
770 b.prmt(srcs[0], 0.into(), [byte, byte + 1, sign, sign])
771 }
772 _ => panic!("Unknown extract op: {}", alu.op),
773 }
774 }
775 nir_op_f2f16 | nir_op_f2f16_rtne | nir_op_f2f16_rtz
776 | nir_op_f2f32 | nir_op_f2f64 => {
777 let src_bits = alu.get_src(0).src.bit_size();
778 let dst_bits = alu.def.bit_size();
779 let src_type = FloatType::from_bits(src_bits.into());
780 let dst_type = FloatType::from_bits(dst_bits.into());
781
782 let dst = b.alloc_ssa(RegFile::GPR, dst_bits.div_ceil(32));
783 b.push_op(OpF2F {
784 dst: dst.into(),
785 src: srcs[0],
786 src_type: FloatType::from_bits(src_bits.into()),
787 dst_type: dst_type,
788 rnd_mode: match alu.op {
789 nir_op_f2f16_rtne => FRndMode::NearestEven,
790 nir_op_f2f16_rtz => FRndMode::Zero,
791 _ => self.float_ctl[dst_type].rnd_mode,
792 },
793 ftz: if src_bits < dst_bits {
794 self.float_ctl[src_type].ftz
795 } else {
796 self.float_ctl[dst_type].ftz
797 },
798 high: false,
799 integer_rnd: false,
800 });
801 dst
802 }
803 nir_op_find_lsb => {
804 let rev = b.brev(srcs[0]);
805 let dst = b.alloc_ssa(RegFile::GPR, 1);
806 b.push_op(OpFlo {
807 dst: dst.into(),
808 src: rev.into(),
809 signed: false,
810 return_shift_amount: true,
811 });
812 dst
813 }
814 nir_op_f2i8 | nir_op_f2i16 | nir_op_f2i32 | nir_op_f2i64
815 | nir_op_f2u8 | nir_op_f2u16 | nir_op_f2u32 | nir_op_f2u64 => {
816 let src_bits = usize::from(alu.get_src(0).bit_size());
817 let dst_bits = alu.def.bit_size();
818 let src_type = FloatType::from_bits(src_bits);
819 let dst = b.alloc_ssa(RegFile::GPR, dst_bits.div_ceil(32));
820 let dst_is_signed = alu.info().output_type & 2 != 0;
821 let dst_type =
822 IntType::from_bits(dst_bits.into(), dst_is_signed);
823 if b.sm() < 70 && dst_bits == 8 {
824 // F2I doesn't support 8-bit destinations pre-Volta
825 let tmp = b.alloc_ssa(RegFile::GPR, 1);
826 let tmp_type = IntType::from_bits(32, dst_is_signed);
827 b.push_op(OpF2I {
828 dst: tmp.into(),
829 src: srcs[0],
830 src_type,
831 dst_type: tmp_type,
832 rnd_mode: FRndMode::Zero,
833 ftz: self.float_ctl[src_type].ftz,
834 });
835 b.push_op(OpI2I {
836 dst: dst.into(),
837 src: tmp.into(),
838 src_type: tmp_type,
839 dst_type,
840 saturate: true,
841 abs: false,
842 neg: false,
843 });
844 } else {
845 b.push_op(OpF2I {
846 dst: dst.into(),
847 src: srcs[0],
848 src_type,
849 dst_type,
850 rnd_mode: FRndMode::Zero,
851 ftz: self.float_ctl[src_type].ftz,
852 });
853 }
854 dst
855 }
856 nir_op_fabs | nir_op_fadd | nir_op_fneg => {
857 let (x, y) = match alu.op {
858 nir_op_fabs => (Src::new_zero().fneg(), srcs[0].fabs()),
859 nir_op_fadd => (srcs[0], srcs[1]),
860 nir_op_fneg => (Src::new_zero().fneg(), srcs[0].fneg()),
861 _ => panic!("Unhandled case"),
862 };
863 let ftype = FloatType::from_bits(alu.def.bit_size().into());
864 let dst;
865 if alu.def.bit_size() == 64 {
866 dst = b.alloc_ssa(RegFile::GPR, 2);
867 b.push_op(OpDAdd {
868 dst: dst.into(),
869 srcs: [x, y],
870 rnd_mode: self.float_ctl[ftype].rnd_mode,
871 });
872 } else if alu.def.bit_size() == 32 {
873 dst = b.alloc_ssa(RegFile::GPR, 1);
874 b.push_op(OpFAdd {
875 dst: dst.into(),
876 srcs: [x, y],
877 saturate: self.try_saturate_alu_dst(&alu.def),
878 rnd_mode: self.float_ctl[ftype].rnd_mode,
879 ftz: self.float_ctl[ftype].ftz,
880 });
881 } else if alu.def.bit_size() == 16 {
882 assert!(
883 self.float_ctl[ftype].rnd_mode == FRndMode::NearestEven
884 );
885
886 dst = b.alloc_ssa(RegFile::GPR, 1);
887 b.push_op(OpHAdd2 {
888 dst: dst.into(),
889 srcs: [restrict_f16v2_src(x), restrict_f16v2_src(y)],
890 saturate: self.try_saturate_alu_dst(&alu.def),
891 ftz: self.float_ctl[ftype].ftz,
892 f32: false,
893 });
894 } else {
895 panic!("Unsupported float type: f{}", alu.def.bit_size());
896 }
897 dst
898 }
899 nir_op_fceil | nir_op_ffloor | nir_op_fround_even
900 | nir_op_ftrunc => {
901 let dst = b.alloc_ssa(RegFile::GPR, 1);
902 let ty = FloatType::from_bits(alu.def.bit_size().into());
903 let rnd_mode = match alu.op {
904 nir_op_fceil => FRndMode::PosInf,
905 nir_op_ffloor => FRndMode::NegInf,
906 nir_op_ftrunc => FRndMode::Zero,
907 nir_op_fround_even => FRndMode::NearestEven,
908 _ => unreachable!(),
909 };
910 let ftz = self.float_ctl[ty].ftz;
911 if b.sm() >= 70 {
912 assert!(
913 alu.def.bit_size() == 32 || alu.def.bit_size() == 16
914 );
915 b.push_op(OpFRnd {
916 dst: dst.into(),
917 src: srcs[0],
918 src_type: ty,
919 dst_type: ty,
920 rnd_mode,
921 ftz,
922 });
923 } else {
924 assert!(alu.def.bit_size() == 32);
925 b.push_op(OpF2F {
926 dst: dst.into(),
927 src: srcs[0],
928 src_type: ty,
929 dst_type: ty,
930 rnd_mode,
931 ftz,
932 integer_rnd: true,
933 high: false,
934 });
935 }
936 dst
937 }
938 nir_op_fcos => b.fcos(srcs[0]),
939 nir_op_feq | nir_op_fge | nir_op_flt | nir_op_fneu => {
940 let src_type =
941 FloatType::from_bits(alu.get_src(0).bit_size().into());
942 let cmp_op = match alu.op {
943 nir_op_feq => FloatCmpOp::OrdEq,
944 nir_op_fge => FloatCmpOp::OrdGe,
945 nir_op_flt => FloatCmpOp::OrdLt,
946 nir_op_fneu => FloatCmpOp::UnordNe,
947 _ => panic!("Usupported float comparison"),
948 };
949
950 let dst = b.alloc_ssa(RegFile::Pred, alu.def.num_components);
951 if alu.get_src(0).bit_size() == 64 {
952 assert!(alu.def.num_components == 1);
953 b.push_op(OpDSetP {
954 dst: dst.into(),
955 set_op: PredSetOp::And,
956 cmp_op: cmp_op,
957 srcs: [srcs[0], srcs[1]],
958 accum: SrcRef::True.into(),
959 });
960 } else if alu.get_src(0).bit_size() == 32 {
961 assert!(alu.def.num_components == 1);
962 b.push_op(OpFSetP {
963 dst: dst.into(),
964 set_op: PredSetOp::And,
965 cmp_op: cmp_op,
966 srcs: [srcs[0], srcs[1]],
967 accum: SrcRef::True.into(),
968 ftz: self.float_ctl[src_type].ftz,
969 });
970 } else if alu.get_src(0).bit_size() == 16 {
971 assert!(
972 alu.def.num_components == 1
973 || alu.def.num_components == 2
974 );
975
976 let dsts = if alu.def.num_components == 2 {
977 [dst[0].into(), dst[1].into()]
978 } else {
979 [dst[0].into(), Dst::None]
980 };
981
982 b.push_op(OpHSetP2 {
983 dsts,
984 set_op: PredSetOp::And,
985 cmp_op: cmp_op,
986 srcs: [
987 restrict_f16v2_src(srcs[0]),
988 restrict_f16v2_src(srcs[1]),
989 ],
990 accum: SrcRef::True.into(),
991 ftz: self.float_ctl[src_type].ftz,
992 horizontal: false,
993 });
994 } else {
995 panic!(
996 "Unsupported float type: f{}",
997 alu.get_src(0).bit_size()
998 );
999 }
1000 dst
1001 }
1002 nir_op_fexp2 => b.fexp2(srcs[0]),
1003 nir_op_ffma => {
1004 let ftype = FloatType::from_bits(alu.def.bit_size().into());
1005 let dst;
1006 if alu.def.bit_size() == 64 {
1007 debug_assert!(!self.float_ctl[ftype].ftz);
1008 dst = b.alloc_ssa(RegFile::GPR, 2);
1009 b.push_op(OpDFma {
1010 dst: dst.into(),
1011 srcs: [srcs[0], srcs[1], srcs[2]],
1012 rnd_mode: self.float_ctl[ftype].rnd_mode,
1013 });
1014 } else if alu.def.bit_size() == 32 {
1015 dst = b.alloc_ssa(RegFile::GPR, 1);
1016 b.push_op(OpFFma {
1017 dst: dst.into(),
1018 srcs: [srcs[0], srcs[1], srcs[2]],
1019 saturate: self.try_saturate_alu_dst(&alu.def),
1020 rnd_mode: self.float_ctl[ftype].rnd_mode,
1021 // The hardware doesn't like FTZ+DNZ and DNZ implies FTZ
1022 // anyway so only set one of the two bits.
1023 ftz: self.float_ctl[ftype].ftz,
1024 dnz: false,
1025 });
1026 } else if alu.def.bit_size() == 16 {
1027 assert!(
1028 self.float_ctl[ftype].rnd_mode == FRndMode::NearestEven
1029 );
1030
1031 dst = b.alloc_ssa(RegFile::GPR, 1);
1032 b.push_op(OpHFma2 {
1033 dst: dst.into(),
1034 srcs: [
1035 restrict_f16v2_src(srcs[0]),
1036 restrict_f16v2_src(srcs[1]),
1037 restrict_f16v2_src(srcs[2]),
1038 ],
1039 saturate: self.try_saturate_alu_dst(&alu.def),
1040 ftz: self.float_ctl[ftype].ftz,
1041 dnz: false,
1042 f32: false,
1043 });
1044 } else {
1045 panic!("Unsupported float type: f{}", alu.def.bit_size());
1046 }
1047 dst
1048 }
1049 nir_op_ffmaz => {
1050 assert!(alu.def.bit_size() == 32);
1051 // DNZ implies FTZ so we need FTZ set or this is invalid
1052 assert!(self.float_ctl.fp32.ftz);
1053 let dst = b.alloc_ssa(RegFile::GPR, 1);
1054 b.push_op(OpFFma {
1055 dst: dst.into(),
1056 srcs: [srcs[0], srcs[1], srcs[2]],
1057 saturate: self.try_saturate_alu_dst(&alu.def),
1058 rnd_mode: self.float_ctl.fp32.rnd_mode,
1059 // The hardware doesn't like FTZ+DNZ and DNZ implies FTZ
1060 // anyway so only set one of the two bits.
1061 ftz: false,
1062 dnz: true,
1063 });
1064 dst
1065 }
1066 nir_op_flog2 => {
1067 assert!(alu.def.bit_size() == 32);
1068 b.mufu(MuFuOp::Log2, srcs[0])
1069 }
1070 nir_op_fmax | nir_op_fmin => {
1071 let dst;
1072 if alu.def.bit_size() == 64 {
1073 dst = b.alloc_ssa(RegFile::GPR, 2);
1074 b.push_op(OpDMnMx {
1075 dst: dst.into(),
1076 srcs: [srcs[0], srcs[1]],
1077 min: (alu.op == nir_op_fmin).into(),
1078 });
1079 } else if alu.def.bit_size() == 32 {
1080 dst = b.alloc_ssa(RegFile::GPR, 1);
1081 b.push_op(OpFMnMx {
1082 dst: dst.into(),
1083 srcs: [srcs[0], srcs[1]],
1084 min: (alu.op == nir_op_fmin).into(),
1085 ftz: self.float_ctl.fp32.ftz,
1086 });
1087 } else if alu.def.bit_size() == 16 {
1088 dst = b.alloc_ssa(RegFile::GPR, 1);
1089 b.push_op(OpHMnMx2 {
1090 dst: dst.into(),
1091 srcs: [
1092 restrict_f16v2_src(srcs[0]),
1093 restrict_f16v2_src(srcs[1]),
1094 ],
1095 min: (alu.op == nir_op_fmin).into(),
1096 ftz: self.float_ctl.fp16.ftz,
1097 });
1098 } else {
1099 panic!("Unsupported float type: f{}", alu.def.bit_size());
1100 }
1101 dst
1102 }
1103 nir_op_fmul => {
1104 let ftype = FloatType::from_bits(alu.def.bit_size().into());
1105 let dst;
1106 if alu.def.bit_size() == 64 {
1107 debug_assert!(!self.float_ctl[ftype].ftz);
1108 dst = b.alloc_ssa(RegFile::GPR, 2);
1109 b.push_op(OpDMul {
1110 dst: dst.into(),
1111 srcs: [srcs[0], srcs[1]],
1112 rnd_mode: self.float_ctl[ftype].rnd_mode,
1113 });
1114 } else if alu.def.bit_size() == 32 {
1115 dst = b.alloc_ssa(RegFile::GPR, 1);
1116 b.push_op(OpFMul {
1117 dst: dst.into(),
1118 srcs: [srcs[0], srcs[1]],
1119 saturate: self.try_saturate_alu_dst(&alu.def),
1120 rnd_mode: self.float_ctl[ftype].rnd_mode,
1121 ftz: self.float_ctl[ftype].ftz,
1122 dnz: false,
1123 });
1124 } else if alu.def.bit_size() == 16 {
1125 assert!(
1126 self.float_ctl[ftype].rnd_mode == FRndMode::NearestEven
1127 );
1128
1129 dst = b.alloc_ssa(RegFile::GPR, 1);
1130 b.push_op(OpHMul2 {
1131 dst: dst.into(),
1132 srcs: [
1133 restrict_f16v2_src(srcs[0]),
1134 restrict_f16v2_src(srcs[1]),
1135 ],
1136 saturate: self.try_saturate_alu_dst(&alu.def),
1137 ftz: self.float_ctl[ftype].ftz,
1138 dnz: false,
1139 });
1140 } else {
1141 panic!("Unsupported float type: f{}", alu.def.bit_size());
1142 }
1143 dst
1144 }
1145 nir_op_fmulz => {
1146 assert!(alu.def.bit_size() == 32);
1147 // DNZ implies FTZ so we need FTZ set or this is invalid
1148 assert!(self.float_ctl.fp32.ftz);
1149 let dst = b.alloc_ssa(RegFile::GPR, 1);
1150 b.push_op(OpFMul {
1151 dst: dst.into(),
1152 srcs: [srcs[0], srcs[1]],
1153 saturate: self.try_saturate_alu_dst(&alu.def),
1154 rnd_mode: self.float_ctl.fp32.rnd_mode,
1155 // The hardware doesn't like FTZ+DNZ and DNZ implies FTZ
1156 // anyway so only set one of the two bits.
1157 ftz: false,
1158 dnz: true,
1159 });
1160 dst
1161 }
1162 nir_op_fquantize2f16 => {
1163 let tmp = b.alloc_ssa(RegFile::GPR, 1);
1164 b.push_op(OpF2F {
1165 dst: tmp.into(),
1166 src: srcs[0],
1167 src_type: FloatType::F32,
1168 dst_type: FloatType::F16,
1169 rnd_mode: FRndMode::NearestEven,
1170 ftz: true,
1171 high: false,
1172 integer_rnd: false,
1173 });
1174 assert!(alu.def.bit_size() == 32);
1175 let dst = b.alloc_ssa(RegFile::GPR, 1);
1176 b.push_op(OpF2F {
1177 dst: dst.into(),
1178 src: tmp.into(),
1179 src_type: FloatType::F16,
1180 dst_type: FloatType::F32,
1181 rnd_mode: FRndMode::NearestEven,
1182 ftz: true,
1183 high: false,
1184 integer_rnd: false,
1185 });
1186 if b.sm() < 70 {
1187 // Pre-Volta, F2F.ftz doesn't flush denorms so we need to do
1188 // that manually
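// 0x38800000 is 2^-14 (the smallest normal fp16 value) as an f32 bit
// pattern, so anything smaller in magnitude becomes a denorm after conversion.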
1189 let denorm = b.fsetp(
1190 FloatCmpOp::OrdLt,
1191 srcs[0].fabs(),
1192 0x38800000.into(),
1193 );
1194 // Get the correctly signed zero
1195 let zero =
1196 b.lop2(LogicOp2::And, srcs[0], 0x80000000.into());
1197 b.sel(denorm.into(), zero.into(), dst.into())
1198 } else {
1199 dst
1200 }
1201 }
1202 nir_op_frcp => {
1203 assert!(alu.def.bit_size() == 32);
1204 b.mufu(MuFuOp::Rcp, srcs[0])
1205 }
1206 nir_op_frsq => {
1207 assert!(alu.def.bit_size() == 32);
1208 b.mufu(MuFuOp::Rsq, srcs[0])
1209 }
1210 nir_op_fsat => {
1211 let ftype = FloatType::from_bits(alu.def.bit_size().into());
1212
1213 if self.alu_src_is_saturated(&alu.srcs_as_slice()[0]) {
1214 b.copy(srcs[0])
1215 } else if alu.def.bit_size() == 32 {
1216 let dst = b.alloc_ssa(RegFile::GPR, 1);
1217 b.push_op(OpFAdd {
1218 dst: dst.into(),
1219 srcs: [srcs[0], 0.into()],
1220 saturate: true,
1221 rnd_mode: self.float_ctl[ftype].rnd_mode,
1222 ftz: self.float_ctl[ftype].ftz,
1223 });
1224 dst
1225 } else if alu.def.bit_size() == 16 {
1226 assert!(
1227 self.float_ctl[ftype].rnd_mode == FRndMode::NearestEven
1228 );
1229
1230 let dst = b.alloc_ssa(RegFile::GPR, 1);
1231 b.push_op(OpHAdd2 {
1232 dst: dst.into(),
1233 srcs: [restrict_f16v2_src(srcs[0]), 0.into()],
1234 saturate: true,
1235 ftz: self.float_ctl[ftype].ftz,
1236 f32: false,
1237 });
1238 dst
1239 } else {
1240 panic!("Unsupported float type: f{}", alu.def.bit_size());
1241 }
1242 }
1243 nir_op_fsign => {
1244 if alu.def.bit_size() == 64 {
1245 let lz = b.dsetp(FloatCmpOp::OrdLt, srcs[0], 0.into());
1246 let gz = b.dsetp(FloatCmpOp::OrdGt, srcs[0], 0.into());
1247 let hi = b.sel(lz.into(), 0xbff00000.into(), 0.into());
1248 let hi = b.sel(gz.into(), 0x3ff00000.into(), hi.into());
1249 let lo = b.copy(0.into());
1250 [lo[0], hi[0]].into()
1251 } else if alu.def.bit_size() == 32 {
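// fset produces 1.0f for true and 0.0f for false here, so gz - lz computes
// the sign as a float.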
1252 let lz = b.fset(FloatCmpOp::OrdLt, srcs[0], 0.into());
1253 let gz = b.fset(FloatCmpOp::OrdGt, srcs[0], 0.into());
1254 b.fadd(gz.into(), Src::from(lz).fneg())
1255 } else if alu.def.bit_size() == 16 {
1256 let x = restrict_f16v2_src(srcs[0]);
1257
1258 let lz = restrict_f16v2_src(
1259 b.hset2(FloatCmpOp::OrdLt, x, 0.into()).into(),
1260 );
1261 let gz = restrict_f16v2_src(
1262 b.hset2(FloatCmpOp::OrdGt, x, 0.into()).into(),
1263 );
1264
1265 b.hadd2(gz, lz.fneg())
1266 } else {
1267 panic!("Unsupported float type: f{}", alu.def.bit_size());
1268 }
1269 }
1270 nir_op_fsin => b.fsin(srcs[0]),
1271 nir_op_fsqrt => b.mufu(MuFuOp::Sqrt, srcs[0]),
1272 nir_op_i2f16 | nir_op_i2f32 | nir_op_i2f64 => {
1273 let src_bits = alu.get_src(0).src.bit_size();
1274 let dst_bits = alu.def.bit_size();
1275 let dst_type = FloatType::from_bits(dst_bits.into());
1276 let dst = b.alloc_ssa(RegFile::GPR, dst_bits.div_ceil(32));
1277 b.push_op(OpI2F {
1278 dst: dst.into(),
1279 src: srcs[0],
1280 dst_type: dst_type,
1281 src_type: IntType::from_bits(src_bits.into(), true),
1282 rnd_mode: self.float_ctl[dst_type].rnd_mode,
1283 });
1284 dst
1285 }
1286 nir_op_i2i8 | nir_op_i2i16 | nir_op_i2i32 | nir_op_i2i64
1287 | nir_op_u2u8 | nir_op_u2u16 | nir_op_u2u32 | nir_op_u2u64 => {
1288 let src_bits = alu.get_src(0).src.bit_size();
1289 let dst_bits = alu.def.bit_size();
1290
1291 let mut prmt = [0_u8; 8];
1292 match alu.op {
1293 nir_op_i2i8 | nir_op_i2i16 | nir_op_i2i32
1294 | nir_op_i2i64 => {
1295 let sign = ((src_bits / 8) - 1) | 0x8;
1296 for i in 0..8 {
1297 if i < (src_bits / 8) {
1298 prmt[usize::from(i)] = i;
1299 } else {
1300 prmt[usize::from(i)] = sign;
1301 }
1302 }
1303 }
1304 nir_op_u2u8 | nir_op_u2u16 | nir_op_u2u32
1305 | nir_op_u2u64 => {
1306 for i in 0..8 {
1307 if i < (src_bits / 8) {
1308 prmt[usize::from(i)] = i;
1309 } else {
1310 prmt[usize::from(i)] = 4;
1311 }
1312 }
1313 }
1314 _ => panic!("Invalid integer conversion: {}", alu.op),
1315 }
1316 let prmt_lo: [u8; 4] = prmt[0..4].try_into().unwrap();
1317 let prmt_hi: [u8; 4] = prmt[4..8].try_into().unwrap();
1318
1319 let src = srcs[0].as_ssa().unwrap();
1320 if src_bits == 64 {
1321 if dst_bits == 64 {
1322 *src
1323 } else {
1324 b.prmt(src[0].into(), src[1].into(), prmt_lo)
1325 }
1326 } else {
1327 if dst_bits == 64 {
1328 let lo = b.prmt(src[0].into(), 0.into(), prmt_lo);
1329 let hi = b.prmt(src[0].into(), 0.into(), prmt_hi);
1330 [lo[0], hi[0]].into()
1331 } else {
1332 b.prmt(src[0].into(), 0.into(), prmt_lo)
1333 }
1334 }
1335 }
1336 nir_op_iabs => b.iabs(srcs[0]),
1337 nir_op_iadd => match alu.def.bit_size {
1338 32 => b.iadd(srcs[0], srcs[1], 0.into()),
1339 64 => b.iadd64(srcs[0], srcs[1], 0.into()),
1340 x => panic!("unsupported bit size for nir_op_iadd: {x}"),
1341 },
1342 nir_op_iadd3 => match alu.def.bit_size {
1343 32 => b.iadd(srcs[0], srcs[1], srcs[2]),
1344 64 => b.iadd64(srcs[0], srcs[1], srcs[2]),
1345 x => panic!("unsupported bit size for nir_op_iadd3: {x}"),
1346 },
1347 nir_op_iand => b.lop2(LogicOp2::And, srcs[0], srcs[1]),
1348 nir_op_ieq => {
1349 if alu.get_src(0).bit_size() == 1 {
1350 b.lop2(LogicOp2::Xor, srcs[0], srcs[1].bnot())
1351 } else if alu.get_src(0).bit_size() == 64 {
1352 b.isetp64(IntCmpType::I32, IntCmpOp::Eq, srcs[0], srcs[1])
1353 } else {
1354 assert!(alu.get_src(0).bit_size() == 32);
1355 b.isetp(IntCmpType::I32, IntCmpOp::Eq, srcs[0], srcs[1])
1356 }
1357 }
1358 nir_op_ifind_msb | nir_op_ifind_msb_rev | nir_op_ufind_msb
1359 | nir_op_ufind_msb_rev => {
1360 let dst = b.alloc_ssa(RegFile::GPR, 1);
1361 b.push_op(OpFlo {
1362 dst: dst.into(),
1363 src: srcs[0],
1364 signed: match alu.op {
1365 nir_op_ifind_msb | nir_op_ifind_msb_rev => true,
1366 nir_op_ufind_msb | nir_op_ufind_msb_rev => false,
1367 _ => panic!("Not a find_msb op"),
1368 },
1369 return_shift_amount: match alu.op {
1370 nir_op_ifind_msb | nir_op_ufind_msb => false,
1371 nir_op_ifind_msb_rev | nir_op_ufind_msb_rev => true,
1372 _ => panic!("Not a find_msb op"),
1373 },
1374 });
1375 dst
1376 }
1377 nir_op_ige | nir_op_ilt | nir_op_uge | nir_op_ult => {
1378 let x = *srcs[0].as_ssa().unwrap();
1379 let y = *srcs[1].as_ssa().unwrap();
1380 let (cmp_type, cmp_op) = match alu.op {
1381 nir_op_ige => (IntCmpType::I32, IntCmpOp::Ge),
1382 nir_op_ilt => (IntCmpType::I32, IntCmpOp::Lt),
1383 nir_op_uge => (IntCmpType::U32, IntCmpOp::Ge),
1384 nir_op_ult => (IntCmpType::U32, IntCmpOp::Lt),
1385 _ => panic!("Not an integer comparison"),
1386 };
1387 if alu.get_src(0).bit_size() == 64 {
1388 b.isetp64(cmp_type, cmp_op, x.into(), y.into())
1389 } else {
1390 assert!(alu.get_src(0).bit_size() == 32);
1391 b.isetp(cmp_type, cmp_op, x.into(), y.into())
1392 }
1393 }
1394 nir_op_imad => {
1395 assert!(alu.def.bit_size() == 32);
1396 let dst = b.alloc_ssa(RegFile::GPR, 1);
1397 b.push_op(OpIMad {
1398 dst: dst.into(),
1399 srcs: [srcs[0], srcs[1], srcs[2]],
1400 signed: false,
1401 });
1402 dst
1403 }
1404 nir_op_imax | nir_op_imin | nir_op_umax | nir_op_umin => {
1405 let (tp, min) = match alu.op {
1406 nir_op_imax => (IntCmpType::I32, SrcRef::False),
1407 nir_op_imin => (IntCmpType::I32, SrcRef::True),
1408 nir_op_umax => (IntCmpType::U32, SrcRef::False),
1409 nir_op_umin => (IntCmpType::U32, SrcRef::True),
1410 _ => panic!("Not an integer min/max"),
1411 };
1412 assert!(alu.def.bit_size() == 32);
1413 b.imnmx(tp, srcs[0], srcs[1], min.into())
1414 }
1415 nir_op_imul => {
1416 assert!(alu.def.bit_size() == 32);
1417 b.imul(srcs[0], srcs[1])
1418 }
1419 nir_op_imul_2x32_64 | nir_op_umul_2x32_64 => {
1420 let signed = alu.op == nir_op_imul_2x32_64;
1421 b.imul_2x32_64(srcs[0], srcs[1], signed)
1422 }
1423 nir_op_imul_high | nir_op_umul_high => {
1424 let signed = alu.op == nir_op_imul_high;
1425 let dst64 = b.imul_2x32_64(srcs[0], srcs[1], signed);
1426 dst64[1].into()
1427 }
1428 nir_op_ine => {
1429 if alu.get_src(0).bit_size() == 1 {
1430 b.lop2(LogicOp2::Xor, srcs[0], srcs[1])
1431 } else if alu.get_src(0).bit_size() == 64 {
1432 b.isetp64(IntCmpType::I32, IntCmpOp::Ne, srcs[0], srcs[1])
1433 } else {
1434 assert!(alu.get_src(0).bit_size() == 32);
1435 b.isetp(IntCmpType::I32, IntCmpOp::Ne, srcs[0], srcs[1])
1436 }
1437 }
1438 nir_op_ineg => {
1439 if alu.def.bit_size == 64 {
1440 b.ineg64(srcs[0])
1441 } else {
1442 assert!(alu.def.bit_size() == 32);
1443 b.ineg(srcs[0])
1444 }
1445 }
1446 nir_op_inot => {
1447 if alu.def.bit_size() == 1 {
1448 b.lop2(LogicOp2::PassB, true.into(), srcs[0].bnot())
1449 } else {
1450 assert!(alu.def.bit_size() == 32);
1451 b.lop2(LogicOp2::PassB, 0.into(), srcs[0].bnot())
1452 }
1453 }
1454 nir_op_ior => b.lop2(LogicOp2::Or, srcs[0], srcs[1]),
1455 nir_op_ishl => {
1456 if alu.def.bit_size() == 64 {
1457 let shift = if let Some(s) = nir_srcs[1].comp_as_uint(0) {
1458 (s as u32).into()
1459 } else {
1460 srcs[1]
1461 };
1462 b.shl64(srcs[0], shift)
1463 } else {
1464 assert!(alu.def.bit_size() == 32);
1465 b.shl(srcs[0], srcs[1])
1466 }
1467 }
1468 nir_op_ishr => {
1469 if alu.def.bit_size() == 64 {
1470 let shift = if let Some(s) = nir_srcs[1].comp_as_uint(0) {
1471 (s as u32).into()
1472 } else {
1473 srcs[1]
1474 };
1475 b.shr64(srcs[0], shift, true)
1476 } else {
1477 assert!(alu.def.bit_size() == 32);
1478 b.shr(srcs[0], srcs[1], true)
1479 }
1480 }
1481 nir_op_isub => match alu.def.bit_size {
1482 32 => b.iadd(srcs[0], srcs[1].ineg(), 0.into()),
1483 64 => b.iadd64(srcs[0], srcs[1].ineg(), 0.into()),
1484 x => panic!("unsupported bit size for nir_op_isub: {x}"),
1485 },
1486 nir_op_ixor => b.lop2(LogicOp2::Xor, srcs[0], srcs[1]),
1487 nir_op_pack_half_2x16_split | nir_op_pack_half_2x16_rtz_split => {
1488 assert!(alu.get_src(0).bit_size() == 32);
1489
1490 let rnd_mode = match alu.op {
1491 nir_op_pack_half_2x16_split => FRndMode::NearestEven,
1492 nir_op_pack_half_2x16_rtz_split => FRndMode::Zero,
1493 _ => panic!("Unhandled fp16 pack op"),
1494 };
1495
1496 if self.sm.sm() >= 86 {
1497 let result: SSARef = b.alloc_ssa(RegFile::GPR, 1);
1498 b.push_op(OpF2FP {
1499 dst: result.into(),
1500 srcs: [srcs[1], srcs[0]],
1501 rnd_mode: rnd_mode,
1502 });
1503
1504 result
1505 } else {
1506 let low = b.alloc_ssa(RegFile::GPR, 1);
1507 let high = b.alloc_ssa(RegFile::GPR, 1);
1508
1509 b.push_op(OpF2F {
1510 dst: low.into(),
1511 src: srcs[0],
1512 src_type: FloatType::F32,
1513 dst_type: FloatType::F16,
1514 rnd_mode: rnd_mode,
1515 ftz: false,
1516 high: false,
1517 integer_rnd: false,
1518 });
1519
1520 let src_bits = usize::from(alu.get_src(1).bit_size());
1521 let src_type = FloatType::from_bits(src_bits);
1522 assert!(matches!(src_type, FloatType::F32));
1523 b.push_op(OpF2F {
1524 dst: high.into(),
1525 src: srcs[1],
1526 src_type: FloatType::F32,
1527 dst_type: FloatType::F16,
1528 rnd_mode: rnd_mode,
1529 ftz: false,
1530 high: false,
1531 integer_rnd: false,
1532 });
1533
1534 b.prmt(low.into(), high.into(), [0, 1, 4, 5])
1535 }
1536 }
1537 nir_op_prmt_nv => {
1538 let dst = b.alloc_ssa(RegFile::GPR, 1);
1539 b.push_op(OpPrmt {
1540 dst: dst.into(),
1541 srcs: [srcs[1], srcs[2]],
1542 sel: srcs[0],
1543 mode: PrmtMode::Index,
1544 });
1545 dst
1546 }
1547 nir_op_sdot_4x8_iadd => {
1548 let dst = b.alloc_ssa(RegFile::GPR, 1);
1549 b.push_op(OpIDp4 {
1550 dst: dst.into(),
1551 src_types: [IntType::I8, IntType::I8],
1552 srcs: [srcs[0], srcs[1], srcs[2]],
1553 });
1554 dst
1555 }
1556 nir_op_sudot_4x8_iadd => {
1557 let dst = b.alloc_ssa(RegFile::GPR, 1);
1558 b.push_op(OpIDp4 {
1559 dst: dst.into(),
1560 src_types: [IntType::I8, IntType::U8],
1561 srcs: [srcs[0], srcs[1], srcs[2]],
1562 });
1563 dst
1564 }
1565 nir_op_udot_4x8_uadd => {
1566 let dst = b.alloc_ssa(RegFile::GPR, 1);
1567 b.push_op(OpIDp4 {
1568 dst: dst.into(),
1569 src_types: [IntType::U8, IntType::U8],
1570 srcs: [srcs[0], srcs[1], srcs[2]],
1571 });
1572 dst
1573 }
1574 nir_op_u2f16 | nir_op_u2f32 | nir_op_u2f64 => {
1575 let src_bits = alu.get_src(0).src.bit_size();
1576 let dst_bits = alu.def.bit_size();
1577 let dst_type = FloatType::from_bits(dst_bits.into());
1578 let dst = b.alloc_ssa(RegFile::GPR, dst_bits.div_ceil(32));
1579 b.push_op(OpI2F {
1580 dst: dst.into(),
1581 src: srcs[0],
1582 dst_type: dst_type,
1583 src_type: IntType::from_bits(src_bits.into(), false),
1584 rnd_mode: self.float_ctl[dst_type].rnd_mode,
1585 });
1586 dst
1587 }
1588 nir_op_uadd_sat => {
1589 let x = srcs[0].as_ssa().unwrap();
1590 let y = srcs[1].as_ssa().unwrap();
1591 let sum_lo = b.alloc_ssa(RegFile::GPR, 1);
1592 let ovf_lo = b.alloc_ssa(RegFile::Pred, 1);
1593 b.push_op(OpIAdd3 {
1594 dst: sum_lo.into(),
1595 overflow: [ovf_lo.into(), Dst::None],
1596 srcs: [0.into(), x[0].into(), y[0].into()],
1597 });
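// On unsigned overflow the result is clamped to the maximum value; in the
// 64-bit case the carry out of the low dword feeds the high add and the final
// overflow predicate selects u32::MAX for both halves.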
1598 if alu.def.bit_size() == 64 {
1599 let sum_hi = b.alloc_ssa(RegFile::GPR, 1);
1600 let ovf_hi = b.alloc_ssa(RegFile::Pred, 1);
1601 b.push_op(OpIAdd3X {
1602 dst: sum_hi.into(),
1603 overflow: [ovf_hi.into(), Dst::None],
1604 srcs: [0.into(), x[1].into(), y[1].into()],
1605 carry: [ovf_lo.into(), false.into()],
1606 });
1607 let lo =
1608 b.sel(ovf_hi.into(), u32::MAX.into(), sum_lo.into());
1609 let hi =
1610 b.sel(ovf_hi.into(), u32::MAX.into(), sum_hi.into());
1611 [lo[0], hi[0]].into()
1612 } else {
1613 assert!(alu.def.bit_size() == 32);
1614 b.sel(ovf_lo.into(), u32::MAX.into(), sum_lo.into())
1615 }
1616 }
1617 nir_op_usub_sat => {
1618 let x = srcs[0].as_ssa().unwrap();
1619 let y = srcs[1].as_ssa().unwrap();
1620 let sum_lo = b.alloc_ssa(RegFile::GPR, 1);
1621 let ovf_lo = b.alloc_ssa(RegFile::Pred, 1);
1622 // The result of OpIAdd3X is the 33-bit value
1623 //
1624 // s|o = x + !y + 1
1625 //
1626 // The overflow bit of this result is true if and only if the
1627 // subtract did NOT overflow.
1628 b.push_op(OpIAdd3 {
1629 dst: sum_lo.into(),
1630 overflow: [ovf_lo.into(), Dst::None],
1631 srcs: [0.into(), x[0].into(), Src::from(y[0]).ineg()],
1632 });
1633 if alu.def.bit_size() == 64 {
1634 let sum_hi = b.alloc_ssa(RegFile::GPR, 1);
1635 let ovf_hi = b.alloc_ssa(RegFile::Pred, 1);
1636 b.push_op(OpIAdd3X {
1637 dst: sum_hi.into(),
1638 overflow: [ovf_hi.into(), Dst::None],
1639 srcs: [0.into(), x[1].into(), Src::from(y[1]).bnot()],
1640 carry: [ovf_lo.into(), false.into()],
1641 });
1642 let lo = b.sel(ovf_hi.into(), sum_lo.into(), 0.into());
1643 let hi = b.sel(ovf_hi.into(), sum_hi.into(), 0.into());
1644 [lo[0], hi[0]].into()
1645 } else {
1646 assert!(alu.def.bit_size() == 32);
1647 b.sel(ovf_lo.into(), sum_lo.into(), 0.into())
1648 }
1649 }
1650 nir_op_unpack_32_2x16_split_x => {
1651 b.prmt(srcs[0], 0.into(), [0, 1, 4, 4])
1652 }
1653 nir_op_unpack_32_2x16_split_y => {
1654 b.prmt(srcs[0], 0.into(), [2, 3, 4, 4])
1655 }
1656 nir_op_unpack_64_2x32_split_x => {
1657 let src0_x = srcs[0].as_ssa().unwrap()[0];
1658 b.copy(src0_x.into())
1659 }
1660 nir_op_unpack_64_2x32_split_y => {
1661 let src0_y = srcs[0].as_ssa().unwrap()[1];
1662 b.copy(src0_y.into())
1663 }
1664 nir_op_unpack_half_2x16_split_x
1665 | nir_op_unpack_half_2x16_split_y => {
1666 assert!(alu.def.bit_size() == 32);
1667 let dst = b.alloc_ssa(RegFile::GPR, 1);
1668
1669 b.push_op(OpF2F {
1670 dst: dst[0].into(),
1671 src: srcs[0],
1672 src_type: FloatType::F16,
1673 dst_type: FloatType::F32,
1674 rnd_mode: FRndMode::NearestEven,
1675 ftz: false,
1676 high: alu.op == nir_op_unpack_half_2x16_split_y,
1677 integer_rnd: false,
1678 });
1679
1680 dst
1681 }
1682 nir_op_ushr => {
1683 if alu.def.bit_size() == 64 {
1684 let shift = if let Some(s) = nir_srcs[1].comp_as_uint(0) {
1685 (s as u32).into()
1686 } else {
1687 srcs[1]
1688 };
1689 b.shr64(srcs[0], shift, false)
1690 } else {
1691 assert!(alu.def.bit_size() == 32);
1692 b.shr(srcs[0], srcs[1], false)
1693 }
1694 }
1695 _ => panic!("Unsupported ALU instruction: {}", alu.info().name()),
1696 };
1697 self.set_dst(&alu.def, dst);
1698 }
1699
1700 fn parse_tex(&mut self, b: &mut impl SSABuilder, tex: &nir_tex_instr) {
1701 let dim = match tex.sampler_dim {
1702 GLSL_SAMPLER_DIM_1D => {
1703 if tex.is_array {
1704 TexDim::Array1D
1705 } else {
1706 TexDim::_1D
1707 }
1708 }
1709 GLSL_SAMPLER_DIM_2D => {
1710 if tex.is_array {
1711 TexDim::Array2D
1712 } else {
1713 TexDim::_2D
1714 }
1715 }
1716 GLSL_SAMPLER_DIM_3D => {
1717 assert!(!tex.is_array);
1718 TexDim::_3D
1719 }
1720 GLSL_SAMPLER_DIM_CUBE => {
1721 if tex.is_array {
1722 TexDim::ArrayCube
1723 } else {
1724 TexDim::Cube
1725 }
1726 }
1727 GLSL_SAMPLER_DIM_BUF => TexDim::_1D,
1728 GLSL_SAMPLER_DIM_MS => {
1729 if tex.is_array {
1730 TexDim::Array2D
1731 } else {
1732 TexDim::_2D
1733 }
1734 }
1735 _ => panic!("Unsupported texture dimension: {}", tex.sampler_dim),
1736 };
1737
1738 let srcs = tex.srcs_as_slice();
1739 assert!(srcs[0].src_type == nir_tex_src_backend1);
1740 if srcs.len() > 1 {
1741 assert!(srcs.len() == 2);
1742 assert!(srcs[1].src_type == nir_tex_src_backend2);
1743 }
1744
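// tex.backend_flags is assumed to have been packed by NAK's NIR texture
// lowering in the nak_nir_tex_flags bitfield layout, which is what makes this
// reinterpretation of the bits valid.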
1745 let flags: nak_nir_tex_flags =
1746 unsafe { std::mem::transmute_copy(&tex.backend_flags) };
1747
1748 let mask = tex.def.components_read();
1749 let mut mask = u8::try_from(mask).unwrap();
1750 if flags.is_sparse() {
1751 mask &= !(1 << (tex.def.num_components - 1));
1752 if mask == 0 {
1753 // This can happen if only the sparse predicate is used. In
1754 // that case, we need at least one result register.
1755 mask = 1;
1756 }
1757 } else {
1758 debug_assert!(mask != 0);
1759 }
1760
1761 let dst_comps = u8::try_from(mask.count_ones()).unwrap();
1762 let dst = b.alloc_ssa(RegFile::GPR, dst_comps);
1763
1764 // On Volta and later, the destination is split in two
1765 let mut dsts = [Dst::None; 2];
1766 if dst_comps > 2 && b.sm() >= 70 {
1767 dsts[0] = SSARef::try_from(&dst[0..2]).unwrap().into();
1768 dsts[1] = SSARef::try_from(&dst[2..]).unwrap().into();
1769 } else {
1770 dsts[0] = dst.into();
1771 }
1772
1773 let fault = if flags.is_sparse() {
1774 b.alloc_ssa(RegFile::Pred, 1).into()
1775 } else {
1776 Dst::None
1777 };
1778
1779 if tex.op == nir_texop_hdr_dim_nv {
1780 let src = self.get_src(&srcs[0].src);
1781 assert!(fault.is_none());
1782 b.push_op(OpTxq {
1783 dsts: dsts,
1784 src: src,
1785 query: TexQuery::Dimension,
1786 mask: mask,
1787 });
1788 } else if tex.op == nir_texop_tex_type_nv {
1789 let src = self.get_src(&srcs[0].src);
1790 assert!(fault.is_none());
1791 b.push_op(OpTxq {
1792 dsts: dsts,
1793 src: src,
1794 query: TexQuery::TextureType,
1795 mask: mask,
1796 });
1797 } else {
1798 let lod_mode = match flags.lod_mode() {
1799 NAK_NIR_LOD_MODE_AUTO => TexLodMode::Auto,
1800 NAK_NIR_LOD_MODE_ZERO => TexLodMode::Zero,
1801 NAK_NIR_LOD_MODE_BIAS => TexLodMode::Bias,
1802 NAK_NIR_LOD_MODE_LOD => TexLodMode::Lod,
1803 NAK_NIR_LOD_MODE_CLAMP => TexLodMode::Clamp,
1804 NAK_NIR_LOD_MODE_BIAS_CLAMP => TexLodMode::BiasClamp,
1805 _ => panic!("Invalid LOD mode"),
1806 };
1807
1808 let offset_mode = match flags.offset_mode() {
1809 NAK_NIR_OFFSET_MODE_NONE => Tld4OffsetMode::None,
1810 NAK_NIR_OFFSET_MODE_AOFFI => Tld4OffsetMode::AddOffI,
1811 NAK_NIR_OFFSET_MODE_PER_PX => Tld4OffsetMode::PerPx,
1812 _ => panic!("Invalid offset mode"),
1813 };
1814
1815 let srcs = [self.get_src(&srcs[0].src), self.get_src(&srcs[1].src)];
1816
1817 if tex.op == nir_texop_txd {
1818 assert!(lod_mode == TexLodMode::Auto);
1819 assert!(offset_mode != Tld4OffsetMode::PerPx);
1820 assert!(!flags.has_z_cmpr());
1821 b.push_op(OpTxd {
1822 dsts: dsts,
1823 fault,
1824 srcs: srcs,
1825 dim: dim,
1826 offset: offset_mode == Tld4OffsetMode::AddOffI,
1827 mask: mask,
1828 });
1829 } else if tex.op == nir_texop_lod {
1830 assert!(offset_mode == Tld4OffsetMode::None);
1831 b.push_op(OpTmml {
1832 dsts: dsts,
1833 srcs: srcs,
1834 dim: dim,
1835 mask: mask,
1836 });
1837 } else if tex.op == nir_texop_txf || tex.op == nir_texop_txf_ms {
1838 assert!(offset_mode != Tld4OffsetMode::PerPx);
1839 b.push_op(OpTld {
1840 dsts: dsts,
1841 fault,
1842 srcs: srcs,
1843 dim: dim,
1844 lod_mode: lod_mode,
1845 is_ms: tex.op == nir_texop_txf_ms,
1846 offset: offset_mode == Tld4OffsetMode::AddOffI,
1847 mask: mask,
1848 });
1849 } else if tex.op == nir_texop_tg4 {
1850 b.push_op(OpTld4 {
1851 dsts: dsts,
1852 fault,
1853 srcs: srcs,
1854 dim: dim,
1855 comp: tex.component().try_into().unwrap(),
1856 offset_mode: offset_mode,
1857 z_cmpr: flags.has_z_cmpr(),
1858 mask: mask,
1859 });
1860 } else {
1861 assert!(offset_mode != Tld4OffsetMode::PerPx);
1862 b.push_op(OpTex {
1863 dsts: dsts,
1864 fault,
1865 srcs: srcs,
1866 dim: dim,
1867 lod_mode: lod_mode,
1868 z_cmpr: flags.has_z_cmpr(),
1869 offset: offset_mode == Tld4OffsetMode::AddOffI,
1870 mask: mask,
1871 });
1872 }
1873 }
1874
1875 let mut di = 0_usize;
1876 let mut nir_dst = Vec::new();
1877 for i in 0..tex.def.num_components() {
1878 if flags.is_sparse() && i == tex.def.num_components - 1 {
1879 let Dst::SSA(fault) = fault else {
1880 panic!("No fault value for sparse op");
1881 };
1882 nir_dst.push(b.sel(fault.into(), 0.into(), 1.into())[0]);
1883 } else if mask & (1 << i) == 0 {
1884 nir_dst.push(b.copy(0.into())[0]);
1885 } else {
1886 nir_dst.push(dst[di]);
1887 di += 1;
1888 }
1889 }
1890 self.set_ssa(tex.def.as_def(), nir_dst);
1891 }
1892
1893 fn get_atomic_type(&self, intrin: &nir_intrinsic_instr) -> AtomType {
1894 let bit_size = intrin.def.bit_size();
1895 match intrin.atomic_op() {
1896 nir_atomic_op_iadd => AtomType::U(bit_size),
1897 nir_atomic_op_imin => AtomType::I(bit_size),
1898 nir_atomic_op_umin => AtomType::U(bit_size),
1899 nir_atomic_op_imax => AtomType::I(bit_size),
1900 nir_atomic_op_umax => AtomType::U(bit_size),
1901 nir_atomic_op_iand => AtomType::U(bit_size),
1902 nir_atomic_op_ior => AtomType::U(bit_size),
1903 nir_atomic_op_ixor => AtomType::U(bit_size),
1904 nir_atomic_op_xchg => AtomType::U(bit_size),
1905 nir_atomic_op_fadd => AtomType::F(bit_size),
1906 nir_atomic_op_fmin => AtomType::F(bit_size),
1907 nir_atomic_op_fmax => AtomType::F(bit_size),
1908 nir_atomic_op_cmpxchg => AtomType::U(bit_size),
1909 _ => panic!("Unsupported NIR atomic op"),
1910 }
1911 }
1912
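// The atomic opcode and its data type are tracked separately: imin/umin both
// map to AtomOp::Min, with signedness and float-ness carried by the AtomType
// returned above.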
1913 fn get_atomic_op(
1914 &self,
1915 intrin: &nir_intrinsic_instr,
1916 cmp_src: AtomCmpSrc,
1917 ) -> AtomOp {
1918 match intrin.atomic_op() {
1919 nir_atomic_op_iadd => AtomOp::Add,
1920 nir_atomic_op_imin => AtomOp::Min,
1921 nir_atomic_op_umin => AtomOp::Min,
1922 nir_atomic_op_imax => AtomOp::Max,
1923 nir_atomic_op_umax => AtomOp::Max,
1924 nir_atomic_op_iand => AtomOp::And,
1925 nir_atomic_op_ior => AtomOp::Or,
1926 nir_atomic_op_ixor => AtomOp::Xor,
1927 nir_atomic_op_xchg => AtomOp::Exch,
1928 nir_atomic_op_fadd => AtomOp::Add,
1929 nir_atomic_op_fmin => AtomOp::Min,
1930 nir_atomic_op_fmax => AtomOp::Max,
1931 nir_atomic_op_cmpxchg => AtomOp::CmpExch(cmp_src),
1932 _ => panic!("Unsupported NIR atomic op"),
1933 }
1934 }
1935
1936 fn get_eviction_priority(
1937 &mut self,
1938 access: gl_access_qualifier,
1939 ) -> MemEvictionPriority {
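// Non-temporal accesses get the evict-first cache hint; everything else,
// and anything before Volta (SM70), uses the default priority.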
1940 if self.sm.sm() >= 70 && access & ACCESS_NON_TEMPORAL != 0 {
1941 MemEvictionPriority::First
1942 } else {
1943 MemEvictionPriority::Normal
1944 }
1945 }
1946
1947 fn get_image_dim(&mut self, intrin: &nir_intrinsic_instr) -> ImageDim {
1948 let is_array = intrin.image_array();
1949 let image_dim = intrin.image_dim();
1950 match image_dim {
1951 GLSL_SAMPLER_DIM_1D => {
1952 if is_array {
1953 ImageDim::_1DArray
1954 } else {
1955 ImageDim::_1D
1956 }
1957 }
1958 GLSL_SAMPLER_DIM_2D => {
1959 if is_array {
1960 ImageDim::_2DArray
1961 } else {
1962 ImageDim::_2D
1963 }
1964 }
1965 GLSL_SAMPLER_DIM_3D => {
1966 assert!(!is_array);
1967 ImageDim::_3D
1968 }
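// Cube images are treated as 2D arrays of six faces.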
1969 GLSL_SAMPLER_DIM_CUBE => ImageDim::_2DArray,
1970 GLSL_SAMPLER_DIM_BUF => {
1971 assert!(!is_array);
1972 ImageDim::_1DBuffer
1973 }
1974 _ => panic!("Unsupported image dimension: {}", image_dim),
1975 }
1976 }
1977
1978 fn get_image_coord(
1979 &mut self,
1980 intrin: &nir_intrinsic_instr,
1981 dim: ImageDim,
1982 ) -> Src {
1983 let vec = self.get_ssa(intrin.get_src(1).as_def());
1984 // let sample = self.get_src(&srcs[2]);
1985 let comps = usize::from(dim.coord_comps());
1986 SSARef::try_from(&vec[0..comps]).unwrap().into()
1987 }
1988
1989 fn parse_intrinsic(
1990 &mut self,
1991 b: &mut impl SSABuilder,
1992 intrin: &nir_intrinsic_instr,
1993 ) {
1994 let srcs = intrin.srcs_as_slice();
1995 match intrin.intrinsic {
1996 nir_intrinsic_al2p_nv => {
1997 let offset = self.get_src(&srcs[0]);
1998 let addr = u16::try_from(intrin.base()).unwrap();
1999
2000 let flags = intrin.flags();
2001 let flags: nak_nir_attr_io_flags =
2002 unsafe { std::mem::transmute_copy(&flags) };
2003
2004 let access = AttrAccess {
2005 addr: addr,
2006 comps: 1,
2007 patch: flags.patch(),
2008 output: flags.output(),
2009 phys: false,
2010 };
2011
2012 let dst = b.alloc_ssa(RegFile::GPR, 1);
2013 b.push_op(OpAL2P {
2014 dst: dst.into(),
2015 offset: offset,
2016 access: access,
2017 });
2018 self.set_dst(&intrin.def, dst);
2019 }
2020 nir_intrinsic_ald_nv | nir_intrinsic_ast_nv => {
2021 let addr = u16::try_from(intrin.base()).unwrap();
2022 let base = u16::try_from(intrin.range_base()).unwrap();
2023 let range = u16::try_from(intrin.range()).unwrap();
2024 let range = base..(base + range);
2025
2026 let flags = intrin.flags();
2027 let flags: nak_nir_attr_io_flags =
2028 unsafe { std::mem::transmute_copy(&flags) };
2029 assert!(!flags.patch() || !flags.phys());
2030
2031 if let ShaderIoInfo::Vtg(io) = &mut self.info.io {
2032 if flags.patch() {
2033 match &mut self.info.stage {
2034 ShaderStageInfo::TessellationInit(stage) => {
2035 assert!(flags.output());
2036 stage.per_patch_attribute_count = max(
2037 stage.per_patch_attribute_count,
2038 (range.end / 4).try_into().unwrap(),
2039 );
2040 }
2041 ShaderStageInfo::Tessellation(_) => (),
2042 _ => panic!("Patch I/O not supported"),
2043 }
2044 } else {
2045 if flags.output() {
2046 if intrin.intrinsic == nir_intrinsic_ast_nv {
2047 io.mark_store_req(range.clone());
2048 }
2049 io.mark_attrs_written(range);
2050 } else {
2051 io.mark_attrs_read(range);
2052 }
2053 }
2054 } else {
2055 panic!("Must be a VTG stage");
2056 }
2057
2058 let access = AttrAccess {
2059 addr: addr,
2060 comps: intrin.num_components,
2061 patch: flags.patch(),
2062 output: flags.output(),
2063 phys: flags.phys(),
2064 };
2065
2066 if intrin.intrinsic == nir_intrinsic_ald_nv {
2067 let vtx = self.get_src(&srcs[0]);
2068 let offset = self.get_src(&srcs[1]);
2069
2070 assert!(intrin.def.bit_size() == 32);
2071 let dst = b.alloc_ssa(RegFile::GPR, access.comps);
2072 b.push_op(OpALd {
2073 dst: dst.into(),
2074 vtx: vtx,
2075 offset: offset,
2076 access: access,
2077 });
2078 self.set_dst(&intrin.def, dst);
2079 } else if intrin.intrinsic == nir_intrinsic_ast_nv {
2080 assert!(srcs[0].bit_size() == 32);
2081 let data = self.get_src(&srcs[0]);
2082 let vtx = self.get_src(&srcs[1]);
2083 let offset = self.get_src(&srcs[2]);
2084
2085 b.push_op(OpASt {
2086 data: data,
2087 vtx: vtx,
2088 offset: offset,
2089 access: access,
2090 });
2091 } else {
2092 panic!("Invalid VTG I/O intrinsic");
2093 }
2094 }
2095 nir_intrinsic_as_uniform => {
2096 let src = self.get_ssa(srcs[0].as_def());
2097 let mut dst = Vec::new();
2098 for comp in src {
2099 let u = b.alloc_ssa(RegFile::UGPR, 1);
2100 b.push_op(OpR2UR {
2101 src: [*comp].into(),
2102 dst: u.into(),
2103 });
2104 dst.push(u[0]);
2105 }
2106 self.set_ssa(&intrin.def, dst);
2107 }
2108 nir_intrinsic_ddx
2109 | nir_intrinsic_ddx_coarse
2110 | nir_intrinsic_ddx_fine => {
2111 // TODO: Real coarse derivatives
2112
2113 assert!(intrin.def.bit_size() == 32);
2114 let ftype = FloatType::F32;
2115 let scratch = b.alloc_ssa(RegFile::GPR, 1);
2116
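// ddx: a butterfly shuffle with mask 1 swaps values between horizontally
// adjacent pixels in the quad; the FSwzAdd below then subtracts the pair
// with a per-lane operand order so every lane gets the same derivative.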
2117 b.push_op(OpShfl {
2118 dst: scratch[0].into(),
2119 in_bounds: Dst::None,
2120 src: self.get_src(&srcs[0]),
2121 lane: 1_u32.into(),
2122 c: (0x3_u32 | 0x1c_u32 << 8).into(),
2123 op: ShflOp::Bfly,
2124 });
2125
2126 let dst = b.alloc_ssa(RegFile::GPR, 1);
2127
2128 b.push_op(OpFSwzAdd {
2129 dst: dst[0].into(),
2130 srcs: [scratch[0].into(), self.get_src(&srcs[0])],
2131 ops: [
2132 FSwzAddOp::SubLeft,
2133 FSwzAddOp::SubRight,
2134 FSwzAddOp::SubLeft,
2135 FSwzAddOp::SubRight,
2136 ],
2137 rnd_mode: self.float_ctl[ftype].rnd_mode,
2138 ftz: self.float_ctl[ftype].ftz,
2139 });
2140
2141 self.set_dst(&intrin.def, dst);
2142 }
2143 nir_intrinsic_ddy
2144 | nir_intrinsic_ddy_coarse
2145 | nir_intrinsic_ddy_fine => {
2146 // TODO: Real coarse derivatives
2147
2148 assert!(intrin.def.bit_size() == 32);
2149 let ftype = FloatType::F32;
2150 let scratch = b.alloc_ssa(RegFile::GPR, 1);
2151
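// ddy: same idea as ddx, but a butterfly mask of 2 pairs vertically
// adjacent pixels in the quad.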
2152 b.push_op(OpShfl {
2153 dst: scratch[0].into(),
2154 in_bounds: Dst::None,
2155 src: self.get_src(&srcs[0]),
2156 lane: 2_u32.into(),
2157 c: (0x3_u32 | 0x1c_u32 << 8).into(),
2158 op: ShflOp::Bfly,
2159 });
2160
2161 let dst = b.alloc_ssa(RegFile::GPR, 1);
2162
2163 b.push_op(OpFSwzAdd {
2164 dst: dst[0].into(),
2165 srcs: [scratch[0].into(), self.get_src(&srcs[0])],
2166 ops: [
2167 FSwzAddOp::SubLeft,
2168 FSwzAddOp::SubLeft,
2169 FSwzAddOp::SubRight,
2170 FSwzAddOp::SubRight,
2171 ],
2172 rnd_mode: self.float_ctl[ftype].rnd_mode,
2173 ftz: self.float_ctl[ftype].ftz,
2174 });
2175
2176 self.set_dst(&intrin.def, dst);
2177 }
2178 nir_intrinsic_ballot => {
2179 assert!(srcs[0].bit_size() == 1);
2180 let src = self.get_src(&srcs[0]);
2181
2182 assert!(intrin.def.bit_size() == 32);
2183 let dst = b.alloc_ssa(RegFile::GPR, 1);
2184
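// Only the ballot destination of OpVote is used here; it receives the
// mask of lanes whose predicate is set, which is what NIR's ballot wants.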
2185 b.push_op(OpVote {
2186 op: VoteOp::Any,
2187 ballot: dst.into(),
2188 vote: Dst::None,
2189 pred: src,
2190 });
2191 self.set_dst(&intrin.def, dst);
2192 }
2193 nir_intrinsic_bar_break_nv => {
2194 let src = self.get_src(&srcs[0]);
2195 let bar_in = b.bmov_to_bar(src);
2196 let cond = self.get_src(&srcs[1]);
2197
2198 let bar_out = b.alloc_ssa(RegFile::Bar, 1);
2199 b.push_op(OpBreak {
2200 bar_out: bar_out.into(),
2201 bar_in: bar_in.into(),
2202 cond: cond.into(),
2203 });
2204
2205 self.set_dst(&intrin.def, b.bmov_to_gpr(bar_out.into()));
2206 }
2207 nir_intrinsic_bar_set_nv => {
2208 let label = self.label_alloc.alloc();
2209 let old = self.bar_label.insert(intrin.def.index, label);
2210 assert!(old.is_none());
2211
2212 let bar_clear = b.alloc_ssa(RegFile::Bar, 1);
2213 b.push_op(OpBClear {
2214 dst: bar_clear.into(),
2215 });
2216
2217 let bar_out = b.alloc_ssa(RegFile::Bar, 1);
2218 b.push_op(OpBSSy {
2219 bar_out: bar_out.into(),
2220 bar_in: bar_clear.into(),
2221 cond: SrcRef::True.into(),
2222 target: label,
2223 });
2224
2225 self.set_dst(&intrin.def, b.bmov_to_gpr(bar_out.into()));
2226 }
2227 nir_intrinsic_bar_sync_nv => {
2228 let src = self.get_src(&srcs[0]);
2229
2230 let bar = b.bmov_to_bar(src);
2231 b.push_op(OpBSync {
2232 bar: bar.into(),
2233 cond: SrcRef::True.into(),
2234 });
2235
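// Place the label allocated by the matching bar_set_nv right after the
// sync so the OpBSSy emitted there has a reconvergence target.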
2236 let bar_set_idx = &srcs[1].as_def().index;
2237 if let Some(label) = self.bar_label.get(bar_set_idx) {
2238 b.push_op(OpNop {
2239 label: Some(*label),
2240 });
2241 }
2242 }
2243 nir_intrinsic_bindless_image_atomic
2244 | nir_intrinsic_bindless_image_atomic_swap => {
2245 let handle = self.get_src(&srcs[0]);
2246 let dim = self.get_image_dim(intrin);
2247 let coord = self.get_image_coord(intrin, dim);
2248 // let sample = self.get_src(&srcs[2]);
2249 let atom_type = self.get_atomic_type(intrin);
2250 let atom_op = self.get_atomic_op(intrin, AtomCmpSrc::Packed);
2251
2252 assert!(
2253 intrin.def.bit_size() == 32 || intrin.def.bit_size() == 64
2254 );
2255 assert!(intrin.def.num_components() == 1);
2256 let dst = b.alloc_ssa(RegFile::GPR, intrin.def.bit_size() / 32);
2257
2258 let data = if intrin.intrinsic
2259 == nir_intrinsic_bindless_image_atomic_swap
2260 {
2261 if intrin.def.bit_size() == 64 {
2262 SSARef::from([
2263 self.get_ssa(srcs[3].as_def())[0],
2264 self.get_ssa(srcs[3].as_def())[1],
2265 self.get_ssa(srcs[4].as_def())[0],
2266 self.get_ssa(srcs[4].as_def())[1],
2267 ])
2268 .into()
2269 } else {
2270 SSARef::from([
2271 self.get_ssa(srcs[3].as_def())[0],
2272 self.get_ssa(srcs[4].as_def())[0],
2273 ])
2274 .into()
2275 }
2276 } else {
2277 self.get_src(&srcs[3])
2278 };
2279
2280 let is_reduction =
2281 atom_op.is_reduction() && intrin.def.components_read() == 0;
2282
2283 b.push_op(OpSuAtom {
2284 dst: if self.sm.sm() >= 70 && is_reduction {
2285 Dst::None
2286 } else {
2287 dst.into()
2288 },
2289 fault: Dst::None,
2290 handle: handle,
2291 coord: coord,
2292 data: data,
2293 atom_op: atom_op,
2294 atom_type: atom_type,
2295 image_dim: dim,
2296 mem_order: MemOrder::Strong(MemScope::System),
2297 mem_eviction_priority: self
2298 .get_eviction_priority(intrin.access()),
2299 });
2300 self.set_dst(&intrin.def, dst);
2301 }
2302 nir_intrinsic_bindless_image_load => {
2303 let handle = self.get_src(&srcs[0]);
2304 let dim = self.get_image_dim(intrin);
2305 let coord = self.get_image_coord(intrin, dim);
2306 // let sample = self.get_src(&srcs[2]);
2307
2308 let mem_order = if (intrin.access() & ACCESS_CAN_REORDER) != 0 {
2309 if self.sm.sm() >= 80 {
2310 MemOrder::Constant
2311 } else {
2312 MemOrder::Weak
2313 }
2314 } else {
2315 MemOrder::Strong(MemScope::System)
2316 };
2317
2318 let comps = intrin.num_components;
2319 assert!(intrin.def.bit_size() == 32);
2320 assert!(comps == 1 || comps == 2 || comps == 4);
2321
2322 let dst = b.alloc_ssa(RegFile::GPR, comps);
2323
2324 b.push_op(OpSuLd {
2325 dst: dst.into(),
2326 fault: Dst::None,
2327 image_dim: dim,
2328 mem_order,
2329 mem_eviction_priority: self
2330 .get_eviction_priority(intrin.access()),
2331 mask: (1 << comps) - 1,
2332 handle: handle,
2333 coord: coord,
2334 });
2335 self.set_dst(&intrin.def, dst);
2336 }
2337 nir_intrinsic_bindless_image_sparse_load => {
2338 let handle = self.get_src(&srcs[0]);
2339 let dim = self.get_image_dim(intrin);
2340 let coord = self.get_image_coord(intrin, dim);
2341 // let sample = self.get_src(&srcs[2]);
2342
2343 let mem_order = if (intrin.access() & ACCESS_CAN_REORDER) != 0 {
2344 if self.sm.sm() >= 80 {
2345 MemOrder::Constant
2346 } else {
2347 MemOrder::Weak
2348 }
2349 } else {
2350 MemOrder::Strong(MemScope::System)
2351 };
2352
2353 let comps = intrin.num_components;
2354 assert!(intrin.def.bit_size() == 32);
2355 assert!(comps == 5);
2356
2357 let dst = b.alloc_ssa(RegFile::GPR, comps - 1);
2358 let fault = b.alloc_ssa(RegFile::Pred, 1);
2359
2360 b.push_op(OpSuLd {
2361 dst: dst.into(),
2362 fault: fault.into(),
2363 image_dim: dim,
2364 mem_order,
2365 mem_eviction_priority: self
2366 .get_eviction_priority(intrin.access()),
2367 mask: (1 << (comps - 1)) - 1,
2368 handle: handle,
2369 coord: coord,
2370 });
2371
2372 let mut final_dst = Vec::new();
2373 for i in 0..usize::from(comps) - 1 {
2374 final_dst.push(dst[i]);
2375 }
2376 final_dst.push(b.sel(fault.into(), 0.into(), 1.into())[0]);
2377
2378 self.set_ssa(&intrin.def, final_dst);
2379 }
2380 nir_intrinsic_bindless_image_store => {
2381 let handle = self.get_src(&srcs[0]);
2382 let dim = self.get_image_dim(intrin);
2383 let coord = self.get_image_coord(intrin, dim);
2384 // let sample = self.get_src(&srcs[2]);
2385 let data = self.get_src(&srcs[3]);
2386
2387 let comps = intrin.num_components;
2388 assert!(srcs[3].bit_size() == 32);
2389 assert!(comps == 1 || comps == 2 || comps == 4);
2390
2391 b.push_op(OpSuSt {
2392 image_dim: dim,
2393 mem_order: MemOrder::Strong(MemScope::System),
2394 mem_eviction_priority: self
2395 .get_eviction_priority(intrin.access()),
2396 mask: (1 << comps) - 1,
2397 handle: handle,
2398 coord: coord,
2399 data: data,
2400 });
2401 }
2402 nir_intrinsic_copy_fs_outputs_nv => {
2403 let ShaderIoInfo::Fragment(info) = &mut self.info.io else {
2404 panic!(
2405 "copy_fs_outputs_nv is only allowed in fragment shaders"
2406 );
2407 };
2408
2409 for i in 0..32 {
2410 if !self.fs_out_regs[i].is_none() {
2411 info.writes_color |= 1 << i;
2412 }
2413 }
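// fs_out_regs is indexed in 32-bit components, so convert the byte
// offsets of the sample-mask and depth outputs into dword indices.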
2414 let mask_idx = (NAK_FS_OUT_SAMPLE_MASK / 4) as usize;
2415 info.writes_sample_mask = !self.fs_out_regs[mask_idx].is_none();
2416 let depth_idx = (NAK_FS_OUT_DEPTH / 4) as usize;
2417 info.writes_depth = !self.fs_out_regs[depth_idx].is_none();
2418
2419 let mut srcs = Vec::new();
2420 for i in 0..8 {
2421 // Even though the mask is per-component, the actual output
2422 // space is allocated in vec4s, one per output.
2423 if info.writes_color & (0xf << (i * 4)) != 0 {
2424 for c in 0..4 {
2425 let reg = self.fs_out_regs[i * 4 + c];
2426 if reg.is_none() {
2427 srcs.push(b.undef().into());
2428 } else {
2429 srcs.push(reg.into());
2430 }
2431 }
2432 }
2433 }
2434
2435 // These always come together for some reason
2436 if info.writes_sample_mask || info.writes_depth {
2437 if info.writes_sample_mask {
2438 srcs.push(self.fs_out_regs[mask_idx].into());
2439 } else {
2440 srcs.push(b.undef().into());
2441 }
2442 if info.writes_depth {
2443 srcs.push(self.fs_out_regs[depth_idx].into());
2444 }
2445 }
2446
2447 b.push_op(OpRegOut { srcs: srcs });
2448 }
2449 nir_intrinsic_demote => {
2450 if let ShaderStageInfo::Fragment(info) = &mut self.info.stage {
2451 info.uses_kill = true;
2452 } else {
2453 panic!("OpKill is only available in fragment shaders");
2454 }
2455 b.push_op(OpKill {});
2456 }
2457 nir_intrinsic_demote_if => {
2458 if let ShaderStageInfo::Fragment(info) = &mut self.info.stage {
2459 info.uses_kill = true;
2460 } else {
2461 panic!("OpKill is only available in fragment shaders");
2462 }
2463 let cond = self.get_ssa(srcs[0].as_def())[0];
2464 b.predicate(cond.into()).push_op(OpKill {});
2465 }
2466 nir_intrinsic_global_atomic => {
2467 let bit_size = intrin.def.bit_size();
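// Split the address into a base SSA value plus an immediate offset small
// enough for the instruction's 24-bit offset field.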
2468 let (addr, offset) = self.get_io_addr_offset(&srcs[0], 24);
2469 let data = self.get_src(&srcs[1]);
2470 let atom_type = self.get_atomic_type(intrin);
2471 let atom_op = self.get_atomic_op(intrin, AtomCmpSrc::Separate);
2472
2473 assert!(intrin.def.num_components() == 1);
2474 let dst = b.alloc_ssa(RegFile::GPR, bit_size.div_ceil(32));
2475
2476 let is_reduction =
2477 atom_op.is_reduction() && intrin.def.components_read() == 0;
2478
2479 b.push_op(OpAtom {
2480 dst: if is_reduction { Dst::None } else { dst.into() },
2481 addr: addr,
2482 cmpr: 0.into(),
2483 data: data,
2484 atom_op: atom_op,
2485 atom_type: atom_type,
2486 addr_offset: offset,
2487 mem_space: MemSpace::Global(MemAddrType::A64),
2488 mem_order: MemOrder::Strong(MemScope::System),
2489 mem_eviction_priority: MemEvictionPriority::Normal, // Note: no intrinsic access
2490 });
2491 self.set_dst(&intrin.def, dst);
2492 }
2493 nir_intrinsic_global_atomic_swap => {
2494 assert!(intrin.atomic_op() == nir_atomic_op_cmpxchg);
2495 let bit_size = intrin.def.bit_size();
2496 let (addr, offset) = self.get_io_addr_offset(&srcs[0], 24);
2497 let cmpr = self.get_src(&srcs[1]);
2498 let data = self.get_src(&srcs[2]);
2499 let atom_type = AtomType::U(bit_size);
2500
2501 assert!(intrin.def.num_components() == 1);
2502 let dst = b.alloc_ssa(RegFile::GPR, bit_size.div_ceil(32));
2503
2504 b.push_op(OpAtom {
2505 dst: dst.into(),
2506 addr: addr,
2507 cmpr: cmpr,
2508 data: data,
2509 atom_op: AtomOp::CmpExch(AtomCmpSrc::Separate),
2510 atom_type: atom_type,
2511 addr_offset: offset,
2512 mem_space: MemSpace::Global(MemAddrType::A64),
2513 mem_order: MemOrder::Strong(MemScope::System),
2514 mem_eviction_priority: MemEvictionPriority::Normal, // Note: no intrinsic access
2515 });
2516 self.set_dst(&intrin.def, dst);
2517 }
2518 nir_intrinsic_ipa_nv => {
2519 let addr = u16::try_from(intrin.base()).unwrap();
2520
2521 let flags = intrin.flags();
2522 let flags: nak_nir_ipa_flags =
2523 unsafe { std::mem::transmute_copy(&flags) };
2524
2525 let mode = match flags.interp_mode() {
2526 NAK_INTERP_MODE_PERSPECTIVE => PixelImap::Perspective,
2527 NAK_INTERP_MODE_SCREEN_LINEAR => PixelImap::ScreenLinear,
2528 NAK_INTERP_MODE_CONSTANT => PixelImap::Constant,
2529 _ => panic!("Unsupported interp mode"),
2530 };
2531
2532 let freq = match flags.interp_freq() {
2533 NAK_INTERP_FREQ_PASS => InterpFreq::Pass,
2534 NAK_INTERP_FREQ_PASS_MUL_W => InterpFreq::PassMulW,
2535 NAK_INTERP_FREQ_CONSTANT => InterpFreq::Constant,
2536 NAK_INTERP_FREQ_STATE => InterpFreq::State,
2537 _ => panic!("Invalid interp freq"),
2538 };
2539
2540 let loc = match flags.interp_loc() {
2541 NAK_INTERP_LOC_DEFAULT => InterpLoc::Default,
2542 NAK_INTERP_LOC_CENTROID => InterpLoc::Centroid,
2543 NAK_INTERP_LOC_OFFSET => InterpLoc::Offset,
2544 _ => panic!("Invalid interp loc"),
2545 };
2546
2547 let inv_w = if freq == InterpFreq::PassMulW {
2548 self.get_src(&srcs[0])
2549 } else {
2550 0.into()
2551 };
2552
2553 let offset = if loc == InterpLoc::Offset {
2554 self.get_src(&srcs[1])
2555 } else {
2556 0.into()
2557 };
2558
2559 let ShaderIoInfo::Fragment(io) = &mut self.info.io else {
2560 panic!("OpIpa is only used for fragment shaders");
2561 };
2562
2563 io.mark_attr_read(addr, mode);
2564
2565 let dst = b.alloc_ssa(RegFile::GPR, 1);
2566 b.push_op(OpIpa {
2567 dst: dst.into(),
2568 addr: addr,
2569 freq: freq,
2570 loc: loc,
2571 inv_w: inv_w,
2572 offset: offset,
2573 });
2574 self.set_dst(&intrin.def, dst);
2575 }
2576 nir_intrinsic_isberd_nv => {
2577 let dst = b.alloc_ssa(RegFile::GPR, 1);
2578 b.push_op(OpIsberd {
2579 dst: dst.into(),
2580 idx: self.get_src(&srcs[0]),
2581 });
2582 self.set_dst(&intrin.def, dst);
2583 }
2584 nir_intrinsic_load_barycentric_at_offset_nv => (),
2585 nir_intrinsic_load_barycentric_centroid => (),
2586 nir_intrinsic_load_barycentric_pixel => (),
2587 nir_intrinsic_load_barycentric_sample => (),
2588 nir_intrinsic_load_global | nir_intrinsic_load_global_constant => {
2589 let size_B =
2590 (intrin.def.bit_size() / 8) * intrin.def.num_components();
2591 assert!(u32::from(size_B) <= intrin.align());
2592 let order = if intrin.intrinsic
2593 == nir_intrinsic_load_global_constant
2594 || (intrin.access() & ACCESS_CAN_REORDER) != 0
2595 {
2596 MemOrder::Constant
2597 } else {
2598 MemOrder::Strong(MemScope::System)
2599 };
2600 let access = MemAccess {
2601 mem_type: MemType::from_size(size_B, false),
2602 space: MemSpace::Global(MemAddrType::A64),
2603 order: order,
2604 eviction_priority: self
2605 .get_eviction_priority(intrin.access()),
2606 };
2607 let (addr, offset) = self.get_io_addr_offset(&srcs[0], 24);
2608 let dst = b.alloc_ssa(RegFile::GPR, size_B.div_ceil(4));
2609
2610 b.push_op(OpLd {
2611 dst: dst.into(),
2612 addr: addr,
2613 offset: offset,
2614 access: access,
2615 });
2616 self.set_dst(&intrin.def, dst);
2617 }
2618 nir_intrinsic_ldtram_nv => {
2619 let ShaderIoInfo::Fragment(io) = &mut self.info.io else {
2620 panic!("ldtram_nv is only used for fragment shaders");
2621 };
2622
2623 assert!(
2624 intrin.def.bit_size() == 32
2625 && intrin.def.num_components == 2
2626 );
2627
2628 let flags = intrin.flags();
2629 let use_c = flags != 0;
2630
2631 let addr = u16::try_from(intrin.base()).unwrap();
2632
2633 io.mark_barycentric_attr_in(addr);
2634
2635 let dst = b.alloc_ssa(RegFile::GPR, 2);
2636 b.push_op(OpLdTram {
2637 dst: dst.into(),
2638 addr,
2639 use_c,
2640 });
2641 self.set_dst(&intrin.def, dst);
2642 }
2643 nir_intrinsic_load_sample_id => {
2644 let dst = b.alloc_ssa(RegFile::GPR, 1);
2645 b.push_op(OpPixLd {
2646 dst: dst.into(),
2647 val: PixVal::MyIndex,
2648 });
2649 self.set_dst(&intrin.def, dst);
2650 }
2651 nir_intrinsic_load_sample_mask_in => {
2652 if let ShaderIoInfo::Fragment(info) = &mut self.info.io {
2653 info.reads_sample_mask = true;
2654 } else {
2655 panic!(
2656 "sample_mask_in is only available in fragment shaders"
2657 );
2658 }
2659
2660 let dst = b.alloc_ssa(RegFile::GPR, 1);
2661 b.push_op(OpPixLd {
2662 dst: dst.into(),
2663 val: PixVal::CovMask,
2664 });
2665 self.set_dst(&intrin.def, dst);
2666 }
2667 nir_intrinsic_load_tess_coord_xy => {
2668 // Loading gl_TessCoord in tessellation evaluation shaders is
2669 // weird. It's treated as a per-vertex output which is indexed
2670 // by LANEID.
2671 match &self.info.stage {
2672 ShaderStageInfo::Tessellation(_) => (),
2673 _ => panic!(
2674 "load_tess_coord is only available in tessellation \
2675 shaders"
2676 ),
2677 };
2678
2679 assert!(intrin.def.bit_size() == 32);
2680 assert!(intrin.def.num_components() == 2);
2681
2682 let vtx = b.alloc_ssa(RegFile::GPR, 1);
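// The per-thread index comes from the lane ID (S2R index 0 should be
// SR_LANEID), matching the LANEID-indexed access described above.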
2683 b.push_op(OpS2R {
2684 dst: vtx.into(),
2685 idx: 0,
2686 });
2687
2688 let access = AttrAccess {
2689 addr: NAK_ATTR_TESS_COORD,
2690 comps: 2,
2691 patch: false,
2692 output: true,
2693 phys: false,
2694 };
2695
2696 // This is recorded as a patch output in parse_shader() because
2697 // the hardware requires it be in the SPH, whether we use it or
2698 // not.
2699
2700 let dst = b.alloc_ssa(RegFile::GPR, access.comps);
2701 b.push_op(OpALd {
2702 dst: dst.into(),
2703 vtx: vtx.into(),
2704 offset: 0.into(),
2705 access: access,
2706 });
2707 self.set_dst(&intrin.def, dst);
2708 }
2709 nir_intrinsic_load_scratch => {
2710 let size_B =
2711 (intrin.def.bit_size() / 8) * intrin.def.num_components();
2712 assert!(u32::from(size_B) <= intrin.align());
2713 let access = MemAccess {
2714 mem_type: MemType::from_size(size_B, false),
2715 space: MemSpace::Local,
2716 order: MemOrder::Strong(MemScope::CTA),
2717 eviction_priority: MemEvictionPriority::Normal,
2718 };
2719 let (addr, offset) = self.get_io_addr_offset(&srcs[0], 24);
2720 let dst = b.alloc_ssa(RegFile::GPR, size_B.div_ceil(4));
2721
2722 b.push_op(OpLd {
2723 dst: dst.into(),
2724 addr: addr,
2725 offset: offset,
2726 access: access,
2727 });
2728 self.set_dst(&intrin.def, dst);
2729 }
2730 nir_intrinsic_load_shared => {
2731 let size_B =
2732 (intrin.def.bit_size() / 8) * intrin.def.num_components();
2733 assert!(u32::from(size_B) <= intrin.align());
2734 let access = MemAccess {
2735 mem_type: MemType::from_size(size_B, false),
2736 space: MemSpace::Shared,
2737 order: MemOrder::Strong(MemScope::CTA),
2738 eviction_priority: MemEvictionPriority::Normal,
2739 };
2740 let (addr, offset) = self.get_io_addr_offset(&srcs[0], 24);
2741 let offset = offset + intrin.base();
2742 let dst = b.alloc_ssa(RegFile::GPR, size_B.div_ceil(4));
2743
2744 b.push_op(OpLd {
2745 dst: dst.into(),
2746 addr: addr,
2747 offset: offset,
2748 access: access,
2749 });
2750 self.set_dst(&intrin.def, dst);
2751 }
2752 nir_intrinsic_load_sysval_nv => {
2753 let idx = u8::try_from(intrin.base()).unwrap();
2754 debug_assert!(intrin.def.num_components == 1);
2755 debug_assert!(
2756 intrin.def.bit_size == 32 || intrin.def.bit_size == 64
2757 );
2758 let comps = intrin.def.bit_size / 32;
2759 let dst = b.alloc_ssa(RegFile::GPR, comps);
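// The clock is read as a 64-bit value: CS2R can write a GPR pair, while
// plain S2R is assumed to be limited to a single 32-bit register.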
2760 if idx == NAK_SV_CLOCK || idx == NAK_SV_CLOCK + 1 {
2761 debug_assert!(idx + comps <= NAK_SV_CLOCK + 2);
2762 b.push_op(OpCS2R {
2763 dst: dst.into(),
2764 idx: idx,
2765 });
2766 } else {
2767 debug_assert!(intrin.def.bit_size == 32);
2768 b.push_op(OpS2R {
2769 dst: dst.into(),
2770 idx: idx,
2771 });
2772 }
2773 self.set_dst(&intrin.def, dst);
2774 }
2775 nir_intrinsic_ldc_nv => {
2776 let size_B =
2777 (intrin.def.bit_size() / 8) * intrin.def.num_components();
2778 let idx = &srcs[0];
2779
2780 let (off, off_imm) = self.get_cbuf_addr_offset(&srcs[1]);
2781
2782 let dst = b.alloc_ssa(RegFile::GPR, size_B.div_ceil(4));
2783
2784 if let Some(idx_imm) = idx.as_uint() {
2785 let idx_imm: u8 = idx_imm.try_into().unwrap();
2786 let cb = CBufRef {
2787 buf: CBuf::Binding(idx_imm),
2788 offset: off_imm,
2789 };
2790 if off.is_zero() {
2791 for (i, comp) in dst.iter().enumerate() {
2792 let i = u16::try_from(i).unwrap();
2793 b.copy_to((*comp).into(), cb.offset(i * 4).into());
2794 }
2795 } else {
2796 b.push_op(OpLdc {
2797 dst: dst.into(),
2798 cb: cb.into(),
2799 offset: off,
2800 mode: LdcMode::Indexed,
2801 mem_type: MemType::from_size(size_B, false),
2802 });
2803 }
2804 } else {
2805 // In the IndexedSegmented mode, the hardware computes the
2806 // actual index and offset as follows:
2807 //
2808 // idx = imm_idx + reg[31:16]
2809 // offset = imm_offset + reg[15:0]
2810 // ldc c[idx][offset]
2811 //
2812 // So pack the index and offset accordingly
2813 let idx = self.get_src(idx);
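// prmt with selectors [0, 1, 4, 5] takes the low 16 bits of off and the
// low 16 bits of idx, packing them as reg[15:0] = offset, reg[31:16] = index.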
2814 let off_idx = b.prmt(off, idx, [0, 1, 4, 5]);
2815 let cb = CBufRef {
2816 buf: CBuf::Binding(0),
2817 offset: off_imm,
2818 };
2819 b.push_op(OpLdc {
2820 dst: dst.into(),
2821 cb: cb.into(),
2822 offset: off_idx.into(),
2823 mode: LdcMode::IndexedSegmented,
2824 mem_type: MemType::from_size(size_B, false),
2825 });
2826 }
2827 self.set_dst(&intrin.def, dst);
2828 }
2829 nir_intrinsic_ldcx_nv => {
2830 let size_B =
2831 (intrin.def.bit_size() / 8) * intrin.def.num_components();
2832
2833 let handle = self.get_ssa_ref(&srcs[0]);
2834 let (off, off_imm) = self.get_cbuf_addr_offset(&srcs[1]);
2835
2836 let cb = CBufRef {
2837 buf: CBuf::BindlessSSA(handle),
2838 offset: off_imm,
2839 };
2840
2841 let dst = b.alloc_ssa(RegFile::GPR, size_B.div_ceil(4));
2842 if off.is_zero() {
2843 for (i, comp) in dst.iter().enumerate() {
2844 let i = u16::try_from(i).unwrap();
2845 b.copy_to((*comp).into(), cb.offset(i * 4).into());
2846 }
2847 } else {
2848 b.push_op(OpLdc {
2849 dst: dst.into(),
2850 cb: cb.into(),
2851 offset: off,
2852 mode: LdcMode::Indexed,
2853 mem_type: MemType::from_size(size_B, false),
2854 });
2855 }
2856 self.set_dst(&intrin.def, dst);
2857 }
2858 nir_intrinsic_pin_cx_handle_nv => {
2859 let handle = self.get_ssa_ref(&srcs[0]);
2860 b.push_op(OpPin {
2861 src: handle.into(),
2862 dst: handle.into(),
2863 });
2864 }
2865 nir_intrinsic_unpin_cx_handle_nv => {
2866 let handle = self.get_ssa_ref(&srcs[0]);
2867 b.push_op(OpUnpin {
2868 src: handle.into(),
2869 dst: handle.into(),
2870 });
2871 }
2872 nir_intrinsic_barrier => {
2873 let modes = intrin.memory_modes();
2874 let semantics = intrin.memory_semantics();
2875 if (modes & nir_var_mem_global) != 0
2876 && (semantics & NIR_MEMORY_RELEASE) != 0
2877 {
2878 // Pre-Volta doesn't have WBAll but it also seems that we
2879 // don't need it.
2880 if self.sm.sm() >= 70 {
2881 b.push_op(OpCCtl {
2882 op: CCtlOp::WBAll,
2883 mem_space: MemSpace::Global(MemAddrType::A64),
2884 addr: 0.into(),
2885 addr_offset: 0,
2886 });
2887 }
2888 }
2889 match intrin.execution_scope() {
2890 SCOPE_NONE => (),
2891 SCOPE_WORKGROUP => {
2892 assert!(
2893 self.nir.info.stage() == MESA_SHADER_COMPUTE
2894 || self.nir.info.stage() == MESA_SHADER_KERNEL
2895 );
2896 self.info.num_control_barriers = 1;
2897 b.push_op(OpBar {});
2898 }
2899 _ => panic!("Unhandled execution scope"),
2900 }
2901 if intrin.memory_scope() != SCOPE_NONE {
2902 let mem_scope = match intrin.memory_scope() {
2903 SCOPE_INVOCATION | SCOPE_SUBGROUP => MemScope::CTA,
2904 SCOPE_WORKGROUP | SCOPE_QUEUE_FAMILY | SCOPE_DEVICE => {
2905 MemScope::GPU
2906 }
2907 _ => panic!("Unhandled memory scope"),
2908 };
2909 b.push_op(OpMemBar { scope: mem_scope });
2910 }
2911 if (modes & nir_var_mem_global) != 0
2912 && (semantics & NIR_MEMORY_ACQUIRE) != 0
2913 {
2914 b.push_op(OpCCtl {
2915 op: CCtlOp::IVAll,
2916 mem_space: MemSpace::Global(MemAddrType::A64),
2917 addr: 0.into(),
2918 addr_offset: 0,
2919 });
2920 }
2921 }
2922 nir_intrinsic_quad_broadcast
2923 | nir_intrinsic_read_invocation
2924 | nir_intrinsic_shuffle
2925 | nir_intrinsic_shuffle_down
2926 | nir_intrinsic_shuffle_up
2927 | nir_intrinsic_shuffle_xor => {
2928 assert!(srcs[0].bit_size() == 32);
2929 assert!(srcs[0].num_components() == 1);
2930 let data = self.get_src(&srcs[0]);
2931
2932 assert!(srcs[1].bit_size() == 32);
2933 let idx = self.get_src(&srcs[1]);
2934
2935 assert!(intrin.def.bit_size() == 32);
2936 let dst = b.alloc_ssa(RegFile::GPR, 1);
2937
2938 b.push_op(OpShfl {
2939 dst: dst.into(),
2940 in_bounds: Dst::None,
2941 src: data,
2942 lane: idx,
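// The c immediate packs a max-lane clamp in its low bits and a segment
// mask in bits 8..12: 0x1c_03 confines the shuffle to a 4-lane quad,
// 0 clamps shuffle_up at lane 0, and 0x1f allows the whole warp.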
2943 c: match intrin.intrinsic {
2944 nir_intrinsic_quad_broadcast => 0x1c_03.into(),
2945 nir_intrinsic_shuffle_up => 0.into(),
2946 _ => 0x1f.into(),
2947 },
2948 op: match intrin.intrinsic {
2949 nir_intrinsic_shuffle_down => ShflOp::Down,
2950 nir_intrinsic_shuffle_up => ShflOp::Up,
2951 nir_intrinsic_shuffle_xor => ShflOp::Bfly,
2952 _ => ShflOp::Idx,
2953 },
2954 });
2955 self.set_dst(&intrin.def, dst);
2956 }
2957 nir_intrinsic_quad_swap_horizontal
2958 | nir_intrinsic_quad_swap_vertical
2959 | nir_intrinsic_quad_swap_diagonal => {
2960 assert!(srcs[0].bit_size() == 32);
2961 assert!(srcs[0].num_components() == 1);
2962 let data = self.get_src(&srcs[0]);
2963
2964 assert!(intrin.def.bit_size() == 32);
2965 let dst = b.alloc_ssa(RegFile::GPR, 1);
2966 b.push_op(OpShfl {
2967 dst: dst.into(),
2968 in_bounds: Dst::None,
2969 src: data,
2970 lane: match intrin.intrinsic {
2971 nir_intrinsic_quad_swap_horizontal => 1_u32.into(),
2972 nir_intrinsic_quad_swap_vertical => 2_u32.into(),
2973 nir_intrinsic_quad_swap_diagonal => 3_u32.into(),
2974 op => panic!("Unknown quad intrinsic {}", op),
2975 },
2976 c: 0x1c_03.into(),
2977 op: ShflOp::Bfly,
2978 });
2979 self.set_dst(&intrin.def, dst);
2980 }
2981 nir_intrinsic_shared_atomic => {
2982 let bit_size = intrin.def.bit_size();
2983 let (addr, offset) = self.get_io_addr_offset(&srcs[0], 24);
2984 let data = self.get_src(&srcs[1]);
2985 let atom_type = self.get_atomic_type(intrin);
2986 let atom_op = self.get_atomic_op(intrin, AtomCmpSrc::Separate);
2987
2988 assert!(intrin.def.num_components() == 1);
2989 let dst = b.alloc_ssa(RegFile::GPR, bit_size.div_ceil(32));
2990
2991 b.push_op(OpAtom {
2992 dst: dst.into(),
2993 addr: addr,
2994 cmpr: 0.into(),
2995 data: data,
2996 atom_op: atom_op,
2997 atom_type: atom_type,
2998 addr_offset: offset,
2999 mem_space: MemSpace::Shared,
3000 mem_order: MemOrder::Strong(MemScope::CTA),
3001 mem_eviction_priority: MemEvictionPriority::Normal,
3002 });
3003 self.set_dst(&intrin.def, dst);
3004 }
3005 nir_intrinsic_shared_atomic_swap => {
3006 assert!(intrin.atomic_op() == nir_atomic_op_cmpxchg);
3007 let bit_size = intrin.def.bit_size();
3008 let (addr, offset) = self.get_io_addr_offset(&srcs[0], 24);
3009 let cmpr = self.get_src(&srcs[1]);
3010 let data = self.get_src(&srcs[2]);
3011 let atom_type = AtomType::U(bit_size);
3012
3013 assert!(intrin.def.num_components() == 1);
3014 let dst = b.alloc_ssa(RegFile::GPR, bit_size.div_ceil(32));
3015
3016 b.push_op(OpAtom {
3017 dst: dst.into(),
3018 addr: addr,
3019 cmpr: cmpr,
3020 data: data,
3021 atom_op: AtomOp::CmpExch(AtomCmpSrc::Separate),
3022 atom_type: atom_type,
3023 addr_offset: offset,
3024 mem_space: MemSpace::Shared,
3025 mem_order: MemOrder::Strong(MemScope::CTA),
3026 mem_eviction_priority: MemEvictionPriority::Normal,
3027 });
3028 self.set_dst(&intrin.def, dst);
3029 }
3030 nir_intrinsic_ssa_bar_nv => {
3031 let src = self.get_src(&srcs[0]);
3032 b.push_op(OpSrcBar { src });
3033 }
3034 nir_intrinsic_store_global => {
3035 let data = self.get_src(&srcs[0]);
3036 let size_B =
3037 (srcs[0].bit_size() / 8) * srcs[0].num_components();
3038 assert!(u32::from(size_B) <= intrin.align());
3039 let access = MemAccess {
3040 mem_type: MemType::from_size(size_B, false),
3041 space: MemSpace::Global(MemAddrType::A64),
3042 order: MemOrder::Strong(MemScope::System),
3043 eviction_priority: self
3044 .get_eviction_priority(intrin.access()),
3045 };
3046 let (addr, offset) = self.get_io_addr_offset(&srcs[1], 24);
3047
3048 b.push_op(OpSt {
3049 addr: addr,
3050 data: data,
3051 offset: offset,
3052 access: access,
3053 });
3054 }
3055 nir_intrinsic_fs_out_nv => {
3056 let data = self.get_ssa(srcs[0].as_def());
3057 assert!(data.len() == 1);
3058 let data = data[0];
3059
3060 let addr = u16::try_from(intrin.base()).unwrap();
3061 assert!(addr % 4 == 0);
3062
3063 self.fs_out_regs[usize::from(addr / 4)] = data;
3064 }
3065 nir_intrinsic_store_scratch => {
3066 let data = self.get_src(&srcs[0]);
3067 let size_B =
3068 (srcs[0].bit_size() / 8) * srcs[0].num_components();
3069 assert!(u32::from(size_B) <= intrin.align());
3070 let access = MemAccess {
3071 mem_type: MemType::from_size(size_B, false),
3072 space: MemSpace::Local,
3073 order: MemOrder::Strong(MemScope::CTA),
3074 eviction_priority: MemEvictionPriority::Normal,
3075 };
3076 let (addr, offset) = self.get_io_addr_offset(&srcs[1], 24);
3077
3078 b.push_op(OpSt {
3079 addr: addr,
3080 data: data,
3081 offset: offset,
3082 access: access,
3083 });
3084 }
3085 nir_intrinsic_store_shared => {
3086 let data = self.get_src(&srcs[0]);
3087 let size_B =
3088 (srcs[0].bit_size() / 8) * srcs[0].num_components();
3089 assert!(u32::from(size_B) <= intrin.align());
3090 let access = MemAccess {
3091 mem_type: MemType::from_size(size_B, false),
3092 space: MemSpace::Shared,
3093 order: MemOrder::Strong(MemScope::CTA),
3094 eviction_priority: MemEvictionPriority::Normal,
3095 };
3096 let (addr, offset) = self.get_io_addr_offset(&srcs[1], 24);
3097 let offset = offset + intrin.base();
3098
3099 b.push_op(OpSt {
3100 addr: addr,
3101 data: data,
3102 offset: offset,
3103 access: access,
3104 });
3105 }
3106 nir_intrinsic_emit_vertex_nv | nir_intrinsic_end_primitive_nv => {
3107 assert!(intrin.def.bit_size() == 32);
3108 assert!(intrin.def.num_components() == 1);
3109
3110 let dst = b.alloc_ssa(RegFile::GPR, 1);
3111 let handle = self.get_src(&srcs[0]);
3112 let stream_id = intrin.stream_id();
3113
3114 b.push_op(OpOut {
3115 dst: dst.into(),
3116 handle: handle,
3117 stream: stream_id.into(),
3118 out_type: if intrin.intrinsic
3119 == nir_intrinsic_emit_vertex_nv
3120 {
3121 OutType::Emit
3122 } else {
3123 OutType::Cut
3124 },
3125 });
3126 self.set_dst(&intrin.def, dst);
3127 }
3128
3129 nir_intrinsic_final_primitive_nv => {
3130 let handle = self.get_src(&srcs[0]);
3131
3132 if self.sm.sm() >= 70 {
3133 b.push_op(OpOutFinal { handle: handle });
3134 } else {
3135 b.push_op(OpRegOut { srcs: vec![handle] });
3136 }
3137 }
3138 nir_intrinsic_vote_all
3139 | nir_intrinsic_vote_any
3140 | nir_intrinsic_vote_ieq => {
3141 assert!(srcs[0].bit_size() == 1);
3142 let src = self.get_src(&srcs[0]);
3143
3144 assert!(intrin.def.bit_size() == 1);
3145 let dst = b.alloc_ssa(RegFile::Pred, 1);
3146
3147 b.push_op(OpVote {
3148 op: match intrin.intrinsic {
3149 nir_intrinsic_vote_all => VoteOp::All,
3150 nir_intrinsic_vote_any => VoteOp::Any,
3151 nir_intrinsic_vote_ieq => VoteOp::Eq,
3152 _ => panic!("Unknown vote intrinsic"),
3153 },
3154 ballot: Dst::None,
3155 vote: dst.into(),
3156 pred: src,
3157 });
3158 self.set_dst(&intrin.def, dst);
3159 }
3160 nir_intrinsic_is_sparse_texels_resident => {
3161 let src = self.get_src(&srcs[0]);
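// The residency code was produced from the fault predicate via sel() in
// the sparse tex/image paths above, so a != 0 test recovers the boolean.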
3162 let dst = b.isetp(IntCmpType::I32, IntCmpOp::Ne, src, 0.into());
3163 self.set_dst(&intrin.def, dst);
3164 }
3165 _ => panic!(
3166 "Unsupported intrinsic instruction: {}",
3167 intrin.info().name()
3168 ),
3169 }
3170 }
3171
3172 fn parse_load_const(
3173 &mut self,
3174 b: &mut impl SSABuilder,
3175 load_const: &nir_load_const_instr,
3176 ) {
3177 let values = &load_const.values();
3178
3179 let mut dst = Vec::new();
3180 match load_const.def.bit_size {
3181 1 => {
3182 for c in 0..load_const.def.num_components {
3183 let imm_b1 = unsafe { values[usize::from(c)].b };
3184 dst.push(b.copy(imm_b1.into())[0]);
3185 }
3186 }
3187 8 => {
3188 for dw in 0..load_const.def.num_components.div_ceil(4) {
3189 let mut imm_u32 = 0;
3190 for b in 0..4 {
3191 let c = dw * 4 + b;
3192 if c < load_const.def.num_components {
3193 let imm_u8 = unsafe { values[usize::from(c)].u8_ };
3194 imm_u32 |= u32::from(imm_u8) << (b * 8);
3195 }
3196 }
3197 dst.push(b.copy(imm_u32.into())[0]);
3198 }
3199 }
3200 16 => {
3201 for dw in 0..load_const.def.num_components.div_ceil(2) {
3202 let mut imm_u32 = 0;
3203 for w in 0..2 {
3204 let c = dw * 2 + w;
3205 if c < load_const.def.num_components {
3206 let imm_u16 =
3207 unsafe { values[usize::from(c)].u16_ };
3208 imm_u32 |= u32::from(imm_u16) << (w * 16);
3209 }
3210 }
3211 dst.push(b.copy(imm_u32.into())[0]);
3212 }
3213 }
3214 32 => {
3215 for c in 0..load_const.def.num_components {
3216 let imm_u32 = unsafe { values[usize::from(c)].u32_ };
3217 dst.push(b.copy(imm_u32.into())[0]);
3218 }
3219 }
3220 64 => {
3221 for c in 0..load_const.def.num_components {
3222 let imm_u64 = unsafe { values[c as usize].u64_ };
3223 dst.push(b.copy((imm_u64 as u32).into())[0]);
3224 dst.push(b.copy(((imm_u64 >> 32) as u32).into())[0]);
3225 }
3226 }
3227 _ => panic!("Unknown bit size: {}", load_const.def.bit_size),
3228 }
3229
3230 self.set_ssa(&load_const.def, dst);
3231 }
3232
3233 fn parse_undef(
3234 &mut self,
3235 b: &mut impl SSABuilder,
3236 undef: &nir_undef_instr,
3237 ) {
3238 let dst = alloc_ssa_for_nir(b, &undef.def);
3239 for c in &dst {
3240 b.push_op(OpUndef { dst: (*c).into() });
3241 }
3242 self.set_ssa(&undef.def, dst);
3243 }
3244
3245 fn emit_jump(
3246 &mut self,
3247 b: &mut impl SSABuilder,
3248 nb: &nir_block,
3249 target: &nir_block,
3250 ) {
3251 if target.index == self.end_block_id {
3252 b.push_op(OpExit {});
3253 } else {
3254 self.cfg.add_edge(nb.index, target.index);
3255 let target_label = self.get_block_label(target);
3256
3257 match self.peek_crs(target) {
3258 Some(SyncType::Sync) => {
3259 b.push_op(OpSync {
3260 target: target_label,
3261 });
3262 }
3263 Some(SyncType::Brk) => {
3264 b.push_op(OpBrk {
3265 target: target_label,
3266 });
3267 }
3268 Some(SyncType::Cont) => {
3269 b.push_op(OpCont {
3270 target: target_label,
3271 });
3272 }
3273 None => {
3274 b.push_op(OpBra {
3275 target: target_label,
3276 });
3277 }
3278 }
3279 }
3280 }
3281
3282 fn emit_pred_jump(
3283 &mut self,
3284 b: &mut impl SSABuilder,
3285 nb: &nir_block,
3286 pred: Pred,
3287 target: &nir_block,
3288 fallthrough: &nir_block,
3289 ) {
3290 // The fall-through edge has to come first
3291 self.cfg.add_edge(nb.index, fallthrough.index);
3292 let op = if target.index == self.end_block_id {
3293 Op::Exit(OpExit {})
3294 } else {
3295 self.cfg.add_edge(nb.index, target.index);
3296 Op::Bra(OpBra {
3297 target: self.get_block_label(target),
3298 })
3299 };
3300 b.predicate(pred).push_op(op);
3301 }
3302
3303 fn parse_block(
3304 &mut self,
3305 ssa_alloc: &mut SSAValueAllocator,
3306 phi_map: &mut PhiAllocMap,
3307 nb: &nir_block,
3308 ) {
3309 let sm = self.sm;
3310 let mut b = SSAInstrBuilder::new(sm, ssa_alloc);
3311
3312 if self.sm.sm() >= 70 && nb.index == 0 && self.nir.info.shared_size > 0
3313 {
3314 // The blob seems to always do a BSYNC before accessing shared
3315 // memory. Perhaps this is to ensure that our allocation is
3316 // actually available and not in use by another thread?
3317 let label = self.label_alloc.alloc();
3318 let bar_clear = b.alloc_ssa(RegFile::Bar, 1);
3319
3320 b.push_op(OpBClear {
3321 dst: bar_clear.into(),
3322 });
3323
3324 let bar = b.alloc_ssa(RegFile::Bar, 1);
3325 b.push_op(OpBSSy {
3326 bar_out: bar.into(),
3327 bar_in: bar_clear.into(),
3328 cond: SrcRef::True.into(),
3329 target: label,
3330 });
3331
3332 b.push_op(OpBSync {
3333 bar: bar.into(),
3334 cond: SrcRef::True.into(),
3335 });
3336
3337 b.push_op(OpNop { label: Some(label) });
3338 }
3339
3340 let mut phi = OpPhiDsts::new();
3341 for ni in nb.iter_instr_list() {
3342 let Some(np) = ni.as_phi() else {
3343 break;
3344 };
3345
3346 if DEBUG.annotate() {
3347 let annotation = self
3348 .nir_instr_printer
3349 .instr_to_string(ni)
3350 .unwrap()
3351 .split_whitespace()
3352 .collect::<Vec<_>>()
3353 .join(" ");
3354 b.push_op(OpAnnotate {
3355 annotation: format!("generated by \"{}\"", annotation,),
3356 });
3357 }
3358
3359 let uniform = !nb.divergent
3360 && self.sm.sm() >= 75
3361 && !DEBUG.no_ugpr()
3362 && !np.def.divergent;
3363
3364 // This should be ensured by nak_nir_lower_cf()
3365 if uniform {
3366 for ps in np.iter_srcs() {
3367 assert!(!ps.pred().divergent);
3368 }
3369 }
3370
3371 let mut b = UniformBuilder::new(&mut b, uniform);
3372 let dst = alloc_ssa_for_nir(&mut b, np.def.as_def());
3373 for i in 0..dst.len() {
3374 let phi_id = phi_map.get_phi_id(np, i.try_into().unwrap());
3375 phi.dsts.push(phi_id, dst[i].into());
3376 }
3377 self.set_ssa(np.def.as_def(), dst);
3378 }
3379
3380 if !phi.dsts.is_empty() {
3381 b.push_op(phi);
3382 }
3383
3384 if self.sm.sm() < 75 && nb.cf_node.prev().is_none() {
3385 if let Some(_) = nb.parent().as_loop() {
3386 b.push_op(OpPCnt {
3387 target: self.get_block_label(nb),
3388 });
3389 self.push_crs(nb, SyncType::Cont);
3390 }
3391 }
3392
3393 let mut goto = None;
3394 for ni in nb.iter_instr_list() {
3395 if DEBUG.annotate() && ni.type_ != nir_instr_type_phi {
3396 let annotation = self
3397 .nir_instr_printer
3398 .instr_to_string(ni)
3399 .unwrap()
3400 .split_whitespace()
3401 .collect::<Vec<_>>()
3402 .join(" ");
3403 b.push_op(OpAnnotate {
3404 annotation: format!("generated by \"{}\"", annotation,),
3405 });
3406 }
3407
3408 let uniform = !nb.divergent
3409 && self.sm.sm() >= 75
3410 && !DEBUG.no_ugpr()
3411 && ni.def().is_some_and(|d| !d.divergent);
3412 let mut b = UniformBuilder::new(&mut b, uniform);
3413
3414 match ni.type_ {
3415 nir_instr_type_alu => {
3416 self.parse_alu(&mut b, ni.as_alu().unwrap())
3417 }
3418 nir_instr_type_jump => {
3419 let jump = ni.as_jump().unwrap();
3420 if jump.type_ == nir_jump_goto
3421 || jump.type_ == nir_jump_goto_if
3422 {
3423 goto = Some(jump);
3424 }
3425 }
3426 nir_instr_type_tex => {
3427 self.parse_tex(&mut b, ni.as_tex().unwrap())
3428 }
3429 nir_instr_type_intrinsic => {
3430 self.parse_intrinsic(&mut b, ni.as_intrinsic().unwrap())
3431 }
3432 nir_instr_type_load_const => {
3433 self.parse_load_const(&mut b, ni.as_load_const().unwrap())
3434 }
3435 nir_instr_type_undef => {
3436 self.parse_undef(&mut b, ni.as_undef().unwrap())
3437 }
3438 nir_instr_type_phi => (),
3439 _ => panic!("Unsupported instruction type"),
3440 }
3441 }
3442
3443 if self.sm.sm() < 70 {
3444 if let Some(ni) = nb.following_if() {
3445 let fb = ni.following_block();
3446 b.push_op(OpSSy {
3447 target: self.get_block_label(fb),
3448 });
3449 self.push_crs(fb, SyncType::Sync);
3450 } else if let Some(nl) = nb.following_loop() {
3451 let fb = nl.following_block();
3452 b.push_op(OpPBk {
3453 target: self.get_block_label(fb),
3454 });
3455 self.push_crs(fb, SyncType::Brk);
3456 }
3457 }
3458
3459 let succ = nb.successors();
3460 for sb in succ {
3461 let sb = match sb {
3462 Some(b) => b,
3463 None => continue,
3464 };
3465
3466 let mut phi = OpPhiSrcs::new();
3467
3468 for ni in sb.iter_instr_list() {
3469 let Some(np) = ni.as_phi() else {
3470 break;
3471 };
3472
3473 if DEBUG.annotate() {
3474 let annotation = self
3475 .nir_instr_printer
3476 .instr_to_string(ni)
3477 .unwrap()
3478 .split_whitespace()
3479 .collect::<Vec<_>>()
3480 .join(" ");
3481 b.push_op(OpAnnotate {
3482 annotation: format!("generated by \"{}\"", annotation,),
3483 });
3484 }
3485
3486 for ps in np.iter_srcs() {
3487 if ps.pred().index == nb.index {
3488 let src = *self.get_src(&ps.src).as_ssa().unwrap();
3489 for (i, src) in src.iter().enumerate() {
3490 let phi_id =
3491 phi_map.get_phi_id(np, i.try_into().unwrap());
3492 phi.srcs.push(phi_id, (*src).into());
3493 }
3494 break;
3495 }
3496 }
3497 }
3498
3499 if !phi.srcs.is_empty() {
3500 b.push_op(phi);
3501 }
3502 }
3503
3504 if let Some(goto) = goto {
3505 let target = goto.target().unwrap();
3506 if goto.type_ == nir_jump_goto {
3507 self.emit_jump(&mut b, nb, target);
3508 } else {
3509 let cond = self.get_ssa(goto.condition.as_def())[0];
3510 let else_target = goto.else_target().unwrap();
3511
3512 /* Next block in the NIR CF list */
3513 let next_block = nb.cf_node.next().unwrap().as_block().unwrap();
3514
3515 if else_target as *const _ == next_block as *const _ {
3516 self.emit_pred_jump(
3517 &mut b,
3518 nb,
3519 // This is the branch to jump to the else
3520 cond.into(),
3521 target,
3522 else_target,
3523 );
3524 } else if target as *const _ == next_block as *const _ {
3525 self.emit_pred_jump(
3526 &mut b,
3527 nb,
3528 Pred::from(cond).bnot(),
3529 else_target,
3530 target,
3531 );
3532 } else {
3533 panic!(
3534 "One of the two goto targets must be the next block in \
3535 the NIR CF list"
3536 );
3537 }
3538 }
3539 } else {
3540 if let Some(ni) = nb.following_if() {
3541 let cond = self.get_ssa(ni.condition.as_def())[0];
3542 self.emit_pred_jump(
3543 &mut b,
3544 nb,
3545 // This is the branch to jump to the else
3546 Pred::from(cond).bnot(),
3547 ni.first_else_block(),
3548 ni.first_then_block(),
3549 );
3550 } else {
3551 assert!(succ[1].is_none());
3552 let s0 = succ[0].unwrap();
3553 self.emit_jump(&mut b, nb, s0);
3554 }
3555 }
3556
3557 let bb = BasicBlock {
3558 label: self.get_block_label(nb),
3559 uniform: !nb.divergent,
3560 instrs: b.as_vec(),
3561 };
3562 self.cfg.add_node(nb.index, bb);
3563 }
3564
3565 fn parse_if(
3566 &mut self,
3567 ssa_alloc: &mut SSAValueAllocator,
3568 phi_map: &mut PhiAllocMap,
3569 ni: &nir_if,
3570 ) {
3571 self.parse_cf_list(ssa_alloc, phi_map, ni.iter_then_list());
3572 self.parse_cf_list(ssa_alloc, phi_map, ni.iter_else_list());
3573
3574 if self.sm.sm() < 70 {
3575 let next_block = ni.cf_node.next().unwrap().as_block().unwrap();
3576 self.pop_crs(next_block, SyncType::Sync);
3577 }
3578 }
3579
3580 fn parse_loop(
3581 &mut self,
3582 ssa_alloc: &mut SSAValueAllocator,
3583 phi_map: &mut PhiAllocMap,
3584 nl: &nir_loop,
3585 ) {
3586 self.parse_cf_list(ssa_alloc, phi_map, nl.iter_body());
3587
3588 if self.sm.sm() < 70 {
3589 let header = nl.iter_body().next().unwrap().as_block().unwrap();
3590 self.pop_crs(header, SyncType::Cont);
3591 let next_block = nl.cf_node.next().unwrap().as_block().unwrap();
3592 self.pop_crs(next_block, SyncType::Brk);
3593 }
3594 }
3595
3596 fn parse_cf_list(
3597 &mut self,
3598 ssa_alloc: &mut SSAValueAllocator,
3599 phi_map: &mut PhiAllocMap,
3600 list: ExecListIter<nir_cf_node>,
3601 ) {
3602 for node in list {
3603 match node.type_ {
3604 nir_cf_node_block => {
3605 let nb = node.as_block().unwrap();
3606 self.parse_block(ssa_alloc, phi_map, nb);
3607 }
3608 nir_cf_node_if => {
3609 let ni = node.as_if().unwrap();
3610 self.parse_if(ssa_alloc, phi_map, ni);
3611 }
3612 nir_cf_node_loop => {
3613 let nl = node.as_loop().unwrap();
3614 self.parse_loop(ssa_alloc, phi_map, nl);
3615 }
3616 _ => panic!("Invalid inner CF node type"),
3617 }
3618 }
3619 }
3620
3621 pub fn parse_function_impl(&mut self, nfi: &nir_function_impl) -> Function {
3622 let mut ssa_alloc = SSAValueAllocator::new();
3623 let end_nb = nfi.end_block();
3624 self.end_block_id = end_nb.index;
3625
3626 let mut phi_alloc = PhiAllocator::new();
3627 let mut phi_map = PhiAllocMap::new(&mut phi_alloc);
3628
3629 self.parse_cf_list(&mut ssa_alloc, &mut phi_map, nfi.iter_body());
3630
3631 let cfg = std::mem::take(&mut self.cfg).as_cfg();
3632 assert!(cfg.len() > 0);
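// Sanity-check the CFG: any block that falls through must be immediately
// followed by its first successor.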
3633 for i in 0..cfg.len() {
3634 if cfg[i].falls_through() {
3635 assert!(cfg.succ_indices(i)[0] == i + 1);
3636 }
3637 }
3638
3639 let mut f = Function {
3640 ssa_alloc: ssa_alloc,
3641 phi_alloc: phi_alloc,
3642 blocks: cfg,
3643 };
3644 f.repair_ssa();
3645 f
3646 }
3647
3648 pub fn parse_shader(mut self) -> Shader<'a> {
3649 let mut functions = Vec::new();
3650 for nf in self.nir.iter_functions() {
3651 if let Some(nfi) = nf.get_impl() {
3652 let f = self.parse_function_impl(nfi);
3653 functions.push(f);
3654 }
3655 }
3656
3657 // Tessellation evaluation shaders MUST claim to read gl_TessCoord or
3658 // the hardware will throw an SPH error.
3659 if matches!(self.info.stage, ShaderStageInfo::Tessellation(_)) {
3660 match &mut self.info.io {
3661 ShaderIoInfo::Vtg(io) => {
3662 let tc = NAK_ATTR_TESS_COORD;
3663 io.mark_attrs_written(tc..(tc + 8));
3664 }
3665 _ => panic!("Tessellation must have ShaderIoInfo::Vtg"),
3666 }
3667 }
3668
3669 Shader {
3670 sm: self.sm,
3671 info: self.info,
3672 functions: functions,
3673 }
3674 }
3675 }
3676
3677 pub fn nak_shader_from_nir<'a>(
3678 nak: &nak_compiler,
3679 ns: &'a nir_shader,
3680 sm: &'a dyn ShaderModel,
3681 ) -> Shader<'a> {
3682 ShaderFromNir::new(nak, ns, sm).parse_shader()
3683 }
3684