• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
// Copyright © 2022 Collabora, Ltd.
// SPDX-License-Identifier: MIT
3 
4 use crate::ir::*;
5 use crate::legalize::{
6     src_is_reg, src_is_upred_reg, swap_srcs_if_not_reg, LegalizeBuildHelpers,
7     LegalizeBuilder,
8 };
9 use bitview::*;
10 
11 use std::collections::HashMap;
12 use std::ops::Range;
13 
/// Shader-model description for SM70 and newer targets.
///
/// Only the numeric shader-model version is stored; `new` rejects anything
/// below 70.
pub struct ShaderModel70 {
    /// Numeric shader-model version (at least 70, enforced by `new`).
    sm: u8,
}

impl ShaderModel70 {
    /// Creates a model for shader-model version `sm`.
    ///
    /// # Panics
    ///
    /// Panics if `sm` is below 70.
    pub fn new(sm: u8) -> Self {
        assert!(sm >= 70);
        Self { sm }
    }

    /// Returns `true` when the uniform ALU is considered available, which is
    /// the case for `sm >= 75`.
    fn has_uniform_alu(&self) -> bool {
        self.sm >= 75
    }
}
28 
29 impl ShaderModel for ShaderModel70 {
sm(&self) -> u830     fn sm(&self) -> u8 {
31         self.sm
32     }
33 
num_regs(&self, file: RegFile) -> u3234     fn num_regs(&self, file: RegFile) -> u32 {
35         match file {
36             RegFile::GPR => 255 - self.hw_reserved_gprs(),
37             RegFile::UGPR => {
38                 if self.has_uniform_alu() {
39                     63
40                 } else {
41                     0
42                 }
43             }
44             RegFile::Pred => 7,
45             RegFile::UPred => {
46                 if self.has_uniform_alu() {
47                     7
48                 } else {
49                     0
50                 }
51             }
52             RegFile::Carry => 0,
53             RegFile::Bar => 16,
54             RegFile::Mem => RegRef::MAX_IDX + 1,
55         }
56     }
57 
hw_reserved_gprs(&self) -> u3258     fn hw_reserved_gprs(&self) -> u32 {
59         // On Volta+, 2 GPRs get burned for the program counter - see the
60         // footnote on table 2 of the volta whitepaper
61         // https://images.nvidia.com/content/volta-architecture/pdf/volta-architecture-whitepaper.pdf
62         2
63     }
64 
crs_size(&self, max_crs_depth: u32) -> u3265     fn crs_size(&self, max_crs_depth: u32) -> u32 {
66         assert!(max_crs_depth == 0);
67         0
68     }
69 
op_can_be_uniform(&self, op: &Op) -> bool70     fn op_can_be_uniform(&self, op: &Op) -> bool {
71         if !self.has_uniform_alu() {
72             return false;
73         }
74 
75         match op {
76             Op::R2UR(_)
77             | Op::S2R(_)
78             | Op::BMsk(_)
79             | Op::BRev(_)
80             | Op::Flo(_)
81             | Op::IAdd3(_)
82             | Op::IAdd3X(_)
83             | Op::IMad(_)
84             | Op::IMad64(_)
85             | Op::ISetP(_)
86             | Op::Lop3(_)
87             | Op::Mov(_)
88             | Op::PLop3(_)
89             | Op::PopC(_)
90             | Op::Prmt(_)
91             | Op::PSetP(_)
92             | Op::Sel(_)
93             | Op::Shf(_)
94             | Op::Shl(_)
95             | Op::Shr(_)
96             | Op::Vote(_)
97             | Op::Copy(_)
98             | Op::Pin(_)
99             | Op::Unpin(_) => true,
100             Op::Ldc(op) => op.offset.is_zero(),
101             // UCLEA  USHL  USHR
102             _ => false,
103         }
104     }
105 
legalize_op(&self, b: &mut LegalizeBuilder, op: &mut Op)106     fn legalize_op(&self, b: &mut LegalizeBuilder, op: &mut Op) {
107         as_sm70_op_mut(op).legalize(b);
108     }
109 
encode_shader(&self, s: &Shader<'_>) -> Vec<u32>110     fn encode_shader(&self, s: &Shader<'_>) -> Vec<u32> {
111         encode_sm70_shader(self, s)
112     }
113 }
114 
115 /// A per-op trait that implements Volta+ opcode semantics
116 trait SM70Op {
legalize(&mut self, b: &mut LegalizeBuilder)117     fn legalize(&mut self, b: &mut LegalizeBuilder);
encode(&self, e: &mut SM70Encoder<'_>)118     fn encode(&self, e: &mut SM70Encoder<'_>);
119 }
120 
121 struct SM70Encoder<'a> {
122     sm: &'a ShaderModel70,
123     ip: usize,
124     labels: &'a HashMap<Label, usize>,
125     inst: [u32; 4],
126 }
127 
128 impl BitViewable for SM70Encoder<'_> {
bits(&self) -> usize129     fn bits(&self) -> usize {
130         BitView::new(&self.inst).bits()
131     }
132 
get_bit_range_u64(&self, range: Range<usize>) -> u64133     fn get_bit_range_u64(&self, range: Range<usize>) -> u64 {
134         BitView::new(&self.inst).get_bit_range_u64(range)
135     }
136 }
137 
138 impl BitMutViewable for SM70Encoder<'_> {
set_bit_range_u64(&mut self, range: Range<usize>, val: u64)139     fn set_bit_range_u64(&mut self, range: Range<usize>, val: u64) {
140         BitMutView::new(&mut self.inst).set_bit_range_u64(range, val);
141     }
142 }
143 
144 impl SetFieldU64 for SM70Encoder<'_> {
set_field_u64(&mut self, range: Range<usize>, val: u64)145     fn set_field_u64(&mut self, range: Range<usize>, val: u64) {
146         BitMutView::new(&mut self.inst).set_field_u64(range, val);
147     }
148 }
149 
150 impl SM70Encoder<'_> {
set_opcode(&mut self, opcode: u16)151     fn set_opcode(&mut self, opcode: u16) {
152         self.set_field(0..12, opcode);
153     }
154 
set_reg(&mut self, range: Range<usize>, reg: RegRef)155     fn set_reg(&mut self, range: Range<usize>, reg: RegRef) {
156         assert!(range.len() == 8);
157         assert!(reg.file() == RegFile::GPR);
158         self.set_field(range, reg.base_idx());
159     }
160 
set_ureg(&mut self, range: Range<usize>, reg: RegRef)161     fn set_ureg(&mut self, range: Range<usize>, reg: RegRef) {
162         assert!(self.sm.sm >= 75);
163         assert!(range.len() == 8);
164         assert!(reg.file() == RegFile::UGPR);
165         assert!(reg.base_idx() <= 63);
166         self.set_field(range, reg.base_idx());
167     }
168 
set_pred_reg(&mut self, range: Range<usize>, reg: RegRef)169     fn set_pred_reg(&mut self, range: Range<usize>, reg: RegRef) {
170         assert!(range.len() == 3);
171         assert!(reg.base_idx() <= 7);
172         assert!(reg.comps() == 1);
173         self.set_field(range, reg.base_idx());
174     }
175 
set_reg_src(&mut self, range: Range<usize>, src: Src)176     fn set_reg_src(&mut self, range: Range<usize>, src: Src) {
177         assert!(src.src_mod.is_none());
178         match src.src_ref {
179             SrcRef::Zero => self.set_reg(range, RegRef::zero(RegFile::GPR, 1)),
180             SrcRef::Reg(reg) => self.set_reg(range, reg),
181             _ => panic!("Not a register"),
182         }
183     }
184 
set_pred_dst(&mut self, range: Range<usize>, dst: Dst)185     fn set_pred_dst(&mut self, range: Range<usize>, dst: Dst) {
186         match dst {
187             Dst::None => {
188                 self.set_pred_reg(range, RegRef::zero(RegFile::Pred, 1));
189             }
190             Dst::Reg(reg) => self.set_pred_reg(range, reg),
191             _ => panic!("Not a register"),
192         }
193     }
194 
set_pred_src_file( &mut self, range: Range<usize>, not_bit: usize, src: Src, file: RegFile, )195     fn set_pred_src_file(
196         &mut self,
197         range: Range<usize>,
198         not_bit: usize,
199         src: Src,
200         file: RegFile,
201     ) {
202         // The default for predicates is true
203         let true_reg = RegRef::new(file, 7, 1);
204 
205         let (not, reg) = match src.src_ref {
206             SrcRef::True => (false, true_reg),
207             SrcRef::False => (true, true_reg),
208             SrcRef::Reg(reg) => {
209                 assert!(reg.file() == file);
210                 (false, reg)
211             }
212             _ => panic!("Not a register"),
213         };
214         self.set_pred_reg(range, reg);
215         self.set_bit(not_bit, not ^ src_mod_is_bnot(src.src_mod));
216     }
217 
set_pred_src(&mut self, range: Range<usize>, not_bit: usize, src: Src)218     fn set_pred_src(&mut self, range: Range<usize>, not_bit: usize, src: Src) {
219         self.set_pred_src_file(range, not_bit, src, RegFile::Pred);
220     }
221 
set_upred_src(&mut self, range: Range<usize>, not_bit: usize, src: Src)222     fn set_upred_src(&mut self, range: Range<usize>, not_bit: usize, src: Src) {
223         self.set_pred_src_file(range, not_bit, src, RegFile::UPred);
224     }
225 
set_src_cb(&mut self, range: Range<usize>, cx_bit: usize, cb: &CBufRef)226     fn set_src_cb(&mut self, range: Range<usize>, cx_bit: usize, cb: &CBufRef) {
227         let mut v = BitMutView::new_subset(self, range);
228         v.set_field(6..22, cb.offset);
229         match cb.buf {
230             CBuf::Binding(idx) => {
231                 v.set_field(22..27, idx);
232                 self.set_bit(cx_bit, false);
233             }
234             CBuf::BindlessUGPR(reg) => {
235                 assert!(reg.base_idx() <= 63);
236                 assert!(reg.file() == RegFile::UGPR);
237                 v.set_field(0..6, reg.base_idx());
238                 self.set_bit(cx_bit, true);
239             }
240             CBuf::BindlessSSA(_) => panic!("SSA values must be lowered"),
241         }
242     }
243 
set_pred(&mut self, pred: &Pred)244     fn set_pred(&mut self, pred: &Pred) {
245         assert!(!pred.is_false());
246         self.set_pred_reg(
247             12..15,
248             match pred.pred_ref {
249                 PredRef::None => RegRef::zero(RegFile::Pred, 1),
250                 PredRef::Reg(reg) => reg,
251                 PredRef::SSA(_) => panic!("SSA values must be lowered"),
252             },
253         );
254         self.set_bit(15, pred.pred_inv);
255     }
256 
set_dst(&mut self, dst: Dst)257     fn set_dst(&mut self, dst: Dst) {
258         match dst {
259             Dst::None => self.set_reg(16..24, RegRef::zero(RegFile::GPR, 1)),
260             Dst::Reg(reg) => self.set_reg(16..24, reg),
261             _ => panic!("Not a register"),
262         }
263     }
264 
set_udst(&mut self, dst: Dst)265     fn set_udst(&mut self, dst: Dst) {
266         match dst {
267             Dst::None => self.set_ureg(16..24, RegRef::zero(RegFile::UGPR, 1)),
268             Dst::Reg(reg) => self.set_ureg(16..24, reg),
269             _ => panic!("Not a register"),
270         }
271     }
272 
set_bar_reg(&mut self, range: Range<usize>, reg: RegRef)273     fn set_bar_reg(&mut self, range: Range<usize>, reg: RegRef) {
274         assert!(range.len() == 4);
275         assert!(reg.file() == RegFile::Bar);
276         assert!(reg.comps() == 1);
277         self.set_field(range, reg.base_idx());
278     }
279 
set_bar_dst(&mut self, range: Range<usize>, dst: Dst)280     fn set_bar_dst(&mut self, range: Range<usize>, dst: Dst) {
281         self.set_bar_reg(range, *dst.as_reg().unwrap());
282     }
283 
set_bar_src(&mut self, range: Range<usize>, src: Src)284     fn set_bar_src(&mut self, range: Range<usize>, src: Src) {
285         assert!(src.src_mod.is_none());
286         self.set_bar_reg(range, *src.src_ref.as_reg().unwrap());
287     }
288 
set_instr_deps(&mut self, deps: &InstrDeps)289     fn set_instr_deps(&mut self, deps: &InstrDeps) {
290         self.set_field(105..109, deps.delay);
291         self.set_bit(109, deps.yld);
292         self.set_field(110..113, deps.wr_bar().unwrap_or(7));
293         self.set_field(113..116, deps.rd_bar().unwrap_or(7));
294         self.set_field(116..122, deps.wt_bar_mask);
295         self.set_field(122..126, deps.reuse_mask);
296     }
297 }
298 
//
// Helpers for encoding of ALU instructions
//
302 
303 struct ALURegRef {
304     pub reg: RegRef,
305     pub abs: bool,
306     pub neg: bool,
307     pub swizzle: SrcSwizzle,
308 }
309 
310 struct ALUCBufRef {
311     pub cb: CBufRef,
312     pub abs: bool,
313     pub neg: bool,
314     pub swizzle: SrcSwizzle,
315 }
316 
317 enum ALUSrc {
318     None,
319     Imm32(u32),
320     Reg(ALURegRef),
321     UReg(ALURegRef),
322     CBuf(ALUCBufRef),
323 }
324 
src_is_zero_or_gpr(src: &Src) -> bool325 fn src_is_zero_or_gpr(src: &Src) -> bool {
326     match src.src_ref {
327         SrcRef::Zero => true,
328         SrcRef::Reg(reg) => reg.file() == RegFile::GPR,
329         _ => false,
330     }
331 }
332 
src_mod_has_abs(src_mod: SrcMod) -> bool333 fn src_mod_has_abs(src_mod: SrcMod) -> bool {
334     match src_mod {
335         SrcMod::None | SrcMod::FNeg | SrcMod::INeg | SrcMod::BNot => false,
336         SrcMod::FAbs | SrcMod::FNegAbs => true,
337     }
338 }
339 
src_mod_has_neg(src_mod: SrcMod) -> bool340 fn src_mod_has_neg(src_mod: SrcMod) -> bool {
341     match src_mod {
342         SrcMod::None | SrcMod::FAbs => false,
343         SrcMod::FNeg | SrcMod::FNegAbs | SrcMod::INeg | SrcMod::BNot => true,
344     }
345 }
346 
src_mod_is_bnot(src_mod: SrcMod) -> bool347 fn src_mod_is_bnot(src_mod: SrcMod) -> bool {
348     match src_mod {
349         SrcMod::None => false,
350         SrcMod::BNot => true,
351         _ => panic!("Not an predicate source modifier"),
352     }
353 }
354 
dst_is_bar(dst: Dst) -> bool355 fn dst_is_bar(dst: Dst) -> bool {
356     match dst {
357         Dst::None => false,
358         Dst::SSA(ssa) => ssa.file().unwrap() == RegFile::Bar,
359         Dst::Reg(reg) => reg.file() == RegFile::Bar,
360     }
361 }
362 
363 impl ALUSrc {
from_src(src: Option<&Src>, op_is_uniform: bool) -> ALUSrc364     fn from_src(src: Option<&Src>, op_is_uniform: bool) -> ALUSrc {
365         let Some(src) = src else {
366             return ALUSrc::None;
367         };
368 
369         match src.src_ref {
370             SrcRef::Zero | SrcRef::Reg(_) => {
371                 let reg = match src.src_ref {
372                     SrcRef::Zero => {
373                         let file = if op_is_uniform {
374                             RegFile::UGPR
375                         } else {
376                             RegFile::GPR
377                         };
378                         RegRef::zero(file, 1)
379                     }
380                     SrcRef::Reg(reg) => reg,
381                     _ => panic!("Invalid source ref"),
382                 };
383                 assert!(reg.comps() <= 2);
384                 let alu_ref = ALURegRef {
385                     reg: reg,
386                     abs: src_mod_has_abs(src.src_mod),
387                     neg: src_mod_has_neg(src.src_mod),
388                     swizzle: src.src_swizzle,
389                 };
390                 if op_is_uniform {
391                     assert!(reg.file() == RegFile::UGPR);
392                     ALUSrc::Reg(alu_ref)
393                 } else {
394                     match reg.file() {
395                         RegFile::GPR => ALUSrc::Reg(alu_ref),
396                         RegFile::UGPR => ALUSrc::UReg(alu_ref),
397                         _ => panic!("Invalid ALU register file"),
398                     }
399                 }
400             }
401             SrcRef::Imm32(i) => {
402                 assert!(src.src_mod.is_none());
403                 assert!(src.src_swizzle.is_none());
404                 ALUSrc::Imm32(i)
405             }
406             SrcRef::CBuf(cb) => {
407                 let alu_ref = ALUCBufRef {
408                     cb: cb,
409                     abs: src_mod_has_abs(src.src_mod),
410                     neg: src_mod_has_neg(src.src_mod),
411                     swizzle: src.src_swizzle,
412                 };
413                 ALUSrc::CBuf(alu_ref)
414             }
415             _ => panic!("Invalid ALU source"),
416         }
417     }
418 
has_src_mod(&self) -> bool419     pub fn has_src_mod(&self) -> bool {
420         match self {
421             ALUSrc::Reg(reg) | ALUSrc::UReg(reg) => reg.abs || reg.neg,
422             ALUSrc::CBuf(cb) => cb.abs || cb.neg,
423             _ => false,
424         }
425     }
426 }
427 
428 impl SM70Encoder<'_> {
set_swizzle(&mut self, range: Range<usize>, swizzle: SrcSwizzle)429     fn set_swizzle(&mut self, range: Range<usize>, swizzle: SrcSwizzle) {
430         assert!(range.len() == 2);
431 
432         self.set_field(
433             range,
434             match swizzle {
435                 SrcSwizzle::None => 0x00_u8,
436                 SrcSwizzle::Xx => 0x02_u8,
437                 SrcSwizzle::Yy => 0x03_u8,
438             },
439         );
440     }
441 
set_alu_reg( &mut self, range: Range<usize>, abs_bit: usize, neg_bit: usize, swizzle_range: Range<usize>, file: RegFile, is_fp16_alu: bool, has_mod: bool, reg: &ALURegRef, )442     fn set_alu_reg(
443         &mut self,
444         range: Range<usize>,
445         abs_bit: usize,
446         neg_bit: usize,
447         swizzle_range: Range<usize>,
448         file: RegFile,
449         is_fp16_alu: bool,
450         has_mod: bool,
451         reg: &ALURegRef,
452     ) {
453         match file {
454             RegFile::GPR => self.set_reg(range, reg.reg),
455             RegFile::UGPR => self.set_ureg(range, reg.reg),
456             _ => panic!("Invalid ALU src register file"),
457         }
458 
459         if has_mod {
460             self.set_bit(abs_bit, reg.abs);
461             self.set_bit(neg_bit, reg.neg);
462         } else {
463             assert!(!reg.abs && !reg.neg);
464         }
465 
466         if is_fp16_alu {
467             self.set_swizzle(swizzle_range, reg.swizzle);
468         } else {
469             assert!(reg.swizzle == SrcSwizzle::None);
470         }
471     }
472 
encode_alu_src0( &mut self, src: &ALUSrc, file: RegFile, is_fp16_alu: bool, )473     fn encode_alu_src0(
474         &mut self,
475         src: &ALUSrc,
476         file: RegFile,
477         is_fp16_alu: bool,
478     ) {
479         let reg = match src {
480             ALUSrc::None => return,
481             ALUSrc::Reg(reg) => reg,
482             _ => panic!("Invalid ALU src"),
483         };
484         self.set_alu_reg(24..32, 73, 72, 74..76, file, is_fp16_alu, true, reg);
485     }
486 
encode_alu_src2( &mut self, src: &ALUSrc, file: RegFile, is_fp16_alu: bool, bit74_75_are_mod: bool, )487     fn encode_alu_src2(
488         &mut self,
489         src: &ALUSrc,
490         file: RegFile,
491         is_fp16_alu: bool,
492         bit74_75_are_mod: bool,
493     ) {
494         let reg = match src {
495             ALUSrc::None => return,
496             ALUSrc::Reg(reg) => reg,
497             _ => panic!("Invalid ALU src"),
498         };
499         self.set_alu_reg(
500             64..72,
501             74,
502             75,
503             81..83,
504             file,
505             is_fp16_alu,
506             bit74_75_are_mod,
507             reg,
508         );
509     }
510 
encode_alu_reg(&mut self, reg: &ALURegRef, is_fp16_alu: bool)511     fn encode_alu_reg(&mut self, reg: &ALURegRef, is_fp16_alu: bool) {
512         self.set_alu_reg(
513             32..40,
514             62,
515             63,
516             60..62,
517             RegFile::GPR,
518             is_fp16_alu,
519             true,
520             reg,
521         );
522     }
523 
encode_alu_ureg(&mut self, reg: &ALURegRef, is_fp16_alu: bool)524     fn encode_alu_ureg(&mut self, reg: &ALURegRef, is_fp16_alu: bool) {
525         self.set_ureg(32..40, reg.reg);
526         self.set_bit(62, reg.abs);
527         self.set_bit(63, reg.neg);
528 
529         if is_fp16_alu {
530             self.set_swizzle(60..62, reg.swizzle);
531         } else {
532             assert!(reg.swizzle == SrcSwizzle::None);
533         }
534 
535         self.set_bit(91, true);
536     }
537 
encode_alu_imm(&mut self, imm: &u32)538     fn encode_alu_imm(&mut self, imm: &u32) {
539         self.set_field(32..64, *imm);
540     }
541 
encode_alu_cb(&mut self, cb: &ALUCBufRef, is_fp16_alu: bool)542     fn encode_alu_cb(&mut self, cb: &ALUCBufRef, is_fp16_alu: bool) {
543         self.set_src_cb(32..59, 91, &cb.cb);
544         self.set_bit(62, cb.abs);
545         self.set_bit(63, cb.neg);
546 
547         if is_fp16_alu {
548             self.set_swizzle(60..62, cb.swizzle);
549         } else {
550             assert!(cb.swizzle == SrcSwizzle::None);
551         }
552     }
553 
encode_alu_base( &mut self, opcode: u16, dst: Option<&Dst>, src0: Option<&Src>, src1: Option<&Src>, src2: Option<&Src>, is_fp16_alu: bool, )554     fn encode_alu_base(
555         &mut self,
556         opcode: u16,
557         dst: Option<&Dst>,
558         src0: Option<&Src>,
559         src1: Option<&Src>,
560         src2: Option<&Src>,
561         is_fp16_alu: bool,
562     ) {
563         if let Some(dst) = dst {
564             self.set_dst(*dst);
565         }
566 
567         let src0 = ALUSrc::from_src(src0, false);
568         let src1 = ALUSrc::from_src(src1, false);
569         let src2 = ALUSrc::from_src(src2, false);
570 
571         // Bits 74..76 are used both for the swizzle on src0 and for the source
572         // modifier for the register source of src1 and src2.  When both are
573         // registers, it's used for src2.  The hardware elects to always support
574         // a swizzle and not support source modifiers in that case.
575         let bit74_75_are_mod = !is_fp16_alu
576             || matches!(src1, ALUSrc::None)
577             || matches!(src2, ALUSrc::None);
578         debug_assert!(bit74_75_are_mod || !src0.has_src_mod());
579 
580         self.encode_alu_src0(&src0, RegFile::GPR, is_fp16_alu);
581 
582         let form = match &src2 {
583             ALUSrc::None | ALUSrc::Reg(_) => {
584                 self.encode_alu_src2(
585                     &src2,
586                     RegFile::GPR,
587                     is_fp16_alu,
588                     bit74_75_are_mod,
589                 );
590                 match &src1 {
591                     ALUSrc::None => 1_u8, // form
592                     ALUSrc::Reg(reg1) => {
593                         self.encode_alu_reg(reg1, is_fp16_alu);
594                         1_u8 // form
595                     }
596                     ALUSrc::UReg(reg1) => {
597                         self.encode_alu_ureg(reg1, is_fp16_alu);
598                         6_u8 // form
599                     }
600                     ALUSrc::Imm32(imm1) => {
601                         self.encode_alu_imm(imm1);
602                         4_u8 // form
603                     }
604                     ALUSrc::CBuf(cb1) => {
605                         self.encode_alu_cb(cb1, is_fp16_alu);
606                         5_u8 // form
607                     }
608                 }
609             }
610             ALUSrc::UReg(reg2) => {
611                 self.encode_alu_ureg(reg2, is_fp16_alu);
612                 self.encode_alu_src2(
613                     &src1,
614                     RegFile::GPR,
615                     is_fp16_alu,
616                     bit74_75_are_mod,
617                 );
618                 7_u8 // form
619             }
620             ALUSrc::Imm32(imm2) => {
621                 self.encode_alu_imm(imm2);
622                 self.encode_alu_src2(
623                     &src1,
624                     RegFile::GPR,
625                     is_fp16_alu,
626                     bit74_75_are_mod,
627                 );
628                 2_u8 // form
629             }
630             ALUSrc::CBuf(cb2) => {
631                 // TODO set_src_cx
632                 self.encode_alu_cb(cb2, is_fp16_alu);
633                 self.encode_alu_src2(
634                     &src1,
635                     RegFile::GPR,
636                     is_fp16_alu,
637                     bit74_75_are_mod,
638                 );
639                 3_u8 // form
640             }
641         };
642 
643         self.set_field(0..9, opcode);
644         self.set_field(9..12, form);
645     }
646 
encode_alu( &mut self, opcode: u16, dst: Option<&Dst>, src0: Option<&Src>, src1: Option<&Src>, src2: Option<&Src>, )647     fn encode_alu(
648         &mut self,
649         opcode: u16,
650         dst: Option<&Dst>,
651         src0: Option<&Src>,
652         src1: Option<&Src>,
653         src2: Option<&Src>,
654     ) {
655         self.encode_alu_base(opcode, dst, src0, src1, src2, false);
656     }
657 
encode_fp16_alu( &mut self, opcode: u16, dst: Option<&Dst>, src0: Option<&Src>, src1: Option<&Src>, src2: Option<&Src>, )658     fn encode_fp16_alu(
659         &mut self,
660         opcode: u16,
661         dst: Option<&Dst>,
662         src0: Option<&Src>,
663         src1: Option<&Src>,
664         src2: Option<&Src>,
665     ) {
666         self.encode_alu_base(opcode, dst, src0, src1, src2, true);
667     }
668 
encode_ualu( &mut self, opcode: u16, dst: Option<&Dst>, src0: Option<&Src>, src1: Option<&Src>, src2: Option<&Src>, )669     fn encode_ualu(
670         &mut self,
671         opcode: u16,
672         dst: Option<&Dst>,
673         src0: Option<&Src>,
674         src1: Option<&Src>,
675         src2: Option<&Src>,
676     ) {
677         if let Some(dst) = dst {
678             self.set_udst(*dst);
679         }
680 
681         let src0 = ALUSrc::from_src(src0, true);
682         let src1 = ALUSrc::from_src(src1, true);
683         let src2 = ALUSrc::from_src(src2, true);
684 
685         // All uniform ALU requires bit 91 set
686         self.set_bit(91, true);
687 
688         self.encode_alu_src0(&src0, RegFile::UGPR, false);
689         let form = match &src2 {
690             ALUSrc::None | ALUSrc::Reg(_) => {
691                 self.encode_alu_src2(&src2, RegFile::UGPR, false, true);
692                 match &src1 {
693                     ALUSrc::None => 1_u8, // form
694                     ALUSrc::Reg(reg1) => {
695                         self.encode_alu_ureg(reg1, false);
696                         1_u8 // form
697                     }
698                     ALUSrc::UReg(_) => panic!("UALU never has UReg"),
699                     ALUSrc::Imm32(imm1) => {
700                         self.encode_alu_imm(imm1);
701                         4_u8 // form
702                     }
703                     ALUSrc::CBuf(_) => panic!("UALU does not support cbufs"),
704                 }
705             }
706             ALUSrc::UReg(_) => panic!("UALU never has UReg"),
707             ALUSrc::Imm32(imm2) => {
708                 self.encode_alu_imm(imm2);
709                 self.encode_alu_src2(&src1, RegFile::UGPR, false, true);
710                 2_u8 // form
711             }
712             ALUSrc::CBuf(_) => panic!("UALU does not support cbufs"),
713         };
714 
715         self.set_field(0..9, opcode);
716         self.set_field(9..12, form);
717     }
718 
set_rnd_mode(&mut self, range: Range<usize>, rnd_mode: FRndMode)719     fn set_rnd_mode(&mut self, range: Range<usize>, rnd_mode: FRndMode) {
720         assert!(range.len() == 2);
721         self.set_field(
722             range,
723             match rnd_mode {
724                 FRndMode::NearestEven => 0_u8,
725                 FRndMode::NegInf => 1_u8,
726                 FRndMode::PosInf => 2_u8,
727                 FRndMode::Zero => 3_u8,
728             },
729         );
730     }
731 }
732 
//
// Legalization helpers
//
736 
op_gpr(op: &impl DstsAsSlice) -> RegFile737 fn op_gpr(op: &impl DstsAsSlice) -> RegFile {
738     if op.is_uniform() {
739         RegFile::UGPR
740     } else {
741         RegFile::GPR
742     }
743 }
744 
745 /// Helper to legalize extended or external instructions
746 ///
747 /// These are instructions which reach out external units such as load/store
748 /// and texture ops.  They typically can't take anything but GPRs and are the
749 /// only types of instructions that support vectors.  They also can never be
750 /// uniform so we always evict uniform sources.
751 ///
legalize_ext_instr(op: &mut impl SrcsAsSlice, b: &mut LegalizeBuilder)752 fn legalize_ext_instr(op: &mut impl SrcsAsSlice, b: &mut LegalizeBuilder) {
753     let src_types = op.src_types();
754     for (i, src) in op.srcs_as_mut_slice().iter_mut().enumerate() {
755         match src_types[i] {
756             SrcType::SSA | SrcType::GPR => match &mut src.src_ref {
757                 SrcRef::Zero | SrcRef::True | SrcRef::False => {
758                     assert!(src_types[i] != SrcType::SSA);
759                 }
760                 SrcRef::SSA(ssa) => {
761                     b.copy_ssa_ref_if_uniform(ssa);
762                 }
763                 _ => panic!("Unsupported source reference"),
764             },
765             SrcType::ALU
766             | SrcType::F16
767             | SrcType::F16v2
768             | SrcType::F32
769             | SrcType::F64
770             | SrcType::I32
771             | SrcType::B32 => {
772                 panic!("ALU srcs must be legalized explicitly");
773             }
774             SrcType::Pred => {
775                 panic!("Predicates must be legalized explicitly");
776             }
777             SrcType::Carry => {
778                 panic!("Carry is invalid on Volta+");
779             }
780             SrcType::Bar => (),
781         }
782     }
783 }
784 
//
// Implementations of SM70Op for each op we support on Volta+
//
788 
789 impl SM70Op for OpFAdd {
legalize(&mut self, b: &mut LegalizeBuilder)790     fn legalize(&mut self, b: &mut LegalizeBuilder) {
791         let gpr = op_gpr(self);
792         let [src0, src1] = &mut self.srcs;
793         swap_srcs_if_not_reg(src0, src1, gpr);
794         b.copy_alu_src_if_not_reg(src0, gpr, SrcType::F32);
795     }
796 
encode(&self, e: &mut SM70Encoder<'_>)797     fn encode(&self, e: &mut SM70Encoder<'_>) {
798         if src_is_zero_or_gpr(&self.srcs[1]) {
799             e.encode_alu(
800                 0x021,
801                 Some(&self.dst),
802                 Some(&self.srcs[0]),
803                 Some(&self.srcs[1]),
804                 None,
805             )
806         } else {
807             e.encode_alu(
808                 0x021,
809                 Some(&self.dst),
810                 Some(&self.srcs[0]),
811                 Some(&Src::new_zero()),
812                 Some(&self.srcs[1]),
813             )
814         };
815         e.set_bit(77, self.saturate);
816         e.set_rnd_mode(78..80, self.rnd_mode);
817         e.set_bit(80, self.ftz);
818     }
819 }
820 
821 impl SM70Op for OpFFma {
legalize(&mut self, b: &mut LegalizeBuilder)822     fn legalize(&mut self, b: &mut LegalizeBuilder) {
823         let gpr = op_gpr(self);
824         let [src0, src1, src2] = &mut self.srcs;
825         swap_srcs_if_not_reg(src0, src1, gpr);
826         b.copy_alu_src_if_not_reg(src0, gpr, SrcType::F32);
827         b.copy_alu_src_if_both_not_reg(src1, src2, gpr, SrcType::F32);
828     }
829 
encode(&self, e: &mut SM70Encoder<'_>)830     fn encode(&self, e: &mut SM70Encoder<'_>) {
831         e.encode_alu(
832             0x023,
833             Some(&self.dst),
834             Some(&self.srcs[0]),
835             Some(&self.srcs[1]),
836             Some(&self.srcs[2]),
837         );
838         e.set_bit(76, self.dnz);
839         e.set_bit(77, self.saturate);
840         e.set_rnd_mode(78..80, self.rnd_mode);
841         e.set_bit(80, self.ftz);
842     }
843 }
844 
845 impl SM70Op for OpFMnMx {
legalize(&mut self, b: &mut LegalizeBuilder)846     fn legalize(&mut self, b: &mut LegalizeBuilder) {
847         let gpr = op_gpr(self);
848         let [src0, src1] = &mut self.srcs;
849         swap_srcs_if_not_reg(src0, src1, gpr);
850         b.copy_alu_src_if_not_reg(src0, gpr, SrcType::F32);
851     }
852 
encode(&self, e: &mut SM70Encoder<'_>)853     fn encode(&self, e: &mut SM70Encoder<'_>) {
854         e.encode_alu(
855             0x009,
856             Some(&self.dst),
857             Some(&self.srcs[0]),
858             Some(&self.srcs[1]),
859             Some(&Src::new_zero()),
860         );
861         e.set_pred_src(87..90, 90, self.min);
862         e.set_bit(80, self.ftz);
863     }
864 }
865 
866 impl SM70Op for OpFMul {
legalize(&mut self, b: &mut LegalizeBuilder)867     fn legalize(&mut self, b: &mut LegalizeBuilder) {
868         let gpr = op_gpr(self);
869         let [src0, src1] = &mut self.srcs;
870         swap_srcs_if_not_reg(src0, src1, gpr);
871         b.copy_alu_src_if_not_reg(src0, gpr, SrcType::F32);
872     }
873 
encode(&self, e: &mut SM70Encoder<'_>)874     fn encode(&self, e: &mut SM70Encoder<'_>) {
875         e.encode_alu(
876             0x020,
877             Some(&self.dst),
878             Some(&self.srcs[0]),
879             Some(&self.srcs[1]),
880             Some(&Src::new_zero()),
881         );
882         e.set_bit(76, self.dnz);
883         e.set_bit(77, self.saturate);
884         e.set_rnd_mode(78..80, self.rnd_mode);
885         e.set_bit(80, self.ftz);
886         e.set_field(84..87, 0x4_u8); // TODO: PDIV
887     }
888 }
889 
890 impl SM70Encoder<'_> {
set_float_cmp_op(&mut self, range: Range<usize>, op: FloatCmpOp)891     fn set_float_cmp_op(&mut self, range: Range<usize>, op: FloatCmpOp) {
892         assert!(range.len() == 4);
893         self.set_field(
894             range,
895             match op {
896                 FloatCmpOp::OrdLt => 0x01_u8,
897                 FloatCmpOp::OrdEq => 0x02_u8,
898                 FloatCmpOp::OrdLe => 0x03_u8,
899                 FloatCmpOp::OrdGt => 0x04_u8,
900                 FloatCmpOp::OrdNe => 0x05_u8,
901                 FloatCmpOp::OrdGe => 0x06_u8,
902                 FloatCmpOp::UnordLt => 0x09_u8,
903                 FloatCmpOp::UnordEq => 0x0a_u8,
904                 FloatCmpOp::UnordLe => 0x0b_u8,
905                 FloatCmpOp::UnordGt => 0x0c_u8,
906                 FloatCmpOp::UnordNe => 0x0d_u8,
907                 FloatCmpOp::UnordGe => 0x0e_u8,
908                 FloatCmpOp::IsNum => 0x07_u8,
909                 FloatCmpOp::IsNan => 0x08_u8,
910             },
911         );
912     }
913 
set_pred_set_op(&mut self, range: Range<usize>, op: PredSetOp)914     fn set_pred_set_op(&mut self, range: Range<usize>, op: PredSetOp) {
915         assert!(range.len() == 2);
916         self.set_field(
917             range,
918             match op {
919                 PredSetOp::And => 0_u8,
920                 PredSetOp::Or => 1_u8,
921                 PredSetOp::Xor => 2_u8,
922             },
923         );
924     }
925 
set_int_cmp_op(&mut self, range: Range<usize>, op: IntCmpOp)926     fn set_int_cmp_op(&mut self, range: Range<usize>, op: IntCmpOp) {
927         assert!(range.len() == 3);
928         self.set_field(
929             range,
930             match op {
931                 IntCmpOp::Eq => 2_u8,
932                 IntCmpOp::Ne => 5_u8,
933                 IntCmpOp::Lt => 1_u8,
934                 IntCmpOp::Le => 3_u8,
935                 IntCmpOp::Gt => 4_u8,
936                 IntCmpOp::Ge => 6_u8,
937             },
938         );
939     }
940 }
941 
942 impl SM70Op for OpFSet {
legalize(&mut self, b: &mut LegalizeBuilder)943     fn legalize(&mut self, b: &mut LegalizeBuilder) {
944         let gpr = op_gpr(self);
945         let [src0, src1] = &mut self.srcs;
946         if !src_is_reg(src0, gpr) && src_is_reg(src1, gpr) {
947             std::mem::swap(src0, src1);
948             self.cmp_op = self.cmp_op.flip();
949         }
950         b.copy_alu_src_if_not_reg(src0, gpr, SrcType::F32);
951     }
952 
encode(&self, e: &mut SM70Encoder<'_>)953     fn encode(&self, e: &mut SM70Encoder<'_>) {
954         e.encode_alu(
955             0x00a,
956             Some(&self.dst),
957             Some(&self.srcs[0]),
958             Some(&self.srcs[1]),
959             None,
960         );
961         e.set_float_cmp_op(76..80, self.cmp_op);
962         e.set_bit(80, self.ftz);
963         e.set_field(87..90, 0x7_u8); // TODO: src predicate
964     }
965 }
966 
967 impl SM70Op for OpFSetP {
legalize(&mut self, b: &mut LegalizeBuilder)968     fn legalize(&mut self, b: &mut LegalizeBuilder) {
969         let gpr = op_gpr(self);
970         let [src0, src1] = &mut self.srcs;
971         if !src_is_reg(src0, gpr) && src_is_reg(src1, gpr) {
972             std::mem::swap(src0, src1);
973             self.cmp_op = self.cmp_op.flip();
974         }
975         b.copy_alu_src_if_not_reg(src0, gpr, SrcType::F32);
976     }
977 
encode(&self, e: &mut SM70Encoder<'_>)978     fn encode(&self, e: &mut SM70Encoder<'_>) {
979         e.encode_alu(
980             0x00b,
981             None,
982             Some(&self.srcs[0]),
983             Some(&self.srcs[1]),
984             None,
985         );
986 
987         e.set_pred_set_op(74..76, self.set_op);
988         e.set_float_cmp_op(76..80, self.cmp_op);
989         e.set_bit(80, self.ftz);
990 
991         e.set_pred_dst(81..84, self.dst);
992         e.set_pred_dst(84..87, Dst::None); // dst1
993 
994         e.set_pred_src(87..90, 90, self.accum);
995     }
996 }
997 
998 impl SM70Op for OpFSwzAdd {
legalize(&mut self, b: &mut LegalizeBuilder)999     fn legalize(&mut self, b: &mut LegalizeBuilder) {
1000         let gpr = op_gpr(self);
1001         let [src0, src1] = &mut self.srcs;
1002         b.copy_alu_src_if_not_reg(src0, gpr, SrcType::F32);
1003         b.copy_alu_src_if_not_reg(src1, gpr, SrcType::F32);
1004     }
1005 
encode(&self, e: &mut SM70Encoder<'_>)1006     fn encode(&self, e: &mut SM70Encoder<'_>) {
1007         e.set_opcode(0x822);
1008         e.set_dst(self.dst);
1009 
1010         e.set_reg_src(24..32, self.srcs[0]);
1011         e.set_reg_src(64..72, self.srcs[1]);
1012 
1013         let mut subop = 0x0_u8;
1014 
1015         for (i, swz_op) in self.ops.iter().enumerate() {
1016             let swz_op = match swz_op {
1017                 FSwzAddOp::Add => 0,
1018                 FSwzAddOp::SubRight => 2,
1019                 FSwzAddOp::SubLeft => 1,
1020                 FSwzAddOp::MoveLeft => 3,
1021             };
1022 
1023             subop |= swz_op << ((self.ops.len() - i - 1) * 2);
1024         }
1025 
1026         e.set_field(32..40, subop);
1027 
1028         e.set_bit(77, false); // NDV
1029         e.set_rnd_mode(78..80, self.rnd_mode);
1030         e.set_bit(80, self.ftz);
1031     }
1032 }
1033 
1034 impl SM70Op for OpMuFu {
legalize(&mut self, _b: &mut LegalizeBuilder)1035     fn legalize(&mut self, _b: &mut LegalizeBuilder) {
1036         // Nothing to do
1037     }
1038 
encode(&self, e: &mut SM70Encoder<'_>)1039     fn encode(&self, e: &mut SM70Encoder<'_>) {
1040         e.encode_alu(0x108, Some(&self.dst), None, Some(&self.src), None);
1041         e.set_field(
1042             74..80,
1043             match self.op {
1044                 MuFuOp::Cos => 0_u8,
1045                 MuFuOp::Sin => 1_u8,
1046                 MuFuOp::Exp2 => 2_u8,
1047                 MuFuOp::Log2 => 3_u8,
1048                 MuFuOp::Rcp => 4_u8,
1049                 MuFuOp::Rsq => 5_u8,
1050                 MuFuOp::Rcp64H => 6_u8,
1051                 MuFuOp::Rsq64H => 7_u8,
1052                 MuFuOp::Sqrt => 8_u8,
1053                 MuFuOp::Tanh => 9_u8,
1054             },
1055         );
1056     }
1057 }
1058 
1059 impl SM70Op for OpDAdd {
legalize(&mut self, b: &mut LegalizeBuilder)1060     fn legalize(&mut self, b: &mut LegalizeBuilder) {
1061         let gpr = op_gpr(self);
1062         let [src0, src1] = &mut self.srcs;
1063         swap_srcs_if_not_reg(src0, src1, gpr);
1064         b.copy_alu_src_if_not_reg(src0, gpr, SrcType::F64);
1065     }
1066 
encode(&self, e: &mut SM70Encoder<'_>)1067     fn encode(&self, e: &mut SM70Encoder<'_>) {
1068         e.encode_alu(
1069             0x029,
1070             Some(&self.dst),
1071             Some(&self.srcs[0]),
1072             None,
1073             Some(&self.srcs[1]),
1074         );
1075         e.set_rnd_mode(78..80, self.rnd_mode);
1076     }
1077 }
1078 
1079 impl SM70Op for OpDFma {
legalize(&mut self, b: &mut LegalizeBuilder)1080     fn legalize(&mut self, b: &mut LegalizeBuilder) {
1081         let gpr = op_gpr(self);
1082         let [src0, src1, src2] = &mut self.srcs;
1083         swap_srcs_if_not_reg(src0, src1, gpr);
1084         b.copy_alu_src_if_not_reg(src0, gpr, SrcType::F64);
1085         b.copy_alu_src_if_both_not_reg(src1, src2, gpr, SrcType::F64);
1086     }
1087 
encode(&self, e: &mut SM70Encoder<'_>)1088     fn encode(&self, e: &mut SM70Encoder<'_>) {
1089         e.encode_alu(
1090             0x02b,
1091             Some(&self.dst),
1092             Some(&self.srcs[0]),
1093             Some(&self.srcs[1]),
1094             Some(&self.srcs[2]),
1095         );
1096         e.set_rnd_mode(78..80, self.rnd_mode);
1097     }
1098 }
1099 
1100 impl SM70Op for OpDMul {
legalize(&mut self, b: &mut LegalizeBuilder)1101     fn legalize(&mut self, b: &mut LegalizeBuilder) {
1102         let gpr = op_gpr(self);
1103         let [src0, src1] = &mut self.srcs;
1104         swap_srcs_if_not_reg(src0, src1, gpr);
1105         b.copy_alu_src_if_not_reg(src0, gpr, SrcType::F64);
1106     }
1107 
encode(&self, e: &mut SM70Encoder<'_>)1108     fn encode(&self, e: &mut SM70Encoder<'_>) {
1109         e.encode_alu(
1110             0x028,
1111             Some(&self.dst),
1112             Some(&self.srcs[0]),
1113             Some(&self.srcs[1]),
1114             None,
1115         );
1116         e.set_rnd_mode(78..80, self.rnd_mode);
1117     }
1118 }
1119 
1120 impl SM70Op for OpDSetP {
legalize(&mut self, b: &mut LegalizeBuilder)1121     fn legalize(&mut self, b: &mut LegalizeBuilder) {
1122         let gpr = op_gpr(self);
1123         let [src0, src1] = &mut self.srcs;
1124         if !src_is_reg(src0, gpr) && src_is_reg(src1, gpr) {
1125             std::mem::swap(src0, src1);
1126             self.cmp_op = self.cmp_op.flip();
1127         }
1128         b.copy_alu_src_if_not_reg(src0, gpr, SrcType::F64);
1129     }
1130 
encode(&self, e: &mut SM70Encoder<'_>)1131     fn encode(&self, e: &mut SM70Encoder<'_>) {
1132         if src_is_zero_or_gpr(&self.srcs[1]) {
1133             e.encode_alu(
1134                 0x02a,
1135                 None,
1136                 Some(&self.srcs[0]),
1137                 Some(&self.srcs[1]),
1138                 None,
1139             )
1140         } else {
1141             e.encode_alu(
1142                 0x02a,
1143                 None,
1144                 Some(&self.srcs[0]),
1145                 None,
1146                 Some(&self.srcs[1]),
1147             )
1148         };
1149 
1150         e.set_pred_set_op(74..76, self.set_op);
1151         e.set_float_cmp_op(76..80, self.cmp_op);
1152 
1153         e.set_pred_dst(81..84, self.dst);
1154         e.set_pred_dst(84..87, Dst::None); /* dst1 */
1155 
1156         e.set_pred_src(87..90, 90, self.accum);
1157     }
1158 }
1159 
1160 impl SM70Op for OpHAdd2 {
legalize(&mut self, b: &mut LegalizeBuilder)1161     fn legalize(&mut self, b: &mut LegalizeBuilder) {
1162         let gpr = op_gpr(self);
1163         let [src0, src1] = &mut self.srcs;
1164         swap_srcs_if_not_reg(src0, src1, gpr);
1165         b.copy_alu_src_if_not_reg(src0, gpr, SrcType::F16v2);
1166     }
1167 
encode(&self, e: &mut SM70Encoder<'_>)1168     fn encode(&self, e: &mut SM70Encoder<'_>) {
1169         if src_is_zero_or_gpr(&self.srcs[1]) {
1170             e.encode_fp16_alu(
1171                 0x030,
1172                 Some(&self.dst),
1173                 Some(&self.srcs[0]),
1174                 Some(&self.srcs[1]),
1175                 None,
1176             )
1177         } else {
1178             e.encode_fp16_alu(
1179                 0x030,
1180                 Some(&self.dst),
1181                 Some(&self.srcs[0]),
1182                 None,
1183                 Some(&self.srcs[1]),
1184             )
1185         };
1186 
1187         e.set_bit(77, self.saturate);
1188         e.set_bit(78, self.f32);
1189         e.set_bit(80, self.ftz);
1190         e.set_bit(85, false); // .BF16_V2 (SM90+)
1191     }
1192 }
1193 
1194 impl SM70Op for OpHFma2 {
legalize(&mut self, b: &mut LegalizeBuilder)1195     fn legalize(&mut self, b: &mut LegalizeBuilder) {
1196         let gpr = op_gpr(self);
1197         let [src0, src1, src2] = &mut self.srcs;
1198         swap_srcs_if_not_reg(src0, src1, gpr);
1199         b.copy_alu_src_if_not_reg(src0, gpr, SrcType::F16v2);
1200         b.copy_alu_src_if_not_reg(src1, gpr, SrcType::F16v2);
1201         b.copy_alu_src_if_both_not_reg(src1, src2, gpr, SrcType::F16v2);
1202 
1203         // HFMA2 doesn't have fabs or fneg on SRC2.
1204         if !src2.src_mod.is_none() {
1205             b.copy_alu_src_and_lower_fmod(src2, SrcType::F16v2);
1206         }
1207     }
1208 
encode(&self, e: &mut SM70Encoder<'_>)1209     fn encode(&self, e: &mut SM70Encoder<'_>) {
1210         // HFMA2 doesn't have fneg and fabs on SRC2.
1211         assert!(self.srcs[2].src_mod.is_none());
1212 
1213         e.encode_fp16_alu(
1214             0x031,
1215             Some(&self.dst),
1216             Some(&self.srcs[0]),
1217             Some(&self.srcs[1]),
1218             Some(&self.srcs[2]),
1219         );
1220 
1221         e.set_bit(76, self.dnz);
1222         e.set_bit(77, self.saturate);
1223         e.set_bit(78, self.f32);
1224         e.set_bit(79, false); // .RELU (SM86+)
1225         e.set_bit(80, self.ftz);
1226         e.set_bit(85, false); // .BF16_V2 (SM86+)
1227     }
1228 }
1229 
1230 impl SM70Op for OpHMul2 {
legalize(&mut self, b: &mut LegalizeBuilder)1231     fn legalize(&mut self, b: &mut LegalizeBuilder) {
1232         let gpr = op_gpr(self);
1233         let [src0, src1] = &mut self.srcs;
1234         swap_srcs_if_not_reg(src0, src1, gpr);
1235         b.copy_alu_src_if_not_reg(src0, gpr, SrcType::F16v2);
1236     }
1237 
encode(&self, e: &mut SM70Encoder<'_>)1238     fn encode(&self, e: &mut SM70Encoder<'_>) {
1239         e.encode_fp16_alu(
1240             0x032,
1241             Some(&self.dst),
1242             Some(&self.srcs[0]),
1243             Some(&self.srcs[1]),
1244             None,
1245         );
1246 
1247         e.set_bit(76, self.dnz);
1248         e.set_bit(77, self.saturate);
1249         e.set_bit(78, false); // .F32 (SM70-SM75)
1250         e.set_bit(79, false); // .RELU (SM86+)
1251         e.set_bit(80, self.ftz);
1252         e.set_bit(85, false); // .BF16_V2 (SM90+)
1253     }
1254 }
1255 
1256 impl SM70Op for OpHSet2 {
legalize(&mut self, b: &mut LegalizeBuilder)1257     fn legalize(&mut self, b: &mut LegalizeBuilder) {
1258         let gpr = op_gpr(self);
1259         let [src0, src1] = &mut self.srcs;
1260         if !src_is_reg(src0, gpr) && src_is_reg(src1, gpr) {
1261             std::mem::swap(src0, src1);
1262             self.cmp_op = self.cmp_op.flip();
1263         }
1264         b.copy_alu_src_if_not_reg(src0, gpr, SrcType::F16v2);
1265     }
1266 
encode(&self, e: &mut SM70Encoder<'_>)1267     fn encode(&self, e: &mut SM70Encoder<'_>) {
1268         if src_is_zero_or_gpr(&self.srcs[1]) {
1269             e.encode_fp16_alu(
1270                 0x033,
1271                 Some(&self.dst),
1272                 Some(&self.srcs[0]),
1273                 Some(&self.srcs[1]),
1274                 None,
1275             )
1276         } else {
1277             e.encode_fp16_alu(
1278                 0x033,
1279                 Some(&self.dst),
1280                 Some(&self.srcs[0]),
1281                 None,
1282                 Some(&self.srcs[1]),
1283             )
1284         };
1285 
1286         e.set_bit(65, false); // .BF16_V2 (SM90+)
1287         e.set_pred_set_op(69..71, self.set_op);
1288 
1289         // This differentiate between integer and fp16 output
1290         e.set_bit(71, true); // .BF
1291         e.set_float_cmp_op(76..80, self.cmp_op);
1292         e.set_bit(80, self.ftz);
1293 
1294         e.set_pred_src(87..90, 90, self.accum);
1295     }
1296 }
1297 
1298 impl SM70Op for OpHSetP2 {
legalize(&mut self, b: &mut LegalizeBuilder)1299     fn legalize(&mut self, b: &mut LegalizeBuilder) {
1300         let gpr = op_gpr(self);
1301         let [src0, src1] = &mut self.srcs;
1302         if !src_is_reg(src0, gpr) && src_is_reg(src1, gpr) {
1303             std::mem::swap(src0, src1);
1304             self.cmp_op = self.cmp_op.flip();
1305         }
1306         b.copy_alu_src_if_not_reg(src0, gpr, SrcType::F16v2);
1307     }
1308 
encode(&self, e: &mut SM70Encoder<'_>)1309     fn encode(&self, e: &mut SM70Encoder<'_>) {
1310         if src_is_zero_or_gpr(&self.srcs[1]) {
1311             e.encode_fp16_alu(
1312                 0x034,
1313                 None,
1314                 Some(&self.srcs[0]),
1315                 Some(&self.srcs[1]),
1316                 None,
1317             )
1318         } else {
1319             e.encode_fp16_alu(
1320                 0x034,
1321                 None,
1322                 Some(&self.srcs[0]),
1323                 None,
1324                 Some(&self.srcs[1]),
1325             )
1326         };
1327 
1328         e.set_bit(65, false); // .BF16_V2 (SM90+)
1329         e.set_pred_set_op(69..71, self.set_op);
1330         e.set_bit(71, self.horizontal); // .H_AND
1331         e.set_float_cmp_op(76..80, self.cmp_op);
1332         e.set_bit(80, self.ftz);
1333 
1334         e.set_pred_dst(81..84, self.dsts[0]);
1335         e.set_pred_dst(84..87, self.dsts[1]);
1336 
1337         e.set_pred_src(87..90, 90, self.accum);
1338     }
1339 }
1340 
1341 impl SM70Op for OpHMnMx2 {
legalize(&mut self, b: &mut LegalizeBuilder)1342     fn legalize(&mut self, b: &mut LegalizeBuilder) {
1343         let gpr = op_gpr(self);
1344         let [src0, src1] = &mut self.srcs;
1345         swap_srcs_if_not_reg(src0, src1, gpr);
1346         b.copy_alu_src_if_not_reg(src0, gpr, SrcType::F16v2);
1347     }
1348 
encode(&self, e: &mut SM70Encoder<'_>)1349     fn encode(&self, e: &mut SM70Encoder<'_>) {
1350         assert!(e.sm.sm >= 80);
1351 
1352         e.encode_fp16_alu(
1353             0x040,
1354             Some(&self.dst),
1355             Some(&self.srcs[0]),
1356             Some(&self.srcs[1]),
1357             None,
1358         );
1359 
1360         // This differentiate between integer and fp16 output
1361         e.set_bit(78, false); // .F32 (SM86)
1362         e.set_bit(80, self.ftz);
1363         e.set_bit(81, false); // .NAN
1364         e.set_bit(82, false); // .XORSIGN
1365         e.set_bit(85, false); // .BF16_V2
1366 
1367         e.set_pred_src(87..90, 90, self.min);
1368     }
1369 }
1370 
1371 impl SM70Op for OpBMsk {
legalize(&mut self, b: &mut LegalizeBuilder)1372     fn legalize(&mut self, b: &mut LegalizeBuilder) {
1373         let gpr = op_gpr(self);
1374         b.copy_alu_src_if_not_reg(&mut self.pos, gpr, SrcType::ALU);
1375     }
1376 
encode(&self, e: &mut SM70Encoder<'_>)1377     fn encode(&self, e: &mut SM70Encoder<'_>) {
1378         if self.is_uniform() {
1379             e.encode_ualu(
1380                 0x09b,
1381                 Some(&self.dst),
1382                 Some(&self.pos),
1383                 Some(&self.width),
1384                 None,
1385             )
1386         } else {
1387             e.encode_alu(
1388                 0x01b,
1389                 Some(&self.dst),
1390                 Some(&self.pos),
1391                 Some(&self.width),
1392                 None,
1393             )
1394         };
1395 
1396         e.set_bit(75, self.wrap);
1397     }
1398 }
1399 
1400 impl SM70Op for OpBRev {
legalize(&mut self, _b: &mut LegalizeBuilder)1401     fn legalize(&mut self, _b: &mut LegalizeBuilder) {
1402         // Nothing to do
1403     }
1404 
encode(&self, e: &mut SM70Encoder<'_>)1405     fn encode(&self, e: &mut SM70Encoder<'_>) {
1406         if self.is_uniform() {
1407             e.encode_ualu(0x0be, Some(&self.dst), None, Some(&self.src), None)
1408         } else {
1409             e.encode_alu(0x101, Some(&self.dst), None, Some(&self.src), None)
1410         }
1411     }
1412 }
1413 
1414 impl SM70Op for OpFlo {
legalize(&mut self, _b: &mut LegalizeBuilder)1415     fn legalize(&mut self, _b: &mut LegalizeBuilder) {
1416         // Nothing to do
1417     }
1418 
encode(&self, e: &mut SM70Encoder<'_>)1419     fn encode(&self, e: &mut SM70Encoder<'_>) {
1420         if self.is_uniform() {
1421             e.encode_ualu(0x0bd, Some(&self.dst), None, Some(&self.src), None)
1422         } else {
1423             e.encode_alu(0x100, Some(&self.dst), None, Some(&self.src), None)
1424         };
1425         e.set_pred_dst(81..84, Dst::None);
1426         e.set_field(74..75, self.return_shift_amount as u8);
1427         e.set_field(73..74, self.signed as u8);
1428         let not_mod = matches!(self.src.src_mod, SrcMod::BNot);
1429         e.set_field(63..64, not_mod)
1430     }
1431 }
1432 
1433 impl SM70Op for OpIAbs {
legalize(&mut self, _b: &mut LegalizeBuilder)1434     fn legalize(&mut self, _b: &mut LegalizeBuilder) {
1435         // Nothing to do
1436     }
1437 
encode(&self, e: &mut SM70Encoder<'_>)1438     fn encode(&self, e: &mut SM70Encoder<'_>) {
1439         e.encode_alu(0x013, Some(&self.dst), None, Some(&self.src), None)
1440     }
1441 }
1442 
1443 impl SM70Op for OpIAdd3 {
legalize(&mut self, b: &mut LegalizeBuilder)1444     fn legalize(&mut self, b: &mut LegalizeBuilder) {
1445         let gpr = op_gpr(self);
1446         let [src0, src1, src2] = &mut self.srcs;
1447         swap_srcs_if_not_reg(src0, src1, gpr);
1448         swap_srcs_if_not_reg(src2, src1, gpr);
1449         if !src0.src_mod.is_none() && !src1.src_mod.is_none() {
1450             assert!(self.overflow[0].is_none());
1451             assert!(self.overflow[1].is_none());
1452             let val = b.alloc_ssa(gpr, 1);
1453             b.push_op(OpIAdd3 {
1454                 srcs: [Src::new_zero(), *src0, Src::new_zero()],
1455                 overflow: [Dst::None; 2],
1456                 dst: val.into(),
1457             });
1458             *src0 = val.into();
1459         }
1460         b.copy_alu_src_if_not_reg(src0, gpr, SrcType::I32);
1461         b.copy_alu_src_if_both_not_reg(src1, src2, gpr, SrcType::I32);
1462         if !self.overflow[0].is_none() || !self.overflow[1].is_none() {
1463             b.copy_alu_src_if_ineg_imm(src1, gpr, SrcType::I32);
1464             b.copy_alu_src_if_ineg_imm(src2, gpr, SrcType::I32);
1465         }
1466     }
1467 
encode(&self, e: &mut SM70Encoder<'_>)1468     fn encode(&self, e: &mut SM70Encoder<'_>) {
1469         // Hardware requires at least one of these be unmodified
1470         assert!(
1471             self.srcs[0].src_mod.is_none() || self.srcs[1].src_mod.is_none()
1472         );
1473 
1474         if self.is_uniform() {
1475             e.encode_ualu(
1476                 0x090,
1477                 Some(&self.dst),
1478                 Some(&self.srcs[0]),
1479                 Some(&self.srcs[1]),
1480                 Some(&self.srcs[2]),
1481             )
1482         } else {
1483             e.encode_alu(
1484                 0x010,
1485                 Some(&self.dst),
1486                 Some(&self.srcs[0]),
1487                 Some(&self.srcs[1]),
1488                 Some(&self.srcs[2]),
1489             )
1490         };
1491 
1492         e.set_pred_src(87..90, 90, false.into());
1493         e.set_pred_src(77..80, 80, false.into());
1494 
1495         e.set_pred_dst(81..84, self.overflow[0]);
1496         e.set_pred_dst(84..87, self.overflow[1]);
1497     }
1498 }
1499 
1500 impl SM70Op for OpIAdd3X {
legalize(&mut self, b: &mut LegalizeBuilder)1501     fn legalize(&mut self, b: &mut LegalizeBuilder) {
1502         let gpr = op_gpr(self);
1503         let [src0, src1, src2] = &mut self.srcs;
1504         swap_srcs_if_not_reg(src0, src1, gpr);
1505         swap_srcs_if_not_reg(src2, src1, gpr);
1506         if !src0.src_mod.is_none() && !src1.src_mod.is_none() {
1507             let val = b.alloc_ssa(gpr, 1);
1508             b.push_op(OpIAdd3X {
1509                 srcs: [Src::new_zero(), *src0, Src::new_zero()],
1510                 overflow: [Dst::None; 2],
1511                 dst: val.into(),
1512                 carry: [false.into(); 2],
1513             });
1514             *src0 = val.into();
1515         }
1516         b.copy_alu_src_if_not_reg(src0, gpr, SrcType::B32);
1517         b.copy_alu_src_if_both_not_reg(src1, src2, gpr, SrcType::B32);
1518         if !self.is_uniform() {
1519             b.copy_src_if_upred(&mut self.carry[0]);
1520             b.copy_src_if_upred(&mut self.carry[1]);
1521         }
1522     }
1523 
encode(&self, e: &mut SM70Encoder<'_>)1524     fn encode(&self, e: &mut SM70Encoder<'_>) {
1525         // Hardware requires at least one of these be unmodified
1526         assert!(
1527             self.srcs[0].src_mod.is_none() || self.srcs[1].src_mod.is_none()
1528         );
1529 
1530         if self.is_uniform() {
1531             e.encode_ualu(
1532                 0x090,
1533                 Some(&self.dst),
1534                 Some(&self.srcs[0]),
1535                 Some(&self.srcs[1]),
1536                 Some(&self.srcs[2]),
1537             );
1538 
1539             e.set_upred_src(87..90, 90, self.carry[0]);
1540             e.set_upred_src(77..80, 80, self.carry[1]);
1541         } else {
1542             e.encode_alu(
1543                 0x010,
1544                 Some(&self.dst),
1545                 Some(&self.srcs[0]),
1546                 Some(&self.srcs[1]),
1547                 Some(&self.srcs[2]),
1548             );
1549 
1550             e.set_pred_src(87..90, 90, self.carry[0]);
1551             e.set_pred_src(77..80, 80, self.carry[1]);
1552         }
1553 
1554         e.set_bit(74, true); // .X
1555 
1556         e.set_pred_dst(81..84, self.overflow[0]);
1557         e.set_pred_dst(84..87, self.overflow[1]);
1558     }
1559 }
1560 
1561 impl SM70Op for OpIDp4 {
legalize(&mut self, b: &mut LegalizeBuilder)1562     fn legalize(&mut self, b: &mut LegalizeBuilder) {
1563         let gpr = op_gpr(self);
1564         let [src_type0, src_type1] = &mut self.src_types;
1565         let [src0, src1, src2] = &mut self.srcs;
1566         if swap_srcs_if_not_reg(src0, src1, gpr) {
1567             std::mem::swap(src_type0, src_type1);
1568         }
1569         b.copy_alu_src_if_not_reg(src0, gpr, SrcType::ALU);
1570         b.copy_alu_src_if_ineg_imm(src1, gpr, SrcType::I32);
1571         b.copy_alu_src_if_not_reg(src2, gpr, SrcType::ALU);
1572     }
1573 
encode(&self, e: &mut SM70Encoder<'_>)1574     fn encode(&self, e: &mut SM70Encoder<'_>) {
1575         e.encode_alu(
1576             0x026,
1577             Some(&self.dst),
1578             Some(&self.srcs[0]),
1579             Some(&self.srcs[1]),
1580             Some(&self.srcs[2]),
1581         );
1582 
1583         e.set_bit(
1584             73,
1585             match self.src_types[0] {
1586                 IntType::U8 => false,
1587                 IntType::I8 => true,
1588                 _ => panic!("Invalid DP4 source type"),
1589             },
1590         );
1591         e.set_bit(
1592             74,
1593             match self.src_types[1] {
1594                 IntType::U8 => false,
1595                 IntType::I8 => true,
1596                 _ => panic!("Invalid DP4 source type"),
1597             },
1598         );
1599     }
1600 }
1601 
1602 impl SM70Op for OpIMad {
legalize(&mut self, b: &mut LegalizeBuilder)1603     fn legalize(&mut self, b: &mut LegalizeBuilder) {
1604         let gpr = op_gpr(self);
1605         let [src0, src1, src2] = &mut self.srcs;
1606         swap_srcs_if_not_reg(src0, src1, gpr);
1607         b.copy_alu_src_if_not_reg(src0, gpr, SrcType::ALU);
1608         b.copy_alu_src_if_both_not_reg(src1, src2, gpr, SrcType::ALU);
1609     }
1610 
encode(&self, e: &mut SM70Encoder<'_>)1611     fn encode(&self, e: &mut SM70Encoder<'_>) {
1612         if self.is_uniform() {
1613             e.encode_ualu(
1614                 0x0a4,
1615                 Some(&self.dst),
1616                 Some(&self.srcs[0]),
1617                 Some(&self.srcs[1]),
1618                 Some(&self.srcs[2]),
1619             )
1620         } else {
1621             e.encode_alu(
1622                 0x024,
1623                 Some(&self.dst),
1624                 Some(&self.srcs[0]),
1625                 Some(&self.srcs[1]),
1626                 Some(&self.srcs[2]),
1627             )
1628         };
1629         e.set_pred_dst(81..84, Dst::None);
1630         e.set_bit(73, self.signed);
1631     }
1632 }
1633 
1634 impl SM70Op for OpIMad64 {
legalize(&mut self, b: &mut LegalizeBuilder)1635     fn legalize(&mut self, b: &mut LegalizeBuilder) {
1636         let gpr = op_gpr(self);
1637         let [src0, src1, src2] = &mut self.srcs;
1638         swap_srcs_if_not_reg(src0, src1, gpr);
1639         b.copy_alu_src_if_not_reg(src0, gpr, SrcType::ALU);
1640         b.copy_alu_src_if_both_not_reg(src1, src2, gpr, SrcType::ALU);
1641     }
1642 
encode(&self, e: &mut SM70Encoder<'_>)1643     fn encode(&self, e: &mut SM70Encoder<'_>) {
1644         if self.is_uniform() {
1645             e.encode_ualu(
1646                 0x0a5,
1647                 Some(&self.dst),
1648                 Some(&self.srcs[0]),
1649                 Some(&self.srcs[1]),
1650                 Some(&self.srcs[2]),
1651             )
1652         } else {
1653             e.encode_alu(
1654                 0x025,
1655                 Some(&self.dst),
1656                 Some(&self.srcs[0]),
1657                 Some(&self.srcs[1]),
1658                 Some(&self.srcs[2]),
1659             )
1660         };
1661         e.set_pred_dst(81..84, Dst::None);
1662         e.set_bit(73, self.signed);
1663     }
1664 }
1665 
1666 impl SM70Op for OpIMnMx {
legalize(&mut self, b: &mut LegalizeBuilder)1667     fn legalize(&mut self, b: &mut LegalizeBuilder) {
1668         let gpr = op_gpr(self);
1669         let [src0, src1] = &mut self.srcs;
1670         swap_srcs_if_not_reg(src0, src1, gpr);
1671         b.copy_alu_src_if_not_reg(src0, gpr, SrcType::ALU);
1672     }
1673 
encode(&self, e: &mut SM70Encoder<'_>)1674     fn encode(&self, e: &mut SM70Encoder<'_>) {
1675         e.encode_alu(
1676             0x017,
1677             Some(&self.dst),
1678             Some(&self.srcs[0]),
1679             Some(&self.srcs[1]),
1680             None,
1681         );
1682         e.set_pred_src(87..90, 90, self.min);
1683         e.set_bit(
1684             73,
1685             match self.cmp_type {
1686                 IntCmpType::U32 => false,
1687                 IntCmpType::I32 => true,
1688             },
1689         );
1690     }
1691 }
1692 
1693 impl SM70Op for OpISetP {
legalize(&mut self, b: &mut LegalizeBuilder)1694     fn legalize(&mut self, b: &mut LegalizeBuilder) {
1695         let gpr = op_gpr(self);
1696         let [src0, src1] = &mut self.srcs;
1697         if !src_is_reg(src0, gpr) && src_is_reg(src1, gpr) {
1698             std::mem::swap(src0, src1);
1699             self.cmp_op = self.cmp_op.flip();
1700         }
1701         b.copy_alu_src_if_not_reg(src0, gpr, SrcType::ALU);
1702         if !self.is_uniform() {
1703             b.copy_src_if_upred(&mut self.low_cmp);
1704             b.copy_src_if_upred(&mut self.accum);
1705         }
1706     }
1707 
encode(&self, e: &mut SM70Encoder<'_>)1708     fn encode(&self, e: &mut SM70Encoder<'_>) {
1709         if self.is_uniform() {
1710             e.encode_ualu(
1711                 0x08c,
1712                 None,
1713                 Some(&self.srcs[0]),
1714                 Some(&self.srcs[1]),
1715                 None,
1716             );
1717 
1718             e.set_upred_src(68..71, 71, self.low_cmp);
1719             e.set_upred_src(87..90, 90, self.accum);
1720         } else {
1721             e.encode_alu(
1722                 0x00c,
1723                 None,
1724                 Some(&self.srcs[0]),
1725                 Some(&self.srcs[1]),
1726                 None,
1727             );
1728 
1729             e.set_pred_src(68..71, 71, self.low_cmp);
1730             e.set_pred_src(87..90, 90, self.accum);
1731         }
1732 
1733         e.set_bit(72, self.ex);
1734 
1735         e.set_field(
1736             73..74,
1737             match self.cmp_type {
1738                 IntCmpType::U32 => 0_u32,
1739                 IntCmpType::I32 => 1_u32,
1740             },
1741         );
1742         e.set_pred_set_op(74..76, self.set_op);
1743         e.set_int_cmp_op(76..79, self.cmp_op);
1744 
1745         e.set_pred_dst(81..84, self.dst);
1746         e.set_pred_dst(84..87, Dst::None); // dst1
1747     }
1748 }
1749 
src_as_lop_imm(src: &Src) -> Option<bool>1750 fn src_as_lop_imm(src: &Src) -> Option<bool> {
1751     let x = match src.src_ref {
1752         SrcRef::Zero => false,
1753         SrcRef::True => true,
1754         SrcRef::False => false,
1755         SrcRef::Imm32(i) => {
1756             if i == 0 {
1757                 false
1758             } else if i == !0 {
1759                 true
1760             } else {
1761                 return None;
1762             }
1763         }
1764         _ => return None,
1765     };
1766     Some(x ^ src.src_mod.is_bnot())
1767 }
1768 
fold_lop_src(src: &Src, x: &mut u8)1769 fn fold_lop_src(src: &Src, x: &mut u8) {
1770     if let Some(i) = src_as_lop_imm(src) {
1771         *x = if i { !0 } else { 0 };
1772     }
1773     if src.src_mod.is_bnot() {
1774         *x = !*x;
1775     }
1776 }
1777 
1778 impl SM70Op for OpLop3 {
legalize(&mut self, b: &mut LegalizeBuilder)1779     fn legalize(&mut self, b: &mut LegalizeBuilder) {
1780         let gpr = op_gpr(self);
1781         // Fold constants and modifiers if we can
1782         self.op = LogicOp3::new_lut(&|mut x, mut y, mut z| {
1783             fold_lop_src(&self.srcs[0], &mut x);
1784             fold_lop_src(&self.srcs[1], &mut y);
1785             fold_lop_src(&self.srcs[2], &mut z);
1786             self.op.eval(x, y, z)
1787         });
1788         for src in &mut self.srcs {
1789             src.src_mod = SrcMod::None;
1790             if src_as_lop_imm(src).is_some() {
1791                 src.src_ref = SrcRef::Zero;
1792             }
1793         }
1794 
1795         let [src0, src1, src2] = &mut self.srcs;
1796         if !src_is_reg(src0, gpr) && src_is_reg(src1, gpr) {
1797             std::mem::swap(src0, src1);
1798             self.op = LogicOp3::new_lut(&|x, y, z| self.op.eval(y, x, z))
1799         }
1800         if !src_is_reg(src2, gpr) && src_is_reg(src1, gpr) {
1801             std::mem::swap(src2, src1);
1802             self.op = LogicOp3::new_lut(&|x, y, z| self.op.eval(x, z, y))
1803         }
1804 
1805         b.copy_alu_src_if_not_reg(src0, gpr, SrcType::ALU);
1806         b.copy_alu_src_if_not_reg(src2, gpr, SrcType::ALU);
1807     }
1808 
encode(&self, e: &mut SM70Encoder<'_>)1809     fn encode(&self, e: &mut SM70Encoder<'_>) {
1810         if self.is_uniform() {
1811             e.encode_ualu(
1812                 0x092,
1813                 Some(&self.dst),
1814                 Some(&self.srcs[0]),
1815                 Some(&self.srcs[1]),
1816                 Some(&self.srcs[2]),
1817             );
1818 
1819             e.set_upred_src(87..90, 90, SrcRef::False.into());
1820         } else {
1821             e.encode_alu(
1822                 0x012,
1823                 Some(&self.dst),
1824                 Some(&self.srcs[0]),
1825                 Some(&self.srcs[1]),
1826                 Some(&self.srcs[2]),
1827             );
1828 
1829             e.set_pred_src(87..90, 90, SrcRef::False.into());
1830         }
1831 
1832         e.set_field(72..80, self.op.lut);
1833         e.set_bit(80, false); // .PAND
1834         e.set_field(81..84, 7_u32); // pred
1835     }
1836 }
1837 
1838 impl SM70Op for OpPopC {
legalize(&mut self, _b: &mut LegalizeBuilder)1839     fn legalize(&mut self, _b: &mut LegalizeBuilder) {
1840         // Nothing to do
1841     }
1842 
encode(&self, e: &mut SM70Encoder<'_>)1843     fn encode(&self, e: &mut SM70Encoder<'_>) {
1844         if self.is_uniform() {
1845             e.encode_ualu(0x0bf, Some(&self.dst), None, Some(&self.src), None)
1846         } else {
1847             e.encode_alu(0x109, Some(&self.dst), None, Some(&self.src), None)
1848         };
1849 
1850         let not_mod = matches!(self.src.src_mod, SrcMod::BNot);
1851         e.set_field(63..64, not_mod);
1852     }
1853 }
1854 
1855 impl SM70Op for OpShf {
legalize(&mut self, b: &mut LegalizeBuilder)1856     fn legalize(&mut self, b: &mut LegalizeBuilder) {
1857         let gpr = op_gpr(self);
1858         b.copy_alu_src_if_not_reg(&mut self.low, gpr, SrcType::ALU);
1859         b.copy_alu_src_if_both_not_reg(
1860             &self.shift,
1861             &mut self.high,
1862             gpr,
1863             SrcType::ALU,
1864         );
1865     }
1866 
encode(&self, e: &mut SM70Encoder<'_>)1867     fn encode(&self, e: &mut SM70Encoder<'_>) {
1868         if self.is_uniform() {
1869             e.encode_ualu(
1870                 0x099,
1871                 Some(&self.dst),
1872                 Some(&self.low),
1873                 Some(&self.shift),
1874                 Some(&self.high),
1875             )
1876         } else {
1877             e.encode_alu(
1878                 0x019,
1879                 Some(&self.dst),
1880                 Some(&self.low),
1881                 Some(&self.shift),
1882                 Some(&self.high),
1883             )
1884         };
1885 
1886         e.set_field(
1887             73..75,
1888             match self.data_type {
1889                 IntType::I64 => 0_u8,
1890                 IntType::U64 => 1_u8,
1891                 IntType::I32 => 2_u8,
1892                 IntType::U32 => 3_u8,
1893                 _ => panic!("Invalid shift data type"),
1894             },
1895         );
1896         e.set_bit(75, self.wrap);
1897         e.set_bit(76, self.right);
1898         e.set_bit(80, self.dst_high);
1899     }
1900 }
1901 
1902 impl SM70Op for OpF2F {
legalize(&mut self, _b: &mut LegalizeBuilder)1903     fn legalize(&mut self, _b: &mut LegalizeBuilder) {
1904         // Nothing to do
1905     }
1906 
encode(&self, e: &mut SM70Encoder<'_>)1907     fn encode(&self, e: &mut SM70Encoder<'_>) {
1908         assert!(!self.integer_rnd);
1909         if self.src_type.bits() <= 32 && self.dst_type.bits() <= 32 {
1910             e.encode_alu(0x104, Some(&self.dst), None, Some(&self.src), None)
1911         } else {
1912             e.encode_alu(0x110, Some(&self.dst), None, Some(&self.src), None)
1913         };
1914 
1915         if self.high {
1916             e.set_field(60..62, 1_u8); // .H1
1917         }
1918 
1919         e.set_field(75..77, (self.dst_type.bits() / 8).ilog2());
1920         e.set_rnd_mode(78..80, self.rnd_mode);
1921         e.set_bit(80, self.ftz);
1922         e.set_field(84..86, (self.src_type.bits() / 8).ilog2());
1923     }
1924 }
1925 
1926 impl SM70Op for OpF2FP {
legalize(&mut self, b: &mut LegalizeBuilder)1927     fn legalize(&mut self, b: &mut LegalizeBuilder) {
1928         let gpr = op_gpr(self);
1929         let [src0, src1] = &mut self.srcs;
1930         swap_srcs_if_not_reg(src0, src1, gpr);
1931 
1932         b.copy_alu_src_if_not_reg(src0, gpr, SrcType::ALU);
1933     }
1934 
encode(&self, e: &mut SM70Encoder<'_>)1935     fn encode(&self, e: &mut SM70Encoder<'_>) {
1936         e.encode_alu(
1937             0x03e,
1938             Some(&self.dst),
1939             Some(&self.srcs[0]),
1940             Some(&self.srcs[1]),
1941             Some(&Src::new_zero()),
1942         );
1943 
1944         // .MERGE_C behavior
1945         // Use src1 and src2, src0 is unused
1946         // src1 get converted and packed in the lower 16 bits of dest.
1947         // src2 lower or high 16 bits (decided by .H1 flag) get packed in the upper of dest.
1948         e.set_bit(78, false); // TODO: .MERGE_C
1949         e.set_bit(72, false); // .H1 (MERGE_C only)
1950         e.set_rnd_mode(79..81, self.rnd_mode);
1951     }
1952 }
1953 
1954 impl SM70Op for OpF2I {
legalize(&mut self, _b: &mut LegalizeBuilder)1955     fn legalize(&mut self, _b: &mut LegalizeBuilder) {
1956         // Nothing to do
1957     }
1958 
encode(&self, e: &mut SM70Encoder<'_>)1959     fn encode(&self, e: &mut SM70Encoder<'_>) {
1960         if self.src_type.bits() <= 32 && self.dst_type.bits() <= 32 {
1961             e.encode_alu(0x105, Some(&self.dst), None, Some(&self.src), None)
1962         } else {
1963             e.encode_alu(0x111, Some(&self.dst), None, Some(&self.src), None)
1964         };
1965 
1966         e.set_bit(72, self.dst_type.is_signed());
1967         e.set_field(75..77, (self.dst_type.bits() / 8).ilog2());
1968         e.set_bit(77, false); // NTZ
1969         e.set_rnd_mode(78..80, self.rnd_mode);
1970         e.set_bit(80, self.ftz);
1971         e.set_field(84..86, (self.src_type.bits() / 8).ilog2());
1972     }
1973 }
1974 
1975 impl SM70Op for OpI2F {
legalize(&mut self, _b: &mut LegalizeBuilder)1976     fn legalize(&mut self, _b: &mut LegalizeBuilder) {
1977         // Nothing to do
1978     }
1979 
encode(&self, e: &mut SM70Encoder<'_>)1980     fn encode(&self, e: &mut SM70Encoder<'_>) {
1981         if self.src_type.bits() <= 32 && self.dst_type.bits() <= 32 {
1982             e.encode_alu(0x106, Some(&self.dst), None, Some(&self.src), None)
1983         } else {
1984             e.encode_alu(0x112, Some(&self.dst), None, Some(&self.src), None)
1985         };
1986 
1987         e.set_field(60..62, 0_u8); // TODO: subop
1988         e.set_bit(74, self.src_type.is_signed());
1989         e.set_field(75..77, (self.dst_type.bits() / 8).ilog2());
1990         e.set_rnd_mode(78..80, self.rnd_mode);
1991         e.set_field(84..86, (self.src_type.bits() / 8).ilog2());
1992     }
1993 }
1994 
1995 impl SM70Op for OpFRnd {
legalize(&mut self, _b: &mut LegalizeBuilder)1996     fn legalize(&mut self, _b: &mut LegalizeBuilder) {
1997         // Nothing to do
1998     }
1999 
encode(&self, e: &mut SM70Encoder<'_>)2000     fn encode(&self, e: &mut SM70Encoder<'_>) {
2001         if self.src_type.bits() <= 32 && self.dst_type.bits() <= 32 {
2002             e.encode_alu(0x107, Some(&self.dst), None, Some(&self.src), None)
2003         } else {
2004             e.encode_alu(0x113, Some(&self.dst), None, Some(&self.src), None)
2005         };
2006 
2007         e.set_field(84..86, (self.src_type.bits() / 8).ilog2());
2008         e.set_bit(80, self.ftz);
2009         e.set_rnd_mode(78..80, self.rnd_mode);
2010         e.set_field(75..77, (self.dst_type.bits() / 8).ilog2());
2011     }
2012 }
2013 
2014 impl SM70Op for OpMov {
legalize(&mut self, _b: &mut LegalizeBuilder)2015     fn legalize(&mut self, _b: &mut LegalizeBuilder) {
2016         // Nothing to do
2017     }
2018 
encode(&self, e: &mut SM70Encoder<'_>)2019     fn encode(&self, e: &mut SM70Encoder<'_>) {
2020         if self.is_uniform() {
2021             e.set_opcode(0xc82);
2022             e.set_udst(self.dst);
2023 
2024             // umov is encoded like a non-uniform ALU op
2025             let src = ALUSrc::from_src(Some(&self.src), true);
2026             let form: u8 = match &src {
2027                 ALUSrc::Reg(reg) => {
2028                     e.encode_alu_ureg(reg, false);
2029                     0x6 // form
2030                 }
2031                 ALUSrc::Imm32(imm) => {
2032                     e.encode_alu_imm(imm);
2033                     0x4 // form
2034                 }
2035                 _ => panic!("Invalid umov src"),
2036             };
2037             e.set_field(9..12, form);
2038         } else {
2039             e.encode_alu(0x002, Some(&self.dst), None, Some(&self.src), None);
2040             e.set_field(72..76, self.quad_lanes);
2041         }
2042     }
2043 }
2044 
2045 impl SM70Op for OpPrmt {
legalize(&mut self, b: &mut LegalizeBuilder)2046     fn legalize(&mut self, b: &mut LegalizeBuilder) {
2047         let gpr = op_gpr(self);
2048         let [src0, src1] = &mut self.srcs;
2049         b.copy_alu_src_if_not_reg(src0, gpr, SrcType::ALU);
2050         b.copy_alu_src_if_not_reg(src1, gpr, SrcType::ALU);
2051     }
2052 
encode(&self, e: &mut SM70Encoder<'_>)2053     fn encode(&self, e: &mut SM70Encoder<'_>) {
2054         if self.is_uniform() {
2055             e.encode_ualu(
2056                 0x96,
2057                 Some(&self.dst),
2058                 Some(&self.srcs[0]),
2059                 Some(&self.sel),
2060                 Some(&self.srcs[1]),
2061             )
2062         } else {
2063             e.encode_alu(
2064                 0x16,
2065                 Some(&self.dst),
2066                 Some(&self.srcs[0]),
2067                 Some(&self.sel),
2068                 Some(&self.srcs[1]),
2069             )
2070         };
2071 
2072         e.set_field(
2073             72..75,
2074             match self.mode {
2075                 PrmtMode::Index => 0_u8,
2076                 PrmtMode::Forward4Extract => 1_u8,
2077                 PrmtMode::Backward4Extract => 2_u8,
2078                 PrmtMode::Replicate8 => 3_u8,
2079                 PrmtMode::EdgeClampLeft => 4_u8,
2080                 PrmtMode::EdgeClampRight => 5_u8,
2081                 PrmtMode::Replicate16 => 6_u8,
2082             },
2083         );
2084     }
2085 }
2086 
2087 impl SM70Op for OpSel {
legalize(&mut self, b: &mut LegalizeBuilder)2088     fn legalize(&mut self, b: &mut LegalizeBuilder) {
2089         let gpr = op_gpr(self);
2090         if !self.is_uniform() {
2091             b.copy_src_if_upred(&mut self.cond);
2092         }
2093         let [src0, src1] = &mut self.srcs;
2094         if swap_srcs_if_not_reg(src0, src1, gpr) {
2095             self.cond = self.cond.bnot();
2096         }
2097         b.copy_alu_src_if_not_reg(src0, gpr, SrcType::ALU);
2098     }
2099 
encode(&self, e: &mut SM70Encoder<'_>)2100     fn encode(&self, e: &mut SM70Encoder<'_>) {
2101         if self.is_uniform() {
2102             e.encode_ualu(
2103                 0x087,
2104                 Some(&self.dst),
2105                 Some(&self.srcs[0]),
2106                 Some(&self.srcs[1]),
2107                 None,
2108             );
2109 
2110             e.set_upred_src(87..90, 90, self.cond);
2111         } else {
2112             e.encode_alu(
2113                 0x007,
2114                 Some(&self.dst),
2115                 Some(&self.srcs[0]),
2116                 Some(&self.srcs[1]),
2117                 None,
2118             );
2119 
2120             e.set_pred_src(87..90, 90, self.cond);
2121         }
2122     }
2123 }
2124 
2125 impl SM70Op for OpShfl {
legalize(&mut self, b: &mut LegalizeBuilder)2126     fn legalize(&mut self, b: &mut LegalizeBuilder) {
2127         let gpr = op_gpr(self);
2128         b.copy_alu_src_if_not_reg(&mut self.src, gpr, SrcType::GPR);
2129         b.copy_alu_src_if_not_reg_or_imm(&mut self.lane, gpr, SrcType::ALU);
2130         b.copy_alu_src_if_not_reg_or_imm(&mut self.c, gpr, SrcType::ALU);
2131     }
2132 
encode(&self, e: &mut SM70Encoder<'_>)2133     fn encode(&self, e: &mut SM70Encoder<'_>) {
2134         assert!(self.lane.src_mod.is_none());
2135         assert!(self.c.src_mod.is_none());
2136 
2137         match &self.lane.src_ref {
2138             SrcRef::Zero | SrcRef::Reg(_) => match &self.c.src_ref {
2139                 SrcRef::Zero | SrcRef::Reg(_) => {
2140                     e.set_opcode(0x389);
2141                     e.set_reg_src(32..40, self.lane);
2142                     e.set_reg_src(64..72, self.c);
2143                 }
2144                 SrcRef::Imm32(imm_c) => {
2145                     e.set_opcode(0x589);
2146                     e.set_reg_src(32..40, self.lane);
2147                     e.set_field(40..53, *imm_c & 0x1f1f);
2148                 }
2149                 _ => panic!("Invalid instruction form"),
2150             },
2151             SrcRef::Imm32(imm_lane) => match &self.c.src_ref {
2152                 SrcRef::Zero | SrcRef::Reg(_) => {
2153                     e.set_opcode(0x989);
2154                     e.set_field(53..58, *imm_lane & 0x1f);
2155                     e.set_reg_src(64..72, self.c);
2156                 }
2157                 SrcRef::Imm32(imm_c) => {
2158                     e.set_opcode(0xf89);
2159                     e.set_field(40..53, *imm_c & 0x1f1f);
2160                     e.set_field(53..58, *imm_lane & 0x1f);
2161                 }
2162                 _ => panic!("Invalid instruction form"),
2163             },
2164             _ => panic!("Invalid instruction form"),
2165         };
2166 
2167         e.set_dst(self.dst);
2168         e.set_pred_dst(81..84, self.in_bounds);
2169         e.set_reg_src(24..32, self.src);
2170         e.set_field(
2171             58..60,
2172             match self.op {
2173                 ShflOp::Idx => 0_u8,
2174                 ShflOp::Up => 1_u8,
2175                 ShflOp::Down => 2_u8,
2176                 ShflOp::Bfly => 3_u8,
2177             },
2178         );
2179     }
2180 }
2181 
2182 impl SM70Op for OpPLop3 {
legalize(&mut self, b: &mut LegalizeBuilder)2183     fn legalize(&mut self, b: &mut LegalizeBuilder) {
2184         // Fold constants and modifiers if we can
2185         for lop in &mut self.ops {
2186             *lop = LogicOp3::new_lut(&|mut x, mut y, mut z| {
2187                 fold_lop_src(&self.srcs[0], &mut x);
2188                 fold_lop_src(&self.srcs[1], &mut y);
2189                 fold_lop_src(&self.srcs[2], &mut z);
2190                 lop.eval(x, y, z)
2191             });
2192         }
2193         for src in &mut self.srcs {
2194             src.src_mod = SrcMod::None;
2195             if src_as_lop_imm(src).is_some() {
2196                 src.src_ref = SrcRef::True;
2197             }
2198         }
2199 
2200         if !self.is_uniform() {
2201             // The warp form of plop3 allows a single uniform predicate in
2202             // src2. If we have a uniform predicate anywhere, try to move it
2203             // there.
2204             let [src0, src1, src2] = &mut self.srcs;
2205             if src_is_upred_reg(src0) && !src_is_upred_reg(src2) {
2206                 std::mem::swap(src0, src2);
2207                 for lop in &mut self.ops {
2208                     *lop = LogicOp3::new_lut(&|x, y, z| lop.eval(z, y, x))
2209                 }
2210             }
2211             if src_is_upred_reg(src1) && !src_is_upred_reg(src2) {
2212                 std::mem::swap(src1, src2);
2213                 for lop in &mut self.ops {
2214                     *lop = LogicOp3::new_lut(&|x, y, z| lop.eval(x, z, y))
2215                 }
2216             }
2217             b.copy_src_if_upred(src0);
2218             b.copy_src_if_upred(src1);
2219         }
2220     }
2221 
encode(&self, e: &mut SM70Encoder<'_>)2222     fn encode(&self, e: &mut SM70Encoder<'_>) {
2223         if self.is_uniform() {
2224             e.set_opcode(0x89c);
2225 
2226             e.set_upred_src(68..71, 71, self.srcs[2]);
2227             e.set_upred_src(77..80, 80, self.srcs[1]);
2228             e.set_upred_src(87..90, 90, self.srcs[0]);
2229         } else {
2230             e.set_opcode(0x81c);
2231 
2232             if self.srcs[2]
2233                 .src_ref
2234                 .as_reg()
2235                 .is_some_and(|r| r.is_uniform())
2236             {
2237                 e.set_upred_src(68..71, 71, self.srcs[2]);
2238                 e.set_bit(67, true);
2239             } else {
2240                 e.set_pred_src(68..71, 71, self.srcs[2]);
2241             }
2242             e.set_pred_src(77..80, 80, self.srcs[1]);
2243             e.set_pred_src(87..90, 90, self.srcs[0]);
2244         }
2245         e.set_field(16..24, self.ops[1].lut);
2246         e.set_field(64..67, self.ops[0].lut & 0x7);
2247         e.set_field(72..77, self.ops[0].lut >> 3);
2248 
2249         e.set_pred_dst(81..84, self.dsts[0]);
2250         e.set_pred_dst(84..87, self.dsts[1]);
2251     }
2252 }
2253 
2254 impl SM70Op for OpR2UR {
legalize(&mut self, _b: &mut LegalizeBuilder)2255     fn legalize(&mut self, _b: &mut LegalizeBuilder) {
2256         // Nothing to do
2257     }
2258 
encode(&self, e: &mut SM70Encoder<'_>)2259     fn encode(&self, e: &mut SM70Encoder<'_>) {
2260         e.set_opcode(0x3c2);
2261         e.set_udst(self.dst);
2262         e.set_reg_src(24..32, self.src);
2263         e.set_pred_dst(81..84, Dst::None);
2264     }
2265 }
2266 
2267 impl SM70Encoder<'_> {
set_tex_dim(&mut self, range: Range<usize>, dim: TexDim)2268     fn set_tex_dim(&mut self, range: Range<usize>, dim: TexDim) {
2269         assert!(range.len() == 3);
2270         self.set_field(
2271             range,
2272             match dim {
2273                 TexDim::_1D => 0_u8,
2274                 TexDim::Array1D => 4_u8,
2275                 TexDim::_2D => 1_u8,
2276                 TexDim::Array2D => 5_u8,
2277                 TexDim::_3D => 2_u8,
2278                 TexDim::Cube => 3_u8,
2279                 TexDim::ArrayCube => 7_u8,
2280             },
2281         );
2282     }
2283 
set_tex_lod_mode(&mut self, range: Range<usize>, lod_mode: TexLodMode)2284     fn set_tex_lod_mode(&mut self, range: Range<usize>, lod_mode: TexLodMode) {
2285         assert!(range.len() == 3);
2286         self.set_field(
2287             range,
2288             match lod_mode {
2289                 TexLodMode::Auto => 0_u8,
2290                 TexLodMode::Zero => 1_u8,
2291                 TexLodMode::Bias => 2_u8,
2292                 TexLodMode::Lod => 3_u8,
2293                 TexLodMode::Clamp => 4_u8,
2294                 TexLodMode::BiasClamp => 5_u8,
2295             },
2296         );
2297     }
2298 
set_image_dim(&mut self, range: Range<usize>, dim: ImageDim)2299     fn set_image_dim(&mut self, range: Range<usize>, dim: ImageDim) {
2300         assert!(range.len() == 3);
2301         self.set_field(
2302             range,
2303             match dim {
2304                 ImageDim::_1D => 0_u8,
2305                 ImageDim::_1DBuffer => 1_u8,
2306                 ImageDim::_1DArray => 2_u8,
2307                 ImageDim::_2D => 3_u8,
2308                 ImageDim::_2DArray => 4_u8,
2309                 ImageDim::_3D => 5_u8,
2310             },
2311         );
2312     }
2313 }
2314 
2315 impl SM70Op for OpTex {
legalize(&mut self, b: &mut LegalizeBuilder)2316     fn legalize(&mut self, b: &mut LegalizeBuilder) {
2317         legalize_ext_instr(self, b);
2318     }
2319 
encode(&self, e: &mut SM70Encoder<'_>)2320     fn encode(&self, e: &mut SM70Encoder<'_>) {
2321         e.set_opcode(0x361);
2322         e.set_bit(59, true); // .B
2323 
2324         e.set_dst(self.dsts[0]);
2325         if let Dst::Reg(reg) = self.dsts[1] {
2326             e.set_reg(64..72, reg);
2327         } else {
2328             e.set_field(64..72, 255_u8);
2329         }
2330         e.set_pred_dst(81..84, self.fault);
2331 
2332         e.set_reg_src(24..32, self.srcs[0]);
2333         e.set_reg_src(32..40, self.srcs[1]);
2334 
2335         e.set_tex_dim(61..64, self.dim);
2336         e.set_field(72..76, self.mask);
2337         e.set_bit(76, self.offset);
2338         e.set_bit(77, false); // ToDo: NDV
2339         e.set_bit(78, self.z_cmpr);
2340         e.set_field(84..87, 1);
2341         e.set_tex_lod_mode(87..90, self.lod_mode);
2342         e.set_bit(90, false); // TODO: .NODEP
2343     }
2344 }
2345 
2346 impl SM70Op for OpTld {
legalize(&mut self, b: &mut LegalizeBuilder)2347     fn legalize(&mut self, b: &mut LegalizeBuilder) {
2348         legalize_ext_instr(self, b);
2349     }
2350 
encode(&self, e: &mut SM70Encoder<'_>)2351     fn encode(&self, e: &mut SM70Encoder<'_>) {
2352         e.set_opcode(0x367);
2353         e.set_bit(59, true); // .B
2354 
2355         e.set_dst(self.dsts[0]);
2356         if let Dst::Reg(reg) = self.dsts[1] {
2357             e.set_reg(64..72, reg);
2358         } else {
2359             e.set_field(64..72, 255_u8);
2360         }
2361         e.set_pred_dst(81..84, self.fault);
2362 
2363         e.set_reg_src(24..32, self.srcs[0]);
2364         e.set_reg_src(32..40, self.srcs[1]);
2365 
2366         e.set_tex_dim(61..64, self.dim);
2367         e.set_field(72..76, self.mask);
2368         e.set_bit(76, self.offset);
2369         // bit 77: .CL
2370         e.set_bit(78, self.is_ms);
2371         // bits 79..81: .F16
2372         assert!(
2373             self.lod_mode == TexLodMode::Zero
2374                 || self.lod_mode == TexLodMode::Lod
2375         );
2376         e.set_tex_lod_mode(87..90, self.lod_mode);
2377         e.set_bit(90, false); // TODO: .NODEP
2378     }
2379 }
2380 
2381 impl SM70Op for OpTld4 {
legalize(&mut self, b: &mut LegalizeBuilder)2382     fn legalize(&mut self, b: &mut LegalizeBuilder) {
2383         legalize_ext_instr(self, b);
2384     }
2385 
encode(&self, e: &mut SM70Encoder<'_>)2386     fn encode(&self, e: &mut SM70Encoder<'_>) {
2387         e.set_opcode(0x364);
2388         e.set_bit(59, true); // .B
2389 
2390         e.set_dst(self.dsts[0]);
2391         if let Dst::Reg(reg) = self.dsts[1] {
2392             e.set_reg(64..72, reg);
2393         } else {
2394             e.set_field(64..72, 255_u8);
2395         }
2396         e.set_pred_dst(81..84, self.fault);
2397 
2398         e.set_reg_src(24..32, self.srcs[0]);
2399         e.set_reg_src(32..40, self.srcs[1]);
2400 
2401         e.set_tex_dim(61..64, self.dim);
2402         e.set_field(72..76, self.mask);
2403         e.set_field(
2404             76..78,
2405             match self.offset_mode {
2406                 Tld4OffsetMode::None => 0_u8,
2407                 Tld4OffsetMode::AddOffI => 1_u8,
2408                 Tld4OffsetMode::PerPx => 2_u8,
2409             },
2410         );
2411         // bit 77: .CL
2412         e.set_bit(78, self.z_cmpr);
2413         e.set_bit(84, true); // !.EF
2414         e.set_field(87..89, self.comp);
2415         e.set_bit(90, false); // TODO: .NODEP
2416     }
2417 }
2418 
2419 impl SM70Op for OpTmml {
legalize(&mut self, b: &mut LegalizeBuilder)2420     fn legalize(&mut self, b: &mut LegalizeBuilder) {
2421         legalize_ext_instr(self, b);
2422     }
2423 
encode(&self, e: &mut SM70Encoder<'_>)2424     fn encode(&self, e: &mut SM70Encoder<'_>) {
2425         e.set_opcode(0x36a);
2426         e.set_bit(59, true); // .B
2427 
2428         e.set_dst(self.dsts[0]);
2429         if let Dst::Reg(reg) = self.dsts[1] {
2430             e.set_reg(64..72, reg);
2431         } else {
2432             e.set_field(64..72, 255_u8);
2433         }
2434 
2435         e.set_reg_src(24..32, self.srcs[0]);
2436         e.set_reg_src(32..40, self.srcs[1]);
2437 
2438         e.set_tex_dim(61..64, self.dim);
2439         e.set_field(72..76, self.mask);
2440         e.set_bit(77, false); // ToDo: NDV
2441         e.set_bit(90, false); // TODO: .NODEP
2442     }
2443 }
2444 
2445 impl SM70Op for OpTxd {
legalize(&mut self, b: &mut LegalizeBuilder)2446     fn legalize(&mut self, b: &mut LegalizeBuilder) {
2447         legalize_ext_instr(self, b);
2448     }
2449 
encode(&self, e: &mut SM70Encoder<'_>)2450     fn encode(&self, e: &mut SM70Encoder<'_>) {
2451         e.set_opcode(0x36d);
2452         e.set_bit(59, true); // .B
2453 
2454         e.set_dst(self.dsts[0]);
2455         if let Dst::Reg(reg) = self.dsts[1] {
2456             e.set_reg(64..72, reg);
2457         } else {
2458             e.set_field(64..72, 255_u8);
2459         }
2460         e.set_pred_dst(81..84, self.fault);
2461 
2462         e.set_reg_src(24..32, self.srcs[0]);
2463         e.set_reg_src(32..40, self.srcs[1]);
2464 
2465         e.set_tex_dim(61..64, self.dim);
2466         e.set_field(72..76, self.mask);
2467         e.set_bit(76, self.offset);
2468         e.set_bit(77, false); // ToDo: NDV
2469         e.set_bit(90, false); // TODO: .NODEP
2470     }
2471 }
2472 
2473 impl SM70Op for OpTxq {
legalize(&mut self, b: &mut LegalizeBuilder)2474     fn legalize(&mut self, b: &mut LegalizeBuilder) {
2475         legalize_ext_instr(self, b);
2476     }
2477 
encode(&self, e: &mut SM70Encoder<'_>)2478     fn encode(&self, e: &mut SM70Encoder<'_>) {
2479         e.set_opcode(0x370);
2480         e.set_bit(59, true); // .B
2481 
2482         e.set_dst(self.dsts[0]);
2483         if let Dst::Reg(reg) = self.dsts[1] {
2484             e.set_reg(64..72, reg);
2485         } else {
2486             e.set_field(64..72, 255_u8);
2487         }
2488 
2489         e.set_reg_src(24..32, self.src);
2490         e.set_field(
2491             62..64,
2492             match self.query {
2493                 TexQuery::Dimension => 0_u8,
2494                 TexQuery::TextureType => 1_u8,
2495                 TexQuery::SamplerPos => 2_u8,
2496             },
2497         );
2498         e.set_field(72..76, self.mask);
2499     }
2500 }
2501 
2502 impl SM70Encoder<'_> {
set_mem_order(&mut self, order: &MemOrder)2503     fn set_mem_order(&mut self, order: &MemOrder) {
2504         if self.sm.sm < 80 {
2505             let scope = match order {
2506                 MemOrder::Constant => MemScope::System,
2507                 MemOrder::Weak => MemScope::CTA,
2508                 MemOrder::Strong(s) => *s,
2509             };
2510             self.set_field(
2511                 77..79,
2512                 match scope {
2513                     MemScope::CTA => 0_u8,
2514                     // SM => 1_u8,
2515                     MemScope::GPU => 2_u8,
2516                     MemScope::System => 3_u8,
2517                 },
2518             );
2519             self.set_field(
2520                 79..81,
2521                 match order {
2522                     MemOrder::Constant => 0_u8,
2523                     MemOrder::Weak => 1_u8,
2524                     MemOrder::Strong(_) => 2_u8,
2525                     // MMIO => 3_u8,
2526                 },
2527             );
2528         } else {
2529             self.set_field(
2530                 77..81,
2531                 match order {
2532                     MemOrder::Constant => 0x4_u8,
2533                     MemOrder::Weak => 0x0_u8,
2534                     MemOrder::Strong(MemScope::CTA) => 0x5_u8,
2535                     MemOrder::Strong(MemScope::GPU) => 0x7_u8,
2536                     MemOrder::Strong(MemScope::System) => 0xa_u8,
2537                 },
2538             );
2539         }
2540     }
2541 
set_eviction_priority(&mut self, pri: &MemEvictionPriority)2542     fn set_eviction_priority(&mut self, pri: &MemEvictionPriority) {
2543         self.set_field(
2544             84..86,
2545             match pri {
2546                 MemEvictionPriority::First => 0_u8,
2547                 MemEvictionPriority::Normal => 1_u8,
2548                 MemEvictionPriority::Last => 2_u8,
2549                 MemEvictionPriority::Unchanged => 3_u8,
2550             },
2551         );
2552     }
2553 
set_mem_type(&mut self, range: Range<usize>, mem_type: MemType)2554     fn set_mem_type(&mut self, range: Range<usize>, mem_type: MemType) {
2555         assert!(range.len() == 3);
2556         self.set_field(
2557             range,
2558             match mem_type {
2559                 MemType::U8 => 0_u8,
2560                 MemType::I8 => 1_u8,
2561                 MemType::U16 => 2_u8,
2562                 MemType::I16 => 3_u8,
2563                 MemType::B32 => 4_u8,
2564                 MemType::B64 => 5_u8,
2565                 MemType::B128 => 6_u8,
2566             },
2567         );
2568     }
2569 
set_mem_access(&mut self, access: &MemAccess)2570     fn set_mem_access(&mut self, access: &MemAccess) {
2571         self.set_field(
2572             72..73,
2573             match access.space.addr_type() {
2574                 MemAddrType::A32 => 0_u8,
2575                 MemAddrType::A64 => 1_u8,
2576             },
2577         );
2578         self.set_mem_type(73..76, access.mem_type);
2579         self.set_mem_order(&access.order);
2580         self.set_eviction_priority(&access.eviction_priority);
2581     }
2582 }
2583 
2584 impl SM70Op for OpSuLd {
legalize(&mut self, b: &mut LegalizeBuilder)2585     fn legalize(&mut self, b: &mut LegalizeBuilder) {
2586         legalize_ext_instr(self, b);
2587     }
2588 
encode(&self, e: &mut SM70Encoder<'_>)2589     fn encode(&self, e: &mut SM70Encoder<'_>) {
2590         e.set_opcode(0x998);
2591 
2592         e.set_dst(self.dst);
2593         e.set_reg_src(24..32, self.coord);
2594         e.set_reg_src(64..72, self.handle);
2595         e.set_pred_dst(81..84, self.fault);
2596 
2597         e.set_image_dim(61..64, self.image_dim);
2598         e.set_mem_order(&self.mem_order);
2599         e.set_eviction_priority(&self.mem_eviction_priority);
2600 
2601         assert!(self.mask == 0x1 || self.mask == 0x3 || self.mask == 0xf);
2602         e.set_field(72..76, self.mask);
2603     }
2604 }
2605 
2606 impl SM70Op for OpSuSt {
legalize(&mut self, b: &mut LegalizeBuilder)2607     fn legalize(&mut self, b: &mut LegalizeBuilder) {
2608         legalize_ext_instr(self, b);
2609     }
2610 
encode(&self, e: &mut SM70Encoder<'_>)2611     fn encode(&self, e: &mut SM70Encoder<'_>) {
2612         e.set_opcode(0x99c);
2613 
2614         e.set_reg_src(24..32, self.coord);
2615         e.set_reg_src(32..40, self.data);
2616         e.set_reg_src(64..72, self.handle);
2617 
2618         e.set_image_dim(61..64, self.image_dim);
2619         e.set_mem_order(&self.mem_order);
2620         e.set_eviction_priority(&self.mem_eviction_priority);
2621 
2622         assert!(self.mask == 0x1 || self.mask == 0x3 || self.mask == 0xf);
2623         e.set_field(72..76, self.mask);
2624     }
2625 }
2626 
2627 impl SM70Op for OpSuAtom {
legalize(&mut self, b: &mut LegalizeBuilder)2628     fn legalize(&mut self, b: &mut LegalizeBuilder) {
2629         legalize_ext_instr(self, b);
2630     }
2631 
encode(&self, e: &mut SM70Encoder<'_>)2632     fn encode(&self, e: &mut SM70Encoder<'_>) {
2633         if self.dst.is_none() {
2634             e.set_opcode(0x3a0);
2635             e.set_atom_op(87..90, self.atom_op);
2636         } else if let AtomOp::CmpExch(cmp_src) = self.atom_op {
2637             e.set_opcode(0x396);
2638             assert!(cmp_src == AtomCmpSrc::Packed);
2639         } else {
2640             e.set_opcode(0x394);
2641             e.set_atom_op(87..91, self.atom_op);
2642         };
2643 
2644         e.set_dst(self.dst);
2645         e.set_reg_src(24..32, self.coord);
2646         e.set_reg_src(32..40, self.data);
2647         e.set_reg_src(64..72, self.handle);
2648         e.set_pred_dst(81..84, self.fault);
2649 
2650         e.set_image_dim(61..64, self.image_dim);
2651         e.set_mem_order(&self.mem_order);
2652         e.set_eviction_priority(&self.mem_eviction_priority);
2653 
2654         e.set_bit(72, false); // .BA
2655         e.set_atom_type(73..76, self.atom_type);
2656     }
2657 }
2658 
2659 impl SM70Op for OpLd {
legalize(&mut self, b: &mut LegalizeBuilder)2660     fn legalize(&mut self, b: &mut LegalizeBuilder) {
2661         legalize_ext_instr(self, b);
2662     }
2663 
encode(&self, e: &mut SM70Encoder<'_>)2664     fn encode(&self, e: &mut SM70Encoder<'_>) {
2665         match self.access.space {
2666             MemSpace::Global(_) => {
2667                 e.set_opcode(0x381);
2668                 e.set_pred_dst(81..84, Dst::None);
2669                 e.set_mem_access(&self.access);
2670             }
2671             MemSpace::Local => {
2672                 e.set_opcode(0x983);
2673                 e.set_field(84..87, 1_u8);
2674 
2675                 e.set_mem_type(73..76, self.access.mem_type);
2676                 assert!(self.access.order == MemOrder::Strong(MemScope::CTA));
2677                 assert!(
2678                     self.access.eviction_priority
2679                         == MemEvictionPriority::Normal
2680                 );
2681             }
2682             MemSpace::Shared => {
2683                 e.set_opcode(0x984);
2684 
2685                 e.set_mem_type(73..76, self.access.mem_type);
2686                 assert!(self.access.order == MemOrder::Strong(MemScope::CTA));
2687                 assert!(
2688                     self.access.eviction_priority
2689                         == MemEvictionPriority::Normal
2690                 );
2691 
2692                 e.set_bit(87, false); // !.ZD - Returns a predicate?
2693             }
2694         }
2695 
2696         e.set_dst(self.dst);
2697         e.set_reg_src(24..32, self.addr);
2698         e.set_field(40..64, self.offset);
2699     }
2700 }
2701 
2702 impl SM70Op for OpLdc {
legalize(&mut self, b: &mut LegalizeBuilder)2703     fn legalize(&mut self, b: &mut LegalizeBuilder) {
2704         let gpr = op_gpr(self);
2705         b.copy_alu_src_if_not_reg(&mut self.offset, gpr, SrcType::GPR);
2706     }
2707 
encode(&self, e: &mut SM70Encoder<'_>)2708     fn encode(&self, e: &mut SM70Encoder<'_>) {
2709         let SrcRef::CBuf(cb) = &self.cb.src_ref else {
2710             panic!("LDC must take a cbuf source");
2711         };
2712 
2713         match cb.buf {
2714             CBuf::Binding(idx) => {
2715                 if self.is_uniform() {
2716                     e.set_opcode(0xab9);
2717                     e.set_udst(self.dst);
2718 
2719                     assert!(self.offset.is_zero());
2720                     assert!(self.mode == LdcMode::Indexed);
2721                 } else {
2722                     e.set_opcode(0xb82);
2723                     e.set_dst(self.dst);
2724 
2725                     e.set_reg_src(24..32, self.offset);
2726                     e.set_field(
2727                         78..80,
2728                         match self.mode {
2729                             LdcMode::Indexed => 0_u8,
2730                             LdcMode::IndexedLinear => 1_u8,
2731                             LdcMode::IndexedSegmented => 2_u8,
2732                             LdcMode::IndexedSegmentedLinear => 3_u8,
2733                         },
2734                     );
2735                 }
2736                 e.set_field(54..59, idx);
2737                 e.set_bit(91, false); // Bound
2738             }
2739             CBuf::BindlessUGPR(handle) => {
2740                 if self.is_uniform() {
2741                     e.set_opcode(0xab9);
2742                     e.set_udst(self.dst);
2743 
2744                     assert!(self.offset.is_zero());
2745                 } else {
2746                     e.set_opcode(0x582);
2747                     e.set_dst(self.dst);
2748 
2749                     e.set_reg_src(64..72, self.offset);
2750                 }
2751 
2752                 e.set_ureg(24..32, handle);
2753                 e.set_reg_src(64..72, self.offset);
2754                 assert!(self.mode == LdcMode::Indexed);
2755                 e.set_bit(91, true); // Bindless
2756             }
2757             CBuf::BindlessSSA(_) => panic!("SSA values must be lowered"),
2758         }
2759 
2760         e.set_field(38..54, cb.offset);
2761         e.set_mem_type(73..76, self.mem_type);
2762     }
2763 }
2764 
2765 impl SM70Op for OpSt {
legalize(&mut self, b: &mut LegalizeBuilder)2766     fn legalize(&mut self, b: &mut LegalizeBuilder) {
2767         legalize_ext_instr(self, b);
2768     }
2769 
encode(&self, e: &mut SM70Encoder<'_>)2770     fn encode(&self, e: &mut SM70Encoder<'_>) {
2771         match self.access.space {
2772             MemSpace::Global(_) => {
2773                 e.set_opcode(0x386);
2774                 e.set_mem_access(&self.access);
2775             }
2776             MemSpace::Local => {
2777                 e.set_opcode(0x387);
2778                 e.set_field(84..87, 1_u8);
2779 
2780                 e.set_mem_type(73..76, self.access.mem_type);
2781                 assert!(self.access.order == MemOrder::Strong(MemScope::CTA));
2782                 assert!(
2783                     self.access.eviction_priority
2784                         == MemEvictionPriority::Normal
2785                 );
2786             }
2787             MemSpace::Shared => {
2788                 e.set_opcode(0x388);
2789 
2790                 e.set_mem_type(73..76, self.access.mem_type);
2791                 assert!(self.access.order == MemOrder::Strong(MemScope::CTA));
2792                 assert!(
2793                     self.access.eviction_priority
2794                         == MemEvictionPriority::Normal
2795                 );
2796             }
2797         }
2798 
2799         e.set_reg_src(24..32, self.addr);
2800         e.set_reg_src(32..40, self.data);
2801         e.set_field(40..64, self.offset);
2802     }
2803 }
2804 
2805 impl SM70Encoder<'_> {
set_atom_op(&mut self, range: Range<usize>, atom_op: AtomOp)2806     fn set_atom_op(&mut self, range: Range<usize>, atom_op: AtomOp) {
2807         self.set_field(
2808             range,
2809             match atom_op {
2810                 AtomOp::Add => 0_u8,
2811                 AtomOp::Min => 1_u8,
2812                 AtomOp::Max => 2_u8,
2813                 AtomOp::Inc => 3_u8,
2814                 AtomOp::Dec => 4_u8,
2815                 AtomOp::And => 5_u8,
2816                 AtomOp::Or => 6_u8,
2817                 AtomOp::Xor => 7_u8,
2818                 AtomOp::Exch => 8_u8,
2819                 AtomOp::CmpExch(_) => panic!("CmpExch is a separate opcode"),
2820             },
2821         );
2822     }
2823 
set_atom_type(&mut self, range: Range<usize>, atom_type: AtomType)2824     fn set_atom_type(&mut self, range: Range<usize>, atom_type: AtomType) {
2825         assert!(range.len() == 3);
2826         self.set_field(
2827             range,
2828             match atom_type {
2829                 AtomType::U32 => 0_u8,
2830                 AtomType::I32 => 1_u8,
2831                 AtomType::U64 => 2_u8,
2832                 AtomType::F32 => 3_u8,
2833                 AtomType::F16x2 => 4_u8,
2834                 AtomType::I64 => 5_u8,
2835                 AtomType::F64 => 6_u8,
2836             },
2837         );
2838     }
2839 }
2840 
2841 impl SM70Op for OpAtom {
legalize(&mut self, b: &mut LegalizeBuilder)2842     fn legalize(&mut self, b: &mut LegalizeBuilder) {
2843         legalize_ext_instr(self, b);
2844     }
2845 
encode(&self, e: &mut SM70Encoder<'_>)2846     fn encode(&self, e: &mut SM70Encoder<'_>) {
2847         match self.mem_space {
2848             MemSpace::Global(_) => {
2849                 if self.dst.is_none() {
2850                     e.set_opcode(0x98e);
2851 
2852                     e.set_reg_src(32..40, self.data);
2853                     e.set_atom_op(87..90, self.atom_op);
2854                 } else if let AtomOp::CmpExch(cmp_src) = self.atom_op {
2855                     e.set_opcode(0x3a9);
2856 
2857                     assert!(cmp_src == AtomCmpSrc::Separate);
2858                     e.set_reg_src(32..40, self.cmpr);
2859                     e.set_reg_src(64..72, self.data);
2860                 } else {
2861                     e.set_opcode(0x3a8);
2862 
2863                     e.set_reg_src(32..40, self.data);
2864                     e.set_atom_op(87..91, self.atom_op);
2865                 }
2866 
2867                 e.set_pred_dst(81..84, Dst::None);
2868 
2869                 e.set_field(
2870                     72..73,
2871                     match self.mem_space.addr_type() {
2872                         MemAddrType::A32 => 0_u8,
2873                         MemAddrType::A64 => 1_u8,
2874                     },
2875                 );
2876 
2877                 e.set_mem_order(&self.mem_order);
2878                 e.set_eviction_priority(&self.mem_eviction_priority);
2879             }
2880             MemSpace::Local => panic!("Atomics do not support local"),
2881             MemSpace::Shared => {
2882                 if let AtomOp::CmpExch(cmp_src) = self.atom_op {
2883                     e.set_opcode(0x38d);
2884 
2885                     assert!(cmp_src == AtomCmpSrc::Separate);
2886                     e.set_reg_src(32..40, self.cmpr);
2887                     e.set_reg_src(64..72, self.data);
2888                 } else {
2889                     e.set_opcode(0x38c);
2890 
2891                     e.set_reg_src(32..40, self.data);
2892                     e.set_atom_op(87..91, self.atom_op);
2893                 }
2894 
2895                 assert!(self.mem_order == MemOrder::Strong(MemScope::CTA));
2896                 assert!(
2897                     self.mem_eviction_priority == MemEvictionPriority::Normal
2898                 );
2899             }
2900         }
2901 
2902         e.set_dst(self.dst);
2903         e.set_reg_src(24..32, self.addr);
2904         e.set_field(40..64, self.addr_offset);
2905         e.set_atom_type(73..76, self.atom_type);
2906     }
2907 }
2908 
2909 impl SM70Op for OpAL2P {
legalize(&mut self, b: &mut LegalizeBuilder)2910     fn legalize(&mut self, b: &mut LegalizeBuilder) {
2911         legalize_ext_instr(self, b);
2912     }
2913 
encode(&self, e: &mut SM70Encoder<'_>)2914     fn encode(&self, e: &mut SM70Encoder<'_>) {
2915         e.set_opcode(0x920);
2916 
2917         e.set_dst(self.dst);
2918         e.set_reg_src(24..32, self.offset);
2919 
2920         e.set_field(40..50, self.access.addr);
2921         e.set_field(74..76, 0_u8); // comps
2922         assert!(!self.access.patch);
2923         e.set_bit(79, self.access.output);
2924     }
2925 }
2926 
2927 impl SM70Op for OpALd {
legalize(&mut self, b: &mut LegalizeBuilder)2928     fn legalize(&mut self, b: &mut LegalizeBuilder) {
2929         legalize_ext_instr(self, b);
2930     }
2931 
encode(&self, e: &mut SM70Encoder<'_>)2932     fn encode(&self, e: &mut SM70Encoder<'_>) {
2933         e.set_opcode(0x321);
2934 
2935         e.set_dst(self.dst);
2936         e.set_reg_src(32..40, self.vtx);
2937         e.set_reg_src(24..32, self.offset);
2938 
2939         e.set_field(40..50, self.access.addr);
2940         e.set_field(74..76, self.access.comps - 1);
2941         e.set_field(76..77, self.access.patch);
2942         e.set_field(77..78, self.access.phys);
2943         e.set_field(79..80, self.access.output);
2944     }
2945 }
2946 
2947 impl SM70Op for OpASt {
legalize(&mut self, b: &mut LegalizeBuilder)2948     fn legalize(&mut self, b: &mut LegalizeBuilder) {
2949         legalize_ext_instr(self, b);
2950     }
2951 
encode(&self, e: &mut SM70Encoder<'_>)2952     fn encode(&self, e: &mut SM70Encoder<'_>) {
2953         e.set_opcode(0x322);
2954 
2955         e.set_reg_src(32..40, self.data);
2956         e.set_reg_src(64..72, self.vtx);
2957         e.set_reg_src(24..32, self.offset);
2958 
2959         e.set_field(40..50, self.access.addr);
2960         e.set_field(74..76, self.access.comps - 1);
2961         e.set_field(76..77, self.access.patch);
2962         e.set_field(77..78, self.access.phys);
2963         assert!(self.access.output);
2964     }
2965 }
2966 
2967 impl SM70Op for OpIpa {
legalize(&mut self, b: &mut LegalizeBuilder)2968     fn legalize(&mut self, b: &mut LegalizeBuilder) {
2969         legalize_ext_instr(self, b);
2970     }
2971 
encode(&self, e: &mut SM70Encoder<'_>)2972     fn encode(&self, e: &mut SM70Encoder<'_>) {
2973         e.set_opcode(0x326);
2974 
2975         e.set_dst(self.dst);
2976 
2977         assert!(self.addr % 4 == 0);
2978         e.set_field(64..72, self.addr >> 2);
2979 
2980         e.set_field(
2981             76..78,
2982             match self.loc {
2983                 InterpLoc::Default => 0_u8,
2984                 InterpLoc::Centroid => 1_u8,
2985                 InterpLoc::Offset => 2_u8,
2986             },
2987         );
2988         e.set_field(
2989             78..80,
2990             match self.freq {
2991                 InterpFreq::Pass => 0_u8,
2992                 InterpFreq::Constant => 1_u8,
2993                 InterpFreq::State => 2_u8,
2994                 InterpFreq::PassMulW => {
2995                     panic!("InterpFreq::PassMulW is invalid on SM70+");
2996                 }
2997             },
2998         );
2999 
3000         assert!(self.inv_w.is_zero());
3001         e.set_reg_src(32..40, self.offset);
3002 
3003         // TODO: What is this for?
3004         e.set_pred_dst(81..84, Dst::None);
3005     }
3006 }
3007 
3008 impl SM70Op for OpLdTram {
legalize(&mut self, b: &mut LegalizeBuilder)3009     fn legalize(&mut self, b: &mut LegalizeBuilder) {
3010         legalize_ext_instr(self, b);
3011     }
3012 
encode(&self, e: &mut SM70Encoder<'_>)3013     fn encode(&self, e: &mut SM70Encoder<'_>) {
3014         e.set_opcode(0x3ad);
3015         e.set_dst(self.dst);
3016         e.set_ureg(24..32, RegRef::zero(RegFile::UGPR, 1));
3017 
3018         assert!(self.addr % 4 == 0);
3019         e.set_field(64..72, self.addr >> 2);
3020 
3021         e.set_bit(72, self.use_c);
3022 
3023         // Unknown but required
3024         e.set_bit(91, true);
3025     }
3026 }
3027 
3028 impl SM70Op for OpCCtl {
legalize(&mut self, b: &mut LegalizeBuilder)3029     fn legalize(&mut self, b: &mut LegalizeBuilder) {
3030         legalize_ext_instr(self, b);
3031     }
3032 
encode(&self, e: &mut SM70Encoder<'_>)3033     fn encode(&self, e: &mut SM70Encoder<'_>) {
3034         assert!(matches!(self.mem_space, MemSpace::Global(_)));
3035         e.set_opcode(0x98f);
3036 
3037         e.set_reg_src(24..32, self.addr);
3038         e.set_field(32..64, self.addr_offset);
3039 
3040         e.set_field(
3041             87..91,
3042             match self.op {
3043                 CCtlOp::PF1 => 0_u8,
3044                 CCtlOp::PF2 => 1_u8,
3045                 CCtlOp::WB => 2_u8,
3046                 CCtlOp::IV => 3_u8,
3047                 CCtlOp::IVAll => 4_u8,
3048                 CCtlOp::RS => 5_u8,
3049                 CCtlOp::IVAllP => 6_u8,
3050                 CCtlOp::WBAll => 7_u8,
3051                 CCtlOp::WBAllP => 8_u8,
3052                 op => panic!("Unsupported cache control {op:?}"),
3053             },
3054         );
3055     }
3056 }
3057 
3058 impl SM70Op for OpMemBar {
legalize(&mut self, _b: &mut LegalizeBuilder)3059     fn legalize(&mut self, _b: &mut LegalizeBuilder) {
3060         // Nothing to do
3061     }
3062 
encode(&self, e: &mut SM70Encoder<'_>)3063     fn encode(&self, e: &mut SM70Encoder<'_>) {
3064         e.set_opcode(0x992);
3065 
3066         e.set_bit(72, false); // !.MMIO
3067         e.set_field(
3068             76..79,
3069             match self.scope {
3070                 MemScope::CTA => 0_u8,
3071                 // SM => 1_u8,
3072                 MemScope::GPU => 2_u8,
3073                 MemScope::System => 3_u8,
3074             },
3075         );
3076         e.set_bit(80, false); // .SC
3077     }
3078 }
3079 
3080 impl SM70Encoder<'_> {
set_rel_offset(&mut self, range: Range<usize>, label: &Label)3081     fn set_rel_offset(&mut self, range: Range<usize>, label: &Label) {
3082         let ip = u64::try_from(self.ip).unwrap();
3083         let ip = i64::try_from(ip).unwrap();
3084 
3085         let target_ip = *self.labels.get(label).unwrap();
3086         let target_ip = u64::try_from(target_ip).unwrap();
3087         let target_ip = i64::try_from(target_ip).unwrap();
3088 
3089         let rel_offset = target_ip - ip - 4;
3090 
3091         self.set_field(range, rel_offset);
3092     }
3093 }
3094 
3095 impl SM70Op for OpBClear {
legalize(&mut self, _b: &mut LegalizeBuilder)3096     fn legalize(&mut self, _b: &mut LegalizeBuilder) {
3097         // Nothing to do
3098     }
3099 
encode(&self, e: &mut SM70Encoder<'_>)3100     fn encode(&self, e: &mut SM70Encoder<'_>) {
3101         e.set_opcode(0x355);
3102 
3103         e.set_dst(Dst::None);
3104         e.set_bar_dst(24..28, self.dst);
3105 
3106         e.set_bit(84, true); // .CLEAR
3107     }
3108 }
3109 
3110 impl SM70Op for OpBMov {
legalize(&mut self, _b: &mut LegalizeBuilder)3111     fn legalize(&mut self, _b: &mut LegalizeBuilder) {
3112         // Nothing to do
3113     }
3114 
encode(&self, e: &mut SM70Encoder<'_>)3115     fn encode(&self, e: &mut SM70Encoder<'_>) {
3116         if dst_is_bar(self.dst) {
3117             e.set_opcode(0x356);
3118 
3119             e.set_bar_dst(24..28, self.dst);
3120             e.set_reg_src(32..40, self.src);
3121 
3122             e.set_bit(84, self.clear);
3123         } else {
3124             e.set_opcode(0x355);
3125 
3126             e.set_dst(self.dst);
3127             e.set_bar_src(24..28, self.src);
3128 
3129             e.set_bit(84, self.clear);
3130         }
3131     }
3132 }
3133 
3134 impl SM70Op for OpBreak {
legalize(&mut self, _b: &mut LegalizeBuilder)3135     fn legalize(&mut self, _b: &mut LegalizeBuilder) {
3136         // Nothing to do
3137     }
3138 
encode(&self, e: &mut SM70Encoder<'_>)3139     fn encode(&self, e: &mut SM70Encoder<'_>) {
3140         e.set_opcode(0x942);
3141         assert!(self.bar_in.src_ref.as_reg() == self.bar_out.as_reg());
3142         e.set_bar_dst(16..20, self.bar_out);
3143         e.set_pred_src(87..90, 90, self.cond);
3144     }
3145 }
3146 
3147 impl SM70Op for OpBSSy {
legalize(&mut self, _b: &mut LegalizeBuilder)3148     fn legalize(&mut self, _b: &mut LegalizeBuilder) {
3149         // Nothing to do
3150     }
3151 
encode(&self, e: &mut SM70Encoder<'_>)3152     fn encode(&self, e: &mut SM70Encoder<'_>) {
3153         e.set_opcode(0x945);
3154         assert!(self.bar_in.src_ref.as_reg() == self.bar_out.as_reg());
3155         e.set_bar_dst(16..20, self.bar_out);
3156         e.set_rel_offset(34..64, &self.target);
3157         e.set_pred_src(87..90, 90, self.cond);
3158     }
3159 }
3160 
3161 impl SM70Op for OpBSync {
legalize(&mut self, _b: &mut LegalizeBuilder)3162     fn legalize(&mut self, _b: &mut LegalizeBuilder) {
3163         // Nothing to do
3164     }
3165 
encode(&self, e: &mut SM70Encoder<'_>)3166     fn encode(&self, e: &mut SM70Encoder<'_>) {
3167         e.set_opcode(0x941);
3168         e.set_bar_src(16..20, self.bar);
3169         e.set_pred_src(87..90, 90, self.cond);
3170     }
3171 }
3172 
3173 impl SM70Op for OpBra {
legalize(&mut self, _b: &mut LegalizeBuilder)3174     fn legalize(&mut self, _b: &mut LegalizeBuilder) {
3175         // Nothing to do
3176     }
3177 
encode(&self, e: &mut SM70Encoder<'_>)3178     fn encode(&self, e: &mut SM70Encoder<'_>) {
3179         e.set_opcode(0x947);
3180         e.set_rel_offset(34..82, &self.target);
3181         e.set_field(87..90, 0x7_u8); // TODO: Pred?
3182     }
3183 }
3184 
3185 impl SM70Op for OpExit {
legalize(&mut self, _b: &mut LegalizeBuilder)3186     fn legalize(&mut self, _b: &mut LegalizeBuilder) {
3187         // Nothing to do
3188     }
3189 
encode(&self, e: &mut SM70Encoder<'_>)3190     fn encode(&self, e: &mut SM70Encoder<'_>) {
3191         e.set_opcode(0x94d);
3192 
3193         // ./.KEEPREFCOUNT/.PREEMPTED/.INVALID3
3194         e.set_field(84..85, false);
3195         e.set_field(85..86, false); // .NO_ATEXIT
3196         e.set_field(87..90, 0x7_u8); // TODO: Predicate
3197         e.set_field(90..91, false); // NOT
3198     }
3199 }
3200 
3201 impl SM70Op for OpWarpSync {
legalize(&mut self, _b: &mut LegalizeBuilder)3202     fn legalize(&mut self, _b: &mut LegalizeBuilder) {
3203         // Nothing to do
3204     }
3205 
encode(&self, e: &mut SM70Encoder<'_>)3206     fn encode(&self, e: &mut SM70Encoder<'_>) {
3207         e.encode_alu(0x148, None, None, Some(&Src::from(self.mask)), None);
3208         e.set_pred_src(87..90, 90, SrcRef::True.into());
3209     }
3210 }
3211 
3212 impl SM70Op for OpBar {
legalize(&mut self, _b: &mut LegalizeBuilder)3213     fn legalize(&mut self, _b: &mut LegalizeBuilder) {
3214         // Nothing to do
3215     }
3216 
encode(&self, e: &mut SM70Encoder<'_>)3217     fn encode(&self, e: &mut SM70Encoder<'_>) {
3218         e.set_opcode(0xb1d);
3219 
3220         // e.set_opcode(0x31d);
3221 
3222         // // src0 == src1
3223         // e.set_reg_src(32..40, SrcRef::Zero.into());
3224 
3225         // // 00: RED.POPC
3226         // // 01: RED.AND
3227         // // 02: RED.OR
3228         // e.set_field(74..76, 0_u8);
3229 
3230         // // 00: SYNC
3231         // // 01: ARV
3232         // // 02: RED
3233         // // 03: SCAN
3234         // e.set_field(77..79, 0_u8);
3235 
3236         // e.set_pred_src(87..90, 90, SrcRef::True.into());
3237     }
3238 }
3239 
3240 impl SM70Op for OpCS2R {
legalize(&mut self, _b: &mut LegalizeBuilder)3241     fn legalize(&mut self, _b: &mut LegalizeBuilder) {
3242         // Nothing to do
3243     }
3244 
encode(&self, e: &mut SM70Encoder<'_>)3245     fn encode(&self, e: &mut SM70Encoder<'_>) {
3246         e.set_opcode(0x805);
3247         e.set_dst(self.dst);
3248         e.set_field(72..80, self.idx);
3249         e.set_bit(80, self.dst.as_reg().unwrap().comps() == 2); // .64
3250     }
3251 }
3252 
3253 impl SM70Op for OpIsberd {
legalize(&mut self, _b: &mut LegalizeBuilder)3254     fn legalize(&mut self, _b: &mut LegalizeBuilder) {
3255         // Nothing to do
3256     }
3257 
encode(&self, e: &mut SM70Encoder<'_>)3258     fn encode(&self, e: &mut SM70Encoder<'_>) {
3259         e.set_opcode(0x923);
3260         e.set_dst(self.dst);
3261         e.set_reg_src(24..32, self.idx);
3262     }
3263 }
3264 
3265 impl SM70Op for OpKill {
legalize(&mut self, _b: &mut LegalizeBuilder)3266     fn legalize(&mut self, _b: &mut LegalizeBuilder) {
3267         // Nothing to do
3268     }
3269 
encode(&self, e: &mut SM70Encoder<'_>)3270     fn encode(&self, e: &mut SM70Encoder<'_>) {
3271         e.set_opcode(0x95b);
3272         e.set_pred_src(87..90, 90, SrcRef::True.into());
3273     }
3274 }
3275 
3276 impl SM70Op for OpNop {
legalize(&mut self, _b: &mut LegalizeBuilder)3277     fn legalize(&mut self, _b: &mut LegalizeBuilder) {
3278         // Nothing to do
3279     }
3280 
encode(&self, e: &mut SM70Encoder<'_>)3281     fn encode(&self, e: &mut SM70Encoder<'_>) {
3282         e.set_opcode(0x918);
3283     }
3284 }
3285 
3286 impl SM70Op for OpPixLd {
legalize(&mut self, _b: &mut LegalizeBuilder)3287     fn legalize(&mut self, _b: &mut LegalizeBuilder) {
3288         // Nothing to do
3289     }
3290 
encode(&self, e: &mut SM70Encoder<'_>)3291     fn encode(&self, e: &mut SM70Encoder<'_>) {
3292         e.set_opcode(0x925);
3293         e.set_dst(self.dst);
3294         e.set_field(
3295             78..81,
3296             match &self.val {
3297                 PixVal::MsCount => 0_u8,
3298                 PixVal::CovMask => 1_u8,
3299                 PixVal::CentroidOffset => 2_u8,
3300                 PixVal::MyIndex => 3_u8,
3301                 PixVal::InnerCoverage => 4_u8,
3302                 other => panic!("Unsupported PixVal: {other}"),
3303             },
3304         );
3305         e.set_pred_dst(81..84, Dst::None);
3306     }
3307 }
3308 
3309 impl SM70Op for OpS2R {
legalize(&mut self, _b: &mut LegalizeBuilder)3310     fn legalize(&mut self, _b: &mut LegalizeBuilder) {
3311         // Nothing to do
3312     }
3313 
encode(&self, e: &mut SM70Encoder<'_>)3314     fn encode(&self, e: &mut SM70Encoder<'_>) {
3315         assert!(!self.is_uniform());
3316         e.set_opcode(if self.is_uniform() { 0x9c3 } else { 0x919 });
3317         e.set_dst(self.dst);
3318         e.set_field(72..80, self.idx);
3319     }
3320 }
3321 
3322 impl SM70Op for OpOut {
legalize(&mut self, b: &mut LegalizeBuilder)3323     fn legalize(&mut self, b: &mut LegalizeBuilder) {
3324         let gpr = op_gpr(self);
3325         b.copy_alu_src_if_not_reg(&mut self.handle, gpr, SrcType::GPR);
3326         b.copy_alu_src_if_not_reg_or_imm(&mut self.stream, gpr, SrcType::ALU);
3327     }
3328 
encode(&self, e: &mut SM70Encoder<'_>)3329     fn encode(&self, e: &mut SM70Encoder<'_>) {
3330         e.encode_alu(
3331             0x124,
3332             Some(&self.dst),
3333             Some(&self.handle),
3334             Some(&self.stream),
3335             None,
3336         );
3337 
3338         e.set_field(
3339             78..80,
3340             match self.out_type {
3341                 OutType::Emit => 1_u8,
3342                 OutType::Cut => 2_u8,
3343                 OutType::EmitThenCut => 3_u8,
3344             },
3345         );
3346     }
3347 }
3348 
3349 impl SM70Op for OpOutFinal {
legalize(&mut self, b: &mut LegalizeBuilder)3350     fn legalize(&mut self, b: &mut LegalizeBuilder) {
3351         let gpr = op_gpr(self);
3352         b.copy_alu_src_if_not_reg(&mut self.handle, gpr, SrcType::GPR);
3353     }
3354 
encode(&self, e: &mut SM70Encoder<'_>)3355     fn encode(&self, e: &mut SM70Encoder<'_>) {
3356         e.encode_alu(
3357             0x124,
3358             Some(&Dst::None),
3359             Some(&self.handle),
3360             Some(&Src::new_zero()),
3361             None,
3362         );
3363     }
3364 }
3365 
3366 impl SM70Op for OpVote {
legalize(&mut self, b: &mut LegalizeBuilder)3367     fn legalize(&mut self, b: &mut LegalizeBuilder) {
3368         b.copy_src_if_upred(&mut self.pred);
3369     }
3370 
encode(&self, e: &mut SM70Encoder<'_>)3371     fn encode(&self, e: &mut SM70Encoder<'_>) {
3372         if self.is_uniform() {
3373             e.set_opcode(0x886);
3374             e.set_udst(self.ballot);
3375         } else {
3376             e.set_opcode(0x806);
3377             e.set_dst(self.ballot);
3378         }
3379 
3380         e.set_field(
3381             72..74,
3382             match self.op {
3383                 VoteOp::All => 0_u8,
3384                 VoteOp::Any => 1_u8,
3385                 VoteOp::Eq => 2_u8,
3386             },
3387         );
3388 
3389         e.set_pred_dst(81..84, self.vote);
3390         e.set_pred_src(87..90, 90, self.pred);
3391     }
3392 }
3393 
/// Expands to a `match` on `$op` that yields the inner op of every `Op`
/// variant with an SM70 implementation and panics for any other variant.
///
/// The `@dispatch` rule is internal: it builds one arm per listed variant.
/// The public rule takes only the op expression and supplies the list.
macro_rules! as_sm70_op_match {
    (@dispatch $op: expr, $($variant: ident),+ $(,)?) => {
        match $op {
            $(Op::$variant(op) => op,)+
            _ => panic!("Unsupported op: {}", $op),
        }
    };
    ($op: expr) => {
        as_sm70_op_match!(
            @dispatch $op,
            // Float
            FAdd, FFma, FMnMx, FMul, FSet, FSetP, FSwzAdd,
            // Double
            DAdd, DFma, DMul, DSetP,
            // Half
            HAdd2, HFma2, HMul2, HSet2, HSetP2, HMnMx2,
            MuFu,
            // Integer and bit manipulation
            BMsk, BRev, Flo, IAbs, IAdd3, IAdd3X, IDp4, IMad, IMad64,
            IMnMx, ISetP, Lop3, PopC, Shf,
            // Conversions
            F2F, F2FP, F2I, I2F, FRnd,
            // Data movement and predicates
            Mov, Prmt, Sel, Shfl, PLop3, R2UR,
            // Textures
            Tex, Tld, Tld4, Tmml, Txd, Txq,
            // Surfaces
            SuLd, SuSt, SuAtom,
            // Memory and attributes
            Ld, Ldc, St, Atom, AL2P, ALd, ASt, Ipa, LdTram, CCtl, MemBar,
            // Control flow and barriers
            BClear, BMov, Break, BSSy, BSync, Bra, Exit, WarpSync, Bar,
            // Miscellaneous
            CS2R, Isberd, Kill, Nop, PixLd, S2R, Out, OutFinal, Vote,
        )
    };
}
3482 
as_sm70_op(op: &Op) -> &dyn SM70Op3483 fn as_sm70_op(op: &Op) -> &dyn SM70Op {
3484     as_sm70_op_match!(op)
3485 }
3486 
as_sm70_op_mut(op: &mut Op) -> &mut dyn SM70Op3487 fn as_sm70_op_mut(op: &mut Op) -> &mut dyn SM70Op {
3488     as_sm70_op_match!(op)
3489 }
3490 
encode_sm70_shader(sm: &ShaderModel70, s: &Shader<'_>) -> Vec<u32>3491 fn encode_sm70_shader(sm: &ShaderModel70, s: &Shader<'_>) -> Vec<u32> {
3492     assert!(s.functions.len() == 1);
3493     let func = &s.functions[0];
3494 
3495     let mut ip = 0_usize;
3496     let mut labels = HashMap::new();
3497     for b in &func.blocks {
3498         labels.insert(b.label, ip);
3499         for instr in &b.instrs {
3500             if let Op::Nop(op) = &instr.op {
3501                 if let Some(label) = op.label {
3502                     labels.insert(label, ip);
3503                 }
3504             }
3505             ip += 4;
3506         }
3507     }
3508 
3509     let mut encoded = Vec::new();
3510     for b in &func.blocks {
3511         for instr in &b.instrs {
3512             let mut e = SM70Encoder {
3513                 sm,
3514                 ip: encoded.len(),
3515                 labels: &labels,
3516                 inst: [0_u32; 4],
3517             };
3518             as_sm70_op(&instr.op).encode(&mut e);
3519             e.set_pred(&instr.pred);
3520             e.set_instr_deps(&instr.deps);
3521             encoded.extend_from_slice(&e.inst[..]);
3522         }
3523     }
3524     encoded
3525 }
3526