1 // Copyright © 2022 Collabora, Ltd.
2 // SPDX-License-Identifier: MIT
3
4 use crate::ir::*;
5 use crate::legalize::{
6 src_is_reg, src_is_upred_reg, swap_srcs_if_not_reg, LegalizeBuildHelpers,
7 LegalizeBuilder,
8 };
9 use bitview::*;
10
11 use std::collections::HashMap;
12 use std::ops::Range;
13
/// Shader model description for Volta+ hardware (SM 70 and up).
pub struct ShaderModel70 {
    /// Numeric SM version; `new` guarantees it is at least 70.
    sm: u8,
}

impl ShaderModel70 {
    /// Creates a shader model for SM version `sm`.
    ///
    /// # Panics
    ///
    /// Panics if `sm` is below 70.
    pub fn new(sm: u8) -> Self {
        assert!(sm >= 70);
        ShaderModel70 { sm }
    }

    /// Returns whether this SM version is treated as having the uniform ALU
    /// (and with it the uniform register files).
    fn has_uniform_alu(&self) -> bool {
        // First SM version for which uniform ops are enabled.
        const MIN_UNIFORM_ALU_SM: u8 = 75;
        self.sm >= MIN_UNIFORM_ALU_SM
    }
}
28
29 impl ShaderModel for ShaderModel70 {
sm(&self) -> u830 fn sm(&self) -> u8 {
31 self.sm
32 }
33
num_regs(&self, file: RegFile) -> u3234 fn num_regs(&self, file: RegFile) -> u32 {
35 match file {
36 RegFile::GPR => 255 - self.hw_reserved_gprs(),
37 RegFile::UGPR => {
38 if self.has_uniform_alu() {
39 63
40 } else {
41 0
42 }
43 }
44 RegFile::Pred => 7,
45 RegFile::UPred => {
46 if self.has_uniform_alu() {
47 7
48 } else {
49 0
50 }
51 }
52 RegFile::Carry => 0,
53 RegFile::Bar => 16,
54 RegFile::Mem => RegRef::MAX_IDX + 1,
55 }
56 }
57
hw_reserved_gprs(&self) -> u3258 fn hw_reserved_gprs(&self) -> u32 {
59 // On Volta+, 2 GPRs get burned for the program counter - see the
60 // footnote on table 2 of the volta whitepaper
61 // https://images.nvidia.com/content/volta-architecture/pdf/volta-architecture-whitepaper.pdf
62 2
63 }
64
crs_size(&self, max_crs_depth: u32) -> u3265 fn crs_size(&self, max_crs_depth: u32) -> u32 {
66 assert!(max_crs_depth == 0);
67 0
68 }
69
op_can_be_uniform(&self, op: &Op) -> bool70 fn op_can_be_uniform(&self, op: &Op) -> bool {
71 if !self.has_uniform_alu() {
72 return false;
73 }
74
75 match op {
76 Op::R2UR(_)
77 | Op::S2R(_)
78 | Op::BMsk(_)
79 | Op::BRev(_)
80 | Op::Flo(_)
81 | Op::IAdd3(_)
82 | Op::IAdd3X(_)
83 | Op::IMad(_)
84 | Op::IMad64(_)
85 | Op::ISetP(_)
86 | Op::Lop3(_)
87 | Op::Mov(_)
88 | Op::PLop3(_)
89 | Op::PopC(_)
90 | Op::Prmt(_)
91 | Op::PSetP(_)
92 | Op::Sel(_)
93 | Op::Shf(_)
94 | Op::Shl(_)
95 | Op::Shr(_)
96 | Op::Vote(_)
97 | Op::Copy(_)
98 | Op::Pin(_)
99 | Op::Unpin(_) => true,
100 Op::Ldc(op) => op.offset.is_zero(),
101 // UCLEA USHL USHR
102 _ => false,
103 }
104 }
105
legalize_op(&self, b: &mut LegalizeBuilder, op: &mut Op)106 fn legalize_op(&self, b: &mut LegalizeBuilder, op: &mut Op) {
107 as_sm70_op_mut(op).legalize(b);
108 }
109
encode_shader(&self, s: &Shader<'_>) -> Vec<u32>110 fn encode_shader(&self, s: &Shader<'_>) -> Vec<u32> {
111 encode_sm70_shader(self, s)
112 }
113 }
114
/// A per-op trait that implements Volta+ opcode semantics
///
/// Each supported op provides a legalization step and an encoding step.
trait SM70Op {
    /// Rewrites the op in place so that `encode` can handle it, using `b` to
    /// emit whatever extra instructions that requires.
    fn legalize(&mut self, b: &mut LegalizeBuilder);

    /// Writes the op's fields into the instruction held by `e`.
    fn encode(&self, e: &mut SM70Encoder<'_>);
}
120
/// State for encoding a single instruction.
///
/// The encoder exposes `inst` through the bit-view traits implemented below,
/// so fields can be set directly with `set_field` / `set_bit`.
struct SM70Encoder<'a> {
    /// Shader model being targeted; consulted for version checks.
    sm: &'a ShaderModel70,
    // NOTE(review): presumably the position of the instruction being encoded;
    // not used in this part of the file -- confirm against the users.
    ip: usize,
    // NOTE(review): presumably maps each label to its instruction position
    // for branch encoding -- confirm against the users.
    labels: &'a HashMap<Label, usize>,
    /// The instruction bits being assembled: four 32-bit words (128 bits).
    inst: [u32; 4],
}
127
128 impl BitViewable for SM70Encoder<'_> {
bits(&self) -> usize129 fn bits(&self) -> usize {
130 BitView::new(&self.inst).bits()
131 }
132
get_bit_range_u64(&self, range: Range<usize>) -> u64133 fn get_bit_range_u64(&self, range: Range<usize>) -> u64 {
134 BitView::new(&self.inst).get_bit_range_u64(range)
135 }
136 }
137
138 impl BitMutViewable for SM70Encoder<'_> {
set_bit_range_u64(&mut self, range: Range<usize>, val: u64)139 fn set_bit_range_u64(&mut self, range: Range<usize>, val: u64) {
140 BitMutView::new(&mut self.inst).set_bit_range_u64(range, val);
141 }
142 }
143
144 impl SetFieldU64 for SM70Encoder<'_> {
set_field_u64(&mut self, range: Range<usize>, val: u64)145 fn set_field_u64(&mut self, range: Range<usize>, val: u64) {
146 BitMutView::new(&mut self.inst).set_field_u64(range, val);
147 }
148 }
149
150 impl SM70Encoder<'_> {
set_opcode(&mut self, opcode: u16)151 fn set_opcode(&mut self, opcode: u16) {
152 self.set_field(0..12, opcode);
153 }
154
set_reg(&mut self, range: Range<usize>, reg: RegRef)155 fn set_reg(&mut self, range: Range<usize>, reg: RegRef) {
156 assert!(range.len() == 8);
157 assert!(reg.file() == RegFile::GPR);
158 self.set_field(range, reg.base_idx());
159 }
160
set_ureg(&mut self, range: Range<usize>, reg: RegRef)161 fn set_ureg(&mut self, range: Range<usize>, reg: RegRef) {
162 assert!(self.sm.sm >= 75);
163 assert!(range.len() == 8);
164 assert!(reg.file() == RegFile::UGPR);
165 assert!(reg.base_idx() <= 63);
166 self.set_field(range, reg.base_idx());
167 }
168
set_pred_reg(&mut self, range: Range<usize>, reg: RegRef)169 fn set_pred_reg(&mut self, range: Range<usize>, reg: RegRef) {
170 assert!(range.len() == 3);
171 assert!(reg.base_idx() <= 7);
172 assert!(reg.comps() == 1);
173 self.set_field(range, reg.base_idx());
174 }
175
set_reg_src(&mut self, range: Range<usize>, src: Src)176 fn set_reg_src(&mut self, range: Range<usize>, src: Src) {
177 assert!(src.src_mod.is_none());
178 match src.src_ref {
179 SrcRef::Zero => self.set_reg(range, RegRef::zero(RegFile::GPR, 1)),
180 SrcRef::Reg(reg) => self.set_reg(range, reg),
181 _ => panic!("Not a register"),
182 }
183 }
184
set_pred_dst(&mut self, range: Range<usize>, dst: Dst)185 fn set_pred_dst(&mut self, range: Range<usize>, dst: Dst) {
186 match dst {
187 Dst::None => {
188 self.set_pred_reg(range, RegRef::zero(RegFile::Pred, 1));
189 }
190 Dst::Reg(reg) => self.set_pred_reg(range, reg),
191 _ => panic!("Not a register"),
192 }
193 }
194
set_pred_src_file( &mut self, range: Range<usize>, not_bit: usize, src: Src, file: RegFile, )195 fn set_pred_src_file(
196 &mut self,
197 range: Range<usize>,
198 not_bit: usize,
199 src: Src,
200 file: RegFile,
201 ) {
202 // The default for predicates is true
203 let true_reg = RegRef::new(file, 7, 1);
204
205 let (not, reg) = match src.src_ref {
206 SrcRef::True => (false, true_reg),
207 SrcRef::False => (true, true_reg),
208 SrcRef::Reg(reg) => {
209 assert!(reg.file() == file);
210 (false, reg)
211 }
212 _ => panic!("Not a register"),
213 };
214 self.set_pred_reg(range, reg);
215 self.set_bit(not_bit, not ^ src_mod_is_bnot(src.src_mod));
216 }
217
set_pred_src(&mut self, range: Range<usize>, not_bit: usize, src: Src)218 fn set_pred_src(&mut self, range: Range<usize>, not_bit: usize, src: Src) {
219 self.set_pred_src_file(range, not_bit, src, RegFile::Pred);
220 }
221
set_upred_src(&mut self, range: Range<usize>, not_bit: usize, src: Src)222 fn set_upred_src(&mut self, range: Range<usize>, not_bit: usize, src: Src) {
223 self.set_pred_src_file(range, not_bit, src, RegFile::UPred);
224 }
225
set_src_cb(&mut self, range: Range<usize>, cx_bit: usize, cb: &CBufRef)226 fn set_src_cb(&mut self, range: Range<usize>, cx_bit: usize, cb: &CBufRef) {
227 let mut v = BitMutView::new_subset(self, range);
228 v.set_field(6..22, cb.offset);
229 match cb.buf {
230 CBuf::Binding(idx) => {
231 v.set_field(22..27, idx);
232 self.set_bit(cx_bit, false);
233 }
234 CBuf::BindlessUGPR(reg) => {
235 assert!(reg.base_idx() <= 63);
236 assert!(reg.file() == RegFile::UGPR);
237 v.set_field(0..6, reg.base_idx());
238 self.set_bit(cx_bit, true);
239 }
240 CBuf::BindlessSSA(_) => panic!("SSA values must be lowered"),
241 }
242 }
243
set_pred(&mut self, pred: &Pred)244 fn set_pred(&mut self, pred: &Pred) {
245 assert!(!pred.is_false());
246 self.set_pred_reg(
247 12..15,
248 match pred.pred_ref {
249 PredRef::None => RegRef::zero(RegFile::Pred, 1),
250 PredRef::Reg(reg) => reg,
251 PredRef::SSA(_) => panic!("SSA values must be lowered"),
252 },
253 );
254 self.set_bit(15, pred.pred_inv);
255 }
256
set_dst(&mut self, dst: Dst)257 fn set_dst(&mut self, dst: Dst) {
258 match dst {
259 Dst::None => self.set_reg(16..24, RegRef::zero(RegFile::GPR, 1)),
260 Dst::Reg(reg) => self.set_reg(16..24, reg),
261 _ => panic!("Not a register"),
262 }
263 }
264
set_udst(&mut self, dst: Dst)265 fn set_udst(&mut self, dst: Dst) {
266 match dst {
267 Dst::None => self.set_ureg(16..24, RegRef::zero(RegFile::UGPR, 1)),
268 Dst::Reg(reg) => self.set_ureg(16..24, reg),
269 _ => panic!("Not a register"),
270 }
271 }
272
set_bar_reg(&mut self, range: Range<usize>, reg: RegRef)273 fn set_bar_reg(&mut self, range: Range<usize>, reg: RegRef) {
274 assert!(range.len() == 4);
275 assert!(reg.file() == RegFile::Bar);
276 assert!(reg.comps() == 1);
277 self.set_field(range, reg.base_idx());
278 }
279
set_bar_dst(&mut self, range: Range<usize>, dst: Dst)280 fn set_bar_dst(&mut self, range: Range<usize>, dst: Dst) {
281 self.set_bar_reg(range, *dst.as_reg().unwrap());
282 }
283
set_bar_src(&mut self, range: Range<usize>, src: Src)284 fn set_bar_src(&mut self, range: Range<usize>, src: Src) {
285 assert!(src.src_mod.is_none());
286 self.set_bar_reg(range, *src.src_ref.as_reg().unwrap());
287 }
288
set_instr_deps(&mut self, deps: &InstrDeps)289 fn set_instr_deps(&mut self, deps: &InstrDeps) {
290 self.set_field(105..109, deps.delay);
291 self.set_bit(109, deps.yld);
292 self.set_field(110..113, deps.wr_bar().unwrap_or(7));
293 self.set_field(113..116, deps.rd_bar().unwrap_or(7));
294 self.set_field(116..122, deps.wt_bar_mask);
295 self.set_field(122..126, deps.reuse_mask);
296 }
297 }
298
299 //
300 // Helpers for encoding of ALU instructions
301 //
302
/// A register ALU source together with its source modifiers and swizzle.
struct ALURegRef {
    /// The register being read.
    pub reg: RegRef,
    /// True when the source modifier includes an absolute value.
    pub abs: bool,
    /// True when the source modifier includes a negation (or bitwise not).
    pub neg: bool,
    /// Swizzle applied to the source; only encoded for fp16 ALU ops.
    pub swizzle: SrcSwizzle,
}
309
/// A constant-buffer ALU source together with its source modifiers and
/// swizzle.
struct ALUCBufRef {
    /// The constant-buffer location being read.
    pub cb: CBufRef,
    /// True when the source modifier includes an absolute value.
    pub abs: bool,
    /// True when the source modifier includes a negation (or bitwise not).
    pub neg: bool,
    /// Swizzle applied to the source; only encoded for fp16 ALU ops.
    pub swizzle: SrcSwizzle,
}
316
/// An ALU source classified by how it has to be encoded.
enum ALUSrc {
    /// The source slot is unused.
    None,
    /// A 32-bit immediate.
    Imm32(u32),
    /// A register in the op's own file: a GPR for regular ops, a UGPR for
    /// uniform ops.
    Reg(ALURegRef),
    /// A UGPR read by a non-uniform op.
    UReg(ALURegRef),
    /// A constant-buffer read.
    CBuf(ALUCBufRef),
}
324
src_is_zero_or_gpr(src: &Src) -> bool325 fn src_is_zero_or_gpr(src: &Src) -> bool {
326 match src.src_ref {
327 SrcRef::Zero => true,
328 SrcRef::Reg(reg) => reg.file() == RegFile::GPR,
329 _ => false,
330 }
331 }
332
src_mod_has_abs(src_mod: SrcMod) -> bool333 fn src_mod_has_abs(src_mod: SrcMod) -> bool {
334 match src_mod {
335 SrcMod::None | SrcMod::FNeg | SrcMod::INeg | SrcMod::BNot => false,
336 SrcMod::FAbs | SrcMod::FNegAbs => true,
337 }
338 }
339
src_mod_has_neg(src_mod: SrcMod) -> bool340 fn src_mod_has_neg(src_mod: SrcMod) -> bool {
341 match src_mod {
342 SrcMod::None | SrcMod::FAbs => false,
343 SrcMod::FNeg | SrcMod::FNegAbs | SrcMod::INeg | SrcMod::BNot => true,
344 }
345 }
346
src_mod_is_bnot(src_mod: SrcMod) -> bool347 fn src_mod_is_bnot(src_mod: SrcMod) -> bool {
348 match src_mod {
349 SrcMod::None => false,
350 SrcMod::BNot => true,
351 _ => panic!("Not an predicate source modifier"),
352 }
353 }
354
dst_is_bar(dst: Dst) -> bool355 fn dst_is_bar(dst: Dst) -> bool {
356 match dst {
357 Dst::None => false,
358 Dst::SSA(ssa) => ssa.file().unwrap() == RegFile::Bar,
359 Dst::Reg(reg) => reg.file() == RegFile::Bar,
360 }
361 }
362
363 impl ALUSrc {
from_src(src: Option<&Src>, op_is_uniform: bool) -> ALUSrc364 fn from_src(src: Option<&Src>, op_is_uniform: bool) -> ALUSrc {
365 let Some(src) = src else {
366 return ALUSrc::None;
367 };
368
369 match src.src_ref {
370 SrcRef::Zero | SrcRef::Reg(_) => {
371 let reg = match src.src_ref {
372 SrcRef::Zero => {
373 let file = if op_is_uniform {
374 RegFile::UGPR
375 } else {
376 RegFile::GPR
377 };
378 RegRef::zero(file, 1)
379 }
380 SrcRef::Reg(reg) => reg,
381 _ => panic!("Invalid source ref"),
382 };
383 assert!(reg.comps() <= 2);
384 let alu_ref = ALURegRef {
385 reg: reg,
386 abs: src_mod_has_abs(src.src_mod),
387 neg: src_mod_has_neg(src.src_mod),
388 swizzle: src.src_swizzle,
389 };
390 if op_is_uniform {
391 assert!(reg.file() == RegFile::UGPR);
392 ALUSrc::Reg(alu_ref)
393 } else {
394 match reg.file() {
395 RegFile::GPR => ALUSrc::Reg(alu_ref),
396 RegFile::UGPR => ALUSrc::UReg(alu_ref),
397 _ => panic!("Invalid ALU register file"),
398 }
399 }
400 }
401 SrcRef::Imm32(i) => {
402 assert!(src.src_mod.is_none());
403 assert!(src.src_swizzle.is_none());
404 ALUSrc::Imm32(i)
405 }
406 SrcRef::CBuf(cb) => {
407 let alu_ref = ALUCBufRef {
408 cb: cb,
409 abs: src_mod_has_abs(src.src_mod),
410 neg: src_mod_has_neg(src.src_mod),
411 swizzle: src.src_swizzle,
412 };
413 ALUSrc::CBuf(alu_ref)
414 }
415 _ => panic!("Invalid ALU source"),
416 }
417 }
418
has_src_mod(&self) -> bool419 pub fn has_src_mod(&self) -> bool {
420 match self {
421 ALUSrc::Reg(reg) | ALUSrc::UReg(reg) => reg.abs || reg.neg,
422 ALUSrc::CBuf(cb) => cb.abs || cb.neg,
423 _ => false,
424 }
425 }
426 }
427
428 impl SM70Encoder<'_> {
set_swizzle(&mut self, range: Range<usize>, swizzle: SrcSwizzle)429 fn set_swizzle(&mut self, range: Range<usize>, swizzle: SrcSwizzle) {
430 assert!(range.len() == 2);
431
432 self.set_field(
433 range,
434 match swizzle {
435 SrcSwizzle::None => 0x00_u8,
436 SrcSwizzle::Xx => 0x02_u8,
437 SrcSwizzle::Yy => 0x03_u8,
438 },
439 );
440 }
441
set_alu_reg( &mut self, range: Range<usize>, abs_bit: usize, neg_bit: usize, swizzle_range: Range<usize>, file: RegFile, is_fp16_alu: bool, has_mod: bool, reg: &ALURegRef, )442 fn set_alu_reg(
443 &mut self,
444 range: Range<usize>,
445 abs_bit: usize,
446 neg_bit: usize,
447 swizzle_range: Range<usize>,
448 file: RegFile,
449 is_fp16_alu: bool,
450 has_mod: bool,
451 reg: &ALURegRef,
452 ) {
453 match file {
454 RegFile::GPR => self.set_reg(range, reg.reg),
455 RegFile::UGPR => self.set_ureg(range, reg.reg),
456 _ => panic!("Invalid ALU src register file"),
457 }
458
459 if has_mod {
460 self.set_bit(abs_bit, reg.abs);
461 self.set_bit(neg_bit, reg.neg);
462 } else {
463 assert!(!reg.abs && !reg.neg);
464 }
465
466 if is_fp16_alu {
467 self.set_swizzle(swizzle_range, reg.swizzle);
468 } else {
469 assert!(reg.swizzle == SrcSwizzle::None);
470 }
471 }
472
encode_alu_src0( &mut self, src: &ALUSrc, file: RegFile, is_fp16_alu: bool, )473 fn encode_alu_src0(
474 &mut self,
475 src: &ALUSrc,
476 file: RegFile,
477 is_fp16_alu: bool,
478 ) {
479 let reg = match src {
480 ALUSrc::None => return,
481 ALUSrc::Reg(reg) => reg,
482 _ => panic!("Invalid ALU src"),
483 };
484 self.set_alu_reg(24..32, 73, 72, 74..76, file, is_fp16_alu, true, reg);
485 }
486
encode_alu_src2( &mut self, src: &ALUSrc, file: RegFile, is_fp16_alu: bool, bit74_75_are_mod: bool, )487 fn encode_alu_src2(
488 &mut self,
489 src: &ALUSrc,
490 file: RegFile,
491 is_fp16_alu: bool,
492 bit74_75_are_mod: bool,
493 ) {
494 let reg = match src {
495 ALUSrc::None => return,
496 ALUSrc::Reg(reg) => reg,
497 _ => panic!("Invalid ALU src"),
498 };
499 self.set_alu_reg(
500 64..72,
501 74,
502 75,
503 81..83,
504 file,
505 is_fp16_alu,
506 bit74_75_are_mod,
507 reg,
508 );
509 }
510
encode_alu_reg(&mut self, reg: &ALURegRef, is_fp16_alu: bool)511 fn encode_alu_reg(&mut self, reg: &ALURegRef, is_fp16_alu: bool) {
512 self.set_alu_reg(
513 32..40,
514 62,
515 63,
516 60..62,
517 RegFile::GPR,
518 is_fp16_alu,
519 true,
520 reg,
521 );
522 }
523
encode_alu_ureg(&mut self, reg: &ALURegRef, is_fp16_alu: bool)524 fn encode_alu_ureg(&mut self, reg: &ALURegRef, is_fp16_alu: bool) {
525 self.set_ureg(32..40, reg.reg);
526 self.set_bit(62, reg.abs);
527 self.set_bit(63, reg.neg);
528
529 if is_fp16_alu {
530 self.set_swizzle(60..62, reg.swizzle);
531 } else {
532 assert!(reg.swizzle == SrcSwizzle::None);
533 }
534
535 self.set_bit(91, true);
536 }
537
encode_alu_imm(&mut self, imm: &u32)538 fn encode_alu_imm(&mut self, imm: &u32) {
539 self.set_field(32..64, *imm);
540 }
541
encode_alu_cb(&mut self, cb: &ALUCBufRef, is_fp16_alu: bool)542 fn encode_alu_cb(&mut self, cb: &ALUCBufRef, is_fp16_alu: bool) {
543 self.set_src_cb(32..59, 91, &cb.cb);
544 self.set_bit(62, cb.abs);
545 self.set_bit(63, cb.neg);
546
547 if is_fp16_alu {
548 self.set_swizzle(60..62, cb.swizzle);
549 } else {
550 assert!(cb.swizzle == SrcSwizzle::None);
551 }
552 }
553
encode_alu_base( &mut self, opcode: u16, dst: Option<&Dst>, src0: Option<&Src>, src1: Option<&Src>, src2: Option<&Src>, is_fp16_alu: bool, )554 fn encode_alu_base(
555 &mut self,
556 opcode: u16,
557 dst: Option<&Dst>,
558 src0: Option<&Src>,
559 src1: Option<&Src>,
560 src2: Option<&Src>,
561 is_fp16_alu: bool,
562 ) {
563 if let Some(dst) = dst {
564 self.set_dst(*dst);
565 }
566
567 let src0 = ALUSrc::from_src(src0, false);
568 let src1 = ALUSrc::from_src(src1, false);
569 let src2 = ALUSrc::from_src(src2, false);
570
571 // Bits 74..76 are used both for the swizzle on src0 and for the source
572 // modifier for the register source of src1 and src2. When both are
573 // registers, it's used for src2. The hardware elects to always support
574 // a swizzle and not support source modifiers in that case.
575 let bit74_75_are_mod = !is_fp16_alu
576 || matches!(src1, ALUSrc::None)
577 || matches!(src2, ALUSrc::None);
578 debug_assert!(bit74_75_are_mod || !src0.has_src_mod());
579
580 self.encode_alu_src0(&src0, RegFile::GPR, is_fp16_alu);
581
582 let form = match &src2 {
583 ALUSrc::None | ALUSrc::Reg(_) => {
584 self.encode_alu_src2(
585 &src2,
586 RegFile::GPR,
587 is_fp16_alu,
588 bit74_75_are_mod,
589 );
590 match &src1 {
591 ALUSrc::None => 1_u8, // form
592 ALUSrc::Reg(reg1) => {
593 self.encode_alu_reg(reg1, is_fp16_alu);
594 1_u8 // form
595 }
596 ALUSrc::UReg(reg1) => {
597 self.encode_alu_ureg(reg1, is_fp16_alu);
598 6_u8 // form
599 }
600 ALUSrc::Imm32(imm1) => {
601 self.encode_alu_imm(imm1);
602 4_u8 // form
603 }
604 ALUSrc::CBuf(cb1) => {
605 self.encode_alu_cb(cb1, is_fp16_alu);
606 5_u8 // form
607 }
608 }
609 }
610 ALUSrc::UReg(reg2) => {
611 self.encode_alu_ureg(reg2, is_fp16_alu);
612 self.encode_alu_src2(
613 &src1,
614 RegFile::GPR,
615 is_fp16_alu,
616 bit74_75_are_mod,
617 );
618 7_u8 // form
619 }
620 ALUSrc::Imm32(imm2) => {
621 self.encode_alu_imm(imm2);
622 self.encode_alu_src2(
623 &src1,
624 RegFile::GPR,
625 is_fp16_alu,
626 bit74_75_are_mod,
627 );
628 2_u8 // form
629 }
630 ALUSrc::CBuf(cb2) => {
631 // TODO set_src_cx
632 self.encode_alu_cb(cb2, is_fp16_alu);
633 self.encode_alu_src2(
634 &src1,
635 RegFile::GPR,
636 is_fp16_alu,
637 bit74_75_are_mod,
638 );
639 3_u8 // form
640 }
641 };
642
643 self.set_field(0..9, opcode);
644 self.set_field(9..12, form);
645 }
646
encode_alu( &mut self, opcode: u16, dst: Option<&Dst>, src0: Option<&Src>, src1: Option<&Src>, src2: Option<&Src>, )647 fn encode_alu(
648 &mut self,
649 opcode: u16,
650 dst: Option<&Dst>,
651 src0: Option<&Src>,
652 src1: Option<&Src>,
653 src2: Option<&Src>,
654 ) {
655 self.encode_alu_base(opcode, dst, src0, src1, src2, false);
656 }
657
encode_fp16_alu( &mut self, opcode: u16, dst: Option<&Dst>, src0: Option<&Src>, src1: Option<&Src>, src2: Option<&Src>, )658 fn encode_fp16_alu(
659 &mut self,
660 opcode: u16,
661 dst: Option<&Dst>,
662 src0: Option<&Src>,
663 src1: Option<&Src>,
664 src2: Option<&Src>,
665 ) {
666 self.encode_alu_base(opcode, dst, src0, src1, src2, true);
667 }
668
encode_ualu( &mut self, opcode: u16, dst: Option<&Dst>, src0: Option<&Src>, src1: Option<&Src>, src2: Option<&Src>, )669 fn encode_ualu(
670 &mut self,
671 opcode: u16,
672 dst: Option<&Dst>,
673 src0: Option<&Src>,
674 src1: Option<&Src>,
675 src2: Option<&Src>,
676 ) {
677 if let Some(dst) = dst {
678 self.set_udst(*dst);
679 }
680
681 let src0 = ALUSrc::from_src(src0, true);
682 let src1 = ALUSrc::from_src(src1, true);
683 let src2 = ALUSrc::from_src(src2, true);
684
685 // All uniform ALU requires bit 91 set
686 self.set_bit(91, true);
687
688 self.encode_alu_src0(&src0, RegFile::UGPR, false);
689 let form = match &src2 {
690 ALUSrc::None | ALUSrc::Reg(_) => {
691 self.encode_alu_src2(&src2, RegFile::UGPR, false, true);
692 match &src1 {
693 ALUSrc::None => 1_u8, // form
694 ALUSrc::Reg(reg1) => {
695 self.encode_alu_ureg(reg1, false);
696 1_u8 // form
697 }
698 ALUSrc::UReg(_) => panic!("UALU never has UReg"),
699 ALUSrc::Imm32(imm1) => {
700 self.encode_alu_imm(imm1);
701 4_u8 // form
702 }
703 ALUSrc::CBuf(_) => panic!("UALU does not support cbufs"),
704 }
705 }
706 ALUSrc::UReg(_) => panic!("UALU never has UReg"),
707 ALUSrc::Imm32(imm2) => {
708 self.encode_alu_imm(imm2);
709 self.encode_alu_src2(&src1, RegFile::UGPR, false, true);
710 2_u8 // form
711 }
712 ALUSrc::CBuf(_) => panic!("UALU does not support cbufs"),
713 };
714
715 self.set_field(0..9, opcode);
716 self.set_field(9..12, form);
717 }
718
set_rnd_mode(&mut self, range: Range<usize>, rnd_mode: FRndMode)719 fn set_rnd_mode(&mut self, range: Range<usize>, rnd_mode: FRndMode) {
720 assert!(range.len() == 2);
721 self.set_field(
722 range,
723 match rnd_mode {
724 FRndMode::NearestEven => 0_u8,
725 FRndMode::NegInf => 1_u8,
726 FRndMode::PosInf => 2_u8,
727 FRndMode::Zero => 3_u8,
728 },
729 );
730 }
731 }
732
733 //
734 // Legalization helpers
735 //
736
op_gpr(op: &impl DstsAsSlice) -> RegFile737 fn op_gpr(op: &impl DstsAsSlice) -> RegFile {
738 if op.is_uniform() {
739 RegFile::UGPR
740 } else {
741 RegFile::GPR
742 }
743 }
744
745 /// Helper to legalize extended or external instructions
746 ///
747 /// These are instructions which reach out external units such as load/store
748 /// and texture ops. They typically can't take anything but GPRs and are the
749 /// only types of instructions that support vectors. They also can never be
750 /// uniform so we always evict uniform sources.
751 ///
legalize_ext_instr(op: &mut impl SrcsAsSlice, b: &mut LegalizeBuilder)752 fn legalize_ext_instr(op: &mut impl SrcsAsSlice, b: &mut LegalizeBuilder) {
753 let src_types = op.src_types();
754 for (i, src) in op.srcs_as_mut_slice().iter_mut().enumerate() {
755 match src_types[i] {
756 SrcType::SSA | SrcType::GPR => match &mut src.src_ref {
757 SrcRef::Zero | SrcRef::True | SrcRef::False => {
758 assert!(src_types[i] != SrcType::SSA);
759 }
760 SrcRef::SSA(ssa) => {
761 b.copy_ssa_ref_if_uniform(ssa);
762 }
763 _ => panic!("Unsupported source reference"),
764 },
765 SrcType::ALU
766 | SrcType::F16
767 | SrcType::F16v2
768 | SrcType::F32
769 | SrcType::F64
770 | SrcType::I32
771 | SrcType::B32 => {
772 panic!("ALU srcs must be legalized explicitly");
773 }
774 SrcType::Pred => {
775 panic!("Predicates must be legalized explicitly");
776 }
777 SrcType::Carry => {
778 panic!("Carry is invalid on Volta+");
779 }
780 SrcType::Bar => (),
781 }
782 }
783 }
784
785 //
786 // Implementations of SM70Op for each op we support on Volta+
787 //
788
789 impl SM70Op for OpFAdd {
legalize(&mut self, b: &mut LegalizeBuilder)790 fn legalize(&mut self, b: &mut LegalizeBuilder) {
791 let gpr = op_gpr(self);
792 let [src0, src1] = &mut self.srcs;
793 swap_srcs_if_not_reg(src0, src1, gpr);
794 b.copy_alu_src_if_not_reg(src0, gpr, SrcType::F32);
795 }
796
encode(&self, e: &mut SM70Encoder<'_>)797 fn encode(&self, e: &mut SM70Encoder<'_>) {
798 if src_is_zero_or_gpr(&self.srcs[1]) {
799 e.encode_alu(
800 0x021,
801 Some(&self.dst),
802 Some(&self.srcs[0]),
803 Some(&self.srcs[1]),
804 None,
805 )
806 } else {
807 e.encode_alu(
808 0x021,
809 Some(&self.dst),
810 Some(&self.srcs[0]),
811 Some(&Src::new_zero()),
812 Some(&self.srcs[1]),
813 )
814 };
815 e.set_bit(77, self.saturate);
816 e.set_rnd_mode(78..80, self.rnd_mode);
817 e.set_bit(80, self.ftz);
818 }
819 }
820
821 impl SM70Op for OpFFma {
legalize(&mut self, b: &mut LegalizeBuilder)822 fn legalize(&mut self, b: &mut LegalizeBuilder) {
823 let gpr = op_gpr(self);
824 let [src0, src1, src2] = &mut self.srcs;
825 swap_srcs_if_not_reg(src0, src1, gpr);
826 b.copy_alu_src_if_not_reg(src0, gpr, SrcType::F32);
827 b.copy_alu_src_if_both_not_reg(src1, src2, gpr, SrcType::F32);
828 }
829
encode(&self, e: &mut SM70Encoder<'_>)830 fn encode(&self, e: &mut SM70Encoder<'_>) {
831 e.encode_alu(
832 0x023,
833 Some(&self.dst),
834 Some(&self.srcs[0]),
835 Some(&self.srcs[1]),
836 Some(&self.srcs[2]),
837 );
838 e.set_bit(76, self.dnz);
839 e.set_bit(77, self.saturate);
840 e.set_rnd_mode(78..80, self.rnd_mode);
841 e.set_bit(80, self.ftz);
842 }
843 }
844
845 impl SM70Op for OpFMnMx {
legalize(&mut self, b: &mut LegalizeBuilder)846 fn legalize(&mut self, b: &mut LegalizeBuilder) {
847 let gpr = op_gpr(self);
848 let [src0, src1] = &mut self.srcs;
849 swap_srcs_if_not_reg(src0, src1, gpr);
850 b.copy_alu_src_if_not_reg(src0, gpr, SrcType::F32);
851 }
852
encode(&self, e: &mut SM70Encoder<'_>)853 fn encode(&self, e: &mut SM70Encoder<'_>) {
854 e.encode_alu(
855 0x009,
856 Some(&self.dst),
857 Some(&self.srcs[0]),
858 Some(&self.srcs[1]),
859 Some(&Src::new_zero()),
860 );
861 e.set_pred_src(87..90, 90, self.min);
862 e.set_bit(80, self.ftz);
863 }
864 }
865
866 impl SM70Op for OpFMul {
legalize(&mut self, b: &mut LegalizeBuilder)867 fn legalize(&mut self, b: &mut LegalizeBuilder) {
868 let gpr = op_gpr(self);
869 let [src0, src1] = &mut self.srcs;
870 swap_srcs_if_not_reg(src0, src1, gpr);
871 b.copy_alu_src_if_not_reg(src0, gpr, SrcType::F32);
872 }
873
encode(&self, e: &mut SM70Encoder<'_>)874 fn encode(&self, e: &mut SM70Encoder<'_>) {
875 e.encode_alu(
876 0x020,
877 Some(&self.dst),
878 Some(&self.srcs[0]),
879 Some(&self.srcs[1]),
880 Some(&Src::new_zero()),
881 );
882 e.set_bit(76, self.dnz);
883 e.set_bit(77, self.saturate);
884 e.set_rnd_mode(78..80, self.rnd_mode);
885 e.set_bit(80, self.ftz);
886 e.set_field(84..87, 0x4_u8); // TODO: PDIV
887 }
888 }
889
890 impl SM70Encoder<'_> {
set_float_cmp_op(&mut self, range: Range<usize>, op: FloatCmpOp)891 fn set_float_cmp_op(&mut self, range: Range<usize>, op: FloatCmpOp) {
892 assert!(range.len() == 4);
893 self.set_field(
894 range,
895 match op {
896 FloatCmpOp::OrdLt => 0x01_u8,
897 FloatCmpOp::OrdEq => 0x02_u8,
898 FloatCmpOp::OrdLe => 0x03_u8,
899 FloatCmpOp::OrdGt => 0x04_u8,
900 FloatCmpOp::OrdNe => 0x05_u8,
901 FloatCmpOp::OrdGe => 0x06_u8,
902 FloatCmpOp::UnordLt => 0x09_u8,
903 FloatCmpOp::UnordEq => 0x0a_u8,
904 FloatCmpOp::UnordLe => 0x0b_u8,
905 FloatCmpOp::UnordGt => 0x0c_u8,
906 FloatCmpOp::UnordNe => 0x0d_u8,
907 FloatCmpOp::UnordGe => 0x0e_u8,
908 FloatCmpOp::IsNum => 0x07_u8,
909 FloatCmpOp::IsNan => 0x08_u8,
910 },
911 );
912 }
913
set_pred_set_op(&mut self, range: Range<usize>, op: PredSetOp)914 fn set_pred_set_op(&mut self, range: Range<usize>, op: PredSetOp) {
915 assert!(range.len() == 2);
916 self.set_field(
917 range,
918 match op {
919 PredSetOp::And => 0_u8,
920 PredSetOp::Or => 1_u8,
921 PredSetOp::Xor => 2_u8,
922 },
923 );
924 }
925
set_int_cmp_op(&mut self, range: Range<usize>, op: IntCmpOp)926 fn set_int_cmp_op(&mut self, range: Range<usize>, op: IntCmpOp) {
927 assert!(range.len() == 3);
928 self.set_field(
929 range,
930 match op {
931 IntCmpOp::Eq => 2_u8,
932 IntCmpOp::Ne => 5_u8,
933 IntCmpOp::Lt => 1_u8,
934 IntCmpOp::Le => 3_u8,
935 IntCmpOp::Gt => 4_u8,
936 IntCmpOp::Ge => 6_u8,
937 },
938 );
939 }
940 }
941
942 impl SM70Op for OpFSet {
legalize(&mut self, b: &mut LegalizeBuilder)943 fn legalize(&mut self, b: &mut LegalizeBuilder) {
944 let gpr = op_gpr(self);
945 let [src0, src1] = &mut self.srcs;
946 if !src_is_reg(src0, gpr) && src_is_reg(src1, gpr) {
947 std::mem::swap(src0, src1);
948 self.cmp_op = self.cmp_op.flip();
949 }
950 b.copy_alu_src_if_not_reg(src0, gpr, SrcType::F32);
951 }
952
encode(&self, e: &mut SM70Encoder<'_>)953 fn encode(&self, e: &mut SM70Encoder<'_>) {
954 e.encode_alu(
955 0x00a,
956 Some(&self.dst),
957 Some(&self.srcs[0]),
958 Some(&self.srcs[1]),
959 None,
960 );
961 e.set_float_cmp_op(76..80, self.cmp_op);
962 e.set_bit(80, self.ftz);
963 e.set_field(87..90, 0x7_u8); // TODO: src predicate
964 }
965 }
966
967 impl SM70Op for OpFSetP {
legalize(&mut self, b: &mut LegalizeBuilder)968 fn legalize(&mut self, b: &mut LegalizeBuilder) {
969 let gpr = op_gpr(self);
970 let [src0, src1] = &mut self.srcs;
971 if !src_is_reg(src0, gpr) && src_is_reg(src1, gpr) {
972 std::mem::swap(src0, src1);
973 self.cmp_op = self.cmp_op.flip();
974 }
975 b.copy_alu_src_if_not_reg(src0, gpr, SrcType::F32);
976 }
977
encode(&self, e: &mut SM70Encoder<'_>)978 fn encode(&self, e: &mut SM70Encoder<'_>) {
979 e.encode_alu(
980 0x00b,
981 None,
982 Some(&self.srcs[0]),
983 Some(&self.srcs[1]),
984 None,
985 );
986
987 e.set_pred_set_op(74..76, self.set_op);
988 e.set_float_cmp_op(76..80, self.cmp_op);
989 e.set_bit(80, self.ftz);
990
991 e.set_pred_dst(81..84, self.dst);
992 e.set_pred_dst(84..87, Dst::None); // dst1
993
994 e.set_pred_src(87..90, 90, self.accum);
995 }
996 }
997
998 impl SM70Op for OpFSwzAdd {
legalize(&mut self, b: &mut LegalizeBuilder)999 fn legalize(&mut self, b: &mut LegalizeBuilder) {
1000 let gpr = op_gpr(self);
1001 let [src0, src1] = &mut self.srcs;
1002 b.copy_alu_src_if_not_reg(src0, gpr, SrcType::F32);
1003 b.copy_alu_src_if_not_reg(src1, gpr, SrcType::F32);
1004 }
1005
encode(&self, e: &mut SM70Encoder<'_>)1006 fn encode(&self, e: &mut SM70Encoder<'_>) {
1007 e.set_opcode(0x822);
1008 e.set_dst(self.dst);
1009
1010 e.set_reg_src(24..32, self.srcs[0]);
1011 e.set_reg_src(64..72, self.srcs[1]);
1012
1013 let mut subop = 0x0_u8;
1014
1015 for (i, swz_op) in self.ops.iter().enumerate() {
1016 let swz_op = match swz_op {
1017 FSwzAddOp::Add => 0,
1018 FSwzAddOp::SubRight => 2,
1019 FSwzAddOp::SubLeft => 1,
1020 FSwzAddOp::MoveLeft => 3,
1021 };
1022
1023 subop |= swz_op << ((self.ops.len() - i - 1) * 2);
1024 }
1025
1026 e.set_field(32..40, subop);
1027
1028 e.set_bit(77, false); // NDV
1029 e.set_rnd_mode(78..80, self.rnd_mode);
1030 e.set_bit(80, self.ftz);
1031 }
1032 }
1033
1034 impl SM70Op for OpMuFu {
legalize(&mut self, _b: &mut LegalizeBuilder)1035 fn legalize(&mut self, _b: &mut LegalizeBuilder) {
1036 // Nothing to do
1037 }
1038
encode(&self, e: &mut SM70Encoder<'_>)1039 fn encode(&self, e: &mut SM70Encoder<'_>) {
1040 e.encode_alu(0x108, Some(&self.dst), None, Some(&self.src), None);
1041 e.set_field(
1042 74..80,
1043 match self.op {
1044 MuFuOp::Cos => 0_u8,
1045 MuFuOp::Sin => 1_u8,
1046 MuFuOp::Exp2 => 2_u8,
1047 MuFuOp::Log2 => 3_u8,
1048 MuFuOp::Rcp => 4_u8,
1049 MuFuOp::Rsq => 5_u8,
1050 MuFuOp::Rcp64H => 6_u8,
1051 MuFuOp::Rsq64H => 7_u8,
1052 MuFuOp::Sqrt => 8_u8,
1053 MuFuOp::Tanh => 9_u8,
1054 },
1055 );
1056 }
1057 }
1058
1059 impl SM70Op for OpDAdd {
legalize(&mut self, b: &mut LegalizeBuilder)1060 fn legalize(&mut self, b: &mut LegalizeBuilder) {
1061 let gpr = op_gpr(self);
1062 let [src0, src1] = &mut self.srcs;
1063 swap_srcs_if_not_reg(src0, src1, gpr);
1064 b.copy_alu_src_if_not_reg(src0, gpr, SrcType::F64);
1065 }
1066
encode(&self, e: &mut SM70Encoder<'_>)1067 fn encode(&self, e: &mut SM70Encoder<'_>) {
1068 e.encode_alu(
1069 0x029,
1070 Some(&self.dst),
1071 Some(&self.srcs[0]),
1072 None,
1073 Some(&self.srcs[1]),
1074 );
1075 e.set_rnd_mode(78..80, self.rnd_mode);
1076 }
1077 }
1078
1079 impl SM70Op for OpDFma {
legalize(&mut self, b: &mut LegalizeBuilder)1080 fn legalize(&mut self, b: &mut LegalizeBuilder) {
1081 let gpr = op_gpr(self);
1082 let [src0, src1, src2] = &mut self.srcs;
1083 swap_srcs_if_not_reg(src0, src1, gpr);
1084 b.copy_alu_src_if_not_reg(src0, gpr, SrcType::F64);
1085 b.copy_alu_src_if_both_not_reg(src1, src2, gpr, SrcType::F64);
1086 }
1087
encode(&self, e: &mut SM70Encoder<'_>)1088 fn encode(&self, e: &mut SM70Encoder<'_>) {
1089 e.encode_alu(
1090 0x02b,
1091 Some(&self.dst),
1092 Some(&self.srcs[0]),
1093 Some(&self.srcs[1]),
1094 Some(&self.srcs[2]),
1095 );
1096 e.set_rnd_mode(78..80, self.rnd_mode);
1097 }
1098 }
1099
1100 impl SM70Op for OpDMul {
legalize(&mut self, b: &mut LegalizeBuilder)1101 fn legalize(&mut self, b: &mut LegalizeBuilder) {
1102 let gpr = op_gpr(self);
1103 let [src0, src1] = &mut self.srcs;
1104 swap_srcs_if_not_reg(src0, src1, gpr);
1105 b.copy_alu_src_if_not_reg(src0, gpr, SrcType::F64);
1106 }
1107
encode(&self, e: &mut SM70Encoder<'_>)1108 fn encode(&self, e: &mut SM70Encoder<'_>) {
1109 e.encode_alu(
1110 0x028,
1111 Some(&self.dst),
1112 Some(&self.srcs[0]),
1113 Some(&self.srcs[1]),
1114 None,
1115 );
1116 e.set_rnd_mode(78..80, self.rnd_mode);
1117 }
1118 }
1119
1120 impl SM70Op for OpDSetP {
legalize(&mut self, b: &mut LegalizeBuilder)1121 fn legalize(&mut self, b: &mut LegalizeBuilder) {
1122 let gpr = op_gpr(self);
1123 let [src0, src1] = &mut self.srcs;
1124 if !src_is_reg(src0, gpr) && src_is_reg(src1, gpr) {
1125 std::mem::swap(src0, src1);
1126 self.cmp_op = self.cmp_op.flip();
1127 }
1128 b.copy_alu_src_if_not_reg(src0, gpr, SrcType::F64);
1129 }
1130
encode(&self, e: &mut SM70Encoder<'_>)1131 fn encode(&self, e: &mut SM70Encoder<'_>) {
1132 if src_is_zero_or_gpr(&self.srcs[1]) {
1133 e.encode_alu(
1134 0x02a,
1135 None,
1136 Some(&self.srcs[0]),
1137 Some(&self.srcs[1]),
1138 None,
1139 )
1140 } else {
1141 e.encode_alu(
1142 0x02a,
1143 None,
1144 Some(&self.srcs[0]),
1145 None,
1146 Some(&self.srcs[1]),
1147 )
1148 };
1149
1150 e.set_pred_set_op(74..76, self.set_op);
1151 e.set_float_cmp_op(76..80, self.cmp_op);
1152
1153 e.set_pred_dst(81..84, self.dst);
1154 e.set_pred_dst(84..87, Dst::None); /* dst1 */
1155
1156 e.set_pred_src(87..90, 90, self.accum);
1157 }
1158 }
1159
1160 impl SM70Op for OpHAdd2 {
legalize(&mut self, b: &mut LegalizeBuilder)1161 fn legalize(&mut self, b: &mut LegalizeBuilder) {
1162 let gpr = op_gpr(self);
1163 let [src0, src1] = &mut self.srcs;
1164 swap_srcs_if_not_reg(src0, src1, gpr);
1165 b.copy_alu_src_if_not_reg(src0, gpr, SrcType::F16v2);
1166 }
1167
encode(&self, e: &mut SM70Encoder<'_>)1168 fn encode(&self, e: &mut SM70Encoder<'_>) {
1169 if src_is_zero_or_gpr(&self.srcs[1]) {
1170 e.encode_fp16_alu(
1171 0x030,
1172 Some(&self.dst),
1173 Some(&self.srcs[0]),
1174 Some(&self.srcs[1]),
1175 None,
1176 )
1177 } else {
1178 e.encode_fp16_alu(
1179 0x030,
1180 Some(&self.dst),
1181 Some(&self.srcs[0]),
1182 None,
1183 Some(&self.srcs[1]),
1184 )
1185 };
1186
1187 e.set_bit(77, self.saturate);
1188 e.set_bit(78, self.f32);
1189 e.set_bit(80, self.ftz);
1190 e.set_bit(85, false); // .BF16_V2 (SM90+)
1191 }
1192 }
1193
1194 impl SM70Op for OpHFma2 {
legalize(&mut self, b: &mut LegalizeBuilder)1195 fn legalize(&mut self, b: &mut LegalizeBuilder) {
1196 let gpr = op_gpr(self);
1197 let [src0, src1, src2] = &mut self.srcs;
1198 swap_srcs_if_not_reg(src0, src1, gpr);
1199 b.copy_alu_src_if_not_reg(src0, gpr, SrcType::F16v2);
1200 b.copy_alu_src_if_not_reg(src1, gpr, SrcType::F16v2);
1201 b.copy_alu_src_if_both_not_reg(src1, src2, gpr, SrcType::F16v2);
1202
1203 // HFMA2 doesn't have fabs or fneg on SRC2.
1204 if !src2.src_mod.is_none() {
1205 b.copy_alu_src_and_lower_fmod(src2, SrcType::F16v2);
1206 }
1207 }
1208
encode(&self, e: &mut SM70Encoder<'_>)1209 fn encode(&self, e: &mut SM70Encoder<'_>) {
1210 // HFMA2 doesn't have fneg and fabs on SRC2.
1211 assert!(self.srcs[2].src_mod.is_none());
1212
1213 e.encode_fp16_alu(
1214 0x031,
1215 Some(&self.dst),
1216 Some(&self.srcs[0]),
1217 Some(&self.srcs[1]),
1218 Some(&self.srcs[2]),
1219 );
1220
1221 e.set_bit(76, self.dnz);
1222 e.set_bit(77, self.saturate);
1223 e.set_bit(78, self.f32);
1224 e.set_bit(79, false); // .RELU (SM86+)
1225 e.set_bit(80, self.ftz);
1226 e.set_bit(85, false); // .BF16_V2 (SM86+)
1227 }
1228 }
1229
1230 impl SM70Op for OpHMul2 {
legalize(&mut self, b: &mut LegalizeBuilder)1231 fn legalize(&mut self, b: &mut LegalizeBuilder) {
1232 let gpr = op_gpr(self);
1233 let [src0, src1] = &mut self.srcs;
1234 swap_srcs_if_not_reg(src0, src1, gpr);
1235 b.copy_alu_src_if_not_reg(src0, gpr, SrcType::F16v2);
1236 }
1237
encode(&self, e: &mut SM70Encoder<'_>)1238 fn encode(&self, e: &mut SM70Encoder<'_>) {
1239 e.encode_fp16_alu(
1240 0x032,
1241 Some(&self.dst),
1242 Some(&self.srcs[0]),
1243 Some(&self.srcs[1]),
1244 None,
1245 );
1246
1247 e.set_bit(76, self.dnz);
1248 e.set_bit(77, self.saturate);
1249 e.set_bit(78, false); // .F32 (SM70-SM75)
1250 e.set_bit(79, false); // .RELU (SM86+)
1251 e.set_bit(80, self.ftz);
1252 e.set_bit(85, false); // .BF16_V2 (SM90+)
1253 }
1254 }
1255
1256 impl SM70Op for OpHSet2 {
legalize(&mut self, b: &mut LegalizeBuilder)1257 fn legalize(&mut self, b: &mut LegalizeBuilder) {
1258 let gpr = op_gpr(self);
1259 let [src0, src1] = &mut self.srcs;
1260 if !src_is_reg(src0, gpr) && src_is_reg(src1, gpr) {
1261 std::mem::swap(src0, src1);
1262 self.cmp_op = self.cmp_op.flip();
1263 }
1264 b.copy_alu_src_if_not_reg(src0, gpr, SrcType::F16v2);
1265 }
1266
encode(&self, e: &mut SM70Encoder<'_>)1267 fn encode(&self, e: &mut SM70Encoder<'_>) {
1268 if src_is_zero_or_gpr(&self.srcs[1]) {
1269 e.encode_fp16_alu(
1270 0x033,
1271 Some(&self.dst),
1272 Some(&self.srcs[0]),
1273 Some(&self.srcs[1]),
1274 None,
1275 )
1276 } else {
1277 e.encode_fp16_alu(
1278 0x033,
1279 Some(&self.dst),
1280 Some(&self.srcs[0]),
1281 None,
1282 Some(&self.srcs[1]),
1283 )
1284 };
1285
1286 e.set_bit(65, false); // .BF16_V2 (SM90+)
1287 e.set_pred_set_op(69..71, self.set_op);
1288
1289 // This differentiate between integer and fp16 output
1290 e.set_bit(71, true); // .BF
1291 e.set_float_cmp_op(76..80, self.cmp_op);
1292 e.set_bit(80, self.ftz);
1293
1294 e.set_pred_src(87..90, 90, self.accum);
1295 }
1296 }
1297
1298 impl SM70Op for OpHSetP2 {
legalize(&mut self, b: &mut LegalizeBuilder)1299 fn legalize(&mut self, b: &mut LegalizeBuilder) {
1300 let gpr = op_gpr(self);
1301 let [src0, src1] = &mut self.srcs;
1302 if !src_is_reg(src0, gpr) && src_is_reg(src1, gpr) {
1303 std::mem::swap(src0, src1);
1304 self.cmp_op = self.cmp_op.flip();
1305 }
1306 b.copy_alu_src_if_not_reg(src0, gpr, SrcType::F16v2);
1307 }
1308
encode(&self, e: &mut SM70Encoder<'_>)1309 fn encode(&self, e: &mut SM70Encoder<'_>) {
1310 if src_is_zero_or_gpr(&self.srcs[1]) {
1311 e.encode_fp16_alu(
1312 0x034,
1313 None,
1314 Some(&self.srcs[0]),
1315 Some(&self.srcs[1]),
1316 None,
1317 )
1318 } else {
1319 e.encode_fp16_alu(
1320 0x034,
1321 None,
1322 Some(&self.srcs[0]),
1323 None,
1324 Some(&self.srcs[1]),
1325 )
1326 };
1327
1328 e.set_bit(65, false); // .BF16_V2 (SM90+)
1329 e.set_pred_set_op(69..71, self.set_op);
1330 e.set_bit(71, self.horizontal); // .H_AND
1331 e.set_float_cmp_op(76..80, self.cmp_op);
1332 e.set_bit(80, self.ftz);
1333
1334 e.set_pred_dst(81..84, self.dsts[0]);
1335 e.set_pred_dst(84..87, self.dsts[1]);
1336
1337 e.set_pred_src(87..90, 90, self.accum);
1338 }
1339 }
1340
1341 impl SM70Op for OpHMnMx2 {
legalize(&mut self, b: &mut LegalizeBuilder)1342 fn legalize(&mut self, b: &mut LegalizeBuilder) {
1343 let gpr = op_gpr(self);
1344 let [src0, src1] = &mut self.srcs;
1345 swap_srcs_if_not_reg(src0, src1, gpr);
1346 b.copy_alu_src_if_not_reg(src0, gpr, SrcType::F16v2);
1347 }
1348
encode(&self, e: &mut SM70Encoder<'_>)1349 fn encode(&self, e: &mut SM70Encoder<'_>) {
1350 assert!(e.sm.sm >= 80);
1351
1352 e.encode_fp16_alu(
1353 0x040,
1354 Some(&self.dst),
1355 Some(&self.srcs[0]),
1356 Some(&self.srcs[1]),
1357 None,
1358 );
1359
1360 // This differentiate between integer and fp16 output
1361 e.set_bit(78, false); // .F32 (SM86)
1362 e.set_bit(80, self.ftz);
1363 e.set_bit(81, false); // .NAN
1364 e.set_bit(82, false); // .XORSIGN
1365 e.set_bit(85, false); // .BF16_V2
1366
1367 e.set_pred_src(87..90, 90, self.min);
1368 }
1369 }
1370
1371 impl SM70Op for OpBMsk {
legalize(&mut self, b: &mut LegalizeBuilder)1372 fn legalize(&mut self, b: &mut LegalizeBuilder) {
1373 let gpr = op_gpr(self);
1374 b.copy_alu_src_if_not_reg(&mut self.pos, gpr, SrcType::ALU);
1375 }
1376
encode(&self, e: &mut SM70Encoder<'_>)1377 fn encode(&self, e: &mut SM70Encoder<'_>) {
1378 if self.is_uniform() {
1379 e.encode_ualu(
1380 0x09b,
1381 Some(&self.dst),
1382 Some(&self.pos),
1383 Some(&self.width),
1384 None,
1385 )
1386 } else {
1387 e.encode_alu(
1388 0x01b,
1389 Some(&self.dst),
1390 Some(&self.pos),
1391 Some(&self.width),
1392 None,
1393 )
1394 };
1395
1396 e.set_bit(75, self.wrap);
1397 }
1398 }
1399
1400 impl SM70Op for OpBRev {
legalize(&mut self, _b: &mut LegalizeBuilder)1401 fn legalize(&mut self, _b: &mut LegalizeBuilder) {
1402 // Nothing to do
1403 }
1404
encode(&self, e: &mut SM70Encoder<'_>)1405 fn encode(&self, e: &mut SM70Encoder<'_>) {
1406 if self.is_uniform() {
1407 e.encode_ualu(0x0be, Some(&self.dst), None, Some(&self.src), None)
1408 } else {
1409 e.encode_alu(0x101, Some(&self.dst), None, Some(&self.src), None)
1410 }
1411 }
1412 }
1413
1414 impl SM70Op for OpFlo {
legalize(&mut self, _b: &mut LegalizeBuilder)1415 fn legalize(&mut self, _b: &mut LegalizeBuilder) {
1416 // Nothing to do
1417 }
1418
encode(&self, e: &mut SM70Encoder<'_>)1419 fn encode(&self, e: &mut SM70Encoder<'_>) {
1420 if self.is_uniform() {
1421 e.encode_ualu(0x0bd, Some(&self.dst), None, Some(&self.src), None)
1422 } else {
1423 e.encode_alu(0x100, Some(&self.dst), None, Some(&self.src), None)
1424 };
1425 e.set_pred_dst(81..84, Dst::None);
1426 e.set_field(74..75, self.return_shift_amount as u8);
1427 e.set_field(73..74, self.signed as u8);
1428 let not_mod = matches!(self.src.src_mod, SrcMod::BNot);
1429 e.set_field(63..64, not_mod)
1430 }
1431 }
1432
1433 impl SM70Op for OpIAbs {
legalize(&mut self, _b: &mut LegalizeBuilder)1434 fn legalize(&mut self, _b: &mut LegalizeBuilder) {
1435 // Nothing to do
1436 }
1437
encode(&self, e: &mut SM70Encoder<'_>)1438 fn encode(&self, e: &mut SM70Encoder<'_>) {
1439 e.encode_alu(0x013, Some(&self.dst), None, Some(&self.src), None)
1440 }
1441 }
1442
1443 impl SM70Op for OpIAdd3 {
legalize(&mut self, b: &mut LegalizeBuilder)1444 fn legalize(&mut self, b: &mut LegalizeBuilder) {
1445 let gpr = op_gpr(self);
1446 let [src0, src1, src2] = &mut self.srcs;
1447 swap_srcs_if_not_reg(src0, src1, gpr);
1448 swap_srcs_if_not_reg(src2, src1, gpr);
1449 if !src0.src_mod.is_none() && !src1.src_mod.is_none() {
1450 assert!(self.overflow[0].is_none());
1451 assert!(self.overflow[1].is_none());
1452 let val = b.alloc_ssa(gpr, 1);
1453 b.push_op(OpIAdd3 {
1454 srcs: [Src::new_zero(), *src0, Src::new_zero()],
1455 overflow: [Dst::None; 2],
1456 dst: val.into(),
1457 });
1458 *src0 = val.into();
1459 }
1460 b.copy_alu_src_if_not_reg(src0, gpr, SrcType::I32);
1461 b.copy_alu_src_if_both_not_reg(src1, src2, gpr, SrcType::I32);
1462 if !self.overflow[0].is_none() || !self.overflow[1].is_none() {
1463 b.copy_alu_src_if_ineg_imm(src1, gpr, SrcType::I32);
1464 b.copy_alu_src_if_ineg_imm(src2, gpr, SrcType::I32);
1465 }
1466 }
1467
encode(&self, e: &mut SM70Encoder<'_>)1468 fn encode(&self, e: &mut SM70Encoder<'_>) {
1469 // Hardware requires at least one of these be unmodified
1470 assert!(
1471 self.srcs[0].src_mod.is_none() || self.srcs[1].src_mod.is_none()
1472 );
1473
1474 if self.is_uniform() {
1475 e.encode_ualu(
1476 0x090,
1477 Some(&self.dst),
1478 Some(&self.srcs[0]),
1479 Some(&self.srcs[1]),
1480 Some(&self.srcs[2]),
1481 )
1482 } else {
1483 e.encode_alu(
1484 0x010,
1485 Some(&self.dst),
1486 Some(&self.srcs[0]),
1487 Some(&self.srcs[1]),
1488 Some(&self.srcs[2]),
1489 )
1490 };
1491
1492 e.set_pred_src(87..90, 90, false.into());
1493 e.set_pred_src(77..80, 80, false.into());
1494
1495 e.set_pred_dst(81..84, self.overflow[0]);
1496 e.set_pred_dst(84..87, self.overflow[1]);
1497 }
1498 }
1499
1500 impl SM70Op for OpIAdd3X {
legalize(&mut self, b: &mut LegalizeBuilder)1501 fn legalize(&mut self, b: &mut LegalizeBuilder) {
1502 let gpr = op_gpr(self);
1503 let [src0, src1, src2] = &mut self.srcs;
1504 swap_srcs_if_not_reg(src0, src1, gpr);
1505 swap_srcs_if_not_reg(src2, src1, gpr);
1506 if !src0.src_mod.is_none() && !src1.src_mod.is_none() {
1507 let val = b.alloc_ssa(gpr, 1);
1508 b.push_op(OpIAdd3X {
1509 srcs: [Src::new_zero(), *src0, Src::new_zero()],
1510 overflow: [Dst::None; 2],
1511 dst: val.into(),
1512 carry: [false.into(); 2],
1513 });
1514 *src0 = val.into();
1515 }
1516 b.copy_alu_src_if_not_reg(src0, gpr, SrcType::B32);
1517 b.copy_alu_src_if_both_not_reg(src1, src2, gpr, SrcType::B32);
1518 if !self.is_uniform() {
1519 b.copy_src_if_upred(&mut self.carry[0]);
1520 b.copy_src_if_upred(&mut self.carry[1]);
1521 }
1522 }
1523
encode(&self, e: &mut SM70Encoder<'_>)1524 fn encode(&self, e: &mut SM70Encoder<'_>) {
1525 // Hardware requires at least one of these be unmodified
1526 assert!(
1527 self.srcs[0].src_mod.is_none() || self.srcs[1].src_mod.is_none()
1528 );
1529
1530 if self.is_uniform() {
1531 e.encode_ualu(
1532 0x090,
1533 Some(&self.dst),
1534 Some(&self.srcs[0]),
1535 Some(&self.srcs[1]),
1536 Some(&self.srcs[2]),
1537 );
1538
1539 e.set_upred_src(87..90, 90, self.carry[0]);
1540 e.set_upred_src(77..80, 80, self.carry[1]);
1541 } else {
1542 e.encode_alu(
1543 0x010,
1544 Some(&self.dst),
1545 Some(&self.srcs[0]),
1546 Some(&self.srcs[1]),
1547 Some(&self.srcs[2]),
1548 );
1549
1550 e.set_pred_src(87..90, 90, self.carry[0]);
1551 e.set_pred_src(77..80, 80, self.carry[1]);
1552 }
1553
1554 e.set_bit(74, true); // .X
1555
1556 e.set_pred_dst(81..84, self.overflow[0]);
1557 e.set_pred_dst(84..87, self.overflow[1]);
1558 }
1559 }
1560
1561 impl SM70Op for OpIDp4 {
legalize(&mut self, b: &mut LegalizeBuilder)1562 fn legalize(&mut self, b: &mut LegalizeBuilder) {
1563 let gpr = op_gpr(self);
1564 let [src_type0, src_type1] = &mut self.src_types;
1565 let [src0, src1, src2] = &mut self.srcs;
1566 if swap_srcs_if_not_reg(src0, src1, gpr) {
1567 std::mem::swap(src_type0, src_type1);
1568 }
1569 b.copy_alu_src_if_not_reg(src0, gpr, SrcType::ALU);
1570 b.copy_alu_src_if_ineg_imm(src1, gpr, SrcType::I32);
1571 b.copy_alu_src_if_not_reg(src2, gpr, SrcType::ALU);
1572 }
1573
encode(&self, e: &mut SM70Encoder<'_>)1574 fn encode(&self, e: &mut SM70Encoder<'_>) {
1575 e.encode_alu(
1576 0x026,
1577 Some(&self.dst),
1578 Some(&self.srcs[0]),
1579 Some(&self.srcs[1]),
1580 Some(&self.srcs[2]),
1581 );
1582
1583 e.set_bit(
1584 73,
1585 match self.src_types[0] {
1586 IntType::U8 => false,
1587 IntType::I8 => true,
1588 _ => panic!("Invalid DP4 source type"),
1589 },
1590 );
1591 e.set_bit(
1592 74,
1593 match self.src_types[1] {
1594 IntType::U8 => false,
1595 IntType::I8 => true,
1596 _ => panic!("Invalid DP4 source type"),
1597 },
1598 );
1599 }
1600 }
1601
1602 impl SM70Op for OpIMad {
legalize(&mut self, b: &mut LegalizeBuilder)1603 fn legalize(&mut self, b: &mut LegalizeBuilder) {
1604 let gpr = op_gpr(self);
1605 let [src0, src1, src2] = &mut self.srcs;
1606 swap_srcs_if_not_reg(src0, src1, gpr);
1607 b.copy_alu_src_if_not_reg(src0, gpr, SrcType::ALU);
1608 b.copy_alu_src_if_both_not_reg(src1, src2, gpr, SrcType::ALU);
1609 }
1610
encode(&self, e: &mut SM70Encoder<'_>)1611 fn encode(&self, e: &mut SM70Encoder<'_>) {
1612 if self.is_uniform() {
1613 e.encode_ualu(
1614 0x0a4,
1615 Some(&self.dst),
1616 Some(&self.srcs[0]),
1617 Some(&self.srcs[1]),
1618 Some(&self.srcs[2]),
1619 )
1620 } else {
1621 e.encode_alu(
1622 0x024,
1623 Some(&self.dst),
1624 Some(&self.srcs[0]),
1625 Some(&self.srcs[1]),
1626 Some(&self.srcs[2]),
1627 )
1628 };
1629 e.set_pred_dst(81..84, Dst::None);
1630 e.set_bit(73, self.signed);
1631 }
1632 }
1633
1634 impl SM70Op for OpIMad64 {
legalize(&mut self, b: &mut LegalizeBuilder)1635 fn legalize(&mut self, b: &mut LegalizeBuilder) {
1636 let gpr = op_gpr(self);
1637 let [src0, src1, src2] = &mut self.srcs;
1638 swap_srcs_if_not_reg(src0, src1, gpr);
1639 b.copy_alu_src_if_not_reg(src0, gpr, SrcType::ALU);
1640 b.copy_alu_src_if_both_not_reg(src1, src2, gpr, SrcType::ALU);
1641 }
1642
encode(&self, e: &mut SM70Encoder<'_>)1643 fn encode(&self, e: &mut SM70Encoder<'_>) {
1644 if self.is_uniform() {
1645 e.encode_ualu(
1646 0x0a5,
1647 Some(&self.dst),
1648 Some(&self.srcs[0]),
1649 Some(&self.srcs[1]),
1650 Some(&self.srcs[2]),
1651 )
1652 } else {
1653 e.encode_alu(
1654 0x025,
1655 Some(&self.dst),
1656 Some(&self.srcs[0]),
1657 Some(&self.srcs[1]),
1658 Some(&self.srcs[2]),
1659 )
1660 };
1661 e.set_pred_dst(81..84, Dst::None);
1662 e.set_bit(73, self.signed);
1663 }
1664 }
1665
1666 impl SM70Op for OpIMnMx {
legalize(&mut self, b: &mut LegalizeBuilder)1667 fn legalize(&mut self, b: &mut LegalizeBuilder) {
1668 let gpr = op_gpr(self);
1669 let [src0, src1] = &mut self.srcs;
1670 swap_srcs_if_not_reg(src0, src1, gpr);
1671 b.copy_alu_src_if_not_reg(src0, gpr, SrcType::ALU);
1672 }
1673
encode(&self, e: &mut SM70Encoder<'_>)1674 fn encode(&self, e: &mut SM70Encoder<'_>) {
1675 e.encode_alu(
1676 0x017,
1677 Some(&self.dst),
1678 Some(&self.srcs[0]),
1679 Some(&self.srcs[1]),
1680 None,
1681 );
1682 e.set_pred_src(87..90, 90, self.min);
1683 e.set_bit(
1684 73,
1685 match self.cmp_type {
1686 IntCmpType::U32 => false,
1687 IntCmpType::I32 => true,
1688 },
1689 );
1690 }
1691 }
1692
1693 impl SM70Op for OpISetP {
legalize(&mut self, b: &mut LegalizeBuilder)1694 fn legalize(&mut self, b: &mut LegalizeBuilder) {
1695 let gpr = op_gpr(self);
1696 let [src0, src1] = &mut self.srcs;
1697 if !src_is_reg(src0, gpr) && src_is_reg(src1, gpr) {
1698 std::mem::swap(src0, src1);
1699 self.cmp_op = self.cmp_op.flip();
1700 }
1701 b.copy_alu_src_if_not_reg(src0, gpr, SrcType::ALU);
1702 if !self.is_uniform() {
1703 b.copy_src_if_upred(&mut self.low_cmp);
1704 b.copy_src_if_upred(&mut self.accum);
1705 }
1706 }
1707
encode(&self, e: &mut SM70Encoder<'_>)1708 fn encode(&self, e: &mut SM70Encoder<'_>) {
1709 if self.is_uniform() {
1710 e.encode_ualu(
1711 0x08c,
1712 None,
1713 Some(&self.srcs[0]),
1714 Some(&self.srcs[1]),
1715 None,
1716 );
1717
1718 e.set_upred_src(68..71, 71, self.low_cmp);
1719 e.set_upred_src(87..90, 90, self.accum);
1720 } else {
1721 e.encode_alu(
1722 0x00c,
1723 None,
1724 Some(&self.srcs[0]),
1725 Some(&self.srcs[1]),
1726 None,
1727 );
1728
1729 e.set_pred_src(68..71, 71, self.low_cmp);
1730 e.set_pred_src(87..90, 90, self.accum);
1731 }
1732
1733 e.set_bit(72, self.ex);
1734
1735 e.set_field(
1736 73..74,
1737 match self.cmp_type {
1738 IntCmpType::U32 => 0_u32,
1739 IntCmpType::I32 => 1_u32,
1740 },
1741 );
1742 e.set_pred_set_op(74..76, self.set_op);
1743 e.set_int_cmp_op(76..79, self.cmp_op);
1744
1745 e.set_pred_dst(81..84, self.dst);
1746 e.set_pred_dst(84..87, Dst::None); // dst1
1747 }
1748 }
1749
src_as_lop_imm(src: &Src) -> Option<bool>1750 fn src_as_lop_imm(src: &Src) -> Option<bool> {
1751 let x = match src.src_ref {
1752 SrcRef::Zero => false,
1753 SrcRef::True => true,
1754 SrcRef::False => false,
1755 SrcRef::Imm32(i) => {
1756 if i == 0 {
1757 false
1758 } else if i == !0 {
1759 true
1760 } else {
1761 return None;
1762 }
1763 }
1764 _ => return None,
1765 };
1766 Some(x ^ src.src_mod.is_bnot())
1767 }
1768
fold_lop_src(src: &Src, x: &mut u8)1769 fn fold_lop_src(src: &Src, x: &mut u8) {
1770 if let Some(i) = src_as_lop_imm(src) {
1771 *x = if i { !0 } else { 0 };
1772 }
1773 if src.src_mod.is_bnot() {
1774 *x = !*x;
1775 }
1776 }
1777
1778 impl SM70Op for OpLop3 {
legalize(&mut self, b: &mut LegalizeBuilder)1779 fn legalize(&mut self, b: &mut LegalizeBuilder) {
1780 let gpr = op_gpr(self);
1781 // Fold constants and modifiers if we can
1782 self.op = LogicOp3::new_lut(&|mut x, mut y, mut z| {
1783 fold_lop_src(&self.srcs[0], &mut x);
1784 fold_lop_src(&self.srcs[1], &mut y);
1785 fold_lop_src(&self.srcs[2], &mut z);
1786 self.op.eval(x, y, z)
1787 });
1788 for src in &mut self.srcs {
1789 src.src_mod = SrcMod::None;
1790 if src_as_lop_imm(src).is_some() {
1791 src.src_ref = SrcRef::Zero;
1792 }
1793 }
1794
1795 let [src0, src1, src2] = &mut self.srcs;
1796 if !src_is_reg(src0, gpr) && src_is_reg(src1, gpr) {
1797 std::mem::swap(src0, src1);
1798 self.op = LogicOp3::new_lut(&|x, y, z| self.op.eval(y, x, z))
1799 }
1800 if !src_is_reg(src2, gpr) && src_is_reg(src1, gpr) {
1801 std::mem::swap(src2, src1);
1802 self.op = LogicOp3::new_lut(&|x, y, z| self.op.eval(x, z, y))
1803 }
1804
1805 b.copy_alu_src_if_not_reg(src0, gpr, SrcType::ALU);
1806 b.copy_alu_src_if_not_reg(src2, gpr, SrcType::ALU);
1807 }
1808
encode(&self, e: &mut SM70Encoder<'_>)1809 fn encode(&self, e: &mut SM70Encoder<'_>) {
1810 if self.is_uniform() {
1811 e.encode_ualu(
1812 0x092,
1813 Some(&self.dst),
1814 Some(&self.srcs[0]),
1815 Some(&self.srcs[1]),
1816 Some(&self.srcs[2]),
1817 );
1818
1819 e.set_upred_src(87..90, 90, SrcRef::False.into());
1820 } else {
1821 e.encode_alu(
1822 0x012,
1823 Some(&self.dst),
1824 Some(&self.srcs[0]),
1825 Some(&self.srcs[1]),
1826 Some(&self.srcs[2]),
1827 );
1828
1829 e.set_pred_src(87..90, 90, SrcRef::False.into());
1830 }
1831
1832 e.set_field(72..80, self.op.lut);
1833 e.set_bit(80, false); // .PAND
1834 e.set_field(81..84, 7_u32); // pred
1835 }
1836 }
1837
1838 impl SM70Op for OpPopC {
legalize(&mut self, _b: &mut LegalizeBuilder)1839 fn legalize(&mut self, _b: &mut LegalizeBuilder) {
1840 // Nothing to do
1841 }
1842
encode(&self, e: &mut SM70Encoder<'_>)1843 fn encode(&self, e: &mut SM70Encoder<'_>) {
1844 if self.is_uniform() {
1845 e.encode_ualu(0x0bf, Some(&self.dst), None, Some(&self.src), None)
1846 } else {
1847 e.encode_alu(0x109, Some(&self.dst), None, Some(&self.src), None)
1848 };
1849
1850 let not_mod = matches!(self.src.src_mod, SrcMod::BNot);
1851 e.set_field(63..64, not_mod);
1852 }
1853 }
1854
1855 impl SM70Op for OpShf {
legalize(&mut self, b: &mut LegalizeBuilder)1856 fn legalize(&mut self, b: &mut LegalizeBuilder) {
1857 let gpr = op_gpr(self);
1858 b.copy_alu_src_if_not_reg(&mut self.low, gpr, SrcType::ALU);
1859 b.copy_alu_src_if_both_not_reg(
1860 &self.shift,
1861 &mut self.high,
1862 gpr,
1863 SrcType::ALU,
1864 );
1865 }
1866
encode(&self, e: &mut SM70Encoder<'_>)1867 fn encode(&self, e: &mut SM70Encoder<'_>) {
1868 if self.is_uniform() {
1869 e.encode_ualu(
1870 0x099,
1871 Some(&self.dst),
1872 Some(&self.low),
1873 Some(&self.shift),
1874 Some(&self.high),
1875 )
1876 } else {
1877 e.encode_alu(
1878 0x019,
1879 Some(&self.dst),
1880 Some(&self.low),
1881 Some(&self.shift),
1882 Some(&self.high),
1883 )
1884 };
1885
1886 e.set_field(
1887 73..75,
1888 match self.data_type {
1889 IntType::I64 => 0_u8,
1890 IntType::U64 => 1_u8,
1891 IntType::I32 => 2_u8,
1892 IntType::U32 => 3_u8,
1893 _ => panic!("Invalid shift data type"),
1894 },
1895 );
1896 e.set_bit(75, self.wrap);
1897 e.set_bit(76, self.right);
1898 e.set_bit(80, self.dst_high);
1899 }
1900 }
1901
1902 impl SM70Op for OpF2F {
legalize(&mut self, _b: &mut LegalizeBuilder)1903 fn legalize(&mut self, _b: &mut LegalizeBuilder) {
1904 // Nothing to do
1905 }
1906
encode(&self, e: &mut SM70Encoder<'_>)1907 fn encode(&self, e: &mut SM70Encoder<'_>) {
1908 assert!(!self.integer_rnd);
1909 if self.src_type.bits() <= 32 && self.dst_type.bits() <= 32 {
1910 e.encode_alu(0x104, Some(&self.dst), None, Some(&self.src), None)
1911 } else {
1912 e.encode_alu(0x110, Some(&self.dst), None, Some(&self.src), None)
1913 };
1914
1915 if self.high {
1916 e.set_field(60..62, 1_u8); // .H1
1917 }
1918
1919 e.set_field(75..77, (self.dst_type.bits() / 8).ilog2());
1920 e.set_rnd_mode(78..80, self.rnd_mode);
1921 e.set_bit(80, self.ftz);
1922 e.set_field(84..86, (self.src_type.bits() / 8).ilog2());
1923 }
1924 }
1925
1926 impl SM70Op for OpF2FP {
legalize(&mut self, b: &mut LegalizeBuilder)1927 fn legalize(&mut self, b: &mut LegalizeBuilder) {
1928 let gpr = op_gpr(self);
1929 let [src0, src1] = &mut self.srcs;
1930 swap_srcs_if_not_reg(src0, src1, gpr);
1931
1932 b.copy_alu_src_if_not_reg(src0, gpr, SrcType::ALU);
1933 }
1934
encode(&self, e: &mut SM70Encoder<'_>)1935 fn encode(&self, e: &mut SM70Encoder<'_>) {
1936 e.encode_alu(
1937 0x03e,
1938 Some(&self.dst),
1939 Some(&self.srcs[0]),
1940 Some(&self.srcs[1]),
1941 Some(&Src::new_zero()),
1942 );
1943
1944 // .MERGE_C behavior
1945 // Use src1 and src2, src0 is unused
1946 // src1 get converted and packed in the lower 16 bits of dest.
1947 // src2 lower or high 16 bits (decided by .H1 flag) get packed in the upper of dest.
1948 e.set_bit(78, false); // TODO: .MERGE_C
1949 e.set_bit(72, false); // .H1 (MERGE_C only)
1950 e.set_rnd_mode(79..81, self.rnd_mode);
1951 }
1952 }
1953
1954 impl SM70Op for OpF2I {
legalize(&mut self, _b: &mut LegalizeBuilder)1955 fn legalize(&mut self, _b: &mut LegalizeBuilder) {
1956 // Nothing to do
1957 }
1958
encode(&self, e: &mut SM70Encoder<'_>)1959 fn encode(&self, e: &mut SM70Encoder<'_>) {
1960 if self.src_type.bits() <= 32 && self.dst_type.bits() <= 32 {
1961 e.encode_alu(0x105, Some(&self.dst), None, Some(&self.src), None)
1962 } else {
1963 e.encode_alu(0x111, Some(&self.dst), None, Some(&self.src), None)
1964 };
1965
1966 e.set_bit(72, self.dst_type.is_signed());
1967 e.set_field(75..77, (self.dst_type.bits() / 8).ilog2());
1968 e.set_bit(77, false); // NTZ
1969 e.set_rnd_mode(78..80, self.rnd_mode);
1970 e.set_bit(80, self.ftz);
1971 e.set_field(84..86, (self.src_type.bits() / 8).ilog2());
1972 }
1973 }
1974
1975 impl SM70Op for OpI2F {
legalize(&mut self, _b: &mut LegalizeBuilder)1976 fn legalize(&mut self, _b: &mut LegalizeBuilder) {
1977 // Nothing to do
1978 }
1979
encode(&self, e: &mut SM70Encoder<'_>)1980 fn encode(&self, e: &mut SM70Encoder<'_>) {
1981 if self.src_type.bits() <= 32 && self.dst_type.bits() <= 32 {
1982 e.encode_alu(0x106, Some(&self.dst), None, Some(&self.src), None)
1983 } else {
1984 e.encode_alu(0x112, Some(&self.dst), None, Some(&self.src), None)
1985 };
1986
1987 e.set_field(60..62, 0_u8); // TODO: subop
1988 e.set_bit(74, self.src_type.is_signed());
1989 e.set_field(75..77, (self.dst_type.bits() / 8).ilog2());
1990 e.set_rnd_mode(78..80, self.rnd_mode);
1991 e.set_field(84..86, (self.src_type.bits() / 8).ilog2());
1992 }
1993 }
1994
1995 impl SM70Op for OpFRnd {
legalize(&mut self, _b: &mut LegalizeBuilder)1996 fn legalize(&mut self, _b: &mut LegalizeBuilder) {
1997 // Nothing to do
1998 }
1999
encode(&self, e: &mut SM70Encoder<'_>)2000 fn encode(&self, e: &mut SM70Encoder<'_>) {
2001 if self.src_type.bits() <= 32 && self.dst_type.bits() <= 32 {
2002 e.encode_alu(0x107, Some(&self.dst), None, Some(&self.src), None)
2003 } else {
2004 e.encode_alu(0x113, Some(&self.dst), None, Some(&self.src), None)
2005 };
2006
2007 e.set_field(84..86, (self.src_type.bits() / 8).ilog2());
2008 e.set_bit(80, self.ftz);
2009 e.set_rnd_mode(78..80, self.rnd_mode);
2010 e.set_field(75..77, (self.dst_type.bits() / 8).ilog2());
2011 }
2012 }
2013
2014 impl SM70Op for OpMov {
legalize(&mut self, _b: &mut LegalizeBuilder)2015 fn legalize(&mut self, _b: &mut LegalizeBuilder) {
2016 // Nothing to do
2017 }
2018
encode(&self, e: &mut SM70Encoder<'_>)2019 fn encode(&self, e: &mut SM70Encoder<'_>) {
2020 if self.is_uniform() {
2021 e.set_opcode(0xc82);
2022 e.set_udst(self.dst);
2023
2024 // umov is encoded like a non-uniform ALU op
2025 let src = ALUSrc::from_src(Some(&self.src), true);
2026 let form: u8 = match &src {
2027 ALUSrc::Reg(reg) => {
2028 e.encode_alu_ureg(reg, false);
2029 0x6 // form
2030 }
2031 ALUSrc::Imm32(imm) => {
2032 e.encode_alu_imm(imm);
2033 0x4 // form
2034 }
2035 _ => panic!("Invalid umov src"),
2036 };
2037 e.set_field(9..12, form);
2038 } else {
2039 e.encode_alu(0x002, Some(&self.dst), None, Some(&self.src), None);
2040 e.set_field(72..76, self.quad_lanes);
2041 }
2042 }
2043 }
2044
2045 impl SM70Op for OpPrmt {
legalize(&mut self, b: &mut LegalizeBuilder)2046 fn legalize(&mut self, b: &mut LegalizeBuilder) {
2047 let gpr = op_gpr(self);
2048 let [src0, src1] = &mut self.srcs;
2049 b.copy_alu_src_if_not_reg(src0, gpr, SrcType::ALU);
2050 b.copy_alu_src_if_not_reg(src1, gpr, SrcType::ALU);
2051 }
2052
encode(&self, e: &mut SM70Encoder<'_>)2053 fn encode(&self, e: &mut SM70Encoder<'_>) {
2054 if self.is_uniform() {
2055 e.encode_ualu(
2056 0x96,
2057 Some(&self.dst),
2058 Some(&self.srcs[0]),
2059 Some(&self.sel),
2060 Some(&self.srcs[1]),
2061 )
2062 } else {
2063 e.encode_alu(
2064 0x16,
2065 Some(&self.dst),
2066 Some(&self.srcs[0]),
2067 Some(&self.sel),
2068 Some(&self.srcs[1]),
2069 )
2070 };
2071
2072 e.set_field(
2073 72..75,
2074 match self.mode {
2075 PrmtMode::Index => 0_u8,
2076 PrmtMode::Forward4Extract => 1_u8,
2077 PrmtMode::Backward4Extract => 2_u8,
2078 PrmtMode::Replicate8 => 3_u8,
2079 PrmtMode::EdgeClampLeft => 4_u8,
2080 PrmtMode::EdgeClampRight => 5_u8,
2081 PrmtMode::Replicate16 => 6_u8,
2082 },
2083 );
2084 }
2085 }
2086
2087 impl SM70Op for OpSel {
legalize(&mut self, b: &mut LegalizeBuilder)2088 fn legalize(&mut self, b: &mut LegalizeBuilder) {
2089 let gpr = op_gpr(self);
2090 if !self.is_uniform() {
2091 b.copy_src_if_upred(&mut self.cond);
2092 }
2093 let [src0, src1] = &mut self.srcs;
2094 if swap_srcs_if_not_reg(src0, src1, gpr) {
2095 self.cond = self.cond.bnot();
2096 }
2097 b.copy_alu_src_if_not_reg(src0, gpr, SrcType::ALU);
2098 }
2099
encode(&self, e: &mut SM70Encoder<'_>)2100 fn encode(&self, e: &mut SM70Encoder<'_>) {
2101 if self.is_uniform() {
2102 e.encode_ualu(
2103 0x087,
2104 Some(&self.dst),
2105 Some(&self.srcs[0]),
2106 Some(&self.srcs[1]),
2107 None,
2108 );
2109
2110 e.set_upred_src(87..90, 90, self.cond);
2111 } else {
2112 e.encode_alu(
2113 0x007,
2114 Some(&self.dst),
2115 Some(&self.srcs[0]),
2116 Some(&self.srcs[1]),
2117 None,
2118 );
2119
2120 e.set_pred_src(87..90, 90, self.cond);
2121 }
2122 }
2123 }
2124
2125 impl SM70Op for OpShfl {
legalize(&mut self, b: &mut LegalizeBuilder)2126 fn legalize(&mut self, b: &mut LegalizeBuilder) {
2127 let gpr = op_gpr(self);
2128 b.copy_alu_src_if_not_reg(&mut self.src, gpr, SrcType::GPR);
2129 b.copy_alu_src_if_not_reg_or_imm(&mut self.lane, gpr, SrcType::ALU);
2130 b.copy_alu_src_if_not_reg_or_imm(&mut self.c, gpr, SrcType::ALU);
2131 }
2132
encode(&self, e: &mut SM70Encoder<'_>)2133 fn encode(&self, e: &mut SM70Encoder<'_>) {
2134 assert!(self.lane.src_mod.is_none());
2135 assert!(self.c.src_mod.is_none());
2136
2137 match &self.lane.src_ref {
2138 SrcRef::Zero | SrcRef::Reg(_) => match &self.c.src_ref {
2139 SrcRef::Zero | SrcRef::Reg(_) => {
2140 e.set_opcode(0x389);
2141 e.set_reg_src(32..40, self.lane);
2142 e.set_reg_src(64..72, self.c);
2143 }
2144 SrcRef::Imm32(imm_c) => {
2145 e.set_opcode(0x589);
2146 e.set_reg_src(32..40, self.lane);
2147 e.set_field(40..53, *imm_c & 0x1f1f);
2148 }
2149 _ => panic!("Invalid instruction form"),
2150 },
2151 SrcRef::Imm32(imm_lane) => match &self.c.src_ref {
2152 SrcRef::Zero | SrcRef::Reg(_) => {
2153 e.set_opcode(0x989);
2154 e.set_field(53..58, *imm_lane & 0x1f);
2155 e.set_reg_src(64..72, self.c);
2156 }
2157 SrcRef::Imm32(imm_c) => {
2158 e.set_opcode(0xf89);
2159 e.set_field(40..53, *imm_c & 0x1f1f);
2160 e.set_field(53..58, *imm_lane & 0x1f);
2161 }
2162 _ => panic!("Invalid instruction form"),
2163 },
2164 _ => panic!("Invalid instruction form"),
2165 };
2166
2167 e.set_dst(self.dst);
2168 e.set_pred_dst(81..84, self.in_bounds);
2169 e.set_reg_src(24..32, self.src);
2170 e.set_field(
2171 58..60,
2172 match self.op {
2173 ShflOp::Idx => 0_u8,
2174 ShflOp::Up => 1_u8,
2175 ShflOp::Down => 2_u8,
2176 ShflOp::Bfly => 3_u8,
2177 },
2178 );
2179 }
2180 }
2181
impl SM70Op for OpPLop3 {
    /// Rewrites the sources and both LUTs in place before encoding.
    ///
    /// Source modifiers and constant sources are folded into the LUTs, and
    /// in the non-uniform form a uniform predicate register found in
    /// `srcs[0]` or `srcs[1]` is swapped into `srcs[2]` when that slot does
    /// not already hold one.  The first two sources are then handed to
    /// `copy_src_if_upred`.
    fn legalize(&mut self, b: &mut LegalizeBuilder) {
        // Fold constants and modifiers if we can.  Each LUT is rebuilt from
        // a closure that runs fold_lop_src() on x, y and z for the matching
        // source and then evaluates the previous LUT on the results.
        for lop in &mut self.ops {
            *lop = LogicOp3::new_lut(&|mut x, mut y, mut z| {
                fold_lop_src(&self.srcs[0], &mut x);
                fold_lop_src(&self.srcs[1], &mut y);
                fold_lop_src(&self.srcs[2], &mut z);
                lop.eval(x, y, z)
            });
        }
        // The LUTs now account for the folding above, so clear every source
        // modifier and turn any source that src_as_lop_imm() recognizes
        // into True.
        for src in &mut self.srcs {
            src.src_mod = SrcMod::None;
            if src_as_lop_imm(src).is_some() {
                src.src_ref = SrcRef::True;
            }
        }

        if !self.is_uniform() {
            // The warp form of plop3 allows a single uniform predicate in
            // src2.  If we have a uniform predicate anywhere, try to move it
            // there.
            let [src0, src1, src2] = &mut self.srcs;
            if src_is_upred_reg(src0) && !src_is_upred_reg(src2) {
                std::mem::swap(src0, src2);
                // Sources 0 and 2 traded places, so swap the x and z inputs
                // of every LUT to keep the result unchanged.
                for lop in &mut self.ops {
                    *lop = LogicOp3::new_lut(&|x, y, z| lop.eval(z, y, x))
                }
            }
            if src_is_upred_reg(src1) && !src_is_upred_reg(src2) {
                std::mem::swap(src1, src2);
                // Sources 1 and 2 traded places, so swap the y and z inputs
                // of every LUT to keep the result unchanged.
                for lop in &mut self.ops {
                    *lop = LogicOp3::new_lut(&|x, y, z| lop.eval(x, z, y))
                }
            }
            b.copy_src_if_upred(src0);
            b.copy_src_if_upred(src1);
        }
    }

    /// Emits opcode 0x89c for the uniform form and 0x81c otherwise, followed
    /// by the two LUTs and the two predicate destinations.
    fn encode(&self, e: &mut SM70Encoder<'_>) {
        if self.is_uniform() {
            e.set_opcode(0x89c);

            // All three sources are written as uniform predicates.
            e.set_upred_src(68..71, 71, self.srcs[2]);
            e.set_upred_src(77..80, 80, self.srcs[1]);
            e.set_upred_src(87..90, 90, self.srcs[0]);
        } else {
            e.set_opcode(0x81c);

            // Only srcs[2] may be a uniform register here; when it is, it is
            // written as a uniform predicate and bit 67 is set.
            if self.srcs[2]
                .src_ref
                .as_reg()
                .is_some_and(|r| r.is_uniform())
            {
                e.set_upred_src(68..71, 71, self.srcs[2]);
                e.set_bit(67, true);
            } else {
                e.set_pred_src(68..71, 71, self.srcs[2]);
            }
            e.set_pred_src(77..80, 80, self.srcs[1]);
            e.set_pred_src(87..90, 90, self.srcs[0]);
        }
        // ops[1]'s LUT is stored whole in bits 16..24.  ops[0]'s LUT is
        // split: its low three bits go in 64..67 and the rest in 72..77.
        e.set_field(16..24, self.ops[1].lut);
        e.set_field(64..67, self.ops[0].lut & 0x7);
        e.set_field(72..77, self.ops[0].lut >> 3);

        e.set_pred_dst(81..84, self.dsts[0]);
        e.set_pred_dst(84..87, self.dsts[1]);
    }
}
2253
impl SM70Op for OpR2UR {
    fn legalize(&mut self, _b: &mut LegalizeBuilder) {
        // Nothing to do
    }

    /// Emits opcode 0x3c2 with `dst` written through `set_udst` and `src`
    /// as the register source in bits 24..32.
    fn encode(&self, e: &mut SM70Encoder<'_>) {
        e.set_opcode(0x3c2);
        e.set_udst(self.dst);
        e.set_reg_src(24..32, self.src);
        // The predicate destination slot (bits 81..84) is encoded as
        // Dst::None.
        e.set_pred_dst(81..84, Dst::None);
    }
}
2266
2267 impl SM70Encoder<'_> {
set_tex_dim(&mut self, range: Range<usize>, dim: TexDim)2268 fn set_tex_dim(&mut self, range: Range<usize>, dim: TexDim) {
2269 assert!(range.len() == 3);
2270 self.set_field(
2271 range,
2272 match dim {
2273 TexDim::_1D => 0_u8,
2274 TexDim::Array1D => 4_u8,
2275 TexDim::_2D => 1_u8,
2276 TexDim::Array2D => 5_u8,
2277 TexDim::_3D => 2_u8,
2278 TexDim::Cube => 3_u8,
2279 TexDim::ArrayCube => 7_u8,
2280 },
2281 );
2282 }
2283
set_tex_lod_mode(&mut self, range: Range<usize>, lod_mode: TexLodMode)2284 fn set_tex_lod_mode(&mut self, range: Range<usize>, lod_mode: TexLodMode) {
2285 assert!(range.len() == 3);
2286 self.set_field(
2287 range,
2288 match lod_mode {
2289 TexLodMode::Auto => 0_u8,
2290 TexLodMode::Zero => 1_u8,
2291 TexLodMode::Bias => 2_u8,
2292 TexLodMode::Lod => 3_u8,
2293 TexLodMode::Clamp => 4_u8,
2294 TexLodMode::BiasClamp => 5_u8,
2295 },
2296 );
2297 }
2298
set_image_dim(&mut self, range: Range<usize>, dim: ImageDim)2299 fn set_image_dim(&mut self, range: Range<usize>, dim: ImageDim) {
2300 assert!(range.len() == 3);
2301 self.set_field(
2302 range,
2303 match dim {
2304 ImageDim::_1D => 0_u8,
2305 ImageDim::_1DBuffer => 1_u8,
2306 ImageDim::_1DArray => 2_u8,
2307 ImageDim::_2D => 3_u8,
2308 ImageDim::_2DArray => 4_u8,
2309 ImageDim::_3D => 5_u8,
2310 },
2311 );
2312 }
2313 }
2314
2315 impl SM70Op for OpTex {
legalize(&mut self, b: &mut LegalizeBuilder)2316 fn legalize(&mut self, b: &mut LegalizeBuilder) {
2317 legalize_ext_instr(self, b);
2318 }
2319
encode(&self, e: &mut SM70Encoder<'_>)2320 fn encode(&self, e: &mut SM70Encoder<'_>) {
2321 e.set_opcode(0x361);
2322 e.set_bit(59, true); // .B
2323
2324 e.set_dst(self.dsts[0]);
2325 if let Dst::Reg(reg) = self.dsts[1] {
2326 e.set_reg(64..72, reg);
2327 } else {
2328 e.set_field(64..72, 255_u8);
2329 }
2330 e.set_pred_dst(81..84, self.fault);
2331
2332 e.set_reg_src(24..32, self.srcs[0]);
2333 e.set_reg_src(32..40, self.srcs[1]);
2334
2335 e.set_tex_dim(61..64, self.dim);
2336 e.set_field(72..76, self.mask);
2337 e.set_bit(76, self.offset);
2338 e.set_bit(77, false); // ToDo: NDV
2339 e.set_bit(78, self.z_cmpr);
2340 e.set_field(84..87, 1);
2341 e.set_tex_lod_mode(87..90, self.lod_mode);
2342 e.set_bit(90, false); // TODO: .NODEP
2343 }
2344 }
2345
2346 impl SM70Op for OpTld {
legalize(&mut self, b: &mut LegalizeBuilder)2347 fn legalize(&mut self, b: &mut LegalizeBuilder) {
2348 legalize_ext_instr(self, b);
2349 }
2350
encode(&self, e: &mut SM70Encoder<'_>)2351 fn encode(&self, e: &mut SM70Encoder<'_>) {
2352 e.set_opcode(0x367);
2353 e.set_bit(59, true); // .B
2354
2355 e.set_dst(self.dsts[0]);
2356 if let Dst::Reg(reg) = self.dsts[1] {
2357 e.set_reg(64..72, reg);
2358 } else {
2359 e.set_field(64..72, 255_u8);
2360 }
2361 e.set_pred_dst(81..84, self.fault);
2362
2363 e.set_reg_src(24..32, self.srcs[0]);
2364 e.set_reg_src(32..40, self.srcs[1]);
2365
2366 e.set_tex_dim(61..64, self.dim);
2367 e.set_field(72..76, self.mask);
2368 e.set_bit(76, self.offset);
2369 // bit 77: .CL
2370 e.set_bit(78, self.is_ms);
2371 // bits 79..81: .F16
2372 assert!(
2373 self.lod_mode == TexLodMode::Zero
2374 || self.lod_mode == TexLodMode::Lod
2375 );
2376 e.set_tex_lod_mode(87..90, self.lod_mode);
2377 e.set_bit(90, false); // TODO: .NODEP
2378 }
2379 }
2380
2381 impl SM70Op for OpTld4 {
legalize(&mut self, b: &mut LegalizeBuilder)2382 fn legalize(&mut self, b: &mut LegalizeBuilder) {
2383 legalize_ext_instr(self, b);
2384 }
2385
encode(&self, e: &mut SM70Encoder<'_>)2386 fn encode(&self, e: &mut SM70Encoder<'_>) {
2387 e.set_opcode(0x364);
2388 e.set_bit(59, true); // .B
2389
2390 e.set_dst(self.dsts[0]);
2391 if let Dst::Reg(reg) = self.dsts[1] {
2392 e.set_reg(64..72, reg);
2393 } else {
2394 e.set_field(64..72, 255_u8);
2395 }
2396 e.set_pred_dst(81..84, self.fault);
2397
2398 e.set_reg_src(24..32, self.srcs[0]);
2399 e.set_reg_src(32..40, self.srcs[1]);
2400
2401 e.set_tex_dim(61..64, self.dim);
2402 e.set_field(72..76, self.mask);
2403 e.set_field(
2404 76..78,
2405 match self.offset_mode {
2406 Tld4OffsetMode::None => 0_u8,
2407 Tld4OffsetMode::AddOffI => 1_u8,
2408 Tld4OffsetMode::PerPx => 2_u8,
2409 },
2410 );
2411 // bit 77: .CL
2412 e.set_bit(78, self.z_cmpr);
2413 e.set_bit(84, true); // !.EF
2414 e.set_field(87..89, self.comp);
2415 e.set_bit(90, false); // TODO: .NODEP
2416 }
2417 }
2418
2419 impl SM70Op for OpTmml {
legalize(&mut self, b: &mut LegalizeBuilder)2420 fn legalize(&mut self, b: &mut LegalizeBuilder) {
2421 legalize_ext_instr(self, b);
2422 }
2423
encode(&self, e: &mut SM70Encoder<'_>)2424 fn encode(&self, e: &mut SM70Encoder<'_>) {
2425 e.set_opcode(0x36a);
2426 e.set_bit(59, true); // .B
2427
2428 e.set_dst(self.dsts[0]);
2429 if let Dst::Reg(reg) = self.dsts[1] {
2430 e.set_reg(64..72, reg);
2431 } else {
2432 e.set_field(64..72, 255_u8);
2433 }
2434
2435 e.set_reg_src(24..32, self.srcs[0]);
2436 e.set_reg_src(32..40, self.srcs[1]);
2437
2438 e.set_tex_dim(61..64, self.dim);
2439 e.set_field(72..76, self.mask);
2440 e.set_bit(77, false); // ToDo: NDV
2441 e.set_bit(90, false); // TODO: .NODEP
2442 }
2443 }
2444
2445 impl SM70Op for OpTxd {
legalize(&mut self, b: &mut LegalizeBuilder)2446 fn legalize(&mut self, b: &mut LegalizeBuilder) {
2447 legalize_ext_instr(self, b);
2448 }
2449
encode(&self, e: &mut SM70Encoder<'_>)2450 fn encode(&self, e: &mut SM70Encoder<'_>) {
2451 e.set_opcode(0x36d);
2452 e.set_bit(59, true); // .B
2453
2454 e.set_dst(self.dsts[0]);
2455 if let Dst::Reg(reg) = self.dsts[1] {
2456 e.set_reg(64..72, reg);
2457 } else {
2458 e.set_field(64..72, 255_u8);
2459 }
2460 e.set_pred_dst(81..84, self.fault);
2461
2462 e.set_reg_src(24..32, self.srcs[0]);
2463 e.set_reg_src(32..40, self.srcs[1]);
2464
2465 e.set_tex_dim(61..64, self.dim);
2466 e.set_field(72..76, self.mask);
2467 e.set_bit(76, self.offset);
2468 e.set_bit(77, false); // ToDo: NDV
2469 e.set_bit(90, false); // TODO: .NODEP
2470 }
2471 }
2472
2473 impl SM70Op for OpTxq {
legalize(&mut self, b: &mut LegalizeBuilder)2474 fn legalize(&mut self, b: &mut LegalizeBuilder) {
2475 legalize_ext_instr(self, b);
2476 }
2477
encode(&self, e: &mut SM70Encoder<'_>)2478 fn encode(&self, e: &mut SM70Encoder<'_>) {
2479 e.set_opcode(0x370);
2480 e.set_bit(59, true); // .B
2481
2482 e.set_dst(self.dsts[0]);
2483 if let Dst::Reg(reg) = self.dsts[1] {
2484 e.set_reg(64..72, reg);
2485 } else {
2486 e.set_field(64..72, 255_u8);
2487 }
2488
2489 e.set_reg_src(24..32, self.src);
2490 e.set_field(
2491 62..64,
2492 match self.query {
2493 TexQuery::Dimension => 0_u8,
2494 TexQuery::TextureType => 1_u8,
2495 TexQuery::SamplerPos => 2_u8,
2496 },
2497 );
2498 e.set_field(72..76, self.mask);
2499 }
2500 }
2501
2502 impl SM70Encoder<'_> {
set_mem_order(&mut self, order: &MemOrder)2503 fn set_mem_order(&mut self, order: &MemOrder) {
2504 if self.sm.sm < 80 {
2505 let scope = match order {
2506 MemOrder::Constant => MemScope::System,
2507 MemOrder::Weak => MemScope::CTA,
2508 MemOrder::Strong(s) => *s,
2509 };
2510 self.set_field(
2511 77..79,
2512 match scope {
2513 MemScope::CTA => 0_u8,
2514 // SM => 1_u8,
2515 MemScope::GPU => 2_u8,
2516 MemScope::System => 3_u8,
2517 },
2518 );
2519 self.set_field(
2520 79..81,
2521 match order {
2522 MemOrder::Constant => 0_u8,
2523 MemOrder::Weak => 1_u8,
2524 MemOrder::Strong(_) => 2_u8,
2525 // MMIO => 3_u8,
2526 },
2527 );
2528 } else {
2529 self.set_field(
2530 77..81,
2531 match order {
2532 MemOrder::Constant => 0x4_u8,
2533 MemOrder::Weak => 0x0_u8,
2534 MemOrder::Strong(MemScope::CTA) => 0x5_u8,
2535 MemOrder::Strong(MemScope::GPU) => 0x7_u8,
2536 MemOrder::Strong(MemScope::System) => 0xa_u8,
2537 },
2538 );
2539 }
2540 }
2541
set_eviction_priority(&mut self, pri: &MemEvictionPriority)2542 fn set_eviction_priority(&mut self, pri: &MemEvictionPriority) {
2543 self.set_field(
2544 84..86,
2545 match pri {
2546 MemEvictionPriority::First => 0_u8,
2547 MemEvictionPriority::Normal => 1_u8,
2548 MemEvictionPriority::Last => 2_u8,
2549 MemEvictionPriority::Unchanged => 3_u8,
2550 },
2551 );
2552 }
2553
set_mem_type(&mut self, range: Range<usize>, mem_type: MemType)2554 fn set_mem_type(&mut self, range: Range<usize>, mem_type: MemType) {
2555 assert!(range.len() == 3);
2556 self.set_field(
2557 range,
2558 match mem_type {
2559 MemType::U8 => 0_u8,
2560 MemType::I8 => 1_u8,
2561 MemType::U16 => 2_u8,
2562 MemType::I16 => 3_u8,
2563 MemType::B32 => 4_u8,
2564 MemType::B64 => 5_u8,
2565 MemType::B128 => 6_u8,
2566 },
2567 );
2568 }
2569
set_mem_access(&mut self, access: &MemAccess)2570 fn set_mem_access(&mut self, access: &MemAccess) {
2571 self.set_field(
2572 72..73,
2573 match access.space.addr_type() {
2574 MemAddrType::A32 => 0_u8,
2575 MemAddrType::A64 => 1_u8,
2576 },
2577 );
2578 self.set_mem_type(73..76, access.mem_type);
2579 self.set_mem_order(&access.order);
2580 self.set_eviction_priority(&access.eviction_priority);
2581 }
2582 }
2583
impl SM70Op for OpSuLd {
    /// Defers entirely to `legalize_ext_instr`.
    fn legalize(&mut self, b: &mut LegalizeBuilder) {
        legalize_ext_instr(self, b);
    }

    /// Emits opcode 0x998.
    ///
    /// Panics unless `mask` is 0x1, 0x3 or 0xf.
    fn encode(&self, e: &mut SM70Encoder<'_>) {
        e.set_opcode(0x998);

        // Operands: coordinate in 24..32, handle in 64..72, fault predicate
        // in 81..84.
        e.set_dst(self.dst);
        e.set_reg_src(24..32, self.coord);
        e.set_reg_src(64..72, self.handle);
        e.set_pred_dst(81..84, self.fault);

        e.set_image_dim(61..64, self.image_dim);
        e.set_mem_order(&self.mem_order);
        e.set_eviction_priority(&self.mem_eviction_priority);

        // Only these three mask values are accepted.
        assert!(self.mask == 0x1 || self.mask == 0x3 || self.mask == 0xf);
        e.set_field(72..76, self.mask);
    }
}
2605
impl SM70Op for OpSuSt {
    /// Defers entirely to `legalize_ext_instr`.
    fn legalize(&mut self, b: &mut LegalizeBuilder) {
        legalize_ext_instr(self, b);
    }

    /// Emits opcode 0x99c.
    ///
    /// Panics unless `mask` is 0x1, 0x3 or 0xf.
    fn encode(&self, e: &mut SM70Encoder<'_>) {
        e.set_opcode(0x99c);

        // Operands: coordinate in 24..32, data in 32..40, handle in 64..72.
        e.set_reg_src(24..32, self.coord);
        e.set_reg_src(32..40, self.data);
        e.set_reg_src(64..72, self.handle);

        e.set_image_dim(61..64, self.image_dim);
        e.set_mem_order(&self.mem_order);
        e.set_eviction_priority(&self.mem_eviction_priority);

        // Only these three mask values are accepted.
        assert!(self.mask == 0x1 || self.mask == 0x3 || self.mask == 0xf);
        e.set_field(72..76, self.mask);
    }
}
2626
2627 impl SM70Op for OpSuAtom {
legalize(&mut self, b: &mut LegalizeBuilder)2628 fn legalize(&mut self, b: &mut LegalizeBuilder) {
2629 legalize_ext_instr(self, b);
2630 }
2631
encode(&self, e: &mut SM70Encoder<'_>)2632 fn encode(&self, e: &mut SM70Encoder<'_>) {
2633 if self.dst.is_none() {
2634 e.set_opcode(0x3a0);
2635 e.set_atom_op(87..90, self.atom_op);
2636 } else if let AtomOp::CmpExch(cmp_src) = self.atom_op {
2637 e.set_opcode(0x396);
2638 assert!(cmp_src == AtomCmpSrc::Packed);
2639 } else {
2640 e.set_opcode(0x394);
2641 e.set_atom_op(87..91, self.atom_op);
2642 };
2643
2644 e.set_dst(self.dst);
2645 e.set_reg_src(24..32, self.coord);
2646 e.set_reg_src(32..40, self.data);
2647 e.set_reg_src(64..72, self.handle);
2648 e.set_pred_dst(81..84, self.fault);
2649
2650 e.set_image_dim(61..64, self.image_dim);
2651 e.set_mem_order(&self.mem_order);
2652 e.set_eviction_priority(&self.mem_eviction_priority);
2653
2654 e.set_bit(72, false); // .BA
2655 e.set_atom_type(73..76, self.atom_type);
2656 }
2657 }
2658
impl SM70Op for OpLd {
    /// Defers entirely to `legalize_ext_instr`.
    fn legalize(&mut self, b: &mut LegalizeBuilder) {
        legalize_ext_instr(self, b);
    }

    /// Emits a load whose opcode depends on the memory space: 0x381 for
    /// global, 0x983 for local and 0x984 for shared.
    ///
    /// Panics for local and shared loads unless the ordering is
    /// `MemOrder::Strong(MemScope::CTA)` and the eviction priority is
    /// `MemEvictionPriority::Normal`.
    fn encode(&self, e: &mut SM70Encoder<'_>) {
        match self.access.space {
            MemSpace::Global(_) => {
                e.set_opcode(0x381);
                e.set_pred_dst(81..84, Dst::None);
                // Global loads encode the full access description.
                e.set_mem_access(&self.access);
            }
            MemSpace::Local => {
                e.set_opcode(0x983);
                e.set_field(84..87, 1_u8);

                // Only the memory type is encoded; ordering and eviction
                // priority are checked instead of written.
                e.set_mem_type(73..76, self.access.mem_type);
                assert!(self.access.order == MemOrder::Strong(MemScope::CTA));
                assert!(
                    self.access.eviction_priority
                        == MemEvictionPriority::Normal
                );
            }
            MemSpace::Shared => {
                e.set_opcode(0x984);

                // Only the memory type is encoded; ordering and eviction
                // priority are checked instead of written.
                e.set_mem_type(73..76, self.access.mem_type);
                assert!(self.access.order == MemOrder::Strong(MemScope::CTA));
                assert!(
                    self.access.eviction_priority
                        == MemEvictionPriority::Normal
                );

                e.set_bit(87, false); // !.ZD - Returns a predicate?
            }
        }

        // Common to every space: destination, address register in 24..32
        // and the offset in 40..64.
        e.set_dst(self.dst);
        e.set_reg_src(24..32, self.addr);
        e.set_field(40..64, self.offset);
    }
}
2701
impl SM70Op for OpLdc {
    /// Passes `self.offset` to `copy_alu_src_if_not_reg`, using the register
    /// file returned by `op_gpr(self)`.
    fn legalize(&mut self, b: &mut LegalizeBuilder) {
        let gpr = op_gpr(self);
        b.copy_alu_src_if_not_reg(&mut self.offset, gpr, SrcType::GPR);
    }

    /// Emits a constant-buffer load for a bound or bindless (UGPR handle)
    /// buffer, in either the uniform or the non-uniform form.
    ///
    /// Panics if `cb` is not a cbuf source, if the buffer is still a
    /// `CBuf::BindlessSSA`, if a uniform load has a non-zero `offset`, or if
    /// a uniform or bindless load uses a mode other than `LdcMode::Indexed`.
    fn encode(&self, e: &mut SM70Encoder<'_>) {
        let SrcRef::CBuf(cb) = &self.cb.src_ref else {
            panic!("LDC must take a cbuf source");
        };

        match cb.buf {
            CBuf::Binding(idx) => {
                if self.is_uniform() {
                    e.set_opcode(0xab9);
                    e.set_udst(self.dst);

                    // The uniform form takes no offset register and no mode.
                    assert!(self.offset.is_zero());
                    assert!(self.mode == LdcMode::Indexed);
                } else {
                    e.set_opcode(0xb82);
                    e.set_dst(self.dst);

                    e.set_reg_src(24..32, self.offset);
                    e.set_field(
                        78..80,
                        match self.mode {
                            LdcMode::Indexed => 0_u8,
                            LdcMode::IndexedLinear => 1_u8,
                            LdcMode::IndexedSegmented => 2_u8,
                            LdcMode::IndexedSegmentedLinear => 3_u8,
                        },
                    );
                }
                // Binding index in 54..59.
                e.set_field(54..59, idx);
                e.set_bit(91, false); // Bound
            }
            CBuf::BindlessUGPR(handle) => {
                if self.is_uniform() {
                    e.set_opcode(0xab9);
                    e.set_udst(self.dst);

                    assert!(self.offset.is_zero());
                } else {
                    e.set_opcode(0x582);
                    e.set_dst(self.dst);

                    e.set_reg_src(64..72, self.offset);
                }

                // The handle takes bits 24..32, so the offset register sits
                // in 64..72 for the bindless form.
                e.set_ureg(24..32, handle);
                // NOTE(review): on the non-uniform path this repeats the
                // write made in the `else` branch above; on the uniform path
                // `offset` has been asserted to be zero.  Confirm whether
                // the unconditional write is intended.
                e.set_reg_src(64..72, self.offset);
                assert!(self.mode == LdcMode::Indexed);
                e.set_bit(91, true); // Bindless
            }
            CBuf::BindlessSSA(_) => panic!("SSA values must be lowered"),
        }

        // Common to every form: cbuf offset in 38..54 and memory type in
        // 73..76.
        e.set_field(38..54, cb.offset);
        e.set_mem_type(73..76, self.mem_type);
    }
}
2764
impl SM70Op for OpSt {
    /// Defers entirely to `legalize_ext_instr`.
    fn legalize(&mut self, b: &mut LegalizeBuilder) {
        legalize_ext_instr(self, b);
    }

    /// Emits a store whose opcode depends on the memory space: 0x386 for
    /// global, 0x387 for local and 0x388 for shared.
    ///
    /// Panics for local and shared stores unless the ordering is
    /// `MemOrder::Strong(MemScope::CTA)` and the eviction priority is
    /// `MemEvictionPriority::Normal`.
    fn encode(&self, e: &mut SM70Encoder<'_>) {
        match self.access.space {
            MemSpace::Global(_) => {
                e.set_opcode(0x386);
                // Global stores encode the full access description.
                e.set_mem_access(&self.access);
            }
            MemSpace::Local => {
                e.set_opcode(0x387);
                e.set_field(84..87, 1_u8);

                // Only the memory type is encoded; ordering and eviction
                // priority are checked instead of written.
                e.set_mem_type(73..76, self.access.mem_type);
                assert!(self.access.order == MemOrder::Strong(MemScope::CTA));
                assert!(
                    self.access.eviction_priority
                        == MemEvictionPriority::Normal
                );
            }
            MemSpace::Shared => {
                e.set_opcode(0x388);

                // Only the memory type is encoded; ordering and eviction
                // priority are checked instead of written.
                e.set_mem_type(73..76, self.access.mem_type);
                assert!(self.access.order == MemOrder::Strong(MemScope::CTA));
                assert!(
                    self.access.eviction_priority
                        == MemEvictionPriority::Normal
                );
            }
        }

        // Common to every space: address register in 24..32, data register
        // in 32..40 and the offset in 40..64.
        e.set_reg_src(24..32, self.addr);
        e.set_reg_src(32..40, self.data);
        e.set_field(40..64, self.offset);
    }
}
2804
2805 impl SM70Encoder<'_> {
set_atom_op(&mut self, range: Range<usize>, atom_op: AtomOp)2806 fn set_atom_op(&mut self, range: Range<usize>, atom_op: AtomOp) {
2807 self.set_field(
2808 range,
2809 match atom_op {
2810 AtomOp::Add => 0_u8,
2811 AtomOp::Min => 1_u8,
2812 AtomOp::Max => 2_u8,
2813 AtomOp::Inc => 3_u8,
2814 AtomOp::Dec => 4_u8,
2815 AtomOp::And => 5_u8,
2816 AtomOp::Or => 6_u8,
2817 AtomOp::Xor => 7_u8,
2818 AtomOp::Exch => 8_u8,
2819 AtomOp::CmpExch(_) => panic!("CmpExch is a separate opcode"),
2820 },
2821 );
2822 }
2823
set_atom_type(&mut self, range: Range<usize>, atom_type: AtomType)2824 fn set_atom_type(&mut self, range: Range<usize>, atom_type: AtomType) {
2825 assert!(range.len() == 3);
2826 self.set_field(
2827 range,
2828 match atom_type {
2829 AtomType::U32 => 0_u8,
2830 AtomType::I32 => 1_u8,
2831 AtomType::U64 => 2_u8,
2832 AtomType::F32 => 3_u8,
2833 AtomType::F16x2 => 4_u8,
2834 AtomType::I64 => 5_u8,
2835 AtomType::F64 => 6_u8,
2836 },
2837 );
2838 }
2839 }
2840
impl SM70Op for OpAtom {
    /// Defers entirely to `legalize_ext_instr`.
    fn legalize(&mut self, b: &mut LegalizeBuilder) {
        legalize_ext_instr(self, b);
    }

    /// Emits an atomic on global or shared memory.
    ///
    /// The opcode depends on the memory space, on whether a destination is
    /// present (global only) and on whether the operation is a
    /// compare-exchange.
    ///
    /// Panics for `MemSpace::Local`, for a compare-exchange whose source is
    /// not `AtomCmpSrc::Separate`, and for shared atomics whose ordering is
    /// not `MemOrder::Strong(MemScope::CTA)` or whose eviction priority is
    /// not `MemEvictionPriority::Normal`.
    fn encode(&self, e: &mut SM70Encoder<'_>) {
        match self.mem_space {
            MemSpace::Global(_) => {
                if self.dst.is_none() {
                    // No destination: 3-bit atom op in 87..90.
                    e.set_opcode(0x98e);

                    e.set_reg_src(32..40, self.data);
                    e.set_atom_op(87..90, self.atom_op);
                } else if let AtomOp::CmpExch(cmp_src) = self.atom_op {
                    e.set_opcode(0x3a9);

                    // Compare value in 32..40, data moves to 64..72.
                    assert!(cmp_src == AtomCmpSrc::Separate);
                    e.set_reg_src(32..40, self.cmpr);
                    e.set_reg_src(64..72, self.data);
                } else {
                    // With a destination: 4-bit atom op in 87..91.
                    e.set_opcode(0x3a8);

                    e.set_reg_src(32..40, self.data);
                    e.set_atom_op(87..91, self.atom_op);
                }

                e.set_pred_dst(81..84, Dst::None);

                // Address width in bit 72.
                e.set_field(
                    72..73,
                    match self.mem_space.addr_type() {
                        MemAddrType::A32 => 0_u8,
                        MemAddrType::A64 => 1_u8,
                    },
                );

                e.set_mem_order(&self.mem_order);
                e.set_eviction_priority(&self.mem_eviction_priority);
            }
            MemSpace::Local => panic!("Atomics do not support local"),
            MemSpace::Shared => {
                if let AtomOp::CmpExch(cmp_src) = self.atom_op {
                    e.set_opcode(0x38d);

                    // Compare value in 32..40, data moves to 64..72.
                    assert!(cmp_src == AtomCmpSrc::Separate);
                    e.set_reg_src(32..40, self.cmpr);
                    e.set_reg_src(64..72, self.data);
                } else {
                    e.set_opcode(0x38c);

                    e.set_reg_src(32..40, self.data);
                    e.set_atom_op(87..91, self.atom_op);
                }

                // Ordering and eviction priority are checked rather than
                // encoded for shared atomics.
                assert!(self.mem_order == MemOrder::Strong(MemScope::CTA));
                assert!(
                    self.mem_eviction_priority == MemEvictionPriority::Normal
                );
            }
        }

        // Common to both spaces: destination, address register in 24..32,
        // address offset in 40..64 and atom type in 73..76.
        e.set_dst(self.dst);
        e.set_reg_src(24..32, self.addr);
        e.set_field(40..64, self.addr_offset);
        e.set_atom_type(73..76, self.atom_type);
    }
}
2908
impl SM70Op for OpAL2P {
    /// Defers entirely to `legalize_ext_instr`.
    fn legalize(&mut self, b: &mut LegalizeBuilder) {
        legalize_ext_instr(self, b);
    }

    /// Emits opcode 0x920.
    ///
    /// Panics if `access.patch` is set.
    fn encode(&self, e: &mut SM70Encoder<'_>) {
        e.set_opcode(0x920);

        e.set_dst(self.dst);
        e.set_reg_src(24..32, self.offset);

        // Attribute address in 40..50; the component count field is always
        // written as zero here.
        e.set_field(40..50, self.access.addr);
        e.set_field(74..76, 0_u8); // comps
        assert!(!self.access.patch);
        e.set_bit(79, self.access.output);
    }
}
2926
impl SM70Op for OpALd {
    /// Defers entirely to `legalize_ext_instr`.
    fn legalize(&mut self, b: &mut LegalizeBuilder) {
        legalize_ext_instr(self, b);
    }

    /// Emits opcode 0x321.
    fn encode(&self, e: &mut SM70Encoder<'_>) {
        e.set_opcode(0x321);

        // Operands: offset register in 24..32, vertex register in 32..40.
        e.set_dst(self.dst);
        e.set_reg_src(32..40, self.vtx);
        e.set_reg_src(24..32, self.offset);

        // Attribute address in 40..50.  The component count is stored
        // minus one in 74..76, followed by the patch, phys and output flags.
        e.set_field(40..50, self.access.addr);
        e.set_field(74..76, self.access.comps - 1);
        e.set_field(76..77, self.access.patch);
        e.set_field(77..78, self.access.phys);
        e.set_field(79..80, self.access.output);
    }
}
2946
impl SM70Op for OpASt {
    /// Defers entirely to `legalize_ext_instr`.
    fn legalize(&mut self, b: &mut LegalizeBuilder) {
        legalize_ext_instr(self, b);
    }

    /// Emits opcode 0x322.
    ///
    /// Panics if `access.output` is not set.
    fn encode(&self, e: &mut SM70Encoder<'_>) {
        e.set_opcode(0x322);

        // Operands: offset register in 24..32, data register in 32..40 and
        // vertex register in 64..72.
        e.set_reg_src(32..40, self.data);
        e.set_reg_src(64..72, self.vtx);
        e.set_reg_src(24..32, self.offset);

        // Attribute address in 40..50.  The component count is stored
        // minus one in 74..76, followed by the patch and phys flags.
        e.set_field(40..50, self.access.addr);
        e.set_field(74..76, self.access.comps - 1);
        e.set_field(76..77, self.access.patch);
        e.set_field(77..78, self.access.phys);
        // The output flag is checked rather than encoded.
        assert!(self.access.output);
    }
}
2966
2967 impl SM70Op for OpIpa {
legalize(&mut self, b: &mut LegalizeBuilder)2968 fn legalize(&mut self, b: &mut LegalizeBuilder) {
2969 legalize_ext_instr(self, b);
2970 }
2971
encode(&self, e: &mut SM70Encoder<'_>)2972 fn encode(&self, e: &mut SM70Encoder<'_>) {
2973 e.set_opcode(0x326);
2974
2975 e.set_dst(self.dst);
2976
2977 assert!(self.addr % 4 == 0);
2978 e.set_field(64..72, self.addr >> 2);
2979
2980 e.set_field(
2981 76..78,
2982 match self.loc {
2983 InterpLoc::Default => 0_u8,
2984 InterpLoc::Centroid => 1_u8,
2985 InterpLoc::Offset => 2_u8,
2986 },
2987 );
2988 e.set_field(
2989 78..80,
2990 match self.freq {
2991 InterpFreq::Pass => 0_u8,
2992 InterpFreq::Constant => 1_u8,
2993 InterpFreq::State => 2_u8,
2994 InterpFreq::PassMulW => {
2995 panic!("InterpFreq::PassMulW is invalid on SM70+");
2996 }
2997 },
2998 );
2999
3000 assert!(self.inv_w.is_zero());
3001 e.set_reg_src(32..40, self.offset);
3002
3003 // TODO: What is this for?
3004 e.set_pred_dst(81..84, Dst::None);
3005 }
3006 }
3007
impl SM70Op for OpLdTram {
    /// Defers entirely to `legalize_ext_instr`.
    fn legalize(&mut self, b: &mut LegalizeBuilder) {
        legalize_ext_instr(self, b);
    }

    /// Emits opcode 0x3ad.
    ///
    /// Panics if `addr` is not a multiple of four.
    fn encode(&self, e: &mut SM70Encoder<'_>) {
        e.set_opcode(0x3ad);
        e.set_dst(self.dst);
        // The uniform register slot in 24..32 always holds the zero UGPR.
        e.set_ureg(24..32, RegRef::zero(RegFile::UGPR, 1));

        // The address is stored divided by four.
        assert!(self.addr % 4 == 0);
        e.set_field(64..72, self.addr >> 2);

        e.set_bit(72, self.use_c);

        // Unknown but required
        e.set_bit(91, true);
    }
}
3027
3028 impl SM70Op for OpCCtl {
legalize(&mut self, b: &mut LegalizeBuilder)3029 fn legalize(&mut self, b: &mut LegalizeBuilder) {
3030 legalize_ext_instr(self, b);
3031 }
3032
encode(&self, e: &mut SM70Encoder<'_>)3033 fn encode(&self, e: &mut SM70Encoder<'_>) {
3034 assert!(matches!(self.mem_space, MemSpace::Global(_)));
3035 e.set_opcode(0x98f);
3036
3037 e.set_reg_src(24..32, self.addr);
3038 e.set_field(32..64, self.addr_offset);
3039
3040 e.set_field(
3041 87..91,
3042 match self.op {
3043 CCtlOp::PF1 => 0_u8,
3044 CCtlOp::PF2 => 1_u8,
3045 CCtlOp::WB => 2_u8,
3046 CCtlOp::IV => 3_u8,
3047 CCtlOp::IVAll => 4_u8,
3048 CCtlOp::RS => 5_u8,
3049 CCtlOp::IVAllP => 6_u8,
3050 CCtlOp::WBAll => 7_u8,
3051 CCtlOp::WBAllP => 8_u8,
3052 op => panic!("Unsupported cache control {op:?}"),
3053 },
3054 );
3055 }
3056 }
3057
3058 impl SM70Op for OpMemBar {
legalize(&mut self, _b: &mut LegalizeBuilder)3059 fn legalize(&mut self, _b: &mut LegalizeBuilder) {
3060 // Nothing to do
3061 }
3062
encode(&self, e: &mut SM70Encoder<'_>)3063 fn encode(&self, e: &mut SM70Encoder<'_>) {
3064 e.set_opcode(0x992);
3065
3066 e.set_bit(72, false); // !.MMIO
3067 e.set_field(
3068 76..79,
3069 match self.scope {
3070 MemScope::CTA => 0_u8,
3071 // SM => 1_u8,
3072 MemScope::GPU => 2_u8,
3073 MemScope::System => 3_u8,
3074 },
3075 );
3076 e.set_bit(80, false); // .SC
3077 }
3078 }
3079
3080 impl SM70Encoder<'_> {
set_rel_offset(&mut self, range: Range<usize>, label: &Label)3081 fn set_rel_offset(&mut self, range: Range<usize>, label: &Label) {
3082 let ip = u64::try_from(self.ip).unwrap();
3083 let ip = i64::try_from(ip).unwrap();
3084
3085 let target_ip = *self.labels.get(label).unwrap();
3086 let target_ip = u64::try_from(target_ip).unwrap();
3087 let target_ip = i64::try_from(target_ip).unwrap();
3088
3089 let rel_offset = target_ip - ip - 4;
3090
3091 self.set_field(range, rel_offset);
3092 }
3093 }
3094
impl SM70Op for OpBClear {
    fn legalize(&mut self, _b: &mut LegalizeBuilder) {
        // Nothing to do
    }

    /// Emits opcode 0x355 with the .CLEAR bit (84) set.
    fn encode(&self, e: &mut SM70Encoder<'_>) {
        e.set_opcode(0x355);

        // The regular destination slot is encoded as Dst::None; the barrier
        // itself goes in 24..28.
        e.set_dst(Dst::None);
        e.set_bar_dst(24..28, self.dst);

        e.set_bit(84, true); // .CLEAR
    }
}
3109
3110 impl SM70Op for OpBMov {
legalize(&mut self, _b: &mut LegalizeBuilder)3111 fn legalize(&mut self, _b: &mut LegalizeBuilder) {
3112 // Nothing to do
3113 }
3114
encode(&self, e: &mut SM70Encoder<'_>)3115 fn encode(&self, e: &mut SM70Encoder<'_>) {
3116 if dst_is_bar(self.dst) {
3117 e.set_opcode(0x356);
3118
3119 e.set_bar_dst(24..28, self.dst);
3120 e.set_reg_src(32..40, self.src);
3121
3122 e.set_bit(84, self.clear);
3123 } else {
3124 e.set_opcode(0x355);
3125
3126 e.set_dst(self.dst);
3127 e.set_bar_src(24..28, self.src);
3128
3129 e.set_bit(84, self.clear);
3130 }
3131 }
3132 }
3133
impl SM70Op for OpBreak {
    fn legalize(&mut self, _b: &mut LegalizeBuilder) {
        // Nothing to do
    }

    /// Emits opcode 0x942.
    ///
    /// Panics unless `bar_in` and `bar_out` refer to the same register.
    fn encode(&self, e: &mut SM70Encoder<'_>) {
        e.set_opcode(0x942);
        // Only one barrier field (16..20) is encoded, so the input and
        // output barriers must be the same register.
        assert!(self.bar_in.src_ref.as_reg() == self.bar_out.as_reg());
        e.set_bar_dst(16..20, self.bar_out);
        e.set_pred_src(87..90, 90, self.cond);
    }
}
3146
impl SM70Op for OpBSSy {
    fn legalize(&mut self, _b: &mut LegalizeBuilder) {
        // Nothing to do
    }

    /// Emits opcode 0x945 with the relative offset to `target` in 34..64.
    ///
    /// Panics unless `bar_in` and `bar_out` refer to the same register.
    fn encode(&self, e: &mut SM70Encoder<'_>) {
        e.set_opcode(0x945);
        // Only one barrier field (16..20) is encoded, so the input and
        // output barriers must be the same register.
        assert!(self.bar_in.src_ref.as_reg() == self.bar_out.as_reg());
        e.set_bar_dst(16..20, self.bar_out);
        e.set_rel_offset(34..64, &self.target);
        e.set_pred_src(87..90, 90, self.cond);
    }
}
3160
impl SM70Op for OpBSync {
    fn legalize(&mut self, _b: &mut LegalizeBuilder) {
        // Nothing to do
    }

    /// Emits opcode 0x941 with the barrier source in 16..20 and the
    /// condition as the predicate source in 87..90 (bit 90 alongside).
    fn encode(&self, e: &mut SM70Encoder<'_>) {
        e.set_opcode(0x941);
        e.set_bar_src(16..20, self.bar);
        e.set_pred_src(87..90, 90, self.cond);
    }
}
3172
impl SM70Op for OpBra {
    fn legalize(&mut self, _b: &mut LegalizeBuilder) {
        // Nothing to do
    }

    /// Emits opcode 0x947 with the relative offset to `target` in 34..82.
    fn encode(&self, e: &mut SM70Encoder<'_>) {
        e.set_opcode(0x947);
        e.set_rel_offset(34..82, &self.target);
        // The field in 87..90 is hard-coded to 7 for now.
        e.set_field(87..90, 0x7_u8); // TODO: Pred?
    }
}
3184
impl SM70Op for OpExit {
    fn legalize(&mut self, _b: &mut LegalizeBuilder) {
        // Nothing to do
    }

    /// Emits opcode 0x94d with every modifier cleared and the field in
    /// 87..90 hard-coded to 7.
    fn encode(&self, e: &mut SM70Encoder<'_>) {
        e.set_opcode(0x94d);

        // ./.KEEPREFCOUNT/.PREEMPTED/.INVALID3
        e.set_field(84..85, false);
        e.set_field(85..86, false); // .NO_ATEXIT
        e.set_field(87..90, 0x7_u8); // TODO: Predicate
        e.set_field(90..91, false); // NOT
    }
}
3200
impl SM70Op for OpWarpSync {
    fn legalize(&mut self, _b: &mut LegalizeBuilder) {
        // Nothing to do
    }

    /// Emits ALU opcode 0x148 with `mask` as the only operand supplied to
    /// `encode_alu`, and True as the predicate source in 87..90.
    fn encode(&self, e: &mut SM70Encoder<'_>) {
        // `mask` is passed in the third operand position; the destination
        // and the other sources are None.
        e.encode_alu(0x148, None, None, Some(&Src::from(self.mask)), None);
        e.set_pred_src(87..90, 90, SrcRef::True.into());
    }
}
3211
impl SM70Op for OpBar {
    fn legalize(&mut self, _b: &mut LegalizeBuilder) {
        // Nothing to do
    }

    /// Emits opcode 0xb1d with no further fields.
    fn encode(&self, e: &mut SM70Encoder<'_>) {
        e.set_opcode(0xb1d);

        // The remainder is a disabled alternative encoding (opcode 0x31d),
        // kept for the field notes it contains.

        // e.set_opcode(0x31d);

        // // src0 == src1
        // e.set_reg_src(32..40, SrcRef::Zero.into());

        // // 00: RED.POPC
        // // 01: RED.AND
        // // 02: RED.OR
        // e.set_field(74..76, 0_u8);

        // // 00: SYNC
        // // 01: ARV
        // // 02: RED
        // // 03: SCAN
        // e.set_field(77..79, 0_u8);

        // e.set_pred_src(87..90, 90, SrcRef::True.into());
    }
}
3239
3240 impl SM70Op for OpCS2R {
legalize(&mut self, _b: &mut LegalizeBuilder)3241 fn legalize(&mut self, _b: &mut LegalizeBuilder) {
3242 // Nothing to do
3243 }
3244
encode(&self, e: &mut SM70Encoder<'_>)3245 fn encode(&self, e: &mut SM70Encoder<'_>) {
3246 e.set_opcode(0x805);
3247 e.set_dst(self.dst);
3248 e.set_field(72..80, self.idx);
3249 e.set_bit(80, self.dst.as_reg().unwrap().comps() == 2); // .64
3250 }
3251 }
3252
impl SM70Op for OpIsberd {
    fn legalize(&mut self, _b: &mut LegalizeBuilder) {
        // Nothing to do
    }

    /// Emits opcode 0x923 with `idx` as the register source in 24..32.
    fn encode(&self, e: &mut SM70Encoder<'_>) {
        e.set_opcode(0x923);
        e.set_dst(self.dst);
        e.set_reg_src(24..32, self.idx);
    }
}
3264
impl SM70Op for OpKill {
    fn legalize(&mut self, _b: &mut LegalizeBuilder) {
        // Nothing to do
    }

    /// Emits opcode 0x95b with True as the predicate source in 87..90.
    fn encode(&self, e: &mut SM70Encoder<'_>) {
        e.set_opcode(0x95b);
        e.set_pred_src(87..90, 90, SrcRef::True.into());
    }
}
3275
impl SM70Op for OpNop {
    fn legalize(&mut self, _b: &mut LegalizeBuilder) {
        // Nothing to do
    }

    /// Emits opcode 0x918 with no further fields.
    fn encode(&self, e: &mut SM70Encoder<'_>) {
        e.set_opcode(0x918);
    }
}
3285
3286 impl SM70Op for OpPixLd {
legalize(&mut self, _b: &mut LegalizeBuilder)3287 fn legalize(&mut self, _b: &mut LegalizeBuilder) {
3288 // Nothing to do
3289 }
3290
encode(&self, e: &mut SM70Encoder<'_>)3291 fn encode(&self, e: &mut SM70Encoder<'_>) {
3292 e.set_opcode(0x925);
3293 e.set_dst(self.dst);
3294 e.set_field(
3295 78..81,
3296 match &self.val {
3297 PixVal::MsCount => 0_u8,
3298 PixVal::CovMask => 1_u8,
3299 PixVal::CentroidOffset => 2_u8,
3300 PixVal::MyIndex => 3_u8,
3301 PixVal::InnerCoverage => 4_u8,
3302 other => panic!("Unsupported PixVal: {other}"),
3303 },
3304 );
3305 e.set_pred_dst(81..84, Dst::None);
3306 }
3307 }
3308
impl SM70Op for OpS2R {
    fn legalize(&mut self, _b: &mut LegalizeBuilder) {
        // Nothing to do
    }

    /// Emits opcode 0x919 with `idx` in bits 72..80.
    ///
    /// Panics if the op is uniform.
    fn encode(&self, e: &mut SM70Encoder<'_>) {
        assert!(!self.is_uniform());
        // NOTE(review): the assert above means the uniform arm (0x9c3) can
        // never be selected here, and the destination is written with
        // set_dst() rather than set_udst().  Confirm whether uniform S2R is
        // meant to be encodable.
        e.set_opcode(if self.is_uniform() { 0x9c3 } else { 0x919 });
        e.set_dst(self.dst);
        e.set_field(72..80, self.idx);
    }
}
3321
3322 impl SM70Op for OpOut {
legalize(&mut self, b: &mut LegalizeBuilder)3323 fn legalize(&mut self, b: &mut LegalizeBuilder) {
3324 let gpr = op_gpr(self);
3325 b.copy_alu_src_if_not_reg(&mut self.handle, gpr, SrcType::GPR);
3326 b.copy_alu_src_if_not_reg_or_imm(&mut self.stream, gpr, SrcType::ALU);
3327 }
3328
encode(&self, e: &mut SM70Encoder<'_>)3329 fn encode(&self, e: &mut SM70Encoder<'_>) {
3330 e.encode_alu(
3331 0x124,
3332 Some(&self.dst),
3333 Some(&self.handle),
3334 Some(&self.stream),
3335 None,
3336 );
3337
3338 e.set_field(
3339 78..80,
3340 match self.out_type {
3341 OutType::Emit => 1_u8,
3342 OutType::Cut => 2_u8,
3343 OutType::EmitThenCut => 3_u8,
3344 },
3345 );
3346 }
3347 }
3348
3349 impl SM70Op for OpOutFinal {
legalize(&mut self, b: &mut LegalizeBuilder)3350 fn legalize(&mut self, b: &mut LegalizeBuilder) {
3351 let gpr = op_gpr(self);
3352 b.copy_alu_src_if_not_reg(&mut self.handle, gpr, SrcType::GPR);
3353 }
3354
encode(&self, e: &mut SM70Encoder<'_>)3355 fn encode(&self, e: &mut SM70Encoder<'_>) {
3356 e.encode_alu(
3357 0x124,
3358 Some(&Dst::None),
3359 Some(&self.handle),
3360 Some(&Src::new_zero()),
3361 None,
3362 );
3363 }
3364 }
3365
3366 impl SM70Op for OpVote {
legalize(&mut self, b: &mut LegalizeBuilder)3367 fn legalize(&mut self, b: &mut LegalizeBuilder) {
3368 b.copy_src_if_upred(&mut self.pred);
3369 }
3370
encode(&self, e: &mut SM70Encoder<'_>)3371 fn encode(&self, e: &mut SM70Encoder<'_>) {
3372 if self.is_uniform() {
3373 e.set_opcode(0x886);
3374 e.set_udst(self.ballot);
3375 } else {
3376 e.set_opcode(0x806);
3377 e.set_dst(self.ballot);
3378 }
3379
3380 e.set_field(
3381 72..74,
3382 match self.op {
3383 VoteOp::All => 0_u8,
3384 VoteOp::Any => 1_u8,
3385 VoteOp::Eq => 2_u8,
3386 },
3387 );
3388
3389 e.set_pred_dst(81..84, self.vote);
3390 e.set_pred_src(87..90, 90, self.pred);
3391 }
3392 }
3393
// Expands to a `match` on `$op` that yields the payload of every `Op`
// variant listed below, and panics with "Unsupported op: ..." for any other
// variant.  The arms have different payload types, so the invocation must
// sit where the result is coerced to a common type (as in as_sm70_op() and
// as_sm70_op_mut()).
macro_rules! as_sm70_op_match {
    // Internal rule: one arm per listed variant plus the panicking fallback.
    (@arms $op: expr; $($variant: ident),+ $(,)?) => {
        match $op {
            $(Op::$variant(op) => op,)+
            _ => panic!("Unsupported op: {}", $op),
        }
    };
    // Entry point: supplies the list of supported variants.
    ($op: expr) => {
        as_sm70_op_match!(
            @arms $op;
            FAdd,
            FFma,
            FMnMx,
            FMul,
            FSet,
            FSetP,
            FSwzAdd,
            DAdd,
            DFma,
            DMul,
            DSetP,
            HAdd2,
            HFma2,
            HMul2,
            HSet2,
            HSetP2,
            HMnMx2,
            MuFu,
            BMsk,
            BRev,
            Flo,
            IAbs,
            IAdd3,
            IAdd3X,
            IDp4,
            IMad,
            IMad64,
            IMnMx,
            ISetP,
            Lop3,
            PopC,
            Shf,
            F2F,
            F2FP,
            F2I,
            I2F,
            FRnd,
            Mov,
            Prmt,
            Sel,
            Shfl,
            PLop3,
            R2UR,
            Tex,
            Tld,
            Tld4,
            Tmml,
            Txd,
            Txq,
            SuLd,
            SuSt,
            SuAtom,
            Ld,
            Ldc,
            St,
            Atom,
            AL2P,
            ALd,
            ASt,
            Ipa,
            LdTram,
            CCtl,
            MemBar,
            BClear,
            BMov,
            Break,
            BSSy,
            BSync,
            Bra,
            Exit,
            WarpSync,
            Bar,
            CS2R,
            Isberd,
            Kill,
            Nop,
            PixLd,
            S2R,
            Out,
            OutFinal,
            Vote,
        )
    };
}
3482
/// Returns the payload of `op` as a shared `SM70Op` trait object.
///
/// # Panics
///
/// Panics with "Unsupported op: ..." if `op` is a variant that
/// `as_sm70_op_match!` does not list.
fn as_sm70_op(op: &Op) -> &dyn SM70Op {
    as_sm70_op_match!(op)
}
3486
/// Returns the payload of `op` as a mutable `SM70Op` trait object.
///
/// # Panics
///
/// Panics with "Unsupported op: ..." if `op` is a variant that
/// `as_sm70_op_match!` does not list.
fn as_sm70_op_mut(op: &mut Op) -> &mut dyn SM70Op {
    as_sm70_op_match!(op)
}
3490
encode_sm70_shader(sm: &ShaderModel70, s: &Shader<'_>) -> Vec<u32>3491 fn encode_sm70_shader(sm: &ShaderModel70, s: &Shader<'_>) -> Vec<u32> {
3492 assert!(s.functions.len() == 1);
3493 let func = &s.functions[0];
3494
3495 let mut ip = 0_usize;
3496 let mut labels = HashMap::new();
3497 for b in &func.blocks {
3498 labels.insert(b.label, ip);
3499 for instr in &b.instrs {
3500 if let Op::Nop(op) = &instr.op {
3501 if let Some(label) = op.label {
3502 labels.insert(label, ip);
3503 }
3504 }
3505 ip += 4;
3506 }
3507 }
3508
3509 let mut encoded = Vec::new();
3510 for b in &func.blocks {
3511 for instr in &b.instrs {
3512 let mut e = SM70Encoder {
3513 sm,
3514 ip: encoded.len(),
3515 labels: &labels,
3516 inst: [0_u32; 4],
3517 };
3518 as_sm70_op(&instr.op).encode(&mut e);
3519 e.set_pred(&instr.pred);
3520 e.set_instr_deps(&instr.deps);
3521 encoded.extend_from_slice(&e.inst[..]);
3522 }
3523 }
3524 encoded
3525 }
3526