; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-- -mcpu=tahiti -verify-machineinstrs | FileCheck %s -check-prefixes=GCN,SI
; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-- -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s -check-prefixes=GCN,VI

define amdgpu_kernel void @select_f16(
; SI-LABEL: select_f16:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0x11
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_mov_b32 s18, s2
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s16, s6
; SI-NEXT:    s_mov_b32 s17, s7
; SI-NEXT:    s_mov_b32 s19, s3
; SI-NEXT:    s_mov_b32 s20, s8
; SI-NEXT:    s_mov_b32 s21, s9
; SI-NEXT:    s_mov_b32 s8, s10
; SI-NEXT:    s_mov_b32 s9, s11
; SI-NEXT:    s_mov_b32 s22, s2
; SI-NEXT:    s_mov_b32 s23, s3
; SI-NEXT:    s_mov_b32 s10, s2
; SI-NEXT:    s_mov_b32 s11, s3
; SI-NEXT:    s_mov_b32 s14, s2
; SI-NEXT:    s_mov_b32 s15, s3
; SI-NEXT:    buffer_load_ushort v0, off, s[16:19], 0
; SI-NEXT:    buffer_load_ushort v1, off, s[20:23], 0
; SI-NEXT:    buffer_load_ushort v2, off, s[8:11], 0
; SI-NEXT:    buffer_load_ushort v3, off, s[12:15], 0
; SI-NEXT:    s_mov_b32 s0, s4
; SI-NEXT:    s_mov_b32 s1, s5
; SI-NEXT:    s_waitcnt vmcnt(3)
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    s_waitcnt vmcnt(2)
; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-NEXT:    s_waitcnt vmcnt(1)
; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
; SI-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
; SI-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: select_f16:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0x44
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_mov_b32 s18, s2
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s16, s6
; VI-NEXT:    s_mov_b32 s17, s7
; VI-NEXT:    s_mov_b32 s19, s3
; VI-NEXT:    s_mov_b32 s20, s8
; VI-NEXT:    s_mov_b32 s21, s9
; VI-NEXT:    s_mov_b32 s8, s10
; VI-NEXT:    s_mov_b32 s9, s11
; VI-NEXT:    s_mov_b32 s22, s2
; VI-NEXT:    s_mov_b32 s23, s3
; VI-NEXT:    s_mov_b32 s10, s2
; VI-NEXT:    s_mov_b32 s11, s3
; VI-NEXT:    s_mov_b32 s14, s2
; VI-NEXT:    s_mov_b32 s15, s3
; VI-NEXT:    buffer_load_ushort v0, off, s[16:19], 0
; VI-NEXT:    buffer_load_ushort v1, off, s[20:23], 0
; VI-NEXT:    buffer_load_ushort v2, off, s[8:11], 0
; VI-NEXT:    buffer_load_ushort v3, off, s[12:15], 0
; VI-NEXT:    s_mov_b32 s0, s4
; VI-NEXT:    s_mov_b32 s1, s5
; VI-NEXT:    s_waitcnt vmcnt(2)
; VI-NEXT:    v_cmp_lt_f16_e32 vcc, v0, v1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
; VI-NEXT:    s_endpgm
    half addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %b,
    half addrspace(1)* %c,
    half addrspace(1)* %d) {
entry:
  %a.val = load volatile half, half addrspace(1)* %a
  %b.val = load volatile half, half addrspace(1)* %b
  %c.val = load volatile half, half addrspace(1)* %c
  %d.val = load volatile half, half addrspace(1)* %d
  %fcmp = fcmp olt half %a.val, %b.val
  %r.val = select i1 %fcmp, half %c.val, half %d.val
  store half %r.val, half addrspace(1)* %r
  ret void
}

define amdgpu_kernel void @select_f16_imm_a(
; SI-LABEL: select_f16_imm_a:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s11, 0xf000
; SI-NEXT:    s_mov_b32 s10, -1
; SI-NEXT:    s_mov_b32 s14, s10
; SI-NEXT:    s_mov_b32 s15, s11
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s12, s2
; SI-NEXT:    s_mov_b32 s13, s3
; SI-NEXT:    s_mov_b32 s16, s4
; SI-NEXT:    s_mov_b32 s17, s5
; SI-NEXT:    s_mov_b32 s4, s6
; SI-NEXT:    s_mov_b32 s5, s7
; SI-NEXT:    s_mov_b32 s18, s10
; SI-NEXT:    s_mov_b32 s19, s11
; SI-NEXT:    s_mov_b32 s6, s10
; SI-NEXT:    s_mov_b32 s7, s11
; SI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0
; SI-NEXT:    buffer_load_ushort v1, off, s[16:19], 0
; SI-NEXT:    buffer_load_ushort v2, off, s[4:7], 0
; SI-NEXT:    s_mov_b32 s8, s0
; SI-NEXT:    s_mov_b32 s9, s1
; SI-NEXT:    s_waitcnt vmcnt(2)
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    s_waitcnt vmcnt(1)
; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
; SI-NEXT:    v_cmp_lt_f32_e32 vcc, 0.5, v0
; SI-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    buffer_store_short v0, off, s[8:11], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: select_f16_imm_a:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s11, 0xf000
; VI-NEXT:    s_mov_b32 s10, -1
; VI-NEXT:    s_mov_b32 s14, s10
; VI-NEXT:    s_mov_b32 s15, s11
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s12, s2
; VI-NEXT:    s_mov_b32 s13, s3
; VI-NEXT:    s_mov_b32 s16, s4
; VI-NEXT:    s_mov_b32 s17, s5
; VI-NEXT:    s_mov_b32 s4, s6
; VI-NEXT:    s_mov_b32 s5, s7
; VI-NEXT:    s_mov_b32 s18, s10
; VI-NEXT:    s_mov_b32 s19, s11
; VI-NEXT:    s_mov_b32 s6, s10
; VI-NEXT:    s_mov_b32 s7, s11
; VI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0
; VI-NEXT:    buffer_load_ushort v1, off, s[16:19], 0
; VI-NEXT:    buffer_load_ushort v2, off, s[4:7], 0
; VI-NEXT:    s_mov_b32 s8, s0
; VI-NEXT:    s_mov_b32 s9, s1
; VI-NEXT:    s_waitcnt vmcnt(2)
; VI-NEXT:    v_cmp_lt_f16_e32 vcc, 0.5, v0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
; VI-NEXT:    buffer_store_short v0, off, s[8:11], 0
; VI-NEXT:    s_endpgm
    half addrspace(1)* %r,
    half addrspace(1)* %b,
    half addrspace(1)* %c,
    half addrspace(1)* %d) {
entry:
  %b.val = load volatile half, half addrspace(1)* %b
  %c.val = load volatile half, half addrspace(1)* %c
  %d.val = load volatile half, half addrspace(1)* %d
  %fcmp = fcmp olt half 0xH3800, %b.val
  %r.val = select i1 %fcmp, half %c.val, half %d.val
  store half %r.val, half addrspace(1)* %r
  ret void
}

define amdgpu_kernel void @select_f16_imm_b(
; SI-LABEL: select_f16_imm_b:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s11, 0xf000
; SI-NEXT:    s_mov_b32 s10, -1
; SI-NEXT:    s_mov_b32 s14, s10
; SI-NEXT:    s_mov_b32 s15, s11
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s12, s2
; SI-NEXT:    s_mov_b32 s13, s3
; SI-NEXT:    s_mov_b32 s16, s4
; SI-NEXT:    s_mov_b32 s17, s5
; SI-NEXT:    s_mov_b32 s4, s6
; SI-NEXT:    s_mov_b32 s5, s7
; SI-NEXT:    s_mov_b32 s18, s10
; SI-NEXT:    s_mov_b32 s19, s11
; SI-NEXT:    s_mov_b32 s6, s10
; SI-NEXT:    s_mov_b32 s7, s11
; SI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0
; SI-NEXT:    buffer_load_ushort v1, off, s[16:19], 0
; SI-NEXT:    buffer_load_ushort v2, off, s[4:7], 0
; SI-NEXT:    s_mov_b32 s8, s0
; SI-NEXT:    s_mov_b32 s9, s1
; SI-NEXT:    s_waitcnt vmcnt(2)
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    s_waitcnt vmcnt(1)
; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
; SI-NEXT:    v_cmp_gt_f32_e32 vcc, 0.5, v0
; SI-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    buffer_store_short v0, off, s[8:11], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: select_f16_imm_b:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s11, 0xf000
; VI-NEXT:    s_mov_b32 s10, -1
; VI-NEXT:    s_mov_b32 s14, s10
; VI-NEXT:    s_mov_b32 s15, s11
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s12, s2
; VI-NEXT:    s_mov_b32 s13, s3
; VI-NEXT:    s_mov_b32 s16, s4
; VI-NEXT:    s_mov_b32 s17, s5
; VI-NEXT:    s_mov_b32 s4, s6
; VI-NEXT:    s_mov_b32 s5, s7
; VI-NEXT:    s_mov_b32 s18, s10
; VI-NEXT:    s_mov_b32 s19, s11
; VI-NEXT:    s_mov_b32 s6, s10
; VI-NEXT:    s_mov_b32 s7, s11
; VI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0
; VI-NEXT:    buffer_load_ushort v1, off, s[16:19], 0
; VI-NEXT:    buffer_load_ushort v2, off, s[4:7], 0
; VI-NEXT:    s_mov_b32 s8, s0
; VI-NEXT:    s_mov_b32 s9, s1
; VI-NEXT:    s_waitcnt vmcnt(2)
; VI-NEXT:    v_cmp_gt_f16_e32 vcc, 0.5, v0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
; VI-NEXT:    buffer_store_short v0, off, s[8:11], 0
; VI-NEXT:    s_endpgm
    half addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %c,
    half addrspace(1)* %d) {
entry:
  %a.val = load volatile half, half addrspace(1)* %a
  %c.val = load volatile half, half addrspace(1)* %c
  %d.val = load volatile half, half addrspace(1)* %d
  %fcmp = fcmp olt half %a.val, 0xH3800
  %r.val = select i1 %fcmp, half %c.val, half %d.val
  store half %r.val, half addrspace(1)* %r
  ret void
}

define amdgpu_kernel void @select_f16_imm_c(
; SI-LABEL: select_f16_imm_c:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s11, 0xf000
; SI-NEXT:    s_mov_b32 s10, -1
; SI-NEXT:    s_mov_b32 s14, s10
; SI-NEXT:    s_mov_b32 s15, s11
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s12, s2
; SI-NEXT:    s_mov_b32 s13, s3
; SI-NEXT:    s_mov_b32 s16, s4
; SI-NEXT:    s_mov_b32 s17, s5
; SI-NEXT:    s_mov_b32 s4, s6
; SI-NEXT:    s_mov_b32 s5, s7
; SI-NEXT:    s_mov_b32 s18, s10
; SI-NEXT:    s_mov_b32 s19, s11
; SI-NEXT:    s_mov_b32 s6, s10
; SI-NEXT:    s_mov_b32 s7, s11
; SI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0
; SI-NEXT:    buffer_load_ushort v1, off, s[16:19], 0
; SI-NEXT:    buffer_load_ushort v2, off, s[4:7], 0
; SI-NEXT:    s_mov_b32 s8, s0
; SI-NEXT:    s_mov_b32 s9, s1
; SI-NEXT:    s_waitcnt vmcnt(2)
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    s_waitcnt vmcnt(1)
; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
; SI-NEXT:    v_cmp_nlt_f32_e32 vcc, v0, v1
; SI-NEXT:    v_cndmask_b32_e32 v0, 0.5, v2, vcc
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    buffer_store_short v0, off, s[8:11], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: select_f16_imm_c:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s11, 0xf000
; VI-NEXT:    s_mov_b32 s10, -1
; VI-NEXT:    s_mov_b32 s14, s10
; VI-NEXT:    s_mov_b32 s15, s11
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s12, s2
; VI-NEXT:    s_mov_b32 s13, s3
; VI-NEXT:    s_mov_b32 s16, s4
; VI-NEXT:    s_mov_b32 s17, s5
; VI-NEXT:    s_mov_b32 s4, s6
; VI-NEXT:    s_mov_b32 s5, s7
; VI-NEXT:    s_mov_b32 s18, s10
; VI-NEXT:    s_mov_b32 s19, s11
; VI-NEXT:    s_mov_b32 s6, s10
; VI-NEXT:    s_mov_b32 s7, s11
; VI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0
; VI-NEXT:    buffer_load_ushort v1, off, s[16:19], 0
; VI-NEXT:    buffer_load_ushort v2, off, s[4:7], 0
; VI-NEXT:    v_mov_b32_e32 v3, 0x3800
; VI-NEXT:    s_mov_b32 s8, s0
; VI-NEXT:    s_mov_b32 s9, s1
; VI-NEXT:    s_waitcnt vmcnt(1)
; VI-NEXT:    v_cmp_nlt_f16_e32 vcc, v0, v1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
; VI-NEXT:    buffer_store_short v0, off, s[8:11], 0
; VI-NEXT:    s_endpgm
    half addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %b,
    half addrspace(1)* %d) {
entry:
  %a.val = load volatile half, half addrspace(1)* %a
  %b.val = load volatile half, half addrspace(1)* %b
  %d.val = load volatile half, half addrspace(1)* %d
  %fcmp = fcmp olt half %a.val, %b.val
  %r.val = select i1 %fcmp, half 0xH3800, half %d.val
  store half %r.val, half addrspace(1)* %r
  ret void
}

define amdgpu_kernel void @select_f16_imm_d(
; SI-LABEL: select_f16_imm_d:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s11, 0xf000
; SI-NEXT:    s_mov_b32 s10, -1
; SI-NEXT:    s_mov_b32 s14, s10
; SI-NEXT:    s_mov_b32 s15, s11
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s12, s2
; SI-NEXT:    s_mov_b32 s13, s3
; SI-NEXT:    s_mov_b32 s16, s4
; SI-NEXT:    s_mov_b32 s17, s5
; SI-NEXT:    s_mov_b32 s4, s6
; SI-NEXT:    s_mov_b32 s5, s7
; SI-NEXT:    s_mov_b32 s18, s10
; SI-NEXT:    s_mov_b32 s19, s11
; SI-NEXT:    s_mov_b32 s6, s10
; SI-NEXT:    s_mov_b32 s7, s11
; SI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0
; SI-NEXT:    buffer_load_ushort v1, off, s[16:19], 0
; SI-NEXT:    buffer_load_ushort v2, off, s[4:7], 0
; SI-NEXT:    s_mov_b32 s8, s0
; SI-NEXT:    s_mov_b32 s9, s1
; SI-NEXT:    s_waitcnt vmcnt(2)
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    s_waitcnt vmcnt(1)
; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
; SI-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
; SI-NEXT:    v_cndmask_b32_e32 v0, 0.5, v2, vcc
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    buffer_store_short v0, off, s[8:11], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: select_f16_imm_d:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s11, 0xf000
; VI-NEXT:    s_mov_b32 s10, -1
; VI-NEXT:    s_mov_b32 s14, s10
; VI-NEXT:    s_mov_b32 s15, s11
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s12, s2
; VI-NEXT:    s_mov_b32 s13, s3
; VI-NEXT:    s_mov_b32 s16, s4
; VI-NEXT:    s_mov_b32 s17, s5
; VI-NEXT:    s_mov_b32 s4, s6
; VI-NEXT:    s_mov_b32 s5, s7
; VI-NEXT:    s_mov_b32 s18, s10
; VI-NEXT:    s_mov_b32 s19, s11
; VI-NEXT:    s_mov_b32 s6, s10
; VI-NEXT:    s_mov_b32 s7, s11
; VI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0
; VI-NEXT:    buffer_load_ushort v1, off, s[16:19], 0
; VI-NEXT:    buffer_load_ushort v2, off, s[4:7], 0
; VI-NEXT:    v_mov_b32_e32 v3, 0x3800
; VI-NEXT:    s_mov_b32 s8, s0
; VI-NEXT:    s_mov_b32 s9, s1
; VI-NEXT:    s_waitcnt vmcnt(1)
; VI-NEXT:    v_cmp_lt_f16_e32 vcc, v0, v1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
; VI-NEXT:    buffer_store_short v0, off, s[8:11], 0
; VI-NEXT:    s_endpgm
    half addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %b,
    half addrspace(1)* %c) {
entry:
  %a.val = load volatile half, half addrspace(1)* %a
  %b.val = load volatile half, half addrspace(1)* %b
  %c.val = load volatile half, half addrspace(1)* %c
  %fcmp = fcmp olt half %a.val, %b.val
  %r.val = select i1 %fcmp, half %c.val, half 0xH3800
  store half %r.val, half addrspace(1)* %r
  ret void
}

define amdgpu_kernel void @select_v2f16(
; SI-LABEL: select_v2f16:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0x11
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_mov_b32 s18, s2
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s16, s6
; SI-NEXT:    s_mov_b32 s17, s7
; SI-NEXT:    s_mov_b32 s19, s3
; SI-NEXT:    s_mov_b32 s20, s8
; SI-NEXT:    s_mov_b32 s21, s9
; SI-NEXT:    s_mov_b32 s8, s10
; SI-NEXT:    s_mov_b32 s9, s11
; SI-NEXT:    s_mov_b32 s22, s2
; SI-NEXT:    s_mov_b32 s23, s3
; SI-NEXT:    s_mov_b32 s10, s2
; SI-NEXT:    s_mov_b32 s11, s3
; SI-NEXT:    s_mov_b32 s14, s2
; SI-NEXT:    s_mov_b32 s15, s3
; SI-NEXT:    buffer_load_dword v0, off, s[16:19], 0
; SI-NEXT:    buffer_load_dword v1, off, s[20:23], 0
; SI-NEXT:    buffer_load_dword v2, off, s[12:15], 0
; SI-NEXT:    buffer_load_dword v3, off, s[8:11], 0
; SI-NEXT:    s_mov_b32 s0, s4
; SI-NEXT:    s_mov_b32 s1, s5
; SI-NEXT:    s_waitcnt vmcnt(3)
; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
; SI-NEXT:    s_waitcnt vmcnt(2)
; SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
; SI-NEXT:    s_waitcnt vmcnt(1)
; SI-NEXT:    v_cvt_f32_f16_e32 v4, v2
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
; SI-NEXT:    v_cvt_f32_f16_e32 v5, v5
; SI-NEXT:    v_cvt_f32_f16_e32 v6, v6
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
; SI-NEXT:    v_cvt_f32_f16_e32 v7, v7
; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
; SI-NEXT:    v_cmp_lt_f32_e32 vcc, v5, v6
; SI-NEXT:    v_cndmask_b32_e32 v2, v2, v7, vcc
; SI-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
; SI-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
; SI-NEXT:    v_or_b32_e32 v0, v0, v1
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: select_v2f16:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0x44
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_mov_b32 s18, s2
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s16, s6
; VI-NEXT:    s_mov_b32 s17, s7
; VI-NEXT:    s_mov_b32 s19, s3
; VI-NEXT:    s_mov_b32 s20, s8
; VI-NEXT:    s_mov_b32 s21, s9
; VI-NEXT:    s_mov_b32 s8, s10
; VI-NEXT:    s_mov_b32 s9, s11
; VI-NEXT:    s_mov_b32 s22, s2
; VI-NEXT:    s_mov_b32 s23, s3
; VI-NEXT:    s_mov_b32 s10, s2
; VI-NEXT:    s_mov_b32 s11, s3
; VI-NEXT:    s_mov_b32 s14, s2
; VI-NEXT:    s_mov_b32 s15, s3
; VI-NEXT:    buffer_load_dword v0, off, s[16:19], 0
; VI-NEXT:    buffer_load_dword v1, off, s[20:23], 0
; VI-NEXT:    buffer_load_dword v2, off, s[12:15], 0
; VI-NEXT:    buffer_load_dword v3, off, s[8:11], 0
; VI-NEXT:    s_mov_b32 s0, s4
; VI-NEXT:    s_mov_b32 s1, s5
; VI-NEXT:    s_waitcnt vmcnt(3)
; VI-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
; VI-NEXT:    s_waitcnt vmcnt(2)
; VI-NEXT:    v_cmp_lt_f16_e32 vcc, v0, v1
; VI-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
; VI-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
; VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v3
; VI-NEXT:    v_cmp_lt_f16_e32 vcc, v6, v5
; VI-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT:    s_endpgm
    <2 x half> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
    <2 x half> addrspace(1)* %b,
    <2 x half> addrspace(1)* %c,
    <2 x half> addrspace(1)* %d) {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %c.val = load <2 x half>, <2 x half> addrspace(1)* %c
  %d.val = load <2 x half>, <2 x half> addrspace(1)* %d
  %fcmp = fcmp olt <2 x half> %a.val, %b.val
  %r.val = select <2 x i1> %fcmp, <2 x half> %c.val, <2 x half> %d.val
  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}

define amdgpu_kernel void @select_v2f16_imm_a(
; SI-LABEL: select_v2f16_imm_a:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s11, 0xf000
; SI-NEXT:    s_mov_b32 s10, -1
; SI-NEXT:    s_mov_b32 s14, s10
; SI-NEXT:    s_mov_b32 s15, s11
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s12, s2
; SI-NEXT:    s_mov_b32 s13, s3
; SI-NEXT:    s_mov_b32 s16, s4
; SI-NEXT:    s_mov_b32 s17, s5
; SI-NEXT:    s_mov_b32 s4, s6
; SI-NEXT:    s_mov_b32 s5, s7
; SI-NEXT:    s_mov_b32 s18, s10
; SI-NEXT:    s_mov_b32 s19, s11
; SI-NEXT:    s_mov_b32 s6, s10
; SI-NEXT:    s_mov_b32 s7, s11
; SI-NEXT:    buffer_load_dword v0, off, s[12:15], 0
; SI-NEXT:    buffer_load_dword v1, off, s[16:19], 0
; SI-NEXT:    buffer_load_dword v2, off, s[4:7], 0
; SI-NEXT:    s_mov_b32 s2, 0x3f200000
; SI-NEXT:    s_mov_b32 s8, s0
; SI-NEXT:    s_mov_b32 s9, s1
; SI-NEXT:    s_waitcnt vmcnt(2)
; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
; SI-NEXT:    s_waitcnt vmcnt(1)
; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    v_cvt_f32_f16_e32 v4, v4
; SI-NEXT:    v_cvt_f32_f16_e32 v5, v5
; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
; SI-NEXT:    v_cmp_lt_f32_e32 vcc, s2, v3
; SI-NEXT:    v_cndmask_b32_e32 v3, v5, v4, vcc
; SI-NEXT:    v_cmp_lt_f32_e32 vcc, 0.5, v0
; SI-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
; SI-NEXT:    v_or_b32_e32 v0, v0, v1
; SI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: select_v2f16_imm_a:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s11, 0xf000
; VI-NEXT:    s_mov_b32 s10, -1
; VI-NEXT:    s_mov_b32 s14, s10
; VI-NEXT:    s_mov_b32 s15, s11
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s12, s2
; VI-NEXT:    s_mov_b32 s13, s3
; VI-NEXT:    s_mov_b32 s16, s4
; VI-NEXT:    s_mov_b32 s17, s5
; VI-NEXT:    s_mov_b32 s4, s6
; VI-NEXT:    s_mov_b32 s5, s7
; VI-NEXT:    s_mov_b32 s18, s10
; VI-NEXT:    s_mov_b32 s19, s11
; VI-NEXT:    s_mov_b32 s6, s10
; VI-NEXT:    s_mov_b32 s7, s11
; VI-NEXT:    buffer_load_dword v0, off, s[12:15], 0
; VI-NEXT:    buffer_load_dword v1, off, s[16:19], 0
; VI-NEXT:    buffer_load_dword v2, off, s[4:7], 0
; VI-NEXT:    s_movk_i32 s2, 0x3900
; VI-NEXT:    s_mov_b32 s8, s0
; VI-NEXT:    s_mov_b32 s9, s1
; VI-NEXT:    s_waitcnt vmcnt(2)
; VI-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
; VI-NEXT:    v_cmp_lt_f16_e32 vcc, 0.5, v0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
; VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
; VI-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
; VI-NEXT:    v_cmp_lt_f16_e32 vcc, s2, v3
; VI-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
; VI-NEXT:    s_endpgm
    <2 x half> addrspace(1)* %r,
    <2 x half> addrspace(1)* %b,
    <2 x half> addrspace(1)* %c,
    <2 x half> addrspace(1)* %d) {
entry:
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %c.val = load <2 x half>, <2 x half> addrspace(1)* %c
  %d.val = load <2 x half>, <2 x half> addrspace(1)* %d
  %fcmp = fcmp olt <2 x half> <half 0xH3800, half 0xH3900>, %b.val
  %r.val = select <2 x i1> %fcmp, <2 x half> %c.val, <2 x half> %d.val
  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}

define amdgpu_kernel void @select_v2f16_imm_b(
; SI-LABEL: select_v2f16_imm_b:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s11, 0xf000
; SI-NEXT:    s_mov_b32 s10, -1
; SI-NEXT:    s_mov_b32 s14, s10
; SI-NEXT:    s_mov_b32 s15, s11
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s12, s2
; SI-NEXT:    s_mov_b32 s13, s3
; SI-NEXT:    s_mov_b32 s16, s4
; SI-NEXT:    s_mov_b32 s17, s5
; SI-NEXT:    s_mov_b32 s4, s6
; SI-NEXT:    s_mov_b32 s5, s7
; SI-NEXT:    s_mov_b32 s18, s10
; SI-NEXT:    s_mov_b32 s19, s11
; SI-NEXT:    s_mov_b32 s6, s10
; SI-NEXT:    s_mov_b32 s7, s11
; SI-NEXT:    buffer_load_dword v0, off, s[12:15], 0
; SI-NEXT:    buffer_load_dword v1, off, s[16:19], 0
; SI-NEXT:    buffer_load_dword v2, off, s[4:7], 0
; SI-NEXT:    s_mov_b32 s2, 0x3f200000
; SI-NEXT:    s_mov_b32 s8, s0
; SI-NEXT:    s_mov_b32 s9, s1
; SI-NEXT:    s_waitcnt vmcnt(2)
; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
; SI-NEXT:    s_waitcnt vmcnt(1)
; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    v_cvt_f32_f16_e32 v4, v4
; SI-NEXT:    v_cvt_f32_f16_e32 v5, v5
; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
; SI-NEXT:    v_cmp_gt_f32_e32 vcc, s2, v3
; SI-NEXT:    v_cndmask_b32_e32 v3, v5, v4, vcc
; SI-NEXT:    v_cmp_gt_f32_e32 vcc, 0.5, v0
; SI-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
; SI-NEXT:    v_or_b32_e32 v0, v0, v1
; SI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: select_v2f16_imm_b:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s11, 0xf000
; VI-NEXT:    s_mov_b32 s10, -1
; VI-NEXT:    s_mov_b32 s14, s10
; VI-NEXT:    s_mov_b32 s15, s11
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s12, s2
; VI-NEXT:    s_mov_b32 s13, s3
; VI-NEXT:    s_mov_b32 s16, s4
; VI-NEXT:    s_mov_b32 s17, s5
; VI-NEXT:    s_mov_b32 s4, s6
; VI-NEXT:    s_mov_b32 s5, s7
; VI-NEXT:    s_mov_b32 s18, s10
; VI-NEXT:    s_mov_b32 s19, s11
; VI-NEXT:    s_mov_b32 s6, s10
; VI-NEXT:    s_mov_b32 s7, s11
; VI-NEXT:    buffer_load_dword v0, off, s[12:15], 0
; VI-NEXT:    buffer_load_dword v1, off, s[16:19], 0
; VI-NEXT:    buffer_load_dword v2, off, s[4:7], 0
; VI-NEXT:    s_movk_i32 s2, 0x3900
; VI-NEXT:    s_mov_b32 s8, s0
; VI-NEXT:    s_mov_b32 s9, s1
; VI-NEXT:    s_waitcnt vmcnt(2)
; VI-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
; VI-NEXT:    v_cmp_gt_f16_e32 vcc, 0.5, v0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
; VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
; VI-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
; VI-NEXT:    v_cmp_gt_f16_e32 vcc, s2, v3
; VI-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
; VI-NEXT:    s_endpgm
    <2 x half> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
    <2 x half> addrspace(1)* %c,
    <2 x half> addrspace(1)* %d) {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %c.val = load <2 x half>, <2 x half> addrspace(1)* %c
  %d.val = load <2 x half>, <2 x half> addrspace(1)* %d
  %fcmp = fcmp olt <2 x half> %a.val, <half 0xH3800, half 0xH3900>
  %r.val = select <2 x i1> %fcmp, <2 x half> %c.val, <2 x half> %d.val
  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}

define amdgpu_kernel void @select_v2f16_imm_c(
; SI-LABEL: select_v2f16_imm_c:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s11, 0xf000
; SI-NEXT:    s_mov_b32 s10, -1
; SI-NEXT:    s_mov_b32 s14, s10
; SI-NEXT:    s_mov_b32 s15, s11
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s12, s2
; SI-NEXT:    s_mov_b32 s13, s3
; SI-NEXT:    s_mov_b32 s16, s4
; SI-NEXT:    s_mov_b32 s17, s5
; SI-NEXT:    s_mov_b32 s4, s6
; SI-NEXT:    s_mov_b32 s5, s7
; SI-NEXT:    s_mov_b32 s18, s10
; SI-NEXT:    s_mov_b32 s19, s11
; SI-NEXT:    s_mov_b32 s6, s10
; SI-NEXT:    s_mov_b32 s7, s11
; SI-NEXT:    buffer_load_dword v0, off, s[12:15], 0
; SI-NEXT:    buffer_load_dword v1, off, s[16:19], 0
; SI-NEXT:    buffer_load_dword v2, off, s[4:7], 0
; SI-NEXT:    v_mov_b32_e32 v3, 0x3f200000
; SI-NEXT:    s_mov_b32 s8, s0
; SI-NEXT:    s_mov_b32 s9, s1
; SI-NEXT:    s_waitcnt vmcnt(2)
; SI-NEXT:    v_cvt_f32_f16_e32 v4, v0
; SI-NEXT:    s_waitcnt vmcnt(1)
; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    v_cvt_f32_f16_e32 v5, v5
; SI-NEXT:    v_cvt_f32_f16_e32 v6, v6
; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
; SI-NEXT:    v_cmp_nlt_f32_e32 vcc, v0, v5
; SI-NEXT:    v_cndmask_b32_e32 v0, v3, v6, vcc
; SI-NEXT:    v_cmp_nlt_f32_e32 vcc, v4, v1
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    v_cndmask_b32_e32 v1, 0.5, v2, vcc
; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
; SI-NEXT:    v_or_b32_e32 v0, v1, v0
; SI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: select_v2f16_imm_c:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s11, 0xf000
; VI-NEXT:    s_mov_b32 s10, -1
; VI-NEXT:    s_mov_b32 s14, s10
; VI-NEXT:    s_mov_b32 s15, s11
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s12, s2
; VI-NEXT:    s_mov_b32 s13, s3
; VI-NEXT:    s_mov_b32 s16, s4
; VI-NEXT:    s_mov_b32 s17, s5
; VI-NEXT:    s_mov_b32 s4, s6
; VI-NEXT:    s_mov_b32 s5, s7
; VI-NEXT:    s_mov_b32 s18, s10
; VI-NEXT:    s_mov_b32 s19, s11
; VI-NEXT:    s_mov_b32 s6, s10
; VI-NEXT:    s_mov_b32 s7, s11
; VI-NEXT:    buffer_load_dword v0, off, s[12:15], 0
; VI-NEXT:    buffer_load_dword v1, off, s[16:19], 0
; VI-NEXT:    buffer_load_dword v2, off, s[4:7], 0
; VI-NEXT:    v_mov_b32_e32 v3, 0x3800
; VI-NEXT:    v_mov_b32_e32 v4, 0x3900
; VI-NEXT:    s_mov_b32 s8, s0
; VI-NEXT:    s_mov_b32 s9, s1
; VI-NEXT:    s_waitcnt vmcnt(2)
; VI-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
; VI-NEXT:    s_waitcnt vmcnt(1)
; VI-NEXT:    v_cmp_nlt_f16_e32 vcc, v0, v1
; VI-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
; VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
; VI-NEXT:    v_cmp_nlt_f16_e32 vcc, v6, v5
; VI-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
; VI-NEXT:    s_endpgm
    <2 x half> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
    <2 x half> addrspace(1)* %b,
    <2 x half> addrspace(1)* %d) {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %d.val = load <2 x half>, <2 x half> addrspace(1)* %d
  %fcmp = fcmp olt <2 x half> %a.val, %b.val
  %r.val = select <2 x i1> %fcmp, <2 x half> <half 0xH3800, half 0xH3900>, <2 x half> %d.val
  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}

define amdgpu_kernel void @select_v2f16_imm_d(
; SI-LABEL: select_v2f16_imm_d:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s11, 0xf000
; SI-NEXT:    s_mov_b32 s10, -1
; SI-NEXT:    s_mov_b32 s14, s10
; SI-NEXT:    s_mov_b32 s15, s11
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s12, s2
; SI-NEXT:    s_mov_b32 s13, s3
; SI-NEXT:    s_mov_b32 s16, s4
; SI-NEXT:    s_mov_b32 s17, s5
; SI-NEXT:    s_mov_b32 s4, s6
; SI-NEXT:    s_mov_b32 s5, s7
; SI-NEXT:    s_mov_b32 s18, s10
; SI-NEXT:    s_mov_b32 s19, s11
; SI-NEXT:    s_mov_b32 s6, s10
; SI-NEXT:    s_mov_b32 s7, s11
; SI-NEXT:    buffer_load_dword v0, off, s[12:15], 0
; SI-NEXT:    buffer_load_dword v1, off, s[16:19], 0
; SI-NEXT:    buffer_load_dword v2, off, s[4:7], 0
; SI-NEXT:    v_mov_b32_e32 v3, 0x3f200000
; SI-NEXT:    s_mov_b32 s8, s0
; SI-NEXT:    s_mov_b32 s9, s1
; SI-NEXT:    s_waitcnt vmcnt(2)
; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
; SI-NEXT:    s_waitcnt vmcnt(1)
; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
; SI-NEXT:    v_cvt_f32_f16_e32 v4, v4
; SI-NEXT:    v_cvt_f32_f16_e32 v5, v5
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    v_cvt_f32_f16_e32 v6, v6
; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
; SI-NEXT:    v_cmp_lt_f32_e32 vcc, v4, v5
; SI-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc
; SI-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
; SI-NEXT:    v_cndmask_b32_e32 v0, 0.5, v2, vcc
; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
; SI-NEXT:    v_or_b32_e32 v0, v0, v1
; SI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: select_v2f16_imm_d:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s11, 0xf000
; VI-NEXT:    s_mov_b32 s10, -1
; VI-NEXT:    s_mov_b32 s14, s10
; VI-NEXT:    s_mov_b32 s15, s11
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s12, s2
; VI-NEXT:    s_mov_b32 s13, s3
; VI-NEXT:    s_mov_b32 s16, s4
; VI-NEXT:    s_mov_b32 s17, s5
; VI-NEXT:    s_mov_b32 s4, s6
; VI-NEXT:    s_mov_b32 s5, s7
; VI-NEXT:    s_mov_b32 s18, s10
; VI-NEXT:    s_mov_b32 s19, s11
; VI-NEXT:    s_mov_b32 s6, s10
; VI-NEXT:    s_mov_b32 s7, s11
; VI-NEXT:    buffer_load_dword v0, off, s[12:15], 0
; VI-NEXT:    buffer_load_dword v1, off, s[16:19], 0
; VI-NEXT:    buffer_load_dword v2, off, s[4:7], 0
; VI-NEXT:    v_mov_b32_e32 v3, 0x3800
; VI-NEXT:    v_mov_b32_e32 v4, 0x3900
; VI-NEXT:    s_mov_b32 s8, s0
; VI-NEXT:    s_mov_b32 s9, s1
; VI-NEXT:    s_waitcnt vmcnt(2)
; VI-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
; VI-NEXT:    s_waitcnt vmcnt(1)
; VI-NEXT:    v_cmp_lt_f16_e32 vcc, v0, v1
; VI-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
; VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
; VI-NEXT:    v_cmp_lt_f16_e32 vcc, v6, v5
; VI-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
; VI-NEXT:    s_endpgm
    <2 x half> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
    <2 x half> addrspace(1)* %b,
    <2 x half> addrspace(1)* %c) {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %c.val = load <2 x half>, <2 x half> addrspace(1)* %c
  %fcmp = fcmp olt <2 x half> %a.val, %b.val
  %r.val = select <2 x i1> %fcmp, <2 x half> %c.val, <2 x half> <half 0xH3800, half 0xH3900>
  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}