1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc -global-isel -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX7 %s 3; RUN: llc -global-isel -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8 %s 4; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10 %s 5 6 7define amdgpu_kernel void @test_div_scale_f32_1(float addrspace(1)* %out, float addrspace(1)* %in) { 8; GFX7-LABEL: test_div_scale_f32_1: 9; GFX7: ; %bb.0: 10; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 11; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 12; GFX7-NEXT: v_mov_b32_e32 v1, 0 13; GFX7-NEXT: s_mov_b32 s6, 0 14; GFX7-NEXT: s_mov_b32 s7, 0xf000 15; GFX7-NEXT: s_waitcnt lgkmcnt(0) 16; GFX7-NEXT: s_mov_b64 s[4:5], s[2:3] 17; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 18; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 offset:4 19; GFX7-NEXT: s_mov_b32 s6, -1 20; GFX7-NEXT: s_waitcnt vmcnt(0) 21; GFX7-NEXT: v_div_scale_f32 v0, s[2:3], v0, v0, v2 22; GFX7-NEXT: s_mov_b64 s[2:3], s[6:7] 23; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 24; GFX7-NEXT: s_endpgm 25; 26; GFX8-LABEL: test_div_scale_f32_1: 27; GFX8: ; %bb.0: 28; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 29; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 30; GFX8-NEXT: s_waitcnt lgkmcnt(0) 31; GFX8-NEXT: v_mov_b32_e32 v0, s2 32; GFX8-NEXT: v_mov_b32_e32 v1, s3 33; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 34; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 35; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v0 36; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc 37; GFX8-NEXT: flat_load_dword v0, v[0:1] 38; GFX8-NEXT: flat_load_dword v1, v[2:3] 39; GFX8-NEXT: s_waitcnt vmcnt(0) 40; GFX8-NEXT: v_div_scale_f32 v2, s[2:3], v1, v1, v0 41; GFX8-NEXT: v_mov_b32_e32 v0, s0 42; GFX8-NEXT: v_mov_b32_e32 v1, s1 43; GFX8-NEXT: flat_store_dword v[0:1], v2 44; GFX8-NEXT: 
s_endpgm 45; 46; GFX10-LABEL: test_div_scale_f32_1: 47; GFX10: ; %bb.0: 48; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 49; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 50; GFX10-NEXT: s_waitcnt lgkmcnt(0) 51; GFX10-NEXT: s_clause 0x1 52; GFX10-NEXT: global_load_dword v1, v0, s[2:3] 53; GFX10-NEXT: global_load_dword v0, v0, s[2:3] offset:4 54; GFX10-NEXT: s_waitcnt vmcnt(0) 55; GFX10-NEXT: v_div_scale_f32 v0, s2, v0, v0, v1 56; GFX10-NEXT: v_mov_b32_e32 v1, 0 57; GFX10-NEXT: global_store_dword v1, v0, s[0:1] 58; GFX10-NEXT: s_endpgm 59 %tid = call i32 @llvm.amdgcn.workitem.id.x() 60 %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid 61 %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 62 63 %a = load volatile float, float addrspace(1)* %gep.0, align 4 64 %b = load volatile float, float addrspace(1)* %gep.1, align 4 65 66 %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float %b, i1 false) 67 %result0 = extractvalue { float, i1 } %result, 0 68 store float %result0, float addrspace(1)* %out, align 4 69 ret void 70} 71 72define amdgpu_kernel void @test_div_scale_f32_2(float addrspace(1)* %out, float addrspace(1)* %in) { 73; GFX7-LABEL: test_div_scale_f32_2: 74; GFX7: ; %bb.0: 75; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 76; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 77; GFX7-NEXT: v_mov_b32_e32 v1, 0 78; GFX7-NEXT: s_mov_b32 s6, 0 79; GFX7-NEXT: s_mov_b32 s7, 0xf000 80; GFX7-NEXT: s_waitcnt lgkmcnt(0) 81; GFX7-NEXT: s_mov_b64 s[4:5], s[2:3] 82; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 83; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 offset:4 84; GFX7-NEXT: s_mov_b32 s6, -1 85; GFX7-NEXT: s_waitcnt vmcnt(0) 86; GFX7-NEXT: v_div_scale_f32 v0, s[2:3], v2, v0, v2 87; GFX7-NEXT: s_mov_b64 s[2:3], s[6:7] 88; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 89; GFX7-NEXT: s_endpgm 90; 91; GFX8-LABEL: test_div_scale_f32_2: 92; GFX8: ; %bb.0: 93; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 94; 
GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 95; GFX8-NEXT: s_waitcnt lgkmcnt(0) 96; GFX8-NEXT: v_mov_b32_e32 v0, s2 97; GFX8-NEXT: v_mov_b32_e32 v1, s3 98; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 99; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 100; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v0 101; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc 102; GFX8-NEXT: flat_load_dword v0, v[0:1] 103; GFX8-NEXT: flat_load_dword v1, v[2:3] 104; GFX8-NEXT: s_waitcnt vmcnt(0) 105; GFX8-NEXT: v_div_scale_f32 v2, s[2:3], v0, v1, v0 106; GFX8-NEXT: v_mov_b32_e32 v0, s0 107; GFX8-NEXT: v_mov_b32_e32 v1, s1 108; GFX8-NEXT: flat_store_dword v[0:1], v2 109; GFX8-NEXT: s_endpgm 110; 111; GFX10-LABEL: test_div_scale_f32_2: 112; GFX10: ; %bb.0: 113; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 114; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 115; GFX10-NEXT: s_waitcnt lgkmcnt(0) 116; GFX10-NEXT: s_clause 0x1 117; GFX10-NEXT: global_load_dword v1, v0, s[2:3] 118; GFX10-NEXT: global_load_dword v0, v0, s[2:3] offset:4 119; GFX10-NEXT: s_waitcnt vmcnt(0) 120; GFX10-NEXT: v_div_scale_f32 v0, s2, v1, v0, v1 121; GFX10-NEXT: v_mov_b32_e32 v1, 0 122; GFX10-NEXT: global_store_dword v1, v0, s[0:1] 123; GFX10-NEXT: s_endpgm 124 %tid = call i32 @llvm.amdgcn.workitem.id.x() 125 %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid 126 %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 127 128 %a = load volatile float, float addrspace(1)* %gep.0, align 4 129 %b = load volatile float, float addrspace(1)* %gep.1, align 4 130 131 %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float %b, i1 true) 132 %result0 = extractvalue { float, i1 } %result, 0 133 store float %result0, float addrspace(1)* %out, align 4 134 ret void 135} 136 137define amdgpu_kernel void @test_div_scale_f64_1(double addrspace(1)* %out, double addrspace(1)* %aptr, double addrspace(1)* %in) { 138; GFX7-LABEL: test_div_scale_f64_1: 139; GFX7: ; %bb.0: 140; GFX7-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd 141; 
GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v0 142; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 143; GFX7-NEXT: s_waitcnt lgkmcnt(0) 144; GFX7-NEXT: v_mov_b32_e32 v0, s2 145; GFX7-NEXT: v_mov_b32_e32 v1, s3 146; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v2 147; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 148; GFX7-NEXT: v_add_i32_e32 v2, vcc, 8, v0 149; GFX7-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc 150; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] 151; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[2:3] 152; GFX7-NEXT: s_waitcnt vmcnt(0) 153; GFX7-NEXT: v_div_scale_f64 v[0:1], s[2:3], v[2:3], v[2:3], v[0:1] 154; GFX7-NEXT: v_mov_b32_e32 v3, s1 155; GFX7-NEXT: v_mov_b32_e32 v2, s0 156; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 157; GFX7-NEXT: s_endpgm 158; 159; GFX8-LABEL: test_div_scale_f64_1: 160; GFX8: ; %bb.0: 161; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 162; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 163; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 164; GFX8-NEXT: s_waitcnt lgkmcnt(0) 165; GFX8-NEXT: v_mov_b32_e32 v0, s2 166; GFX8-NEXT: v_mov_b32_e32 v1, s3 167; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 168; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 169; GFX8-NEXT: v_add_u32_e32 v2, vcc, 8, v0 170; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc 171; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] 172; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[2:3] 173; GFX8-NEXT: s_waitcnt vmcnt(0) 174; GFX8-NEXT: v_div_scale_f64 v[0:1], s[2:3], v[2:3], v[2:3], v[0:1] 175; GFX8-NEXT: v_mov_b32_e32 v3, s1 176; GFX8-NEXT: v_mov_b32_e32 v2, s0 177; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 178; GFX8-NEXT: s_endpgm 179; 180; GFX10-LABEL: test_div_scale_f64_1: 181; GFX10: ; %bb.0: 182; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 183; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v0 184; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 185; GFX10-NEXT: s_waitcnt lgkmcnt(0) 186; GFX10-NEXT: s_clause 0x1 187; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] 188; GFX10-NEXT: global_load_dwordx2 
v[2:3], v2, s[2:3] offset:8 189; GFX10-NEXT: s_waitcnt vmcnt(0) 190; GFX10-NEXT: v_div_scale_f64 v[0:1], s2, v[2:3], v[2:3], v[0:1] 191; GFX10-NEXT: v_mov_b32_e32 v2, 0 192; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 193; GFX10-NEXT: s_endpgm 194 %tid = call i32 @llvm.amdgcn.workitem.id.x() 195 %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid 196 %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 197 198 %a = load volatile double, double addrspace(1)* %gep.0, align 8 199 %b = load volatile double, double addrspace(1)* %gep.1, align 8 200 201 %result = call { double, i1 } @llvm.amdgcn.div.scale.f64(double %a, double %b, i1 false) 202 %result0 = extractvalue { double, i1 } %result, 0 203 store double %result0, double addrspace(1)* %out, align 8 204 ret void 205} 206 207define amdgpu_kernel void @test_div_scale_f64_2(double addrspace(1)* %out, double addrspace(1)* %aptr, double addrspace(1)* %in) { 208; GFX7-LABEL: test_div_scale_f64_2: 209; GFX7: ; %bb.0: 210; GFX7-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd 211; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v0 212; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 213; GFX7-NEXT: s_waitcnt lgkmcnt(0) 214; GFX7-NEXT: v_mov_b32_e32 v0, s2 215; GFX7-NEXT: v_mov_b32_e32 v1, s3 216; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v2 217; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 218; GFX7-NEXT: v_add_i32_e32 v2, vcc, 8, v0 219; GFX7-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc 220; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] 221; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[2:3] 222; GFX7-NEXT: s_waitcnt vmcnt(0) 223; GFX7-NEXT: v_div_scale_f64 v[0:1], s[2:3], v[0:1], v[2:3], v[0:1] 224; GFX7-NEXT: v_mov_b32_e32 v3, s1 225; GFX7-NEXT: v_mov_b32_e32 v2, s0 226; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 227; GFX7-NEXT: s_endpgm 228; 229; GFX8-LABEL: test_div_scale_f64_2: 230; GFX8: ; %bb.0: 231; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 232; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 233; GFX8-NEXT: 
s_load_dwordx2 s[0:1], s[0:1], 0x24 234; GFX8-NEXT: s_waitcnt lgkmcnt(0) 235; GFX8-NEXT: v_mov_b32_e32 v0, s2 236; GFX8-NEXT: v_mov_b32_e32 v1, s3 237; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 238; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 239; GFX8-NEXT: v_add_u32_e32 v2, vcc, 8, v0 240; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc 241; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] 242; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[2:3] 243; GFX8-NEXT: s_waitcnt vmcnt(0) 244; GFX8-NEXT: v_div_scale_f64 v[0:1], s[2:3], v[0:1], v[2:3], v[0:1] 245; GFX8-NEXT: v_mov_b32_e32 v3, s1 246; GFX8-NEXT: v_mov_b32_e32 v2, s0 247; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 248; GFX8-NEXT: s_endpgm 249; 250; GFX10-LABEL: test_div_scale_f64_2: 251; GFX10: ; %bb.0: 252; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 253; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v0 254; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 255; GFX10-NEXT: s_waitcnt lgkmcnt(0) 256; GFX10-NEXT: s_clause 0x1 257; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] 258; GFX10-NEXT: global_load_dwordx2 v[2:3], v2, s[2:3] offset:8 259; GFX10-NEXT: s_waitcnt vmcnt(0) 260; GFX10-NEXT: v_div_scale_f64 v[0:1], s2, v[0:1], v[2:3], v[0:1] 261; GFX10-NEXT: v_mov_b32_e32 v2, 0 262; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 263; GFX10-NEXT: s_endpgm 264 %tid = call i32 @llvm.amdgcn.workitem.id.x() 265 %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid 266 %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 267 268 %a = load volatile double, double addrspace(1)* %gep.0, align 8 269 %b = load volatile double, double addrspace(1)* %gep.1, align 8 270 271 %result = call { double, i1 } @llvm.amdgcn.div.scale.f64(double %a, double %b, i1 true) 272 %result0 = extractvalue { double, i1 } %result, 0 273 store double %result0, double addrspace(1)* %out, align 8 274 ret void 275} 276 277define amdgpu_kernel void @test_div_scale_f32_scalar_num_1(float addrspace(1)* %out, float addrspace(1)* %in, 
[8 x i32], float %a) { 278; GFX7-LABEL: test_div_scale_f32_scalar_num_1: 279; GFX7: ; %bb.0: 280; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 281; GFX7-NEXT: s_load_dword s8, s[0:1], 0x15 282; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 283; GFX7-NEXT: v_mov_b32_e32 v1, 0 284; GFX7-NEXT: s_mov_b32 s2, 0 285; GFX7-NEXT: s_mov_b32 s3, 0xf000 286; GFX7-NEXT: s_waitcnt lgkmcnt(0) 287; GFX7-NEXT: s_mov_b64 s[0:1], s[6:7] 288; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 289; GFX7-NEXT: s_mov_b32 s2, -1 290; GFX7-NEXT: s_mov_b64 s[6:7], s[2:3] 291; GFX7-NEXT: s_waitcnt vmcnt(0) 292; GFX7-NEXT: v_div_scale_f32 v0, s[0:1], v0, v0, s8 293; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 294; GFX7-NEXT: s_endpgm 295; 296; GFX8-LABEL: test_div_scale_f32_scalar_num_1: 297; GFX8: ; %bb.0: 298; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 299; GFX8-NEXT: s_load_dword s0, s[0:1], 0x54 300; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 301; GFX8-NEXT: s_waitcnt lgkmcnt(0) 302; GFX8-NEXT: v_mov_b32_e32 v0, s6 303; GFX8-NEXT: v_mov_b32_e32 v1, s7 304; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 305; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 306; GFX8-NEXT: flat_load_dword v0, v[0:1] 307; GFX8-NEXT: s_waitcnt vmcnt(0) 308; GFX8-NEXT: v_div_scale_f32 v2, s[0:1], v0, v0, s0 309; GFX8-NEXT: v_mov_b32_e32 v0, s4 310; GFX8-NEXT: v_mov_b32_e32 v1, s5 311; GFX8-NEXT: flat_store_dword v[0:1], v2 312; GFX8-NEXT: s_endpgm 313; 314; GFX10-LABEL: test_div_scale_f32_scalar_num_1: 315; GFX10: ; %bb.0: 316; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 317; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 318; GFX10-NEXT: s_load_dword s0, s[0:1], 0x54 319; GFX10-NEXT: v_mov_b32_e32 v1, 0 320; GFX10-NEXT: s_waitcnt lgkmcnt(0) 321; GFX10-NEXT: global_load_dword v0, v0, s[6:7] 322; GFX10-NEXT: s_waitcnt vmcnt(0) 323; GFX10-NEXT: v_div_scale_f32 v0, s0, v0, v0, s0 324; GFX10-NEXT: global_store_dword v1, v0, s[4:5] 325; GFX10-NEXT: s_endpgm 326 %tid = call i32 @llvm.amdgcn.workitem.id.x() 327 %gep = 
getelementptr float, float addrspace(1)* %in, i32 %tid 328 329 %b = load float, float addrspace(1)* %gep, align 4 330 331 %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float %b, i1 false) 332 %result0 = extractvalue { float, i1 } %result, 0 333 store float %result0, float addrspace(1)* %out, align 4 334 ret void 335} 336 337define amdgpu_kernel void @test_div_scale_f32_scalar_num_2(float addrspace(1)* %out, float addrspace(1)* %in, float %a) { 338; GFX7-LABEL: test_div_scale_f32_scalar_num_2: 339; GFX7: ; %bb.0: 340; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 341; GFX7-NEXT: s_load_dword s8, s[0:1], 0xd 342; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 343; GFX7-NEXT: v_mov_b32_e32 v1, 0 344; GFX7-NEXT: s_mov_b32 s2, 0 345; GFX7-NEXT: s_mov_b32 s3, 0xf000 346; GFX7-NEXT: s_waitcnt lgkmcnt(0) 347; GFX7-NEXT: s_mov_b64 s[0:1], s[6:7] 348; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 349; GFX7-NEXT: s_mov_b32 s2, -1 350; GFX7-NEXT: s_mov_b64 s[6:7], s[2:3] 351; GFX7-NEXT: s_waitcnt vmcnt(0) 352; GFX7-NEXT: v_div_scale_f32 v0, s[0:1], s8, v0, s8 353; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 354; GFX7-NEXT: s_endpgm 355; 356; GFX8-LABEL: test_div_scale_f32_scalar_num_2: 357; GFX8: ; %bb.0: 358; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 359; GFX8-NEXT: s_load_dword s0, s[0:1], 0x34 360; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 361; GFX8-NEXT: s_waitcnt lgkmcnt(0) 362; GFX8-NEXT: v_mov_b32_e32 v0, s6 363; GFX8-NEXT: v_mov_b32_e32 v1, s7 364; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 365; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 366; GFX8-NEXT: flat_load_dword v0, v[0:1] 367; GFX8-NEXT: s_waitcnt vmcnt(0) 368; GFX8-NEXT: v_div_scale_f32 v2, s[0:1], s0, v0, s0 369; GFX8-NEXT: v_mov_b32_e32 v0, s4 370; GFX8-NEXT: v_mov_b32_e32 v1, s5 371; GFX8-NEXT: flat_store_dword v[0:1], v2 372; GFX8-NEXT: s_endpgm 373; 374; GFX10-LABEL: test_div_scale_f32_scalar_num_2: 375; GFX10: ; %bb.0: 376; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 
377; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 378; GFX10-NEXT: s_load_dword s0, s[0:1], 0x34 379; GFX10-NEXT: v_mov_b32_e32 v1, 0 380; GFX10-NEXT: s_waitcnt lgkmcnt(0) 381; GFX10-NEXT: global_load_dword v0, v0, s[6:7] 382; GFX10-NEXT: s_waitcnt vmcnt(0) 383; GFX10-NEXT: v_div_scale_f32 v0, s0, s0, v0, s0 384; GFX10-NEXT: global_store_dword v1, v0, s[4:5] 385; GFX10-NEXT: s_endpgm 386 %tid = call i32 @llvm.amdgcn.workitem.id.x() 387 %gep = getelementptr float, float addrspace(1)* %in, i32 %tid 388 389 %b = load float, float addrspace(1)* %gep, align 4 390 391 %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float %b, i1 true) 392 %result0 = extractvalue { float, i1 } %result, 0 393 store float %result0, float addrspace(1)* %out, align 4 394 ret void 395} 396 397define amdgpu_kernel void @test_div_scale_f32_scalar_den_1(float addrspace(1)* %out, float addrspace(1)* %in, float %b) { 398; GFX7-LABEL: test_div_scale_f32_scalar_den_1: 399; GFX7: ; %bb.0: 400; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 401; GFX7-NEXT: s_load_dword s8, s[0:1], 0xd 402; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 403; GFX7-NEXT: v_mov_b32_e32 v1, 0 404; GFX7-NEXT: s_mov_b32 s2, 0 405; GFX7-NEXT: s_mov_b32 s3, 0xf000 406; GFX7-NEXT: s_waitcnt lgkmcnt(0) 407; GFX7-NEXT: s_mov_b64 s[0:1], s[6:7] 408; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 409; GFX7-NEXT: s_mov_b32 s2, -1 410; GFX7-NEXT: s_mov_b64 s[6:7], s[2:3] 411; GFX7-NEXT: s_waitcnt vmcnt(0) 412; GFX7-NEXT: v_div_scale_f32 v0, s[0:1], s8, s8, v0 413; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 414; GFX7-NEXT: s_endpgm 415; 416; GFX8-LABEL: test_div_scale_f32_scalar_den_1: 417; GFX8: ; %bb.0: 418; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 419; GFX8-NEXT: s_load_dword s0, s[0:1], 0x34 420; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 421; GFX8-NEXT: s_waitcnt lgkmcnt(0) 422; GFX8-NEXT: v_mov_b32_e32 v0, s6 423; GFX8-NEXT: v_mov_b32_e32 v1, s7 424; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 425; GFX8-NEXT: 
v_addc_u32_e32 v1, vcc, 0, v1, vcc 426; GFX8-NEXT: flat_load_dword v0, v[0:1] 427; GFX8-NEXT: s_waitcnt vmcnt(0) 428; GFX8-NEXT: v_div_scale_f32 v2, s[0:1], s0, s0, v0 429; GFX8-NEXT: v_mov_b32_e32 v0, s4 430; GFX8-NEXT: v_mov_b32_e32 v1, s5 431; GFX8-NEXT: flat_store_dword v[0:1], v2 432; GFX8-NEXT: s_endpgm 433; 434; GFX10-LABEL: test_div_scale_f32_scalar_den_1: 435; GFX10: ; %bb.0: 436; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 437; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 438; GFX10-NEXT: s_load_dword s0, s[0:1], 0x34 439; GFX10-NEXT: v_mov_b32_e32 v1, 0 440; GFX10-NEXT: s_waitcnt lgkmcnt(0) 441; GFX10-NEXT: global_load_dword v0, v0, s[6:7] 442; GFX10-NEXT: s_waitcnt vmcnt(0) 443; GFX10-NEXT: v_div_scale_f32 v0, s0, s0, s0, v0 444; GFX10-NEXT: global_store_dword v1, v0, s[4:5] 445; GFX10-NEXT: s_endpgm 446 %tid = call i32 @llvm.amdgcn.workitem.id.x() 447 %gep = getelementptr float, float addrspace(1)* %in, i32 %tid 448 449 %a = load float, float addrspace(1)* %gep, align 4 450 451 %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float %b, i1 false) 452 %result0 = extractvalue { float, i1 } %result, 0 453 store float %result0, float addrspace(1)* %out, align 4 454 ret void 455} 456 457define amdgpu_kernel void @test_div_scale_f32_scalar_den_2(float addrspace(1)* %out, float addrspace(1)* %in, float %b) { 458; GFX7-LABEL: test_div_scale_f32_scalar_den_2: 459; GFX7: ; %bb.0: 460; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 461; GFX7-NEXT: s_load_dword s8, s[0:1], 0xd 462; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 463; GFX7-NEXT: v_mov_b32_e32 v1, 0 464; GFX7-NEXT: s_mov_b32 s2, 0 465; GFX7-NEXT: s_mov_b32 s3, 0xf000 466; GFX7-NEXT: s_waitcnt lgkmcnt(0) 467; GFX7-NEXT: s_mov_b64 s[0:1], s[6:7] 468; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 469; GFX7-NEXT: s_mov_b32 s2, -1 470; GFX7-NEXT: s_mov_b64 s[6:7], s[2:3] 471; GFX7-NEXT: s_waitcnt vmcnt(0) 472; GFX7-NEXT: v_div_scale_f32 v0, s[0:1], v0, s8, v0 473; GFX7-NEXT: 
buffer_store_dword v0, off, s[4:7], 0 474; GFX7-NEXT: s_endpgm 475; 476; GFX8-LABEL: test_div_scale_f32_scalar_den_2: 477; GFX8: ; %bb.0: 478; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 479; GFX8-NEXT: s_load_dword s0, s[0:1], 0x34 480; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 481; GFX8-NEXT: s_waitcnt lgkmcnt(0) 482; GFX8-NEXT: v_mov_b32_e32 v0, s6 483; GFX8-NEXT: v_mov_b32_e32 v1, s7 484; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 485; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 486; GFX8-NEXT: flat_load_dword v0, v[0:1] 487; GFX8-NEXT: s_waitcnt vmcnt(0) 488; GFX8-NEXT: v_div_scale_f32 v2, s[0:1], v0, s0, v0 489; GFX8-NEXT: v_mov_b32_e32 v0, s4 490; GFX8-NEXT: v_mov_b32_e32 v1, s5 491; GFX8-NEXT: flat_store_dword v[0:1], v2 492; GFX8-NEXT: s_endpgm 493; 494; GFX10-LABEL: test_div_scale_f32_scalar_den_2: 495; GFX10: ; %bb.0: 496; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 497; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 498; GFX10-NEXT: s_load_dword s0, s[0:1], 0x34 499; GFX10-NEXT: v_mov_b32_e32 v1, 0 500; GFX10-NEXT: s_waitcnt lgkmcnt(0) 501; GFX10-NEXT: global_load_dword v0, v0, s[6:7] 502; GFX10-NEXT: s_waitcnt vmcnt(0) 503; GFX10-NEXT: v_div_scale_f32 v0, s0, v0, s0, v0 504; GFX10-NEXT: global_store_dword v1, v0, s[4:5] 505; GFX10-NEXT: s_endpgm 506 %tid = call i32 @llvm.amdgcn.workitem.id.x() 507 %gep = getelementptr float, float addrspace(1)* %in, i32 %tid 508 509 %a = load float, float addrspace(1)* %gep, align 4 510 511 %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float %b, i1 true) 512 %result0 = extractvalue { float, i1 } %result, 0 513 store float %result0, float addrspace(1)* %out, align 4 514 ret void 515} 516 517define amdgpu_kernel void @test_div_scale_f64_scalar_num_1(double addrspace(1)* %out, double addrspace(1)* %in, [8 x i32], double %a) { 518; GFX7-LABEL: test_div_scale_f64_scalar_num_1: 519; GFX7: ; %bb.0: 520; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 521; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x15 522; 
GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v0 523; GFX7-NEXT: s_waitcnt lgkmcnt(0) 524; GFX7-NEXT: v_mov_b32_e32 v0, s6 525; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v2 526; GFX7-NEXT: v_mov_b32_e32 v1, s7 527; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 528; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] 529; GFX7-NEXT: v_mov_b32_e32 v2, s4 530; GFX7-NEXT: v_mov_b32_e32 v3, s5 531; GFX7-NEXT: s_waitcnt vmcnt(0) 532; GFX7-NEXT: v_div_scale_f64 v[0:1], s[0:1], v[0:1], v[0:1], s[0:1] 533; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 534; GFX7-NEXT: s_endpgm 535; 536; GFX8-LABEL: test_div_scale_f64_scalar_num_1: 537; GFX8: ; %bb.0: 538; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 539; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x54 540; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 541; GFX8-NEXT: s_waitcnt lgkmcnt(0) 542; GFX8-NEXT: v_mov_b32_e32 v0, s6 543; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 544; GFX8-NEXT: v_mov_b32_e32 v1, s7 545; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 546; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] 547; GFX8-NEXT: v_mov_b32_e32 v2, s4 548; GFX8-NEXT: v_mov_b32_e32 v3, s5 549; GFX8-NEXT: s_waitcnt vmcnt(0) 550; GFX8-NEXT: v_div_scale_f64 v[0:1], s[0:1], v[0:1], v[0:1], s[0:1] 551; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 552; GFX8-NEXT: s_endpgm 553; 554; GFX10-LABEL: test_div_scale_f64_scalar_num_1: 555; GFX10: ; %bb.0: 556; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 557; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 558; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x54 559; GFX10-NEXT: v_mov_b32_e32 v2, 0 560; GFX10-NEXT: s_waitcnt lgkmcnt(0) 561; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[6:7] 562; GFX10-NEXT: s_waitcnt vmcnt(0) 563; GFX10-NEXT: v_div_scale_f64 v[0:1], s0, v[0:1], v[0:1], s[0:1] 564; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] 565; GFX10-NEXT: s_endpgm 566 %tid = call i32 @llvm.amdgcn.workitem.id.x() 567 %gep = getelementptr double, double addrspace(1)* %in, i32 %tid 568 569 %b = load double, double addrspace(1)* 
%gep, align 8 570 571 %result = call { double, i1 } @llvm.amdgcn.div.scale.f64(double %a, double %b, i1 false) 572 %result0 = extractvalue { double, i1 } %result, 0 573 store double %result0, double addrspace(1)* %out, align 8 574 ret void 575} 576 577define amdgpu_kernel void @test_div_scale_f64_scalar_num_2(double addrspace(1)* %out, double addrspace(1)* %in, [8 x i32], double %a) { 578; GFX7-LABEL: test_div_scale_f64_scalar_num_2: 579; GFX7: ; %bb.0: 580; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 581; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x15 582; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v0 583; GFX7-NEXT: s_waitcnt lgkmcnt(0) 584; GFX7-NEXT: v_mov_b32_e32 v0, s6 585; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v2 586; GFX7-NEXT: v_mov_b32_e32 v1, s7 587; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 588; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] 589; GFX7-NEXT: v_mov_b32_e32 v2, s4 590; GFX7-NEXT: v_mov_b32_e32 v3, s5 591; GFX7-NEXT: s_waitcnt vmcnt(0) 592; GFX7-NEXT: v_div_scale_f64 v[0:1], s[0:1], s[0:1], v[0:1], s[0:1] 593; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 594; GFX7-NEXT: s_endpgm 595; 596; GFX8-LABEL: test_div_scale_f64_scalar_num_2: 597; GFX8: ; %bb.0: 598; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 599; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x54 600; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 601; GFX8-NEXT: s_waitcnt lgkmcnt(0) 602; GFX8-NEXT: v_mov_b32_e32 v0, s6 603; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 604; GFX8-NEXT: v_mov_b32_e32 v1, s7 605; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 606; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] 607; GFX8-NEXT: v_mov_b32_e32 v2, s4 608; GFX8-NEXT: v_mov_b32_e32 v3, s5 609; GFX8-NEXT: s_waitcnt vmcnt(0) 610; GFX8-NEXT: v_div_scale_f64 v[0:1], s[0:1], s[0:1], v[0:1], s[0:1] 611; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 612; GFX8-NEXT: s_endpgm 613; 614; GFX10-LABEL: test_div_scale_f64_scalar_num_2: 615; GFX10: ; %bb.0: 616; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 617; GFX10-NEXT: 
v_lshlrev_b32_e32 v0, 3, v0 618; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x54 619; GFX10-NEXT: v_mov_b32_e32 v2, 0 620; GFX10-NEXT: s_waitcnt lgkmcnt(0) 621; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[6:7] 622; GFX10-NEXT: s_waitcnt vmcnt(0) 623; GFX10-NEXT: v_div_scale_f64 v[0:1], s0, s[0:1], v[0:1], s[0:1] 624; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] 625; GFX10-NEXT: s_endpgm 626 %tid = call i32 @llvm.amdgcn.workitem.id.x() 627 %gep = getelementptr double, double addrspace(1)* %in, i32 %tid 628 629 %b = load double, double addrspace(1)* %gep, align 8 630 631 %result = call { double, i1 } @llvm.amdgcn.div.scale.f64(double %a, double %b, i1 true) 632 %result0 = extractvalue { double, i1 } %result, 0 633 store double %result0, double addrspace(1)* %out, align 8 634 ret void 635} 636 637define amdgpu_kernel void @test_div_scale_f64_scalar_den_1(double addrspace(1)* %out, double addrspace(1)* %in, [8 x i32], double %b) { 638; GFX7-LABEL: test_div_scale_f64_scalar_den_1: 639; GFX7: ; %bb.0: 640; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 641; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x15 642; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v0 643; GFX7-NEXT: s_waitcnt lgkmcnt(0) 644; GFX7-NEXT: v_mov_b32_e32 v0, s6 645; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v2 646; GFX7-NEXT: v_mov_b32_e32 v1, s7 647; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 648; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] 649; GFX7-NEXT: v_mov_b32_e32 v2, s4 650; GFX7-NEXT: v_mov_b32_e32 v3, s5 651; GFX7-NEXT: s_waitcnt vmcnt(0) 652; GFX7-NEXT: v_div_scale_f64 v[0:1], s[0:1], s[0:1], s[0:1], v[0:1] 653; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 654; GFX7-NEXT: s_endpgm 655; 656; GFX8-LABEL: test_div_scale_f64_scalar_den_1: 657; GFX8: ; %bb.0: 658; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 659; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x54 660; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 661; GFX8-NEXT: s_waitcnt lgkmcnt(0) 662; GFX8-NEXT: v_mov_b32_e32 v0, s6 663; GFX8-NEXT: 
v_add_u32_e32 v0, vcc, v0, v2 664; GFX8-NEXT: v_mov_b32_e32 v1, s7 665; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 666; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] 667; GFX8-NEXT: v_mov_b32_e32 v2, s4 668; GFX8-NEXT: v_mov_b32_e32 v3, s5 669; GFX8-NEXT: s_waitcnt vmcnt(0) 670; GFX8-NEXT: v_div_scale_f64 v[0:1], s[0:1], s[0:1], s[0:1], v[0:1] 671; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 672; GFX8-NEXT: s_endpgm 673; 674; GFX10-LABEL: test_div_scale_f64_scalar_den_1: 675; GFX10: ; %bb.0: 676; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 677; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 678; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x54 679; GFX10-NEXT: v_mov_b32_e32 v2, 0 680; GFX10-NEXT: s_waitcnt lgkmcnt(0) 681; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[6:7] 682; GFX10-NEXT: s_waitcnt vmcnt(0) 683; GFX10-NEXT: v_div_scale_f64 v[0:1], s0, s[0:1], s[0:1], v[0:1] 684; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] 685; GFX10-NEXT: s_endpgm 686 %tid = call i32 @llvm.amdgcn.workitem.id.x() 687 %gep = getelementptr double, double addrspace(1)* %in, i32 %tid 688 689 %a = load double, double addrspace(1)* %gep, align 8 690 691 %result = call { double, i1 } @llvm.amdgcn.div.scale.f64(double %a, double %b, i1 false) 692 %result0 = extractvalue { double, i1 } %result, 0 693 store double %result0, double addrspace(1)* %out, align 8 694 ret void 695} 696 697define amdgpu_kernel void @test_div_scale_f64_scalar_den_2(double addrspace(1)* %out, double addrspace(1)* %in, [8 x i32], double %b) { 698; GFX7-LABEL: test_div_scale_f64_scalar_den_2: 699; GFX7: ; %bb.0: 700; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 701; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x15 702; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v0 703; GFX7-NEXT: s_waitcnt lgkmcnt(0) 704; GFX7-NEXT: v_mov_b32_e32 v0, s6 705; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v2 706; GFX7-NEXT: v_mov_b32_e32 v1, s7 707; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 708; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] 709; 
GFX7-NEXT: v_mov_b32_e32 v2, s4 710; GFX7-NEXT: v_mov_b32_e32 v3, s5 711; GFX7-NEXT: s_waitcnt vmcnt(0) 712; GFX7-NEXT: v_div_scale_f64 v[0:1], s[0:1], v[0:1], s[0:1], v[0:1] 713; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 714; GFX7-NEXT: s_endpgm 715; 716; GFX8-LABEL: test_div_scale_f64_scalar_den_2: 717; GFX8: ; %bb.0: 718; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 719; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x54 720; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 721; GFX8-NEXT: s_waitcnt lgkmcnt(0) 722; GFX8-NEXT: v_mov_b32_e32 v0, s6 723; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 724; GFX8-NEXT: v_mov_b32_e32 v1, s7 725; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 726; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] 727; GFX8-NEXT: v_mov_b32_e32 v2, s4 728; GFX8-NEXT: v_mov_b32_e32 v3, s5 729; GFX8-NEXT: s_waitcnt vmcnt(0) 730; GFX8-NEXT: v_div_scale_f64 v[0:1], s[0:1], v[0:1], s[0:1], v[0:1] 731; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 732; GFX8-NEXT: s_endpgm 733; 734; GFX10-LABEL: test_div_scale_f64_scalar_den_2: 735; GFX10: ; %bb.0: 736; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 737; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 738; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x54 739; GFX10-NEXT: v_mov_b32_e32 v2, 0 740; GFX10-NEXT: s_waitcnt lgkmcnt(0) 741; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[6:7] 742; GFX10-NEXT: s_waitcnt vmcnt(0) 743; GFX10-NEXT: v_div_scale_f64 v[0:1], s0, v[0:1], s[0:1], v[0:1] 744; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] 745; GFX10-NEXT: s_endpgm 746 %tid = call i32 @llvm.amdgcn.workitem.id.x() 747 %gep = getelementptr double, double addrspace(1)* %in, i32 %tid 748 749 %a = load double, double addrspace(1)* %gep, align 8 750 751 %result = call { double, i1 } @llvm.amdgcn.div.scale.f64(double %a, double %b, i1 true) 752 %result0 = extractvalue { double, i1 } %result, 0 753 store double %result0, double addrspace(1)* %out, align 8 754 ret void 755} 756 757define amdgpu_kernel void 
@test_div_scale_f32_all_scalar_1(float addrspace(1)* %out, [8 x i32], float %a, [8 x i32], float %b) {
; GFX7-LABEL: test_div_scale_f32_all_scalar_1:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; GFX7-NEXT:    s_load_dword s2, s[0:1], 0x13
; GFX7-NEXT:    s_load_dword s0, s[0:1], 0x1c
; GFX7-NEXT:    s_mov_b32 s6, -1
; GFX7-NEXT:    s_mov_b32 s7, 0xf000
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    v_div_scale_f32 v0, s[0:1], v0, v0, s2
; GFX7-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX7-NEXT:    s_endpgm
;
; GFX8-LABEL: test_div_scale_f32_all_scalar_1:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x4c
; GFX8-NEXT:    s_load_dword s3, s[0:1], 0x70
; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v0, s3
; GFX8-NEXT:    v_div_scale_f32 v2, s[2:3], v0, v0, s2
; GFX8-NEXT:    v_mov_b32_e32 v0, s0
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NEXT:    flat_store_dword v[0:1], v2
; GFX8-NEXT:    s_endpgm
;
; GFX10-LABEL: test_div_scale_f32_all_scalar_1:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_clause 0x2
; GFX10-NEXT:    s_load_dword s2, s[0:1], 0x4c
; GFX10-NEXT:    s_load_dword s3, s[0:1], 0x70
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_div_scale_f32 v0, s2, s3, s3, s2
; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
; GFX10-NEXT:    s_endpgm
  %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float %b, i1 false)
  %result0 = extractvalue { float, i1 } %result, 0
  store float %result0, float addrspace(1)* %out, align 4
  ret void
}

; Same all-kernel-argument f32 case as test_div_scale_f32_all_scalar_1, but
; with the i1 operand true: the assertions expect %a as the first and third
; sources of v_div_scale_f32 and %b as the second.
; The unnamed [8 x i32] parameters sit between the named ones; presumably they
; keep each scalar in its own kernarg load - TODO confirm.
define amdgpu_kernel void @test_div_scale_f32_all_scalar_2(float addrspace(1)* %out, [8 x i32], float %a, [8 x i32], float %b) {
; GFX7-LABEL: test_div_scale_f32_all_scalar_2:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; GFX7-NEXT:    s_load_dword s2, s[0:1], 0x13
; GFX7-NEXT:    s_load_dword s0, s[0:1], 0x1c
; GFX7-NEXT:    s_mov_b32 s6, -1
; GFX7-NEXT:    s_mov_b32 s7, 0xf000
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v0, s0
; GFX7-NEXT:    v_div_scale_f32 v0, s[0:1], s2, v0, s2
; GFX7-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX7-NEXT:    s_endpgm
;
; GFX8-LABEL: test_div_scale_f32_all_scalar_2:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x4c
; GFX8-NEXT:    s_load_dword s3, s[0:1], 0x70
; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v0, s3
; GFX8-NEXT:    v_div_scale_f32 v2, s[2:3], s2, v0, s2
; GFX8-NEXT:    v_mov_b32_e32 v0, s0
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NEXT:    flat_store_dword v[0:1], v2
; GFX8-NEXT:    s_endpgm
;
; GFX10-LABEL: test_div_scale_f32_all_scalar_2:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_clause 0x2
; GFX10-NEXT:    s_load_dword s2, s[0:1], 0x4c
; GFX10-NEXT:    s_load_dword s3, s[0:1], 0x70
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_div_scale_f32 v0, s2, s2, s3, s2
; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
; GFX10-NEXT:    s_endpgm
  %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float %b, i1 true)
  %result0 = extractvalue { float, i1 } %result, 0
  store float %result0, float addrspace(1)* %out, align 4
  ret void
}

; f64 div.scale with both %a and %b taken from kernel arguments and the i1
; operand false. The assertions expect %b in the first two sources of
; v_div_scale_f64 and %a in the third.
define amdgpu_kernel void @test_div_scale_f64_all_scalar_1(double addrspace(1)* %out, [8 x i32], double %a, [8 x i32], double %b) {
; GFX7-LABEL: test_div_scale_f64_all_scalar_1:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x13
; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x1d
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:
v_mov_b32_e32 v0, s4
; GFX7-NEXT:    v_mov_b32_e32 v1, s5
; GFX7-NEXT:    v_div_scale_f64 v[0:1], s[2:3], v[0:1], v[0:1], s[2:3]
; GFX7-NEXT:    v_mov_b32_e32 v3, s1
; GFX7-NEXT:    v_mov_b32_e32 v2, s0
; GFX7-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; GFX7-NEXT:    s_endpgm
;
; GFX8-LABEL: test_div_scale_f64_all_scalar_1:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x4c
; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x74
; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v0, s4
; GFX8-NEXT:    v_mov_b32_e32 v1, s5
; GFX8-NEXT:    v_div_scale_f64 v[0:1], s[2:3], v[0:1], v[0:1], s[2:3]
; GFX8-NEXT:    v_mov_b32_e32 v3, s1
; GFX8-NEXT:    v_mov_b32_e32 v2, s0
; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT:    s_endpgm
;
; GFX10-LABEL: test_div_scale_f64_all_scalar_1:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_clause 0x2
; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x4c
; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x74
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v2, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_div_scale_f64 v[0:1], s2, s[4:5], s[4:5], s[2:3]
; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX10-NEXT:    s_endpgm
  %result = call { double, i1 } @llvm.amdgcn.div.scale.f64(double %a, double %b, i1 false)
  %result0 = extractvalue { double, i1 } %result, 0
  store double %result0, double addrspace(1)* %out, align 8
  ret void
}

; Same all-kernel-argument f64 case as test_div_scale_f64_all_scalar_1, but
; with the i1 operand true: the assertions expect %a as the first and third
; sources of v_div_scale_f64 and %b as the second. On the hawaii and fiji runs
; %b is copied into v[0:1] first; the gfx1010 run uses both SGPR pairs directly.
define amdgpu_kernel void @test_div_scale_f64_all_scalar_2(double addrspace(1)* %out, [8 x i32], double %a, [8 x i32], double %b) {
; GFX7-LABEL: test_div_scale_f64_all_scalar_2:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x13
; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x1d
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v0, s4
; GFX7-NEXT:    v_mov_b32_e32 v1, s5
; GFX7-NEXT:    v_div_scale_f64 v[0:1], s[2:3], s[2:3], v[0:1], s[2:3]
; GFX7-NEXT:    v_mov_b32_e32 v3, s1
; GFX7-NEXT:    v_mov_b32_e32 v2, s0
; GFX7-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; GFX7-NEXT:    s_endpgm
;
; GFX8-LABEL: test_div_scale_f64_all_scalar_2:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x4c
; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x74
; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v0, s4
; GFX8-NEXT:    v_mov_b32_e32 v1, s5
; GFX8-NEXT:    v_div_scale_f64 v[0:1], s[2:3], s[2:3], v[0:1], s[2:3]
; GFX8-NEXT:    v_mov_b32_e32 v3, s1
; GFX8-NEXT:    v_mov_b32_e32 v2, s0
; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT:    s_endpgm
;
; GFX10-LABEL: test_div_scale_f64_all_scalar_2:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_clause 0x2
; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x4c
; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x74
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v2, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_div_scale_f64 v[0:1], s2, s[2:3], s[4:5], s[2:3]
; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX10-NEXT:    s_endpgm
  %result = call { double, i1 } @llvm.amdgcn.div.scale.f64(double %a, double %b, i1 true)
  %result0 = extractvalue { double, i1 } %result, 0
  store double %result0, double addrspace(1)* %out, align 8
  ret void
}

; The first (numerator, going by the test name) operand is the constant 1.0 and
; the second is loaded from %in at the workitem index; the i1 operand is false.
; The assertions expect 1.0 to appear directly as the third source operand of
; v_div_scale_f32 rather than being moved into a register first.
define amdgpu_kernel void @test_div_scale_f32_inline_imm_num(float addrspace(1)* %out, float addrspace(1)* %in) {
; GFX7-LABEL: test_div_scale_f32_inline_imm_num:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT:    v_mov_b32_e32 v1, 0
; GFX7-NEXT:    s_mov_b32 s6, 0
; GFX7-NEXT:    s_mov_b32 s7, 0xf000
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_mov_b64
s[4:5], s[2:3]
; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT:    s_mov_b32 s6, -1
; GFX7-NEXT:    s_waitcnt vmcnt(0)
; GFX7-NEXT:    v_div_scale_f32 v0, s[2:3], v0, v0, 1.0
; GFX7-NEXT:    s_mov_b64 s[2:3], s[6:7]
; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX7-NEXT:    s_endpgm
;
; GFX8-LABEL: test_div_scale_f32_inline_imm_num:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v0, s2
; GFX8-NEXT:    v_mov_b32_e32 v1, s3
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    flat_load_dword v0, v[0:1]
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_div_scale_f32 v2, s[2:3], v0, v0, 1.0
; GFX8-NEXT:    v_mov_b32_e32 v0, s0
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NEXT:    flat_store_dword v[0:1], v2
; GFX8-NEXT:    s_endpgm
;
; GFX10-LABEL: test_div_scale_f32_inline_imm_num:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_div_scale_f32 v0, s2, v0, v0, 1.0
; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
; GFX10-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
  %a = load float, float addrspace(1)* %gep.0, align 4

  %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float 1.0, float %a, i1 false)
  %result0 = extractvalue { float, i1 } %result, 0
  store float %result0, float addrspace(1)* %out, align 4
  ret void
}

; The second (denominator, going by the test name) operand is the constant 2.0
; and the first is loaded from %in at the workitem index; the i1 operand is
; false. The assertions expect 2.0 directly in both of the first two source
; operands of v_div_scale_f32 and the loaded value in the third.
define amdgpu_kernel void @test_div_scale_f32_inline_imm_den(float addrspace(1)* %out, float addrspace(1)* %in) {
; GFX7-LABEL: test_div_scale_f32_inline_imm_den:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT:    v_mov_b32_e32 v1, 0
; GFX7-NEXT:    s_mov_b32 s6, 0
; GFX7-NEXT:    s_mov_b32 s7, 0xf000
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_mov_b64 s[4:5], s[2:3]
; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT:    s_mov_b32 s6, -1
; GFX7-NEXT:    s_waitcnt vmcnt(0)
; GFX7-NEXT:    v_div_scale_f32 v0, s[2:3], 2.0, 2.0, v0
; GFX7-NEXT:    s_mov_b64 s[2:3], s[6:7]
; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX7-NEXT:    s_endpgm
;
; GFX8-LABEL: test_div_scale_f32_inline_imm_den:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v0, s2
; GFX8-NEXT:    v_mov_b32_e32 v1, s3
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    flat_load_dword v0, v[0:1]
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_div_scale_f32 v2, s[2:3], 2.0, 2.0, v0
; GFX8-NEXT:    v_mov_b32_e32 v0, s0
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NEXT:    flat_store_dword v[0:1], v2
; GFX8-NEXT:    s_endpgm
;
; GFX10-LABEL: test_div_scale_f32_inline_imm_den:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_div_scale_f32 v0, s2, 2.0, 2.0, v0
; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
; GFX10-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
  %a = load float, float addrspace(1)* %gep.0, align 4

  %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float 2.0, i1 false)
  %result0 = extractvalue { float, i1 } %result, 0
  store float %result0, float addrspace(1)* %out, align 4
  ret void
}

; The first operand is fabs(%a); %a and %b are volatile loads of two
; consecutive floats starting at the workitem index. The assertions expect the
; fabs as a separate v_and_b32 with 0x7fffffff whose result feeds the third
; source of v_div_scale_f32 (no |x| modifier appears on the v_div_scale_f32
; operands).
define amdgpu_kernel void @test_div_scale_f32_fabs_num(float addrspace(1)* %out, float addrspace(1)* %in) {
; GFX7-LABEL: test_div_scale_f32_fabs_num:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT:    v_mov_b32_e32 v1, 0
; GFX7-NEXT:    s_mov_b32 s6, 0
; GFX7-NEXT:    s_mov_b32 s7, 0xf000
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_mov_b64 s[4:5], s[2:3]
; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 offset:4
; GFX7-NEXT:    s_mov_b32 s6, -1
; GFX7-NEXT:    s_waitcnt vmcnt(1)
; GFX7-NEXT:    v_and_b32_e32 v1, 0x7fffffff, v2
; GFX7-NEXT:    s_waitcnt vmcnt(0)
; GFX7-NEXT:    v_div_scale_f32 v0, s[2:3], v0, v0, v1
; GFX7-NEXT:    s_mov_b64 s[2:3], s[6:7]
; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX7-NEXT:    s_endpgm
;
; GFX8-LABEL: test_div_scale_f32_fabs_num:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v0, s2
; GFX8-NEXT:    v_mov_b32_e32 v1, s3
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 4, v0
; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
; GFX8-NEXT:    flat_load_dword v0, v[0:1]
; GFX8-NEXT:    flat_load_dword v1, v[2:3]
; GFX8-NEXT:    s_waitcnt vmcnt(1)
; GFX8-NEXT:    v_and_b32_e32 v0, 0x7fffffff, v0
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_div_scale_f32 v2, s[2:3], v1, v1, v0
; GFX8-NEXT:    v_mov_b32_e32 v0, s0
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
;
GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT:    s_endpgm
;
; GFX10-LABEL: test_div_scale_f32_fabs_num:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
; GFX10-NEXT:    global_load_dword v0, v0, s[2:3] offset:4
; GFX10-NEXT:    s_waitcnt vmcnt(1)
; GFX10-NEXT:    v_and_b32_e32 v1, 0x7fffffff, v1
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_div_scale_f32 v0, s2, v0, v0, v1
; GFX10-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
; GFX10-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1

  %a = load volatile float, float addrspace(1)* %gep.0, align 4
  %b = load volatile float, float addrspace(1)* %gep.1, align 4

  %a.fabs = call float @llvm.fabs.f32(float %a)

  %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a.fabs, float %b, i1 false)
  %result0 = extractvalue { float, i1 } %result, 0
  store float %result0, float addrspace(1)* %out, align 4
  ret void
}

; The second operand is fabs(%b); %a and %b are volatile loads of two
; consecutive floats starting at the workitem index. The assertions expect the
; fabs as a separate v_and_b32 with 0x7fffffff applied to the second loaded
; value, whose result is then used for the first two sources of
; v_div_scale_f32 (no |x| modifier appears on the v_div_scale_f32 operands).
define amdgpu_kernel void @test_div_scale_f32_fabs_den(float addrspace(1)* %out, float addrspace(1)* %in) {
; GFX7-LABEL: test_div_scale_f32_fabs_den:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT:    v_mov_b32_e32 v1, 0
; GFX7-NEXT:    s_mov_b32 s6, 0
; GFX7-NEXT:    s_mov_b32 s7, 0xf000
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_mov_b64 s[4:5], s[2:3]
; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 offset:4
; GFX7-NEXT:    s_mov_b32 s6, -1
; GFX7-NEXT:    s_waitcnt vmcnt(0)
; GFX7-NEXT:    v_and_b32_e32 v0, 0x7fffffff, v0
; GFX7-NEXT:    v_div_scale_f32 v0, s[2:3], v0, v0, v2
; GFX7-NEXT:    s_mov_b64 s[2:3], s[6:7]
; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX7-NEXT:    s_endpgm
;
; GFX8-LABEL: test_div_scale_f32_fabs_den:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v0, s2
; GFX8-NEXT:    v_mov_b32_e32 v1, s3
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 4, v0
; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
; GFX8-NEXT:    flat_load_dword v0, v[0:1]
; GFX8-NEXT:    flat_load_dword v1, v[2:3]
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_and_b32_e32 v1, 0x7fffffff, v1
; GFX8-NEXT:    v_div_scale_f32 v2, s[2:3], v1, v1, v0
; GFX8-NEXT:    v_mov_b32_e32 v0, s0
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NEXT:    flat_store_dword v[0:1], v2
; GFX8-NEXT:    s_endpgm
;
; GFX10-LABEL: test_div_scale_f32_fabs_den:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
; GFX10-NEXT:    global_load_dword v0, v0, s[2:3] offset:4
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_and_b32_e32 v0, 0x7fffffff, v0
; GFX10-NEXT:    v_div_scale_f32 v0, s2, v0, v0, v1
; GFX10-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
; GFX10-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1

  %a = load volatile float, float addrspace(1)* %gep.0, align 4
  %b = load volatile float, float addrspace(1)* %gep.1, align 4

  %b.fabs = call float @llvm.fabs.f32(float %b)

  %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float %b.fabs, i1 false)
  %result0 = extractvalue { float, i1 } %result, 0
  store float %result0, float addrspace(1)* %out, align 4
  ret void
}

; The first operand is the constant 8.0 and the second is undef; the i1 operand
; is false. 0x41000000 in the assertions is the f32 bit pattern of 8.0 and is
; expected as the third source of v_div_scale_f32.
; NOTE(review): the register shown for the undef operand (s0) is presumably
; incidental - regenerate the assertions rather than hand-editing them.
define amdgpu_kernel void @test_div_scale_f32_val_undef_val(float addrspace(1)* %out) #0 {
; GFX7-LABEL: test_div_scale_f32_val_undef_val:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7-NEXT:    v_mov_b32_e32 v0, 0x41000000
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_div_scale_f32 v0, s[2:3], s0, s0, v0
; GFX7-NEXT:    s_mov_b32 s2, -1
; GFX7-NEXT:    s_mov_b32 s3, 0xf000
; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX7-NEXT:    s_endpgm
;
; GFX8-LABEL: test_div_scale_f32_val_undef_val:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    v_mov_b32_e32 v0, 0x41000000
; GFX8-NEXT:    v_div_scale_f32 v2, s[2:3], s0, s0, v0
; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v0, s0
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NEXT:    flat_store_dword v[0:1], v2
; GFX8-NEXT:    s_endpgm
;
; GFX10-LABEL: test_div_scale_f32_val_undef_val:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_div_scale_f32 v0, s2, s0, s0, 0x41000000
; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
; GFX10-NEXT:    s_endpgm
  %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float 8.0, float undef, i1 false)
  %result0 = extractvalue { float, i1 } %result, 0
  store float %result0, float addrspace(1)* %out, align 4
  ret void
}

; The first operand is undef and the second is the constant 8.0; the i1 operand
; is false. The assertions expect 8.0 (0x41000000) to be moved into a VGPR and
; used for the first two sources of v_div_scale_f32, with s0 standing in for
; the undef third source.
define amdgpu_kernel void @test_div_scale_f32_undef_val_val(float addrspace(1)* %out) #0 {
; GFX7-LABEL:
test_div_scale_f32_undef_val_val:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7-NEXT:    v_mov_b32_e32 v0, 0x41000000
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_div_scale_f32 v0, s[2:3], v0, v0, s0
; GFX7-NEXT:    s_mov_b32 s2, -1
; GFX7-NEXT:    s_mov_b32 s3, 0xf000
; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX7-NEXT:    s_endpgm
;
; GFX8-LABEL: test_div_scale_f32_undef_val_val:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    v_mov_b32_e32 v0, 0x41000000
; GFX8-NEXT:    v_div_scale_f32 v2, s[2:3], v0, v0, s0
; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v0, s0
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NEXT:    flat_store_dword v[0:1], v2
; GFX8-NEXT:    s_endpgm
;
; GFX10-LABEL: test_div_scale_f32_undef_val_val:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v0, 0x41000000
; GFX10-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_div_scale_f32 v0, s2, v0, v0, s0
; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
; GFX10-NEXT:    s_endpgm
  %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float undef, float 8.0, i1 false)
  %result0 = extractvalue { float, i1 } %result, 0
  store float %result0, float addrspace(1)* %out, align 4
  ret void
}

; Both value operands are undef; the i1 operand is false. The assertions expect
; the same register (s0) in all three sources of v_div_scale_f32.
; NOTE(review): the choice of s0 is presumably incidental - regenerate the
; assertions rather than hand-editing them.
define amdgpu_kernel void @test_div_scale_f32_undef_undef_val(float addrspace(1)* %out) #0 {
; GFX7-LABEL: test_div_scale_f32_undef_undef_val:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_div_scale_f32 v0, s[2:3], s0, s0, s0
; GFX7-NEXT:    s_mov_b32 s2, -1
; GFX7-NEXT:    s_mov_b32 s3, 0xf000
; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX7-NEXT:    s_endpgm
;
; GFX8-LABEL: test_div_scale_f32_undef_undef_val:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    v_div_scale_f32 v2, s[2:3], s0, s0, s0
; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v0, s0
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NEXT:    flat_store_dword v[0:1], v2
; GFX8-NEXT:    s_endpgm
;
; GFX10-LABEL: test_div_scale_f32_undef_undef_val:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_div_scale_f32 v0, s2, s0, s0, s0
; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
; GFX10-NEXT:    s_endpgm
  %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float undef, float undef, i1 false)
  %result0 = extractvalue { float, i1 } %result, 0
  store float %result0, float addrspace(1)* %out, align 4
  ret void
}

; f64 counterpart of test_div_scale_f32_val_undef_val: the first operand is the
; constant 8.0, the second is undef, and the i1 operand is false. The
; assertions expect 8.0 to be built in s[2:3] (low dword 0, high dword
; 0x40200000) and used as the third source of v_div_scale_f64.
define amdgpu_kernel void @test_div_scale_f64_val_undef_val(double addrspace(1)* %out) #0 {
; GFX7-LABEL: test_div_scale_f64_val_undef_val:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX7-NEXT:    s_mov_b32 s2, 0
; GFX7-NEXT:    s_mov_b32 s3, 0x40200000
; GFX7-NEXT:    v_div_scale_f64 v[0:1], s[2:3], v[0:1], v[0:1], s[2:3]
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v3, s1
; GFX7-NEXT:    v_mov_b32_e32 v2, s0
; GFX7-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; GFX7-NEXT:    s_endpgm
;
; GFX8-LABEL: test_div_scale_f64_val_undef_val:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_mov_b32 s2, 0
; GFX8-NEXT:    s_mov_b32 s3, 0x40200000
; GFX8-NEXT:    v_div_scale_f64 v[0:1], s[2:3], v[0:1], v[0:1], s[2:3]
; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v3, s1
; GFX8-NEXT:    v_mov_b32_e32 v2, s0
; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT:    s_endpgm
;
; GFX10-LABEL: test_div_scale_f64_val_undef_val:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_mov_b32 s2, 0
; GFX10-NEXT:    s_mov_b32 s3, 0x40200000
; GFX10-NEXT:    v_mov_b32_e32 v2, 0
; GFX10-NEXT:    v_div_scale_f64 v[0:1], s2, s[0:1], s[0:1], s[2:3]
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX10-NEXT:    s_endpgm
  %result = call { double, i1 } @llvm.amdgcn.div.scale.f64(double 8.0, double undef, i1 false)
  %result0 = extractvalue { double, i1 } %result, 0
  store double %result0, double addrspace(1)* %out, align 8
  ret void
}

; Intrinsics called by the kernels above.
declare i32 @llvm.amdgcn.workitem.id.x() #1
declare { float, i1 } @llvm.amdgcn.div.scale.f32(float, float, i1) #1
declare { double, i1 } @llvm.amdgcn.div.scale.f64(double, double, i1) #1
declare float @llvm.fabs.f32(float) #1

; #0 is attached to the undef-operand kernels; #1 to the intrinsic declarations.
attributes #0 = { nounwind }
attributes #1 = { nounwind readnone speculatable }