; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: opt -S -mtriple=amdgcn-- -mcpu=tahiti -amdgpu-codegenprepare -amdgpu-bypass-slow-div=0 %s | FileCheck %s
; RUN: llc -mtriple=amdgcn-- -mcpu=tahiti -amdgpu-bypass-slow-div=0 < %s | FileCheck -check-prefix=GCN %s

; Each kernel below performs one integer division or remainder on its two
; value arguments and stores the result through %out.
;   * Lines with the default prefix match the IR printed by the opt RUN line
;     (after -amdgpu-codegenprepare).
;   * Lines with the GCN prefix match the assembly printed by the llc RUN line.
; Both RUN lines pass -amdgpu-bypass-slow-div=0 (presumably to keep the
; slow-division bypass out of the expected output; confirm against the pass).
; The assertions are autogenerated: regenerate them with the scripts named in
; the NOTE lines instead of editing them by hand.

; i32 unsigned division.
define amdgpu_kernel void @udiv_i32(i32 addrspace(1)* %out, i32 %x, i32 %y) {
; CHECK-LABEL: @udiv_i32(
; CHECK-NEXT:    [[TMP1:%.*]] = uitofp i32 [[Y:%.*]] to float
; CHECK-NEXT:    [[TMP2:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP1]])
; CHECK-NEXT:    [[TMP3:%.*]] = fmul fast float [[TMP2]], 0x41EFFFFFC0000000
; CHECK-NEXT:    [[TMP4:%.*]] = fptoui float [[TMP3]] to i32
; CHECK-NEXT:    [[TMP5:%.*]] = sub i32 0, [[Y]]
; CHECK-NEXT:    [[TMP6:%.*]] = mul i32 [[TMP5]], [[TMP4]]
; CHECK-NEXT:    [[TMP7:%.*]] = zext i32 [[TMP4]] to i64
; CHECK-NEXT:    [[TMP8:%.*]] = zext i32 [[TMP6]] to i64
; CHECK-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP7]], [[TMP8]]
; CHECK-NEXT:    [[TMP10:%.*]] = trunc i64 [[TMP9]] to i32
; CHECK-NEXT:    [[TMP11:%.*]] = lshr i64 [[TMP9]], 32
; CHECK-NEXT:    [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32
; CHECK-NEXT:    [[TMP13:%.*]] = add i32 [[TMP4]], [[TMP12]]
; CHECK-NEXT:    [[TMP14:%.*]] = zext i32 [[X:%.*]] to i64
; CHECK-NEXT:    [[TMP15:%.*]] = zext i32 [[TMP13]] to i64
; CHECK-NEXT:    [[TMP16:%.*]] = mul i64 [[TMP14]], [[TMP15]]
; CHECK-NEXT:    [[TMP17:%.*]] = trunc i64 [[TMP16]] to i32
; CHECK-NEXT:    [[TMP18:%.*]] = lshr i64 [[TMP16]], 32
; CHECK-NEXT:    [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
; CHECK-NEXT:    [[TMP20:%.*]] = mul i32 [[TMP19]], [[Y]]
; CHECK-NEXT:    [[TMP21:%.*]] = sub i32 [[X]], [[TMP20]]
; CHECK-NEXT:    [[TMP22:%.*]] = icmp uge i32 [[TMP21]], [[Y]]
; CHECK-NEXT:    [[TMP23:%.*]] = add i32 [[TMP19]], 1
; CHECK-NEXT:    [[TMP24:%.*]] = select i1 [[TMP22]], i32 [[TMP23]], i32 [[TMP19]]
; CHECK-NEXT:    [[TMP25:%.*]] = sub i32 [[TMP21]], [[Y]]
; CHECK-NEXT:    [[TMP26:%.*]] = select i1 [[TMP22]], i32 [[TMP25]], i32 [[TMP21]]
; CHECK-NEXT:    [[TMP27:%.*]] = icmp uge i32 [[TMP26]], [[Y]]
; CHECK-NEXT:    [[TMP28:%.*]] = add i32 [[TMP24]], 1
; CHECK-NEXT:    [[TMP29:%.*]] = select i1 [[TMP27]], i32 [[TMP28]], i32 [[TMP24]]
; CHECK-NEXT:    store i32 [[TMP29]], i32 addrspace(1)* [[OUT:%.*]], align 4
; CHECK-NEXT:    ret void
;
; GCN-LABEL: udiv_i32:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
; GCN-NEXT:    s_mov_b32 s7, 0xf000
; GCN-NEXT:    s_mov_b32 s6, -1
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s3
; GCN-NEXT:    s_sub_i32 s4, 0, s3
; GCN-NEXT:    v_rcp_iflag_f32_e32 v0, v0
; GCN-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
; GCN-NEXT:    v_mul_lo_u32 v1, s4, v0
; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; GCN-NEXT:    v_mul_hi_u32 v1, v0, v1
; GCN-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
; GCN-NEXT:    v_mul_hi_u32 v0, s2, v0
; GCN-NEXT:    v_mul_lo_u32 v1, v0, s3
; GCN-NEXT:    v_add_i32_e32 v2, vcc, 1, v0
; GCN-NEXT:    v_sub_i32_e32 v1, vcc, s2, v1
; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s3, v1
; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
; GCN-NEXT:    v_subrev_i32_e32 v2, vcc, s3, v1
; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, v2, s[0:1]
; GCN-NEXT:    v_add_i32_e32 v2, vcc, 1, v0
; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s3, v1
; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GCN-NEXT:    s_endpgm
  %r = udiv i32 %x, %y
  store i32 %r, i32 addrspace(1)* %out
  ret void
}

; i32 unsigned remainder.
define amdgpu_kernel void @urem_i32(i32 addrspace(1)* %out, i32 %x, i32 %y) {
; CHECK-LABEL: @urem_i32(
; CHECK-NEXT:    [[TMP1:%.*]] = uitofp i32 [[Y:%.*]] to float
; CHECK-NEXT:    [[TMP2:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP1]])
; CHECK-NEXT:    [[TMP3:%.*]] = fmul fast float [[TMP2]], 0x41EFFFFFC0000000
; CHECK-NEXT:    [[TMP4:%.*]] = fptoui float [[TMP3]] to i32
; CHECK-NEXT:    [[TMP5:%.*]] = sub i32 0, [[Y]]
; CHECK-NEXT:    [[TMP6:%.*]] = mul i32 [[TMP5]], [[TMP4]]
; CHECK-NEXT:    [[TMP7:%.*]] = zext i32 [[TMP4]] to i64
; CHECK-NEXT:    [[TMP8:%.*]] = zext i32 [[TMP6]] to i64
; CHECK-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP7]], [[TMP8]]
; CHECK-NEXT:    [[TMP10:%.*]] = trunc i64 [[TMP9]] to i32
; CHECK-NEXT:    [[TMP11:%.*]] = lshr i64 [[TMP9]], 32
; CHECK-NEXT:    [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32
; CHECK-NEXT:    [[TMP13:%.*]] = add i32 [[TMP4]], [[TMP12]]
; CHECK-NEXT:    [[TMP14:%.*]] = zext i32 [[X:%.*]] to i64
; CHECK-NEXT:    [[TMP15:%.*]] = zext i32 [[TMP13]] to i64
; CHECK-NEXT:    [[TMP16:%.*]] = mul i64 [[TMP14]], [[TMP15]]
; CHECK-NEXT:    [[TMP17:%.*]] = trunc i64 [[TMP16]] to i32
; CHECK-NEXT:    [[TMP18:%.*]] = lshr i64 [[TMP16]], 32
; CHECK-NEXT:    [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
; CHECK-NEXT:    [[TMP20:%.*]] = mul i32 [[TMP19]], [[Y]]
; CHECK-NEXT:    [[TMP21:%.*]] = sub i32 [[X]], [[TMP20]]
; CHECK-NEXT:    [[TMP22:%.*]] = icmp uge i32 [[TMP21]], [[Y]]
; CHECK-NEXT:    [[TMP23:%.*]] = sub i32 [[TMP21]], [[Y]]
; CHECK-NEXT:    [[TMP24:%.*]] = select i1 [[TMP22]], i32 [[TMP23]], i32 [[TMP21]]
; CHECK-NEXT:    [[TMP25:%.*]] = icmp uge i32 [[TMP24]], [[Y]]
; CHECK-NEXT:    [[TMP26:%.*]] = sub i32 [[TMP24]], [[Y]]
; CHECK-NEXT:    [[TMP27:%.*]] = select i1 [[TMP25]], i32 [[TMP26]], i32 [[TMP24]]
; CHECK-NEXT:    store i32 [[TMP27]], i32 addrspace(1)* [[OUT:%.*]], align 4
; CHECK-NEXT:    ret void
;
; GCN-LABEL: urem_i32:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; GCN-NEXT:    s_mov_b32 s3, 0xf000
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s5
; GCN-NEXT:    s_sub_i32 s2, 0, s5
; GCN-NEXT:    v_rcp_iflag_f32_e32 v0, v0
; GCN-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
; GCN-NEXT:    v_mul_lo_u32 v1, s2, v0
; GCN-NEXT:    s_mov_b32 s2, -1
; GCN-NEXT:    v_mul_hi_u32 v1, v0, v1
; GCN-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
; GCN-NEXT:    v_mul_hi_u32 v0, s4, v0
; GCN-NEXT:    v_mul_lo_u32 v0, v0, s5
; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
; GCN-NEXT:    v_subrev_i32_e32 v1, vcc, s5, v0
; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s5, v0
; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
; GCN-NEXT:    v_subrev_i32_e32 v1, vcc, s5, v0
; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s5, v0
; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GCN-NEXT:    s_endpgm
  %r = urem i32 %x, %y
  store i32 %r, i32 addrspace(1)* %out
  ret void
}

; i32 signed division.
define amdgpu_kernel void @sdiv_i32(i32 addrspace(1)* %out, i32 %x, i32 %y) {
; CHECK-LABEL: @sdiv_i32(
; CHECK-NEXT:    [[TMP1:%.*]] = ashr i32 [[X:%.*]], 31
; CHECK-NEXT:    [[TMP2:%.*]] = ashr i32 [[Y:%.*]], 31
; CHECK-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
; CHECK-NEXT:    [[TMP4:%.*]] = add i32 [[X]], [[TMP1]]
; CHECK-NEXT:    [[TMP5:%.*]] = add i32 [[Y]], [[TMP2]]
; CHECK-NEXT:    [[TMP6:%.*]] = xor i32 [[TMP4]], [[TMP1]]
; CHECK-NEXT:    [[TMP7:%.*]] = xor i32 [[TMP5]], [[TMP2]]
; CHECK-NEXT:    [[TMP8:%.*]] = uitofp i32 [[TMP7]] to float
; CHECK-NEXT:    [[TMP9:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP8]])
; CHECK-NEXT:    [[TMP10:%.*]] = fmul fast float [[TMP9]], 0x41EFFFFFC0000000
; CHECK-NEXT:    [[TMP11:%.*]] = fptoui float [[TMP10]] to i32
; CHECK-NEXT:    [[TMP12:%.*]] = sub i32 0, [[TMP7]]
; CHECK-NEXT:    [[TMP13:%.*]] = mul i32 [[TMP12]], [[TMP11]]
; CHECK-NEXT:    [[TMP14:%.*]] = zext i32 [[TMP11]] to i64
; CHECK-NEXT:    [[TMP15:%.*]] = zext i32 [[TMP13]] to i64
; CHECK-NEXT:    [[TMP16:%.*]] = mul i64 [[TMP14]], [[TMP15]]
; CHECK-NEXT:    [[TMP17:%.*]] = trunc i64 [[TMP16]] to i32
; CHECK-NEXT:    [[TMP18:%.*]] = lshr i64 [[TMP16]], 32
; CHECK-NEXT:    [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
; CHECK-NEXT:    [[TMP20:%.*]] = add i32 [[TMP11]], [[TMP19]]
; CHECK-NEXT:    [[TMP21:%.*]] = zext i32 [[TMP6]] to i64
; CHECK-NEXT:    [[TMP22:%.*]] = zext i32 [[TMP20]] to i64
; CHECK-NEXT:    [[TMP23:%.*]] = mul i64 [[TMP21]], [[TMP22]]
; CHECK-NEXT:    [[TMP24:%.*]] = trunc i64 [[TMP23]] to i32
; CHECK-NEXT:    [[TMP25:%.*]] = lshr i64 [[TMP23]], 32
; CHECK-NEXT:    [[TMP26:%.*]] = trunc i64 [[TMP25]] to i32
; CHECK-NEXT:    [[TMP27:%.*]] = mul i32 [[TMP26]], [[TMP7]]
; CHECK-NEXT:    [[TMP28:%.*]] = sub i32 [[TMP6]], [[TMP27]]
; CHECK-NEXT:    [[TMP29:%.*]] = icmp uge i32 [[TMP28]], [[TMP7]]
; CHECK-NEXT:    [[TMP30:%.*]] = add i32 [[TMP26]], 1
; CHECK-NEXT:    [[TMP31:%.*]] = select i1 [[TMP29]], i32 [[TMP30]], i32 [[TMP26]]
; CHECK-NEXT:    [[TMP32:%.*]] = sub i32 [[TMP28]], [[TMP7]]
; CHECK-NEXT:    [[TMP33:%.*]] = select i1 [[TMP29]], i32 [[TMP32]], i32 [[TMP28]]
; CHECK-NEXT:    [[TMP34:%.*]] = icmp uge i32 [[TMP33]], [[TMP7]]
; CHECK-NEXT:    [[TMP35:%.*]] = add i32 [[TMP31]], 1
; CHECK-NEXT:    [[TMP36:%.*]] = select i1 [[TMP34]], i32 [[TMP35]], i32 [[TMP31]]
; CHECK-NEXT:    [[TMP37:%.*]] = xor i32 [[TMP36]], [[TMP3]]
; CHECK-NEXT:    [[TMP38:%.*]] = sub i32 [[TMP37]], [[TMP3]]
; CHECK-NEXT:    store i32 [[TMP38]], i32 addrspace(1)* [[OUT:%.*]], align 4
; CHECK-NEXT:    ret void
;
; GCN-LABEL: sdiv_i32:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; GCN-NEXT:    s_mov_b32 s7, 0xf000
; GCN-NEXT:    s_mov_b32 s6, -1
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_ashr_i32 s8, s3, 31
; GCN-NEXT:    s_add_i32 s3, s3, s8
; GCN-NEXT:    s_xor_b32 s9, s3, s8
; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s9
; GCN-NEXT:    s_sub_i32 s3, 0, s9
; GCN-NEXT:    s_ashr_i32 s0, s2, 31
; GCN-NEXT:    s_add_i32 s1, s2, s0
; GCN-NEXT:    v_rcp_iflag_f32_e32 v0, v0
; GCN-NEXT:    s_xor_b32 s1, s1, s0
; GCN-NEXT:    s_xor_b32 s2, s0, s8
; GCN-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
; GCN-NEXT:    v_mul_lo_u32 v1, s3, v0
; GCN-NEXT:    v_mul_hi_u32 v1, v0, v1
; GCN-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
; GCN-NEXT:    v_mul_hi_u32 v0, s1, v0
; GCN-NEXT:    v_mul_lo_u32 v1, v0, s9
; GCN-NEXT:    v_add_i32_e32 v2, vcc, 1, v0
; GCN-NEXT:    v_sub_i32_e32 v1, vcc, s1, v1
; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s9, v1
; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
; GCN-NEXT:    v_subrev_i32_e32 v2, vcc, s9, v1
; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, v2, s[0:1]
; GCN-NEXT:    v_add_i32_e32 v2, vcc, 1, v0
; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s9, v1
; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
; GCN-NEXT:    v_xor_b32_e32 v0, s2, v0
; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, s2, v0
; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GCN-NEXT:    s_endpgm
  %r = sdiv i32 %x, %y
  store i32 %r, i32 addrspace(1)* %out
  ret void
}

; i32 signed remainder.
define amdgpu_kernel void @srem_i32(i32 addrspace(1)* %out, i32 %x, i32 %y) {
; CHECK-LABEL: @srem_i32(
; CHECK-NEXT:    [[TMP1:%.*]] = ashr i32 [[X:%.*]], 31
; CHECK-NEXT:    [[TMP2:%.*]] = ashr i32 [[Y:%.*]], 31
; CHECK-NEXT:    [[TMP3:%.*]] = add i32 [[X]], [[TMP1]]
; CHECK-NEXT:    [[TMP4:%.*]] = add i32 [[Y]], [[TMP2]]
; CHECK-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP1]]
; CHECK-NEXT:    [[TMP6:%.*]] = xor i32 [[TMP4]], [[TMP2]]
; CHECK-NEXT:    [[TMP7:%.*]] = uitofp i32 [[TMP6]] to float
; CHECK-NEXT:    [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]])
; CHECK-NEXT:    [[TMP9:%.*]] = fmul fast float [[TMP8]], 0x41EFFFFFC0000000
; CHECK-NEXT:    [[TMP10:%.*]] = fptoui float [[TMP9]] to i32
; CHECK-NEXT:    [[TMP11:%.*]] = sub i32 0, [[TMP6]]
; CHECK-NEXT:    [[TMP12:%.*]] = mul i32 [[TMP11]], [[TMP10]]
; CHECK-NEXT:    [[TMP13:%.*]] = zext i32 [[TMP10]] to i64
; CHECK-NEXT:    [[TMP14:%.*]] = zext i32 [[TMP12]] to i64
; CHECK-NEXT:    [[TMP15:%.*]] = mul i64 [[TMP13]], [[TMP14]]
; CHECK-NEXT:    [[TMP16:%.*]] = trunc i64 [[TMP15]] to i32
; CHECK-NEXT:    [[TMP17:%.*]] = lshr i64 [[TMP15]], 32
; CHECK-NEXT:    [[TMP18:%.*]] = trunc i64 [[TMP17]] to i32
; CHECK-NEXT:    [[TMP19:%.*]] = add i32 [[TMP10]], [[TMP18]]
; CHECK-NEXT:    [[TMP20:%.*]] = zext i32 [[TMP5]] to i64
; CHECK-NEXT:    [[TMP21:%.*]] = zext i32 [[TMP19]] to i64
; CHECK-NEXT:    [[TMP22:%.*]] = mul i64 [[TMP20]], [[TMP21]]
; CHECK-NEXT:    [[TMP23:%.*]] = trunc i64 [[TMP22]] to i32
; CHECK-NEXT:    [[TMP24:%.*]] = lshr i64 [[TMP22]], 32
; CHECK-NEXT:    [[TMP25:%.*]] = trunc i64 [[TMP24]] to i32
; CHECK-NEXT:    [[TMP26:%.*]] = mul i32 [[TMP25]], [[TMP6]]
; CHECK-NEXT:    [[TMP27:%.*]] = sub i32 [[TMP5]], [[TMP26]]
; CHECK-NEXT:    [[TMP28:%.*]] = icmp uge i32 [[TMP27]], [[TMP6]]
; CHECK-NEXT:    [[TMP29:%.*]] = sub i32 [[TMP27]], [[TMP6]]
; CHECK-NEXT:    [[TMP30:%.*]] = select i1 [[TMP28]], i32 [[TMP29]], i32 [[TMP27]]
; CHECK-NEXT:    [[TMP31:%.*]] = icmp uge i32 [[TMP30]], [[TMP6]]
; CHECK-NEXT:    [[TMP32:%.*]] = sub i32 [[TMP30]], [[TMP6]]
; CHECK-NEXT:    [[TMP33:%.*]] = select i1 [[TMP31]], i32 [[TMP32]], i32 [[TMP30]]
; CHECK-NEXT:    [[TMP34:%.*]] = xor i32 [[TMP33]], [[TMP1]]
; CHECK-NEXT:    [[TMP35:%.*]] = sub i32 [[TMP34]], [[TMP1]]
; CHECK-NEXT:    store i32 [[TMP35]], i32 addrspace(1)* [[OUT:%.*]], align 4
; CHECK-NEXT:    ret void
;
; GCN-LABEL: srem_i32:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_ashr_i32 s4, s3, 31
; GCN-NEXT:    s_add_i32 s3, s3, s4
; GCN-NEXT:    s_xor_b32 s6, s3, s4
; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s6
; GCN-NEXT:    s_sub_i32 s3, 0, s6
; GCN-NEXT:    s_ashr_i32 s4, s2, 31
; GCN-NEXT:    s_add_i32 s2, s2, s4
; GCN-NEXT:    v_rcp_iflag_f32_e32 v0, v0
; GCN-NEXT:    s_xor_b32 s5, s2, s4
; GCN-NEXT:    s_mov_b32 s2, -1
; GCN-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
; GCN-NEXT:    v_mul_lo_u32 v1, s3, v0
; GCN-NEXT:    s_mov_b32 s3, 0xf000
; GCN-NEXT:    v_mul_hi_u32 v1, v0, v1
; GCN-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
; GCN-NEXT:    v_mul_hi_u32 v0, s5, v0
; GCN-NEXT:    v_mul_lo_u32 v0, v0, s6
; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s5, v0
; GCN-NEXT:    v_subrev_i32_e32 v1, vcc, s6, v0
; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s6, v0
; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
; GCN-NEXT:    v_subrev_i32_e32 v1, vcc, s6, v0
; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s6, v0
; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
; GCN-NEXT:    v_xor_b32_e32 v0, s4, v0
; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, s4, v0
; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GCN-NEXT:    s_endpgm
  %r = srem i32 %x, %y
  store i32 %r, i32 addrspace(1)* %out
  ret void
}

; i16 unsigned division.
define amdgpu_kernel void @udiv_i16(i16 addrspace(1)* %out, i16 %x, i16 %y) {
; CHECK-LABEL: @udiv_i16(
; CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[X:%.*]] to i32
; CHECK-NEXT:    [[TMP2:%.*]] = zext i16 [[Y:%.*]] to i32
; CHECK-NEXT:    [[TMP3:%.*]] = uitofp i32 [[TMP1]] to float
; CHECK-NEXT:    [[TMP4:%.*]] = uitofp i32 [[TMP2]] to float
; CHECK-NEXT:    [[TMP5:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP4]])
; CHECK-NEXT:    [[TMP6:%.*]] = fmul fast float [[TMP3]], [[TMP5]]
; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.trunc.f32(float [[TMP6]])
; CHECK-NEXT:    [[TMP8:%.*]] = fneg fast float [[TMP7]]
; CHECK-NEXT:    [[TMP9:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP8]], float [[TMP4]], float [[TMP3]])
; CHECK-NEXT:    [[TMP10:%.*]] = fptoui float [[TMP7]] to i32
; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]])
; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.fabs.f32(float [[TMP4]])
; CHECK-NEXT:    [[TMP13:%.*]] = fcmp fast oge float [[TMP11]], [[TMP12]]
; CHECK-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 1, i32 0
; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[TMP10]], [[TMP14]]
; CHECK-NEXT:    [[TMP16:%.*]] = and i32 [[TMP15]], 65535
; CHECK-NEXT:    [[TMP17:%.*]] = trunc i32 [[TMP16]] to i16
; CHECK-NEXT:    store i16 [[TMP17]], i16 addrspace(1)* [[OUT:%.*]], align 2
; CHECK-NEXT:    ret void
;
; GCN-LABEL: udiv_i16:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dword s2, s[0:1], 0xb
; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_lshr_b32 s3, s2, 16
; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s3
; GCN-NEXT:    s_and_b32 s2, s2, 0xffff
; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s2
; GCN-NEXT:    s_mov_b32 s3, 0xf000
; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v0
; GCN-NEXT:    s_mov_b32 s2, -1
; GCN-NEXT:    v_mul_f32_e32 v2, v1, v2
; GCN-NEXT:    v_trunc_f32_e32 v2, v2
; GCN-NEXT:    v_cvt_u32_f32_e32 v3, v2
; GCN-NEXT:    v_mad_f32 v1, -v2, v0, v1
; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
; GCN-NEXT:    v_addc_u32_e32 v0, vcc, 0, v3, vcc
; GCN-NEXT:    buffer_store_short v0, off, s[0:3], 0
; GCN-NEXT:    s_endpgm
  %r = udiv i16 %x, %y
  store i16 %r, i16 addrspace(1)* %out
  ret void
}

; i16 unsigned remainder.
define amdgpu_kernel void @urem_i16(i16 addrspace(1)* %out, i16 %x, i16 %y) {
; CHECK-LABEL: @urem_i16(
; CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[X:%.*]] to i32
; CHECK-NEXT:    [[TMP2:%.*]] = zext i16 [[Y:%.*]] to i32
; CHECK-NEXT:    [[TMP3:%.*]] = uitofp i32 [[TMP1]] to float
; CHECK-NEXT:    [[TMP4:%.*]] = uitofp i32 [[TMP2]] to float
; CHECK-NEXT:    [[TMP5:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP4]])
; CHECK-NEXT:    [[TMP6:%.*]] = fmul fast float [[TMP3]], [[TMP5]]
; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.trunc.f32(float [[TMP6]])
; CHECK-NEXT:    [[TMP8:%.*]] = fneg fast float [[TMP7]]
; CHECK-NEXT:    [[TMP9:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP8]], float [[TMP4]], float [[TMP3]])
; CHECK-NEXT:    [[TMP10:%.*]] = fptoui float [[TMP7]] to i32
; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]])
; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.fabs.f32(float [[TMP4]])
; CHECK-NEXT:    [[TMP13:%.*]] = fcmp fast oge float [[TMP11]], [[TMP12]]
; CHECK-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 1, i32 0
; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[TMP10]], [[TMP14]]
; CHECK-NEXT:    [[TMP16:%.*]] = mul i32 [[TMP15]], [[TMP2]]
; CHECK-NEXT:    [[TMP17:%.*]] = sub i32 [[TMP1]], [[TMP16]]
; CHECK-NEXT:    [[TMP18:%.*]] = and i32 [[TMP17]], 65535
; CHECK-NEXT:    [[TMP19:%.*]] = trunc i32 [[TMP18]] to i16
; CHECK-NEXT:    store i16 [[TMP19]], i16 addrspace(1)* [[OUT:%.*]], align 2
; CHECK-NEXT:    ret void
;
; GCN-LABEL: urem_i16:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dword s4, s[0:1], 0xb
; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_lshr_b32 s2, s4, 16
; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s2
; GCN-NEXT:    s_and_b32 s3, s4, 0xffff
; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s3
; GCN-NEXT:    s_mov_b32 s3, 0xf000
; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v0
; GCN-NEXT:    v_mul_f32_e32 v2, v1, v2
; GCN-NEXT:    v_trunc_f32_e32 v2, v2
; GCN-NEXT:    v_cvt_u32_f32_e32 v3, v2
; GCN-NEXT:    v_mad_f32 v1, -v2, v0, v1
; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
; GCN-NEXT:    v_addc_u32_e32 v0, vcc, 0, v3, vcc
; GCN-NEXT:    v_mul_lo_u32 v0, v0, s2
; GCN-NEXT:    s_mov_b32 s2, -1
; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
; GCN-NEXT:    buffer_store_short v0, off, s[0:3], 0
; GCN-NEXT:    s_endpgm
  %r = urem i16 %x, %y
  store i16 %r, i16 addrspace(1)* %out
  ret void
}

; i16 signed division.
define amdgpu_kernel void @sdiv_i16(i16 addrspace(1)* %out, i16 %x, i16 %y) {
; CHECK-LABEL: @sdiv_i16(
; CHECK-NEXT:    [[TMP1:%.*]] = sext i16 [[X:%.*]] to i32
; CHECK-NEXT:    [[TMP2:%.*]] = sext i16 [[Y:%.*]] to i32
; CHECK-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
; CHECK-NEXT:    [[TMP4:%.*]] = ashr i32 [[TMP3]], 30
; CHECK-NEXT:    [[TMP5:%.*]] = or i32 [[TMP4]], 1
; CHECK-NEXT:    [[TMP6:%.*]] = sitofp i32 [[TMP1]] to float
; CHECK-NEXT:    [[TMP7:%.*]] = sitofp i32 [[TMP2]] to float
; CHECK-NEXT:    [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]])
; CHECK-NEXT:    [[TMP9:%.*]] = fmul fast float [[TMP6]], [[TMP8]]
; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.trunc.f32(float [[TMP9]])
; CHECK-NEXT:    [[TMP11:%.*]] = fneg fast float [[TMP10]]
; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP11]], float [[TMP7]], float [[TMP6]])
; CHECK-NEXT:    [[TMP13:%.*]] = fptosi float [[TMP10]] to i32
; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP12]])
; CHECK-NEXT:    [[TMP15:%.*]] = call fast float @llvm.fabs.f32(float [[TMP7]])
; CHECK-NEXT:    [[TMP16:%.*]] = fcmp fast oge float [[TMP14]], [[TMP15]]
; CHECK-NEXT:    [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP5]], i32 0
; CHECK-NEXT:    [[TMP18:%.*]] = add i32 [[TMP13]], [[TMP17]]
; CHECK-NEXT:    [[TMP19:%.*]] = shl i32 [[TMP18]], 16
; CHECK-NEXT:    [[TMP20:%.*]] = ashr i32 [[TMP19]], 16
; CHECK-NEXT:    [[TMP21:%.*]] = trunc i32 [[TMP20]] to i16
; CHECK-NEXT:    store i16 [[TMP21]], i16 addrspace(1)* [[OUT:%.*]], align 2
; CHECK-NEXT:    ret void
;
; GCN-LABEL: sdiv_i16:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; GCN-NEXT:    s_load_dword s0, s[0:1], 0xb
; GCN-NEXT:    s_mov_b32 s7, 0xf000
; GCN-NEXT:    s_mov_b32 s6, -1
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_ashr_i32 s1, s0, 16
; GCN-NEXT:    v_cvt_f32_i32_e32 v0, s1
; GCN-NEXT:    s_sext_i32_i16 s0, s0
; GCN-NEXT:    v_cvt_f32_i32_e32 v1, s0
; GCN-NEXT:    s_xor_b32 s0, s0, s1
; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v0
; GCN-NEXT:    s_ashr_i32 s0, s0, 30
; GCN-NEXT:    s_or_b32 s0, s0, 1
; GCN-NEXT:    v_mov_b32_e32 v3, s0
; GCN-NEXT:    v_mul_f32_e32 v2, v1, v2
; GCN-NEXT:    v_trunc_f32_e32 v2, v2
; GCN-NEXT:    v_mad_f32 v1, -v2, v0, v1
; GCN-NEXT:    v_cvt_i32_f32_e32 v2, v2
; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
; GCN-NEXT:    buffer_store_short v0, off, s[4:7], 0
; GCN-NEXT:    s_endpgm
  %r = sdiv i16 %x, %y
  store i16 %r, i16 addrspace(1)* %out
  ret void
}

; i16 signed remainder.
define amdgpu_kernel void @srem_i16(i16 addrspace(1)* %out, i16 %x, i16 %y) {
; CHECK-LABEL: @srem_i16(
; CHECK-NEXT:    [[TMP1:%.*]] = sext i16 [[X:%.*]] to i32
; CHECK-NEXT:    [[TMP2:%.*]] = sext i16 [[Y:%.*]] to i32
; CHECK-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
; CHECK-NEXT:    [[TMP4:%.*]] = ashr i32 [[TMP3]], 30
; CHECK-NEXT:    [[TMP5:%.*]] = or i32 [[TMP4]], 1
; CHECK-NEXT:    [[TMP6:%.*]] = sitofp i32 [[TMP1]] to float
; CHECK-NEXT:    [[TMP7:%.*]] = sitofp i32 [[TMP2]] to float
; CHECK-NEXT:    [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]])
; CHECK-NEXT:    [[TMP9:%.*]] = fmul fast float [[TMP6]], [[TMP8]]
; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.trunc.f32(float [[TMP9]])
; CHECK-NEXT:    [[TMP11:%.*]] = fneg fast float [[TMP10]]
; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP11]], float [[TMP7]], float [[TMP6]])
; CHECK-NEXT:    [[TMP13:%.*]] = fptosi float [[TMP10]] to i32
; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP12]])
; CHECK-NEXT:    [[TMP15:%.*]] = call fast float @llvm.fabs.f32(float [[TMP7]])
; CHECK-NEXT:    [[TMP16:%.*]] = fcmp fast oge float [[TMP14]], [[TMP15]]
; CHECK-NEXT:    [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP5]], i32 0
; CHECK-NEXT:    [[TMP18:%.*]] = add i32 [[TMP13]], [[TMP17]]
; CHECK-NEXT:    [[TMP19:%.*]] = mul i32 [[TMP18]], [[TMP2]]
; CHECK-NEXT:    [[TMP20:%.*]] = sub i32 [[TMP1]], [[TMP19]]
; CHECK-NEXT:    [[TMP21:%.*]] = shl i32 [[TMP20]], 16
; CHECK-NEXT:    [[TMP22:%.*]] = ashr i32 [[TMP21]], 16
; CHECK-NEXT:    [[TMP23:%.*]] = trunc i32 [[TMP22]] to i16
; CHECK-NEXT:    store i16 [[TMP23]], i16 addrspace(1)* [[OUT:%.*]], align 2
; CHECK-NEXT:    ret void
;
; GCN-LABEL: srem_i16:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dword s4, s[0:1], 0xb
; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_ashr_i32 s2, s4, 16
; GCN-NEXT:    v_cvt_f32_i32_e32 v0, s2
; GCN-NEXT:    s_sext_i32_i16 s3, s4
; GCN-NEXT:    v_cvt_f32_i32_e32 v1, s3
; GCN-NEXT:    s_xor_b32 s3, s3, s2
; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v0
; GCN-NEXT:    s_ashr_i32 s3, s3, 30
; GCN-NEXT:    s_or_b32 s3, s3, 1
; GCN-NEXT:    v_mov_b32_e32 v3, s3
; GCN-NEXT:    v_mul_f32_e32 v2, v1, v2
; GCN-NEXT:    v_trunc_f32_e32 v2, v2
; GCN-NEXT:    v_mad_f32 v1, -v2, v0, v1
; GCN-NEXT:    v_cvt_i32_f32_e32 v2, v2
; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
; GCN-NEXT:    s_mov_b32 s3, 0xf000
; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
; GCN-NEXT:    v_mul_lo_u32 v0, v0, s2
; GCN-NEXT:    s_mov_b32 s2, -1
; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
; GCN-NEXT:    buffer_store_short v0, off, s[0:3], 0
; GCN-NEXT:    s_endpgm
  %r = srem i16 %x, %y
  store i16 %r, i16 addrspace(1)* %out
  ret void
}

; i8 unsigned division.
define amdgpu_kernel void @udiv_i8(i8 addrspace(1)* %out, i8 %x, i8 %y) {
; CHECK-LABEL: @udiv_i8(
; CHECK-NEXT:    [[TMP1:%.*]] = zext i8 [[X:%.*]] to i32
; CHECK-NEXT:    [[TMP2:%.*]] = zext i8 [[Y:%.*]] to i32
; CHECK-NEXT:    [[TMP3:%.*]] = uitofp i32 [[TMP1]] to float
; CHECK-NEXT:    [[TMP4:%.*]] = uitofp i32 [[TMP2]] to float
; CHECK-NEXT:    [[TMP5:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP4]])
; CHECK-NEXT:    [[TMP6:%.*]] = fmul fast float [[TMP3]], [[TMP5]]
; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.trunc.f32(float [[TMP6]])
; CHECK-NEXT:    [[TMP8:%.*]] = fneg fast float [[TMP7]]
; CHECK-NEXT:    [[TMP9:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP8]], float [[TMP4]], float [[TMP3]])
; CHECK-NEXT:    [[TMP10:%.*]] = fptoui float [[TMP7]] to i32
; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]])
; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.fabs.f32(float [[TMP4]])
; CHECK-NEXT:    [[TMP13:%.*]] = fcmp fast oge float [[TMP11]], [[TMP12]]
; CHECK-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 1, i32 0
; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[TMP10]], [[TMP14]]
; CHECK-NEXT:    [[TMP16:%.*]] = and i32 [[TMP15]], 255
; CHECK-NEXT:    [[TMP17:%.*]] = trunc i32 [[TMP16]] to i8
; CHECK-NEXT:    store i8 [[TMP17]], i8 addrspace(1)* [[OUT:%.*]], align 1
; CHECK-NEXT:    ret void
;
; GCN-LABEL: udiv_i8:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; GCN-NEXT:    s_load_dword s0, s[0:1], 0xb
; GCN-NEXT:    s_mov_b32 s7, 0xf000
; GCN-NEXT:    s_mov_b32 s6, -1
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    v_cvt_f32_ubyte1_e32 v0, s0
; GCN-NEXT:    v_rcp_iflag_f32_e32 v1, v0
; GCN-NEXT:    v_cvt_f32_ubyte0_e32 v2, s0
; GCN-NEXT:    v_mul_f32_e32 v1, v2, v1
; GCN-NEXT:    v_trunc_f32_e32 v1, v1
; GCN-NEXT:    v_cvt_u32_f32_e32 v3, v1
; GCN-NEXT:    v_mad_f32 v1, -v1, v0, v2
; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
; GCN-NEXT:    v_addc_u32_e32 v0, vcc, 0, v3, vcc
; GCN-NEXT:    buffer_store_byte v0, off, s[4:7], 0
; GCN-NEXT:    s_endpgm
  %r = udiv i8 %x, %y
  store i8 %r, i8 addrspace(1)* %out
  ret void
}

; i8 unsigned remainder.
define amdgpu_kernel void @urem_i8(i8 addrspace(1)* %out, i8 %x, i8 %y) {
; CHECK-LABEL: @urem_i8(
; CHECK-NEXT:    [[TMP1:%.*]] = zext i8 [[X:%.*]] to i32
; CHECK-NEXT:    [[TMP2:%.*]] = zext i8 [[Y:%.*]] to i32
; CHECK-NEXT:    [[TMP3:%.*]] = uitofp i32 [[TMP1]] to float
; CHECK-NEXT:    [[TMP4:%.*]] = uitofp i32 [[TMP2]] to float
; CHECK-NEXT:    [[TMP5:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP4]])
; CHECK-NEXT:    [[TMP6:%.*]] = fmul fast float [[TMP3]], [[TMP5]]
; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.trunc.f32(float [[TMP6]])
; CHECK-NEXT:    [[TMP8:%.*]] = fneg fast float [[TMP7]]
; CHECK-NEXT:    [[TMP9:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP8]], float [[TMP4]], float [[TMP3]])
; CHECK-NEXT:    [[TMP10:%.*]] = fptoui float [[TMP7]] to i32
; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]])
; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.fabs.f32(float [[TMP4]])
; CHECK-NEXT:    [[TMP13:%.*]] = fcmp fast oge float [[TMP11]], [[TMP12]]
; CHECK-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 1, i32 0
; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[TMP10]], [[TMP14]]
; CHECK-NEXT:    [[TMP16:%.*]] = mul i32 [[TMP15]], [[TMP2]]
; CHECK-NEXT:    [[TMP17:%.*]] = sub i32 [[TMP1]], [[TMP16]]
; CHECK-NEXT:    [[TMP18:%.*]] = and i32 [[TMP17]], 255
; CHECK-NEXT:    [[TMP19:%.*]] = trunc i32 [[TMP18]] to i8
; CHECK-NEXT:    store i8 [[TMP19]], i8 addrspace(1)* [[OUT:%.*]], align 1
; CHECK-NEXT:    ret void
;
; GCN-LABEL: urem_i8:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dword s4, s[0:1], 0xb
; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; GCN-NEXT:    s_mov_b32 s3, 0xf000
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    v_cvt_f32_ubyte1_e32 v0, s4
; GCN-NEXT:    v_rcp_iflag_f32_e32 v1, v0
; GCN-NEXT:    v_cvt_f32_ubyte0_e32 v2, s4
; GCN-NEXT:    s_lshr_b32 s2, s4, 8
; GCN-NEXT:    v_mul_f32_e32 v1, v2, v1
; GCN-NEXT:    v_trunc_f32_e32 v1, v1
; GCN-NEXT:    v_cvt_u32_f32_e32 v3, v1
; GCN-NEXT:    v_mad_f32 v1, -v1, v0, v2
; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
; GCN-NEXT:    v_addc_u32_e32 v0, vcc, 0, v3, vcc
; GCN-NEXT:    v_mul_lo_u32 v0, v0, s2
; GCN-NEXT:    s_mov_b32 s2, -1
; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
; GCN-NEXT:    buffer_store_byte v0, off, s[0:3], 0
; GCN-NEXT:    s_endpgm
  %r = urem i8 %x, %y
  store i8 %r, i8 addrspace(1)* %out
  ret void
}

; i8 signed division.
define amdgpu_kernel void @sdiv_i8(i8 addrspace(1)* %out, i8 %x, i8 %y) {
; CHECK-LABEL: @sdiv_i8(
; CHECK-NEXT:    [[TMP1:%.*]] = sext i8 [[X:%.*]] to i32
; CHECK-NEXT:    [[TMP2:%.*]] = sext i8 [[Y:%.*]] to i32
; CHECK-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
; CHECK-NEXT:    [[TMP4:%.*]] = ashr i32 [[TMP3]], 30
; CHECK-NEXT:    [[TMP5:%.*]] = or i32 [[TMP4]], 1
; CHECK-NEXT:    [[TMP6:%.*]] = sitofp i32 [[TMP1]] to float
; CHECK-NEXT:    [[TMP7:%.*]] = sitofp i32 [[TMP2]] to float
; CHECK-NEXT:    [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]])
; CHECK-NEXT:    [[TMP9:%.*]] = fmul fast float [[TMP6]], [[TMP8]]
; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.trunc.f32(float [[TMP9]])
; CHECK-NEXT:    [[TMP11:%.*]] = fneg fast float [[TMP10]]
; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP11]], float [[TMP7]], float [[TMP6]])
; CHECK-NEXT:    [[TMP13:%.*]] = fptosi float [[TMP10]] to i32
; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP12]])
; CHECK-NEXT:    [[TMP15:%.*]] = call fast float @llvm.fabs.f32(float [[TMP7]])
; CHECK-NEXT:    [[TMP16:%.*]] = fcmp fast oge float [[TMP14]], [[TMP15]]
; CHECK-NEXT:    [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP5]], i32 0
; CHECK-NEXT:    [[TMP18:%.*]] = add i32 [[TMP13]], [[TMP17]]
; CHECK-NEXT:    [[TMP19:%.*]] = shl i32 [[TMP18]], 24
; CHECK-NEXT:    [[TMP20:%.*]] = ashr i32 [[TMP19]], 24
; CHECK-NEXT:    [[TMP21:%.*]] = trunc i32 [[TMP20]] to i8
; CHECK-NEXT:    store i8 [[TMP21]], i8 addrspace(1)* [[OUT:%.*]], align 1
; CHECK-NEXT:    ret void
;
; GCN-LABEL: sdiv_i8:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; GCN-NEXT:    s_load_dword s0, s[0:1], 0xb
; GCN-NEXT:    s_mov_b32 s7, 0xf000
; GCN-NEXT:    s_mov_b32 s6, -1
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_bfe_i32 s1, s0, 0x80008
; GCN-NEXT:    v_cvt_f32_i32_e32 v0, s1
; GCN-NEXT:    s_sext_i32_i8 s0, s0
; GCN-NEXT:    v_cvt_f32_i32_e32 v1, s0
; GCN-NEXT:    s_xor_b32 s0, s0, s1
; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v0
; GCN-NEXT:    s_ashr_i32 s0, s0, 30
; GCN-NEXT:    s_or_b32 s0, s0, 1
; GCN-NEXT:    v_mov_b32_e32 v3, s0
; GCN-NEXT:    v_mul_f32_e32 v2, v1, v2
; GCN-NEXT:    v_trunc_f32_e32 v2, v2
; GCN-NEXT:    v_mad_f32 v1, -v2, v0, v1
; GCN-NEXT:    v_cvt_i32_f32_e32 v2, v2
; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
; GCN-NEXT:    buffer_store_byte v0, off, s[4:7], 0
; GCN-NEXT:    s_endpgm
  %r = sdiv i8 %x, %y
  store i8 %r, i8 addrspace(1)* %out
  ret void
}

; i8 signed remainder.
define amdgpu_kernel void @srem_i8(i8 addrspace(1)* %out, i8 %x, i8 %y) {
; CHECK-LABEL: @srem_i8(
; CHECK-NEXT:    [[TMP1:%.*]] = sext i8 [[X:%.*]] to i32
; CHECK-NEXT:    [[TMP2:%.*]] = sext i8 [[Y:%.*]] to i32
; CHECK-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
; CHECK-NEXT:    [[TMP4:%.*]] = ashr i32 [[TMP3]], 30
; CHECK-NEXT:    [[TMP5:%.*]] = or i32 [[TMP4]], 1
; CHECK-NEXT:    [[TMP6:%.*]] = sitofp i32 [[TMP1]] to float
; CHECK-NEXT:    [[TMP7:%.*]] = sitofp i32 [[TMP2]] to float
; CHECK-NEXT:    [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]])
; CHECK-NEXT:    [[TMP9:%.*]] = fmul fast float [[TMP6]], [[TMP8]]
; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.trunc.f32(float [[TMP9]])
; CHECK-NEXT:    [[TMP11:%.*]] = fneg fast float [[TMP10]]
; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP11]], float [[TMP7]], float [[TMP6]])
; CHECK-NEXT:    [[TMP13:%.*]] = fptosi float [[TMP10]] to i32
; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP12]])
; CHECK-NEXT:    [[TMP15:%.*]] = call fast float @llvm.fabs.f32(float [[TMP7]])
; CHECK-NEXT:    [[TMP16:%.*]] = fcmp fast oge float [[TMP14]], [[TMP15]]
; CHECK-NEXT:    [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP5]], i32 0
; CHECK-NEXT:    [[TMP18:%.*]] = add i32 [[TMP13]], [[TMP17]]
; CHECK-NEXT:    [[TMP19:%.*]] = mul i32 [[TMP18]], [[TMP2]]
; CHECK-NEXT:    [[TMP20:%.*]] = sub i32 [[TMP1]], [[TMP19]]
; CHECK-NEXT:    [[TMP21:%.*]] = shl i32 [[TMP20]], 24
; CHECK-NEXT:    [[TMP22:%.*]] = ashr i32 [[TMP21]], 24
; CHECK-NEXT:    [[TMP23:%.*]] = trunc i32 [[TMP22]] to i8
; CHECK-NEXT:    store i8 [[TMP23]], i8 addrspace(1)* [[OUT:%.*]], align 1
; CHECK-NEXT:    ret void
;
; GCN-LABEL: srem_i8:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; GCN-NEXT:    s_load_dword s0, s[0:1], 0xb
; GCN-NEXT:    s_mov_b32 s7, 0xf000
; GCN-NEXT:    s_mov_b32 s6, -1
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_bfe_i32 s1, s0, 0x80008
; GCN-NEXT:    v_cvt_f32_i32_e32 v0, s1
; GCN-NEXT:    s_sext_i32_i8 s3, s0
; GCN-NEXT:    v_cvt_f32_i32_e32 v1, s3
; GCN-NEXT:    s_xor_b32 s1, s3, s1
; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v0
; GCN-NEXT:    s_ashr_i32 s1, s1, 30
; GCN-NEXT:    s_or_b32 s1, s1, 1
; GCN-NEXT:    v_mov_b32_e32 v3, s1
; GCN-NEXT:    v_mul_f32_e32 v2, v1, v2
; GCN-NEXT:    v_trunc_f32_e32 v2, v2
; GCN-NEXT:    v_mad_f32 v1, -v2, v0, v1
; GCN-NEXT:    v_cvt_i32_f32_e32 v2, v2
; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
; GCN-NEXT:    s_lshr_b32 s2, s0, 8
; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
; GCN-NEXT:    v_mul_lo_u32 v0, v0, s2
; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
; GCN-NEXT:    buffer_store_byte v0, off, s[4:7], 0
; GCN-NEXT:    s_endpgm
  %r = srem i8 %x, %y
  store i8 %r, i8 addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @udiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x, <4 x i32> %y) {
; CHECK-LABEL: @udiv_v4i32(
; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i32> [[X:%.*]], i64 0
; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i32> [[Y:%.*]], i64 0
; CHECK-NEXT:    [[TMP3:%.*]] = uitofp i32 [[TMP2]] to float
; CHECK-NEXT:    [[TMP4:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP3]])
; CHECK-NEXT:    [[TMP5:%.*]] = fmul fast float [[TMP4]], 0x41EFFFFFC0000000
; CHECK-NEXT:    [[TMP6:%.*]] = fptoui float [[TMP5]] to i32
; CHECK-NEXT:    [[TMP7:%.*]] = sub i32 0, [[TMP2]]
; CHECK-NEXT:    [[TMP8:%.*]] = mul i32 [[TMP7]], [[TMP6]]
; CHECK-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP6]] to
i64 738; CHECK-NEXT: [[TMP10:%.*]] = zext i32 [[TMP8]] to i64 739; CHECK-NEXT: [[TMP11:%.*]] = mul i64 [[TMP9]], [[TMP10]] 740; CHECK-NEXT: [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32 741; CHECK-NEXT: [[TMP13:%.*]] = lshr i64 [[TMP11]], 32 742; CHECK-NEXT: [[TMP14:%.*]] = trunc i64 [[TMP13]] to i32 743; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP6]], [[TMP14]] 744; CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP1]] to i64 745; CHECK-NEXT: [[TMP17:%.*]] = zext i32 [[TMP15]] to i64 746; CHECK-NEXT: [[TMP18:%.*]] = mul i64 [[TMP16]], [[TMP17]] 747; CHECK-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32 748; CHECK-NEXT: [[TMP20:%.*]] = lshr i64 [[TMP18]], 32 749; CHECK-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32 750; CHECK-NEXT: [[TMP22:%.*]] = mul i32 [[TMP21]], [[TMP2]] 751; CHECK-NEXT: [[TMP23:%.*]] = sub i32 [[TMP1]], [[TMP22]] 752; CHECK-NEXT: [[TMP24:%.*]] = icmp uge i32 [[TMP23]], [[TMP2]] 753; CHECK-NEXT: [[TMP25:%.*]] = add i32 [[TMP21]], 1 754; CHECK-NEXT: [[TMP26:%.*]] = select i1 [[TMP24]], i32 [[TMP25]], i32 [[TMP21]] 755; CHECK-NEXT: [[TMP27:%.*]] = sub i32 [[TMP23]], [[TMP2]] 756; CHECK-NEXT: [[TMP28:%.*]] = select i1 [[TMP24]], i32 [[TMP27]], i32 [[TMP23]] 757; CHECK-NEXT: [[TMP29:%.*]] = icmp uge i32 [[TMP28]], [[TMP2]] 758; CHECK-NEXT: [[TMP30:%.*]] = add i32 [[TMP26]], 1 759; CHECK-NEXT: [[TMP31:%.*]] = select i1 [[TMP29]], i32 [[TMP30]], i32 [[TMP26]] 760; CHECK-NEXT: [[TMP32:%.*]] = insertelement <4 x i32> undef, i32 [[TMP31]], i64 0 761; CHECK-NEXT: [[TMP33:%.*]] = extractelement <4 x i32> [[X]], i64 1 762; CHECK-NEXT: [[TMP34:%.*]] = extractelement <4 x i32> [[Y]], i64 1 763; CHECK-NEXT: [[TMP35:%.*]] = uitofp i32 [[TMP34]] to float 764; CHECK-NEXT: [[TMP36:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP35]]) 765; CHECK-NEXT: [[TMP37:%.*]] = fmul fast float [[TMP36]], 0x41EFFFFFC0000000 766; CHECK-NEXT: [[TMP38:%.*]] = fptoui float [[TMP37]] to i32 767; CHECK-NEXT: [[TMP39:%.*]] = sub i32 0, [[TMP34]] 768; CHECK-NEXT: [[TMP40:%.*]] = mul 
i32 [[TMP39]], [[TMP38]] 769; CHECK-NEXT: [[TMP41:%.*]] = zext i32 [[TMP38]] to i64 770; CHECK-NEXT: [[TMP42:%.*]] = zext i32 [[TMP40]] to i64 771; CHECK-NEXT: [[TMP43:%.*]] = mul i64 [[TMP41]], [[TMP42]] 772; CHECK-NEXT: [[TMP44:%.*]] = trunc i64 [[TMP43]] to i32 773; CHECK-NEXT: [[TMP45:%.*]] = lshr i64 [[TMP43]], 32 774; CHECK-NEXT: [[TMP46:%.*]] = trunc i64 [[TMP45]] to i32 775; CHECK-NEXT: [[TMP47:%.*]] = add i32 [[TMP38]], [[TMP46]] 776; CHECK-NEXT: [[TMP48:%.*]] = zext i32 [[TMP33]] to i64 777; CHECK-NEXT: [[TMP49:%.*]] = zext i32 [[TMP47]] to i64 778; CHECK-NEXT: [[TMP50:%.*]] = mul i64 [[TMP48]], [[TMP49]] 779; CHECK-NEXT: [[TMP51:%.*]] = trunc i64 [[TMP50]] to i32 780; CHECK-NEXT: [[TMP52:%.*]] = lshr i64 [[TMP50]], 32 781; CHECK-NEXT: [[TMP53:%.*]] = trunc i64 [[TMP52]] to i32 782; CHECK-NEXT: [[TMP54:%.*]] = mul i32 [[TMP53]], [[TMP34]] 783; CHECK-NEXT: [[TMP55:%.*]] = sub i32 [[TMP33]], [[TMP54]] 784; CHECK-NEXT: [[TMP56:%.*]] = icmp uge i32 [[TMP55]], [[TMP34]] 785; CHECK-NEXT: [[TMP57:%.*]] = add i32 [[TMP53]], 1 786; CHECK-NEXT: [[TMP58:%.*]] = select i1 [[TMP56]], i32 [[TMP57]], i32 [[TMP53]] 787; CHECK-NEXT: [[TMP59:%.*]] = sub i32 [[TMP55]], [[TMP34]] 788; CHECK-NEXT: [[TMP60:%.*]] = select i1 [[TMP56]], i32 [[TMP59]], i32 [[TMP55]] 789; CHECK-NEXT: [[TMP61:%.*]] = icmp uge i32 [[TMP60]], [[TMP34]] 790; CHECK-NEXT: [[TMP62:%.*]] = add i32 [[TMP58]], 1 791; CHECK-NEXT: [[TMP63:%.*]] = select i1 [[TMP61]], i32 [[TMP62]], i32 [[TMP58]] 792; CHECK-NEXT: [[TMP64:%.*]] = insertelement <4 x i32> [[TMP32]], i32 [[TMP63]], i64 1 793; CHECK-NEXT: [[TMP65:%.*]] = extractelement <4 x i32> [[X]], i64 2 794; CHECK-NEXT: [[TMP66:%.*]] = extractelement <4 x i32> [[Y]], i64 2 795; CHECK-NEXT: [[TMP67:%.*]] = uitofp i32 [[TMP66]] to float 796; CHECK-NEXT: [[TMP68:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP67]]) 797; CHECK-NEXT: [[TMP69:%.*]] = fmul fast float [[TMP68]], 0x41EFFFFFC0000000 798; CHECK-NEXT: [[TMP70:%.*]] = fptoui float [[TMP69]] to i32 
799; CHECK-NEXT: [[TMP71:%.*]] = sub i32 0, [[TMP66]] 800; CHECK-NEXT: [[TMP72:%.*]] = mul i32 [[TMP71]], [[TMP70]] 801; CHECK-NEXT: [[TMP73:%.*]] = zext i32 [[TMP70]] to i64 802; CHECK-NEXT: [[TMP74:%.*]] = zext i32 [[TMP72]] to i64 803; CHECK-NEXT: [[TMP75:%.*]] = mul i64 [[TMP73]], [[TMP74]] 804; CHECK-NEXT: [[TMP76:%.*]] = trunc i64 [[TMP75]] to i32 805; CHECK-NEXT: [[TMP77:%.*]] = lshr i64 [[TMP75]], 32 806; CHECK-NEXT: [[TMP78:%.*]] = trunc i64 [[TMP77]] to i32 807; CHECK-NEXT: [[TMP79:%.*]] = add i32 [[TMP70]], [[TMP78]] 808; CHECK-NEXT: [[TMP80:%.*]] = zext i32 [[TMP65]] to i64 809; CHECK-NEXT: [[TMP81:%.*]] = zext i32 [[TMP79]] to i64 810; CHECK-NEXT: [[TMP82:%.*]] = mul i64 [[TMP80]], [[TMP81]] 811; CHECK-NEXT: [[TMP83:%.*]] = trunc i64 [[TMP82]] to i32 812; CHECK-NEXT: [[TMP84:%.*]] = lshr i64 [[TMP82]], 32 813; CHECK-NEXT: [[TMP85:%.*]] = trunc i64 [[TMP84]] to i32 814; CHECK-NEXT: [[TMP86:%.*]] = mul i32 [[TMP85]], [[TMP66]] 815; CHECK-NEXT: [[TMP87:%.*]] = sub i32 [[TMP65]], [[TMP86]] 816; CHECK-NEXT: [[TMP88:%.*]] = icmp uge i32 [[TMP87]], [[TMP66]] 817; CHECK-NEXT: [[TMP89:%.*]] = add i32 [[TMP85]], 1 818; CHECK-NEXT: [[TMP90:%.*]] = select i1 [[TMP88]], i32 [[TMP89]], i32 [[TMP85]] 819; CHECK-NEXT: [[TMP91:%.*]] = sub i32 [[TMP87]], [[TMP66]] 820; CHECK-NEXT: [[TMP92:%.*]] = select i1 [[TMP88]], i32 [[TMP91]], i32 [[TMP87]] 821; CHECK-NEXT: [[TMP93:%.*]] = icmp uge i32 [[TMP92]], [[TMP66]] 822; CHECK-NEXT: [[TMP94:%.*]] = add i32 [[TMP90]], 1 823; CHECK-NEXT: [[TMP95:%.*]] = select i1 [[TMP93]], i32 [[TMP94]], i32 [[TMP90]] 824; CHECK-NEXT: [[TMP96:%.*]] = insertelement <4 x i32> [[TMP64]], i32 [[TMP95]], i64 2 825; CHECK-NEXT: [[TMP97:%.*]] = extractelement <4 x i32> [[X]], i64 3 826; CHECK-NEXT: [[TMP98:%.*]] = extractelement <4 x i32> [[Y]], i64 3 827; CHECK-NEXT: [[TMP99:%.*]] = uitofp i32 [[TMP98]] to float 828; CHECK-NEXT: [[TMP100:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP99]]) 829; CHECK-NEXT: [[TMP101:%.*]] = fmul fast float 
[[TMP100]], 0x41EFFFFFC0000000 830; CHECK-NEXT: [[TMP102:%.*]] = fptoui float [[TMP101]] to i32 831; CHECK-NEXT: [[TMP103:%.*]] = sub i32 0, [[TMP98]] 832; CHECK-NEXT: [[TMP104:%.*]] = mul i32 [[TMP103]], [[TMP102]] 833; CHECK-NEXT: [[TMP105:%.*]] = zext i32 [[TMP102]] to i64 834; CHECK-NEXT: [[TMP106:%.*]] = zext i32 [[TMP104]] to i64 835; CHECK-NEXT: [[TMP107:%.*]] = mul i64 [[TMP105]], [[TMP106]] 836; CHECK-NEXT: [[TMP108:%.*]] = trunc i64 [[TMP107]] to i32 837; CHECK-NEXT: [[TMP109:%.*]] = lshr i64 [[TMP107]], 32 838; CHECK-NEXT: [[TMP110:%.*]] = trunc i64 [[TMP109]] to i32 839; CHECK-NEXT: [[TMP111:%.*]] = add i32 [[TMP102]], [[TMP110]] 840; CHECK-NEXT: [[TMP112:%.*]] = zext i32 [[TMP97]] to i64 841; CHECK-NEXT: [[TMP113:%.*]] = zext i32 [[TMP111]] to i64 842; CHECK-NEXT: [[TMP114:%.*]] = mul i64 [[TMP112]], [[TMP113]] 843; CHECK-NEXT: [[TMP115:%.*]] = trunc i64 [[TMP114]] to i32 844; CHECK-NEXT: [[TMP116:%.*]] = lshr i64 [[TMP114]], 32 845; CHECK-NEXT: [[TMP117:%.*]] = trunc i64 [[TMP116]] to i32 846; CHECK-NEXT: [[TMP118:%.*]] = mul i32 [[TMP117]], [[TMP98]] 847; CHECK-NEXT: [[TMP119:%.*]] = sub i32 [[TMP97]], [[TMP118]] 848; CHECK-NEXT: [[TMP120:%.*]] = icmp uge i32 [[TMP119]], [[TMP98]] 849; CHECK-NEXT: [[TMP121:%.*]] = add i32 [[TMP117]], 1 850; CHECK-NEXT: [[TMP122:%.*]] = select i1 [[TMP120]], i32 [[TMP121]], i32 [[TMP117]] 851; CHECK-NEXT: [[TMP123:%.*]] = sub i32 [[TMP119]], [[TMP98]] 852; CHECK-NEXT: [[TMP124:%.*]] = select i1 [[TMP120]], i32 [[TMP123]], i32 [[TMP119]] 853; CHECK-NEXT: [[TMP125:%.*]] = icmp uge i32 [[TMP124]], [[TMP98]] 854; CHECK-NEXT: [[TMP126:%.*]] = add i32 [[TMP122]], 1 855; CHECK-NEXT: [[TMP127:%.*]] = select i1 [[TMP125]], i32 [[TMP126]], i32 [[TMP122]] 856; CHECK-NEXT: [[TMP128:%.*]] = insertelement <4 x i32> [[TMP96]], i32 [[TMP127]], i64 3 857; CHECK-NEXT: store <4 x i32> [[TMP128]], <4 x i32> addrspace(1)* [[OUT:%.*]], align 16 858; CHECK-NEXT: ret void 859; 860; GCN-LABEL: udiv_v4i32: 861; GCN: ; %bb.0: 862; GCN-NEXT: 
s_load_dwordx8 s[4:11], s[0:1], 0xd 863; GCN-NEXT: s_mov_b32 s3, 0x4f7ffffe 864; GCN-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x9 865; GCN-NEXT: s_mov_b32 s15, 0xf000 866; GCN-NEXT: s_mov_b32 s14, -1 867; GCN-NEXT: s_waitcnt lgkmcnt(0) 868; GCN-NEXT: v_cvt_f32_u32_e32 v0, s8 869; GCN-NEXT: v_cvt_f32_u32_e32 v1, s9 870; GCN-NEXT: s_sub_i32 s2, 0, s8 871; GCN-NEXT: v_cvt_f32_u32_e32 v4, s10 872; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 873; GCN-NEXT: v_rcp_iflag_f32_e32 v1, v1 874; GCN-NEXT: v_cvt_f32_u32_e32 v6, s11 875; GCN-NEXT: v_mul_f32_e32 v0, s3, v0 876; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 877; GCN-NEXT: v_mul_f32_e32 v1, s3, v1 878; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 879; GCN-NEXT: v_mul_lo_u32 v2, s2, v0 880; GCN-NEXT: s_sub_i32 s2, 0, s9 881; GCN-NEXT: v_mul_lo_u32 v3, s2, v1 882; GCN-NEXT: s_sub_i32 s2, 0, s10 883; GCN-NEXT: v_mul_hi_u32 v2, v0, v2 884; GCN-NEXT: v_mul_hi_u32 v3, v1, v3 885; GCN-NEXT: v_add_i32_e32 v0, vcc, v2, v0 886; GCN-NEXT: v_mul_hi_u32 v0, s4, v0 887; GCN-NEXT: v_add_i32_e32 v1, vcc, v3, v1 888; GCN-NEXT: v_mul_hi_u32 v1, s5, v1 889; GCN-NEXT: v_mul_lo_u32 v2, v0, s8 890; GCN-NEXT: v_add_i32_e32 v3, vcc, 1, v0 891; GCN-NEXT: v_mul_lo_u32 v5, v1, s9 892; GCN-NEXT: v_sub_i32_e32 v2, vcc, s4, v2 893; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v2 894; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v3, s[0:1] 895; GCN-NEXT: v_subrev_i32_e32 v3, vcc, s8, v2 896; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] 897; GCN-NEXT: v_add_i32_e32 v3, vcc, 1, v0 898; GCN-NEXT: v_cmp_le_u32_e32 vcc, s8, v2 899; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v4 900; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 901; GCN-NEXT: v_sub_i32_e32 v3, vcc, s5, v5 902; GCN-NEXT: v_add_i32_e32 v4, vcc, 1, v1 903; GCN-NEXT: v_mul_f32_e32 v2, s3, v2 904; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2 905; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v3 906; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[0:1] 907; GCN-NEXT: v_subrev_i32_e32 v5, vcc, s9, v3 908; GCN-NEXT: v_mul_lo_u32 v4, s2, v2 909; GCN-NEXT: 
v_cndmask_b32_e64 v3, v3, v5, s[0:1] 910; GCN-NEXT: v_add_i32_e32 v5, vcc, 1, v1 911; GCN-NEXT: s_sub_i32 s0, 0, s11 912; GCN-NEXT: v_mul_hi_u32 v4, v2, v4 913; GCN-NEXT: v_add_i32_e32 v2, vcc, v4, v2 914; GCN-NEXT: v_rcp_iflag_f32_e32 v4, v6 915; GCN-NEXT: v_cmp_le_u32_e32 vcc, s9, v3 916; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc 917; GCN-NEXT: v_mul_hi_u32 v2, s6, v2 918; GCN-NEXT: v_mul_f32_e32 v4, s3, v4 919; GCN-NEXT: v_cvt_u32_f32_e32 v4, v4 920; GCN-NEXT: v_mul_lo_u32 v3, v2, s10 921; GCN-NEXT: v_add_i32_e32 v6, vcc, 1, v2 922; GCN-NEXT: v_mul_lo_u32 v5, s0, v4 923; GCN-NEXT: v_sub_i32_e32 v3, vcc, s6, v3 924; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v3 925; GCN-NEXT: v_mul_hi_u32 v5, v4, v5 926; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[0:1] 927; GCN-NEXT: v_subrev_i32_e32 v6, vcc, s10, v3 928; GCN-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[0:1] 929; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4 930; GCN-NEXT: v_mul_hi_u32 v4, s7, v4 931; GCN-NEXT: v_add_i32_e32 v5, vcc, 1, v2 932; GCN-NEXT: v_cmp_le_u32_e32 vcc, s10, v3 933; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc 934; GCN-NEXT: v_mul_lo_u32 v6, v4, s11 935; GCN-NEXT: v_add_i32_e32 v5, vcc, 1, v4 936; GCN-NEXT: v_sub_i32_e32 v3, vcc, s7, v6 937; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s11, v3 938; GCN-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[0:1] 939; GCN-NEXT: v_subrev_i32_e32 v5, vcc, s11, v3 940; GCN-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[0:1] 941; GCN-NEXT: v_add_i32_e32 v5, vcc, 1, v4 942; GCN-NEXT: v_cmp_le_u32_e32 vcc, s11, v3 943; GCN-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc 944; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 945; GCN-NEXT: s_endpgm 946 %r = udiv <4 x i32> %x, %y 947 store <4 x i32> %r, <4 x i32> addrspace(1)* %out 948 ret void 949} 950 951define amdgpu_kernel void @urem_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x, <4 x i32> %y) { 952; CHECK-LABEL: @urem_v4i32( 953; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[X:%.*]], i64 0 954; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 
x i32> [[Y:%.*]], i64 0 955; CHECK-NEXT: [[TMP3:%.*]] = uitofp i32 [[TMP2]] to float 956; CHECK-NEXT: [[TMP4:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP3]]) 957; CHECK-NEXT: [[TMP5:%.*]] = fmul fast float [[TMP4]], 0x41EFFFFFC0000000 958; CHECK-NEXT: [[TMP6:%.*]] = fptoui float [[TMP5]] to i32 959; CHECK-NEXT: [[TMP7:%.*]] = sub i32 0, [[TMP2]] 960; CHECK-NEXT: [[TMP8:%.*]] = mul i32 [[TMP7]], [[TMP6]] 961; CHECK-NEXT: [[TMP9:%.*]] = zext i32 [[TMP6]] to i64 962; CHECK-NEXT: [[TMP10:%.*]] = zext i32 [[TMP8]] to i64 963; CHECK-NEXT: [[TMP11:%.*]] = mul i64 [[TMP9]], [[TMP10]] 964; CHECK-NEXT: [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32 965; CHECK-NEXT: [[TMP13:%.*]] = lshr i64 [[TMP11]], 32 966; CHECK-NEXT: [[TMP14:%.*]] = trunc i64 [[TMP13]] to i32 967; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP6]], [[TMP14]] 968; CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP1]] to i64 969; CHECK-NEXT: [[TMP17:%.*]] = zext i32 [[TMP15]] to i64 970; CHECK-NEXT: [[TMP18:%.*]] = mul i64 [[TMP16]], [[TMP17]] 971; CHECK-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32 972; CHECK-NEXT: [[TMP20:%.*]] = lshr i64 [[TMP18]], 32 973; CHECK-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32 974; CHECK-NEXT: [[TMP22:%.*]] = mul i32 [[TMP21]], [[TMP2]] 975; CHECK-NEXT: [[TMP23:%.*]] = sub i32 [[TMP1]], [[TMP22]] 976; CHECK-NEXT: [[TMP24:%.*]] = icmp uge i32 [[TMP23]], [[TMP2]] 977; CHECK-NEXT: [[TMP25:%.*]] = sub i32 [[TMP23]], [[TMP2]] 978; CHECK-NEXT: [[TMP26:%.*]] = select i1 [[TMP24]], i32 [[TMP25]], i32 [[TMP23]] 979; CHECK-NEXT: [[TMP27:%.*]] = icmp uge i32 [[TMP26]], [[TMP2]] 980; CHECK-NEXT: [[TMP28:%.*]] = sub i32 [[TMP26]], [[TMP2]] 981; CHECK-NEXT: [[TMP29:%.*]] = select i1 [[TMP27]], i32 [[TMP28]], i32 [[TMP26]] 982; CHECK-NEXT: [[TMP30:%.*]] = insertelement <4 x i32> undef, i32 [[TMP29]], i64 0 983; CHECK-NEXT: [[TMP31:%.*]] = extractelement <4 x i32> [[X]], i64 1 984; CHECK-NEXT: [[TMP32:%.*]] = extractelement <4 x i32> [[Y]], i64 1 985; CHECK-NEXT: [[TMP33:%.*]] = uitofp i32 
[[TMP32]] to float 986; CHECK-NEXT: [[TMP34:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP33]]) 987; CHECK-NEXT: [[TMP35:%.*]] = fmul fast float [[TMP34]], 0x41EFFFFFC0000000 988; CHECK-NEXT: [[TMP36:%.*]] = fptoui float [[TMP35]] to i32 989; CHECK-NEXT: [[TMP37:%.*]] = sub i32 0, [[TMP32]] 990; CHECK-NEXT: [[TMP38:%.*]] = mul i32 [[TMP37]], [[TMP36]] 991; CHECK-NEXT: [[TMP39:%.*]] = zext i32 [[TMP36]] to i64 992; CHECK-NEXT: [[TMP40:%.*]] = zext i32 [[TMP38]] to i64 993; CHECK-NEXT: [[TMP41:%.*]] = mul i64 [[TMP39]], [[TMP40]] 994; CHECK-NEXT: [[TMP42:%.*]] = trunc i64 [[TMP41]] to i32 995; CHECK-NEXT: [[TMP43:%.*]] = lshr i64 [[TMP41]], 32 996; CHECK-NEXT: [[TMP44:%.*]] = trunc i64 [[TMP43]] to i32 997; CHECK-NEXT: [[TMP45:%.*]] = add i32 [[TMP36]], [[TMP44]] 998; CHECK-NEXT: [[TMP46:%.*]] = zext i32 [[TMP31]] to i64 999; CHECK-NEXT: [[TMP47:%.*]] = zext i32 [[TMP45]] to i64 1000; CHECK-NEXT: [[TMP48:%.*]] = mul i64 [[TMP46]], [[TMP47]] 1001; CHECK-NEXT: [[TMP49:%.*]] = trunc i64 [[TMP48]] to i32 1002; CHECK-NEXT: [[TMP50:%.*]] = lshr i64 [[TMP48]], 32 1003; CHECK-NEXT: [[TMP51:%.*]] = trunc i64 [[TMP50]] to i32 1004; CHECK-NEXT: [[TMP52:%.*]] = mul i32 [[TMP51]], [[TMP32]] 1005; CHECK-NEXT: [[TMP53:%.*]] = sub i32 [[TMP31]], [[TMP52]] 1006; CHECK-NEXT: [[TMP54:%.*]] = icmp uge i32 [[TMP53]], [[TMP32]] 1007; CHECK-NEXT: [[TMP55:%.*]] = sub i32 [[TMP53]], [[TMP32]] 1008; CHECK-NEXT: [[TMP56:%.*]] = select i1 [[TMP54]], i32 [[TMP55]], i32 [[TMP53]] 1009; CHECK-NEXT: [[TMP57:%.*]] = icmp uge i32 [[TMP56]], [[TMP32]] 1010; CHECK-NEXT: [[TMP58:%.*]] = sub i32 [[TMP56]], [[TMP32]] 1011; CHECK-NEXT: [[TMP59:%.*]] = select i1 [[TMP57]], i32 [[TMP58]], i32 [[TMP56]] 1012; CHECK-NEXT: [[TMP60:%.*]] = insertelement <4 x i32> [[TMP30]], i32 [[TMP59]], i64 1 1013; CHECK-NEXT: [[TMP61:%.*]] = extractelement <4 x i32> [[X]], i64 2 1014; CHECK-NEXT: [[TMP62:%.*]] = extractelement <4 x i32> [[Y]], i64 2 1015; CHECK-NEXT: [[TMP63:%.*]] = uitofp i32 [[TMP62]] to float 
1016; CHECK-NEXT: [[TMP64:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP63]]) 1017; CHECK-NEXT: [[TMP65:%.*]] = fmul fast float [[TMP64]], 0x41EFFFFFC0000000 1018; CHECK-NEXT: [[TMP66:%.*]] = fptoui float [[TMP65]] to i32 1019; CHECK-NEXT: [[TMP67:%.*]] = sub i32 0, [[TMP62]] 1020; CHECK-NEXT: [[TMP68:%.*]] = mul i32 [[TMP67]], [[TMP66]] 1021; CHECK-NEXT: [[TMP69:%.*]] = zext i32 [[TMP66]] to i64 1022; CHECK-NEXT: [[TMP70:%.*]] = zext i32 [[TMP68]] to i64 1023; CHECK-NEXT: [[TMP71:%.*]] = mul i64 [[TMP69]], [[TMP70]] 1024; CHECK-NEXT: [[TMP72:%.*]] = trunc i64 [[TMP71]] to i32 1025; CHECK-NEXT: [[TMP73:%.*]] = lshr i64 [[TMP71]], 32 1026; CHECK-NEXT: [[TMP74:%.*]] = trunc i64 [[TMP73]] to i32 1027; CHECK-NEXT: [[TMP75:%.*]] = add i32 [[TMP66]], [[TMP74]] 1028; CHECK-NEXT: [[TMP76:%.*]] = zext i32 [[TMP61]] to i64 1029; CHECK-NEXT: [[TMP77:%.*]] = zext i32 [[TMP75]] to i64 1030; CHECK-NEXT: [[TMP78:%.*]] = mul i64 [[TMP76]], [[TMP77]] 1031; CHECK-NEXT: [[TMP79:%.*]] = trunc i64 [[TMP78]] to i32 1032; CHECK-NEXT: [[TMP80:%.*]] = lshr i64 [[TMP78]], 32 1033; CHECK-NEXT: [[TMP81:%.*]] = trunc i64 [[TMP80]] to i32 1034; CHECK-NEXT: [[TMP82:%.*]] = mul i32 [[TMP81]], [[TMP62]] 1035; CHECK-NEXT: [[TMP83:%.*]] = sub i32 [[TMP61]], [[TMP82]] 1036; CHECK-NEXT: [[TMP84:%.*]] = icmp uge i32 [[TMP83]], [[TMP62]] 1037; CHECK-NEXT: [[TMP85:%.*]] = sub i32 [[TMP83]], [[TMP62]] 1038; CHECK-NEXT: [[TMP86:%.*]] = select i1 [[TMP84]], i32 [[TMP85]], i32 [[TMP83]] 1039; CHECK-NEXT: [[TMP87:%.*]] = icmp uge i32 [[TMP86]], [[TMP62]] 1040; CHECK-NEXT: [[TMP88:%.*]] = sub i32 [[TMP86]], [[TMP62]] 1041; CHECK-NEXT: [[TMP89:%.*]] = select i1 [[TMP87]], i32 [[TMP88]], i32 [[TMP86]] 1042; CHECK-NEXT: [[TMP90:%.*]] = insertelement <4 x i32> [[TMP60]], i32 [[TMP89]], i64 2 1043; CHECK-NEXT: [[TMP91:%.*]] = extractelement <4 x i32> [[X]], i64 3 1044; CHECK-NEXT: [[TMP92:%.*]] = extractelement <4 x i32> [[Y]], i64 3 1045; CHECK-NEXT: [[TMP93:%.*]] = uitofp i32 [[TMP92]] to float 1046; 
CHECK-NEXT: [[TMP94:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP93]]) 1047; CHECK-NEXT: [[TMP95:%.*]] = fmul fast float [[TMP94]], 0x41EFFFFFC0000000 1048; CHECK-NEXT: [[TMP96:%.*]] = fptoui float [[TMP95]] to i32 1049; CHECK-NEXT: [[TMP97:%.*]] = sub i32 0, [[TMP92]] 1050; CHECK-NEXT: [[TMP98:%.*]] = mul i32 [[TMP97]], [[TMP96]] 1051; CHECK-NEXT: [[TMP99:%.*]] = zext i32 [[TMP96]] to i64 1052; CHECK-NEXT: [[TMP100:%.*]] = zext i32 [[TMP98]] to i64 1053; CHECK-NEXT: [[TMP101:%.*]] = mul i64 [[TMP99]], [[TMP100]] 1054; CHECK-NEXT: [[TMP102:%.*]] = trunc i64 [[TMP101]] to i32 1055; CHECK-NEXT: [[TMP103:%.*]] = lshr i64 [[TMP101]], 32 1056; CHECK-NEXT: [[TMP104:%.*]] = trunc i64 [[TMP103]] to i32 1057; CHECK-NEXT: [[TMP105:%.*]] = add i32 [[TMP96]], [[TMP104]] 1058; CHECK-NEXT: [[TMP106:%.*]] = zext i32 [[TMP91]] to i64 1059; CHECK-NEXT: [[TMP107:%.*]] = zext i32 [[TMP105]] to i64 1060; CHECK-NEXT: [[TMP108:%.*]] = mul i64 [[TMP106]], [[TMP107]] 1061; CHECK-NEXT: [[TMP109:%.*]] = trunc i64 [[TMP108]] to i32 1062; CHECK-NEXT: [[TMP110:%.*]] = lshr i64 [[TMP108]], 32 1063; CHECK-NEXT: [[TMP111:%.*]] = trunc i64 [[TMP110]] to i32 1064; CHECK-NEXT: [[TMP112:%.*]] = mul i32 [[TMP111]], [[TMP92]] 1065; CHECK-NEXT: [[TMP113:%.*]] = sub i32 [[TMP91]], [[TMP112]] 1066; CHECK-NEXT: [[TMP114:%.*]] = icmp uge i32 [[TMP113]], [[TMP92]] 1067; CHECK-NEXT: [[TMP115:%.*]] = sub i32 [[TMP113]], [[TMP92]] 1068; CHECK-NEXT: [[TMP116:%.*]] = select i1 [[TMP114]], i32 [[TMP115]], i32 [[TMP113]] 1069; CHECK-NEXT: [[TMP117:%.*]] = icmp uge i32 [[TMP116]], [[TMP92]] 1070; CHECK-NEXT: [[TMP118:%.*]] = sub i32 [[TMP116]], [[TMP92]] 1071; CHECK-NEXT: [[TMP119:%.*]] = select i1 [[TMP117]], i32 [[TMP118]], i32 [[TMP116]] 1072; CHECK-NEXT: [[TMP120:%.*]] = insertelement <4 x i32> [[TMP90]], i32 [[TMP119]], i64 3 1073; CHECK-NEXT: store <4 x i32> [[TMP120]], <4 x i32> addrspace(1)* [[OUT:%.*]], align 16 1074; CHECK-NEXT: ret void 1075; 1076; GCN-LABEL: urem_v4i32: 1077; GCN: ; %bb.0: 
1078; GCN-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd 1079; GCN-NEXT: s_mov_b32 s13, 0x4f7ffffe 1080; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1081; GCN-NEXT: s_mov_b32 s3, 0xf000 1082; GCN-NEXT: s_waitcnt lgkmcnt(0) 1083; GCN-NEXT: v_cvt_f32_u32_e32 v0, s8 1084; GCN-NEXT: v_cvt_f32_u32_e32 v1, s9 1085; GCN-NEXT: s_sub_i32 s2, 0, s8 1086; GCN-NEXT: s_sub_i32 s12, 0, s9 1087; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 1088; GCN-NEXT: v_rcp_iflag_f32_e32 v1, v1 1089; GCN-NEXT: v_cvt_f32_u32_e32 v3, s10 1090; GCN-NEXT: v_cvt_f32_u32_e32 v5, s11 1091; GCN-NEXT: v_mul_f32_e32 v0, s13, v0 1092; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 1093; GCN-NEXT: v_mul_f32_e32 v1, s13, v1 1094; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 1095; GCN-NEXT: v_rcp_iflag_f32_e32 v3, v3 1096; GCN-NEXT: v_mul_lo_u32 v2, s2, v0 1097; GCN-NEXT: s_mov_b32 s2, -1 1098; GCN-NEXT: v_mul_lo_u32 v4, s12, v1 1099; GCN-NEXT: v_mul_hi_u32 v2, v0, v2 1100; GCN-NEXT: v_mul_hi_u32 v4, v1, v4 1101; GCN-NEXT: v_add_i32_e32 v0, vcc, v2, v0 1102; GCN-NEXT: v_mul_hi_u32 v0, s4, v0 1103; GCN-NEXT: v_add_i32_e32 v1, vcc, v4, v1 1104; GCN-NEXT: v_mul_hi_u32 v1, s5, v1 1105; GCN-NEXT: v_mul_f32_e32 v2, s13, v3 1106; GCN-NEXT: v_mul_lo_u32 v0, v0, s8 1107; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2 1108; GCN-NEXT: v_mul_lo_u32 v1, v1, s9 1109; GCN-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 1110; GCN-NEXT: v_subrev_i32_e32 v3, vcc, s8, v0 1111; GCN-NEXT: v_cmp_le_u32_e32 vcc, s8, v0 1112; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 1113; GCN-NEXT: v_subrev_i32_e32 v3, vcc, s8, v0 1114; GCN-NEXT: v_cmp_le_u32_e32 vcc, s8, v0 1115; GCN-NEXT: s_sub_i32 s4, 0, s10 1116; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 1117; GCN-NEXT: v_mul_lo_u32 v3, s4, v2 1118; GCN-NEXT: v_sub_i32_e32 v1, vcc, s5, v1 1119; GCN-NEXT: v_subrev_i32_e32 v4, vcc, s9, v1 1120; GCN-NEXT: v_cmp_le_u32_e32 vcc, s9, v1 1121; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc 1122; GCN-NEXT: v_mul_hi_u32 v3, v2, v3 1123; GCN-NEXT: v_rcp_iflag_f32_e32 v4, v5 1124; GCN-NEXT: s_sub_i32 s4, 0, 
s11 1125; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 1126; GCN-NEXT: v_mul_f32_e32 v3, s13, v4 1127; GCN-NEXT: v_cvt_u32_f32_e32 v3, v3 1128; GCN-NEXT: v_subrev_i32_e32 v4, vcc, s9, v1 1129; GCN-NEXT: v_mul_hi_u32 v2, s6, v2 1130; GCN-NEXT: v_cmp_le_u32_e32 vcc, s9, v1 1131; GCN-NEXT: v_mul_lo_u32 v5, s4, v3 1132; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc 1133; GCN-NEXT: v_mul_lo_u32 v2, v2, s10 1134; GCN-NEXT: v_mul_hi_u32 v4, v3, v5 1135; GCN-NEXT: v_sub_i32_e32 v2, vcc, s6, v2 1136; GCN-NEXT: v_subrev_i32_e32 v5, vcc, s10, v2 1137; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 1138; GCN-NEXT: v_mul_hi_u32 v3, s7, v3 1139; GCN-NEXT: v_cmp_le_u32_e32 vcc, s10, v2 1140; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc 1141; GCN-NEXT: v_subrev_i32_e32 v4, vcc, s10, v2 1142; GCN-NEXT: v_mul_lo_u32 v3, v3, s11 1143; GCN-NEXT: v_cmp_le_u32_e32 vcc, s10, v2 1144; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc 1145; GCN-NEXT: v_sub_i32_e32 v3, vcc, s7, v3 1146; GCN-NEXT: v_subrev_i32_e32 v4, vcc, s11, v3 1147; GCN-NEXT: v_cmp_le_u32_e32 vcc, s11, v3 1148; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc 1149; GCN-NEXT: v_subrev_i32_e32 v4, vcc, s11, v3 1150; GCN-NEXT: v_cmp_le_u32_e32 vcc, s11, v3 1151; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc 1152; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 1153; GCN-NEXT: s_endpgm 1154 %r = urem <4 x i32> %x, %y 1155 store <4 x i32> %r, <4 x i32> addrspace(1)* %out 1156 ret void 1157} 1158 1159define amdgpu_kernel void @sdiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x, <4 x i32> %y) { 1160; CHECK-LABEL: @sdiv_v4i32( 1161; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[X:%.*]], i64 0 1162; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[Y:%.*]], i64 0 1163; CHECK-NEXT: [[TMP3:%.*]] = ashr i32 [[TMP1]], 31 1164; CHECK-NEXT: [[TMP4:%.*]] = ashr i32 [[TMP2]], 31 1165; CHECK-NEXT: [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]] 1166; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[TMP1]], [[TMP3]] 1167; CHECK-NEXT: [[TMP7:%.*]] = add i32 
[[TMP2]], [[TMP4]] 1168; CHECK-NEXT: [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP3]] 1169; CHECK-NEXT: [[TMP9:%.*]] = xor i32 [[TMP7]], [[TMP4]] 1170; CHECK-NEXT: [[TMP10:%.*]] = uitofp i32 [[TMP9]] to float 1171; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP10]]) 1172; CHECK-NEXT: [[TMP12:%.*]] = fmul fast float [[TMP11]], 0x41EFFFFFC0000000 1173; CHECK-NEXT: [[TMP13:%.*]] = fptoui float [[TMP12]] to i32 1174; CHECK-NEXT: [[TMP14:%.*]] = sub i32 0, [[TMP9]] 1175; CHECK-NEXT: [[TMP15:%.*]] = mul i32 [[TMP14]], [[TMP13]] 1176; CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP13]] to i64 1177; CHECK-NEXT: [[TMP17:%.*]] = zext i32 [[TMP15]] to i64 1178; CHECK-NEXT: [[TMP18:%.*]] = mul i64 [[TMP16]], [[TMP17]] 1179; CHECK-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32 1180; CHECK-NEXT: [[TMP20:%.*]] = lshr i64 [[TMP18]], 32 1181; CHECK-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32 1182; CHECK-NEXT: [[TMP22:%.*]] = add i32 [[TMP13]], [[TMP21]] 1183; CHECK-NEXT: [[TMP23:%.*]] = zext i32 [[TMP8]] to i64 1184; CHECK-NEXT: [[TMP24:%.*]] = zext i32 [[TMP22]] to i64 1185; CHECK-NEXT: [[TMP25:%.*]] = mul i64 [[TMP23]], [[TMP24]] 1186; CHECK-NEXT: [[TMP26:%.*]] = trunc i64 [[TMP25]] to i32 1187; CHECK-NEXT: [[TMP27:%.*]] = lshr i64 [[TMP25]], 32 1188; CHECK-NEXT: [[TMP28:%.*]] = trunc i64 [[TMP27]] to i32 1189; CHECK-NEXT: [[TMP29:%.*]] = mul i32 [[TMP28]], [[TMP9]] 1190; CHECK-NEXT: [[TMP30:%.*]] = sub i32 [[TMP8]], [[TMP29]] 1191; CHECK-NEXT: [[TMP31:%.*]] = icmp uge i32 [[TMP30]], [[TMP9]] 1192; CHECK-NEXT: [[TMP32:%.*]] = add i32 [[TMP28]], 1 1193; CHECK-NEXT: [[TMP33:%.*]] = select i1 [[TMP31]], i32 [[TMP32]], i32 [[TMP28]] 1194; CHECK-NEXT: [[TMP34:%.*]] = sub i32 [[TMP30]], [[TMP9]] 1195; CHECK-NEXT: [[TMP35:%.*]] = select i1 [[TMP31]], i32 [[TMP34]], i32 [[TMP30]] 1196; CHECK-NEXT: [[TMP36:%.*]] = icmp uge i32 [[TMP35]], [[TMP9]] 1197; CHECK-NEXT: [[TMP37:%.*]] = add i32 [[TMP33]], 1 1198; CHECK-NEXT: [[TMP38:%.*]] = select i1 [[TMP36]], i32 
[[TMP37]], i32 [[TMP33]] 1199; CHECK-NEXT: [[TMP39:%.*]] = xor i32 [[TMP38]], [[TMP5]] 1200; CHECK-NEXT: [[TMP40:%.*]] = sub i32 [[TMP39]], [[TMP5]] 1201; CHECK-NEXT: [[TMP41:%.*]] = insertelement <4 x i32> undef, i32 [[TMP40]], i64 0 1202; CHECK-NEXT: [[TMP42:%.*]] = extractelement <4 x i32> [[X]], i64 1 1203; CHECK-NEXT: [[TMP43:%.*]] = extractelement <4 x i32> [[Y]], i64 1 1204; CHECK-NEXT: [[TMP44:%.*]] = ashr i32 [[TMP42]], 31 1205; CHECK-NEXT: [[TMP45:%.*]] = ashr i32 [[TMP43]], 31 1206; CHECK-NEXT: [[TMP46:%.*]] = xor i32 [[TMP44]], [[TMP45]] 1207; CHECK-NEXT: [[TMP47:%.*]] = add i32 [[TMP42]], [[TMP44]] 1208; CHECK-NEXT: [[TMP48:%.*]] = add i32 [[TMP43]], [[TMP45]] 1209; CHECK-NEXT: [[TMP49:%.*]] = xor i32 [[TMP47]], [[TMP44]] 1210; CHECK-NEXT: [[TMP50:%.*]] = xor i32 [[TMP48]], [[TMP45]] 1211; CHECK-NEXT: [[TMP51:%.*]] = uitofp i32 [[TMP50]] to float 1212; CHECK-NEXT: [[TMP52:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP51]]) 1213; CHECK-NEXT: [[TMP53:%.*]] = fmul fast float [[TMP52]], 0x41EFFFFFC0000000 1214; CHECK-NEXT: [[TMP54:%.*]] = fptoui float [[TMP53]] to i32 1215; CHECK-NEXT: [[TMP55:%.*]] = sub i32 0, [[TMP50]] 1216; CHECK-NEXT: [[TMP56:%.*]] = mul i32 [[TMP55]], [[TMP54]] 1217; CHECK-NEXT: [[TMP57:%.*]] = zext i32 [[TMP54]] to i64 1218; CHECK-NEXT: [[TMP58:%.*]] = zext i32 [[TMP56]] to i64 1219; CHECK-NEXT: [[TMP59:%.*]] = mul i64 [[TMP57]], [[TMP58]] 1220; CHECK-NEXT: [[TMP60:%.*]] = trunc i64 [[TMP59]] to i32 1221; CHECK-NEXT: [[TMP61:%.*]] = lshr i64 [[TMP59]], 32 1222; CHECK-NEXT: [[TMP62:%.*]] = trunc i64 [[TMP61]] to i32 1223; CHECK-NEXT: [[TMP63:%.*]] = add i32 [[TMP54]], [[TMP62]] 1224; CHECK-NEXT: [[TMP64:%.*]] = zext i32 [[TMP49]] to i64 1225; CHECK-NEXT: [[TMP65:%.*]] = zext i32 [[TMP63]] to i64 1226; CHECK-NEXT: [[TMP66:%.*]] = mul i64 [[TMP64]], [[TMP65]] 1227; CHECK-NEXT: [[TMP67:%.*]] = trunc i64 [[TMP66]] to i32 1228; CHECK-NEXT: [[TMP68:%.*]] = lshr i64 [[TMP66]], 32 1229; CHECK-NEXT: [[TMP69:%.*]] = trunc i64 
[[TMP68]] to i32 1230; CHECK-NEXT: [[TMP70:%.*]] = mul i32 [[TMP69]], [[TMP50]] 1231; CHECK-NEXT: [[TMP71:%.*]] = sub i32 [[TMP49]], [[TMP70]] 1232; CHECK-NEXT: [[TMP72:%.*]] = icmp uge i32 [[TMP71]], [[TMP50]] 1233; CHECK-NEXT: [[TMP73:%.*]] = add i32 [[TMP69]], 1 1234; CHECK-NEXT: [[TMP74:%.*]] = select i1 [[TMP72]], i32 [[TMP73]], i32 [[TMP69]] 1235; CHECK-NEXT: [[TMP75:%.*]] = sub i32 [[TMP71]], [[TMP50]] 1236; CHECK-NEXT: [[TMP76:%.*]] = select i1 [[TMP72]], i32 [[TMP75]], i32 [[TMP71]] 1237; CHECK-NEXT: [[TMP77:%.*]] = icmp uge i32 [[TMP76]], [[TMP50]] 1238; CHECK-NEXT: [[TMP78:%.*]] = add i32 [[TMP74]], 1 1239; CHECK-NEXT: [[TMP79:%.*]] = select i1 [[TMP77]], i32 [[TMP78]], i32 [[TMP74]] 1240; CHECK-NEXT: [[TMP80:%.*]] = xor i32 [[TMP79]], [[TMP46]] 1241; CHECK-NEXT: [[TMP81:%.*]] = sub i32 [[TMP80]], [[TMP46]] 1242; CHECK-NEXT: [[TMP82:%.*]] = insertelement <4 x i32> [[TMP41]], i32 [[TMP81]], i64 1 1243; CHECK-NEXT: [[TMP83:%.*]] = extractelement <4 x i32> [[X]], i64 2 1244; CHECK-NEXT: [[TMP84:%.*]] = extractelement <4 x i32> [[Y]], i64 2 1245; CHECK-NEXT: [[TMP85:%.*]] = ashr i32 [[TMP83]], 31 1246; CHECK-NEXT: [[TMP86:%.*]] = ashr i32 [[TMP84]], 31 1247; CHECK-NEXT: [[TMP87:%.*]] = xor i32 [[TMP85]], [[TMP86]] 1248; CHECK-NEXT: [[TMP88:%.*]] = add i32 [[TMP83]], [[TMP85]] 1249; CHECK-NEXT: [[TMP89:%.*]] = add i32 [[TMP84]], [[TMP86]] 1250; CHECK-NEXT: [[TMP90:%.*]] = xor i32 [[TMP88]], [[TMP85]] 1251; CHECK-NEXT: [[TMP91:%.*]] = xor i32 [[TMP89]], [[TMP86]] 1252; CHECK-NEXT: [[TMP92:%.*]] = uitofp i32 [[TMP91]] to float 1253; CHECK-NEXT: [[TMP93:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP92]]) 1254; CHECK-NEXT: [[TMP94:%.*]] = fmul fast float [[TMP93]], 0x41EFFFFFC0000000 1255; CHECK-NEXT: [[TMP95:%.*]] = fptoui float [[TMP94]] to i32 1256; CHECK-NEXT: [[TMP96:%.*]] = sub i32 0, [[TMP91]] 1257; CHECK-NEXT: [[TMP97:%.*]] = mul i32 [[TMP96]], [[TMP95]] 1258; CHECK-NEXT: [[TMP98:%.*]] = zext i32 [[TMP95]] to i64 1259; CHECK-NEXT: [[TMP99:%.*]] 
= zext i32 [[TMP97]] to i64 1260; CHECK-NEXT: [[TMP100:%.*]] = mul i64 [[TMP98]], [[TMP99]] 1261; CHECK-NEXT: [[TMP101:%.*]] = trunc i64 [[TMP100]] to i32 1262; CHECK-NEXT: [[TMP102:%.*]] = lshr i64 [[TMP100]], 32 1263; CHECK-NEXT: [[TMP103:%.*]] = trunc i64 [[TMP102]] to i32 1264; CHECK-NEXT: [[TMP104:%.*]] = add i32 [[TMP95]], [[TMP103]] 1265; CHECK-NEXT: [[TMP105:%.*]] = zext i32 [[TMP90]] to i64 1266; CHECK-NEXT: [[TMP106:%.*]] = zext i32 [[TMP104]] to i64 1267; CHECK-NEXT: [[TMP107:%.*]] = mul i64 [[TMP105]], [[TMP106]] 1268; CHECK-NEXT: [[TMP108:%.*]] = trunc i64 [[TMP107]] to i32 1269; CHECK-NEXT: [[TMP109:%.*]] = lshr i64 [[TMP107]], 32 1270; CHECK-NEXT: [[TMP110:%.*]] = trunc i64 [[TMP109]] to i32 1271; CHECK-NEXT: [[TMP111:%.*]] = mul i32 [[TMP110]], [[TMP91]] 1272; CHECK-NEXT: [[TMP112:%.*]] = sub i32 [[TMP90]], [[TMP111]] 1273; CHECK-NEXT: [[TMP113:%.*]] = icmp uge i32 [[TMP112]], [[TMP91]] 1274; CHECK-NEXT: [[TMP114:%.*]] = add i32 [[TMP110]], 1 1275; CHECK-NEXT: [[TMP115:%.*]] = select i1 [[TMP113]], i32 [[TMP114]], i32 [[TMP110]] 1276; CHECK-NEXT: [[TMP116:%.*]] = sub i32 [[TMP112]], [[TMP91]] 1277; CHECK-NEXT: [[TMP117:%.*]] = select i1 [[TMP113]], i32 [[TMP116]], i32 [[TMP112]] 1278; CHECK-NEXT: [[TMP118:%.*]] = icmp uge i32 [[TMP117]], [[TMP91]] 1279; CHECK-NEXT: [[TMP119:%.*]] = add i32 [[TMP115]], 1 1280; CHECK-NEXT: [[TMP120:%.*]] = select i1 [[TMP118]], i32 [[TMP119]], i32 [[TMP115]] 1281; CHECK-NEXT: [[TMP121:%.*]] = xor i32 [[TMP120]], [[TMP87]] 1282; CHECK-NEXT: [[TMP122:%.*]] = sub i32 [[TMP121]], [[TMP87]] 1283; CHECK-NEXT: [[TMP123:%.*]] = insertelement <4 x i32> [[TMP82]], i32 [[TMP122]], i64 2 1284; CHECK-NEXT: [[TMP124:%.*]] = extractelement <4 x i32> [[X]], i64 3 1285; CHECK-NEXT: [[TMP125:%.*]] = extractelement <4 x i32> [[Y]], i64 3 1286; CHECK-NEXT: [[TMP126:%.*]] = ashr i32 [[TMP124]], 31 1287; CHECK-NEXT: [[TMP127:%.*]] = ashr i32 [[TMP125]], 31 1288; CHECK-NEXT: [[TMP128:%.*]] = xor i32 [[TMP126]], [[TMP127]] 1289; CHECK-NEXT: 
[[TMP129:%.*]] = add i32 [[TMP124]], [[TMP126]] 1290; CHECK-NEXT: [[TMP130:%.*]] = add i32 [[TMP125]], [[TMP127]] 1291; CHECK-NEXT: [[TMP131:%.*]] = xor i32 [[TMP129]], [[TMP126]] 1292; CHECK-NEXT: [[TMP132:%.*]] = xor i32 [[TMP130]], [[TMP127]] 1293; CHECK-NEXT: [[TMP133:%.*]] = uitofp i32 [[TMP132]] to float 1294; CHECK-NEXT: [[TMP134:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP133]]) 1295; CHECK-NEXT: [[TMP135:%.*]] = fmul fast float [[TMP134]], 0x41EFFFFFC0000000 1296; CHECK-NEXT: [[TMP136:%.*]] = fptoui float [[TMP135]] to i32 1297; CHECK-NEXT: [[TMP137:%.*]] = sub i32 0, [[TMP132]] 1298; CHECK-NEXT: [[TMP138:%.*]] = mul i32 [[TMP137]], [[TMP136]] 1299; CHECK-NEXT: [[TMP139:%.*]] = zext i32 [[TMP136]] to i64 1300; CHECK-NEXT: [[TMP140:%.*]] = zext i32 [[TMP138]] to i64 1301; CHECK-NEXT: [[TMP141:%.*]] = mul i64 [[TMP139]], [[TMP140]] 1302; CHECK-NEXT: [[TMP142:%.*]] = trunc i64 [[TMP141]] to i32 1303; CHECK-NEXT: [[TMP143:%.*]] = lshr i64 [[TMP141]], 32 1304; CHECK-NEXT: [[TMP144:%.*]] = trunc i64 [[TMP143]] to i32 1305; CHECK-NEXT: [[TMP145:%.*]] = add i32 [[TMP136]], [[TMP144]] 1306; CHECK-NEXT: [[TMP146:%.*]] = zext i32 [[TMP131]] to i64 1307; CHECK-NEXT: [[TMP147:%.*]] = zext i32 [[TMP145]] to i64 1308; CHECK-NEXT: [[TMP148:%.*]] = mul i64 [[TMP146]], [[TMP147]] 1309; CHECK-NEXT: [[TMP149:%.*]] = trunc i64 [[TMP148]] to i32 1310; CHECK-NEXT: [[TMP150:%.*]] = lshr i64 [[TMP148]], 32 1311; CHECK-NEXT: [[TMP151:%.*]] = trunc i64 [[TMP150]] to i32 1312; CHECK-NEXT: [[TMP152:%.*]] = mul i32 [[TMP151]], [[TMP132]] 1313; CHECK-NEXT: [[TMP153:%.*]] = sub i32 [[TMP131]], [[TMP152]] 1314; CHECK-NEXT: [[TMP154:%.*]] = icmp uge i32 [[TMP153]], [[TMP132]] 1315; CHECK-NEXT: [[TMP155:%.*]] = add i32 [[TMP151]], 1 1316; CHECK-NEXT: [[TMP156:%.*]] = select i1 [[TMP154]], i32 [[TMP155]], i32 [[TMP151]] 1317; CHECK-NEXT: [[TMP157:%.*]] = sub i32 [[TMP153]], [[TMP132]] 1318; CHECK-NEXT: [[TMP158:%.*]] = select i1 [[TMP154]], i32 [[TMP157]], i32 [[TMP153]] 1319; 
CHECK-NEXT: [[TMP159:%.*]] = icmp uge i32 [[TMP158]], [[TMP132]] 1320; CHECK-NEXT: [[TMP160:%.*]] = add i32 [[TMP156]], 1 1321; CHECK-NEXT: [[TMP161:%.*]] = select i1 [[TMP159]], i32 [[TMP160]], i32 [[TMP156]] 1322; CHECK-NEXT: [[TMP162:%.*]] = xor i32 [[TMP161]], [[TMP128]] 1323; CHECK-NEXT: [[TMP163:%.*]] = sub i32 [[TMP162]], [[TMP128]] 1324; CHECK-NEXT: [[TMP164:%.*]] = insertelement <4 x i32> [[TMP123]], i32 [[TMP163]], i64 3 1325; CHECK-NEXT: store <4 x i32> [[TMP164]], <4 x i32> addrspace(1)* [[OUT:%.*]], align 16 1326; CHECK-NEXT: ret void 1327; 1328; GCN-LABEL: sdiv_v4i32: 1329; GCN: ; %bb.0: 1330; GCN-NEXT: s_load_dwordx8 s[8:15], s[0:1], 0xd 1331; GCN-NEXT: s_mov_b32 s16, 0x4f7ffffe 1332; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 1333; GCN-NEXT: s_mov_b32 s7, 0xf000 1334; GCN-NEXT: s_mov_b32 s6, -1 1335; GCN-NEXT: s_waitcnt lgkmcnt(0) 1336; GCN-NEXT: s_ashr_i32 s2, s12, 31 1337; GCN-NEXT: s_add_i32 s3, s12, s2 1338; GCN-NEXT: s_xor_b32 s12, s3, s2 1339; GCN-NEXT: v_cvt_f32_u32_e32 v0, s12 1340; GCN-NEXT: s_ashr_i32 s3, s13, 31 1341; GCN-NEXT: s_add_i32 s0, s13, s3 1342; GCN-NEXT: s_xor_b32 s13, s0, s3 1343; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 1344; GCN-NEXT: v_cvt_f32_u32_e32 v1, s13 1345; GCN-NEXT: s_sub_i32 s1, 0, s12 1346; GCN-NEXT: s_ashr_i32 s0, s8, 31 1347; GCN-NEXT: v_mul_f32_e32 v0, s16, v0 1348; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 1349; GCN-NEXT: v_rcp_iflag_f32_e32 v1, v1 1350; GCN-NEXT: s_xor_b32 s2, s0, s2 1351; GCN-NEXT: v_mul_lo_u32 v2, s1, v0 1352; GCN-NEXT: s_add_i32 s1, s8, s0 1353; GCN-NEXT: v_mul_f32_e32 v1, s16, v1 1354; GCN-NEXT: s_xor_b32 s1, s1, s0 1355; GCN-NEXT: v_mul_hi_u32 v2, v0, v2 1356; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 1357; GCN-NEXT: s_sub_i32 s0, 0, s13 1358; GCN-NEXT: v_add_i32_e32 v0, vcc, v2, v0 1359; GCN-NEXT: v_mul_hi_u32 v0, s1, v0 1360; GCN-NEXT: v_mul_lo_u32 v2, s0, v1 1361; GCN-NEXT: v_mul_lo_u32 v3, v0, s12 1362; GCN-NEXT: v_mul_hi_u32 v2, v1, v2 1363; GCN-NEXT: v_add_i32_e32 v4, vcc, 1, v0 1364; GCN-NEXT: 
v_sub_i32_e32 v3, vcc, s1, v3 1365; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v3 1366; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[0:1] 1367; GCN-NEXT: v_subrev_i32_e32 v4, vcc, s12, v3 1368; GCN-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[0:1] 1369; GCN-NEXT: v_add_i32_e32 v4, vcc, 1, v0 1370; GCN-NEXT: v_add_i32_e32 v1, vcc, v2, v1 1371; GCN-NEXT: v_cmp_le_u32_e32 vcc, s12, v3 1372; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc 1373; GCN-NEXT: v_xor_b32_e32 v0, s2, v0 1374; GCN-NEXT: s_ashr_i32 s0, s9, 31 1375; GCN-NEXT: s_add_i32 s1, s9, s0 1376; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s2, v0 1377; GCN-NEXT: s_xor_b32 s2, s0, s3 1378; GCN-NEXT: s_ashr_i32 s3, s14, 31 1379; GCN-NEXT: s_xor_b32 s1, s1, s0 1380; GCN-NEXT: s_add_i32 s0, s14, s3 1381; GCN-NEXT: s_xor_b32 s9, s0, s3 1382; GCN-NEXT: v_cvt_f32_u32_e32 v3, s9 1383; GCN-NEXT: v_mul_hi_u32 v1, s1, v1 1384; GCN-NEXT: v_rcp_iflag_f32_e32 v3, v3 1385; GCN-NEXT: v_mul_lo_u32 v2, v1, s13 1386; GCN-NEXT: v_add_i32_e32 v4, vcc, 1, v1 1387; GCN-NEXT: v_mul_f32_e32 v3, s16, v3 1388; GCN-NEXT: v_sub_i32_e32 v2, vcc, s1, v2 1389; GCN-NEXT: v_cvt_u32_f32_e32 v3, v3 1390; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v2 1391; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[0:1] 1392; GCN-NEXT: v_subrev_i32_e32 v4, vcc, s13, v2 1393; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[0:1] 1394; GCN-NEXT: s_sub_i32 s0, 0, s9 1395; GCN-NEXT: v_mul_lo_u32 v5, s0, v3 1396; GCN-NEXT: v_add_i32_e32 v4, vcc, 1, v1 1397; GCN-NEXT: v_cmp_le_u32_e32 vcc, s13, v2 1398; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc 1399; GCN-NEXT: v_mul_hi_u32 v2, v3, v5 1400; GCN-NEXT: v_xor_b32_e32 v1, s2, v1 1401; GCN-NEXT: v_subrev_i32_e32 v1, vcc, s2, v1 1402; GCN-NEXT: s_ashr_i32 s2, s15, 31 1403; GCN-NEXT: s_ashr_i32 s0, s10, 31 1404; GCN-NEXT: s_add_i32 s8, s15, s2 1405; GCN-NEXT: s_add_i32 s1, s10, s0 1406; GCN-NEXT: s_xor_b32 s8, s8, s2 1407; GCN-NEXT: v_cvt_f32_u32_e32 v4, s8 1408; GCN-NEXT: s_xor_b32 s1, s1, s0 1409; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3 1410; GCN-NEXT: 
v_mul_hi_u32 v2, s1, v2 1411; GCN-NEXT: v_rcp_iflag_f32_e32 v4, v4 1412; GCN-NEXT: s_xor_b32 s3, s0, s3 1413; GCN-NEXT: v_mul_lo_u32 v3, v2, s9 1414; GCN-NEXT: v_mul_f32_e32 v4, s16, v4 1415; GCN-NEXT: v_cvt_u32_f32_e32 v4, v4 1416; GCN-NEXT: v_add_i32_e32 v5, vcc, 1, v2 1417; GCN-NEXT: v_sub_i32_e32 v3, vcc, s1, v3 1418; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v3 1419; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1] 1420; GCN-NEXT: v_subrev_i32_e32 v5, vcc, s9, v3 1421; GCN-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[0:1] 1422; GCN-NEXT: s_sub_i32 s0, 0, s8 1423; GCN-NEXT: v_mul_lo_u32 v5, s0, v4 1424; GCN-NEXT: s_ashr_i32 s0, s11, 31 1425; GCN-NEXT: s_add_i32 s1, s11, s0 1426; GCN-NEXT: s_xor_b32 s1, s1, s0 1427; GCN-NEXT: v_mul_hi_u32 v5, v4, v5 1428; GCN-NEXT: v_add_i32_e32 v6, vcc, 1, v2 1429; GCN-NEXT: s_xor_b32 s2, s0, s2 1430; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4 1431; GCN-NEXT: v_mul_hi_u32 v4, s1, v4 1432; GCN-NEXT: v_cmp_le_u32_e32 vcc, s9, v3 1433; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc 1434; GCN-NEXT: v_xor_b32_e32 v2, s3, v2 1435; GCN-NEXT: v_mul_lo_u32 v3, v4, s8 1436; GCN-NEXT: v_add_i32_e32 v5, vcc, 1, v4 1437; GCN-NEXT: v_subrev_i32_e32 v2, vcc, s3, v2 1438; GCN-NEXT: v_sub_i32_e32 v3, vcc, s1, v3 1439; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v3 1440; GCN-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[0:1] 1441; GCN-NEXT: v_subrev_i32_e32 v5, vcc, s8, v3 1442; GCN-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[0:1] 1443; GCN-NEXT: v_add_i32_e32 v5, vcc, 1, v4 1444; GCN-NEXT: v_cmp_le_u32_e32 vcc, s8, v3 1445; GCN-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc 1446; GCN-NEXT: v_xor_b32_e32 v3, s2, v3 1447; GCN-NEXT: v_subrev_i32_e32 v3, vcc, s2, v3 1448; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 1449; GCN-NEXT: s_endpgm 1450 %r = sdiv <4 x i32> %x, %y 1451 store <4 x i32> %r, <4 x i32> addrspace(1)* %out 1452 ret void 1453} 1454 1455define amdgpu_kernel void @srem_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x, <4 x i32> %y) { 1456; CHECK-LABEL: @srem_v4i32( 
1457; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[X:%.*]], i64 0 1458; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[Y:%.*]], i64 0 1459; CHECK-NEXT: [[TMP3:%.*]] = ashr i32 [[TMP1]], 31 1460; CHECK-NEXT: [[TMP4:%.*]] = ashr i32 [[TMP2]], 31 1461; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[TMP1]], [[TMP3]] 1462; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[TMP2]], [[TMP4]] 1463; CHECK-NEXT: [[TMP7:%.*]] = xor i32 [[TMP5]], [[TMP3]] 1464; CHECK-NEXT: [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP4]] 1465; CHECK-NEXT: [[TMP9:%.*]] = uitofp i32 [[TMP8]] to float 1466; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]]) 1467; CHECK-NEXT: [[TMP11:%.*]] = fmul fast float [[TMP10]], 0x41EFFFFFC0000000 1468; CHECK-NEXT: [[TMP12:%.*]] = fptoui float [[TMP11]] to i32 1469; CHECK-NEXT: [[TMP13:%.*]] = sub i32 0, [[TMP8]] 1470; CHECK-NEXT: [[TMP14:%.*]] = mul i32 [[TMP13]], [[TMP12]] 1471; CHECK-NEXT: [[TMP15:%.*]] = zext i32 [[TMP12]] to i64 1472; CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP14]] to i64 1473; CHECK-NEXT: [[TMP17:%.*]] = mul i64 [[TMP15]], [[TMP16]] 1474; CHECK-NEXT: [[TMP18:%.*]] = trunc i64 [[TMP17]] to i32 1475; CHECK-NEXT: [[TMP19:%.*]] = lshr i64 [[TMP17]], 32 1476; CHECK-NEXT: [[TMP20:%.*]] = trunc i64 [[TMP19]] to i32 1477; CHECK-NEXT: [[TMP21:%.*]] = add i32 [[TMP12]], [[TMP20]] 1478; CHECK-NEXT: [[TMP22:%.*]] = zext i32 [[TMP7]] to i64 1479; CHECK-NEXT: [[TMP23:%.*]] = zext i32 [[TMP21]] to i64 1480; CHECK-NEXT: [[TMP24:%.*]] = mul i64 [[TMP22]], [[TMP23]] 1481; CHECK-NEXT: [[TMP25:%.*]] = trunc i64 [[TMP24]] to i32 1482; CHECK-NEXT: [[TMP26:%.*]] = lshr i64 [[TMP24]], 32 1483; CHECK-NEXT: [[TMP27:%.*]] = trunc i64 [[TMP26]] to i32 1484; CHECK-NEXT: [[TMP28:%.*]] = mul i32 [[TMP27]], [[TMP8]] 1485; CHECK-NEXT: [[TMP29:%.*]] = sub i32 [[TMP7]], [[TMP28]] 1486; CHECK-NEXT: [[TMP30:%.*]] = icmp uge i32 [[TMP29]], [[TMP8]] 1487; CHECK-NEXT: [[TMP31:%.*]] = sub i32 [[TMP29]], [[TMP8]] 1488; CHECK-NEXT: [[TMP32:%.*]] = select i1 
[[TMP30]], i32 [[TMP31]], i32 [[TMP29]] 1489; CHECK-NEXT: [[TMP33:%.*]] = icmp uge i32 [[TMP32]], [[TMP8]] 1490; CHECK-NEXT: [[TMP34:%.*]] = sub i32 [[TMP32]], [[TMP8]] 1491; CHECK-NEXT: [[TMP35:%.*]] = select i1 [[TMP33]], i32 [[TMP34]], i32 [[TMP32]] 1492; CHECK-NEXT: [[TMP36:%.*]] = xor i32 [[TMP35]], [[TMP3]] 1493; CHECK-NEXT: [[TMP37:%.*]] = sub i32 [[TMP36]], [[TMP3]] 1494; CHECK-NEXT: [[TMP38:%.*]] = insertelement <4 x i32> undef, i32 [[TMP37]], i64 0 1495; CHECK-NEXT: [[TMP39:%.*]] = extractelement <4 x i32> [[X]], i64 1 1496; CHECK-NEXT: [[TMP40:%.*]] = extractelement <4 x i32> [[Y]], i64 1 1497; CHECK-NEXT: [[TMP41:%.*]] = ashr i32 [[TMP39]], 31 1498; CHECK-NEXT: [[TMP42:%.*]] = ashr i32 [[TMP40]], 31 1499; CHECK-NEXT: [[TMP43:%.*]] = add i32 [[TMP39]], [[TMP41]] 1500; CHECK-NEXT: [[TMP44:%.*]] = add i32 [[TMP40]], [[TMP42]] 1501; CHECK-NEXT: [[TMP45:%.*]] = xor i32 [[TMP43]], [[TMP41]] 1502; CHECK-NEXT: [[TMP46:%.*]] = xor i32 [[TMP44]], [[TMP42]] 1503; CHECK-NEXT: [[TMP47:%.*]] = uitofp i32 [[TMP46]] to float 1504; CHECK-NEXT: [[TMP48:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP47]]) 1505; CHECK-NEXT: [[TMP49:%.*]] = fmul fast float [[TMP48]], 0x41EFFFFFC0000000 1506; CHECK-NEXT: [[TMP50:%.*]] = fptoui float [[TMP49]] to i32 1507; CHECK-NEXT: [[TMP51:%.*]] = sub i32 0, [[TMP46]] 1508; CHECK-NEXT: [[TMP52:%.*]] = mul i32 [[TMP51]], [[TMP50]] 1509; CHECK-NEXT: [[TMP53:%.*]] = zext i32 [[TMP50]] to i64 1510; CHECK-NEXT: [[TMP54:%.*]] = zext i32 [[TMP52]] to i64 1511; CHECK-NEXT: [[TMP55:%.*]] = mul i64 [[TMP53]], [[TMP54]] 1512; CHECK-NEXT: [[TMP56:%.*]] = trunc i64 [[TMP55]] to i32 1513; CHECK-NEXT: [[TMP57:%.*]] = lshr i64 [[TMP55]], 32 1514; CHECK-NEXT: [[TMP58:%.*]] = trunc i64 [[TMP57]] to i32 1515; CHECK-NEXT: [[TMP59:%.*]] = add i32 [[TMP50]], [[TMP58]] 1516; CHECK-NEXT: [[TMP60:%.*]] = zext i32 [[TMP45]] to i64 1517; CHECK-NEXT: [[TMP61:%.*]] = zext i32 [[TMP59]] to i64 1518; CHECK-NEXT: [[TMP62:%.*]] = mul i64 [[TMP60]], [[TMP61]] 
1519; CHECK-NEXT: [[TMP63:%.*]] = trunc i64 [[TMP62]] to i32 1520; CHECK-NEXT: [[TMP64:%.*]] = lshr i64 [[TMP62]], 32 1521; CHECK-NEXT: [[TMP65:%.*]] = trunc i64 [[TMP64]] to i32 1522; CHECK-NEXT: [[TMP66:%.*]] = mul i32 [[TMP65]], [[TMP46]] 1523; CHECK-NEXT: [[TMP67:%.*]] = sub i32 [[TMP45]], [[TMP66]] 1524; CHECK-NEXT: [[TMP68:%.*]] = icmp uge i32 [[TMP67]], [[TMP46]] 1525; CHECK-NEXT: [[TMP69:%.*]] = sub i32 [[TMP67]], [[TMP46]] 1526; CHECK-NEXT: [[TMP70:%.*]] = select i1 [[TMP68]], i32 [[TMP69]], i32 [[TMP67]] 1527; CHECK-NEXT: [[TMP71:%.*]] = icmp uge i32 [[TMP70]], [[TMP46]] 1528; CHECK-NEXT: [[TMP72:%.*]] = sub i32 [[TMP70]], [[TMP46]] 1529; CHECK-NEXT: [[TMP73:%.*]] = select i1 [[TMP71]], i32 [[TMP72]], i32 [[TMP70]] 1530; CHECK-NEXT: [[TMP74:%.*]] = xor i32 [[TMP73]], [[TMP41]] 1531; CHECK-NEXT: [[TMP75:%.*]] = sub i32 [[TMP74]], [[TMP41]] 1532; CHECK-NEXT: [[TMP76:%.*]] = insertelement <4 x i32> [[TMP38]], i32 [[TMP75]], i64 1 1533; CHECK-NEXT: [[TMP77:%.*]] = extractelement <4 x i32> [[X]], i64 2 1534; CHECK-NEXT: [[TMP78:%.*]] = extractelement <4 x i32> [[Y]], i64 2 1535; CHECK-NEXT: [[TMP79:%.*]] = ashr i32 [[TMP77]], 31 1536; CHECK-NEXT: [[TMP80:%.*]] = ashr i32 [[TMP78]], 31 1537; CHECK-NEXT: [[TMP81:%.*]] = add i32 [[TMP77]], [[TMP79]] 1538; CHECK-NEXT: [[TMP82:%.*]] = add i32 [[TMP78]], [[TMP80]] 1539; CHECK-NEXT: [[TMP83:%.*]] = xor i32 [[TMP81]], [[TMP79]] 1540; CHECK-NEXT: [[TMP84:%.*]] = xor i32 [[TMP82]], [[TMP80]] 1541; CHECK-NEXT: [[TMP85:%.*]] = uitofp i32 [[TMP84]] to float 1542; CHECK-NEXT: [[TMP86:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP85]]) 1543; CHECK-NEXT: [[TMP87:%.*]] = fmul fast float [[TMP86]], 0x41EFFFFFC0000000 1544; CHECK-NEXT: [[TMP88:%.*]] = fptoui float [[TMP87]] to i32 1545; CHECK-NEXT: [[TMP89:%.*]] = sub i32 0, [[TMP84]] 1546; CHECK-NEXT: [[TMP90:%.*]] = mul i32 [[TMP89]], [[TMP88]] 1547; CHECK-NEXT: [[TMP91:%.*]] = zext i32 [[TMP88]] to i64 1548; CHECK-NEXT: [[TMP92:%.*]] = zext i32 [[TMP90]] to i64 
1549; CHECK-NEXT: [[TMP93:%.*]] = mul i64 [[TMP91]], [[TMP92]] 1550; CHECK-NEXT: [[TMP94:%.*]] = trunc i64 [[TMP93]] to i32 1551; CHECK-NEXT: [[TMP95:%.*]] = lshr i64 [[TMP93]], 32 1552; CHECK-NEXT: [[TMP96:%.*]] = trunc i64 [[TMP95]] to i32 1553; CHECK-NEXT: [[TMP97:%.*]] = add i32 [[TMP88]], [[TMP96]] 1554; CHECK-NEXT: [[TMP98:%.*]] = zext i32 [[TMP83]] to i64 1555; CHECK-NEXT: [[TMP99:%.*]] = zext i32 [[TMP97]] to i64 1556; CHECK-NEXT: [[TMP100:%.*]] = mul i64 [[TMP98]], [[TMP99]] 1557; CHECK-NEXT: [[TMP101:%.*]] = trunc i64 [[TMP100]] to i32 1558; CHECK-NEXT: [[TMP102:%.*]] = lshr i64 [[TMP100]], 32 1559; CHECK-NEXT: [[TMP103:%.*]] = trunc i64 [[TMP102]] to i32 1560; CHECK-NEXT: [[TMP104:%.*]] = mul i32 [[TMP103]], [[TMP84]] 1561; CHECK-NEXT: [[TMP105:%.*]] = sub i32 [[TMP83]], [[TMP104]] 1562; CHECK-NEXT: [[TMP106:%.*]] = icmp uge i32 [[TMP105]], [[TMP84]] 1563; CHECK-NEXT: [[TMP107:%.*]] = sub i32 [[TMP105]], [[TMP84]] 1564; CHECK-NEXT: [[TMP108:%.*]] = select i1 [[TMP106]], i32 [[TMP107]], i32 [[TMP105]] 1565; CHECK-NEXT: [[TMP109:%.*]] = icmp uge i32 [[TMP108]], [[TMP84]] 1566; CHECK-NEXT: [[TMP110:%.*]] = sub i32 [[TMP108]], [[TMP84]] 1567; CHECK-NEXT: [[TMP111:%.*]] = select i1 [[TMP109]], i32 [[TMP110]], i32 [[TMP108]] 1568; CHECK-NEXT: [[TMP112:%.*]] = xor i32 [[TMP111]], [[TMP79]] 1569; CHECK-NEXT: [[TMP113:%.*]] = sub i32 [[TMP112]], [[TMP79]] 1570; CHECK-NEXT: [[TMP114:%.*]] = insertelement <4 x i32> [[TMP76]], i32 [[TMP113]], i64 2 1571; CHECK-NEXT: [[TMP115:%.*]] = extractelement <4 x i32> [[X]], i64 3 1572; CHECK-NEXT: [[TMP116:%.*]] = extractelement <4 x i32> [[Y]], i64 3 1573; CHECK-NEXT: [[TMP117:%.*]] = ashr i32 [[TMP115]], 31 1574; CHECK-NEXT: [[TMP118:%.*]] = ashr i32 [[TMP116]], 31 1575; CHECK-NEXT: [[TMP119:%.*]] = add i32 [[TMP115]], [[TMP117]] 1576; CHECK-NEXT: [[TMP120:%.*]] = add i32 [[TMP116]], [[TMP118]] 1577; CHECK-NEXT: [[TMP121:%.*]] = xor i32 [[TMP119]], [[TMP117]] 1578; CHECK-NEXT: [[TMP122:%.*]] = xor i32 [[TMP120]], [[TMP118]] 
1579; CHECK-NEXT: [[TMP123:%.*]] = uitofp i32 [[TMP122]] to float 1580; CHECK-NEXT: [[TMP124:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP123]]) 1581; CHECK-NEXT: [[TMP125:%.*]] = fmul fast float [[TMP124]], 0x41EFFFFFC0000000 1582; CHECK-NEXT: [[TMP126:%.*]] = fptoui float [[TMP125]] to i32 1583; CHECK-NEXT: [[TMP127:%.*]] = sub i32 0, [[TMP122]] 1584; CHECK-NEXT: [[TMP128:%.*]] = mul i32 [[TMP127]], [[TMP126]] 1585; CHECK-NEXT: [[TMP129:%.*]] = zext i32 [[TMP126]] to i64 1586; CHECK-NEXT: [[TMP130:%.*]] = zext i32 [[TMP128]] to i64 1587; CHECK-NEXT: [[TMP131:%.*]] = mul i64 [[TMP129]], [[TMP130]] 1588; CHECK-NEXT: [[TMP132:%.*]] = trunc i64 [[TMP131]] to i32 1589; CHECK-NEXT: [[TMP133:%.*]] = lshr i64 [[TMP131]], 32 1590; CHECK-NEXT: [[TMP134:%.*]] = trunc i64 [[TMP133]] to i32 1591; CHECK-NEXT: [[TMP135:%.*]] = add i32 [[TMP126]], [[TMP134]] 1592; CHECK-NEXT: [[TMP136:%.*]] = zext i32 [[TMP121]] to i64 1593; CHECK-NEXT: [[TMP137:%.*]] = zext i32 [[TMP135]] to i64 1594; CHECK-NEXT: [[TMP138:%.*]] = mul i64 [[TMP136]], [[TMP137]] 1595; CHECK-NEXT: [[TMP139:%.*]] = trunc i64 [[TMP138]] to i32 1596; CHECK-NEXT: [[TMP140:%.*]] = lshr i64 [[TMP138]], 32 1597; CHECK-NEXT: [[TMP141:%.*]] = trunc i64 [[TMP140]] to i32 1598; CHECK-NEXT: [[TMP142:%.*]] = mul i32 [[TMP141]], [[TMP122]] 1599; CHECK-NEXT: [[TMP143:%.*]] = sub i32 [[TMP121]], [[TMP142]] 1600; CHECK-NEXT: [[TMP144:%.*]] = icmp uge i32 [[TMP143]], [[TMP122]] 1601; CHECK-NEXT: [[TMP145:%.*]] = sub i32 [[TMP143]], [[TMP122]] 1602; CHECK-NEXT: [[TMP146:%.*]] = select i1 [[TMP144]], i32 [[TMP145]], i32 [[TMP143]] 1603; CHECK-NEXT: [[TMP147:%.*]] = icmp uge i32 [[TMP146]], [[TMP122]] 1604; CHECK-NEXT: [[TMP148:%.*]] = sub i32 [[TMP146]], [[TMP122]] 1605; CHECK-NEXT: [[TMP149:%.*]] = select i1 [[TMP147]], i32 [[TMP148]], i32 [[TMP146]] 1606; CHECK-NEXT: [[TMP150:%.*]] = xor i32 [[TMP149]], [[TMP117]] 1607; CHECK-NEXT: [[TMP151:%.*]] = sub i32 [[TMP150]], [[TMP117]] 1608; CHECK-NEXT: [[TMP152:%.*]] = 
insertelement <4 x i32> [[TMP114]], i32 [[TMP151]], i64 3 1609; CHECK-NEXT: store <4 x i32> [[TMP152]], <4 x i32> addrspace(1)* [[OUT:%.*]], align 16 1610; CHECK-NEXT: ret void 1611; 1612; GCN-LABEL: srem_v4i32: 1613; GCN: ; %bb.0: 1614; GCN-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd 1615; GCN-NEXT: s_mov_b32 s13, 0x4f7ffffe 1616; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1617; GCN-NEXT: s_mov_b32 s3, 0xf000 1618; GCN-NEXT: s_waitcnt lgkmcnt(0) 1619; GCN-NEXT: s_ashr_i32 s2, s8, 31 1620; GCN-NEXT: s_add_i32 s8, s8, s2 1621; GCN-NEXT: s_xor_b32 s12, s8, s2 1622; GCN-NEXT: v_cvt_f32_u32_e32 v0, s12 1623; GCN-NEXT: s_ashr_i32 s8, s9, 31 1624; GCN-NEXT: s_add_i32 s9, s9, s8 1625; GCN-NEXT: s_xor_b32 s14, s9, s8 1626; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 1627; GCN-NEXT: v_cvt_f32_u32_e32 v1, s14 1628; GCN-NEXT: s_sub_i32 s9, 0, s12 1629; GCN-NEXT: s_ashr_i32 s8, s4, 31 1630; GCN-NEXT: v_mul_f32_e32 v0, s13, v0 1631; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 1632; GCN-NEXT: v_rcp_iflag_f32_e32 v1, v1 1633; GCN-NEXT: s_add_i32 s4, s4, s8 1634; GCN-NEXT: s_xor_b32 s4, s4, s8 1635; GCN-NEXT: v_mul_lo_u32 v2, s9, v0 1636; GCN-NEXT: v_mul_f32_e32 v1, s13, v1 1637; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 1638; GCN-NEXT: s_sub_i32 s9, 0, s14 1639; GCN-NEXT: v_mul_hi_u32 v2, v0, v2 1640; GCN-NEXT: s_mov_b32 s2, -1 1641; GCN-NEXT: v_add_i32_e32 v0, vcc, v2, v0 1642; GCN-NEXT: v_mul_hi_u32 v0, s4, v0 1643; GCN-NEXT: v_mul_lo_u32 v2, s9, v1 1644; GCN-NEXT: s_ashr_i32 s9, s5, 31 1645; GCN-NEXT: s_add_i32 s5, s5, s9 1646; GCN-NEXT: v_mul_lo_u32 v0, v0, s12 1647; GCN-NEXT: v_mul_hi_u32 v2, v1, v2 1648; GCN-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 1649; GCN-NEXT: s_xor_b32 s4, s5, s9 1650; GCN-NEXT: s_ashr_i32 s5, s10, 31 1651; GCN-NEXT: s_add_i32 s10, s10, s5 1652; GCN-NEXT: v_subrev_i32_e32 v3, vcc, s12, v0 1653; GCN-NEXT: v_cmp_le_u32_e32 vcc, s12, v0 1654; GCN-NEXT: s_xor_b32 s10, s10, s5 1655; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 1656; GCN-NEXT: v_add_i32_e32 v1, vcc, v2, v1 1657; 
GCN-NEXT: v_cvt_f32_u32_e32 v2, s10 1658; GCN-NEXT: v_mul_hi_u32 v1, s4, v1 1659; GCN-NEXT: v_subrev_i32_e32 v3, vcc, s12, v0 1660; GCN-NEXT: v_cmp_le_u32_e32 vcc, s12, v0 1661; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v2 1662; GCN-NEXT: v_mul_lo_u32 v1, v1, s14 1663; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 1664; GCN-NEXT: v_xor_b32_e32 v0, s8, v0 1665; GCN-NEXT: v_mul_f32_e32 v2, s13, v2 1666; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2 1667; GCN-NEXT: v_sub_i32_e32 v1, vcc, s4, v1 1668; GCN-NEXT: s_sub_i32 s4, 0, s10 1669; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s8, v0 1670; GCN-NEXT: v_subrev_i32_e32 v3, vcc, s14, v1 1671; GCN-NEXT: v_cmp_le_u32_e32 vcc, s14, v1 1672; GCN-NEXT: v_mul_lo_u32 v4, s4, v2 1673; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc 1674; GCN-NEXT: v_subrev_i32_e32 v3, vcc, s14, v1 1675; GCN-NEXT: v_cmp_le_u32_e32 vcc, s14, v1 1676; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc 1677; GCN-NEXT: v_mul_hi_u32 v3, v2, v4 1678; GCN-NEXT: s_ashr_i32 s4, s6, 31 1679; GCN-NEXT: s_add_i32 s5, s6, s4 1680; GCN-NEXT: s_ashr_i32 s6, s11, 31 1681; GCN-NEXT: s_add_i32 s8, s11, s6 1682; GCN-NEXT: s_xor_b32 s8, s8, s6 1683; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 1684; GCN-NEXT: v_cvt_f32_u32_e32 v3, s8 1685; GCN-NEXT: s_xor_b32 s5, s5, s4 1686; GCN-NEXT: v_mul_hi_u32 v2, s5, v2 1687; GCN-NEXT: v_xor_b32_e32 v1, s9, v1 1688; GCN-NEXT: v_rcp_iflag_f32_e32 v3, v3 1689; GCN-NEXT: v_subrev_i32_e32 v1, vcc, s9, v1 1690; GCN-NEXT: v_mul_lo_u32 v2, v2, s10 1691; GCN-NEXT: v_mul_f32_e32 v3, s13, v3 1692; GCN-NEXT: v_cvt_u32_f32_e32 v3, v3 1693; GCN-NEXT: v_sub_i32_e32 v2, vcc, s5, v2 1694; GCN-NEXT: v_subrev_i32_e32 v4, vcc, s10, v2 1695; GCN-NEXT: v_cmp_le_u32_e32 vcc, s10, v2 1696; GCN-NEXT: s_sub_i32 s5, 0, s8 1697; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc 1698; GCN-NEXT: v_mul_lo_u32 v4, s5, v3 1699; GCN-NEXT: s_ashr_i32 s5, s7, 31 1700; GCN-NEXT: s_add_i32 s6, s7, s5 1701; GCN-NEXT: s_xor_b32 s6, s6, s5 1702; GCN-NEXT: v_mul_hi_u32 v4, v3, v4 1703; GCN-NEXT: v_subrev_i32_e32 
v5, vcc, s10, v2 1704; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 1705; GCN-NEXT: v_mul_hi_u32 v3, s6, v3 1706; GCN-NEXT: v_cmp_le_u32_e32 vcc, s10, v2 1707; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc 1708; GCN-NEXT: v_xor_b32_e32 v2, s4, v2 1709; GCN-NEXT: v_mul_lo_u32 v3, v3, s8 1710; GCN-NEXT: v_subrev_i32_e32 v2, vcc, s4, v2 1711; GCN-NEXT: v_sub_i32_e32 v3, vcc, s6, v3 1712; GCN-NEXT: v_subrev_i32_e32 v4, vcc, s8, v3 1713; GCN-NEXT: v_cmp_le_u32_e32 vcc, s8, v3 1714; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc 1715; GCN-NEXT: v_subrev_i32_e32 v4, vcc, s8, v3 1716; GCN-NEXT: v_cmp_le_u32_e32 vcc, s8, v3 1717; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc 1718; GCN-NEXT: v_xor_b32_e32 v3, s5, v3 1719; GCN-NEXT: v_subrev_i32_e32 v3, vcc, s5, v3 1720; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 1721; GCN-NEXT: s_endpgm 1722 %r = srem <4 x i32> %x, %y 1723 store <4 x i32> %r, <4 x i32> addrspace(1)* %out 1724 ret void 1725} 1726 1727define amdgpu_kernel void @udiv_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x, <4 x i16> %y) { 1728; CHECK-LABEL: @udiv_v4i16( 1729; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i16> [[X:%.*]], i64 0 1730; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i16> [[Y:%.*]], i64 0 1731; CHECK-NEXT: [[TMP3:%.*]] = zext i16 [[TMP1]] to i32 1732; CHECK-NEXT: [[TMP4:%.*]] = zext i16 [[TMP2]] to i32 1733; CHECK-NEXT: [[TMP5:%.*]] = uitofp i32 [[TMP3]] to float 1734; CHECK-NEXT: [[TMP6:%.*]] = uitofp i32 [[TMP4]] to float 1735; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP6]]) 1736; CHECK-NEXT: [[TMP8:%.*]] = fmul fast float [[TMP5]], [[TMP7]] 1737; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.trunc.f32(float [[TMP8]]) 1738; CHECK-NEXT: [[TMP10:%.*]] = fneg fast float [[TMP9]] 1739; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP10]], float [[TMP6]], float [[TMP5]]) 1740; CHECK-NEXT: [[TMP12:%.*]] = fptoui float [[TMP9]] to i32 1741; CHECK-NEXT: [[TMP13:%.*]] = call fast 
float @llvm.fabs.f32(float [[TMP11]]) 1742; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP6]]) 1743; CHECK-NEXT: [[TMP15:%.*]] = fcmp fast oge float [[TMP13]], [[TMP14]] 1744; CHECK-NEXT: [[TMP16:%.*]] = select i1 [[TMP15]], i32 1, i32 0 1745; CHECK-NEXT: [[TMP17:%.*]] = add i32 [[TMP12]], [[TMP16]] 1746; CHECK-NEXT: [[TMP18:%.*]] = and i32 [[TMP17]], 65535 1747; CHECK-NEXT: [[TMP19:%.*]] = trunc i32 [[TMP18]] to i16 1748; CHECK-NEXT: [[TMP20:%.*]] = insertelement <4 x i16> undef, i16 [[TMP19]], i64 0 1749; CHECK-NEXT: [[TMP21:%.*]] = extractelement <4 x i16> [[X]], i64 1 1750; CHECK-NEXT: [[TMP22:%.*]] = extractelement <4 x i16> [[Y]], i64 1 1751; CHECK-NEXT: [[TMP23:%.*]] = zext i16 [[TMP21]] to i32 1752; CHECK-NEXT: [[TMP24:%.*]] = zext i16 [[TMP22]] to i32 1753; CHECK-NEXT: [[TMP25:%.*]] = uitofp i32 [[TMP23]] to float 1754; CHECK-NEXT: [[TMP26:%.*]] = uitofp i32 [[TMP24]] to float 1755; CHECK-NEXT: [[TMP27:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP26]]) 1756; CHECK-NEXT: [[TMP28:%.*]] = fmul fast float [[TMP25]], [[TMP27]] 1757; CHECK-NEXT: [[TMP29:%.*]] = call fast float @llvm.trunc.f32(float [[TMP28]]) 1758; CHECK-NEXT: [[TMP30:%.*]] = fneg fast float [[TMP29]] 1759; CHECK-NEXT: [[TMP31:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP30]], float [[TMP26]], float [[TMP25]]) 1760; CHECK-NEXT: [[TMP32:%.*]] = fptoui float [[TMP29]] to i32 1761; CHECK-NEXT: [[TMP33:%.*]] = call fast float @llvm.fabs.f32(float [[TMP31]]) 1762; CHECK-NEXT: [[TMP34:%.*]] = call fast float @llvm.fabs.f32(float [[TMP26]]) 1763; CHECK-NEXT: [[TMP35:%.*]] = fcmp fast oge float [[TMP33]], [[TMP34]] 1764; CHECK-NEXT: [[TMP36:%.*]] = select i1 [[TMP35]], i32 1, i32 0 1765; CHECK-NEXT: [[TMP37:%.*]] = add i32 [[TMP32]], [[TMP36]] 1766; CHECK-NEXT: [[TMP38:%.*]] = and i32 [[TMP37]], 65535 1767; CHECK-NEXT: [[TMP39:%.*]] = trunc i32 [[TMP38]] to i16 1768; CHECK-NEXT: [[TMP40:%.*]] = insertelement <4 x i16> [[TMP20]], i16 [[TMP39]], i64 1 
1769; CHECK-NEXT: [[TMP41:%.*]] = extractelement <4 x i16> [[X]], i64 2 1770; CHECK-NEXT: [[TMP42:%.*]] = extractelement <4 x i16> [[Y]], i64 2 1771; CHECK-NEXT: [[TMP43:%.*]] = zext i16 [[TMP41]] to i32 1772; CHECK-NEXT: [[TMP44:%.*]] = zext i16 [[TMP42]] to i32 1773; CHECK-NEXT: [[TMP45:%.*]] = uitofp i32 [[TMP43]] to float 1774; CHECK-NEXT: [[TMP46:%.*]] = uitofp i32 [[TMP44]] to float 1775; CHECK-NEXT: [[TMP47:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP46]]) 1776; CHECK-NEXT: [[TMP48:%.*]] = fmul fast float [[TMP45]], [[TMP47]] 1777; CHECK-NEXT: [[TMP49:%.*]] = call fast float @llvm.trunc.f32(float [[TMP48]]) 1778; CHECK-NEXT: [[TMP50:%.*]] = fneg fast float [[TMP49]] 1779; CHECK-NEXT: [[TMP51:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP50]], float [[TMP46]], float [[TMP45]]) 1780; CHECK-NEXT: [[TMP52:%.*]] = fptoui float [[TMP49]] to i32 1781; CHECK-NEXT: [[TMP53:%.*]] = call fast float @llvm.fabs.f32(float [[TMP51]]) 1782; CHECK-NEXT: [[TMP54:%.*]] = call fast float @llvm.fabs.f32(float [[TMP46]]) 1783; CHECK-NEXT: [[TMP55:%.*]] = fcmp fast oge float [[TMP53]], [[TMP54]] 1784; CHECK-NEXT: [[TMP56:%.*]] = select i1 [[TMP55]], i32 1, i32 0 1785; CHECK-NEXT: [[TMP57:%.*]] = add i32 [[TMP52]], [[TMP56]] 1786; CHECK-NEXT: [[TMP58:%.*]] = and i32 [[TMP57]], 65535 1787; CHECK-NEXT: [[TMP59:%.*]] = trunc i32 [[TMP58]] to i16 1788; CHECK-NEXT: [[TMP60:%.*]] = insertelement <4 x i16> [[TMP40]], i16 [[TMP59]], i64 2 1789; CHECK-NEXT: [[TMP61:%.*]] = extractelement <4 x i16> [[X]], i64 3 1790; CHECK-NEXT: [[TMP62:%.*]] = extractelement <4 x i16> [[Y]], i64 3 1791; CHECK-NEXT: [[TMP63:%.*]] = zext i16 [[TMP61]] to i32 1792; CHECK-NEXT: [[TMP64:%.*]] = zext i16 [[TMP62]] to i32 1793; CHECK-NEXT: [[TMP65:%.*]] = uitofp i32 [[TMP63]] to float 1794; CHECK-NEXT: [[TMP66:%.*]] = uitofp i32 [[TMP64]] to float 1795; CHECK-NEXT: [[TMP67:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP66]]) 1796; CHECK-NEXT: [[TMP68:%.*]] = fmul fast float 
[[TMP65]], [[TMP67]] 1797; CHECK-NEXT: [[TMP69:%.*]] = call fast float @llvm.trunc.f32(float [[TMP68]]) 1798; CHECK-NEXT: [[TMP70:%.*]] = fneg fast float [[TMP69]] 1799; CHECK-NEXT: [[TMP71:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP70]], float [[TMP66]], float [[TMP65]]) 1800; CHECK-NEXT: [[TMP72:%.*]] = fptoui float [[TMP69]] to i32 1801; CHECK-NEXT: [[TMP73:%.*]] = call fast float @llvm.fabs.f32(float [[TMP71]]) 1802; CHECK-NEXT: [[TMP74:%.*]] = call fast float @llvm.fabs.f32(float [[TMP66]]) 1803; CHECK-NEXT: [[TMP75:%.*]] = fcmp fast oge float [[TMP73]], [[TMP74]] 1804; CHECK-NEXT: [[TMP76:%.*]] = select i1 [[TMP75]], i32 1, i32 0 1805; CHECK-NEXT: [[TMP77:%.*]] = add i32 [[TMP72]], [[TMP76]] 1806; CHECK-NEXT: [[TMP78:%.*]] = and i32 [[TMP77]], 65535 1807; CHECK-NEXT: [[TMP79:%.*]] = trunc i32 [[TMP78]] to i16 1808; CHECK-NEXT: [[TMP80:%.*]] = insertelement <4 x i16> [[TMP60]], i16 [[TMP79]], i64 3 1809; CHECK-NEXT: store <4 x i16> [[TMP80]], <4 x i16> addrspace(1)* [[OUT:%.*]], align 8 1810; CHECK-NEXT: ret void 1811; 1812; GCN-LABEL: udiv_v4i16: 1813; GCN: ; %bb.0: 1814; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 1815; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xb 1816; GCN-NEXT: s_mov_b32 s8, 0xffff 1817; GCN-NEXT: s_mov_b32 s7, 0xf000 1818; GCN-NEXT: s_mov_b32 s6, -1 1819; GCN-NEXT: s_waitcnt lgkmcnt(0) 1820; GCN-NEXT: s_and_b32 s9, s2, s8 1821; GCN-NEXT: v_cvt_f32_u32_e32 v0, s9 1822; GCN-NEXT: s_lshr_b32 s9, s0, 16 1823; GCN-NEXT: s_and_b32 s0, s0, s8 1824; GCN-NEXT: s_lshr_b32 s2, s2, 16 1825; GCN-NEXT: v_cvt_f32_u32_e32 v3, s2 1826; GCN-NEXT: v_cvt_f32_u32_e32 v1, s0 1827; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 1828; GCN-NEXT: v_cvt_f32_u32_e32 v4, s9 1829; GCN-NEXT: v_rcp_iflag_f32_e32 v5, v3 1830; GCN-NEXT: s_and_b32 s2, s3, s8 1831; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 1832; GCN-NEXT: v_trunc_f32_e32 v2, v2 1833; GCN-NEXT: v_mad_f32 v1, -v2, v0, v1 1834; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2 1835; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 
1836; GCN-NEXT: v_mul_f32_e32 v1, v4, v5 1837; GCN-NEXT: v_trunc_f32_e32 v1, v1 1838; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc 1839; GCN-NEXT: v_mad_f32 v2, -v1, v3, v4 1840; GCN-NEXT: v_cvt_f32_u32_e32 v4, s2 1841; GCN-NEXT: s_lshr_b32 s0, s1, 16 1842; GCN-NEXT: s_and_b32 s1, s1, s8 1843; GCN-NEXT: s_lshr_b32 s10, s3, 16 1844; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v3 1845; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 1846; GCN-NEXT: v_cvt_f32_u32_e32 v3, s10 1847; GCN-NEXT: v_cvt_f32_u32_e32 v5, s1 1848; GCN-NEXT: v_rcp_iflag_f32_e32 v6, v4 1849; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v1, vcc 1850; GCN-NEXT: v_rcp_iflag_f32_e32 v7, v3 1851; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 1852; GCN-NEXT: v_mul_f32_e32 v1, v5, v6 1853; GCN-NEXT: v_cvt_f32_u32_e32 v6, s0 1854; GCN-NEXT: v_trunc_f32_e32 v1, v1 1855; GCN-NEXT: v_mad_f32 v5, -v1, v4, v5 1856; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v5|, v4 1857; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 1858; GCN-NEXT: v_mul_f32_e32 v4, v6, v7 1859; GCN-NEXT: v_trunc_f32_e32 v4, v4 1860; GCN-NEXT: v_cvt_u32_f32_e32 v5, v4 1861; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1862; GCN-NEXT: v_mad_f32 v4, -v4, v3, v6 1863; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, v3 1864; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc 1865; GCN-NEXT: v_and_b32_e32 v0, s8, v0 1866; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 1867; GCN-NEXT: v_and_b32_e32 v1, s8, v1 1868; GCN-NEXT: v_or_b32_e32 v1, v1, v3 1869; GCN-NEXT: v_or_b32_e32 v0, v0, v2 1870; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 1871; GCN-NEXT: s_endpgm 1872 %r = udiv <4 x i16> %x, %y 1873 store <4 x i16> %r, <4 x i16> addrspace(1)* %out 1874 ret void 1875} 1876 1877define amdgpu_kernel void @urem_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x, <4 x i16> %y) { 1878; CHECK-LABEL: @urem_v4i16( 1879; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i16> [[X:%.*]], i64 0 1880; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i16> [[Y:%.*]], i64 0 1881; CHECK-NEXT: [[TMP3:%.*]] = zext i16 [[TMP1]] to i32 1882; 
CHECK-NEXT: [[TMP4:%.*]] = zext i16 [[TMP2]] to i32 1883; CHECK-NEXT: [[TMP5:%.*]] = uitofp i32 [[TMP3]] to float 1884; CHECK-NEXT: [[TMP6:%.*]] = uitofp i32 [[TMP4]] to float 1885; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP6]]) 1886; CHECK-NEXT: [[TMP8:%.*]] = fmul fast float [[TMP5]], [[TMP7]] 1887; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.trunc.f32(float [[TMP8]]) 1888; CHECK-NEXT: [[TMP10:%.*]] = fneg fast float [[TMP9]] 1889; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP10]], float [[TMP6]], float [[TMP5]]) 1890; CHECK-NEXT: [[TMP12:%.*]] = fptoui float [[TMP9]] to i32 1891; CHECK-NEXT: [[TMP13:%.*]] = call fast float @llvm.fabs.f32(float [[TMP11]]) 1892; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP6]]) 1893; CHECK-NEXT: [[TMP15:%.*]] = fcmp fast oge float [[TMP13]], [[TMP14]] 1894; CHECK-NEXT: [[TMP16:%.*]] = select i1 [[TMP15]], i32 1, i32 0 1895; CHECK-NEXT: [[TMP17:%.*]] = add i32 [[TMP12]], [[TMP16]] 1896; CHECK-NEXT: [[TMP18:%.*]] = mul i32 [[TMP17]], [[TMP4]] 1897; CHECK-NEXT: [[TMP19:%.*]] = sub i32 [[TMP3]], [[TMP18]] 1898; CHECK-NEXT: [[TMP20:%.*]] = and i32 [[TMP19]], 65535 1899; CHECK-NEXT: [[TMP21:%.*]] = trunc i32 [[TMP20]] to i16 1900; CHECK-NEXT: [[TMP22:%.*]] = insertelement <4 x i16> undef, i16 [[TMP21]], i64 0 1901; CHECK-NEXT: [[TMP23:%.*]] = extractelement <4 x i16> [[X]], i64 1 1902; CHECK-NEXT: [[TMP24:%.*]] = extractelement <4 x i16> [[Y]], i64 1 1903; CHECK-NEXT: [[TMP25:%.*]] = zext i16 [[TMP23]] to i32 1904; CHECK-NEXT: [[TMP26:%.*]] = zext i16 [[TMP24]] to i32 1905; CHECK-NEXT: [[TMP27:%.*]] = uitofp i32 [[TMP25]] to float 1906; CHECK-NEXT: [[TMP28:%.*]] = uitofp i32 [[TMP26]] to float 1907; CHECK-NEXT: [[TMP29:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP28]]) 1908; CHECK-NEXT: [[TMP30:%.*]] = fmul fast float [[TMP27]], [[TMP29]] 1909; CHECK-NEXT: [[TMP31:%.*]] = call fast float @llvm.trunc.f32(float [[TMP30]]) 1910; 
CHECK-NEXT: [[TMP32:%.*]] = fneg fast float [[TMP31]] 1911; CHECK-NEXT: [[TMP33:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP32]], float [[TMP28]], float [[TMP27]]) 1912; CHECK-NEXT: [[TMP34:%.*]] = fptoui float [[TMP31]] to i32 1913; CHECK-NEXT: [[TMP35:%.*]] = call fast float @llvm.fabs.f32(float [[TMP33]]) 1914; CHECK-NEXT: [[TMP36:%.*]] = call fast float @llvm.fabs.f32(float [[TMP28]]) 1915; CHECK-NEXT: [[TMP37:%.*]] = fcmp fast oge float [[TMP35]], [[TMP36]] 1916; CHECK-NEXT: [[TMP38:%.*]] = select i1 [[TMP37]], i32 1, i32 0 1917; CHECK-NEXT: [[TMP39:%.*]] = add i32 [[TMP34]], [[TMP38]] 1918; CHECK-NEXT: [[TMP40:%.*]] = mul i32 [[TMP39]], [[TMP26]] 1919; CHECK-NEXT: [[TMP41:%.*]] = sub i32 [[TMP25]], [[TMP40]] 1920; CHECK-NEXT: [[TMP42:%.*]] = and i32 [[TMP41]], 65535 1921; CHECK-NEXT: [[TMP43:%.*]] = trunc i32 [[TMP42]] to i16 1922; CHECK-NEXT: [[TMP44:%.*]] = insertelement <4 x i16> [[TMP22]], i16 [[TMP43]], i64 1 1923; CHECK-NEXT: [[TMP45:%.*]] = extractelement <4 x i16> [[X]], i64 2 1924; CHECK-NEXT: [[TMP46:%.*]] = extractelement <4 x i16> [[Y]], i64 2 1925; CHECK-NEXT: [[TMP47:%.*]] = zext i16 [[TMP45]] to i32 1926; CHECK-NEXT: [[TMP48:%.*]] = zext i16 [[TMP46]] to i32 1927; CHECK-NEXT: [[TMP49:%.*]] = uitofp i32 [[TMP47]] to float 1928; CHECK-NEXT: [[TMP50:%.*]] = uitofp i32 [[TMP48]] to float 1929; CHECK-NEXT: [[TMP51:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP50]]) 1930; CHECK-NEXT: [[TMP52:%.*]] = fmul fast float [[TMP49]], [[TMP51]] 1931; CHECK-NEXT: [[TMP53:%.*]] = call fast float @llvm.trunc.f32(float [[TMP52]]) 1932; CHECK-NEXT: [[TMP54:%.*]] = fneg fast float [[TMP53]] 1933; CHECK-NEXT: [[TMP55:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP54]], float [[TMP50]], float [[TMP49]]) 1934; CHECK-NEXT: [[TMP56:%.*]] = fptoui float [[TMP53]] to i32 1935; CHECK-NEXT: [[TMP57:%.*]] = call fast float @llvm.fabs.f32(float [[TMP55]]) 1936; CHECK-NEXT: [[TMP58:%.*]] = call fast float @llvm.fabs.f32(float [[TMP50]]) 
1937; CHECK-NEXT: [[TMP59:%.*]] = fcmp fast oge float [[TMP57]], [[TMP58]] 1938; CHECK-NEXT: [[TMP60:%.*]] = select i1 [[TMP59]], i32 1, i32 0 1939; CHECK-NEXT: [[TMP61:%.*]] = add i32 [[TMP56]], [[TMP60]] 1940; CHECK-NEXT: [[TMP62:%.*]] = mul i32 [[TMP61]], [[TMP48]] 1941; CHECK-NEXT: [[TMP63:%.*]] = sub i32 [[TMP47]], [[TMP62]] 1942; CHECK-NEXT: [[TMP64:%.*]] = and i32 [[TMP63]], 65535 1943; CHECK-NEXT: [[TMP65:%.*]] = trunc i32 [[TMP64]] to i16 1944; CHECK-NEXT: [[TMP66:%.*]] = insertelement <4 x i16> [[TMP44]], i16 [[TMP65]], i64 2 1945; CHECK-NEXT: [[TMP67:%.*]] = extractelement <4 x i16> [[X]], i64 3 1946; CHECK-NEXT: [[TMP68:%.*]] = extractelement <4 x i16> [[Y]], i64 3 1947; CHECK-NEXT: [[TMP69:%.*]] = zext i16 [[TMP67]] to i32 1948; CHECK-NEXT: [[TMP70:%.*]] = zext i16 [[TMP68]] to i32 1949; CHECK-NEXT: [[TMP71:%.*]] = uitofp i32 [[TMP69]] to float 1950; CHECK-NEXT: [[TMP72:%.*]] = uitofp i32 [[TMP70]] to float 1951; CHECK-NEXT: [[TMP73:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP72]]) 1952; CHECK-NEXT: [[TMP74:%.*]] = fmul fast float [[TMP71]], [[TMP73]] 1953; CHECK-NEXT: [[TMP75:%.*]] = call fast float @llvm.trunc.f32(float [[TMP74]]) 1954; CHECK-NEXT: [[TMP76:%.*]] = fneg fast float [[TMP75]] 1955; CHECK-NEXT: [[TMP77:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP76]], float [[TMP72]], float [[TMP71]]) 1956; CHECK-NEXT: [[TMP78:%.*]] = fptoui float [[TMP75]] to i32 1957; CHECK-NEXT: [[TMP79:%.*]] = call fast float @llvm.fabs.f32(float [[TMP77]]) 1958; CHECK-NEXT: [[TMP80:%.*]] = call fast float @llvm.fabs.f32(float [[TMP72]]) 1959; CHECK-NEXT: [[TMP81:%.*]] = fcmp fast oge float [[TMP79]], [[TMP80]] 1960; CHECK-NEXT: [[TMP82:%.*]] = select i1 [[TMP81]], i32 1, i32 0 1961; CHECK-NEXT: [[TMP83:%.*]] = add i32 [[TMP78]], [[TMP82]] 1962; CHECK-NEXT: [[TMP84:%.*]] = mul i32 [[TMP83]], [[TMP70]] 1963; CHECK-NEXT: [[TMP85:%.*]] = sub i32 [[TMP69]], [[TMP84]] 1964; CHECK-NEXT: [[TMP86:%.*]] = and i32 [[TMP85]], 65535 1965; CHECK-NEXT: 
[[TMP87:%.*]] = trunc i32 [[TMP86]] to i16 1966; CHECK-NEXT: [[TMP88:%.*]] = insertelement <4 x i16> [[TMP66]], i16 [[TMP87]], i64 3 1967; CHECK-NEXT: store <4 x i16> [[TMP88]], <4 x i16> addrspace(1)* [[OUT:%.*]], align 8 1968; CHECK-NEXT: ret void 1969; 1970; GCN-LABEL: urem_v4i16: 1971; GCN: ; %bb.0: 1972; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 1973; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xb 1974; GCN-NEXT: s_mov_b32 s8, 0xffff 1975; GCN-NEXT: s_mov_b32 s7, 0xf000 1976; GCN-NEXT: s_mov_b32 s6, -1 1977; GCN-NEXT: s_waitcnt lgkmcnt(0) 1978; GCN-NEXT: s_and_b32 s9, s2, s8 1979; GCN-NEXT: v_cvt_f32_u32_e32 v0, s9 1980; GCN-NEXT: s_and_b32 s10, s0, s8 1981; GCN-NEXT: s_lshr_b32 s11, s2, 16 1982; GCN-NEXT: v_cvt_f32_u32_e32 v1, s10 1983; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 1984; GCN-NEXT: v_cvt_f32_u32_e32 v3, s11 1985; GCN-NEXT: s_lshr_b32 s9, s0, 16 1986; GCN-NEXT: v_cvt_f32_u32_e32 v4, s9 1987; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 1988; GCN-NEXT: v_rcp_iflag_f32_e32 v5, v3 1989; GCN-NEXT: v_trunc_f32_e32 v2, v2 1990; GCN-NEXT: v_mad_f32 v1, -v2, v0, v1 1991; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2 1992; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 1993; GCN-NEXT: v_mul_f32_e32 v1, v4, v5 1994; GCN-NEXT: v_trunc_f32_e32 v1, v1 1995; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc 1996; GCN-NEXT: v_cvt_u32_f32_e32 v2, v1 1997; GCN-NEXT: v_mad_f32 v1, -v1, v3, v4 1998; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v3 1999; GCN-NEXT: v_mul_lo_u32 v0, v0, s2 2000; GCN-NEXT: s_and_b32 s2, s3, s8 2001; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc 2002; GCN-NEXT: v_cvt_f32_u32_e32 v2, s2 2003; GCN-NEXT: s_and_b32 s2, s1, s8 2004; GCN-NEXT: v_mul_lo_u32 v1, v1, s11 2005; GCN-NEXT: v_cvt_f32_u32_e32 v3, s2 2006; GCN-NEXT: v_rcp_iflag_f32_e32 v4, v2 2007; GCN-NEXT: s_lshr_b32 s12, s3, 16 2008; GCN-NEXT: v_sub_i32_e32 v5, vcc, s9, v1 2009; GCN-NEXT: s_lshr_b32 s10, s1, 16 2010; GCN-NEXT: v_mul_f32_e32 v1, v3, v4 2011; GCN-NEXT: v_cvt_f32_u32_e32 v4, s12 2012; GCN-NEXT: v_cvt_f32_u32_e32 
v6, s10 2013; GCN-NEXT: v_trunc_f32_e32 v1, v1 2014; GCN-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 2015; GCN-NEXT: v_rcp_iflag_f32_e32 v7, v4 2016; GCN-NEXT: v_mad_f32 v3, -v1, v2, v3 2017; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v2 2018; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 2019; GCN-NEXT: v_mul_f32_e32 v2, v6, v7 2020; GCN-NEXT: v_trunc_f32_e32 v2, v2 2021; GCN-NEXT: v_cvt_u32_f32_e32 v3, v2 2022; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2023; GCN-NEXT: v_mad_f32 v2, -v2, v4, v6 2024; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v4 2025; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v3, vcc 2026; GCN-NEXT: v_mul_lo_u32 v1, v1, s3 2027; GCN-NEXT: v_mul_lo_u32 v2, v2, s12 2028; GCN-NEXT: v_and_b32_e32 v0, s8, v0 2029; GCN-NEXT: v_sub_i32_e32 v1, vcc, s1, v1 2030; GCN-NEXT: v_sub_i32_e32 v2, vcc, s10, v2 2031; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 2032; GCN-NEXT: v_and_b32_e32 v1, s8, v1 2033; GCN-NEXT: v_or_b32_e32 v1, v1, v2 2034; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v5 2035; GCN-NEXT: v_or_b32_e32 v0, v0, v2 2036; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 2037; GCN-NEXT: s_endpgm 2038 %r = urem <4 x i16> %x, %y 2039 store <4 x i16> %r, <4 x i16> addrspace(1)* %out 2040 ret void 2041} 2042 2043define amdgpu_kernel void @sdiv_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x, <4 x i16> %y) { 2044; CHECK-LABEL: @sdiv_v4i16( 2045; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i16> [[X:%.*]], i64 0 2046; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i16> [[Y:%.*]], i64 0 2047; CHECK-NEXT: [[TMP3:%.*]] = sext i16 [[TMP1]] to i32 2048; CHECK-NEXT: [[TMP4:%.*]] = sext i16 [[TMP2]] to i32 2049; CHECK-NEXT: [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]] 2050; CHECK-NEXT: [[TMP6:%.*]] = ashr i32 [[TMP5]], 30 2051; CHECK-NEXT: [[TMP7:%.*]] = or i32 [[TMP6]], 1 2052; CHECK-NEXT: [[TMP8:%.*]] = sitofp i32 [[TMP3]] to float 2053; CHECK-NEXT: [[TMP9:%.*]] = sitofp i32 [[TMP4]] to float 2054; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]]) 2055; CHECK-NEXT: 
[[TMP11:%.*]] = fmul fast float [[TMP8]], [[TMP10]] 2056; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.trunc.f32(float [[TMP11]]) 2057; CHECK-NEXT: [[TMP13:%.*]] = fneg fast float [[TMP12]] 2058; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP13]], float [[TMP9]], float [[TMP8]]) 2059; CHECK-NEXT: [[TMP15:%.*]] = fptosi float [[TMP12]] to i32 2060; CHECK-NEXT: [[TMP16:%.*]] = call fast float @llvm.fabs.f32(float [[TMP14]]) 2061; CHECK-NEXT: [[TMP17:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]]) 2062; CHECK-NEXT: [[TMP18:%.*]] = fcmp fast oge float [[TMP16]], [[TMP17]] 2063; CHECK-NEXT: [[TMP19:%.*]] = select i1 [[TMP18]], i32 [[TMP7]], i32 0 2064; CHECK-NEXT: [[TMP20:%.*]] = add i32 [[TMP15]], [[TMP19]] 2065; CHECK-NEXT: [[TMP21:%.*]] = shl i32 [[TMP20]], 16 2066; CHECK-NEXT: [[TMP22:%.*]] = ashr i32 [[TMP21]], 16 2067; CHECK-NEXT: [[TMP23:%.*]] = trunc i32 [[TMP22]] to i16 2068; CHECK-NEXT: [[TMP24:%.*]] = insertelement <4 x i16> undef, i16 [[TMP23]], i64 0 2069; CHECK-NEXT: [[TMP25:%.*]] = extractelement <4 x i16> [[X]], i64 1 2070; CHECK-NEXT: [[TMP26:%.*]] = extractelement <4 x i16> [[Y]], i64 1 2071; CHECK-NEXT: [[TMP27:%.*]] = sext i16 [[TMP25]] to i32 2072; CHECK-NEXT: [[TMP28:%.*]] = sext i16 [[TMP26]] to i32 2073; CHECK-NEXT: [[TMP29:%.*]] = xor i32 [[TMP27]], [[TMP28]] 2074; CHECK-NEXT: [[TMP30:%.*]] = ashr i32 [[TMP29]], 30 2075; CHECK-NEXT: [[TMP31:%.*]] = or i32 [[TMP30]], 1 2076; CHECK-NEXT: [[TMP32:%.*]] = sitofp i32 [[TMP27]] to float 2077; CHECK-NEXT: [[TMP33:%.*]] = sitofp i32 [[TMP28]] to float 2078; CHECK-NEXT: [[TMP34:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP33]]) 2079; CHECK-NEXT: [[TMP35:%.*]] = fmul fast float [[TMP32]], [[TMP34]] 2080; CHECK-NEXT: [[TMP36:%.*]] = call fast float @llvm.trunc.f32(float [[TMP35]]) 2081; CHECK-NEXT: [[TMP37:%.*]] = fneg fast float [[TMP36]] 2082; CHECK-NEXT: [[TMP38:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP37]], float [[TMP33]], 
float [[TMP32]]) 2083; CHECK-NEXT: [[TMP39:%.*]] = fptosi float [[TMP36]] to i32 2084; CHECK-NEXT: [[TMP40:%.*]] = call fast float @llvm.fabs.f32(float [[TMP38]]) 2085; CHECK-NEXT: [[TMP41:%.*]] = call fast float @llvm.fabs.f32(float [[TMP33]]) 2086; CHECK-NEXT: [[TMP42:%.*]] = fcmp fast oge float [[TMP40]], [[TMP41]] 2087; CHECK-NEXT: [[TMP43:%.*]] = select i1 [[TMP42]], i32 [[TMP31]], i32 0 2088; CHECK-NEXT: [[TMP44:%.*]] = add i32 [[TMP39]], [[TMP43]] 2089; CHECK-NEXT: [[TMP45:%.*]] = shl i32 [[TMP44]], 16 2090; CHECK-NEXT: [[TMP46:%.*]] = ashr i32 [[TMP45]], 16 2091; CHECK-NEXT: [[TMP47:%.*]] = trunc i32 [[TMP46]] to i16 2092; CHECK-NEXT: [[TMP48:%.*]] = insertelement <4 x i16> [[TMP24]], i16 [[TMP47]], i64 1 2093; CHECK-NEXT: [[TMP49:%.*]] = extractelement <4 x i16> [[X]], i64 2 2094; CHECK-NEXT: [[TMP50:%.*]] = extractelement <4 x i16> [[Y]], i64 2 2095; CHECK-NEXT: [[TMP51:%.*]] = sext i16 [[TMP49]] to i32 2096; CHECK-NEXT: [[TMP52:%.*]] = sext i16 [[TMP50]] to i32 2097; CHECK-NEXT: [[TMP53:%.*]] = xor i32 [[TMP51]], [[TMP52]] 2098; CHECK-NEXT: [[TMP54:%.*]] = ashr i32 [[TMP53]], 30 2099; CHECK-NEXT: [[TMP55:%.*]] = or i32 [[TMP54]], 1 2100; CHECK-NEXT: [[TMP56:%.*]] = sitofp i32 [[TMP51]] to float 2101; CHECK-NEXT: [[TMP57:%.*]] = sitofp i32 [[TMP52]] to float 2102; CHECK-NEXT: [[TMP58:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP57]]) 2103; CHECK-NEXT: [[TMP59:%.*]] = fmul fast float [[TMP56]], [[TMP58]] 2104; CHECK-NEXT: [[TMP60:%.*]] = call fast float @llvm.trunc.f32(float [[TMP59]]) 2105; CHECK-NEXT: [[TMP61:%.*]] = fneg fast float [[TMP60]] 2106; CHECK-NEXT: [[TMP62:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP61]], float [[TMP57]], float [[TMP56]]) 2107; CHECK-NEXT: [[TMP63:%.*]] = fptosi float [[TMP60]] to i32 2108; CHECK-NEXT: [[TMP64:%.*]] = call fast float @llvm.fabs.f32(float [[TMP62]]) 2109; CHECK-NEXT: [[TMP65:%.*]] = call fast float @llvm.fabs.f32(float [[TMP57]]) 2110; CHECK-NEXT: [[TMP66:%.*]] = fcmp fast oge float 
[[TMP64]], [[TMP65]] 2111; CHECK-NEXT: [[TMP67:%.*]] = select i1 [[TMP66]], i32 [[TMP55]], i32 0 2112; CHECK-NEXT: [[TMP68:%.*]] = add i32 [[TMP63]], [[TMP67]] 2113; CHECK-NEXT: [[TMP69:%.*]] = shl i32 [[TMP68]], 16 2114; CHECK-NEXT: [[TMP70:%.*]] = ashr i32 [[TMP69]], 16 2115; CHECK-NEXT: [[TMP71:%.*]] = trunc i32 [[TMP70]] to i16 2116; CHECK-NEXT: [[TMP72:%.*]] = insertelement <4 x i16> [[TMP48]], i16 [[TMP71]], i64 2 2117; CHECK-NEXT: [[TMP73:%.*]] = extractelement <4 x i16> [[X]], i64 3 2118; CHECK-NEXT: [[TMP74:%.*]] = extractelement <4 x i16> [[Y]], i64 3 2119; CHECK-NEXT: [[TMP75:%.*]] = sext i16 [[TMP73]] to i32 2120; CHECK-NEXT: [[TMP76:%.*]] = sext i16 [[TMP74]] to i32 2121; CHECK-NEXT: [[TMP77:%.*]] = xor i32 [[TMP75]], [[TMP76]] 2122; CHECK-NEXT: [[TMP78:%.*]] = ashr i32 [[TMP77]], 30 2123; CHECK-NEXT: [[TMP79:%.*]] = or i32 [[TMP78]], 1 2124; CHECK-NEXT: [[TMP80:%.*]] = sitofp i32 [[TMP75]] to float 2125; CHECK-NEXT: [[TMP81:%.*]] = sitofp i32 [[TMP76]] to float 2126; CHECK-NEXT: [[TMP82:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP81]]) 2127; CHECK-NEXT: [[TMP83:%.*]] = fmul fast float [[TMP80]], [[TMP82]] 2128; CHECK-NEXT: [[TMP84:%.*]] = call fast float @llvm.trunc.f32(float [[TMP83]]) 2129; CHECK-NEXT: [[TMP85:%.*]] = fneg fast float [[TMP84]] 2130; CHECK-NEXT: [[TMP86:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP85]], float [[TMP81]], float [[TMP80]]) 2131; CHECK-NEXT: [[TMP87:%.*]] = fptosi float [[TMP84]] to i32 2132; CHECK-NEXT: [[TMP88:%.*]] = call fast float @llvm.fabs.f32(float [[TMP86]]) 2133; CHECK-NEXT: [[TMP89:%.*]] = call fast float @llvm.fabs.f32(float [[TMP81]]) 2134; CHECK-NEXT: [[TMP90:%.*]] = fcmp fast oge float [[TMP88]], [[TMP89]] 2135; CHECK-NEXT: [[TMP91:%.*]] = select i1 [[TMP90]], i32 [[TMP79]], i32 0 2136; CHECK-NEXT: [[TMP92:%.*]] = add i32 [[TMP87]], [[TMP91]] 2137; CHECK-NEXT: [[TMP93:%.*]] = shl i32 [[TMP92]], 16 2138; CHECK-NEXT: [[TMP94:%.*]] = ashr i32 [[TMP93]], 16 2139; CHECK-NEXT: 
[[TMP95:%.*]] = trunc i32 [[TMP94]] to i16 2140; CHECK-NEXT: [[TMP96:%.*]] = insertelement <4 x i16> [[TMP72]], i16 [[TMP95]], i64 3 2141; CHECK-NEXT: store <4 x i16> [[TMP96]], <4 x i16> addrspace(1)* [[OUT:%.*]], align 8 2142; CHECK-NEXT: ret void 2143; 2144; GCN-LABEL: sdiv_v4i16: 2145; GCN: ; %bb.0: 2146; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 2147; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xb 2148; GCN-NEXT: s_mov_b32 s7, 0xf000 2149; GCN-NEXT: s_mov_b32 s6, -1 2150; GCN-NEXT: s_waitcnt lgkmcnt(0) 2151; GCN-NEXT: s_sext_i32_i16 s8, s2 2152; GCN-NEXT: v_cvt_f32_i32_e32 v0, s8 2153; GCN-NEXT: s_sext_i32_i16 s9, s0 2154; GCN-NEXT: v_cvt_f32_i32_e32 v1, s9 2155; GCN-NEXT: s_xor_b32 s8, s9, s8 2156; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 2157; GCN-NEXT: s_ashr_i32 s2, s2, 16 2158; GCN-NEXT: s_ashr_i32 s8, s8, 30 2159; GCN-NEXT: s_or_b32 s8, s8, 1 2160; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 2161; GCN-NEXT: v_trunc_f32_e32 v2, v2 2162; GCN-NEXT: v_mad_f32 v1, -v2, v0, v1 2163; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| 2164; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 2165; GCN-NEXT: v_cvt_f32_i32_e32 v1, s2 2166; GCN-NEXT: v_mov_b32_e32 v3, s8 2167; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc 2168; GCN-NEXT: s_ashr_i32 s0, s0, 16 2169; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 2170; GCN-NEXT: v_cvt_f32_i32_e32 v2, s0 2171; GCN-NEXT: v_rcp_iflag_f32_e32 v3, v1 2172; GCN-NEXT: s_xor_b32 s0, s0, s2 2173; GCN-NEXT: s_ashr_i32 s0, s0, 30 2174; GCN-NEXT: s_or_b32 s0, s0, 1 2175; GCN-NEXT: v_mul_f32_e32 v3, v2, v3 2176; GCN-NEXT: v_trunc_f32_e32 v3, v3 2177; GCN-NEXT: v_mad_f32 v2, -v3, v1, v2 2178; GCN-NEXT: v_mov_b32_e32 v4, s0 2179; GCN-NEXT: s_sext_i32_i16 s0, s3 2180; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, |v1| 2181; GCN-NEXT: v_cvt_i32_f32_e32 v3, v3 2182; GCN-NEXT: v_cvt_f32_i32_e32 v2, s0 2183; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v4, vcc 2184; GCN-NEXT: s_sext_i32_i16 s2, s1 2185; GCN-NEXT: v_add_i32_e32 v3, vcc, v1, v3 2186; GCN-NEXT: v_cvt_f32_i32_e32 v1, s2 2187; GCN-NEXT: 
v_rcp_iflag_f32_e32 v4, v2 2188; GCN-NEXT: s_xor_b32 s0, s2, s0 2189; GCN-NEXT: s_ashr_i32 s0, s0, 30 2190; GCN-NEXT: s_or_b32 s0, s0, 1 2191; GCN-NEXT: v_mul_f32_e32 v4, v1, v4 2192; GCN-NEXT: v_trunc_f32_e32 v4, v4 2193; GCN-NEXT: v_mad_f32 v1, -v4, v2, v1 2194; GCN-NEXT: v_mov_b32_e32 v5, s0 2195; GCN-NEXT: s_ashr_i32 s0, s3, 16 2196; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v2| 2197; GCN-NEXT: v_cvt_i32_f32_e32 v4, v4 2198; GCN-NEXT: v_cvt_f32_i32_e32 v2, s0 2199; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v5, vcc 2200; GCN-NEXT: s_ashr_i32 s1, s1, 16 2201; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v4 2202; GCN-NEXT: v_cvt_f32_i32_e32 v4, s1 2203; GCN-NEXT: v_rcp_iflag_f32_e32 v5, v2 2204; GCN-NEXT: s_xor_b32 s0, s1, s0 2205; GCN-NEXT: s_ashr_i32 s0, s0, 30 2206; GCN-NEXT: s_or_b32 s0, s0, 1 2207; GCN-NEXT: v_mul_f32_e32 v5, v4, v5 2208; GCN-NEXT: v_trunc_f32_e32 v5, v5 2209; GCN-NEXT: v_mad_f32 v4, -v5, v2, v4 2210; GCN-NEXT: v_cvt_i32_f32_e32 v5, v5 2211; GCN-NEXT: v_mov_b32_e32 v6, s0 2212; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, |v2| 2213; GCN-NEXT: v_cndmask_b32_e32 v2, 0, v6, vcc 2214; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v5 2215; GCN-NEXT: s_mov_b32 s0, 0xffff 2216; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 2217; GCN-NEXT: v_and_b32_e32 v1, s0, v1 2218; GCN-NEXT: v_or_b32_e32 v1, v1, v2 2219; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v3 2220; GCN-NEXT: v_and_b32_e32 v0, s0, v0 2221; GCN-NEXT: v_or_b32_e32 v0, v0, v2 2222; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 2223; GCN-NEXT: s_endpgm 2224 %r = sdiv <4 x i16> %x, %y 2225 store <4 x i16> %r, <4 x i16> addrspace(1)* %out 2226 ret void 2227} 2228 2229define amdgpu_kernel void @srem_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x, <4 x i16> %y) { 2230; CHECK-LABEL: @srem_v4i16( 2231; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i16> [[X:%.*]], i64 0 2232; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i16> [[Y:%.*]], i64 0 2233; CHECK-NEXT: [[TMP3:%.*]] = sext i16 [[TMP1]] to i32 2234; CHECK-NEXT: [[TMP4:%.*]] = 
sext i16 [[TMP2]] to i32 2235; CHECK-NEXT: [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]] 2236; CHECK-NEXT: [[TMP6:%.*]] = ashr i32 [[TMP5]], 30 2237; CHECK-NEXT: [[TMP7:%.*]] = or i32 [[TMP6]], 1 2238; CHECK-NEXT: [[TMP8:%.*]] = sitofp i32 [[TMP3]] to float 2239; CHECK-NEXT: [[TMP9:%.*]] = sitofp i32 [[TMP4]] to float 2240; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]]) 2241; CHECK-NEXT: [[TMP11:%.*]] = fmul fast float [[TMP8]], [[TMP10]] 2242; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.trunc.f32(float [[TMP11]]) 2243; CHECK-NEXT: [[TMP13:%.*]] = fneg fast float [[TMP12]] 2244; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP13]], float [[TMP9]], float [[TMP8]]) 2245; CHECK-NEXT: [[TMP15:%.*]] = fptosi float [[TMP12]] to i32 2246; CHECK-NEXT: [[TMP16:%.*]] = call fast float @llvm.fabs.f32(float [[TMP14]]) 2247; CHECK-NEXT: [[TMP17:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]]) 2248; CHECK-NEXT: [[TMP18:%.*]] = fcmp fast oge float [[TMP16]], [[TMP17]] 2249; CHECK-NEXT: [[TMP19:%.*]] = select i1 [[TMP18]], i32 [[TMP7]], i32 0 2250; CHECK-NEXT: [[TMP20:%.*]] = add i32 [[TMP15]], [[TMP19]] 2251; CHECK-NEXT: [[TMP21:%.*]] = mul i32 [[TMP20]], [[TMP4]] 2252; CHECK-NEXT: [[TMP22:%.*]] = sub i32 [[TMP3]], [[TMP21]] 2253; CHECK-NEXT: [[TMP23:%.*]] = shl i32 [[TMP22]], 16 2254; CHECK-NEXT: [[TMP24:%.*]] = ashr i32 [[TMP23]], 16 2255; CHECK-NEXT: [[TMP25:%.*]] = trunc i32 [[TMP24]] to i16 2256; CHECK-NEXT: [[TMP26:%.*]] = insertelement <4 x i16> undef, i16 [[TMP25]], i64 0 2257; CHECK-NEXT: [[TMP27:%.*]] = extractelement <4 x i16> [[X]], i64 1 2258; CHECK-NEXT: [[TMP28:%.*]] = extractelement <4 x i16> [[Y]], i64 1 2259; CHECK-NEXT: [[TMP29:%.*]] = sext i16 [[TMP27]] to i32 2260; CHECK-NEXT: [[TMP30:%.*]] = sext i16 [[TMP28]] to i32 2261; CHECK-NEXT: [[TMP31:%.*]] = xor i32 [[TMP29]], [[TMP30]] 2262; CHECK-NEXT: [[TMP32:%.*]] = ashr i32 [[TMP31]], 30 2263; CHECK-NEXT: [[TMP33:%.*]] = or i32 
[[TMP32]], 1 2264; CHECK-NEXT: [[TMP34:%.*]] = sitofp i32 [[TMP29]] to float 2265; CHECK-NEXT: [[TMP35:%.*]] = sitofp i32 [[TMP30]] to float 2266; CHECK-NEXT: [[TMP36:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP35]]) 2267; CHECK-NEXT: [[TMP37:%.*]] = fmul fast float [[TMP34]], [[TMP36]] 2268; CHECK-NEXT: [[TMP38:%.*]] = call fast float @llvm.trunc.f32(float [[TMP37]]) 2269; CHECK-NEXT: [[TMP39:%.*]] = fneg fast float [[TMP38]] 2270; CHECK-NEXT: [[TMP40:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP39]], float [[TMP35]], float [[TMP34]]) 2271; CHECK-NEXT: [[TMP41:%.*]] = fptosi float [[TMP38]] to i32 2272; CHECK-NEXT: [[TMP42:%.*]] = call fast float @llvm.fabs.f32(float [[TMP40]]) 2273; CHECK-NEXT: [[TMP43:%.*]] = call fast float @llvm.fabs.f32(float [[TMP35]]) 2274; CHECK-NEXT: [[TMP44:%.*]] = fcmp fast oge float [[TMP42]], [[TMP43]] 2275; CHECK-NEXT: [[TMP45:%.*]] = select i1 [[TMP44]], i32 [[TMP33]], i32 0 2276; CHECK-NEXT: [[TMP46:%.*]] = add i32 [[TMP41]], [[TMP45]] 2277; CHECK-NEXT: [[TMP47:%.*]] = mul i32 [[TMP46]], [[TMP30]] 2278; CHECK-NEXT: [[TMP48:%.*]] = sub i32 [[TMP29]], [[TMP47]] 2279; CHECK-NEXT: [[TMP49:%.*]] = shl i32 [[TMP48]], 16 2280; CHECK-NEXT: [[TMP50:%.*]] = ashr i32 [[TMP49]], 16 2281; CHECK-NEXT: [[TMP51:%.*]] = trunc i32 [[TMP50]] to i16 2282; CHECK-NEXT: [[TMP52:%.*]] = insertelement <4 x i16> [[TMP26]], i16 [[TMP51]], i64 1 2283; CHECK-NEXT: [[TMP53:%.*]] = extractelement <4 x i16> [[X]], i64 2 2284; CHECK-NEXT: [[TMP54:%.*]] = extractelement <4 x i16> [[Y]], i64 2 2285; CHECK-NEXT: [[TMP55:%.*]] = sext i16 [[TMP53]] to i32 2286; CHECK-NEXT: [[TMP56:%.*]] = sext i16 [[TMP54]] to i32 2287; CHECK-NEXT: [[TMP57:%.*]] = xor i32 [[TMP55]], [[TMP56]] 2288; CHECK-NEXT: [[TMP58:%.*]] = ashr i32 [[TMP57]], 30 2289; CHECK-NEXT: [[TMP59:%.*]] = or i32 [[TMP58]], 1 2290; CHECK-NEXT: [[TMP60:%.*]] = sitofp i32 [[TMP55]] to float 2291; CHECK-NEXT: [[TMP61:%.*]] = sitofp i32 [[TMP56]] to float 2292; CHECK-NEXT: [[TMP62:%.*]] 
= call fast float @llvm.amdgcn.rcp.f32(float [[TMP61]]) 2293; CHECK-NEXT: [[TMP63:%.*]] = fmul fast float [[TMP60]], [[TMP62]] 2294; CHECK-NEXT: [[TMP64:%.*]] = call fast float @llvm.trunc.f32(float [[TMP63]]) 2295; CHECK-NEXT: [[TMP65:%.*]] = fneg fast float [[TMP64]] 2296; CHECK-NEXT: [[TMP66:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP65]], float [[TMP61]], float [[TMP60]]) 2297; CHECK-NEXT: [[TMP67:%.*]] = fptosi float [[TMP64]] to i32 2298; CHECK-NEXT: [[TMP68:%.*]] = call fast float @llvm.fabs.f32(float [[TMP66]]) 2299; CHECK-NEXT: [[TMP69:%.*]] = call fast float @llvm.fabs.f32(float [[TMP61]]) 2300; CHECK-NEXT: [[TMP70:%.*]] = fcmp fast oge float [[TMP68]], [[TMP69]] 2301; CHECK-NEXT: [[TMP71:%.*]] = select i1 [[TMP70]], i32 [[TMP59]], i32 0 2302; CHECK-NEXT: [[TMP72:%.*]] = add i32 [[TMP67]], [[TMP71]] 2303; CHECK-NEXT: [[TMP73:%.*]] = mul i32 [[TMP72]], [[TMP56]] 2304; CHECK-NEXT: [[TMP74:%.*]] = sub i32 [[TMP55]], [[TMP73]] 2305; CHECK-NEXT: [[TMP75:%.*]] = shl i32 [[TMP74]], 16 2306; CHECK-NEXT: [[TMP76:%.*]] = ashr i32 [[TMP75]], 16 2307; CHECK-NEXT: [[TMP77:%.*]] = trunc i32 [[TMP76]] to i16 2308; CHECK-NEXT: [[TMP78:%.*]] = insertelement <4 x i16> [[TMP52]], i16 [[TMP77]], i64 2 2309; CHECK-NEXT: [[TMP79:%.*]] = extractelement <4 x i16> [[X]], i64 3 2310; CHECK-NEXT: [[TMP80:%.*]] = extractelement <4 x i16> [[Y]], i64 3 2311; CHECK-NEXT: [[TMP81:%.*]] = sext i16 [[TMP79]] to i32 2312; CHECK-NEXT: [[TMP82:%.*]] = sext i16 [[TMP80]] to i32 2313; CHECK-NEXT: [[TMP83:%.*]] = xor i32 [[TMP81]], [[TMP82]] 2314; CHECK-NEXT: [[TMP84:%.*]] = ashr i32 [[TMP83]], 30 2315; CHECK-NEXT: [[TMP85:%.*]] = or i32 [[TMP84]], 1 2316; CHECK-NEXT: [[TMP86:%.*]] = sitofp i32 [[TMP81]] to float 2317; CHECK-NEXT: [[TMP87:%.*]] = sitofp i32 [[TMP82]] to float 2318; CHECK-NEXT: [[TMP88:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP87]]) 2319; CHECK-NEXT: [[TMP89:%.*]] = fmul fast float [[TMP86]], [[TMP88]] 2320; CHECK-NEXT: [[TMP90:%.*]] = call fast 
float @llvm.trunc.f32(float [[TMP89]]) 2321; CHECK-NEXT: [[TMP91:%.*]] = fneg fast float [[TMP90]] 2322; CHECK-NEXT: [[TMP92:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP91]], float [[TMP87]], float [[TMP86]]) 2323; CHECK-NEXT: [[TMP93:%.*]] = fptosi float [[TMP90]] to i32 2324; CHECK-NEXT: [[TMP94:%.*]] = call fast float @llvm.fabs.f32(float [[TMP92]]) 2325; CHECK-NEXT: [[TMP95:%.*]] = call fast float @llvm.fabs.f32(float [[TMP87]]) 2326; CHECK-NEXT: [[TMP96:%.*]] = fcmp fast oge float [[TMP94]], [[TMP95]] 2327; CHECK-NEXT: [[TMP97:%.*]] = select i1 [[TMP96]], i32 [[TMP85]], i32 0 2328; CHECK-NEXT: [[TMP98:%.*]] = add i32 [[TMP93]], [[TMP97]] 2329; CHECK-NEXT: [[TMP99:%.*]] = mul i32 [[TMP98]], [[TMP82]] 2330; CHECK-NEXT: [[TMP100:%.*]] = sub i32 [[TMP81]], [[TMP99]] 2331; CHECK-NEXT: [[TMP101:%.*]] = shl i32 [[TMP100]], 16 2332; CHECK-NEXT: [[TMP102:%.*]] = ashr i32 [[TMP101]], 16 2333; CHECK-NEXT: [[TMP103:%.*]] = trunc i32 [[TMP102]] to i16 2334; CHECK-NEXT: [[TMP104:%.*]] = insertelement <4 x i16> [[TMP78]], i16 [[TMP103]], i64 3 2335; CHECK-NEXT: store <4 x i16> [[TMP104]], <4 x i16> addrspace(1)* [[OUT:%.*]], align 8 2336; CHECK-NEXT: ret void 2337; 2338; GCN-LABEL: srem_v4i16: 2339; GCN: ; %bb.0: 2340; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 2341; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xb 2342; GCN-NEXT: s_mov_b32 s7, 0xf000 2343; GCN-NEXT: s_mov_b32 s6, -1 2344; GCN-NEXT: s_waitcnt lgkmcnt(0) 2345; GCN-NEXT: s_sext_i32_i16 s8, s2 2346; GCN-NEXT: v_cvt_f32_i32_e32 v0, s8 2347; GCN-NEXT: s_sext_i32_i16 s9, s0 2348; GCN-NEXT: v_cvt_f32_i32_e32 v1, s9 2349; GCN-NEXT: s_xor_b32 s8, s9, s8 2350; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 2351; GCN-NEXT: s_ashr_i32 s8, s8, 30 2352; GCN-NEXT: s_or_b32 s8, s8, 1 2353; GCN-NEXT: v_mov_b32_e32 v3, s8 2354; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 2355; GCN-NEXT: v_trunc_f32_e32 v2, v2 2356; GCN-NEXT: v_mad_f32 v1, -v2, v0, v1 2357; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 2358; GCN-NEXT: v_cmp_ge_f32_e64 vcc, 
|v1|, |v0| 2359; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc 2360; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 2361; GCN-NEXT: v_mul_lo_u32 v0, v0, s2 2362; GCN-NEXT: s_ashr_i32 s2, s2, 16 2363; GCN-NEXT: v_cvt_f32_i32_e32 v1, s2 2364; GCN-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 2365; GCN-NEXT: s_ashr_i32 s0, s0, 16 2366; GCN-NEXT: v_cvt_f32_i32_e32 v2, s0 2367; GCN-NEXT: v_rcp_iflag_f32_e32 v3, v1 2368; GCN-NEXT: s_xor_b32 s8, s0, s2 2369; GCN-NEXT: s_ashr_i32 s8, s8, 30 2370; GCN-NEXT: s_or_b32 s8, s8, 1 2371; GCN-NEXT: v_mul_f32_e32 v3, v2, v3 2372; GCN-NEXT: v_trunc_f32_e32 v3, v3 2373; GCN-NEXT: v_mad_f32 v2, -v3, v1, v2 2374; GCN-NEXT: v_cvt_i32_f32_e32 v3, v3 2375; GCN-NEXT: v_mov_b32_e32 v4, s8 2376; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, |v1| 2377; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v4, vcc 2378; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v3 2379; GCN-NEXT: v_mul_lo_u32 v1, v1, s2 2380; GCN-NEXT: s_sext_i32_i16 s2, s3 2381; GCN-NEXT: v_cvt_f32_i32_e32 v2, s2 2382; GCN-NEXT: v_sub_i32_e32 v3, vcc, s0, v1 2383; GCN-NEXT: s_sext_i32_i16 s0, s1 2384; GCN-NEXT: v_cvt_f32_i32_e32 v1, s0 2385; GCN-NEXT: v_rcp_iflag_f32_e32 v4, v2 2386; GCN-NEXT: s_xor_b32 s0, s0, s2 2387; GCN-NEXT: s_ashr_i32 s0, s0, 30 2388; GCN-NEXT: s_or_b32 s0, s0, 1 2389; GCN-NEXT: v_mul_f32_e32 v4, v1, v4 2390; GCN-NEXT: v_trunc_f32_e32 v4, v4 2391; GCN-NEXT: v_mad_f32 v1, -v4, v2, v1 2392; GCN-NEXT: v_mov_b32_e32 v5, s0 2393; GCN-NEXT: s_ashr_i32 s0, s3, 16 2394; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v2| 2395; GCN-NEXT: v_cvt_i32_f32_e32 v4, v4 2396; GCN-NEXT: v_cvt_f32_i32_e32 v2, s0 2397; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v5, vcc 2398; GCN-NEXT: s_ashr_i32 s2, s1, 16 2399; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v4 2400; GCN-NEXT: v_cvt_f32_i32_e32 v4, s2 2401; GCN-NEXT: v_rcp_iflag_f32_e32 v5, v2 2402; GCN-NEXT: v_mul_lo_u32 v1, v1, s3 2403; GCN-NEXT: s_xor_b32 s3, s2, s0 2404; GCN-NEXT: s_ashr_i32 s3, s3, 30 2405; GCN-NEXT: v_mul_f32_e32 v5, v4, v5 2406; GCN-NEXT: v_trunc_f32_e32 v5, v5 2407; GCN-NEXT: 
v_mad_f32 v4, -v5, v2, v4 2408; GCN-NEXT: v_cvt_i32_f32_e32 v5, v5 2409; GCN-NEXT: s_or_b32 s3, s3, 1 2410; GCN-NEXT: v_mov_b32_e32 v6, s3 2411; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, |v2| 2412; GCN-NEXT: v_cndmask_b32_e32 v2, 0, v6, vcc 2413; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v5 2414; GCN-NEXT: v_mul_lo_u32 v2, v2, s0 2415; GCN-NEXT: s_mov_b32 s0, 0xffff 2416; GCN-NEXT: v_sub_i32_e32 v1, vcc, s1, v1 2417; GCN-NEXT: v_and_b32_e32 v1, s0, v1 2418; GCN-NEXT: v_sub_i32_e32 v2, vcc, s2, v2 2419; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 2420; GCN-NEXT: v_or_b32_e32 v1, v1, v2 2421; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v3 2422; GCN-NEXT: v_and_b32_e32 v0, s0, v0 2423; GCN-NEXT: v_or_b32_e32 v0, v0, v2 2424; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 2425; GCN-NEXT: s_endpgm 2426 %r = srem <4 x i16> %x, %y 2427 store <4 x i16> %r, <4 x i16> addrspace(1)* %out 2428 ret void 2429} 2430 2431define amdgpu_kernel void @udiv_i3(i3 addrspace(1)* %out, i3 %x, i3 %y) { 2432; CHECK-LABEL: @udiv_i3( 2433; CHECK-NEXT: [[TMP1:%.*]] = zext i3 [[X:%.*]] to i32 2434; CHECK-NEXT: [[TMP2:%.*]] = zext i3 [[Y:%.*]] to i32 2435; CHECK-NEXT: [[TMP3:%.*]] = uitofp i32 [[TMP1]] to float 2436; CHECK-NEXT: [[TMP4:%.*]] = uitofp i32 [[TMP2]] to float 2437; CHECK-NEXT: [[TMP5:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP4]]) 2438; CHECK-NEXT: [[TMP6:%.*]] = fmul fast float [[TMP3]], [[TMP5]] 2439; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.trunc.f32(float [[TMP6]]) 2440; CHECK-NEXT: [[TMP8:%.*]] = fneg fast float [[TMP7]] 2441; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP8]], float [[TMP4]], float [[TMP3]]) 2442; CHECK-NEXT: [[TMP10:%.*]] = fptoui float [[TMP7]] to i32 2443; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]]) 2444; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.fabs.f32(float [[TMP4]]) 2445; CHECK-NEXT: [[TMP13:%.*]] = fcmp fast oge float [[TMP11]], [[TMP12]] 2446; CHECK-NEXT: [[TMP14:%.*]] = 
select i1 [[TMP13]], i32 1, i32 0 2447; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP10]], [[TMP14]] 2448; CHECK-NEXT: [[TMP16:%.*]] = and i32 [[TMP15]], 7 2449; CHECK-NEXT: [[TMP17:%.*]] = trunc i32 [[TMP16]] to i3 2450; CHECK-NEXT: store i3 [[TMP17]], i3 addrspace(1)* [[OUT:%.*]], align 1 2451; CHECK-NEXT: ret void 2452; 2453; GCN-LABEL: udiv_i3: 2454; GCN: ; %bb.0: 2455; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 2456; GCN-NEXT: s_load_dword s0, s[0:1], 0xb 2457; GCN-NEXT: s_mov_b32 s7, 0xf000 2458; GCN-NEXT: s_mov_b32 s6, -1 2459; GCN-NEXT: s_waitcnt lgkmcnt(0) 2460; GCN-NEXT: s_bfe_u32 s1, s0, 0x30008 2461; GCN-NEXT: v_cvt_f32_ubyte0_e32 v0, s1 2462; GCN-NEXT: v_rcp_iflag_f32_e32 v1, v0 2463; GCN-NEXT: s_and_b32 s0, s0, 7 2464; GCN-NEXT: v_cvt_f32_ubyte0_e32 v2, s0 2465; GCN-NEXT: v_mul_f32_e32 v1, v2, v1 2466; GCN-NEXT: v_trunc_f32_e32 v1, v1 2467; GCN-NEXT: v_cvt_u32_f32_e32 v3, v1 2468; GCN-NEXT: v_mad_f32 v1, -v1, v0, v2 2469; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 2470; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc 2471; GCN-NEXT: v_and_b32_e32 v0, 7, v0 2472; GCN-NEXT: buffer_store_byte v0, off, s[4:7], 0 2473; GCN-NEXT: s_endpgm 2474 %r = udiv i3 %x, %y 2475 store i3 %r, i3 addrspace(1)* %out 2476 ret void 2477} 2478 2479define amdgpu_kernel void @urem_i3(i3 addrspace(1)* %out, i3 %x, i3 %y) { 2480; CHECK-LABEL: @urem_i3( 2481; CHECK-NEXT: [[TMP1:%.*]] = zext i3 [[X:%.*]] to i32 2482; CHECK-NEXT: [[TMP2:%.*]] = zext i3 [[Y:%.*]] to i32 2483; CHECK-NEXT: [[TMP3:%.*]] = uitofp i32 [[TMP1]] to float 2484; CHECK-NEXT: [[TMP4:%.*]] = uitofp i32 [[TMP2]] to float 2485; CHECK-NEXT: [[TMP5:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP4]]) 2486; CHECK-NEXT: [[TMP6:%.*]] = fmul fast float [[TMP3]], [[TMP5]] 2487; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.trunc.f32(float [[TMP6]]) 2488; CHECK-NEXT: [[TMP8:%.*]] = fneg fast float [[TMP7]] 2489; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP8]], float [[TMP4]], 
float [[TMP3]]) 2490; CHECK-NEXT: [[TMP10:%.*]] = fptoui float [[TMP7]] to i32 2491; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]]) 2492; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.fabs.f32(float [[TMP4]]) 2493; CHECK-NEXT: [[TMP13:%.*]] = fcmp fast oge float [[TMP11]], [[TMP12]] 2494; CHECK-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 1, i32 0 2495; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP10]], [[TMP14]] 2496; CHECK-NEXT: [[TMP16:%.*]] = mul i32 [[TMP15]], [[TMP2]] 2497; CHECK-NEXT: [[TMP17:%.*]] = sub i32 [[TMP1]], [[TMP16]] 2498; CHECK-NEXT: [[TMP18:%.*]] = and i32 [[TMP17]], 7 2499; CHECK-NEXT: [[TMP19:%.*]] = trunc i32 [[TMP18]] to i3 2500; CHECK-NEXT: store i3 [[TMP19]], i3 addrspace(1)* [[OUT:%.*]], align 1 2501; CHECK-NEXT: ret void 2502; 2503; GCN-LABEL: urem_i3: 2504; GCN: ; %bb.0: 2505; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 2506; GCN-NEXT: s_load_dword s0, s[0:1], 0xb 2507; GCN-NEXT: s_mov_b32 s7, 0xf000 2508; GCN-NEXT: s_mov_b32 s6, -1 2509; GCN-NEXT: s_waitcnt lgkmcnt(0) 2510; GCN-NEXT: s_bfe_u32 s1, s0, 0x30008 2511; GCN-NEXT: v_cvt_f32_ubyte0_e32 v0, s1 2512; GCN-NEXT: v_rcp_iflag_f32_e32 v1, v0 2513; GCN-NEXT: s_and_b32 s2, s0, 7 2514; GCN-NEXT: v_cvt_f32_ubyte0_e32 v2, s2 2515; GCN-NEXT: s_lshr_b32 s1, s0, 8 2516; GCN-NEXT: v_mul_f32_e32 v1, v2, v1 2517; GCN-NEXT: v_trunc_f32_e32 v1, v1 2518; GCN-NEXT: v_cvt_u32_f32_e32 v3, v1 2519; GCN-NEXT: v_mad_f32 v1, -v1, v0, v2 2520; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 2521; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc 2522; GCN-NEXT: v_mul_lo_u32 v0, v0, s1 2523; GCN-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 2524; GCN-NEXT: v_and_b32_e32 v0, 7, v0 2525; GCN-NEXT: buffer_store_byte v0, off, s[4:7], 0 2526; GCN-NEXT: s_endpgm 2527 %r = urem i3 %x, %y 2528 store i3 %r, i3 addrspace(1)* %out 2529 ret void 2530} 2531 2532define amdgpu_kernel void @sdiv_i3(i3 addrspace(1)* %out, i3 %x, i3 %y) { 2533; CHECK-LABEL: @sdiv_i3( 2534; CHECK-NEXT: [[TMP1:%.*]] = sext i3 
[[X:%.*]] to i32 2535; CHECK-NEXT: [[TMP2:%.*]] = sext i3 [[Y:%.*]] to i32 2536; CHECK-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]] 2537; CHECK-NEXT: [[TMP4:%.*]] = ashr i32 [[TMP3]], 30 2538; CHECK-NEXT: [[TMP5:%.*]] = or i32 [[TMP4]], 1 2539; CHECK-NEXT: [[TMP6:%.*]] = sitofp i32 [[TMP1]] to float 2540; CHECK-NEXT: [[TMP7:%.*]] = sitofp i32 [[TMP2]] to float 2541; CHECK-NEXT: [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]]) 2542; CHECK-NEXT: [[TMP9:%.*]] = fmul fast float [[TMP6]], [[TMP8]] 2543; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.trunc.f32(float [[TMP9]]) 2544; CHECK-NEXT: [[TMP11:%.*]] = fneg fast float [[TMP10]] 2545; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP11]], float [[TMP7]], float [[TMP6]]) 2546; CHECK-NEXT: [[TMP13:%.*]] = fptosi float [[TMP10]] to i32 2547; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP12]]) 2548; CHECK-NEXT: [[TMP15:%.*]] = call fast float @llvm.fabs.f32(float [[TMP7]]) 2549; CHECK-NEXT: [[TMP16:%.*]] = fcmp fast oge float [[TMP14]], [[TMP15]] 2550; CHECK-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP5]], i32 0 2551; CHECK-NEXT: [[TMP18:%.*]] = add i32 [[TMP13]], [[TMP17]] 2552; CHECK-NEXT: [[TMP19:%.*]] = shl i32 [[TMP18]], 29 2553; CHECK-NEXT: [[TMP20:%.*]] = ashr i32 [[TMP19]], 29 2554; CHECK-NEXT: [[TMP21:%.*]] = trunc i32 [[TMP20]] to i3 2555; CHECK-NEXT: store i3 [[TMP21]], i3 addrspace(1)* [[OUT:%.*]], align 1 2556; CHECK-NEXT: ret void 2557; 2558; GCN-LABEL: sdiv_i3: 2559; GCN: ; %bb.0: 2560; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 2561; GCN-NEXT: s_load_dword s0, s[0:1], 0xb 2562; GCN-NEXT: s_mov_b32 s7, 0xf000 2563; GCN-NEXT: s_mov_b32 s6, -1 2564; GCN-NEXT: s_waitcnt lgkmcnt(0) 2565; GCN-NEXT: s_bfe_i32 s1, s0, 0x30008 2566; GCN-NEXT: v_cvt_f32_i32_e32 v0, s1 2567; GCN-NEXT: s_bfe_i32 s0, s0, 0x30000 2568; GCN-NEXT: v_cvt_f32_i32_e32 v1, s0 2569; GCN-NEXT: s_xor_b32 s0, s0, s1 2570; GCN-NEXT: v_rcp_iflag_f32_e32 
v2, v0 2571; GCN-NEXT: s_ashr_i32 s0, s0, 30 2572; GCN-NEXT: s_or_b32 s0, s0, 1 2573; GCN-NEXT: v_mov_b32_e32 v3, s0 2574; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 2575; GCN-NEXT: v_trunc_f32_e32 v2, v2 2576; GCN-NEXT: v_mad_f32 v1, -v2, v0, v1 2577; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 2578; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| 2579; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc 2580; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 2581; GCN-NEXT: v_and_b32_e32 v0, 7, v0 2582; GCN-NEXT: buffer_store_byte v0, off, s[4:7], 0 2583; GCN-NEXT: s_endpgm 2584 %r = sdiv i3 %x, %y 2585 store i3 %r, i3 addrspace(1)* %out 2586 ret void 2587} 2588 2589define amdgpu_kernel void @srem_i3(i3 addrspace(1)* %out, i3 %x, i3 %y) { 2590; CHECK-LABEL: @srem_i3( 2591; CHECK-NEXT: [[TMP1:%.*]] = sext i3 [[X:%.*]] to i32 2592; CHECK-NEXT: [[TMP2:%.*]] = sext i3 [[Y:%.*]] to i32 2593; CHECK-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]] 2594; CHECK-NEXT: [[TMP4:%.*]] = ashr i32 [[TMP3]], 30 2595; CHECK-NEXT: [[TMP5:%.*]] = or i32 [[TMP4]], 1 2596; CHECK-NEXT: [[TMP6:%.*]] = sitofp i32 [[TMP1]] to float 2597; CHECK-NEXT: [[TMP7:%.*]] = sitofp i32 [[TMP2]] to float 2598; CHECK-NEXT: [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]]) 2599; CHECK-NEXT: [[TMP9:%.*]] = fmul fast float [[TMP6]], [[TMP8]] 2600; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.trunc.f32(float [[TMP9]]) 2601; CHECK-NEXT: [[TMP11:%.*]] = fneg fast float [[TMP10]] 2602; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP11]], float [[TMP7]], float [[TMP6]]) 2603; CHECK-NEXT: [[TMP13:%.*]] = fptosi float [[TMP10]] to i32 2604; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP12]]) 2605; CHECK-NEXT: [[TMP15:%.*]] = call fast float @llvm.fabs.f32(float [[TMP7]]) 2606; CHECK-NEXT: [[TMP16:%.*]] = fcmp fast oge float [[TMP14]], [[TMP15]] 2607; CHECK-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP5]], i32 0 2608; CHECK-NEXT: [[TMP18:%.*]] = add i32 [[TMP13]], 
[[TMP17]] 2609; CHECK-NEXT: [[TMP19:%.*]] = mul i32 [[TMP18]], [[TMP2]] 2610; CHECK-NEXT: [[TMP20:%.*]] = sub i32 [[TMP1]], [[TMP19]] 2611; CHECK-NEXT: [[TMP21:%.*]] = shl i32 [[TMP20]], 29 2612; CHECK-NEXT: [[TMP22:%.*]] = ashr i32 [[TMP21]], 29 2613; CHECK-NEXT: [[TMP23:%.*]] = trunc i32 [[TMP22]] to i3 2614; CHECK-NEXT: store i3 [[TMP23]], i3 addrspace(1)* [[OUT:%.*]], align 1 2615; CHECK-NEXT: ret void 2616; 2617; GCN-LABEL: srem_i3: 2618; GCN: ; %bb.0: 2619; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 2620; GCN-NEXT: s_load_dword s0, s[0:1], 0xb 2621; GCN-NEXT: s_mov_b32 s7, 0xf000 2622; GCN-NEXT: s_mov_b32 s6, -1 2623; GCN-NEXT: s_waitcnt lgkmcnt(0) 2624; GCN-NEXT: s_bfe_i32 s1, s0, 0x30008 2625; GCN-NEXT: v_cvt_f32_i32_e32 v0, s1 2626; GCN-NEXT: s_bfe_i32 s3, s0, 0x30000 2627; GCN-NEXT: v_cvt_f32_i32_e32 v1, s3 2628; GCN-NEXT: s_xor_b32 s1, s3, s1 2629; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 2630; GCN-NEXT: s_ashr_i32 s1, s1, 30 2631; GCN-NEXT: s_or_b32 s1, s1, 1 2632; GCN-NEXT: v_mov_b32_e32 v3, s1 2633; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 2634; GCN-NEXT: v_trunc_f32_e32 v2, v2 2635; GCN-NEXT: v_mad_f32 v1, -v2, v0, v1 2636; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 2637; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| 2638; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc 2639; GCN-NEXT: s_lshr_b32 s2, s0, 8 2640; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 2641; GCN-NEXT: v_mul_lo_u32 v0, v0, s2 2642; GCN-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 2643; GCN-NEXT: v_and_b32_e32 v0, 7, v0 2644; GCN-NEXT: buffer_store_byte v0, off, s[4:7], 0 2645; GCN-NEXT: s_endpgm 2646 %r = srem i3 %x, %y 2647 store i3 %r, i3 addrspace(1)* %out 2648 ret void 2649} 2650 2651define amdgpu_kernel void @udiv_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x, <3 x i16> %y) { 2652; CHECK-LABEL: @udiv_v3i16( 2653; CHECK-NEXT: [[TMP1:%.*]] = extractelement <3 x i16> [[X:%.*]], i64 0 2654; CHECK-NEXT: [[TMP2:%.*]] = extractelement <3 x i16> [[Y:%.*]], i64 0 2655; CHECK-NEXT: [[TMP3:%.*]] = zext i16 [[TMP1]] to 
i32 2656; CHECK-NEXT: [[TMP4:%.*]] = zext i16 [[TMP2]] to i32 2657; CHECK-NEXT: [[TMP5:%.*]] = uitofp i32 [[TMP3]] to float 2658; CHECK-NEXT: [[TMP6:%.*]] = uitofp i32 [[TMP4]] to float 2659; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP6]]) 2660; CHECK-NEXT: [[TMP8:%.*]] = fmul fast float [[TMP5]], [[TMP7]] 2661; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.trunc.f32(float [[TMP8]]) 2662; CHECK-NEXT: [[TMP10:%.*]] = fneg fast float [[TMP9]] 2663; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP10]], float [[TMP6]], float [[TMP5]]) 2664; CHECK-NEXT: [[TMP12:%.*]] = fptoui float [[TMP9]] to i32 2665; CHECK-NEXT: [[TMP13:%.*]] = call fast float @llvm.fabs.f32(float [[TMP11]]) 2666; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP6]]) 2667; CHECK-NEXT: [[TMP15:%.*]] = fcmp fast oge float [[TMP13]], [[TMP14]] 2668; CHECK-NEXT: [[TMP16:%.*]] = select i1 [[TMP15]], i32 1, i32 0 2669; CHECK-NEXT: [[TMP17:%.*]] = add i32 [[TMP12]], [[TMP16]] 2670; CHECK-NEXT: [[TMP18:%.*]] = and i32 [[TMP17]], 65535 2671; CHECK-NEXT: [[TMP19:%.*]] = trunc i32 [[TMP18]] to i16 2672; CHECK-NEXT: [[TMP20:%.*]] = insertelement <3 x i16> undef, i16 [[TMP19]], i64 0 2673; CHECK-NEXT: [[TMP21:%.*]] = extractelement <3 x i16> [[X]], i64 1 2674; CHECK-NEXT: [[TMP22:%.*]] = extractelement <3 x i16> [[Y]], i64 1 2675; CHECK-NEXT: [[TMP23:%.*]] = zext i16 [[TMP21]] to i32 2676; CHECK-NEXT: [[TMP24:%.*]] = zext i16 [[TMP22]] to i32 2677; CHECK-NEXT: [[TMP25:%.*]] = uitofp i32 [[TMP23]] to float 2678; CHECK-NEXT: [[TMP26:%.*]] = uitofp i32 [[TMP24]] to float 2679; CHECK-NEXT: [[TMP27:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP26]]) 2680; CHECK-NEXT: [[TMP28:%.*]] = fmul fast float [[TMP25]], [[TMP27]] 2681; CHECK-NEXT: [[TMP29:%.*]] = call fast float @llvm.trunc.f32(float [[TMP28]]) 2682; CHECK-NEXT: [[TMP30:%.*]] = fneg fast float [[TMP29]] 2683; CHECK-NEXT: [[TMP31:%.*]] = call fast float 
@llvm.amdgcn.fmad.ftz.f32(float [[TMP30]], float [[TMP26]], float [[TMP25]]) 2684; CHECK-NEXT: [[TMP32:%.*]] = fptoui float [[TMP29]] to i32 2685; CHECK-NEXT: [[TMP33:%.*]] = call fast float @llvm.fabs.f32(float [[TMP31]]) 2686; CHECK-NEXT: [[TMP34:%.*]] = call fast float @llvm.fabs.f32(float [[TMP26]]) 2687; CHECK-NEXT: [[TMP35:%.*]] = fcmp fast oge float [[TMP33]], [[TMP34]] 2688; CHECK-NEXT: [[TMP36:%.*]] = select i1 [[TMP35]], i32 1, i32 0 2689; CHECK-NEXT: [[TMP37:%.*]] = add i32 [[TMP32]], [[TMP36]] 2690; CHECK-NEXT: [[TMP38:%.*]] = and i32 [[TMP37]], 65535 2691; CHECK-NEXT: [[TMP39:%.*]] = trunc i32 [[TMP38]] to i16 2692; CHECK-NEXT: [[TMP40:%.*]] = insertelement <3 x i16> [[TMP20]], i16 [[TMP39]], i64 1 2693; CHECK-NEXT: [[TMP41:%.*]] = extractelement <3 x i16> [[X]], i64 2 2694; CHECK-NEXT: [[TMP42:%.*]] = extractelement <3 x i16> [[Y]], i64 2 2695; CHECK-NEXT: [[TMP43:%.*]] = zext i16 [[TMP41]] to i32 2696; CHECK-NEXT: [[TMP44:%.*]] = zext i16 [[TMP42]] to i32 2697; CHECK-NEXT: [[TMP45:%.*]] = uitofp i32 [[TMP43]] to float 2698; CHECK-NEXT: [[TMP46:%.*]] = uitofp i32 [[TMP44]] to float 2699; CHECK-NEXT: [[TMP47:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP46]]) 2700; CHECK-NEXT: [[TMP48:%.*]] = fmul fast float [[TMP45]], [[TMP47]] 2701; CHECK-NEXT: [[TMP49:%.*]] = call fast float @llvm.trunc.f32(float [[TMP48]]) 2702; CHECK-NEXT: [[TMP50:%.*]] = fneg fast float [[TMP49]] 2703; CHECK-NEXT: [[TMP51:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP50]], float [[TMP46]], float [[TMP45]]) 2704; CHECK-NEXT: [[TMP52:%.*]] = fptoui float [[TMP49]] to i32 2705; CHECK-NEXT: [[TMP53:%.*]] = call fast float @llvm.fabs.f32(float [[TMP51]]) 2706; CHECK-NEXT: [[TMP54:%.*]] = call fast float @llvm.fabs.f32(float [[TMP46]]) 2707; CHECK-NEXT: [[TMP55:%.*]] = fcmp fast oge float [[TMP53]], [[TMP54]] 2708; CHECK-NEXT: [[TMP56:%.*]] = select i1 [[TMP55]], i32 1, i32 0 2709; CHECK-NEXT: [[TMP57:%.*]] = add i32 [[TMP52]], [[TMP56]] 2710; CHECK-NEXT: 
[[TMP58:%.*]] = and i32 [[TMP57]], 65535 2711; CHECK-NEXT: [[TMP59:%.*]] = trunc i32 [[TMP58]] to i16 2712; CHECK-NEXT: [[TMP60:%.*]] = insertelement <3 x i16> [[TMP40]], i16 [[TMP59]], i64 2 2713; CHECK-NEXT: store <3 x i16> [[TMP60]], <3 x i16> addrspace(1)* [[OUT:%.*]], align 8 2714; CHECK-NEXT: ret void 2715; 2716; GCN-LABEL: udiv_v3i16: 2717; GCN: ; %bb.0: 2718; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 2719; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb 2720; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 2721; GCN-NEXT: s_mov_b32 s8, 0xffff 2722; GCN-NEXT: s_mov_b32 s7, 0xf000 2723; GCN-NEXT: s_waitcnt lgkmcnt(0) 2724; GCN-NEXT: s_and_b32 s6, s0, s8 2725; GCN-NEXT: v_cvt_f32_u32_e32 v0, s6 2726; GCN-NEXT: s_and_b32 s6, s2, s8 2727; GCN-NEXT: s_lshr_b32 s0, s0, 16 2728; GCN-NEXT: v_cvt_f32_u32_e32 v3, s0 2729; GCN-NEXT: v_cvt_f32_u32_e32 v1, s6 2730; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 2731; GCN-NEXT: s_lshr_b32 s0, s2, 16 2732; GCN-NEXT: v_cvt_f32_u32_e32 v4, s0 2733; GCN-NEXT: v_rcp_iflag_f32_e32 v5, v3 2734; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 2735; GCN-NEXT: v_trunc_f32_e32 v2, v2 2736; GCN-NEXT: v_mad_f32 v1, -v2, v0, v1 2737; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2 2738; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 2739; GCN-NEXT: v_mul_f32_e32 v1, v4, v5 2740; GCN-NEXT: v_trunc_f32_e32 v1, v1 2741; GCN-NEXT: s_and_b32 s0, s1, s8 2742; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc 2743; GCN-NEXT: v_mad_f32 v2, -v1, v3, v4 2744; GCN-NEXT: v_cvt_f32_u32_e32 v4, s0 2745; GCN-NEXT: s_and_b32 s0, s3, s8 2746; GCN-NEXT: v_cvt_f32_u32_e32 v5, s0 2747; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 2748; GCN-NEXT: v_rcp_iflag_f32_e32 v6, v4 2749; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v3 2750; GCN-NEXT: s_mov_b32 s6, -1 2751; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2752; GCN-NEXT: v_mul_f32_e32 v2, v5, v6 2753; GCN-NEXT: v_trunc_f32_e32 v2, v2 2754; GCN-NEXT: v_cvt_u32_f32_e32 v3, v2 2755; GCN-NEXT: v_mad_f32 v2, -v2, v4, v5 2756; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v4 2757; 
GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 2758; GCN-NEXT: v_and_b32_e32 v0, s8, v0 2759; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v3, vcc 2760; GCN-NEXT: v_or_b32_e32 v0, v0, v1 2761; GCN-NEXT: buffer_store_short v2, off, s[4:7], 0 offset:4 2762; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 2763; GCN-NEXT: s_endpgm 2764 %r = udiv <3 x i16> %x, %y 2765 store <3 x i16> %r, <3 x i16> addrspace(1)* %out 2766 ret void 2767} 2768 2769define amdgpu_kernel void @urem_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x, <3 x i16> %y) { 2770; CHECK-LABEL: @urem_v3i16( 2771; CHECK-NEXT: [[TMP1:%.*]] = extractelement <3 x i16> [[X:%.*]], i64 0 2772; CHECK-NEXT: [[TMP2:%.*]] = extractelement <3 x i16> [[Y:%.*]], i64 0 2773; CHECK-NEXT: [[TMP3:%.*]] = zext i16 [[TMP1]] to i32 2774; CHECK-NEXT: [[TMP4:%.*]] = zext i16 [[TMP2]] to i32 2775; CHECK-NEXT: [[TMP5:%.*]] = uitofp i32 [[TMP3]] to float 2776; CHECK-NEXT: [[TMP6:%.*]] = uitofp i32 [[TMP4]] to float 2777; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP6]]) 2778; CHECK-NEXT: [[TMP8:%.*]] = fmul fast float [[TMP5]], [[TMP7]] 2779; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.trunc.f32(float [[TMP8]]) 2780; CHECK-NEXT: [[TMP10:%.*]] = fneg fast float [[TMP9]] 2781; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP10]], float [[TMP6]], float [[TMP5]]) 2782; CHECK-NEXT: [[TMP12:%.*]] = fptoui float [[TMP9]] to i32 2783; CHECK-NEXT: [[TMP13:%.*]] = call fast float @llvm.fabs.f32(float [[TMP11]]) 2784; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP6]]) 2785; CHECK-NEXT: [[TMP15:%.*]] = fcmp fast oge float [[TMP13]], [[TMP14]] 2786; CHECK-NEXT: [[TMP16:%.*]] = select i1 [[TMP15]], i32 1, i32 0 2787; CHECK-NEXT: [[TMP17:%.*]] = add i32 [[TMP12]], [[TMP16]] 2788; CHECK-NEXT: [[TMP18:%.*]] = mul i32 [[TMP17]], [[TMP4]] 2789; CHECK-NEXT: [[TMP19:%.*]] = sub i32 [[TMP3]], [[TMP18]] 2790; CHECK-NEXT: [[TMP20:%.*]] = and i32 [[TMP19]], 65535 2791; CHECK-NEXT: 
[[TMP21:%.*]] = trunc i32 [[TMP20]] to i16 2792; CHECK-NEXT: [[TMP22:%.*]] = insertelement <3 x i16> undef, i16 [[TMP21]], i64 0 2793; CHECK-NEXT: [[TMP23:%.*]] = extractelement <3 x i16> [[X]], i64 1 2794; CHECK-NEXT: [[TMP24:%.*]] = extractelement <3 x i16> [[Y]], i64 1 2795; CHECK-NEXT: [[TMP25:%.*]] = zext i16 [[TMP23]] to i32 2796; CHECK-NEXT: [[TMP26:%.*]] = zext i16 [[TMP24]] to i32 2797; CHECK-NEXT: [[TMP27:%.*]] = uitofp i32 [[TMP25]] to float 2798; CHECK-NEXT: [[TMP28:%.*]] = uitofp i32 [[TMP26]] to float 2799; CHECK-NEXT: [[TMP29:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP28]]) 2800; CHECK-NEXT: [[TMP30:%.*]] = fmul fast float [[TMP27]], [[TMP29]] 2801; CHECK-NEXT: [[TMP31:%.*]] = call fast float @llvm.trunc.f32(float [[TMP30]]) 2802; CHECK-NEXT: [[TMP32:%.*]] = fneg fast float [[TMP31]] 2803; CHECK-NEXT: [[TMP33:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP32]], float [[TMP28]], float [[TMP27]]) 2804; CHECK-NEXT: [[TMP34:%.*]] = fptoui float [[TMP31]] to i32 2805; CHECK-NEXT: [[TMP35:%.*]] = call fast float @llvm.fabs.f32(float [[TMP33]]) 2806; CHECK-NEXT: [[TMP36:%.*]] = call fast float @llvm.fabs.f32(float [[TMP28]]) 2807; CHECK-NEXT: [[TMP37:%.*]] = fcmp fast oge float [[TMP35]], [[TMP36]] 2808; CHECK-NEXT: [[TMP38:%.*]] = select i1 [[TMP37]], i32 1, i32 0 2809; CHECK-NEXT: [[TMP39:%.*]] = add i32 [[TMP34]], [[TMP38]] 2810; CHECK-NEXT: [[TMP40:%.*]] = mul i32 [[TMP39]], [[TMP26]] 2811; CHECK-NEXT: [[TMP41:%.*]] = sub i32 [[TMP25]], [[TMP40]] 2812; CHECK-NEXT: [[TMP42:%.*]] = and i32 [[TMP41]], 65535 2813; CHECK-NEXT: [[TMP43:%.*]] = trunc i32 [[TMP42]] to i16 2814; CHECK-NEXT: [[TMP44:%.*]] = insertelement <3 x i16> [[TMP22]], i16 [[TMP43]], i64 1 2815; CHECK-NEXT: [[TMP45:%.*]] = extractelement <3 x i16> [[X]], i64 2 2816; CHECK-NEXT: [[TMP46:%.*]] = extractelement <3 x i16> [[Y]], i64 2 2817; CHECK-NEXT: [[TMP47:%.*]] = zext i16 [[TMP45]] to i32 2818; CHECK-NEXT: [[TMP48:%.*]] = zext i16 [[TMP46]] to i32 2819; 
CHECK-NEXT: [[TMP49:%.*]] = uitofp i32 [[TMP47]] to float 2820; CHECK-NEXT: [[TMP50:%.*]] = uitofp i32 [[TMP48]] to float 2821; CHECK-NEXT: [[TMP51:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP50]]) 2822; CHECK-NEXT: [[TMP52:%.*]] = fmul fast float [[TMP49]], [[TMP51]] 2823; CHECK-NEXT: [[TMP53:%.*]] = call fast float @llvm.trunc.f32(float [[TMP52]]) 2824; CHECK-NEXT: [[TMP54:%.*]] = fneg fast float [[TMP53]] 2825; CHECK-NEXT: [[TMP55:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP54]], float [[TMP50]], float [[TMP49]]) 2826; CHECK-NEXT: [[TMP56:%.*]] = fptoui float [[TMP53]] to i32 2827; CHECK-NEXT: [[TMP57:%.*]] = call fast float @llvm.fabs.f32(float [[TMP55]]) 2828; CHECK-NEXT: [[TMP58:%.*]] = call fast float @llvm.fabs.f32(float [[TMP50]]) 2829; CHECK-NEXT: [[TMP59:%.*]] = fcmp fast oge float [[TMP57]], [[TMP58]] 2830; CHECK-NEXT: [[TMP60:%.*]] = select i1 [[TMP59]], i32 1, i32 0 2831; CHECK-NEXT: [[TMP61:%.*]] = add i32 [[TMP56]], [[TMP60]] 2832; CHECK-NEXT: [[TMP62:%.*]] = mul i32 [[TMP61]], [[TMP48]] 2833; CHECK-NEXT: [[TMP63:%.*]] = sub i32 [[TMP47]], [[TMP62]] 2834; CHECK-NEXT: [[TMP64:%.*]] = and i32 [[TMP63]], 65535 2835; CHECK-NEXT: [[TMP65:%.*]] = trunc i32 [[TMP64]] to i16 2836; CHECK-NEXT: [[TMP66:%.*]] = insertelement <3 x i16> [[TMP44]], i16 [[TMP65]], i64 2 2837; CHECK-NEXT: store <3 x i16> [[TMP66]], <3 x i16> addrspace(1)* [[OUT:%.*]], align 8 2838; CHECK-NEXT: ret void 2839; 2840; GCN-LABEL: urem_v3i16: 2841; GCN: ; %bb.0: 2842; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 2843; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb 2844; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 2845; GCN-NEXT: s_mov_b32 s8, 0xffff 2846; GCN-NEXT: s_mov_b32 s7, 0xf000 2847; GCN-NEXT: s_waitcnt lgkmcnt(0) 2848; GCN-NEXT: v_mov_b32_e32 v1, s2 2849; GCN-NEXT: s_and_b32 s6, s0, s8 2850; GCN-NEXT: v_cvt_f32_u32_e32 v0, s6 2851; GCN-NEXT: s_and_b32 s6, s2, s8 2852; GCN-NEXT: v_cvt_f32_u32_e32 v2, s6 2853; GCN-NEXT: v_mov_b32_e32 v4, s0 2854; 
GCN-NEXT: v_rcp_iflag_f32_e32 v3, v0 2855; GCN-NEXT: v_alignbit_b32 v4, s1, v4, 16 2856; GCN-NEXT: v_and_b32_e32 v5, s8, v4 2857; GCN-NEXT: v_alignbit_b32 v1, s3, v1, 16 2858; GCN-NEXT: v_mul_f32_e32 v3, v2, v3 2859; GCN-NEXT: v_trunc_f32_e32 v3, v3 2860; GCN-NEXT: v_mad_f32 v2, -v3, v0, v2 2861; GCN-NEXT: v_cvt_u32_f32_e32 v6, v3 2862; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v0 2863; GCN-NEXT: v_cvt_f32_u32_e32 v2, v5 2864; GCN-NEXT: v_and_b32_e32 v3, s8, v1 2865; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v6, vcc 2866; GCN-NEXT: v_mul_lo_u32 v0, v0, s0 2867; GCN-NEXT: s_and_b32 s0, s1, s8 2868; GCN-NEXT: v_cvt_f32_u32_e32 v3, v3 2869; GCN-NEXT: v_rcp_iflag_f32_e32 v5, v2 2870; GCN-NEXT: v_cvt_f32_u32_e32 v6, s0 2871; GCN-NEXT: s_and_b32 s0, s3, s8 2872; GCN-NEXT: v_cvt_f32_u32_e32 v7, s0 2873; GCN-NEXT: v_mul_f32_e32 v5, v3, v5 2874; GCN-NEXT: v_trunc_f32_e32 v5, v5 2875; GCN-NEXT: v_rcp_iflag_f32_e32 v8, v6 2876; GCN-NEXT: v_mad_f32 v3, -v5, v2, v3 2877; GCN-NEXT: v_cvt_u32_f32_e32 v5, v5 2878; GCN-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 2879; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v2 2880; GCN-NEXT: v_mul_f32_e32 v3, v7, v8 2881; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc 2882; GCN-NEXT: v_trunc_f32_e32 v3, v3 2883; GCN-NEXT: v_mul_lo_u32 v2, v2, v4 2884; GCN-NEXT: v_cvt_u32_f32_e32 v4, v3 2885; GCN-NEXT: v_mad_f32 v3, -v3, v6, v7 2886; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v6 2887; GCN-NEXT: s_mov_b32 s6, -1 2888; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc 2889; GCN-NEXT: v_mul_lo_u32 v3, v3, s1 2890; GCN-NEXT: v_sub_i32_e32 v1, vcc, v1, v2 2891; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 2892; GCN-NEXT: v_and_b32_e32 v0, s8, v0 2893; GCN-NEXT: v_sub_i32_e32 v2, vcc, s3, v3 2894; GCN-NEXT: v_or_b32_e32 v0, v0, v1 2895; GCN-NEXT: buffer_store_short v2, off, s[4:7], 0 offset:4 2896; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 2897; GCN-NEXT: s_endpgm 2898 %r = urem <3 x i16> %x, %y 2899 store <3 x i16> %r, <3 x i16> addrspace(1)* %out 2900 ret void 2901} 2902 2903define 
amdgpu_kernel void @sdiv_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x, <3 x i16> %y) { 2904; CHECK-LABEL: @sdiv_v3i16( 2905; CHECK-NEXT: [[TMP1:%.*]] = extractelement <3 x i16> [[X:%.*]], i64 0 2906; CHECK-NEXT: [[TMP2:%.*]] = extractelement <3 x i16> [[Y:%.*]], i64 0 2907; CHECK-NEXT: [[TMP3:%.*]] = sext i16 [[TMP1]] to i32 2908; CHECK-NEXT: [[TMP4:%.*]] = sext i16 [[TMP2]] to i32 2909; CHECK-NEXT: [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]] 2910; CHECK-NEXT: [[TMP6:%.*]] = ashr i32 [[TMP5]], 30 2911; CHECK-NEXT: [[TMP7:%.*]] = or i32 [[TMP6]], 1 2912; CHECK-NEXT: [[TMP8:%.*]] = sitofp i32 [[TMP3]] to float 2913; CHECK-NEXT: [[TMP9:%.*]] = sitofp i32 [[TMP4]] to float 2914; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]]) 2915; CHECK-NEXT: [[TMP11:%.*]] = fmul fast float [[TMP8]], [[TMP10]] 2916; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.trunc.f32(float [[TMP11]]) 2917; CHECK-NEXT: [[TMP13:%.*]] = fneg fast float [[TMP12]] 2918; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP13]], float [[TMP9]], float [[TMP8]]) 2919; CHECK-NEXT: [[TMP15:%.*]] = fptosi float [[TMP12]] to i32 2920; CHECK-NEXT: [[TMP16:%.*]] = call fast float @llvm.fabs.f32(float [[TMP14]]) 2921; CHECK-NEXT: [[TMP17:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]]) 2922; CHECK-NEXT: [[TMP18:%.*]] = fcmp fast oge float [[TMP16]], [[TMP17]] 2923; CHECK-NEXT: [[TMP19:%.*]] = select i1 [[TMP18]], i32 [[TMP7]], i32 0 2924; CHECK-NEXT: [[TMP20:%.*]] = add i32 [[TMP15]], [[TMP19]] 2925; CHECK-NEXT: [[TMP21:%.*]] = shl i32 [[TMP20]], 16 2926; CHECK-NEXT: [[TMP22:%.*]] = ashr i32 [[TMP21]], 16 2927; CHECK-NEXT: [[TMP23:%.*]] = trunc i32 [[TMP22]] to i16 2928; CHECK-NEXT: [[TMP24:%.*]] = insertelement <3 x i16> undef, i16 [[TMP23]], i64 0 2929; CHECK-NEXT: [[TMP25:%.*]] = extractelement <3 x i16> [[X]], i64 1 2930; CHECK-NEXT: [[TMP26:%.*]] = extractelement <3 x i16> [[Y]], i64 1 2931; CHECK-NEXT: [[TMP27:%.*]] = sext i16 
[[TMP25]] to i32 2932; CHECK-NEXT: [[TMP28:%.*]] = sext i16 [[TMP26]] to i32 2933; CHECK-NEXT: [[TMP29:%.*]] = xor i32 [[TMP27]], [[TMP28]] 2934; CHECK-NEXT: [[TMP30:%.*]] = ashr i32 [[TMP29]], 30 2935; CHECK-NEXT: [[TMP31:%.*]] = or i32 [[TMP30]], 1 2936; CHECK-NEXT: [[TMP32:%.*]] = sitofp i32 [[TMP27]] to float 2937; CHECK-NEXT: [[TMP33:%.*]] = sitofp i32 [[TMP28]] to float 2938; CHECK-NEXT: [[TMP34:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP33]]) 2939; CHECK-NEXT: [[TMP35:%.*]] = fmul fast float [[TMP32]], [[TMP34]] 2940; CHECK-NEXT: [[TMP36:%.*]] = call fast float @llvm.trunc.f32(float [[TMP35]]) 2941; CHECK-NEXT: [[TMP37:%.*]] = fneg fast float [[TMP36]] 2942; CHECK-NEXT: [[TMP38:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP37]], float [[TMP33]], float [[TMP32]]) 2943; CHECK-NEXT: [[TMP39:%.*]] = fptosi float [[TMP36]] to i32 2944; CHECK-NEXT: [[TMP40:%.*]] = call fast float @llvm.fabs.f32(float [[TMP38]]) 2945; CHECK-NEXT: [[TMP41:%.*]] = call fast float @llvm.fabs.f32(float [[TMP33]]) 2946; CHECK-NEXT: [[TMP42:%.*]] = fcmp fast oge float [[TMP40]], [[TMP41]] 2947; CHECK-NEXT: [[TMP43:%.*]] = select i1 [[TMP42]], i32 [[TMP31]], i32 0 2948; CHECK-NEXT: [[TMP44:%.*]] = add i32 [[TMP39]], [[TMP43]] 2949; CHECK-NEXT: [[TMP45:%.*]] = shl i32 [[TMP44]], 16 2950; CHECK-NEXT: [[TMP46:%.*]] = ashr i32 [[TMP45]], 16 2951; CHECK-NEXT: [[TMP47:%.*]] = trunc i32 [[TMP46]] to i16 2952; CHECK-NEXT: [[TMP48:%.*]] = insertelement <3 x i16> [[TMP24]], i16 [[TMP47]], i64 1 2953; CHECK-NEXT: [[TMP49:%.*]] = extractelement <3 x i16> [[X]], i64 2 2954; CHECK-NEXT: [[TMP50:%.*]] = extractelement <3 x i16> [[Y]], i64 2 2955; CHECK-NEXT: [[TMP51:%.*]] = sext i16 [[TMP49]] to i32 2956; CHECK-NEXT: [[TMP52:%.*]] = sext i16 [[TMP50]] to i32 2957; CHECK-NEXT: [[TMP53:%.*]] = xor i32 [[TMP51]], [[TMP52]] 2958; CHECK-NEXT: [[TMP54:%.*]] = ashr i32 [[TMP53]], 30 2959; CHECK-NEXT: [[TMP55:%.*]] = or i32 [[TMP54]], 1 2960; CHECK-NEXT: [[TMP56:%.*]] = sitofp i32 
[[TMP51]] to float 2961; CHECK-NEXT: [[TMP57:%.*]] = sitofp i32 [[TMP52]] to float 2962; CHECK-NEXT: [[TMP58:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP57]]) 2963; CHECK-NEXT: [[TMP59:%.*]] = fmul fast float [[TMP56]], [[TMP58]] 2964; CHECK-NEXT: [[TMP60:%.*]] = call fast float @llvm.trunc.f32(float [[TMP59]]) 2965; CHECK-NEXT: [[TMP61:%.*]] = fneg fast float [[TMP60]] 2966; CHECK-NEXT: [[TMP62:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP61]], float [[TMP57]], float [[TMP56]]) 2967; CHECK-NEXT: [[TMP63:%.*]] = fptosi float [[TMP60]] to i32 2968; CHECK-NEXT: [[TMP64:%.*]] = call fast float @llvm.fabs.f32(float [[TMP62]]) 2969; CHECK-NEXT: [[TMP65:%.*]] = call fast float @llvm.fabs.f32(float [[TMP57]]) 2970; CHECK-NEXT: [[TMP66:%.*]] = fcmp fast oge float [[TMP64]], [[TMP65]] 2971; CHECK-NEXT: [[TMP67:%.*]] = select i1 [[TMP66]], i32 [[TMP55]], i32 0 2972; CHECK-NEXT: [[TMP68:%.*]] = add i32 [[TMP63]], [[TMP67]] 2973; CHECK-NEXT: [[TMP69:%.*]] = shl i32 [[TMP68]], 16 2974; CHECK-NEXT: [[TMP70:%.*]] = ashr i32 [[TMP69]], 16 2975; CHECK-NEXT: [[TMP71:%.*]] = trunc i32 [[TMP70]] to i16 2976; CHECK-NEXT: [[TMP72:%.*]] = insertelement <3 x i16> [[TMP48]], i16 [[TMP71]], i64 2 2977; CHECK-NEXT: store <3 x i16> [[TMP72]], <3 x i16> addrspace(1)* [[OUT:%.*]], align 8 2978; CHECK-NEXT: ret void 2979; 2980; GCN-LABEL: sdiv_v3i16: 2981; GCN: ; %bb.0: 2982; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 2983; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb 2984; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 2985; GCN-NEXT: s_mov_b32 s7, 0xf000 2986; GCN-NEXT: s_mov_b32 s6, -1 2987; GCN-NEXT: s_waitcnt lgkmcnt(0) 2988; GCN-NEXT: s_sext_i32_i16 s9, s2 2989; GCN-NEXT: s_sext_i32_i16 s8, s0 2990; GCN-NEXT: v_cvt_f32_i32_e32 v0, s8 2991; GCN-NEXT: v_cvt_f32_i32_e32 v1, s9 2992; GCN-NEXT: s_xor_b32 s8, s9, s8 2993; GCN-NEXT: s_ashr_i32 s0, s0, 16 2994; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 2995; GCN-NEXT: s_ashr_i32 s8, s8, 30 2996; GCN-NEXT: s_or_b32 s8, s8, 1 
2997; GCN-NEXT: v_mov_b32_e32 v3, s8 2998; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 2999; GCN-NEXT: v_trunc_f32_e32 v2, v2 3000; GCN-NEXT: v_mad_f32 v1, -v2, v0, v1 3001; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| 3002; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 3003; GCN-NEXT: v_cvt_f32_i32_e32 v1, s0 3004; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc 3005; GCN-NEXT: s_ashr_i32 s2, s2, 16 3006; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 3007; GCN-NEXT: v_cvt_f32_i32_e32 v2, s2 3008; GCN-NEXT: v_rcp_iflag_f32_e32 v3, v1 3009; GCN-NEXT: s_xor_b32 s0, s2, s0 3010; GCN-NEXT: s_ashr_i32 s0, s0, 30 3011; GCN-NEXT: s_or_b32 s0, s0, 1 3012; GCN-NEXT: v_mul_f32_e32 v3, v2, v3 3013; GCN-NEXT: v_trunc_f32_e32 v3, v3 3014; GCN-NEXT: v_mad_f32 v2, -v3, v1, v2 3015; GCN-NEXT: v_mov_b32_e32 v4, s0 3016; GCN-NEXT: s_sext_i32_i16 s0, s1 3017; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, |v1| 3018; GCN-NEXT: v_cvt_i32_f32_e32 v3, v3 3019; GCN-NEXT: v_cvt_f32_i32_e32 v2, s0 3020; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v4, vcc 3021; GCN-NEXT: s_sext_i32_i16 s1, s3 3022; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v3 3023; GCN-NEXT: v_cvt_f32_i32_e32 v3, s1 3024; GCN-NEXT: v_rcp_iflag_f32_e32 v4, v2 3025; GCN-NEXT: s_xor_b32 s0, s1, s0 3026; GCN-NEXT: s_ashr_i32 s0, s0, 30 3027; GCN-NEXT: s_or_b32 s0, s0, 1 3028; GCN-NEXT: v_mul_f32_e32 v4, v3, v4 3029; GCN-NEXT: v_trunc_f32_e32 v4, v4 3030; GCN-NEXT: v_mad_f32 v3, -v4, v2, v3 3031; GCN-NEXT: v_cvt_i32_f32_e32 v4, v4 3032; GCN-NEXT: v_mov_b32_e32 v5, s0 3033; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v2| 3034; GCN-NEXT: v_cndmask_b32_e32 v2, 0, v5, vcc 3035; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 3036; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 3037; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 3038; GCN-NEXT: v_or_b32_e32 v0, v0, v1 3039; GCN-NEXT: buffer_store_short v2, off, s[4:7], 0 offset:4 3040; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 3041; GCN-NEXT: s_endpgm 3042 %r = sdiv <3 x i16> %x, %y 3043 store <3 x i16> %r, <3 x i16> addrspace(1)* %out 3044 ret void 3045} 3046 
3047define amdgpu_kernel void @srem_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x, <3 x i16> %y) { 3048; CHECK-LABEL: @srem_v3i16( 3049; CHECK-NEXT: [[TMP1:%.*]] = extractelement <3 x i16> [[X:%.*]], i64 0 3050; CHECK-NEXT: [[TMP2:%.*]] = extractelement <3 x i16> [[Y:%.*]], i64 0 3051; CHECK-NEXT: [[TMP3:%.*]] = sext i16 [[TMP1]] to i32 3052; CHECK-NEXT: [[TMP4:%.*]] = sext i16 [[TMP2]] to i32 3053; CHECK-NEXT: [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]] 3054; CHECK-NEXT: [[TMP6:%.*]] = ashr i32 [[TMP5]], 30 3055; CHECK-NEXT: [[TMP7:%.*]] = or i32 [[TMP6]], 1 3056; CHECK-NEXT: [[TMP8:%.*]] = sitofp i32 [[TMP3]] to float 3057; CHECK-NEXT: [[TMP9:%.*]] = sitofp i32 [[TMP4]] to float 3058; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]]) 3059; CHECK-NEXT: [[TMP11:%.*]] = fmul fast float [[TMP8]], [[TMP10]] 3060; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.trunc.f32(float [[TMP11]]) 3061; CHECK-NEXT: [[TMP13:%.*]] = fneg fast float [[TMP12]] 3062; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP13]], float [[TMP9]], float [[TMP8]]) 3063; CHECK-NEXT: [[TMP15:%.*]] = fptosi float [[TMP12]] to i32 3064; CHECK-NEXT: [[TMP16:%.*]] = call fast float @llvm.fabs.f32(float [[TMP14]]) 3065; CHECK-NEXT: [[TMP17:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]]) 3066; CHECK-NEXT: [[TMP18:%.*]] = fcmp fast oge float [[TMP16]], [[TMP17]] 3067; CHECK-NEXT: [[TMP19:%.*]] = select i1 [[TMP18]], i32 [[TMP7]], i32 0 3068; CHECK-NEXT: [[TMP20:%.*]] = add i32 [[TMP15]], [[TMP19]] 3069; CHECK-NEXT: [[TMP21:%.*]] = mul i32 [[TMP20]], [[TMP4]] 3070; CHECK-NEXT: [[TMP22:%.*]] = sub i32 [[TMP3]], [[TMP21]] 3071; CHECK-NEXT: [[TMP23:%.*]] = shl i32 [[TMP22]], 16 3072; CHECK-NEXT: [[TMP24:%.*]] = ashr i32 [[TMP23]], 16 3073; CHECK-NEXT: [[TMP25:%.*]] = trunc i32 [[TMP24]] to i16 3074; CHECK-NEXT: [[TMP26:%.*]] = insertelement <3 x i16> undef, i16 [[TMP25]], i64 0 3075; CHECK-NEXT: [[TMP27:%.*]] = extractelement <3 x i16> 
[[X]], i64 1 3076; CHECK-NEXT: [[TMP28:%.*]] = extractelement <3 x i16> [[Y]], i64 1 3077; CHECK-NEXT: [[TMP29:%.*]] = sext i16 [[TMP27]] to i32 3078; CHECK-NEXT: [[TMP30:%.*]] = sext i16 [[TMP28]] to i32 3079; CHECK-NEXT: [[TMP31:%.*]] = xor i32 [[TMP29]], [[TMP30]] 3080; CHECK-NEXT: [[TMP32:%.*]] = ashr i32 [[TMP31]], 30 3081; CHECK-NEXT: [[TMP33:%.*]] = or i32 [[TMP32]], 1 3082; CHECK-NEXT: [[TMP34:%.*]] = sitofp i32 [[TMP29]] to float 3083; CHECK-NEXT: [[TMP35:%.*]] = sitofp i32 [[TMP30]] to float 3084; CHECK-NEXT: [[TMP36:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP35]]) 3085; CHECK-NEXT: [[TMP37:%.*]] = fmul fast float [[TMP34]], [[TMP36]] 3086; CHECK-NEXT: [[TMP38:%.*]] = call fast float @llvm.trunc.f32(float [[TMP37]]) 3087; CHECK-NEXT: [[TMP39:%.*]] = fneg fast float [[TMP38]] 3088; CHECK-NEXT: [[TMP40:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP39]], float [[TMP35]], float [[TMP34]]) 3089; CHECK-NEXT: [[TMP41:%.*]] = fptosi float [[TMP38]] to i32 3090; CHECK-NEXT: [[TMP42:%.*]] = call fast float @llvm.fabs.f32(float [[TMP40]]) 3091; CHECK-NEXT: [[TMP43:%.*]] = call fast float @llvm.fabs.f32(float [[TMP35]]) 3092; CHECK-NEXT: [[TMP44:%.*]] = fcmp fast oge float [[TMP42]], [[TMP43]] 3093; CHECK-NEXT: [[TMP45:%.*]] = select i1 [[TMP44]], i32 [[TMP33]], i32 0 3094; CHECK-NEXT: [[TMP46:%.*]] = add i32 [[TMP41]], [[TMP45]] 3095; CHECK-NEXT: [[TMP47:%.*]] = mul i32 [[TMP46]], [[TMP30]] 3096; CHECK-NEXT: [[TMP48:%.*]] = sub i32 [[TMP29]], [[TMP47]] 3097; CHECK-NEXT: [[TMP49:%.*]] = shl i32 [[TMP48]], 16 3098; CHECK-NEXT: [[TMP50:%.*]] = ashr i32 [[TMP49]], 16 3099; CHECK-NEXT: [[TMP51:%.*]] = trunc i32 [[TMP50]] to i16 3100; CHECK-NEXT: [[TMP52:%.*]] = insertelement <3 x i16> [[TMP26]], i16 [[TMP51]], i64 1 3101; CHECK-NEXT: [[TMP53:%.*]] = extractelement <3 x i16> [[X]], i64 2 3102; CHECK-NEXT: [[TMP54:%.*]] = extractelement <3 x i16> [[Y]], i64 2 3103; CHECK-NEXT: [[TMP55:%.*]] = sext i16 [[TMP53]] to i32 3104; CHECK-NEXT: 
[[TMP56:%.*]] = sext i16 [[TMP54]] to i32 3105; CHECK-NEXT: [[TMP57:%.*]] = xor i32 [[TMP55]], [[TMP56]] 3106; CHECK-NEXT: [[TMP58:%.*]] = ashr i32 [[TMP57]], 30 3107; CHECK-NEXT: [[TMP59:%.*]] = or i32 [[TMP58]], 1 3108; CHECK-NEXT: [[TMP60:%.*]] = sitofp i32 [[TMP55]] to float 3109; CHECK-NEXT: [[TMP61:%.*]] = sitofp i32 [[TMP56]] to float 3110; CHECK-NEXT: [[TMP62:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP61]]) 3111; CHECK-NEXT: [[TMP63:%.*]] = fmul fast float [[TMP60]], [[TMP62]] 3112; CHECK-NEXT: [[TMP64:%.*]] = call fast float @llvm.trunc.f32(float [[TMP63]]) 3113; CHECK-NEXT: [[TMP65:%.*]] = fneg fast float [[TMP64]] 3114; CHECK-NEXT: [[TMP66:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP65]], float [[TMP61]], float [[TMP60]]) 3115; CHECK-NEXT: [[TMP67:%.*]] = fptosi float [[TMP64]] to i32 3116; CHECK-NEXT: [[TMP68:%.*]] = call fast float @llvm.fabs.f32(float [[TMP66]]) 3117; CHECK-NEXT: [[TMP69:%.*]] = call fast float @llvm.fabs.f32(float [[TMP61]]) 3118; CHECK-NEXT: [[TMP70:%.*]] = fcmp fast oge float [[TMP68]], [[TMP69]] 3119; CHECK-NEXT: [[TMP71:%.*]] = select i1 [[TMP70]], i32 [[TMP59]], i32 0 3120; CHECK-NEXT: [[TMP72:%.*]] = add i32 [[TMP67]], [[TMP71]] 3121; CHECK-NEXT: [[TMP73:%.*]] = mul i32 [[TMP72]], [[TMP56]] 3122; CHECK-NEXT: [[TMP74:%.*]] = sub i32 [[TMP55]], [[TMP73]] 3123; CHECK-NEXT: [[TMP75:%.*]] = shl i32 [[TMP74]], 16 3124; CHECK-NEXT: [[TMP76:%.*]] = ashr i32 [[TMP75]], 16 3125; CHECK-NEXT: [[TMP77:%.*]] = trunc i32 [[TMP76]] to i16 3126; CHECK-NEXT: [[TMP78:%.*]] = insertelement <3 x i16> [[TMP52]], i16 [[TMP77]], i64 2 3127; CHECK-NEXT: store <3 x i16> [[TMP78]], <3 x i16> addrspace(1)* [[OUT:%.*]], align 8 3128; CHECK-NEXT: ret void 3129; 3130; GCN-LABEL: srem_v3i16: 3131; GCN: ; %bb.0: 3132; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 3133; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb 3134; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 3135; GCN-NEXT: s_mov_b32 s7, 0xf000 3136; GCN-NEXT: s_waitcnt 
lgkmcnt(0) 3137; GCN-NEXT: s_sext_i32_i16 s8, s2 3138; GCN-NEXT: s_sext_i32_i16 s6, s0 3139; GCN-NEXT: v_cvt_f32_i32_e32 v0, s6 3140; GCN-NEXT: v_cvt_f32_i32_e32 v1, s8 3141; GCN-NEXT: s_xor_b32 s6, s8, s6 3142; GCN-NEXT: s_ashr_i32 s6, s6, 30 3143; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 3144; GCN-NEXT: s_or_b32 s6, s6, 1 3145; GCN-NEXT: v_mov_b32_e32 v3, s6 3146; GCN-NEXT: s_mov_b32 s6, -1 3147; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 3148; GCN-NEXT: v_trunc_f32_e32 v2, v2 3149; GCN-NEXT: v_mad_f32 v1, -v2, v0, v1 3150; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 3151; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| 3152; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc 3153; GCN-NEXT: v_mov_b32_e32 v1, s2 3154; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 3155; GCN-NEXT: v_mov_b32_e32 v2, s0 3156; GCN-NEXT: v_alignbit_b32 v2, s1, v2, 16 3157; GCN-NEXT: v_bfe_i32 v3, v2, 0, 16 3158; GCN-NEXT: v_cvt_f32_i32_e32 v4, v3 3159; GCN-NEXT: v_alignbit_b32 v1, s3, v1, 16 3160; GCN-NEXT: v_bfe_i32 v5, v1, 0, 16 3161; GCN-NEXT: v_cvt_f32_i32_e32 v6, v5 3162; GCN-NEXT: v_rcp_iflag_f32_e32 v7, v4 3163; GCN-NEXT: v_mul_lo_u32 v0, v0, s0 3164; GCN-NEXT: v_xor_b32_e32 v3, v5, v3 3165; GCN-NEXT: s_sext_i32_i16 s0, s1 3166; GCN-NEXT: v_mul_f32_e32 v5, v6, v7 3167; GCN-NEXT: v_trunc_f32_e32 v5, v5 3168; GCN-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 3169; GCN-NEXT: v_mad_f32 v6, -v5, v4, v6 3170; GCN-NEXT: v_cvt_i32_f32_e32 v5, v5 3171; GCN-NEXT: v_ashrrev_i32_e32 v3, 30, v3 3172; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v6|, |v4| 3173; GCN-NEXT: v_cvt_f32_i32_e32 v4, s0 3174; GCN-NEXT: v_or_b32_e32 v3, 1, v3 3175; GCN-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc 3176; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5 3177; GCN-NEXT: s_sext_i32_i16 s2, s3 3178; GCN-NEXT: v_mul_lo_u32 v2, v3, v2 3179; GCN-NEXT: v_cvt_f32_i32_e32 v3, s2 3180; GCN-NEXT: v_rcp_iflag_f32_e32 v5, v4 3181; GCN-NEXT: s_xor_b32 s0, s2, s0 3182; GCN-NEXT: s_ashr_i32 s0, s0, 30 3183; GCN-NEXT: s_or_b32 s0, s0, 1 3184; GCN-NEXT: v_mul_f32_e32 v5, v3, v5 3185; GCN-NEXT: 
v_trunc_f32_e32 v5, v5 3186; GCN-NEXT: v_mad_f32 v3, -v5, v4, v3 3187; GCN-NEXT: v_cvt_i32_f32_e32 v5, v5 3188; GCN-NEXT: v_mov_b32_e32 v6, s0 3189; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v4| 3190; GCN-NEXT: v_cndmask_b32_e32 v3, 0, v6, vcc 3191; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5 3192; GCN-NEXT: v_mul_lo_u32 v3, v3, s1 3193; GCN-NEXT: v_sub_i32_e32 v1, vcc, v1, v2 3194; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 3195; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 3196; GCN-NEXT: v_sub_i32_e32 v2, vcc, s3, v3 3197; GCN-NEXT: v_or_b32_e32 v0, v0, v1 3198; GCN-NEXT: buffer_store_short v2, off, s[4:7], 0 offset:4 3199; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 3200; GCN-NEXT: s_endpgm 3201 %r = srem <3 x i16> %x, %y 3202 store <3 x i16> %r, <3 x i16> addrspace(1)* %out 3203 ret void 3204} 3205 3206define amdgpu_kernel void @udiv_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x, <3 x i15> %y) { 3207; CHECK-LABEL: @udiv_v3i15( 3208; CHECK-NEXT: [[TMP1:%.*]] = extractelement <3 x i15> [[X:%.*]], i64 0 3209; CHECK-NEXT: [[TMP2:%.*]] = extractelement <3 x i15> [[Y:%.*]], i64 0 3210; CHECK-NEXT: [[TMP3:%.*]] = zext i15 [[TMP1]] to i32 3211; CHECK-NEXT: [[TMP4:%.*]] = zext i15 [[TMP2]] to i32 3212; CHECK-NEXT: [[TMP5:%.*]] = uitofp i32 [[TMP3]] to float 3213; CHECK-NEXT: [[TMP6:%.*]] = uitofp i32 [[TMP4]] to float 3214; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP6]]) 3215; CHECK-NEXT: [[TMP8:%.*]] = fmul fast float [[TMP5]], [[TMP7]] 3216; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.trunc.f32(float [[TMP8]]) 3217; CHECK-NEXT: [[TMP10:%.*]] = fneg fast float [[TMP9]] 3218; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP10]], float [[TMP6]], float [[TMP5]]) 3219; CHECK-NEXT: [[TMP12:%.*]] = fptoui float [[TMP9]] to i32 3220; CHECK-NEXT: [[TMP13:%.*]] = call fast float @llvm.fabs.f32(float [[TMP11]]) 3221; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP6]]) 3222; CHECK-NEXT: 
[[TMP15:%.*]] = fcmp fast oge float [[TMP13]], [[TMP14]] 3223; CHECK-NEXT: [[TMP16:%.*]] = select i1 [[TMP15]], i32 1, i32 0 3224; CHECK-NEXT: [[TMP17:%.*]] = add i32 [[TMP12]], [[TMP16]] 3225; CHECK-NEXT: [[TMP18:%.*]] = and i32 [[TMP17]], 32767 3226; CHECK-NEXT: [[TMP19:%.*]] = trunc i32 [[TMP18]] to i15 3227; CHECK-NEXT: [[TMP20:%.*]] = insertelement <3 x i15> undef, i15 [[TMP19]], i64 0 3228; CHECK-NEXT: [[TMP21:%.*]] = extractelement <3 x i15> [[X]], i64 1 3229; CHECK-NEXT: [[TMP22:%.*]] = extractelement <3 x i15> [[Y]], i64 1 3230; CHECK-NEXT: [[TMP23:%.*]] = zext i15 [[TMP21]] to i32 3231; CHECK-NEXT: [[TMP24:%.*]] = zext i15 [[TMP22]] to i32 3232; CHECK-NEXT: [[TMP25:%.*]] = uitofp i32 [[TMP23]] to float 3233; CHECK-NEXT: [[TMP26:%.*]] = uitofp i32 [[TMP24]] to float 3234; CHECK-NEXT: [[TMP27:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP26]]) 3235; CHECK-NEXT: [[TMP28:%.*]] = fmul fast float [[TMP25]], [[TMP27]] 3236; CHECK-NEXT: [[TMP29:%.*]] = call fast float @llvm.trunc.f32(float [[TMP28]]) 3237; CHECK-NEXT: [[TMP30:%.*]] = fneg fast float [[TMP29]] 3238; CHECK-NEXT: [[TMP31:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP30]], float [[TMP26]], float [[TMP25]]) 3239; CHECK-NEXT: [[TMP32:%.*]] = fptoui float [[TMP29]] to i32 3240; CHECK-NEXT: [[TMP33:%.*]] = call fast float @llvm.fabs.f32(float [[TMP31]]) 3241; CHECK-NEXT: [[TMP34:%.*]] = call fast float @llvm.fabs.f32(float [[TMP26]]) 3242; CHECK-NEXT: [[TMP35:%.*]] = fcmp fast oge float [[TMP33]], [[TMP34]] 3243; CHECK-NEXT: [[TMP36:%.*]] = select i1 [[TMP35]], i32 1, i32 0 3244; CHECK-NEXT: [[TMP37:%.*]] = add i32 [[TMP32]], [[TMP36]] 3245; CHECK-NEXT: [[TMP38:%.*]] = and i32 [[TMP37]], 32767 3246; CHECK-NEXT: [[TMP39:%.*]] = trunc i32 [[TMP38]] to i15 3247; CHECK-NEXT: [[TMP40:%.*]] = insertelement <3 x i15> [[TMP20]], i15 [[TMP39]], i64 1 3248; CHECK-NEXT: [[TMP41:%.*]] = extractelement <3 x i15> [[X]], i64 2 3249; CHECK-NEXT: [[TMP42:%.*]] = extractelement <3 x i15> [[Y]], 
i64 2 3250; CHECK-NEXT: [[TMP43:%.*]] = zext i15 [[TMP41]] to i32 3251; CHECK-NEXT: [[TMP44:%.*]] = zext i15 [[TMP42]] to i32 3252; CHECK-NEXT: [[TMP45:%.*]] = uitofp i32 [[TMP43]] to float 3253; CHECK-NEXT: [[TMP46:%.*]] = uitofp i32 [[TMP44]] to float 3254; CHECK-NEXT: [[TMP47:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP46]]) 3255; CHECK-NEXT: [[TMP48:%.*]] = fmul fast float [[TMP45]], [[TMP47]] 3256; CHECK-NEXT: [[TMP49:%.*]] = call fast float @llvm.trunc.f32(float [[TMP48]]) 3257; CHECK-NEXT: [[TMP50:%.*]] = fneg fast float [[TMP49]] 3258; CHECK-NEXT: [[TMP51:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP50]], float [[TMP46]], float [[TMP45]]) 3259; CHECK-NEXT: [[TMP52:%.*]] = fptoui float [[TMP49]] to i32 3260; CHECK-NEXT: [[TMP53:%.*]] = call fast float @llvm.fabs.f32(float [[TMP51]]) 3261; CHECK-NEXT: [[TMP54:%.*]] = call fast float @llvm.fabs.f32(float [[TMP46]]) 3262; CHECK-NEXT: [[TMP55:%.*]] = fcmp fast oge float [[TMP53]], [[TMP54]] 3263; CHECK-NEXT: [[TMP56:%.*]] = select i1 [[TMP55]], i32 1, i32 0 3264; CHECK-NEXT: [[TMP57:%.*]] = add i32 [[TMP52]], [[TMP56]] 3265; CHECK-NEXT: [[TMP58:%.*]] = and i32 [[TMP57]], 32767 3266; CHECK-NEXT: [[TMP59:%.*]] = trunc i32 [[TMP58]] to i15 3267; CHECK-NEXT: [[TMP60:%.*]] = insertelement <3 x i15> [[TMP40]], i15 [[TMP59]], i64 2 3268; CHECK-NEXT: store <3 x i15> [[TMP60]], <3 x i15> addrspace(1)* [[OUT:%.*]], align 8 3269; CHECK-NEXT: ret void 3270; 3271; GCN-LABEL: udiv_v3i15: 3272; GCN: ; %bb.0: 3273; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 3274; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb 3275; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 3276; GCN-NEXT: s_mov_b32 s7, 0xf000 3277; GCN-NEXT: s_mov_b32 s6, -1 3278; GCN-NEXT: s_waitcnt lgkmcnt(0) 3279; GCN-NEXT: v_mov_b32_e32 v0, s2 3280; GCN-NEXT: v_alignbit_b32 v0, s3, v0, 30 3281; GCN-NEXT: s_movk_i32 s3, 0x7fff 3282; GCN-NEXT: s_and_b32 s9, s0, s3 3283; GCN-NEXT: v_cvt_f32_u32_e32 v1, s9 3284; GCN-NEXT: v_mov_b32_e32 v2, s0 
3285; GCN-NEXT: s_and_b32 s8, s2, s3 3286; GCN-NEXT: s_bfe_u32 s0, s0, 0xf000f 3287; GCN-NEXT: v_cvt_f32_u32_e32 v5, s0 3288; GCN-NEXT: v_cvt_f32_u32_e32 v3, s8 3289; GCN-NEXT: v_rcp_iflag_f32_e32 v4, v1 3290; GCN-NEXT: s_bfe_u32 s2, s2, 0xf000f 3291; GCN-NEXT: v_alignbit_b32 v2, s1, v2, 30 3292; GCN-NEXT: v_cvt_f32_u32_e32 v6, s2 3293; GCN-NEXT: v_mul_f32_e32 v4, v3, v4 3294; GCN-NEXT: v_rcp_iflag_f32_e32 v7, v5 3295; GCN-NEXT: v_and_b32_e32 v2, s3, v2 3296; GCN-NEXT: v_trunc_f32_e32 v4, v4 3297; GCN-NEXT: v_mad_f32 v3, -v4, v1, v3 3298; GCN-NEXT: v_cvt_u32_f32_e32 v4, v4 3299; GCN-NEXT: v_cvt_f32_u32_e32 v2, v2 3300; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v1 3301; GCN-NEXT: v_mul_f32_e32 v1, v6, v7 3302; GCN-NEXT: v_and_b32_e32 v0, s3, v0 3303; GCN-NEXT: v_trunc_f32_e32 v1, v1 3304; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc 3305; GCN-NEXT: v_mad_f32 v4, -v1, v5, v6 3306; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 3307; GCN-NEXT: v_cvt_f32_u32_e32 v0, v0 3308; GCN-NEXT: v_rcp_iflag_f32_e32 v6, v2 3309; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, v5 3310; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc 3311; GCN-NEXT: v_mul_f32_e32 v1, v0, v6 3312; GCN-NEXT: v_trunc_f32_e32 v1, v1 3313; GCN-NEXT: v_cvt_u32_f32_e32 v5, v1 3314; GCN-NEXT: v_mad_f32 v0, -v1, v2, v0 3315; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, v2 3316; GCN-NEXT: v_and_b32_e32 v2, s3, v3 3317; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v5, vcc 3318; GCN-NEXT: v_and_b32_e32 v3, s3, v4 3319; GCN-NEXT: v_lshlrev_b32_e32 v3, 15, v3 3320; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], 30 3321; GCN-NEXT: v_or_b32_e32 v2, v2, v3 3322; GCN-NEXT: v_or_b32_e32 v0, v2, v0 3323; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 3324; GCN-NEXT: s_waitcnt expcnt(0) 3325; GCN-NEXT: v_and_b32_e32 v0, 0x1fff, v1 3326; GCN-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:4 3327; GCN-NEXT: s_endpgm 3328 %r = udiv <3 x i15> %x, %y 3329 store <3 x i15> %r, <3 x i15> addrspace(1)* %out 3330 ret void 3331} 3332 3333define amdgpu_kernel void @urem_v3i15(<3 x 
i15> addrspace(1)* %out, <3 x i15> %x, <3 x i15> %y) { 3334; CHECK-LABEL: @urem_v3i15( 3335; CHECK-NEXT: [[TMP1:%.*]] = extractelement <3 x i15> [[X:%.*]], i64 0 3336; CHECK-NEXT: [[TMP2:%.*]] = extractelement <3 x i15> [[Y:%.*]], i64 0 3337; CHECK-NEXT: [[TMP3:%.*]] = zext i15 [[TMP1]] to i32 3338; CHECK-NEXT: [[TMP4:%.*]] = zext i15 [[TMP2]] to i32 3339; CHECK-NEXT: [[TMP5:%.*]] = uitofp i32 [[TMP3]] to float 3340; CHECK-NEXT: [[TMP6:%.*]] = uitofp i32 [[TMP4]] to float 3341; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP6]]) 3342; CHECK-NEXT: [[TMP8:%.*]] = fmul fast float [[TMP5]], [[TMP7]] 3343; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.trunc.f32(float [[TMP8]]) 3344; CHECK-NEXT: [[TMP10:%.*]] = fneg fast float [[TMP9]] 3345; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP10]], float [[TMP6]], float [[TMP5]]) 3346; CHECK-NEXT: [[TMP12:%.*]] = fptoui float [[TMP9]] to i32 3347; CHECK-NEXT: [[TMP13:%.*]] = call fast float @llvm.fabs.f32(float [[TMP11]]) 3348; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP6]]) 3349; CHECK-NEXT: [[TMP15:%.*]] = fcmp fast oge float [[TMP13]], [[TMP14]] 3350; CHECK-NEXT: [[TMP16:%.*]] = select i1 [[TMP15]], i32 1, i32 0 3351; CHECK-NEXT: [[TMP17:%.*]] = add i32 [[TMP12]], [[TMP16]] 3352; CHECK-NEXT: [[TMP18:%.*]] = mul i32 [[TMP17]], [[TMP4]] 3353; CHECK-NEXT: [[TMP19:%.*]] = sub i32 [[TMP3]], [[TMP18]] 3354; CHECK-NEXT: [[TMP20:%.*]] = and i32 [[TMP19]], 32767 3355; CHECK-NEXT: [[TMP21:%.*]] = trunc i32 [[TMP20]] to i15 3356; CHECK-NEXT: [[TMP22:%.*]] = insertelement <3 x i15> undef, i15 [[TMP21]], i64 0 3357; CHECK-NEXT: [[TMP23:%.*]] = extractelement <3 x i15> [[X]], i64 1 3358; CHECK-NEXT: [[TMP24:%.*]] = extractelement <3 x i15> [[Y]], i64 1 3359; CHECK-NEXT: [[TMP25:%.*]] = zext i15 [[TMP23]] to i32 3360; CHECK-NEXT: [[TMP26:%.*]] = zext i15 [[TMP24]] to i32 3361; CHECK-NEXT: [[TMP27:%.*]] = uitofp i32 [[TMP25]] to float 3362; 
CHECK-NEXT: [[TMP28:%.*]] = uitofp i32 [[TMP26]] to float 3363; CHECK-NEXT: [[TMP29:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP28]]) 3364; CHECK-NEXT: [[TMP30:%.*]] = fmul fast float [[TMP27]], [[TMP29]] 3365; CHECK-NEXT: [[TMP31:%.*]] = call fast float @llvm.trunc.f32(float [[TMP30]]) 3366; CHECK-NEXT: [[TMP32:%.*]] = fneg fast float [[TMP31]] 3367; CHECK-NEXT: [[TMP33:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP32]], float [[TMP28]], float [[TMP27]]) 3368; CHECK-NEXT: [[TMP34:%.*]] = fptoui float [[TMP31]] to i32 3369; CHECK-NEXT: [[TMP35:%.*]] = call fast float @llvm.fabs.f32(float [[TMP33]]) 3370; CHECK-NEXT: [[TMP36:%.*]] = call fast float @llvm.fabs.f32(float [[TMP28]]) 3371; CHECK-NEXT: [[TMP37:%.*]] = fcmp fast oge float [[TMP35]], [[TMP36]] 3372; CHECK-NEXT: [[TMP38:%.*]] = select i1 [[TMP37]], i32 1, i32 0 3373; CHECK-NEXT: [[TMP39:%.*]] = add i32 [[TMP34]], [[TMP38]] 3374; CHECK-NEXT: [[TMP40:%.*]] = mul i32 [[TMP39]], [[TMP26]] 3375; CHECK-NEXT: [[TMP41:%.*]] = sub i32 [[TMP25]], [[TMP40]] 3376; CHECK-NEXT: [[TMP42:%.*]] = and i32 [[TMP41]], 32767 3377; CHECK-NEXT: [[TMP43:%.*]] = trunc i32 [[TMP42]] to i15 3378; CHECK-NEXT: [[TMP44:%.*]] = insertelement <3 x i15> [[TMP22]], i15 [[TMP43]], i64 1 3379; CHECK-NEXT: [[TMP45:%.*]] = extractelement <3 x i15> [[X]], i64 2 3380; CHECK-NEXT: [[TMP46:%.*]] = extractelement <3 x i15> [[Y]], i64 2 3381; CHECK-NEXT: [[TMP47:%.*]] = zext i15 [[TMP45]] to i32 3382; CHECK-NEXT: [[TMP48:%.*]] = zext i15 [[TMP46]] to i32 3383; CHECK-NEXT: [[TMP49:%.*]] = uitofp i32 [[TMP47]] to float 3384; CHECK-NEXT: [[TMP50:%.*]] = uitofp i32 [[TMP48]] to float 3385; CHECK-NEXT: [[TMP51:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP50]]) 3386; CHECK-NEXT: [[TMP52:%.*]] = fmul fast float [[TMP49]], [[TMP51]] 3387; CHECK-NEXT: [[TMP53:%.*]] = call fast float @llvm.trunc.f32(float [[TMP52]]) 3388; CHECK-NEXT: [[TMP54:%.*]] = fneg fast float [[TMP53]] 3389; CHECK-NEXT: [[TMP55:%.*]] = call fast 
float @llvm.amdgcn.fmad.ftz.f32(float [[TMP54]], float [[TMP50]], float [[TMP49]]) 3390; CHECK-NEXT: [[TMP56:%.*]] = fptoui float [[TMP53]] to i32 3391; CHECK-NEXT: [[TMP57:%.*]] = call fast float @llvm.fabs.f32(float [[TMP55]]) 3392; CHECK-NEXT: [[TMP58:%.*]] = call fast float @llvm.fabs.f32(float [[TMP50]]) 3393; CHECK-NEXT: [[TMP59:%.*]] = fcmp fast oge float [[TMP57]], [[TMP58]] 3394; CHECK-NEXT: [[TMP60:%.*]] = select i1 [[TMP59]], i32 1, i32 0 3395; CHECK-NEXT: [[TMP61:%.*]] = add i32 [[TMP56]], [[TMP60]] 3396; CHECK-NEXT: [[TMP62:%.*]] = mul i32 [[TMP61]], [[TMP48]] 3397; CHECK-NEXT: [[TMP63:%.*]] = sub i32 [[TMP47]], [[TMP62]] 3398; CHECK-NEXT: [[TMP64:%.*]] = and i32 [[TMP63]], 32767 3399; CHECK-NEXT: [[TMP65:%.*]] = trunc i32 [[TMP64]] to i15 3400; CHECK-NEXT: [[TMP66:%.*]] = insertelement <3 x i15> [[TMP44]], i15 [[TMP65]], i64 2 3401; CHECK-NEXT: store <3 x i15> [[TMP66]], <3 x i15> addrspace(1)* [[OUT:%.*]], align 8 3402; CHECK-NEXT: ret void 3403; 3404; GCN-LABEL: urem_v3i15: 3405; GCN: ; %bb.0: 3406; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 3407; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb 3408; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 3409; GCN-NEXT: s_mov_b32 s7, 0xf000 3410; GCN-NEXT: s_mov_b32 s6, -1 3411; GCN-NEXT: s_waitcnt lgkmcnt(0) 3412; GCN-NEXT: v_mov_b32_e32 v0, s2 3413; GCN-NEXT: v_alignbit_b32 v0, s3, v0, 30 3414; GCN-NEXT: s_movk_i32 s3, 0x7fff 3415; GCN-NEXT: s_and_b32 s10, s0, s3 3416; GCN-NEXT: v_cvt_f32_u32_e32 v1, s10 3417; GCN-NEXT: s_and_b32 s9, s2, s3 3418; GCN-NEXT: v_cvt_f32_u32_e32 v3, s9 3419; GCN-NEXT: v_mov_b32_e32 v2, s0 3420; GCN-NEXT: v_rcp_iflag_f32_e32 v4, v1 3421; GCN-NEXT: v_alignbit_b32 v2, s1, v2, 30 3422; GCN-NEXT: s_bfe_u32 s1, s0, 0xf000f 3423; GCN-NEXT: v_cvt_f32_u32_e32 v5, s1 3424; GCN-NEXT: v_mul_f32_e32 v4, v3, v4 3425; GCN-NEXT: v_trunc_f32_e32 v4, v4 3426; GCN-NEXT: v_mad_f32 v3, -v4, v1, v3 3427; GCN-NEXT: v_cvt_u32_f32_e32 v4, v4 3428; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v1 3429; GCN-NEXT: 
s_bfe_u32 s10, s2, 0xf000f 3430; GCN-NEXT: v_cvt_f32_u32_e32 v3, s10 3431; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v4, vcc 3432; GCN-NEXT: v_mul_lo_u32 v1, v1, s0 3433; GCN-NEXT: v_rcp_iflag_f32_e32 v4, v5 3434; GCN-NEXT: v_and_b32_e32 v2, s3, v2 3435; GCN-NEXT: v_and_b32_e32 v0, s3, v0 3436; GCN-NEXT: v_sub_i32_e32 v6, vcc, s2, v1 3437; GCN-NEXT: v_mul_f32_e32 v1, v3, v4 3438; GCN-NEXT: v_cvt_f32_u32_e32 v4, v2 3439; GCN-NEXT: v_cvt_f32_u32_e32 v7, v0 3440; GCN-NEXT: v_trunc_f32_e32 v1, v1 3441; GCN-NEXT: v_mad_f32 v3, -v1, v5, v3 3442; GCN-NEXT: v_rcp_iflag_f32_e32 v8, v4 3443; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v5 3444; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 3445; GCN-NEXT: s_lshr_b32 s0, s0, 15 3446; GCN-NEXT: v_mul_f32_e32 v3, v7, v8 3447; GCN-NEXT: v_trunc_f32_e32 v3, v3 3448; GCN-NEXT: v_cvt_u32_f32_e32 v5, v3 3449; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 3450; GCN-NEXT: v_mad_f32 v3, -v3, v4, v7 3451; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v4 3452; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc 3453; GCN-NEXT: v_mul_lo_u32 v1, v1, s0 3454; GCN-NEXT: v_mul_lo_u32 v2, v3, v2 3455; GCN-NEXT: s_lshr_b32 s8, s2, 15 3456; GCN-NEXT: v_sub_i32_e32 v3, vcc, s8, v1 3457; GCN-NEXT: v_subrev_i32_e32 v0, vcc, v2, v0 3458; GCN-NEXT: v_and_b32_e32 v3, s3, v3 3459; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], 30 3460; GCN-NEXT: v_and_b32_e32 v2, s3, v6 3461; GCN-NEXT: v_lshlrev_b32_e32 v3, 15, v3 3462; GCN-NEXT: v_or_b32_e32 v2, v2, v3 3463; GCN-NEXT: v_or_b32_e32 v0, v2, v0 3464; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 3465; GCN-NEXT: s_waitcnt expcnt(0) 3466; GCN-NEXT: v_and_b32_e32 v0, 0x1fff, v1 3467; GCN-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:4 3468; GCN-NEXT: s_endpgm 3469 %r = urem <3 x i15> %x, %y 3470 store <3 x i15> %r, <3 x i15> addrspace(1)* %out 3471 ret void 3472} 3473 3474define amdgpu_kernel void @sdiv_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x, <3 x i15> %y) { 3475; CHECK-LABEL: @sdiv_v3i15( 3476; CHECK-NEXT: [[TMP1:%.*]] = extractelement 
<3 x i15> [[X:%.*]], i64 0 3477; CHECK-NEXT: [[TMP2:%.*]] = extractelement <3 x i15> [[Y:%.*]], i64 0 3478; CHECK-NEXT: [[TMP3:%.*]] = sext i15 [[TMP1]] to i32 3479; CHECK-NEXT: [[TMP4:%.*]] = sext i15 [[TMP2]] to i32 3480; CHECK-NEXT: [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]] 3481; CHECK-NEXT: [[TMP6:%.*]] = ashr i32 [[TMP5]], 30 3482; CHECK-NEXT: [[TMP7:%.*]] = or i32 [[TMP6]], 1 3483; CHECK-NEXT: [[TMP8:%.*]] = sitofp i32 [[TMP3]] to float 3484; CHECK-NEXT: [[TMP9:%.*]] = sitofp i32 [[TMP4]] to float 3485; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]]) 3486; CHECK-NEXT: [[TMP11:%.*]] = fmul fast float [[TMP8]], [[TMP10]] 3487; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.trunc.f32(float [[TMP11]]) 3488; CHECK-NEXT: [[TMP13:%.*]] = fneg fast float [[TMP12]] 3489; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP13]], float [[TMP9]], float [[TMP8]]) 3490; CHECK-NEXT: [[TMP15:%.*]] = fptosi float [[TMP12]] to i32 3491; CHECK-NEXT: [[TMP16:%.*]] = call fast float @llvm.fabs.f32(float [[TMP14]]) 3492; CHECK-NEXT: [[TMP17:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]]) 3493; CHECK-NEXT: [[TMP18:%.*]] = fcmp fast oge float [[TMP16]], [[TMP17]] 3494; CHECK-NEXT: [[TMP19:%.*]] = select i1 [[TMP18]], i32 [[TMP7]], i32 0 3495; CHECK-NEXT: [[TMP20:%.*]] = add i32 [[TMP15]], [[TMP19]] 3496; CHECK-NEXT: [[TMP21:%.*]] = shl i32 [[TMP20]], 17 3497; CHECK-NEXT: [[TMP22:%.*]] = ashr i32 [[TMP21]], 17 3498; CHECK-NEXT: [[TMP23:%.*]] = trunc i32 [[TMP22]] to i15 3499; CHECK-NEXT: [[TMP24:%.*]] = insertelement <3 x i15> undef, i15 [[TMP23]], i64 0 3500; CHECK-NEXT: [[TMP25:%.*]] = extractelement <3 x i15> [[X]], i64 1 3501; CHECK-NEXT: [[TMP26:%.*]] = extractelement <3 x i15> [[Y]], i64 1 3502; CHECK-NEXT: [[TMP27:%.*]] = sext i15 [[TMP25]] to i32 3503; CHECK-NEXT: [[TMP28:%.*]] = sext i15 [[TMP26]] to i32 3504; CHECK-NEXT: [[TMP29:%.*]] = xor i32 [[TMP27]], [[TMP28]] 3505; CHECK-NEXT: [[TMP30:%.*]] = ashr 
i32 [[TMP29]], 30 3506; CHECK-NEXT: [[TMP31:%.*]] = or i32 [[TMP30]], 1 3507; CHECK-NEXT: [[TMP32:%.*]] = sitofp i32 [[TMP27]] to float 3508; CHECK-NEXT: [[TMP33:%.*]] = sitofp i32 [[TMP28]] to float 3509; CHECK-NEXT: [[TMP34:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP33]]) 3510; CHECK-NEXT: [[TMP35:%.*]] = fmul fast float [[TMP32]], [[TMP34]] 3511; CHECK-NEXT: [[TMP36:%.*]] = call fast float @llvm.trunc.f32(float [[TMP35]]) 3512; CHECK-NEXT: [[TMP37:%.*]] = fneg fast float [[TMP36]] 3513; CHECK-NEXT: [[TMP38:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP37]], float [[TMP33]], float [[TMP32]]) 3514; CHECK-NEXT: [[TMP39:%.*]] = fptosi float [[TMP36]] to i32 3515; CHECK-NEXT: [[TMP40:%.*]] = call fast float @llvm.fabs.f32(float [[TMP38]]) 3516; CHECK-NEXT: [[TMP41:%.*]] = call fast float @llvm.fabs.f32(float [[TMP33]]) 3517; CHECK-NEXT: [[TMP42:%.*]] = fcmp fast oge float [[TMP40]], [[TMP41]] 3518; CHECK-NEXT: [[TMP43:%.*]] = select i1 [[TMP42]], i32 [[TMP31]], i32 0 3519; CHECK-NEXT: [[TMP44:%.*]] = add i32 [[TMP39]], [[TMP43]] 3520; CHECK-NEXT: [[TMP45:%.*]] = shl i32 [[TMP44]], 17 3521; CHECK-NEXT: [[TMP46:%.*]] = ashr i32 [[TMP45]], 17 3522; CHECK-NEXT: [[TMP47:%.*]] = trunc i32 [[TMP46]] to i15 3523; CHECK-NEXT: [[TMP48:%.*]] = insertelement <3 x i15> [[TMP24]], i15 [[TMP47]], i64 1 3524; CHECK-NEXT: [[TMP49:%.*]] = extractelement <3 x i15> [[X]], i64 2 3525; CHECK-NEXT: [[TMP50:%.*]] = extractelement <3 x i15> [[Y]], i64 2 3526; CHECK-NEXT: [[TMP51:%.*]] = sext i15 [[TMP49]] to i32 3527; CHECK-NEXT: [[TMP52:%.*]] = sext i15 [[TMP50]] to i32 3528; CHECK-NEXT: [[TMP53:%.*]] = xor i32 [[TMP51]], [[TMP52]] 3529; CHECK-NEXT: [[TMP54:%.*]] = ashr i32 [[TMP53]], 30 3530; CHECK-NEXT: [[TMP55:%.*]] = or i32 [[TMP54]], 1 3531; CHECK-NEXT: [[TMP56:%.*]] = sitofp i32 [[TMP51]] to float 3532; CHECK-NEXT: [[TMP57:%.*]] = sitofp i32 [[TMP52]] to float 3533; CHECK-NEXT: [[TMP58:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP57]]) 3534; 
CHECK-NEXT: [[TMP59:%.*]] = fmul fast float [[TMP56]], [[TMP58]] 3535; CHECK-NEXT: [[TMP60:%.*]] = call fast float @llvm.trunc.f32(float [[TMP59]]) 3536; CHECK-NEXT: [[TMP61:%.*]] = fneg fast float [[TMP60]] 3537; CHECK-NEXT: [[TMP62:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP61]], float [[TMP57]], float [[TMP56]]) 3538; CHECK-NEXT: [[TMP63:%.*]] = fptosi float [[TMP60]] to i32 3539; CHECK-NEXT: [[TMP64:%.*]] = call fast float @llvm.fabs.f32(float [[TMP62]]) 3540; CHECK-NEXT: [[TMP65:%.*]] = call fast float @llvm.fabs.f32(float [[TMP57]]) 3541; CHECK-NEXT: [[TMP66:%.*]] = fcmp fast oge float [[TMP64]], [[TMP65]] 3542; CHECK-NEXT: [[TMP67:%.*]] = select i1 [[TMP66]], i32 [[TMP55]], i32 0 3543; CHECK-NEXT: [[TMP68:%.*]] = add i32 [[TMP63]], [[TMP67]] 3544; CHECK-NEXT: [[TMP69:%.*]] = shl i32 [[TMP68]], 17 3545; CHECK-NEXT: [[TMP70:%.*]] = ashr i32 [[TMP69]], 17 3546; CHECK-NEXT: [[TMP71:%.*]] = trunc i32 [[TMP70]] to i15 3547; CHECK-NEXT: [[TMP72:%.*]] = insertelement <3 x i15> [[TMP48]], i15 [[TMP71]], i64 2 3548; CHECK-NEXT: store <3 x i15> [[TMP72]], <3 x i15> addrspace(1)* [[OUT:%.*]], align 8 3549; CHECK-NEXT: ret void 3550; 3551; GCN-LABEL: sdiv_v3i15: 3552; GCN: ; %bb.0: 3553; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 3554; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb 3555; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 3556; GCN-NEXT: s_mov_b32 s7, 0xf000 3557; GCN-NEXT: s_mov_b32 s6, -1 3558; GCN-NEXT: s_waitcnt lgkmcnt(0) 3559; GCN-NEXT: v_mov_b32_e32 v0, s2 3560; GCN-NEXT: v_alignbit_b32 v0, s3, v0, 30 3561; GCN-NEXT: s_bfe_i32 s3, s0, 0xf0000 3562; GCN-NEXT: v_cvt_f32_i32_e32 v2, s3 3563; GCN-NEXT: v_mov_b32_e32 v1, s0 3564; GCN-NEXT: v_alignbit_b32 v1, s1, v1, 30 3565; GCN-NEXT: s_bfe_i32 s1, s2, 0xf0000 3566; GCN-NEXT: v_cvt_f32_i32_e32 v3, s1 3567; GCN-NEXT: v_rcp_iflag_f32_e32 v4, v2 3568; GCN-NEXT: s_xor_b32 s1, s1, s3 3569; GCN-NEXT: s_bfe_i32 s0, s0, 0xf000f 3570; GCN-NEXT: s_ashr_i32 s1, s1, 30 3571; GCN-NEXT: v_mul_f32_e32 v4, 
v3, v4 3572; GCN-NEXT: v_trunc_f32_e32 v4, v4 3573; GCN-NEXT: v_mad_f32 v3, -v4, v2, v3 3574; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v2| 3575; GCN-NEXT: v_cvt_i32_f32_e32 v4, v4 3576; GCN-NEXT: v_cvt_f32_i32_e32 v3, s0 3577; GCN-NEXT: s_or_b32 s1, s1, 1 3578; GCN-NEXT: v_mov_b32_e32 v5, s1 3579; GCN-NEXT: v_cndmask_b32_e32 v2, 0, v5, vcc 3580; GCN-NEXT: s_bfe_i32 s1, s2, 0xf000f 3581; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 3582; GCN-NEXT: v_cvt_f32_i32_e32 v4, s1 3583; GCN-NEXT: v_rcp_iflag_f32_e32 v5, v3 3584; GCN-NEXT: s_xor_b32 s0, s1, s0 3585; GCN-NEXT: v_bfe_i32 v1, v1, 0, 15 3586; GCN-NEXT: s_ashr_i32 s0, s0, 30 3587; GCN-NEXT: v_mul_f32_e32 v5, v4, v5 3588; GCN-NEXT: v_trunc_f32_e32 v5, v5 3589; GCN-NEXT: v_mad_f32 v4, -v5, v3, v4 3590; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, |v3| 3591; GCN-NEXT: v_cvt_i32_f32_e32 v5, v5 3592; GCN-NEXT: v_cvt_f32_i32_e32 v4, v1 3593; GCN-NEXT: s_or_b32 s0, s0, 1 3594; GCN-NEXT: v_mov_b32_e32 v6, s0 3595; GCN-NEXT: v_cndmask_b32_e32 v3, 0, v6, vcc 3596; GCN-NEXT: v_bfe_i32 v0, v0, 0, 15 3597; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5 3598; GCN-NEXT: v_cvt_f32_i32_e32 v5, v0 3599; GCN-NEXT: v_rcp_iflag_f32_e32 v6, v4 3600; GCN-NEXT: v_xor_b32_e32 v0, v0, v1 3601; GCN-NEXT: v_ashrrev_i32_e32 v0, 30, v0 3602; GCN-NEXT: v_or_b32_e32 v0, 1, v0 3603; GCN-NEXT: v_mul_f32_e32 v1, v5, v6 3604; GCN-NEXT: v_trunc_f32_e32 v1, v1 3605; GCN-NEXT: v_mad_f32 v5, -v1, v4, v5 3606; GCN-NEXT: v_cvt_i32_f32_e32 v1, v1 3607; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v5|, |v4| 3608; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc 3609; GCN-NEXT: s_movk_i32 s0, 0x7fff 3610; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 3611; GCN-NEXT: v_and_b32_e32 v3, s0, v3 3612; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], 30 3613; GCN-NEXT: v_and_b32_e32 v2, s0, v2 3614; GCN-NEXT: v_lshlrev_b32_e32 v3, 15, v3 3615; GCN-NEXT: v_or_b32_e32 v2, v2, v3 3616; GCN-NEXT: v_or_b32_e32 v0, v2, v0 3617; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 3618; GCN-NEXT: s_waitcnt expcnt(0) 3619; GCN-NEXT: 
v_and_b32_e32 v0, 0x1fff, v1 3620; GCN-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:4 3621; GCN-NEXT: s_endpgm 3622 %r = sdiv <3 x i15> %x, %y 3623 store <3 x i15> %r, <3 x i15> addrspace(1)* %out 3624 ret void 3625} 3626 3627define amdgpu_kernel void @srem_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x, <3 x i15> %y) { 3628; CHECK-LABEL: @srem_v3i15( 3629; CHECK-NEXT: [[TMP1:%.*]] = extractelement <3 x i15> [[X:%.*]], i64 0 3630; CHECK-NEXT: [[TMP2:%.*]] = extractelement <3 x i15> [[Y:%.*]], i64 0 3631; CHECK-NEXT: [[TMP3:%.*]] = sext i15 [[TMP1]] to i32 3632; CHECK-NEXT: [[TMP4:%.*]] = sext i15 [[TMP2]] to i32 3633; CHECK-NEXT: [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]] 3634; CHECK-NEXT: [[TMP6:%.*]] = ashr i32 [[TMP5]], 30 3635; CHECK-NEXT: [[TMP7:%.*]] = or i32 [[TMP6]], 1 3636; CHECK-NEXT: [[TMP8:%.*]] = sitofp i32 [[TMP3]] to float 3637; CHECK-NEXT: [[TMP9:%.*]] = sitofp i32 [[TMP4]] to float 3638; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]]) 3639; CHECK-NEXT: [[TMP11:%.*]] = fmul fast float [[TMP8]], [[TMP10]] 3640; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.trunc.f32(float [[TMP11]]) 3641; CHECK-NEXT: [[TMP13:%.*]] = fneg fast float [[TMP12]] 3642; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP13]], float [[TMP9]], float [[TMP8]]) 3643; CHECK-NEXT: [[TMP15:%.*]] = fptosi float [[TMP12]] to i32 3644; CHECK-NEXT: [[TMP16:%.*]] = call fast float @llvm.fabs.f32(float [[TMP14]]) 3645; CHECK-NEXT: [[TMP17:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]]) 3646; CHECK-NEXT: [[TMP18:%.*]] = fcmp fast oge float [[TMP16]], [[TMP17]] 3647; CHECK-NEXT: [[TMP19:%.*]] = select i1 [[TMP18]], i32 [[TMP7]], i32 0 3648; CHECK-NEXT: [[TMP20:%.*]] = add i32 [[TMP15]], [[TMP19]] 3649; CHECK-NEXT: [[TMP21:%.*]] = mul i32 [[TMP20]], [[TMP4]] 3650; CHECK-NEXT: [[TMP22:%.*]] = sub i32 [[TMP3]], [[TMP21]] 3651; CHECK-NEXT: [[TMP23:%.*]] = shl i32 [[TMP22]], 17 3652; CHECK-NEXT: [[TMP24:%.*]] = 
ashr i32 [[TMP23]], 17 3653; CHECK-NEXT: [[TMP25:%.*]] = trunc i32 [[TMP24]] to i15 3654; CHECK-NEXT: [[TMP26:%.*]] = insertelement <3 x i15> undef, i15 [[TMP25]], i64 0 3655; CHECK-NEXT: [[TMP27:%.*]] = extractelement <3 x i15> [[X]], i64 1 3656; CHECK-NEXT: [[TMP28:%.*]] = extractelement <3 x i15> [[Y]], i64 1 3657; CHECK-NEXT: [[TMP29:%.*]] = sext i15 [[TMP27]] to i32 3658; CHECK-NEXT: [[TMP30:%.*]] = sext i15 [[TMP28]] to i32 3659; CHECK-NEXT: [[TMP31:%.*]] = xor i32 [[TMP29]], [[TMP30]] 3660; CHECK-NEXT: [[TMP32:%.*]] = ashr i32 [[TMP31]], 30 3661; CHECK-NEXT: [[TMP33:%.*]] = or i32 [[TMP32]], 1 3662; CHECK-NEXT: [[TMP34:%.*]] = sitofp i32 [[TMP29]] to float 3663; CHECK-NEXT: [[TMP35:%.*]] = sitofp i32 [[TMP30]] to float 3664; CHECK-NEXT: [[TMP36:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP35]]) 3665; CHECK-NEXT: [[TMP37:%.*]] = fmul fast float [[TMP34]], [[TMP36]] 3666; CHECK-NEXT: [[TMP38:%.*]] = call fast float @llvm.trunc.f32(float [[TMP37]]) 3667; CHECK-NEXT: [[TMP39:%.*]] = fneg fast float [[TMP38]] 3668; CHECK-NEXT: [[TMP40:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP39]], float [[TMP35]], float [[TMP34]]) 3669; CHECK-NEXT: [[TMP41:%.*]] = fptosi float [[TMP38]] to i32 3670; CHECK-NEXT: [[TMP42:%.*]] = call fast float @llvm.fabs.f32(float [[TMP40]]) 3671; CHECK-NEXT: [[TMP43:%.*]] = call fast float @llvm.fabs.f32(float [[TMP35]]) 3672; CHECK-NEXT: [[TMP44:%.*]] = fcmp fast oge float [[TMP42]], [[TMP43]] 3673; CHECK-NEXT: [[TMP45:%.*]] = select i1 [[TMP44]], i32 [[TMP33]], i32 0 3674; CHECK-NEXT: [[TMP46:%.*]] = add i32 [[TMP41]], [[TMP45]] 3675; CHECK-NEXT: [[TMP47:%.*]] = mul i32 [[TMP46]], [[TMP30]] 3676; CHECK-NEXT: [[TMP48:%.*]] = sub i32 [[TMP29]], [[TMP47]] 3677; CHECK-NEXT: [[TMP49:%.*]] = shl i32 [[TMP48]], 17 3678; CHECK-NEXT: [[TMP50:%.*]] = ashr i32 [[TMP49]], 17 3679; CHECK-NEXT: [[TMP51:%.*]] = trunc i32 [[TMP50]] to i15 3680; CHECK-NEXT: [[TMP52:%.*]] = insertelement <3 x i15> [[TMP26]], i15 [[TMP51]], i64 1 
3681; CHECK-NEXT: [[TMP53:%.*]] = extractelement <3 x i15> [[X]], i64 2 3682; CHECK-NEXT: [[TMP54:%.*]] = extractelement <3 x i15> [[Y]], i64 2 3683; CHECK-NEXT: [[TMP55:%.*]] = sext i15 [[TMP53]] to i32 3684; CHECK-NEXT: [[TMP56:%.*]] = sext i15 [[TMP54]] to i32 3685; CHECK-NEXT: [[TMP57:%.*]] = xor i32 [[TMP55]], [[TMP56]] 3686; CHECK-NEXT: [[TMP58:%.*]] = ashr i32 [[TMP57]], 30 3687; CHECK-NEXT: [[TMP59:%.*]] = or i32 [[TMP58]], 1 3688; CHECK-NEXT: [[TMP60:%.*]] = sitofp i32 [[TMP55]] to float 3689; CHECK-NEXT: [[TMP61:%.*]] = sitofp i32 [[TMP56]] to float 3690; CHECK-NEXT: [[TMP62:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP61]]) 3691; CHECK-NEXT: [[TMP63:%.*]] = fmul fast float [[TMP60]], [[TMP62]] 3692; CHECK-NEXT: [[TMP64:%.*]] = call fast float @llvm.trunc.f32(float [[TMP63]]) 3693; CHECK-NEXT: [[TMP65:%.*]] = fneg fast float [[TMP64]] 3694; CHECK-NEXT: [[TMP66:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP65]], float [[TMP61]], float [[TMP60]]) 3695; CHECK-NEXT: [[TMP67:%.*]] = fptosi float [[TMP64]] to i32 3696; CHECK-NEXT: [[TMP68:%.*]] = call fast float @llvm.fabs.f32(float [[TMP66]]) 3697; CHECK-NEXT: [[TMP69:%.*]] = call fast float @llvm.fabs.f32(float [[TMP61]]) 3698; CHECK-NEXT: [[TMP70:%.*]] = fcmp fast oge float [[TMP68]], [[TMP69]] 3699; CHECK-NEXT: [[TMP71:%.*]] = select i1 [[TMP70]], i32 [[TMP59]], i32 0 3700; CHECK-NEXT: [[TMP72:%.*]] = add i32 [[TMP67]], [[TMP71]] 3701; CHECK-NEXT: [[TMP73:%.*]] = mul i32 [[TMP72]], [[TMP56]] 3702; CHECK-NEXT: [[TMP74:%.*]] = sub i32 [[TMP55]], [[TMP73]] 3703; CHECK-NEXT: [[TMP75:%.*]] = shl i32 [[TMP74]], 17 3704; CHECK-NEXT: [[TMP76:%.*]] = ashr i32 [[TMP75]], 17 3705; CHECK-NEXT: [[TMP77:%.*]] = trunc i32 [[TMP76]] to i15 3706; CHECK-NEXT: [[TMP78:%.*]] = insertelement <3 x i15> [[TMP52]], i15 [[TMP77]], i64 2 3707; CHECK-NEXT: store <3 x i15> [[TMP78]], <3 x i15> addrspace(1)* [[OUT:%.*]], align 8 3708; CHECK-NEXT: ret void 3709; 3710; GCN-LABEL: srem_v3i15: 3711; GCN: ; %bb.0: 
3712; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 3713; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb 3714; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 3715; GCN-NEXT: s_mov_b32 s7, 0xf000 3716; GCN-NEXT: s_mov_b32 s6, -1 3717; GCN-NEXT: s_waitcnt lgkmcnt(0) 3718; GCN-NEXT: v_mov_b32_e32 v0, s2 3719; GCN-NEXT: v_alignbit_b32 v0, s3, v0, 30 3720; GCN-NEXT: s_movk_i32 s3, 0x7fff 3721; GCN-NEXT: s_and_b32 s11, s0, s3 3722; GCN-NEXT: s_bfe_i32 s11, s11, 0xf0000 3723; GCN-NEXT: v_cvt_f32_i32_e32 v2, s11 3724; GCN-NEXT: s_and_b32 s9, s2, s3 3725; GCN-NEXT: s_bfe_i32 s9, s9, 0xf0000 3726; GCN-NEXT: v_cvt_f32_i32_e32 v3, s9 3727; GCN-NEXT: v_rcp_iflag_f32_e32 v4, v2 3728; GCN-NEXT: s_xor_b32 s9, s9, s11 3729; GCN-NEXT: s_ashr_i32 s9, s9, 30 3730; GCN-NEXT: s_or_b32 s9, s9, 1 3731; GCN-NEXT: v_mul_f32_e32 v4, v3, v4 3732; GCN-NEXT: v_trunc_f32_e32 v4, v4 3733; GCN-NEXT: v_mad_f32 v3, -v4, v2, v3 3734; GCN-NEXT: v_cvt_i32_f32_e32 v4, v4 3735; GCN-NEXT: v_mov_b32_e32 v5, s9 3736; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v2| 3737; GCN-NEXT: v_cndmask_b32_e32 v2, 0, v5, vcc 3738; GCN-NEXT: v_mov_b32_e32 v1, s0 3739; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 3740; GCN-NEXT: s_bfe_u32 s12, s0, 0xf000f 3741; GCN-NEXT: v_alignbit_b32 v1, s1, v1, 30 3742; GCN-NEXT: v_mul_lo_u32 v2, v2, s0 3743; GCN-NEXT: s_lshr_b32 s1, s0, 15 3744; GCN-NEXT: s_bfe_i32 s0, s12, 0xf0000 3745; GCN-NEXT: v_cvt_f32_i32_e32 v3, s0 3746; GCN-NEXT: s_bfe_u32 s10, s2, 0xf000f 3747; GCN-NEXT: v_sub_i32_e32 v2, vcc, s2, v2 3748; GCN-NEXT: s_lshr_b32 s8, s2, 15 3749; GCN-NEXT: s_bfe_i32 s2, s10, 0xf0000 3750; GCN-NEXT: v_cvt_f32_i32_e32 v4, s2 3751; GCN-NEXT: v_rcp_iflag_f32_e32 v5, v3 3752; GCN-NEXT: s_xor_b32 s0, s2, s0 3753; GCN-NEXT: s_ashr_i32 s0, s0, 30 3754; GCN-NEXT: s_or_b32 s0, s0, 1 3755; GCN-NEXT: v_mul_f32_e32 v5, v4, v5 3756; GCN-NEXT: v_trunc_f32_e32 v5, v5 3757; GCN-NEXT: v_mad_f32 v4, -v5, v3, v4 3758; GCN-NEXT: v_cvt_i32_f32_e32 v5, v5 3759; GCN-NEXT: v_and_b32_e32 v1, s3, v1 3760; GCN-NEXT: 
v_cmp_ge_f32_e64 vcc, |v4|, |v3| 3761; GCN-NEXT: v_mov_b32_e32 v6, s0 3762; GCN-NEXT: v_cndmask_b32_e32 v3, 0, v6, vcc 3763; GCN-NEXT: v_bfe_i32 v4, v1, 0, 15 3764; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5 3765; GCN-NEXT: v_cvt_f32_i32_e32 v5, v4 3766; GCN-NEXT: v_and_b32_e32 v0, s3, v0 3767; GCN-NEXT: v_bfe_i32 v6, v0, 0, 15 3768; GCN-NEXT: v_cvt_f32_i32_e32 v7, v6 3769; GCN-NEXT: v_rcp_iflag_f32_e32 v8, v5 3770; GCN-NEXT: v_xor_b32_e32 v4, v6, v4 3771; GCN-NEXT: v_ashrrev_i32_e32 v4, 30, v4 3772; GCN-NEXT: v_or_b32_e32 v4, 1, v4 3773; GCN-NEXT: v_mul_f32_e32 v6, v7, v8 3774; GCN-NEXT: v_trunc_f32_e32 v6, v6 3775; GCN-NEXT: v_mad_f32 v7, -v6, v5, v7 3776; GCN-NEXT: v_cvt_i32_f32_e32 v6, v6 3777; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v7|, |v5| 3778; GCN-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc 3779; GCN-NEXT: v_mul_lo_u32 v3, v3, s1 3780; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v6 3781; GCN-NEXT: v_mul_lo_u32 v1, v4, v1 3782; GCN-NEXT: v_and_b32_e32 v2, s3, v2 3783; GCN-NEXT: v_sub_i32_e32 v3, vcc, s8, v3 3784; GCN-NEXT: v_and_b32_e32 v3, s3, v3 3785; GCN-NEXT: v_subrev_i32_e32 v0, vcc, v1, v0 3786; GCN-NEXT: v_lshlrev_b32_e32 v3, 15, v3 3787; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], 30 3788; GCN-NEXT: v_or_b32_e32 v2, v2, v3 3789; GCN-NEXT: v_or_b32_e32 v0, v2, v0 3790; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 3791; GCN-NEXT: s_waitcnt expcnt(0) 3792; GCN-NEXT: v_and_b32_e32 v0, 0x1fff, v1 3793; GCN-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:4 3794; GCN-NEXT: s_endpgm 3795 %r = srem <3 x i15> %x, %y 3796 store <3 x i15> %r, <3 x i15> addrspace(1)* %out 3797 ret void 3798} 3799 3800define amdgpu_kernel void @udiv_i32_oddk_denom(i32 addrspace(1)* %out, i32 %x) { 3801; CHECK-LABEL: @udiv_i32_oddk_denom( 3802; CHECK-NEXT: [[R:%.*]] = udiv i32 [[X:%.*]], 1235195 3803; CHECK-NEXT: store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4 3804; CHECK-NEXT: ret void 3805; 3806; GCN-LABEL: udiv_i32_oddk_denom: 3807; GCN: ; %bb.0: 3808; GCN-NEXT: s_load_dwordx2 s[4:5], 
s[0:1], 0x9 3809; GCN-NEXT: s_load_dword s0, s[0:1], 0xb 3810; GCN-NEXT: v_mov_b32_e32 v0, 0xb2a50881 3811; GCN-NEXT: s_mov_b32 s7, 0xf000 3812; GCN-NEXT: s_mov_b32 s6, -1 3813; GCN-NEXT: s_waitcnt lgkmcnt(0) 3814; GCN-NEXT: v_mul_hi_u32 v0, s0, v0 3815; GCN-NEXT: v_sub_i32_e32 v1, vcc, s0, v0 3816; GCN-NEXT: v_lshrrev_b32_e32 v1, 1, v1 3817; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 3818; GCN-NEXT: v_lshrrev_b32_e32 v0, 20, v0 3819; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 3820; GCN-NEXT: s_endpgm 3821 %r = udiv i32 %x, 1235195 3822 store i32 %r, i32 addrspace(1)* %out 3823 ret void 3824} 3825 3826define amdgpu_kernel void @udiv_i32_pow2k_denom(i32 addrspace(1)* %out, i32 %x) { 3827; CHECK-LABEL: @udiv_i32_pow2k_denom( 3828; CHECK-NEXT: [[R:%.*]] = udiv i32 [[X:%.*]], 4096 3829; CHECK-NEXT: store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4 3830; CHECK-NEXT: ret void 3831; 3832; GCN-LABEL: udiv_i32_pow2k_denom: 3833; GCN: ; %bb.0: 3834; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 3835; GCN-NEXT: s_load_dword s0, s[0:1], 0xb 3836; GCN-NEXT: s_mov_b32 s7, 0xf000 3837; GCN-NEXT: s_mov_b32 s6, -1 3838; GCN-NEXT: s_waitcnt lgkmcnt(0) 3839; GCN-NEXT: s_lshr_b32 s0, s0, 12 3840; GCN-NEXT: v_mov_b32_e32 v0, s0 3841; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 3842; GCN-NEXT: s_endpgm 3843 %r = udiv i32 %x, 4096 3844 store i32 %r, i32 addrspace(1)* %out 3845 ret void 3846} 3847 3848define amdgpu_kernel void @udiv_i32_pow2_shl_denom(i32 addrspace(1)* %out, i32 %x, i32 %y) { 3849; CHECK-LABEL: @udiv_i32_pow2_shl_denom( 3850; CHECK-NEXT: [[SHL_Y:%.*]] = shl i32 4096, [[Y:%.*]] 3851; CHECK-NEXT: [[R:%.*]] = udiv i32 [[X:%.*]], [[SHL_Y]] 3852; CHECK-NEXT: store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4 3853; CHECK-NEXT: ret void 3854; 3855; GCN-LABEL: udiv_i32_pow2_shl_denom: 3856; GCN: ; %bb.0: 3857; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 3858; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 3859; GCN-NEXT: s_mov_b32 s7, 0xf000 3860; GCN-NEXT: 
s_mov_b32 s6, -1 3861; GCN-NEXT: s_waitcnt lgkmcnt(0) 3862; GCN-NEXT: s_add_i32 s1, s1, 12 3863; GCN-NEXT: s_lshr_b32 s0, s0, s1 3864; GCN-NEXT: v_mov_b32_e32 v0, s0 3865; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 3866; GCN-NEXT: s_endpgm 3867 %shl.y = shl i32 4096, %y 3868 %r = udiv i32 %x, %shl.y 3869 store i32 %r, i32 addrspace(1)* %out 3870 ret void 3871} 3872 3873define amdgpu_kernel void @udiv_v2i32_pow2k_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x) { 3874; CHECK-LABEL: @udiv_v2i32_pow2k_denom( 3875; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0 3876; CHECK-NEXT: [[TMP2:%.*]] = udiv i32 [[TMP1]], 4096 3877; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i64 0 3878; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[X]], i64 1 3879; CHECK-NEXT: [[TMP5:%.*]] = udiv i32 [[TMP4]], 4096 3880; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP5]], i64 1 3881; CHECK-NEXT: store <2 x i32> [[TMP6]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8 3882; CHECK-NEXT: ret void 3883; 3884; GCN-LABEL: udiv_v2i32_pow2k_denom: 3885; GCN: ; %bb.0: 3886; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 3887; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 3888; GCN-NEXT: s_mov_b32 s7, 0xf000 3889; GCN-NEXT: s_mov_b32 s6, -1 3890; GCN-NEXT: s_waitcnt lgkmcnt(0) 3891; GCN-NEXT: s_lshr_b32 s0, s0, 12 3892; GCN-NEXT: s_lshr_b32 s1, s1, 12 3893; GCN-NEXT: v_mov_b32_e32 v0, s0 3894; GCN-NEXT: v_mov_b32_e32 v1, s1 3895; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 3896; GCN-NEXT: s_endpgm 3897 %r = udiv <2 x i32> %x, <i32 4096, i32 4096> 3898 store <2 x i32> %r, <2 x i32> addrspace(1)* %out 3899 ret void 3900} 3901 3902define amdgpu_kernel void @udiv_v2i32_mixed_pow2k_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x) { 3903; CHECK-LABEL: @udiv_v2i32_mixed_pow2k_denom( 3904; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0 3905; CHECK-NEXT: [[TMP2:%.*]] = udiv i32 [[TMP1]], 4096 3906; 
CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i64 0 3907; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[X]], i64 1 3908; CHECK-NEXT: [[TMP5:%.*]] = udiv i32 [[TMP4]], 4095 3909; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP5]], i64 1 3910; CHECK-NEXT: store <2 x i32> [[TMP6]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8 3911; CHECK-NEXT: ret void 3912; 3913; GCN-LABEL: udiv_v2i32_mixed_pow2k_denom: 3914; GCN: ; %bb.0: 3915; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 3916; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 3917; GCN-NEXT: v_mov_b32_e32 v0, 0x100101 3918; GCN-NEXT: s_mov_b32 s7, 0xf000 3919; GCN-NEXT: s_mov_b32 s6, -1 3920; GCN-NEXT: s_waitcnt lgkmcnt(0) 3921; GCN-NEXT: v_mul_hi_u32 v0, s1, v0 3922; GCN-NEXT: s_lshr_b32 s0, s0, 12 3923; GCN-NEXT: v_sub_i32_e32 v1, vcc, s1, v0 3924; GCN-NEXT: v_lshrrev_b32_e32 v1, 1, v1 3925; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 3926; GCN-NEXT: v_lshrrev_b32_e32 v1, 11, v0 3927; GCN-NEXT: v_mov_b32_e32 v0, s0 3928; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 3929; GCN-NEXT: s_endpgm 3930 %r = udiv <2 x i32> %x, <i32 4096, i32 4095> 3931 store <2 x i32> %r, <2 x i32> addrspace(1)* %out 3932 ret void 3933} 3934 3935define amdgpu_kernel void @udiv_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x, <2 x i32> %y) { 3936; CHECK-LABEL: @udiv_v2i32_pow2_shl_denom( 3937; CHECK-NEXT: [[SHL_Y:%.*]] = shl <2 x i32> <i32 4096, i32 4096>, [[Y:%.*]] 3938; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0 3939; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 0 3940; CHECK-NEXT: [[TMP3:%.*]] = uitofp i32 [[TMP2]] to float 3941; CHECK-NEXT: [[TMP4:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP3]]) 3942; CHECK-NEXT: [[TMP5:%.*]] = fmul fast float [[TMP4]], 0x41EFFFFFC0000000 3943; CHECK-NEXT: [[TMP6:%.*]] = fptoui float [[TMP5]] to i32 3944; CHECK-NEXT: [[TMP7:%.*]] = sub i32 0, [[TMP2]] 3945; CHECK-NEXT: 
[[TMP8:%.*]] = mul i32 [[TMP7]], [[TMP6]] 3946; CHECK-NEXT: [[TMP9:%.*]] = zext i32 [[TMP6]] to i64 3947; CHECK-NEXT: [[TMP10:%.*]] = zext i32 [[TMP8]] to i64 3948; CHECK-NEXT: [[TMP11:%.*]] = mul i64 [[TMP9]], [[TMP10]] 3949; CHECK-NEXT: [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32 3950; CHECK-NEXT: [[TMP13:%.*]] = lshr i64 [[TMP11]], 32 3951; CHECK-NEXT: [[TMP14:%.*]] = trunc i64 [[TMP13]] to i32 3952; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP6]], [[TMP14]] 3953; CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP1]] to i64 3954; CHECK-NEXT: [[TMP17:%.*]] = zext i32 [[TMP15]] to i64 3955; CHECK-NEXT: [[TMP18:%.*]] = mul i64 [[TMP16]], [[TMP17]] 3956; CHECK-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32 3957; CHECK-NEXT: [[TMP20:%.*]] = lshr i64 [[TMP18]], 32 3958; CHECK-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32 3959; CHECK-NEXT: [[TMP22:%.*]] = mul i32 [[TMP21]], [[TMP2]] 3960; CHECK-NEXT: [[TMP23:%.*]] = sub i32 [[TMP1]], [[TMP22]] 3961; CHECK-NEXT: [[TMP24:%.*]] = icmp uge i32 [[TMP23]], [[TMP2]] 3962; CHECK-NEXT: [[TMP25:%.*]] = add i32 [[TMP21]], 1 3963; CHECK-NEXT: [[TMP26:%.*]] = select i1 [[TMP24]], i32 [[TMP25]], i32 [[TMP21]] 3964; CHECK-NEXT: [[TMP27:%.*]] = sub i32 [[TMP23]], [[TMP2]] 3965; CHECK-NEXT: [[TMP28:%.*]] = select i1 [[TMP24]], i32 [[TMP27]], i32 [[TMP23]] 3966; CHECK-NEXT: [[TMP29:%.*]] = icmp uge i32 [[TMP28]], [[TMP2]] 3967; CHECK-NEXT: [[TMP30:%.*]] = add i32 [[TMP26]], 1 3968; CHECK-NEXT: [[TMP31:%.*]] = select i1 [[TMP29]], i32 [[TMP30]], i32 [[TMP26]] 3969; CHECK-NEXT: [[TMP32:%.*]] = insertelement <2 x i32> undef, i32 [[TMP31]], i64 0 3970; CHECK-NEXT: [[TMP33:%.*]] = extractelement <2 x i32> [[X]], i64 1 3971; CHECK-NEXT: [[TMP34:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 1 3972; CHECK-NEXT: [[TMP35:%.*]] = uitofp i32 [[TMP34]] to float 3973; CHECK-NEXT: [[TMP36:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP35]]) 3974; CHECK-NEXT: [[TMP37:%.*]] = fmul fast float [[TMP36]], 0x41EFFFFFC0000000 3975; CHECK-NEXT: 
[[TMP38:%.*]] = fptoui float [[TMP37]] to i32 3976; CHECK-NEXT: [[TMP39:%.*]] = sub i32 0, [[TMP34]] 3977; CHECK-NEXT: [[TMP40:%.*]] = mul i32 [[TMP39]], [[TMP38]] 3978; CHECK-NEXT: [[TMP41:%.*]] = zext i32 [[TMP38]] to i64 3979; CHECK-NEXT: [[TMP42:%.*]] = zext i32 [[TMP40]] to i64 3980; CHECK-NEXT: [[TMP43:%.*]] = mul i64 [[TMP41]], [[TMP42]] 3981; CHECK-NEXT: [[TMP44:%.*]] = trunc i64 [[TMP43]] to i32 3982; CHECK-NEXT: [[TMP45:%.*]] = lshr i64 [[TMP43]], 32 3983; CHECK-NEXT: [[TMP46:%.*]] = trunc i64 [[TMP45]] to i32 3984; CHECK-NEXT: [[TMP47:%.*]] = add i32 [[TMP38]], [[TMP46]] 3985; CHECK-NEXT: [[TMP48:%.*]] = zext i32 [[TMP33]] to i64 3986; CHECK-NEXT: [[TMP49:%.*]] = zext i32 [[TMP47]] to i64 3987; CHECK-NEXT: [[TMP50:%.*]] = mul i64 [[TMP48]], [[TMP49]] 3988; CHECK-NEXT: [[TMP51:%.*]] = trunc i64 [[TMP50]] to i32 3989; CHECK-NEXT: [[TMP52:%.*]] = lshr i64 [[TMP50]], 32 3990; CHECK-NEXT: [[TMP53:%.*]] = trunc i64 [[TMP52]] to i32 3991; CHECK-NEXT: [[TMP54:%.*]] = mul i32 [[TMP53]], [[TMP34]] 3992; CHECK-NEXT: [[TMP55:%.*]] = sub i32 [[TMP33]], [[TMP54]] 3993; CHECK-NEXT: [[TMP56:%.*]] = icmp uge i32 [[TMP55]], [[TMP34]] 3994; CHECK-NEXT: [[TMP57:%.*]] = add i32 [[TMP53]], 1 3995; CHECK-NEXT: [[TMP58:%.*]] = select i1 [[TMP56]], i32 [[TMP57]], i32 [[TMP53]] 3996; CHECK-NEXT: [[TMP59:%.*]] = sub i32 [[TMP55]], [[TMP34]] 3997; CHECK-NEXT: [[TMP60:%.*]] = select i1 [[TMP56]], i32 [[TMP59]], i32 [[TMP55]] 3998; CHECK-NEXT: [[TMP61:%.*]] = icmp uge i32 [[TMP60]], [[TMP34]] 3999; CHECK-NEXT: [[TMP62:%.*]] = add i32 [[TMP58]], 1 4000; CHECK-NEXT: [[TMP63:%.*]] = select i1 [[TMP61]], i32 [[TMP62]], i32 [[TMP58]] 4001; CHECK-NEXT: [[TMP64:%.*]] = insertelement <2 x i32> [[TMP32]], i32 [[TMP63]], i64 1 4002; CHECK-NEXT: store <2 x i32> [[TMP64]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8 4003; CHECK-NEXT: ret void 4004; 4005; GCN-LABEL: udiv_v2i32_pow2_shl_denom: 4006; GCN: ; %bb.0: 4007; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd 4008; GCN-NEXT: s_movk_i32 s4, 
0x1000 4009; GCN-NEXT: s_mov_b32 s7, 0xf000 4010; GCN-NEXT: s_mov_b32 s6, -1 4011; GCN-NEXT: s_waitcnt lgkmcnt(0) 4012; GCN-NEXT: s_lshl_b32 s8, s4, s2 4013; GCN-NEXT: v_cvt_f32_u32_e32 v0, s8 4014; GCN-NEXT: s_lshl_b32 s9, s4, s3 4015; GCN-NEXT: v_cvt_f32_u32_e32 v1, s9 4016; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 4017; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb 4018; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 4019; GCN-NEXT: s_mov_b32 s0, 0x4f7ffffe 4020; GCN-NEXT: v_rcp_iflag_f32_e32 v1, v1 4021; GCN-NEXT: v_mul_f32_e32 v0, s0, v0 4022; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 4023; GCN-NEXT: v_mul_f32_e32 v1, s0, v1 4024; GCN-NEXT: s_sub_i32 s0, 0, s8 4025; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 4026; GCN-NEXT: v_mul_lo_u32 v2, s0, v0 4027; GCN-NEXT: s_sub_i32 s0, 0, s9 4028; GCN-NEXT: v_mul_lo_u32 v3, s0, v1 4029; GCN-NEXT: v_mul_hi_u32 v2, v0, v2 4030; GCN-NEXT: v_mul_hi_u32 v3, v1, v3 4031; GCN-NEXT: v_add_i32_e32 v0, vcc, v2, v0 4032; GCN-NEXT: s_waitcnt lgkmcnt(0) 4033; GCN-NEXT: v_mul_hi_u32 v0, s2, v0 4034; GCN-NEXT: v_add_i32_e32 v1, vcc, v3, v1 4035; GCN-NEXT: v_mul_hi_u32 v1, s3, v1 4036; GCN-NEXT: v_mul_lo_u32 v2, v0, s8 4037; GCN-NEXT: v_add_i32_e32 v3, vcc, 1, v0 4038; GCN-NEXT: v_mul_lo_u32 v4, v1, s9 4039; GCN-NEXT: v_sub_i32_e32 v2, vcc, s2, v2 4040; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v2 4041; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v3, s[0:1] 4042; GCN-NEXT: v_subrev_i32_e32 v3, vcc, s8, v2 4043; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] 4044; GCN-NEXT: v_add_i32_e32 v3, vcc, 1, v0 4045; GCN-NEXT: v_cmp_le_u32_e32 vcc, s8, v2 4046; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 4047; GCN-NEXT: v_sub_i32_e32 v2, vcc, s3, v4 4048; GCN-NEXT: v_add_i32_e32 v3, vcc, 1, v1 4049; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v2 4050; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] 4051; GCN-NEXT: v_subrev_i32_e32 v3, vcc, s9, v2 4052; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] 4053; GCN-NEXT: v_add_i32_e32 v3, vcc, 1, v1 4054; GCN-NEXT: v_cmp_le_u32_e32 vcc, 
s9, v2 4055; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc 4056; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 4057; GCN-NEXT: s_endpgm 4058 %shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y 4059 %r = udiv <2 x i32> %x, %shl.y 4060 store <2 x i32> %r, <2 x i32> addrspace(1)* %out 4061 ret void 4062} 4063 4064define amdgpu_kernel void @urem_i32_oddk_denom(i32 addrspace(1)* %out, i32 %x) { 4065; CHECK-LABEL: @urem_i32_oddk_denom( 4066; CHECK-NEXT: [[R:%.*]] = urem i32 [[X:%.*]], 1235195 4067; CHECK-NEXT: store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4 4068; CHECK-NEXT: ret void 4069; 4070; GCN-LABEL: urem_i32_oddk_denom: 4071; GCN: ; %bb.0: 4072; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 4073; GCN-NEXT: s_load_dword s0, s[0:1], 0xb 4074; GCN-NEXT: v_mov_b32_e32 v0, 0xb2a50881 4075; GCN-NEXT: s_mov_b32 s7, 0xf000 4076; GCN-NEXT: s_mov_b32 s6, -1 4077; GCN-NEXT: s_waitcnt lgkmcnt(0) 4078; GCN-NEXT: v_mul_hi_u32 v0, s0, v0 4079; GCN-NEXT: v_sub_i32_e32 v1, vcc, s0, v0 4080; GCN-NEXT: v_lshrrev_b32_e32 v1, 1, v1 4081; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 4082; GCN-NEXT: v_lshrrev_b32_e32 v0, 20, v0 4083; GCN-NEXT: v_mul_u32_u24_e32 v0, 0x12d8fb, v0 4084; GCN-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 4085; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 4086; GCN-NEXT: s_endpgm 4087 %r = urem i32 %x, 1235195 4088 store i32 %r, i32 addrspace(1)* %out 4089 ret void 4090} 4091 4092define amdgpu_kernel void @urem_i32_pow2k_denom(i32 addrspace(1)* %out, i32 %x) { 4093; CHECK-LABEL: @urem_i32_pow2k_denom( 4094; CHECK-NEXT: [[R:%.*]] = urem i32 [[X:%.*]], 4096 4095; CHECK-NEXT: store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4 4096; CHECK-NEXT: ret void 4097; 4098; GCN-LABEL: urem_i32_pow2k_denom: 4099; GCN: ; %bb.0: 4100; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 4101; GCN-NEXT: s_load_dword s0, s[0:1], 0xb 4102; GCN-NEXT: s_mov_b32 s7, 0xf000 4103; GCN-NEXT: s_mov_b32 s6, -1 4104; GCN-NEXT: s_waitcnt lgkmcnt(0) 4105; GCN-NEXT: s_and_b32 s0, s0, 0xfff 4106; 
GCN-NEXT: v_mov_b32_e32 v0, s0 4107; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 4108; GCN-NEXT: s_endpgm 4109 %r = urem i32 %x, 4096 4110 store i32 %r, i32 addrspace(1)* %out 4111 ret void 4112} 4113 4114define amdgpu_kernel void @urem_i32_pow2_shl_denom(i32 addrspace(1)* %out, i32 %x, i32 %y) { 4115; CHECK-LABEL: @urem_i32_pow2_shl_denom( 4116; CHECK-NEXT: [[SHL_Y:%.*]] = shl i32 4096, [[Y:%.*]] 4117; CHECK-NEXT: [[R:%.*]] = urem i32 [[X:%.*]], [[SHL_Y]] 4118; CHECK-NEXT: store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4 4119; CHECK-NEXT: ret void 4120; 4121; GCN-LABEL: urem_i32_pow2_shl_denom: 4122; GCN: ; %bb.0: 4123; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 4124; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 4125; GCN-NEXT: s_mov_b32 s7, 0xf000 4126; GCN-NEXT: s_mov_b32 s6, -1 4127; GCN-NEXT: s_waitcnt lgkmcnt(0) 4128; GCN-NEXT: s_lshl_b32 s1, 0x1000, s1 4129; GCN-NEXT: s_add_i32 s1, s1, -1 4130; GCN-NEXT: s_and_b32 s0, s0, s1 4131; GCN-NEXT: v_mov_b32_e32 v0, s0 4132; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 4133; GCN-NEXT: s_endpgm 4134 %shl.y = shl i32 4096, %y 4135 %r = urem i32 %x, %shl.y 4136 store i32 %r, i32 addrspace(1)* %out 4137 ret void 4138} 4139 4140define amdgpu_kernel void @urem_v2i32_pow2k_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x) { 4141; CHECK-LABEL: @urem_v2i32_pow2k_denom( 4142; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0 4143; CHECK-NEXT: [[TMP2:%.*]] = urem i32 [[TMP1]], 4096 4144; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i64 0 4145; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[X]], i64 1 4146; CHECK-NEXT: [[TMP5:%.*]] = urem i32 [[TMP4]], 4096 4147; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP5]], i64 1 4148; CHECK-NEXT: store <2 x i32> [[TMP6]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8 4149; CHECK-NEXT: ret void 4150; 4151; GCN-LABEL: urem_v2i32_pow2k_denom: 4152; GCN: ; %bb.0: 4153; GCN-NEXT: s_load_dwordx2 s[4:5], 
s[0:1], 0x9 4154; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 4155; GCN-NEXT: s_movk_i32 s2, 0xfff 4156; GCN-NEXT: s_mov_b32 s7, 0xf000 4157; GCN-NEXT: s_mov_b32 s6, -1 4158; GCN-NEXT: s_waitcnt lgkmcnt(0) 4159; GCN-NEXT: s_and_b32 s0, s0, s2 4160; GCN-NEXT: s_and_b32 s1, s1, s2 4161; GCN-NEXT: v_mov_b32_e32 v0, s0 4162; GCN-NEXT: v_mov_b32_e32 v1, s1 4163; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 4164; GCN-NEXT: s_endpgm 4165 %r = urem <2 x i32> %x, <i32 4096, i32 4096> 4166 store <2 x i32> %r, <2 x i32> addrspace(1)* %out 4167 ret void 4168} 4169 4170define amdgpu_kernel void @urem_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x, <2 x i32> %y) { 4171; CHECK-LABEL: @urem_v2i32_pow2_shl_denom( 4172; CHECK-NEXT: [[SHL_Y:%.*]] = shl <2 x i32> <i32 4096, i32 4096>, [[Y:%.*]] 4173; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0 4174; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 0 4175; CHECK-NEXT: [[TMP3:%.*]] = uitofp i32 [[TMP2]] to float 4176; CHECK-NEXT: [[TMP4:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP3]]) 4177; CHECK-NEXT: [[TMP5:%.*]] = fmul fast float [[TMP4]], 0x41EFFFFFC0000000 4178; CHECK-NEXT: [[TMP6:%.*]] = fptoui float [[TMP5]] to i32 4179; CHECK-NEXT: [[TMP7:%.*]] = sub i32 0, [[TMP2]] 4180; CHECK-NEXT: [[TMP8:%.*]] = mul i32 [[TMP7]], [[TMP6]] 4181; CHECK-NEXT: [[TMP9:%.*]] = zext i32 [[TMP6]] to i64 4182; CHECK-NEXT: [[TMP10:%.*]] = zext i32 [[TMP8]] to i64 4183; CHECK-NEXT: [[TMP11:%.*]] = mul i64 [[TMP9]], [[TMP10]] 4184; CHECK-NEXT: [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32 4185; CHECK-NEXT: [[TMP13:%.*]] = lshr i64 [[TMP11]], 32 4186; CHECK-NEXT: [[TMP14:%.*]] = trunc i64 [[TMP13]] to i32 4187; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP6]], [[TMP14]] 4188; CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP1]] to i64 4189; CHECK-NEXT: [[TMP17:%.*]] = zext i32 [[TMP15]] to i64 4190; CHECK-NEXT: [[TMP18:%.*]] = mul i64 [[TMP16]], [[TMP17]] 4191; CHECK-NEXT: [[TMP19:%.*]] = 
trunc i64 [[TMP18]] to i32 4192; CHECK-NEXT: [[TMP20:%.*]] = lshr i64 [[TMP18]], 32 4193; CHECK-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32 4194; CHECK-NEXT: [[TMP22:%.*]] = mul i32 [[TMP21]], [[TMP2]] 4195; CHECK-NEXT: [[TMP23:%.*]] = sub i32 [[TMP1]], [[TMP22]] 4196; CHECK-NEXT: [[TMP24:%.*]] = icmp uge i32 [[TMP23]], [[TMP2]] 4197; CHECK-NEXT: [[TMP25:%.*]] = sub i32 [[TMP23]], [[TMP2]] 4198; CHECK-NEXT: [[TMP26:%.*]] = select i1 [[TMP24]], i32 [[TMP25]], i32 [[TMP23]] 4199; CHECK-NEXT: [[TMP27:%.*]] = icmp uge i32 [[TMP26]], [[TMP2]] 4200; CHECK-NEXT: [[TMP28:%.*]] = sub i32 [[TMP26]], [[TMP2]] 4201; CHECK-NEXT: [[TMP29:%.*]] = select i1 [[TMP27]], i32 [[TMP28]], i32 [[TMP26]] 4202; CHECK-NEXT: [[TMP30:%.*]] = insertelement <2 x i32> undef, i32 [[TMP29]], i64 0 4203; CHECK-NEXT: [[TMP31:%.*]] = extractelement <2 x i32> [[X]], i64 1 4204; CHECK-NEXT: [[TMP32:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 1 4205; CHECK-NEXT: [[TMP33:%.*]] = uitofp i32 [[TMP32]] to float 4206; CHECK-NEXT: [[TMP34:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP33]]) 4207; CHECK-NEXT: [[TMP35:%.*]] = fmul fast float [[TMP34]], 0x41EFFFFFC0000000 4208; CHECK-NEXT: [[TMP36:%.*]] = fptoui float [[TMP35]] to i32 4209; CHECK-NEXT: [[TMP37:%.*]] = sub i32 0, [[TMP32]] 4210; CHECK-NEXT: [[TMP38:%.*]] = mul i32 [[TMP37]], [[TMP36]] 4211; CHECK-NEXT: [[TMP39:%.*]] = zext i32 [[TMP36]] to i64 4212; CHECK-NEXT: [[TMP40:%.*]] = zext i32 [[TMP38]] to i64 4213; CHECK-NEXT: [[TMP41:%.*]] = mul i64 [[TMP39]], [[TMP40]] 4214; CHECK-NEXT: [[TMP42:%.*]] = trunc i64 [[TMP41]] to i32 4215; CHECK-NEXT: [[TMP43:%.*]] = lshr i64 [[TMP41]], 32 4216; CHECK-NEXT: [[TMP44:%.*]] = trunc i64 [[TMP43]] to i32 4217; CHECK-NEXT: [[TMP45:%.*]] = add i32 [[TMP36]], [[TMP44]] 4218; CHECK-NEXT: [[TMP46:%.*]] = zext i32 [[TMP31]] to i64 4219; CHECK-NEXT: [[TMP47:%.*]] = zext i32 [[TMP45]] to i64 4220; CHECK-NEXT: [[TMP48:%.*]] = mul i64 [[TMP46]], [[TMP47]] 4221; CHECK-NEXT: [[TMP49:%.*]] = trunc i64 
[[TMP48]] to i32 4222; CHECK-NEXT: [[TMP50:%.*]] = lshr i64 [[TMP48]], 32 4223; CHECK-NEXT: [[TMP51:%.*]] = trunc i64 [[TMP50]] to i32 4224; CHECK-NEXT: [[TMP52:%.*]] = mul i32 [[TMP51]], [[TMP32]] 4225; CHECK-NEXT: [[TMP53:%.*]] = sub i32 [[TMP31]], [[TMP52]] 4226; CHECK-NEXT: [[TMP54:%.*]] = icmp uge i32 [[TMP53]], [[TMP32]] 4227; CHECK-NEXT: [[TMP55:%.*]] = sub i32 [[TMP53]], [[TMP32]] 4228; CHECK-NEXT: [[TMP56:%.*]] = select i1 [[TMP54]], i32 [[TMP55]], i32 [[TMP53]] 4229; CHECK-NEXT: [[TMP57:%.*]] = icmp uge i32 [[TMP56]], [[TMP32]] 4230; CHECK-NEXT: [[TMP58:%.*]] = sub i32 [[TMP56]], [[TMP32]] 4231; CHECK-NEXT: [[TMP59:%.*]] = select i1 [[TMP57]], i32 [[TMP58]], i32 [[TMP56]] 4232; CHECK-NEXT: [[TMP60:%.*]] = insertelement <2 x i32> [[TMP30]], i32 [[TMP59]], i64 1 4233; CHECK-NEXT: store <2 x i32> [[TMP60]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8 4234; CHECK-NEXT: ret void 4235; 4236; GCN-LABEL: urem_v2i32_pow2_shl_denom: 4237; GCN: ; %bb.0: 4238; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd 4239; GCN-NEXT: s_movk_i32 s4, 0x1000 4240; GCN-NEXT: s_mov_b32 s7, 0xf000 4241; GCN-NEXT: s_mov_b32 s6, -1 4242; GCN-NEXT: s_waitcnt lgkmcnt(0) 4243; GCN-NEXT: s_lshl_b32 s8, s4, s2 4244; GCN-NEXT: v_cvt_f32_u32_e32 v0, s8 4245; GCN-NEXT: s_lshl_b32 s3, s4, s3 4246; GCN-NEXT: v_cvt_f32_u32_e32 v1, s3 4247; GCN-NEXT: s_mov_b32 s4, 0x4f7ffffe 4248; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 4249; GCN-NEXT: s_sub_i32 s2, 0, s8 4250; GCN-NEXT: v_rcp_iflag_f32_e32 v1, v1 4251; GCN-NEXT: v_mul_f32_e32 v0, s4, v0 4252; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 4253; GCN-NEXT: v_mul_f32_e32 v1, s4, v1 4254; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 4255; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 4256; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 4257; GCN-NEXT: v_mul_lo_u32 v2, s2, v0 4258; GCN-NEXT: s_sub_i32 s2, 0, s3 4259; GCN-NEXT: v_mul_lo_u32 v3, s2, v1 4260; GCN-NEXT: v_mul_hi_u32 v2, v0, v2 4261; GCN-NEXT: v_mul_hi_u32 v3, v1, v3 4262; GCN-NEXT: v_add_i32_e32 v0, vcc, v2, v0 4263; 
GCN-NEXT: s_waitcnt lgkmcnt(0) 4264; GCN-NEXT: v_mul_hi_u32 v0, s0, v0 4265; GCN-NEXT: v_add_i32_e32 v1, vcc, v3, v1 4266; GCN-NEXT: v_mul_hi_u32 v1, s1, v1 4267; GCN-NEXT: v_mul_lo_u32 v0, v0, s8 4268; GCN-NEXT: v_mul_lo_u32 v1, v1, s3 4269; GCN-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 4270; GCN-NEXT: v_subrev_i32_e32 v2, vcc, s8, v0 4271; GCN-NEXT: v_cmp_le_u32_e32 vcc, s8, v0 4272; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 4273; GCN-NEXT: v_subrev_i32_e32 v2, vcc, s8, v0 4274; GCN-NEXT: v_cmp_le_u32_e32 vcc, s8, v0 4275; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 4276; GCN-NEXT: v_sub_i32_e32 v1, vcc, s1, v1 4277; GCN-NEXT: v_subrev_i32_e32 v2, vcc, s3, v1 4278; GCN-NEXT: v_cmp_le_u32_e32 vcc, s3, v1 4279; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 4280; GCN-NEXT: v_subrev_i32_e32 v2, vcc, s3, v1 4281; GCN-NEXT: v_cmp_le_u32_e32 vcc, s3, v1 4282; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 4283; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 4284; GCN-NEXT: s_endpgm 4285 %shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y 4286 %r = urem <2 x i32> %x, %shl.y 4287 store <2 x i32> %r, <2 x i32> addrspace(1)* %out 4288 ret void 4289} 4290 4291define amdgpu_kernel void @sdiv_i32_oddk_denom(i32 addrspace(1)* %out, i32 %x) { 4292; CHECK-LABEL: @sdiv_i32_oddk_denom( 4293; CHECK-NEXT: [[R:%.*]] = sdiv i32 [[X:%.*]], 1235195 4294; CHECK-NEXT: store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4 4295; CHECK-NEXT: ret void 4296; 4297; GCN-LABEL: sdiv_i32_oddk_denom: 4298; GCN: ; %bb.0: 4299; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 4300; GCN-NEXT: s_load_dword s0, s[0:1], 0xb 4301; GCN-NEXT: v_mov_b32_e32 v0, 0xd9528441 4302; GCN-NEXT: s_mov_b32 s7, 0xf000 4303; GCN-NEXT: s_mov_b32 s6, -1 4304; GCN-NEXT: s_waitcnt lgkmcnt(0) 4305; GCN-NEXT: v_mul_hi_i32 v0, s0, v0 4306; GCN-NEXT: v_add_i32_e32 v0, vcc, s0, v0 4307; GCN-NEXT: v_lshrrev_b32_e32 v1, 31, v0 4308; GCN-NEXT: v_ashrrev_i32_e32 v0, 20, v0 4309; GCN-NEXT: v_add_i32_e32 v0, vcc, v1, v0 4310; GCN-NEXT: 
buffer_store_dword v0, off, s[4:7], 0 4311; GCN-NEXT: s_endpgm 4312 %r = sdiv i32 %x, 1235195 4313 store i32 %r, i32 addrspace(1)* %out 4314 ret void 4315} 4316 4317define amdgpu_kernel void @sdiv_i32_pow2k_denom(i32 addrspace(1)* %out, i32 %x) { 4318; CHECK-LABEL: @sdiv_i32_pow2k_denom( 4319; CHECK-NEXT: [[R:%.*]] = sdiv i32 [[X:%.*]], 4096 4320; CHECK-NEXT: store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4 4321; CHECK-NEXT: ret void 4322; 4323; GCN-LABEL: sdiv_i32_pow2k_denom: 4324; GCN: ; %bb.0: 4325; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 4326; GCN-NEXT: s_load_dword s0, s[0:1], 0xb 4327; GCN-NEXT: s_mov_b32 s7, 0xf000 4328; GCN-NEXT: s_mov_b32 s6, -1 4329; GCN-NEXT: s_waitcnt lgkmcnt(0) 4330; GCN-NEXT: s_ashr_i32 s1, s0, 31 4331; GCN-NEXT: s_lshr_b32 s1, s1, 20 4332; GCN-NEXT: s_add_i32 s0, s0, s1 4333; GCN-NEXT: s_ashr_i32 s0, s0, 12 4334; GCN-NEXT: v_mov_b32_e32 v0, s0 4335; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 4336; GCN-NEXT: s_endpgm 4337 %r = sdiv i32 %x, 4096 4338 store i32 %r, i32 addrspace(1)* %out 4339 ret void 4340} 4341 4342define amdgpu_kernel void @sdiv_i32_pow2_shl_denom(i32 addrspace(1)* %out, i32 %x, i32 %y) { 4343; CHECK-LABEL: @sdiv_i32_pow2_shl_denom( 4344; CHECK-NEXT: [[SHL_Y:%.*]] = shl i32 4096, [[Y:%.*]] 4345; CHECK-NEXT: [[R:%.*]] = sdiv i32 [[X:%.*]], [[SHL_Y]] 4346; CHECK-NEXT: store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4 4347; CHECK-NEXT: ret void 4348; 4349; GCN-LABEL: sdiv_i32_pow2_shl_denom: 4350; GCN: ; %bb.0: 4351; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb 4352; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 4353; GCN-NEXT: s_waitcnt lgkmcnt(0) 4354; GCN-NEXT: s_lshl_b32 s3, 0x1000, s3 4355; GCN-NEXT: s_ashr_i32 s4, s3, 31 4356; GCN-NEXT: s_add_i32 s3, s3, s4 4357; GCN-NEXT: s_xor_b32 s7, s3, s4 4358; GCN-NEXT: v_cvt_f32_u32_e32 v0, s7 4359; GCN-NEXT: s_sub_i32 s3, 0, s7 4360; GCN-NEXT: s_ashr_i32 s5, s2, 31 4361; GCN-NEXT: s_add_i32 s2, s2, s5 4362; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 4363; 
GCN-NEXT: s_xor_b32 s6, s2, s5 4364; GCN-NEXT: s_xor_b32 s4, s5, s4 4365; GCN-NEXT: s_mov_b32 s2, -1 4366; GCN-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 4367; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 4368; GCN-NEXT: v_mul_lo_u32 v1, s3, v0 4369; GCN-NEXT: s_mov_b32 s3, 0xf000 4370; GCN-NEXT: v_mul_hi_u32 v1, v0, v1 4371; GCN-NEXT: v_add_i32_e32 v0, vcc, v1, v0 4372; GCN-NEXT: v_mul_hi_u32 v0, s6, v0 4373; GCN-NEXT: v_mul_lo_u32 v1, v0, s7 4374; GCN-NEXT: v_add_i32_e32 v2, vcc, 1, v0 4375; GCN-NEXT: v_sub_i32_e32 v1, vcc, s6, v1 4376; GCN-NEXT: v_subrev_i32_e32 v3, vcc, s7, v1 4377; GCN-NEXT: v_cmp_le_u32_e32 vcc, s7, v1 4378; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 4379; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc 4380; GCN-NEXT: v_add_i32_e32 v2, vcc, 1, v0 4381; GCN-NEXT: v_cmp_le_u32_e32 vcc, s7, v1 4382; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 4383; GCN-NEXT: v_xor_b32_e32 v0, s4, v0 4384; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s4, v0 4385; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 4386; GCN-NEXT: s_endpgm 4387 %shl.y = shl i32 4096, %y 4388 %r = sdiv i32 %x, %shl.y 4389 store i32 %r, i32 addrspace(1)* %out 4390 ret void 4391} 4392 4393define amdgpu_kernel void @sdiv_v2i32_pow2k_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x) { 4394; CHECK-LABEL: @sdiv_v2i32_pow2k_denom( 4395; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0 4396; CHECK-NEXT: [[TMP2:%.*]] = sdiv i32 [[TMP1]], 4096 4397; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i64 0 4398; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[X]], i64 1 4399; CHECK-NEXT: [[TMP5:%.*]] = sdiv i32 [[TMP4]], 4096 4400; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP5]], i64 1 4401; CHECK-NEXT: store <2 x i32> [[TMP6]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8 4402; CHECK-NEXT: ret void 4403; 4404; GCN-LABEL: sdiv_v2i32_pow2k_denom: 4405; GCN: ; %bb.0: 4406; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 4407; GCN-NEXT: s_load_dwordx2 
s[0:1], s[0:1], 0xb 4408; GCN-NEXT: s_mov_b32 s7, 0xf000 4409; GCN-NEXT: s_mov_b32 s6, -1 4410; GCN-NEXT: s_waitcnt lgkmcnt(0) 4411; GCN-NEXT: s_ashr_i32 s2, s0, 31 4412; GCN-NEXT: s_lshr_b32 s2, s2, 20 4413; GCN-NEXT: s_ashr_i32 s3, s1, 31 4414; GCN-NEXT: s_add_i32 s0, s0, s2 4415; GCN-NEXT: s_lshr_b32 s2, s3, 20 4416; GCN-NEXT: s_add_i32 s1, s1, s2 4417; GCN-NEXT: s_ashr_i32 s0, s0, 12 4418; GCN-NEXT: s_ashr_i32 s1, s1, 12 4419; GCN-NEXT: v_mov_b32_e32 v0, s0 4420; GCN-NEXT: v_mov_b32_e32 v1, s1 4421; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 4422; GCN-NEXT: s_endpgm 4423 %r = sdiv <2 x i32> %x, <i32 4096, i32 4096> 4424 store <2 x i32> %r, <2 x i32> addrspace(1)* %out 4425 ret void 4426} 4427 4428define amdgpu_kernel void @ssdiv_v2i32_mixed_pow2k_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x) { 4429; CHECK-LABEL: @ssdiv_v2i32_mixed_pow2k_denom( 4430; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0 4431; CHECK-NEXT: [[TMP2:%.*]] = sdiv i32 [[TMP1]], 4096 4432; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i64 0 4433; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[X]], i64 1 4434; CHECK-NEXT: [[TMP5:%.*]] = sdiv i32 [[TMP4]], 4095 4435; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP5]], i64 1 4436; CHECK-NEXT: store <2 x i32> [[TMP6]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8 4437; CHECK-NEXT: ret void 4438; 4439; GCN-LABEL: ssdiv_v2i32_mixed_pow2k_denom: 4440; GCN: ; %bb.0: 4441; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 4442; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 4443; GCN-NEXT: v_mov_b32_e32 v0, 0x80080081 4444; GCN-NEXT: s_mov_b32 s7, 0xf000 4445; GCN-NEXT: s_mov_b32 s6, -1 4446; GCN-NEXT: s_waitcnt lgkmcnt(0) 4447; GCN-NEXT: v_mul_hi_i32 v0, s1, v0 4448; GCN-NEXT: s_ashr_i32 s2, s0, 31 4449; GCN-NEXT: s_lshr_b32 s2, s2, 20 4450; GCN-NEXT: s_add_i32 s0, s0, s2 4451; GCN-NEXT: v_add_i32_e32 v0, vcc, s1, v0 4452; GCN-NEXT: v_lshrrev_b32_e32 v1, 31, v0 4453; 
GCN-NEXT: v_ashrrev_i32_e32 v0, 11, v0 4454; GCN-NEXT: s_ashr_i32 s0, s0, 12 4455; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v0 4456; GCN-NEXT: v_mov_b32_e32 v0, s0 4457; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 4458; GCN-NEXT: s_endpgm 4459 %r = sdiv <2 x i32> %x, <i32 4096, i32 4095> 4460 store <2 x i32> %r, <2 x i32> addrspace(1)* %out 4461 ret void 4462} 4463; sdiv of <2 x i32> by (<i32 4096, i32 4096> shl %y), i.e. a divisor that is only known at run time. The opt output is expected to expand each lane into the float-reciprocal sequence (uitofp, llvm.amdgcn.rcp.f32, two compare/select correction steps), with sign handling done by ashr/add/xor before it and xor/sub after it. 4464define amdgpu_kernel void @sdiv_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x, <2 x i32> %y) { 4465; CHECK-LABEL: @sdiv_v2i32_pow2_shl_denom( 4466; CHECK-NEXT: [[SHL_Y:%.*]] = shl <2 x i32> <i32 4096, i32 4096>, [[Y:%.*]] 4467; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0 4468; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 0 4469; CHECK-NEXT: [[TMP3:%.*]] = ashr i32 [[TMP1]], 31 4470; CHECK-NEXT: [[TMP4:%.*]] = ashr i32 [[TMP2]], 31 4471; CHECK-NEXT: [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]] 4472; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[TMP1]], [[TMP3]] 4473; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[TMP2]], [[TMP4]] 4474; CHECK-NEXT: [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP3]] 4475; CHECK-NEXT: [[TMP9:%.*]] = xor i32 [[TMP7]], [[TMP4]] 4476; CHECK-NEXT: [[TMP10:%.*]] = uitofp i32 [[TMP9]] to float 4477; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP10]]) 4478; CHECK-NEXT: [[TMP12:%.*]] = fmul fast float [[TMP11]], 0x41EFFFFFC0000000 4479; CHECK-NEXT: [[TMP13:%.*]] = fptoui float [[TMP12]] to i32 4480; CHECK-NEXT: [[TMP14:%.*]] = sub i32 0, [[TMP9]] 4481; CHECK-NEXT: [[TMP15:%.*]] = mul i32 [[TMP14]], [[TMP13]] 4482; CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP13]] to i64 4483; CHECK-NEXT: [[TMP17:%.*]] = zext i32 [[TMP15]] to i64 4484; CHECK-NEXT: [[TMP18:%.*]] = mul i64 [[TMP16]], [[TMP17]] 4485; CHECK-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32 4486; CHECK-NEXT: [[TMP20:%.*]] = lshr i64 [[TMP18]], 32 4487; CHECK-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32 4488; CHECK-NEXT: [[TMP22:%.*]] = add i32 
[[TMP13]], [[TMP21]] 4489; CHECK-NEXT: [[TMP23:%.*]] = zext i32 [[TMP8]] to i64 4490; CHECK-NEXT: [[TMP24:%.*]] = zext i32 [[TMP22]] to i64 4491; CHECK-NEXT: [[TMP25:%.*]] = mul i64 [[TMP23]], [[TMP24]] 4492; CHECK-NEXT: [[TMP26:%.*]] = trunc i64 [[TMP25]] to i32 4493; CHECK-NEXT: [[TMP27:%.*]] = lshr i64 [[TMP25]], 32 4494; CHECK-NEXT: [[TMP28:%.*]] = trunc i64 [[TMP27]] to i32 4495; CHECK-NEXT: [[TMP29:%.*]] = mul i32 [[TMP28]], [[TMP9]] 4496; CHECK-NEXT: [[TMP30:%.*]] = sub i32 [[TMP8]], [[TMP29]] 4497; CHECK-NEXT: [[TMP31:%.*]] = icmp uge i32 [[TMP30]], [[TMP9]] 4498; CHECK-NEXT: [[TMP32:%.*]] = add i32 [[TMP28]], 1 4499; CHECK-NEXT: [[TMP33:%.*]] = select i1 [[TMP31]], i32 [[TMP32]], i32 [[TMP28]] 4500; CHECK-NEXT: [[TMP34:%.*]] = sub i32 [[TMP30]], [[TMP9]] 4501; CHECK-NEXT: [[TMP35:%.*]] = select i1 [[TMP31]], i32 [[TMP34]], i32 [[TMP30]] 4502; CHECK-NEXT: [[TMP36:%.*]] = icmp uge i32 [[TMP35]], [[TMP9]] 4503; CHECK-NEXT: [[TMP37:%.*]] = add i32 [[TMP33]], 1 4504; CHECK-NEXT: [[TMP38:%.*]] = select i1 [[TMP36]], i32 [[TMP37]], i32 [[TMP33]] 4505; CHECK-NEXT: [[TMP39:%.*]] = xor i32 [[TMP38]], [[TMP5]] 4506; CHECK-NEXT: [[TMP40:%.*]] = sub i32 [[TMP39]], [[TMP5]] 4507; CHECK-NEXT: [[TMP41:%.*]] = insertelement <2 x i32> undef, i32 [[TMP40]], i64 0 4508; CHECK-NEXT: [[TMP42:%.*]] = extractelement <2 x i32> [[X]], i64 1 4509; CHECK-NEXT: [[TMP43:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 1 4510; CHECK-NEXT: [[TMP44:%.*]] = ashr i32 [[TMP42]], 31 4511; CHECK-NEXT: [[TMP45:%.*]] = ashr i32 [[TMP43]], 31 4512; CHECK-NEXT: [[TMP46:%.*]] = xor i32 [[TMP44]], [[TMP45]] 4513; CHECK-NEXT: [[TMP47:%.*]] = add i32 [[TMP42]], [[TMP44]] 4514; CHECK-NEXT: [[TMP48:%.*]] = add i32 [[TMP43]], [[TMP45]] 4515; CHECK-NEXT: [[TMP49:%.*]] = xor i32 [[TMP47]], [[TMP44]] 4516; CHECK-NEXT: [[TMP50:%.*]] = xor i32 [[TMP48]], [[TMP45]] 4517; CHECK-NEXT: [[TMP51:%.*]] = uitofp i32 [[TMP50]] to float 4518; CHECK-NEXT: [[TMP52:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float 
[[TMP51]]) 4519; CHECK-NEXT: [[TMP53:%.*]] = fmul fast float [[TMP52]], 0x41EFFFFFC0000000 4520; CHECK-NEXT: [[TMP54:%.*]] = fptoui float [[TMP53]] to i32 4521; CHECK-NEXT: [[TMP55:%.*]] = sub i32 0, [[TMP50]] 4522; CHECK-NEXT: [[TMP56:%.*]] = mul i32 [[TMP55]], [[TMP54]] 4523; CHECK-NEXT: [[TMP57:%.*]] = zext i32 [[TMP54]] to i64 4524; CHECK-NEXT: [[TMP58:%.*]] = zext i32 [[TMP56]] to i64 4525; CHECK-NEXT: [[TMP59:%.*]] = mul i64 [[TMP57]], [[TMP58]] 4526; CHECK-NEXT: [[TMP60:%.*]] = trunc i64 [[TMP59]] to i32 4527; CHECK-NEXT: [[TMP61:%.*]] = lshr i64 [[TMP59]], 32 4528; CHECK-NEXT: [[TMP62:%.*]] = trunc i64 [[TMP61]] to i32 4529; CHECK-NEXT: [[TMP63:%.*]] = add i32 [[TMP54]], [[TMP62]] 4530; CHECK-NEXT: [[TMP64:%.*]] = zext i32 [[TMP49]] to i64 4531; CHECK-NEXT: [[TMP65:%.*]] = zext i32 [[TMP63]] to i64 4532; CHECK-NEXT: [[TMP66:%.*]] = mul i64 [[TMP64]], [[TMP65]] 4533; CHECK-NEXT: [[TMP67:%.*]] = trunc i64 [[TMP66]] to i32 4534; CHECK-NEXT: [[TMP68:%.*]] = lshr i64 [[TMP66]], 32 4535; CHECK-NEXT: [[TMP69:%.*]] = trunc i64 [[TMP68]] to i32 4536; CHECK-NEXT: [[TMP70:%.*]] = mul i32 [[TMP69]], [[TMP50]] 4537; CHECK-NEXT: [[TMP71:%.*]] = sub i32 [[TMP49]], [[TMP70]] 4538; CHECK-NEXT: [[TMP72:%.*]] = icmp uge i32 [[TMP71]], [[TMP50]] 4539; CHECK-NEXT: [[TMP73:%.*]] = add i32 [[TMP69]], 1 4540; CHECK-NEXT: [[TMP74:%.*]] = select i1 [[TMP72]], i32 [[TMP73]], i32 [[TMP69]] 4541; CHECK-NEXT: [[TMP75:%.*]] = sub i32 [[TMP71]], [[TMP50]] 4542; CHECK-NEXT: [[TMP76:%.*]] = select i1 [[TMP72]], i32 [[TMP75]], i32 [[TMP71]] 4543; CHECK-NEXT: [[TMP77:%.*]] = icmp uge i32 [[TMP76]], [[TMP50]] 4544; CHECK-NEXT: [[TMP78:%.*]] = add i32 [[TMP74]], 1 4545; CHECK-NEXT: [[TMP79:%.*]] = select i1 [[TMP77]], i32 [[TMP78]], i32 [[TMP74]] 4546; CHECK-NEXT: [[TMP80:%.*]] = xor i32 [[TMP79]], [[TMP46]] 4547; CHECK-NEXT: [[TMP81:%.*]] = sub i32 [[TMP80]], [[TMP46]] 4548; CHECK-NEXT: [[TMP82:%.*]] = insertelement <2 x i32> [[TMP41]], i32 [[TMP81]], i64 1 4549; CHECK-NEXT: store <2 x i32> 
[[TMP82]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8 4550; CHECK-NEXT: ret void 4551; 4552; GCN-LABEL: sdiv_v2i32_pow2_shl_denom: 4553; GCN: ; %bb.0: 4554; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd 4555; GCN-NEXT: s_movk_i32 s10, 0x1000 4556; GCN-NEXT: s_mov_b32 s13, 0x4f7ffffe 4557; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 4558; GCN-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xb 4559; GCN-NEXT: s_mov_b32 s7, 0xf000 4560; GCN-NEXT: s_waitcnt lgkmcnt(0) 4561; GCN-NEXT: s_lshl_b32 s2, s10, s2 4562; GCN-NEXT: s_ashr_i32 s11, s2, 31 4563; GCN-NEXT: s_add_i32 s2, s2, s11 4564; GCN-NEXT: s_xor_b32 s12, s2, s11 4565; GCN-NEXT: v_cvt_f32_u32_e32 v0, s12 4566; GCN-NEXT: s_lshl_b32 s0, s10, s3 4567; GCN-NEXT: s_sub_i32 s3, 0, s12 4568; GCN-NEXT: s_ashr_i32 s2, s0, 31 4569; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 4570; GCN-NEXT: s_add_i32 s0, s0, s2 4571; GCN-NEXT: s_xor_b32 s10, s0, s2 4572; GCN-NEXT: v_cvt_f32_u32_e32 v2, s10 4573; GCN-NEXT: v_mul_f32_e32 v0, s13, v0 4574; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 4575; GCN-NEXT: s_ashr_i32 s1, s8, 31 4576; GCN-NEXT: s_add_i32 s0, s8, s1 4577; GCN-NEXT: s_xor_b32 s0, s0, s1 4578; GCN-NEXT: v_mul_lo_u32 v1, s3, v0 4579; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v2 4580; GCN-NEXT: s_xor_b32 s3, s1, s11 4581; GCN-NEXT: s_mov_b32 s6, -1 4582; GCN-NEXT: v_mul_hi_u32 v1, v0, v1 4583; GCN-NEXT: v_add_i32_e32 v0, vcc, v1, v0 4584; GCN-NEXT: v_mul_hi_u32 v0, s0, v0 4585; GCN-NEXT: v_mul_f32_e32 v1, s13, v2 4586; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 4587; GCN-NEXT: v_mul_lo_u32 v2, v0, s12 4588; GCN-NEXT: v_add_i32_e32 v3, vcc, 1, v0 4589; GCN-NEXT: v_sub_i32_e32 v2, vcc, s0, v2 4590; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v2 4591; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v3, s[0:1] 4592; GCN-NEXT: v_subrev_i32_e32 v3, vcc, s12, v2 4593; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] 4594; GCN-NEXT: s_sub_i32 s0, 0, s10 4595; GCN-NEXT: v_mul_lo_u32 v3, s0, v1 4596; GCN-NEXT: s_ashr_i32 s0, s9, 31 4597; GCN-NEXT: s_add_i32 s1, s9, s0 4598; GCN-NEXT: 
s_xor_b32 s1, s1, s0 4599; GCN-NEXT: v_mul_hi_u32 v3, v1, v3 4600; GCN-NEXT: v_add_i32_e32 v4, vcc, 1, v0 4601; GCN-NEXT: s_xor_b32 s2, s0, s2 4602; GCN-NEXT: v_add_i32_e32 v1, vcc, v3, v1 4603; GCN-NEXT: v_mul_hi_u32 v1, s1, v1 4604; GCN-NEXT: v_cmp_le_u32_e32 vcc, s12, v2 4605; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc 4606; GCN-NEXT: v_xor_b32_e32 v0, s3, v0 4607; GCN-NEXT: v_mul_lo_u32 v2, v1, s10 4608; GCN-NEXT: v_add_i32_e32 v3, vcc, 1, v1 4609; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s3, v0 4610; GCN-NEXT: v_sub_i32_e32 v2, vcc, s1, v2 4611; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v2 4612; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] 4613; GCN-NEXT: v_subrev_i32_e32 v3, vcc, s10, v2 4614; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] 4615; GCN-NEXT: v_add_i32_e32 v3, vcc, 1, v1 4616; GCN-NEXT: v_cmp_le_u32_e32 vcc, s10, v2 4617; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc 4618; GCN-NEXT: v_xor_b32_e32 v1, s2, v1 4619; GCN-NEXT: v_subrev_i32_e32 v1, vcc, s2, v1 4620; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 4621; GCN-NEXT: s_endpgm 4622 %shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y 4623 %r = sdiv <2 x i32> %x, %shl.y 4624 store <2 x i32> %r, <2 x i32> addrspace(1)* %out 4625 ret void 4626} 4627; srem of i32 by the constant 1235195. The opt output is expected to leave the srem untouched; the llc output is expected to contain a v_mul_hi_i32 by 0xd9528441, then a multiply by 0x12d8fb (1235195) and a subtract from the dividend. 4628define amdgpu_kernel void @srem_i32_oddk_denom(i32 addrspace(1)* %out, i32 %x) { 4629; CHECK-LABEL: @srem_i32_oddk_denom( 4630; CHECK-NEXT: [[R:%.*]] = srem i32 [[X:%.*]], 1235195 4631; CHECK-NEXT: store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4 4632; CHECK-NEXT: ret void 4633; 4634; GCN-LABEL: srem_i32_oddk_denom: 4635; GCN: ; %bb.0: 4636; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 4637; GCN-NEXT: s_load_dword s0, s[0:1], 0xb 4638; GCN-NEXT: v_mov_b32_e32 v0, 0xd9528441 4639; GCN-NEXT: s_mov_b32 s7, 0xf000 4640; GCN-NEXT: s_mov_b32 s6, -1 4641; GCN-NEXT: s_waitcnt lgkmcnt(0) 4642; GCN-NEXT: v_mul_hi_i32 v0, s0, v0 4643; GCN-NEXT: v_add_i32_e32 v0, vcc, s0, v0 4644; GCN-NEXT: v_lshrrev_b32_e32 v1, 31, v0 4645; GCN-NEXT: v_ashrrev_i32_e32 
v0, 20, v0 4646; GCN-NEXT: v_add_i32_e32 v0, vcc, v1, v0 4647; GCN-NEXT: v_mul_i32_i24_e32 v0, 0x12d8fb, v0 4648; GCN-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 4649; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 4650; GCN-NEXT: s_endpgm 4651 %r = srem i32 %x, 1235195 4652 store i32 %r, i32 addrspace(1)* %out 4653 ret void 4654} 4655; srem of i32 by the constant 4096. The opt output is expected to leave the srem untouched; the llc output is expected to be a scalar s_ashr/s_lshr/s_add/s_and/s_sub sequence. 4656define amdgpu_kernel void @srem_i32_pow2k_denom(i32 addrspace(1)* %out, i32 %x) { 4657; CHECK-LABEL: @srem_i32_pow2k_denom( 4658; CHECK-NEXT: [[R:%.*]] = srem i32 [[X:%.*]], 4096 4659; CHECK-NEXT: store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4 4660; CHECK-NEXT: ret void 4661; 4662; GCN-LABEL: srem_i32_pow2k_denom: 4663; GCN: ; %bb.0: 4664; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 4665; GCN-NEXT: s_load_dword s0, s[0:1], 0xb 4666; GCN-NEXT: s_mov_b32 s7, 0xf000 4667; GCN-NEXT: s_mov_b32 s6, -1 4668; GCN-NEXT: s_waitcnt lgkmcnt(0) 4669; GCN-NEXT: s_ashr_i32 s1, s0, 31 4670; GCN-NEXT: s_lshr_b32 s1, s1, 20 4671; GCN-NEXT: s_add_i32 s1, s0, s1 4672; GCN-NEXT: s_and_b32 s1, s1, 0xfffff000 4673; GCN-NEXT: s_sub_i32 s0, s0, s1 4674; GCN-NEXT: v_mov_b32_e32 v0, s0 4675; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 4676; GCN-NEXT: s_endpgm 4677 %r = srem i32 %x, 4096 4678 store i32 %r, i32 addrspace(1)* %out 4679 ret void 4680} 4681; srem of i32 by (4096 shl %y). The opt output is expected to leave the srem untouched; the llc output is expected to contain the v_rcp_iflag_f32-based expansion with two compare/select correction steps and a final xor/sub pair. 4682define amdgpu_kernel void @srem_i32_pow2_shl_denom(i32 addrspace(1)* %out, i32 %x, i32 %y) { 4683; CHECK-LABEL: @srem_i32_pow2_shl_denom( 4684; CHECK-NEXT: [[SHL_Y:%.*]] = shl i32 4096, [[Y:%.*]] 4685; CHECK-NEXT: [[R:%.*]] = srem i32 [[X:%.*]], [[SHL_Y]] 4686; CHECK-NEXT: store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4 4687; CHECK-NEXT: ret void 4688; 4689; GCN-LABEL: srem_i32_pow2_shl_denom: 4690; GCN: ; %bb.0: 4691; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb 4692; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 4693; GCN-NEXT: s_waitcnt lgkmcnt(0) 4694; GCN-NEXT: s_lshl_b32 s3, 0x1000, s3 4695; GCN-NEXT: s_ashr_i32 s4, s3, 31 4696; GCN-NEXT: s_add_i32 s3, s3, s4 4697; GCN-NEXT: s_xor_b32 
s6, s3, s4 4698; GCN-NEXT: v_cvt_f32_u32_e32 v0, s6 4699; GCN-NEXT: s_sub_i32 s3, 0, s6 4700; GCN-NEXT: s_ashr_i32 s4, s2, 31 4701; GCN-NEXT: s_add_i32 s2, s2, s4 4702; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 4703; GCN-NEXT: s_xor_b32 s5, s2, s4 4704; GCN-NEXT: s_mov_b32 s2, -1 4705; GCN-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 4706; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 4707; GCN-NEXT: v_mul_lo_u32 v1, s3, v0 4708; GCN-NEXT: s_mov_b32 s3, 0xf000 4709; GCN-NEXT: v_mul_hi_u32 v1, v0, v1 4710; GCN-NEXT: v_add_i32_e32 v0, vcc, v1, v0 4711; GCN-NEXT: v_mul_hi_u32 v0, s5, v0 4712; GCN-NEXT: v_mul_lo_u32 v0, v0, s6 4713; GCN-NEXT: v_sub_i32_e32 v0, vcc, s5, v0 4714; GCN-NEXT: v_subrev_i32_e32 v1, vcc, s6, v0 4715; GCN-NEXT: v_cmp_le_u32_e32 vcc, s6, v0 4716; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc 4717; GCN-NEXT: v_subrev_i32_e32 v1, vcc, s6, v0 4718; GCN-NEXT: v_cmp_le_u32_e32 vcc, s6, v0 4719; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc 4720; GCN-NEXT: v_xor_b32_e32 v0, s4, v0 4721; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s4, v0 4722; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 4723; GCN-NEXT: s_endpgm 4724 %shl.y = shl i32 4096, %y 4725 %r = srem i32 %x, %shl.y 4726 store i32 %r, i32 addrspace(1)* %out 4727 ret void 4728} 4729; srem of <2 x i32> by the constant vector <i32 4096, i32 4096>. The opt output is expected to split the operation into one scalar srem by 4096 per lane; the llc output is expected to compute each lane with scalar shift/add/and/sub instructions. 4730define amdgpu_kernel void @srem_v2i32_pow2k_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x) { 4731; CHECK-LABEL: @srem_v2i32_pow2k_denom( 4732; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0 4733; CHECK-NEXT: [[TMP2:%.*]] = srem i32 [[TMP1]], 4096 4734; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i64 0 4735; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[X]], i64 1 4736; CHECK-NEXT: [[TMP5:%.*]] = srem i32 [[TMP4]], 4096 4737; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP5]], i64 1 4738; CHECK-NEXT: store <2 x i32> [[TMP6]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8 4739; CHECK-NEXT: ret void 4740; 4741; GCN-LABEL: srem_v2i32_pow2k_denom: 4742; GCN: ; %bb.0: 
4743; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 4744; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 4745; GCN-NEXT: s_movk_i32 s2, 0xf000 4746; GCN-NEXT: s_mov_b32 s7, 0xf000 4747; GCN-NEXT: s_mov_b32 s6, -1 4748; GCN-NEXT: s_waitcnt lgkmcnt(0) 4749; GCN-NEXT: s_ashr_i32 s3, s0, 31 4750; GCN-NEXT: s_lshr_b32 s3, s3, 20 4751; GCN-NEXT: s_add_i32 s3, s0, s3 4752; GCN-NEXT: s_and_b32 s3, s3, s2 4753; GCN-NEXT: s_sub_i32 s0, s0, s3 4754; GCN-NEXT: s_ashr_i32 s3, s1, 31 4755; GCN-NEXT: s_lshr_b32 s3, s3, 20 4756; GCN-NEXT: s_add_i32 s3, s1, s3 4757; GCN-NEXT: s_and_b32 s2, s3, s2 4758; GCN-NEXT: s_sub_i32 s1, s1, s2 4759; GCN-NEXT: v_mov_b32_e32 v0, s0 4760; GCN-NEXT: v_mov_b32_e32 v1, s1 4761; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 4762; GCN-NEXT: s_endpgm 4763 %r = srem <2 x i32> %x, <i32 4096, i32 4096> 4764 store <2 x i32> %r, <2 x i32> addrspace(1)* %out 4765 ret void 4766} 4767; srem of <2 x i32> by (<i32 4096, i32 4096> shl %y). The opt output is expected to expand each lane into the float-reciprocal sequence (uitofp, llvm.amdgcn.rcp.f32, two compare/select correction steps); the closing xor/sub pair uses the sign mask computed from the dividend lane. 4768define amdgpu_kernel void @srem_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x, <2 x i32> %y) { 4769; CHECK-LABEL: @srem_v2i32_pow2_shl_denom( 4770; CHECK-NEXT: [[SHL_Y:%.*]] = shl <2 x i32> <i32 4096, i32 4096>, [[Y:%.*]] 4771; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0 4772; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 0 4773; CHECK-NEXT: [[TMP3:%.*]] = ashr i32 [[TMP1]], 31 4774; CHECK-NEXT: [[TMP4:%.*]] = ashr i32 [[TMP2]], 31 4775; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[TMP1]], [[TMP3]] 4776; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[TMP2]], [[TMP4]] 4777; CHECK-NEXT: [[TMP7:%.*]] = xor i32 [[TMP5]], [[TMP3]] 4778; CHECK-NEXT: [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP4]] 4779; CHECK-NEXT: [[TMP9:%.*]] = uitofp i32 [[TMP8]] to float 4780; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]]) 4781; CHECK-NEXT: [[TMP11:%.*]] = fmul fast float [[TMP10]], 0x41EFFFFFC0000000 4782; CHECK-NEXT: [[TMP12:%.*]] = fptoui float [[TMP11]] to i32 4783; CHECK-NEXT: [[TMP13:%.*]] = sub i32 0, [[TMP8]] 
4784; CHECK-NEXT: [[TMP14:%.*]] = mul i32 [[TMP13]], [[TMP12]] 4785; CHECK-NEXT: [[TMP15:%.*]] = zext i32 [[TMP12]] to i64 4786; CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP14]] to i64 4787; CHECK-NEXT: [[TMP17:%.*]] = mul i64 [[TMP15]], [[TMP16]] 4788; CHECK-NEXT: [[TMP18:%.*]] = trunc i64 [[TMP17]] to i32 4789; CHECK-NEXT: [[TMP19:%.*]] = lshr i64 [[TMP17]], 32 4790; CHECK-NEXT: [[TMP20:%.*]] = trunc i64 [[TMP19]] to i32 4791; CHECK-NEXT: [[TMP21:%.*]] = add i32 [[TMP12]], [[TMP20]] 4792; CHECK-NEXT: [[TMP22:%.*]] = zext i32 [[TMP7]] to i64 4793; CHECK-NEXT: [[TMP23:%.*]] = zext i32 [[TMP21]] to i64 4794; CHECK-NEXT: [[TMP24:%.*]] = mul i64 [[TMP22]], [[TMP23]] 4795; CHECK-NEXT: [[TMP25:%.*]] = trunc i64 [[TMP24]] to i32 4796; CHECK-NEXT: [[TMP26:%.*]] = lshr i64 [[TMP24]], 32 4797; CHECK-NEXT: [[TMP27:%.*]] = trunc i64 [[TMP26]] to i32 4798; CHECK-NEXT: [[TMP28:%.*]] = mul i32 [[TMP27]], [[TMP8]] 4799; CHECK-NEXT: [[TMP29:%.*]] = sub i32 [[TMP7]], [[TMP28]] 4800; CHECK-NEXT: [[TMP30:%.*]] = icmp uge i32 [[TMP29]], [[TMP8]] 4801; CHECK-NEXT: [[TMP31:%.*]] = sub i32 [[TMP29]], [[TMP8]] 4802; CHECK-NEXT: [[TMP32:%.*]] = select i1 [[TMP30]], i32 [[TMP31]], i32 [[TMP29]] 4803; CHECK-NEXT: [[TMP33:%.*]] = icmp uge i32 [[TMP32]], [[TMP8]] 4804; CHECK-NEXT: [[TMP34:%.*]] = sub i32 [[TMP32]], [[TMP8]] 4805; CHECK-NEXT: [[TMP35:%.*]] = select i1 [[TMP33]], i32 [[TMP34]], i32 [[TMP32]] 4806; CHECK-NEXT: [[TMP36:%.*]] = xor i32 [[TMP35]], [[TMP3]] 4807; CHECK-NEXT: [[TMP37:%.*]] = sub i32 [[TMP36]], [[TMP3]] 4808; CHECK-NEXT: [[TMP38:%.*]] = insertelement <2 x i32> undef, i32 [[TMP37]], i64 0 4809; CHECK-NEXT: [[TMP39:%.*]] = extractelement <2 x i32> [[X]], i64 1 4810; CHECK-NEXT: [[TMP40:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 1 4811; CHECK-NEXT: [[TMP41:%.*]] = ashr i32 [[TMP39]], 31 4812; CHECK-NEXT: [[TMP42:%.*]] = ashr i32 [[TMP40]], 31 4813; CHECK-NEXT: [[TMP43:%.*]] = add i32 [[TMP39]], [[TMP41]] 4814; CHECK-NEXT: [[TMP44:%.*]] = add i32 [[TMP40]], [[TMP42]] 
4815; CHECK-NEXT: [[TMP45:%.*]] = xor i32 [[TMP43]], [[TMP41]] 4816; CHECK-NEXT: [[TMP46:%.*]] = xor i32 [[TMP44]], [[TMP42]] 4817; CHECK-NEXT: [[TMP47:%.*]] = uitofp i32 [[TMP46]] to float 4818; CHECK-NEXT: [[TMP48:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP47]]) 4819; CHECK-NEXT: [[TMP49:%.*]] = fmul fast float [[TMP48]], 0x41EFFFFFC0000000 4820; CHECK-NEXT: [[TMP50:%.*]] = fptoui float [[TMP49]] to i32 4821; CHECK-NEXT: [[TMP51:%.*]] = sub i32 0, [[TMP46]] 4822; CHECK-NEXT: [[TMP52:%.*]] = mul i32 [[TMP51]], [[TMP50]] 4823; CHECK-NEXT: [[TMP53:%.*]] = zext i32 [[TMP50]] to i64 4824; CHECK-NEXT: [[TMP54:%.*]] = zext i32 [[TMP52]] to i64 4825; CHECK-NEXT: [[TMP55:%.*]] = mul i64 [[TMP53]], [[TMP54]] 4826; CHECK-NEXT: [[TMP56:%.*]] = trunc i64 [[TMP55]] to i32 4827; CHECK-NEXT: [[TMP57:%.*]] = lshr i64 [[TMP55]], 32 4828; CHECK-NEXT: [[TMP58:%.*]] = trunc i64 [[TMP57]] to i32 4829; CHECK-NEXT: [[TMP59:%.*]] = add i32 [[TMP50]], [[TMP58]] 4830; CHECK-NEXT: [[TMP60:%.*]] = zext i32 [[TMP45]] to i64 4831; CHECK-NEXT: [[TMP61:%.*]] = zext i32 [[TMP59]] to i64 4832; CHECK-NEXT: [[TMP62:%.*]] = mul i64 [[TMP60]], [[TMP61]] 4833; CHECK-NEXT: [[TMP63:%.*]] = trunc i64 [[TMP62]] to i32 4834; CHECK-NEXT: [[TMP64:%.*]] = lshr i64 [[TMP62]], 32 4835; CHECK-NEXT: [[TMP65:%.*]] = trunc i64 [[TMP64]] to i32 4836; CHECK-NEXT: [[TMP66:%.*]] = mul i32 [[TMP65]], [[TMP46]] 4837; CHECK-NEXT: [[TMP67:%.*]] = sub i32 [[TMP45]], [[TMP66]] 4838; CHECK-NEXT: [[TMP68:%.*]] = icmp uge i32 [[TMP67]], [[TMP46]] 4839; CHECK-NEXT: [[TMP69:%.*]] = sub i32 [[TMP67]], [[TMP46]] 4840; CHECK-NEXT: [[TMP70:%.*]] = select i1 [[TMP68]], i32 [[TMP69]], i32 [[TMP67]] 4841; CHECK-NEXT: [[TMP71:%.*]] = icmp uge i32 [[TMP70]], [[TMP46]] 4842; CHECK-NEXT: [[TMP72:%.*]] = sub i32 [[TMP70]], [[TMP46]] 4843; CHECK-NEXT: [[TMP73:%.*]] = select i1 [[TMP71]], i32 [[TMP72]], i32 [[TMP70]] 4844; CHECK-NEXT: [[TMP74:%.*]] = xor i32 [[TMP73]], [[TMP41]] 4845; CHECK-NEXT: [[TMP75:%.*]] = sub i32 [[TMP74]], 
[[TMP41]] 4846; CHECK-NEXT: [[TMP76:%.*]] = insertelement <2 x i32> [[TMP38]], i32 [[TMP75]], i64 1 4847; CHECK-NEXT: store <2 x i32> [[TMP76]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8 4848; CHECK-NEXT: ret void 4849; 4850; GCN-LABEL: srem_v2i32_pow2_shl_denom: 4851; GCN: ; %bb.0: 4852; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd 4853; GCN-NEXT: s_movk_i32 s6, 0x1000 4854; GCN-NEXT: s_mov_b32 s10, 0x4f7ffffe 4855; GCN-NEXT: s_mov_b32 s7, 0xf000 4856; GCN-NEXT: s_waitcnt lgkmcnt(0) 4857; GCN-NEXT: s_lshl_b32 s2, s6, s2 4858; GCN-NEXT: s_ashr_i32 s4, s2, 31 4859; GCN-NEXT: s_add_i32 s2, s2, s4 4860; GCN-NEXT: s_xor_b32 s9, s2, s4 4861; GCN-NEXT: v_cvt_f32_u32_e32 v0, s9 4862; GCN-NEXT: s_lshl_b32 s2, s6, s3 4863; GCN-NEXT: s_ashr_i32 s6, s2, 31 4864; GCN-NEXT: s_add_i32 s2, s2, s6 4865; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 4866; GCN-NEXT: s_sub_i32 s8, 0, s9 4867; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 4868; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 4869; GCN-NEXT: v_mul_f32_e32 v0, s10, v0 4870; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 4871; GCN-NEXT: s_waitcnt lgkmcnt(0) 4872; GCN-NEXT: s_ashr_i32 s3, s0, 31 4873; GCN-NEXT: s_add_i32 s0, s0, s3 4874; GCN-NEXT: v_mul_lo_u32 v1, s8, v0 4875; GCN-NEXT: s_xor_b32 s8, s2, s6 4876; GCN-NEXT: v_cvt_f32_u32_e32 v2, s8 4877; GCN-NEXT: s_xor_b32 s0, s0, s3 4878; GCN-NEXT: v_mul_hi_u32 v1, v0, v1 4879; GCN-NEXT: s_sub_i32 s2, 0, s8 4880; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v2 4881; GCN-NEXT: s_mov_b32 s6, -1 4882; GCN-NEXT: v_add_i32_e32 v0, vcc, v1, v0 4883; GCN-NEXT: v_mul_hi_u32 v0, s0, v0 4884; GCN-NEXT: v_mul_f32_e32 v1, s10, v2 4885; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 4886; GCN-NEXT: v_mul_lo_u32 v0, v0, s9 4887; GCN-NEXT: v_mul_lo_u32 v2, s2, v1 4888; GCN-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 4889; GCN-NEXT: s_ashr_i32 s0, s1, 31 4890; GCN-NEXT: v_mul_hi_u32 v2, v1, v2 4891; GCN-NEXT: v_subrev_i32_e32 v3, vcc, s9, v0 4892; GCN-NEXT: v_cmp_le_u32_e32 vcc, s9, v0 4893; GCN-NEXT: s_add_i32 s1, s1, s0 4894; GCN-NEXT: 
v_cndmask_b32_e32 v0, v0, v3, vcc 4895; GCN-NEXT: s_xor_b32 s1, s1, s0 4896; GCN-NEXT: v_add_i32_e32 v1, vcc, v2, v1 4897; GCN-NEXT: v_mul_hi_u32 v1, s1, v1 4898; GCN-NEXT: v_subrev_i32_e32 v3, vcc, s9, v0 4899; GCN-NEXT: v_cmp_le_u32_e32 vcc, s9, v0 4900; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 4901; GCN-NEXT: v_mul_lo_u32 v1, v1, s8 4902; GCN-NEXT: v_xor_b32_e32 v0, s3, v0 4903; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s3, v0 4904; GCN-NEXT: v_sub_i32_e32 v1, vcc, s1, v1 4905; GCN-NEXT: v_subrev_i32_e32 v2, vcc, s8, v1 4906; GCN-NEXT: v_cmp_le_u32_e32 vcc, s8, v1 4907; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 4908; GCN-NEXT: v_subrev_i32_e32 v2, vcc, s8, v1 4909; GCN-NEXT: v_cmp_le_u32_e32 vcc, s8, v1 4910; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 4911; GCN-NEXT: v_xor_b32_e32 v1, s0, v1 4912; GCN-NEXT: v_subrev_i32_e32 v1, vcc, s0, v1 4913; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 4914; GCN-NEXT: s_endpgm 4915 %shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y 4916 %r = srem <2 x i32> %x, %shl.y 4917 store <2 x i32> %r, <2 x i32> addrspace(1)* %out 4918 ret void 4919} 4920; udiv of i64 by the constant 1235195949943. The opt output is expected to leave the udiv untouched; the llc output is expected to be a long inline sequence that starts from v_rcp_f32 and is built from 32-bit multiplies and add-with-carry instructions. 4921define amdgpu_kernel void @udiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) { 4922; CHECK-LABEL: @udiv_i64_oddk_denom( 4923; CHECK-NEXT: [[R:%.*]] = udiv i64 [[X:%.*]], 1235195949943 4924; CHECK-NEXT: store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4 4925; CHECK-NEXT: ret void 4926; 4927; GCN-LABEL: udiv_i64_oddk_denom: 4928; GCN: ; %bb.0: 4929; GCN-NEXT: v_mov_b32_e32 v0, 0x4f176a73 4930; GCN-NEXT: v_mov_b32_e32 v1, 0x4f800000 4931; GCN-NEXT: v_madmk_f32 v0, v1, 0x438f8000, v0 4932; GCN-NEXT: v_rcp_f32_e32 v0, v0 4933; GCN-NEXT: s_movk_i32 s2, 0xfee0 4934; GCN-NEXT: s_mov_b32 s3, 0x68958c89 4935; GCN-NEXT: v_mov_b32_e32 v8, 0 4936; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 4937; GCN-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 4938; GCN-NEXT: v_trunc_f32_e32 v1, v1 4939; GCN-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 4940; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 4941; GCN-NEXT: 
v_cvt_u32_f32_e32 v1, v1 4942; GCN-NEXT: v_mov_b32_e32 v7, 0 4943; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 4944; GCN-NEXT: v_mul_lo_u32 v2, v0, s2 4945; GCN-NEXT: v_mul_hi_u32 v3, v0, s3 4946; GCN-NEXT: v_mul_lo_u32 v4, v1, s3 4947; GCN-NEXT: s_mov_b32 s11, 0xf000 4948; GCN-NEXT: s_waitcnt lgkmcnt(0) 4949; GCN-NEXT: s_mov_b32 s8, s4 4950; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 4951; GCN-NEXT: v_mul_lo_u32 v3, v0, s3 4952; GCN-NEXT: v_add_i32_e32 v2, vcc, v4, v2 4953; GCN-NEXT: v_mul_lo_u32 v5, v0, v2 4954; GCN-NEXT: v_mul_hi_u32 v4, v0, v2 4955; GCN-NEXT: v_mul_hi_u32 v6, v0, v3 4956; GCN-NEXT: v_mul_hi_u32 v9, v1, v2 4957; GCN-NEXT: v_mul_lo_u32 v2, v1, v2 4958; GCN-NEXT: s_movk_i32 s4, 0x11e 4959; GCN-NEXT: v_add_i32_e32 v5, vcc, v6, v5 4960; GCN-NEXT: v_mul_lo_u32 v6, v1, v3 4961; GCN-NEXT: v_mul_hi_u32 v3, v1, v3 4962; GCN-NEXT: v_addc_u32_e32 v4, vcc, v8, v4, vcc 4963; GCN-NEXT: s_mov_b32 s10, -1 4964; GCN-NEXT: v_add_i32_e32 v5, vcc, v6, v5 4965; GCN-NEXT: v_addc_u32_e32 v3, vcc, v4, v3, vcc 4966; GCN-NEXT: v_addc_u32_e32 v4, vcc, v9, v7, vcc 4967; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 4968; GCN-NEXT: v_add_i32_e64 v0, s[0:1], v0, v2 4969; GCN-NEXT: v_addc_u32_e32 v3, vcc, v8, v4, vcc 4970; GCN-NEXT: v_mul_lo_u32 v4, v0, s2 4971; GCN-NEXT: v_mul_hi_u32 v5, v0, s3 4972; GCN-NEXT: v_addc_u32_e64 v2, vcc, v1, v3, s[0:1] 4973; GCN-NEXT: v_mul_lo_u32 v6, v2, s3 4974; GCN-NEXT: s_mov_b32 s2, 0x976a7377 4975; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4 4976; GCN-NEXT: v_mul_lo_u32 v5, v0, s3 4977; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v6 4978; GCN-NEXT: v_mul_lo_u32 v6, v0, v4 4979; GCN-NEXT: v_mul_hi_u32 v10, v0, v4 4980; GCN-NEXT: v_mul_hi_u32 v9, v0, v5 4981; GCN-NEXT: v_mul_hi_u32 v11, v2, v4 4982; GCN-NEXT: s_movk_i32 s3, 0x11f 4983; GCN-NEXT: s_mov_b32 s9, s5 4984; GCN-NEXT: v_add_i32_e32 v6, vcc, v9, v6 4985; GCN-NEXT: v_addc_u32_e32 v9, vcc, v8, v10, vcc 4986; GCN-NEXT: v_mul_lo_u32 v10, v2, v5 4987; GCN-NEXT: v_mul_hi_u32 v5, v2, v5 4988; GCN-NEXT: 
v_mul_lo_u32 v2, v2, v4 4989; GCN-NEXT: v_add_i32_e32 v6, vcc, v6, v10 4990; GCN-NEXT: v_addc_u32_e32 v5, vcc, v9, v5, vcc 4991; GCN-NEXT: v_addc_u32_e32 v4, vcc, v11, v7, vcc 4992; GCN-NEXT: v_add_i32_e32 v2, vcc, v5, v2 4993; GCN-NEXT: v_addc_u32_e32 v4, vcc, v8, v4, vcc 4994; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v3 4995; GCN-NEXT: v_addc_u32_e64 v1, vcc, v1, v4, s[0:1] 4996; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 4997; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 4998; GCN-NEXT: v_mul_lo_u32 v2, s6, v1 4999; GCN-NEXT: v_mul_hi_u32 v3, s6, v0 5000; GCN-NEXT: v_mul_hi_u32 v4, s6, v1 5001; GCN-NEXT: v_mul_hi_u32 v5, s7, v1 5002; GCN-NEXT: v_mul_lo_u32 v1, s7, v1 5003; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 5004; GCN-NEXT: v_addc_u32_e32 v3, vcc, v8, v4, vcc 5005; GCN-NEXT: v_mul_lo_u32 v4, s7, v0 5006; GCN-NEXT: v_mul_hi_u32 v0, s7, v0 5007; GCN-NEXT: v_add_i32_e32 v2, vcc, v4, v2 5008; GCN-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc 5009; GCN-NEXT: v_addc_u32_e32 v2, vcc, v5, v7, vcc 5010; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 5011; GCN-NEXT: v_addc_u32_e32 v1, vcc, v8, v2, vcc 5012; GCN-NEXT: v_mul_lo_u32 v2, v0, s3 5013; GCN-NEXT: v_mul_hi_u32 v3, v0, s2 5014; GCN-NEXT: v_mul_lo_u32 v4, v1, s2 5015; GCN-NEXT: v_mov_b32_e32 v5, s3 5016; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 5017; GCN-NEXT: v_mul_lo_u32 v3, v0, s2 5018; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 5019; GCN-NEXT: v_sub_i32_e32 v4, vcc, s7, v2 5020; GCN-NEXT: v_sub_i32_e32 v3, vcc, s6, v3 5021; GCN-NEXT: v_subb_u32_e64 v4, s[0:1], v4, v5, vcc 5022; GCN-NEXT: v_subrev_i32_e64 v5, s[0:1], s2, v3 5023; GCN-NEXT: v_subbrev_u32_e64 v4, s[0:1], 0, v4, s[0:1] 5024; GCN-NEXT: v_cmp_lt_u32_e64 s[0:1], s4, v4 5025; GCN-NEXT: s_mov_b32 s2, 0x976a7376 5026; GCN-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1] 5027; GCN-NEXT: v_cmp_lt_u32_e64 s[0:1], s2, v5 5028; GCN-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] 5029; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], s3, v4 5030; GCN-NEXT: v_cndmask_b32_e64 v4, v6, v5, s[0:1] 5031; 
GCN-NEXT: v_add_i32_e64 v5, s[0:1], 2, v0 5032; GCN-NEXT: v_addc_u32_e64 v6, s[0:1], 0, v1, s[0:1] 5033; GCN-NEXT: v_add_i32_e64 v7, s[0:1], 1, v0 5034; GCN-NEXT: v_addc_u32_e64 v8, s[0:1], 0, v1, s[0:1] 5035; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4 5036; GCN-NEXT: v_cndmask_b32_e64 v4, v8, v6, s[0:1] 5037; GCN-NEXT: v_mov_b32_e32 v6, s7 5038; GCN-NEXT: v_subb_u32_e32 v2, vcc, v6, v2, vcc 5039; GCN-NEXT: v_cmp_lt_u32_e32 vcc, s4, v2 5040; GCN-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc 5041; GCN-NEXT: v_cmp_lt_u32_e32 vcc, s2, v3 5042; GCN-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc 5043; GCN-NEXT: v_cmp_eq_u32_e32 vcc, s3, v2 5044; GCN-NEXT: v_cndmask_b32_e32 v2, v6, v3, vcc 5045; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 5046; GCN-NEXT: v_cndmask_b32_e64 v2, v7, v5, s[0:1] 5047; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc 5048; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 5049; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 5050; GCN-NEXT: s_endpgm 5051 %r = udiv i64 %x, 1235195949943 5052 store i64 %r, i64 addrspace(1)* %out 5053 ret void 5054} 5055; udiv of i64 by the constant 4096. The opt output is expected to leave the udiv untouched; the llc output is expected to reduce it to a single s_lshr_b64 by 12. 5056define amdgpu_kernel void @udiv_i64_pow2k_denom(i64 addrspace(1)* %out, i64 %x) { 5057; CHECK-LABEL: @udiv_i64_pow2k_denom( 5058; CHECK-NEXT: [[R:%.*]] = udiv i64 [[X:%.*]], 4096 5059; CHECK-NEXT: store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4 5060; CHECK-NEXT: ret void 5061; 5062; GCN-LABEL: udiv_i64_pow2k_denom: 5063; GCN: ; %bb.0: 5064; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 5065; GCN-NEXT: s_mov_b32 s7, 0xf000 5066; GCN-NEXT: s_mov_b32 s6, -1 5067; GCN-NEXT: s_waitcnt lgkmcnt(0) 5068; GCN-NEXT: s_mov_b32 s4, s0 5069; GCN-NEXT: s_mov_b32 s5, s1 5070; GCN-NEXT: s_lshr_b64 s[0:1], s[2:3], 12 5071; GCN-NEXT: v_mov_b32_e32 v0, s0 5072; GCN-NEXT: v_mov_b32_e32 v1, s1 5073; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 5074; GCN-NEXT: s_endpgm 5075 %r = udiv i64 %x, 4096 5076 store i64 %r, i64 addrspace(1)* %out 5077 ret void 5078} 5079; udiv of i64 by (4096 shl %y). The opt output is expected to leave the udiv untouched; the llc output is expected to be a single s_lshr_b64 whose shift amount is produced by an s_add_i32 with 12. 5080define amdgpu_kernel void @udiv_i64_pow2_shl_denom(i64 
addrspace(1)* %out, i64 %x, i64 %y) { 5081; CHECK-LABEL: @udiv_i64_pow2_shl_denom( 5082; CHECK-NEXT: [[SHL_Y:%.*]] = shl i64 4096, [[Y:%.*]] 5083; CHECK-NEXT: [[R:%.*]] = udiv i64 [[X:%.*]], [[SHL_Y]] 5084; CHECK-NEXT: store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4 5085; CHECK-NEXT: ret void 5086; 5087; GCN-LABEL: udiv_i64_pow2_shl_denom: 5088; GCN: ; %bb.0: 5089; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 5090; GCN-NEXT: s_load_dword s8, s[0:1], 0xd 5091; GCN-NEXT: s_mov_b32 s3, 0xf000 5092; GCN-NEXT: s_mov_b32 s2, -1 5093; GCN-NEXT: s_waitcnt lgkmcnt(0) 5094; GCN-NEXT: s_mov_b32 s0, s4 5095; GCN-NEXT: s_add_i32 s8, s8, 12 5096; GCN-NEXT: s_mov_b32 s1, s5 5097; GCN-NEXT: s_lshr_b64 s[4:5], s[6:7], s8 5098; GCN-NEXT: v_mov_b32_e32 v0, s4 5099; GCN-NEXT: v_mov_b32_e32 v1, s5 5100; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 5101; GCN-NEXT: s_endpgm 5102 %shl.y = shl i64 4096, %y 5103 %r = udiv i64 %x, %shl.y 5104 store i64 %r, i64 addrspace(1)* %out 5105 ret void 5106} 5107; udiv of <2 x i64> by the constant vector <i64 4096, i64 4096>. The opt output is expected to split the operation into one scalar udiv by 4096 per lane; the llc output is expected to use one s_lshr_b64 by 12 per lane. 5108define amdgpu_kernel void @udiv_v2i64_pow2k_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x) { 5109; CHECK-LABEL: @udiv_v2i64_pow2k_denom( 5110; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0 5111; CHECK-NEXT: [[TMP2:%.*]] = udiv i64 [[TMP1]], 4096 5112; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i64 0 5113; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1 5114; CHECK-NEXT: [[TMP5:%.*]] = udiv i64 [[TMP4]], 4096 5115; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1 5116; CHECK-NEXT: store <2 x i64> [[TMP6]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16 5117; CHECK-NEXT: ret void 5118; 5119; GCN-LABEL: udiv_v2i64_pow2k_denom: 5120; GCN: ; %bb.0: 5121; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 5122; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd 5123; GCN-NEXT: s_mov_b32 s7, 0xf000 5124; GCN-NEXT: s_mov_b32 s6, -1 5125; GCN-NEXT: s_waitcnt lgkmcnt(0) 5126; GCN-NEXT: 
s_lshr_b64 s[0:1], s[0:1], 12 5127; GCN-NEXT: s_lshr_b64 s[2:3], s[2:3], 12 5128; GCN-NEXT: v_mov_b32_e32 v0, s0 5129; GCN-NEXT: v_mov_b32_e32 v1, s1 5130; GCN-NEXT: v_mov_b32_e32 v2, s2 5131; GCN-NEXT: v_mov_b32_e32 v3, s3 5132; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 5133; GCN-NEXT: s_endpgm 5134 %r = udiv <2 x i64> %x, <i64 4096, i64 4096> 5135 store <2 x i64> %r, <2 x i64> addrspace(1)* %out 5136 ret void 5137} 5138 5139define amdgpu_kernel void @udiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x) { 5140; CHECK-LABEL: @udiv_v2i64_mixed_pow2k_denom( 5141; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0 5142; CHECK-NEXT: [[TMP2:%.*]] = udiv i64 [[TMP1]], 4096 5143; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i64 0 5144; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1 5145; CHECK-NEXT: [[TMP5:%.*]] = udiv i64 [[TMP4]], 4095 5146; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1 5147; CHECK-NEXT: store <2 x i64> [[TMP6]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16 5148; CHECK-NEXT: ret void 5149; 5150; GCN-LABEL: udiv_v2i64_mixed_pow2k_denom: 5151; GCN: ; %bb.0: 5152; GCN-NEXT: v_mov_b32_e32 v0, 0x4f800000 5153; GCN-NEXT: v_madak_f32 v0, 0, v0, 0x457ff000 5154; GCN-NEXT: v_rcp_f32_e32 v0, v0 5155; GCN-NEXT: s_movk_i32 s6, 0xf001 5156; GCN-NEXT: v_mov_b32_e32 v7, 0 5157; GCN-NEXT: v_mov_b32_e32 v2, 0 5158; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 5159; GCN-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 5160; GCN-NEXT: v_trunc_f32_e32 v1, v1 5161; GCN-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 5162; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 5163; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 5164; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 5165; GCN-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd 5166; GCN-NEXT: s_movk_i32 s0, 0xfff 5167; GCN-NEXT: v_mul_hi_u32 v3, v0, s6 5168; GCN-NEXT: v_mul_lo_u32 v5, v1, s6 5169; GCN-NEXT: v_mul_lo_u32 v4, v0, s6 5170; 
GCN-NEXT: s_mov_b32 s7, 0xf000 5171; GCN-NEXT: v_subrev_i32_e32 v3, vcc, v0, v3 5172; GCN-NEXT: v_add_i32_e32 v3, vcc, v5, v3 5173; GCN-NEXT: v_mul_hi_u32 v6, v0, v4 5174; GCN-NEXT: v_mul_lo_u32 v5, v0, v3 5175; GCN-NEXT: v_mul_hi_u32 v8, v0, v3 5176; GCN-NEXT: v_mul_hi_u32 v9, v1, v3 5177; GCN-NEXT: v_mul_lo_u32 v3, v1, v3 5178; GCN-NEXT: v_add_i32_e32 v5, vcc, v6, v5 5179; GCN-NEXT: v_addc_u32_e32 v6, vcc, v7, v8, vcc 5180; GCN-NEXT: v_mul_lo_u32 v8, v1, v4 5181; GCN-NEXT: v_mul_hi_u32 v4, v1, v4 5182; GCN-NEXT: v_add_i32_e32 v5, vcc, v8, v5 5183; GCN-NEXT: v_addc_u32_e32 v4, vcc, v6, v4, vcc 5184; GCN-NEXT: v_addc_u32_e32 v5, vcc, v9, v2, vcc 5185; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 5186; GCN-NEXT: v_add_i32_e64 v0, s[2:3], v0, v3 5187; GCN-NEXT: v_addc_u32_e32 v4, vcc, v7, v5, vcc 5188; GCN-NEXT: v_mul_hi_u32 v5, v0, s6 5189; GCN-NEXT: v_addc_u32_e64 v3, vcc, v1, v4, s[2:3] 5190; GCN-NEXT: v_mul_lo_u32 v6, v3, s6 5191; GCN-NEXT: v_mul_lo_u32 v8, v0, s6 5192; GCN-NEXT: v_subrev_i32_e32 v5, vcc, v0, v5 5193; GCN-NEXT: s_mov_b32 s6, -1 5194; GCN-NEXT: v_add_i32_e32 v5, vcc, v5, v6 5195; GCN-NEXT: v_mul_lo_u32 v6, v0, v5 5196; GCN-NEXT: v_mul_hi_u32 v9, v0, v8 5197; GCN-NEXT: v_mul_hi_u32 v10, v0, v5 5198; GCN-NEXT: v_mul_hi_u32 v11, v3, v5 5199; GCN-NEXT: v_add_i32_e32 v6, vcc, v9, v6 5200; GCN-NEXT: v_addc_u32_e32 v9, vcc, v7, v10, vcc 5201; GCN-NEXT: v_mul_lo_u32 v10, v3, v8 5202; GCN-NEXT: v_mul_hi_u32 v8, v3, v8 5203; GCN-NEXT: v_mul_lo_u32 v3, v3, v5 5204; GCN-NEXT: v_add_i32_e32 v6, vcc, v6, v10 5205; GCN-NEXT: v_addc_u32_e32 v6, vcc, v9, v8, vcc 5206; GCN-NEXT: v_addc_u32_e32 v5, vcc, v11, v2, vcc 5207; GCN-NEXT: v_add_i32_e32 v3, vcc, v6, v3 5208; GCN-NEXT: v_addc_u32_e32 v5, vcc, v7, v5, vcc 5209; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v4 5210; GCN-NEXT: v_addc_u32_e64 v1, vcc, v1, v5, s[2:3] 5211; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v3 5212; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 5213; GCN-NEXT: s_waitcnt lgkmcnt(0) 5214; GCN-NEXT: v_mul_lo_u32 
v3, s10, v1 5215; GCN-NEXT: v_mul_hi_u32 v4, s10, v0 5216; GCN-NEXT: v_mul_hi_u32 v5, s10, v1 5217; GCN-NEXT: v_mul_hi_u32 v6, s11, v1 5218; GCN-NEXT: v_mul_lo_u32 v1, s11, v1 5219; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 5220; GCN-NEXT: v_addc_u32_e32 v4, vcc, v7, v5, vcc 5221; GCN-NEXT: v_mul_lo_u32 v5, s11, v0 5222; GCN-NEXT: v_mul_hi_u32 v0, s11, v0 5223; GCN-NEXT: s_lshr_b64 s[2:3], s[8:9], 12 5224; GCN-NEXT: v_add_i32_e32 v3, vcc, v5, v3 5225; GCN-NEXT: v_addc_u32_e32 v0, vcc, v4, v0, vcc 5226; GCN-NEXT: v_addc_u32_e32 v2, vcc, v6, v2, vcc 5227; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 5228; GCN-NEXT: v_addc_u32_e32 v1, vcc, v7, v2, vcc 5229; GCN-NEXT: v_mul_lo_u32 v2, v1, s0 5230; GCN-NEXT: v_mul_hi_u32 v3, v0, s0 5231; GCN-NEXT: v_mul_lo_u32 v4, v0, s0 5232; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 5233; GCN-NEXT: v_mov_b32_e32 v3, s11 5234; GCN-NEXT: v_sub_i32_e32 v4, vcc, s10, v4 5235; GCN-NEXT: v_subb_u32_e32 v2, vcc, v3, v2, vcc 5236; GCN-NEXT: v_subrev_i32_e32 v3, vcc, s0, v4 5237; GCN-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v2, vcc 5238; GCN-NEXT: s_movk_i32 s0, 0xffe 5239; GCN-NEXT: v_cmp_lt_u32_e32 vcc, s0, v3 5240; GCN-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc 5241; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 5242; GCN-NEXT: v_cndmask_b32_e32 v3, -1, v3, vcc 5243; GCN-NEXT: v_add_i32_e32 v5, vcc, 2, v0 5244; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc 5245; GCN-NEXT: v_add_i32_e32 v7, vcc, 1, v0 5246; GCN-NEXT: v_cmp_lt_u32_e64 s[0:1], s0, v4 5247; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v1, vcc 5248; GCN-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1] 5249; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v2 5250; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 5251; GCN-NEXT: v_cndmask_b32_e64 v2, -1, v4, s[0:1] 5252; GCN-NEXT: v_cndmask_b32_e32 v3, v8, v6, vcc 5253; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v2 5254; GCN-NEXT: v_cndmask_b32_e64 v3, v1, v3, s[0:1] 5255; GCN-NEXT: v_cndmask_b32_e32 v1, v7, v5, vcc 5256; GCN-NEXT: v_cndmask_b32_e64 v2, v0, v1, s[0:1] 5257; GCN-NEXT: v_mov_b32_e32 
v0, s2 5258; GCN-NEXT: v_mov_b32_e32 v1, s3 5259; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 5260; GCN-NEXT: s_endpgm 5261 %r = udiv <2 x i64> %x, <i64 4096, i64 4095> 5262 store <2 x i64> %r, <2 x i64> addrspace(1)* %out 5263 ret void 5264} 5265 5266define amdgpu_kernel void @udiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x, <2 x i64> %y) { 5267; CHECK-LABEL: @udiv_v2i64_pow2_shl_denom( 5268; CHECK-NEXT: [[SHL_Y:%.*]] = shl <2 x i64> <i64 4096, i64 4096>, [[Y:%.*]] 5269; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0 5270; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 0 5271; CHECK-NEXT: [[TMP3:%.*]] = udiv i64 [[TMP1]], [[TMP2]] 5272; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> undef, i64 [[TMP3]], i64 0 5273; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[X]], i64 1 5274; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 1 5275; CHECK-NEXT: [[TMP7:%.*]] = udiv i64 [[TMP5]], [[TMP6]] 5276; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i64> [[TMP4]], i64 [[TMP7]], i64 1 5277; CHECK-NEXT: store <2 x i64> [[TMP8]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16 5278; CHECK-NEXT: ret void 5279; 5280; GCN-LABEL: udiv_v2i64_pow2_shl_denom: 5281; GCN: ; %bb.0: 5282; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 5283; GCN-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd 5284; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x11 5285; GCN-NEXT: s_mov_b32 s7, 0xf000 5286; GCN-NEXT: s_mov_b32 s6, -1 5287; GCN-NEXT: s_waitcnt lgkmcnt(0) 5288; GCN-NEXT: s_add_i32 s0, s0, 12 5289; GCN-NEXT: s_add_i32 s2, s2, 12 5290; GCN-NEXT: s_lshr_b64 s[0:1], s[8:9], s0 5291; GCN-NEXT: s_lshr_b64 s[2:3], s[10:11], s2 5292; GCN-NEXT: v_mov_b32_e32 v0, s0 5293; GCN-NEXT: v_mov_b32_e32 v1, s1 5294; GCN-NEXT: v_mov_b32_e32 v2, s2 5295; GCN-NEXT: v_mov_b32_e32 v3, s3 5296; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 5297; GCN-NEXT: s_endpgm 5298 %shl.y = shl <2 x i64> <i64 4096, i64 4096>, %y 5299 
%r = udiv <2 x i64> %x, %shl.y 5300 store <2 x i64> %r, <2 x i64> addrspace(1)* %out 5301 ret void 5302} 5303 5304define amdgpu_kernel void @urem_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) { 5305; CHECK-LABEL: @urem_i64_oddk_denom( 5306; CHECK-NEXT: [[R:%.*]] = urem i64 [[X:%.*]], 1235195393993 5307; CHECK-NEXT: store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4 5308; CHECK-NEXT: ret void 5309; 5310; GCN-LABEL: urem_i64_oddk_denom: 5311; GCN: ; %bb.0: 5312; GCN-NEXT: v_mov_b32_e32 v0, 0x4f1761f8 5313; GCN-NEXT: v_mov_b32_e32 v1, 0x4f800000 5314; GCN-NEXT: v_madmk_f32 v0, v1, 0x438f8000, v0 5315; GCN-NEXT: v_rcp_f32_e32 v0, v0 5316; GCN-NEXT: s_movk_i32 s2, 0xfee0 5317; GCN-NEXT: s_mov_b32 s3, 0x689e0837 5318; GCN-NEXT: v_mov_b32_e32 v8, 0 5319; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 5320; GCN-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 5321; GCN-NEXT: v_trunc_f32_e32 v1, v1 5322; GCN-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 5323; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 5324; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 5325; GCN-NEXT: v_mov_b32_e32 v7, 0 5326; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 5327; GCN-NEXT: v_mul_lo_u32 v2, v0, s2 5328; GCN-NEXT: v_mul_hi_u32 v3, v0, s3 5329; GCN-NEXT: v_mul_lo_u32 v4, v1, s3 5330; GCN-NEXT: s_movk_i32 s12, 0x11f 5331; GCN-NEXT: s_mov_b32 s13, 0x9761f7c9 5332; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 5333; GCN-NEXT: v_mul_lo_u32 v3, v0, s3 5334; GCN-NEXT: v_add_i32_e32 v2, vcc, v4, v2 5335; GCN-NEXT: v_mul_lo_u32 v5, v0, v2 5336; GCN-NEXT: v_mul_hi_u32 v4, v0, v2 5337; GCN-NEXT: v_mul_hi_u32 v6, v0, v3 5338; GCN-NEXT: v_mul_hi_u32 v9, v1, v2 5339; GCN-NEXT: v_mul_lo_u32 v2, v1, v2 5340; GCN-NEXT: s_waitcnt lgkmcnt(0) 5341; GCN-NEXT: s_mov_b32 s9, s5 5342; GCN-NEXT: v_add_i32_e32 v5, vcc, v6, v5 5343; GCN-NEXT: v_mul_lo_u32 v6, v1, v3 5344; GCN-NEXT: v_mul_hi_u32 v3, v1, v3 5345; GCN-NEXT: v_addc_u32_e32 v4, vcc, v8, v4, vcc 5346; GCN-NEXT: s_movk_i32 s5, 0x11e 5347; GCN-NEXT: v_add_i32_e32 v5, vcc, v6, v5 5348; GCN-NEXT: 
v_addc_u32_e32 v3, vcc, v4, v3, vcc 5349; GCN-NEXT: v_addc_u32_e32 v4, vcc, v9, v7, vcc 5350; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 5351; GCN-NEXT: v_add_i32_e64 v0, s[0:1], v0, v2 5352; GCN-NEXT: v_addc_u32_e32 v3, vcc, v8, v4, vcc 5353; GCN-NEXT: v_mul_lo_u32 v4, v0, s2 5354; GCN-NEXT: v_mul_hi_u32 v5, v0, s3 5355; GCN-NEXT: v_addc_u32_e64 v2, vcc, v1, v3, s[0:1] 5356; GCN-NEXT: v_mul_lo_u32 v6, v2, s3 5357; GCN-NEXT: s_mov_b32 s8, s4 5358; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4 5359; GCN-NEXT: v_mul_lo_u32 v5, v0, s3 5360; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v6 5361; GCN-NEXT: v_mul_lo_u32 v6, v0, v4 5362; GCN-NEXT: v_mul_hi_u32 v10, v0, v4 5363; GCN-NEXT: v_mul_hi_u32 v9, v0, v5 5364; GCN-NEXT: v_mul_hi_u32 v11, v2, v4 5365; GCN-NEXT: s_mov_b32 s4, 0x9761f7c8 5366; GCN-NEXT: s_mov_b32 s11, 0xf000 5367; GCN-NEXT: v_add_i32_e32 v6, vcc, v9, v6 5368; GCN-NEXT: v_addc_u32_e32 v9, vcc, v8, v10, vcc 5369; GCN-NEXT: v_mul_lo_u32 v10, v2, v5 5370; GCN-NEXT: v_mul_hi_u32 v5, v2, v5 5371; GCN-NEXT: v_mul_lo_u32 v2, v2, v4 5372; GCN-NEXT: s_mov_b32 s10, -1 5373; GCN-NEXT: v_add_i32_e32 v6, vcc, v6, v10 5374; GCN-NEXT: v_addc_u32_e32 v5, vcc, v9, v5, vcc 5375; GCN-NEXT: v_addc_u32_e32 v4, vcc, v11, v7, vcc 5376; GCN-NEXT: v_add_i32_e32 v2, vcc, v5, v2 5377; GCN-NEXT: v_addc_u32_e32 v4, vcc, v8, v4, vcc 5378; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v3 5379; GCN-NEXT: v_addc_u32_e64 v1, vcc, v1, v4, s[0:1] 5380; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 5381; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 5382; GCN-NEXT: v_mul_lo_u32 v2, s6, v1 5383; GCN-NEXT: v_mul_hi_u32 v3, s6, v0 5384; GCN-NEXT: v_mul_hi_u32 v4, s6, v1 5385; GCN-NEXT: v_mul_hi_u32 v5, s7, v1 5386; GCN-NEXT: v_mul_lo_u32 v1, s7, v1 5387; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 5388; GCN-NEXT: v_addc_u32_e32 v3, vcc, v8, v4, vcc 5389; GCN-NEXT: v_mul_lo_u32 v4, s7, v0 5390; GCN-NEXT: v_mul_hi_u32 v0, s7, v0 5391; GCN-NEXT: v_add_i32_e32 v2, vcc, v4, v2 5392; GCN-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc 5393; 
GCN-NEXT: v_addc_u32_e32 v2, vcc, v5, v7, vcc 5394; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 5395; GCN-NEXT: v_addc_u32_e32 v1, vcc, v8, v2, vcc 5396; GCN-NEXT: v_mul_lo_u32 v2, v0, s12 5397; GCN-NEXT: v_mul_hi_u32 v3, v0, s13 5398; GCN-NEXT: v_mul_lo_u32 v1, v1, s13 5399; GCN-NEXT: v_mul_lo_u32 v0, v0, s13 5400; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 5401; GCN-NEXT: v_add_i32_e32 v1, vcc, v2, v1 5402; GCN-NEXT: v_sub_i32_e32 v2, vcc, s7, v1 5403; GCN-NEXT: v_mov_b32_e32 v3, s12 5404; GCN-NEXT: v_sub_i32_e32 v0, vcc, s6, v0 5405; GCN-NEXT: v_subb_u32_e64 v2, s[0:1], v2, v3, vcc 5406; GCN-NEXT: v_subrev_i32_e64 v4, s[0:1], s13, v0 5407; GCN-NEXT: v_subbrev_u32_e64 v5, s[2:3], 0, v2, s[0:1] 5408; GCN-NEXT: v_cmp_lt_u32_e64 s[2:3], s5, v5 5409; GCN-NEXT: v_subb_u32_e64 v2, s[0:1], v2, v3, s[0:1] 5410; GCN-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[2:3] 5411; GCN-NEXT: v_cmp_lt_u32_e64 s[2:3], s4, v4 5412; GCN-NEXT: v_subrev_i32_e64 v3, s[0:1], s13, v4 5413; GCN-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[2:3] 5414; GCN-NEXT: v_cmp_eq_u32_e64 s[2:3], s12, v5 5415; GCN-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[2:3] 5416; GCN-NEXT: v_subbrev_u32_e64 v2, s[0:1], 0, v2, s[0:1] 5417; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v6 5418; GCN-NEXT: v_cndmask_b32_e64 v2, v5, v2, s[0:1] 5419; GCN-NEXT: v_mov_b32_e32 v5, s7 5420; GCN-NEXT: v_subb_u32_e32 v1, vcc, v5, v1, vcc 5421; GCN-NEXT: v_cmp_lt_u32_e32 vcc, s5, v1 5422; GCN-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc 5423; GCN-NEXT: v_cmp_lt_u32_e32 vcc, s4, v0 5424; GCN-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc 5425; GCN-NEXT: v_cmp_eq_u32_e32 vcc, s12, v1 5426; GCN-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc 5427; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 5428; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 5429; GCN-NEXT: v_cndmask_b32_e64 v2, v4, v3, s[0:1] 5430; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 5431; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 5432; GCN-NEXT: s_endpgm 5433 %r = urem i64 %x, 1235195393993 5434 store i64 %r, i64 
addrspace(1)* %out 5435 ret void 5436} 5437 5438define amdgpu_kernel void @urem_i64_pow2k_denom(i64 addrspace(1)* %out, i64 %x) { 5439; CHECK-LABEL: @urem_i64_pow2k_denom( 5440; CHECK-NEXT: [[R:%.*]] = urem i64 [[X:%.*]], 4096 5441; CHECK-NEXT: store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4 5442; CHECK-NEXT: ret void 5443; 5444; GCN-LABEL: urem_i64_pow2k_denom: 5445; GCN: ; %bb.0: 5446; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 5447; GCN-NEXT: s_mov_b32 s3, 0xf000 5448; GCN-NEXT: s_mov_b32 s2, -1 5449; GCN-NEXT: v_mov_b32_e32 v1, 0 5450; GCN-NEXT: s_waitcnt lgkmcnt(0) 5451; GCN-NEXT: s_mov_b32 s0, s4 5452; GCN-NEXT: s_and_b32 s4, s6, 0xfff 5453; GCN-NEXT: s_mov_b32 s1, s5 5454; GCN-NEXT: v_mov_b32_e32 v0, s4 5455; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 5456; GCN-NEXT: s_endpgm 5457 %r = urem i64 %x, 4096 5458 store i64 %r, i64 addrspace(1)* %out 5459 ret void 5460} 5461 5462define amdgpu_kernel void @urem_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %x, i64 %y) { 5463; CHECK-LABEL: @urem_i64_pow2_shl_denom( 5464; CHECK-NEXT: [[SHL_Y:%.*]] = shl i64 4096, [[Y:%.*]] 5465; CHECK-NEXT: [[R:%.*]] = urem i64 [[X:%.*]], [[SHL_Y]] 5466; CHECK-NEXT: store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4 5467; CHECK-NEXT: ret void 5468; 5469; GCN-LABEL: urem_i64_pow2_shl_denom: 5470; GCN: ; %bb.0: 5471; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 5472; GCN-NEXT: s_load_dword s8, s[0:1], 0xd 5473; GCN-NEXT: s_mov_b32 s3, 0xf000 5474; GCN-NEXT: s_mov_b32 s2, -1 5475; GCN-NEXT: s_waitcnt lgkmcnt(0) 5476; GCN-NEXT: s_mov_b32 s0, s4 5477; GCN-NEXT: s_mov_b32 s1, s5 5478; GCN-NEXT: s_mov_b32 s5, 0 5479; GCN-NEXT: s_movk_i32 s4, 0x1000 5480; GCN-NEXT: s_lshl_b64 s[4:5], s[4:5], s8 5481; GCN-NEXT: s_add_u32 s4, s4, -1 5482; GCN-NEXT: s_addc_u32 s5, s5, -1 5483; GCN-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5] 5484; GCN-NEXT: v_mov_b32_e32 v0, s4 5485; GCN-NEXT: v_mov_b32_e32 v1, s5 5486; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 5487; GCN-NEXT: 
s_endpgm 5488 %shl.y = shl i64 4096, %y 5489 %r = urem i64 %x, %shl.y 5490 store i64 %r, i64 addrspace(1)* %out 5491 ret void 5492} 5493 5494define amdgpu_kernel void @urem_v2i64_pow2k_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x) { 5495; CHECK-LABEL: @urem_v2i64_pow2k_denom( 5496; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0 5497; CHECK-NEXT: [[TMP2:%.*]] = urem i64 [[TMP1]], 4096 5498; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i64 0 5499; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1 5500; CHECK-NEXT: [[TMP5:%.*]] = urem i64 [[TMP4]], 4096 5501; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1 5502; CHECK-NEXT: store <2 x i64> [[TMP6]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16 5503; CHECK-NEXT: ret void 5504; 5505; GCN-LABEL: urem_v2i64_pow2k_denom: 5506; GCN: ; %bb.0: 5507; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 5508; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd 5509; GCN-NEXT: s_movk_i32 s8, 0xfff 5510; GCN-NEXT: v_mov_b32_e32 v1, 0 5511; GCN-NEXT: s_mov_b32 s7, 0xf000 5512; GCN-NEXT: s_mov_b32 s6, -1 5513; GCN-NEXT: s_waitcnt lgkmcnt(0) 5514; GCN-NEXT: s_and_b32 s0, s0, s8 5515; GCN-NEXT: s_and_b32 s1, s2, s8 5516; GCN-NEXT: v_mov_b32_e32 v0, s0 5517; GCN-NEXT: v_mov_b32_e32 v2, s1 5518; GCN-NEXT: v_mov_b32_e32 v3, v1 5519; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 5520; GCN-NEXT: s_endpgm 5521 %r = urem <2 x i64> %x, <i64 4096, i64 4096> 5522 store <2 x i64> %r, <2 x i64> addrspace(1)* %out 5523 ret void 5524} 5525 5526define amdgpu_kernel void @urem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x, <2 x i64> %y) { 5527; CHECK-LABEL: @urem_v2i64_pow2_shl_denom( 5528; CHECK-NEXT: [[SHL_Y:%.*]] = shl <2 x i64> <i64 4096, i64 4096>, [[Y:%.*]] 5529; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0 5530; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 0 5531; CHECK-NEXT: 
[[TMP3:%.*]] = urem i64 [[TMP1]], [[TMP2]] 5532; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> undef, i64 [[TMP3]], i64 0 5533; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[X]], i64 1 5534; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 1 5535; CHECK-NEXT: [[TMP7:%.*]] = urem i64 [[TMP5]], [[TMP6]] 5536; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i64> [[TMP4]], i64 [[TMP7]], i64 1 5537; CHECK-NEXT: store <2 x i64> [[TMP8]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16 5538; CHECK-NEXT: ret void 5539; 5540; GCN-LABEL: urem_v2i64_pow2_shl_denom: 5541; GCN: ; %bb.0: 5542; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 5543; GCN-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd 5544; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x11 5545; GCN-NEXT: s_mov_b32 s13, 0 5546; GCN-NEXT: s_movk_i32 s12, 0x1000 5547; GCN-NEXT: s_mov_b32 s7, 0xf000 5548; GCN-NEXT: s_mov_b32 s6, -1 5549; GCN-NEXT: s_waitcnt lgkmcnt(0) 5550; GCN-NEXT: s_lshl_b64 s[2:3], s[12:13], s2 5551; GCN-NEXT: s_lshl_b64 s[0:1], s[12:13], s0 5552; GCN-NEXT: s_add_u32 s0, s0, -1 5553; GCN-NEXT: s_addc_u32 s1, s1, -1 5554; GCN-NEXT: s_and_b64 s[0:1], s[8:9], s[0:1] 5555; GCN-NEXT: s_add_u32 s2, s2, -1 5556; GCN-NEXT: s_addc_u32 s3, s3, -1 5557; GCN-NEXT: s_and_b64 s[2:3], s[10:11], s[2:3] 5558; GCN-NEXT: v_mov_b32_e32 v0, s0 5559; GCN-NEXT: v_mov_b32_e32 v1, s1 5560; GCN-NEXT: v_mov_b32_e32 v2, s2 5561; GCN-NEXT: v_mov_b32_e32 v3, s3 5562; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 5563; GCN-NEXT: s_endpgm 5564 %shl.y = shl <2 x i64> <i64 4096, i64 4096>, %y 5565 %r = urem <2 x i64> %x, %shl.y 5566 store <2 x i64> %r, <2 x i64> addrspace(1)* %out 5567 ret void 5568} 5569 5570define amdgpu_kernel void @sdiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) { 5571; CHECK-LABEL: @sdiv_i64_oddk_denom( 5572; CHECK-NEXT: [[R:%.*]] = sdiv i64 [[X:%.*]], 1235195 5573; CHECK-NEXT: store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4 5574; CHECK-NEXT: ret void 5575; 5576; GCN-LABEL: 
sdiv_i64_oddk_denom: 5577; GCN: ; %bb.0: 5578; GCN-NEXT: v_mov_b32_e32 v0, 0x4f800000 5579; GCN-NEXT: v_madak_f32 v0, 0, v0, 0x4996c7d8 5580; GCN-NEXT: v_rcp_f32_e32 v0, v0 5581; GCN-NEXT: s_mov_b32 s2, 0xffed2705 5582; GCN-NEXT: v_mov_b32_e32 v8, 0 5583; GCN-NEXT: v_mov_b32_e32 v7, 0 5584; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 5585; GCN-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 5586; GCN-NEXT: v_trunc_f32_e32 v1, v1 5587; GCN-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 5588; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 5589; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 5590; GCN-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 5591; GCN-NEXT: s_mov_b32 s7, 0xf000 5592; GCN-NEXT: v_mul_hi_u32 v3, s2, v0 5593; GCN-NEXT: v_mul_lo_u32 v2, v1, s2 5594; GCN-NEXT: v_mul_lo_u32 v4, v0, s2 5595; GCN-NEXT: s_mov_b32 s6, -1 5596; GCN-NEXT: s_waitcnt lgkmcnt(0) 5597; GCN-NEXT: s_mov_b32 s4, s8 5598; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 5599; GCN-NEXT: v_subrev_i32_e32 v2, vcc, v0, v2 5600; GCN-NEXT: v_mul_lo_u32 v5, v0, v2 5601; GCN-NEXT: v_mul_hi_u32 v6, v0, v4 5602; GCN-NEXT: v_mul_hi_u32 v3, v0, v2 5603; GCN-NEXT: v_mul_hi_u32 v9, v1, v2 5604; GCN-NEXT: v_mul_lo_u32 v2, v1, v2 5605; GCN-NEXT: v_add_i32_e32 v5, vcc, v6, v5 5606; GCN-NEXT: v_mul_lo_u32 v6, v1, v4 5607; GCN-NEXT: v_mul_hi_u32 v4, v1, v4 5608; GCN-NEXT: v_addc_u32_e32 v3, vcc, v8, v3, vcc 5609; GCN-NEXT: s_mov_b32 s5, s9 5610; GCN-NEXT: v_add_i32_e32 v5, vcc, v6, v5 5611; GCN-NEXT: v_addc_u32_e32 v3, vcc, v3, v4, vcc 5612; GCN-NEXT: v_addc_u32_e32 v4, vcc, v9, v7, vcc 5613; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 5614; GCN-NEXT: v_add_i32_e64 v0, s[0:1], v0, v2 5615; GCN-NEXT: v_addc_u32_e32 v3, vcc, v8, v4, vcc 5616; GCN-NEXT: v_addc_u32_e64 v2, vcc, v1, v3, s[0:1] 5617; GCN-NEXT: v_mul_lo_u32 v4, v2, s2 5618; GCN-NEXT: v_mul_hi_u32 v5, s2, v0 5619; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4 5620; GCN-NEXT: v_mul_lo_u32 v5, v0, s2 5621; GCN-NEXT: v_subrev_i32_e32 v4, vcc, v0, v4 5622; GCN-NEXT: v_mul_lo_u32 v10, v0, v4 5623; GCN-NEXT: 
v_mul_hi_u32 v12, v0, v4 5624; GCN-NEXT: v_mul_hi_u32 v11, v0, v5 5625; GCN-NEXT: v_mul_hi_u32 v9, v2, v5 5626; GCN-NEXT: v_mul_lo_u32 v5, v2, v5 5627; GCN-NEXT: v_mul_hi_u32 v6, v2, v4 5628; GCN-NEXT: v_add_i32_e32 v10, vcc, v11, v10 5629; GCN-NEXT: v_addc_u32_e32 v11, vcc, v8, v12, vcc 5630; GCN-NEXT: v_mul_lo_u32 v2, v2, v4 5631; GCN-NEXT: v_add_i32_e32 v5, vcc, v10, v5 5632; GCN-NEXT: v_addc_u32_e32 v5, vcc, v11, v9, vcc 5633; GCN-NEXT: v_addc_u32_e32 v4, vcc, v6, v7, vcc 5634; GCN-NEXT: v_add_i32_e32 v2, vcc, v5, v2 5635; GCN-NEXT: v_addc_u32_e32 v4, vcc, v8, v4, vcc 5636; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v3 5637; GCN-NEXT: s_ashr_i32 s2, s11, 31 5638; GCN-NEXT: v_addc_u32_e64 v1, vcc, v1, v4, s[0:1] 5639; GCN-NEXT: s_add_u32 s0, s10, s2 5640; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 5641; GCN-NEXT: s_mov_b32 s3, s2 5642; GCN-NEXT: s_addc_u32 s1, s11, s2 5643; GCN-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] 5644; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 5645; GCN-NEXT: v_mul_lo_u32 v2, s0, v1 5646; GCN-NEXT: v_mul_hi_u32 v3, s0, v0 5647; GCN-NEXT: v_mul_hi_u32 v4, s0, v1 5648; GCN-NEXT: v_mul_hi_u32 v5, s1, v1 5649; GCN-NEXT: v_mul_lo_u32 v1, s1, v1 5650; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 5651; GCN-NEXT: v_addc_u32_e32 v3, vcc, v8, v4, vcc 5652; GCN-NEXT: v_mul_lo_u32 v4, s1, v0 5653; GCN-NEXT: v_mul_hi_u32 v0, s1, v0 5654; GCN-NEXT: s_mov_b32 s3, 0x12d8fb 5655; GCN-NEXT: v_add_i32_e32 v2, vcc, v4, v2 5656; GCN-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc 5657; GCN-NEXT: v_addc_u32_e32 v2, vcc, v5, v7, vcc 5658; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 5659; GCN-NEXT: v_addc_u32_e32 v1, vcc, v8, v2, vcc 5660; GCN-NEXT: v_mul_lo_u32 v2, v1, s3 5661; GCN-NEXT: v_mul_hi_u32 v3, s3, v0 5662; GCN-NEXT: v_mul_lo_u32 v4, v0, s3 5663; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 5664; GCN-NEXT: v_sub_i32_e32 v4, vcc, s0, v4 5665; GCN-NEXT: v_mov_b32_e32 v3, s1 5666; GCN-NEXT: v_subb_u32_e32 v2, vcc, v3, v2, vcc 5667; GCN-NEXT: v_subrev_i32_e32 v3, vcc, s3, v4 5668; 
GCN-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v2, vcc 5669; GCN-NEXT: s_mov_b32 s0, 0x12d8fa 5670; GCN-NEXT: v_cmp_lt_u32_e32 vcc, s0, v3 5671; GCN-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc 5672; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 5673; GCN-NEXT: v_cndmask_b32_e32 v3, -1, v3, vcc 5674; GCN-NEXT: v_add_i32_e32 v5, vcc, 2, v0 5675; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc 5676; GCN-NEXT: v_add_i32_e32 v7, vcc, 1, v0 5677; GCN-NEXT: v_cmp_lt_u32_e64 s[0:1], s0, v4 5678; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v1, vcc 5679; GCN-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1] 5680; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v2 5681; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 5682; GCN-NEXT: v_cndmask_b32_e64 v2, -1, v4, s[0:1] 5683; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v2 5684; GCN-NEXT: v_cndmask_b32_e32 v2, v7, v5, vcc 5685; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] 5686; GCN-NEXT: v_cndmask_b32_e32 v3, v8, v6, vcc 5687; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] 5688; GCN-NEXT: v_xor_b32_e32 v0, s2, v0 5689; GCN-NEXT: v_xor_b32_e32 v1, s2, v1 5690; GCN-NEXT: v_mov_b32_e32 v2, s2 5691; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s2, v0 5692; GCN-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc 5693; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 5694; GCN-NEXT: s_endpgm 5695 %r = sdiv i64 %x, 1235195 5696 store i64 %r, i64 addrspace(1)* %out 5697 ret void 5698} 5699 5700define amdgpu_kernel void @sdiv_i64_pow2k_denom(i64 addrspace(1)* %out, i64 %x) { 5701; CHECK-LABEL: @sdiv_i64_pow2k_denom( 5702; CHECK-NEXT: [[R:%.*]] = sdiv i64 [[X:%.*]], 4096 5703; CHECK-NEXT: store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4 5704; CHECK-NEXT: ret void 5705; 5706; GCN-LABEL: sdiv_i64_pow2k_denom: 5707; GCN: ; %bb.0: 5708; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 5709; GCN-NEXT: s_mov_b32 s7, 0xf000 5710; GCN-NEXT: s_mov_b32 s6, -1 5711; GCN-NEXT: s_waitcnt lgkmcnt(0) 5712; GCN-NEXT: s_mov_b32 s4, s0 5713; GCN-NEXT: s_ashr_i32 s0, s3, 31 5714; GCN-NEXT: s_lshr_b32 s0, s0, 20 5715; GCN-NEXT: 
s_add_u32 s0, s2, s0 5716; GCN-NEXT: s_mov_b32 s5, s1 5717; GCN-NEXT: s_addc_u32 s1, s3, 0 5718; GCN-NEXT: s_ashr_i64 s[0:1], s[0:1], 12 5719; GCN-NEXT: v_mov_b32_e32 v0, s0 5720; GCN-NEXT: v_mov_b32_e32 v1, s1 5721; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 5722; GCN-NEXT: s_endpgm 5723 %r = sdiv i64 %x, 4096 5724 store i64 %r, i64 addrspace(1)* %out 5725 ret void 5726} 5727 5728define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %x, i64 %y) { 5729; CHECK-LABEL: @sdiv_i64_pow2_shl_denom( 5730; CHECK-NEXT: [[SHL_Y:%.*]] = shl i64 4096, [[Y:%.*]] 5731; CHECK-NEXT: [[R:%.*]] = sdiv i64 [[X:%.*]], [[SHL_Y]] 5732; CHECK-NEXT: store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4 5733; CHECK-NEXT: ret void 5734; 5735; GCN-LABEL: sdiv_i64_pow2_shl_denom: 5736; GCN: ; %bb.0: 5737; GCN-NEXT: s_load_dword s4, s[0:1], 0xd 5738; GCN-NEXT: s_mov_b32 s3, 0 5739; GCN-NEXT: s_movk_i32 s2, 0x1000 5740; GCN-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 5741; GCN-NEXT: s_mov_b32 s7, 0xf000 5742; GCN-NEXT: s_waitcnt lgkmcnt(0) 5743; GCN-NEXT: s_lshl_b64 s[2:3], s[2:3], s4 5744; GCN-NEXT: s_ashr_i32 s12, s3, 31 5745; GCN-NEXT: s_add_u32 s2, s2, s12 5746; GCN-NEXT: s_mov_b32 s13, s12 5747; GCN-NEXT: s_addc_u32 s3, s3, s12 5748; GCN-NEXT: s_xor_b64 s[2:3], s[2:3], s[12:13] 5749; GCN-NEXT: v_cvt_f32_u32_e32 v0, s2 5750; GCN-NEXT: v_cvt_f32_u32_e32 v1, s3 5751; GCN-NEXT: s_sub_u32 s4, 0, s2 5752; GCN-NEXT: s_subb_u32 s5, 0, s3 5753; GCN-NEXT: s_ashr_i32 s14, s11, 31 5754; GCN-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 5755; GCN-NEXT: v_rcp_f32_e32 v0, v0 5756; GCN-NEXT: s_mov_b32 s15, s14 5757; GCN-NEXT: s_mov_b32 s6, -1 5758; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 5759; GCN-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 5760; GCN-NEXT: v_trunc_f32_e32 v1, v1 5761; GCN-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 5762; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 5763; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 5764; GCN-NEXT: v_mul_hi_u32 v3, s4, v0 5765; GCN-NEXT: v_mul_lo_u32 v2, 
s4, v1 5766; GCN-NEXT: v_mul_lo_u32 v5, s5, v0 5767; GCN-NEXT: v_mul_lo_u32 v4, s4, v0 5768; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 5769; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v5 5770; GCN-NEXT: v_mul_hi_u32 v3, v0, v4 5771; GCN-NEXT: v_mul_lo_u32 v5, v0, v2 5772; GCN-NEXT: v_mul_hi_u32 v6, v0, v2 5773; GCN-NEXT: v_mul_hi_u32 v7, v1, v2 5774; GCN-NEXT: v_mul_lo_u32 v2, v1, v2 5775; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5 5776; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc 5777; GCN-NEXT: v_mul_lo_u32 v6, v1, v4 5778; GCN-NEXT: v_mul_hi_u32 v4, v1, v4 5779; GCN-NEXT: v_add_i32_e32 v3, vcc, v6, v3 5780; GCN-NEXT: v_addc_u32_e32 v3, vcc, v5, v4, vcc 5781; GCN-NEXT: v_mov_b32_e32 v4, 0 5782; GCN-NEXT: v_addc_u32_e32 v5, vcc, v7, v4, vcc 5783; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 5784; GCN-NEXT: v_mov_b32_e32 v6, 0 5785; GCN-NEXT: v_add_i32_e64 v0, s[0:1], v0, v2 5786; GCN-NEXT: v_addc_u32_e32 v3, vcc, v6, v5, vcc 5787; GCN-NEXT: v_addc_u32_e64 v2, vcc, v1, v3, s[0:1] 5788; GCN-NEXT: v_mul_lo_u32 v5, s4, v2 5789; GCN-NEXT: v_mul_hi_u32 v7, s4, v0 5790; GCN-NEXT: v_mul_lo_u32 v8, s5, v0 5791; GCN-NEXT: s_mov_b32 s5, s9 5792; GCN-NEXT: v_add_i32_e32 v5, vcc, v7, v5 5793; GCN-NEXT: v_mul_lo_u32 v7, s4, v0 5794; GCN-NEXT: v_add_i32_e32 v5, vcc, v8, v5 5795; GCN-NEXT: v_mul_lo_u32 v10, v0, v5 5796; GCN-NEXT: v_mul_hi_u32 v12, v0, v5 5797; GCN-NEXT: v_mul_hi_u32 v11, v0, v7 5798; GCN-NEXT: v_mul_hi_u32 v9, v2, v7 5799; GCN-NEXT: v_mul_lo_u32 v7, v2, v7 5800; GCN-NEXT: v_mul_hi_u32 v8, v2, v5 5801; GCN-NEXT: v_add_i32_e32 v10, vcc, v11, v10 5802; GCN-NEXT: v_addc_u32_e32 v11, vcc, 0, v12, vcc 5803; GCN-NEXT: v_mul_lo_u32 v2, v2, v5 5804; GCN-NEXT: v_add_i32_e32 v7, vcc, v10, v7 5805; GCN-NEXT: v_addc_u32_e32 v7, vcc, v11, v9, vcc 5806; GCN-NEXT: v_addc_u32_e32 v5, vcc, v8, v4, vcc 5807; GCN-NEXT: v_add_i32_e32 v2, vcc, v7, v2 5808; GCN-NEXT: v_addc_u32_e32 v5, vcc, v6, v5, vcc 5809; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v3 5810; GCN-NEXT: v_addc_u32_e64 v1, vcc, v1, v5, 
s[0:1] 5811; GCN-NEXT: s_add_u32 s0, s10, s14 5812; GCN-NEXT: s_addc_u32 s1, s11, s14 5813; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 5814; GCN-NEXT: s_xor_b64 s[10:11], s[0:1], s[14:15] 5815; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 5816; GCN-NEXT: v_mul_lo_u32 v2, s10, v1 5817; GCN-NEXT: v_mul_hi_u32 v3, s10, v0 5818; GCN-NEXT: v_mul_hi_u32 v5, s10, v1 5819; GCN-NEXT: v_mul_hi_u32 v7, s11, v1 5820; GCN-NEXT: v_mul_lo_u32 v1, s11, v1 5821; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 5822; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc 5823; GCN-NEXT: v_mul_lo_u32 v5, s11, v0 5824; GCN-NEXT: v_mul_hi_u32 v0, s11, v0 5825; GCN-NEXT: s_mov_b32 s4, s8 5826; GCN-NEXT: v_add_i32_e32 v2, vcc, v5, v2 5827; GCN-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc 5828; GCN-NEXT: v_addc_u32_e32 v2, vcc, v7, v4, vcc 5829; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 5830; GCN-NEXT: v_addc_u32_e32 v1, vcc, v6, v2, vcc 5831; GCN-NEXT: v_mul_lo_u32 v2, s2, v1 5832; GCN-NEXT: v_mul_hi_u32 v3, s2, v0 5833; GCN-NEXT: v_mul_lo_u32 v4, s3, v0 5834; GCN-NEXT: v_mov_b32_e32 v5, s3 5835; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 5836; GCN-NEXT: v_mul_lo_u32 v3, s2, v0 5837; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 5838; GCN-NEXT: v_sub_i32_e32 v4, vcc, s11, v2 5839; GCN-NEXT: v_sub_i32_e32 v3, vcc, s10, v3 5840; GCN-NEXT: v_subb_u32_e64 v4, s[0:1], v4, v5, vcc 5841; GCN-NEXT: v_subrev_i32_e64 v5, s[0:1], s2, v3 5842; GCN-NEXT: v_subbrev_u32_e64 v4, s[0:1], 0, v4, s[0:1] 5843; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v4 5844; GCN-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1] 5845; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v5 5846; GCN-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] 5847; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], s3, v4 5848; GCN-NEXT: v_cndmask_b32_e64 v4, v6, v5, s[0:1] 5849; GCN-NEXT: v_add_i32_e64 v5, s[0:1], 2, v0 5850; GCN-NEXT: v_addc_u32_e64 v6, s[0:1], 0, v1, s[0:1] 5851; GCN-NEXT: v_add_i32_e64 v7, s[0:1], 1, v0 5852; GCN-NEXT: v_addc_u32_e64 v8, s[0:1], 0, v1, s[0:1] 5853; GCN-NEXT: v_cmp_ne_u32_e64 
s[0:1], 0, v4 5854; GCN-NEXT: v_cndmask_b32_e64 v4, v8, v6, s[0:1] 5855; GCN-NEXT: v_mov_b32_e32 v6, s11 5856; GCN-NEXT: v_subb_u32_e32 v2, vcc, v6, v2, vcc 5857; GCN-NEXT: v_cmp_le_u32_e32 vcc, s3, v2 5858; GCN-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc 5859; GCN-NEXT: v_cmp_le_u32_e32 vcc, s2, v3 5860; GCN-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc 5861; GCN-NEXT: v_cmp_eq_u32_e32 vcc, s3, v2 5862; GCN-NEXT: v_cndmask_b32_e32 v2, v6, v3, vcc 5863; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 5864; GCN-NEXT: v_cndmask_b32_e64 v2, v7, v5, s[0:1] 5865; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 5866; GCN-NEXT: s_xor_b64 s[0:1], s[14:15], s[12:13] 5867; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc 5868; GCN-NEXT: v_xor_b32_e32 v0, s0, v0 5869; GCN-NEXT: v_xor_b32_e32 v1, s1, v1 5870; GCN-NEXT: v_mov_b32_e32 v2, s1 5871; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s0, v0 5872; GCN-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc 5873; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 5874; GCN-NEXT: s_endpgm 5875 %shl.y = shl i64 4096, %y 5876 %r = sdiv i64 %x, %shl.y 5877 store i64 %r, i64 addrspace(1)* %out 5878 ret void 5879} 5880 5881define amdgpu_kernel void @sdiv_v2i64_pow2k_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x) { 5882; CHECK-LABEL: @sdiv_v2i64_pow2k_denom( 5883; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0 5884; CHECK-NEXT: [[TMP2:%.*]] = sdiv i64 [[TMP1]], 4096 5885; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i64 0 5886; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1 5887; CHECK-NEXT: [[TMP5:%.*]] = sdiv i64 [[TMP4]], 4096 5888; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1 5889; CHECK-NEXT: store <2 x i64> [[TMP6]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16 5890; CHECK-NEXT: ret void 5891; 5892; GCN-LABEL: sdiv_v2i64_pow2k_denom: 5893; GCN: ; %bb.0: 5894; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 5895; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd 5896; GCN-NEXT: 
s_mov_b32 s7, 0xf000 5897; GCN-NEXT: s_mov_b32 s6, -1 5898; GCN-NEXT: s_waitcnt lgkmcnt(0) 5899; GCN-NEXT: s_ashr_i32 s8, s1, 31 5900; GCN-NEXT: s_lshr_b32 s8, s8, 20 5901; GCN-NEXT: s_add_u32 s0, s0, s8 5902; GCN-NEXT: s_addc_u32 s1, s1, 0 5903; GCN-NEXT: s_ashr_i32 s8, s3, 31 5904; GCN-NEXT: s_ashr_i64 s[0:1], s[0:1], 12 5905; GCN-NEXT: s_lshr_b32 s8, s8, 20 5906; GCN-NEXT: s_add_u32 s2, s2, s8 5907; GCN-NEXT: s_addc_u32 s3, s3, 0 5908; GCN-NEXT: s_ashr_i64 s[2:3], s[2:3], 12 5909; GCN-NEXT: v_mov_b32_e32 v0, s0 5910; GCN-NEXT: v_mov_b32_e32 v1, s1 5911; GCN-NEXT: v_mov_b32_e32 v2, s2 5912; GCN-NEXT: v_mov_b32_e32 v3, s3 5913; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 5914; GCN-NEXT: s_endpgm 5915 %r = sdiv <2 x i64> %x, <i64 4096, i64 4096> 5916 store <2 x i64> %r, <2 x i64> addrspace(1)* %out 5917 ret void 5918} 5919 5920define amdgpu_kernel void @ssdiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x) { 5921; CHECK-LABEL: @ssdiv_v2i64_mixed_pow2k_denom( 5922; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0 5923; CHECK-NEXT: [[TMP2:%.*]] = sdiv i64 [[TMP1]], 4096 5924; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i64 0 5925; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1 5926; CHECK-NEXT: [[TMP5:%.*]] = sdiv i64 [[TMP4]], 4095 5927; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1 5928; CHECK-NEXT: store <2 x i64> [[TMP6]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16 5929; CHECK-NEXT: ret void 5930; 5931; GCN-LABEL: ssdiv_v2i64_mixed_pow2k_denom: 5932; GCN: ; %bb.0: 5933; GCN-NEXT: v_mov_b32_e32 v0, 0x457ff000 5934; GCN-NEXT: v_mov_b32_e32 v1, 0x4f800000 5935; GCN-NEXT: v_mac_f32_e32 v0, 0, v1 5936; GCN-NEXT: v_rcp_f32_e32 v0, v0 5937; GCN-NEXT: s_movk_i32 s6, 0xf001 5938; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 5939; GCN-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd 5940; GCN-NEXT: s_mov_b32 s7, 0xf000 5941; GCN-NEXT: v_mul_f32_e32 
v0, 0x5f7ffffc, v0 5942; GCN-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 5943; GCN-NEXT: v_trunc_f32_e32 v1, v1 5944; GCN-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 5945; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 5946; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 5947; GCN-NEXT: s_waitcnt lgkmcnt(0) 5948; GCN-NEXT: s_ashr_i32 s0, s9, 31 5949; GCN-NEXT: s_lshr_b32 s0, s0, 20 5950; GCN-NEXT: v_mul_hi_u32 v2, s6, v0 5951; GCN-NEXT: v_mul_lo_u32 v3, v1, s6 5952; GCN-NEXT: s_add_u32 s2, s8, s0 5953; GCN-NEXT: s_addc_u32 s3, s9, 0 5954; GCN-NEXT: s_ashr_i32 s8, s11, 31 5955; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3 5956; GCN-NEXT: v_mul_lo_u32 v3, v0, s6 5957; GCN-NEXT: v_subrev_i32_e32 v2, vcc, v0, v2 5958; GCN-NEXT: v_mul_lo_u32 v4, v0, v2 5959; GCN-NEXT: v_mul_hi_u32 v6, v0, v2 5960; GCN-NEXT: v_mul_hi_u32 v5, v0, v3 5961; GCN-NEXT: v_mul_hi_u32 v7, v1, v2 5962; GCN-NEXT: v_mul_lo_u32 v2, v1, v2 5963; GCN-NEXT: s_ashr_i64 s[2:3], s[2:3], 12 5964; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4 5965; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc 5966; GCN-NEXT: v_mul_lo_u32 v6, v1, v3 5967; GCN-NEXT: v_mul_hi_u32 v3, v1, v3 5968; GCN-NEXT: s_mov_b32 s9, s8 5969; GCN-NEXT: v_add_i32_e32 v4, vcc, v6, v4 5970; GCN-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc 5971; GCN-NEXT: v_mov_b32_e32 v4, 0 5972; GCN-NEXT: v_addc_u32_e32 v5, vcc, v7, v4, vcc 5973; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 5974; GCN-NEXT: v_mov_b32_e32 v6, 0 5975; GCN-NEXT: v_add_i32_e64 v0, s[0:1], v0, v2 5976; GCN-NEXT: v_addc_u32_e32 v3, vcc, v6, v5, vcc 5977; GCN-NEXT: v_addc_u32_e64 v2, vcc, v1, v3, s[0:1] 5978; GCN-NEXT: v_mul_lo_u32 v5, v2, s6 5979; GCN-NEXT: v_mul_hi_u32 v7, s6, v0 5980; GCN-NEXT: v_add_i32_e32 v5, vcc, v7, v5 5981; GCN-NEXT: v_mul_lo_u32 v7, v0, s6 5982; GCN-NEXT: v_subrev_i32_e32 v5, vcc, v0, v5 5983; GCN-NEXT: v_mul_lo_u32 v10, v0, v5 5984; GCN-NEXT: v_mul_hi_u32 v12, v0, v5 5985; GCN-NEXT: v_mul_hi_u32 v11, v0, v7 5986; GCN-NEXT: v_mul_hi_u32 v9, v2, v7 5987; GCN-NEXT: v_mul_lo_u32 v7, v2, v7 5988; GCN-NEXT: 
v_mul_hi_u32 v8, v2, v5 5989; GCN-NEXT: v_add_i32_e32 v10, vcc, v11, v10 5990; GCN-NEXT: v_addc_u32_e32 v11, vcc, 0, v12, vcc 5991; GCN-NEXT: v_mul_lo_u32 v2, v2, v5 5992; GCN-NEXT: v_add_i32_e32 v7, vcc, v10, v7 5993; GCN-NEXT: v_addc_u32_e32 v7, vcc, v11, v9, vcc 5994; GCN-NEXT: v_addc_u32_e32 v5, vcc, v8, v4, vcc 5995; GCN-NEXT: v_add_i32_e32 v2, vcc, v7, v2 5996; GCN-NEXT: v_addc_u32_e32 v5, vcc, v6, v5, vcc 5997; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v3 5998; GCN-NEXT: v_addc_u32_e64 v1, vcc, v1, v5, s[0:1] 5999; GCN-NEXT: s_add_u32 s0, s10, s8 6000; GCN-NEXT: s_addc_u32 s1, s11, s8 6001; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 6002; GCN-NEXT: s_xor_b64 s[0:1], s[0:1], s[8:9] 6003; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 6004; GCN-NEXT: v_mul_lo_u32 v2, s0, v1 6005; GCN-NEXT: v_mul_hi_u32 v3, s0, v0 6006; GCN-NEXT: v_mul_hi_u32 v5, s0, v1 6007; GCN-NEXT: v_mul_hi_u32 v7, s1, v1 6008; GCN-NEXT: v_mul_lo_u32 v1, s1, v1 6009; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 6010; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc 6011; GCN-NEXT: v_mul_lo_u32 v5, s1, v0 6012; GCN-NEXT: v_mul_hi_u32 v0, s1, v0 6013; GCN-NEXT: s_movk_i32 s9, 0xfff 6014; GCN-NEXT: s_mov_b32 s6, -1 6015; GCN-NEXT: v_add_i32_e32 v2, vcc, v5, v2 6016; GCN-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc 6017; GCN-NEXT: v_addc_u32_e32 v2, vcc, v7, v4, vcc 6018; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 6019; GCN-NEXT: v_addc_u32_e32 v1, vcc, v6, v2, vcc 6020; GCN-NEXT: v_mul_lo_u32 v2, v1, s9 6021; GCN-NEXT: v_mul_hi_u32 v3, s9, v0 6022; GCN-NEXT: v_mul_lo_u32 v4, v0, s9 6023; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 6024; GCN-NEXT: v_sub_i32_e32 v4, vcc, s0, v4 6025; GCN-NEXT: v_mov_b32_e32 v3, s1 6026; GCN-NEXT: v_subb_u32_e32 v2, vcc, v3, v2, vcc 6027; GCN-NEXT: v_subrev_i32_e32 v3, vcc, s9, v4 6028; GCN-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v2, vcc 6029; GCN-NEXT: s_movk_i32 s0, 0xffe 6030; GCN-NEXT: v_cmp_lt_u32_e32 vcc, s0, v3 6031; GCN-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc 6032; GCN-NEXT: 
v_cmp_eq_u32_e32 vcc, 0, v5 6033; GCN-NEXT: v_cndmask_b32_e32 v3, -1, v3, vcc 6034; GCN-NEXT: v_add_i32_e32 v5, vcc, 2, v0 6035; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc 6036; GCN-NEXT: v_add_i32_e32 v7, vcc, 1, v0 6037; GCN-NEXT: v_cmp_lt_u32_e64 s[0:1], s0, v4 6038; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v1, vcc 6039; GCN-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1] 6040; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v2 6041; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 6042; GCN-NEXT: v_cndmask_b32_e64 v2, -1, v4, s[0:1] 6043; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v2 6044; GCN-NEXT: v_cndmask_b32_e32 v2, v7, v5, vcc 6045; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] 6046; GCN-NEXT: v_cndmask_b32_e32 v3, v8, v6, vcc 6047; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] 6048; GCN-NEXT: v_xor_b32_e32 v0, s8, v0 6049; GCN-NEXT: v_subrev_i32_e32 v2, vcc, s8, v0 6050; GCN-NEXT: v_xor_b32_e32 v1, s8, v1 6051; GCN-NEXT: v_mov_b32_e32 v3, s8 6052; GCN-NEXT: v_subb_u32_e32 v3, vcc, v1, v3, vcc 6053; GCN-NEXT: v_mov_b32_e32 v0, s2 6054; GCN-NEXT: v_mov_b32_e32 v1, s3 6055; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 6056; GCN-NEXT: s_endpgm 6057 %r = sdiv <2 x i64> %x, <i64 4096, i64 4095> 6058 store <2 x i64> %r, <2 x i64> addrspace(1)* %out 6059 ret void 6060} 6061 6062define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x, <2 x i64> %y) { 6063; CHECK-LABEL: @sdiv_v2i64_pow2_shl_denom( 6064; CHECK-NEXT: [[SHL_Y:%.*]] = shl <2 x i64> <i64 4096, i64 4096>, [[Y:%.*]] 6065; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0 6066; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 0 6067; CHECK-NEXT: [[TMP3:%.*]] = sdiv i64 [[TMP1]], [[TMP2]] 6068; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> undef, i64 [[TMP3]], i64 0 6069; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[X]], i64 1 6070; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 1 6071; CHECK-NEXT: [[TMP7:%.*]] = sdiv 
i64 [[TMP5]], [[TMP6]] 6072; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i64> [[TMP4]], i64 [[TMP7]], i64 1 6073; CHECK-NEXT: store <2 x i64> [[TMP8]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16 6074; CHECK-NEXT: ret void 6075; 6076; GCN-LABEL: sdiv_v2i64_pow2_shl_denom: 6077; GCN: ; %bb.0: 6078; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x11 6079; GCN-NEXT: s_mov_b32 s3, 0 6080; GCN-NEXT: s_movk_i32 s2, 0x1000 6081; GCN-NEXT: s_mov_b32 s18, 0x4f800000 6082; GCN-NEXT: s_mov_b32 s19, 0x5f7ffffc 6083; GCN-NEXT: s_waitcnt lgkmcnt(0) 6084; GCN-NEXT: s_lshl_b64 s[12:13], s[2:3], s6 6085; GCN-NEXT: s_lshl_b64 s[2:3], s[2:3], s4 6086; GCN-NEXT: s_ashr_i32 s16, s3, 31 6087; GCN-NEXT: s_add_u32 s2, s2, s16 6088; GCN-NEXT: s_mov_b32 s17, s16 6089; GCN-NEXT: s_addc_u32 s3, s3, s16 6090; GCN-NEXT: s_xor_b64 s[14:15], s[2:3], s[16:17] 6091; GCN-NEXT: v_cvt_f32_u32_e32 v0, s14 6092; GCN-NEXT: v_cvt_f32_u32_e32 v1, s15 6093; GCN-NEXT: s_mov_b32 s20, 0x2f800000 6094; GCN-NEXT: s_mov_b32 s21, 0xcf800000 6095; GCN-NEXT: s_sub_u32 s6, 0, s14 6096; GCN-NEXT: v_mac_f32_e32 v0, s18, v1 6097; GCN-NEXT: v_rcp_f32_e32 v0, v0 6098; GCN-NEXT: s_subb_u32 s7, 0, s15 6099; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 6100; GCN-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd 6101; GCN-NEXT: v_mul_f32_e32 v0, s19, v0 6102; GCN-NEXT: v_mul_f32_e32 v1, s20, v0 6103; GCN-NEXT: v_trunc_f32_e32 v1, v1 6104; GCN-NEXT: v_mac_f32_e32 v0, s21, v1 6105; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 6106; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 6107; GCN-NEXT: v_mul_hi_u32 v3, s6, v0 6108; GCN-NEXT: v_mul_lo_u32 v2, s6, v1 6109; GCN-NEXT: v_mul_lo_u32 v4, s7, v0 6110; GCN-NEXT: v_mul_lo_u32 v5, s6, v0 6111; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 6112; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 6113; GCN-NEXT: v_mul_lo_u32 v3, v0, v2 6114; GCN-NEXT: v_mul_hi_u32 v4, v0, v5 6115; GCN-NEXT: v_mul_hi_u32 v6, v0, v2 6116; GCN-NEXT: v_mul_hi_u32 v7, v1, v2 6117; GCN-NEXT: v_mul_lo_u32 v2, v1, v2 6118; GCN-NEXT: v_add_i32_e32 v3, 
vcc, v4, v3 6119; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v6, vcc 6120; GCN-NEXT: v_mul_lo_u32 v6, v1, v5 6121; GCN-NEXT: v_mul_hi_u32 v5, v1, v5 6122; GCN-NEXT: v_add_i32_e32 v3, vcc, v6, v3 6123; GCN-NEXT: v_addc_u32_e32 v3, vcc, v4, v5, vcc 6124; GCN-NEXT: v_mov_b32_e32 v4, 0 6125; GCN-NEXT: v_addc_u32_e32 v5, vcc, v7, v4, vcc 6126; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 6127; GCN-NEXT: v_mov_b32_e32 v6, 0 6128; GCN-NEXT: v_add_i32_e64 v0, s[2:3], v0, v2 6129; GCN-NEXT: v_addc_u32_e32 v3, vcc, v6, v5, vcc 6130; GCN-NEXT: v_addc_u32_e64 v2, vcc, v1, v3, s[2:3] 6131; GCN-NEXT: v_mul_lo_u32 v5, s6, v2 6132; GCN-NEXT: v_mul_hi_u32 v7, s6, v0 6133; GCN-NEXT: v_mul_lo_u32 v8, s7, v0 6134; GCN-NEXT: s_mov_b32 s7, 0xf000 6135; GCN-NEXT: v_add_i32_e32 v5, vcc, v7, v5 6136; GCN-NEXT: v_mul_lo_u32 v7, s6, v0 6137; GCN-NEXT: v_add_i32_e32 v5, vcc, v8, v5 6138; GCN-NEXT: v_mul_lo_u32 v10, v0, v5 6139; GCN-NEXT: v_mul_hi_u32 v12, v0, v5 6140; GCN-NEXT: v_mul_hi_u32 v11, v0, v7 6141; GCN-NEXT: v_mul_hi_u32 v9, v2, v7 6142; GCN-NEXT: v_mul_lo_u32 v7, v2, v7 6143; GCN-NEXT: v_mul_hi_u32 v8, v2, v5 6144; GCN-NEXT: v_add_i32_e32 v10, vcc, v11, v10 6145; GCN-NEXT: v_addc_u32_e32 v11, vcc, 0, v12, vcc 6146; GCN-NEXT: v_mul_lo_u32 v2, v2, v5 6147; GCN-NEXT: v_add_i32_e32 v7, vcc, v10, v7 6148; GCN-NEXT: v_addc_u32_e32 v7, vcc, v11, v9, vcc 6149; GCN-NEXT: v_addc_u32_e32 v5, vcc, v8, v4, vcc 6150; GCN-NEXT: v_add_i32_e32 v2, vcc, v7, v2 6151; GCN-NEXT: v_addc_u32_e32 v5, vcc, v6, v5, vcc 6152; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v3 6153; GCN-NEXT: v_addc_u32_e64 v1, vcc, v1, v5, s[2:3] 6154; GCN-NEXT: s_waitcnt lgkmcnt(0) 6155; GCN-NEXT: s_ashr_i32 s2, s9, 31 6156; GCN-NEXT: s_add_u32 s0, s8, s2 6157; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 6158; GCN-NEXT: s_mov_b32 s3, s2 6159; GCN-NEXT: s_addc_u32 s1, s9, s2 6160; GCN-NEXT: s_xor_b64 s[8:9], s[0:1], s[2:3] 6161; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 6162; GCN-NEXT: v_mul_lo_u32 v2, s8, v1 6163; GCN-NEXT: v_mul_hi_u32 v3, s8, 
v0 6164; GCN-NEXT: v_mul_hi_u32 v5, s8, v1 6165; GCN-NEXT: v_mul_hi_u32 v7, s9, v1 6166; GCN-NEXT: v_mul_lo_u32 v1, s9, v1 6167; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 6168; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc 6169; GCN-NEXT: v_mul_lo_u32 v5, s9, v0 6170; GCN-NEXT: v_mul_hi_u32 v0, s9, v0 6171; GCN-NEXT: s_xor_b64 s[2:3], s[2:3], s[16:17] 6172; GCN-NEXT: s_mov_b32 s6, -1 6173; GCN-NEXT: v_add_i32_e32 v2, vcc, v5, v2 6174; GCN-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc 6175; GCN-NEXT: v_addc_u32_e32 v2, vcc, v7, v4, vcc 6176; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 6177; GCN-NEXT: v_addc_u32_e32 v1, vcc, v6, v2, vcc 6178; GCN-NEXT: v_mul_lo_u32 v2, s14, v1 6179; GCN-NEXT: v_mul_hi_u32 v3, s14, v0 6180; GCN-NEXT: v_mul_lo_u32 v5, s15, v0 6181; GCN-NEXT: v_mov_b32_e32 v7, s15 6182; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 6183; GCN-NEXT: v_mul_lo_u32 v3, s14, v0 6184; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v5 6185; GCN-NEXT: v_sub_i32_e32 v5, vcc, s9, v2 6186; GCN-NEXT: v_sub_i32_e32 v3, vcc, s8, v3 6187; GCN-NEXT: v_subb_u32_e64 v5, s[0:1], v5, v7, vcc 6188; GCN-NEXT: v_subrev_i32_e64 v7, s[0:1], s14, v3 6189; GCN-NEXT: v_subbrev_u32_e64 v5, s[0:1], 0, v5, s[0:1] 6190; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s15, v5 6191; GCN-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[0:1] 6192; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s14, v7 6193; GCN-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1] 6194; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], s15, v5 6195; GCN-NEXT: v_cndmask_b32_e64 v5, v8, v7, s[0:1] 6196; GCN-NEXT: v_add_i32_e64 v7, s[0:1], 2, v0 6197; GCN-NEXT: v_addc_u32_e64 v8, s[0:1], 0, v1, s[0:1] 6198; GCN-NEXT: v_add_i32_e64 v9, s[0:1], 1, v0 6199; GCN-NEXT: v_addc_u32_e64 v10, s[0:1], 0, v1, s[0:1] 6200; GCN-NEXT: s_ashr_i32 s8, s13, 31 6201; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v5 6202; GCN-NEXT: s_add_u32 s12, s12, s8 6203; GCN-NEXT: v_cndmask_b32_e64 v5, v10, v8, s[0:1] 6204; GCN-NEXT: v_mov_b32_e32 v8, s9 6205; GCN-NEXT: s_mov_b32 s9, s8 6206; GCN-NEXT: s_addc_u32 s13, s13, s8 6207; 
GCN-NEXT: s_xor_b64 s[12:13], s[12:13], s[8:9] 6208; GCN-NEXT: v_cvt_f32_u32_e32 v10, s12 6209; GCN-NEXT: v_cvt_f32_u32_e32 v11, s13 6210; GCN-NEXT: v_subb_u32_e32 v2, vcc, v8, v2, vcc 6211; GCN-NEXT: v_cmp_le_u32_e32 vcc, s15, v2 6212; GCN-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc 6213; GCN-NEXT: v_cmp_le_u32_e32 vcc, s14, v3 6214; GCN-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc 6215; GCN-NEXT: v_cmp_eq_u32_e32 vcc, s15, v2 6216; GCN-NEXT: v_mac_f32_e32 v10, s18, v11 6217; GCN-NEXT: v_cndmask_b32_e32 v2, v8, v3, vcc 6218; GCN-NEXT: v_rcp_f32_e32 v3, v10 6219; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 6220; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc 6221; GCN-NEXT: s_sub_u32 s14, 0, s12 6222; GCN-NEXT: v_mul_f32_e32 v3, s19, v3 6223; GCN-NEXT: v_mul_f32_e32 v5, s20, v3 6224; GCN-NEXT: v_trunc_f32_e32 v5, v5 6225; GCN-NEXT: v_mac_f32_e32 v3, s21, v5 6226; GCN-NEXT: v_cvt_u32_f32_e32 v3, v3 6227; GCN-NEXT: v_cvt_u32_f32_e32 v5, v5 6228; GCN-NEXT: v_cndmask_b32_e64 v2, v9, v7, s[0:1] 6229; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 6230; GCN-NEXT: v_mul_hi_u32 v2, s14, v3 6231; GCN-NEXT: v_mul_lo_u32 v7, s14, v5 6232; GCN-NEXT: s_subb_u32 s15, 0, s13 6233; GCN-NEXT: v_mul_lo_u32 v8, s15, v3 6234; GCN-NEXT: v_xor_b32_e32 v0, s2, v0 6235; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v7 6236; GCN-NEXT: v_mul_lo_u32 v7, s14, v3 6237; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v8 6238; GCN-NEXT: v_mul_lo_u32 v8, v3, v2 6239; GCN-NEXT: v_mul_hi_u32 v10, v3, v2 6240; GCN-NEXT: v_mul_hi_u32 v9, v3, v7 6241; GCN-NEXT: v_mul_hi_u32 v11, v5, v2 6242; GCN-NEXT: v_mul_lo_u32 v2, v5, v2 6243; GCN-NEXT: v_xor_b32_e32 v1, s3, v1 6244; GCN-NEXT: v_add_i32_e32 v8, vcc, v9, v8 6245; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v10, vcc 6246; GCN-NEXT: v_mul_lo_u32 v10, v5, v7 6247; GCN-NEXT: v_mul_hi_u32 v7, v5, v7 6248; GCN-NEXT: v_add_i32_e32 v8, vcc, v10, v8 6249; GCN-NEXT: v_addc_u32_e32 v7, vcc, v9, v7, vcc 6250; GCN-NEXT: v_addc_u32_e32 v8, vcc, v11, v4, vcc 6251; GCN-NEXT: v_add_i32_e32 v2, vcc, v7, v2 6252; 
GCN-NEXT: v_add_i32_e64 v2, s[0:1], v3, v2 6253; GCN-NEXT: v_addc_u32_e32 v7, vcc, v6, v8, vcc 6254; GCN-NEXT: v_addc_u32_e64 v3, vcc, v5, v7, s[0:1] 6255; GCN-NEXT: v_mul_lo_u32 v8, s14, v3 6256; GCN-NEXT: v_mul_hi_u32 v9, s14, v2 6257; GCN-NEXT: v_mul_lo_u32 v10, s15, v2 6258; GCN-NEXT: v_add_i32_e32 v8, vcc, v9, v8 6259; GCN-NEXT: v_mul_lo_u32 v9, s14, v2 6260; GCN-NEXT: v_add_i32_e32 v8, vcc, v10, v8 6261; GCN-NEXT: v_mul_lo_u32 v12, v2, v8 6262; GCN-NEXT: v_mul_hi_u32 v14, v2, v8 6263; GCN-NEXT: v_mul_hi_u32 v13, v2, v9 6264; GCN-NEXT: v_mul_hi_u32 v11, v3, v9 6265; GCN-NEXT: v_mul_lo_u32 v9, v3, v9 6266; GCN-NEXT: v_mul_hi_u32 v10, v3, v8 6267; GCN-NEXT: v_add_i32_e32 v12, vcc, v13, v12 6268; GCN-NEXT: v_addc_u32_e32 v13, vcc, 0, v14, vcc 6269; GCN-NEXT: v_mul_lo_u32 v3, v3, v8 6270; GCN-NEXT: v_add_i32_e32 v9, vcc, v12, v9 6271; GCN-NEXT: v_addc_u32_e32 v9, vcc, v13, v11, vcc 6272; GCN-NEXT: v_addc_u32_e32 v8, vcc, v10, v4, vcc 6273; GCN-NEXT: v_add_i32_e32 v3, vcc, v9, v3 6274; GCN-NEXT: v_addc_u32_e32 v8, vcc, v6, v8, vcc 6275; GCN-NEXT: v_add_i32_e32 v5, vcc, v5, v7 6276; GCN-NEXT: s_ashr_i32 s14, s11, 31 6277; GCN-NEXT: v_addc_u32_e64 v5, vcc, v5, v8, s[0:1] 6278; GCN-NEXT: s_add_u32 s0, s10, s14 6279; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3 6280; GCN-NEXT: s_mov_b32 s15, s14 6281; GCN-NEXT: s_addc_u32 s1, s11, s14 6282; GCN-NEXT: s_xor_b64 s[10:11], s[0:1], s[14:15] 6283; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc 6284; GCN-NEXT: v_mul_lo_u32 v5, s10, v3 6285; GCN-NEXT: v_mul_hi_u32 v7, s10, v2 6286; GCN-NEXT: v_mul_hi_u32 v9, s10, v3 6287; GCN-NEXT: v_mul_hi_u32 v10, s11, v3 6288; GCN-NEXT: v_mul_lo_u32 v3, s11, v3 6289; GCN-NEXT: v_add_i32_e32 v5, vcc, v7, v5 6290; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v9, vcc 6291; GCN-NEXT: v_mul_lo_u32 v9, s11, v2 6292; GCN-NEXT: v_mul_hi_u32 v2, s11, v2 6293; GCN-NEXT: v_mov_b32_e32 v8, s3 6294; GCN-NEXT: v_add_i32_e32 v5, vcc, v9, v5 6295; GCN-NEXT: v_addc_u32_e32 v2, vcc, v7, v2, vcc 6296; GCN-NEXT: 
v_addc_u32_e32 v4, vcc, v10, v4, vcc 6297; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3 6298; GCN-NEXT: v_addc_u32_e32 v3, vcc, v6, v4, vcc 6299; GCN-NEXT: v_mul_lo_u32 v4, s12, v3 6300; GCN-NEXT: v_mul_hi_u32 v5, s12, v2 6301; GCN-NEXT: v_mul_lo_u32 v6, s13, v2 6302; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s2, v0 6303; GCN-NEXT: v_subb_u32_e32 v1, vcc, v1, v8, vcc 6304; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4 6305; GCN-NEXT: v_mul_lo_u32 v5, s12, v2 6306; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v6 6307; GCN-NEXT: v_sub_i32_e32 v6, vcc, s11, v4 6308; GCN-NEXT: v_mov_b32_e32 v7, s13 6309; GCN-NEXT: v_sub_i32_e32 v5, vcc, s10, v5 6310; GCN-NEXT: v_subb_u32_e64 v6, s[0:1], v6, v7, vcc 6311; GCN-NEXT: v_subrev_i32_e64 v7, s[0:1], s12, v5 6312; GCN-NEXT: v_subbrev_u32_e64 v6, s[0:1], 0, v6, s[0:1] 6313; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v6 6314; GCN-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[0:1] 6315; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v7 6316; GCN-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1] 6317; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v6 6318; GCN-NEXT: v_cndmask_b32_e64 v6, v8, v7, s[0:1] 6319; GCN-NEXT: v_add_i32_e64 v7, s[0:1], 2, v2 6320; GCN-NEXT: v_addc_u32_e64 v8, s[0:1], 0, v3, s[0:1] 6321; GCN-NEXT: v_add_i32_e64 v9, s[0:1], 1, v2 6322; GCN-NEXT: v_addc_u32_e64 v10, s[0:1], 0, v3, s[0:1] 6323; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v6 6324; GCN-NEXT: v_cndmask_b32_e64 v6, v10, v8, s[0:1] 6325; GCN-NEXT: v_mov_b32_e32 v8, s11 6326; GCN-NEXT: v_subb_u32_e32 v4, vcc, v8, v4, vcc 6327; GCN-NEXT: v_cmp_le_u32_e32 vcc, s13, v4 6328; GCN-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc 6329; GCN-NEXT: v_cmp_le_u32_e32 vcc, s12, v5 6330; GCN-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc 6331; GCN-NEXT: v_cmp_eq_u32_e32 vcc, s13, v4 6332; GCN-NEXT: v_cndmask_b32_e32 v4, v8, v5, vcc 6333; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 6334; GCN-NEXT: v_cndmask_b32_e64 v4, v9, v7, s[0:1] 6335; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc 6336; GCN-NEXT: s_xor_b64 s[0:1], s[14:15], s[8:9] 6337; GCN-NEXT: 
v_cndmask_b32_e32 v3, v3, v6, vcc 6338; GCN-NEXT: v_xor_b32_e32 v2, s0, v2 6339; GCN-NEXT: v_xor_b32_e32 v3, s1, v3 6340; GCN-NEXT: v_mov_b32_e32 v4, s1 6341; GCN-NEXT: v_subrev_i32_e32 v2, vcc, s0, v2 6342; GCN-NEXT: v_subb_u32_e32 v3, vcc, v3, v4, vcc 6343; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 6344; GCN-NEXT: s_endpgm 6345 %shl.y = shl <2 x i64> <i64 4096, i64 4096>, %y 6346 %r = sdiv <2 x i64> %x, %shl.y 6347 store <2 x i64> %r, <2 x i64> addrspace(1)* %out 6348 ret void 6349} 6350 6351define amdgpu_kernel void @srem_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) { 6352; CHECK-LABEL: @srem_i64_oddk_denom( 6353; CHECK-NEXT: [[R:%.*]] = srem i64 [[X:%.*]], 1235195 6354; CHECK-NEXT: store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4 6355; CHECK-NEXT: ret void 6356; 6357; GCN-LABEL: srem_i64_oddk_denom: 6358; GCN: ; %bb.0: 6359; GCN-NEXT: v_mov_b32_e32 v0, 0x4f800000 6360; GCN-NEXT: v_madak_f32 v0, 0, v0, 0x4996c7d8 6361; GCN-NEXT: v_rcp_f32_e32 v0, v0 6362; GCN-NEXT: s_mov_b32 s2, 0xffed2705 6363; GCN-NEXT: v_mov_b32_e32 v8, 0 6364; GCN-NEXT: v_mov_b32_e32 v7, 0 6365; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 6366; GCN-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 6367; GCN-NEXT: v_trunc_f32_e32 v1, v1 6368; GCN-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 6369; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 6370; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 6371; GCN-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 6372; GCN-NEXT: s_mov_b32 s7, 0xf000 6373; GCN-NEXT: v_mul_hi_u32 v3, s2, v0 6374; GCN-NEXT: v_mul_lo_u32 v2, v1, s2 6375; GCN-NEXT: v_mul_lo_u32 v4, v0, s2 6376; GCN-NEXT: s_mov_b32 s6, -1 6377; GCN-NEXT: s_waitcnt lgkmcnt(0) 6378; GCN-NEXT: s_mov_b32 s4, s8 6379; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 6380; GCN-NEXT: v_subrev_i32_e32 v2, vcc, v0, v2 6381; GCN-NEXT: v_mul_lo_u32 v5, v0, v2 6382; GCN-NEXT: v_mul_hi_u32 v6, v0, v4 6383; GCN-NEXT: v_mul_hi_u32 v3, v0, v2 6384; GCN-NEXT: v_mul_hi_u32 v9, v1, v2 6385; GCN-NEXT: v_mul_lo_u32 v2, v1, v2 6386; GCN-NEXT: 
v_add_i32_e32 v5, vcc, v6, v5 6387; GCN-NEXT: v_mul_lo_u32 v6, v1, v4 6388; GCN-NEXT: v_mul_hi_u32 v4, v1, v4 6389; GCN-NEXT: v_addc_u32_e32 v3, vcc, v8, v3, vcc 6390; GCN-NEXT: s_mov_b32 s5, s9 6391; GCN-NEXT: v_add_i32_e32 v5, vcc, v6, v5 6392; GCN-NEXT: v_addc_u32_e32 v3, vcc, v3, v4, vcc 6393; GCN-NEXT: v_addc_u32_e32 v4, vcc, v9, v7, vcc 6394; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 6395; GCN-NEXT: v_add_i32_e64 v0, s[0:1], v0, v2 6396; GCN-NEXT: v_addc_u32_e32 v3, vcc, v8, v4, vcc 6397; GCN-NEXT: v_addc_u32_e64 v2, vcc, v1, v3, s[0:1] 6398; GCN-NEXT: v_mul_lo_u32 v4, v2, s2 6399; GCN-NEXT: v_mul_hi_u32 v5, s2, v0 6400; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4 6401; GCN-NEXT: v_mul_lo_u32 v5, v0, s2 6402; GCN-NEXT: v_subrev_i32_e32 v4, vcc, v0, v4 6403; GCN-NEXT: v_mul_lo_u32 v10, v0, v4 6404; GCN-NEXT: v_mul_hi_u32 v12, v0, v4 6405; GCN-NEXT: v_mul_hi_u32 v11, v0, v5 6406; GCN-NEXT: v_mul_hi_u32 v9, v2, v5 6407; GCN-NEXT: v_mul_lo_u32 v5, v2, v5 6408; GCN-NEXT: v_mul_hi_u32 v6, v2, v4 6409; GCN-NEXT: v_add_i32_e32 v10, vcc, v11, v10 6410; GCN-NEXT: v_addc_u32_e32 v11, vcc, v8, v12, vcc 6411; GCN-NEXT: v_mul_lo_u32 v2, v2, v4 6412; GCN-NEXT: v_add_i32_e32 v5, vcc, v10, v5 6413; GCN-NEXT: v_addc_u32_e32 v5, vcc, v11, v9, vcc 6414; GCN-NEXT: v_addc_u32_e32 v4, vcc, v6, v7, vcc 6415; GCN-NEXT: v_add_i32_e32 v2, vcc, v5, v2 6416; GCN-NEXT: v_addc_u32_e32 v4, vcc, v8, v4, vcc 6417; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v3 6418; GCN-NEXT: s_ashr_i32 s2, s11, 31 6419; GCN-NEXT: v_addc_u32_e64 v1, vcc, v1, v4, s[0:1] 6420; GCN-NEXT: s_add_u32 s0, s10, s2 6421; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 6422; GCN-NEXT: s_mov_b32 s3, s2 6423; GCN-NEXT: s_addc_u32 s1, s11, s2 6424; GCN-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] 6425; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 6426; GCN-NEXT: v_mul_lo_u32 v2, s0, v1 6427; GCN-NEXT: v_mul_hi_u32 v3, s0, v0 6428; GCN-NEXT: v_mul_hi_u32 v4, s0, v1 6429; GCN-NEXT: v_mul_hi_u32 v5, s1, v1 6430; GCN-NEXT: v_mul_lo_u32 v1, s1, v1 6431; 
GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 6432; GCN-NEXT: v_addc_u32_e32 v3, vcc, v8, v4, vcc 6433; GCN-NEXT: v_mul_lo_u32 v4, s1, v0 6434; GCN-NEXT: v_mul_hi_u32 v0, s1, v0 6435; GCN-NEXT: s_mov_b32 s3, 0x12d8fb 6436; GCN-NEXT: v_add_i32_e32 v2, vcc, v4, v2 6437; GCN-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc 6438; GCN-NEXT: v_addc_u32_e32 v2, vcc, v5, v7, vcc 6439; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 6440; GCN-NEXT: v_addc_u32_e32 v1, vcc, v8, v2, vcc 6441; GCN-NEXT: v_mul_hi_u32 v2, s3, v0 6442; GCN-NEXT: v_mul_lo_u32 v1, v1, s3 6443; GCN-NEXT: v_mul_lo_u32 v0, v0, s3 6444; GCN-NEXT: v_add_i32_e32 v1, vcc, v2, v1 6445; GCN-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 6446; GCN-NEXT: v_mov_b32_e32 v2, s1 6447; GCN-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc 6448; GCN-NEXT: v_subrev_i32_e32 v2, vcc, s3, v0 6449; GCN-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v1, vcc 6450; GCN-NEXT: v_subrev_i32_e32 v4, vcc, s3, v2 6451; GCN-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v3, vcc 6452; GCN-NEXT: s_mov_b32 s0, 0x12d8fa 6453; GCN-NEXT: v_cmp_lt_u32_e32 vcc, s0, v2 6454; GCN-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc 6455; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 6456; GCN-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc 6457; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 6458; GCN-NEXT: v_cmp_lt_u32_e64 s[0:1], s0, v0 6459; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc 6460; GCN-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] 6461; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v1 6462; GCN-NEXT: v_cndmask_b32_e64 v5, -1, v5, s[0:1] 6463; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v5 6464; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc 6465; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] 6466; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] 6467; GCN-NEXT: v_xor_b32_e32 v0, s2, v0 6468; GCN-NEXT: v_xor_b32_e32 v1, s2, v1 6469; GCN-NEXT: v_mov_b32_e32 v2, s2 6470; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s2, v0 6471; GCN-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc 6472; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 6473; GCN-NEXT: s_endpgm 6474 
%r = srem i64 %x, 1235195 6475 store i64 %r, i64 addrspace(1)* %out 6476 ret void 6477} 6478 6479define amdgpu_kernel void @srem_i64_pow2k_denom(i64 addrspace(1)* %out, i64 %x) { 6480; CHECK-LABEL: @srem_i64_pow2k_denom( 6481; CHECK-NEXT: [[R:%.*]] = srem i64 [[X:%.*]], 4096 6482; CHECK-NEXT: store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4 6483; CHECK-NEXT: ret void 6484; 6485; GCN-LABEL: srem_i64_pow2k_denom: 6486; GCN: ; %bb.0: 6487; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 6488; GCN-NEXT: s_mov_b32 s3, 0xf000 6489; GCN-NEXT: s_mov_b32 s2, -1 6490; GCN-NEXT: s_waitcnt lgkmcnt(0) 6491; GCN-NEXT: s_mov_b32 s0, s4 6492; GCN-NEXT: s_ashr_i32 s4, s7, 31 6493; GCN-NEXT: s_lshr_b32 s4, s4, 20 6494; GCN-NEXT: s_add_u32 s4, s6, s4 6495; GCN-NEXT: s_mov_b32 s1, s5 6496; GCN-NEXT: s_addc_u32 s5, s7, 0 6497; GCN-NEXT: s_and_b32 s4, s4, 0xfffff000 6498; GCN-NEXT: s_sub_u32 s4, s6, s4 6499; GCN-NEXT: s_subb_u32 s5, s7, s5 6500; GCN-NEXT: v_mov_b32_e32 v0, s4 6501; GCN-NEXT: v_mov_b32_e32 v1, s5 6502; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 6503; GCN-NEXT: s_endpgm 6504 %r = srem i64 %x, 4096 6505 store i64 %r, i64 addrspace(1)* %out 6506 ret void 6507} 6508 6509define amdgpu_kernel void @srem_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %x, i64 %y) { 6510; CHECK-LABEL: @srem_i64_pow2_shl_denom( 6511; CHECK-NEXT: [[SHL_Y:%.*]] = shl i64 4096, [[Y:%.*]] 6512; CHECK-NEXT: [[R:%.*]] = srem i64 [[X:%.*]], [[SHL_Y]] 6513; CHECK-NEXT: store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4 6514; CHECK-NEXT: ret void 6515; 6516; GCN-LABEL: srem_i64_pow2_shl_denom: 6517; GCN: ; %bb.0: 6518; GCN-NEXT: s_load_dword s4, s[0:1], 0xd 6519; GCN-NEXT: s_mov_b32 s3, 0 6520; GCN-NEXT: s_movk_i32 s2, 0x1000 6521; GCN-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 6522; GCN-NEXT: s_mov_b32 s7, 0xf000 6523; GCN-NEXT: s_waitcnt lgkmcnt(0) 6524; GCN-NEXT: s_lshl_b64 s[2:3], s[2:3], s4 6525; GCN-NEXT: s_ashr_i32 s4, s3, 31 6526; GCN-NEXT: s_add_u32 s2, s2, s4 6527; GCN-NEXT: 
s_mov_b32 s5, s4 6528; GCN-NEXT: s_addc_u32 s3, s3, s4 6529; GCN-NEXT: s_xor_b64 s[12:13], s[2:3], s[4:5] 6530; GCN-NEXT: v_cvt_f32_u32_e32 v0, s12 6531; GCN-NEXT: v_cvt_f32_u32_e32 v1, s13 6532; GCN-NEXT: s_sub_u32 s2, 0, s12 6533; GCN-NEXT: s_subb_u32 s3, 0, s13 6534; GCN-NEXT: s_ashr_i32 s14, s11, 31 6535; GCN-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 6536; GCN-NEXT: v_rcp_f32_e32 v0, v0 6537; GCN-NEXT: s_mov_b32 s15, s14 6538; GCN-NEXT: s_mov_b32 s6, -1 6539; GCN-NEXT: s_mov_b32 s4, s8 6540; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 6541; GCN-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 6542; GCN-NEXT: v_trunc_f32_e32 v1, v1 6543; GCN-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 6544; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 6545; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 6546; GCN-NEXT: s_mov_b32 s5, s9 6547; GCN-NEXT: v_mul_hi_u32 v3, s2, v0 6548; GCN-NEXT: v_mul_lo_u32 v2, s2, v1 6549; GCN-NEXT: v_mul_lo_u32 v5, s3, v0 6550; GCN-NEXT: v_mul_lo_u32 v4, s2, v0 6551; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 6552; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v5 6553; GCN-NEXT: v_mul_hi_u32 v3, v0, v4 6554; GCN-NEXT: v_mul_lo_u32 v5, v0, v2 6555; GCN-NEXT: v_mul_hi_u32 v6, v0, v2 6556; GCN-NEXT: v_mul_hi_u32 v7, v1, v2 6557; GCN-NEXT: v_mul_lo_u32 v2, v1, v2 6558; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5 6559; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc 6560; GCN-NEXT: v_mul_lo_u32 v6, v1, v4 6561; GCN-NEXT: v_mul_hi_u32 v4, v1, v4 6562; GCN-NEXT: v_add_i32_e32 v3, vcc, v6, v3 6563; GCN-NEXT: v_addc_u32_e32 v3, vcc, v5, v4, vcc 6564; GCN-NEXT: v_mov_b32_e32 v4, 0 6565; GCN-NEXT: v_addc_u32_e32 v5, vcc, v7, v4, vcc 6566; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 6567; GCN-NEXT: v_mov_b32_e32 v6, 0 6568; GCN-NEXT: v_add_i32_e64 v0, s[0:1], v0, v2 6569; GCN-NEXT: v_addc_u32_e32 v3, vcc, v6, v5, vcc 6570; GCN-NEXT: v_addc_u32_e64 v2, vcc, v1, v3, s[0:1] 6571; GCN-NEXT: v_mul_lo_u32 v5, s2, v2 6572; GCN-NEXT: v_mul_hi_u32 v7, s2, v0 6573; GCN-NEXT: v_mul_lo_u32 v8, s3, v0 6574; GCN-NEXT: v_add_i32_e32 v5, 
vcc, v7, v5 6575; GCN-NEXT: v_mul_lo_u32 v7, s2, v0 6576; GCN-NEXT: v_add_i32_e32 v5, vcc, v8, v5 6577; GCN-NEXT: v_mul_lo_u32 v10, v0, v5 6578; GCN-NEXT: v_mul_hi_u32 v12, v0, v5 6579; GCN-NEXT: v_mul_hi_u32 v11, v0, v7 6580; GCN-NEXT: v_mul_hi_u32 v9, v2, v7 6581; GCN-NEXT: v_mul_lo_u32 v7, v2, v7 6582; GCN-NEXT: v_mul_hi_u32 v8, v2, v5 6583; GCN-NEXT: v_add_i32_e32 v10, vcc, v11, v10 6584; GCN-NEXT: v_addc_u32_e32 v11, vcc, 0, v12, vcc 6585; GCN-NEXT: v_mul_lo_u32 v2, v2, v5 6586; GCN-NEXT: v_add_i32_e32 v7, vcc, v10, v7 6587; GCN-NEXT: v_addc_u32_e32 v7, vcc, v11, v9, vcc 6588; GCN-NEXT: v_addc_u32_e32 v5, vcc, v8, v4, vcc 6589; GCN-NEXT: v_add_i32_e32 v2, vcc, v7, v2 6590; GCN-NEXT: v_addc_u32_e32 v5, vcc, v6, v5, vcc 6591; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v3 6592; GCN-NEXT: v_addc_u32_e64 v1, vcc, v1, v5, s[0:1] 6593; GCN-NEXT: s_add_u32 s0, s10, s14 6594; GCN-NEXT: s_addc_u32 s1, s11, s14 6595; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 6596; GCN-NEXT: s_xor_b64 s[10:11], s[0:1], s[14:15] 6597; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 6598; GCN-NEXT: v_mul_lo_u32 v2, s10, v1 6599; GCN-NEXT: v_mul_hi_u32 v3, s10, v0 6600; GCN-NEXT: v_mul_hi_u32 v5, s10, v1 6601; GCN-NEXT: v_mul_hi_u32 v7, s11, v1 6602; GCN-NEXT: v_mul_lo_u32 v1, s11, v1 6603; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 6604; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc 6605; GCN-NEXT: v_mul_lo_u32 v5, s11, v0 6606; GCN-NEXT: v_mul_hi_u32 v0, s11, v0 6607; GCN-NEXT: v_add_i32_e32 v2, vcc, v5, v2 6608; GCN-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc 6609; GCN-NEXT: v_addc_u32_e32 v2, vcc, v7, v4, vcc 6610; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 6611; GCN-NEXT: v_addc_u32_e32 v1, vcc, v6, v2, vcc 6612; GCN-NEXT: v_mul_lo_u32 v1, s12, v1 6613; GCN-NEXT: v_mul_hi_u32 v2, s12, v0 6614; GCN-NEXT: v_mul_lo_u32 v3, s13, v0 6615; GCN-NEXT: v_mul_lo_u32 v0, s12, v0 6616; GCN-NEXT: v_add_i32_e32 v1, vcc, v2, v1 6617; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v3 6618; GCN-NEXT: v_sub_i32_e32 v2, vcc, s11, v1 
; GCN-NEXT: v_mov_b32_e32 v3, s13
; GCN-NEXT: v_sub_i32_e32 v0, vcc, s10, v0
; GCN-NEXT: v_subb_u32_e64 v2, s[0:1], v2, v3, vcc
; GCN-NEXT: v_subrev_i32_e64 v4, s[0:1], s12, v0
; GCN-NEXT: v_subbrev_u32_e64 v5, s[2:3], 0, v2, s[0:1]
; GCN-NEXT: v_cmp_le_u32_e64 s[2:3], s13, v5
; GCN-NEXT: v_subb_u32_e64 v2, s[0:1], v2, v3, s[0:1]
; GCN-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[2:3]
; GCN-NEXT: v_cmp_le_u32_e64 s[2:3], s12, v4
; GCN-NEXT: v_subrev_i32_e64 v3, s[0:1], s12, v4
; GCN-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[2:3]
; GCN-NEXT: v_cmp_eq_u32_e64 s[2:3], s13, v5
; GCN-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[2:3]
; GCN-NEXT: v_subbrev_u32_e64 v2, s[0:1], 0, v2, s[0:1]
; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v6
; GCN-NEXT: v_cndmask_b32_e64 v2, v5, v2, s[0:1]
; GCN-NEXT: v_mov_b32_e32 v5, s11
; GCN-NEXT: v_subb_u32_e32 v1, vcc, v5, v1, vcc
; GCN-NEXT: v_cmp_le_u32_e32 vcc, s13, v1
; GCN-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc
; GCN-NEXT: v_cmp_le_u32_e32 vcc, s12, v0
; GCN-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, s13, v1
; GCN-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc
; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5
; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
; GCN-NEXT: v_cndmask_b32_e64 v2, v4, v3, s[0:1]
; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GCN-NEXT: v_xor_b32_e32 v0, s14, v0
; GCN-NEXT: v_xor_b32_e32 v1, s14, v1
; GCN-NEXT: v_mov_b32_e32 v2, s14
; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s14, v0
; GCN-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc
; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GCN-NEXT: s_endpgm
  %shl.y = shl i64 4096, %y
  %r = srem i64 %x, %shl.y
  store i64 %r, i64 addrspace(1)* %out
  ret void
}

; Test: <2 x i64> srem by the constant splat divisor <i64 4096, i64 4096>.
;
; CHECK lines (the `opt -amdgpu-codegenprepare` RUN line) expect the vector
; srem to be split into one `srem i64 ..., 4096` per element, with the result
; vector rebuilt through insertelement before the store.
;
; GCN lines (the `llc` RUN line) expect each element to be computed with scalar
; s_ashr/s_lshr/s_add/s_addc/s_and/s_sub/s_subb instructions only; no v_rcp or
; multiply chain appears in the expected output for this function.
;
; The assertions below are autogenerated (see the NOTE lines at the top of the
; file); regenerate them with the update scripts instead of editing by hand.
define amdgpu_kernel void @srem_v2i64_pow2k_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x) {
; CHECK-LABEL: @srem_v2i64_pow2k_denom(
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0
; CHECK-NEXT: [[TMP2:%.*]] = srem i64 [[TMP1]], 4096
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i64 0
; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1
; CHECK-NEXT: [[TMP5:%.*]] = srem i64 [[TMP4]], 4096
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1
; CHECK-NEXT: store <2 x i64> [[TMP6]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16
; CHECK-NEXT: ret void
;
; GCN-LABEL: srem_v2i64_pow2k_denom:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd
; GCN-NEXT: s_movk_i32 s8, 0xf000
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_ashr_i32 s9, s1, 31
; GCN-NEXT: s_lshr_b32 s9, s9, 20
; GCN-NEXT: s_add_u32 s9, s0, s9
; GCN-NEXT: s_addc_u32 s10, s1, 0
; GCN-NEXT: s_and_b32 s9, s9, s8
; GCN-NEXT: s_sub_u32 s0, s0, s9
; GCN-NEXT: s_subb_u32 s1, s1, s10
; GCN-NEXT: s_ashr_i32 s9, s3, 31
; GCN-NEXT: s_lshr_b32 s9, s9, 20
; GCN-NEXT: s_add_u32 s9, s2, s9
; GCN-NEXT: s_addc_u32 s10, s3, 0
; GCN-NEXT: s_and_b32 s8, s9, s8
; GCN-NEXT: s_sub_u32 s2, s2, s8
; GCN-NEXT: s_subb_u32 s3, s3, s10
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: v_mov_b32_e32 v2, s2
; GCN-NEXT: v_mov_b32_e32 v3, s3
; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; GCN-NEXT: s_endpgm
  %r = srem <2 x i64> %x, <i64 4096, i64 4096>
  store <2 x i64> %r, <2 x i64> addrspace(1)* %out
  ret void
}

; Test: <2 x i64> srem whose divisor is <i64 4096, i64 4096> shifted left per
; lane by the kernel argument %y, so the divisor is not a compile-time constant.
;
; CHECK lines (the `opt -amdgpu-codegenprepare` RUN line) expect the shl to stay
; a vector operation while the srem is split into one scalar `srem i64` per
; element (extractelement of %x and of the shifted divisor, then insertelement).
;
; GCN lines (the `llc` RUN line) expect, for each of the two elements, a long
; expansion that includes v_cvt_f32_u32/v_rcp_f32, v_mul_hi_u32/v_mul_lo_u32
; chains with add/addc carries, and v_cmp/v_cndmask selects, followed by
; xor/sub with the s_ashr ..., 31 value of the dividend's high word and a
; single buffer_store_dwordx4 of both results.
;
; The assertions below are autogenerated (see the NOTE lines at the top of the
; file); regenerate them with the update scripts instead of editing by hand.
define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x, <2 x i64> %y) {
; CHECK-LABEL: @srem_v2i64_pow2_shl_denom(
; CHECK-NEXT: [[SHL_Y:%.*]] = shl <2 x i64> <i64 4096, i64 4096>, [[Y:%.*]]
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0
; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 0
; CHECK-NEXT: [[TMP3:%.*]] = srem i64 [[TMP1]], [[TMP2]]
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> undef, i64 [[TMP3]], i64 0
; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[X]], i64 1
; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 1
; CHECK-NEXT: [[TMP7:%.*]] = srem i64 [[TMP5]], [[TMP6]]
; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i64> [[TMP4]], i64 [[TMP7]], i64 1
; CHECK-NEXT: store <2 x i64> [[TMP8]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16
; CHECK-NEXT: ret void
;
; GCN-LABEL: srem_v2i64_pow2_shl_denom:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x11
; GCN-NEXT: s_mov_b32 s3, 0
; GCN-NEXT: s_movk_i32 s2, 0x1000
; GCN-NEXT: s_mov_b32 s18, 0x4f800000
; GCN-NEXT: s_mov_b32 s19, 0x5f7ffffc
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_lshl_b64 s[14:15], s[2:3], s6
; GCN-NEXT: s_lshl_b64 s[2:3], s[2:3], s4
; GCN-NEXT: s_ashr_i32 s4, s3, 31
; GCN-NEXT: s_add_u32 s2, s2, s4
; GCN-NEXT: s_mov_b32 s5, s4
; GCN-NEXT: s_addc_u32 s3, s3, s4
; GCN-NEXT: s_xor_b64 s[16:17], s[2:3], s[4:5]
; GCN-NEXT: v_cvt_f32_u32_e32 v0, s16
; GCN-NEXT: v_cvt_f32_u32_e32 v1, s17
; GCN-NEXT: s_mov_b32 s20, 0x2f800000
; GCN-NEXT: s_mov_b32 s21, 0xcf800000
; GCN-NEXT: s_sub_u32 s6, 0, s16
; GCN-NEXT: v_mac_f32_e32 v0, s18, v1
; GCN-NEXT: v_rcp_f32_e32 v0, v0
; GCN-NEXT: s_subb_u32 s7, 0, s17
; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; GCN-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd
; GCN-NEXT: v_mul_f32_e32 v0, s19, v0
; GCN-NEXT: v_mul_f32_e32 v1, s20, v0
; GCN-NEXT: v_trunc_f32_e32 v1, v1
; GCN-NEXT: v_mac_f32_e32 v0, s21, v1
; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0
; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_ashr_i32 s12, s9, 31
; GCN-NEXT: s_add_u32 s0, s8, s12
; GCN-NEXT: v_mul_hi_u32 v3, s6, v0
; GCN-NEXT: v_mul_lo_u32 v2, s6, v1
; GCN-NEXT: v_mul_lo_u32 v4, s7, v0
; GCN-NEXT: v_mul_lo_u32 v5, s6, v0
; GCN-NEXT: s_mov_b32 s13, s12
; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2
; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4
; GCN-NEXT: v_mul_lo_u32 v3, v0, v2
; GCN-NEXT: v_mul_hi_u32 v4, v0, v5
; GCN-NEXT: v_mul_hi_u32 v6, v0, v2
; GCN-NEXT: v_mul_hi_u32 v7, v1, v2
; GCN-NEXT: v_mul_lo_u32 v2, v1, v2
; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3
; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v6, vcc
; GCN-NEXT: v_mul_lo_u32 v6, v1, v5
; GCN-NEXT: v_mul_hi_u32 v5, v1, v5
; GCN-NEXT: s_addc_u32 s1, s9, s12
; GCN-NEXT: s_xor_b64 s[8:9], s[0:1], s[12:13]
; GCN-NEXT: v_add_i32_e32 v3, vcc, v6, v3
; GCN-NEXT: v_addc_u32_e32 v3, vcc, v4, v5, vcc
; GCN-NEXT: v_mov_b32_e32 v4, 0
; GCN-NEXT: v_addc_u32_e32 v5, vcc, v7, v4, vcc
; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2
; GCN-NEXT: v_mov_b32_e32 v6, 0
; GCN-NEXT: v_add_i32_e64 v0, s[2:3], v0, v2
; GCN-NEXT: v_addc_u32_e32 v3, vcc, v6, v5, vcc
; GCN-NEXT: v_addc_u32_e64 v2, vcc, v1, v3, s[2:3]
; GCN-NEXT: v_mul_lo_u32 v5, s6, v2
; GCN-NEXT: v_mul_hi_u32 v7, s6, v0
; GCN-NEXT: v_mul_lo_u32 v8, s7, v0
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: v_add_i32_e32 v5, vcc, v7, v5
; GCN-NEXT: v_mul_lo_u32 v7, s6, v0
; GCN-NEXT: v_add_i32_e32 v5, vcc, v8, v5
; GCN-NEXT: v_mul_lo_u32 v10, v0, v5
; GCN-NEXT: v_mul_hi_u32 v12, v0, v5
; GCN-NEXT: v_mul_hi_u32 v11, v0, v7
; GCN-NEXT: v_mul_hi_u32 v9, v2, v7
; GCN-NEXT: v_mul_lo_u32 v7, v2, v7
; GCN-NEXT: v_mul_hi_u32 v8, v2, v5
; GCN-NEXT: v_add_i32_e32 v10, vcc, v11, v10
; GCN-NEXT: v_addc_u32_e32 v11, vcc, 0, v12, vcc
; GCN-NEXT: v_mul_lo_u32 v2, v2, v5
; GCN-NEXT: v_add_i32_e32 v7, vcc, v10, v7
; GCN-NEXT: v_addc_u32_e32 v7, vcc, v11, v9, vcc
; GCN-NEXT: v_addc_u32_e32 v5, vcc, v8, v4, vcc
; GCN-NEXT: v_add_i32_e32 v2, vcc, v7, v2
; GCN-NEXT: v_addc_u32_e32 v5, vcc, v6, v5, vcc
; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v3
; GCN-NEXT: v_addc_u32_e64 v1, vcc, v1, v5, s[2:3]
; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2
; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GCN-NEXT: v_mul_lo_u32 v2, s8, v1
; GCN-NEXT: v_mul_hi_u32 v3, s8, v0
; GCN-NEXT: v_mul_hi_u32 v5, s8, v1
; GCN-NEXT: v_mul_hi_u32 v7, s9, v1
; GCN-NEXT: v_mul_lo_u32 v1, s9, v1
; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2
; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc
; GCN-NEXT: v_mul_lo_u32 v5, s9, v0
; GCN-NEXT: v_mul_hi_u32 v0, s9, v0
; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: v_add_i32_e32 v2, vcc, v5, v2
; GCN-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc
; GCN-NEXT: v_addc_u32_e32 v2, vcc, v7, v4, vcc
; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1
; GCN-NEXT: v_addc_u32_e32 v1, vcc, v6, v2, vcc
; GCN-NEXT: v_mul_lo_u32 v1, s16, v1
; GCN-NEXT: v_mul_hi_u32 v2, s16, v0
; GCN-NEXT: v_mul_lo_u32 v3, s17, v0
; GCN-NEXT: v_mul_lo_u32 v0, s16, v0
; GCN-NEXT: v_add_i32_e32 v1, vcc, v2, v1
; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v3
; GCN-NEXT: v_sub_i32_e32 v2, vcc, s9, v1
; GCN-NEXT: v_mov_b32_e32 v3, s17
; GCN-NEXT: v_sub_i32_e32 v0, vcc, s8, v0
; GCN-NEXT: v_subb_u32_e64 v2, s[0:1], v2, v3, vcc
; GCN-NEXT: v_subrev_i32_e64 v5, s[0:1], s16, v0
; GCN-NEXT: v_subbrev_u32_e64 v7, s[2:3], 0, v2, s[0:1]
; GCN-NEXT: v_cmp_le_u32_e64 s[2:3], s17, v7
; GCN-NEXT: v_subb_u32_e64 v2, s[0:1], v2, v3, s[0:1]
; GCN-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[2:3]
; GCN-NEXT: v_cmp_le_u32_e64 s[2:3], s16, v5
; GCN-NEXT: v_subrev_i32_e64 v3, s[0:1], s16, v5
; GCN-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[2:3]
; GCN-NEXT: v_cmp_eq_u32_e64 s[2:3], s17, v7
; GCN-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[2:3]
; GCN-NEXT: s_ashr_i32 s2, s15, 31
; GCN-NEXT: v_subbrev_u32_e64 v2, s[0:1], 0, v2, s[0:1]
; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v8
; GCN-NEXT: s_add_u32 s8, s14, s2
; GCN-NEXT: v_cndmask_b32_e64 v2, v7, v2, s[0:1]
; GCN-NEXT: v_mov_b32_e32 v7, s9
; GCN-NEXT: s_mov_b32 s3, s2
; GCN-NEXT: s_addc_u32 s9, s15, s2
; GCN-NEXT: s_xor_b64 s[8:9], s[8:9], s[2:3]
; GCN-NEXT: v_cvt_f32_u32_e32 v8, s8
; GCN-NEXT: v_cvt_f32_u32_e32 v9, s9
; GCN-NEXT: v_subb_u32_e32 v1, vcc, v7, v1, vcc
; GCN-NEXT: v_cmp_le_u32_e32 vcc, s17, v1
; GCN-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc
; GCN-NEXT: v_mac_f32_e32 v8, s18, v9
; GCN-NEXT: v_cmp_le_u32_e32 vcc, s16, v0
; GCN-NEXT: v_rcp_f32_e32 v8, v8
; GCN-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, s17, v1
; GCN-NEXT: v_cndmask_b32_e32 v7, v7, v10, vcc
; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7
; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
; GCN-NEXT: v_cndmask_b32_e64 v2, v5, v3, s[0:1]
; GCN-NEXT: v_mul_f32_e32 v3, s19, v8
; GCN-NEXT: v_mul_f32_e32 v5, s20, v3
; GCN-NEXT: v_trunc_f32_e32 v5, v5
; GCN-NEXT: v_mac_f32_e32 v3, s21, v5
; GCN-NEXT: v_cvt_u32_f32_e32 v3, v3
; GCN-NEXT: v_cvt_u32_f32_e32 v5, v5
; GCN-NEXT: s_sub_u32 s2, 0, s8
; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GCN-NEXT: v_mul_hi_u32 v2, s2, v3
; GCN-NEXT: v_mul_lo_u32 v7, s2, v5
; GCN-NEXT: s_subb_u32 s3, 0, s9
; GCN-NEXT: v_mul_lo_u32 v8, s3, v3
; GCN-NEXT: s_ashr_i32 s14, s11, 31
; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v7
; GCN-NEXT: v_mul_lo_u32 v7, s2, v3
; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v8
; GCN-NEXT: v_mul_lo_u32 v8, v3, v2
; GCN-NEXT: v_mul_hi_u32 v10, v3, v2
; GCN-NEXT: v_mul_hi_u32 v9, v3, v7
; GCN-NEXT: v_mul_hi_u32 v11, v5, v2
; GCN-NEXT: v_mul_lo_u32 v2, v5, v2
; GCN-NEXT: s_mov_b32 s15, s14
; GCN-NEXT: v_add_i32_e32 v8, vcc, v9, v8
; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v10, vcc
; GCN-NEXT: v_mul_lo_u32 v10, v5, v7
; GCN-NEXT: v_mul_hi_u32 v7, v5, v7
; GCN-NEXT: v_xor_b32_e32 v0, s12, v0
; GCN-NEXT: v_xor_b32_e32 v1, s12, v1
; GCN-NEXT: v_add_i32_e32 v8, vcc, v10, v8
; GCN-NEXT: v_addc_u32_e32 v7, vcc, v9, v7, vcc
; GCN-NEXT: v_addc_u32_e32 v8, vcc, v11, v4, vcc
; GCN-NEXT: v_add_i32_e32 v2, vcc, v7, v2
; GCN-NEXT: v_add_i32_e64 v2, s[0:1], v3, v2
; GCN-NEXT: v_addc_u32_e32 v7, vcc, v6, v8, vcc
; GCN-NEXT: v_addc_u32_e64 v3, vcc, v5, v7, s[0:1]
; GCN-NEXT: v_mul_lo_u32 v8, s2, v3
; GCN-NEXT: v_mul_hi_u32 v9, s2, v2
; GCN-NEXT: v_mul_lo_u32 v10, s3, v2
; GCN-NEXT: v_add_i32_e32 v8, vcc, v9, v8
; GCN-NEXT: v_mul_lo_u32 v9, s2, v2
; GCN-NEXT: v_add_i32_e32 v8, vcc, v10, v8
; GCN-NEXT: v_mul_lo_u32 v12, v2, v8
; GCN-NEXT: v_mul_hi_u32 v14, v2, v8
; GCN-NEXT: v_mul_hi_u32 v13, v2, v9
; GCN-NEXT: v_mul_hi_u32 v11, v3, v9
; GCN-NEXT: v_mul_lo_u32 v9, v3, v9
; GCN-NEXT: v_mul_hi_u32 v10, v3, v8
; GCN-NEXT: v_add_i32_e32 v12, vcc, v13, v12
; GCN-NEXT: v_addc_u32_e32 v13, vcc, 0, v14, vcc
; GCN-NEXT: v_mul_lo_u32 v3, v3, v8
; GCN-NEXT: v_add_i32_e32 v9, vcc, v12, v9
; GCN-NEXT: v_addc_u32_e32 v9, vcc, v13, v11, vcc
; GCN-NEXT: v_addc_u32_e32 v8, vcc, v10, v4, vcc
; GCN-NEXT: v_add_i32_e32 v3, vcc, v9, v3
; GCN-NEXT: v_addc_u32_e32 v8, vcc, v6, v8, vcc
; GCN-NEXT: v_add_i32_e32 v5, vcc, v5, v7
; GCN-NEXT: v_addc_u32_e64 v5, vcc, v5, v8, s[0:1]
; GCN-NEXT: s_add_u32 s0, s10, s14
; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3
; GCN-NEXT: s_addc_u32 s1, s11, s14
; GCN-NEXT: s_xor_b64 s[10:11], s[0:1], s[14:15]
; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc
; GCN-NEXT: v_mul_lo_u32 v5, s10, v3
; GCN-NEXT: v_mul_hi_u32 v7, s10, v2
; GCN-NEXT: v_mul_hi_u32 v9, s10, v3
; GCN-NEXT: v_mul_hi_u32 v10, s11, v3
; GCN-NEXT: v_mul_lo_u32 v3, s11, v3
; GCN-NEXT: v_add_i32_e32 v5, vcc, v7, v5
; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v9, vcc
; GCN-NEXT: v_mul_lo_u32 v9, s11, v2
; GCN-NEXT: v_mul_hi_u32 v2, s11, v2
; GCN-NEXT: v_mov_b32_e32 v8, s12
; GCN-NEXT: v_add_i32_e32 v5, vcc, v9, v5
; GCN-NEXT: v_addc_u32_e32 v2, vcc, v7, v2, vcc
; GCN-NEXT: v_addc_u32_e32 v4, vcc, v10, v4, vcc
; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3
; GCN-NEXT: v_addc_u32_e32 v3, vcc, v6, v4, vcc
; GCN-NEXT: v_mul_lo_u32 v3, s8, v3
; GCN-NEXT: v_mul_hi_u32 v4, s8, v2
; GCN-NEXT: v_mul_lo_u32 v5, s9, v2
; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s12, v0
; GCN-NEXT: v_mul_lo_u32 v2, s8, v2
; GCN-NEXT: v_subb_u32_e32 v1, vcc, v1, v8, vcc
; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3
; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5
; GCN-NEXT: v_sub_i32_e32 v4, vcc, s11, v3
; GCN-NEXT: v_mov_b32_e32 v5, s9
; GCN-NEXT: v_sub_i32_e32 v2, vcc, s10, v2
; GCN-NEXT: v_subb_u32_e64 v4, s[0:1], v4, v5, vcc
; GCN-NEXT: v_subrev_i32_e64 v6, s[0:1], s8, v2
; GCN-NEXT: v_subbrev_u32_e64 v7, s[2:3], 0, v4, s[0:1]
; GCN-NEXT: v_cmp_le_u32_e64 s[2:3], s9, v7
; GCN-NEXT: v_subb_u32_e64 v4, s[0:1], v4, v5, s[0:1]
; GCN-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[2:3]
; GCN-NEXT: v_cmp_le_u32_e64 s[2:3], s8, v6
; GCN-NEXT: v_subrev_i32_e64 v5, s[0:1], s8, v6
; GCN-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[2:3]
; GCN-NEXT: v_cmp_eq_u32_e64 s[2:3], s9, v7
; GCN-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[2:3]
; GCN-NEXT: v_subbrev_u32_e64 v4, s[0:1], 0, v4, s[0:1]
; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v8
; GCN-NEXT: v_cndmask_b32_e64 v4, v7, v4, s[0:1]
; GCN-NEXT: v_mov_b32_e32 v7, s11
; GCN-NEXT: v_subb_u32_e32 v3, vcc, v7, v3, vcc
; GCN-NEXT: v_cmp_le_u32_e32 vcc, s9, v3
; GCN-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc
; GCN-NEXT: v_cmp_le_u32_e32 vcc, s8, v2
; GCN-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, s9, v3
; GCN-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc
; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7
; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
; GCN-NEXT: v_cndmask_b32_e64 v4, v6, v5, s[0:1]
; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
; GCN-NEXT: v_xor_b32_e32 v2, s14, v2
; GCN-NEXT: v_xor_b32_e32 v3, s14, v3
; GCN-NEXT: v_mov_b32_e32 v4, s14
; GCN-NEXT: v_subrev_i32_e32 v2, vcc, s14, v2
; GCN-NEXT: v_subb_u32_e32 v3, vcc, v3, v4, vcc
; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; GCN-NEXT: s_endpgm
  %shl.y = shl <2 x i64> <i64 4096, i64 4096>, %y
  %r = srem <2 x i64> %x, %shl.y
  store <2 x i64> %r, <2 x i64> addrspace(1)* %out
  ret void
}