• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
2; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
3; RUN: opt -S -mtriple=amdgcn-- -mcpu=tahiti -amdgpu-codegenprepare -amdgpu-bypass-slow-div=0 %s | FileCheck %s
4; RUN: llc -mtriple=amdgcn-- -mcpu=tahiti -amdgpu-bypass-slow-div=0 < %s | FileCheck -check-prefix=GCN %s
5
; udiv_i32: a kernel that stores (udiv i32 %x, %y) to %out.
; The opt assertions below (default FileCheck prefix) expect the udiv to be
; replaced by straight-line IR:
;   - an estimate of the reciprocal of %y built from uitofp,
;     llvm.amdgcn.rcp.f32, an fmul by a float constant and fptoui;
;   - one adjustment of that estimate using the high 32 bits of a 64-bit
;     product (zext / mul / lshr 32 / trunc);
;   - quotient = high 32 bits of (%x * estimate), and
;     remainder = %x - quotient * %y;
;   - two rounds of "if remainder uge %y, then quotient + 1"; the first round
;     also subtracts %y from the remainder so the second compare sees the
;     updated value.
; The llc assertions (GCN prefix) pin the tahiti ISA emitted for the same
; function, ending in a buffer_store_dword of the quotient.
6define amdgpu_kernel void @udiv_i32(i32 addrspace(1)* %out, i32 %x, i32 %y) {
7; CHECK-LABEL: @udiv_i32(
8; CHECK-NEXT:    [[TMP1:%.*]] = uitofp i32 [[Y:%.*]] to float
9; CHECK-NEXT:    [[TMP2:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP1]])
10; CHECK-NEXT:    [[TMP3:%.*]] = fmul fast float [[TMP2]], 0x41EFFFFFC0000000
11; CHECK-NEXT:    [[TMP4:%.*]] = fptoui float [[TMP3]] to i32
12; CHECK-NEXT:    [[TMP5:%.*]] = sub i32 0, [[Y]]
13; CHECK-NEXT:    [[TMP6:%.*]] = mul i32 [[TMP5]], [[TMP4]]
14; CHECK-NEXT:    [[TMP7:%.*]] = zext i32 [[TMP4]] to i64
15; CHECK-NEXT:    [[TMP8:%.*]] = zext i32 [[TMP6]] to i64
16; CHECK-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP7]], [[TMP8]]
17; CHECK-NEXT:    [[TMP10:%.*]] = trunc i64 [[TMP9]] to i32
18; CHECK-NEXT:    [[TMP11:%.*]] = lshr i64 [[TMP9]], 32
19; CHECK-NEXT:    [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32
20; CHECK-NEXT:    [[TMP13:%.*]] = add i32 [[TMP4]], [[TMP12]]
21; CHECK-NEXT:    [[TMP14:%.*]] = zext i32 [[X:%.*]] to i64
22; CHECK-NEXT:    [[TMP15:%.*]] = zext i32 [[TMP13]] to i64
23; CHECK-NEXT:    [[TMP16:%.*]] = mul i64 [[TMP14]], [[TMP15]]
24; CHECK-NEXT:    [[TMP17:%.*]] = trunc i64 [[TMP16]] to i32
25; CHECK-NEXT:    [[TMP18:%.*]] = lshr i64 [[TMP16]], 32
26; CHECK-NEXT:    [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
27; CHECK-NEXT:    [[TMP20:%.*]] = mul i32 [[TMP19]], [[Y]]
28; CHECK-NEXT:    [[TMP21:%.*]] = sub i32 [[X]], [[TMP20]]
29; CHECK-NEXT:    [[TMP22:%.*]] = icmp uge i32 [[TMP21]], [[Y]]
30; CHECK-NEXT:    [[TMP23:%.*]] = add i32 [[TMP19]], 1
31; CHECK-NEXT:    [[TMP24:%.*]] = select i1 [[TMP22]], i32 [[TMP23]], i32 [[TMP19]]
32; CHECK-NEXT:    [[TMP25:%.*]] = sub i32 [[TMP21]], [[Y]]
33; CHECK-NEXT:    [[TMP26:%.*]] = select i1 [[TMP22]], i32 [[TMP25]], i32 [[TMP21]]
34; CHECK-NEXT:    [[TMP27:%.*]] = icmp uge i32 [[TMP26]], [[Y]]
35; CHECK-NEXT:    [[TMP28:%.*]] = add i32 [[TMP24]], 1
36; CHECK-NEXT:    [[TMP29:%.*]] = select i1 [[TMP27]], i32 [[TMP28]], i32 [[TMP24]]
37; CHECK-NEXT:    store i32 [[TMP29]], i32 addrspace(1)* [[OUT:%.*]], align 4
38; CHECK-NEXT:    ret void
39;
40; GCN-LABEL: udiv_i32:
41; GCN:       ; %bb.0:
42; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
43; GCN-NEXT:    s_mov_b32 s7, 0xf000
44; GCN-NEXT:    s_mov_b32 s6, -1
45; GCN-NEXT:    s_waitcnt lgkmcnt(0)
46; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s3
47; GCN-NEXT:    s_sub_i32 s4, 0, s3
48; GCN-NEXT:    v_rcp_iflag_f32_e32 v0, v0
49; GCN-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
50; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
51; GCN-NEXT:    v_mul_lo_u32 v1, s4, v0
52; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
53; GCN-NEXT:    v_mul_hi_u32 v1, v0, v1
54; GCN-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
55; GCN-NEXT:    v_mul_hi_u32 v0, s2, v0
56; GCN-NEXT:    v_mul_lo_u32 v1, v0, s3
57; GCN-NEXT:    v_add_i32_e32 v2, vcc, 1, v0
58; GCN-NEXT:    v_sub_i32_e32 v1, vcc, s2, v1
59; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s3, v1
60; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
61; GCN-NEXT:    v_subrev_i32_e32 v2, vcc, s3, v1
62; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, v2, s[0:1]
63; GCN-NEXT:    v_add_i32_e32 v2, vcc, 1, v0
64; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s3, v1
65; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
66; GCN-NEXT:    s_waitcnt lgkmcnt(0)
67; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
68; GCN-NEXT:    s_endpgm
69  %r = udiv i32 %x, %y
70  store i32 %r, i32 addrspace(1)* %out
71  ret void
72}
73
; urem_i32: a kernel that stores (urem i32 %x, %y) to %out.
; The opt assertions expect the same reciprocal-estimate sequence as the
; 32-bit unsigned division case (uitofp, llvm.amdgcn.rcp.f32, fmul, fptoui,
; one high-half-multiply adjustment), followed by
;   remainder = %x - (high 32 bits of %x * estimate) * %y
; and two rounds of "if remainder uge %y, then remainder - %y". Only the
; remainder is tracked here; no quotient increments are expected.
; The llc assertions (GCN prefix) pin the tahiti ISA for the same function.
74define amdgpu_kernel void @urem_i32(i32 addrspace(1)* %out, i32 %x, i32 %y) {
75; CHECK-LABEL: @urem_i32(
76; CHECK-NEXT:    [[TMP1:%.*]] = uitofp i32 [[Y:%.*]] to float
77; CHECK-NEXT:    [[TMP2:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP1]])
78; CHECK-NEXT:    [[TMP3:%.*]] = fmul fast float [[TMP2]], 0x41EFFFFFC0000000
79; CHECK-NEXT:    [[TMP4:%.*]] = fptoui float [[TMP3]] to i32
80; CHECK-NEXT:    [[TMP5:%.*]] = sub i32 0, [[Y]]
81; CHECK-NEXT:    [[TMP6:%.*]] = mul i32 [[TMP5]], [[TMP4]]
82; CHECK-NEXT:    [[TMP7:%.*]] = zext i32 [[TMP4]] to i64
83; CHECK-NEXT:    [[TMP8:%.*]] = zext i32 [[TMP6]] to i64
84; CHECK-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP7]], [[TMP8]]
85; CHECK-NEXT:    [[TMP10:%.*]] = trunc i64 [[TMP9]] to i32
86; CHECK-NEXT:    [[TMP11:%.*]] = lshr i64 [[TMP9]], 32
87; CHECK-NEXT:    [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32
88; CHECK-NEXT:    [[TMP13:%.*]] = add i32 [[TMP4]], [[TMP12]]
89; CHECK-NEXT:    [[TMP14:%.*]] = zext i32 [[X:%.*]] to i64
90; CHECK-NEXT:    [[TMP15:%.*]] = zext i32 [[TMP13]] to i64
91; CHECK-NEXT:    [[TMP16:%.*]] = mul i64 [[TMP14]], [[TMP15]]
92; CHECK-NEXT:    [[TMP17:%.*]] = trunc i64 [[TMP16]] to i32
93; CHECK-NEXT:    [[TMP18:%.*]] = lshr i64 [[TMP16]], 32
94; CHECK-NEXT:    [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
95; CHECK-NEXT:    [[TMP20:%.*]] = mul i32 [[TMP19]], [[Y]]
96; CHECK-NEXT:    [[TMP21:%.*]] = sub i32 [[X]], [[TMP20]]
97; CHECK-NEXT:    [[TMP22:%.*]] = icmp uge i32 [[TMP21]], [[Y]]
98; CHECK-NEXT:    [[TMP23:%.*]] = sub i32 [[TMP21]], [[Y]]
99; CHECK-NEXT:    [[TMP24:%.*]] = select i1 [[TMP22]], i32 [[TMP23]], i32 [[TMP21]]
100; CHECK-NEXT:    [[TMP25:%.*]] = icmp uge i32 [[TMP24]], [[Y]]
101; CHECK-NEXT:    [[TMP26:%.*]] = sub i32 [[TMP24]], [[Y]]
102; CHECK-NEXT:    [[TMP27:%.*]] = select i1 [[TMP25]], i32 [[TMP26]], i32 [[TMP24]]
103; CHECK-NEXT:    store i32 [[TMP27]], i32 addrspace(1)* [[OUT:%.*]], align 4
104; CHECK-NEXT:    ret void
105;
106; GCN-LABEL: urem_i32:
107; GCN:       ; %bb.0:
108; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
109; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
110; GCN-NEXT:    s_mov_b32 s3, 0xf000
111; GCN-NEXT:    s_waitcnt lgkmcnt(0)
112; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s5
113; GCN-NEXT:    s_sub_i32 s2, 0, s5
114; GCN-NEXT:    v_rcp_iflag_f32_e32 v0, v0
115; GCN-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
116; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
117; GCN-NEXT:    v_mul_lo_u32 v1, s2, v0
118; GCN-NEXT:    s_mov_b32 s2, -1
119; GCN-NEXT:    v_mul_hi_u32 v1, v0, v1
120; GCN-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
121; GCN-NEXT:    v_mul_hi_u32 v0, s4, v0
122; GCN-NEXT:    v_mul_lo_u32 v0, v0, s5
123; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
124; GCN-NEXT:    v_subrev_i32_e32 v1, vcc, s5, v0
125; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s5, v0
126; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
127; GCN-NEXT:    v_subrev_i32_e32 v1, vcc, s5, v0
128; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s5, v0
129; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
130; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
131; GCN-NEXT:    s_endpgm
132  %r = urem i32 %x, %y
133  store i32 %r, i32 addrspace(1)* %out
134  ret void
135}
136
; sdiv_i32: a kernel that stores (sdiv i32 %x, %y) to %out.
; The opt assertions expect the signed division to be wrapped around the
; unsigned expansion:
;   - TMP1 / TMP2 are the sign masks of %x / %y (ashr by 31), and TMP3 is
;     their xor, which is later used to sign the quotient;
;   - each operand is turned into its magnitude with (v + mask) xor mask;
;   - the magnitudes go through the reciprocal-estimate sequence (uitofp,
;     llvm.amdgcn.rcp.f32, fmul, fptoui, high-half-multiply adjustment) and
;     two rounds of conditional quotient/remainder correction;
;   - the unsigned quotient is re-signed with (q xor TMP3) - TMP3.
; The llc assertions (GCN prefix) pin the tahiti ISA; the sign handling
; there is done with scalar s_ashr / s_add / s_xor instructions.
137define amdgpu_kernel void @sdiv_i32(i32 addrspace(1)* %out, i32 %x, i32 %y) {
138; CHECK-LABEL: @sdiv_i32(
139; CHECK-NEXT:    [[TMP1:%.*]] = ashr i32 [[X:%.*]], 31
140; CHECK-NEXT:    [[TMP2:%.*]] = ashr i32 [[Y:%.*]], 31
141; CHECK-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
142; CHECK-NEXT:    [[TMP4:%.*]] = add i32 [[X]], [[TMP1]]
143; CHECK-NEXT:    [[TMP5:%.*]] = add i32 [[Y]], [[TMP2]]
144; CHECK-NEXT:    [[TMP6:%.*]] = xor i32 [[TMP4]], [[TMP1]]
145; CHECK-NEXT:    [[TMP7:%.*]] = xor i32 [[TMP5]], [[TMP2]]
146; CHECK-NEXT:    [[TMP8:%.*]] = uitofp i32 [[TMP7]] to float
147; CHECK-NEXT:    [[TMP9:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP8]])
148; CHECK-NEXT:    [[TMP10:%.*]] = fmul fast float [[TMP9]], 0x41EFFFFFC0000000
149; CHECK-NEXT:    [[TMP11:%.*]] = fptoui float [[TMP10]] to i32
150; CHECK-NEXT:    [[TMP12:%.*]] = sub i32 0, [[TMP7]]
151; CHECK-NEXT:    [[TMP13:%.*]] = mul i32 [[TMP12]], [[TMP11]]
152; CHECK-NEXT:    [[TMP14:%.*]] = zext i32 [[TMP11]] to i64
153; CHECK-NEXT:    [[TMP15:%.*]] = zext i32 [[TMP13]] to i64
154; CHECK-NEXT:    [[TMP16:%.*]] = mul i64 [[TMP14]], [[TMP15]]
155; CHECK-NEXT:    [[TMP17:%.*]] = trunc i64 [[TMP16]] to i32
156; CHECK-NEXT:    [[TMP18:%.*]] = lshr i64 [[TMP16]], 32
157; CHECK-NEXT:    [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
158; CHECK-NEXT:    [[TMP20:%.*]] = add i32 [[TMP11]], [[TMP19]]
159; CHECK-NEXT:    [[TMP21:%.*]] = zext i32 [[TMP6]] to i64
160; CHECK-NEXT:    [[TMP22:%.*]] = zext i32 [[TMP20]] to i64
161; CHECK-NEXT:    [[TMP23:%.*]] = mul i64 [[TMP21]], [[TMP22]]
162; CHECK-NEXT:    [[TMP24:%.*]] = trunc i64 [[TMP23]] to i32
163; CHECK-NEXT:    [[TMP25:%.*]] = lshr i64 [[TMP23]], 32
164; CHECK-NEXT:    [[TMP26:%.*]] = trunc i64 [[TMP25]] to i32
165; CHECK-NEXT:    [[TMP27:%.*]] = mul i32 [[TMP26]], [[TMP7]]
166; CHECK-NEXT:    [[TMP28:%.*]] = sub i32 [[TMP6]], [[TMP27]]
167; CHECK-NEXT:    [[TMP29:%.*]] = icmp uge i32 [[TMP28]], [[TMP7]]
168; CHECK-NEXT:    [[TMP30:%.*]] = add i32 [[TMP26]], 1
169; CHECK-NEXT:    [[TMP31:%.*]] = select i1 [[TMP29]], i32 [[TMP30]], i32 [[TMP26]]
170; CHECK-NEXT:    [[TMP32:%.*]] = sub i32 [[TMP28]], [[TMP7]]
171; CHECK-NEXT:    [[TMP33:%.*]] = select i1 [[TMP29]], i32 [[TMP32]], i32 [[TMP28]]
172; CHECK-NEXT:    [[TMP34:%.*]] = icmp uge i32 [[TMP33]], [[TMP7]]
173; CHECK-NEXT:    [[TMP35:%.*]] = add i32 [[TMP31]], 1
174; CHECK-NEXT:    [[TMP36:%.*]] = select i1 [[TMP34]], i32 [[TMP35]], i32 [[TMP31]]
175; CHECK-NEXT:    [[TMP37:%.*]] = xor i32 [[TMP36]], [[TMP3]]
176; CHECK-NEXT:    [[TMP38:%.*]] = sub i32 [[TMP37]], [[TMP3]]
177; CHECK-NEXT:    store i32 [[TMP38]], i32 addrspace(1)* [[OUT:%.*]], align 4
178; CHECK-NEXT:    ret void
179;
180; GCN-LABEL: sdiv_i32:
181; GCN:       ; %bb.0:
182; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
183; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
184; GCN-NEXT:    s_mov_b32 s7, 0xf000
185; GCN-NEXT:    s_mov_b32 s6, -1
186; GCN-NEXT:    s_waitcnt lgkmcnt(0)
187; GCN-NEXT:    s_ashr_i32 s8, s3, 31
188; GCN-NEXT:    s_add_i32 s3, s3, s8
189; GCN-NEXT:    s_xor_b32 s9, s3, s8
190; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s9
191; GCN-NEXT:    s_sub_i32 s3, 0, s9
192; GCN-NEXT:    s_ashr_i32 s0, s2, 31
193; GCN-NEXT:    s_add_i32 s1, s2, s0
194; GCN-NEXT:    v_rcp_iflag_f32_e32 v0, v0
195; GCN-NEXT:    s_xor_b32 s1, s1, s0
196; GCN-NEXT:    s_xor_b32 s2, s0, s8
197; GCN-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
198; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
199; GCN-NEXT:    v_mul_lo_u32 v1, s3, v0
200; GCN-NEXT:    v_mul_hi_u32 v1, v0, v1
201; GCN-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
202; GCN-NEXT:    v_mul_hi_u32 v0, s1, v0
203; GCN-NEXT:    v_mul_lo_u32 v1, v0, s9
204; GCN-NEXT:    v_add_i32_e32 v2, vcc, 1, v0
205; GCN-NEXT:    v_sub_i32_e32 v1, vcc, s1, v1
206; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s9, v1
207; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
208; GCN-NEXT:    v_subrev_i32_e32 v2, vcc, s9, v1
209; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, v2, s[0:1]
210; GCN-NEXT:    v_add_i32_e32 v2, vcc, 1, v0
211; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s9, v1
212; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
213; GCN-NEXT:    v_xor_b32_e32 v0, s2, v0
214; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, s2, v0
215; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
216; GCN-NEXT:    s_endpgm
217  %r = sdiv i32 %x, %y
218  store i32 %r, i32 addrspace(1)* %out
219  ret void
220}
221
; srem_i32: a kernel that stores (srem i32 %x, %y) to %out.
; The opt assertions expect the signed remainder to be wrapped around the
; unsigned expansion:
;   - TMP1 / TMP2 are the sign masks of %x / %y (ashr by 31);
;   - each operand is turned into its magnitude with (v + mask) xor mask;
;   - the magnitudes go through the reciprocal-estimate sequence (uitofp,
;     llvm.amdgcn.rcp.f32, fmul, fptoui, high-half-multiply adjustment) and
;     two rounds of "if remainder uge divisor, then remainder - divisor";
;   - the unsigned remainder is re-signed with (r xor TMP1) - TMP1, i.e.
;     using the sign mask of the dividend %x only (no xor of both masks is
;     expected here, unlike the signed division case).
; The llc assertions (GCN prefix) pin the tahiti ISA for the same function.
222define amdgpu_kernel void @srem_i32(i32 addrspace(1)* %out, i32 %x, i32 %y) {
223; CHECK-LABEL: @srem_i32(
224; CHECK-NEXT:    [[TMP1:%.*]] = ashr i32 [[X:%.*]], 31
225; CHECK-NEXT:    [[TMP2:%.*]] = ashr i32 [[Y:%.*]], 31
226; CHECK-NEXT:    [[TMP3:%.*]] = add i32 [[X]], [[TMP1]]
227; CHECK-NEXT:    [[TMP4:%.*]] = add i32 [[Y]], [[TMP2]]
228; CHECK-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP1]]
229; CHECK-NEXT:    [[TMP6:%.*]] = xor i32 [[TMP4]], [[TMP2]]
230; CHECK-NEXT:    [[TMP7:%.*]] = uitofp i32 [[TMP6]] to float
231; CHECK-NEXT:    [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]])
232; CHECK-NEXT:    [[TMP9:%.*]] = fmul fast float [[TMP8]], 0x41EFFFFFC0000000
233; CHECK-NEXT:    [[TMP10:%.*]] = fptoui float [[TMP9]] to i32
234; CHECK-NEXT:    [[TMP11:%.*]] = sub i32 0, [[TMP6]]
235; CHECK-NEXT:    [[TMP12:%.*]] = mul i32 [[TMP11]], [[TMP10]]
236; CHECK-NEXT:    [[TMP13:%.*]] = zext i32 [[TMP10]] to i64
237; CHECK-NEXT:    [[TMP14:%.*]] = zext i32 [[TMP12]] to i64
238; CHECK-NEXT:    [[TMP15:%.*]] = mul i64 [[TMP13]], [[TMP14]]
239; CHECK-NEXT:    [[TMP16:%.*]] = trunc i64 [[TMP15]] to i32
240; CHECK-NEXT:    [[TMP17:%.*]] = lshr i64 [[TMP15]], 32
241; CHECK-NEXT:    [[TMP18:%.*]] = trunc i64 [[TMP17]] to i32
242; CHECK-NEXT:    [[TMP19:%.*]] = add i32 [[TMP10]], [[TMP18]]
243; CHECK-NEXT:    [[TMP20:%.*]] = zext i32 [[TMP5]] to i64
244; CHECK-NEXT:    [[TMP21:%.*]] = zext i32 [[TMP19]] to i64
245; CHECK-NEXT:    [[TMP22:%.*]] = mul i64 [[TMP20]], [[TMP21]]
246; CHECK-NEXT:    [[TMP23:%.*]] = trunc i64 [[TMP22]] to i32
247; CHECK-NEXT:    [[TMP24:%.*]] = lshr i64 [[TMP22]], 32
248; CHECK-NEXT:    [[TMP25:%.*]] = trunc i64 [[TMP24]] to i32
249; CHECK-NEXT:    [[TMP26:%.*]] = mul i32 [[TMP25]], [[TMP6]]
250; CHECK-NEXT:    [[TMP27:%.*]] = sub i32 [[TMP5]], [[TMP26]]
251; CHECK-NEXT:    [[TMP28:%.*]] = icmp uge i32 [[TMP27]], [[TMP6]]
252; CHECK-NEXT:    [[TMP29:%.*]] = sub i32 [[TMP27]], [[TMP6]]
253; CHECK-NEXT:    [[TMP30:%.*]] = select i1 [[TMP28]], i32 [[TMP29]], i32 [[TMP27]]
254; CHECK-NEXT:    [[TMP31:%.*]] = icmp uge i32 [[TMP30]], [[TMP6]]
255; CHECK-NEXT:    [[TMP32:%.*]] = sub i32 [[TMP30]], [[TMP6]]
256; CHECK-NEXT:    [[TMP33:%.*]] = select i1 [[TMP31]], i32 [[TMP32]], i32 [[TMP30]]
257; CHECK-NEXT:    [[TMP34:%.*]] = xor i32 [[TMP33]], [[TMP1]]
258; CHECK-NEXT:    [[TMP35:%.*]] = sub i32 [[TMP34]], [[TMP1]]
259; CHECK-NEXT:    store i32 [[TMP35]], i32 addrspace(1)* [[OUT:%.*]], align 4
260; CHECK-NEXT:    ret void
261;
262; GCN-LABEL: srem_i32:
263; GCN:       ; %bb.0:
264; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
265; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
266; GCN-NEXT:    s_waitcnt lgkmcnt(0)
267; GCN-NEXT:    s_ashr_i32 s4, s3, 31
268; GCN-NEXT:    s_add_i32 s3, s3, s4
269; GCN-NEXT:    s_xor_b32 s6, s3, s4
270; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s6
271; GCN-NEXT:    s_sub_i32 s3, 0, s6
272; GCN-NEXT:    s_ashr_i32 s4, s2, 31
273; GCN-NEXT:    s_add_i32 s2, s2, s4
274; GCN-NEXT:    v_rcp_iflag_f32_e32 v0, v0
275; GCN-NEXT:    s_xor_b32 s5, s2, s4
276; GCN-NEXT:    s_mov_b32 s2, -1
277; GCN-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
278; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
279; GCN-NEXT:    v_mul_lo_u32 v1, s3, v0
280; GCN-NEXT:    s_mov_b32 s3, 0xf000
281; GCN-NEXT:    v_mul_hi_u32 v1, v0, v1
282; GCN-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
283; GCN-NEXT:    v_mul_hi_u32 v0, s5, v0
284; GCN-NEXT:    v_mul_lo_u32 v0, v0, s6
285; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s5, v0
286; GCN-NEXT:    v_subrev_i32_e32 v1, vcc, s6, v0
287; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s6, v0
288; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
289; GCN-NEXT:    v_subrev_i32_e32 v1, vcc, s6, v0
290; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s6, v0
291; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
292; GCN-NEXT:    v_xor_b32_e32 v0, s4, v0
293; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, s4, v0
294; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
295; GCN-NEXT:    s_endpgm
296  %r = srem i32 %x, %y
297  store i32 %r, i32 addrspace(1)* %out
298  ret void
299}
300
; udiv_i16: a kernel that stores (udiv i16 %x, %y) to %out.
; The opt assertions expect a float-based expansion rather than the integer
; multiply-high sequence expected for i32:
;   - both operands are zero-extended to i32 and converted with uitofp;
;   - q = llvm.trunc.f32(x * llvm.amdgcn.rcp.f32(y));
;   - r = llvm.amdgcn.fmad.ftz.f32(-q, y, x), the residual of that estimate;
;   - the integer quotient is fptoui(q), plus 1 when |r| oge |y|;
;   - the result is masked with 65535 and truncated back to i16.
; In the llc assertions (GCN prefix) both i16 arguments come from a single
; s_load_dword and are separated with s_lshr_b32 by 16 / s_and_b32 0xffff;
; the conditional +1 appears as a v_addc_u32 consuming the compare in vcc.
301define amdgpu_kernel void @udiv_i16(i16 addrspace(1)* %out, i16 %x, i16 %y) {
302; CHECK-LABEL: @udiv_i16(
303; CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[X:%.*]] to i32
304; CHECK-NEXT:    [[TMP2:%.*]] = zext i16 [[Y:%.*]] to i32
305; CHECK-NEXT:    [[TMP3:%.*]] = uitofp i32 [[TMP1]] to float
306; CHECK-NEXT:    [[TMP4:%.*]] = uitofp i32 [[TMP2]] to float
307; CHECK-NEXT:    [[TMP5:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP4]])
308; CHECK-NEXT:    [[TMP6:%.*]] = fmul fast float [[TMP3]], [[TMP5]]
309; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.trunc.f32(float [[TMP6]])
310; CHECK-NEXT:    [[TMP8:%.*]] = fneg fast float [[TMP7]]
311; CHECK-NEXT:    [[TMP9:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP8]], float [[TMP4]], float [[TMP3]])
312; CHECK-NEXT:    [[TMP10:%.*]] = fptoui float [[TMP7]] to i32
313; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]])
314; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.fabs.f32(float [[TMP4]])
315; CHECK-NEXT:    [[TMP13:%.*]] = fcmp fast oge float [[TMP11]], [[TMP12]]
316; CHECK-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 1, i32 0
317; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[TMP10]], [[TMP14]]
318; CHECK-NEXT:    [[TMP16:%.*]] = and i32 [[TMP15]], 65535
319; CHECK-NEXT:    [[TMP17:%.*]] = trunc i32 [[TMP16]] to i16
320; CHECK-NEXT:    store i16 [[TMP17]], i16 addrspace(1)* [[OUT:%.*]], align 2
321; CHECK-NEXT:    ret void
322;
323; GCN-LABEL: udiv_i16:
324; GCN:       ; %bb.0:
325; GCN-NEXT:    s_load_dword s2, s[0:1], 0xb
326; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
327; GCN-NEXT:    s_waitcnt lgkmcnt(0)
328; GCN-NEXT:    s_lshr_b32 s3, s2, 16
329; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s3
330; GCN-NEXT:    s_and_b32 s2, s2, 0xffff
331; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s2
332; GCN-NEXT:    s_mov_b32 s3, 0xf000
333; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v0
334; GCN-NEXT:    s_mov_b32 s2, -1
335; GCN-NEXT:    v_mul_f32_e32 v2, v1, v2
336; GCN-NEXT:    v_trunc_f32_e32 v2, v2
337; GCN-NEXT:    v_cvt_u32_f32_e32 v3, v2
338; GCN-NEXT:    v_mad_f32 v1, -v2, v0, v1
339; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
340; GCN-NEXT:    v_addc_u32_e32 v0, vcc, 0, v3, vcc
341; GCN-NEXT:    buffer_store_short v0, off, s[0:3], 0
342; GCN-NEXT:    s_endpgm
343  %r = udiv i16 %x, %y
344  store i16 %r, i16 addrspace(1)* %out
345  ret void
346}
347
; urem_i16: a kernel that stores (urem i16 %x, %y) to %out.
; The opt assertions expect the float-based quotient computation (zext to
; i32, uitofp, llvm.amdgcn.rcp.f32, fmul, llvm.trunc.f32, an
; llvm.amdgcn.fmad.ftz.f32 residual, fptoui, and a +1 when |residual| oge
; |y|), followed by the remainder step:
;   remainder = zext(x) - quotient * zext(y)
; which is then masked with 65535 and truncated back to i16.
; In the llc assertions (GCN prefix) the remainder step appears as a
; v_mul_lo_u32 followed by v_sub_i32 ahead of the buffer_store_short.
348define amdgpu_kernel void @urem_i16(i16 addrspace(1)* %out, i16 %x, i16 %y) {
349; CHECK-LABEL: @urem_i16(
350; CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[X:%.*]] to i32
351; CHECK-NEXT:    [[TMP2:%.*]] = zext i16 [[Y:%.*]] to i32
352; CHECK-NEXT:    [[TMP3:%.*]] = uitofp i32 [[TMP1]] to float
353; CHECK-NEXT:    [[TMP4:%.*]] = uitofp i32 [[TMP2]] to float
354; CHECK-NEXT:    [[TMP5:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP4]])
355; CHECK-NEXT:    [[TMP6:%.*]] = fmul fast float [[TMP3]], [[TMP5]]
356; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.trunc.f32(float [[TMP6]])
357; CHECK-NEXT:    [[TMP8:%.*]] = fneg fast float [[TMP7]]
358; CHECK-NEXT:    [[TMP9:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP8]], float [[TMP4]], float [[TMP3]])
359; CHECK-NEXT:    [[TMP10:%.*]] = fptoui float [[TMP7]] to i32
360; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]])
361; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.fabs.f32(float [[TMP4]])
362; CHECK-NEXT:    [[TMP13:%.*]] = fcmp fast oge float [[TMP11]], [[TMP12]]
363; CHECK-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 1, i32 0
364; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[TMP10]], [[TMP14]]
365; CHECK-NEXT:    [[TMP16:%.*]] = mul i32 [[TMP15]], [[TMP2]]
366; CHECK-NEXT:    [[TMP17:%.*]] = sub i32 [[TMP1]], [[TMP16]]
367; CHECK-NEXT:    [[TMP18:%.*]] = and i32 [[TMP17]], 65535
368; CHECK-NEXT:    [[TMP19:%.*]] = trunc i32 [[TMP18]] to i16
369; CHECK-NEXT:    store i16 [[TMP19]], i16 addrspace(1)* [[OUT:%.*]], align 2
370; CHECK-NEXT:    ret void
371;
372; GCN-LABEL: urem_i16:
373; GCN:       ; %bb.0:
374; GCN-NEXT:    s_load_dword s4, s[0:1], 0xb
375; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
376; GCN-NEXT:    s_waitcnt lgkmcnt(0)
377; GCN-NEXT:    s_lshr_b32 s2, s4, 16
378; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s2
379; GCN-NEXT:    s_and_b32 s3, s4, 0xffff
380; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s3
381; GCN-NEXT:    s_mov_b32 s3, 0xf000
382; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v0
383; GCN-NEXT:    v_mul_f32_e32 v2, v1, v2
384; GCN-NEXT:    v_trunc_f32_e32 v2, v2
385; GCN-NEXT:    v_cvt_u32_f32_e32 v3, v2
386; GCN-NEXT:    v_mad_f32 v1, -v2, v0, v1
387; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
388; GCN-NEXT:    v_addc_u32_e32 v0, vcc, 0, v3, vcc
389; GCN-NEXT:    v_mul_lo_u32 v0, v0, s2
390; GCN-NEXT:    s_mov_b32 s2, -1
391; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
392; GCN-NEXT:    buffer_store_short v0, off, s[0:3], 0
393; GCN-NEXT:    s_endpgm
394  %r = urem i16 %x, %y
395  store i16 %r, i16 addrspace(1)* %out
396  ret void
397}
398
; sdiv_i16: a kernel that stores (sdiv i16 %x, %y) to %out.
; The opt assertions expect a float-based expansion:
;   - both operands are sign-extended to i32;
;   - TMP5 = (ashr(x xor y, 30)) or 1 is the signed correction step: since
;     both inputs are sign-extended from i16, the shift yields 0 or -1, so
;     TMP5 is +1 when the signs agree and -1 when they differ;
;   - the operands are converted with sitofp, and
;     q = llvm.trunc.f32(x * llvm.amdgcn.rcp.f32(y));
;   - r = llvm.amdgcn.fmad.ftz.f32(-q, y, x) is the residual;
;   - the integer quotient is fptosi(q), plus TMP5 when |r| oge |y|;
;   - the result is sign-extended from bit 15 (shl 16 / ashr 16) and
;     truncated back to i16.
; In the llc assertions (GCN prefix) the two i16 arguments are unpacked from
; one dword with s_ashr_i32 by 16 and s_sext_i32_i16.
399define amdgpu_kernel void @sdiv_i16(i16 addrspace(1)* %out, i16 %x, i16 %y) {
400; CHECK-LABEL: @sdiv_i16(
401; CHECK-NEXT:    [[TMP1:%.*]] = sext i16 [[X:%.*]] to i32
402; CHECK-NEXT:    [[TMP2:%.*]] = sext i16 [[Y:%.*]] to i32
403; CHECK-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
404; CHECK-NEXT:    [[TMP4:%.*]] = ashr i32 [[TMP3]], 30
405; CHECK-NEXT:    [[TMP5:%.*]] = or i32 [[TMP4]], 1
406; CHECK-NEXT:    [[TMP6:%.*]] = sitofp i32 [[TMP1]] to float
407; CHECK-NEXT:    [[TMP7:%.*]] = sitofp i32 [[TMP2]] to float
408; CHECK-NEXT:    [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]])
409; CHECK-NEXT:    [[TMP9:%.*]] = fmul fast float [[TMP6]], [[TMP8]]
410; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.trunc.f32(float [[TMP9]])
411; CHECK-NEXT:    [[TMP11:%.*]] = fneg fast float [[TMP10]]
412; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP11]], float [[TMP7]], float [[TMP6]])
413; CHECK-NEXT:    [[TMP13:%.*]] = fptosi float [[TMP10]] to i32
414; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP12]])
415; CHECK-NEXT:    [[TMP15:%.*]] = call fast float @llvm.fabs.f32(float [[TMP7]])
416; CHECK-NEXT:    [[TMP16:%.*]] = fcmp fast oge float [[TMP14]], [[TMP15]]
417; CHECK-NEXT:    [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP5]], i32 0
418; CHECK-NEXT:    [[TMP18:%.*]] = add i32 [[TMP13]], [[TMP17]]
419; CHECK-NEXT:    [[TMP19:%.*]] = shl i32 [[TMP18]], 16
420; CHECK-NEXT:    [[TMP20:%.*]] = ashr i32 [[TMP19]], 16
421; CHECK-NEXT:    [[TMP21:%.*]] = trunc i32 [[TMP20]] to i16
422; CHECK-NEXT:    store i16 [[TMP21]], i16 addrspace(1)* [[OUT:%.*]], align 2
423; CHECK-NEXT:    ret void
424;
425; GCN-LABEL: sdiv_i16:
426; GCN:       ; %bb.0:
427; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
428; GCN-NEXT:    s_load_dword s0, s[0:1], 0xb
429; GCN-NEXT:    s_mov_b32 s7, 0xf000
430; GCN-NEXT:    s_mov_b32 s6, -1
431; GCN-NEXT:    s_waitcnt lgkmcnt(0)
432; GCN-NEXT:    s_ashr_i32 s1, s0, 16
433; GCN-NEXT:    v_cvt_f32_i32_e32 v0, s1
434; GCN-NEXT:    s_sext_i32_i16 s0, s0
435; GCN-NEXT:    v_cvt_f32_i32_e32 v1, s0
436; GCN-NEXT:    s_xor_b32 s0, s0, s1
437; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v0
438; GCN-NEXT:    s_ashr_i32 s0, s0, 30
439; GCN-NEXT:    s_or_b32 s0, s0, 1
440; GCN-NEXT:    v_mov_b32_e32 v3, s0
441; GCN-NEXT:    v_mul_f32_e32 v2, v1, v2
442; GCN-NEXT:    v_trunc_f32_e32 v2, v2
443; GCN-NEXT:    v_mad_f32 v1, -v2, v0, v1
444; GCN-NEXT:    v_cvt_i32_f32_e32 v2, v2
445; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
446; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
447; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
448; GCN-NEXT:    buffer_store_short v0, off, s[4:7], 0
449; GCN-NEXT:    s_endpgm
450  %r = sdiv i16 %x, %y
451  store i16 %r, i16 addrspace(1)* %out
452  ret void
453}
454
; srem_i16: a kernel that stores (srem i16 %x, %y) to %out.
; The opt assertions expect the float-based signed quotient computation
; (sext to i32, sitofp, llvm.amdgcn.rcp.f32, fmul, llvm.trunc.f32, an
; llvm.amdgcn.fmad.ftz.f32 residual, fptosi, and a conditional correction by
; TMP5 = (ashr(x xor y, 30)) or 1 when |residual| oge |y|), followed by the
; remainder step:
;   remainder = sext(x) - quotient * sext(y)
; which is then sign-extended from bit 15 (shl 16 / ashr 16) and truncated
; back to i16.
; In the llc assertions (GCN prefix) the remainder step appears as a
; v_mul_lo_u32 followed by v_sub_i32 ahead of the buffer_store_short.
455define amdgpu_kernel void @srem_i16(i16 addrspace(1)* %out, i16 %x, i16 %y) {
456; CHECK-LABEL: @srem_i16(
457; CHECK-NEXT:    [[TMP1:%.*]] = sext i16 [[X:%.*]] to i32
458; CHECK-NEXT:    [[TMP2:%.*]] = sext i16 [[Y:%.*]] to i32
459; CHECK-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
460; CHECK-NEXT:    [[TMP4:%.*]] = ashr i32 [[TMP3]], 30
461; CHECK-NEXT:    [[TMP5:%.*]] = or i32 [[TMP4]], 1
462; CHECK-NEXT:    [[TMP6:%.*]] = sitofp i32 [[TMP1]] to float
463; CHECK-NEXT:    [[TMP7:%.*]] = sitofp i32 [[TMP2]] to float
464; CHECK-NEXT:    [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]])
465; CHECK-NEXT:    [[TMP9:%.*]] = fmul fast float [[TMP6]], [[TMP8]]
466; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.trunc.f32(float [[TMP9]])
467; CHECK-NEXT:    [[TMP11:%.*]] = fneg fast float [[TMP10]]
468; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP11]], float [[TMP7]], float [[TMP6]])
469; CHECK-NEXT:    [[TMP13:%.*]] = fptosi float [[TMP10]] to i32
470; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP12]])
471; CHECK-NEXT:    [[TMP15:%.*]] = call fast float @llvm.fabs.f32(float [[TMP7]])
472; CHECK-NEXT:    [[TMP16:%.*]] = fcmp fast oge float [[TMP14]], [[TMP15]]
473; CHECK-NEXT:    [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP5]], i32 0
474; CHECK-NEXT:    [[TMP18:%.*]] = add i32 [[TMP13]], [[TMP17]]
475; CHECK-NEXT:    [[TMP19:%.*]] = mul i32 [[TMP18]], [[TMP2]]
476; CHECK-NEXT:    [[TMP20:%.*]] = sub i32 [[TMP1]], [[TMP19]]
477; CHECK-NEXT:    [[TMP21:%.*]] = shl i32 [[TMP20]], 16
478; CHECK-NEXT:    [[TMP22:%.*]] = ashr i32 [[TMP21]], 16
479; CHECK-NEXT:    [[TMP23:%.*]] = trunc i32 [[TMP22]] to i16
480; CHECK-NEXT:    store i16 [[TMP23]], i16 addrspace(1)* [[OUT:%.*]], align 2
481; CHECK-NEXT:    ret void
482;
483; GCN-LABEL: srem_i16:
484; GCN:       ; %bb.0:
485; GCN-NEXT:    s_load_dword s4, s[0:1], 0xb
486; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
487; GCN-NEXT:    s_waitcnt lgkmcnt(0)
488; GCN-NEXT:    s_ashr_i32 s2, s4, 16
489; GCN-NEXT:    v_cvt_f32_i32_e32 v0, s2
490; GCN-NEXT:    s_sext_i32_i16 s3, s4
491; GCN-NEXT:    v_cvt_f32_i32_e32 v1, s3
492; GCN-NEXT:    s_xor_b32 s3, s3, s2
493; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v0
494; GCN-NEXT:    s_ashr_i32 s3, s3, 30
495; GCN-NEXT:    s_or_b32 s3, s3, 1
496; GCN-NEXT:    v_mov_b32_e32 v3, s3
497; GCN-NEXT:    v_mul_f32_e32 v2, v1, v2
498; GCN-NEXT:    v_trunc_f32_e32 v2, v2
499; GCN-NEXT:    v_mad_f32 v1, -v2, v0, v1
500; GCN-NEXT:    v_cvt_i32_f32_e32 v2, v2
501; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
502; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
503; GCN-NEXT:    s_mov_b32 s3, 0xf000
504; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
505; GCN-NEXT:    v_mul_lo_u32 v0, v0, s2
506; GCN-NEXT:    s_mov_b32 s2, -1
507; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
508; GCN-NEXT:    buffer_store_short v0, off, s[0:3], 0
509; GCN-NEXT:    s_endpgm
510  %r = srem i16 %x, %y
511  store i16 %r, i16 addrspace(1)* %out
512  ret void
513}
514
; udiv_i8: a kernel that stores (udiv i8 %x, %y) to %out.
; The opt assertions expect the same float-based shape as the unsigned i16
; division case, at 8-bit width:
;   - both operands are zero-extended to i32 and converted with uitofp;
;   - q = llvm.trunc.f32(x * llvm.amdgcn.rcp.f32(y));
;   - r = llvm.amdgcn.fmad.ftz.f32(-q, y, x), the residual of that estimate;
;   - the integer quotient is fptoui(q), plus 1 when |r| oge |y|;
;   - the result is masked with 255, truncated to i8 and stored with align 1.
; In the llc assertions (GCN prefix) both i8 arguments come from a single
; s_load_dword and are converted straight to float with v_cvt_f32_ubyte0 /
; v_cvt_f32_ubyte1, with no separate shift or mask instruction expected.
515define amdgpu_kernel void @udiv_i8(i8 addrspace(1)* %out, i8 %x, i8 %y) {
516; CHECK-LABEL: @udiv_i8(
517; CHECK-NEXT:    [[TMP1:%.*]] = zext i8 [[X:%.*]] to i32
518; CHECK-NEXT:    [[TMP2:%.*]] = zext i8 [[Y:%.*]] to i32
519; CHECK-NEXT:    [[TMP3:%.*]] = uitofp i32 [[TMP1]] to float
520; CHECK-NEXT:    [[TMP4:%.*]] = uitofp i32 [[TMP2]] to float
521; CHECK-NEXT:    [[TMP5:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP4]])
522; CHECK-NEXT:    [[TMP6:%.*]] = fmul fast float [[TMP3]], [[TMP5]]
523; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.trunc.f32(float [[TMP6]])
524; CHECK-NEXT:    [[TMP8:%.*]] = fneg fast float [[TMP7]]
525; CHECK-NEXT:    [[TMP9:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP8]], float [[TMP4]], float [[TMP3]])
526; CHECK-NEXT:    [[TMP10:%.*]] = fptoui float [[TMP7]] to i32
527; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]])
528; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.fabs.f32(float [[TMP4]])
529; CHECK-NEXT:    [[TMP13:%.*]] = fcmp fast oge float [[TMP11]], [[TMP12]]
530; CHECK-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 1, i32 0
531; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[TMP10]], [[TMP14]]
532; CHECK-NEXT:    [[TMP16:%.*]] = and i32 [[TMP15]], 255
533; CHECK-NEXT:    [[TMP17:%.*]] = trunc i32 [[TMP16]] to i8
534; CHECK-NEXT:    store i8 [[TMP17]], i8 addrspace(1)* [[OUT:%.*]], align 1
535; CHECK-NEXT:    ret void
536;
537; GCN-LABEL: udiv_i8:
538; GCN:       ; %bb.0:
539; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
540; GCN-NEXT:    s_load_dword s0, s[0:1], 0xb
541; GCN-NEXT:    s_mov_b32 s7, 0xf000
542; GCN-NEXT:    s_mov_b32 s6, -1
543; GCN-NEXT:    s_waitcnt lgkmcnt(0)
544; GCN-NEXT:    v_cvt_f32_ubyte1_e32 v0, s0
545; GCN-NEXT:    v_rcp_iflag_f32_e32 v1, v0
546; GCN-NEXT:    v_cvt_f32_ubyte0_e32 v2, s0
547; GCN-NEXT:    v_mul_f32_e32 v1, v2, v1
548; GCN-NEXT:    v_trunc_f32_e32 v1, v1
549; GCN-NEXT:    v_cvt_u32_f32_e32 v3, v1
550; GCN-NEXT:    v_mad_f32 v1, -v1, v0, v2
551; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
552; GCN-NEXT:    v_addc_u32_e32 v0, vcc, 0, v3, vcc
553; GCN-NEXT:    buffer_store_byte v0, off, s[4:7], 0
554; GCN-NEXT:    s_endpgm
555  %r = udiv i8 %x, %y
556  store i8 %r, i8 addrspace(1)* %out
557  ret void
558}
559
; urem_i8: a kernel that stores (urem i8 %x, %y) to %out.
; The opt assertions expect the float-based unsigned quotient computation
; (zext to i32, uitofp, llvm.amdgcn.rcp.f32, fmul, llvm.trunc.f32, an
; llvm.amdgcn.fmad.ftz.f32 residual, fptoui, and a +1 when |residual| oge
; |y|), followed by the remainder step:
;   remainder = zext(x) - quotient * zext(y)
; which is then masked with 255, truncated to i8 and stored with align 1.
; In the llc assertions (GCN prefix) the float conversions use
; v_cvt_f32_ubyte0 / v_cvt_f32_ubyte1 on the loaded dword, and an
; s_lshr_b32 by 8 of that dword supplies the integer multiplicand for the
; v_mul_lo_u32 in the remainder step.
560define amdgpu_kernel void @urem_i8(i8 addrspace(1)* %out, i8 %x, i8 %y) {
561; CHECK-LABEL: @urem_i8(
562; CHECK-NEXT:    [[TMP1:%.*]] = zext i8 [[X:%.*]] to i32
563; CHECK-NEXT:    [[TMP2:%.*]] = zext i8 [[Y:%.*]] to i32
564; CHECK-NEXT:    [[TMP3:%.*]] = uitofp i32 [[TMP1]] to float
565; CHECK-NEXT:    [[TMP4:%.*]] = uitofp i32 [[TMP2]] to float
566; CHECK-NEXT:    [[TMP5:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP4]])
567; CHECK-NEXT:    [[TMP6:%.*]] = fmul fast float [[TMP3]], [[TMP5]]
568; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.trunc.f32(float [[TMP6]])
569; CHECK-NEXT:    [[TMP8:%.*]] = fneg fast float [[TMP7]]
570; CHECK-NEXT:    [[TMP9:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP8]], float [[TMP4]], float [[TMP3]])
571; CHECK-NEXT:    [[TMP10:%.*]] = fptoui float [[TMP7]] to i32
572; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]])
573; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.fabs.f32(float [[TMP4]])
574; CHECK-NEXT:    [[TMP13:%.*]] = fcmp fast oge float [[TMP11]], [[TMP12]]
575; CHECK-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 1, i32 0
576; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[TMP10]], [[TMP14]]
577; CHECK-NEXT:    [[TMP16:%.*]] = mul i32 [[TMP15]], [[TMP2]]
578; CHECK-NEXT:    [[TMP17:%.*]] = sub i32 [[TMP1]], [[TMP16]]
579; CHECK-NEXT:    [[TMP18:%.*]] = and i32 [[TMP17]], 255
580; CHECK-NEXT:    [[TMP19:%.*]] = trunc i32 [[TMP18]] to i8
581; CHECK-NEXT:    store i8 [[TMP19]], i8 addrspace(1)* [[OUT:%.*]], align 1
582; CHECK-NEXT:    ret void
583;
584; GCN-LABEL: urem_i8:
585; GCN:       ; %bb.0:
586; GCN-NEXT:    s_load_dword s4, s[0:1], 0xb
587; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
588; GCN-NEXT:    s_mov_b32 s3, 0xf000
589; GCN-NEXT:    s_waitcnt lgkmcnt(0)
590; GCN-NEXT:    v_cvt_f32_ubyte1_e32 v0, s4
591; GCN-NEXT:    v_rcp_iflag_f32_e32 v1, v0
592; GCN-NEXT:    v_cvt_f32_ubyte0_e32 v2, s4
593; GCN-NEXT:    s_lshr_b32 s2, s4, 8
594; GCN-NEXT:    v_mul_f32_e32 v1, v2, v1
595; GCN-NEXT:    v_trunc_f32_e32 v1, v1
596; GCN-NEXT:    v_cvt_u32_f32_e32 v3, v1
597; GCN-NEXT:    v_mad_f32 v1, -v1, v0, v2
598; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
599; GCN-NEXT:    v_addc_u32_e32 v0, vcc, 0, v3, vcc
600; GCN-NEXT:    v_mul_lo_u32 v0, v0, s2
601; GCN-NEXT:    s_mov_b32 s2, -1
602; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
603; GCN-NEXT:    buffer_store_byte v0, off, s[0:3], 0
604; GCN-NEXT:    s_endpgm
605  %r = urem i8 %x, %y
606  store i8 %r, i8 addrspace(1)* %out
607  ret void
608}
609
; sdiv_i8: kernel that stores the signed i8 quotient %x / %y to %out.
; The assertions inside the body are autogenerated (see the NOTE lines at the
; top of the file); regenerate them with the update scripts instead of editing
; them by hand.
; Expected IR after the opt run: both operands are sign-extended to i32 and
; converted to float, the quotient is estimated as trunc(x * rcp(y)), and a
; +1/-1 correction (sign taken from x xor y) is added when |fmad(-q, y, x)| is
; at least |y|. The sum is sign-extended from its low 8 bits before the
; truncating store. The lines with the GCN prefix pin the ISA from the llc run.
610define amdgpu_kernel void @sdiv_i8(i8 addrspace(1)* %out, i8 %x, i8 %y) {
611; CHECK-LABEL: @sdiv_i8(
612; CHECK-NEXT:    [[TMP1:%.*]] = sext i8 [[X:%.*]] to i32
613; CHECK-NEXT:    [[TMP2:%.*]] = sext i8 [[Y:%.*]] to i32
614; CHECK-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
615; CHECK-NEXT:    [[TMP4:%.*]] = ashr i32 [[TMP3]], 30
616; CHECK-NEXT:    [[TMP5:%.*]] = or i32 [[TMP4]], 1
617; CHECK-NEXT:    [[TMP6:%.*]] = sitofp i32 [[TMP1]] to float
618; CHECK-NEXT:    [[TMP7:%.*]] = sitofp i32 [[TMP2]] to float
619; CHECK-NEXT:    [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]])
620; CHECK-NEXT:    [[TMP9:%.*]] = fmul fast float [[TMP6]], [[TMP8]]
621; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.trunc.f32(float [[TMP9]])
622; CHECK-NEXT:    [[TMP11:%.*]] = fneg fast float [[TMP10]]
623; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP11]], float [[TMP7]], float [[TMP6]])
624; CHECK-NEXT:    [[TMP13:%.*]] = fptosi float [[TMP10]] to i32
625; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP12]])
626; CHECK-NEXT:    [[TMP15:%.*]] = call fast float @llvm.fabs.f32(float [[TMP7]])
627; CHECK-NEXT:    [[TMP16:%.*]] = fcmp fast oge float [[TMP14]], [[TMP15]]
628; CHECK-NEXT:    [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP5]], i32 0
629; CHECK-NEXT:    [[TMP18:%.*]] = add i32 [[TMP13]], [[TMP17]]
630; CHECK-NEXT:    [[TMP19:%.*]] = shl i32 [[TMP18]], 24
631; CHECK-NEXT:    [[TMP20:%.*]] = ashr i32 [[TMP19]], 24
632; CHECK-NEXT:    [[TMP21:%.*]] = trunc i32 [[TMP20]] to i8
633; CHECK-NEXT:    store i8 [[TMP21]], i8 addrspace(1)* [[OUT:%.*]], align 1
634; CHECK-NEXT:    ret void
635;
636; GCN-LABEL: sdiv_i8:
637; GCN:       ; %bb.0:
638; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
639; GCN-NEXT:    s_load_dword s0, s[0:1], 0xb
640; GCN-NEXT:    s_mov_b32 s7, 0xf000
641; GCN-NEXT:    s_mov_b32 s6, -1
642; GCN-NEXT:    s_waitcnt lgkmcnt(0)
643; GCN-NEXT:    s_bfe_i32 s1, s0, 0x80008
644; GCN-NEXT:    v_cvt_f32_i32_e32 v0, s1
645; GCN-NEXT:    s_sext_i32_i8 s0, s0
646; GCN-NEXT:    v_cvt_f32_i32_e32 v1, s0
647; GCN-NEXT:    s_xor_b32 s0, s0, s1
648; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v0
649; GCN-NEXT:    s_ashr_i32 s0, s0, 30
650; GCN-NEXT:    s_or_b32 s0, s0, 1
651; GCN-NEXT:    v_mov_b32_e32 v3, s0
652; GCN-NEXT:    v_mul_f32_e32 v2, v1, v2
653; GCN-NEXT:    v_trunc_f32_e32 v2, v2
654; GCN-NEXT:    v_mad_f32 v1, -v2, v0, v1
655; GCN-NEXT:    v_cvt_i32_f32_e32 v2, v2
656; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
657; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
658; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
659; GCN-NEXT:    buffer_store_byte v0, off, s[4:7], 0
660; GCN-NEXT:    s_endpgm
  ; Input IR under test: a single i8 signed divide whose result is stored.
661  %r = sdiv i8 %x, %y
662  store i8 %r, i8 addrspace(1)* %out
663  ret void
664}
665
; srem_i8: kernel that stores the signed i8 remainder %x % %y to %out.
; The assertions inside the body are autogenerated (see the NOTE lines at the
; top of the file); regenerate them with the update scripts instead of editing
; them by hand.
; Expected IR after the opt run: the quotient is computed exactly as in the
; signed i8 divide expansion (float estimate trunc(x * rcp(y)) plus a +1/-1
; correction), then the remainder is formed as x - q * y and sign-extended
; from its low 8 bits before the truncating store. The lines with the GCN
; prefix pin the ISA from the llc run.
666define amdgpu_kernel void @srem_i8(i8 addrspace(1)* %out, i8 %x, i8 %y) {
667; CHECK-LABEL: @srem_i8(
668; CHECK-NEXT:    [[TMP1:%.*]] = sext i8 [[X:%.*]] to i32
669; CHECK-NEXT:    [[TMP2:%.*]] = sext i8 [[Y:%.*]] to i32
670; CHECK-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
671; CHECK-NEXT:    [[TMP4:%.*]] = ashr i32 [[TMP3]], 30
672; CHECK-NEXT:    [[TMP5:%.*]] = or i32 [[TMP4]], 1
673; CHECK-NEXT:    [[TMP6:%.*]] = sitofp i32 [[TMP1]] to float
674; CHECK-NEXT:    [[TMP7:%.*]] = sitofp i32 [[TMP2]] to float
675; CHECK-NEXT:    [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]])
676; CHECK-NEXT:    [[TMP9:%.*]] = fmul fast float [[TMP6]], [[TMP8]]
677; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.trunc.f32(float [[TMP9]])
678; CHECK-NEXT:    [[TMP11:%.*]] = fneg fast float [[TMP10]]
679; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP11]], float [[TMP7]], float [[TMP6]])
680; CHECK-NEXT:    [[TMP13:%.*]] = fptosi float [[TMP10]] to i32
681; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP12]])
682; CHECK-NEXT:    [[TMP15:%.*]] = call fast float @llvm.fabs.f32(float [[TMP7]])
683; CHECK-NEXT:    [[TMP16:%.*]] = fcmp fast oge float [[TMP14]], [[TMP15]]
684; CHECK-NEXT:    [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP5]], i32 0
685; CHECK-NEXT:    [[TMP18:%.*]] = add i32 [[TMP13]], [[TMP17]]
686; CHECK-NEXT:    [[TMP19:%.*]] = mul i32 [[TMP18]], [[TMP2]]
687; CHECK-NEXT:    [[TMP20:%.*]] = sub i32 [[TMP1]], [[TMP19]]
688; CHECK-NEXT:    [[TMP21:%.*]] = shl i32 [[TMP20]], 24
689; CHECK-NEXT:    [[TMP22:%.*]] = ashr i32 [[TMP21]], 24
690; CHECK-NEXT:    [[TMP23:%.*]] = trunc i32 [[TMP22]] to i8
691; CHECK-NEXT:    store i8 [[TMP23]], i8 addrspace(1)* [[OUT:%.*]], align 1
692; CHECK-NEXT:    ret void
693;
694; GCN-LABEL: srem_i8:
695; GCN:       ; %bb.0:
696; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
697; GCN-NEXT:    s_load_dword s0, s[0:1], 0xb
698; GCN-NEXT:    s_mov_b32 s7, 0xf000
699; GCN-NEXT:    s_mov_b32 s6, -1
700; GCN-NEXT:    s_waitcnt lgkmcnt(0)
701; GCN-NEXT:    s_bfe_i32 s1, s0, 0x80008
702; GCN-NEXT:    v_cvt_f32_i32_e32 v0, s1
703; GCN-NEXT:    s_sext_i32_i8 s3, s0
704; GCN-NEXT:    v_cvt_f32_i32_e32 v1, s3
705; GCN-NEXT:    s_xor_b32 s1, s3, s1
706; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v0
707; GCN-NEXT:    s_ashr_i32 s1, s1, 30
708; GCN-NEXT:    s_or_b32 s1, s1, 1
709; GCN-NEXT:    v_mov_b32_e32 v3, s1
710; GCN-NEXT:    v_mul_f32_e32 v2, v1, v2
711; GCN-NEXT:    v_trunc_f32_e32 v2, v2
712; GCN-NEXT:    v_mad_f32 v1, -v2, v0, v1
713; GCN-NEXT:    v_cvt_i32_f32_e32 v2, v2
714; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
715; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
716; GCN-NEXT:    s_lshr_b32 s2, s0, 8
717; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
718; GCN-NEXT:    v_mul_lo_u32 v0, v0, s2
719; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
720; GCN-NEXT:    buffer_store_byte v0, off, s[4:7], 0
721; GCN-NEXT:    s_endpgm
  ; Input IR under test: a single i8 signed remainder whose result is stored.
722  %r = srem i8 %x, %y
723  store i8 %r, i8 addrspace(1)* %out
724  ret void
725}
726
; udiv_v4i32: kernel that stores the unsigned <4 x i32> quotient %x / %y to
; %out.
; The assertions inside the body are autogenerated (see the NOTE lines at the
; top of the file); regenerate them with the update scripts instead of editing
; them by hand.
; Expected IR after the opt run: the vector divide is split into four scalar
; divides (one extractelement pair per lane, one insertelement per result).
; Each lane converts the divisor to float, takes rcp, scales by
; 0x41EFFFFFC0000000 (just under 2^32) and converts back to get an integer
; reciprocal estimate, refines it once using the high half of a 64-bit
; multiply, forms the quotient estimate as the high half of x * reciprocal,
; and then applies two "remainder >= divisor" correction steps that each add
; 1 to the quotient. The lines with the GCN prefix pin the ISA from the llc
; run.
727define amdgpu_kernel void @udiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x, <4 x i32> %y) {
728; CHECK-LABEL: @udiv_v4i32(
729; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i32> [[X:%.*]], i64 0
730; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i32> [[Y:%.*]], i64 0
731; CHECK-NEXT:    [[TMP3:%.*]] = uitofp i32 [[TMP2]] to float
732; CHECK-NEXT:    [[TMP4:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP3]])
733; CHECK-NEXT:    [[TMP5:%.*]] = fmul fast float [[TMP4]], 0x41EFFFFFC0000000
734; CHECK-NEXT:    [[TMP6:%.*]] = fptoui float [[TMP5]] to i32
735; CHECK-NEXT:    [[TMP7:%.*]] = sub i32 0, [[TMP2]]
736; CHECK-NEXT:    [[TMP8:%.*]] = mul i32 [[TMP7]], [[TMP6]]
737; CHECK-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP6]] to i64
738; CHECK-NEXT:    [[TMP10:%.*]] = zext i32 [[TMP8]] to i64
739; CHECK-NEXT:    [[TMP11:%.*]] = mul i64 [[TMP9]], [[TMP10]]
740; CHECK-NEXT:    [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32
741; CHECK-NEXT:    [[TMP13:%.*]] = lshr i64 [[TMP11]], 32
742; CHECK-NEXT:    [[TMP14:%.*]] = trunc i64 [[TMP13]] to i32
743; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[TMP6]], [[TMP14]]
744; CHECK-NEXT:    [[TMP16:%.*]] = zext i32 [[TMP1]] to i64
745; CHECK-NEXT:    [[TMP17:%.*]] = zext i32 [[TMP15]] to i64
746; CHECK-NEXT:    [[TMP18:%.*]] = mul i64 [[TMP16]], [[TMP17]]
747; CHECK-NEXT:    [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
748; CHECK-NEXT:    [[TMP20:%.*]] = lshr i64 [[TMP18]], 32
749; CHECK-NEXT:    [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32
750; CHECK-NEXT:    [[TMP22:%.*]] = mul i32 [[TMP21]], [[TMP2]]
751; CHECK-NEXT:    [[TMP23:%.*]] = sub i32 [[TMP1]], [[TMP22]]
752; CHECK-NEXT:    [[TMP24:%.*]] = icmp uge i32 [[TMP23]], [[TMP2]]
753; CHECK-NEXT:    [[TMP25:%.*]] = add i32 [[TMP21]], 1
754; CHECK-NEXT:    [[TMP26:%.*]] = select i1 [[TMP24]], i32 [[TMP25]], i32 [[TMP21]]
755; CHECK-NEXT:    [[TMP27:%.*]] = sub i32 [[TMP23]], [[TMP2]]
756; CHECK-NEXT:    [[TMP28:%.*]] = select i1 [[TMP24]], i32 [[TMP27]], i32 [[TMP23]]
757; CHECK-NEXT:    [[TMP29:%.*]] = icmp uge i32 [[TMP28]], [[TMP2]]
758; CHECK-NEXT:    [[TMP30:%.*]] = add i32 [[TMP26]], 1
759; CHECK-NEXT:    [[TMP31:%.*]] = select i1 [[TMP29]], i32 [[TMP30]], i32 [[TMP26]]
760; CHECK-NEXT:    [[TMP32:%.*]] = insertelement <4 x i32> undef, i32 [[TMP31]], i64 0
761; CHECK-NEXT:    [[TMP33:%.*]] = extractelement <4 x i32> [[X]], i64 1
762; CHECK-NEXT:    [[TMP34:%.*]] = extractelement <4 x i32> [[Y]], i64 1
763; CHECK-NEXT:    [[TMP35:%.*]] = uitofp i32 [[TMP34]] to float
764; CHECK-NEXT:    [[TMP36:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP35]])
765; CHECK-NEXT:    [[TMP37:%.*]] = fmul fast float [[TMP36]], 0x41EFFFFFC0000000
766; CHECK-NEXT:    [[TMP38:%.*]] = fptoui float [[TMP37]] to i32
767; CHECK-NEXT:    [[TMP39:%.*]] = sub i32 0, [[TMP34]]
768; CHECK-NEXT:    [[TMP40:%.*]] = mul i32 [[TMP39]], [[TMP38]]
769; CHECK-NEXT:    [[TMP41:%.*]] = zext i32 [[TMP38]] to i64
770; CHECK-NEXT:    [[TMP42:%.*]] = zext i32 [[TMP40]] to i64
771; CHECK-NEXT:    [[TMP43:%.*]] = mul i64 [[TMP41]], [[TMP42]]
772; CHECK-NEXT:    [[TMP44:%.*]] = trunc i64 [[TMP43]] to i32
773; CHECK-NEXT:    [[TMP45:%.*]] = lshr i64 [[TMP43]], 32
774; CHECK-NEXT:    [[TMP46:%.*]] = trunc i64 [[TMP45]] to i32
775; CHECK-NEXT:    [[TMP47:%.*]] = add i32 [[TMP38]], [[TMP46]]
776; CHECK-NEXT:    [[TMP48:%.*]] = zext i32 [[TMP33]] to i64
777; CHECK-NEXT:    [[TMP49:%.*]] = zext i32 [[TMP47]] to i64
778; CHECK-NEXT:    [[TMP50:%.*]] = mul i64 [[TMP48]], [[TMP49]]
779; CHECK-NEXT:    [[TMP51:%.*]] = trunc i64 [[TMP50]] to i32
780; CHECK-NEXT:    [[TMP52:%.*]] = lshr i64 [[TMP50]], 32
781; CHECK-NEXT:    [[TMP53:%.*]] = trunc i64 [[TMP52]] to i32
782; CHECK-NEXT:    [[TMP54:%.*]] = mul i32 [[TMP53]], [[TMP34]]
783; CHECK-NEXT:    [[TMP55:%.*]] = sub i32 [[TMP33]], [[TMP54]]
784; CHECK-NEXT:    [[TMP56:%.*]] = icmp uge i32 [[TMP55]], [[TMP34]]
785; CHECK-NEXT:    [[TMP57:%.*]] = add i32 [[TMP53]], 1
786; CHECK-NEXT:    [[TMP58:%.*]] = select i1 [[TMP56]], i32 [[TMP57]], i32 [[TMP53]]
787; CHECK-NEXT:    [[TMP59:%.*]] = sub i32 [[TMP55]], [[TMP34]]
788; CHECK-NEXT:    [[TMP60:%.*]] = select i1 [[TMP56]], i32 [[TMP59]], i32 [[TMP55]]
789; CHECK-NEXT:    [[TMP61:%.*]] = icmp uge i32 [[TMP60]], [[TMP34]]
790; CHECK-NEXT:    [[TMP62:%.*]] = add i32 [[TMP58]], 1
791; CHECK-NEXT:    [[TMP63:%.*]] = select i1 [[TMP61]], i32 [[TMP62]], i32 [[TMP58]]
792; CHECK-NEXT:    [[TMP64:%.*]] = insertelement <4 x i32> [[TMP32]], i32 [[TMP63]], i64 1
793; CHECK-NEXT:    [[TMP65:%.*]] = extractelement <4 x i32> [[X]], i64 2
794; CHECK-NEXT:    [[TMP66:%.*]] = extractelement <4 x i32> [[Y]], i64 2
795; CHECK-NEXT:    [[TMP67:%.*]] = uitofp i32 [[TMP66]] to float
796; CHECK-NEXT:    [[TMP68:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP67]])
797; CHECK-NEXT:    [[TMP69:%.*]] = fmul fast float [[TMP68]], 0x41EFFFFFC0000000
798; CHECK-NEXT:    [[TMP70:%.*]] = fptoui float [[TMP69]] to i32
799; CHECK-NEXT:    [[TMP71:%.*]] = sub i32 0, [[TMP66]]
800; CHECK-NEXT:    [[TMP72:%.*]] = mul i32 [[TMP71]], [[TMP70]]
801; CHECK-NEXT:    [[TMP73:%.*]] = zext i32 [[TMP70]] to i64
802; CHECK-NEXT:    [[TMP74:%.*]] = zext i32 [[TMP72]] to i64
803; CHECK-NEXT:    [[TMP75:%.*]] = mul i64 [[TMP73]], [[TMP74]]
804; CHECK-NEXT:    [[TMP76:%.*]] = trunc i64 [[TMP75]] to i32
805; CHECK-NEXT:    [[TMP77:%.*]] = lshr i64 [[TMP75]], 32
806; CHECK-NEXT:    [[TMP78:%.*]] = trunc i64 [[TMP77]] to i32
807; CHECK-NEXT:    [[TMP79:%.*]] = add i32 [[TMP70]], [[TMP78]]
808; CHECK-NEXT:    [[TMP80:%.*]] = zext i32 [[TMP65]] to i64
809; CHECK-NEXT:    [[TMP81:%.*]] = zext i32 [[TMP79]] to i64
810; CHECK-NEXT:    [[TMP82:%.*]] = mul i64 [[TMP80]], [[TMP81]]
811; CHECK-NEXT:    [[TMP83:%.*]] = trunc i64 [[TMP82]] to i32
812; CHECK-NEXT:    [[TMP84:%.*]] = lshr i64 [[TMP82]], 32
813; CHECK-NEXT:    [[TMP85:%.*]] = trunc i64 [[TMP84]] to i32
814; CHECK-NEXT:    [[TMP86:%.*]] = mul i32 [[TMP85]], [[TMP66]]
815; CHECK-NEXT:    [[TMP87:%.*]] = sub i32 [[TMP65]], [[TMP86]]
816; CHECK-NEXT:    [[TMP88:%.*]] = icmp uge i32 [[TMP87]], [[TMP66]]
817; CHECK-NEXT:    [[TMP89:%.*]] = add i32 [[TMP85]], 1
818; CHECK-NEXT:    [[TMP90:%.*]] = select i1 [[TMP88]], i32 [[TMP89]], i32 [[TMP85]]
819; CHECK-NEXT:    [[TMP91:%.*]] = sub i32 [[TMP87]], [[TMP66]]
820; CHECK-NEXT:    [[TMP92:%.*]] = select i1 [[TMP88]], i32 [[TMP91]], i32 [[TMP87]]
821; CHECK-NEXT:    [[TMP93:%.*]] = icmp uge i32 [[TMP92]], [[TMP66]]
822; CHECK-NEXT:    [[TMP94:%.*]] = add i32 [[TMP90]], 1
823; CHECK-NEXT:    [[TMP95:%.*]] = select i1 [[TMP93]], i32 [[TMP94]], i32 [[TMP90]]
824; CHECK-NEXT:    [[TMP96:%.*]] = insertelement <4 x i32> [[TMP64]], i32 [[TMP95]], i64 2
825; CHECK-NEXT:    [[TMP97:%.*]] = extractelement <4 x i32> [[X]], i64 3
826; CHECK-NEXT:    [[TMP98:%.*]] = extractelement <4 x i32> [[Y]], i64 3
827; CHECK-NEXT:    [[TMP99:%.*]] = uitofp i32 [[TMP98]] to float
828; CHECK-NEXT:    [[TMP100:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP99]])
829; CHECK-NEXT:    [[TMP101:%.*]] = fmul fast float [[TMP100]], 0x41EFFFFFC0000000
830; CHECK-NEXT:    [[TMP102:%.*]] = fptoui float [[TMP101]] to i32
831; CHECK-NEXT:    [[TMP103:%.*]] = sub i32 0, [[TMP98]]
832; CHECK-NEXT:    [[TMP104:%.*]] = mul i32 [[TMP103]], [[TMP102]]
833; CHECK-NEXT:    [[TMP105:%.*]] = zext i32 [[TMP102]] to i64
834; CHECK-NEXT:    [[TMP106:%.*]] = zext i32 [[TMP104]] to i64
835; CHECK-NEXT:    [[TMP107:%.*]] = mul i64 [[TMP105]], [[TMP106]]
836; CHECK-NEXT:    [[TMP108:%.*]] = trunc i64 [[TMP107]] to i32
837; CHECK-NEXT:    [[TMP109:%.*]] = lshr i64 [[TMP107]], 32
838; CHECK-NEXT:    [[TMP110:%.*]] = trunc i64 [[TMP109]] to i32
839; CHECK-NEXT:    [[TMP111:%.*]] = add i32 [[TMP102]], [[TMP110]]
840; CHECK-NEXT:    [[TMP112:%.*]] = zext i32 [[TMP97]] to i64
841; CHECK-NEXT:    [[TMP113:%.*]] = zext i32 [[TMP111]] to i64
842; CHECK-NEXT:    [[TMP114:%.*]] = mul i64 [[TMP112]], [[TMP113]]
843; CHECK-NEXT:    [[TMP115:%.*]] = trunc i64 [[TMP114]] to i32
844; CHECK-NEXT:    [[TMP116:%.*]] = lshr i64 [[TMP114]], 32
845; CHECK-NEXT:    [[TMP117:%.*]] = trunc i64 [[TMP116]] to i32
846; CHECK-NEXT:    [[TMP118:%.*]] = mul i32 [[TMP117]], [[TMP98]]
847; CHECK-NEXT:    [[TMP119:%.*]] = sub i32 [[TMP97]], [[TMP118]]
848; CHECK-NEXT:    [[TMP120:%.*]] = icmp uge i32 [[TMP119]], [[TMP98]]
849; CHECK-NEXT:    [[TMP121:%.*]] = add i32 [[TMP117]], 1
850; CHECK-NEXT:    [[TMP122:%.*]] = select i1 [[TMP120]], i32 [[TMP121]], i32 [[TMP117]]
851; CHECK-NEXT:    [[TMP123:%.*]] = sub i32 [[TMP119]], [[TMP98]]
852; CHECK-NEXT:    [[TMP124:%.*]] = select i1 [[TMP120]], i32 [[TMP123]], i32 [[TMP119]]
853; CHECK-NEXT:    [[TMP125:%.*]] = icmp uge i32 [[TMP124]], [[TMP98]]
854; CHECK-NEXT:    [[TMP126:%.*]] = add i32 [[TMP122]], 1
855; CHECK-NEXT:    [[TMP127:%.*]] = select i1 [[TMP125]], i32 [[TMP126]], i32 [[TMP122]]
856; CHECK-NEXT:    [[TMP128:%.*]] = insertelement <4 x i32> [[TMP96]], i32 [[TMP127]], i64 3
857; CHECK-NEXT:    store <4 x i32> [[TMP128]], <4 x i32> addrspace(1)* [[OUT:%.*]], align 16
858; CHECK-NEXT:    ret void
859;
860; GCN-LABEL: udiv_v4i32:
861; GCN:       ; %bb.0:
862; GCN-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0xd
863; GCN-NEXT:    s_mov_b32 s3, 0x4f7ffffe
864; GCN-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0x9
865; GCN-NEXT:    s_mov_b32 s15, 0xf000
866; GCN-NEXT:    s_mov_b32 s14, -1
867; GCN-NEXT:    s_waitcnt lgkmcnt(0)
868; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s8
869; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s9
870; GCN-NEXT:    s_sub_i32 s2, 0, s8
871; GCN-NEXT:    v_cvt_f32_u32_e32 v4, s10
872; GCN-NEXT:    v_rcp_iflag_f32_e32 v0, v0
873; GCN-NEXT:    v_rcp_iflag_f32_e32 v1, v1
874; GCN-NEXT:    v_cvt_f32_u32_e32 v6, s11
875; GCN-NEXT:    v_mul_f32_e32 v0, s3, v0
876; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
877; GCN-NEXT:    v_mul_f32_e32 v1, s3, v1
878; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
879; GCN-NEXT:    v_mul_lo_u32 v2, s2, v0
880; GCN-NEXT:    s_sub_i32 s2, 0, s9
881; GCN-NEXT:    v_mul_lo_u32 v3, s2, v1
882; GCN-NEXT:    s_sub_i32 s2, 0, s10
883; GCN-NEXT:    v_mul_hi_u32 v2, v0, v2
884; GCN-NEXT:    v_mul_hi_u32 v3, v1, v3
885; GCN-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
886; GCN-NEXT:    v_mul_hi_u32 v0, s4, v0
887; GCN-NEXT:    v_add_i32_e32 v1, vcc, v3, v1
888; GCN-NEXT:    v_mul_hi_u32 v1, s5, v1
889; GCN-NEXT:    v_mul_lo_u32 v2, v0, s8
890; GCN-NEXT:    v_add_i32_e32 v3, vcc, 1, v0
891; GCN-NEXT:    v_mul_lo_u32 v5, v1, s9
892; GCN-NEXT:    v_sub_i32_e32 v2, vcc, s4, v2
893; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s8, v2
894; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v3, s[0:1]
895; GCN-NEXT:    v_subrev_i32_e32 v3, vcc, s8, v2
896; GCN-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[0:1]
897; GCN-NEXT:    v_add_i32_e32 v3, vcc, 1, v0
898; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s8, v2
899; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v4
900; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
901; GCN-NEXT:    v_sub_i32_e32 v3, vcc, s5, v5
902; GCN-NEXT:    v_add_i32_e32 v4, vcc, 1, v1
903; GCN-NEXT:    v_mul_f32_e32 v2, s3, v2
904; GCN-NEXT:    v_cvt_u32_f32_e32 v2, v2
905; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s9, v3
906; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, v4, s[0:1]
907; GCN-NEXT:    v_subrev_i32_e32 v5, vcc, s9, v3
908; GCN-NEXT:    v_mul_lo_u32 v4, s2, v2
909; GCN-NEXT:    v_cndmask_b32_e64 v3, v3, v5, s[0:1]
910; GCN-NEXT:    v_add_i32_e32 v5, vcc, 1, v1
911; GCN-NEXT:    s_sub_i32 s0, 0, s11
912; GCN-NEXT:    v_mul_hi_u32 v4, v2, v4
913; GCN-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
914; GCN-NEXT:    v_rcp_iflag_f32_e32 v4, v6
915; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s9, v3
916; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
917; GCN-NEXT:    v_mul_hi_u32 v2, s6, v2
918; GCN-NEXT:    v_mul_f32_e32 v4, s3, v4
919; GCN-NEXT:    v_cvt_u32_f32_e32 v4, v4
920; GCN-NEXT:    v_mul_lo_u32 v3, v2, s10
921; GCN-NEXT:    v_add_i32_e32 v6, vcc, 1, v2
922; GCN-NEXT:    v_mul_lo_u32 v5, s0, v4
923; GCN-NEXT:    v_sub_i32_e32 v3, vcc, s6, v3
924; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s10, v3
925; GCN-NEXT:    v_mul_hi_u32 v5, v4, v5
926; GCN-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s[0:1]
927; GCN-NEXT:    v_subrev_i32_e32 v6, vcc, s10, v3
928; GCN-NEXT:    v_cndmask_b32_e64 v3, v3, v6, s[0:1]
929; GCN-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
930; GCN-NEXT:    v_mul_hi_u32 v4, s7, v4
931; GCN-NEXT:    v_add_i32_e32 v5, vcc, 1, v2
932; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s10, v3
933; GCN-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
934; GCN-NEXT:    v_mul_lo_u32 v6, v4, s11
935; GCN-NEXT:    v_add_i32_e32 v5, vcc, 1, v4
936; GCN-NEXT:    v_sub_i32_e32 v3, vcc, s7, v6
937; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s11, v3
938; GCN-NEXT:    v_cndmask_b32_e64 v4, v4, v5, s[0:1]
939; GCN-NEXT:    v_subrev_i32_e32 v5, vcc, s11, v3
940; GCN-NEXT:    v_cndmask_b32_e64 v3, v3, v5, s[0:1]
941; GCN-NEXT:    v_add_i32_e32 v5, vcc, 1, v4
942; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s11, v3
943; GCN-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
944; GCN-NEXT:    buffer_store_dwordx4 v[0:3], off, s[12:15], 0
945; GCN-NEXT:    s_endpgm
  ; Input IR under test: a single <4 x i32> unsigned divide whose result is
  ; stored.
946  %r = udiv <4 x i32> %x, %y
947  store <4 x i32> %r, <4 x i32> addrspace(1)* %out
948  ret void
949}
950
; urem_v4i32: kernel that stores the unsigned <4 x i32> remainder %x % %y to
; %out.
; The assertions inside the body are autogenerated (see the NOTE lines at the
; top of the file); regenerate them with the update scripts instead of editing
; them by hand.
; Expected IR after the opt run: the vector remainder is split into four
; scalar remainders (one extractelement pair per lane, one insertelement per
; result). Each lane builds an integer reciprocal estimate of the divisor via
; float rcp scaled by 0x41EFFFFFC0000000 (just under 2^32), refines it once
; using the high half of a 64-bit multiply, computes r = x - q * y from the
; estimated quotient, and then applies two "r >= divisor" correction steps
; that each subtract the divisor from r. The lines with the GCN prefix pin the
; ISA from the llc run.
951define amdgpu_kernel void @urem_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x, <4 x i32> %y) {
952; CHECK-LABEL: @urem_v4i32(
953; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i32> [[X:%.*]], i64 0
954; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i32> [[Y:%.*]], i64 0
955; CHECK-NEXT:    [[TMP3:%.*]] = uitofp i32 [[TMP2]] to float
956; CHECK-NEXT:    [[TMP4:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP3]])
957; CHECK-NEXT:    [[TMP5:%.*]] = fmul fast float [[TMP4]], 0x41EFFFFFC0000000
958; CHECK-NEXT:    [[TMP6:%.*]] = fptoui float [[TMP5]] to i32
959; CHECK-NEXT:    [[TMP7:%.*]] = sub i32 0, [[TMP2]]
960; CHECK-NEXT:    [[TMP8:%.*]] = mul i32 [[TMP7]], [[TMP6]]
961; CHECK-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP6]] to i64
962; CHECK-NEXT:    [[TMP10:%.*]] = zext i32 [[TMP8]] to i64
963; CHECK-NEXT:    [[TMP11:%.*]] = mul i64 [[TMP9]], [[TMP10]]
964; CHECK-NEXT:    [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32
965; CHECK-NEXT:    [[TMP13:%.*]] = lshr i64 [[TMP11]], 32
966; CHECK-NEXT:    [[TMP14:%.*]] = trunc i64 [[TMP13]] to i32
967; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[TMP6]], [[TMP14]]
968; CHECK-NEXT:    [[TMP16:%.*]] = zext i32 [[TMP1]] to i64
969; CHECK-NEXT:    [[TMP17:%.*]] = zext i32 [[TMP15]] to i64
970; CHECK-NEXT:    [[TMP18:%.*]] = mul i64 [[TMP16]], [[TMP17]]
971; CHECK-NEXT:    [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
972; CHECK-NEXT:    [[TMP20:%.*]] = lshr i64 [[TMP18]], 32
973; CHECK-NEXT:    [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32
974; CHECK-NEXT:    [[TMP22:%.*]] = mul i32 [[TMP21]], [[TMP2]]
975; CHECK-NEXT:    [[TMP23:%.*]] = sub i32 [[TMP1]], [[TMP22]]
976; CHECK-NEXT:    [[TMP24:%.*]] = icmp uge i32 [[TMP23]], [[TMP2]]
977; CHECK-NEXT:    [[TMP25:%.*]] = sub i32 [[TMP23]], [[TMP2]]
978; CHECK-NEXT:    [[TMP26:%.*]] = select i1 [[TMP24]], i32 [[TMP25]], i32 [[TMP23]]
979; CHECK-NEXT:    [[TMP27:%.*]] = icmp uge i32 [[TMP26]], [[TMP2]]
980; CHECK-NEXT:    [[TMP28:%.*]] = sub i32 [[TMP26]], [[TMP2]]
981; CHECK-NEXT:    [[TMP29:%.*]] = select i1 [[TMP27]], i32 [[TMP28]], i32 [[TMP26]]
982; CHECK-NEXT:    [[TMP30:%.*]] = insertelement <4 x i32> undef, i32 [[TMP29]], i64 0
983; CHECK-NEXT:    [[TMP31:%.*]] = extractelement <4 x i32> [[X]], i64 1
984; CHECK-NEXT:    [[TMP32:%.*]] = extractelement <4 x i32> [[Y]], i64 1
985; CHECK-NEXT:    [[TMP33:%.*]] = uitofp i32 [[TMP32]] to float
986; CHECK-NEXT:    [[TMP34:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP33]])
987; CHECK-NEXT:    [[TMP35:%.*]] = fmul fast float [[TMP34]], 0x41EFFFFFC0000000
988; CHECK-NEXT:    [[TMP36:%.*]] = fptoui float [[TMP35]] to i32
989; CHECK-NEXT:    [[TMP37:%.*]] = sub i32 0, [[TMP32]]
990; CHECK-NEXT:    [[TMP38:%.*]] = mul i32 [[TMP37]], [[TMP36]]
991; CHECK-NEXT:    [[TMP39:%.*]] = zext i32 [[TMP36]] to i64
992; CHECK-NEXT:    [[TMP40:%.*]] = zext i32 [[TMP38]] to i64
993; CHECK-NEXT:    [[TMP41:%.*]] = mul i64 [[TMP39]], [[TMP40]]
994; CHECK-NEXT:    [[TMP42:%.*]] = trunc i64 [[TMP41]] to i32
995; CHECK-NEXT:    [[TMP43:%.*]] = lshr i64 [[TMP41]], 32
996; CHECK-NEXT:    [[TMP44:%.*]] = trunc i64 [[TMP43]] to i32
997; CHECK-NEXT:    [[TMP45:%.*]] = add i32 [[TMP36]], [[TMP44]]
998; CHECK-NEXT:    [[TMP46:%.*]] = zext i32 [[TMP31]] to i64
999; CHECK-NEXT:    [[TMP47:%.*]] = zext i32 [[TMP45]] to i64
1000; CHECK-NEXT:    [[TMP48:%.*]] = mul i64 [[TMP46]], [[TMP47]]
1001; CHECK-NEXT:    [[TMP49:%.*]] = trunc i64 [[TMP48]] to i32
1002; CHECK-NEXT:    [[TMP50:%.*]] = lshr i64 [[TMP48]], 32
1003; CHECK-NEXT:    [[TMP51:%.*]] = trunc i64 [[TMP50]] to i32
1004; CHECK-NEXT:    [[TMP52:%.*]] = mul i32 [[TMP51]], [[TMP32]]
1005; CHECK-NEXT:    [[TMP53:%.*]] = sub i32 [[TMP31]], [[TMP52]]
1006; CHECK-NEXT:    [[TMP54:%.*]] = icmp uge i32 [[TMP53]], [[TMP32]]
1007; CHECK-NEXT:    [[TMP55:%.*]] = sub i32 [[TMP53]], [[TMP32]]
1008; CHECK-NEXT:    [[TMP56:%.*]] = select i1 [[TMP54]], i32 [[TMP55]], i32 [[TMP53]]
1009; CHECK-NEXT:    [[TMP57:%.*]] = icmp uge i32 [[TMP56]], [[TMP32]]
1010; CHECK-NEXT:    [[TMP58:%.*]] = sub i32 [[TMP56]], [[TMP32]]
1011; CHECK-NEXT:    [[TMP59:%.*]] = select i1 [[TMP57]], i32 [[TMP58]], i32 [[TMP56]]
1012; CHECK-NEXT:    [[TMP60:%.*]] = insertelement <4 x i32> [[TMP30]], i32 [[TMP59]], i64 1
1013; CHECK-NEXT:    [[TMP61:%.*]] = extractelement <4 x i32> [[X]], i64 2
1014; CHECK-NEXT:    [[TMP62:%.*]] = extractelement <4 x i32> [[Y]], i64 2
1015; CHECK-NEXT:    [[TMP63:%.*]] = uitofp i32 [[TMP62]] to float
1016; CHECK-NEXT:    [[TMP64:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP63]])
1017; CHECK-NEXT:    [[TMP65:%.*]] = fmul fast float [[TMP64]], 0x41EFFFFFC0000000
1018; CHECK-NEXT:    [[TMP66:%.*]] = fptoui float [[TMP65]] to i32
1019; CHECK-NEXT:    [[TMP67:%.*]] = sub i32 0, [[TMP62]]
1020; CHECK-NEXT:    [[TMP68:%.*]] = mul i32 [[TMP67]], [[TMP66]]
1021; CHECK-NEXT:    [[TMP69:%.*]] = zext i32 [[TMP66]] to i64
1022; CHECK-NEXT:    [[TMP70:%.*]] = zext i32 [[TMP68]] to i64
1023; CHECK-NEXT:    [[TMP71:%.*]] = mul i64 [[TMP69]], [[TMP70]]
1024; CHECK-NEXT:    [[TMP72:%.*]] = trunc i64 [[TMP71]] to i32
1025; CHECK-NEXT:    [[TMP73:%.*]] = lshr i64 [[TMP71]], 32
1026; CHECK-NEXT:    [[TMP74:%.*]] = trunc i64 [[TMP73]] to i32
1027; CHECK-NEXT:    [[TMP75:%.*]] = add i32 [[TMP66]], [[TMP74]]
1028; CHECK-NEXT:    [[TMP76:%.*]] = zext i32 [[TMP61]] to i64
1029; CHECK-NEXT:    [[TMP77:%.*]] = zext i32 [[TMP75]] to i64
1030; CHECK-NEXT:    [[TMP78:%.*]] = mul i64 [[TMP76]], [[TMP77]]
1031; CHECK-NEXT:    [[TMP79:%.*]] = trunc i64 [[TMP78]] to i32
1032; CHECK-NEXT:    [[TMP80:%.*]] = lshr i64 [[TMP78]], 32
1033; CHECK-NEXT:    [[TMP81:%.*]] = trunc i64 [[TMP80]] to i32
1034; CHECK-NEXT:    [[TMP82:%.*]] = mul i32 [[TMP81]], [[TMP62]]
1035; CHECK-NEXT:    [[TMP83:%.*]] = sub i32 [[TMP61]], [[TMP82]]
1036; CHECK-NEXT:    [[TMP84:%.*]] = icmp uge i32 [[TMP83]], [[TMP62]]
1037; CHECK-NEXT:    [[TMP85:%.*]] = sub i32 [[TMP83]], [[TMP62]]
1038; CHECK-NEXT:    [[TMP86:%.*]] = select i1 [[TMP84]], i32 [[TMP85]], i32 [[TMP83]]
1039; CHECK-NEXT:    [[TMP87:%.*]] = icmp uge i32 [[TMP86]], [[TMP62]]
1040; CHECK-NEXT:    [[TMP88:%.*]] = sub i32 [[TMP86]], [[TMP62]]
1041; CHECK-NEXT:    [[TMP89:%.*]] = select i1 [[TMP87]], i32 [[TMP88]], i32 [[TMP86]]
1042; CHECK-NEXT:    [[TMP90:%.*]] = insertelement <4 x i32> [[TMP60]], i32 [[TMP89]], i64 2
1043; CHECK-NEXT:    [[TMP91:%.*]] = extractelement <4 x i32> [[X]], i64 3
1044; CHECK-NEXT:    [[TMP92:%.*]] = extractelement <4 x i32> [[Y]], i64 3
1045; CHECK-NEXT:    [[TMP93:%.*]] = uitofp i32 [[TMP92]] to float
1046; CHECK-NEXT:    [[TMP94:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP93]])
1047; CHECK-NEXT:    [[TMP95:%.*]] = fmul fast float [[TMP94]], 0x41EFFFFFC0000000
1048; CHECK-NEXT:    [[TMP96:%.*]] = fptoui float [[TMP95]] to i32
1049; CHECK-NEXT:    [[TMP97:%.*]] = sub i32 0, [[TMP92]]
1050; CHECK-NEXT:    [[TMP98:%.*]] = mul i32 [[TMP97]], [[TMP96]]
1051; CHECK-NEXT:    [[TMP99:%.*]] = zext i32 [[TMP96]] to i64
1052; CHECK-NEXT:    [[TMP100:%.*]] = zext i32 [[TMP98]] to i64
1053; CHECK-NEXT:    [[TMP101:%.*]] = mul i64 [[TMP99]], [[TMP100]]
1054; CHECK-NEXT:    [[TMP102:%.*]] = trunc i64 [[TMP101]] to i32
1055; CHECK-NEXT:    [[TMP103:%.*]] = lshr i64 [[TMP101]], 32
1056; CHECK-NEXT:    [[TMP104:%.*]] = trunc i64 [[TMP103]] to i32
1057; CHECK-NEXT:    [[TMP105:%.*]] = add i32 [[TMP96]], [[TMP104]]
1058; CHECK-NEXT:    [[TMP106:%.*]] = zext i32 [[TMP91]] to i64
1059; CHECK-NEXT:    [[TMP107:%.*]] = zext i32 [[TMP105]] to i64
1060; CHECK-NEXT:    [[TMP108:%.*]] = mul i64 [[TMP106]], [[TMP107]]
1061; CHECK-NEXT:    [[TMP109:%.*]] = trunc i64 [[TMP108]] to i32
1062; CHECK-NEXT:    [[TMP110:%.*]] = lshr i64 [[TMP108]], 32
1063; CHECK-NEXT:    [[TMP111:%.*]] = trunc i64 [[TMP110]] to i32
1064; CHECK-NEXT:    [[TMP112:%.*]] = mul i32 [[TMP111]], [[TMP92]]
1065; CHECK-NEXT:    [[TMP113:%.*]] = sub i32 [[TMP91]], [[TMP112]]
1066; CHECK-NEXT:    [[TMP114:%.*]] = icmp uge i32 [[TMP113]], [[TMP92]]
1067; CHECK-NEXT:    [[TMP115:%.*]] = sub i32 [[TMP113]], [[TMP92]]
1068; CHECK-NEXT:    [[TMP116:%.*]] = select i1 [[TMP114]], i32 [[TMP115]], i32 [[TMP113]]
1069; CHECK-NEXT:    [[TMP117:%.*]] = icmp uge i32 [[TMP116]], [[TMP92]]
1070; CHECK-NEXT:    [[TMP118:%.*]] = sub i32 [[TMP116]], [[TMP92]]
1071; CHECK-NEXT:    [[TMP119:%.*]] = select i1 [[TMP117]], i32 [[TMP118]], i32 [[TMP116]]
1072; CHECK-NEXT:    [[TMP120:%.*]] = insertelement <4 x i32> [[TMP90]], i32 [[TMP119]], i64 3
1073; CHECK-NEXT:    store <4 x i32> [[TMP120]], <4 x i32> addrspace(1)* [[OUT:%.*]], align 16
1074; CHECK-NEXT:    ret void
1075;
1076; GCN-LABEL: urem_v4i32:
1077; GCN:       ; %bb.0:
1078; GCN-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0xd
1079; GCN-NEXT:    s_mov_b32 s13, 0x4f7ffffe
1080; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1081; GCN-NEXT:    s_mov_b32 s3, 0xf000
1082; GCN-NEXT:    s_waitcnt lgkmcnt(0)
1083; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s8
1084; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s9
1085; GCN-NEXT:    s_sub_i32 s2, 0, s8
1086; GCN-NEXT:    s_sub_i32 s12, 0, s9
1087; GCN-NEXT:    v_rcp_iflag_f32_e32 v0, v0
1088; GCN-NEXT:    v_rcp_iflag_f32_e32 v1, v1
1089; GCN-NEXT:    v_cvt_f32_u32_e32 v3, s10
1090; GCN-NEXT:    v_cvt_f32_u32_e32 v5, s11
1091; GCN-NEXT:    v_mul_f32_e32 v0, s13, v0
1092; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
1093; GCN-NEXT:    v_mul_f32_e32 v1, s13, v1
1094; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
1095; GCN-NEXT:    v_rcp_iflag_f32_e32 v3, v3
1096; GCN-NEXT:    v_mul_lo_u32 v2, s2, v0
1097; GCN-NEXT:    s_mov_b32 s2, -1
1098; GCN-NEXT:    v_mul_lo_u32 v4, s12, v1
1099; GCN-NEXT:    v_mul_hi_u32 v2, v0, v2
1100; GCN-NEXT:    v_mul_hi_u32 v4, v1, v4
1101; GCN-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
1102; GCN-NEXT:    v_mul_hi_u32 v0, s4, v0
1103; GCN-NEXT:    v_add_i32_e32 v1, vcc, v4, v1
1104; GCN-NEXT:    v_mul_hi_u32 v1, s5, v1
1105; GCN-NEXT:    v_mul_f32_e32 v2, s13, v3
1106; GCN-NEXT:    v_mul_lo_u32 v0, v0, s8
1107; GCN-NEXT:    v_cvt_u32_f32_e32 v2, v2
1108; GCN-NEXT:    v_mul_lo_u32 v1, v1, s9
1109; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
1110; GCN-NEXT:    v_subrev_i32_e32 v3, vcc, s8, v0
1111; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s8, v0
1112; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
1113; GCN-NEXT:    v_subrev_i32_e32 v3, vcc, s8, v0
1114; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s8, v0
1115; GCN-NEXT:    s_sub_i32 s4, 0, s10
1116; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
1117; GCN-NEXT:    v_mul_lo_u32 v3, s4, v2
1118; GCN-NEXT:    v_sub_i32_e32 v1, vcc, s5, v1
1119; GCN-NEXT:    v_subrev_i32_e32 v4, vcc, s9, v1
1120; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s9, v1
1121; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
1122; GCN-NEXT:    v_mul_hi_u32 v3, v2, v3
1123; GCN-NEXT:    v_rcp_iflag_f32_e32 v4, v5
1124; GCN-NEXT:    s_sub_i32 s4, 0, s11
1125; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
1126; GCN-NEXT:    v_mul_f32_e32 v3, s13, v4
1127; GCN-NEXT:    v_cvt_u32_f32_e32 v3, v3
1128; GCN-NEXT:    v_subrev_i32_e32 v4, vcc, s9, v1
1129; GCN-NEXT:    v_mul_hi_u32 v2, s6, v2
1130; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s9, v1
1131; GCN-NEXT:    v_mul_lo_u32 v5, s4, v3
1132; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
1133; GCN-NEXT:    v_mul_lo_u32 v2, v2, s10
1134; GCN-NEXT:    v_mul_hi_u32 v4, v3, v5
1135; GCN-NEXT:    v_sub_i32_e32 v2, vcc, s6, v2
1136; GCN-NEXT:    v_subrev_i32_e32 v5, vcc, s10, v2
1137; GCN-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
1138; GCN-NEXT:    v_mul_hi_u32 v3, s7, v3
1139; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s10, v2
1140; GCN-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
1141; GCN-NEXT:    v_subrev_i32_e32 v4, vcc, s10, v2
1142; GCN-NEXT:    v_mul_lo_u32 v3, v3, s11
1143; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s10, v2
1144; GCN-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
1145; GCN-NEXT:    v_sub_i32_e32 v3, vcc, s7, v3
1146; GCN-NEXT:    v_subrev_i32_e32 v4, vcc, s11, v3
1147; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s11, v3
1148; GCN-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
1149; GCN-NEXT:    v_subrev_i32_e32 v4, vcc, s11, v3
1150; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s11, v3
1151; GCN-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
1152; GCN-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
1153; GCN-NEXT:    s_endpgm
  ; Input IR under test: a single <4 x i32> unsigned remainder whose result is
  ; stored.
1154  %r = urem <4 x i32> %x, %y
1155  store <4 x i32> %r, <4 x i32> addrspace(1)* %out
1156  ret void
1157}
1158
; Test: signed division of two <4 x i32> kernel arguments, result stored to %out.
;
; The opt assertions (prefix CHECK) expect the vector sdiv to be split into four
; scalar expansions, one per lane (extractelement at index 0..3). Each lane:
;   * derives sign masks with 'ashr ..., 31' and applies add-then-xor with the
;     mask to both operands,
;   * forms an initial reciprocal estimate of the divisor through
;     llvm.amdgcn.rcp.f32 scaled by the constant 0x41EFFFFFC0000000,
;   * refines the estimate with a 32x32->64 multiply and takes the high half as
;     the quotient candidate,
;   * applies two 'icmp uge' / select correction steps to the quotient,
;   * restores the sign with xor/sub against the xor of the two sign masks,
;   * writes the lane back with insertelement.
; The llc assertions (prefix GCN) pin the expected instruction sequence, which
; ends in a single buffer_store_dwordx4.
;
; All assertion lines below are autogenerated (see the notes at the top of the
; file); regenerate them with the update scripts instead of editing by hand.
1159define amdgpu_kernel void @sdiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x, <4 x i32> %y) {
1160; CHECK-LABEL: @sdiv_v4i32(
1161; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i32> [[X:%.*]], i64 0
1162; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i32> [[Y:%.*]], i64 0
1163; CHECK-NEXT:    [[TMP3:%.*]] = ashr i32 [[TMP1]], 31
1164; CHECK-NEXT:    [[TMP4:%.*]] = ashr i32 [[TMP2]], 31
1165; CHECK-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]]
1166; CHECK-NEXT:    [[TMP6:%.*]] = add i32 [[TMP1]], [[TMP3]]
1167; CHECK-NEXT:    [[TMP7:%.*]] = add i32 [[TMP2]], [[TMP4]]
1168; CHECK-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP3]]
1169; CHECK-NEXT:    [[TMP9:%.*]] = xor i32 [[TMP7]], [[TMP4]]
1170; CHECK-NEXT:    [[TMP10:%.*]] = uitofp i32 [[TMP9]] to float
1171; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP10]])
1172; CHECK-NEXT:    [[TMP12:%.*]] = fmul fast float [[TMP11]], 0x41EFFFFFC0000000
1173; CHECK-NEXT:    [[TMP13:%.*]] = fptoui float [[TMP12]] to i32
1174; CHECK-NEXT:    [[TMP14:%.*]] = sub i32 0, [[TMP9]]
1175; CHECK-NEXT:    [[TMP15:%.*]] = mul i32 [[TMP14]], [[TMP13]]
1176; CHECK-NEXT:    [[TMP16:%.*]] = zext i32 [[TMP13]] to i64
1177; CHECK-NEXT:    [[TMP17:%.*]] = zext i32 [[TMP15]] to i64
1178; CHECK-NEXT:    [[TMP18:%.*]] = mul i64 [[TMP16]], [[TMP17]]
1179; CHECK-NEXT:    [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
1180; CHECK-NEXT:    [[TMP20:%.*]] = lshr i64 [[TMP18]], 32
1181; CHECK-NEXT:    [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32
1182; CHECK-NEXT:    [[TMP22:%.*]] = add i32 [[TMP13]], [[TMP21]]
1183; CHECK-NEXT:    [[TMP23:%.*]] = zext i32 [[TMP8]] to i64
1184; CHECK-NEXT:    [[TMP24:%.*]] = zext i32 [[TMP22]] to i64
1185; CHECK-NEXT:    [[TMP25:%.*]] = mul i64 [[TMP23]], [[TMP24]]
1186; CHECK-NEXT:    [[TMP26:%.*]] = trunc i64 [[TMP25]] to i32
1187; CHECK-NEXT:    [[TMP27:%.*]] = lshr i64 [[TMP25]], 32
1188; CHECK-NEXT:    [[TMP28:%.*]] = trunc i64 [[TMP27]] to i32
1189; CHECK-NEXT:    [[TMP29:%.*]] = mul i32 [[TMP28]], [[TMP9]]
1190; CHECK-NEXT:    [[TMP30:%.*]] = sub i32 [[TMP8]], [[TMP29]]
1191; CHECK-NEXT:    [[TMP31:%.*]] = icmp uge i32 [[TMP30]], [[TMP9]]
1192; CHECK-NEXT:    [[TMP32:%.*]] = add i32 [[TMP28]], 1
1193; CHECK-NEXT:    [[TMP33:%.*]] = select i1 [[TMP31]], i32 [[TMP32]], i32 [[TMP28]]
1194; CHECK-NEXT:    [[TMP34:%.*]] = sub i32 [[TMP30]], [[TMP9]]
1195; CHECK-NEXT:    [[TMP35:%.*]] = select i1 [[TMP31]], i32 [[TMP34]], i32 [[TMP30]]
1196; CHECK-NEXT:    [[TMP36:%.*]] = icmp uge i32 [[TMP35]], [[TMP9]]
1197; CHECK-NEXT:    [[TMP37:%.*]] = add i32 [[TMP33]], 1
1198; CHECK-NEXT:    [[TMP38:%.*]] = select i1 [[TMP36]], i32 [[TMP37]], i32 [[TMP33]]
1199; CHECK-NEXT:    [[TMP39:%.*]] = xor i32 [[TMP38]], [[TMP5]]
1200; CHECK-NEXT:    [[TMP40:%.*]] = sub i32 [[TMP39]], [[TMP5]]
1201; CHECK-NEXT:    [[TMP41:%.*]] = insertelement <4 x i32> undef, i32 [[TMP40]], i64 0
1202; CHECK-NEXT:    [[TMP42:%.*]] = extractelement <4 x i32> [[X]], i64 1
1203; CHECK-NEXT:    [[TMP43:%.*]] = extractelement <4 x i32> [[Y]], i64 1
1204; CHECK-NEXT:    [[TMP44:%.*]] = ashr i32 [[TMP42]], 31
1205; CHECK-NEXT:    [[TMP45:%.*]] = ashr i32 [[TMP43]], 31
1206; CHECK-NEXT:    [[TMP46:%.*]] = xor i32 [[TMP44]], [[TMP45]]
1207; CHECK-NEXT:    [[TMP47:%.*]] = add i32 [[TMP42]], [[TMP44]]
1208; CHECK-NEXT:    [[TMP48:%.*]] = add i32 [[TMP43]], [[TMP45]]
1209; CHECK-NEXT:    [[TMP49:%.*]] = xor i32 [[TMP47]], [[TMP44]]
1210; CHECK-NEXT:    [[TMP50:%.*]] = xor i32 [[TMP48]], [[TMP45]]
1211; CHECK-NEXT:    [[TMP51:%.*]] = uitofp i32 [[TMP50]] to float
1212; CHECK-NEXT:    [[TMP52:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP51]])
1213; CHECK-NEXT:    [[TMP53:%.*]] = fmul fast float [[TMP52]], 0x41EFFFFFC0000000
1214; CHECK-NEXT:    [[TMP54:%.*]] = fptoui float [[TMP53]] to i32
1215; CHECK-NEXT:    [[TMP55:%.*]] = sub i32 0, [[TMP50]]
1216; CHECK-NEXT:    [[TMP56:%.*]] = mul i32 [[TMP55]], [[TMP54]]
1217; CHECK-NEXT:    [[TMP57:%.*]] = zext i32 [[TMP54]] to i64
1218; CHECK-NEXT:    [[TMP58:%.*]] = zext i32 [[TMP56]] to i64
1219; CHECK-NEXT:    [[TMP59:%.*]] = mul i64 [[TMP57]], [[TMP58]]
1220; CHECK-NEXT:    [[TMP60:%.*]] = trunc i64 [[TMP59]] to i32
1221; CHECK-NEXT:    [[TMP61:%.*]] = lshr i64 [[TMP59]], 32
1222; CHECK-NEXT:    [[TMP62:%.*]] = trunc i64 [[TMP61]] to i32
1223; CHECK-NEXT:    [[TMP63:%.*]] = add i32 [[TMP54]], [[TMP62]]
1224; CHECK-NEXT:    [[TMP64:%.*]] = zext i32 [[TMP49]] to i64
1225; CHECK-NEXT:    [[TMP65:%.*]] = zext i32 [[TMP63]] to i64
1226; CHECK-NEXT:    [[TMP66:%.*]] = mul i64 [[TMP64]], [[TMP65]]
1227; CHECK-NEXT:    [[TMP67:%.*]] = trunc i64 [[TMP66]] to i32
1228; CHECK-NEXT:    [[TMP68:%.*]] = lshr i64 [[TMP66]], 32
1229; CHECK-NEXT:    [[TMP69:%.*]] = trunc i64 [[TMP68]] to i32
1230; CHECK-NEXT:    [[TMP70:%.*]] = mul i32 [[TMP69]], [[TMP50]]
1231; CHECK-NEXT:    [[TMP71:%.*]] = sub i32 [[TMP49]], [[TMP70]]
1232; CHECK-NEXT:    [[TMP72:%.*]] = icmp uge i32 [[TMP71]], [[TMP50]]
1233; CHECK-NEXT:    [[TMP73:%.*]] = add i32 [[TMP69]], 1
1234; CHECK-NEXT:    [[TMP74:%.*]] = select i1 [[TMP72]], i32 [[TMP73]], i32 [[TMP69]]
1235; CHECK-NEXT:    [[TMP75:%.*]] = sub i32 [[TMP71]], [[TMP50]]
1236; CHECK-NEXT:    [[TMP76:%.*]] = select i1 [[TMP72]], i32 [[TMP75]], i32 [[TMP71]]
1237; CHECK-NEXT:    [[TMP77:%.*]] = icmp uge i32 [[TMP76]], [[TMP50]]
1238; CHECK-NEXT:    [[TMP78:%.*]] = add i32 [[TMP74]], 1
1239; CHECK-NEXT:    [[TMP79:%.*]] = select i1 [[TMP77]], i32 [[TMP78]], i32 [[TMP74]]
1240; CHECK-NEXT:    [[TMP80:%.*]] = xor i32 [[TMP79]], [[TMP46]]
1241; CHECK-NEXT:    [[TMP81:%.*]] = sub i32 [[TMP80]], [[TMP46]]
1242; CHECK-NEXT:    [[TMP82:%.*]] = insertelement <4 x i32> [[TMP41]], i32 [[TMP81]], i64 1
1243; CHECK-NEXT:    [[TMP83:%.*]] = extractelement <4 x i32> [[X]], i64 2
1244; CHECK-NEXT:    [[TMP84:%.*]] = extractelement <4 x i32> [[Y]], i64 2
1245; CHECK-NEXT:    [[TMP85:%.*]] = ashr i32 [[TMP83]], 31
1246; CHECK-NEXT:    [[TMP86:%.*]] = ashr i32 [[TMP84]], 31
1247; CHECK-NEXT:    [[TMP87:%.*]] = xor i32 [[TMP85]], [[TMP86]]
1248; CHECK-NEXT:    [[TMP88:%.*]] = add i32 [[TMP83]], [[TMP85]]
1249; CHECK-NEXT:    [[TMP89:%.*]] = add i32 [[TMP84]], [[TMP86]]
1250; CHECK-NEXT:    [[TMP90:%.*]] = xor i32 [[TMP88]], [[TMP85]]
1251; CHECK-NEXT:    [[TMP91:%.*]] = xor i32 [[TMP89]], [[TMP86]]
1252; CHECK-NEXT:    [[TMP92:%.*]] = uitofp i32 [[TMP91]] to float
1253; CHECK-NEXT:    [[TMP93:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP92]])
1254; CHECK-NEXT:    [[TMP94:%.*]] = fmul fast float [[TMP93]], 0x41EFFFFFC0000000
1255; CHECK-NEXT:    [[TMP95:%.*]] = fptoui float [[TMP94]] to i32
1256; CHECK-NEXT:    [[TMP96:%.*]] = sub i32 0, [[TMP91]]
1257; CHECK-NEXT:    [[TMP97:%.*]] = mul i32 [[TMP96]], [[TMP95]]
1258; CHECK-NEXT:    [[TMP98:%.*]] = zext i32 [[TMP95]] to i64
1259; CHECK-NEXT:    [[TMP99:%.*]] = zext i32 [[TMP97]] to i64
1260; CHECK-NEXT:    [[TMP100:%.*]] = mul i64 [[TMP98]], [[TMP99]]
1261; CHECK-NEXT:    [[TMP101:%.*]] = trunc i64 [[TMP100]] to i32
1262; CHECK-NEXT:    [[TMP102:%.*]] = lshr i64 [[TMP100]], 32
1263; CHECK-NEXT:    [[TMP103:%.*]] = trunc i64 [[TMP102]] to i32
1264; CHECK-NEXT:    [[TMP104:%.*]] = add i32 [[TMP95]], [[TMP103]]
1265; CHECK-NEXT:    [[TMP105:%.*]] = zext i32 [[TMP90]] to i64
1266; CHECK-NEXT:    [[TMP106:%.*]] = zext i32 [[TMP104]] to i64
1267; CHECK-NEXT:    [[TMP107:%.*]] = mul i64 [[TMP105]], [[TMP106]]
1268; CHECK-NEXT:    [[TMP108:%.*]] = trunc i64 [[TMP107]] to i32
1269; CHECK-NEXT:    [[TMP109:%.*]] = lshr i64 [[TMP107]], 32
1270; CHECK-NEXT:    [[TMP110:%.*]] = trunc i64 [[TMP109]] to i32
1271; CHECK-NEXT:    [[TMP111:%.*]] = mul i32 [[TMP110]], [[TMP91]]
1272; CHECK-NEXT:    [[TMP112:%.*]] = sub i32 [[TMP90]], [[TMP111]]
1273; CHECK-NEXT:    [[TMP113:%.*]] = icmp uge i32 [[TMP112]], [[TMP91]]
1274; CHECK-NEXT:    [[TMP114:%.*]] = add i32 [[TMP110]], 1
1275; CHECK-NEXT:    [[TMP115:%.*]] = select i1 [[TMP113]], i32 [[TMP114]], i32 [[TMP110]]
1276; CHECK-NEXT:    [[TMP116:%.*]] = sub i32 [[TMP112]], [[TMP91]]
1277; CHECK-NEXT:    [[TMP117:%.*]] = select i1 [[TMP113]], i32 [[TMP116]], i32 [[TMP112]]
1278; CHECK-NEXT:    [[TMP118:%.*]] = icmp uge i32 [[TMP117]], [[TMP91]]
1279; CHECK-NEXT:    [[TMP119:%.*]] = add i32 [[TMP115]], 1
1280; CHECK-NEXT:    [[TMP120:%.*]] = select i1 [[TMP118]], i32 [[TMP119]], i32 [[TMP115]]
1281; CHECK-NEXT:    [[TMP121:%.*]] = xor i32 [[TMP120]], [[TMP87]]
1282; CHECK-NEXT:    [[TMP122:%.*]] = sub i32 [[TMP121]], [[TMP87]]
1283; CHECK-NEXT:    [[TMP123:%.*]] = insertelement <4 x i32> [[TMP82]], i32 [[TMP122]], i64 2
1284; CHECK-NEXT:    [[TMP124:%.*]] = extractelement <4 x i32> [[X]], i64 3
1285; CHECK-NEXT:    [[TMP125:%.*]] = extractelement <4 x i32> [[Y]], i64 3
1286; CHECK-NEXT:    [[TMP126:%.*]] = ashr i32 [[TMP124]], 31
1287; CHECK-NEXT:    [[TMP127:%.*]] = ashr i32 [[TMP125]], 31
1288; CHECK-NEXT:    [[TMP128:%.*]] = xor i32 [[TMP126]], [[TMP127]]
1289; CHECK-NEXT:    [[TMP129:%.*]] = add i32 [[TMP124]], [[TMP126]]
1290; CHECK-NEXT:    [[TMP130:%.*]] = add i32 [[TMP125]], [[TMP127]]
1291; CHECK-NEXT:    [[TMP131:%.*]] = xor i32 [[TMP129]], [[TMP126]]
1292; CHECK-NEXT:    [[TMP132:%.*]] = xor i32 [[TMP130]], [[TMP127]]
1293; CHECK-NEXT:    [[TMP133:%.*]] = uitofp i32 [[TMP132]] to float
1294; CHECK-NEXT:    [[TMP134:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP133]])
1295; CHECK-NEXT:    [[TMP135:%.*]] = fmul fast float [[TMP134]], 0x41EFFFFFC0000000
1296; CHECK-NEXT:    [[TMP136:%.*]] = fptoui float [[TMP135]] to i32
1297; CHECK-NEXT:    [[TMP137:%.*]] = sub i32 0, [[TMP132]]
1298; CHECK-NEXT:    [[TMP138:%.*]] = mul i32 [[TMP137]], [[TMP136]]
1299; CHECK-NEXT:    [[TMP139:%.*]] = zext i32 [[TMP136]] to i64
1300; CHECK-NEXT:    [[TMP140:%.*]] = zext i32 [[TMP138]] to i64
1301; CHECK-NEXT:    [[TMP141:%.*]] = mul i64 [[TMP139]], [[TMP140]]
1302; CHECK-NEXT:    [[TMP142:%.*]] = trunc i64 [[TMP141]] to i32
1303; CHECK-NEXT:    [[TMP143:%.*]] = lshr i64 [[TMP141]], 32
1304; CHECK-NEXT:    [[TMP144:%.*]] = trunc i64 [[TMP143]] to i32
1305; CHECK-NEXT:    [[TMP145:%.*]] = add i32 [[TMP136]], [[TMP144]]
1306; CHECK-NEXT:    [[TMP146:%.*]] = zext i32 [[TMP131]] to i64
1307; CHECK-NEXT:    [[TMP147:%.*]] = zext i32 [[TMP145]] to i64
1308; CHECK-NEXT:    [[TMP148:%.*]] = mul i64 [[TMP146]], [[TMP147]]
1309; CHECK-NEXT:    [[TMP149:%.*]] = trunc i64 [[TMP148]] to i32
1310; CHECK-NEXT:    [[TMP150:%.*]] = lshr i64 [[TMP148]], 32
1311; CHECK-NEXT:    [[TMP151:%.*]] = trunc i64 [[TMP150]] to i32
1312; CHECK-NEXT:    [[TMP152:%.*]] = mul i32 [[TMP151]], [[TMP132]]
1313; CHECK-NEXT:    [[TMP153:%.*]] = sub i32 [[TMP131]], [[TMP152]]
1314; CHECK-NEXT:    [[TMP154:%.*]] = icmp uge i32 [[TMP153]], [[TMP132]]
1315; CHECK-NEXT:    [[TMP155:%.*]] = add i32 [[TMP151]], 1
1316; CHECK-NEXT:    [[TMP156:%.*]] = select i1 [[TMP154]], i32 [[TMP155]], i32 [[TMP151]]
1317; CHECK-NEXT:    [[TMP157:%.*]] = sub i32 [[TMP153]], [[TMP132]]
1318; CHECK-NEXT:    [[TMP158:%.*]] = select i1 [[TMP154]], i32 [[TMP157]], i32 [[TMP153]]
1319; CHECK-NEXT:    [[TMP159:%.*]] = icmp uge i32 [[TMP158]], [[TMP132]]
1320; CHECK-NEXT:    [[TMP160:%.*]] = add i32 [[TMP156]], 1
1321; CHECK-NEXT:    [[TMP161:%.*]] = select i1 [[TMP159]], i32 [[TMP160]], i32 [[TMP156]]
1322; CHECK-NEXT:    [[TMP162:%.*]] = xor i32 [[TMP161]], [[TMP128]]
1323; CHECK-NEXT:    [[TMP163:%.*]] = sub i32 [[TMP162]], [[TMP128]]
1324; CHECK-NEXT:    [[TMP164:%.*]] = insertelement <4 x i32> [[TMP123]], i32 [[TMP163]], i64 3
1325; CHECK-NEXT:    store <4 x i32> [[TMP164]], <4 x i32> addrspace(1)* [[OUT:%.*]], align 16
1326; CHECK-NEXT:    ret void
1327;
1328; GCN-LABEL: sdiv_v4i32:
1329; GCN:       ; %bb.0:
1330; GCN-NEXT:    s_load_dwordx8 s[8:15], s[0:1], 0xd
1331; GCN-NEXT:    s_mov_b32 s16, 0x4f7ffffe
1332; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
1333; GCN-NEXT:    s_mov_b32 s7, 0xf000
1334; GCN-NEXT:    s_mov_b32 s6, -1
1335; GCN-NEXT:    s_waitcnt lgkmcnt(0)
1336; GCN-NEXT:    s_ashr_i32 s2, s12, 31
1337; GCN-NEXT:    s_add_i32 s3, s12, s2
1338; GCN-NEXT:    s_xor_b32 s12, s3, s2
1339; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s12
1340; GCN-NEXT:    s_ashr_i32 s3, s13, 31
1341; GCN-NEXT:    s_add_i32 s0, s13, s3
1342; GCN-NEXT:    s_xor_b32 s13, s0, s3
1343; GCN-NEXT:    v_rcp_iflag_f32_e32 v0, v0
1344; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s13
1345; GCN-NEXT:    s_sub_i32 s1, 0, s12
1346; GCN-NEXT:    s_ashr_i32 s0, s8, 31
1347; GCN-NEXT:    v_mul_f32_e32 v0, s16, v0
1348; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
1349; GCN-NEXT:    v_rcp_iflag_f32_e32 v1, v1
1350; GCN-NEXT:    s_xor_b32 s2, s0, s2
1351; GCN-NEXT:    v_mul_lo_u32 v2, s1, v0
1352; GCN-NEXT:    s_add_i32 s1, s8, s0
1353; GCN-NEXT:    v_mul_f32_e32 v1, s16, v1
1354; GCN-NEXT:    s_xor_b32 s1, s1, s0
1355; GCN-NEXT:    v_mul_hi_u32 v2, v0, v2
1356; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
1357; GCN-NEXT:    s_sub_i32 s0, 0, s13
1358; GCN-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
1359; GCN-NEXT:    v_mul_hi_u32 v0, s1, v0
1360; GCN-NEXT:    v_mul_lo_u32 v2, s0, v1
1361; GCN-NEXT:    v_mul_lo_u32 v3, v0, s12
1362; GCN-NEXT:    v_mul_hi_u32 v2, v1, v2
1363; GCN-NEXT:    v_add_i32_e32 v4, vcc, 1, v0
1364; GCN-NEXT:    v_sub_i32_e32 v3, vcc, s1, v3
1365; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s12, v3
1366; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[0:1]
1367; GCN-NEXT:    v_subrev_i32_e32 v4, vcc, s12, v3
1368; GCN-NEXT:    v_cndmask_b32_e64 v3, v3, v4, s[0:1]
1369; GCN-NEXT:    v_add_i32_e32 v4, vcc, 1, v0
1370; GCN-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
1371; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s12, v3
1372; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
1373; GCN-NEXT:    v_xor_b32_e32 v0, s2, v0
1374; GCN-NEXT:    s_ashr_i32 s0, s9, 31
1375; GCN-NEXT:    s_add_i32 s1, s9, s0
1376; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, s2, v0
1377; GCN-NEXT:    s_xor_b32 s2, s0, s3
1378; GCN-NEXT:    s_ashr_i32 s3, s14, 31
1379; GCN-NEXT:    s_xor_b32 s1, s1, s0
1380; GCN-NEXT:    s_add_i32 s0, s14, s3
1381; GCN-NEXT:    s_xor_b32 s9, s0, s3
1382; GCN-NEXT:    v_cvt_f32_u32_e32 v3, s9
1383; GCN-NEXT:    v_mul_hi_u32 v1, s1, v1
1384; GCN-NEXT:    v_rcp_iflag_f32_e32 v3, v3
1385; GCN-NEXT:    v_mul_lo_u32 v2, v1, s13
1386; GCN-NEXT:    v_add_i32_e32 v4, vcc, 1, v1
1387; GCN-NEXT:    v_mul_f32_e32 v3, s16, v3
1388; GCN-NEXT:    v_sub_i32_e32 v2, vcc, s1, v2
1389; GCN-NEXT:    v_cvt_u32_f32_e32 v3, v3
1390; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s13, v2
1391; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, v4, s[0:1]
1392; GCN-NEXT:    v_subrev_i32_e32 v4, vcc, s13, v2
1393; GCN-NEXT:    v_cndmask_b32_e64 v2, v2, v4, s[0:1]
1394; GCN-NEXT:    s_sub_i32 s0, 0, s9
1395; GCN-NEXT:    v_mul_lo_u32 v5, s0, v3
1396; GCN-NEXT:    v_add_i32_e32 v4, vcc, 1, v1
1397; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s13, v2
1398; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
1399; GCN-NEXT:    v_mul_hi_u32 v2, v3, v5
1400; GCN-NEXT:    v_xor_b32_e32 v1, s2, v1
1401; GCN-NEXT:    v_subrev_i32_e32 v1, vcc, s2, v1
1402; GCN-NEXT:    s_ashr_i32 s2, s15, 31
1403; GCN-NEXT:    s_ashr_i32 s0, s10, 31
1404; GCN-NEXT:    s_add_i32 s8, s15, s2
1405; GCN-NEXT:    s_add_i32 s1, s10, s0
1406; GCN-NEXT:    s_xor_b32 s8, s8, s2
1407; GCN-NEXT:    v_cvt_f32_u32_e32 v4, s8
1408; GCN-NEXT:    s_xor_b32 s1, s1, s0
1409; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
1410; GCN-NEXT:    v_mul_hi_u32 v2, s1, v2
1411; GCN-NEXT:    v_rcp_iflag_f32_e32 v4, v4
1412; GCN-NEXT:    s_xor_b32 s3, s0, s3
1413; GCN-NEXT:    v_mul_lo_u32 v3, v2, s9
1414; GCN-NEXT:    v_mul_f32_e32 v4, s16, v4
1415; GCN-NEXT:    v_cvt_u32_f32_e32 v4, v4
1416; GCN-NEXT:    v_add_i32_e32 v5, vcc, 1, v2
1417; GCN-NEXT:    v_sub_i32_e32 v3, vcc, s1, v3
1418; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s9, v3
1419; GCN-NEXT:    v_cndmask_b32_e64 v2, v2, v5, s[0:1]
1420; GCN-NEXT:    v_subrev_i32_e32 v5, vcc, s9, v3
1421; GCN-NEXT:    v_cndmask_b32_e64 v3, v3, v5, s[0:1]
1422; GCN-NEXT:    s_sub_i32 s0, 0, s8
1423; GCN-NEXT:    v_mul_lo_u32 v5, s0, v4
1424; GCN-NEXT:    s_ashr_i32 s0, s11, 31
1425; GCN-NEXT:    s_add_i32 s1, s11, s0
1426; GCN-NEXT:    s_xor_b32 s1, s1, s0
1427; GCN-NEXT:    v_mul_hi_u32 v5, v4, v5
1428; GCN-NEXT:    v_add_i32_e32 v6, vcc, 1, v2
1429; GCN-NEXT:    s_xor_b32 s2, s0, s2
1430; GCN-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
1431; GCN-NEXT:    v_mul_hi_u32 v4, s1, v4
1432; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s9, v3
1433; GCN-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
1434; GCN-NEXT:    v_xor_b32_e32 v2, s3, v2
1435; GCN-NEXT:    v_mul_lo_u32 v3, v4, s8
1436; GCN-NEXT:    v_add_i32_e32 v5, vcc, 1, v4
1437; GCN-NEXT:    v_subrev_i32_e32 v2, vcc, s3, v2
1438; GCN-NEXT:    v_sub_i32_e32 v3, vcc, s1, v3
1439; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s8, v3
1440; GCN-NEXT:    v_cndmask_b32_e64 v4, v4, v5, s[0:1]
1441; GCN-NEXT:    v_subrev_i32_e32 v5, vcc, s8, v3
1442; GCN-NEXT:    v_cndmask_b32_e64 v3, v3, v5, s[0:1]
1443; GCN-NEXT:    v_add_i32_e32 v5, vcc, 1, v4
1444; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s8, v3
1445; GCN-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
1446; GCN-NEXT:    v_xor_b32_e32 v3, s2, v3
1447; GCN-NEXT:    v_subrev_i32_e32 v3, vcc, s2, v3
1448; GCN-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
1449; GCN-NEXT:    s_endpgm
1450  %r = sdiv <4 x i32> %x, %y
1451  store <4 x i32> %r, <4 x i32> addrspace(1)* %out
1452  ret void
1453}
1454
; Test: signed remainder of two <4 x i32> kernel arguments, result stored to %out.
;
; The opt assertions (prefix CHECK) expect the vector srem to be split into four
; scalar expansions, one per lane (extractelement at index 0..3). Each lane:
;   * derives sign masks with 'ashr ..., 31' and applies add-then-xor with the
;     mask to both operands,
;   * forms an initial reciprocal estimate of the divisor through
;     llvm.amdgcn.rcp.f32 scaled by the constant 0x41EFFFFFC0000000,
;   * refines the estimate with a 32x32->64 multiply, takes the high half as the
;     quotient candidate and computes the remainder candidate as
;     dividend - quotient * divisor,
;   * applies two 'icmp uge' / select steps that conditionally subtract the
;     divisor from the remainder,
;   * restores the sign with xor/sub against the dividend's sign mask only,
;   * writes the lane back with insertelement.
; The llc assertions (prefix GCN) pin the expected instruction sequence, which
; ends in a single buffer_store_dwordx4.
;
; All assertion lines below are autogenerated (see the notes at the top of the
; file); regenerate them with the update scripts instead of editing by hand.
1455define amdgpu_kernel void @srem_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x, <4 x i32> %y) {
1456; CHECK-LABEL: @srem_v4i32(
1457; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i32> [[X:%.*]], i64 0
1458; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i32> [[Y:%.*]], i64 0
1459; CHECK-NEXT:    [[TMP3:%.*]] = ashr i32 [[TMP1]], 31
1460; CHECK-NEXT:    [[TMP4:%.*]] = ashr i32 [[TMP2]], 31
1461; CHECK-NEXT:    [[TMP5:%.*]] = add i32 [[TMP1]], [[TMP3]]
1462; CHECK-NEXT:    [[TMP6:%.*]] = add i32 [[TMP2]], [[TMP4]]
1463; CHECK-NEXT:    [[TMP7:%.*]] = xor i32 [[TMP5]], [[TMP3]]
1464; CHECK-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP4]]
1465; CHECK-NEXT:    [[TMP9:%.*]] = uitofp i32 [[TMP8]] to float
1466; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]])
1467; CHECK-NEXT:    [[TMP11:%.*]] = fmul fast float [[TMP10]], 0x41EFFFFFC0000000
1468; CHECK-NEXT:    [[TMP12:%.*]] = fptoui float [[TMP11]] to i32
1469; CHECK-NEXT:    [[TMP13:%.*]] = sub i32 0, [[TMP8]]
1470; CHECK-NEXT:    [[TMP14:%.*]] = mul i32 [[TMP13]], [[TMP12]]
1471; CHECK-NEXT:    [[TMP15:%.*]] = zext i32 [[TMP12]] to i64
1472; CHECK-NEXT:    [[TMP16:%.*]] = zext i32 [[TMP14]] to i64
1473; CHECK-NEXT:    [[TMP17:%.*]] = mul i64 [[TMP15]], [[TMP16]]
1474; CHECK-NEXT:    [[TMP18:%.*]] = trunc i64 [[TMP17]] to i32
1475; CHECK-NEXT:    [[TMP19:%.*]] = lshr i64 [[TMP17]], 32
1476; CHECK-NEXT:    [[TMP20:%.*]] = trunc i64 [[TMP19]] to i32
1477; CHECK-NEXT:    [[TMP21:%.*]] = add i32 [[TMP12]], [[TMP20]]
1478; CHECK-NEXT:    [[TMP22:%.*]] = zext i32 [[TMP7]] to i64
1479; CHECK-NEXT:    [[TMP23:%.*]] = zext i32 [[TMP21]] to i64
1480; CHECK-NEXT:    [[TMP24:%.*]] = mul i64 [[TMP22]], [[TMP23]]
1481; CHECK-NEXT:    [[TMP25:%.*]] = trunc i64 [[TMP24]] to i32
1482; CHECK-NEXT:    [[TMP26:%.*]] = lshr i64 [[TMP24]], 32
1483; CHECK-NEXT:    [[TMP27:%.*]] = trunc i64 [[TMP26]] to i32
1484; CHECK-NEXT:    [[TMP28:%.*]] = mul i32 [[TMP27]], [[TMP8]]
1485; CHECK-NEXT:    [[TMP29:%.*]] = sub i32 [[TMP7]], [[TMP28]]
1486; CHECK-NEXT:    [[TMP30:%.*]] = icmp uge i32 [[TMP29]], [[TMP8]]
1487; CHECK-NEXT:    [[TMP31:%.*]] = sub i32 [[TMP29]], [[TMP8]]
1488; CHECK-NEXT:    [[TMP32:%.*]] = select i1 [[TMP30]], i32 [[TMP31]], i32 [[TMP29]]
1489; CHECK-NEXT:    [[TMP33:%.*]] = icmp uge i32 [[TMP32]], [[TMP8]]
1490; CHECK-NEXT:    [[TMP34:%.*]] = sub i32 [[TMP32]], [[TMP8]]
1491; CHECK-NEXT:    [[TMP35:%.*]] = select i1 [[TMP33]], i32 [[TMP34]], i32 [[TMP32]]
1492; CHECK-NEXT:    [[TMP36:%.*]] = xor i32 [[TMP35]], [[TMP3]]
1493; CHECK-NEXT:    [[TMP37:%.*]] = sub i32 [[TMP36]], [[TMP3]]
1494; CHECK-NEXT:    [[TMP38:%.*]] = insertelement <4 x i32> undef, i32 [[TMP37]], i64 0
1495; CHECK-NEXT:    [[TMP39:%.*]] = extractelement <4 x i32> [[X]], i64 1
1496; CHECK-NEXT:    [[TMP40:%.*]] = extractelement <4 x i32> [[Y]], i64 1
1497; CHECK-NEXT:    [[TMP41:%.*]] = ashr i32 [[TMP39]], 31
1498; CHECK-NEXT:    [[TMP42:%.*]] = ashr i32 [[TMP40]], 31
1499; CHECK-NEXT:    [[TMP43:%.*]] = add i32 [[TMP39]], [[TMP41]]
1500; CHECK-NEXT:    [[TMP44:%.*]] = add i32 [[TMP40]], [[TMP42]]
1501; CHECK-NEXT:    [[TMP45:%.*]] = xor i32 [[TMP43]], [[TMP41]]
1502; CHECK-NEXT:    [[TMP46:%.*]] = xor i32 [[TMP44]], [[TMP42]]
1503; CHECK-NEXT:    [[TMP47:%.*]] = uitofp i32 [[TMP46]] to float
1504; CHECK-NEXT:    [[TMP48:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP47]])
1505; CHECK-NEXT:    [[TMP49:%.*]] = fmul fast float [[TMP48]], 0x41EFFFFFC0000000
1506; CHECK-NEXT:    [[TMP50:%.*]] = fptoui float [[TMP49]] to i32
1507; CHECK-NEXT:    [[TMP51:%.*]] = sub i32 0, [[TMP46]]
1508; CHECK-NEXT:    [[TMP52:%.*]] = mul i32 [[TMP51]], [[TMP50]]
1509; CHECK-NEXT:    [[TMP53:%.*]] = zext i32 [[TMP50]] to i64
1510; CHECK-NEXT:    [[TMP54:%.*]] = zext i32 [[TMP52]] to i64
1511; CHECK-NEXT:    [[TMP55:%.*]] = mul i64 [[TMP53]], [[TMP54]]
1512; CHECK-NEXT:    [[TMP56:%.*]] = trunc i64 [[TMP55]] to i32
1513; CHECK-NEXT:    [[TMP57:%.*]] = lshr i64 [[TMP55]], 32
1514; CHECK-NEXT:    [[TMP58:%.*]] = trunc i64 [[TMP57]] to i32
1515; CHECK-NEXT:    [[TMP59:%.*]] = add i32 [[TMP50]], [[TMP58]]
1516; CHECK-NEXT:    [[TMP60:%.*]] = zext i32 [[TMP45]] to i64
1517; CHECK-NEXT:    [[TMP61:%.*]] = zext i32 [[TMP59]] to i64
1518; CHECK-NEXT:    [[TMP62:%.*]] = mul i64 [[TMP60]], [[TMP61]]
1519; CHECK-NEXT:    [[TMP63:%.*]] = trunc i64 [[TMP62]] to i32
1520; CHECK-NEXT:    [[TMP64:%.*]] = lshr i64 [[TMP62]], 32
1521; CHECK-NEXT:    [[TMP65:%.*]] = trunc i64 [[TMP64]] to i32
1522; CHECK-NEXT:    [[TMP66:%.*]] = mul i32 [[TMP65]], [[TMP46]]
1523; CHECK-NEXT:    [[TMP67:%.*]] = sub i32 [[TMP45]], [[TMP66]]
1524; CHECK-NEXT:    [[TMP68:%.*]] = icmp uge i32 [[TMP67]], [[TMP46]]
1525; CHECK-NEXT:    [[TMP69:%.*]] = sub i32 [[TMP67]], [[TMP46]]
1526; CHECK-NEXT:    [[TMP70:%.*]] = select i1 [[TMP68]], i32 [[TMP69]], i32 [[TMP67]]
1527; CHECK-NEXT:    [[TMP71:%.*]] = icmp uge i32 [[TMP70]], [[TMP46]]
1528; CHECK-NEXT:    [[TMP72:%.*]] = sub i32 [[TMP70]], [[TMP46]]
1529; CHECK-NEXT:    [[TMP73:%.*]] = select i1 [[TMP71]], i32 [[TMP72]], i32 [[TMP70]]
1530; CHECK-NEXT:    [[TMP74:%.*]] = xor i32 [[TMP73]], [[TMP41]]
1531; CHECK-NEXT:    [[TMP75:%.*]] = sub i32 [[TMP74]], [[TMP41]]
1532; CHECK-NEXT:    [[TMP76:%.*]] = insertelement <4 x i32> [[TMP38]], i32 [[TMP75]], i64 1
1533; CHECK-NEXT:    [[TMP77:%.*]] = extractelement <4 x i32> [[X]], i64 2
1534; CHECK-NEXT:    [[TMP78:%.*]] = extractelement <4 x i32> [[Y]], i64 2
1535; CHECK-NEXT:    [[TMP79:%.*]] = ashr i32 [[TMP77]], 31
1536; CHECK-NEXT:    [[TMP80:%.*]] = ashr i32 [[TMP78]], 31
1537; CHECK-NEXT:    [[TMP81:%.*]] = add i32 [[TMP77]], [[TMP79]]
1538; CHECK-NEXT:    [[TMP82:%.*]] = add i32 [[TMP78]], [[TMP80]]
1539; CHECK-NEXT:    [[TMP83:%.*]] = xor i32 [[TMP81]], [[TMP79]]
1540; CHECK-NEXT:    [[TMP84:%.*]] = xor i32 [[TMP82]], [[TMP80]]
1541; CHECK-NEXT:    [[TMP85:%.*]] = uitofp i32 [[TMP84]] to float
1542; CHECK-NEXT:    [[TMP86:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP85]])
1543; CHECK-NEXT:    [[TMP87:%.*]] = fmul fast float [[TMP86]], 0x41EFFFFFC0000000
1544; CHECK-NEXT:    [[TMP88:%.*]] = fptoui float [[TMP87]] to i32
1545; CHECK-NEXT:    [[TMP89:%.*]] = sub i32 0, [[TMP84]]
1546; CHECK-NEXT:    [[TMP90:%.*]] = mul i32 [[TMP89]], [[TMP88]]
1547; CHECK-NEXT:    [[TMP91:%.*]] = zext i32 [[TMP88]] to i64
1548; CHECK-NEXT:    [[TMP92:%.*]] = zext i32 [[TMP90]] to i64
1549; CHECK-NEXT:    [[TMP93:%.*]] = mul i64 [[TMP91]], [[TMP92]]
1550; CHECK-NEXT:    [[TMP94:%.*]] = trunc i64 [[TMP93]] to i32
1551; CHECK-NEXT:    [[TMP95:%.*]] = lshr i64 [[TMP93]], 32
1552; CHECK-NEXT:    [[TMP96:%.*]] = trunc i64 [[TMP95]] to i32
1553; CHECK-NEXT:    [[TMP97:%.*]] = add i32 [[TMP88]], [[TMP96]]
1554; CHECK-NEXT:    [[TMP98:%.*]] = zext i32 [[TMP83]] to i64
1555; CHECK-NEXT:    [[TMP99:%.*]] = zext i32 [[TMP97]] to i64
1556; CHECK-NEXT:    [[TMP100:%.*]] = mul i64 [[TMP98]], [[TMP99]]
1557; CHECK-NEXT:    [[TMP101:%.*]] = trunc i64 [[TMP100]] to i32
1558; CHECK-NEXT:    [[TMP102:%.*]] = lshr i64 [[TMP100]], 32
1559; CHECK-NEXT:    [[TMP103:%.*]] = trunc i64 [[TMP102]] to i32
1560; CHECK-NEXT:    [[TMP104:%.*]] = mul i32 [[TMP103]], [[TMP84]]
1561; CHECK-NEXT:    [[TMP105:%.*]] = sub i32 [[TMP83]], [[TMP104]]
1562; CHECK-NEXT:    [[TMP106:%.*]] = icmp uge i32 [[TMP105]], [[TMP84]]
1563; CHECK-NEXT:    [[TMP107:%.*]] = sub i32 [[TMP105]], [[TMP84]]
1564; CHECK-NEXT:    [[TMP108:%.*]] = select i1 [[TMP106]], i32 [[TMP107]], i32 [[TMP105]]
1565; CHECK-NEXT:    [[TMP109:%.*]] = icmp uge i32 [[TMP108]], [[TMP84]]
1566; CHECK-NEXT:    [[TMP110:%.*]] = sub i32 [[TMP108]], [[TMP84]]
1567; CHECK-NEXT:    [[TMP111:%.*]] = select i1 [[TMP109]], i32 [[TMP110]], i32 [[TMP108]]
1568; CHECK-NEXT:    [[TMP112:%.*]] = xor i32 [[TMP111]], [[TMP79]]
1569; CHECK-NEXT:    [[TMP113:%.*]] = sub i32 [[TMP112]], [[TMP79]]
1570; CHECK-NEXT:    [[TMP114:%.*]] = insertelement <4 x i32> [[TMP76]], i32 [[TMP113]], i64 2
1571; CHECK-NEXT:    [[TMP115:%.*]] = extractelement <4 x i32> [[X]], i64 3
1572; CHECK-NEXT:    [[TMP116:%.*]] = extractelement <4 x i32> [[Y]], i64 3
1573; CHECK-NEXT:    [[TMP117:%.*]] = ashr i32 [[TMP115]], 31
1574; CHECK-NEXT:    [[TMP118:%.*]] = ashr i32 [[TMP116]], 31
1575; CHECK-NEXT:    [[TMP119:%.*]] = add i32 [[TMP115]], [[TMP117]]
1576; CHECK-NEXT:    [[TMP120:%.*]] = add i32 [[TMP116]], [[TMP118]]
1577; CHECK-NEXT:    [[TMP121:%.*]] = xor i32 [[TMP119]], [[TMP117]]
1578; CHECK-NEXT:    [[TMP122:%.*]] = xor i32 [[TMP120]], [[TMP118]]
1579; CHECK-NEXT:    [[TMP123:%.*]] = uitofp i32 [[TMP122]] to float
1580; CHECK-NEXT:    [[TMP124:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP123]])
1581; CHECK-NEXT:    [[TMP125:%.*]] = fmul fast float [[TMP124]], 0x41EFFFFFC0000000
1582; CHECK-NEXT:    [[TMP126:%.*]] = fptoui float [[TMP125]] to i32
1583; CHECK-NEXT:    [[TMP127:%.*]] = sub i32 0, [[TMP122]]
1584; CHECK-NEXT:    [[TMP128:%.*]] = mul i32 [[TMP127]], [[TMP126]]
1585; CHECK-NEXT:    [[TMP129:%.*]] = zext i32 [[TMP126]] to i64
1586; CHECK-NEXT:    [[TMP130:%.*]] = zext i32 [[TMP128]] to i64
1587; CHECK-NEXT:    [[TMP131:%.*]] = mul i64 [[TMP129]], [[TMP130]]
1588; CHECK-NEXT:    [[TMP132:%.*]] = trunc i64 [[TMP131]] to i32
1589; CHECK-NEXT:    [[TMP133:%.*]] = lshr i64 [[TMP131]], 32
1590; CHECK-NEXT:    [[TMP134:%.*]] = trunc i64 [[TMP133]] to i32
1591; CHECK-NEXT:    [[TMP135:%.*]] = add i32 [[TMP126]], [[TMP134]]
1592; CHECK-NEXT:    [[TMP136:%.*]] = zext i32 [[TMP121]] to i64
1593; CHECK-NEXT:    [[TMP137:%.*]] = zext i32 [[TMP135]] to i64
1594; CHECK-NEXT:    [[TMP138:%.*]] = mul i64 [[TMP136]], [[TMP137]]
1595; CHECK-NEXT:    [[TMP139:%.*]] = trunc i64 [[TMP138]] to i32
1596; CHECK-NEXT:    [[TMP140:%.*]] = lshr i64 [[TMP138]], 32
1597; CHECK-NEXT:    [[TMP141:%.*]] = trunc i64 [[TMP140]] to i32
1598; CHECK-NEXT:    [[TMP142:%.*]] = mul i32 [[TMP141]], [[TMP122]]
1599; CHECK-NEXT:    [[TMP143:%.*]] = sub i32 [[TMP121]], [[TMP142]]
1600; CHECK-NEXT:    [[TMP144:%.*]] = icmp uge i32 [[TMP143]], [[TMP122]]
1601; CHECK-NEXT:    [[TMP145:%.*]] = sub i32 [[TMP143]], [[TMP122]]
1602; CHECK-NEXT:    [[TMP146:%.*]] = select i1 [[TMP144]], i32 [[TMP145]], i32 [[TMP143]]
1603; CHECK-NEXT:    [[TMP147:%.*]] = icmp uge i32 [[TMP146]], [[TMP122]]
1604; CHECK-NEXT:    [[TMP148:%.*]] = sub i32 [[TMP146]], [[TMP122]]
1605; CHECK-NEXT:    [[TMP149:%.*]] = select i1 [[TMP147]], i32 [[TMP148]], i32 [[TMP146]]
1606; CHECK-NEXT:    [[TMP150:%.*]] = xor i32 [[TMP149]], [[TMP117]]
1607; CHECK-NEXT:    [[TMP151:%.*]] = sub i32 [[TMP150]], [[TMP117]]
1608; CHECK-NEXT:    [[TMP152:%.*]] = insertelement <4 x i32> [[TMP114]], i32 [[TMP151]], i64 3
1609; CHECK-NEXT:    store <4 x i32> [[TMP152]], <4 x i32> addrspace(1)* [[OUT:%.*]], align 16
1610; CHECK-NEXT:    ret void
1611;
1612; GCN-LABEL: srem_v4i32:
1613; GCN:       ; %bb.0:
1614; GCN-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0xd
1615; GCN-NEXT:    s_mov_b32 s13, 0x4f7ffffe
1616; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1617; GCN-NEXT:    s_mov_b32 s3, 0xf000
1618; GCN-NEXT:    s_waitcnt lgkmcnt(0)
1619; GCN-NEXT:    s_ashr_i32 s2, s8, 31
1620; GCN-NEXT:    s_add_i32 s8, s8, s2
1621; GCN-NEXT:    s_xor_b32 s12, s8, s2
1622; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s12
1623; GCN-NEXT:    s_ashr_i32 s8, s9, 31
1624; GCN-NEXT:    s_add_i32 s9, s9, s8
1625; GCN-NEXT:    s_xor_b32 s14, s9, s8
1626; GCN-NEXT:    v_rcp_iflag_f32_e32 v0, v0
1627; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s14
1628; GCN-NEXT:    s_sub_i32 s9, 0, s12
1629; GCN-NEXT:    s_ashr_i32 s8, s4, 31
1630; GCN-NEXT:    v_mul_f32_e32 v0, s13, v0
1631; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
1632; GCN-NEXT:    v_rcp_iflag_f32_e32 v1, v1
1633; GCN-NEXT:    s_add_i32 s4, s4, s8
1634; GCN-NEXT:    s_xor_b32 s4, s4, s8
1635; GCN-NEXT:    v_mul_lo_u32 v2, s9, v0
1636; GCN-NEXT:    v_mul_f32_e32 v1, s13, v1
1637; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
1638; GCN-NEXT:    s_sub_i32 s9, 0, s14
1639; GCN-NEXT:    v_mul_hi_u32 v2, v0, v2
1640; GCN-NEXT:    s_mov_b32 s2, -1
1641; GCN-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
1642; GCN-NEXT:    v_mul_hi_u32 v0, s4, v0
1643; GCN-NEXT:    v_mul_lo_u32 v2, s9, v1
1644; GCN-NEXT:    s_ashr_i32 s9, s5, 31
1645; GCN-NEXT:    s_add_i32 s5, s5, s9
1646; GCN-NEXT:    v_mul_lo_u32 v0, v0, s12
1647; GCN-NEXT:    v_mul_hi_u32 v2, v1, v2
1648; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
1649; GCN-NEXT:    s_xor_b32 s4, s5, s9
1650; GCN-NEXT:    s_ashr_i32 s5, s10, 31
1651; GCN-NEXT:    s_add_i32 s10, s10, s5
1652; GCN-NEXT:    v_subrev_i32_e32 v3, vcc, s12, v0
1653; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s12, v0
1654; GCN-NEXT:    s_xor_b32 s10, s10, s5
1655; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
1656; GCN-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
1657; GCN-NEXT:    v_cvt_f32_u32_e32 v2, s10
1658; GCN-NEXT:    v_mul_hi_u32 v1, s4, v1
1659; GCN-NEXT:    v_subrev_i32_e32 v3, vcc, s12, v0
1660; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s12, v0
1661; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v2
1662; GCN-NEXT:    v_mul_lo_u32 v1, v1, s14
1663; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
1664; GCN-NEXT:    v_xor_b32_e32 v0, s8, v0
1665; GCN-NEXT:    v_mul_f32_e32 v2, s13, v2
1666; GCN-NEXT:    v_cvt_u32_f32_e32 v2, v2
1667; GCN-NEXT:    v_sub_i32_e32 v1, vcc, s4, v1
1668; GCN-NEXT:    s_sub_i32 s4, 0, s10
1669; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, s8, v0
1670; GCN-NEXT:    v_subrev_i32_e32 v3, vcc, s14, v1
1671; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s14, v1
1672; GCN-NEXT:    v_mul_lo_u32 v4, s4, v2
1673; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
1674; GCN-NEXT:    v_subrev_i32_e32 v3, vcc, s14, v1
1675; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s14, v1
1676; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
1677; GCN-NEXT:    v_mul_hi_u32 v3, v2, v4
1678; GCN-NEXT:    s_ashr_i32 s4, s6, 31
1679; GCN-NEXT:    s_add_i32 s5, s6, s4
1680; GCN-NEXT:    s_ashr_i32 s6, s11, 31
1681; GCN-NEXT:    s_add_i32 s8, s11, s6
1682; GCN-NEXT:    s_xor_b32 s8, s8, s6
1683; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
1684; GCN-NEXT:    v_cvt_f32_u32_e32 v3, s8
1685; GCN-NEXT:    s_xor_b32 s5, s5, s4
1686; GCN-NEXT:    v_mul_hi_u32 v2, s5, v2
1687; GCN-NEXT:    v_xor_b32_e32 v1, s9, v1
1688; GCN-NEXT:    v_rcp_iflag_f32_e32 v3, v3
1689; GCN-NEXT:    v_subrev_i32_e32 v1, vcc, s9, v1
1690; GCN-NEXT:    v_mul_lo_u32 v2, v2, s10
1691; GCN-NEXT:    v_mul_f32_e32 v3, s13, v3
1692; GCN-NEXT:    v_cvt_u32_f32_e32 v3, v3
1693; GCN-NEXT:    v_sub_i32_e32 v2, vcc, s5, v2
1694; GCN-NEXT:    v_subrev_i32_e32 v4, vcc, s10, v2
1695; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s10, v2
1696; GCN-NEXT:    s_sub_i32 s5, 0, s8
1697; GCN-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
1698; GCN-NEXT:    v_mul_lo_u32 v4, s5, v3
1699; GCN-NEXT:    s_ashr_i32 s5, s7, 31
1700; GCN-NEXT:    s_add_i32 s6, s7, s5
1701; GCN-NEXT:    s_xor_b32 s6, s6, s5
1702; GCN-NEXT:    v_mul_hi_u32 v4, v3, v4
1703; GCN-NEXT:    v_subrev_i32_e32 v5, vcc, s10, v2
1704; GCN-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
1705; GCN-NEXT:    v_mul_hi_u32 v3, s6, v3
1706; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s10, v2
1707; GCN-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
1708; GCN-NEXT:    v_xor_b32_e32 v2, s4, v2
1709; GCN-NEXT:    v_mul_lo_u32 v3, v3, s8
1710; GCN-NEXT:    v_subrev_i32_e32 v2, vcc, s4, v2
1711; GCN-NEXT:    v_sub_i32_e32 v3, vcc, s6, v3
1712; GCN-NEXT:    v_subrev_i32_e32 v4, vcc, s8, v3
1713; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s8, v3
1714; GCN-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
1715; GCN-NEXT:    v_subrev_i32_e32 v4, vcc, s8, v3
1716; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s8, v3
1717; GCN-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
1718; GCN-NEXT:    v_xor_b32_e32 v3, s5, v3
1719; GCN-NEXT:    v_subrev_i32_e32 v3, vcc, s5, v3
1720; GCN-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
1721; GCN-NEXT:    s_endpgm
1722  %r = srem <4 x i32> %x, %y
1723  store <4 x i32> %r, <4 x i32> addrspace(1)* %out
1724  ret void
1725}
1726
; udiv_v4i16: kernel that stores the unsigned quotient of two <4 x i16>
; vectors to %out.
;
; The CHECK assertions (opt run) pin a per-lane expansion of the divide.
; For each of the four lanes, the i16 operands are zero-extended to i32 and
; converted to float, the quotient is estimated as trunc(x * rcp(y)), and an
; fmad-based remainder is compared against |y| to add a correction of 0 or 1.
; The sum is masked with 65535, truncated to i16 and inserted back into the
; result vector.
;
; The GCN assertions pin the machine code llc emits for the same kernel.
; All assertions in this function are autogenerated (see the notes at the top
; of the file); regenerate them with the update scripts instead of editing
; them by hand.
1727define amdgpu_kernel void @udiv_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x, <4 x i16> %y) {
1728; CHECK-LABEL: @udiv_v4i16(
1729; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i16> [[X:%.*]], i64 0
1730; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i16> [[Y:%.*]], i64 0
1731; CHECK-NEXT:    [[TMP3:%.*]] = zext i16 [[TMP1]] to i32
1732; CHECK-NEXT:    [[TMP4:%.*]] = zext i16 [[TMP2]] to i32
1733; CHECK-NEXT:    [[TMP5:%.*]] = uitofp i32 [[TMP3]] to float
1734; CHECK-NEXT:    [[TMP6:%.*]] = uitofp i32 [[TMP4]] to float
1735; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP6]])
1736; CHECK-NEXT:    [[TMP8:%.*]] = fmul fast float [[TMP5]], [[TMP7]]
1737; CHECK-NEXT:    [[TMP9:%.*]] = call fast float @llvm.trunc.f32(float [[TMP8]])
1738; CHECK-NEXT:    [[TMP10:%.*]] = fneg fast float [[TMP9]]
1739; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP10]], float [[TMP6]], float [[TMP5]])
1740; CHECK-NEXT:    [[TMP12:%.*]] = fptoui float [[TMP9]] to i32
1741; CHECK-NEXT:    [[TMP13:%.*]] = call fast float @llvm.fabs.f32(float [[TMP11]])
1742; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP6]])
1743; CHECK-NEXT:    [[TMP15:%.*]] = fcmp fast oge float [[TMP13]], [[TMP14]]
1744; CHECK-NEXT:    [[TMP16:%.*]] = select i1 [[TMP15]], i32 1, i32 0
1745; CHECK-NEXT:    [[TMP17:%.*]] = add i32 [[TMP12]], [[TMP16]]
1746; CHECK-NEXT:    [[TMP18:%.*]] = and i32 [[TMP17]], 65535
1747; CHECK-NEXT:    [[TMP19:%.*]] = trunc i32 [[TMP18]] to i16
1748; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <4 x i16> undef, i16 [[TMP19]], i64 0
1749; CHECK-NEXT:    [[TMP21:%.*]] = extractelement <4 x i16> [[X]], i64 1
1750; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <4 x i16> [[Y]], i64 1
1751; CHECK-NEXT:    [[TMP23:%.*]] = zext i16 [[TMP21]] to i32
1752; CHECK-NEXT:    [[TMP24:%.*]] = zext i16 [[TMP22]] to i32
1753; CHECK-NEXT:    [[TMP25:%.*]] = uitofp i32 [[TMP23]] to float
1754; CHECK-NEXT:    [[TMP26:%.*]] = uitofp i32 [[TMP24]] to float
1755; CHECK-NEXT:    [[TMP27:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP26]])
1756; CHECK-NEXT:    [[TMP28:%.*]] = fmul fast float [[TMP25]], [[TMP27]]
1757; CHECK-NEXT:    [[TMP29:%.*]] = call fast float @llvm.trunc.f32(float [[TMP28]])
1758; CHECK-NEXT:    [[TMP30:%.*]] = fneg fast float [[TMP29]]
1759; CHECK-NEXT:    [[TMP31:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP30]], float [[TMP26]], float [[TMP25]])
1760; CHECK-NEXT:    [[TMP32:%.*]] = fptoui float [[TMP29]] to i32
1761; CHECK-NEXT:    [[TMP33:%.*]] = call fast float @llvm.fabs.f32(float [[TMP31]])
1762; CHECK-NEXT:    [[TMP34:%.*]] = call fast float @llvm.fabs.f32(float [[TMP26]])
1763; CHECK-NEXT:    [[TMP35:%.*]] = fcmp fast oge float [[TMP33]], [[TMP34]]
1764; CHECK-NEXT:    [[TMP36:%.*]] = select i1 [[TMP35]], i32 1, i32 0
1765; CHECK-NEXT:    [[TMP37:%.*]] = add i32 [[TMP32]], [[TMP36]]
1766; CHECK-NEXT:    [[TMP38:%.*]] = and i32 [[TMP37]], 65535
1767; CHECK-NEXT:    [[TMP39:%.*]] = trunc i32 [[TMP38]] to i16
1768; CHECK-NEXT:    [[TMP40:%.*]] = insertelement <4 x i16> [[TMP20]], i16 [[TMP39]], i64 1
1769; CHECK-NEXT:    [[TMP41:%.*]] = extractelement <4 x i16> [[X]], i64 2
1770; CHECK-NEXT:    [[TMP42:%.*]] = extractelement <4 x i16> [[Y]], i64 2
1771; CHECK-NEXT:    [[TMP43:%.*]] = zext i16 [[TMP41]] to i32
1772; CHECK-NEXT:    [[TMP44:%.*]] = zext i16 [[TMP42]] to i32
1773; CHECK-NEXT:    [[TMP45:%.*]] = uitofp i32 [[TMP43]] to float
1774; CHECK-NEXT:    [[TMP46:%.*]] = uitofp i32 [[TMP44]] to float
1775; CHECK-NEXT:    [[TMP47:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP46]])
1776; CHECK-NEXT:    [[TMP48:%.*]] = fmul fast float [[TMP45]], [[TMP47]]
1777; CHECK-NEXT:    [[TMP49:%.*]] = call fast float @llvm.trunc.f32(float [[TMP48]])
1778; CHECK-NEXT:    [[TMP50:%.*]] = fneg fast float [[TMP49]]
1779; CHECK-NEXT:    [[TMP51:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP50]], float [[TMP46]], float [[TMP45]])
1780; CHECK-NEXT:    [[TMP52:%.*]] = fptoui float [[TMP49]] to i32
1781; CHECK-NEXT:    [[TMP53:%.*]] = call fast float @llvm.fabs.f32(float [[TMP51]])
1782; CHECK-NEXT:    [[TMP54:%.*]] = call fast float @llvm.fabs.f32(float [[TMP46]])
1783; CHECK-NEXT:    [[TMP55:%.*]] = fcmp fast oge float [[TMP53]], [[TMP54]]
1784; CHECK-NEXT:    [[TMP56:%.*]] = select i1 [[TMP55]], i32 1, i32 0
1785; CHECK-NEXT:    [[TMP57:%.*]] = add i32 [[TMP52]], [[TMP56]]
1786; CHECK-NEXT:    [[TMP58:%.*]] = and i32 [[TMP57]], 65535
1787; CHECK-NEXT:    [[TMP59:%.*]] = trunc i32 [[TMP58]] to i16
1788; CHECK-NEXT:    [[TMP60:%.*]] = insertelement <4 x i16> [[TMP40]], i16 [[TMP59]], i64 2
1789; CHECK-NEXT:    [[TMP61:%.*]] = extractelement <4 x i16> [[X]], i64 3
1790; CHECK-NEXT:    [[TMP62:%.*]] = extractelement <4 x i16> [[Y]], i64 3
1791; CHECK-NEXT:    [[TMP63:%.*]] = zext i16 [[TMP61]] to i32
1792; CHECK-NEXT:    [[TMP64:%.*]] = zext i16 [[TMP62]] to i32
1793; CHECK-NEXT:    [[TMP65:%.*]] = uitofp i32 [[TMP63]] to float
1794; CHECK-NEXT:    [[TMP66:%.*]] = uitofp i32 [[TMP64]] to float
1795; CHECK-NEXT:    [[TMP67:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP66]])
1796; CHECK-NEXT:    [[TMP68:%.*]] = fmul fast float [[TMP65]], [[TMP67]]
1797; CHECK-NEXT:    [[TMP69:%.*]] = call fast float @llvm.trunc.f32(float [[TMP68]])
1798; CHECK-NEXT:    [[TMP70:%.*]] = fneg fast float [[TMP69]]
1799; CHECK-NEXT:    [[TMP71:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP70]], float [[TMP66]], float [[TMP65]])
1800; CHECK-NEXT:    [[TMP72:%.*]] = fptoui float [[TMP69]] to i32
1801; CHECK-NEXT:    [[TMP73:%.*]] = call fast float @llvm.fabs.f32(float [[TMP71]])
1802; CHECK-NEXT:    [[TMP74:%.*]] = call fast float @llvm.fabs.f32(float [[TMP66]])
1803; CHECK-NEXT:    [[TMP75:%.*]] = fcmp fast oge float [[TMP73]], [[TMP74]]
1804; CHECK-NEXT:    [[TMP76:%.*]] = select i1 [[TMP75]], i32 1, i32 0
1805; CHECK-NEXT:    [[TMP77:%.*]] = add i32 [[TMP72]], [[TMP76]]
1806; CHECK-NEXT:    [[TMP78:%.*]] = and i32 [[TMP77]], 65535
1807; CHECK-NEXT:    [[TMP79:%.*]] = trunc i32 [[TMP78]] to i16
1808; CHECK-NEXT:    [[TMP80:%.*]] = insertelement <4 x i16> [[TMP60]], i16 [[TMP79]], i64 3
1809; CHECK-NEXT:    store <4 x i16> [[TMP80]], <4 x i16> addrspace(1)* [[OUT:%.*]], align 8
1810; CHECK-NEXT:    ret void
1811;
1812; GCN-LABEL: udiv_v4i16:
1813; GCN:       ; %bb.0:
1814; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
1815; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xb
1816; GCN-NEXT:    s_mov_b32 s8, 0xffff
1817; GCN-NEXT:    s_mov_b32 s7, 0xf000
1818; GCN-NEXT:    s_mov_b32 s6, -1
1819; GCN-NEXT:    s_waitcnt lgkmcnt(0)
1820; GCN-NEXT:    s_and_b32 s9, s2, s8
1821; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s9
1822; GCN-NEXT:    s_lshr_b32 s9, s0, 16
1823; GCN-NEXT:    s_and_b32 s0, s0, s8
1824; GCN-NEXT:    s_lshr_b32 s2, s2, 16
1825; GCN-NEXT:    v_cvt_f32_u32_e32 v3, s2
1826; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s0
1827; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v0
1828; GCN-NEXT:    v_cvt_f32_u32_e32 v4, s9
1829; GCN-NEXT:    v_rcp_iflag_f32_e32 v5, v3
1830; GCN-NEXT:    s_and_b32 s2, s3, s8
1831; GCN-NEXT:    v_mul_f32_e32 v2, v1, v2
1832; GCN-NEXT:    v_trunc_f32_e32 v2, v2
1833; GCN-NEXT:    v_mad_f32 v1, -v2, v0, v1
1834; GCN-NEXT:    v_cvt_u32_f32_e32 v2, v2
1835; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
1836; GCN-NEXT:    v_mul_f32_e32 v1, v4, v5
1837; GCN-NEXT:    v_trunc_f32_e32 v1, v1
1838; GCN-NEXT:    v_addc_u32_e32 v0, vcc, 0, v2, vcc
1839; GCN-NEXT:    v_mad_f32 v2, -v1, v3, v4
1840; GCN-NEXT:    v_cvt_f32_u32_e32 v4, s2
1841; GCN-NEXT:    s_lshr_b32 s0, s1, 16
1842; GCN-NEXT:    s_and_b32 s1, s1, s8
1843; GCN-NEXT:    s_lshr_b32 s10, s3, 16
1844; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v3
1845; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
1846; GCN-NEXT:    v_cvt_f32_u32_e32 v3, s10
1847; GCN-NEXT:    v_cvt_f32_u32_e32 v5, s1
1848; GCN-NEXT:    v_rcp_iflag_f32_e32 v6, v4
1849; GCN-NEXT:    v_addc_u32_e32 v2, vcc, 0, v1, vcc
1850; GCN-NEXT:    v_rcp_iflag_f32_e32 v7, v3
1851; GCN-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
1852; GCN-NEXT:    v_mul_f32_e32 v1, v5, v6
1853; GCN-NEXT:    v_cvt_f32_u32_e32 v6, s0
1854; GCN-NEXT:    v_trunc_f32_e32 v1, v1
1855; GCN-NEXT:    v_mad_f32 v5, -v1, v4, v5
1856; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v5|, v4
1857; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
1858; GCN-NEXT:    v_mul_f32_e32 v4, v6, v7
1859; GCN-NEXT:    v_trunc_f32_e32 v4, v4
1860; GCN-NEXT:    v_cvt_u32_f32_e32 v5, v4
1861; GCN-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1862; GCN-NEXT:    v_mad_f32 v4, -v4, v3, v6
1863; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v4|, v3
1864; GCN-NEXT:    v_addc_u32_e32 v3, vcc, 0, v5, vcc
1865; GCN-NEXT:    v_and_b32_e32 v0, s8, v0
1866; GCN-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
1867; GCN-NEXT:    v_and_b32_e32 v1, s8, v1
1868; GCN-NEXT:    v_or_b32_e32 v1, v1, v3
1869; GCN-NEXT:    v_or_b32_e32 v0, v0, v2
1870; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1871; GCN-NEXT:    s_endpgm
; Input IR under test: a single vector udiv whose result is stored to %out.
1872  %r = udiv <4 x i16> %x, %y
1873  store <4 x i16> %r, <4 x i16> addrspace(1)* %out
1874  ret void
1875}
1876
; urem_v4i16: kernel that stores the unsigned remainder of two <4 x i16>
; vectors to %out.
;
; The CHECK assertions (opt run) pin a per-lane expansion. Each lane first
; computes a quotient the same way the expected output for the unsigned
; divide does: zext to i32, uitofp, trunc(x * rcp(y)), plus a 0/1 correction
; taken from an fmad-based remainder compared against |y|.
; The remainder is then formed as x - quotient * y, masked with 65535,
; truncated to i16 and inserted back into the result vector.
;
; The GCN assertions pin the machine code llc emits for the same kernel.
; All assertions in this function are autogenerated (see the notes at the top
; of the file); regenerate them with the update scripts instead of editing
; them by hand.
1877define amdgpu_kernel void @urem_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x, <4 x i16> %y) {
1878; CHECK-LABEL: @urem_v4i16(
1879; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i16> [[X:%.*]], i64 0
1880; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i16> [[Y:%.*]], i64 0
1881; CHECK-NEXT:    [[TMP3:%.*]] = zext i16 [[TMP1]] to i32
1882; CHECK-NEXT:    [[TMP4:%.*]] = zext i16 [[TMP2]] to i32
1883; CHECK-NEXT:    [[TMP5:%.*]] = uitofp i32 [[TMP3]] to float
1884; CHECK-NEXT:    [[TMP6:%.*]] = uitofp i32 [[TMP4]] to float
1885; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP6]])
1886; CHECK-NEXT:    [[TMP8:%.*]] = fmul fast float [[TMP5]], [[TMP7]]
1887; CHECK-NEXT:    [[TMP9:%.*]] = call fast float @llvm.trunc.f32(float [[TMP8]])
1888; CHECK-NEXT:    [[TMP10:%.*]] = fneg fast float [[TMP9]]
1889; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP10]], float [[TMP6]], float [[TMP5]])
1890; CHECK-NEXT:    [[TMP12:%.*]] = fptoui float [[TMP9]] to i32
1891; CHECK-NEXT:    [[TMP13:%.*]] = call fast float @llvm.fabs.f32(float [[TMP11]])
1892; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP6]])
1893; CHECK-NEXT:    [[TMP15:%.*]] = fcmp fast oge float [[TMP13]], [[TMP14]]
1894; CHECK-NEXT:    [[TMP16:%.*]] = select i1 [[TMP15]], i32 1, i32 0
1895; CHECK-NEXT:    [[TMP17:%.*]] = add i32 [[TMP12]], [[TMP16]]
1896; CHECK-NEXT:    [[TMP18:%.*]] = mul i32 [[TMP17]], [[TMP4]]
1897; CHECK-NEXT:    [[TMP19:%.*]] = sub i32 [[TMP3]], [[TMP18]]
1898; CHECK-NEXT:    [[TMP20:%.*]] = and i32 [[TMP19]], 65535
1899; CHECK-NEXT:    [[TMP21:%.*]] = trunc i32 [[TMP20]] to i16
1900; CHECK-NEXT:    [[TMP22:%.*]] = insertelement <4 x i16> undef, i16 [[TMP21]], i64 0
1901; CHECK-NEXT:    [[TMP23:%.*]] = extractelement <4 x i16> [[X]], i64 1
1902; CHECK-NEXT:    [[TMP24:%.*]] = extractelement <4 x i16> [[Y]], i64 1
1903; CHECK-NEXT:    [[TMP25:%.*]] = zext i16 [[TMP23]] to i32
1904; CHECK-NEXT:    [[TMP26:%.*]] = zext i16 [[TMP24]] to i32
1905; CHECK-NEXT:    [[TMP27:%.*]] = uitofp i32 [[TMP25]] to float
1906; CHECK-NEXT:    [[TMP28:%.*]] = uitofp i32 [[TMP26]] to float
1907; CHECK-NEXT:    [[TMP29:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP28]])
1908; CHECK-NEXT:    [[TMP30:%.*]] = fmul fast float [[TMP27]], [[TMP29]]
1909; CHECK-NEXT:    [[TMP31:%.*]] = call fast float @llvm.trunc.f32(float [[TMP30]])
1910; CHECK-NEXT:    [[TMP32:%.*]] = fneg fast float [[TMP31]]
1911; CHECK-NEXT:    [[TMP33:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP32]], float [[TMP28]], float [[TMP27]])
1912; CHECK-NEXT:    [[TMP34:%.*]] = fptoui float [[TMP31]] to i32
1913; CHECK-NEXT:    [[TMP35:%.*]] = call fast float @llvm.fabs.f32(float [[TMP33]])
1914; CHECK-NEXT:    [[TMP36:%.*]] = call fast float @llvm.fabs.f32(float [[TMP28]])
1915; CHECK-NEXT:    [[TMP37:%.*]] = fcmp fast oge float [[TMP35]], [[TMP36]]
1916; CHECK-NEXT:    [[TMP38:%.*]] = select i1 [[TMP37]], i32 1, i32 0
1917; CHECK-NEXT:    [[TMP39:%.*]] = add i32 [[TMP34]], [[TMP38]]
1918; CHECK-NEXT:    [[TMP40:%.*]] = mul i32 [[TMP39]], [[TMP26]]
1919; CHECK-NEXT:    [[TMP41:%.*]] = sub i32 [[TMP25]], [[TMP40]]
1920; CHECK-NEXT:    [[TMP42:%.*]] = and i32 [[TMP41]], 65535
1921; CHECK-NEXT:    [[TMP43:%.*]] = trunc i32 [[TMP42]] to i16
1922; CHECK-NEXT:    [[TMP44:%.*]] = insertelement <4 x i16> [[TMP22]], i16 [[TMP43]], i64 1
1923; CHECK-NEXT:    [[TMP45:%.*]] = extractelement <4 x i16> [[X]], i64 2
1924; CHECK-NEXT:    [[TMP46:%.*]] = extractelement <4 x i16> [[Y]], i64 2
1925; CHECK-NEXT:    [[TMP47:%.*]] = zext i16 [[TMP45]] to i32
1926; CHECK-NEXT:    [[TMP48:%.*]] = zext i16 [[TMP46]] to i32
1927; CHECK-NEXT:    [[TMP49:%.*]] = uitofp i32 [[TMP47]] to float
1928; CHECK-NEXT:    [[TMP50:%.*]] = uitofp i32 [[TMP48]] to float
1929; CHECK-NEXT:    [[TMP51:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP50]])
1930; CHECK-NEXT:    [[TMP52:%.*]] = fmul fast float [[TMP49]], [[TMP51]]
1931; CHECK-NEXT:    [[TMP53:%.*]] = call fast float @llvm.trunc.f32(float [[TMP52]])
1932; CHECK-NEXT:    [[TMP54:%.*]] = fneg fast float [[TMP53]]
1933; CHECK-NEXT:    [[TMP55:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP54]], float [[TMP50]], float [[TMP49]])
1934; CHECK-NEXT:    [[TMP56:%.*]] = fptoui float [[TMP53]] to i32
1935; CHECK-NEXT:    [[TMP57:%.*]] = call fast float @llvm.fabs.f32(float [[TMP55]])
1936; CHECK-NEXT:    [[TMP58:%.*]] = call fast float @llvm.fabs.f32(float [[TMP50]])
1937; CHECK-NEXT:    [[TMP59:%.*]] = fcmp fast oge float [[TMP57]], [[TMP58]]
1938; CHECK-NEXT:    [[TMP60:%.*]] = select i1 [[TMP59]], i32 1, i32 0
1939; CHECK-NEXT:    [[TMP61:%.*]] = add i32 [[TMP56]], [[TMP60]]
1940; CHECK-NEXT:    [[TMP62:%.*]] = mul i32 [[TMP61]], [[TMP48]]
1941; CHECK-NEXT:    [[TMP63:%.*]] = sub i32 [[TMP47]], [[TMP62]]
1942; CHECK-NEXT:    [[TMP64:%.*]] = and i32 [[TMP63]], 65535
1943; CHECK-NEXT:    [[TMP65:%.*]] = trunc i32 [[TMP64]] to i16
1944; CHECK-NEXT:    [[TMP66:%.*]] = insertelement <4 x i16> [[TMP44]], i16 [[TMP65]], i64 2
1945; CHECK-NEXT:    [[TMP67:%.*]] = extractelement <4 x i16> [[X]], i64 3
1946; CHECK-NEXT:    [[TMP68:%.*]] = extractelement <4 x i16> [[Y]], i64 3
1947; CHECK-NEXT:    [[TMP69:%.*]] = zext i16 [[TMP67]] to i32
1948; CHECK-NEXT:    [[TMP70:%.*]] = zext i16 [[TMP68]] to i32
1949; CHECK-NEXT:    [[TMP71:%.*]] = uitofp i32 [[TMP69]] to float
1950; CHECK-NEXT:    [[TMP72:%.*]] = uitofp i32 [[TMP70]] to float
1951; CHECK-NEXT:    [[TMP73:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP72]])
1952; CHECK-NEXT:    [[TMP74:%.*]] = fmul fast float [[TMP71]], [[TMP73]]
1953; CHECK-NEXT:    [[TMP75:%.*]] = call fast float @llvm.trunc.f32(float [[TMP74]])
1954; CHECK-NEXT:    [[TMP76:%.*]] = fneg fast float [[TMP75]]
1955; CHECK-NEXT:    [[TMP77:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP76]], float [[TMP72]], float [[TMP71]])
1956; CHECK-NEXT:    [[TMP78:%.*]] = fptoui float [[TMP75]] to i32
1957; CHECK-NEXT:    [[TMP79:%.*]] = call fast float @llvm.fabs.f32(float [[TMP77]])
1958; CHECK-NEXT:    [[TMP80:%.*]] = call fast float @llvm.fabs.f32(float [[TMP72]])
1959; CHECK-NEXT:    [[TMP81:%.*]] = fcmp fast oge float [[TMP79]], [[TMP80]]
1960; CHECK-NEXT:    [[TMP82:%.*]] = select i1 [[TMP81]], i32 1, i32 0
1961; CHECK-NEXT:    [[TMP83:%.*]] = add i32 [[TMP78]], [[TMP82]]
1962; CHECK-NEXT:    [[TMP84:%.*]] = mul i32 [[TMP83]], [[TMP70]]
1963; CHECK-NEXT:    [[TMP85:%.*]] = sub i32 [[TMP69]], [[TMP84]]
1964; CHECK-NEXT:    [[TMP86:%.*]] = and i32 [[TMP85]], 65535
1965; CHECK-NEXT:    [[TMP87:%.*]] = trunc i32 [[TMP86]] to i16
1966; CHECK-NEXT:    [[TMP88:%.*]] = insertelement <4 x i16> [[TMP66]], i16 [[TMP87]], i64 3
1967; CHECK-NEXT:    store <4 x i16> [[TMP88]], <4 x i16> addrspace(1)* [[OUT:%.*]], align 8
1968; CHECK-NEXT:    ret void
1969;
1970; GCN-LABEL: urem_v4i16:
1971; GCN:       ; %bb.0:
1972; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
1973; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xb
1974; GCN-NEXT:    s_mov_b32 s8, 0xffff
1975; GCN-NEXT:    s_mov_b32 s7, 0xf000
1976; GCN-NEXT:    s_mov_b32 s6, -1
1977; GCN-NEXT:    s_waitcnt lgkmcnt(0)
1978; GCN-NEXT:    s_and_b32 s9, s2, s8
1979; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s9
1980; GCN-NEXT:    s_and_b32 s10, s0, s8
1981; GCN-NEXT:    s_lshr_b32 s11, s2, 16
1982; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s10
1983; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v0
1984; GCN-NEXT:    v_cvt_f32_u32_e32 v3, s11
1985; GCN-NEXT:    s_lshr_b32 s9, s0, 16
1986; GCN-NEXT:    v_cvt_f32_u32_e32 v4, s9
1987; GCN-NEXT:    v_mul_f32_e32 v2, v1, v2
1988; GCN-NEXT:    v_rcp_iflag_f32_e32 v5, v3
1989; GCN-NEXT:    v_trunc_f32_e32 v2, v2
1990; GCN-NEXT:    v_mad_f32 v1, -v2, v0, v1
1991; GCN-NEXT:    v_cvt_u32_f32_e32 v2, v2
1992; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
1993; GCN-NEXT:    v_mul_f32_e32 v1, v4, v5
1994; GCN-NEXT:    v_trunc_f32_e32 v1, v1
1995; GCN-NEXT:    v_addc_u32_e32 v0, vcc, 0, v2, vcc
1996; GCN-NEXT:    v_cvt_u32_f32_e32 v2, v1
1997; GCN-NEXT:    v_mad_f32 v1, -v1, v3, v4
1998; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v3
1999; GCN-NEXT:    v_mul_lo_u32 v0, v0, s2
2000; GCN-NEXT:    s_and_b32 s2, s3, s8
2001; GCN-NEXT:    v_addc_u32_e32 v1, vcc, 0, v2, vcc
2002; GCN-NEXT:    v_cvt_f32_u32_e32 v2, s2
2003; GCN-NEXT:    s_and_b32 s2, s1, s8
2004; GCN-NEXT:    v_mul_lo_u32 v1, v1, s11
2005; GCN-NEXT:    v_cvt_f32_u32_e32 v3, s2
2006; GCN-NEXT:    v_rcp_iflag_f32_e32 v4, v2
2007; GCN-NEXT:    s_lshr_b32 s12, s3, 16
2008; GCN-NEXT:    v_sub_i32_e32 v5, vcc, s9, v1
2009; GCN-NEXT:    s_lshr_b32 s10, s1, 16
2010; GCN-NEXT:    v_mul_f32_e32 v1, v3, v4
2011; GCN-NEXT:    v_cvt_f32_u32_e32 v4, s12
2012; GCN-NEXT:    v_cvt_f32_u32_e32 v6, s10
2013; GCN-NEXT:    v_trunc_f32_e32 v1, v1
2014; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
2015; GCN-NEXT:    v_rcp_iflag_f32_e32 v7, v4
2016; GCN-NEXT:    v_mad_f32 v3, -v1, v2, v3
2017; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v2
2018; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
2019; GCN-NEXT:    v_mul_f32_e32 v2, v6, v7
2020; GCN-NEXT:    v_trunc_f32_e32 v2, v2
2021; GCN-NEXT:    v_cvt_u32_f32_e32 v3, v2
2022; GCN-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2023; GCN-NEXT:    v_mad_f32 v2, -v2, v4, v6
2024; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v4
2025; GCN-NEXT:    v_addc_u32_e32 v2, vcc, 0, v3, vcc
2026; GCN-NEXT:    v_mul_lo_u32 v1, v1, s3
2027; GCN-NEXT:    v_mul_lo_u32 v2, v2, s12
2028; GCN-NEXT:    v_and_b32_e32 v0, s8, v0
2029; GCN-NEXT:    v_sub_i32_e32 v1, vcc, s1, v1
2030; GCN-NEXT:    v_sub_i32_e32 v2, vcc, s10, v2
2031; GCN-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
2032; GCN-NEXT:    v_and_b32_e32 v1, s8, v1
2033; GCN-NEXT:    v_or_b32_e32 v1, v1, v2
2034; GCN-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
2035; GCN-NEXT:    v_or_b32_e32 v0, v0, v2
2036; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
2037; GCN-NEXT:    s_endpgm
; Input IR under test: a single vector urem whose result is stored to %out.
2038  %r = urem <4 x i16> %x, %y
2039  store <4 x i16> %r, <4 x i16> addrspace(1)* %out
2040  ret void
2041}
2042
; sdiv_v4i16: kernel that stores the signed quotient of two <4 x i16>
; vectors to %out.
;
; The CHECK assertions (opt run) pin a per-lane expansion. For each lane the
; i16 operands are sign-extended to i32. The xor / ashr 30 / or 1 sequence
; yields -1 when the operand signs differ and 1 otherwise; it is the value
; added when a correction is needed. The quotient is estimated as
; trunc(x * rcp(y)) on sitofp-converted operands and converted back with
; fptosi. An fmad-based remainder compared against |y| selects either that
; signed correction or 0. The shl 16 / ashr 16 pair sign-extends the low 16
; bits of the sum before it is truncated to i16 and inserted into the result
; vector.
;
; The GCN assertions pin the machine code llc emits for the same kernel.
; All assertions in this function are autogenerated (see the notes at the top
; of the file); regenerate them with the update scripts instead of editing
; them by hand.
2043define amdgpu_kernel void @sdiv_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x, <4 x i16> %y) {
2044; CHECK-LABEL: @sdiv_v4i16(
2045; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i16> [[X:%.*]], i64 0
2046; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i16> [[Y:%.*]], i64 0
2047; CHECK-NEXT:    [[TMP3:%.*]] = sext i16 [[TMP1]] to i32
2048; CHECK-NEXT:    [[TMP4:%.*]] = sext i16 [[TMP2]] to i32
2049; CHECK-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]]
2050; CHECK-NEXT:    [[TMP6:%.*]] = ashr i32 [[TMP5]], 30
2051; CHECK-NEXT:    [[TMP7:%.*]] = or i32 [[TMP6]], 1
2052; CHECK-NEXT:    [[TMP8:%.*]] = sitofp i32 [[TMP3]] to float
2053; CHECK-NEXT:    [[TMP9:%.*]] = sitofp i32 [[TMP4]] to float
2054; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]])
2055; CHECK-NEXT:    [[TMP11:%.*]] = fmul fast float [[TMP8]], [[TMP10]]
2056; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.trunc.f32(float [[TMP11]])
2057; CHECK-NEXT:    [[TMP13:%.*]] = fneg fast float [[TMP12]]
2058; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP13]], float [[TMP9]], float [[TMP8]])
2059; CHECK-NEXT:    [[TMP15:%.*]] = fptosi float [[TMP12]] to i32
2060; CHECK-NEXT:    [[TMP16:%.*]] = call fast float @llvm.fabs.f32(float [[TMP14]])
2061; CHECK-NEXT:    [[TMP17:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]])
2062; CHECK-NEXT:    [[TMP18:%.*]] = fcmp fast oge float [[TMP16]], [[TMP17]]
2063; CHECK-NEXT:    [[TMP19:%.*]] = select i1 [[TMP18]], i32 [[TMP7]], i32 0
2064; CHECK-NEXT:    [[TMP20:%.*]] = add i32 [[TMP15]], [[TMP19]]
2065; CHECK-NEXT:    [[TMP21:%.*]] = shl i32 [[TMP20]], 16
2066; CHECK-NEXT:    [[TMP22:%.*]] = ashr i32 [[TMP21]], 16
2067; CHECK-NEXT:    [[TMP23:%.*]] = trunc i32 [[TMP22]] to i16
2068; CHECK-NEXT:    [[TMP24:%.*]] = insertelement <4 x i16> undef, i16 [[TMP23]], i64 0
2069; CHECK-NEXT:    [[TMP25:%.*]] = extractelement <4 x i16> [[X]], i64 1
2070; CHECK-NEXT:    [[TMP26:%.*]] = extractelement <4 x i16> [[Y]], i64 1
2071; CHECK-NEXT:    [[TMP27:%.*]] = sext i16 [[TMP25]] to i32
2072; CHECK-NEXT:    [[TMP28:%.*]] = sext i16 [[TMP26]] to i32
2073; CHECK-NEXT:    [[TMP29:%.*]] = xor i32 [[TMP27]], [[TMP28]]
2074; CHECK-NEXT:    [[TMP30:%.*]] = ashr i32 [[TMP29]], 30
2075; CHECK-NEXT:    [[TMP31:%.*]] = or i32 [[TMP30]], 1
2076; CHECK-NEXT:    [[TMP32:%.*]] = sitofp i32 [[TMP27]] to float
2077; CHECK-NEXT:    [[TMP33:%.*]] = sitofp i32 [[TMP28]] to float
2078; CHECK-NEXT:    [[TMP34:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP33]])
2079; CHECK-NEXT:    [[TMP35:%.*]] = fmul fast float [[TMP32]], [[TMP34]]
2080; CHECK-NEXT:    [[TMP36:%.*]] = call fast float @llvm.trunc.f32(float [[TMP35]])
2081; CHECK-NEXT:    [[TMP37:%.*]] = fneg fast float [[TMP36]]
2082; CHECK-NEXT:    [[TMP38:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP37]], float [[TMP33]], float [[TMP32]])
2083; CHECK-NEXT:    [[TMP39:%.*]] = fptosi float [[TMP36]] to i32
2084; CHECK-NEXT:    [[TMP40:%.*]] = call fast float @llvm.fabs.f32(float [[TMP38]])
2085; CHECK-NEXT:    [[TMP41:%.*]] = call fast float @llvm.fabs.f32(float [[TMP33]])
2086; CHECK-NEXT:    [[TMP42:%.*]] = fcmp fast oge float [[TMP40]], [[TMP41]]
2087; CHECK-NEXT:    [[TMP43:%.*]] = select i1 [[TMP42]], i32 [[TMP31]], i32 0
2088; CHECK-NEXT:    [[TMP44:%.*]] = add i32 [[TMP39]], [[TMP43]]
2089; CHECK-NEXT:    [[TMP45:%.*]] = shl i32 [[TMP44]], 16
2090; CHECK-NEXT:    [[TMP46:%.*]] = ashr i32 [[TMP45]], 16
2091; CHECK-NEXT:    [[TMP47:%.*]] = trunc i32 [[TMP46]] to i16
2092; CHECK-NEXT:    [[TMP48:%.*]] = insertelement <4 x i16> [[TMP24]], i16 [[TMP47]], i64 1
2093; CHECK-NEXT:    [[TMP49:%.*]] = extractelement <4 x i16> [[X]], i64 2
2094; CHECK-NEXT:    [[TMP50:%.*]] = extractelement <4 x i16> [[Y]], i64 2
2095; CHECK-NEXT:    [[TMP51:%.*]] = sext i16 [[TMP49]] to i32
2096; CHECK-NEXT:    [[TMP52:%.*]] = sext i16 [[TMP50]] to i32
2097; CHECK-NEXT:    [[TMP53:%.*]] = xor i32 [[TMP51]], [[TMP52]]
2098; CHECK-NEXT:    [[TMP54:%.*]] = ashr i32 [[TMP53]], 30
2099; CHECK-NEXT:    [[TMP55:%.*]] = or i32 [[TMP54]], 1
2100; CHECK-NEXT:    [[TMP56:%.*]] = sitofp i32 [[TMP51]] to float
2101; CHECK-NEXT:    [[TMP57:%.*]] = sitofp i32 [[TMP52]] to float
2102; CHECK-NEXT:    [[TMP58:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP57]])
2103; CHECK-NEXT:    [[TMP59:%.*]] = fmul fast float [[TMP56]], [[TMP58]]
2104; CHECK-NEXT:    [[TMP60:%.*]] = call fast float @llvm.trunc.f32(float [[TMP59]])
2105; CHECK-NEXT:    [[TMP61:%.*]] = fneg fast float [[TMP60]]
2106; CHECK-NEXT:    [[TMP62:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP61]], float [[TMP57]], float [[TMP56]])
2107; CHECK-NEXT:    [[TMP63:%.*]] = fptosi float [[TMP60]] to i32
2108; CHECK-NEXT:    [[TMP64:%.*]] = call fast float @llvm.fabs.f32(float [[TMP62]])
2109; CHECK-NEXT:    [[TMP65:%.*]] = call fast float @llvm.fabs.f32(float [[TMP57]])
2110; CHECK-NEXT:    [[TMP66:%.*]] = fcmp fast oge float [[TMP64]], [[TMP65]]
2111; CHECK-NEXT:    [[TMP67:%.*]] = select i1 [[TMP66]], i32 [[TMP55]], i32 0
2112; CHECK-NEXT:    [[TMP68:%.*]] = add i32 [[TMP63]], [[TMP67]]
2113; CHECK-NEXT:    [[TMP69:%.*]] = shl i32 [[TMP68]], 16
2114; CHECK-NEXT:    [[TMP70:%.*]] = ashr i32 [[TMP69]], 16
2115; CHECK-NEXT:    [[TMP71:%.*]] = trunc i32 [[TMP70]] to i16
2116; CHECK-NEXT:    [[TMP72:%.*]] = insertelement <4 x i16> [[TMP48]], i16 [[TMP71]], i64 2
2117; CHECK-NEXT:    [[TMP73:%.*]] = extractelement <4 x i16> [[X]], i64 3
2118; CHECK-NEXT:    [[TMP74:%.*]] = extractelement <4 x i16> [[Y]], i64 3
2119; CHECK-NEXT:    [[TMP75:%.*]] = sext i16 [[TMP73]] to i32
2120; CHECK-NEXT:    [[TMP76:%.*]] = sext i16 [[TMP74]] to i32
2121; CHECK-NEXT:    [[TMP77:%.*]] = xor i32 [[TMP75]], [[TMP76]]
2122; CHECK-NEXT:    [[TMP78:%.*]] = ashr i32 [[TMP77]], 30
2123; CHECK-NEXT:    [[TMP79:%.*]] = or i32 [[TMP78]], 1
2124; CHECK-NEXT:    [[TMP80:%.*]] = sitofp i32 [[TMP75]] to float
2125; CHECK-NEXT:    [[TMP81:%.*]] = sitofp i32 [[TMP76]] to float
2126; CHECK-NEXT:    [[TMP82:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP81]])
2127; CHECK-NEXT:    [[TMP83:%.*]] = fmul fast float [[TMP80]], [[TMP82]]
2128; CHECK-NEXT:    [[TMP84:%.*]] = call fast float @llvm.trunc.f32(float [[TMP83]])
2129; CHECK-NEXT:    [[TMP85:%.*]] = fneg fast float [[TMP84]]
2130; CHECK-NEXT:    [[TMP86:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP85]], float [[TMP81]], float [[TMP80]])
2131; CHECK-NEXT:    [[TMP87:%.*]] = fptosi float [[TMP84]] to i32
2132; CHECK-NEXT:    [[TMP88:%.*]] = call fast float @llvm.fabs.f32(float [[TMP86]])
2133; CHECK-NEXT:    [[TMP89:%.*]] = call fast float @llvm.fabs.f32(float [[TMP81]])
2134; CHECK-NEXT:    [[TMP90:%.*]] = fcmp fast oge float [[TMP88]], [[TMP89]]
2135; CHECK-NEXT:    [[TMP91:%.*]] = select i1 [[TMP90]], i32 [[TMP79]], i32 0
2136; CHECK-NEXT:    [[TMP92:%.*]] = add i32 [[TMP87]], [[TMP91]]
2137; CHECK-NEXT:    [[TMP93:%.*]] = shl i32 [[TMP92]], 16
2138; CHECK-NEXT:    [[TMP94:%.*]] = ashr i32 [[TMP93]], 16
2139; CHECK-NEXT:    [[TMP95:%.*]] = trunc i32 [[TMP94]] to i16
2140; CHECK-NEXT:    [[TMP96:%.*]] = insertelement <4 x i16> [[TMP72]], i16 [[TMP95]], i64 3
2141; CHECK-NEXT:    store <4 x i16> [[TMP96]], <4 x i16> addrspace(1)* [[OUT:%.*]], align 8
2142; CHECK-NEXT:    ret void
2143;
2144; GCN-LABEL: sdiv_v4i16:
2145; GCN:       ; %bb.0:
2146; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
2147; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xb
2148; GCN-NEXT:    s_mov_b32 s7, 0xf000
2149; GCN-NEXT:    s_mov_b32 s6, -1
2150; GCN-NEXT:    s_waitcnt lgkmcnt(0)
2151; GCN-NEXT:    s_sext_i32_i16 s8, s2
2152; GCN-NEXT:    v_cvt_f32_i32_e32 v0, s8
2153; GCN-NEXT:    s_sext_i32_i16 s9, s0
2154; GCN-NEXT:    v_cvt_f32_i32_e32 v1, s9
2155; GCN-NEXT:    s_xor_b32 s8, s9, s8
2156; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v0
2157; GCN-NEXT:    s_ashr_i32 s2, s2, 16
2158; GCN-NEXT:    s_ashr_i32 s8, s8, 30
2159; GCN-NEXT:    s_or_b32 s8, s8, 1
2160; GCN-NEXT:    v_mul_f32_e32 v2, v1, v2
2161; GCN-NEXT:    v_trunc_f32_e32 v2, v2
2162; GCN-NEXT:    v_mad_f32 v1, -v2, v0, v1
2163; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
2164; GCN-NEXT:    v_cvt_i32_f32_e32 v2, v2
2165; GCN-NEXT:    v_cvt_f32_i32_e32 v1, s2
2166; GCN-NEXT:    v_mov_b32_e32 v3, s8
2167; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
2168; GCN-NEXT:    s_ashr_i32 s0, s0, 16
2169; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
2170; GCN-NEXT:    v_cvt_f32_i32_e32 v2, s0
2171; GCN-NEXT:    v_rcp_iflag_f32_e32 v3, v1
2172; GCN-NEXT:    s_xor_b32 s0, s0, s2
2173; GCN-NEXT:    s_ashr_i32 s0, s0, 30
2174; GCN-NEXT:    s_or_b32 s0, s0, 1
2175; GCN-NEXT:    v_mul_f32_e32 v3, v2, v3
2176; GCN-NEXT:    v_trunc_f32_e32 v3, v3
2177; GCN-NEXT:    v_mad_f32 v2, -v3, v1, v2
2178; GCN-NEXT:    v_mov_b32_e32 v4, s0
2179; GCN-NEXT:    s_sext_i32_i16 s0, s3
2180; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, |v1|
2181; GCN-NEXT:    v_cvt_i32_f32_e32 v3, v3
2182; GCN-NEXT:    v_cvt_f32_i32_e32 v2, s0
2183; GCN-NEXT:    v_cndmask_b32_e32 v1, 0, v4, vcc
2184; GCN-NEXT:    s_sext_i32_i16 s2, s1
2185; GCN-NEXT:    v_add_i32_e32 v3, vcc, v1, v3
2186; GCN-NEXT:    v_cvt_f32_i32_e32 v1, s2
2187; GCN-NEXT:    v_rcp_iflag_f32_e32 v4, v2
2188; GCN-NEXT:    s_xor_b32 s0, s2, s0
2189; GCN-NEXT:    s_ashr_i32 s0, s0, 30
2190; GCN-NEXT:    s_or_b32 s0, s0, 1
2191; GCN-NEXT:    v_mul_f32_e32 v4, v1, v4
2192; GCN-NEXT:    v_trunc_f32_e32 v4, v4
2193; GCN-NEXT:    v_mad_f32 v1, -v4, v2, v1
2194; GCN-NEXT:    v_mov_b32_e32 v5, s0
2195; GCN-NEXT:    s_ashr_i32 s0, s3, 16
2196; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v2|
2197; GCN-NEXT:    v_cvt_i32_f32_e32 v4, v4
2198; GCN-NEXT:    v_cvt_f32_i32_e32 v2, s0
2199; GCN-NEXT:    v_cndmask_b32_e32 v1, 0, v5, vcc
2200; GCN-NEXT:    s_ashr_i32 s1, s1, 16
2201; GCN-NEXT:    v_add_i32_e32 v1, vcc, v1, v4
2202; GCN-NEXT:    v_cvt_f32_i32_e32 v4, s1
2203; GCN-NEXT:    v_rcp_iflag_f32_e32 v5, v2
2204; GCN-NEXT:    s_xor_b32 s0, s1, s0
2205; GCN-NEXT:    s_ashr_i32 s0, s0, 30
2206; GCN-NEXT:    s_or_b32 s0, s0, 1
2207; GCN-NEXT:    v_mul_f32_e32 v5, v4, v5
2208; GCN-NEXT:    v_trunc_f32_e32 v5, v5
2209; GCN-NEXT:    v_mad_f32 v4, -v5, v2, v4
2210; GCN-NEXT:    v_cvt_i32_f32_e32 v5, v5
2211; GCN-NEXT:    v_mov_b32_e32 v6, s0
2212; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v4|, |v2|
2213; GCN-NEXT:    v_cndmask_b32_e32 v2, 0, v6, vcc
2214; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
2215; GCN-NEXT:    s_mov_b32 s0, 0xffff
2216; GCN-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
2217; GCN-NEXT:    v_and_b32_e32 v1, s0, v1
2218; GCN-NEXT:    v_or_b32_e32 v1, v1, v2
2219; GCN-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
2220; GCN-NEXT:    v_and_b32_e32 v0, s0, v0
2221; GCN-NEXT:    v_or_b32_e32 v0, v0, v2
2222; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
2223; GCN-NEXT:    s_endpgm
; Input IR under test: a single vector sdiv whose result is stored to %out.
2224  %r = sdiv <4 x i16> %x, %y
2225  store <4 x i16> %r, <4 x i16> addrspace(1)* %out
2226  ret void
2227}
2228
2229define amdgpu_kernel void @srem_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x, <4 x i16> %y) {
2230; CHECK-LABEL: @srem_v4i16(
2231; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i16> [[X:%.*]], i64 0
2232; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i16> [[Y:%.*]], i64 0
2233; CHECK-NEXT:    [[TMP3:%.*]] = sext i16 [[TMP1]] to i32
2234; CHECK-NEXT:    [[TMP4:%.*]] = sext i16 [[TMP2]] to i32
2235; CHECK-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]]
2236; CHECK-NEXT:    [[TMP6:%.*]] = ashr i32 [[TMP5]], 30
2237; CHECK-NEXT:    [[TMP7:%.*]] = or i32 [[TMP6]], 1
2238; CHECK-NEXT:    [[TMP8:%.*]] = sitofp i32 [[TMP3]] to float
2239; CHECK-NEXT:    [[TMP9:%.*]] = sitofp i32 [[TMP4]] to float
2240; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]])
2241; CHECK-NEXT:    [[TMP11:%.*]] = fmul fast float [[TMP8]], [[TMP10]]
2242; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.trunc.f32(float [[TMP11]])
2243; CHECK-NEXT:    [[TMP13:%.*]] = fneg fast float [[TMP12]]
2244; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP13]], float [[TMP9]], float [[TMP8]])
2245; CHECK-NEXT:    [[TMP15:%.*]] = fptosi float [[TMP12]] to i32
2246; CHECK-NEXT:    [[TMP16:%.*]] = call fast float @llvm.fabs.f32(float [[TMP14]])
2247; CHECK-NEXT:    [[TMP17:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]])
2248; CHECK-NEXT:    [[TMP18:%.*]] = fcmp fast oge float [[TMP16]], [[TMP17]]
2249; CHECK-NEXT:    [[TMP19:%.*]] = select i1 [[TMP18]], i32 [[TMP7]], i32 0
2250; CHECK-NEXT:    [[TMP20:%.*]] = add i32 [[TMP15]], [[TMP19]]
2251; CHECK-NEXT:    [[TMP21:%.*]] = mul i32 [[TMP20]], [[TMP4]]
2252; CHECK-NEXT:    [[TMP22:%.*]] = sub i32 [[TMP3]], [[TMP21]]
2253; CHECK-NEXT:    [[TMP23:%.*]] = shl i32 [[TMP22]], 16
2254; CHECK-NEXT:    [[TMP24:%.*]] = ashr i32 [[TMP23]], 16
2255; CHECK-NEXT:    [[TMP25:%.*]] = trunc i32 [[TMP24]] to i16
2256; CHECK-NEXT:    [[TMP26:%.*]] = insertelement <4 x i16> undef, i16 [[TMP25]], i64 0
2257; CHECK-NEXT:    [[TMP27:%.*]] = extractelement <4 x i16> [[X]], i64 1
2258; CHECK-NEXT:    [[TMP28:%.*]] = extractelement <4 x i16> [[Y]], i64 1
2259; CHECK-NEXT:    [[TMP29:%.*]] = sext i16 [[TMP27]] to i32
2260; CHECK-NEXT:    [[TMP30:%.*]] = sext i16 [[TMP28]] to i32
2261; CHECK-NEXT:    [[TMP31:%.*]] = xor i32 [[TMP29]], [[TMP30]]
2262; CHECK-NEXT:    [[TMP32:%.*]] = ashr i32 [[TMP31]], 30
2263; CHECK-NEXT:    [[TMP33:%.*]] = or i32 [[TMP32]], 1
2264; CHECK-NEXT:    [[TMP34:%.*]] = sitofp i32 [[TMP29]] to float
2265; CHECK-NEXT:    [[TMP35:%.*]] = sitofp i32 [[TMP30]] to float
2266; CHECK-NEXT:    [[TMP36:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP35]])
2267; CHECK-NEXT:    [[TMP37:%.*]] = fmul fast float [[TMP34]], [[TMP36]]
2268; CHECK-NEXT:    [[TMP38:%.*]] = call fast float @llvm.trunc.f32(float [[TMP37]])
2269; CHECK-NEXT:    [[TMP39:%.*]] = fneg fast float [[TMP38]]
2270; CHECK-NEXT:    [[TMP40:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP39]], float [[TMP35]], float [[TMP34]])
2271; CHECK-NEXT:    [[TMP41:%.*]] = fptosi float [[TMP38]] to i32
2272; CHECK-NEXT:    [[TMP42:%.*]] = call fast float @llvm.fabs.f32(float [[TMP40]])
2273; CHECK-NEXT:    [[TMP43:%.*]] = call fast float @llvm.fabs.f32(float [[TMP35]])
2274; CHECK-NEXT:    [[TMP44:%.*]] = fcmp fast oge float [[TMP42]], [[TMP43]]
2275; CHECK-NEXT:    [[TMP45:%.*]] = select i1 [[TMP44]], i32 [[TMP33]], i32 0
2276; CHECK-NEXT:    [[TMP46:%.*]] = add i32 [[TMP41]], [[TMP45]]
2277; CHECK-NEXT:    [[TMP47:%.*]] = mul i32 [[TMP46]], [[TMP30]]
2278; CHECK-NEXT:    [[TMP48:%.*]] = sub i32 [[TMP29]], [[TMP47]]
2279; CHECK-NEXT:    [[TMP49:%.*]] = shl i32 [[TMP48]], 16
2280; CHECK-NEXT:    [[TMP50:%.*]] = ashr i32 [[TMP49]], 16
2281; CHECK-NEXT:    [[TMP51:%.*]] = trunc i32 [[TMP50]] to i16
2282; CHECK-NEXT:    [[TMP52:%.*]] = insertelement <4 x i16> [[TMP26]], i16 [[TMP51]], i64 1
2283; CHECK-NEXT:    [[TMP53:%.*]] = extractelement <4 x i16> [[X]], i64 2
2284; CHECK-NEXT:    [[TMP54:%.*]] = extractelement <4 x i16> [[Y]], i64 2
2285; CHECK-NEXT:    [[TMP55:%.*]] = sext i16 [[TMP53]] to i32
2286; CHECK-NEXT:    [[TMP56:%.*]] = sext i16 [[TMP54]] to i32
2287; CHECK-NEXT:    [[TMP57:%.*]] = xor i32 [[TMP55]], [[TMP56]]
2288; CHECK-NEXT:    [[TMP58:%.*]] = ashr i32 [[TMP57]], 30
2289; CHECK-NEXT:    [[TMP59:%.*]] = or i32 [[TMP58]], 1
2290; CHECK-NEXT:    [[TMP60:%.*]] = sitofp i32 [[TMP55]] to float
2291; CHECK-NEXT:    [[TMP61:%.*]] = sitofp i32 [[TMP56]] to float
2292; CHECK-NEXT:    [[TMP62:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP61]])
2293; CHECK-NEXT:    [[TMP63:%.*]] = fmul fast float [[TMP60]], [[TMP62]]
2294; CHECK-NEXT:    [[TMP64:%.*]] = call fast float @llvm.trunc.f32(float [[TMP63]])
2295; CHECK-NEXT:    [[TMP65:%.*]] = fneg fast float [[TMP64]]
2296; CHECK-NEXT:    [[TMP66:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP65]], float [[TMP61]], float [[TMP60]])
2297; CHECK-NEXT:    [[TMP67:%.*]] = fptosi float [[TMP64]] to i32
2298; CHECK-NEXT:    [[TMP68:%.*]] = call fast float @llvm.fabs.f32(float [[TMP66]])
2299; CHECK-NEXT:    [[TMP69:%.*]] = call fast float @llvm.fabs.f32(float [[TMP61]])
2300; CHECK-NEXT:    [[TMP70:%.*]] = fcmp fast oge float [[TMP68]], [[TMP69]]
2301; CHECK-NEXT:    [[TMP71:%.*]] = select i1 [[TMP70]], i32 [[TMP59]], i32 0
2302; CHECK-NEXT:    [[TMP72:%.*]] = add i32 [[TMP67]], [[TMP71]]
2303; CHECK-NEXT:    [[TMP73:%.*]] = mul i32 [[TMP72]], [[TMP56]]
2304; CHECK-NEXT:    [[TMP74:%.*]] = sub i32 [[TMP55]], [[TMP73]]
2305; CHECK-NEXT:    [[TMP75:%.*]] = shl i32 [[TMP74]], 16
2306; CHECK-NEXT:    [[TMP76:%.*]] = ashr i32 [[TMP75]], 16
2307; CHECK-NEXT:    [[TMP77:%.*]] = trunc i32 [[TMP76]] to i16
2308; CHECK-NEXT:    [[TMP78:%.*]] = insertelement <4 x i16> [[TMP52]], i16 [[TMP77]], i64 2
2309; CHECK-NEXT:    [[TMP79:%.*]] = extractelement <4 x i16> [[X]], i64 3
2310; CHECK-NEXT:    [[TMP80:%.*]] = extractelement <4 x i16> [[Y]], i64 3
2311; CHECK-NEXT:    [[TMP81:%.*]] = sext i16 [[TMP79]] to i32
2312; CHECK-NEXT:    [[TMP82:%.*]] = sext i16 [[TMP80]] to i32
2313; CHECK-NEXT:    [[TMP83:%.*]] = xor i32 [[TMP81]], [[TMP82]]
2314; CHECK-NEXT:    [[TMP84:%.*]] = ashr i32 [[TMP83]], 30
2315; CHECK-NEXT:    [[TMP85:%.*]] = or i32 [[TMP84]], 1
2316; CHECK-NEXT:    [[TMP86:%.*]] = sitofp i32 [[TMP81]] to float
2317; CHECK-NEXT:    [[TMP87:%.*]] = sitofp i32 [[TMP82]] to float
2318; CHECK-NEXT:    [[TMP88:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP87]])
2319; CHECK-NEXT:    [[TMP89:%.*]] = fmul fast float [[TMP86]], [[TMP88]]
2320; CHECK-NEXT:    [[TMP90:%.*]] = call fast float @llvm.trunc.f32(float [[TMP89]])
2321; CHECK-NEXT:    [[TMP91:%.*]] = fneg fast float [[TMP90]]
2322; CHECK-NEXT:    [[TMP92:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP91]], float [[TMP87]], float [[TMP86]])
2323; CHECK-NEXT:    [[TMP93:%.*]] = fptosi float [[TMP90]] to i32
2324; CHECK-NEXT:    [[TMP94:%.*]] = call fast float @llvm.fabs.f32(float [[TMP92]])
2325; CHECK-NEXT:    [[TMP95:%.*]] = call fast float @llvm.fabs.f32(float [[TMP87]])
2326; CHECK-NEXT:    [[TMP96:%.*]] = fcmp fast oge float [[TMP94]], [[TMP95]]
2327; CHECK-NEXT:    [[TMP97:%.*]] = select i1 [[TMP96]], i32 [[TMP85]], i32 0
2328; CHECK-NEXT:    [[TMP98:%.*]] = add i32 [[TMP93]], [[TMP97]]
2329; CHECK-NEXT:    [[TMP99:%.*]] = mul i32 [[TMP98]], [[TMP82]]
2330; CHECK-NEXT:    [[TMP100:%.*]] = sub i32 [[TMP81]], [[TMP99]]
2331; CHECK-NEXT:    [[TMP101:%.*]] = shl i32 [[TMP100]], 16
2332; CHECK-NEXT:    [[TMP102:%.*]] = ashr i32 [[TMP101]], 16
2333; CHECK-NEXT:    [[TMP103:%.*]] = trunc i32 [[TMP102]] to i16
2334; CHECK-NEXT:    [[TMP104:%.*]] = insertelement <4 x i16> [[TMP78]], i16 [[TMP103]], i64 3
2335; CHECK-NEXT:    store <4 x i16> [[TMP104]], <4 x i16> addrspace(1)* [[OUT:%.*]], align 8
2336; CHECK-NEXT:    ret void
2337;
2338; GCN-LABEL: srem_v4i16:
2339; GCN:       ; %bb.0:
2340; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
2341; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xb
2342; GCN-NEXT:    s_mov_b32 s7, 0xf000
2343; GCN-NEXT:    s_mov_b32 s6, -1
2344; GCN-NEXT:    s_waitcnt lgkmcnt(0)
2345; GCN-NEXT:    s_sext_i32_i16 s8, s2
2346; GCN-NEXT:    v_cvt_f32_i32_e32 v0, s8
2347; GCN-NEXT:    s_sext_i32_i16 s9, s0
2348; GCN-NEXT:    v_cvt_f32_i32_e32 v1, s9
2349; GCN-NEXT:    s_xor_b32 s8, s9, s8
2350; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v0
2351; GCN-NEXT:    s_ashr_i32 s8, s8, 30
2352; GCN-NEXT:    s_or_b32 s8, s8, 1
2353; GCN-NEXT:    v_mov_b32_e32 v3, s8
2354; GCN-NEXT:    v_mul_f32_e32 v2, v1, v2
2355; GCN-NEXT:    v_trunc_f32_e32 v2, v2
2356; GCN-NEXT:    v_mad_f32 v1, -v2, v0, v1
2357; GCN-NEXT:    v_cvt_i32_f32_e32 v2, v2
2358; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
2359; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
2360; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
2361; GCN-NEXT:    v_mul_lo_u32 v0, v0, s2
2362; GCN-NEXT:    s_ashr_i32 s2, s2, 16
2363; GCN-NEXT:    v_cvt_f32_i32_e32 v1, s2
2364; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
2365; GCN-NEXT:    s_ashr_i32 s0, s0, 16
2366; GCN-NEXT:    v_cvt_f32_i32_e32 v2, s0
2367; GCN-NEXT:    v_rcp_iflag_f32_e32 v3, v1
2368; GCN-NEXT:    s_xor_b32 s8, s0, s2
2369; GCN-NEXT:    s_ashr_i32 s8, s8, 30
2370; GCN-NEXT:    s_or_b32 s8, s8, 1
2371; GCN-NEXT:    v_mul_f32_e32 v3, v2, v3
2372; GCN-NEXT:    v_trunc_f32_e32 v3, v3
2373; GCN-NEXT:    v_mad_f32 v2, -v3, v1, v2
2374; GCN-NEXT:    v_cvt_i32_f32_e32 v3, v3
2375; GCN-NEXT:    v_mov_b32_e32 v4, s8
2376; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, |v1|
2377; GCN-NEXT:    v_cndmask_b32_e32 v1, 0, v4, vcc
2378; GCN-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
2379; GCN-NEXT:    v_mul_lo_u32 v1, v1, s2
2380; GCN-NEXT:    s_sext_i32_i16 s2, s3
2381; GCN-NEXT:    v_cvt_f32_i32_e32 v2, s2
2382; GCN-NEXT:    v_sub_i32_e32 v3, vcc, s0, v1
2383; GCN-NEXT:    s_sext_i32_i16 s0, s1
2384; GCN-NEXT:    v_cvt_f32_i32_e32 v1, s0
2385; GCN-NEXT:    v_rcp_iflag_f32_e32 v4, v2
2386; GCN-NEXT:    s_xor_b32 s0, s0, s2
2387; GCN-NEXT:    s_ashr_i32 s0, s0, 30
2388; GCN-NEXT:    s_or_b32 s0, s0, 1
2389; GCN-NEXT:    v_mul_f32_e32 v4, v1, v4
2390; GCN-NEXT:    v_trunc_f32_e32 v4, v4
2391; GCN-NEXT:    v_mad_f32 v1, -v4, v2, v1
2392; GCN-NEXT:    v_mov_b32_e32 v5, s0
2393; GCN-NEXT:    s_ashr_i32 s0, s3, 16
2394; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v2|
2395; GCN-NEXT:    v_cvt_i32_f32_e32 v4, v4
2396; GCN-NEXT:    v_cvt_f32_i32_e32 v2, s0
2397; GCN-NEXT:    v_cndmask_b32_e32 v1, 0, v5, vcc
2398; GCN-NEXT:    s_ashr_i32 s2, s1, 16
2399; GCN-NEXT:    v_add_i32_e32 v1, vcc, v1, v4
2400; GCN-NEXT:    v_cvt_f32_i32_e32 v4, s2
2401; GCN-NEXT:    v_rcp_iflag_f32_e32 v5, v2
2402; GCN-NEXT:    v_mul_lo_u32 v1, v1, s3
2403; GCN-NEXT:    s_xor_b32 s3, s2, s0
2404; GCN-NEXT:    s_ashr_i32 s3, s3, 30
2405; GCN-NEXT:    v_mul_f32_e32 v5, v4, v5
2406; GCN-NEXT:    v_trunc_f32_e32 v5, v5
2407; GCN-NEXT:    v_mad_f32 v4, -v5, v2, v4
2408; GCN-NEXT:    v_cvt_i32_f32_e32 v5, v5
2409; GCN-NEXT:    s_or_b32 s3, s3, 1
2410; GCN-NEXT:    v_mov_b32_e32 v6, s3
2411; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v4|, |v2|
2412; GCN-NEXT:    v_cndmask_b32_e32 v2, 0, v6, vcc
2413; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
2414; GCN-NEXT:    v_mul_lo_u32 v2, v2, s0
2415; GCN-NEXT:    s_mov_b32 s0, 0xffff
2416; GCN-NEXT:    v_sub_i32_e32 v1, vcc, s1, v1
2417; GCN-NEXT:    v_and_b32_e32 v1, s0, v1
2418; GCN-NEXT:    v_sub_i32_e32 v2, vcc, s2, v2
2419; GCN-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
2420; GCN-NEXT:    v_or_b32_e32 v1, v1, v2
2421; GCN-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
2422; GCN-NEXT:    v_and_b32_e32 v0, s0, v0
2423; GCN-NEXT:    v_or_b32_e32 v0, v0, v2
2424; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
2425; GCN-NEXT:    s_endpgm
2426  %r = srem <4 x i16> %x, %y
2427  store <4 x i16> %r, <4 x i16> addrspace(1)* %out
2428  ret void
2429}
2430
; udiv_i3: unsigned division of two 3-bit integers, result stored to %out.
;
; CHECK lines (opt -amdgpu-codegenprepare output) expect the division to be
; expanded as follows:
;   * both operands are zero-extended to i32 and converted to float (uitofp);
;   * the quotient estimate is trunc(x * rcp(y));
;   * fmad.ftz(-q, y, x) gives the remainder of that estimate, and the
;     quotient is incremented by 1 when |remainder| >= |y|;
;   * the result is masked with 7 and truncated back to i3.
; GCN lines (llc output) pin the selected machine instructions for the same
; computation, ending in a v_and_b32 with 7 and a byte store.
;
; The assertions are autogenerated (see the NOTE lines at the top of the
; file); regenerate them with the update scripts rather than editing by hand.
2431define amdgpu_kernel void @udiv_i3(i3 addrspace(1)* %out, i3 %x, i3 %y) {
2432; CHECK-LABEL: @udiv_i3(
2433; CHECK-NEXT:    [[TMP1:%.*]] = zext i3 [[X:%.*]] to i32
2434; CHECK-NEXT:    [[TMP2:%.*]] = zext i3 [[Y:%.*]] to i32
2435; CHECK-NEXT:    [[TMP3:%.*]] = uitofp i32 [[TMP1]] to float
2436; CHECK-NEXT:    [[TMP4:%.*]] = uitofp i32 [[TMP2]] to float
2437; CHECK-NEXT:    [[TMP5:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP4]])
2438; CHECK-NEXT:    [[TMP6:%.*]] = fmul fast float [[TMP3]], [[TMP5]]
2439; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.trunc.f32(float [[TMP6]])
2440; CHECK-NEXT:    [[TMP8:%.*]] = fneg fast float [[TMP7]]
2441; CHECK-NEXT:    [[TMP9:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP8]], float [[TMP4]], float [[TMP3]])
2442; CHECK-NEXT:    [[TMP10:%.*]] = fptoui float [[TMP7]] to i32
2443; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]])
2444; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.fabs.f32(float [[TMP4]])
2445; CHECK-NEXT:    [[TMP13:%.*]] = fcmp fast oge float [[TMP11]], [[TMP12]]
2446; CHECK-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 1, i32 0
2447; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[TMP10]], [[TMP14]]
2448; CHECK-NEXT:    [[TMP16:%.*]] = and i32 [[TMP15]], 7
2449; CHECK-NEXT:    [[TMP17:%.*]] = trunc i32 [[TMP16]] to i3
2450; CHECK-NEXT:    store i3 [[TMP17]], i3 addrspace(1)* [[OUT:%.*]], align 1
2451; CHECK-NEXT:    ret void
2452;
2453; GCN-LABEL: udiv_i3:
2454; GCN:       ; %bb.0:
2455; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
2456; GCN-NEXT:    s_load_dword s0, s[0:1], 0xb
2457; GCN-NEXT:    s_mov_b32 s7, 0xf000
2458; GCN-NEXT:    s_mov_b32 s6, -1
2459; GCN-NEXT:    s_waitcnt lgkmcnt(0)
2460; GCN-NEXT:    s_bfe_u32 s1, s0, 0x30008
2461; GCN-NEXT:    v_cvt_f32_ubyte0_e32 v0, s1
2462; GCN-NEXT:    v_rcp_iflag_f32_e32 v1, v0
2463; GCN-NEXT:    s_and_b32 s0, s0, 7
2464; GCN-NEXT:    v_cvt_f32_ubyte0_e32 v2, s0
2465; GCN-NEXT:    v_mul_f32_e32 v1, v2, v1
2466; GCN-NEXT:    v_trunc_f32_e32 v1, v1
2467; GCN-NEXT:    v_cvt_u32_f32_e32 v3, v1
2468; GCN-NEXT:    v_mad_f32 v1, -v1, v0, v2
2469; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
2470; GCN-NEXT:    v_addc_u32_e32 v0, vcc, 0, v3, vcc
2471; GCN-NEXT:    v_and_b32_e32 v0, 7, v0
2472; GCN-NEXT:    buffer_store_byte v0, off, s[4:7], 0
2473; GCN-NEXT:    s_endpgm
2474  %r = udiv i3 %x, %y
2475  store i3 %r, i3 addrspace(1)* %out
2476  ret void
2477}
2478
; urem_i3: unsigned remainder of two 3-bit integers, result stored to %out.
;
; CHECK lines (opt -amdgpu-codegenprepare output) expect the remainder to be
; expanded as follows:
;   * both operands are zero-extended to i32 and converted to float (uitofp);
;   * the quotient estimate is trunc(x * rcp(y)), incremented by 1 when
;     |fmad.ftz(-q, y, x)| >= |y|;
;   * the remainder is x - q * y computed in i32, masked with 7 and truncated
;     back to i3.
; GCN lines (llc output) pin the selected machine instructions for the same
; computation, ending in a v_and_b32 with 7 and a byte store.
;
; The assertions are autogenerated (see the NOTE lines at the top of the
; file); regenerate them with the update scripts rather than editing by hand.
2479define amdgpu_kernel void @urem_i3(i3 addrspace(1)* %out, i3 %x, i3 %y) {
2480; CHECK-LABEL: @urem_i3(
2481; CHECK-NEXT:    [[TMP1:%.*]] = zext i3 [[X:%.*]] to i32
2482; CHECK-NEXT:    [[TMP2:%.*]] = zext i3 [[Y:%.*]] to i32
2483; CHECK-NEXT:    [[TMP3:%.*]] = uitofp i32 [[TMP1]] to float
2484; CHECK-NEXT:    [[TMP4:%.*]] = uitofp i32 [[TMP2]] to float
2485; CHECK-NEXT:    [[TMP5:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP4]])
2486; CHECK-NEXT:    [[TMP6:%.*]] = fmul fast float [[TMP3]], [[TMP5]]
2487; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.trunc.f32(float [[TMP6]])
2488; CHECK-NEXT:    [[TMP8:%.*]] = fneg fast float [[TMP7]]
2489; CHECK-NEXT:    [[TMP9:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP8]], float [[TMP4]], float [[TMP3]])
2490; CHECK-NEXT:    [[TMP10:%.*]] = fptoui float [[TMP7]] to i32
2491; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]])
2492; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.fabs.f32(float [[TMP4]])
2493; CHECK-NEXT:    [[TMP13:%.*]] = fcmp fast oge float [[TMP11]], [[TMP12]]
2494; CHECK-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 1, i32 0
2495; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[TMP10]], [[TMP14]]
2496; CHECK-NEXT:    [[TMP16:%.*]] = mul i32 [[TMP15]], [[TMP2]]
2497; CHECK-NEXT:    [[TMP17:%.*]] = sub i32 [[TMP1]], [[TMP16]]
2498; CHECK-NEXT:    [[TMP18:%.*]] = and i32 [[TMP17]], 7
2499; CHECK-NEXT:    [[TMP19:%.*]] = trunc i32 [[TMP18]] to i3
2500; CHECK-NEXT:    store i3 [[TMP19]], i3 addrspace(1)* [[OUT:%.*]], align 1
2501; CHECK-NEXT:    ret void
2502;
2503; GCN-LABEL: urem_i3:
2504; GCN:       ; %bb.0:
2505; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
2506; GCN-NEXT:    s_load_dword s0, s[0:1], 0xb
2507; GCN-NEXT:    s_mov_b32 s7, 0xf000
2508; GCN-NEXT:    s_mov_b32 s6, -1
2509; GCN-NEXT:    s_waitcnt lgkmcnt(0)
2510; GCN-NEXT:    s_bfe_u32 s1, s0, 0x30008
2511; GCN-NEXT:    v_cvt_f32_ubyte0_e32 v0, s1
2512; GCN-NEXT:    v_rcp_iflag_f32_e32 v1, v0
2513; GCN-NEXT:    s_and_b32 s2, s0, 7
2514; GCN-NEXT:    v_cvt_f32_ubyte0_e32 v2, s2
2515; GCN-NEXT:    s_lshr_b32 s1, s0, 8
2516; GCN-NEXT:    v_mul_f32_e32 v1, v2, v1
2517; GCN-NEXT:    v_trunc_f32_e32 v1, v1
2518; GCN-NEXT:    v_cvt_u32_f32_e32 v3, v1
2519; GCN-NEXT:    v_mad_f32 v1, -v1, v0, v2
2520; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
2521; GCN-NEXT:    v_addc_u32_e32 v0, vcc, 0, v3, vcc
2522; GCN-NEXT:    v_mul_lo_u32 v0, v0, s1
2523; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
2524; GCN-NEXT:    v_and_b32_e32 v0, 7, v0
2525; GCN-NEXT:    buffer_store_byte v0, off, s[4:7], 0
2526; GCN-NEXT:    s_endpgm
2527  %r = urem i3 %x, %y
2528  store i3 %r, i3 addrspace(1)* %out
2529  ret void
2530}
2531
; sdiv_i3: signed division of two 3-bit integers, result stored to %out.
;
; CHECK lines (opt -amdgpu-codegenprepare output) expect the division to be
; expanded as follows:
;   * both operands are sign-extended to i32;
;   * a correction term is built as ((x xor y) ashr 30) | 1;
;   * the operands are converted to float (sitofp) and the quotient estimate
;     is trunc(x * rcp(y)), converted back with fptosi;
;   * the correction term is added when |fmad.ftz(-q, y, x)| >= |y|,
;     otherwise 0 is added;
;   * the result is narrowed with shl 29 / ashr 29 and truncated to i3.
; GCN lines (llc output) pin the selected machine instructions for the same
; computation, ending in a v_and_b32 with 7 and a byte store.
;
; The assertions are autogenerated (see the NOTE lines at the top of the
; file); regenerate them with the update scripts rather than editing by hand.
2532define amdgpu_kernel void @sdiv_i3(i3 addrspace(1)* %out, i3 %x, i3 %y) {
2533; CHECK-LABEL: @sdiv_i3(
2534; CHECK-NEXT:    [[TMP1:%.*]] = sext i3 [[X:%.*]] to i32
2535; CHECK-NEXT:    [[TMP2:%.*]] = sext i3 [[Y:%.*]] to i32
2536; CHECK-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
2537; CHECK-NEXT:    [[TMP4:%.*]] = ashr i32 [[TMP3]], 30
2538; CHECK-NEXT:    [[TMP5:%.*]] = or i32 [[TMP4]], 1
2539; CHECK-NEXT:    [[TMP6:%.*]] = sitofp i32 [[TMP1]] to float
2540; CHECK-NEXT:    [[TMP7:%.*]] = sitofp i32 [[TMP2]] to float
2541; CHECK-NEXT:    [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]])
2542; CHECK-NEXT:    [[TMP9:%.*]] = fmul fast float [[TMP6]], [[TMP8]]
2543; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.trunc.f32(float [[TMP9]])
2544; CHECK-NEXT:    [[TMP11:%.*]] = fneg fast float [[TMP10]]
2545; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP11]], float [[TMP7]], float [[TMP6]])
2546; CHECK-NEXT:    [[TMP13:%.*]] = fptosi float [[TMP10]] to i32
2547; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP12]])
2548; CHECK-NEXT:    [[TMP15:%.*]] = call fast float @llvm.fabs.f32(float [[TMP7]])
2549; CHECK-NEXT:    [[TMP16:%.*]] = fcmp fast oge float [[TMP14]], [[TMP15]]
2550; CHECK-NEXT:    [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP5]], i32 0
2551; CHECK-NEXT:    [[TMP18:%.*]] = add i32 [[TMP13]], [[TMP17]]
2552; CHECK-NEXT:    [[TMP19:%.*]] = shl i32 [[TMP18]], 29
2553; CHECK-NEXT:    [[TMP20:%.*]] = ashr i32 [[TMP19]], 29
2554; CHECK-NEXT:    [[TMP21:%.*]] = trunc i32 [[TMP20]] to i3
2555; CHECK-NEXT:    store i3 [[TMP21]], i3 addrspace(1)* [[OUT:%.*]], align 1
2556; CHECK-NEXT:    ret void
2557;
2558; GCN-LABEL: sdiv_i3:
2559; GCN:       ; %bb.0:
2560; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
2561; GCN-NEXT:    s_load_dword s0, s[0:1], 0xb
2562; GCN-NEXT:    s_mov_b32 s7, 0xf000
2563; GCN-NEXT:    s_mov_b32 s6, -1
2564; GCN-NEXT:    s_waitcnt lgkmcnt(0)
2565; GCN-NEXT:    s_bfe_i32 s1, s0, 0x30008
2566; GCN-NEXT:    v_cvt_f32_i32_e32 v0, s1
2567; GCN-NEXT:    s_bfe_i32 s0, s0, 0x30000
2568; GCN-NEXT:    v_cvt_f32_i32_e32 v1, s0
2569; GCN-NEXT:    s_xor_b32 s0, s0, s1
2570; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v0
2571; GCN-NEXT:    s_ashr_i32 s0, s0, 30
2572; GCN-NEXT:    s_or_b32 s0, s0, 1
2573; GCN-NEXT:    v_mov_b32_e32 v3, s0
2574; GCN-NEXT:    v_mul_f32_e32 v2, v1, v2
2575; GCN-NEXT:    v_trunc_f32_e32 v2, v2
2576; GCN-NEXT:    v_mad_f32 v1, -v2, v0, v1
2577; GCN-NEXT:    v_cvt_i32_f32_e32 v2, v2
2578; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
2579; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
2580; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
2581; GCN-NEXT:    v_and_b32_e32 v0, 7, v0
2582; GCN-NEXT:    buffer_store_byte v0, off, s[4:7], 0
2583; GCN-NEXT:    s_endpgm
2584  %r = sdiv i3 %x, %y
2585  store i3 %r, i3 addrspace(1)* %out
2586  ret void
2587}
2588
; srem_i3: signed remainder of two 3-bit integers, result stored to %out.
;
; CHECK lines (opt -amdgpu-codegenprepare output) expect the remainder to be
; expanded as follows:
;   * both operands are sign-extended to i32;
;   * a correction term is built as ((x xor y) ashr 30) | 1;
;   * the operands are converted to float (sitofp) and the quotient estimate
;     is trunc(x * rcp(y)), converted back with fptosi;
;   * the correction term is added when |fmad.ftz(-q, y, x)| >= |y|,
;     otherwise 0 is added;
;   * the remainder is x - q * y computed in i32, narrowed with
;     shl 29 / ashr 29 and truncated to i3.
; GCN lines (llc output) pin the selected machine instructions for the same
; computation, ending in a v_and_b32 with 7 and a byte store.
;
; The assertions are autogenerated (see the NOTE lines at the top of the
; file); regenerate them with the update scripts rather than editing by hand.
2589define amdgpu_kernel void @srem_i3(i3 addrspace(1)* %out, i3 %x, i3 %y) {
2590; CHECK-LABEL: @srem_i3(
2591; CHECK-NEXT:    [[TMP1:%.*]] = sext i3 [[X:%.*]] to i32
2592; CHECK-NEXT:    [[TMP2:%.*]] = sext i3 [[Y:%.*]] to i32
2593; CHECK-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
2594; CHECK-NEXT:    [[TMP4:%.*]] = ashr i32 [[TMP3]], 30
2595; CHECK-NEXT:    [[TMP5:%.*]] = or i32 [[TMP4]], 1
2596; CHECK-NEXT:    [[TMP6:%.*]] = sitofp i32 [[TMP1]] to float
2597; CHECK-NEXT:    [[TMP7:%.*]] = sitofp i32 [[TMP2]] to float
2598; CHECK-NEXT:    [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]])
2599; CHECK-NEXT:    [[TMP9:%.*]] = fmul fast float [[TMP6]], [[TMP8]]
2600; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.trunc.f32(float [[TMP9]])
2601; CHECK-NEXT:    [[TMP11:%.*]] = fneg fast float [[TMP10]]
2602; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP11]], float [[TMP7]], float [[TMP6]])
2603; CHECK-NEXT:    [[TMP13:%.*]] = fptosi float [[TMP10]] to i32
2604; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP12]])
2605; CHECK-NEXT:    [[TMP15:%.*]] = call fast float @llvm.fabs.f32(float [[TMP7]])
2606; CHECK-NEXT:    [[TMP16:%.*]] = fcmp fast oge float [[TMP14]], [[TMP15]]
2607; CHECK-NEXT:    [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP5]], i32 0
2608; CHECK-NEXT:    [[TMP18:%.*]] = add i32 [[TMP13]], [[TMP17]]
2609; CHECK-NEXT:    [[TMP19:%.*]] = mul i32 [[TMP18]], [[TMP2]]
2610; CHECK-NEXT:    [[TMP20:%.*]] = sub i32 [[TMP1]], [[TMP19]]
2611; CHECK-NEXT:    [[TMP21:%.*]] = shl i32 [[TMP20]], 29
2612; CHECK-NEXT:    [[TMP22:%.*]] = ashr i32 [[TMP21]], 29
2613; CHECK-NEXT:    [[TMP23:%.*]] = trunc i32 [[TMP22]] to i3
2614; CHECK-NEXT:    store i3 [[TMP23]], i3 addrspace(1)* [[OUT:%.*]], align 1
2615; CHECK-NEXT:    ret void
2616;
2617; GCN-LABEL: srem_i3:
2618; GCN:       ; %bb.0:
2619; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
2620; GCN-NEXT:    s_load_dword s0, s[0:1], 0xb
2621; GCN-NEXT:    s_mov_b32 s7, 0xf000
2622; GCN-NEXT:    s_mov_b32 s6, -1
2623; GCN-NEXT:    s_waitcnt lgkmcnt(0)
2624; GCN-NEXT:    s_bfe_i32 s1, s0, 0x30008
2625; GCN-NEXT:    v_cvt_f32_i32_e32 v0, s1
2626; GCN-NEXT:    s_bfe_i32 s3, s0, 0x30000
2627; GCN-NEXT:    v_cvt_f32_i32_e32 v1, s3
2628; GCN-NEXT:    s_xor_b32 s1, s3, s1
2629; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v0
2630; GCN-NEXT:    s_ashr_i32 s1, s1, 30
2631; GCN-NEXT:    s_or_b32 s1, s1, 1
2632; GCN-NEXT:    v_mov_b32_e32 v3, s1
2633; GCN-NEXT:    v_mul_f32_e32 v2, v1, v2
2634; GCN-NEXT:    v_trunc_f32_e32 v2, v2
2635; GCN-NEXT:    v_mad_f32 v1, -v2, v0, v1
2636; GCN-NEXT:    v_cvt_i32_f32_e32 v2, v2
2637; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
2638; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
2639; GCN-NEXT:    s_lshr_b32 s2, s0, 8
2640; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
2641; GCN-NEXT:    v_mul_lo_u32 v0, v0, s2
2642; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
2643; GCN-NEXT:    v_and_b32_e32 v0, 7, v0
2644; GCN-NEXT:    buffer_store_byte v0, off, s[4:7], 0
2645; GCN-NEXT:    s_endpgm
2646  %r = srem i3 %x, %y
2647  store i3 %r, i3 addrspace(1)* %out
2648  ret void
2649}
2650
; udiv_v3i16: element-wise unsigned division of two <3 x i16> vectors, result
; stored to %out.
;
; CHECK lines (opt -amdgpu-codegenprepare output) expect the vector divide to
; be scalarized: for each of the elements 0, 1 and 2 the operands are pulled
; out with extractelement and then go through the same sequence:
;   * zero-extend to i32 and convert to float (uitofp);
;   * quotient estimate trunc(x * rcp(y)), incremented by 1 when
;     |fmad.ftz(-q, y, x)| >= |y|;
;   * mask with 65535 and truncate back to i16.
; The three results are reassembled with insertelement (starting from undef)
; and the vector is stored with align 8.
; GCN lines (llc output) pin the selected machine instructions; the result is
; written with a buffer_store_short at offset 4 plus a buffer_store_dword.
;
; The assertions are autogenerated (see the NOTE lines at the top of the
; file); regenerate them with the update scripts rather than editing by hand.
2651define amdgpu_kernel void @udiv_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x, <3 x i16> %y) {
2652; CHECK-LABEL: @udiv_v3i16(
2653; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <3 x i16> [[X:%.*]], i64 0
2654; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <3 x i16> [[Y:%.*]], i64 0
2655; CHECK-NEXT:    [[TMP3:%.*]] = zext i16 [[TMP1]] to i32
2656; CHECK-NEXT:    [[TMP4:%.*]] = zext i16 [[TMP2]] to i32
2657; CHECK-NEXT:    [[TMP5:%.*]] = uitofp i32 [[TMP3]] to float
2658; CHECK-NEXT:    [[TMP6:%.*]] = uitofp i32 [[TMP4]] to float
2659; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP6]])
2660; CHECK-NEXT:    [[TMP8:%.*]] = fmul fast float [[TMP5]], [[TMP7]]
2661; CHECK-NEXT:    [[TMP9:%.*]] = call fast float @llvm.trunc.f32(float [[TMP8]])
2662; CHECK-NEXT:    [[TMP10:%.*]] = fneg fast float [[TMP9]]
2663; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP10]], float [[TMP6]], float [[TMP5]])
2664; CHECK-NEXT:    [[TMP12:%.*]] = fptoui float [[TMP9]] to i32
2665; CHECK-NEXT:    [[TMP13:%.*]] = call fast float @llvm.fabs.f32(float [[TMP11]])
2666; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP6]])
2667; CHECK-NEXT:    [[TMP15:%.*]] = fcmp fast oge float [[TMP13]], [[TMP14]]
2668; CHECK-NEXT:    [[TMP16:%.*]] = select i1 [[TMP15]], i32 1, i32 0
2669; CHECK-NEXT:    [[TMP17:%.*]] = add i32 [[TMP12]], [[TMP16]]
2670; CHECK-NEXT:    [[TMP18:%.*]] = and i32 [[TMP17]], 65535
2671; CHECK-NEXT:    [[TMP19:%.*]] = trunc i32 [[TMP18]] to i16
2672; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <3 x i16> undef, i16 [[TMP19]], i64 0
2673; CHECK-NEXT:    [[TMP21:%.*]] = extractelement <3 x i16> [[X]], i64 1
2674; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <3 x i16> [[Y]], i64 1
2675; CHECK-NEXT:    [[TMP23:%.*]] = zext i16 [[TMP21]] to i32
2676; CHECK-NEXT:    [[TMP24:%.*]] = zext i16 [[TMP22]] to i32
2677; CHECK-NEXT:    [[TMP25:%.*]] = uitofp i32 [[TMP23]] to float
2678; CHECK-NEXT:    [[TMP26:%.*]] = uitofp i32 [[TMP24]] to float
2679; CHECK-NEXT:    [[TMP27:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP26]])
2680; CHECK-NEXT:    [[TMP28:%.*]] = fmul fast float [[TMP25]], [[TMP27]]
2681; CHECK-NEXT:    [[TMP29:%.*]] = call fast float @llvm.trunc.f32(float [[TMP28]])
2682; CHECK-NEXT:    [[TMP30:%.*]] = fneg fast float [[TMP29]]
2683; CHECK-NEXT:    [[TMP31:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP30]], float [[TMP26]], float [[TMP25]])
2684; CHECK-NEXT:    [[TMP32:%.*]] = fptoui float [[TMP29]] to i32
2685; CHECK-NEXT:    [[TMP33:%.*]] = call fast float @llvm.fabs.f32(float [[TMP31]])
2686; CHECK-NEXT:    [[TMP34:%.*]] = call fast float @llvm.fabs.f32(float [[TMP26]])
2687; CHECK-NEXT:    [[TMP35:%.*]] = fcmp fast oge float [[TMP33]], [[TMP34]]
2688; CHECK-NEXT:    [[TMP36:%.*]] = select i1 [[TMP35]], i32 1, i32 0
2689; CHECK-NEXT:    [[TMP37:%.*]] = add i32 [[TMP32]], [[TMP36]]
2690; CHECK-NEXT:    [[TMP38:%.*]] = and i32 [[TMP37]], 65535
2691; CHECK-NEXT:    [[TMP39:%.*]] = trunc i32 [[TMP38]] to i16
2692; CHECK-NEXT:    [[TMP40:%.*]] = insertelement <3 x i16> [[TMP20]], i16 [[TMP39]], i64 1
2693; CHECK-NEXT:    [[TMP41:%.*]] = extractelement <3 x i16> [[X]], i64 2
2694; CHECK-NEXT:    [[TMP42:%.*]] = extractelement <3 x i16> [[Y]], i64 2
2695; CHECK-NEXT:    [[TMP43:%.*]] = zext i16 [[TMP41]] to i32
2696; CHECK-NEXT:    [[TMP44:%.*]] = zext i16 [[TMP42]] to i32
2697; CHECK-NEXT:    [[TMP45:%.*]] = uitofp i32 [[TMP43]] to float
2698; CHECK-NEXT:    [[TMP46:%.*]] = uitofp i32 [[TMP44]] to float
2699; CHECK-NEXT:    [[TMP47:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP46]])
2700; CHECK-NEXT:    [[TMP48:%.*]] = fmul fast float [[TMP45]], [[TMP47]]
2701; CHECK-NEXT:    [[TMP49:%.*]] = call fast float @llvm.trunc.f32(float [[TMP48]])
2702; CHECK-NEXT:    [[TMP50:%.*]] = fneg fast float [[TMP49]]
2703; CHECK-NEXT:    [[TMP51:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP50]], float [[TMP46]], float [[TMP45]])
2704; CHECK-NEXT:    [[TMP52:%.*]] = fptoui float [[TMP49]] to i32
2705; CHECK-NEXT:    [[TMP53:%.*]] = call fast float @llvm.fabs.f32(float [[TMP51]])
2706; CHECK-NEXT:    [[TMP54:%.*]] = call fast float @llvm.fabs.f32(float [[TMP46]])
2707; CHECK-NEXT:    [[TMP55:%.*]] = fcmp fast oge float [[TMP53]], [[TMP54]]
2708; CHECK-NEXT:    [[TMP56:%.*]] = select i1 [[TMP55]], i32 1, i32 0
2709; CHECK-NEXT:    [[TMP57:%.*]] = add i32 [[TMP52]], [[TMP56]]
2710; CHECK-NEXT:    [[TMP58:%.*]] = and i32 [[TMP57]], 65535
2711; CHECK-NEXT:    [[TMP59:%.*]] = trunc i32 [[TMP58]] to i16
2712; CHECK-NEXT:    [[TMP60:%.*]] = insertelement <3 x i16> [[TMP40]], i16 [[TMP59]], i64 2
2713; CHECK-NEXT:    store <3 x i16> [[TMP60]], <3 x i16> addrspace(1)* [[OUT:%.*]], align 8
2714; CHECK-NEXT:    ret void
2715;
2716; GCN-LABEL: udiv_v3i16:
2717; GCN:       ; %bb.0:
2718; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
2719; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
2720; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
2721; GCN-NEXT:    s_mov_b32 s8, 0xffff
2722; GCN-NEXT:    s_mov_b32 s7, 0xf000
2723; GCN-NEXT:    s_waitcnt lgkmcnt(0)
2724; GCN-NEXT:    s_and_b32 s6, s0, s8
2725; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s6
2726; GCN-NEXT:    s_and_b32 s6, s2, s8
2727; GCN-NEXT:    s_lshr_b32 s0, s0, 16
2728; GCN-NEXT:    v_cvt_f32_u32_e32 v3, s0
2729; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s6
2730; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v0
2731; GCN-NEXT:    s_lshr_b32 s0, s2, 16
2732; GCN-NEXT:    v_cvt_f32_u32_e32 v4, s0
2733; GCN-NEXT:    v_rcp_iflag_f32_e32 v5, v3
2734; GCN-NEXT:    v_mul_f32_e32 v2, v1, v2
2735; GCN-NEXT:    v_trunc_f32_e32 v2, v2
2736; GCN-NEXT:    v_mad_f32 v1, -v2, v0, v1
2737; GCN-NEXT:    v_cvt_u32_f32_e32 v2, v2
2738; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
2739; GCN-NEXT:    v_mul_f32_e32 v1, v4, v5
2740; GCN-NEXT:    v_trunc_f32_e32 v1, v1
2741; GCN-NEXT:    s_and_b32 s0, s1, s8
2742; GCN-NEXT:    v_addc_u32_e32 v0, vcc, 0, v2, vcc
2743; GCN-NEXT:    v_mad_f32 v2, -v1, v3, v4
2744; GCN-NEXT:    v_cvt_f32_u32_e32 v4, s0
2745; GCN-NEXT:    s_and_b32 s0, s3, s8
2746; GCN-NEXT:    v_cvt_f32_u32_e32 v5, s0
2747; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
2748; GCN-NEXT:    v_rcp_iflag_f32_e32 v6, v4
2749; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v3
2750; GCN-NEXT:    s_mov_b32 s6, -1
2751; GCN-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2752; GCN-NEXT:    v_mul_f32_e32 v2, v5, v6
2753; GCN-NEXT:    v_trunc_f32_e32 v2, v2
2754; GCN-NEXT:    v_cvt_u32_f32_e32 v3, v2
2755; GCN-NEXT:    v_mad_f32 v2, -v2, v4, v5
2756; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v4
2757; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
2758; GCN-NEXT:    v_and_b32_e32 v0, s8, v0
2759; GCN-NEXT:    v_addc_u32_e32 v2, vcc, 0, v3, vcc
2760; GCN-NEXT:    v_or_b32_e32 v0, v0, v1
2761; GCN-NEXT:    buffer_store_short v2, off, s[4:7], 0 offset:4
2762; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
2763; GCN-NEXT:    s_endpgm
2764  %r = udiv <3 x i16> %x, %y
2765  store <3 x i16> %r, <3 x i16> addrspace(1)* %out
2766  ret void
2767}
2768
2769define amdgpu_kernel void @urem_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x, <3 x i16> %y) {
2770; CHECK-LABEL: @urem_v3i16(
2771; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <3 x i16> [[X:%.*]], i64 0
2772; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <3 x i16> [[Y:%.*]], i64 0
2773; CHECK-NEXT:    [[TMP3:%.*]] = zext i16 [[TMP1]] to i32
2774; CHECK-NEXT:    [[TMP4:%.*]] = zext i16 [[TMP2]] to i32
2775; CHECK-NEXT:    [[TMP5:%.*]] = uitofp i32 [[TMP3]] to float
2776; CHECK-NEXT:    [[TMP6:%.*]] = uitofp i32 [[TMP4]] to float
2777; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP6]])
2778; CHECK-NEXT:    [[TMP8:%.*]] = fmul fast float [[TMP5]], [[TMP7]]
2779; CHECK-NEXT:    [[TMP9:%.*]] = call fast float @llvm.trunc.f32(float [[TMP8]])
2780; CHECK-NEXT:    [[TMP10:%.*]] = fneg fast float [[TMP9]]
2781; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP10]], float [[TMP6]], float [[TMP5]])
2782; CHECK-NEXT:    [[TMP12:%.*]] = fptoui float [[TMP9]] to i32
2783; CHECK-NEXT:    [[TMP13:%.*]] = call fast float @llvm.fabs.f32(float [[TMP11]])
2784; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP6]])
2785; CHECK-NEXT:    [[TMP15:%.*]] = fcmp fast oge float [[TMP13]], [[TMP14]]
2786; CHECK-NEXT:    [[TMP16:%.*]] = select i1 [[TMP15]], i32 1, i32 0
2787; CHECK-NEXT:    [[TMP17:%.*]] = add i32 [[TMP12]], [[TMP16]]
2788; CHECK-NEXT:    [[TMP18:%.*]] = mul i32 [[TMP17]], [[TMP4]]
2789; CHECK-NEXT:    [[TMP19:%.*]] = sub i32 [[TMP3]], [[TMP18]]
2790; CHECK-NEXT:    [[TMP20:%.*]] = and i32 [[TMP19]], 65535
2791; CHECK-NEXT:    [[TMP21:%.*]] = trunc i32 [[TMP20]] to i16
2792; CHECK-NEXT:    [[TMP22:%.*]] = insertelement <3 x i16> undef, i16 [[TMP21]], i64 0
2793; CHECK-NEXT:    [[TMP23:%.*]] = extractelement <3 x i16> [[X]], i64 1
2794; CHECK-NEXT:    [[TMP24:%.*]] = extractelement <3 x i16> [[Y]], i64 1
2795; CHECK-NEXT:    [[TMP25:%.*]] = zext i16 [[TMP23]] to i32
2796; CHECK-NEXT:    [[TMP26:%.*]] = zext i16 [[TMP24]] to i32
2797; CHECK-NEXT:    [[TMP27:%.*]] = uitofp i32 [[TMP25]] to float
2798; CHECK-NEXT:    [[TMP28:%.*]] = uitofp i32 [[TMP26]] to float
2799; CHECK-NEXT:    [[TMP29:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP28]])
2800; CHECK-NEXT:    [[TMP30:%.*]] = fmul fast float [[TMP27]], [[TMP29]]
2801; CHECK-NEXT:    [[TMP31:%.*]] = call fast float @llvm.trunc.f32(float [[TMP30]])
2802; CHECK-NEXT:    [[TMP32:%.*]] = fneg fast float [[TMP31]]
2803; CHECK-NEXT:    [[TMP33:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP32]], float [[TMP28]], float [[TMP27]])
2804; CHECK-NEXT:    [[TMP34:%.*]] = fptoui float [[TMP31]] to i32
2805; CHECK-NEXT:    [[TMP35:%.*]] = call fast float @llvm.fabs.f32(float [[TMP33]])
2806; CHECK-NEXT:    [[TMP36:%.*]] = call fast float @llvm.fabs.f32(float [[TMP28]])
2807; CHECK-NEXT:    [[TMP37:%.*]] = fcmp fast oge float [[TMP35]], [[TMP36]]
2808; CHECK-NEXT:    [[TMP38:%.*]] = select i1 [[TMP37]], i32 1, i32 0
2809; CHECK-NEXT:    [[TMP39:%.*]] = add i32 [[TMP34]], [[TMP38]]
2810; CHECK-NEXT:    [[TMP40:%.*]] = mul i32 [[TMP39]], [[TMP26]]
2811; CHECK-NEXT:    [[TMP41:%.*]] = sub i32 [[TMP25]], [[TMP40]]
2812; CHECK-NEXT:    [[TMP42:%.*]] = and i32 [[TMP41]], 65535
2813; CHECK-NEXT:    [[TMP43:%.*]] = trunc i32 [[TMP42]] to i16
2814; CHECK-NEXT:    [[TMP44:%.*]] = insertelement <3 x i16> [[TMP22]], i16 [[TMP43]], i64 1
2815; CHECK-NEXT:    [[TMP45:%.*]] = extractelement <3 x i16> [[X]], i64 2
2816; CHECK-NEXT:    [[TMP46:%.*]] = extractelement <3 x i16> [[Y]], i64 2
2817; CHECK-NEXT:    [[TMP47:%.*]] = zext i16 [[TMP45]] to i32
2818; CHECK-NEXT:    [[TMP48:%.*]] = zext i16 [[TMP46]] to i32
2819; CHECK-NEXT:    [[TMP49:%.*]] = uitofp i32 [[TMP47]] to float
2820; CHECK-NEXT:    [[TMP50:%.*]] = uitofp i32 [[TMP48]] to float
2821; CHECK-NEXT:    [[TMP51:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP50]])
2822; CHECK-NEXT:    [[TMP52:%.*]] = fmul fast float [[TMP49]], [[TMP51]]
2823; CHECK-NEXT:    [[TMP53:%.*]] = call fast float @llvm.trunc.f32(float [[TMP52]])
2824; CHECK-NEXT:    [[TMP54:%.*]] = fneg fast float [[TMP53]]
2825; CHECK-NEXT:    [[TMP55:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP54]], float [[TMP50]], float [[TMP49]])
2826; CHECK-NEXT:    [[TMP56:%.*]] = fptoui float [[TMP53]] to i32
2827; CHECK-NEXT:    [[TMP57:%.*]] = call fast float @llvm.fabs.f32(float [[TMP55]])
2828; CHECK-NEXT:    [[TMP58:%.*]] = call fast float @llvm.fabs.f32(float [[TMP50]])
2829; CHECK-NEXT:    [[TMP59:%.*]] = fcmp fast oge float [[TMP57]], [[TMP58]]
2830; CHECK-NEXT:    [[TMP60:%.*]] = select i1 [[TMP59]], i32 1, i32 0
2831; CHECK-NEXT:    [[TMP61:%.*]] = add i32 [[TMP56]], [[TMP60]]
2832; CHECK-NEXT:    [[TMP62:%.*]] = mul i32 [[TMP61]], [[TMP48]]
2833; CHECK-NEXT:    [[TMP63:%.*]] = sub i32 [[TMP47]], [[TMP62]]
2834; CHECK-NEXT:    [[TMP64:%.*]] = and i32 [[TMP63]], 65535
2835; CHECK-NEXT:    [[TMP65:%.*]] = trunc i32 [[TMP64]] to i16
2836; CHECK-NEXT:    [[TMP66:%.*]] = insertelement <3 x i16> [[TMP44]], i16 [[TMP65]], i64 2
2837; CHECK-NEXT:    store <3 x i16> [[TMP66]], <3 x i16> addrspace(1)* [[OUT:%.*]], align 8
2838; CHECK-NEXT:    ret void
2839;
2840; GCN-LABEL: urem_v3i16:
2841; GCN:       ; %bb.0:
2842; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
2843; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
2844; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
2845; GCN-NEXT:    s_mov_b32 s8, 0xffff
2846; GCN-NEXT:    s_mov_b32 s7, 0xf000
2847; GCN-NEXT:    s_waitcnt lgkmcnt(0)
2848; GCN-NEXT:    v_mov_b32_e32 v1, s2
2849; GCN-NEXT:    s_and_b32 s6, s0, s8
2850; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s6
2851; GCN-NEXT:    s_and_b32 s6, s2, s8
2852; GCN-NEXT:    v_cvt_f32_u32_e32 v2, s6
2853; GCN-NEXT:    v_mov_b32_e32 v4, s0
2854; GCN-NEXT:    v_rcp_iflag_f32_e32 v3, v0
2855; GCN-NEXT:    v_alignbit_b32 v4, s1, v4, 16
2856; GCN-NEXT:    v_and_b32_e32 v5, s8, v4
2857; GCN-NEXT:    v_alignbit_b32 v1, s3, v1, 16
2858; GCN-NEXT:    v_mul_f32_e32 v3, v2, v3
2859; GCN-NEXT:    v_trunc_f32_e32 v3, v3
2860; GCN-NEXT:    v_mad_f32 v2, -v3, v0, v2
2861; GCN-NEXT:    v_cvt_u32_f32_e32 v6, v3
2862; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v0
2863; GCN-NEXT:    v_cvt_f32_u32_e32 v2, v5
2864; GCN-NEXT:    v_and_b32_e32 v3, s8, v1
2865; GCN-NEXT:    v_addc_u32_e32 v0, vcc, 0, v6, vcc
2866; GCN-NEXT:    v_mul_lo_u32 v0, v0, s0
2867; GCN-NEXT:    s_and_b32 s0, s1, s8
2868; GCN-NEXT:    v_cvt_f32_u32_e32 v3, v3
2869; GCN-NEXT:    v_rcp_iflag_f32_e32 v5, v2
2870; GCN-NEXT:    v_cvt_f32_u32_e32 v6, s0
2871; GCN-NEXT:    s_and_b32 s0, s3, s8
2872; GCN-NEXT:    v_cvt_f32_u32_e32 v7, s0
2873; GCN-NEXT:    v_mul_f32_e32 v5, v3, v5
2874; GCN-NEXT:    v_trunc_f32_e32 v5, v5
2875; GCN-NEXT:    v_rcp_iflag_f32_e32 v8, v6
2876; GCN-NEXT:    v_mad_f32 v3, -v5, v2, v3
2877; GCN-NEXT:    v_cvt_u32_f32_e32 v5, v5
2878; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s2, v0
2879; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v2
2880; GCN-NEXT:    v_mul_f32_e32 v3, v7, v8
2881; GCN-NEXT:    v_addc_u32_e32 v2, vcc, 0, v5, vcc
2882; GCN-NEXT:    v_trunc_f32_e32 v3, v3
2883; GCN-NEXT:    v_mul_lo_u32 v2, v2, v4
2884; GCN-NEXT:    v_cvt_u32_f32_e32 v4, v3
2885; GCN-NEXT:    v_mad_f32 v3, -v3, v6, v7
2886; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v6
2887; GCN-NEXT:    s_mov_b32 s6, -1
2888; GCN-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
2889; GCN-NEXT:    v_mul_lo_u32 v3, v3, s1
2890; GCN-NEXT:    v_sub_i32_e32 v1, vcc, v1, v2
2891; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
2892; GCN-NEXT:    v_and_b32_e32 v0, s8, v0
2893; GCN-NEXT:    v_sub_i32_e32 v2, vcc, s3, v3
2894; GCN-NEXT:    v_or_b32_e32 v0, v0, v1
2895; GCN-NEXT:    buffer_store_short v2, off, s[4:7], 0 offset:4
2896; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
2897; GCN-NEXT:    s_endpgm
2898  %r = urem <3 x i16> %x, %y
2899  store <3 x i16> %r, <3 x i16> addrspace(1)* %out
2900  ret void
2901}
2902
; Signed division of two <3 x i16> kernel arguments; the quotient vector is
; stored to %out.
;
; The IR assertions below describe the output of the opt run with
; -amdgpu-codegenprepare. Each of the three lanes is handled separately:
;   * the lane is extracted from %x and %y and sign-extended to i32;
;   * both values are converted to float and the quotient is estimated as
;     x * rcp(y) using llvm.amdgcn.rcp.f32, then truncated toward zero;
;   * a remainder is formed with llvm.amdgcn.fmad.ftz.f32, and when
;     |remainder| >= |divisor| the quotient is adjusted by
;     (ashr(x ^ y, 30) | 1), otherwise by 0;
;   * the result is sign-extended from 16 bits (shl/ashr by 16), truncated to
;     i16 and inserted into the result vector.
; The ISA assertions that follow describe the llc output for the same kernel.
2903define amdgpu_kernel void @sdiv_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x, <3 x i16> %y) {
2904; CHECK-LABEL: @sdiv_v3i16(
2905; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <3 x i16> [[X:%.*]], i64 0
2906; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <3 x i16> [[Y:%.*]], i64 0
2907; CHECK-NEXT:    [[TMP3:%.*]] = sext i16 [[TMP1]] to i32
2908; CHECK-NEXT:    [[TMP4:%.*]] = sext i16 [[TMP2]] to i32
2909; CHECK-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]]
2910; CHECK-NEXT:    [[TMP6:%.*]] = ashr i32 [[TMP5]], 30
2911; CHECK-NEXT:    [[TMP7:%.*]] = or i32 [[TMP6]], 1
2912; CHECK-NEXT:    [[TMP8:%.*]] = sitofp i32 [[TMP3]] to float
2913; CHECK-NEXT:    [[TMP9:%.*]] = sitofp i32 [[TMP4]] to float
2914; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]])
2915; CHECK-NEXT:    [[TMP11:%.*]] = fmul fast float [[TMP8]], [[TMP10]]
2916; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.trunc.f32(float [[TMP11]])
2917; CHECK-NEXT:    [[TMP13:%.*]] = fneg fast float [[TMP12]]
2918; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP13]], float [[TMP9]], float [[TMP8]])
2919; CHECK-NEXT:    [[TMP15:%.*]] = fptosi float [[TMP12]] to i32
2920; CHECK-NEXT:    [[TMP16:%.*]] = call fast float @llvm.fabs.f32(float [[TMP14]])
2921; CHECK-NEXT:    [[TMP17:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]])
2922; CHECK-NEXT:    [[TMP18:%.*]] = fcmp fast oge float [[TMP16]], [[TMP17]]
2923; CHECK-NEXT:    [[TMP19:%.*]] = select i1 [[TMP18]], i32 [[TMP7]], i32 0
2924; CHECK-NEXT:    [[TMP20:%.*]] = add i32 [[TMP15]], [[TMP19]]
2925; CHECK-NEXT:    [[TMP21:%.*]] = shl i32 [[TMP20]], 16
2926; CHECK-NEXT:    [[TMP22:%.*]] = ashr i32 [[TMP21]], 16
2927; CHECK-NEXT:    [[TMP23:%.*]] = trunc i32 [[TMP22]] to i16
2928; CHECK-NEXT:    [[TMP24:%.*]] = insertelement <3 x i16> undef, i16 [[TMP23]], i64 0
2929; CHECK-NEXT:    [[TMP25:%.*]] = extractelement <3 x i16> [[X]], i64 1
2930; CHECK-NEXT:    [[TMP26:%.*]] = extractelement <3 x i16> [[Y]], i64 1
2931; CHECK-NEXT:    [[TMP27:%.*]] = sext i16 [[TMP25]] to i32
2932; CHECK-NEXT:    [[TMP28:%.*]] = sext i16 [[TMP26]] to i32
2933; CHECK-NEXT:    [[TMP29:%.*]] = xor i32 [[TMP27]], [[TMP28]]
2934; CHECK-NEXT:    [[TMP30:%.*]] = ashr i32 [[TMP29]], 30
2935; CHECK-NEXT:    [[TMP31:%.*]] = or i32 [[TMP30]], 1
2936; CHECK-NEXT:    [[TMP32:%.*]] = sitofp i32 [[TMP27]] to float
2937; CHECK-NEXT:    [[TMP33:%.*]] = sitofp i32 [[TMP28]] to float
2938; CHECK-NEXT:    [[TMP34:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP33]])
2939; CHECK-NEXT:    [[TMP35:%.*]] = fmul fast float [[TMP32]], [[TMP34]]
2940; CHECK-NEXT:    [[TMP36:%.*]] = call fast float @llvm.trunc.f32(float [[TMP35]])
2941; CHECK-NEXT:    [[TMP37:%.*]] = fneg fast float [[TMP36]]
2942; CHECK-NEXT:    [[TMP38:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP37]], float [[TMP33]], float [[TMP32]])
2943; CHECK-NEXT:    [[TMP39:%.*]] = fptosi float [[TMP36]] to i32
2944; CHECK-NEXT:    [[TMP40:%.*]] = call fast float @llvm.fabs.f32(float [[TMP38]])
2945; CHECK-NEXT:    [[TMP41:%.*]] = call fast float @llvm.fabs.f32(float [[TMP33]])
2946; CHECK-NEXT:    [[TMP42:%.*]] = fcmp fast oge float [[TMP40]], [[TMP41]]
2947; CHECK-NEXT:    [[TMP43:%.*]] = select i1 [[TMP42]], i32 [[TMP31]], i32 0
2948; CHECK-NEXT:    [[TMP44:%.*]] = add i32 [[TMP39]], [[TMP43]]
2949; CHECK-NEXT:    [[TMP45:%.*]] = shl i32 [[TMP44]], 16
2950; CHECK-NEXT:    [[TMP46:%.*]] = ashr i32 [[TMP45]], 16
2951; CHECK-NEXT:    [[TMP47:%.*]] = trunc i32 [[TMP46]] to i16
2952; CHECK-NEXT:    [[TMP48:%.*]] = insertelement <3 x i16> [[TMP24]], i16 [[TMP47]], i64 1
2953; CHECK-NEXT:    [[TMP49:%.*]] = extractelement <3 x i16> [[X]], i64 2
2954; CHECK-NEXT:    [[TMP50:%.*]] = extractelement <3 x i16> [[Y]], i64 2
2955; CHECK-NEXT:    [[TMP51:%.*]] = sext i16 [[TMP49]] to i32
2956; CHECK-NEXT:    [[TMP52:%.*]] = sext i16 [[TMP50]] to i32
2957; CHECK-NEXT:    [[TMP53:%.*]] = xor i32 [[TMP51]], [[TMP52]]
2958; CHECK-NEXT:    [[TMP54:%.*]] = ashr i32 [[TMP53]], 30
2959; CHECK-NEXT:    [[TMP55:%.*]] = or i32 [[TMP54]], 1
2960; CHECK-NEXT:    [[TMP56:%.*]] = sitofp i32 [[TMP51]] to float
2961; CHECK-NEXT:    [[TMP57:%.*]] = sitofp i32 [[TMP52]] to float
2962; CHECK-NEXT:    [[TMP58:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP57]])
2963; CHECK-NEXT:    [[TMP59:%.*]] = fmul fast float [[TMP56]], [[TMP58]]
2964; CHECK-NEXT:    [[TMP60:%.*]] = call fast float @llvm.trunc.f32(float [[TMP59]])
2965; CHECK-NEXT:    [[TMP61:%.*]] = fneg fast float [[TMP60]]
2966; CHECK-NEXT:    [[TMP62:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP61]], float [[TMP57]], float [[TMP56]])
2967; CHECK-NEXT:    [[TMP63:%.*]] = fptosi float [[TMP60]] to i32
2968; CHECK-NEXT:    [[TMP64:%.*]] = call fast float @llvm.fabs.f32(float [[TMP62]])
2969; CHECK-NEXT:    [[TMP65:%.*]] = call fast float @llvm.fabs.f32(float [[TMP57]])
2970; CHECK-NEXT:    [[TMP66:%.*]] = fcmp fast oge float [[TMP64]], [[TMP65]]
2971; CHECK-NEXT:    [[TMP67:%.*]] = select i1 [[TMP66]], i32 [[TMP55]], i32 0
2972; CHECK-NEXT:    [[TMP68:%.*]] = add i32 [[TMP63]], [[TMP67]]
2973; CHECK-NEXT:    [[TMP69:%.*]] = shl i32 [[TMP68]], 16
2974; CHECK-NEXT:    [[TMP70:%.*]] = ashr i32 [[TMP69]], 16
2975; CHECK-NEXT:    [[TMP71:%.*]] = trunc i32 [[TMP70]] to i16
2976; CHECK-NEXT:    [[TMP72:%.*]] = insertelement <3 x i16> [[TMP48]], i16 [[TMP71]], i64 2
2977; CHECK-NEXT:    store <3 x i16> [[TMP72]], <3 x i16> addrspace(1)* [[OUT:%.*]], align 8
2978; CHECK-NEXT:    ret void
2979;
2980; GCN-LABEL: sdiv_v3i16:
2981; GCN:       ; %bb.0:
2982; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
2983; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
2984; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
2985; GCN-NEXT:    s_mov_b32 s7, 0xf000
2986; GCN-NEXT:    s_mov_b32 s6, -1
2987; GCN-NEXT:    s_waitcnt lgkmcnt(0)
2988; GCN-NEXT:    s_sext_i32_i16 s9, s2
2989; GCN-NEXT:    s_sext_i32_i16 s8, s0
2990; GCN-NEXT:    v_cvt_f32_i32_e32 v0, s8
2991; GCN-NEXT:    v_cvt_f32_i32_e32 v1, s9
2992; GCN-NEXT:    s_xor_b32 s8, s9, s8
2993; GCN-NEXT:    s_ashr_i32 s0, s0, 16
2994; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v0
2995; GCN-NEXT:    s_ashr_i32 s8, s8, 30
2996; GCN-NEXT:    s_or_b32 s8, s8, 1
2997; GCN-NEXT:    v_mov_b32_e32 v3, s8
2998; GCN-NEXT:    v_mul_f32_e32 v2, v1, v2
2999; GCN-NEXT:    v_trunc_f32_e32 v2, v2
3000; GCN-NEXT:    v_mad_f32 v1, -v2, v0, v1
3001; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
3002; GCN-NEXT:    v_cvt_i32_f32_e32 v2, v2
3003; GCN-NEXT:    v_cvt_f32_i32_e32 v1, s0
3004; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
3005; GCN-NEXT:    s_ashr_i32 s2, s2, 16
3006; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
3007; GCN-NEXT:    v_cvt_f32_i32_e32 v2, s2
3008; GCN-NEXT:    v_rcp_iflag_f32_e32 v3, v1
3009; GCN-NEXT:    s_xor_b32 s0, s2, s0
3010; GCN-NEXT:    s_ashr_i32 s0, s0, 30
3011; GCN-NEXT:    s_or_b32 s0, s0, 1
3012; GCN-NEXT:    v_mul_f32_e32 v3, v2, v3
3013; GCN-NEXT:    v_trunc_f32_e32 v3, v3
3014; GCN-NEXT:    v_mad_f32 v2, -v3, v1, v2
3015; GCN-NEXT:    v_mov_b32_e32 v4, s0
3016; GCN-NEXT:    s_sext_i32_i16 s0, s1
3017; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, |v1|
3018; GCN-NEXT:    v_cvt_i32_f32_e32 v3, v3
3019; GCN-NEXT:    v_cvt_f32_i32_e32 v2, s0
3020; GCN-NEXT:    v_cndmask_b32_e32 v1, 0, v4, vcc
3021; GCN-NEXT:    s_sext_i32_i16 s1, s3
3022; GCN-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
3023; GCN-NEXT:    v_cvt_f32_i32_e32 v3, s1
3024; GCN-NEXT:    v_rcp_iflag_f32_e32 v4, v2
3025; GCN-NEXT:    s_xor_b32 s0, s1, s0
3026; GCN-NEXT:    s_ashr_i32 s0, s0, 30
3027; GCN-NEXT:    s_or_b32 s0, s0, 1
3028; GCN-NEXT:    v_mul_f32_e32 v4, v3, v4
3029; GCN-NEXT:    v_trunc_f32_e32 v4, v4
3030; GCN-NEXT:    v_mad_f32 v3, -v4, v2, v3
3031; GCN-NEXT:    v_cvt_i32_f32_e32 v4, v4
3032; GCN-NEXT:    v_mov_b32_e32 v5, s0
3033; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, |v2|
3034; GCN-NEXT:    v_cndmask_b32_e32 v2, 0, v5, vcc
3035; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
3036; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
3037; GCN-NEXT:    v_and_b32_e32 v0, 0xffff, v0
3038; GCN-NEXT:    v_or_b32_e32 v0, v0, v1
3039; GCN-NEXT:    buffer_store_short v2, off, s[4:7], 0 offset:4
3040; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
3041; GCN-NEXT:    s_endpgm
3042  %r = sdiv <3 x i16> %x, %y
3043  store <3 x i16> %r, <3 x i16> addrspace(1)* %out
3044  ret void
3045}
3046
; Signed remainder of two <3 x i16> kernel arguments; the remainder vector is
; stored to %out.
;
; The IR assertions below describe the output of the opt run with
; -amdgpu-codegenprepare. Each of the three lanes is handled separately:
;   * the lane is extracted from %x and %y and sign-extended to i32;
;   * a quotient is estimated as x * rcp(y) in float (llvm.amdgcn.rcp.f32),
;     truncated toward zero, and adjusted by (ashr(x ^ y, 30) | 1) when the
;     float remainder satisfies |remainder| >= |divisor|;
;   * the integer remainder is then computed as x - quotient * y;
;   * the result is sign-extended from 16 bits (shl/ashr by 16), truncated to
;     i16 and inserted into the result vector.
; The ISA assertions that follow describe the llc output for the same kernel.
3047define amdgpu_kernel void @srem_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x, <3 x i16> %y) {
3048; CHECK-LABEL: @srem_v3i16(
3049; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <3 x i16> [[X:%.*]], i64 0
3050; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <3 x i16> [[Y:%.*]], i64 0
3051; CHECK-NEXT:    [[TMP3:%.*]] = sext i16 [[TMP1]] to i32
3052; CHECK-NEXT:    [[TMP4:%.*]] = sext i16 [[TMP2]] to i32
3053; CHECK-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]]
3054; CHECK-NEXT:    [[TMP6:%.*]] = ashr i32 [[TMP5]], 30
3055; CHECK-NEXT:    [[TMP7:%.*]] = or i32 [[TMP6]], 1
3056; CHECK-NEXT:    [[TMP8:%.*]] = sitofp i32 [[TMP3]] to float
3057; CHECK-NEXT:    [[TMP9:%.*]] = sitofp i32 [[TMP4]] to float
3058; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]])
3059; CHECK-NEXT:    [[TMP11:%.*]] = fmul fast float [[TMP8]], [[TMP10]]
3060; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.trunc.f32(float [[TMP11]])
3061; CHECK-NEXT:    [[TMP13:%.*]] = fneg fast float [[TMP12]]
3062; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP13]], float [[TMP9]], float [[TMP8]])
3063; CHECK-NEXT:    [[TMP15:%.*]] = fptosi float [[TMP12]] to i32
3064; CHECK-NEXT:    [[TMP16:%.*]] = call fast float @llvm.fabs.f32(float [[TMP14]])
3065; CHECK-NEXT:    [[TMP17:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]])
3066; CHECK-NEXT:    [[TMP18:%.*]] = fcmp fast oge float [[TMP16]], [[TMP17]]
3067; CHECK-NEXT:    [[TMP19:%.*]] = select i1 [[TMP18]], i32 [[TMP7]], i32 0
3068; CHECK-NEXT:    [[TMP20:%.*]] = add i32 [[TMP15]], [[TMP19]]
3069; CHECK-NEXT:    [[TMP21:%.*]] = mul i32 [[TMP20]], [[TMP4]]
3070; CHECK-NEXT:    [[TMP22:%.*]] = sub i32 [[TMP3]], [[TMP21]]
3071; CHECK-NEXT:    [[TMP23:%.*]] = shl i32 [[TMP22]], 16
3072; CHECK-NEXT:    [[TMP24:%.*]] = ashr i32 [[TMP23]], 16
3073; CHECK-NEXT:    [[TMP25:%.*]] = trunc i32 [[TMP24]] to i16
3074; CHECK-NEXT:    [[TMP26:%.*]] = insertelement <3 x i16> undef, i16 [[TMP25]], i64 0
3075; CHECK-NEXT:    [[TMP27:%.*]] = extractelement <3 x i16> [[X]], i64 1
3076; CHECK-NEXT:    [[TMP28:%.*]] = extractelement <3 x i16> [[Y]], i64 1
3077; CHECK-NEXT:    [[TMP29:%.*]] = sext i16 [[TMP27]] to i32
3078; CHECK-NEXT:    [[TMP30:%.*]] = sext i16 [[TMP28]] to i32
3079; CHECK-NEXT:    [[TMP31:%.*]] = xor i32 [[TMP29]], [[TMP30]]
3080; CHECK-NEXT:    [[TMP32:%.*]] = ashr i32 [[TMP31]], 30
3081; CHECK-NEXT:    [[TMP33:%.*]] = or i32 [[TMP32]], 1
3082; CHECK-NEXT:    [[TMP34:%.*]] = sitofp i32 [[TMP29]] to float
3083; CHECK-NEXT:    [[TMP35:%.*]] = sitofp i32 [[TMP30]] to float
3084; CHECK-NEXT:    [[TMP36:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP35]])
3085; CHECK-NEXT:    [[TMP37:%.*]] = fmul fast float [[TMP34]], [[TMP36]]
3086; CHECK-NEXT:    [[TMP38:%.*]] = call fast float @llvm.trunc.f32(float [[TMP37]])
3087; CHECK-NEXT:    [[TMP39:%.*]] = fneg fast float [[TMP38]]
3088; CHECK-NEXT:    [[TMP40:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP39]], float [[TMP35]], float [[TMP34]])
3089; CHECK-NEXT:    [[TMP41:%.*]] = fptosi float [[TMP38]] to i32
3090; CHECK-NEXT:    [[TMP42:%.*]] = call fast float @llvm.fabs.f32(float [[TMP40]])
3091; CHECK-NEXT:    [[TMP43:%.*]] = call fast float @llvm.fabs.f32(float [[TMP35]])
3092; CHECK-NEXT:    [[TMP44:%.*]] = fcmp fast oge float [[TMP42]], [[TMP43]]
3093; CHECK-NEXT:    [[TMP45:%.*]] = select i1 [[TMP44]], i32 [[TMP33]], i32 0
3094; CHECK-NEXT:    [[TMP46:%.*]] = add i32 [[TMP41]], [[TMP45]]
3095; CHECK-NEXT:    [[TMP47:%.*]] = mul i32 [[TMP46]], [[TMP30]]
3096; CHECK-NEXT:    [[TMP48:%.*]] = sub i32 [[TMP29]], [[TMP47]]
3097; CHECK-NEXT:    [[TMP49:%.*]] = shl i32 [[TMP48]], 16
3098; CHECK-NEXT:    [[TMP50:%.*]] = ashr i32 [[TMP49]], 16
3099; CHECK-NEXT:    [[TMP51:%.*]] = trunc i32 [[TMP50]] to i16
3100; CHECK-NEXT:    [[TMP52:%.*]] = insertelement <3 x i16> [[TMP26]], i16 [[TMP51]], i64 1
3101; CHECK-NEXT:    [[TMP53:%.*]] = extractelement <3 x i16> [[X]], i64 2
3102; CHECK-NEXT:    [[TMP54:%.*]] = extractelement <3 x i16> [[Y]], i64 2
3103; CHECK-NEXT:    [[TMP55:%.*]] = sext i16 [[TMP53]] to i32
3104; CHECK-NEXT:    [[TMP56:%.*]] = sext i16 [[TMP54]] to i32
3105; CHECK-NEXT:    [[TMP57:%.*]] = xor i32 [[TMP55]], [[TMP56]]
3106; CHECK-NEXT:    [[TMP58:%.*]] = ashr i32 [[TMP57]], 30
3107; CHECK-NEXT:    [[TMP59:%.*]] = or i32 [[TMP58]], 1
3108; CHECK-NEXT:    [[TMP60:%.*]] = sitofp i32 [[TMP55]] to float
3109; CHECK-NEXT:    [[TMP61:%.*]] = sitofp i32 [[TMP56]] to float
3110; CHECK-NEXT:    [[TMP62:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP61]])
3111; CHECK-NEXT:    [[TMP63:%.*]] = fmul fast float [[TMP60]], [[TMP62]]
3112; CHECK-NEXT:    [[TMP64:%.*]] = call fast float @llvm.trunc.f32(float [[TMP63]])
3113; CHECK-NEXT:    [[TMP65:%.*]] = fneg fast float [[TMP64]]
3114; CHECK-NEXT:    [[TMP66:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP65]], float [[TMP61]], float [[TMP60]])
3115; CHECK-NEXT:    [[TMP67:%.*]] = fptosi float [[TMP64]] to i32
3116; CHECK-NEXT:    [[TMP68:%.*]] = call fast float @llvm.fabs.f32(float [[TMP66]])
3117; CHECK-NEXT:    [[TMP69:%.*]] = call fast float @llvm.fabs.f32(float [[TMP61]])
3118; CHECK-NEXT:    [[TMP70:%.*]] = fcmp fast oge float [[TMP68]], [[TMP69]]
3119; CHECK-NEXT:    [[TMP71:%.*]] = select i1 [[TMP70]], i32 [[TMP59]], i32 0
3120; CHECK-NEXT:    [[TMP72:%.*]] = add i32 [[TMP67]], [[TMP71]]
3121; CHECK-NEXT:    [[TMP73:%.*]] = mul i32 [[TMP72]], [[TMP56]]
3122; CHECK-NEXT:    [[TMP74:%.*]] = sub i32 [[TMP55]], [[TMP73]]
3123; CHECK-NEXT:    [[TMP75:%.*]] = shl i32 [[TMP74]], 16
3124; CHECK-NEXT:    [[TMP76:%.*]] = ashr i32 [[TMP75]], 16
3125; CHECK-NEXT:    [[TMP77:%.*]] = trunc i32 [[TMP76]] to i16
3126; CHECK-NEXT:    [[TMP78:%.*]] = insertelement <3 x i16> [[TMP52]], i16 [[TMP77]], i64 2
3127; CHECK-NEXT:    store <3 x i16> [[TMP78]], <3 x i16> addrspace(1)* [[OUT:%.*]], align 8
3128; CHECK-NEXT:    ret void
3129;
3130; GCN-LABEL: srem_v3i16:
3131; GCN:       ; %bb.0:
3132; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
3133; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
3134; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
3135; GCN-NEXT:    s_mov_b32 s7, 0xf000
3136; GCN-NEXT:    s_waitcnt lgkmcnt(0)
3137; GCN-NEXT:    s_sext_i32_i16 s8, s2
3138; GCN-NEXT:    s_sext_i32_i16 s6, s0
3139; GCN-NEXT:    v_cvt_f32_i32_e32 v0, s6
3140; GCN-NEXT:    v_cvt_f32_i32_e32 v1, s8
3141; GCN-NEXT:    s_xor_b32 s6, s8, s6
3142; GCN-NEXT:    s_ashr_i32 s6, s6, 30
3143; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v0
3144; GCN-NEXT:    s_or_b32 s6, s6, 1
3145; GCN-NEXT:    v_mov_b32_e32 v3, s6
3146; GCN-NEXT:    s_mov_b32 s6, -1
3147; GCN-NEXT:    v_mul_f32_e32 v2, v1, v2
3148; GCN-NEXT:    v_trunc_f32_e32 v2, v2
3149; GCN-NEXT:    v_mad_f32 v1, -v2, v0, v1
3150; GCN-NEXT:    v_cvt_i32_f32_e32 v2, v2
3151; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
3152; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
3153; GCN-NEXT:    v_mov_b32_e32 v1, s2
3154; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
3155; GCN-NEXT:    v_mov_b32_e32 v2, s0
3156; GCN-NEXT:    v_alignbit_b32 v2, s1, v2, 16
3157; GCN-NEXT:    v_bfe_i32 v3, v2, 0, 16
3158; GCN-NEXT:    v_cvt_f32_i32_e32 v4, v3
3159; GCN-NEXT:    v_alignbit_b32 v1, s3, v1, 16
3160; GCN-NEXT:    v_bfe_i32 v5, v1, 0, 16
3161; GCN-NEXT:    v_cvt_f32_i32_e32 v6, v5
3162; GCN-NEXT:    v_rcp_iflag_f32_e32 v7, v4
3163; GCN-NEXT:    v_mul_lo_u32 v0, v0, s0
3164; GCN-NEXT:    v_xor_b32_e32 v3, v5, v3
3165; GCN-NEXT:    s_sext_i32_i16 s0, s1
3166; GCN-NEXT:    v_mul_f32_e32 v5, v6, v7
3167; GCN-NEXT:    v_trunc_f32_e32 v5, v5
3168; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s2, v0
3169; GCN-NEXT:    v_mad_f32 v6, -v5, v4, v6
3170; GCN-NEXT:    v_cvt_i32_f32_e32 v5, v5
3171; GCN-NEXT:    v_ashrrev_i32_e32 v3, 30, v3
3172; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v6|, |v4|
3173; GCN-NEXT:    v_cvt_f32_i32_e32 v4, s0
3174; GCN-NEXT:    v_or_b32_e32 v3, 1, v3
3175; GCN-NEXT:    v_cndmask_b32_e32 v3, 0, v3, vcc
3176; GCN-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
3177; GCN-NEXT:    s_sext_i32_i16 s2, s3
3178; GCN-NEXT:    v_mul_lo_u32 v2, v3, v2
3179; GCN-NEXT:    v_cvt_f32_i32_e32 v3, s2
3180; GCN-NEXT:    v_rcp_iflag_f32_e32 v5, v4
3181; GCN-NEXT:    s_xor_b32 s0, s2, s0
3182; GCN-NEXT:    s_ashr_i32 s0, s0, 30
3183; GCN-NEXT:    s_or_b32 s0, s0, 1
3184; GCN-NEXT:    v_mul_f32_e32 v5, v3, v5
3185; GCN-NEXT:    v_trunc_f32_e32 v5, v5
3186; GCN-NEXT:    v_mad_f32 v3, -v5, v4, v3
3187; GCN-NEXT:    v_cvt_i32_f32_e32 v5, v5
3188; GCN-NEXT:    v_mov_b32_e32 v6, s0
3189; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, |v4|
3190; GCN-NEXT:    v_cndmask_b32_e32 v3, 0, v6, vcc
3191; GCN-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
3192; GCN-NEXT:    v_mul_lo_u32 v3, v3, s1
3193; GCN-NEXT:    v_sub_i32_e32 v1, vcc, v1, v2
3194; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
3195; GCN-NEXT:    v_and_b32_e32 v0, 0xffff, v0
3196; GCN-NEXT:    v_sub_i32_e32 v2, vcc, s3, v3
3197; GCN-NEXT:    v_or_b32_e32 v0, v0, v1
3198; GCN-NEXT:    buffer_store_short v2, off, s[4:7], 0 offset:4
3199; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
3200; GCN-NEXT:    s_endpgm
3201  %r = srem <3 x i16> %x, %y
3202  store <3 x i16> %r, <3 x i16> addrspace(1)* %out
3203  ret void
3204}
3205
; Unsigned division of two <3 x i15> kernel arguments (a non-power-of-two
; element width); the quotient vector is stored to %out.
;
; The IR assertions below describe the output of the opt run with
; -amdgpu-codegenprepare. Each of the three lanes is handled separately:
;   * the lane is extracted from %x and %y and zero-extended to i32;
;   * both values are converted to float and the quotient is estimated as
;     x * rcp(y) using llvm.amdgcn.rcp.f32, then truncated toward zero;
;   * a remainder is formed with llvm.amdgcn.fmad.ftz.f32, and the quotient is
;     incremented by 1 when |remainder| >= |divisor|;
;   * the result is masked to 15 bits (and with 32767), truncated to i15 and
;     inserted into the result vector.
; The ISA assertions that follow describe the llc output for the same kernel.
3206define amdgpu_kernel void @udiv_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x, <3 x i15> %y) {
3207; CHECK-LABEL: @udiv_v3i15(
3208; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <3 x i15> [[X:%.*]], i64 0
3209; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <3 x i15> [[Y:%.*]], i64 0
3210; CHECK-NEXT:    [[TMP3:%.*]] = zext i15 [[TMP1]] to i32
3211; CHECK-NEXT:    [[TMP4:%.*]] = zext i15 [[TMP2]] to i32
3212; CHECK-NEXT:    [[TMP5:%.*]] = uitofp i32 [[TMP3]] to float
3213; CHECK-NEXT:    [[TMP6:%.*]] = uitofp i32 [[TMP4]] to float
3214; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP6]])
3215; CHECK-NEXT:    [[TMP8:%.*]] = fmul fast float [[TMP5]], [[TMP7]]
3216; CHECK-NEXT:    [[TMP9:%.*]] = call fast float @llvm.trunc.f32(float [[TMP8]])
3217; CHECK-NEXT:    [[TMP10:%.*]] = fneg fast float [[TMP9]]
3218; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP10]], float [[TMP6]], float [[TMP5]])
3219; CHECK-NEXT:    [[TMP12:%.*]] = fptoui float [[TMP9]] to i32
3220; CHECK-NEXT:    [[TMP13:%.*]] = call fast float @llvm.fabs.f32(float [[TMP11]])
3221; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP6]])
3222; CHECK-NEXT:    [[TMP15:%.*]] = fcmp fast oge float [[TMP13]], [[TMP14]]
3223; CHECK-NEXT:    [[TMP16:%.*]] = select i1 [[TMP15]], i32 1, i32 0
3224; CHECK-NEXT:    [[TMP17:%.*]] = add i32 [[TMP12]], [[TMP16]]
3225; CHECK-NEXT:    [[TMP18:%.*]] = and i32 [[TMP17]], 32767
3226; CHECK-NEXT:    [[TMP19:%.*]] = trunc i32 [[TMP18]] to i15
3227; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <3 x i15> undef, i15 [[TMP19]], i64 0
3228; CHECK-NEXT:    [[TMP21:%.*]] = extractelement <3 x i15> [[X]], i64 1
3229; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <3 x i15> [[Y]], i64 1
3230; CHECK-NEXT:    [[TMP23:%.*]] = zext i15 [[TMP21]] to i32
3231; CHECK-NEXT:    [[TMP24:%.*]] = zext i15 [[TMP22]] to i32
3232; CHECK-NEXT:    [[TMP25:%.*]] = uitofp i32 [[TMP23]] to float
3233; CHECK-NEXT:    [[TMP26:%.*]] = uitofp i32 [[TMP24]] to float
3234; CHECK-NEXT:    [[TMP27:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP26]])
3235; CHECK-NEXT:    [[TMP28:%.*]] = fmul fast float [[TMP25]], [[TMP27]]
3236; CHECK-NEXT:    [[TMP29:%.*]] = call fast float @llvm.trunc.f32(float [[TMP28]])
3237; CHECK-NEXT:    [[TMP30:%.*]] = fneg fast float [[TMP29]]
3238; CHECK-NEXT:    [[TMP31:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP30]], float [[TMP26]], float [[TMP25]])
3239; CHECK-NEXT:    [[TMP32:%.*]] = fptoui float [[TMP29]] to i32
3240; CHECK-NEXT:    [[TMP33:%.*]] = call fast float @llvm.fabs.f32(float [[TMP31]])
3241; CHECK-NEXT:    [[TMP34:%.*]] = call fast float @llvm.fabs.f32(float [[TMP26]])
3242; CHECK-NEXT:    [[TMP35:%.*]] = fcmp fast oge float [[TMP33]], [[TMP34]]
3243; CHECK-NEXT:    [[TMP36:%.*]] = select i1 [[TMP35]], i32 1, i32 0
3244; CHECK-NEXT:    [[TMP37:%.*]] = add i32 [[TMP32]], [[TMP36]]
3245; CHECK-NEXT:    [[TMP38:%.*]] = and i32 [[TMP37]], 32767
3246; CHECK-NEXT:    [[TMP39:%.*]] = trunc i32 [[TMP38]] to i15
3247; CHECK-NEXT:    [[TMP40:%.*]] = insertelement <3 x i15> [[TMP20]], i15 [[TMP39]], i64 1
3248; CHECK-NEXT:    [[TMP41:%.*]] = extractelement <3 x i15> [[X]], i64 2
3249; CHECK-NEXT:    [[TMP42:%.*]] = extractelement <3 x i15> [[Y]], i64 2
3250; CHECK-NEXT:    [[TMP43:%.*]] = zext i15 [[TMP41]] to i32
3251; CHECK-NEXT:    [[TMP44:%.*]] = zext i15 [[TMP42]] to i32
3252; CHECK-NEXT:    [[TMP45:%.*]] = uitofp i32 [[TMP43]] to float
3253; CHECK-NEXT:    [[TMP46:%.*]] = uitofp i32 [[TMP44]] to float
3254; CHECK-NEXT:    [[TMP47:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP46]])
3255; CHECK-NEXT:    [[TMP48:%.*]] = fmul fast float [[TMP45]], [[TMP47]]
3256; CHECK-NEXT:    [[TMP49:%.*]] = call fast float @llvm.trunc.f32(float [[TMP48]])
3257; CHECK-NEXT:    [[TMP50:%.*]] = fneg fast float [[TMP49]]
3258; CHECK-NEXT:    [[TMP51:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP50]], float [[TMP46]], float [[TMP45]])
3259; CHECK-NEXT:    [[TMP52:%.*]] = fptoui float [[TMP49]] to i32
3260; CHECK-NEXT:    [[TMP53:%.*]] = call fast float @llvm.fabs.f32(float [[TMP51]])
3261; CHECK-NEXT:    [[TMP54:%.*]] = call fast float @llvm.fabs.f32(float [[TMP46]])
3262; CHECK-NEXT:    [[TMP55:%.*]] = fcmp fast oge float [[TMP53]], [[TMP54]]
3263; CHECK-NEXT:    [[TMP56:%.*]] = select i1 [[TMP55]], i32 1, i32 0
3264; CHECK-NEXT:    [[TMP57:%.*]] = add i32 [[TMP52]], [[TMP56]]
3265; CHECK-NEXT:    [[TMP58:%.*]] = and i32 [[TMP57]], 32767
3266; CHECK-NEXT:    [[TMP59:%.*]] = trunc i32 [[TMP58]] to i15
3267; CHECK-NEXT:    [[TMP60:%.*]] = insertelement <3 x i15> [[TMP40]], i15 [[TMP59]], i64 2
3268; CHECK-NEXT:    store <3 x i15> [[TMP60]], <3 x i15> addrspace(1)* [[OUT:%.*]], align 8
3269; CHECK-NEXT:    ret void
3270;
3271; GCN-LABEL: udiv_v3i15:
3272; GCN:       ; %bb.0:
3273; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
3274; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
3275; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
3276; GCN-NEXT:    s_mov_b32 s7, 0xf000
3277; GCN-NEXT:    s_mov_b32 s6, -1
3278; GCN-NEXT:    s_waitcnt lgkmcnt(0)
3279; GCN-NEXT:    v_mov_b32_e32 v0, s2
3280; GCN-NEXT:    v_alignbit_b32 v0, s3, v0, 30
3281; GCN-NEXT:    s_movk_i32 s3, 0x7fff
3282; GCN-NEXT:    s_and_b32 s9, s0, s3
3283; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s9
3284; GCN-NEXT:    v_mov_b32_e32 v2, s0
3285; GCN-NEXT:    s_and_b32 s8, s2, s3
3286; GCN-NEXT:    s_bfe_u32 s0, s0, 0xf000f
3287; GCN-NEXT:    v_cvt_f32_u32_e32 v5, s0
3288; GCN-NEXT:    v_cvt_f32_u32_e32 v3, s8
3289; GCN-NEXT:    v_rcp_iflag_f32_e32 v4, v1
3290; GCN-NEXT:    s_bfe_u32 s2, s2, 0xf000f
3291; GCN-NEXT:    v_alignbit_b32 v2, s1, v2, 30
3292; GCN-NEXT:    v_cvt_f32_u32_e32 v6, s2
3293; GCN-NEXT:    v_mul_f32_e32 v4, v3, v4
3294; GCN-NEXT:    v_rcp_iflag_f32_e32 v7, v5
3295; GCN-NEXT:    v_and_b32_e32 v2, s3, v2
3296; GCN-NEXT:    v_trunc_f32_e32 v4, v4
3297; GCN-NEXT:    v_mad_f32 v3, -v4, v1, v3
3298; GCN-NEXT:    v_cvt_u32_f32_e32 v4, v4
3299; GCN-NEXT:    v_cvt_f32_u32_e32 v2, v2
3300; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v1
3301; GCN-NEXT:    v_mul_f32_e32 v1, v6, v7
3302; GCN-NEXT:    v_and_b32_e32 v0, s3, v0
3303; GCN-NEXT:    v_trunc_f32_e32 v1, v1
3304; GCN-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
3305; GCN-NEXT:    v_mad_f32 v4, -v1, v5, v6
3306; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
3307; GCN-NEXT:    v_cvt_f32_u32_e32 v0, v0
3308; GCN-NEXT:    v_rcp_iflag_f32_e32 v6, v2
3309; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v4|, v5
3310; GCN-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
3311; GCN-NEXT:    v_mul_f32_e32 v1, v0, v6
3312; GCN-NEXT:    v_trunc_f32_e32 v1, v1
3313; GCN-NEXT:    v_cvt_u32_f32_e32 v5, v1
3314; GCN-NEXT:    v_mad_f32 v0, -v1, v2, v0
3315; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v0|, v2
3316; GCN-NEXT:    v_and_b32_e32 v2, s3, v3
3317; GCN-NEXT:    v_addc_u32_e32 v0, vcc, 0, v5, vcc
3318; GCN-NEXT:    v_and_b32_e32 v3, s3, v4
3319; GCN-NEXT:    v_lshlrev_b32_e32 v3, 15, v3
3320; GCN-NEXT:    v_lshl_b64 v[0:1], v[0:1], 30
3321; GCN-NEXT:    v_or_b32_e32 v2, v2, v3
3322; GCN-NEXT:    v_or_b32_e32 v0, v2, v0
3323; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
3324; GCN-NEXT:    s_waitcnt expcnt(0)
3325; GCN-NEXT:    v_and_b32_e32 v0, 0x1fff, v1
3326; GCN-NEXT:    buffer_store_short v0, off, s[4:7], 0 offset:4
3327; GCN-NEXT:    s_endpgm
3328  %r = udiv <3 x i15> %x, %y
3329  store <3 x i15> %r, <3 x i15> addrspace(1)* %out
3330  ret void
3331}
3332
3333define amdgpu_kernel void @urem_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x, <3 x i15> %y) {
3334; CHECK-LABEL: @urem_v3i15(
3335; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <3 x i15> [[X:%.*]], i64 0
3336; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <3 x i15> [[Y:%.*]], i64 0
3337; CHECK-NEXT:    [[TMP3:%.*]] = zext i15 [[TMP1]] to i32
3338; CHECK-NEXT:    [[TMP4:%.*]] = zext i15 [[TMP2]] to i32
3339; CHECK-NEXT:    [[TMP5:%.*]] = uitofp i32 [[TMP3]] to float
3340; CHECK-NEXT:    [[TMP6:%.*]] = uitofp i32 [[TMP4]] to float
3341; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP6]])
3342; CHECK-NEXT:    [[TMP8:%.*]] = fmul fast float [[TMP5]], [[TMP7]]
3343; CHECK-NEXT:    [[TMP9:%.*]] = call fast float @llvm.trunc.f32(float [[TMP8]])
3344; CHECK-NEXT:    [[TMP10:%.*]] = fneg fast float [[TMP9]]
3345; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP10]], float [[TMP6]], float [[TMP5]])
3346; CHECK-NEXT:    [[TMP12:%.*]] = fptoui float [[TMP9]] to i32
3347; CHECK-NEXT:    [[TMP13:%.*]] = call fast float @llvm.fabs.f32(float [[TMP11]])
3348; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP6]])
3349; CHECK-NEXT:    [[TMP15:%.*]] = fcmp fast oge float [[TMP13]], [[TMP14]]
3350; CHECK-NEXT:    [[TMP16:%.*]] = select i1 [[TMP15]], i32 1, i32 0
3351; CHECK-NEXT:    [[TMP17:%.*]] = add i32 [[TMP12]], [[TMP16]]
3352; CHECK-NEXT:    [[TMP18:%.*]] = mul i32 [[TMP17]], [[TMP4]]
3353; CHECK-NEXT:    [[TMP19:%.*]] = sub i32 [[TMP3]], [[TMP18]]
3354; CHECK-NEXT:    [[TMP20:%.*]] = and i32 [[TMP19]], 32767
3355; CHECK-NEXT:    [[TMP21:%.*]] = trunc i32 [[TMP20]] to i15
3356; CHECK-NEXT:    [[TMP22:%.*]] = insertelement <3 x i15> undef, i15 [[TMP21]], i64 0
3357; CHECK-NEXT:    [[TMP23:%.*]] = extractelement <3 x i15> [[X]], i64 1
3358; CHECK-NEXT:    [[TMP24:%.*]] = extractelement <3 x i15> [[Y]], i64 1
3359; CHECK-NEXT:    [[TMP25:%.*]] = zext i15 [[TMP23]] to i32
3360; CHECK-NEXT:    [[TMP26:%.*]] = zext i15 [[TMP24]] to i32
3361; CHECK-NEXT:    [[TMP27:%.*]] = uitofp i32 [[TMP25]] to float
3362; CHECK-NEXT:    [[TMP28:%.*]] = uitofp i32 [[TMP26]] to float
3363; CHECK-NEXT:    [[TMP29:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP28]])
3364; CHECK-NEXT:    [[TMP30:%.*]] = fmul fast float [[TMP27]], [[TMP29]]
3365; CHECK-NEXT:    [[TMP31:%.*]] = call fast float @llvm.trunc.f32(float [[TMP30]])
3366; CHECK-NEXT:    [[TMP32:%.*]] = fneg fast float [[TMP31]]
3367; CHECK-NEXT:    [[TMP33:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP32]], float [[TMP28]], float [[TMP27]])
3368; CHECK-NEXT:    [[TMP34:%.*]] = fptoui float [[TMP31]] to i32
3369; CHECK-NEXT:    [[TMP35:%.*]] = call fast float @llvm.fabs.f32(float [[TMP33]])
3370; CHECK-NEXT:    [[TMP36:%.*]] = call fast float @llvm.fabs.f32(float [[TMP28]])
3371; CHECK-NEXT:    [[TMP37:%.*]] = fcmp fast oge float [[TMP35]], [[TMP36]]
3372; CHECK-NEXT:    [[TMP38:%.*]] = select i1 [[TMP37]], i32 1, i32 0
3373; CHECK-NEXT:    [[TMP39:%.*]] = add i32 [[TMP34]], [[TMP38]]
3374; CHECK-NEXT:    [[TMP40:%.*]] = mul i32 [[TMP39]], [[TMP26]]
3375; CHECK-NEXT:    [[TMP41:%.*]] = sub i32 [[TMP25]], [[TMP40]]
3376; CHECK-NEXT:    [[TMP42:%.*]] = and i32 [[TMP41]], 32767
3377; CHECK-NEXT:    [[TMP43:%.*]] = trunc i32 [[TMP42]] to i15
3378; CHECK-NEXT:    [[TMP44:%.*]] = insertelement <3 x i15> [[TMP22]], i15 [[TMP43]], i64 1
3379; CHECK-NEXT:    [[TMP45:%.*]] = extractelement <3 x i15> [[X]], i64 2
3380; CHECK-NEXT:    [[TMP46:%.*]] = extractelement <3 x i15> [[Y]], i64 2
3381; CHECK-NEXT:    [[TMP47:%.*]] = zext i15 [[TMP45]] to i32
3382; CHECK-NEXT:    [[TMP48:%.*]] = zext i15 [[TMP46]] to i32
3383; CHECK-NEXT:    [[TMP49:%.*]] = uitofp i32 [[TMP47]] to float
3384; CHECK-NEXT:    [[TMP50:%.*]] = uitofp i32 [[TMP48]] to float
3385; CHECK-NEXT:    [[TMP51:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP50]])
3386; CHECK-NEXT:    [[TMP52:%.*]] = fmul fast float [[TMP49]], [[TMP51]]
3387; CHECK-NEXT:    [[TMP53:%.*]] = call fast float @llvm.trunc.f32(float [[TMP52]])
3388; CHECK-NEXT:    [[TMP54:%.*]] = fneg fast float [[TMP53]]
3389; CHECK-NEXT:    [[TMP55:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP54]], float [[TMP50]], float [[TMP49]])
3390; CHECK-NEXT:    [[TMP56:%.*]] = fptoui float [[TMP53]] to i32
3391; CHECK-NEXT:    [[TMP57:%.*]] = call fast float @llvm.fabs.f32(float [[TMP55]])
3392; CHECK-NEXT:    [[TMP58:%.*]] = call fast float @llvm.fabs.f32(float [[TMP50]])
3393; CHECK-NEXT:    [[TMP59:%.*]] = fcmp fast oge float [[TMP57]], [[TMP58]]
3394; CHECK-NEXT:    [[TMP60:%.*]] = select i1 [[TMP59]], i32 1, i32 0
3395; CHECK-NEXT:    [[TMP61:%.*]] = add i32 [[TMP56]], [[TMP60]]
3396; CHECK-NEXT:    [[TMP62:%.*]] = mul i32 [[TMP61]], [[TMP48]]
3397; CHECK-NEXT:    [[TMP63:%.*]] = sub i32 [[TMP47]], [[TMP62]]
3398; CHECK-NEXT:    [[TMP64:%.*]] = and i32 [[TMP63]], 32767
3399; CHECK-NEXT:    [[TMP65:%.*]] = trunc i32 [[TMP64]] to i15
3400; CHECK-NEXT:    [[TMP66:%.*]] = insertelement <3 x i15> [[TMP44]], i15 [[TMP65]], i64 2
3401; CHECK-NEXT:    store <3 x i15> [[TMP66]], <3 x i15> addrspace(1)* [[OUT:%.*]], align 8
3402; CHECK-NEXT:    ret void
3403;
3404; GCN-LABEL: urem_v3i15:
3405; GCN:       ; %bb.0:
3406; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
3407; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
3408; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
3409; GCN-NEXT:    s_mov_b32 s7, 0xf000
3410; GCN-NEXT:    s_mov_b32 s6, -1
3411; GCN-NEXT:    s_waitcnt lgkmcnt(0)
3412; GCN-NEXT:    v_mov_b32_e32 v0, s2
3413; GCN-NEXT:    v_alignbit_b32 v0, s3, v0, 30
3414; GCN-NEXT:    s_movk_i32 s3, 0x7fff
3415; GCN-NEXT:    s_and_b32 s10, s0, s3
3416; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s10
3417; GCN-NEXT:    s_and_b32 s9, s2, s3
3418; GCN-NEXT:    v_cvt_f32_u32_e32 v3, s9
3419; GCN-NEXT:    v_mov_b32_e32 v2, s0
3420; GCN-NEXT:    v_rcp_iflag_f32_e32 v4, v1
3421; GCN-NEXT:    v_alignbit_b32 v2, s1, v2, 30
3422; GCN-NEXT:    s_bfe_u32 s1, s0, 0xf000f
3423; GCN-NEXT:    v_cvt_f32_u32_e32 v5, s1
3424; GCN-NEXT:    v_mul_f32_e32 v4, v3, v4
3425; GCN-NEXT:    v_trunc_f32_e32 v4, v4
3426; GCN-NEXT:    v_mad_f32 v3, -v4, v1, v3
3427; GCN-NEXT:    v_cvt_u32_f32_e32 v4, v4
3428; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v1
3429; GCN-NEXT:    s_bfe_u32 s10, s2, 0xf000f
3430; GCN-NEXT:    v_cvt_f32_u32_e32 v3, s10
3431; GCN-NEXT:    v_addc_u32_e32 v1, vcc, 0, v4, vcc
3432; GCN-NEXT:    v_mul_lo_u32 v1, v1, s0
3433; GCN-NEXT:    v_rcp_iflag_f32_e32 v4, v5
3434; GCN-NEXT:    v_and_b32_e32 v2, s3, v2
3435; GCN-NEXT:    v_and_b32_e32 v0, s3, v0
3436; GCN-NEXT:    v_sub_i32_e32 v6, vcc, s2, v1
3437; GCN-NEXT:    v_mul_f32_e32 v1, v3, v4
3438; GCN-NEXT:    v_cvt_f32_u32_e32 v4, v2
3439; GCN-NEXT:    v_cvt_f32_u32_e32 v7, v0
3440; GCN-NEXT:    v_trunc_f32_e32 v1, v1
3441; GCN-NEXT:    v_mad_f32 v3, -v1, v5, v3
3442; GCN-NEXT:    v_rcp_iflag_f32_e32 v8, v4
3443; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v5
3444; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
3445; GCN-NEXT:    s_lshr_b32 s0, s0, 15
3446; GCN-NEXT:    v_mul_f32_e32 v3, v7, v8
3447; GCN-NEXT:    v_trunc_f32_e32 v3, v3
3448; GCN-NEXT:    v_cvt_u32_f32_e32 v5, v3
3449; GCN-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
3450; GCN-NEXT:    v_mad_f32 v3, -v3, v4, v7
3451; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v4
3452; GCN-NEXT:    v_addc_u32_e32 v3, vcc, 0, v5, vcc
3453; GCN-NEXT:    v_mul_lo_u32 v1, v1, s0
3454; GCN-NEXT:    v_mul_lo_u32 v2, v3, v2
3455; GCN-NEXT:    s_lshr_b32 s8, s2, 15
3456; GCN-NEXT:    v_sub_i32_e32 v3, vcc, s8, v1
3457; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, v2, v0
3458; GCN-NEXT:    v_and_b32_e32 v3, s3, v3
3459; GCN-NEXT:    v_lshl_b64 v[0:1], v[0:1], 30
3460; GCN-NEXT:    v_and_b32_e32 v2, s3, v6
3461; GCN-NEXT:    v_lshlrev_b32_e32 v3, 15, v3
3462; GCN-NEXT:    v_or_b32_e32 v2, v2, v3
3463; GCN-NEXT:    v_or_b32_e32 v0, v2, v0
3464; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
3465; GCN-NEXT:    s_waitcnt expcnt(0)
3466; GCN-NEXT:    v_and_b32_e32 v0, 0x1fff, v1
3467; GCN-NEXT:    buffer_store_short v0, off, s[4:7], 0 offset:4
3468; GCN-NEXT:    s_endpgm
3469  %r = urem <3 x i15> %x, %y
3470  store <3 x i15> %r, <3 x i15> addrspace(1)* %out
3471  ret void
3472}
3473
; sdiv_v3i15 - signed division of two <3 x i15> kernel arguments, with the
; result stored through %out.
;
; Lines with the CHECK prefix pin the IR expected from the opt run. Per lane,
; the operands are extracted, sign-extended to i32 and converted with sitofp.
; The quotient estimate is x * llvm.amdgcn.rcp.f32(y), rounded by
; llvm.trunc.f32. A residual from llvm.amdgcn.fmad.ftz.f32 is compared
; (fabs, fcmp oge) against the divisor to select a correction of
; ((x xor y) ashr 30) or 1, which is added to the converted quotient. The sum
; is narrowed with shl/ashr by 17 and trunc to i15, then inserted into the
; result vector.
;
; Lines with the GCN prefix pin the instruction sequence expected from the
; llc run, ending in a dword store plus a short store at offset 4.
3474define amdgpu_kernel void @sdiv_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x, <3 x i15> %y) {
3475; CHECK-LABEL: @sdiv_v3i15(
3476; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <3 x i15> [[X:%.*]], i64 0
3477; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <3 x i15> [[Y:%.*]], i64 0
3478; CHECK-NEXT:    [[TMP3:%.*]] = sext i15 [[TMP1]] to i32
3479; CHECK-NEXT:    [[TMP4:%.*]] = sext i15 [[TMP2]] to i32
3480; CHECK-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]]
3481; CHECK-NEXT:    [[TMP6:%.*]] = ashr i32 [[TMP5]], 30
3482; CHECK-NEXT:    [[TMP7:%.*]] = or i32 [[TMP6]], 1
3483; CHECK-NEXT:    [[TMP8:%.*]] = sitofp i32 [[TMP3]] to float
3484; CHECK-NEXT:    [[TMP9:%.*]] = sitofp i32 [[TMP4]] to float
3485; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]])
3486; CHECK-NEXT:    [[TMP11:%.*]] = fmul fast float [[TMP8]], [[TMP10]]
3487; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.trunc.f32(float [[TMP11]])
3488; CHECK-NEXT:    [[TMP13:%.*]] = fneg fast float [[TMP12]]
3489; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP13]], float [[TMP9]], float [[TMP8]])
3490; CHECK-NEXT:    [[TMP15:%.*]] = fptosi float [[TMP12]] to i32
3491; CHECK-NEXT:    [[TMP16:%.*]] = call fast float @llvm.fabs.f32(float [[TMP14]])
3492; CHECK-NEXT:    [[TMP17:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]])
3493; CHECK-NEXT:    [[TMP18:%.*]] = fcmp fast oge float [[TMP16]], [[TMP17]]
3494; CHECK-NEXT:    [[TMP19:%.*]] = select i1 [[TMP18]], i32 [[TMP7]], i32 0
3495; CHECK-NEXT:    [[TMP20:%.*]] = add i32 [[TMP15]], [[TMP19]]
3496; CHECK-NEXT:    [[TMP21:%.*]] = shl i32 [[TMP20]], 17
3497; CHECK-NEXT:    [[TMP22:%.*]] = ashr i32 [[TMP21]], 17
3498; CHECK-NEXT:    [[TMP23:%.*]] = trunc i32 [[TMP22]] to i15
3499; CHECK-NEXT:    [[TMP24:%.*]] = insertelement <3 x i15> undef, i15 [[TMP23]], i64 0
3500; CHECK-NEXT:    [[TMP25:%.*]] = extractelement <3 x i15> [[X]], i64 1
3501; CHECK-NEXT:    [[TMP26:%.*]] = extractelement <3 x i15> [[Y]], i64 1
3502; CHECK-NEXT:    [[TMP27:%.*]] = sext i15 [[TMP25]] to i32
3503; CHECK-NEXT:    [[TMP28:%.*]] = sext i15 [[TMP26]] to i32
3504; CHECK-NEXT:    [[TMP29:%.*]] = xor i32 [[TMP27]], [[TMP28]]
3505; CHECK-NEXT:    [[TMP30:%.*]] = ashr i32 [[TMP29]], 30
3506; CHECK-NEXT:    [[TMP31:%.*]] = or i32 [[TMP30]], 1
3507; CHECK-NEXT:    [[TMP32:%.*]] = sitofp i32 [[TMP27]] to float
3508; CHECK-NEXT:    [[TMP33:%.*]] = sitofp i32 [[TMP28]] to float
3509; CHECK-NEXT:    [[TMP34:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP33]])
3510; CHECK-NEXT:    [[TMP35:%.*]] = fmul fast float [[TMP32]], [[TMP34]]
3511; CHECK-NEXT:    [[TMP36:%.*]] = call fast float @llvm.trunc.f32(float [[TMP35]])
3512; CHECK-NEXT:    [[TMP37:%.*]] = fneg fast float [[TMP36]]
3513; CHECK-NEXT:    [[TMP38:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP37]], float [[TMP33]], float [[TMP32]])
3514; CHECK-NEXT:    [[TMP39:%.*]] = fptosi float [[TMP36]] to i32
3515; CHECK-NEXT:    [[TMP40:%.*]] = call fast float @llvm.fabs.f32(float [[TMP38]])
3516; CHECK-NEXT:    [[TMP41:%.*]] = call fast float @llvm.fabs.f32(float [[TMP33]])
3517; CHECK-NEXT:    [[TMP42:%.*]] = fcmp fast oge float [[TMP40]], [[TMP41]]
3518; CHECK-NEXT:    [[TMP43:%.*]] = select i1 [[TMP42]], i32 [[TMP31]], i32 0
3519; CHECK-NEXT:    [[TMP44:%.*]] = add i32 [[TMP39]], [[TMP43]]
3520; CHECK-NEXT:    [[TMP45:%.*]] = shl i32 [[TMP44]], 17
3521; CHECK-NEXT:    [[TMP46:%.*]] = ashr i32 [[TMP45]], 17
3522; CHECK-NEXT:    [[TMP47:%.*]] = trunc i32 [[TMP46]] to i15
3523; CHECK-NEXT:    [[TMP48:%.*]] = insertelement <3 x i15> [[TMP24]], i15 [[TMP47]], i64 1
3524; CHECK-NEXT:    [[TMP49:%.*]] = extractelement <3 x i15> [[X]], i64 2
3525; CHECK-NEXT:    [[TMP50:%.*]] = extractelement <3 x i15> [[Y]], i64 2
3526; CHECK-NEXT:    [[TMP51:%.*]] = sext i15 [[TMP49]] to i32
3527; CHECK-NEXT:    [[TMP52:%.*]] = sext i15 [[TMP50]] to i32
3528; CHECK-NEXT:    [[TMP53:%.*]] = xor i32 [[TMP51]], [[TMP52]]
3529; CHECK-NEXT:    [[TMP54:%.*]] = ashr i32 [[TMP53]], 30
3530; CHECK-NEXT:    [[TMP55:%.*]] = or i32 [[TMP54]], 1
3531; CHECK-NEXT:    [[TMP56:%.*]] = sitofp i32 [[TMP51]] to float
3532; CHECK-NEXT:    [[TMP57:%.*]] = sitofp i32 [[TMP52]] to float
3533; CHECK-NEXT:    [[TMP58:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP57]])
3534; CHECK-NEXT:    [[TMP59:%.*]] = fmul fast float [[TMP56]], [[TMP58]]
3535; CHECK-NEXT:    [[TMP60:%.*]] = call fast float @llvm.trunc.f32(float [[TMP59]])
3536; CHECK-NEXT:    [[TMP61:%.*]] = fneg fast float [[TMP60]]
3537; CHECK-NEXT:    [[TMP62:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP61]], float [[TMP57]], float [[TMP56]])
3538; CHECK-NEXT:    [[TMP63:%.*]] = fptosi float [[TMP60]] to i32
3539; CHECK-NEXT:    [[TMP64:%.*]] = call fast float @llvm.fabs.f32(float [[TMP62]])
3540; CHECK-NEXT:    [[TMP65:%.*]] = call fast float @llvm.fabs.f32(float [[TMP57]])
3541; CHECK-NEXT:    [[TMP66:%.*]] = fcmp fast oge float [[TMP64]], [[TMP65]]
3542; CHECK-NEXT:    [[TMP67:%.*]] = select i1 [[TMP66]], i32 [[TMP55]], i32 0
3543; CHECK-NEXT:    [[TMP68:%.*]] = add i32 [[TMP63]], [[TMP67]]
3544; CHECK-NEXT:    [[TMP69:%.*]] = shl i32 [[TMP68]], 17
3545; CHECK-NEXT:    [[TMP70:%.*]] = ashr i32 [[TMP69]], 17
3546; CHECK-NEXT:    [[TMP71:%.*]] = trunc i32 [[TMP70]] to i15
3547; CHECK-NEXT:    [[TMP72:%.*]] = insertelement <3 x i15> [[TMP48]], i15 [[TMP71]], i64 2
3548; CHECK-NEXT:    store <3 x i15> [[TMP72]], <3 x i15> addrspace(1)* [[OUT:%.*]], align 8
3549; CHECK-NEXT:    ret void
3550;
3551; GCN-LABEL: sdiv_v3i15:
3552; GCN:       ; %bb.0:
3553; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
3554; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
3555; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
3556; GCN-NEXT:    s_mov_b32 s7, 0xf000
3557; GCN-NEXT:    s_mov_b32 s6, -1
3558; GCN-NEXT:    s_waitcnt lgkmcnt(0)
3559; GCN-NEXT:    v_mov_b32_e32 v0, s2
3560; GCN-NEXT:    v_alignbit_b32 v0, s3, v0, 30
3561; GCN-NEXT:    s_bfe_i32 s3, s0, 0xf0000
3562; GCN-NEXT:    v_cvt_f32_i32_e32 v2, s3
3563; GCN-NEXT:    v_mov_b32_e32 v1, s0
3564; GCN-NEXT:    v_alignbit_b32 v1, s1, v1, 30
3565; GCN-NEXT:    s_bfe_i32 s1, s2, 0xf0000
3566; GCN-NEXT:    v_cvt_f32_i32_e32 v3, s1
3567; GCN-NEXT:    v_rcp_iflag_f32_e32 v4, v2
3568; GCN-NEXT:    s_xor_b32 s1, s1, s3
3569; GCN-NEXT:    s_bfe_i32 s0, s0, 0xf000f
3570; GCN-NEXT:    s_ashr_i32 s1, s1, 30
3571; GCN-NEXT:    v_mul_f32_e32 v4, v3, v4
3572; GCN-NEXT:    v_trunc_f32_e32 v4, v4
3573; GCN-NEXT:    v_mad_f32 v3, -v4, v2, v3
3574; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, |v2|
3575; GCN-NEXT:    v_cvt_i32_f32_e32 v4, v4
3576; GCN-NEXT:    v_cvt_f32_i32_e32 v3, s0
3577; GCN-NEXT:    s_or_b32 s1, s1, 1
3578; GCN-NEXT:    v_mov_b32_e32 v5, s1
3579; GCN-NEXT:    v_cndmask_b32_e32 v2, 0, v5, vcc
3580; GCN-NEXT:    s_bfe_i32 s1, s2, 0xf000f
3581; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
3582; GCN-NEXT:    v_cvt_f32_i32_e32 v4, s1
3583; GCN-NEXT:    v_rcp_iflag_f32_e32 v5, v3
3584; GCN-NEXT:    s_xor_b32 s0, s1, s0
3585; GCN-NEXT:    v_bfe_i32 v1, v1, 0, 15
3586; GCN-NEXT:    s_ashr_i32 s0, s0, 30
3587; GCN-NEXT:    v_mul_f32_e32 v5, v4, v5
3588; GCN-NEXT:    v_trunc_f32_e32 v5, v5
3589; GCN-NEXT:    v_mad_f32 v4, -v5, v3, v4
3590; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v4|, |v3|
3591; GCN-NEXT:    v_cvt_i32_f32_e32 v5, v5
3592; GCN-NEXT:    v_cvt_f32_i32_e32 v4, v1
3593; GCN-NEXT:    s_or_b32 s0, s0, 1
3594; GCN-NEXT:    v_mov_b32_e32 v6, s0
3595; GCN-NEXT:    v_cndmask_b32_e32 v3, 0, v6, vcc
3596; GCN-NEXT:    v_bfe_i32 v0, v0, 0, 15
3597; GCN-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
3598; GCN-NEXT:    v_cvt_f32_i32_e32 v5, v0
3599; GCN-NEXT:    v_rcp_iflag_f32_e32 v6, v4
3600; GCN-NEXT:    v_xor_b32_e32 v0, v0, v1
3601; GCN-NEXT:    v_ashrrev_i32_e32 v0, 30, v0
3602; GCN-NEXT:    v_or_b32_e32 v0, 1, v0
3603; GCN-NEXT:    v_mul_f32_e32 v1, v5, v6
3604; GCN-NEXT:    v_trunc_f32_e32 v1, v1
3605; GCN-NEXT:    v_mad_f32 v5, -v1, v4, v5
3606; GCN-NEXT:    v_cvt_i32_f32_e32 v1, v1
3607; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v5|, |v4|
3608; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
3609; GCN-NEXT:    s_movk_i32 s0, 0x7fff
3610; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
3611; GCN-NEXT:    v_and_b32_e32 v3, s0, v3
3612; GCN-NEXT:    v_lshl_b64 v[0:1], v[0:1], 30
3613; GCN-NEXT:    v_and_b32_e32 v2, s0, v2
3614; GCN-NEXT:    v_lshlrev_b32_e32 v3, 15, v3
3615; GCN-NEXT:    v_or_b32_e32 v2, v2, v3
3616; GCN-NEXT:    v_or_b32_e32 v0, v2, v0
3617; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
3618; GCN-NEXT:    s_waitcnt expcnt(0)
3619; GCN-NEXT:    v_and_b32_e32 v0, 0x1fff, v1
3620; GCN-NEXT:    buffer_store_short v0, off, s[4:7], 0 offset:4
3621; GCN-NEXT:    s_endpgm
; Test input proper. Everything above is the expected output of the two runs.
3622  %r = sdiv <3 x i15> %x, %y
3623  store <3 x i15> %r, <3 x i15> addrspace(1)* %out
3624  ret void
3625}
3626
; srem_v3i15 - signed remainder of two <3 x i15> kernel arguments, with the
; result stored through %out.
;
; Lines with the CHECK prefix pin the IR expected from the opt run. Per lane,
; a quotient is formed from the sign-extended i32 operands via sitofp,
; llvm.amdgcn.rcp.f32, fmul and llvm.trunc.f32. A correction of
; ((x xor y) ashr 30) or 1 is added when the llvm.amdgcn.fmad.ftz.f32
; residual is at least the divisor in magnitude. The remainder is then
; x - quotient * y, narrowed with shl/ashr by 17 and trunc to i15 before
; being inserted into the result vector.
;
; Lines with the GCN prefix pin the instruction sequence expected from the
; llc run, ending in a dword store plus a short store at offset 4.
3627define amdgpu_kernel void @srem_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x, <3 x i15> %y) {
3628; CHECK-LABEL: @srem_v3i15(
3629; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <3 x i15> [[X:%.*]], i64 0
3630; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <3 x i15> [[Y:%.*]], i64 0
3631; CHECK-NEXT:    [[TMP3:%.*]] = sext i15 [[TMP1]] to i32
3632; CHECK-NEXT:    [[TMP4:%.*]] = sext i15 [[TMP2]] to i32
3633; CHECK-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]]
3634; CHECK-NEXT:    [[TMP6:%.*]] = ashr i32 [[TMP5]], 30
3635; CHECK-NEXT:    [[TMP7:%.*]] = or i32 [[TMP6]], 1
3636; CHECK-NEXT:    [[TMP8:%.*]] = sitofp i32 [[TMP3]] to float
3637; CHECK-NEXT:    [[TMP9:%.*]] = sitofp i32 [[TMP4]] to float
3638; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]])
3639; CHECK-NEXT:    [[TMP11:%.*]] = fmul fast float [[TMP8]], [[TMP10]]
3640; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.trunc.f32(float [[TMP11]])
3641; CHECK-NEXT:    [[TMP13:%.*]] = fneg fast float [[TMP12]]
3642; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP13]], float [[TMP9]], float [[TMP8]])
3643; CHECK-NEXT:    [[TMP15:%.*]] = fptosi float [[TMP12]] to i32
3644; CHECK-NEXT:    [[TMP16:%.*]] = call fast float @llvm.fabs.f32(float [[TMP14]])
3645; CHECK-NEXT:    [[TMP17:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]])
3646; CHECK-NEXT:    [[TMP18:%.*]] = fcmp fast oge float [[TMP16]], [[TMP17]]
3647; CHECK-NEXT:    [[TMP19:%.*]] = select i1 [[TMP18]], i32 [[TMP7]], i32 0
3648; CHECK-NEXT:    [[TMP20:%.*]] = add i32 [[TMP15]], [[TMP19]]
3649; CHECK-NEXT:    [[TMP21:%.*]] = mul i32 [[TMP20]], [[TMP4]]
3650; CHECK-NEXT:    [[TMP22:%.*]] = sub i32 [[TMP3]], [[TMP21]]
3651; CHECK-NEXT:    [[TMP23:%.*]] = shl i32 [[TMP22]], 17
3652; CHECK-NEXT:    [[TMP24:%.*]] = ashr i32 [[TMP23]], 17
3653; CHECK-NEXT:    [[TMP25:%.*]] = trunc i32 [[TMP24]] to i15
3654; CHECK-NEXT:    [[TMP26:%.*]] = insertelement <3 x i15> undef, i15 [[TMP25]], i64 0
3655; CHECK-NEXT:    [[TMP27:%.*]] = extractelement <3 x i15> [[X]], i64 1
3656; CHECK-NEXT:    [[TMP28:%.*]] = extractelement <3 x i15> [[Y]], i64 1
3657; CHECK-NEXT:    [[TMP29:%.*]] = sext i15 [[TMP27]] to i32
3658; CHECK-NEXT:    [[TMP30:%.*]] = sext i15 [[TMP28]] to i32
3659; CHECK-NEXT:    [[TMP31:%.*]] = xor i32 [[TMP29]], [[TMP30]]
3660; CHECK-NEXT:    [[TMP32:%.*]] = ashr i32 [[TMP31]], 30
3661; CHECK-NEXT:    [[TMP33:%.*]] = or i32 [[TMP32]], 1
3662; CHECK-NEXT:    [[TMP34:%.*]] = sitofp i32 [[TMP29]] to float
3663; CHECK-NEXT:    [[TMP35:%.*]] = sitofp i32 [[TMP30]] to float
3664; CHECK-NEXT:    [[TMP36:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP35]])
3665; CHECK-NEXT:    [[TMP37:%.*]] = fmul fast float [[TMP34]], [[TMP36]]
3666; CHECK-NEXT:    [[TMP38:%.*]] = call fast float @llvm.trunc.f32(float [[TMP37]])
3667; CHECK-NEXT:    [[TMP39:%.*]] = fneg fast float [[TMP38]]
3668; CHECK-NEXT:    [[TMP40:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP39]], float [[TMP35]], float [[TMP34]])
3669; CHECK-NEXT:    [[TMP41:%.*]] = fptosi float [[TMP38]] to i32
3670; CHECK-NEXT:    [[TMP42:%.*]] = call fast float @llvm.fabs.f32(float [[TMP40]])
3671; CHECK-NEXT:    [[TMP43:%.*]] = call fast float @llvm.fabs.f32(float [[TMP35]])
3672; CHECK-NEXT:    [[TMP44:%.*]] = fcmp fast oge float [[TMP42]], [[TMP43]]
3673; CHECK-NEXT:    [[TMP45:%.*]] = select i1 [[TMP44]], i32 [[TMP33]], i32 0
3674; CHECK-NEXT:    [[TMP46:%.*]] = add i32 [[TMP41]], [[TMP45]]
3675; CHECK-NEXT:    [[TMP47:%.*]] = mul i32 [[TMP46]], [[TMP30]]
3676; CHECK-NEXT:    [[TMP48:%.*]] = sub i32 [[TMP29]], [[TMP47]]
3677; CHECK-NEXT:    [[TMP49:%.*]] = shl i32 [[TMP48]], 17
3678; CHECK-NEXT:    [[TMP50:%.*]] = ashr i32 [[TMP49]], 17
3679; CHECK-NEXT:    [[TMP51:%.*]] = trunc i32 [[TMP50]] to i15
3680; CHECK-NEXT:    [[TMP52:%.*]] = insertelement <3 x i15> [[TMP26]], i15 [[TMP51]], i64 1
3681; CHECK-NEXT:    [[TMP53:%.*]] = extractelement <3 x i15> [[X]], i64 2
3682; CHECK-NEXT:    [[TMP54:%.*]] = extractelement <3 x i15> [[Y]], i64 2
3683; CHECK-NEXT:    [[TMP55:%.*]] = sext i15 [[TMP53]] to i32
3684; CHECK-NEXT:    [[TMP56:%.*]] = sext i15 [[TMP54]] to i32
3685; CHECK-NEXT:    [[TMP57:%.*]] = xor i32 [[TMP55]], [[TMP56]]
3686; CHECK-NEXT:    [[TMP58:%.*]] = ashr i32 [[TMP57]], 30
3687; CHECK-NEXT:    [[TMP59:%.*]] = or i32 [[TMP58]], 1
3688; CHECK-NEXT:    [[TMP60:%.*]] = sitofp i32 [[TMP55]] to float
3689; CHECK-NEXT:    [[TMP61:%.*]] = sitofp i32 [[TMP56]] to float
3690; CHECK-NEXT:    [[TMP62:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP61]])
3691; CHECK-NEXT:    [[TMP63:%.*]] = fmul fast float [[TMP60]], [[TMP62]]
3692; CHECK-NEXT:    [[TMP64:%.*]] = call fast float @llvm.trunc.f32(float [[TMP63]])
3693; CHECK-NEXT:    [[TMP65:%.*]] = fneg fast float [[TMP64]]
3694; CHECK-NEXT:    [[TMP66:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP65]], float [[TMP61]], float [[TMP60]])
3695; CHECK-NEXT:    [[TMP67:%.*]] = fptosi float [[TMP64]] to i32
3696; CHECK-NEXT:    [[TMP68:%.*]] = call fast float @llvm.fabs.f32(float [[TMP66]])
3697; CHECK-NEXT:    [[TMP69:%.*]] = call fast float @llvm.fabs.f32(float [[TMP61]])
3698; CHECK-NEXT:    [[TMP70:%.*]] = fcmp fast oge float [[TMP68]], [[TMP69]]
3699; CHECK-NEXT:    [[TMP71:%.*]] = select i1 [[TMP70]], i32 [[TMP59]], i32 0
3700; CHECK-NEXT:    [[TMP72:%.*]] = add i32 [[TMP67]], [[TMP71]]
3701; CHECK-NEXT:    [[TMP73:%.*]] = mul i32 [[TMP72]], [[TMP56]]
3702; CHECK-NEXT:    [[TMP74:%.*]] = sub i32 [[TMP55]], [[TMP73]]
3703; CHECK-NEXT:    [[TMP75:%.*]] = shl i32 [[TMP74]], 17
3704; CHECK-NEXT:    [[TMP76:%.*]] = ashr i32 [[TMP75]], 17
3705; CHECK-NEXT:    [[TMP77:%.*]] = trunc i32 [[TMP76]] to i15
3706; CHECK-NEXT:    [[TMP78:%.*]] = insertelement <3 x i15> [[TMP52]], i15 [[TMP77]], i64 2
3707; CHECK-NEXT:    store <3 x i15> [[TMP78]], <3 x i15> addrspace(1)* [[OUT:%.*]], align 8
3708; CHECK-NEXT:    ret void
3709;
3710; GCN-LABEL: srem_v3i15:
3711; GCN:       ; %bb.0:
3712; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
3713; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
3714; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
3715; GCN-NEXT:    s_mov_b32 s7, 0xf000
3716; GCN-NEXT:    s_mov_b32 s6, -1
3717; GCN-NEXT:    s_waitcnt lgkmcnt(0)
3718; GCN-NEXT:    v_mov_b32_e32 v0, s2
3719; GCN-NEXT:    v_alignbit_b32 v0, s3, v0, 30
3720; GCN-NEXT:    s_movk_i32 s3, 0x7fff
3721; GCN-NEXT:    s_and_b32 s11, s0, s3
3722; GCN-NEXT:    s_bfe_i32 s11, s11, 0xf0000
3723; GCN-NEXT:    v_cvt_f32_i32_e32 v2, s11
3724; GCN-NEXT:    s_and_b32 s9, s2, s3
3725; GCN-NEXT:    s_bfe_i32 s9, s9, 0xf0000
3726; GCN-NEXT:    v_cvt_f32_i32_e32 v3, s9
3727; GCN-NEXT:    v_rcp_iflag_f32_e32 v4, v2
3728; GCN-NEXT:    s_xor_b32 s9, s9, s11
3729; GCN-NEXT:    s_ashr_i32 s9, s9, 30
3730; GCN-NEXT:    s_or_b32 s9, s9, 1
3731; GCN-NEXT:    v_mul_f32_e32 v4, v3, v4
3732; GCN-NEXT:    v_trunc_f32_e32 v4, v4
3733; GCN-NEXT:    v_mad_f32 v3, -v4, v2, v3
3734; GCN-NEXT:    v_cvt_i32_f32_e32 v4, v4
3735; GCN-NEXT:    v_mov_b32_e32 v5, s9
3736; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, |v2|
3737; GCN-NEXT:    v_cndmask_b32_e32 v2, 0, v5, vcc
3738; GCN-NEXT:    v_mov_b32_e32 v1, s0
3739; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
3740; GCN-NEXT:    s_bfe_u32 s12, s0, 0xf000f
3741; GCN-NEXT:    v_alignbit_b32 v1, s1, v1, 30
3742; GCN-NEXT:    v_mul_lo_u32 v2, v2, s0
3743; GCN-NEXT:    s_lshr_b32 s1, s0, 15
3744; GCN-NEXT:    s_bfe_i32 s0, s12, 0xf0000
3745; GCN-NEXT:    v_cvt_f32_i32_e32 v3, s0
3746; GCN-NEXT:    s_bfe_u32 s10, s2, 0xf000f
3747; GCN-NEXT:    v_sub_i32_e32 v2, vcc, s2, v2
3748; GCN-NEXT:    s_lshr_b32 s8, s2, 15
3749; GCN-NEXT:    s_bfe_i32 s2, s10, 0xf0000
3750; GCN-NEXT:    v_cvt_f32_i32_e32 v4, s2
3751; GCN-NEXT:    v_rcp_iflag_f32_e32 v5, v3
3752; GCN-NEXT:    s_xor_b32 s0, s2, s0
3753; GCN-NEXT:    s_ashr_i32 s0, s0, 30
3754; GCN-NEXT:    s_or_b32 s0, s0, 1
3755; GCN-NEXT:    v_mul_f32_e32 v5, v4, v5
3756; GCN-NEXT:    v_trunc_f32_e32 v5, v5
3757; GCN-NEXT:    v_mad_f32 v4, -v5, v3, v4
3758; GCN-NEXT:    v_cvt_i32_f32_e32 v5, v5
3759; GCN-NEXT:    v_and_b32_e32 v1, s3, v1
3760; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v4|, |v3|
3761; GCN-NEXT:    v_mov_b32_e32 v6, s0
3762; GCN-NEXT:    v_cndmask_b32_e32 v3, 0, v6, vcc
3763; GCN-NEXT:    v_bfe_i32 v4, v1, 0, 15
3764; GCN-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
3765; GCN-NEXT:    v_cvt_f32_i32_e32 v5, v4
3766; GCN-NEXT:    v_and_b32_e32 v0, s3, v0
3767; GCN-NEXT:    v_bfe_i32 v6, v0, 0, 15
3768; GCN-NEXT:    v_cvt_f32_i32_e32 v7, v6
3769; GCN-NEXT:    v_rcp_iflag_f32_e32 v8, v5
3770; GCN-NEXT:    v_xor_b32_e32 v4, v6, v4
3771; GCN-NEXT:    v_ashrrev_i32_e32 v4, 30, v4
3772; GCN-NEXT:    v_or_b32_e32 v4, 1, v4
3773; GCN-NEXT:    v_mul_f32_e32 v6, v7, v8
3774; GCN-NEXT:    v_trunc_f32_e32 v6, v6
3775; GCN-NEXT:    v_mad_f32 v7, -v6, v5, v7
3776; GCN-NEXT:    v_cvt_i32_f32_e32 v6, v6
3777; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v7|, |v5|
3778; GCN-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
3779; GCN-NEXT:    v_mul_lo_u32 v3, v3, s1
3780; GCN-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
3781; GCN-NEXT:    v_mul_lo_u32 v1, v4, v1
3782; GCN-NEXT:    v_and_b32_e32 v2, s3, v2
3783; GCN-NEXT:    v_sub_i32_e32 v3, vcc, s8, v3
3784; GCN-NEXT:    v_and_b32_e32 v3, s3, v3
3785; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, v1, v0
3786; GCN-NEXT:    v_lshlrev_b32_e32 v3, 15, v3
3787; GCN-NEXT:    v_lshl_b64 v[0:1], v[0:1], 30
3788; GCN-NEXT:    v_or_b32_e32 v2, v2, v3
3789; GCN-NEXT:    v_or_b32_e32 v0, v2, v0
3790; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
3791; GCN-NEXT:    s_waitcnt expcnt(0)
3792; GCN-NEXT:    v_and_b32_e32 v0, 0x1fff, v1
3793; GCN-NEXT:    buffer_store_short v0, off, s[4:7], 0 offset:4
3794; GCN-NEXT:    s_endpgm
; Test input proper. Everything above is the expected output of the two runs.
3795  %r = srem <3 x i15> %x, %y
3796  store <3 x i15> %r, <3 x i15> addrspace(1)* %out
3797  ret void
3798}
3799
; udiv_i32_oddk_denom - unsigned i32 division of %x by the odd constant
; 1235195, with the quotient stored through %out.
;
; Lines with the CHECK prefix expect the opt run to leave the udiv untouched.
; Lines with the GCN prefix expect the llc run to emit no divide sequence.
; Instead it takes v_mul_hi_u32 of %x and 0xb2a50881, then applies a
; subtract, shift right by 1, add, and final shift right by 20.
3800define amdgpu_kernel void @udiv_i32_oddk_denom(i32 addrspace(1)* %out, i32 %x) {
3801; CHECK-LABEL: @udiv_i32_oddk_denom(
3802; CHECK-NEXT:    [[R:%.*]] = udiv i32 [[X:%.*]], 1235195
3803; CHECK-NEXT:    store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4
3804; CHECK-NEXT:    ret void
3805;
3806; GCN-LABEL: udiv_i32_oddk_denom:
3807; GCN:       ; %bb.0:
3808; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
3809; GCN-NEXT:    s_load_dword s0, s[0:1], 0xb
3810; GCN-NEXT:    v_mov_b32_e32 v0, 0xb2a50881
3811; GCN-NEXT:    s_mov_b32 s7, 0xf000
3812; GCN-NEXT:    s_mov_b32 s6, -1
3813; GCN-NEXT:    s_waitcnt lgkmcnt(0)
3814; GCN-NEXT:    v_mul_hi_u32 v0, s0, v0
3815; GCN-NEXT:    v_sub_i32_e32 v1, vcc, s0, v0
3816; GCN-NEXT:    v_lshrrev_b32_e32 v1, 1, v1
3817; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
3818; GCN-NEXT:    v_lshrrev_b32_e32 v0, 20, v0
3819; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
3820; GCN-NEXT:    s_endpgm
; Test input proper. Everything above is the expected output of the two runs.
3821  %r = udiv i32 %x, 1235195
3822  store i32 %r, i32 addrspace(1)* %out
3823  ret void
3824}
3825
; udiv_i32_pow2k_denom - unsigned i32 division of %x by the constant 4096,
; with the quotient stored through %out.
;
; Lines with the CHECK prefix expect the opt run to leave the udiv untouched.
; Lines with the GCN prefix expect the llc run to emit a single s_lshr_b32
; by 12 in place of the division.
3826define amdgpu_kernel void @udiv_i32_pow2k_denom(i32 addrspace(1)* %out, i32 %x) {
3827; CHECK-LABEL: @udiv_i32_pow2k_denom(
3828; CHECK-NEXT:    [[R:%.*]] = udiv i32 [[X:%.*]], 4096
3829; CHECK-NEXT:    store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4
3830; CHECK-NEXT:    ret void
3831;
3832; GCN-LABEL: udiv_i32_pow2k_denom:
3833; GCN:       ; %bb.0:
3834; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
3835; GCN-NEXT:    s_load_dword s0, s[0:1], 0xb
3836; GCN-NEXT:    s_mov_b32 s7, 0xf000
3837; GCN-NEXT:    s_mov_b32 s6, -1
3838; GCN-NEXT:    s_waitcnt lgkmcnt(0)
3839; GCN-NEXT:    s_lshr_b32 s0, s0, 12
3840; GCN-NEXT:    v_mov_b32_e32 v0, s0
3841; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
3842; GCN-NEXT:    s_endpgm
; Test input proper. Everything above is the expected output of the two runs.
3843  %r = udiv i32 %x, 4096
3844  store i32 %r, i32 addrspace(1)* %out
3845  ret void
3846}
3847
; udiv_i32_pow2_shl_denom - unsigned i32 division of %x by a divisor computed
; as (shl 4096, %y), with the quotient stored through %out.
;
; Lines with the CHECK prefix expect the opt run to leave both the shl and
; the udiv untouched. Lines with the GCN prefix expect the llc run to add 12
; to %y and shift %x right by that amount, with no divide sequence.
3848define amdgpu_kernel void @udiv_i32_pow2_shl_denom(i32 addrspace(1)* %out, i32 %x, i32 %y) {
3849; CHECK-LABEL: @udiv_i32_pow2_shl_denom(
3850; CHECK-NEXT:    [[SHL_Y:%.*]] = shl i32 4096, [[Y:%.*]]
3851; CHECK-NEXT:    [[R:%.*]] = udiv i32 [[X:%.*]], [[SHL_Y]]
3852; CHECK-NEXT:    store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4
3853; CHECK-NEXT:    ret void
3854;
3855; GCN-LABEL: udiv_i32_pow2_shl_denom:
3856; GCN:       ; %bb.0:
3857; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
3858; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
3859; GCN-NEXT:    s_mov_b32 s7, 0xf000
3860; GCN-NEXT:    s_mov_b32 s6, -1
3861; GCN-NEXT:    s_waitcnt lgkmcnt(0)
3862; GCN-NEXT:    s_add_i32 s1, s1, 12
3863; GCN-NEXT:    s_lshr_b32 s0, s0, s1
3864; GCN-NEXT:    v_mov_b32_e32 v0, s0
3865; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
3866; GCN-NEXT:    s_endpgm
; Test input proper. Everything above is the expected output of the two runs.
3867  %shl.y = shl i32 4096, %y
3868  %r = udiv i32 %x, %shl.y
3869  store i32 %r, i32 addrspace(1)* %out
3870  ret void
3871}
3872
; udiv_v2i32_pow2k_denom - unsigned division of a <2 x i32> argument by the
; constant vector <4096, 4096>, with the result stored through %out.
;
; Lines with the CHECK prefix expect the opt run to split the vector udiv
; into one scalar udiv by 4096 per lane, joined by extractelement and
; insertelement. Lines with the GCN prefix expect the llc run to emit one
; s_lshr_b32 by 12 per lane followed by a single dwordx2 store.
3873define amdgpu_kernel void @udiv_v2i32_pow2k_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x) {
3874; CHECK-LABEL: @udiv_v2i32_pow2k_denom(
3875; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0
3876; CHECK-NEXT:    [[TMP2:%.*]] = udiv i32 [[TMP1]], 4096
3877; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i64 0
3878; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[X]], i64 1
3879; CHECK-NEXT:    [[TMP5:%.*]] = udiv i32 [[TMP4]], 4096
3880; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP5]], i64 1
3881; CHECK-NEXT:    store <2 x i32> [[TMP6]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8
3882; CHECK-NEXT:    ret void
3883;
3884; GCN-LABEL: udiv_v2i32_pow2k_denom:
3885; GCN:       ; %bb.0:
3886; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
3887; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
3888; GCN-NEXT:    s_mov_b32 s7, 0xf000
3889; GCN-NEXT:    s_mov_b32 s6, -1
3890; GCN-NEXT:    s_waitcnt lgkmcnt(0)
3891; GCN-NEXT:    s_lshr_b32 s0, s0, 12
3892; GCN-NEXT:    s_lshr_b32 s1, s1, 12
3893; GCN-NEXT:    v_mov_b32_e32 v0, s0
3894; GCN-NEXT:    v_mov_b32_e32 v1, s1
3895; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
3896; GCN-NEXT:    s_endpgm
; Test input proper. Everything above is the expected output of the two runs.
3897  %r = udiv <2 x i32> %x, <i32 4096, i32 4096>
3898  store <2 x i32> %r, <2 x i32> addrspace(1)* %out
3899  ret void
3900}
3901
; udiv_v2i32_mixed_pow2k_denom - unsigned division of a <2 x i32> argument by
; the constant vector <4096, 4095>, so the two lanes take different lowering
; paths. The result is stored through %out.
;
; Lines with the CHECK prefix expect the opt run to split the vector udiv
; into a scalar udiv by 4096 for lane 0 and a scalar udiv by 4095 for lane 1.
; Lines with the GCN prefix expect the llc run to use s_lshr_b32 by 12 for
; lane 0. For lane 1 it takes v_mul_hi_u32 with 0x100101, then applies a
; subtract, shift right by 1, add, and shift right by 11.
3902define amdgpu_kernel void @udiv_v2i32_mixed_pow2k_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x) {
3903; CHECK-LABEL: @udiv_v2i32_mixed_pow2k_denom(
3904; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0
3905; CHECK-NEXT:    [[TMP2:%.*]] = udiv i32 [[TMP1]], 4096
3906; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i64 0
3907; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[X]], i64 1
3908; CHECK-NEXT:    [[TMP5:%.*]] = udiv i32 [[TMP4]], 4095
3909; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP5]], i64 1
3910; CHECK-NEXT:    store <2 x i32> [[TMP6]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8
3911; CHECK-NEXT:    ret void
3912;
3913; GCN-LABEL: udiv_v2i32_mixed_pow2k_denom:
3914; GCN:       ; %bb.0:
3915; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
3916; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
3917; GCN-NEXT:    v_mov_b32_e32 v0, 0x100101
3918; GCN-NEXT:    s_mov_b32 s7, 0xf000
3919; GCN-NEXT:    s_mov_b32 s6, -1
3920; GCN-NEXT:    s_waitcnt lgkmcnt(0)
3921; GCN-NEXT:    v_mul_hi_u32 v0, s1, v0
3922; GCN-NEXT:    s_lshr_b32 s0, s0, 12
3923; GCN-NEXT:    v_sub_i32_e32 v1, vcc, s1, v0
3924; GCN-NEXT:    v_lshrrev_b32_e32 v1, 1, v1
3925; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
3926; GCN-NEXT:    v_lshrrev_b32_e32 v1, 11, v0
3927; GCN-NEXT:    v_mov_b32_e32 v0, s0
3928; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
3929; GCN-NEXT:    s_endpgm
; Test input proper. Everything above is the expected output of the two runs.
3930  %r = udiv <2 x i32> %x, <i32 4096, i32 4095>
3931  store <2 x i32> %r, <2 x i32> addrspace(1)* %out
3932  ret void
3933}
3934
3935define amdgpu_kernel void @udiv_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x, <2 x i32> %y) {
3936; CHECK-LABEL: @udiv_v2i32_pow2_shl_denom(
3937; CHECK-NEXT:    [[SHL_Y:%.*]] = shl <2 x i32> <i32 4096, i32 4096>, [[Y:%.*]]
3938; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0
3939; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 0
3940; CHECK-NEXT:    [[TMP3:%.*]] = uitofp i32 [[TMP2]] to float
3941; CHECK-NEXT:    [[TMP4:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP3]])
3942; CHECK-NEXT:    [[TMP5:%.*]] = fmul fast float [[TMP4]], 0x41EFFFFFC0000000
3943; CHECK-NEXT:    [[TMP6:%.*]] = fptoui float [[TMP5]] to i32
3944; CHECK-NEXT:    [[TMP7:%.*]] = sub i32 0, [[TMP2]]
3945; CHECK-NEXT:    [[TMP8:%.*]] = mul i32 [[TMP7]], [[TMP6]]
3946; CHECK-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP6]] to i64
3947; CHECK-NEXT:    [[TMP10:%.*]] = zext i32 [[TMP8]] to i64
3948; CHECK-NEXT:    [[TMP11:%.*]] = mul i64 [[TMP9]], [[TMP10]]
3949; CHECK-NEXT:    [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32
3950; CHECK-NEXT:    [[TMP13:%.*]] = lshr i64 [[TMP11]], 32
3951; CHECK-NEXT:    [[TMP14:%.*]] = trunc i64 [[TMP13]] to i32
3952; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[TMP6]], [[TMP14]]
3953; CHECK-NEXT:    [[TMP16:%.*]] = zext i32 [[TMP1]] to i64
3954; CHECK-NEXT:    [[TMP17:%.*]] = zext i32 [[TMP15]] to i64
3955; CHECK-NEXT:    [[TMP18:%.*]] = mul i64 [[TMP16]], [[TMP17]]
3956; CHECK-NEXT:    [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
3957; CHECK-NEXT:    [[TMP20:%.*]] = lshr i64 [[TMP18]], 32
3958; CHECK-NEXT:    [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32
3959; CHECK-NEXT:    [[TMP22:%.*]] = mul i32 [[TMP21]], [[TMP2]]
3960; CHECK-NEXT:    [[TMP23:%.*]] = sub i32 [[TMP1]], [[TMP22]]
3961; CHECK-NEXT:    [[TMP24:%.*]] = icmp uge i32 [[TMP23]], [[TMP2]]
3962; CHECK-NEXT:    [[TMP25:%.*]] = add i32 [[TMP21]], 1
3963; CHECK-NEXT:    [[TMP26:%.*]] = select i1 [[TMP24]], i32 [[TMP25]], i32 [[TMP21]]
3964; CHECK-NEXT:    [[TMP27:%.*]] = sub i32 [[TMP23]], [[TMP2]]
3965; CHECK-NEXT:    [[TMP28:%.*]] = select i1 [[TMP24]], i32 [[TMP27]], i32 [[TMP23]]
3966; CHECK-NEXT:    [[TMP29:%.*]] = icmp uge i32 [[TMP28]], [[TMP2]]
3967; CHECK-NEXT:    [[TMP30:%.*]] = add i32 [[TMP26]], 1
3968; CHECK-NEXT:    [[TMP31:%.*]] = select i1 [[TMP29]], i32 [[TMP30]], i32 [[TMP26]]
3969; CHECK-NEXT:    [[TMP32:%.*]] = insertelement <2 x i32> undef, i32 [[TMP31]], i64 0
3970; CHECK-NEXT:    [[TMP33:%.*]] = extractelement <2 x i32> [[X]], i64 1
3971; CHECK-NEXT:    [[TMP34:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 1
3972; CHECK-NEXT:    [[TMP35:%.*]] = uitofp i32 [[TMP34]] to float
3973; CHECK-NEXT:    [[TMP36:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP35]])
3974; CHECK-NEXT:    [[TMP37:%.*]] = fmul fast float [[TMP36]], 0x41EFFFFFC0000000
3975; CHECK-NEXT:    [[TMP38:%.*]] = fptoui float [[TMP37]] to i32
3976; CHECK-NEXT:    [[TMP39:%.*]] = sub i32 0, [[TMP34]]
3977; CHECK-NEXT:    [[TMP40:%.*]] = mul i32 [[TMP39]], [[TMP38]]
3978; CHECK-NEXT:    [[TMP41:%.*]] = zext i32 [[TMP38]] to i64
3979; CHECK-NEXT:    [[TMP42:%.*]] = zext i32 [[TMP40]] to i64
3980; CHECK-NEXT:    [[TMP43:%.*]] = mul i64 [[TMP41]], [[TMP42]]
3981; CHECK-NEXT:    [[TMP44:%.*]] = trunc i64 [[TMP43]] to i32
3982; CHECK-NEXT:    [[TMP45:%.*]] = lshr i64 [[TMP43]], 32
3983; CHECK-NEXT:    [[TMP46:%.*]] = trunc i64 [[TMP45]] to i32
3984; CHECK-NEXT:    [[TMP47:%.*]] = add i32 [[TMP38]], [[TMP46]]
3985; CHECK-NEXT:    [[TMP48:%.*]] = zext i32 [[TMP33]] to i64
3986; CHECK-NEXT:    [[TMP49:%.*]] = zext i32 [[TMP47]] to i64
3987; CHECK-NEXT:    [[TMP50:%.*]] = mul i64 [[TMP48]], [[TMP49]]
3988; CHECK-NEXT:    [[TMP51:%.*]] = trunc i64 [[TMP50]] to i32
3989; CHECK-NEXT:    [[TMP52:%.*]] = lshr i64 [[TMP50]], 32
3990; CHECK-NEXT:    [[TMP53:%.*]] = trunc i64 [[TMP52]] to i32
3991; CHECK-NEXT:    [[TMP54:%.*]] = mul i32 [[TMP53]], [[TMP34]]
3992; CHECK-NEXT:    [[TMP55:%.*]] = sub i32 [[TMP33]], [[TMP54]]
3993; CHECK-NEXT:    [[TMP56:%.*]] = icmp uge i32 [[TMP55]], [[TMP34]]
3994; CHECK-NEXT:    [[TMP57:%.*]] = add i32 [[TMP53]], 1
3995; CHECK-NEXT:    [[TMP58:%.*]] = select i1 [[TMP56]], i32 [[TMP57]], i32 [[TMP53]]
3996; CHECK-NEXT:    [[TMP59:%.*]] = sub i32 [[TMP55]], [[TMP34]]
3997; CHECK-NEXT:    [[TMP60:%.*]] = select i1 [[TMP56]], i32 [[TMP59]], i32 [[TMP55]]
3998; CHECK-NEXT:    [[TMP61:%.*]] = icmp uge i32 [[TMP60]], [[TMP34]]
3999; CHECK-NEXT:    [[TMP62:%.*]] = add i32 [[TMP58]], 1
4000; CHECK-NEXT:    [[TMP63:%.*]] = select i1 [[TMP61]], i32 [[TMP62]], i32 [[TMP58]]
4001; CHECK-NEXT:    [[TMP64:%.*]] = insertelement <2 x i32> [[TMP32]], i32 [[TMP63]], i64 1
4002; CHECK-NEXT:    store <2 x i32> [[TMP64]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8
4003; CHECK-NEXT:    ret void
4004;
4005; GCN-LABEL: udiv_v2i32_pow2_shl_denom:
4006; GCN:       ; %bb.0:
4007; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xd
4008; GCN-NEXT:    s_movk_i32 s4, 0x1000
4009; GCN-NEXT:    s_mov_b32 s7, 0xf000
4010; GCN-NEXT:    s_mov_b32 s6, -1
4011; GCN-NEXT:    s_waitcnt lgkmcnt(0)
4012; GCN-NEXT:    s_lshl_b32 s8, s4, s2
4013; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s8
4014; GCN-NEXT:    s_lshl_b32 s9, s4, s3
4015; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s9
4016; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
4017; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
4018; GCN-NEXT:    v_rcp_iflag_f32_e32 v0, v0
4019; GCN-NEXT:    s_mov_b32 s0, 0x4f7ffffe
4020; GCN-NEXT:    v_rcp_iflag_f32_e32 v1, v1
4021; GCN-NEXT:    v_mul_f32_e32 v0, s0, v0
4022; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
4023; GCN-NEXT:    v_mul_f32_e32 v1, s0, v1
4024; GCN-NEXT:    s_sub_i32 s0, 0, s8
4025; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
4026; GCN-NEXT:    v_mul_lo_u32 v2, s0, v0
4027; GCN-NEXT:    s_sub_i32 s0, 0, s9
4028; GCN-NEXT:    v_mul_lo_u32 v3, s0, v1
4029; GCN-NEXT:    v_mul_hi_u32 v2, v0, v2
4030; GCN-NEXT:    v_mul_hi_u32 v3, v1, v3
4031; GCN-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
4032; GCN-NEXT:    s_waitcnt lgkmcnt(0)
4033; GCN-NEXT:    v_mul_hi_u32 v0, s2, v0
4034; GCN-NEXT:    v_add_i32_e32 v1, vcc, v3, v1
4035; GCN-NEXT:    v_mul_hi_u32 v1, s3, v1
4036; GCN-NEXT:    v_mul_lo_u32 v2, v0, s8
4037; GCN-NEXT:    v_add_i32_e32 v3, vcc, 1, v0
4038; GCN-NEXT:    v_mul_lo_u32 v4, v1, s9
4039; GCN-NEXT:    v_sub_i32_e32 v2, vcc, s2, v2
4040; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s8, v2
4041; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v3, s[0:1]
4042; GCN-NEXT:    v_subrev_i32_e32 v3, vcc, s8, v2
4043; GCN-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[0:1]
4044; GCN-NEXT:    v_add_i32_e32 v3, vcc, 1, v0
4045; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s8, v2
4046; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
4047; GCN-NEXT:    v_sub_i32_e32 v2, vcc, s3, v4
4048; GCN-NEXT:    v_add_i32_e32 v3, vcc, 1, v1
4049; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s9, v2
4050; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
4051; GCN-NEXT:    v_subrev_i32_e32 v3, vcc, s9, v2
4052; GCN-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[0:1]
4053; GCN-NEXT:    v_add_i32_e32 v3, vcc, 1, v1
4054; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s9, v2
4055; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
4056; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
4057; GCN-NEXT:    s_endpgm
4058  %shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y
4059  %r = udiv <2 x i32> %x, %shl.y
4060  store <2 x i32> %r, <2 x i32> addrspace(1)* %out
4061  ret void
4062}
4063
; Unsigned 32-bit remainder by the odd constant 1235195 (0x12d8fb).
; Expected opt output: the urem is left as-is.
; Expected llc output: no divide sequence; a v_mul_hi_u32 by the constant
; 0xb2a50881, a sub / shift-right-1 / add fixup, a shift right by 20, then a
; multiply by 0x12d8fb and a subtract from the original value.
4064define amdgpu_kernel void @urem_i32_oddk_denom(i32 addrspace(1)* %out, i32 %x) {
4065; CHECK-LABEL: @urem_i32_oddk_denom(
4066; CHECK-NEXT:    [[R:%.*]] = urem i32 [[X:%.*]], 1235195
4067; CHECK-NEXT:    store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4
4068; CHECK-NEXT:    ret void
4069;
4070; GCN-LABEL: urem_i32_oddk_denom:
4071; GCN:       ; %bb.0:
4072; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
4073; GCN-NEXT:    s_load_dword s0, s[0:1], 0xb
4074; GCN-NEXT:    v_mov_b32_e32 v0, 0xb2a50881
4075; GCN-NEXT:    s_mov_b32 s7, 0xf000
4076; GCN-NEXT:    s_mov_b32 s6, -1
4077; GCN-NEXT:    s_waitcnt lgkmcnt(0)
4078; GCN-NEXT:    v_mul_hi_u32 v0, s0, v0
4079; GCN-NEXT:    v_sub_i32_e32 v1, vcc, s0, v0
4080; GCN-NEXT:    v_lshrrev_b32_e32 v1, 1, v1
4081; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
4082; GCN-NEXT:    v_lshrrev_b32_e32 v0, 20, v0
4083; GCN-NEXT:    v_mul_u32_u24_e32 v0, 0x12d8fb, v0
4084; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
4085; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
4086; GCN-NEXT:    s_endpgm
4087  %r = urem i32 %x, 1235195
4088  store i32 %r, i32 addrspace(1)* %out
4089  ret void
4090}
4091
; Unsigned 32-bit remainder by the power-of-two constant 4096.
; Expected opt output: the urem is left as-is.
; Expected llc output: a single s_and_b32 with the mask 0xfff.
4092define amdgpu_kernel void @urem_i32_pow2k_denom(i32 addrspace(1)* %out, i32 %x) {
4093; CHECK-LABEL: @urem_i32_pow2k_denom(
4094; CHECK-NEXT:    [[R:%.*]] = urem i32 [[X:%.*]], 4096
4095; CHECK-NEXT:    store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4
4096; CHECK-NEXT:    ret void
4097;
4098; GCN-LABEL: urem_i32_pow2k_denom:
4099; GCN:       ; %bb.0:
4100; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
4101; GCN-NEXT:    s_load_dword s0, s[0:1], 0xb
4102; GCN-NEXT:    s_mov_b32 s7, 0xf000
4103; GCN-NEXT:    s_mov_b32 s6, -1
4104; GCN-NEXT:    s_waitcnt lgkmcnt(0)
4105; GCN-NEXT:    s_and_b32 s0, s0, 0xfff
4106; GCN-NEXT:    v_mov_b32_e32 v0, s0
4107; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
4108; GCN-NEXT:    s_endpgm
4109  %r = urem i32 %x, 4096
4110  store i32 %r, i32 addrspace(1)* %out
4111  ret void
4112}
4113
; Unsigned 32-bit remainder where the denominator is 4096 shifted left by a
; runtime amount %y.
; Expected opt output: the shl and urem are left as-is.
; Expected llc output: the shifted value is computed with s_lshl_b32,
; decremented by one (s_add_i32 with -1) and used as an s_and_b32 mask on %x.
4114define amdgpu_kernel void @urem_i32_pow2_shl_denom(i32 addrspace(1)* %out, i32 %x, i32 %y) {
4115; CHECK-LABEL: @urem_i32_pow2_shl_denom(
4116; CHECK-NEXT:    [[SHL_Y:%.*]] = shl i32 4096, [[Y:%.*]]
4117; CHECK-NEXT:    [[R:%.*]] = urem i32 [[X:%.*]], [[SHL_Y]]
4118; CHECK-NEXT:    store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4
4119; CHECK-NEXT:    ret void
4120;
4121; GCN-LABEL: urem_i32_pow2_shl_denom:
4122; GCN:       ; %bb.0:
4123; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
4124; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
4125; GCN-NEXT:    s_mov_b32 s7, 0xf000
4126; GCN-NEXT:    s_mov_b32 s6, -1
4127; GCN-NEXT:    s_waitcnt lgkmcnt(0)
4128; GCN-NEXT:    s_lshl_b32 s1, 0x1000, s1
4129; GCN-NEXT:    s_add_i32 s1, s1, -1
4130; GCN-NEXT:    s_and_b32 s0, s0, s1
4131; GCN-NEXT:    v_mov_b32_e32 v0, s0
4132; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
4133; GCN-NEXT:    s_endpgm
4134  %shl.y = shl i32 4096, %y
4135  %r = urem i32 %x, %shl.y
4136  store i32 %r, i32 addrspace(1)* %out
4137  ret void
4138}
4139
; Unsigned remainder of a <2 x i32> vector by the splat constant 4096.
; Expected opt output: the vector urem is split into one scalar urem by 4096
; per element (extractelement / urem / insertelement).
; Expected llc output: one s_and_b32 per element against the mask 0xfff held
; in s2.
4140define amdgpu_kernel void @urem_v2i32_pow2k_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x) {
4141; CHECK-LABEL: @urem_v2i32_pow2k_denom(
4142; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0
4143; CHECK-NEXT:    [[TMP2:%.*]] = urem i32 [[TMP1]], 4096
4144; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i64 0
4145; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[X]], i64 1
4146; CHECK-NEXT:    [[TMP5:%.*]] = urem i32 [[TMP4]], 4096
4147; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP5]], i64 1
4148; CHECK-NEXT:    store <2 x i32> [[TMP6]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8
4149; CHECK-NEXT:    ret void
4150;
4151; GCN-LABEL: urem_v2i32_pow2k_denom:
4152; GCN:       ; %bb.0:
4153; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
4154; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
4155; GCN-NEXT:    s_movk_i32 s2, 0xfff
4156; GCN-NEXT:    s_mov_b32 s7, 0xf000
4157; GCN-NEXT:    s_mov_b32 s6, -1
4158; GCN-NEXT:    s_waitcnt lgkmcnt(0)
4159; GCN-NEXT:    s_and_b32 s0, s0, s2
4160; GCN-NEXT:    s_and_b32 s1, s1, s2
4161; GCN-NEXT:    v_mov_b32_e32 v0, s0
4162; GCN-NEXT:    v_mov_b32_e32 v1, s1
4163; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
4164; GCN-NEXT:    s_endpgm
4165  %r = urem <2 x i32> %x, <i32 4096, i32 4096>
4166  store <2 x i32> %r, <2 x i32> addrspace(1)* %out
4167  ret void
4168}
4169
; Unsigned remainder of a <2 x i32> vector where the denominator is the splat
; constant 4096 shifted left by the runtime vector %y.
; Expected opt output: unlike the scalar variant of this test, each element is
; expanded inline: uitofp of the denominator, llvm.amdgcn.rcp.f32, fmul by a
; float constant and fptoui give an initial value; it is adjusted with a
; mul / 64-bit mul / lshr 32 / add step; the high half of a 64-bit multiply
; with the numerator is multiplied back by the denominator and subtracted
; from the numerator; finally two icmp uge / sub / select rounds are applied.
; Expected llc output: the same shape per element using v_rcp_iflag_f32,
; v_mul_hi_u32 / v_mul_lo_u32 and two v_cmp_le_u32 + v_cndmask_b32 rounds.
4170define amdgpu_kernel void @urem_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x, <2 x i32> %y) {
4171; CHECK-LABEL: @urem_v2i32_pow2_shl_denom(
4172; CHECK-NEXT:    [[SHL_Y:%.*]] = shl <2 x i32> <i32 4096, i32 4096>, [[Y:%.*]]
4173; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0
4174; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 0
4175; CHECK-NEXT:    [[TMP3:%.*]] = uitofp i32 [[TMP2]] to float
4176; CHECK-NEXT:    [[TMP4:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP3]])
4177; CHECK-NEXT:    [[TMP5:%.*]] = fmul fast float [[TMP4]], 0x41EFFFFFC0000000
4178; CHECK-NEXT:    [[TMP6:%.*]] = fptoui float [[TMP5]] to i32
4179; CHECK-NEXT:    [[TMP7:%.*]] = sub i32 0, [[TMP2]]
4180; CHECK-NEXT:    [[TMP8:%.*]] = mul i32 [[TMP7]], [[TMP6]]
4181; CHECK-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP6]] to i64
4182; CHECK-NEXT:    [[TMP10:%.*]] = zext i32 [[TMP8]] to i64
4183; CHECK-NEXT:    [[TMP11:%.*]] = mul i64 [[TMP9]], [[TMP10]]
4184; CHECK-NEXT:    [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32
4185; CHECK-NEXT:    [[TMP13:%.*]] = lshr i64 [[TMP11]], 32
4186; CHECK-NEXT:    [[TMP14:%.*]] = trunc i64 [[TMP13]] to i32
4187; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[TMP6]], [[TMP14]]
4188; CHECK-NEXT:    [[TMP16:%.*]] = zext i32 [[TMP1]] to i64
4189; CHECK-NEXT:    [[TMP17:%.*]] = zext i32 [[TMP15]] to i64
4190; CHECK-NEXT:    [[TMP18:%.*]] = mul i64 [[TMP16]], [[TMP17]]
4191; CHECK-NEXT:    [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
4192; CHECK-NEXT:    [[TMP20:%.*]] = lshr i64 [[TMP18]], 32
4193; CHECK-NEXT:    [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32
4194; CHECK-NEXT:    [[TMP22:%.*]] = mul i32 [[TMP21]], [[TMP2]]
4195; CHECK-NEXT:    [[TMP23:%.*]] = sub i32 [[TMP1]], [[TMP22]]
4196; CHECK-NEXT:    [[TMP24:%.*]] = icmp uge i32 [[TMP23]], [[TMP2]]
4197; CHECK-NEXT:    [[TMP25:%.*]] = sub i32 [[TMP23]], [[TMP2]]
4198; CHECK-NEXT:    [[TMP26:%.*]] = select i1 [[TMP24]], i32 [[TMP25]], i32 [[TMP23]]
4199; CHECK-NEXT:    [[TMP27:%.*]] = icmp uge i32 [[TMP26]], [[TMP2]]
4200; CHECK-NEXT:    [[TMP28:%.*]] = sub i32 [[TMP26]], [[TMP2]]
4201; CHECK-NEXT:    [[TMP29:%.*]] = select i1 [[TMP27]], i32 [[TMP28]], i32 [[TMP26]]
4202; CHECK-NEXT:    [[TMP30:%.*]] = insertelement <2 x i32> undef, i32 [[TMP29]], i64 0
4203; CHECK-NEXT:    [[TMP31:%.*]] = extractelement <2 x i32> [[X]], i64 1
4204; CHECK-NEXT:    [[TMP32:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 1
4205; CHECK-NEXT:    [[TMP33:%.*]] = uitofp i32 [[TMP32]] to float
4206; CHECK-NEXT:    [[TMP34:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP33]])
4207; CHECK-NEXT:    [[TMP35:%.*]] = fmul fast float [[TMP34]], 0x41EFFFFFC0000000
4208; CHECK-NEXT:    [[TMP36:%.*]] = fptoui float [[TMP35]] to i32
4209; CHECK-NEXT:    [[TMP37:%.*]] = sub i32 0, [[TMP32]]
4210; CHECK-NEXT:    [[TMP38:%.*]] = mul i32 [[TMP37]], [[TMP36]]
4211; CHECK-NEXT:    [[TMP39:%.*]] = zext i32 [[TMP36]] to i64
4212; CHECK-NEXT:    [[TMP40:%.*]] = zext i32 [[TMP38]] to i64
4213; CHECK-NEXT:    [[TMP41:%.*]] = mul i64 [[TMP39]], [[TMP40]]
4214; CHECK-NEXT:    [[TMP42:%.*]] = trunc i64 [[TMP41]] to i32
4215; CHECK-NEXT:    [[TMP43:%.*]] = lshr i64 [[TMP41]], 32
4216; CHECK-NEXT:    [[TMP44:%.*]] = trunc i64 [[TMP43]] to i32
4217; CHECK-NEXT:    [[TMP45:%.*]] = add i32 [[TMP36]], [[TMP44]]
4218; CHECK-NEXT:    [[TMP46:%.*]] = zext i32 [[TMP31]] to i64
4219; CHECK-NEXT:    [[TMP47:%.*]] = zext i32 [[TMP45]] to i64
4220; CHECK-NEXT:    [[TMP48:%.*]] = mul i64 [[TMP46]], [[TMP47]]
4221; CHECK-NEXT:    [[TMP49:%.*]] = trunc i64 [[TMP48]] to i32
4222; CHECK-NEXT:    [[TMP50:%.*]] = lshr i64 [[TMP48]], 32
4223; CHECK-NEXT:    [[TMP51:%.*]] = trunc i64 [[TMP50]] to i32
4224; CHECK-NEXT:    [[TMP52:%.*]] = mul i32 [[TMP51]], [[TMP32]]
4225; CHECK-NEXT:    [[TMP53:%.*]] = sub i32 [[TMP31]], [[TMP52]]
4226; CHECK-NEXT:    [[TMP54:%.*]] = icmp uge i32 [[TMP53]], [[TMP32]]
4227; CHECK-NEXT:    [[TMP55:%.*]] = sub i32 [[TMP53]], [[TMP32]]
4228; CHECK-NEXT:    [[TMP56:%.*]] = select i1 [[TMP54]], i32 [[TMP55]], i32 [[TMP53]]
4229; CHECK-NEXT:    [[TMP57:%.*]] = icmp uge i32 [[TMP56]], [[TMP32]]
4230; CHECK-NEXT:    [[TMP58:%.*]] = sub i32 [[TMP56]], [[TMP32]]
4231; CHECK-NEXT:    [[TMP59:%.*]] = select i1 [[TMP57]], i32 [[TMP58]], i32 [[TMP56]]
4232; CHECK-NEXT:    [[TMP60:%.*]] = insertelement <2 x i32> [[TMP30]], i32 [[TMP59]], i64 1
4233; CHECK-NEXT:    store <2 x i32> [[TMP60]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8
4234; CHECK-NEXT:    ret void
4235;
4236; GCN-LABEL: urem_v2i32_pow2_shl_denom:
4237; GCN:       ; %bb.0:
4238; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xd
4239; GCN-NEXT:    s_movk_i32 s4, 0x1000
4240; GCN-NEXT:    s_mov_b32 s7, 0xf000
4241; GCN-NEXT:    s_mov_b32 s6, -1
4242; GCN-NEXT:    s_waitcnt lgkmcnt(0)
4243; GCN-NEXT:    s_lshl_b32 s8, s4, s2
4244; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s8
4245; GCN-NEXT:    s_lshl_b32 s3, s4, s3
4246; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s3
4247; GCN-NEXT:    s_mov_b32 s4, 0x4f7ffffe
4248; GCN-NEXT:    v_rcp_iflag_f32_e32 v0, v0
4249; GCN-NEXT:    s_sub_i32 s2, 0, s8
4250; GCN-NEXT:    v_rcp_iflag_f32_e32 v1, v1
4251; GCN-NEXT:    v_mul_f32_e32 v0, s4, v0
4252; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
4253; GCN-NEXT:    v_mul_f32_e32 v1, s4, v1
4254; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
4255; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
4256; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
4257; GCN-NEXT:    v_mul_lo_u32 v2, s2, v0
4258; GCN-NEXT:    s_sub_i32 s2, 0, s3
4259; GCN-NEXT:    v_mul_lo_u32 v3, s2, v1
4260; GCN-NEXT:    v_mul_hi_u32 v2, v0, v2
4261; GCN-NEXT:    v_mul_hi_u32 v3, v1, v3
4262; GCN-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
4263; GCN-NEXT:    s_waitcnt lgkmcnt(0)
4264; GCN-NEXT:    v_mul_hi_u32 v0, s0, v0
4265; GCN-NEXT:    v_add_i32_e32 v1, vcc, v3, v1
4266; GCN-NEXT:    v_mul_hi_u32 v1, s1, v1
4267; GCN-NEXT:    v_mul_lo_u32 v0, v0, s8
4268; GCN-NEXT:    v_mul_lo_u32 v1, v1, s3
4269; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
4270; GCN-NEXT:    v_subrev_i32_e32 v2, vcc, s8, v0
4271; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s8, v0
4272; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
4273; GCN-NEXT:    v_subrev_i32_e32 v2, vcc, s8, v0
4274; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s8, v0
4275; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
4276; GCN-NEXT:    v_sub_i32_e32 v1, vcc, s1, v1
4277; GCN-NEXT:    v_subrev_i32_e32 v2, vcc, s3, v1
4278; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s3, v1
4279; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
4280; GCN-NEXT:    v_subrev_i32_e32 v2, vcc, s3, v1
4281; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s3, v1
4282; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
4283; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
4284; GCN-NEXT:    s_endpgm
4285  %shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y
4286  %r = urem <2 x i32> %x, %shl.y
4287  store <2 x i32> %r, <2 x i32> addrspace(1)* %out
4288  ret void
4289}
4290
; Signed 32-bit division by the odd constant 1235195.
; Expected opt output: the sdiv is left as-is.
; Expected llc output: no divide sequence; a v_mul_hi_i32 by the constant
; 0xd9528441, an add of the original value, then the sum of its bit 31
; (logical shift right by 31) and its arithmetic shift right by 20.
4291define amdgpu_kernel void @sdiv_i32_oddk_denom(i32 addrspace(1)* %out, i32 %x) {
4292; CHECK-LABEL: @sdiv_i32_oddk_denom(
4293; CHECK-NEXT:    [[R:%.*]] = sdiv i32 [[X:%.*]], 1235195
4294; CHECK-NEXT:    store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4
4295; CHECK-NEXT:    ret void
4296;
4297; GCN-LABEL: sdiv_i32_oddk_denom:
4298; GCN:       ; %bb.0:
4299; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
4300; GCN-NEXT:    s_load_dword s0, s[0:1], 0xb
4301; GCN-NEXT:    v_mov_b32_e32 v0, 0xd9528441
4302; GCN-NEXT:    s_mov_b32 s7, 0xf000
4303; GCN-NEXT:    s_mov_b32 s6, -1
4304; GCN-NEXT:    s_waitcnt lgkmcnt(0)
4305; GCN-NEXT:    v_mul_hi_i32 v0, s0, v0
4306; GCN-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
4307; GCN-NEXT:    v_lshrrev_b32_e32 v1, 31, v0
4308; GCN-NEXT:    v_ashrrev_i32_e32 v0, 20, v0
4309; GCN-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
4310; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
4311; GCN-NEXT:    s_endpgm
4312  %r = sdiv i32 %x, 1235195
4313  store i32 %r, i32 addrspace(1)* %out
4314  ret void
4315}
4316
; Signed 32-bit division by the power-of-two constant 4096.
; Expected opt output: the sdiv is left as-is.
; Expected llc output: a scalar shift sequence: arithmetic shift right by 31,
; logical shift right by 20, add to the original value, arithmetic shift right
; by 12.
4317define amdgpu_kernel void @sdiv_i32_pow2k_denom(i32 addrspace(1)* %out, i32 %x) {
4318; CHECK-LABEL: @sdiv_i32_pow2k_denom(
4319; CHECK-NEXT:    [[R:%.*]] = sdiv i32 [[X:%.*]], 4096
4320; CHECK-NEXT:    store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4
4321; CHECK-NEXT:    ret void
4322;
4323; GCN-LABEL: sdiv_i32_pow2k_denom:
4324; GCN:       ; %bb.0:
4325; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
4326; GCN-NEXT:    s_load_dword s0, s[0:1], 0xb
4327; GCN-NEXT:    s_mov_b32 s7, 0xf000
4328; GCN-NEXT:    s_mov_b32 s6, -1
4329; GCN-NEXT:    s_waitcnt lgkmcnt(0)
4330; GCN-NEXT:    s_ashr_i32 s1, s0, 31
4331; GCN-NEXT:    s_lshr_b32 s1, s1, 20
4332; GCN-NEXT:    s_add_i32 s0, s0, s1
4333; GCN-NEXT:    s_ashr_i32 s0, s0, 12
4334; GCN-NEXT:    v_mov_b32_e32 v0, s0
4335; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
4336; GCN-NEXT:    s_endpgm
4337  %r = sdiv i32 %x, 4096
4338  store i32 %r, i32 addrspace(1)* %out
4339  ret void
4340}
4341
; Signed 32-bit division where the denominator is 4096 shifted left by a
; runtime amount %y.
; Expected opt output: the shl and sdiv are left as-is.
; Expected llc output: a full inline sequence. Both operands go through an
; s_ashr_i32 31 / s_add_i32 / s_xor_b32 step, the denominator is converted
; with v_cvt_f32_u32 and fed to v_rcp_iflag_f32, the result is adjusted with
; v_mul_lo_u32 / v_mul_hi_u32, two v_cmp_le_u32 + v_cndmask_b32 rounds follow,
; and the final value is xor-ed with and reduced by the xor of the two
; shifted-by-31 operand values (s4).
4342define amdgpu_kernel void @sdiv_i32_pow2_shl_denom(i32 addrspace(1)* %out, i32 %x, i32 %y) {
4343; CHECK-LABEL: @sdiv_i32_pow2_shl_denom(
4344; CHECK-NEXT:    [[SHL_Y:%.*]] = shl i32 4096, [[Y:%.*]]
4345; CHECK-NEXT:    [[R:%.*]] = sdiv i32 [[X:%.*]], [[SHL_Y]]
4346; CHECK-NEXT:    store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4
4347; CHECK-NEXT:    ret void
4348;
4349; GCN-LABEL: sdiv_i32_pow2_shl_denom:
4350; GCN:       ; %bb.0:
4351; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
4352; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
4353; GCN-NEXT:    s_waitcnt lgkmcnt(0)
4354; GCN-NEXT:    s_lshl_b32 s3, 0x1000, s3
4355; GCN-NEXT:    s_ashr_i32 s4, s3, 31
4356; GCN-NEXT:    s_add_i32 s3, s3, s4
4357; GCN-NEXT:    s_xor_b32 s7, s3, s4
4358; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s7
4359; GCN-NEXT:    s_sub_i32 s3, 0, s7
4360; GCN-NEXT:    s_ashr_i32 s5, s2, 31
4361; GCN-NEXT:    s_add_i32 s2, s2, s5
4362; GCN-NEXT:    v_rcp_iflag_f32_e32 v0, v0
4363; GCN-NEXT:    s_xor_b32 s6, s2, s5
4364; GCN-NEXT:    s_xor_b32 s4, s5, s4
4365; GCN-NEXT:    s_mov_b32 s2, -1
4366; GCN-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
4367; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
4368; GCN-NEXT:    v_mul_lo_u32 v1, s3, v0
4369; GCN-NEXT:    s_mov_b32 s3, 0xf000
4370; GCN-NEXT:    v_mul_hi_u32 v1, v0, v1
4371; GCN-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
4372; GCN-NEXT:    v_mul_hi_u32 v0, s6, v0
4373; GCN-NEXT:    v_mul_lo_u32 v1, v0, s7
4374; GCN-NEXT:    v_add_i32_e32 v2, vcc, 1, v0
4375; GCN-NEXT:    v_sub_i32_e32 v1, vcc, s6, v1
4376; GCN-NEXT:    v_subrev_i32_e32 v3, vcc, s7, v1
4377; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s7, v1
4378; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
4379; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
4380; GCN-NEXT:    v_add_i32_e32 v2, vcc, 1, v0
4381; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s7, v1
4382; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
4383; GCN-NEXT:    v_xor_b32_e32 v0, s4, v0
4384; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, s4, v0
4385; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4386; GCN-NEXT:    s_endpgm
4387  %shl.y = shl i32 4096, %y
4388  %r = sdiv i32 %x, %shl.y
4389  store i32 %r, i32 addrspace(1)* %out
4390  ret void
4391}
4392
; Signed division of a <2 x i32> vector by the splat constant 4096.
; Expected opt output: the vector sdiv is split into one scalar sdiv by 4096
; per element (extractelement / sdiv / insertelement).
; Expected llc output: per element, the scalar shift sequence
; s_ashr_i32 31 / s_lshr_b32 20 / s_add_i32 / s_ashr_i32 12.
4393define amdgpu_kernel void @sdiv_v2i32_pow2k_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x) {
4394; CHECK-LABEL: @sdiv_v2i32_pow2k_denom(
4395; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0
4396; CHECK-NEXT:    [[TMP2:%.*]] = sdiv i32 [[TMP1]], 4096
4397; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i64 0
4398; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[X]], i64 1
4399; CHECK-NEXT:    [[TMP5:%.*]] = sdiv i32 [[TMP4]], 4096
4400; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP5]], i64 1
4401; CHECK-NEXT:    store <2 x i32> [[TMP6]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8
4402; CHECK-NEXT:    ret void
4403;
4404; GCN-LABEL: sdiv_v2i32_pow2k_denom:
4405; GCN:       ; %bb.0:
4406; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
4407; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
4408; GCN-NEXT:    s_mov_b32 s7, 0xf000
4409; GCN-NEXT:    s_mov_b32 s6, -1
4410; GCN-NEXT:    s_waitcnt lgkmcnt(0)
4411; GCN-NEXT:    s_ashr_i32 s2, s0, 31
4412; GCN-NEXT:    s_lshr_b32 s2, s2, 20
4413; GCN-NEXT:    s_ashr_i32 s3, s1, 31
4414; GCN-NEXT:    s_add_i32 s0, s0, s2
4415; GCN-NEXT:    s_lshr_b32 s2, s3, 20
4416; GCN-NEXT:    s_add_i32 s1, s1, s2
4417; GCN-NEXT:    s_ashr_i32 s0, s0, 12
4418; GCN-NEXT:    s_ashr_i32 s1, s1, 12
4419; GCN-NEXT:    v_mov_b32_e32 v0, s0
4420; GCN-NEXT:    v_mov_b32_e32 v1, s1
4421; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
4422; GCN-NEXT:    s_endpgm
4423  %r = sdiv <2 x i32> %x, <i32 4096, i32 4096>
4424  store <2 x i32> %r, <2 x i32> addrspace(1)* %out
4425  ret void
4426}
4427
; Signed division of a <2 x i32> vector by the non-uniform constant vector
; <4096, 4095>: element 0 divides by a power of two, element 1 does not.
; Expected opt output: the vector sdiv is split into a scalar sdiv by 4096 and
; a scalar sdiv by 4095.
; Expected llc output: element 0 uses the scalar shift sequence
; (s_ashr_i32 31 / s_lshr_b32 20 / s_add_i32 / s_ashr_i32 12); element 1 uses
; v_mul_hi_i32 by the constant 0x80080081, an add of the original value, and
; the sum of its bit 31 and its arithmetic shift right by 11.
; NOTE(review): the "ssdiv" prefix looks like a typo for "sdiv" when compared
; with the sibling test names; confirm before renaming, since the label
; assertions below would have to be regenerated.
4428define amdgpu_kernel void @ssdiv_v2i32_mixed_pow2k_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x) {
4429; CHECK-LABEL: @ssdiv_v2i32_mixed_pow2k_denom(
4430; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0
4431; CHECK-NEXT:    [[TMP2:%.*]] = sdiv i32 [[TMP1]], 4096
4432; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i64 0
4433; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[X]], i64 1
4434; CHECK-NEXT:    [[TMP5:%.*]] = sdiv i32 [[TMP4]], 4095
4435; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP5]], i64 1
4436; CHECK-NEXT:    store <2 x i32> [[TMP6]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8
4437; CHECK-NEXT:    ret void
4438;
4439; GCN-LABEL: ssdiv_v2i32_mixed_pow2k_denom:
4440; GCN:       ; %bb.0:
4441; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
4442; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
4443; GCN-NEXT:    v_mov_b32_e32 v0, 0x80080081
4444; GCN-NEXT:    s_mov_b32 s7, 0xf000
4445; GCN-NEXT:    s_mov_b32 s6, -1
4446; GCN-NEXT:    s_waitcnt lgkmcnt(0)
4447; GCN-NEXT:    v_mul_hi_i32 v0, s1, v0
4448; GCN-NEXT:    s_ashr_i32 s2, s0, 31
4449; GCN-NEXT:    s_lshr_b32 s2, s2, 20
4450; GCN-NEXT:    s_add_i32 s0, s0, s2
4451; GCN-NEXT:    v_add_i32_e32 v0, vcc, s1, v0
4452; GCN-NEXT:    v_lshrrev_b32_e32 v1, 31, v0
4453; GCN-NEXT:    v_ashrrev_i32_e32 v0, 11, v0
4454; GCN-NEXT:    s_ashr_i32 s0, s0, 12
4455; GCN-NEXT:    v_add_i32_e32 v1, vcc, v1, v0
4456; GCN-NEXT:    v_mov_b32_e32 v0, s0
4457; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
4458; GCN-NEXT:    s_endpgm
4459  %r = sdiv <2 x i32> %x, <i32 4096, i32 4095>
4460  store <2 x i32> %r, <2 x i32> addrspace(1)* %out
4461  ret void
4462}
4463
; Signed division of a <2 x i32> vector where the denominator is the splat
; constant 4096 shifted left by the runtime vector %y.
; Expected opt output: each element is expanded inline. Both operands are
; shifted right arithmetically by 31, and each operand is added to and xor-ed
; with its own shifted value. The adjusted denominator then goes through
; uitofp / llvm.amdgcn.rcp.f32 / fmul / fptoui, a mul / 64-bit mul / lshr 32 /
; add adjustment, a 64-bit multiply with the adjusted numerator, and two
; icmp uge / add 1 / select rounds. The result is finally xor-ed with, and
; reduced by, the xor of the two shifted-by-31 values.
; Expected llc output: the same shape per element using s_ashr_i32 /
; s_add_i32 / s_xor_b32, v_rcp_iflag_f32, v_mul_hi_u32 / v_mul_lo_u32,
; v_cmp_le_u32 + v_cndmask_b32, and a trailing v_xor_b32 / v_subrev_i32.
4464define amdgpu_kernel void @sdiv_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x, <2 x i32> %y) {
4465; CHECK-LABEL: @sdiv_v2i32_pow2_shl_denom(
4466; CHECK-NEXT:    [[SHL_Y:%.*]] = shl <2 x i32> <i32 4096, i32 4096>, [[Y:%.*]]
4467; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0
4468; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 0
4469; CHECK-NEXT:    [[TMP3:%.*]] = ashr i32 [[TMP1]], 31
4470; CHECK-NEXT:    [[TMP4:%.*]] = ashr i32 [[TMP2]], 31
4471; CHECK-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]]
4472; CHECK-NEXT:    [[TMP6:%.*]] = add i32 [[TMP1]], [[TMP3]]
4473; CHECK-NEXT:    [[TMP7:%.*]] = add i32 [[TMP2]], [[TMP4]]
4474; CHECK-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP3]]
4475; CHECK-NEXT:    [[TMP9:%.*]] = xor i32 [[TMP7]], [[TMP4]]
4476; CHECK-NEXT:    [[TMP10:%.*]] = uitofp i32 [[TMP9]] to float
4477; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP10]])
4478; CHECK-NEXT:    [[TMP12:%.*]] = fmul fast float [[TMP11]], 0x41EFFFFFC0000000
4479; CHECK-NEXT:    [[TMP13:%.*]] = fptoui float [[TMP12]] to i32
4480; CHECK-NEXT:    [[TMP14:%.*]] = sub i32 0, [[TMP9]]
4481; CHECK-NEXT:    [[TMP15:%.*]] = mul i32 [[TMP14]], [[TMP13]]
4482; CHECK-NEXT:    [[TMP16:%.*]] = zext i32 [[TMP13]] to i64
4483; CHECK-NEXT:    [[TMP17:%.*]] = zext i32 [[TMP15]] to i64
4484; CHECK-NEXT:    [[TMP18:%.*]] = mul i64 [[TMP16]], [[TMP17]]
4485; CHECK-NEXT:    [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
4486; CHECK-NEXT:    [[TMP20:%.*]] = lshr i64 [[TMP18]], 32
4487; CHECK-NEXT:    [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32
4488; CHECK-NEXT:    [[TMP22:%.*]] = add i32 [[TMP13]], [[TMP21]]
4489; CHECK-NEXT:    [[TMP23:%.*]] = zext i32 [[TMP8]] to i64
4490; CHECK-NEXT:    [[TMP24:%.*]] = zext i32 [[TMP22]] to i64
4491; CHECK-NEXT:    [[TMP25:%.*]] = mul i64 [[TMP23]], [[TMP24]]
4492; CHECK-NEXT:    [[TMP26:%.*]] = trunc i64 [[TMP25]] to i32
4493; CHECK-NEXT:    [[TMP27:%.*]] = lshr i64 [[TMP25]], 32
4494; CHECK-NEXT:    [[TMP28:%.*]] = trunc i64 [[TMP27]] to i32
4495; CHECK-NEXT:    [[TMP29:%.*]] = mul i32 [[TMP28]], [[TMP9]]
4496; CHECK-NEXT:    [[TMP30:%.*]] = sub i32 [[TMP8]], [[TMP29]]
4497; CHECK-NEXT:    [[TMP31:%.*]] = icmp uge i32 [[TMP30]], [[TMP9]]
4498; CHECK-NEXT:    [[TMP32:%.*]] = add i32 [[TMP28]], 1
4499; CHECK-NEXT:    [[TMP33:%.*]] = select i1 [[TMP31]], i32 [[TMP32]], i32 [[TMP28]]
4500; CHECK-NEXT:    [[TMP34:%.*]] = sub i32 [[TMP30]], [[TMP9]]
4501; CHECK-NEXT:    [[TMP35:%.*]] = select i1 [[TMP31]], i32 [[TMP34]], i32 [[TMP30]]
4502; CHECK-NEXT:    [[TMP36:%.*]] = icmp uge i32 [[TMP35]], [[TMP9]]
4503; CHECK-NEXT:    [[TMP37:%.*]] = add i32 [[TMP33]], 1
4504; CHECK-NEXT:    [[TMP38:%.*]] = select i1 [[TMP36]], i32 [[TMP37]], i32 [[TMP33]]
4505; CHECK-NEXT:    [[TMP39:%.*]] = xor i32 [[TMP38]], [[TMP5]]
4506; CHECK-NEXT:    [[TMP40:%.*]] = sub i32 [[TMP39]], [[TMP5]]
4507; CHECK-NEXT:    [[TMP41:%.*]] = insertelement <2 x i32> undef, i32 [[TMP40]], i64 0
4508; CHECK-NEXT:    [[TMP42:%.*]] = extractelement <2 x i32> [[X]], i64 1
4509; CHECK-NEXT:    [[TMP43:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 1
4510; CHECK-NEXT:    [[TMP44:%.*]] = ashr i32 [[TMP42]], 31
4511; CHECK-NEXT:    [[TMP45:%.*]] = ashr i32 [[TMP43]], 31
4512; CHECK-NEXT:    [[TMP46:%.*]] = xor i32 [[TMP44]], [[TMP45]]
4513; CHECK-NEXT:    [[TMP47:%.*]] = add i32 [[TMP42]], [[TMP44]]
4514; CHECK-NEXT:    [[TMP48:%.*]] = add i32 [[TMP43]], [[TMP45]]
4515; CHECK-NEXT:    [[TMP49:%.*]] = xor i32 [[TMP47]], [[TMP44]]
4516; CHECK-NEXT:    [[TMP50:%.*]] = xor i32 [[TMP48]], [[TMP45]]
4517; CHECK-NEXT:    [[TMP51:%.*]] = uitofp i32 [[TMP50]] to float
4518; CHECK-NEXT:    [[TMP52:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP51]])
4519; CHECK-NEXT:    [[TMP53:%.*]] = fmul fast float [[TMP52]], 0x41EFFFFFC0000000
4520; CHECK-NEXT:    [[TMP54:%.*]] = fptoui float [[TMP53]] to i32
4521; CHECK-NEXT:    [[TMP55:%.*]] = sub i32 0, [[TMP50]]
4522; CHECK-NEXT:    [[TMP56:%.*]] = mul i32 [[TMP55]], [[TMP54]]
4523; CHECK-NEXT:    [[TMP57:%.*]] = zext i32 [[TMP54]] to i64
4524; CHECK-NEXT:    [[TMP58:%.*]] = zext i32 [[TMP56]] to i64
4525; CHECK-NEXT:    [[TMP59:%.*]] = mul i64 [[TMP57]], [[TMP58]]
4526; CHECK-NEXT:    [[TMP60:%.*]] = trunc i64 [[TMP59]] to i32
4527; CHECK-NEXT:    [[TMP61:%.*]] = lshr i64 [[TMP59]], 32
4528; CHECK-NEXT:    [[TMP62:%.*]] = trunc i64 [[TMP61]] to i32
4529; CHECK-NEXT:    [[TMP63:%.*]] = add i32 [[TMP54]], [[TMP62]]
4530; CHECK-NEXT:    [[TMP64:%.*]] = zext i32 [[TMP49]] to i64
4531; CHECK-NEXT:    [[TMP65:%.*]] = zext i32 [[TMP63]] to i64
4532; CHECK-NEXT:    [[TMP66:%.*]] = mul i64 [[TMP64]], [[TMP65]]
4533; CHECK-NEXT:    [[TMP67:%.*]] = trunc i64 [[TMP66]] to i32
4534; CHECK-NEXT:    [[TMP68:%.*]] = lshr i64 [[TMP66]], 32
4535; CHECK-NEXT:    [[TMP69:%.*]] = trunc i64 [[TMP68]] to i32
4536; CHECK-NEXT:    [[TMP70:%.*]] = mul i32 [[TMP69]], [[TMP50]]
4537; CHECK-NEXT:    [[TMP71:%.*]] = sub i32 [[TMP49]], [[TMP70]]
4538; CHECK-NEXT:    [[TMP72:%.*]] = icmp uge i32 [[TMP71]], [[TMP50]]
4539; CHECK-NEXT:    [[TMP73:%.*]] = add i32 [[TMP69]], 1
4540; CHECK-NEXT:    [[TMP74:%.*]] = select i1 [[TMP72]], i32 [[TMP73]], i32 [[TMP69]]
4541; CHECK-NEXT:    [[TMP75:%.*]] = sub i32 [[TMP71]], [[TMP50]]
4542; CHECK-NEXT:    [[TMP76:%.*]] = select i1 [[TMP72]], i32 [[TMP75]], i32 [[TMP71]]
4543; CHECK-NEXT:    [[TMP77:%.*]] = icmp uge i32 [[TMP76]], [[TMP50]]
4544; CHECK-NEXT:    [[TMP78:%.*]] = add i32 [[TMP74]], 1
4545; CHECK-NEXT:    [[TMP79:%.*]] = select i1 [[TMP77]], i32 [[TMP78]], i32 [[TMP74]]
4546; CHECK-NEXT:    [[TMP80:%.*]] = xor i32 [[TMP79]], [[TMP46]]
4547; CHECK-NEXT:    [[TMP81:%.*]] = sub i32 [[TMP80]], [[TMP46]]
4548; CHECK-NEXT:    [[TMP82:%.*]] = insertelement <2 x i32> [[TMP41]], i32 [[TMP81]], i64 1
4549; CHECK-NEXT:    store <2 x i32> [[TMP82]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8
4550; CHECK-NEXT:    ret void
4551;
4552; GCN-LABEL: sdiv_v2i32_pow2_shl_denom:
4553; GCN:       ; %bb.0:
4554; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xd
4555; GCN-NEXT:    s_movk_i32 s10, 0x1000
4556; GCN-NEXT:    s_mov_b32 s13, 0x4f7ffffe
4557; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
4558; GCN-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xb
4559; GCN-NEXT:    s_mov_b32 s7, 0xf000
4560; GCN-NEXT:    s_waitcnt lgkmcnt(0)
4561; GCN-NEXT:    s_lshl_b32 s2, s10, s2
4562; GCN-NEXT:    s_ashr_i32 s11, s2, 31
4563; GCN-NEXT:    s_add_i32 s2, s2, s11
4564; GCN-NEXT:    s_xor_b32 s12, s2, s11
4565; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s12
4566; GCN-NEXT:    s_lshl_b32 s0, s10, s3
4567; GCN-NEXT:    s_sub_i32 s3, 0, s12
4568; GCN-NEXT:    s_ashr_i32 s2, s0, 31
4569; GCN-NEXT:    v_rcp_iflag_f32_e32 v0, v0
4570; GCN-NEXT:    s_add_i32 s0, s0, s2
4571; GCN-NEXT:    s_xor_b32 s10, s0, s2
4572; GCN-NEXT:    v_cvt_f32_u32_e32 v2, s10
4573; GCN-NEXT:    v_mul_f32_e32 v0, s13, v0
4574; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
4575; GCN-NEXT:    s_ashr_i32 s1, s8, 31
4576; GCN-NEXT:    s_add_i32 s0, s8, s1
4577; GCN-NEXT:    s_xor_b32 s0, s0, s1
4578; GCN-NEXT:    v_mul_lo_u32 v1, s3, v0
4579; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v2
4580; GCN-NEXT:    s_xor_b32 s3, s1, s11
4581; GCN-NEXT:    s_mov_b32 s6, -1
4582; GCN-NEXT:    v_mul_hi_u32 v1, v0, v1
4583; GCN-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
4584; GCN-NEXT:    v_mul_hi_u32 v0, s0, v0
4585; GCN-NEXT:    v_mul_f32_e32 v1, s13, v2
4586; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
4587; GCN-NEXT:    v_mul_lo_u32 v2, v0, s12
4588; GCN-NEXT:    v_add_i32_e32 v3, vcc, 1, v0
4589; GCN-NEXT:    v_sub_i32_e32 v2, vcc, s0, v2
4590; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s12, v2
4591; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v3, s[0:1]
4592; GCN-NEXT:    v_subrev_i32_e32 v3, vcc, s12, v2
4593; GCN-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[0:1]
4594; GCN-NEXT:    s_sub_i32 s0, 0, s10
4595; GCN-NEXT:    v_mul_lo_u32 v3, s0, v1
4596; GCN-NEXT:    s_ashr_i32 s0, s9, 31
4597; GCN-NEXT:    s_add_i32 s1, s9, s0
4598; GCN-NEXT:    s_xor_b32 s1, s1, s0
4599; GCN-NEXT:    v_mul_hi_u32 v3, v1, v3
4600; GCN-NEXT:    v_add_i32_e32 v4, vcc, 1, v0
4601; GCN-NEXT:    s_xor_b32 s2, s0, s2
4602; GCN-NEXT:    v_add_i32_e32 v1, vcc, v3, v1
4603; GCN-NEXT:    v_mul_hi_u32 v1, s1, v1
4604; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s12, v2
4605; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
4606; GCN-NEXT:    v_xor_b32_e32 v0, s3, v0
4607; GCN-NEXT:    v_mul_lo_u32 v2, v1, s10
4608; GCN-NEXT:    v_add_i32_e32 v3, vcc, 1, v1
4609; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, s3, v0
4610; GCN-NEXT:    v_sub_i32_e32 v2, vcc, s1, v2
4611; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s10, v2
4612; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
4613; GCN-NEXT:    v_subrev_i32_e32 v3, vcc, s10, v2
4614; GCN-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[0:1]
4615; GCN-NEXT:    v_add_i32_e32 v3, vcc, 1, v1
4616; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s10, v2
4617; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
4618; GCN-NEXT:    v_xor_b32_e32 v1, s2, v1
4619; GCN-NEXT:    v_subrev_i32_e32 v1, vcc, s2, v1
4620; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
4621; GCN-NEXT:    s_endpgm
4622  %shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y
4623  %r = sdiv <2 x i32> %x, %shl.y
4624  store <2 x i32> %r, <2 x i32> addrspace(1)* %out
4625  ret void
4626}
4627
; Scalar i32 signed remainder by the odd constant 1235195 (0x12d8fb).
; The opt-side assertions expect the srem to come through unchanged.
; The llc-side assertions expect a multiply-based sequence instead of a
; division expansion: a signed multiply-high by the constant 0xd9528441,
; shift fixups, a multiply by 0x12d8fb, and a subtract from the dividend.
4628define amdgpu_kernel void @srem_i32_oddk_denom(i32 addrspace(1)* %out, i32 %x) {
4629; CHECK-LABEL: @srem_i32_oddk_denom(
4630; CHECK-NEXT:    [[R:%.*]] = srem i32 [[X:%.*]], 1235195
4631; CHECK-NEXT:    store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4
4632; CHECK-NEXT:    ret void
4633;
4634; GCN-LABEL: srem_i32_oddk_denom:
4635; GCN:       ; %bb.0:
4636; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
4637; GCN-NEXT:    s_load_dword s0, s[0:1], 0xb
4638; GCN-NEXT:    v_mov_b32_e32 v0, 0xd9528441
4639; GCN-NEXT:    s_mov_b32 s7, 0xf000
4640; GCN-NEXT:    s_mov_b32 s6, -1
4641; GCN-NEXT:    s_waitcnt lgkmcnt(0)
4642; GCN-NEXT:    v_mul_hi_i32 v0, s0, v0
4643; GCN-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
4644; GCN-NEXT:    v_lshrrev_b32_e32 v1, 31, v0
4645; GCN-NEXT:    v_ashrrev_i32_e32 v0, 20, v0
4646; GCN-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
4647; GCN-NEXT:    v_mul_i32_i24_e32 v0, 0x12d8fb, v0
4648; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
4649; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
4650; GCN-NEXT:    s_endpgm
4651  %r = srem i32 %x, 1235195
4652  store i32 %r, i32 addrspace(1)* %out
4653  ret void
4654}
4655
; Scalar i32 signed remainder by the power-of-two constant 4096.
; The opt-side assertions expect the srem to come through unchanged.
; The llc-side assertions expect a purely scalar sequence: sign bits shifted
; down by 20 and added to the dividend, a mask with 0xfffff000, then a
; subtract from the dividend.
4656define amdgpu_kernel void @srem_i32_pow2k_denom(i32 addrspace(1)* %out, i32 %x) {
4657; CHECK-LABEL: @srem_i32_pow2k_denom(
4658; CHECK-NEXT:    [[R:%.*]] = srem i32 [[X:%.*]], 4096
4659; CHECK-NEXT:    store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4
4660; CHECK-NEXT:    ret void
4661;
4662; GCN-LABEL: srem_i32_pow2k_denom:
4663; GCN:       ; %bb.0:
4664; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
4665; GCN-NEXT:    s_load_dword s0, s[0:1], 0xb
4666; GCN-NEXT:    s_mov_b32 s7, 0xf000
4667; GCN-NEXT:    s_mov_b32 s6, -1
4668; GCN-NEXT:    s_waitcnt lgkmcnt(0)
4669; GCN-NEXT:    s_ashr_i32 s1, s0, 31
4670; GCN-NEXT:    s_lshr_b32 s1, s1, 20
4671; GCN-NEXT:    s_add_i32 s1, s0, s1
4672; GCN-NEXT:    s_and_b32 s1, s1, 0xfffff000
4673; GCN-NEXT:    s_sub_i32 s0, s0, s1
4674; GCN-NEXT:    v_mov_b32_e32 v0, s0
4675; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
4676; GCN-NEXT:    s_endpgm
4677  %r = srem i32 %x, 4096
4678  store i32 %r, i32 addrspace(1)* %out
4679  ret void
4680}
4681
; Scalar i32 signed remainder where the denominator is `shl i32 4096, %y`,
; i.e. not a compile-time constant.
; The opt-side assertions expect both the shl and the srem to come through
; unchanged.
; The llc-side assertions expect the full expansion: ashr/add/xor on both
; operands, a float reciprocal (v_rcp_iflag_f32) scaled by 0x4f7ffffe,
; multiply-high steps, two compare-and-conditionally-subtract corrections of
; the remainder, and a final xor/subtract sign fixup.
4682define amdgpu_kernel void @srem_i32_pow2_shl_denom(i32 addrspace(1)* %out, i32 %x, i32 %y) {
4683; CHECK-LABEL: @srem_i32_pow2_shl_denom(
4684; CHECK-NEXT:    [[SHL_Y:%.*]] = shl i32 4096, [[Y:%.*]]
4685; CHECK-NEXT:    [[R:%.*]] = srem i32 [[X:%.*]], [[SHL_Y]]
4686; CHECK-NEXT:    store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4
4687; CHECK-NEXT:    ret void
4688;
4689; GCN-LABEL: srem_i32_pow2_shl_denom:
4690; GCN:       ; %bb.0:
4691; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
4692; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
4693; GCN-NEXT:    s_waitcnt lgkmcnt(0)
4694; GCN-NEXT:    s_lshl_b32 s3, 0x1000, s3
4695; GCN-NEXT:    s_ashr_i32 s4, s3, 31
4696; GCN-NEXT:    s_add_i32 s3, s3, s4
4697; GCN-NEXT:    s_xor_b32 s6, s3, s4
4698; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s6
4699; GCN-NEXT:    s_sub_i32 s3, 0, s6
4700; GCN-NEXT:    s_ashr_i32 s4, s2, 31
4701; GCN-NEXT:    s_add_i32 s2, s2, s4
4702; GCN-NEXT:    v_rcp_iflag_f32_e32 v0, v0
4703; GCN-NEXT:    s_xor_b32 s5, s2, s4
4704; GCN-NEXT:    s_mov_b32 s2, -1
4705; GCN-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
4706; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
4707; GCN-NEXT:    v_mul_lo_u32 v1, s3, v0
4708; GCN-NEXT:    s_mov_b32 s3, 0xf000
4709; GCN-NEXT:    v_mul_hi_u32 v1, v0, v1
4710; GCN-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
4711; GCN-NEXT:    v_mul_hi_u32 v0, s5, v0
4712; GCN-NEXT:    v_mul_lo_u32 v0, v0, s6
4713; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s5, v0
4714; GCN-NEXT:    v_subrev_i32_e32 v1, vcc, s6, v0
4715; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s6, v0
4716; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
4717; GCN-NEXT:    v_subrev_i32_e32 v1, vcc, s6, v0
4718; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s6, v0
4719; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
4720; GCN-NEXT:    v_xor_b32_e32 v0, s4, v0
4721; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, s4, v0
4722; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4723; GCN-NEXT:    s_endpgm
4724  %shl.y = shl i32 4096, %y
4725  %r = srem i32 %x, %shl.y
4726  store i32 %r, i32 addrspace(1)* %out
4727  ret void
4728}
4729
; <2 x i32> signed remainder by the splat constant 4096.
; The opt-side assertions expect the vector srem to be split into one
; `srem i32 ..., 4096` per element (extractelement / srem / insertelement).
; The llc-side assertions expect each element to use the scalar
; shift/add/mask/subtract sequence, with the mask materialized once in s2
; and the two results written by a single dwordx2 store.
4730define amdgpu_kernel void @srem_v2i32_pow2k_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x) {
4731; CHECK-LABEL: @srem_v2i32_pow2k_denom(
4732; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0
4733; CHECK-NEXT:    [[TMP2:%.*]] = srem i32 [[TMP1]], 4096
4734; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i64 0
4735; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[X]], i64 1
4736; CHECK-NEXT:    [[TMP5:%.*]] = srem i32 [[TMP4]], 4096
4737; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP5]], i64 1
4738; CHECK-NEXT:    store <2 x i32> [[TMP6]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8
4739; CHECK-NEXT:    ret void
4740;
4741; GCN-LABEL: srem_v2i32_pow2k_denom:
4742; GCN:       ; %bb.0:
4743; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
4744; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
4745; GCN-NEXT:    s_movk_i32 s2, 0xf000
4746; GCN-NEXT:    s_mov_b32 s7, 0xf000
4747; GCN-NEXT:    s_mov_b32 s6, -1
4748; GCN-NEXT:    s_waitcnt lgkmcnt(0)
4749; GCN-NEXT:    s_ashr_i32 s3, s0, 31
4750; GCN-NEXT:    s_lshr_b32 s3, s3, 20
4751; GCN-NEXT:    s_add_i32 s3, s0, s3
4752; GCN-NEXT:    s_and_b32 s3, s3, s2
4753; GCN-NEXT:    s_sub_i32 s0, s0, s3
4754; GCN-NEXT:    s_ashr_i32 s3, s1, 31
4755; GCN-NEXT:    s_lshr_b32 s3, s3, 20
4756; GCN-NEXT:    s_add_i32 s3, s1, s3
4757; GCN-NEXT:    s_and_b32 s2, s3, s2
4758; GCN-NEXT:    s_sub_i32 s1, s1, s2
4759; GCN-NEXT:    v_mov_b32_e32 v0, s0
4760; GCN-NEXT:    v_mov_b32_e32 v1, s1
4761; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
4762; GCN-NEXT:    s_endpgm
4763  %r = srem <2 x i32> %x, <i32 4096, i32 4096>
4764  store <2 x i32> %r, <2 x i32> addrspace(1)* %out
4765  ret void
4766}
4767
; <2 x i32> signed remainder where the denominator is a splat of 4096
; shifted left by %y, i.e. not a compile-time constant.
; The opt-side assertions expect each element to be expanded in IR:
; ashr/add/xor on both operands, uitofp + llvm.amdgcn.rcp.f32 scaled by
; 0x41EFFFFFC0000000, fptoui, 64-bit zext/mul/lshr steps to take the high
; half of 32x32 products, two icmp-uge/sub/select corrections of the
; remainder, and a final xor/sub sign fixup before insertelement.
; The llc-side assertions expect the matching machine sequence for both
; elements, finishing with a single dwordx2 store.
4768define amdgpu_kernel void @srem_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x, <2 x i32> %y) {
4769; CHECK-LABEL: @srem_v2i32_pow2_shl_denom(
4770; CHECK-NEXT:    [[SHL_Y:%.*]] = shl <2 x i32> <i32 4096, i32 4096>, [[Y:%.*]]
4771; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0
4772; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 0
4773; CHECK-NEXT:    [[TMP3:%.*]] = ashr i32 [[TMP1]], 31
4774; CHECK-NEXT:    [[TMP4:%.*]] = ashr i32 [[TMP2]], 31
4775; CHECK-NEXT:    [[TMP5:%.*]] = add i32 [[TMP1]], [[TMP3]]
4776; CHECK-NEXT:    [[TMP6:%.*]] = add i32 [[TMP2]], [[TMP4]]
4777; CHECK-NEXT:    [[TMP7:%.*]] = xor i32 [[TMP5]], [[TMP3]]
4778; CHECK-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP4]]
4779; CHECK-NEXT:    [[TMP9:%.*]] = uitofp i32 [[TMP8]] to float
4780; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]])
4781; CHECK-NEXT:    [[TMP11:%.*]] = fmul fast float [[TMP10]], 0x41EFFFFFC0000000
4782; CHECK-NEXT:    [[TMP12:%.*]] = fptoui float [[TMP11]] to i32
4783; CHECK-NEXT:    [[TMP13:%.*]] = sub i32 0, [[TMP8]]
4784; CHECK-NEXT:    [[TMP14:%.*]] = mul i32 [[TMP13]], [[TMP12]]
4785; CHECK-NEXT:    [[TMP15:%.*]] = zext i32 [[TMP12]] to i64
4786; CHECK-NEXT:    [[TMP16:%.*]] = zext i32 [[TMP14]] to i64
4787; CHECK-NEXT:    [[TMP17:%.*]] = mul i64 [[TMP15]], [[TMP16]]
4788; CHECK-NEXT:    [[TMP18:%.*]] = trunc i64 [[TMP17]] to i32
4789; CHECK-NEXT:    [[TMP19:%.*]] = lshr i64 [[TMP17]], 32
4790; CHECK-NEXT:    [[TMP20:%.*]] = trunc i64 [[TMP19]] to i32
4791; CHECK-NEXT:    [[TMP21:%.*]] = add i32 [[TMP12]], [[TMP20]]
4792; CHECK-NEXT:    [[TMP22:%.*]] = zext i32 [[TMP7]] to i64
4793; CHECK-NEXT:    [[TMP23:%.*]] = zext i32 [[TMP21]] to i64
4794; CHECK-NEXT:    [[TMP24:%.*]] = mul i64 [[TMP22]], [[TMP23]]
4795; CHECK-NEXT:    [[TMP25:%.*]] = trunc i64 [[TMP24]] to i32
4796; CHECK-NEXT:    [[TMP26:%.*]] = lshr i64 [[TMP24]], 32
4797; CHECK-NEXT:    [[TMP27:%.*]] = trunc i64 [[TMP26]] to i32
4798; CHECK-NEXT:    [[TMP28:%.*]] = mul i32 [[TMP27]], [[TMP8]]
4799; CHECK-NEXT:    [[TMP29:%.*]] = sub i32 [[TMP7]], [[TMP28]]
4800; CHECK-NEXT:    [[TMP30:%.*]] = icmp uge i32 [[TMP29]], [[TMP8]]
4801; CHECK-NEXT:    [[TMP31:%.*]] = sub i32 [[TMP29]], [[TMP8]]
4802; CHECK-NEXT:    [[TMP32:%.*]] = select i1 [[TMP30]], i32 [[TMP31]], i32 [[TMP29]]
4803; CHECK-NEXT:    [[TMP33:%.*]] = icmp uge i32 [[TMP32]], [[TMP8]]
4804; CHECK-NEXT:    [[TMP34:%.*]] = sub i32 [[TMP32]], [[TMP8]]
4805; CHECK-NEXT:    [[TMP35:%.*]] = select i1 [[TMP33]], i32 [[TMP34]], i32 [[TMP32]]
4806; CHECK-NEXT:    [[TMP36:%.*]] = xor i32 [[TMP35]], [[TMP3]]
4807; CHECK-NEXT:    [[TMP37:%.*]] = sub i32 [[TMP36]], [[TMP3]]
4808; CHECK-NEXT:    [[TMP38:%.*]] = insertelement <2 x i32> undef, i32 [[TMP37]], i64 0
4809; CHECK-NEXT:    [[TMP39:%.*]] = extractelement <2 x i32> [[X]], i64 1
4810; CHECK-NEXT:    [[TMP40:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 1
4811; CHECK-NEXT:    [[TMP41:%.*]] = ashr i32 [[TMP39]], 31
4812; CHECK-NEXT:    [[TMP42:%.*]] = ashr i32 [[TMP40]], 31
4813; CHECK-NEXT:    [[TMP43:%.*]] = add i32 [[TMP39]], [[TMP41]]
4814; CHECK-NEXT:    [[TMP44:%.*]] = add i32 [[TMP40]], [[TMP42]]
4815; CHECK-NEXT:    [[TMP45:%.*]] = xor i32 [[TMP43]], [[TMP41]]
4816; CHECK-NEXT:    [[TMP46:%.*]] = xor i32 [[TMP44]], [[TMP42]]
4817; CHECK-NEXT:    [[TMP47:%.*]] = uitofp i32 [[TMP46]] to float
4818; CHECK-NEXT:    [[TMP48:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP47]])
4819; CHECK-NEXT:    [[TMP49:%.*]] = fmul fast float [[TMP48]], 0x41EFFFFFC0000000
4820; CHECK-NEXT:    [[TMP50:%.*]] = fptoui float [[TMP49]] to i32
4821; CHECK-NEXT:    [[TMP51:%.*]] = sub i32 0, [[TMP46]]
4822; CHECK-NEXT:    [[TMP52:%.*]] = mul i32 [[TMP51]], [[TMP50]]
4823; CHECK-NEXT:    [[TMP53:%.*]] = zext i32 [[TMP50]] to i64
4824; CHECK-NEXT:    [[TMP54:%.*]] = zext i32 [[TMP52]] to i64
4825; CHECK-NEXT:    [[TMP55:%.*]] = mul i64 [[TMP53]], [[TMP54]]
4826; CHECK-NEXT:    [[TMP56:%.*]] = trunc i64 [[TMP55]] to i32
4827; CHECK-NEXT:    [[TMP57:%.*]] = lshr i64 [[TMP55]], 32
4828; CHECK-NEXT:    [[TMP58:%.*]] = trunc i64 [[TMP57]] to i32
4829; CHECK-NEXT:    [[TMP59:%.*]] = add i32 [[TMP50]], [[TMP58]]
4830; CHECK-NEXT:    [[TMP60:%.*]] = zext i32 [[TMP45]] to i64
4831; CHECK-NEXT:    [[TMP61:%.*]] = zext i32 [[TMP59]] to i64
4832; CHECK-NEXT:    [[TMP62:%.*]] = mul i64 [[TMP60]], [[TMP61]]
4833; CHECK-NEXT:    [[TMP63:%.*]] = trunc i64 [[TMP62]] to i32
4834; CHECK-NEXT:    [[TMP64:%.*]] = lshr i64 [[TMP62]], 32
4835; CHECK-NEXT:    [[TMP65:%.*]] = trunc i64 [[TMP64]] to i32
4836; CHECK-NEXT:    [[TMP66:%.*]] = mul i32 [[TMP65]], [[TMP46]]
4837; CHECK-NEXT:    [[TMP67:%.*]] = sub i32 [[TMP45]], [[TMP66]]
4838; CHECK-NEXT:    [[TMP68:%.*]] = icmp uge i32 [[TMP67]], [[TMP46]]
4839; CHECK-NEXT:    [[TMP69:%.*]] = sub i32 [[TMP67]], [[TMP46]]
4840; CHECK-NEXT:    [[TMP70:%.*]] = select i1 [[TMP68]], i32 [[TMP69]], i32 [[TMP67]]
4841; CHECK-NEXT:    [[TMP71:%.*]] = icmp uge i32 [[TMP70]], [[TMP46]]
4842; CHECK-NEXT:    [[TMP72:%.*]] = sub i32 [[TMP70]], [[TMP46]]
4843; CHECK-NEXT:    [[TMP73:%.*]] = select i1 [[TMP71]], i32 [[TMP72]], i32 [[TMP70]]
4844; CHECK-NEXT:    [[TMP74:%.*]] = xor i32 [[TMP73]], [[TMP41]]
4845; CHECK-NEXT:    [[TMP75:%.*]] = sub i32 [[TMP74]], [[TMP41]]
4846; CHECK-NEXT:    [[TMP76:%.*]] = insertelement <2 x i32> [[TMP38]], i32 [[TMP75]], i64 1
4847; CHECK-NEXT:    store <2 x i32> [[TMP76]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8
4848; CHECK-NEXT:    ret void
4849;
4850; GCN-LABEL: srem_v2i32_pow2_shl_denom:
4851; GCN:       ; %bb.0:
4852; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xd
4853; GCN-NEXT:    s_movk_i32 s6, 0x1000
4854; GCN-NEXT:    s_mov_b32 s10, 0x4f7ffffe
4855; GCN-NEXT:    s_mov_b32 s7, 0xf000
4856; GCN-NEXT:    s_waitcnt lgkmcnt(0)
4857; GCN-NEXT:    s_lshl_b32 s2, s6, s2
4858; GCN-NEXT:    s_ashr_i32 s4, s2, 31
4859; GCN-NEXT:    s_add_i32 s2, s2, s4
4860; GCN-NEXT:    s_xor_b32 s9, s2, s4
4861; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s9
4862; GCN-NEXT:    s_lshl_b32 s2, s6, s3
4863; GCN-NEXT:    s_ashr_i32 s6, s2, 31
4864; GCN-NEXT:    s_add_i32 s2, s2, s6
4865; GCN-NEXT:    v_rcp_iflag_f32_e32 v0, v0
4866; GCN-NEXT:    s_sub_i32 s8, 0, s9
4867; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
4868; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
4869; GCN-NEXT:    v_mul_f32_e32 v0, s10, v0
4870; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
4871; GCN-NEXT:    s_waitcnt lgkmcnt(0)
4872; GCN-NEXT:    s_ashr_i32 s3, s0, 31
4873; GCN-NEXT:    s_add_i32 s0, s0, s3
4874; GCN-NEXT:    v_mul_lo_u32 v1, s8, v0
4875; GCN-NEXT:    s_xor_b32 s8, s2, s6
4876; GCN-NEXT:    v_cvt_f32_u32_e32 v2, s8
4877; GCN-NEXT:    s_xor_b32 s0, s0, s3
4878; GCN-NEXT:    v_mul_hi_u32 v1, v0, v1
4879; GCN-NEXT:    s_sub_i32 s2, 0, s8
4880; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v2
4881; GCN-NEXT:    s_mov_b32 s6, -1
4882; GCN-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
4883; GCN-NEXT:    v_mul_hi_u32 v0, s0, v0
4884; GCN-NEXT:    v_mul_f32_e32 v1, s10, v2
4885; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
4886; GCN-NEXT:    v_mul_lo_u32 v0, v0, s9
4887; GCN-NEXT:    v_mul_lo_u32 v2, s2, v1
4888; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
4889; GCN-NEXT:    s_ashr_i32 s0, s1, 31
4890; GCN-NEXT:    v_mul_hi_u32 v2, v1, v2
4891; GCN-NEXT:    v_subrev_i32_e32 v3, vcc, s9, v0
4892; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s9, v0
4893; GCN-NEXT:    s_add_i32 s1, s1, s0
4894; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
4895; GCN-NEXT:    s_xor_b32 s1, s1, s0
4896; GCN-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
4897; GCN-NEXT:    v_mul_hi_u32 v1, s1, v1
4898; GCN-NEXT:    v_subrev_i32_e32 v3, vcc, s9, v0
4899; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s9, v0
4900; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
4901; GCN-NEXT:    v_mul_lo_u32 v1, v1, s8
4902; GCN-NEXT:    v_xor_b32_e32 v0, s3, v0
4903; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, s3, v0
4904; GCN-NEXT:    v_sub_i32_e32 v1, vcc, s1, v1
4905; GCN-NEXT:    v_subrev_i32_e32 v2, vcc, s8, v1
4906; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s8, v1
4907; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
4908; GCN-NEXT:    v_subrev_i32_e32 v2, vcc, s8, v1
4909; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s8, v1
4910; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
4911; GCN-NEXT:    v_xor_b32_e32 v1, s0, v1
4912; GCN-NEXT:    v_subrev_i32_e32 v1, vcc, s0, v1
4913; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
4914; GCN-NEXT:    s_endpgm
4915  %shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y
4916  %r = srem <2 x i32> %x, %shl.y
4917  store <2 x i32> %r, <2 x i32> addrspace(1)* %out
4918  ret void
4919}
4920
; Scalar i64 unsigned division by the odd constant 1235195949943
; (32-bit halves 0x11f and 0x976a7377).
; The opt-side assertions expect the udiv to come through unchanged.
; The llc-side assertions expect a long expansion built from 32-bit pieces:
; a float reciprocal (v_rcp_f32) of a constant-only value, two rounds of
; mul_lo/mul_hi with add-with-carry chains, a multiply of the dividend by
; the resulting estimate, and compare/select steps that pick between the
; quotient estimate, estimate + 1 and estimate + 2 before the dwordx2 store.
4921define amdgpu_kernel void @udiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
4922; CHECK-LABEL: @udiv_i64_oddk_denom(
4923; CHECK-NEXT:    [[R:%.*]] = udiv i64 [[X:%.*]], 1235195949943
4924; CHECK-NEXT:    store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4
4925; CHECK-NEXT:    ret void
4926;
4927; GCN-LABEL: udiv_i64_oddk_denom:
4928; GCN:       ; %bb.0:
4929; GCN-NEXT:    v_mov_b32_e32 v0, 0x4f176a73
4930; GCN-NEXT:    v_mov_b32_e32 v1, 0x4f800000
4931; GCN-NEXT:    v_madmk_f32 v0, v1, 0x438f8000, v0
4932; GCN-NEXT:    v_rcp_f32_e32 v0, v0
4933; GCN-NEXT:    s_movk_i32 s2, 0xfee0
4934; GCN-NEXT:    s_mov_b32 s3, 0x68958c89
4935; GCN-NEXT:    v_mov_b32_e32 v8, 0
4936; GCN-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
4937; GCN-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
4938; GCN-NEXT:    v_trunc_f32_e32 v1, v1
4939; GCN-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
4940; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
4941; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
4942; GCN-NEXT:    v_mov_b32_e32 v7, 0
4943; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
4944; GCN-NEXT:    v_mul_lo_u32 v2, v0, s2
4945; GCN-NEXT:    v_mul_hi_u32 v3, v0, s3
4946; GCN-NEXT:    v_mul_lo_u32 v4, v1, s3
4947; GCN-NEXT:    s_mov_b32 s11, 0xf000
4948; GCN-NEXT:    s_waitcnt lgkmcnt(0)
4949; GCN-NEXT:    s_mov_b32 s8, s4
4950; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
4951; GCN-NEXT:    v_mul_lo_u32 v3, v0, s3
4952; GCN-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
4953; GCN-NEXT:    v_mul_lo_u32 v5, v0, v2
4954; GCN-NEXT:    v_mul_hi_u32 v4, v0, v2
4955; GCN-NEXT:    v_mul_hi_u32 v6, v0, v3
4956; GCN-NEXT:    v_mul_hi_u32 v9, v1, v2
4957; GCN-NEXT:    v_mul_lo_u32 v2, v1, v2
4958; GCN-NEXT:    s_movk_i32 s4, 0x11e
4959; GCN-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
4960; GCN-NEXT:    v_mul_lo_u32 v6, v1, v3
4961; GCN-NEXT:    v_mul_hi_u32 v3, v1, v3
4962; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v8, v4, vcc
4963; GCN-NEXT:    s_mov_b32 s10, -1
4964; GCN-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
4965; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v4, v3, vcc
4966; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v9, v7, vcc
4967; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
4968; GCN-NEXT:    v_add_i32_e64 v0, s[0:1], v0, v2
4969; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v8, v4, vcc
4970; GCN-NEXT:    v_mul_lo_u32 v4, v0, s2
4971; GCN-NEXT:    v_mul_hi_u32 v5, v0, s3
4972; GCN-NEXT:    v_addc_u32_e64 v2, vcc, v1, v3, s[0:1]
4973; GCN-NEXT:    v_mul_lo_u32 v6, v2, s3
4974; GCN-NEXT:    s_mov_b32 s2, 0x976a7377
4975; GCN-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
4976; GCN-NEXT:    v_mul_lo_u32 v5, v0, s3
4977; GCN-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
4978; GCN-NEXT:    v_mul_lo_u32 v6, v0, v4
4979; GCN-NEXT:    v_mul_hi_u32 v10, v0, v4
4980; GCN-NEXT:    v_mul_hi_u32 v9, v0, v5
4981; GCN-NEXT:    v_mul_hi_u32 v11, v2, v4
4982; GCN-NEXT:    s_movk_i32 s3, 0x11f
4983; GCN-NEXT:    s_mov_b32 s9, s5
4984; GCN-NEXT:    v_add_i32_e32 v6, vcc, v9, v6
4985; GCN-NEXT:    v_addc_u32_e32 v9, vcc, v8, v10, vcc
4986; GCN-NEXT:    v_mul_lo_u32 v10, v2, v5
4987; GCN-NEXT:    v_mul_hi_u32 v5, v2, v5
4988; GCN-NEXT:    v_mul_lo_u32 v2, v2, v4
4989; GCN-NEXT:    v_add_i32_e32 v6, vcc, v6, v10
4990; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v9, v5, vcc
4991; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v11, v7, vcc
4992; GCN-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
4993; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v8, v4, vcc
4994; GCN-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
4995; GCN-NEXT:    v_addc_u32_e64 v1, vcc, v1, v4, s[0:1]
4996; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
4997; GCN-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
4998; GCN-NEXT:    v_mul_lo_u32 v2, s6, v1
4999; GCN-NEXT:    v_mul_hi_u32 v3, s6, v0
5000; GCN-NEXT:    v_mul_hi_u32 v4, s6, v1
5001; GCN-NEXT:    v_mul_hi_u32 v5, s7, v1
5002; GCN-NEXT:    v_mul_lo_u32 v1, s7, v1
5003; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
5004; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v8, v4, vcc
5005; GCN-NEXT:    v_mul_lo_u32 v4, s7, v0
5006; GCN-NEXT:    v_mul_hi_u32 v0, s7, v0
5007; GCN-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
5008; GCN-NEXT:    v_addc_u32_e32 v0, vcc, v3, v0, vcc
5009; GCN-NEXT:    v_addc_u32_e32 v2, vcc, v5, v7, vcc
5010; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
5011; GCN-NEXT:    v_addc_u32_e32 v1, vcc, v8, v2, vcc
5012; GCN-NEXT:    v_mul_lo_u32 v2, v0, s3
5013; GCN-NEXT:    v_mul_hi_u32 v3, v0, s2
5014; GCN-NEXT:    v_mul_lo_u32 v4, v1, s2
5015; GCN-NEXT:    v_mov_b32_e32 v5, s3
5016; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
5017; GCN-NEXT:    v_mul_lo_u32 v3, v0, s2
5018; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
5019; GCN-NEXT:    v_sub_i32_e32 v4, vcc, s7, v2
5020; GCN-NEXT:    v_sub_i32_e32 v3, vcc, s6, v3
5021; GCN-NEXT:    v_subb_u32_e64 v4, s[0:1], v4, v5, vcc
5022; GCN-NEXT:    v_subrev_i32_e64 v5, s[0:1], s2, v3
5023; GCN-NEXT:    v_subbrev_u32_e64 v4, s[0:1], 0, v4, s[0:1]
5024; GCN-NEXT:    v_cmp_lt_u32_e64 s[0:1], s4, v4
5025; GCN-NEXT:    s_mov_b32 s2, 0x976a7376
5026; GCN-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[0:1]
5027; GCN-NEXT:    v_cmp_lt_u32_e64 s[0:1], s2, v5
5028; GCN-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s[0:1]
5029; GCN-NEXT:    v_cmp_eq_u32_e64 s[0:1], s3, v4
5030; GCN-NEXT:    v_cndmask_b32_e64 v4, v6, v5, s[0:1]
5031; GCN-NEXT:    v_add_i32_e64 v5, s[0:1], 2, v0
5032; GCN-NEXT:    v_addc_u32_e64 v6, s[0:1], 0, v1, s[0:1]
5033; GCN-NEXT:    v_add_i32_e64 v7, s[0:1], 1, v0
5034; GCN-NEXT:    v_addc_u32_e64 v8, s[0:1], 0, v1, s[0:1]
5035; GCN-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v4
5036; GCN-NEXT:    v_cndmask_b32_e64 v4, v8, v6, s[0:1]
5037; GCN-NEXT:    v_mov_b32_e32 v6, s7
5038; GCN-NEXT:    v_subb_u32_e32 v2, vcc, v6, v2, vcc
5039; GCN-NEXT:    v_cmp_lt_u32_e32 vcc, s4, v2
5040; GCN-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
5041; GCN-NEXT:    v_cmp_lt_u32_e32 vcc, s2, v3
5042; GCN-NEXT:    v_cndmask_b32_e64 v3, 0, -1, vcc
5043; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, s3, v2
5044; GCN-NEXT:    v_cndmask_b32_e32 v2, v6, v3, vcc
5045; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
5046; GCN-NEXT:    v_cndmask_b32_e64 v2, v7, v5, s[0:1]
5047; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
5048; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
5049; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
5050; GCN-NEXT:    s_endpgm
5051  %r = udiv i64 %x, 1235195949943
5052  store i64 %r, i64 addrspace(1)* %out
5053  ret void
5054}
5055
; Scalar i64 unsigned division by the power-of-two constant 4096.
; The opt-side assertions expect the udiv to come through unchanged.
; The llc-side assertions expect the whole division to become a single
; 64-bit scalar logical shift right by 12 (s_lshr_b64).
5056define amdgpu_kernel void @udiv_i64_pow2k_denom(i64 addrspace(1)* %out, i64 %x) {
5057; CHECK-LABEL: @udiv_i64_pow2k_denom(
5058; CHECK-NEXT:    [[R:%.*]] = udiv i64 [[X:%.*]], 4096
5059; CHECK-NEXT:    store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4
5060; CHECK-NEXT:    ret void
5061;
5062; GCN-LABEL: udiv_i64_pow2k_denom:
5063; GCN:       ; %bb.0:
5064; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
5065; GCN-NEXT:    s_mov_b32 s7, 0xf000
5066; GCN-NEXT:    s_mov_b32 s6, -1
5067; GCN-NEXT:    s_waitcnt lgkmcnt(0)
5068; GCN-NEXT:    s_mov_b32 s4, s0
5069; GCN-NEXT:    s_mov_b32 s5, s1
5070; GCN-NEXT:    s_lshr_b64 s[0:1], s[2:3], 12
5071; GCN-NEXT:    v_mov_b32_e32 v0, s0
5072; GCN-NEXT:    v_mov_b32_e32 v1, s1
5073; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
5074; GCN-NEXT:    s_endpgm
5075  %r = udiv i64 %x, 4096
5076  store i64 %r, i64 addrspace(1)* %out
5077  ret void
5078}
5079
; Scalar i64 unsigned division where the denominator is `shl i64 4096, %y`.
; The opt-side assertions expect both the shl and the udiv to come through
; unchanged.
; The llc-side assertions expect no division expansion: 12 is added to the
; loaded shift amount and the dividend is shifted right by the sum with one
; s_lshr_b64.
5080define amdgpu_kernel void @udiv_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %x, i64 %y) {
5081; CHECK-LABEL: @udiv_i64_pow2_shl_denom(
5082; CHECK-NEXT:    [[SHL_Y:%.*]] = shl i64 4096, [[Y:%.*]]
5083; CHECK-NEXT:    [[R:%.*]] = udiv i64 [[X:%.*]], [[SHL_Y]]
5084; CHECK-NEXT:    store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4
5085; CHECK-NEXT:    ret void
5086;
5087; GCN-LABEL: udiv_i64_pow2_shl_denom:
5088; GCN:       ; %bb.0:
5089; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
5090; GCN-NEXT:    s_load_dword s8, s[0:1], 0xd
5091; GCN-NEXT:    s_mov_b32 s3, 0xf000
5092; GCN-NEXT:    s_mov_b32 s2, -1
5093; GCN-NEXT:    s_waitcnt lgkmcnt(0)
5094; GCN-NEXT:    s_mov_b32 s0, s4
5095; GCN-NEXT:    s_add_i32 s8, s8, 12
5096; GCN-NEXT:    s_mov_b32 s1, s5
5097; GCN-NEXT:    s_lshr_b64 s[4:5], s[6:7], s8
5098; GCN-NEXT:    v_mov_b32_e32 v0, s4
5099; GCN-NEXT:    v_mov_b32_e32 v1, s5
5100; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
5101; GCN-NEXT:    s_endpgm
5102  %shl.y = shl i64 4096, %y
5103  %r = udiv i64 %x, %shl.y
5104  store i64 %r, i64 addrspace(1)* %out
5105  ret void
5106}
5107
; <2 x i64> unsigned division by the splat constant 4096.
; The opt-side assertions expect the vector udiv to be split into one
; `udiv i64 ..., 4096` per element (extractelement / udiv / insertelement).
; The llc-side assertions expect each element to become one s_lshr_b64 by
; 12, with the four result dwords written by a single dwordx4 store.
5108define amdgpu_kernel void @udiv_v2i64_pow2k_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x) {
5109; CHECK-LABEL: @udiv_v2i64_pow2k_denom(
5110; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0
5111; CHECK-NEXT:    [[TMP2:%.*]] = udiv i64 [[TMP1]], 4096
5112; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i64 0
5113; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1
5114; CHECK-NEXT:    [[TMP5:%.*]] = udiv i64 [[TMP4]], 4096
5115; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1
5116; CHECK-NEXT:    store <2 x i64> [[TMP6]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16
5117; CHECK-NEXT:    ret void
5118;
5119; GCN-LABEL: udiv_v2i64_pow2k_denom:
5120; GCN:       ; %bb.0:
5121; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
5122; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xd
5123; GCN-NEXT:    s_mov_b32 s7, 0xf000
5124; GCN-NEXT:    s_mov_b32 s6, -1
5125; GCN-NEXT:    s_waitcnt lgkmcnt(0)
5126; GCN-NEXT:    s_lshr_b64 s[0:1], s[0:1], 12
5127; GCN-NEXT:    s_lshr_b64 s[2:3], s[2:3], 12
5128; GCN-NEXT:    v_mov_b32_e32 v0, s0
5129; GCN-NEXT:    v_mov_b32_e32 v1, s1
5130; GCN-NEXT:    v_mov_b32_e32 v2, s2
5131; GCN-NEXT:    v_mov_b32_e32 v3, s3
5132; GCN-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
5133; GCN-NEXT:    s_endpgm
5134  %r = udiv <2 x i64> %x, <i64 4096, i64 4096>
5135  store <2 x i64> %r, <2 x i64> addrspace(1)* %out
5136  ret void
5137}
5138
; <2 x i64> unsigned division by the non-splat constant <4096, 4095>: one
; power-of-two lane and one non-power-of-two lane.
; The opt-side assertions expect the vector udiv to be split into
; `udiv i64 ..., 4096` for element 0 and `udiv i64 ..., 4095` for element 1.
; The llc-side assertions expect element 0 to become a single s_lshr_b64 by
; 12, while element 1 goes through the long reciprocal-based expansion
; (v_rcp_f32, mul_lo/mul_hi with carry chains, compare/select corrections);
; both results are written by one dwordx4 store.
5139define amdgpu_kernel void @udiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x) {
5140; CHECK-LABEL: @udiv_v2i64_mixed_pow2k_denom(
5141; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0
5142; CHECK-NEXT:    [[TMP2:%.*]] = udiv i64 [[TMP1]], 4096
5143; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i64 0
5144; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1
5145; CHECK-NEXT:    [[TMP5:%.*]] = udiv i64 [[TMP4]], 4095
5146; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1
5147; CHECK-NEXT:    store <2 x i64> [[TMP6]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16
5148; CHECK-NEXT:    ret void
5149;
5150; GCN-LABEL: udiv_v2i64_mixed_pow2k_denom:
5151; GCN:       ; %bb.0:
5152; GCN-NEXT:    v_mov_b32_e32 v0, 0x4f800000
5153; GCN-NEXT:    v_madak_f32 v0, 0, v0, 0x457ff000
5154; GCN-NEXT:    v_rcp_f32_e32 v0, v0
5155; GCN-NEXT:    s_movk_i32 s6, 0xf001
5156; GCN-NEXT:    v_mov_b32_e32 v7, 0
5157; GCN-NEXT:    v_mov_b32_e32 v2, 0
5158; GCN-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
5159; GCN-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
5160; GCN-NEXT:    v_trunc_f32_e32 v1, v1
5161; GCN-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
5162; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
5163; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
5164; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
5165; GCN-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0xd
5166; GCN-NEXT:    s_movk_i32 s0, 0xfff
5167; GCN-NEXT:    v_mul_hi_u32 v3, v0, s6
5168; GCN-NEXT:    v_mul_lo_u32 v5, v1, s6
5169; GCN-NEXT:    v_mul_lo_u32 v4, v0, s6
5170; GCN-NEXT:    s_mov_b32 s7, 0xf000
5171; GCN-NEXT:    v_subrev_i32_e32 v3, vcc, v0, v3
5172; GCN-NEXT:    v_add_i32_e32 v3, vcc, v5, v3
5173; GCN-NEXT:    v_mul_hi_u32 v6, v0, v4
5174; GCN-NEXT:    v_mul_lo_u32 v5, v0, v3
5175; GCN-NEXT:    v_mul_hi_u32 v8, v0, v3
5176; GCN-NEXT:    v_mul_hi_u32 v9, v1, v3
5177; GCN-NEXT:    v_mul_lo_u32 v3, v1, v3
5178; GCN-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
5179; GCN-NEXT:    v_addc_u32_e32 v6, vcc, v7, v8, vcc
5180; GCN-NEXT:    v_mul_lo_u32 v8, v1, v4
5181; GCN-NEXT:    v_mul_hi_u32 v4, v1, v4
5182; GCN-NEXT:    v_add_i32_e32 v5, vcc, v8, v5
5183; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v6, v4, vcc
5184; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v9, v2, vcc
5185; GCN-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
5186; GCN-NEXT:    v_add_i32_e64 v0, s[2:3], v0, v3
5187; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v7, v5, vcc
5188; GCN-NEXT:    v_mul_hi_u32 v5, v0, s6
5189; GCN-NEXT:    v_addc_u32_e64 v3, vcc, v1, v4, s[2:3]
5190; GCN-NEXT:    v_mul_lo_u32 v6, v3, s6
5191; GCN-NEXT:    v_mul_lo_u32 v8, v0, s6
5192; GCN-NEXT:    v_subrev_i32_e32 v5, vcc, v0, v5
5193; GCN-NEXT:    s_mov_b32 s6, -1
5194; GCN-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
5195; GCN-NEXT:    v_mul_lo_u32 v6, v0, v5
5196; GCN-NEXT:    v_mul_hi_u32 v9, v0, v8
5197; GCN-NEXT:    v_mul_hi_u32 v10, v0, v5
5198; GCN-NEXT:    v_mul_hi_u32 v11, v3, v5
5199; GCN-NEXT:    v_add_i32_e32 v6, vcc, v9, v6
5200; GCN-NEXT:    v_addc_u32_e32 v9, vcc, v7, v10, vcc
5201; GCN-NEXT:    v_mul_lo_u32 v10, v3, v8
5202; GCN-NEXT:    v_mul_hi_u32 v8, v3, v8
5203; GCN-NEXT:    v_mul_lo_u32 v3, v3, v5
5204; GCN-NEXT:    v_add_i32_e32 v6, vcc, v6, v10
5205; GCN-NEXT:    v_addc_u32_e32 v6, vcc, v9, v8, vcc
5206; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v11, v2, vcc
5207; GCN-NEXT:    v_add_i32_e32 v3, vcc, v6, v3
5208; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v7, v5, vcc
5209; GCN-NEXT:    v_add_i32_e32 v1, vcc, v1, v4
5210; GCN-NEXT:    v_addc_u32_e64 v1, vcc, v1, v5, s[2:3]
5211; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v3
5212; GCN-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
5213; GCN-NEXT:    s_waitcnt lgkmcnt(0)
5214; GCN-NEXT:    v_mul_lo_u32 v3, s10, v1
5215; GCN-NEXT:    v_mul_hi_u32 v4, s10, v0
5216; GCN-NEXT:    v_mul_hi_u32 v5, s10, v1
5217; GCN-NEXT:    v_mul_hi_u32 v6, s11, v1
5218; GCN-NEXT:    v_mul_lo_u32 v1, s11, v1
5219; GCN-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
5220; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v7, v5, vcc
5221; GCN-NEXT:    v_mul_lo_u32 v5, s11, v0
5222; GCN-NEXT:    v_mul_hi_u32 v0, s11, v0
5223; GCN-NEXT:    s_lshr_b64 s[2:3], s[8:9], 12
5224; GCN-NEXT:    v_add_i32_e32 v3, vcc, v5, v3
5225; GCN-NEXT:    v_addc_u32_e32 v0, vcc, v4, v0, vcc
5226; GCN-NEXT:    v_addc_u32_e32 v2, vcc, v6, v2, vcc
5227; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
5228; GCN-NEXT:    v_addc_u32_e32 v1, vcc, v7, v2, vcc
5229; GCN-NEXT:    v_mul_lo_u32 v2, v1, s0
5230; GCN-NEXT:    v_mul_hi_u32 v3, v0, s0
5231; GCN-NEXT:    v_mul_lo_u32 v4, v0, s0
5232; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
5233; GCN-NEXT:    v_mov_b32_e32 v3, s11
5234; GCN-NEXT:    v_sub_i32_e32 v4, vcc, s10, v4
5235; GCN-NEXT:    v_subb_u32_e32 v2, vcc, v3, v2, vcc
5236; GCN-NEXT:    v_subrev_i32_e32 v3, vcc, s0, v4
5237; GCN-NEXT:    v_subbrev_u32_e32 v5, vcc, 0, v2, vcc
5238; GCN-NEXT:    s_movk_i32 s0, 0xffe
5239; GCN-NEXT:    v_cmp_lt_u32_e32 vcc, s0, v3
5240; GCN-NEXT:    v_cndmask_b32_e64 v3, 0, -1, vcc
5241; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v5
5242; GCN-NEXT:    v_cndmask_b32_e32 v3, -1, v3, vcc
5243; GCN-NEXT:    v_add_i32_e32 v5, vcc, 2, v0
5244; GCN-NEXT:    v_addc_u32_e32 v6, vcc, 0, v1, vcc
5245; GCN-NEXT:    v_add_i32_e32 v7, vcc, 1, v0
5246; GCN-NEXT:    v_cmp_lt_u32_e64 s[0:1], s0, v4
5247; GCN-NEXT:    v_addc_u32_e32 v8, vcc, 0, v1, vcc
5248; GCN-NEXT:    v_cndmask_b32_e64 v4, 0, -1, s[0:1]
5249; GCN-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v2
5250; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
5251; GCN-NEXT:    v_cndmask_b32_e64 v2, -1, v4, s[0:1]
5252; GCN-NEXT:    v_cndmask_b32_e32 v3, v8, v6, vcc
5253; GCN-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v2
5254; GCN-NEXT:    v_cndmask_b32_e64 v3, v1, v3, s[0:1]
5255; GCN-NEXT:    v_cndmask_b32_e32 v1, v7, v5, vcc
5256; GCN-NEXT:    v_cndmask_b32_e64 v2, v0, v1, s[0:1]
5257; GCN-NEXT:    v_mov_b32_e32 v0, s2
5258; GCN-NEXT:    v_mov_b32_e32 v1, s3
5259; GCN-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
5260; GCN-NEXT:    s_endpgm
5261  %r = udiv <2 x i64> %x, <i64 4096, i64 4095>
5262  store <2 x i64> %r, <2 x i64> addrspace(1)* %out
5263  ret void
5264}
5265
; <2 x i64> unsigned division where the denominator is a splat of 4096
; shifted left by %y.
; The opt-side assertions expect the vector udiv to be split into one
; scalar `udiv i64` per element on the extracted dividend and shifted
; denominator.
; The llc-side assertions expect no division expansion: for each element 12
; is added to the loaded shift amount and the dividend is shifted right with
; s_lshr_b64; both results are written by one dwordx4 store.
5266define amdgpu_kernel void @udiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x, <2 x i64> %y) {
5267; CHECK-LABEL: @udiv_v2i64_pow2_shl_denom(
5268; CHECK-NEXT:    [[SHL_Y:%.*]] = shl <2 x i64> <i64 4096, i64 4096>, [[Y:%.*]]
5269; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0
5270; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 0
5271; CHECK-NEXT:    [[TMP3:%.*]] = udiv i64 [[TMP1]], [[TMP2]]
5272; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x i64> undef, i64 [[TMP3]], i64 0
5273; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[X]], i64 1
5274; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 1
5275; CHECK-NEXT:    [[TMP7:%.*]] = udiv i64 [[TMP5]], [[TMP6]]
5276; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <2 x i64> [[TMP4]], i64 [[TMP7]], i64 1
5277; CHECK-NEXT:    store <2 x i64> [[TMP8]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16
5278; CHECK-NEXT:    ret void
5279;
5280; GCN-LABEL: udiv_v2i64_pow2_shl_denom:
5281; GCN:       ; %bb.0:
5282; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
5283; GCN-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0xd
5284; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x11
5285; GCN-NEXT:    s_mov_b32 s7, 0xf000
5286; GCN-NEXT:    s_mov_b32 s6, -1
5287; GCN-NEXT:    s_waitcnt lgkmcnt(0)
5288; GCN-NEXT:    s_add_i32 s0, s0, 12
5289; GCN-NEXT:    s_add_i32 s2, s2, 12
5290; GCN-NEXT:    s_lshr_b64 s[0:1], s[8:9], s0
5291; GCN-NEXT:    s_lshr_b64 s[2:3], s[10:11], s2
5292; GCN-NEXT:    v_mov_b32_e32 v0, s0
5293; GCN-NEXT:    v_mov_b32_e32 v1, s1
5294; GCN-NEXT:    v_mov_b32_e32 v2, s2
5295; GCN-NEXT:    v_mov_b32_e32 v3, s3
5296; GCN-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
5297; GCN-NEXT:    s_endpgm
5298  %shl.y = shl <2 x i64> <i64 4096, i64 4096>, %y
5299  %r = udiv <2 x i64> %x, %shl.y
5300  store <2 x i64> %r, <2 x i64> addrspace(1)* %out
5301  ret void
5302}
5303
; Unsigned i64 remainder of %x by the odd constant 1235195393993, stored to
; %out. Expected opt output: the urem is left unchanged. Expected llc output:
; an inline 64-bit expansion seeded by v_rcp_f32 and refined with 32-bit
; v_mul_lo_u32 / v_mul_hi_u32 and carry chains; s12:s13 (0x11f:0x9761f7c9)
; hold the high and low halves of the divisor.
5304define amdgpu_kernel void @urem_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
5305; CHECK-LABEL: @urem_i64_oddk_denom(
5306; CHECK-NEXT:    [[R:%.*]] = urem i64 [[X:%.*]], 1235195393993
5307; CHECK-NEXT:    store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4
5308; CHECK-NEXT:    ret void
5309;
5310; GCN-LABEL: urem_i64_oddk_denom:
5311; GCN:       ; %bb.0:
5312; GCN-NEXT:    v_mov_b32_e32 v0, 0x4f1761f8
5313; GCN-NEXT:    v_mov_b32_e32 v1, 0x4f800000
5314; GCN-NEXT:    v_madmk_f32 v0, v1, 0x438f8000, v0
5315; GCN-NEXT:    v_rcp_f32_e32 v0, v0
5316; GCN-NEXT:    s_movk_i32 s2, 0xfee0
5317; GCN-NEXT:    s_mov_b32 s3, 0x689e0837
5318; GCN-NEXT:    v_mov_b32_e32 v8, 0
5319; GCN-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
5320; GCN-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
5321; GCN-NEXT:    v_trunc_f32_e32 v1, v1
5322; GCN-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
5323; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
5324; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
5325; GCN-NEXT:    v_mov_b32_e32 v7, 0
5326; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
5327; GCN-NEXT:    v_mul_lo_u32 v2, v0, s2
5328; GCN-NEXT:    v_mul_hi_u32 v3, v0, s3
5329; GCN-NEXT:    v_mul_lo_u32 v4, v1, s3
5330; GCN-NEXT:    s_movk_i32 s12, 0x11f
5331; GCN-NEXT:    s_mov_b32 s13, 0x9761f7c9
5332; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
5333; GCN-NEXT:    v_mul_lo_u32 v3, v0, s3
5334; GCN-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
5335; GCN-NEXT:    v_mul_lo_u32 v5, v0, v2
5336; GCN-NEXT:    v_mul_hi_u32 v4, v0, v2
5337; GCN-NEXT:    v_mul_hi_u32 v6, v0, v3
5338; GCN-NEXT:    v_mul_hi_u32 v9, v1, v2
5339; GCN-NEXT:    v_mul_lo_u32 v2, v1, v2
5340; GCN-NEXT:    s_waitcnt lgkmcnt(0)
5341; GCN-NEXT:    s_mov_b32 s9, s5
5342; GCN-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
5343; GCN-NEXT:    v_mul_lo_u32 v6, v1, v3
5344; GCN-NEXT:    v_mul_hi_u32 v3, v1, v3
5345; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v8, v4, vcc
5346; GCN-NEXT:    s_movk_i32 s5, 0x11e
5347; GCN-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
5348; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v4, v3, vcc
5349; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v9, v7, vcc
5350; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
5351; GCN-NEXT:    v_add_i32_e64 v0, s[0:1], v0, v2
5352; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v8, v4, vcc
5353; GCN-NEXT:    v_mul_lo_u32 v4, v0, s2
5354; GCN-NEXT:    v_mul_hi_u32 v5, v0, s3
5355; GCN-NEXT:    v_addc_u32_e64 v2, vcc, v1, v3, s[0:1]
5356; GCN-NEXT:    v_mul_lo_u32 v6, v2, s3
5357; GCN-NEXT:    s_mov_b32 s8, s4
5358; GCN-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
5359; GCN-NEXT:    v_mul_lo_u32 v5, v0, s3
5360; GCN-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
5361; GCN-NEXT:    v_mul_lo_u32 v6, v0, v4
5362; GCN-NEXT:    v_mul_hi_u32 v10, v0, v4
5363; GCN-NEXT:    v_mul_hi_u32 v9, v0, v5
5364; GCN-NEXT:    v_mul_hi_u32 v11, v2, v4
5365; GCN-NEXT:    s_mov_b32 s4, 0x9761f7c8
5366; GCN-NEXT:    s_mov_b32 s11, 0xf000
5367; GCN-NEXT:    v_add_i32_e32 v6, vcc, v9, v6
5368; GCN-NEXT:    v_addc_u32_e32 v9, vcc, v8, v10, vcc
5369; GCN-NEXT:    v_mul_lo_u32 v10, v2, v5
5370; GCN-NEXT:    v_mul_hi_u32 v5, v2, v5
5371; GCN-NEXT:    v_mul_lo_u32 v2, v2, v4
5372; GCN-NEXT:    s_mov_b32 s10, -1
5373; GCN-NEXT:    v_add_i32_e32 v6, vcc, v6, v10
5374; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v9, v5, vcc
5375; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v11, v7, vcc
5376; GCN-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
5377; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v8, v4, vcc
5378; GCN-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
5379; GCN-NEXT:    v_addc_u32_e64 v1, vcc, v1, v4, s[0:1]
5380; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
5381; GCN-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
5382; GCN-NEXT:    v_mul_lo_u32 v2, s6, v1
5383; GCN-NEXT:    v_mul_hi_u32 v3, s6, v0
5384; GCN-NEXT:    v_mul_hi_u32 v4, s6, v1
5385; GCN-NEXT:    v_mul_hi_u32 v5, s7, v1
5386; GCN-NEXT:    v_mul_lo_u32 v1, s7, v1
5387; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
5388; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v8, v4, vcc
5389; GCN-NEXT:    v_mul_lo_u32 v4, s7, v0
5390; GCN-NEXT:    v_mul_hi_u32 v0, s7, v0
5391; GCN-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
5392; GCN-NEXT:    v_addc_u32_e32 v0, vcc, v3, v0, vcc
5393; GCN-NEXT:    v_addc_u32_e32 v2, vcc, v5, v7, vcc
5394; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
5395; GCN-NEXT:    v_addc_u32_e32 v1, vcc, v8, v2, vcc
5396; GCN-NEXT:    v_mul_lo_u32 v2, v0, s12
5397; GCN-NEXT:    v_mul_hi_u32 v3, v0, s13
5398; GCN-NEXT:    v_mul_lo_u32 v1, v1, s13
5399; GCN-NEXT:    v_mul_lo_u32 v0, v0, s13
5400; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
5401; GCN-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
5402; GCN-NEXT:    v_sub_i32_e32 v2, vcc, s7, v1
5403; GCN-NEXT:    v_mov_b32_e32 v3, s12
5404; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s6, v0
5405; GCN-NEXT:    v_subb_u32_e64 v2, s[0:1], v2, v3, vcc
5406; GCN-NEXT:    v_subrev_i32_e64 v4, s[0:1], s13, v0
5407; GCN-NEXT:    v_subbrev_u32_e64 v5, s[2:3], 0, v2, s[0:1]
5408; GCN-NEXT:    v_cmp_lt_u32_e64 s[2:3], s5, v5
5409; GCN-NEXT:    v_subb_u32_e64 v2, s[0:1], v2, v3, s[0:1]
5410; GCN-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[2:3]
5411; GCN-NEXT:    v_cmp_lt_u32_e64 s[2:3], s4, v4
5412; GCN-NEXT:    v_subrev_i32_e64 v3, s[0:1], s13, v4
5413; GCN-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[2:3]
5414; GCN-NEXT:    v_cmp_eq_u32_e64 s[2:3], s12, v5
5415; GCN-NEXT:    v_cndmask_b32_e64 v6, v6, v7, s[2:3]
5416; GCN-NEXT:    v_subbrev_u32_e64 v2, s[0:1], 0, v2, s[0:1]
5417; GCN-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v6
5418; GCN-NEXT:    v_cndmask_b32_e64 v2, v5, v2, s[0:1]
5419; GCN-NEXT:    v_mov_b32_e32 v5, s7
5420; GCN-NEXT:    v_subb_u32_e32 v1, vcc, v5, v1, vcc
5421; GCN-NEXT:    v_cmp_lt_u32_e32 vcc, s5, v1
5422; GCN-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
5423; GCN-NEXT:    v_cmp_lt_u32_e32 vcc, s4, v0
5424; GCN-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
5425; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, s12, v1
5426; GCN-NEXT:    v_cndmask_b32_e32 v5, v5, v6, vcc
5427; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
5428; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
5429; GCN-NEXT:    v_cndmask_b32_e64 v2, v4, v3, s[0:1]
5430; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
5431; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
5432; GCN-NEXT:    s_endpgm
5433  %r = urem i64 %x, 1235195393993
5434  store i64 %r, i64 addrspace(1)* %out
5435  ret void
5436}
5437
; Unsigned i64 remainder of %x by the power-of-two constant 4096, stored to
; %out. Expected opt output: the urem is left unchanged. Expected llc output:
; a single s_and_b32 of the low dword with 0xfff; the high dword of the result
; is the constant 0 held in v1.
5438define amdgpu_kernel void @urem_i64_pow2k_denom(i64 addrspace(1)* %out, i64 %x) {
5439; CHECK-LABEL: @urem_i64_pow2k_denom(
5440; CHECK-NEXT:    [[R:%.*]] = urem i64 [[X:%.*]], 4096
5441; CHECK-NEXT:    store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4
5442; CHECK-NEXT:    ret void
5443;
5444; GCN-LABEL: urem_i64_pow2k_denom:
5445; GCN:       ; %bb.0:
5446; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
5447; GCN-NEXT:    s_mov_b32 s3, 0xf000
5448; GCN-NEXT:    s_mov_b32 s2, -1
5449; GCN-NEXT:    v_mov_b32_e32 v1, 0
5450; GCN-NEXT:    s_waitcnt lgkmcnt(0)
5451; GCN-NEXT:    s_mov_b32 s0, s4
5452; GCN-NEXT:    s_and_b32 s4, s6, 0xfff
5453; GCN-NEXT:    s_mov_b32 s1, s5
5454; GCN-NEXT:    v_mov_b32_e32 v0, s4
5455; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
5456; GCN-NEXT:    s_endpgm
5457  %r = urem i64 %x, 4096
5458  store i64 %r, i64 addrspace(1)* %out
5459  ret void
5460}
5461
; Unsigned i64 remainder of %x by (4096 << %y), stored to %out.
; Expected opt output: the shl and urem are left unchanged. Expected llc
; output: the divisor is built with s_lshl_b64, decremented by one
; (s_add_u32 / s_addc_u32 with -1), and the result is x ANDed with that value
; via s_and_b64.
5462define amdgpu_kernel void @urem_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %x, i64 %y) {
5463; CHECK-LABEL: @urem_i64_pow2_shl_denom(
5464; CHECK-NEXT:    [[SHL_Y:%.*]] = shl i64 4096, [[Y:%.*]]
5465; CHECK-NEXT:    [[R:%.*]] = urem i64 [[X:%.*]], [[SHL_Y]]
5466; CHECK-NEXT:    store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4
5467; CHECK-NEXT:    ret void
5468;
5469; GCN-LABEL: urem_i64_pow2_shl_denom:
5470; GCN:       ; %bb.0:
5471; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
5472; GCN-NEXT:    s_load_dword s8, s[0:1], 0xd
5473; GCN-NEXT:    s_mov_b32 s3, 0xf000
5474; GCN-NEXT:    s_mov_b32 s2, -1
5475; GCN-NEXT:    s_waitcnt lgkmcnt(0)
5476; GCN-NEXT:    s_mov_b32 s0, s4
5477; GCN-NEXT:    s_mov_b32 s1, s5
5478; GCN-NEXT:    s_mov_b32 s5, 0
5479; GCN-NEXT:    s_movk_i32 s4, 0x1000
5480; GCN-NEXT:    s_lshl_b64 s[4:5], s[4:5], s8
5481; GCN-NEXT:    s_add_u32 s4, s4, -1
5482; GCN-NEXT:    s_addc_u32 s5, s5, -1
5483; GCN-NEXT:    s_and_b64 s[4:5], s[6:7], s[4:5]
5484; GCN-NEXT:    v_mov_b32_e32 v0, s4
5485; GCN-NEXT:    v_mov_b32_e32 v1, s5
5486; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
5487; GCN-NEXT:    s_endpgm
5488  %shl.y = shl i64 4096, %y
5489  %r = urem i64 %x, %shl.y
5490  store i64 %r, i64 addrspace(1)* %out
5491  ret void
5492}
5493
; Unsigned <2 x i64> remainder of %x by splat 4096, stored to %out.
; Expected opt output: the vector urem is split into one scalar urem per lane.
; Expected llc output: each lane's low dword is masked with 0xfff (s_and_b32
; against s8) and both high dwords are the constant 0 (v1, copied to v3).
5494define amdgpu_kernel void @urem_v2i64_pow2k_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x) {
5495; CHECK-LABEL: @urem_v2i64_pow2k_denom(
5496; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0
5497; CHECK-NEXT:    [[TMP2:%.*]] = urem i64 [[TMP1]], 4096
5498; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i64 0
5499; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1
5500; CHECK-NEXT:    [[TMP5:%.*]] = urem i64 [[TMP4]], 4096
5501; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1
5502; CHECK-NEXT:    store <2 x i64> [[TMP6]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16
5503; CHECK-NEXT:    ret void
5504;
5505; GCN-LABEL: urem_v2i64_pow2k_denom:
5506; GCN:       ; %bb.0:
5507; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
5508; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xd
5509; GCN-NEXT:    s_movk_i32 s8, 0xfff
5510; GCN-NEXT:    v_mov_b32_e32 v1, 0
5511; GCN-NEXT:    s_mov_b32 s7, 0xf000
5512; GCN-NEXT:    s_mov_b32 s6, -1
5513; GCN-NEXT:    s_waitcnt lgkmcnt(0)
5514; GCN-NEXT:    s_and_b32 s0, s0, s8
5515; GCN-NEXT:    s_and_b32 s1, s2, s8
5516; GCN-NEXT:    v_mov_b32_e32 v0, s0
5517; GCN-NEXT:    v_mov_b32_e32 v2, s1
5518; GCN-NEXT:    v_mov_b32_e32 v3, v1
5519; GCN-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
5520; GCN-NEXT:    s_endpgm
5521  %r = urem <2 x i64> %x, <i64 4096, i64 4096>
5522  store <2 x i64> %r, <2 x i64> addrspace(1)* %out
5523  ret void
5524}
5525
; Unsigned <2 x i64> remainder of %x by (splat 4096) << %y, stored to %out.
; Expected opt output: the vector urem is split into one scalar urem per lane.
; Expected llc output: per lane, 4096 is shifted left with s_lshl_b64,
; decremented by one (s_add_u32 / s_addc_u32 with -1) and ANDed with the
; numerator via s_and_b64.
5526define amdgpu_kernel void @urem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x, <2 x i64> %y) {
5527; CHECK-LABEL: @urem_v2i64_pow2_shl_denom(
5528; CHECK-NEXT:    [[SHL_Y:%.*]] = shl <2 x i64> <i64 4096, i64 4096>, [[Y:%.*]]
5529; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0
5530; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 0
5531; CHECK-NEXT:    [[TMP3:%.*]] = urem i64 [[TMP1]], [[TMP2]]
5532; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x i64> undef, i64 [[TMP3]], i64 0
5533; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[X]], i64 1
5534; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 1
5535; CHECK-NEXT:    [[TMP7:%.*]] = urem i64 [[TMP5]], [[TMP6]]
5536; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <2 x i64> [[TMP4]], i64 [[TMP7]], i64 1
5537; CHECK-NEXT:    store <2 x i64> [[TMP8]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16
5538; CHECK-NEXT:    ret void
5539;
5540; GCN-LABEL: urem_v2i64_pow2_shl_denom:
5541; GCN:       ; %bb.0:
5542; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
5543; GCN-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0xd
5544; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x11
5545; GCN-NEXT:    s_mov_b32 s13, 0
5546; GCN-NEXT:    s_movk_i32 s12, 0x1000
5547; GCN-NEXT:    s_mov_b32 s7, 0xf000
5548; GCN-NEXT:    s_mov_b32 s6, -1
5549; GCN-NEXT:    s_waitcnt lgkmcnt(0)
5550; GCN-NEXT:    s_lshl_b64 s[2:3], s[12:13], s2
5551; GCN-NEXT:    s_lshl_b64 s[0:1], s[12:13], s0
5552; GCN-NEXT:    s_add_u32 s0, s0, -1
5553; GCN-NEXT:    s_addc_u32 s1, s1, -1
5554; GCN-NEXT:    s_and_b64 s[0:1], s[8:9], s[0:1]
5555; GCN-NEXT:    s_add_u32 s2, s2, -1
5556; GCN-NEXT:    s_addc_u32 s3, s3, -1
5557; GCN-NEXT:    s_and_b64 s[2:3], s[10:11], s[2:3]
5558; GCN-NEXT:    v_mov_b32_e32 v0, s0
5559; GCN-NEXT:    v_mov_b32_e32 v1, s1
5560; GCN-NEXT:    v_mov_b32_e32 v2, s2
5561; GCN-NEXT:    v_mov_b32_e32 v3, s3
5562; GCN-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
5563; GCN-NEXT:    s_endpgm
5564  %shl.y = shl <2 x i64> <i64 4096, i64 4096>, %y
5565  %r = urem <2 x i64> %x, %shl.y
5566  store <2 x i64> %r, <2 x i64> addrspace(1)* %out
5567  ret void
5568}
5569
; Signed i64 division of %x by the odd constant 1235195 (0x12d8fb), stored to
; %out. Expected opt output: the sdiv is left unchanged. Expected llc output:
; an inline expansion seeded by v_rcp_f32 and refined with 32-bit
; v_mul_lo_u32 / v_mul_hi_u32 and carry chains. The sign of x is taken with
; s_ashr_i32 ..., 31, applied to x by add + xor before the expansion, and
; applied to the quotient by xor + subtract before the store.
5570define amdgpu_kernel void @sdiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
5571; CHECK-LABEL: @sdiv_i64_oddk_denom(
5572; CHECK-NEXT:    [[R:%.*]] = sdiv i64 [[X:%.*]], 1235195
5573; CHECK-NEXT:    store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4
5574; CHECK-NEXT:    ret void
5575;
5576; GCN-LABEL: sdiv_i64_oddk_denom:
5577; GCN:       ; %bb.0:
5578; GCN-NEXT:    v_mov_b32_e32 v0, 0x4f800000
5579; GCN-NEXT:    v_madak_f32 v0, 0, v0, 0x4996c7d8
5580; GCN-NEXT:    v_rcp_f32_e32 v0, v0
5581; GCN-NEXT:    s_mov_b32 s2, 0xffed2705
5582; GCN-NEXT:    v_mov_b32_e32 v8, 0
5583; GCN-NEXT:    v_mov_b32_e32 v7, 0
5584; GCN-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
5585; GCN-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
5586; GCN-NEXT:    v_trunc_f32_e32 v1, v1
5587; GCN-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
5588; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
5589; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
5590; GCN-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x9
5591; GCN-NEXT:    s_mov_b32 s7, 0xf000
5592; GCN-NEXT:    v_mul_hi_u32 v3, s2, v0
5593; GCN-NEXT:    v_mul_lo_u32 v2, v1, s2
5594; GCN-NEXT:    v_mul_lo_u32 v4, v0, s2
5595; GCN-NEXT:    s_mov_b32 s6, -1
5596; GCN-NEXT:    s_waitcnt lgkmcnt(0)
5597; GCN-NEXT:    s_mov_b32 s4, s8
5598; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
5599; GCN-NEXT:    v_subrev_i32_e32 v2, vcc, v0, v2
5600; GCN-NEXT:    v_mul_lo_u32 v5, v0, v2
5601; GCN-NEXT:    v_mul_hi_u32 v6, v0, v4
5602; GCN-NEXT:    v_mul_hi_u32 v3, v0, v2
5603; GCN-NEXT:    v_mul_hi_u32 v9, v1, v2
5604; GCN-NEXT:    v_mul_lo_u32 v2, v1, v2
5605; GCN-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
5606; GCN-NEXT:    v_mul_lo_u32 v6, v1, v4
5607; GCN-NEXT:    v_mul_hi_u32 v4, v1, v4
5608; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v8, v3, vcc
5609; GCN-NEXT:    s_mov_b32 s5, s9
5610; GCN-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
5611; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v3, v4, vcc
5612; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v9, v7, vcc
5613; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
5614; GCN-NEXT:    v_add_i32_e64 v0, s[0:1], v0, v2
5615; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v8, v4, vcc
5616; GCN-NEXT:    v_addc_u32_e64 v2, vcc, v1, v3, s[0:1]
5617; GCN-NEXT:    v_mul_lo_u32 v4, v2, s2
5618; GCN-NEXT:    v_mul_hi_u32 v5, s2, v0
5619; GCN-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
5620; GCN-NEXT:    v_mul_lo_u32 v5, v0, s2
5621; GCN-NEXT:    v_subrev_i32_e32 v4, vcc, v0, v4
5622; GCN-NEXT:    v_mul_lo_u32 v10, v0, v4
5623; GCN-NEXT:    v_mul_hi_u32 v12, v0, v4
5624; GCN-NEXT:    v_mul_hi_u32 v11, v0, v5
5625; GCN-NEXT:    v_mul_hi_u32 v9, v2, v5
5626; GCN-NEXT:    v_mul_lo_u32 v5, v2, v5
5627; GCN-NEXT:    v_mul_hi_u32 v6, v2, v4
5628; GCN-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
5629; GCN-NEXT:    v_addc_u32_e32 v11, vcc, v8, v12, vcc
5630; GCN-NEXT:    v_mul_lo_u32 v2, v2, v4
5631; GCN-NEXT:    v_add_i32_e32 v5, vcc, v10, v5
5632; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v11, v9, vcc
5633; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v6, v7, vcc
5634; GCN-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
5635; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v8, v4, vcc
5636; GCN-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
5637; GCN-NEXT:    s_ashr_i32 s2, s11, 31
5638; GCN-NEXT:    v_addc_u32_e64 v1, vcc, v1, v4, s[0:1]
5639; GCN-NEXT:    s_add_u32 s0, s10, s2
5640; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
5641; GCN-NEXT:    s_mov_b32 s3, s2
5642; GCN-NEXT:    s_addc_u32 s1, s11, s2
5643; GCN-NEXT:    s_xor_b64 s[0:1], s[0:1], s[2:3]
5644; GCN-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
5645; GCN-NEXT:    v_mul_lo_u32 v2, s0, v1
5646; GCN-NEXT:    v_mul_hi_u32 v3, s0, v0
5647; GCN-NEXT:    v_mul_hi_u32 v4, s0, v1
5648; GCN-NEXT:    v_mul_hi_u32 v5, s1, v1
5649; GCN-NEXT:    v_mul_lo_u32 v1, s1, v1
5650; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
5651; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v8, v4, vcc
5652; GCN-NEXT:    v_mul_lo_u32 v4, s1, v0
5653; GCN-NEXT:    v_mul_hi_u32 v0, s1, v0
5654; GCN-NEXT:    s_mov_b32 s3, 0x12d8fb
5655; GCN-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
5656; GCN-NEXT:    v_addc_u32_e32 v0, vcc, v3, v0, vcc
5657; GCN-NEXT:    v_addc_u32_e32 v2, vcc, v5, v7, vcc
5658; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
5659; GCN-NEXT:    v_addc_u32_e32 v1, vcc, v8, v2, vcc
5660; GCN-NEXT:    v_mul_lo_u32 v2, v1, s3
5661; GCN-NEXT:    v_mul_hi_u32 v3, s3, v0
5662; GCN-NEXT:    v_mul_lo_u32 v4, v0, s3
5663; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
5664; GCN-NEXT:    v_sub_i32_e32 v4, vcc, s0, v4
5665; GCN-NEXT:    v_mov_b32_e32 v3, s1
5666; GCN-NEXT:    v_subb_u32_e32 v2, vcc, v3, v2, vcc
5667; GCN-NEXT:    v_subrev_i32_e32 v3, vcc, s3, v4
5668; GCN-NEXT:    v_subbrev_u32_e32 v5, vcc, 0, v2, vcc
5669; GCN-NEXT:    s_mov_b32 s0, 0x12d8fa
5670; GCN-NEXT:    v_cmp_lt_u32_e32 vcc, s0, v3
5671; GCN-NEXT:    v_cndmask_b32_e64 v3, 0, -1, vcc
5672; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v5
5673; GCN-NEXT:    v_cndmask_b32_e32 v3, -1, v3, vcc
5674; GCN-NEXT:    v_add_i32_e32 v5, vcc, 2, v0
5675; GCN-NEXT:    v_addc_u32_e32 v6, vcc, 0, v1, vcc
5676; GCN-NEXT:    v_add_i32_e32 v7, vcc, 1, v0
5677; GCN-NEXT:    v_cmp_lt_u32_e64 s[0:1], s0, v4
5678; GCN-NEXT:    v_addc_u32_e32 v8, vcc, 0, v1, vcc
5679; GCN-NEXT:    v_cndmask_b32_e64 v4, 0, -1, s[0:1]
5680; GCN-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v2
5681; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
5682; GCN-NEXT:    v_cndmask_b32_e64 v2, -1, v4, s[0:1]
5683; GCN-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v2
5684; GCN-NEXT:    v_cndmask_b32_e32 v2, v7, v5, vcc
5685; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
5686; GCN-NEXT:    v_cndmask_b32_e32 v3, v8, v6, vcc
5687; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
5688; GCN-NEXT:    v_xor_b32_e32 v0, s2, v0
5689; GCN-NEXT:    v_xor_b32_e32 v1, s2, v1
5690; GCN-NEXT:    v_mov_b32_e32 v2, s2
5691; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, s2, v0
5692; GCN-NEXT:    v_subb_u32_e32 v1, vcc, v1, v2, vcc
5693; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
5694; GCN-NEXT:    s_endpgm
5695  %r = sdiv i64 %x, 1235195
5696  store i64 %r, i64 addrspace(1)* %out
5697  ret void
5698}
5699
; Signed i64 division of %x by the power-of-two constant 4096, stored to %out.
; Expected opt output: the sdiv is left unchanged. Expected llc output: no
; division expansion; the sign word of x (s_ashr_i32 ..., 31) is shifted right
; logically by 20 and added to x as a bias, then the 64-bit sum is shifted
; right arithmetically by 12 (s_ashr_i64).
5700define amdgpu_kernel void @sdiv_i64_pow2k_denom(i64 addrspace(1)* %out, i64 %x) {
5701; CHECK-LABEL: @sdiv_i64_pow2k_denom(
5702; CHECK-NEXT:    [[R:%.*]] = sdiv i64 [[X:%.*]], 4096
5703; CHECK-NEXT:    store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4
5704; CHECK-NEXT:    ret void
5705;
5706; GCN-LABEL: sdiv_i64_pow2k_denom:
5707; GCN:       ; %bb.0:
5708; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
5709; GCN-NEXT:    s_mov_b32 s7, 0xf000
5710; GCN-NEXT:    s_mov_b32 s6, -1
5711; GCN-NEXT:    s_waitcnt lgkmcnt(0)
5712; GCN-NEXT:    s_mov_b32 s4, s0
5713; GCN-NEXT:    s_ashr_i32 s0, s3, 31
5714; GCN-NEXT:    s_lshr_b32 s0, s0, 20
5715; GCN-NEXT:    s_add_u32 s0, s2, s0
5716; GCN-NEXT:    s_mov_b32 s5, s1
5717; GCN-NEXT:    s_addc_u32 s1, s3, 0
5718; GCN-NEXT:    s_ashr_i64 s[0:1], s[0:1], 12
5719; GCN-NEXT:    v_mov_b32_e32 v0, s0
5720; GCN-NEXT:    v_mov_b32_e32 v1, s1
5721; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
5722; GCN-NEXT:    s_endpgm
5723  %r = sdiv i64 %x, 4096
5724  store i64 %r, i64 addrspace(1)* %out
5725  ret void
5726}
5727
; Signed i64 division of %x by (4096 << %y), stored to %out.
; Expected opt output: the shl and sdiv are left unchanged. Expected llc
; output: a full inline division expansion, not a shift. The sign words of the
; divisor (s12) and of x (s14) are taken with s_ashr_i32 ..., 31 and applied
; to each operand by add + xor; the expansion is seeded by v_rcp_f32 and
; refined with 32-bit multiplies, and the quotient's sign is fixed up with the
; xor of the two sign words followed by a subtract.
5728define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %x, i64 %y) {
5729; CHECK-LABEL: @sdiv_i64_pow2_shl_denom(
5730; CHECK-NEXT:    [[SHL_Y:%.*]] = shl i64 4096, [[Y:%.*]]
5731; CHECK-NEXT:    [[R:%.*]] = sdiv i64 [[X:%.*]], [[SHL_Y]]
5732; CHECK-NEXT:    store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4
5733; CHECK-NEXT:    ret void
5734;
5735; GCN-LABEL: sdiv_i64_pow2_shl_denom:
5736; GCN:       ; %bb.0:
5737; GCN-NEXT:    s_load_dword s4, s[0:1], 0xd
5738; GCN-NEXT:    s_mov_b32 s3, 0
5739; GCN-NEXT:    s_movk_i32 s2, 0x1000
5740; GCN-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x9
5741; GCN-NEXT:    s_mov_b32 s7, 0xf000
5742; GCN-NEXT:    s_waitcnt lgkmcnt(0)
5743; GCN-NEXT:    s_lshl_b64 s[2:3], s[2:3], s4
5744; GCN-NEXT:    s_ashr_i32 s12, s3, 31
5745; GCN-NEXT:    s_add_u32 s2, s2, s12
5746; GCN-NEXT:    s_mov_b32 s13, s12
5747; GCN-NEXT:    s_addc_u32 s3, s3, s12
5748; GCN-NEXT:    s_xor_b64 s[2:3], s[2:3], s[12:13]
5749; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s2
5750; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s3
5751; GCN-NEXT:    s_sub_u32 s4, 0, s2
5752; GCN-NEXT:    s_subb_u32 s5, 0, s3
5753; GCN-NEXT:    s_ashr_i32 s14, s11, 31
5754; GCN-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v1
5755; GCN-NEXT:    v_rcp_f32_e32 v0, v0
5756; GCN-NEXT:    s_mov_b32 s15, s14
5757; GCN-NEXT:    s_mov_b32 s6, -1
5758; GCN-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
5759; GCN-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
5760; GCN-NEXT:    v_trunc_f32_e32 v1, v1
5761; GCN-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
5762; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
5763; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
5764; GCN-NEXT:    v_mul_hi_u32 v3, s4, v0
5765; GCN-NEXT:    v_mul_lo_u32 v2, s4, v1
5766; GCN-NEXT:    v_mul_lo_u32 v5, s5, v0
5767; GCN-NEXT:    v_mul_lo_u32 v4, s4, v0
5768; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
5769; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
5770; GCN-NEXT:    v_mul_hi_u32 v3, v0, v4
5771; GCN-NEXT:    v_mul_lo_u32 v5, v0, v2
5772; GCN-NEXT:    v_mul_hi_u32 v6, v0, v2
5773; GCN-NEXT:    v_mul_hi_u32 v7, v1, v2
5774; GCN-NEXT:    v_mul_lo_u32 v2, v1, v2
5775; GCN-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
5776; GCN-NEXT:    v_addc_u32_e32 v5, vcc, 0, v6, vcc
5777; GCN-NEXT:    v_mul_lo_u32 v6, v1, v4
5778; GCN-NEXT:    v_mul_hi_u32 v4, v1, v4
5779; GCN-NEXT:    v_add_i32_e32 v3, vcc, v6, v3
5780; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v5, v4, vcc
5781; GCN-NEXT:    v_mov_b32_e32 v4, 0
5782; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v7, v4, vcc
5783; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
5784; GCN-NEXT:    v_mov_b32_e32 v6, 0
5785; GCN-NEXT:    v_add_i32_e64 v0, s[0:1], v0, v2
5786; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v6, v5, vcc
5787; GCN-NEXT:    v_addc_u32_e64 v2, vcc, v1, v3, s[0:1]
5788; GCN-NEXT:    v_mul_lo_u32 v5, s4, v2
5789; GCN-NEXT:    v_mul_hi_u32 v7, s4, v0
5790; GCN-NEXT:    v_mul_lo_u32 v8, s5, v0
5791; GCN-NEXT:    s_mov_b32 s5, s9
5792; GCN-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
5793; GCN-NEXT:    v_mul_lo_u32 v7, s4, v0
5794; GCN-NEXT:    v_add_i32_e32 v5, vcc, v8, v5
5795; GCN-NEXT:    v_mul_lo_u32 v10, v0, v5
5796; GCN-NEXT:    v_mul_hi_u32 v12, v0, v5
5797; GCN-NEXT:    v_mul_hi_u32 v11, v0, v7
5798; GCN-NEXT:    v_mul_hi_u32 v9, v2, v7
5799; GCN-NEXT:    v_mul_lo_u32 v7, v2, v7
5800; GCN-NEXT:    v_mul_hi_u32 v8, v2, v5
5801; GCN-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
5802; GCN-NEXT:    v_addc_u32_e32 v11, vcc, 0, v12, vcc
5803; GCN-NEXT:    v_mul_lo_u32 v2, v2, v5
5804; GCN-NEXT:    v_add_i32_e32 v7, vcc, v10, v7
5805; GCN-NEXT:    v_addc_u32_e32 v7, vcc, v11, v9, vcc
5806; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v8, v4, vcc
5807; GCN-NEXT:    v_add_i32_e32 v2, vcc, v7, v2
5808; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v6, v5, vcc
5809; GCN-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
5810; GCN-NEXT:    v_addc_u32_e64 v1, vcc, v1, v5, s[0:1]
5811; GCN-NEXT:    s_add_u32 s0, s10, s14
5812; GCN-NEXT:    s_addc_u32 s1, s11, s14
5813; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
5814; GCN-NEXT:    s_xor_b64 s[10:11], s[0:1], s[14:15]
5815; GCN-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
5816; GCN-NEXT:    v_mul_lo_u32 v2, s10, v1
5817; GCN-NEXT:    v_mul_hi_u32 v3, s10, v0
5818; GCN-NEXT:    v_mul_hi_u32 v5, s10, v1
5819; GCN-NEXT:    v_mul_hi_u32 v7, s11, v1
5820; GCN-NEXT:    v_mul_lo_u32 v1, s11, v1
5821; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
5822; GCN-NEXT:    v_addc_u32_e32 v3, vcc, 0, v5, vcc
5823; GCN-NEXT:    v_mul_lo_u32 v5, s11, v0
5824; GCN-NEXT:    v_mul_hi_u32 v0, s11, v0
5825; GCN-NEXT:    s_mov_b32 s4, s8
5826; GCN-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
5827; GCN-NEXT:    v_addc_u32_e32 v0, vcc, v3, v0, vcc
5828; GCN-NEXT:    v_addc_u32_e32 v2, vcc, v7, v4, vcc
5829; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
5830; GCN-NEXT:    v_addc_u32_e32 v1, vcc, v6, v2, vcc
5831; GCN-NEXT:    v_mul_lo_u32 v2, s2, v1
5832; GCN-NEXT:    v_mul_hi_u32 v3, s2, v0
5833; GCN-NEXT:    v_mul_lo_u32 v4, s3, v0
5834; GCN-NEXT:    v_mov_b32_e32 v5, s3
5835; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
5836; GCN-NEXT:    v_mul_lo_u32 v3, s2, v0
5837; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
5838; GCN-NEXT:    v_sub_i32_e32 v4, vcc, s11, v2
5839; GCN-NEXT:    v_sub_i32_e32 v3, vcc, s10, v3
5840; GCN-NEXT:    v_subb_u32_e64 v4, s[0:1], v4, v5, vcc
5841; GCN-NEXT:    v_subrev_i32_e64 v5, s[0:1], s2, v3
5842; GCN-NEXT:    v_subbrev_u32_e64 v4, s[0:1], 0, v4, s[0:1]
5843; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s3, v4
5844; GCN-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[0:1]
5845; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s2, v5
5846; GCN-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s[0:1]
5847; GCN-NEXT:    v_cmp_eq_u32_e64 s[0:1], s3, v4
5848; GCN-NEXT:    v_cndmask_b32_e64 v4, v6, v5, s[0:1]
5849; GCN-NEXT:    v_add_i32_e64 v5, s[0:1], 2, v0
5850; GCN-NEXT:    v_addc_u32_e64 v6, s[0:1], 0, v1, s[0:1]
5851; GCN-NEXT:    v_add_i32_e64 v7, s[0:1], 1, v0
5852; GCN-NEXT:    v_addc_u32_e64 v8, s[0:1], 0, v1, s[0:1]
5853; GCN-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v4
5854; GCN-NEXT:    v_cndmask_b32_e64 v4, v8, v6, s[0:1]
5855; GCN-NEXT:    v_mov_b32_e32 v6, s11
5856; GCN-NEXT:    v_subb_u32_e32 v2, vcc, v6, v2, vcc
5857; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s3, v2
5858; GCN-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
5859; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s2, v3
5860; GCN-NEXT:    v_cndmask_b32_e64 v3, 0, -1, vcc
5861; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, s3, v2
5862; GCN-NEXT:    v_cndmask_b32_e32 v2, v6, v3, vcc
5863; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
5864; GCN-NEXT:    v_cndmask_b32_e64 v2, v7, v5, s[0:1]
5865; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
5866; GCN-NEXT:    s_xor_b64 s[0:1], s[14:15], s[12:13]
5867; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
5868; GCN-NEXT:    v_xor_b32_e32 v0, s0, v0
5869; GCN-NEXT:    v_xor_b32_e32 v1, s1, v1
5870; GCN-NEXT:    v_mov_b32_e32 v2, s1
5871; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, s0, v0
5872; GCN-NEXT:    v_subb_u32_e32 v1, vcc, v1, v2, vcc
5873; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
5874; GCN-NEXT:    s_endpgm
5875  %shl.y = shl i64 4096, %y
5876  %r = sdiv i64 %x, %shl.y
5877  store i64 %r, i64 addrspace(1)* %out
5878  ret void
5879}
5880
; Signed <2 x i64> division of %x by splat 4096, stored to %out.
; Expected opt output: the vector sdiv is split into one scalar sdiv per lane.
; Expected llc output: no division expansion; each lane adds a bias derived
; from its sign word (s_ashr_i32 ..., 31 then s_lshr_b32 ..., 20) and shifts
; the 64-bit sum right arithmetically by 12 (s_ashr_i64).
5881define amdgpu_kernel void @sdiv_v2i64_pow2k_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x) {
5882; CHECK-LABEL: @sdiv_v2i64_pow2k_denom(
5883; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0
5884; CHECK-NEXT:    [[TMP2:%.*]] = sdiv i64 [[TMP1]], 4096
5885; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i64 0
5886; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1
5887; CHECK-NEXT:    [[TMP5:%.*]] = sdiv i64 [[TMP4]], 4096
5888; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1
5889; CHECK-NEXT:    store <2 x i64> [[TMP6]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16
5890; CHECK-NEXT:    ret void
5891;
5892; GCN-LABEL: sdiv_v2i64_pow2k_denom:
5893; GCN:       ; %bb.0:
5894; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
5895; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xd
5896; GCN-NEXT:    s_mov_b32 s7, 0xf000
5897; GCN-NEXT:    s_mov_b32 s6, -1
5898; GCN-NEXT:    s_waitcnt lgkmcnt(0)
5899; GCN-NEXT:    s_ashr_i32 s8, s1, 31
5900; GCN-NEXT:    s_lshr_b32 s8, s8, 20
5901; GCN-NEXT:    s_add_u32 s0, s0, s8
5902; GCN-NEXT:    s_addc_u32 s1, s1, 0
5903; GCN-NEXT:    s_ashr_i32 s8, s3, 31
5904; GCN-NEXT:    s_ashr_i64 s[0:1], s[0:1], 12
5905; GCN-NEXT:    s_lshr_b32 s8, s8, 20
5906; GCN-NEXT:    s_add_u32 s2, s2, s8
5907; GCN-NEXT:    s_addc_u32 s3, s3, 0
5908; GCN-NEXT:    s_ashr_i64 s[2:3], s[2:3], 12
5909; GCN-NEXT:    v_mov_b32_e32 v0, s0
5910; GCN-NEXT:    v_mov_b32_e32 v1, s1
5911; GCN-NEXT:    v_mov_b32_e32 v2, s2
5912; GCN-NEXT:    v_mov_b32_e32 v3, s3
5913; GCN-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
5914; GCN-NEXT:    s_endpgm
5915  %r = sdiv <2 x i64> %x, <i64 4096, i64 4096>
5916  store <2 x i64> %r, <2 x i64> addrspace(1)* %out
5917  ret void
5918}
5919
; Signed division of a <2 x i64> by the constant vector <4096, 4095>:
; lane 0 divides by a power of two, lane 1 by a non-power-of-two constant.
; The opt-level checks expect the vector sdiv to be split into one scalar
; i64 sdiv per lane, with the results re-inserted into a <2 x i64>.
; The llc-level checks expect lane 0 to be computed with scalar shift
; arithmetic (ending in s_ashr_i64 ... 12) and lane 1 to go through the long
; v_rcp_f32-based expansion.
; NOTE(review): the name starts with "ssdiv"; this looks like a typo for
; "sdiv", but the label checks depend on the name - confirm before renaming.
5920define amdgpu_kernel void @ssdiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x) {
5921; CHECK-LABEL: @ssdiv_v2i64_mixed_pow2k_denom(
5922; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0
5923; CHECK-NEXT:    [[TMP2:%.*]] = sdiv i64 [[TMP1]], 4096
5924; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i64 0
5925; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1
5926; CHECK-NEXT:    [[TMP5:%.*]] = sdiv i64 [[TMP4]], 4095
5927; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1
5928; CHECK-NEXT:    store <2 x i64> [[TMP6]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16
5929; CHECK-NEXT:    ret void
5930;
5931; GCN-LABEL: ssdiv_v2i64_mixed_pow2k_denom:
5932; GCN:       ; %bb.0:
5933; GCN-NEXT:    v_mov_b32_e32 v0, 0x457ff000
5934; GCN-NEXT:    v_mov_b32_e32 v1, 0x4f800000
5935; GCN-NEXT:    v_mac_f32_e32 v0, 0, v1
5936; GCN-NEXT:    v_rcp_f32_e32 v0, v0
5937; GCN-NEXT:    s_movk_i32 s6, 0xf001
5938; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
5939; GCN-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0xd
5940; GCN-NEXT:    s_mov_b32 s7, 0xf000
5941; GCN-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
5942; GCN-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
5943; GCN-NEXT:    v_trunc_f32_e32 v1, v1
5944; GCN-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
5945; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
5946; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
5947; GCN-NEXT:    s_waitcnt lgkmcnt(0)
5948; GCN-NEXT:    s_ashr_i32 s0, s9, 31
5949; GCN-NEXT:    s_lshr_b32 s0, s0, 20
5950; GCN-NEXT:    v_mul_hi_u32 v2, s6, v0
5951; GCN-NEXT:    v_mul_lo_u32 v3, v1, s6
5952; GCN-NEXT:    s_add_u32 s2, s8, s0
5953; GCN-NEXT:    s_addc_u32 s3, s9, 0
5954; GCN-NEXT:    s_ashr_i32 s8, s11, 31
5955; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
5956; GCN-NEXT:    v_mul_lo_u32 v3, v0, s6
5957; GCN-NEXT:    v_subrev_i32_e32 v2, vcc, v0, v2
5958; GCN-NEXT:    v_mul_lo_u32 v4, v0, v2
5959; GCN-NEXT:    v_mul_hi_u32 v6, v0, v2
5960; GCN-NEXT:    v_mul_hi_u32 v5, v0, v3
5961; GCN-NEXT:    v_mul_hi_u32 v7, v1, v2
5962; GCN-NEXT:    v_mul_lo_u32 v2, v1, v2
5963; GCN-NEXT:    s_ashr_i64 s[2:3], s[2:3], 12
5964; GCN-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
5965; GCN-NEXT:    v_addc_u32_e32 v5, vcc, 0, v6, vcc
5966; GCN-NEXT:    v_mul_lo_u32 v6, v1, v3
5967; GCN-NEXT:    v_mul_hi_u32 v3, v1, v3
5968; GCN-NEXT:    s_mov_b32 s9, s8
5969; GCN-NEXT:    v_add_i32_e32 v4, vcc, v6, v4
5970; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v5, v3, vcc
5971; GCN-NEXT:    v_mov_b32_e32 v4, 0
5972; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v7, v4, vcc
5973; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
5974; GCN-NEXT:    v_mov_b32_e32 v6, 0
5975; GCN-NEXT:    v_add_i32_e64 v0, s[0:1], v0, v2
5976; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v6, v5, vcc
5977; GCN-NEXT:    v_addc_u32_e64 v2, vcc, v1, v3, s[0:1]
5978; GCN-NEXT:    v_mul_lo_u32 v5, v2, s6
5979; GCN-NEXT:    v_mul_hi_u32 v7, s6, v0
5980; GCN-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
5981; GCN-NEXT:    v_mul_lo_u32 v7, v0, s6
5982; GCN-NEXT:    v_subrev_i32_e32 v5, vcc, v0, v5
5983; GCN-NEXT:    v_mul_lo_u32 v10, v0, v5
5984; GCN-NEXT:    v_mul_hi_u32 v12, v0, v5
5985; GCN-NEXT:    v_mul_hi_u32 v11, v0, v7
5986; GCN-NEXT:    v_mul_hi_u32 v9, v2, v7
5987; GCN-NEXT:    v_mul_lo_u32 v7, v2, v7
5988; GCN-NEXT:    v_mul_hi_u32 v8, v2, v5
5989; GCN-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
5990; GCN-NEXT:    v_addc_u32_e32 v11, vcc, 0, v12, vcc
5991; GCN-NEXT:    v_mul_lo_u32 v2, v2, v5
5992; GCN-NEXT:    v_add_i32_e32 v7, vcc, v10, v7
5993; GCN-NEXT:    v_addc_u32_e32 v7, vcc, v11, v9, vcc
5994; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v8, v4, vcc
5995; GCN-NEXT:    v_add_i32_e32 v2, vcc, v7, v2
5996; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v6, v5, vcc
5997; GCN-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
5998; GCN-NEXT:    v_addc_u32_e64 v1, vcc, v1, v5, s[0:1]
5999; GCN-NEXT:    s_add_u32 s0, s10, s8
6000; GCN-NEXT:    s_addc_u32 s1, s11, s8
6001; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
6002; GCN-NEXT:    s_xor_b64 s[0:1], s[0:1], s[8:9]
6003; GCN-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
6004; GCN-NEXT:    v_mul_lo_u32 v2, s0, v1
6005; GCN-NEXT:    v_mul_hi_u32 v3, s0, v0
6006; GCN-NEXT:    v_mul_hi_u32 v5, s0, v1
6007; GCN-NEXT:    v_mul_hi_u32 v7, s1, v1
6008; GCN-NEXT:    v_mul_lo_u32 v1, s1, v1
6009; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
6010; GCN-NEXT:    v_addc_u32_e32 v3, vcc, 0, v5, vcc
6011; GCN-NEXT:    v_mul_lo_u32 v5, s1, v0
6012; GCN-NEXT:    v_mul_hi_u32 v0, s1, v0
6013; GCN-NEXT:    s_movk_i32 s9, 0xfff
6014; GCN-NEXT:    s_mov_b32 s6, -1
6015; GCN-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
6016; GCN-NEXT:    v_addc_u32_e32 v0, vcc, v3, v0, vcc
6017; GCN-NEXT:    v_addc_u32_e32 v2, vcc, v7, v4, vcc
6018; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
6019; GCN-NEXT:    v_addc_u32_e32 v1, vcc, v6, v2, vcc
6020; GCN-NEXT:    v_mul_lo_u32 v2, v1, s9
6021; GCN-NEXT:    v_mul_hi_u32 v3, s9, v0
6022; GCN-NEXT:    v_mul_lo_u32 v4, v0, s9
6023; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
6024; GCN-NEXT:    v_sub_i32_e32 v4, vcc, s0, v4
6025; GCN-NEXT:    v_mov_b32_e32 v3, s1
6026; GCN-NEXT:    v_subb_u32_e32 v2, vcc, v3, v2, vcc
6027; GCN-NEXT:    v_subrev_i32_e32 v3, vcc, s9, v4
6028; GCN-NEXT:    v_subbrev_u32_e32 v5, vcc, 0, v2, vcc
6029; GCN-NEXT:    s_movk_i32 s0, 0xffe
6030; GCN-NEXT:    v_cmp_lt_u32_e32 vcc, s0, v3
6031; GCN-NEXT:    v_cndmask_b32_e64 v3, 0, -1, vcc
6032; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v5
6033; GCN-NEXT:    v_cndmask_b32_e32 v3, -1, v3, vcc
6034; GCN-NEXT:    v_add_i32_e32 v5, vcc, 2, v0
6035; GCN-NEXT:    v_addc_u32_e32 v6, vcc, 0, v1, vcc
6036; GCN-NEXT:    v_add_i32_e32 v7, vcc, 1, v0
6037; GCN-NEXT:    v_cmp_lt_u32_e64 s[0:1], s0, v4
6038; GCN-NEXT:    v_addc_u32_e32 v8, vcc, 0, v1, vcc
6039; GCN-NEXT:    v_cndmask_b32_e64 v4, 0, -1, s[0:1]
6040; GCN-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v2
6041; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
6042; GCN-NEXT:    v_cndmask_b32_e64 v2, -1, v4, s[0:1]
6043; GCN-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v2
6044; GCN-NEXT:    v_cndmask_b32_e32 v2, v7, v5, vcc
6045; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
6046; GCN-NEXT:    v_cndmask_b32_e32 v3, v8, v6, vcc
6047; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
6048; GCN-NEXT:    v_xor_b32_e32 v0, s8, v0
6049; GCN-NEXT:    v_subrev_i32_e32 v2, vcc, s8, v0
6050; GCN-NEXT:    v_xor_b32_e32 v1, s8, v1
6051; GCN-NEXT:    v_mov_b32_e32 v3, s8
6052; GCN-NEXT:    v_subb_u32_e32 v3, vcc, v1, v3, vcc
6053; GCN-NEXT:    v_mov_b32_e32 v0, s2
6054; GCN-NEXT:    v_mov_b32_e32 v1, s3
6055; GCN-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
6056; GCN-NEXT:    s_endpgm
6057  %r = sdiv <2 x i64> %x, <i64 4096, i64 4095>
6058  store <2 x i64> %r, <2 x i64> addrspace(1)* %out
6059  ret void
6060}
6061
; Signed division of a <2 x i64> by (<4096, 4096> << %y), i.e. a divisor that
; is only known at run time.
; The opt-level checks expect the shl to stay a vector operation while the
; sdiv is split into one scalar i64 sdiv per lane and re-assembled with
; insertelement.
; The llc-level checks expect each lane to go through its own
; v_rcp_f32-based expansion (two such sequences appear in the output).
6062define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x, <2 x i64> %y) {
6063; CHECK-LABEL: @sdiv_v2i64_pow2_shl_denom(
6064; CHECK-NEXT:    [[SHL_Y:%.*]] = shl <2 x i64> <i64 4096, i64 4096>, [[Y:%.*]]
6065; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0
6066; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 0
6067; CHECK-NEXT:    [[TMP3:%.*]] = sdiv i64 [[TMP1]], [[TMP2]]
6068; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x i64> undef, i64 [[TMP3]], i64 0
6069; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[X]], i64 1
6070; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 1
6071; CHECK-NEXT:    [[TMP7:%.*]] = sdiv i64 [[TMP5]], [[TMP6]]
6072; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <2 x i64> [[TMP4]], i64 [[TMP7]], i64 1
6073; CHECK-NEXT:    store <2 x i64> [[TMP8]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16
6074; CHECK-NEXT:    ret void
6075;
6076; GCN-LABEL: sdiv_v2i64_pow2_shl_denom:
6077; GCN:       ; %bb.0:
6078; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x11
6079; GCN-NEXT:    s_mov_b32 s3, 0
6080; GCN-NEXT:    s_movk_i32 s2, 0x1000
6081; GCN-NEXT:    s_mov_b32 s18, 0x4f800000
6082; GCN-NEXT:    s_mov_b32 s19, 0x5f7ffffc
6083; GCN-NEXT:    s_waitcnt lgkmcnt(0)
6084; GCN-NEXT:    s_lshl_b64 s[12:13], s[2:3], s6
6085; GCN-NEXT:    s_lshl_b64 s[2:3], s[2:3], s4
6086; GCN-NEXT:    s_ashr_i32 s16, s3, 31
6087; GCN-NEXT:    s_add_u32 s2, s2, s16
6088; GCN-NEXT:    s_mov_b32 s17, s16
6089; GCN-NEXT:    s_addc_u32 s3, s3, s16
6090; GCN-NEXT:    s_xor_b64 s[14:15], s[2:3], s[16:17]
6091; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s14
6092; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s15
6093; GCN-NEXT:    s_mov_b32 s20, 0x2f800000
6094; GCN-NEXT:    s_mov_b32 s21, 0xcf800000
6095; GCN-NEXT:    s_sub_u32 s6, 0, s14
6096; GCN-NEXT:    v_mac_f32_e32 v0, s18, v1
6097; GCN-NEXT:    v_rcp_f32_e32 v0, v0
6098; GCN-NEXT:    s_subb_u32 s7, 0, s15
6099; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
6100; GCN-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0xd
6101; GCN-NEXT:    v_mul_f32_e32 v0, s19, v0
6102; GCN-NEXT:    v_mul_f32_e32 v1, s20, v0
6103; GCN-NEXT:    v_trunc_f32_e32 v1, v1
6104; GCN-NEXT:    v_mac_f32_e32 v0, s21, v1
6105; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
6106; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
6107; GCN-NEXT:    v_mul_hi_u32 v3, s6, v0
6108; GCN-NEXT:    v_mul_lo_u32 v2, s6, v1
6109; GCN-NEXT:    v_mul_lo_u32 v4, s7, v0
6110; GCN-NEXT:    v_mul_lo_u32 v5, s6, v0
6111; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
6112; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
6113; GCN-NEXT:    v_mul_lo_u32 v3, v0, v2
6114; GCN-NEXT:    v_mul_hi_u32 v4, v0, v5
6115; GCN-NEXT:    v_mul_hi_u32 v6, v0, v2
6116; GCN-NEXT:    v_mul_hi_u32 v7, v1, v2
6117; GCN-NEXT:    v_mul_lo_u32 v2, v1, v2
6118; GCN-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
6119; GCN-NEXT:    v_addc_u32_e32 v4, vcc, 0, v6, vcc
6120; GCN-NEXT:    v_mul_lo_u32 v6, v1, v5
6121; GCN-NEXT:    v_mul_hi_u32 v5, v1, v5
6122; GCN-NEXT:    v_add_i32_e32 v3, vcc, v6, v3
6123; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v4, v5, vcc
6124; GCN-NEXT:    v_mov_b32_e32 v4, 0
6125; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v7, v4, vcc
6126; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
6127; GCN-NEXT:    v_mov_b32_e32 v6, 0
6128; GCN-NEXT:    v_add_i32_e64 v0, s[2:3], v0, v2
6129; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v6, v5, vcc
6130; GCN-NEXT:    v_addc_u32_e64 v2, vcc, v1, v3, s[2:3]
6131; GCN-NEXT:    v_mul_lo_u32 v5, s6, v2
6132; GCN-NEXT:    v_mul_hi_u32 v7, s6, v0
6133; GCN-NEXT:    v_mul_lo_u32 v8, s7, v0
6134; GCN-NEXT:    s_mov_b32 s7, 0xf000
6135; GCN-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
6136; GCN-NEXT:    v_mul_lo_u32 v7, s6, v0
6137; GCN-NEXT:    v_add_i32_e32 v5, vcc, v8, v5
6138; GCN-NEXT:    v_mul_lo_u32 v10, v0, v5
6139; GCN-NEXT:    v_mul_hi_u32 v12, v0, v5
6140; GCN-NEXT:    v_mul_hi_u32 v11, v0, v7
6141; GCN-NEXT:    v_mul_hi_u32 v9, v2, v7
6142; GCN-NEXT:    v_mul_lo_u32 v7, v2, v7
6143; GCN-NEXT:    v_mul_hi_u32 v8, v2, v5
6144; GCN-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
6145; GCN-NEXT:    v_addc_u32_e32 v11, vcc, 0, v12, vcc
6146; GCN-NEXT:    v_mul_lo_u32 v2, v2, v5
6147; GCN-NEXT:    v_add_i32_e32 v7, vcc, v10, v7
6148; GCN-NEXT:    v_addc_u32_e32 v7, vcc, v11, v9, vcc
6149; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v8, v4, vcc
6150; GCN-NEXT:    v_add_i32_e32 v2, vcc, v7, v2
6151; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v6, v5, vcc
6152; GCN-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
6153; GCN-NEXT:    v_addc_u32_e64 v1, vcc, v1, v5, s[2:3]
6154; GCN-NEXT:    s_waitcnt lgkmcnt(0)
6155; GCN-NEXT:    s_ashr_i32 s2, s9, 31
6156; GCN-NEXT:    s_add_u32 s0, s8, s2
6157; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
6158; GCN-NEXT:    s_mov_b32 s3, s2
6159; GCN-NEXT:    s_addc_u32 s1, s9, s2
6160; GCN-NEXT:    s_xor_b64 s[8:9], s[0:1], s[2:3]
6161; GCN-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
6162; GCN-NEXT:    v_mul_lo_u32 v2, s8, v1
6163; GCN-NEXT:    v_mul_hi_u32 v3, s8, v0
6164; GCN-NEXT:    v_mul_hi_u32 v5, s8, v1
6165; GCN-NEXT:    v_mul_hi_u32 v7, s9, v1
6166; GCN-NEXT:    v_mul_lo_u32 v1, s9, v1
6167; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
6168; GCN-NEXT:    v_addc_u32_e32 v3, vcc, 0, v5, vcc
6169; GCN-NEXT:    v_mul_lo_u32 v5, s9, v0
6170; GCN-NEXT:    v_mul_hi_u32 v0, s9, v0
6171; GCN-NEXT:    s_xor_b64 s[2:3], s[2:3], s[16:17]
6172; GCN-NEXT:    s_mov_b32 s6, -1
6173; GCN-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
6174; GCN-NEXT:    v_addc_u32_e32 v0, vcc, v3, v0, vcc
6175; GCN-NEXT:    v_addc_u32_e32 v2, vcc, v7, v4, vcc
6176; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
6177; GCN-NEXT:    v_addc_u32_e32 v1, vcc, v6, v2, vcc
6178; GCN-NEXT:    v_mul_lo_u32 v2, s14, v1
6179; GCN-NEXT:    v_mul_hi_u32 v3, s14, v0
6180; GCN-NEXT:    v_mul_lo_u32 v5, s15, v0
6181; GCN-NEXT:    v_mov_b32_e32 v7, s15
6182; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
6183; GCN-NEXT:    v_mul_lo_u32 v3, s14, v0
6184; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
6185; GCN-NEXT:    v_sub_i32_e32 v5, vcc, s9, v2
6186; GCN-NEXT:    v_sub_i32_e32 v3, vcc, s8, v3
6187; GCN-NEXT:    v_subb_u32_e64 v5, s[0:1], v5, v7, vcc
6188; GCN-NEXT:    v_subrev_i32_e64 v7, s[0:1], s14, v3
6189; GCN-NEXT:    v_subbrev_u32_e64 v5, s[0:1], 0, v5, s[0:1]
6190; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s15, v5
6191; GCN-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[0:1]
6192; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s14, v7
6193; GCN-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[0:1]
6194; GCN-NEXT:    v_cmp_eq_u32_e64 s[0:1], s15, v5
6195; GCN-NEXT:    v_cndmask_b32_e64 v5, v8, v7, s[0:1]
6196; GCN-NEXT:    v_add_i32_e64 v7, s[0:1], 2, v0
6197; GCN-NEXT:    v_addc_u32_e64 v8, s[0:1], 0, v1, s[0:1]
6198; GCN-NEXT:    v_add_i32_e64 v9, s[0:1], 1, v0
6199; GCN-NEXT:    v_addc_u32_e64 v10, s[0:1], 0, v1, s[0:1]
6200; GCN-NEXT:    s_ashr_i32 s8, s13, 31
6201; GCN-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v5
6202; GCN-NEXT:    s_add_u32 s12, s12, s8
6203; GCN-NEXT:    v_cndmask_b32_e64 v5, v10, v8, s[0:1]
6204; GCN-NEXT:    v_mov_b32_e32 v8, s9
6205; GCN-NEXT:    s_mov_b32 s9, s8
6206; GCN-NEXT:    s_addc_u32 s13, s13, s8
6207; GCN-NEXT:    s_xor_b64 s[12:13], s[12:13], s[8:9]
6208; GCN-NEXT:    v_cvt_f32_u32_e32 v10, s12
6209; GCN-NEXT:    v_cvt_f32_u32_e32 v11, s13
6210; GCN-NEXT:    v_subb_u32_e32 v2, vcc, v8, v2, vcc
6211; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s15, v2
6212; GCN-NEXT:    v_cndmask_b32_e64 v8, 0, -1, vcc
6213; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s14, v3
6214; GCN-NEXT:    v_cndmask_b32_e64 v3, 0, -1, vcc
6215; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, s15, v2
6216; GCN-NEXT:    v_mac_f32_e32 v10, s18, v11
6217; GCN-NEXT:    v_cndmask_b32_e32 v2, v8, v3, vcc
6218; GCN-NEXT:    v_rcp_f32_e32 v3, v10
6219; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
6220; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
6221; GCN-NEXT:    s_sub_u32 s14, 0, s12
6222; GCN-NEXT:    v_mul_f32_e32 v3, s19, v3
6223; GCN-NEXT:    v_mul_f32_e32 v5, s20, v3
6224; GCN-NEXT:    v_trunc_f32_e32 v5, v5
6225; GCN-NEXT:    v_mac_f32_e32 v3, s21, v5
6226; GCN-NEXT:    v_cvt_u32_f32_e32 v3, v3
6227; GCN-NEXT:    v_cvt_u32_f32_e32 v5, v5
6228; GCN-NEXT:    v_cndmask_b32_e64 v2, v9, v7, s[0:1]
6229; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
6230; GCN-NEXT:    v_mul_hi_u32 v2, s14, v3
6231; GCN-NEXT:    v_mul_lo_u32 v7, s14, v5
6232; GCN-NEXT:    s_subb_u32 s15, 0, s13
6233; GCN-NEXT:    v_mul_lo_u32 v8, s15, v3
6234; GCN-NEXT:    v_xor_b32_e32 v0, s2, v0
6235; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v7
6236; GCN-NEXT:    v_mul_lo_u32 v7, s14, v3
6237; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v8
6238; GCN-NEXT:    v_mul_lo_u32 v8, v3, v2
6239; GCN-NEXT:    v_mul_hi_u32 v10, v3, v2
6240; GCN-NEXT:    v_mul_hi_u32 v9, v3, v7
6241; GCN-NEXT:    v_mul_hi_u32 v11, v5, v2
6242; GCN-NEXT:    v_mul_lo_u32 v2, v5, v2
6243; GCN-NEXT:    v_xor_b32_e32 v1, s3, v1
6244; GCN-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
6245; GCN-NEXT:    v_addc_u32_e32 v9, vcc, 0, v10, vcc
6246; GCN-NEXT:    v_mul_lo_u32 v10, v5, v7
6247; GCN-NEXT:    v_mul_hi_u32 v7, v5, v7
6248; GCN-NEXT:    v_add_i32_e32 v8, vcc, v10, v8
6249; GCN-NEXT:    v_addc_u32_e32 v7, vcc, v9, v7, vcc
6250; GCN-NEXT:    v_addc_u32_e32 v8, vcc, v11, v4, vcc
6251; GCN-NEXT:    v_add_i32_e32 v2, vcc, v7, v2
6252; GCN-NEXT:    v_add_i32_e64 v2, s[0:1], v3, v2
6253; GCN-NEXT:    v_addc_u32_e32 v7, vcc, v6, v8, vcc
6254; GCN-NEXT:    v_addc_u32_e64 v3, vcc, v5, v7, s[0:1]
6255; GCN-NEXT:    v_mul_lo_u32 v8, s14, v3
6256; GCN-NEXT:    v_mul_hi_u32 v9, s14, v2
6257; GCN-NEXT:    v_mul_lo_u32 v10, s15, v2
6258; GCN-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
6259; GCN-NEXT:    v_mul_lo_u32 v9, s14, v2
6260; GCN-NEXT:    v_add_i32_e32 v8, vcc, v10, v8
6261; GCN-NEXT:    v_mul_lo_u32 v12, v2, v8
6262; GCN-NEXT:    v_mul_hi_u32 v14, v2, v8
6263; GCN-NEXT:    v_mul_hi_u32 v13, v2, v9
6264; GCN-NEXT:    v_mul_hi_u32 v11, v3, v9
6265; GCN-NEXT:    v_mul_lo_u32 v9, v3, v9
6266; GCN-NEXT:    v_mul_hi_u32 v10, v3, v8
6267; GCN-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
6268; GCN-NEXT:    v_addc_u32_e32 v13, vcc, 0, v14, vcc
6269; GCN-NEXT:    v_mul_lo_u32 v3, v3, v8
6270; GCN-NEXT:    v_add_i32_e32 v9, vcc, v12, v9
6271; GCN-NEXT:    v_addc_u32_e32 v9, vcc, v13, v11, vcc
6272; GCN-NEXT:    v_addc_u32_e32 v8, vcc, v10, v4, vcc
6273; GCN-NEXT:    v_add_i32_e32 v3, vcc, v9, v3
6274; GCN-NEXT:    v_addc_u32_e32 v8, vcc, v6, v8, vcc
6275; GCN-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
6276; GCN-NEXT:    s_ashr_i32 s14, s11, 31
6277; GCN-NEXT:    v_addc_u32_e64 v5, vcc, v5, v8, s[0:1]
6278; GCN-NEXT:    s_add_u32 s0, s10, s14
6279; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
6280; GCN-NEXT:    s_mov_b32 s15, s14
6281; GCN-NEXT:    s_addc_u32 s1, s11, s14
6282; GCN-NEXT:    s_xor_b64 s[10:11], s[0:1], s[14:15]
6283; GCN-NEXT:    v_addc_u32_e32 v3, vcc, 0, v5, vcc
6284; GCN-NEXT:    v_mul_lo_u32 v5, s10, v3
6285; GCN-NEXT:    v_mul_hi_u32 v7, s10, v2
6286; GCN-NEXT:    v_mul_hi_u32 v9, s10, v3
6287; GCN-NEXT:    v_mul_hi_u32 v10, s11, v3
6288; GCN-NEXT:    v_mul_lo_u32 v3, s11, v3
6289; GCN-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
6290; GCN-NEXT:    v_addc_u32_e32 v7, vcc, 0, v9, vcc
6291; GCN-NEXT:    v_mul_lo_u32 v9, s11, v2
6292; GCN-NEXT:    v_mul_hi_u32 v2, s11, v2
6293; GCN-NEXT:    v_mov_b32_e32 v8, s3
6294; GCN-NEXT:    v_add_i32_e32 v5, vcc, v9, v5
6295; GCN-NEXT:    v_addc_u32_e32 v2, vcc, v7, v2, vcc
6296; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v10, v4, vcc
6297; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
6298; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v6, v4, vcc
6299; GCN-NEXT:    v_mul_lo_u32 v4, s12, v3
6300; GCN-NEXT:    v_mul_hi_u32 v5, s12, v2
6301; GCN-NEXT:    v_mul_lo_u32 v6, s13, v2
6302; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, s2, v0
6303; GCN-NEXT:    v_subb_u32_e32 v1, vcc, v1, v8, vcc
6304; GCN-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
6305; GCN-NEXT:    v_mul_lo_u32 v5, s12, v2
6306; GCN-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
6307; GCN-NEXT:    v_sub_i32_e32 v6, vcc, s11, v4
6308; GCN-NEXT:    v_mov_b32_e32 v7, s13
6309; GCN-NEXT:    v_sub_i32_e32 v5, vcc, s10, v5
6310; GCN-NEXT:    v_subb_u32_e64 v6, s[0:1], v6, v7, vcc
6311; GCN-NEXT:    v_subrev_i32_e64 v7, s[0:1], s12, v5
6312; GCN-NEXT:    v_subbrev_u32_e64 v6, s[0:1], 0, v6, s[0:1]
6313; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s13, v6
6314; GCN-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[0:1]
6315; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s12, v7
6316; GCN-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[0:1]
6317; GCN-NEXT:    v_cmp_eq_u32_e64 s[0:1], s13, v6
6318; GCN-NEXT:    v_cndmask_b32_e64 v6, v8, v7, s[0:1]
6319; GCN-NEXT:    v_add_i32_e64 v7, s[0:1], 2, v2
6320; GCN-NEXT:    v_addc_u32_e64 v8, s[0:1], 0, v3, s[0:1]
6321; GCN-NEXT:    v_add_i32_e64 v9, s[0:1], 1, v2
6322; GCN-NEXT:    v_addc_u32_e64 v10, s[0:1], 0, v3, s[0:1]
6323; GCN-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v6
6324; GCN-NEXT:    v_cndmask_b32_e64 v6, v10, v8, s[0:1]
6325; GCN-NEXT:    v_mov_b32_e32 v8, s11
6326; GCN-NEXT:    v_subb_u32_e32 v4, vcc, v8, v4, vcc
6327; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s13, v4
6328; GCN-NEXT:    v_cndmask_b32_e64 v8, 0, -1, vcc
6329; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s12, v5
6330; GCN-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
6331; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, s13, v4
6332; GCN-NEXT:    v_cndmask_b32_e32 v4, v8, v5, vcc
6333; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
6334; GCN-NEXT:    v_cndmask_b32_e64 v4, v9, v7, s[0:1]
6335; GCN-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
6336; GCN-NEXT:    s_xor_b64 s[0:1], s[14:15], s[8:9]
6337; GCN-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc
6338; GCN-NEXT:    v_xor_b32_e32 v2, s0, v2
6339; GCN-NEXT:    v_xor_b32_e32 v3, s1, v3
6340; GCN-NEXT:    v_mov_b32_e32 v4, s1
6341; GCN-NEXT:    v_subrev_i32_e32 v2, vcc, s0, v2
6342; GCN-NEXT:    v_subb_u32_e32 v3, vcc, v3, v4, vcc
6343; GCN-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
6344; GCN-NEXT:    s_endpgm
6345  %shl.y = shl <2 x i64> <i64 4096, i64 4096>, %y
6346  %r = sdiv <2 x i64> %x, %shl.y
6347  store <2 x i64> %r, <2 x i64> addrspace(1)* %out
6348  ret void
6349}
6350
; Signed remainder of an i64 by the odd constant 1235195 (0x12d8fb).
; The opt-level checks expect the srem to come out of the pass unchanged.
; The llc-level checks expect a v_rcp_f32-based expansion in which the
; divisor appears as immediates (0x12d8fb, and 0x12d8fa for the compares),
; followed by xor/sub fixups using the sign of %x (s_ashr_i32 ... 31).
6351define amdgpu_kernel void @srem_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
6352; CHECK-LABEL: @srem_i64_oddk_denom(
6353; CHECK-NEXT:    [[R:%.*]] = srem i64 [[X:%.*]], 1235195
6354; CHECK-NEXT:    store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4
6355; CHECK-NEXT:    ret void
6356;
6357; GCN-LABEL: srem_i64_oddk_denom:
6358; GCN:       ; %bb.0:
6359; GCN-NEXT:    v_mov_b32_e32 v0, 0x4f800000
6360; GCN-NEXT:    v_madak_f32 v0, 0, v0, 0x4996c7d8
6361; GCN-NEXT:    v_rcp_f32_e32 v0, v0
6362; GCN-NEXT:    s_mov_b32 s2, 0xffed2705
6363; GCN-NEXT:    v_mov_b32_e32 v8, 0
6364; GCN-NEXT:    v_mov_b32_e32 v7, 0
6365; GCN-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
6366; GCN-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
6367; GCN-NEXT:    v_trunc_f32_e32 v1, v1
6368; GCN-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
6369; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
6370; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
6371; GCN-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x9
6372; GCN-NEXT:    s_mov_b32 s7, 0xf000
6373; GCN-NEXT:    v_mul_hi_u32 v3, s2, v0
6374; GCN-NEXT:    v_mul_lo_u32 v2, v1, s2
6375; GCN-NEXT:    v_mul_lo_u32 v4, v0, s2
6376; GCN-NEXT:    s_mov_b32 s6, -1
6377; GCN-NEXT:    s_waitcnt lgkmcnt(0)
6378; GCN-NEXT:    s_mov_b32 s4, s8
6379; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
6380; GCN-NEXT:    v_subrev_i32_e32 v2, vcc, v0, v2
6381; GCN-NEXT:    v_mul_lo_u32 v5, v0, v2
6382; GCN-NEXT:    v_mul_hi_u32 v6, v0, v4
6383; GCN-NEXT:    v_mul_hi_u32 v3, v0, v2
6384; GCN-NEXT:    v_mul_hi_u32 v9, v1, v2
6385; GCN-NEXT:    v_mul_lo_u32 v2, v1, v2
6386; GCN-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
6387; GCN-NEXT:    v_mul_lo_u32 v6, v1, v4
6388; GCN-NEXT:    v_mul_hi_u32 v4, v1, v4
6389; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v8, v3, vcc
6390; GCN-NEXT:    s_mov_b32 s5, s9
6391; GCN-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
6392; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v3, v4, vcc
6393; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v9, v7, vcc
6394; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
6395; GCN-NEXT:    v_add_i32_e64 v0, s[0:1], v0, v2
6396; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v8, v4, vcc
6397; GCN-NEXT:    v_addc_u32_e64 v2, vcc, v1, v3, s[0:1]
6398; GCN-NEXT:    v_mul_lo_u32 v4, v2, s2
6399; GCN-NEXT:    v_mul_hi_u32 v5, s2, v0
6400; GCN-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
6401; GCN-NEXT:    v_mul_lo_u32 v5, v0, s2
6402; GCN-NEXT:    v_subrev_i32_e32 v4, vcc, v0, v4
6403; GCN-NEXT:    v_mul_lo_u32 v10, v0, v4
6404; GCN-NEXT:    v_mul_hi_u32 v12, v0, v4
6405; GCN-NEXT:    v_mul_hi_u32 v11, v0, v5
6406; GCN-NEXT:    v_mul_hi_u32 v9, v2, v5
6407; GCN-NEXT:    v_mul_lo_u32 v5, v2, v5
6408; GCN-NEXT:    v_mul_hi_u32 v6, v2, v4
6409; GCN-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
6410; GCN-NEXT:    v_addc_u32_e32 v11, vcc, v8, v12, vcc
6411; GCN-NEXT:    v_mul_lo_u32 v2, v2, v4
6412; GCN-NEXT:    v_add_i32_e32 v5, vcc, v10, v5
6413; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v11, v9, vcc
6414; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v6, v7, vcc
6415; GCN-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
6416; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v8, v4, vcc
6417; GCN-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
6418; GCN-NEXT:    s_ashr_i32 s2, s11, 31
6419; GCN-NEXT:    v_addc_u32_e64 v1, vcc, v1, v4, s[0:1]
6420; GCN-NEXT:    s_add_u32 s0, s10, s2
6421; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
6422; GCN-NEXT:    s_mov_b32 s3, s2
6423; GCN-NEXT:    s_addc_u32 s1, s11, s2
6424; GCN-NEXT:    s_xor_b64 s[0:1], s[0:1], s[2:3]
6425; GCN-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
6426; GCN-NEXT:    v_mul_lo_u32 v2, s0, v1
6427; GCN-NEXT:    v_mul_hi_u32 v3, s0, v0
6428; GCN-NEXT:    v_mul_hi_u32 v4, s0, v1
6429; GCN-NEXT:    v_mul_hi_u32 v5, s1, v1
6430; GCN-NEXT:    v_mul_lo_u32 v1, s1, v1
6431; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
6432; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v8, v4, vcc
6433; GCN-NEXT:    v_mul_lo_u32 v4, s1, v0
6434; GCN-NEXT:    v_mul_hi_u32 v0, s1, v0
6435; GCN-NEXT:    s_mov_b32 s3, 0x12d8fb
6436; GCN-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
6437; GCN-NEXT:    v_addc_u32_e32 v0, vcc, v3, v0, vcc
6438; GCN-NEXT:    v_addc_u32_e32 v2, vcc, v5, v7, vcc
6439; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
6440; GCN-NEXT:    v_addc_u32_e32 v1, vcc, v8, v2, vcc
6441; GCN-NEXT:    v_mul_hi_u32 v2, s3, v0
6442; GCN-NEXT:    v_mul_lo_u32 v1, v1, s3
6443; GCN-NEXT:    v_mul_lo_u32 v0, v0, s3
6444; GCN-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
6445; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
6446; GCN-NEXT:    v_mov_b32_e32 v2, s1
6447; GCN-NEXT:    v_subb_u32_e32 v1, vcc, v2, v1, vcc
6448; GCN-NEXT:    v_subrev_i32_e32 v2, vcc, s3, v0
6449; GCN-NEXT:    v_subbrev_u32_e32 v3, vcc, 0, v1, vcc
6450; GCN-NEXT:    v_subrev_i32_e32 v4, vcc, s3, v2
6451; GCN-NEXT:    v_subbrev_u32_e32 v5, vcc, 0, v3, vcc
6452; GCN-NEXT:    s_mov_b32 s0, 0x12d8fa
6453; GCN-NEXT:    v_cmp_lt_u32_e32 vcc, s0, v2
6454; GCN-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
6455; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
6456; GCN-NEXT:    v_cndmask_b32_e32 v6, -1, v6, vcc
6457; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
6458; GCN-NEXT:    v_cmp_lt_u32_e64 s[0:1], s0, v0
6459; GCN-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
6460; GCN-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s[0:1]
6461; GCN-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v1
6462; GCN-NEXT:    v_cndmask_b32_e64 v5, -1, v5, s[0:1]
6463; GCN-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v5
6464; GCN-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
6465; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
6466; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
6467; GCN-NEXT:    v_xor_b32_e32 v0, s2, v0
6468; GCN-NEXT:    v_xor_b32_e32 v1, s2, v1
6469; GCN-NEXT:    v_mov_b32_e32 v2, s2
6470; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, s2, v0
6471; GCN-NEXT:    v_subb_u32_e32 v1, vcc, v1, v2, vcc
6472; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
6473; GCN-NEXT:    s_endpgm
6474  %r = srem i64 %x, 1235195
6475  store i64 %r, i64 addrspace(1)* %out
6476  ret void
6477}
6478
; Signed remainder of an i64 by the power-of-two constant 4096.
; The opt-level checks expect the srem to come out of the pass unchanged.
; The llc-level checks expect a short scalar-only sequence with no reciprocal:
; a bias derived from the sign of %x (s_ashr_i32 31, s_lshr_b32 20) is added,
; the low 12 bits are masked off with 0xfffff000, and the result is
; subtracted from %x.
6479define amdgpu_kernel void @srem_i64_pow2k_denom(i64 addrspace(1)* %out, i64 %x) {
6480; CHECK-LABEL: @srem_i64_pow2k_denom(
6481; CHECK-NEXT:    [[R:%.*]] = srem i64 [[X:%.*]], 4096
6482; CHECK-NEXT:    store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4
6483; CHECK-NEXT:    ret void
6484;
6485; GCN-LABEL: srem_i64_pow2k_denom:
6486; GCN:       ; %bb.0:
6487; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
6488; GCN-NEXT:    s_mov_b32 s3, 0xf000
6489; GCN-NEXT:    s_mov_b32 s2, -1
6490; GCN-NEXT:    s_waitcnt lgkmcnt(0)
6491; GCN-NEXT:    s_mov_b32 s0, s4
6492; GCN-NEXT:    s_ashr_i32 s4, s7, 31
6493; GCN-NEXT:    s_lshr_b32 s4, s4, 20
6494; GCN-NEXT:    s_add_u32 s4, s6, s4
6495; GCN-NEXT:    s_mov_b32 s1, s5
6496; GCN-NEXT:    s_addc_u32 s5, s7, 0
6497; GCN-NEXT:    s_and_b32 s4, s4, 0xfffff000
6498; GCN-NEXT:    s_sub_u32 s4, s6, s4
6499; GCN-NEXT:    s_subb_u32 s5, s7, s5
6500; GCN-NEXT:    v_mov_b32_e32 v0, s4
6501; GCN-NEXT:    v_mov_b32_e32 v1, s5
6502; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
6503; GCN-NEXT:    s_endpgm
6504  %r = srem i64 %x, 4096
6505  store i64 %r, i64 addrspace(1)* %out
6506  ret void
6507}
6508
; Signed remainder of an i64 by (4096 << %y), i.e. a divisor that is only
; known at run time.
; The opt-level checks expect both the shl and the srem to come out of the
; pass unchanged.
; The llc-level checks expect the divisor to be built with s_lshl_b64 from
; 0x1000 and the remainder to be computed through a v_rcp_f32-based
; expansion, with the final xor/sub fixup using the sign of %x.
6509define amdgpu_kernel void @srem_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %x, i64 %y) {
6510; CHECK-LABEL: @srem_i64_pow2_shl_denom(
6511; CHECK-NEXT:    [[SHL_Y:%.*]] = shl i64 4096, [[Y:%.*]]
6512; CHECK-NEXT:    [[R:%.*]] = srem i64 [[X:%.*]], [[SHL_Y]]
6513; CHECK-NEXT:    store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4
6514; CHECK-NEXT:    ret void
6515;
6516; GCN-LABEL: srem_i64_pow2_shl_denom:
6517; GCN:       ; %bb.0:
6518; GCN-NEXT:    s_load_dword s4, s[0:1], 0xd
6519; GCN-NEXT:    s_mov_b32 s3, 0
6520; GCN-NEXT:    s_movk_i32 s2, 0x1000
6521; GCN-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x9
6522; GCN-NEXT:    s_mov_b32 s7, 0xf000
6523; GCN-NEXT:    s_waitcnt lgkmcnt(0)
6524; GCN-NEXT:    s_lshl_b64 s[2:3], s[2:3], s4
6525; GCN-NEXT:    s_ashr_i32 s4, s3, 31
6526; GCN-NEXT:    s_add_u32 s2, s2, s4
6527; GCN-NEXT:    s_mov_b32 s5, s4
6528; GCN-NEXT:    s_addc_u32 s3, s3, s4
6529; GCN-NEXT:    s_xor_b64 s[12:13], s[2:3], s[4:5]
6530; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s12
6531; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s13
6532; GCN-NEXT:    s_sub_u32 s2, 0, s12
6533; GCN-NEXT:    s_subb_u32 s3, 0, s13
6534; GCN-NEXT:    s_ashr_i32 s14, s11, 31
6535; GCN-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v1
6536; GCN-NEXT:    v_rcp_f32_e32 v0, v0
6537; GCN-NEXT:    s_mov_b32 s15, s14
6538; GCN-NEXT:    s_mov_b32 s6, -1
6539; GCN-NEXT:    s_mov_b32 s4, s8
6540; GCN-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
6541; GCN-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
6542; GCN-NEXT:    v_trunc_f32_e32 v1, v1
6543; GCN-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
6544; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
6545; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
6546; GCN-NEXT:    s_mov_b32 s5, s9
6547; GCN-NEXT:    v_mul_hi_u32 v3, s2, v0
6548; GCN-NEXT:    v_mul_lo_u32 v2, s2, v1
6549; GCN-NEXT:    v_mul_lo_u32 v5, s3, v0
6550; GCN-NEXT:    v_mul_lo_u32 v4, s2, v0
6551; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
6552; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
6553; GCN-NEXT:    v_mul_hi_u32 v3, v0, v4
6554; GCN-NEXT:    v_mul_lo_u32 v5, v0, v2
6555; GCN-NEXT:    v_mul_hi_u32 v6, v0, v2
6556; GCN-NEXT:    v_mul_hi_u32 v7, v1, v2
6557; GCN-NEXT:    v_mul_lo_u32 v2, v1, v2
6558; GCN-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
6559; GCN-NEXT:    v_addc_u32_e32 v5, vcc, 0, v6, vcc
6560; GCN-NEXT:    v_mul_lo_u32 v6, v1, v4
6561; GCN-NEXT:    v_mul_hi_u32 v4, v1, v4
6562; GCN-NEXT:    v_add_i32_e32 v3, vcc, v6, v3
6563; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v5, v4, vcc
6564; GCN-NEXT:    v_mov_b32_e32 v4, 0
6565; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v7, v4, vcc
6566; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
6567; GCN-NEXT:    v_mov_b32_e32 v6, 0
6568; GCN-NEXT:    v_add_i32_e64 v0, s[0:1], v0, v2
6569; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v6, v5, vcc
6570; GCN-NEXT:    v_addc_u32_e64 v2, vcc, v1, v3, s[0:1]
6571; GCN-NEXT:    v_mul_lo_u32 v5, s2, v2
6572; GCN-NEXT:    v_mul_hi_u32 v7, s2, v0
6573; GCN-NEXT:    v_mul_lo_u32 v8, s3, v0
6574; GCN-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
6575; GCN-NEXT:    v_mul_lo_u32 v7, s2, v0
6576; GCN-NEXT:    v_add_i32_e32 v5, vcc, v8, v5
6577; GCN-NEXT:    v_mul_lo_u32 v10, v0, v5
6578; GCN-NEXT:    v_mul_hi_u32 v12, v0, v5
6579; GCN-NEXT:    v_mul_hi_u32 v11, v0, v7
6580; GCN-NEXT:    v_mul_hi_u32 v9, v2, v7
6581; GCN-NEXT:    v_mul_lo_u32 v7, v2, v7
6582; GCN-NEXT:    v_mul_hi_u32 v8, v2, v5
6583; GCN-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
6584; GCN-NEXT:    v_addc_u32_e32 v11, vcc, 0, v12, vcc
6585; GCN-NEXT:    v_mul_lo_u32 v2, v2, v5
6586; GCN-NEXT:    v_add_i32_e32 v7, vcc, v10, v7
6587; GCN-NEXT:    v_addc_u32_e32 v7, vcc, v11, v9, vcc
6588; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v8, v4, vcc
6589; GCN-NEXT:    v_add_i32_e32 v2, vcc, v7, v2
6590; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v6, v5, vcc
6591; GCN-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
6592; GCN-NEXT:    v_addc_u32_e64 v1, vcc, v1, v5, s[0:1]
6593; GCN-NEXT:    s_add_u32 s0, s10, s14
6594; GCN-NEXT:    s_addc_u32 s1, s11, s14
6595; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
6596; GCN-NEXT:    s_xor_b64 s[10:11], s[0:1], s[14:15]
6597; GCN-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
6598; GCN-NEXT:    v_mul_lo_u32 v2, s10, v1
6599; GCN-NEXT:    v_mul_hi_u32 v3, s10, v0
6600; GCN-NEXT:    v_mul_hi_u32 v5, s10, v1
6601; GCN-NEXT:    v_mul_hi_u32 v7, s11, v1
6602; GCN-NEXT:    v_mul_lo_u32 v1, s11, v1
6603; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
6604; GCN-NEXT:    v_addc_u32_e32 v3, vcc, 0, v5, vcc
6605; GCN-NEXT:    v_mul_lo_u32 v5, s11, v0
6606; GCN-NEXT:    v_mul_hi_u32 v0, s11, v0
6607; GCN-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
6608; GCN-NEXT:    v_addc_u32_e32 v0, vcc, v3, v0, vcc
6609; GCN-NEXT:    v_addc_u32_e32 v2, vcc, v7, v4, vcc
6610; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
6611; GCN-NEXT:    v_addc_u32_e32 v1, vcc, v6, v2, vcc
6612; GCN-NEXT:    v_mul_lo_u32 v1, s12, v1
6613; GCN-NEXT:    v_mul_hi_u32 v2, s12, v0
6614; GCN-NEXT:    v_mul_lo_u32 v3, s13, v0
6615; GCN-NEXT:    v_mul_lo_u32 v0, s12, v0
6616; GCN-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
6617; GCN-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
6618; GCN-NEXT:    v_sub_i32_e32 v2, vcc, s11, v1
6619; GCN-NEXT:    v_mov_b32_e32 v3, s13
6620; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s10, v0
6621; GCN-NEXT:    v_subb_u32_e64 v2, s[0:1], v2, v3, vcc
6622; GCN-NEXT:    v_subrev_i32_e64 v4, s[0:1], s12, v0
6623; GCN-NEXT:    v_subbrev_u32_e64 v5, s[2:3], 0, v2, s[0:1]
6624; GCN-NEXT:    v_cmp_le_u32_e64 s[2:3], s13, v5
6625; GCN-NEXT:    v_subb_u32_e64 v2, s[0:1], v2, v3, s[0:1]
6626; GCN-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[2:3]
6627; GCN-NEXT:    v_cmp_le_u32_e64 s[2:3], s12, v4
6628; GCN-NEXT:    v_subrev_i32_e64 v3, s[0:1], s12, v4
6629; GCN-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[2:3]
6630; GCN-NEXT:    v_cmp_eq_u32_e64 s[2:3], s13, v5
6631; GCN-NEXT:    v_cndmask_b32_e64 v6, v6, v7, s[2:3]
6632; GCN-NEXT:    v_subbrev_u32_e64 v2, s[0:1], 0, v2, s[0:1]
6633; GCN-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v6
6634; GCN-NEXT:    v_cndmask_b32_e64 v2, v5, v2, s[0:1]
6635; GCN-NEXT:    v_mov_b32_e32 v5, s11
6636; GCN-NEXT:    v_subb_u32_e32 v1, vcc, v5, v1, vcc
6637; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s13, v1
6638; GCN-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
6639; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s12, v0
6640; GCN-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
6641; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, s13, v1
6642; GCN-NEXT:    v_cndmask_b32_e32 v5, v5, v6, vcc
6643; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
6644; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
6645; GCN-NEXT:    v_cndmask_b32_e64 v2, v4, v3, s[0:1]
6646; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
6647; GCN-NEXT:    v_xor_b32_e32 v0, s14, v0
6648; GCN-NEXT:    v_xor_b32_e32 v1, s14, v1
6649; GCN-NEXT:    v_mov_b32_e32 v2, s14
6650; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, s14, v0
6651; GCN-NEXT:    v_subb_u32_e32 v1, vcc, v1, v2, vcc
6652; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
6653; GCN-NEXT:    s_endpgm
6654  %shl.y = shl i64 4096, %y
6655  %r = srem i64 %x, %shl.y
6656  store i64 %r, i64 addrspace(1)* %out
6657  ret void
6658}
6659
; Test: signed remainder of a <2 x i64> vector by the splat constant 4096.
;
; The IR-level assertions below expect the vector srem to be split into one
; `srem i64 ..., 4096` per element, reassembled with insertelement, and stored
; to %out. The ISA-level assertions expect a short scalar shift/add/and/sub
; sequence for each element and contain no v_rcp_f32-based division expansion.
;
; Every assertion line in this function is autogenerated (see the NOTE lines
; at the top of the file); regenerate them with the update scripts rather than
; editing them by hand.
6660define amdgpu_kernel void @srem_v2i64_pow2k_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x) {
6661; CHECK-LABEL: @srem_v2i64_pow2k_denom(
6662; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0
6663; CHECK-NEXT:    [[TMP2:%.*]] = srem i64 [[TMP1]], 4096
6664; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i64 0
6665; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1
6666; CHECK-NEXT:    [[TMP5:%.*]] = srem i64 [[TMP4]], 4096
6667; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1
6668; CHECK-NEXT:    store <2 x i64> [[TMP6]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16
6669; CHECK-NEXT:    ret void
6670;
6671; GCN-LABEL: srem_v2i64_pow2k_denom:
6672; GCN:       ; %bb.0:
6673; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
6674; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xd
6675; GCN-NEXT:    s_movk_i32 s8, 0xf000
6676; GCN-NEXT:    s_mov_b32 s7, 0xf000
6677; GCN-NEXT:    s_mov_b32 s6, -1
6678; GCN-NEXT:    s_waitcnt lgkmcnt(0)
6679; GCN-NEXT:    s_ashr_i32 s9, s1, 31
6680; GCN-NEXT:    s_lshr_b32 s9, s9, 20
6681; GCN-NEXT:    s_add_u32 s9, s0, s9
6682; GCN-NEXT:    s_addc_u32 s10, s1, 0
6683; GCN-NEXT:    s_and_b32 s9, s9, s8
6684; GCN-NEXT:    s_sub_u32 s0, s0, s9
6685; GCN-NEXT:    s_subb_u32 s1, s1, s10
6686; GCN-NEXT:    s_ashr_i32 s9, s3, 31
6687; GCN-NEXT:    s_lshr_b32 s9, s9, 20
6688; GCN-NEXT:    s_add_u32 s9, s2, s9
6689; GCN-NEXT:    s_addc_u32 s10, s3, 0
6690; GCN-NEXT:    s_and_b32 s8, s9, s8
6691; GCN-NEXT:    s_sub_u32 s2, s2, s8
6692; GCN-NEXT:    s_subb_u32 s3, s3, s10
6693; GCN-NEXT:    v_mov_b32_e32 v0, s0
6694; GCN-NEXT:    v_mov_b32_e32 v1, s1
6695; GCN-NEXT:    v_mov_b32_e32 v2, s2
6696; GCN-NEXT:    v_mov_b32_e32 v3, s3
6697; GCN-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
6698; GCN-NEXT:    s_endpgm
; Input IR under test: vector srem by a constant power-of-two splat.
6699  %r = srem <2 x i64> %x, <i64 4096, i64 4096>
6700  store <2 x i64> %r, <2 x i64> addrspace(1)* %out
6701  ret void
6702}
6703
; Test: signed remainder of a <2 x i64> vector by `shl <4096, 4096>, %y`,
; i.e. a power-of-two constant shifted left by a run-time amount.
;
; The IR-level assertions below expect only scalarization: the shl stays a
; vector operation and each element gets its own `srem i64` on values pulled
; out with extractelement. The ISA-level assertions expect a long inline
; expansion built around v_rcp_f32 and v_mul_lo_u32/v_mul_hi_u32 that appears
; twice (once per vector element), with the instructions of the two elements
; partly interleaved, before a single buffer_store_dwordx4 of the result.
;
; Every assertion line in this function is autogenerated (see the NOTE lines
; at the top of the file); regenerate them with the update scripts rather than
; editing them by hand.
6704define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x, <2 x i64> %y) {
6705; CHECK-LABEL: @srem_v2i64_pow2_shl_denom(
6706; CHECK-NEXT:    [[SHL_Y:%.*]] = shl <2 x i64> <i64 4096, i64 4096>, [[Y:%.*]]
6707; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0
6708; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 0
6709; CHECK-NEXT:    [[TMP3:%.*]] = srem i64 [[TMP1]], [[TMP2]]
6710; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x i64> undef, i64 [[TMP3]], i64 0
6711; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[X]], i64 1
6712; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 1
6713; CHECK-NEXT:    [[TMP7:%.*]] = srem i64 [[TMP5]], [[TMP6]]
6714; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <2 x i64> [[TMP4]], i64 [[TMP7]], i64 1
6715; CHECK-NEXT:    store <2 x i64> [[TMP8]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16
6716; CHECK-NEXT:    ret void
6717;
6718; GCN-LABEL: srem_v2i64_pow2_shl_denom:
6719; GCN:       ; %bb.0:
6720; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x11
6721; GCN-NEXT:    s_mov_b32 s3, 0
6722; GCN-NEXT:    s_movk_i32 s2, 0x1000
6723; GCN-NEXT:    s_mov_b32 s18, 0x4f800000
6724; GCN-NEXT:    s_mov_b32 s19, 0x5f7ffffc
6725; GCN-NEXT:    s_waitcnt lgkmcnt(0)
6726; GCN-NEXT:    s_lshl_b64 s[14:15], s[2:3], s6
6727; GCN-NEXT:    s_lshl_b64 s[2:3], s[2:3], s4
6728; GCN-NEXT:    s_ashr_i32 s4, s3, 31
6729; GCN-NEXT:    s_add_u32 s2, s2, s4
6730; GCN-NEXT:    s_mov_b32 s5, s4
6731; GCN-NEXT:    s_addc_u32 s3, s3, s4
6732; GCN-NEXT:    s_xor_b64 s[16:17], s[2:3], s[4:5]
6733; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s16
6734; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s17
6735; GCN-NEXT:    s_mov_b32 s20, 0x2f800000
6736; GCN-NEXT:    s_mov_b32 s21, 0xcf800000
6737; GCN-NEXT:    s_sub_u32 s6, 0, s16
6738; GCN-NEXT:    v_mac_f32_e32 v0, s18, v1
6739; GCN-NEXT:    v_rcp_f32_e32 v0, v0
6740; GCN-NEXT:    s_subb_u32 s7, 0, s17
6741; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
6742; GCN-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0xd
6743; GCN-NEXT:    v_mul_f32_e32 v0, s19, v0
6744; GCN-NEXT:    v_mul_f32_e32 v1, s20, v0
6745; GCN-NEXT:    v_trunc_f32_e32 v1, v1
6746; GCN-NEXT:    v_mac_f32_e32 v0, s21, v1
6747; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
6748; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
6749; GCN-NEXT:    s_waitcnt lgkmcnt(0)
6750; GCN-NEXT:    s_ashr_i32 s12, s9, 31
6751; GCN-NEXT:    s_add_u32 s0, s8, s12
6752; GCN-NEXT:    v_mul_hi_u32 v3, s6, v0
6753; GCN-NEXT:    v_mul_lo_u32 v2, s6, v1
6754; GCN-NEXT:    v_mul_lo_u32 v4, s7, v0
6755; GCN-NEXT:    v_mul_lo_u32 v5, s6, v0
6756; GCN-NEXT:    s_mov_b32 s13, s12
6757; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
6758; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
6759; GCN-NEXT:    v_mul_lo_u32 v3, v0, v2
6760; GCN-NEXT:    v_mul_hi_u32 v4, v0, v5
6761; GCN-NEXT:    v_mul_hi_u32 v6, v0, v2
6762; GCN-NEXT:    v_mul_hi_u32 v7, v1, v2
6763; GCN-NEXT:    v_mul_lo_u32 v2, v1, v2
6764; GCN-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
6765; GCN-NEXT:    v_addc_u32_e32 v4, vcc, 0, v6, vcc
6766; GCN-NEXT:    v_mul_lo_u32 v6, v1, v5
6767; GCN-NEXT:    v_mul_hi_u32 v5, v1, v5
6768; GCN-NEXT:    s_addc_u32 s1, s9, s12
6769; GCN-NEXT:    s_xor_b64 s[8:9], s[0:1], s[12:13]
6770; GCN-NEXT:    v_add_i32_e32 v3, vcc, v6, v3
6771; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v4, v5, vcc
6772; GCN-NEXT:    v_mov_b32_e32 v4, 0
6773; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v7, v4, vcc
6774; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
6775; GCN-NEXT:    v_mov_b32_e32 v6, 0
6776; GCN-NEXT:    v_add_i32_e64 v0, s[2:3], v0, v2
6777; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v6, v5, vcc
6778; GCN-NEXT:    v_addc_u32_e64 v2, vcc, v1, v3, s[2:3]
6779; GCN-NEXT:    v_mul_lo_u32 v5, s6, v2
6780; GCN-NEXT:    v_mul_hi_u32 v7, s6, v0
6781; GCN-NEXT:    v_mul_lo_u32 v8, s7, v0
6782; GCN-NEXT:    s_mov_b32 s7, 0xf000
6783; GCN-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
6784; GCN-NEXT:    v_mul_lo_u32 v7, s6, v0
6785; GCN-NEXT:    v_add_i32_e32 v5, vcc, v8, v5
6786; GCN-NEXT:    v_mul_lo_u32 v10, v0, v5
6787; GCN-NEXT:    v_mul_hi_u32 v12, v0, v5
6788; GCN-NEXT:    v_mul_hi_u32 v11, v0, v7
6789; GCN-NEXT:    v_mul_hi_u32 v9, v2, v7
6790; GCN-NEXT:    v_mul_lo_u32 v7, v2, v7
6791; GCN-NEXT:    v_mul_hi_u32 v8, v2, v5
6792; GCN-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
6793; GCN-NEXT:    v_addc_u32_e32 v11, vcc, 0, v12, vcc
6794; GCN-NEXT:    v_mul_lo_u32 v2, v2, v5
6795; GCN-NEXT:    v_add_i32_e32 v7, vcc, v10, v7
6796; GCN-NEXT:    v_addc_u32_e32 v7, vcc, v11, v9, vcc
6797; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v8, v4, vcc
6798; GCN-NEXT:    v_add_i32_e32 v2, vcc, v7, v2
6799; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v6, v5, vcc
6800; GCN-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
6801; GCN-NEXT:    v_addc_u32_e64 v1, vcc, v1, v5, s[2:3]
6802; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
6803; GCN-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
6804; GCN-NEXT:    v_mul_lo_u32 v2, s8, v1
6805; GCN-NEXT:    v_mul_hi_u32 v3, s8, v0
6806; GCN-NEXT:    v_mul_hi_u32 v5, s8, v1
6807; GCN-NEXT:    v_mul_hi_u32 v7, s9, v1
6808; GCN-NEXT:    v_mul_lo_u32 v1, s9, v1
6809; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
6810; GCN-NEXT:    v_addc_u32_e32 v3, vcc, 0, v5, vcc
6811; GCN-NEXT:    v_mul_lo_u32 v5, s9, v0
6812; GCN-NEXT:    v_mul_hi_u32 v0, s9, v0
6813; GCN-NEXT:    s_mov_b32 s6, -1
6814; GCN-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
6815; GCN-NEXT:    v_addc_u32_e32 v0, vcc, v3, v0, vcc
6816; GCN-NEXT:    v_addc_u32_e32 v2, vcc, v7, v4, vcc
6817; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
6818; GCN-NEXT:    v_addc_u32_e32 v1, vcc, v6, v2, vcc
6819; GCN-NEXT:    v_mul_lo_u32 v1, s16, v1
6820; GCN-NEXT:    v_mul_hi_u32 v2, s16, v0
6821; GCN-NEXT:    v_mul_lo_u32 v3, s17, v0
6822; GCN-NEXT:    v_mul_lo_u32 v0, s16, v0
6823; GCN-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
6824; GCN-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
6825; GCN-NEXT:    v_sub_i32_e32 v2, vcc, s9, v1
6826; GCN-NEXT:    v_mov_b32_e32 v3, s17
6827; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s8, v0
6828; GCN-NEXT:    v_subb_u32_e64 v2, s[0:1], v2, v3, vcc
6829; GCN-NEXT:    v_subrev_i32_e64 v5, s[0:1], s16, v0
6830; GCN-NEXT:    v_subbrev_u32_e64 v7, s[2:3], 0, v2, s[0:1]
6831; GCN-NEXT:    v_cmp_le_u32_e64 s[2:3], s17, v7
6832; GCN-NEXT:    v_subb_u32_e64 v2, s[0:1], v2, v3, s[0:1]
6833; GCN-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[2:3]
6834; GCN-NEXT:    v_cmp_le_u32_e64 s[2:3], s16, v5
6835; GCN-NEXT:    v_subrev_i32_e64 v3, s[0:1], s16, v5
6836; GCN-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[2:3]
6837; GCN-NEXT:    v_cmp_eq_u32_e64 s[2:3], s17, v7
6838; GCN-NEXT:    v_cndmask_b32_e64 v8, v8, v9, s[2:3]
6839; GCN-NEXT:    s_ashr_i32 s2, s15, 31
6840; GCN-NEXT:    v_subbrev_u32_e64 v2, s[0:1], 0, v2, s[0:1]
6841; GCN-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v8
6842; GCN-NEXT:    s_add_u32 s8, s14, s2
6843; GCN-NEXT:    v_cndmask_b32_e64 v2, v7, v2, s[0:1]
6844; GCN-NEXT:    v_mov_b32_e32 v7, s9
6845; GCN-NEXT:    s_mov_b32 s3, s2
6846; GCN-NEXT:    s_addc_u32 s9, s15, s2
6847; GCN-NEXT:    s_xor_b64 s[8:9], s[8:9], s[2:3]
6848; GCN-NEXT:    v_cvt_f32_u32_e32 v8, s8
6849; GCN-NEXT:    v_cvt_f32_u32_e32 v9, s9
6850; GCN-NEXT:    v_subb_u32_e32 v1, vcc, v7, v1, vcc
6851; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s17, v1
6852; GCN-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
6853; GCN-NEXT:    v_mac_f32_e32 v8, s18, v9
6854; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s16, v0
6855; GCN-NEXT:    v_rcp_f32_e32 v8, v8
6856; GCN-NEXT:    v_cndmask_b32_e64 v10, 0, -1, vcc
6857; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, s17, v1
6858; GCN-NEXT:    v_cndmask_b32_e32 v7, v7, v10, vcc
6859; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v7
6860; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
6861; GCN-NEXT:    v_cndmask_b32_e64 v2, v5, v3, s[0:1]
6862; GCN-NEXT:    v_mul_f32_e32 v3, s19, v8
6863; GCN-NEXT:    v_mul_f32_e32 v5, s20, v3
6864; GCN-NEXT:    v_trunc_f32_e32 v5, v5
6865; GCN-NEXT:    v_mac_f32_e32 v3, s21, v5
6866; GCN-NEXT:    v_cvt_u32_f32_e32 v3, v3
6867; GCN-NEXT:    v_cvt_u32_f32_e32 v5, v5
6868; GCN-NEXT:    s_sub_u32 s2, 0, s8
6869; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
6870; GCN-NEXT:    v_mul_hi_u32 v2, s2, v3
6871; GCN-NEXT:    v_mul_lo_u32 v7, s2, v5
6872; GCN-NEXT:    s_subb_u32 s3, 0, s9
6873; GCN-NEXT:    v_mul_lo_u32 v8, s3, v3
6874; GCN-NEXT:    s_ashr_i32 s14, s11, 31
6875; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v7
6876; GCN-NEXT:    v_mul_lo_u32 v7, s2, v3
6877; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v8
6878; GCN-NEXT:    v_mul_lo_u32 v8, v3, v2
6879; GCN-NEXT:    v_mul_hi_u32 v10, v3, v2
6880; GCN-NEXT:    v_mul_hi_u32 v9, v3, v7
6881; GCN-NEXT:    v_mul_hi_u32 v11, v5, v2
6882; GCN-NEXT:    v_mul_lo_u32 v2, v5, v2
6883; GCN-NEXT:    s_mov_b32 s15, s14
6884; GCN-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
6885; GCN-NEXT:    v_addc_u32_e32 v9, vcc, 0, v10, vcc
6886; GCN-NEXT:    v_mul_lo_u32 v10, v5, v7
6887; GCN-NEXT:    v_mul_hi_u32 v7, v5, v7
6888; GCN-NEXT:    v_xor_b32_e32 v0, s12, v0
6889; GCN-NEXT:    v_xor_b32_e32 v1, s12, v1
6890; GCN-NEXT:    v_add_i32_e32 v8, vcc, v10, v8
6891; GCN-NEXT:    v_addc_u32_e32 v7, vcc, v9, v7, vcc
6892; GCN-NEXT:    v_addc_u32_e32 v8, vcc, v11, v4, vcc
6893; GCN-NEXT:    v_add_i32_e32 v2, vcc, v7, v2
6894; GCN-NEXT:    v_add_i32_e64 v2, s[0:1], v3, v2
6895; GCN-NEXT:    v_addc_u32_e32 v7, vcc, v6, v8, vcc
6896; GCN-NEXT:    v_addc_u32_e64 v3, vcc, v5, v7, s[0:1]
6897; GCN-NEXT:    v_mul_lo_u32 v8, s2, v3
6898; GCN-NEXT:    v_mul_hi_u32 v9, s2, v2
6899; GCN-NEXT:    v_mul_lo_u32 v10, s3, v2
6900; GCN-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
6901; GCN-NEXT:    v_mul_lo_u32 v9, s2, v2
6902; GCN-NEXT:    v_add_i32_e32 v8, vcc, v10, v8
6903; GCN-NEXT:    v_mul_lo_u32 v12, v2, v8
6904; GCN-NEXT:    v_mul_hi_u32 v14, v2, v8
6905; GCN-NEXT:    v_mul_hi_u32 v13, v2, v9
6906; GCN-NEXT:    v_mul_hi_u32 v11, v3, v9
6907; GCN-NEXT:    v_mul_lo_u32 v9, v3, v9
6908; GCN-NEXT:    v_mul_hi_u32 v10, v3, v8
6909; GCN-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
6910; GCN-NEXT:    v_addc_u32_e32 v13, vcc, 0, v14, vcc
6911; GCN-NEXT:    v_mul_lo_u32 v3, v3, v8
6912; GCN-NEXT:    v_add_i32_e32 v9, vcc, v12, v9
6913; GCN-NEXT:    v_addc_u32_e32 v9, vcc, v13, v11, vcc
6914; GCN-NEXT:    v_addc_u32_e32 v8, vcc, v10, v4, vcc
6915; GCN-NEXT:    v_add_i32_e32 v3, vcc, v9, v3
6916; GCN-NEXT:    v_addc_u32_e32 v8, vcc, v6, v8, vcc
6917; GCN-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
6918; GCN-NEXT:    v_addc_u32_e64 v5, vcc, v5, v8, s[0:1]
6919; GCN-NEXT:    s_add_u32 s0, s10, s14
6920; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
6921; GCN-NEXT:    s_addc_u32 s1, s11, s14
6922; GCN-NEXT:    s_xor_b64 s[10:11], s[0:1], s[14:15]
6923; GCN-NEXT:    v_addc_u32_e32 v3, vcc, 0, v5, vcc
6924; GCN-NEXT:    v_mul_lo_u32 v5, s10, v3
6925; GCN-NEXT:    v_mul_hi_u32 v7, s10, v2
6926; GCN-NEXT:    v_mul_hi_u32 v9, s10, v3
6927; GCN-NEXT:    v_mul_hi_u32 v10, s11, v3
6928; GCN-NEXT:    v_mul_lo_u32 v3, s11, v3
6929; GCN-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
6930; GCN-NEXT:    v_addc_u32_e32 v7, vcc, 0, v9, vcc
6931; GCN-NEXT:    v_mul_lo_u32 v9, s11, v2
6932; GCN-NEXT:    v_mul_hi_u32 v2, s11, v2
6933; GCN-NEXT:    v_mov_b32_e32 v8, s12
6934; GCN-NEXT:    v_add_i32_e32 v5, vcc, v9, v5
6935; GCN-NEXT:    v_addc_u32_e32 v2, vcc, v7, v2, vcc
6936; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v10, v4, vcc
6937; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
6938; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v6, v4, vcc
6939; GCN-NEXT:    v_mul_lo_u32 v3, s8, v3
6940; GCN-NEXT:    v_mul_hi_u32 v4, s8, v2
6941; GCN-NEXT:    v_mul_lo_u32 v5, s9, v2
6942; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, s12, v0
6943; GCN-NEXT:    v_mul_lo_u32 v2, s8, v2
6944; GCN-NEXT:    v_subb_u32_e32 v1, vcc, v1, v8, vcc
6945; GCN-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
6946; GCN-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
6947; GCN-NEXT:    v_sub_i32_e32 v4, vcc, s11, v3
6948; GCN-NEXT:    v_mov_b32_e32 v5, s9
6949; GCN-NEXT:    v_sub_i32_e32 v2, vcc, s10, v2
6950; GCN-NEXT:    v_subb_u32_e64 v4, s[0:1], v4, v5, vcc
6951; GCN-NEXT:    v_subrev_i32_e64 v6, s[0:1], s8, v2
6952; GCN-NEXT:    v_subbrev_u32_e64 v7, s[2:3], 0, v4, s[0:1]
6953; GCN-NEXT:    v_cmp_le_u32_e64 s[2:3], s9, v7
6954; GCN-NEXT:    v_subb_u32_e64 v4, s[0:1], v4, v5, s[0:1]
6955; GCN-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[2:3]
6956; GCN-NEXT:    v_cmp_le_u32_e64 s[2:3], s8, v6
6957; GCN-NEXT:    v_subrev_i32_e64 v5, s[0:1], s8, v6
6958; GCN-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[2:3]
6959; GCN-NEXT:    v_cmp_eq_u32_e64 s[2:3], s9, v7
6960; GCN-NEXT:    v_cndmask_b32_e64 v8, v8, v9, s[2:3]
6961; GCN-NEXT:    v_subbrev_u32_e64 v4, s[0:1], 0, v4, s[0:1]
6962; GCN-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v8
6963; GCN-NEXT:    v_cndmask_b32_e64 v4, v7, v4, s[0:1]
6964; GCN-NEXT:    v_mov_b32_e32 v7, s11
6965; GCN-NEXT:    v_subb_u32_e32 v3, vcc, v7, v3, vcc
6966; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s9, v3
6967; GCN-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
6968; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s8, v2
6969; GCN-NEXT:    v_cndmask_b32_e64 v8, 0, -1, vcc
6970; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, s9, v3
6971; GCN-NEXT:    v_cndmask_b32_e32 v7, v7, v8, vcc
6972; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v7
6973; GCN-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
6974; GCN-NEXT:    v_cndmask_b32_e64 v4, v6, v5, s[0:1]
6975; GCN-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
6976; GCN-NEXT:    v_xor_b32_e32 v2, s14, v2
6977; GCN-NEXT:    v_xor_b32_e32 v3, s14, v3
6978; GCN-NEXT:    v_mov_b32_e32 v4, s14
6979; GCN-NEXT:    v_subrev_i32_e32 v2, vcc, s14, v2
6980; GCN-NEXT:    v_subb_u32_e32 v3, vcc, v3, v4, vcc
6981; GCN-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
6982; GCN-NEXT:    s_endpgm
; Input IR under test: the divisor is built at run time by shifting the
; constant splat 4096 left by %y, then used as the srem denominator.
6983  %shl.y = shl <2 x i64> <i64 4096, i64 4096>, %y
6984  %r = srem <2 x i64> %x, %shl.y
6985  store <2 x i64> %r, <2 x i64> addrspace(1)* %out
6986  ret void
6987}
6988