; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s

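; Unsigned 64-bit multiply with overflow, both operands in VGPRs. The
; lowering builds the 128-bit product out of four 32x32 partial products
; (v_mul_lo_u32 / v_mul_hi_u32 plus carry chains) and reports overflow
; when the high 64 bits of the product are nonzero.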
define { i64, i1 } @umulo_i64_v_v(i64 %x, i64 %y) {
; SI-LABEL: umulo_i64_v_v:
; SI:       ; %bb.0: ; %bb
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_mul_hi_u32 v4, v1, v2
; SI-NEXT:    v_mul_lo_u32 v5, v1, v2
; SI-NEXT:    v_mul_hi_u32 v6, v0, v3
; SI-NEXT:    v_mul_lo_u32 v7, v0, v3
; SI-NEXT:    v_mul_hi_u32 v8, v0, v2
; SI-NEXT:    v_mul_hi_u32 v9, v1, v3
; SI-NEXT:    v_mul_lo_u32 v3, v1, v3
; SI-NEXT:    v_mul_lo_u32 v0, v0, v2
; SI-NEXT:    v_add_i32_e32 v1, vcc, v8, v7
; SI-NEXT:    v_addc_u32_e32 v2, vcc, 0, v6, vcc
; SI-NEXT:    v_add_i32_e32 v6, vcc, v1, v5
; SI-NEXT:    v_add_i32_e64 v1, s[4:5], v1, v5
; SI-NEXT:    v_addc_u32_e32 v2, vcc, v2, v4, vcc
; SI-NEXT:    v_addc_u32_e32 v4, vcc, 0, v9, vcc
; SI-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
; SI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
; SI-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[2:3]
; SI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: umulo_i64_v_v:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_mul_lo_u32 v5, v0, v3
; GFX9-NEXT:    v_mul_hi_u32 v6, v0, v2
; GFX9-NEXT:    v_mul_hi_u32 v8, v0, v3
; GFX9-NEXT:    v_mul_lo_u32 v7, v1, v2
; GFX9-NEXT:    v_mul_hi_u32 v4, v1, v2
; GFX9-NEXT:    v_add_co_u32_e32 v9, vcc, v6, v5
; GFX9-NEXT:    v_mul_hi_u32 v10, v1, v3
; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, 0, v8, vcc
; GFX9-NEXT:    v_mul_lo_u32 v1, v1, v3
; GFX9-NEXT:    v_add_co_u32_e32 v9, vcc, v9, v7
; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v8, v4, vcc
; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, 0, v10, vcc
; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v4, v1
; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v8, vcc
; GFX9-NEXT:    v_mul_lo_u32 v0, v0, v2
; GFX9-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[3:4]
; GFX9-NEXT:    v_add3_u32 v1, v6, v5, v7
; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
; GFX9-NEXT:    s_setpc_b64 s[30:31]
bb:
  %umulo = tail call { i64, i1 } @llvm.umul.with.overflow.i64(i64 %x, i64 %y)
  ret { i64, i1 } %umulo
}

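; Signed 64-bit multiply with overflow, both operands in VGPRs. The
; unsigned 128-bit product is corrected for negative operands (the
; conditional subtracts selected by v_cmp_gt_i32), and overflow is set
; when the corrected high half differs from the sign-extension of the
; low half of the result.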
define { i64, i1 } @smulo_i64_v_v(i64 %x, i64 %y) {
; SI-LABEL: smulo_i64_v_v:
; SI:       ; %bb.0: ; %bb
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_mul_hi_u32 v6, v1, v2
; SI-NEXT:    v_mul_lo_u32 v5, v1, v2
; SI-NEXT:    v_mul_hi_u32 v7, v0, v3
; SI-NEXT:    v_mul_lo_u32 v8, v0, v3
; SI-NEXT:    v_mul_hi_u32 v9, v0, v2
; SI-NEXT:    v_mul_hi_i32 v10, v1, v3
; SI-NEXT:    v_mul_lo_u32 v11, v1, v3
; SI-NEXT:    v_mov_b32_e32 v12, 0
; SI-NEXT:    v_mul_lo_u32 v4, v0, v2
; SI-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
; SI-NEXT:    v_addc_u32_e32 v7, vcc, 0, v7, vcc
; SI-NEXT:    v_add_i32_e32 v9, vcc, v8, v5
; SI-NEXT:    v_add_i32_e64 v5, s[4:5], v8, v5
; SI-NEXT:    v_addc_u32_e32 v8, vcc, v7, v6, vcc
; SI-NEXT:    v_ashrrev_i32_e32 v6, 31, v5
; SI-NEXT:    v_addc_u32_e32 v9, vcc, 0, v10, vcc
; SI-NEXT:    v_mov_b32_e32 v7, v6
; SI-NEXT:    v_add_i32_e32 v8, vcc, v8, v11
; SI-NEXT:    v_addc_u32_e32 v9, vcc, v12, v9, vcc
; SI-NEXT:    v_sub_i32_e32 v2, vcc, v8, v2
; SI-NEXT:    v_subb_u32_e32 v10, vcc, v9, v12, vcc
; SI-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v1
; SI-NEXT:    v_cndmask_b32_e32 v1, v9, v10, vcc
; SI-NEXT:    v_cndmask_b32_e32 v2, v8, v2, vcc
; SI-NEXT:    v_sub_i32_e32 v0, vcc, v2, v0
; SI-NEXT:    v_subb_u32_e32 v8, vcc, v1, v12, vcc
; SI-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v3
; SI-NEXT:    v_cndmask_b32_e32 v1, v1, v8, vcc
; SI-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
; SI-NEXT:    v_cmp_ne_u64_e32 vcc, v[0:1], v[6:7]
; SI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
; SI-NEXT:    v_mov_b32_e32 v0, v4
; SI-NEXT:    v_mov_b32_e32 v1, v5
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: smulo_i64_v_v:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_mul_lo_u32 v5, v0, v3
; GFX9-NEXT:    v_mul_hi_u32 v6, v0, v2
; GFX9-NEXT:    v_mul_hi_u32 v8, v0, v3
; GFX9-NEXT:    v_mul_lo_u32 v7, v1, v2
; GFX9-NEXT:    v_mul_hi_u32 v4, v1, v2
; GFX9-NEXT:    v_add_co_u32_e32 v9, vcc, v6, v5
; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, 0, v8, vcc
; GFX9-NEXT:    v_add_co_u32_e32 v9, vcc, v9, v7
; GFX9-NEXT:    v_mul_hi_i32 v10, v1, v3
; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v8, v4, vcc
; GFX9-NEXT:    v_mul_lo_u32 v8, v1, v3
; GFX9-NEXT:    v_addc_co_u32_e32 v9, vcc, 0, v10, vcc
; GFX9-NEXT:    v_mov_b32_e32 v10, 0
; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v8
; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, v10, v9, vcc
; GFX9-NEXT:    v_sub_co_u32_e32 v9, vcc, v4, v2
; GFX9-NEXT:    v_subb_co_u32_e32 v11, vcc, v8, v10, vcc
; GFX9-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v1
; GFX9-NEXT:    v_cndmask_b32_e32 v1, v8, v11, vcc
; GFX9-NEXT:    v_cndmask_b32_e32 v8, v4, v9, vcc
; GFX9-NEXT:    v_sub_co_u32_e32 v9, vcc, v8, v0
; GFX9-NEXT:    v_subb_co_u32_e32 v4, vcc, v1, v10, vcc
; GFX9-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v3
; GFX9-NEXT:    v_cndmask_b32_e32 v4, v1, v4, vcc
; GFX9-NEXT:    v_add3_u32 v1, v6, v5, v7
; GFX9-NEXT:    v_ashrrev_i32_e32 v5, 31, v1
; GFX9-NEXT:    v_cndmask_b32_e32 v3, v8, v9, vcc
; GFX9-NEXT:    v_mov_b32_e32 v6, v5
; GFX9-NEXT:    v_mul_lo_u32 v0, v0, v2
; GFX9-NEXT:    v_cmp_ne_u64_e32 vcc, v[3:4], v[5:6]
; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
; GFX9-NEXT:    s_setpc_b64 s[30:31]
bb:
  %smulo = tail call { i64, i1 } @llvm.smul.with.overflow.i64(i64 %x, i64 %y)
  ret { i64, i1 } %smulo
}

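; Kernel variant with both operands taken from scalar kernel arguments.
; GFX9 keeps the product almost entirely in SALU code (s_mul_i32,
; s_mul_hi_u32, s_add_u32/s_addc_u32 carry chains), while SI has no
; scalar high multiply and mixes in v_mul_hi_u32. The overflow flag
; selects 0 over the product before the store.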
define amdgpu_kernel void @umulo_i64_s(i64 %x, i64 %y) {
; SI-LABEL: umulo_i64_s:
; SI:       ; %bb.0: ; %bb
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s2
; SI-NEXT:    v_mul_hi_u32 v1, s1, v0
; SI-NEXT:    s_mul_i32 s4, s1, s2
; SI-NEXT:    v_mov_b32_e32 v2, s3
; SI-NEXT:    v_mul_hi_u32 v3, s0, v2
; SI-NEXT:    s_mul_i32 s5, s0, s3
; SI-NEXT:    v_mul_hi_u32 v0, s0, v0
; SI-NEXT:    v_mul_hi_u32 v2, s1, v2
; SI-NEXT:    s_mul_i32 s1, s1, s3
; SI-NEXT:    s_mul_i32 s0, s0, s2
; SI-NEXT:    v_add_i32_e32 v4, vcc, s5, v0
; SI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
; SI-NEXT:    v_mov_b32_e32 v5, s0
; SI-NEXT:    v_add_i32_e32 v4, vcc, s4, v4
; SI-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
; SI-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc
; SI-NEXT:    v_add_i32_e32 v3, vcc, s5, v0
; SI-NEXT:    v_add_i32_e32 v0, vcc, s1, v1
; SI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v2, vcc
; SI-NEXT:    v_add_i32_e32 v2, vcc, s4, v3
; SI-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; SI-NEXT:    v_cndmask_b32_e64 v1, v2, 0, vcc
; SI-NEXT:    v_cndmask_b32_e64 v0, v5, 0, vcc
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; GFX9-LABEL: umulo_i64_s:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_mul_i32 s7, s0, s3
; GFX9-NEXT:    s_mul_hi_u32 s8, s0, s2
; GFX9-NEXT:    s_mul_hi_u32 s5, s0, s3
; GFX9-NEXT:    s_add_u32 s9, s8, s7
; GFX9-NEXT:    s_mul_i32 s6, s1, s2
; GFX9-NEXT:    s_addc_u32 s5, 0, s5
; GFX9-NEXT:    s_mul_hi_u32 s4, s1, s2
; GFX9-NEXT:    s_add_u32 s9, s9, s6
; GFX9-NEXT:    s_mul_hi_u32 s10, s1, s3
; GFX9-NEXT:    s_addc_u32 s4, s5, s4
; GFX9-NEXT:    s_addc_u32 s5, s10, 0
; GFX9-NEXT:    s_mul_i32 s1, s1, s3
; GFX9-NEXT:    s_add_u32 s4, s4, s1
; GFX9-NEXT:    s_addc_u32 s5, 0, s5
; GFX9-NEXT:    s_add_i32 s1, s8, s7
; GFX9-NEXT:    s_add_i32 s1, s1, s6
; GFX9-NEXT:    s_mul_i32 s2, s0, s2
; GFX9-NEXT:    v_mov_b32_e32 v0, s1
; GFX9-NEXT:    v_cmp_ne_u64_e64 s[0:1], s[4:5], 0
; GFX9-NEXT:    v_cndmask_b32_e64 v1, v0, 0, s[0:1]
; GFX9-NEXT:    v_mov_b32_e32 v0, s2
; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, 0, s[0:1]
; GFX9-NEXT:    global_store_dwordx2 v[0:1], v[0:1], off
; GFX9-NEXT:    s_endpgm
bb:
  %umulo = tail call { i64, i1 } @llvm.umul.with.overflow.i64(i64 %x, i64 %y)
  %mul = extractvalue { i64, i1 } %umulo, 0
  %overflow = extractvalue { i64, i1 } %umulo, 1
  %res = select i1 %overflow, i64 0, i64 %mul
  store i64 %res, i64 addrspace(1)* undef
  ret void
}

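; Signed kernel variant: the same scalar product as above, followed by
; the negative-operand corrections and a compare against the low half's
; sign-extension (s_ashr_i32 on GFX9, v_ashrrev_i32 on SI).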
define amdgpu_kernel void @smulo_i64_s(i64 %x, i64 %y) {
; SI-LABEL: smulo_i64_s:
; SI:       ; %bb.0: ; %bb
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    v_mov_b32_e32 v0, 0
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v1, s2
; SI-NEXT:    v_mul_hi_u32 v2, s1, v1
; SI-NEXT:    s_mul_i32 s4, s1, s2
; SI-NEXT:    v_mov_b32_e32 v3, s3
; SI-NEXT:    v_mul_hi_u32 v4, s0, v3
; SI-NEXT:    s_mul_i32 s5, s0, s3
; SI-NEXT:    v_mul_hi_u32 v1, s0, v1
; SI-NEXT:    v_mul_hi_i32 v3, s1, v3
; SI-NEXT:    s_mul_i32 s6, s1, s3
; SI-NEXT:    s_mul_i32 s8, s0, s2
; SI-NEXT:    v_add_i32_e32 v5, vcc, s5, v1
; SI-NEXT:    v_addc_u32_e32 v4, vcc, 0, v4, vcc
; SI-NEXT:    v_mov_b32_e32 v6, s8
; SI-NEXT:    v_add_i32_e32 v5, vcc, s4, v5
; SI-NEXT:    v_addc_u32_e32 v2, vcc, v4, v2, vcc
; SI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
; SI-NEXT:    v_add_i32_e32 v1, vcc, s5, v1
; SI-NEXT:    v_add_i32_e32 v2, vcc, s6, v2
; SI-NEXT:    v_addc_u32_e32 v3, vcc, v0, v3, vcc
; SI-NEXT:    v_add_i32_e32 v4, vcc, s4, v1
; SI-NEXT:    v_subrev_i32_e32 v1, vcc, s2, v2
; SI-NEXT:    v_subbrev_u32_e32 v5, vcc, 0, v3, vcc
; SI-NEXT:    v_ashrrev_i32_e32 v0, 31, v4
; SI-NEXT:    v_cmp_lt_i32_e64 vcc, s1, 0
; SI-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
; SI-NEXT:    v_cndmask_b32_e32 v2, v2, v1, vcc
; SI-NEXT:    v_mov_b32_e32 v1, v0
; SI-NEXT:    v_subrev_i32_e32 v5, vcc, s0, v2
; SI-NEXT:    v_subbrev_u32_e32 v7, vcc, 0, v3, vcc
; SI-NEXT:    v_cmp_lt_i32_e64 vcc, s3, 0
; SI-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc
; SI-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
; SI-NEXT:    v_cmp_ne_u64_e32 vcc, v[2:3], v[0:1]
; SI-NEXT:    v_cndmask_b32_e64 v1, v4, 0, vcc
; SI-NEXT:    v_cndmask_b32_e64 v0, v6, 0, vcc
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; GFX9-LABEL: smulo_i64_s:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_mul_i32 s7, s0, s3
; GFX9-NEXT:    s_mul_hi_u32 s8, s0, s2
; GFX9-NEXT:    s_mul_hi_u32 s6, s0, s3
; GFX9-NEXT:    s_add_u32 s9, s8, s7
; GFX9-NEXT:    s_mul_i32 s5, s1, s2
; GFX9-NEXT:    s_addc_u32 s6, 0, s6
; GFX9-NEXT:    s_add_u32 s9, s9, s5
; GFX9-NEXT:    s_mul_hi_u32 s4, s1, s2
; GFX9-NEXT:    s_mul_hi_i32 s10, s1, s3
; GFX9-NEXT:    s_addc_u32 s4, s6, s4
; GFX9-NEXT:    s_addc_u32 s6, s10, 0
; GFX9-NEXT:    s_mul_i32 s9, s1, s3
; GFX9-NEXT:    s_add_u32 s4, s4, s9
; GFX9-NEXT:    s_addc_u32 s6, 0, s6
; GFX9-NEXT:    s_sub_u32 s9, s4, s2
; GFX9-NEXT:    s_subb_u32 s10, s6, 0
; GFX9-NEXT:    v_cmp_lt_i32_e64 vcc, s1, 0
; GFX9-NEXT:    v_mov_b32_e32 v0, s6
; GFX9-NEXT:    v_mov_b32_e32 v1, s10
; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
; GFX9-NEXT:    v_mov_b32_e32 v1, s4
; GFX9-NEXT:    v_mov_b32_e32 v2, s9
; GFX9-NEXT:    v_cndmask_b32_e32 v2, v1, v2, vcc
; GFX9-NEXT:    v_subrev_co_u32_e32 v3, vcc, s0, v2
; GFX9-NEXT:    s_add_i32 s1, s8, s7
; GFX9-NEXT:    v_subbrev_co_u32_e32 v1, vcc, 0, v0, vcc
; GFX9-NEXT:    s_add_i32 s1, s1, s5
; GFX9-NEXT:    v_cmp_lt_i32_e64 vcc, s3, 0
; GFX9-NEXT:    s_ashr_i32 s4, s1, 31
; GFX9-NEXT:    v_cndmask_b32_e32 v1, v0, v1, vcc
; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
; GFX9-NEXT:    s_mov_b32 s5, s4
; GFX9-NEXT:    s_mul_i32 s0, s0, s2
; GFX9-NEXT:    v_cmp_ne_u64_e32 vcc, s[4:5], v[0:1]
; GFX9-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-NEXT:    v_mov_b32_e32 v2, s1
; GFX9-NEXT:    v_cndmask_b32_e64 v1, v2, 0, vcc
; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
; GFX9-NEXT:    global_store_dwordx2 v[0:1], v[0:1], off
; GFX9-NEXT:    s_endpgm
bb:
  %smulo = tail call { i64, i1 } @llvm.smul.with.overflow.i64(i64 %x, i64 %y)
  %mul = extractvalue { i64, i1 } %smulo, 0
  %overflow = extractvalue { i64, i1 } %smulo, 1
  %res = select i1 %overflow, i64 0, i64 %mul
  store i64 %res, i64 addrspace(1)* undef
  ret void
}

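; Signed multiply by the constant 4 folds to a shift: the value is
; shifted left by 2, shifted back arithmetically, and compared with the
; original operand; overflow is flagged iff the round trip changes it.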
define { i64, i1 } @smulo_i64_v_4(i64 %i) {
; SI-LABEL: smulo_i64_v_4:
; SI:       ; %bb.0: ; %bb
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_lshl_b64 v[5:6], v[0:1], 2
; SI-NEXT:    v_alignbit_b32 v4, v1, v0, 30
; SI-NEXT:    v_ashr_i64 v[2:3], v[5:6], 2
; SI-NEXT:    v_cmp_ne_u64_e32 vcc, v[2:3], v[0:1]
; SI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
; SI-NEXT:    v_mov_b32_e32 v0, v5
; SI-NEXT:    v_mov_b32_e32 v1, v4
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: smulo_i64_v_4:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_lshlrev_b64 v[4:5], 2, v[0:1]
; GFX9-NEXT:    v_alignbit_b32 v3, v1, v0, 30
; GFX9-NEXT:    v_ashrrev_i64 v[5:6], 2, v[4:5]
; GFX9-NEXT:    v_cmp_ne_u64_e32 vcc, v[5:6], v[0:1]
; GFX9-NEXT:    v_mov_b32_e32 v0, v4
; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
; GFX9-NEXT:    v_mov_b32_e32 v1, v3
; GFX9-NEXT:    s_setpc_b64 s[30:31]
bb:
  %smulo = tail call { i64, i1 } @llvm.smul.with.overflow.i64(i64 %i, i64 4)
  ret { i64, i1 } %smulo
}

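; Unsigned multiply by 4 overflows iff either of the top two bits of the
; input is set, so the check masks the high word with 0x3fffffff and
; compares the masked value against the original.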
define { i64, i1 } @umulo_i64_v_4(i64 %i) {
; SI-LABEL: umulo_i64_v_4:
; SI:       ; %bb.0: ; %bb
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_and_b32_e32 v7, 0x3fffffff, v1
; SI-NEXT:    v_mov_b32_e32 v6, v0
; SI-NEXT:    v_lshl_b64 v[4:5], v[0:1], 2
; SI-NEXT:    v_alignbit_b32 v3, v1, v0, 30
; SI-NEXT:    v_cmp_ne_u64_e32 vcc, v[6:7], v[0:1]
; SI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
; SI-NEXT:    v_mov_b32_e32 v0, v4
; SI-NEXT:    v_mov_b32_e32 v1, v3
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: umulo_i64_v_4:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_and_b32_e32 v7, 0x3fffffff, v1
; GFX9-NEXT:    v_mov_b32_e32 v6, v0
; GFX9-NEXT:    v_cmp_ne_u64_e32 vcc, v[6:7], v[0:1]
; GFX9-NEXT:    v_lshlrev_b64 v[4:5], 2, v[0:1]
; GFX9-NEXT:    v_alignbit_b32 v3, v1, v0, 30
; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
; GFX9-NEXT:    v_mov_b32_e32 v0, v4
; GFX9-NEXT:    v_mov_b32_e32 v1, v3
; GFX9-NEXT:    s_setpc_b64 s[30:31]
bb:
  %umulo = tail call { i64, i1 } @llvm.umul.with.overflow.i64(i64 %i, i64 4)
  ret { i64, i1 } %umulo
}

declare { i64, i1 } @llvm.umul.with.overflow.i64(i64, i64)
declare { i64, i1 } @llvm.smul.with.overflow.i64(i64, i64)