• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -global-isel -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX7 %s
3; RUN: llc -global-isel -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8 %s
4; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10 %s
5
6
; test_div_scale_f32_1 - f32 div.scale with both operands loaded from memory.
; %a and %b are volatile loads of %in[tid] and %in[tid + 1]. The intrinsic is
; called with its i1 argument false, and only element 0 of the returned
; { float, i1 } is stored to %out.
; For every check prefix the expected v_div_scale_f32 uses the second load (%b)
; as its first two source operands and the first load (%a) as the third.
7define amdgpu_kernel void @test_div_scale_f32_1(float addrspace(1)* %out, float addrspace(1)* %in) {
8; GFX7-LABEL: test_div_scale_f32_1:
9; GFX7:       ; %bb.0:
10; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
11; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
12; GFX7-NEXT:    v_mov_b32_e32 v1, 0
13; GFX7-NEXT:    s_mov_b32 s6, 0
14; GFX7-NEXT:    s_mov_b32 s7, 0xf000
15; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
16; GFX7-NEXT:    s_mov_b64 s[4:5], s[2:3]
17; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
18; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 offset:4
19; GFX7-NEXT:    s_mov_b32 s6, -1
20; GFX7-NEXT:    s_waitcnt vmcnt(0)
21; GFX7-NEXT:    v_div_scale_f32 v0, s[2:3], v0, v0, v2
22; GFX7-NEXT:    s_mov_b64 s[2:3], s[6:7]
23; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
24; GFX7-NEXT:    s_endpgm
25;
26; GFX8-LABEL: test_div_scale_f32_1:
27; GFX8:       ; %bb.0:
28; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
29; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
30; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
31; GFX8-NEXT:    v_mov_b32_e32 v0, s2
32; GFX8-NEXT:    v_mov_b32_e32 v1, s3
33; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
34; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
35; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 4, v0
36; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
37; GFX8-NEXT:    flat_load_dword v0, v[0:1]
38; GFX8-NEXT:    flat_load_dword v1, v[2:3]
39; GFX8-NEXT:    s_waitcnt vmcnt(0)
40; GFX8-NEXT:    v_div_scale_f32 v2, s[2:3], v1, v1, v0
41; GFX8-NEXT:    v_mov_b32_e32 v0, s0
42; GFX8-NEXT:    v_mov_b32_e32 v1, s1
43; GFX8-NEXT:    flat_store_dword v[0:1], v2
44; GFX8-NEXT:    s_endpgm
45;
46; GFX10-LABEL: test_div_scale_f32_1:
47; GFX10:       ; %bb.0:
48; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
49; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
50; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
51; GFX10-NEXT:    s_clause 0x1
52; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
53; GFX10-NEXT:    global_load_dword v0, v0, s[2:3] offset:4
54; GFX10-NEXT:    s_waitcnt vmcnt(0)
55; GFX10-NEXT:    v_div_scale_f32 v0, s2, v0, v0, v1
56; GFX10-NEXT:    v_mov_b32_e32 v1, 0
57; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
58; GFX10-NEXT:    s_endpgm
59  %tid = call i32 @llvm.amdgcn.workitem.id.x()
60  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
61  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
62
63  %a = load volatile float, float addrspace(1)* %gep.0, align 4
64  %b = load volatile float, float addrspace(1)* %gep.1, align 4
65
  ; i1 false - the checks above expect %b, %b, %a as the source operands.
66  %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float %b, i1 false)
67  %result0 = extractvalue { float, i1 } %result, 0
68  store float %result0, float addrspace(1)* %out, align 4
69  ret void
70}
71
; test_div_scale_f32_2 - f32 div.scale with both operands loaded from memory
; and the i1 argument set to true.
; %a and %b are volatile loads of %in[tid] and %in[tid + 1]; only element 0 of
; the returned { float, i1 } is stored to %out.
; For every check prefix the expected v_div_scale_f32 uses the first load (%a)
; as its first and third source operands and the second load (%b) as the second.
72define amdgpu_kernel void @test_div_scale_f32_2(float addrspace(1)* %out, float addrspace(1)* %in) {
73; GFX7-LABEL: test_div_scale_f32_2:
74; GFX7:       ; %bb.0:
75; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
76; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
77; GFX7-NEXT:    v_mov_b32_e32 v1, 0
78; GFX7-NEXT:    s_mov_b32 s6, 0
79; GFX7-NEXT:    s_mov_b32 s7, 0xf000
80; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
81; GFX7-NEXT:    s_mov_b64 s[4:5], s[2:3]
82; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
83; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 offset:4
84; GFX7-NEXT:    s_mov_b32 s6, -1
85; GFX7-NEXT:    s_waitcnt vmcnt(0)
86; GFX7-NEXT:    v_div_scale_f32 v0, s[2:3], v2, v0, v2
87; GFX7-NEXT:    s_mov_b64 s[2:3], s[6:7]
88; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
89; GFX7-NEXT:    s_endpgm
90;
91; GFX8-LABEL: test_div_scale_f32_2:
92; GFX8:       ; %bb.0:
93; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
94; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
95; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
96; GFX8-NEXT:    v_mov_b32_e32 v0, s2
97; GFX8-NEXT:    v_mov_b32_e32 v1, s3
98; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
99; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
100; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 4, v0
101; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
102; GFX8-NEXT:    flat_load_dword v0, v[0:1]
103; GFX8-NEXT:    flat_load_dword v1, v[2:3]
104; GFX8-NEXT:    s_waitcnt vmcnt(0)
105; GFX8-NEXT:    v_div_scale_f32 v2, s[2:3], v0, v1, v0
106; GFX8-NEXT:    v_mov_b32_e32 v0, s0
107; GFX8-NEXT:    v_mov_b32_e32 v1, s1
108; GFX8-NEXT:    flat_store_dword v[0:1], v2
109; GFX8-NEXT:    s_endpgm
110;
111; GFX10-LABEL: test_div_scale_f32_2:
112; GFX10:       ; %bb.0:
113; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
114; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
115; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
116; GFX10-NEXT:    s_clause 0x1
117; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
118; GFX10-NEXT:    global_load_dword v0, v0, s[2:3] offset:4
119; GFX10-NEXT:    s_waitcnt vmcnt(0)
120; GFX10-NEXT:    v_div_scale_f32 v0, s2, v1, v0, v1
121; GFX10-NEXT:    v_mov_b32_e32 v1, 0
122; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
123; GFX10-NEXT:    s_endpgm
124  %tid = call i32 @llvm.amdgcn.workitem.id.x()
125  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
126  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
127
128  %a = load volatile float, float addrspace(1)* %gep.0, align 4
129  %b = load volatile float, float addrspace(1)* %gep.1, align 4
130
  ; i1 true - the checks above expect %a, %b, %a as the source operands.
131  %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float %b, i1 true)
132  %result0 = extractvalue { float, i1 } %result, 0
133  store float %result0, float addrspace(1)* %out, align 4
134  ret void
135}
136
; test_div_scale_f64_1 - f64 div.scale with both operands loaded from memory.
; %a and %b are volatile loads of %in[tid] and %in[tid + 1]. The intrinsic is
; called with its i1 argument false, and only element 0 of the returned
; { double, i1 } is stored to %out. The %aptr argument is never referenced in
; the body.
; For every check prefix the expected v_div_scale_f64 uses the second load (%b)
; as its first two source operands and the first load (%a) as the third.
137define amdgpu_kernel void @test_div_scale_f64_1(double addrspace(1)* %out, double addrspace(1)* %aptr, double addrspace(1)* %in) {
138; GFX7-LABEL: test_div_scale_f64_1:
139; GFX7:       ; %bb.0:
140; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xd
141; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
142; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
143; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
144; GFX7-NEXT:    v_mov_b32_e32 v0, s2
145; GFX7-NEXT:    v_mov_b32_e32 v1, s3
146; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
147; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
148; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 8, v0
149; GFX7-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
150; GFX7-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
151; GFX7-NEXT:    flat_load_dwordx2 v[2:3], v[2:3]
152; GFX7-NEXT:    s_waitcnt vmcnt(0)
153; GFX7-NEXT:    v_div_scale_f64 v[0:1], s[2:3], v[2:3], v[2:3], v[0:1]
154; GFX7-NEXT:    v_mov_b32_e32 v3, s1
155; GFX7-NEXT:    v_mov_b32_e32 v2, s0
156; GFX7-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
157; GFX7-NEXT:    s_endpgm
158;
159; GFX8-LABEL: test_div_scale_f64_1:
160; GFX8:       ; %bb.0:
161; GFX8-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
162; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
163; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
164; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
165; GFX8-NEXT:    v_mov_b32_e32 v0, s2
166; GFX8-NEXT:    v_mov_b32_e32 v1, s3
167; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
168; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
169; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 8, v0
170; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
171; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
172; GFX8-NEXT:    flat_load_dwordx2 v[2:3], v[2:3]
173; GFX8-NEXT:    s_waitcnt vmcnt(0)
174; GFX8-NEXT:    v_div_scale_f64 v[0:1], s[2:3], v[2:3], v[2:3], v[0:1]
175; GFX8-NEXT:    v_mov_b32_e32 v3, s1
176; GFX8-NEXT:    v_mov_b32_e32 v2, s0
177; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
178; GFX8-NEXT:    s_endpgm
179;
180; GFX10-LABEL: test_div_scale_f64_1:
181; GFX10:       ; %bb.0:
182; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
183; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
184; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
185; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
186; GFX10-NEXT:    s_clause 0x1
187; GFX10-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
188; GFX10-NEXT:    global_load_dwordx2 v[2:3], v2, s[2:3] offset:8
189; GFX10-NEXT:    s_waitcnt vmcnt(0)
190; GFX10-NEXT:    v_div_scale_f64 v[0:1], s2, v[2:3], v[2:3], v[0:1]
191; GFX10-NEXT:    v_mov_b32_e32 v2, 0
192; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
193; GFX10-NEXT:    s_endpgm
194  %tid = call i32 @llvm.amdgcn.workitem.id.x()
195  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
196  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
197
198  %a = load volatile double, double addrspace(1)* %gep.0, align 8
199  %b = load volatile double, double addrspace(1)* %gep.1, align 8
200
  ; i1 false - the checks above expect %b, %b, %a as the source operands.
201  %result = call { double, i1 } @llvm.amdgcn.div.scale.f64(double %a, double %b, i1 false)
202  %result0 = extractvalue { double, i1 } %result, 0
203  store double %result0, double addrspace(1)* %out, align 8
204  ret void
205}
206
; test_div_scale_f64_2 - f64 div.scale with both operands loaded from memory
; and the i1 argument set to true.
; %a and %b are volatile loads of %in[tid] and %in[tid + 1]; only element 0 of
; the returned { double, i1 } is stored to %out. The %aptr argument is never
; referenced in the body.
; For every check prefix the expected v_div_scale_f64 uses the first load (%a)
; as its first and third source operands and the second load (%b) as the second.
207define amdgpu_kernel void @test_div_scale_f64_2(double addrspace(1)* %out, double addrspace(1)* %aptr, double addrspace(1)* %in) {
208; GFX7-LABEL: test_div_scale_f64_2:
209; GFX7:       ; %bb.0:
210; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xd
211; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
212; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
213; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
214; GFX7-NEXT:    v_mov_b32_e32 v0, s2
215; GFX7-NEXT:    v_mov_b32_e32 v1, s3
216; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
217; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
218; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 8, v0
219; GFX7-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
220; GFX7-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
221; GFX7-NEXT:    flat_load_dwordx2 v[2:3], v[2:3]
222; GFX7-NEXT:    s_waitcnt vmcnt(0)
223; GFX7-NEXT:    v_div_scale_f64 v[0:1], s[2:3], v[0:1], v[2:3], v[0:1]
224; GFX7-NEXT:    v_mov_b32_e32 v3, s1
225; GFX7-NEXT:    v_mov_b32_e32 v2, s0
226; GFX7-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
227; GFX7-NEXT:    s_endpgm
228;
229; GFX8-LABEL: test_div_scale_f64_2:
230; GFX8:       ; %bb.0:
231; GFX8-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
232; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
233; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
234; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
235; GFX8-NEXT:    v_mov_b32_e32 v0, s2
236; GFX8-NEXT:    v_mov_b32_e32 v1, s3
237; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
238; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
239; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 8, v0
240; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
241; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
242; GFX8-NEXT:    flat_load_dwordx2 v[2:3], v[2:3]
243; GFX8-NEXT:    s_waitcnt vmcnt(0)
244; GFX8-NEXT:    v_div_scale_f64 v[0:1], s[2:3], v[0:1], v[2:3], v[0:1]
245; GFX8-NEXT:    v_mov_b32_e32 v3, s1
246; GFX8-NEXT:    v_mov_b32_e32 v2, s0
247; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
248; GFX8-NEXT:    s_endpgm
249;
250; GFX10-LABEL: test_div_scale_f64_2:
251; GFX10:       ; %bb.0:
252; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
253; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
254; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
255; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
256; GFX10-NEXT:    s_clause 0x1
257; GFX10-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
258; GFX10-NEXT:    global_load_dwordx2 v[2:3], v2, s[2:3] offset:8
259; GFX10-NEXT:    s_waitcnt vmcnt(0)
260; GFX10-NEXT:    v_div_scale_f64 v[0:1], s2, v[0:1], v[2:3], v[0:1]
261; GFX10-NEXT:    v_mov_b32_e32 v2, 0
262; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
263; GFX10-NEXT:    s_endpgm
264  %tid = call i32 @llvm.amdgcn.workitem.id.x()
265  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
266  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
267
268  %a = load volatile double, double addrspace(1)* %gep.0, align 8
269  %b = load volatile double, double addrspace(1)* %gep.1, align 8
270
  ; i1 true - the checks above expect %a, %b, %a as the source operands.
271  %result = call { double, i1 } @llvm.amdgcn.div.scale.f64(double %a, double %b, i1 true)
272  %result0 = extractvalue { double, i1 } %result, 0
273  store double %result0, double addrspace(1)* %out, align 8
274  ret void
275}
276
; test_div_scale_f32_scalar_num_1 - f32 div.scale whose first argument (%a) is
; a by-value kernel argument and whose second argument (%b) is loaded from
; %in[tid]. The intrinsic is called with its i1 argument false, and only
; element 0 of the returned { float, i1 } is stored to %out.
; The unnamed [8 x i32] argument is never referenced in the body; presumably it
; only pushes %a to a later kernarg offset - confirm against the test's intent.
; For every check prefix the expected v_div_scale_f32 uses the loaded value as
; its first two source operands and the s-register holding %a as the third.
277define amdgpu_kernel void @test_div_scale_f32_scalar_num_1(float addrspace(1)* %out, float addrspace(1)* %in, [8 x i32], float %a) {
278; GFX7-LABEL: test_div_scale_f32_scalar_num_1:
279; GFX7:       ; %bb.0:
280; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
281; GFX7-NEXT:    s_load_dword s8, s[0:1], 0x15
282; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
283; GFX7-NEXT:    v_mov_b32_e32 v1, 0
284; GFX7-NEXT:    s_mov_b32 s2, 0
285; GFX7-NEXT:    s_mov_b32 s3, 0xf000
286; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
287; GFX7-NEXT:    s_mov_b64 s[0:1], s[6:7]
288; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
289; GFX7-NEXT:    s_mov_b32 s2, -1
290; GFX7-NEXT:    s_mov_b64 s[6:7], s[2:3]
291; GFX7-NEXT:    s_waitcnt vmcnt(0)
292; GFX7-NEXT:    v_div_scale_f32 v0, s[0:1], v0, v0, s8
293; GFX7-NEXT:    buffer_store_dword v0, off, s[4:7], 0
294; GFX7-NEXT:    s_endpgm
295;
296; GFX8-LABEL: test_div_scale_f32_scalar_num_1:
297; GFX8:       ; %bb.0:
298; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
299; GFX8-NEXT:    s_load_dword s0, s[0:1], 0x54
300; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
301; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
302; GFX8-NEXT:    v_mov_b32_e32 v0, s6
303; GFX8-NEXT:    v_mov_b32_e32 v1, s7
304; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
305; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
306; GFX8-NEXT:    flat_load_dword v0, v[0:1]
307; GFX8-NEXT:    s_waitcnt vmcnt(0)
308; GFX8-NEXT:    v_div_scale_f32 v2, s[0:1], v0, v0, s0
309; GFX8-NEXT:    v_mov_b32_e32 v0, s4
310; GFX8-NEXT:    v_mov_b32_e32 v1, s5
311; GFX8-NEXT:    flat_store_dword v[0:1], v2
312; GFX8-NEXT:    s_endpgm
313;
314; GFX10-LABEL: test_div_scale_f32_scalar_num_1:
315; GFX10:       ; %bb.0:
316; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
317; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
318; GFX10-NEXT:    s_load_dword s0, s[0:1], 0x54
319; GFX10-NEXT:    v_mov_b32_e32 v1, 0
320; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
321; GFX10-NEXT:    global_load_dword v0, v0, s[6:7]
322; GFX10-NEXT:    s_waitcnt vmcnt(0)
323; GFX10-NEXT:    v_div_scale_f32 v0, s0, v0, v0, s0
324; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
325; GFX10-NEXT:    s_endpgm
326  %tid = call i32 @llvm.amdgcn.workitem.id.x()
327  %gep = getelementptr float, float addrspace(1)* %in, i32 %tid
328
329  %b = load float, float addrspace(1)* %gep, align 4
330
  ; i1 false - the checks above expect %b, %b, %a as the source operands.
331  %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float %b, i1 false)
332  %result0 = extractvalue { float, i1 } %result, 0
333  store float %result0, float addrspace(1)* %out, align 4
334  ret void
335}
336
; test_div_scale_f32_scalar_num_2 - f32 div.scale whose first argument (%a) is
; a by-value kernel argument and whose second argument (%b) is loaded from
; %in[tid], with the i1 argument set to true. Only element 0 of the returned
; { float, i1 } is stored to %out.
; For every check prefix the expected v_div_scale_f32 uses the s-register
; holding %a as its first and third source operands and the loaded value as
; the second.
337define amdgpu_kernel void @test_div_scale_f32_scalar_num_2(float addrspace(1)* %out, float addrspace(1)* %in, float %a) {
338; GFX7-LABEL: test_div_scale_f32_scalar_num_2:
339; GFX7:       ; %bb.0:
340; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
341; GFX7-NEXT:    s_load_dword s8, s[0:1], 0xd
342; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
343; GFX7-NEXT:    v_mov_b32_e32 v1, 0
344; GFX7-NEXT:    s_mov_b32 s2, 0
345; GFX7-NEXT:    s_mov_b32 s3, 0xf000
346; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
347; GFX7-NEXT:    s_mov_b64 s[0:1], s[6:7]
348; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
349; GFX7-NEXT:    s_mov_b32 s2, -1
350; GFX7-NEXT:    s_mov_b64 s[6:7], s[2:3]
351; GFX7-NEXT:    s_waitcnt vmcnt(0)
352; GFX7-NEXT:    v_div_scale_f32 v0, s[0:1], s8, v0, s8
353; GFX7-NEXT:    buffer_store_dword v0, off, s[4:7], 0
354; GFX7-NEXT:    s_endpgm
355;
356; GFX8-LABEL: test_div_scale_f32_scalar_num_2:
357; GFX8:       ; %bb.0:
358; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
359; GFX8-NEXT:    s_load_dword s0, s[0:1], 0x34
360; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
361; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
362; GFX8-NEXT:    v_mov_b32_e32 v0, s6
363; GFX8-NEXT:    v_mov_b32_e32 v1, s7
364; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
365; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
366; GFX8-NEXT:    flat_load_dword v0, v[0:1]
367; GFX8-NEXT:    s_waitcnt vmcnt(0)
368; GFX8-NEXT:    v_div_scale_f32 v2, s[0:1], s0, v0, s0
369; GFX8-NEXT:    v_mov_b32_e32 v0, s4
370; GFX8-NEXT:    v_mov_b32_e32 v1, s5
371; GFX8-NEXT:    flat_store_dword v[0:1], v2
372; GFX8-NEXT:    s_endpgm
373;
374; GFX10-LABEL: test_div_scale_f32_scalar_num_2:
375; GFX10:       ; %bb.0:
376; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
377; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
378; GFX10-NEXT:    s_load_dword s0, s[0:1], 0x34
379; GFX10-NEXT:    v_mov_b32_e32 v1, 0
380; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
381; GFX10-NEXT:    global_load_dword v0, v0, s[6:7]
382; GFX10-NEXT:    s_waitcnt vmcnt(0)
383; GFX10-NEXT:    v_div_scale_f32 v0, s0, s0, v0, s0
384; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
385; GFX10-NEXT:    s_endpgm
386  %tid = call i32 @llvm.amdgcn.workitem.id.x()
387  %gep = getelementptr float, float addrspace(1)* %in, i32 %tid
388
389  %b = load float, float addrspace(1)* %gep, align 4
390
  ; i1 true - the checks above expect %a, %b, %a as the source operands.
391  %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float %b, i1 true)
392  %result0 = extractvalue { float, i1 } %result, 0
393  store float %result0, float addrspace(1)* %out, align 4
394  ret void
395}
396
; test_div_scale_f32_scalar_den_1 - f32 div.scale whose second argument (%b) is
; a by-value kernel argument and whose first argument (%a) is loaded from
; %in[tid]. The intrinsic is called with its i1 argument false, and only
; element 0 of the returned { float, i1 } is stored to %out.
; For every check prefix the expected v_div_scale_f32 uses the s-register
; holding %b as its first two source operands and the loaded value as the third.
397define amdgpu_kernel void @test_div_scale_f32_scalar_den_1(float addrspace(1)* %out, float addrspace(1)* %in, float %b) {
398; GFX7-LABEL: test_div_scale_f32_scalar_den_1:
399; GFX7:       ; %bb.0:
400; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
401; GFX7-NEXT:    s_load_dword s8, s[0:1], 0xd
402; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
403; GFX7-NEXT:    v_mov_b32_e32 v1, 0
404; GFX7-NEXT:    s_mov_b32 s2, 0
405; GFX7-NEXT:    s_mov_b32 s3, 0xf000
406; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
407; GFX7-NEXT:    s_mov_b64 s[0:1], s[6:7]
408; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
409; GFX7-NEXT:    s_mov_b32 s2, -1
410; GFX7-NEXT:    s_mov_b64 s[6:7], s[2:3]
411; GFX7-NEXT:    s_waitcnt vmcnt(0)
412; GFX7-NEXT:    v_div_scale_f32 v0, s[0:1], s8, s8, v0
413; GFX7-NEXT:    buffer_store_dword v0, off, s[4:7], 0
414; GFX7-NEXT:    s_endpgm
415;
416; GFX8-LABEL: test_div_scale_f32_scalar_den_1:
417; GFX8:       ; %bb.0:
418; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
419; GFX8-NEXT:    s_load_dword s0, s[0:1], 0x34
420; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
421; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
422; GFX8-NEXT:    v_mov_b32_e32 v0, s6
423; GFX8-NEXT:    v_mov_b32_e32 v1, s7
424; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
425; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
426; GFX8-NEXT:    flat_load_dword v0, v[0:1]
427; GFX8-NEXT:    s_waitcnt vmcnt(0)
428; GFX8-NEXT:    v_div_scale_f32 v2, s[0:1], s0, s0, v0
429; GFX8-NEXT:    v_mov_b32_e32 v0, s4
430; GFX8-NEXT:    v_mov_b32_e32 v1, s5
431; GFX8-NEXT:    flat_store_dword v[0:1], v2
432; GFX8-NEXT:    s_endpgm
433;
434; GFX10-LABEL: test_div_scale_f32_scalar_den_1:
435; GFX10:       ; %bb.0:
436; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
437; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
438; GFX10-NEXT:    s_load_dword s0, s[0:1], 0x34
439; GFX10-NEXT:    v_mov_b32_e32 v1, 0
440; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
441; GFX10-NEXT:    global_load_dword v0, v0, s[6:7]
442; GFX10-NEXT:    s_waitcnt vmcnt(0)
443; GFX10-NEXT:    v_div_scale_f32 v0, s0, s0, s0, v0
444; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
445; GFX10-NEXT:    s_endpgm
446  %tid = call i32 @llvm.amdgcn.workitem.id.x()
447  %gep = getelementptr float, float addrspace(1)* %in, i32 %tid
448
449  %a = load float, float addrspace(1)* %gep, align 4
450
  ; i1 false - the checks above expect %b, %b, %a as the source operands.
451  %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float %b, i1 false)
452  %result0 = extractvalue { float, i1 } %result, 0
453  store float %result0, float addrspace(1)* %out, align 4
454  ret void
455}
456
; test_div_scale_f32_scalar_den_2 - f32 div.scale whose second argument (%b) is
; a by-value kernel argument and whose first argument (%a) is loaded from
; %in[tid], with the i1 argument set to true. Only element 0 of the returned
; { float, i1 } is stored to %out.
; For every check prefix the expected v_div_scale_f32 uses the loaded value as
; its first and third source operands and the s-register holding %b as the
; second.
457define amdgpu_kernel void @test_div_scale_f32_scalar_den_2(float addrspace(1)* %out, float addrspace(1)* %in, float %b) {
458; GFX7-LABEL: test_div_scale_f32_scalar_den_2:
459; GFX7:       ; %bb.0:
460; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
461; GFX7-NEXT:    s_load_dword s8, s[0:1], 0xd
462; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
463; GFX7-NEXT:    v_mov_b32_e32 v1, 0
464; GFX7-NEXT:    s_mov_b32 s2, 0
465; GFX7-NEXT:    s_mov_b32 s3, 0xf000
466; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
467; GFX7-NEXT:    s_mov_b64 s[0:1], s[6:7]
468; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
469; GFX7-NEXT:    s_mov_b32 s2, -1
470; GFX7-NEXT:    s_mov_b64 s[6:7], s[2:3]
471; GFX7-NEXT:    s_waitcnt vmcnt(0)
472; GFX7-NEXT:    v_div_scale_f32 v0, s[0:1], v0, s8, v0
473; GFX7-NEXT:    buffer_store_dword v0, off, s[4:7], 0
474; GFX7-NEXT:    s_endpgm
475;
476; GFX8-LABEL: test_div_scale_f32_scalar_den_2:
477; GFX8:       ; %bb.0:
478; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
479; GFX8-NEXT:    s_load_dword s0, s[0:1], 0x34
480; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
481; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
482; GFX8-NEXT:    v_mov_b32_e32 v0, s6
483; GFX8-NEXT:    v_mov_b32_e32 v1, s7
484; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
485; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
486; GFX8-NEXT:    flat_load_dword v0, v[0:1]
487; GFX8-NEXT:    s_waitcnt vmcnt(0)
488; GFX8-NEXT:    v_div_scale_f32 v2, s[0:1], v0, s0, v0
489; GFX8-NEXT:    v_mov_b32_e32 v0, s4
490; GFX8-NEXT:    v_mov_b32_e32 v1, s5
491; GFX8-NEXT:    flat_store_dword v[0:1], v2
492; GFX8-NEXT:    s_endpgm
493;
494; GFX10-LABEL: test_div_scale_f32_scalar_den_2:
495; GFX10:       ; %bb.0:
496; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
497; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
498; GFX10-NEXT:    s_load_dword s0, s[0:1], 0x34
499; GFX10-NEXT:    v_mov_b32_e32 v1, 0
500; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
501; GFX10-NEXT:    global_load_dword v0, v0, s[6:7]
502; GFX10-NEXT:    s_waitcnt vmcnt(0)
503; GFX10-NEXT:    v_div_scale_f32 v0, s0, v0, s0, v0
504; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
505; GFX10-NEXT:    s_endpgm
506  %tid = call i32 @llvm.amdgcn.workitem.id.x()
507  %gep = getelementptr float, float addrspace(1)* %in, i32 %tid
508
509  %a = load float, float addrspace(1)* %gep, align 4
510
  ; i1 true - the checks above expect %a, %b, %a as the source operands.
511  %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float %b, i1 true)
512  %result0 = extractvalue { float, i1 } %result, 0
513  store float %result0, float addrspace(1)* %out, align 4
514  ret void
515}
516
; test_div_scale_f64_scalar_num_1 - f64 div.scale whose first argument (%a) is
; a by-value kernel argument and whose second argument (%b) is loaded from
; %in[tid]. The intrinsic is called with its i1 argument false, and only
; element 0 of the returned { double, i1 } is stored to %out.
; The unnamed [8 x i32] argument is never referenced in the body.
; For every check prefix the expected v_div_scale_f64 uses the loaded value as
; its first two source operands and the s-register pair holding %a as the third.
517define amdgpu_kernel void @test_div_scale_f64_scalar_num_1(double addrspace(1)* %out, double addrspace(1)* %in, [8 x i32], double %a) {
518; GFX7-LABEL: test_div_scale_f64_scalar_num_1:
519; GFX7:       ; %bb.0:
520; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
521; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x15
522; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
523; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
524; GFX7-NEXT:    v_mov_b32_e32 v0, s6
525; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
526; GFX7-NEXT:    v_mov_b32_e32 v1, s7
527; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
528; GFX7-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
529; GFX7-NEXT:    v_mov_b32_e32 v2, s4
530; GFX7-NEXT:    v_mov_b32_e32 v3, s5
531; GFX7-NEXT:    s_waitcnt vmcnt(0)
532; GFX7-NEXT:    v_div_scale_f64 v[0:1], s[0:1], v[0:1], v[0:1], s[0:1]
533; GFX7-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
534; GFX7-NEXT:    s_endpgm
535;
536; GFX8-LABEL: test_div_scale_f64_scalar_num_1:
537; GFX8:       ; %bb.0:
538; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
539; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x54
540; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
541; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
542; GFX8-NEXT:    v_mov_b32_e32 v0, s6
543; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
544; GFX8-NEXT:    v_mov_b32_e32 v1, s7
545; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
546; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
547; GFX8-NEXT:    v_mov_b32_e32 v2, s4
548; GFX8-NEXT:    v_mov_b32_e32 v3, s5
549; GFX8-NEXT:    s_waitcnt vmcnt(0)
550; GFX8-NEXT:    v_div_scale_f64 v[0:1], s[0:1], v[0:1], v[0:1], s[0:1]
551; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
552; GFX8-NEXT:    s_endpgm
553;
554; GFX10-LABEL: test_div_scale_f64_scalar_num_1:
555; GFX10:       ; %bb.0:
556; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
557; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
558; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x54
559; GFX10-NEXT:    v_mov_b32_e32 v2, 0
560; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
561; GFX10-NEXT:    global_load_dwordx2 v[0:1], v0, s[6:7]
562; GFX10-NEXT:    s_waitcnt vmcnt(0)
563; GFX10-NEXT:    v_div_scale_f64 v[0:1], s0, v[0:1], v[0:1], s[0:1]
564; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
565; GFX10-NEXT:    s_endpgm
566  %tid = call i32 @llvm.amdgcn.workitem.id.x()
567  %gep = getelementptr double, double addrspace(1)* %in, i32 %tid
568
569  %b = load double, double addrspace(1)* %gep, align 8
570
  ; i1 false - the checks above expect %b, %b, %a as the source operands.
571  %result = call { double, i1 } @llvm.amdgcn.div.scale.f64(double %a, double %b, i1 false)
572  %result0 = extractvalue { double, i1 } %result, 0
573  store double %result0, double addrspace(1)* %out, align 8
574  ret void
575}
576
; test_div_scale_f64_scalar_num_2 - f64 div.scale whose first argument (%a) is
; a by-value kernel argument and whose second argument (%b) is loaded from
; %in[tid], with the i1 argument set to true. Only element 0 of the returned
; { double, i1 } is stored to %out.
; The unnamed [8 x i32] argument is never referenced in the body.
; For every check prefix the expected v_div_scale_f64 uses the s-register pair
; holding %a as its first and third source operands and the loaded value as
; the second.
577define amdgpu_kernel void @test_div_scale_f64_scalar_num_2(double addrspace(1)* %out, double addrspace(1)* %in, [8 x i32],  double %a) {
578; GFX7-LABEL: test_div_scale_f64_scalar_num_2:
579; GFX7:       ; %bb.0:
580; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
581; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x15
582; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
583; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
584; GFX7-NEXT:    v_mov_b32_e32 v0, s6
585; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
586; GFX7-NEXT:    v_mov_b32_e32 v1, s7
587; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
588; GFX7-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
589; GFX7-NEXT:    v_mov_b32_e32 v2, s4
590; GFX7-NEXT:    v_mov_b32_e32 v3, s5
591; GFX7-NEXT:    s_waitcnt vmcnt(0)
592; GFX7-NEXT:    v_div_scale_f64 v[0:1], s[0:1], s[0:1], v[0:1], s[0:1]
593; GFX7-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
594; GFX7-NEXT:    s_endpgm
595;
596; GFX8-LABEL: test_div_scale_f64_scalar_num_2:
597; GFX8:       ; %bb.0:
598; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
599; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x54
600; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
601; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
602; GFX8-NEXT:    v_mov_b32_e32 v0, s6
603; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
604; GFX8-NEXT:    v_mov_b32_e32 v1, s7
605; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
606; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
607; GFX8-NEXT:    v_mov_b32_e32 v2, s4
608; GFX8-NEXT:    v_mov_b32_e32 v3, s5
609; GFX8-NEXT:    s_waitcnt vmcnt(0)
610; GFX8-NEXT:    v_div_scale_f64 v[0:1], s[0:1], s[0:1], v[0:1], s[0:1]
611; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
612; GFX8-NEXT:    s_endpgm
613;
614; GFX10-LABEL: test_div_scale_f64_scalar_num_2:
615; GFX10:       ; %bb.0:
616; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
617; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
618; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x54
619; GFX10-NEXT:    v_mov_b32_e32 v2, 0
620; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
621; GFX10-NEXT:    global_load_dwordx2 v[0:1], v0, s[6:7]
622; GFX10-NEXT:    s_waitcnt vmcnt(0)
623; GFX10-NEXT:    v_div_scale_f64 v[0:1], s0, s[0:1], v[0:1], s[0:1]
624; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
625; GFX10-NEXT:    s_endpgm
626  %tid = call i32 @llvm.amdgcn.workitem.id.x()
627  %gep = getelementptr double, double addrspace(1)* %in, i32 %tid
628
629  %b = load double, double addrspace(1)* %gep, align 8
630
  ; i1 true - the checks above expect %a, %b, %a as the source operands.
631  %result = call { double, i1 } @llvm.amdgcn.div.scale.f64(double %a, double %b, i1 true)
632  %result0 = extractvalue { double, i1 } %result, 0
633  store double %result0, double addrspace(1)* %out, align 8
634  ret void
635}
636
; test_div_scale_f64_scalar_den_1 - f64 div.scale whose second argument (%b) is
; a by-value kernel argument and whose first argument (%a) is loaded from
; %in[tid]. The intrinsic is called with its i1 argument false, and only
; element 0 of the returned { double, i1 } is stored to %out.
; The unnamed [8 x i32] argument is never referenced in the body.
; For every check prefix the expected v_div_scale_f64 uses the s-register pair
; holding %b as its first two source operands and the loaded value as the third.
637define amdgpu_kernel void @test_div_scale_f64_scalar_den_1(double addrspace(1)* %out, double addrspace(1)* %in, [8 x i32], double %b) {
638; GFX7-LABEL: test_div_scale_f64_scalar_den_1:
639; GFX7:       ; %bb.0:
640; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
641; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x15
642; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
643; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
644; GFX7-NEXT:    v_mov_b32_e32 v0, s6
645; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
646; GFX7-NEXT:    v_mov_b32_e32 v1, s7
647; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
648; GFX7-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
649; GFX7-NEXT:    v_mov_b32_e32 v2, s4
650; GFX7-NEXT:    v_mov_b32_e32 v3, s5
651; GFX7-NEXT:    s_waitcnt vmcnt(0)
652; GFX7-NEXT:    v_div_scale_f64 v[0:1], s[0:1], s[0:1], s[0:1], v[0:1]
653; GFX7-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
654; GFX7-NEXT:    s_endpgm
655;
656; GFX8-LABEL: test_div_scale_f64_scalar_den_1:
657; GFX8:       ; %bb.0:
658; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
659; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x54
660; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
661; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
662; GFX8-NEXT:    v_mov_b32_e32 v0, s6
663; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
664; GFX8-NEXT:    v_mov_b32_e32 v1, s7
665; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
666; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
667; GFX8-NEXT:    v_mov_b32_e32 v2, s4
668; GFX8-NEXT:    v_mov_b32_e32 v3, s5
669; GFX8-NEXT:    s_waitcnt vmcnt(0)
670; GFX8-NEXT:    v_div_scale_f64 v[0:1], s[0:1], s[0:1], s[0:1], v[0:1]
671; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
672; GFX8-NEXT:    s_endpgm
673;
674; GFX10-LABEL: test_div_scale_f64_scalar_den_1:
675; GFX10:       ; %bb.0:
676; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
677; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
678; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x54
679; GFX10-NEXT:    v_mov_b32_e32 v2, 0
680; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
681; GFX10-NEXT:    global_load_dwordx2 v[0:1], v0, s[6:7]
682; GFX10-NEXT:    s_waitcnt vmcnt(0)
683; GFX10-NEXT:    v_div_scale_f64 v[0:1], s0, s[0:1], s[0:1], v[0:1]
684; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
685; GFX10-NEXT:    s_endpgm
686  %tid = call i32 @llvm.amdgcn.workitem.id.x()
687  %gep = getelementptr double, double addrspace(1)* %in, i32 %tid
688
689  %a = load double, double addrspace(1)* %gep, align 8
690
  ; i1 false - the checks above expect %b, %b, %a as the source operands.
691  %result = call { double, i1 } @llvm.amdgcn.div.scale.f64(double %a, double %b, i1 false)
692  %result0 = extractvalue { double, i1 } %result, 0
693  store double %result0, double addrspace(1)* %out, align 8
694  ret void
695}
696
; test_div_scale_f64_scalar_den_2 - f64 div.scale whose second argument (%b) is
; a by-value kernel argument and whose first argument (%a) is loaded from
; %in[tid], with the i1 argument set to true. Only element 0 of the returned
; { double, i1 } is stored to %out.
; The unnamed [8 x i32] argument is never referenced in the body.
; For every check prefix the expected v_div_scale_f64 uses the loaded value as
; its first and third source operands and the s-register pair holding %b as
; the second.
697define amdgpu_kernel void @test_div_scale_f64_scalar_den_2(double addrspace(1)* %out, double addrspace(1)* %in, [8 x i32], double %b) {
698; GFX7-LABEL: test_div_scale_f64_scalar_den_2:
699; GFX7:       ; %bb.0:
700; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
701; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x15
702; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
703; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
704; GFX7-NEXT:    v_mov_b32_e32 v0, s6
705; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
706; GFX7-NEXT:    v_mov_b32_e32 v1, s7
707; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
708; GFX7-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
709; GFX7-NEXT:    v_mov_b32_e32 v2, s4
710; GFX7-NEXT:    v_mov_b32_e32 v3, s5
711; GFX7-NEXT:    s_waitcnt vmcnt(0)
712; GFX7-NEXT:    v_div_scale_f64 v[0:1], s[0:1], v[0:1], s[0:1], v[0:1]
713; GFX7-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
714; GFX7-NEXT:    s_endpgm
715;
716; GFX8-LABEL: test_div_scale_f64_scalar_den_2:
717; GFX8:       ; %bb.0:
718; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
719; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x54
720; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
721; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
722; GFX8-NEXT:    v_mov_b32_e32 v0, s6
723; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
724; GFX8-NEXT:    v_mov_b32_e32 v1, s7
725; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
726; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
727; GFX8-NEXT:    v_mov_b32_e32 v2, s4
728; GFX8-NEXT:    v_mov_b32_e32 v3, s5
729; GFX8-NEXT:    s_waitcnt vmcnt(0)
730; GFX8-NEXT:    v_div_scale_f64 v[0:1], s[0:1], v[0:1], s[0:1], v[0:1]
731; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
732; GFX8-NEXT:    s_endpgm
733;
734; GFX10-LABEL: test_div_scale_f64_scalar_den_2:
735; GFX10:       ; %bb.0:
736; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
737; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
738; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x54
739; GFX10-NEXT:    v_mov_b32_e32 v2, 0
740; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
741; GFX10-NEXT:    global_load_dwordx2 v[0:1], v0, s[6:7]
742; GFX10-NEXT:    s_waitcnt vmcnt(0)
743; GFX10-NEXT:    v_div_scale_f64 v[0:1], s0, v[0:1], s[0:1], v[0:1]
744; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
745; GFX10-NEXT:    s_endpgm
746  %tid = call i32 @llvm.amdgcn.workitem.id.x()
747  %gep = getelementptr double, double addrspace(1)* %in, i32 %tid
748
749  %a = load double, double addrspace(1)* %gep, align 8
750
  ; i1 true - the checks above expect %a, %b, %a as the source operands.
751  %result = call { double, i1 } @llvm.amdgcn.div.scale.f64(double %a, double %b, i1 true)
752  %result0 = extractvalue { double, i1 } %result, 0
753  store double %result0, double addrspace(1)* %out, align 8
754  ret void
755}
756
; f32 div.scale with both %a and %b supplied as kernel arguments and the i1
; selector set to false. The expected v_div_scale_f32 uses %b for its first two
; sources and %a for the third. On GFX7 and GFX8 the checks expect %b to be
; copied into a VGPR first; on GFX10 both SGPRs are used directly.
; Only element 0 of the returned struct is stored to %out.
757define amdgpu_kernel void @test_div_scale_f32_all_scalar_1(float addrspace(1)* %out, [8 x i32], float %a, [8 x i32], float %b) {
758; GFX7-LABEL: test_div_scale_f32_all_scalar_1:
759; GFX7:       ; %bb.0:
760; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
761; GFX7-NEXT:    s_load_dword s2, s[0:1], 0x13
762; GFX7-NEXT:    s_load_dword s0, s[0:1], 0x1c
763; GFX7-NEXT:    s_mov_b32 s6, -1
764; GFX7-NEXT:    s_mov_b32 s7, 0xf000
765; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
766; GFX7-NEXT:    v_mov_b32_e32 v0, s0
767; GFX7-NEXT:    v_div_scale_f32 v0, s[0:1], v0, v0, s2
768; GFX7-NEXT:    buffer_store_dword v0, off, s[4:7], 0
769; GFX7-NEXT:    s_endpgm
770;
771; GFX8-LABEL: test_div_scale_f32_all_scalar_1:
772; GFX8:       ; %bb.0:
773; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x4c
774; GFX8-NEXT:    s_load_dword s3, s[0:1], 0x70
775; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
776; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
777; GFX8-NEXT:    v_mov_b32_e32 v0, s3
778; GFX8-NEXT:    v_div_scale_f32 v2, s[2:3], v0, v0, s2
779; GFX8-NEXT:    v_mov_b32_e32 v0, s0
780; GFX8-NEXT:    v_mov_b32_e32 v1, s1
781; GFX8-NEXT:    flat_store_dword v[0:1], v2
782; GFX8-NEXT:    s_endpgm
783;
784; GFX10-LABEL: test_div_scale_f32_all_scalar_1:
785; GFX10:       ; %bb.0:
786; GFX10-NEXT:    s_clause 0x2
787; GFX10-NEXT:    s_load_dword s2, s[0:1], 0x4c
788; GFX10-NEXT:    s_load_dword s3, s[0:1], 0x70
789; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
790; GFX10-NEXT:    v_mov_b32_e32 v1, 0
791; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
792; GFX10-NEXT:    v_div_scale_f32 v0, s2, s3, s3, s2
793; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
794; GFX10-NEXT:    s_endpgm
795  %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float %b, i1 false)
796  %result0 = extractvalue { float, i1 } %result, 0
797  store float %result0, float addrspace(1)* %out, align 4
798  ret void
799}
800
; f32 div.scale with both %a and %b supplied as kernel arguments and the i1
; selector set to true. The expected v_div_scale_f32 uses %a for its first and
; third sources and %b for the second. On GFX7 and GFX8 the checks expect %b to
; be copied into a VGPR first; on GFX10 both SGPRs are used directly.
; Only element 0 of the returned struct is stored to %out.
801define amdgpu_kernel void @test_div_scale_f32_all_scalar_2(float addrspace(1)* %out, [8 x i32], float %a, [8 x i32], float %b) {
802; GFX7-LABEL: test_div_scale_f32_all_scalar_2:
803; GFX7:       ; %bb.0:
804; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
805; GFX7-NEXT:    s_load_dword s2, s[0:1], 0x13
806; GFX7-NEXT:    s_load_dword s0, s[0:1], 0x1c
807; GFX7-NEXT:    s_mov_b32 s6, -1
808; GFX7-NEXT:    s_mov_b32 s7, 0xf000
809; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
810; GFX7-NEXT:    v_mov_b32_e32 v0, s0
811; GFX7-NEXT:    v_div_scale_f32 v0, s[0:1], s2, v0, s2
812; GFX7-NEXT:    buffer_store_dword v0, off, s[4:7], 0
813; GFX7-NEXT:    s_endpgm
814;
815; GFX8-LABEL: test_div_scale_f32_all_scalar_2:
816; GFX8:       ; %bb.0:
817; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x4c
818; GFX8-NEXT:    s_load_dword s3, s[0:1], 0x70
819; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
820; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
821; GFX8-NEXT:    v_mov_b32_e32 v0, s3
822; GFX8-NEXT:    v_div_scale_f32 v2, s[2:3], s2, v0, s2
823; GFX8-NEXT:    v_mov_b32_e32 v0, s0
824; GFX8-NEXT:    v_mov_b32_e32 v1, s1
825; GFX8-NEXT:    flat_store_dword v[0:1], v2
826; GFX8-NEXT:    s_endpgm
827;
828; GFX10-LABEL: test_div_scale_f32_all_scalar_2:
829; GFX10:       ; %bb.0:
830; GFX10-NEXT:    s_clause 0x2
831; GFX10-NEXT:    s_load_dword s2, s[0:1], 0x4c
832; GFX10-NEXT:    s_load_dword s3, s[0:1], 0x70
833; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
834; GFX10-NEXT:    v_mov_b32_e32 v1, 0
835; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
836; GFX10-NEXT:    v_div_scale_f32 v0, s2, s2, s3, s2
837; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
838; GFX10-NEXT:    s_endpgm
839  %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float %b, i1 true)
840  %result0 = extractvalue { float, i1 } %result, 0
841  store float %result0, float addrspace(1)* %out, align 4
842  ret void
843}
844
; f64 div.scale with both %a and %b supplied as kernel arguments and the i1
; selector set to false. The expected v_div_scale_f64 uses %b for its first two
; sources and %a for the third. On GFX7 and GFX8 the checks expect %b to be
; copied into a VGPR pair first; on GFX10 both SGPR pairs are used directly.
; Only element 0 of the returned struct is stored to %out.
845define amdgpu_kernel void @test_div_scale_f64_all_scalar_1(double addrspace(1)* %out, [8 x i32], double %a, [8 x i32], double %b) {
846; GFX7-LABEL: test_div_scale_f64_all_scalar_1:
847; GFX7:       ; %bb.0:
848; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x13
849; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x1d
850; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
851; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
852; GFX7-NEXT:    v_mov_b32_e32 v0, s4
853; GFX7-NEXT:    v_mov_b32_e32 v1, s5
854; GFX7-NEXT:    v_div_scale_f64 v[0:1], s[2:3], v[0:1], v[0:1], s[2:3]
855; GFX7-NEXT:    v_mov_b32_e32 v3, s1
856; GFX7-NEXT:    v_mov_b32_e32 v2, s0
857; GFX7-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
858; GFX7-NEXT:    s_endpgm
859;
860; GFX8-LABEL: test_div_scale_f64_all_scalar_1:
861; GFX8:       ; %bb.0:
862; GFX8-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x4c
863; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x74
864; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
865; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
866; GFX8-NEXT:    v_mov_b32_e32 v0, s4
867; GFX8-NEXT:    v_mov_b32_e32 v1, s5
868; GFX8-NEXT:    v_div_scale_f64 v[0:1], s[2:3], v[0:1], v[0:1], s[2:3]
869; GFX8-NEXT:    v_mov_b32_e32 v3, s1
870; GFX8-NEXT:    v_mov_b32_e32 v2, s0
871; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
872; GFX8-NEXT:    s_endpgm
873;
874; GFX10-LABEL: test_div_scale_f64_all_scalar_1:
875; GFX10:       ; %bb.0:
876; GFX10-NEXT:    s_clause 0x2
877; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x4c
878; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x74
879; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
880; GFX10-NEXT:    v_mov_b32_e32 v2, 0
881; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
882; GFX10-NEXT:    v_div_scale_f64 v[0:1], s2, s[4:5], s[4:5], s[2:3]
883; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
884; GFX10-NEXT:    s_endpgm
885  %result = call { double, i1 } @llvm.amdgcn.div.scale.f64(double %a, double %b, i1 false)
886  %result0 = extractvalue { double, i1 } %result, 0
887  store double %result0, double addrspace(1)* %out, align 8
888  ret void
889}
890
; f64 div.scale with both %a and %b supplied as kernel arguments and the i1
; selector set to true. The expected v_div_scale_f64 uses %a for its first and
; third sources and %b for the second. On GFX7 and GFX8 the checks expect %b to
; be copied into a VGPR pair first; on GFX10 both SGPR pairs are used directly.
; Only element 0 of the returned struct is stored to %out.
891define amdgpu_kernel void @test_div_scale_f64_all_scalar_2(double addrspace(1)* %out, [8 x i32], double %a, [8 x i32], double %b) {
892; GFX7-LABEL: test_div_scale_f64_all_scalar_2:
893; GFX7:       ; %bb.0:
894; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x13
895; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x1d
896; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
897; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
898; GFX7-NEXT:    v_mov_b32_e32 v0, s4
899; GFX7-NEXT:    v_mov_b32_e32 v1, s5
900; GFX7-NEXT:    v_div_scale_f64 v[0:1], s[2:3], s[2:3], v[0:1], s[2:3]
901; GFX7-NEXT:    v_mov_b32_e32 v3, s1
902; GFX7-NEXT:    v_mov_b32_e32 v2, s0
903; GFX7-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
904; GFX7-NEXT:    s_endpgm
905;
906; GFX8-LABEL: test_div_scale_f64_all_scalar_2:
907; GFX8:       ; %bb.0:
908; GFX8-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x4c
909; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x74
910; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
911; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
912; GFX8-NEXT:    v_mov_b32_e32 v0, s4
913; GFX8-NEXT:    v_mov_b32_e32 v1, s5
914; GFX8-NEXT:    v_div_scale_f64 v[0:1], s[2:3], s[2:3], v[0:1], s[2:3]
915; GFX8-NEXT:    v_mov_b32_e32 v3, s1
916; GFX8-NEXT:    v_mov_b32_e32 v2, s0
917; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
918; GFX8-NEXT:    s_endpgm
919;
920; GFX10-LABEL: test_div_scale_f64_all_scalar_2:
921; GFX10:       ; %bb.0:
922; GFX10-NEXT:    s_clause 0x2
923; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x4c
924; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x74
925; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
926; GFX10-NEXT:    v_mov_b32_e32 v2, 0
927; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
928; GFX10-NEXT:    v_div_scale_f64 v[0:1], s2, s[2:3], s[4:5], s[2:3]
929; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
930; GFX10-NEXT:    s_endpgm
931  %result = call { double, i1 } @llvm.amdgcn.div.scale.f64(double %a, double %b, i1 true)
932  %result0 = extractvalue { double, i1 } %result, 0
933  store double %result0, double addrspace(1)* %out, align 8
934  ret void
935}
936
; f32 div.scale whose first intrinsic argument is the constant 1.0 and whose
; second is a value loaded per work-item from %in, with the i1 selector set to
; false. All three targets expect the constant to appear directly as the
; `1.0` third source of v_div_scale_f32, with the loaded value in the first
; two sources. Only element 0 of the returned struct is stored to %out.
937define amdgpu_kernel void @test_div_scale_f32_inline_imm_num(float addrspace(1)* %out, float addrspace(1)* %in) {
938; GFX7-LABEL: test_div_scale_f32_inline_imm_num:
939; GFX7:       ; %bb.0:
940; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
941; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
942; GFX7-NEXT:    v_mov_b32_e32 v1, 0
943; GFX7-NEXT:    s_mov_b32 s6, 0
944; GFX7-NEXT:    s_mov_b32 s7, 0xf000
945; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
946; GFX7-NEXT:    s_mov_b64 s[4:5], s[2:3]
947; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
948; GFX7-NEXT:    s_mov_b32 s6, -1
949; GFX7-NEXT:    s_waitcnt vmcnt(0)
950; GFX7-NEXT:    v_div_scale_f32 v0, s[2:3], v0, v0, 1.0
951; GFX7-NEXT:    s_mov_b64 s[2:3], s[6:7]
952; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
953; GFX7-NEXT:    s_endpgm
954;
955; GFX8-LABEL: test_div_scale_f32_inline_imm_num:
956; GFX8:       ; %bb.0:
957; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
958; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
959; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
960; GFX8-NEXT:    v_mov_b32_e32 v0, s2
961; GFX8-NEXT:    v_mov_b32_e32 v1, s3
962; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
963; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
964; GFX8-NEXT:    flat_load_dword v0, v[0:1]
965; GFX8-NEXT:    s_waitcnt vmcnt(0)
966; GFX8-NEXT:    v_div_scale_f32 v2, s[2:3], v0, v0, 1.0
967; GFX8-NEXT:    v_mov_b32_e32 v0, s0
968; GFX8-NEXT:    v_mov_b32_e32 v1, s1
969; GFX8-NEXT:    flat_store_dword v[0:1], v2
970; GFX8-NEXT:    s_endpgm
971;
972; GFX10-LABEL: test_div_scale_f32_inline_imm_num:
973; GFX10:       ; %bb.0:
974; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
975; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
976; GFX10-NEXT:    v_mov_b32_e32 v1, 0
977; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
978; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
979; GFX10-NEXT:    s_waitcnt vmcnt(0)
980; GFX10-NEXT:    v_div_scale_f32 v0, s2, v0, v0, 1.0
981; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
982; GFX10-NEXT:    s_endpgm
983  %tid = call i32 @llvm.amdgcn.workitem.id.x()
984  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
985  %a = load float, float addrspace(1)* %gep.0, align 4
986
987  %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float 1.0, float %a, i1 false)
988  %result0 = extractvalue { float, i1 } %result, 0
989  store float %result0, float addrspace(1)* %out, align 4
990  ret void
991}
992
; f32 div.scale whose first intrinsic argument is a value loaded per work-item
; from %in and whose second is the constant 2.0, with the i1 selector set to
; false. All three targets expect the constant to appear directly as `2.0` in
; both the first and second sources of v_div_scale_f32, with the loaded value
; as the third source. Only element 0 of the returned struct is stored to %out.
993define amdgpu_kernel void @test_div_scale_f32_inline_imm_den(float addrspace(1)* %out, float addrspace(1)* %in) {
994; GFX7-LABEL: test_div_scale_f32_inline_imm_den:
995; GFX7:       ; %bb.0:
996; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
997; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
998; GFX7-NEXT:    v_mov_b32_e32 v1, 0
999; GFX7-NEXT:    s_mov_b32 s6, 0
1000; GFX7-NEXT:    s_mov_b32 s7, 0xf000
1001; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1002; GFX7-NEXT:    s_mov_b64 s[4:5], s[2:3]
1003; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
1004; GFX7-NEXT:    s_mov_b32 s6, -1
1005; GFX7-NEXT:    s_waitcnt vmcnt(0)
1006; GFX7-NEXT:    v_div_scale_f32 v0, s[2:3], 2.0, 2.0, v0
1007; GFX7-NEXT:    s_mov_b64 s[2:3], s[6:7]
1008; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1009; GFX7-NEXT:    s_endpgm
1010;
1011; GFX8-LABEL: test_div_scale_f32_inline_imm_den:
1012; GFX8:       ; %bb.0:
1013; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1014; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
1015; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1016; GFX8-NEXT:    v_mov_b32_e32 v0, s2
1017; GFX8-NEXT:    v_mov_b32_e32 v1, s3
1018; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
1019; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1020; GFX8-NEXT:    flat_load_dword v0, v[0:1]
1021; GFX8-NEXT:    s_waitcnt vmcnt(0)
1022; GFX8-NEXT:    v_div_scale_f32 v2, s[2:3], 2.0, 2.0, v0
1023; GFX8-NEXT:    v_mov_b32_e32 v0, s0
1024; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1025; GFX8-NEXT:    flat_store_dword v[0:1], v2
1026; GFX8-NEXT:    s_endpgm
1027;
1028; GFX10-LABEL: test_div_scale_f32_inline_imm_den:
1029; GFX10:       ; %bb.0:
1030; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1031; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1032; GFX10-NEXT:    v_mov_b32_e32 v1, 0
1033; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1034; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
1035; GFX10-NEXT:    s_waitcnt vmcnt(0)
1036; GFX10-NEXT:    v_div_scale_f32 v0, s2, 2.0, 2.0, v0
1037; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
1038; GFX10-NEXT:    s_endpgm
1039  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1040  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
1041  %a = load float, float addrspace(1)* %gep.0, align 4
1042
1043  %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float 2.0, i1 false)
1044  %result0 = extractvalue { float, i1 } %result, 0
1045  store float %result0, float addrspace(1)* %out, align 4
1046  ret void
1047}
1048
; f32 div.scale where llvm.fabs.f32 is applied to the first intrinsic argument.
; %a and %b are volatile loads of two consecutive floats starting at %in[tid].
; The checks on all three targets expect the fabs to be emitted as a separate
; v_and_b32 with mask 0x7fffffff, and v_div_scale_f32 then takes the masked
; register as a plain third source rather than carrying a source modifier.
; Only element 0 of the returned struct is stored to %out.
1049define amdgpu_kernel void @test_div_scale_f32_fabs_num(float addrspace(1)* %out, float addrspace(1)* %in) {
1050; GFX7-LABEL: test_div_scale_f32_fabs_num:
1051; GFX7:       ; %bb.0:
1052; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
1053; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1054; GFX7-NEXT:    v_mov_b32_e32 v1, 0
1055; GFX7-NEXT:    s_mov_b32 s6, 0
1056; GFX7-NEXT:    s_mov_b32 s7, 0xf000
1057; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1058; GFX7-NEXT:    s_mov_b64 s[4:5], s[2:3]
1059; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
1060; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 offset:4
1061; GFX7-NEXT:    s_mov_b32 s6, -1
1062; GFX7-NEXT:    s_waitcnt vmcnt(1)
1063; GFX7-NEXT:    v_and_b32_e32 v1, 0x7fffffff, v2
1064; GFX7-NEXT:    s_waitcnt vmcnt(0)
1065; GFX7-NEXT:    v_div_scale_f32 v0, s[2:3], v0, v0, v1
1066; GFX7-NEXT:    s_mov_b64 s[2:3], s[6:7]
1067; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1068; GFX7-NEXT:    s_endpgm
1069;
1070; GFX8-LABEL: test_div_scale_f32_fabs_num:
1071; GFX8:       ; %bb.0:
1072; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1073; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
1074; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1075; GFX8-NEXT:    v_mov_b32_e32 v0, s2
1076; GFX8-NEXT:    v_mov_b32_e32 v1, s3
1077; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
1078; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1079; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 4, v0
1080; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
1081; GFX8-NEXT:    flat_load_dword v0, v[0:1]
1082; GFX8-NEXT:    flat_load_dword v1, v[2:3]
1083; GFX8-NEXT:    s_waitcnt vmcnt(1)
1084; GFX8-NEXT:    v_and_b32_e32 v0, 0x7fffffff, v0
1085; GFX8-NEXT:    s_waitcnt vmcnt(0)
1086; GFX8-NEXT:    v_div_scale_f32 v2, s[2:3], v1, v1, v0
1087; GFX8-NEXT:    v_mov_b32_e32 v0, s0
1088; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1089; GFX8-NEXT:    flat_store_dword v[0:1], v2
1090; GFX8-NEXT:    s_endpgm
1091;
1092; GFX10-LABEL: test_div_scale_f32_fabs_num:
1093; GFX10:       ; %bb.0:
1094; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1095; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1096; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1097; GFX10-NEXT:    s_clause 0x1
1098; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
1099; GFX10-NEXT:    global_load_dword v0, v0, s[2:3] offset:4
1100; GFX10-NEXT:    s_waitcnt vmcnt(1)
1101; GFX10-NEXT:    v_and_b32_e32 v1, 0x7fffffff, v1
1102; GFX10-NEXT:    s_waitcnt vmcnt(0)
1103; GFX10-NEXT:    v_div_scale_f32 v0, s2, v0, v0, v1
1104; GFX10-NEXT:    v_mov_b32_e32 v1, 0
1105; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
1106; GFX10-NEXT:    s_endpgm
1107  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1108  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
1109  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
1110
1111  %a = load volatile float, float addrspace(1)* %gep.0, align 4
1112  %b = load volatile float, float addrspace(1)* %gep.1, align 4
1113
1114  %a.fabs = call float @llvm.fabs.f32(float %a)
1115
1116  %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a.fabs, float %b, i1 false)
1117  %result0 = extractvalue { float, i1 } %result, 0
1118  store float %result0, float addrspace(1)* %out, align 4
1119  ret void
1120}
1121
; f32 div.scale where llvm.fabs.f32 is applied to the second intrinsic
; argument. %a and %b are volatile loads of two consecutive floats starting at
; %in[tid]. The checks on all three targets expect the fabs to be emitted as a
; separate v_and_b32 with mask 0x7fffffff on the second loaded value, and
; v_div_scale_f32 then takes the masked register as its first two sources
; rather than carrying a source modifier.
; Only element 0 of the returned struct is stored to %out.
1122define amdgpu_kernel void @test_div_scale_f32_fabs_den(float addrspace(1)* %out, float addrspace(1)* %in) {
1123; GFX7-LABEL: test_div_scale_f32_fabs_den:
1124; GFX7:       ; %bb.0:
1125; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
1126; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1127; GFX7-NEXT:    v_mov_b32_e32 v1, 0
1128; GFX7-NEXT:    s_mov_b32 s6, 0
1129; GFX7-NEXT:    s_mov_b32 s7, 0xf000
1130; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1131; GFX7-NEXT:    s_mov_b64 s[4:5], s[2:3]
1132; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
1133; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 offset:4
1134; GFX7-NEXT:    s_mov_b32 s6, -1
1135; GFX7-NEXT:    s_waitcnt vmcnt(0)
1136; GFX7-NEXT:    v_and_b32_e32 v0, 0x7fffffff, v0
1137; GFX7-NEXT:    v_div_scale_f32 v0, s[2:3], v0, v0, v2
1138; GFX7-NEXT:    s_mov_b64 s[2:3], s[6:7]
1139; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1140; GFX7-NEXT:    s_endpgm
1141;
1142; GFX8-LABEL: test_div_scale_f32_fabs_den:
1143; GFX8:       ; %bb.0:
1144; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1145; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
1146; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1147; GFX8-NEXT:    v_mov_b32_e32 v0, s2
1148; GFX8-NEXT:    v_mov_b32_e32 v1, s3
1149; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
1150; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1151; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 4, v0
1152; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
1153; GFX8-NEXT:    flat_load_dword v0, v[0:1]
1154; GFX8-NEXT:    flat_load_dword v1, v[2:3]
1155; GFX8-NEXT:    s_waitcnt vmcnt(0)
1156; GFX8-NEXT:    v_and_b32_e32 v1, 0x7fffffff, v1
1157; GFX8-NEXT:    v_div_scale_f32 v2, s[2:3], v1, v1, v0
1158; GFX8-NEXT:    v_mov_b32_e32 v0, s0
1159; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1160; GFX8-NEXT:    flat_store_dword v[0:1], v2
1161; GFX8-NEXT:    s_endpgm
1162;
1163; GFX10-LABEL: test_div_scale_f32_fabs_den:
1164; GFX10:       ; %bb.0:
1165; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1166; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1167; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1168; GFX10-NEXT:    s_clause 0x1
1169; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
1170; GFX10-NEXT:    global_load_dword v0, v0, s[2:3] offset:4
1171; GFX10-NEXT:    s_waitcnt vmcnt(0)
1172; GFX10-NEXT:    v_and_b32_e32 v0, 0x7fffffff, v0
1173; GFX10-NEXT:    v_div_scale_f32 v0, s2, v0, v0, v1
1174; GFX10-NEXT:    v_mov_b32_e32 v1, 0
1175; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
1176; GFX10-NEXT:    s_endpgm
1177  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1178  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
1179  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
1180
1181  %a = load volatile float, float addrspace(1)* %gep.0, align 4
1182  %b = load volatile float, float addrspace(1)* %gep.1, align 4
1183
1184  %b.fabs = call float @llvm.fabs.f32(float %b)
1185
1186  %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float %b.fabs, i1 false)
1187  %result0 = extractvalue { float, i1 } %result, 0
1188  store float %result0, float addrspace(1)* %out, align 4
1189  ret void
1190}
1191
; f32 div.scale with the constant 8.0 as the first intrinsic argument and undef
; as the second, with the i1 selector set to false. The expected output still
; contains a v_div_scale_f32: the undef operand shows up as s0 in the first
; two sources, and 8.0 appears as the bit pattern 0x41000000 in the third.
; On GFX7 and GFX8 that constant is first moved into v0; on GFX10 it is used
; as a literal operand of the instruction itself.
; Only element 0 of the returned struct is stored to %out.
1192define amdgpu_kernel void @test_div_scale_f32_val_undef_val(float addrspace(1)* %out) #0 {
1193; GFX7-LABEL: test_div_scale_f32_val_undef_val:
1194; GFX7:       ; %bb.0:
1195; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1196; GFX7-NEXT:    v_mov_b32_e32 v0, 0x41000000
1197; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1198; GFX7-NEXT:    v_div_scale_f32 v0, s[2:3], s0, s0, v0
1199; GFX7-NEXT:    s_mov_b32 s2, -1
1200; GFX7-NEXT:    s_mov_b32 s3, 0xf000
1201; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1202; GFX7-NEXT:    s_endpgm
1203;
1204; GFX8-LABEL: test_div_scale_f32_val_undef_val:
1205; GFX8:       ; %bb.0:
1206; GFX8-NEXT:    v_mov_b32_e32 v0, 0x41000000
1207; GFX8-NEXT:    v_div_scale_f32 v2, s[2:3], s0, s0, v0
1208; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1209; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1210; GFX8-NEXT:    v_mov_b32_e32 v0, s0
1211; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1212; GFX8-NEXT:    flat_store_dword v[0:1], v2
1213; GFX8-NEXT:    s_endpgm
1214;
1215; GFX10-LABEL: test_div_scale_f32_val_undef_val:
1216; GFX10:       ; %bb.0:
1217; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1218; GFX10-NEXT:    v_mov_b32_e32 v1, 0
1219; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1220; GFX10-NEXT:    v_div_scale_f32 v0, s2, s0, s0, 0x41000000
1221; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
1222; GFX10-NEXT:    s_endpgm
1223  %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float 8.0, float undef, i1 false)
1224  %result0 = extractvalue { float, i1 } %result, 0
1225  store float %result0, float addrspace(1)* %out, align 4
1226  ret void
1227}
1228
; f32 div.scale with undef as the first intrinsic argument and the constant 8.0
; as the second, with the i1 selector set to false. The expected output still
; contains a v_div_scale_f32: 8.0 is materialized in v0 as the bit pattern
; 0x41000000 and used for the first two sources on all three targets, while
; the undef operand shows up as s0 in the third source.
; Only element 0 of the returned struct is stored to %out.
1229define amdgpu_kernel void @test_div_scale_f32_undef_val_val(float addrspace(1)* %out) #0 {
1230; GFX7-LABEL: test_div_scale_f32_undef_val_val:
1231; GFX7:       ; %bb.0:
1232; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1233; GFX7-NEXT:    v_mov_b32_e32 v0, 0x41000000
1234; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1235; GFX7-NEXT:    v_div_scale_f32 v0, s[2:3], v0, v0, s0
1236; GFX7-NEXT:    s_mov_b32 s2, -1
1237; GFX7-NEXT:    s_mov_b32 s3, 0xf000
1238; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1239; GFX7-NEXT:    s_endpgm
1240;
1241; GFX8-LABEL: test_div_scale_f32_undef_val_val:
1242; GFX8:       ; %bb.0:
1243; GFX8-NEXT:    v_mov_b32_e32 v0, 0x41000000
1244; GFX8-NEXT:    v_div_scale_f32 v2, s[2:3], v0, v0, s0
1245; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1246; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1247; GFX8-NEXT:    v_mov_b32_e32 v0, s0
1248; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1249; GFX8-NEXT:    flat_store_dword v[0:1], v2
1250; GFX8-NEXT:    s_endpgm
1251;
1252; GFX10-LABEL: test_div_scale_f32_undef_val_val:
1253; GFX10:       ; %bb.0:
1254; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1255; GFX10-NEXT:    v_mov_b32_e32 v0, 0x41000000
1256; GFX10-NEXT:    v_mov_b32_e32 v1, 0
1257; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1258; GFX10-NEXT:    v_div_scale_f32 v0, s2, v0, v0, s0
1259; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
1260; GFX10-NEXT:    s_endpgm
1261  %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float undef, float 8.0, i1 false)
1262  %result0 = extractvalue { float, i1 } %result, 0
1263  store float %result0, float addrspace(1)* %out, align 4
1264  ret void
1265}
1266
; f32 div.scale with undef for both value arguments and the i1 selector set to
; false. The expected output still contains a v_div_scale_f32 on all three
; targets, with s0 used for all three sources.
; Only element 0 of the returned struct is stored to %out.
1267define amdgpu_kernel void @test_div_scale_f32_undef_undef_val(float addrspace(1)* %out) #0 {
1268; GFX7-LABEL: test_div_scale_f32_undef_undef_val:
1269; GFX7:       ; %bb.0:
1270; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1271; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1272; GFX7-NEXT:    v_div_scale_f32 v0, s[2:3], s0, s0, s0
1273; GFX7-NEXT:    s_mov_b32 s2, -1
1274; GFX7-NEXT:    s_mov_b32 s3, 0xf000
1275; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1276; GFX7-NEXT:    s_endpgm
1277;
1278; GFX8-LABEL: test_div_scale_f32_undef_undef_val:
1279; GFX8:       ; %bb.0:
1280; GFX8-NEXT:    v_div_scale_f32 v2, s[2:3], s0, s0, s0
1281; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1282; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1283; GFX8-NEXT:    v_mov_b32_e32 v0, s0
1284; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1285; GFX8-NEXT:    flat_store_dword v[0:1], v2
1286; GFX8-NEXT:    s_endpgm
1287;
1288; GFX10-LABEL: test_div_scale_f32_undef_undef_val:
1289; GFX10:       ; %bb.0:
1290; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1291; GFX10-NEXT:    v_mov_b32_e32 v1, 0
1292; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1293; GFX10-NEXT:    v_div_scale_f32 v0, s2, s0, s0, s0
1294; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
1295; GFX10-NEXT:    s_endpgm
1296  %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float undef, float undef, i1 false)
1297  %result0 = extractvalue { float, i1 } %result, 0
1298  store float %result0, float addrspace(1)* %out, align 4
1299  ret void
1300}
1301
; f64 div.scale with the constant 8.0 as the first intrinsic argument and undef
; as the second, with the i1 selector set to false. The expected output still
; contains a v_div_scale_f64: 8.0 is built in s[2:3] as low dword 0 and high
; dword 0x40200000 and used as the third source. The undef operand shows up as
; v[0:1] in the first two sources on GFX7 and GFX8, and as s[0:1] on GFX10.
; Only element 0 of the returned struct is stored to %out.
1302define amdgpu_kernel void @test_div_scale_f64_val_undef_val(double addrspace(1)* %out) #0 {
1303; GFX7-LABEL: test_div_scale_f64_val_undef_val:
1304; GFX7:       ; %bb.0:
1305; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1306; GFX7-NEXT:    s_mov_b32 s2, 0
1307; GFX7-NEXT:    s_mov_b32 s3, 0x40200000
1308; GFX7-NEXT:    v_div_scale_f64 v[0:1], s[2:3], v[0:1], v[0:1], s[2:3]
1309; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1310; GFX7-NEXT:    v_mov_b32_e32 v3, s1
1311; GFX7-NEXT:    v_mov_b32_e32 v2, s0
1312; GFX7-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
1313; GFX7-NEXT:    s_endpgm
1314;
1315; GFX8-LABEL: test_div_scale_f64_val_undef_val:
1316; GFX8:       ; %bb.0:
1317; GFX8-NEXT:    s_mov_b32 s2, 0
1318; GFX8-NEXT:    s_mov_b32 s3, 0x40200000
1319; GFX8-NEXT:    v_div_scale_f64 v[0:1], s[2:3], v[0:1], v[0:1], s[2:3]
1320; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1321; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1322; GFX8-NEXT:    v_mov_b32_e32 v3, s1
1323; GFX8-NEXT:    v_mov_b32_e32 v2, s0
1324; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
1325; GFX8-NEXT:    s_endpgm
1326;
1327; GFX10-LABEL: test_div_scale_f64_val_undef_val:
1328; GFX10:       ; %bb.0:
1329; GFX10-NEXT:    s_mov_b32 s2, 0
1330; GFX10-NEXT:    s_mov_b32 s3, 0x40200000
1331; GFX10-NEXT:    v_mov_b32_e32 v2, 0
1332; GFX10-NEXT:    v_div_scale_f64 v[0:1], s2, s[0:1], s[0:1], s[2:3]
1333; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1334; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1335; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
1336; GFX10-NEXT:    s_endpgm
1337  %result = call { double, i1 } @llvm.amdgcn.div.scale.f64(double 8.0, double undef, i1 false)
1338  %result0 = extractvalue { double, i1 } %result, 0
1339  store double %result0, double addrspace(1)* %out, align 8
1340  ret void
1341}
1342
; Intrinsics used by the tests in this file. Each div.scale variant takes two
; floating-point values plus an i1 selector and returns a { value, i1 } pair.
1343declare i32 @llvm.amdgcn.workitem.id.x() #1
1344declare { float, i1 } @llvm.amdgcn.div.scale.f32(float, float, i1) #1
1345declare { double, i1 } @llvm.amdgcn.div.scale.f64(double, double, i1) #1
1346declare float @llvm.fabs.f32(float) #1
1347
; Attribute group #0 is applied to the undef-operand test kernels; group #1 is
; applied to the intrinsic declarations.
1348attributes #0 = { nounwind }
1349attributes #1 = { nounwind readnone speculatable }
1350