; RUN: llc -march=amdgcn -mcpu=gfx1030 -verify-machineinstrs -asm-verbose=0 < %s | FileCheck --check-prefixes=GCN,GFX10,GFX10-ASM %s
; RUN: llc -march=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s -filetype=obj | llvm-objdump -d --arch-name=amdgcn --mcpu=gfx1030 - | FileCheck --check-prefixes=GCN,GFX10,GFX10-DIS %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefix=GFX8 %s

; Checks where s_inst_prefetch and ".p2align 6" loop alignment are placed
; around loops of different sizes. The gfx1030 output is verified twice: as
; textual assembly and as llvm-objdump disassembly of the object file. In the
; disassembly checks the alignment directive is replaced by an "s_nop 0"
; expectation (presumably the alignment padding).
;
; The tonga output must contain neither the prefetch instruction nor the
; loop alignment directive.

; GFX8-NOT: s_inst_prefetch
; GFX8-NOT: .p2align 6

; Loop body with a single s_sleep: the loop label is expected directly after
; the s_movk_i32 (no alignment directive, no prefetch instruction in between),
; and s_endpgm directly after the back branch.
; GCN-LABEL: test_loop_64
; GFX10: s_movk_i32 s{{[0-9]+}}, 0x400
; GFX10-DIS-NEXT: {{^$}}
; GFX10-ASM-NEXT: [[L1:BB[0-9_]+]]:
; GFX10-DIS-NEXT: <[[L1:BB[0-9_]+]]>:
; GFX10: s_sleep 0
; GFX10: s_cbranch_scc0 [[L1]]
; GFX10-NEXT: s_endpgm
define amdgpu_kernel void @test_loop_64(i32 addrspace(1)* nocapture %arg) {
bb:
  br label %bb2

bb1:                                              ; preds = %bb2
  ret void

bb2:                                              ; preds = %bb2, %bb
  %tmp1 = phi i32 [ 0, %bb ], [ %tmp2, %bb2 ]
  %tmp2 = add nuw nsw i32 %tmp1, 1
  %tmp3 = icmp eq i32 %tmp2, 1024
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  br i1 %tmp3, label %bb1, label %bb2
}

; Loop body with 16 s_sleep calls: the loop is expected to be aligned with
; ".p2align 6", but no s_inst_prefetch may appear before the loop label, and
; s_endpgm must directly follow the back branch.
; GCN-LABEL: test_loop_128
; GFX10: s_movk_i32 s{{[0-9]+}}, 0x400
; GFX10-ASM-NEXT: .p2align 6
; GFX10-DIS-NEXT: s_nop 0
; GFX10-NOT: s_inst_prefetch
; GFX10-ASM: [[L1:BB[0-9_]+]]:
; GFX10-DIS: <[[L1:BB[0-9_]+]]>:
; GFX10: s_sleep 0
; GFX10: s_cbranch_scc0 [[L1]]
; GFX10-NEXT: s_endpgm
define amdgpu_kernel void @test_loop_128(i32 addrspace(1)* nocapture %arg) {
bb:
  br label %bb2

bb1:                                              ; preds = %bb2
  ret void

bb2:                                              ; preds = %bb2, %bb
  %tmp1 = phi i32 [ 0, %bb ], [ %tmp2, %bb2 ]
  %tmp2 = add nuw nsw i32 %tmp1, 1
  %tmp3 = icmp eq i32 %tmp2, 1024
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  br i1 %tmp3, label %bb1, label %bb2
}

; Loop body with 34 s_sleep calls: "s_inst_prefetch 0x1" is expected right
; after the s_movk_i32 and before the ".p2align 6", and "s_inst_prefetch 0x2"
; right after the back branch, before s_endpgm.
; GCN-LABEL: test_loop_192
; GFX10: s_movk_i32 s{{[0-9]+}}, 0x400
; GFX10-NEXT: s_inst_prefetch 0x1
; GFX10-ASM-NEXT: .p2align 6
; GFX10-DIS-NEXT: s_nop 0
; GFX10-NOT: s_inst_prefetch
; GFX10-ASM: [[L1:BB[0-9_]+]]:
; GFX10-DIS: <[[L1:BB[0-9_]+]]>:
; GFX10: s_sleep 0
; GFX10: s_cbranch_scc0 [[L1]]
; GFX10-NEXT: s_inst_prefetch 0x2
; GFX10-NEXT: s_endpgm
define amdgpu_kernel void @test_loop_192(i32 addrspace(1)* nocapture %arg) {
bb:
  br label %bb2

bb1:                                              ; preds = %bb2
  ret void

bb2:                                              ; preds = %bb2, %bb
  %tmp1 = phi i32 [ 0, %bb ], [ %tmp2, %bb2 ]
  %tmp2 = add nuw nsw i32 %tmp1, 1
  %tmp3 = icmp eq i32 %tmp2, 1024
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  br i1 %tmp3, label %bb1, label %bb2
}

; Loop body with 50 s_sleep calls: as in test_loop_64, the loop label is
; expected directly after the s_movk_i32 and s_endpgm directly after the back
; branch, i.e. neither alignment nor prefetch instructions are emitted there.
; GCN-LABEL: test_loop_256
; GFX10: s_movk_i32 s{{[0-9]+}}, 0x400
; GFX10-DIS-NEXT: {{^$}}
; GFX10-ASM-NEXT: [[L1:BB[0-9_]+]]:
; GFX10-DIS-NEXT: <[[L1:BB[0-9_]+]]>:
; GFX10: s_sleep 0
; GFX10: s_cbranch_scc0 [[L1]]
; GFX10-NEXT: s_endpgm
define amdgpu_kernel void @test_loop_256(i32 addrspace(1)* nocapture %arg) {
bb:
  br label %bb2

bb1:                                              ; preds = %bb2
  ret void

bb2:                                              ; preds = %bb2, %bb
  %tmp1 = phi i32 [ 0, %bb ], [ %tmp2, %bb2 ]
  %tmp2 = add nuw nsw i32 %tmp1, 1
  %tmp3 = icmp eq i32 %tmp2, 1024
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  br i1 %tmp3, label %bb1, label %bb2
}

; Two-level loop nest (outer bb2..bb4, inner bb3 with 34 s_sleep calls):
; "s_inst_prefetch 0x1" is expected once, before the aligned outer loop label,
; and "s_inst_prefetch 0x2" once, right after the outer back branch. No further
; s_inst_prefetch may appear between those points; both loop labels are
; expected to be preceded by ".p2align 6".
; GCN-LABEL: test_loop_prefetch_inner_outer
; GFX10: s_inst_prefetch 0x1
; GFX10-ASM-NEXT: .p2align 6
; GFX10-DIS-NEXT: s_nop 0
; GFX10-NOT: s_inst_prefetch
; GFX10-ASM: [[L1:BB[0-9_]+]]:
; GFX10-DIS: <[[L1:BB[0-9_]+]]>:
; GFX10-NOT: s_inst_prefetch
; GFX10-ASM: .p2align 6
; GFX10-DIS: s_nop 0
; GFX10-NOT: s_inst_prefetch
; GFX10-ASM: [[L2:BB[0-9_]+]]:
; GFX10-DIS: <[[L2:BB[0-9_]+]]>:
; GFX10-NOT: s_inst_prefetch
; GFX10: s_sleep 0
; GFX10: s_cbranch_scc{{[01]}} [[L2]]
; GFX10-NOT: s_inst_prefetch
; GFX10: s_cbranch_scc{{[01]}} [[L1]]
; GFX10-NEXT: s_inst_prefetch 0x2
; GFX10-NEXT: s_endpgm
define amdgpu_kernel void @test_loop_prefetch_inner_outer(i32 addrspace(1)* nocapture %arg) {
bb:
  br label %bb2

bb1:
  ret void

bb2:
  %tmp1 = phi i32 [ 0, %bb ], [ %tmp2, %bb4 ]
  %tmp2 = add nuw nsw i32 %tmp1, 1
  %tmp3 = icmp eq i32 %tmp2, 1024
  br label %bb3

bb3:
  %tmp4 = phi i32 [ 0, %bb2 ], [ %tmp5, %bb3 ]
  %tmp5 = add nuw nsw i32 %tmp4, 1
  %tmp6 = icmp eq i32 %tmp5, 1024
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  br i1 %tmp6, label %bb4, label %bb3

bb4:
  br i1 %tmp3, label %bb1, label %bb2
}

; Three-level loop nest: outermost bb2..bb6 (bb6 adds 16 s_sleep calls),
; middle bb3..bb5, innermost bb4 with 34 s_sleep calls. The outermost loop
; label is expected without preceding alignment, s_nop or prefetch; the
; "s_inst_prefetch 0x1" / "s_inst_prefetch 0x2" pair is expected to bracket
; the middle loop only, inside the outermost loop.
; GCN-LABEL: test_loop_prefetch_inner_outer_noouter
; GFX10-NOT: .p2align 6
; GFX10-NOT: s_nop
; GFX10-NOT: s_inst_prefetch
; GFX10-ASM: [[L0:BB[0-9_]+]]:
; GFX10-DIS: <[[L0:BB[0-9_]+]]>:
; GFX10: s_inst_prefetch 0x1
; GFX10-ASM-NEXT: .p2align 6
; GFX10-DIS-NEXT: s_nop 0
; GFX10-NOT: s_inst_prefetch
; GFX10-ASM: [[L1:BB[0-9_]+]]:
; GFX10-DIS: <[[L1:BB[0-9_]+]]>:
; GFX10-NOT: s_inst_prefetch
; GFX10-ASM: .p2align 6
; GFX10-DIS: s_nop 0
; GFX10-NOT: s_inst_prefetch
; GFX10-ASM: [[L2:BB[0-9_]+]]:
; GFX10-DIS: <[[L2:BB[0-9_]+]]>:
; GFX10-NOT: s_inst_prefetch
; GFX10: s_sleep 0
; GFX10: s_cbranch_scc{{[01]}} [[L2]]
; GFX10-NOT: s_inst_prefetch
; GFX10: s_cbranch_scc{{[01]}} [[L1]]
; GFX10-NEXT: s_inst_prefetch 0x2
; GFX10: s_cbranch_scc{{[01]}} [[L0]]
; GFX10-NEXT: s_endpgm
define amdgpu_kernel void @test_loop_prefetch_inner_outer_noouter(i32 addrspace(1)* nocapture %arg) {
bb:
  br label %bb2

bb1:
  ret void

bb2:
  %tmp1 = phi i32 [ 0, %bb ], [ %tmp2, %bb6 ]
  %tmp2 = add nuw nsw i32 %tmp1, 1
  %tmp3 = icmp eq i32 %tmp2, 1024
  br label %bb3

bb3:
  %tmp4 = phi i32 [ 0, %bb2 ], [ %tmp5, %bb5 ]
  %tmp5 = add nuw nsw i32 %tmp4, 1
  %tmp6 = icmp eq i32 %tmp5, 1024
  br label %bb4

bb4:
  %tmp7 = phi i32 [ 0, %bb3 ], [ %tmp8, %bb4 ]
  %tmp8 = add nuw nsw i32 %tmp7, 1
  %tmp9 = icmp eq i32 %tmp8, 1024
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  br i1 %tmp9, label %bb5, label %bb4

bb5:
  br i1 %tmp6, label %bb6, label %bb3

bb6:
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  tail call void @llvm.amdgcn.s.sleep(i32 0)
  br i1 %tmp3, label %bb1, label %bb2
}

; Intrinsic used to pad the loop bodies; each call is checked as "s_sleep 0".
declare void @llvm.amdgcn.s.sleep(i32)