# RUN: llc -march=amdgcn -mcpu=fiji -start-before si-peephole-sdwa -o - %s | FileCheck -check-prefix=VI -check-prefix=GCN %s
# RUN: llc -march=amdgcn -mcpu=gfx900 -start-before si-peephole-sdwa -o - %s | FileCheck -check-prefix=GFX9 -check-prefix=GCN %s

# GCN-LABEL: {{^}}sdwa_imm_operand:
# GCN: v_mov_b32_e32 v[[SHIFT:[0-9]+]], 2
# GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 2
# GCN: BB0_1:
# GCN: v_lshlrev_b32_sdwa v{{[0-9]+}}, v[[SHIFT]], v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
# GCN: v_lshlrev_b32_sdwa v{{[0-9]+}}, v[[SHIFT]], v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1

# GCN-LABEL: {{^}}sdwa_sgpr_operand:
# VI: v_mov_b32_e32 v[[SHIFT:[0-9]+]], 2
# VI-NOT: v_mov_b32_e32 v{{[0-9]+}}, 2
# VI: BB1_1:
# VI: v_lshlrev_b32_sdwa v{{[0-9]+}}, v[[SHIFT]], v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
# VI: v_lshlrev_b32_sdwa v{{[0-9]+}}, v[[SHIFT]], v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1

# GFX9: s_mov_b32 s[[SHIFT:[0-9]+]], 2
# GFX9-NOT: v_mov_b32_e32 v{{[0-9]+}}, 2
# GFX9: BB1_1:
# GFX9: v_lshlrev_b32_sdwa v{{[0-9]+}}, s[[SHIFT]], v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
# GFX9: v_lshlrev_b32_sdwa v{{[0-9]+}}, s[[SHIFT]], v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1

--- |
  ; ModuleID = 'sdwa-scalar-ops.opt.ll'
  source_filename = "sdwa-scalar-ops.opt.ll"
  target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"

  define amdgpu_kernel void @sdwa_imm_operand(i32 addrspace(1)* nocapture %arg) {
  bb:
    br label %bb2

  bb1:                                              ; preds = %bb2
    ret void

  bb2:                                              ; preds = %bb2, %bb
    %lsr.iv = phi i64 [ %lsr.iv.next, %bb2 ], [ 0, %bb ]
    %bc = bitcast i32 addrspace(1)* %arg to i8 addrspace(1)*
    %uglygep4 = getelementptr i8, i8 addrspace(1)* %bc, i64 %lsr.iv
    %uglygep45 = bitcast i8 addrspace(1)* %uglygep4 to i32 addrspace(1)*
    %tmp5 = load i32, i32 addrspace(1)* %uglygep45, align 4
    %tmp6 = lshr i32 %tmp5, 8
    %tmp7 = and i32 %tmp6, 255
    %tmp8 = zext i32 %tmp7 to i64
    %tmp9 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp8
    store i32 1, i32 addrspace(1)* %tmp9, align 4
    %scevgep = getelementptr i32, i32 addrspace(1)* %uglygep45, i64 1
    %tmp13 = load i32, i32 addrspace(1)* %scevgep, align 4
    %tmp14 = lshr i32 %tmp13, 8
    %tmp15 = and i32 %tmp14, 255
    %tmp16 = zext i32 %tmp15 to i64
    %tmp17 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp16
    store i32 1, i32 addrspace(1)* %tmp17, align 4
    %lsr.iv.next = add nuw nsw i64 %lsr.iv, 8
    %tmp1 = trunc i64 %lsr.iv.next to i32
    %tmp19 = icmp eq i32 %tmp1, 4096
    br i1 %tmp19, label %bb1, label %bb2
  }

  define amdgpu_kernel void @sdwa_sgpr_operand(i32 addrspace(1)* nocapture %arg) {
  bb:
    br label %bb2

  bb1:                                              ; preds = %bb2
    ret void

  bb2:                                              ; preds = %bb2, %bb
    %lsr.iv = phi i64 [ %lsr.iv.next, %bb2 ], [ 0, %bb ]
    %bc = bitcast i32 addrspace(1)* %arg to i8 addrspace(1)*
    %uglygep4 = getelementptr i8, i8 addrspace(1)* %bc, i64 %lsr.iv
    %uglygep45 = bitcast i8 addrspace(1)* %uglygep4 to i32 addrspace(1)*
    %tmp5 = load i32, i32 addrspace(1)* %uglygep45, align 4
    %tmp6 = lshr i32 %tmp5, 8
    %tmp7 = and i32 %tmp6, 255
    %tmp8 = zext i32 %tmp7 to i64
    %tmp9 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp8
    store i32 1, i32 addrspace(1)* %tmp9, align 4
    %scevgep = getelementptr i32, i32 addrspace(1)* %uglygep45, i64 1
    %tmp13 = load i32, i32 addrspace(1)* %scevgep, align 4
    %tmp14 = lshr i32 %tmp13, 8
    %tmp15 = and i32 %tmp14, 255
    %tmp16 = zext i32 %tmp15 to i64
    %tmp17 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp16
    store i32 1, i32 addrspace(1)* %tmp17, align 4
    %lsr.iv.next = add nuw nsw i64 %lsr.iv, 8
    %tmp1 = trunc i64 %lsr.iv.next to i32
    %tmp19 = icmp eq i32 %tmp1, 4096
    br i1 %tmp19, label %bb1, label %bb2
  }

...
---
name: sdwa_imm_operand
alignment: 1
exposesReturnsTwice: false
legalized: false
regBankSelected: false
selected: false
tracksRegLiveness: true
registers:
  - { id: 0, class: sreg_64 }
  - { id: 1, class: sreg_64 }
  - { id: 2, class: vgpr_32 }
  - { id: 3, class: sgpr_128 }
  - { id: 4, class: sgpr_64 }
  - { id: 5, class: sreg_32_xm0 }
  - { id: 6, class: sgpr_32 }
  - { id: 7, class: sreg_64 }
  - { id: 8, class: sreg_64 }
  - { id: 9, class: sreg_64_xexec }
  - { id: 10, class: sreg_32_xm0 }
  - { id: 11, class: sreg_32_xm0 }
  - { id: 12, class: sreg_32_xm0 }
  - { id: 13, class: sreg_32_xm0 }
  - { id: 14, class: sreg_32_xm0 }
  - { id: 15, class: sreg_32_xm0 }
  - { id: 16, class: sreg_64 }
  - { id: 17, class: vgpr_32 }
  - { id: 18, class: vreg_64 }
  - { id: 19, class: sreg_32_xm0 }
  - { id: 20, class: sreg_32 }
  - { id: 21, class: sreg_32_xm0 }
  - { id: 22, class: sreg_32_xm0 }
  - { id: 23, class: sreg_32_xm0 }
  - { id: 24, class: sreg_64 }
  - { id: 25, class: sreg_32_xm0 }
  - { id: 26, class: sreg_32_xm0 }
  - { id: 27, class: sreg_32_xm0 }
  - { id: 28, class: sreg_32_xm0 }
  - { id: 29, class: sreg_64 }
  - { id: 30, class: vgpr_32 }
  - { id: 31, class: vreg_64 }
  - { id: 32, class: sreg_32_xm0 }
  - { id: 33, class: sreg_32_xm0 }
  - { id: 34, class: sreg_64 }
  - { id: 35, class: sreg_32_xm0 }
  - { id: 36, class: sreg_32_xm0 }
  - { id: 37, class: sreg_32_xm0 }
  - { id: 38, class: sreg_32_xm0 }
  - { id: 39, class: vreg_64 }
  - { id: 40, class: vgpr_32 }
  - { id: 41, class: vreg_64 }
  - { id: 42, class: sreg_32_xm0 }
  - { id: 43, class: sreg_32 }
  - { id: 44, class: sreg_32_xm0 }
  - { id: 45, class: sreg_64 }
  - { id: 46, class: sreg_32_xm0 }
  - { id: 47, class: sreg_32_xm0 }
  - { id: 48, class: sreg_32_xm0 }
  - { id: 49, class: sreg_32_xm0 }
  - { id: 50, class: sreg_64 }
  - { id: 51, class: vreg_64 }
  - { id: 52, class: sreg_64 }
  - { id: 53, class: sreg_32_xm0 }
  - { id: 54, class: sreg_32_xm0 }
  - { id: 55, class: sreg_32_xm0 }
  - { id: 56, class: sreg_32_xm0 }
  - { id: 57, class: sreg_64 }
  - { id: 58, class: sreg_32_xm0 }
  - { id: 59, class: sreg_32_xm0 }
  - { id: 60, class: vgpr_32 }
  - { id: 61, class: vgpr_32 }
  - { id: 62, class: vreg_64 }
  - { id: 63, class: vgpr_32 }
  - { id: 64, class: vgpr_32 }
  - { id: 65, class: vgpr_32 }
  - { id: 66, class: vgpr_32 }
  - { id: 67, class: vreg_64 }
  - { id: 68, class: vgpr_32 }
  - { id: 69, class: vgpr_32 }
  - { id: 70, class: vgpr_32 }
  - { id: 71, class: vgpr_32 }
  - { id: 72, class: vgpr_32 }
  - { id: 73, class: vgpr_32 }
  - { id: 74, class: vgpr_32 }
  - { id: 75, class: vreg_64 }
  - { id: 76, class: vgpr_32 }
  - { id: 77, class: vgpr_32 }
  - { id: 78, class: vgpr_32 }
  - { id: 79, class: vgpr_32 }
  - { id: 80, class: vreg_64 }
  - { id: 81, class: vgpr_32 }
  - { id: 82, class: vgpr_32 }
  - { id: 83, class: vgpr_32 }
liveins:
  - { reg: '$sgpr4_sgpr5', virtual-reg: '%4' }
frameInfo:
  isFrameAddressTaken: false
  isReturnAddressTaken: false
  hasStackMap: false
  hasPatchPoint: false
  stackSize: 0
  offsetAdjustment: 0
  maxAlignment: 0
  adjustsStack: false
  hasCalls: false
  hasOpaqueSPAdjustment: false
  hasVAStart: false
  hasMustTailInVarArgFunc: false
body: |
  bb.0.bb:
    successors: %bb.2.bb2(0x80000000)
    liveins: $sgpr4_sgpr5

    %4 = COPY $sgpr4_sgpr5
    %9 = S_LOAD_DWORDX2_IMM %4, 0, 0, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(4)* undef`)
    %8 = S_MOV_B64 0
    %7 = COPY %9
    %30 = V_MOV_B32_e32 1, implicit $exec
    S_BRANCH %bb.2.bb2

  bb.1.bb1:
    S_ENDPGM 0

  bb.2.bb2:
    successors: %bb.1.bb1(0x04000000), %bb.2.bb2(0x7c000000)

    %0 = PHI %8, %bb.0.bb, %1, %bb.2.bb2
    %13 = COPY %7.sub1
    %14 = S_ADD_U32 %7.sub0, %0.sub0, implicit-def $scc
    %15 = S_ADDC_U32 %7.sub1, %0.sub1, implicit-def dead $scc, implicit $scc
    %16 = REG_SEQUENCE %14, %subreg.sub0, %15, %subreg.sub1
    %18 = COPY %16
    %17 = FLAT_LOAD_DWORD %18, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %ir.uglygep45)
    %60 = V_BFE_U32 %17, 8, 8, implicit $exec
    %61 = V_LSHLREV_B32_e32 2, killed %60, implicit $exec
    %70 = V_ADD_CO_U32_e32 %7.sub0, %61, implicit-def $vcc, implicit $exec
    %66 = COPY %13
    %65 = V_ADDC_U32_e32 0, %66, implicit-def $vcc, implicit $vcc, implicit $exec
    %67 = REG_SEQUENCE %70, %subreg.sub0, killed %65, %subreg.sub1
    FLAT_STORE_DWORD %67, %30, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into %ir.tmp9)
    %37 = S_ADD_U32 %14, 4, implicit-def $scc
    %38 = S_ADDC_U32 %15, 0, implicit-def dead $scc, implicit $scc
    %71 = COPY killed %37
    %72 = COPY killed %38
    %41 = REG_SEQUENCE killed %71, %subreg.sub0, killed %72, %subreg.sub1
    %40 = FLAT_LOAD_DWORD killed %41, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %ir.scevgep)
    %73 = V_BFE_U32 %40, 8, 8, implicit $exec
    %74 = V_LSHLREV_B32_e32 2, killed %73, implicit $exec
    %83 = V_ADD_CO_U32_e32 %7.sub0, %74, implicit-def $vcc, implicit $exec
    %78 = V_ADDC_U32_e32 0, %66, implicit-def $vcc, implicit $vcc, implicit $exec
    %80 = REG_SEQUENCE %83, %subreg.sub0, killed %78, %subreg.sub1
    FLAT_STORE_DWORD %80, %30, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into %ir.tmp17)
    %55 = S_ADD_U32 %0.sub0, 8, implicit-def $scc
    %56 = S_ADDC_U32 %0.sub1, 0, implicit-def dead $scc, implicit $scc
    %57 = REG_SEQUENCE %55, %subreg.sub0, killed %56, %subreg.sub1
    %1 = COPY %57
    S_CMPK_EQ_I32 %55, 4096, implicit-def $scc
    S_CBRANCH_SCC1 %bb.1.bb1, implicit $scc
    S_BRANCH %bb.2.bb2

...
---
name: sdwa_sgpr_operand
alignment: 1
exposesReturnsTwice: false
legalized: false
regBankSelected: false
selected: false
tracksRegLiveness: true
registers:
  - { id: 0, class: sreg_64 }
  - { id: 1, class: sreg_64 }
  - { id: 2, class: vgpr_32 }
  - { id: 3, class: sgpr_128 }
  - { id: 4, class: sgpr_64 }
  - { id: 5, class: sreg_32_xm0 }
  - { id: 6, class: sgpr_32 }
  - { id: 7, class: sreg_64 }
  - { id: 8, class: sreg_64 }
  - { id: 9, class: sreg_64_xexec }
  - { id: 10, class: sreg_32_xm0 }
  - { id: 11, class: sreg_32_xm0 }
  - { id: 12, class: sreg_32_xm0 }
  - { id: 13, class: sreg_32_xm0 }
  - { id: 14, class: sreg_32_xm0 }
  - { id: 15, class: sreg_32_xm0 }
  - { id: 16, class: sreg_64 }
  - { id: 17, class: vgpr_32 }
  - { id: 18, class: vreg_64 }
  - { id: 19, class: sreg_32_xm0 }
  - { id: 20, class: sreg_32 }
  - { id: 21, class: sreg_32_xm0 }
  - { id: 22, class: sreg_32_xm0 }
  - { id: 23, class: sreg_32_xm0 }
  - { id: 24, class: sreg_64 }
  - { id: 25, class: sreg_32_xm0 }
  - { id: 26, class: sreg_32_xm0 }
  - { id: 27, class: sreg_32_xm0 }
  - { id: 28, class: sreg_32_xm0 }
  - { id: 29, class: sreg_64 }
  - { id: 30, class: vgpr_32 }
  - { id: 31, class: vreg_64 }
  - { id: 32, class: sreg_32_xm0 }
  - { id: 33, class: sreg_32_xm0 }
  - { id: 34, class: sreg_64 }
  - { id: 35, class: sreg_32_xm0 }
  - { id: 36, class: sreg_32_xm0 }
  - { id: 37, class: sreg_32_xm0 }
  - { id: 38, class: sreg_32_xm0 }
  - { id: 39, class: vreg_64 }
  - { id: 40, class: vgpr_32 }
  - { id: 41, class: vreg_64 }
  - { id: 42, class: sreg_32_xm0 }
  - { id: 43, class: sreg_32 }
  - { id: 44, class: sreg_32_xm0 }
  - { id: 45, class: sreg_64 }
  - { id: 46, class: sreg_32_xm0 }
  - { id: 47, class: sreg_32_xm0 }
  - { id: 48, class: sreg_32_xm0 }
  - { id: 49, class: sreg_32_xm0 }
  - { id: 50, class: sreg_64 }
  - { id: 51, class: vreg_64 }
  - { id: 52, class: sreg_64 }
  - { id: 53, class: sreg_32_xm0 }
  - { id: 54, class: sreg_32_xm0 }
  - { id: 55, class: sreg_32_xm0 }
  - { id: 56, class: sreg_32_xm0 }
  - { id: 57, class: sreg_64 }
  - { id: 58, class: sreg_32_xm0 }
  - { id: 59, class: sreg_32_xm0 }
  - { id: 60, class: vgpr_32 }
  - { id: 61, class: vgpr_32 }
  - { id: 62, class: vreg_64 }
  - { id: 63, class: vgpr_32 }
  - { id: 64, class: vgpr_32 }
  - { id: 65, class: vgpr_32 }
  - { id: 66, class: vgpr_32 }
  - { id: 67, class: vreg_64 }
  - { id: 68, class: vgpr_32 }
  - { id: 69, class: vgpr_32 }
  - { id: 70, class: vgpr_32 }
  - { id: 71, class: vgpr_32 }
  - { id: 72, class: vgpr_32 }
  - { id: 73, class: vgpr_32 }
  - { id: 74, class: vgpr_32 }
  - { id: 75, class: vreg_64 }
  - { id: 76, class: vgpr_32 }
  - { id: 77, class: vgpr_32 }
  - { id: 78, class: vgpr_32 }
  - { id: 79, class: vgpr_32 }
  - { id: 80, class: vreg_64 }
  - { id: 81, class: vgpr_32 }
  - { id: 82, class: vgpr_32 }
  - { id: 83, class: vgpr_32 }
  - { id: 84, class: sreg_32_xm0 }
liveins:
  - { reg: '$sgpr4_sgpr5', virtual-reg: '%4' }
frameInfo:
  isFrameAddressTaken: false
  isReturnAddressTaken: false
  hasStackMap: false
  hasPatchPoint: false
  stackSize: 0
  offsetAdjustment: 0
  maxAlignment: 0
  adjustsStack: false
  hasCalls: false
  hasOpaqueSPAdjustment: false
  hasVAStart: false
  hasMustTailInVarArgFunc: false
body: |
  bb.0.bb:
    successors: %bb.2.bb2(0x80000000)
    liveins: $sgpr4_sgpr5

    %4 = COPY $sgpr4_sgpr5
    %9 = S_LOAD_DWORDX2_IMM %4, 0, 0, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(4)* undef`)
    %8 = S_MOV_B64 0
    %7 = COPY %9
    %30 = V_MOV_B32_e32 1, implicit $exec
    %84 = S_MOV_B32 2
    S_BRANCH %bb.2.bb2

  bb.1.bb1:
    S_ENDPGM 0

  bb.2.bb2:
    successors: %bb.1.bb1(0x04000000), %bb.2.bb2(0x7c000000)

    %0 = PHI %8, %bb.0.bb, %1, %bb.2.bb2
    %13 = COPY %7.sub1
    %14 = S_ADD_U32 %7.sub0, %0.sub0, implicit-def $scc
    %15 = S_ADDC_U32 %7.sub1, %0.sub1, implicit-def dead $scc, implicit $scc
    %16 = REG_SEQUENCE %14, %subreg.sub0, %15, %subreg.sub1
    %18 = COPY %16
    %17 = FLAT_LOAD_DWORD %18, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %ir.uglygep45)
    %60 = V_BFE_U32 %17, 8, 8, implicit $exec
    %61 = V_LSHLREV_B32_e32 %84, killed %60, implicit $exec
    %70 = V_ADD_CO_U32_e32 %7.sub0, %61, implicit-def $vcc, implicit $exec
    %66 = COPY %13
    %65 = V_ADDC_U32_e32 0, %66, implicit-def $vcc, implicit $vcc, implicit $exec
    %67 = REG_SEQUENCE %70, %subreg.sub0, killed %65, %subreg.sub1
    FLAT_STORE_DWORD %67, %30, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into %ir.tmp9)
    %37 = S_ADD_U32 %14, 4, implicit-def $scc
    %38 = S_ADDC_U32 %15, 0, implicit-def dead $scc, implicit $scc
    %71 = COPY killed %37
    %72 = COPY killed %38
    %41 = REG_SEQUENCE killed %71, %subreg.sub0, killed %72, %subreg.sub1
    %40 = FLAT_LOAD_DWORD killed %41, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %ir.scevgep)
    %73 = V_BFE_U32 %40, 8, 8, implicit $exec
    %74 = V_LSHLREV_B32_e32 %84, killed %73, implicit $exec
    %83 = V_ADD_CO_U32_e32 %7.sub0, %74, implicit-def $vcc, implicit $exec
    %78 = V_ADDC_U32_e32 0, %66, implicit-def $vcc, implicit $vcc, implicit $exec
    %80 = REG_SEQUENCE %83, %subreg.sub0, killed %78, %subreg.sub1
    FLAT_STORE_DWORD %80, %30, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into %ir.tmp17)
    %55 = S_ADD_U32 %0.sub0, 8, implicit-def $scc
    %56 = S_ADDC_U32 %0.sub1, 0, implicit-def dead $scc, implicit $scc
    %57 = REG_SEQUENCE %55, %subreg.sub0, killed %56, %subreg.sub1
    %1 = COPY %57
    S_CMPK_EQ_I32 %55, 4096, implicit-def $scc
    S_CBRANCH_SCC1 %bb.1.bb1, implicit $scc
    S_BRANCH %bb.2.bb2

...