; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s

declare i32 @llvm.AMDGPU.imax(i32, i32) nounwind readnone
declare i32 @llvm.r600.read.tidig.x() nounwind readnone


; FUNC-LABEL: {{^}}sext_in_reg_i1_i32:
; SI: s_load_dword [[ARG:s[0-9]+]],
; SI: s_bfe_i32 [[SEXTRACT:s[0-9]+]], [[ARG]], 0x10000
; SI: v_mov_b32_e32 [[EXTRACT:v[0-9]+]], [[SEXTRACT]]
; SI: buffer_store_dword [[EXTRACT]],

; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+\.[XYZW]]], [[ADDR:T[0-9]+.[XYZW]]]
; EG: BFE_INT [[RES]], {{.*}}, 0.0, 1
; EG-NEXT: LSHR * [[ADDR]]
define void @sext_in_reg_i1_i32(i32 addrspace(1)* %out, i32 %in) {
  %shl = shl i32 %in, 31
  %sext = ashr i32 %shl, 31
  store i32 %sext, i32 addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}sext_in_reg_i8_to_i32:
; SI: s_add_i32 [[VAL:s[0-9]+]],
; SI: s_sext_i32_i8 [[EXTRACT:s[0-9]+]], [[VAL]]
; SI: v_mov_b32_e32 [[VEXTRACT:v[0-9]+]], [[EXTRACT]]
; SI: buffer_store_dword [[VEXTRACT]],

; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+\.[XYZW]]], [[ADDR:T[0-9]+.[XYZW]]]
; EG: ADD_INT
; EG-NEXT: BFE_INT [[RES]], {{.*}}, 0.0, literal
; EG-NEXT: LSHR * [[ADDR]]
define void @sext_in_reg_i8_to_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
  %c = add i32 %a, %b ; add to prevent folding into extload
  %shl = shl i32 %c, 24
  %ashr = ashr i32 %shl, 24
  store i32 %ashr, i32 addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}sext_in_reg_i16_to_i32:
; SI: s_add_i32 [[VAL:s[0-9]+]],
; SI: s_sext_i32_i16 [[EXTRACT:s[0-9]+]], [[VAL]]
; SI: v_mov_b32_e32 [[VEXTRACT:v[0-9]+]], [[EXTRACT]]
; SI: buffer_store_dword [[VEXTRACT]],

; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+\.[XYZW]]], [[ADDR:T[0-9]+.[XYZW]]]
; EG: ADD_INT
; EG-NEXT: BFE_INT [[RES]], {{.*}}, 0.0, literal
; EG-NEXT: LSHR * [[ADDR]]
define void @sext_in_reg_i16_to_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
  %c = add i32 %a, %b ; add to prevent folding into extload
  %shl = shl i32 %c, 16
  %ashr = ashr i32 %shl, 16
  store i32 %ashr, i32 addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}sext_in_reg_i8_to_v1i32:
; SI: s_add_i32 [[VAL:s[0-9]+]],
; SI: s_sext_i32_i8 [[EXTRACT:s[0-9]+]], [[VAL]]
; SI: v_mov_b32_e32 [[VEXTRACT:v[0-9]+]], [[EXTRACT]]
; SI: buffer_store_dword [[VEXTRACT]],

; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+\.[XYZW]]], [[ADDR:T[0-9]+.[XYZW]]]
; EG: ADD_INT
; EG-NEXT: BFE_INT [[RES]], {{.*}}, 0.0, literal
; EG-NEXT: LSHR * [[ADDR]]
define void @sext_in_reg_i8_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i32> %a, <1 x i32> %b) nounwind {
  %c = add <1 x i32> %a, %b ; add to prevent folding into extload
  %shl = shl <1 x i32> %c, <i32 24>
  %ashr = ashr <1 x i32> %shl, <i32 24>
  store <1 x i32> %ashr, <1 x i32> addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}sext_in_reg_i1_to_i64:
; SI: s_lshl_b64 [[VAL:s\[[0-9]+:[0-9]+\]]]
; SI-DAG: s_bfe_i64 s{{\[}}[[SLO:[0-9]+]]:[[SHI:[0-9]+]]{{\]}}, [[VAL]], 0x10000
; SI-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], s[[SLO]]
; SI-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[SHI]]
; SI: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}}
define void @sext_in_reg_i1_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
  %c = shl i64 %a, %b
  %shl = shl i64 %c, 63
  %ashr = ashr i64 %shl, 63
  store i64 %ashr, i64 addrspace(1)* %out, align 8
  ret void
}
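
; A note on the s_bfe immediates checked above: the scalar BFE instructions
; pack the field descriptor as (width << 16) | offset, so 0x10000 is a
; width-1 field at offset 0 (sign-extend bit 0) and 0x80000 below is a
; width-8 field at offset 0. The shl/ashr pairs in the IR are the canonical
; sext_in_reg pattern: (x << (32 - N)) ashr (32 - N) sign-extends the low N
; bits in place.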

; FUNC-LABEL: {{^}}sext_in_reg_i8_to_i64:
; SI: s_lshl_b64 [[VAL:s\[[0-9]+:[0-9]+\]]]
; SI-DAG: s_bfe_i64 s{{\[}}[[SLO:[0-9]+]]:[[SHI:[0-9]+]]{{\]}}, [[VAL]], 0x80000
; SI-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], s[[SLO]]
; SI-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[SHI]]
; SI: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}}

; EG: MEM_{{.*}} STORE_{{.*}} [[RES_LO:T[0-9]+\.[XYZW]]], [[ADDR_LO:T[0-9]+.[XYZW]]]
; EG: MEM_{{.*}} STORE_{{.*}} [[RES_HI:T[0-9]+\.[XYZW]]], [[ADDR_HI:T[0-9]+.[XYZW]]]
; EG: LSHL
; EG: BFE_INT {{\*?}} [[RES_LO]], {{.*}}, 0.0, literal
; EG: ASHR [[RES_HI]]
; EG-NOT: BFE_INT
; EG: LSHR
; EG: LSHR
;; TODO: check the address computation; using | with variables in {{}} does
;; not work, and the _LO/_HI order might differ.
define void @sext_in_reg_i8_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
  %c = shl i64 %a, %b
  %shl = shl i64 %c, 56
  %ashr = ashr i64 %shl, 56
  store i64 %ashr, i64 addrspace(1)* %out, align 8
  ret void
}

; FUNC-LABEL: {{^}}sext_in_reg_i16_to_i64:
; SI: s_lshl_b64 [[VAL:s\[[0-9]+:[0-9]+\]]]
; SI-DAG: s_bfe_i64 s{{\[}}[[SLO:[0-9]+]]:[[SHI:[0-9]+]]{{\]}}, [[VAL]], 0x100000
; SI-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], s[[SLO]]
; SI-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[SHI]]
; SI: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}}

; EG: MEM_{{.*}} STORE_{{.*}} [[RES_LO:T[0-9]+\.[XYZW]]], [[ADDR_LO:T[0-9]+.[XYZW]]]
; EG: MEM_{{.*}} STORE_{{.*}} [[RES_HI:T[0-9]+\.[XYZW]]], [[ADDR_HI:T[0-9]+.[XYZW]]]
; EG: LSHL
; EG: BFE_INT {{\*?}} [[RES_LO]], {{.*}}, 0.0, literal
; EG: ASHR [[RES_HI]]
; EG-NOT: BFE_INT
; EG: LSHR
; EG: LSHR
;; TODO: check the address computation; using | with variables in {{}} does
;; not work, and the _LO/_HI order might differ.
define void @sext_in_reg_i16_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
  %c = shl i64 %a, %b
  %shl = shl i64 %c, 48
  %ashr = ashr i64 %shl, 48
  store i64 %ashr, i64 addrspace(1)* %out, align 8
  ret void
}

; FUNC-LABEL: {{^}}sext_in_reg_i32_to_i64:
; SI: s_lshl_b64 [[VAL:s\[[0-9]+:[0-9]+\]]]
; SI-DAG: s_bfe_i64 s{{\[}}[[SLO:[0-9]+]]:[[SHI:[0-9]+]]{{\]}}, [[VAL]], 0x200000
; SI-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], s[[SLO]]
; SI-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[SHI]]
; SI: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}}

; EG: MEM_{{.*}} STORE_{{.*}} [[RES_LO:T[0-9]+\.[XYZW]]], [[ADDR_LO:T[0-9]+.[XYZW]]]
; EG: MEM_{{.*}} STORE_{{.*}} [[RES_HI:T[0-9]+\.[XYZW]]], [[ADDR_HI:T[0-9]+.[XYZW]]]
; EG-NOT: BFE_INT

; EG: ASHR [[RES_HI]]

; EG: LSHR
; EG: LSHR
;; TODO: check the address computation; using | with variables in {{}} does
;; not work, and the _LO/_HI order might differ.
define void @sext_in_reg_i32_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
  %c = shl i64 %a, %b
  %shl = shl i64 %c, 32
  %ashr = ashr i64 %shl, 32
  store i64 %ashr, i64 addrspace(1)* %out, align 8
  ret void
}
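
; For the i32-wide field, Evergreen needs no BFE at all: the low dword is
; already the sign-extended value, and the high dword is just an arithmetic
; shift of the low dword by 31, which is what the EG-NOT: BFE_INT / ASHR
; checks above verify.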

; This is broken on Evergreen for some reason related to the <1 x i64> kernel arguments.
; XFUNC-LABEL: {{^}}sext_in_reg_i8_to_v1i64:
; XSI: s_bfe_i32 [[EXTRACT:s[0-9]+]], {{s[0-9]+}}, 524288
; XSI: s_ashr_i32 {{v[0-9]+}}, [[EXTRACT]], 31
; XSI: buffer_store_dword
; XEG: BFE_INT
; XEG: ASHR
; define void @sext_in_reg_i8_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i64> %a, <1 x i64> %b) nounwind {
;   %c = add <1 x i64> %a, %b
;   %shl = shl <1 x i64> %c, <i64 56>
;   %ashr = ashr <1 x i64> %shl, <i64 56>
;   store <1 x i64> %ashr, <1 x i64> addrspace(1)* %out, align 8
;   ret void
; }

; FUNC-LABEL: {{^}}v_sext_in_reg_i1_to_i64:
; SI: buffer_load_dwordx2
; SI: v_lshl_b64 v{{\[}}[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]{{\]}}
; SI: v_bfe_i32 v[[LO:[0-9]+]], v[[VAL_LO]], 0, 1
; SI: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]]
; SI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
define void @v_sext_in_reg_i1_to_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind {
  %tid = call i32 @llvm.r600.read.tidig.x()
  %a.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid
  %b.gep = getelementptr i64, i64 addrspace(1)* %bptr, i32 %tid
  %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %tid
  %a = load i64, i64 addrspace(1)* %a.gep, align 8
  %b = load i64, i64 addrspace(1)* %b.gep, align 8

  %c = shl i64 %a, %b
  %shl = shl i64 %c, 63
  %ashr = ashr i64 %shl, 63
  store i64 %ashr, i64 addrspace(1)* %out.gep, align 8
  ret void
}

; FUNC-LABEL: {{^}}v_sext_in_reg_i8_to_i64:
; SI: buffer_load_dwordx2
; SI: v_lshl_b64 v{{\[}}[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]{{\]}}
; SI: v_bfe_i32 v[[LO:[0-9]+]], v[[VAL_LO]], 0, 8
; SI: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]]
; SI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
define void @v_sext_in_reg_i8_to_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind {
  %tid = call i32 @llvm.r600.read.tidig.x()
  %a.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid
  %b.gep = getelementptr i64, i64 addrspace(1)* %bptr, i32 %tid
  %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %tid
  %a = load i64, i64 addrspace(1)* %a.gep, align 8
  %b = load i64, i64 addrspace(1)* %b.gep, align 8

  %c = shl i64 %a, %b
  %shl = shl i64 %c, 56
  %ashr = ashr i64 %shl, 56
  store i64 %ashr, i64 addrspace(1)* %out.gep, align 8
  ret void
}

; FUNC-LABEL: {{^}}v_sext_in_reg_i16_to_i64:
; SI: buffer_load_dwordx2
; SI: v_lshl_b64 v{{\[}}[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]{{\]}}
; SI: v_bfe_i32 v[[LO:[0-9]+]], v[[VAL_LO]], 0, 16
; SI: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]]
; SI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
define void @v_sext_in_reg_i16_to_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind {
  %tid = call i32 @llvm.r600.read.tidig.x()
  %a.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid
  %b.gep = getelementptr i64, i64 addrspace(1)* %bptr, i32 %tid
  %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %tid
  %a = load i64, i64 addrspace(1)* %a.gep, align 8
  %b = load i64, i64 addrspace(1)* %b.gep, align 8

  %c = shl i64 %a, %b
  %shl = shl i64 %c, 48
  %ashr = ashr i64 %shl, 48
  store i64 %ashr, i64 addrspace(1)* %out.gep, align 8
  ret void
}
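
; In the v_ variants above, the value comes from memory and lives in VGPRs,
; so the i64 sext_in_reg is done with a v_bfe_i32 on the low dword plus a
; v_ashrrev_i32 by 31 to materialize the high dword, rather than a single
; scalar s_bfe_i64.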

; FUNC-LABEL: {{^}}v_sext_in_reg_i32_to_i64:
; SI: buffer_load_dwordx2
; SI: v_lshl_b64 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}},
; SI: v_ashrrev_i32_e32 v[[SHR:[0-9]+]], 31, v[[LO]]
; SI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[SHR]]{{\]}}
define void @v_sext_in_reg_i32_to_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind {
  %tid = call i32 @llvm.r600.read.tidig.x()
  %a.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid
  %b.gep = getelementptr i64, i64 addrspace(1)* %bptr, i32 %tid
  %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %tid
  %a = load i64, i64 addrspace(1)* %a.gep, align 8
  %b = load i64, i64 addrspace(1)* %b.gep, align 8

  %c = shl i64 %a, %b
  %shl = shl i64 %c, 32
  %ashr = ashr i64 %shl, 32
  store i64 %ashr, i64 addrspace(1)* %out.gep, align 8
  ret void
}

; FUNC-LABEL: {{^}}sext_in_reg_i1_in_i32_other_amount:
; SI-NOT: s_lshl
; SI-NOT: s_ashr
; SI: s_bfe_i32 {{s[0-9]+}}, {{s[0-9]+}}, 0x190001

; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+\.[XYZW]]], [[ADDR:T[0-9]+.[XYZW]]]
; EG-NOT: BFE
; EG: ADD_INT
; EG: LSHL
; EG: ASHR [[RES]]
; EG: LSHR {{\*?}} [[ADDR]]
define void @sext_in_reg_i1_in_i32_other_amount(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
  %c = add i32 %a, %b
  %x = shl i32 %c, 6
  %y = ashr i32 %x, 7
  store i32 %y, i32 addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}sext_in_reg_v2i1_in_v2i32_other_amount:
; SI-NOT: s_lshl
; SI-NOT: s_ashr
; SI-DAG: s_bfe_i32 {{s[0-9]+}}, {{s[0-9]+}}, 0x190001
; SI-DAG: s_bfe_i32 {{s[0-9]+}}, {{s[0-9]+}}, 0x190001
; SI: s_endpgm

; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+]]{{\.[XYZW][XYZW]}}, [[ADDR:T[0-9]+.[XYZW]]]
; EG-NOT: BFE
; EG: ADD_INT
; EG: LSHL
; EG: ASHR [[RES]]
; EG: LSHL
; EG: ASHR [[RES]]
; EG: LSHR {{\*?}} [[ADDR]]
define void @sext_in_reg_v2i1_in_v2i32_other_amount(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) nounwind {
  %c = add <2 x i32> %a, %b
  %x = shl <2 x i32> %c, <i32 6, i32 6>
  %y = ashr <2 x i32> %x, <i32 7, i32 7>
  store <2 x i32> %y, <2 x i32> addrspace(1)* %out, align 2
  ret void
}
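
; The 0x190001 immediate in the other_amount checks decodes to width 25
; (0x19) at offset 1: (x << 6) ashr 7 drops bit 0 and sign-extends from bit
; 25, so even mismatched shift amounts fold to a single BFE of bits [25:1].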

; FUNC-LABEL: {{^}}sext_in_reg_v2i1_to_v2i32:
; SI: s_bfe_i32 {{s[0-9]+}}, {{s[0-9]+}}, 0x10000
; SI: s_bfe_i32 {{s[0-9]+}}, {{s[0-9]+}}, 0x10000
; SI: buffer_store_dwordx2

; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+]]{{\.[XYZW][XYZW]}}, [[ADDR:T[0-9]+.[XYZW]]]
; EG: BFE_INT [[RES]]
; EG: BFE_INT [[RES]]
; EG: LSHR {{\*?}} [[ADDR]]
define void @sext_in_reg_v2i1_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) nounwind {
  %c = add <2 x i32> %a, %b ; add to prevent folding into extload
  %shl = shl <2 x i32> %c, <i32 31, i32 31>
  %ashr = ashr <2 x i32> %shl, <i32 31, i32 31>
  store <2 x i32> %ashr, <2 x i32> addrspace(1)* %out, align 8
  ret void
}

; FUNC-LABEL: {{^}}sext_in_reg_v4i1_to_v4i32:
; SI: s_bfe_i32 {{s[0-9]+}}, {{s[0-9]+}}, 0x10000
; SI: s_bfe_i32 {{s[0-9]+}}, {{s[0-9]+}}, 0x10000
; SI: s_bfe_i32 {{s[0-9]+}}, {{s[0-9]+}}, 0x10000
; SI: s_bfe_i32 {{s[0-9]+}}, {{s[0-9]+}}, 0x10000
; SI: buffer_store_dwordx4

; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+]]{{\.[XYZW][XYZW][XYZW][XYZW]}}, [[ADDR:T[0-9]+.[XYZW]]]
; EG: BFE_INT [[RES]]
; EG: BFE_INT [[RES]]
; EG: BFE_INT [[RES]]
; EG: BFE_INT [[RES]]
; EG: LSHR {{\*?}} [[ADDR]]
define void @sext_in_reg_v4i1_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b) nounwind {
  %c = add <4 x i32> %a, %b ; add to prevent folding into extload
  %shl = shl <4 x i32> %c, <i32 31, i32 31, i32 31, i32 31>
  %ashr = ashr <4 x i32> %shl, <i32 31, i32 31, i32 31, i32 31>
  store <4 x i32> %ashr, <4 x i32> addrspace(1)* %out, align 8
  ret void
}

; FUNC-LABEL: {{^}}sext_in_reg_v2i8_to_v2i32:
; SI: s_sext_i32_i8 {{s[0-9]+}}, {{s[0-9]+}}
; SI: s_sext_i32_i8 {{s[0-9]+}}, {{s[0-9]+}}
; SI: buffer_store_dwordx2

; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+]]{{\.[XYZW][XYZW]}}, [[ADDR:T[0-9]+.[XYZW]]]
; EG: BFE_INT [[RES]]
; EG: BFE_INT [[RES]]
; EG: LSHR {{\*?}} [[ADDR]]
define void @sext_in_reg_v2i8_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) nounwind {
  %c = add <2 x i32> %a, %b ; add to prevent folding into extload
  %shl = shl <2 x i32> %c, <i32 24, i32 24>
  %ashr = ashr <2 x i32> %shl, <i32 24, i32 24>
  store <2 x i32> %ashr, <2 x i32> addrspace(1)* %out, align 8
  ret void
}

; FUNC-LABEL: {{^}}sext_in_reg_v4i8_to_v4i32:
; SI: s_sext_i32_i8 {{s[0-9]+}}, {{s[0-9]+}}
; SI: s_sext_i32_i8 {{s[0-9]+}}, {{s[0-9]+}}
; SI: s_sext_i32_i8 {{s[0-9]+}}, {{s[0-9]+}}
; SI: s_sext_i32_i8 {{s[0-9]+}}, {{s[0-9]+}}
; SI: buffer_store_dwordx4

; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+]]{{\.[XYZW][XYZW][XYZW][XYZW]}}, [[ADDR:T[0-9]+.[XYZW]]]
; EG: BFE_INT [[RES]]
; EG: BFE_INT [[RES]]
; EG: BFE_INT [[RES]]
; EG: BFE_INT [[RES]]
; EG: LSHR {{\*?}} [[ADDR]]
define void @sext_in_reg_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b) nounwind {
  %c = add <4 x i32> %a, %b ; add to prevent folding into extload
  %shl = shl <4 x i32> %c, <i32 24, i32 24, i32 24, i32 24>
  %ashr = ashr <4 x i32> %shl, <i32 24, i32 24, i32 24, i32 24>
  store <4 x i32> %ashr, <4 x i32> addrspace(1)* %out, align 8
  ret void
}

; FUNC-LABEL: {{^}}sext_in_reg_v2i16_to_v2i32:
; SI: s_sext_i32_i16 {{s[0-9]+}}, {{s[0-9]+}}
; SI: s_sext_i32_i16 {{s[0-9]+}}, {{s[0-9]+}}
; SI: buffer_store_dwordx2

; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+]]{{\.[XYZW][XYZW]}}, [[ADDR:T[0-9]+.[XYZW]]]
; EG: BFE_INT [[RES]]
; EG: BFE_INT [[RES]]
; EG: LSHR {{\*?}} [[ADDR]]
define void @sext_in_reg_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) nounwind {
  %c = add <2 x i32> %a, %b ; add to prevent folding into extload
  %shl = shl <2 x i32> %c, <i32 16, i32 16>
  %ashr = ashr <2 x i32> %shl, <i32 16, i32 16>
  store <2 x i32> %ashr, <2 x i32> addrspace(1)* %out, align 8
  ret void
}
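
; For 8- and 16-bit fields at offset 0, the scalar path selects the dedicated
; s_sext_i32_i8 / s_sext_i32_i16 instructions (one per vector element);
; s_bfe_i32 is only needed for widths without a dedicated instruction, such
; as the i1 and width-25 cases above.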

; FUNC-LABEL: {{^}}testcase:
define void @testcase(i8 addrspace(1)* %out, i8 %a) nounwind {
  %and_a_1 = and i8 %a, 1
  %cmp_eq = icmp eq i8 %and_a_1, 0
  %cmp_slt = icmp slt i8 %a, 0
  %sel0 = select i1 %cmp_slt, i8 0, i8 %a
  %sel1 = select i1 %cmp_eq, i8 0, i8 %a
  %xor = xor i8 %sel0, %sel1
  store i8 %xor, i8 addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}testcase_3:
define void @testcase_3(i8 addrspace(1)* %out, i8 %a) nounwind {
  %and_a_1 = and i8 %a, 1
  %cmp_eq = icmp eq i8 %and_a_1, 0
  %cmp_slt = icmp slt i8 %a, 0
  %sel0 = select i1 %cmp_slt, i8 0, i8 %a
  %sel1 = select i1 %cmp_eq, i8 0, i8 %a
  %xor = xor i8 %sel0, %sel1
  store i8 %xor, i8 addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}vgpr_sext_in_reg_v4i8_to_v4i32:
; SI: v_bfe_i32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 8
; SI: v_bfe_i32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 8
; SI: v_bfe_i32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 8
; SI: v_bfe_i32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 8
define void @vgpr_sext_in_reg_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %a, <4 x i32> addrspace(1)* %b) nounwind {
  %loada = load <4 x i32>, <4 x i32> addrspace(1)* %a, align 16
  %loadb = load <4 x i32>, <4 x i32> addrspace(1)* %b, align 16
  %c = add <4 x i32> %loada, %loadb ; add to prevent folding into extload
  %shl = shl <4 x i32> %c, <i32 24, i32 24, i32 24, i32 24>
  %ashr = ashr <4 x i32> %shl, <i32 24, i32 24, i32 24, i32 24>
  store <4 x i32> %ashr, <4 x i32> addrspace(1)* %out, align 8
  ret void
}

; FUNC-LABEL: {{^}}vgpr_sext_in_reg_v4i16_to_v4i32:
; SI: v_bfe_i32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 16
; SI: v_bfe_i32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 16
define void @vgpr_sext_in_reg_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %a, <4 x i32> addrspace(1)* %b) nounwind {
  %loada = load <4 x i32>, <4 x i32> addrspace(1)* %a, align 16
  %loadb = load <4 x i32>, <4 x i32> addrspace(1)* %b, align 16
  %c = add <4 x i32> %loada, %loadb ; add to prevent folding into extload
  %shl = shl <4 x i32> %c, <i32 16, i32 16, i32 16, i32 16>
  %ashr = ashr <4 x i32> %shl, <i32 16, i32 16, i32 16, i32 16>
  store <4 x i32> %ashr, <4 x i32> addrspace(1)* %out, align 8
  ret void
}

; FIXME: The BFE should really be eliminated: imax with 0 makes the result
; non-negative, so the sign bit is known zero and the final sign extension is
; redundant. That should happen once computeKnownBitsForTargetNode is
; implemented for imax.

; FUNC-LABEL: {{^}}sext_in_reg_to_illegal_type:
; SI: buffer_load_sbyte
; SI: v_max_i32
; SI: v_bfe_i32
; SI: buffer_store_short
define void @sext_in_reg_to_illegal_type(i16 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %src) nounwind {
  %tmp5 = load i8, i8 addrspace(1)* %src, align 1
  %tmp2 = sext i8 %tmp5 to i32
  %tmp3 = tail call i32 @llvm.AMDGPU.imax(i32 %tmp2, i32 0) nounwind readnone
  %tmp4 = trunc i32 %tmp3 to i8
  %tmp6 = sext i8 %tmp4 to i16
  store i16 %tmp6, i16 addrspace(1)* %out, align 2
  ret void
}

declare i32 @llvm.AMDGPU.bfe.i32(i32, i32, i32) nounwind readnone

; FUNC-LABEL: {{^}}bfe_0_width:
; SI-NOT: {{[^@]}}bfe
; SI: s_endpgm
define void @bfe_0_width(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) nounwind {
  %load = load i32, i32 addrspace(1)* %ptr, align 4
  %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %load, i32 8, i32 0) nounwind readnone
  store i32 %bfe, i32 addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}bfe_8_bfe_8:
; SI: v_bfe_i32
; SI-NOT: {{[^@]}}bfe
; SI: s_endpgm
define void @bfe_8_bfe_8(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) nounwind {
  %load = load i32, i32 addrspace(1)* %ptr, align 4
  %bfe0 = call i32 @llvm.AMDGPU.bfe.i32(i32 %load, i32 0, i32 8) nounwind readnone
  %bfe1 = call i32 @llvm.AMDGPU.bfe.i32(i32 %bfe0, i32 0, i32 8) nounwind readnone
  store i32 %bfe1, i32 addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}bfe_8_bfe_16:
; SI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 8
; SI: s_endpgm
define void @bfe_8_bfe_16(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) nounwind {
  %load = load i32, i32 addrspace(1)* %ptr, align 4
  %bfe0 = call i32 @llvm.AMDGPU.bfe.i32(i32 %load, i32 0, i32 8) nounwind readnone
  %bfe1 = call i32 @llvm.AMDGPU.bfe.i32(i32 %bfe0, i32 0, i32 16) nounwind readnone
  store i32 %bfe1, i32 addrspace(1)* %out, align 4
  ret void
}
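
; In bfe_8_bfe_16 above, the inner (0, 8) BFE already sign-extends its result
; to 32 bits, so the following (0, 16) extract is mathematically an identity;
; the check only pins the surviving width-8 v_bfe_i32.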

; FIXME: This really should be folded into a single BFE.
; FUNC-LABEL: {{^}}bfe_16_bfe_8:
; SI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 8
; SI-NOT: {{[^@]}}bfe
; SI: s_endpgm
define void @bfe_16_bfe_8(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) nounwind {
  %load = load i32, i32 addrspace(1)* %ptr, align 4
  %bfe0 = call i32 @llvm.AMDGPU.bfe.i32(i32 %load, i32 0, i32 16) nounwind readnone
  %bfe1 = call i32 @llvm.AMDGPU.bfe.i32(i32 %bfe0, i32 0, i32 8) nounwind readnone
  store i32 %bfe1, i32 addrspace(1)* %out, align 4
  ret void
}

; Make sure there isn't a redundant BFE
; FUNC-LABEL: {{^}}sext_in_reg_i8_to_i32_bfe:
; SI: s_sext_i32_i8 s{{[0-9]+}}, s{{[0-9]+}}
; SI-NOT: {{[^@]}}bfe
; SI: s_endpgm
define void @sext_in_reg_i8_to_i32_bfe(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
  %c = add i32 %a, %b ; add to prevent folding into extload
  %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %c, i32 0, i32 8) nounwind readnone
  %shl = shl i32 %bfe, 24
  %ashr = ashr i32 %shl, 24
  store i32 %ashr, i32 addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}sext_in_reg_i8_to_i32_bfe_wrong:
define void @sext_in_reg_i8_to_i32_bfe_wrong(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
  %c = add i32 %a, %b ; add to prevent folding into extload
  %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %c, i32 8, i32 0) nounwind readnone
  %shl = shl i32 %bfe, 24
  %ashr = ashr i32 %shl, 24
  store i32 %ashr, i32 addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}sextload_i8_to_i32_bfe:
; SI: buffer_load_sbyte
; SI-NOT: {{[^@]}}bfe
; SI: s_endpgm
define void @sextload_i8_to_i32_bfe(i32 addrspace(1)* %out, i8 addrspace(1)* %ptr) nounwind {
  %load = load i8, i8 addrspace(1)* %ptr, align 1
  %sext = sext i8 %load to i32
  %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %sext, i32 0, i32 8) nounwind readnone
  %shl = shl i32 %bfe, 24
  %ashr = ashr i32 %shl, 24
  store i32 %ashr, i32 addrspace(1)* %out, align 4
  ret void
}

; SI: .text
; FUNC-LABEL: {{^}}sextload_i8_to_i32_bfe_0:{{.*$}}
; SI-NOT: {{[^@]}}bfe
; SI: s_endpgm
define void @sextload_i8_to_i32_bfe_0(i32 addrspace(1)* %out, i8 addrspace(1)* %ptr) nounwind {
  %load = load i8, i8 addrspace(1)* %ptr, align 1
  %sext = sext i8 %load to i32
  %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %sext, i32 8, i32 0) nounwind readnone
  %shl = shl i32 %bfe, 24
  %ashr = ashr i32 %shl, 24
  store i32 %ashr, i32 addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}sext_in_reg_i1_bfe_offset_0:
; SI-NOT: shr
; SI-NOT: shl
; SI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 1
; SI: s_endpgm
define void @sext_in_reg_i1_bfe_offset_0(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
  %x = load i32, i32 addrspace(1)* %in, align 4
  %shl = shl i32 %x, 31
  %shr = ashr i32 %shl, 31
  %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %shr, i32 0, i32 1)
  store i32 %bfe, i32 addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}sext_in_reg_i1_bfe_offset_1:
; SI: buffer_load_dword
; SI-NOT: shl
; SI-NOT: shr
; SI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 1, 1
; SI: s_endpgm
define void @sext_in_reg_i1_bfe_offset_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
  %x = load i32, i32 addrspace(1)* %in, align 4
  %shl = shl i32 %x, 30
  %shr = ashr i32 %shl, 30
  %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %shr, i32 1, i32 1)
  store i32 %bfe, i32 addrspace(1)* %out, align 4
  ret void
}
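
; In the i2 case below, the (1, 2) extract of the already sign-extended
; (0, 2) field is not currently combined away, so two v_bfe_i32 operations
; remain.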
; FUNC-LABEL: {{^}}sext_in_reg_i2_bfe_offset_1:
; SI: buffer_load_dword
; SI-NOT: v_lshl
; SI-NOT: v_ashr
; SI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 2
; SI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 1, 2
; SI: s_endpgm
define void @sext_in_reg_i2_bfe_offset_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
  %x = load i32, i32 addrspace(1)* %in, align 4
  %shl = shl i32 %x, 30
  %shr = ashr i32 %shl, 30
  %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %shr, i32 1, i32 2)
  store i32 %bfe, i32 addrspace(1)* %out, align 4
  ret void
}