; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s

declare i32 @llvm.AMDGPU.imax(i32, i32) nounwind readnone
declare i32 @llvm.r600.read.tidig.x() nounwind readnone


; FUNC-LABEL: {{^}}sext_in_reg_i1_i32:
; SI: s_load_dword [[ARG:s[0-9]+]],
; SI: s_bfe_i32 [[SEXTRACT:s[0-9]+]], [[ARG]], 0x10000
; SI: v_mov_b32_e32 [[EXTRACT:v[0-9]+]], [[SEXTRACT]]
; SI: buffer_store_dword [[EXTRACT]],

; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+\.[XYZW]]], [[ADDR:T[0-9]+.[XYZW]]]
; EG: LSHR * [[ADDR]]
; EG: BFE_INT * [[RES]], {{.*}}, 0.0, 1
define void @sext_in_reg_i1_i32(i32 addrspace(1)* %out, i32 %in) {
  %shl = shl i32 %in, 31
  %sext = ashr i32 %shl, 31
  store i32 %sext, i32 addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}sext_in_reg_i8_to_i32:
; SI: s_add_i32 [[VAL:s[0-9]+]],
; SI: s_sext_i32_i8 [[EXTRACT:s[0-9]+]], [[VAL]]
; SI: v_mov_b32_e32 [[VEXTRACT:v[0-9]+]], [[EXTRACT]]
; SI: buffer_store_dword [[VEXTRACT]],

; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+\.[XYZW]]], [[ADDR:T[0-9]+.[XYZW]]]
; EG: ADD_INT
; EG-NEXT: BFE_INT [[RES]], {{.*}}, 0.0, literal
; EG-NEXT: LSHR * [[ADDR]]
define void @sext_in_reg_i8_to_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
  %c = add i32 %a, %b ; add to prevent folding into extload
  %shl = shl i32 %c, 24
  %ashr = ashr i32 %shl, 24
  store i32 %ashr, i32 addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}sext_in_reg_i16_to_i32:
; SI: s_add_i32 [[VAL:s[0-9]+]],
; SI: s_sext_i32_i16 [[EXTRACT:s[0-9]+]], [[VAL]]
; SI: v_mov_b32_e32 [[VEXTRACT:v[0-9]+]], [[EXTRACT]]
; SI: buffer_store_dword [[VEXTRACT]],

; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+\.[XYZW]]], [[ADDR:T[0-9]+.[XYZW]]]
; EG: ADD_INT
; EG-NEXT: BFE_INT [[RES]], {{.*}}, 0.0, literal
; EG-NEXT: LSHR * [[ADDR]]
define void @sext_in_reg_i16_to_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
  %c = add i32 %a, %b ; add to prevent folding into extload
  %shl = shl i32 %c, 16
  %ashr = ashr i32 %shl, 16
  store i32 %ashr, i32 addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}sext_in_reg_i8_to_v1i32:
; SI: s_add_i32 [[VAL:s[0-9]+]],
; SI: s_sext_i32_i8 [[EXTRACT:s[0-9]+]], [[VAL]]
; SI: v_mov_b32_e32 [[VEXTRACT:v[0-9]+]], [[EXTRACT]]
; SI: buffer_store_dword [[VEXTRACT]],

; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+\.[XYZW]]], [[ADDR:T[0-9]+.[XYZW]]]
; EG: ADD_INT
; EG-NEXT: BFE_INT [[RES]], {{.*}}, 0.0, literal
; EG-NEXT: LSHR * [[ADDR]]
define void @sext_in_reg_i8_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i32> %a, <1 x i32> %b) nounwind {
  %c = add <1 x i32> %a, %b ; add to prevent folding into extload
  %shl = shl <1 x i32> %c, <i32 24>
  %ashr = ashr <1 x i32> %shl, <i32 24>
  store <1 x i32> %ashr, <1 x i32> addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}sext_in_reg_i1_to_i64:
; SI: s_lshl_b64 [[VAL:s\[[0-9]+:[0-9]+\]]]
; SI-DAG: s_bfe_i64 s{{\[}}[[SLO:[0-9]+]]:[[SHI:[0-9]+]]{{\]}}, [[VAL]], 0x10000
; SI-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], s[[SLO]]
; SI-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[SHI]]
; SI: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}}
define void @sext_in_reg_i1_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
  %c = shl i64 %a, %b
  %shl = shl i64 %c, 63
  %ashr = ashr i64 %shl, 63
  store i64 %ashr, i64 addrspace(1)* %out, align 8
  ret void
}
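
; A note on the scalar BFE operand used above and below (my reading of the
; SI ISA, not something this test states): s_bfe_i32/s_bfe_i64 pack the
; field descriptor into src1 as (width << 16) | offset, so the DAG's
;   sign_extend_inreg i32 %in, i1   ; formed from the shl 31 / ashr 31 pair
; is expected to select to
;   s_bfe_i32 s0, s1, 0x10000      ; width = 1, offset = 0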

; FUNC-LABEL: {{^}}sext_in_reg_i8_to_i64:
; SI: s_lshl_b64 [[VAL:s\[[0-9]+:[0-9]+\]]]
; SI-DAG: s_bfe_i64 s{{\[}}[[SLO:[0-9]+]]:[[SHI:[0-9]+]]{{\]}}, [[VAL]], 0x80000
; SI-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], s[[SLO]]
; SI-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[SHI]]
; SI: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}}
define void @sext_in_reg_i8_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
  %c = shl i64 %a, %b
  %shl = shl i64 %c, 56
  %ashr = ashr i64 %shl, 56
  store i64 %ashr, i64 addrspace(1)* %out, align 8
  ret void
}

; FUNC-LABEL: {{^}}sext_in_reg_i16_to_i64:
; SI: s_lshl_b64 [[VAL:s\[[0-9]+:[0-9]+\]]]
; SI-DAG: s_bfe_i64 s{{\[}}[[SLO:[0-9]+]]:[[SHI:[0-9]+]]{{\]}}, [[VAL]], 0x100000
; SI-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], s[[SLO]]
; SI-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[SHI]]
; SI: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}}
define void @sext_in_reg_i16_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
  %c = shl i64 %a, %b
  %shl = shl i64 %c, 48
  %ashr = ashr i64 %shl, 48
  store i64 %ashr, i64 addrspace(1)* %out, align 8
  ret void
}

; FUNC-LABEL: {{^}}sext_in_reg_i32_to_i64:
; SI: s_lshl_b64 [[VAL:s\[[0-9]+:[0-9]+\]]]
; SI-DAG: s_bfe_i64 s{{\[}}[[SLO:[0-9]+]]:[[SHI:[0-9]+]]{{\]}}, [[VAL]], 0x200000
; SI-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], s[[SLO]]
; SI-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[SHI]]
; SI: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}}
define void @sext_in_reg_i32_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
  %c = shl i64 %a, %b
  %shl = shl i64 %c, 32
  %ashr = ashr i64 %shl, 32
  store i64 %ashr, i64 addrspace(1)* %out, align 8
  ret void
}
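
; The same (width << 16) | offset packing explains the 64-bit constants
; above, assuming the encoding note earlier in this file is right:
;   0x80000  = 8  << 16   ; sext_in_reg i8 of the shifted value
;   0x100000 = 16 << 16   ; sext_in_reg i16
;   0x200000 = 32 << 16   ; sext_in_reg i32
; with s_bfe_i64 writing the sign-extended result to an SGPR pair.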

; This is broken on Evergreen for some reason related to the <1 x i64> kernel arguments.
; XFUNC-LABEL: {{^}}sext_in_reg_i8_to_v1i64:
; XSI: s_bfe_i32 [[EXTRACT:s[0-9]+]], {{s[0-9]+}}, 524288
; XSI: s_ashr_i32 {{v[0-9]+}}, [[EXTRACT]], 31
; XSI: buffer_store_dword
; XEG: BFE_INT
; XEG: ASHR
; define void @sext_in_reg_i8_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i64> %a, <1 x i64> %b) nounwind {
;   %c = add <1 x i64> %a, %b
;   %shl = shl <1 x i64> %c, <i64 56>
;   %ashr = ashr <1 x i64> %shl, <i64 56>
;   store <1 x i64> %ashr, <1 x i64> addrspace(1)* %out, align 8
;   ret void
; }

; FUNC-LABEL: {{^}}v_sext_in_reg_i1_to_i64:
; SI: buffer_load_dwordx2
; SI: v_lshl_b64 v{{\[}}[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]{{\]}}
; SI: v_bfe_i32 v[[LO:[0-9]+]], v[[VAL_LO]], 0, 1
; SI: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]]
; SI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
define void @v_sext_in_reg_i1_to_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind {
  %tid = call i32 @llvm.r600.read.tidig.x()
  %a.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid
  %b.gep = getelementptr i64, i64 addrspace(1)* %bptr, i32 %tid
  %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %tid
  %a = load i64, i64 addrspace(1)* %a.gep, align 8
  %b = load i64, i64 addrspace(1)* %b.gep, align 8

  %c = shl i64 %a, %b
  %shl = shl i64 %c, 63
  %ashr = ashr i64 %shl, 63
  store i64 %ashr, i64 addrspace(1)* %out.gep, align 8
  ret void
}

; FUNC-LABEL: {{^}}v_sext_in_reg_i8_to_i64:
; SI: buffer_load_dwordx2
; SI: v_lshl_b64 v{{\[}}[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]{{\]}}
; SI: v_bfe_i32 v[[LO:[0-9]+]], v[[VAL_LO]], 0, 8
; SI: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]]
; SI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
define void @v_sext_in_reg_i8_to_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind {
  %tid = call i32 @llvm.r600.read.tidig.x()
  %a.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid
  %b.gep = getelementptr i64, i64 addrspace(1)* %bptr, i32 %tid
  %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %tid
  %a = load i64, i64 addrspace(1)* %a.gep, align 8
  %b = load i64, i64 addrspace(1)* %b.gep, align 8

  %c = shl i64 %a, %b
  %shl = shl i64 %c, 56
  %ashr = ashr i64 %shl, 56
  store i64 %ashr, i64 addrspace(1)* %out.gep, align 8
  ret void
}

; FUNC-LABEL: {{^}}v_sext_in_reg_i16_to_i64:
; SI: buffer_load_dwordx2
; SI: v_lshl_b64 v{{\[}}[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]{{\]}}
; SI: v_bfe_i32 v[[LO:[0-9]+]], v[[VAL_LO]], 0, 16
; SI: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]]
; SI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
define void @v_sext_in_reg_i16_to_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind {
  %tid = call i32 @llvm.r600.read.tidig.x()
  %a.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid
  %b.gep = getelementptr i64, i64 addrspace(1)* %bptr, i32 %tid
  %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %tid
  %a = load i64, i64 addrspace(1)* %a.gep, align 8
  %b = load i64, i64 addrspace(1)* %b.gep, align 8

  %c = shl i64 %a, %b
  %shl = shl i64 %c, 48
  %ashr = ashr i64 %shl, 48
  store i64 %ashr, i64 addrspace(1)* %out.gep, align 8
  ret void
}
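
; There is no v_bfe_i64, so for the divergent cases above the 64-bit
; sext_in_reg is expected to split into two 32-bit VALU ops (a sketch of
; the pattern the checks encode):
;   v_bfe_i32     v[lo], v[val_lo], 0, w   ; sign-extend low w bits
;   v_ashrrev_i32 v[hi], 31, v[lo]         ; replicate sign into high half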

; FUNC-LABEL: {{^}}v_sext_in_reg_i32_to_i64:
; SI: buffer_load_dwordx2
; SI: v_lshl_b64 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}},
; SI: v_ashrrev_i32_e32 v[[SHR:[0-9]+]], 31, v[[LO]]
; SI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[SHR]]{{\]}}
define void @v_sext_in_reg_i32_to_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind {
  %tid = call i32 @llvm.r600.read.tidig.x()
  %a.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid
  %b.gep = getelementptr i64, i64 addrspace(1)* %bptr, i32 %tid
  %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %tid
  %a = load i64, i64 addrspace(1)* %a.gep, align 8
  %b = load i64, i64 addrspace(1)* %b.gep, align 8

  %c = shl i64 %a, %b
  %shl = shl i64 %c, 32
  %ashr = ashr i64 %shl, 32
  store i64 %ashr, i64 addrspace(1)* %out.gep, align 8
  ret void
}

; FUNC-LABEL: {{^}}sext_in_reg_i1_in_i32_other_amount:
; SI-NOT: s_lshl
; SI-NOT: s_ashr
; SI: s_bfe_i32 {{s[0-9]+}}, {{s[0-9]+}}, 0x190001

; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+\.[XYZW]]], [[ADDR:T[0-9]+.[XYZW]]]
; EG-NOT: BFE
; EG: ADD_INT
; EG: LSHL
; EG: ASHR [[RES]]
; EG: LSHR {{\*?}} [[ADDR]]
define void @sext_in_reg_i1_in_i32_other_amount(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
  %c = add i32 %a, %b
  %x = shl i32 %c, 6
  %y = ashr i32 %x, 7
  store i32 %y, i32 addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}sext_in_reg_v2i1_in_v2i32_other_amount:
; SI-NOT: s_lshl
; SI-NOT: s_ashr
; SI-DAG: s_bfe_i32 {{s[0-9]+}}, {{s[0-9]+}}, 0x190001
; SI-DAG: s_bfe_i32 {{s[0-9]+}}, {{s[0-9]+}}, 0x190001
; SI: s_endpgm

; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+]]{{\.[XYZW][XYZW]}}, [[ADDR:T[0-9]+.[XYZW]]]
; EG-NOT: BFE
; EG: ADD_INT
; EG: LSHL
; EG: ASHR [[RES]]
; EG: LSHL
; EG: ASHR [[RES]]
; EG: LSHR {{\*?}} [[ADDR]]
define void @sext_in_reg_v2i1_in_v2i32_other_amount(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) nounwind {
  %c = add <2 x i32> %a, %b
  %x = shl <2 x i32> %c, <i32 6, i32 6>
  %y = ashr <2 x i32> %x, <i32 7, i32 7>
  store <2 x i32> %y, <2 x i32> addrspace(1)* %out
  ret void
}
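
; The "other amount" tests above are not a pure sext_in_reg (shl by 6 but
; ashr by 7), yet the shift pair still folds to one extract: result bit i
; comes from input bit i+1 and the sign is input bit 25, which is exactly
;   s_bfe_i32 s0, s1, 0x190001   ; width = 0x19 = 25, offset = 1
; under the (width << 16) | offset reading noted earlier.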

; FUNC-LABEL: {{^}}sext_in_reg_v2i1_to_v2i32:
; SI: s_bfe_i32 {{s[0-9]+}}, {{s[0-9]+}}, 0x10000
; SI: s_bfe_i32 {{s[0-9]+}}, {{s[0-9]+}}, 0x10000
; SI: buffer_store_dwordx2

; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+]]{{\.[XYZW][XYZW]}}, [[ADDR:T[0-9]+.[XYZW]]]
; EG: BFE_INT [[RES]]
; EG: BFE_INT [[RES]]
; EG: LSHR {{\*?}} [[ADDR]]
define void @sext_in_reg_v2i1_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) nounwind {
  %c = add <2 x i32> %a, %b ; add to prevent folding into extload
  %shl = shl <2 x i32> %c, <i32 31, i32 31>
  %ashr = ashr <2 x i32> %shl, <i32 31, i32 31>
  store <2 x i32> %ashr, <2 x i32> addrspace(1)* %out, align 8
  ret void
}

; FUNC-LABEL: {{^}}sext_in_reg_v4i1_to_v4i32:
; SI: s_bfe_i32 {{s[0-9]+}}, {{s[0-9]+}}, 0x10000
; SI: s_bfe_i32 {{s[0-9]+}}, {{s[0-9]+}}, 0x10000
; SI: s_bfe_i32 {{s[0-9]+}}, {{s[0-9]+}}, 0x10000
; SI: s_bfe_i32 {{s[0-9]+}}, {{s[0-9]+}}, 0x10000
; SI: buffer_store_dwordx4

; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+]]{{\.[XYZW][XYZW][XYZW][XYZW]}}, [[ADDR:T[0-9]+.[XYZW]]]
; EG: BFE_INT [[RES]]
; EG: BFE_INT [[RES]]
; EG: BFE_INT [[RES]]
; EG: BFE_INT [[RES]]
; EG: LSHR {{\*?}} [[ADDR]]
define void @sext_in_reg_v4i1_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b) nounwind {
  %c = add <4 x i32> %a, %b ; add to prevent folding into extload
  %shl = shl <4 x i32> %c, <i32 31, i32 31, i32 31, i32 31>
  %ashr = ashr <4 x i32> %shl, <i32 31, i32 31, i32 31, i32 31>
  store <4 x i32> %ashr, <4 x i32> addrspace(1)* %out, align 8
  ret void
}

; FUNC-LABEL: {{^}}sext_in_reg_v2i8_to_v2i32:
; SI: s_sext_i32_i8 {{s[0-9]+}}, {{s[0-9]+}}
; SI: s_sext_i32_i8 {{s[0-9]+}}, {{s[0-9]+}}
; SI: buffer_store_dwordx2

; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+]]{{\.[XYZW][XYZW]}}, [[ADDR:T[0-9]+.[XYZW]]]
; EG: BFE_INT [[RES]]
; EG: BFE_INT [[RES]]
; EG: LSHR {{\*?}} [[ADDR]]
define void @sext_in_reg_v2i8_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) nounwind {
  %c = add <2 x i32> %a, %b ; add to prevent folding into extload
  %shl = shl <2 x i32> %c, <i32 24, i32 24>
  %ashr = ashr <2 x i32> %shl, <i32 24, i32 24>
  store <2 x i32> %ashr, <2 x i32> addrspace(1)* %out, align 8
  ret void
}

; FUNC-LABEL: {{^}}sext_in_reg_v4i8_to_v4i32:
; SI: s_sext_i32_i8 {{s[0-9]+}}, {{s[0-9]+}}
; SI: s_sext_i32_i8 {{s[0-9]+}}, {{s[0-9]+}}
; SI: s_sext_i32_i8 {{s[0-9]+}}, {{s[0-9]+}}
; SI: s_sext_i32_i8 {{s[0-9]+}}, {{s[0-9]+}}
; SI: buffer_store_dwordx4

; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+]]{{\.[XYZW][XYZW][XYZW][XYZW]}}, [[ADDR:T[0-9]+.[XYZW]]]
; EG: BFE_INT [[RES]]
; EG: BFE_INT [[RES]]
; EG: BFE_INT [[RES]]
; EG: BFE_INT [[RES]]
; EG: LSHR {{\*?}} [[ADDR]]
define void @sext_in_reg_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b) nounwind {
  %c = add <4 x i32> %a, %b ; add to prevent folding into extload
  %shl = shl <4 x i32> %c, <i32 24, i32 24, i32 24, i32 24>
  %ashr = ashr <4 x i32> %shl, <i32 24, i32 24, i32 24, i32 24>
  store <4 x i32> %ashr, <4 x i32> addrspace(1)* %out, align 8
  ret void
}

; FUNC-LABEL: {{^}}sext_in_reg_v2i16_to_v2i32:
; SI: s_sext_i32_i16 {{s[0-9]+}}, {{s[0-9]+}}
; SI: s_sext_i32_i16 {{s[0-9]+}}, {{s[0-9]+}}
; SI: buffer_store_dwordx2

; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+]]{{\.[XYZW][XYZW]}}, [[ADDR:T[0-9]+.[XYZW]]]
; EG: BFE_INT [[RES]]
; EG: BFE_INT [[RES]]
; EG: LSHR {{\*?}} [[ADDR]]
define void @sext_in_reg_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) nounwind {
  %c = add <2 x i32> %a, %b ; add to prevent folding into extload
  %shl = shl <2 x i32> %c, <i32 16, i32 16>
  %ashr = ashr <2 x i32> %shl, <i32 16, i32 16>
  store <2 x i32> %ashr, <2 x i32> addrspace(1)* %out, align 8
  ret void
}

; FUNC-LABEL: {{^}}testcase:
define void @testcase(i8 addrspace(1)* %out, i8 %a) nounwind {
  %and_a_1 = and i8 %a, 1
  %cmp_eq = icmp eq i8 %and_a_1, 0
  %cmp_slt = icmp slt i8 %a, 0
  %sel0 = select i1 %cmp_slt, i8 0, i8 %a
  %sel1 = select i1 %cmp_eq, i8 0, i8 %a
  %xor = xor i8 %sel0, %sel1
  store i8 %xor, i8 addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}testcase_3:
define void @testcase_3(i8 addrspace(1)* %out, i8 %a) nounwind {
  %and_a_1 = and i8 %a, 1
  %cmp_eq = icmp eq i8 %and_a_1, 0
  %cmp_slt = icmp slt i8 %a, 0
  %sel0 = select i1 %cmp_slt, i8 0, i8 %a
  %sel1 = select i1 %cmp_eq, i8 0, i8 %a
  %xor = xor i8 %sel0, %sel1
  store i8 %xor, i8 addrspace(1)* %out
  ret void
}
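
; testcase and testcase_3 above have identical bodies; both are kept as
; regression tests. My reading of why they belong in this file: i8 is an
; illegal type here, so 'icmp slt i8 %a, 0' has to read the sign bit of a
; byte held in a 32-bit register, which legalization can express as a
; sign_extend_inreg (or a 1-bit bfe at offset 7) feeding the compare.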

; FUNC-LABEL: {{^}}vgpr_sext_in_reg_v4i8_to_v4i32:
; SI: v_bfe_i32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 8
; SI: v_bfe_i32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 8
; SI: v_bfe_i32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 8
; SI: v_bfe_i32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 8
define void @vgpr_sext_in_reg_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %a, <4 x i32> addrspace(1)* %b) nounwind {
  %loada = load <4 x i32>, <4 x i32> addrspace(1)* %a, align 16
  %loadb = load <4 x i32>, <4 x i32> addrspace(1)* %b, align 16
  %c = add <4 x i32> %loada, %loadb ; add to prevent folding into extload
  %shl = shl <4 x i32> %c, <i32 24, i32 24, i32 24, i32 24>
  %ashr = ashr <4 x i32> %shl, <i32 24, i32 24, i32 24, i32 24>
  store <4 x i32> %ashr, <4 x i32> addrspace(1)* %out, align 8
  ret void
}

; FUNC-LABEL: {{^}}vgpr_sext_in_reg_v4i16_to_v4i32:
; SI: v_bfe_i32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 16
; SI: v_bfe_i32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 16
define void @vgpr_sext_in_reg_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %a, <4 x i32> addrspace(1)* %b) nounwind {
  %loada = load <4 x i32>, <4 x i32> addrspace(1)* %a, align 16
  %loadb = load <4 x i32>, <4 x i32> addrspace(1)* %b, align 16
  %c = add <4 x i32> %loada, %loadb ; add to prevent folding into extload
  %shl = shl <4 x i32> %c, <i32 16, i32 16, i32 16, i32 16>
  %ashr = ashr <4 x i32> %shl, <i32 16, i32 16, i32 16, i32 16>
  store <4 x i32> %ashr, <4 x i32> addrspace(1)* %out, align 8
  ret void
}

; FUNC-LABEL: {{^}}sext_in_reg_to_illegal_type:
; SI: buffer_load_sbyte
; SI: v_max_i32
; SI-NOT: bfe
; SI: buffer_store_short
define void @sext_in_reg_to_illegal_type(i16 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %src) nounwind {
  %tmp5 = load i8, i8 addrspace(1)* %src, align 1
  %tmp2 = sext i8 %tmp5 to i32
  %tmp2.5 = icmp sgt i32 %tmp2, 0
  %tmp3 = select i1 %tmp2.5, i32 %tmp2, i32 0
  %tmp4 = trunc i32 %tmp3 to i8
  %tmp6 = sext i8 %tmp4 to i16
  store i16 %tmp6, i16 addrspace(1)* %out, align 2
  ret void
}

declare i32 @llvm.AMDGPU.bfe.i32(i32, i32, i32) nounwind readnone

; FUNC-LABEL: {{^}}bfe_0_width:
; SI-NOT: {{[^@]}}bfe
; SI: s_endpgm
define void @bfe_0_width(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) nounwind {
  %load = load i32, i32 addrspace(1)* %ptr, align 4
  %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %load, i32 8, i32 0) nounwind readnone
  store i32 %bfe, i32 addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}bfe_8_bfe_8:
; SI: v_bfe_i32
; SI-NOT: {{[^@]}}bfe
; SI: s_endpgm
define void @bfe_8_bfe_8(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) nounwind {
  %load = load i32, i32 addrspace(1)* %ptr, align 4
  %bfe0 = call i32 @llvm.AMDGPU.bfe.i32(i32 %load, i32 0, i32 8) nounwind readnone
  %bfe1 = call i32 @llvm.AMDGPU.bfe.i32(i32 %bfe0, i32 0, i32 8) nounwind readnone
  store i32 %bfe1, i32 addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}bfe_8_bfe_16:
; SI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 8
; SI: s_endpgm
define void @bfe_8_bfe_16(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) nounwind {
  %load = load i32, i32 addrspace(1)* %ptr, align 4
  %bfe0 = call i32 @llvm.AMDGPU.bfe.i32(i32 %load, i32 0, i32 8) nounwind readnone
  %bfe1 = call i32 @llvm.AMDGPU.bfe.i32(i32 %bfe0, i32 0, i32 16) nounwind readnone
  store i32 %bfe1, i32 addrspace(1)* %out, align 4
  ret void
}
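
; Why bfe_8_bfe_16 above folds to a single width-8 extract: the inner
; bfe.i32(%load, 0, 8) already replicates bit 7 into bits 8..31, so the
; outer 16-bit sign-extend sees bit 15 == bit 7 and is a no-op:
;   bfe.i32(bfe.i32(x, 0, 8), 0, 16) == bfe.i32(x, 0, 8)
; The same reasoning gives the single-instruction expectation for
; bfe_16_bfe_8 below, where the narrower outer extract wins.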

; This really should be folded into 1
; FUNC-LABEL: {{^}}bfe_16_bfe_8:
; SI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 8
; SI-NOT: {{[^@]}}bfe
; SI: s_endpgm
define void @bfe_16_bfe_8(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) nounwind {
  %load = load i32, i32 addrspace(1)* %ptr, align 4
  %bfe0 = call i32 @llvm.AMDGPU.bfe.i32(i32 %load, i32 0, i32 16) nounwind readnone
  %bfe1 = call i32 @llvm.AMDGPU.bfe.i32(i32 %bfe0, i32 0, i32 8) nounwind readnone
  store i32 %bfe1, i32 addrspace(1)* %out, align 4
  ret void
}

; Make sure there isn't a redundant BFE
; FUNC-LABEL: {{^}}sext_in_reg_i8_to_i32_bfe:
; SI: s_sext_i32_i8 s{{[0-9]+}}, s{{[0-9]+}}
; SI-NOT: {{[^@]}}bfe
; SI: s_endpgm
define void @sext_in_reg_i8_to_i32_bfe(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
  %c = add i32 %a, %b ; add to prevent folding into extload
  %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %c, i32 0, i32 8) nounwind readnone
  %shl = shl i32 %bfe, 24
  %ashr = ashr i32 %shl, 24
  store i32 %ashr, i32 addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}sext_in_reg_i8_to_i32_bfe_wrong:
define void @sext_in_reg_i8_to_i32_bfe_wrong(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
  %c = add i32 %a, %b ; add to prevent folding into extload
  %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %c, i32 8, i32 0) nounwind readnone
  %shl = shl i32 %bfe, 24
  %ashr = ashr i32 %shl, 24
  store i32 %ashr, i32 addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}sextload_i8_to_i32_bfe:
; SI: buffer_load_sbyte
; SI-NOT: {{[^@]}}bfe
; SI: s_endpgm
define void @sextload_i8_to_i32_bfe(i32 addrspace(1)* %out, i8 addrspace(1)* %ptr) nounwind {
  %load = load i8, i8 addrspace(1)* %ptr, align 1
  %sext = sext i8 %load to i32
  %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %sext, i32 0, i32 8) nounwind readnone
  %shl = shl i32 %bfe, 24
  %ashr = ashr i32 %shl, 24
  store i32 %ashr, i32 addrspace(1)* %out, align 4
  ret void
}

; SI: .text
; FUNC-LABEL: {{^}}sextload_i8_to_i32_bfe_0:{{.*$}}
; SI-NOT: {{[^@]}}bfe
; SI: s_endpgm
define void @sextload_i8_to_i32_bfe_0(i32 addrspace(1)* %out, i8 addrspace(1)* %ptr) nounwind {
  %load = load i8, i8 addrspace(1)* %ptr, align 1
  %sext = sext i8 %load to i32
  %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %sext, i32 8, i32 0) nounwind readnone
  %shl = shl i32 %bfe, 24
  %ashr = ashr i32 %shl, 24
  store i32 %ashr, i32 addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}sext_in_reg_i1_bfe_offset_0:
; SI-NOT: shr
; SI-NOT: shl
; SI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 1
; SI: s_endpgm
define void @sext_in_reg_i1_bfe_offset_0(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
  %x = load i32, i32 addrspace(1)* %in, align 4
  %shl = shl i32 %x, 31
  %shr = ashr i32 %shl, 31
  %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %shr, i32 0, i32 1)
  store i32 %bfe, i32 addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}sext_in_reg_i1_bfe_offset_1:
; SI: buffer_load_dword
; SI-NOT: shl
; SI-NOT: shr
; SI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 1, 1
; SI: s_endpgm
define void @sext_in_reg_i1_bfe_offset_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
  %x = load i32, i32 addrspace(1)* %in, align 4
  %shl = shl i32 %x, 30
  %shr = ashr i32 %shl, 30
  %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %shr, i32 1, i32 1)
  store i32 %bfe, i32 addrspace(1)* %out, align 4
  ret void
}
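
; Worked example for the offset_1 cases: shl 30 / ashr 30 is sext_in_reg
; i2, so bit 1 of %shr is already the i2 sign bit; extracting 1 bit at
; offset 1 and sign-extending is then the same as applying
;   v_bfe_i32 v0, v1, 1, 1
; directly to the loaded value, with no shifts left over, which is what
; the sext_in_reg_i1_bfe_offset_1 checks above require.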

; FUNC-LABEL: {{^}}sext_in_reg_i2_bfe_offset_1:
; SI: buffer_load_dword
; SI-NOT: v_lshl
; SI-NOT: v_ashr
; SI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 2
; SI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 1, 2
; SI: s_endpgm
define void @sext_in_reg_i2_bfe_offset_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
  %x = load i32, i32 addrspace(1)* %in, align 4
  %shl = shl i32 %x, 30
  %shr = ashr i32 %shl, 30
  %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %shr, i32 1, i32 2)
  store i32 %bfe, i32 addrspace(1)* %out, align 4
  ret void
}

; Make sure we propagate the VALUness to users of a moved scalar BFE.

; FUNC-LABEL: {{^}}v_sext_in_reg_i1_to_i64_move_use:
; SI: buffer_load_dwordx2
; SI: v_lshl_b64 v{{\[}}[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]{{\]}}
; SI-DAG: v_bfe_i32 v[[LO:[0-9]+]], v[[VAL_LO]], 0, 1
; SI-DAG: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]]
; SI-DAG: v_and_b32_e32 v[[RESULT_LO:[0-9]+]], s{{[0-9]+}}, v[[LO]]
; SI-DAG: v_and_b32_e32 v[[RESULT_HI:[0-9]+]], s{{[0-9]+}}, v[[HI]]
; SI: buffer_store_dwordx2 v{{\[}}[[RESULT_LO]]:[[RESULT_HI]]{{\]}}
define void @v_sext_in_reg_i1_to_i64_move_use(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr, i64 %s.val) nounwind {
  %tid = call i32 @llvm.r600.read.tidig.x()
  %a.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid
  %b.gep = getelementptr i64, i64 addrspace(1)* %bptr, i32 %tid
  %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %tid
  %a = load i64, i64 addrspace(1)* %a.gep, align 8
  %b = load i64, i64 addrspace(1)* %b.gep, align 8

  %c = shl i64 %a, %b
  %shl = shl i64 %c, 63
  %ashr = ashr i64 %shl, 63

  %and = and i64 %ashr, %s.val
  store i64 %and, i64 addrspace(1)* %out.gep, align 8
  ret void
}

; FUNC-LABEL: {{^}}v_sext_in_reg_i32_to_i64_move_use:
; SI: buffer_load_dwordx2
; SI: v_lshl_b64 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}},
; SI-DAG: v_ashrrev_i32_e32 v[[SHR:[0-9]+]], 31, v[[LO]]
; SI-DAG: v_and_b32_e32 v[[RESULT_LO:[0-9]+]], s{{[0-9]+}}, v[[LO]]
; SI-DAG: v_and_b32_e32 v[[RESULT_HI:[0-9]+]], s{{[0-9]+}}, v[[SHR]]
; SI: buffer_store_dwordx2 v{{\[}}[[RESULT_LO]]:[[RESULT_HI]]{{\]}}
define void @v_sext_in_reg_i32_to_i64_move_use(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr, i64 %s.val) nounwind {
  %tid = call i32 @llvm.r600.read.tidig.x()
  %a.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid
  %b.gep = getelementptr i64, i64 addrspace(1)* %bptr, i32 %tid
  %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %tid
  %a = load i64, i64 addrspace(1)* %a.gep, align 8
  %b = load i64, i64 addrspace(1)* %b.gep, align 8

  %c = shl i64 %a, %b
  %shl = shl i64 %c, 32
  %ashr = ashr i64 %shl, 32
  %and = and i64 %ashr, %s.val
  store i64 %and, i64 addrspace(1)* %out.gep, align 8
  ret void
}