; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -march=amdgcn -mcpu=tahiti < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -march=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,GFX89 %s
; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX89 %s

; Test expansion of scalar selects on vectors.
; Evergreen not enabled since it seems to be having problems with doubles.

; GCN-LABEL: {{^}}v_select_v2i8:
; SI: v_cndmask_b32
; SI-NOT: cndmask

; GFX9: v_cndmask_b32
; GFX9-NOT: cndmask

; This is worse when i16 is legal and packed is not because
; SelectionDAGBuilder for some reason changes the select type.
; VI: v_cndmask_b32
; VI: v_cndmask_b32
define amdgpu_kernel void @v_select_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> addrspace(1)* %a.ptr, <2 x i8> addrspace(1)* %b.ptr, i32 %c) #0 {
  %a = load <2 x i8>, <2 x i8> addrspace(1)* %a.ptr, align 2
  %b = load <2 x i8>, <2 x i8> addrspace(1)* %b.ptr, align 2
  %cmp = icmp eq i32 %c, 0
  %select = select i1 %cmp, <2 x i8> %a, <2 x i8> %b
  store <2 x i8> %select, <2 x i8> addrspace(1)* %out, align 2
  ret void
}

; GCN-LABEL: {{^}}v_select_v4i8:
; GCN: v_cndmask_b32_e32
; GCN-NOT: cndmask
define amdgpu_kernel void @v_select_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %a.ptr, <4 x i8> addrspace(1)* %b.ptr, i32 %c) #0 {
  %a = load <4 x i8>, <4 x i8> addrspace(1)* %a.ptr
  %b = load <4 x i8>, <4 x i8> addrspace(1)* %b.ptr
  %cmp = icmp eq i32 %c, 0
  %select = select i1 %cmp, <4 x i8> %a, <4 x i8> %b
  store <4 x i8> %select, <4 x i8> addrspace(1)* %out, align 4
  ret void
}

; GCN-LABEL: {{^}}v_select_v8i8:
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN-NOT: cndmask
define amdgpu_kernel void @v_select_v8i8(<8 x i8> addrspace(1)* %out, <8 x i8> addrspace(1)* %a.ptr, <8 x i8> addrspace(1)* %b.ptr, i32 %c) #0 {
  %a = load <8 x i8>, <8 x i8> addrspace(1)* %a.ptr
  %b = load <8 x i8>, <8 x i8> addrspace(1)* %b.ptr
  %cmp = icmp eq i32 %c, 0
  %select = select i1 %cmp, <8 x i8> %a, <8 x i8> %b
  store <8 x i8> %select, <8 x i8> addrspace(1)* %out, align 4
  ret void
}

; GCN-LABEL: {{^}}v_select_v16i8:
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN-NOT: cndmask
define amdgpu_kernel void @v_select_v16i8(<16 x i8> addrspace(1)* %out, <16 x i8> addrspace(1)* %a.ptr, <16 x i8> addrspace(1)* %b.ptr, i32 %c) #0 {
  %a = load <16 x i8>, <16 x i8> addrspace(1)* %a.ptr
  %b = load <16 x i8>, <16 x i8> addrspace(1)* %b.ptr
  %cmp = icmp eq i32 %c, 0
  %select = select i1 %cmp, <16 x i8> %a, <16 x i8> %b
  store <16 x i8> %select, <16 x i8> addrspace(1)* %out, align 4
  ret void
}

; GCN-LABEL: {{^}}select_v4i8:
; GFX89: s_cselect_b32
; GFX89-NOT: s_cselect_b32

; SI: v_cndmask_b32
; SI-NOT: cndmask
define amdgpu_kernel void @select_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> %a, <4 x i8> %b, i8 %c) #0 {
  %cmp = icmp eq i8 %c, 0
  %select = select i1 %cmp, <4 x i8> %a, <4 x i8> %b
  store <4 x i8> %select, <4 x i8> addrspace(1)* %out, align 4
  ret void
}

; GCN-LABEL: {{^}}select_v2i16:
; GFX89: s_load_dword
; GFX89: s_load_dword
; GFX89: s_load_dword
; GFX89: s_cselect_b32
; GFX89-NOT: s_cselect_b32

; SI: v_cndmask_b32_e32
; FIXME(review): this was "v_cndmask_b32e" (missing underscore), a pattern
; that can never match, making the negative check vacuous.
; SI-NOT: v_cndmask_b32_e
define amdgpu_kernel void @select_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b, i32 %c) #0 {
  %cmp = icmp eq i32 %c, 0
  %select = select i1 %cmp, <2 x i16> %a, <2 x i16> %b
  store <2 x i16> %select, <2 x i16> addrspace(1)* %out, align 4
  ret void
}

; GCN-LABEL: {{^}}v_select_v2i16:
; GCN: buffer_load_dword v
; GCN: buffer_load_dword v
; GCN: v_cndmask_b32
; GCN-NOT: cndmask
define amdgpu_kernel void @v_select_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %a.ptr, <2 x i16> addrspace(1)* %b.ptr, i32 %c) #0 {
  %a = load <2 x i16>, <2 x i16> addrspace(1)* %a.ptr
  %b = load <2 x i16>, <2 x i16> addrspace(1)* %b.ptr
  %cmp = icmp eq i32 %c, 0
  %select = select i1 %cmp, <2 x i16> %a, <2 x i16> %b
  store <2 x i16> %select, <2 x i16> addrspace(1)* %out, align 4
  ret void
}

; GCN-LABEL: {{^}}v_select_v3i16:
; SI: v_cndmask_b32_e32
; SI: cndmask
; SI-NOT: cndmask

; GFX89: v_cndmask_b32_e32
; GFX89: cndmask
; VI: cndmask
; GFX89-NOT: cndmask
define amdgpu_kernel void @v_select_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> addrspace(1)* %a.ptr, <3 x i16> addrspace(1)* %b.ptr, i32 %c) #0 {
  %a = load <3 x i16>, <3 x i16> addrspace(1)* %a.ptr
  %b = load <3 x i16>, <3 x i16> addrspace(1)* %b.ptr
  %cmp = icmp eq i32 %c, 0
  %select = select i1 %cmp, <3 x i16> %a, <3 x i16> %b
  store <3 x i16> %select, <3 x i16> addrspace(1)* %out, align 4
  ret void
}

; GCN-LABEL: {{^}}v_select_v4i16:
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN-NOT: cndmask
define amdgpu_kernel void @v_select_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %a.ptr, <4 x i16> addrspace(1)* %b.ptr, i32 %c) #0 {
  %a = load <4 x i16>, <4 x i16> addrspace(1)* %a.ptr
  %b = load <4 x i16>, <4 x i16> addrspace(1)* %b.ptr
  %cmp = icmp eq i32 %c, 0
  %select = select i1 %cmp, <4 x i16> %a, <4 x i16> %b
  store <4 x i16> %select, <4 x i16> addrspace(1)* %out, align 4
  ret void
}

; GCN-LABEL: {{^}}v_select_v8i16:
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN-NOT: cndmask
define amdgpu_kernel void @v_select_v8i16(<8 x i16> addrspace(1)* %out, <8 x i16> addrspace(1)* %a.ptr, <8 x i16> addrspace(1)* %b.ptr, i32 %c) #0 {
  %a = load <8 x i16>, <8 x i16> addrspace(1)* %a.ptr
  %b = load <8 x i16>, <8 x i16> addrspace(1)* %b.ptr
  %cmp = icmp eq i32 %c, 0
  %select = select i1 %cmp, <8 x i16> %a, <8 x i16> %b
  store <8 x i16> %select, <8 x i16> addrspace(1)* %out, align 4
  ret void
}

; FIXME: Expansion with bitwise operations may be better if doing a
; vector select with SGPR inputs.

; GCN-LABEL: {{^}}s_select_v2i32:
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: buffer_store_dwordx2
define amdgpu_kernel void @s_select_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b, i32 %c) #0 {
  %cmp = icmp eq i32 %c, 0
  %select = select i1 %cmp, <2 x i32> %a, <2 x i32> %b
  store <2 x i32> %select, <2 x i32> addrspace(1)* %out, align 8
  ret void
}

; GCN-LABEL: {{^}}s_select_v4i32:
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: buffer_store_dwordx4
define amdgpu_kernel void @s_select_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b, i32 %c) #0 {
  %cmp = icmp eq i32 %c, 0
  %select = select i1 %cmp, <4 x i32> %a, <4 x i32> %b
  store <4 x i32> %select, <4 x i32> addrspace(1)* %out, align 16
  ret void
}

; GCN-LABEL: {{^}}v_select_v4i32:
; GCN: buffer_load_dwordx4
; GCN: v_cmp_lt_u32_e64 vcc, s{{[0-9]+}}, 32
; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
; GCN: buffer_store_dwordx4
define amdgpu_kernel void @v_select_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %cond) #0 {
bb:
  %tmp2 = icmp ult i32 %cond, 32
  %val = load <4 x i32>, <4 x i32> addrspace(1)* %in
  %tmp3 = select i1 %tmp2, <4 x i32> %val, <4 x i32> zeroinitializer
  store <4 x i32> %tmp3, <4 x i32> addrspace(1)* %out, align 16
  ret void
}

; GCN-LABEL: {{^}}select_v8i32:
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
define amdgpu_kernel void @select_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> %a, <8 x i32> %b, i32 %c) #0 {
  %cmp = icmp eq i32 %c, 0
  %select = select i1 %cmp, <8 x i32> %a, <8 x i32> %b
  store <8 x i32> %select, <8 x i32> addrspace(1)* %out, align 16
  ret void
}

; GCN-LABEL: {{^}}s_select_v2f32:
; GCN-DAG: s_load_dwordx2 s{{\[}}[[ALO:[0-9]+]]:[[AHI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}}
; GCN-DAG: s_load_dwordx2 s{{\[}}[[BLO:[0-9]+]]:[[BHI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xd|0x34}}

; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[AHI]]
; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[BHI]]
; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[ALO]]
; GCN-DAG: v_cmp_eq_u32_e64 vcc, s{{[0-9]+}}, 0{{$}}

; GCN-DAG: v_cndmask_b32_e32
; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[BLO]]
; GCN-DAG: v_cndmask_b32_e32
; GCN: buffer_store_dwordx2
define amdgpu_kernel void @s_select_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b, i32 %c) #0 {
  %cmp = icmp eq i32 %c, 0
  %select = select i1 %cmp, <2 x float> %a, <2 x float> %b
  store <2 x float> %select, <2 x float> addrspace(1)* %out, align 16
  ret void
}

; GCN-LABEL: {{^}}s_select_v3f32:
; GCN: v_cmp_eq_u32_e64 vcc, s{{[0-9]+}}, 0{{$}}

; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32

; GCN: buffer_store_dwordx
define amdgpu_kernel void @s_select_v3f32(<3 x float> addrspace(1)* %out, <3 x float> %a, <3 x float> %b, i32 %c) #0 {
  %cmp = icmp eq i32 %c, 0
  %select = select i1 %cmp, <3 x float> %a, <3 x float> %b
  store <3 x float> %select, <3 x float> addrspace(1)* %out, align 16
  ret void
}

; GCN-LABEL: {{^}}s_select_v4f32:
; GCN: s_load_dwordx4
; GCN: s_load_dwordx4
; GCN: v_cmp_eq_u32_e64 vcc, s{{[0-9]+}}, 0{{$}}

; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32

; GCN: buffer_store_dwordx4
define amdgpu_kernel void @s_select_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, <4 x float> %b, i32 %c) #0 {
  %cmp = icmp eq i32 %c, 0
  %select = select i1 %cmp, <4 x float> %a, <4 x float> %b
  store <4 x float> %select, <4 x float> addrspace(1)* %out, align 16
  ret void
}

; GCN-LABEL: {{^}}v_select_v4f32:
; GCN: buffer_load_dwordx4
; GCN: v_cmp_lt_u32_e64 vcc, s{{[0-9]+}}, 32
; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
; GCN: buffer_store_dwordx4
define amdgpu_kernel void @v_select_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in, i32 %cond) #0 {
bb:
  %tmp2 = icmp ult i32 %cond, 32
  %val = load <4 x float>, <4 x float> addrspace(1)* %in
  %tmp3 = select i1 %tmp2, <4 x float> %val, <4 x float> zeroinitializer
  store <4 x float> %tmp3, <4 x float> addrspace(1)* %out, align 16
  ret void
}

; GCN-LABEL: {{^}}s_select_v5f32:
; GCN: v_cmp_eq_u32_e64 vcc, s{{[0-9]+}}, 0{{$}}

; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32

; GCN: buffer_store_dwordx
define amdgpu_kernel void @s_select_v5f32(<5 x float> addrspace(1)* %out, <5 x float> %a, <5 x float> %b, i32 %c) #0 {
  %cmp = icmp eq i32 %c, 0
  %select = select i1 %cmp, <5 x float> %a, <5 x float> %b
  store <5 x float> %select, <5 x float> addrspace(1)* %out, align 16
  ret void
}

; GCN-LABEL: {{^}}select_v8f32:
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
define amdgpu_kernel void @select_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, <8 x float> %b, i32 %c) #0 {
  %cmp = icmp eq i32 %c, 0
  %select = select i1 %cmp, <8 x float> %a, <8 x float> %b
  store <8 x float> %select, <8 x float> addrspace(1)* %out, align 16
  ret void
}

; GCN-LABEL: {{^}}select_v2f64:
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
define amdgpu_kernel void @select_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b, i32 %c) #0 {
  %cmp = icmp eq i32 %c, 0
  %select = select i1 %cmp, <2 x double> %a, <2 x double> %b
  store <2 x double> %select, <2 x double> addrspace(1)* %out, align 16
  ret void
}

; GCN-LABEL: {{^}}select_v4f64:
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
define amdgpu_kernel void @select_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b, i32 %c) #0 {
  %cmp = icmp eq i32 %c, 0
  %select = select i1 %cmp, <4 x double> %a, <4 x double> %b
  store <4 x double> %select, <4 x double> addrspace(1)* %out, align 16
  ret void
}

; GCN-LABEL: {{^}}select_v8f64:
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
define amdgpu_kernel void @select_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, <8 x double> %b, i32 %c) #0 {
  %cmp = icmp eq i32 %c, 0
  %select = select i1 %cmp, <8 x double> %a, <8 x double> %b
  store <8 x double> %select, <8 x double> addrspace(1)* %out, align 16
  ret void
}

; GCN-LABEL: {{^}}v_select_v2f16:
; GCN: v_cndmask_b32
; GCN-NOT: cndmask
define amdgpu_kernel void @v_select_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %a.ptr, <2 x half> addrspace(1)* %b.ptr, i32 %c) #0 {
  %a = load <2 x half>, <2 x half> addrspace(1)* %a.ptr
  %b = load <2 x half>, <2 x half> addrspace(1)* %b.ptr
  %cmp = icmp eq i32 %c, 0
  %select = select i1 %cmp, <2 x half> %a, <2 x half> %b
  store <2 x half> %select, <2 x half> addrspace(1)* %out, align 4
  ret void
}

; GCN-LABEL: {{^}}v_select_v3f16:
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN-NOT: cndmask
define amdgpu_kernel void @v_select_v3f16(<3 x half> addrspace(1)* %out, <3 x half> addrspace(1)* %a.ptr, <3 x half> addrspace(1)* %b.ptr, i32 %c) #0 {
  %a = load <3 x half>, <3 x half> addrspace(1)* %a.ptr
  %b = load <3 x half>, <3 x half> addrspace(1)* %b.ptr
  %cmp = icmp eq i32 %c, 0
  %select = select i1 %cmp, <3 x half> %a, <3 x half> %b
  store <3 x half> %select, <3 x half> addrspace(1)* %out, align 4
  ret void
}

; GCN-LABEL: {{^}}v_select_v4f16:
; GCN: v_cndmask_b32_e32
; GCN: v_cndmask_b32_e32
; GCN-NOT: cndmask
define amdgpu_kernel void @v_select_v4f16(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %a.ptr, <4 x half> addrspace(1)* %b.ptr, i32 %c) #0 {
  %a = load <4 x half>, <4 x half> addrspace(1)* %a.ptr
  %b = load <4 x half>, <4 x half> addrspace(1)* %b.ptr
  %cmp = icmp eq i32 %c, 0
  %select = select i1 %cmp, <4 x half> %a, <4 x half> %b
  store <4 x half> %select, <4 x half> addrspace(1)* %out, align 4
  ret void
}

; Function Attrs: nounwind readnone
declare i32 @llvm.amdgcn.workitem.id.x() #1

attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }