; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
; FIXME: Fails with -enable-var-scope

; Make sure 64-bit BFE pattern does a 32-bit BFE on the relevant half.

; Each kernel below loads an i64, shifts it right by a constant and masks the
; result. The checks verify which dword(s) are loaded and which 32-bit
; instruction implements the extract.

; Extract the high bit of the low half
; (lshr 31, mask 1: expects a load of the low dword and a 32-bit shift by 31.)
; GCN-LABEL: {{^}}v_uextract_bit_31_i64:
; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; GCN-DAG: v_lshrrev_b32_e32 v[[SHIFT:[0-9]+]], 31, [[VAL]]
; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
; GCN: buffer_store_dwordx2 v{{\[}}[[SHIFT]]:[[ZERO]]{{\]}}
define amdgpu_kernel void @v_uextract_bit_31_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
  %id.x = tail call i32 @llvm.amdgcn.workgroup.id.x()
  %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
  %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
  %ld.64 = load i64, i64 addrspace(1)* %in.gep
  %srl = lshr i64 %ld.64, 31
  %bit = and i64 %srl, 1
  store i64 %bit, i64 addrspace(1)* %out.gep
  ret void
}

; Extract the high bit of the high half
; (lshr 63, mask 1: expects a load of the high dword (offset:4) and a 32-bit
; shift by 31.)
; GCN-LABEL: {{^}}v_uextract_bit_63_i64:
; GCN: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; GCN-DAG: v_lshrrev_b32_e32 v[[SHIFT:[0-9]+]], 31, [[VAL]]
; GCN-DAG: v_mov_b32_e32 v[[ZERO1:[0-9]+]], v[[ZERO]]
; GCN: buffer_store_dwordx2 v{{\[}}[[SHIFT]]:[[ZERO1]]{{\]}}
define amdgpu_kernel void @v_uextract_bit_63_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
  %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
  %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
  %ld.64 = load i64, i64 addrspace(1)* %in.gep
  %srl = lshr i64 %ld.64, 63
  %bit = and i64 %srl, 1
  store i64 %bit, i64 addrspace(1)* %out.gep
  ret void
}

; Single bit 1 of the low half (lshr 1, mask 1): expects v_bfe_u32 with
; offset 1, width 1 on the low dword.
; GCN-LABEL: {{^}}v_uextract_bit_1_i64:
; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 1, 1
; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO]]{{\]}}
define amdgpu_kernel void @v_uextract_bit_1_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
  %id.x = tail call i32 @llvm.amdgcn.workgroup.id.x()
  %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
  %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
  %ld.64 = load i64, i64 addrspace(1)* %in.gep
  %srl = lshr i64 %ld.64, 1
  %bit = and i64 %srl, 1
  store i64 %bit, i64 addrspace(1)* %out.gep
  ret void
}

; Single bit 20 of the low half (lshr 20, mask 1): expects v_bfe_u32 with
; offset 20, width 1 on the low dword.
; GCN-LABEL: {{^}}v_uextract_bit_20_i64:
; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 20, 1
; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO]]{{\]}}
define amdgpu_kernel void @v_uextract_bit_20_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
  %id.x = tail call i32 @llvm.amdgcn.workgroup.id.x()
  %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
  %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
  %ld.64 = load i64, i64 addrspace(1)* %in.gep
  %srl = lshr i64 %ld.64, 20
  %bit = and i64 %srl, 1
  store i64 %bit, i64 addrspace(1)* %out.gep
  ret void
}

; Lowest bit of the high half (lshr 32, mask 1): expects a load of the high
; dword (offset:4) followed by a plain 32-bit and with 1.
; GCN-LABEL: {{^}}v_uextract_bit_32_i64:
; GCN-DAG: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; GCN-DAG: v_and_b32_e32 v[[AND:[0-9]+]], 1, [[VAL]]
; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
; GCN-DAG: v_mov_b32_e32 v[[ZERO1:[0-9]+]], v[[ZERO]]{{$}}
; GCN: buffer_store_dwordx2 v{{\[}}[[AND]]:[[ZERO1]]{{\]}}
define amdgpu_kernel void @v_uextract_bit_32_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
  %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
  %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
  %ld.64 = load i64, i64 addrspace(1)* %in.gep
  %srl = lshr i64 %ld.64, 32
  %bit = and i64 %srl, 1
  store i64 %bit, i64 addrspace(1)* %out.gep
  ret void
}

; Single bit 33 (lshr 33, mask 1): expects v_bfe_u32 with offset 1, width 1
; on the high dword (offset:4).
; GCN-LABEL: {{^}}v_uextract_bit_33_i64:
; GCN: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 1, 1{{$}}
; GCN-DAG: v_mov_b32_e32 v[[ZERO1:[0-9]+]], v[[ZERO]]
; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO1]]{{\]}}
define amdgpu_kernel void @v_uextract_bit_33_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
  %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
  %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
  %ld.64 = load i64, i64 addrspace(1)* %in.gep
  %srl = lshr i64 %ld.64, 33
  %bit = and i64 %srl, 1
  store i64 %bit, i64 addrspace(1)* %out.gep
  ret void
}

; Two-bit field at bits 20-21 (lshr 20, mask 3): expects v_bfe_u32 with
; offset 20, width 2 on the low dword.
; GCN-LABEL: {{^}}v_uextract_bit_20_21_i64:
; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 20, 2
; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO]]{{\]}}
define amdgpu_kernel void @v_uextract_bit_20_21_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
  %id.x = tail call i32 @llvm.amdgcn.workgroup.id.x()
  %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
  %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
  %ld.64 = load i64, i64 addrspace(1)* %in.gep
  %srl = lshr i64 %ld.64, 20
  %bit = and i64 %srl, 3
  store i64 %bit, i64 addrspace(1)* %out.gep
  ret void
}

; 30-bit field at bits 1-30 (lshr 1, mask 0x3fffffff): expects v_bfe_u32 with
; offset 1, width 30 on the low dword.
; GCN-LABEL: {{^}}v_uextract_bit_1_30_i64:
; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 1, 30
; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO]]{{\]}}
define amdgpu_kernel void @v_uextract_bit_1_30_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
  %id.x = tail call i32 @llvm.amdgcn.workgroup.id.x()
  %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
  %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
  %ld.64 = load i64, i64 addrspace(1)* %in.gep
  %srl = lshr i64 %ld.64, 1
  %bit = and i64 %srl, 1073741823
  store i64 %bit, i64 addrspace(1)* %out.gep
  ret void
}

; 31-bit field at bits 1-31 (lshr 1, mask 0x7fffffff): the field reaches the
; top of the low dword, so a plain 32-bit shift by 1 is expected instead of a
; bitfield extract.
; GCN-LABEL: {{^}}v_uextract_bit_1_31_i64:
; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; GCN-DAG: v_lshrrev_b32_e32 v[[SHIFT:[0-9]+]], 1, [[VAL]]
; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
; GCN: buffer_store_dwordx2 v{{\[}}[[SHIFT]]:[[ZERO]]{{\]}}
define amdgpu_kernel void @v_uextract_bit_1_31_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
  %id.x = tail call i32 @llvm.amdgcn.workgroup.id.x()
  %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
  %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
  %ld.64 = load i64, i64 addrspace(1)* %in.gep
  %srl = lshr i64 %ld.64, 1
  %bit = and i64 %srl, 2147483647
  store i64 %bit, i64 addrspace(1)* %out.gep
  ret void
}

; Spans the dword boundary, so requires full shift.
; Truncated after the shift, so only low shift result is used.
; GCN-LABEL: {{^}}v_uextract_bit_31_32_i64:
; GCN: buffer_load_dwordx2 v{{\[}}[[VALLO:[0-9]+]]:[[VALHI:[0-9]+]]{{\]}}
; GCN: v_alignbit_b32 v[[SHRLO:[0-9]+]], v[[VALHI]], v[[VALLO]], 31
; GCN-DAG: v_and_b32_e32 v[[AND:[0-9]+]], 3, v[[SHRLO]]{{$}}
; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
; GCN: buffer_store_dwordx2 v{{\[}}[[AND]]:[[ZERO]]{{\]}}
define amdgpu_kernel void @v_uextract_bit_31_32_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
  %id.x = tail call i32 @llvm.amdgcn.workgroup.id.x()
  %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
  %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
  %ld.64 = load i64, i64 addrspace(1)* %in.gep
  %srl = lshr i64 %ld.64, 31
  %bit = and i64 %srl, 3
  store i64 %bit, i64 addrspace(1)* %out.gep
  ret void
}

; Two-bit field entirely inside the high half: expects v_bfe_u32 with
; offset 1, width 2 on the high dword (offset:4).
; NOTE: the IR shifts by 33 with mask 3, i.e. it selects bits 33-34 rather
; than the 32-33 the function name suggests; the checks match the IR.
; GCN-LABEL: {{^}}v_uextract_bit_32_33_i64:
; GCN: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 1, 2
; GCN-DAG: v_mov_b32_e32 v[[ZERO1:[0-9]+]], v[[ZERO]]
; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO1]]{{\]}}
define amdgpu_kernel void @v_uextract_bit_32_33_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
  %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
  %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
  %ld.64 = load i64, i64 addrspace(1)* %in.gep
  %srl = lshr i64 %ld.64, 33
  %bit = and i64 %srl, 3
  store i64 %bit, i64 addrspace(1)* %out.gep
  ret void
}

; 30-bit field starting at bit 30 (lshr 30, mask 0x3fffffff, i.e. bits
; 30-59): it straddles the dword boundary, so both dwords are loaded and
; v_alignbit_b32 plus a 32-bit and are expected.
; GCN-LABEL: {{^}}v_uextract_bit_30_60_i64:
; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
; GCN: buffer_load_dwordx2 v{{\[}}[[VALLO:[0-9]+]]:[[VALHI:[0-9]+]]{{\]}}
; GCN: v_alignbit_b32 v[[SHRLO:[0-9]+]], v[[VALHI]], v[[VALLO]], 30
; GCN-DAG: v_and_b32_e32 v[[AND:[0-9]+]], 0x3fffffff, v[[SHRLO]]{{$}}
; GCN-DAG: v_mov_b32_e32 v[[ZERO1:[0-9]+]], v[[ZERO]]
; GCN: buffer_store_dwordx2 v{{\[}}[[AND]]:[[ZERO1]]{{\]}}
define amdgpu_kernel void @v_uextract_bit_30_60_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
  %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
  %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
  %ld.64 = load i64, i64 addrspace(1)* %in.gep
  %srl = lshr i64 %ld.64, 30
  %bit = and i64 %srl, 1073741823
  store i64 %bit, i64 addrspace(1)* %out.gep
  ret void
}

; 30-bit field starting at bit 33 (lshr 33, mask 0x3fffffff, i.e. bits
; 33-62): expects v_bfe_u32 with offset 1, width 30 on the high dword.
; GCN-LABEL: {{^}}v_uextract_bit_33_63_i64:
; GCN: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 1, 30
; GCN-DAG: v_mov_b32_e32 v[[ZERO1:[0-9]+]], v[[ZERO]]
; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO1]]{{\]}}
define amdgpu_kernel void @v_uextract_bit_33_63_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
  %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
  %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
  %ld.64 = load i64, i64 addrspace(1)* %in.gep
  %srl = lshr i64 %ld.64, 33
  %bit = and i64 %srl, 1073741823
  store i64 %bit, i64 addrspace(1)* %out.gep
  ret void
}

; 32-bit field starting at bit 31 (lshr 31, mask 0xffffffff): the low result
; dword comes straight from v_alignbit_b32, with no separate and expected.
; NOTE: this kernel stores through %out directly; %out.gep is computed but
; not used.
; GCN-LABEL: {{^}}v_uextract_bit_31_63_i64:
; GCN: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
; GCN: buffer_load_dwordx2 v{{\[}}[[VALLO:[0-9]+]]:[[VALHI:[0-9]+]]{{\]}}
; GCN: v_alignbit_b32 v[[SHRLO:[0-9]+]], v[[VALHI]], v[[VALLO]], 31
; GCN: buffer_store_dwordx2 v{{\[}}[[SHRLO]]:[[ZERO]]{{\]}}
define amdgpu_kernel void @v_uextract_bit_31_63_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
  %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
  %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
  %ld.64 = load i64, i64 addrspace(1)* %in.gep
  %srl = lshr i64 %ld.64, 31
  %and = and i64 %srl, 4294967295
  store i64 %and, i64 addrspace(1)* %out
  ret void
}

; trunc applied before and mask
; (lshr 31, trunc to i32, mask 1: expects a low-dword load, a 32-bit shift by
; 31 and a single-dword store.)
; GCN-LABEL: {{^}}v_uextract_bit_31_i64_trunc_i32:
; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; GCN-DAG: v_lshrrev_b32_e32 v[[SHIFT:[0-9]+]], 31, [[VAL]]
; GCN: buffer_store_dword v[[SHIFT]]
define amdgpu_kernel void @v_uextract_bit_31_i64_trunc_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
  %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
  %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %id.x
  %ld.64 = load i64, i64 addrspace(1)* %in.gep
  %srl = lshr i64 %ld.64, 31
  %trunc = trunc i64 %srl to i32
  %bit = and i32 %trunc, 1
  store i32 %bit, i32 addrspace(1)* %out.gep
  ret void
}

; Bit 3 with the trunc before the mask: expects v_bfe_u32 with offset 3,
; width 1 on the low dword and a single-dword store.
; GCN-LABEL: {{^}}v_uextract_bit_3_i64_trunc_i32:
; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; GCN: v_bfe_u32 [[BFE:v[0-9]+]], [[VAL]], 3, 1{{$}}
; GCN: buffer_store_dword [[BFE]]
define amdgpu_kernel void @v_uextract_bit_3_i64_trunc_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
  %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
  %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %id.x
  %ld.64 = load i64, i64 addrspace(1)* %in.gep
  %srl = lshr i64 %ld.64, 3
  %trunc = trunc i64 %srl to i32
  %bit = and i32 %trunc, 1
  store i32 %bit, i32 addrspace(1)* %out.gep
  ret void
}

; Bit 33 with the trunc before the mask: expects v_bfe_u32 with offset 1,
; width 1 on the high dword (offset:4) and a single-dword store.
; GCN-LABEL: {{^}}v_uextract_bit_33_i64_trunc_i32:
; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; GCN: v_bfe_u32 [[BFE:v[0-9]+]], [[VAL]], 1, 1{{$}}
; GCN: buffer_store_dword [[BFE]]
define amdgpu_kernel void @v_uextract_bit_33_i64_trunc_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
  %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
  %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %id.x
  %ld.64 = load i64, i64 addrspace(1)* %in.gep
  %srl = lshr i64 %ld.64, 33
  %trunc = trunc i64 %srl to i32
  %bit = and i32 %trunc, 1
  store i32 %bit, i32 addrspace(1)* %out.gep
  ret void
}

; Bits 31-32 with the trunc before the mask: the field crosses the dword
; boundary, so both dwords are loaded; v_alignbit_b32 is expected to be
; followed immediately by an in-place and with 3, with no further mention of
; that register before the single-dword store.
; GCN-LABEL: {{^}}v_uextract_bit_31_32_i64_trunc_i32:
; GCN: buffer_load_dwordx2 v{{\[}}[[VALLO:[0-9]+]]:[[VALHI:[0-9]+]]{{\]}}
; GCN: v_alignbit_b32 v[[SHRLO:[0-9]+]], v[[VALHI]], v[[VALLO]], 31
; GCN-NEXT: v_and_b32_e32 v[[SHRLO]], 3, v[[SHRLO]]
; GCN-NOT: v[[SHRLO]]
; GCN: buffer_store_dword v[[SHRLO]]
define amdgpu_kernel void @v_uextract_bit_31_32_i64_trunc_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
  %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
  %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %id.x
  %ld.64 = load i64, i64 addrspace(1)* %in.gep
  %srl = lshr i64 %ld.64, 31
  %trunc = trunc i64 %srl to i32
  %bit = and i32 %trunc, 3
  store i32 %bit, i32 addrspace(1)* %out.gep
  ret void
}

; The and constant here is 4, which is not a contiguous run of low bits, so
; no bitfield extract is expected: the checks look for a 32-bit shift by 20
; followed by a 32-bit and with 4.
; GCN-LABEL: {{^}}and_not_mask_i64:
; GCN-DAG: buffer_load_dwordx2 v{{\[}}[[VALLO:[0-9]+]]:[[VALHI:[0-9]+]]{{\]}}
; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
; GCN-DAG: v_mov_b32_e32 v[[SHRHI:[0-9]+]], v[[ZERO]]{{$}}
; GCN: v_lshrrev_b32_e32 [[SHR:v[0-9]+]], 20, v[[VALLO]]
; GCN-DAG: v_and_b32_e32 v[[SHRLO:[0-9]+]], 4, [[SHR]]
; GCN-NOT: v[[SHRLO]]
; GCN-NOT: v[[SHRHI]]
; GCN: buffer_store_dwordx2 v{{\[}}[[SHRLO]]:[[SHRHI]]{{\]}}
define amdgpu_kernel void @and_not_mask_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
  %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
  %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
  %ld.64 = load i64, i64 addrspace(1)* %in.gep
  %srl = lshr i64 %ld.64, 20
  %bit = and i64 %srl, 4
  store i64 %bit, i64 addrspace(1)* %out.gep
  ret void
}

; The instruction count is the same with/without hasOneUse, but
; keeping the 32-bit and has a smaller encoding size than the bfe.

; The 64-bit shift result is stored as well as the masked value, so the
; shift has two uses: a full v_lshr_b64 by 27 plus a 32-bit and with 3 are
; expected. Both volatile stores go through %out; %out.gep is unused.
; GCN-LABEL: {{^}}v_uextract_bit_27_29_multi_use_shift_i64:
; GCN-DAG: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]]
; GCN-DAG: v_lshr_b64 v{{\[}}[[SHRLO:[0-9]+]]:[[SHRHI:[0-9]+]]{{\]}}, [[VAL]], 27
; GCN-DAG: v_and_b32_e32 v[[AND:[0-9]+]], 3, v[[SHRLO]]
; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
; GCN: buffer_store_dwordx2 v{{\[}}[[SHRLO]]:[[SHRHI]]{{\]}}
; GCN: buffer_store_dwordx2 v{{\[}}[[AND]]:[[ZERO]]{{\]}}
define amdgpu_kernel void @v_uextract_bit_27_29_multi_use_shift_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
  %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
  %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
  %ld.64 = load i64, i64 addrspace(1)* %in.gep
  %srl = lshr i64 %ld.64, 27
  %bit = and i64 %srl, 3
  store volatile i64 %srl, i64 addrspace(1)* %out
  store volatile i64 %bit, i64 addrspace(1)* %out
  ret void
}

; Multi-use shift that lies entirely in the high half (lshr 34, mask 7):
; only the high dword is loaded, and both a 32-bit shift by 2 and a
; v_bfe_u32 with offset 2, width 3 are expected, each stored with a zero
; high dword. Both volatile stores go through %out; %out.gep is unused.
; GCN-LABEL: {{^}}v_uextract_bit_34_37_multi_use_shift_i64:
; GCN-DAG: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; GCN-DAG: v_mov_b32_e32 v[[ZERO_SHR:[0-9]+]], 0{{$}}
; GCN: v_mov_b32_e32 v[[ZERO_BFE:[0-9]+]], v[[ZERO_SHR]]
; GCN-DAG: v_lshrrev_b32_e32 v[[SHR:[0-9]+]], 2, [[VAL]]
; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 2, 3
; GCN-DAG: buffer_store_dwordx2 v{{\[}}[[SHR]]:[[ZERO_SHR]]{{\]}}
; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO_BFE]]{{\]}}
define amdgpu_kernel void @v_uextract_bit_34_37_multi_use_shift_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
  %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
  %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
  %ld.64 = load i64, i64 addrspace(1)* %in.gep
  %srl = lshr i64 %ld.64, 34
  %bit = and i64 %srl, 7
  store volatile i64 %srl, i64 addrspace(1)* %out
  store volatile i64 %bit, i64 addrspace(1)* %out
  ret void
}

; The masked field (lshr 33, mask 7) is stored as an i64, and the upper half
; of the same shift result (a further lshr by 32, truncated to i32) is stored
; separately. The checks expect v_bfe_u32 with offset 1, width 3 for the
; field and a zero register for the upper-half store.
; GCN-LABEL: {{^}}v_uextract_bit_33_36_use_upper_half_shift_i64:
; GCN-DAG: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 1, 3
; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:{{[0-9]+\]}}
; GCN: buffer_store_dword v[[ZERO]]
define amdgpu_kernel void @v_uextract_bit_33_36_use_upper_half_shift_i64(i64 addrspace(1)* %out0, i32 addrspace(1)* %out1, i64 addrspace(1)* %in) #1 {
  %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
  %out0.gep = getelementptr i64, i64 addrspace(1)* %out0, i32 %id.x
  %out1.gep = getelementptr i32, i32 addrspace(1)* %out1, i32 %id.x
  %ld.64 = load i64, i64 addrspace(1)* %in.gep
  %srl = lshr i64 %ld.64, 33
  %bit = and i64 %srl, 7
  store volatile i64 %bit, i64 addrspace(1)* %out0.gep

  %srl.srl32 = lshr i64 %srl, 32
  %srl.hi = trunc i64 %srl.srl32 to i32
  store volatile i32 %srl.hi, i32 addrspace(1)* %out1.gep
  ret void
}

; Intrinsics used by the kernels in this file to form the per-kernel index.
declare i32 @llvm.amdgcn.workitem.id.x() #0

declare i32 @llvm.amdgcn.workgroup.id.x() #0

; #0 is attached to the intrinsic declarations, #1 to every kernel.
attributes #0 = { nounwind readnone }
attributes #1 = { nounwind }