; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel -amdgpu-codegenprepare-widen-16-bit-ops=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GFX6 %s
; RUN: llc -global-isel -amdgpu-codegenprepare-widen-16-bit-ops=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s

define amdgpu_ps i32 @s_orn2_i32(i32 inreg %src0, i32 inreg %src1) {
; GCN-LABEL: s_orn2_i32:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_orn2_b32 s0, s2, s3
; GCN-NEXT:    ; return to shader part epilog
  %not.src1 = xor i32 %src1, -1
  %or = or i32 %src0, %not.src1
  ret i32 %or
}

define amdgpu_ps i32 @s_orn2_i32_commute(i32 inreg %src0, i32 inreg %src1) {
; GCN-LABEL: s_orn2_i32_commute:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_orn2_b32 s0, s2, s3
; GCN-NEXT:    ; return to shader part epilog
  %not.src1 = xor i32 %src1, -1
  %or = or i32 %not.src1, %src0
  ret i32 %or
}

define amdgpu_ps { i32, i32 } @s_orn2_i32_multi_use(i32 inreg %src0, i32 inreg %src1) {
; GCN-LABEL: s_orn2_i32_multi_use:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_not_b32 s1, s3
; GCN-NEXT:    s_orn2_b32 s0, s2, s3
; GCN-NEXT:    ; return to shader part epilog
  %not.src1 = xor i32 %src1, -1
  %or = or i32 %src0, %not.src1
  %insert.0 = insertvalue { i32, i32 } undef, i32 %or, 0
  %insert.1 = insertvalue { i32, i32 } %insert.0, i32 %not.src1, 1
  ret { i32, i32 } %insert.1
}

define amdgpu_ps { i32, i32 } @s_orn2_i32_multi_foldable_use(i32 inreg %src0, i32 inreg %src1, i32 inreg %src2) {
; GCN-LABEL: s_orn2_i32_multi_foldable_use:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_orn2_b32 s0, s2, s4
; GCN-NEXT:    s_orn2_b32 s1, s3, s4
; GCN-NEXT:    ; return to shader part epilog
  %not.src2 = xor i32 %src2, -1
  %or0 = or i32 %src0, %not.src2
  %or1 = or i32 %src1, %not.src2
  %insert.0 = insertvalue { i32, i32 } undef, i32 %or0, 0
  %insert.1 = insertvalue { i32, i32 } %insert.0, i32 %or1, 1
  ret { i32, i32 } %insert.1
}

define i32 @v_orn2_i32(i32 %src0, i32 %src1) {
; GCN-LABEL: v_orn2_i32:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_xor_b32_e32 v1, -1, v1
; GCN-NEXT:    v_or_b32_e32 v0, v0, v1
; GCN-NEXT:    s_setpc_b64 s[30:31]
  %not.src1 = xor i32 %src1, -1
  %or = or i32 %src0, %not.src1
  ret i32 %or
}

define amdgpu_ps float @v_orn2_i32_sv(i32 inreg %src0, i32 %src1) {
; GCN-LABEL: v_orn2_i32_sv:
; GCN:       ; %bb.0:
; GCN-NEXT:    v_xor_b32_e32 v0, -1, v0
; GCN-NEXT:    v_or_b32_e32 v0, s2, v0
; GCN-NEXT:    ; return to shader part epilog
  %not.src1 = xor i32 %src1, -1
  %or = or i32 %src0, %not.src1
  %cast = bitcast i32 %or to float
  ret float %cast
}

define amdgpu_ps float @v_orn2_i32_vs(i32 %src0, i32 inreg %src1) {
; GCN-LABEL: v_orn2_i32_vs:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_not_b32 s0, s2
; GCN-NEXT:    v_or_b32_e32 v0, s0, v0
; GCN-NEXT:    ; return to shader part epilog
  %not.src1 = xor i32 %src1, -1
  %or = or i32 %src0, %not.src1
  %cast = bitcast i32 %or to float
  ret float %cast
}

define amdgpu_ps i64 @s_orn2_i64(i64 inreg %src0, i64 inreg %src1) {
; GCN-LABEL: s_orn2_i64:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_orn2_b64 s[0:1], s[2:3], s[4:5]
; GCN-NEXT:    ; return to shader part epilog
  %not.src1 = xor i64 %src1, -1
  %or = or i64 %src0, %not.src1
  ret i64 %or
}

define amdgpu_ps i64 @s_orn2_i64_commute(i64 inreg %src0, i64 inreg %src1) {
; GCN-LABEL: s_orn2_i64_commute:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_orn2_b64 s[0:1], s[2:3], s[4:5]
; GCN-NEXT:    ; return to shader part epilog
  %not.src1 = xor i64 %src1, -1
  %or = or i64 %not.src1, %src0
  ret i64 %or
}

define amdgpu_ps { i64, i64 } @s_orn2_i64_multi_foldable_use(i64 inreg %src0, i64 inreg %src1, i64 inreg %src2) {
; GCN-LABEL: s_orn2_i64_multi_foldable_use:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_orn2_b64 s[0:1], s[2:3], s[6:7]
; GCN-NEXT:    s_orn2_b64 s[2:3], s[4:5], s[6:7]
; GCN-NEXT:    ; return to shader part epilog
  %not.src2 = xor i64 %src2, -1
  %or0 = or i64 %src0, %not.src2
  %or1 = or i64 %src1, %not.src2
  %insert.0 = insertvalue { i64, i64 } undef, i64 %or0, 0
  %insert.1 = insertvalue { i64, i64 } %insert.0, i64 %or1, 1
  ret { i64, i64 } %insert.1
}

define amdgpu_ps { i64, i64 } @s_orn2_i64_multi_use(i64 inreg %src0, i64 inreg %src1) {
; GCN-LABEL: s_orn2_i64_multi_use:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_not_b64 s[6:7], s[4:5]
; GCN-NEXT:    s_orn2_b64 s[0:1], s[2:3], s[4:5]
; GCN-NEXT:    s_mov_b32 s2, s6
; GCN-NEXT:    s_mov_b32 s3, s7
; GCN-NEXT:    ; return to shader part epilog
  %not.src1 = xor i64 %src1, -1
  %or = or i64 %src0, %not.src1
  %insert.0 = insertvalue { i64, i64 } undef, i64 %or, 0
  %insert.1 = insertvalue { i64, i64 } %insert.0, i64 %not.src1, 1
  ret { i64, i64 } %insert.1
}

define i64 @v_orn2_i64(i64 %src0, i64 %src1) {
; GCN-LABEL: v_orn2_i64:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_xor_b32_e32 v2, -1, v2
; GCN-NEXT:    v_xor_b32_e32 v3, -1, v3
; GCN-NEXT:    v_or_b32_e32 v0, v0, v2
; GCN-NEXT:    v_or_b32_e32 v1, v1, v3
; GCN-NEXT:    s_setpc_b64 s[30:31]
  %not.src1 = xor i64 %src1, -1
  %or = or i64 %src0, %not.src1
  ret i64 %or
}

define amdgpu_ps <2 x float> @v_orn2_i64_sv(i64 inreg %src0, i64 %src1) {
; GCN-LABEL: v_orn2_i64_sv:
; GCN:       ; %bb.0:
; GCN-NEXT:    v_xor_b32_e32 v0, -1, v0
; GCN-NEXT:    v_xor_b32_e32 v1, -1, v1
; GCN-NEXT:    v_or_b32_e32 v0, s2, v0
; GCN-NEXT:    v_or_b32_e32 v1, s3, v1
; GCN-NEXT:    ; return to shader part epilog
  %not.src1 = xor i64 %src1, -1
  %or = or i64 %src0, %not.src1
  %cast = bitcast i64 %or to <2 x float>
  ret <2 x float> %cast
}

define amdgpu_ps <2 x float> @v_orn2_i64_vs(i64 %src0, i64 inreg %src1) {
; GCN-LABEL: v_orn2_i64_vs:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_not_b64 s[0:1], s[2:3]
; GCN-NEXT:    v_or_b32_e32 v0, s0, v0
; GCN-NEXT:    v_or_b32_e32 v1, s1, v1
; GCN-NEXT:    ; return to shader part epilog
  %not.src1 = xor i64 %src1, -1
  %or = or i64 %src0, %not.src1
  %cast = bitcast i64 %or to <2 x float>
  ret <2 x float> %cast
}

define amdgpu_ps <2 x i32> @s_orn2_v2i32(<2 x i32> inreg %src0, <2 x i32> inreg %src1) {
; GCN-LABEL: s_orn2_v2i32:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_orn2_b64 s[0:1], s[2:3], s[4:5]
; GCN-NEXT:    ; return to shader part epilog
  %not.src1 = xor <2 x i32> %src1, <i32 -1, i32 -1>
  %or = or <2 x i32> %src0, %not.src1
  ret <2 x i32> %or
}

define amdgpu_ps <2 x i32> @s_orn2_v2i32_commute(<2 x i32> inreg %src0, <2 x i32> inreg %src1) {
; GCN-LABEL: s_orn2_v2i32_commute:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_orn2_b64 s[0:1], s[2:3], s[4:5]
; GCN-NEXT:    ; return to shader part epilog
  %not.src1 = xor <2 x i32> %src1, <i32 -1, i32 -1>
  %or = or <2 x i32> %not.src1, %src0
  ret <2 x i32> %or
}

define amdgpu_ps i16 @s_orn2_i16(i16 inreg %src0, i16 inreg %src1) {
; GCN-LABEL: s_orn2_i16:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_orn2_b32 s0, s2, s3
; GCN-NEXT:    ; return to shader part epilog
  %not.src1 = xor i16 %src1, -1
  %or = or i16 %src0, %not.src1
  ret i16 %or
}

define amdgpu_ps i16 @s_orn2_i16_commute(i16 inreg %src0, i16 inreg %src1) {
; GCN-LABEL: s_orn2_i16_commute:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_orn2_b32 s0, s2, s3
; GCN-NEXT:    ; return to shader part epilog
  %not.src1 = xor i16 %src1, -1
  %or = or i16 %not.src1, %src0
  ret i16 %or
}

define amdgpu_ps { i16, i16 } @s_orn2_i16_multi_use(i16 inreg %src0, i16 inreg %src1) {
; GCN-LABEL: s_orn2_i16_multi_use:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_xor_b32 s1, s3, -1
; GCN-NEXT:    s_orn2_b32 s0, s2, s3
; GCN-NEXT:    ; return to shader part epilog
  %not.src1 = xor i16 %src1, -1
  %or = or i16 %src0, %not.src1
  %insert.0 = insertvalue { i16, i16 } undef, i16 %or, 0
  %insert.1 = insertvalue { i16, i16 } %insert.0, i16 %not.src1, 1
  ret { i16, i16 } %insert.1
}

define amdgpu_ps { i16, i16 } @s_orn2_i16_multi_foldable_use(i16 inreg %src0, i16 inreg %src1, i16 inreg %src2) {
; GCN-LABEL: s_orn2_i16_multi_foldable_use:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_orn2_b32 s0, s2, s4
; GCN-NEXT:    s_orn2_b32 s1, s3, s4
; GCN-NEXT:    ; return to shader part epilog
  %not.src2 = xor i16 %src2, -1
  %or0 = or i16 %src0, %not.src2
  %or1 = or i16 %src1, %not.src2
  %insert.0 = insertvalue { i16, i16 } undef, i16 %or0, 0
  %insert.1 = insertvalue { i16, i16 } %insert.0, i16 %or1, 1
  ret { i16, i16 } %insert.1
}

define i16 @v_orn2_i16(i16 %src0, i16 %src1) {
; GCN-LABEL: v_orn2_i16:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_xor_b32_e32 v1, -1, v1
; GCN-NEXT:    v_or_b32_e32 v0, v0, v1
; GCN-NEXT:    s_setpc_b64 s[30:31]
  %not.src1 = xor i16 %src1, -1
  %or = or i16 %src0, %not.src1
  ret i16 %or
}

define amdgpu_ps float @v_orn2_i16_sv(i16 inreg %src0, i16 %src1) {
; GCN-LABEL: v_orn2_i16_sv:
; GCN:       ; %bb.0:
; GCN-NEXT:    v_xor_b32_e32 v0, -1, v0
; GCN-NEXT:    v_or_b32_e32 v0, s2, v0
; GCN-NEXT:    v_bfe_u32 v0, v0, 0, 16
; GCN-NEXT:    ; return to shader part epilog
  %not.src1 = xor i16 %src1, -1
  %or = or i16 %src0, %not.src1
  %zext = zext i16 %or to i32
  %cast.zext = bitcast i32 %zext to float
  ret float %cast.zext
}

define amdgpu_ps float @v_orn2_i16_vs(i16 %src0, i16 inreg %src1) {
; GCN-LABEL: v_orn2_i16_vs:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_xor_b32 s0, s2, -1
; GCN-NEXT:    v_or_b32_e32 v0, s0, v0
; GCN-NEXT:    v_bfe_u32 v0, v0, 0, 16
; GCN-NEXT:    ; return to shader part epilog
  %not.src1 = xor i16 %src1, -1
  %or = or i16 %src0, %not.src1
  %zext = zext i16 %or to i32
  %cast.zext = bitcast i32 %zext to float
  ret float %cast.zext
}

define amdgpu_ps i32 @s_orn2_v2i16(<2 x i16> inreg %src0, <2 x i16> inreg %src1) {
; GFX6-LABEL: s_orn2_v2i16:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_mov_b32 s1, 0xffff
; GFX6-NEXT:    s_and_b32 s2, s2, s1
; GFX6-NEXT:    s_lshl_b32 s0, s3, 16
; GFX6-NEXT:    s_or_b32 s0, s0, s2
; GFX6-NEXT:    s_lshl_b32 s2, s5, 16
; GFX6-NEXT:    s_and_b32 s1, s4, s1
; GFX6-NEXT:    s_or_b32 s1, s2, s1
; GFX6-NEXT:    s_xor_b32 s1, s1, -1
; GFX6-NEXT:    s_or_b32 s0, s0, s1
; GFX6-NEXT:    ; return to shader part epilog
;
; GFX9-LABEL: s_orn2_v2i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_orn2_b32 s0, s2, s3
; GFX9-NEXT:    ; return to shader part epilog
  %not.src1 = xor <2 x i16> %src1, <i16 -1, i16 -1>
  %or = or <2 x i16> %src0, %not.src1
  %cast = bitcast <2 x i16> %or to i32
  ret i32 %cast
}

define amdgpu_ps i32 @s_orn2_v2i16_commute(<2 x i16> inreg %src0, <2 x i16> inreg %src1) {
; GFX6-LABEL: s_orn2_v2i16_commute:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_mov_b32 s1, 0xffff
; GFX6-NEXT:    s_and_b32 s2, s2, s1
; GFX6-NEXT:    s_lshl_b32 s0, s3, 16
; GFX6-NEXT:    s_or_b32 s0, s0, s2
; GFX6-NEXT:    s_lshl_b32 s2, s5, 16
; GFX6-NEXT:    s_and_b32 s1, s4, s1
; GFX6-NEXT:    s_or_b32 s1, s2, s1
; GFX6-NEXT:    s_xor_b32 s1, s1, -1
; GFX6-NEXT:    s_or_b32 s0, s1, s0
; GFX6-NEXT:    ; return to shader part epilog
;
; GFX9-LABEL: s_orn2_v2i16_commute:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_orn2_b32 s0, s2, s3
; GFX9-NEXT:    ; return to shader part epilog
  %not.src1 = xor <2 x i16> %src1, <i16 -1, i16 -1>
  %or = or <2 x i16> %not.src1, %src0
  %cast = bitcast <2 x i16> %or to i32
  ret i32 %cast
}

define amdgpu_ps { i32, i32 } @s_orn2_v2i16_multi_use(<2 x i16> inreg %src0, <2 x i16> inreg %src1) {
; GFX6-LABEL: s_orn2_v2i16_multi_use:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_mov_b32 s1, 0xffff
; GFX6-NEXT:    s_and_b32 s2, s2, s1
; GFX6-NEXT:    s_lshl_b32 s0, s3, 16
; GFX6-NEXT:    s_or_b32 s0, s0, s2
; GFX6-NEXT:    s_lshl_b32 s2, s5, 16
; GFX6-NEXT:    s_and_b32 s1, s4, s1
; GFX6-NEXT:    s_or_b32 s1, s2, s1
; GFX6-NEXT:    s_xor_b32 s1, s1, -1
; GFX6-NEXT:    s_or_b32 s0, s0, s1
; GFX6-NEXT:    ; return to shader part epilog
;
; GFX9-LABEL: s_orn2_v2i16_multi_use:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_xor_b32 s1, s3, -1
; GFX9-NEXT:    s_orn2_b32 s0, s2, s3
; GFX9-NEXT:    ; return to shader part epilog
  %not.src1 = xor <2 x i16> %src1, <i16 -1, i16 -1>
  %or = or <2 x i16> %src0, %not.src1

  %cast.0 = bitcast <2 x i16> %or to i32
  %cast.1 = bitcast <2 x i16> %not.src1 to i32
  %insert.0 = insertvalue { i32, i32 } undef, i32 %cast.0, 0
  %insert.1 = insertvalue { i32, i32 } %insert.0, i32 %cast.1, 1
  ret { i32, i32 } %insert.1
}

define amdgpu_ps { i32, i32 } @s_orn2_v2i16_multi_foldable_use(<2 x i16> inreg %src0, <2 x i16> inreg %src1, <2 x i16> inreg %src2) {
; GFX6-LABEL: s_orn2_v2i16_multi_foldable_use:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_mov_b32 s1, 0xffff
; GFX6-NEXT:    s_lshl_b32 s0, s3, 16
; GFX6-NEXT:    s_and_b32 s2, s2, s1
; GFX6-NEXT:    s_or_b32 s0, s0, s2
; GFX6-NEXT:    s_and_b32 s3, s4, s1
; GFX6-NEXT:    s_lshl_b32 s2, s5, 16
; GFX6-NEXT:    s_or_b32 s2, s2, s3
; GFX6-NEXT:    s_lshl_b32 s3, s7, 16
; GFX6-NEXT:    s_and_b32 s1, s6, s1
; GFX6-NEXT:    s_or_b32 s1, s3, s1
; GFX6-NEXT:    s_xor_b32 s1, s1, -1
; GFX6-NEXT:    s_or_b32 s0, s0, s1
; GFX6-NEXT:    s_or_b32 s1, s2, s1
; GFX6-NEXT:    ; return to shader part epilog
;
; GFX9-LABEL: s_orn2_v2i16_multi_foldable_use:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_orn2_b32 s0, s2, s4
; GFX9-NEXT:    s_orn2_b32 s1, s3, s4
; GFX9-NEXT:    ; return to shader part epilog
  %not.src2 = xor <2 x i16> %src2, <i16 -1, i16 -1>
  %or0 = or <2 x i16> %src0, %not.src2
  %or1 = or <2 x i16> %src1, %not.src2

  %cast.0 = bitcast <2 x i16> %or0 to i32
  %cast.1 = bitcast <2 x i16> %or1 to i32
  %insert.0 = insertvalue { i32, i32 } undef, i32 %cast.0, 0
  %insert.1 = insertvalue { i32, i32 } %insert.0, i32 %cast.1, 1
  ret { i32, i32 } %insert.1
}

define <2 x i16> @v_orn2_v2i16(<2 x i16> %src0, <2 x i16> %src1) {
; GFX6-LABEL: v_orn2_v2i16:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT:    v_mov_b32_e32 v4, 0xffff
; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; GFX6-NEXT:    v_and_b32_e32 v0, v0, v4
; GFX6-NEXT:    v_or_b32_e32 v0, v1, v0
; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
; GFX6-NEXT:    v_and_b32_e32 v2, v2, v4
; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
; GFX6-NEXT:    v_xor_b32_e32 v1, -1, v1
; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
; GFX6-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_orn2_v2i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_xor_b32_e32 v1, -1, v1
; GFX9-NEXT:    v_or_b32_e32 v0, v0, v1
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %not.src1 = xor <2 x i16> %src1, <i16 -1, i16 -1>
  %or = or <2 x i16> %src0, %not.src1
  ret <2 x i16> %or
}

; FIXME:
; define amdgpu_ps i48 @s_orn2_v3i16(<3 x i16> inreg %src0, <3 x i16> inreg %src1) {
;   %not.src1 = xor <3 x i16> %src1, <i16 -1, i16 -1, i16 -1>
;   %or = or <3 x i16> %src0, %not.src1
;   %cast = bitcast <3 x i16> %or to i48
;   ret i48 %cast
; }

; define amdgpu_ps i48 @s_orn2_v3i16_commute(<3 x i16> inreg %src0, <3 x i16> inreg %src1) {
;   %not.src1 = xor <3 x i16> %src1, <i16 -1, i16 -1, i16 -1>
;   %or = or <3 x i16> %not.src1, %src0
;   %cast = bitcast <3 x i16> %or to i48
;   ret i48 %cast
; }

; define amdgpu_ps { i48, i48 } @s_orn2_v3i16_multi_use(<3 x i16> inreg %src0, <3 x i16> inreg %src1) {
;   %not.src1 = xor <3 x i16> %src1, <i16 -1, i16 -1, i16 -1>
;   %or = or <3 x i16> %src0, %not.src1

;   %cast.0 = bitcast <3 x i16> %or to i48
;   %cast.1 = bitcast <3 x i16> %not.src1 to i48
;   %insert.0 = insertvalue { i48, i48 } undef, i48 %cast.0, 0
;   %insert.1 = insertvalue { i48, i48 } %insert.0, i48 %cast.1, 1
;   ret { i48, i48 } %insert.1
; }

; define <3 x i16> @v_orn2_v3i16(<3 x i16> %src0, <3 x i16> %src1) {
;   %not.src1 = xor <3 x i16> %src1, <i16 -1, i16 -1, i16 -1>
;   %or = or <3 x i16> %src0, %not.src1
;   ret <3 x i16> %or
; }

define amdgpu_ps i64 @s_orn2_v4i16(<4 x i16> inreg %src0, <4 x i16> inreg %src1) {
; GFX6-LABEL: s_orn2_v4i16:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_lshl_b32 s0, s3, 16
; GFX6-NEXT:    s_mov_b32 s3, 0xffff
; GFX6-NEXT:    s_and_b32 s1, s2, s3
; GFX6-NEXT:    s_or_b32 s0, s0, s1
; GFX6-NEXT:    s_and_b32 s2, s4, s3
; GFX6-NEXT:    s_lshl_b32 s1, s5, 16
; GFX6-NEXT:    s_or_b32 s1, s1, s2
; GFX6-NEXT:    s_and_b32 s4, s6, s3
; GFX6-NEXT:    s_lshl_b32 s2, s7, 16
; GFX6-NEXT:    s_or_b32 s2, s2, s4
; GFX6-NEXT:    s_lshl_b32 s4, s9, 16
; GFX6-NEXT:    s_and_b32 s3, s8, s3
; GFX6-NEXT:    s_or_b32 s3, s4, s3
; GFX6-NEXT:    s_mov_b32 s4, -1
; GFX6-NEXT:    s_mov_b32 s5, s4
; GFX6-NEXT:    s_xor_b64 s[2:3], s[2:3], s[4:5]
; GFX6-NEXT:    s_or_b64 s[0:1], s[0:1], s[2:3]
; GFX6-NEXT:    ; return to shader part epilog
;
; GFX9-LABEL: s_orn2_v4i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_mov_b32 s0, -1
; GFX9-NEXT:    s_mov_b32 s1, s0
; GFX9-NEXT:    s_xor_b64 s[0:1], s[4:5], s[0:1]
; GFX9-NEXT:    s_or_b64 s[0:1], s[2:3], s[0:1]
; GFX9-NEXT:    ; return to shader part epilog
  %not.src1 = xor <4 x i16> %src1, <i16 -1, i16 -1, i16 -1, i16 -1>
  %or = or <4 x i16> %src0, %not.src1
  %cast = bitcast <4 x i16> %or to i64
  ret i64 %cast
}

define amdgpu_ps i64 @s_orn2_v4i16_commute(<4 x i16> inreg %src0, <4 x i16> inreg %src1) {
; GFX6-LABEL: s_orn2_v4i16_commute:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_lshl_b32 s0, s3, 16
; GFX6-NEXT:    s_mov_b32 s3, 0xffff
; GFX6-NEXT:    s_and_b32 s1, s2, s3
; GFX6-NEXT:    s_or_b32 s0, s0, s1
; GFX6-NEXT:    s_and_b32 s2, s4, s3
; GFX6-NEXT:    s_lshl_b32 s1, s5, 16
; GFX6-NEXT:    s_or_b32 s1, s1, s2
; GFX6-NEXT:    s_and_b32 s4, s6, s3
; GFX6-NEXT:    s_lshl_b32 s2, s7, 16
; GFX6-NEXT:    s_or_b32 s2, s2, s4
; GFX6-NEXT:    s_lshl_b32 s4, s9, 16
; GFX6-NEXT:    s_and_b32 s3, s8, s3
; GFX6-NEXT:    s_or_b32 s3, s4, s3
; GFX6-NEXT:    s_mov_b32 s4, -1
; GFX6-NEXT:    s_mov_b32 s5, s4
; GFX6-NEXT:    s_xor_b64 s[2:3], s[2:3], s[4:5]
; GFX6-NEXT:    s_or_b64 s[0:1], s[2:3], s[0:1]
; GFX6-NEXT:    ; return to shader part epilog
;
; GFX9-LABEL: s_orn2_v4i16_commute:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_mov_b32 s0, -1
; GFX9-NEXT:    s_mov_b32 s1, s0
; GFX9-NEXT:    s_xor_b64 s[0:1], s[4:5], s[0:1]
; GFX9-NEXT:    s_or_b64 s[0:1], s[0:1], s[2:3]
; GFX9-NEXT:    ; return to shader part epilog
  %not.src1 = xor <4 x i16> %src1, <i16 -1, i16 -1, i16 -1, i16 -1>
  %or = or <4 x i16> %not.src1, %src0
  %cast = bitcast <4 x i16> %or to i64
  ret i64 %cast
}

define amdgpu_ps { i64, i64 } @s_orn2_v4i16_multi_use(<4 x i16> inreg %src0, <4 x i16> inreg %src1) {
; GFX6-LABEL: s_orn2_v4i16_multi_use:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_lshl_b32 s0, s3, 16
; GFX6-NEXT:    s_mov_b32 s3, 0xffff
; GFX6-NEXT:    s_and_b32 s1, s2, s3
; GFX6-NEXT:    s_or_b32 s0, s0, s1
; GFX6-NEXT:    s_and_b32 s2, s4, s3
; GFX6-NEXT:    s_lshl_b32 s1, s5, 16
; GFX6-NEXT:    s_or_b32 s1, s1, s2
; GFX6-NEXT:    s_and_b32 s4, s6, s3
; GFX6-NEXT:    s_lshl_b32 s2, s7, 16
; GFX6-NEXT:    s_or_b32 s2, s2, s4
; GFX6-NEXT:    s_lshl_b32 s4, s9, 16
; GFX6-NEXT:    s_and_b32 s3, s8, s3
; GFX6-NEXT:    s_or_b32 s3, s4, s3
; GFX6-NEXT:    s_mov_b32 s4, -1
; GFX6-NEXT:    s_mov_b32 s5, s4
; GFX6-NEXT:    s_xor_b64 s[2:3], s[2:3], s[4:5]
; GFX6-NEXT:    s_or_b64 s[0:1], s[0:1], s[2:3]
; GFX6-NEXT:    ; return to shader part epilog
;
; GFX9-LABEL: s_orn2_v4i16_multi_use:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_mov_b32 s0, -1
; GFX9-NEXT:    s_mov_b32 s1, s0
; GFX9-NEXT:    s_xor_b64 s[4:5], s[4:5], s[0:1]
; GFX9-NEXT:    s_or_b64 s[0:1], s[2:3], s[4:5]
; GFX9-NEXT:    s_mov_b32 s2, s4
; GFX9-NEXT:    s_mov_b32 s3, s5
; GFX9-NEXT:    ; return to shader part epilog
  %not.src1 = xor <4 x i16> %src1, <i16 -1, i16 -1, i16 -1, i16 -1>
  %or = or <4 x i16> %src0, %not.src1

  %cast.0 = bitcast <4 x i16> %or to i64
  %cast.1 = bitcast <4 x i16> %not.src1 to i64
  %insert.0 = insertvalue { i64, i64 } undef, i64 %cast.0, 0
  %insert.1 = insertvalue { i64, i64 } %insert.0, i64 %cast.1, 1
  ret { i64, i64 } %insert.1
}

define amdgpu_ps { i64, i64 } @s_orn2_v4i16_multi_foldable_use(<4 x i16> inreg %src0, <4 x i16> inreg %src1, <4 x i16> inreg %src2) {
; GFX6-LABEL: s_orn2_v4i16_multi_foldable_use:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_mov_b32 s14, 0xffff
; GFX6-NEXT:    s_lshl_b32 s0, s3, 16
; GFX6-NEXT:    s_and_b32 s1, s2, s14
; GFX6-NEXT:    s_or_b32 s0, s0, s1
; GFX6-NEXT:    s_lshl_b32 s1, s5, 16
; GFX6-NEXT:    s_and_b32 s2, s4, s14
; GFX6-NEXT:    s_or_b32 s1, s1, s2
; GFX6-NEXT:    s_and_b32 s3, s6, s14
; GFX6-NEXT:    s_lshl_b32 s2, s7, 16
; GFX6-NEXT:    s_or_b32 s2, s2, s3
; GFX6-NEXT:    s_lshl_b32 s3, s9, 16
; GFX6-NEXT:    s_and_b32 s4, s8, s14
; GFX6-NEXT:    s_or_b32 s3, s3, s4
; GFX6-NEXT:    s_lshl_b32 s4, s11, 16
; GFX6-NEXT:    s_and_b32 s5, s10, s14
; GFX6-NEXT:    s_or_b32 s4, s4, s5
; GFX6-NEXT:    s_lshl_b32 s5, s13, 16
; GFX6-NEXT:    s_and_b32 s6, s12, s14
; GFX6-NEXT:    s_or_b32 s5, s5, s6
; GFX6-NEXT:    s_mov_b32 s6, -1
; GFX6-NEXT:    s_mov_b32 s7, s6
; GFX6-NEXT:    s_xor_b64 s[4:5], s[4:5], s[6:7]
; GFX6-NEXT:    s_or_b64 s[0:1], s[0:1], s[4:5]
; GFX6-NEXT:    s_or_b64 s[2:3], s[2:3], s[4:5]
; GFX6-NEXT:    ; return to shader part epilog
;
; GFX9-LABEL: s_orn2_v4i16_multi_foldable_use:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_mov_b32 s0, -1
; GFX9-NEXT:    s_mov_b32 s1, s0
; GFX9-NEXT:    s_xor_b64 s[6:7], s[6:7], s[0:1]
; GFX9-NEXT:    s_or_b64 s[0:1], s[2:3], s[6:7]
; GFX9-NEXT:    s_or_b64 s[2:3], s[4:5], s[6:7]
; GFX9-NEXT:    ; return to shader part epilog
  %not.src2 = xor <4 x i16> %src2, <i16 -1, i16 -1, i16 -1, i16 -1>
  %or0 = or <4 x i16> %src0, %not.src2
  %or1 = or <4 x i16> %src1, %not.src2

  %cast.0 = bitcast <4 x i16> %or0 to i64
  %cast.1 = bitcast <4 x i16> %or1 to i64
  %insert.0 = insertvalue { i64, i64 } undef, i64 %cast.0, 0
  %insert.1 = insertvalue { i64, i64 } %insert.0, i64 %cast.1, 1
  ret { i64, i64 } %insert.1
}

define <4 x i16> @v_orn2_v4i16(<4 x i16> %src0, <4 x i16> %src1) {
; GFX6-LABEL: v_orn2_v4i16:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT:    v_mov_b32_e32 v8, 0xffff
; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; GFX6-NEXT:    v_and_b32_e32 v0, v0, v8
; GFX6-NEXT:    v_or_b32_e32 v0, v1, v0
; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
; GFX6-NEXT:    v_and_b32_e32 v2, v2, v8
; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
; GFX6-NEXT:    v_and_b32_e32 v3, v4, v8
; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
; GFX6-NEXT:    v_or_b32_e32 v2, v2, v3
; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v7
; GFX6-NEXT:    v_and_b32_e32 v4, v6, v8
; GFX6-NEXT:    v_or_b32_e32 v3, v3, v4
; GFX6-NEXT:    v_xor_b32_e32 v2, -1, v2
; GFX6-NEXT:    v_xor_b32_e32 v3, -1, v3
; GFX6-NEXT:    v_or_b32_e32 v0, v0, v2
; GFX6-NEXT:    v_or_b32_e32 v2, v1, v3
; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
; GFX6-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_orn2_v4i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_xor_b32_e32 v2, -1, v2
; GFX9-NEXT:    v_xor_b32_e32 v3, -1, v3
; GFX9-NEXT:    v_or_b32_e32 v0, v0, v2
; GFX9-NEXT:    v_or_b32_e32 v1, v1, v3
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %not.src1 = xor <4 x i16> %src1, <i16 -1, i16 -1, i16 -1, i16 -1>
  %or = or <4 x i16> %src0, %not.src1
  ret <4 x i16> %or
}