; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop | FileCheck %s --check-prefix=XOP
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VL

;
; 128-bit vectors
;

define <2 x i64> @bitselect_v2i64_rr(<2 x i64>, <2 x i64>) {
; SSE-LABEL: bitselect_v2i64_rr:
; SSE:       # %bb.0:
; SSE-NEXT:    andps {{.*}}(%rip), %xmm0
; SSE-NEXT:    andps {{.*}}(%rip), %xmm1
; SSE-NEXT:    orps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; XOP-LABEL: bitselect_v2i64_rr:
; XOP:       # %bb.0:
; XOP-NEXT:    vpcmov {{.*}}(%rip), %xmm0, %xmm1, %xmm0
; XOP-NEXT:    retq
;
; AVX-LABEL: bitselect_v2i64_rr:
; AVX:       # %bb.0:
; AVX-NEXT:    vandps {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    vandps {{.*}}(%rip), %xmm1, %xmm1
; AVX-NEXT:    vorps %xmm0, %xmm1, %xmm0
; AVX-NEXT:    retq
;
; AVX512F-LABEL: bitselect_v2i64_rr:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vandps {{.*}}(%rip), %xmm0, %xmm0
; AVX512F-NEXT:    vandps {{.*}}(%rip), %xmm1, %xmm1
; AVX512F-NEXT:    vorps %xmm0, %xmm1, %xmm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: bitselect_v2i64_rr:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpternlogq $216, {{.*}}(%rip), %xmm1, %xmm0
; AVX512VL-NEXT:    retq
  %3 = and <2 x i64> %0, <i64 4294967296, i64 12884901890>
  %4 = and <2 x i64> %1, <i64 -4294967297, i64 -12884901891>
  %5 = or <2 x i64> %4, %3
  ret <2 x i64> %5
}

define <2 x i64> @bitselect_v2i64_rm(<2 x i64>, <2 x i64>* nocapture readonly) {
; SSE-LABEL: bitselect_v2i64_rm:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps (%rdi), %xmm1
; SSE-NEXT:    andps {{.*}}(%rip), %xmm0
; SSE-NEXT:    andps {{.*}}(%rip), %xmm1
; SSE-NEXT:    orps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; XOP-LABEL: bitselect_v2i64_rm:
; XOP:       # %bb.0:
; XOP-NEXT:    vmovdqa (%rdi), %xmm1
; XOP-NEXT:    vpcmov {{.*}}(%rip), %xmm0, %xmm1, %xmm0
; XOP-NEXT:    retq
;
; AVX-LABEL: bitselect_v2i64_rm:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovaps (%rdi), %xmm1
; AVX-NEXT:    vandps {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    vandps {{.*}}(%rip), %xmm1, %xmm1
; AVX-NEXT:    vorps %xmm0, %xmm1, %xmm0
; AVX-NEXT:    retq
;
; AVX512F-LABEL: bitselect_v2i64_rm:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovaps (%rdi), %xmm1
; AVX512F-NEXT:    vandps {{.*}}(%rip), %xmm0, %xmm0
; AVX512F-NEXT:    vandps {{.*}}(%rip), %xmm1, %xmm1
; AVX512F-NEXT:    vorps %xmm0, %xmm1, %xmm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: bitselect_v2i64_rm:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm1
; AVX512VL-NEXT:    vpternlogq $216, {{.*}}(%rip), %xmm1, %xmm0
; AVX512VL-NEXT:    retq
  %3 = load <2 x i64>, <2 x i64>* %1
  %4 = and <2 x i64> %0, <i64 8589934593, i64 3>
  %5 = and <2 x i64> %3, <i64 -8589934594, i64 -4>
  %6 = or <2 x i64> %5, %4
  ret <2 x i64> %6
}

define <2 x i64> @bitselect_v2i64_mr(<2 x i64>* nocapture readonly, <2 x i64>) {
; SSE-LABEL: bitselect_v2i64_mr:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps (%rdi), %xmm1
; SSE-NEXT:    andps {{.*}}(%rip), %xmm1
; SSE-NEXT:    andps {{.*}}(%rip), %xmm0
; SSE-NEXT:    orps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; XOP-LABEL: bitselect_v2i64_mr:
; XOP:       # %bb.0:
; XOP-NEXT:    vmovdqa (%rdi), %xmm1
; XOP-NEXT:    vpcmov {{.*}}(%rip), %xmm0, %xmm1, %xmm0
; XOP-NEXT:    retq
;
; AVX-LABEL: bitselect_v2i64_mr:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovaps (%rdi), %xmm1
; AVX-NEXT:    vandps {{.*}}(%rip), %xmm1, %xmm1
; AVX-NEXT:    vandps {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    vorps %xmm0, %xmm1, %xmm0
; AVX-NEXT:    retq
;
; AVX512F-LABEL: bitselect_v2i64_mr:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovaps (%rdi), %xmm1
; AVX512F-NEXT:    vandps {{.*}}(%rip), %xmm1, %xmm1
; AVX512F-NEXT:    vandps {{.*}}(%rip), %xmm0, %xmm0
; AVX512F-NEXT:    vorps %xmm0, %xmm1, %xmm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: bitselect_v2i64_mr:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm1
; AVX512VL-NEXT:    vpternlogq $216, {{.*}}(%rip), %xmm1, %xmm0
; AVX512VL-NEXT:    retq
  %3 = load <2 x i64>, <2 x i64>* %0
  %4 = and <2 x i64> %3, <i64 12884901890, i64 4294967296>
  %5 = and <2 x i64> %1, <i64 -12884901891, i64 -4294967297>
  %6 = or <2 x i64> %4, %5
  ret <2 x i64> %6
}

define <2 x i64> @bitselect_v2i64_mm(<2 x i64>* nocapture readonly, <2 x i64>* nocapture readonly) {
; SSE-LABEL: bitselect_v2i64_mm:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps (%rdi), %xmm1
; SSE-NEXT:    movaps (%rsi), %xmm0
; SSE-NEXT:    andps {{.*}}(%rip), %xmm1
; SSE-NEXT:    andps {{.*}}(%rip), %xmm0
; SSE-NEXT:    orps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; XOP-LABEL: bitselect_v2i64_mm:
; XOP:       # %bb.0:
; XOP-NEXT:    vmovdqa (%rsi), %xmm0
; XOP-NEXT:    vmovdqa {{.*#+}} xmm1 = [18446744073709551612,18446744065119617022]
; XOP-NEXT:    vpcmov %xmm1, (%rdi), %xmm0, %xmm0
; XOP-NEXT:    retq
;
; AVX-LABEL: bitselect_v2i64_mm:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovaps (%rdi), %xmm0
; AVX-NEXT:    vmovaps (%rsi), %xmm1
; AVX-NEXT:    vandps {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    vandps {{.*}}(%rip), %xmm1, %xmm1
; AVX-NEXT:    vorps %xmm0, %xmm1, %xmm0
; AVX-NEXT:    retq
;
; AVX512F-LABEL: bitselect_v2i64_mm:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovaps (%rdi), %xmm0
; AVX512F-NEXT:    vmovaps (%rsi), %xmm1
; AVX512F-NEXT:    vandps {{.*}}(%rip), %xmm0, %xmm0
; AVX512F-NEXT:    vandps {{.*}}(%rip), %xmm1, %xmm1
; AVX512F-NEXT:    vorps %xmm0, %xmm1, %xmm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: bitselect_v2i64_mm:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rsi), %xmm1
; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm0 = [18446744073709551612,18446744065119617022]
; AVX512VL-NEXT:    vpternlogq $202, (%rdi), %xmm1, %xmm0
; AVX512VL-NEXT:    retq
  %3 = load <2 x i64>, <2 x i64>* %0
  %4 = load <2 x i64>, <2 x i64>* %1
  %5 = and <2 x i64> %3, <i64 3, i64 8589934593>
  %6 = and <2 x i64> %4, <i64 -4, i64 -8589934594>
  %7 = or <2 x i64> %6, %5
  ret <2 x i64> %7
}

define <2 x i64> @bitselect_v2i64_broadcast_rrr(<2 x i64> %a0, <2 x i64> %a1, i64 %a2) {
; SSE-LABEL: bitselect_v2i64_broadcast_rrr:
; SSE:       # %bb.0:
; SSE-NEXT:    movq %rdi, %xmm2
; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
; SSE-NEXT:    pand %xmm2, %xmm0
; SSE-NEXT:    pandn %xmm1, %xmm2
; SSE-NEXT:    por %xmm2, %xmm0
; SSE-NEXT:    retq
;
; XOP-LABEL: bitselect_v2i64_broadcast_rrr:
; XOP:       # %bb.0:
; XOP-NEXT:    vmovq %rdi, %xmm2
; XOP-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
; XOP-NEXT:    vpcmov %xmm2, %xmm1, %xmm0, %xmm0
; XOP-NEXT:    retq
;
; AVX1-LABEL: bitselect_v2i64_broadcast_rrr:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovq %rdi, %xmm2
; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpandn %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: bitselect_v2i64_broadcast_rrr:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovq %rdi, %xmm2
; AVX2-NEXT:    vpbroadcastq %xmm2, %xmm2
; AVX2-NEXT:    vpand %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpandn %xmm1, %xmm2, %xmm1
; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: bitselect_v2i64_broadcast_rrr:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovq %rdi, %xmm2
; AVX512F-NEXT:    vpbroadcastq %xmm2, %xmm2
; AVX512F-NEXT:    vpand %xmm2, %xmm0, %xmm0
; AVX512F-NEXT:    vpandn %xmm1, %xmm2, %xmm1
; AVX512F-NEXT:    vpor %xmm1, %xmm0, %xmm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: bitselect_v2i64_broadcast_rrr:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpbroadcastq %rdi, %xmm2
; AVX512VL-NEXT:    vpternlogq $226, %xmm1, %xmm2, %xmm0
; AVX512VL-NEXT:    retq
  %1 = insertelement <2 x i64> undef, i64 %a2, i32 0
  %2 = shufflevector <2 x i64> %1, <2 x i64> undef, <2 x i32> zeroinitializer
  %3 = xor <2 x i64> %1, <i64 -1, i64 undef>
  %4 = shufflevector <2 x i64> %3, <2 x i64> undef, <2 x i32> zeroinitializer
  %5 = and <2 x i64> %a0, %2
  %6 = and <2 x i64> %a1, %4
  %7 = or <2 x i64> %5, %6
  ret <2 x i64> %7
}

define <2 x i64> @bitselect_v2i64_broadcast_rrm(<2 x i64> %a0, <2 x i64> %a1, i64* %p2) {
; SSE-LABEL: bitselect_v2i64_broadcast_rrm:
; SSE:       # %bb.0:
; SSE-NEXT:    movq {{.*#+}} xmm2 = mem[0],zero
; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
; SSE-NEXT:    pand %xmm2, %xmm0
; SSE-NEXT:    pandn %xmm1, %xmm2
; SSE-NEXT:    por %xmm2, %xmm0
; SSE-NEXT:    retq
;
; XOP-LABEL: bitselect_v2i64_broadcast_rrm:
; XOP:       # %bb.0:
; XOP-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
; XOP-NEXT:    vpermilps {{.*#+}} xmm2 = xmm2[0,1,0,1]
; XOP-NEXT:    vandps %xmm2, %xmm0, %xmm0
; XOP-NEXT:    vandnps %xmm1, %xmm2, %xmm1
; XOP-NEXT:    vorps %xmm1, %xmm0, %xmm0
; XOP-NEXT:    retq
;
; AVX1-LABEL: bitselect_v2i64_broadcast_rrm:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
; AVX1-NEXT:    vpermilps {{.*#+}} xmm2 = xmm2[0,1,0,1]
; AVX1-NEXT:    vandps %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vandnps %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vorps %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: bitselect_v2i64_broadcast_rrm:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovddup {{.*#+}} xmm2 = mem[0,0]
; AVX2-NEXT:    vandps %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vandnps %xmm1, %xmm2, %xmm1
; AVX2-NEXT:    vorps %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: bitselect_v2i64_broadcast_rrm:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovddup {{.*#+}} xmm2 = mem[0,0]
; AVX512F-NEXT:    vandps %xmm2, %xmm0, %xmm0
; AVX512F-NEXT:    vandnps %xmm1, %xmm2, %xmm1
; AVX512F-NEXT:    vorps %xmm1, %xmm0, %xmm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: bitselect_v2i64_broadcast_rrm:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpternlogq $228, (%rdi){1to2}, %xmm1, %xmm0
; AVX512VL-NEXT:    retq
  %a2 = load i64, i64* %p2
  %1 = insertelement <2 x i64> undef, i64 %a2, i32 0
  %2 = shufflevector <2 x i64> %1, <2 x i64> undef, <2 x i32> zeroinitializer
  %3 = xor <2 x i64> %1, <i64 -1, i64 undef>
  %4 = shufflevector <2 x i64> %3, <2 x i64> undef, <2 x i32> zeroinitializer
  %5 = and <2 x i64> %a0, %2
  %6 = and <2 x i64> %a1, %4
  %7 = or <2 x i64> %5, %6
  ret <2 x i64> %7
}

;
; 256-bit vectors
;

define <4 x i64> @bitselect_v4i64_rr(<4 x i64>, <4 x i64>) {
; SSE-LABEL: bitselect_v4i64_rr:
; SSE:       # %bb.0:
; SSE-NEXT:    andps {{.*}}(%rip), %xmm1
; SSE-NEXT:    andps {{.*}}(%rip), %xmm0
; SSE-NEXT:    andps {{.*}}(%rip), %xmm3
; SSE-NEXT:    orps %xmm3, %xmm1
; SSE-NEXT:    andps {{.*}}(%rip), %xmm2
; SSE-NEXT:    orps %xmm2, %xmm0
; SSE-NEXT:    retq
;
; XOP-LABEL: bitselect_v4i64_rr:
; XOP:       # %bb.0:
; XOP-NEXT:    vpcmov {{.*}}(%rip), %ymm0, %ymm1, %ymm0
; XOP-NEXT:    retq
;
; AVX-LABEL: bitselect_v4i64_rr:
; AVX:       # %bb.0:
; AVX-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
; AVX-NEXT:    vandps {{.*}}(%rip), %ymm1, %ymm1
; AVX-NEXT:    vorps %ymm0, %ymm1, %ymm0
; AVX-NEXT:    retq
;
; AVX512F-LABEL: bitselect_v4i64_rr:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
; AVX512F-NEXT:    vandps {{.*}}(%rip), %ymm1, %ymm1
; AVX512F-NEXT:    vorps %ymm0, %ymm1, %ymm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: bitselect_v4i64_rr:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpternlogq $216, {{.*}}(%rip), %ymm1, %ymm0
; AVX512VL-NEXT:    retq
  %3 = and <4 x i64> %0, <i64 4294967296, i64 12884901890, i64 12884901890, i64 12884901890>
  %4 = and <4 x i64> %1, <i64 -4294967297, i64 -12884901891, i64 -12884901891, i64 -12884901891>
  %5 = or <4 x i64> %4, %3
  ret <4 x i64> %5
}

define <4 x i64> @bitselect_v4i64_rm(<4 x i64>, <4 x i64>* nocapture readonly) {
; SSE-LABEL: bitselect_v4i64_rm:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps {{.*#+}} xmm2 = [18446744065119617022,18446744073709551612]
; SSE-NEXT:    movaps 16(%rdi), %xmm4
; SSE-NEXT:    andps %xmm2, %xmm4
; SSE-NEXT:    movaps (%rdi), %xmm5
; SSE-NEXT:    andps %xmm2, %xmm5
; SSE-NEXT:    movaps %xmm2, %xmm3
; SSE-NEXT:    andnps %xmm0, %xmm3
; SSE-NEXT:    orps %xmm5, %xmm3
; SSE-NEXT:    andnps %xmm1, %xmm2
; SSE-NEXT:    orps %xmm4, %xmm2
; SSE-NEXT:    movaps %xmm3, %xmm0
; SSE-NEXT:    movaps %xmm2, %xmm1
; SSE-NEXT:    retq
;
; XOP-LABEL: bitselect_v4i64_rm:
; XOP:       # %bb.0:
; XOP-NEXT:    vmovdqa (%rdi), %ymm1
; XOP-NEXT:    vpcmov {{.*}}(%rip), %ymm0, %ymm1, %ymm0
; XOP-NEXT:    retq
;
; AVX-LABEL: bitselect_v4i64_rm:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovaps (%rdi), %ymm1
; AVX-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
; AVX-NEXT:    vandps {{.*}}(%rip), %ymm1, %ymm1
; AVX-NEXT:    vorps %ymm0, %ymm1, %ymm0
; AVX-NEXT:    retq
;
; AVX512F-LABEL: bitselect_v4i64_rm:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovaps (%rdi), %ymm1
; AVX512F-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
; AVX512F-NEXT:    vandps {{.*}}(%rip), %ymm1, %ymm1
; AVX512F-NEXT:    vorps %ymm0, %ymm1, %ymm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: bitselect_v4i64_rm:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm1
; AVX512VL-NEXT:    vpternlogq $216, {{.*}}(%rip), %ymm1, %ymm0
; AVX512VL-NEXT:    retq
  %3 = load <4 x i64>, <4 x i64>* %1
  %4 = and <4 x i64> %0, <i64 8589934593, i64 3, i64 8589934593, i64 3>
  %5 = and <4 x i64> %3, <i64 -8589934594, i64 -4, i64 -8589934594, i64 -4>
  %6 = or <4 x i64> %5, %4
  ret <4 x i64> %6
}

define <4 x i64> @bitselect_v4i64_mr(<4 x i64>* nocapture readonly, <4 x i64>) {
; SSE-LABEL: bitselect_v4i64_mr:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps {{.*#+}} xmm2 = [12884901890,4294967296]
; SSE-NEXT:    movaps 16(%rdi), %xmm4
; SSE-NEXT:    andps %xmm2, %xmm4
; SSE-NEXT:    movaps (%rdi), %xmm5
; SSE-NEXT:    andps %xmm2, %xmm5
; SSE-NEXT:    movaps %xmm2, %xmm3
; SSE-NEXT:    andnps %xmm0, %xmm3
; SSE-NEXT:    orps %xmm5, %xmm3
; SSE-NEXT:    andnps %xmm1, %xmm2
; SSE-NEXT:    orps %xmm4, %xmm2
; SSE-NEXT:    movaps %xmm3, %xmm0
; SSE-NEXT:    movaps %xmm2, %xmm1
; SSE-NEXT:    retq
;
; XOP-LABEL: bitselect_v4i64_mr:
; XOP:       # %bb.0:
; XOP-NEXT:    vmovdqa (%rdi), %ymm1
; XOP-NEXT:    vpcmov {{.*}}(%rip), %ymm0, %ymm1, %ymm0
; XOP-NEXT:    retq
;
; AVX-LABEL: bitselect_v4i64_mr:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovaps (%rdi), %ymm1
; AVX-NEXT:    vandps {{.*}}(%rip), %ymm1, %ymm1
; AVX-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
; AVX-NEXT:    vorps %ymm0, %ymm1, %ymm0
; AVX-NEXT:    retq
;
; AVX512F-LABEL: bitselect_v4i64_mr:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovaps (%rdi), %ymm1
; AVX512F-NEXT:    vandps {{.*}}(%rip), %ymm1, %ymm1
; AVX512F-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
; AVX512F-NEXT:    vorps %ymm0, %ymm1, %ymm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: bitselect_v4i64_mr:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm1
; AVX512VL-NEXT:    vpternlogq $216, {{.*}}(%rip), %ymm1, %ymm0
; AVX512VL-NEXT:    retq
  %3 = load <4 x i64>, <4 x i64>* %0
  %4 = and <4 x i64> %3, <i64 12884901890, i64 4294967296, i64 12884901890, i64 4294967296>
  %5 = and <4 x i64> %1, <i64 -12884901891, i64 -4294967297, i64 -12884901891, i64 -4294967297>
  %6 = or <4 x i64> %4, %5
  ret <4 x i64> %6
}

define <4 x i64> @bitselect_v4i64_mm(<4 x i64>* nocapture readonly, <4 x i64>* nocapture readonly) {
; SSE-LABEL: bitselect_v4i64_mm:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps {{.*#+}} xmm1 = [18446744073709551612,18446744065119617022]
; SSE-NEXT:    movaps 16(%rsi), %xmm2
; SSE-NEXT:    andps %xmm1, %xmm2
; SSE-NEXT:    movaps (%rsi), %xmm3
; SSE-NEXT:    andps %xmm1, %xmm3
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    andnps (%rdi), %xmm0
; SSE-NEXT:    orps %xmm3, %xmm0
; SSE-NEXT:    andnps 16(%rdi), %xmm1
; SSE-NEXT:    orps %xmm2, %xmm1
; SSE-NEXT:    retq
;
; XOP-LABEL: bitselect_v4i64_mm:
; XOP:       # %bb.0:
; XOP-NEXT:    vmovdqa (%rsi), %ymm0
; XOP-NEXT:    vmovdqa {{.*#+}} ymm1 = [18446744073709551612,18446744065119617022,18446744073709551612,18446744065119617022]
; XOP-NEXT:    vpcmov %ymm1, (%rdi), %ymm0, %ymm0
; XOP-NEXT:    retq
;
; AVX-LABEL: bitselect_v4i64_mm:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovaps (%rdi), %ymm0
; AVX-NEXT:    vmovaps (%rsi), %ymm1
; AVX-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
; AVX-NEXT:    vandps {{.*}}(%rip), %ymm1, %ymm1
; AVX-NEXT:    vorps %ymm0, %ymm1, %ymm0
; AVX-NEXT:    retq
;
; AVX512F-LABEL: bitselect_v4i64_mm:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovaps (%rdi), %ymm0
; AVX512F-NEXT:    vmovaps (%rsi), %ymm1
; AVX512F-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
; AVX512F-NEXT:    vandps {{.*}}(%rip), %ymm1, %ymm1
; AVX512F-NEXT:    vorps %ymm0, %ymm1, %ymm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: bitselect_v4i64_mm:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rsi), %ymm1
; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm0 = [18446744073709551612,18446744065119617022,18446744073709551612,18446744065119617022]
; AVX512VL-NEXT:    vpternlogq $202, (%rdi), %ymm1, %ymm0
; AVX512VL-NEXT:    retq
  %3 = load <4 x i64>, <4 x i64>* %0
  %4 = load <4 x i64>, <4 x i64>* %1
  %5 = and <4 x i64> %3, <i64 3, i64 8589934593, i64 3, i64 8589934593>
  %6 = and <4 x i64> %4, <i64 -4, i64 -8589934594, i64 -4, i64 -8589934594>
  %7 = or <4 x i64> %6, %5
  ret <4 x i64> %7
}

define <4 x i64> @bitselect_v4i64_broadcast_rrr(<4 x i64> %a0, <4 x i64> %a1, i64 %a2) {
; SSE-LABEL: bitselect_v4i64_broadcast_rrr:
; SSE:       # %bb.0:
; SSE-NEXT:    movq %rdi, %xmm4
; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,1,0,1]
; SSE-NEXT:    pand %xmm4, %xmm1
; SSE-NEXT:    pand %xmm4, %xmm0
; SSE-NEXT:    movdqa %xmm4, %xmm5
; SSE-NEXT:    pandn %xmm3, %xmm5
; SSE-NEXT:    por %xmm5, %xmm1
; SSE-NEXT:    pandn %xmm2, %xmm4
; SSE-NEXT:    por %xmm4, %xmm0
; SSE-NEXT:    retq
;
; XOP-LABEL: bitselect_v4i64_broadcast_rrr:
; XOP:       # %bb.0:
; XOP-NEXT:    vmovq %rdi, %xmm2
; XOP-NEXT:    vmovq %rdi, %xmm3
; XOP-NEXT:    vmovddup {{.*#+}} xmm2 = xmm2[0,0]
; XOP-NEXT:    vinsertf128 $1, %xmm2, %ymm2, %ymm2
; XOP-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,1]
; XOP-NEXT:    vinsertf128 $1, %xmm3, %ymm3, %ymm3
; XOP-NEXT:    vandps %ymm2, %ymm0, %ymm0
; XOP-NEXT:    vandnps %ymm1, %ymm3, %ymm1
; XOP-NEXT:    vorps %ymm1, %ymm0, %ymm0
; XOP-NEXT:    retq
;
; AVX1-LABEL: bitselect_v4i64_broadcast_rrr:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovq %rdi, %xmm2
; AVX1-NEXT:    vmovq %rdi, %xmm3
; AVX1-NEXT:    vmovddup {{.*#+}} xmm2 = xmm2[0,0]
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm2, %ymm2
; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,1]
; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm3, %ymm3
; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
; AVX1-NEXT:    vandnps %ymm1, %ymm3, %ymm1
; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: bitselect_v4i64_broadcast_rrr:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovq %rdi, %xmm2
; AVX2-NEXT:    vpbroadcastq %xmm2, %ymm2
; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpandn %ymm1, %ymm2, %ymm1
; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: bitselect_v4i64_broadcast_rrr:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovq %rdi, %xmm2
; AVX512F-NEXT:    vpbroadcastq %xmm2, %ymm2
; AVX512F-NEXT:    vpand %ymm2, %ymm0, %ymm0
; AVX512F-NEXT:    vpandn %ymm1, %ymm2, %ymm1
; AVX512F-NEXT:    vpor %ymm1, %ymm0, %ymm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: bitselect_v4i64_broadcast_rrr:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpbroadcastq %rdi, %ymm2
; AVX512VL-NEXT:    vpternlogq $226, %ymm1, %ymm2, %ymm0
; AVX512VL-NEXT:    retq
  %1 = insertelement <4 x i64> undef, i64 %a2, i32 0
  %2 = shufflevector <4 x i64> %1, <4 x i64> undef, <4 x i32> zeroinitializer
  %3 = xor <4 x i64> %1, <i64 -1, i64 undef, i64 undef, i64 undef>
  %4 = shufflevector <4 x i64> %3, <4 x i64> undef, <4 x i32> zeroinitializer
  %5 = and <4 x i64> %a0, %2
  %6 = and <4 x i64> %a1, %4
  %7 = or <4 x i64> %5, %6
  ret <4 x i64> %7
}

define <4 x i64> @bitselect_v4i64_broadcast_rrm(<4 x i64> %a0, <4 x i64> %a1, i64* %p2) {
; SSE-LABEL: bitselect_v4i64_broadcast_rrm:
; SSE:       # %bb.0:
; SSE-NEXT:    movq {{.*#+}} xmm4 = mem[0],zero
; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,1,0,1]
; SSE-NEXT:    pand %xmm4, %xmm1
; SSE-NEXT:    pand %xmm4, %xmm0
; SSE-NEXT:    movdqa %xmm4, %xmm5
; SSE-NEXT:    pandn %xmm3, %xmm5
; SSE-NEXT:    por %xmm5, %xmm1
; SSE-NEXT:    pandn %xmm2, %xmm4
; SSE-NEXT:    por %xmm4, %xmm0
; SSE-NEXT:    retq
;
; XOP-LABEL: bitselect_v4i64_broadcast_rrm:
; XOP:       # %bb.0:
; XOP-NEXT:    vbroadcastsd (%rdi), %ymm2
; XOP-NEXT:    vpcmov %ymm2, %ymm1, %ymm0, %ymm0
; XOP-NEXT:    retq
;
; AVX-LABEL: bitselect_v4i64_broadcast_rrm:
; AVX:       # %bb.0:
; AVX-NEXT:    vbroadcastsd (%rdi), %ymm2
; AVX-NEXT:    vandps %ymm2, %ymm0, %ymm0
; AVX-NEXT:    vandnps %ymm1, %ymm2, %ymm1
; AVX-NEXT:    vorps %ymm1, %ymm0, %ymm0
; AVX-NEXT:    retq
;
; AVX512F-LABEL: bitselect_v4i64_broadcast_rrm:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vbroadcastsd (%rdi), %ymm2
; AVX512F-NEXT:    vandps %ymm2, %ymm0, %ymm0
; AVX512F-NEXT:    vandnps %ymm1, %ymm2, %ymm1
; AVX512F-NEXT:    vorps %ymm1, %ymm0, %ymm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: bitselect_v4i64_broadcast_rrm:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpternlogq $228, (%rdi){1to4}, %ymm1, %ymm0
; AVX512VL-NEXT:    retq
  %a2 = load i64, i64* %p2
  %1 = insertelement <4 x i64> undef, i64 %a2, i32 0
  %2 = shufflevector <4 x i64> %1, <4 x i64> undef, <4 x i32> zeroinitializer
  %3 = xor <4 x i64> %1, <i64 -1, i64 undef, i64 undef, i64 undef>
  %4 = shufflevector <4 x i64> %3, <4 x i64> undef, <4 x i32> zeroinitializer
  %5 = and <4 x i64> %a0, %2
  %6 = and <4 x i64> %a1, %4
  %7 = or <4 x i64> %5, %6
  ret <4 x i64> %7
}

;
; 512-bit vectors
;

define <8 x i64> @bitselect_v8i64_rr(<8 x i64>, <8 x i64>) {
; SSE-LABEL: bitselect_v8i64_rr:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps {{.*#+}} xmm8 = [18446744060824649725,18446744060824649725]
; SSE-NEXT:    andps %xmm8, %xmm7
; SSE-NEXT:    movaps {{.*#+}} xmm9 = [18446744069414584319,18446744060824649725]
; SSE-NEXT:    andps %xmm9, %xmm6
; SSE-NEXT:    andps %xmm8, %xmm5
; SSE-NEXT:    andps %xmm9, %xmm4
; SSE-NEXT:    movaps %xmm9, %xmm10
; SSE-NEXT:    andnps %xmm0, %xmm10
; SSE-NEXT:    orps %xmm4, %xmm10
; SSE-NEXT:    movaps %xmm8, %xmm4
; SSE-NEXT:    andnps %xmm1, %xmm4
; SSE-NEXT:    orps %xmm5, %xmm4
; SSE-NEXT:    andnps %xmm2, %xmm9
; SSE-NEXT:    orps %xmm6, %xmm9
; SSE-NEXT:    andnps %xmm3, %xmm8
; SSE-NEXT:    orps %xmm7, %xmm8
; SSE-NEXT:    movaps %xmm10, %xmm0
; SSE-NEXT:    movaps %xmm4, %xmm1
; SSE-NEXT:    movaps %xmm9, %xmm2
; SSE-NEXT:    movaps %xmm8, %xmm3
; SSE-NEXT:    retq
;
; XOP-LABEL: bitselect_v8i64_rr:
; XOP:       # %bb.0:
; XOP-NEXT:    vmovdqa {{.*#+}} ymm4 = [18446744069414584319,18446744060824649725,18446744060824649725,18446744060824649725]
; XOP-NEXT:    vpcmov %ymm4, %ymm0, %ymm2, %ymm0
; XOP-NEXT:    vpcmov %ymm4, %ymm1, %ymm3, %ymm1
; XOP-NEXT:    retq
;
; AVX-LABEL: bitselect_v8i64_rr:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovaps {{.*#+}} ymm4 = [18446744069414584319,18446744060824649725,18446744060824649725,18446744060824649725]
; AVX-NEXT:    vandps %ymm4, %ymm3, %ymm3
; AVX-NEXT:    vandps %ymm4, %ymm2, %ymm2
; AVX-NEXT:    vandnps %ymm0, %ymm4, %ymm0
; AVX-NEXT:    vorps %ymm0, %ymm2, %ymm0
; AVX-NEXT:    vandnps %ymm1, %ymm4, %ymm1
; AVX-NEXT:    vorps %ymm1, %ymm3, %ymm1
; AVX-NEXT:    retq
;
; AVX512-LABEL: bitselect_v8i64_rr:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpternlogq $216, {{.*}}(%rip), %zmm1, %zmm0
; AVX512-NEXT:    retq
  %3 = and <8 x i64> %0, <i64 4294967296, i64 12884901890, i64 12884901890, i64 12884901890, i64 4294967296, i64 12884901890, i64 12884901890, i64 12884901890>
  %4 = and <8 x i64> %1, <i64 -4294967297, i64 -12884901891, i64 -12884901891, i64 -12884901891, i64 -4294967297, i64 -12884901891, i64 -12884901891, i64 -12884901891>
  %5 = or <8 x i64> %4, %3
  ret <8 x i64> %5
}

define <8 x i64> @bitselect_v8i64_rm(<8 x i64>, <8 x i64>* nocapture readonly) {
; SSE-LABEL: bitselect_v8i64_rm:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps {{.*#+}} xmm4 = [18446744065119617022,18446744073709551612]
; SSE-NEXT:    movaps 48(%rdi), %xmm8
; SSE-NEXT:    andps %xmm4, %xmm8
; SSE-NEXT:    movaps 32(%rdi), %xmm9
; SSE-NEXT:    andps %xmm4, %xmm9
; SSE-NEXT:    movaps 16(%rdi), %xmm7
; SSE-NEXT:    andps %xmm4, %xmm7
; SSE-NEXT:    movaps (%rdi), %xmm6
; SSE-NEXT:    andps %xmm4, %xmm6
; SSE-NEXT:    movaps %xmm4, %xmm5
; SSE-NEXT:    andnps %xmm0, %xmm5
; SSE-NEXT:    orps %xmm6, %xmm5
; SSE-NEXT:    movaps %xmm4, %xmm6
; SSE-NEXT:    andnps %xmm1, %xmm6
; SSE-NEXT:    orps %xmm7, %xmm6
; SSE-NEXT:    movaps %xmm4, %xmm7
; SSE-NEXT:    andnps %xmm2, %xmm7
; SSE-NEXT:    orps %xmm9, %xmm7
; SSE-NEXT:    andnps %xmm3, %xmm4
; SSE-NEXT:    orps %xmm8, %xmm4
; SSE-NEXT:    movaps %xmm5, %xmm0
; SSE-NEXT:    movaps %xmm6, %xmm1
; SSE-NEXT:    movaps %xmm7, %xmm2
; SSE-NEXT:    movaps %xmm4, %xmm3
; SSE-NEXT:    retq
;
; XOP-LABEL: bitselect_v8i64_rm:
; XOP:       # %bb.0:
; XOP-NEXT:    vmovdqa (%rdi), %ymm2
; XOP-NEXT:    vmovdqa 32(%rdi), %ymm3
; XOP-NEXT:    vbroadcastf128 {{.*#+}} ymm4 = [18446744065119617022,18446744073709551612,18446744065119617022,18446744073709551612]
; XOP-NEXT:    # ymm4 = mem[0,1,0,1]
; XOP-NEXT:    vpcmov %ymm4, %ymm0, %ymm2, %ymm0
; XOP-NEXT:    vpcmov %ymm4, %ymm1, %ymm3, %ymm1
; XOP-NEXT:    retq
;
; AVX-LABEL: bitselect_v8i64_rm:
; AVX:       # %bb.0:
; AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm2 = [18446744065119617022,18446744073709551612,18446744065119617022,18446744073709551612]
; AVX-NEXT:    # ymm2 = mem[0,1,0,1]
; AVX-NEXT:    vandps 32(%rdi), %ymm2, %ymm3
; AVX-NEXT:    vandps (%rdi), %ymm2, %ymm4
; AVX-NEXT:    vandnps %ymm0, %ymm2, %ymm0
; AVX-NEXT:    vorps %ymm0, %ymm4, %ymm0
; AVX-NEXT:    vandnps %ymm1, %ymm2, %ymm1
; AVX-NEXT:    vorps %ymm1, %ymm3, %ymm1
; AVX-NEXT:    retq
;
; AVX512-LABEL: bitselect_v8i64_rm:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa64 (%rdi), %zmm1
; AVX512-NEXT:    vpternlogq $216, {{.*}}(%rip), %zmm1, %zmm0
; AVX512-NEXT:    retq
  %3 = load <8 x i64>, <8 x i64>* %1
  %4 = and <8 x i64> %0, <i64 8589934593, i64 3, i64 8589934593, i64 3, i64 8589934593, i64 3, i64 8589934593, i64 3>
  %5 = and <8 x i64> %3, <i64 -8589934594, i64 -4, i64 -8589934594, i64 -4, i64 -8589934594, i64 -4, i64 -8589934594, i64 -4>
  %6 = or <8 x i64> %5, %4
  ret <8 x i64> %6
}

define <8 x i64> @bitselect_v8i64_mr(<8 x i64>* nocapture readonly, <8 x i64>) {
; SSE-LABEL: bitselect_v8i64_mr:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps {{.*#+}} xmm4 = [12884901890,4294967296]
; SSE-NEXT:    movaps 48(%rdi), %xmm8
; SSE-NEXT:    andps %xmm4, %xmm8
; SSE-NEXT:    movaps 32(%rdi), %xmm9
; SSE-NEXT:    andps %xmm4, %xmm9
; SSE-NEXT:    movaps 16(%rdi), %xmm7
; SSE-NEXT:    andps %xmm4, %xmm7
; SSE-NEXT:    movaps (%rdi), %xmm6
; SSE-NEXT:    andps %xmm4, %xmm6
; SSE-NEXT:    movaps %xmm4, %xmm5
; SSE-NEXT:    andnps %xmm0, %xmm5
; SSE-NEXT:    orps %xmm6, %xmm5
; SSE-NEXT:    movaps %xmm4, %xmm6
; SSE-NEXT:    andnps %xmm1, %xmm6
; SSE-NEXT:    orps %xmm7, %xmm6
; SSE-NEXT:    movaps %xmm4, %xmm7
; SSE-NEXT:    andnps %xmm2, %xmm7
; SSE-NEXT:    orps %xmm9, %xmm7
; SSE-NEXT:    andnps %xmm3, %xmm4
; SSE-NEXT:    orps %xmm8, %xmm4
; SSE-NEXT:    movaps %xmm5, %xmm0
; SSE-NEXT:    movaps %xmm6, %xmm1
; SSE-NEXT:    movaps %xmm7, %xmm2
; SSE-NEXT:    movaps %xmm4, %xmm3
; SSE-NEXT:    retq
;
; XOP-LABEL: bitselect_v8i64_mr:
; XOP:       # %bb.0:
; XOP-NEXT:    vmovdqa (%rdi), %ymm2
; XOP-NEXT:    vmovdqa 32(%rdi), %ymm3
; XOP-NEXT:    vbroadcastf128 {{.*#+}} ymm4 = [12884901890,4294967296,12884901890,4294967296]
; XOP-NEXT:    # ymm4 = mem[0,1,0,1]
; XOP-NEXT:    vpcmov %ymm4, %ymm0, %ymm2, %ymm0
; XOP-NEXT:    vpcmov %ymm4, %ymm1, %ymm3, %ymm1
; XOP-NEXT:    retq
;
; AVX-LABEL: bitselect_v8i64_mr:
; AVX:       # %bb.0:
; AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm2 = [12884901890,4294967296,12884901890,4294967296]
; AVX-NEXT:    # ymm2 = mem[0,1,0,1]
; AVX-NEXT:    vandps 32(%rdi), %ymm2, %ymm3
; AVX-NEXT:    vandps (%rdi), %ymm2, %ymm4
; AVX-NEXT:    vandnps %ymm0, %ymm2, %ymm0
; AVX-NEXT:    vorps %ymm0, %ymm4, %ymm0
; AVX-NEXT:    vandnps %ymm1, %ymm2, %ymm1
; AVX-NEXT:    vorps %ymm1, %ymm3, %ymm1
; AVX-NEXT:    retq
;
; AVX512-LABEL: bitselect_v8i64_mr:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa64 (%rdi), %zmm1
; AVX512-NEXT:    vpternlogq $216, {{.*}}(%rip), %zmm1, %zmm0
; AVX512-NEXT:    retq
  %3 = load <8 x i64>, <8 x i64>* %0
  %4 = and <8 x i64> %3, <i64 12884901890, i64 4294967296, i64 12884901890, i64 4294967296, i64 12884901890, i64 4294967296, i64 12884901890, i64 4294967296>
  %5 = and <8 x i64> %1, <i64 -12884901891, i64 -4294967297, i64 -12884901891, i64 -4294967297, i64 -12884901891, i64 -4294967297, i64 -12884901891, i64 -4294967297>
  %6 = or <8 x i64> %4, %5
  ret <8 x i64> %6
}

define <8 x i64> @bitselect_v8i64_mm(<8 x i64>* nocapture readonly, <8 x i64>* nocapture readonly) {
; SSE-LABEL: bitselect_v8i64_mm:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps {{.*#+}} xmm3 = [18446744073709551612,18446744065119617022]
; SSE-NEXT:    movaps 48(%rsi), %xmm4
; SSE-NEXT:    andps %xmm3, %xmm4
; SSE-NEXT:    movaps 32(%rsi), %xmm5
; SSE-NEXT:    andps %xmm3, %xmm5
; SSE-NEXT:    movaps 16(%rsi), %xmm2
; SSE-NEXT:    andps %xmm3, %xmm2
; SSE-NEXT:    movaps (%rsi), %xmm1
; SSE-NEXT:    andps %xmm3, %xmm1
; SSE-NEXT:    movaps %xmm3, %xmm0
; SSE-NEXT:    andnps (%rdi), %xmm0
; SSE-NEXT:    orps %xmm1, %xmm0
; SSE-NEXT:    movaps %xmm3, %xmm1
; SSE-NEXT:    andnps 16(%rdi), %xmm1
; SSE-NEXT:    orps %xmm2, %xmm1
; SSE-NEXT:    movaps %xmm3, %xmm2
; SSE-NEXT:    andnps 32(%rdi), %xmm2
; SSE-NEXT:    orps %xmm5, %xmm2
; SSE-NEXT:    andnps 48(%rdi), %xmm3
; SSE-NEXT:    orps %xmm4, %xmm3
; SSE-NEXT:    retq
;
; XOP-LABEL: bitselect_v8i64_mm:
; XOP:       # %bb.0:
; XOP-NEXT:    vmovdqa (%rsi), %ymm0
; XOP-NEXT:    vmovdqa 32(%rsi), %ymm1
; XOP-NEXT:    vbroadcastf128 {{.*#+}} ymm2 = [18446744073709551612,18446744065119617022,18446744073709551612,18446744065119617022]
; XOP-NEXT:    # ymm2 = mem[0,1,0,1]
; XOP-NEXT:    vpcmov %ymm2, (%rdi), %ymm0, %ymm0
; XOP-NEXT:    vpcmov %ymm2, 32(%rdi), %ymm1, %ymm1
; XOP-NEXT:    retq
;
; AVX-LABEL: bitselect_v8i64_mm:
; AVX:       # %bb.0:
; AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm1 = [18446744073709551612,18446744065119617022,18446744073709551612,18446744065119617022]
; AVX-NEXT:    # ymm1 = mem[0,1,0,1]
; AVX-NEXT:    vandps 32(%rsi), %ymm1, %ymm2
; AVX-NEXT:    vandps (%rsi), %ymm1, %ymm0
; AVX-NEXT:    vandnps (%rdi), %ymm1, %ymm3
; AVX-NEXT:    vorps %ymm3, %ymm0, %ymm0
; AVX-NEXT:    vandnps 32(%rdi), %ymm1, %ymm1
; AVX-NEXT:    vorps %ymm1, %ymm2, %ymm1
; AVX-NEXT:    retq
;
; AVX512-LABEL: bitselect_v8i64_mm:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa64 (%rsi), %zmm1
; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [18446744073709551612,18446744065119617022,18446744073709551612,18446744065119617022,18446744073709551612,18446744065119617022,18446744073709551612,18446744065119617022]
; AVX512-NEXT:    vpternlogq $202, (%rdi), %zmm1, %zmm0
; AVX512-NEXT:    retq
  %3 = load <8 x i64>, <8 x i64>* %0
  %4 = load <8 x i64>, <8 x i64>* %1
  %5 = and <8 x i64> %3, <i64 3, i64 8589934593, i64 3, i64 8589934593, i64 3, i64 8589934593, i64 3, i64 8589934593>
  %6 = and <8 x i64> %4, <i64 -4, i64 -8589934594, i64 -4, i64 -8589934594, i64 -4, i64 -8589934594, i64 -4, i64 -8589934594>
  %7 = or <8 x i64> %6, %5
  ret <8 x i64> %7
}

define <8 x i64> @bitselect_v8i64_broadcast_rrr(<8 x i64> %a0, <8 x i64> %a1, i64 %a2) {
; SSE-LABEL: bitselect_v8i64_broadcast_rrr:
; SSE:       # %bb.0:
; SSE-NEXT:    movq %rdi, %xmm8
; SSE-NEXT:    pshufd {{.*#+}} xmm8 = xmm8[0,1,0,1]
; SSE-NEXT:    pand %xmm8, %xmm3
; SSE-NEXT:    pand %xmm8, %xmm2
; SSE-NEXT:    pand %xmm8, %xmm1
; SSE-NEXT:    pand %xmm8, %xmm0
; SSE-NEXT:    movdqa %xmm8, %xmm9
; SSE-NEXT:    pandn %xmm7, %xmm9
; SSE-NEXT:    por %xmm9, %xmm3
; SSE-NEXT:    movdqa %xmm8, %xmm7
; SSE-NEXT:    pandn %xmm6, %xmm7
; SSE-NEXT:    por %xmm7, %xmm2
; SSE-NEXT:    movdqa %xmm8, %xmm6
; SSE-NEXT:    pandn %xmm5, %xmm6
; SSE-NEXT:    por %xmm6, %xmm1
; SSE-NEXT:    pandn %xmm4, %xmm8
; SSE-NEXT:    por %xmm8, %xmm0
; SSE-NEXT:    retq
;
; XOP-LABEL: bitselect_v8i64_broadcast_rrr:
; XOP:       # %bb.0:
; XOP-NEXT:    vmovq %rdi, %xmm4
; XOP-NEXT:    vmovq %rdi, %xmm5
; XOP-NEXT:    vmovddup {{.*#+}} xmm4 = xmm4[0,0]
; XOP-NEXT:    vinsertf128 $1, %xmm4, %ymm4, %ymm4
; XOP-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[0,1,0,1]
; XOP-NEXT:    vinsertf128 $1, %xmm5, %ymm5, %ymm5
; XOP-NEXT:    vandps %ymm4, %ymm1, %ymm1
; XOP-NEXT:    vandps %ymm4, %ymm0, %ymm0
; XOP-NEXT:    vandnps %ymm3, %ymm5, %ymm3
; XOP-NEXT:    vorps %ymm3, %ymm1, %ymm1
; XOP-NEXT:    vandnps %ymm2, %ymm5, %ymm2
; XOP-NEXT:    vorps %ymm2, %ymm0, %ymm0
; XOP-NEXT:    retq
;
; AVX1-LABEL: bitselect_v8i64_broadcast_rrr:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovq %rdi, %xmm4
; AVX1-NEXT:    vmovq %rdi, %xmm5
; AVX1-NEXT:    vmovddup {{.*#+}} xmm4 = xmm4[0,0]
; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm4, %ymm4
; AVX1-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[0,1,0,1]
; AVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm5, %ymm5
; AVX1-NEXT:    vandps %ymm4, %ymm1, %ymm1
; AVX1-NEXT:    vandps %ymm4, %ymm0, %ymm0
; AVX1-NEXT:    vandnps %ymm3, %ymm5, %ymm3
; AVX1-NEXT:    vorps %ymm3, %ymm1, %ymm1
; AVX1-NEXT:    vandnps %ymm2, %ymm5, %ymm2
; AVX1-NEXT:    vorps %ymm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: bitselect_v8i64_broadcast_rrr:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovq %rdi, %xmm4
; AVX2-NEXT:    vpbroadcastq %xmm4, %ymm4
; AVX2-NEXT:    vpand %ymm4, %ymm1, %ymm1
; AVX2-NEXT:    vpand %ymm4, %ymm0, %ymm0
; AVX2-NEXT:    vpandn %ymm3, %ymm4, %ymm3
; AVX2-NEXT:    vpor %ymm3, %ymm1, %ymm1
; AVX2-NEXT:    vpandn %ymm2, %ymm4, %ymm2
; AVX2-NEXT:    vpor %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: bitselect_v8i64_broadcast_rrr:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpbroadcastq %rdi, %zmm2
; AVX512-NEXT:    vpternlogq $226, %zmm1, %zmm2, %zmm0
; AVX512-NEXT:    retq
  %1 = insertelement <8 x i64> undef, i64 %a2, i32 0
  %2 = shufflevector <8 x i64> %1, <8 x i64> undef, <8 x i32> zeroinitializer
  %3 = xor <8 x i64> %1, <i64 -1, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef>
  %4 = shufflevector <8 x i64> %3, <8 x i64> undef, <8 x i32> zeroinitializer
  %5 = and <8 x i64> %a0, %2
  %6 = and <8 x i64> %a1, %4
  %7 = or <8 x i64> %5, %6
  ret <8 x i64> %7
}

define <8 x i64> @bitselect_v8i64_broadcast_rrm(<8 x i64> %a0, <8 x i64> %a1, i64* %p2) {
; SSE-LABEL: bitselect_v8i64_broadcast_rrm:
; SSE:       # %bb.0:
; SSE-NEXT:    movq {{.*#+}} xmm8 = mem[0],zero
; SSE-NEXT:    pshufd {{.*#+}} xmm8 = xmm8[0,1,0,1]
; SSE-NEXT:    pand %xmm8, %xmm3
; SSE-NEXT:    pand %xmm8, %xmm2
; SSE-NEXT:    pand %xmm8, %xmm1
; SSE-NEXT:    pand %xmm8, %xmm0
; SSE-NEXT:    movdqa %xmm8, %xmm9
; SSE-NEXT:    pandn %xmm7, %xmm9
; SSE-NEXT:    por %xmm9, %xmm3
; SSE-NEXT:    movdqa %xmm8, %xmm7
; SSE-NEXT:    pandn %xmm6, %xmm7
; SSE-NEXT:    por %xmm7, %xmm2
; SSE-NEXT:    movdqa %xmm8, %xmm6
; SSE-NEXT:    pandn %xmm5, %xmm6
; SSE-NEXT:    por %xmm6, %xmm1
; SSE-NEXT:    pandn %xmm4, %xmm8
; SSE-NEXT:    por %xmm8, %xmm0
; SSE-NEXT:    retq
;
; XOP-LABEL: bitselect_v8i64_broadcast_rrm:
; XOP:       # %bb.0:
; XOP-NEXT:    vbroadcastsd (%rdi), %ymm4
; XOP-NEXT:    vpcmov %ymm4, %ymm2, %ymm0, %ymm0
; XOP-NEXT:    vpcmov %ymm4, %ymm3, %ymm1, %ymm1
; XOP-NEXT:    retq
;
; AVX-LABEL: bitselect_v8i64_broadcast_rrm:
; AVX:       # %bb.0:
; AVX-NEXT:    vbroadcastsd (%rdi), %ymm4
; AVX-NEXT:    vandps %ymm4, %ymm1, %ymm1
; AVX-NEXT:    vandps %ymm4, %ymm0, %ymm0
; AVX-NEXT:    vandnps %ymm3, %ymm4, %ymm3
; AVX-NEXT:    vorps %ymm3, %ymm1, %ymm1
; AVX-NEXT:    vandnps %ymm2, %ymm4, %ymm2
; AVX-NEXT:    vorps %ymm2, %ymm0, %ymm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: bitselect_v8i64_broadcast_rrm:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpternlogq $228, (%rdi){1to8}, %zmm1, %zmm0
; AVX512-NEXT:    retq
  %a2 = load i64, i64* %p2
  %1 = insertelement <8 x i64> undef, i64 %a2, i32 0
  %2 = shufflevector <8 x i64> %1, <8 x i64> undef, <8 x i32> zeroinitializer
  %3 = xor <8 x i64> %1, <i64 -1, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef>
  %4 = shufflevector <8 x i64> %3, <8 x i64> undef, <8 x i32> zeroinitializer
  %5 = and <8 x i64> %a0, %2
  %6 = and <8 x i64> %a1, %4
  %7 = or <8 x i64> %5, %6
  ret <8 x i64> %7
}

; Check that mask registers don't get canonicalized.
1008define <4 x i1> @bitselect_v4i1_loop(<4 x i32> %a0, <4 x i32> %a1) { 1009; SSE-LABEL: bitselect_v4i1_loop: 1010; SSE: # %bb.0: # %bb 1011; SSE-NEXT: pxor %xmm2, %xmm2 1012; SSE-NEXT: pcmpeqd %xmm0, %xmm2 1013; SSE-NEXT: movdqa {{.*#+}} xmm0 = [12,12,12,12] 1014; SSE-NEXT: pcmpeqd %xmm1, %xmm0 1015; SSE-NEXT: pcmpeqd {{.*}}(%rip), %xmm1 1016; SSE-NEXT: pand %xmm2, %xmm1 1017; SSE-NEXT: pandn %xmm0, %xmm2 1018; SSE-NEXT: por %xmm1, %xmm2 1019; SSE-NEXT: movdqa %xmm2, %xmm0 1020; SSE-NEXT: retq 1021; 1022; XOP-LABEL: bitselect_v4i1_loop: 1023; XOP: # %bb.0: # %bb 1024; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2 1025; XOP-NEXT: vpcomneqd %xmm2, %xmm0, %xmm0 1026; XOP-NEXT: vpcomeqd {{.*}}(%rip), %xmm1, %xmm2 1027; XOP-NEXT: vpcomeqd {{.*}}(%rip), %xmm1, %xmm1 1028; XOP-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0 1029; XOP-NEXT: retq 1030; 1031; AVX1-LABEL: bitselect_v4i1_loop: 1032; AVX1: # %bb.0: # %bb 1033; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 1034; AVX1-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0 1035; AVX1-NEXT: vpcmpeqd {{.*}}(%rip), %xmm1, %xmm2 1036; AVX1-NEXT: vpcmpeqd {{.*}}(%rip), %xmm1, %xmm1 1037; AVX1-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0 1038; AVX1-NEXT: retq 1039; 1040; AVX2-LABEL: bitselect_v4i1_loop: 1041; AVX2: # %bb.0: # %bb 1042; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 1043; AVX2-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0 1044; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [12,12,12,12] 1045; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm2 1046; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [15,15,15,15] 1047; AVX2-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm1 1048; AVX2-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0 1049; AVX2-NEXT: retq 1050; 1051; AVX512F-LABEL: bitselect_v4i1_loop: 1052; AVX512F: # %bb.0: # %bb 1053; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 1054; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 1055; AVX512F-NEXT: vpcmpeqd {{.*}}(%rip){1to16}, %zmm1, %k1 1056; AVX512F-NEXT: vpcmpeqd {{.*}}(%rip){1to16}, %zmm1, %k2 1057; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k0 
{%k2} 1058; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1} 1059; AVX512F-NEXT: korw %k0, %k1, %k1 1060; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} 1061; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 1062; AVX512F-NEXT: vzeroupper 1063; AVX512F-NEXT: retq 1064; 1065; AVX512VL-LABEL: bitselect_v4i1_loop: 1066; AVX512VL: # %bb.0: # %bb 1067; AVX512VL-NEXT: vpcmpeqd {{.*}}(%rip){1to4}, %xmm1, %k1 1068; AVX512VL-NEXT: vpcmpeqd {{.*}}(%rip){1to4}, %xmm1, %k2 1069; AVX512VL-NEXT: vptestnmd %xmm0, %xmm0, %k0 {%k2} 1070; AVX512VL-NEXT: vptestmd %xmm0, %xmm0, %k1 {%k1} 1071; AVX512VL-NEXT: korw %k0, %k1, %k1 1072; AVX512VL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 1073; AVX512VL-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} 1074; AVX512VL-NEXT: retq 1075bb: 1076 %tmp = icmp ne <4 x i32> %a0, zeroinitializer 1077 %tmp2 = icmp eq <4 x i32> %a1, <i32 12, i32 12, i32 12, i32 12> 1078 %tmp3 = icmp eq <4 x i32> %a1, <i32 15, i32 15, i32 15, i32 15> 1079 %tmp4 = select <4 x i1> %tmp, <4 x i1> %tmp2, <4 x i1> %tmp3 1080 ret <4 x i1> %tmp4 1081} 1082 1083