1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc < %s -mcpu=x86-64 -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2 3; RUN: llc < %s -mcpu=x86-64 -mattr=+ssse3 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSSE3 4; RUN: llc < %s -mcpu=x86-64 -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41 5; RUN: llc < %s -mcpu=x86-64 -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1 6; RUN: llc < %s -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2 7; 8; Verify that the DAG combiner correctly folds bitwise operations across 9; shuffles, nested shuffles with undef, pairs of nested shuffles, and other 10; basic and always-safe patterns. Also test that the DAG combiner will combine 11; target-specific shuffle instructions where reasonable. 12 13target triple = "x86_64-unknown-unknown" 14 15declare <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32>, i8) 16declare <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16>, i8) 17declare <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16>, i8) 18 19define <4 x i32> @combine_pshufd1(<4 x i32> %a) { 20; ALL-LABEL: combine_pshufd1: 21; ALL: # BB#0: # %entry 22; ALL-NEXT: retq 23entry: 24 %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 27) 25 %c = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %b, i8 27) 26 ret <4 x i32> %c 27} 28 29define <4 x i32> @combine_pshufd2(<4 x i32> %a) { 30; ALL-LABEL: combine_pshufd2: 31; ALL: # BB#0: # %entry 32; ALL-NEXT: retq 33entry: 34 %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 27) 35 %b.cast = bitcast <4 x i32> %b to <8 x i16> 36 %c = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %b.cast, i8 -28) 37 %c.cast = bitcast <8 x i16> %c to <4 x i32> 38 %d = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %c.cast, i8 27) 39 ret <4 x i32> %d 40} 41 42define <4 x i32> @combine_pshufd3(<4 x i32> %a) { 43; ALL-LABEL: combine_pshufd3: 44; ALL: # BB#0: # %entry 45; ALL-NEXT: retq 46entry: 47 %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 27) 48 %b.cast = bitcast <4 x i32> %b to <8 x i16> 49 %c = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %b.cast, i8 -28) 50 %c.cast = bitcast <8 x i16> %c to <4 x i32> 51 %d = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %c.cast, i8 27) 52 ret <4 x i32> %d 53} 54 55define <4 x i32> @combine_pshufd4(<4 x i32> %a) { 56; SSE-LABEL: combine_pshufd4: 57; SSE: # BB#0: # %entry 58; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4] 59; SSE-NEXT: retq 60; 61; AVX-LABEL: combine_pshufd4: 62; AVX: # BB#0: # %entry 63; AVX-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4] 64; AVX-NEXT: retq 65entry: 66 %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 -31) 67 %b.cast = bitcast <4 x i32> %b to <8 x i16> 68 %c = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %b.cast, i8 27) 69 %c.cast = bitcast <8 x i16> %c to <4 x i32> 70 %d = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %c.cast, i8 -31) 71 ret <4 x i32> %d 72} 73 74define <4 x i32> @combine_pshufd5(<4 x i32> %a) { 75; SSE-LABEL: combine_pshufd5: 76; SSE: # BB#0: # %entry 77; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7] 78; SSE-NEXT: retq 79; 80; AVX-LABEL: combine_pshufd5: 81; AVX: # BB#0: # %entry 82; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7] 83; AVX-NEXT: retq 84entry: 85 %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 -76) 86 %b.cast = bitcast <4 x i32> %b to <8 x 
i16> 87 %c = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %b.cast, i8 27) 88 %c.cast = bitcast <8 x i16> %c to <4 x i32> 89 %d = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %c.cast, i8 -76) 90 ret <4 x i32> %d 91} 92 93define <4 x i32> @combine_pshufd6(<4 x i32> %a) { 94; SSE-LABEL: combine_pshufd6: 95; SSE: # BB#0: # %entry 96; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] 97; SSE-NEXT: retq 98; 99; AVX1-LABEL: combine_pshufd6: 100; AVX1: # BB#0: # %entry 101; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] 102; AVX1-NEXT: retq 103; 104; AVX2-LABEL: combine_pshufd6: 105; AVX2: # BB#0: # %entry 106; AVX2-NEXT: vbroadcastss %xmm0, %xmm0 107; AVX2-NEXT: retq 108entry: 109 %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 0) 110 %c = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %b, i8 8) 111 ret <4 x i32> %c 112} 113 114define <8 x i16> @combine_pshuflw1(<8 x i16> %a) { 115; ALL-LABEL: combine_pshuflw1: 116; ALL: # BB#0: # %entry 117; ALL-NEXT: retq 118entry: 119 %b = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %a, i8 27) 120 %c = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %b, i8 27) 121 ret <8 x i16> %c 122} 123 124define <8 x i16> @combine_pshuflw2(<8 x i16> %a) { 125; ALL-LABEL: combine_pshuflw2: 126; ALL: # BB#0: # %entry 127; ALL-NEXT: retq 128entry: 129 %b = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %a, i8 27) 130 %c = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %b, i8 -28) 131 %d = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %c, i8 27) 132 ret <8 x i16> %d 133} 134 135define <8 x i16> @combine_pshuflw3(<8 x i16> %a) { 136; SSE-LABEL: combine_pshuflw3: 137; SSE: # BB#0: # %entry 138; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4] 139; SSE-NEXT: retq 140; 141; AVX-LABEL: combine_pshuflw3: 142; AVX: # BB#0: # %entry 143; AVX-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4] 144; AVX-NEXT: retq 145entry: 146 %b = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %a, i8 27) 147 %c = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %b, i8 27) 148 %d = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %c, i8 27) 149 ret <8 x i16> %d 150} 151 152define <8 x i16> @combine_pshufhw1(<8 x i16> %a) { 153; SSE-LABEL: combine_pshufhw1: 154; SSE: # BB#0: # %entry 155; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7] 156; SSE-NEXT: retq 157; 158; AVX-LABEL: combine_pshufhw1: 159; AVX: # BB#0: # %entry 160; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7] 161; AVX-NEXT: retq 162entry: 163 %b = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %a, i8 27) 164 %c = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %b, i8 27) 165 %d = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %c, i8 27) 166 ret <8 x i16> %d 167} 168 169define <4 x i32> @combine_bitwise_ops_test1(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { 170; SSE-LABEL: combine_bitwise_ops_test1: 171; SSE: # BB#0: 172; SSE-NEXT: pand %xmm1, %xmm0 173; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 174; SSE-NEXT: retq 175; 176; AVX-LABEL: combine_bitwise_ops_test1: 177; AVX: # BB#0: 178; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 179; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 180; AVX-NEXT: retq 181 %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3> 182 %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3> 183 %and = and <4 x i32> %shuf1, %shuf2 184 ret <4 x i32> %and 185} 186 187define <4 x i32> @combine_bitwise_ops_test2(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { 188; SSE-LABEL: 
combine_bitwise_ops_test2: 189; SSE: # BB#0: 190; SSE-NEXT: por %xmm1, %xmm0 191; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 192; SSE-NEXT: retq 193; 194; AVX-LABEL: combine_bitwise_ops_test2: 195; AVX: # BB#0: 196; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 197; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 198; AVX-NEXT: retq 199 %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3> 200 %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3> 201 %or = or <4 x i32> %shuf1, %shuf2 202 ret <4 x i32> %or 203} 204 205define <4 x i32> @combine_bitwise_ops_test3(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { 206; SSE-LABEL: combine_bitwise_ops_test3: 207; SSE: # BB#0: 208; SSE-NEXT: pxor %xmm1, %xmm0 209; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 210; SSE-NEXT: retq 211; 212; AVX-LABEL: combine_bitwise_ops_test3: 213; AVX: # BB#0: 214; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 215; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 216; AVX-NEXT: retq 217 %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3> 218 %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3> 219 %xor = xor <4 x i32> %shuf1, %shuf2 220 ret <4 x i32> %xor 221} 222 223define <4 x i32> @combine_bitwise_ops_test4(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { 224; SSE-LABEL: combine_bitwise_ops_test4: 225; SSE: # BB#0: 226; SSE-NEXT: pand %xmm1, %xmm0 227; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 228; SSE-NEXT: retq 229; 230; AVX-LABEL: combine_bitwise_ops_test4: 231; AVX: # BB#0: 232; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 233; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 234; AVX-NEXT: retq 235 %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 4, i32 6, i32 5, i32 7> 236 %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 4, i32 6, i32 5, i32 7> 237 %and = and <4 x i32> %shuf1, %shuf2 238 ret <4 x i32> %and 239} 240 241define <4 x i32> @combine_bitwise_ops_test5(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { 242; SSE-LABEL: combine_bitwise_ops_test5: 243; SSE: # BB#0: 244; SSE-NEXT: por %xmm1, %xmm0 245; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 246; SSE-NEXT: retq 247; 248; AVX-LABEL: combine_bitwise_ops_test5: 249; AVX: # BB#0: 250; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 251; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 252; AVX-NEXT: retq 253 %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 4, i32 6, i32 5, i32 7> 254 %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 4, i32 6, i32 5, i32 7> 255 %or = or <4 x i32> %shuf1, %shuf2 256 ret <4 x i32> %or 257} 258 259define <4 x i32> @combine_bitwise_ops_test6(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { 260; SSE-LABEL: combine_bitwise_ops_test6: 261; SSE: # BB#0: 262; SSE-NEXT: pxor %xmm1, %xmm0 263; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 264; SSE-NEXT: retq 265; 266; AVX-LABEL: combine_bitwise_ops_test6: 267; AVX: # BB#0: 268; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 269; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 270; AVX-NEXT: retq 271 %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 4, i32 6, i32 5, i32 7> 272 %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 4, i32 6, i32 5, i32 7> 273 %xor = xor <4 x i32> %shuf1, %shuf2 274 ret <4 x i32> %xor 275} 276 277 278; Verify that DAGCombiner moves the shuffle after the xor/and/or even if shuffles 279; are not performing a swizzle operations. 
280 281define <4 x i32> @combine_bitwise_ops_test1b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { 282; SSE2-LABEL: combine_bitwise_ops_test1b: 283; SSE2: # BB#0: 284; SSE2-NEXT: pand %xmm1, %xmm0 285; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 286; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3] 287; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 288; SSE2-NEXT: retq 289; 290; SSSE3-LABEL: combine_bitwise_ops_test1b: 291; SSSE3: # BB#0: 292; SSSE3-NEXT: pand %xmm1, %xmm0 293; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 294; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3] 295; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 296; SSSE3-NEXT: retq 297; 298; SSE41-LABEL: combine_bitwise_ops_test1b: 299; SSE41: # BB#0: 300; SSE41-NEXT: pand %xmm1, %xmm0 301; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] 302; SSE41-NEXT: retq 303; 304; AVX1-LABEL: combine_bitwise_ops_test1b: 305; AVX1: # BB#0: 306; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 307; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] 308; AVX1-NEXT: retq 309; 310; AVX2-LABEL: combine_bitwise_ops_test1b: 311; AVX2: # BB#0: 312; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 313; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] 314; AVX2-NEXT: retq 315 %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7> 316 %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7> 317 %and = and <4 x i32> %shuf1, %shuf2 318 ret <4 x i32> %and 319} 320 321define <4 x i32> @combine_bitwise_ops_test2b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { 322; SSE2-LABEL: combine_bitwise_ops_test2b: 323; SSE2: # BB#0: 324; SSE2-NEXT: por %xmm1, %xmm0 325; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 326; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3] 327; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 328; SSE2-NEXT: retq 329; 330; SSSE3-LABEL: combine_bitwise_ops_test2b: 331; SSSE3: # BB#0: 332; SSSE3-NEXT: por %xmm1, %xmm0 333; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 334; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3] 335; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 336; SSSE3-NEXT: retq 337; 338; SSE41-LABEL: combine_bitwise_ops_test2b: 339; SSE41: # BB#0: 340; SSE41-NEXT: por %xmm1, %xmm0 341; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] 342; SSE41-NEXT: retq 343; 344; AVX1-LABEL: combine_bitwise_ops_test2b: 345; AVX1: # BB#0: 346; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 347; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] 348; AVX1-NEXT: retq 349; 350; AVX2-LABEL: combine_bitwise_ops_test2b: 351; AVX2: # BB#0: 352; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 353; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] 354; AVX2-NEXT: retq 355 %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7> 356 %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7> 357 %or = or <4 x i32> %shuf1, %shuf2 358 ret <4 x i32> %or 359} 360 361define <4 x i32> @combine_bitwise_ops_test3b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { 362; SSE2-LABEL: combine_bitwise_ops_test3b: 363; SSE2: # BB#0: 364; SSE2-NEXT: xorps %xmm1, %xmm0 365; SSE2-NEXT: andps {{.*}}(%rip), %xmm0 366; SSE2-NEXT: retq 367; 368; SSSE3-LABEL: combine_bitwise_ops_test3b: 369; SSSE3: # BB#0: 370; SSSE3-NEXT: xorps %xmm1, %xmm0 371; SSSE3-NEXT: andps 
{{.*}}(%rip), %xmm0 372; SSSE3-NEXT: retq 373; 374; SSE41-LABEL: combine_bitwise_ops_test3b: 375; SSE41: # BB#0: 376; SSE41-NEXT: pxor %xmm1, %xmm0 377; SSE41-NEXT: pxor %xmm1, %xmm1 378; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] 379; SSE41-NEXT: retq 380; 381; AVX1-LABEL: combine_bitwise_ops_test3b: 382; AVX1: # BB#0: 383; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 384; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 385; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] 386; AVX1-NEXT: retq 387; 388; AVX2-LABEL: combine_bitwise_ops_test3b: 389; AVX2: # BB#0: 390; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 391; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 392; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] 393; AVX2-NEXT: retq 394 %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7> 395 %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7> 396 %xor = xor <4 x i32> %shuf1, %shuf2 397 ret <4 x i32> %xor 398} 399 400define <4 x i32> @combine_bitwise_ops_test4b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { 401; SSE2-LABEL: combine_bitwise_ops_test4b: 402; SSE2: # BB#0: 403; SSE2-NEXT: pand %xmm1, %xmm0 404; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3] 405; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] 406; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 407; SSE2-NEXT: retq 408; 409; SSSE3-LABEL: combine_bitwise_ops_test4b: 410; SSSE3: # BB#0: 411; SSSE3-NEXT: pand %xmm1, %xmm0 412; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3] 413; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] 414; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 415; SSSE3-NEXT: retq 416; 417; SSE41-LABEL: combine_bitwise_ops_test4b: 418; SSE41: # BB#0: 419; SSE41-NEXT: pand %xmm1, %xmm0 420; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7] 421; SSE41-NEXT: retq 422; 423; AVX1-LABEL: combine_bitwise_ops_test4b: 424; AVX1: # BB#0: 425; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 426; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7] 427; AVX1-NEXT: retq 428; 429; AVX2-LABEL: combine_bitwise_ops_test4b: 430; AVX2: # BB#0: 431; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 432; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3] 433; AVX2-NEXT: retq 434 %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 5, i32 2, i32 7> 435 %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 5, i32 2, i32 7> 436 %and = and <4 x i32> %shuf1, %shuf2 437 ret <4 x i32> %and 438} 439 440define <4 x i32> @combine_bitwise_ops_test5b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { 441; SSE2-LABEL: combine_bitwise_ops_test5b: 442; SSE2: # BB#0: 443; SSE2-NEXT: por %xmm1, %xmm0 444; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3] 445; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] 446; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 447; SSE2-NEXT: retq 448; 449; SSSE3-LABEL: combine_bitwise_ops_test5b: 450; SSSE3: # BB#0: 451; SSSE3-NEXT: por %xmm1, %xmm0 452; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3] 453; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] 454; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 455; SSSE3-NEXT: retq 456; 457; SSE41-LABEL: combine_bitwise_ops_test5b: 458; SSE41: # BB#0: 459; SSE41-NEXT: por %xmm1, %xmm0 460; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7] 461; SSE41-NEXT: retq 462; 463; 
AVX1-LABEL: combine_bitwise_ops_test5b: 464; AVX1: # BB#0: 465; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 466; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7] 467; AVX1-NEXT: retq 468; 469; AVX2-LABEL: combine_bitwise_ops_test5b: 470; AVX2: # BB#0: 471; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 472; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3] 473; AVX2-NEXT: retq 474 %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 5, i32 2, i32 7> 475 %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 5, i32 2, i32 7> 476 %or = or <4 x i32> %shuf1, %shuf2 477 ret <4 x i32> %or 478} 479 480define <4 x i32> @combine_bitwise_ops_test6b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { 481; SSE2-LABEL: combine_bitwise_ops_test6b: 482; SSE2: # BB#0: 483; SSE2-NEXT: xorps %xmm1, %xmm0 484; SSE2-NEXT: andps {{.*}}(%rip), %xmm0 485; SSE2-NEXT: retq 486; 487; SSSE3-LABEL: combine_bitwise_ops_test6b: 488; SSSE3: # BB#0: 489; SSSE3-NEXT: xorps %xmm1, %xmm0 490; SSSE3-NEXT: andps {{.*}}(%rip), %xmm0 491; SSSE3-NEXT: retq 492; 493; SSE41-LABEL: combine_bitwise_ops_test6b: 494; SSE41: # BB#0: 495; SSE41-NEXT: pxor %xmm1, %xmm0 496; SSE41-NEXT: pxor %xmm1, %xmm1 497; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7] 498; SSE41-NEXT: retq 499; 500; AVX1-LABEL: combine_bitwise_ops_test6b: 501; AVX1: # BB#0: 502; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 503; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 504; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7] 505; AVX1-NEXT: retq 506; 507; AVX2-LABEL: combine_bitwise_ops_test6b: 508; AVX2: # BB#0: 509; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 510; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 511; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3] 512; AVX2-NEXT: retq 513 %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 5, i32 2, i32 7> 514 %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 5, i32 2, i32 7> 515 %xor = xor <4 x i32> %shuf1, %shuf2 516 ret <4 x i32> %xor 517} 518 519define <4 x i32> @combine_bitwise_ops_test1c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { 520; SSE2-LABEL: combine_bitwise_ops_test1c: 521; SSE2: # BB#0: 522; SSE2-NEXT: pand %xmm1, %xmm0 523; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 524; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3] 525; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 526; SSE2-NEXT: retq 527; 528; SSSE3-LABEL: combine_bitwise_ops_test1c: 529; SSSE3: # BB#0: 530; SSSE3-NEXT: pand %xmm1, %xmm0 531; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 532; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3] 533; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 534; SSSE3-NEXT: retq 535; 536; SSE41-LABEL: combine_bitwise_ops_test1c: 537; SSE41: # BB#0: 538; SSE41-NEXT: pand %xmm1, %xmm0 539; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] 540; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 541; SSE41-NEXT: retq 542; 543; AVX1-LABEL: combine_bitwise_ops_test1c: 544; AVX1: # BB#0: 545; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 546; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] 547; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 548; AVX1-NEXT: retq 549; 550; AVX2-LABEL: combine_bitwise_ops_test1c: 551; AVX2: # BB#0: 552; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 553; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] 554; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 555; AVX2-NEXT: retq 
556 %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7> 557 %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7> 558 %and = and <4 x i32> %shuf1, %shuf2 559 ret <4 x i32> %and 560} 561 562define <4 x i32> @combine_bitwise_ops_test2c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { 563; SSE2-LABEL: combine_bitwise_ops_test2c: 564; SSE2: # BB#0: 565; SSE2-NEXT: por %xmm1, %xmm0 566; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 567; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3] 568; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 569; SSE2-NEXT: retq 570; 571; SSSE3-LABEL: combine_bitwise_ops_test2c: 572; SSSE3: # BB#0: 573; SSSE3-NEXT: por %xmm1, %xmm0 574; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 575; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3] 576; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 577; SSSE3-NEXT: retq 578; 579; SSE41-LABEL: combine_bitwise_ops_test2c: 580; SSE41: # BB#0: 581; SSE41-NEXT: por %xmm1, %xmm0 582; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] 583; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 584; SSE41-NEXT: retq 585; 586; AVX1-LABEL: combine_bitwise_ops_test2c: 587; AVX1: # BB#0: 588; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 589; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] 590; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 591; AVX1-NEXT: retq 592; 593; AVX2-LABEL: combine_bitwise_ops_test2c: 594; AVX2: # BB#0: 595; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 596; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] 597; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 598; AVX2-NEXT: retq 599 %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7> 600 %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7> 601 %or = or <4 x i32> %shuf1, %shuf2 602 ret <4 x i32> %or 603} 604 605define <4 x i32> @combine_bitwise_ops_test3c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { 606; SSE2-LABEL: combine_bitwise_ops_test3c: 607; SSE2: # BB#0: 608; SSE2-NEXT: pxor %xmm1, %xmm0 609; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,2] 610; SSE2-NEXT: pxor %xmm1, %xmm1 611; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] 612; SSE2-NEXT: retq 613; 614; SSSE3-LABEL: combine_bitwise_ops_test3c: 615; SSSE3: # BB#0: 616; SSSE3-NEXT: pxor %xmm1, %xmm0 617; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,2] 618; SSSE3-NEXT: pxor %xmm1, %xmm1 619; SSSE3-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] 620; SSSE3-NEXT: retq 621; 622; SSE41-LABEL: combine_bitwise_ops_test3c: 623; SSE41: # BB#0: 624; SSE41-NEXT: pxor %xmm1, %xmm0 625; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 626; SSE41-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero 627; SSE41-NEXT: retq 628; 629; AVX-LABEL: combine_bitwise_ops_test3c: 630; AVX: # BB#0: 631; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 632; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 633; AVX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero 634; AVX-NEXT: retq 635 %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7> 636 %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7> 637 %xor = xor <4 x i32> %shuf1, %shuf2 638 ret <4 x i32> %xor 639} 640 641define <4 x i32> @combine_bitwise_ops_test4c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { 642; SSE2-LABEL: combine_bitwise_ops_test4c: 643; SSE2: # BB#0: 644; SSE2-NEXT: pand %xmm1, %xmm0 645; SSE2-NEXT: pshufd {{.*#+}} xmm1 = 
xmm0[1,3,2,3] 646; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] 647; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 648; SSE2-NEXT: retq 649; 650; SSSE3-LABEL: combine_bitwise_ops_test4c: 651; SSSE3: # BB#0: 652; SSSE3-NEXT: pand %xmm1, %xmm0 653; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3] 654; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] 655; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 656; SSSE3-NEXT: retq 657; 658; SSE41-LABEL: combine_bitwise_ops_test4c: 659; SSE41: # BB#0: 660; SSE41-NEXT: pand %xmm1, %xmm0 661; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7] 662; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 663; SSE41-NEXT: retq 664; 665; AVX1-LABEL: combine_bitwise_ops_test4c: 666; AVX1: # BB#0: 667; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 668; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7] 669; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 670; AVX1-NEXT: retq 671; 672; AVX2-LABEL: combine_bitwise_ops_test4c: 673; AVX2: # BB#0: 674; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 675; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3] 676; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 677; AVX2-NEXT: retq 678 %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 2, i32 5, i32 7> 679 %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 2, i32 5, i32 7> 680 %and = and <4 x i32> %shuf1, %shuf2 681 ret <4 x i32> %and 682} 683 684define <4 x i32> @combine_bitwise_ops_test5c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { 685; SSE2-LABEL: combine_bitwise_ops_test5c: 686; SSE2: # BB#0: 687; SSE2-NEXT: por %xmm1, %xmm0 688; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3] 689; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] 690; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 691; SSE2-NEXT: retq 692; 693; SSSE3-LABEL: combine_bitwise_ops_test5c: 694; SSSE3: # BB#0: 695; SSSE3-NEXT: por %xmm1, %xmm0 696; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3] 697; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] 698; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 699; SSSE3-NEXT: retq 700; 701; SSE41-LABEL: combine_bitwise_ops_test5c: 702; SSE41: # BB#0: 703; SSE41-NEXT: por %xmm1, %xmm0 704; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7] 705; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 706; SSE41-NEXT: retq 707; 708; AVX1-LABEL: combine_bitwise_ops_test5c: 709; AVX1: # BB#0: 710; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 711; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7] 712; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 713; AVX1-NEXT: retq 714; 715; AVX2-LABEL: combine_bitwise_ops_test5c: 716; AVX2: # BB#0: 717; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 718; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3] 719; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 720; AVX2-NEXT: retq 721 %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 2, i32 5, i32 7> 722 %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 2, i32 5, i32 7> 723 %or = or <4 x i32> %shuf1, %shuf2 724 ret <4 x i32> %or 725} 726 727define <4 x i32> @combine_bitwise_ops_test6c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { 728; SSE2-LABEL: combine_bitwise_ops_test6c: 729; SSE2: # BB#0: 730; SSE2-NEXT: pxor %xmm1, %xmm0 731; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3] 732; SSE2-NEXT: pxor %xmm0, %xmm0 733; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 734; SSE2-NEXT: 
retq 735; 736; SSSE3-LABEL: combine_bitwise_ops_test6c: 737; SSSE3: # BB#0: 738; SSSE3-NEXT: pxor %xmm1, %xmm0 739; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3] 740; SSSE3-NEXT: pxor %xmm0, %xmm0 741; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 742; SSSE3-NEXT: retq 743; 744; SSE41-LABEL: combine_bitwise_ops_test6c: 745; SSE41: # BB#0: 746; SSE41-NEXT: pxor %xmm1, %xmm0 747; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,1,3] 748; SSE41-NEXT: pxor %xmm0, %xmm0 749; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] 750; SSE41-NEXT: retq 751; 752; AVX1-LABEL: combine_bitwise_ops_test6c: 753; AVX1: # BB#0: 754; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 755; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] 756; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 757; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] 758; AVX1-NEXT: retq 759; 760; AVX2-LABEL: combine_bitwise_ops_test6c: 761; AVX2: # BB#0: 762; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 763; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] 764; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 765; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] 766; AVX2-NEXT: retq 767 %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 2, i32 5, i32 7> 768 %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 2, i32 5, i32 7> 769 %xor = xor <4 x i32> %shuf1, %shuf2 770 ret <4 x i32> %xor 771} 772 773define <4 x i32> @combine_nested_undef_test1(<4 x i32> %A, <4 x i32> %B) { 774; SSE-LABEL: combine_nested_undef_test1: 775; SSE: # BB#0: 776; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,0,1] 777; SSE-NEXT: retq 778; 779; AVX-LABEL: combine_nested_undef_test1: 780; AVX: # BB#0: 781; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,0,1] 782; AVX-NEXT: retq 783 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 4, i32 3, i32 1> 784 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 0, i32 3> 785 ret <4 x i32> %2 786} 787 788define <4 x i32> @combine_nested_undef_test2(<4 x i32> %A, <4 x i32> %B) { 789; SSE-LABEL: combine_nested_undef_test2: 790; SSE: # BB#0: 791; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3] 792; SSE-NEXT: retq 793; 794; AVX-LABEL: combine_nested_undef_test2: 795; AVX: # BB#0: 796; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,0,3] 797; AVX-NEXT: retq 798 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 5, i32 2, i32 3> 799 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 0, i32 3> 800 ret <4 x i32> %2 801} 802 803define <4 x i32> @combine_nested_undef_test3(<4 x i32> %A, <4 x i32> %B) { 804; SSE-LABEL: combine_nested_undef_test3: 805; SSE: # BB#0: 806; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3] 807; SSE-NEXT: retq 808; 809; AVX-LABEL: combine_nested_undef_test3: 810; AVX: # BB#0: 811; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,0,3] 812; AVX-NEXT: retq 813 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 6, i32 2, i32 3> 814 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 0, i32 3> 815 ret <4 x i32> %2 816} 817 818define <4 x i32> @combine_nested_undef_test4(<4 x i32> %A, <4 x i32> %B) { 819; SSE-LABEL: combine_nested_undef_test4: 820; SSE: # BB#0: 821; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] 822; SSE-NEXT: retq 823; 824; AVX1-LABEL: combine_nested_undef_test4: 825; AVX1: # BB#0: 826; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] 827; AVX1-NEXT: retq 828; 829; AVX2-LABEL: combine_nested_undef_test4: 830; AVX2: # BB#0: 831; AVX2-NEXT: 
vpbroadcastq %xmm0, %xmm0 832; AVX2-NEXT: retq 833 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 4, i32 7, i32 1> 834 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 4, i32 4, i32 0, i32 3> 835 ret <4 x i32> %2 836} 837 838define <4 x i32> @combine_nested_undef_test5(<4 x i32> %A, <4 x i32> %B) { 839; SSE-LABEL: combine_nested_undef_test5: 840; SSE: # BB#0: 841; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] 842; SSE-NEXT: retq 843; 844; AVX-LABEL: combine_nested_undef_test5: 845; AVX: # BB#0: 846; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] 847; AVX-NEXT: retq 848 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 5, i32 5, i32 2, i32 3> 849 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 4, i32 3> 850 ret <4 x i32> %2 851} 852 853define <4 x i32> @combine_nested_undef_test6(<4 x i32> %A, <4 x i32> %B) { 854; SSE-LABEL: combine_nested_undef_test6: 855; SSE: # BB#0: 856; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] 857; SSE-NEXT: retq 858; 859; AVX-LABEL: combine_nested_undef_test6: 860; AVX: # BB#0: 861; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] 862; AVX-NEXT: retq 863 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 6, i32 2, i32 4> 864 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 0, i32 4> 865 ret <4 x i32> %2 866} 867 868define <4 x i32> @combine_nested_undef_test7(<4 x i32> %A, <4 x i32> %B) { 869; SSE-LABEL: combine_nested_undef_test7: 870; SSE: # BB#0: 871; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,0,2] 872; SSE-NEXT: retq 873; 874; AVX-LABEL: combine_nested_undef_test7: 875; AVX: # BB#0: 876; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,0,2] 877; AVX-NEXT: retq 878 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 5, i32 2, i32 7> 879 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 0, i32 2> 880 ret <4 x i32> %2 881} 882 883define <4 x i32> @combine_nested_undef_test8(<4 x i32> %A, <4 x i32> %B) { 884; SSE-LABEL: combine_nested_undef_test8: 885; SSE: # BB#0: 886; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] 887; SSE-NEXT: retq 888; 889; AVX-LABEL: combine_nested_undef_test8: 890; AVX: # BB#0: 891; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] 892; AVX-NEXT: retq 893 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 1, i32 6, i32 3> 894 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 4, i32 3, i32 4> 895 ret <4 x i32> %2 896} 897 898define <4 x i32> @combine_nested_undef_test9(<4 x i32> %A, <4 x i32> %B) { 899; SSE-LABEL: combine_nested_undef_test9: 900; SSE: # BB#0: 901; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,2] 902; SSE-NEXT: retq 903; 904; AVX-LABEL: combine_nested_undef_test9: 905; AVX: # BB#0: 906; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,2] 907; AVX-NEXT: retq 908 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 3, i32 2, i32 5> 909 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 4, i32 2> 910 ret <4 x i32> %2 911} 912 913define <4 x i32> @combine_nested_undef_test10(<4 x i32> %A, <4 x i32> %B) { 914; SSE-LABEL: combine_nested_undef_test10: 915; SSE: # BB#0: 916; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,3] 917; SSE-NEXT: retq 918; 919; AVX-LABEL: combine_nested_undef_test10: 920; AVX: # BB#0: 921; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,3] 922; AVX-NEXT: retq 923 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 1, i32 5, i32 5> 924 %2 = shufflevector 
<4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 4> 925 ret <4 x i32> %2 926} 927 928define <4 x i32> @combine_nested_undef_test11(<4 x i32> %A, <4 x i32> %B) { 929; SSE-LABEL: combine_nested_undef_test11: 930; SSE: # BB#0: 931; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,1] 932; SSE-NEXT: retq 933; 934; AVX-LABEL: combine_nested_undef_test11: 935; AVX: # BB#0: 936; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,1] 937; AVX-NEXT: retq 938 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 2, i32 5, i32 4> 939 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 0> 940 ret <4 x i32> %2 941} 942 943define <4 x i32> @combine_nested_undef_test12(<4 x i32> %A, <4 x i32> %B) { 944; SSE-LABEL: combine_nested_undef_test12: 945; SSE: # BB#0: 946; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] 947; SSE-NEXT: retq 948; 949; AVX1-LABEL: combine_nested_undef_test12: 950; AVX1: # BB#0: 951; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] 952; AVX1-NEXT: retq 953; 954; AVX2-LABEL: combine_nested_undef_test12: 955; AVX2: # BB#0: 956; AVX2-NEXT: vpbroadcastq %xmm0, %xmm0 957; AVX2-NEXT: retq 958 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 0, i32 2, i32 4> 959 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 4, i32 0, i32 4> 960 ret <4 x i32> %2 961} 962 963; The following pair of shuffles is folded into vector %A. 964define <4 x i32> @combine_nested_undef_test13(<4 x i32> %A, <4 x i32> %B) { 965; ALL-LABEL: combine_nested_undef_test13: 966; ALL: # BB#0: 967; ALL-NEXT: retq 968 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 4, i32 2, i32 6> 969 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 4, i32 0, i32 2, i32 4> 970 ret <4 x i32> %2 971} 972 973; The following pair of shuffles is folded into vector %B. 974define <4 x i32> @combine_nested_undef_test14(<4 x i32> %A, <4 x i32> %B) { 975; SSE-LABEL: combine_nested_undef_test14: 976; SSE: # BB#0: 977; SSE-NEXT: movaps %xmm1, %xmm0 978; SSE-NEXT: retq 979; 980; AVX-LABEL: combine_nested_undef_test14: 981; AVX: # BB#0: 982; AVX-NEXT: vmovaps %xmm1, %xmm0 983; AVX-NEXT: retq 984 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 6, i32 2, i32 4> 985 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 3, i32 4, i32 1, i32 4> 986 ret <4 x i32> %2 987} 988 989 990; Verify that we don't optimize the following cases. We expect more than one shuffle. 991; 992; FIXME: Many of these already don't make sense, and the rest should stop 993; making sense with th enew vector shuffle lowering. Revisit at least testing for 994; it. 
995 996define <4 x i32> @combine_nested_undef_test15(<4 x i32> %A, <4 x i32> %B) { 997; SSE2-LABEL: combine_nested_undef_test15: 998; SSE2: # BB#0: 999; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0] 1000; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[0,1] 1001; SSE2-NEXT: movaps %xmm1, %xmm0 1002; SSE2-NEXT: retq 1003; 1004; SSSE3-LABEL: combine_nested_undef_test15: 1005; SSSE3: # BB#0: 1006; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0] 1007; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[0,1] 1008; SSSE3-NEXT: movaps %xmm1, %xmm0 1009; SSSE3-NEXT: retq 1010; 1011; SSE41-LABEL: combine_nested_undef_test15: 1012; SSE41: # BB#0: 1013; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] 1014; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,0,1] 1015; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7] 1016; SSE41-NEXT: retq 1017; 1018; AVX1-LABEL: combine_nested_undef_test15: 1019; AVX1: # BB#0: 1020; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] 1021; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,0,1] 1022; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7] 1023; AVX1-NEXT: retq 1024; 1025; AVX2-LABEL: combine_nested_undef_test15: 1026; AVX2: # BB#0: 1027; AVX2-NEXT: vpbroadcastd %xmm1, %xmm1 1028; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,0,1] 1029; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] 1030; AVX2-NEXT: retq 1031 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 4, i32 3, i32 1> 1032 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 1, i32 0, i32 3> 1033 ret <4 x i32> %2 1034} 1035 1036define <4 x i32> @combine_nested_undef_test16(<4 x i32> %A, <4 x i32> %B) { 1037; SSE2-LABEL: combine_nested_undef_test16: 1038; SSE2: # BB#0: 1039; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] 1040; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,0,2,3] 1041; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 1042; SSE2-NEXT: retq 1043; 1044; SSSE3-LABEL: combine_nested_undef_test16: 1045; SSSE3: # BB#0: 1046; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] 1047; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,0,2,3] 1048; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 1049; SSSE3-NEXT: retq 1050; 1051; SSE41-LABEL: combine_nested_undef_test16: 1052; SSE41: # BB#0: 1053; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] 1054; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] 1055; SSE41-NEXT: retq 1056; 1057; AVX1-LABEL: combine_nested_undef_test16: 1058; AVX1: # BB#0: 1059; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] 1060; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] 1061; AVX1-NEXT: retq 1062; 1063; AVX2-LABEL: combine_nested_undef_test16: 1064; AVX2: # BB#0: 1065; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] 1066; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] 1067; AVX2-NEXT: retq 1068 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 5, i32 2, i32 7> 1069 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 1, i32 0, i32 3> 1070 ret <4 x i32> %2 1071} 1072 1073define <4 x i32> @combine_nested_undef_test17(<4 x i32> %A, <4 x i32> %B) { 1074; SSE2-LABEL: combine_nested_undef_test17: 1075; SSE2: # BB#0: 1076; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[1,0] 1077; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[0,2] 1078; SSE2-NEXT: retq 1079; 1080; SSSE3-LABEL: combine_nested_undef_test17: 1081; SSSE3: # BB#0: 1082; SSSE3-NEXT: 
shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[1,0] 1083; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[0,2] 1084; SSSE3-NEXT: retq 1085; 1086; SSE41-LABEL: combine_nested_undef_test17: 1087; SSE41: # BB#0: 1088; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7] 1089; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,0,1] 1090; SSE41-NEXT: retq 1091; 1092; AVX1-LABEL: combine_nested_undef_test17: 1093; AVX1: # BB#0: 1094; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7] 1095; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,0,1] 1096; AVX1-NEXT: retq 1097; 1098; AVX2-LABEL: combine_nested_undef_test17: 1099; AVX2: # BB#0: 1100; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] 1101; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,0,1] 1102; AVX2-NEXT: retq 1103 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 1, i32 3, i32 1> 1104 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 1, i32 0, i32 3> 1105 ret <4 x i32> %2 1106} 1107 1108define <4 x i32> @combine_nested_undef_test18(<4 x i32> %A, <4 x i32> %B) { 1109; SSE-LABEL: combine_nested_undef_test18: 1110; SSE: # BB#0: 1111; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,0,3] 1112; SSE-NEXT: retq 1113; 1114; AVX-LABEL: combine_nested_undef_test18: 1115; AVX: # BB#0: 1116; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[1,1,0,3] 1117; AVX-NEXT: retq 1118 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 5, i32 2, i32 7> 1119 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 0, i32 3> 1120 ret <4 x i32> %2 1121} 1122 1123define <4 x i32> @combine_nested_undef_test19(<4 x i32> %A, <4 x i32> %B) { 1124; SSE2-LABEL: combine_nested_undef_test19: 1125; SSE2: # BB#0: 1126; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 1127; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,0,0,0] 1128; SSE2-NEXT: retq 1129; 1130; SSSE3-LABEL: combine_nested_undef_test19: 1131; SSSE3: # BB#0: 1132; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 1133; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,0,0,0] 1134; SSSE3-NEXT: retq 1135; 1136; SSE41-LABEL: combine_nested_undef_test19: 1137; SSE41: # BB#0: 1138; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7] 1139; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,0,0] 1140; SSE41-NEXT: retq 1141; 1142; AVX1-LABEL: combine_nested_undef_test19: 1143; AVX1: # BB#0: 1144; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7] 1145; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,0,0] 1146; AVX1-NEXT: retq 1147; 1148; AVX2-LABEL: combine_nested_undef_test19: 1149; AVX2: # BB#0: 1150; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] 1151; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,0,0] 1152; AVX2-NEXT: retq 1153 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 4, i32 5, i32 6> 1154 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 0, i32 0, i32 0> 1155 ret <4 x i32> %2 1156} 1157 1158define <4 x i32> @combine_nested_undef_test20(<4 x i32> %A, <4 x i32> %B) { 1159; SSE2-LABEL: combine_nested_undef_test20: 1160; SSE2: # BB#0: 1161; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,3] 1162; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,3,1] 1163; SSE2-NEXT: movaps %xmm1, %xmm0 1164; SSE2-NEXT: retq 1165; 1166; SSSE3-LABEL: combine_nested_undef_test20: 1167; SSSE3: # BB#0: 1168; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,3] 1169; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,3,1] 1170; SSSE3-NEXT: movaps %xmm1, %xmm0 
1171; SSSE3-NEXT: retq 1172; 1173; SSE41-LABEL: combine_nested_undef_test20: 1174; SSE41: # BB#0: 1175; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] 1176; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,3,0] 1177; SSE41-NEXT: retq 1178; 1179; AVX1-LABEL: combine_nested_undef_test20: 1180; AVX1: # BB#0: 1181; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] 1182; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,3,0] 1183; AVX1-NEXT: retq 1184; 1185; AVX2-LABEL: combine_nested_undef_test20: 1186; AVX2: # BB#0: 1187; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] 1188; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,3,0] 1189; AVX2-NEXT: retq 1190 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 3, i32 2, i32 4, i32 4> 1191 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 1, i32 0, i32 3> 1192 ret <4 x i32> %2 1193} 1194 1195define <4 x i32> @combine_nested_undef_test21(<4 x i32> %A, <4 x i32> %B) { 1196; SSE2-LABEL: combine_nested_undef_test21: 1197; SSE2: # BB#0: 1198; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 1199; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,3,0,3] 1200; SSE2-NEXT: retq 1201; 1202; SSSE3-LABEL: combine_nested_undef_test21: 1203; SSSE3: # BB#0: 1204; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 1205; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,3,0,3] 1206; SSSE3-NEXT: retq 1207; 1208; SSE41-LABEL: combine_nested_undef_test21: 1209; SSE41: # BB#0: 1210; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7] 1211; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] 1212; SSE41-NEXT: retq 1213; 1214; AVX1-LABEL: combine_nested_undef_test21: 1215; AVX1: # BB#0: 1216; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7] 1217; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] 1218; AVX1-NEXT: retq 1219; 1220; AVX2-LABEL: combine_nested_undef_test21: 1221; AVX2: # BB#0: 1222; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] 1223; AVX2-NEXT: vpbroadcastq %xmm0, %xmm0 1224; AVX2-NEXT: retq 1225 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 1, i32 3, i32 1> 1226 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 3> 1227 ret <4 x i32> %2 1228} 1229 1230 1231; Test that we correctly combine shuffles according to rule 1232; shuffle(shuffle(x, y), undef) -> shuffle(y, undef) 1233 1234define <4 x i32> @combine_nested_undef_test22(<4 x i32> %A, <4 x i32> %B) { 1235; SSE-LABEL: combine_nested_undef_test22: 1236; SSE: # BB#0: 1237; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,3] 1238; SSE-NEXT: retq 1239; 1240; AVX-LABEL: combine_nested_undef_test22: 1241; AVX: # BB#0: 1242; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[1,1,1,3] 1243; AVX-NEXT: retq 1244 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 5, i32 2, i32 7> 1245 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 3> 1246 ret <4 x i32> %2 1247} 1248 1249define <4 x i32> @combine_nested_undef_test23(<4 x i32> %A, <4 x i32> %B) { 1250; SSE-LABEL: combine_nested_undef_test23: 1251; SSE: # BB#0: 1252; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,3] 1253; SSE-NEXT: retq 1254; 1255; AVX-LABEL: combine_nested_undef_test23: 1256; AVX: # BB#0: 1257; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,1,0,3] 1258; AVX-NEXT: retq 1259 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 5, i32 2, i32 7> 1260 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 1, 
i32 0, i32 3> 1261 ret <4 x i32> %2 1262} 1263 1264define <4 x i32> @combine_nested_undef_test24(<4 x i32> %A, <4 x i32> %B) { 1265; SSE-LABEL: combine_nested_undef_test24: 1266; SSE: # BB#0: 1267; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,3,2,3] 1268; SSE-NEXT: retq 1269; 1270; AVX-LABEL: combine_nested_undef_test24: 1271; AVX: # BB#0: 1272; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,3,2,3] 1273; AVX-NEXT: retq 1274 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 1, i32 6, i32 7> 1275 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 3, i32 2, i32 4> 1276 ret <4 x i32> %2 1277} 1278 1279define <4 x i32> @combine_nested_undef_test25(<4 x i32> %A, <4 x i32> %B) { 1280; SSE-LABEL: combine_nested_undef_test25: 1281; SSE: # BB#0: 1282; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] 1283; SSE-NEXT: retq 1284; 1285; AVX1-LABEL: combine_nested_undef_test25: 1286; AVX1: # BB#0: 1287; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] 1288; AVX1-NEXT: retq 1289; 1290; AVX2-LABEL: combine_nested_undef_test25: 1291; AVX2: # BB#0: 1292; AVX2-NEXT: vpbroadcastq %xmm0, %xmm0 1293; AVX2-NEXT: retq 1294 %1 = shufflevector <4 x i32> %B, <4 x i32> %A, <4 x i32> <i32 1, i32 5, i32 2, i32 4> 1295 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 3, i32 1, i32 3, i32 1> 1296 ret <4 x i32> %2 1297} 1298 1299define <4 x i32> @combine_nested_undef_test26(<4 x i32> %A, <4 x i32> %B) { 1300; SSE-LABEL: combine_nested_undef_test26: 1301; SSE: # BB#0: 1302; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] 1303; SSE-NEXT: retq 1304; 1305; AVX-LABEL: combine_nested_undef_test26: 1306; AVX: # BB#0: 1307; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] 1308; AVX-NEXT: retq 1309 %1 = shufflevector <4 x i32> %B, <4 x i32> %A, <4 x i32> <i32 1, i32 2, i32 6, i32 7> 1310 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 2, i32 3> 1311 ret <4 x i32> %2 1312} 1313 1314define <4 x i32> @combine_nested_undef_test27(<4 x i32> %A, <4 x i32> %B) { 1315; SSE-LABEL: combine_nested_undef_test27: 1316; SSE: # BB#0: 1317; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] 1318; SSE-NEXT: retq 1319; 1320; AVX1-LABEL: combine_nested_undef_test27: 1321; AVX1: # BB#0: 1322; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] 1323; AVX1-NEXT: retq 1324; 1325; AVX2-LABEL: combine_nested_undef_test27: 1326; AVX2: # BB#0: 1327; AVX2-NEXT: vpbroadcastq %xmm0, %xmm0 1328; AVX2-NEXT: retq 1329 %1 = shufflevector <4 x i32> %B, <4 x i32> %A, <4 x i32> <i32 2, i32 1, i32 5, i32 4> 1330 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 3, i32 2> 1331 ret <4 x i32> %2 1332} 1333 1334define <4 x i32> @combine_nested_undef_test28(<4 x i32> %A, <4 x i32> %B) { 1335; SSE-LABEL: combine_nested_undef_test28: 1336; SSE: # BB#0: 1337; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,0] 1338; SSE-NEXT: retq 1339; 1340; AVX-LABEL: combine_nested_undef_test28: 1341; AVX: # BB#0: 1342; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,1,0] 1343; AVX-NEXT: retq 1344 %1 = shufflevector <4 x i32> %B, <4 x i32> %A, <4 x i32> <i32 1, i32 2, i32 4, i32 5> 1345 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 3, i32 2> 1346 ret <4 x i32> %2 1347} 1348 1349define <4 x float> @combine_test1(<4 x float> %a, <4 x float> %b) { 1350; SSE-LABEL: combine_test1: 1351; SSE: # BB#0: 1352; SSE-NEXT: movaps %xmm1, %xmm0 1353; SSE-NEXT: retq 1354; 1355; AVX-LABEL: combine_test1: 1356; AVX: # BB#0: 1357; AVX-NEXT: vmovaps %xmm1, %xmm0 1358; AVX-NEXT: retq 1359 %1 = 
shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3> 1360 %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7> 1361 ret <4 x float> %2 1362} 1363 1364define <4 x float> @combine_test2(<4 x float> %a, <4 x float> %b) { 1365; SSE2-LABEL: combine_test2: 1366; SSE2: # BB#0: 1367; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] 1368; SSE2-NEXT: movaps %xmm1, %xmm0 1369; SSE2-NEXT: retq 1370; 1371; SSSE3-LABEL: combine_test2: 1372; SSSE3: # BB#0: 1373; SSSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] 1374; SSSE3-NEXT: movaps %xmm1, %xmm0 1375; SSSE3-NEXT: retq 1376; 1377; SSE41-LABEL: combine_test2: 1378; SSE41: # BB#0: 1379; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] 1380; SSE41-NEXT: retq 1381; 1382; AVX-LABEL: combine_test2: 1383; AVX: # BB#0: 1384; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] 1385; AVX-NEXT: retq 1386 %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7> 1387 %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 3> 1388 ret <4 x float> %2 1389} 1390 1391define <4 x float> @combine_test3(<4 x float> %a, <4 x float> %b) { 1392; SSE-LABEL: combine_test3: 1393; SSE: # BB#0: 1394; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1395; SSE-NEXT: retq 1396; 1397; AVX-LABEL: combine_test3: 1398; AVX: # BB#0: 1399; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1400; AVX-NEXT: retq 1401 %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 1, i32 7> 1402 %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 1> 1403 ret <4 x float> %2 1404} 1405 1406define <4 x float> @combine_test4(<4 x float> %a, <4 x float> %b) { 1407; SSE-LABEL: combine_test4: 1408; SSE: # BB#0: 1409; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] 1410; SSE-NEXT: movapd %xmm1, %xmm0 1411; SSE-NEXT: retq 1412; 1413; AVX-LABEL: combine_test4: 1414; AVX: # BB#0: 1415; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] 1416; AVX-NEXT: retq 1417 %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 2, i32 3, i32 5, i32 5> 1418 %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1> 1419 ret <4 x float> %2 1420} 1421 1422define <4 x float> @combine_test5(<4 x float> %a, <4 x float> %b) { 1423; SSE2-LABEL: combine_test5: 1424; SSE2: # BB#0: 1425; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0] 1426; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] 1427; SSE2-NEXT: retq 1428; 1429; SSSE3-LABEL: combine_test5: 1430; SSSE3: # BB#0: 1431; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0] 1432; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] 1433; SSSE3-NEXT: retq 1434; 1435; SSE41-LABEL: combine_test5: 1436; SSE41: # BB#0: 1437; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] 1438; SSE41-NEXT: retq 1439; 1440; AVX-LABEL: combine_test5: 1441; AVX: # BB#0: 1442; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] 1443; AVX-NEXT: retq 1444 %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3> 1445 %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 7> 1446 ret <4 x float> %2 1447} 1448 1449define <4 x i32> @combine_test6(<4 x i32> %a, <4 x i32> %b) { 1450; SSE-LABEL: combine_test6: 1451; SSE: # BB#0: 1452; SSE-NEXT: movaps %xmm1, %xmm0 1453; SSE-NEXT: retq 1454; 1455; AVX-LABEL: combine_test6: 1456; AVX: # BB#0: 1457; AVX-NEXT: 
vmovaps %xmm1, %xmm0 1458; AVX-NEXT: retq 1459 %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3> 1460 %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7> 1461 ret <4 x i32> %2 1462} 1463 1464define <4 x i32> @combine_test7(<4 x i32> %a, <4 x i32> %b) { 1465; SSE2-LABEL: combine_test7: 1466; SSE2: # BB#0: 1467; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] 1468; SSE2-NEXT: movaps %xmm1, %xmm0 1469; SSE2-NEXT: retq 1470; 1471; SSSE3-LABEL: combine_test7: 1472; SSSE3: # BB#0: 1473; SSSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] 1474; SSSE3-NEXT: movaps %xmm1, %xmm0 1475; SSSE3-NEXT: retq 1476; 1477; SSE41-LABEL: combine_test7: 1478; SSE41: # BB#0: 1479; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] 1480; SSE41-NEXT: retq 1481; 1482; AVX1-LABEL: combine_test7: 1483; AVX1: # BB#0: 1484; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] 1485; AVX1-NEXT: retq 1486; 1487; AVX2-LABEL: combine_test7: 1488; AVX2: # BB#0: 1489; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] 1490; AVX2-NEXT: retq 1491 %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7> 1492 %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 3> 1493 ret <4 x i32> %2 1494} 1495 1496define <4 x i32> @combine_test8(<4 x i32> %a, <4 x i32> %b) { 1497; SSE-LABEL: combine_test8: 1498; SSE: # BB#0: 1499; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1500; SSE-NEXT: retq 1501; 1502; AVX-LABEL: combine_test8: 1503; AVX: # BB#0: 1504; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1505; AVX-NEXT: retq 1506 %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 5, i32 1, i32 7> 1507 %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 1> 1508 ret <4 x i32> %2 1509} 1510 1511define <4 x i32> @combine_test9(<4 x i32> %a, <4 x i32> %b) { 1512; SSE-LABEL: combine_test9: 1513; SSE: # BB#0: 1514; SSE-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm0[1] 1515; SSE-NEXT: movdqa %xmm1, %xmm0 1516; SSE-NEXT: retq 1517; 1518; AVX-LABEL: combine_test9: 1519; AVX: # BB#0: 1520; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1] 1521; AVX-NEXT: retq 1522 %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 3, i32 5, i32 5> 1523 %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1> 1524 ret <4 x i32> %2 1525} 1526 1527define <4 x i32> @combine_test10(<4 x i32> %a, <4 x i32> %b) { 1528; SSE2-LABEL: combine_test10: 1529; SSE2: # BB#0: 1530; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0] 1531; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] 1532; SSE2-NEXT: retq 1533; 1534; SSSE3-LABEL: combine_test10: 1535; SSSE3: # BB#0: 1536; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0] 1537; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] 1538; SSSE3-NEXT: retq 1539; 1540; SSE41-LABEL: combine_test10: 1541; SSE41: # BB#0: 1542; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7] 1543; SSE41-NEXT: retq 1544; 1545; AVX1-LABEL: combine_test10: 1546; AVX1: # BB#0: 1547; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7] 1548; AVX1-NEXT: retq 1549; 1550; AVX2-LABEL: combine_test10: 1551; AVX2: # BB#0: 1552; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] 1553; AVX2-NEXT: retq 1554 %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3> 1555 %2 = shufflevector <4 x i32> %1, <4 x i32> 
%b, <4 x i32> <i32 0, i32 1, i32 2, i32 7> 1556 ret <4 x i32> %2 1557} 1558 1559define <4 x float> @combine_test11(<4 x float> %a, <4 x float> %b) { 1560; ALL-LABEL: combine_test11: 1561; ALL: # BB#0: 1562; ALL-NEXT: retq 1563 %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3> 1564 %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 4, i32 1, i32 6, i32 3> 1565 ret <4 x float> %2 1566} 1567 1568define <4 x float> @combine_test12(<4 x float> %a, <4 x float> %b) { 1569; SSE2-LABEL: combine_test12: 1570; SSE2: # BB#0: 1571; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] 1572; SSE2-NEXT: movaps %xmm1, %xmm0 1573; SSE2-NEXT: retq 1574; 1575; SSSE3-LABEL: combine_test12: 1576; SSSE3: # BB#0: 1577; SSSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] 1578; SSSE3-NEXT: movaps %xmm1, %xmm0 1579; SSSE3-NEXT: retq 1580; 1581; SSE41-LABEL: combine_test12: 1582; SSE41: # BB#0: 1583; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] 1584; SSE41-NEXT: retq 1585; 1586; AVX-LABEL: combine_test12: 1587; AVX: # BB#0: 1588; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] 1589; AVX-NEXT: retq 1590 %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7> 1591 %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 4, i32 1, i32 2, i32 3> 1592 ret <4 x float> %2 1593} 1594 1595define <4 x float> @combine_test13(<4 x float> %a, <4 x float> %b) { 1596; SSE-LABEL: combine_test13: 1597; SSE: # BB#0: 1598; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1599; SSE-NEXT: retq 1600; 1601; AVX-LABEL: combine_test13: 1602; AVX: # BB#0: 1603; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1604; AVX-NEXT: retq 1605 %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5> 1606 %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 4, i32 5, i32 2, i32 3> 1607 ret <4 x float> %2 1608} 1609 1610define <4 x float> @combine_test14(<4 x float> %a, <4 x float> %b) { 1611; SSE-LABEL: combine_test14: 1612; SSE: # BB#0: 1613; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] 1614; SSE-NEXT: retq 1615; 1616; AVX-LABEL: combine_test14: 1617; AVX: # BB#0: 1618; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] 1619; AVX-NEXT: retq 1620 %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 6, i32 7, i32 5, i32 5> 1621 %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 6, i32 7, i32 0, i32 1> 1622 ret <4 x float> %2 1623} 1624 1625define <4 x float> @combine_test15(<4 x float> %a, <4 x float> %b) { 1626; SSE2-LABEL: combine_test15: 1627; SSE2: # BB#0: 1628; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0] 1629; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] 1630; SSE2-NEXT: retq 1631; 1632; SSSE3-LABEL: combine_test15: 1633; SSSE3: # BB#0: 1634; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0] 1635; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] 1636; SSSE3-NEXT: retq 1637; 1638; SSE41-LABEL: combine_test15: 1639; SSE41: # BB#0: 1640; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] 1641; SSE41-NEXT: retq 1642; 1643; AVX-LABEL: combine_test15: 1644; AVX: # BB#0: 1645; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] 1646; AVX-NEXT: retq 1647 %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 7> 1648 %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 2, i32 3> 1649 ret <4 x float> %2 1650} 1651 1652define <4 x i32> 
@combine_test16(<4 x i32> %a, <4 x i32> %b) { 1653; ALL-LABEL: combine_test16: 1654; ALL: # BB#0: 1655; ALL-NEXT: retq 1656 %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3> 1657 %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 4, i32 1, i32 6, i32 3> 1658 ret <4 x i32> %2 1659} 1660 1661define <4 x i32> @combine_test17(<4 x i32> %a, <4 x i32> %b) { 1662; SSE2-LABEL: combine_test17: 1663; SSE2: # BB#0: 1664; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] 1665; SSE2-NEXT: movaps %xmm1, %xmm0 1666; SSE2-NEXT: retq 1667; 1668; SSSE3-LABEL: combine_test17: 1669; SSSE3: # BB#0: 1670; SSSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] 1671; SSSE3-NEXT: movaps %xmm1, %xmm0 1672; SSSE3-NEXT: retq 1673; 1674; SSE41-LABEL: combine_test17: 1675; SSE41: # BB#0: 1676; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] 1677; SSE41-NEXT: retq 1678; 1679; AVX1-LABEL: combine_test17: 1680; AVX1: # BB#0: 1681; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] 1682; AVX1-NEXT: retq 1683; 1684; AVX2-LABEL: combine_test17: 1685; AVX2: # BB#0: 1686; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] 1687; AVX2-NEXT: retq 1688 %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7> 1689 %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 4, i32 1, i32 2, i32 3> 1690 ret <4 x i32> %2 1691} 1692 1693define <4 x i32> @combine_test18(<4 x i32> %a, <4 x i32> %b) { 1694; SSE-LABEL: combine_test18: 1695; SSE: # BB#0: 1696; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1697; SSE-NEXT: retq 1698; 1699; AVX-LABEL: combine_test18: 1700; AVX: # BB#0: 1701; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1702; AVX-NEXT: retq 1703 %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5> 1704 %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 4, i32 5, i32 2, i32 3> 1705 ret <4 x i32> %2 1706} 1707 1708define <4 x i32> @combine_test19(<4 x i32> %a, <4 x i32> %b) { 1709; SSE-LABEL: combine_test19: 1710; SSE: # BB#0: 1711; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] 1712; SSE-NEXT: retq 1713; 1714; AVX-LABEL: combine_test19: 1715; AVX: # BB#0: 1716; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] 1717; AVX-NEXT: retq 1718 %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 6, i32 7, i32 5, i32 5> 1719 %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 6, i32 7, i32 0, i32 1> 1720 ret <4 x i32> %2 1721} 1722 1723define <4 x i32> @combine_test20(<4 x i32> %a, <4 x i32> %b) { 1724; SSE2-LABEL: combine_test20: 1725; SSE2: # BB#0: 1726; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0] 1727; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] 1728; SSE2-NEXT: retq 1729; 1730; SSSE3-LABEL: combine_test20: 1731; SSSE3: # BB#0: 1732; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0] 1733; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] 1734; SSSE3-NEXT: retq 1735; 1736; SSE41-LABEL: combine_test20: 1737; SSE41: # BB#0: 1738; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7] 1739; SSE41-NEXT: retq 1740; 1741; AVX1-LABEL: combine_test20: 1742; AVX1: # BB#0: 1743; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7] 1744; AVX1-NEXT: retq 1745; 1746; AVX2-LABEL: combine_test20: 1747; AVX2: # BB#0: 1748; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] 1749; AVX2-NEXT: retq 1750 %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 
7> 1751 %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 0, i32 5, i32 2, i32 3> 1752 ret <4 x i32> %2 1753} 1754 1755define <4 x i32> @combine_test21(<8 x i32> %a, <4 x i32>* %ptr) { 1756; SSE-LABEL: combine_test21: 1757; SSE: # BB#0: 1758; SSE-NEXT: movdqa %xmm0, %xmm2 1759; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] 1760; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] 1761; SSE-NEXT: movdqa %xmm2, (%rdi) 1762; SSE-NEXT: retq 1763; 1764; AVX1-LABEL: combine_test21: 1765; AVX1: # BB#0: 1766; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 1767; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm0[0],xmm1[0] 1768; AVX1-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] 1769; AVX1-NEXT: vmovdqa %xmm2, (%rdi) 1770; AVX1-NEXT: vzeroupper 1771; AVX1-NEXT: retq 1772; 1773; AVX2-LABEL: combine_test21: 1774; AVX2: # BB#0: 1775; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 1776; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm0[0],xmm1[0] 1777; AVX2-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] 1778; AVX2-NEXT: vmovdqa %xmm2, (%rdi) 1779; AVX2-NEXT: vzeroupper 1780; AVX2-NEXT: retq 1781 %1 = shufflevector <8 x i32> %a, <8 x i32> %a, <4 x i32> <i32 0, i32 1, i32 4, i32 5> 1782 %2 = shufflevector <8 x i32> %a, <8 x i32> %a, <4 x i32> <i32 2, i32 3, i32 6, i32 7> 1783 store <4 x i32> %1, <4 x i32>* %ptr, align 16 1784 ret <4 x i32> %2 1785} 1786 1787define <8 x float> @combine_test22(<2 x float>* %a, <2 x float>* %b) { 1788; SSE-LABEL: combine_test22: 1789; SSE: # BB#0: 1790; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero 1791; SSE-NEXT: movhpd {{.*#+}} xmm0 = xmm0[0],mem[0] 1792; SSE-NEXT: retq 1793; 1794; AVX-LABEL: combine_test22: 1795; AVX: # BB#0: 1796; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 1797; AVX-NEXT: vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0] 1798; AVX-NEXT: retq 1799; Current AVX2 lowering of this is still awful, not adding a test case. 1800 %1 = load <2 x float>, <2 x float>* %a, align 8 1801 %2 = load <2 x float>, <2 x float>* %b, align 8 1802 %3 = shufflevector <2 x float> %1, <2 x float> %2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef> 1803 ret <8 x float> %3 1804} 1805 1806; PR22359 1807define void @combine_test23(<8 x float> %v, <2 x float>* %ptr) { 1808; SSE-LABEL: combine_test23: 1809; SSE: # BB#0: 1810; SSE-NEXT: movups %xmm0, (%rdi) 1811; SSE-NEXT: retq 1812; 1813; AVX-LABEL: combine_test23: 1814; AVX: # BB#0: 1815; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] 1816; AVX-NEXT: vinsertps {{.*#+}} xmm1 = xmm0[0,1],xmm1[0],xmm0[3] 1817; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] 1818; AVX-NEXT: vmovups %xmm0, (%rdi) 1819; AVX-NEXT: vzeroupper 1820; AVX-NEXT: retq 1821 %idx2 = getelementptr inbounds <2 x float>, <2 x float>* %ptr, i64 1 1822 %shuffle0 = shufflevector <8 x float> %v, <8 x float> undef, <2 x i32> <i32 0, i32 1> 1823 %shuffle1 = shufflevector <8 x float> %v, <8 x float> undef, <2 x i32> <i32 2, i32 3> 1824 store <2 x float> %shuffle0, <2 x float>* %ptr, align 8 1825 store <2 x float> %shuffle1, <2 x float>* %idx2, align 8 1826 ret void 1827} 1828 1829; Check some negative cases. 1830; FIXME: Do any of these really make sense? Are they redundant with the above tests? 
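; A hand-worked sketch of the first negative case below (illustrative only; the
; @combine_test1b_sketch name is not part of the original test and carries no
; autogenerated CHECK lines): composing combine_test1b's masks <4,1,6,3> and
; <0,5,2,0> by hand gives <b0,b1,b2,b0>, so the pair still folds, but into a
; shuffle that reads only %b rather than a simple blend of %a and %b.
define <4 x float> @combine_test1b_sketch(<4 x float> %a, <4 x float> %b) {
  ; Equivalent single shuffle of %b once the two masks are composed.
  %1 = shufflevector <4 x float> %b, <4 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 0>
  ret <4 x float> %1
}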
1831 1832define <4 x float> @combine_test1b(<4 x float> %a, <4 x float> %b) { 1833; SSE-LABEL: combine_test1b: 1834; SSE: # BB#0: 1835; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1,2,0] 1836; SSE-NEXT: movaps %xmm1, %xmm0 1837; SSE-NEXT: retq 1838; 1839; AVX-LABEL: combine_test1b: 1840; AVX: # BB#0: 1841; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[0,1,2,0] 1842; AVX-NEXT: retq 1843 %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3> 1844 %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 0> 1845 ret <4 x float> %2 1846} 1847 1848define <4 x float> @combine_test2b(<4 x float> %a, <4 x float> %b) { 1849; SSE2-LABEL: combine_test2b: 1850; SSE2: # BB#0: 1851; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0,0] 1852; SSE2-NEXT: movaps %xmm1, %xmm0 1853; SSE2-NEXT: retq 1854; 1855; SSSE3-LABEL: combine_test2b: 1856; SSSE3: # BB#0: 1857; SSSE3-NEXT: movddup {{.*#+}} xmm0 = xmm1[0,0] 1858; SSSE3-NEXT: retq 1859; 1860; SSE41-LABEL: combine_test2b: 1861; SSE41: # BB#0: 1862; SSE41-NEXT: movddup {{.*#+}} xmm0 = xmm1[0,0] 1863; SSE41-NEXT: retq 1864; 1865; AVX-LABEL: combine_test2b: 1866; AVX: # BB#0: 1867; AVX-NEXT: vmovddup {{.*#+}} xmm0 = xmm1[0,0] 1868; AVX-NEXT: retq 1869 %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3> 1870 %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 0, i32 5> 1871 ret <4 x float> %2 1872} 1873 1874define <4 x float> @combine_test3b(<4 x float> %a, <4 x float> %b) { 1875; SSE2-LABEL: combine_test3b: 1876; SSE2: # BB#0: 1877; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[3,0] 1878; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3] 1879; SSE2-NEXT: retq 1880; 1881; SSSE3-LABEL: combine_test3b: 1882; SSSE3: # BB#0: 1883; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[3,0] 1884; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3] 1885; SSSE3-NEXT: retq 1886; 1887; SSE41-LABEL: combine_test3b: 1888; SSE41: # BB#0: 1889; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1] 1890; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3,2,3] 1891; SSE41-NEXT: retq 1892; 1893; AVX-LABEL: combine_test3b: 1894; AVX: # BB#0: 1895; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1] 1896; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,3,2,3] 1897; AVX-NEXT: retq 1898 %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 0, i32 6, i32 3> 1899 %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 7, i32 2, i32 7> 1900 ret <4 x float> %2 1901} 1902 1903define <4 x float> @combine_test4b(<4 x float> %a, <4 x float> %b) { 1904; SSE-LABEL: combine_test4b: 1905; SSE: # BB#0: 1906; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,2,3] 1907; SSE-NEXT: movaps %xmm1, %xmm0 1908; SSE-NEXT: retq 1909; 1910; AVX-LABEL: combine_test4b: 1911; AVX: # BB#0: 1912; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[1,1,2,3] 1913; AVX-NEXT: retq 1914 %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3> 1915 %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 5, i32 5, i32 2, i32 7> 1916 ret <4 x float> %2 1917} 1918 1919 1920; Verify that we correctly fold shuffles even when we use illegal vector types. 
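; A hand-worked sketch of the fold exercised by combine_test1c below
; (illustrative only; @combine_test1c_sketch is not part of the original test
; and has no autogenerated CHECK lines): composing the <4 x i8> masks <0,5,2,7>
; and <0,1,6,3> element by element yields the single mask <0,5,6,7>, i.e. a
; plain blend of the two loaded vectors even though <4 x i8> is an illegal type.
define <4 x i8> @combine_test1c_sketch(<4 x i8> %A, <4 x i8> %B) {
  ; Equivalent single shuffle the DAG combiner should be able to form.
  %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
  ret <4 x i8> %1
}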
1921 1922define <4 x i8> @combine_test1c(<4 x i8>* %a, <4 x i8>* %b) { 1923; SSE2-LABEL: combine_test1c: 1924; SSE2: # BB#0: 1925; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero 1926; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] 1927; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 1928; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero 1929; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1930; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] 1931; SSE2-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] 1932; SSE2-NEXT: retq 1933; 1934; SSSE3-LABEL: combine_test1c: 1935; SSSE3: # BB#0: 1936; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero 1937; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] 1938; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 1939; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero 1940; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1941; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] 1942; SSSE3-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] 1943; SSSE3-NEXT: retq 1944; 1945; SSE41-LABEL: combine_test1c: 1946; SSE41: # BB#0: 1947; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero 1948; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero 1949; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7] 1950; SSE41-NEXT: retq 1951; 1952; AVX1-LABEL: combine_test1c: 1953; AVX1: # BB#0: 1954; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero 1955; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero 1956; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] 1957; AVX1-NEXT: retq 1958; 1959; AVX2-LABEL: combine_test1c: 1960; AVX2: # BB#0: 1961; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero 1962; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero 1963; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] 1964; AVX2-NEXT: retq 1965 %A = load <4 x i8>, <4 x i8>* %a 1966 %B = load <4 x i8>, <4 x i8>* %b 1967 %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> <i32 0, i32 5, i32 2, i32 7> 1968 %2 = shufflevector <4 x i8> %1, <4 x i8> %B, <4 x i32> <i32 0, i32 1, i32 6, i32 3> 1969 ret <4 x i8> %2 1970} 1971 1972define <4 x i8> @combine_test2c(<4 x i8>* %a, <4 x i8>* %b) { 1973; SSE2-LABEL: combine_test2c: 1974; SSE2: # BB#0: 1975; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero 1976; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1977; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] 1978; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero 1979; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] 1980; SSE2-NEXT: punpcklwd 
{{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 1981; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1982; SSE2-NEXT: retq 1983; 1984; SSSE3-LABEL: combine_test2c: 1985; SSSE3: # BB#0: 1986; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero 1987; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1988; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] 1989; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero 1990; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] 1991; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 1992; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1993; SSSE3-NEXT: retq 1994; 1995; SSE41-LABEL: combine_test2c: 1996; SSE41: # BB#0: 1997; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero 1998; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero 1999; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 2000; SSE41-NEXT: retq 2001; 2002; AVX-LABEL: combine_test2c: 2003; AVX: # BB#0: 2004; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero 2005; AVX-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero 2006; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 2007; AVX-NEXT: retq 2008 %A = load <4 x i8>, <4 x i8>* %a 2009 %B = load <4 x i8>, <4 x i8>* %b 2010 %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> <i32 0, i32 5, i32 1, i32 5> 2011 %2 = shufflevector <4 x i8> %1, <4 x i8> %B, <4 x i32> <i32 0, i32 2, i32 4, i32 1> 2012 ret <4 x i8> %2 2013} 2014 2015define <4 x i8> @combine_test3c(<4 x i8>* %a, <4 x i8>* %b) { 2016; SSE2-LABEL: combine_test3c: 2017; SSE2: # BB#0: 2018; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero 2019; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] 2020; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 2021; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero 2022; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 2023; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] 2024; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] 2025; SSE2-NEXT: retq 2026; 2027; SSSE3-LABEL: combine_test3c: 2028; SSSE3: # BB#0: 2029; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero 2030; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] 2031; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 2032; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero 2033; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 2034; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] 2035; SSSE3-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] 2036; SSSE3-NEXT: retq 2037; 2038; SSE41-LABEL: combine_test3c: 2039; SSE41: # BB#0: 2040; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = 
mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero 2041; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero 2042; SSE41-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] 2043; SSE41-NEXT: retq 2044; 2045; AVX-LABEL: combine_test3c: 2046; AVX: # BB#0: 2047; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero 2048; AVX-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero 2049; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1] 2050; AVX-NEXT: retq 2051 %A = load <4 x i8>, <4 x i8>* %a 2052 %B = load <4 x i8>, <4 x i8>* %b 2053 %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> <i32 2, i32 3, i32 5, i32 5> 2054 %2 = shufflevector <4 x i8> %1, <4 x i8> %B, <4 x i32> <i32 6, i32 7, i32 0, i32 1> 2055 ret <4 x i8> %2 2056} 2057 2058define <4 x i8> @combine_test4c(<4 x i8>* %a, <4 x i8>* %b) { 2059; SSE2-LABEL: combine_test4c: 2060; SSE2: # BB#0: 2061; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero 2062; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 2063; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] 2064; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero 2065; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] 2066; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 2067; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0] 2068; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] 2069; SSE2-NEXT: retq 2070; 2071; SSSE3-LABEL: combine_test4c: 2072; SSSE3: # BB#0: 2073; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero 2074; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 2075; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] 2076; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero 2077; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] 2078; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 2079; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0] 2080; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] 2081; SSSE3-NEXT: retq 2082; 2083; SSE41-LABEL: combine_test4c: 2084; SSE41: # BB#0: 2085; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero 2086; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero 2087; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7] 2088; SSE41-NEXT: retq 2089; 2090; AVX1-LABEL: combine_test4c: 2091; AVX1: # BB#0: 2092; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero 2093; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero 2094; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7] 2095; AVX1-NEXT: retq 2096; 2097; AVX2-LABEL: combine_test4c: 2098; AVX2: # BB#0: 2099; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm0 = 
mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero 2100; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero 2101; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] 2102; AVX2-NEXT: retq 2103 %A = load <4 x i8>, <4 x i8>* %a 2104 %B = load <4 x i8>, <4 x i8>* %b 2105 %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> <i32 4, i32 1, i32 6, i32 3> 2106 %2 = shufflevector <4 x i8> %1, <4 x i8> %B, <4 x i32> <i32 0, i32 1, i32 2, i32 7> 2107 ret <4 x i8> %2 2108} 2109 2110 2111; The following test cases are generated from this C++ code 2112; 2113;__m128 blend_01(__m128 a, __m128 b) 2114;{ 2115; __m128 s = a; 2116; s = _mm_blend_ps( s, b, 1<<0 ); 2117; s = _mm_blend_ps( s, b, 1<<1 ); 2118; return s; 2119;} 2120; 2121;__m128 blend_02(__m128 a, __m128 b) 2122;{ 2123; __m128 s = a; 2124; s = _mm_blend_ps( s, b, 1<<0 ); 2125; s = _mm_blend_ps( s, b, 1<<2 ); 2126; return s; 2127;} 2128; 2129;__m128 blend_123(__m128 a, __m128 b) 2130;{ 2131; __m128 s = a; 2132; s = _mm_blend_ps( s, b, 1<<1 ); 2133; s = _mm_blend_ps( s, b, 1<<2 ); 2134; s = _mm_blend_ps( s, b, 1<<3 ); 2135; return s; 2136;} 2137 2138; Ideally, we should collapse the following shuffles into a single one. 2139 2140define <4 x float> @combine_blend_01(<4 x float> %a, <4 x float> %b) { 2141; SSE2-LABEL: combine_blend_01: 2142; SSE2: # BB#0: 2143; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] 2144; SSE2-NEXT: retq 2145; 2146; SSSE3-LABEL: combine_blend_01: 2147; SSSE3: # BB#0: 2148; SSSE3-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] 2149; SSSE3-NEXT: retq 2150; 2151; SSE41-LABEL: combine_blend_01: 2152; SSE41: # BB#0: 2153; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1] 2154; SSE41-NEXT: retq 2155; 2156; AVX-LABEL: combine_blend_01: 2157; AVX: # BB#0: 2158; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1] 2159; AVX-NEXT: retq 2160 %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 undef, i32 2, i32 3> 2161 %shuffle6 = shufflevector <4 x float> %shuffle, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 3> 2162 ret <4 x float> %shuffle6 2163} 2164 2165define <4 x float> @combine_blend_02(<4 x float> %a, <4 x float> %b) { 2166; SSE2-LABEL: combine_blend_02: 2167; SSE2: # BB#0: 2168; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,3] 2169; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,1,3] 2170; SSE2-NEXT: movaps %xmm1, %xmm0 2171; SSE2-NEXT: retq 2172; 2173; SSSE3-LABEL: combine_blend_02: 2174; SSSE3: # BB#0: 2175; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,3] 2176; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,1,3] 2177; SSSE3-NEXT: movaps %xmm1, %xmm0 2178; SSSE3-NEXT: retq 2179; 2180; SSE41-LABEL: combine_blend_02: 2181; SSE41: # BB#0: 2182; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3] 2183; SSE41-NEXT: retq 2184; 2185; AVX-LABEL: combine_blend_02: 2186; AVX: # BB#0: 2187; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3] 2188; AVX-NEXT: retq 2189 %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 undef, i32 3> 2190 %shuffle6 = shufflevector <4 x float> %shuffle, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 3> 2191 ret <4 x float> %shuffle6 2192} 2193 2194define <4 x float> @combine_blend_123(<4 x float> %a, <4 x float> %b) { 2195; SSE2-LABEL: combine_blend_123: 2196; SSE2: # BB#0: 2197; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] 2198; SSE2-NEXT: movaps %xmm1, %xmm0 2199; 
SSE2-NEXT: retq 2200; 2201; SSSE3-LABEL: combine_blend_123: 2202; SSSE3: # BB#0: 2203; SSSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] 2204; SSSE3-NEXT: movaps %xmm1, %xmm0 2205; SSSE3-NEXT: retq 2206; 2207; SSE41-LABEL: combine_blend_123: 2208; SSE41: # BB#0: 2209; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] 2210; SSE41-NEXT: retq 2211; 2212; AVX-LABEL: combine_blend_123: 2213; AVX: # BB#0: 2214; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] 2215; AVX-NEXT: retq 2216 %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 undef, i32 undef> 2217 %shuffle6 = shufflevector <4 x float> %shuffle, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 undef> 2218 %shuffle12 = shufflevector <4 x float> %shuffle6, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 7> 2219 ret <4 x float> %shuffle12 2220} 2221 2222define <4 x i32> @combine_test_movhl_1(<4 x i32> %a, <4 x i32> %b) { 2223; SSE-LABEL: combine_test_movhl_1: 2224; SSE: # BB#0: 2225; SSE-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm0[1] 2226; SSE-NEXT: movdqa %xmm1, %xmm0 2227; SSE-NEXT: retq 2228; 2229; AVX-LABEL: combine_test_movhl_1: 2230; AVX: # BB#0: 2231; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1] 2232; AVX-NEXT: retq 2233 %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 7, i32 5, i32 3> 2234 %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 6, i32 1, i32 0, i32 3> 2235 ret <4 x i32> %2 2236} 2237 2238define <4 x i32> @combine_test_movhl_2(<4 x i32> %a, <4 x i32> %b) { 2239; SSE-LABEL: combine_test_movhl_2: 2240; SSE: # BB#0: 2241; SSE-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm0[1] 2242; SSE-NEXT: movdqa %xmm1, %xmm0 2243; SSE-NEXT: retq 2244; 2245; AVX-LABEL: combine_test_movhl_2: 2246; AVX: # BB#0: 2247; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1] 2248; AVX-NEXT: retq 2249 %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 0, i32 3, i32 6> 2250 %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 3, i32 7, i32 0, i32 2> 2251 ret <4 x i32> %2 2252} 2253 2254define <4 x i32> @combine_test_movhl_3(<4 x i32> %a, <4 x i32> %b) { 2255; SSE-LABEL: combine_test_movhl_3: 2256; SSE: # BB#0: 2257; SSE-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm0[1] 2258; SSE-NEXT: movdqa %xmm1, %xmm0 2259; SSE-NEXT: retq 2260; 2261; AVX-LABEL: combine_test_movhl_3: 2262; AVX: # BB#0: 2263; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1] 2264; AVX-NEXT: retq 2265 %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 7, i32 6, i32 3, i32 2> 2266 %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 6, i32 0, i32 3, i32 2> 2267 ret <4 x i32> %2 2268} 2269 2270 2271; Verify that we fold shuffles according to rule: 2272; (shuffle(shuffle A, Undef, M0), B, M1) -> (shuffle A, B, M2) 2273 2274define <4 x float> @combine_undef_input_test1(<4 x float> %a, <4 x float> %b) { 2275; SSE2-LABEL: combine_undef_input_test1: 2276; SSE2: # BB#0: 2277; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] 2278; SSE2-NEXT: retq 2279; 2280; SSSE3-LABEL: combine_undef_input_test1: 2281; SSSE3: # BB#0: 2282; SSSE3-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] 2283; SSSE3-NEXT: retq 2284; 2285; SSE41-LABEL: combine_undef_input_test1: 2286; SSE41: # BB#0: 2287; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1] 2288; SSE41-NEXT: retq 2289; 2290; AVX-LABEL: combine_undef_input_test1: 2291; AVX: # BB#0: 2292; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1] 2293; AVX-NEXT: retq 2294 %1 = shufflevector <4 x float> %a, <4 x float> 
undef, <4 x i32> <i32 4, i32 2, i32 3, i32 1> 2295 %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 4, i32 5, i32 1, i32 2> 2296 ret <4 x float> %2 2297} 2298 2299define <4 x float> @combine_undef_input_test2(<4 x float> %a, <4 x float> %b) { 2300; SSE-LABEL: combine_undef_input_test2: 2301; SSE: # BB#0: 2302; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] 2303; SSE-NEXT: retq 2304; 2305; AVX-LABEL: combine_undef_input_test2: 2306; AVX: # BB#0: 2307; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] 2308; AVX-NEXT: retq 2309 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 6, i32 0, i32 1, i32 7> 2310 %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 1, i32 2, i32 4, i32 5> 2311 ret <4 x float> %2 2312} 2313 2314define <4 x float> @combine_undef_input_test3(<4 x float> %a, <4 x float> %b) { 2315; SSE-LABEL: combine_undef_input_test3: 2316; SSE: # BB#0: 2317; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] 2318; SSE-NEXT: retq 2319; 2320; AVX-LABEL: combine_undef_input_test3: 2321; AVX: # BB#0: 2322; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] 2323; AVX-NEXT: retq 2324 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 5, i32 1, i32 7> 2325 %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 1> 2326 ret <4 x float> %2 2327} 2328 2329define <4 x float> @combine_undef_input_test4(<4 x float> %a, <4 x float> %b) { 2330; SSE-LABEL: combine_undef_input_test4: 2331; SSE: # BB#0: 2332; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] 2333; SSE-NEXT: movapd %xmm1, %xmm0 2334; SSE-NEXT: retq 2335; 2336; AVX-LABEL: combine_undef_input_test4: 2337; AVX: # BB#0: 2338; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] 2339; AVX-NEXT: retq 2340 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 5, i32 5> 2341 %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1> 2342 ret <4 x float> %2 2343} 2344 2345define <4 x float> @combine_undef_input_test5(<4 x float> %a, <4 x float> %b) { 2346; SSE2-LABEL: combine_undef_input_test5: 2347; SSE2: # BB#0: 2348; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] 2349; SSE2-NEXT: movapd %xmm1, %xmm0 2350; SSE2-NEXT: retq 2351; 2352; SSSE3-LABEL: combine_undef_input_test5: 2353; SSSE3: # BB#0: 2354; SSSE3-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] 2355; SSSE3-NEXT: movapd %xmm1, %xmm0 2356; SSSE3-NEXT: retq 2357; 2358; SSE41-LABEL: combine_undef_input_test5: 2359; SSE41: # BB#0: 2360; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1] 2361; SSE41-NEXT: retq 2362; 2363; AVX-LABEL: combine_undef_input_test5: 2364; AVX: # BB#0: 2365; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1] 2366; AVX-NEXT: retq 2367 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 3> 2368 %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 2, i32 6, i32 7> 2369 ret <4 x float> %2 2370} 2371 2372 2373; Verify that we fold shuffles according to rule: 2374; (shuffle(shuffle A, Undef, M0), A, M1) -> (shuffle A, Undef, M2) 2375 2376define <4 x float> @combine_undef_input_test6(<4 x float> %a) { 2377; ALL-LABEL: combine_undef_input_test6: 2378; ALL: # BB#0: 2379; ALL-NEXT: retq 2380 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 4, i32 2, i32 3, i32 1> 2381 %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 4, i32 5, i32 1, i32 2> 2382 ret <4 x float> %2 2383} 2384 2385define <4 x float> 
@combine_undef_input_test7(<4 x float> %a) { 2386; SSE2-LABEL: combine_undef_input_test7: 2387; SSE2: # BB#0: 2388; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0,0] 2389; SSE2-NEXT: retq 2390; 2391; SSSE3-LABEL: combine_undef_input_test7: 2392; SSSE3: # BB#0: 2393; SSSE3-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0] 2394; SSSE3-NEXT: retq 2395; 2396; SSE41-LABEL: combine_undef_input_test7: 2397; SSE41: # BB#0: 2398; SSE41-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0] 2399; SSE41-NEXT: retq 2400; 2401; AVX-LABEL: combine_undef_input_test7: 2402; AVX: # BB#0: 2403; AVX-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] 2404; AVX-NEXT: retq 2405 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 6, i32 0, i32 1, i32 7> 2406 %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 1, i32 2, i32 4, i32 5> 2407 ret <4 x float> %2 2408} 2409 2410define <4 x float> @combine_undef_input_test8(<4 x float> %a) { 2411; SSE2-LABEL: combine_undef_input_test8: 2412; SSE2: # BB#0: 2413; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0,0] 2414; SSE2-NEXT: retq 2415; 2416; SSSE3-LABEL: combine_undef_input_test8: 2417; SSSE3: # BB#0: 2418; SSSE3-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0] 2419; SSSE3-NEXT: retq 2420; 2421; SSE41-LABEL: combine_undef_input_test8: 2422; SSE41: # BB#0: 2423; SSE41-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0] 2424; SSE41-NEXT: retq 2425; 2426; AVX-LABEL: combine_undef_input_test8: 2427; AVX: # BB#0: 2428; AVX-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] 2429; AVX-NEXT: retq 2430 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 5, i32 1, i32 7> 2431 %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 2, i32 4, i32 1> 2432 ret <4 x float> %2 2433} 2434 2435define <4 x float> @combine_undef_input_test9(<4 x float> %a) { 2436; SSE-LABEL: combine_undef_input_test9: 2437; SSE: # BB#0: 2438; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] 2439; SSE-NEXT: retq 2440; 2441; AVX-LABEL: combine_undef_input_test9: 2442; AVX: # BB#0: 2443; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,1] 2444; AVX-NEXT: retq 2445 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 5, i32 5> 2446 %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 6, i32 7, i32 0, i32 1> 2447 ret <4 x float> %2 2448} 2449 2450define <4 x float> @combine_undef_input_test10(<4 x float> %a) { 2451; ALL-LABEL: combine_undef_input_test10: 2452; ALL: # BB#0: 2453; ALL-NEXT: retq 2454 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 3> 2455 %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 2, i32 6, i32 7> 2456 ret <4 x float> %2 2457} 2458 2459define <4 x float> @combine_undef_input_test11(<4 x float> %a, <4 x float> %b) { 2460; SSE2-LABEL: combine_undef_input_test11: 2461; SSE2: # BB#0: 2462; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] 2463; SSE2-NEXT: retq 2464; 2465; SSSE3-LABEL: combine_undef_input_test11: 2466; SSSE3: # BB#0: 2467; SSSE3-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] 2468; SSSE3-NEXT: retq 2469; 2470; SSE41-LABEL: combine_undef_input_test11: 2471; SSE41: # BB#0: 2472; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1] 2473; SSE41-NEXT: retq 2474; 2475; AVX-LABEL: combine_undef_input_test11: 2476; AVX: # BB#0: 2477; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1] 2478; AVX-NEXT: retq 2479 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 4, i32 2, i32 3, i32 1> 2480 %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> <i32 0, i32 1, i32 5, i32 6> 2481 
ret <4 x float> %2 2482} 2483 2484define <4 x float> @combine_undef_input_test12(<4 x float> %a, <4 x float> %b) { 2485; SSE-LABEL: combine_undef_input_test12: 2486; SSE: # BB#0: 2487; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] 2488; SSE-NEXT: retq 2489; 2490; AVX-LABEL: combine_undef_input_test12: 2491; AVX: # BB#0: 2492; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] 2493; AVX-NEXT: retq 2494 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 6, i32 0, i32 1, i32 7> 2495 %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> <i32 5, i32 6, i32 0, i32 1> 2496 ret <4 x float> %2 2497} 2498 2499define <4 x float> @combine_undef_input_test13(<4 x float> %a, <4 x float> %b) { 2500; SSE-LABEL: combine_undef_input_test13: 2501; SSE: # BB#0: 2502; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] 2503; SSE-NEXT: retq 2504; 2505; AVX-LABEL: combine_undef_input_test13: 2506; AVX: # BB#0: 2507; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] 2508; AVX-NEXT: retq 2509 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 5, i32 1, i32 7> 2510 %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> <i32 4, i32 5, i32 0, i32 5> 2511 ret <4 x float> %2 2512} 2513 2514define <4 x float> @combine_undef_input_test14(<4 x float> %a, <4 x float> %b) { 2515; SSE-LABEL: combine_undef_input_test14: 2516; SSE: # BB#0: 2517; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] 2518; SSE-NEXT: movapd %xmm1, %xmm0 2519; SSE-NEXT: retq 2520; 2521; AVX-LABEL: combine_undef_input_test14: 2522; AVX: # BB#0: 2523; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] 2524; AVX-NEXT: retq 2525 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 5, i32 5> 2526 %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> <i32 2, i32 3, i32 4, i32 5> 2527 ret <4 x float> %2 2528} 2529 2530define <4 x float> @combine_undef_input_test15(<4 x float> %a, <4 x float> %b) { 2531; SSE2-LABEL: combine_undef_input_test15: 2532; SSE2: # BB#0: 2533; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] 2534; SSE2-NEXT: movapd %xmm1, %xmm0 2535; SSE2-NEXT: retq 2536; 2537; SSSE3-LABEL: combine_undef_input_test15: 2538; SSSE3: # BB#0: 2539; SSSE3-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] 2540; SSSE3-NEXT: movapd %xmm1, %xmm0 2541; SSSE3-NEXT: retq 2542; 2543; SSE41-LABEL: combine_undef_input_test15: 2544; SSE41: # BB#0: 2545; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1] 2546; SSE41-NEXT: retq 2547; 2548; AVX-LABEL: combine_undef_input_test15: 2549; AVX: # BB#0: 2550; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1] 2551; AVX-NEXT: retq 2552 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 3> 2553 %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> <i32 4, i32 6, i32 2, i32 3> 2554 ret <4 x float> %2 2555} 2556 2557 2558; Verify that shuffles are canonicalized according to rules: 2559; shuffle(B, shuffle(A, Undef)) -> shuffle(shuffle(A, Undef), B) 2560; 2561; This allows to trigger the following combine rule: 2562; (shuffle(shuffle A, Undef, M0), A, M1) -> (shuffle A, Undef, M2) 2563; 2564; As a result, all the shuffle pairs in each function below should be 2565; combined into a single legal shuffle operation. 
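; A hand-worked sketch of the canonicalization above applied to
; combine_undef_input_test16 below (illustrative only; the _sketch function is
; not part of the original test and has no autogenerated CHECK lines): the inner
; shuffle produces <undef, a2, a3, a1>, and the outer mask <0,1,5,3> then
; selects <a0, a1, a2, a3>, i.e. the identity shuffle, which is why the pair
; lowers to a plain return.
define <4 x float> @combine_undef_input_test16_sketch(<4 x float> %a) {
  ; Composed mask of the two shuffles in combine_undef_input_test16: identity.
  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x float> %1
}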
2566 2567define <4 x float> @combine_undef_input_test16(<4 x float> %a) { 2568; ALL-LABEL: combine_undef_input_test16: 2569; ALL: # BB#0: 2570; ALL-NEXT: retq 2571 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 4, i32 2, i32 3, i32 1> 2572 %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> <i32 0, i32 1, i32 5, i32 3> 2573 ret <4 x float> %2 2574} 2575 2576define <4 x float> @combine_undef_input_test17(<4 x float> %a) { 2577; SSE2-LABEL: combine_undef_input_test17: 2578; SSE2: # BB#0: 2579; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0,0] 2580; SSE2-NEXT: retq 2581; 2582; SSSE3-LABEL: combine_undef_input_test17: 2583; SSSE3: # BB#0: 2584; SSSE3-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0] 2585; SSSE3-NEXT: retq 2586; 2587; SSE41-LABEL: combine_undef_input_test17: 2588; SSE41: # BB#0: 2589; SSE41-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0] 2590; SSE41-NEXT: retq 2591; 2592; AVX-LABEL: combine_undef_input_test17: 2593; AVX: # BB#0: 2594; AVX-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] 2595; AVX-NEXT: retq 2596 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 6, i32 0, i32 1, i32 7> 2597 %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> <i32 5, i32 6, i32 0, i32 1> 2598 ret <4 x float> %2 2599} 2600 2601define <4 x float> @combine_undef_input_test18(<4 x float> %a) { 2602; SSE2-LABEL: combine_undef_input_test18: 2603; SSE2: # BB#0: 2604; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0,0] 2605; SSE2-NEXT: retq 2606; 2607; SSSE3-LABEL: combine_undef_input_test18: 2608; SSSE3: # BB#0: 2609; SSSE3-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0] 2610; SSSE3-NEXT: retq 2611; 2612; SSE41-LABEL: combine_undef_input_test18: 2613; SSE41: # BB#0: 2614; SSE41-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0] 2615; SSE41-NEXT: retq 2616; 2617; AVX-LABEL: combine_undef_input_test18: 2618; AVX: # BB#0: 2619; AVX-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] 2620; AVX-NEXT: retq 2621 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 5, i32 1, i32 7> 2622 %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> <i32 4, i32 6, i32 0, i32 5> 2623 ret <4 x float> %2 2624} 2625 2626define <4 x float> @combine_undef_input_test19(<4 x float> %a) { 2627; SSE-LABEL: combine_undef_input_test19: 2628; SSE: # BB#0: 2629; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] 2630; SSE-NEXT: retq 2631; 2632; AVX-LABEL: combine_undef_input_test19: 2633; AVX: # BB#0: 2634; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,1] 2635; AVX-NEXT: retq 2636 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 5, i32 5> 2637 %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> <i32 2, i32 3, i32 4, i32 5> 2638 ret <4 x float> %2 2639} 2640 2641define <4 x float> @combine_undef_input_test20(<4 x float> %a) { 2642; ALL-LABEL: combine_undef_input_test20: 2643; ALL: # BB#0: 2644; ALL-NEXT: retq 2645 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 3> 2646 %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> <i32 4, i32 6, i32 2, i32 3> 2647 ret <4 x float> %2 2648} 2649 2650; These tests are designed to test the ability to combine away unnecessary 2651; operations feeding into a shuffle. The AVX cases are the important ones as 2652; they leverage operations which cannot be done naturally on the entire vector 2653; and thus are decomposed into multiple smaller operations. 
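; A hand-worked sketch of what combine_unneeded_subvector1 below is equivalent
; to (derived by hand; @combine_unneeded_subvector1_sketch is not part of the
; original test, has no autogenerated CHECK lines, and is not necessarily the
; exact DAG the combiner forms): the mask <7,6,5,4,7,6,5,4> only reads lanes
; 4-7, so the add on the low 128-bit half of %a is unnecessary and can be
; dropped, as the SSE lowering shows by adding only to %xmm1.
define <8 x i32> @combine_unneeded_subvector1_sketch(<8 x i32> %a) {
  ; Shuffle first, then add only the constants the surviving lanes need.
  %hi = shufflevector <8 x i32> %a, <8 x i32> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 7, i32 6, i32 5, i32 4>
  %b = add <8 x i32> %hi, <i32 8, i32 7, i32 6, i32 5, i32 8, i32 7, i32 6, i32 5>
  ret <8 x i32> %b
}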
2654 2655define <8 x i32> @combine_unneeded_subvector1(<8 x i32> %a) { 2656; SSE-LABEL: combine_unneeded_subvector1: 2657; SSE: # BB#0: 2658; SSE-NEXT: paddd {{.*}}(%rip), %xmm1 2659; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,2,1,0] 2660; SSE-NEXT: movdqa %xmm0, %xmm1 2661; SSE-NEXT: retq 2662; 2663; AVX1-LABEL: combine_unneeded_subvector1: 2664; AVX1: # BB#0: 2665; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 2666; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0 2667; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 2668; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] 2669; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] 2670; AVX1-NEXT: retq 2671; 2672; AVX2-LABEL: combine_unneeded_subvector1: 2673; AVX2: # BB#0: 2674; AVX2-NEXT: vpaddd {{.*}}(%rip), %ymm0, %ymm0 2675; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] 2676; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3,2,3] 2677; AVX2-NEXT: retq 2678 %b = add <8 x i32> %a, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8> 2679 %c = shufflevector <8 x i32> %b, <8 x i32> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 7, i32 6, i32 5, i32 4> 2680 ret <8 x i32> %c 2681} 2682 2683define <8 x i32> @combine_unneeded_subvector2(<8 x i32> %a, <8 x i32> %b) { 2684; SSE-LABEL: combine_unneeded_subvector2: 2685; SSE: # BB#0: 2686; SSE-NEXT: paddd {{.*}}(%rip), %xmm1 2687; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[3,2,1,0] 2688; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,2,1,0] 2689; SSE-NEXT: retq 2690; 2691; AVX1-LABEL: combine_unneeded_subvector2: 2692; AVX1: # BB#0: 2693; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 2694; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0 2695; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 2696; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] 2697; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] 2698; AVX1-NEXT: retq 2699; 2700; AVX2-LABEL: combine_unneeded_subvector2: 2701; AVX2: # BB#0: 2702; AVX2-NEXT: vpaddd {{.*}}(%rip), %ymm0, %ymm0 2703; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] 2704; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] 2705; AVX2-NEXT: retq 2706 %c = add <8 x i32> %a, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8> 2707 %d = shufflevector <8 x i32> %b, <8 x i32> %c, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 15, i32 14, i32 13, i32 12> 2708 ret <8 x i32> %d 2709} 2710 2711define <4 x float> @combine_insertps1(<4 x float> %a, <4 x float> %b) { 2712; SSE2-LABEL: combine_insertps1: 2713; SSE2: # BB#0: 2714; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[1,0] 2715; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[2,3] 2716; SSE2-NEXT: movaps %xmm1, %xmm0 2717; SSE2-NEXT: retq 2718; 2719; SSSE3-LABEL: combine_insertps1: 2720; SSSE3: # BB#0: 2721; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[1,0] 2722; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[2,3] 2723; SSSE3-NEXT: movaps %xmm1, %xmm0 2724; SSSE3-NEXT: retq 2725; 2726; SSE41-LABEL: combine_insertps1: 2727; SSE41: # BB#0: 2728; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm1[2],xmm0[1,2,3] 2729; SSE41-NEXT: retq 2730; 2731; AVX-LABEL: combine_insertps1: 2732; AVX: # BB#0: 2733; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[2],xmm0[1,2,3] 2734; AVX-NEXT: retq 2735 2736 %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32><i32 0, i32 6, i32 2, i32 4> 2737 %d = shufflevector <4 x float> %a, <4 x float> %c, <4 x i32> <i32 5, i32 1, i32 6, i32 3> 2738 ret <4 x float> %d 2739} 2740 2741define <4 x float> @combine_insertps2(<4 x float> %a, <4 x float> %b) { 2742; SSE2-LABEL: 
combine_insertps2: 2743; SSE2: # BB#0: 2744; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[0,0] 2745; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3] 2746; SSE2-NEXT: movaps %xmm1, %xmm0 2747; SSE2-NEXT: retq 2748; 2749; SSSE3-LABEL: combine_insertps2: 2750; SSSE3: # BB#0: 2751; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[0,0] 2752; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3] 2753; SSSE3-NEXT: movaps %xmm1, %xmm0 2754; SSSE3-NEXT: retq 2755; 2756; SSE41-LABEL: combine_insertps2: 2757; SSE41: # BB#0: 2758; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm1[2],xmm0[2,3] 2759; SSE41-NEXT: retq 2760; 2761; AVX-LABEL: combine_insertps2: 2762; AVX: # BB#0: 2763; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[2],xmm0[2,3] 2764; AVX-NEXT: retq 2765 2766 %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32><i32 0, i32 1, i32 6, i32 7> 2767 %d = shufflevector <4 x float> %a, <4 x float> %c, <4 x i32> <i32 4, i32 6, i32 2, i32 3> 2768 ret <4 x float> %d 2769} 2770 2771define <4 x float> @combine_insertps3(<4 x float> %a, <4 x float> %b) { 2772; SSE2-LABEL: combine_insertps3: 2773; SSE2: # BB#0: 2774; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0] 2775; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2] 2776; SSE2-NEXT: retq 2777; 2778; SSSE3-LABEL: combine_insertps3: 2779; SSSE3: # BB#0: 2780; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0] 2781; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2] 2782; SSSE3-NEXT: retq 2783; 2784; SSE41-LABEL: combine_insertps3: 2785; SSE41: # BB#0: 2786; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3] 2787; SSE41-NEXT: retq 2788; 2789; AVX-LABEL: combine_insertps3: 2790; AVX: # BB#0: 2791; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3] 2792; AVX-NEXT: retq 2793 2794 %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32><i32 0, i32 4, i32 2, i32 5> 2795 %d = shufflevector <4 x float> %a, <4 x float> %c, <4 x i32><i32 4, i32 1, i32 5, i32 3> 2796 ret <4 x float> %d 2797} 2798 2799define <4 x float> @combine_insertps4(<4 x float> %a, <4 x float> %b) { 2800; SSE2-LABEL: combine_insertps4: 2801; SSE2: # BB#0: 2802; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0] 2803; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0] 2804; SSE2-NEXT: retq 2805; 2806; SSSE3-LABEL: combine_insertps4: 2807; SSSE3: # BB#0: 2808; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0] 2809; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0] 2810; SSSE3-NEXT: retq 2811; 2812; SSE41-LABEL: combine_insertps4: 2813; SSE41: # BB#0: 2814; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] 2815; SSE41-NEXT: retq 2816; 2817; AVX-LABEL: combine_insertps4: 2818; AVX: # BB#0: 2819; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] 2820; AVX-NEXT: retq 2821 2822 %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32><i32 0, i32 4, i32 2, i32 5> 2823 %d = shufflevector <4 x float> %a, <4 x float> %c, <4 x i32><i32 4, i32 1, i32 6, i32 5> 2824 ret <4 x float> %d 2825} 2826 2827; FIXME: Failed to recognise that the VMOVSD has already zero'd the upper element 2828define void @combine_scalar_load_with_blend_with_zero(double* %a0, <4 x float>* %a1) { 2829; SSE2-LABEL: combine_scalar_load_with_blend_with_zero: 2830; SSE2: # BB#0: 2831; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero 2832; SSE2-NEXT: xorps %xmm1, %xmm1 2833; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0] 2834; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2] 2835; SSE2-NEXT: movaps %xmm0, (%rsi) 2836; SSE2-NEXT: retq 
2837; 2838; SSSE3-LABEL: combine_scalar_load_with_blend_with_zero: 2839; SSSE3: # BB#0: 2840; SSSE3-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero 2841; SSSE3-NEXT: xorps %xmm1, %xmm1 2842; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0] 2843; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2] 2844; SSSE3-NEXT: movaps %xmm0, (%rsi) 2845; SSSE3-NEXT: retq 2846; 2847; SSE41-LABEL: combine_scalar_load_with_blend_with_zero: 2848; SSE41: # BB#0: 2849; SSE41-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero 2850; SSE41-NEXT: xorpd %xmm1, %xmm1 2851; SSE41-NEXT: blendpd {{.*#+}} xmm1 = xmm0[0],xmm1[1] 2852; SSE41-NEXT: movapd %xmm1, (%rsi) 2853; SSE41-NEXT: retq 2854; 2855; AVX-LABEL: combine_scalar_load_with_blend_with_zero: 2856; AVX: # BB#0: 2857; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero 2858; AVX-NEXT: vxorpd %xmm1, %xmm1, %xmm1 2859; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1] 2860; AVX-NEXT: vmovapd %xmm0, (%rsi) 2861; AVX-NEXT: retq 2862 %1 = load double, double* %a0, align 8 2863 %2 = insertelement <2 x double> undef, double %1, i32 0 2864 %3 = insertelement <2 x double> %2, double 0.000000e+00, i32 1 2865 %4 = bitcast <2 x double> %3 to <4 x float> 2866 %5 = shufflevector <4 x float> %4, <4 x float> <float 0.000000e+00, float undef, float undef, float undef>, <4 x i32> <i32 0, i32 1, i32 4, i32 3> 2867 store <4 x float> %5, <4 x float>* %a1, align 16 2868 ret void 2869} 2870 2871define <4 x float> @PR22377(<4 x float> %a, <4 x float> %b) { 2872; SSE-LABEL: PR22377: 2873; SSE: # BB#0: # %entry 2874; SSE-NEXT: movaps %xmm0, %xmm1 2875; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3,1,3] 2876; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,0,2] 2877; SSE-NEXT: addps %xmm0, %xmm1 2878; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 2879; SSE-NEXT: retq 2880; 2881; AVX-LABEL: PR22377: 2882; AVX: # BB#0: # %entry 2883; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,3,1,3] 2884; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,0,2] 2885; AVX-NEXT: vaddps %xmm0, %xmm1, %xmm1 2886; AVX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 2887; AVX-NEXT: retq 2888entry: 2889 %s1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 1, i32 3> 2890 %s2 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 2, i32 0, i32 2> 2891 %r2 = fadd <4 x float> %s1, %s2 2892 %s3 = shufflevector <4 x float> %s2, <4 x float> %r2, <4 x i32> <i32 0, i32 4, i32 1, i32 5> 2893 ret <4 x float> %s3 2894} 2895 2896define <4 x float> @PR22390(<4 x float> %a, <4 x float> %b) { 2897; SSE2-LABEL: PR22390: 2898; SSE2: # BB#0: # %entry 2899; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0,1,2] 2900; SSE2-NEXT: movaps %xmm0, %xmm2 2901; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] 2902; SSE2-NEXT: addps %xmm0, %xmm2 2903; SSE2-NEXT: movaps %xmm2, %xmm0 2904; SSE2-NEXT: retq 2905; 2906; SSSE3-LABEL: PR22390: 2907; SSSE3: # BB#0: # %entry 2908; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0,1,2] 2909; SSSE3-NEXT: movaps %xmm0, %xmm2 2910; SSSE3-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] 2911; SSSE3-NEXT: addps %xmm0, %xmm2 2912; SSSE3-NEXT: movaps %xmm2, %xmm0 2913; SSSE3-NEXT: retq 2914; 2915; SSE41-LABEL: PR22390: 2916; SSE41: # BB#0: # %entry 2917; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0,1,2] 2918; SSE41-NEXT: blendps {{.*#+}} xmm1 = xmm1[0],xmm0[1,2,3] 2919; SSE41-NEXT: addps %xmm1, %xmm0 2920; SSE41-NEXT: retq 2921; 2922; AVX-LABEL: PR22390: 2923; AVX: # BB#0: # %entry 2924; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,0,1,2] 2925; AVX-NEXT: vblendps 
{{.*#+}} xmm1 = xmm1[0],xmm0[1,2,3] 2926; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0 2927; AVX-NEXT: retq 2928entry: 2929 %s1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 3, i32 0, i32 1, i32 2> 2930 %s2 = shufflevector <4 x float> %s1, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 2, i32 3> 2931 %r2 = fadd <4 x float> %s1, %s2 2932 ret <4 x float> %r2 2933} 2934 2935define <8 x float> @PR22412(<8 x float> %a, <8 x float> %b) { 2936; SSE2-LABEL: PR22412: 2937; SSE2: # BB#0: # %entry 2938; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] 2939; SSE2-NEXT: movapd %xmm2, %xmm0 2940; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm3[3,2] 2941; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm2[3,2] 2942; SSE2-NEXT: movaps %xmm3, %xmm1 2943; SSE2-NEXT: retq 2944; 2945; SSSE3-LABEL: PR22412: 2946; SSSE3: # BB#0: # %entry 2947; SSSE3-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] 2948; SSSE3-NEXT: movapd %xmm2, %xmm0 2949; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm3[3,2] 2950; SSSE3-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm2[3,2] 2951; SSSE3-NEXT: movaps %xmm3, %xmm1 2952; SSSE3-NEXT: retq 2953; 2954; SSE41-LABEL: PR22412: 2955; SSE41: # BB#0: # %entry 2956; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm0[0],xmm2[1] 2957; SSE41-NEXT: movapd %xmm0, %xmm1 2958; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm3[3,2] 2959; SSE41-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm0[3,2] 2960; SSE41-NEXT: movaps %xmm1, %xmm0 2961; SSE41-NEXT: movaps %xmm3, %xmm1 2962; SSE41-NEXT: retq 2963; 2964; AVX1-LABEL: PR22412: 2965; AVX1: # BB#0: # %entry 2966; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3] 2967; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1] 2968; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,0],ymm1[3,2],ymm0[5,4],ymm1[7,6] 2969; AVX1-NEXT: retq 2970; 2971; AVX2-LABEL: PR22412: 2972; AVX2: # BB#0: # %entry 2973; AVX2-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3] 2974; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6] 2975; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,1] 2976; AVX2-NEXT: retq 2977entry: 2978 %s1 = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 1, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 2979 %s2 = shufflevector <8 x float> %s1, <8 x float> undef, <8 x i32> <i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2> 2980 ret <8 x float> %s2 2981} 2982