; RUN: llc < %s -mcpu=x86-64 -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
; RUN: llc < %s -mcpu=x86-64 -mattr=+ssse3 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSSE3
; RUN: llc < %s -mcpu=x86-64 -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
; RUN: llc < %s -mcpu=x86-64 -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
;
; Verify that the DAG combiner correctly folds bitwise operations across
; shuffles, nested shuffles with undef, pairs of nested shuffles, and other
; basic and always-safe patterns. Also test that the DAG combiner will combine
; target-specific shuffle instructions where reasonable.

target triple = "x86_64-unknown-unknown"

declare <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32>, i8)
declare <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16>, i8)
declare <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16>, i8)

define <4 x i32> @combine_pshufd1(<4 x i32> %a) {
; ALL-LABEL: combine_pshufd1:
; ALL: # BB#0: # %entry
; ALL-NEXT: retq
entry:
  %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 27)
  %c = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %b, i8 27)
  ret <4 x i32> %c
}

define <4 x i32> @combine_pshufd2(<4 x i32> %a) {
; ALL-LABEL: combine_pshufd2:
; ALL: # BB#0: # %entry
; ALL-NEXT: retq
entry:
  %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 27)
  %b.cast = bitcast <4 x i32> %b to <8 x i16>
  %c = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %b.cast, i8 -28)
  %c.cast = bitcast <8 x i16> %c to <4 x i32>
  %d = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %c.cast, i8 27)
  ret <4 x i32> %d
}

define <4 x i32> @combine_pshufd3(<4 x i32> %a) {
; ALL-LABEL: combine_pshufd3:
; ALL: # BB#0: # %entry
; ALL-NEXT: retq
entry:
  %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 27)
  %b.cast = bitcast <4 x i32> %b to <8 x i16>
  %c = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %b.cast, i8 -28)
  %c.cast = bitcast <8 x i16> %c to <4 x i32>
  %d = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %c.cast, i8 27)
  ret <4 x i32> %d
}

define <4 x i32> @combine_pshufd4(<4 x i32> %a) {
; SSE-LABEL: combine_pshufd4:
; SSE: # BB#0: # %entry
; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_pshufd4:
; AVX: # BB#0: # %entry
; AVX-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
; AVX-NEXT: retq
entry:
  %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 -31)
  %b.cast = bitcast <4 x i32> %b to <8 x i16>
  %c = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %b.cast, i8 27)
  %c.cast = bitcast <8 x i16> %c to <4 x i32>
  %d = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %c.cast, i8 -31)
  ret <4 x i32> %d
}

define <4 x i32> @combine_pshufd5(<4 x i32> %a) {
; SSE-LABEL: combine_pshufd5:
; SSE: # BB#0: # %entry
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_pshufd5:
; AVX: # BB#0: # %entry
; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
; AVX-NEXT: retq
entry:
  %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 -76)
  %b.cast = bitcast <4 x i32> %b to <8 x i16>
  %c = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %b.cast, i8 27)
  %c.cast = bitcast <8 x i16> %c to <4 x i32>
  %d = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %c.cast, i8 -76)
  ret <4 x i32> %d
}

define <4 x i32> @combine_pshufd6(<4 x i32> %a) {
; SSE-LABEL: combine_pshufd6:
; SSE: # BB#0: # %entry
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_pshufd6:
; AVX: # BB#0: # %entry
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX-NEXT: retq
entry:
  %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 0)
  %c = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %b, i8 8)
  ret <4 x i32> %c
}

define <8 x i16> @combine_pshuflw1(<8 x i16> %a) {
; ALL-LABEL: combine_pshuflw1:
; ALL: # BB#0: # %entry
; ALL-NEXT: retq
entry:
  %b = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %a, i8 27)
  %c = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %b, i8 27)
  ret <8 x i16> %c
}

define <8 x i16> @combine_pshuflw2(<8 x i16> %a) {
; ALL-LABEL: combine_pshuflw2:
; ALL: # BB#0: # %entry
; ALL-NEXT: retq
entry:
  %b = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %a, i8 27)
  %c = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %b, i8 -28)
  %d = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %c, i8 27)
  ret <8 x i16> %d
}

define <8 x i16> @combine_pshuflw3(<8 x i16> %a) {
; SSE-LABEL: combine_pshuflw3:
; SSE: # BB#0: # %entry
; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_pshuflw3:
; AVX: # BB#0: # %entry
; AVX-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
; AVX-NEXT: retq
entry:
  %b = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %a, i8 27)
  %c = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %b, i8 27)
  %d = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %c, i8 27)
  ret <8 x i16> %d
}

define <8 x i16> @combine_pshufhw1(<8 x i16> %a) {
; SSE-LABEL: combine_pshufhw1:
; SSE: # BB#0: # %entry
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_pshufhw1:
; AVX: # BB#0: # %entry
; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
; AVX-NEXT: retq
entry:
  %b = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %a, i8 27)
  %c = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %b, i8 27)
  %d = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %c, i8 27)
  ret <8 x i16> %d
}

define <4 x i32> @combine_bitwise_ops_test1(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE-LABEL: combine_bitwise_ops_test1:
; SSE: # BB#0:
; SSE-NEXT: pand %xmm1, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_bitwise_ops_test1:
; AVX: # BB#0:
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX-NEXT: retq
  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
  %and = and <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %and
}

define <4 x i32> @combine_bitwise_ops_test2(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE-LABEL: combine_bitwise_ops_test2:
; SSE: # BB#0:
; SSE-NEXT: por %xmm1, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_bitwise_ops_test2:
; AVX: # BB#0:
; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX-NEXT: retq
  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
  %or = or <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %or
}

define <4 x i32> @combine_bitwise_ops_test3(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE-LABEL: combine_bitwise_ops_test3:
; SSE: # BB#0:
; SSE-NEXT: pxor %xmm1, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_bitwise_ops_test3:
; AVX: # BB#0:
; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX-NEXT: retq
  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
  %xor = xor <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %xor
}

define <4 x i32> @combine_bitwise_ops_test4(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE-LABEL: combine_bitwise_ops_test4:
; SSE: # BB#0:
; SSE-NEXT: pand %xmm1, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_bitwise_ops_test4:
; AVX: # BB#0:
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX-NEXT: retq
  %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 4, i32 6, i32 5, i32 7>
  %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 4, i32 6, i32 5, i32 7>
  %and = and <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %and
}

define <4 x i32> @combine_bitwise_ops_test5(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE-LABEL: combine_bitwise_ops_test5:
; SSE: # BB#0:
; SSE-NEXT: por %xmm1, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_bitwise_ops_test5:
; AVX: # BB#0:
; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX-NEXT: retq
  %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 4, i32 6, i32 5, i32 7>
  %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 4, i32 6, i32 5, i32 7>
  %or = or <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %or
}

define <4 x i32> @combine_bitwise_ops_test6(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE-LABEL: combine_bitwise_ops_test6:
; SSE: # BB#0:
; SSE-NEXT: pxor %xmm1, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_bitwise_ops_test6:
; AVX: # BB#0:
; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX-NEXT: retq
  %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 4, i32 6, i32 5, i32 7>
  %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 4, i32 6, i32 5, i32 7>
  %xor = xor <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %xor
}


; Verify that DAGCombiner moves the shuffle after the xor/and/or even if shuffles
; are not performing a swizzle operations.

define <4 x i32> @combine_bitwise_ops_test1b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE2-LABEL: combine_bitwise_ops_test1b:
; SSE2: # BB#0:
; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: combine_bitwise_ops_test1b:
; SSSE3: # BB#0:
; SSSE3-NEXT: pand %xmm1, %xmm0
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: combine_bitwise_ops_test1b:
; SSE41: # BB#0:
; SSE41-NEXT: pand %xmm1, %xmm0
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: combine_bitwise_ops_test1b:
; AVX1: # BB#0:
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: combine_bitwise_ops_test1b:
; AVX2: # BB#0:
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
; AVX2-NEXT: retq
  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
  %and = and <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %and
}

define <4 x i32> @combine_bitwise_ops_test2b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE2-LABEL: combine_bitwise_ops_test2b:
; SSE2: # BB#0:
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: combine_bitwise_ops_test2b:
; SSSE3: # BB#0:
; SSSE3-NEXT: por %xmm1, %xmm0
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: combine_bitwise_ops_test2b:
; SSE41: # BB#0:
; SSE41-NEXT: por %xmm1, %xmm0
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: combine_bitwise_ops_test2b:
; AVX1: # BB#0:
; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: combine_bitwise_ops_test2b:
; AVX2: # BB#0:
; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
; AVX2-NEXT: retq
  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
  %or = or <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %or
}

define <4 x i32> @combine_bitwise_ops_test3b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE2-LABEL: combine_bitwise_ops_test3b:
; SSE2: # BB#0:
; SSE2-NEXT: xorps %xmm1, %xmm0
; SSE2-NEXT: andps {{.*}}(%rip), %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: combine_bitwise_ops_test3b:
; SSSE3: # BB#0:
; SSSE3-NEXT: xorps %xmm1, %xmm0
; SSSE3-NEXT: andps {{.*}}(%rip), %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: combine_bitwise_ops_test3b:
; SSE41: # BB#0:
; SSE41-NEXT: pxor %xmm1, %xmm0
; SSE41-NEXT: pxor %xmm1, %xmm1
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: combine_bitwise_ops_test3b:
; AVX1: # BB#0:
; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: combine_bitwise_ops_test3b:
; AVX2: # BB#0:
; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
; AVX2-NEXT: retq
  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
  %xor = xor <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %xor
}

define <4 x i32> @combine_bitwise_ops_test4b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE2-LABEL: combine_bitwise_ops_test4b:
; SSE2: # BB#0:
; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: combine_bitwise_ops_test4b:
; SSSE3: # BB#0:
; SSSE3-NEXT: pand %xmm1, %xmm0
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: combine_bitwise_ops_test4b:
; SSE41: # BB#0:
; SSE41-NEXT: pand %xmm1, %xmm0
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: combine_bitwise_ops_test4b:
; AVX1: # BB#0:
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: combine_bitwise_ops_test4b:
; AVX2: # BB#0:
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3]
; AVX2-NEXT: retq
  %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 5, i32 2, i32 7>
  %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 5, i32 2, i32 7>
  %and = and <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %and
}

define <4 x i32> @combine_bitwise_ops_test5b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE2-LABEL: combine_bitwise_ops_test5b:
; SSE2: # BB#0:
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: combine_bitwise_ops_test5b:
; SSSE3: # BB#0:
; SSSE3-NEXT: por %xmm1, %xmm0
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: combine_bitwise_ops_test5b:
; SSE41: # BB#0:
; SSE41-NEXT: por %xmm1, %xmm0
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: combine_bitwise_ops_test5b:
; AVX1: # BB#0:
; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: combine_bitwise_ops_test5b:
; AVX2: # BB#0:
; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3]
; AVX2-NEXT: retq
  %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 5, i32 2, i32 7>
  %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 5, i32 2, i32 7>
  %or = or <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %or
}

define <4 x i32> @combine_bitwise_ops_test6b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE2-LABEL: combine_bitwise_ops_test6b:
; SSE2: # BB#0:
; SSE2-NEXT: xorps %xmm1, %xmm0
; SSE2-NEXT: andps {{.*}}(%rip), %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: combine_bitwise_ops_test6b:
; SSSE3: # BB#0:
; SSSE3-NEXT: xorps %xmm1, %xmm0
; SSSE3-NEXT: andps {{.*}}(%rip), %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: combine_bitwise_ops_test6b:
; SSE41: # BB#0:
; SSE41-NEXT: pxor %xmm1, %xmm0
; SSE41-NEXT: pxor %xmm1, %xmm1
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: combine_bitwise_ops_test6b:
; AVX1: # BB#0:
; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: combine_bitwise_ops_test6b:
; AVX2: # BB#0:
; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
; AVX2-NEXT: retq
  %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 5, i32 2, i32 7>
  %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 5, i32 2, i32 7>
  %xor = xor <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %xor
}

define <4 x i32> @combine_bitwise_ops_test1c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE2-LABEL: combine_bitwise_ops_test1c:
; SSE2: # BB#0:
; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: combine_bitwise_ops_test1c:
; SSSE3: # BB#0:
; SSSE3-NEXT: pand %xmm1, %xmm0
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: combine_bitwise_ops_test1c:
; SSE41: # BB#0:
; SSE41-NEXT: pand %xmm1, %xmm0
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSE41-NEXT: retq
;
; AVX1-LABEL: combine_bitwise_ops_test1c:
; AVX1: # BB#0:
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX1-NEXT: retq
;
; AVX2-LABEL: combine_bitwise_ops_test1c:
; AVX2: # BB#0:
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT: retq
  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
  %and = and <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %and
}

define <4 x i32> @combine_bitwise_ops_test2c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE2-LABEL: combine_bitwise_ops_test2c:
; SSE2: # BB#0:
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: combine_bitwise_ops_test2c:
; SSSE3: # BB#0:
; SSSE3-NEXT: por %xmm1, %xmm0
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: combine_bitwise_ops_test2c:
; SSE41: # BB#0:
; SSE41-NEXT: por %xmm1, %xmm0
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSE41-NEXT: retq
;
; AVX1-LABEL: combine_bitwise_ops_test2c:
; AVX1: # BB#0:
; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX1-NEXT: retq
;
; AVX2-LABEL: combine_bitwise_ops_test2c:
; AVX2: # BB#0:
; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT: retq
  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
  %or = or <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %or
}

define <4 x i32> @combine_bitwise_ops_test3c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE2-LABEL: combine_bitwise_ops_test3c:
; SSE2: # BB#0:
; SSE2-NEXT: pxor %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,2]
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: combine_bitwise_ops_test3c:
; SSSE3: # BB#0:
; SSSE3-NEXT: pxor %xmm1, %xmm0
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,2]
; SSSE3-NEXT: pxor %xmm1, %xmm1
; SSSE3-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: combine_bitwise_ops_test3c:
; SSE41: # BB#0:
; SSE41-NEXT: pxor %xmm1, %xmm0
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE41-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
; SSE41-NEXT: retq
;
; AVX-LABEL: combine_bitwise_ops_test3c:
; AVX: # BB#0:
; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX-NEXT: retq
  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
  %xor = xor <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %xor
}

define <4 x i32> @combine_bitwise_ops_test4c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE2-LABEL: combine_bitwise_ops_test4c:
; SSE2: # BB#0:
; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: combine_bitwise_ops_test4c:
; SSSE3: # BB#0:
; SSSE3-NEXT: pand %xmm1, %xmm0
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: combine_bitwise_ops_test4c:
; SSE41: # BB#0:
; SSE41-NEXT: pand %xmm1, %xmm0
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7]
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSE41-NEXT: retq
;
; AVX1-LABEL: combine_bitwise_ops_test4c:
; AVX1: # BB#0:
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX1-NEXT: retq
;
; AVX2-LABEL: combine_bitwise_ops_test4c:
; AVX2: # BB#0:
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3]
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT: retq
  %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 2, i32 5, i32 7>
  %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 2, i32 5, i32 7>
  %and = and <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %and
}

define <4 x i32> @combine_bitwise_ops_test5c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE2-LABEL: combine_bitwise_ops_test5c:
; SSE2: # BB#0:
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: combine_bitwise_ops_test5c:
; SSSE3: # BB#0:
; SSSE3-NEXT: por %xmm1, %xmm0
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: combine_bitwise_ops_test5c:
; SSE41: # BB#0:
; SSE41-NEXT: por %xmm1, %xmm0
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7]
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSE41-NEXT: retq
;
; AVX1-LABEL: combine_bitwise_ops_test5c:
; AVX1: # BB#0:
; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX1-NEXT: retq
;
; AVX2-LABEL: combine_bitwise_ops_test5c:
; AVX2: # BB#0:
; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3]
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT: retq
  %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 2, i32 5, i32 7>
  %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 2, i32 5, i32 7>
  %or = or <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %or
}

define <4 x i32> @combine_bitwise_ops_test6c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE2-LABEL: combine_bitwise_ops_test6c:
; SSE2: # BB#0:
; SSE2-NEXT: pxor %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: combine_bitwise_ops_test6c:
; SSSE3: # BB#0:
; SSSE3-NEXT: pxor %xmm1, %xmm0
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
; SSSE3-NEXT: pxor %xmm0, %xmm0
; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: combine_bitwise_ops_test6c:
; SSE41: # BB#0:
; SSE41-NEXT: pxor %xmm1, %xmm0
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,1,3]
; SSE41-NEXT: pxor %xmm0, %xmm0
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: combine_bitwise_ops_test6c:
; AVX1: # BB#0:
; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: combine_bitwise_ops_test6c:
; AVX2: # BB#0:
; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX2-NEXT: retq
  %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 2, i32 5, i32 7>
  %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 2, i32 5, i32 7>
  %xor = xor <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %xor
}

define <4 x i32> @combine_nested_undef_test1(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: combine_nested_undef_test1:
; SSE: # BB#0:
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,0,1]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_nested_undef_test1:
; AVX: # BB#0:
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,0,1]
; AVX-NEXT: retq
  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 4, i32 3, i32 1>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 0, i32 3>
  ret <4 x i32> %2
}

define <4 x i32> @combine_nested_undef_test2(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: combine_nested_undef_test2:
; SSE: # BB#0:
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_nested_undef_test2:
; AVX: # BB#0:
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
; AVX-NEXT: retq
  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 5, i32 2, i32 3>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 0, i32 3>
  ret <4 x i32> %2
}

define <4 x i32> @combine_nested_undef_test3(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: combine_nested_undef_test3:
; SSE: # BB#0:
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_nested_undef_test3:
; AVX: # BB#0:
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
; AVX-NEXT: retq
  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 6, i32 2, i32 3>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 0, i32 3>
  ret <4 x i32> %2
}

define <4 x i32> @combine_nested_undef_test4(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: combine_nested_undef_test4:
; SSE: # BB#0:
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE-NEXT: retq
;
; AVX1-LABEL: combine_nested_undef_test4:
; AVX1: # BB#0:
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT: retq
;
; AVX2-LABEL: combine_nested_undef_test4:
; AVX2: # BB#0:
; AVX2-NEXT: vpbroadcastq %xmm0, %xmm0
; AVX2-NEXT: retq
  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 4, i32 7, i32 1>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 4, i32 4, i32 0, i32 3>
  ret <4 x i32> %2
}

define <4 x i32> @combine_nested_undef_test5(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: combine_nested_undef_test5:
; SSE: # BB#0:
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_nested_undef_test5:
; AVX: # BB#0:
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX-NEXT: retq
  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 5, i32 5, i32 2, i32 3>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 4, i32 3>
  ret <4 x i32> %2
}

define <4 x i32> @combine_nested_undef_test6(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: combine_nested_undef_test6:
; SSE: # BB#0:
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_nested_undef_test6:
; AVX: # BB#0:
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; AVX-NEXT: retq
  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 6, i32 2, i32 4>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 0, i32 4>
  ret <4 x i32> %2
}

define <4 x i32> @combine_nested_undef_test7(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: combine_nested_undef_test7:
; SSE: # BB#0:
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,0,2]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_nested_undef_test7:
; AVX: # BB#0:
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,0,2]
; AVX-NEXT: retq
  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 0, i32 2>
  ret <4 x i32> %2
}

define <4 x i32> @combine_nested_undef_test8(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: combine_nested_undef_test8:
; SSE: # BB#0:
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_nested_undef_test8:
; AVX: # BB#0:
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX-NEXT: retq
  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 4, i32 3, i32 4>
  ret <4 x i32> %2
}

define <4 x i32> @combine_nested_undef_test9(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: combine_nested_undef_test9:
; SSE: # BB#0:
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,2]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_nested_undef_test9:
; AVX: # BB#0:
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,2]
; AVX-NEXT: retq
  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 3, i32 2, i32 5>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 4, i32 2>
  ret <4 x i32> %2
}

define <4 x i32> @combine_nested_undef_test10(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: combine_nested_undef_test10:
; SSE: # BB#0:
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,3]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_nested_undef_test10:
; AVX: # BB#0:
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,3]
; AVX-NEXT: retq
  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 1, i32 5, i32 5>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 4>
  ret <4 x i32> %2
}

define <4 x i32> @combine_nested_undef_test11(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: combine_nested_undef_test11:
; SSE: # BB#0:
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,1]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_nested_undef_test11:
; AVX: # BB#0:
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,1]
; AVX-NEXT: retq
  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 2, i32 5, i32 4>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 0>
  ret <4 x i32> %2
}

define <4 x i32> @combine_nested_undef_test12(<4
x i32> %A, <4 x i32> %B) { 938; SSE-LABEL: combine_nested_undef_test12: 939; SSE: # BB#0: 940; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] 941; SSE-NEXT: retq 942; 943; AVX1-LABEL: combine_nested_undef_test12: 944; AVX1: # BB#0: 945; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] 946; AVX1-NEXT: retq 947; 948; AVX2-LABEL: combine_nested_undef_test12: 949; AVX2: # BB#0: 950; AVX2-NEXT: vpbroadcastq %xmm0, %xmm0 951; AVX2-NEXT: retq 952 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 0, i32 2, i32 4> 953 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 4, i32 0, i32 4> 954 ret <4 x i32> %2 955} 956 957; The following pair of shuffles is folded into vector %A. 958define <4 x i32> @combine_nested_undef_test13(<4 x i32> %A, <4 x i32> %B) { 959; ALL-LABEL: combine_nested_undef_test13: 960; ALL: # BB#0: 961; ALL-NEXT: retq 962 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 4, i32 2, i32 6> 963 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 4, i32 0, i32 2, i32 4> 964 ret <4 x i32> %2 965} 966 967; The following pair of shuffles is folded into vector %B. 968define <4 x i32> @combine_nested_undef_test14(<4 x i32> %A, <4 x i32> %B) { 969; SSE-LABEL: combine_nested_undef_test14: 970; SSE: # BB#0: 971; SSE-NEXT: movaps %xmm1, %xmm0 972; SSE-NEXT: retq 973; 974; AVX-LABEL: combine_nested_undef_test14: 975; AVX: # BB#0: 976; AVX-NEXT: vmovaps %xmm1, %xmm0 977; AVX-NEXT: retq 978 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 6, i32 2, i32 4> 979 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 3, i32 4, i32 1, i32 4> 980 ret <4 x i32> %2 981} 982 983 984; Verify that we don't optimize the following cases. We expect more than one shuffle. 985; 986; FIXME: Many of these already don't make sense, and the rest should stop 987; making sense with the new vector shuffle lowering. Revisit at least testing for 988; it. 
989 990define <4 x i32> @combine_nested_undef_test15(<4 x i32> %A, <4 x i32> %B) { 991; SSE2-LABEL: combine_nested_undef_test15: 992; SSE2: # BB#0: 993; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0] 994; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[0,1] 995; SSE2-NEXT: movaps %xmm1, %xmm0 996; SSE2-NEXT: retq 997; 998; SSSE3-LABEL: combine_nested_undef_test15: 999; SSSE3: # BB#0: 1000; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0] 1001; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[0,1] 1002; SSSE3-NEXT: movaps %xmm1, %xmm0 1003; SSSE3-NEXT: retq 1004; 1005; SSE41-LABEL: combine_nested_undef_test15: 1006; SSE41: # BB#0: 1007; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] 1008; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,0,1] 1009; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7] 1010; SSE41-NEXT: retq 1011; 1012; AVX1-LABEL: combine_nested_undef_test15: 1013; AVX1: # BB#0: 1014; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] 1015; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,0,1] 1016; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7] 1017; AVX1-NEXT: retq 1018; 1019; AVX2-LABEL: combine_nested_undef_test15: 1020; AVX2: # BB#0: 1021; AVX2-NEXT: vpbroadcastd %xmm1, %xmm1 1022; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,0,1] 1023; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] 1024; AVX2-NEXT: retq 1025 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 4, i32 3, i32 1> 1026 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 1, i32 0, i32 3> 1027 ret <4 x i32> %2 1028} 1029 1030define <4 x i32> @combine_nested_undef_test16(<4 x i32> %A, <4 x i32> %B) { 1031; SSE2-LABEL: combine_nested_undef_test16: 1032; SSE2: # BB#0: 1033; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] 1034; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,0,2,3] 1035; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 1036; SSE2-NEXT: retq 1037; 1038; SSSE3-LABEL: 
combine_nested_undef_test16: 1039; SSSE3: # BB#0: 1040; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] 1041; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,0,2,3] 1042; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 1043; SSSE3-NEXT: retq 1044; 1045; SSE41-LABEL: combine_nested_undef_test16: 1046; SSE41: # BB#0: 1047; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] 1048; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] 1049; SSE41-NEXT: retq 1050; 1051; AVX1-LABEL: combine_nested_undef_test16: 1052; AVX1: # BB#0: 1053; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] 1054; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] 1055; AVX1-NEXT: retq 1056; 1057; AVX2-LABEL: combine_nested_undef_test16: 1058; AVX2: # BB#0: 1059; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] 1060; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] 1061; AVX2-NEXT: retq 1062 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 5, i32 2, i32 7> 1063 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 1, i32 0, i32 3> 1064 ret <4 x i32> %2 1065} 1066 1067define <4 x i32> @combine_nested_undef_test17(<4 x i32> %A, <4 x i32> %B) { 1068; SSE2-LABEL: combine_nested_undef_test17: 1069; SSE2: # BB#0: 1070; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[1,0] 1071; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[0,2] 1072; SSE2-NEXT: retq 1073; 1074; SSSE3-LABEL: combine_nested_undef_test17: 1075; SSSE3: # BB#0: 1076; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[1,0] 1077; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[0,2] 1078; SSSE3-NEXT: retq 1079; 1080; SSE41-LABEL: combine_nested_undef_test17: 1081; SSE41: # BB#0: 1082; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7] 1083; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,0,1] 1084; SSE41-NEXT: retq 1085; 1086; AVX1-LABEL: combine_nested_undef_test17: 1087; AVX1: # BB#0: 1088; AVX1-NEXT: vpblendw 
{{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7] 1089; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,0,1] 1090; AVX1-NEXT: retq 1091; 1092; AVX2-LABEL: combine_nested_undef_test17: 1093; AVX2: # BB#0: 1094; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] 1095; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,0,1] 1096; AVX2-NEXT: retq 1097 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 1, i32 3, i32 1> 1098 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 1, i32 0, i32 3> 1099 ret <4 x i32> %2 1100} 1101 1102define <4 x i32> @combine_nested_undef_test18(<4 x i32> %A, <4 x i32> %B) { 1103; SSE-LABEL: combine_nested_undef_test18: 1104; SSE: # BB#0: 1105; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,0,3] 1106; SSE-NEXT: retq 1107; 1108; AVX-LABEL: combine_nested_undef_test18: 1109; AVX: # BB#0: 1110; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[1,1,0,3] 1111; AVX-NEXT: retq 1112 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 5, i32 2, i32 7> 1113 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 0, i32 3> 1114 ret <4 x i32> %2 1115} 1116 1117define <4 x i32> @combine_nested_undef_test19(<4 x i32> %A, <4 x i32> %B) { 1118; SSE2-LABEL: combine_nested_undef_test19: 1119; SSE2: # BB#0: 1120; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 1121; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,0,0,0] 1122; SSE2-NEXT: retq 1123; 1124; SSSE3-LABEL: combine_nested_undef_test19: 1125; SSSE3: # BB#0: 1126; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 1127; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,0,0,0] 1128; SSSE3-NEXT: retq 1129; 1130; SSE41-LABEL: combine_nested_undef_test19: 1131; SSE41: # BB#0: 1132; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7] 1133; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,0,0] 1134; SSE41-NEXT: retq 1135; 1136; AVX1-LABEL: combine_nested_undef_test19: 1137; AVX1: # BB#0: 1138; AVX1-NEXT: vpblendw {{.*#+}} 
xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7] 1139; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,0,0] 1140; AVX1-NEXT: retq 1141; 1142; AVX2-LABEL: combine_nested_undef_test19: 1143; AVX2: # BB#0: 1144; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] 1145; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,0,0] 1146; AVX2-NEXT: retq 1147 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 4, i32 5, i32 6> 1148 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 0, i32 0, i32 0> 1149 ret <4 x i32> %2 1150} 1151 1152define <4 x i32> @combine_nested_undef_test20(<4 x i32> %A, <4 x i32> %B) { 1153; SSE2-LABEL: combine_nested_undef_test20: 1154; SSE2: # BB#0: 1155; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,3] 1156; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,3,1] 1157; SSE2-NEXT: movaps %xmm1, %xmm0 1158; SSE2-NEXT: retq 1159; 1160; SSSE3-LABEL: combine_nested_undef_test20: 1161; SSSE3: # BB#0: 1162; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,3] 1163; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,3,1] 1164; SSSE3-NEXT: movaps %xmm1, %xmm0 1165; SSSE3-NEXT: retq 1166; 1167; SSE41-LABEL: combine_nested_undef_test20: 1168; SSE41: # BB#0: 1169; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] 1170; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,3,0] 1171; SSE41-NEXT: retq 1172; 1173; AVX1-LABEL: combine_nested_undef_test20: 1174; AVX1: # BB#0: 1175; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] 1176; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,3,0] 1177; AVX1-NEXT: retq 1178; 1179; AVX2-LABEL: combine_nested_undef_test20: 1180; AVX2: # BB#0: 1181; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] 1182; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,3,0] 1183; AVX2-NEXT: retq 1184 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 3, i32 2, i32 4, i32 4> 1185 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 1, i32 0, i32 3> 1186 ret <4 x i32> %2 1187} 1188 
1189define <4 x i32> @combine_nested_undef_test21(<4 x i32> %A, <4 x i32> %B) { 1190; SSE2-LABEL: combine_nested_undef_test21: 1191; SSE2: # BB#0: 1192; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 1193; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,3,0,3] 1194; SSE2-NEXT: retq 1195; 1196; SSSE3-LABEL: combine_nested_undef_test21: 1197; SSSE3: # BB#0: 1198; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 1199; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,3,0,3] 1200; SSSE3-NEXT: retq 1201; 1202; SSE41-LABEL: combine_nested_undef_test21: 1203; SSE41: # BB#0: 1204; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7] 1205; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] 1206; SSE41-NEXT: retq 1207; 1208; AVX1-LABEL: combine_nested_undef_test21: 1209; AVX1: # BB#0: 1210; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7] 1211; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] 1212; AVX1-NEXT: retq 1213; 1214; AVX2-LABEL: combine_nested_undef_test21: 1215; AVX2: # BB#0: 1216; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] 1217; AVX2-NEXT: vpbroadcastq %xmm0, %xmm0 1218; AVX2-NEXT: retq 1219 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 1, i32 3, i32 1> 1220 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 3> 1221 ret <4 x i32> %2 1222} 1223 1224 1225; Test that we correctly combine shuffles according to rule 1226; shuffle(shuffle(x, y), undef) -> shuffle(y, undef) 1227 1228define <4 x i32> @combine_nested_undef_test22(<4 x i32> %A, <4 x i32> %B) { 1229; SSE-LABEL: combine_nested_undef_test22: 1230; SSE: # BB#0: 1231; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,3] 1232; SSE-NEXT: retq 1233; 1234; AVX-LABEL: combine_nested_undef_test22: 1235; AVX: # BB#0: 1236; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[1,1,1,3] 1237; AVX-NEXT: retq 1238 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 5, i32 2, i32 7> 1239 
%2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 3> 1240 ret <4 x i32> %2 1241} 1242 1243define <4 x i32> @combine_nested_undef_test23(<4 x i32> %A, <4 x i32> %B) { 1244; SSE-LABEL: combine_nested_undef_test23: 1245; SSE: # BB#0: 1246; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,3] 1247; SSE-NEXT: retq 1248; 1249; AVX-LABEL: combine_nested_undef_test23: 1250; AVX: # BB#0: 1251; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,1,0,3] 1252; AVX-NEXT: retq 1253 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 5, i32 2, i32 7> 1254 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 3> 1255 ret <4 x i32> %2 1256} 1257 1258define <4 x i32> @combine_nested_undef_test24(<4 x i32> %A, <4 x i32> %B) { 1259; SSE-LABEL: combine_nested_undef_test24: 1260; SSE: # BB#0: 1261; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,3,2,3] 1262; SSE-NEXT: retq 1263; 1264; AVX-LABEL: combine_nested_undef_test24: 1265; AVX: # BB#0: 1266; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,3,2,3] 1267; AVX-NEXT: retq 1268 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 1, i32 6, i32 7> 1269 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 3, i32 2, i32 4> 1270 ret <4 x i32> %2 1271} 1272 1273define <4 x i32> @combine_nested_undef_test25(<4 x i32> %A, <4 x i32> %B) { 1274; SSE-LABEL: combine_nested_undef_test25: 1275; SSE: # BB#0: 1276; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] 1277; SSE-NEXT: retq 1278; 1279; AVX1-LABEL: combine_nested_undef_test25: 1280; AVX1: # BB#0: 1281; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] 1282; AVX1-NEXT: retq 1283; 1284; AVX2-LABEL: combine_nested_undef_test25: 1285; AVX2: # BB#0: 1286; AVX2-NEXT: vpbroadcastq %xmm0, %xmm0 1287; AVX2-NEXT: retq 1288 %1 = shufflevector <4 x i32> %B, <4 x i32> %A, <4 x i32> <i32 1, i32 5, i32 2, i32 4> 1289 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 3, i32 1, i32 3, i32 1> 1290 ret <4 x i32> 
%2 1291} 1292 1293define <4 x i32> @combine_nested_undef_test26(<4 x i32> %A, <4 x i32> %B) { 1294; SSE-LABEL: combine_nested_undef_test26: 1295; SSE: # BB#0: 1296; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] 1297; SSE-NEXT: retq 1298; 1299; AVX-LABEL: combine_nested_undef_test26: 1300; AVX: # BB#0: 1301; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] 1302; AVX-NEXT: retq 1303 %1 = shufflevector <4 x i32> %B, <4 x i32> %A, <4 x i32> <i32 1, i32 2, i32 6, i32 7> 1304 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 2, i32 3> 1305 ret <4 x i32> %2 1306} 1307 1308define <4 x i32> @combine_nested_undef_test27(<4 x i32> %A, <4 x i32> %B) { 1309; SSE-LABEL: combine_nested_undef_test27: 1310; SSE: # BB#0: 1311; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] 1312; SSE-NEXT: retq 1313; 1314; AVX1-LABEL: combine_nested_undef_test27: 1315; AVX1: # BB#0: 1316; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] 1317; AVX1-NEXT: retq 1318; 1319; AVX2-LABEL: combine_nested_undef_test27: 1320; AVX2: # BB#0: 1321; AVX2-NEXT: vpbroadcastq %xmm0, %xmm0 1322; AVX2-NEXT: retq 1323 %1 = shufflevector <4 x i32> %B, <4 x i32> %A, <4 x i32> <i32 2, i32 1, i32 5, i32 4> 1324 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 3, i32 2> 1325 ret <4 x i32> %2 1326} 1327 1328define <4 x i32> @combine_nested_undef_test28(<4 x i32> %A, <4 x i32> %B) { 1329; SSE-LABEL: combine_nested_undef_test28: 1330; SSE: # BB#0: 1331; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,0] 1332; SSE-NEXT: retq 1333; 1334; AVX-LABEL: combine_nested_undef_test28: 1335; AVX: # BB#0: 1336; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,1,0] 1337; AVX-NEXT: retq 1338 %1 = shufflevector <4 x i32> %B, <4 x i32> %A, <4 x i32> <i32 1, i32 2, i32 4, i32 5> 1339 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 3, i32 2> 1340 ret <4 x i32> %2 1341} 1342 1343define <4 x float> @combine_test1(<4 x float> %a, <4 x float> %b) { 1344; SSE-LABEL: 
combine_test1: 1345; SSE: # BB#0: 1346; SSE-NEXT: movaps %xmm1, %xmm0 1347; SSE-NEXT: retq 1348; 1349; AVX-LABEL: combine_test1: 1350; AVX: # BB#0: 1351; AVX-NEXT: vmovaps %xmm1, %xmm0 1352; AVX-NEXT: retq 1353 %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3> 1354 %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7> 1355 ret <4 x float> %2 1356} 1357 1358define <4 x float> @combine_test2(<4 x float> %a, <4 x float> %b) { 1359; SSE2-LABEL: combine_test2: 1360; SSE2: # BB#0: 1361; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] 1362; SSE2-NEXT: movaps %xmm1, %xmm0 1363; SSE2-NEXT: retq 1364; 1365; SSSE3-LABEL: combine_test2: 1366; SSSE3: # BB#0: 1367; SSSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] 1368; SSSE3-NEXT: movaps %xmm1, %xmm0 1369; SSSE3-NEXT: retq 1370; 1371; SSE41-LABEL: combine_test2: 1372; SSE41: # BB#0: 1373; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] 1374; SSE41-NEXT: retq 1375; 1376; AVX-LABEL: combine_test2: 1377; AVX: # BB#0: 1378; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] 1379; AVX-NEXT: retq 1380 %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7> 1381 %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 3> 1382 ret <4 x float> %2 1383} 1384 1385define <4 x float> @combine_test3(<4 x float> %a, <4 x float> %b) { 1386; SSE-LABEL: combine_test3: 1387; SSE: # BB#0: 1388; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1389; SSE-NEXT: retq 1390; 1391; AVX-LABEL: combine_test3: 1392; AVX: # BB#0: 1393; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1394; AVX-NEXT: retq 1395 %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 1, i32 7> 1396 %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 1> 1397 ret <4 x float> %2 1398} 1399 1400define <4 x float> @combine_test4(<4 x float> %a, <4 x float> %b) { 
1401; SSE-LABEL: combine_test4: 1402; SSE: # BB#0: 1403; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] 1404; SSE-NEXT: movapd %xmm1, %xmm0 1405; SSE-NEXT: retq 1406; 1407; AVX-LABEL: combine_test4: 1408; AVX: # BB#0: 1409; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] 1410; AVX-NEXT: retq 1411 %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 2, i32 3, i32 5, i32 5> 1412 %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1> 1413 ret <4 x float> %2 1414} 1415 1416define <4 x float> @combine_test5(<4 x float> %a, <4 x float> %b) { 1417; SSE2-LABEL: combine_test5: 1418; SSE2: # BB#0: 1419; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0] 1420; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] 1421; SSE2-NEXT: retq 1422; 1423; SSSE3-LABEL: combine_test5: 1424; SSSE3: # BB#0: 1425; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0] 1426; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] 1427; SSSE3-NEXT: retq 1428; 1429; SSE41-LABEL: combine_test5: 1430; SSE41: # BB#0: 1431; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] 1432; SSE41-NEXT: retq 1433; 1434; AVX-LABEL: combine_test5: 1435; AVX: # BB#0: 1436; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] 1437; AVX-NEXT: retq 1438 %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3> 1439 %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 7> 1440 ret <4 x float> %2 1441} 1442 1443define <4 x i32> @combine_test6(<4 x i32> %a, <4 x i32> %b) { 1444; SSE-LABEL: combine_test6: 1445; SSE: # BB#0: 1446; SSE-NEXT: movaps %xmm1, %xmm0 1447; SSE-NEXT: retq 1448; 1449; AVX-LABEL: combine_test6: 1450; AVX: # BB#0: 1451; AVX-NEXT: vmovaps %xmm1, %xmm0 1452; AVX-NEXT: retq 1453 %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3> 1454 %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7> 1455 
ret <4 x i32> %2 1456} 1457 1458define <4 x i32> @combine_test7(<4 x i32> %a, <4 x i32> %b) { 1459; SSE2-LABEL: combine_test7: 1460; SSE2: # BB#0: 1461; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] 1462; SSE2-NEXT: movaps %xmm1, %xmm0 1463; SSE2-NEXT: retq 1464; 1465; SSSE3-LABEL: combine_test7: 1466; SSSE3: # BB#0: 1467; SSSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] 1468; SSSE3-NEXT: movaps %xmm1, %xmm0 1469; SSSE3-NEXT: retq 1470; 1471; SSE41-LABEL: combine_test7: 1472; SSE41: # BB#0: 1473; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] 1474; SSE41-NEXT: retq 1475; 1476; AVX1-LABEL: combine_test7: 1477; AVX1: # BB#0: 1478; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] 1479; AVX1-NEXT: retq 1480; 1481; AVX2-LABEL: combine_test7: 1482; AVX2: # BB#0: 1483; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] 1484; AVX2-NEXT: retq 1485 %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7> 1486 %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 3> 1487 ret <4 x i32> %2 1488} 1489 1490define <4 x i32> @combine_test8(<4 x i32> %a, <4 x i32> %b) { 1491; SSE-LABEL: combine_test8: 1492; SSE: # BB#0: 1493; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1494; SSE-NEXT: retq 1495; 1496; AVX-LABEL: combine_test8: 1497; AVX: # BB#0: 1498; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1499; AVX-NEXT: retq 1500 %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 5, i32 1, i32 7> 1501 %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 1> 1502 ret <4 x i32> %2 1503} 1504 1505define <4 x i32> @combine_test9(<4 x i32> %a, <4 x i32> %b) { 1506; SSE-LABEL: combine_test9: 1507; SSE: # BB#0: 1508; SSE-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm0[1] 1509; SSE-NEXT: movdqa %xmm1, %xmm0 1510; SSE-NEXT: retq 1511; 1512; AVX-LABEL: combine_test9: 1513; AVX: # BB#0: 1514; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm0 = 
xmm1[1],xmm0[1] 1515; AVX-NEXT: retq 1516 %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 3, i32 5, i32 5> 1517 %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1> 1518 ret <4 x i32> %2 1519} 1520 1521define <4 x i32> @combine_test10(<4 x i32> %a, <4 x i32> %b) { 1522; SSE2-LABEL: combine_test10: 1523; SSE2: # BB#0: 1524; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0] 1525; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] 1526; SSE2-NEXT: retq 1527; 1528; SSSE3-LABEL: combine_test10: 1529; SSSE3: # BB#0: 1530; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0] 1531; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] 1532; SSSE3-NEXT: retq 1533; 1534; SSE41-LABEL: combine_test10: 1535; SSE41: # BB#0: 1536; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7] 1537; SSE41-NEXT: retq 1538; 1539; AVX1-LABEL: combine_test10: 1540; AVX1: # BB#0: 1541; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7] 1542; AVX1-NEXT: retq 1543; 1544; AVX2-LABEL: combine_test10: 1545; AVX2: # BB#0: 1546; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] 1547; AVX2-NEXT: retq 1548 %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3> 1549 %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 7> 1550 ret <4 x i32> %2 1551} 1552 1553define <4 x float> @combine_test11(<4 x float> %a, <4 x float> %b) { 1554; ALL-LABEL: combine_test11: 1555; ALL: # BB#0: 1556; ALL-NEXT: retq 1557 %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3> 1558 %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 4, i32 1, i32 6, i32 3> 1559 ret <4 x float> %2 1560} 1561 1562define <4 x float> @combine_test12(<4 x float> %a, <4 x float> %b) { 1563; SSE2-LABEL: combine_test12: 1564; SSE2: # BB#0: 1565; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] 1566; SSE2-NEXT: movaps %xmm1, %xmm0 
1567; SSE2-NEXT: retq 1568; 1569; SSSE3-LABEL: combine_test12: 1570; SSSE3: # BB#0: 1571; SSSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] 1572; SSSE3-NEXT: movaps %xmm1, %xmm0 1573; SSSE3-NEXT: retq 1574; 1575; SSE41-LABEL: combine_test12: 1576; SSE41: # BB#0: 1577; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] 1578; SSE41-NEXT: retq 1579; 1580; AVX-LABEL: combine_test12: 1581; AVX: # BB#0: 1582; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] 1583; AVX-NEXT: retq 1584 %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7> 1585 %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 4, i32 1, i32 2, i32 3> 1586 ret <4 x float> %2 1587} 1588 1589define <4 x float> @combine_test13(<4 x float> %a, <4 x float> %b) { 1590; SSE-LABEL: combine_test13: 1591; SSE: # BB#0: 1592; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1593; SSE-NEXT: retq 1594; 1595; AVX-LABEL: combine_test13: 1596; AVX: # BB#0: 1597; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1598; AVX-NEXT: retq 1599 %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5> 1600 %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 4, i32 5, i32 2, i32 3> 1601 ret <4 x float> %2 1602} 1603 1604define <4 x float> @combine_test14(<4 x float> %a, <4 x float> %b) { 1605; SSE-LABEL: combine_test14: 1606; SSE: # BB#0: 1607; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] 1608; SSE-NEXT: retq 1609; 1610; AVX-LABEL: combine_test14: 1611; AVX: # BB#0: 1612; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] 1613; AVX-NEXT: retq 1614 %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 6, i32 7, i32 5, i32 5> 1615 %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 6, i32 7, i32 0, i32 1> 1616 ret <4 x float> %2 1617} 1618 1619define <4 x float> @combine_test15(<4 x float> %a, <4 x float> %b) { 1620; SSE2-LABEL: combine_test15: 1621; SSE2: # BB#0: 1622; SSE2-NEXT: shufps 
{{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0] 1623; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] 1624; SSE2-NEXT: retq 1625; 1626; SSSE3-LABEL: combine_test15: 1627; SSSE3: # BB#0: 1628; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0] 1629; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] 1630; SSSE3-NEXT: retq 1631; 1632; SSE41-LABEL: combine_test15: 1633; SSE41: # BB#0: 1634; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] 1635; SSE41-NEXT: retq 1636; 1637; AVX-LABEL: combine_test15: 1638; AVX: # BB#0: 1639; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] 1640; AVX-NEXT: retq 1641 %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 7> 1642 %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 2, i32 3> 1643 ret <4 x float> %2 1644} 1645 1646define <4 x i32> @combine_test16(<4 x i32> %a, <4 x i32> %b) { 1647; ALL-LABEL: combine_test16: 1648; ALL: # BB#0: 1649; ALL-NEXT: retq 1650 %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3> 1651 %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 4, i32 1, i32 6, i32 3> 1652 ret <4 x i32> %2 1653} 1654 1655define <4 x i32> @combine_test17(<4 x i32> %a, <4 x i32> %b) { 1656; SSE2-LABEL: combine_test17: 1657; SSE2: # BB#0: 1658; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] 1659; SSE2-NEXT: movaps %xmm1, %xmm0 1660; SSE2-NEXT: retq 1661; 1662; SSSE3-LABEL: combine_test17: 1663; SSSE3: # BB#0: 1664; SSSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] 1665; SSSE3-NEXT: movaps %xmm1, %xmm0 1666; SSSE3-NEXT: retq 1667; 1668; SSE41-LABEL: combine_test17: 1669; SSE41: # BB#0: 1670; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] 1671; SSE41-NEXT: retq 1672; 1673; AVX1-LABEL: combine_test17: 1674; AVX1: # BB#0: 1675; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] 1676; AVX1-NEXT: retq 1677; 1678; AVX2-LABEL: combine_test17: 1679; AVX2: # BB#0: 1680; 
AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX2-NEXT: retq
  %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
  %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
  ret <4 x i32> %2
}

; The two shuffles combine to <a0,a1,b0,b1>, so this should fold to a single
; low-qword unpack of the two inputs.
define <4 x i32> @combine_test18(<4 x i32> %a, <4 x i32> %b) {
; SSE-LABEL: combine_test18:
; SSE: # BB#0:
; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_test18:
; AVX: # BB#0:
; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT: retq
  %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
  ret <4 x i32> %2
}

; The two shuffles combine to <a2,a3,b2,b3>, so this should fold to a single
; high-qword unpack of the two inputs.
define <4 x i32> @combine_test19(<4 x i32> %a, <4 x i32> %b) {
; SSE-LABEL: combine_test19:
; SSE: # BB#0:
; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_test19:
; AVX: # BB#0:
; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; AVX-NEXT: retq
  %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 6, i32 7, i32 5, i32 5>
  %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
  ret <4 x i32> %2
}

; Net effect of the two shuffles is the blend <b0,a1,b2,b3>.
define <4 x i32> @combine_test20(<4 x i32> %a, <4 x i32> %b) {
; SSE2-LABEL: combine_test20:
; SSE2: # BB#0:
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: combine_test20:
; SSSE3: # BB#0:
; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: combine_test20:
; SSE41: # BB#0:
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: combine_test20:
; AVX1: # BB#0:
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: combine_test20:
; AVX2: # BB#0:
; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
; AVX2-NEXT: retq
  %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 7>
  %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 0, i32 5, i32 2, i32 3>
  ret <4 x i32> %2
}

; The two half-extracting shuffles of the <8 x i32> input should become a
; low/high qword unpack pair; the low half is stored, the high half returned.
define <4 x i32> @combine_test21(<8 x i32> %a, <4 x i32>* %ptr) {
; SSE-LABEL: combine_test21:
; SSE: # BB#0:
; SSE-NEXT: movdqa %xmm0, %xmm2
; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; SSE-NEXT: movdqa %xmm2, (%rdi)
; SSE-NEXT: retq
;
; AVX1-LABEL: combine_test21:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm0[0],xmm1[0]
; AVX1-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; AVX1-NEXT: vmovdqa %xmm2, (%rdi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: combine_test21:
; AVX2: # BB#0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm0[0],xmm1[0]
; AVX2-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; AVX2-NEXT: vmovdqa %xmm2, (%rdi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
  %1 = shufflevector <8 x i32> %a, <8 x i32> %a, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  %2 = shufflevector <8 x i32> %a, <8 x i32> %a, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
  store <4 x i32> %1, <4 x i32>* %ptr, align 16
  ret <4 x i32> %2
}

; Concatenating two loaded <2 x float> values into the low half of an
; <8 x float> should lower to a movq load plus a movhpd load.
define <8 x float> @combine_test22(<2 x float>* %a, <2 x float>* %b) {
; SSE-LABEL: combine_test22:
; SSE: # BB#0:
; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; SSE-NEXT: movhpd (%rsi), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_test22:
; AVX: # BB#0:
; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: vmovhpd (%rsi), %xmm0, %xmm0
; AVX-NEXT: retq
; Current AVX2 lowering of this is still awful, not adding a test case.
  %1 = load <2 x float>, <2 x float>* %a, align 8
  %2 = load <2 x float>, <2 x float>* %b, align 8
  %3 = shufflevector <2 x float> %1, <2 x float> %2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
  ret <8 x float> %3
}

; Check some negative cases.
; FIXME: Do any of these really make sense? Are they redundant with the above tests?

; Only lanes of %b survive (<b0,b1,b2,b0>), so a single shuffle of %b suffices.
define <4 x float> @combine_test1b(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: combine_test1b:
; SSE: # BB#0:
; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1,2,0]
; SSE-NEXT: movaps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_test1b:
; AVX: # BB#0:
; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[0,1,2,0]
; AVX-NEXT: retq
  %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 0>
  ret <4 x float> %2
}

; Result is <b0,b1,b0,b1>, i.e. a broadcast of %b's low qword
; (movlhps on SSE2, movddup once available).
define <4 x float> @combine_test2b(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: combine_test2b:
; SSE2: # BB#0:
; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0,0]
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: combine_test2b:
; SSSE3: # BB#0:
; SSSE3-NEXT: movddup {{.*#+}} xmm0 = xmm1[0,0]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: combine_test2b:
; SSE41: # BB#0:
; SSE41-NEXT: movddup {{.*#+}} xmm0 = xmm1[0,0]
; SSE41-NEXT: retq
;
; AVX-LABEL: combine_test2b:
; AVX: # BB#0:
; AVX-NEXT: vmovddup {{.*#+}} xmm0 = xmm1[0,0]
; AVX-NEXT: retq
  %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 0, i32 5>
  ret <4 x float> %2
}

; Result is <a0,b3,b2,b3>; per the CHECK lines it still takes two
; instructions on every subtarget.
define <4 x float> @combine_test3b(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: combine_test3b:
; SSE2: # BB#0:
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[3,0]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: combine_test3b:
; SSSE3: # BB#0:
; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[3,0]
; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: combine_test3b:
; SSE41: # BB#0:
; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3,2,3]
; SSE41-NEXT: retq
;
; AVX-LABEL: combine_test3b:
; AVX: # BB#0:
; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,3,2,3]
; AVX-NEXT: retq
  %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 0, i32 6, i32 3>
  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 7, i32 2, i32 7>
  ret <4 x float> %2
}

; Only lanes of %b survive (<b1,b1,b2,b3>), so a single shuffle of %b suffices.
define <4 x float> @combine_test4b(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: combine_test4b:
; SSE: # BB#0:
; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,2,3]
; SSE-NEXT: movaps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_test4b:
; AVX: # BB#0:
; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[1,1,2,3]
; AVX-NEXT: retq
  %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 5, i32 5, i32 2, i32 7>
  ret <4 x float> %2
}


; Verify that we correctly fold shuffles even when we use illegal vector types.

; Result is <A0,B1,B2,B3>: a single lane of %A blended into %B after both
; <4 x i8> loads are zero-extended to 32-bit lanes.
define <4 x i8> @combine_test1c(<4 x i8>* %a, <4 x i8>* %b) {
; SSE2-LABEL: combine_test1c:
; SSE2: # BB#0:
; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE2-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: combine_test1c:
; SSSE3: # BB#0:
; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSSE3-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: combine_test1c:
; SSE41: # BB#0:
; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: combine_test1c:
; AVX1: # BB#0:
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: combine_test1c:
; AVX2: # BB#0:
; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX2-NEXT: retq
  %A = load <4 x i8>, <4 x i8>* %a
  %B = load <4 x i8>, <4 x i8>* %b
  %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  %2 = shufflevector <4 x i8> %1, <4 x i8> %B, <4 x i32> <i32 0, i32 1, i32 6, i32 3>
  ret <4 x i8> %2
}

; Result is <A0,A1,B0,B1>, i.e. a low-qword unpack of the two widened vectors.
define <4 x i8> @combine_test2c(<4 x i8>* %a, <4 x i8>* %b) {
; SSE2-LABEL: combine_test2c:
; SSE2: # BB#0:
; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: combine_test2c:
; SSSE3: # BB#0:
; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: combine_test2c:
; SSE41: # BB#0:
; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE41-NEXT: retq
;
; AVX-LABEL: combine_test2c:
; AVX: # BB#0:
; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT: retq
  %A = load <4 x i8>, <4 x i8>* %a
  %B = load <4 x i8>, <4 x i8>* %b
  %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> <i32 0, i32 5, i32 1, i32 5>
  %2 = shufflevector <4 x i8> %1, <4 x i8> %B, <4 x i32> <i32 0, i32 2, i32 4, i32 1>
  ret <4 x i8> %2
}

; Result is <B2,B3,A2,A3>, i.e. a high-qword unpack of the two widened vectors.
define <4 x i8> @combine_test3c(<4 x i8>* %a, <4 x i8>* %b) {
; SSE2-LABEL: combine_test3c:
; SSE2: # BB#0:
; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: combine_test3c:
; SSSE3: # BB#0:
; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSSE3-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: combine_test3c:
; SSE41: # BB#0:
; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; SSE41-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; SSE41-NEXT: retq
;
; AVX-LABEL: combine_test3c:
; AVX: # BB#0:
; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1]
; AVX-NEXT: retq
  %A = load <4 x i8>, <4 x i8>* %a
  %B = load <4 x i8>, <4 x i8>* %b
  %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> <i32 2, i32 3, i32 5, i32 5>
  %2 = shufflevector <4 x i8> %1, <4 x i8> %B, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
  ret <4 x i8> %2
}

; Result is <B0,A1,B2,B3>: a single lane of %A blended into %B.
define <4 x i8> @combine_test4c(<4 x i8>* %a, <4 x i8>* %b) {
; SSE2-LABEL: combine_test4c:
; SSE2: # BB#0:
; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: combine_test4c:
; SSSE3: # BB#0:
; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: combine_test4c:
; SSE41: # BB#0:
; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: combine_test4c:
; AVX1: # BB#0:
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: combine_test4c:
; AVX2: # BB#0:
; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
; AVX2-NEXT: retq
  %A = load <4 x i8>, <4 x i8>* %a
  %B = load <4 x i8>, <4 x i8>* %b
  %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
  %2 = shufflevector <4 x i8> %1, <4 x i8> %B, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
  ret <4 x i8> %2
}


; The following test cases are generated from this C++ code
;
;__m128 blend_01(__m128 a, __m128 b)
;{
; __m128 s = a;
; s = _mm_blend_ps( s, b, 1<<0 );
; s = _mm_blend_ps( s, b, 1<<1 );
; return s;
;}
;
;__m128 blend_02(__m128 a, __m128 b)
;{
; __m128 s = a;
; s = _mm_blend_ps( s, b, 1<<0 );
; s = _mm_blend_ps( s, b, 1<<2 );
; return s;
;}
;
;__m128 blend_123(__m128 a, __m128 b)
;{
; __m128 s = a;
; s = _mm_blend_ps( s, b, 1<<1 );
; s = _mm_blend_ps( s, b, 1<<2 );
; s = _mm_blend_ps( s, b, 1<<3 );
; return s;
;}

; Ideally, we should collapse the following shuffles into a single one.

define <4 x float> @combine_blend_01(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: combine_blend_01:
; SSE2: # BB#0:
; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: combine_blend_01:
; SSSE3: # BB#0:
; SSSE3-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: combine_blend_01:
; SSE41: # BB#0:
; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE41-NEXT: retq
;
; AVX-LABEL: combine_blend_01:
; AVX: # BB#0:
; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; AVX-NEXT: retq
  %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 undef, i32 2, i32 3>
  %shuffle6 = shufflevector <4 x float> %shuffle, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 3>
  ret <4 x float> %shuffle6
}

define <4 x float> @combine_blend_02(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: combine_blend_02:
; SSE2: # BB#0:
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,3]
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,1,3]
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: combine_blend_02:
; SSSE3: # BB#0:
; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,3]
; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,1,3]
; SSSE3-NEXT: movaps %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: combine_blend_02:
; SSE41: # BB#0:
; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
; SSE41-NEXT: retq
;
; AVX-LABEL: combine_blend_02:
; AVX: # BB#0:
; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
; AVX-NEXT: retq
  %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 undef, i32 3>
  %shuffle6 = shufflevector <4 x float> %shuffle, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 3>
  ret <4 x float> %shuffle6
}

define <4 x float> @combine_blend_123(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: combine_blend_123:
; SSE2: # BB#0:
; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: combine_blend_123:
; SSSE3: # BB#0:
; SSSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSSE3-NEXT: movaps %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: combine_blend_123:
; SSE41: # BB#0:
; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; SSE41-NEXT: retq
;
; AVX-LABEL: combine_blend_123:
; AVX: # BB#0:
; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX-NEXT: retq
  %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 undef, i32 undef>
  %shuffle6 = shufflevector <4 x float> %shuffle, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 undef>
  %shuffle12 = shufflevector <4 x float> %shuffle6, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
  ret <4 x float> %shuffle12
}

; In each of the three movhl tests below, the shuffle pair combines to
; <b2,b3,a2,a3>, which is a single high-qword unpack of %b and %a.
define <4 x i32> @combine_test_movhl_1(<4 x i32> %a, <4 x i32> %b) {
; SSE-LABEL: combine_test_movhl_1:
; SSE: # BB#0:
; SSE-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-NEXT: movdqa %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_test_movhl_1:
; AVX: # BB#0:
; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1]
; AVX-NEXT: retq
  %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 7, i32 5, i32 3>
  %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 6, i32 1, i32 0, i32 3>
  ret <4 x i32> %2
}

define <4 x i32> @combine_test_movhl_2(<4 x i32> %a, <4 x i32> %b) {
; SSE-LABEL: combine_test_movhl_2:
; SSE: # BB#0:
; SSE-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-NEXT: movdqa %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_test_movhl_2:
; AVX: # BB#0:
; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1]
; AVX-NEXT: retq
  %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 0, i32 3, i32 6>
  %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 3, i32 7, i32 0, i32 2>
  ret <4 x i32> %2
}

define <4 x i32> @combine_test_movhl_3(<4 x i32> %a, <4 x i32> %b) {
; SSE-LABEL: combine_test_movhl_3:
; SSE: # BB#0:
; SSE-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-NEXT: movdqa %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_test_movhl_3:
; AVX: # BB#0:
; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1]
; AVX-NEXT: retq
  %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 7, i32 6, i32 3, i32 2>
  %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 6, i32 0, i32 3, i32 2>
  ret <4 x i32> %2
}


; Verify that we fold shuffles according to rule:
;  (shuffle(shuffle A, Undef, M0), B, M1) -> (shuffle A, B, M2)

define <4 x float> @combine_undef_input_test1(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: combine_undef_input_test1:
; SSE2: # BB#0:
; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: combine_undef_input_test1:
; SSSE3: # BB#0:
; SSSE3-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: combine_undef_input_test1:
; SSE41: # BB#0:
; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE41-NEXT: retq
;
; AVX-LABEL: combine_undef_input_test1:
; AVX: # BB#0:
; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; AVX-NEXT: retq
  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 4, i32 2, i32 3, i32 1>
  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 4, i32 5, i32 1, i32 2>
  ret <4 x float> %2
}

define <4 x float> @combine_undef_input_test2(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: combine_undef_input_test2:
; SSE: # BB#0:
; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_undef_input_test2:
; AVX: # BB#0:
; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT: retq
  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 6, i32 0, i32 1, i32 7>
  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 1, i32 2, i32 4, i32 5>
  ret <4 x float> %2
}

define <4 x float> @combine_undef_input_test3(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: combine_undef_input_test3:
; SSE: # BB#0:
; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_undef_input_test3:
; AVX: # BB#0:
; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT: retq
  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 5, i32 1, i32 7>
  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 1>
  ret <4 x float> %2
}

define <4 x float> @combine_undef_input_test4(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: combine_undef_input_test4:
; SSE: # BB#0:
; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-NEXT: movapd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_undef_input_test4:
; AVX: # BB#0:
; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
; AVX-NEXT: retq
  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 5, i32 5>
  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
  ret <4 x float> %2
}

define <4 x float> @combine_undef_input_test5(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: combine_undef_input_test5:
; SSE2: # BB#0:
; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSE2-NEXT: movapd %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: combine_undef_input_test5:
; SSSE3: # BB#0:
; SSSE3-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSSE3-NEXT: movapd %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: combine_undef_input_test5:
; SSE41: # BB#0:
; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; SSE41-NEXT: retq
;
; AVX-LABEL: combine_undef_input_test5:
; AVX: # BB#0:
; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; AVX-NEXT: retq
  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 3>
  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 2, i32 6, i32 7>
  ret <4 x float> %2
}


; Verify that we fold shuffles according to rule:
;  (shuffle(shuffle A, Undef, M0), A, M1) -> (shuffle A, Undef, M2)

define <4 x float> @combine_undef_input_test6(<4 x float> %a) {
; ALL-LABEL: combine_undef_input_test6:
; ALL: # BB#0:
; ALL-NEXT: retq
  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 4, i32 2, i32 3, i32 1>
  %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 4, i32 5, i32 1, i32 2>
  ret <4 x float> %2
}

define <4 x float> @combine_undef_input_test7(<4 x float> %a) {
; SSE2-LABEL: combine_undef_input_test7:
; SSE2: # BB#0:
; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0,0]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: combine_undef_input_test7:
; SSSE3: # BB#0:
; SSSE3-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: combine_undef_input_test7:
; SSE41: # BB#0:
; SSE41-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0]
; SSE41-NEXT: retq
;
; AVX-LABEL: combine_undef_input_test7:
; AVX: # BB#0:
; AVX-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; AVX-NEXT: retq
  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 6, i32 0, i32 1, i32 7>
  %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 1, i32 2, i32 4, i32 5>
  ret <4 x float> %2
}

define <4 x float> @combine_undef_input_test8(<4 x float> %a) {
; SSE2-LABEL: combine_undef_input_test8:
; SSE2: # BB#0:
; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0,0]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: combine_undef_input_test8:
; SSSE3: # BB#0:
; SSSE3-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: combine_undef_input_test8:
; SSE41: # BB#0:
; SSE41-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0]
; SSE41-NEXT: retq
;
; AVX-LABEL: combine_undef_input_test8:
; AVX: # BB#0:
; AVX-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; AVX-NEXT: retq
  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 5, i32 1, i32 7>
  %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 2, i32 4, i32 1>
  ret <4 x float> %2
}

define <4 x float> @combine_undef_input_test9(<4 x float> %a) {
; SSE-LABEL: combine_undef_input_test9:
; SSE: # BB#0:
; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_undef_input_test9:
; AVX: # BB#0:
; AVX-NEXT: vmovhlps {{.*#+}} xmm0 = xmm0[1,1]
; AVX-NEXT: retq
  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 5, i32 5>
  %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
  ret <4 x float> %2
}

define <4 x float> @combine_undef_input_test10(<4 x float> %a) {
; ALL-LABEL: combine_undef_input_test10:
; ALL: # BB#0:
; ALL-NEXT: retq
  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 3>
  %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 2, i32 6, i32 7>
  ret <4 x float> %2
}

define <4 x float> @combine_undef_input_test11(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: combine_undef_input_test11:
; SSE2: # BB#0:
; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: combine_undef_input_test11:
; SSSE3: # BB#0:
; SSSE3-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: combine_undef_input_test11:
; SSE41: # BB#0:
; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE41-NEXT: retq
;
; AVX-LABEL: combine_undef_input_test11:
; AVX: # BB#0:
; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; AVX-NEXT: retq
  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 4, i32 2, i32 3, i32 1>
  %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> <i32 0, i32 1, i32 5, i32 6>
  ret <4 x float> %2
}

define <4 x float> @combine_undef_input_test12(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: combine_undef_input_test12:
; SSE: # BB#0:
; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_undef_input_test12:
; AVX: # BB#0:
; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT: retq
  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 6, i32 0, i32 1, i32 7>
  %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> <i32 5, i32 6, i32 0, i32 1>
  ret <4 x float> %2
}

define <4 x float> @combine_undef_input_test13(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: combine_undef_input_test13:
; SSE: # BB#0:
; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_undef_input_test13:
; AVX: # BB#0:
; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT: retq
  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 5, i32 1, i32 7>
  %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> <i32 4, i32 5, i32 0, i32 5>
  ret <4 x float> %2
}

define <4 x float> @combine_undef_input_test14(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: combine_undef_input_test14:
; SSE: # BB#0:
; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-NEXT: movapd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_undef_input_test14:
; AVX: # BB#0:
; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
; AVX-NEXT: retq
  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 5, i32 5>
  %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
  ret <4 x float> %2
}

define <4 x float> @combine_undef_input_test15(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: combine_undef_input_test15:
; SSE2: # BB#0:
; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSE2-NEXT: movapd %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: combine_undef_input_test15:
; SSSE3: # BB#0:
; SSSE3-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSSE3-NEXT: movapd %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: combine_undef_input_test15:
; SSE41: # BB#0:
; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; SSE41-NEXT: retq
;
; AVX-LABEL: combine_undef_input_test15:
; AVX: # BB#0:
; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; AVX-NEXT: retq
  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 3>
  %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> <i32 4, i32 6, i32 2, i32 3>
  ret <4 x float> %2
}


; Verify that shuffles are canonicalized according to rules:
;  shuffle(B, shuffle(A, Undef)) -> shuffle(shuffle(A, Undef), B)
;
; This allows us to trigger the following combine rule:
;  (shuffle(shuffle A, Undef, M0), A, M1) -> (shuffle A, Undef, M2)
;
; As a result, all the shuffle pairs in each function below should be
; combined into a single legal shuffle operation.

define <4 x float> @combine_undef_input_test16(<4 x float> %a) {
; ALL-LABEL: combine_undef_input_test16:
; ALL: # BB#0:
; ALL-NEXT: retq
  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 4, i32 2, i32 3, i32 1>
  %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> <i32 0, i32 1, i32 5, i32 3>
  ret <4 x float> %2
}

define <4 x float> @combine_undef_input_test17(<4 x float> %a) {
; SSE2-LABEL: combine_undef_input_test17:
; SSE2: # BB#0:
; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0,0]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: combine_undef_input_test17:
; SSSE3: # BB#0:
; SSSE3-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: combine_undef_input_test17:
; SSE41: # BB#0:
; SSE41-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0]
; SSE41-NEXT: retq
;
; AVX-LABEL: combine_undef_input_test17:
; AVX: # BB#0:
; AVX-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; AVX-NEXT: retq
  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 6, i32 0, i32 1, i32 7>
  %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> <i32 5, i32 6, i32 0, i32 1>
  ret <4 x float> %2
}

define <4 x float> @combine_undef_input_test18(<4 x float> %a) {
; SSE2-LABEL: combine_undef_input_test18:
; SSE2: # BB#0:
; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0,0]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: combine_undef_input_test18:
; SSSE3: # BB#0:
; SSSE3-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: combine_undef_input_test18:
; SSE41: # BB#0:
; SSE41-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0]
; SSE41-NEXT: retq
;
; AVX-LABEL: combine_undef_input_test18:
; AVX: # BB#0:
; AVX-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; AVX-NEXT: retq
  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 5, i32 1, i32 7>
  %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> <i32 4, i32 6, i32 0, i32 5>
  ret <4 x float> %2
}

define <4 x float> @combine_undef_input_test19(<4 x float> %a) {
; SSE-LABEL: combine_undef_input_test19:
; SSE: # BB#0:
; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_undef_input_test19:
; AVX: # BB#0:
; AVX-NEXT: vmovhlps {{.*#+}} xmm0 = xmm0[1,1]
; AVX-NEXT: retq
  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 5, i32 5>
  %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
  ret <4 x float> %2
}

define <4 x float> @combine_undef_input_test20(<4 x float> %a) {
; ALL-LABEL: combine_undef_input_test20:
; ALL: # BB#0:
; ALL-NEXT: retq
  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 3>
  %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> <i32 4, i32 6, i32 2, i32 3>
  ret <4 x float> %2
}

; These tests are designed to test the ability to combine away unnecessary
; operations feeding into a shuffle. The AVX cases are the important ones as
; they leverage operations which cannot be done naturally on the entire vector
; and thus are decomposed into multiple smaller operations.

; The shuffle only reads elements 4-7 of %b (the high 128-bit half), so the
; add on the low half is dead: SSE should add only xmm1, and AVX1 should
; extract/add/reinsert just the high subvector.
define <8 x i32> @combine_unneeded_subvector1(<8 x i32> %a) {
; SSE-LABEL: combine_unneeded_subvector1:
; SSE:       # BB#0:
; SSE-NEXT:    paddd {{.*}}(%rip), %xmm1
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[3,2,1,0]
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    retq
;
; AVX1-LABEL: combine_unneeded_subvector1:
; AVX1:       # BB#0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpaddd {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_unneeded_subvector1:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpaddd {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [7,6,5,4,7,6,5,4]
; AVX2-NEXT:    vpermd %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    retq
  %b = add <8 x i32> %a, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
  %c = shufflevector <8 x i32> %b, <8 x i32> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 7, i32 6, i32 5, i32 4>
  ret <8 x i32> %c
}

; Two-input variant: the result uses the high half of %b and the high half of
; %c (= %a + constant), so again only the high half of the add is needed.
define <8 x i32> @combine_unneeded_subvector2(<8 x i32> %a, <8 x i32> %b) {
; SSE-LABEL: combine_unneeded_subvector2:
; SSE:       # BB#0:
; SSE-NEXT:    paddd {{.*}}(%rip), %xmm1
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[3,2,1,0]
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[3,2,1,0]
; SSE-NEXT:    retq
;
; AVX1-LABEL: combine_unneeded_subvector2:
; AVX1:       # BB#0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpaddd {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_unneeded_subvector2:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpaddd {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; AVX2-NEXT:    retq
  %c = add <8 x i32> %a, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
  %d = shufflevector <8 x i32> %b, <8 x i32> %c, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 15, i32 14, i32 13, i32 12>
  ret <8 x i32> %d
}

; %c = [a0,b2,a2,b0]; %d = [%c[1],a1,%c[2],a3] = [b2,a1,a2,a3].  On SSE4.1+
; the shuffle pair should collapse to one insertps of b[2] into element 0.
define <4 x float> @combine_insertps1(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: combine_insertps1:
; SSE2:       # BB#0:
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[1,0]
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[2,3]
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_insertps1:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[1,0]
; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[2,3]
; SSSE3-NEXT:    movaps %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_insertps1:
; SSE41:       # BB#0:
; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm1[2],xmm0[1,2,3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_insertps1:
; AVX:       # BB#0:
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[2],xmm0[1,2,3]
; AVX-NEXT:    retq

  %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32><i32 0, i32 6, i32 2, i32 4>
  %d = shufflevector <4 x float> %a, <4 x float> %c, <4 x i32> <i32 5, i32 1, i32 6, i32 3>
  ret <4 x float> %d
}

; %c = [a0,a1,b2,b3]; %d = [%c[0],%c[2],a2,a3] = [a0,b2,a2,a3] — a single
; insertps of b[2] into element 1 on SSE4.1+.
define <4 x float> @combine_insertps2(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: combine_insertps2:
; SSE2:       # BB#0:
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[0,0]
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3]
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_insertps2:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[0,0]
; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3]
; SSSE3-NEXT:    movaps %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_insertps2:
; SSE41:       # BB#0:
; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],xmm1[2],xmm0[2,3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_insertps2:
; AVX:       # BB#0:
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[2],xmm0[2,3]
; AVX-NEXT:    retq

  %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32><i32 0, i32 1, i32 6, i32 7>
  %d = shufflevector <4 x float> %a, <4 x float> %c, <4 x i32> <i32 4, i32 6, i32 2, i32 3>
  ret <4 x float> %d
}

; %c = [a0,b0,a2,b1]; %d = [%c[0],a1,%c[1],a3] = [a0,a1,b0,a3] — a single
; insertps of b[0] into element 2 on SSE4.1+.
define <4 x float> @combine_insertps3(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: combine_insertps3:
; SSE2:       # BB#0:
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_insertps3:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_insertps3:
; SSE41:       # BB#0:
; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_insertps3:
; AVX:       # BB#0:
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
; AVX-NEXT:    retq

  %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32><i32 0, i32 4, i32 2, i32 5>
  %d = shufflevector <4 x float> %a, <4 x float> %c, <4 x i32><i32 4, i32 1, i32 5, i32 3>
  ret <4 x float> %d
}

; %c = [a0,b0,a2,b1]; %d = [%c[0],a1,%c[2],%c[1]] = [a0,a1,a2,b0] — a single
; insertps of b[0] into element 3 on SSE4.1+.
define <4 x float> @combine_insertps4(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: combine_insertps4:
; SSE2:       # BB#0:
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0]
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_insertps4:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0]
; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_insertps4:
; SSE41:       # BB#0:
; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_insertps4:
; AVX:       # BB#0:
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
; AVX-NEXT:    retq

  %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32><i32 0, i32 4, i32 2, i32 5>
  %d = shufflevector <4 x float> %a, <4 x float> %c, <4 x i32><i32 4, i32 1, i32 6, i32 5>
  ret <4 x float> %d
}

; Regression test for PR22377: two single-input shuffles feeding an fadd and
; an interleaving shuffle of the sum with one of the operands; the combiner
; must keep %s2 alive for both uses rather than folding it away.
define <4 x float> @PR22377(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: PR22377:
; SSE:       # BB#0: # %entry
; SSE-NEXT:    movaps %xmm0, %xmm1
; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,3,1,3]
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,0,2]
; SSE-NEXT:    addps %xmm0, %xmm1
; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT:    retq
;
; AVX-LABEL: PR22377:
; AVX:       # BB#0: # %entry
; AVX-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,3,1,3]
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,2,0,2]
; AVX-NEXT:    vaddps %xmm0, %xmm1, %xmm1
; AVX-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX-NEXT:    retq
entry:
  %s1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 1, i32 3>
  %s2 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 2, i32 0, i32 2>
  %r2 = fadd <4 x float> %s1, %s2
  %s3 = shufflevector <4 x float> %s2, <4 x float> %r2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
  ret <4 x float> %s3
}

; Regression test for PR22390: %s1 is used both directly by the fadd and as
; an input to %s2 (which replaces its lane 0 with b[0]); the combiner must
; not merge the two shuffles in a way that loses the shared %s1 value.
define <4 x float> @PR22390(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: PR22390:
; SSE2:       # BB#0: # %entry
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,0,1,2]
; SSE2-NEXT:    movaps %xmm0, %xmm2
; SSE2-NEXT:    movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
; SSE2-NEXT:    addps %xmm0, %xmm2
; SSE2-NEXT:    movaps %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: PR22390:
; SSSE3:       # BB#0: # %entry
; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,0,1,2]
; SSSE3-NEXT:    movaps %xmm0, %xmm2
; SSSE3-NEXT:    movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
; SSSE3-NEXT:    addps %xmm0, %xmm2
; SSSE3-NEXT:    movaps %xmm2, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: PR22390:
; SSE41:       # BB#0: # %entry
; SSE41-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,0,1,2]
; SSE41-NEXT:    blendps {{.*#+}} xmm1 = xmm1[0],xmm0[1,2,3]
; SSE41-NEXT:    addps %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: PR22390:
; AVX:       # BB#0: # %entry
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,0,1,2]
; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0],xmm0[1,2,3]
; AVX-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
entry:
  %s1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 3, i32 0, i32 1, i32 2>
  %s2 = shufflevector <4 x float> %s1, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
  %r2 = fadd <4 x float> %s1, %s2
  ret <4 x float> %r2
}

; Regression test for PR22412: a 256-bit blend (%s1 = [a0,a1,b2..b7])
; followed by a cross-lane reversal-with-rotate; AVX1 must split it into a
; vperm2f128 + in-lane vshufps, AVX2 can use a single vpermps.
define <8 x float> @PR22412(<8 x float> %a, <8 x float> %b) {
; SSE2-LABEL: PR22412:
; SSE2:       # BB#0: # %entry
; SSE2-NEXT:    movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
; SSE2-NEXT:    movapd %xmm2, %xmm0
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm3[3,2]
; SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,0],xmm2[3,2]
; SSE2-NEXT:    movaps %xmm3, %xmm1
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: PR22412:
; SSSE3:       # BB#0: # %entry
; SSSE3-NEXT:    movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
; SSSE3-NEXT:    movapd %xmm2, %xmm0
; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm3[3,2]
; SSSE3-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,0],xmm2[3,2]
; SSSE3-NEXT:    movaps %xmm3, %xmm1
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: PR22412:
; SSE41:       # BB#0: # %entry
; SSE41-NEXT:    blendpd {{.*#+}} xmm0 = xmm0[0],xmm2[1]
; SSE41-NEXT:    movapd %xmm0, %xmm1
; SSE41-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,0],xmm3[3,2]
; SSE41-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,0],xmm0[3,2]
; SSE41-NEXT:    movaps %xmm1, %xmm0
; SSE41-NEXT:    movaps %xmm3, %xmm1
; SSE41-NEXT:    retq
;
; AVX1-LABEL: PR22412:
; AVX1:       # BB#0: # %entry
; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[1,0],ymm1[3,2],ymm0[5,4],ymm1[7,6]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: PR22412:
; AVX2:       # BB#0: # %entry
; AVX2-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
; AVX2-NEXT:    vmovaps {{.*#+}} ymm1 = [1,0,7,6,5,4,3,2]
; AVX2-NEXT:    vpermps %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    retq
entry:
  %s1 = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 1, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %s2 = shufflevector <8 x float> %s1, <8 x float> undef, <8 x i32> <i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2>
  ret <8 x float> %s2
}