1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSSE3,SSSE3_SLOW 3; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3,fast-hops | FileCheck %s --check-prefixes=SSSE3,SSSE3_FAST 4; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1,AVX1_SLOW 5; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx,fast-hops | FileCheck %s --check-prefixes=AVX,AVX1,AVX1_FAST 6; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2,AVX2_SLOW 7; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2,fast-hops | FileCheck %s --check-prefixes=AVX,AVX2,AVX2_FAST 8 9; The next 8 tests check for matching the horizontal op and eliminating the shuffle. 10; PR34111 - https://bugs.llvm.org/show_bug.cgi?id=34111 11 12define <4 x float> @hadd_v4f32(<4 x float> %a) { 13; SSSE3-LABEL: hadd_v4f32: 14; SSSE3: # %bb.0: 15; SSSE3-NEXT: haddps %xmm0, %xmm0 16; SSSE3-NEXT: retq 17; 18; AVX-LABEL: hadd_v4f32: 19; AVX: # %bb.0: 20; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0 21; AVX-NEXT: retq 22 %a02 = shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> <i32 0, i32 2> 23 %a13 = shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> <i32 1, i32 3> 24 %hop = fadd <2 x float> %a02, %a13 25 %shuf = shufflevector <2 x float> %hop, <2 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 1> 26 ret <4 x float> %shuf 27} 28 29define <8 x float> @hadd_v8f32a(<8 x float> %a) { 30; SSSE3_SLOW-LABEL: hadd_v8f32a: 31; SSSE3_SLOW: # %bb.0: 32; SSSE3_SLOW-NEXT: movaps %xmm0, %xmm2 33; SSSE3_SLOW-NEXT: haddps %xmm1, %xmm2 34; SSSE3_SLOW-NEXT: movddup {{.*#+}} xmm0 = xmm2[0,0] 35; SSSE3_SLOW-NEXT: movaps %xmm2, %xmm1 36; SSSE3_SLOW-NEXT: retq 37; 38; SSSE3_FAST-LABEL: hadd_v8f32a: 39; SSSE3_FAST: # %bb.0: 40; SSSE3_FAST-NEXT: movaps %xmm0, %xmm2 41; SSSE3_FAST-NEXT: haddps %xmm1, %xmm2 42; SSSE3_FAST-NEXT: haddps %xmm0, 
%xmm0 43; SSSE3_FAST-NEXT: movaps %xmm2, %xmm1 44; SSSE3_FAST-NEXT: retq 45; 46; AVX1_SLOW-LABEL: hadd_v8f32a: 47; AVX1_SLOW: # %bb.0: 48; AVX1_SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1 49; AVX1_SLOW-NEXT: vhaddps %xmm1, %xmm0, %xmm0 50; AVX1_SLOW-NEXT: vmovddup {{.*#+}} xmm1 = xmm0[0,0] 51; AVX1_SLOW-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 52; AVX1_SLOW-NEXT: retq 53; 54; AVX1_FAST-LABEL: hadd_v8f32a: 55; AVX1_FAST: # %bb.0: 56; AVX1_FAST-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm1 57; AVX1_FAST-NEXT: vhaddps %ymm0, %ymm1, %ymm0 58; AVX1_FAST-NEXT: retq 59; 60; AVX2-LABEL: hadd_v8f32a: 61; AVX2: # %bb.0: 62; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 63; AVX2-NEXT: vhaddps %xmm1, %xmm0, %xmm0 64; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,2,1] 65; AVX2-NEXT: retq 66 %a0 = shufflevector <8 x float> %a, <8 x float> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6> 67 %a1 = shufflevector <8 x float> %a, <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7> 68 %hop = fadd <4 x float> %a0, %a1 69 %shuf = shufflevector <4 x float> %hop, <4 x float> undef, <8 x i32> <i32 undef, i32 undef, i32 0, i32 1, i32 undef, i32 undef, i32 2, i32 3> 70 ret <8 x float> %shuf 71} 72 73define <8 x float> @hadd_v8f32b(<8 x float> %a) { 74; SSSE3-LABEL: hadd_v8f32b: 75; SSSE3: # %bb.0: 76; SSSE3-NEXT: haddps %xmm0, %xmm0 77; SSSE3-NEXT: haddps %xmm1, %xmm1 78; SSSE3-NEXT: retq 79; 80; AVX-LABEL: hadd_v8f32b: 81; AVX: # %bb.0: 82; AVX-NEXT: vhaddps %ymm0, %ymm0, %ymm0 83; AVX-NEXT: retq 84 %a0 = shufflevector <8 x float> %a, <8 x float> undef, <8 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 4, i32 6, i32 undef, i32 undef> 85 %a1 = shufflevector <8 x float> %a, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 undef, i32 undef, i32 5, i32 7, i32 undef, i32 undef> 86 %hop = fadd <8 x float> %a0, %a1 87 %shuf = shufflevector <8 x float> %hop, <8 x float> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5> 88 ret <8 x float> %shuf 89} 90 91define <4 x float> 
@hsub_v4f32(<4 x float> %a) { 92; SSSE3-LABEL: hsub_v4f32: 93; SSSE3: # %bb.0: 94; SSSE3-NEXT: hsubps %xmm0, %xmm0 95; SSSE3-NEXT: retq 96; 97; AVX-LABEL: hsub_v4f32: 98; AVX: # %bb.0: 99; AVX-NEXT: vhsubps %xmm0, %xmm0, %xmm0 100; AVX-NEXT: retq 101 %a02 = shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> <i32 0, i32 2> 102 %a13 = shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> <i32 1, i32 3> 103 %hop = fsub <2 x float> %a02, %a13 104 %shuf = shufflevector <2 x float> %hop, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1> 105 ret <4 x float> %shuf 106} 107 108define <8 x float> @hsub_v8f32a(<8 x float> %a) { 109; SSSE3_SLOW-LABEL: hsub_v8f32a: 110; SSSE3_SLOW: # %bb.0: 111; SSSE3_SLOW-NEXT: movaps %xmm0, %xmm2 112; SSSE3_SLOW-NEXT: hsubps %xmm1, %xmm2 113; SSSE3_SLOW-NEXT: movddup {{.*#+}} xmm0 = xmm2[0,0] 114; SSSE3_SLOW-NEXT: movaps %xmm2, %xmm1 115; SSSE3_SLOW-NEXT: retq 116; 117; SSSE3_FAST-LABEL: hsub_v8f32a: 118; SSSE3_FAST: # %bb.0: 119; SSSE3_FAST-NEXT: movaps %xmm0, %xmm2 120; SSSE3_FAST-NEXT: hsubps %xmm1, %xmm2 121; SSSE3_FAST-NEXT: hsubps %xmm0, %xmm0 122; SSSE3_FAST-NEXT: movaps %xmm2, %xmm1 123; SSSE3_FAST-NEXT: retq 124; 125; AVX1_SLOW-LABEL: hsub_v8f32a: 126; AVX1_SLOW: # %bb.0: 127; AVX1_SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1 128; AVX1_SLOW-NEXT: vhsubps %xmm1, %xmm0, %xmm0 129; AVX1_SLOW-NEXT: vmovddup {{.*#+}} xmm1 = xmm0[0,0] 130; AVX1_SLOW-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 131; AVX1_SLOW-NEXT: retq 132; 133; AVX1_FAST-LABEL: hsub_v8f32a: 134; AVX1_FAST: # %bb.0: 135; AVX1_FAST-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm1 136; AVX1_FAST-NEXT: vhsubps %ymm0, %ymm1, %ymm0 137; AVX1_FAST-NEXT: retq 138; 139; AVX2-LABEL: hsub_v8f32a: 140; AVX2: # %bb.0: 141; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 142; AVX2-NEXT: vhsubps %xmm1, %xmm0, %xmm0 143; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,2,1] 144; AVX2-NEXT: retq 145 %a0 = shufflevector <8 x float> %a, <8 x float> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6> 
146 %a1 = shufflevector <8 x float> %a, <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7> 147 %hop = fsub <4 x float> %a0, %a1 148 %shuf = shufflevector <4 x float> %hop, <4 x float> undef, <8 x i32> <i32 undef, i32 undef, i32 0, i32 1, i32 undef, i32 undef, i32 2, i32 3> 149 ret <8 x float> %shuf 150} 151 152define <8 x float> @hsub_v8f32b(<8 x float> %a) { 153; SSSE3-LABEL: hsub_v8f32b: 154; SSSE3: # %bb.0: 155; SSSE3-NEXT: hsubps %xmm0, %xmm0 156; SSSE3-NEXT: hsubps %xmm1, %xmm1 157; SSSE3-NEXT: retq 158; 159; AVX-LABEL: hsub_v8f32b: 160; AVX: # %bb.0: 161; AVX-NEXT: vhsubps %ymm0, %ymm0, %ymm0 162; AVX-NEXT: retq 163 %a0 = shufflevector <8 x float> %a, <8 x float> undef, <8 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 4, i32 6, i32 undef, i32 undef> 164 %a1 = shufflevector <8 x float> %a, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 undef, i32 undef, i32 5, i32 7, i32 undef, i32 undef> 165 %hop = fsub <8 x float> %a0, %a1 166 %shuf = shufflevector <8 x float> %hop, <8 x float> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5> 167 ret <8 x float> %shuf 168} 169 170define <2 x double> @hadd_v2f64(<2 x double> %a) { 171; SSSE3_SLOW-LABEL: hadd_v2f64: 172; SSSE3_SLOW: # %bb.0: 173; SSSE3_SLOW-NEXT: movapd %xmm0, %xmm1 174; SSSE3_SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] 175; SSSE3_SLOW-NEXT: addsd %xmm0, %xmm1 176; SSSE3_SLOW-NEXT: movddup {{.*#+}} xmm0 = xmm1[0,0] 177; SSSE3_SLOW-NEXT: retq 178; 179; SSSE3_FAST-LABEL: hadd_v2f64: 180; SSSE3_FAST: # %bb.0: 181; SSSE3_FAST-NEXT: haddpd %xmm0, %xmm0 182; SSSE3_FAST-NEXT: retq 183; 184; AVX1_SLOW-LABEL: hadd_v2f64: 185; AVX1_SLOW: # %bb.0: 186; AVX1_SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] 187; AVX1_SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0 188; AVX1_SLOW-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] 189; AVX1_SLOW-NEXT: retq 190; 191; AVX1_FAST-LABEL: hadd_v2f64: 192; AVX1_FAST: # %bb.0: 193; AVX1_FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0 194; AVX1_FAST-NEXT: retq 195; 196; 
AVX2_SLOW-LABEL: hadd_v2f64: 197; AVX2_SLOW: # %bb.0: 198; AVX2_SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] 199; AVX2_SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0 200; AVX2_SLOW-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] 201; AVX2_SLOW-NEXT: retq 202; 203; AVX2_FAST-LABEL: hadd_v2f64: 204; AVX2_FAST: # %bb.0: 205; AVX2_FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0 206; AVX2_FAST-NEXT: retq 207 %a0 = shufflevector <2 x double> %a, <2 x double> undef, <2 x i32> <i32 0, i32 undef> 208 %a1 = shufflevector <2 x double> %a, <2 x double> undef, <2 x i32> <i32 1, i32 undef> 209 %hop = fadd <2 x double> %a0, %a1 210 %shuf = shufflevector <2 x double> %hop, <2 x double> undef, <2 x i32> <i32 0, i32 0> 211 ret <2 x double> %shuf 212} 213 214define <2 x double> @hadd_v2f64_scalar_splat(<2 x double> %a) { 215; SSSE3_SLOW-LABEL: hadd_v2f64_scalar_splat: 216; SSSE3_SLOW: # %bb.0: 217; SSSE3_SLOW-NEXT: movapd %xmm0, %xmm1 218; SSSE3_SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] 219; SSSE3_SLOW-NEXT: addsd %xmm0, %xmm1 220; SSSE3_SLOW-NEXT: movddup {{.*#+}} xmm0 = xmm1[0,0] 221; SSSE3_SLOW-NEXT: retq 222; 223; SSSE3_FAST-LABEL: hadd_v2f64_scalar_splat: 224; SSSE3_FAST: # %bb.0: 225; SSSE3_FAST-NEXT: haddpd %xmm0, %xmm0 226; SSSE3_FAST-NEXT: retq 227; 228; AVX1_SLOW-LABEL: hadd_v2f64_scalar_splat: 229; AVX1_SLOW: # %bb.0: 230; AVX1_SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] 231; AVX1_SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0 232; AVX1_SLOW-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] 233; AVX1_SLOW-NEXT: retq 234; 235; AVX1_FAST-LABEL: hadd_v2f64_scalar_splat: 236; AVX1_FAST: # %bb.0: 237; AVX1_FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0 238; AVX1_FAST-NEXT: retq 239; 240; AVX2_SLOW-LABEL: hadd_v2f64_scalar_splat: 241; AVX2_SLOW: # %bb.0: 242; AVX2_SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] 243; AVX2_SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0 244; AVX2_SLOW-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] 245; AVX2_SLOW-NEXT: retq 246; 247; AVX2_FAST-LABEL: hadd_v2f64_scalar_splat: 248; AVX2_FAST: # %bb.0: 
249; AVX2_FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0 250; AVX2_FAST-NEXT: retq 251 %a0 = extractelement <2 x double> %a, i32 0 252 %a1 = extractelement <2 x double> %a, i32 1 253 %hop = fadd double %a0, %a1 254 %ins = insertelement <2 x double> undef, double %hop, i32 0 255 %shuf = shufflevector <2 x double> %ins, <2 x double> undef, <2 x i32> <i32 0, i32 0> 256 ret <2 x double> %shuf 257} 258 259define <4 x double> @hadd_v4f64_scalar_splat(<4 x double> %a) { 260; SSSE3_SLOW-LABEL: hadd_v4f64_scalar_splat: 261; SSSE3_SLOW: # %bb.0: 262; SSSE3_SLOW-NEXT: movapd %xmm0, %xmm2 263; SSSE3_SLOW-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] 264; SSSE3_SLOW-NEXT: addsd %xmm0, %xmm2 265; SSSE3_SLOW-NEXT: movapd %xmm1, %xmm3 266; SSSE3_SLOW-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1] 267; SSSE3_SLOW-NEXT: addsd %xmm1, %xmm3 268; SSSE3_SLOW-NEXT: movddup {{.*#+}} xmm0 = xmm2[0,0] 269; SSSE3_SLOW-NEXT: movddup {{.*#+}} xmm1 = xmm3[0,0] 270; SSSE3_SLOW-NEXT: retq 271; 272; SSSE3_FAST-LABEL: hadd_v4f64_scalar_splat: 273; SSSE3_FAST: # %bb.0: 274; SSSE3_FAST-NEXT: haddpd %xmm0, %xmm0 275; SSSE3_FAST-NEXT: haddpd %xmm1, %xmm1 276; SSSE3_FAST-NEXT: retq 277; 278; AVX-LABEL: hadd_v4f64_scalar_splat: 279; AVX: # %bb.0: 280; AVX-NEXT: vhaddpd %ymm0, %ymm0, %ymm0 281; AVX-NEXT: retq 282 %a0 = extractelement <4 x double> %a, i32 0 283 %a1 = extractelement <4 x double> %a, i32 1 284 %hop0 = fadd double %a0, %a1 285 %a2 = extractelement <4 x double> %a, i32 2 286 %a3 = extractelement <4 x double> %a, i32 3 287 %hop1 = fadd double %a2, %a3 288 %ins = insertelement <4 x double> undef, double %hop0, i32 0 289 %ins2 = insertelement <4 x double> %ins, double %hop1, i32 2 290 %shuf = shufflevector <4 x double> %ins2, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2> 291 ret <4 x double> %shuf 292} 293 294define <4 x double> @hadd_v4f64_scalar_broadcast(<4 x double> %a) { 295; SSSE3_SLOW-LABEL: hadd_v4f64_scalar_broadcast: 296; SSSE3_SLOW: # %bb.0: 297; SSSE3_SLOW-NEXT: movapd %xmm0, 
%xmm1 298; SSSE3_SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] 299; SSSE3_SLOW-NEXT: addsd %xmm0, %xmm1 300; SSSE3_SLOW-NEXT: movddup {{.*#+}} xmm0 = xmm1[0,0] 301; SSSE3_SLOW-NEXT: movapd %xmm0, %xmm1 302; SSSE3_SLOW-NEXT: retq 303; 304; SSSE3_FAST-LABEL: hadd_v4f64_scalar_broadcast: 305; SSSE3_FAST: # %bb.0: 306; SSSE3_FAST-NEXT: haddpd %xmm0, %xmm0 307; SSSE3_FAST-NEXT: movapd %xmm0, %xmm1 308; SSSE3_FAST-NEXT: retq 309; 310; AVX1_SLOW-LABEL: hadd_v4f64_scalar_broadcast: 311; AVX1_SLOW: # %bb.0: 312; AVX1_SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] 313; AVX1_SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0 314; AVX1_SLOW-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] 315; AVX1_SLOW-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 316; AVX1_SLOW-NEXT: retq 317; 318; AVX1_FAST-LABEL: hadd_v4f64_scalar_broadcast: 319; AVX1_FAST: # %bb.0: 320; AVX1_FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0 321; AVX1_FAST-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 322; AVX1_FAST-NEXT: retq 323; 324; AVX2_SLOW-LABEL: hadd_v4f64_scalar_broadcast: 325; AVX2_SLOW: # %bb.0: 326; AVX2_SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] 327; AVX2_SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0 328; AVX2_SLOW-NEXT: vbroadcastsd %xmm0, %ymm0 329; AVX2_SLOW-NEXT: retq 330; 331; AVX2_FAST-LABEL: hadd_v4f64_scalar_broadcast: 332; AVX2_FAST: # %bb.0: 333; AVX2_FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0 334; AVX2_FAST-NEXT: vbroadcastsd %xmm0, %ymm0 335; AVX2_FAST-NEXT: retq 336 %a0 = extractelement <4 x double> %a, i32 0 337 %a1 = extractelement <4 x double> %a, i32 1 338 %hop0 = fadd double %a0, %a1 339 %a2 = extractelement <4 x double> %a, i32 2 340 %a3 = extractelement <4 x double> %a, i32 3 341 %hop1 = fadd double %a2, %a3 342 %ins = insertelement <4 x double> undef, double %hop0, i32 0 343 %ins2 = insertelement <4 x double> %ins, double %hop1, i32 2 344 %shuf = shufflevector <4 x double> %ins2, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0> 345 ret <4 x double> %shuf 346} 347 348define <4 x double> @hadd_v4f64(<4 x 
double> %a) { 349; SSSE3_SLOW-LABEL: hadd_v4f64: 350; SSSE3_SLOW: # %bb.0: 351; SSSE3_SLOW-NEXT: movapd %xmm0, %xmm2 352; SSSE3_SLOW-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] 353; SSSE3_SLOW-NEXT: addsd %xmm0, %xmm2 354; SSSE3_SLOW-NEXT: movddup {{.*#+}} xmm0 = xmm2[0,0] 355; SSSE3_SLOW-NEXT: movapd %xmm1, %xmm2 356; SSSE3_SLOW-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] 357; SSSE3_SLOW-NEXT: addsd %xmm1, %xmm2 358; SSSE3_SLOW-NEXT: movddup {{.*#+}} xmm1 = xmm2[0,0] 359; SSSE3_SLOW-NEXT: retq 360; 361; SSSE3_FAST-LABEL: hadd_v4f64: 362; SSSE3_FAST: # %bb.0: 363; SSSE3_FAST-NEXT: haddpd %xmm0, %xmm0 364; SSSE3_FAST-NEXT: haddpd %xmm1, %xmm1 365; SSSE3_FAST-NEXT: retq 366; 367; AVX1_SLOW-LABEL: hadd_v4f64: 368; AVX1_SLOW: # %bb.0: 369; AVX1_SLOW-NEXT: vpermilpd {{.*#+}} ymm1 = ymm0[1,0,3,2] 370; AVX1_SLOW-NEXT: vaddpd %ymm1, %ymm0, %ymm0 371; AVX1_SLOW-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2] 372; AVX1_SLOW-NEXT: retq 373; 374; AVX1_FAST-LABEL: hadd_v4f64: 375; AVX1_FAST: # %bb.0: 376; AVX1_FAST-NEXT: vhaddpd %ymm0, %ymm0, %ymm0 377; AVX1_FAST-NEXT: retq 378; 379; AVX2_SLOW-LABEL: hadd_v4f64: 380; AVX2_SLOW: # %bb.0: 381; AVX2_SLOW-NEXT: vpermilpd {{.*#+}} ymm1 = ymm0[1,0,3,2] 382; AVX2_SLOW-NEXT: vaddpd %ymm1, %ymm0, %ymm0 383; AVX2_SLOW-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2] 384; AVX2_SLOW-NEXT: retq 385; 386; AVX2_FAST-LABEL: hadd_v4f64: 387; AVX2_FAST: # %bb.0: 388; AVX2_FAST-NEXT: vhaddpd %ymm0, %ymm0, %ymm0 389; AVX2_FAST-NEXT: retq 390 %a0 = shufflevector <4 x double> %a, <4 x double> undef, <4 x i32> <i32 0, i32 undef, i32 2, i32 undef> 391 %a1 = shufflevector <4 x double> %a, <4 x double> undef, <4 x i32> <i32 1, i32 undef, i32 3, i32 undef> 392 %hop = fadd <4 x double> %a0, %a1 393 %shuf = shufflevector <4 x double> %hop, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2> 394 ret <4 x double> %shuf 395} 396 397define <2 x double> @hsub_v2f64(<2 x double> %a) { 398; SSSE3_SLOW-LABEL: hsub_v2f64: 399; SSSE3_SLOW: # %bb.0: 400; 
SSSE3_SLOW-NEXT: movapd %xmm0, %xmm1 401; SSSE3_SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] 402; SSSE3_SLOW-NEXT: subsd %xmm1, %xmm0 403; SSSE3_SLOW-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0] 404; SSSE3_SLOW-NEXT: retq 405; 406; SSSE3_FAST-LABEL: hsub_v2f64: 407; SSSE3_FAST: # %bb.0: 408; SSSE3_FAST-NEXT: hsubpd %xmm0, %xmm0 409; SSSE3_FAST-NEXT: retq 410; 411; AVX1_SLOW-LABEL: hsub_v2f64: 412; AVX1_SLOW: # %bb.0: 413; AVX1_SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] 414; AVX1_SLOW-NEXT: vsubsd %xmm1, %xmm0, %xmm0 415; AVX1_SLOW-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] 416; AVX1_SLOW-NEXT: retq 417; 418; AVX1_FAST-LABEL: hsub_v2f64: 419; AVX1_FAST: # %bb.0: 420; AVX1_FAST-NEXT: vhsubpd %xmm0, %xmm0, %xmm0 421; AVX1_FAST-NEXT: retq 422; 423; AVX2_SLOW-LABEL: hsub_v2f64: 424; AVX2_SLOW: # %bb.0: 425; AVX2_SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] 426; AVX2_SLOW-NEXT: vsubsd %xmm1, %xmm0, %xmm0 427; AVX2_SLOW-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] 428; AVX2_SLOW-NEXT: retq 429; 430; AVX2_FAST-LABEL: hsub_v2f64: 431; AVX2_FAST: # %bb.0: 432; AVX2_FAST-NEXT: vhsubpd %xmm0, %xmm0, %xmm0 433; AVX2_FAST-NEXT: retq 434 %a0 = shufflevector <2 x double> %a, <2 x double> undef, <2 x i32> <i32 0, i32 undef> 435 %a1 = shufflevector <2 x double> %a, <2 x double> undef, <2 x i32> <i32 1, i32 undef> 436 %hop = fsub <2 x double> %a0, %a1 437 %shuf = shufflevector <2 x double> %hop, <2 x double> undef, <2 x i32> <i32 undef, i32 0> 438 ret <2 x double> %shuf 439} 440 441define <4 x double> @hsub_v4f64(<4 x double> %a) { 442; SSSE3_SLOW-LABEL: hsub_v4f64: 443; SSSE3_SLOW: # %bb.0: 444; SSSE3_SLOW-NEXT: movapd %xmm0, %xmm2 445; SSSE3_SLOW-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] 446; SSSE3_SLOW-NEXT: subsd %xmm2, %xmm0 447; SSSE3_SLOW-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0] 448; SSSE3_SLOW-NEXT: movapd %xmm1, %xmm2 449; SSSE3_SLOW-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] 450; SSSE3_SLOW-NEXT: subsd %xmm2, %xmm1 451; SSSE3_SLOW-NEXT: movddup {{.*#+}} xmm1 = 
xmm1[0,0] 452; SSSE3_SLOW-NEXT: retq 453; 454; SSSE3_FAST-LABEL: hsub_v4f64: 455; SSSE3_FAST: # %bb.0: 456; SSSE3_FAST-NEXT: hsubpd %xmm0, %xmm0 457; SSSE3_FAST-NEXT: hsubpd %xmm1, %xmm1 458; SSSE3_FAST-NEXT: retq 459; 460; AVX1_SLOW-LABEL: hsub_v4f64: 461; AVX1_SLOW: # %bb.0: 462; AVX1_SLOW-NEXT: vpermilpd {{.*#+}} ymm1 = ymm0[1,0,3,2] 463; AVX1_SLOW-NEXT: vsubpd %ymm1, %ymm0, %ymm0 464; AVX1_SLOW-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2] 465; AVX1_SLOW-NEXT: retq 466; 467; AVX1_FAST-LABEL: hsub_v4f64: 468; AVX1_FAST: # %bb.0: 469; AVX1_FAST-NEXT: vhsubpd %ymm0, %ymm0, %ymm0 470; AVX1_FAST-NEXT: retq 471; 472; AVX2_SLOW-LABEL: hsub_v4f64: 473; AVX2_SLOW: # %bb.0: 474; AVX2_SLOW-NEXT: vpermilpd {{.*#+}} ymm1 = ymm0[1,0,3,2] 475; AVX2_SLOW-NEXT: vsubpd %ymm1, %ymm0, %ymm0 476; AVX2_SLOW-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2] 477; AVX2_SLOW-NEXT: retq 478; 479; AVX2_FAST-LABEL: hsub_v4f64: 480; AVX2_FAST: # %bb.0: 481; AVX2_FAST-NEXT: vhsubpd %ymm0, %ymm0, %ymm0 482; AVX2_FAST-NEXT: retq 483 %a0 = shufflevector <4 x double> %a, <4 x double> undef, <4 x i32> <i32 0, i32 undef, i32 2, i32 undef> 484 %a1 = shufflevector <4 x double> %a, <4 x double> undef, <4 x i32> <i32 1, i32 undef, i32 3, i32 undef> 485 %hop = fsub <4 x double> %a0, %a1 486 %shuf = shufflevector <4 x double> %hop, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2> 487 ret <4 x double> %shuf 488} 489 490define <4 x i32> @hadd_v4i32(<4 x i32> %a) { 491; SSSE3-LABEL: hadd_v4i32: 492; SSSE3: # %bb.0: 493; SSSE3-NEXT: phaddd %xmm0, %xmm0 494; SSSE3-NEXT: retq 495; 496; AVX-LABEL: hadd_v4i32: 497; AVX: # %bb.0: 498; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0 499; AVX-NEXT: retq 500 %a02 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef> 501 %a13 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef> 502 %hop = add <4 x i32> %a02, %a13 503 %shuf = shufflevector <4 x i32> %hop, <4 x i32> undef, <4 x i32> <i32 0, i32 
undef, i32 undef, i32 1> 504 ret <4 x i32> %shuf 505} 506 507define <8 x i32> @hadd_v8i32a(<8 x i32> %a) { 508; SSSE3_SLOW-LABEL: hadd_v8i32a: 509; SSSE3_SLOW: # %bb.0: 510; SSSE3_SLOW-NEXT: movdqa %xmm0, %xmm2 511; SSSE3_SLOW-NEXT: phaddd %xmm1, %xmm2 512; SSSE3_SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,1,0,1] 513; SSSE3_SLOW-NEXT: movdqa %xmm2, %xmm1 514; SSSE3_SLOW-NEXT: retq 515; 516; SSSE3_FAST-LABEL: hadd_v8i32a: 517; SSSE3_FAST: # %bb.0: 518; SSSE3_FAST-NEXT: movdqa %xmm0, %xmm2 519; SSSE3_FAST-NEXT: phaddd %xmm1, %xmm2 520; SSSE3_FAST-NEXT: phaddd %xmm0, %xmm0 521; SSSE3_FAST-NEXT: movdqa %xmm2, %xmm1 522; SSSE3_FAST-NEXT: retq 523; 524; AVX1_SLOW-LABEL: hadd_v8i32a: 525; AVX1_SLOW: # %bb.0: 526; AVX1_SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1 527; AVX1_SLOW-NEXT: vphaddd %xmm1, %xmm0, %xmm0 528; AVX1_SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,1,0,1] 529; AVX1_SLOW-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 530; AVX1_SLOW-NEXT: retq 531; 532; AVX1_FAST-LABEL: hadd_v8i32a: 533; AVX1_FAST: # %bb.0: 534; AVX1_FAST-NEXT: vextractf128 $1, %ymm0, %xmm1 535; AVX1_FAST-NEXT: vphaddd %xmm1, %xmm0, %xmm1 536; AVX1_FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 537; AVX1_FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 538; AVX1_FAST-NEXT: retq 539; 540; AVX2-LABEL: hadd_v8i32a: 541; AVX2: # %bb.0: 542; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 543; AVX2-NEXT: vphaddd %xmm1, %xmm0, %xmm0 544; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] 545; AVX2-NEXT: retq 546 %a0 = shufflevector <8 x i32> %a, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6> 547 %a1 = shufflevector <8 x i32> %a, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7> 548 %hop = add <4 x i32> %a0, %a1 549 %shuf = shufflevector <4 x i32> %hop, <4 x i32> undef, <8 x i32> <i32 undef, i32 undef, i32 0, i32 1, i32 undef, i32 undef, i32 2, i32 3> 550 ret <8 x i32> %shuf 551} 552 553define <8 x i32> @hadd_v8i32b(<8 x i32> %a) { 554; SSSE3-LABEL: hadd_v8i32b: 555; SSSE3: # %bb.0: 556; SSSE3-NEXT: phaddd %xmm0, 
%xmm0 557; SSSE3-NEXT: phaddd %xmm1, %xmm1 558; SSSE3-NEXT: retq 559; 560; AVX1-LABEL: hadd_v8i32b: 561; AVX1: # %bb.0: 562; AVX1-NEXT: vphaddd %xmm0, %xmm0, %xmm1 563; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 564; AVX1-NEXT: vphaddd %xmm0, %xmm0, %xmm0 565; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 566; AVX1-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2] 567; AVX1-NEXT: retq 568; 569; AVX2-LABEL: hadd_v8i32b: 570; AVX2: # %bb.0: 571; AVX2-NEXT: vphaddd %ymm0, %ymm0, %ymm0 572; AVX2-NEXT: retq 573 %a0 = shufflevector <8 x i32> %a, <8 x i32> undef, <8 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 4, i32 6, i32 undef, i32 undef> 574 %a1 = shufflevector <8 x i32> %a, <8 x i32> undef, <8 x i32> <i32 1, i32 3, i32 undef, i32 undef, i32 5, i32 7, i32 undef, i32 undef> 575 %hop = add <8 x i32> %a0, %a1 576 %shuf = shufflevector <8 x i32> %hop, <8 x i32> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5> 577 ret <8 x i32> %shuf 578} 579 580define <4 x i32> @hsub_v4i32(<4 x i32> %a) { 581; SSSE3-LABEL: hsub_v4i32: 582; SSSE3: # %bb.0: 583; SSSE3-NEXT: phsubd %xmm0, %xmm0 584; SSSE3-NEXT: retq 585; 586; AVX-LABEL: hsub_v4i32: 587; AVX: # %bb.0: 588; AVX-NEXT: vphsubd %xmm0, %xmm0, %xmm0 589; AVX-NEXT: retq 590 %a02 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef> 591 %a13 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef> 592 %hop = sub <4 x i32> %a02, %a13 593 %shuf = shufflevector <4 x i32> %hop, <4 x i32> undef, <4 x i32> <i32 undef, i32 1, i32 0, i32 undef> 594 ret <4 x i32> %shuf 595} 596 597define <8 x i32> @hsub_v8i32a(<8 x i32> %a) { 598; SSSE3_SLOW-LABEL: hsub_v8i32a: 599; SSSE3_SLOW: # %bb.0: 600; SSSE3_SLOW-NEXT: movdqa %xmm0, %xmm2 601; SSSE3_SLOW-NEXT: phsubd %xmm1, %xmm2 602; SSSE3_SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,1,0,1] 603; SSSE3_SLOW-NEXT: movdqa %xmm2, %xmm1 604; SSSE3_SLOW-NEXT: retq 605; 606; SSSE3_FAST-LABEL: hsub_v8i32a: 607; 
SSSE3_FAST: # %bb.0: 608; SSSE3_FAST-NEXT: movdqa %xmm0, %xmm2 609; SSSE3_FAST-NEXT: phsubd %xmm1, %xmm2 610; SSSE3_FAST-NEXT: phsubd %xmm0, %xmm0 611; SSSE3_FAST-NEXT: movdqa %xmm2, %xmm1 612; SSSE3_FAST-NEXT: retq 613; 614; AVX1_SLOW-LABEL: hsub_v8i32a: 615; AVX1_SLOW: # %bb.0: 616; AVX1_SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1 617; AVX1_SLOW-NEXT: vphsubd %xmm1, %xmm0, %xmm0 618; AVX1_SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,1,0,1] 619; AVX1_SLOW-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 620; AVX1_SLOW-NEXT: retq 621; 622; AVX1_FAST-LABEL: hsub_v8i32a: 623; AVX1_FAST: # %bb.0: 624; AVX1_FAST-NEXT: vextractf128 $1, %ymm0, %xmm1 625; AVX1_FAST-NEXT: vphsubd %xmm1, %xmm0, %xmm1 626; AVX1_FAST-NEXT: vphsubd %xmm0, %xmm0, %xmm0 627; AVX1_FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 628; AVX1_FAST-NEXT: retq 629; 630; AVX2-LABEL: hsub_v8i32a: 631; AVX2: # %bb.0: 632; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 633; AVX2-NEXT: vphsubd %xmm1, %xmm0, %xmm0 634; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] 635; AVX2-NEXT: retq 636 %a0 = shufflevector <8 x i32> %a, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6> 637 %a1 = shufflevector <8 x i32> %a, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7> 638 %hop = sub <4 x i32> %a0, %a1 639 %shuf = shufflevector <4 x i32> %hop, <4 x i32> undef, <8 x i32> <i32 undef, i32 undef, i32 0, i32 1, i32 undef, i32 undef, i32 2, i32 3> 640 ret <8 x i32> %shuf 641} 642 643define <8 x i32> @hsub_v8i32b(<8 x i32> %a) { 644; SSSE3-LABEL: hsub_v8i32b: 645; SSSE3: # %bb.0: 646; SSSE3-NEXT: phsubd %xmm0, %xmm0 647; SSSE3-NEXT: phsubd %xmm1, %xmm1 648; SSSE3-NEXT: retq 649; 650; AVX1-LABEL: hsub_v8i32b: 651; AVX1: # %bb.0: 652; AVX1-NEXT: vphsubd %xmm0, %xmm0, %xmm1 653; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 654; AVX1-NEXT: vphsubd %xmm0, %xmm0, %xmm0 655; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 656; AVX1-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2] 657; AVX1-NEXT: retq 658; 659; AVX2-LABEL: hsub_v8i32b: 660; AVX2: # 
%bb.0: 661; AVX2-NEXT: vphsubd %ymm0, %ymm0, %ymm0 662; AVX2-NEXT: retq 663 %a0 = shufflevector <8 x i32> %a, <8 x i32> undef, <8 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 4, i32 6, i32 undef, i32 undef> 664 %a1 = shufflevector <8 x i32> %a, <8 x i32> undef, <8 x i32> <i32 1, i32 3, i32 undef, i32 undef, i32 5, i32 7, i32 undef, i32 undef> 665 %hop = sub <8 x i32> %a0, %a1 666 %shuf = shufflevector <8 x i32> %hop, <8 x i32> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5> 667 ret <8 x i32> %shuf 668} 669 670define <8 x i16> @hadd_v8i16(<8 x i16> %a) { 671; SSSE3-LABEL: hadd_v8i16: 672; SSSE3: # %bb.0: 673; SSSE3-NEXT: phaddw %xmm0, %xmm0 674; SSSE3-NEXT: retq 675; 676; AVX-LABEL: hadd_v8i16: 677; AVX: # %bb.0: 678; AVX-NEXT: vphaddw %xmm0, %xmm0, %xmm0 679; AVX-NEXT: retq 680 %a0246 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 undef, i32 undef, i32 undef, i32 undef> 681 %a1357 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 undef, i32 undef, i32 undef, i32 undef> 682 %hop = add <8 x i16> %a0246, %a1357 683 %shuf = shufflevector <8 x i16> %hop, <8 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 0, i32 1, i32 2, i32 3> 684 ret <8 x i16> %shuf 685} 686 687define <16 x i16> @hadd_v16i16a(<16 x i16> %a) { 688; SSSE3_SLOW-LABEL: hadd_v16i16a: 689; SSSE3_SLOW: # %bb.0: 690; SSSE3_SLOW-NEXT: movdqa %xmm0, %xmm2 691; SSSE3_SLOW-NEXT: phaddw %xmm1, %xmm2 692; SSSE3_SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,1,0,1] 693; SSSE3_SLOW-NEXT: movdqa %xmm2, %xmm1 694; SSSE3_SLOW-NEXT: retq 695; 696; SSSE3_FAST-LABEL: hadd_v16i16a: 697; SSSE3_FAST: # %bb.0: 698; SSSE3_FAST-NEXT: movdqa %xmm0, %xmm2 699; SSSE3_FAST-NEXT: phaddw %xmm1, %xmm2 700; SSSE3_FAST-NEXT: phaddw %xmm0, %xmm0 701; SSSE3_FAST-NEXT: movdqa %xmm2, %xmm1 702; SSSE3_FAST-NEXT: retq 703; 704; AVX1_SLOW-LABEL: hadd_v16i16a: 705; AVX1_SLOW: # %bb.0: 706; AVX1_SLOW-NEXT: 
vextractf128 $1, %ymm0, %xmm1 707; AVX1_SLOW-NEXT: vphaddw %xmm1, %xmm0, %xmm0 708; AVX1_SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,1,0,1] 709; AVX1_SLOW-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 710; AVX1_SLOW-NEXT: retq 711; 712; AVX1_FAST-LABEL: hadd_v16i16a: 713; AVX1_FAST: # %bb.0: 714; AVX1_FAST-NEXT: vextractf128 $1, %ymm0, %xmm1 715; AVX1_FAST-NEXT: vphaddw %xmm1, %xmm0, %xmm1 716; AVX1_FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0 717; AVX1_FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 718; AVX1_FAST-NEXT: retq 719; 720; AVX2-LABEL: hadd_v16i16a: 721; AVX2: # %bb.0: 722; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 723; AVX2-NEXT: vphaddw %xmm1, %xmm0, %xmm0 724; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] 725; AVX2-NEXT: retq 726 %a0 = shufflevector <16 x i16> %a, <16 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> 727 %a1 = shufflevector <16 x i16> %a, <16 x i16> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15> 728 %hop = add <8 x i16> %a0, %a1 729 %shuf = shufflevector <8 x i16> %hop, <8 x i16> undef, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 4, i32 5, i32 6, i32 7> 730 ret <16 x i16> %shuf 731} 732 733define <16 x i16> @hadd_v16i16b(<16 x i16> %a) { 734; SSSE3-LABEL: hadd_v16i16b: 735; SSSE3: # %bb.0: 736; SSSE3-NEXT: phaddw %xmm0, %xmm0 737; SSSE3-NEXT: phaddw %xmm1, %xmm1 738; SSSE3-NEXT: retq 739; 740; AVX1-LABEL: hadd_v16i16b: 741; AVX1: # %bb.0: 742; AVX1-NEXT: vphaddw %xmm0, %xmm0, %xmm1 743; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 744; AVX1-NEXT: vphaddw %xmm0, %xmm0, %xmm0 745; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 746; AVX1-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2] 747; AVX1-NEXT: retq 748; 749; AVX2-LABEL: hadd_v16i16b: 750; AVX2: # %bb.0: 751; AVX2-NEXT: vphaddw %ymm0, %ymm0, %ymm0 752; AVX2-NEXT: retq 753 %a0 = shufflevector <16 x i16> %a, <16 x i16> undef, <16 x i32> <i32 0, i32 2, 
i32 4, i32 6, i32 undef, i32 undef, i32 undef, i32 undef, i32 8, i32 10, i32 12, i32 14, i32 undef, i32 undef, i32 undef, i32 undef> 754 %a1 = shufflevector <16 x i16> %a, <16 x i16> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 9, i32 11, i32 13, i32 15, i32 undef, i32 undef, i32 undef, i32 undef> 755 %hop = add <16 x i16> %a0, %a1 756 %shuf = shufflevector <16 x i16> %hop, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11, i32 8, i32 9, i32 10, i32 11> 757 ret <16 x i16> %shuf 758} 759 760define <8 x i16> @hsub_v8i16(<8 x i16> %a) { 761; SSSE3-LABEL: hsub_v8i16: 762; SSSE3: # %bb.0: 763; SSSE3-NEXT: phsubw %xmm0, %xmm0 764; SSSE3-NEXT: retq 765; 766; AVX-LABEL: hsub_v8i16: 767; AVX: # %bb.0: 768; AVX-NEXT: vphsubw %xmm0, %xmm0, %xmm0 769; AVX-NEXT: retq 770 %a0246 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 undef, i32 undef, i32 undef, i32 undef> 771 %a1357 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 undef, i32 undef, i32 undef, i32 undef> 772 %hop = sub <8 x i16> %a0246, %a1357 773 %shuf = shufflevector <8 x i16> %hop, <8 x i16> undef, <8 x i32> <i32 0, i32 undef, i32 2, i32 undef, i32 undef, i32 1, i32 undef, i32 3> 774 ret <8 x i16> %shuf 775} 776 777define <16 x i16> @hsub_v16i16a(<16 x i16> %a) { 778; SSSE3_SLOW-LABEL: hsub_v16i16a: 779; SSSE3_SLOW: # %bb.0: 780; SSSE3_SLOW-NEXT: movdqa %xmm0, %xmm2 781; SSSE3_SLOW-NEXT: phsubw %xmm1, %xmm2 782; SSSE3_SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,1,0,1] 783; SSSE3_SLOW-NEXT: movdqa %xmm2, %xmm1 784; SSSE3_SLOW-NEXT: retq 785; 786; SSSE3_FAST-LABEL: hsub_v16i16a: 787; SSSE3_FAST: # %bb.0: 788; SSSE3_FAST-NEXT: movdqa %xmm0, %xmm2 789; SSSE3_FAST-NEXT: phsubw %xmm1, %xmm2 790; SSSE3_FAST-NEXT: phsubw %xmm0, %xmm0 791; SSSE3_FAST-NEXT: movdqa %xmm2, %xmm1 792; SSSE3_FAST-NEXT: retq 793; 794; AVX1_SLOW-LABEL: 
hsub_v16i16a:
; AVX1_SLOW: # %bb.0:
; AVX1_SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1_SLOW-NEXT: vphsubw %xmm1, %xmm0, %xmm0
; AVX1_SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,1,0,1]
; AVX1_SLOW-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1_SLOW-NEXT: retq
;
; AVX1_FAST-LABEL: hsub_v16i16a:
; AVX1_FAST: # %bb.0:
; AVX1_FAST-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1_FAST-NEXT: vphsubw %xmm1, %xmm0, %xmm1
; AVX1_FAST-NEXT: vphsubw %xmm0, %xmm0, %xmm0
; AVX1_FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1_FAST-NEXT: retq
;
; AVX2-LABEL: hsub_v16i16a:
; AVX2: # %bb.0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vphsubw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1]
; AVX2-NEXT: retq
  %a0 = shufflevector <16 x i16> %a, <16 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %a1 = shufflevector <16 x i16> %a, <16 x i16> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %hop = sub <8 x i16> %a0, %a1
  %shuf = shufflevector <8 x i16> %hop, <8 x i16> undef, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 4, i32 5, i32 6, i32 7>
  ret <16 x i16> %shuf
}

; Same as hsub_v16i16a but the odd/even split is expressed with full-width
; (16-element) shuffles carrying undef upper halves, and the result shuffle
; repeats each hop half in-lane - should still match a single phsubw per lane.
define <16 x i16> @hsub_v16i16b(<16 x i16> %a) {
; SSSE3-LABEL: hsub_v16i16b:
; SSSE3: # %bb.0:
; SSSE3-NEXT: phsubw %xmm0, %xmm0
; SSSE3-NEXT: phsubw %xmm1, %xmm1
; SSSE3-NEXT: retq
;
; AVX1-LABEL: hsub_v16i16b:
; AVX1: # %bb.0:
; AVX1-NEXT: vphsubw %xmm0, %xmm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vphsubw %xmm0, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
; AVX1-NEXT: retq
;
; AVX2-LABEL: hsub_v16i16b:
; AVX2: # %bb.0:
; AVX2-NEXT: vphsubw %ymm0, %ymm0, %ymm0
; AVX2-NEXT: retq
  %a0 = shufflevector <16 x i16> %a, <16 x i16> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 undef, i32 undef, i32 undef, i32 undef, i32 8, i32 10, i32 12, i32 14, i32 undef, i32 undef, i32 undef, i32 undef>
  %a1 = shufflevector <16 x i16> %a, <16 x i16> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 9, i32 11, i32 13, i32 15, i32 undef, i32 undef, i32 undef, i32 undef>
  %hop = sub <16 x i16> %a0, %a1
  %shuf = shufflevector <16 x i16> %hop, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11, i32 8, i32 9, i32 10, i32 11>
  ret <16 x i16> %shuf
}

; Splat of element 0 of a haddps intrinsic result - AVX2 can use vbroadcastss
; instead of the movsldup/shuffle needed by SSSE3/AVX1.
define <4 x float> @broadcast_haddps_v4f32(<4 x float> %a0) {
; SSSE3-LABEL: broadcast_haddps_v4f32:
; SSSE3: # %bb.0:
; SSSE3-NEXT: haddps %xmm0, %xmm0
; SSSE3-NEXT: movsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
; SSSE3-NEXT: retq
;
; AVX1-LABEL: broadcast_haddps_v4f32:
; AVX1: # %bb.0:
; AVX1-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX1-NEXT: vmovsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
; AVX1-NEXT: retq
;
; AVX2-LABEL: broadcast_haddps_v4f32:
; AVX2: # %bb.0:
; AVX2-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX2-NEXT: vbroadcastss %xmm0, %xmm0
; AVX2-NEXT: retq
  %1 = tail call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %a0, <4 x float> %a0)
  %2 = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> zeroinitializer
  ret <4 x float> %2
}

declare <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float>, <4 x float>)

; PR34724 - partial horizontal add built from narrow (v2f32) sub-shuffles
; plus an extra lane-3 add; should fold to a single haddps with fast-hops.
define <4 x float> @PR34724_1(<4 x float> %a, <4 x float> %b) {
; SSSE3_SLOW-LABEL: PR34724_1:
; SSSE3_SLOW: # %bb.0:
; SSSE3_SLOW-NEXT: haddps %xmm1, %xmm0
; SSSE3_SLOW-NEXT: movsldup {{.*#+}} xmm2 = xmm1[0,0,2,2]
; SSSE3_SLOW-NEXT: addps %xmm1, %xmm2
; SSSE3_SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,0],xmm0[2,0]
; SSSE3_SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0]
; SSSE3_SLOW-NEXT: retq
;
; SSSE3_FAST-LABEL: PR34724_1:
; SSSE3_FAST: # %bb.0:
; SSSE3_FAST-NEXT: haddps %xmm1, %xmm0
; SSSE3_FAST-NEXT: retq
;
; AVX1_SLOW-LABEL: PR34724_1:
; AVX1_SLOW: # %bb.0:
; AVX1_SLOW-NEXT: vhaddps %xmm1, %xmm0, %xmm0
; AVX1_SLOW-NEXT: vmovsldup {{.*#+}} xmm2 = xmm1[0,0,2,2]
; AVX1_SLOW-NEXT: vaddps %xmm1, %xmm2, %xmm1
; AVX1_SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
; AVX1_SLOW-NEXT: retq
;
; AVX1_FAST-LABEL: PR34724_1:
; AVX1_FAST: # %bb.0:
; AVX1_FAST-NEXT: vhaddps %xmm1, %xmm0, %xmm0
; AVX1_FAST-NEXT: retq
;
; AVX2_SLOW-LABEL: PR34724_1:
; AVX2_SLOW: # %bb.0:
; AVX2_SLOW-NEXT: vhaddps %xmm1, %xmm0, %xmm0
; AVX2_SLOW-NEXT: vmovsldup {{.*#+}} xmm2 = xmm1[0,0,2,2]
; AVX2_SLOW-NEXT: vaddps %xmm1, %xmm2, %xmm1
; AVX2_SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
; AVX2_SLOW-NEXT: retq
;
; AVX2_FAST-LABEL: PR34724_1:
; AVX2_FAST: # %bb.0:
; AVX2_FAST-NEXT: vhaddps %xmm1, %xmm0, %xmm0
; AVX2_FAST-NEXT: retq
  %t0 = shufflevector <4 x float> %a, <4 x float> %b, <2 x i32> <i32 2, i32 4>
  %t1 = shufflevector <4 x float> %a, <4 x float> %b, <2 x i32> <i32 3, i32 5>
  %t2 = fadd <2 x float> %t0, %t1
  %vecinit9 = shufflevector <2 x float> %t2, <2 x float> undef, <4 x i32> <i32 undef, i32 0, i32 1, i32 undef>
  %t3 = shufflevector <4 x float> %b, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 undef, i32 2>
  %t4 = fadd <4 x float> %t3, %b
  %vecinit13 = shufflevector <4 x float> %vecinit9, <4 x float> %t4, <4 x i32> <i32 undef, i32 1, i32 2, i32 7>
  ret <4 x float> %vecinit13
}

; Variant of PR34724_1 with the odd/even extraction done via full-width
; (v4f32) shuffles with undef tails instead of v2f32 sub-vectors.
define <4 x float> @PR34724_2(<4 x float> %a, <4 x float> %b) {
; SSSE3_SLOW-LABEL: PR34724_2:
; SSSE3_SLOW: # %bb.0:
; SSSE3_SLOW-NEXT: haddps %xmm1, %xmm0
; SSSE3_SLOW-NEXT: movsldup {{.*#+}} xmm2 = xmm1[0,0,2,2]
; SSSE3_SLOW-NEXT: addps %xmm1, %xmm2
; SSSE3_SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,0],xmm0[2,0]
; SSSE3_SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0]
; SSSE3_SLOW-NEXT: retq
;
; SSSE3_FAST-LABEL: PR34724_2:
; SSSE3_FAST: # %bb.0:
; SSSE3_FAST-NEXT: haddps %xmm1, %xmm0
; SSSE3_FAST-NEXT: retq
;
; AVX1_SLOW-LABEL: PR34724_2:
; AVX1_SLOW: # %bb.0:
; AVX1_SLOW-NEXT: vhaddps %xmm1, %xmm0, %xmm0
; AVX1_SLOW-NEXT: vmovsldup {{.*#+}} xmm2 = xmm1[0,0,2,2]
; AVX1_SLOW-NEXT: vaddps %xmm1, %xmm2, %xmm1
; AVX1_SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
; AVX1_SLOW-NEXT: retq
;
; AVX1_FAST-LABEL: PR34724_2:
; AVX1_FAST: # %bb.0:
; AVX1_FAST-NEXT: vhaddps %xmm1, %xmm0, %xmm0
; AVX1_FAST-NEXT: retq
;
; AVX2_SLOW-LABEL: PR34724_2:
; AVX2_SLOW: # %bb.0:
; AVX2_SLOW-NEXT: vhaddps %xmm1, %xmm0, %xmm0
; AVX2_SLOW-NEXT: vmovsldup {{.*#+}} xmm2 = xmm1[0,0,2,2]
; AVX2_SLOW-NEXT: vaddps %xmm1, %xmm2, %xmm1
; AVX2_SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
; AVX2_SLOW-NEXT: retq
;
; AVX2_FAST-LABEL: PR34724_2:
; AVX2_FAST: # %bb.0:
; AVX2_FAST-NEXT: vhaddps %xmm1, %xmm0, %xmm0
; AVX2_FAST-NEXT: retq
  %t0 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 2, i32 4, i32 undef, i32 undef>
  %t1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 3, i32 5, i32 undef, i32 undef>
  %t2 = fadd <4 x float> %t0, %t1
  %vecinit9 = shufflevector <4 x float> %t2, <4 x float> undef, <4 x i32> <i32 undef, i32 0, i32 1, i32 undef>
  %t3 = shufflevector <4 x float> %b, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 undef, i32 2>
  %t4 = fadd <4 x float> %t3, %b
  %vecinit13 = shufflevector <4 x float> %vecinit9, <4 x float> %t4, <4 x i32> <i32 undef, i32 1, i32 2, i32 7>
  ret <4 x float> %vecinit13
}

;
; fold HOP(LOSUBVECTOR(SHUFFLE(X)),HISUBVECTOR(SHUFFLE(X)))
; --> SHUFFLE(HOP(LOSUBVECTOR(X),HISUBVECTOR(X))).
;

define <4 x float> @hadd_4f32_v8f32_shuffle(<8 x float> %a0) {
; SSSE3-LABEL: hadd_4f32_v8f32_shuffle:
; SSSE3: # %bb.0:
; SSSE3-NEXT: haddps %xmm1, %xmm0
; SSSE3-NEXT: movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSSE3-NEXT: retq
;
; AVX-LABEL: hadd_4f32_v8f32_shuffle:
; AVX: # %bb.0:
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT: vhaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
  %shuf256 = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> <i32 2, i32 3, i32 2, i32 3, i32 6, i32 7, i32 6, i32 7>
  %lo = shufflevector <8 x float> %shuf256, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %hi = shufflevector <8 x float> %shuf256, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %hadd0 = shufflevector <4 x float> %lo, <4 x float> %hi, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %hadd1 = shufflevector <4 x float> %lo, <4 x float> %hi, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %hadd = fadd <4 x float> %hadd0, %hadd1
  ret <4 x float> %hadd
}

; NOTE(review): despite the "hsub" name, the op below is fadd and the checks
; match haddps, making this a duplicate of hadd_4f32_v8f32_shuffle - confirm
; whether fsub was intended and regenerate with update_llc_test_checks.py.
define <4 x float> @hsub_4f32_v8f32_shuffle(<8 x float> %a0) {
; SSSE3-LABEL: hsub_4f32_v8f32_shuffle:
; SSSE3: # %bb.0:
; SSSE3-NEXT: haddps %xmm1, %xmm0
; SSSE3-NEXT: movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSSE3-NEXT: retq
;
; AVX-LABEL: hsub_4f32_v8f32_shuffle:
; AVX: # %bb.0:
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT: vhaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
  %shuf256 = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> <i32 2, i32 3, i32 2, i32 3, i32 6, i32 7, i32 6, i32 7>
  %lo = shufflevector <8 x float> %shuf256, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %hi = shufflevector <8 x float> %shuf256, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %hsub0 = shufflevector <4 x float> %lo, <4 x float> %hi, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %hsub1 = shufflevector <4 x float> %lo, <4 x float> %hi, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %hsub = fadd <4 x float> %hsub0, %hsub1
  ret <4 x float> %hsub
}

define <4 x i32> @hadd_4i32_v8i32_shuffle(<8 x i32> %a0) {
; SSSE3-LABEL: hadd_4i32_v8i32_shuffle:
; SSSE3: # %bb.0:
; SSSE3-NEXT: phaddd %xmm1, %xmm0
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSSE3-NEXT: retq
;
; AVX1-LABEL: hadd_4i32_v8i32_shuffle:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vphaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: hadd_4i32_v8i32_shuffle:
; AVX2: # %bb.0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vphaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
  %shuf256 = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 2, i32 3, i32 6, i32 7, i32 6, i32 7>
  %lo = shufflevector <8 x i32> %shuf256, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %hi = shufflevector <8 x i32> %shuf256, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %hadd0 = shufflevector <4 x i32> %lo, <4 x i32> %hi, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %hadd1 = shufflevector <4 x i32> %lo, <4 x i32> %hi, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %hadd = add <4 x i32> %hadd0, %hadd1
  ret <4 x i32> %hadd
}

; NOTE(review): despite the "hsub" name, the op below is add and the checks
; match phaddd, making this a duplicate of hadd_4i32_v8i32_shuffle - confirm
; whether sub was intended and regenerate with update_llc_test_checks.py.
define <4 x i32> @hsub_4i32_v8i32_shuffle(<8 x i32> %a0) {
; SSSE3-LABEL: hsub_4i32_v8i32_shuffle:
; SSSE3: # %bb.0:
; SSSE3-NEXT: phaddd %xmm1, %xmm0
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSSE3-NEXT: retq
;
; AVX1-LABEL: hsub_4i32_v8i32_shuffle:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vphaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: hsub_4i32_v8i32_shuffle:
; AVX2: # %bb.0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vphaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
  %shuf256 = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 2, i32 3, i32 6, i32 7, i32 6, i32 7>
  %lo = shufflevector <8 x i32> %shuf256, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %hi = shufflevector <8 x i32> %shuf256, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %hsub0 = shufflevector <4 x i32> %lo, <4 x i32> %hi, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %hsub1 = shufflevector <4 x i32> %lo, <4 x i32> %hi, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %hsub = add <4 x i32> %hsub0, %hsub1
  ret <4 x i32> %hsub
}

;
; fold HOP(SHUFFLE(X,Y),SHUFFLE(X,Y)) --> SHUFFLE(HOP(X,Y)).
;

define <4 x double> @hadd_4f64_v4f64_shuffle(<4 x double> %a0, <4 x double> %a1) {
; SSSE3-LABEL: hadd_4f64_v4f64_shuffle:
; SSSE3: # %bb.0:
; SSSE3-NEXT: haddpd %xmm1, %xmm0
; SSSE3-NEXT: haddpd %xmm3, %xmm2
; SSSE3-NEXT: movapd %xmm2, %xmm1
; SSSE3-NEXT: retq
;
; AVX1-LABEL: hadd_4f64_v4f64_shuffle:
; AVX1: # %bb.0:
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm2
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; AVX1-NEXT: vhaddpd %ymm0, %ymm2, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: hadd_4f64_v4f64_shuffle:
; AVX2: # %bb.0:
; AVX2-NEXT: vhaddpd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: retq
  %shuf0 = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  %shuf1 = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
  %hadd0 = shufflevector <4 x double> %shuf0, <4 x double> %shuf1, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
  %hadd1 = shufflevector <4 x double> %shuf0, <4 x double> %shuf1, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
  %hadd = fadd <4 x double> %hadd0, %hadd1
  ret <4 x double> %hadd
}

; NOTE(review): the op is correctly fsub here; only the %hadd* local names
; are copy-paste leftovers from the hadd variant (cosmetic).
define <4 x double> @hsub_4f64_v4f64_shuffle(<4 x double> %a0, <4 x double> %a1) {
; SSSE3-LABEL: hsub_4f64_v4f64_shuffle:
; SSSE3: # %bb.0:
; SSSE3-NEXT: hsubpd %xmm1, %xmm0
; SSSE3-NEXT: hsubpd %xmm3, %xmm2
; SSSE3-NEXT: movapd %xmm2, %xmm1
; SSSE3-NEXT: retq
;
; AVX1-LABEL: hsub_4f64_v4f64_shuffle:
; AVX1: # %bb.0:
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm2
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; AVX1-NEXT: vhsubpd %ymm0, %ymm2, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: hsub_4f64_v4f64_shuffle:
; AVX2: # %bb.0:
; AVX2-NEXT: vhsubpd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: retq
  %shuf0 = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  %shuf1 = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
  %hadd0 = shufflevector <4 x double> %shuf0, <4 x double> %shuf1, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
  %hadd1 = shufflevector <4 x double> %shuf0, <4 x double> %shuf1, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
  %hadd = fsub <4 x double> %hadd0, %hadd1
  ret <4 x double> %hadd
}

define <8 x float> @hadd_8f32_v8f32_shuffle(<8 x float> %a0, <8 x float> %a1) {
; SSSE3-LABEL: hadd_8f32_v8f32_shuffle:
; SSSE3: # %bb.0:
; SSSE3-NEXT: haddps %xmm1, %xmm0
; SSSE3-NEXT: haddps %xmm3, %xmm2
; SSSE3-NEXT: movaps %xmm2, %xmm1
; SSSE3-NEXT: retq
;
; AVX1-LABEL: hadd_8f32_v8f32_shuffle:
; AVX1: # %bb.0:
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm2
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; AVX1-NEXT: vhaddps %ymm0, %ymm2, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: hadd_8f32_v8f32_shuffle:
; AVX2: # %bb.0:
; AVX2-NEXT: vhaddps %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: retq
  %shuf0 = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
  %shuf1 = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
  %hadd0 = shufflevector <8 x float> %shuf0, <8 x float> %shuf1, <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
  %hadd1 = shufflevector <8 x float> %shuf0, <8 x float> %shuf1, <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
  %hadd = fadd <8 x float> %hadd0, %hadd1
  ret <8 x float> %hadd
}

; NOTE(review): despite the "hsub" name, the op below is fadd and the checks
; match haddps, making this a duplicate of hadd_8f32_v8f32_shuffle - confirm
; whether fsub was intended and regenerate with update_llc_test_checks.py.
define <8 x float> @hsub_8f32_v8f32_shuffle(<8 x float> %a0, <8 x float> %a1) {
; SSSE3-LABEL: hsub_8f32_v8f32_shuffle:
; SSSE3: # %bb.0:
; SSSE3-NEXT: haddps %xmm1, %xmm0
; SSSE3-NEXT: haddps %xmm3, %xmm2
; SSSE3-NEXT: movaps %xmm2, %xmm1
; SSSE3-NEXT: retq
;
; AVX1-LABEL: hsub_8f32_v8f32_shuffle:
; AVX1: # %bb.0:
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm2
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; AVX1-NEXT: vhaddps %ymm0, %ymm2, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: hsub_8f32_v8f32_shuffle:
; AVX2: # %bb.0:
; AVX2-NEXT: vhaddps %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: retq
  %shuf0 = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
  %shuf1 = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
  %hsub0 = shufflevector <8 x float> %shuf0, <8 x float> %shuf1, <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
  %hsub1 = shufflevector <8 x float> %shuf0, <8 x float> %shuf1, <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
  %hsub = fadd <8 x float> %hsub0, %hsub1
  ret <8 x float> %hsub
}

define <8 x i32> @hadd_8i32_v8i32_shuffle(<8 x i32> %a0, <8 x i32> %a1) {
; SSSE3-LABEL: hadd_8i32_v8i32_shuffle:
; SSSE3: # %bb.0:
; SSSE3-NEXT: phaddd %xmm1, %xmm0
; SSSE3-NEXT: phaddd %xmm3, %xmm2
; SSSE3-NEXT: movdqa %xmm2, %xmm1
; SSSE3-NEXT: retq
;
; AVX1-LABEL: hadd_8i32_v8i32_shuffle:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vphaddd %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vphaddd %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: hadd_8i32_v8i32_shuffle:
; AVX2: # %bb.0:
; AVX2-NEXT: vphaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: retq
  %shuf0 = shufflevector <8 x i32> %a0, <8 x i32> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
  %shuf1 = shufflevector <8 x i32> %a0, <8 x i32> %a1, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
  %hadd0 = shufflevector <8 x i32> %shuf0, <8 x i32> %shuf1, <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
  %hadd1 = shufflevector <8 x i32> %shuf0, <8 x i32> %shuf1, <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
  %hadd = add <8 x i32> %hadd0, %hadd1
  ret <8 x i32> %hadd
}

; NOTE(review): the op is correctly sub here; only the %hadd* local names
; are copy-paste leftovers from the hadd variant (cosmetic).
define <8 x i32> @hsub_8i32_v8i32_shuffle(<8 x i32> %a0, <8 x i32> %a1) {
; SSSE3-LABEL: hsub_8i32_v8i32_shuffle:
; SSSE3: # %bb.0:
; SSSE3-NEXT: phsubd %xmm1, %xmm0
; SSSE3-NEXT: phsubd %xmm3, %xmm2
; SSSE3-NEXT: movdqa %xmm2, %xmm1
; SSSE3-NEXT: retq
;
; AVX1-LABEL: hsub_8i32_v8i32_shuffle:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vphsubd %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vphsubd %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: hsub_8i32_v8i32_shuffle:
; AVX2: # %bb.0:
; AVX2-NEXT: vphsubd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: retq
  %shuf0 = shufflevector <8 x i32> %a0, <8 x i32> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
  %shuf1 = shufflevector <8 x i32> %a0, <8 x i32> %a1, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
  %hadd0 = shufflevector <8 x i32> %shuf0, <8 x i32> %shuf1, <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
  %hadd1 = shufflevector <8 x i32> %shuf0, <8 x i32> %shuf1, <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
  %hadd = sub <8 x i32> %hadd0, %hadd1
  ret <8 x i32> %hadd
}

define <16 x i16> @hadd_16i16_16i16_shuffle(<16 x i16> %a0, <16 x i16> %a1) {
; SSSE3-LABEL: hadd_16i16_16i16_shuffle:
; SSSE3: # %bb.0:
; SSSE3-NEXT: phaddw %xmm1, %xmm0
; SSSE3-NEXT: phaddw %xmm3, %xmm2
; SSSE3-NEXT: movdqa %xmm2, %xmm1
; SSSE3-NEXT: retq
;
; AVX1-LABEL: hadd_16i16_16i16_shuffle:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vphaddw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vphaddw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: hadd_16i16_16i16_shuffle:
; AVX2: # %bb.0:
; AVX2-NEXT: vphaddw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: retq
  %shuf0 = shufflevector <16 x i16> %a0, <16 x i16> %a1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
  %shuf1 = shufflevector <16 x i16> %a0, <16 x i16> %a1, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  %hadd0 = shufflevector <16 x i16> %shuf0, <16 x i16> %shuf1, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 8, i32 10, i32 12, i32 14, i32 24, i32 26, i32 28, i32 30>
  %hadd1 = shufflevector <16 x i16> %shuf0, <16 x i16> %shuf1, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 9, i32 11, i32 13, i32 15, i32 25, i32 27, i32 29, i32 31>
  %hadd = add <16 x i16> %hadd0, %hadd1
  ret <16 x i16> %hadd
}