; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=SSE
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3,fast-hops | FileCheck %s --check-prefix=SSE
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx,fast-hops | FileCheck %s --check-prefixes=AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2,fast-hops | FileCheck %s --check-prefixes=AVX,AVX2

define <8 x i16> @hadd_reverse_v8i16(<8 x i16> %a0, <8 x i16> %a1) nounwind {
; SSE-LABEL: hadd_reverse_v8i16:
; SSE: # %bb.0:
; SSE-NEXT: phaddw %xmm1, %xmm0
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
; SSE-NEXT: retq
;
; AVX-LABEL: hadd_reverse_v8i16:
; AVX: # %bb.0:
; AVX-NEXT: vphaddw %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
; AVX-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
; AVX-NEXT: retq
  %lhs = shufflevector <8 x i16> %a0, <8 x i16> %a1, <8 x i32> <i32 7, i32 5, i32 3, i32 1, i32 15, i32 13, i32 11, i32 9>
  %rhs = shufflevector <8 x i16> %a0, <8 x i16> %a1, <8 x i32> <i32 6, i32 4, i32 2, i32 0, i32 14, i32 12, i32 10, i32 8>
  %add = add <8 x i16> %lhs, %rhs
  ret <8 x i16> %add
}

define <8 x i16> @hadd_reverse2_v8i16(<8 x i16> %a0, <8 x i16> %a1) nounwind {
; SSE-LABEL: hadd_reverse2_v8i16:
; SSE: # %bb.0:
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1]
; SSE-NEXT: pshufb %xmm2, %xmm0
; SSE-NEXT: pshufb %xmm2, %xmm1
; SSE-NEXT: phaddw %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: hadd_reverse2_v8i16:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1]
; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX-NEXT: vphaddw %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %shuf0 = shufflevector <8 x i16> %a0, <8 x i16> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
  %shuf1 = shufflevector <8 x i16> %a1, <8 x i16> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
  %lhs = shufflevector <8 x i16> %shuf0, <8 x i16> %shuf1, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %rhs = shufflevector <8 x i16> %shuf0, <8 x i16> %shuf1, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %add = add <8 x i16> %lhs, %rhs
  ret <8 x i16> %add
}

define <8 x float> @hadd_reverse_v8f32(<8 x float> %a0, <8 x float> %a1) {
; SSE-LABEL: hadd_reverse_v8f32:
; SSE: # %bb.0:
; SSE-NEXT: movaps %xmm0, %xmm4
; SSE-NEXT: haddps %xmm3, %xmm1
; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0,3,2]
; SSE-NEXT: haddps %xmm2, %xmm4
; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0,3,2]
; SSE-NEXT: movaps %xmm1, %xmm0
; SSE-NEXT: movaps %xmm4, %xmm1
; SSE-NEXT: retq
;
; AVX1-LABEL: hadd_reverse_v8f32:
; AVX1: # %bb.0:
; AVX1-NEXT: vhaddps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6]
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX1-NEXT: retq
;
; AVX2-LABEL: hadd_reverse_v8f32:
; AVX2: # %bb.0:
; AVX2-NEXT: vhaddps %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6]
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX2-NEXT: retq
  %lhs = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 7, i32 5, i32 15, i32 13, i32 3, i32 1, i32 11, i32 9>
  %rhs = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 6, i32 4, i32 14, i32 12, i32 2, i32 0, i32 10, i32 8>
  %add = fadd <8 x float> %lhs, %rhs
  ret <8 x float> %add
}

define <8 x float> @hadd_reverse2_v8f32(<8 x float> %a0, <8 x float> %a1) {
; SSE-LABEL: hadd_reverse2_v8f32:
; SSE: # %bb.0:
; SSE-NEXT: movaps %xmm0, %xmm4
; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,2],xmm0[1,0]
; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,2,1,0]
; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,2,1,0]
; SSE-NEXT: haddps %xmm2, %xmm4
; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,2,1,0]
; SSE-NEXT: haddps %xmm3, %xmm1
; SSE-NEXT: movaps %xmm1, %xmm0
; SSE-NEXT: movaps %xmm4, %xmm1
; SSE-NEXT: retq
;
; AVX1-LABEL: hadd_reverse2_v8f32:
; AVX1: # %bb.0:
; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX1-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[3,2,1,0,7,6,5,4]
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,0,1]
; AVX1-NEXT: vhaddps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: hadd_reverse2_v8f32:
; AVX2: # %bb.0:
; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX2-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[3,2,1,0,7,6,5,4]
; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,3,0,1]
; AVX2-NEXT: vhaddps %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
  %shuf0 = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
  %shuf1 = shufflevector <8 x float> %a1, <8 x float> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
  %lhs = shufflevector <8 x float> %shuf0, <8 x float> %shuf1, <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
  %rhs = shufflevector <8 x float> %shuf0, <8 x float> %shuf1, <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
  %add = fadd <8 x float> %lhs, %rhs
  ret <8 x float> %add
}

define <16 x i16> @hadd_reverse_v16i16(<16 x i16> %a0, <16 x i16> %a1) nounwind {
; SSE-LABEL: hadd_reverse_v16i16:
; SSE: # %bb.0:
; SSE-NEXT: phaddw %xmm3, %xmm1
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7]
; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,7,6,5,4]
; SSE-NEXT: phaddw %xmm2, %xmm0
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,7,6,5,4]
; SSE-NEXT: movdqa %xmm3, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: hadd_reverse_v16i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vphaddw %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
; AVX1-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
; AVX1-NEXT: vphaddw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: hadd_reverse_v16i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vphaddw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[3,2,1,0,4,5,6,7,11,10,9,8,12,13,14,15]
; AVX2-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,6,5,4,8,9,10,11,15,14,13,12]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX2-NEXT: retq
  %lhs = shufflevector <16 x i16> %a0, <16 x i16> %a1, <16 x i32> <i32 15, i32 13, i32 11, i32 9, i32 31, i32 29, i32 27, i32 25, i32 7, i32 5, i32 3, i32 1, i32 23, i32 21, i32 19, i32 17>
  %rhs = shufflevector <16 x i16> %a0, <16 x i16> %a1, <16 x i32> <i32 14, i32 12, i32 10, i32 8, i32 30, i32 28, i32 26, i32 24, i32 6, i32 4, i32 2, i32 0, i32 22, i32 20, i32 18, i32 16>
  %add = add <16 x i16> %lhs, %rhs
  ret <16 x i16> %add
}

define <16 x i16> @hadd_reverse2_v16i16(<16 x i16> %a0, <16 x i16> %a1) nounwind {
; SSE-LABEL: hadd_reverse2_v16i16:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm4
; SSE-NEXT: movdqa {{.*#+}} xmm0 = [14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1]
; SSE-NEXT: pshufb %xmm0, %xmm4
; SSE-NEXT: pshufb %xmm0, %xmm1
; SSE-NEXT: pshufb %xmm0, %xmm2
; SSE-NEXT: phaddw %xmm2, %xmm4
; SSE-NEXT: pshufb %xmm0, %xmm3
; SSE-NEXT: phaddw %xmm3, %xmm1
; SSE-NEXT: movdqa %xmm1, %xmm0
; SSE-NEXT: movdqa %xmm4, %xmm1
; SSE-NEXT: retq
;
; AVX1-LABEL: hadd_reverse2_v16i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1]
; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm4
; AVX1-NEXT: vphaddw %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vphaddw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: hadd_reverse2_v16i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1,30,31,28,29,26,27,24,25,22,23,20,21,18,19,16,17]
; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,0,1]
; AVX2-NEXT: vphaddw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
  %shuf0 = shufflevector <16 x i16> %a0, <16 x i16> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
  %shuf1 = shufflevector <16 x i16> %a1, <16 x i16> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
  %lhs = shufflevector <16 x i16> %shuf0, <16 x i16> %shuf1, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 8, i32 10, i32 12, i32 14, i32 24, i32 26, i32 28, i32 30>
  %rhs = shufflevector <16 x i16> %shuf0, <16 x i16> %shuf1, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 9, i32 11, i32 13, i32 15, i32 25, i32 27, i32 29, i32 31>
  %add = add <16 x i16> %lhs, %rhs
  ret <16 x i16> %add
}

define <8 x double> @hadd_reverse_v8f64(<8 x double> %a0, <8 x double> %a1) nounwind {
; SSE-LABEL: hadd_reverse_v8f64:
; SSE: # %bb.0:
; SSE-NEXT: movapd %xmm1, %xmm8
; SSE-NEXT: movapd %xmm0, %xmm9
; SSE-NEXT: haddpd %xmm7, %xmm3
; SSE-NEXT: haddpd %xmm6, %xmm2
; SSE-NEXT: haddpd %xmm5, %xmm8
; SSE-NEXT: haddpd %xmm4, %xmm9
; SSE-NEXT: movapd %xmm3, %xmm0
; SSE-NEXT: movapd %xmm2, %xmm1
; SSE-NEXT: movapd %xmm8, %xmm2
; SSE-NEXT: movapd %xmm9, %xmm3
; SSE-NEXT: retq
;
; AVX1-LABEL: hadd_reverse_v8f64:
; AVX1: # %bb.0:
; AVX1-NEXT: vhaddpd %ymm3, %ymm1, %ymm1
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm1[2,3,0,1]
; AVX1-NEXT: vhaddpd %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX1-NEXT: vmovapd %ymm3, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: hadd_reverse_v8f64:
; AVX2: # %bb.0:
; AVX2-NEXT: vhaddpd %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm1[2,3,0,1]
; AVX2-NEXT: vhaddpd %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX2-NEXT: vmovapd %ymm3, %ymm0
; AVX2-NEXT: retq
  %lhs = shufflevector <8 x double> %a0, <8 x double> %a1, <8 x i32> <i32 7, i32 15, i32 5, i32 13, i32 3, i32 11, i32 1, i32 9>
  %rhs = shufflevector <8 x double> %a0, <8 x double> %a1, <8 x i32> <i32 6, i32 14, i32 4, i32 12, i32 2, i32 10, i32 0, i32 8>
  %fadd = fadd <8 x double> %lhs, %rhs
  ret <8 x double> %fadd
}

define <8 x double> @hadd_reverse2_v8f64(<8 x double> %a0, <8 x double> %a1) nounwind {
; SSE-LABEL: hadd_reverse2_v8f64:
; SSE: # %bb.0:
; SSE-NEXT: movapd %xmm1, %xmm8
; SSE-NEXT: movapd %xmm0, %xmm9
; SSE-NEXT: shufpd {{.*#+}} xmm9 = xmm9[1],xmm0[0]
; SSE-NEXT: shufpd {{.*#+}} xmm8 = xmm8[1],xmm1[0]
; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1,0]
; SSE-NEXT: shufpd {{.*#+}} xmm3 = xmm3[1,0]
; SSE-NEXT: shufpd {{.*#+}} xmm4 = xmm4[1,0]
; SSE-NEXT: haddpd %xmm4, %xmm9
; SSE-NEXT: shufpd {{.*#+}} xmm5 = xmm5[1,0]
; SSE-NEXT: haddpd %xmm5, %xmm8
; SSE-NEXT: shufpd {{.*#+}} xmm6 = xmm6[1,0]
; SSE-NEXT: haddpd %xmm6, %xmm2
; SSE-NEXT: shufpd {{.*#+}} xmm7 = xmm7[1,0]
; SSE-NEXT: haddpd %xmm7, %xmm3
; SSE-NEXT: movapd %xmm3, %xmm0
; SSE-NEXT: movapd %xmm2, %xmm1
; SSE-NEXT: movapd %xmm8, %xmm2
; SSE-NEXT: movapd %xmm9, %xmm3
; SSE-NEXT: retq
;
; AVX1-LABEL: hadd_reverse2_v8f64:
; AVX1: # %bb.0:
; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX1-NEXT: vpermilpd {{.*#+}} ymm1 = ymm1[1,0,3,2]
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm1[2,3,0,1]
; AVX1-NEXT: vpermilpd {{.*#+}} ymm1 = ymm2[1,0,3,2]
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,0,1]
; AVX1-NEXT: vhaddpd %ymm1, %ymm0, %ymm1
; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm3[1,0,3,2]
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX1-NEXT: vhaddpd %ymm0, %ymm4, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: hadd_reverse2_v8f64:
; AVX2: # %bb.0:
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,2,1,0]
; AVX2-NEXT: vpermpd {{.*#+}} ymm4 = ymm1[3,2,1,0]
; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm2[3,2,1,0]
; AVX2-NEXT: vhaddpd %ymm1, %ymm0, %ymm1
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm3[3,2,1,0]
; AVX2-NEXT: vhaddpd %ymm0, %ymm4, %ymm0
; AVX2-NEXT: retq
  %shuf0 = shufflevector <8 x double> %a0, <8 x double> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
  %shuf1 = shufflevector <8 x double> %a1, <8 x double> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
  %lhs = shufflevector <8 x double> %shuf0, <8 x double> %shuf1, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
  %rhs = shufflevector <8 x double> %shuf0, <8 x double> %shuf1, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
  %fadd = fadd <8 x double> %lhs, %rhs
  ret <8 x double> %fadd
}

define <16 x float> @hadd_reverse_v16f32(<16 x float> %a0, <16 x float> %a1) nounwind {
; SSE-LABEL: hadd_reverse_v16f32:
; SSE: # %bb.0:
; SSE-NEXT: movaps %xmm5, %xmm8
; SSE-NEXT: movaps %xmm1, %xmm5
; SSE-NEXT: haddps %xmm2, %xmm3
; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0,3,2]
; SSE-NEXT: haddps %xmm6, %xmm7
; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,0,3,2]
; SSE-NEXT: haddps %xmm0, %xmm5
; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0,3,2]
; SSE-NEXT: haddps %xmm4, %xmm8
; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,0,3,2]
; SSE-NEXT: movaps %xmm3, %xmm0
; SSE-NEXT: movaps %xmm7, %xmm1
; SSE-NEXT: movaps %xmm5, %xmm2
; SSE-NEXT: movaps %xmm8, %xmm3
; SSE-NEXT: retq
;
; AVX1-LABEL: hadd_reverse_v16f32:
; AVX1: # %bb.0:
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm0[2,3],ymm2[2,3]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: vhaddps %ymm0, %ymm4, %ymm2
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm3[2,3]
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
; AVX1-NEXT: vhaddps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6]
; AVX1-NEXT: vpermilps {{.*#+}} ymm1 = ymm2[1,0,3,2,5,4,7,6]
; AVX1-NEXT: retq
;
; AVX2-LABEL: hadd_reverse_v16f32:
; AVX2: # %bb.0:
; AVX2-NEXT: vhaddps %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[1,0,3,2,5,4,7,6]
; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm1[2,0,3,1]
; AVX2-NEXT: vhaddps %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6]
; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[2,0,3,1]
; AVX2-NEXT: vmovaps %ymm3, %ymm0
; AVX2-NEXT: retq
  %lhs = shufflevector <16 x float> %a0, <16 x float> %a1, <16 x i32> <i32 15, i32 13, i32 11, i32 9, i32 31, i32 29, i32 27, i32 25, i32 7, i32 5, i32 3, i32 1, i32 23, i32 21, i32 19, i32 17>
  %rhs = shufflevector <16 x float> %a0, <16 x float> %a1, <16 x i32> <i32 14, i32 12, i32 10, i32 8, i32 30, i32 28, i32 26, i32 24, i32 6, i32 4, i32 2, i32 0, i32 22, i32 20, i32 18, i32 16>
  %fadd = fadd <16 x float> %lhs, %rhs
  ret <16 x float> %fadd
}

define <16 x float> @hadd_reverse2_v16f32(<16 x float> %a0, <16 x float> %a1) nounwind {
; SSE-LABEL: hadd_reverse2_v16f32:
; SSE: # %bb.0:
; SSE-NEXT: movaps %xmm1, %xmm8
; SSE-NEXT: movaps %xmm0, %xmm9
; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[3,2],xmm0[1,0]
; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[3,2],xmm1[1,0]
; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,2,1,0]
; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,2,1,0]
; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,2,1,0]
; SSE-NEXT: haddps %xmm4, %xmm9
; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,2,1,0]
; SSE-NEXT: haddps %xmm5, %xmm8
; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[3,2,1,0]
; SSE-NEXT: haddps %xmm6, %xmm2
; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[3,2,1,0]
; SSE-NEXT: haddps %xmm7, %xmm3
; SSE-NEXT: movaps %xmm3, %xmm0
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: movaps %xmm8, %xmm2
; SSE-NEXT: movaps %xmm9, %xmm3
; SSE-NEXT: retq
;
; AVX1-LABEL: hadd_reverse2_v16f32:
; AVX1: # %bb.0:
; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX1-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[3,2,1,0,7,6,5,4]
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm1[2,3,0,1]
; AVX1-NEXT: vpermilps {{.*#+}} ymm1 = ymm2[3,2,1,0,7,6,5,4]
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,0,1]
; AVX1-NEXT: vhaddps %ymm1, %ymm0, %ymm1
; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm3[3,2,1,0,7,6,5,4]
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX1-NEXT: vhaddps %ymm0, %ymm4, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: hadd_reverse2_v16f32:
; AVX2: # %bb.0:
; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX2-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[3,2,1,0,7,6,5,4]
; AVX2-NEXT: vpermpd {{.*#+}} ymm4 = ymm1[2,3,0,1]
; AVX2-NEXT: vpermilps {{.*#+}} ymm1 = ymm2[3,2,1,0,7,6,5,4]
; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,3,0,1]
; AVX2-NEXT: vhaddps %ymm1, %ymm0, %ymm1
; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm3[3,2,1,0,7,6,5,4]
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX2-NEXT: vhaddps %ymm0, %ymm4, %ymm0
; AVX2-NEXT: retq
  %shuf0 = shufflevector <16 x float> %a0, <16 x float> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
  %shuf1 = shufflevector <16 x float> %a1, <16 x float> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
  %lhs = shufflevector <16 x float> %shuf0, <16 x float> %shuf1, <16 x i32> <i32 0, i32 2, i32 16, i32 18, i32 4, i32 6, i32 20, i32 22, i32 8, i32 10, i32 24, i32 26, i32 12, i32 14, i32 28, i32 30>
  %rhs = shufflevector <16 x float> %shuf0, <16 x float> %shuf1, <16 x i32> <i32 1, i32 3, i32 17, i32 19, i32 5, i32 7, i32 21, i32 23, i32 9, i32 11, i32 25, i32 27, i32 13, i32 15, i32 29, i32 31>
  %fadd = fadd <16 x float> %lhs, %rhs
  ret <16 x float> %fadd
}