; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2,+sse3 | FileCheck %s --check-prefix=SSE --check-prefix=SSE3
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2,+sse3,+ssse3 | FileCheck %s --check-prefix=SSE --check-prefix=SSSE3
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2

define <4 x float> @hadd_ps_test1(<4 x float> %A, <4 x float> %B) {
; SSE-LABEL: hadd_ps_test1:
; SSE: # %bb.0:
; SSE-NEXT: haddps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: hadd_ps_test1:
; AVX: # %bb.0:
; AVX-NEXT: vhaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %vecext = extractelement <4 x float> %A, i32 0
  %vecext1 = extractelement <4 x float> %A, i32 1
  %add = fadd float %vecext, %vecext1
  %vecinit = insertelement <4 x float> undef, float %add, i32 0
  %vecext2 = extractelement <4 x float> %A, i32 2
  %vecext3 = extractelement <4 x float> %A, i32 3
  %add4 = fadd float %vecext2, %vecext3
  %vecinit5 = insertelement <4 x float> %vecinit, float %add4, i32 1
  %vecext6 = extractelement <4 x float> %B, i32 0
  %vecext7 = extractelement <4 x float> %B, i32 1
  %add8 = fadd float %vecext6, %vecext7
  %vecinit9 = insertelement <4 x float> %vecinit5, float %add8, i32 2
  %vecext10 = extractelement <4 x float> %B, i32 2
  %vecext11 = extractelement <4 x float> %B, i32 3
  %add12 = fadd float %vecext10, %vecext11
  %vecinit13 = insertelement <4 x float> %vecinit9, float %add12, i32 3
  ret <4 x float> %vecinit13
}

define <4 x float> @hadd_ps_test2(<4 x float> %A, <4 x float> %B) {
; SSE-LABEL: hadd_ps_test2:
; SSE: # %bb.0:
; SSE-NEXT: haddps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: hadd_ps_test2:
; AVX: # %bb.0:
; AVX-NEXT: vhaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %vecext = extractelement <4 x float> %A, i32 2
  %vecext1 = extractelement <4 x float> %A, i32 3
  %add = fadd float %vecext, %vecext1
  %vecinit = insertelement <4 x float> undef, float %add, i32 1
  %vecext2 = extractelement <4 x float> %A, i32 0
  %vecext3 = extractelement <4 x float> %A, i32 1
  %add4 = fadd float %vecext2, %vecext3
  %vecinit5 = insertelement <4 x float> %vecinit, float %add4, i32 0
  %vecext6 = extractelement <4 x float> %B, i32 2
  %vecext7 = extractelement <4 x float> %B, i32 3
  %add8 = fadd float %vecext6, %vecext7
  %vecinit9 = insertelement <4 x float> %vecinit5, float %add8, i32 3
  %vecext10 = extractelement <4 x float> %B, i32 0
  %vecext11 = extractelement <4 x float> %B, i32 1
  %add12 = fadd float %vecext10, %vecext11
  %vecinit13 = insertelement <4 x float> %vecinit9, float %add12, i32 2
  ret <4 x float> %vecinit13
}

define <4 x float> @hsub_ps_test1(<4 x float> %A, <4 x float> %B) {
; SSE-LABEL: hsub_ps_test1:
; SSE: # %bb.0:
; SSE-NEXT: hsubps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: hsub_ps_test1:
; AVX: # %bb.0:
; AVX-NEXT: vhsubps %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %vecext = extractelement <4 x float> %A, i32 0
  %vecext1 = extractelement <4 x float> %A, i32 1
  %sub = fsub float %vecext, %vecext1
  %vecinit = insertelement <4 x float> undef, float %sub, i32 0
  %vecext2 = extractelement <4 x float> %A, i32 2
  %vecext3 = extractelement <4 x float> %A, i32 3
  %sub4 = fsub float %vecext2, %vecext3
  %vecinit5 = insertelement <4 x float> %vecinit, float %sub4, i32 1
  %vecext6 = extractelement <4 x float> %B, i32 0
  %vecext7 = extractelement <4 x float> %B, i32 1
  %sub8 = fsub float %vecext6, %vecext7
  %vecinit9 = insertelement <4 x float> %vecinit5, float %sub8, i32 2
  %vecext10 = extractelement <4 x float> %B, i32 2
  %vecext11 = extractelement <4 x float> %B, i32 3
  %sub12 = fsub float %vecext10, %vecext11
  %vecinit13 = insertelement <4 x float> %vecinit9, float %sub12, i32 3
  ret <4 x float> %vecinit13
}

define <4 x float> @hsub_ps_test2(<4 x float> %A, <4 x float> %B) {
; SSE-LABEL: hsub_ps_test2:
; SSE: # %bb.0:
; SSE-NEXT: hsubps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: hsub_ps_test2:
; AVX: # %bb.0:
; AVX-NEXT: vhsubps %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %vecext = extractelement <4 x float> %A, i32 2
  %vecext1 = extractelement <4 x float> %A, i32 3
  %sub = fsub float %vecext, %vecext1
  %vecinit = insertelement <4 x float> undef, float %sub, i32 1
  %vecext2 = extractelement <4 x float> %A, i32 0
  %vecext3 = extractelement <4 x float> %A, i32 1
  %sub4 = fsub float %vecext2, %vecext3
  %vecinit5 = insertelement <4 x float> %vecinit, float %sub4, i32 0
  %vecext6 = extractelement <4 x float> %B, i32 2
  %vecext7 = extractelement <4 x float> %B, i32 3
  %sub8 = fsub float %vecext6, %vecext7
  %vecinit9 = insertelement <4 x float> %vecinit5, float %sub8, i32 3
  %vecext10 = extractelement <4 x float> %B, i32 0
  %vecext11 = extractelement <4 x float> %B, i32 1
  %sub12 = fsub float %vecext10, %vecext11
  %vecinit13 = insertelement <4 x float> %vecinit9, float %sub12, i32 2
  ret <4 x float> %vecinit13
}

define <4 x i32> @phadd_d_test1(<4 x i32> %A, <4 x i32> %B) {
; SSE3-LABEL: phadd_d_test1:
; SSE3: # %bb.0:
; SSE3-NEXT: movd %xmm0, %eax
; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
; SSE3-NEXT: movd %xmm2, %ecx
; SSE3-NEXT: addl %eax, %ecx
; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
; SSE3-NEXT: movd %xmm2, %eax
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; SSE3-NEXT: movd %xmm0, %edx
; SSE3-NEXT: addl %eax, %edx
; SSE3-NEXT: movd %xmm1, %eax
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE3-NEXT: movd %xmm0, %esi
; SSE3-NEXT: addl %eax, %esi
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; SSE3-NEXT: movd %xmm0, %eax
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3]
; SSE3-NEXT: movd %xmm0, %edi
; SSE3-NEXT: addl %eax, %edi
; SSE3-NEXT: movd %edi, %xmm0
; SSE3-NEXT: movd %esi, %xmm1
; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE3-NEXT: movd %edx, %xmm2
; SSE3-NEXT: movd %ecx, %xmm0
; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE3-NEXT: retq
;
; SSSE3-LABEL: phadd_d_test1:
; SSSE3: # %bb.0:
; SSSE3-NEXT: phaddd %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; AVX-LABEL: phadd_d_test1:
; AVX: # %bb.0:
; AVX-NEXT: vphaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %vecext = extractelement <4 x i32> %A, i32 0
  %vecext1 = extractelement <4 x i32> %A, i32 1
  %add = add i32 %vecext, %vecext1
  %vecinit = insertelement <4 x i32> undef, i32 %add, i32 0
  %vecext2 = extractelement <4 x i32> %A, i32 2
  %vecext3 = extractelement <4 x i32> %A, i32 3
  %add4 = add i32 %vecext2, %vecext3
  %vecinit5 = insertelement <4 x i32> %vecinit, i32 %add4, i32 1
  %vecext6 = extractelement <4 x i32> %B, i32 0
  %vecext7 = extractelement <4 x i32> %B, i32 1
  %add8 = add i32 %vecext6, %vecext7
  %vecinit9 = insertelement <4 x i32> %vecinit5, i32 %add8, i32 2
  %vecext10 = extractelement <4 x i32> %B, i32 2
  %vecext11 = extractelement <4 x i32> %B, i32 3
  %add12 = add i32 %vecext10, %vecext11
  %vecinit13 = insertelement <4 x i32> %vecinit9, i32 %add12, i32 3
  ret <4 x i32> %vecinit13
}

define <4 x i32> @phadd_d_test2(<4 x i32> %A, <4 x i32> %B) {
; SSE3-LABEL: phadd_d_test2:
; SSE3: # %bb.0:
; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
; SSE3-NEXT: movd %xmm2, %eax
; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,1,2,3]
; SSE3-NEXT: movd %xmm2, %ecx
; SSE3-NEXT: addl %eax, %ecx
; SSE3-NEXT: movd %xmm0, %eax
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; SSE3-NEXT: movd %xmm0, %edx
; SSE3-NEXT: addl %eax, %edx
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3]
; SSE3-NEXT: movd %xmm0, %eax
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; SSE3-NEXT: movd %xmm0, %esi
; SSE3-NEXT: addl %eax, %esi
; SSE3-NEXT: movd %esi, %xmm0
; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,2,3]
; SSE3-NEXT: movd %xmm2, %eax
; SSE3-NEXT: movd %xmm1, %esi
; SSE3-NEXT: addl %eax, %esi
; SSE3-NEXT: movd %esi, %xmm1
; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE3-NEXT: movd %ecx, %xmm2
; SSE3-NEXT: movd %edx, %xmm0
; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE3-NEXT: retq
;
; SSSE3-LABEL: phadd_d_test2:
; SSSE3: # %bb.0:
; SSSE3-NEXT: phaddd %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; AVX-LABEL: phadd_d_test2:
; AVX: # %bb.0:
; AVX-NEXT: vphaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %vecext = extractelement <4 x i32> %A, i32 2
  %vecext1 = extractelement <4 x i32> %A, i32 3
  %add = add i32 %vecext, %vecext1
  %vecinit = insertelement <4 x i32> undef, i32 %add, i32 1
  %vecext2 = extractelement <4 x i32> %A, i32 0
  %vecext3 = extractelement <4 x i32> %A, i32 1
  %add4 = add i32 %vecext2, %vecext3
  %vecinit5 = insertelement <4 x i32> %vecinit, i32 %add4, i32 0
  %vecext6 = extractelement <4 x i32> %B, i32 3
  %vecext7 = extractelement <4 x i32> %B, i32 2
  %add8 = add i32 %vecext6, %vecext7
  %vecinit9 = insertelement <4 x i32> %vecinit5, i32 %add8, i32 3
  %vecext10 = extractelement <4 x i32> %B, i32 1
  %vecext11 = extractelement <4 x i32> %B, i32 0
  %add12 = add i32 %vecext10, %vecext11
  %vecinit13 = insertelement <4 x i32> %vecinit9, i32 %add12, i32 2
  ret <4 x i32> %vecinit13
}

define <4 x i32> @phsub_d_test1(<4 x i32> %A, <4 x i32> %B) {
; SSE3-LABEL: phsub_d_test1:
; SSE3: # %bb.0:
; SSE3-NEXT: movd %xmm0, %eax
; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
; SSE3-NEXT: movd %xmm2, %ecx
; SSE3-NEXT: subl %ecx, %eax
; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
; SSE3-NEXT: movd %xmm2, %ecx
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; SSE3-NEXT: movd %xmm0, %edx
; SSE3-NEXT: subl %edx, %ecx
; SSE3-NEXT: movd %xmm1, %edx
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE3-NEXT: movd %xmm0, %esi
; SSE3-NEXT: subl %esi, %edx
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; SSE3-NEXT: movd %xmm0, %esi
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3]
; SSE3-NEXT: movd %xmm0, %edi
; SSE3-NEXT: subl %edi, %esi
; SSE3-NEXT: movd %esi, %xmm0
; SSE3-NEXT: movd %edx, %xmm1
; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE3-NEXT: movd %ecx, %xmm2
; SSE3-NEXT: movd %eax, %xmm0
; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE3-NEXT: retq
;
; SSSE3-LABEL: phsub_d_test1:
; SSSE3: # %bb.0:
; SSSE3-NEXT: phsubd %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; AVX-LABEL: phsub_d_test1:
; AVX: # %bb.0:
; AVX-NEXT: vphsubd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %vecext = extractelement <4 x i32> %A, i32 0
  %vecext1 = extractelement <4 x i32> %A, i32 1
  %sub = sub i32 %vecext, %vecext1
  %vecinit = insertelement <4 x i32> undef, i32 %sub, i32 0
  %vecext2 = extractelement <4 x i32> %A, i32 2
  %vecext3 = extractelement <4 x i32> %A, i32 3
  %sub4 = sub i32 %vecext2, %vecext3
  %vecinit5 = insertelement <4 x i32> %vecinit, i32 %sub4, i32 1
  %vecext6 = extractelement <4 x i32> %B, i32 0
  %vecext7 = extractelement <4 x i32> %B, i32 1
  %sub8 = sub i32 %vecext6, %vecext7
  %vecinit9 = insertelement <4 x i32> %vecinit5, i32 %sub8, i32 2
  %vecext10 = extractelement <4 x i32> %B, i32 2
  %vecext11 = extractelement <4 x i32> %B, i32 3
  %sub12 = sub i32 %vecext10, %vecext11
  %vecinit13 = insertelement <4 x i32> %vecinit9, i32 %sub12, i32 3
  ret <4 x i32> %vecinit13
}

define <4 x i32> @phsub_d_test2(<4 x i32> %A, <4 x i32> %B) {
; SSE3-LABEL: phsub_d_test2:
; SSE3: # %bb.0:
; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
; SSE3-NEXT: movd %xmm2, %eax
; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,1,2,3]
; SSE3-NEXT: movd %xmm2, %ecx
; SSE3-NEXT: subl %ecx, %eax
; SSE3-NEXT: movd %xmm0, %ecx
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; SSE3-NEXT: movd %xmm0, %edx
; SSE3-NEXT: subl %edx, %ecx
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; SSE3-NEXT: movd %xmm0, %edx
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3]
; SSE3-NEXT: movd %xmm0, %esi
; SSE3-NEXT: subl %esi, %edx
; SSE3-NEXT: movd %edx, %xmm0
; SSE3-NEXT: movd %xmm1, %edx
; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
; SSE3-NEXT: movd %xmm1, %esi
; SSE3-NEXT: subl %esi, %edx
; SSE3-NEXT: movd %edx, %xmm1
; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE3-NEXT: movd %eax, %xmm2
; SSE3-NEXT: movd %ecx, %xmm0
; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE3-NEXT: retq
;
; SSSE3-LABEL: phsub_d_test2:
; SSSE3: # %bb.0:
; SSSE3-NEXT: phsubd %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; AVX-LABEL: phsub_d_test2:
; AVX: # %bb.0:
; AVX-NEXT: vphsubd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %vecext = extractelement <4 x i32> %A, i32 2
  %vecext1 = extractelement <4 x i32> %A, i32 3
  %sub = sub i32 %vecext, %vecext1
  %vecinit = insertelement <4 x i32> undef, i32 %sub, i32 1
  %vecext2 = extractelement <4 x i32> %A, i32 0
  %vecext3 = extractelement <4 x i32> %A, i32 1
  %sub4 = sub i32 %vecext2, %vecext3
  %vecinit5 = insertelement <4 x i32> %vecinit, i32 %sub4, i32 0
  %vecext6 = extractelement <4 x i32> %B, i32 2
  %vecext7 = extractelement <4 x i32> %B, i32 3
  %sub8 = sub i32 %vecext6, %vecext7
  %vecinit9 = insertelement <4 x i32> %vecinit5, i32 %sub8, i32 3
  %vecext10 = extractelement <4 x i32> %B, i32 0
  %vecext11 = extractelement <4 x i32> %B, i32 1
  %sub12 = sub i32 %vecext10, %vecext11
  %vecinit13 = insertelement <4 x i32> %vecinit9, i32 %sub12, i32 2
  ret <4 x i32> %vecinit13
}

define <2 x double> @hadd_pd_test1(<2 x double> %A, <2 x double> %B) {
; SSE-LABEL: hadd_pd_test1:
; SSE: # %bb.0:
; SSE-NEXT: haddpd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: hadd_pd_test1:
; AVX: # %bb.0:
; AVX-NEXT: vhaddpd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %vecext = extractelement <2 x double> %A, i32 0
  %vecext1 = extractelement <2 x double> %A, i32 1
  %add = fadd double %vecext, %vecext1
  %vecinit = insertelement <2 x double> undef, double %add, i32 0
  %vecext2 = extractelement <2 x double> %B, i32 0
  %vecext3 = extractelement <2 x double> %B, i32 1
  %add2 = fadd double %vecext2, %vecext3
  %vecinit2 = insertelement <2 x double> %vecinit, double %add2, i32 1
  ret <2 x double> %vecinit2
}

define <2 x double> @hadd_pd_test2(<2 x double> %A, <2 x double> %B) {
; SSE-LABEL: hadd_pd_test2:
; SSE: # %bb.0:
; SSE-NEXT: haddpd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: hadd_pd_test2:
; AVX: # %bb.0:
; AVX-NEXT: vhaddpd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %vecext = extractelement <2 x double> %A, i32 1
  %vecext1 = extractelement <2 x double> %A, i32 0
  %add = fadd double %vecext, %vecext1
  %vecinit = insertelement <2 x double> undef, double %add, i32 0
  %vecext2 = extractelement <2 x double> %B, i32 1
  %vecext3 = extractelement <2 x double> %B, i32 0
  %add2 = fadd double %vecext2, %vecext3
  %vecinit2 = insertelement <2 x double> %vecinit, double %add2, i32 1
  ret <2 x double> %vecinit2
}

define <2 x double> @hsub_pd_test1(<2 x double> %A, <2 x double> %B) {
; SSE-LABEL: hsub_pd_test1:
; SSE: # %bb.0:
; SSE-NEXT: hsubpd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: hsub_pd_test1:
; AVX: # %bb.0:
; AVX-NEXT: vhsubpd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %vecext = extractelement <2 x double> %A, i32 0
  %vecext1 = extractelement <2 x double> %A, i32 1
  %sub = fsub double %vecext, %vecext1
  %vecinit = insertelement <2 x double> undef, double %sub, i32 0
  %vecext2 = extractelement <2 x double> %B, i32 0
  %vecext3 = extractelement <2 x double> %B, i32 1
  %sub2 = fsub double %vecext2, %vecext3
  %vecinit2 = insertelement <2 x double> %vecinit, double %sub2, i32 1
  ret <2 x double> %vecinit2
}

define <2 x double> @hsub_pd_test2(<2 x double> %A, <2 x double> %B) {
; SSE-LABEL: hsub_pd_test2:
; SSE: # %bb.0:
; SSE-NEXT: hsubpd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: hsub_pd_test2:
; AVX: # %bb.0:
; AVX-NEXT: vhsubpd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %vecext = extractelement <2 x double> %B, i32 0
  %vecext1 = extractelement <2 x double> %B, i32 1
  %sub = fsub double %vecext, %vecext1
  %vecinit = insertelement <2 x double> undef, double %sub, i32 1
  %vecext2 = extractelement <2 x double> %A, i32 0
  %vecext3 = extractelement <2 x double> %A, i32 1
  %sub2 = fsub double %vecext2, %vecext3
  %vecinit2 = insertelement <2 x double> %vecinit, double %sub2, i32 0
  ret <2 x double> %vecinit2
}

define <4 x double> @avx_vhadd_pd_test(<4 x double> %A, <4 x double> %B) {
; SSE-LABEL: avx_vhadd_pd_test:
; SSE: # %bb.0:
; SSE-NEXT: haddpd %xmm1, %xmm0
; SSE-NEXT: haddpd %xmm3, %xmm2
; SSE-NEXT: movapd %xmm2, %xmm1
; SSE-NEXT: retq
;
; AVX-LABEL: avx_vhadd_pd_test:
; AVX: # %bb.0:
; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX-NEXT: vhaddpd %xmm2, %xmm1, %xmm1
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX-NEXT: vhaddpd %xmm2, %xmm0, %xmm0
; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX-NEXT: retq
  %vecext = extractelement <4 x double> %A, i32 0
  %vecext1 = extractelement <4 x double> %A, i32 1
  %add = fadd double %vecext, %vecext1
  %vecinit = insertelement <4 x double> undef, double %add, i32 0
  %vecext2 = extractelement <4 x double> %A, i32 2
  %vecext3 = extractelement <4 x double> %A, i32 3
  %add4 = fadd double %vecext2, %vecext3
  %vecinit5 = insertelement <4 x double> %vecinit, double %add4, i32 1
  %vecext6 = extractelement <4 x double> %B, i32 0
  %vecext7 = extractelement <4 x double> %B, i32 1
  %add8 = fadd double %vecext6, %vecext7
  %vecinit9 = insertelement <4 x double> %vecinit5, double %add8, i32 2
  %vecext10 = extractelement <4 x double> %B, i32 2
  %vecext11 = extractelement <4 x double> %B, i32 3
  %add12 = fadd double %vecext10, %vecext11
  %vecinit13 = insertelement <4 x double> %vecinit9, double %add12, i32 3
  ret <4 x double> %vecinit13
}

define <4 x double> @avx_vhsub_pd_test(<4 x double> %A, <4 x double> %B) {
; SSE-LABEL: avx_vhsub_pd_test:
; SSE: # %bb.0:
; SSE-NEXT: hsubpd %xmm1, %xmm0
; SSE-NEXT: hsubpd %xmm3, %xmm2
; SSE-NEXT: movapd %xmm2, %xmm1
; SSE-NEXT: retq
;
; AVX-LABEL: avx_vhsub_pd_test:
; AVX: # %bb.0:
; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX-NEXT: vhsubpd %xmm2, %xmm1, %xmm1
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX-NEXT: vhsubpd %xmm2, %xmm0, %xmm0
; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX-NEXT: retq
  %vecext = extractelement <4 x double> %A, i32 0
  %vecext1 = extractelement <4 x double> %A, i32 1
  %sub = fsub double %vecext, %vecext1
  %vecinit = insertelement <4 x double> undef, double %sub, i32 0
  %vecext2 = extractelement <4 x double> %A, i32 2
  %vecext3 = extractelement <4 x double> %A, i32 3
  %sub4 = fsub double %vecext2, %vecext3
  %vecinit5 = insertelement <4 x double> %vecinit, double %sub4, i32 1
  %vecext6 = extractelement <4 x double> %B, i32 0
  %vecext7 = extractelement <4 x double> %B, i32 1
  %sub8 = fsub double %vecext6, %vecext7
  %vecinit9 = insertelement <4 x double> %vecinit5, double %sub8, i32 2
  %vecext10 = extractelement <4 x double> %B, i32 2
  %vecext11 = extractelement <4 x double> %B, i32 3
  %sub12 = fsub double %vecext10, %vecext11
  %vecinit13 = insertelement <4 x double> %vecinit9, double %sub12, i32 3
  ret <4 x double> %vecinit13
}

define <8 x i32> @avx2_vphadd_d_test(<8 x i32> %A, <8 x i32> %B) {
; SSE3-LABEL: avx2_vphadd_d_test:
; SSE3: # %bb.0:
; SSE3-NEXT: movd %xmm0, %ecx
; SSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,2,3]
; SSE3-NEXT: movd %xmm4, %r8d
; SSE3-NEXT: addl %ecx, %r8d
; SSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
; SSE3-NEXT: movd %xmm4, %edx
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; SSE3-NEXT: movd %xmm0, %r9d
; SSE3-NEXT: addl %edx, %r9d
; SSE3-NEXT: movd %xmm1, %edx
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE3-NEXT: movd %xmm0, %esi
; SSE3-NEXT: addl %edx, %esi
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; SSE3-NEXT: movd %xmm0, %edx
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3]
; SSE3-NEXT: movd %xmm0, %edi
; SSE3-NEXT: addl %edx, %edi
; SSE3-NEXT: movd %xmm2, %eax
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
; SSE3-NEXT: movd %xmm0, %r10d
; SSE3-NEXT: addl %eax, %r10d
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
; SSE3-NEXT: movd %xmm0, %eax
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[3,1,2,3]
; SSE3-NEXT: movd %xmm0, %ecx
; SSE3-NEXT: addl %eax, %ecx
; SSE3-NEXT: movd %xmm3, %eax
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
; SSE3-NEXT: movd %xmm0, %edx
; SSE3-NEXT: addl %eax, %edx
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,0,1]
; SSE3-NEXT: movd %xmm0, %r11d
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[3,1,2,3]
; SSE3-NEXT: movd %xmm0, %eax
; SSE3-NEXT: addl %r11d, %eax
; SSE3-NEXT: movd %edi, %xmm0
; SSE3-NEXT: movd %esi, %xmm1
; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE3-NEXT: movd %r9d, %xmm2
; SSE3-NEXT: movd %r8d, %xmm0
; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE3-NEXT: movd %eax, %xmm1
; SSE3-NEXT: movd %edx, %xmm2
; SSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; SSE3-NEXT: movd %ecx, %xmm3
; SSE3-NEXT: movd %r10d, %xmm1
; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
; SSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; SSE3-NEXT: retq
;
; SSSE3-LABEL: avx2_vphadd_d_test:
; SSSE3: # %bb.0:
; SSSE3-NEXT: phaddd %xmm1, %xmm0
; SSSE3-NEXT: phaddd %xmm3, %xmm2
; SSSE3-NEXT: movdqa %xmm2, %xmm1
; SSSE3-NEXT: retq
;
; AVX1-LABEL: avx2_vphadd_d_test:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vphaddd %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vphaddd %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: avx2_vphadd_d_test:
; AVX2: # %bb.0:
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX2-NEXT: vphaddd %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX2-NEXT: vphaddd %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: retq
  %vecext = extractelement <8 x i32> %A, i32 0
  %vecext1 = extractelement <8 x i32> %A, i32 1
  %add = add i32 %vecext, %vecext1
  %vecinit = insertelement <8 x i32> undef, i32 %add, i32 0
  %vecext2 = extractelement <8 x i32> %A, i32 2
  %vecext3 = extractelement <8 x i32> %A, i32 3
  %add4 = add i32 %vecext2, %vecext3
  %vecinit5 = insertelement <8 x i32> %vecinit, i32 %add4, i32 1
  %vecext6 = extractelement <8 x i32> %A, i32 4
  %vecext7 = extractelement <8 x i32> %A, i32 5
  %add8 = add i32 %vecext6, %vecext7
  %vecinit9 = insertelement <8 x i32> %vecinit5, i32 %add8, i32 2
  %vecext10 = extractelement <8 x i32> %A, i32 6
  %vecext11 = extractelement <8 x i32> %A, i32 7
  %add12 = add i32 %vecext10, %vecext11
  %vecinit13 = insertelement <8 x i32> %vecinit9, i32 %add12, i32 3
  %vecext14 = extractelement <8 x i32> %B, i32 0
  %vecext15 = extractelement <8 x i32> %B, i32 1
  %add16 = add i32 %vecext14, %vecext15
  %vecinit17 = insertelement <8 x i32> %vecinit13, i32 %add16, i32 4
  %vecext18 = extractelement <8 x i32> %B, i32 2
  %vecext19 = extractelement <8 x i32> %B, i32 3
  %add20 = add i32 %vecext18, %vecext19
  %vecinit21 = insertelement <8 x i32> %vecinit17, i32 %add20, i32 5
  %vecext22 = extractelement <8 x i32> %B, i32 4
  %vecext23 = extractelement <8 x i32> %B, i32 5
  %add24 = add i32 %vecext22, %vecext23
  %vecinit25 = insertelement <8 x i32> %vecinit21, i32 %add24, i32 6
  %vecext26 = extractelement <8 x i32> %B, i32 6
  %vecext27 = extractelement <8 x i32> %B, i32 7
  %add28 = add i32 %vecext26, %vecext27
  %vecinit29 = insertelement <8 x i32> %vecinit25, i32 %add28, i32 7
  ret <8 x i32> %vecinit29
}

define <16 x i16> @avx2_vphadd_w_test(<16 x i16> %a, <16 x i16> %b) {
; SSE3-LABEL: avx2_vphadd_w_test:
; SSE3: # %bb.0:
; SSE3-NEXT: pushq %rbp
; SSE3-NEXT: .cfi_def_cfa_offset 16
; SSE3-NEXT: pushq %r15
; SSE3-NEXT: .cfi_def_cfa_offset 24
; SSE3-NEXT: pushq %r14
; SSE3-NEXT: .cfi_def_cfa_offset 32
; SSE3-NEXT: pushq %r13
; SSE3-NEXT: .cfi_def_cfa_offset 40
; SSE3-NEXT: pushq %r12
; SSE3-NEXT: .cfi_def_cfa_offset 48
; SSE3-NEXT: pushq %rbx
; SSE3-NEXT: .cfi_def_cfa_offset 56
; SSE3-NEXT: .cfi_offset %rbx, -56
; SSE3-NEXT: .cfi_offset %r12, -48
; SSE3-NEXT: .cfi_offset %r13, -40
; SSE3-NEXT: .cfi_offset %r14, -32
; SSE3-NEXT: .cfi_offset %r15, -24
; SSE3-NEXT: .cfi_offset %rbp, -16
; SSE3-NEXT: movd %xmm0, %eax
; SSE3-NEXT: pextrw $1, %xmm0, %ecx
; SSE3-NEXT: addl %eax, %ecx
; SSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; SSE3-NEXT: pextrw $2, %xmm0, %eax
; SSE3-NEXT: pextrw $3, %xmm0, %ecx
; SSE3-NEXT: addl %eax, %ecx
; SSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; SSE3-NEXT: pextrw $4, %xmm0, %eax
; SSE3-NEXT: pextrw $5, %xmm0, %r11d
; SSE3-NEXT: addl %eax, %r11d
; SSE3-NEXT: pextrw $6, %xmm0, %eax
; SSE3-NEXT: pextrw $7, %xmm0, %r15d
; SSE3-NEXT: addl %eax, %r15d
; SSE3-NEXT: movd %xmm1, %eax
; SSE3-NEXT: pextrw $1, %xmm1, %r13d
; SSE3-NEXT: addl %eax, %r13d
; SSE3-NEXT: pextrw $2, %xmm1, %eax
; SSE3-NEXT: pextrw $3, %xmm1, %ebx
; SSE3-NEXT: addl %eax, %ebx
; SSE3-NEXT: pextrw $4, %xmm1, %eax
; SSE3-NEXT: pextrw $5, %xmm1, %r8d
; SSE3-NEXT: addl %eax, %r8d
; SSE3-NEXT: pextrw $6, %xmm1, %eax
; SSE3-NEXT: pextrw $7, %xmm1, %esi
; SSE3-NEXT: addl %eax, %esi
; SSE3-NEXT: movd %xmm2, %eax
; SSE3-NEXT: pextrw $1, %xmm2, %r10d
; SSE3-NEXT: addl %eax, %r10d
; SSE3-NEXT: pextrw $2, %xmm2, %eax
; SSE3-NEXT: pextrw $3, %xmm2, %r14d
; SSE3-NEXT: addl %eax, %r14d
; SSE3-NEXT: pextrw $4, %xmm2, %eax
; SSE3-NEXT: pextrw $5, %xmm2, %r12d
; SSE3-NEXT: addl %eax, %r12d
; SSE3-NEXT: pextrw $6, %xmm2, %eax
; SSE3-NEXT: pextrw $7, %xmm2, %r9d
; SSE3-NEXT: addl %eax, %r9d
; SSE3-NEXT: movd %xmm3, %eax
; SSE3-NEXT: pextrw $1, %xmm3, %ebp
; SSE3-NEXT: addl %eax, %ebp
; SSE3-NEXT: pextrw $2, %xmm3, %edx
; SSE3-NEXT: pextrw $3, %xmm3, %edi
; SSE3-NEXT: addl %edx, %edi
; SSE3-NEXT: pextrw $4, %xmm3, %edx
; SSE3-NEXT: pextrw $5, %xmm3, %ecx
; SSE3-NEXT: addl %edx, %ecx
; SSE3-NEXT: pextrw $6, %xmm3, %edx
; SSE3-NEXT: pextrw $7, %xmm3, %eax
; SSE3-NEXT: addl %edx, %eax
; SSE3-NEXT: movd %esi, %xmm8
; SSE3-NEXT: movd %r8d, %xmm3
; SSE3-NEXT: movd %ebx, %xmm9
; SSE3-NEXT: movd %r13d, %xmm4
; SSE3-NEXT: movd %r15d, %xmm10
; SSE3-NEXT: movd %r11d, %xmm7
; SSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 4-byte Folded Reload
; SSE3-NEXT: # xmm11 = mem[0],zero,zero,zero
; SSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE3-NEXT: # xmm0 = mem[0],zero,zero,zero
; SSE3-NEXT: movd %eax, %xmm12
; SSE3-NEXT: movd %ecx, %xmm6
; SSE3-NEXT: movd %edi, %xmm13
; SSE3-NEXT: movd %ebp, %xmm5
; SSE3-NEXT: movd %r9d, %xmm14
; SSE3-NEXT: movd %r12d, %xmm2
; SSE3-NEXT: movd %r14d, %xmm15
; SSE3-NEXT: movd %r10d, %xmm1
; SSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3]
; SSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3]
; SSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
; SSE3-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3]
; SSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3]
; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1]
; SSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0]
; SSE3-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1],xmm6[2],xmm12[2],xmm6[3],xmm12[3]
; SSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm13[0],xmm5[1],xmm13[1],xmm5[2],xmm13[2],xmm5[3],xmm13[3]
; SSE3-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
; SSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3]
; SSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3]
; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm5[0]
; SSE3-NEXT: popq %rbx
; SSE3-NEXT: .cfi_def_cfa_offset 48
; SSE3-NEXT: popq %r12
; SSE3-NEXT: .cfi_def_cfa_offset 40
; SSE3-NEXT: popq %r13
; SSE3-NEXT: .cfi_def_cfa_offset 32
; SSE3-NEXT: popq %r14
; SSE3-NEXT: .cfi_def_cfa_offset 24
; SSE3-NEXT: popq %r15
; SSE3-NEXT: .cfi_def_cfa_offset 16
; SSE3-NEXT: popq %rbp
; SSE3-NEXT: .cfi_def_cfa_offset 8
; SSE3-NEXT: retq
;
; SSSE3-LABEL: avx2_vphadd_w_test:
; SSSE3: # %bb.0:
; SSSE3-NEXT: phaddw %xmm1, %xmm0
; SSSE3-NEXT: phaddw %xmm3, %xmm2
; SSSE3-NEXT: movdqa %xmm2, %xmm1
; SSSE3-NEXT: retq
;
; AVX1-LABEL: avx2_vphadd_w_test:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vphaddw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vphaddw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: avx2_vphadd_w_test:
; AVX2: # %bb.0:
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX2-NEXT: vphaddw %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX2-NEXT: vphaddw %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: retq
  %vecext = extractelement <16 x i16> %a, i32 0
  %vecext1 = extractelement <16 x i16> %a, i32 1
  %add = add i16 %vecext, %vecext1
  %vecinit = insertelement <16 x i16> undef, i16 %add, i32 0
  %vecext4 = extractelement <16 x i16> %a, i32 2
  %vecext6 = extractelement <16 x i16> %a, i32 3
  %add8 = add i16 %vecext4, %vecext6
  %vecinit10 = insertelement <16 x i16> %vecinit, i16 %add8, i32 1
  %vecext11 = extractelement <16 x i16> %a, i32 4
  %vecext13 = extractelement <16 x i16> %a, i32 5
  %add15 = add i16 %vecext11, %vecext13
  %vecinit17 = insertelement <16 x i16> %vecinit10, i16 %add15, i32 2
  %vecext18 = extractelement <16 x i16> %a, i32 6
  %vecext20 = extractelement <16 x i16> %a, i32 7
  %add22 = add i16 %vecext18, %vecext20
  %vecinit24 = insertelement <16 x i16> %vecinit17, i16 %add22, i32 3
  %vecext25 = extractelement <16 x i16> %a, i32 8
  %vecext27 = extractelement <16 x i16> %a, i32 9
  %add29 = add i16 %vecext25, %vecext27
  %vecinit31 = insertelement <16 x i16> %vecinit24, i16 %add29, i32 4
  %vecext32 = extractelement <16 x i16> %a, i32 10
  %vecext34 = extractelement <16 x i16> %a, i32 11
  %add36 = add i16 %vecext32, %vecext34
  %vecinit38 = insertelement <16 x i16> %vecinit31, i16 %add36, i32 5
  %vecext39 = extractelement <16 x i16> %a, i32 12
  %vecext41 = extractelement <16 x i16> %a, i32 13
  %add43 = add i16 %vecext39, %vecext41
  %vecinit45 = insertelement <16 x i16> %vecinit38, i16 %add43, i32 6
  %vecext46 = extractelement <16 x i16> %a, i32 14
  %vecext48 = extractelement <16 x i16> %a, i32 15
  %add50 = add i16 %vecext46, %vecext48
  %vecinit52 = insertelement <16 x i16> %vecinit45, i16 %add50, i32 7
  %vecext53 = extractelement <16 x i16> %b, i32 0
  %vecext55 = extractelement <16 x i16> %b, i32 1
  %add57 = add i16 %vecext53, %vecext55
  %vecinit59 = insertelement <16 x i16> %vecinit52, i16 %add57, i32 8
  %vecext60 = extractelement <16 x i16> %b, i32 2
  %vecext62 = extractelement <16 x i16> %b, i32 3
  %add64 = add i16 %vecext60, %vecext62
  %vecinit66 = insertelement <16 x i16> %vecinit59, i16 %add64, i32 9
  %vecext67 = extractelement <16 x i16> %b, i32 4
  %vecext69 = extractelement <16 x i16> %b, i32 5
  %add71 = add i16 %vecext67, %vecext69
  %vecinit73 = insertelement <16 x i16> %vecinit66, i16 %add71, i32 10
  %vecext74 = extractelement <16 x i16> %b, i32 6
  %vecext76 = extractelement <16 x i16> %b, i32 7
  %add78 = add i16 %vecext74, %vecext76
  %vecinit80 = insertelement <16 x i16> %vecinit73, i16 %add78, i32 11
  %vecext81 = extractelement <16 x i16> %b, i32 8
  %vecext83 = extractelement <16 x i16> %b, i32 9
  %add85 = add i16 %vecext81, %vecext83
  %vecinit87 = insertelement <16 x i16> %vecinit80, i16 %add85, i32 12
  %vecext88 = extractelement <16 x i16> %b, i32 10
  %vecext90 = extractelement <16 x i16> %b, i32 11
  %add92 = add i16 %vecext88, %vecext90
  %vecinit94 = insertelement <16 x i16> %vecinit87, i16 %add92, i32 13
  %vecext95 = extractelement <16 x i16> %b, i32 12
  %vecext97 = extractelement <16 x i16> %b, i32 13
  %add99 = add i16 %vecext95, %vecext97
  %vecinit101 = insertelement <16 x i16> %vecinit94, i16 %add99, i32 14
  %vecext102 = extractelement <16 x i16> %b, i32 14
  %vecext104 = extractelement <16 x i16> %b, i32 15
  %add106 = add i16 %vecext102, %vecext104
  %vecinit108 = insertelement <16 x i16> %vecinit101, i16 %add106, i32 15
  ret <16 x i16> %vecinit108
}

; Verify that we don't select horizontal subs in the following functions.
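; In each of the tests below at least one element pair is subtracted in
; reversed order (e.g. B[1]-B[0] rather than B[0]-B[1]); since subtraction is
; not commutative, hsubps/hsubpd/phsubd would compute the wrong values here.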

define <4 x i32> @not_a_hsub_1(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: not_a_hsub_1:
; SSE: # %bb.0:
; SSE-NEXT: movd %xmm0, %eax
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
; SSE-NEXT: movd %xmm2, %ecx
; SSE-NEXT: subl %ecx, %eax
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
; SSE-NEXT: movd %xmm2, %ecx
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; SSE-NEXT: movd %xmm0, %edx
; SSE-NEXT: subl %edx, %ecx
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE-NEXT: movd %xmm0, %edx
; SSE-NEXT: movd %xmm1, %esi
; SSE-NEXT: subl %esi, %edx
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3]
; SSE-NEXT: movd %xmm0, %esi
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; SSE-NEXT: movd %xmm0, %edi
; SSE-NEXT: subl %edi, %esi
; SSE-NEXT: movd %esi, %xmm0
; SSE-NEXT: movd %edx, %xmm1
; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE-NEXT: movd %ecx, %xmm2
; SSE-NEXT: movd %eax, %xmm0
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT: retq
;
; AVX-LABEL: not_a_hsub_1:
; AVX: # %bb.0:
; AVX-NEXT: vmovd %xmm0, %eax
; AVX-NEXT: vpextrd $1, %xmm0, %ecx
; AVX-NEXT: subl %ecx, %eax
; AVX-NEXT: vpextrd $2, %xmm0, %ecx
; AVX-NEXT: vpextrd $3, %xmm0, %edx
; AVX-NEXT: subl %edx, %ecx
; AVX-NEXT: vpextrd $1, %xmm1, %edx
; AVX-NEXT: vmovd %xmm1, %esi
; AVX-NEXT: subl %esi, %edx
; AVX-NEXT: vpextrd $3, %xmm1, %esi
; AVX-NEXT: vpextrd $2, %xmm1, %edi
; AVX-NEXT: subl %edi, %esi
; AVX-NEXT: vmovd %eax, %xmm0
; AVX-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0
; AVX-NEXT: vpinsrd $2, %edx, %xmm0, %xmm0
; AVX-NEXT: vpinsrd $3, %esi, %xmm0, %xmm0
; AVX-NEXT: retq
  %vecext = extractelement <4 x i32> %A, i32 0
  %vecext1 = extractelement <4 x i32> %A, i32 1
  %sub = sub i32 %vecext, %vecext1
  %vecinit = insertelement <4 x i32> undef, i32 %sub, i32 0
  %vecext2 = extractelement <4 x i32> %A, i32 2
  %vecext3 = extractelement <4 x i32> %A, i32 3
  %sub4 = sub i32 %vecext2, %vecext3
  %vecinit5 = insertelement <4 x i32> %vecinit, i32 %sub4, i32 1
  %vecext6 = extractelement <4 x i32> %B, i32 1
  %vecext7 = extractelement <4 x i32> %B, i32 0
  %sub8 = sub i32 %vecext6, %vecext7
  %vecinit9 = insertelement <4 x i32> %vecinit5, i32 %sub8, i32 2
  %vecext10 = extractelement <4 x i32> %B, i32 3
  %vecext11 = extractelement <4 x i32> %B, i32 2
  %sub12 = sub i32 %vecext10, %vecext11
  %vecinit13 = insertelement <4 x i32> %vecinit9, i32 %sub12, i32 3
  ret <4 x i32> %vecinit13
}

define <4 x float> @not_a_hsub_2(<4 x float> %A, <4 x float> %B) {
; SSE-LABEL: not_a_hsub_2:
; SSE: # %bb.0:
; SSE-NEXT: movaps %xmm0, %xmm2
; SSE-NEXT: movhlps {{.*#+}} xmm2 = xmm0[1],xmm2[1]
; SSE-NEXT: movaps %xmm0, %xmm3
; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1],xmm0[2,3]
; SSE-NEXT: subss %xmm3, %xmm2
; SSE-NEXT: movshdup {{.*#+}} xmm3 = xmm0[1,1,3,3]
; SSE-NEXT: subss %xmm3, %xmm0
; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE-NEXT: movaps %xmm1, %xmm2
; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1],xmm1[2,3]
; SSE-NEXT: movaps %xmm1, %xmm3
; SSE-NEXT: movhlps {{.*#+}} xmm3 = xmm1[1],xmm3[1]
; SSE-NEXT: subss %xmm3, %xmm2
; SSE-NEXT: movshdup {{.*#+}} xmm3 = xmm1[1,1,3,3]
; SSE-NEXT: subss %xmm3, %xmm1
; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT: retq
;
; AVX-LABEL: not_a_hsub_2:
; AVX: # %bb.0:
; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX-NEXT: vpermilps {{.*#+}} xmm3 = xmm0[3,1,2,3]
; AVX-NEXT: vsubss %xmm3, %xmm2, %xmm2
; AVX-NEXT: vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3]
; AVX-NEXT: vsubss %xmm3, %xmm0, %xmm0
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3]
; AVX-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,1,2,3]
; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0]
; AVX-NEXT: vsubss %xmm3, %xmm2, %xmm2
; AVX-NEXT: vmovshdup {{.*#+}} xmm3 = xmm1[1,1,3,3]
; AVX-NEXT: vsubss %xmm3, %xmm1, %xmm1
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[0]
; AVX-NEXT: retq
  %vecext = extractelement <4 x float> %A, i32 2
  %vecext1 = extractelement <4 x float> %A, i32 3
  %sub = fsub float %vecext, %vecext1
  %vecinit = insertelement <4 x float> undef, float %sub, i32 1
  %vecext2 = extractelement <4 x float> %A, i32 0
  %vecext3 = extractelement <4 x float> %A, i32 1
  %sub4 = fsub float %vecext2, %vecext3
  %vecinit5 = insertelement <4 x float> %vecinit, float %sub4, i32 0
  %vecext6 = extractelement <4 x float> %B, i32 3
  %vecext7 = extractelement <4 x float> %B, i32 2
  %sub8 = fsub float %vecext6, %vecext7
  %vecinit9 = insertelement <4 x float> %vecinit5, float %sub8, i32 3
  %vecext10 = extractelement <4 x float> %B, i32 0
  %vecext11 = extractelement <4 x float> %B, i32 1
  %sub12 = fsub float %vecext10, %vecext11
  %vecinit13 = insertelement <4 x float> %vecinit9, float %sub12, i32 2
  ret <4 x float> %vecinit13
}

define <2 x double> @not_a_hsub_3(<2 x double> %A, <2 x double> %B) {
; SSE-LABEL: not_a_hsub_3:
; SSE: # %bb.0:
; SSE-NEXT: movaps %xmm1, %xmm2
; SSE-NEXT: movhlps {{.*#+}} xmm2 = xmm1[1],xmm2[1]
; SSE-NEXT: subsd %xmm2, %xmm1
; SSE-NEXT: movaps %xmm0, %xmm2
; SSE-NEXT: movhlps {{.*#+}} xmm2 = xmm0[1],xmm2[1]
; SSE-NEXT: subsd %xmm0, %xmm2
; SSE-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm1[0]
; SSE-NEXT: movapd %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: not_a_hsub_3:
; AVX: # %bb.0:
; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-NEXT: vsubsd %xmm2, %xmm1, %xmm1
; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX-NEXT: vsubsd %xmm0, %xmm2, %xmm0
; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT: retq
  %vecext = extractelement <2 x double> %B, i32 0
  %vecext1 = extractelement <2 x double> %B, i32 1
  %sub = fsub double %vecext, %vecext1
  %vecinit = insertelement <2 x double> undef, double %sub, i32 1
  %vecext2 = extractelement <2 x double> %A, i32 1
  %vecext3 = extractelement <2 x double> %A, i32 0
  %sub2 = fsub double %vecext2, %vecext3
  %vecinit2 = insertelement <2 x double> %vecinit, double %sub2, i32 0
  ret <2 x double> %vecinit2
}

; Test AVX horizontal add/sub of packed single/double precision
; floating point values from 256-bit vectors.
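; In the tests below the pairwise results are laid out per 128-bit lane
; (a.lo, b.lo, a.hi, b.hi), matching the lane-wise semantics of the 256-bit
; vhaddps/vhsubps/vhaddpd/vhsubpd, so AVX can use a single ymm instruction.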

define <8 x float> @avx_vhadd_ps(<8 x float> %a, <8 x float> %b) {
; SSE-LABEL: avx_vhadd_ps:
; SSE: # %bb.0:
; SSE-NEXT: haddps %xmm2, %xmm0
; SSE-NEXT: haddps %xmm3, %xmm1
; SSE-NEXT: retq
;
; AVX-LABEL: avx_vhadd_ps:
; AVX: # %bb.0:
; AVX-NEXT: vhaddps %ymm1, %ymm0, %ymm0
; AVX-NEXT: retq
  %vecext = extractelement <8 x float> %a, i32 0
  %vecext1 = extractelement <8 x float> %a, i32 1
  %add = fadd float %vecext, %vecext1
  %vecinit = insertelement <8 x float> undef, float %add, i32 0
  %vecext2 = extractelement <8 x float> %a, i32 2
  %vecext3 = extractelement <8 x float> %a, i32 3
  %add4 = fadd float %vecext2, %vecext3
  %vecinit5 = insertelement <8 x float> %vecinit, float %add4, i32 1
  %vecext6 = extractelement <8 x float> %b, i32 0
  %vecext7 = extractelement <8 x float> %b, i32 1
  %add8 = fadd float %vecext6, %vecext7
  %vecinit9 = insertelement <8 x float> %vecinit5, float %add8, i32 2
  %vecext10 = extractelement <8 x float> %b, i32 2
  %vecext11 = extractelement <8 x float> %b, i32 3
  %add12 = fadd float %vecext10, %vecext11
  %vecinit13 = insertelement <8 x float> %vecinit9, float %add12, i32 3
  %vecext14 = extractelement <8 x float> %a, i32 4
  %vecext15 = extractelement <8 x float> %a, i32 5
  %add16 = fadd float %vecext14, %vecext15
  %vecinit17 = insertelement <8 x float> %vecinit13, float %add16, i32 4
  %vecext18 = extractelement <8 x float> %a, i32 6
  %vecext19 = extractelement <8 x float> %a, i32 7
  %add20 = fadd float %vecext18, %vecext19
  %vecinit21 = insertelement <8 x float> %vecinit17, float %add20, i32 5
  %vecext22 = extractelement <8 x float> %b, i32 4
  %vecext23 = extractelement <8 x float> %b, i32 5
  %add24 = fadd float %vecext22, %vecext23
  %vecinit25 = insertelement <8 x float> %vecinit21, float %add24, i32 6
  %vecext26 = extractelement <8 x float> %b, i32 6
  %vecext27 = extractelement <8 x float> %b, i32 7
  %add28 = fadd float %vecext26, %vecext27
  %vecinit29 = insertelement <8 x float> %vecinit25, float %add28, i32 7
  ret <8 x float> %vecinit29
}

define <8 x float> @avx_vhsub_ps(<8 x float> %a, <8 x float> %b) {
; SSE-LABEL: avx_vhsub_ps:
; SSE: # %bb.0:
; SSE-NEXT: hsubps %xmm2, %xmm0
; SSE-NEXT: hsubps %xmm3, %xmm1
; SSE-NEXT: retq
;
; AVX-LABEL: avx_vhsub_ps:
; AVX: # %bb.0:
; AVX-NEXT: vhsubps %ymm1, %ymm0, %ymm0
; AVX-NEXT: retq
  %vecext = extractelement <8 x float> %a, i32 0
  %vecext1 = extractelement <8 x float> %a, i32 1
  %sub = fsub float %vecext, %vecext1
  %vecinit = insertelement <8 x float> undef, float %sub, i32 0
  %vecext2 = extractelement <8 x float> %a, i32 2
  %vecext3 = extractelement <8 x float> %a, i32 3
  %sub4 = fsub float %vecext2, %vecext3
  %vecinit5 = insertelement <8 x float> %vecinit, float %sub4, i32 1
  %vecext6 = extractelement <8 x float> %b, i32 0
  %vecext7 = extractelement <8 x float> %b, i32 1
  %sub8 = fsub float %vecext6, %vecext7
  %vecinit9 = insertelement <8 x float> %vecinit5, float %sub8, i32 2
  %vecext10 = extractelement <8 x float> %b, i32 2
  %vecext11 = extractelement <8 x float> %b, i32 3
  %sub12 = fsub float %vecext10, %vecext11
  %vecinit13 = insertelement <8 x float> %vecinit9, float %sub12, i32 3
  %vecext14 = extractelement <8 x float> %a, i32 4
  %vecext15 = extractelement <8 x float> %a, i32 5
  %sub16 = fsub float %vecext14, %vecext15
  %vecinit17 = insertelement <8 x float> %vecinit13, float %sub16, i32 4
  %vecext18 = extractelement <8 x float> %a, i32 6
  %vecext19 = extractelement <8 x float> %a, i32 7
  %sub20 = fsub float %vecext18, %vecext19
  %vecinit21 = insertelement <8 x float> %vecinit17, float %sub20, i32 5
  %vecext22 = extractelement <8 x float> %b, i32 4
  %vecext23 = extractelement <8 x float> %b, i32 5
  %sub24 = fsub float %vecext22, %vecext23
  %vecinit25 = insertelement <8 x float> %vecinit21, float %sub24, i32 6
  %vecext26 = extractelement <8 x float> %b, i32 6
  %vecext27 = extractelement <8 x float> %b, i32 7
  %sub28 = fsub float %vecext26, %vecext27
  %vecinit29 = insertelement <8 x float> %vecinit25, float %sub28, i32 7
  ret <8 x float> %vecinit29
}

define <4 x double> @avx_hadd_pd(<4 x double> %a, <4 x double> %b) {
; SSE-LABEL: avx_hadd_pd:
; SSE: # %bb.0:
; SSE-NEXT: haddpd %xmm2, %xmm0
; SSE-NEXT: haddpd %xmm3, %xmm1
; SSE-NEXT: retq
;
; AVX-LABEL: avx_hadd_pd:
; AVX: # %bb.0:
; AVX-NEXT: vhaddpd %ymm1, %ymm0, %ymm0
; AVX-NEXT: retq
  %vecext = extractelement <4 x double> %a, i32 0
  %vecext1 = extractelement <4 x double> %a, i32 1
  %add = fadd double %vecext, %vecext1
  %vecinit = insertelement <4 x double> undef, double %add, i32 0
  %vecext2 = extractelement <4 x double> %b, i32 0
  %vecext3 = extractelement <4 x double> %b, i32 1
  %add4 = fadd double %vecext2, %vecext3
  %vecinit5 = insertelement <4 x double> %vecinit, double %add4, i32 1
  %vecext6 = extractelement <4 x double> %a, i32 2
  %vecext7 = extractelement <4 x double> %a, i32 3
  %add8 = fadd double %vecext6, %vecext7
  %vecinit9 = insertelement <4 x double> %vecinit5, double %add8, i32 2
  %vecext10 = extractelement <4 x double> %b, i32 2
  %vecext11 = extractelement <4 x double> %b, i32 3
  %add12 = fadd double %vecext10, %vecext11
  %vecinit13 = insertelement <4 x double> %vecinit9, double %add12, i32 3
  ret <4 x double> %vecinit13
}

define <4 x double> @avx_hsub_pd(<4 x double> %a, <4 x double> %b) {
; SSE-LABEL: avx_hsub_pd:
; SSE: # %bb.0:
; SSE-NEXT: hsubpd %xmm2, %xmm0
; SSE-NEXT: hsubpd %xmm3, %xmm1
; SSE-NEXT: retq
;
; AVX-LABEL: avx_hsub_pd:
; AVX: # %bb.0:
; AVX-NEXT: vhsubpd %ymm1, %ymm0, %ymm0
; AVX-NEXT: retq
  %vecext = extractelement <4 x double> %a, i32 0
  %vecext1 = extractelement <4 x double> %a, i32 1
  %sub = fsub double %vecext, %vecext1
  %vecinit = insertelement <4 x double> undef, double %sub, i32 0
  %vecext2 = extractelement <4 x double> %b, i32 0
  %vecext3 = extractelement <4 x double> %b, i32 1
  %sub4 = fsub double %vecext2, %vecext3
  %vecinit5 = insertelement <4 x double> %vecinit, double %sub4, i32 1
  %vecext6 = extractelement <4 x double> %a, i32 2
  %vecext7 = extractelement <4 x double> %a, i32 3
  %sub8 = fsub double %vecext6, %vecext7
  %vecinit9 = insertelement <4 x double> %vecinit5, double %sub8, i32 2
  %vecext10 = extractelement <4 x double> %b, i32 2
  %vecext11 = extractelement <4 x double> %b, i32 3
  %sub12 = fsub double %vecext10, %vecext11
  %vecinit13 = insertelement <4 x double> %vecinit9, double %sub12, i32 3
  ret <4 x double> %vecinit13
}

; Test AVX2 horizontal add of packed integer values from 256-bit vectors.
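; Unlike avx2_vphadd_d_test/avx2_vphadd_w_test above, the sums below are
; interleaved per 128-bit lane, so AVX2 can select a single vphaddd/vphaddw
; %ymm instead of splitting the vectors into xmm halves.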

define <8 x i32> @avx2_hadd_d(<8 x i32> %a, <8 x i32> %b) {
; SSE3-LABEL: avx2_hadd_d:
; SSE3: # %bb.0:
; SSE3-NEXT: movd %xmm0, %ecx
; SSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,2,3]
; SSE3-NEXT: movd %xmm4, %r8d
; SSE3-NEXT: addl %ecx, %r8d
; SSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
; SSE3-NEXT: movd %xmm4, %edx
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; SSE3-NEXT: movd %xmm0, %r9d
; SSE3-NEXT: addl %edx, %r9d
; SSE3-NEXT: movd %xmm2, %edx
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
; SSE3-NEXT: movd %xmm0, %esi
; SSE3-NEXT: addl %edx, %esi
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
; SSE3-NEXT: movd %xmm0, %edx
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[3,1,2,3]
; SSE3-NEXT: movd %xmm0, %edi
; SSE3-NEXT: addl %edx, %edi
; SSE3-NEXT: movd %xmm1, %eax
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE3-NEXT: movd %xmm0, %r10d
; SSE3-NEXT: addl %eax, %r10d
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; SSE3-NEXT: movd %xmm0, %eax
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3]
; SSE3-NEXT: movd %xmm0, %ecx
; SSE3-NEXT: addl %eax, %ecx
; SSE3-NEXT: movd %xmm3, %eax
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
; SSE3-NEXT: movd %xmm0, %edx
; SSE3-NEXT: addl %eax, %edx
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,0,1]
; SSE3-NEXT: movd %xmm0, %r11d
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[3,1,2,3]
; SSE3-NEXT: movd %xmm0, %eax
; SSE3-NEXT: addl %r11d, %eax
; SSE3-NEXT: movd %edi, %xmm0
; SSE3-NEXT: movd %esi, %xmm1
; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE3-NEXT: movd %r9d, %xmm2
; SSE3-NEXT: movd %r8d, %xmm0
; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE3-NEXT: movd %eax, %xmm1
; SSE3-NEXT: movd %edx, %xmm2
; SSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; SSE3-NEXT: movd %ecx, %xmm3
; SSE3-NEXT: movd %r10d, %xmm1
; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
; SSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; SSE3-NEXT: retq
;
; SSSE3-LABEL: avx2_hadd_d:
; SSSE3: # %bb.0:
; SSSE3-NEXT: phaddd %xmm2, %xmm0
; SSSE3-NEXT: phaddd %xmm3, %xmm1
; SSSE3-NEXT: retq
;
; AVX1-LABEL: avx2_hadd_d:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vphaddd %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vphaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: avx2_hadd_d:
; AVX2: # %bb.0:
; AVX2-NEXT: vphaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
  %vecext = extractelement <8 x i32> %a, i32 0
  %vecext1 = extractelement <8 x i32> %a, i32 1
  %add = add i32 %vecext, %vecext1
  %vecinit = insertelement <8 x i32> undef, i32 %add, i32 0
  %vecext2 = extractelement <8 x i32> %a, i32 2
  %vecext3 = extractelement <8 x i32> %a, i32 3
  %add4 = add i32 %vecext2, %vecext3
  %vecinit5 = insertelement <8 x i32> %vecinit, i32 %add4, i32 1
  %vecext6 = extractelement <8 x i32> %b, i32 0
  %vecext7 = extractelement <8 x i32> %b, i32 1
  %add8 = add i32 %vecext6, %vecext7
  %vecinit9 = insertelement <8 x i32> %vecinit5, i32 %add8, i32 2
  %vecext10 = extractelement <8 x i32> %b, i32 2
  %vecext11 = extractelement <8 x i32> %b, i32 3
  %add12 = add i32 %vecext10, %vecext11
  %vecinit13 = insertelement <8 x i32> %vecinit9, i32 %add12, i32 3
  %vecext14 = extractelement <8 x i32> %a, i32 4
  %vecext15 = extractelement <8 x i32> %a, i32 5
  %add16 = add i32 %vecext14, %vecext15
  %vecinit17 = insertelement <8 x i32> %vecinit13, i32 %add16, i32 4
  %vecext18 = extractelement <8 x i32> %a, i32 6
  %vecext19 = extractelement <8 x i32> %a, i32 7
  %add20 = add i32 %vecext18, %vecext19
  %vecinit21 = insertelement <8 x i32> %vecinit17, i32 %add20, i32 5
  %vecext22 = extractelement <8 x i32> %b, i32 4
  %vecext23 = extractelement <8 x i32> %b, i32 5
  %add24 = add i32 %vecext22, %vecext23
  %vecinit25 = insertelement <8 x i32> %vecinit21, i32 %add24, i32 6
  %vecext26 = extractelement <8 x i32> %b, i32 6
  %vecext27 = extractelement <8 x i32> %b, i32 7
  %add28 = add i32 %vecext26, %vecext27
  %vecinit29 = insertelement <8 x i32> %vecinit25, i32 %add28, i32 7
  ret <8 x i32> %vecinit29
}

define <16 x i16> @avx2_hadd_w(<16 x i16> %a, <16 x i16> %b) {
; SSE3-LABEL: avx2_hadd_w:
; SSE3: # %bb.0:
; SSE3-NEXT: pushq %rbp
; SSE3-NEXT: .cfi_def_cfa_offset 16
; SSE3-NEXT: pushq %r15
; SSE3-NEXT: .cfi_def_cfa_offset 24
; SSE3-NEXT: pushq %r14
; SSE3-NEXT: .cfi_def_cfa_offset 32
; SSE3-NEXT: pushq %r13
; SSE3-NEXT: .cfi_def_cfa_offset 40
; SSE3-NEXT: pushq %r12
; SSE3-NEXT: .cfi_def_cfa_offset 48
; SSE3-NEXT: pushq %rbx
; SSE3-NEXT: .cfi_def_cfa_offset 56
; SSE3-NEXT: .cfi_offset %rbx, -56
; SSE3-NEXT: .cfi_offset %r12, -48
; SSE3-NEXT: .cfi_offset %r13, -40
; SSE3-NEXT: .cfi_offset %r14, -32
; SSE3-NEXT: .cfi_offset %r15, -24
; SSE3-NEXT: .cfi_offset %rbp, -16
; SSE3-NEXT: movd %xmm0, %eax
; SSE3-NEXT: pextrw $1, %xmm0, %r10d
; SSE3-NEXT: addl %eax, %r10d
; SSE3-NEXT: pextrw $2, %xmm0, %eax
; SSE3-NEXT: pextrw $3, %xmm0, %r11d
; SSE3-NEXT: addl %eax, %r11d
; SSE3-NEXT: pextrw $4, %xmm0, %eax
; SSE3-NEXT: pextrw $5, %xmm0, %r12d
; SSE3-NEXT: addl %eax, %r12d
; SSE3-NEXT: pextrw $6, %xmm0, %eax
; SSE3-NEXT: pextrw $7, %xmm0, %r13d
; SSE3-NEXT: addl %eax, %r13d
; SSE3-NEXT: movd %xmm1, %eax
; SSE3-NEXT: pextrw $1, %xmm1, %ecx
; SSE3-NEXT: addl %eax, %ecx
; SSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; SSE3-NEXT: pextrw $2, %xmm1, %eax
; SSE3-NEXT: pextrw $3, %xmm1, %ecx
; SSE3-NEXT: addl %eax, %ecx
; SSE3-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; SSE3-NEXT: pextrw $4, %xmm1, %eax
; SSE3-NEXT: pextrw $5, %xmm1, %r14d
; SSE3-NEXT: addl %eax, %r14d
; SSE3-NEXT: pextrw $6, %xmm1, %esi
; SSE3-NEXT: pextrw $7, %xmm1, %r15d
; SSE3-NEXT: addl %esi, %r15d
; SSE3-NEXT: movd %xmm2, %esi
; SSE3-NEXT: pextrw $1, %xmm2, %ebp
; SSE3-NEXT: addl %esi, %ebp
; SSE3-NEXT: pextrw $2, %xmm2, %esi
; SSE3-NEXT: pextrw $3, %xmm2, %edi
; SSE3-NEXT: addl %esi, %edi
; SSE3-NEXT: pextrw $4, %xmm2, %esi
; SSE3-NEXT: pextrw $5, %xmm2, %eax
; SSE3-NEXT: addl %esi, %eax
; SSE3-NEXT: pextrw $6, %xmm2, %esi
; SSE3-NEXT: pextrw $7, %xmm2, %ecx
; SSE3-NEXT: addl %esi, %ecx
; SSE3-NEXT: movd %xmm3, %ebx
; SSE3-NEXT: pextrw $1, %xmm3, %r9d
; SSE3-NEXT: addl %ebx, %r9d
; SSE3-NEXT: pextrw $2, %xmm3, %edx
; SSE3-NEXT: pextrw $3, %xmm3, %ebx
; SSE3-NEXT: addl %edx, %ebx
; SSE3-NEXT: pextrw $4, %xmm3, %edx
; SSE3-NEXT: pextrw $5, %xmm3, %esi
; SSE3-NEXT: addl %edx, %esi
; SSE3-NEXT: pextrw $6, %xmm3, %r8d
; SSE3-NEXT: pextrw $7, %xmm3, %edx
; SSE3-NEXT: addl %r8d, %edx
; SSE3-NEXT: movd %ecx, %xmm8
; SSE3-NEXT: movd %eax, %xmm3
; SSE3-NEXT: movd %edi, %xmm9
; SSE3-NEXT: movd %ebp, %xmm4
; SSE3-NEXT: movd %r13d, %xmm10
; SSE3-NEXT: movd %r12d, %xmm7
; SSE3-NEXT: movd %r11d, %xmm11
; SSE3-NEXT: movd %r10d, %xmm0
; SSE3-NEXT: movd %edx, %xmm12
; SSE3-NEXT: movd %esi, %xmm6
; SSE3-NEXT: movd %ebx, %xmm13
; SSE3-NEXT: movd %r9d, %xmm5
; SSE3-NEXT: movd %r15d, %xmm14
; SSE3-NEXT: movd %r14d, %xmm2
; SSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 4-byte Folded Reload
; SSE3-NEXT: # xmm15 = mem[0],zero,zero,zero
; SSE3-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Folded Reload
; SSE3-NEXT: # xmm1 = mem[0],zero,zero,zero
; SSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3]
; SSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3]
; SSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
; SSE3-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3]
; SSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3]
; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1]
; SSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0]
; SSE3-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1],xmm6[2],xmm12[2],xmm6[3],xmm12[3]
; SSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm13[0],xmm5[1],xmm13[1],xmm5[2],xmm13[2],xmm5[3],xmm13[3]
; SSE3-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
; SSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3]
; SSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3]
; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm5[0]
; SSE3-NEXT: popq %rbx
; SSE3-NEXT: .cfi_def_cfa_offset 48
; SSE3-NEXT: popq %r12
; SSE3-NEXT: .cfi_def_cfa_offset 40
; SSE3-NEXT: popq %r13
; SSE3-NEXT: .cfi_def_cfa_offset 32
; SSE3-NEXT: popq %r14
; SSE3-NEXT: .cfi_def_cfa_offset 24
; SSE3-NEXT: popq %r15
; SSE3-NEXT: .cfi_def_cfa_offset 16
; SSE3-NEXT: popq %rbp
; SSE3-NEXT: .cfi_def_cfa_offset 8
; SSE3-NEXT: retq
;
; SSSE3-LABEL: avx2_hadd_w:
; SSSE3: # %bb.0:
; SSSE3-NEXT: phaddw %xmm2, %xmm0
; SSSE3-NEXT: phaddw %xmm3, %xmm1
; SSSE3-NEXT: retq
;
; AVX1-LABEL: avx2_hadd_w:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vphaddw %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vphaddw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: avx2_hadd_w:
; AVX2: # %bb.0:
; AVX2-NEXT: vphaddw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
  %vecext = extractelement <16 x i16> %a, i32 0
  %vecext1 = extractelement <16 x i16> %a, i32 1
  %add = add i16 %vecext, %vecext1
  %vecinit = insertelement <16 x i16> undef, i16 %add, i32 0
  %vecext4 = extractelement <16 x i16> %a, i32 2
  %vecext6 = extractelement <16 x i16> %a, i32 3
  %add8 = add i16 %vecext4, %vecext6
  %vecinit10 = insertelement <16 x i16> %vecinit, i16 %add8, i32 1
  %vecext11 = extractelement <16 x i16> %a, i32 4
  %vecext13 = extractelement <16 x i16> %a, i32 5
  %add15 = add i16 %vecext11, %vecext13
  %vecinit17 = insertelement <16 x i16> %vecinit10, i16 %add15, i32 2
  %vecext18 = extractelement <16 x i16> %a, i32 6
  %vecext20 = extractelement <16 x i16> %a, i32 7
  %add22 = add i16 %vecext18, %vecext20
  %vecinit24 = insertelement <16 x i16> %vecinit17, i16 %add22, i32 3
  %vecext25 = extractelement <16 x i16> %a, i32 8
  %vecext27 = extractelement <16 x i16> %a, i32 9
  %add29 = add i16 %vecext25, %vecext27
  %vecinit31 = insertelement <16 x i16> %vecinit24, i16 %add29, i32 8
  %vecext32 = extractelement <16 x i16> %a, i32 10
  %vecext34 = extractelement <16 x i16> %a, i32 11
  %add36 = add i16 %vecext32, %vecext34
  %vecinit38 = insertelement <16 x i16> %vecinit31, i16 %add36, i32 9
  %vecext39 = extractelement <16 x i16> %a, i32 12
  %vecext41 = extractelement <16 x i16> %a, i32 13
  %add43 = add i16 %vecext39, %vecext41
  %vecinit45 = insertelement <16 x i16> %vecinit38, i16 %add43, i32 10
  %vecext46 = extractelement <16 x i16> %a, i32 14
  %vecext48 = extractelement <16 x i16> %a, i32 15
  %add50 = add i16 %vecext46, %vecext48
  %vecinit52 = insertelement <16 x i16> %vecinit45, i16 %add50, i32 11
  %vecext53 = extractelement <16 x i16> %b, i32 0
  %vecext55 = extractelement <16 x i16> %b, i32 1
  %add57 = add i16 %vecext53, %vecext55
  %vecinit59 = insertelement <16 x i16> %vecinit52, i16 %add57, i32 4
  %vecext60 = extractelement <16 x i16> %b, i32 2
  %vecext62 = extractelement <16 x i16> %b, i32 3
  %add64 = add i16 %vecext60, %vecext62
  %vecinit66 = insertelement <16 x i16> %vecinit59, i16 %add64, i32 5
  %vecext67 = extractelement <16 x i16> %b, i32 4
  %vecext69 = extractelement <16 x i16> %b, i32 5
  %add71 = add i16 %vecext67, %vecext69
  %vecinit73 = insertelement <16 x i16> %vecinit66, i16 %add71, i32 6
  %vecext74 = extractelement <16 x i16> %b, i32 6
  %vecext76 = extractelement <16 x i16> %b, i32 7
  %add78 = add i16 %vecext74, %vecext76
  %vecinit80 = insertelement <16 x i16> %vecinit73, i16 %add78, i32 7
  %vecext81 = extractelement <16 x i16> %b, i32 8
  %vecext83 = extractelement <16 x i16> %b, i32 9
  %add85 = add i16 %vecext81, %vecext83
  %vecinit87 = insertelement <16 x i16> %vecinit80, i16 %add85, i32 12
  %vecext88 = extractelement <16 x i16> %b, i32 10
  %vecext90 = extractelement <16 x i16> %b, i32 11
  %add92 = add i16 %vecext88, %vecext90
  %vecinit94 = insertelement <16 x i16> %vecinit87, i16 %add92, i32 13
  %vecext95 = extractelement <16 x i16> %b, i32 12
  %vecext97 = extractelement <16 x i16> %b, i32 13
  %add99 = add i16 %vecext95, %vecext97
  %vecinit101 = insertelement <16 x i16> %vecinit94, i16 %add99, i32 14
  %vecext102 = extractelement <16 x i16> %b, i32 14
  %vecext104 = extractelement <16 x i16> %b, i32 15
  %add106 = add i16 %vecext102, %vecext104
  %vecinit108 = insertelement <16 x i16> %vecinit101, i16 %add106, i32 15
  ret <16 x i16> %vecinit108
}