; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2,+sse3 | FileCheck %s --check-prefix=SSE --check-prefix=SSE3
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2,+sse3,+ssse3 | FileCheck %s --check-prefix=SSE --check-prefix=SSSE3
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2

; Each test below builds a horizontal add/sub pattern out of scalar
; extractelement/insertelement operations and verifies that the backend
; folds it into a single (v)hadd/(v)hsub/(v)phadd/(v)phsub instruction.

define <4 x float> @hadd_ps_test1(<4 x float> %A, <4 x float> %B) {
; SSE-LABEL: hadd_ps_test1:
; SSE:       # BB#0:
; SSE-NEXT:    haddps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: hadd_ps_test1:
; AVX:       # BB#0:
; AVX-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %vecext = extractelement <4 x float> %A, i32 0
  %vecext1 = extractelement <4 x float> %A, i32 1
  %add = fadd float %vecext, %vecext1
  %vecinit = insertelement <4 x float> undef, float %add, i32 0
  %vecext2 = extractelement <4 x float> %A, i32 2
  %vecext3 = extractelement <4 x float> %A, i32 3
  %add4 = fadd float %vecext2, %vecext3
  %vecinit5 = insertelement <4 x float> %vecinit, float %add4, i32 1
  %vecext6 = extractelement <4 x float> %B, i32 0
  %vecext7 = extractelement <4 x float> %B, i32 1
  %add8 = fadd float %vecext6, %vecext7
  %vecinit9 = insertelement <4 x float> %vecinit5, float %add8, i32 2
  %vecext10 = extractelement <4 x float> %B, i32 2
  %vecext11 = extractelement <4 x float> %B, i32 3
  %add12 = fadd float %vecext10, %vecext11
  %vecinit13 = insertelement <4 x float> %vecinit9, float %add12, i32 3
  ret <4 x float> %vecinit13
}

define <4 x float> @hadd_ps_test2(<4 x float> %A, <4 x float> %B) {
; SSE-LABEL: hadd_ps_test2:
; SSE:       # BB#0:
; SSE-NEXT:    haddps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: hadd_ps_test2:
; AVX:       # BB#0:
; AVX-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %vecext = extractelement <4 x float> %A, i32 2
  %vecext1 = extractelement <4 x float> %A, i32 3
  %add = fadd float %vecext, %vecext1
  %vecinit = insertelement <4 x float> undef, float %add, i32 1
  %vecext2 = extractelement <4 x float> %A, i32 0
  %vecext3 = extractelement <4 x float> %A, i32 1
  %add4 = fadd float %vecext2, %vecext3
  %vecinit5 = insertelement <4 x float> %vecinit, float %add4, i32 0
  %vecext6 = extractelement <4 x float> %B, i32 2
  %vecext7 = extractelement <4 x float> %B, i32 3
  %add8 = fadd float %vecext6, %vecext7
  %vecinit9 = insertelement <4 x float> %vecinit5, float %add8, i32 3
  %vecext10 = extractelement <4 x float> %B, i32 0
  %vecext11 = extractelement <4 x float> %B, i32 1
  %add12 = fadd float %vecext10, %vecext11
  %vecinit13 = insertelement <4 x float> %vecinit9, float %add12, i32 2
  ret <4 x float> %vecinit13
}

define <4 x float> @hsub_ps_test1(<4 x float> %A, <4 x float> %B) {
; SSE-LABEL: hsub_ps_test1:
; SSE:       # BB#0:
; SSE-NEXT:    hsubps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: hsub_ps_test1:
; AVX:       # BB#0:
; AVX-NEXT:    vhsubps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %vecext = extractelement <4 x float> %A, i32 0
  %vecext1 = extractelement <4 x float> %A, i32 1
  %sub = fsub float %vecext, %vecext1
  %vecinit = insertelement <4 x float> undef, float %sub, i32 0
  %vecext2 = extractelement <4 x float> %A, i32 2
  %vecext3 = extractelement <4 x float> %A, i32 3
  %sub4 = fsub float %vecext2, %vecext3
  %vecinit5 = insertelement <4 x float> %vecinit, float %sub4, i32 1
  %vecext6 = extractelement <4 x float> %B, i32 0
  %vecext7 = extractelement <4 x float> %B, i32 1
  %sub8 = fsub float %vecext6, %vecext7
  %vecinit9 = insertelement <4 x float> %vecinit5, float %sub8, i32 2
  %vecext10 = extractelement <4 x float> %B, i32 2
  %vecext11 = extractelement <4 x float> %B, i32 3
  %sub12 = fsub float %vecext10, %vecext11
  %vecinit13 = insertelement <4 x float> %vecinit9, float %sub12, i32 3
  ret <4 x float> %vecinit13
}

define <4 x float> @hsub_ps_test2(<4 x float> %A, <4 x float> %B) {
; SSE-LABEL: hsub_ps_test2:
; SSE:       # BB#0:
; SSE-NEXT:    hsubps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: hsub_ps_test2:
; AVX:       # BB#0:
; AVX-NEXT:    vhsubps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %vecext = extractelement <4 x float> %A, i32 2
  %vecext1 = extractelement <4 x float> %A, i32 3
  %sub = fsub float %vecext, %vecext1
  %vecinit = insertelement <4 x float> undef, float %sub, i32 1
  %vecext2 = extractelement <4 x float> %A, i32 0
  %vecext3 = extractelement <4 x float> %A, i32 1
  %sub4 = fsub float %vecext2, %vecext3
  %vecinit5 = insertelement <4 x float> %vecinit, float %sub4, i32 0
  %vecext6 = extractelement <4 x float> %B, i32 2
  %vecext7 = extractelement <4 x float> %B, i32 3
  %sub8 = fsub float %vecext6, %vecext7
  %vecinit9 = insertelement <4 x float> %vecinit5, float %sub8, i32 3
  %vecext10 = extractelement <4 x float> %B, i32 0
  %vecext11 = extractelement <4 x float> %B, i32 1
  %sub12 = fsub float %vecext10, %vecext11
  %vecinit13 = insertelement <4 x float> %vecinit9, float %sub12, i32 2
  ret <4 x float> %vecinit13
}

define <4 x i32> @phadd_d_test1(<4 x i32> %A, <4 x i32> %B) {
; SSE3-LABEL: phadd_d_test1:
; SSE3:       # BB#0:
; SSE3-NEXT:    movd %xmm0, %eax
; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
; SSE3-NEXT:    movd %xmm2, %ecx
; SSE3-NEXT:    addl %eax, %ecx
; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
; SSE3-NEXT:    movd %xmm2, %eax
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; SSE3-NEXT:    movd %xmm0, %edx
; SSE3-NEXT:    addl %eax, %edx
; SSE3-NEXT:    movd %xmm1, %eax
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE3-NEXT:    movd %xmm0, %esi
; SSE3-NEXT:    addl %eax, %esi
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; SSE3-NEXT:    movd %xmm0, %eax
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3]
; SSE3-NEXT:    movd %xmm0, %edi
; SSE3-NEXT:    addl %eax, %edi
; SSE3-NEXT:    movd %edi, %xmm0
; SSE3-NEXT:    movd %edx, %xmm1
; SSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE3-NEXT:    movd %esi, %xmm2
; SSE3-NEXT:    movd %ecx, %xmm0
; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: phadd_d_test1:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    phaddd %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: phadd_d_test1:
; AVX:       # BB#0:
; AVX-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %vecext = extractelement <4 x i32> %A, i32 0
  %vecext1 = extractelement <4 x i32> %A, i32 1
  %add = add i32 %vecext, %vecext1
  %vecinit = insertelement <4 x i32> undef, i32 %add, i32 0
  %vecext2 = extractelement <4 x i32> %A, i32 2
  %vecext3 = extractelement <4 x i32> %A, i32 3
  %add4 = add i32 %vecext2, %vecext3
  %vecinit5 = insertelement <4 x i32> %vecinit, i32 %add4, i32 1
  %vecext6 = extractelement <4 x i32> %B, i32 0
  %vecext7 = extractelement <4 x i32> %B, i32 1
  %add8 = add i32 %vecext6, %vecext7
  %vecinit9 = insertelement <4 x i32> %vecinit5, i32 %add8, i32 2
  %vecext10 = extractelement <4 x i32> %B, i32 2
  %vecext11 = extractelement <4 x i32> %B, i32 3
  %add12 = add i32 %vecext10, %vecext11
  %vecinit13 = insertelement <4 x i32> %vecinit9, i32 %add12, i32 3
  ret <4 x i32> %vecinit13
}

define <4 x i32> @phadd_d_test2(<4 x i32> %A, <4 x i32> %B) {
; SSE3-LABEL: phadd_d_test2:
; SSE3:       # BB#0:
; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
; SSE3-NEXT:    movd %xmm2, %eax
; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[3,1,2,3]
; SSE3-NEXT:    movd %xmm2, %ecx
; SSE3-NEXT:    addl %eax, %ecx
; SSE3-NEXT:    movd %xmm0, %eax
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; SSE3-NEXT:    movd %xmm0, %edx
; SSE3-NEXT:    addl %eax, %edx
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3]
; SSE3-NEXT:    movd %xmm0, %eax
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; SSE3-NEXT:    movd %xmm0, %esi
; SSE3-NEXT:    addl %eax, %esi
; SSE3-NEXT:    movd %esi, %xmm0
; SSE3-NEXT:    movd %ecx, %xmm2
; SSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE3-NEXT:    movd %xmm0, %eax
; SSE3-NEXT:    movd %xmm1, %ecx
; SSE3-NEXT:    addl %eax, %ecx
; SSE3-NEXT:    movd %ecx, %xmm1
; SSE3-NEXT:    movd %edx, %xmm0
; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: phadd_d_test2:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    phaddd %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: phadd_d_test2:
; AVX:       # BB#0:
; AVX-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %vecext = extractelement <4 x i32> %A, i32 2
  %vecext1 = extractelement <4 x i32> %A, i32 3
  %add = add i32 %vecext, %vecext1
  %vecinit = insertelement <4 x i32> undef, i32 %add, i32 1
  %vecext2 = extractelement <4 x i32> %A, i32 0
  %vecext3 = extractelement <4 x i32> %A, i32 1
  %add4 = add i32 %vecext2, %vecext3
  %vecinit5 = insertelement <4 x i32> %vecinit, i32 %add4, i32 0
  %vecext6 = extractelement <4 x i32> %B, i32 3
  %vecext7 = extractelement <4 x i32> %B, i32 2
  %add8 = add i32 %vecext6, %vecext7
  %vecinit9 = insertelement <4 x i32> %vecinit5, i32 %add8, i32 3
  %vecext10 = extractelement <4 x i32> %B, i32 1
  %vecext11 = extractelement <4 x i32> %B, i32 0
  %add12 = add i32 %vecext10, %vecext11
  %vecinit13 = insertelement <4 x i32> %vecinit9, i32 %add12, i32 2
  ret <4 x i32> %vecinit13
}

define <4 x i32> @phsub_d_test1(<4 x i32> %A, <4 x i32> %B) {
; SSE3-LABEL: phsub_d_test1:
; SSE3:       # BB#0:
; SSE3-NEXT:    movd %xmm0, %eax
; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
; SSE3-NEXT:    movd %xmm2, %ecx
; SSE3-NEXT:    subl %ecx, %eax
; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
; SSE3-NEXT:    movd %xmm2, %ecx
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; SSE3-NEXT:    movd %xmm0, %edx
; SSE3-NEXT:    subl %edx, %ecx
; SSE3-NEXT:    movd %xmm1, %edx
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE3-NEXT:    movd %xmm0, %esi
; SSE3-NEXT:    subl %esi, %edx
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; SSE3-NEXT:    movd %xmm0, %esi
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3]
; SSE3-NEXT:    movd %xmm0, %edi
; SSE3-NEXT:    subl %edi, %esi
; SSE3-NEXT:    movd %esi, %xmm0
; SSE3-NEXT:    movd %ecx, %xmm1
; SSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE3-NEXT:    movd %edx, %xmm2
; SSE3-NEXT:    movd %eax, %xmm0
; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: phsub_d_test1:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    phsubd %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: phsub_d_test1:
; AVX:       # BB#0:
; AVX-NEXT:    vphsubd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %vecext = extractelement <4 x i32> %A, i32 0
  %vecext1 = extractelement <4 x i32> %A, i32 1
  %sub = sub i32 %vecext, %vecext1
  %vecinit = insertelement <4 x i32> undef, i32 %sub, i32 0
  %vecext2 = extractelement <4 x i32> %A, i32 2
  %vecext3 = extractelement <4 x i32> %A, i32 3
  %sub4 = sub i32 %vecext2, %vecext3
  %vecinit5 = insertelement <4 x i32> %vecinit, i32 %sub4, i32 1
  %vecext6 = extractelement <4 x i32> %B, i32 0
  %vecext7 = extractelement <4 x i32> %B, i32 1
  %sub8 = sub i32 %vecext6, %vecext7
  %vecinit9 = insertelement <4 x i32> %vecinit5, i32 %sub8, i32 2
  %vecext10 = extractelement <4 x i32> %B, i32 2
  %vecext11 = extractelement <4 x i32> %B, i32 3
  %sub12 = sub i32 %vecext10, %vecext11
  %vecinit13 = insertelement <4 x i32> %vecinit9, i32 %sub12, i32 3
  ret <4 x i32> %vecinit13
}

define <4 x i32> @phsub_d_test2(<4 x i32> %A, <4 x i32> %B) {
; SSE3-LABEL: phsub_d_test2:
; SSE3:       # BB#0:
; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
; SSE3-NEXT:    movd %xmm2, %eax
; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[3,1,2,3]
; SSE3-NEXT:    movd %xmm2, %ecx
; SSE3-NEXT:    subl %ecx, %eax
; SSE3-NEXT:    movd %xmm0, %ecx
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; SSE3-NEXT:    movd %xmm0, %edx
; SSE3-NEXT:    subl %edx, %ecx
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; SSE3-NEXT:    movd %xmm0, %edx
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3]
; SSE3-NEXT:    movd %xmm0, %esi
; SSE3-NEXT:    subl %esi, %edx
; SSE3-NEXT:    movd %edx, %xmm0
; SSE3-NEXT:    movd %eax, %xmm2
; SSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
; SSE3-NEXT:    movd %xmm1, %eax
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE3-NEXT:    movd %xmm0, %edx
; SSE3-NEXT:    subl %edx, %eax
; SSE3-NEXT:    movd %eax, %xmm1
; SSE3-NEXT:    movd %ecx, %xmm0
; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: phsub_d_test2:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    phsubd %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: phsub_d_test2:
; AVX:       # BB#0:
; AVX-NEXT:    vphsubd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %vecext = extractelement <4 x i32> %A, i32 2
  %vecext1 = extractelement <4 x i32> %A, i32 3
  %sub = sub i32 %vecext, %vecext1
  %vecinit = insertelement <4 x i32> undef, i32 %sub, i32 1
  %vecext2 = extractelement <4 x i32> %A, i32 0
  %vecext3 = extractelement <4 x i32> %A, i32 1
  %sub4 = sub i32 %vecext2, %vecext3
  %vecinit5 = insertelement <4 x i32> %vecinit, i32 %sub4, i32 0
  %vecext6 = extractelement <4 x i32> %B, i32 2
  %vecext7 = extractelement <4 x i32> %B, i32 3
  %sub8 = sub i32 %vecext6, %vecext7
  %vecinit9 = insertelement <4 x i32> %vecinit5, i32 %sub8, i32 3
  %vecext10 = extractelement <4 x i32> %B, i32 0
  %vecext11 = extractelement <4 x i32> %B, i32 1
  %sub12 = sub i32 %vecext10, %vecext11
  %vecinit13 = insertelement <4 x i32> %vecinit9, i32 %sub12, i32 2
  ret <4 x i32> %vecinit13
}

define <2 x double> @hadd_pd_test1(<2 x double> %A, <2 x double> %B) {
; SSE-LABEL: hadd_pd_test1:
; SSE:       # BB#0:
; SSE-NEXT:    haddpd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: hadd_pd_test1:
; AVX:       # BB#0:
; AVX-NEXT:    vhaddpd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %vecext = extractelement <2 x double> %A, i32 0
  %vecext1 = extractelement <2 x double> %A, i32 1
  %add = fadd double %vecext, %vecext1
  %vecinit = insertelement <2 x double> undef, double %add, i32 0
  %vecext2 = extractelement <2 x double> %B, i32 0
  %vecext3 = extractelement <2 x double> %B, i32 1
  %add2 = fadd double %vecext2, %vecext3
  %vecinit2 = insertelement <2 x double> %vecinit, double %add2, i32 1
  ret <2 x double> %vecinit2
}

define <2 x double> @hadd_pd_test2(<2 x double> %A, <2 x double> %B) {
; SSE-LABEL: hadd_pd_test2:
; SSE:       # BB#0:
; SSE-NEXT:    haddpd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: hadd_pd_test2:
; AVX:       # BB#0:
; AVX-NEXT:    vhaddpd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %vecext = extractelement <2 x double> %A, i32 1
  %vecext1 = extractelement <2 x double> %A, i32 0
  %add = fadd double %vecext, %vecext1
  %vecinit = insertelement <2 x double> undef, double %add, i32 0
  %vecext2 = extractelement <2 x double> %B, i32 1
  %vecext3 = extractelement <2 x double> %B, i32 0
  %add2 = fadd double %vecext2, %vecext3
  %vecinit2 = insertelement <2 x double> %vecinit, double %add2, i32 1
  ret <2 x double> %vecinit2
}

define <2 x double> @hsub_pd_test1(<2 x double> %A, <2 x double> %B) {
; SSE-LABEL: hsub_pd_test1:
; SSE:       # BB#0:
; SSE-NEXT:    hsubpd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: hsub_pd_test1:
; AVX:       # BB#0:
; AVX-NEXT:    vhsubpd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %vecext = extractelement <2 x double> %A, i32 0
  %vecext1 = extractelement <2 x double> %A, i32 1
  %sub = fsub double %vecext, %vecext1
  %vecinit = insertelement <2 x double> undef, double %sub, i32 0
  %vecext2 = extractelement <2 x double> %B, i32 0
  %vecext3 = extractelement <2 x double> %B, i32 1
  %sub2 = fsub double %vecext2, %vecext3
  %vecinit2 = insertelement <2 x double> %vecinit, double %sub2, i32 1
  ret <2 x double> %vecinit2
}

define <2 x double> @hsub_pd_test2(<2 x double> %A, <2 x double> %B) {
; SSE-LABEL: hsub_pd_test2:
; SSE:       # BB#0:
; SSE-NEXT:    hsubpd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: hsub_pd_test2:
; AVX:       # BB#0:
; AVX-NEXT:    vhsubpd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %vecext = extractelement <2 x double> %B, i32 0
  %vecext1 = extractelement <2 x double> %B, i32 1
  %sub = fsub double %vecext, %vecext1
  %vecinit = insertelement <2 x double> undef, double %sub, i32 1
  %vecext2 = extractelement <2 x double> %A, i32 0
  %vecext3 = extractelement <2 x double> %A, i32 1
  %sub2 = fsub double %vecext2, %vecext3
  %vecinit2 = insertelement <2 x double> %vecinit, double %sub2, i32 0
  ret <2 x double> %vecinit2
}

define <4 x double> @avx_vhadd_pd_test(<4 x double> %A, <4 x double> %B) {
; SSE-LABEL: avx_vhadd_pd_test:
; SSE:       # BB#0:
; SSE-NEXT:    haddpd %xmm1, %xmm0
; SSE-NEXT:    haddpd %xmm3, %xmm2
; SSE-NEXT:    movapd %xmm2, %xmm1
; SSE-NEXT:    retq
;
; AVX-LABEL: avx_vhadd_pd_test:
; AVX:       # BB#0:
; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX-NEXT:    vhaddpd %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX-NEXT:    vhaddpd %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX-NEXT:    retq
  %vecext = extractelement <4 x double> %A, i32 0
  %vecext1 = extractelement <4 x double> %A, i32 1
  %add = fadd double %vecext, %vecext1
  %vecinit = insertelement <4 x double> undef, double %add, i32 0
  %vecext2 = extractelement <4 x double> %A, i32 2
  %vecext3 = extractelement <4 x double> %A, i32 3
  %add4 = fadd double %vecext2, %vecext3
  %vecinit5 = insertelement <4 x double> %vecinit, double %add4, i32 1
  %vecext6 = extractelement <4 x double> %B, i32 0
  %vecext7 = extractelement <4 x double> %B, i32 1
  %add8 = fadd double %vecext6, %vecext7
  %vecinit9 = insertelement <4 x double> %vecinit5, double %add8, i32 2
  %vecext10 = extractelement <4 x double> %B, i32 2
  %vecext11 = extractelement <4 x double> %B, i32 3
  %add12 = fadd double %vecext10, %vecext11
  %vecinit13 = insertelement <4 x double> %vecinit9, double %add12, i32 3
  ret <4 x double> %vecinit13
}

define <4 x double> @avx_vhsub_pd_test(<4 x double> %A, <4 x double> %B) {
; SSE-LABEL: avx_vhsub_pd_test:
; SSE:       # BB#0:
; SSE-NEXT:    hsubpd %xmm1, %xmm0
; SSE-NEXT:    hsubpd %xmm3, %xmm2
; SSE-NEXT:    movapd %xmm2, %xmm1
; SSE-NEXT:    retq
;
; AVX-LABEL: avx_vhsub_pd_test:
; AVX:       # BB#0:
; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX-NEXT:    vhsubpd %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX-NEXT:    vhsubpd %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX-NEXT:    retq
  %vecext = extractelement <4 x double> %A, i32 0
  %vecext1 = extractelement <4 x double> %A, i32 1
  %sub = fsub double %vecext, %vecext1
  %vecinit = insertelement <4 x double> undef, double %sub, i32 0
  %vecext2 = extractelement <4 x double> %A, i32 2
  %vecext3 = extractelement <4 x double> %A, i32 3
  %sub4 = fsub double %vecext2, %vecext3
  %vecinit5 = insertelement <4 x double> %vecinit, double %sub4, i32 1
  %vecext6 = extractelement <4 x double> %B, i32 0
  %vecext7 = extractelement <4 x double> %B, i32 1
  %sub8 = fsub double %vecext6, %vecext7
  %vecinit9 = insertelement <4 x double> %vecinit5, double %sub8, i32 2
  %vecext10 = extractelement <4 x double> %B, i32 2
  %vecext11 = extractelement <4 x double> %B, i32 3
  %sub12 = fsub double %vecext10, %vecext11
  %vecinit13 = insertelement <4 x double> %vecinit9, double %sub12, i32 3
  ret <4 x double> %vecinit13
}

define <8 x i32> @avx2_vphadd_d_test(<8 x i32> %A, <8 x i32> %B) {
; SSE3-LABEL: avx2_vphadd_d_test:
; SSE3:       # BB#0:
; SSE3-NEXT:    movd %xmm0, %ecx
; SSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,1,2,3]
; SSE3-NEXT:    movd %xmm4, %r8d
; SSE3-NEXT:    addl %ecx, %r8d
; SSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
; SSE3-NEXT:    movd %xmm4, %edx
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; SSE3-NEXT:    movd %xmm0, %r9d
; SSE3-NEXT:    addl %edx, %r9d
; SSE3-NEXT:    movd %xmm1, %esi
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE3-NEXT:    movd %xmm0, %r10d
; SSE3-NEXT:    addl %esi, %r10d
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; SSE3-NEXT:    movd %xmm0, %esi
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3]
; SSE3-NEXT:    movd %xmm0, %edi
; SSE3-NEXT:    addl %esi, %edi
; SSE3-NEXT:    movd %xmm2, %eax
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
; SSE3-NEXT:    movd %xmm0, %r11d
; SSE3-NEXT:    addl %eax, %r11d
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
; SSE3-NEXT:    movd %xmm0, %eax
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[3,1,2,3]
; SSE3-NEXT:    movd %xmm0, %ecx
; SSE3-NEXT:    addl %eax, %ecx
; SSE3-NEXT:    movd %xmm3, %eax
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
; SSE3-NEXT:    movd %xmm0, %edx
; SSE3-NEXT:    addl %eax, %edx
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[2,3,0,1]
; SSE3-NEXT:    movd %xmm0, %eax
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[3,1,2,3]
; SSE3-NEXT:    movd %xmm0, %esi
; SSE3-NEXT:    addl %eax, %esi
; SSE3-NEXT:    movd %edi, %xmm0
; SSE3-NEXT:    movd %r9d, %xmm1
; SSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE3-NEXT:    movd %r10d, %xmm2
; SSE3-NEXT:    movd %r8d, %xmm0
; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE3-NEXT:    movd %esi, %xmm1
; SSE3-NEXT:    movd %ecx, %xmm2
; SSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; SSE3-NEXT:    movd %edx, %xmm3
; SSE3-NEXT:    movd %r11d, %xmm1
; SSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
; SSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: avx2_vphadd_d_test:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    phaddd %xmm1, %xmm0
; SSSE3-NEXT:    phaddd %xmm3, %xmm2
; SSSE3-NEXT:    movdqa %xmm2, %xmm1
; SSSE3-NEXT:    retq
;
; AVX1-LABEL: avx2_vphadd_d_test:
; AVX1:       # BB#0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vphaddd %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vphaddd %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: avx2_vphadd_d_test:
; AVX2:       # BB#0:
; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX2-NEXT:    vphaddd %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
; AVX2-NEXT:    vphaddd %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
  %vecext = extractelement <8 x i32> %A, i32 0
  %vecext1 = extractelement <8 x i32> %A, i32 1
  %add = add i32 %vecext, %vecext1
  %vecinit = insertelement <8 x i32> undef, i32 %add, i32 0
  %vecext2 = extractelement <8 x i32> %A, i32 2
  %vecext3 = extractelement <8 x i32> %A, i32 3
  %add4 = add i32 %vecext2, %vecext3
  %vecinit5 = insertelement <8 x i32> %vecinit, i32 %add4, i32 1
  %vecext6 = extractelement <8 x i32> %A, i32 4
  %vecext7 = extractelement <8 x i32> %A, i32 5
  %add8 = add i32 %vecext6, %vecext7
  %vecinit9 = insertelement <8 x i32> %vecinit5, i32 %add8, i32 2
  %vecext10 = extractelement <8 x i32> %A, i32 6
  %vecext11 = extractelement <8 x i32> %A, i32 7
  %add12 = add i32 %vecext10, %vecext11
  %vecinit13 = insertelement <8 x i32> %vecinit9, i32 %add12, i32 3
  %vecext14 = extractelement <8 x i32> %B, i32 0
  %vecext15 = extractelement <8 x i32> %B, i32 1
  %add16 = add i32 %vecext14, %vecext15
  %vecinit17 = insertelement <8 x i32> %vecinit13, i32 %add16, i32 4
  %vecext18 = extractelement <8 x i32> %B, i32 2
  %vecext19 = extractelement <8 x i32> %B, i32 3
  %add20 = add i32 %vecext18, %vecext19
  %vecinit21 = insertelement <8 x i32> %vecinit17, i32 %add20, i32 5
  %vecext22 = extractelement <8 x i32> %B, i32 4
  %vecext23 = extractelement <8 x i32> %B, i32 5
  %add24 = add i32 %vecext22, %vecext23
  %vecinit25 = insertelement <8 x i32> %vecinit21, i32 %add24, i32 6
  %vecext26 = extractelement <8 x i32> %B, i32 6
  %vecext27 = extractelement <8 x i32> %B, i32 7
  %add28 = add i32 %vecext26, %vecext27
  %vecinit29 = insertelement <8 x i32> %vecinit25, i32 %add28, i32 7
  ret <8 x i32> %vecinit29
}

define <16 x i16> @avx2_vphadd_w_test(<16 x i16> %a, <16 x i16> %b) {
; SSE3-LABEL: avx2_vphadd_w_test:
; SSE3:       # BB#0:
; SSE3-NEXT:    pushq %rbp
; SSE3-NEXT:  .Ltmp0:
; SSE3-NEXT:    .cfi_def_cfa_offset 16
; SSE3-NEXT:    pushq %r15
; SSE3-NEXT:  .Ltmp1:
; SSE3-NEXT:    .cfi_def_cfa_offset 24
; SSE3-NEXT:    pushq %r14
; SSE3-NEXT:  .Ltmp2:
; SSE3-NEXT:    .cfi_def_cfa_offset 32
; SSE3-NEXT:    pushq %r13
; SSE3-NEXT:  .Ltmp3:
; SSE3-NEXT:    .cfi_def_cfa_offset 40
; SSE3-NEXT:    pushq %r12
; SSE3-NEXT:  .Ltmp4:
; SSE3-NEXT:    .cfi_def_cfa_offset 48
; SSE3-NEXT:    pushq %rbx
; SSE3-NEXT:  .Ltmp5:
; SSE3-NEXT:    .cfi_def_cfa_offset 56
; SSE3-NEXT:  .Ltmp6:
; SSE3-NEXT:    .cfi_offset %rbx, -56
; SSE3-NEXT:  .Ltmp7:
; SSE3-NEXT:    .cfi_offset %r12, -48
; SSE3-NEXT:  .Ltmp8:
; SSE3-NEXT:    .cfi_offset %r13, -40
; SSE3-NEXT:  .Ltmp9:
; SSE3-NEXT:    .cfi_offset %r14, -32
; SSE3-NEXT:  .Ltmp10:
; SSE3-NEXT:    .cfi_offset %r15, -24
; SSE3-NEXT:  .Ltmp11:
; SSE3-NEXT:    .cfi_offset %rbp, -16
; SSE3-NEXT:    movd %xmm0, %eax
; SSE3-NEXT:    pextrw $1, %xmm0, %ecx
; SSE3-NEXT:    addl %eax, %ecx
; SSE3-NEXT:    movl %ecx, -{{[0-9]+}}(%rsp) # 4-byte Spill
; SSE3-NEXT:    pextrw $2, %xmm0, %eax
; SSE3-NEXT:    pextrw $3, %xmm0, %r11d
; SSE3-NEXT:    addl %eax, %r11d
; SSE3-NEXT:    pextrw $4, %xmm0, %eax
; SSE3-NEXT:    pextrw $5, %xmm0, %r10d
; SSE3-NEXT:    addl %eax, %r10d
; SSE3-NEXT:    pextrw $6, %xmm0, %eax
; SSE3-NEXT:    pextrw $7, %xmm0, %r13d
; SSE3-NEXT:    addl %eax, %r13d
; SSE3-NEXT:    movd %xmm1, %eax
; SSE3-NEXT:    pextrw $1, %xmm1, %r14d
; SSE3-NEXT:    addl %eax, %r14d
; SSE3-NEXT:    pextrw $2, %xmm1, %eax
; SSE3-NEXT:    pextrw $3, %xmm1, %ebp
; SSE3-NEXT:    addl %eax, %ebp
; SSE3-NEXT:    pextrw $4, %xmm1, %eax
; SSE3-NEXT:    pextrw $5, %xmm1, %ebx
; SSE3-NEXT:    addl %eax, %ebx
; SSE3-NEXT:    pextrw $6, %xmm1, %eax
; SSE3-NEXT:    pextrw $7, %xmm1, %edx
; SSE3-NEXT:    addl %eax, %edx
; SSE3-NEXT:    movd %xmm2, %eax
; SSE3-NEXT:    pextrw $1, %xmm2, %ecx
; SSE3-NEXT:    addl %eax, %ecx
; SSE3-NEXT:    movl %ecx, -{{[0-9]+}}(%rsp) # 4-byte Spill
; SSE3-NEXT:    pextrw $2, %xmm2, %eax
; SSE3-NEXT:    pextrw $3, %xmm2, %r12d
; SSE3-NEXT:    addl %eax, %r12d
; SSE3-NEXT:    pextrw $4, %xmm2, %eax
; SSE3-NEXT:    pextrw $5, %xmm2, %r15d
; SSE3-NEXT:    addl %eax, %r15d
; SSE3-NEXT:    pextrw $6, %xmm2, %eax
; SSE3-NEXT:    pextrw $7, %xmm2, %r8d
; SSE3-NEXT:    addl %eax, %r8d
; SSE3-NEXT:    movd %xmm3, %eax
; SSE3-NEXT:    pextrw $1, %xmm3, %r9d
; SSE3-NEXT:    addl %eax, %r9d
; SSE3-NEXT:    pextrw $2, %xmm3, %eax
; SSE3-NEXT:    pextrw $3, %xmm3, %esi
; SSE3-NEXT:    addl %eax, %esi
; SSE3-NEXT:    pextrw $4, %xmm3, %eax
; SSE3-NEXT:    pextrw $5, %xmm3, %edi
; SSE3-NEXT:    addl %eax, %edi
; SSE3-NEXT:    pextrw $6, %xmm3, %ecx
; SSE3-NEXT:    pextrw $7, %xmm3, %eax
; SSE3-NEXT:    addl %ecx, %eax
; SSE3-NEXT:    movd %edx, %xmm8
; SSE3-NEXT:    movd %r13d, %xmm3
; SSE3-NEXT:    movd %ebp, %xmm9
; SSE3-NEXT:    movd %r11d, %xmm4
; SSE3-NEXT:    movd %ebx, %xmm10
; SSE3-NEXT:    movd %r10d, %xmm7
; SSE3-NEXT:    movd %r14d, %xmm11
; SSE3-NEXT:    movd -{{[0-9]+}}(%rsp), %xmm0 # 4-byte Folded Reload
; SSE3-NEXT:    # xmm0 = mem[0],zero,zero,zero
; SSE3-NEXT:    movd %eax, %xmm12
; SSE3-NEXT:    movd %r8d, %xmm6
; SSE3-NEXT:    movd %esi, %xmm13
; SSE3-NEXT:    movd %r12d, %xmm5
; SSE3-NEXT:    movd %edi, %xmm14
; SSE3-NEXT:    movd %r15d, %xmm2
; SSE3-NEXT:    movd %r9d, %xmm15
; SSE3-NEXT:    movd -{{[0-9]+}}(%rsp), %xmm1 # 4-byte Folded Reload
; SSE3-NEXT:    # xmm1 = mem[0],zero,zero,zero
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1],xmm6[2],xmm12[2],xmm6[3],xmm12[3]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm13[0],xmm5[1],xmm13[1],xmm5[2],xmm13[2],xmm5[3],xmm13[3]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3]
; SSE3-NEXT:    popq %rbx
; SSE3-NEXT:    popq %r12
; SSE3-NEXT:    popq %r13
; SSE3-NEXT:    popq %r14
; SSE3-NEXT:    popq %r15
; SSE3-NEXT:    popq %rbp
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: avx2_vphadd_w_test:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    phaddw %xmm1, %xmm0
; SSSE3-NEXT:    phaddw %xmm3, %xmm2
; SSSE3-NEXT:    movdqa %xmm2, %xmm1
; SSSE3-NEXT:    retq
;
; AVX1-LABEL: avx2_vphadd_w_test:
; AVX1:       # BB#0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vphaddw %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vphaddw %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: avx2_vphadd_w_test:
; AVX2:       # BB#0:
; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX2-NEXT:    vphaddw %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
; AVX2-NEXT:    vphaddw %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
  %vecext = extractelement <16 x i16> %a, i32 0
  %vecext1 = extractelement <16 x i16> %a, i32 1
  %add = add i16 %vecext, %vecext1
  %vecinit = insertelement <16 x i16> undef, i16 %add, i32 0
  %vecext4 = extractelement <16 x i16> %a, i32 2
  %vecext6 = extractelement <16 x i16> %a, i32 3
  %add8 = add i16 %vecext4, %vecext6
  %vecinit10 = insertelement <16 x i16> %vecinit, i16 %add8, i32 1
  %vecext11 = extractelement <16 x i16> %a, i32 4
  %vecext13 = extractelement <16 x i16> %a, i32 5
  %add15 = add i16 %vecext11, %vecext13
  %vecinit17 = insertelement <16 x i16> %vecinit10, i16 %add15, i32 2
  %vecext18 = extractelement <16 x i16> %a, i32 6
  %vecext20 = extractelement <16 x i16> %a, i32 7
  %add22 = add i16 %vecext18, %vecext20
  %vecinit24 = insertelement <16 x i16> %vecinit17, i16 %add22, i32 3
  %vecext25 = extractelement <16 x i16> %a, i32 8
  %vecext27 = extractelement <16 x i16> %a, i32 9
  %add29 = add i16 %vecext25, %vecext27
  %vecinit31 = insertelement <16 x i16> %vecinit24, i16 %add29, i32 4
  %vecext32 = extractelement <16 x i16> %a, i32 10
  %vecext34 = extractelement <16 x i16> %a, i32 11
  %add36 = add i16 %vecext32, %vecext34
  %vecinit38 = insertelement <16 x i16> %vecinit31, i16 %add36, i32 5
  %vecext39 = extractelement <16 x i16> %a, i32 12
  %vecext41 = extractelement <16 x i16> %a, i32 13
  %add43 = add i16 %vecext39, %vecext41
  %vecinit45 = insertelement <16 x i16> %vecinit38, i16 %add43, i32 6
  %vecext46 = extractelement <16 x i16> %a, i32 14
  %vecext48 = extractelement <16 x i16> %a, i32 15
  %add50 = add i16 %vecext46, %vecext48
  %vecinit52 = insertelement <16 x i16> %vecinit45, i16 %add50, i32 7
  %vecext53 = extractelement <16 x i16> %b, i32 0
  %vecext55 = extractelement <16 x i16> %b, i32 1
  %add57 = add i16 %vecext53, %vecext55
  %vecinit59 = insertelement <16 x i16> %vecinit52, i16 %add57, i32 8
  %vecext60 = extractelement <16 x i16> %b, i32 2
  %vecext62 = extractelement <16 x i16> %b, i32 3
  %add64 = add i16 %vecext60, %vecext62
  %vecinit66 = insertelement <16 x i16> %vecinit59, i16 %add64, i32 9
  %vecext67 = extractelement <16 x i16> %b, i32 4
  %vecext69 = extractelement <16 x i16> %b, i32 5
  %add71 = add i16 %vecext67, %vecext69
  %vecinit73 = insertelement <16 x i16> %vecinit66, i16 %add71, i32 10
  %vecext74 = extractelement <16 x i16> %b, i32 6
  %vecext76 = extractelement <16 x i16> %b, i32 7
  %add78 = add i16 %vecext74, %vecext76
  %vecinit80 = insertelement <16 x i16> %vecinit73, i16 %add78, i32 11
  %vecext81 = extractelement <16 x i16> %b, i32 8
  %vecext83 = extractelement <16 x i16> %b, i32 9
  %add85 = add i16 %vecext81, %vecext83
  %vecinit87 = insertelement <16 x i16> %vecinit80, i16 %add85, i32 12
  %vecext88 = extractelement <16 x i16> %b, i32 10
  %vecext90 = extractelement <16 x i16> %b, i32 11
  %add92 = add i16 %vecext88, %vecext90
  %vecinit94 = insertelement <16 x i16> %vecinit87, i16 %add92, i32 13
  %vecext95 = extractelement <16 x i16> %b, i32 12
  %vecext97 = extractelement <16 x i16> %b, i32 13
  %add99 = add i16 %vecext95, %vecext97
  %vecinit101 = insertelement <16 x i16> %vecinit94, i16 %add99, i32 14
  %vecext102 = extractelement <16 x i16> %b, i32 14
  %vecext104 = extractelement <16 x i16> %b, i32 15
  %add106 = add i16 %vecext102, %vecext104
  %vecinit108 = insertelement <16 x i16> %vecinit101, i16 %add106, i32 15
  ret <16 x i16> %vecinit108
}

; Verify that we don't select horizontal subs in the following functions.

; Lanes 0-2 follow the hsub pattern, but the B pairs are subtracted in
; reversed order (B[1]-B[0], B[3]-B[2]), which does not match the
; elt0-elt1 / elt2-elt3 semantics of (v)phsub/hsub - so no horizontal
; sub may be formed here.
define <4 x i32> @not_a_hsub_1(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: not_a_hsub_1:
; SSE:       # BB#0:
; SSE-NEXT:    movd %xmm0, %eax
; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
; SSE-NEXT:    movd %xmm2, %ecx
; SSE-NEXT:    subl %ecx, %eax
; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
; SSE-NEXT:    movd %xmm2, %ecx
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; SSE-NEXT:    movd %xmm0, %edx
; SSE-NEXT:    subl %edx, %ecx
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE-NEXT:    movd %xmm0, %edx
; SSE-NEXT:    movd %xmm1, %esi
; SSE-NEXT:    subl %esi, %edx
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3]
; SSE-NEXT:    movd %xmm0, %esi
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; SSE-NEXT:    movd %xmm0, %edi
; SSE-NEXT:    subl %edi, %esi
; SSE-NEXT:    movd %esi, %xmm0
; SSE-NEXT:    movd %ecx, %xmm1
; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE-NEXT:    movd %edx, %xmm2
; SSE-NEXT:    movd %eax, %xmm0
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT:    retq
;
; AVX-LABEL: not_a_hsub_1:
; AVX:       # BB#0:
; AVX-NEXT:    vmovd %xmm0, %eax
; AVX-NEXT:    vpextrd $1, %xmm0, %ecx
; AVX-NEXT:    subl %ecx, %eax
; AVX-NEXT:    vpextrd $2, %xmm0, %ecx
; AVX-NEXT:    vpextrd $3, %xmm0, %edx
; AVX-NEXT:    subl %edx, %ecx
; AVX-NEXT:    vpextrd $1, %xmm1, %edx
; AVX-NEXT:    vmovd %xmm1, %esi
; AVX-NEXT:    subl %esi, %edx
; AVX-NEXT:    vpextrd $3, %xmm1, %esi
; AVX-NEXT:    vpextrd $2, %xmm1, %edi
; AVX-NEXT:    subl %edi, %esi
; AVX-NEXT:    vmovd %eax, %xmm0
; AVX-NEXT:    vpinsrd $1, %ecx, %xmm0, %xmm0
; AVX-NEXT:    vpinsrd $2, %edx, %xmm0, %xmm0
; AVX-NEXT:    vpinsrd $3, %esi, %xmm0, %xmm0
; AVX-NEXT:    retq
  %vecext = extractelement <4 x i32> %A, i32 0
  %vecext1 = extractelement <4 x i32> %A, i32 1
  %sub = sub i32 %vecext, %vecext1
  %vecinit = insertelement <4 x i32> undef, i32 %sub, i32 0
  %vecext2 = extractelement <4 x i32> %A, i32 2
  %vecext3 = extractelement <4 x i32> %A, i32 3
  %sub4 = sub i32 %vecext2, %vecext3
  %vecinit5 = insertelement <4 x i32> %vecinit, i32 %sub4, i32 1
  ; B pairs reversed relative to a true horizontal sub:
  %vecext6 = extractelement <4 x i32> %B, i32 1
  %vecext7 = extractelement <4 x i32> %B, i32 0
  %sub8 = sub i32 %vecext6, %vecext7
  %vecinit9 = insertelement <4 x i32> %vecinit5, i32 %sub8, i32 2
  %vecext10 = extractelement <4 x i32> %B, i32 3
  %vecext11 = extractelement <4 x i32> %B, i32 2
  %sub12 = sub i32 %vecext10, %vecext11
  %vecinit13 = insertelement <4 x i32> %vecinit9, i32 %sub12, i32 3
  ret <4 x i32> %vecinit13
}

; Lanes 0-2 match hsubps, but lane 3 holds B[3]-B[2] (operands reversed),
; so a horizontal sub must not be selected.
define <4 x float> @not_a_hsub_2(<4 x float> %A, <4 x float> %B) {
; SSE-LABEL: not_a_hsub_2:
; SSE:       # BB#0:
; SSE-NEXT:    movapd %xmm0, %xmm2
; SSE-NEXT:    shufpd {{.*#+}} xmm2 = xmm2[1,0]
; SSE-NEXT:    movapd %xmm0, %xmm3
; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[3,1,2,3]
; SSE-NEXT:    subss %xmm3, %xmm2
; SSE-NEXT:    movshdup {{.*#+}} xmm3 = xmm0[1,1,3,3]
; SSE-NEXT:    subss %xmm3, %xmm0
; SSE-NEXT:    movaps %xmm1, %xmm3
; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[3,1,2,3]
; SSE-NEXT:    movaps %xmm1, %xmm4
; SSE-NEXT:    shufpd {{.*#+}} xmm4 = xmm4[1,0]
; SSE-NEXT:    subss %xmm4, %xmm3
; SSE-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; SSE-NEXT:    movshdup {{.*#+}} xmm3 = xmm1[1,1,3,3]
; SSE-NEXT:    subss %xmm3, %xmm1
; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE-NEXT:    retq
;
; AVX-LABEL: not_a_hsub_2:
; AVX:       # BB#0:
; AVX-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX-NEXT:    vpermilps {{.*#+}} xmm3 = xmm0[3,1,2,3]
; AVX-NEXT:    vsubss %xmm3, %xmm2, %xmm2
; AVX-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3]
; AVX-NEXT:    vsubss %xmm3, %xmm0, %xmm0
; AVX-NEXT:    vpermilps {{.*#+}} xmm3 = xmm1[3,1,2,3]
; AVX-NEXT:    vpermilpd {{.*#+}} xmm4 = xmm1[1,0]
; AVX-NEXT:    vsubss %xmm4, %xmm3, %xmm3
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3]
; AVX-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX-NEXT:    vsubss %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0]
; AVX-NEXT:    retq
  %vecext = extractelement <4 x float> %A, i32 2
  %vecext1 = extractelement <4 x float> %A, i32 3
  %sub = fsub float %vecext, %vecext1
  %vecinit = insertelement <4 x float> undef, float %sub, i32 1
  %vecext2 = extractelement <4 x float> %A, i32 0
  %vecext3 = extractelement <4 x float> %A, i32 1
  %sub4 = fsub float %vecext2, %vecext3
  %vecinit5 = insertelement <4 x float> %vecinit, float %sub4, i32 0
  ; Reversed pair: B[3]-B[2] instead of B[2]-B[3].
  %vecext6 = extractelement <4 x float> %B, i32 3
  %vecext7 = extractelement <4 x float> %B, i32 2
  %sub8 = fsub float %vecext6, %vecext7
  %vecinit9 = insertelement <4 x float> %vecinit5, float %sub8, i32 3
  %vecext10 = extractelement <4 x float> %B, i32 0
  %vecext11 = extractelement <4 x float> %B, i32 1
  %sub12 = fsub float %vecext10, %vecext11
  %vecinit13 = insertelement <4 x float> %vecinit9, float %sub12, i32 2
  ret <4 x float> %vecinit13
}

; Lane 1 matches hsubpd (B[0]-B[1]) but lane 0 computes A[1]-A[0]
; (operands reversed), so hsubpd must not be selected.
define <2 x double> @not_a_hsub_3(<2 x double> %A, <2 x double> %B) {
; SSE-LABEL: not_a_hsub_3:
; SSE:       # BB#0:
; SSE-NEXT:    movapd %xmm1, %xmm2
; SSE-NEXT:    shufpd {{.*#+}} xmm2 = xmm2[1,0]
; SSE-NEXT:    subsd %xmm2, %xmm1
; SSE-NEXT:    movapd %xmm0, %xmm2
; SSE-NEXT:    shufpd {{.*#+}} xmm2 = xmm2[1,0]
; SSE-NEXT:    subsd %xmm0, %xmm2
; SSE-NEXT:    unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm1[0]
; SSE-NEXT:    movapd %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: not_a_hsub_3:
; AVX:       # BB#0:
; AVX-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-NEXT:    vsubsd %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX-NEXT:    vsubsd %xmm0, %xmm2, %xmm0
; AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT:    retq
  %vecext = extractelement <2 x double> %B, i32 0
  %vecext1 = extractelement <2 x double> %B, i32 1
  %sub = fsub double %vecext, %vecext1
  %vecinit = insertelement <2 x double> undef, double %sub, i32 1
  ; Reversed pair: A[1]-A[0] instead of A[0]-A[1].
  %vecext2 = extractelement <2 x double> %A, i32 1
  %vecext3 = extractelement <2 x double> %A, i32 0
  %sub2 = fsub double %vecext2, %vecext3
  %vecinit2 = insertelement <2 x double> %vecinit, double %sub2, i32 0
  ret <2 x double> %vecinit2
}

; Test AVX horizontal add/sub of packed single/double precision
; floating point values from 256-bit vectors.

; Insert indices follow the per-128-bit-lane layout of vhaddps:
; lane 0 = [a0+a1, a2+a3, b0+b1, b2+b3], lane 1 = [a4+a5, a6+a7, b4+b5, b6+b7].
define <8 x float> @avx_vhadd_ps(<8 x float> %a, <8 x float> %b) {
; SSE-LABEL: avx_vhadd_ps:
; SSE:       # BB#0:
; SSE-NEXT:    haddps %xmm2, %xmm0
; SSE-NEXT:    haddps %xmm3, %xmm1
; SSE-NEXT:    retq
;
; AVX-LABEL: avx_vhadd_ps:
; AVX:       # BB#0:
; AVX-NEXT:    vhaddps %ymm1, %ymm0, %ymm0
; AVX-NEXT:    retq
  %vecext = extractelement <8 x float> %a, i32 0
  %vecext1 = extractelement <8 x float> %a, i32 1
  %add = fadd float %vecext, %vecext1
  %vecinit = insertelement <8 x float> undef, float %add, i32 0
  %vecext2 = extractelement <8 x float> %a, i32 2
  %vecext3 = extractelement <8 x float> %a, i32 3
  %add4 = fadd float %vecext2, %vecext3
  %vecinit5 = insertelement <8 x float> %vecinit, float %add4, i32 1
  %vecext6 = extractelement <8 x float> %b, i32 0
  %vecext7 = extractelement <8 x float> %b, i32 1
  %add8 = fadd float %vecext6, %vecext7
  %vecinit9 = insertelement <8 x float> %vecinit5, float %add8, i32 2
  %vecext10 = extractelement <8 x float> %b, i32 2
  %vecext11 = extractelement <8 x float> %b, i32 3
  %add12 = fadd float %vecext10, %vecext11
  %vecinit13 = insertelement <8 x float> %vecinit9, float %add12, i32 3
  %vecext14 = extractelement <8 x float> %a, i32 4
  %vecext15 = extractelement <8 x float> %a, i32 5
  %add16 = fadd float %vecext14, %vecext15
  %vecinit17 = insertelement <8 x float> %vecinit13, float %add16, i32 4
  %vecext18 = extractelement <8 x float> %a, i32 6
  %vecext19 = extractelement <8 x float> %a, i32 7
  %add20 = fadd float %vecext18, %vecext19
  %vecinit21 = insertelement <8 x float> %vecinit17, float %add20, i32 5
  %vecext22 = extractelement <8 x float> %b, i32 4
  %vecext23 = extractelement <8 x float> %b, i32 5
  %add24 = fadd float %vecext22, %vecext23
  %vecinit25 = insertelement <8 x float> %vecinit21, float %add24, i32 6
  %vecext26 = extractelement <8 x float> %b, i32 6
  %vecext27 = extractelement <8 x float> %b, i32 7
  %add28 = fadd float %vecext26, %vecext27
  %vecinit29 = insertelement <8 x float> %vecinit25, float %add28, i32 7
  ret <8 x float> %vecinit29
}

; Same per-lane layout as avx_vhadd_ps, with fsub -> vhsubps.
define <8 x float> @avx_vhsub_ps(<8 x float> %a, <8 x float> %b) {
; SSE-LABEL: avx_vhsub_ps:
; SSE:       # BB#0:
; SSE-NEXT:    hsubps %xmm2, %xmm0
; SSE-NEXT:    hsubps %xmm3, %xmm1
; SSE-NEXT:    retq
;
; AVX-LABEL: avx_vhsub_ps:
; AVX:       # BB#0:
; AVX-NEXT:    vhsubps %ymm1, %ymm0, %ymm0
; AVX-NEXT:    retq
  %vecext = extractelement <8 x float> %a, i32 0
  %vecext1 = extractelement <8 x float> %a, i32 1
  %sub = fsub float %vecext, %vecext1
  %vecinit = insertelement <8 x float> undef, float %sub, i32 0
  %vecext2 = extractelement <8 x float> %a, i32 2
  %vecext3 = extractelement <8 x float> %a, i32 3
  %sub4 = fsub float %vecext2, %vecext3
  %vecinit5 = insertelement <8 x float> %vecinit, float %sub4, i32 1
  %vecext6 = extractelement <8 x float> %b, i32 0
  %vecext7 = extractelement <8 x float> %b, i32 1
  %sub8 = fsub float %vecext6, %vecext7
  %vecinit9 = insertelement <8 x float> %vecinit5, float %sub8, i32 2
  %vecext10 = extractelement <8 x float> %b, i32 2
  %vecext11 = extractelement <8 x float> %b, i32 3
  %sub12 = fsub float %vecext10, %vecext11
  %vecinit13 = insertelement <8 x float> %vecinit9, float %sub12, i32 3
  %vecext14 = extractelement <8 x float> %a, i32 4
  %vecext15 = extractelement <8 x float> %a, i32 5
  %sub16 = fsub float %vecext14, %vecext15
  %vecinit17 = insertelement <8 x float> %vecinit13, float %sub16, i32 4
  %vecext18 = extractelement <8 x float> %a, i32 6
  %vecext19 = extractelement <8 x float> %a, i32 7
  %sub20 = fsub float %vecext18, %vecext19
  %vecinit21 = insertelement <8 x float> %vecinit17, float %sub20, i32 5
  %vecext22 = extractelement <8 x float> %b, i32 4
  %vecext23 = extractelement <8 x float> %b, i32 5
  %sub24 = fsub float %vecext22, %vecext23
  %vecinit25 = insertelement <8 x float> %vecinit21, float %sub24, i32 6
  %vecext26 = extractelement <8 x float> %b, i32 6
  %vecext27 = extractelement <8 x float> %b, i32 7
  %sub28 = fsub float %vecext26, %vecext27
  %vecinit29 = insertelement <8 x float> %vecinit25, float %sub28, i32 7
  ret <8 x float> %vecinit29
}

; Insert indices follow the per-128-bit-lane layout of vhaddpd:
; lane 0 = [a0+a1, b0+b1], lane 1 = [a2+a3, b2+b3].
define <4 x double> @avx_hadd_pd(<4 x double> %a, <4 x double> %b) {
; SSE-LABEL: avx_hadd_pd:
; SSE:       # BB#0:
; SSE-NEXT:    haddpd %xmm2, %xmm0
; SSE-NEXT:    haddpd %xmm3, %xmm1
; SSE-NEXT:    retq
;
; AVX-LABEL: avx_hadd_pd:
; AVX:       # BB#0:
; AVX-NEXT:    vhaddpd %ymm1, %ymm0, %ymm0
; AVX-NEXT:    retq
  %vecext = extractelement <4 x double> %a, i32 0
  %vecext1 = extractelement <4 x double> %a, i32 1
  %add = fadd double %vecext, %vecext1
  %vecinit = insertelement <4 x double> undef, double %add, i32 0
  %vecext2 = extractelement <4 x double> %b, i32 0
  %vecext3 = extractelement <4 x double> %b, i32 1
  %add4 = fadd double %vecext2, %vecext3
  %vecinit5 = insertelement <4 x double> %vecinit, double %add4, i32 1
  %vecext6 = extractelement <4 x double> %a, i32 2
  %vecext7 = extractelement <4 x double> %a, i32 3
  %add8 = fadd double %vecext6, %vecext7
  %vecinit9 = insertelement <4 x double> %vecinit5, double %add8, i32 2
  %vecext10 = extractelement <4 x double> %b, i32 2
  %vecext11 = extractelement <4 x double> %b, i32 3
  %add12 = fadd double %vecext10, %vecext11
  %vecinit13 = insertelement <4 x double> %vecinit9, double %add12, i32 3
  ret <4 x double> %vecinit13
}

; Same per-lane layout as avx_hadd_pd, with fsub -> vhsubpd.
define <4 x double> @avx_hsub_pd(<4 x double> %a, <4 x double> %b) {
; SSE-LABEL: avx_hsub_pd:
; SSE:       # BB#0:
; SSE-NEXT:    hsubpd %xmm2, %xmm0
; SSE-NEXT:    hsubpd %xmm3, %xmm1
; SSE-NEXT:    retq
;
; AVX-LABEL: avx_hsub_pd:
; AVX:       # BB#0:
; AVX-NEXT:    vhsubpd %ymm1, %ymm0, %ymm0
; AVX-NEXT:    retq
  %vecext = extractelement <4 x double> %a, i32 0
  %vecext1 = extractelement <4 x double> %a, i32 1
  %sub = fsub double %vecext, %vecext1
  %vecinit = insertelement <4 x double> undef, double %sub, i32 0
  %vecext2 = extractelement <4 x double> %b, i32 0
  %vecext3 = extractelement <4 x double> %b, i32 1
  %sub4 = fsub double %vecext2, %vecext3
  %vecinit5 = insertelement <4 x double> %vecinit, double %sub4, i32 1
  %vecext6 = extractelement <4 x double> %a, i32 2
  %vecext7 = extractelement <4 x double> %a, i32 3
  %sub8 = fsub double %vecext6, %vecext7
  %vecinit9 = insertelement <4 x double> %vecinit5, double %sub8, i32 2
  %vecext10 = extractelement <4 x double> %b, i32 2
  %vecext11 = extractelement <4 x double> %b, i32 3
  %sub12 = fsub double %vecext10, %vecext11
  %vecinit13 = insertelement <4 x double> %vecinit9, double %sub12, i32 3
  ret <4 x double> %vecinit13
}

; Test AVX2 horizontal add of packed integer values from 256-bit vectors.

; Insert indices follow the per-128-bit-lane layout of vphaddd:
; lane 0 = [a0+a1, a2+a3, b0+b1, b2+b3], lane 1 = [a4+a5, a6+a7, b4+b5, b6+b7].
; Without SSSE3/AVX this is expected to stay fully scalarized.
define <8 x i32> @avx2_hadd_d(<8 x i32> %a, <8 x i32> %b) {
; SSE3-LABEL: avx2_hadd_d:
; SSE3:       # BB#0:
; SSE3-NEXT:    movd %xmm0, %ecx
; SSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,1,2,3]
; SSE3-NEXT:    movd %xmm4, %r8d
; SSE3-NEXT:    addl %ecx, %r8d
; SSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
; SSE3-NEXT:    movd %xmm4, %edx
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; SSE3-NEXT:    movd %xmm0, %r9d
; SSE3-NEXT:    addl %edx, %r9d
; SSE3-NEXT:    movd %xmm2, %esi
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
; SSE3-NEXT:    movd %xmm0, %r10d
; SSE3-NEXT:    addl %esi, %r10d
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
; SSE3-NEXT:    movd %xmm0, %esi
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[3,1,2,3]
; SSE3-NEXT:    movd %xmm0, %edi
; SSE3-NEXT:    addl %esi, %edi
; SSE3-NEXT:    movd %xmm1, %eax
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE3-NEXT:    movd %xmm0, %r11d
; SSE3-NEXT:    addl %eax, %r11d
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; SSE3-NEXT:    movd %xmm0, %eax
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3]
; SSE3-NEXT:    movd %xmm0, %ecx
; SSE3-NEXT:    addl %eax, %ecx
; SSE3-NEXT:    movd %xmm3, %eax
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
; SSE3-NEXT:    movd %xmm0, %edx
; SSE3-NEXT:    addl %eax, %edx
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[2,3,0,1]
; SSE3-NEXT:    movd %xmm0, %eax
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[3,1,2,3]
; SSE3-NEXT:    movd %xmm0, %esi
; SSE3-NEXT:    addl %eax, %esi
; SSE3-NEXT:    movd %edi, %xmm0
; SSE3-NEXT:    movd %r9d, %xmm1
; SSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE3-NEXT:    movd %r10d, %xmm2
; SSE3-NEXT:    movd %r8d, %xmm0
; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE3-NEXT:    movd %esi, %xmm1
; SSE3-NEXT:    movd %ecx, %xmm2
; SSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; SSE3-NEXT:    movd %edx, %xmm3
; SSE3-NEXT:    movd %r11d, %xmm1
; SSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
; SSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: avx2_hadd_d:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    phaddd %xmm2, %xmm0
; SSSE3-NEXT:    phaddd %xmm3, %xmm1
; SSSE3-NEXT:    retq
;
; AVX1-LABEL: avx2_hadd_d:
; AVX1:       # BB#0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT:    vphaddd %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: avx2_hadd_d:
; AVX2:       # BB#0:
; AVX2-NEXT:    vphaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
  %vecext = extractelement <8 x i32> %a, i32 0
  %vecext1 = extractelement <8 x i32> %a, i32 1
  %add = add i32 %vecext, %vecext1
  %vecinit = insertelement <8 x i32> undef, i32 %add, i32 0
  %vecext2 = extractelement <8 x i32> %a, i32 2
  %vecext3 = extractelement <8 x i32> %a, i32 3
  %add4 = add i32 %vecext2, %vecext3
  %vecinit5 = insertelement <8 x i32> %vecinit, i32 %add4, i32 1
  %vecext6 = extractelement <8 x i32> %b, i32 0
  %vecext7 = extractelement <8 x i32> %b, i32 1
  %add8 = add i32 %vecext6, %vecext7
  %vecinit9 = insertelement <8 x i32> %vecinit5, i32 %add8, i32 2
  %vecext10 = extractelement <8 x i32> %b, i32 2
  %vecext11 = extractelement <8 x i32> %b, i32 3
  %add12 = add i32 %vecext10, %vecext11
  %vecinit13 = insertelement <8 x i32> %vecinit9, i32 %add12, i32 3
  %vecext14 = extractelement <8 x i32> %a, i32 4
  %vecext15 = extractelement <8 x i32> %a, i32 5
  %add16 = add i32 %vecext14, %vecext15
  %vecinit17 = insertelement <8 x i32> %vecinit13, i32 %add16, i32 4
  %vecext18 = extractelement <8 x i32> %a, i32 6
  %vecext19 = extractelement <8 x i32> %a, i32 7
  %add20 = add i32 %vecext18, %vecext19
  %vecinit21 = insertelement <8 x i32> %vecinit17, i32 %add20, i32 5
  %vecext22 = extractelement <8 x i32> %b, i32 4
  %vecext23 = extractelement <8 x i32> %b, i32 5
  %add24 = add i32 %vecext22, %vecext23
  %vecinit25 = insertelement <8 x i32> %vecinit21, i32 %add24, i32 6
  %vecext26 = extractelement <8 x i32> %b, i32 6
  %vecext27 = extractelement <8 x i32> %b, i32 7
  %add28 = add i32 %vecext26, %vecext27
  %vecinit29 = insertelement <8 x i32> %vecinit25, i32 %add28, i32 7
  ret <8 x i32> %vecinit29
}

; Insert indices follow the per-128-bit-lane layout of vphaddw:
; a-pair sums land in elements 0-3 and 8-11, b-pair sums in 4-7 and 12-15.
; Without SSSE3 this fully scalarizes (note the heavy GPR pressure below).
define <16 x i16> @avx2_hadd_w(<16 x i16> %a, <16 x i16> %b) {
; SSE3-LABEL: avx2_hadd_w:
; SSE3:       # BB#0:
; SSE3-NEXT:    pushq %rbp
; SSE3-NEXT:  .Ltmp12:
; SSE3-NEXT:    .cfi_def_cfa_offset 16
; SSE3-NEXT:    pushq %r15
; SSE3-NEXT:  .Ltmp13:
; SSE3-NEXT:    .cfi_def_cfa_offset 24
; SSE3-NEXT:    pushq %r14
; SSE3-NEXT:  .Ltmp14:
; SSE3-NEXT:    .cfi_def_cfa_offset 32
; SSE3-NEXT:    pushq %r13
; SSE3-NEXT:  .Ltmp15:
; SSE3-NEXT:    .cfi_def_cfa_offset 40
; SSE3-NEXT:    pushq %r12
; SSE3-NEXT:  .Ltmp16:
; SSE3-NEXT:    .cfi_def_cfa_offset 48
; SSE3-NEXT:    pushq %rbx
; SSE3-NEXT:  .Ltmp17:
; SSE3-NEXT:    .cfi_def_cfa_offset 56
; SSE3-NEXT:  .Ltmp18:
; SSE3-NEXT:    .cfi_offset %rbx, -56
; SSE3-NEXT:  .Ltmp19:
; SSE3-NEXT:    .cfi_offset %r12, -48
; SSE3-NEXT:  .Ltmp20:
; SSE3-NEXT:    .cfi_offset %r13, -40
; SSE3-NEXT:  .Ltmp21:
; SSE3-NEXT:    .cfi_offset %r14, -32
; SSE3-NEXT:  .Ltmp22:
; SSE3-NEXT:    .cfi_offset %r15, -24
; SSE3-NEXT:  .Ltmp23:
; SSE3-NEXT:    .cfi_offset %rbp, -16
; SSE3-NEXT:    movd %xmm0, %eax
; SSE3-NEXT:    pextrw $1, %xmm0, %ecx
; SSE3-NEXT:    addl %eax, %ecx
; SSE3-NEXT:    movl %ecx, -{{[0-9]+}}(%rsp) # 4-byte Spill
; SSE3-NEXT:    pextrw $2, %xmm0, %eax
; SSE3-NEXT:    pextrw $3, %xmm0, %r15d
; SSE3-NEXT:    addl %eax, %r15d
; SSE3-NEXT:    pextrw $4, %xmm0, %eax
; SSE3-NEXT:    pextrw $5, %xmm0, %r14d
; SSE3-NEXT:    addl %eax, %r14d
; SSE3-NEXT:    pextrw $6, %xmm0, %eax
; SSE3-NEXT:    pextrw $7, %xmm0, %r13d
; SSE3-NEXT:    addl %eax, %r13d
; SSE3-NEXT:    movd %xmm1, %eax
; SSE3-NEXT:    pextrw $1, %xmm1, %ecx
; SSE3-NEXT:    addl %eax, %ecx
; SSE3-NEXT:    movl %ecx, -{{[0-9]+}}(%rsp) # 4-byte Spill
; SSE3-NEXT:    pextrw $2, %xmm1, %eax
; SSE3-NEXT:    pextrw $3, %xmm1, %r11d
; SSE3-NEXT:    addl %eax, %r11d
; SSE3-NEXT:    pextrw $4, %xmm1, %eax
; SSE3-NEXT:    pextrw $5, %xmm1, %r10d
; SSE3-NEXT:    addl %eax, %r10d
; SSE3-NEXT:    pextrw $6, %xmm1, %eax
; SSE3-NEXT:    pextrw $7, %xmm1, %r12d
; SSE3-NEXT:    addl %eax, %r12d
; SSE3-NEXT:    movd %xmm2, %eax
; SSE3-NEXT:    pextrw $1, %xmm2, %ebx
; SSE3-NEXT:    addl %eax, %ebx
; SSE3-NEXT:    pextrw $2, %xmm2, %eax
; SSE3-NEXT:    pextrw $3, %xmm2, %ecx
; SSE3-NEXT:    addl %eax, %ecx
; SSE3-NEXT:    pextrw $4, %xmm2, %esi
; SSE3-NEXT:    pextrw $5, %xmm2, %r8d
; SSE3-NEXT:    addl %esi, %r8d
; SSE3-NEXT:    pextrw $6, %xmm2, %esi
; SSE3-NEXT:    pextrw $7, %xmm2, %edx
; SSE3-NEXT:    addl %esi, %edx
; SSE3-NEXT:    movd %xmm3, %edi
; SSE3-NEXT:    pextrw $1, %xmm3, %r9d
; SSE3-NEXT:    addl %edi, %r9d
; SSE3-NEXT:    pextrw $2, %xmm3, %ebp
; SSE3-NEXT:    pextrw $3, %xmm3, %edi
; SSE3-NEXT:    addl %ebp, %edi
; SSE3-NEXT:    pextrw $4, %xmm3, %eax
; SSE3-NEXT:    pextrw $5, %xmm3, %ebp
; SSE3-NEXT:    addl %eax, %ebp
; SSE3-NEXT:    pextrw $6, %xmm3, %esi
; SSE3-NEXT:    pextrw $7, %xmm3, %eax
; SSE3-NEXT:    addl %esi, %eax
; SSE3-NEXT:    movd %edx, %xmm8
; SSE3-NEXT:    movd %r13d, %xmm3
; SSE3-NEXT:    movd %ecx, %xmm9
; SSE3-NEXT:    movd %r15d, %xmm4
; SSE3-NEXT:    movd %r8d, %xmm10
; SSE3-NEXT:    movd %r14d, %xmm7
; SSE3-NEXT:    movd %ebx, %xmm11
; SSE3-NEXT:    movd -{{[0-9]+}}(%rsp), %xmm0 # 4-byte Folded Reload
; SSE3-NEXT:    # xmm0 = mem[0],zero,zero,zero
; SSE3-NEXT:    movd %eax, %xmm12
; SSE3-NEXT:    movd %r12d, %xmm6
; SSE3-NEXT:    movd %edi, %xmm13
; SSE3-NEXT:    movd %r11d, %xmm5
; SSE3-NEXT:    movd %ebp, %xmm14
; SSE3-NEXT:    movd %r10d, %xmm2
; SSE3-NEXT:    movd %r9d, %xmm15
; SSE3-NEXT:    movd -{{[0-9]+}}(%rsp), %xmm1 # 4-byte Folded Reload
; SSE3-NEXT:    # xmm1 = mem[0],zero,zero,zero
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1],xmm6[2],xmm12[2],xmm6[3],xmm12[3]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm13[0],xmm5[1],xmm13[1],xmm5[2],xmm13[2],xmm5[3],xmm13[3]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3]
; SSE3-NEXT:    popq %rbx
; SSE3-NEXT:    popq %r12
; SSE3-NEXT:    popq %r13
; SSE3-NEXT:    popq %r14
; SSE3-NEXT:    popq %r15
; SSE3-NEXT:    popq %rbp
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: avx2_hadd_w:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    phaddw %xmm2, %xmm0
; SSSE3-NEXT:    phaddw %xmm3, %xmm1
; SSSE3-NEXT:    retq
;
; AVX1-LABEL: avx2_hadd_w:
; AVX1:       # BB#0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT:    vphaddw %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vphaddw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: avx2_hadd_w:
; AVX2:       # BB#0:
; AVX2-NEXT:    vphaddw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
  %vecext = extractelement <16 x i16> %a, i32 0
  %vecext1 = extractelement <16 x i16> %a, i32 1
  %add = add i16 %vecext, %vecext1
  %vecinit = insertelement <16 x i16> undef, i16 %add, i32 0
  %vecext4 = extractelement <16 x i16> %a, i32 2
  %vecext6 = extractelement <16 x i16> %a, i32 3
  %add8 = add i16 %vecext4, %vecext6
  %vecinit10 = insertelement <16 x i16> %vecinit, i16 %add8, i32 1
  %vecext11 = extractelement <16 x i16> %a, i32 4
  %vecext13 = extractelement <16 x i16> %a, i32 5
  %add15 = add i16 %vecext11, %vecext13
  %vecinit17 = insertelement <16 x i16> %vecinit10, i16 %add15, i32 2
  %vecext18 = extractelement <16 x i16> %a, i32 6
  %vecext20 = extractelement <16 x i16> %a, i32 7
  %add22 = add i16 %vecext18, %vecext20
  %vecinit24 = insertelement <16 x i16> %vecinit17, i16 %add22, i32 3
  ; High half of %a feeds result elements 8-11 (second 128-bit lane).
  %vecext25 = extractelement <16 x i16> %a, i32 8
  %vecext27 = extractelement <16 x i16> %a, i32 9
  %add29 = add i16 %vecext25, %vecext27
  %vecinit31 = insertelement <16 x i16> %vecinit24, i16 %add29, i32 8
  %vecext32 = extractelement <16 x i16> %a, i32 10
  %vecext34 = extractelement <16 x i16> %a, i32 11
  %add36 = add i16 %vecext32, %vecext34
  %vecinit38 = insertelement <16 x i16> %vecinit31, i16 %add36, i32 9
  %vecext39 = extractelement <16 x i16> %a, i32 12
  %vecext41 = extractelement <16 x i16> %a, i32 13
  %add43 = add i16 %vecext39, %vecext41
  %vecinit45 = insertelement <16 x i16> %vecinit38, i16 %add43, i32 10
  %vecext46 = extractelement <16 x i16> %a, i32 14
  %vecext48 = extractelement <16 x i16> %a, i32 15
  %add50 = add i16 %vecext46, %vecext48
  %vecinit52 = insertelement <16 x i16> %vecinit45, i16 %add50, i32 11
  ; Low half of %b feeds result elements 4-7 (first 128-bit lane).
  %vecext53 = extractelement <16 x i16> %b, i32 0
  %vecext55 = extractelement <16 x i16> %b, i32 1
  %add57 = add i16 %vecext53, %vecext55
  %vecinit59 = insertelement <16 x i16> %vecinit52, i16 %add57, i32 4
  %vecext60 = extractelement <16 x i16> %b, i32 2
  %vecext62 = extractelement <16 x i16> %b, i32 3
  %add64 = add i16 %vecext60, %vecext62
  %vecinit66 = insertelement <16 x i16> %vecinit59, i16 %add64, i32 5
  %vecext67 = extractelement <16 x i16> %b, i32 4
  %vecext69 = extractelement <16 x i16> %b, i32 5
  %add71 = add i16 %vecext67, %vecext69
  %vecinit73 = insertelement <16 x i16> %vecinit66, i16 %add71, i32 6
  %vecext74 = extractelement <16 x i16> %b, i32 6
  %vecext76 = extractelement <16 x i16> %b, i32 7
  %add78 = add i16 %vecext74, %vecext76
  %vecinit80 = insertelement <16 x i16> %vecinit73, i16 %add78, i32 7
  %vecext81 = extractelement <16 x i16> %b, i32 8
  %vecext83 = extractelement <16 x i16> %b, i32 9
  %add85 = add i16 %vecext81, %vecext83
  %vecinit87 = insertelement <16 x i16> %vecinit80, i16 %add85, i32 12
  %vecext88 = extractelement <16 x i16> %b, i32 10
  %vecext90 = extractelement <16 x i16> %b, i32 11
  %add92 = add i16 %vecext88, %vecext90
  %vecinit94 = insertelement <16 x i16> %vecinit87, i16 %add92, i32 13
  %vecext95 = extractelement <16 x i16> %b, i32 12
  %vecext97 = extractelement <16 x i16> %b, i32 13
  %add99 = add i16 %vecext95, %vecext97
  %vecinit101 = insertelement <16 x i16> %vecinit94, i16 %add99, i32 14
  %vecext102 = extractelement <16 x i16> %b, i32 14
  %vecext104 = extractelement <16 x i16> %b, i32 15
  %add106 = add i16 %vecext102, %vecext104
  %vecinit108 = insertelement <16 x i16> %vecinit101, i16 %add106, i32 15
  ret <16 x i16> %vecinit108
}