; RUN: opt < %s -cost-model -costmodel-reduxcost=true -analyze -mcpu=core2 -mtriple=x86_64-apple-darwin | FileCheck %s
; RUN: opt < %s -cost-model -costmodel-reduxcost=true -analyze -mcpu=corei7 -mtriple=x86_64-apple-darwin | FileCheck %s --check-prefix=SSE3
; RUN: opt < %s -cost-model -costmodel-reduxcost=true -analyze -mcpu=corei7-avx -mtriple=x86_64-apple-darwin | FileCheck %s --check-prefix=AVX
; RUN: opt < %s -cost-model -costmodel-reduxcost=true -analyze -mcpu=core-avx2 -mtriple=x86_64-apple-darwin | FileCheck %s --check-prefix=AVX2

; Cost-model checks for vector reductions written as shufflevector + add/fadd
; chains that end in an extractelement of lane 0. Every function below holds
; exactly one extractelement, and the expected cost is checked on that
; instruction. Each expectation is preceded by a label directive carrying the
; quoted function name, so a cost line can only be matched inside the analysis
; output of the function it belongs to.

; Splitting reduction over <4 x float>: lanes <2, 3> are shuffled down and
; added, then lane <1> is shuffled down and added, and lane 0 is extracted.
define fastcc float @reduction_cost_float(<4 x float> %rdx) {
  %rdx.shuf = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
  %bin.rdx = fadd <4 x float> %rdx, %rdx.shuf
  %rdx.shuf7 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %bin.rdx8 = fadd <4 x float> %bin.rdx, %rdx.shuf7

; Check that we recognize the tree starting at the extractelement as a
; reduction.
; CHECK-LABEL: 'reduction_cost_float'
; CHECK: cost of 9 {{.*}} extractelement

  %r = extractelement <4 x float> %bin.rdx8, i32 0
  ret float %r
}

; Same splitting shape over <8 x i32>, with three shuffle/add levels:
; lanes <4, 5, 6, 7>, then <2, 3>, then <1>.
define fastcc i32 @reduction_cost_int(<8 x i32> %rdx) {
  %rdx.shuf = shufflevector <8 x i32> %rdx, <8 x i32> undef,
   <8 x i32> <i32 4, i32 5, i32 6, i32 7,
              i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx = add <8 x i32> %rdx, %rdx.shuf
  %rdx.shuf.2 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef,
   <8 x i32> <i32 2, i32 3, i32 undef, i32 undef,
              i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx.2 = add <8 x i32> %bin.rdx, %rdx.shuf.2
  %rdx.shuf.3 = shufflevector <8 x i32> %bin.rdx.2, <8 x i32> undef,
   <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef,
              i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx.3 = add <8 x i32> %bin.rdx.2, %rdx.shuf.3

; CHECK-LABEL: 'reduction_cost_int'
; CHECK: cost of 17 {{.*}} extractelement

  %r = extractelement <8 x i32> %bin.rdx.3, i32 0
  ret i32 %r
}

; Pairwise reduction over <4 x float>: each level shuffles out the even lanes
; and the odd lanes separately and adds the two shuffles. The extracted scalar
; is then used by a further scalar fadd.
define fastcc float @pairwise_hadd(<4 x float> %rdx, float %f1) {
  %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef,
        <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
  %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef,
        <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
  %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1
  %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef,
        <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
  %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef,
        <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %bin.rdx.1 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1

; CHECK-LABEL: 'pairwise_hadd'
; CHECK: cost of 11 {{.*}} extractelement

  %r = extractelement <4 x float> %bin.rdx.1, i32 0
  %r2 = fadd float %r, %f1
  ret float %r2
}

; Same as pairwise_hadd, except that the operands of the first fadd are
; swapped (odd-lane shuffle first, even-lane shuffle second).
define fastcc float @pairwise_hadd_assoc(<4 x float> %rdx, float %f1) {
  %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef,
        <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
  %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef,
        <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
  %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.1, %rdx.shuf.0.0
  %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef,
        <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
  %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef,
        <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %bin.rdx.1 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1

; CHECK-LABEL: 'pairwise_hadd_assoc'
; CHECK: cost of 11 {{.*}} extractelement

  %r = extractelement <4 x float> %bin.rdx.1, i32 0
  %r2 = fadd float %r, %f1
  ret float %r2
}

; Same as pairwise_hadd, except that the last level has no lane-0 shuffle:
; the final fadd takes %bin.rdx.0 directly as its first operand.
define fastcc float @pairwise_hadd_skip_first(<4 x float> %rdx, float %f1) {
  %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef,
        <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
  %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef,
        <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
  %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1
  %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef,
        <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %bin.rdx.1 = fadd <4 x float> %bin.rdx.0, %rdx.shuf.1.1

; CHECK-LABEL: 'pairwise_hadd_skip_first'
; CHECK: cost of 11 {{.*}} extractelement

  %r = extractelement <4 x float> %bin.rdx.1, i32 0
  %r2 = fadd float %r, %f1
  ret float %r2
}

; Splitting (non-pairwise) reduction over <2 x double>: a single level that
; shuffles lane <1> down and adds it to the input.
define fastcc double @no_pairwise_reduction2double(<2 x double> %rdx, double %f1) {
  %rdx.shuf = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
  %bin.rdx = fadd <2 x double> %rdx, %rdx.shuf

; SSE3-LABEL: 'no_pairwise_reduction2double'
; SSE3: cost of 2 {{.*}} extractelement
; AVX-LABEL: 'no_pairwise_reduction2double'
; AVX: cost of 2 {{.*}} extractelement
; AVX2-LABEL: 'no_pairwise_reduction2double'
; AVX2: cost of 2 {{.*}} extractelement

  %r = extractelement <2 x double> %bin.rdx, i32 0
  ret double %r
}
; The functions from here on are checked only under the SSE3, AVX and AVX2
; prefixes. Functions operating on 256-bit vectors carry no SSE3 expectation.
; Every expectation is preceded by a label directive with the quoted function
; name, so it can only match inside that function's analysis output.

; Splitting reduction over <4 x float>: lanes <2, 3>, then lane <1>.
define fastcc float @no_pairwise_reduction4float(<4 x float> %rdx, float %f1) {
  %rdx.shuf = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
  %bin.rdx = fadd <4 x float> %rdx, %rdx.shuf
  %rdx.shuf7 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %bin.rdx8 = fadd <4 x float> %bin.rdx, %rdx.shuf7

; SSE3-LABEL: 'no_pairwise_reduction4float'
; SSE3: cost of 4 {{.*}} extractelement
; AVX-LABEL: 'no_pairwise_reduction4float'
; AVX: cost of 3 {{.*}} extractelement
; AVX2-LABEL: 'no_pairwise_reduction4float'
; AVX2: cost of 3 {{.*}} extractelement

  %r = extractelement <4 x float> %bin.rdx8, i32 0
  ret float %r
}

; Splitting reduction over <4 x double>: lanes <2, 3>, then lane <1>.
define fastcc double @no_pairwise_reduction4double(<4 x double> %rdx, double %f1) {
  %rdx.shuf = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
  %bin.rdx = fadd <4 x double> %rdx, %rdx.shuf
  %rdx.shuf7 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %bin.rdx8 = fadd <4 x double> %bin.rdx, %rdx.shuf7

; AVX-LABEL: 'no_pairwise_reduction4double'
; AVX: cost of 3 {{.*}} extractelement
; AVX2-LABEL: 'no_pairwise_reduction4double'
; AVX2: cost of 3 {{.*}} extractelement

  %r = extractelement <4 x double> %bin.rdx8, i32 0
  ret double %r
}

; Splitting reduction over <8 x float>: lanes <4, 5, 6, 7>, then <2, 3>,
; then <1>.
define fastcc float @no_pairwise_reduction8float(<8 x float> %rdx, float %f1) {
  %rdx.shuf3 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx4 = fadd <8 x float> %rdx, %rdx.shuf3
  %rdx.shuf = shufflevector <8 x float> %bin.rdx4, <8 x float> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx = fadd <8 x float> %bin.rdx4, %rdx.shuf
  %rdx.shuf7 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx8 = fadd <8 x float> %bin.rdx, %rdx.shuf7

; AVX-LABEL: 'no_pairwise_reduction8float'
; AVX: cost of 4 {{.*}} extractelement
; AVX2-LABEL: 'no_pairwise_reduction8float'
; AVX2: cost of 4 {{.*}} extractelement

  %r = extractelement <8 x float> %bin.rdx8, i32 0
  ret float %r
}

; Splitting reduction over <2 x i64>: a single level using lane <1>.
define fastcc i64 @no_pairwise_reduction2i64(<2 x i64> %rdx, i64 %f1) {
  %rdx.shuf = shufflevector <2 x i64> %rdx, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
  %bin.rdx = add <2 x i64> %rdx, %rdx.shuf

; SSE3-LABEL: 'no_pairwise_reduction2i64'
; SSE3: cost of 2 {{.*}} extractelement
; AVX-LABEL: 'no_pairwise_reduction2i64'
; AVX: cost of 1 {{.*}} extractelement
; AVX2-LABEL: 'no_pairwise_reduction2i64'
; AVX2: cost of 1 {{.*}} extractelement

  %r = extractelement <2 x i64> %bin.rdx, i32 0
  ret i64 %r
}

; Splitting reduction over <4 x i32>: lanes <2, 3>, then lane <1>.
define fastcc i32 @no_pairwise_reduction4i32(<4 x i32> %rdx, i32 %f1) {
  %rdx.shuf = shufflevector <4 x i32> %rdx, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
  %bin.rdx = add <4 x i32> %rdx, %rdx.shuf
  %rdx.shuf7 = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %bin.rdx8 = add <4 x i32> %bin.rdx, %rdx.shuf7

; SSE3-LABEL: 'no_pairwise_reduction4i32'
; SSE3: cost of 3 {{.*}} extractelement
; AVX-LABEL: 'no_pairwise_reduction4i32'
; AVX: cost of 3 {{.*}} extractelement
; AVX2-LABEL: 'no_pairwise_reduction4i32'
; AVX2: cost of 3 {{.*}} extractelement

  %r = extractelement <4 x i32> %bin.rdx8, i32 0
  ret i32 %r
}

; Splitting reduction over <4 x i64>: lanes <2, 3>, then lane <1>.
define fastcc i64 @no_pairwise_reduction4i64(<4 x i64> %rdx, i64 %f1) {
  %rdx.shuf = shufflevector <4 x i64> %rdx, <4 x i64> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
  %bin.rdx = add <4 x i64> %rdx, %rdx.shuf
  %rdx.shuf7 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %bin.rdx8 = add <4 x i64> %bin.rdx, %rdx.shuf7

; AVX-LABEL: 'no_pairwise_reduction4i64'
; AVX: cost of 3 {{.*}} extractelement
; AVX2-LABEL: 'no_pairwise_reduction4i64'
; AVX2: cost of 3 {{.*}} extractelement

  %r = extractelement <4 x i64> %bin.rdx8, i32 0
  ret i64 %r
}

; Splitting reduction over <8 x i16>: lanes <4, 5, 6, 7>, then <2, 3>,
; then <1>.
define fastcc i16 @no_pairwise_reduction8i16(<8 x i16> %rdx, i16 %f1) {
  %rdx.shuf3 = shufflevector <8 x i16> %rdx, <8 x i16> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx4 = add <8 x i16> %rdx, %rdx.shuf3
  %rdx.shuf = shufflevector <8 x i16> %bin.rdx4, <8 x i16> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx = add <8 x i16> %bin.rdx4, %rdx.shuf
  %rdx.shuf7 = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx8 = add <8 x i16> %bin.rdx, %rdx.shuf7

; SSE3-LABEL: 'no_pairwise_reduction8i16'
; SSE3: cost of 4 {{.*}} extractelement
; AVX-LABEL: 'no_pairwise_reduction8i16'
; AVX: cost of 4 {{.*}} extractelement
; AVX2-LABEL: 'no_pairwise_reduction8i16'
; AVX2: cost of 4 {{.*}} extractelement

  %r = extractelement <8 x i16> %bin.rdx8, i32 0
  ret i16 %r
}

; Splitting reduction over <8 x i32>: lanes <4, 5, 6, 7>, then <2, 3>,
; then <1>.
define fastcc i32 @no_pairwise_reduction8i32(<8 x i32> %rdx, i32 %f1) {
  %rdx.shuf3 = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx4 = add <8 x i32> %rdx, %rdx.shuf3
  %rdx.shuf = shufflevector <8 x i32> %bin.rdx4, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx = add <8 x i32> %bin.rdx4, %rdx.shuf
  %rdx.shuf7 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx8 = add <8 x i32> %bin.rdx, %rdx.shuf7

; AVX-LABEL: 'no_pairwise_reduction8i32'
; AVX: cost of 5 {{.*}} extractelement
; AVX2-LABEL: 'no_pairwise_reduction8i32'
; AVX2: cost of 5 {{.*}} extractelement

  %r = extractelement <8 x i32> %bin.rdx8, i32 0
  ret i32 %r
}

; Pairwise reduction over <2 x double>: one level adding the lane-0 shuffle
; and the lane-1 shuffle of the input.
define fastcc double @pairwise_reduction2double(<2 x double> %rdx, double %f1) {
  %rdx.shuf.1.0 = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> <i32 0, i32 undef>
  %rdx.shuf.1.1 = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
  %bin.rdx8 = fadd <2 x double> %rdx.shuf.1.0, %rdx.shuf.1.1

; SSE3-LABEL: 'pairwise_reduction2double'
; SSE3: cost of 2 {{.*}} extractelement
; AVX-LABEL: 'pairwise_reduction2double'
; AVX: cost of 2 {{.*}} extractelement
; AVX2-LABEL: 'pairwise_reduction2double'
; AVX2: cost of 2 {{.*}} extractelement

  %r = extractelement <2 x double> %bin.rdx8, i32 0
  ret double %r
}

; Pairwise reduction over <4 x float>: even lanes <0, 2> plus odd lanes
; <1, 3>, then lane <0> plus lane <1>.
define fastcc float @pairwise_reduction4float(<4 x float> %rdx, float %f1) {
  %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
  %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
  %bin.rdx = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1
  %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
  %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %bin.rdx8 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1

; SSE3-LABEL: 'pairwise_reduction4float'
; SSE3: cost of 4 {{.*}} extractelement
; AVX-LABEL: 'pairwise_reduction4float'
; AVX: cost of 4 {{.*}} extractelement
; AVX2-LABEL: 'pairwise_reduction4float'
; AVX2: cost of 4 {{.*}} extractelement

  %r = extractelement <4 x float> %bin.rdx8, i32 0
  ret float %r
}

; Pairwise reduction over <4 x double>: even lanes <0, 2> plus odd lanes
; <1, 3>, then lane <0> plus lane <1>.
define fastcc double @pairwise_reduction4double(<4 x double> %rdx, double %f1) {
  %rdx.shuf.0.0 = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
  %rdx.shuf.0.1 = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
  %bin.rdx = fadd <4 x double> %rdx.shuf.0.0, %rdx.shuf.0.1
  %rdx.shuf.1.0 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
  %rdx.shuf.1.1 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %bin.rdx8 = fadd <4 x double> %rdx.shuf.1.0, %rdx.shuf.1.1

; AVX-LABEL: 'pairwise_reduction4double'
; AVX: cost of 5 {{.*}} extractelement
; AVX2-LABEL: 'pairwise_reduction4double'
; AVX2: cost of 5 {{.*}} extractelement

  %r = extractelement <4 x double> %bin.rdx8, i32 0
  ret double %r
}

; Pairwise reduction over <8 x float> with three levels: <0, 2, 4, 6> plus
; <1, 3, 5, 7>, then <0, 2> plus <1, 3>, then <0> plus <1>.
define fastcc float @pairwise_reduction8float(<8 x float> %rdx, float %f1) {
  %rdx.shuf.0.0 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 undef, i32 undef, i32 undef, i32 undef>
  %rdx.shuf.0.1 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx = fadd <8 x float> %rdx.shuf.0.0, %rdx.shuf.0.1
  %rdx.shuf.1.0 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %rdx.shuf.1.1 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx8 = fadd <8 x float> %rdx.shuf.1.0, %rdx.shuf.1.1
  %rdx.shuf.2.0 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %rdx.shuf.2.1 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx9 = fadd <8 x float> %rdx.shuf.2.0, %rdx.shuf.2.1

; AVX-LABEL: 'pairwise_reduction8float'
; AVX: cost of 7 {{.*}} extractelement
; AVX2-LABEL: 'pairwise_reduction8float'
; AVX2: cost of 7 {{.*}} extractelement

  %r = extractelement <8 x float> %bin.rdx9, i32 0
  ret float %r
}

; Pairwise reduction over <2 x i64>: one level adding the lane-0 shuffle and
; the lane-1 shuffle of the input.
define fastcc i64 @pairwise_reduction2i64(<2 x i64> %rdx, i64 %f1) {
  %rdx.shuf.1.0 = shufflevector <2 x i64> %rdx, <2 x i64> undef, <2 x i32> <i32 0, i32 undef>
  %rdx.shuf.1.1 = shufflevector <2 x i64> %rdx, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
  %bin.rdx8 = add <2 x i64> %rdx.shuf.1.0, %rdx.shuf.1.1

; SSE3-LABEL: 'pairwise_reduction2i64'
; SSE3: cost of 2 {{.*}} extractelement
; AVX-LABEL: 'pairwise_reduction2i64'
; AVX: cost of 1 {{.*}} extractelement
; AVX2-LABEL: 'pairwise_reduction2i64'
; AVX2: cost of 1 {{.*}} extractelement

  %r = extractelement <2 x i64> %bin.rdx8, i32 0
  ret i64 %r
}

; Pairwise reduction over <4 x i32>: even lanes <0, 2> plus odd lanes <1, 3>,
; then lane <0> plus lane <1>.
define fastcc i32 @pairwise_reduction4i32(<4 x i32> %rdx, i32 %f1) {
  %rdx.shuf.0.0 = shufflevector <4 x i32> %rdx, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
  %rdx.shuf.0.1 = shufflevector <4 x i32> %rdx, <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
  %bin.rdx = add <4 x i32> %rdx.shuf.0.0, %rdx.shuf.0.1
  %rdx.shuf.1.0 = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
  %rdx.shuf.1.1 = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %bin.rdx8 = add <4 x i32> %rdx.shuf.1.0, %rdx.shuf.1.1

; SSE3-LABEL: 'pairwise_reduction4i32'
; SSE3: cost of 3 {{.*}} extractelement
; AVX-LABEL: 'pairwise_reduction4i32'
; AVX: cost of 3 {{.*}} extractelement
; AVX2-LABEL: 'pairwise_reduction4i32'
; AVX2: cost of 3 {{.*}} extractelement

  %r = extractelement <4 x i32> %bin.rdx8, i32 0
  ret i32 %r
}

; Pairwise reduction over <4 x i64>: even lanes <0, 2> plus odd lanes <1, 3>,
; then lane <0> plus lane <1>.
define fastcc i64 @pairwise_reduction4i64(<4 x i64> %rdx, i64 %f1) {
  %rdx.shuf.0.0 = shufflevector <4 x i64> %rdx, <4 x i64> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
  %rdx.shuf.0.1 = shufflevector <4 x i64> %rdx, <4 x i64> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
  %bin.rdx = add <4 x i64> %rdx.shuf.0.0, %rdx.shuf.0.1
  %rdx.shuf.1.0 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
  %rdx.shuf.1.1 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %bin.rdx8 = add <4 x i64> %rdx.shuf.1.0, %rdx.shuf.1.1

; AVX-LABEL: 'pairwise_reduction4i64'
; AVX: cost of 5 {{.*}} extractelement
; AVX2-LABEL: 'pairwise_reduction4i64'
; AVX2: cost of 5 {{.*}} extractelement

  %r = extractelement <4 x i64> %bin.rdx8, i32 0
  ret i64 %r
}

; Pairwise reduction over <8 x i16> with three levels: <0, 2, 4, 6> plus
; <1, 3, 5, 7>, then <0, 2> plus <1, 3>, then <0> plus <1>.
define fastcc i16 @pairwise_reduction8i16(<8 x i16> %rdx, i16 %f1) {
  %rdx.shuf.0.0 = shufflevector <8 x i16> %rdx, <8 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 undef, i32 undef, i32 undef, i32 undef>
  %rdx.shuf.0.1 = shufflevector <8 x i16> %rdx, <8 x i16> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx = add <8 x i16> %rdx.shuf.0.0, %rdx.shuf.0.1
  %rdx.shuf.1.0 = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef, <8 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %rdx.shuf.1.1 = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef, <8 x i32> <i32 1, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx8 = add <8 x i16> %rdx.shuf.1.0, %rdx.shuf.1.1
  %rdx.shuf.2.0 = shufflevector <8 x i16> %bin.rdx8, <8 x i16> undef, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %rdx.shuf.2.1 = shufflevector <8 x i16> %bin.rdx8, <8 x i16> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx9 = add <8 x i16> %rdx.shuf.2.0, %rdx.shuf.2.1

; SSE3-LABEL: 'pairwise_reduction8i16'
; SSE3: cost of 5 {{.*}} extractelement
; AVX-LABEL: 'pairwise_reduction8i16'
; AVX: cost of 5 {{.*}} extractelement
; AVX2-LABEL: 'pairwise_reduction8i16'
; AVX2: cost of 5 {{.*}} extractelement

  %r = extractelement <8 x i16> %bin.rdx9, i32 0
  ret i16 %r
}

; Pairwise reduction over <8 x i32> with three levels: <0, 2, 4, 6> plus
; <1, 3, 5, 7>, then <0, 2> plus <1, 3>, then <0> plus <1>.
define fastcc i32 @pairwise_reduction8i32(<8 x i32> %rdx, i32 %f1) {
  %rdx.shuf.0.0 = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 undef, i32 undef, i32 undef, i32 undef>
  %rdx.shuf.0.1 = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx = add <8 x i32> %rdx.shuf.0.0, %rdx.shuf.0.1
  %rdx.shuf.1.0 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %rdx.shuf.1.1 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> <i32 1, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx8 = add <8 x i32> %rdx.shuf.1.0, %rdx.shuf.1.1
  %rdx.shuf.2.0 = shufflevector <8 x i32> %bin.rdx8, <8 x i32> undef, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %rdx.shuf.2.1 = shufflevector <8 x i32> %bin.rdx8, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx9 = add <8 x i32> %rdx.shuf.2.0, %rdx.shuf.2.1

; AVX-LABEL: 'pairwise_reduction8i32'
; AVX: cost of 5 {{.*}} extractelement
; AVX2-LABEL: 'pairwise_reduction8i32'
; AVX2: cost of 5 {{.*}} extractelement

  %r = extractelement <8 x i32> %bin.rdx9, i32 0
  ret i32 %r
}