; RUN: opt -S -slp-vectorizer -slp-threshold=-10000 < %s | FileCheck %s
; RUN: opt -S -slp-vectorizer -slp-threshold=0 < %s | FileCheck %s -check-prefix=ZEROTHRESH

; Two invocations of the SLP vectorizer are exercised above: one with a cost
; threshold of -10000 (matched against the default check prefix) and one with
; a threshold of 0 (matched only against the ZEROTHRESH prefix).

target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-n8:16:32:64-S128"

target triple = "x86_64-apple-macosx10.8.0"

; Baseline case: every lane of %a, %b and %c is extracted, compared and
; selected as a scalar, and the four results are re-inserted in lane order.
; The expected output is a single vector icmp feeding a single vector select.
define <4 x float> @simple_select(<4 x float> %a, <4 x float> %b, <4 x i32> %c) #0 {
; CHECK-LABEL: @simple_select(
; CHECK-NEXT: %1 = icmp ne <4 x i32> %c, zeroinitializer
; CHECK-NEXT: select <4 x i1> %1, <4 x float> %a, <4 x float> %b
  %c0 = extractelement <4 x i32> %c, i32 0
  %c1 = extractelement <4 x i32> %c, i32 1
  %c2 = extractelement <4 x i32> %c, i32 2
  %c3 = extractelement <4 x i32> %c, i32 3
  %a0 = extractelement <4 x float> %a, i32 0
  %a1 = extractelement <4 x float> %a, i32 1
  %a2 = extractelement <4 x float> %a, i32 2
  %a3 = extractelement <4 x float> %a, i32 3
  %b0 = extractelement <4 x float> %b, i32 0
  %b1 = extractelement <4 x float> %b, i32 1
  %b2 = extractelement <4 x float> %b, i32 2
  %b3 = extractelement <4 x float> %b, i32 3
  %cmp0 = icmp ne i32 %c0, 0
  %cmp1 = icmp ne i32 %c1, 0
  %cmp2 = icmp ne i32 %c2, 0
  %cmp3 = icmp ne i32 %c3, 0
  %s0 = select i1 %cmp0, float %a0, float %b0
  %s1 = select i1 %cmp1, float %a1, float %b1
  %s2 = select i1 %cmp2, float %a2, float %b2
  %s3 = select i1 %cmp3, float %a3, float %b3
  %ra = insertelement <4 x float> undef, float %s0, i32 0
  %rb = insertelement <4 x float> %ra, float %s1, i32 1
  %rc = insertelement <4 x float> %rb, float %s2, i32 2
  %rd = insertelement <4 x float> %rc, float %s3, i32 3
  ret <4 x float> %rd
}

declare void @llvm.assume(i1) nounwind

; This entire tree is ephemeral, don't vectorize any of it.
define <4 x float> @simple_select_eph(<4 x float> %a, <4 x float> %b, <4 x i32> %c) #0 {
; CHECK-LABEL: @simple_select_eph(
; CHECK-NOT: icmp ne <4 x i32>
; CHECK-NOT: select <4 x i1>
  %c0 = extractelement <4 x i32> %c, i32 0
  %c1 = extractelement <4 x i32> %c, i32 1
  %c2 = extractelement <4 x i32> %c, i32 2
  %c3 = extractelement <4 x i32> %c, i32 3
  %a0 = extractelement <4 x float> %a, i32 0
  %a1 = extractelement <4 x float> %a, i32 1
  %a2 = extractelement <4 x float> %a, i32 2
  %a3 = extractelement <4 x float> %a, i32 3
  %b0 = extractelement <4 x float> %b, i32 0
  %b1 = extractelement <4 x float> %b, i32 1
  %b2 = extractelement <4 x float> %b, i32 2
  %b3 = extractelement <4 x float> %b, i32 3
  %cmp0 = icmp ne i32 %c0, 0
  %cmp1 = icmp ne i32 %c1, 0
  %cmp2 = icmp ne i32 %c2, 0
  %cmp3 = icmp ne i32 %c3, 0
  %s0 = select i1 %cmp0, float %a0, float %b0
  %s1 = select i1 %cmp1, float %a1, float %b1
  %s2 = select i1 %cmp2, float %a2, float %b2
  %s3 = select i1 %cmp3, float %a3, float %b3
  %ra = insertelement <4 x float> undef, float %s0, i32 0
  %rb = insertelement <4 x float> %ra, float %s1, i32 1
  %rc = insertelement <4 x float> %rb, float %s2, i32 2
  %rd = insertelement <4 x float> %rc, float %s3, i32 3
  ; The rebuilt vector is consumed only by the scalar sum/compare below, whose
  ; sole user is @llvm.assume; the function itself returns undef.
  %q0 = extractelement <4 x float> %rd, i32 0
  %q1 = extractelement <4 x float> %rd, i32 1
  %q2 = extractelement <4 x float> %rd, i32 2
  %q3 = extractelement <4 x float> %rd, i32 3
  %q4 = fadd float %q0, %q1
  %q5 = fadd float %q2, %q3
  %q6 = fadd float %q4, %q5
  %qi = fcmp olt float %q6, %q5
  call void @llvm.assume(i1 %qi)
  ret <4 x float> undef
}

; Insert in an order different from the vector indices to make sure it
; doesn't matter
define <4 x float> @simple_select_insert_out_of_order(<4 x float> %a, <4 x float> %b, <4 x i32> %c) #0 {
; CHECK-LABEL: @simple_select_insert_out_of_order(
; CHECK-NEXT: %1 = icmp ne <4 x i32> %c, zeroinitializer
; CHECK-NEXT: select <4 x i1> %1, <4 x float> %a, <4 x float> %b
  %c0 = extractelement <4 x i32> %c, i32 0
  %c1 = extractelement <4 x i32> %c, i32 1
  %c2 = extractelement <4 x i32> %c, i32 2
  %c3 = extractelement <4 x i32> %c, i32 3
  %a0 = extractelement <4 x float> %a, i32 0
  %a1 = extractelement <4 x float> %a, i32 1
  %a2 = extractelement <4 x float> %a, i32 2
  %a3 = extractelement <4 x float> %a, i32 3
  %b0 = extractelement <4 x float> %b, i32 0
  %b1 = extractelement <4 x float> %b, i32 1
  %b2 = extractelement <4 x float> %b, i32 2
  %b3 = extractelement <4 x float> %b, i32 3
  %cmp0 = icmp ne i32 %c0, 0
  %cmp1 = icmp ne i32 %c1, 0
  %cmp2 = icmp ne i32 %c2, 0
  %cmp3 = icmp ne i32 %c3, 0
  %s0 = select i1 %cmp0, float %a0, float %b0
  %s1 = select i1 %cmp1, float %a1, float %b1
  %s2 = select i1 %cmp2, float %a2, float %b2
  %s3 = select i1 %cmp3, float %a3, float %b3
  ; Insert positions are 2, 1, 0, 3 rather than 0, 1, 2, 3.
  %ra = insertelement <4 x float> undef, float %s0, i32 2
  %rb = insertelement <4 x float> %ra, float %s1, i32 1
  %rc = insertelement <4 x float> %rb, float %s2, i32 0
  %rd = insertelement <4 x float> %rc, float %s3, i32 3
  ret <4 x float> %rd
}

declare void @v4f32_user(<4 x float>) #0
declare void @f32_user(float) #0

; Multiple users of the final constructed vector
define <4 x float> @simple_select_users(<4 x float> %a, <4 x float> %b, <4 x i32> %c) #0 {
; CHECK-LABEL: @simple_select_users(
; CHECK-NEXT: %1 = icmp ne <4 x i32> %c, zeroinitializer
; CHECK-NEXT: select <4 x i1> %1, <4 x float> %a, <4 x float> %b
  %c0 = extractelement <4 x i32> %c, i32 0
  %c1 = extractelement <4 x i32> %c, i32 1
  %c2 = extractelement <4 x i32> %c, i32 2
  %c3 = extractelement <4 x i32> %c, i32 3
  %a0 = extractelement <4 x float> %a, i32 0
  %a1 = extractelement <4 x float> %a, i32 1
  %a2 = extractelement <4 x float> %a, i32 2
  %a3 = extractelement <4 x float> %a, i32 3
  %b0 = extractelement <4 x float> %b, i32 0
  %b1 = extractelement <4 x float> %b, i32 1
  %b2 = extractelement <4 x float> %b, i32 2
  %b3 = extractelement <4 x float> %b, i32 3
  %cmp0 = icmp ne i32 %c0, 0
  %cmp1 = icmp ne i32 %c1, 0
  %cmp2 = icmp ne i32 %c2, 0
  %cmp3 = icmp ne i32 %c3, 0
  %s0 = select i1 %cmp0, float %a0, float %b0
  %s1 = select i1 %cmp1, float %a1, float %b1
  %s2 = select i1 %cmp2, float %a2, float %b2
  %s3 = select i1 %cmp3, float %a3, float %b3
  %ra = insertelement <4 x float> undef, float %s0, i32 0
  %rb = insertelement <4 x float> %ra, float %s1, i32 1
  %rc = insertelement <4 x float> %rb, float %s2, i32 2
  %rd = insertelement <4 x float> %rc, float %s3, i32 3
  ; %rd is used both by the call and by the return.
  call void @v4f32_user(<4 x float> %rd) #0
  ret <4 x float> %rd
}

; Unused insertelement
define <4 x float> @simple_select_no_users(<4 x float> %a, <4 x float> %b, <4 x i32> %c) #0 {
; CHECK-LABEL: @simple_select_no_users(
; CHECK-NOT: icmp ne <4 x i32>
; CHECK-NOT: select <4 x i1>
  %c0 = extractelement <4 x i32> %c, i32 0
  %c1 = extractelement <4 x i32> %c, i32 1
  %c2 = extractelement <4 x i32> %c, i32 2
  %c3 = extractelement <4 x i32> %c, i32 3
  %a0 = extractelement <4 x float> %a, i32 0
  %a1 = extractelement <4 x float> %a, i32 1
  %a2 = extractelement <4 x float> %a, i32 2
  %a3 = extractelement <4 x float> %a, i32 3
  %b0 = extractelement <4 x float> %b, i32 0
  %b1 = extractelement <4 x float> %b, i32 1
  %b2 = extractelement <4 x float> %b, i32 2
  %b3 = extractelement <4 x float> %b, i32 3
  %cmp0 = icmp ne i32 %c0, 0
  %cmp1 = icmp ne i32 %c1, 0
  %cmp2 = icmp ne i32 %c2, 0
  %cmp3 = icmp ne i32 %c3, 0
  %s0 = select i1 %cmp0, float %a0, float %b0
  %s1 = select i1 %cmp1, float %a1, float %b1
  %s2 = select i1 %cmp2, float %a2, float %b2
  %s3 = select i1 %cmp3, float %a3, float %b3
  %ra = insertelement <4 x float> undef, float %s0, i32 0
  %rb = insertelement <4 x float> %ra, float %s1, i32 1
  ; %rc restarts the chain from undef instead of %rb, so %rb is never used
  ; and only lanes 2 and 3 reach the returned value.
  %rc = insertelement <4 x float> undef, float %s2, i32 2
  %rd = insertelement <4 x float> %rc, float %s3, i32 3
  ret <4 x float> %rd
}

; Make sure infinite loop doesn't happen which I ran into when trying
; to do this backwards
define <4 x i32> @reconstruct(<4 x i32> %c) #0 {
; CHECK-LABEL: @reconstruct(
  ; Pure extract/re-insert of every lane with no arithmetic in between.
  %c0 = extractelement <4 x i32> %c, i32 0
  %c1 = extractelement <4 x i32> %c, i32 1
  %c2 = extractelement <4 x i32> %c, i32 2
  %c3 = extractelement <4 x i32> %c, i32 3
  %ra = insertelement <4 x i32> undef, i32 %c0, i32 0
  %rb = insertelement <4 x i32> %ra, i32 %c1, i32 1
  %rc = insertelement <4 x i32> %rb, i32 %c2, i32 2
  %rd = insertelement <4 x i32> %rc, i32 %c3, i32 3
  ret <4 x i32> %rd
}

; Two-lane variant of the scalar icmp/select pattern.
define <2 x float> @simple_select_v2(<2 x float> %a, <2 x float> %b, <2 x i32> %c) #0 {
; CHECK-LABEL: @simple_select_v2(
; CHECK: icmp ne <2 x i32>
; CHECK: select <2 x i1>
  %c0 = extractelement <2 x i32> %c, i32 0
  %c1 = extractelement <2 x i32> %c, i32 1
  %a0 = extractelement <2 x float> %a, i32 0
  %a1 = extractelement <2 x float> %a, i32 1
  %b0 = extractelement <2 x float> %b, i32 0
  %b1 = extractelement <2 x float> %b, i32 1
  %cmp0 = icmp ne i32 %c0, 0
  %cmp1 = icmp ne i32 %c1, 0
  %s0 = select i1 %cmp0, float %a0, float %b0
  %s1 = select i1 %cmp1, float %a1, float %b1
  %ra = insertelement <2 x float> undef, float %s0, i32 0
  %rb = insertelement <2 x float> %ra, float %s1, i32 1
  ret <2 x float> %rb
}

; Make sure when we construct partial vectors, we don't keep
; re-visiting the insertelement chains starting with undef
; (low cost threshold needed to force this to happen)
; NOTE(review): this function has no check lines of its own; presumably it
; only needs the pass to finish -- confirm.
define <4 x float> @simple_select_partial_vector(<4 x float> %a, <4 x float> %b, <4 x i32> %c) #0 {
  %c0 = extractelement <4 x i32> %c, i32 0
  %c1 = extractelement <4 x i32> %c, i32 1
  %a0 = extractelement <4 x float> %a, i32 0
  %a1 = extractelement <4 x float> %a, i32 1
  %b0 = extractelement <4 x float> %b, i32 0
  %b1 = extractelement <4 x float> %b, i32 1
  %1 = insertelement <2 x i32> undef, i32 %c0, i32 0
  %2 = insertelement <2 x i32> %1, i32 %c1, i32 1
  %3 = icmp ne <2 x i32> %2, zeroinitializer
  %4 = insertelement <2 x float> undef, float %a0, i32 0
  %5 = insertelement <2 x float> %4, float %a1, i32 1
  %6 = insertelement <2 x float> undef, float %b0, i32 0
  %7 = insertelement <2 x float> %6, float %b1, i32 1
  %8 = select <2 x i1> %3, <2 x float> %5, <2 x float> %7
  %9 = extractelement <2 x float> %8, i32 0
  %ra = insertelement <4 x float> undef, float %9, i32 0
  %10 = extractelement <2 x float> %8, i32 1
  %rb = insertelement <4 x float> %ra, float %10, i32 1
  ret <4 x float> %rb
}

; Make sure that vectorization happens even if insertelement operations
; must be rescheduled. The case here is from compiling Julia.
define <4 x float> @reschedule_extract(<4 x float> %a, <4 x float> %b) {
; CHECK-LABEL: @reschedule_extract(
; CHECK: %1 = fadd <4 x float> %a, %b
  ; Each lane's extract/fadd/insert is emitted together, so the inserts are
  ; interleaved with the other lanes' scalar work.
  %a0 = extractelement <4 x float> %a, i32 0
  %b0 = extractelement <4 x float> %b, i32 0
  %c0 = fadd float %a0, %b0
  %v0 = insertelement <4 x float> undef, float %c0, i32 0
  %a1 = extractelement <4 x float> %a, i32 1
  %b1 = extractelement <4 x float> %b, i32 1
  %c1 = fadd float %a1, %b1
  %v1 = insertelement <4 x float> %v0, float %c1, i32 1
  %a2 = extractelement <4 x float> %a, i32 2
  %b2 = extractelement <4 x float> %b, i32 2
  %c2 = fadd float %a2, %b2
  %v2 = insertelement <4 x float> %v1, float %c2, i32 2
  %a3 = extractelement <4 x float> %a, i32 3
  %b3 = extractelement <4 x float> %b, i32 3
  %c3 = fadd float %a3, %b3
  %v3 = insertelement <4 x float> %v2, float %c3, i32 3
  ret <4 x float> %v3
}

; Check that cost model for vectorization takes credit for
; instructions that are erased.
define <4 x float> @take_credit(<4 x float> %a, <4 x float> %b) {
; ZEROTHRESH-LABEL: @take_credit(
; ZEROTHRESH: %1 = fadd <4 x float> %a, %b
  ; Only the ZEROTHRESH prefix (the run with -slp-threshold=0) checks this
  ; function; the expected result is a single vector fadd.
  %a0 = extractelement <4 x float> %a, i32 0
  %b0 = extractelement <4 x float> %b, i32 0
  %c0 = fadd float %a0, %b0
  %a1 = extractelement <4 x float> %a, i32 1
  %b1 = extractelement <4 x float> %b, i32 1
  %c1 = fadd float %a1, %b1
  %a2 = extractelement <4 x float> %a, i32 2
  %b2 = extractelement <4 x float> %b, i32 2
  %c2 = fadd float %a2, %b2
  %a3 = extractelement <4 x float> %a, i32 3
  %b3 = extractelement <4 x float> %b, i32 3
  %c3 = fadd float %a3, %b3
  %v0 = insertelement <4 x float> undef, float %c0, i32 0
  %v1 = insertelement <4 x float> %v0, float %c1, i32 1
  %v2 = insertelement <4 x float> %v1, float %c2, i32 2
  %v3 = insertelement <4 x float> %v2, float %c3, i32 3
  ret <4 x float> %v3
}

; Make sure we handle multiple trees that feed one build vector correctly.
define <4 x double> @multi_tree(double %w, double %x, double %y, double %z) {
entry:
  %t0 = fadd double %w , 0.000000e+00
  %t1 = fadd double %x , 1.000000e+00
  %t2 = fadd double %y , 2.000000e+00
  %t3 = fadd double %z , 3.000000e+00
  ; The fmul results are inserted in reverse lane order (3, 2, 1, 0).
  %t4 = fmul double %t0, 1.000000e+00
  %i1 = insertelement <4 x double> undef, double %t4, i32 3
  %t5 = fmul double %t1, 1.000000e+00
  %i2 = insertelement <4 x double> %i1, double %t5, i32 2
  %t6 = fmul double %t2, 1.000000e+00
  %i3 = insertelement <4 x double> %i2, double %t6, i32 1
  %t7 = fmul double %t3, 1.000000e+00
  %i4 = insertelement <4 x double> %i3, double %t7, i32 0
  ret <4 x double> %i4
}
; Expected shape: two independent <2 x double> fadd/fmul trees (one over
; %w/%x, one over %y/%z) whose lanes are extracted and inserted into the
; <4 x double> result.
; CHECK-LABEL: @multi_tree
; CHECK-DAG: %[[V0:.+]] = insertelement <2 x double> undef, double %w, i32 0
; CHECK-DAG: %[[V1:.+]] = insertelement <2 x double> %[[V0]], double %x, i32 1
; CHECK-DAG: %[[V2:.+]] = fadd <2 x double> %[[V1]], <double 0.000000e+00, double 1.000000e+00>
; CHECK-DAG: %[[V3:.+]] = insertelement <2 x double> undef, double %y, i32 0
; CHECK-DAG: %[[V4:.+]] = insertelement <2 x double> %[[V3]], double %z, i32 1
; CHECK-DAG: %[[V5:.+]] = fadd <2 x double> %[[V4]], <double 2.000000e+00, double 3.000000e+00>
; CHECK-DAG: %[[V6:.+]] = fmul <2 x double> <double 1.000000e+00, double 1.000000e+00>, %[[V2]]
; CHECK-DAG: %[[V7:.+]] = extractelement <2 x double> %[[V6]], i32 0
; CHECK-DAG: %[[I1:.+]] = insertelement <4 x double> undef, double %[[V7]], i32 3
; CHECK-DAG: %[[V8:.+]] = extractelement <2 x double> %[[V6]], i32 1
; CHECK-DAG: %[[I2:.+]] = insertelement <4 x double> %[[I1]], double %[[V8]], i32 2
; CHECK-DAG: %[[V9:.+]] = fmul <2 x double> <double 1.000000e+00, double 1.000000e+00>, %[[V5]]
; CHECK-DAG: %[[V10:.+]] = extractelement <2 x double> %[[V9]], i32 0
; CHECK-DAG: %[[I3:.+]] = insertelement <4 x double> %i2, double %[[V10]], i32 1
; CHECK-DAG: %[[V11:.+]] = extractelement <2 x double> %[[V9]], i32 1
; CHECK-DAG: %[[I4:.+]] = insertelement <4 x double> %i3, double %[[V11]], i32 0
; CHECK: ret <4 x double> %[[I4]]

attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }