; RUN: opt -S -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 < %s | FileCheck %s

target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"

; Float pattern:
;   Check vectorization of reduction code which has an fadd instruction after
;   an fcmp instruction which compares an array element and 0.
;
; float fcmp_0_fadd_select1(float * restrict x, const int N) {
;   float sum = 0.;
;   for (int i = 0; i < N; ++i)
;     if (x[i] > (float)0.)
;       sum += x[i];
;   return sum;
; }

; CHECK-LABEL: @fcmp_0_fadd_select1(
; CHECK: %[[V1:.*]] = fcmp fast ogt <4 x float> %[[V0:.*]], zeroinitializer
; CHECK: %[[V3:.*]] = fadd fast <4 x float> %[[V0]], %[[V2:.*]]
; CHECK: select <4 x i1> %[[V1]], <4 x float> %[[V3]], <4 x float> %[[V2]]
define float @fcmp_0_fadd_select1(float* noalias %x, i32 %N) nounwind readonly {
entry:
  ; Skip the loop entirely when N <= 0.
  %cmp.1 = icmp sgt i32 %N, 0
  br i1 %cmp.1, label %for.header, label %for.end

for.header:                                       ; preds = %entry
  %zext = zext i32 %N to i64
  br label %for.body

for.body:                                         ; preds = %for.header, %for.body
  %indvars.iv = phi i64 [ 0, %for.header ], [ %indvars.iv.next, %for.body ]
  %sum.1 = phi float [ 0.000000e+00, %for.header ], [ %sum.2, %for.body ]
  %arrayidx = getelementptr inbounds float, float* %x, i64 %indvars.iv
  %0 = load float, float* %arrayidx, align 4
  %cmp.2 = fcmp fast ogt float %0, 0.000000e+00
  %add = fadd fast float %0, %sum.1
  ; The conditional update is already if-converted: keep the old sum when the
  ; compare is false.
  %sum.2 = select i1 %cmp.2, float %add, float %sum.1
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond = icmp eq i64 %indvars.iv.next, %zext
  br i1 %exitcond, label %for.end, label %for.body

for.end:                                          ; preds = %for.body, %entry
  %1 = phi float [ 0.000000e+00, %entry ], [ %sum.2, %for.body ]
  ret float %1
}

; Double pattern:
;   Check vectorization of reduction code which has an fadd instruction after
;   an fcmp instruction which compares an array element and 0.
;
; double fcmp_0_fadd_select2(double * restrict x, const int N) {
;   double sum = 0.;
;   for (int i = 0; i < N; ++i)
;     if (x[i] > 0.)
;       sum += x[i];
;   return sum;
; }

; CHECK-LABEL: @fcmp_0_fadd_select2(
; CHECK: %[[V1:.*]] = fcmp fast ogt <4 x double> %[[V0:.*]], zeroinitializer
; CHECK: %[[V3:.*]] = fadd fast <4 x double> %[[V0]], %[[V2:.*]]
; CHECK: select <4 x i1> %[[V1]], <4 x double> %[[V3]], <4 x double> %[[V2]]
define double @fcmp_0_fadd_select2(double* noalias %x, i32 %N) nounwind readonly {
entry:
  ; Skip the loop entirely when N <= 0.
  %cmp.1 = icmp sgt i32 %N, 0
  br i1 %cmp.1, label %for.header, label %for.end

for.header:                                       ; preds = %entry
  %zext = zext i32 %N to i64
  br label %for.body

for.body:                                         ; preds = %for.header, %for.body
  %indvars.iv = phi i64 [ 0, %for.header ], [ %indvars.iv.next, %for.body ]
  %sum.1 = phi double [ 0.000000e+00, %for.header ], [ %sum.2, %for.body ]
  %arrayidx = getelementptr inbounds double, double* %x, i64 %indvars.iv
  ; NOTE(review): align 4 is below the 8-byte natural alignment of double -
  ; confirm this is intended.
  %0 = load double, double* %arrayidx, align 4
  %cmp.2 = fcmp fast ogt double %0, 0.000000e+00
  %add = fadd fast double %0, %sum.1
  ; Keep the old sum when the compare is false.
  %sum.2 = select i1 %cmp.2, double %add, double %sum.1
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond = icmp eq i64 %indvars.iv.next, %zext
  br i1 %exitcond, label %for.end, label %for.body

for.end:                                          ; preds = %for.body, %entry
  %1 = phi double [ 0.000000e+00, %entry ], [ %sum.2, %for.body ]
  ret double %1
}

; Float pattern:
;   Check vectorization of reduction code which has an fadd instruction after
;   an fcmp instruction which compares an array element and a floating-point
;   value.
;
; float fcmp_val_fadd_select1(float * restrict x, float y, const int N) {
;   float sum = 0.;
;   for (int i = 0; i < N; ++i)
;     if (x[i] > y)
;       sum += x[i];
;   return sum;
; }

; CHECK-LABEL: @fcmp_val_fadd_select1(
; CHECK: %[[V1:.*]] = fcmp fast ogt <4 x float> %[[V0:.*]], %broadcast.splat
; CHECK: %[[V3:.*]] = fadd fast <4 x float> %[[V0]], %[[V2:.*]]
; CHECK: select <4 x i1> %[[V1]], <4 x float> %[[V3]], <4 x float> %[[V2]]
define float @fcmp_val_fadd_select1(float* noalias %x, float %y, i32 %N) nounwind readonly {
entry:
  ; Skip the loop entirely when N <= 0.
  %cmp.1 = icmp sgt i32 %N, 0
  br i1 %cmp.1, label %for.header, label %for.end

for.header:                                       ; preds = %entry
  %zext = zext i32 %N to i64
  br label %for.body

for.body:                                         ; preds = %for.header, %for.body
  %indvars.iv = phi i64 [ 0, %for.header ], [ %indvars.iv.next, %for.body ]
  %sum.1 = phi float [ 0.000000e+00, %for.header ], [ %sum.2, %for.body ]
  %arrayidx = getelementptr inbounds float, float* %x, i64 %indvars.iv
  %0 = load float, float* %arrayidx, align 4
  ; The comparison operand %y is loop-invariant.
  %cmp.2 = fcmp fast ogt float %0, %y
  %add = fadd fast float %0, %sum.1
  %sum.2 = select i1 %cmp.2, float %add, float %sum.1
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond = icmp eq i64 %indvars.iv.next, %zext
  br i1 %exitcond, label %for.end, label %for.body

for.end:                                          ; preds = %for.body, %entry
  %1 = phi float [ 0.000000e+00, %entry ], [ %sum.2, %for.body ]
  ret float %1
}

; Double pattern:
;   Check vectorization of reduction code which has an fadd instruction after
;   an fcmp instruction which compares an array element and a floating-point
;   value.
;
; double fcmp_val_fadd_select2(double * restrict x, double y, const int N) {
;   double sum = 0.;
;   for (int i = 0; i < N; ++i)
;     if (x[i] > y)
;       sum += x[i];
;   return sum;
; }

; CHECK-LABEL: @fcmp_val_fadd_select2(
; CHECK: %[[V1:.*]] = fcmp fast ogt <4 x double> %[[V0:.*]], %broadcast.splat
; CHECK: %[[V3:.*]] = fadd fast <4 x double> %[[V0]], %[[V2:.*]]
; CHECK: select <4 x i1> %[[V1]], <4 x double> %[[V3]], <4 x double> %[[V2]]
define double @fcmp_val_fadd_select2(double* noalias %x, double %y, i32 %N) nounwind readonly {
entry:
  ; Skip the loop entirely when N <= 0.
  %cmp.1 = icmp sgt i32 %N, 0
  br i1 %cmp.1, label %for.header, label %for.end

for.header:                                       ; preds = %entry
  %zext = zext i32 %N to i64
  br label %for.body

for.body:                                         ; preds = %for.header, %for.body
  %indvars.iv = phi i64 [ 0, %for.header ], [ %indvars.iv.next, %for.body ]
  %sum.1 = phi double [ 0.000000e+00, %for.header ], [ %sum.2, %for.body ]
  %arrayidx = getelementptr inbounds double, double* %x, i64 %indvars.iv
  ; NOTE(review): align 4 is below the 8-byte natural alignment of double -
  ; confirm this is intended.
  %0 = load double, double* %arrayidx, align 4
  ; The comparison operand %y is loop-invariant.
  %cmp.2 = fcmp fast ogt double %0, %y
  %add = fadd fast double %0, %sum.1
  %sum.2 = select i1 %cmp.2, double %add, double %sum.1
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond = icmp eq i64 %indvars.iv.next, %zext
  br i1 %exitcond, label %for.end, label %for.body

for.end:                                          ; preds = %for.body, %entry
  %1 = phi double [ 0.000000e+00, %entry ], [ %sum.2, %for.body ]
  ret double %1
}

; Float pattern:
;   Check vectorization of reduction code which has an fadd instruction after
;   an fcmp instruction which compares an array element and another array
;   element.
;
; float fcmp_array_elm_fadd_select1(float * restrict x, float * restrict y,
;                                   const int N) {
;   float sum = 0.;
;   for (int i = 0; i < N; ++i)
;     if (x[i] > y[i])
;       sum += x[i];
;   return sum;
; }

; CHECK-LABEL: @fcmp_array_elm_fadd_select1(
; CHECK: %[[V2:.*]] = fcmp fast ogt <4 x float> %[[V0:.*]], %[[V1:.*]]
; CHECK: %[[V4:.*]] = fadd fast <4 x float> %[[V0]], %[[V3:.*]]
; CHECK: select <4 x i1> %[[V2]], <4 x float> %[[V4]], <4 x float> %[[V3]]
define float @fcmp_array_elm_fadd_select1(float* noalias %x, float* noalias %y, i32 %N) nounwind readonly {
entry:
  ; Skip the loop entirely when N <= 0.
  %cmp.1 = icmp sgt i32 %N, 0
  br i1 %cmp.1, label %for.header, label %for.end

for.header:                                       ; preds = %entry
  %zext = zext i32 %N to i64
  br label %for.body

for.body:                                         ; preds = %for.body, %for.header
  %indvars.iv = phi i64 [ 0, %for.header ], [ %indvars.iv.next, %for.body ]
  %sum.1 = phi float [ 0.000000e+00, %for.header ], [ %sum.2, %for.body ]
  ; Both arrays are read at the same induction index.
  %arrayidx.1 = getelementptr inbounds float, float* %x, i64 %indvars.iv
  %0 = load float, float* %arrayidx.1, align 4
  %arrayidx.2 = getelementptr inbounds float, float* %y, i64 %indvars.iv
  %1 = load float, float* %arrayidx.2, align 4
  %cmp.2 = fcmp fast ogt float %0, %1
  %add = fadd fast float %0, %sum.1
  %sum.2 = select i1 %cmp.2, float %add, float %sum.1
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond = icmp eq i64 %indvars.iv.next, %zext
  br i1 %exitcond, label %for.end, label %for.body

for.end:                                          ; preds = %for.body, %entry
  %2 = phi float [ 0.000000e+00, %entry ], [ %sum.2, %for.body ]
  ret float %2
}

; Double pattern:
;   Check vectorization of reduction code which has an fadd instruction after
;   an fcmp instruction which compares an array element and another array
;   element.
;
; double fcmp_array_elm_fadd_select2(double * restrict x, double * restrict y,
;                                    const int N) {
;   double sum = 0.;
;   for (int i = 0; i < N; ++i)
;     if (x[i] > y[i])
;       sum += x[i];
;   return sum;
; }

; CHECK-LABEL: @fcmp_array_elm_fadd_select2(
; CHECK: %[[V2:.*]] = fcmp fast ogt <4 x double> %[[V0:.*]], %[[V1:.*]]
; CHECK: %[[V4:.*]] = fadd fast <4 x double> %[[V0]], %[[V3:.*]]
; CHECK: select <4 x i1> %[[V2]], <4 x double> %[[V4]], <4 x double> %[[V3]]
define double @fcmp_array_elm_fadd_select2(double* noalias %x, double* noalias %y, i32 %N) nounwind readonly {
entry:
  ; Skip the loop entirely when N <= 0.
  %cmp.1 = icmp sgt i32 %N, 0
  br i1 %cmp.1, label %for.header, label %for.end

for.header:                                       ; preds = %entry
  %zext = zext i32 %N to i64
  br label %for.body

for.body:                                         ; preds = %for.body, %for.header
  %indvars.iv = phi i64 [ 0, %for.header ], [ %indvars.iv.next, %for.body ]
  %sum.1 = phi double [ 0.000000e+00, %for.header ], [ %sum.2, %for.body ]
  ; Both arrays are read at the same induction index.
  ; NOTE(review): align 4 is below the 8-byte natural alignment of double -
  ; confirm this is intended.
  %arrayidx.1 = getelementptr inbounds double, double* %x, i64 %indvars.iv
  %0 = load double, double* %arrayidx.1, align 4
  %arrayidx.2 = getelementptr inbounds double, double* %y, i64 %indvars.iv
  %1 = load double, double* %arrayidx.2, align 4
  %cmp.2 = fcmp fast ogt double %0, %1
  %add = fadd fast double %0, %sum.1
  %sum.2 = select i1 %cmp.2, double %add, double %sum.1
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond = icmp eq i64 %indvars.iv.next, %zext
  br i1 %exitcond, label %for.end, label %for.body

for.end:                                          ; preds = %for.body, %entry
  %2 = phi double [ 0.000000e+00, %entry ], [ %sum.2, %for.body ]
  ret double %2
}

; Float pattern:
;   Check vectorization of reduction code which has an fsub instruction after
;   an fcmp instruction which compares an array element and 0.
;
; float fcmp_0_fsub_select1(float * restrict x, const int N) {
;   float sum = 0.;
;   for (int i = 0; i < N; ++i)
;     if (x[i] > (float)0.)
;       sum -= x[i];
;   return sum;
; }

; CHECK-LABEL: @fcmp_0_fsub_select1(
; CHECK: %[[V1:.*]] = fcmp fast ogt <4 x float> %[[V0:.*]], zeroinitializer
; CHECK: %[[V3:.*]] = fsub fast <4 x float> %[[V2:.*]], %[[V0]]
; CHECK: select <4 x i1> %[[V1]], <4 x float> %[[V3]], <4 x float> %[[V2]]
define float @fcmp_0_fsub_select1(float* noalias %x, i32 %N) nounwind readonly {
entry:
  ; Skip the loop entirely when N <= 0.
  %cmp.1 = icmp sgt i32 %N, 0
  br i1 %cmp.1, label %for.header, label %for.end

for.header:                                       ; preds = %entry
  %zext = zext i32 %N to i64
  br label %for.body

for.body:                                         ; preds = %for.body, %for.header
  %indvars.iv = phi i64 [ 0, %for.header ], [ %indvars.iv.next, %for.body ]
  %sum.1 = phi float [ 0.000000e+00, %for.header ], [ %sum.2, %for.body ]
  %arrayidx = getelementptr inbounds float, float* %x, i64 %indvars.iv
  %0 = load float, float* %arrayidx, align 4
  %cmp.2 = fcmp fast ogt float %0, 0.000000e+00
  ; The running sum is the left operand of the subtraction.
  %sub = fsub fast float %sum.1, %0
  %sum.2 = select i1 %cmp.2, float %sub, float %sum.1
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond = icmp eq i64 %indvars.iv.next, %zext
  br i1 %exitcond, label %for.end, label %for.body

for.end:                                          ; preds = %for.body, %entry
  %1 = phi float [ 0.000000e+00, %entry ], [ %sum.2, %for.body ]
  ret float %1
}

; Float pattern:
;   Check that the loop is not vectorized if the fp-instruction has no
;   fast-math property.
;
; float fcmp_0_fsub_select1_novectorize(float * restrict x, const int N) {
;   float sum = 0.;
;   for (int i = 0; i < N; ++i)
;     if (x[i] > (float)0.)
;       sum -= x[i];
;   return sum;
; }

; CHECK-LABEL: @fcmp_0_fsub_select1_novectorize(
; CHECK-NOT: <4 x float>
define float @fcmp_0_fsub_select1_novectorize(float* noalias %x, i32 %N) nounwind readonly {
entry:
  ; Skip the loop entirely when N <= 0.
  %cmp.1 = icmp sgt i32 %N, 0
  br i1 %cmp.1, label %for.header, label %for.end

for.header:                                       ; preds = %entry
  %zext = zext i32 %N to i64
  br label %for.body

for.body:                                         ; preds = %for.body, %for.header
  %indvars.iv = phi i64 [ 0, %for.header ], [ %indvars.iv.next, %for.body ]
  %sum.1 = phi float [ 0.000000e+00, %for.header ], [ %sum.2, %for.body ]
  %arrayidx = getelementptr inbounds float, float* %x, i64 %indvars.iv
  %0 = load float, float* %arrayidx, align 4
  ; Neither the compare nor the subtraction carries fast-math flags here.
  %cmp.2 = fcmp ogt float %0, 0.000000e+00
  %sub = fsub float %sum.1, %0
  %sum.2 = select i1 %cmp.2, float %sub, float %sum.1
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond = icmp eq i64 %indvars.iv.next, %zext
  br i1 %exitcond, label %for.end, label %for.body

for.end:                                          ; preds = %for.body, %entry
  %1 = phi float [ 0.000000e+00, %entry ], [ %sum.2, %for.body ]
  ret float %1
}

; Double pattern:
;   Check vectorization of reduction code which has an fsub instruction after
;   an fcmp instruction which compares an array element and 0.
;
; double fcmp_0_fsub_select2(double * restrict x, const int N) {
;   double sum = 0.;
;   for (int i = 0; i < N; ++i)
;     if (x[i] > 0.)
;       sum -= x[i];
;   return sum;
; }

; CHECK-LABEL: @fcmp_0_fsub_select2(
; CHECK: %[[V1:.*]] = fcmp fast ogt <4 x double> %[[V0:.*]], zeroinitializer
; CHECK: %[[V3:.*]] = fsub fast <4 x double> %[[V2:.*]], %[[V0]]
; CHECK: select <4 x i1> %[[V1]], <4 x double> %[[V3]], <4 x double> %[[V2]]
define double @fcmp_0_fsub_select2(double* noalias %x, i32 %N) nounwind readonly {
entry:
  ; Skip the loop entirely when N <= 0.
  %cmp.1 = icmp sgt i32 %N, 0
  br i1 %cmp.1, label %for.header, label %for.end

for.header:                                       ; preds = %entry
  %zext = zext i32 %N to i64
  br label %for.body

for.body:                                         ; preds = %for.body, %for.header
  %indvars.iv = phi i64 [ 0, %for.header ], [ %indvars.iv.next, %for.body ]
  %sum.1 = phi double [ 0.000000e+00, %for.header ], [ %sum.2, %for.body ]
  %arrayidx = getelementptr inbounds double, double* %x, i64 %indvars.iv
  ; NOTE(review): align 4 is below the 8-byte natural alignment of double -
  ; confirm this is intended.
  %0 = load double, double* %arrayidx, align 4
  %cmp.2 = fcmp fast ogt double %0, 0.000000e+00
  ; The running sum is the left operand of the subtraction.
  %sub = fsub fast double %sum.1, %0
  %sum.2 = select i1 %cmp.2, double %sub, double %sum.1
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond = icmp eq i64 %indvars.iv.next, %zext
  br i1 %exitcond, label %for.end, label %for.body

for.end:                                          ; preds = %for.body, %entry
  %1 = phi double [ 0.000000e+00, %entry ], [ %sum.2, %for.body ]
  ret double %1
}

; Double pattern:
;   Check that the loop is not vectorized if the fp-instruction has no
;   fast-math property.
;
; double fcmp_0_fsub_select2_notvectorize(double * restrict x, const int N) {
;   double sum = 0.;
;   for (int i = 0; i < N; ++i)
;     if (x[i] > 0.)
;       sum -= x[i];
;   return sum;
; }

; No 4-wide double vector type may appear in this function's output: the
; compare and subtraction below carry no fast-math flags.
; CHECK-LABEL: @fcmp_0_fsub_select2_notvectorize(
; CHECK-NOT: <4 x double>
define double @fcmp_0_fsub_select2_notvectorize(double* noalias %x, i32 %N) nounwind readonly {
entry:
  ; Skip the loop entirely when N <= 0.
  %cmp.1 = icmp sgt i32 %N, 0
  br i1 %cmp.1, label %for.header, label %for.end

for.header:                                       ; preds = %entry
  %zext = zext i32 %N to i64
  br label %for.body

for.body:                                         ; preds = %for.body, %for.header
  %indvars.iv = phi i64 [ 0, %for.header ], [ %indvars.iv.next, %for.body ]
  %sum.1 = phi double [ 0.000000e+00, %for.header ], [ %sum.2, %for.body ]
  %arrayidx = getelementptr inbounds double, double* %x, i64 %indvars.iv
  ; NOTE(review): align 4 is below the 8-byte natural alignment of double -
  ; confirm this is intended.
  %0 = load double, double* %arrayidx, align 4
  ; Neither the compare nor the subtraction carries fast-math flags here.
  %cmp.2 = fcmp ogt double %0, 0.000000e+00
  %sub = fsub double %sum.1, %0
  %sum.2 = select i1 %cmp.2, double %sub, double %sum.1
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond = icmp eq i64 %indvars.iv.next, %zext
  br i1 %exitcond, label %for.end, label %for.body

for.end:                                          ; preds = %for.body, %entry
  %1 = phi double [ 0.000000e+00, %entry ], [ %sum.2, %for.body ]
  ret double %1
}

; Float pattern:
;   Check vectorization of reduction code which has an fmul instruction after
;   an fcmp instruction which compares an array element and 0.
;
; float fcmp_0_fmult_select1(float * restrict x, const int N) {
;   float sum = 0.;
;   for (int i = 0; i < N; ++i)
;     if (x[i] > (float)0.)
;       sum *= x[i];
;   return sum;
; }

; CHECK-LABEL: @fcmp_0_fmult_select1(
; CHECK: %[[V1:.*]] = fcmp fast ogt <4 x float> %[[V0:.*]], zeroinitializer
; CHECK: %[[V3:.*]] = fmul fast <4 x float> %[[V2:.*]], %[[V0]]
; CHECK: select <4 x i1> %[[V1]], <4 x float> %[[V3]], <4 x float> %[[V2]]
define float @fcmp_0_fmult_select1(float* noalias %x, i32 %N) nounwind readonly {
entry:
  ; Skip the loop entirely when N <= 0.
  %cmp.1 = icmp sgt i32 %N, 0
  br i1 %cmp.1, label %for.header, label %for.end

for.header:                                       ; preds = %entry
  %zext = zext i32 %N to i64
  br label %for.body

for.body:                                         ; preds = %for.body, %for.header
  %indvars.iv = phi i64 [ 0, %for.header ], [ %indvars.iv.next, %for.body ]
  %sum.1 = phi float [ 0.000000e+00, %for.header ], [ %sum.2, %for.body ]
  %arrayidx = getelementptr inbounds float, float* %x, i64 %indvars.iv
  %0 = load float, float* %arrayidx, align 4
  %cmp.2 = fcmp fast ogt float %0, 0.000000e+00
  ; Multiplicative update of the accumulator, selected by the compare.
  %mult = fmul fast float %sum.1, %0
  %sum.2 = select i1 %cmp.2, float %mult, float %sum.1
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond = icmp eq i64 %indvars.iv.next, %zext
  br i1 %exitcond, label %for.end, label %for.body

for.end:                                          ; preds = %for.body, %entry
  %1 = phi float [ 0.000000e+00, %entry ], [ %sum.2, %for.body ]
  ret float %1
}

; Float pattern:
;   Check that the loop is not vectorized if the fp-instruction has no
;   fast-math property.
;
; float fcmp_0_fmult_select1_notvectorize(float * restrict x, const int N) {
;   float sum = 0.;
;   for (int i = 0; i < N; ++i)
;     if (x[i] > (float)0.)
;       sum *= x[i];
;   return sum;
; }

; CHECK-LABEL: @fcmp_0_fmult_select1_notvectorize(
; CHECK-NOT: <4 x float>
define float @fcmp_0_fmult_select1_notvectorize(float* noalias %x, i32 %N) nounwind readonly {
entry:
  ; Skip the loop entirely when N <= 0.
  %cmp.1 = icmp sgt i32 %N, 0
  br i1 %cmp.1, label %for.header, label %for.end

for.header:                                       ; preds = %entry
  %zext = zext i32 %N to i64
  br label %for.body

for.body:                                         ; preds = %for.body, %for.header
  %indvars.iv = phi i64 [ 0, %for.header ], [ %indvars.iv.next, %for.body ]
  %sum.1 = phi float [ 0.000000e+00, %for.header ], [ %sum.2, %for.body ]
  %arrayidx = getelementptr inbounds float, float* %x, i64 %indvars.iv
  %0 = load float, float* %arrayidx, align 4
  ; Neither the compare nor the multiplication carries fast-math flags here.
  %cmp.2 = fcmp ogt float %0, 0.000000e+00
  %mult = fmul float %sum.1, %0
  %sum.2 = select i1 %cmp.2, float %mult, float %sum.1
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond = icmp eq i64 %indvars.iv.next, %zext
  br i1 %exitcond, label %for.end, label %for.body

for.end:                                          ; preds = %for.body, %entry
  %1 = phi float [ 0.000000e+00, %entry ], [ %sum.2, %for.body ]
  ret float %1
}

; Double pattern:
;   Check vectorization of reduction code which has an fmul instruction after
;   an fcmp instruction which compares an array element and 0.
;
; double fcmp_0_fmult_select2(double * restrict x, const int N) {
;   double sum = 0.;
;   for (int i = 0; i < N; ++i)
;     if (x[i] > 0.)
;       sum *= x[i];
;   return sum;
; }

; CHECK-LABEL: @fcmp_0_fmult_select2(
; CHECK: %[[V1:.*]] = fcmp fast ogt <4 x double> %[[V0:.*]], zeroinitializer
; CHECK: %[[V3:.*]] = fmul fast <4 x double> %[[V2:.*]], %[[V0]]
; CHECK: select <4 x i1> %[[V1]], <4 x double> %[[V3]], <4 x double> %[[V2]]
define double @fcmp_0_fmult_select2(double* noalias %x, i32 %N) nounwind readonly {
entry:
  ; Skip the loop entirely when N <= 0.
  %cmp.1 = icmp sgt i32 %N, 0
  br i1 %cmp.1, label %for.header, label %for.end

for.header:                                       ; preds = %entry
  %zext = zext i32 %N to i64
  br label %for.body

for.body:                                         ; preds = %for.body, %for.header
  %indvars.iv = phi i64 [ 0, %for.header ], [ %indvars.iv.next, %for.body ]
  %sum.1 = phi double [ 0.000000e+00, %for.header ], [ %sum.2, %for.body ]
  %arrayidx = getelementptr inbounds double, double* %x, i64 %indvars.iv
  ; NOTE(review): align 4 is below the 8-byte natural alignment of double -
  ; confirm this is intended.
  %0 = load double, double* %arrayidx, align 4
  %cmp.2 = fcmp fast ogt double %0, 0.000000e+00
  ; Multiplicative update of the accumulator, selected by the compare.
  %mult = fmul fast double %sum.1, %0
  %sum.2 = select i1 %cmp.2, double %mult, double %sum.1
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond = icmp eq i64 %indvars.iv.next, %zext
  br i1 %exitcond, label %for.end, label %for.body

for.end:                                          ; preds = %for.body, %entry
  %1 = phi double [ 0.000000e+00, %entry ], [ %sum.2, %for.body ]
  ret double %1
}

; Double pattern:
;   Check that the loop is not vectorized if the fp-instruction has no
;   fast-math property.
;
; double fcmp_0_fmult_select2_notvectorize(double * restrict x, const int N) {
;   double sum = 0.;
;   for (int i = 0; i < N; ++i)
;     if (x[i] > 0.)
;       sum *= x[i];
;   return sum;
; }

; CHECK-LABEL: @fcmp_0_fmult_select2_notvectorize(
; CHECK-NOT: <4 x double>
define double @fcmp_0_fmult_select2_notvectorize(double* noalias %x, i32 %N) nounwind readonly {
entry:
  ; Skip the loop entirely when N <= 0.
  %cmp.1 = icmp sgt i32 %N, 0
  br i1 %cmp.1, label %for.header, label %for.end

for.header:                                       ; preds = %entry
  %zext = zext i32 %N to i64
  br label %for.body

for.body:                                         ; preds = %for.body, %for.header
  %indvars.iv = phi i64 [ 0, %for.header ], [ %indvars.iv.next, %for.body ]
  %sum.1 = phi double [ 0.000000e+00, %for.header ], [ %sum.2, %for.body ]
  %arrayidx = getelementptr inbounds double, double* %x, i64 %indvars.iv
  ; NOTE(review): align 4 is below the 8-byte natural alignment of double -
  ; confirm this is intended.
  %0 = load double, double* %arrayidx, align 4
  ; Neither the compare nor the multiplication carries fast-math flags here.
  %cmp.2 = fcmp ogt double %0, 0.000000e+00
  %mult = fmul double %sum.1, %0
  %sum.2 = select i1 %cmp.2, double %mult, double %sum.1
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond = icmp eq i64 %indvars.iv.next, %zext
  br i1 %exitcond, label %for.end, label %for.body

for.end:                                          ; preds = %for.body, %entry
  %1 = phi double [ 0.000000e+00, %entry ], [ %sum.2, %for.body ]
  ret double %1
}

; Float multi pattern
;   Check vectorisation of reduction code with a pair of selects to different
;   fadd patterns.
;
; float fcmp_multi(float *a, int n) {
;   float sum=0.0;
;   for (int i=0;i<n;i++) {
;     if (a[i]>1.0)
;       sum+=a[i];
;     else if (a[i]<3.0)
;       sum+=2*a[i];
;     else
;       sum+=3*a[i];
;   }
;   return sum;
; }

; CHECK-LABEL: @fcmp_multi(
; CHECK: %[[C1:.*]] = fcmp ogt <4 x float> %[[V0:.*]], <float 1.000000e+00,
; CHECK: %[[C2:.*]] = fcmp olt <4 x float> %[[V0]], <float 3.000000e+00,
; CHECK-DAG: %[[M1:.*]] = fmul fast <4 x float> %[[V0]], <float 3.000000e+00,
; CHECK-DAG: %[[M2:.*]] = fmul fast <4 x float> %[[V0]], <float 2.000000e+00,
; CHECK: %[[C11:.*]] = xor <4 x i1> %[[C1]], <i1 true,
; CHECK-DAG: %[[C12:.*]] = and <4 x i1> %[[C2]], %[[C11]]
; CHECK-DAG: %[[C21:.*]] = xor <4 x i1> %[[C2]], <i1 true,
; CHECK: %[[C22:.*]] = and <4 x i1> %[[C21]], %[[C11]]
; CHECK: %[[S1:.*]] = select <4 x i1> %[[C22]], <4 x float> %[[M1]], <4 x float> %[[M2]]
; CHECK: %[[S2:.*]] = select <4 x i1> %[[C1]], <4 x float> %[[V0]], <4 x float> %[[S1]]
; CHECK: fadd fast <4 x float> %[[S2]],
define float @fcmp_multi(float* nocapture readonly %a, i32 %n) nounwind readonly {
entry:
  ; Skip the loop entirely when n <= 0.
  %cmp10 = icmp sgt i32 %n, 0
  br i1 %cmp10, label %for.body.preheader, label %for.end

for.body.preheader:                               ; preds = %entry
  %wide.trip.count = zext i32 %n to i64
  br label %for.body

for.body:                                         ; preds = %for.inc, %for.body.preheader
  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ]
  %sum.011 = phi float [ 0.000000e+00, %for.body.preheader ], [ %sum.1, %for.inc ]
  %arrayidx = getelementptr inbounds float, float* %a, i64 %indvars.iv
  %0 = load float, float* %arrayidx, align 4
  %cmp1 = fcmp ogt float %0, 1.000000e+00
  br i1 %cmp1, label %for.inc, label %if.else

if.else:                                          ; preds = %for.body
  %cmp8 = fcmp olt float %0, 3.000000e+00
  br i1 %cmp8, label %if.then10, label %if.else14

if.then10:                                        ; preds = %if.else
  %mul = fmul fast float %0, 2.000000e+00
  br label %for.inc

if.else14:                                        ; preds = %if.else
  %mul17 = fmul fast float %0, 3.000000e+00
  br label %for.inc

for.inc:                                          ; preds = %for.body, %if.else14, %if.then10
  ; All three paths merge into one addend, followed by a single fadd into the
  ; accumulator.
  %.pn = phi float [ %mul, %if.then10 ], [ %mul17, %if.else14 ], [ %0, %for.body ]
  %sum.1 = fadd fast float %.pn, %sum.011
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
  br i1 %exitcond, label %for.end, label %for.body

for.end:                                          ; preds = %for.inc, %entry
  %sum.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %sum.1, %for.inc ]
  ret float %sum.0.lcssa
}

; Float fadd + fsub patterns
;   Check vectorisation of reduction code with a pair of selects to different
;   instructions { fadd, fsub } but equivalent (change in constant).
;
; float fcmp_fadd_fsub(float *a, int n) {
;   float sum=0.0;
;   for (int i=0;i<n;i++) {
;     if (a[i]>1.0)
;       sum+=a[i];
;     else if (a[i]<3.0)
;       sum-=a[i];
;   }
;   return sum;
; }

; CHECK-LABEL: @fcmp_fadd_fsub(
; CHECK: %[[C1:.*]] = fcmp ogt <4 x float> %[[V0:.*]], <float 1.000000e+00,
; CHECK: %[[C2:.*]] = fcmp olt <4 x float> %[[V0]], <float 3.000000e+00,
; CHECK-DAG: %[[SUB:.*]] = fsub fast <4 x float>
; CHECK-DAG: %[[ADD:.*]] = fadd fast <4 x float>
; CHECK: %[[C11:.*]] = xor <4 x i1> %[[C1]], <i1 true,
; CHECK-DAG: %[[C12:.*]] = and <4 x i1> %[[C2]], %[[C11]]
; CHECK-DAG: %[[C21:.*]] = xor <4 x i1> %[[C2]], <i1 true,
; CHECK: %[[C22:.*]] = and <4 x i1> %[[C21]], %[[C11]]
; CHECK: %[[S1:.*]] = select <4 x i1> %[[C12]], <4 x float> %[[SUB]], <4 x float> %[[ADD]]
; CHECK: %[[S2:.*]] = select <4 x i1> %[[C22]], {{.*}} <4 x float> %[[S1]]
define float @fcmp_fadd_fsub(float* nocapture readonly %a, i32 %n) nounwind readonly {
entry:
  ; Skip the loop entirely when n <= 0.
  %cmp9 = icmp sgt i32 %n, 0
  br i1 %cmp9, label %for.body.preheader, label %for.end

for.body.preheader:                               ; preds = %entry
  %wide.trip.count = zext i32 %n to i64
  br label %for.body

for.body:                                         ; preds = %for.inc, %for.body.preheader
  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ]
  %sum.010 = phi float [ 0.000000e+00, %for.body.preheader ], [ %sum.1, %for.inc ]
  %arrayidx = getelementptr inbounds float, float* %a, i64 %indvars.iv
  %0 = load float, float* %arrayidx, align 4
  %cmp1 = fcmp ogt float %0, 1.000000e+00
  br i1 %cmp1, label %if.then, label %if.else

if.then:                                          ; preds = %for.body
  %add = fadd fast float %0, %sum.010
  br label %for.inc

if.else:                                          ; preds = %for.body
  %cmp8 = fcmp olt float %0, 3.000000e+00
  br i1 %cmp8, label %if.then10, label %for.inc

if.then10:                                        ; preds = %if.else
  %sub = fsub fast float %sum.010, %0
  br label %for.inc

for.inc:                                          ; preds = %if.then, %if.then10, %if.else
  ; The accumulator is updated by fadd, by fsub, or carried through unchanged.
  %sum.1 = phi float [ %add, %if.then ], [ %sub, %if.then10 ], [ %sum.010, %if.else ]
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
  br i1 %exitcond, label %for.end, label %for.body

for.end:                                          ; preds = %for.inc, %entry
  %sum.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %sum.1, %for.inc ]
  ret float %sum.0.lcssa
}

; Float fadd + fmul patterns
;   Check lack of vectorisation of reduction code with a pair of non-compatible
;   instructions { fadd, fmul }.
;
; float fcmp_fadd_fmul(float *a, int n) {
;   float sum=0.0;
;   for (int i=0;i<n;i++) {
;     if (a[i]>1.0)
;       sum+=a[i];
;     else if (a[i]<3.0)
;       sum*=a[i];
;   }
;   return sum;
; }

; CHECK-LABEL: @fcmp_fadd_fmul(
; CHECK-NOT: <4 x float>
define float @fcmp_fadd_fmul(float* nocapture readonly %a, i32 %n) nounwind readonly {
entry:
  ; Skip the loop entirely when n <= 0.
  %cmp9 = icmp sgt i32 %n, 0
  br i1 %cmp9, label %for.body.preheader, label %for.end

for.body.preheader:                               ; preds = %entry
  %wide.trip.count = zext i32 %n to i64
  br label %for.body

for.body:                                         ; preds = %for.inc, %for.body.preheader
  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ]
  %sum.010 = phi float [ 0.000000e+00, %for.body.preheader ], [ %sum.1, %for.inc ]
  %arrayidx = getelementptr inbounds float, float* %a, i64 %indvars.iv
  %0 = load float, float* %arrayidx, align 4
  %cmp1 = fcmp ogt float %0, 1.000000e+00
  br i1 %cmp1, label %if.then, label %if.else

if.then:                                          ; preds = %for.body
  %add = fadd fast float %0, %sum.010
  br label %for.inc

if.else:                                          ; preds = %for.body
  %cmp8 = fcmp olt float %0, 3.000000e+00
  br i1 %cmp8, label %if.then10, label %for.inc

if.then10:                                        ; preds = %if.else
  %mul = fmul fast float %0, %sum.010
  br label %for.inc

for.inc:                                          ; preds = %if.then, %if.then10, %if.else
  ; The accumulator is updated by fadd on one path and by fmul on another.
  %sum.1 = phi float [ %add, %if.then ], [ %mul, %if.then10 ], [ %sum.010, %if.else ]
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
  br i1 %exitcond, label %for.end, label %for.body

for.end:                                          ; preds = %for.inc, %entry
  %sum.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %sum.1, %for.inc ]
  ret float %sum.0.lcssa
}

; Float fadd + store patterns
;   Check lack of vectorisation of reduction code with a store back, given it
;   has loop dependency on a[i].
;
; float fcmp_store_back(float a[], int LEN) {
;   float sum = 0.0;
;   for (int i = 0; i < LEN; i++) {
;     sum += a[i];
;     a[i] = sum;
;   }
;   return sum;
; }

; This function stores through %a, so it must not carry the readonly
; attribute; only nounwind is kept.
; CHECK-LABEL: @fcmp_store_back(
; CHECK-NOT: <4 x float>
define float @fcmp_store_back(float* nocapture %a, i32 %LEN) nounwind {
entry:
  ; Skip the loop entirely when LEN <= 0.
  %cmp7 = icmp sgt i32 %LEN, 0
  br i1 %cmp7, label %for.body.preheader, label %for.end

for.body.preheader:                               ; preds = %entry
  %wide.trip.count = zext i32 %LEN to i64
  br label %for.body

for.body:                                         ; preds = %for.body, %for.body.preheader
  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
  %sum.08 = phi float [ 0.000000e+00, %for.body.preheader ], [ %add, %for.body ]
  %arrayidx = getelementptr inbounds float, float* %a, i64 %indvars.iv
  %0 = load float, float* %arrayidx, align 4
  %add = fadd fast float %0, %sum.08
  ; The running sum is written back to the element just read, so the partial
  ; sum is used outside the accumulator chain.
  store float %add, float* %arrayidx, align 4
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
  br i1 %exitcond, label %for.end, label %for.body

for.end:                                          ; preds = %for.body, %entry
  %sum.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %add, %for.body ]
  ret float %sum.0.lcssa
}