; RUN: llc < %s -mtriple=x86_64-linux -mattr=+sse2 -mcpu=nehalem | FileCheck %s --check-prefix=SSE
; RUN: llc < %s -mtriple=x86_64-win32 -mattr=+sse2 -mcpu=nehalem | FileCheck %s --check-prefix=SSE
; RUN: llc < %s -mtriple=x86_64-win32 -mattr=+avx -mcpu=corei7-avx | FileCheck %s --check-prefix=AVX
; RUN: llc < %s -mtriple=x86_64-win32 -mattr=+avx512vl -mcpu=skx | FileCheck %s --check-prefix=AVX

; This file tests the "break false dependency" machine pass: instructions such
; as cvtsi2ss/cvtsi2sd, cvtss2sd and sqrtss/sqrtsd only write part of their
; destination XMM register, creating a false dependency on the previous value.
; The checks below verify that the pass either inserts a dependency-breaking
; xorps/vxorps, folds the load into the instruction (under optsize), or picks
; an undef register with the best clearance.  The two Linux/Win32 SSE RUN
; lines share one prefix; [[A0]] captures the differing first-argument
; register (%rdi vs %rcx).

; Without optsize, the load stays a separate movss and the convert reads the
; register, so no memory-operand folding happens here.
define double @t1(float* nocapture %x) nounwind readonly ssp {
entry:
; SSE-LABEL: t1:
; SSE: movss ([[A0:%rdi|%rcx]]), %xmm0
; SSE: cvtss2sd %xmm0, %xmm0

  %0 = load float, float* %x, align 4
  %1 = fpext float %0 to double
  ret double %1
}

; With optsize, the load is folded into the convert's memory operand.
define float @t2(double* nocapture %x) nounwind readonly ssp optsize {
entry:
; SSE-LABEL: t2:
; SSE: cvtsd2ss ([[A0]]), %xmm0
  %0 = load double, double* %x, align 8
  %1 = fptrunc double %0 to float
  ret float %1
}

define float @squirtf(float* %x) nounwind {
entry:
; SSE-LABEL: squirtf:
; SSE: movss ([[A0]]), %xmm0
; SSE: sqrtss %xmm0, %xmm0
  %z = load float, float* %x
  %t = call float @llvm.sqrt.f32(float %z)
  ret float %t
}

define double @squirt(double* %x) nounwind {
entry:
; SSE-LABEL: squirt:
; SSE: movsd ([[A0]]), %xmm0
; SSE: sqrtsd %xmm0, %xmm0
  %z = load double, double* %x
  %t = call double @llvm.sqrt.f64(double %z)
  ret double %t
}

; Same as @squirtf but optsize: the load must fold into sqrtss.
define float @squirtf_size(float* %x) nounwind optsize {
entry:
; SSE-LABEL: squirtf_size:
; SSE: sqrtss ([[A0]]), %xmm0
  %z = load float, float* %x
  %t = call float @llvm.sqrt.f32(float %z)
  ret float %t
}

; Same as @squirt but optsize: the load must fold into sqrtsd.
define double @squirt_size(double* %x) nounwind optsize {
entry:
; SSE-LABEL: squirt_size:
; SSE: sqrtsd ([[A0]]), %xmm0
  %z = load double, double* %x
  %t = call double @llvm.sqrt.f64(double %z)
  ret double %t
}

declare float @llvm.sqrt.f32(float)
declare double @llvm.sqrt.f64(double)

; SSE-LABEL: loopdep1
; SSE: for.body{{$}}
;
; This loop contains two cvtsi2ss instructions that update the same xmm
; register. Verify that the break false dependency fix pass breaks those
; dependencies by inserting xorps instructions.
;
; If the register allocator chooses different registers for the two cvtsi2ss
; instructions, they are still dependent on themselves.
; NOTE(review): the two xorps CHECK lines below were split at the comma by a
; formatting mishap; rejoined so the full "xorps %reg, %reg" pattern matches.
; SSE: xorps [[XMM1:%xmm[0-9]+]], [[XMM1]]
; SSE: cvtsi2ssl %{{.*}}, [[XMM1]]
; SSE: xorps [[XMM2:%xmm[0-9]+]], [[XMM2]]
; SSE: cvtsi2ssl %{{.*}}, [[XMM2]]
;
define float @loopdep1(i32 %m) nounwind uwtable readnone ssp {
entry:
  %tobool3 = icmp eq i32 %m, 0
  br i1 %tobool3, label %for.end, label %for.body

for.body:                                         ; preds = %entry, %for.body
  %m.addr.07 = phi i32 [ %dec, %for.body ], [ %m, %entry ]
  %s1.06 = phi float [ %add, %for.body ], [ 0.000000e+00, %entry ]
  %s2.05 = phi float [ %add2, %for.body ], [ 0.000000e+00, %entry ]
  %n.04 = phi i32 [ %inc, %for.body ], [ 1, %entry ]
  %conv = sitofp i32 %n.04 to float
  %add = fadd float %s1.06, %conv
  %conv1 = sitofp i32 %m.addr.07 to float
  %add2 = fadd float %s2.05, %conv1
  %inc = add nsw i32 %n.04, 1
  %dec = add nsw i32 %m.addr.07, -1
  %tobool = icmp eq i32 %dec, 0
  br i1 %tobool, label %for.end, label %for.body

for.end:                                          ; preds = %for.body, %entry
  %s1.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %add, %for.body ]
  %s2.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %add2, %for.body ]
  %sub = fsub float %s1.0.lcssa, %s2.0.lcssa
  ret float %sub
}

; rdar:15221834 False AVX register dependencies cause 5x slowdown on
; flops-6. Make sure the unused register read by vcvtsi2sdq is zeroed
; to avoid cyclic dependence on a write to the same register in a
; previous iteration.

; AVX-LABEL: loopdep2:
; AVX-LABEL: %loop
; AVX: vxorps %[[REG:xmm.]], %{{xmm.}}, %{{xmm.}}
; AVX: vcvtsi2sdq %{{r[0-9a-x]+}}, %[[REG]], %{{xmm.}}
; SSE-LABEL: loopdep2:
; SSE-LABEL: %loop
; SSE: xorps %[[REG:xmm.]], %[[REG]]
; SSE: cvtsi2sdq %{{r[0-9a-x]+}}, %[[REG]]
define i64 @loopdep2(i64* nocapture %x, double* nocapture %y) nounwind {
entry:
  %vx = load i64, i64* %x
  br label %loop
loop:
  %i = phi i64 [ 1, %entry ], [ %inc, %loop ]
  %s1 = phi i64 [ %vx, %entry ], [ %s2, %loop ]
  %fi = sitofp i64 %i to double
  ; Clobber every XMM register so no register is known-zero across iterations.
  tail call void asm sideeffect "", "~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{dirflag},~{fpsr},~{flags}"()
  %vy = load double, double* %y
  %fipy = fadd double %fi, %vy
  %iipy = fptosi double %fipy to i64
  %s2 = add i64 %s1, %iipy
  %inc = add nsw i64 %i, 1
  %exitcond = icmp eq i64 %inc, 156250000
  br i1 %exitcond, label %ret, label %loop
ret:
  ret i64 %s2
}

; This loop contains a cvtsi2sd instruction that has a loop-carried
; false dependency on an xmm that is modified by other scalar instructions
; that follow it in the loop. Additionally, the source of convert is a
; memory operand. Verify the break false dependency fix pass breaks this
; dependency by inserting a xor before the convert.
@x = common global [1024 x double] zeroinitializer, align 16
@y = common global [1024 x double] zeroinitializer, align 16
@z = common global [1024 x double] zeroinitializer, align 16
@w = common global [1024 x double] zeroinitializer, align 16
@v = common global [1024 x i32] zeroinitializer, align 16

define void @loopdep3() {
entry:
  br label %for.cond1.preheader

for.cond1.preheader:                              ; preds = %for.inc14, %entry
  %i.025 = phi i32 [ 0, %entry ], [ %inc15, %for.inc14 ]
  br label %for.body3

for.body3:
  %indvars.iv = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next, %for.body3 ]
  %arrayidx = getelementptr inbounds [1024 x i32], [1024 x i32]* @v, i64 0, i64 %indvars.iv
  %0 = load i32, i32* %arrayidx, align 4
  %conv = sitofp i32 %0 to double
  %arrayidx5 = getelementptr inbounds [1024 x double], [1024 x double]* @x, i64 0, i64 %indvars.iv
  %1 = load double, double* %arrayidx5, align 8
  %mul = fmul double %conv, %1
  %arrayidx7 = getelementptr inbounds [1024 x double], [1024 x double]* @y, i64 0, i64 %indvars.iv
  %2 = load double, double* %arrayidx7, align 8
  %mul8 = fmul double %mul, %2
  %arrayidx10 = getelementptr inbounds [1024 x double], [1024 x double]* @z, i64 0, i64 %indvars.iv
  %3 = load double, double* %arrayidx10, align 8
  %mul11 = fmul double %mul8, %3
  %arrayidx13 = getelementptr inbounds [1024 x double], [1024 x double]* @w, i64 0, i64 %indvars.iv
  store double %mul11, double* %arrayidx13, align 8
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond = icmp eq i64 %indvars.iv.next, 1024
  ; Clobber every XMM register so the convert cannot reuse a known-clean one.
  tail call void asm sideeffect "", "~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{dirflag},~{fpsr},~{flags}"()
  br i1 %exitcond, label %for.inc14, label %for.body3

for.inc14:                                        ; preds = %for.body3
  %inc15 = add nsw i32 %i.025, 1
  %exitcond26 = icmp eq i32 %inc15, 100000
  br i1 %exitcond26, label %for.end16, label %for.cond1.preheader

for.end16:                                        ; preds = %for.inc14
  ret void

;SSE-LABEL:@loopdep3
;SSE: xorps [[XMM0:%xmm[0-9]+]], [[XMM0]]
;SSE-NEXT: cvtsi2sdl {{.*}}, [[XMM0]]
;SSE-NEXT: mulsd {{.*}}, [[XMM0]]
;SSE-NEXT: mulsd {{.*}}, [[XMM0]]
;SSE-NEXT: mulsd {{.*}}, [[XMM0]]
;SSE-NEXT: movsd [[XMM0]],
;AVX-LABEL:@loopdep3
;AVX: vxorps [[XMM0:%xmm[0-9]+]], [[XMM0]]
;AVX-NEXT: vcvtsi2sdl {{.*}}, [[XMM0]], {{%xmm[0-9]+}}
;AVX-NEXT: vmulsd {{.*}}, [[XMM0]], [[XMM0]]
;AVX-NEXT: vmulsd {{.*}}, [[XMM0]], [[XMM0]]
;AVX-NEXT: vmulsd {{.*}}, [[XMM0]], [[XMM0]]
;AVX-NEXT: vmovsd [[XMM0]],
}

; All 32 XMM registers are clobbered (in groups of four) right before the
; convert, so the pass must still zero the undef source register.
define double @inlineasmdep(i64 %arg) {
top:
  tail call void asm sideeffect "", "~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{dirflag},~{fpsr},~{flags}"()
  tail call void asm sideeffect "", "~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{dirflag},~{fpsr},~{flags}"()
  tail call void asm sideeffect "", "~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{dirflag},~{fpsr},~{flags}"()
  tail call void asm sideeffect "", "~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{dirflag},~{fpsr},~{flags}"()
  tail call void asm sideeffect "", "~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{dirflag},~{fpsr},~{flags}"()
  tail call void asm sideeffect "", "~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{dirflag},~{fpsr},~{flags}"()
  tail call void asm sideeffect "", "~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{dirflag},~{fpsr},~{flags}"()
  tail call void asm sideeffect "", "~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{dirflag},~{fpsr},~{flags}"()
  %tmp1 = sitofp i64 %arg to double
  ret double %tmp1
;AVX-LABEL:@inlineasmdep
;AVX: vxorps [[XMM0:%xmm[0-9]+]], [[XMM0]], [[XMM0]]
;AVX-NEXT: vcvtsi2sdq {{.*}}, [[XMM0]], {{%xmm[0-9]+}}
}

; Make sure we are making a smart choice regarding undef registers and
; hiding the false dependency behind a true dependency
define double @truedeps(float %arg) {
top:
  tail call void asm sideeffect "", "~{xmm6},~{dirflag},~{fpsr},~{flags}"()
  tail call void asm sideeffect "", "~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{dirflag},~{fpsr},~{flags}"()
  tail call void asm sideeffect "", "~{xmm4},~{xmm5},~{xmm7},~{dirflag},~{fpsr},~{flags}"()
  tail call void asm sideeffect "", "~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{dirflag},~{fpsr},~{flags}"()
  tail call void asm sideeffect "", "~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{dirflag},~{fpsr},~{flags}"()
  tail call void asm sideeffect "", "~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{dirflag},~{fpsr},~{flags}"()
  tail call void asm sideeffect "", "~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{dirflag},~{fpsr},~{flags}"()
  tail call void asm sideeffect "", "~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{dirflag},~{fpsr},~{flags}"()
  tail call void asm sideeffect "", "~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{dirflag},~{fpsr},~{flags}"()
  %tmp1 = fpext float %arg to double
  ret double %tmp1
;AVX-LABEL:@truedeps
;AVX-NOT: vxorps
;AVX: vcvtss2sd [[XMM0:%xmm[0-9]+]], [[XMM0]], {{%xmm[0-9]+}}
}

; Make sure we are making a smart choice regarding undef registers and
; choosing the register with the highest clearance
; (xmm6 is clobbered earliest, so it has the largest distance to its last
; write and must be the one that gets zeroed and reused.)
define double @clearence(i64 %arg) {
top:
  tail call void asm sideeffect "", "~{xmm6},~{dirflag},~{fpsr},~{flags}"()
  tail call void asm sideeffect "", "~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{dirflag},~{fpsr},~{flags}"()
  tail call void asm sideeffect "", "~{xmm4},~{xmm5},~{xmm7},~{dirflag},~{fpsr},~{flags}"()
  tail call void asm sideeffect "", "~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{dirflag},~{fpsr},~{flags}"()
  tail call void asm sideeffect "", "~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{dirflag},~{fpsr},~{flags}"()
  tail call void asm sideeffect "", "~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{dirflag},~{fpsr},~{flags}"()
  tail call void asm sideeffect "", "~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{dirflag},~{fpsr},~{flags}"()
  tail call void asm sideeffect "", "~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{dirflag},~{fpsr},~{flags}"()
  tail call void asm sideeffect "", "~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{dirflag},~{fpsr},~{flags}"()
  %tmp1 = sitofp i64 %arg to double
  ret double %tmp1
;AVX-LABEL:@clearence
;AVX: vxorps [[XMM6:%xmm6]], [[XMM6]], [[XMM6]]
;AVX-NEXT: vcvtsi2sdq {{.*}}, [[XMM6]], {{%xmm[0-9]+}}
}

; Make sure we are making a smart choice regarding undef registers in order to
; avoid a cyclic dependence on a write to the same register in a previous
; iteration, especially when we cannot zero out the undef register because it
; is alive.
define i64 @loopclearence(i64* nocapture %x, double* nocapture %y) nounwind {
entry:
  %vx = load i64, i64* %x
  br label %loop
loop:
  %i = phi i64 [ 1, %entry ], [ %inc, %loop ]
  %s1 = phi i64 [ %vx, %entry ], [ %s2, %loop ]
  %fi = sitofp i64 %i to double
  ; xmm4-xmm7 are deliberately NOT clobbered here, leaving them free for the
  ; pass to pick as the undef source of the convert.
  tail call void asm sideeffect "", "~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{dirflag},~{fpsr},~{flags}"()
  tail call void asm sideeffect "", "~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{dirflag},~{fpsr},~{flags}"()
  tail call void asm sideeffect "", "~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{dirflag},~{fpsr},~{flags}"()
  tail call void asm sideeffect "", "~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{dirflag},~{fpsr},~{flags}"()
  tail call void asm sideeffect "", "~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{dirflag},~{fpsr},~{flags}"()
  tail call void asm sideeffect "", "~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{dirflag},~{fpsr},~{flags}"()
  tail call void asm sideeffect "", "~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{dirflag},~{fpsr},~{flags}"()
  %vy = load double, double* %y
  %fipy = fadd double %fi, %vy
  %iipy = fptosi double %fipy to i64
  %s2 = add i64 %s1, %iipy
  %inc = add nsw i64 %i, 1
  %exitcond = icmp eq i64 %inc, 156250000
  br i1 %exitcond, label %ret, label %loop
ret:
  ret i64 %s2
;AVX-LABEL:@loopclearence
;Registers 4-7 are not used and therefore one of them should be chosen
;AVX-NOT: {{%xmm[4-7]}}
;AVX: vcvtsi2sdq {{.*}}, [[XMM4_7:%xmm[4-7]]], {{%xmm[0-9]+}}
;AVX-NOT: [[XMM4_7]]
}

; Make sure we are making a smart choice regarding undef registers even for more
; complicated loop structures. This example is the inner loop from
; julia> a = falses(10000); a[1:4:end] = true
; julia> linspace(1.0,2.0,10000)[a]
define void @loopclearance2(double* nocapture %y, i64* %x, double %c1, double %c2, double %c3, double %c4, i64 %size) {
entry:
  tail call void asm sideeffect "", "~{xmm7},~{dirflag},~{fpsr},~{flags}"()
  tail call void asm sideeffect "", "~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{dirflag},~{fpsr},~{flags}"()
  tail call void asm sideeffect "", "~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{dirflag},~{fpsr},~{flags}"()
  tail call void asm sideeffect "", "~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{dirflag},~{fpsr},~{flags}"()
  tail call void asm sideeffect "", "~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{dirflag},~{fpsr},~{flags}"()
  tail call void asm sideeffect "", "~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{dirflag},~{fpsr},~{flags}"()
  tail call void asm sideeffect "", "~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{dirflag},~{fpsr},~{flags}"()
  br label %loop

loop:
  %phi_i = phi i64 [ 1, %entry ], [ %nexti, %loop_end ]
  %phi_j = phi i64 [ 1, %entry ], [ %nextj, %loop_end ]
  %phi_k = phi i64 [ 0, %entry ], [ %nextk, %loop_end ]
  br label %inner_loop

inner_loop:
  ; Scan forward for the next set bit in the bit vector at %x, starting at
  ; bit index %phi (word = %phi >> 6, bit = %phi & 63).
  %phi = phi i64 [ %phi_k, %loop ], [ %nextk, %inner_loop ]
  %idx = lshr i64 %phi, 6
  %inputptr = getelementptr i64, i64* %x, i64 %idx
  %input = load i64, i64* %inputptr, align 8
  %masked = and i64 %phi, 63
  %shiftedmasked = shl i64 1, %masked
  %maskedinput = and i64 %input, %shiftedmasked
  %cmp = icmp eq i64 %maskedinput, 0
  %nextk = add i64 %phi, 1
  br i1 %cmp, label %inner_loop, label %loop_end

loop_end:
  %nexti = add i64 %phi_i, 1
  %nextj = add i64 %phi_j, 1
  ; Register use, plus us clobbering 7-15 above, basically forces xmm6 here as
  ; the only reasonable choice. The primary thing we care about is that it's
  ; not one of the registers used in the loop (e.g. not the output reg here)
;AVX-NOT: %xmm6
;AVX: vcvtsi2sdq {{.*}}, %xmm6, {{%xmm[0-9]+}}
;AVX-NOT: %xmm6
  %nexti_f = sitofp i64 %nexti to double
  %sub = fsub double %c1, %nexti_f
  %mul = fmul double %sub, %c2
;AVX: vcvtsi2sdq {{.*}}, %xmm6, {{%xmm[0-9]+}}
;AVX-NOT: %xmm6
  %phi_f = sitofp i64 %phi to double
  %mul2 = fmul double %phi_f, %c3
  %add2 = fadd double %mul, %mul2
  %div = fdiv double %add2, %c4
  %prev_j = add i64 %phi_j, -1
  %outptr = getelementptr double, double* %y, i64 %prev_j
  store double %div, double* %outptr, align 8
  %done = icmp slt i64 %size, %nexti
  br i1 %done, label %loopdone, label %loop

loopdone:
  ret void
}