; RUN: opt < %s -S -mtriple=aarch64-none-linux-gnu -mattr=+neon -early-cse | FileCheck %s
; RUN: opt < %s -S -mtriple=aarch64-none-linux-gnu -mattr=+neon -passes=early-cse | FileCheck %s

define <4 x i32> @test_cse(i32* %a, [2 x <4 x i32>] %s.coerce, i32 %n) {
entry:
; Check that @llvm.aarch64.neon.ld2 is optimized away by Early CSE.
; CHECK-LABEL: @test_cse
; CHECK-NOT: call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8
  %s.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %s.coerce, 0
  %s.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %s.coerce, 1
  br label %for.cond

for.cond:                                         ; preds = %for.body, %entry
  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
  %res.0 = phi <4 x i32> [ undef, %entry ], [ %call, %for.body ]
  %cmp = icmp slt i32 %i.0, %n
  br i1 %cmp, label %for.body, label %for.end

for.body:                                         ; preds = %for.cond
  %0 = bitcast i32* %a to i8*
  %1 = bitcast <4 x i32> %s.coerce.fca.0.extract to <16 x i8>
  %2 = bitcast <4 x i32> %s.coerce.fca.1.extract to <16 x i8>
  %3 = bitcast <16 x i8> %1 to <4 x i32>
  %4 = bitcast <16 x i8> %2 to <4 x i32>
  call void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32> %3, <4 x i32> %4, i8* %0)
  %5 = bitcast i32* %a to i8*
  %vld2 = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8(i8* %5)
  %vld2.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 0
  %vld2.fca.1.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 1
  %call = call <4 x i32> @vaddq_s32(<4 x i32> %vld2.fca.0.extract, <4 x i32> %vld2.fca.0.extract)
  %inc = add nsw i32 %i.0, 1
  br label %for.cond

for.end:                                          ; preds = %for.cond
  ret <4 x i32> %res.0
}

define <4 x i32> @test_cse2(i32* %a, [2 x <4 x i32>] %s.coerce, i32 %n) {
entry:
; Check that the first @llvm.aarch64.neon.st2 is optimized away by Early CSE.
; CHECK-LABEL: @test_cse2
; CHECK-NOT: call void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32> %3, <4 x i32> %3, i8* %0)
; CHECK: call void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32> %3, <4 x i32> %4, i8* %0)
  %s.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %s.coerce, 0
  %s.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %s.coerce, 1
  br label %for.cond

for.cond:                                         ; preds = %for.body, %entry
  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
  %res.0 = phi <4 x i32> [ undef, %entry ], [ %call, %for.body ]
  %cmp = icmp slt i32 %i.0, %n
  br i1 %cmp, label %for.body, label %for.end

for.body:                                         ; preds = %for.cond
  %0 = bitcast i32* %a to i8*
  %1 = bitcast <4 x i32> %s.coerce.fca.0.extract to <16 x i8>
  %2 = bitcast <4 x i32> %s.coerce.fca.1.extract to <16 x i8>
  %3 = bitcast <16 x i8> %1 to <4 x i32>
  %4 = bitcast <16 x i8> %2 to <4 x i32>
  call void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32> %3, <4 x i32> %3, i8* %0)
  call void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32> %3, <4 x i32> %4, i8* %0)
  %5 = bitcast i32* %a to i8*
  %vld2 = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8(i8* %5)
  %vld2.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 0
  %vld2.fca.1.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 1
  %call = call <4 x i32> @vaddq_s32(<4 x i32> %vld2.fca.0.extract, <4 x i32> %vld2.fca.0.extract)
  %inc = add nsw i32 %i.0, 1
  br label %for.cond

for.end:                                          ; preds = %for.cond
  ret <4 x i32> %res.0
}

define <4 x i32> @test_cse3(i32* %a, [2 x <4 x i32>] %s.coerce, i32 %n) #0 {
entry:
; Check that the second @llvm.aarch64.neon.ld2 is optimized away by Early CSE.
; CHECK-LABEL: @test_cse3
; CHECK: call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8
; CHECK-NOT: call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8
  %s.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %s.coerce, 0
  %s.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %s.coerce, 1
  br label %for.cond

for.cond:                                         ; preds = %for.body, %entry
  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
  %res.0 = phi <4 x i32> [ undef, %entry ], [ %call, %for.body ]
  %cmp = icmp slt i32 %i.0, %n
  br i1 %cmp, label %for.body, label %for.end

for.body:                                         ; preds = %for.cond
  %0 = bitcast i32* %a to i8*
  %vld2 = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8(i8* %0)
  %vld2.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 0
  %vld2.fca.1.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 1
  %1 = bitcast i32* %a to i8*
  %vld22 = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8(i8* %1)
  %vld22.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld22, 0
  %vld22.fca.1.extract = extractvalue { <4 x i32>, <4 x i32> } %vld22, 1
  %call = call <4 x i32> @vaddq_s32(<4 x i32> %vld2.fca.0.extract, <4 x i32> %vld22.fca.0.extract)
  %inc = add nsw i32 %i.0, 1
  br label %for.cond

for.end:                                          ; preds = %for.cond
  ret <4 x i32> %res.0
}


define <4 x i32> @test_nocse(i32* %a, i32* %b, [2 x <4 x i32>] %s.coerce, i32 %n) {
entry:
; Check that the store prevents @llvm.aarch64.neon.ld2 from being optimized
; away by Early CSE.
; CHECK-LABEL: @test_nocse
; CHECK: call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8
  %s.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %s.coerce, 0
  %s.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %s.coerce, 1
  br label %for.cond

for.cond:                                         ; preds = %for.body, %entry
  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
  %res.0 = phi <4 x i32> [ undef, %entry ], [ %call, %for.body ]
  %cmp = icmp slt i32 %i.0, %n
  br i1 %cmp, label %for.body, label %for.end

for.body:                                         ; preds = %for.cond
  %0 = bitcast i32* %a to i8*
  %1 = bitcast <4 x i32> %s.coerce.fca.0.extract to <16 x i8>
  %2 = bitcast <4 x i32> %s.coerce.fca.1.extract to <16 x i8>
  %3 = bitcast <16 x i8> %1 to <4 x i32>
  %4 = bitcast <16 x i8> %2 to <4 x i32>
  call void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32> %3, <4 x i32> %4, i8* %0)
  store i32 0, i32* %b, align 4
  %5 = bitcast i32* %a to i8*
  %vld2 = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8(i8* %5)
  %vld2.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 0
  %vld2.fca.1.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 1
  %call = call <4 x i32> @vaddq_s32(<4 x i32> %vld2.fca.0.extract, <4 x i32> %vld2.fca.0.extract)
  %inc = add nsw i32 %i.0, 1
  br label %for.cond

for.end:                                          ; preds = %for.cond
  ret <4 x i32> %res.0
}

define <4 x i32> @test_nocse2(i32* %a, [2 x <4 x i32>] %s.coerce, i32 %n) {
entry:
; Check that @llvm.aarch64.neon.ld3 is not optimized away by Early CSE due
; to mismatch between st2 and ld3.
; CHECK-LABEL: @test_nocse2
; CHECK: call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0i8
  %s.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %s.coerce, 0
  %s.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %s.coerce, 1
  br label %for.cond

for.cond:                                         ; preds = %for.body, %entry
  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
  %res.0 = phi <4 x i32> [ undef, %entry ], [ %call, %for.body ]
  %cmp = icmp slt i32 %i.0, %n
  br i1 %cmp, label %for.body, label %for.end

for.body:                                         ; preds = %for.cond
  %0 = bitcast i32* %a to i8*
  %1 = bitcast <4 x i32> %s.coerce.fca.0.extract to <16 x i8>
  %2 = bitcast <4 x i32> %s.coerce.fca.1.extract to <16 x i8>
  %3 = bitcast <16 x i8> %1 to <4 x i32>
  %4 = bitcast <16 x i8> %2 to <4 x i32>
  call void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32> %3, <4 x i32> %4, i8* %0)
  %5 = bitcast i32* %a to i8*
  %vld3 = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0i8(i8* %5)
  %vld3.fca.0.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3, 0
  %vld3.fca.2.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3, 2
  %call = call <4 x i32> @vaddq_s32(<4 x i32> %vld3.fca.0.extract, <4 x i32> %vld3.fca.2.extract)
  %inc = add nsw i32 %i.0, 1
  br label %for.cond

for.end:                                          ; preds = %for.cond
  ret <4 x i32> %res.0
}

define <4 x i32> @test_nocse3(i32* %a, [2 x <4 x i32>] %s.coerce, i32 %n) {
entry:
; Check that @llvm.aarch64.neon.st3 is not optimized away by Early CSE due to
; mismatch between st2 and st3.
; CHECK-LABEL: @test_nocse3
; CHECK: call void @llvm.aarch64.neon.st3.v4i32.p0i8
; CHECK: call void @llvm.aarch64.neon.st2.v4i32.p0i8
  %s.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %s.coerce, 0
  %s.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %s.coerce, 1
  br label %for.cond

for.cond:                                         ; preds = %for.body, %entry
  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
  %res.0 = phi <4 x i32> [ undef, %entry ], [ %call, %for.body ]
  %cmp = icmp slt i32 %i.0, %n
  br i1 %cmp, label %for.body, label %for.end

for.body:                                         ; preds = %for.cond
  %0 = bitcast i32* %a to i8*
  %1 = bitcast <4 x i32> %s.coerce.fca.0.extract to <16 x i8>
  %2 = bitcast <4 x i32> %s.coerce.fca.1.extract to <16 x i8>
  %3 = bitcast <16 x i8> %1 to <4 x i32>
  %4 = bitcast <16 x i8> %2 to <4 x i32>
  call void @llvm.aarch64.neon.st3.v4i32.p0i8(<4 x i32> %4, <4 x i32> %3, <4 x i32> %3, i8* %0)
  call void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32> %3, <4 x i32> %3, i8* %0)
  %5 = bitcast i32* %a to i8*
  %vld3 = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0i8(i8* %5)
  %vld3.fca.0.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3, 0
  %vld3.fca.1.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3, 1
  %call = call <4 x i32> @vaddq_s32(<4 x i32> %vld3.fca.0.extract, <4 x i32> %vld3.fca.0.extract)
  %inc = add nsw i32 %i.0, 1
  br label %for.cond

for.end:                                          ; preds = %for.cond
  ret <4 x i32> %res.0
}

; Function Attrs: nounwind
declare void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32>, <4 x i32>, i8* nocapture)

; Function Attrs: nounwind
declare void @llvm.aarch64.neon.st3.v4i32.p0i8(<4 x i32>, <4 x i32>, <4 x i32>, i8* nocapture)

; Function Attrs: nounwind readonly
declare { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8(i8*)

; Function Attrs: nounwind readonly
declare { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0i8(i8*)

define internal fastcc <4 x i32> @vaddq_s32(<4 x i32> %__p0, <4 x i32> %__p1) {
entry:
  %add = add <4 x i32> %__p0, %__p1
  ret <4 x i32> %add
}