; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+avx -fixup-byte-word-insts=1 < %s | FileCheck -check-prefix=CHECK -check-prefix=BWON %s
; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+avx -fixup-byte-word-insts=0 < %s | FileCheck -check-prefix=CHECK -check-prefix=BWOFF %s
; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+avx -addr-sink-using-gep=1 < %s | FileCheck -check-prefix=CHECK -check-prefix=BWON %s

; struct.A is eight adjacent i8 fields; struct.B is eight adjacent i32 fields.
; The loop tests below store to (or copy between) neighbouring fields.
%struct.A = type { i8, i8, i8, i8, i8, i8, i8, i8 }
%struct.B = type { i32, i32, i32, i32, i32, i32, i32, i32 }

; Every iteration writes the constants 1..8 into the eight byte fields of one
; struct.A. The stores are expected to be saved as one big integer:
; 578437695752307201 == 0x0807060504030201.
; CHECK-LABEL: merge_const_store:
; CHECK: movabsq $578437695752307201
; CHECK: ret
define void @merge_const_store(i32 %count, %struct.A* nocapture %p) nounwind uwtable noinline ssp {
entry:
  %nonempty = icmp sgt i32 %count, 0
  br i1 %nonempty, label %body, label %done

body:
  %iter = phi i32 [ %iter.next, %body ], [ 0, %entry ]
  %elt = phi %struct.A* [ %elt.next, %body ], [ %p, %entry ]
  %f0 = getelementptr inbounds %struct.A, %struct.A* %elt, i64 0, i32 0
  store i8 1, i8* %f0, align 1
  %f1 = getelementptr inbounds %struct.A, %struct.A* %elt, i64 0, i32 1
  store i8 2, i8* %f1, align 1
  %f2 = getelementptr inbounds %struct.A, %struct.A* %elt, i64 0, i32 2
  store i8 3, i8* %f2, align 1
  %f3 = getelementptr inbounds %struct.A, %struct.A* %elt, i64 0, i32 3
  store i8 4, i8* %f3, align 1
  %f4 = getelementptr inbounds %struct.A, %struct.A* %elt, i64 0, i32 4
  store i8 5, i8* %f4, align 1
  %f5 = getelementptr inbounds %struct.A, %struct.A* %elt, i64 0, i32 5
  store i8 6, i8* %f5, align 1
  %f6 = getelementptr inbounds %struct.A, %struct.A* %elt, i64 0, i32 6
  store i8 7, i8* %f6, align 1
  %f7 = getelementptr inbounds %struct.A, %struct.A* %elt, i64 0, i32 7
  store i8 8, i8* %f7, align 1
  %iter.next = add nsw i32 %iter, 1
  %elt.next = getelementptr inbounds %struct.A, %struct.A* %elt, i64 1
  %finished = icmp eq i32 %iter.next, %count
  br i1 %finished, label %done, label %body

done:
  ret void
}
; Every iteration zeroes all eight i32 fields of one struct.B.
; No vectors because we use noimplicitfloat.
; CHECK-LABEL: merge_const_store_no_vec:
; CHECK-NOT: vmovups
; CHECK: ret
define void @merge_const_store_no_vec(i32 %count, %struct.B* nocapture %p) noimplicitfloat {
entry:
  %nonempty = icmp sgt i32 %count, 0
  br i1 %nonempty, label %body, label %done

body:
  %iter = phi i32 [ %iter.next, %body ], [ 0, %entry ]
  %elt = phi %struct.B* [ %elt.next, %body ], [ %p, %entry ]
  %f0 = getelementptr inbounds %struct.B, %struct.B* %elt, i64 0, i32 0
  store i32 0, i32* %f0, align 4
  %f1 = getelementptr inbounds %struct.B, %struct.B* %elt, i64 0, i32 1
  store i32 0, i32* %f1, align 4
  %f2 = getelementptr inbounds %struct.B, %struct.B* %elt, i64 0, i32 2
  store i32 0, i32* %f2, align 4
  %f3 = getelementptr inbounds %struct.B, %struct.B* %elt, i64 0, i32 3
  store i32 0, i32* %f3, align 4
  %f4 = getelementptr inbounds %struct.B, %struct.B* %elt, i64 0, i32 4
  store i32 0, i32* %f4, align 4
  %f5 = getelementptr inbounds %struct.B, %struct.B* %elt, i64 0, i32 5
  store i32 0, i32* %f5, align 4
  %f6 = getelementptr inbounds %struct.B, %struct.B* %elt, i64 0, i32 6
  store i32 0, i32* %f6, align 4
  %f7 = getelementptr inbounds %struct.B, %struct.B* %elt, i64 0, i32 7
  store i32 0, i32* %f7, align 4
  %iter.next = add nsw i32 %iter, 1
  %elt.next = getelementptr inbounds %struct.B, %struct.B* %elt, i64 1
  %finished = icmp eq i32 %iter.next, %count
  br i1 %finished, label %done, label %body

done:
  ret void
}

; Move the constants using a single vector store.
; CHECK-LABEL: merge_const_store_vec:
; CHECK: vmovups
; CHECK: ret
define void @merge_const_store_vec(i32 %count, %struct.B* nocapture %p) nounwind uwtable noinline ssp {
entry:
  %nonempty = icmp sgt i32 %count, 0
  br i1 %nonempty, label %body, label %done

body:
  %iter = phi i32 [ %iter.next, %body ], [ 0, %entry ]
  %elt = phi %struct.B* [ %elt.next, %body ], [ %p, %entry ]
  ; Eight adjacent zero stores covering the whole struct.B.
  %f0 = getelementptr inbounds %struct.B, %struct.B* %elt, i64 0, i32 0
  store i32 0, i32* %f0, align 4
  %f1 = getelementptr inbounds %struct.B, %struct.B* %elt, i64 0, i32 1
  store i32 0, i32* %f1, align 4
  %f2 = getelementptr inbounds %struct.B, %struct.B* %elt, i64 0, i32 2
  store i32 0, i32* %f2, align 4
  %f3 = getelementptr inbounds %struct.B, %struct.B* %elt, i64 0, i32 3
  store i32 0, i32* %f3, align 4
  %f4 = getelementptr inbounds %struct.B, %struct.B* %elt, i64 0, i32 4
  store i32 0, i32* %f4, align 4
  %f5 = getelementptr inbounds %struct.B, %struct.B* %elt, i64 0, i32 5
  store i32 0, i32* %f5, align 4
  %f6 = getelementptr inbounds %struct.B, %struct.B* %elt, i64 0, i32 6
  store i32 0, i32* %f6, align 4
  %f7 = getelementptr inbounds %struct.B, %struct.B* %elt, i64 0, i32 7
  store i32 0, i32* %f7, align 4
  %iter.next = add nsw i32 %iter, 1
  %elt.next = getelementptr inbounds %struct.B, %struct.B* %elt, i64 1
  %finished = icmp eq i32 %iter.next, %count
  br i1 %finished, label %done, label %body

done:
  ret void
}

; Move the first 4 constants as a single 32-bit integer. Move the rest as
; scalars.
; 67305985 == 0x04030201: the four leading constant bytes as one immediate.
; The non-constant byte and the three constants after it stay byte stores.
; CHECK-LABEL: merge_nonconst_store:
; CHECK: movl $67305985
; CHECK: movb
; CHECK: movb
; CHECK: movb
; CHECK: movb
; CHECK: ret
define void @merge_nonconst_store(i32 %count, i8 %zz, %struct.A* nocapture %p) nounwind uwtable noinline ssp {
entry:
  %nonempty = icmp sgt i32 %count, 0
  br i1 %nonempty, label %body, label %done

body:
  %iter = phi i32 [ %iter.next, %body ], [ 0, %entry ]
  %elt = phi %struct.A* [ %elt.next, %body ], [ %p, %entry ]
  %f0 = getelementptr inbounds %struct.A, %struct.A* %elt, i64 0, i32 0
  store i8 1, i8* %f0, align 1
  %f1 = getelementptr inbounds %struct.A, %struct.A* %elt, i64 0, i32 1
  store i8 2, i8* %f1, align 1
  %f2 = getelementptr inbounds %struct.A, %struct.A* %elt, i64 0, i32 2
  store i8 3, i8* %f2, align 1
  %f3 = getelementptr inbounds %struct.A, %struct.A* %elt, i64 0, i32 3
  store i8 4, i8* %f3, align 1
  %f4 = getelementptr inbounds %struct.A, %struct.A* %elt, i64 0, i32 4
  store i8 %zz, i8* %f4, align 1 ; <-- not a constant
  %f5 = getelementptr inbounds %struct.A, %struct.A* %elt, i64 0, i32 5
  store i8 6, i8* %f5, align 1
  %f6 = getelementptr inbounds %struct.A, %struct.A* %elt, i64 0, i32 6
  store i8 7, i8* %f6, align 1
  %f7 = getelementptr inbounds %struct.A, %struct.A* %elt, i64 0, i32 7
  store i8 8, i8* %f7, align 1
  %iter.next = add nsw i32 %iter, 1
  %elt.next = getelementptr inbounds %struct.A, %struct.A* %elt, i64 1
  %finished = icmp eq i32 %iter.next, %count
  br i1 %finished, label %done, label %body

done:
  ret void
}


; Two adjacent byte loads from %q feed two adjacent byte stores into %p; the
; pair is expected to become one 16-bit load and one 16-bit store. With
; byte/word fixups on, the load is a zero-extending movzwl, otherwise a movw.
; CHECK-LABEL: merge_loads_i16:
; load:
; BWON: movzwl
; BWOFF: movw
; store:
; CHECK: movw
; CHECK: ret
define void @merge_loads_i16(i32 %count, %struct.A* noalias nocapture %q, %struct.A* noalias nocapture %p) nounwind uwtable noinline ssp {
entry:
  %nonempty = icmp sgt i32 %count, 0
  br i1 %nonempty, label %preheader, label %done

preheader:
  %src0 = getelementptr inbounds %struct.A, %struct.A* %q, i64 0, i32 0
  %src1 = getelementptr inbounds %struct.A, %struct.A* %q, i64 0, i32 1
  br label %body

body:
  %iter = phi i32 [ 0, %preheader ], [ %iter.next, %body ]
  %dst = phi %struct.A* [ %p, %preheader ], [ %dst.next, %body ]
  %v0 = load i8, i8* %src0, align 1
  %v1 = load i8, i8* %src1, align 1
  %dst0 = getelementptr inbounds %struct.A, %struct.A* %dst, i64 0, i32 0
  store i8 %v0, i8* %dst0, align 1
  %dst1 = getelementptr inbounds %struct.A, %struct.A* %dst, i64 0, i32 1
  store i8 %v1, i8* %dst1, align 1
  %iter.next = add nsw i32 %iter, 1
  %dst.next = getelementptr inbounds %struct.A, %struct.A* %dst, i64 1
  %finished = icmp eq i32 %iter.next, %count
  br i1 %finished, label %done, label %body

done:
  ret void
}

; The loads and the stores are interleaved. Can't merge them.
; CHECK-LABEL: no_merge_loads:
; BWON: movzbl
; BWOFF: movb
; CHECK: movb
; BWON: movzbl
; BWOFF: movb
; CHECK: movb
; CHECK: ret
define void @no_merge_loads(i32 %count, %struct.A* noalias nocapture %q, %struct.A* noalias nocapture %p) nounwind uwtable noinline ssp {
entry:
  %nonempty = icmp sgt i32 %count, 0
  br i1 %nonempty, label %preheader, label %done

preheader:
  %src0 = getelementptr inbounds %struct.A, %struct.A* %q, i64 0, i32 0
  %src1 = getelementptr inbounds %struct.A, %struct.A* %q, i64 0, i32 1
  br label %body

body:
  %iter = phi i32 [ 0, %preheader ], [ %iter.next, %body ]
  %dst = phi %struct.A* [ %p, %preheader ], [ %dst.next, %body ]
  ; load 0, store 0, then load 1, store 1: the first store sits between the loads.
  %v0 = load i8, i8* %src0, align 1
  %dst0 = getelementptr inbounds %struct.A, %struct.A* %dst, i64 0, i32 0
  store i8 %v0, i8* %dst0, align 1
  %dst1 = getelementptr inbounds %struct.A, %struct.A* %dst, i64 0, i32 1
  %v1 = load i8, i8* %src1, align 1
  store i8 %v1, i8* %dst1, align 1
  %iter.next = add nsw i32 %iter, 1
  %dst.next = getelementptr inbounds %struct.A, %struct.A* %dst, i64 1
  %finished = icmp eq i32 %iter.next, %count
  br i1 %finished, label %done, label %body

done:
  ret void
}


; Two adjacent i32 loads copied to two adjacent i32 stores: expected to be
; handled as one 64-bit load and one 64-bit store.
; CHECK-LABEL: merge_loads_integer:
; load:
; CHECK: movq
; store:
; CHECK: movq
; CHECK: ret
define void @merge_loads_integer(i32 %count, %struct.B* noalias nocapture %q, %struct.B* noalias nocapture %p) nounwind uwtable noinline ssp {
entry:
  %nonempty = icmp sgt i32 %count, 0
  br i1 %nonempty, label %preheader, label %done

preheader:
  %src0 = getelementptr inbounds %struct.B, %struct.B* %q, i64 0, i32 0
  %src1 = getelementptr inbounds %struct.B, %struct.B* %q, i64 0, i32 1
  br label %body

body:
  %iter = phi i32 [ 0, %preheader ], [ %iter.next, %body ]
  %dst = phi %struct.B* [ %p, %preheader ], [ %dst.next, %body ]
  %v0 = load i32, i32* %src0
  %v1 = load i32, i32* %src1
  %dst0 = getelementptr inbounds %struct.B, %struct.B* %dst, i64 0, i32 0
  store i32 %v0, i32* %dst0
  %dst1 = getelementptr inbounds %struct.B, %struct.B* %dst, i64 0, i32 1
  store i32 %v1, i32* %dst1
  %iter.next = add nsw i32 %iter, 1
  %dst.next = getelementptr inbounds %struct.B, %struct.B* %dst, i64 1
  %finished = icmp eq i32 %iter.next, %count
  br i1 %finished, label %done, label %body

done:
  ret void
}


; Four adjacent i32 loads copied to four adjacent i32 stores: expected to be
; handled as one unaligned vector load and one unaligned vector store.
; CHECK-LABEL: merge_loads_vector:
; load:
; CHECK: movups
; store:
; CHECK: movups
; CHECK: ret
define void @merge_loads_vector(i32 %count, %struct.B* noalias nocapture %q, %struct.B* noalias nocapture %p) nounwind uwtable noinline ssp {
entry:
  %nonempty = icmp sgt i32 %count, 0
  br i1 %nonempty, label %preheader, label %done

preheader:
  %src0 = getelementptr inbounds %struct.B, %struct.B* %q, i64 0, i32 0
  %src1 = getelementptr inbounds %struct.B, %struct.B* %q, i64 0, i32 1
  %src2 = getelementptr inbounds %struct.B, %struct.B* %q, i64 0, i32 2
  %src3 = getelementptr inbounds %struct.B, %struct.B* %q, i64 0, i32 3
  br label %body

body:
  %iter = phi i32 [ 0, %preheader ], [ %iter.next, %body ]
  %dst = phi %struct.B* [ %p, %preheader ], [ %dst.next, %body ]
  %dst0 = getelementptr inbounds %struct.B, %struct.B* %dst, i64 0, i32 0
  %dst1 = getelementptr inbounds %struct.B, %struct.B* %dst, i64 0, i32 1
  %dst2 = getelementptr inbounds %struct.B, %struct.B* %dst, i64 0, i32 2
  %dst3 = getelementptr inbounds %struct.B, %struct.B* %dst, i64 0, i32 3
  %v0 = load i32, i32* %src0
  %v1 = load i32, i32* %src1
  %v2 = load i32, i32* %src2
  %v3 = load i32, i32* %src3
  store i32 %v0, i32* %dst0
  store i32 %v1, i32* %dst1
  store i32 %v2, i32* %dst2
  store i32 %v3, i32* %dst3
  %iter.next = add nsw i32 %iter, 1
  %dst.next = getelementptr inbounds %struct.B, %struct.B* %dst, i64 1
  %finished = icmp eq i32 %iter.next, %count
  br i1 %finished, label %done, label %body

done:
  ret void
}

;; On x86, even unaligned copies should be merged to vector ops.
;; TODO: however, this cannot happen at the moment, due to brokenness
;; in MergeConsecutiveStores. See UseAA FIXME in DAGCombiner.cpp
;; visitSTORE.
; Copies the first four i32 fields of %q into each struct.B of %p using only
; byte-aligned (align 1) loads and stores. Merging these into vector ops is
; not expected yet, so no vmovups may appear in the output.
;
; The TODO notes are kept on their own comment lines on purpose: FileCheck
; treats everything after a directive's colon as the pattern, so a note on the
; same line becomes part of the pattern and the negative check can never fire.
; CHECK-LABEL: merge_loads_no_align:
; load:
; TODO: expect vmovups here once unaligned copies are merged.
; CHECK-NOT: vmovups
; store:
; TODO: expect vmovups here once unaligned copies are merged.
; CHECK-NOT: vmovups
; CHECK: ret
define void @merge_loads_no_align(i32 %count, %struct.B* noalias nocapture %q, %struct.B* noalias nocapture %p) nounwind uwtable noinline ssp {
entry:
  %nonempty = icmp sgt i32 %count, 0
  br i1 %nonempty, label %preheader, label %done

preheader:
  %src0 = getelementptr inbounds %struct.B, %struct.B* %q, i64 0, i32 0
  %src1 = getelementptr inbounds %struct.B, %struct.B* %q, i64 0, i32 1
  %src2 = getelementptr inbounds %struct.B, %struct.B* %q, i64 0, i32 2
  %src3 = getelementptr inbounds %struct.B, %struct.B* %q, i64 0, i32 3
  br label %body

body:
  %iter = phi i32 [ 0, %preheader ], [ %iter.next, %body ]
  %dst = phi %struct.B* [ %p, %preheader ], [ %dst.next, %body ]
  %dst0 = getelementptr inbounds %struct.B, %struct.B* %dst, i64 0, i32 0
  %dst1 = getelementptr inbounds %struct.B, %struct.B* %dst, i64 0, i32 1
  %dst2 = getelementptr inbounds %struct.B, %struct.B* %dst, i64 0, i32 2
  %dst3 = getelementptr inbounds %struct.B, %struct.B* %dst, i64 0, i32 3
  %v0 = load i32, i32* %src0, align 1
  %v1 = load i32, i32* %src1, align 1
  %v2 = load i32, i32* %src2, align 1
  %v3 = load i32, i32* %src3, align 1
  store i32 %v0, i32* %dst0, align 1
  store i32 %v1, i32* %dst1, align 1
  store i32 %v2, i32* %dst2, align 1
  store i32 %v3, i32* %dst3, align 1
  %iter.next = add nsw i32 %iter, 1
  %dst.next = getelementptr inbounds %struct.B, %struct.B* %dst, i64 1
  %finished = icmp eq i32 %iter.next, %count
  br i1 %finished, label %done, label %body

done:
  ret void
}

; Make sure that we merge the consecutive load/store sequence below and use a
; word (16 bit) instead of a byte copy.
; Each iteration reads an i64 index from %a, loads the bytes at %c[index] and
; %c[index + 1], and stores them to two adjacent bytes of %b. The register the
; 16-bit load writes is captured as REG and must be the one the 16-bit store
; reads.
; CHECK-LABEL: MergeLoadStoreBaseIndexOffset:
; BWON: movzwl (%{{.*}},%{{.*}}), %e[[REG:[a-z]+]]
; BWOFF: movw (%{{.*}},%{{.*}}), %[[REG:[a-z]+]]
; CHECK: movw %[[REG]], (%{{.*}})
define void @MergeLoadStoreBaseIndexOffset(i64* %a, i8* %b, i8* %c, i32 %n) {
entry:
  br label %loop

loop:
  %remaining = phi i32 [ %n, %entry ], [ %remaining.next, %loop ]
  %dst = phi i8* [ %b, %entry ], [ %dst.next, %loop ]
  %idx.ptr = phi i64* [ %a, %entry ], [ %idx.ptr.next, %loop ]
  %idx.ptr.next = getelementptr inbounds i64, i64* %idx.ptr, i64 1
  %idx = load i64, i64* %idx.ptr, align 1
  %src0 = getelementptr inbounds i8, i8* %c, i64 %idx
  %byte0 = load i8, i8* %src0, align 1
  %idx.plus1 = add i64 %idx, 1
  %src1 = getelementptr inbounds i8, i8* %c, i64 %idx.plus1
  %byte1 = load i8, i8* %src1, align 1
  store i8 %byte0, i8* %dst, align 1
  %dst1 = getelementptr inbounds i8, i8* %dst, i64 1
  store i8 %byte1, i8* %dst1, align 1
  %dst.next = getelementptr inbounds i8, i8* %dst, i64 2
  %remaining.next = add nsw i32 %remaining, -1
  %finished = icmp eq i32 %remaining.next, 0
  br i1 %finished, label %exit, label %loop

exit:
  ret void
}

; Make sure that we merge the consecutive load/store sequence below and use a
; word (16 bit) instead of a byte copy even if there are intermediate sign
; extensions.
; The index is an i8 loaded from %a and sign-extended to i64 before both
; address computations (index and index + 1), so the two byte loads are still
; adjacent and the copy is expected to use a 16-bit load and store.
; CHECK-LABEL: MergeLoadStoreBaseIndexOffsetSext:
; BWON: movzwl (%{{.*}},%{{.*}}), %e[[REG:[a-z]+]]
; BWOFF: movw (%{{.*}},%{{.*}}), %[[REG:[a-z]+]]
; CHECK: movw %[[REG]], (%{{.*}})
define void @MergeLoadStoreBaseIndexOffsetSext(i8* %a, i8* %b, i8* %c, i32 %n) {
entry:
  br label %loop

loop:
  %remaining = phi i32 [ %n, %entry ], [ %remaining.next, %loop ]
  %dst = phi i8* [ %b, %entry ], [ %dst.next, %loop ]
  %idx.ptr = phi i8* [ %a, %entry ], [ %idx.ptr.next, %loop ]
  %idx.ptr.next = getelementptr inbounds i8, i8* %idx.ptr, i64 1
  %idx.i8 = load i8, i8* %idx.ptr, align 1
  %idx = sext i8 %idx.i8 to i64
  %src0 = getelementptr inbounds i8, i8* %c, i64 %idx
  %byte0 = load i8, i8* %src0, align 1
  %idx.plus1 = add i64 %idx, 1
  %src1 = getelementptr inbounds i8, i8* %c, i64 %idx.plus1
  %byte1 = load i8, i8* %src1, align 1
  store i8 %byte0, i8* %dst, align 1
  %dst1 = getelementptr inbounds i8, i8* %dst, i64 1
  store i8 %byte1, i8* %dst1, align 1
  %dst.next = getelementptr inbounds i8, i8* %dst, i64 2
  %remaining.next = add nsw i32 %remaining, -1
  %finished = icmp eq i32 %remaining.next, 0
  br i1 %finished, label %exit, label %loop

exit:
  ret void
}

; However, sign extensions can only be ignored when merging if they are
; applied to all of the memory address computations. Here the "+ 1" is done in
; i8 before the sign extension, so the second address is not known to be
; adjacent to the first.
;
; Each negative pattern below is self-contained. Referring to a variable
; captured by an earlier negative check does not work: that variable is only
; defined when the earlier pattern matches, which already fails the test.
; CHECK-LABEL: loadStoreBaseIndexOffsetSextNoSex:
; CHECK-NOT: movw (%{{.*}},%{{.*}}), %{{[a-z]+}}
; CHECK-NOT: movw %{{[a-z]+}}, (%{{.*}})
define void @loadStoreBaseIndexOffsetSextNoSex(i8* %a, i8* %b, i8* %c, i32 %n) {
entry:
  br label %loop

loop:
  %remaining = phi i32 [ %n, %entry ], [ %remaining.next, %loop ]
  %dst = phi i8* [ %b, %entry ], [ %dst.next, %loop ]
  %idx.ptr = phi i8* [ %a, %entry ], [ %idx.ptr.next, %loop ]
  %idx.ptr.next = getelementptr inbounds i8, i8* %idx.ptr, i64 1
  %idx.i8 = load i8, i8* %idx.ptr, align 1
  %idx = sext i8 %idx.i8 to i64
  %src0 = getelementptr inbounds i8, i8* %c, i64 %idx
  %byte0 = load i8, i8* %src0, align 1
  ; The increment happens in i8 and is sign-extended afterwards.
  %idx.i8.plus1 = add i8 %idx.i8, 1
  %idx.plus1 = sext i8 %idx.i8.plus1 to i64
  %src1 = getelementptr inbounds i8, i8* %c, i64 %idx.plus1
  %byte1 = load i8, i8* %src1, align 1
  store i8 %byte0, i8* %dst, align 1
  %dst1 = getelementptr inbounds i8, i8* %dst, i64 1
  store i8 %byte1, i8* %dst1, align 1
  %dst.next = getelementptr inbounds i8, i8* %dst, i64 2
  %remaining.next = add nsw i32 %remaining, -1
  %finished = icmp eq i32 %remaining.next, 0
  br i1 %finished, label %exit, label %loop

exit:
  ret void
}

; PR21711 ( http://llvm.org/bugs/show_bug.cgi?id=21711 )
; All eight elements of %v are extracted and stored to eight adjacent floats
; starting at %ptr; a single vmovups followed directly by vzeroupper and the
; return is expected.
define void @merge_vec_element_store(<8 x float> %v, float* %ptr) {
  %elt0 = extractelement <8 x float> %v, i32 0
  %elt1 = extractelement <8 x float> %v, i32 1
  %elt2 = extractelement <8 x float> %v, i32 2
  %elt3 = extractelement <8 x float> %v, i32 3
  %elt4 = extractelement <8 x float> %v, i32 4
  %elt5 = extractelement <8 x float> %v, i32 5
  %elt6 = extractelement <8 x float> %v, i32 6
  %elt7 = extractelement <8 x float> %v, i32 7
  %slot1 = getelementptr inbounds float, float* %ptr, i64 1
  %slot2 = getelementptr inbounds float, float* %ptr, i64 2
  %slot3 = getelementptr inbounds float, float* %ptr, i64 3
  %slot4 = getelementptr inbounds float, float* %ptr, i64 4
  %slot5 = getelementptr inbounds float, float* %ptr, i64 5
  %slot6 = getelementptr inbounds float, float* %ptr, i64 6
  %slot7 = getelementptr inbounds float, float* %ptr, i64 7
  store float %elt0, float* %ptr, align 4
  store float %elt1, float* %slot1, align 4
  store float %elt2, float* %slot2, align 4
  store float %elt3, float* %slot3, align 4
  store float %elt4, float* %slot4, align 4
  store float %elt5, float* %slot5, align 4
  store float %elt6, float* %slot6, align 4
  store float %elt7, float* %slot7, align 4
  ret void

; CHECK-LABEL: merge_vec_element_store
; CHECK: vmovups
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
}

; PR21711 - Merge vector stores into wider vector stores.
; These should be merged into 32-byte stores.
; The low and high 128-bit halves of %v1 and %v2 are stored to four adjacent
; <4 x float> slots starting at element 3 of %ptr (byte offset 48); each pair
; of halves is expected to go out as one 256-bit store.
; CHECK-LABEL: merge_vec_extract_stores
; CHECK: vmovups %ymm0, 48(%rdi)
; CHECK-NEXT: vmovups %ymm1, 80(%rdi)
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
define void @merge_vec_extract_stores(<8 x float> %v1, <8 x float> %v2, <4 x float>* %ptr) {
  %slot0 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 3
  %slot1 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 4
  %slot2 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 5
  %slot3 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 6
  %v1.lo = shufflevector <8 x float> %v1, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %v1.hi = shufflevector <8 x float> %v1, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %v2.lo = shufflevector <8 x float> %v2, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %v2.hi = shufflevector <8 x float> %v2, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  store <4 x float> %v1.lo, <4 x float>* %slot0, align 16
  store <4 x float> %v1.hi, <4 x float>* %slot1, align 16
  store <4 x float> %v2.lo, <4 x float>* %slot2, align 16
  store <4 x float> %v2.hi, <4 x float>* %slot3, align 16
  ret void
}

; Merging vector stores when sourced from vector loads is not currently handled.
; Two adjacent 128-bit loads feed two adjacent 128-bit stores. Four separate
; vmovaps (two loads, two stores) are expected, i.e. nothing is widened.
; CHECK-LABEL: merge_vec_stores_from_loads
; CHECK: vmovaps
; CHECK-NEXT: vmovaps
; CHECK-NEXT: vmovaps
; CHECK-NEXT: vmovaps
; CHECK-NEXT: retq
define void @merge_vec_stores_from_loads(<4 x float>* %v, <4 x float>* %ptr) {
  %src0 = getelementptr inbounds <4 x float>, <4 x float>* %v, i64 0
  %src1 = getelementptr inbounds <4 x float>, <4 x float>* %v, i64 1
  %lo = load <4 x float>, <4 x float>* %src0
  %hi = load <4 x float>, <4 x float>* %src1
  %dst0 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 0
  %dst1 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 1
  store <4 x float> %lo, <4 x float>* %dst0, align 16
  store <4 x float> %hi, <4 x float>* %dst1, align 16
  ret void
}

; Merging vector stores when sourced from a constant vector is not currently handled.
; One vxorps to build the zero vector, then two separate 128-bit stores.
; CHECK-LABEL: merge_vec_stores_of_constants
; CHECK: vxorps
; CHECK-NEXT: vmovaps
; CHECK-NEXT: vmovaps
; CHECK-NEXT: retq
define void @merge_vec_stores_of_constants(<4 x i32>* %ptr) {
  %slot0 = getelementptr inbounds <4 x i32>, <4 x i32>* %ptr, i64 3
  %slot1 = getelementptr inbounds <4 x i32>, <4 x i32>* %ptr, i64 4
  store <4 x i32> zeroinitializer, <4 x i32>* %slot0, align 16
  store <4 x i32> zeroinitializer, <4 x i32>* %slot1, align 16
  ret void
}

; This is a minimized test based on real code that was failing.
; We could merge stores (and loads) like this...
; Element 0 of the array is copied to element 4 with a scalar load, and
; element 1 is copied to element 5 by way of a <2 x i64> load plus an extract
; of lane 0. Four plain 64-bit moves are expected, with nothing merged.
; CHECK-LABEL: merge_vec_element_and_scalar_load
; CHECK: movq (%rdi), %rax
; CHECK-NEXT: movq %rax, 32(%rdi)
; CHECK-NEXT: movq 8(%rdi), %rax
; CHECK-NEXT: movq %rax, 40(%rdi)
; CHECK-NEXT: retq
define void @merge_vec_element_and_scalar_load([6 x i64]* %array) {
  %slot0 = getelementptr inbounds [6 x i64], [6 x i64]* %array, i64 0, i64 0
  %slot1 = getelementptr inbounds [6 x i64], [6 x i64]* %array, i64 0, i64 1
  %slot4 = getelementptr inbounds [6 x i64], [6 x i64]* %array, i64 0, i64 4
  %slot5 = getelementptr inbounds [6 x i64], [6 x i64]* %array, i64 0, i64 5

  ; Scalar copy: array[0] -> array[4].
  %scalar = load i64, i64* %slot0, align 8
  store i64 %scalar, i64* %slot4, align 8

  ; Vector load of array[1..2]; only lane 0 is stored, to array[5].
  %pair.ptr = bitcast i64* %slot1 to <2 x i64>*
  %pair = load <2 x i64>, <2 x i64>* %pair.ptr, align 8
  %lane0 = extractelement <2 x i64> %pair, i32 0
  store i64 %lane0, i64* %slot5, align 8
  ret void
}