; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp,+fp64 -verify-machineinstrs %s -o - | FileCheck %s

; i32

define void @vld2_v2i32(<4 x i32> *%src, <2 x i32> *%dst) {
; CHECK-LABEL: vld2_v2i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vrev64.32 q1, q0
; CHECK-NEXT:    vmov r2, s2
; CHECK-NEXT:    vmov r0, s6
; CHECK-NEXT:    vmov r3, s0
; CHECK-NEXT:    add r0, r2
; CHECK-NEXT:    vmov r2, s4
; CHECK-NEXT:    add r2, r3
; CHECK-NEXT:    strd r2, r0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %l1 = load <4 x i32>, <4 x i32>* %src, align 4
  %s1 = shufflevector <4 x i32> %l1, <4 x i32> undef, <2 x i32> <i32 0, i32 2>
  %s2 = shufflevector <4 x i32> %l1, <4 x i32> undef, <2 x i32> <i32 1, i32 3>
  %a = add <2 x i32> %s1, %s2
  store <2 x i32> %a, <2 x i32> *%dst
  ret void
}

define void @vld2_v4i32(<8 x i32> *%src, <4 x i32> *%dst) {
; CHECK-LABEL: vld2_v4i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vld20.32 {q0, q1}, [r0]
; CHECK-NEXT:    vld21.32 {q0, q1}, [r0]
; CHECK-NEXT:    vadd.i32 q0, q0, q1
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %l1 = load <8 x i32>, <8 x i32>* %src, align 4
  %s1 = shufflevector <8 x i32> %l1, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %s2 = shufflevector <8 x i32> %l1, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %a = add <4 x i32> %s1, %s2
  store <4 x i32> %a, <4 x i32> *%dst
  ret void
}

define void @vld2_v8i32(<16 x i32> *%src, <8 x i32> *%dst) {
; CHECK-LABEL: vld2_v8i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vld20.32 {q0, q1}, [r0]
; CHECK-NEXT:    vld21.32 {q0, q1}, [r0]!
; CHECK-NEXT:    vld20.32 {q2, q3}, [r0]
; CHECK-NEXT:    vadd.i32 q0, q0, q1
; CHECK-NEXT:    vld21.32 {q2, q3}, [r0]
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    vadd.i32 q1, q2, q3
; CHECK-NEXT:    vstrw.32 q1, [r1, #16]
; CHECK-NEXT:    bx lr
entry:
  %l1 = load <16 x i32>, <16 x i32>* %src, align 4
  %s1 = shufflevector <16 x i32> %l1, <16 x i32> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %s2 = shufflevector <16 x i32> %l1, <16 x i32> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %a = add <8 x i32> %s1, %s2
  store <8 x i32> %a, <8 x i32> *%dst
  ret void
}

define void @vld2_v16i32(<32 x i32> *%src, <16 x i32> *%dst) {
; CHECK-LABEL: vld2_v16i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT:    vld20.32 {q0, q1}, [r0]
; CHECK-NEXT:    add.w r2, r0, #96
; CHECK-NEXT:    add.w r3, r0, #64
; CHECK-NEXT:    vld21.32 {q0, q1}, [r0]!
; CHECK-NEXT:    vadd.i32 q0, q0, q1
; CHECK-NEXT:    vld20.32 {q1, q2}, [r3]
; CHECK-NEXT:    vld20.32 {q3, q4}, [r2]
; CHECK-NEXT:    vld20.32 {q5, q6}, [r0]
; CHECK-NEXT:    vld21.32 {q5, q6}, [r0]
; CHECK-NEXT:    vld21.32 {q1, q2}, [r3]
; CHECK-NEXT:    vld21.32 {q3, q4}, [r2]
; CHECK-NEXT:    @ kill: def $q1 killed $q1 killed $q1_q2
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    vadd.i32 q5, q5, q6
; CHECK-NEXT:    vadd.i32 q1, q1, q2
; CHECK-NEXT:    vadd.i32 q3, q3, q4
; CHECK-NEXT:    vstrw.32 q1, [r1, #32]
; CHECK-NEXT:    vstrw.32 q3, [r1, #48]
; CHECK-NEXT:    vstrw.32 q5, [r1, #16]
; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT:    bx lr
entry:
  %l1 = load <32 x i32>, <32 x i32>* %src, align 4
  %s1 = shufflevector <32 x i32> %l1, <32 x i32> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
  %s2 = shufflevector <32 x i32> %l1, <32 x i32> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
  %a = add <16 x i32> %s1, %s2
  store <16 x i32> %a, <16 x i32> *%dst
  ret void
}

; i16

define void @vld2_v2i16(<4 x i16> *%src, <2 x i16> *%dst) {
; CHECK-LABEL: vld2_v2i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u32 q0, [r0]
; CHECK-NEXT:    vrev64.32 q1, q0
; CHECK-NEXT:    vmov r2, s2
; CHECK-NEXT:    vmov r0, s6
; CHECK-NEXT:    add r0, r2
; CHECK-NEXT:    strh r0, [r1, #2]
; CHECK-NEXT:    vmov r0, s4
; CHECK-NEXT:    vmov r2, s0
; CHECK-NEXT:    add r0, r2
; CHECK-NEXT:    strh r0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %l1 = load <4 x i16>, <4 x i16>* %src, align 4
  %s1 = shufflevector <4 x i16> %l1, <4 x i16> undef, <2 x i32> <i32 0, i32 2>
  %s2 = shufflevector <4 x i16> %l1, <4 x i16> undef, <2 x i32> <i32 1, i32 3>
  %a = add <2 x i16> %s1, %s2
  store <2 x i16> %a, <2 x i16> *%dst
  ret void
}

define void @vld2_v4i16(<8 x i16> *%src, <4 x i16> *%dst) {
; CHECK-LABEL: vld2_v4i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vrev32.16 q1, q0
; CHECK-NEXT:    vadd.i32 q0, q0, q1
; CHECK-NEXT:    vstrh.32 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %l1 = load <8 x i16>, <8 x i16>* %src, align 4
  %s1 = shufflevector <8 x i16> %l1, <8 x i16> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %s2 = shufflevector <8 x i16> %l1, <8 x i16> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %a = add <4 x i16> %s1, %s2
  store <4 x i16> %a, <4 x i16> *%dst
  ret void
}

define void @vld2_v8i16(<16 x i16> *%src, <8 x i16> *%dst) {
; CHECK-LABEL: vld2_v8i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vld20.16 {q0, q1}, [r0]
; CHECK-NEXT:    vld21.16 {q0, q1}, [r0]
; CHECK-NEXT:    vadd.i16 q0, q0, q1
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %l1 = load <16 x i16>, <16 x i16>* %src, align 4
  %s1 = shufflevector <16 x i16> %l1, <16 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %s2 = shufflevector <16 x i16> %l1, <16 x i16> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %a = add <8 x i16> %s1, %s2
  store <8 x i16> %a, <8 x i16> *%dst
  ret void
}

define void @vld2_v16i16(<32 x i16> *%src, <16 x i16> *%dst) {
; CHECK-LABEL: vld2_v16i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vld20.16 {q0, q1}, [r0]
; CHECK-NEXT:    vld21.16 {q0, q1}, [r0]!
; CHECK-NEXT:    vld20.16 {q2, q3}, [r0]
; CHECK-NEXT:    vadd.i16 q0, q0, q1
; CHECK-NEXT:    vld21.16 {q2, q3}, [r0]
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    vadd.i16 q1, q2, q3
; CHECK-NEXT:    vstrw.32 q1, [r1, #16]
; CHECK-NEXT:    bx lr
entry:
  %l1 = load <32 x i16>, <32 x i16>* %src, align 4
  %s1 = shufflevector <32 x i16> %l1, <32 x i16> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
  %s2 = shufflevector <32 x i16> %l1, <32 x i16> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
  %a = add <16 x i16> %s1, %s2
  store <16 x i16> %a, <16 x i16> *%dst
  ret void
}

; i8

define void @vld2_v2i8(<4 x i8> *%src, <2 x i8> *%dst) {
; CHECK-LABEL: vld2_v2i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.u32 q0, [r0]
; CHECK-NEXT:    vrev64.32 q1, q0
; CHECK-NEXT:    vmov r2, s2
; CHECK-NEXT:    vmov r0, s6
; CHECK-NEXT:    add r0, r2
; CHECK-NEXT:    strb r0, [r1, #1]
; CHECK-NEXT:    vmov r0, s4
; CHECK-NEXT:    vmov r2, s0
; CHECK-NEXT:    add r0, r2
; CHECK-NEXT:    strb r0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %l1 = load <4 x i8>, <4 x i8>* %src, align 4
  %s1 = shufflevector <4 x i8> %l1, <4 x i8> undef, <2 x i32> <i32 0, i32 2>
  %s2 = shufflevector <4 x i8> %l1, <4 x i8> undef, <2 x i32> <i32 1, i32 3>
  %a = add <2 x i8> %s1, %s2
  store <2 x i8> %a, <2 x i8> *%dst
  ret void
}

define void @vld2_v4i8(<8 x i8> *%src, <4 x i8> *%dst) {
; CHECK-LABEL: vld2_v4i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.u16 q0, [r0]
; CHECK-NEXT:    vrev32.16 q1, q0
; CHECK-NEXT:    vadd.i32 q0, q0, q1
; CHECK-NEXT:    vstrb.32 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %l1 = load <8 x i8>, <8 x i8>* %src, align 4
  %s1 = shufflevector <8 x i8> %l1, <8 x i8> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %s2 = shufflevector <8 x i8> %l1, <8 x i8> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %a = add <4 x i8> %s1, %s2
  store <4 x i8> %a, <4 x i8> *%dst
  ret void
}

define void @vld2_v8i8(<16 x i8> *%src, <8 x i8> *%dst) {
; CHECK-LABEL: vld2_v8i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vrev16.8 q1, q0
; CHECK-NEXT:    vadd.i16 q0, q0, q1
; CHECK-NEXT:    vstrb.16 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %l1 = load <16 x i8>, <16 x i8>* %src, align 4
  %s1 = shufflevector <16 x i8> %l1, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %s2 = shufflevector <16 x i8> %l1, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %a = add <8 x i8> %s1, %s2
  store <8 x i8> %a, <8 x i8> *%dst
  ret void
}

define void @vld2_v16i8(<32 x i8> *%src, <16 x i8> *%dst) {
; CHECK-LABEL: vld2_v16i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vld20.8 {q0, q1}, [r0]
; CHECK-NEXT:    vld21.8 {q0, q1}, [r0]
; CHECK-NEXT:    vadd.i8 q0, q0, q1
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %l1 = load <32 x i8>, <32 x i8>* %src, align 4
  %s1 = shufflevector <32 x i8> %l1, <32 x i8> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
  %s2 = shufflevector <32 x i8> %l1, <32 x i8> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
  %a = add <16 x i8> %s1, %s2
  store <16 x i8> %a, <16 x i8> *%dst
  ret void
}

; i64

define void @vld2_v2i64(<4 x i64> *%src, <2 x i64> *%dst) {
; CHECK-LABEL: vld2_v2i64:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, lr}
; CHECK-NEXT:    push {r4, lr}
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vldrw.u32 q2, [r0, #16]
; CHECK-NEXT:    vmov.f64 d2, d1
; CHECK-NEXT:    vmov.f32 s5, s3
; CHECK-NEXT:    vmov.f32 s6, s10
; CHECK-NEXT:    vmov.f32 s2, s8
; CHECK-NEXT:    vmov.f32 s3, s9
; CHECK-NEXT:    vmov.f32 s7, s11
; CHECK-NEXT:    vmov r3, s6
; CHECK-NEXT:    vmov r0, s2
; CHECK-NEXT:    vmov r4, s0
; CHECK-NEXT:    vmov r2, s3
; CHECK-NEXT:    vmov r12, s7
; CHECK-NEXT:    adds.w lr, r0, r3
; CHECK-NEXT:    vmov r0, s4
; CHECK-NEXT:    vmov r3, s5
; CHECK-NEXT:    adc.w r12, r12, r2
; CHECK-NEXT:    vmov r2, s1
; CHECK-NEXT:    adds r0, r0, r4
; CHECK-NEXT:    vmov.32 q0[0], r0
; CHECK-NEXT:    adcs r2, r3
; CHECK-NEXT:    vmov.32 q0[1], r2
; CHECK-NEXT:    vmov.32 q0[2], lr
; CHECK-NEXT:    vmov.32 q0[3], r12
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    pop {r4, pc}
entry:
  %l1 = load <4 x i64>, <4 x i64>* %src, align 4
  %s1 = shufflevector <4 x i64> %l1, <4 x i64> undef, <2 x i32> <i32 0, i32 2>
  %s2 = shufflevector <4 x i64> %l1, <4 x i64> undef, <2 x i32> <i32 1, i32 3>
  %a = add <2 x i64> %s1, %s2
  store <2 x i64> %a, <2 x i64> *%dst
  ret void
}

define void @vld2_v4i64(<8 x i64> *%src, <4 x i64> *%dst) {
; CHECK-LABEL: vld2_v4i64:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, lr}
; CHECK-NEXT:    push {r4, lr}
; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
; CHECK-NEXT:    vpush {d8, d9, d10, d11}
; CHECK-NEXT:    vldrw.u32 q3, [r0, #32]
; CHECK-NEXT:    vldrw.u32 q5, [r0, #48]
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vldrw.u32 q2, [r0, #16]
; CHECK-NEXT:    vmov.f64 d8, d7
; CHECK-NEXT:    vmov.f32 s17, s15
; CHECK-NEXT:    vmov.f32 s18, s22
; CHECK-NEXT:    vmov.f32 s14, s20
; CHECK-NEXT:    vmov.f32 s15, s21
; CHECK-NEXT:    vmov.f32 s19, s23
; CHECK-NEXT:    vmov r3, s18
; CHECK-NEXT:    vmov r0, s14
; CHECK-NEXT:    vmov r4, s12
; CHECK-NEXT:    vmov.f64 d2, d1
; CHECK-NEXT:    vmov r12, s19
; CHECK-NEXT:    vmov r2, s15
; CHECK-NEXT:    vmov.f32 s5, s3
; CHECK-NEXT:    vmov.f32 s6, s10
; CHECK-NEXT:    vmov.f32 s2, s8
; CHECK-NEXT:    vmov.f32 s7, s11
; CHECK-NEXT:    vmov.f32 s3, s9
; CHECK-NEXT:    adds.w lr, r0, r3
; CHECK-NEXT:    vmov r0, s16
; CHECK-NEXT:    vmov r3, s17
; CHECK-NEXT:    adc.w r12, r12, r2
; CHECK-NEXT:    vmov r2, s13
; CHECK-NEXT:    adds r0, r0, r4
; CHECK-NEXT:    vmov r4, s2
; CHECK-NEXT:    vmov.32 q3[0], r0
; CHECK-NEXT:    vmov r0, s7
; CHECK-NEXT:    adcs r2, r3
; CHECK-NEXT:    vmov r3, s6
; CHECK-NEXT:    vmov.32 q3[1], r2
; CHECK-NEXT:    vmov r2, s3
; CHECK-NEXT:    vmov.32 q3[2], lr
; CHECK-NEXT:    vmov.32 q3[3], r12
; CHECK-NEXT:    vstrw.32 q3, [r1, #16]
; CHECK-NEXT:    adds.w lr, r4, r3
; CHECK-NEXT:    vmov r3, s0
; CHECK-NEXT:    adc.w r12, r2, r0
; CHECK-NEXT:    vmov r0, s4
; CHECK-NEXT:    vmov r2, s5
; CHECK-NEXT:    vmov r4, s1
; CHECK-NEXT:    adds r0, r0, r3
; CHECK-NEXT:    vmov.32 q0[0], r0
; CHECK-NEXT:    adcs r2, r4
; CHECK-NEXT:    vmov.32 q0[1], r2
; CHECK-NEXT:    vmov.32 q0[2], lr
; CHECK-NEXT:    vmov.32 q0[3], r12
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    vpop {d8, d9, d10, d11}
; CHECK-NEXT:    pop {r4, pc}
entry:
  %l1 = load <8 x i64>, <8 x i64>* %src, align 4
  %s1 = shufflevector <8 x i64> %l1, <8 x i64> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %s2 = shufflevector <8 x i64> %l1, <8 x i64> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %a = add <4 x i64> %s1, %s2
  store <4 x i64> %a, <4 x i64> *%dst
  ret void
}

; f32

define void @vld2_v2f32(<4 x float> *%src, <2 x float> *%dst) {
; CHECK-LABEL: vld2_v2f32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vmov.f32 s4, s1
; CHECK-NEXT:    vmov.f32 s5, s3
; CHECK-NEXT:    vmov.f32 s1, s2
; CHECK-NEXT:    vadd.f32 q0, q0, q1
; CHECK-NEXT:    vstmia r1, {s0, s1}
; CHECK-NEXT:    bx lr
entry:
  %l1 = load <4 x float>, <4 x float>* %src, align 4
  %s1 = shufflevector <4 x float> %l1, <4 x float> undef, <2 x i32> <i32 0, i32 2>
  %s2 = shufflevector <4 x float> %l1, <4 x float> undef, <2 x i32> <i32 1, i32 3>
  %a = fadd <2 x float> %s1, %s2
  store <2 x float> %a, <2 x float> *%dst
  ret void
}

define void @vld2_v4f32(<8 x float> *%src, <4 x float> *%dst) {
; CHECK-LABEL: vld2_v4f32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vld20.32 {q0, q1}, [r0]
; CHECK-NEXT:    vld21.32 {q0, q1}, [r0]
; CHECK-NEXT:    vadd.f32 q0, q0, q1
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %l1 = load <8 x float>, <8 x float>* %src, align 4
  %s1 = shufflevector <8 x float> %l1, <8 x float> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %s2 = shufflevector <8 x float> %l1, <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %a = fadd <4 x float> %s1, %s2
  store <4 x float> %a, <4 x float> *%dst
  ret void
}

define void @vld2_v8f32(<16 x float> *%src, <8 x float> *%dst) {
; CHECK-LABEL: vld2_v8f32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vld20.32 {q0, q1}, [r0]
; CHECK-NEXT:    vld21.32 {q0, q1}, [r0]!
; CHECK-NEXT:    vld20.32 {q2, q3}, [r0]
; CHECK-NEXT:    vadd.f32 q0, q0, q1
; CHECK-NEXT:    vld21.32 {q2, q3}, [r0]
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    vadd.f32 q1, q2, q3
; CHECK-NEXT:    vstrw.32 q1, [r1, #16]
; CHECK-NEXT:    bx lr
entry:
  %l1 = load <16 x float>, <16 x float>* %src, align 4
  %s1 = shufflevector <16 x float> %l1, <16 x float> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %s2 = shufflevector <16 x float> %l1, <16 x float> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %a = fadd <8 x float> %s1, %s2
  store <8 x float> %a, <8 x float> *%dst
  ret void
}

define void @vld2_v16f32(<32 x float> *%src, <16 x float> *%dst) {
; CHECK-LABEL: vld2_v16f32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT:    vld20.32 {q0, q1}, [r0]
; CHECK-NEXT:    add.w r2, r0, #96
; CHECK-NEXT:    add.w r3, r0, #64
; CHECK-NEXT:    vld21.32 {q0, q1}, [r0]!
; CHECK-NEXT:    vadd.f32 q0, q0, q1
; CHECK-NEXT:    vld20.32 {q1, q2}, [r3]
; CHECK-NEXT:    vld20.32 {q3, q4}, [r2]
; CHECK-NEXT:    vld20.32 {q5, q6}, [r0]
; CHECK-NEXT:    vld21.32 {q5, q6}, [r0]
; CHECK-NEXT:    vld21.32 {q1, q2}, [r3]
; CHECK-NEXT:    vld21.32 {q3, q4}, [r2]
; CHECK-NEXT:    @ kill: def $q1 killed $q1 killed $q1_q2
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    vadd.f32 q5, q5, q6
; CHECK-NEXT:    vadd.f32 q1, q1, q2
; CHECK-NEXT:    vadd.f32 q3, q3, q4
; CHECK-NEXT:    vstrw.32 q1, [r1, #32]
; CHECK-NEXT:    vstrw.32 q3, [r1, #48]
; CHECK-NEXT:    vstrw.32 q5, [r1, #16]
; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT:    bx lr
entry:
  %l1 = load <32 x float>, <32 x float>* %src, align 4
  %s1 = shufflevector <32 x float> %l1, <32 x float> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
  %s2 = shufflevector <32 x float> %l1, <32 x float> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
  %a = fadd <16 x float> %s1, %s2
  store <16 x float> %a, <16 x float> *%dst
  ret void
}

; f16

define void @vld2_v2f16(<4 x half> *%src, <2 x half> *%dst) {
; CHECK-LABEL: vld2_v2f16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldrd r2, r0, [r0]
; CHECK-NEXT:    vmov.32 q0[0], r2
; CHECK-NEXT:    vmov.32 q0[1], r0
; CHECK-NEXT:    vmovx.f16 s4, s1
; CHECK-NEXT:    vmov r0, s4
; CHECK-NEXT:    vmovx.f16 s4, s0
; CHECK-NEXT:    vmov r2, s4
; CHECK-NEXT:    vmov.16 q1[0], r2
; CHECK-NEXT:    vmov r2, s0
; CHECK-NEXT:    vmov.16 q1[1], r0
; CHECK-NEXT:    vmov r0, s1
; CHECK-NEXT:    vmov.16 q0[0], r2
; CHECK-NEXT:    vmov.16 q0[1], r0
; CHECK-NEXT:    vadd.f16 q0, q0, q1
; CHECK-NEXT:    vmov r0, s0
; CHECK-NEXT:    str r0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %l1 = load <4 x half>, <4 x half>* %src, align 4
  %s1 = shufflevector <4 x half> %l1, <4 x half> undef, <2 x i32> <i32 0, i32 2>
  %s2 = shufflevector <4 x half> %l1, <4 x half> undef, <2 x i32> <i32 1, i32 3>
  %a = fadd <2 x half> %s1, %s2
  store <2 x half> %a, <2 x half> *%dst
  ret void
}

define void @vld2_v4f16(<8 x half> *%src, <4 x half> *%dst) {
; CHECK-LABEL: vld2_v4f16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vmov r2, s0
; CHECK-NEXT:    vmovx.f16 s8, s0
; CHECK-NEXT:    vmov r0, s1
; CHECK-NEXT:    vmov.16 q1[0], r2
; CHECK-NEXT:    vmov.16 q1[1], r0
; CHECK-NEXT:    vmov r0, s2
; CHECK-NEXT:    vmov.16 q1[2], r0
; CHECK-NEXT:    vmov r0, s8
; CHECK-NEXT:    vmovx.f16 s8, s1
; CHECK-NEXT:    vmovx.f16 s12, s2
; CHECK-NEXT:    vmov r2, s8
; CHECK-NEXT:    vmov.16 q2[0], r0
; CHECK-NEXT:    vmov r0, s12
; CHECK-NEXT:    vmov.16 q2[1], r2
; CHECK-NEXT:    vmovx.f16 s12, s3
; CHECK-NEXT:    vmov.16 q2[2], r0
; CHECK-NEXT:    vmov r0, s12
; CHECK-NEXT:    vmov.16 q2[3], r0
; CHECK-NEXT:    vmov r0, s3
; CHECK-NEXT:    vmov.16 q1[3], r0
; CHECK-NEXT:    vadd.f16 q0, q1, q2
; CHECK-NEXT:    vmov r2, s1
; CHECK-NEXT:    vmov r0, s0
; CHECK-NEXT:    strd r0, r2, [r1]
; CHECK-NEXT:    bx lr
entry:
  %l1 = load <8 x half>, <8 x half>* %src, align 4
  %s1 = shufflevector <8 x half> %l1, <8 x half> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %s2 = shufflevector <8 x half> %l1, <8 x half> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %a = fadd <4 x half> %s1, %s2
  store <4 x half> %a, <4 x half> *%dst
  ret void
}

define void @vld2_v8f16(<16 x half> *%src, <8 x half> *%dst) {
; CHECK-LABEL: vld2_v8f16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vld20.16 {q0, q1}, [r0]
; CHECK-NEXT:    vld21.16 {q0, q1}, [r0]
; CHECK-NEXT:    vadd.f16 q0, q0, q1
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %l1 = load <16 x half>, <16 x half>* %src, align 4
  %s1 = shufflevector <16 x half> %l1, <16 x half> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %s2 = shufflevector <16 x half> %l1, <16 x half> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %a = fadd <8 x half> %s1, %s2
  store <8 x half> %a, <8 x half> *%dst
  ret void
}

define void @vld2_v16f16(<32 x half> *%src, <16 x half> *%dst) {
; CHECK-LABEL: vld2_v16f16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vld20.16 {q0, q1}, [r0]
; CHECK-NEXT:    vld21.16 {q0, q1}, [r0]!
; CHECK-NEXT:    vld20.16 {q2, q3}, [r0]
; CHECK-NEXT:    vadd.f16 q0, q0, q1
; CHECK-NEXT:    vld21.16 {q2, q3}, [r0]
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    vadd.f16 q2, q2, q3
; CHECK-NEXT:    vstrw.32 q2, [r1, #16]
; CHECK-NEXT:    bx lr
entry:
  %l1 = load <32 x half>, <32 x half>* %src, align 4
  %s1 = shufflevector <32 x half> %l1, <32 x half> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
  %s2 = shufflevector <32 x half> %l1, <32 x half> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
  %a = fadd <16 x half> %s1, %s2
  store <16 x half> %a, <16 x half> *%dst
  ret void
}

; f64

define void @vld2_v2f64(<4 x double> *%src, <2 x double> *%dst) {
; CHECK-LABEL: vld2_v2f64:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
; CHECK-NEXT:    vldrw.u32 q1, [r0]
; CHECK-NEXT:    vadd.f64 d1, d0, d1
; CHECK-NEXT:    vadd.f64 d0, d2, d3
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %l1 = load <4 x double>, <4 x double>* %src, align 4
  %s1 = shufflevector <4 x double> %l1, <4 x double> undef, <2 x i32> <i32 0, i32 2>
  %s2 = shufflevector <4 x double> %l1, <4 x double> undef, <2 x i32> <i32 1, i32 3>
  %a = fadd <2 x double> %s1, %s2
  store <2 x double> %a, <2 x double> *%dst
  ret void
}

define void @vld2_v4f64(<8 x double> *%src, <4 x double> *%dst) {
; CHECK-LABEL: vld2_v4f64:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r0, #48]
; CHECK-NEXT:    vldrw.u32 q1, [r0, #32]
; CHECK-NEXT:    vldrw.u32 q2, [r0]
; CHECK-NEXT:    vadd.f64 d1, d0, d1
; CHECK-NEXT:    vadd.f64 d0, d2, d3
; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
; CHECK-NEXT:    vadd.f64 d3, d2, d3
; CHECK-NEXT:    vstrw.32 q0, [r1, #16]
; CHECK-NEXT:    vadd.f64 d2, d4, d5
; CHECK-NEXT:    vstrw.32 q1, [r1]
; CHECK-NEXT:    bx lr
entry:
  %l1 = load <8 x double>, <8 x double>* %src, align 4
  %s1 = shufflevector <8 x double> %l1, <8 x double> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %s2 = shufflevector <8 x double> %l1, <8 x double> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %a = fadd <4 x double> %s1, %s2
  store <4 x double> %a, <4 x double> *%dst
  ret void
}