; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp,+fp64 -mve-max-interleave-factor=4 -verify-machineinstrs %s -o - | FileCheck %s

; i32

define void @vld4_v2i32(<8 x i32> *%src, <2 x i32> *%dst) {
; CHECK-LABEL: vld4_v2i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
; CHECK-NEXT:    vmov.f32 s8, s3
; CHECK-NEXT:    vmov.f32 s10, s7
; CHECK-NEXT:    vmov r2, s6
; CHECK-NEXT:    vmov.f32 s12, s1
; CHECK-NEXT:    vmov.f32 s14, s5
; CHECK-NEXT:    vmov r3, s4
; CHECK-NEXT:    vmov r0, s10
; CHECK-NEXT:    add r0, r2
; CHECK-NEXT:    vmov r2, s14
; CHECK-NEXT:    add r2, r3
; CHECK-NEXT:    vmov r3, s2
; CHECK-NEXT:    add.w r12, r2, r0
; CHECK-NEXT:    vmov r2, s8
; CHECK-NEXT:    vmov r0, s0
; CHECK-NEXT:    add r2, r3
; CHECK-NEXT:    vmov r3, s12
; CHECK-NEXT:    add r0, r3
; CHECK-NEXT:    add r0, r2
; CHECK-NEXT:    strd r0, r12, [r1]
; CHECK-NEXT:    bx lr
entry:
  %l1 = load <8 x i32>, <8 x i32>* %src, align 4
  %s1 = shufflevector <8 x i32> %l1, <8 x i32> undef, <2 x i32> <i32 0, i32 4>
  %s2 = shufflevector <8 x i32> %l1, <8 x i32> undef, <2 x i32> <i32 1, i32 5>
  %s3 = shufflevector <8 x i32> %l1, <8 x i32> undef, <2 x i32> <i32 2, i32 6>
  %s4 = shufflevector <8 x i32> %l1, <8 x i32> undef, <2 x i32> <i32 3, i32 7>
  %a1 = add <2 x i32> %s1, %s2
  %a2 = add <2 x i32> %s3, %s4
  %a3 = add <2 x i32> %a1, %a2
  store <2 x i32> %a3, <2 x i32> *%dst
  ret void
}

define void @vld4_v4i32(<16 x i32> *%src, <4 x i32> *%dst) {
; CHECK-LABEL: vld4_v4i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .vsave {d8, d9}
; CHECK-NEXT:    vpush {d8, d9}
; CHECK-NEXT:    vld40.32 {q0, q1, q2, q3}, [r0]
; CHECK-NEXT:    vld41.32 {q0, q1, q2, q3}, [r0]
; CHECK-NEXT:    vld42.32 {q0, q1, q2, q3}, [r0]
; CHECK-NEXT:    vld43.32 {q0, q1, q2, q3}, [r0]
; CHECK-NEXT:    @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3
; CHECK-NEXT:    vadd.i32 q4, q2, q3
; CHECK-NEXT:    vadd.i32 q0, q0, q1
; CHECK-NEXT:    vadd.i32 q0, q0, q4
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    vpop {d8, d9}
; CHECK-NEXT:    bx lr
entry:
  %l1 = load <16 x i32>, <16 x i32>* %src, align 4
  %s1 = shufflevector <16 x i32> %l1, <16 x i32> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
  %s2 = shufflevector <16 x i32> %l1, <16 x i32> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
  %s3 = shufflevector <16 x i32> %l1, <16 x i32> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
  %s4 = shufflevector <16 x i32> %l1, <16 x i32> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
  %a1 = add <4 x i32> %s1, %s2
  %a2 = add <4 x i32> %s3, %s4
  %a3 = add <4 x i32> %a1, %a2
  store <4 x i32> %a3, <4 x i32> *%dst
  ret void
}

define void @vld4_v8i32(<32 x i32> *%src, <8 x i32> *%dst) {
; CHECK-LABEL: vld4_v8i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT:    vld40.32 {q0, q1, q2, q3}, [r0]
; CHECK-NEXT:    vld41.32 {q0, q1, q2, q3}, [r0]
; CHECK-NEXT:    vld42.32 {q0, q1, q2, q3}, [r0]
; CHECK-NEXT:    vld43.32 {q0, q1, q2, q3}, [r0]!
; CHECK-NEXT:    @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3
; CHECK-NEXT:    vadd.i32 q6, q2, q3
; CHECK-NEXT:    vadd.i32 q0, q0, q1
; CHECK-NEXT:    vld40.32 {q1, q2, q3, q4}, [r0]
; CHECK-NEXT:    vadd.i32 q0, q0, q6
; CHECK-NEXT:    vld41.32 {q1, q2, q3, q4}, [r0]
; CHECK-NEXT:    vld42.32 {q1, q2, q3, q4}, [r0]
; CHECK-NEXT:    vld43.32 {q1, q2, q3, q4}, [r0]
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    @ kill: def $q1 killed $q1 killed $q1_q2_q3_q4
; CHECK-NEXT:    vadd.i32 q5, q3, q4
; CHECK-NEXT:    vadd.i32 q1, q1, q2
; CHECK-NEXT:    vadd.i32 q1, q1, q5
; CHECK-NEXT:    vstrw.32 q1, [r1, #16]
; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT:    bx lr
entry:
  %l1 = load <32 x i32>, <32 x i32>* %src, align 4
  %s1 = shufflevector <32 x i32> %l1, <32 x i32> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
  %s2 = shufflevector <32 x i32> %l1, <32 x i32> undef, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29>
  %s3 = shufflevector <32 x i32> %l1, <32 x i32> undef, <8 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
  %s4 = shufflevector <32 x i32> %l1, <32 x i32> undef, <8 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>
  %a1 = add <8 x i32> %s1, %s2
  %a2 = add <8 x i32> %s3, %s4
  %a3 = add <8 x i32> %a1, %a2
  store <8 x i32> %a3, <8 x i32> *%dst
  ret void
}

define void @vld4_v16i32(<64 x i32> *%src, <16 x i32> *%dst) {
; CHECK-LABEL: vld4_v16i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5}
; CHECK-NEXT:    push {r4, r5}
; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT:    .pad #136
; CHECK-NEXT:    sub sp, #136
; CHECK-NEXT:    vld40.32 {q0, q1, q2, q3}, [r0]
; CHECK-NEXT:    mov r2, r0
; CHECK-NEXT:    add.w r3, r0, #192
; CHECK-NEXT:    vld41.32 {q0, q1, q2, q3}, [r0]
; CHECK-NEXT:    vld42.32 {q0, q1, q2, q3}, [r0]
; CHECK-NEXT:    adds r0, #128
; CHECK-NEXT:    vld43.32 {q0, q1, q2, q3}, [r2]!
; CHECK-NEXT:    @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3
; CHECK-NEXT:    vadd.i32 q4, q2, q3
; CHECK-NEXT:    vadd.i32 q0, q0, q1
; CHECK-NEXT:    vstrw.32 q4, [sp, #112] @ 16-byte Spill
; CHECK-NEXT:    vstrw.32 q0, [sp, #96] @ 16-byte Spill
; CHECK-NEXT:    vld40.32 {q1, q2, q3, q4}, [r3]
; CHECK-NEXT:    vld41.32 {q1, q2, q3, q4}, [r3]
; CHECK-NEXT:    vld42.32 {q1, q2, q3, q4}, [r3]
; CHECK-NEXT:    vld43.32 {q1, q2, q3, q4}, [r3]
; CHECK-NEXT:    vldrw.u32 q6, [sp, #112] @ 16-byte Reload
; CHECK-NEXT:    vldrw.u32 q5, [sp, #96] @ 16-byte Reload
; CHECK-NEXT:    vstrw.32 q4, [sp, #80] @ 16-byte Spill
; CHECK-NEXT:    vmov q0, q1
; CHECK-NEXT:    vadd.i32 q6, q5, q6
; CHECK-NEXT:    vldrw.u32 q5, [sp, #80] @ 16-byte Reload
; CHECK-NEXT:    vadd.i32 q0, q0, q2
; CHECK-NEXT:    vstrw.32 q6, [sp, #112] @ 16-byte Spill
; CHECK-NEXT:    vadd.i32 q1, q3, q5
; CHECK-NEXT:    vadd.i32 q0, q0, q1
; CHECK-NEXT:    vstrw.32 q0, [sp, #96] @ 16-byte Spill
; CHECK-NEXT:    vld40.32 {q0, q1, q2, q3}, [r2]
; CHECK-NEXT:    vld41.32 {q0, q1, q2, q3}, [r2]
; CHECK-NEXT:    vld42.32 {q0, q1, q2, q3}, [r2]
; CHECK-NEXT:    vld43.32 {q0, q1, q2, q3}, [r2]
; CHECK-NEXT:    vstmia sp, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Spill
; CHECK-NEXT:    vld40.32 {q0, q1, q2, q3}, [r0]
; CHECK-NEXT:    vld41.32 {q0, q1, q2, q3}, [r0]
; CHECK-NEXT:    vld42.32 {q0, q1, q2, q3}, [r0]
; CHECK-NEXT:    vld43.32 {q0, q1, q2, q3}, [r0]
; CHECK-NEXT:    @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3
; CHECK-NEXT:    vstrw.32 q3, [sp, #80] @ 16-byte Spill
; CHECK-NEXT:    vmov q5, q1
; CHECK-NEXT:    vldrw.u32 q1, [sp, #80] @ 16-byte Reload
; CHECK-NEXT:    vadd.i32 q0, q0, q5
; CHECK-NEXT:    vldmia sp, {d6, d7, d8, d9, d10, d11, d12, d13} @ 64-byte Reload
; CHECK-NEXT:    vadd.i32 q1, q2, q1
; CHECK-NEXT:    @ kill: def $q3 killed $q3 killed $q3_q4_q5_q6
; CHECK-NEXT:    vadd.i32 q2, q3, q4
; CHECK-NEXT:    vadd.i32 q0, q0, q1
; CHECK-NEXT:    vadd.i32 q1, q5, q6
; CHECK-NEXT:    vadd.i32 q1, q2, q1
; CHECK-NEXT:    vldrw.u32 q2, [sp, #96] @ 16-byte Reload
; CHECK-NEXT:    vstrw.32 q0, [r1, #32]
; CHECK-NEXT:    vldrw.u32 q0, [sp, #112] @ 16-byte Reload
; CHECK-NEXT:    vstrw.32 q2, [r1, #48]
; CHECK-NEXT:    vstrw.32 q1, [r1, #16]
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    add sp, #136
; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT:    pop {r4, r5}
; CHECK-NEXT:    bx lr
entry:
  %l1 = load <64 x i32>, <64 x i32>* %src, align 4
  %s1 = shufflevector <64 x i32> %l1, <64 x i32> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 32, i32 36, i32 40, i32 44, i32 48, i32 52, i32 56, i32 60>
  %s2 = shufflevector <64 x i32> %l1, <64 x i32> undef, <16 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 61>
  %s3 = shufflevector <64 x i32> %l1, <64 x i32> undef, <16 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30, i32 34, i32 38, i32 42, i32 46, i32 50, i32 54, i32 58, i32 62>
  %s4 = shufflevector <64 x i32> %l1, <64 x i32> undef, <16 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31, i32 35, i32 39, i32 43, i32 47, i32 51, i32 55, i32 59, i32 63>
  %a1 = add <16 x i32> %s1, %s2
  %a2 = add <16 x i32> %s3, %s4
  %a3 = add <16 x i32> %a1, %a2
  store <16 x i32> %a3, <16 x i32> *%dst
  ret void
}

; i16

define void @vld4_v2i16(<8 x i16> *%src, <2 x i16> *%dst) {
; CHECK-LABEL: vld4_v2i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vmov.u16 r0, q0[7]
; CHECK-NEXT:    vmov.u16 r2, q0[6]
; CHECK-NEXT:    add r0, r2
; CHECK-NEXT:    vmov.u16 r2, q0[5]
; CHECK-NEXT:    vmov.u16 r3, q0[4]
; CHECK-NEXT:    add r2, r3
; CHECK-NEXT:    vmov.u16 r3, q0[0]
; CHECK-NEXT:    add r0, r2
; CHECK-NEXT:    strh r0, [r1, #2]
; CHECK-NEXT:    vmov.u16 r0, q0[3]
; CHECK-NEXT:    vmov.u16 r2, q0[2]
; CHECK-NEXT:    add r0, r2
; CHECK-NEXT:    vmov.u16 r2, q0[1]
; CHECK-NEXT:    add r2, r3
; CHECK-NEXT:    add r0, r2
; CHECK-NEXT:    strh r0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %l1 = load <8 x i16>, <8 x i16>* %src, align 4
  %s1 = shufflevector <8 x i16> %l1, <8 x i16> undef, <2 x i32> <i32 0, i32 4>
  %s2 = shufflevector <8 x i16> %l1, <8 x i16> undef, <2 x i32> <i32 1, i32 5>
  %s3 = shufflevector <8 x i16> %l1, <8 x i16> undef, <2 x i32> <i32 2, i32 6>
  %s4 = shufflevector <8 x i16> %l1, <8 x i16> undef, <2 x i32> <i32 3, i32 7>
  %a1 = add <2 x i16> %s1, %s2
  %a2 = add <2 x i16> %s3, %s4
  %a3 = add <2 x i16> %a1, %a2
  store <2 x i16> %a3, <2 x i16> *%dst
  ret void
}

define void @vld4_v4i16(<16 x i16> *%src, <4 x i16> *%dst) {
; CHECK-LABEL: vld4_v4i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .vsave {d8, d9}
; CHECK-NEXT:    vpush {d8, d9}
; CHECK-NEXT:    vldrw.u32 q1, [r0]
; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
; CHECK-NEXT:    vmov.u16 r2, q1[3]
; CHECK-NEXT:    vmov.u16 r0, q0[3]
; CHECK-NEXT:    vmov.32 q2[0], r2
; CHECK-NEXT:    vmov.u16 r2, q1[7]
; CHECK-NEXT:    vmov.32 q2[1], r2
; CHECK-NEXT:    vmov.32 q2[2], r0
; CHECK-NEXT:    vmov.u16 r0, q0[7]
; CHECK-NEXT:    vmov.32 q2[3], r0
; CHECK-NEXT:    vmov.u16 r0, q1[2]
; CHECK-NEXT:    vmov.32 q3[0], r0
; CHECK-NEXT:    vmov.u16 r0, q1[6]
; CHECK-NEXT:    vmov.32 q3[1], r0
; CHECK-NEXT:    vmov.u16 r0, q0[2]
; CHECK-NEXT:    vmov.32 q3[2], r0
; CHECK-NEXT:    vmov.u16 r0, q0[6]
; CHECK-NEXT:    vmov.32 q3[3], r0
; CHECK-NEXT:    vmov.u16 r0, q1[0]
; CHECK-NEXT:    vadd.i32 q2, q3, q2
; CHECK-NEXT:    vmov.32 q3[0], r0
; CHECK-NEXT:    vmov.u16 r0, q1[4]
; CHECK-NEXT:    vmov.32 q3[1], r0
; CHECK-NEXT:    vmov.u16 r0, q0[0]
; CHECK-NEXT:    vmov.32 q3[2], r0
; CHECK-NEXT:    vmov.u16 r0, q1[1]
; CHECK-NEXT:    vmov.32 q4[0], r0
; CHECK-NEXT:    vmov.u16 r0, q1[5]
; CHECK-NEXT:    vmov.32 q4[1], r0
; CHECK-NEXT:    vmov.u16 r0, q0[1]
; CHECK-NEXT:    vmov.32 q4[2], r0
; CHECK-NEXT:    vmov.u16 r0, q0[5]
; CHECK-NEXT:    vmov.32 q4[3], r0
; CHECK-NEXT:    vmov.u16 r0, q0[4]
; CHECK-NEXT:    vmov.32 q3[3], r0
; CHECK-NEXT:    vadd.i32 q0, q3, q4
; CHECK-NEXT:    vadd.i32 q0, q0, q2
; CHECK-NEXT:    vstrh.32 q0, [r1]
; CHECK-NEXT:    vpop {d8, d9}
; CHECK-NEXT:    bx lr
entry:
  %l1 = load <16 x i16>, <16 x i16>* %src, align 4
  %s1 = shufflevector <16 x i16> %l1, <16 x i16> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
  %s2 = shufflevector <16 x i16> %l1, <16 x i16> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
  %s3 = shufflevector <16 x i16> %l1, <16 x i16> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
  %s4 = shufflevector <16 x i16> %l1, <16 x i16> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
  %a1 = add <4 x i16> %s1, %s2
  %a2 = add <4 x i16> %s3, %s4
  %a3 = add <4 x i16> %a1, %a2
  store <4 x i16> %a3, <4 x i16> *%dst
  ret void
}

define void @vld4_v8i16(<32 x i16> *%src, <8 x i16> *%dst) {
; CHECK-LABEL: vld4_v8i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .vsave {d8, d9}
; CHECK-NEXT:    vpush {d8, d9}
; CHECK-NEXT:    vld40.16 {q0, q1, q2, q3}, [r0]
; CHECK-NEXT:    vld41.16 {q0, q1, q2, q3}, [r0]
; CHECK-NEXT:    vld42.16 {q0, q1, q2, q3}, [r0]
; CHECK-NEXT:    vld43.16 {q0, q1, q2, q3}, [r0]
; CHECK-NEXT:    @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3
; CHECK-NEXT:    vadd.i16 q4, q2, q3
; CHECK-NEXT:    vadd.i16 q0, q0, q1
; CHECK-NEXT:    vadd.i16 q0, q0, q4
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    vpop {d8, d9}
; CHECK-NEXT:    bx lr
entry:
  %l1 = load <32 x i16>, <32 x i16>* %src, align 4
  %s1 = shufflevector <32 x i16> %l1, <32 x i16> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
  %s2 = shufflevector <32 x i16> %l1, <32 x i16> undef, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29>
  %s3 = shufflevector <32 x i16> %l1, <32 x i16> undef, <8 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
  %s4 = shufflevector <32 x i16> %l1, <32 x i16> undef, <8 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>
  %a1 = add <8 x i16> %s1, %s2
  %a2 = add <8 x i16> %s3, %s4
  %a3 = add <8 x i16> %a1, %a2
  store <8 x i16> %a3, <8 x i16> *%dst
  ret void
}

define void @vld4_v16i16(<64 x i16> *%src, <16 x i16> *%dst) {
; CHECK-LABEL: vld4_v16i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT:    vld40.16 {q0, q1, q2, q3}, [r0]
; CHECK-NEXT:    vld41.16 {q0, q1, q2, q3}, [r0]
; CHECK-NEXT:    vld42.16 {q0, q1, q2, q3}, [r0]
; CHECK-NEXT:    vld43.16 {q0, q1, q2, q3}, [r0]!
; CHECK-NEXT:    @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3
; CHECK-NEXT:    vadd.i16 q6, q2, q3
; CHECK-NEXT:    vadd.i16 q0, q0, q1
; CHECK-NEXT:    vld40.16 {q1, q2, q3, q4}, [r0]
; CHECK-NEXT:    vadd.i16 q0, q0, q6
; CHECK-NEXT:    vld41.16 {q1, q2, q3, q4}, [r0]
; CHECK-NEXT:    vld42.16 {q1, q2, q3, q4}, [r0]
; CHECK-NEXT:    vld43.16 {q1, q2, q3, q4}, [r0]
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    @ kill: def $q1 killed $q1 killed $q1_q2_q3_q4
; CHECK-NEXT:    vadd.i16 q5, q3, q4
; CHECK-NEXT:    vadd.i16 q1, q1, q2
; CHECK-NEXT:    vadd.i16 q1, q1, q5
; CHECK-NEXT:    vstrw.32 q1, [r1, #16]
; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT:    bx lr
entry:
  %l1 = load <64 x i16>, <64 x i16>* %src, align 4
  %s1 = shufflevector <64 x i16> %l1, <64 x i16> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 32, i32 36, i32 40, i32 44, i32 48, i32 52, i32 56, i32 60>
  %s2 = shufflevector <64 x i16> %l1, <64 x i16> undef, <16 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 61>
  %s3 = shufflevector <64 x i16> %l1, <64 x i16> undef, <16 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30, i32 34, i32 38, i32 42, i32 46, i32 50, i32 54, i32 58, i32 62>
  %s4 = shufflevector <64 x i16> %l1, <64 x i16> undef, <16 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31, i32 35, i32 39, i32 43, i32 47, i32 51, i32 55, i32 59, i32 63>
  %a1 = add <16 x i16> %s1, %s2
  %a2 = add <16 x i16> %s3, %s4
  %a3 = add <16 x i16> %a1, %a2
  store <16 x i16> %a3, <16 x i16> *%dst
  ret void
}

; i8

define void @vld4_v2i8(<8 x i8> *%src, <2 x i8> *%dst) {
; CHECK-LABEL: vld4_v2i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.u16 q0, [r0]
; CHECK-NEXT:    vmov.u16 r0, q0[7]
; CHECK-NEXT:    vmov.u16 r2, q0[6]
; CHECK-NEXT:    add r0, r2
; CHECK-NEXT:    vmov.u16 r2, q0[5]
; CHECK-NEXT:    vmov.u16 r3, q0[4]
; CHECK-NEXT:    add r2, r3
; CHECK-NEXT:    vmov.u16 r3, q0[0]
; CHECK-NEXT:    add r0, r2
; CHECK-NEXT:    strb r0, [r1, #1]
; CHECK-NEXT:    vmov.u16 r0, q0[3]
; CHECK-NEXT:    vmov.u16 r2, q0[2]
; CHECK-NEXT:    add r0, r2
; CHECK-NEXT:    vmov.u16 r2, q0[1]
; CHECK-NEXT:    add r2, r3
; CHECK-NEXT:    add r0, r2
; CHECK-NEXT:    strb r0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %l1 = load <8 x i8>, <8 x i8>* %src, align 4
  %s1 = shufflevector <8 x i8> %l1, <8 x i8> undef, <2 x i32> <i32 0, i32 4>
  %s2 = shufflevector <8 x i8> %l1, <8 x i8> undef, <2 x i32> <i32 1, i32 5>
  %s3 = shufflevector <8 x i8> %l1, <8 x i8> undef, <2 x i32> <i32 2, i32 6>
  %s4 = shufflevector <8 x i8> %l1, <8 x i8> undef, <2 x i32> <i32 3, i32 7>
  %a1 = add <2 x i8> %s1, %s2
  %a2 = add <2 x i8> %s3, %s4
  %a3 = add <2 x i8> %a1, %a2
  store <2 x i8> %a3, <2 x i8> *%dst
  ret void
}

define void @vld4_v4i8(<16 x i8> *%src, <4 x i8> *%dst) {
; CHECK-LABEL: vld4_v4i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vmov.u8 r0, q0[2]
; CHECK-NEXT:    vrev32.8 q2, q0
; CHECK-NEXT:    vmov.32 q1[0], r0
; CHECK-NEXT:    vmov.u8 r0, q0[6]
; CHECK-NEXT:    vmov.32 q1[1], r0
; CHECK-NEXT:    vmov.u8 r0, q0[10]
; CHECK-NEXT:    vmov.32 q1[2], r0
; CHECK-NEXT:    vmov.u8 r0, q0[14]
; CHECK-NEXT:    vmov.32 q1[3], r0
; CHECK-NEXT:    vadd.i32 q1, q1, q2
; CHECK-NEXT:    vrev16.8 q2, q0
; CHECK-NEXT:    vadd.i32 q0, q0, q2
; CHECK-NEXT:    vadd.i32 q0, q0, q1
; CHECK-NEXT:    vstrb.32 q0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %l1 = load <16 x i8>, <16 x i8>* %src, align 4
  %s1 = shufflevector <16 x i8> %l1, <16 x i8> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
  %s2 = shufflevector <16 x i8> %l1, <16 x i8> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
  %s3 = shufflevector <16 x i8> %l1, <16 x i8> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
  %s4 = shufflevector <16 x i8> %l1, <16 x i8> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
  %a1 = add <4 x i8> %s1, %s2
  %a2 = add <4 x i8> %s3, %s4
  %a3 = add <4 x i8> %a1, %a2
  store <4 x i8> %a3, <4 x i8> *%dst
  ret void
}

define void @vld4_v8i8(<32 x i8> *%src, <8 x i8> *%dst) {
; CHECK-LABEL: vld4_v8i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .vsave {d8, d9}
; CHECK-NEXT:    vpush {d8, d9}
; CHECK-NEXT:    vldrw.u32 q1, [r0]
; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
; CHECK-NEXT:    vmov.u8 r2, q1[3]
; CHECK-NEXT:    vmov.u8 r0, q0[3]
; CHECK-NEXT:    vmov.16 q2[0], r2
; CHECK-NEXT:    vmov.u8 r2, q1[7]
; CHECK-NEXT:    vmov.16 q2[1], r2
; CHECK-NEXT:    vmov.u8 r2, q1[11]
; CHECK-NEXT:    vmov.16 q2[2], r2
; CHECK-NEXT:    vmov.u8 r2, q1[15]
; CHECK-NEXT:    vmov.16 q2[3], r2
; CHECK-NEXT:    vmov.16 q2[4], r0
; CHECK-NEXT:    vmov.u8 r0, q0[7]
; CHECK-NEXT:    vmov.16 q2[5], r0
; CHECK-NEXT:    vmov.u8 r0, q0[11]
; CHECK-NEXT:    vmov.16 q2[6], r0
; CHECK-NEXT:    vmov.u8 r0, q0[15]
; CHECK-NEXT:    vmov.16 q2[7], r0
; CHECK-NEXT:    vmov.u8 r0, q1[2]
; CHECK-NEXT:    vmov.16 q3[0], r0
; CHECK-NEXT:    vmov.u8 r0, q1[6]
; CHECK-NEXT:    vmov.16 q3[1], r0
; CHECK-NEXT:    vmov.u8 r0, q1[10]
; CHECK-NEXT:    vmov.16 q3[2], r0
; CHECK-NEXT:    vmov.u8 r0, q1[14]
; CHECK-NEXT:    vmov.16 q3[3], r0
; CHECK-NEXT:    vmov.u8 r0, q0[2]
; CHECK-NEXT:    vmov.16 q3[4], r0
; CHECK-NEXT:    vmov.u8 r0, q0[6]
; CHECK-NEXT:    vmov.16 q3[5], r0
; CHECK-NEXT:    vmov.u8 r0, q0[10]
; CHECK-NEXT:    vmov.16 q3[6], r0
; CHECK-NEXT:    vmov.u8 r0, q0[14]
; CHECK-NEXT:    vmov.16 q3[7], r0
; CHECK-NEXT:    vmov.u8 r0, q1[0]
; CHECK-NEXT:    vadd.i16 q2, q3, q2
; CHECK-NEXT:    vmov.16 q3[0], r0
; CHECK-NEXT:    vmov.u8 r0, q1[4]
; CHECK-NEXT:    vmov.16 q3[1], r0
; CHECK-NEXT:    vmov.u8 r0, q1[8]
; CHECK-NEXT:    vmov.16 q3[2], r0
; CHECK-NEXT:    vmov.u8 r0, q1[12]
; CHECK-NEXT:    vmov.16 q3[3], r0
; CHECK-NEXT:    vmov.u8 r0, q0[0]
; CHECK-NEXT:    vmov.16 q3[4], r0
; CHECK-NEXT:    vmov.u8 r0, q0[4]
; CHECK-NEXT:    vmov.16 q3[5], r0
; CHECK-NEXT:    vmov.u8 r0, q0[8]
; CHECK-NEXT:    vmov.16 q3[6], r0
; CHECK-NEXT:    vmov.u8 r0, q1[1]
; CHECK-NEXT:    vmov.16 q4[0], r0
; CHECK-NEXT:    vmov.u8 r0, q1[5]
; CHECK-NEXT:    vmov.16 q4[1], r0
; CHECK-NEXT:    vmov.u8 r0, q1[9]
; CHECK-NEXT:    vmov.16 q4[2], r0
; CHECK-NEXT:    vmov.u8 r0, q1[13]
; CHECK-NEXT:    vmov.16 q4[3], r0
; CHECK-NEXT:    vmov.u8 r0, q0[1]
; CHECK-NEXT:    vmov.16 q4[4], r0
; CHECK-NEXT:    vmov.u8 r0, q0[5]
; CHECK-NEXT:    vmov.16 q4[5], r0
; CHECK-NEXT:    vmov.u8 r0, q0[9]
; CHECK-NEXT:    vmov.16 q4[6], r0
; CHECK-NEXT:    vmov.u8 r0, q0[13]
; CHECK-NEXT:    vmov.16 q4[7], r0
; CHECK-NEXT:    vmov.u8 r0, q0[12]
; CHECK-NEXT:    vmov.16 q3[7], r0
; CHECK-NEXT:    vadd.i16 q0, q3, q4
; CHECK-NEXT:    vadd.i16 q0, q0, q2
; CHECK-NEXT:    vstrb.16 q0, [r1]
; CHECK-NEXT:    vpop {d8, d9}
; CHECK-NEXT:    bx lr
entry:
  %l1 = load <32 x i8>, <32 x i8>* %src, align 4
  %s1 = shufflevector <32 x i8> %l1, <32 x i8> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
  %s2 = shufflevector <32 x i8> %l1, <32 x i8> undef, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29>
  %s3 = shufflevector <32 x i8> %l1, <32 x i8> undef, <8 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
  %s4 = shufflevector <32 x i8> %l1, <32 x i8> undef, <8 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>
  %a1 = add <8 x i8> %s1, %s2
  %a2 = add <8 x i8> %s3, %s4
  %a3 = add <8 x i8> %a1, %a2
  store <8 x i8> %a3, <8 x i8> *%dst
  ret void
}

define void @vld4_v16i8(<64 x i8> *%src, <16 x i8> *%dst) {
; CHECK-LABEL: vld4_v16i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .vsave {d8, d9}
; CHECK-NEXT:    vpush {d8, d9}
; CHECK-NEXT:    vld40.8 {q0, q1, q2, q3}, [r0]
; CHECK-NEXT:    vld41.8 {q0, q1, q2, q3}, [r0]
; CHECK-NEXT:    vld42.8 {q0, q1, q2, q3}, [r0]
; CHECK-NEXT:    vld43.8 {q0, q1, q2, q3}, [r0]
; CHECK-NEXT:    @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3
; CHECK-NEXT:    vadd.i8 q4, q2, q3
; CHECK-NEXT:    vadd.i8 q0, q0, q1
; CHECK-NEXT:    vadd.i8 q0, q0, q4
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    vpop {d8, d9}
; CHECK-NEXT:    bx lr
entry:
  %l1 = load <64 x i8>, <64 x i8>* %src, align 4
  %s1 = shufflevector <64 x i8> %l1, <64 x i8> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 32, i32 36, i32 40, i32 44, i32 48, i32 52, i32 56, i32 60>
  %s2 = shufflevector <64 x i8> %l1, <64 x i8> undef, <16 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 61>
  %s3 = shufflevector <64 x i8> %l1, <64 x i8> undef, <16 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30, i32 34, i32 38, i32 42, i32 46, i32 50, i32 54, i32 58, i32 62>
  %s4 = shufflevector <64 x i8> %l1, <64 x i8> undef, <16 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31, i32 35, i32 39, i32 43, i32 47, i32 51, i32 55, i32 59, i32 63>
  %a1 = add <16 x i8> %s1, %s2
  %a2 = add <16 x i8> %s3, %s4
  %a3 = add <16 x i8> %a1, %a2
  store <16 x i8> %a3, <16 x i8> *%dst
  ret void
}

; i64

define void @vld4_v2i64(<8 x i64> *%src, <2 x i64> *%dst) {
; CHECK-LABEL: vld4_v2i64:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r6, lr}
; CHECK-NEXT:    push {r4, r5, r6, lr}
; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
; CHECK-NEXT:    vpush {d8, d9, d10, d11}
; CHECK-NEXT:    vldrw.u32 q3, [r0, #16]
; CHECK-NEXT:    vldrw.u32 q5, [r0, #48]
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vldrw.u32 q2, [r0, #32]
; CHECK-NEXT:    vmov.f64 d8, d7
; CHECK-NEXT:    vmov.f32 s17, s15
; CHECK-NEXT:    vmov.f32 s18, s22
; CHECK-NEXT:    vmov.f32 s14, s20
; CHECK-NEXT:    vmov.f32 s19, s23
; CHECK-NEXT:    vmov.f32 s15, s21
; CHECK-NEXT:    vmov r3, s18
; CHECK-NEXT:    vmov r0, s14
; CHECK-NEXT:    vmov.f64 d2, d1
; CHECK-NEXT:    vmov r12, s19
; CHECK-NEXT:    vmov r2, s15
; CHECK-NEXT:    vmov.f32 s5, s3
; CHECK-NEXT:    vmov.f32 s6, s10
; CHECK-NEXT:    vmov.f32 s2, s8
; CHECK-NEXT:    vmov.f32 s3, s9
; CHECK-NEXT:    vmov.f32 s7, s11
; CHECK-NEXT:    vmov r4, s2
; CHECK-NEXT:    vmov r5, s4
; CHECK-NEXT:    vmov r6, s0
; CHECK-NEXT:    adds.w lr, r0, r3
; CHECK-NEXT:    vmov r3, s7
; CHECK-NEXT:    vmov r0, s3
; CHECK-NEXT:    adc.w r12, r12, r2
; CHECK-NEXT:    vmov r2, s6
; CHECK-NEXT:    adds r2, r2, r4
; CHECK-NEXT:    vmov r4, s13
; CHECK-NEXT:    adcs r0, r3
; CHECK-NEXT:    adds.w lr, lr, r2
; CHECK-NEXT:    adc.w r12, r12, r0
; CHECK-NEXT:    vmov r0, s16
; CHECK-NEXT:    vmov r2, s12
; CHECK-NEXT:    vmov r3, s17
; CHECK-NEXT:    adds r0, r0, r2
; CHECK-NEXT:    adc.w r2, r4, r3
; CHECK-NEXT:    vmov r3, s5
; CHECK-NEXT:    vmov r4, s1
; CHECK-NEXT:    adds r5, r5, r6
; CHECK-NEXT:    adcs r3, r4
; CHECK-NEXT:    adds r0, r0, r5
; CHECK-NEXT:    adcs r2, r3
; CHECK-NEXT:    vmov.32 q0[0], r0
; CHECK-NEXT:    vmov.32 q0[1], r2
; CHECK-NEXT:    vmov.32 q0[2], lr
; CHECK-NEXT:    vmov.32 q0[3], r12
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    vpop {d8, d9, d10, d11}
; CHECK-NEXT:    pop {r4, r5, r6, pc}
entry:
  %l1 = load <8 x i64>, <8 x i64>* %src, align 4
  %s1 = shufflevector <8 x i64> %l1, <8 x i64> undef, <2 x i32> <i32 0, i32 4>
  %s2 = shufflevector <8 x i64> %l1, <8 x i64> undef, <2 x i32> <i32 1, i32 5>
  %s3 = shufflevector <8 x i64> %l1, <8 x i64> undef, <2 x i32> <i32 2, i32 6>
  %s4 = shufflevector <8 x i64> %l1, <8 x i64> undef, <2 x i32> <i32 3, i32 7>
  %a1 = add <2 x i64> %s1, %s2
  %a2 = add <2 x i64> %s3, %s4
  %a3 = add <2 x i64> %a1, %a2
  store <2 x i64> %a3, <2 x i64> *%dst
  ret void
}

define void @vld4_v4i64(<16 x i64> *%src, <4 x i64> *%dst) {
; CHECK-LABEL: vld4_v4i64:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, lr}
; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, lr}
; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT:    .pad #72
; CHECK-NEXT:    sub sp, #72
; CHECK-NEXT:    vldrw.u32 q1, [r0, #64]
; CHECK-NEXT:    vldrw.u32 q0, [r0, #96]
; CHECK-NEXT:    vldrw.u32 q5, [r0, #80]
; CHECK-NEXT:    vldrw.u32 q7, [r0, #16]
; CHECK-NEXT:    vmov.f64 d8, d3
; CHECK-NEXT:    vstrw.32 q0, [sp, #32] @ 16-byte Spill
; CHECK-NEXT:    vstrw.32 q1, [sp, #48] @ 16-byte Spill
; CHECK-NEXT:    vmov.f32 s17, s7
; CHECK-NEXT:    vldrw.u32 q1, [r0]
; CHECK-NEXT:    vmov.f32 s18, s2
; CHECK-NEXT:    vmov.f32 s19, s3
; CHECK-NEXT:    vldrw.u32 q0, [r0, #112]
; CHECK-NEXT:    vmov.f64 d12, d11
; CHECK-NEXT:    vstrw.32 q0, [sp, #16] @ 16-byte Spill
; CHECK-NEXT:    vmov.f32 s25, s23
; CHECK-NEXT:    vmov.f32 s26, s2
; CHECK-NEXT:    vmov.f64 d6, d3
; CHECK-NEXT:    vmov.f32 s27, s3
; CHECK-NEXT:    vldrw.u32 q0, [r0, #32]
; CHECK-NEXT:    vmov.f32 s13, s7
; CHECK-NEXT:    vmov.f32 s14, s2
; CHECK-NEXT:    vstrw.32 q0, [sp] @ 16-byte Spill
; CHECK-NEXT:    vmov.f64 d4, d15
; CHECK-NEXT:    vmov.f32 s15, s3
; CHECK-NEXT:    vldrw.u32 q0, [r0, #48]
; CHECK-NEXT:    vmov.f32 s9, s31
; CHECK-NEXT:    vmov.f32 s10, s2
; CHECK-NEXT:    vmov.f32 s30, s0
; CHECK-NEXT:    vmov.f32 s11, s3
; CHECK-NEXT:    vmov.f32 s31, s1
; CHECK-NEXT:    vldrw.u32 q0, [sp] @ 16-byte Reload
; CHECK-NEXT:    vmov r3, s10
; CHECK-NEXT:    vmov r0, s30
; CHECK-NEXT:    vmov.f32 s6, s0
; CHECK-NEXT:    vmov.f32 s7, s1
; CHECK-NEXT:    vldrw.u32 q0, [sp, #16] @ 16-byte Reload
; CHECK-NEXT:    vmov r4, s6
; CHECK-NEXT:    vstrw.32 q1, [sp] @ 16-byte Spill
; CHECK-NEXT:    vmov r12, s11
; CHECK-NEXT:    vmov r2, s31
; CHECK-NEXT:    vmov.f32 s22, s0
; CHECK-NEXT:    vmov.f32 s23, s1
; CHECK-NEXT:    vldrw.u32 q0, [sp, #48] @ 16-byte Reload
; CHECK-NEXT:    vmov r5, s18
; CHECK-NEXT:    vmov r7, s16
; CHECK-NEXT:    adds.w lr, r0, r3
; CHECK-NEXT:    vmov r3, s14
; CHECK-NEXT:    vmov r0, s7
; CHECK-NEXT:    vldrw.u32 q1, [sp, #32] @ 16-byte Reload
; CHECK-NEXT:    vmov.f32 s2, s4
; CHECK-NEXT:    vmov.f32 s3, s5
; CHECK-NEXT:    vldrw.u32 q1, [sp] @ 16-byte Reload
; CHECK-NEXT:    adc.w r12, r12, r2
; CHECK-NEXT:    vmov r2, s15
; CHECK-NEXT:    vmov r6, s2
; CHECK-NEXT:    adds r3, r3, r4
; CHECK-NEXT:    vmov r4, s23
; CHECK-NEXT:    adcs r0, r2
; CHECK-NEXT:    adds.w lr, lr, r3
; CHECK-NEXT:    adc.w r12, r12, r0
; CHECK-NEXT:    vmov r0, s26
; CHECK-NEXT:    vmov r2, s22
; CHECK-NEXT:    vmov r3, s27
; CHECK-NEXT:    adds r0, r0, r2
; CHECK-NEXT:    adc.w r2, r4, r3
; CHECK-NEXT:    vmov r3, s19
; CHECK-NEXT:    vmov r4, s3
; CHECK-NEXT:    adds r5, r5, r6
; CHECK-NEXT:    vmov r6, s20
; CHECK-NEXT:    adcs r3, r4
; CHECK-NEXT:    adds r0, r0, r5
; CHECK-NEXT:    vmov r5, s24
; CHECK-NEXT:    adc.w r8, r3, r2
; CHECK-NEXT:    vmov r2, s25
; CHECK-NEXT:    vmov r4, s21
; CHECK-NEXT:    vmov r3, s0
; CHECK-NEXT:    adds r5, r5, r6
; CHECK-NEXT:    vmov r6, s1
; CHECK-NEXT:    adcs r2, r4
; CHECK-NEXT:    vmov r4, s17
; CHECK-NEXT:    adds r3, r3, r7
; CHECK-NEXT:    vmov r7, s28
; CHECK-NEXT:    adcs r4, r6
; CHECK-NEXT:    adds r3, r3, r5
; CHECK-NEXT:    vmov r6, s8
; CHECK-NEXT:    adcs r2, r4
; CHECK-NEXT:    vmov r4, s9
; CHECK-NEXT:    vmov.32 q0[0], r3
; CHECK-NEXT:    vmov r5, s29
; CHECK-NEXT:    vmov.32 q0[1], r2
; CHECK-NEXT:    vmov.32 q0[2], r0
; CHECK-NEXT:    vmov r0, s12
; CHECK-NEXT:    vmov r3, s13
; CHECK-NEXT:    vmov.32 q0[3], r8
; CHECK-NEXT:    vmov r2, s5
; CHECK-NEXT:    vstrw.32 q0, [r1, #16]
; CHECK-NEXT:    adds r6, r6, r7
; CHECK-NEXT:    adcs r4, r5
; CHECK-NEXT:    vmov r5, s4
; CHECK-NEXT:    adds r0, r0, r5
; CHECK-NEXT:    adcs r2, r3
; CHECK-NEXT:    adds r0, r0, r6
; CHECK-NEXT:    adcs r2, r4
; CHECK-NEXT:    vmov.32 q0[0], r0
; CHECK-NEXT:    vmov.32 q0[1], r2
; CHECK-NEXT:    vmov.32 q0[2], lr
; CHECK-NEXT:    vmov.32 q0[3], r12
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    add sp, #72
; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, pc}
entry:
  %l1 = load <16 x i64>, <16 x i64>* %src, align 4
  %s1 = shufflevector <16 x i64> %l1, <16 x i64> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
  %s2 = shufflevector <16 x i64> %l1, <16 x i64> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
  %s3 = shufflevector <16 x i64> %l1, <16 x i64> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
  %s4 = shufflevector <16 x i64> %l1, <16 x i64> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
  %a1 = add <4 x i64> %s1, %s2
  %a2 = add <4 x i64> %s3, %s4
  %a3 = add <4 x i64> %a1, %a2
  store <4 x i64> %a3, <4 x i64> *%dst
  ret void
}

; f32

define void @vld4_v2f32(<8 x float> *%src, <2 x float> *%dst) {
; CHECK-LABEL: vld4_v2f32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q1, [r0]
; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
; CHECK-NEXT:    vmov.f32 s8, s7
; CHECK-NEXT:    vmov.f64 d6, d3
; CHECK-NEXT:    vmov.f32 s9, s3
; CHECK-NEXT:    vmov.f32 s13, s2
; CHECK-NEXT:    vadd.f32 q2, q3, q2
; CHECK-NEXT:    vmov.f32 s12, s5
; CHECK-NEXT:    vmov.f32 s13, s1
; CHECK-NEXT:    vmov.f32 s5, s0
; CHECK-NEXT:    vadd.f32 q0, q1, q3
; CHECK-NEXT:    vadd.f32 q0, q0, q2
; CHECK-NEXT:    vstmia r1, {s0, s1}
; CHECK-NEXT:    bx lr
entry:
  %l1 = load <8 x float>, <8 x float>* %src, align 4
  %s1 = shufflevector <8 x float> %l1, <8 x float> undef, <2 x i32> <i32 0, i32 4>
  %s2 = shufflevector <8 x float> %l1, <8 x float> undef, <2 x i32> <i32 1, i32 5>
  %s3 = shufflevector <8 x float> %l1, <8 x float> undef, <2 x i32> <i32 2, i32 6>
  %s4 = shufflevector <8 x float> %l1, <8 x float> undef, <2 x i32> <i32 3, i32 7>
  %a1 = fadd <2 x float> %s1, %s2
  %a2 = fadd <2 x float> %s3, %s4
  %a3 = fadd <2 x float> %a1, %a2
  store <2 x float> %a3, <2 x float> *%dst
  ret void
}

define void @vld4_v4f32(<16 x float> *%src, <4 x float> *%dst) {
; CHECK-LABEL: vld4_v4f32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .vsave {d8, d9}
; CHECK-NEXT:    vpush {d8, d9}
; CHECK-NEXT:    vld40.32 {q0, q1, q2, q3}, [r0]
; CHECK-NEXT:    vld41.32 {q0, q1, q2, q3}, [r0]
; CHECK-NEXT:    vld42.32 {q0, q1, q2, q3}, [r0]
; CHECK-NEXT:    vld43.32 {q0, q1, q2, q3}, [r0]
; CHECK-NEXT:    @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3
; CHECK-NEXT:    vadd.f32 q4, q2, q3
; CHECK-NEXT:    vadd.f32 q0, q0, q1
; CHECK-NEXT:    vadd.f32 q0, q0, q4
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    vpop {d8, d9}
; CHECK-NEXT:    bx lr
entry:
  %l1 = load <16 x float>, <16 x float>* %src, align 4
  %s1 = shufflevector <16 x float> %l1, <16 x float> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
  %s2 = shufflevector <16 x float> %l1, <16 x float> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
  %s3 = shufflevector <16 x float> %l1, <16 x float> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
  %s4 = shufflevector <16 x float> %l1, <16 x float> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
  %a1 = fadd <4 x float> %s1, %s2
  %a2 = fadd <4 x float> %s3, %s4
  %a3 = fadd <4 x float> %a1, %a2
  store <4 x float> %a3, <4 x float> *%dst
  ret void
}

define void @vld4_v8f32(<32 x float> *%src, <8 x float> *%dst) {
; CHECK-LABEL: vld4_v8f32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT:    vld40.32 {q0, q1, q2, q3}, [r0]
; CHECK-NEXT:    vld41.32 {q0, q1, q2, q3}, [r0]
; CHECK-NEXT:    vld42.32 {q0, q1, q2, q3}, [r0]
; CHECK-NEXT:    vld43.32 {q0, q1, q2, q3}, [r0]!
; CHECK-NEXT:    @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3
; CHECK-NEXT:    vadd.f32 q6, q2, q3
; CHECK-NEXT:    vadd.f32 q0, q0, q1
; CHECK-NEXT:    vld40.32 {q1, q2, q3, q4}, [r0]
; CHECK-NEXT:    vadd.f32 q0, q0, q6
; CHECK-NEXT:    vld41.32 {q1, q2, q3, q4}, [r0]
; CHECK-NEXT:    vld42.32 {q1, q2, q3, q4}, [r0]
; CHECK-NEXT:    vld43.32 {q1, q2, q3, q4}, [r0]
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    @ kill: def $q1 killed $q1 killed $q1_q2_q3_q4
; CHECK-NEXT:    vadd.f32 q5, q3, q4
; CHECK-NEXT:    vadd.f32 q1, q1, q2
; CHECK-NEXT:    vadd.f32 q1, q1, q5
; CHECK-NEXT:    vstrw.32 q1, [r1, #16]
; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT:    bx lr
entry:
  %l1 = load <32 x float>, <32 x float>* %src, align 4
  %s1 = shufflevector <32 x float> %l1, <32 x float> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
  %s2 = shufflevector <32 x float> %l1, <32 x float> undef, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29>
  %s3 = shufflevector <32 x float> %l1, <32 x float> undef, <8 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
  %s4 = shufflevector <32 x float> %l1, <32 x float> undef, <8 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>
  %a1 = fadd <8 x float> %s1, %s2
  %a2 = fadd <8 x float> %s3, %s4
  %a3 = fadd <8 x float> %a1, %a2
  store <8
x float> *%dst 841 ret void 842} 843 844define void @vld4_v16f32(<64 x float> *%src, <16 x float> *%dst) { 845; CHECK-LABEL: vld4_v16f32: 846; CHECK: @ %bb.0: @ %entry 847; CHECK-NEXT: .save {r4, r5} 848; CHECK-NEXT: push {r4, r5} 849; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} 850; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} 851; CHECK-NEXT: .pad #136 852; CHECK-NEXT: sub sp, #136 853; CHECK-NEXT: vld40.32 {q0, q1, q2, q3}, [r0] 854; CHECK-NEXT: mov r2, r0 855; CHECK-NEXT: add.w r3, r0, #192 856; CHECK-NEXT: vld41.32 {q0, q1, q2, q3}, [r0] 857; CHECK-NEXT: vld42.32 {q0, q1, q2, q3}, [r0] 858; CHECK-NEXT: adds r0, #128 859; CHECK-NEXT: vld43.32 {q0, q1, q2, q3}, [r2]! 860; CHECK-NEXT: @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3 861; CHECK-NEXT: vadd.f32 q4, q2, q3 862; CHECK-NEXT: vadd.f32 q0, q0, q1 863; CHECK-NEXT: vstrw.32 q4, [sp, #112] @ 16-byte Spill 864; CHECK-NEXT: vstrw.32 q0, [sp, #96] @ 16-byte Spill 865; CHECK-NEXT: vld40.32 {q1, q2, q3, q4}, [r3] 866; CHECK-NEXT: vld41.32 {q1, q2, q3, q4}, [r3] 867; CHECK-NEXT: vld42.32 {q1, q2, q3, q4}, [r3] 868; CHECK-NEXT: vld43.32 {q1, q2, q3, q4}, [r3] 869; CHECK-NEXT: vldrw.u32 q6, [sp, #112] @ 16-byte Reload 870; CHECK-NEXT: vldrw.u32 q5, [sp, #96] @ 16-byte Reload 871; CHECK-NEXT: vstrw.32 q4, [sp, #80] @ 16-byte Spill 872; CHECK-NEXT: vmov q0, q1 873; CHECK-NEXT: vadd.f32 q6, q5, q6 874; CHECK-NEXT: vldrw.u32 q5, [sp, #80] @ 16-byte Reload 875; CHECK-NEXT: vadd.f32 q0, q0, q2 876; CHECK-NEXT: vstrw.32 q6, [sp, #112] @ 16-byte Spill 877; CHECK-NEXT: vadd.f32 q1, q3, q5 878; CHECK-NEXT: vadd.f32 q0, q0, q1 879; CHECK-NEXT: vstrw.32 q0, [sp, #96] @ 16-byte Spill 880; CHECK-NEXT: vld40.32 {q0, q1, q2, q3}, [r2] 881; CHECK-NEXT: vld41.32 {q0, q1, q2, q3}, [r2] 882; CHECK-NEXT: vld42.32 {q0, q1, q2, q3}, [r2] 883; CHECK-NEXT: vld43.32 {q0, q1, q2, q3}, [r2] 884; CHECK-NEXT: vstmia sp, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Spill 885; CHECK-NEXT: vld40.32 {q0, q1, q2, q3}, [r0] 886; CHECK-NEXT: 
vld41.32 {q0, q1, q2, q3}, [r0] 887; CHECK-NEXT: vld42.32 {q0, q1, q2, q3}, [r0] 888; CHECK-NEXT: vld43.32 {q0, q1, q2, q3}, [r0] 889; CHECK-NEXT: @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3 890; CHECK-NEXT: vstrw.32 q3, [sp, #80] @ 16-byte Spill 891; CHECK-NEXT: vmov q5, q1 892; CHECK-NEXT: vldrw.u32 q1, [sp, #80] @ 16-byte Reload 893; CHECK-NEXT: vadd.f32 q0, q0, q5 894; CHECK-NEXT: vldmia sp, {d6, d7, d8, d9, d10, d11, d12, d13} @ 64-byte Reload 895; CHECK-NEXT: vadd.f32 q1, q2, q1 896; CHECK-NEXT: @ kill: def $q3 killed $q3 killed $q3_q4_q5_q6 897; CHECK-NEXT: vadd.f32 q2, q3, q4 898; CHECK-NEXT: vadd.f32 q0, q0, q1 899; CHECK-NEXT: vadd.f32 q1, q5, q6 900; CHECK-NEXT: vadd.f32 q1, q2, q1 901; CHECK-NEXT: vldrw.u32 q2, [sp, #96] @ 16-byte Reload 902; CHECK-NEXT: vstrw.32 q0, [r1, #32] 903; CHECK-NEXT: vldrw.u32 q0, [sp, #112] @ 16-byte Reload 904; CHECK-NEXT: vstrw.32 q2, [r1, #48] 905; CHECK-NEXT: vstrw.32 q1, [r1, #16] 906; CHECK-NEXT: vstrw.32 q0, [r1] 907; CHECK-NEXT: add sp, #136 908; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} 909; CHECK-NEXT: pop {r4, r5} 910; CHECK-NEXT: bx lr 911entry: 912 %l1 = load <64 x float>, <64 x float>* %src, align 4 913 %s1 = shufflevector <64 x float> %l1, <64 x float> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 32, i32 36, i32 40, i32 44, i32 48, i32 52, i32 56, i32 60> 914 %s2 = shufflevector <64 x float> %l1, <64 x float> undef, <16 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 61> 915 %s3 = shufflevector <64 x float> %l1, <64 x float> undef, <16 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30, i32 34, i32 38, i32 42, i32 46, i32 50, i32 54, i32 58, i32 62> 916 %s4 = shufflevector <64 x float> %l1, <64 x float> undef, <16 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31, i32 35, i32 39, i32 43, i32 47, i32 51, i32 55, i32 59, i32 63> 917 %a1 = fadd <16 x 
float> %s1, %s2 918 %a2 = fadd <16 x float> %s3, %s4 919 %a3 = fadd <16 x float> %a1, %a2 920 store <16 x float> %a3, <16 x float> *%dst 921 ret void 922} 923 924; f16 925 926define void @vld4_v2f16(<8 x half> *%src, <2 x half> *%dst) { 927; CHECK-LABEL: vld4_v2f16: 928; CHECK: @ %bb.0: @ %entry 929; CHECK-NEXT: vldrw.u32 q0, [r0] 930; CHECK-NEXT: vmovx.f16 s4, s3 931; CHECK-NEXT: vmov r0, s4 932; CHECK-NEXT: vmovx.f16 s4, s1 933; CHECK-NEXT: vmov r2, s4 934; CHECK-NEXT: vmov.16 q1[0], r2 935; CHECK-NEXT: vmov r2, s1 936; CHECK-NEXT: vmov.16 q1[1], r0 937; CHECK-NEXT: vmov r0, s3 938; CHECK-NEXT: vmov.16 q2[0], r2 939; CHECK-NEXT: vmov.16 q2[1], r0 940; CHECK-NEXT: vadd.f16 q1, q2, q1 941; CHECK-NEXT: vmovx.f16 s8, s0 942; CHECK-NEXT: vmov r0, s8 943; CHECK-NEXT: vmovx.f16 s8, s2 944; CHECK-NEXT: vmov r2, s8 945; CHECK-NEXT: vmov.16 q2[0], r0 946; CHECK-NEXT: vmov r0, s0 947; CHECK-NEXT: vmov.16 q2[1], r2 948; CHECK-NEXT: vmov r2, s2 949; CHECK-NEXT: vmov.16 q0[0], r0 950; CHECK-NEXT: vmov.16 q0[1], r2 951; CHECK-NEXT: vadd.f16 q0, q0, q2 952; CHECK-NEXT: vadd.f16 q0, q0, q1 953; CHECK-NEXT: vmov r0, s0 954; CHECK-NEXT: str r0, [r1] 955; CHECK-NEXT: bx lr 956entry: 957 %l1 = load <8 x half>, <8 x half>* %src, align 4 958 %s1 = shufflevector <8 x half> %l1, <8 x half> undef, <2 x i32> <i32 0, i32 4> 959 %s2 = shufflevector <8 x half> %l1, <8 x half> undef, <2 x i32> <i32 1, i32 5> 960 %s3 = shufflevector <8 x half> %l1, <8 x half> undef, <2 x i32> <i32 2, i32 6> 961 %s4 = shufflevector <8 x half> %l1, <8 x half> undef, <2 x i32> <i32 3, i32 7> 962 %a1 = fadd <2 x half> %s1, %s2 963 %a2 = fadd <2 x half> %s3, %s4 964 %a3 = fadd <2 x half> %a1, %a2 965 store <2 x half> %a3, <2 x half> *%dst 966 ret void 967} 968 969define void @vld4_v4f16(<16 x half> *%src, <4 x half> *%dst) { 970; CHECK-LABEL: vld4_v4f16: 971; CHECK: @ %bb.0: @ %entry 972; CHECK-NEXT: .vsave {d8} 973; CHECK-NEXT: vpush {d8} 974; CHECK-NEXT: vldrw.u32 q1, [r0] 975; CHECK-NEXT: vldrw.u32 q0, [r0, #16] 
976; CHECK-NEXT: vmov r2, s5 977; CHECK-NEXT: vmovx.f16 s12, s5 978; CHECK-NEXT: vmov r3, s7 979; CHECK-NEXT: vmov.16 q2[0], r2 980; CHECK-NEXT: vmov.16 q2[1], r3 981; CHECK-NEXT: vmov r0, s1 982; CHECK-NEXT: vmov.16 q2[2], r0 983; CHECK-NEXT: vmov r0, s12 984; CHECK-NEXT: vmovx.f16 s12, s7 985; CHECK-NEXT: vmovx.f16 s16, s1 986; CHECK-NEXT: vmov r2, s12 987; CHECK-NEXT: vmov.16 q3[0], r0 988; CHECK-NEXT: vmov r0, s16 989; CHECK-NEXT: vmov.16 q3[1], r2 990; CHECK-NEXT: vmovx.f16 s16, s3 991; CHECK-NEXT: vmov.16 q3[2], r0 992; CHECK-NEXT: vmov r0, s16 993; CHECK-NEXT: vmovx.f16 s16, s0 994; CHECK-NEXT: vmov.16 q3[3], r0 995; CHECK-NEXT: vmov r0, s3 996; CHECK-NEXT: vmov.16 q2[3], r0 997; CHECK-NEXT: vadd.f16 q2, q2, q3 998; CHECK-NEXT: vmovx.f16 s12, s4 999; CHECK-NEXT: vmov r0, s12 1000; CHECK-NEXT: vmovx.f16 s12, s6 1001; CHECK-NEXT: vmov r2, s12 1002; CHECK-NEXT: vmov.16 q3[0], r0 1003; CHECK-NEXT: vmov.16 q3[1], r2 1004; CHECK-NEXT: vmov r0, s16 1005; CHECK-NEXT: vmovx.f16 s16, s2 1006; CHECK-NEXT: vmov.16 q3[2], r0 1007; CHECK-NEXT: vmov r0, s16 1008; CHECK-NEXT: vmov.16 q3[3], r0 1009; CHECK-NEXT: vmov r0, s4 1010; CHECK-NEXT: vmov r2, s6 1011; CHECK-NEXT: vmov.16 q1[0], r0 1012; CHECK-NEXT: vmov.16 q1[1], r2 1013; CHECK-NEXT: vmov r0, s0 1014; CHECK-NEXT: vmov.16 q1[2], r0 1015; CHECK-NEXT: vmov r0, s2 1016; CHECK-NEXT: vmov.16 q1[3], r0 1017; CHECK-NEXT: vadd.f16 q0, q1, q3 1018; CHECK-NEXT: vadd.f16 q0, q0, q2 1019; CHECK-NEXT: vmov r2, s1 1020; CHECK-NEXT: vmov r0, s0 1021; CHECK-NEXT: strd r0, r2, [r1] 1022; CHECK-NEXT: vpop {d8} 1023; CHECK-NEXT: bx lr 1024entry: 1025 %l1 = load <16 x half>, <16 x half>* %src, align 4 1026 %s1 = shufflevector <16 x half> %l1, <16 x half> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12> 1027 %s2 = shufflevector <16 x half> %l1, <16 x half> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13> 1028 %s3 = shufflevector <16 x half> %l1, <16 x half> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14> 1029 %s4 = shufflevector <16 x half> 
%l1, <16 x half> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15> 1030 %a1 = fadd <4 x half> %s1, %s2 1031 %a2 = fadd <4 x half> %s3, %s4 1032 %a3 = fadd <4 x half> %a1, %a2 1033 store <4 x half> %a3, <4 x half> *%dst 1034 ret void 1035} 1036 1037define void @vld4_v8f16(<32 x half> *%src, <8 x half> *%dst) { 1038; CHECK-LABEL: vld4_v8f16: 1039; CHECK: @ %bb.0: @ %entry 1040; CHECK-NEXT: .vsave {d8, d9} 1041; CHECK-NEXT: vpush {d8, d9} 1042; CHECK-NEXT: vld40.16 {q0, q1, q2, q3}, [r0] 1043; CHECK-NEXT: vld41.16 {q0, q1, q2, q3}, [r0] 1044; CHECK-NEXT: vld42.16 {q0, q1, q2, q3}, [r0] 1045; CHECK-NEXT: vld43.16 {q0, q1, q2, q3}, [r0] 1046; CHECK-NEXT: @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3 1047; CHECK-NEXT: vadd.f16 q4, q2, q3 1048; CHECK-NEXT: vadd.f16 q0, q0, q1 1049; CHECK-NEXT: vadd.f16 q0, q0, q4 1050; CHECK-NEXT: vstrw.32 q0, [r1] 1051; CHECK-NEXT: vpop {d8, d9} 1052; CHECK-NEXT: bx lr 1053entry: 1054 %l1 = load <32 x half>, <32 x half>* %src, align 4 1055 %s1 = shufflevector <32 x half> %l1, <32 x half> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28> 1056 %s2 = shufflevector <32 x half> %l1, <32 x half> undef, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29> 1057 %s3 = shufflevector <32 x half> %l1, <32 x half> undef, <8 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30> 1058 %s4 = shufflevector <32 x half> %l1, <32 x half> undef, <8 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31> 1059 %a1 = fadd <8 x half> %s1, %s2 1060 %a2 = fadd <8 x half> %s3, %s4 1061 %a3 = fadd <8 x half> %a1, %a2 1062 store <8 x half> %a3, <8 x half> *%dst 1063 ret void 1064} 1065 1066define void @vld4_v16f16(<64 x half> *%src, <16 x half> *%dst) { 1067; CHECK-LABEL: vld4_v16f16: 1068; CHECK: @ %bb.0: @ %entry 1069; CHECK-NEXT: .save {r4, r5} 1070; CHECK-NEXT: push {r4, r5} 1071; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} 1072; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, 
d13, d14, d15} 1073; CHECK-NEXT: .pad #88 1074; CHECK-NEXT: sub sp, #88 1075; CHECK-NEXT: vld40.16 {q0, q1, q2, q3}, [r0] 1076; CHECK-NEXT: vld41.16 {q0, q1, q2, q3}, [r0] 1077; CHECK-NEXT: vld42.16 {q0, q1, q2, q3}, [r0] 1078; CHECK-NEXT: vld43.16 {q0, q1, q2, q3}, [r0]! 1079; CHECK-NEXT: vstmia sp, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Spill 1080; CHECK-NEXT: vld40.16 {q4, q5, q6, q7}, [r0] 1081; CHECK-NEXT: vld41.16 {q4, q5, q6, q7}, [r0] 1082; CHECK-NEXT: vld42.16 {q4, q5, q6, q7}, [r0] 1083; CHECK-NEXT: vld43.16 {q4, q5, q6, q7}, [r0] 1084; CHECK-NEXT: @ kill: def $q4 killed $q4 killed $q4_q5_q6_q7 1085; CHECK-NEXT: vadd.f16 q0, q6, q7 1086; CHECK-NEXT: vadd.f16 q4, q4, q5 1087; CHECK-NEXT: vstrw.32 q0, [sp, #64] @ 16-byte Spill 1088; CHECK-NEXT: vldrw.u32 q0, [sp, #64] @ 16-byte Reload 1089; CHECK-NEXT: vadd.f16 q4, q4, q0 1090; CHECK-NEXT: vldmia sp, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Reload 1091; CHECK-NEXT: @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3 1092; CHECK-NEXT: vstrw.32 q4, [r1, #16] 1093; CHECK-NEXT: vadd.f16 q4, q2, q3 1094; CHECK-NEXT: vadd.f16 q0, q0, q1 1095; CHECK-NEXT: vadd.f16 q0, q0, q4 1096; CHECK-NEXT: vstrw.32 q0, [r1] 1097; CHECK-NEXT: add sp, #88 1098; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} 1099; CHECK-NEXT: pop {r4, r5} 1100; CHECK-NEXT: bx lr 1101entry: 1102 %l1 = load <64 x half>, <64 x half>* %src, align 4 1103 %s1 = shufflevector <64 x half> %l1, <64 x half> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 32, i32 36, i32 40, i32 44, i32 48, i32 52, i32 56, i32 60> 1104 %s2 = shufflevector <64 x half> %l1, <64 x half> undef, <16 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 61> 1105 %s3 = shufflevector <64 x half> %l1, <64 x half> undef, <16 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30, i32 34, i32 38, i32 42, i32 46, i32 50, i32 54, i32 58, i32 62> 1106 %s4 
= shufflevector <64 x half> %l1, <64 x half> undef, <16 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31, i32 35, i32 39, i32 43, i32 47, i32 51, i32 55, i32 59, i32 63> 1107 %a1 = fadd <16 x half> %s1, %s2 1108 %a2 = fadd <16 x half> %s3, %s4 1109 %a3 = fadd <16 x half> %a1, %a2 1110 store <16 x half> %a3, <16 x half> *%dst 1111 ret void 1112} 1113 1114; f64 1115 1116define void @vld4_v2f64(<8 x double> *%src, <2 x double> *%dst) { 1117; CHECK-LABEL: vld4_v2f64: 1118; CHECK: @ %bb.0: @ %entry 1119; CHECK-NEXT: vldrw.u32 q0, [r0, #48] 1120; CHECK-NEXT: vldrw.u32 q1, [r0, #32] 1121; CHECK-NEXT: vldrw.u32 q2, [r0] 1122; CHECK-NEXT: vadd.f64 d0, d0, d1 1123; CHECK-NEXT: vadd.f64 d1, d2, d3 1124; CHECK-NEXT: vldrw.u32 q1, [r0, #16] 1125; CHECK-NEXT: vadd.f64 d2, d2, d3 1126; CHECK-NEXT: vadd.f64 d3, d4, d5 1127; CHECK-NEXT: vadd.f64 d1, d1, d0 1128; CHECK-NEXT: vadd.f64 d0, d3, d2 1129; CHECK-NEXT: vstrw.32 q0, [r1] 1130; CHECK-NEXT: bx lr 1131entry: 1132 %l1 = load <8 x double>, <8 x double>* %src, align 4 1133 %s1 = shufflevector <8 x double> %l1, <8 x double> undef, <2 x i32> <i32 0, i32 4> 1134 %s2 = shufflevector <8 x double> %l1, <8 x double> undef, <2 x i32> <i32 1, i32 5> 1135 %s3 = shufflevector <8 x double> %l1, <8 x double> undef, <2 x i32> <i32 2, i32 6> 1136 %s4 = shufflevector <8 x double> %l1, <8 x double> undef, <2 x i32> <i32 3, i32 7> 1137 %a1 = fadd <2 x double> %s1, %s2 1138 %a2 = fadd <2 x double> %s3, %s4 1139 %a3 = fadd <2 x double> %a1, %a2 1140 store <2 x double> %a3, <2 x double> *%dst 1141 ret void 1142} 1143 1144define void @vld4_v4f64(<16 x double> *%src, <4 x double> *%dst) { 1145; CHECK-LABEL: vld4_v4f64: 1146; CHECK: @ %bb.0: @ %entry 1147; CHECK-NEXT: .vsave {d8, d9} 1148; CHECK-NEXT: vpush {d8, d9} 1149; CHECK-NEXT: vldrw.u32 q0, [r0, #112] 1150; CHECK-NEXT: vldrw.u32 q1, [r0, #96] 1151; CHECK-NEXT: vldrw.u32 q2, [r0, #64] 1152; CHECK-NEXT: vldrw.u32 q3, [r0, #32] 1153; CHECK-NEXT: vadd.f64 d0, d0, d1 1154; 
CHECK-NEXT: vldrw.u32 q4, [r0] 1155; CHECK-NEXT: vadd.f64 d1, d2, d3 1156; CHECK-NEXT: vldrw.u32 q1, [r0, #80] 1157; CHECK-NEXT: vadd.f64 d2, d2, d3 1158; CHECK-NEXT: vadd.f64 d3, d4, d5 1159; CHECK-NEXT: vldrw.u32 q2, [r0, #48] 1160; CHECK-NEXT: vadd.f64 d4, d4, d5 1161; CHECK-NEXT: vadd.f64 d5, d6, d7 1162; CHECK-NEXT: vldrw.u32 q3, [r0, #16] 1163; CHECK-NEXT: vadd.f64 d6, d6, d7 1164; CHECK-NEXT: vadd.f64 d7, d8, d9 1165; CHECK-NEXT: vadd.f64 d1, d1, d0 1166; CHECK-NEXT: vadd.f64 d0, d3, d2 1167; CHECK-NEXT: vadd.f64 d3, d5, d4 1168; CHECK-NEXT: vstrw.32 q0, [r1, #16] 1169; CHECK-NEXT: vadd.f64 d2, d7, d6 1170; CHECK-NEXT: vstrw.32 q1, [r1] 1171; CHECK-NEXT: vpop {d8, d9} 1172; CHECK-NEXT: bx lr 1173entry: 1174 %l1 = load <16 x double>, <16 x double>* %src, align 4 1175 %s1 = shufflevector <16 x double> %l1, <16 x double> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12> 1176 %s2 = shufflevector <16 x double> %l1, <16 x double> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13> 1177 %s3 = shufflevector <16 x double> %l1, <16 x double> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14> 1178 %s4 = shufflevector <16 x double> %l1, <16 x double> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15> 1179 %a1 = fadd <4 x double> %s1, %s2 1180 %a2 = fadd <4 x double> %s3, %s4 1181 %a3 = fadd <4 x double> %a1, %a2 1182 store <4 x double> %a3, <4 x double> *%dst 1183 ret void 1184} 1185