; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -enable-arm-maskedldst %s -o - | FileCheck %s

; Tests lowering of @llvm.masked.gather through explicit vectors of pointers
; on an MVE (Armv8.1-M) target. Cases that match the hardware (<4 x i32>-sized
; gathers of 32-bit elements) should select a vldrw.u32 [qN] gather; the rest
; are expected to be scalarised. CHECK lines are autogenerated - regenerate
; with update_llc_test_checks.py rather than editing them by hand.

; i32

; Gather of <2 x i32> through loaded pointers - scalarised (no 64-bit lanes).
define arm_aapcs_vfpcc <2 x i32> @ptr_v2i32(<2 x i32*>* %offptr) {
; CHECK-LABEL: ptr_v2i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldrd r1, r0, [r0]
; CHECK-NEXT:    ldr r0, [r0]
; CHECK-NEXT:    ldr r1, [r1]
; CHECK-NEXT:    vmov.32 q0[0], r1
; CHECK-NEXT:    vmov.32 q0[2], r0
; CHECK-NEXT:    bx lr
entry:
  %offs = load <2 x i32*>, <2 x i32*>* %offptr, align 4
  %gather = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> %offs, i32 4, <2 x i1> <i1 true, i1 true>, <2 x i32> undef)
  ret <2 x i32> %gather
}

; Gather of <4 x i32> - matches the MVE gather instruction directly.
define arm_aapcs_vfpcc <4 x i32> @ptr_v4i32(<4 x i32*>* %offptr) {
; CHECK-LABEL: ptr_v4i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q1, [r0]
; CHECK-NEXT:    vldrw.u32 q0, [q1]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <4 x i32*>, <4 x i32*>* %offptr, align 4
  %gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %offs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
  ret <4 x i32> %gather
}

; Gather of <8 x i32> - wider than one q register, currently scalarised.
define arm_aapcs_vfpcc <8 x i32> @ptr_v8i32(<8 x i32*>* %offptr) {
; CHECK-LABEL: ptr_v8i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r7, lr}
; CHECK-NEXT:    push {r4, r5, r7, lr}
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
; CHECK-NEXT:    vmov r1, s2
; CHECK-NEXT:    vmov r2, s3
; CHECK-NEXT:    vmov r3, s0
; CHECK-NEXT:    vmov r0, s4
; CHECK-NEXT:    vmov r5, s1
; CHECK-NEXT:    vmov r4, s7
; CHECK-NEXT:    ldr.w r12, [r1]
; CHECK-NEXT:    vmov r1, s5
; CHECK-NEXT:    ldr.w lr, [r2]
; CHECK-NEXT:    vmov r2, s6
; CHECK-NEXT:    ldr r3, [r3]
; CHECK-NEXT:    ldr r0, [r0]
; CHECK-NEXT:    ldr r5, [r5]
; CHECK-NEXT:    vmov.32 q0[0], r3
; CHECK-NEXT:    vmov.32 q1[0], r0
; CHECK-NEXT:    ldr r4, [r4]
; CHECK-NEXT:    vmov.32 q0[1], r5
; CHECK-NEXT:    vmov.32 q0[2], r12
; CHECK-NEXT:    vmov.32 q0[3], lr
; CHECK-NEXT:    ldr r1, [r1]
; CHECK-NEXT:    ldr r2, [r2]
; CHECK-NEXT:    vmov.32 q1[1], r1
; CHECK-NEXT:    vmov.32 q1[2], r2
; CHECK-NEXT:    vmov.32 q1[3], r4
; CHECK-NEXT:    pop {r4, r5, r7, pc}
entry:
  %offs = load <8 x i32*>, <8 x i32*>* %offptr, align 4
  %gather = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> %offs, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
  ret <8 x i32> %gather
}

; Gather of <16 x i32> - four q registers' worth, fully scalarised.
define arm_aapcs_vfpcc <16 x i32> @ptr_v16i32(<16 x i32*>* %offptr) {
; CHECK-LABEL: ptr_v16i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r6, lr}
; CHECK-NEXT:    push {r4, r5, r6, lr}
; CHECK-NEXT:    vldrw.u32 q2, [r0, #32]
; CHECK-NEXT:    vldrw.u32 q0, [r0, #48]
; CHECK-NEXT:    vldrw.u32 q1, [r0]
; CHECK-NEXT:    vldrw.u32 q3, [r0, #16]
; CHECK-NEXT:    vmov r1, s10
; CHECK-NEXT:    vmov r5, s4
; CHECK-NEXT:    vmov r2, s3
; CHECK-NEXT:    vmov r0, s12
; CHECK-NEXT:    vmov r6, s7
; CHECK-NEXT:    vmov r4, s11
; CHECK-NEXT:    ldr.w r12, [r1]
; CHECK-NEXT:    vmov r1, s0
; CHECK-NEXT:    ldr r5, [r5]
; CHECK-NEXT:    ldr r2, [r2]
; CHECK-NEXT:    ldr r0, [r0]
; CHECK-NEXT:    ldr r6, [r6]
; CHECK-NEXT:    ldr r4, [r4]
; CHECK-NEXT:    ldr.w lr, [r1]
; CHECK-NEXT:    vmov r1, s1
; CHECK-NEXT:    ldr r3, [r1]
; CHECK-NEXT:    vmov r1, s2
; CHECK-NEXT:    vmov.32 q0[0], r5
; CHECK-NEXT:    vmov r5, s5
; CHECK-NEXT:    ldr r1, [r1]
; CHECK-NEXT:    ldr r5, [r5]
; CHECK-NEXT:    vmov.32 q0[1], r5
; CHECK-NEXT:    vmov r5, s6
; CHECK-NEXT:    vmov.32 q1[0], r0
; CHECK-NEXT:    vmov r0, s13
; CHECK-NEXT:    ldr r5, [r5]
; CHECK-NEXT:    ldr r0, [r0]
; CHECK-NEXT:    vmov.32 q0[2], r5
; CHECK-NEXT:    vmov r5, s8
; CHECK-NEXT:    vmov.32 q1[1], r0
; CHECK-NEXT:    vmov r0, s14
; CHECK-NEXT:    vmov.32 q0[3], r6
; CHECK-NEXT:    ldr r5, [r5]
; CHECK-NEXT:    ldr r0, [r0]
; CHECK-NEXT:    vmov.32 q1[2], r0
; CHECK-NEXT:    vmov r0, s15
; CHECK-NEXT:    vmov.32 q3[0], lr
; CHECK-NEXT:    vmov.32 q3[1], r3
; CHECK-NEXT:    vmov.32 q3[2], r1
; CHECK-NEXT:    vmov.32 q3[3], r2
; CHECK-NEXT:    ldr r0, [r0]
; CHECK-NEXT:    vmov.32 q1[3], r0
; CHECK-NEXT:    vmov r0, s9
; CHECK-NEXT:    vmov.32 q2[0], r5
; CHECK-NEXT:    ldr r0, [r0]
; CHECK-NEXT:    vmov.32 q2[1], r0
; CHECK-NEXT:    vmov.32 q2[2], r12
; CHECK-NEXT:    vmov.32 q2[3], r4
; CHECK-NEXT:    pop {r4, r5, r6, pc}
entry:
  %offs = load <16 x i32*>, <16 x i32*>* %offptr, align 4
  %gather = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> %offs, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
  ret <16 x i32> %gather
}

; f32

; Gather of <2 x float> - scalarised to two vldr loads.
define arm_aapcs_vfpcc <2 x float> @ptr_v2f32(<2 x float*>* %offptr) {
; CHECK-LABEL: ptr_v2f32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldrd r1, r0, [r0]
; CHECK-NEXT:    vldr s1, [r0]
; CHECK-NEXT:    vldr s0, [r1]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <2 x float*>, <2 x float*>* %offptr, align 4
  %gather = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> %offs, i32 4, <2 x i1> <i1 true, i1 true>, <2 x float> undef)
  ret <2 x float> %gather
}

; Gather of <4 x float> - 32-bit elements, so the vldrw.u32 gather applies.
define arm_aapcs_vfpcc <4 x float> @ptr_v4f32(<4 x float*>* %offptr) {
; CHECK-LABEL: ptr_v4f32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q1, [r0]
; CHECK-NEXT:    vldrw.u32 q0, [q1]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <4 x float*>, <4 x float*>* %offptr, align 4
  %gather = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %offs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef)
  ret <4 x float> %gather
}

; Gather of <8 x float> - scalarised into per-lane vldr loads.
define arm_aapcs_vfpcc <8 x float> @ptr_v8f32(<8 x float*>* %offptr) {
; CHECK-LABEL: ptr_v8f32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q1, [r0]
; CHECK-NEXT:    vldrw.u32 q2, [r0, #16]
; CHECK-NEXT:    vmov r1, s7
; CHECK-NEXT:    vmov r0, s11
; CHECK-NEXT:    vldr s3, [r1]
; CHECK-NEXT:    vmov r1, s6
; CHECK-NEXT:    vldr s2, [r1]
; CHECK-NEXT:    vmov r1, s5
; CHECK-NEXT:    vldr s1, [r1]
; CHECK-NEXT:    vmov r1, s4
; CHECK-NEXT:    vldr s7, [r0]
; CHECK-NEXT:    vmov r0, s10
; CHECK-NEXT:    vldr s0, [r1]
; CHECK-NEXT:    vldr s6, [r0]
; CHECK-NEXT:    vmov r0, s9
; CHECK-NEXT:    vldr s5, [r0]
; CHECK-NEXT:    vmov r0, s8
; CHECK-NEXT:    vldr s4, [r0]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <8 x float*>, <8 x float*>* %offptr, align 4
  %gather = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> %offs, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> undef)
  ret <8 x float> %gather
}

; i16

; Gather of <8 x i16> through 8 pointers - scalarised ldrh per lane.
define arm_aapcs_vfpcc <8 x i16> @ptr_i16(<8 x i16*>* %offptr) {
; CHECK-LABEL: ptr_i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r7, lr}
; CHECK-NEXT:    push {r4, r5, r7, lr}
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
; CHECK-NEXT:    vmov r1, s2
; CHECK-NEXT:    vmov r2, s3
; CHECK-NEXT:    vmov r3, s0
; CHECK-NEXT:    vmov r5, s1
; CHECK-NEXT:    vmov r0, s4
; CHECK-NEXT:    vmov r4, s7
; CHECK-NEXT:    ldrh.w r12, [r1]
; CHECK-NEXT:    vmov r1, s5
; CHECK-NEXT:    ldrh.w lr, [r2]
; CHECK-NEXT:    vmov r2, s6
; CHECK-NEXT:    ldrh r3, [r3]
; CHECK-NEXT:    ldrh r5, [r5]
; CHECK-NEXT:    vmov.16 q0[0], r3
; CHECK-NEXT:    ldrh r0, [r0]
; CHECK-NEXT:    vmov.16 q0[1], r5
; CHECK-NEXT:    ldrh r4, [r4]
; CHECK-NEXT:    vmov.16 q0[2], r12
; CHECK-NEXT:    vmov.16 q0[3], lr
; CHECK-NEXT:    vmov.16 q0[4], r0
; CHECK-NEXT:    ldrh r1, [r1]
; CHECK-NEXT:    ldrh r2, [r2]
; CHECK-NEXT:    vmov.16 q0[5], r1
; CHECK-NEXT:    vmov.16 q0[6], r2
; CHECK-NEXT:    vmov.16 q0[7], r4
; CHECK-NEXT:    pop {r4, r5, r7, pc}
entry:
  %offs = load <8 x i16*>, <8 x i16*>* %offptr, align 4
  %gather = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %offs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i16> undef)
  ret <8 x i16> %gather
}

; <2 x i16> gather sign-extended to <2 x i32> - ldrsh plus asr for high lanes.
define arm_aapcs_vfpcc <2 x i32> @ptr_v2i16_sext(<2 x i16*>* %offptr) {
; CHECK-LABEL: ptr_v2i16_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldrd r1, r0, [r0]
; CHECK-NEXT:    ldrsh.w r0, [r0]
; CHECK-NEXT:    ldrsh.w r1, [r1]
; CHECK-NEXT:    asrs r2, r0, #31
; CHECK-NEXT:    vmov.32 q0[0], r1
; CHECK-NEXT:    asrs r1, r1, #31
; CHECK-NEXT:    vmov.32 q0[1], r1
; CHECK-NEXT:    vmov.32 q0[2], r0
; CHECK-NEXT:    vmov.32 q0[3], r2
; CHECK-NEXT:    bx lr
entry:
  %offs = load <2 x i16*>, <2 x i16*>* %offptr, align 4
  %gather = call <2 x i16> @llvm.masked.gather.v2i16.v2p0i16(<2 x i16*> %offs, i32 2, <2 x i1> <i1 true, i1 true>, <2 x i16> undef)
  %ext = sext <2 x i16> %gather to <2 x i32>
  ret <2 x i32> %ext
}

; <2 x i16> gather zero-extended to <2 x i32> - masked with a 0xffff splat.
define arm_aapcs_vfpcc <2 x i32> @ptr_v2i16_zext(<2 x i16*>* %offptr) {
; CHECK-LABEL: ptr_v2i16_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldrd r1, r0, [r0]
; CHECK-NEXT:    vmov.i64 q0, #0xffff
; CHECK-NEXT:    ldrh r0, [r0]
; CHECK-NEXT:    ldrh r1, [r1]
; CHECK-NEXT:    vmov.32 q1[0], r1
; CHECK-NEXT:    vmov.32 q1[2], r0
; CHECK-NEXT:    vand q0, q1, q0
; CHECK-NEXT:    bx lr
entry:
  %offs = load <2 x i16*>, <2 x i16*>* %offptr, align 4
  %gather = call <2 x i16> @llvm.masked.gather.v2i16.v2p0i16(<2 x i16*> %offs, i32 2, <2 x i1> <i1 true, i1 true>, <2 x i16> undef)
  %ext = zext <2 x i16> %gather to <2 x i32>
  ret <2 x i32> %ext
}

; <4 x i16> gather sign-extended to <4 x i32> - scalar ldrh then vmovlb.s16.
define arm_aapcs_vfpcc <4 x i32> @ptr_v4i16_sext(<4 x i16*>* %offptr) {
; CHECK-LABEL: ptr_v4i16_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vmov r2, s0
; CHECK-NEXT:    vmov r0, s1
; CHECK-NEXT:    vmov r3, s2
; CHECK-NEXT:    vmov r1, s3
; CHECK-NEXT:    ldrh r2, [r2]
; CHECK-NEXT:    ldrh r0, [r0]
; CHECK-NEXT:    vmov.32 q0[0], r2
; CHECK-NEXT:    ldrh r3, [r3]
; CHECK-NEXT:    vmov.32 q0[1], r0
; CHECK-NEXT:    ldrh r1, [r1]
; CHECK-NEXT:    vmov.32 q0[2], r3
; CHECK-NEXT:    vmov.32 q0[3], r1
; CHECK-NEXT:    vmovlb.s16 q0, q0
; CHECK-NEXT:    bx lr
entry:
  %offs = load <4 x i16*>, <4 x i16*>* %offptr, align 4
  %gather = call <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*> %offs, i32 2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i16> undef)
  %ext = sext <4 x i16> %gather to <4 x i32>
  ret <4 x i32> %ext
}

; <4 x i16> gather zero-extended to <4 x i32> - scalar ldrh then vmovlb.u16.
define arm_aapcs_vfpcc <4 x i32> @ptr_v4i16_zext(<4 x i16*>* %offptr) {
; CHECK-LABEL: ptr_v4i16_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vmov r2, s0
; CHECK-NEXT:    vmov r0, s1
; CHECK-NEXT:    vmov r3, s2
; CHECK-NEXT:    vmov r1, s3
; CHECK-NEXT:    ldrh r2, [r2]
; CHECK-NEXT:    ldrh r0, [r0]
; CHECK-NEXT:    vmov.32 q0[0], r2
; CHECK-NEXT:    ldrh r3, [r3]
; CHECK-NEXT:    vmov.32 q0[1], r0
; CHECK-NEXT:    ldrh r1, [r1]
; CHECK-NEXT:    vmov.32 q0[2], r3
; CHECK-NEXT:    vmov.32 q0[3], r1
; CHECK-NEXT:    vmovlb.u16 q0, q0
; CHECK-NEXT:    bx lr
entry:
  %offs = load <4 x i16*>, <4 x i16*>* %offptr, align 4
  %gather = call <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*> %offs, i32 2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i16> undef)
  %ext = zext <4 x i16> %gather to <4 x i32>
  ret <4 x i32> %ext
}

; <8 x i16> gather sign-extended to <8 x i32>.
define arm_aapcs_vfpcc <8 x i32> @ptr_v8i16_sext(<8 x i16*>* %offptr) {
; CHECK-LABEL: ptr_v8i16_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r7, lr}
; CHECK-NEXT:    push {r4, r5, r7, lr}
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
; CHECK-NEXT:    vmov r1, s2
; CHECK-NEXT:    vmov r2, s3
; CHECK-NEXT:    vmov r3, s0
; CHECK-NEXT:    vmov r0, s4
; CHECK-NEXT:    vmov r5, s1
; CHECK-NEXT:    vmov r4, s7
; CHECK-NEXT:    ldrh.w r12, [r1]
; CHECK-NEXT:    vmov r1, s5
; CHECK-NEXT:    ldrh.w lr, [r2]
; CHECK-NEXT:    vmov r2, s6
; CHECK-NEXT:    ldrh r3, [r3]
; CHECK-NEXT:    ldrh r0, [r0]
; CHECK-NEXT:    ldrh r5, [r5]
; CHECK-NEXT:    vmov.32 q0[0], r3
; CHECK-NEXT:    vmov.32 q1[0], r0
; CHECK-NEXT:    ldrh r4, [r4]
; CHECK-NEXT:    vmov.32 q0[1], r5
; CHECK-NEXT:    vmov.32 q0[2], r12
; CHECK-NEXT:    vmov.32 q0[3], lr
; CHECK-NEXT:    vmovlb.s16 q0, q0
; CHECK-NEXT:    ldrh r1, [r1]
; CHECK-NEXT:    ldrh r2, [r2]
; CHECK-NEXT:    vmov.32 q1[1], r1
; CHECK-NEXT:    vmov.32 q1[2], r2
; CHECK-NEXT:    vmov.32 q1[3], r4
; CHECK-NEXT:    vmovlb.s16 q1, q1
; CHECK-NEXT:    pop {r4, r5, r7, pc}
entry:
  %offs = load <8 x i16*>, <8 x i16*>* %offptr, align 4
  %gather = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %offs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i16> undef)
  %ext = sext <8 x i16> %gather to <8 x i32>
  ret <8 x i32> %ext
}

; <8 x i16> gather zero-extended to <8 x i32>.
define arm_aapcs_vfpcc <8 x i32> @ptr_v8i16_zext(<8 x i16*>* %offptr) {
; CHECK-LABEL: ptr_v8i16_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r7, lr}
; CHECK-NEXT:    push {r4, r5, r7, lr}
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
; CHECK-NEXT:    vmov r1, s2
; CHECK-NEXT:    vmov r2, s3
; CHECK-NEXT:    vmov r3, s0
; CHECK-NEXT:    vmov r0, s4
; CHECK-NEXT:    vmov r5, s1
; CHECK-NEXT:    vmov r4, s7
; CHECK-NEXT:    ldrh.w r12, [r1]
; CHECK-NEXT:    vmov r1, s5
; CHECK-NEXT:    ldrh.w lr, [r2]
; CHECK-NEXT:    vmov r2, s6
; CHECK-NEXT:    ldrh r3, [r3]
; CHECK-NEXT:    ldrh r0, [r0]
; CHECK-NEXT:    ldrh r5, [r5]
; CHECK-NEXT:    vmov.32 q0[0], r3
; CHECK-NEXT:    vmov.32 q1[0], r0
; CHECK-NEXT:    ldrh r4, [r4]
; CHECK-NEXT:    vmov.32 q0[1], r5
; CHECK-NEXT:    vmov.32 q0[2], r12
; CHECK-NEXT:    vmov.32 q0[3], lr
; CHECK-NEXT:    vmovlb.u16 q0, q0
; CHECK-NEXT:    ldrh r1, [r1]
; CHECK-NEXT:    ldrh r2, [r2]
; CHECK-NEXT:    vmov.32 q1[1], r1
; CHECK-NEXT:    vmov.32 q1[2], r2
; CHECK-NEXT:    vmov.32 q1[3], r4
; CHECK-NEXT:    vmovlb.u16 q1, q1
; CHECK-NEXT:    pop {r4, r5, r7, pc}
entry:
  %offs = load <8 x i16*>, <8 x i16*>* %offptr, align 4
  %gather = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %offs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i16> undef)
  %ext = zext <8 x i16> %gather to <8 x i32>
  ret <8 x i32> %ext
}

; f16

; Gather of <8 x half> - scalarised per-lane vldr.16 loads.
define arm_aapcs_vfpcc <8 x half> @ptr_f16(<8 x half*>* %offptr) {
; CHECK-LABEL: ptr_f16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q1, [r0]
; CHECK-NEXT:    vmov r1, s5
; CHECK-NEXT:    vldr.16 s0, [r1]
; CHECK-NEXT:    vmov r2, s4
; CHECK-NEXT:    vmov r1, s0
; CHECK-NEXT:    vldr.16 s0, [r2]
; CHECK-NEXT:    vmov r2, s0
; CHECK-NEXT:    vmov.16 q0[0], r2
; CHECK-NEXT:    vmov.16 q0[1], r1
; CHECK-NEXT:    vmov r1, s6
; CHECK-NEXT:    vldr.16 s8, [r1]
; CHECK-NEXT:    vmov r1, s8
; CHECK-NEXT:    vmov.16 q0[2], r1
; CHECK-NEXT:    vmov r1, s7
; CHECK-NEXT:    vldr.16 s4, [r1]
; CHECK-NEXT:    vmov r1, s4
; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
; CHECK-NEXT:    vmov.16 q0[3], r1
; CHECK-NEXT:    vmov r0, s4
; CHECK-NEXT:    vldr.16 s8, [r0]
; CHECK-NEXT:    vmov r0, s8
; CHECK-NEXT:    vmov.16 q0[4], r0
; CHECK-NEXT:    vmov r0, s5
; CHECK-NEXT:    vldr.16 s8, [r0]
; CHECK-NEXT:    vmov r0, s8
; CHECK-NEXT:    vmov.16 q0[5], r0
; CHECK-NEXT:    vmov r0, s6
; CHECK-NEXT:    vldr.16 s8, [r0]
; CHECK-NEXT:    vmov r0, s8
; CHECK-NEXT:    vmov.16 q0[6], r0
; CHECK-NEXT:    vmov r0, s7
; CHECK-NEXT:    vldr.16 s4, [r0]
; CHECK-NEXT:    vmov r0, s4
; CHECK-NEXT:    vmov.16 q0[7], r0
; CHECK-NEXT:    bx lr
entry:
  %offs = load <8 x half*>, <8 x half*>* %offptr, align 4
  %gather = call <8 x half> @llvm.masked.gather.v8f16.v8p0f16(<8 x half*> %offs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x half> undef)
  ret <8 x half> %gather
}

; i8

; Gather of <16 x i8> through 16 pointers - scalarised ldrb per lane.
define arm_aapcs_vfpcc <16 x i8> @ptr_i8(<16 x i8*>* %offptr) {
; CHECK-LABEL: ptr_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r6, lr}
; CHECK-NEXT:    push {r4, r5, r6, lr}
; CHECK-NEXT:    vldrw.u32 q1, [r0, #32]
; CHECK-NEXT:    vldrw.u32 q0, [r0, #48]
; CHECK-NEXT:    vldrw.u32 q2, [r0]
; CHECK-NEXT:    vmov r1, s6
; CHECK-NEXT:    vmov r5, s8
; CHECK-NEXT:    vmov r2, s3
; CHECK-NEXT:    vmov r6, s11
; CHECK-NEXT:    vmov r4, s7
; CHECK-NEXT:    ldrb.w r12, [r1]
; CHECK-NEXT:    vmov r1, s0
; CHECK-NEXT:    ldrb r5, [r5]
; CHECK-NEXT:    ldrb r2, [r2]
; CHECK-NEXT:    ldrb r6, [r6]
; CHECK-NEXT:    ldrb r4, [r4]
; CHECK-NEXT:    ldrb.w lr, [r1]
; CHECK-NEXT:    vmov r1, s1
; CHECK-NEXT:    ldrb r3, [r1]
; CHECK-NEXT:    vmov r1, s2
; CHECK-NEXT:    vmov.8 q0[0], r5
; CHECK-NEXT:    vmov r5, s9
; CHECK-NEXT:    ldrb r1, [r1]
; CHECK-NEXT:    ldrb r5, [r5]
; CHECK-NEXT:    vmov.8 q0[1], r5
; CHECK-NEXT:    vmov r5, s10
; CHECK-NEXT:    vldrw.u32 q2, [r0, #16]
; CHECK-NEXT:    vmov r0, s8
; CHECK-NEXT:    ldrb r5, [r5]
; CHECK-NEXT:    vmov.8 q0[2], r5
; CHECK-NEXT:    vmov r5, s4
; CHECK-NEXT:    ldrb r0, [r0]
; CHECK-NEXT:    vmov.8 q0[3], r6
; CHECK-NEXT:    vmov.8 q0[4], r0
; CHECK-NEXT:    vmov r0, s9
; CHECK-NEXT:    ldrb r5, [r5]
; CHECK-NEXT:    ldrb r0, [r0]
; CHECK-NEXT:    vmov.8 q0[5], r0
; CHECK-NEXT:    vmov r0, s10
; CHECK-NEXT:    ldrb r0, [r0]
; CHECK-NEXT:    vmov.8 q0[6], r0
; CHECK-NEXT:    vmov r0, s11
; CHECK-NEXT:    ldrb r0, [r0]
; CHECK-NEXT:    vmov.8 q0[7], r0
; CHECK-NEXT:    vmov r0, s5
; CHECK-NEXT:    vmov.8 q0[8], r5
; CHECK-NEXT:    ldrb r0, [r0]
; CHECK-NEXT:    vmov.8 q0[9], r0
; CHECK-NEXT:    vmov.8 q0[10], r12
; CHECK-NEXT:    vmov.8 q0[11], r4
; CHECK-NEXT:    vmov.8 q0[12], lr
; CHECK-NEXT:    vmov.8 q0[13], r3
; CHECK-NEXT:    vmov.8 q0[14], r1
; CHECK-NEXT:    vmov.8 q0[15], r2
; CHECK-NEXT:    pop {r4, r5, r6, pc}
entry:
  %offs = load <16 x i8*>, <16 x i8*>* %offptr, align 4
  %gather = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %offs, i32 2, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i8> undef)
  ret <16 x i8> %gather
}

; <8 x i8> gather sign-extended to <8 x i16> - ldrb per lane then vmovlb.s8.
define arm_aapcs_vfpcc <8 x i16> @ptr_v8i8_sext16(<8 x i8*>* %offptr) {
; CHECK-LABEL: ptr_v8i8_sext16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r7, lr}
; CHECK-NEXT:    push {r4, r5, r7, lr}
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
; CHECK-NEXT:    vmov r2, s3
; CHECK-NEXT:    vmov r1, s1
; CHECK-NEXT:    vmov r5, s0
; CHECK-NEXT:    vmov r3, s2
; CHECK-NEXT:    vmov r0, s4
; CHECK-NEXT:    vmov r4, s7
; CHECK-NEXT:    ldrb.w r12, [r2]
; CHECK-NEXT:    vmov r2, s5
; CHECK-NEXT:    ldrb.w lr, [r1]
; CHECK-NEXT:    vmov r1, s6
; CHECK-NEXT:    ldrb r5, [r5]
; CHECK-NEXT:    ldrb r3, [r3]
; CHECK-NEXT:    vmov.16 q0[0], r5
; CHECK-NEXT:    ldrb r0, [r0]
; CHECK-NEXT:    vmov.16 q0[1], lr
; CHECK-NEXT:    ldrb r4, [r4]
; CHECK-NEXT:    vmov.16 q0[2], r3
; CHECK-NEXT:    vmov.16 q0[3], r12
; CHECK-NEXT:    vmov.16 q0[4], r0
; CHECK-NEXT:    ldrb r2, [r2]
; CHECK-NEXT:    ldrb r1, [r1]
; CHECK-NEXT:    vmov.16 q0[5], r2
; CHECK-NEXT:    vmov.16 q0[6], r1
; CHECK-NEXT:    vmov.16 q0[7], r4
; CHECK-NEXT:    vmovlb.s8 q0, q0
; CHECK-NEXT:    pop {r4, r5, r7, pc}
entry:
  %offs = load <8 x i8*>, <8 x i8*>* %offptr, align 4
  %gather = call <8 x i8> @llvm.masked.gather.v8i8.v8p0i8(<8 x i8*> %offs, i32 1, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i8> undef)
  %ext = sext <8 x i8> %gather to <8 x i16>
  ret <8 x i16> %ext
}

; <8 x i8> gather zero-extended to <8 x i16> - ldrb per lane then vmovlb.u8.
define arm_aapcs_vfpcc <8 x i16> @ptr_v8i8_zext16(<8 x i8*>* %offptr) {
; CHECK-LABEL: ptr_v8i8_zext16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r7, lr}
; CHECK-NEXT:    push {r4, r5, r7, lr}
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
; CHECK-NEXT:    vmov r2, s3
; CHECK-NEXT:    vmov r1, s1
; CHECK-NEXT:    vmov r5, s0
; CHECK-NEXT:    vmov r3, s2
; CHECK-NEXT:    vmov r0, s4
; CHECK-NEXT:    vmov r4, s7
; CHECK-NEXT:    ldrb.w r12, [r2]
; CHECK-NEXT:    vmov r2, s5
; CHECK-NEXT:    ldrb.w lr, [r1]
; CHECK-NEXT:    vmov r1, s6
; CHECK-NEXT:    ldrb r5, [r5]
; CHECK-NEXT:    ldrb r3, [r3]
; CHECK-NEXT:    vmov.16 q0[0], r5
; CHECK-NEXT:    ldrb r0, [r0]
; CHECK-NEXT:    vmov.16 q0[1], lr
; CHECK-NEXT:    ldrb r4, [r4]
; CHECK-NEXT:    vmov.16 q0[2], r3
; CHECK-NEXT:    vmov.16 q0[3], r12
; CHECK-NEXT:    vmov.16 q0[4], r0
; CHECK-NEXT:    ldrb r2, [r2]
; CHECK-NEXT:    ldrb r1, [r1]
; CHECK-NEXT:    vmov.16 q0[5], r2
; CHECK-NEXT:    vmov.16 q0[6], r1
; CHECK-NEXT:    vmov.16 q0[7], r4
; CHECK-NEXT:    vmovlb.u8 q0, q0
; CHECK-NEXT:    pop {r4, r5, r7, pc}
entry:
  %offs = load <8 x i8*>, <8 x i8*>* %offptr, align 4
  %gather = call <8 x i8> @llvm.masked.gather.v8i8.v8p0i8(<8 x i8*> %offs, i32 1, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i8> undef)
  %ext = zext <8 x i8> %gather to <8 x i16>
  ret <8 x i16> %ext
}

; <4 x i8> gather sign-extended to <4 x i32> - two vmovlb steps (s8 then s16).
define arm_aapcs_vfpcc <4 x i32> @ptr_v4i8_sext32(<4 x i8*>* %offptr) {
; CHECK-LABEL: ptr_v4i8_sext32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vmov r0, s0
; CHECK-NEXT:    vmov r3, s1
; CHECK-NEXT:    vmov r1, s2
; CHECK-NEXT:    vmov r2, s3
; CHECK-NEXT:    ldrb r0, [r0]
; CHECK-NEXT:    ldrb r3, [r3]
; CHECK-NEXT:    vmov.32 q0[0], r0
; CHECK-NEXT:    ldrb r1, [r1]
; CHECK-NEXT:    vmov.32 q0[1], r3
; CHECK-NEXT:    ldrb r2, [r2]
; CHECK-NEXT:    vmov.32 q0[2], r1
; CHECK-NEXT:    vmov.32 q0[3], r2
; CHECK-NEXT:    vmovlb.s8 q0, q0
; CHECK-NEXT:    vmovlb.s16 q0, q0
; CHECK-NEXT:    bx lr
entry:
  %offs = load <4 x i8*>, <4 x i8*>* %offptr, align 4
  %gather = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8(<4 x i8*> %offs, i32 1, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i8> undef)
  %ext = sext <4 x i8> %gather to <4 x i32>
  ret <4 x i32> %ext
}

; <4 x i8> gather zero-extended to <4 x i32> - masked with a 0xff splat.
define arm_aapcs_vfpcc <4 x i32> @ptr_v4i8_zext32(<4 x i8*>* %offptr) {
; CHECK-LABEL: ptr_v4i8_zext32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vmov.i32 q1, #0xff
; CHECK-NEXT:    vmov r2, s0
; CHECK-NEXT:    vmov r1, s1
; CHECK-NEXT:    vmov r3, s2
; CHECK-NEXT:    vmov r0, s3
; CHECK-NEXT:    ldrb r2, [r2]
; CHECK-NEXT:    ldrb r1, [r1]
; CHECK-NEXT:    vmov.32 q0[0], r2
; CHECK-NEXT:    ldrb r3, [r3]
; CHECK-NEXT:    vmov.32 q0[1], r1
; CHECK-NEXT:    ldrb r0, [r0]
; CHECK-NEXT:    vmov.32 q0[2], r3
; CHECK-NEXT:    vmov.32 q0[3], r0
; CHECK-NEXT:    vand q0, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %offs = load <4 x i8*>, <4 x i8*>* %offptr, align 4
  %gather = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8(<4 x i8*> %offs, i32 1, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i8> undef)
  %ext = zext <4 x i8> %gather to <4 x i32>
  ret <4 x i32> %ext
}

; <8 x i8> gather sign-extended to <8 x i32>.
define arm_aapcs_vfpcc <8 x i32> @ptr_v8i8_sext32(<8 x i8*>* %offptr) {
; CHECK-LABEL: ptr_v8i8_sext32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r7, lr}
; CHECK-NEXT:    push {r4, r5, r7, lr}
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
; CHECK-NEXT:    vmov r1, s2
; CHECK-NEXT:    vmov r2, s3
; CHECK-NEXT:    vmov r3, s0
; CHECK-NEXT:    vmov r0, s4
; CHECK-NEXT:    vmov r5, s1
; CHECK-NEXT:    vmov r4, s7
; CHECK-NEXT:    ldrb.w r12, [r1]
; CHECK-NEXT:    vmov r1, s5
; CHECK-NEXT:    ldrb.w lr, [r2]
; CHECK-NEXT:    vmov r2, s6
; CHECK-NEXT:    ldrb r3, [r3]
; CHECK-NEXT:    ldrb r0, [r0]
; CHECK-NEXT:    ldrb r5, [r5]
; CHECK-NEXT:    vmov.32 q0[0], r3
; CHECK-NEXT:    vmov.32 q1[0], r0
; CHECK-NEXT:    ldrb r4, [r4]
; CHECK-NEXT:    vmov.32 q0[1], r5
; CHECK-NEXT:    vmov.32 q0[2], r12
; CHECK-NEXT:    vmov.32 q0[3], lr
; CHECK-NEXT:    vmovlb.s8 q0, q0
; CHECK-NEXT:    vmovlb.s16 q0, q0
; CHECK-NEXT:    ldrb r1, [r1]
; CHECK-NEXT:    ldrb r2, [r2]
; CHECK-NEXT:    vmov.32 q1[1], r1
; CHECK-NEXT:    vmov.32 q1[2], r2
; CHECK-NEXT:    vmov.32 q1[3], r4
; CHECK-NEXT:    vmovlb.s8 q1, q1
; CHECK-NEXT:    vmovlb.s16 q1, q1
; CHECK-NEXT:    pop {r4, r5, r7, pc}
entry:
  %offs = load <8 x i8*>, <8 x i8*>* %offptr, align 4
  %gather = call <8 x i8> @llvm.masked.gather.v8i8.v8p0i8(<8 x i8*> %offs, i32 1, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i8> undef)
  %ext = sext <8 x i8> %gather to <8 x i32>
  ret <8 x i32> %ext
}

; <8 x i8> gather zero-extended to <8 x i32>.
define arm_aapcs_vfpcc <8 x i32> @ptr_v8i8_zext32(<8 x i8*>* %offptr) {
; CHECK-LABEL: ptr_v8i8_zext32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r7, lr}
; CHECK-NEXT:    push {r4, r5, r7, lr}
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
; CHECK-NEXT:    vmov r1, s2
; CHECK-NEXT:    vmov r2, s3
; CHECK-NEXT:    vmov r0, s4
; CHECK-NEXT:    vmov r4, s0
; CHECK-NEXT:    vmov r3, s5
; CHECK-NEXT:    vmov r5, s1
; CHECK-NEXT:    ldrb.w r12, [r1]
; CHECK-NEXT:    vmov r1, s6
; CHECK-NEXT:    ldrb.w lr, [r2]
; CHECK-NEXT:    vmov r2, s7
; CHECK-NEXT:    ldrb r0, [r0]
; CHECK-NEXT:    vmov.i32 q1, #0xff
; CHECK-NEXT:    ldrb r4, [r4]
; CHECK-NEXT:    ldrb r3, [r3]
; CHECK-NEXT:    vmov.32 q2[0], r0
; CHECK-NEXT:    ldrb r5, [r5]
; CHECK-NEXT:    vmov.32 q0[0], r4
; CHECK-NEXT:    vmov.32 q2[1], r3
; CHECK-NEXT:    vmov.32 q0[1], r5
; CHECK-NEXT:    vmov.32 q0[2], r12
; CHECK-NEXT:    vmov.32 q0[3], lr
; CHECK-NEXT:    vand q0, q0, q1
; CHECK-NEXT:    ldrb r1, [r1]
; CHECK-NEXT:    ldrb r2, [r2]
; CHECK-NEXT:    vmov.32 q2[2], r1
; CHECK-NEXT:    vmov.32 q2[3], r2
; CHECK-NEXT:    vand q1, q2, q1
; CHECK-NEXT:    pop {r4, r5, r7, pc}
entry:
  %offs = load <8 x i8*>, <8 x i8*>* %offptr, align 4
  %gather = call <8 x i8> @llvm.masked.gather.v8i8.v8p0i8(<8 x i8*> %offs, i32 1, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i8> undef)
  %ext = zext <8 x i8> %gather to <8 x i32>
  ret <8 x i32> %ext
}

; loops

; Masked gather + masked store inside a low-overhead loop; predicated on
; non-null pointers. Expects a tail-predicated vptt/vldrwt/vstrwt sequence.
define void @foo_ptr_p_int32_t(i32* %dest, i32** %src, i32 %n) {
; CHECK-LABEL: foo_ptr_p_int32_t:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    bic r2, r2, #15
; CHECK-NEXT:    cmp r2, #1
; CHECK-NEXT:    it lt
; CHECK-NEXT:    poplt {r7, pc}
; CHECK-NEXT:  .LBB22_1: @ %vector.body.preheader
; CHECK-NEXT:    subs r2, #4
; CHECK-NEXT:    movs r3, #1
; CHECK-NEXT:    add.w lr, r3, r2, lsr #2
; CHECK-NEXT:    dls lr, lr
; CHECK-NEXT:  .LBB22_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r1], #16
; CHECK-NEXT:    vptt.i32 ne, q0, zr
; CHECK-NEXT:    vldrwt.u32 q1, [q0]
; CHECK-NEXT:    vstrwt.32 q1, [r0], #16
; CHECK-NEXT:    le lr, .LBB22_2
; CHECK-NEXT:  @ %bb.3: @ %for.end
; CHECK-NEXT:    pop {r7, pc}
entry:
  %and = and i32 %n, -16
  %cmp11 = icmp sgt i32 %and, 0
  br i1 %cmp11, label %vector.body, label %for.end

vector.body:                                      ; preds = %entry, %vector.body
  %index = phi i32 [ %index.next, %vector.body ], [ 0, %entry ]
  %0 = getelementptr inbounds i32*, i32** %src, i32 %index
  %1 = bitcast i32** %0 to <4 x i32*>*
  %wide.load = load <4 x i32*>, <4 x i32*>* %1, align 4
  %2 = icmp ne <4 x i32*> %wide.load, zeroinitializer
  %wide.masked.gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %wide.load, i32 4, <4 x i1> %2, <4 x i32> undef)
  %3 = getelementptr inbounds i32, i32* %dest, i32 %index
  %4 = bitcast i32* %3 to <4 x i32>*
  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %wide.masked.gather, <4 x i32>* %4, i32 4, <4 x i1> %2)
  %index.next = add i32 %index, 4
  %5 = icmp eq i32 %index.next, %and
  br i1 %5, label %for.end, label %vector.body

for.end:                                          ; preds = %vector.body, %entry
  ret void
}

; Same loop shape with float pointers bitcast to i32 gathers - should produce
; the same predicated gather/store loop.
define void @foo_ptr_p_float(float* %dest, float** %src, i32 %n) {
; CHECK-LABEL: foo_ptr_p_float:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    bic r2, r2, #15
; CHECK-NEXT:    cmp r2, #1
; CHECK-NEXT:    it lt
; CHECK-NEXT:    poplt {r7, pc}
; CHECK-NEXT:  .LBB23_1: @ %vector.body.preheader
; CHECK-NEXT:    subs r2, #4
; CHECK-NEXT:    movs r3, #1
; CHECK-NEXT:    add.w lr, r3, r2, lsr #2
; CHECK-NEXT:    dls lr, lr
; CHECK-NEXT:  .LBB23_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r1], #16
; CHECK-NEXT:    vptt.i32 ne, q0, zr
; CHECK-NEXT:    vldrwt.u32 q1, [q0]
; CHECK-NEXT:    vstrwt.32 q1, [r0], #16
; CHECK-NEXT:    le lr, .LBB23_2
; CHECK-NEXT:  @ %bb.3: @ %for.end
; CHECK-NEXT:    pop {r7, pc}
entry:
  %and = and i32 %n, -16
  %cmp11 = icmp sgt i32 %and, 0
  br i1 %cmp11, label %vector.body, label %for.end

vector.body:                                      ; preds = %entry, %vector.body
  %index = phi i32 [ %index.next, %vector.body ], [ 0, %entry ]
  %0 = getelementptr inbounds float*, float** %src, i32 %index
  %1 = bitcast float** %0 to <4 x float*>*
  %wide.load = load <4 x float*>, <4 x float*>* %1, align 4
  %2 = icmp ne <4 x float*> %wide.load, zeroinitializer
  %3 = bitcast <4 x float*> %wide.load to <4 x i32*>
  %wide.masked.gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %3, i32 4, <4 x i1> %2, <4 x i32> undef)
  %4 = getelementptr inbounds float, float* %dest, i32 %index
  %5 = bitcast float* %4 to <4 x i32>*
  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %wide.masked.gather, <4 x i32>* %5, i32 4, <4 x i1> %2)
  %index.next = add i32 %index, 4
  %6 = icmp eq i32 %index.next, %and
  br i1 %6, label %for.end, label %vector.body

for.end:                                          ; preds = %vector.body, %entry
  ret void
}

; GEP with a uniform offset on a vector of pointers - folds to vadd + gather.
define arm_aapcs_vfpcc <4 x i32> @qi4(<4 x i32*> %p) {
; CHECK-LABEL: qi4:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov.i32 q1, #0x10
; CHECK-NEXT:    vadd.i32 q1, q0, q1
; CHECK-NEXT:    vldrw.u32 q0, [q1]
; CHECK-NEXT:    bx lr
entry:
  %g = getelementptr inbounds i32, <4 x i32*> %p, i32 4
  %gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %g, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
  ret <4 x i32> %gather
}

; Base + zero-extended i8 offsets, gathered as i8 and sign-extended to i32 -
; the address computation vectorises (vldrb.u32 + vadd) but the loads scalarise.
define arm_aapcs_vfpcc <8 x i32> @sext_unsigned_unscaled_i8_i8_toi64(i8* %base, <8 x i8>* %offptr) {
; CHECK-LABEL: sext_unsigned_unscaled_i8_i8_toi64:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r7, lr}
; CHECK-NEXT:    push {r4, r5, r7, lr}
; CHECK-NEXT:    vldrb.u32 q0, [r1]
; CHECK-NEXT:    vldrb.u32 q1, [r1, #4]
; CHECK-NEXT:    vadd.i32 q0, q0, r0
; CHECK-NEXT:    vadd.i32 q1, q1, r0
; CHECK-NEXT:    vmov r2, s2
; CHECK-NEXT:    vmov r3, s3
; CHECK-NEXT:    vmov r0, s4
; CHECK-NEXT:    vmov r1, s5
; CHECK-NEXT:    vmov r5, s1
; CHECK-NEXT:    vmov r4, s7
; CHECK-NEXT:    ldrb.w r12, [r2]
; CHECK-NEXT:    vmov r2, s0
; CHECK-NEXT:    ldrb.w lr, [r3]
; CHECK-NEXT:    vmov r3, s6
; CHECK-NEXT:    ldrb r0, [r0]
; CHECK-NEXT:    ldrb r1, [r1]
; CHECK-NEXT:    ldrb r5, [r5]
; CHECK-NEXT:    vmov.32 q1[0], r0
; CHECK-NEXT:    ldrb r4, [r4]
; CHECK-NEXT:    vmov.32 q1[1], r1
; CHECK-NEXT:    ldrb r2, [r2]
; CHECK-NEXT:    ldrb r3, [r3]
; CHECK-NEXT:    vmov.32 q0[0], r2
; CHECK-NEXT:    vmov.32 q0[1], r5
; CHECK-NEXT:    vmov.32 q1[2], r3
; CHECK-NEXT:    vmov.32 q0[2], r12
; CHECK-NEXT:    vmov.32 q1[3], r4
; CHECK-NEXT:    vmov.32 q0[3], lr
; CHECK-NEXT:    vmovlb.s8 q1, q1
; CHECK-NEXT:    vmovlb.s8 q0, q0
; CHECK-NEXT:    vmovlb.s16 q1, q1
; CHECK-NEXT:    vmovlb.s16 q0, q0
; CHECK-NEXT:    pop {r4, r5, r7, pc}
entry:
  %offs = load <8 x i8>, <8 x i8>* %offptr, align 1
  %offs.zext = zext <8 x i8> %offs to <8 x i32>
  %ptrs = getelementptr inbounds i8, i8* %base, <8 x i32> %offs.zext
  %gather = call <8 x i8> @llvm.masked.gather.v8i8.v8p0i8(<8 x i8*> %ptrs, i32 1, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i8> undef)
  %gather.sext = sext <8 x i8> %gather to <8 x i32>
  ret <8 x i32> %gather.sext
}

declare <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*>, i32, <2 x i1>, <2 x i32>)
declare <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*>, i32, <4 x i1>, <4 x i32>)
declare <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*>, i32, <8 x i1>, <8 x i32>)
declare <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*>, i32, <16 x i1>, <16 x i32>)
declare <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*>, i32, <2 x i1>, <2 x float>)
declare <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*>, i32, <4 x i1>, <4 x float>)
declare <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*>, i32, <8 x i1>, <8 x float>)
declare <2 x i16> @llvm.masked.gather.v2i16.v2p0i16(<2 x i16*>, i32, <2 x i1>, <2 x i16>)
declare <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*>, i32, <4 x i1>, <4 x i16>)
declare <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*>, i32, <8 x i1>, <8 x i16>)
declare <16 x i16> @llvm.masked.gather.v16i16.v16p0i16(<16 x i16*>, i32, <16 x i1>, <16 x i16>)
declare <4 x half> @llvm.masked.gather.v4f16.v4p0f16(<4 x half*>, i32, <4 x i1>, <4 x half>)
declare <8 x half> @llvm.masked.gather.v8f16.v8p0f16(<8 x half*>, i32, <8 x i1>, <8 x half>)
declare <16 x half> @llvm.masked.gather.v16f16.v16p0f16(<16 x half*>, i32, <16 x i1>, <16 x half>)
declare <4 x i8> @llvm.masked.gather.v4i8.v4p0i8(<4 x i8*>, i32, <4 x i1>, <4 x i8>)
declare <8 x i8> @llvm.masked.gather.v8i8.v8p0i8(<8 x i8*>, i32, <8 x i1>, <8 x i8>)
declare <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*>, i32, <16 x i1>, <16 x i8>)
declare <32 x i8> @llvm.masked.gather.v32i8.v32p0i8(<32 x i8*>, i32, <32 x i1>, <32 x i8>)
declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32, <4 x i1>)