; RUN: llc < %s -mtriple=aarch64-none-eabi | FileCheck %s

; Simple load of v4f16
define <4 x half> @load_64(<4 x half>* nocapture readonly %a) #0 {
; CHECK-LABEL: load_64:
; CHECK: ldr d0, [x0]
entry:
  %0 = load <4 x half>, <4 x half>* %a, align 8
  ret <4 x half> %0
}

; Simple load of v8f16
define <8 x half> @load_128(<8 x half>* nocapture readonly %a) #0 {
; CHECK-LABEL: load_128:
; CHECK: ldr q0, [x0]
entry:
  %0 = load <8 x half>, <8 x half>* %a, align 16
  ret <8 x half> %0
}

; Duplicating load to v4f16
define <4 x half> @load_dup_64(half* nocapture readonly %a) #0 {
; CHECK-LABEL: load_dup_64:
; CHECK: ld1r { v0.4h }, [x0]
entry:
  %0 = load half, half* %a, align 2
  %1 = insertelement <4 x half> undef, half %0, i32 0
  %2 = shufflevector <4 x half> %1, <4 x half> undef, <4 x i32> zeroinitializer
  ret <4 x half> %2
}

; Duplicating load to v8f16
define <8 x half> @load_dup_128(half* nocapture readonly %a) #0 {
; CHECK-LABEL: load_dup_128:
; CHECK: ld1r { v0.8h }, [x0]
entry:
  %0 = load half, half* %a, align 2
  %1 = insertelement <8 x half> undef, half %0, i32 0
  %2 = shufflevector <8 x half> %1, <8 x half> undef, <8 x i32> zeroinitializer
  ret <8 x half> %2
}

; Load to one lane of v4f16
define <4 x half> @load_lane_64(half* nocapture readonly %a, <4 x half> %b) #0 {
; CHECK-LABEL: load_lane_64:
; CHECK: ld1 { v0.h }[2], [x0]
entry:
  %0 = load half, half* %a, align 2
  %1 = insertelement <4 x half> %b, half %0, i32 2
  ret <4 x half> %1
}

; Load to one lane of v8f16
define <8 x half> @load_lane_128(half* nocapture readonly %a, <8 x half> %b) #0 {
; CHECK-LABEL: load_lane_128:
; CHECK: ld1 { v0.h }[5], [x0]
entry:
  %0 = load half, half* %a, align 2
  %1 = insertelement <8 x half> %b, half %0, i32 5
  ret <8 x half> %1
}

; Simple store of v4f16
define void @store_64(<4 x half>* nocapture %a, <4 x half> %b) #1 {
; CHECK-LABEL: store_64:
; CHECK: str d0, [x0]
entry:
  store <4 x half> %b, <4 x half>* %a, align 8
  ret void
}

; Simple store of v8f16
define void @store_128(<8 x half>* nocapture %a, <8 x half> %b) #1 {
; CHECK-LABEL: store_128:
; CHECK: str q0, [x0]
entry:
  store <8 x half> %b, <8 x half>* %a, align 16
  ret void
}

; Store from one lane of v4f16
define void @store_lane_64(half* nocapture %a, <4 x half> %b) #1 {
; CHECK-LABEL: store_lane_64:
; CHECK: st1 { v0.h }[2], [x0]
entry:
  %0 = extractelement <4 x half> %b, i32 2
  store half %0, half* %a, align 2
  ret void
}

; Store from one lane of v8f16
define void @store_lane_128(half* nocapture %a, <8 x half> %b) #1 {
; CHECK-LABEL: store_lane_128:
; CHECK: st1 { v0.h }[5], [x0]
entry:
  %0 = extractelement <8 x half> %b, i32 5
  store half %0, half* %a, align 2
  ret void
}
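; For reference, the scalar/lane patterns above are what the corresponding
; ACLE intrinsics from arm_neon.h lower to. A minimal C sketch, assuming a
; target with __fp16 support (intrinsic names are the standard ACLE ones):
;   float16x4_t v = vld1_f16(p);             // ldr d0: plain 64-bit load
;   float16x4_t d = vld1_dup_f16(p);         // ld1r: broadcast one half
;   float16x4_t l = vld1_lane_f16(p, b, 2);  // ld1 to lane 2 of b
;   vst1_lane_f16(p, b, 2);                  // st1 from lane 2 of b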
; NEON intrinsics - (de-)interleaving loads and stores
declare { <4 x half>, <4 x half> } @llvm.aarch64.neon.ld2.v4f16.p0v4f16(<4 x half>*)
declare { <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld3.v4f16.p0v4f16(<4 x half>*)
declare { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld4.v4f16.p0v4f16(<4 x half>*)
declare void @llvm.aarch64.neon.st2.v4f16.p0v4f16(<4 x half>, <4 x half>, <4 x half>*)
declare void @llvm.aarch64.neon.st3.v4f16.p0v4f16(<4 x half>, <4 x half>, <4 x half>, <4 x half>*)
declare void @llvm.aarch64.neon.st4.v4f16.p0v4f16(<4 x half>, <4 x half>, <4 x half>, <4 x half>, <4 x half>*)
declare { <8 x half>, <8 x half> } @llvm.aarch64.neon.ld2.v8f16.p0v8f16(<8 x half>*)
declare { <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld3.v8f16.p0v8f16(<8 x half>*)
declare { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld4.v8f16.p0v8f16(<8 x half>*)
declare void @llvm.aarch64.neon.st2.v8f16.p0v8f16(<8 x half>, <8 x half>, <8 x half>*)
declare void @llvm.aarch64.neon.st3.v8f16.p0v8f16(<8 x half>, <8 x half>, <8 x half>, <8 x half>*)
declare void @llvm.aarch64.neon.st4.v8f16.p0v8f16(<8 x half>, <8 x half>, <8 x half>, <8 x half>, <8 x half>*)

; Load 2 x v4f16 with de-interleaving
define { <4 x half>, <4 x half> } @load_interleave_64_2(<4 x half>* %a) #0 {
; CHECK-LABEL: load_interleave_64_2:
; CHECK: ld2 { v0.4h, v1.4h }, [x0]
entry:
  %0 = tail call { <4 x half>, <4 x half> } @llvm.aarch64.neon.ld2.v4f16.p0v4f16(<4 x half>* %a)
  ret { <4 x half>, <4 x half> } %0
}

; Load 3 x v4f16 with de-interleaving
define { <4 x half>, <4 x half>, <4 x half> } @load_interleave_64_3(<4 x half>* %a) #0 {
; CHECK-LABEL: load_interleave_64_3:
; CHECK: ld3 { v0.4h, v1.4h, v2.4h }, [x0]
entry:
  %0 = tail call { <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld3.v4f16.p0v4f16(<4 x half>* %a)
  ret { <4 x half>, <4 x half>, <4 x half> } %0
}

; Load 4 x v4f16 with de-interleaving
define { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @load_interleave_64_4(<4 x half>* %a) #0 {
; CHECK-LABEL: load_interleave_64_4:
; CHECK: ld4 { v0.4h, v1.4h, v2.4h, v3.4h }, [x0]
entry:
  %0 = tail call { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld4.v4f16.p0v4f16(<4 x half>* %a)
  ret { <4 x half>, <4 x half>, <4 x half>, <4 x half> } %0
}

; Store 2 x v4f16 with interleaving
define void @store_interleave_64_2(<4 x half>* %a, <4 x half> %b, <4 x half> %c) #0 {
; CHECK-LABEL: store_interleave_64_2:
; CHECK: st2 { v0.4h, v1.4h }, [x0]
entry:
  tail call void @llvm.aarch64.neon.st2.v4f16.p0v4f16(<4 x half> %b, <4 x half> %c, <4 x half>* %a)
  ret void
}

; Store 3 x v4f16 with interleaving
define void @store_interleave_64_3(<4 x half>* %a, <4 x half> %b, <4 x half> %c, <4 x half> %d) #0 {
; CHECK-LABEL: store_interleave_64_3:
; CHECK: st3 { v0.4h, v1.4h, v2.4h }, [x0]
entry:
  tail call void @llvm.aarch64.neon.st3.v4f16.p0v4f16(<4 x half> %b, <4 x half> %c, <4 x half> %d, <4 x half>* %a)
  ret void
}

; Store 4 x v4f16 with interleaving
define void @store_interleave_64_4(<4 x half>* %a, <4 x half> %b, <4 x half> %c, <4 x half> %d, <4 x half> %e) #0 {
; CHECK-LABEL: store_interleave_64_4:
; CHECK: st4 { v0.4h, v1.4h, v2.4h, v3.4h }, [x0]
entry:
  tail call void @llvm.aarch64.neon.st4.v4f16.p0v4f16(<4 x half> %b, <4 x half> %c, <4 x half> %d, <4 x half> %e, <4 x half>* %a)
  ret void
}

; Load 2 x v8f16 with de-interleaving
define { <8 x half>, <8 x half> } @load_interleave_128_2(<8 x half>* %a) #0 {
; CHECK-LABEL: load_interleave_128_2:
; CHECK: ld2 { v0.8h, v1.8h }, [x0]
entry:
  %0 = tail call { <8 x half>, <8 x half> } @llvm.aarch64.neon.ld2.v8f16.p0v8f16(<8 x half>* %a)
  ret { <8 x half>, <8 x half> } %0
}

; Load 3 x v8f16 with de-interleaving
define { <8 x half>, <8 x half>, <8 x half> } @load_interleave_128_3(<8 x half>* %a) #0 {
; CHECK-LABEL: load_interleave_128_3:
; CHECK: ld3 { v0.8h, v1.8h, v2.8h }, [x0]
entry:
  %0 = tail call { <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld3.v8f16.p0v8f16(<8 x half>* %a)
  ret { <8 x half>, <8 x half>, <8 x half> } %0
}

; Load 4 x v8f16 with de-interleaving
define { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @load_interleave_128_4(<8 x half>* %a) #0 {
; CHECK-LABEL: load_interleave_128_4:
; CHECK: ld4 { v0.8h, v1.8h, v2.8h, v3.8h }, [x0]
entry:
  %0 = tail call { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld4.v8f16.p0v8f16(<8 x half>* %a)
  ret { <8 x half>, <8 x half>, <8 x half>, <8 x half> } %0
}

; Store 2 x v8f16 with interleaving
define void @store_interleave_128_2(<8 x half>* %a, <8 x half> %b, <8 x half> %c) #0 {
; CHECK-LABEL: store_interleave_128_2:
; CHECK: st2 { v0.8h, v1.8h }, [x0]
entry:
  tail call void @llvm.aarch64.neon.st2.v8f16.p0v8f16(<8 x half> %b, <8 x half> %c, <8 x half>* %a)
  ret void
}

; Store 3 x v8f16 with interleaving
define void @store_interleave_128_3(<8 x half>* %a, <8 x half> %b, <8 x half> %c, <8 x half> %d) #0 {
; CHECK-LABEL: store_interleave_128_3:
; CHECK: st3 { v0.8h, v1.8h, v2.8h }, [x0]
entry:
  tail call void @llvm.aarch64.neon.st3.v8f16.p0v8f16(<8 x half> %b, <8 x half> %c, <8 x half> %d, <8 x half>* %a)
  ret void
}

; Store 4 x v8f16 with interleaving
define void @store_interleave_128_4(<8 x half>* %a, <8 x half> %b, <8 x half> %c, <8 x half> %d, <8 x half> %e) #0 {
; CHECK-LABEL: store_interleave_128_4:
; CHECK: st4 { v0.8h, v1.8h, v2.8h, v3.8h }, [x0]
entry:
  tail call void @llvm.aarch64.neon.st4.v8f16.p0v8f16(<8 x half> %b, <8 x half> %c, <8 x half> %d, <8 x half> %e, <8 x half>* %a)
  ret void
}
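; ld2/ld3/ld4 de-interleave as they load: for ld2, memory holding
; a0,b0,a1,b1,... ends up as v0 = {a0,a1,...} and v1 = {b0,b1,...};
; st2/st3/st4 interleave on the way back out. A hedged C-level sketch of
; the same round trip via the ACLE intrinsics (assuming __fp16 support):
;   float16x4x2_t r = vld2_f16(p);  // ld2 { v0.4h, v1.4h }, [x0]
;   vst2_f16(p, r);                 // st2 { v0.4h, v1.4h }, [x0]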
; NEON intrinsics - duplicating loads
declare { <4 x half>, <4 x half> } @llvm.aarch64.neon.ld2r.v4f16.p0f16(half*)
declare { <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld3r.v4f16.p0f16(half*)
declare { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld4r.v4f16.p0f16(half*)
declare { <8 x half>, <8 x half> } @llvm.aarch64.neon.ld2r.v8f16.p0f16(half*)
declare { <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld3r.v8f16.p0f16(half*)
declare { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld4r.v8f16.p0f16(half*)

; Load 2 x v4f16 with duplication
define { <4 x half>, <4 x half> } @load_dup_64_2(half* %a) #0 {
; CHECK-LABEL: load_dup_64_2:
; CHECK: ld2r { v0.4h, v1.4h }, [x0]
entry:
  %0 = tail call { <4 x half>, <4 x half> } @llvm.aarch64.neon.ld2r.v4f16.p0f16(half* %a)
  ret { <4 x half>, <4 x half> } %0
}

; Load 3 x v4f16 with duplication
define { <4 x half>, <4 x half>, <4 x half> } @load_dup_64_3(half* %a) #0 {
; CHECK-LABEL: load_dup_64_3:
; CHECK: ld3r { v0.4h, v1.4h, v2.4h }, [x0]
entry:
  %0 = tail call { <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld3r.v4f16.p0f16(half* %a)
  ret { <4 x half>, <4 x half>, <4 x half> } %0
}

; Load 4 x v4f16 with duplication
define { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @load_dup_64_4(half* %a) #0 {
; CHECK-LABEL: load_dup_64_4:
; CHECK: ld4r { v0.4h, v1.4h, v2.4h, v3.4h }, [x0]
entry:
  %0 = tail call { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld4r.v4f16.p0f16(half* %a)
  ret { <4 x half>, <4 x half>, <4 x half>, <4 x half> } %0
}

; Load 2 x v8f16 with duplication
define { <8 x half>, <8 x half> } @load_dup_128_2(half* %a) #0 {
; CHECK-LABEL: load_dup_128_2:
; CHECK: ld2r { v0.8h, v1.8h }, [x0]
entry:
  %0 = tail call { <8 x half>, <8 x half> } @llvm.aarch64.neon.ld2r.v8f16.p0f16(half* %a)
  ret { <8 x half>, <8 x half> } %0
}

; Load 3 x v8f16 with duplication
define { <8 x half>, <8 x half>, <8 x half> } @load_dup_128_3(half* %a) #0 {
; CHECK-LABEL: load_dup_128_3:
; CHECK: ld3r { v0.8h, v1.8h, v2.8h }, [x0]
entry:
  %0 = tail call { <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld3r.v8f16.p0f16(half* %a)
  ret { <8 x half>, <8 x half>, <8 x half> } %0
}

; Load 4 x v8f16 with duplication
define { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @load_dup_128_4(half* %a) #0 {
; CHECK-LABEL: load_dup_128_4:
; CHECK: ld4r { v0.8h, v1.8h, v2.8h, v3.8h }, [x0]
entry:
  %0 = tail call { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld4r.v8f16.p0f16(half* %a)
  ret { <8 x half>, <8 x half>, <8 x half>, <8 x half> } %0
}
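; ld2r/ld3r/ld4r read 2-4 consecutive half values and broadcast each one
; across every lane of its own result register. The matching ACLE intrinsic
; would be, e.g. (assuming __fp16 support):
;   float16x4x2_t r = vld2_dup_f16(p);  // ld2r { v0.4h, v1.4h }, [x0]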
; NEON intrinsics - loads and stores to/from one lane
declare { <4 x half>, <4 x half> } @llvm.aarch64.neon.ld2lane.v4f16.p0f16(<4 x half>, <4 x half>, i64, half*)
declare { <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld3lane.v4f16.p0f16(<4 x half>, <4 x half>, <4 x half>, i64, half*)
declare { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld4lane.v4f16.p0f16(<4 x half>, <4 x half>, <4 x half>, <4 x half>, i64, half*)
declare void @llvm.aarch64.neon.st2lane.v4f16.p0f16(<4 x half>, <4 x half>, i64, half*)
declare void @llvm.aarch64.neon.st3lane.v4f16.p0f16(<4 x half>, <4 x half>, <4 x half>, i64, half*)
declare void @llvm.aarch64.neon.st4lane.v4f16.p0f16(<4 x half>, <4 x half>, <4 x half>, <4 x half>, i64, half*)
declare { <8 x half>, <8 x half> } @llvm.aarch64.neon.ld2lane.v8f16.p0f16(<8 x half>, <8 x half>, i64, half*)
declare { <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld3lane.v8f16.p0f16(<8 x half>, <8 x half>, <8 x half>, i64, half*)
declare { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld4lane.v8f16.p0f16(<8 x half>, <8 x half>, <8 x half>, <8 x half>, i64, half*)
declare void @llvm.aarch64.neon.st2lane.v8f16.p0f16(<8 x half>, <8 x half>, i64, half*)
declare void @llvm.aarch64.neon.st3lane.v8f16.p0f16(<8 x half>, <8 x half>, <8 x half>, i64, half*)
declare void @llvm.aarch64.neon.st4lane.v8f16.p0f16(<8 x half>, <8 x half>, <8 x half>, <8 x half>, i64, half*)

; Load one lane of 2 x v4f16
define { <4 x half>, <4 x half> } @load_lane_64_2(half* %a, <4 x half> %b, <4 x half> %c) #0 {
; CHECK-LABEL: load_lane_64_2:
; CHECK: ld2 { v0.h, v1.h }[2], [x0]
entry:
  %0 = tail call { <4 x half>, <4 x half> } @llvm.aarch64.neon.ld2lane.v4f16.p0f16(<4 x half> %b, <4 x half> %c, i64 2, half* %a)
  ret { <4 x half>, <4 x half> } %0
}

; Load one lane of 3 x v4f16
define { <4 x half>, <4 x half>, <4 x half> } @load_lane_64_3(half* %a, <4 x half> %b, <4 x half> %c, <4 x half> %d) #0 {
; CHECK-LABEL: load_lane_64_3:
; CHECK: ld3 { v0.h, v1.h, v2.h }[2], [x0]
entry:
  %0 = tail call { <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld3lane.v4f16.p0f16(<4 x half> %b, <4 x half> %c, <4 x half> %d, i64 2, half* %a)
  ret { <4 x half>, <4 x half>, <4 x half> } %0
}

; Load one lane of 4 x v4f16
define { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @load_lane_64_4(half* %a, <4 x half> %b, <4 x half> %c, <4 x half> %d, <4 x half> %e) #0 {
; CHECK-LABEL: load_lane_64_4:
; CHECK: ld4 { v0.h, v1.h, v2.h, v3.h }[2], [x0]
entry:
  %0 = tail call { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld4lane.v4f16.p0f16(<4 x half> %b, <4 x half> %c, <4 x half> %d, <4 x half> %e, i64 2, half* %a)
  ret { <4 x half>, <4 x half>, <4 x half>, <4 x half> } %0
}

; Store one lane of 2 x v4f16
define void @store_lane_64_2(half* %a, <4 x half> %b, <4 x half> %c) #0 {
; CHECK-LABEL: store_lane_64_2:
; CHECK: st2 { v0.h, v1.h }[2], [x0]
entry:
  tail call void @llvm.aarch64.neon.st2lane.v4f16.p0f16(<4 x half> %b, <4 x half> %c, i64 2, half* %a)
  ret void
}

; Store one lane of 3 x v4f16
define void @store_lane_64_3(half* %a, <4 x half> %b, <4 x half> %c, <4 x half> %d) #0 {
; CHECK-LABEL: store_lane_64_3:
; CHECK: st3 { v0.h, v1.h, v2.h }[2], [x0]
entry:
  tail call void @llvm.aarch64.neon.st3lane.v4f16.p0f16(<4 x half> %b, <4 x half> %c, <4 x half> %d, i64 2, half* %a)
  ret void
}

; Store one lane of 4 x v4f16
define void @store_lane_64_4(half* %a, <4 x half> %b, <4 x half> %c, <4 x half> %d, <4 x half> %e) #0 {
; CHECK-LABEL: store_lane_64_4:
; CHECK: st4 { v0.h, v1.h, v2.h, v3.h }[2], [x0]
entry:
  tail call void @llvm.aarch64.neon.st4lane.v4f16.p0f16(<4 x half> %b, <4 x half> %c, <4 x half> %d, <4 x half> %e, i64 2, half* %a)
  ret void
}

; Load one lane of 2 x v8f16
define { <8 x half>, <8 x half> } @load_lane_128_2(half* %a, <8 x half> %b, <8 x half> %c) #0 {
; CHECK-LABEL: load_lane_128_2:
; CHECK: ld2 { v0.h, v1.h }[2], [x0]
entry:
  %0 = tail call { <8 x half>, <8 x half> } @llvm.aarch64.neon.ld2lane.v8f16.p0f16(<8 x half> %b, <8 x half> %c, i64 2, half* %a)
  ret { <8 x half>, <8 x half> } %0
}

; Load one lane of 3 x v8f16
define { <8 x half>, <8 x half>, <8 x half> } @load_lane_128_3(half* %a, <8 x half> %b, <8 x half> %c, <8 x half> %d) #0 {
; CHECK-LABEL: load_lane_128_3:
; CHECK: ld3 { v0.h, v1.h, v2.h }[2], [x0]
entry:
  %0 = tail call { <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld3lane.v8f16.p0f16(<8 x half> %b, <8 x half> %c, <8 x half> %d, i64 2, half* %a)
  ret { <8 x half>, <8 x half>, <8 x half> } %0
}

; Load one lane of 4 x v8f16
define { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @load_lane_128_4(half* %a, <8 x half> %b, <8 x half> %c, <8 x half> %d, <8 x half> %e) #0 {
; CHECK-LABEL: load_lane_128_4:
; CHECK: ld4 { v0.h, v1.h, v2.h, v3.h }[2], [x0]
entry:
  %0 = tail call { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld4lane.v8f16.p0f16(<8 x half> %b, <8 x half> %c, <8 x half> %d, <8 x half> %e, i64 2, half* %a)
  ret { <8 x half>, <8 x half>, <8 x half>, <8 x half> } %0
}

; Store one lane of 2 x v8f16
define void @store_lane_128_2(half* %a, <8 x half> %b, <8 x half> %c) #0 {
; CHECK-LABEL: store_lane_128_2:
; CHECK: st2 { v0.h, v1.h }[2], [x0]
entry:
  tail call void @llvm.aarch64.neon.st2lane.v8f16.p0f16(<8 x half> %b, <8 x half> %c, i64 2, half* %a)
  ret void
}

; Store one lane of 3 x v8f16
define void @store_lane_128_3(half* %a, <8 x half> %b, <8 x half> %c, <8 x half> %d) #0 {
; CHECK-LABEL: store_lane_128_3:
; CHECK: st3 { v0.h, v1.h, v2.h }[2], [x0]
entry:
  tail call void @llvm.aarch64.neon.st3lane.v8f16.p0f16(<8 x half> %b, <8 x half> %c, <8 x half> %d, i64 2, half* %a)
  ret void
}

; Store one lane of 4 x v8f16
define void @store_lane_128_4(half* %a, <8 x half> %b, <8 x half> %c, <8 x half> %d, <8 x half> %e) #0 {
; CHECK-LABEL: store_lane_128_4:
; CHECK: st4 { v0.h, v1.h, v2.h, v3.h }[2], [x0]
entry:
  tail call void @llvm.aarch64.neon.st4lane.v8f16.p0f16(<8 x half> %b, <8 x half> %c, <8 x half> %d, <8 x half> %e, i64 2, half* %a)
  ret void
}
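; The lane variants move one element per register between memory and the
; same lane of each register in the list, leaving all other lanes intact.
; A C-level sketch with the corresponding ACLE intrinsics (assuming __fp16
; support; lane 2 to match the tests above):
;   float16x4x2_t r = vld2_lane_f16(p, src, 2);  // ld2 { v0.h, v1.h }[2]
;   vst2_lane_f16(p, r, 2);                      // st2 { v0.h, v1.h }[2]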
; NEON intrinsics - load/store without interleaving
declare { <4 x half>, <4 x half> } @llvm.aarch64.neon.ld1x2.v4f16.p0v4f16(<4 x half>*)
declare { <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld1x3.v4f16.p0v4f16(<4 x half>*)
declare { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld1x4.v4f16.p0v4f16(<4 x half>*)
declare void @llvm.aarch64.neon.st1x2.v4f16.p0v4f16(<4 x half>, <4 x half>, <4 x half>*)
declare void @llvm.aarch64.neon.st1x3.v4f16.p0v4f16(<4 x half>, <4 x half>, <4 x half>, <4 x half>*)
declare void @llvm.aarch64.neon.st1x4.v4f16.p0v4f16(<4 x half>, <4 x half>, <4 x half>, <4 x half>, <4 x half>*)
declare { <8 x half>, <8 x half> } @llvm.aarch64.neon.ld1x2.v8f16.p0v8f16(<8 x half>*)
declare { <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld1x3.v8f16.p0v8f16(<8 x half>*)
declare { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld1x4.v8f16.p0v8f16(<8 x half>*)
declare void @llvm.aarch64.neon.st1x2.v8f16.p0v8f16(<8 x half>, <8 x half>, <8 x half>*)
declare void @llvm.aarch64.neon.st1x3.v8f16.p0v8f16(<8 x half>, <8 x half>, <8 x half>, <8 x half>*)
declare void @llvm.aarch64.neon.st1x4.v8f16.p0v8f16(<8 x half>, <8 x half>, <8 x half>, <8 x half>, <8 x half>*)

; Load 2 x v4f16 without de-interleaving
define { <4 x half>, <4 x half> } @load_64_2(<4 x half>* %a) #0 {
; CHECK-LABEL: load_64_2:
; CHECK: ld1 { v0.4h, v1.4h }, [x0]
entry:
  %0 = tail call { <4 x half>, <4 x half> } @llvm.aarch64.neon.ld1x2.v4f16.p0v4f16(<4 x half>* %a)
  ret { <4 x half>, <4 x half> } %0
}

; Load 3 x v4f16 without de-interleaving
define { <4 x half>, <4 x half>, <4 x half> } @load_64_3(<4 x half>* %a) #0 {
; CHECK-LABEL: load_64_3:
; CHECK: ld1 { v0.4h, v1.4h, v2.4h }, [x0]
entry:
  %0 = tail call { <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld1x3.v4f16.p0v4f16(<4 x half>* %a)
  ret { <4 x half>, <4 x half>, <4 x half> } %0
}

; Load 4 x v4f16 without de-interleaving
define { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @load_64_4(<4 x half>* %a) #0 {
; CHECK-LABEL: load_64_4:
; CHECK: ld1 { v0.4h, v1.4h, v2.4h, v3.4h }, [x0]
entry:
  %0 = tail call { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld1x4.v4f16.p0v4f16(<4 x half>* %a)
  ret { <4 x half>, <4 x half>, <4 x half>, <4 x half> } %0
}

; Store 2 x v4f16 without interleaving
define void @store_64_2(<4 x half>* %a, <4 x half> %b, <4 x half> %c) #0 {
; CHECK-LABEL: store_64_2:
; CHECK: st1 { v0.4h, v1.4h }, [x0]
entry:
  tail call void @llvm.aarch64.neon.st1x2.v4f16.p0v4f16(<4 x half> %b, <4 x half> %c, <4 x half>* %a)
  ret void
}

; Store 3 x v4f16 without interleaving
define void @store_64_3(<4 x half>* %a, <4 x half> %b, <4 x half> %c, <4 x half> %d) #0 {
; CHECK-LABEL: store_64_3:
; CHECK: st1 { v0.4h, v1.4h, v2.4h }, [x0]
entry:
  tail call void @llvm.aarch64.neon.st1x3.v4f16.p0v4f16(<4 x half> %b, <4 x half> %c, <4 x half> %d, <4 x half>* %a)
  ret void
}

; Store 4 x v4f16 without interleaving
define void @store_64_4(<4 x half>* %a, <4 x half> %b, <4 x half> %c, <4 x half> %d, <4 x half> %e) #0 {
; CHECK-LABEL: store_64_4:
; CHECK: st1 { v0.4h, v1.4h, v2.4h, v3.4h }, [x0]
entry:
  tail call void @llvm.aarch64.neon.st1x4.v4f16.p0v4f16(<4 x half> %b, <4 x half> %c, <4 x half> %d, <4 x half> %e, <4 x half>* %a)
  ret void
}
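; Unlike ld2/st2 above, the ld1x2/ld1x3/ld1x4 forms transfer 2-4 registers
; of contiguous memory with no element shuffling. ACLE exposes these as the
; _x2/_x3/_x4 intrinsic variants, e.g. (assuming they are available for
; __fp16 on the target):
;   float16x4x2_t r = vld1_f16_x2(p);  // ld1 { v0.4h, v1.4h }, [x0]
;   vst1_f16_x2(p, r);                 // st1 { v0.4h, v1.4h }, [x0]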
; Load 2 x v8f16 without de-interleaving
define { <8 x half>, <8 x half> } @load_128_2(<8 x half>* %a) #0 {
; CHECK-LABEL: load_128_2:
; CHECK: ld1 { v0.8h, v1.8h }, [x0]
entry:
  %0 = tail call { <8 x half>, <8 x half> } @llvm.aarch64.neon.ld1x2.v8f16.p0v8f16(<8 x half>* %a)
  ret { <8 x half>, <8 x half> } %0
}

; Load 3 x v8f16 without de-interleaving
define { <8 x half>, <8 x half>, <8 x half> } @load_128_3(<8 x half>* %a) #0 {
; CHECK-LABEL: load_128_3:
; CHECK: ld1 { v0.8h, v1.8h, v2.8h }, [x0]
entry:
  %0 = tail call { <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld1x3.v8f16.p0v8f16(<8 x half>* %a)
  ret { <8 x half>, <8 x half>, <8 x half> } %0
}

; Load 4 x v8f16 without de-interleaving
define { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @load_128_4(<8 x half>* %a) #0 {
; CHECK-LABEL: load_128_4:
; CHECK: ld1 { v0.8h, v1.8h, v2.8h, v3.8h }, [x0]
entry:
  %0 = tail call { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld1x4.v8f16.p0v8f16(<8 x half>* %a)
  ret { <8 x half>, <8 x half>, <8 x half>, <8 x half> } %0
}
; Store 2 x v8f16 without interleaving
define void @store_128_2(<8 x half>* %a, <8 x half> %b, <8 x half> %c) #0 {
; CHECK-LABEL: store_128_2:
; CHECK: st1 { v0.8h, v1.8h }, [x0]
entry:
  tail call void @llvm.aarch64.neon.st1x2.v8f16.p0v8f16(<8 x half> %b, <8 x half> %c, <8 x half>* %a)
  ret void
}

; Store 3 x v8f16 without interleaving
define void @store_128_3(<8 x half>* %a, <8 x half> %b, <8 x half> %c, <8 x half> %d) #0 {
; CHECK-LABEL: store_128_3:
; CHECK: st1 { v0.8h, v1.8h, v2.8h }, [x0]
entry:
  tail call void @llvm.aarch64.neon.st1x3.v8f16.p0v8f16(<8 x half> %b, <8 x half> %c, <8 x half> %d, <8 x half>* %a)
  ret void
}

; Store 4 x v8f16 without interleaving
define void @store_128_4(<8 x half>* %a, <8 x half> %b, <8 x half> %c, <8 x half> %d, <8 x half> %e) #0 {
; CHECK-LABEL: store_128_4:
; CHECK: st1 { v0.8h, v1.8h, v2.8h, v3.8h }, [x0]
entry:
  tail call void @llvm.aarch64.neon.st1x4.v8f16.p0v8f16(<8 x half> %b, <8 x half> %c, <8 x half> %d, <8 x half> %e, <8 x half>* %a)
  ret void
}
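; Attribute groups #0 and #1 are referenced by the functions above; the
; definitions below are assumed (nounwind), since the module does not
; parse without them.
attributes #0 = { nounwind }
attributes #1 = { nounwind }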