; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve,+fullfp16 -verify-machineinstrs %s -o - | FileCheck %s
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -verify-machineinstrs %s -o - | FileCheck %s

; Splat (insertelement into lane 0 + zero-mask shufflevector) and lane
; duplication tests for 128-bit vectors. Both llc invocations above are
; verified against the same set of expected-output lines.

;------------------------------------------------------------------------------
; Scalar -> vector splats, integer element types.
;------------------------------------------------------------------------------

; i32 splat: expected output is a single vdup.32 from r0.
define arm_aapcs_vfpcc <4 x i32> @vdup_i32(i32 %src) {
; CHECK-LABEL: vdup_i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vdup.32 q0, r0
; CHECK-NEXT:    bx lr
entry:
  %lane0 = insertelement <4 x i32> undef, i32 %src, i32 0
  %splat = shufflevector <4 x i32> %lane0, <4 x i32> undef, <4 x i32> zeroinitializer
  ret <4 x i32> %splat
}

; i16 splat: expected output is a single vdup.16 from r0.
define arm_aapcs_vfpcc <8 x i16> @vdup_i16(i16 %src) {
; CHECK-LABEL: vdup_i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vdup.16 q0, r0
; CHECK-NEXT:    bx lr
entry:
  %lane0 = insertelement <8 x i16> undef, i16 %src, i32 0
  %splat = shufflevector <8 x i16> %lane0, <8 x i16> undef, <8 x i32> zeroinitializer
  ret <8 x i16> %splat
}

; i8 splat: expected output is a single vdup.8 from r0.
define arm_aapcs_vfpcc <16 x i8> @vdup_i8(i8 %src) {
; CHECK-LABEL: vdup_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vdup.8 q0, r0
; CHECK-NEXT:    bx lr
entry:
  %lane0 = insertelement <16 x i8> undef, i8 %src, i32 0
  %splat = shufflevector <16 x i8> %lane0, <16 x i8> undef, <16 x i32> zeroinitializer
  ret <16 x i8> %splat
}

; i64 splat: the expected output contains no vdup; the r0/r1 pair is written
; into the four 32-bit lanes with individual vmov.32 inserts.
define arm_aapcs_vfpcc <2 x i64> @vdup_i64(i64 %src) {
; CHECK-LABEL: vdup_i64:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov.32 q0[0], r0
; CHECK-NEXT:    vmov.32 q0[1], r1
; CHECK-NEXT:    vmov.32 q0[2], r0
; CHECK-NEXT:    vmov.32 q0[3], r1
; CHECK-NEXT:    bx lr
entry:
  %lane0 = insertelement <2 x i64> undef, i64 %src, i32 0
  %splat = shufflevector <2 x i64> %lane0, <2 x i64> undef, <2 x i32> zeroinitializer
  ret <2 x i64> %splat
}

;------------------------------------------------------------------------------
; Scalar -> vector splats, floating-point element types.
;------------------------------------------------------------------------------

; f32 splat of an incoming argument: s0 is moved to r0, then vdup.32.
define arm_aapcs_vfpcc <4 x float> @vdup_f32_1(float %src) {
; CHECK-LABEL: vdup_f32_1:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov r0, s0
; CHECK-NEXT:    vdup.32 q0, r0
; CHECK-NEXT:    bx lr
entry:
  %lane0 = insertelement <4 x float> undef, float %src, i32 0
  %splat = shufflevector <4 x float> %lane0, <4 x float> undef, <4 x i32> zeroinitializer
  ret <4 x float> %splat
}

; f32 splat of a computed value (the sum of the two arguments).
define arm_aapcs_vfpcc <4 x float> @vdup_f32_2(float %src1, float %src2) {
; CHECK-LABEL: vdup_f32_2:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vadd.f32 s0, s0, s1
; CHECK-NEXT:    vmov r0, s0
; CHECK-NEXT:    vdup.32 q0, r0
; CHECK-NEXT:    bx lr
entry:
  %sum = fadd float %src1, %src2
  %lane0 = insertelement <4 x float> undef, float %sum, i32 0
  %splat = shufflevector <4 x float> %lane0, <4 x float> undef, <4 x i32> zeroinitializer
  ret <4 x float> %splat
}

; Same as vdup_f32_1, but the splat is built on <4 x i32> and bitcast on the
; way in and out; the expected output matches vdup_f32_1.
define arm_aapcs_vfpcc <4 x float> @vdup_f32_1bc(float %src) {
; CHECK-LABEL: vdup_f32_1bc:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov r0, s0
; CHECK-NEXT:    vdup.32 q0, r0
; CHECK-NEXT:    bx lr
entry:
  %bits = bitcast float %src to i32
  %lane0 = insertelement <4 x i32> undef, i32 %bits, i32 0
  %splat = shufflevector <4 x i32> %lane0, <4 x i32> undef, <4 x i32> zeroinitializer
  %asfloat = bitcast <4 x i32> %splat to <4 x float>
  ret <4 x float> %asfloat
}

; Same as vdup_f32_2, but the splat is built on <4 x i32> and bitcast on the
; way in and out; the expected output matches vdup_f32_2.
define arm_aapcs_vfpcc <4 x float> @vdup_f32_2bc(float %src1, float %src2) {
; CHECK-LABEL: vdup_f32_2bc:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vadd.f32 s0, s0, s1
; CHECK-NEXT:    vmov r0, s0
; CHECK-NEXT:    vdup.32 q0, r0
; CHECK-NEXT:    bx lr
entry:
  %sum = fadd float %src1, %src2
  %bits = bitcast float %sum to i32
  %lane0 = insertelement <4 x i32> undef, i32 %bits, i32 0
  %splat = shufflevector <4 x i32> %lane0, <4 x i32> undef, <4 x i32> zeroinitializer
  %asfloat = bitcast <4 x i32> %splat to <4 x float>
  ret <4 x float> %asfloat
}

; TODO: The calling convention needs fixing so that half values can be passed
; directly to functions; the half tests in this file take their operands
; through pointers and load them.

; f16 splat of a computed value (the sum of two loaded halves).
define arm_aapcs_vfpcc <8 x half> @vdup_f16(half* %src1, half* %src2) {
; CHECK-LABEL: vdup_f16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldr.16 s0, [r1]
; CHECK-NEXT:    vldr.16 s2, [r0]
; CHECK-NEXT:    vadd.f16 s0, s2, s0
; CHECK-NEXT:    vmov.f16 r0, s0
; CHECK-NEXT:    vdup.16 q0, r0
; CHECK-NEXT:    bx lr
entry:
  %lhs = load half, half* %src1, align 2
  %rhs = load half, half* %src2, align 2
  %sum = fadd half %lhs, %rhs
  %lane0 = insertelement <8 x half> undef, half %sum, i32 0
  %splat = shufflevector <8 x half> %lane0, <8 x half> undef, <8 x i32> zeroinitializer
  ret <8 x half> %splat
}

; Same as vdup_f16, but the splat is built on <8 x i16> and bitcast on the way
; in and out; the expected output matches vdup_f16.
define arm_aapcs_vfpcc <8 x half> @vdup_f16_bc(half* %src1, half* %src2) {
; CHECK-LABEL: vdup_f16_bc:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldr.16 s0, [r1]
; CHECK-NEXT:    vldr.16 s2, [r0]
; CHECK-NEXT:    vadd.f16 s0, s2, s0
; CHECK-NEXT:    vmov.f16 r0, s0
; CHECK-NEXT:    vdup.16 q0, r0
; CHECK-NEXT:    bx lr
entry:
  %lhs = load half, half* %src1, align 2
  %rhs = load half, half* %src2, align 2
  %sum = fadd half %lhs, %rhs
  %bits = bitcast half %sum to i16
  %lane0 = insertelement <8 x i16> undef, i16 %bits, i32 0
  %splat = shufflevector <8 x i16> %lane0, <8 x i16> undef, <8 x i32> zeroinitializer
  %ashalf = bitcast <8 x i16> %splat to <8 x half>
  ret <8 x half> %ashalf
}

; f64 splat: the expected output contains no vdup; the two 32-bit halves of the
; incoming d0 (s0, s1) are copied into s2 and s3.
define arm_aapcs_vfpcc <2 x double> @vdup_f64(double %src) {
; CHECK-LABEL: vdup_f64:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    @ kill: def $d0 killed $d0 def $q0
; CHECK-NEXT:    vmov.f32 s2, s0
; CHECK-NEXT:    vmov.f32 s3, s1
; CHECK-NEXT:    bx lr
entry:
  %lane0 = insertelement <2 x double> undef, double %src, i32 0
  %splat = shufflevector <2 x double> %lane0, <2 x double> undef, <2 x i32> zeroinitializer
  ret <2 x double> %splat
}

;------------------------------------------------------------------------------
; Lane duplication: broadcast one lane of an existing vector.
;------------------------------------------------------------------------------

; Broadcast lane 3 of a <4 x i32>: lane is moved to r0, then vdup.32.
define arm_aapcs_vfpcc <4 x i32> @vduplane_i32(<4 x i32> %src) {
; CHECK-LABEL: vduplane_i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov r0, s3
; CHECK-NEXT:    vdup.32 q0, r0
; CHECK-NEXT:    bx lr
entry:
  %dup = shufflevector <4 x i32> %src, <4 x i32> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
  ret <4 x i32> %dup
}

; Broadcast lane 3 of an <8 x i16>: lane is extracted to r0, then vdup.16.
define arm_aapcs_vfpcc <8 x i16> @vduplane_i16(<8 x i16> %src) {
; CHECK-LABEL: vduplane_i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov.u16 r0, q0[3]
; CHECK-NEXT:    vdup.16 q0, r0
; CHECK-NEXT:    bx lr
entry:
  %dup = shufflevector <8 x i16> %src, <8 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
  ret <8 x i16> %dup
}

; Broadcast lane 3 of a <16 x i8>: lane is extracted to r0, then vdup.8.
define arm_aapcs_vfpcc <16 x i8> @vduplane_i8(<16 x i8> %src) {
; CHECK-LABEL: vduplane_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov.u8 r0, q0[3]
; CHECK-NEXT:    vdup.8 q0, r0
; CHECK-NEXT:    bx lr
entry:
  %dup = shufflevector <16 x i8> %src, <16 x i8> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
  ret <16 x i8> %dup
}

; Broadcast lane 1 of a <2 x i64>: the expected output copies s2/s3 over s0/s1.
define arm_aapcs_vfpcc <2 x i64> @vduplane_i64(<2 x i64> %src) {
; CHECK-LABEL: vduplane_i64:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov.f32 s0, s2
; CHECK-NEXT:    vmov.f32 s1, s3
; CHECK-NEXT:    bx lr
entry:
  %dup = shufflevector <2 x i64> %src, <2 x i64> undef, <2 x i32> <i32 1, i32 1>
  ret <2 x i64> %dup
}

; Broadcast lane 3 of a <4 x float>: same expected output as vduplane_i32.
define arm_aapcs_vfpcc <4 x float> @vduplane_f32(<4 x float> %src) {
; CHECK-LABEL: vduplane_f32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov r0, s3
; CHECK-NEXT:    vdup.32 q0, r0
; CHECK-NEXT:    bx lr
entry:
  %dup = shufflevector <4 x float> %src, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
  ret <4 x float> %dup
}

; Broadcast lane 3 of an <8 x half>: same expected output as vduplane_i16.
define arm_aapcs_vfpcc <8 x half> @vduplane_f16(<8 x half> %src) {
; CHECK-LABEL: vduplane_f16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov.u16 r0, q0[3]
; CHECK-NEXT:    vdup.16 q0, r0
; CHECK-NEXT:    bx lr
entry:
  %dup = shufflevector <8 x half> %src, <8 x half> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
  ret <8 x half> %dup
}

; Broadcast lane 1 of a <2 x double>: same expected output as vduplane_i64.
define arm_aapcs_vfpcc <2 x double> @vduplane_f64(<2 x double> %src) {
; CHECK-LABEL: vduplane_f64:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov.f32 s0, s2
; CHECK-NEXT:    vmov.f32 s1, s3
; CHECK-NEXT:    bx lr
entry:
  %dup = shufflevector <2 x double> %src, <2 x double> undef, <2 x i32> <i32 1, i32 1>
  ret <2 x double> %dup
}

;------------------------------------------------------------------------------
; Splat followed by an extract of one lane.
;------------------------------------------------------------------------------

; Extracting lane 2 of a splat of %src: the expected output is just the return,
; with no vdup or register move.
define arm_aapcs_vfpcc float @vdup_f32_extract(float %src) {
; CHECK-LABEL: vdup_f32_extract:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    bx lr
entry:
  %bits = bitcast float %src to i32
  %lane0 = insertelement <4 x i32> undef, i32 %bits, i32 0
  %splat = shufflevector <4 x i32> %lane0, <4 x i32> undef, <4 x i32> zeroinitializer
  %asfloat = bitcast <4 x i32> %splat to <4 x float>
  %elt = extractelement <4 x float> %asfloat, i32 2
  ret float %elt
}

; Extracting lane 2 of a splat of a computed half: the expected output ends
; after the vadd.f16, with no vdup.
define arm_aapcs_vfpcc half @vdup_f16_extract(half* %src1, half* %src2) {
; CHECK-LABEL: vdup_f16_extract:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldr.16 s0, [r1]
; CHECK-NEXT:    vldr.16 s2, [r0]
; CHECK-NEXT:    vadd.f16 s0, s2, s0
; CHECK-NEXT:    bx lr
entry:
  %lhs = load half, half* %src1, align 2
  %rhs = load half, half* %src2, align 2
  %sum = fadd half %lhs, %rhs
  %bits = bitcast half %sum to i16
  %lane0 = insertelement <8 x i16> undef, i16 %bits, i32 0
  %splat = shufflevector <8 x i16> %lane0, <8 x i16> undef, <8 x i32> zeroinitializer
  %ashalf = bitcast <8 x i16> %splat to <8 x half>
  %elt = extractelement <8 x half> %ashalf, i32 2
  ret half %elt
}