; RUN: llc < %s -march=arm -mattr=+neon,+fp16 | FileCheck %s
; Codegen test: NEON vector float<->int conversions (vcvt), the fixed-point
; "_n" intrinsic forms, and the fp16 half<->single conversions.

define <2 x i32> @vcvt_f32tos32(<2 x float>* %A) nounwind {
;CHECK: vcvt_f32tos32:
;CHECK: vcvt.s32.f32
	%tmp1 = load <2 x float>* %A
	%tmp2 = fptosi <2 x float> %tmp1 to <2 x i32>
	ret <2 x i32> %tmp2
}

define <2 x i32> @vcvt_f32tou32(<2 x float>* %A) nounwind {
;CHECK: vcvt_f32tou32:
;CHECK: vcvt.u32.f32
	%tmp1 = load <2 x float>* %A
	%tmp2 = fptoui <2 x float> %tmp1 to <2 x i32>
	ret <2 x i32> %tmp2
}

define <2 x float> @vcvt_s32tof32(<2 x i32>* %A) nounwind {
;CHECK: vcvt_s32tof32:
;CHECK: vcvt.f32.s32
	%tmp1 = load <2 x i32>* %A
	%tmp2 = sitofp <2 x i32> %tmp1 to <2 x float>
	ret <2 x float> %tmp2
}

define <2 x float> @vcvt_u32tof32(<2 x i32>* %A) nounwind {
;CHECK: vcvt_u32tof32:
;CHECK: vcvt.f32.u32
	%tmp1 = load <2 x i32>* %A
	%tmp2 = uitofp <2 x i32> %tmp1 to <2 x float>
	ret <2 x float> %tmp2
}

define <4 x i32> @vcvtQ_f32tos32(<4 x float>* %A) nounwind {
;CHECK: vcvtQ_f32tos32:
;CHECK: vcvt.s32.f32
	%tmp1 = load <4 x float>* %A
	%tmp2 = fptosi <4 x float> %tmp1 to <4 x i32>
	ret <4 x i32> %tmp2
}

define <4 x i32> @vcvtQ_f32tou32(<4 x float>* %A) nounwind {
;CHECK: vcvtQ_f32tou32:
;CHECK: vcvt.u32.f32
	%tmp1 = load <4 x float>* %A
	%tmp2 = fptoui <4 x float> %tmp1 to <4 x i32>
	ret <4 x i32> %tmp2
}

define <4 x float> @vcvtQ_s32tof32(<4 x i32>* %A) nounwind {
;CHECK: vcvtQ_s32tof32:
;CHECK: vcvt.f32.s32
	%tmp1 = load <4 x i32>* %A
	%tmp2 = sitofp <4 x i32> %tmp1 to <4 x float>
	ret <4 x float> %tmp2
}

define <4 x float> @vcvtQ_u32tof32(<4 x i32>* %A) nounwind {
;CHECK: vcvtQ_u32tof32:
;CHECK: vcvt.f32.u32
	%tmp1 = load <4 x i32>* %A
	%tmp2 = uitofp <4 x i32> %tmp1 to <4 x float>
	ret <4 x float> %tmp2
}

define <2 x i32> @vcvt_n_f32tos32(<2 x float>* %A) nounwind {
;CHECK: vcvt_n_f32tos32:
;CHECK: vcvt.s32.f32
	%tmp1 = load <2 x float>* %A
	%tmp2 = call <2 x i32> @llvm.arm.neon.vcvtfp2fxs.v2i32.v2f32(<2 x float> %tmp1, i32 1)
	ret <2 x i32> %tmp2
}

define <2 x i32> @vcvt_n_f32tou32(<2 x float>* %A) nounwind {
;CHECK: vcvt_n_f32tou32:
;CHECK: vcvt.u32.f32
	%tmp1 = load <2 x float>* %A
	%tmp2 = call <2 x i32> @llvm.arm.neon.vcvtfp2fxu.v2i32.v2f32(<2 x float> %tmp1, i32 1)
	ret <2 x i32> %tmp2
}

define <2 x float> @vcvt_n_s32tof32(<2 x i32>* %A) nounwind {
;CHECK: vcvt_n_s32tof32:
;CHECK: vcvt.f32.s32
	%tmp1 = load <2 x i32>* %A
	%tmp2 = call <2 x float> @llvm.arm.neon.vcvtfxs2fp.v2f32.v2i32(<2 x i32> %tmp1, i32 1)
	ret <2 x float> %tmp2
}

define <2 x float> @vcvt_n_u32tof32(<2 x i32>* %A) nounwind {
;CHECK: vcvt_n_u32tof32:
;CHECK: vcvt.f32.u32
	%tmp1 = load <2 x i32>* %A
	%tmp2 = call <2 x float> @llvm.arm.neon.vcvtfxu2fp.v2f32.v2i32(<2 x i32> %tmp1, i32 1)
	ret <2 x float> %tmp2
}

declare <2 x i32> @llvm.arm.neon.vcvtfp2fxs.v2i32.v2f32(<2 x float>, i32) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vcvtfp2fxu.v2i32.v2f32(<2 x float>, i32) nounwind readnone
declare <2 x float> @llvm.arm.neon.vcvtfxs2fp.v2f32.v2i32(<2 x i32>, i32) nounwind readnone
declare <2 x float> @llvm.arm.neon.vcvtfxu2fp.v2f32.v2i32(<2 x i32>, i32) nounwind readnone

define <4 x i32> @vcvtQ_n_f32tos32(<4 x float>* %A) nounwind {
;CHECK: vcvtQ_n_f32tos32:
;CHECK: vcvt.s32.f32
	%tmp1 = load <4 x float>* %A
	%tmp2 = call <4 x i32> @llvm.arm.neon.vcvtfp2fxs.v4i32.v4f32(<4 x float> %tmp1, i32 1)
	ret <4 x i32> %tmp2
}

define <4 x i32> @vcvtQ_n_f32tou32(<4 x float>* %A) nounwind {
;CHECK: vcvtQ_n_f32tou32:
;CHECK: vcvt.u32.f32
	%tmp1 = load <4 x float>* %A
	%tmp2 = call <4 x i32> @llvm.arm.neon.vcvtfp2fxu.v4i32.v4f32(<4 x float> %tmp1, i32 1)
	ret <4 x i32> %tmp2
}

define <4 x float> @vcvtQ_n_s32tof32(<4 x i32>* %A) nounwind {
;CHECK: vcvtQ_n_s32tof32:
;CHECK: vcvt.f32.s32
	%tmp1 = load <4 x i32>* %A
	%tmp2 = call <4 x float> @llvm.arm.neon.vcvtfxs2fp.v4f32.v4i32(<4 x i32> %tmp1, i32 1)
	ret <4 x float> %tmp2
}

define <4 x float> @vcvtQ_n_u32tof32(<4 x i32>* %A) nounwind {
;CHECK: vcvtQ_n_u32tof32:
;CHECK: vcvt.f32.u32
	%tmp1 = load <4 x i32>* %A
	%tmp2 = call <4 x float> @llvm.arm.neon.vcvtfxu2fp.v4f32.v4i32(<4 x i32> %tmp1, i32 1)
	ret <4 x float> %tmp2
}

declare <4 x i32> @llvm.arm.neon.vcvtfp2fxs.v4i32.v4f32(<4 x float>, i32) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vcvtfp2fxu.v4i32.v4f32(<4 x float>, i32) nounwind readnone
declare <4 x float> @llvm.arm.neon.vcvtfxs2fp.v4f32.v4i32(<4 x i32>, i32) nounwind readnone
declare <4 x float> @llvm.arm.neon.vcvtfxu2fp.v4f32.v4i32(<4 x i32>, i32) nounwind readnone

define <4 x float> @vcvt_f16tof32(<4 x i16>* %A) nounwind {
;CHECK: vcvt_f16tof32:
;CHECK: vcvt.f32.f16
	%tmp1 = load <4 x i16>* %A
	%tmp2 = call <4 x float> @llvm.arm.neon.vcvthf2fp(<4 x i16> %tmp1)
	ret <4 x float> %tmp2
}

define <4 x i16> @vcvt_f32tof16(<4 x float>* %A) nounwind {
;CHECK: vcvt_f32tof16:
;CHECK: vcvt.f16.f32
	%tmp1 = load <4 x float>* %A
	%tmp2 = call <4 x i16> @llvm.arm.neon.vcvtfp2hf(<4 x float> %tmp1)
	ret <4 x i16> %tmp2
}

declare <4 x float> @llvm.arm.neon.vcvthf2fp(<4 x i16>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vcvtfp2hf(<4 x float>) nounwind readnone

; We currently estimate the cost of sext/zext/trunc v8(v16)i32 <-> v8(v16)i8
; instructions as expensive. If lowering is improved the cost model needs to
; change.
; RUN: opt < %s -cost-model -analyze -mtriple=thumbv7-apple-ios6.0.0 -march=arm -mcpu=cortex-a8 | FileCheck %s --check-prefix=COST
; Cost-model test: sext/zext/trunc between v8/v16 of i8 and i32 on Cortex-A8.
; Each function pins both the scalarized codegen (CHECK: strh/strb, one store
; per lane) and the cost-model estimate (COST: cost of N).

%T0_5 = type <8 x i8>
%T1_5 = type <8 x i32>
; CHECK: func_cvt5:
define void @func_cvt5(%T0_5* %loadaddr, %T1_5* %storeaddr) {
; CHECK: strh
; CHECK: strh
; CHECK: strh
; CHECK: strh
; CHECK: strh
; CHECK: strh
; CHECK: strh
; CHECK: strh
  %v0 = load %T0_5* %loadaddr
; COST: func_cvt5
; COST: cost of 24 {{.*}} sext
  %r = sext %T0_5 %v0 to %T1_5
  store %T1_5 %r, %T1_5* %storeaddr
  ret void
}
;; We currently estimate the cost of this instruction as expensive. If lowering
;; is improved the cost needs to change.
%TA0_5 = type <8 x i8>
%TA1_5 = type <8 x i32>
; CHECK: func_cvt1:
define void @func_cvt1(%TA0_5* %loadaddr, %TA1_5* %storeaddr) {
; CHECK: strh
; CHECK: strh
; CHECK: strh
; CHECK: strh
; CHECK: strh
; CHECK: strh
; CHECK: strh
; CHECK: strh
  %v0 = load %TA0_5* %loadaddr
; COST: func_cvt1
; COST: cost of 22 {{.*}} zext
  %r = zext %TA0_5 %v0 to %TA1_5
  store %TA1_5 %r, %TA1_5* %storeaddr
  ret void
}
;; We currently estimate the cost of this instruction as expensive. If lowering
;; is improved the cost needs to change.
%T0_51 = type <8 x i32>
%T1_51 = type <8 x i8>
; CHECK: func_cvt51:
define void @func_cvt51(%T0_51* %loadaddr, %T1_51* %storeaddr) {
; CHECK: strb
; CHECK: strb
; CHECK: strb
; CHECK: strb
; CHECK: strb
; CHECK: strb
; CHECK: strb
; CHECK: strb
  %v0 = load %T0_51* %loadaddr
; COST: func_cvt51
; COST: cost of 19 {{.*}} trunc
  %r = trunc %T0_51 %v0 to %T1_51
  store %T1_51 %r, %T1_51* %storeaddr
  ret void
}
;; We currently estimate the cost of this instruction as expensive. If lowering
;; is improved the cost needs to change.
%TT0_5 = type <16 x i8>
%TT1_5 = type <16 x i32>
; CHECK: func_cvt52:
define void @func_cvt52(%TT0_5* %loadaddr, %TT1_5* %storeaddr) {
; CHECK: strh
; CHECK: strh
; CHECK: strh
; CHECK: strh
; CHECK: strh
; CHECK: strh
; CHECK: strh
; CHECK: strh
; CHECK: strh
; CHECK: strh
; CHECK: strh
; CHECK: strh
; CHECK: strh
; CHECK: strh
; CHECK: strh
; CHECK: strh
  %v0 = load %TT0_5* %loadaddr
; COST: func_cvt52
; COST: cost of 48 {{.*}} sext
  %r = sext %TT0_5 %v0 to %TT1_5
  store %TT1_5 %r, %TT1_5* %storeaddr
  ret void
}
;; We currently estimate the cost of this instruction as expensive. If lowering
;; is improved the cost needs to change.
%TTA0_5 = type <16 x i8>
%TTA1_5 = type <16 x i32>
; CHECK: func_cvt12:
define void @func_cvt12(%TTA0_5* %loadaddr, %TTA1_5* %storeaddr) {
; CHECK: strh
; CHECK: strh
; CHECK: strh
; CHECK: strh
; CHECK: strh
; CHECK: strh
; CHECK: strh
; CHECK: strh
; CHECK: strh
; CHECK: strh
; CHECK: strh
; CHECK: strh
; CHECK: strh
; CHECK: strh
; CHECK: strh
; CHECK: strh
  %v0 = load %TTA0_5* %loadaddr
; COST: func_cvt12
; COST: cost of 44 {{.*}} zext
  %r = zext %TTA0_5 %v0 to %TTA1_5
  store %TTA1_5 %r, %TTA1_5* %storeaddr
  ret void
}
;; We currently estimate the cost of this instruction as expensive. If lowering
;; is improved the cost needs to change.
%TT0_51 = type <16 x i32>
%TT1_51 = type <16 x i8>
; CHECK: func_cvt512:
define void @func_cvt512(%TT0_51* %loadaddr, %TT1_51* %storeaddr) {
; CHECK: strb
; CHECK: strb
; CHECK: strb
; CHECK: strb
; CHECK: strb
; CHECK: strb
; CHECK: strb
; CHECK: strb
; CHECK: strb
; CHECK: strb
; CHECK: strb
; CHECK: strb
; CHECK: strb
; CHECK: strb
; CHECK: strb
; CHECK: strb
  %v0 = load %TT0_51* %loadaddr
; COST: func_cvt512
; COST: cost of 38 {{.*}} trunc
  %r = trunc %TT0_51 %v0 to %TT1_51
  store %TT1_51 %r, %TT1_51* %storeaddr
  ret void
}