; RUN: llc < %s -mattr=+neon | FileCheck %s
target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:64:64-v128:128:128-a0:0:32"
target triple = "thumbv7-elf"

define i32 @vget_lanes8(<8 x i8>* %A) nounwind {
;CHECK-LABEL: vget_lanes8:
;CHECK: vmov.s8
  %tmp1 = load <8 x i8>* %A
  %tmp2 = extractelement <8 x i8> %tmp1, i32 1
  %tmp3 = sext i8 %tmp2 to i32
  ret i32 %tmp3
}

define i32 @vget_lanes16(<4 x i16>* %A) nounwind {
;CHECK-LABEL: vget_lanes16:
;CHECK: vmov.s16
  %tmp1 = load <4 x i16>* %A
  %tmp2 = extractelement <4 x i16> %tmp1, i32 1
  %tmp3 = sext i16 %tmp2 to i32
  ret i32 %tmp3
}

define i32 @vget_laneu8(<8 x i8>* %A) nounwind {
;CHECK-LABEL: vget_laneu8:
;CHECK: vmov.u8
  %tmp1 = load <8 x i8>* %A
  %tmp2 = extractelement <8 x i8> %tmp1, i32 1
  %tmp3 = zext i8 %tmp2 to i32
  ret i32 %tmp3
}

define i32 @vget_laneu16(<4 x i16>* %A) nounwind {
;CHECK-LABEL: vget_laneu16:
;CHECK: vmov.u16
  %tmp1 = load <4 x i16>* %A
  %tmp2 = extractelement <4 x i16> %tmp1, i32 1
  %tmp3 = zext i16 %tmp2 to i32
  ret i32 %tmp3
}

; Do a vector add to keep the extraction from being done directly from memory.
define i32 @vget_lanei32(<2 x i32>* %A) nounwind {
;CHECK-LABEL: vget_lanei32:
;CHECK: vmov.32
  %tmp1 = load <2 x i32>* %A
  %tmp2 = add <2 x i32> %tmp1, %tmp1
  %tmp3 = extractelement <2 x i32> %tmp2, i32 1
  ret i32 %tmp3
}

define i32 @vgetQ_lanes8(<16 x i8>* %A) nounwind {
;CHECK-LABEL: vgetQ_lanes8:
;CHECK: vmov.s8
  %tmp1 = load <16 x i8>* %A
  %tmp2 = extractelement <16 x i8> %tmp1, i32 1
  %tmp3 = sext i8 %tmp2 to i32
  ret i32 %tmp3
}

define i32 @vgetQ_lanes16(<8 x i16>* %A) nounwind {
;CHECK-LABEL: vgetQ_lanes16:
;CHECK: vmov.s16
  %tmp1 = load <8 x i16>* %A
  %tmp2 = extractelement <8 x i16> %tmp1, i32 1
  %tmp3 = sext i16 %tmp2 to i32
  ret i32 %tmp3
}

define i32 @vgetQ_laneu8(<16 x i8>* %A) nounwind {
;CHECK-LABEL: vgetQ_laneu8:
;CHECK: vmov.u8
  %tmp1 = load <16 x i8>* %A
  %tmp2 = extractelement <16 x i8> %tmp1, i32 1
  %tmp3 = zext i8 %tmp2 to i32
  ret i32 %tmp3
}

define i32 @vgetQ_laneu16(<8 x i16>* %A) nounwind {
;CHECK-LABEL: vgetQ_laneu16:
;CHECK: vmov.u16
  %tmp1 = load <8 x i16>* %A
  %tmp2 = extractelement <8 x i16> %tmp1, i32 1
  %tmp3 = zext i16 %tmp2 to i32
  ret i32 %tmp3
}

; Do a vector add to keep the extraction from being done directly from memory.
define i32 @vgetQ_lanei32(<4 x i32>* %A) nounwind {
;CHECK-LABEL: vgetQ_lanei32:
;CHECK: vmov.32
  %tmp1 = load <4 x i32>* %A
  %tmp2 = add <4 x i32> %tmp1, %tmp1
  %tmp3 = extractelement <4 x i32> %tmp2, i32 1
  ret i32 %tmp3
}

define arm_aapcs_vfpcc void @test_vget_laneu16() nounwind {
entry:
; CHECK: vmov.u16 r0, d{{.*}}[1]
  %arg0_uint16x4_t = alloca <4 x i16>             ; <<4 x i16>*> [#uses=1]
  %out_uint16_t = alloca i16                      ; <i16*> [#uses=1]
  %"alloca point" = bitcast i32 0 to i32          ; <i32> [#uses=0]
  %0 = load <4 x i16>* %arg0_uint16x4_t, align 8  ; <<4 x i16>> [#uses=1]
  %1 = extractelement <4 x i16> %0, i32 1         ; <i16> [#uses=1]
  %2 = add i16 %1, %1
  store i16 %2, i16* %out_uint16_t, align 2
  br label %return

return:                                           ; preds = %entry
  ret void
}

define arm_aapcs_vfpcc void @test_vget_laneu8() nounwind {
entry:
; CHECK: vmov.u8 r0, d{{.*}}[1]
  %arg0_uint8x8_t = alloca <8 x i8>               ; <<8 x i8>*> [#uses=1]
  %out_uint8_t = alloca i8                        ; <i8*> [#uses=1]
  %"alloca point" = bitcast i32 0 to i32          ; <i32> [#uses=0]
  %0 = load <8 x i8>* %arg0_uint8x8_t, align 8    ; <<8 x i8>> [#uses=1]
  %1 = extractelement <8 x i8> %0, i32 1          ; <i8> [#uses=1]
  %2 = add i8 %1, %1
  store i8 %2, i8* %out_uint8_t, align 1
  br label %return

return:                                           ; preds = %entry
  ret void
}

define arm_aapcs_vfpcc void @test_vgetQ_laneu16() nounwind {
entry:
; CHECK: vmov.u16 r0, d{{.*}}[1]
  %arg0_uint16x8_t = alloca <8 x i16>             ; <<8 x i16>*> [#uses=1]
  %out_uint16_t = alloca i16                      ; <i16*> [#uses=1]
  %"alloca point" = bitcast i32 0 to i32          ; <i32> [#uses=0]
  %0 = load <8 x i16>* %arg0_uint16x8_t, align 16 ; <<8 x i16>> [#uses=1]
  %1 = extractelement <8 x i16> %0, i32 1         ; <i16> [#uses=1]
  %2 = add i16 %1, %1
  store i16 %2, i16* %out_uint16_t, align 2
  br label %return

return:                                           ; preds = %entry
  ret void
}

define arm_aapcs_vfpcc void @test_vgetQ_laneu8() nounwind {
entry:
; CHECK: vmov.u8 r0, d{{.*}}[1]
  %arg0_uint8x16_t = alloca <16 x i8>             ; <<16 x i8>*> [#uses=1]
  %out_uint8_t = alloca i8                        ; <i8*> [#uses=1]
  %"alloca point" = bitcast i32 0 to i32          ; <i32> [#uses=0]
  %0 = load <16 x i8>* %arg0_uint8x16_t, align 16 ; <<16 x i8>> [#uses=1]
  %1 = extractelement <16 x i8> %0, i32 1         ; <i8> [#uses=1]
  %2 = add i8 %1, %1
  store i8 %2, i8* %out_uint8_t, align 1
  br label %return

return:                                           ; preds = %entry
  ret void
}

define <8 x i8> @vset_lane8(<8 x i8>* %A, i8 %B) nounwind {
;CHECK-LABEL: vset_lane8:
;CHECK: vmov.8
  %tmp1 = load <8 x i8>* %A
  %tmp2 = insertelement <8 x i8> %tmp1, i8 %B, i32 1
  ret <8 x i8> %tmp2
}

define <4 x i16> @vset_lane16(<4 x i16>* %A, i16 %B) nounwind {
;CHECK-LABEL: vset_lane16:
;CHECK: vmov.16
  %tmp1 = load <4 x i16>* %A
  %tmp2 = insertelement <4 x i16> %tmp1, i16 %B, i32 1
  ret <4 x i16> %tmp2
}

define <2 x i32> @vset_lane32(<2 x i32>* %A, i32 %B) nounwind {
;CHECK-LABEL: vset_lane32:
;CHECK: vmov.32
  %tmp1 = load <2 x i32>* %A
  %tmp2 = insertelement <2 x i32> %tmp1, i32 %B, i32 1
  ret <2 x i32> %tmp2
}

define <16 x i8> @vsetQ_lane8(<16 x i8>* %A, i8 %B) nounwind {
;CHECK-LABEL: vsetQ_lane8:
;CHECK: vmov.8
  %tmp1 = load <16 x i8>* %A
  %tmp2 = insertelement <16 x i8> %tmp1, i8 %B, i32 1
  ret <16 x i8> %tmp2
}

define <8 x i16> @vsetQ_lane16(<8 x i16>* %A, i16 %B) nounwind {
;CHECK-LABEL: vsetQ_lane16:
;CHECK: vmov.16
  %tmp1 = load <8 x i16>* %A
  %tmp2 = insertelement <8 x i16> %tmp1, i16 %B, i32 1
  ret <8 x i16> %tmp2
}

define <4 x i32> @vsetQ_lane32(<4 x i32>* %A, i32 %B) nounwind {
;CHECK-LABEL: vsetQ_lane32:
;CHECK: vmov.32 d{{.*}}[1], r1
  %tmp1 = load <4 x i32>* %A
  %tmp2 = insertelement <4 x i32> %tmp1, i32 %B, i32 1
  ret <4 x i32> %tmp2
}

define arm_aapcs_vfpcc <2 x float> @test_vset_lanef32(float %arg0_float32_t, <2 x float> %arg1_float32x2_t) nounwind {
;CHECK-LABEL: test_vset_lanef32:
;CHECK: vmov.f32 s3, s0
;CHECK: vmov.f64 d0, d1
entry:
  %0 = insertelement <2 x float> %arg1_float32x2_t, float %arg0_float32_t, i32 1 ; <<2 x float>> [#uses=1]
  ret <2 x float> %0
}

; The llvm extractelement instruction does not require that the lane number
; be an immediate constant. Make sure a variable lane number is handled.

define i32 @vget_variable_lanes8(<8 x i8>* %A, i32 %B) nounwind {
  %tmp1 = load <8 x i8>* %A
  %tmp2 = extractelement <8 x i8> %tmp1, i32 %B
  %tmp3 = sext i8 %tmp2 to i32
  ret i32 %tmp3
}

define i32 @vgetQ_variable_lanei32(<4 x i32>* %A, i32 %B) nounwind {
  %tmp1 = load <4 x i32>* %A
  %tmp2 = add <4 x i32> %tmp1, %tmp1
  %tmp3 = extractelement <4 x i32> %tmp2, i32 %B
  ret i32 %tmp3
}