; RUN: llc -mtriple=arm-eabi -mattr=+neon %s -o - | FileCheck %s

; Checks the NEON instructions selected for vector bit-counting operations:
;   * llvm.ctpop         -> vcnt (plus extra shuffle/widen steps for i16/i32 lanes)
;   * llvm.ctlz          -> vclz
;   * llvm.arm.neon.vcls -> vcls
; Every function loads one vector through %A, applies the intrinsic and
; returns the result. Functions without a "Q" in the name use 64-bit vectors
; (d registers); the "Q" variants use 128-bit vectors (q registers).

;-----------------------------------------------------------------------------
; Population count: implement ctpop with vcnt.
;-----------------------------------------------------------------------------

; i8 lanes map directly onto a single vcnt.8.
define <8 x i8> @vcnt8(<8 x i8>* %A) nounwind {
; CHECK-LABEL: vcnt8:
; CHECK: vcnt.8 {{d[0-9]+}}, {{d[0-9]+}}
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = call <8 x i8> @llvm.ctpop.v8i8(<8 x i8> %tmp1)
  ret <8 x i8> %tmp2
}

define <16 x i8> @vcntQ8(<16 x i8>* %A) nounwind {
; CHECK-LABEL: vcntQ8:
; CHECK: vcnt.8 {{q[0-9]+}}, {{q[0-9]+}}
  %tmp1 = load <16 x i8>, <16 x i8>* %A
  %tmp2 = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %tmp1)
  ret <16 x i8> %tmp2
}

; i16 lanes: the expected sequence is a byte-wise vcnt followed by
; vrev16 / vadd / vuzp / vmovl.
define <4 x i16> @vcnt16(<4 x i16>* %A) nounwind {
; CHECK-LABEL: vcnt16:
; CHECK: vcnt.8 {{d[0-9]+}}, {{d[0-9]+}}
; CHECK: vrev16.8 {{d[0-9]+}}, {{d[0-9]+}}
; CHECK: vadd.i8 {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
; CHECK: vuzp.8 {{d[0-9]+}}, {{d[0-9]+}}
; CHECK: vmovl.u8 {{q[0-9]+}}, {{d[0-9]+}}
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = call <4 x i16> @llvm.ctpop.v4i16(<4 x i16> %tmp1)
  ret <4 x i16> %tmp2
}

define <8 x i16> @vcntQ16(<8 x i16>* %A) nounwind {
; CHECK-LABEL: vcntQ16:
; CHECK: vcnt.8 {{q[0-9]+}}, {{q[0-9]+}}
; CHECK: vrev16.8 {{q[0-9]+}}, {{q[0-9]+}}
; CHECK: vadd.i8 {{q[0-9]+}}, {{q[0-9]+}}, {{q[0-9]+}}
; CHECK: vuzp.8 {{q[0-9]+}}, {{q[0-9]+}}
; CHECK: vmovl.u8 {{q[0-9]+}}, {{d[0-9]+}}
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %tmp1)
  ret <8 x i16> %tmp2
}

; i32 lanes: the i16 sequence above, followed by a second
; vrev32 / vuzp / vmovl round operating on 16-bit elements.
define <2 x i32> @vcnt32(<2 x i32>* %A) nounwind {
; CHECK-LABEL: vcnt32:
; CHECK: vcnt.8 {{d[0-9]+}}, {{d[0-9]+}}
; CHECK: vrev16.8 {{d[0-9]+}}, {{d[0-9]+}}
; CHECK: vadd.i8 {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
; CHECK: vuzp.8 {{d[0-9]+}}, {{d[0-9]+}}
; CHECK: vmovl.u8 {{q[0-9]+}}, {{d[0-9]+}}
; CHECK: vrev32.16 {{d[0-9]+}}, {{d[0-9]+}}
; CHECK: vuzp.16 {{d[0-9]+}}, {{d[0-9]+}}
; CHECK: vmovl.u16 {{q[0-9]+}}, {{d[0-9]+}}
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> %tmp1)
  ret <2 x i32> %tmp2
}

define <4 x i32> @vcntQ32(<4 x i32>* %A) nounwind {
; CHECK-LABEL: vcntQ32:
; CHECK: vcnt.8 {{q[0-9]+}}, {{q[0-9]+}}
; CHECK: vrev16.8 {{q[0-9]+}}, {{q[0-9]+}}
; CHECK: vadd.i8 {{q[0-9]+}}, {{q[0-9]+}}, {{q[0-9]+}}
; CHECK: vuzp.8 {{q[0-9]+}}, {{q[0-9]+}}
; CHECK: vmovl.u8 {{q[0-9]+}}, {{d[0-9]+}}
; CHECK: vrev32.16 {{q[0-9]+}}, {{q[0-9]+}}
; CHECK: vuzp.16 {{q[0-9]+}}, {{q[0-9]+}}
; CHECK: vmovl.u16 {{q[0-9]+}}, {{d[0-9]+}}
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp2 = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %tmp1)
  ret <4 x i32> %tmp2
}

; i64 lanes: no instruction sequence is pinned here; only the function label
; is matched, so these two cases just require that llc emits the function.
define <1 x i64> @vcnt64(<1 x i64>* %A) nounwind {
; CHECK-LABEL: vcnt64:
  %tmp1 = load <1 x i64>, <1 x i64>* %A
  %tmp2 = call <1 x i64> @llvm.ctpop.v1i64(<1 x i64> %tmp1)
  ret <1 x i64> %tmp2
}

define <2 x i64> @vcntQ64(<2 x i64>* %A) nounwind {
; CHECK-LABEL: vcntQ64:
  %tmp1 = load <2 x i64>, <2 x i64>* %A
  %tmp2 = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %tmp1)
  ret <2 x i64> %tmp2
}

declare <8 x i8>  @llvm.ctpop.v8i8(<8 x i8>) nounwind readnone
declare <16 x i8> @llvm.ctpop.v16i8(<16 x i8>) nounwind readnone
declare <4 x i16> @llvm.ctpop.v4i16(<4 x i16>) nounwind readnone
declare <8 x i16> @llvm.ctpop.v8i16(<8 x i16>) nounwind readnone
declare <2 x i32> @llvm.ctpop.v2i32(<2 x i32>) nounwind readnone
declare <4 x i32> @llvm.ctpop.v4i32(<4 x i32>) nounwind readnone
declare <1 x i64> @llvm.ctpop.v1i64(<1 x i64>) nounwind readnone
declare <2 x i64> @llvm.ctpop.v2i64(<2 x i64>) nounwind readnone

;-----------------------------------------------------------------------------
; Count leading zeros: llvm.ctlz is expected to select vclz with the matching
; element size. All calls pass i1 0 as the second operand.
;-----------------------------------------------------------------------------

define <8 x i8> @vclz8(<8 x i8>* %A) nounwind {
; CHECK-LABEL: vclz8:
; CHECK: vclz.i8 {{d[0-9]+}}, {{d[0-9]+}}
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = call <8 x i8> @llvm.ctlz.v8i8(<8 x i8> %tmp1, i1 0)
  ret <8 x i8> %tmp2
}

define <4 x i16> @vclz16(<4 x i16>* %A) nounwind {
; CHECK-LABEL: vclz16:
; CHECK: vclz.i16 {{d[0-9]+}}, {{d[0-9]+}}
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = call <4 x i16> @llvm.ctlz.v4i16(<4 x i16> %tmp1, i1 0)
  ret <4 x i16> %tmp2
}

define <2 x i32> @vclz32(<2 x i32>* %A) nounwind {
; CHECK-LABEL: vclz32:
; CHECK: vclz.i32 {{d[0-9]+}}, {{d[0-9]+}}
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %tmp1, i1 0)
  ret <2 x i32> %tmp2
}

define <16 x i8> @vclzQ8(<16 x i8>* %A) nounwind {
; CHECK-LABEL: vclzQ8:
; CHECK: vclz.i8 {{q[0-9]+}}, {{q[0-9]+}}
  %tmp1 = load <16 x i8>, <16 x i8>* %A
  %tmp2 = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %tmp1, i1 0)
  ret <16 x i8> %tmp2
}

define <8 x i16> @vclzQ16(<8 x i16>* %A) nounwind {
; CHECK-LABEL: vclzQ16:
; CHECK: vclz.i16 {{q[0-9]+}}, {{q[0-9]+}}
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %tmp1, i1 0)
  ret <8 x i16> %tmp2
}

define <4 x i32> @vclzQ32(<4 x i32>* %A) nounwind {
; CHECK-LABEL: vclzQ32:
; CHECK: vclz.i32 {{q[0-9]+}}, {{q[0-9]+}}
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp2 = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %tmp1, i1 0)
  ret <4 x i32> %tmp2
}

declare <8 x i8>  @llvm.ctlz.v8i8(<8 x i8>, i1) nounwind readnone
declare <4 x i16> @llvm.ctlz.v4i16(<4 x i16>, i1) nounwind readnone
declare <2 x i32> @llvm.ctlz.v2i32(<2 x i32>, i1) nounwind readnone

declare <16 x i8> @llvm.ctlz.v16i8(<16 x i8>, i1) nounwind readnone
declare <8 x i16> @llvm.ctlz.v8i16(<8 x i16>, i1) nounwind readnone
declare <4 x i32> @llvm.ctlz.v4i32(<4 x i32>, i1) nounwind readnone

;-----------------------------------------------------------------------------
; Count leading sign bits: the llvm.arm.neon.vcls intrinsic is expected to
; select vcls. The operand patterns pin the register class so the 64-bit and
; 128-bit variants cannot be satisfied by each other's output.
;-----------------------------------------------------------------------------

define <8 x i8> @vclss8(<8 x i8>* %A) nounwind {
; CHECK-LABEL: vclss8:
; CHECK: vcls.s8 {{d[0-9]+}}, {{d[0-9]+}}
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = call <8 x i8> @llvm.arm.neon.vcls.v8i8(<8 x i8> %tmp1)
  ret <8 x i8> %tmp2
}

define <4 x i16> @vclss16(<4 x i16>* %A) nounwind {
; CHECK-LABEL: vclss16:
; CHECK: vcls.s16 {{d[0-9]+}}, {{d[0-9]+}}
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = call <4 x i16> @llvm.arm.neon.vcls.v4i16(<4 x i16> %tmp1)
  ret <4 x i16> %tmp2
}

define <2 x i32> @vclss32(<2 x i32>* %A) nounwind {
; CHECK-LABEL: vclss32:
; CHECK: vcls.s32 {{d[0-9]+}}, {{d[0-9]+}}
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = call <2 x i32> @llvm.arm.neon.vcls.v2i32(<2 x i32> %tmp1)
  ret <2 x i32> %tmp2
}

define <16 x i8> @vclsQs8(<16 x i8>* %A) nounwind {
; CHECK-LABEL: vclsQs8:
; CHECK: vcls.s8 {{q[0-9]+}}, {{q[0-9]+}}
  %tmp1 = load <16 x i8>, <16 x i8>* %A
  %tmp2 = call <16 x i8> @llvm.arm.neon.vcls.v16i8(<16 x i8> %tmp1)
  ret <16 x i8> %tmp2
}

define <8 x i16> @vclsQs16(<8 x i16>* %A) nounwind {
; CHECK-LABEL: vclsQs16:
; CHECK: vcls.s16 {{q[0-9]+}}, {{q[0-9]+}}
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = call <8 x i16> @llvm.arm.neon.vcls.v8i16(<8 x i16> %tmp1)
  ret <8 x i16> %tmp2
}

define <4 x i32> @vclsQs32(<4 x i32>* %A) nounwind {
; CHECK-LABEL: vclsQs32:
; CHECK: vcls.s32 {{q[0-9]+}}, {{q[0-9]+}}
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp2 = call <4 x i32> @llvm.arm.neon.vcls.v4i32(<4 x i32> %tmp1)
  ret <4 x i32> %tmp2
}

declare <8 x i8>  @llvm.arm.neon.vcls.v8i8(<8 x i8>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vcls.v4i16(<4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vcls.v2i32(<2 x i32>) nounwind readnone

declare <16 x i8> @llvm.arm.neon.vcls.v16i8(<16 x i8>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vcls.v8i16(<8 x i16>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vcls.v4i32(<4 x i32>) nounwind readnone