; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
; Implement ctpop with vcnt

; vcnt is only defined on 8-bit lanes, so ctpop of wider element types is
; lowered to a byte-wise vcnt followed by pairwise additions (vrev/vadd/vuzp)
; and widening moves (vmovl) to accumulate the per-byte counts.

define <8 x i8> @vcnt8(<8 x i8>* %A) nounwind {
;CHECK-LABEL: vcnt8:
;CHECK: vcnt.8 {{d[0-9]+}}, {{d[0-9]+}}
	%tmp1 = load <8 x i8>* %A
	%tmp2 = call <8 x i8> @llvm.ctpop.v8i8(<8 x i8> %tmp1)
	ret <8 x i8> %tmp2
}

define <16 x i8> @vcntQ8(<16 x i8>* %A) nounwind {
;CHECK-LABEL: vcntQ8:
;CHECK: vcnt.8 {{q[0-9]+}}, {{q[0-9]+}}
	%tmp1 = load <16 x i8>* %A
	%tmp2 = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %tmp1)
	ret <16 x i8> %tmp2
}

define <4 x i16> @vcnt16(<4 x i16>* %A) nounwind {
;CHECK-LABEL: vcnt16:
;CHECK: vcnt.8 {{d[0-9]+}}, {{d[0-9]+}}
;CHECK: vrev16.8 {{d[0-9]+}}, {{d[0-9]+}}
;CHECK: vadd.i8 {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
;CHECK: vuzp.8 {{d[0-9]+}}, {{d[0-9]+}}
;CHECK: vmovl.u8 {{q[0-9]+}}, {{d[0-9]+}}
	%tmp1 = load <4 x i16>* %A
	%tmp2 = call <4 x i16> @llvm.ctpop.v4i16(<4 x i16> %tmp1)
	ret <4 x i16> %tmp2
}

define <8 x i16> @vcntQ16(<8 x i16>* %A) nounwind {
;CHECK-LABEL: vcntQ16:
;CHECK: vcnt.8 {{q[0-9]+}}, {{q[0-9]+}}
;CHECK: vrev16.8 {{q[0-9]+}}, {{q[0-9]+}}
;CHECK: vadd.i8 {{q[0-9]+}}, {{q[0-9]+}}, {{q[0-9]+}}
;CHECK: vuzp.8 {{q[0-9]+}}, {{q[0-9]+}}
;CHECK: vmovl.u8 {{q[0-9]+}}, {{d[0-9]+}}
	%tmp1 = load <8 x i16>* %A
	%tmp2 = call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %tmp1)
	ret <8 x i16> %tmp2
}

define <2 x i32> @vcnt32(<2 x i32>* %A) nounwind {
;CHECK-LABEL: vcnt32:
;CHECK: vcnt.8 {{d[0-9]+}}, {{d[0-9]+}}
;CHECK: vrev16.8 {{d[0-9]+}}, {{d[0-9]+}}
;CHECK: vadd.i8 {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
;CHECK: vuzp.8 {{d[0-9]+}}, {{d[0-9]+}}
;CHECK: vmovl.u8 {{q[0-9]+}}, {{d[0-9]+}}
;CHECK: vrev32.16 {{d[0-9]+}}, {{d[0-9]+}}
;CHECK: vuzp.16 {{d[0-9]+}}, {{d[0-9]+}}
;CHECK: vmovl.u16 {{q[0-9]+}}, {{d[0-9]+}}
	%tmp1 = load <2 x i32>* %A
	%tmp2 = call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> %tmp1)
	ret <2 x i32> %tmp2
}

define <4 x i32> @vcntQ32(<4 x i32>* %A) nounwind {
;CHECK-LABEL: vcntQ32:
;CHECK: vcnt.8 {{q[0-9]+}}, {{q[0-9]+}}
;CHECK: vrev16.8 {{q[0-9]+}}, {{q[0-9]+}}
;CHECK: vadd.i8 {{q[0-9]+}}, {{q[0-9]+}}, {{q[0-9]+}}
;CHECK: vuzp.8 {{q[0-9]+}}, {{q[0-9]+}}
;CHECK: vmovl.u8 {{q[0-9]+}}, {{d[0-9]+}}
;CHECK: vrev32.16 {{q[0-9]+}}, {{q[0-9]+}}
;CHECK: vuzp.16 {{q[0-9]+}}, {{q[0-9]+}}
;CHECK: vmovl.u16 {{q[0-9]+}}, {{d[0-9]+}}
	%tmp1 = load <4 x i32>* %A
	%tmp2 = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %tmp1)
	ret <4 x i32> %tmp2
}

declare <8 x i8>  @llvm.ctpop.v8i8(<8 x i8>) nounwind readnone
declare <16 x i8> @llvm.ctpop.v16i8(<16 x i8>) nounwind readnone
declare <4 x i16> @llvm.ctpop.v4i16(<4 x i16>) nounwind readnone
declare <8 x i16> @llvm.ctpop.v8i16(<8 x i16>) nounwind readnone
declare <2 x i32> @llvm.ctpop.v2i32(<2 x i32>) nounwind readnone
declare <4 x i32> @llvm.ctpop.v4i32(<4 x i32>) nounwind readnone

; Count-leading-zeros maps directly to vclz for 8/16/32-bit lanes.
; The i1 0 argument means "zero input is defined" (ctlz(0) = bit width).

define <8 x i8> @vclz8(<8 x i8>* %A) nounwind {
;CHECK-LABEL: vclz8:
;CHECK: vclz.i8 {{d[0-9]+}}, {{d[0-9]+}}
	%tmp1 = load <8 x i8>* %A
	%tmp2 = call <8 x i8> @llvm.ctlz.v8i8(<8 x i8> %tmp1, i1 0)
	ret <8 x i8> %tmp2
}

define <4 x i16> @vclz16(<4 x i16>* %A) nounwind {
;CHECK-LABEL: vclz16:
;CHECK: vclz.i16 {{d[0-9]+}}, {{d[0-9]+}}
	%tmp1 = load <4 x i16>* %A
	%tmp2 = call <4 x i16> @llvm.ctlz.v4i16(<4 x i16> %tmp1, i1 0)
	ret <4 x i16> %tmp2
}

define <2 x i32> @vclz32(<2 x i32>* %A) nounwind {
;CHECK-LABEL: vclz32:
;CHECK: vclz.i32 {{d[0-9]+}}, {{d[0-9]+}}
	%tmp1 = load <2 x i32>* %A
	%tmp2 = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %tmp1, i1 0)
	ret <2 x i32> %tmp2
}

define <16 x i8> @vclzQ8(<16 x i8>* %A) nounwind {
;CHECK-LABEL: vclzQ8:
;CHECK: vclz.i8 {{q[0-9]+}}, {{q[0-9]+}}
	%tmp1 = load <16 x i8>* %A
	%tmp2 = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %tmp1, i1 0)
	ret <16 x i8> %tmp2
}

define <8 x i16> @vclzQ16(<8 x i16>* %A) nounwind {
;CHECK-LABEL: vclzQ16:
;CHECK: vclz.i16 {{q[0-9]+}}, {{q[0-9]+}}
	%tmp1 = load <8 x i16>* %A
	%tmp2 = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %tmp1, i1 0)
	ret <8 x i16> %tmp2
}

define <4 x i32> @vclzQ32(<4 x i32>* %A) nounwind {
;CHECK-LABEL: vclzQ32:
;CHECK: vclz.i32 {{q[0-9]+}}, {{q[0-9]+}}
	%tmp1 = load <4 x i32>* %A
	%tmp2 = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %tmp1, i1 0)
	ret <4 x i32> %tmp2
}

declare <8 x i8>  @llvm.ctlz.v8i8(<8 x i8>, i1) nounwind readnone
declare <4 x i16> @llvm.ctlz.v4i16(<4 x i16>, i1) nounwind readnone
declare <2 x i32> @llvm.ctlz.v2i32(<2 x i32>, i1) nounwind readnone

declare <16 x i8> @llvm.ctlz.v16i8(<16 x i8>, i1) nounwind readnone
declare <8 x i16> @llvm.ctlz.v8i16(<8 x i16>, i1) nounwind readnone
declare <4 x i32> @llvm.ctlz.v4i32(<4 x i32>, i1) nounwind readnone

; Count-leading-sign-bits maps directly to vcls via the NEON intrinsic.

define <8 x i8> @vclss8(<8 x i8>* %A) nounwind {
;CHECK-LABEL: vclss8:
;CHECK: vcls.s8
	%tmp1 = load <8 x i8>* %A
	%tmp2 = call <8 x i8> @llvm.arm.neon.vcls.v8i8(<8 x i8> %tmp1)
	ret <8 x i8> %tmp2
}

define <4 x i16> @vclss16(<4 x i16>* %A) nounwind {
;CHECK-LABEL: vclss16:
;CHECK: vcls.s16
	%tmp1 = load <4 x i16>* %A
	%tmp2 = call <4 x i16> @llvm.arm.neon.vcls.v4i16(<4 x i16> %tmp1)
	ret <4 x i16> %tmp2
}

define <2 x i32> @vclss32(<2 x i32>* %A) nounwind {
;CHECK-LABEL: vclss32:
;CHECK: vcls.s32
	%tmp1 = load <2 x i32>* %A
	%tmp2 = call <2 x i32> @llvm.arm.neon.vcls.v2i32(<2 x i32> %tmp1)
	ret <2 x i32> %tmp2
}

define <16 x i8> @vclsQs8(<16 x i8>* %A) nounwind {
;CHECK-LABEL: vclsQs8:
;CHECK: vcls.s8
	%tmp1 = load <16 x i8>* %A
	%tmp2 = call <16 x i8> @llvm.arm.neon.vcls.v16i8(<16 x i8> %tmp1)
	ret <16 x i8> %tmp2
}

define <8 x i16> @vclsQs16(<8 x i16>* %A) nounwind {
;CHECK-LABEL: vclsQs16:
;CHECK: vcls.s16
	%tmp1 = load <8 x i16>* %A
	%tmp2 = call <8 x i16> @llvm.arm.neon.vcls.v8i16(<8 x i16> %tmp1)
	ret <8 x i16> %tmp2
}

define <4 x i32> @vclsQs32(<4 x i32>* %A) nounwind {
;CHECK-LABEL: vclsQs32:
;CHECK: vcls.s32
	%tmp1 = load <4 x i32>* %A
	%tmp2 = call <4 x i32> @llvm.arm.neon.vcls.v4i32(<4 x i32> %tmp1)
	ret <4 x i32> %tmp2
}

declare <8 x i8>  @llvm.arm.neon.vcls.v8i8(<8 x i8>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vcls.v4i16(<4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vcls.v2i32(<2 x i32>) nounwind readnone

declare <16 x i8> @llvm.arm.neon.vcls.v16i8(<16 x i8>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vcls.v8i16(<8 x i16>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vcls.v4i32(<4 x i32>) nounwind readnone