; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple | FileCheck %s

; Tests that scalar bswap/shift combinations and vector reverse shuffles are
; selected to the AArch64 REV / REV16 / REV32 / REV64 instructions.

define i32 @test_rev_w(i32 %a) nounwind {
entry:
; CHECK-LABEL: test_rev_w:
; CHECK: rev w0, w0
  %0 = tail call i32 @llvm.bswap.i32(i32 %a)
  ret i32 %0
}

define i64 @test_rev_x(i64 %a) nounwind {
entry:
; CHECK-LABEL: test_rev_x:
; CHECK: rev x0, x0
  %0 = tail call i64 @llvm.bswap.i64(i64 %a)
  ret i64 %0
}

; Canonicalize (srl (bswap x), 16) to (rotr (bswap x), 16) if the high 16-bits
; of %a are zero. This optimizes rev + lsr 16 to rev16.
define i32 @test_rev_w_srl16(i16 %a) {
entry:
; CHECK-LABEL: test_rev_w_srl16:
; CHECK: and [[REG:w[0-9]+]], w0, #0xffff
; CHECK: rev16 w0, [[REG]]
; CHECK-NOT: lsr
  %0 = zext i16 %a to i32
  %1 = tail call i32 @llvm.bswap.i32(i32 %0)
  %2 = lshr i32 %1, 16
  ret i32 %2
}

; Canonicalize (srl (bswap x), 32) to (rotr (bswap x), 32) if the high 32-bits
; of %a are zero. This optimizes rev + lsr 32 to rev32.
define i64 @test_rev_x_srl32(i32 %a) {
entry:
; CHECK-LABEL: test_rev_x_srl32:
; CHECK: rev32 x0, {{x[0-9]+}}
; CHECK-NOT: lsr
  %0 = zext i32 %a to i64
  %1 = tail call i64 @llvm.bswap.i64(i64 %0)
  %2 = lshr i64 %1, 32
  ret i64 %2
}

declare i32 @llvm.bswap.i32(i32) nounwind readnone
declare i64 @llvm.bswap.i64(i64) nounwind readnone

define i32 @test_rev16_w(i32 %X) nounwind {
entry:
; CHECK-LABEL: test_rev16_w:
; CHECK: rev16 w0, w0
  %tmp1 = lshr i32 %X, 8
  %X15 = bitcast i32 %X to i32
  %tmp4 = shl i32 %X15, 8
  %tmp2 = and i32 %tmp1, 16711680
  %tmp5 = and i32 %tmp4, -16777216
  %tmp9 = and i32 %tmp1, 255
  %tmp13 = and i32 %tmp4, 65280
  %tmp6 = or i32 %tmp5, %tmp2
  %tmp10 = or i32 %tmp6, %tmp13
  %tmp14 = or i32 %tmp10, %tmp9
  ret i32 %tmp14
}

; 64-bit REV16 is *not* a swap then a 16-bit rotation:
; 01234567 ->(bswap) 76543210 ->(rotr) 10765432
; 01234567 ->(rev16) 10325476
define i64 @test_rev16_x(i64 %a) nounwind {
entry:
; CHECK-LABEL: test_rev16_x:
; CHECK-NOT: rev16 x0, x0
  %0 = tail call i64 @llvm.bswap.i64(i64 %a)
  %1 = lshr i64 %0, 16
  %2 = shl i64 %0, 48
  %3 = or i64 %1, %2
  ret i64 %3
}

define i64 @test_rev32_x(i64 %a) nounwind {
entry:
; CHECK-LABEL: test_rev32_x:
; CHECK: rev32 x0, x0
  %0 = tail call i64 @llvm.bswap.i64(i64 %a)
  %1 = lshr i64 %0, 32
  %2 = shl i64 %0, 32
  %3 = or i64 %1, %2
  ret i64 %3
}

define <8 x i8> @test_vrev64D8(<8 x i8>* %A) nounwind {
;CHECK-LABEL: test_vrev64D8:
;CHECK: rev64.8b
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
  ret <8 x i8> %tmp2
}

define <4 x i16> @test_vrev64D16(<4 x i16>* %A) nounwind {
;CHECK-LABEL: test_vrev64D16:
;CHECK: rev64.4h
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
  ret <4 x i16> %tmp2
}

define <2 x i32> @test_vrev64D32(<2 x i32>* %A) nounwind {
;CHECK-LABEL: test_vrev64D32:
;CHECK: rev64.2s
  %tmp1 = load <2 x i32>, <2 x i32>* %A
  %tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <2 x i32> <i32 1, i32 0>
  ret <2 x i32> %tmp2
}

define <2 x float> @test_vrev64Df(<2 x float>* %A) nounwind {
;CHECK-LABEL: test_vrev64Df:
;CHECK: rev64.2s
  %tmp1 = load <2 x float>, <2 x float>* %A
  %tmp2 = shufflevector <2 x float> %tmp1, <2 x float> undef, <2 x i32> <i32 1, i32 0>
  ret <2 x float> %tmp2
}

define <16 x i8> @test_vrev64Q8(<16 x i8>* %A) nounwind {
;CHECK-LABEL: test_vrev64Q8:
;CHECK: rev64.16b
  %tmp1 = load <16 x i8>, <16 x i8>* %A
  %tmp2 = shufflevector <16 x i8> %tmp1, <16 x i8> undef, <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8>
  ret <16 x i8> %tmp2
}

define <8 x i16> @test_vrev64Q16(<8 x i16>* %A) nounwind {
;CHECK-LABEL: test_vrev64Q16:
;CHECK: rev64.8h
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
  ret <8 x i16> %tmp2
}

define <4 x i32> @test_vrev64Q32(<4 x i32>* %A) nounwind {
;CHECK-LABEL: test_vrev64Q32:
;CHECK: rev64.4s
  %tmp1 = load <4 x i32>, <4 x i32>* %A
  %tmp2 = shufflevector <4 x i32> %tmp1, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
  ret <4 x i32> %tmp2
}

define <4 x float> @test_vrev64Qf(<4 x float>* %A) nounwind {
;CHECK-LABEL: test_vrev64Qf:
;CHECK: rev64.4s
  %tmp1 = load <4 x float>, <4 x float>* %A
  %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
  ret <4 x float> %tmp2
}

define <8 x i8> @test_vrev32D8(<8 x i8>* %A) nounwind {
;CHECK-LABEL: test_vrev32D8:
;CHECK: rev32.8b
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
  ret <8 x i8> %tmp2
}

define <4 x i16> @test_vrev32D16(<4 x i16>* %A) nounwind {
;CHECK-LABEL: test_vrev32D16:
;CHECK: rev32.4h
  %tmp1 = load <4 x i16>, <4 x i16>* %A
  %tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
  ret <4 x i16> %tmp2
}

define <16 x i8> @test_vrev32Q8(<16 x i8>* %A) nounwind {
;CHECK-LABEL: test_vrev32Q8:
;CHECK: rev32.16b
  %tmp1 = load <16 x i8>, <16 x i8>* %A
  %tmp2 = shufflevector <16 x i8> %tmp1, <16 x i8> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
  ret <16 x i8> %tmp2
}

define <8 x i16> @test_vrev32Q16(<8 x i16>* %A) nounwind {
;CHECK-LABEL: test_vrev32Q16:
;CHECK: rev32.8h
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
  ret <8 x i16> %tmp2
}

define <8 x i8> @test_vrev16D8(<8 x i8>* %A) nounwind {
;CHECK-LABEL: test_vrev16D8:
;CHECK: rev16.8b
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
  ret <8 x i8> %tmp2
}

define <16 x i8> @test_vrev16Q8(<16 x i8>* %A) nounwind {
;CHECK-LABEL: test_vrev16Q8:
;CHECK: rev16.16b
  %tmp1 = load <16 x i8>, <16 x i8>* %A
  %tmp2 = shufflevector <16 x i8> %tmp1, <16 x i8> undef, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
  ret <16 x i8> %tmp2
}

; Undef shuffle indices should not prevent matching to VREV:

define <8 x i8> @test_vrev64D8_undef(<8 x i8>* %A) nounwind {
;CHECK-LABEL: test_vrev64D8_undef:
;CHECK: rev64.8b
  %tmp1 = load <8 x i8>, <8 x i8>* %A
  %tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> <i32 7, i32 undef, i32 undef, i32 4, i32 3, i32 2, i32 1, i32 0>
  ret <8 x i8> %tmp2
}

define <8 x i16> @test_vrev32Q16_undef(<8 x i16>* %A) nounwind {
;CHECK-LABEL: test_vrev32Q16_undef:
;CHECK: rev32.8h
  %tmp1 = load <8 x i16>, <8 x i16>* %A
  %tmp2 = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <8 x i32> <i32 undef, i32 0, i32 undef, i32 2, i32 5, i32 4, i32 7, i32 undef>
  ret <8 x i16> %tmp2
}

; vrev <4 x i16> should use REV32 and not REV64
define void @test_vrev64(<4 x i16>* nocapture %source, <2 x i16>* nocapture %dst) nounwind ssp {
; CHECK-LABEL: test_vrev64:
; CHECK: ldr [[DEST:q[0-9]+]],
; CHECK: st1.h
; CHECK: st1.h
entry:
  %0 = bitcast <4 x i16>* %source to <8 x i16>*
  %tmp2 = load <8 x i16>, <8 x i16>* %0, align 4
  %tmp3 = extractelement <8 x i16> %tmp2, i32 6
  %tmp5 = insertelement <2 x i16> undef, i16 %tmp3, i32 0
  %tmp9 = extractelement <8 x i16> %tmp2, i32 5
  %tmp11 = insertelement <2 x i16> %tmp5, i16 %tmp9, i32 1
  store <2 x i16> %tmp11, <2 x i16>* %dst, align 4
  ret void
}

; Test vrev of float4
define void @float_vrev64(float* nocapture %source, <4 x float>* nocapture %dest) nounwind noinline ssp {
; CHECK-LABEL: float_vrev64:
; CHECK: ldr [[DEST:q[0-9]+]],
; CHECK: rev64.4s
entry:
  %0 = bitcast float* %source to <4 x float>*
  %tmp2 = load <4 x float>, <4 x float>* %0, align 4
  %tmp5 = shufflevector <4 x float> <float 0.000000e+00, float undef, float undef, float undef>, <4 x float> %tmp2, <4 x i32> <i32 0, i32 7, i32 0, i32 0>
  %arrayidx8 = getelementptr inbounds <4 x float>, <4 x float>* %dest, i32 11
  store <4 x float> %tmp5, <4 x float>* %arrayidx8, align 4
  ret void
}


define <4 x i32> @test_vrev32_bswap(<4 x i32> %source) nounwind {
; CHECK-LABEL: test_vrev32_bswap:
; CHECK: rev32.16b
; CHECK-NOT: rev
; CHECK: ret
  %bswap = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %source)
  ret <4 x i32> %bswap
}

declare <4 x i32> @llvm.bswap.v4i32(<4 x i32>) nounwind readnone