; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-eabi -aarch64-neon-syntax=generic | FileCheck %s --check-prefixes=CHECK,CHECK-V8a
; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-eabi -mattr=+rdm -aarch64-neon-syntax=generic | FileCheck %s --check-prefixes=CHECK,CHECK-V81a
; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-eabi -mcpu=falkor -aarch64-neon-syntax=generic | FileCheck %s --check-prefixes=CHECK,CHECK-V81a
; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-eabi -mattr=+v8.1a -aarch64-neon-syntax=generic | FileCheck %s --check-prefixes=CHECK,CHECK-V81a
; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-eabi -mcpu=saphira -aarch64-neon-syntax=generic | FileCheck %s --check-prefixes=CHECK,CHECK-V81a
; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-eabi -mattr=+v8.1a -aarch64-neon-syntax=apple | FileCheck %s --check-prefixes=CHECK,CHECK-V81a-apple

; Code generation test for the sqrdmlah / sqrdmlsh (RDMA) instructions.
;
; Each function feeds the result of the sqrdmulh intrinsic into a saturating
; add (sqadd) or a saturating subtract (sqsub). The expectations are:
;   * plain arm64 (first invocation): a separate sqrdmulh is still emitted;
;   * +rdm, +v8.1a, falkor, saphira: the pair is selected as a single
;     sqrdmlah (add form) or sqrdmlsh (subtract form);
;   * +v8.1a with Apple NEON syntax: same selection, Apple-style mnemonics.
;
; Every invocation also enables the shared label prefix, so the per-function
; label lines are enforced rather than silently skipped.

; Rounding doubling multiply (high half), vector and scalar forms.
declare <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16>, <4 x i16>)
declare <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16>, <8 x i16>)
declare <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32>, <2 x i32>)
declare <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32>, <4 x i32>)
declare i32 @llvm.aarch64.neon.sqrdmulh.i32(i32, i32)
declare i16 @llvm.aarch64.neon.sqrdmulh.i16(i16, i16)

; Saturating add, vector and scalar forms.
declare <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16>, <4 x i16>)
declare <8 x i16> @llvm.aarch64.neon.sqadd.v8i16(<8 x i16>, <8 x i16>)
declare <2 x i32> @llvm.aarch64.neon.sqadd.v2i32(<2 x i32>, <2 x i32>)
declare <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32>, <4 x i32>)
declare i32 @llvm.aarch64.neon.sqadd.i32(i32, i32)
declare i16 @llvm.aarch64.neon.sqadd.i16(i16, i16)

; Saturating subtract, vector and scalar forms.
declare <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16>, <4 x i16>)
declare <8 x i16> @llvm.aarch64.neon.sqsub.v8i16(<8 x i16>, <8 x i16>)
declare <2 x i32> @llvm.aarch64.neon.sqsub.v2i32(<2 x i32>, <2 x i32>)
declare <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32>, <4 x i32>)
declare i32 @llvm.aarch64.neon.sqsub.i32(i32, i32)
declare i16 @llvm.aarch64.neon.sqsub.i16(i16, i16)

;-----------------------------------------------------------------------------
; RDMA Vector
; test for SIMDThreeSameVectorSQRDMLxHTiedHS
;
; Plain vector-by-vector multiply followed by a saturating add/sub with the
; accumulator passed as the first argument.

define <4 x i16> @test_sqrdmlah_v4i16(<4 x i16> %acc, <4 x i16> %mhs, <4 x i16> %rhs) {
; CHECK-LABEL: test_sqrdmlah_v4i16:
  %prod = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %mhs, <4 x i16> %rhs)
  %retval = call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> %acc, <4 x i16> %prod)
; CHECK-V8a: sqrdmulh v1.4h, v1.4h, v2.4h
; CHECK-V81a: sqrdmlah v0.4h, v1.4h, v2.4h
; CHECK-V81a-apple: sqrdmlah.4h v0, v1, v2
  ret <4 x i16> %retval
}

define <8 x i16> @test_sqrdmlah_v8i16(<8 x i16> %acc, <8 x i16> %mhs, <8 x i16> %rhs) {
; CHECK-LABEL: test_sqrdmlah_v8i16:
  %prod = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %mhs, <8 x i16> %rhs)
  %retval = call <8 x i16> @llvm.aarch64.neon.sqadd.v8i16(<8 x i16> %acc, <8 x i16> %prod)
; CHECK-V8a: sqrdmulh v1.8h, v1.8h, v2.8h
; CHECK-V81a: sqrdmlah v0.8h, v1.8h, v2.8h
; CHECK-V81a-apple: sqrdmlah.8h v0, v1, v2
  ret <8 x i16> %retval
}

define <2 x i32> @test_sqrdmlah_v2i32(<2 x i32> %acc, <2 x i32> %mhs, <2 x i32> %rhs) {
; CHECK-LABEL: test_sqrdmlah_v2i32:
  %prod = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %mhs, <2 x i32> %rhs)
  %retval = call <2 x i32> @llvm.aarch64.neon.sqadd.v2i32(<2 x i32> %acc, <2 x i32> %prod)
; CHECK-V8a: sqrdmulh v1.2s, v1.2s, v2.2s
; CHECK-V81a: sqrdmlah v0.2s, v1.2s, v2.2s
; CHECK-V81a-apple: sqrdmlah.2s v0, v1, v2
  ret <2 x i32> %retval
}

define <4 x i32> @test_sqrdmlah_v4i32(<4 x i32> %acc, <4 x i32> %mhs, <4 x i32> %rhs) {
; CHECK-LABEL: test_sqrdmlah_v4i32:
  %prod = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %mhs, <4 x i32> %rhs)
  %retval = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %acc, <4 x i32> %prod)
; The first expectation below previously used a prefix that no invocation
; enables, so the baseline (no RDM) output of this function was never verified.
; CHECK-V8a: sqrdmulh v1.4s, v1.4s, v2.4s
; CHECK-V81a: sqrdmlah v0.4s, v1.4s, v2.4s
; CHECK-V81a-apple: sqrdmlah.4s v0, v1, v2
  ret <4 x i32> %retval
}

define <4 x i16> @test_sqrdmlsh_v4i16(<4 x i16> %acc, <4 x i16> %mhs, <4 x i16> %rhs) {
; CHECK-LABEL: test_sqrdmlsh_v4i16:
  %prod = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %mhs, <4 x i16> %rhs)
  %retval = call <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16> %acc, <4 x i16> %prod)
; CHECK-V8a: sqrdmulh v1.4h, v1.4h, v2.4h
; CHECK-V81a: sqrdmlsh v0.4h, v1.4h, v2.4h
; CHECK-V81a-apple: sqrdmlsh.4h v0, v1, v2
  ret <4 x i16> %retval
}

define <8 x i16> @test_sqrdmlsh_v8i16(<8 x i16> %acc, <8 x i16> %mhs, <8 x i16> %rhs) {
; CHECK-LABEL: test_sqrdmlsh_v8i16:
  %prod = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %mhs, <8 x i16> %rhs)
  %retval = call <8 x i16> @llvm.aarch64.neon.sqsub.v8i16(<8 x i16> %acc, <8 x i16> %prod)
; CHECK-V8a: sqrdmulh v1.8h, v1.8h, v2.8h
; CHECK-V81a: sqrdmlsh v0.8h, v1.8h, v2.8h
; CHECK-V81a-apple: sqrdmlsh.8h v0, v1, v2
  ret <8 x i16> %retval
}

define <2 x i32> @test_sqrdmlsh_v2i32(<2 x i32> %acc, <2 x i32> %mhs, <2 x i32> %rhs) {
; CHECK-LABEL: test_sqrdmlsh_v2i32:
  %prod = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %mhs, <2 x i32> %rhs)
  %retval = call <2 x i32> @llvm.aarch64.neon.sqsub.v2i32(<2 x i32> %acc, <2 x i32> %prod)
; CHECK-V8a: sqrdmulh v1.2s, v1.2s, v2.2s
; CHECK-V81a: sqrdmlsh v0.2s, v1.2s, v2.2s
; CHECK-V81a-apple: sqrdmlsh.2s v0, v1, v2
  ret <2 x i32> %retval
}

define <4 x i32> @test_sqrdmlsh_v4i32(<4 x i32> %acc, <4 x i32> %mhs, <4 x i32> %rhs) {
; CHECK-LABEL: test_sqrdmlsh_v4i32:
  %prod = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %mhs, <4 x i32> %rhs)
  %retval = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %acc, <4 x i32> %prod)
; CHECK-V8a: sqrdmulh v1.4s, v1.4s, v2.4s
; CHECK-V81a: sqrdmlsh v0.4s, v1.4s, v2.4s
; CHECK-V81a-apple: sqrdmlsh.4s v0, v1, v2
  ret <4 x i32> %retval
}

;-----------------------------------------------------------------------------
; RDMA Vector, by element
; tests for vXiYY_indexed in SIMDIndexedSQRDMLxHSDTied
;
; The second multiplicand is a splat of one lane of %v (built with
; shufflevector), which is expected to be folded into the indexed operand.

define <4 x i16> @test_sqrdmlah_lane_s16(<4 x i16> %acc, <4 x i16> %x, <4 x i16> %v) {
; CHECK-LABEL: test_sqrdmlah_lane_s16:
entry:
  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
  %prod = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %x, <4 x i16> %shuffle)
  %retval = call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> %acc, <4 x i16> %prod)
; CHECK-V8a: sqrdmulh v1.4h, v1.4h, v2.h[3]
; CHECK-V81a: sqrdmlah v0.4h, v1.4h, v2.h[3]
; CHECK-V81a-apple: sqrdmlah.4h v0, v1, v2[3]
  ret <4 x i16> %retval
}

define <8 x i16> @test_sqrdmlahq_lane_s16(<8 x i16> %acc, <8 x i16> %x, <8 x i16> %v) {
; CHECK-LABEL: test_sqrdmlahq_lane_s16:
entry:
  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
  %prod = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %x, <8 x i16> %shuffle)
  %retval = call <8 x i16> @llvm.aarch64.neon.sqadd.v8i16(<8 x i16> %acc, <8 x i16> %prod)
; CHECK-V8a: sqrdmulh v1.8h, v1.8h, v2.h[2]
; CHECK-V81a: sqrdmlah v0.8h, v1.8h, v2.h[2]
; CHECK-V81a-apple: sqrdmlah.8h v0, v1, v2[2]
  ret <8 x i16> %retval
}

define <2 x i32> @test_sqrdmlah_lane_s32(<2 x i32> %acc, <2 x i32> %x, <2 x i32> %v) {
; CHECK-LABEL: test_sqrdmlah_lane_s32:
entry:
  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
  %prod = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %x, <2 x i32> %shuffle)
  %retval = call <2 x i32> @llvm.aarch64.neon.sqadd.v2i32(<2 x i32> %acc, <2 x i32> %prod)
; CHECK-V8a: sqrdmulh v1.2s, v1.2s, v2.s[1]
; CHECK-V81a: sqrdmlah v0.2s, v1.2s, v2.s[1]
; CHECK-V81a-apple: sqrdmlah.2s v0, v1, v2[1]
  ret <2 x i32> %retval
}

define <4 x i32> @test_sqrdmlahq_lane_s32(<4 x i32> %acc, <4 x i32> %x, <4 x i32> %v) {
; CHECK-LABEL: test_sqrdmlahq_lane_s32:
entry:
  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer
  %prod = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %x, <4 x i32> %shuffle)
  %retval = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %acc, <4 x i32> %prod)
; CHECK-V8a: sqrdmulh v1.4s, v1.4s, v2.s[0]
; CHECK-V81a: sqrdmlah v0.4s, v1.4s, v2.s[0]
; CHECK-V81a-apple: sqrdmlah.4s v0, v1, v2[0]
  ret <4 x i32> %retval
}

define <4 x i16> @test_sqrdmlsh_lane_s16(<4 x i16> %acc, <4 x i16> %x, <4 x i16> %v) {
; CHECK-LABEL: test_sqrdmlsh_lane_s16:
entry:
  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
  %prod = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %x, <4 x i16> %shuffle)
  %retval = call <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16> %acc, <4 x i16> %prod)
; CHECK-V8a: sqrdmulh v1.4h, v1.4h, v2.h[3]
; CHECK-V81a: sqrdmlsh v0.4h, v1.4h, v2.h[3]
; CHECK-V81a-apple: sqrdmlsh.4h v0, v1, v2[3]
  ret <4 x i16> %retval
}

define <8 x i16> @test_sqrdmlshq_lane_s16(<8 x i16> %acc, <8 x i16> %x, <8 x i16> %v) {
; CHECK-LABEL: test_sqrdmlshq_lane_s16:
entry:
  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
  %prod = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %x, <8 x i16> %shuffle)
  %retval = call <8 x i16> @llvm.aarch64.neon.sqsub.v8i16(<8 x i16> %acc, <8 x i16> %prod)
; CHECK-V8a: sqrdmulh v1.8h, v1.8h, v2.h[2]
; CHECK-V81a: sqrdmlsh v0.8h, v1.8h, v2.h[2]
; CHECK-V81a-apple: sqrdmlsh.8h v0, v1, v2[2]
  ret <8 x i16> %retval
}

define <2 x i32> @test_sqrdmlsh_lane_s32(<2 x i32> %acc, <2 x i32> %x, <2 x i32> %v) {
; CHECK-LABEL: test_sqrdmlsh_lane_s32:
entry:
  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
  %prod = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %x, <2 x i32> %shuffle)
  %retval = call <2 x i32> @llvm.aarch64.neon.sqsub.v2i32(<2 x i32> %acc, <2 x i32> %prod)
; CHECK-V8a: sqrdmulh v1.2s, v1.2s, v2.s[1]
; CHECK-V81a: sqrdmlsh v0.2s, v1.2s, v2.s[1]
; CHECK-V81a-apple: sqrdmlsh.2s v0, v1, v2[1]
  ret <2 x i32> %retval
}

define <4 x i32> @test_sqrdmlshq_lane_s32(<4 x i32> %acc, <4 x i32> %x, <4 x i32> %v) {
; CHECK-LABEL: test_sqrdmlshq_lane_s32:
entry:
  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer
  %prod = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %x, <4 x i32> %shuffle)
  %retval = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %acc, <4 x i32> %prod)
; CHECK-V8a: sqrdmulh v1.4s, v1.4s, v2.s[0]
; CHECK-V81a: sqrdmlsh v0.4s, v1.4s, v2.s[0]
; CHECK-V81a-apple: sqrdmlsh.4s v0, v1, v2[0]
  ret <4 x i32> %retval
}

;-----------------------------------------------------------------------------
; RDMA Vector, by element, extracted
; i16 tests are for vXi16_indexed in SIMDIndexedSQRDMLxHSDTied, with IR in ACLE style
; i32 tests are for "def : Pat" in SIMDIndexedSQRDMLxHSDTied
;
; The accumulator is a scalar here. The i16 variants insert it into lane 0 of
; a vector and extract lane 0 of the result; the i32 variants extract lane 0
; of the product and use the scalar sqadd/sqsub intrinsic.

define i16 @test_sqrdmlah_extracted_lane_s16(i16 %acc, <4 x i16> %x, <4 x i16> %v) {
; CHECK-LABEL: test_sqrdmlah_extracted_lane_s16:
entry:
  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %prod = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %x, <4 x i16> %shuffle)
  %acc_vec = insertelement <4 x i16> undef, i16 %acc, i64 0
  %retval_vec = call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> %acc_vec, <4 x i16> %prod)
  %retval = extractelement <4 x i16> %retval_vec, i64 0
; CHECK-V8a: sqrdmulh {{v[0-9]+}}.4h, v0.4h, v1.h[1]
; CHECK-V81a: sqrdmlah {{v[2-9]+}}.4h, v0.4h, v1.h[1]
; CHECK-V81a-apple: sqrdmlah.4h {{v[2-9]+}}, v0, v1[1]
  ret i16 %retval
}

define i16 @test_sqrdmlahq_extracted_lane_s16(i16 %acc, <8 x i16> %x, <8 x i16> %v) {
; CHECK-LABEL: test_sqrdmlahq_extracted_lane_s16:
entry:
  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %prod = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %x, <8 x i16> %shuffle)
  %acc_vec = insertelement <8 x i16> undef, i16 %acc, i64 0
  %retval_vec = call <8 x i16> @llvm.aarch64.neon.sqadd.v8i16(<8 x i16> %acc_vec, <8 x i16> %prod)
  %retval = extractelement <8 x i16> %retval_vec, i64 0
; CHECK-V8a: sqrdmulh {{v[0-9]+}}.8h, v0.8h, v1.h[1]
; CHECK-V81a: sqrdmlah {{v[2-9]+}}.8h, v0.8h, v1.h[1]
; CHECK-V81a-apple: sqrdmlah.8h {{v[2-9]+}}, v0, v1[1]
  ret i16 %retval
}

define i32 @test_sqrdmlah_extracted_lane_s32(i32 %acc, <2 x i32> %x, <2 x i32> %v) {
; CHECK-LABEL: test_sqrdmlah_extracted_lane_s32:
entry:
  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
  %prod = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %x, <2 x i32> %shuffle)
  %extract = extractelement <2 x i32> %prod, i64 0
  %retval = call i32 @llvm.aarch64.neon.sqadd.i32(i32 %acc, i32 %extract)
; CHECK-V8a: sqrdmulh v0.2s, v0.2s, v1.s[0]
; CHECK-V81a: sqrdmlah v2.2s, v0.2s, v1.s[0]
; CHECK-V81a-apple: sqrdmlah.2s v2, v0, v1[0]
  ret i32 %retval
}

define i32 @test_sqrdmlahq_extracted_lane_s32(i32 %acc, <4 x i32> %x, <4 x i32> %v) {
; CHECK-LABEL: test_sqrdmlahq_extracted_lane_s32:
entry:
  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer
  %prod = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %x, <4 x i32> %shuffle)
  %extract = extractelement <4 x i32> %prod, i64 0
  %retval = call i32 @llvm.aarch64.neon.sqadd.i32(i32 %acc, i32 %extract)
; CHECK-V8a: sqrdmulh v0.4s, v0.4s, v1.s[0]
; CHECK-V81a: sqrdmlah v2.4s, v0.4s, v1.s[0]
; CHECK-V81a-apple: sqrdmlah.4s v2, v0, v1[0]
  ret i32 %retval
}

define i16 @test_sqrdmlsh_extracted_lane_s16(i16 %acc, <4 x i16> %x, <4 x i16> %v) {
; CHECK-LABEL: test_sqrdmlsh_extracted_lane_s16:
entry:
  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %prod = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %x, <4 x i16> %shuffle)
  %acc_vec = insertelement <4 x i16> undef, i16 %acc, i64 0
  %retval_vec = call <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16> %acc_vec, <4 x i16> %prod)
  %retval = extractelement <4 x i16> %retval_vec, i64 0
; CHECK-V8a: sqrdmulh {{v[0-9]+}}.4h, v0.4h, v1.h[1]
; CHECK-V81a: sqrdmlsh {{v[2-9]+}}.4h, v0.4h, v1.h[1]
; CHECK-V81a-apple: sqrdmlsh.4h {{v[2-9]+}}, v0, v1[1]
  ret i16 %retval
}

define i16 @test_sqrdmlshq_extracted_lane_s16(i16 %acc, <8 x i16> %x, <8 x i16> %v) {
; CHECK-LABEL: test_sqrdmlshq_extracted_lane_s16:
entry:
  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %prod = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %x, <8 x i16> %shuffle)
  %acc_vec = insertelement <8 x i16> undef, i16 %acc, i64 0
  %retval_vec = call <8 x i16> @llvm.aarch64.neon.sqsub.v8i16(<8 x i16> %acc_vec, <8 x i16> %prod)
  %retval = extractelement <8 x i16> %retval_vec, i64 0
; CHECK-V8a: sqrdmulh {{v[0-9]+}}.8h, v0.8h, v1.h[1]
; CHECK-V81a: sqrdmlsh {{v[2-9]+}}.8h, v0.8h, v1.h[1]
; CHECK-V81a-apple: sqrdmlsh.8h {{v[2-9]+}}, v0, v1[1]
  ret i16 %retval
}

define i32 @test_sqrdmlsh_extracted_lane_s32(i32 %acc, <2 x i32> %x, <2 x i32> %v) {
; CHECK-LABEL: test_sqrdmlsh_extracted_lane_s32:
entry:
  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
  %prod = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %x, <2 x i32> %shuffle)
  %extract = extractelement <2 x i32> %prod, i64 0
  %retval = call i32 @llvm.aarch64.neon.sqsub.i32(i32 %acc, i32 %extract)
; CHECK-V8a: sqrdmulh v0.2s, v0.2s, v1.s[0]
; CHECK-V81a: sqrdmlsh v2.2s, v0.2s, v1.s[0]
; CHECK-V81a-apple: sqrdmlsh.2s v2, v0, v1[0]
  ret i32 %retval
}

define i32 @test_sqrdmlshq_extracted_lane_s32(i32 %acc, <4 x i32> %x, <4 x i32> %v) {
; CHECK-LABEL: test_sqrdmlshq_extracted_lane_s32:
entry:
  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer
  %prod = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %x, <4 x i32> %shuffle)
  %extract = extractelement <4 x i32> %prod, i64 0
  %retval = call i32 @llvm.aarch64.neon.sqsub.i32(i32 %acc, i32 %extract)
; CHECK-V8a: sqrdmulh v0.4s, v0.4s, v1.s[0]
; CHECK-V81a: sqrdmlsh v2.4s, v0.4s, v1.s[0]
; CHECK-V81a-apple: sqrdmlsh.4s v2, v0, v1[0]
  ret i32 %retval
}

;-----------------------------------------------------------------------------
; RDMA Scalar
; test for "def : Pat" near SIMDThreeScalarHSTied in AArch64InstrInfo.td
;
; The v1iNN variants wrap scalars in lane 0 of a vector and use the vector
; intrinsics; the iNN variants call the scalar intrinsics directly.

define i16 @test_sqrdmlah_v1i16(i16 %acc, i16 %x, i16 %y) {
; CHECK-LABEL: test_sqrdmlah_v1i16:
  %x_vec = insertelement <4 x i16> undef, i16 %x, i64 0
  %y_vec = insertelement <4 x i16> undef, i16 %y, i64 0
  %prod_vec = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %x_vec, <4 x i16> %y_vec)
  %acc_vec = insertelement <4 x i16> undef, i16 %acc, i64 0
  %retval_vec = call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> %acc_vec, <4 x i16> %prod_vec)
  %retval = extractelement <4 x i16> %retval_vec, i64 0
; CHECK-V8a: sqrdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
; CHECK-V81a: sqrdmlah {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
; CHECK-V81a-apple: sqrdmlah.4h {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
  ret i16 %retval
}

define i32 @test_sqrdmlah_v1i32(i32 %acc, i32 %x, i32 %y) {
; CHECK-LABEL: test_sqrdmlah_v1i32:
  %x_vec = insertelement <4 x i32> undef, i32 %x, i64 0
  %y_vec = insertelement <4 x i32> undef, i32 %y, i64 0
  %prod_vec = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %x_vec, <4 x i32> %y_vec)
  %acc_vec = insertelement <4 x i32> undef, i32 %acc, i64 0
  %retval_vec = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %acc_vec, <4 x i32> %prod_vec)
  %retval = extractelement <4 x i32> %retval_vec, i64 0
; CHECK-V8a: sqrdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
; CHECK-V81a: sqrdmlah {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
; CHECK-V81a-apple: sqrdmlah.4s {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
  ret i32 %retval
}


define i16 @test_sqrdmlsh_v1i16(i16 %acc, i16 %x, i16 %y) {
; CHECK-LABEL: test_sqrdmlsh_v1i16:
  %x_vec = insertelement <4 x i16> undef, i16 %x, i64 0
  %y_vec = insertelement <4 x i16> undef, i16 %y, i64 0
  %prod_vec = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %x_vec, <4 x i16> %y_vec)
  %acc_vec = insertelement <4 x i16> undef, i16 %acc, i64 0
  %retval_vec = call <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16> %acc_vec, <4 x i16> %prod_vec)
  %retval = extractelement <4 x i16> %retval_vec, i64 0
; CHECK-V8a: sqrdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
; CHECK-V81a: sqrdmlsh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
; CHECK-V81a-apple: sqrdmlsh.4h {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
  ret i16 %retval
}

define i32 @test_sqrdmlsh_v1i32(i32 %acc, i32 %x, i32 %y) {
; CHECK-LABEL: test_sqrdmlsh_v1i32:
  %x_vec = insertelement <4 x i32> undef, i32 %x, i64 0
  %y_vec = insertelement <4 x i32> undef, i32 %y, i64 0
  %prod_vec = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %x_vec, <4 x i32> %y_vec)
  %acc_vec = insertelement <4 x i32> undef, i32 %acc, i64 0
  %retval_vec = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %acc_vec, <4 x i32> %prod_vec)
  %retval = extractelement <4 x i32> %retval_vec, i64 0
; CHECK-V8a: sqrdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
; CHECK-V81a: sqrdmlsh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
; CHECK-V81a-apple: sqrdmlsh.4s {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
  ret i32 %retval
}

define i32 @test_sqrdmlah_i32(i32 %acc, i32 %mhs, i32 %rhs) {
; CHECK-LABEL: test_sqrdmlah_i32:
  %prod = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %mhs, i32 %rhs)
  %retval = call i32 @llvm.aarch64.neon.sqadd.i32(i32 %acc, i32 %prod)
; CHECK-V8a: sqrdmulh {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
; CHECK-V81a: sqrdmlah {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
; CHECK-V81a-apple: sqrdmlah {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
  ret i32 %retval
}

define i32 @test_sqrdmlsh_i32(i32 %acc, i32 %mhs, i32 %rhs) {
; CHECK-LABEL: test_sqrdmlsh_i32:
  %prod = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %mhs, i32 %rhs)
  %retval = call i32 @llvm.aarch64.neon.sqsub.i32(i32 %acc, i32 %prod)
; CHECK-V8a: sqrdmulh {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
; CHECK-V81a: sqrdmlsh {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
; CHECK-V81a-apple: sqrdmlsh {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
  ret i32 %retval
}

;-----------------------------------------------------------------------------
; RDMA Scalar, by element
; i16 tests are performed via tests in above chapter, with IR in ACLE style
; i32 tests are for i32_indexed in SIMDIndexedSQRDMLxHSDTied
;
; One multiplicand is a scalar and the other is a single lane taken from a
; vector argument.

define i16 @test_sqrdmlah_extract_i16(i16 %acc, i16 %x, <4 x i16> %y_vec) {
; CHECK-LABEL: test_sqrdmlah_extract_i16:
  %shuffle = shufflevector <4 x i16> %y_vec, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %x_vec = insertelement <4 x i16> undef, i16 %x, i64 0
  %prod = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %x_vec, <4 x i16> %shuffle)
  %acc_vec = insertelement <4 x i16> undef, i16 %acc, i64 0
  %retval_vec = call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> %acc_vec, <4 x i16> %prod)
  %retval = extractelement <4 x i16> %retval_vec, i32 0
; CHECK-V8a: sqrdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, v0.h[1]
; CHECK-V81a: sqrdmlah {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, v0.h[1]
; CHECK-V81a-apple: sqrdmlah.4h {{v[0-9]+}}, {{v[0-9]+}}, v0[1]
  ret i16 %retval
}

define i32 @test_sqrdmlah_extract_i32(i32 %acc, i32 %mhs, <4 x i32> %rhs) {
; CHECK-LABEL: test_sqrdmlah_extract_i32:
  %extract = extractelement <4 x i32> %rhs, i32 3
  %prod = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %mhs, i32 %extract)
  %retval = call i32 @llvm.aarch64.neon.sqadd.i32(i32 %acc, i32 %prod)
; CHECK-V8a: sqrdmulh {{s[0-9]+}}, {{s[0-9]+}}, v0.s[3]
; CHECK-V81a: sqrdmlah {{s[0-9]+}}, {{s[0-9]+}}, v0.s[3]
; CHECK-V81a-apple: sqrdmlah.s {{s[0-9]+}}, {{s[0-9]+}}, v0[3]
  ret i32 %retval
}

define i16 @test_sqrdmlshq_extract_i16(i16 %acc, i16 %x, <8 x i16> %y_vec) {
; CHECK-LABEL: test_sqrdmlshq_extract_i16:
  %shuffle = shufflevector <8 x i16> %y_vec, <8 x i16> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %x_vec = insertelement <8 x i16> undef, i16 %x, i64 0
  %prod = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %x_vec, <8 x i16> %shuffle)
  %acc_vec = insertelement <8 x i16> undef, i16 %acc, i64 0
  %retval_vec = call <8 x i16> @llvm.aarch64.neon.sqsub.v8i16(<8 x i16> %acc_vec, <8 x i16> %prod)
  %retval = extractelement <8 x i16> %retval_vec, i32 0
; CHECK-V8a: sqrdmulh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, v0.h[1]
; CHECK-V81a: sqrdmlsh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, v0.h[1]
; CHECK-V81a-apple: sqrdmlsh.8h {{v[0-9]+}}, {{v[0-9]+}}, v0[1]
  ret i16 %retval
}

define i32 @test_sqrdmlsh_extract_i32(i32 %acc, i32 %mhs, <4 x i32> %rhs) {
; CHECK-LABEL: test_sqrdmlsh_extract_i32:
  %extract = extractelement <4 x i32> %rhs, i32 3
  %prod = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %mhs, i32 %extract)
  %retval = call i32 @llvm.aarch64.neon.sqsub.i32(i32 %acc, i32 %prod)
; CHECK-V8a: sqrdmulh {{s[0-9]+}}, {{s[0-9]+}}, v0.s[3]
; CHECK-V81a: sqrdmlsh {{s[0-9]+}}, {{s[0-9]+}}, v0.s[3]
; CHECK-V81a-apple: sqrdmlsh.s {{s[0-9]+}}, {{s[0-9]+}}, v0[3]
  ret i32 %retval
}