; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512bw,+avx512vl,+avx512dq | FileCheck %s

declare <16 x float> @llvm.x86.avx512.gather.dps.512 (<16 x float>, i8*, <16 x i32>, i16, i32)
declare void @llvm.x86.avx512.scatter.dps.512 (i8*, i16, <16 x i32>, <16 x float>, i32)
declare <8 x double> @llvm.x86.avx512.gather.dpd.512 (<8 x double>, i8*, <8 x i32>, i8, i32)
declare void @llvm.x86.avx512.scatter.dpd.512 (i8*, i8, <8 x i32>, <8 x double>, i32)

declare <8 x float> @llvm.x86.avx512.gather.qps.512 (<8 x float>, i8*, <8 x i64>, i8, i32)
declare void @llvm.x86.avx512.scatter.qps.512 (i8*, i8, <8 x i64>, <8 x float>, i32)
declare <8 x double> @llvm.x86.avx512.gather.qpd.512 (<8 x double>, i8*, <8 x i64>, i8, i32)
declare void @llvm.x86.avx512.scatter.qpd.512 (i8*, i8, <8 x i64>, <8 x double>, i32)
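
; The gather intrinsics take (passthru, base pointer, index vector, mask,
; scale) and the scatter intrinsics take (base pointer, mask, index vector,
; data, scale); scale is an immediate in {1,2,4,8}. The hardware uses the
; mask register as a completion mask and clears it as elements are
; processed, which is why the masked tests below copy it (kmovq %k1, %k2)
; and feed the copy to the gather while the original drives the scatter.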

define void @gather_mask_dps(<16 x i32> %ind, <16 x float> %src, i16 %mask, i8* %base, i8* %stbuf) {
; CHECK-LABEL: gather_mask_dps:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    kmovq %k1, %k2
; CHECK-NEXT:    vgatherdps (%rsi,%zmm0,4), %zmm1 {%k2}
; CHECK-NEXT:    vpaddd {{.*}}(%rip), %zmm0, %zmm0
; CHECK-NEXT:    vscatterdps %zmm1, (%rdx,%zmm0,4) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %x = call <16 x float> @llvm.x86.avx512.gather.dps.512 (<16 x float> %src, i8* %base, <16 x i32>%ind, i16 %mask, i32 4)
  %ind2 = add <16 x i32> %ind, <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  call void @llvm.x86.avx512.scatter.dps.512 (i8* %stbuf, i16 %mask, <16 x i32>%ind2, <16 x float> %x, i32 4)
  ret void
}

define void @gather_mask_dpd(<8 x i32> %ind, <8 x double> %src, i8 %mask, i8* %base, i8* %stbuf) {
; CHECK-LABEL: gather_mask_dpd:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    kmovq %k1, %k2
; CHECK-NEXT:    vgatherdpd (%rsi,%ymm0,4), %zmm1 {%k2}
; CHECK-NEXT:    vpaddd {{.*}}(%rip), %ymm0, %ymm0
; CHECK-NEXT:    vscatterdpd %zmm1, (%rdx,%ymm0,4) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %x = call <8 x double> @llvm.x86.avx512.gather.dpd.512 (<8 x double> %src, i8* %base, <8 x i32>%ind, i8 %mask, i32 4)
  %ind2 = add <8 x i32> %ind, <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  call void @llvm.x86.avx512.scatter.dpd.512 (i8* %stbuf, i8 %mask, <8 x i32>%ind2, <8 x double> %x, i32 4)
  ret void
}

define void @gather_mask_qps(<8 x i64> %ind, <8 x float> %src, i8 %mask, i8* %base, i8* %stbuf) {
; CHECK-LABEL: gather_mask_qps:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    kmovq %k1, %k2
; CHECK-NEXT:    vgatherqps (%rsi,%zmm0,4), %ymm1 {%k2}
; CHECK-NEXT:    vpaddq {{.*}}(%rip), %zmm0, %zmm0
; CHECK-NEXT:    vscatterqps %ymm1, (%rdx,%zmm0,4) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %x = call <8 x float> @llvm.x86.avx512.gather.qps.512 (<8 x float> %src, i8* %base, <8 x i64>%ind, i8 %mask, i32 4)
  %ind2 = add <8 x i64> %ind, <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>
  call void @llvm.x86.avx512.scatter.qps.512 (i8* %stbuf, i8 %mask, <8 x i64>%ind2, <8 x float> %x, i32 4)
  ret void
}

define void @gather_mask_qpd(<8 x i64> %ind, <8 x double> %src, i8 %mask, i8* %base, i8* %stbuf) {
; CHECK-LABEL: gather_mask_qpd:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    kmovq %k1, %k2
; CHECK-NEXT:    vgatherqpd (%rsi,%zmm0,4), %zmm1 {%k2}
; CHECK-NEXT:    vpaddq {{.*}}(%rip), %zmm0, %zmm0
; CHECK-NEXT:    vscatterqpd %zmm1, (%rdx,%zmm0,4) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %x = call <8 x double> @llvm.x86.avx512.gather.qpd.512 (<8 x double> %src, i8* %base, <8 x i64>%ind, i8 %mask, i32 4)
  %ind2 = add <8 x i64> %ind, <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>
  call void @llvm.x86.avx512.scatter.qpd.512 (i8* %stbuf, i8 %mask, <8 x i64>%ind2, <8 x double> %x, i32 4)
  ret void
}
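
; For reference, gather_mask_dps above is roughly the IR produced for this
; Intel intrinsic pair (hypothetical C usage, not part of this test):
;   __m512 x = _mm512_mask_i32gather_ps(src, mask, ind, base, 4);
;   _mm512_mask_i32scatter_ps(stbuf, mask, ind2, x, 4);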

;;
;; Integer Gather/Scatter
;;
declare <16 x i32> @llvm.x86.avx512.gather.dpi.512 (<16 x i32>, i8*, <16 x i32>, i16, i32)
declare void @llvm.x86.avx512.scatter.dpi.512 (i8*, i16, <16 x i32>, <16 x i32>, i32)
declare <8 x i64> @llvm.x86.avx512.gather.dpq.512 (<8 x i64>, i8*, <8 x i32>, i8, i32)
declare void @llvm.x86.avx512.scatter.dpq.512 (i8*, i8, <8 x i32>, <8 x i64>, i32)

declare <8 x i32> @llvm.x86.avx512.gather.qpi.512 (<8 x i32>, i8*, <8 x i64>, i8, i32)
declare void @llvm.x86.avx512.scatter.qpi.512 (i8*, i8, <8 x i64>, <8 x i32>, i32)
declare <8 x i64> @llvm.x86.avx512.gather.qpq.512 (<8 x i64>, i8*, <8 x i64>, i8, i32)
declare void @llvm.x86.avx512.scatter.qpq.512 (i8*, i8, <8 x i64>, <8 x i64>, i32)

define void @gather_mask_dd(<16 x i32> %ind, <16 x i32> %src, i16 %mask, i8* %base, i8* %stbuf) {
; CHECK-LABEL: gather_mask_dd:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    kmovq %k1, %k2
; CHECK-NEXT:    vpgatherdd (%rsi,%zmm0,4), %zmm1 {%k2}
; CHECK-NEXT:    vpaddd {{.*}}(%rip), %zmm0, %zmm0
; CHECK-NEXT:    vpscatterdd %zmm1, (%rdx,%zmm0,4) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %x = call <16 x i32> @llvm.x86.avx512.gather.dpi.512 (<16 x i32> %src, i8* %base, <16 x i32>%ind, i16 %mask, i32 4)
  %ind2 = add <16 x i32> %ind, <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  call void @llvm.x86.avx512.scatter.dpi.512 (i8* %stbuf, i16 %mask, <16 x i32>%ind2, <16 x i32> %x, i32 4)
  ret void
}

define void @gather_mask_qd(<8 x i64> %ind, <8 x i32> %src, i8 %mask, i8* %base, i8* %stbuf) {
; CHECK-LABEL: gather_mask_qd:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    kmovq %k1, %k2
; CHECK-NEXT:    vpgatherqd (%rsi,%zmm0,4), %ymm1 {%k2}
; CHECK-NEXT:    vpaddq {{.*}}(%rip), %zmm0, %zmm0
; CHECK-NEXT:    vpscatterqd %ymm1, (%rdx,%zmm0,4) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %x = call <8 x i32> @llvm.x86.avx512.gather.qpi.512 (<8 x i32> %src, i8* %base, <8 x i64>%ind, i8 %mask, i32 4)
  %ind2 = add <8 x i64> %ind, <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>
  call void @llvm.x86.avx512.scatter.qpi.512 (i8* %stbuf, i8 %mask, <8 x i64>%ind2, <8 x i32> %x, i32 4)
  ret void
}

define void @gather_mask_qq(<8 x i64> %ind, <8 x i64> %src, i8 %mask, i8* %base, i8* %stbuf) {
; CHECK-LABEL: gather_mask_qq:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    kmovq %k1, %k2
; CHECK-NEXT:    vpgatherqq (%rsi,%zmm0,4), %zmm1 {%k2}
; CHECK-NEXT:    vpaddq {{.*}}(%rip), %zmm0, %zmm0
; CHECK-NEXT:    vpscatterqq %zmm1, (%rdx,%zmm0,4) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %x = call <8 x i64> @llvm.x86.avx512.gather.qpq.512 (<8 x i64> %src, i8* %base, <8 x i64>%ind, i8 %mask, i32 4)
  %ind2 = add <8 x i64> %ind, <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>
  call void @llvm.x86.avx512.scatter.qpq.512 (i8* %stbuf, i8 %mask, <8 x i64>%ind2, <8 x i64> %x, i32 4)
  ret void
}

define void @gather_mask_dq(<8 x i32> %ind, <8 x i64> %src, i8 %mask, i8* %base, i8* %stbuf) {
; CHECK-LABEL: gather_mask_dq:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    kmovq %k1, %k2
; CHECK-NEXT:    vpgatherdq (%rsi,%ymm0,4), %zmm1 {%k2}
; CHECK-NEXT:    vpaddd {{.*}}(%rip), %ymm0, %ymm0
; CHECK-NEXT:    vpscatterdq %zmm1, (%rdx,%ymm0,4) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %x = call <8 x i64> @llvm.x86.avx512.gather.dpq.512 (<8 x i64> %src, i8* %base, <8 x i32>%ind, i8 %mask, i32 4)
  %ind2 = add <8 x i32> %ind, <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  call void @llvm.x86.avx512.scatter.dpq.512 (i8* %stbuf, i8 %mask, <8 x i32>%ind2, <8 x i64> %x, i32 4)
  ret void
}
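
; The *_execdomain tests below check that the moves and loads around an FP
; gather/scatter stay in the floating-point domain (vmovapd/vmovaps rather
; than the equivalent integer-domain moves).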

define void @gather_mask_dpd_execdomain(<8 x i32> %ind, <8 x double> %src, i8 %mask, i8* %base, <8 x double>* %stbuf) {
; CHECK-LABEL: gather_mask_dpd_execdomain:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    vgatherdpd (%rsi,%ymm0,4), %zmm1 {%k1}
; CHECK-NEXT:    vmovapd %zmm1, (%rdx)
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %x = call <8 x double> @llvm.x86.avx512.gather.dpd.512 (<8 x double> %src, i8* %base, <8 x i32>%ind, i8 %mask, i32 4)
  store <8 x double> %x, <8 x double>* %stbuf
  ret void
}

define void @gather_mask_qpd_execdomain(<8 x i64> %ind, <8 x double> %src, i8 %mask, i8* %base, <8 x double>* %stbuf) {
; CHECK-LABEL: gather_mask_qpd_execdomain:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    vgatherqpd (%rsi,%zmm0,4), %zmm1 {%k1}
; CHECK-NEXT:    vmovapd %zmm1, (%rdx)
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %x = call <8 x double> @llvm.x86.avx512.gather.qpd.512 (<8 x double> %src, i8* %base, <8 x i64>%ind, i8 %mask, i32 4)
  store <8 x double> %x, <8 x double>* %stbuf
  ret void
}

define <16 x float> @gather_mask_dps_execdomain(<16 x i32> %ind, <16 x float> %src, i16 %mask, i8* %base) {
; CHECK-LABEL: gather_mask_dps_execdomain:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    vgatherdps (%rsi,%zmm0,4), %zmm1 {%k1}
; CHECK-NEXT:    vmovaps %zmm1, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.gather.dps.512 (<16 x float> %src, i8* %base, <16 x i32>%ind, i16 %mask, i32 4)
  ret <16 x float> %res;
}

define <8 x float> @gather_mask_qps_execdomain(<8 x i64> %ind, <8 x float> %src, i8 %mask, i8* %base) {
; CHECK-LABEL: gather_mask_qps_execdomain:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    vgatherqps (%rsi,%zmm0,4), %ymm1 {%k1}
; CHECK-NEXT:    vmovaps %ymm1, %ymm0
; CHECK-NEXT:    retq
  %res = call <8 x float> @llvm.x86.avx512.gather.qps.512 (<8 x float> %src, i8* %base, <8 x i64>%ind, i8 %mask, i32 4)
  ret <8 x float> %res;
}

define void @scatter_mask_dpd_execdomain(<8 x i32> %ind, <8 x double>* %src, i8 %mask, i8* %base, i8* %stbuf) {
; CHECK-LABEL: scatter_mask_dpd_execdomain:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vmovapd (%rdi), %zmm1
; CHECK-NEXT:    vscatterdpd %zmm1, (%rcx,%ymm0,4) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %x = load <8 x double>, <8 x double>* %src, align 64
  call void @llvm.x86.avx512.scatter.dpd.512 (i8* %stbuf, i8 %mask, <8 x i32>%ind, <8 x double> %x, i32 4)
  ret void
}

define void @scatter_mask_qpd_execdomain(<8 x i64> %ind, <8 x double>* %src, i8 %mask, i8* %base, i8* %stbuf) {
; CHECK-LABEL: scatter_mask_qpd_execdomain:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vmovapd (%rdi), %zmm1
; CHECK-NEXT:    vscatterqpd %zmm1, (%rcx,%zmm0,4) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %x = load <8 x double>, <8 x double>* %src, align 64
  call void @llvm.x86.avx512.scatter.qpd.512 (i8* %stbuf, i8 %mask, <8 x i64>%ind, <8 x double> %x, i32 4)
  ret void
}

define void @scatter_mask_dps_execdomain(<16 x i32> %ind, <16 x float>* %src, i16 %mask, i8* %base, i8* %stbuf) {
; CHECK-LABEL: scatter_mask_dps_execdomain:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vmovaps (%rdi), %zmm1
; CHECK-NEXT:    vscatterdps %zmm1, (%rcx,%zmm0,4) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %x = load <16 x float>, <16 x float>* %src, align 64
  call void @llvm.x86.avx512.scatter.dps.512 (i8* %stbuf, i16 %mask, <16 x i32>%ind, <16 x float> %x, i32 4)
  ret void
}

define void @scatter_mask_qps_execdomain(<8 x i64> %ind, <8 x float>* %src, i8 %mask, i8* %base, i8* %stbuf) {
; CHECK-LABEL: scatter_mask_qps_execdomain:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vmovaps (%rdi), %ymm1
; CHECK-NEXT:    vscatterqps %ymm1, (%rcx,%zmm0,4) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %x = load <8 x float>, <8 x float>* %src, align 32
  call void @llvm.x86.avx512.scatter.qps.512 (i8* %stbuf, i8 %mask, <8 x i64>%ind, <8 x float> %x, i32 4)
  ret void
}
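
; Constant masks are materialized directly: all-ones (-1) lowers to
; kxnorw %k0, %k0, %kN, all-zeros to kxorw %k0, %k0, %kN, and any other
; immediate goes through a GPR (movb/movw then kmovd).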

define void @gather_qps(<8 x i64> %ind, <8 x float> %src, i8* %base, i8* %stbuf) {
; CHECK-LABEL: gather_qps:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    kxnorw %k0, %k0, %k2
; CHECK-NEXT:    vgatherqps (%rdi,%zmm0,4), %ymm1 {%k2}
; CHECK-NEXT:    vpaddq {{.*}}(%rip), %zmm0, %zmm0
; CHECK-NEXT:    vscatterqps %ymm1, (%rsi,%zmm0,4) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %x = call <8 x float> @llvm.x86.avx512.gather.qps.512 (<8 x float> %src, i8* %base, <8 x i64>%ind, i8 -1, i32 4)
  %ind2 = add <8 x i64> %ind, <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>
  call void @llvm.x86.avx512.scatter.qps.512 (i8* %stbuf, i8 -1, <8 x i64>%ind2, <8 x float> %x, i32 4)
  ret void
}

declare void @llvm.x86.avx512.gatherpf.qps.512(i8, <8 x i64>, i8* , i32, i32);
declare void @llvm.x86.avx512.scatterpf.qps.512(i8, <8 x i64>, i8* , i32, i32);
define void @prefetch(<8 x i64> %ind, i8* %base) {
; CHECK-LABEL: prefetch:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vgatherpf0qps (%rdi,%zmm0,4) {%k1}
; CHECK-NEXT:    kxorw %k0, %k0, %k1
; CHECK-NEXT:    vgatherpf1qps (%rdi,%zmm0,4) {%k1}
; CHECK-NEXT:    movb $1, %al
; CHECK-NEXT:    kmovd %eax, %k1
; CHECK-NEXT:    vscatterpf0qps (%rdi,%zmm0,2) {%k1}
; CHECK-NEXT:    movb $120, %al
; CHECK-NEXT:    kmovd %eax, %k1
; CHECK-NEXT:    vscatterpf1qps (%rdi,%zmm0,2) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  call void @llvm.x86.avx512.gatherpf.qps.512(i8 -1, <8 x i64> %ind, i8* %base, i32 4, i32 3)
  call void @llvm.x86.avx512.gatherpf.qps.512(i8 0, <8 x i64> %ind, i8* %base, i32 4, i32 2)
  call void @llvm.x86.avx512.scatterpf.qps.512(i8 1, <8 x i64> %ind, i8* %base, i32 2, i32 3)
  call void @llvm.x86.avx512.scatterpf.qps.512(i8 120, <8 x i64> %ind, i8* %base, i32 2, i32 2)
  ret void
}
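
; In @prefetch above, the last i32 operand is the locality hint: 3 (T0)
; selects the pf0 forms and 2 (T1) the pf1 forms; the i32 before it is the
; address scale.

; The tests below cover the 128-/256-bit (AVX512VL) gather intrinsics:
; "div" variants take 64-bit (qword) index vectors, "siv" variants take
; 32-bit (dword) index vectors.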

declare <2 x double> @llvm.x86.avx512.gather3div2.df(<2 x double>, i8*, <2 x i64>, i8, i32)

define <2 x double>@test_int_x86_avx512_gather3div2_df(<2 x double> %x0, i8* %x1, <2 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_gather3div2_df:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vgatherqpd (%rdi,%xmm1,4), %xmm0 {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vgatherqpd (%rdi,%xmm1,2), %xmm2 {%k1}
; CHECK-NEXT:    vaddpd %xmm2, %xmm0, %xmm0
; CHECK-NEXT:    retq
  %res = call <2 x double> @llvm.x86.avx512.gather3div2.df(<2 x double> %x0, i8* %x1, <2 x i64> %x2, i8 %x3, i32 4)
  %res1 = call <2 x double> @llvm.x86.avx512.gather3div2.df(<2 x double> %x0, i8* %x1, <2 x i64> %x2, i8 -1, i32 2)
  %res2 = fadd <2 x double> %res, %res1
  ret <2 x double> %res2
}

declare <2 x i64> @llvm.x86.avx512.gather3div2.di(<2 x i64>, i8*, <2 x i64>, i8, i32)

define <2 x i64>@test_int_x86_avx512_gather3div2_di(<2 x i64> %x0, i8* %x1, <2 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_gather3div2_di:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vpgatherqq (%rdi,%xmm1,8), %xmm0 {%k1}
; CHECK-NEXT:    vpaddq %xmm0, %xmm0, %xmm0
; CHECK-NEXT:    retq
  %res = call <2 x i64> @llvm.x86.avx512.gather3div2.di(<2 x i64> %x0, i8* %x1, <2 x i64> %x2, i8 %x3, i32 8)
  %res1 = call <2 x i64> @llvm.x86.avx512.gather3div2.di(<2 x i64> %x0, i8* %x1, <2 x i64> %x2, i8 %x3, i32 8)
  %res2 = add <2 x i64> %res, %res1
  ret <2 x i64> %res2
}
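
; When the two calls in a test are identical, as in gather3div2_di above,
; the second gather is CSE'd and the sum folds to a doubling of one result;
; tests that vary the mask or scale keep both gathers.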

declare <4 x double> @llvm.x86.avx512.gather3div4.df(<4 x double>, i8*, <4 x i64>, i8, i32)

define <4 x double>@test_int_x86_avx512_gather3div4_df(<4 x double> %x0, i8* %x1, <4 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_gather3div4_df:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vgatherqpd (%rdi,%ymm1,4), %ymm0 {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vgatherqpd (%rdi,%ymm1,2), %ymm2 {%k1}
; CHECK-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
; CHECK-NEXT:    retq
  %res = call <4 x double> @llvm.x86.avx512.gather3div4.df(<4 x double> %x0, i8* %x1, <4 x i64> %x2, i8 %x3, i32 4)
  %res1 = call <4 x double> @llvm.x86.avx512.gather3div4.df(<4 x double> %x0, i8* %x1, <4 x i64> %x2, i8 -1, i32 2)
  %res2 = fadd <4 x double> %res, %res1
  ret <4 x double> %res2
}

declare <4 x i64> @llvm.x86.avx512.gather3div4.di(<4 x i64>, i8*, <4 x i64>, i8, i32)

define <4 x i64>@test_int_x86_avx512_gather3div4_di(<4 x i64> %x0, i8* %x1, <4 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_gather3div4_di:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vpgatherqq (%rdi,%ymm1,8), %ymm0 {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vpgatherqq (%rdi,%ymm1,8), %ymm2 {%k1}
; CHECK-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
; CHECK-NEXT:    retq
  %res = call <4 x i64> @llvm.x86.avx512.gather3div4.di(<4 x i64> %x0, i8* %x1, <4 x i64> %x2, i8 %x3, i32 8)
  %res1 = call <4 x i64> @llvm.x86.avx512.gather3div4.di(<4 x i64> %x0, i8* %x1, <4 x i64> %x2, i8 -1, i32 8)
  %res2 = add <4 x i64> %res, %res1
  ret <4 x i64> %res2
}

declare <4 x float> @llvm.x86.avx512.gather3div4.sf(<4 x float>, i8*, <2 x i64>, i8, i32)

define <4 x float>@test_int_x86_avx512_gather3div4_sf(<4 x float> %x0, i8* %x1, <2 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_gather3div4_sf:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vgatherqps (%rdi,%xmm1,4), %xmm0 {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vgatherqps (%rdi,%xmm1,2), %xmm2 {%k1}
; CHECK-NEXT:    vaddps %xmm2, %xmm0, %xmm0
; CHECK-NEXT:    retq
  %res = call <4 x float> @llvm.x86.avx512.gather3div4.sf(<4 x float> %x0, i8* %x1, <2 x i64> %x2, i8 %x3, i32 4)
  %res1 = call <4 x float> @llvm.x86.avx512.gather3div4.sf(<4 x float> %x0, i8* %x1, <2 x i64> %x2, i8 -1, i32 2)
  %res2 = fadd <4 x float> %res, %res1
  ret <4 x float> %res2
}

declare <4 x i32> @llvm.x86.avx512.gather3div4.si(<4 x i32>, i8*, <2 x i64>, i8, i32)

define <4 x i32>@test_int_x86_avx512_gather3div4_si(<4 x i32> %x0, i8* %x1, <2 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_gather3div4_si:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    kxnorw %k0, %k0, %k2
; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vpgatherqd (%rdi,%xmm1,4), %xmm2 {%k2}
; CHECK-NEXT:    vpgatherqd (%rdi,%xmm1,4), %xmm0 {%k1}
; CHECK-NEXT:    vpaddd %xmm0, %xmm2, %xmm0
; CHECK-NEXT:    retq
  %res = call <4 x i32> @llvm.x86.avx512.gather3div4.si(<4 x i32> %x0, i8* %x1, <2 x i64> %x2, i8 -1, i32 4)
  %res1 = call <4 x i32> @llvm.x86.avx512.gather3div4.si(<4 x i32> %x0, i8* %x1, <2 x i64> %x2, i8 %x3, i32 4)
  %res2 = add <4 x i32> %res, %res1
  ret <4 x i32> %res2
}
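
; In the div8 tests below, qword indices with dword elements mean a ymm
; index vector yields only an xmm result; vzeroupper is emitted because a
; 256-bit register was used.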

declare <4 x float> @llvm.x86.avx512.gather3div8.sf(<4 x float>, i8*, <4 x i64>, i8, i32)

define <4 x float>@test_int_x86_avx512_gather3div8_sf(<4 x float> %x0, i8* %x1, <4 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_gather3div8_sf:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vgatherqps (%rdi,%ymm1,4), %xmm0 {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vgatherqps (%rdi,%ymm1,2), %xmm2 {%k1}
; CHECK-NEXT:    vaddps %xmm2, %xmm0, %xmm0
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %res = call <4 x float> @llvm.x86.avx512.gather3div8.sf(<4 x float> %x0, i8* %x1, <4 x i64> %x2, i8 %x3, i32 4)
  %res1 = call <4 x float> @llvm.x86.avx512.gather3div8.sf(<4 x float> %x0, i8* %x1, <4 x i64> %x2, i8 -1, i32 2)
  %res2 = fadd <4 x float> %res, %res1
  ret <4 x float> %res2
}

declare <4 x i32> @llvm.x86.avx512.gather3div8.si(<4 x i32>, i8*, <4 x i64>, i8, i32)

define <4 x i32>@test_int_x86_avx512_gather3div8_si(<4 x i32> %x0, i8* %x1, <4 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_gather3div8_si:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vmovdqa %xmm0, %xmm2
; CHECK-NEXT:    kmovq %k1, %k2
; CHECK-NEXT:    vpgatherqd (%rdi,%ymm1,4), %xmm2 {%k2}
; CHECK-NEXT:    vpgatherqd (%rdi,%ymm1,2), %xmm0 {%k1}
; CHECK-NEXT:    vpaddd %xmm0, %xmm2, %xmm0
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %res = call <4 x i32> @llvm.x86.avx512.gather3div8.si(<4 x i32> %x0, i8* %x1, <4 x i64> %x2, i8 %x3, i32 4)
  %res1 = call <4 x i32> @llvm.x86.avx512.gather3div8.si(<4 x i32> %x0, i8* %x1, <4 x i64> %x2, i8 %x3, i32 2)
  %res2 = add <4 x i32> %res, %res1
  ret <4 x i32> %res2
}

declare <2 x double> @llvm.x86.avx512.gather3siv2.df(<2 x double>, i8*, <4 x i32>, i8, i32)

define <2 x double>@test_int_x86_avx512_gather3siv2_df(<2 x double> %x0, i8* %x1, <4 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_gather3siv2_df:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vgatherdpd (%rdi,%xmm1,4), %xmm0 {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vgatherdpd (%rdi,%xmm1,2), %xmm2 {%k1}
; CHECK-NEXT:    vaddpd %xmm2, %xmm0, %xmm0
; CHECK-NEXT:    retq
  %res = call <2 x double> @llvm.x86.avx512.gather3siv2.df(<2 x double> %x0, i8* %x1, <4 x i32> %x2, i8 %x3, i32 4)
  %res1 = call <2 x double> @llvm.x86.avx512.gather3siv2.df(<2 x double> %x0, i8* %x1, <4 x i32> %x2, i8 -1, i32 2)
  %res2 = fadd <2 x double> %res, %res1
  ret <2 x double> %res2
}

declare <2 x i64> @llvm.x86.avx512.gather3siv2.di(<2 x i64>, i8*, <4 x i32>, i8, i32)

define <2 x i64>@test_int_x86_avx512_gather3siv2_di(<2 x i64> %x0, i8* %x1, <4 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_gather3siv2_di:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vpgatherdq (%rdi,%xmm1,8), %xmm0 {%k1}
; CHECK-NEXT:    vpaddq %xmm0, %xmm0, %xmm0
; CHECK-NEXT:    retq
  %res = call <2 x i64> @llvm.x86.avx512.gather3siv2.di(<2 x i64> %x0, i8* %x1, <4 x i32> %x2, i8 %x3, i32 8)
  %res1 = call <2 x i64> @llvm.x86.avx512.gather3siv2.di(<2 x i64> %x0, i8* %x1, <4 x i32> %x2, i8 %x3, i32 8)
  %res2 = add <2 x i64> %res, %res1
  ret <2 x i64> %res2
}

declare <4 x double> @llvm.x86.avx512.gather3siv4.df(<4 x double>, i8*, <4 x i32>, i8, i32)

define <4 x double>@test_int_x86_avx512_gather3siv4_df(<4 x double> %x0, i8* %x1, <4 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_gather3siv4_df:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vgatherdpd (%rdi,%xmm1,4), %ymm0 {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vgatherdpd (%rdi,%xmm1,2), %ymm2 {%k1}
; CHECK-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
; CHECK-NEXT:    retq
  %res = call <4 x double> @llvm.x86.avx512.gather3siv4.df(<4 x double> %x0, i8* %x1, <4 x i32> %x2, i8 %x3, i32 4)
  %res1 = call <4 x double> @llvm.x86.avx512.gather3siv4.df(<4 x double> %x0, i8* %x1, <4 x i32> %x2, i8 -1, i32 2)
  %res2 = fadd <4 x double> %res, %res1
  ret <4 x double> %res2
}

declare <4 x i64> @llvm.x86.avx512.gather3siv4.di(<4 x i64>, i8*, <4 x i32>, i8, i32)

define <4 x i64>@test_int_x86_avx512_gather3siv4_di(<4 x i64> %x0, i8* %x1, <4 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_gather3siv4_di:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vpgatherdq (%rdi,%xmm1,8), %ymm0 {%k1}
; CHECK-NEXT:    vpaddq %ymm0, %ymm0, %ymm0
; CHECK-NEXT:    retq
  %res = call <4 x i64> @llvm.x86.avx512.gather3siv4.di(<4 x i64> %x0, i8* %x1, <4 x i32> %x2, i8 %x3, i32 8)
  %res1 = call <4 x i64> @llvm.x86.avx512.gather3siv4.di(<4 x i64> %x0, i8* %x1, <4 x i32> %x2, i8 %x3, i32 8)
  %res2 = add <4 x i64> %res, %res1
  ret <4 x i64> %res2
}

declare <4 x float> @llvm.x86.avx512.gather3siv4.sf(<4 x float>, i8*, <4 x i32>, i8, i32)

define <4 x float>@test_int_x86_avx512_gather3siv4_sf(<4 x float> %x0, i8* %x1, <4 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_gather3siv4_sf:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vgatherdps (%rdi,%xmm1,4), %xmm0 {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vgatherdps (%rdi,%xmm1,2), %xmm2 {%k1}
; CHECK-NEXT:    vaddps %xmm2, %xmm0, %xmm0
; CHECK-NEXT:    retq
  %res = call <4 x float> @llvm.x86.avx512.gather3siv4.sf(<4 x float> %x0, i8* %x1, <4 x i32> %x2, i8 %x3, i32 4)
  %res1 = call <4 x float> @llvm.x86.avx512.gather3siv4.sf(<4 x float> %x0, i8* %x1, <4 x i32> %x2, i8 -1, i32 2)
  %res2 = fadd <4 x float> %res, %res1
  ret <4 x float> %res2
}

declare <4 x i32> @llvm.x86.avx512.gather3siv4.si(<4 x i32>, i8*, <4 x i32>, i8, i32)

define <4 x i32>@test_int_x86_avx512_gather3siv4_si(<4 x i32> %x0, i8* %x1, <4 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_gather3siv4_si:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    kxnorw %k0, %k0, %k2
; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vpgatherdd (%rdi,%xmm1,4), %xmm2 {%k2}
; CHECK-NEXT:    vpgatherdd (%rdi,%xmm1,2), %xmm0 {%k1}
; CHECK-NEXT:    vpaddd %xmm0, %xmm2, %xmm0
; CHECK-NEXT:    retq
  %res = call <4 x i32> @llvm.x86.avx512.gather3siv4.si(<4 x i32> %x0, i8* %x1, <4 x i32> %x2, i8 -1, i32 4)
  %res1 = call <4 x i32> @llvm.x86.avx512.gather3siv4.si(<4 x i32> %x0, i8* %x1, <4 x i32> %x2, i8 %x3, i32 2)
  %res2 = add <4 x i32> %res, %res1
  ret <4 x i32> %res2
}

declare <8 x float> @llvm.x86.avx512.gather3siv8.sf(<8 x float>, i8*, <8 x i32>, i8, i32)

define <8 x float>@test_int_x86_avx512_gather3siv8_sf(<8 x float> %x0, i8* %x1, <8 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_gather3siv8_sf:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vgatherdps (%rdi,%ymm1,4), %ymm0 {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vgatherdps (%rdi,%ymm1,2), %ymm2 {%k1}
; CHECK-NEXT:    vaddps %ymm2, %ymm0, %ymm0
; CHECK-NEXT:    retq
  %res = call <8 x float> @llvm.x86.avx512.gather3siv8.sf(<8 x float> %x0, i8* %x1, <8 x i32> %x2, i8 %x3, i32 4)
  %res1 = call <8 x float> @llvm.x86.avx512.gather3siv8.sf(<8 x float> %x0, i8* %x1, <8 x i32> %x2, i8 -1, i32 2)
  %res2 = fadd <8 x float> %res, %res1
  ret <8 x float> %res2
}

declare <8 x i32> @llvm.x86.avx512.gather3siv8.si(<8 x i32>, i8*, <8 x i32>, i8, i32)

define <8 x i32>@test_int_x86_avx512_gather3siv8_si(<8 x i32> %x0, i8* %x1, <8 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_gather3siv8_si:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vmovdqa %ymm0, %ymm2
; CHECK-NEXT:    kmovq %k1, %k2
; CHECK-NEXT:    vpgatherdd (%rdi,%ymm1,4), %ymm2 {%k2}
; CHECK-NEXT:    vpgatherdd (%rdi,%ymm1,2), %ymm0 {%k1}
; CHECK-NEXT:    vpaddd %ymm0, %ymm2, %ymm0
; CHECK-NEXT:    retq
  %res = call <8 x i32> @llvm.x86.avx512.gather3siv8.si(<8 x i32> %x0, i8* %x1, <8 x i32> %x2, i8 %x3, i32 4)
  %res1 = call <8 x i32> @llvm.x86.avx512.gather3siv8.si(<8 x i32> %x0, i8* %x1, <8 x i32> %x2, i8 %x3, i32 2)
  %res2 = add <8 x i32> %res, %res1
  ret <8 x i32> %res2
}
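
; Scatter counterparts of the gathers above: (base pointer, mask, index
; vector, data, scale), returning void. Each test stores once with the
; incoming mask and once with an all-ones mask, at different scales.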

declare void @llvm.x86.avx512.scatterdiv2.df(i8*, i8, <2 x i64>, <2 x double>, i32)

define void@test_int_x86_avx512_scatterdiv2_df(i8* %x0, i8 %x1, <2 x i64> %x2, <2 x double> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scatterdiv2_df:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    kxnorw %k0, %k0, %k2
; CHECK-NEXT:    vscatterqpd %xmm1, (%rdi,%xmm0,2) {%k2}
; CHECK-NEXT:    vscatterqpd %xmm1, (%rdi,%xmm0,4) {%k1}
; CHECK-NEXT:    retq
  call void @llvm.x86.avx512.scatterdiv2.df(i8* %x0, i8 -1, <2 x i64> %x2, <2 x double> %x3, i32 2)
  call void @llvm.x86.avx512.scatterdiv2.df(i8* %x0, i8 %x1, <2 x i64> %x2, <2 x double> %x3, i32 4)
  ret void
}

declare void @llvm.x86.avx512.scatterdiv2.di(i8*, i8, <2 x i64>, <2 x i64>, i32)

define void@test_int_x86_avx512_scatterdiv2_di(i8* %x0, i8 %x1, <2 x i64> %x2, <2 x i64> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scatterdiv2_di:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vpscatterqq %xmm1, (%rdi,%xmm0,2) {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vpscatterqq %xmm1, (%rdi,%xmm0,4) {%k1}
; CHECK-NEXT:    retq
  call void @llvm.x86.avx512.scatterdiv2.di(i8* %x0, i8 %x1, <2 x i64> %x2, <2 x i64> %x3, i32 2)
  call void @llvm.x86.avx512.scatterdiv2.di(i8* %x0, i8 -1, <2 x i64> %x2, <2 x i64> %x3, i32 4)
  ret void
}

declare void @llvm.x86.avx512.scatterdiv4.df(i8*, i8, <4 x i64>, <4 x double>, i32)

define void@test_int_x86_avx512_scatterdiv4_df(i8* %x0, i8 %x1, <4 x i64> %x2, <4 x double> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scatterdiv4_df:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vscatterqpd %ymm1, (%rdi,%ymm0,2) {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vscatterqpd %ymm1, (%rdi,%ymm0,4) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  call void @llvm.x86.avx512.scatterdiv4.df(i8* %x0, i8 %x1, <4 x i64> %x2, <4 x double> %x3, i32 2)
  call void @llvm.x86.avx512.scatterdiv4.df(i8* %x0, i8 -1, <4 x i64> %x2, <4 x double> %x3, i32 4)
  ret void
}

declare void @llvm.x86.avx512.scatterdiv4.di(i8*, i8, <4 x i64>, <4 x i64>, i32)

define void@test_int_x86_avx512_scatterdiv4_di(i8* %x0, i8 %x1, <4 x i64> %x2, <4 x i64> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scatterdiv4_di:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vpscatterqq %ymm1, (%rdi,%ymm0,2) {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vpscatterqq %ymm1, (%rdi,%ymm0,4) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  call void @llvm.x86.avx512.scatterdiv4.di(i8* %x0, i8 %x1, <4 x i64> %x2, <4 x i64> %x3, i32 2)
  call void @llvm.x86.avx512.scatterdiv4.di(i8* %x0, i8 -1, <4 x i64> %x2, <4 x i64> %x3, i32 4)
  ret void
}

declare void @llvm.x86.avx512.scatterdiv4.sf(i8*, i8, <2 x i64>, <4 x float>, i32)

define void@test_int_x86_avx512_scatterdiv4_sf(i8* %x0, i8 %x1, <2 x i64> %x2, <4 x float> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scatterdiv4_sf:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vscatterqps %xmm1, (%rdi,%xmm0,2) {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vscatterqps %xmm1, (%rdi,%xmm0,4) {%k1}
; CHECK-NEXT:    retq
  call void @llvm.x86.avx512.scatterdiv4.sf(i8* %x0, i8 %x1, <2 x i64> %x2, <4 x float> %x3, i32 2)
  call void @llvm.x86.avx512.scatterdiv4.sf(i8* %x0, i8 -1, <2 x i64> %x2, <4 x float> %x3, i32 4)
  ret void
}

declare void @llvm.x86.avx512.scatterdiv4.si(i8*, i8, <2 x i64>, <4 x i32>, i32)

define void@test_int_x86_avx512_scatterdiv4_si(i8* %x0, i8 %x1, <2 x i64> %x2, <4 x i32> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scatterdiv4_si:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    kxnorw %k0, %k0, %k2
; CHECK-NEXT:    vpscatterqd %xmm1, (%rdi,%xmm0,2) {%k2}
; CHECK-NEXT:    vpscatterqd %xmm1, (%rdi,%xmm0,4) {%k1}
; CHECK-NEXT:    retq
  call void @llvm.x86.avx512.scatterdiv4.si(i8* %x0, i8 -1, <2 x i64> %x2, <4 x i32> %x3, i32 2)
  call void @llvm.x86.avx512.scatterdiv4.si(i8* %x0, i8 %x1, <2 x i64> %x2, <4 x i32> %x3, i32 4)
  ret void
}

declare void @llvm.x86.avx512.scatterdiv8.sf(i8*, i8, <4 x i64>, <4 x float>, i32)

define void@test_int_x86_avx512_scatterdiv8_sf(i8* %x0, i8 %x1, <4 x i64> %x2, <4 x float> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scatterdiv8_sf:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vscatterqps %xmm1, (%rdi,%ymm0,2) {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vscatterqps %xmm1, (%rdi,%ymm0,4) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  call void @llvm.x86.avx512.scatterdiv8.sf(i8* %x0, i8 %x1, <4 x i64> %x2, <4 x float> %x3, i32 2)
  call void @llvm.x86.avx512.scatterdiv8.sf(i8* %x0, i8 -1, <4 x i64> %x2, <4 x float> %x3, i32 4)
  ret void
}

declare void @llvm.x86.avx512.scatterdiv8.si(i8*, i8, <4 x i64>, <4 x i32>, i32)

define void@test_int_x86_avx512_scatterdiv8_si(i8* %x0, i8 %x1, <4 x i64> %x2, <4 x i32> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scatterdiv8_si:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vpscatterqd %xmm1, (%rdi,%ymm0,2) {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vpscatterqd %xmm1, (%rdi,%ymm0,4) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  call void @llvm.x86.avx512.scatterdiv8.si(i8* %x0, i8 %x1, <4 x i64> %x2, <4 x i32> %x3, i32 2)
  call void @llvm.x86.avx512.scatterdiv8.si(i8* %x0, i8 -1, <4 x i64> %x2, <4 x i32> %x3, i32 4)
  ret void
}

declare void @llvm.x86.avx512.scattersiv2.df(i8*, i8, <4 x i32>, <2 x double>, i32)

define void@test_int_x86_avx512_scattersiv2_df(i8* %x0, i8 %x1, <4 x i32> %x2, <2 x double> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scattersiv2_df:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    kxnorw %k0, %k0, %k2
; CHECK-NEXT:    vscatterdpd %xmm1, (%rdi,%xmm0,2) {%k2}
; CHECK-NEXT:    vscatterdpd %xmm1, (%rdi,%xmm0,4) {%k1}
; CHECK-NEXT:    retq
  call void @llvm.x86.avx512.scattersiv2.df(i8* %x0, i8 -1, <4 x i32> %x2, <2 x double> %x3, i32 2)
  call void @llvm.x86.avx512.scattersiv2.df(i8* %x0, i8 %x1, <4 x i32> %x2, <2 x double> %x3, i32 4)
  ret void
}

declare void @llvm.x86.avx512.scattersiv2.di(i8*, i8, <4 x i32>, <2 x i64>, i32)

define void@test_int_x86_avx512_scattersiv2_di(i8* %x0, i8 %x1, <4 x i32> %x2, <2 x i64> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scattersiv2_di:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    kxnorw %k0, %k0, %k2
; CHECK-NEXT:    vpscatterdq %xmm1, (%rdi,%xmm0,2) {%k2}
; CHECK-NEXT:    vpscatterdq %xmm1, (%rdi,%xmm0,4) {%k1}
; CHECK-NEXT:    retq
  call void @llvm.x86.avx512.scattersiv2.di(i8* %x0, i8 -1, <4 x i32> %x2, <2 x i64> %x3, i32 2)
  call void @llvm.x86.avx512.scattersiv2.di(i8* %x0, i8 %x1, <4 x i32> %x2, <2 x i64> %x3, i32 4)
  ret void
}

declare void @llvm.x86.avx512.scattersiv4.df(i8*, i8, <4 x i32>, <4 x double>, i32)

define void@test_int_x86_avx512_scattersiv4_df(i8* %x0, i8 %x1, <4 x i32> %x2, <4 x double> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scattersiv4_df:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vscatterdpd %ymm1, (%rdi,%xmm0,2) {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vscatterdpd %ymm1, (%rdi,%xmm0,4) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  call void @llvm.x86.avx512.scattersiv4.df(i8* %x0, i8 %x1, <4 x i32> %x2, <4 x double> %x3, i32 2)
  call void @llvm.x86.avx512.scattersiv4.df(i8* %x0, i8 -1, <4 x i32> %x2, <4 x double> %x3, i32 4)
  ret void
}

declare void @llvm.x86.avx512.scattersiv4.di(i8*, i8, <4 x i32>, <4 x i64>, i32)

define void@test_int_x86_avx512_scattersiv4_di(i8* %x0, i8 %x1, <4 x i32> %x2, <4 x i64> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scattersiv4_di:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    kxnorw %k0, %k0, %k2
; CHECK-NEXT:    vpscatterdq %ymm1, (%rdi,%xmm0,2) {%k2}
; CHECK-NEXT:    vpscatterdq %ymm1, (%rdi,%xmm0,4) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  call void @llvm.x86.avx512.scattersiv4.di(i8* %x0, i8 -1, <4 x i32> %x2, <4 x i64> %x3, i32 2)
  call void @llvm.x86.avx512.scattersiv4.di(i8* %x0, i8 %x1, <4 x i32> %x2, <4 x i64> %x3, i32 4)
  ret void
}

declare void @llvm.x86.avx512.scattersiv4.sf(i8*, i8, <4 x i32>, <4 x float>, i32)

define void@test_int_x86_avx512_scattersiv4_sf(i8* %x0, i8 %x1, <4 x i32> %x2, <4 x float> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scattersiv4_sf:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vscatterdps %xmm1, (%rdi,%xmm0,2) {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vscatterdps %xmm1, (%rdi,%xmm0,4) {%k1}
; CHECK-NEXT:    retq
  call void @llvm.x86.avx512.scattersiv4.sf(i8* %x0, i8 %x1, <4 x i32> %x2, <4 x float> %x3, i32 2)
  call void @llvm.x86.avx512.scattersiv4.sf(i8* %x0, i8 -1, <4 x i32> %x2, <4 x float> %x3, i32 4)
  ret void
}

declare void @llvm.x86.avx512.scattersiv4.si(i8*, i8, <4 x i32>, <4 x i32>, i32)

define void@test_int_x86_avx512_scattersiv4_si(i8* %x0, i8 %x1, <4 x i32> %x2, <4 x i32> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scattersiv4_si:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vpscatterdd %xmm1, (%rdi,%xmm0,2) {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vpscatterdd %xmm1, (%rdi,%xmm0,4) {%k1}
; CHECK-NEXT:    retq
  call void @llvm.x86.avx512.scattersiv4.si(i8* %x0, i8 %x1, <4 x i32> %x2, <4 x i32> %x3, i32 2)
  call void @llvm.x86.avx512.scattersiv4.si(i8* %x0, i8 -1, <4 x i32> %x2, <4 x i32> %x3, i32 4)
  ret void
}

declare void @llvm.x86.avx512.scattersiv8.sf(i8*, i8, <8 x i32>, <8 x float>, i32)

define void@test_int_x86_avx512_scattersiv8_sf(i8* %x0, i8 %x1, <8 x i32> %x2, <8 x float> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scattersiv8_sf:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vscatterdps %ymm1, (%rdi,%ymm0,2) {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vscatterdps %ymm1, (%rdi,%ymm0,4) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  call void @llvm.x86.avx512.scattersiv8.sf(i8* %x0, i8 %x1, <8 x i32> %x2, <8 x float> %x3, i32 2)
  call void @llvm.x86.avx512.scattersiv8.sf(i8* %x0, i8 -1, <8 x i32> %x2, <8 x float> %x3, i32 4)
  ret void
}

declare void @llvm.x86.avx512.scattersiv8.si(i8*, i8, <8 x i32>, <8 x i32>, i32)

define void@test_int_x86_avx512_scattersiv8_si(i8* %x0, i8 %x1, <8 x i32> %x2, <8 x i32> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scattersiv8_si:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vpscatterdd %ymm1, (%rdi,%ymm0,2) {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vpscatterdd %ymm1, (%rdi,%ymm0,4) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  call void @llvm.x86.avx512.scattersiv8.si(i8* %x0, i8 %x1, <8 x i32> %x2, <8 x i32> %x3, i32 2)
  call void @llvm.x86.avx512.scattersiv8.si(i8* %x0, i8 -1, <8 x i32> %x2, <8 x i32> %x3, i32 4)
  ret void
}
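
; The final two tests sweep representative constant masks (-1, 0, a single
; bit, a multi-bit pattern) through one scatter and one gather shape to
; cover the mask-materialization paths end to end.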

define void @scatter_mask_test(i8* %x0, <8 x i32> %x2, <8 x i32> %x3) {
; CHECK-LABEL: scatter_mask_test:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vpscatterdd %ymm1, (%rdi,%ymm0,2) {%k1}
; CHECK-NEXT:    kxorw %k0, %k0, %k1
; CHECK-NEXT:    vpscatterdd %ymm1, (%rdi,%ymm0,4) {%k1}
; CHECK-NEXT:    movb $1, %al
; CHECK-NEXT:    kmovd %eax, %k1
; CHECK-NEXT:    vpscatterdd %ymm1, (%rdi,%ymm0,2) {%k1}
; CHECK-NEXT:    movb $96, %al
; CHECK-NEXT:    kmovd %eax, %k1
; CHECK-NEXT:    vpscatterdd %ymm1, (%rdi,%ymm0,4) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  call void @llvm.x86.avx512.scattersiv8.si(i8* %x0, i8 -1, <8 x i32> %x2, <8 x i32> %x3, i32 2)
  call void @llvm.x86.avx512.scattersiv8.si(i8* %x0, i8 0, <8 x i32> %x2, <8 x i32> %x3, i32 4)
  call void @llvm.x86.avx512.scattersiv8.si(i8* %x0, i8 1, <8 x i32> %x2, <8 x i32> %x3, i32 2)
  call void @llvm.x86.avx512.scattersiv8.si(i8* %x0, i8 96, <8 x i32> %x2, <8 x i32> %x3, i32 4)
  ret void
}

define <16 x float> @gather_mask_test(<16 x i32> %ind, <16 x float> %src, i8* %base) {
; CHECK-LABEL: gather_mask_test:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vgatherdps (%rdi,%zmm0,4), %zmm2 {%k1}
; CHECK-NEXT:    kxorw %k0, %k0, %k1
; CHECK-NEXT:    vmovaps %zmm1, %zmm3
; CHECK-NEXT:    vgatherdps (%rdi,%zmm0,4), %zmm3 {%k1}
; CHECK-NEXT:    vaddps %zmm3, %zmm2, %zmm2
; CHECK-NEXT:    movw $1, %ax
; CHECK-NEXT:    kmovd %eax, %k1
; CHECK-NEXT:    vmovaps %zmm1, %zmm3
; CHECK-NEXT:    vgatherdps (%rdi,%zmm0,4), %zmm3 {%k1}
; CHECK-NEXT:    movw $220, %ax
; CHECK-NEXT:    kmovd %eax, %k1
; CHECK-NEXT:    vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
; CHECK-NEXT:    vaddps %zmm3, %zmm1, %zmm0
; CHECK-NEXT:    vaddps %zmm2, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.gather.dps.512 (<16 x float> %src, i8* %base, <16 x i32>%ind, i16 -1, i32 4)
  %res1 = call <16 x float> @llvm.x86.avx512.gather.dps.512 (<16 x float> %src, i8* %base, <16 x i32>%ind, i16 0, i32 4)
  %res2 = call <16 x float> @llvm.x86.avx512.gather.dps.512 (<16 x float> %src, i8* %base, <16 x i32>%ind, i16 1, i32 4)
  %res3 = call <16 x float> @llvm.x86.avx512.gather.dps.512 (<16 x float> %src, i8* %base, <16 x i32>%ind, i16 220, i32 4)

  %res4 = fadd <16 x float> %res, %res1
  %res5 = fadd <16 x float> %res3, %res2
  %res6 = fadd <16 x float> %res5, %res4
  ret <16 x float> %res6
}