; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck %s

declare <16 x float> @llvm.x86.avx512.gather.dps.512 (<16 x float>, i8*, <16 x i32>, i16, i32)
declare void @llvm.x86.avx512.scatter.dps.512 (i8*, i16, <16 x i32>, <16 x float>, i32)
declare <8 x double> @llvm.x86.avx512.gather.dpd.512 (<8 x double>, i8*, <8 x i32>, i8, i32)
declare void @llvm.x86.avx512.scatter.dpd.512 (i8*, i8, <8 x i32>, <8 x double>, i32)

declare <8 x float> @llvm.x86.avx512.gather.qps.512 (<8 x float>, i8*, <8 x i64>, i8, i32)
declare void @llvm.x86.avx512.scatter.qps.512 (i8*, i8, <8 x i64>, <8 x float>, i32)
declare <8 x double> @llvm.x86.avx512.gather.qpd.512 (<8 x double>, i8*, <8 x i64>, i8, i32)
declare void @llvm.x86.avx512.scatter.qpd.512 (i8*, i8, <8 x i64>, <8 x double>, i32)

define void @gather_mask_dps(<16 x i32> %ind, <16 x float> %src, i16 %mask, i8* %base, i8* %stbuf) {
; CHECK-LABEL: gather_mask_dps:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    kmovq %k1, %k2
; CHECK-NEXT:    vgatherdps (%rsi,%zmm0,4), %zmm1 {%k2}
; CHECK-NEXT:    vpaddd {{.*}}(%rip), %zmm0, %zmm0
; CHECK-NEXT:    vscatterdps %zmm1, (%rdx,%zmm0,4) {%k1}
; CHECK-NEXT:    retq
  %x = call <16 x float> @llvm.x86.avx512.gather.dps.512 (<16 x float> %src, i8* %base, <16 x i32>%ind, i16 %mask, i32 4)
  %ind2 = add <16 x i32> %ind, <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  call void @llvm.x86.avx512.scatter.dps.512 (i8* %stbuf, i16 %mask, <16 x i32>%ind2, <16 x float> %x, i32 4)
  ret void
}

define void @gather_mask_dpd(<8 x i32> %ind, <8 x double> %src, i8 %mask, i8* %base, i8* %stbuf) {
; CHECK-LABEL: gather_mask_dpd:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovb %edi, %k1
; CHECK-NEXT:    kmovq %k1, %k2
; CHECK-NEXT:    vgatherdpd (%rsi,%ymm0,4), %zmm1 {%k2}
; CHECK-NEXT:    vpaddd {{.*}}(%rip), %ymm0, %ymm0
; CHECK-NEXT:    vscatterdpd %zmm1, (%rdx,%ymm0,4) {%k1}
; CHECK-NEXT:    retq
  %x = call <8 x double> @llvm.x86.avx512.gather.dpd.512 (<8 x double> %src, i8* %base, <8 x i32>%ind, i8 %mask, i32 4)
  %ind2 = add <8 x i32> %ind, <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  call void @llvm.x86.avx512.scatter.dpd.512 (i8* %stbuf, i8 %mask, <8 x i32>%ind2, <8 x double> %x, i32 4)
  ret void
}

define void @gather_mask_qps(<8 x i64> %ind, <8 x float> %src, i8 %mask, i8* %base, i8* %stbuf) {
; CHECK-LABEL: gather_mask_qps:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovb %edi, %k1
; CHECK-NEXT:    kmovq %k1, %k2
; CHECK-NEXT:    vgatherqps (%rsi,%zmm0,4), %ymm1 {%k2}
; CHECK-NEXT:    vpaddq {{.*}}(%rip), %zmm0, %zmm0
; CHECK-NEXT:    vscatterqps %ymm1, (%rdx,%zmm0,4) {%k1}
; CHECK-NEXT:    retq
  %x = call <8 x float> @llvm.x86.avx512.gather.qps.512 (<8 x float> %src, i8* %base, <8 x i64>%ind, i8 %mask, i32 4)
  %ind2 = add <8 x i64> %ind, <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>
  call void @llvm.x86.avx512.scatter.qps.512 (i8* %stbuf, i8 %mask, <8 x i64>%ind2, <8 x float> %x, i32 4)
  ret void
}

define void @gather_mask_qpd(<8 x i64> %ind, <8 x double> %src, i8 %mask, i8* %base, i8* %stbuf) {
; CHECK-LABEL: gather_mask_qpd:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovb %edi, %k1
; CHECK-NEXT:    kmovq %k1, %k2
; CHECK-NEXT:    vgatherqpd (%rsi,%zmm0,4), %zmm1 {%k2}
; CHECK-NEXT:    vpaddq {{.*}}(%rip), %zmm0, %zmm0
; CHECK-NEXT:    vscatterqpd %zmm1, (%rdx,%zmm0,4) {%k1}
; CHECK-NEXT:    retq
  %x = call <8 x double> @llvm.x86.avx512.gather.qpd.512 (<8 x double> %src, i8* %base, <8 x i64>%ind, i8 %mask, i32 4)
  %ind2 = add <8 x i64> %ind, <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>
  call void @llvm.x86.avx512.scatter.qpd.512 (i8* %stbuf, i8 %mask, <8 x i64>%ind2, <8 x double> %x, i32 4)
  ret void
}
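
; Note: in these intrinsic names the d/q letter gives the index width
; (d = 32-bit dword indices, q = 64-bit qword indices) and ps/pd/pi/pq the
; element type; the trailing i32 operand is the address scale. The kmovq
; copies in the checks appear to be needed because a gather or scatter
; clears its mask register as elements complete, so a mask reused by the
; following scatter is duplicated first.
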
;;
;; Integer Gather/Scatter
;;
declare <16 x i32> @llvm.x86.avx512.gather.dpi.512 (<16 x i32>, i8*, <16 x i32>, i16, i32)
declare void @llvm.x86.avx512.scatter.dpi.512 (i8*, i16, <16 x i32>, <16 x i32>, i32)
declare <8 x i64> @llvm.x86.avx512.gather.dpq.512 (<8 x i64>, i8*, <8 x i32>, i8, i32)
declare void @llvm.x86.avx512.scatter.dpq.512 (i8*, i8, <8 x i32>, <8 x i64>, i32)

declare <8 x i32> @llvm.x86.avx512.gather.qpi.512 (<8 x i32>, i8*, <8 x i64>, i8, i32)
declare void @llvm.x86.avx512.scatter.qpi.512 (i8*, i8, <8 x i64>, <8 x i32>, i32)
declare <8 x i64> @llvm.x86.avx512.gather.qpq.512 (<8 x i64>, i8*, <8 x i64>, i8, i32)
declare void @llvm.x86.avx512.scatter.qpq.512 (i8*, i8, <8 x i64>, <8 x i64>, i32)

define void @gather_mask_dd(<16 x i32> %ind, <16 x i32> %src, i16 %mask, i8* %base, i8* %stbuf) {
; CHECK-LABEL: gather_mask_dd:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    kmovq %k1, %k2
; CHECK-NEXT:    vpgatherdd (%rsi,%zmm0,4), %zmm1 {%k2}
; CHECK-NEXT:    vpaddd {{.*}}(%rip), %zmm0, %zmm0
; CHECK-NEXT:    vpscatterdd %zmm1, (%rdx,%zmm0,4) {%k1}
; CHECK-NEXT:    retq
  %x = call <16 x i32> @llvm.x86.avx512.gather.dpi.512 (<16 x i32> %src, i8* %base, <16 x i32>%ind, i16 %mask, i32 4)
  %ind2 = add <16 x i32> %ind, <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  call void @llvm.x86.avx512.scatter.dpi.512 (i8* %stbuf, i16 %mask, <16 x i32>%ind2, <16 x i32> %x, i32 4)
  ret void
}

define void @gather_mask_qd(<8 x i64> %ind, <8 x i32> %src, i8 %mask, i8* %base, i8* %stbuf) {
; CHECK-LABEL: gather_mask_qd:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovb %edi, %k1
; CHECK-NEXT:    kmovq %k1, %k2
; CHECK-NEXT:    vpgatherqd (%rsi,%zmm0,4), %ymm1 {%k2}
; CHECK-NEXT:    vpaddq {{.*}}(%rip), %zmm0, %zmm0
; CHECK-NEXT:    vpscatterqd %ymm1, (%rdx,%zmm0,4) {%k1}
; CHECK-NEXT:    retq
  %x = call <8 x i32> @llvm.x86.avx512.gather.qpi.512 (<8 x i32> %src, i8* %base, <8 x i64>%ind, i8 %mask, i32 4)
  %ind2 = add <8 x i64> %ind, <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>
  call void @llvm.x86.avx512.scatter.qpi.512 (i8* %stbuf, i8 %mask, <8 x i64>%ind2, <8 x i32> %x, i32 4)
  ret void
}

define void @gather_mask_qq(<8 x i64> %ind, <8 x i64> %src, i8 %mask, i8* %base, i8* %stbuf) {
; CHECK-LABEL: gather_mask_qq:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovb %edi, %k1
; CHECK-NEXT:    kmovq %k1, %k2
; CHECK-NEXT:    vpgatherqq (%rsi,%zmm0,4), %zmm1 {%k2}
; CHECK-NEXT:    vpaddq {{.*}}(%rip), %zmm0, %zmm0
; CHECK-NEXT:    vpscatterqq %zmm1, (%rdx,%zmm0,4) {%k1}
; CHECK-NEXT:    retq
  %x = call <8 x i64> @llvm.x86.avx512.gather.qpq.512 (<8 x i64> %src, i8* %base, <8 x i64>%ind, i8 %mask, i32 4)
  %ind2 = add <8 x i64> %ind, <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>
  call void @llvm.x86.avx512.scatter.qpq.512 (i8* %stbuf, i8 %mask, <8 x i64>%ind2, <8 x i64> %x, i32 4)
  ret void
}

define void @gather_mask_dq(<8 x i32> %ind, <8 x i64> %src, i8 %mask, i8* %base, i8* %stbuf) {
; CHECK-LABEL: gather_mask_dq:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovb %edi, %k1
; CHECK-NEXT:    kmovq %k1, %k2
; CHECK-NEXT:    vpgatherdq (%rsi,%ymm0,4), %zmm1 {%k2}
; CHECK-NEXT:    vpaddd {{.*}}(%rip), %ymm0, %ymm0
; CHECK-NEXT:    vpscatterdq %zmm1, (%rdx,%ymm0,4) {%k1}
; CHECK-NEXT:    retq
  %x = call <8 x i64> @llvm.x86.avx512.gather.dpq.512 (<8 x i64> %src, i8* %base, <8 x i32>%ind, i8 %mask, i32 4)
  %ind2 = add <8 x i32> %ind, <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  call void @llvm.x86.avx512.scatter.dpq.512 (i8* %stbuf, i8 %mask, <8 x i32>%ind2, <8 x i64> %x, i32 4)
  ret void
}
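
; The *_execdomain tests check that the gathered or scattered FP value stays
; in the floating-point execution domain: the surrounding moves should be
; vmovaps/vmovapd rather than their integer-domain equivalents.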
define void @gather_mask_dpd_execdomain(<8 x i32> %ind, <8 x double> %src, i8 %mask, i8* %base, <8 x double>* %stbuf) {
; CHECK-LABEL: gather_mask_dpd_execdomain:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovb %edi, %k1
; CHECK-NEXT:    vgatherdpd (%rsi,%ymm0,4), %zmm1 {%k1}
; CHECK-NEXT:    vmovapd %zmm1, (%rdx)
; CHECK-NEXT:    retq
  %x = call <8 x double> @llvm.x86.avx512.gather.dpd.512 (<8 x double> %src, i8* %base, <8 x i32>%ind, i8 %mask, i32 4)
  store <8 x double> %x, <8 x double>* %stbuf
  ret void
}

define void @gather_mask_qpd_execdomain(<8 x i64> %ind, <8 x double> %src, i8 %mask, i8* %base, <8 x double>* %stbuf) {
; CHECK-LABEL: gather_mask_qpd_execdomain:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovb %edi, %k1
; CHECK-NEXT:    vgatherqpd (%rsi,%zmm0,4), %zmm1 {%k1}
; CHECK-NEXT:    vmovapd %zmm1, (%rdx)
; CHECK-NEXT:    retq
  %x = call <8 x double> @llvm.x86.avx512.gather.qpd.512 (<8 x double> %src, i8* %base, <8 x i64>%ind, i8 %mask, i32 4)
  store <8 x double> %x, <8 x double>* %stbuf
  ret void
}

define <16 x float> @gather_mask_dps_execdomain(<16 x i32> %ind, <16 x float> %src, i16 %mask, i8* %base) {
; CHECK-LABEL: gather_mask_dps_execdomain:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %edi, %k1
; CHECK-NEXT:    vgatherdps (%rsi,%zmm0,4), %zmm1 {%k1}
; CHECK-NEXT:    vmovaps %zmm1, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.gather.dps.512 (<16 x float> %src, i8* %base, <16 x i32>%ind, i16 %mask, i32 4)
  ret <16 x float> %res
}

define <8 x float> @gather_mask_qps_execdomain(<8 x i64> %ind, <8 x float> %src, i8 %mask, i8* %base) {
; CHECK-LABEL: gather_mask_qps_execdomain:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovb %edi, %k1
; CHECK-NEXT:    vgatherqps (%rsi,%zmm0,4), %ymm1 {%k1}
; CHECK-NEXT:    vmovaps %zmm1, %zmm0
; CHECK-NEXT:    retq
  %res = call <8 x float> @llvm.x86.avx512.gather.qps.512 (<8 x float> %src, i8* %base, <8 x i64>%ind, i8 %mask, i32 4)
  ret <8 x float> %res
}

define void @scatter_mask_dpd_execdomain(<8 x i32> %ind, <8 x double>* %src, i8 %mask, i8* %base, i8* %stbuf) {
; CHECK-LABEL: scatter_mask_dpd_execdomain:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovb %esi, %k1
; CHECK-NEXT:    vmovapd (%rdi), %zmm1
; CHECK-NEXT:    vscatterdpd %zmm1, (%rcx,%ymm0,4) {%k1}
; CHECK-NEXT:    retq
  %x = load <8 x double>, <8 x double>* %src, align 64
  call void @llvm.x86.avx512.scatter.dpd.512 (i8* %stbuf, i8 %mask, <8 x i32>%ind, <8 x double> %x, i32 4)
  ret void
}

define void @scatter_mask_qpd_execdomain(<8 x i64> %ind, <8 x double>* %src, i8 %mask, i8* %base, i8* %stbuf) {
; CHECK-LABEL: scatter_mask_qpd_execdomain:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovb %esi, %k1
; CHECK-NEXT:    vmovapd (%rdi), %zmm1
; CHECK-NEXT:    vscatterqpd %zmm1, (%rcx,%zmm0,4) {%k1}
; CHECK-NEXT:    retq
  %x = load <8 x double>, <8 x double>* %src, align 64
  call void @llvm.x86.avx512.scatter.qpd.512 (i8* %stbuf, i8 %mask, <8 x i64>%ind, <8 x double> %x, i32 4)
  ret void
}

define void @scatter_mask_dps_execdomain(<16 x i32> %ind, <16 x float>* %src, i16 %mask, i8* %base, i8* %stbuf) {
; CHECK-LABEL: scatter_mask_dps_execdomain:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    vmovaps (%rdi), %zmm1
; CHECK-NEXT:    vscatterdps %zmm1, (%rcx,%zmm0,4) {%k1}
; CHECK-NEXT:    retq
  %x = load <16 x float>, <16 x float>* %src, align 64
  call void @llvm.x86.avx512.scatter.dps.512 (i8* %stbuf, i16 %mask, <16 x i32>%ind, <16 x float> %x, i32 4)
  ret void
}

define void @scatter_mask_qps_execdomain(<8 x i64> %ind, <8 x float>* %src, i8 %mask, i8* %base, i8* %stbuf) {
; CHECK-LABEL: scatter_mask_qps_execdomain:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovb %esi, %k1
; CHECK-NEXT:    vmovaps (%rdi), %ymm1
; CHECK-NEXT:    vscatterqps %ymm1, (%rcx,%zmm0,4) {%k1}
; CHECK-NEXT:    retq
  %x = load <8 x float>, <8 x float>* %src, align 32
  call void @llvm.x86.avx512.scatter.qps.512 (i8* %stbuf, i8 %mask, <8 x i64>%ind, <8 x float> %x, i32 4)
  ret void
}
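
; With a constant all-ones mask (i8 -1) no GPR transfer is needed: kxnorw
; %k0, %k0, %kN materializes the all-ones mask directly, once per masked
; memory operation.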
define void @gather_qps(<8 x i64> %ind, <8 x float> %src, i8* %base, i8* %stbuf) {
; CHECK-LABEL: gather_qps:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    kxnorw %k0, %k0, %k2
; CHECK-NEXT:    vgatherqps (%rdi,%zmm0,4), %ymm1 {%k2}
; CHECK-NEXT:    vpaddq {{.*}}(%rip), %zmm0, %zmm0
; CHECK-NEXT:    vscatterqps %ymm1, (%rsi,%zmm0,4) {%k1}
; CHECK-NEXT:    retq
  %x = call <8 x float> @llvm.x86.avx512.gather.qps.512 (<8 x float> %src, i8* %base, <8 x i64>%ind, i8 -1, i32 4)
  %ind2 = add <8 x i64> %ind, <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>
  call void @llvm.x86.avx512.scatter.qps.512 (i8* %stbuf, i8 -1, <8 x i64>%ind2, <8 x float> %x, i32 4)
  ret void
}
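
; The gatherpf/scatterpf intrinsics only prefetch; no vector register is read
; or written. The final i32 operand picks the variant: 0 lowers to the pf0
; form and 1 to the pf1 form, as the CHECK lines show.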
declare void @llvm.x86.avx512.gatherpf.qps.512(i8, <8 x i64>, i8*, i32, i32)
declare void @llvm.x86.avx512.scatterpf.qps.512(i8, <8 x i64>, i8*, i32, i32)
define void @prefetch(<8 x i64> %ind, i8* %base) {
; CHECK-LABEL: prefetch:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vgatherpf0qps (%rdi,%zmm0,4) {%k1}
; CHECK-NEXT:    kxorw %k0, %k0, %k1
; CHECK-NEXT:    vgatherpf1qps (%rdi,%zmm0,4) {%k1}
; CHECK-NEXT:    movb $1, %al
; CHECK-NEXT:    kmovb %eax, %k1
; CHECK-NEXT:    vscatterpf0qps (%rdi,%zmm0,2) {%k1}
; CHECK-NEXT:    movb $120, %al
; CHECK-NEXT:    kmovb %eax, %k1
; CHECK-NEXT:    vscatterpf1qps (%rdi,%zmm0,2) {%k1}
; CHECK-NEXT:    retq
  call void @llvm.x86.avx512.gatherpf.qps.512(i8 -1, <8 x i64> %ind, i8* %base, i32 4, i32 0)
  call void @llvm.x86.avx512.gatherpf.qps.512(i8 0, <8 x i64> %ind, i8* %base, i32 4, i32 1)
  call void @llvm.x86.avx512.scatterpf.qps.512(i8 1, <8 x i64> %ind, i8* %base, i32 2, i32 0)
  call void @llvm.x86.avx512.scatterpf.qps.512(i8 120, <8 x i64> %ind, i8* %base, i32 2, i32 1)
  ret void
}
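
; The gather3* intrinsics below are the 128/256-bit gathers (SKX has
; AVX512VL). The index width is encoded in the name: *div* variants take
; qword (<N x i64>) index vectors and select the vgatherq*/vpgatherq* forms,
; *siv* variants take dword (<N x i32>) ones and select vgatherd*/vpgatherd*;
; the sf/df/si/di suffix is the element type.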
declare <2 x double> @llvm.x86.avx512.gather3div2.df(<2 x double>, i8*, <2 x i64>, i8, i32)

define <2 x double>@test_int_x86_avx512_gather3div2_df(<2 x double> %x0, i8* %x1, <2 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_gather3div2_df:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovb %esi, %k1
; CHECK-NEXT:    vmovaps %zmm0, %zmm2
; CHECK-NEXT:    vgatherqpd (%rdi,%xmm1,4), %xmm2 {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vgatherqpd (%rdi,%xmm1,2), %xmm0 {%k1}
; CHECK-NEXT:    vaddpd %xmm0, %xmm2, %xmm0
; CHECK-NEXT:    retq
  %res = call <2 x double> @llvm.x86.avx512.gather3div2.df(<2 x double> %x0, i8* %x1, <2 x i64> %x2, i8 %x3, i32 4)
  %res1 = call <2 x double> @llvm.x86.avx512.gather3div2.df(<2 x double> %x0, i8* %x1, <2 x i64> %x2, i8 -1, i32 2)
  %res2 = fadd <2 x double> %res, %res1
  ret <2 x double> %res2
}

declare <4 x i32> @llvm.x86.avx512.gather3div2.di(<2 x i64>, i8*, <2 x i64>, i8, i32)

define <4 x i32>@test_int_x86_avx512_gather3div2_di(<2 x i64> %x0, i8* %x1, <2 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_gather3div2_di:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovb %esi, %k1
; CHECK-NEXT:    vpgatherqq (%rdi,%xmm1,8), %xmm0 {%k1}
; CHECK-NEXT:    vpaddd %xmm0, %xmm0, %xmm0
; CHECK-NEXT:    retq
  %res = call <4 x i32> @llvm.x86.avx512.gather3div2.di(<2 x i64> %x0, i8* %x1, <2 x i64> %x2, i8 %x3, i32 8)
  %res1 = call <4 x i32> @llvm.x86.avx512.gather3div2.di(<2 x i64> %x0, i8* %x1, <2 x i64> %x2, i8 %x3, i32 8)
  %res2 = add <4 x i32> %res, %res1
  ret <4 x i32> %res2
}

declare <4 x double> @llvm.x86.avx512.gather3div4.df(<4 x double>, i8*, <4 x i64>, i8, i32)

define <4 x double>@test_int_x86_avx512_gather3div4_df(<4 x double> %x0, i8* %x1, <4 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_gather3div4_df:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovb %esi, %k1
; CHECK-NEXT:    vmovaps %zmm0, %zmm2
; CHECK-NEXT:    vgatherqpd (%rdi,%ymm1,4), %ymm2 {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vgatherqpd (%rdi,%ymm1,2), %ymm0 {%k1}
; CHECK-NEXT:    vaddpd %ymm0, %ymm2, %ymm0
; CHECK-NEXT:    retq
  %res = call <4 x double> @llvm.x86.avx512.gather3div4.df(<4 x double> %x0, i8* %x1, <4 x i64> %x2, i8 %x3, i32 4)
  %res1 = call <4 x double> @llvm.x86.avx512.gather3div4.df(<4 x double> %x0, i8* %x1, <4 x i64> %x2, i8 -1, i32 2)
  %res2 = fadd <4 x double> %res, %res1
  ret <4 x double> %res2
}

declare <8 x i32> @llvm.x86.avx512.gather3div4.di(<4 x i64>, i8*, <4 x i64>, i8, i32)

define <8 x i32>@test_int_x86_avx512_gather3div4_di(<4 x i64> %x0, i8* %x1, <4 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_gather3div4_di:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovb %esi, %k1
; CHECK-NEXT:    vmovaps %zmm0, %zmm2
; CHECK-NEXT:    vpgatherqq (%rdi,%ymm1,8), %ymm2 {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vpgatherqq (%rdi,%ymm1,8), %ymm0 {%k1}
; CHECK-NEXT:    vpaddd %ymm0, %ymm2, %ymm0
; CHECK-NEXT:    retq
  %res = call <8 x i32> @llvm.x86.avx512.gather3div4.di(<4 x i64> %x0, i8* %x1, <4 x i64> %x2, i8 %x3, i32 8)
  %res1 = call <8 x i32> @llvm.x86.avx512.gather3div4.di(<4 x i64> %x0, i8* %x1, <4 x i64> %x2, i8 -1, i32 8)
  %res2 = add <8 x i32> %res, %res1
  ret <8 x i32> %res2
}

declare <4 x float> @llvm.x86.avx512.gather3div4.sf(<4 x float>, i8*, <2 x i64>, i8, i32)

define <4 x float>@test_int_x86_avx512_gather3div4_sf(<4 x float> %x0, i8* %x1, <2 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_gather3div4_sf:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovb %esi, %k1
; CHECK-NEXT:    vmovaps %zmm0, %zmm2
; CHECK-NEXT:    vgatherqps (%rdi,%xmm1,4), %xmm2 {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vgatherqps (%rdi,%xmm1,2), %xmm0 {%k1}
; CHECK-NEXT:    vaddps %xmm0, %xmm2, %xmm0
; CHECK-NEXT:    retq
  %res = call <4 x float> @llvm.x86.avx512.gather3div4.sf(<4 x float> %x0, i8* %x1, <2 x i64> %x2, i8 %x3, i32 4)
  %res1 = call <4 x float> @llvm.x86.avx512.gather3div4.sf(<4 x float> %x0, i8* %x1, <2 x i64> %x2, i8 -1, i32 2)
  %res2 = fadd <4 x float> %res, %res1
  ret <4 x float> %res2
}

declare <4 x i32> @llvm.x86.avx512.gather3div4.si(<4 x i32>, i8*, <2 x i64>, i8, i32)

define <4 x i32>@test_int_x86_avx512_gather3div4_si(<4 x i32> %x0, i8* %x1, <2 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_gather3div4_si:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovb %esi, %k1
; CHECK-NEXT:    kxnorw %k0, %k0, %k2
; CHECK-NEXT:    vmovaps %zmm0, %zmm2
; CHECK-NEXT:    vpgatherqd (%rdi,%xmm1,4), %xmm2 {%k2}
; CHECK-NEXT:    vpgatherqd (%rdi,%xmm1,4), %xmm0 {%k1}
; CHECK-NEXT:    vpaddd %xmm0, %xmm2, %xmm0
; CHECK-NEXT:    retq
  %res = call <4 x i32> @llvm.x86.avx512.gather3div4.si(<4 x i32> %x0, i8* %x1, <2 x i64> %x2, i8 -1, i32 4)
  %res1 = call <4 x i32> @llvm.x86.avx512.gather3div4.si(<4 x i32> %x0, i8* %x1, <2 x i64> %x2, i8 %x3, i32 4)
  %res2 = add <4 x i32> %res, %res1
  ret <4 x i32> %res2
}

declare <4 x float> @llvm.x86.avx512.gather3div8.sf(<4 x float>, i8*, <4 x i64>, i8, i32)

define <4 x float>@test_int_x86_avx512_gather3div8_sf(<4 x float> %x0, i8* %x1, <4 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_gather3div8_sf:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovb %esi, %k1
; CHECK-NEXT:    vmovaps %zmm0, %zmm2
; CHECK-NEXT:    vgatherqps (%rdi,%ymm1,4), %xmm2 {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vgatherqps (%rdi,%ymm1,2), %xmm0 {%k1}
; CHECK-NEXT:    vaddps %xmm0, %xmm2, %xmm0
; CHECK-NEXT:    retq
  %res = call <4 x float> @llvm.x86.avx512.gather3div8.sf(<4 x float> %x0, i8* %x1, <4 x i64> %x2, i8 %x3, i32 4)
  %res1 = call <4 x float> @llvm.x86.avx512.gather3div8.sf(<4 x float> %x0, i8* %x1, <4 x i64> %x2, i8 -1, i32 2)
  %res2 = fadd <4 x float> %res, %res1
  ret <4 x float> %res2
}

declare <4 x i32> @llvm.x86.avx512.gather3div8.si(<4 x i32>, i8*, <4 x i64>, i8, i32)

define <4 x i32>@test_int_x86_avx512_gather3div8_si(<4 x i32> %x0, i8* %x1, <4 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_gather3div8_si:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovb %esi, %k1
; CHECK-NEXT:    vmovaps %zmm0, %zmm2
; CHECK-NEXT:    kmovq %k1, %k2
; CHECK-NEXT:    vpgatherqd (%rdi,%ymm1,4), %xmm2 {%k2}
; CHECK-NEXT:    vpgatherqd (%rdi,%ymm1,2), %xmm0 {%k1}
; CHECK-NEXT:    vpaddd %xmm0, %xmm2, %xmm0
; CHECK-NEXT:    retq
  %res = call <4 x i32> @llvm.x86.avx512.gather3div8.si(<4 x i32> %x0, i8* %x1, <4 x i64> %x2, i8 %x3, i32 4)
  %res1 = call <4 x i32> @llvm.x86.avx512.gather3div8.si(<4 x i32> %x0, i8* %x1, <4 x i64> %x2, i8 %x3, i32 2)
  %res2 = add <4 x i32> %res, %res1
  ret <4 x i32> %res2
}
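
; From here on the gathers use dword (siv) index vectors.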
declare <2 x double> @llvm.x86.avx512.gather3siv2.df(<2 x double>, i8*, <4 x i32>, i8, i32)

define <2 x double>@test_int_x86_avx512_gather3siv2_df(<2 x double> %x0, i8* %x1, <4 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_gather3siv2_df:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovb %esi, %k1
; CHECK-NEXT:    vmovaps %zmm0, %zmm2
; CHECK-NEXT:    vgatherdpd (%rdi,%xmm1,4), %xmm2 {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vgatherdpd (%rdi,%xmm1,2), %xmm0 {%k1}
; CHECK-NEXT:    vaddpd %xmm0, %xmm2, %xmm0
; CHECK-NEXT:    retq
  %res = call <2 x double> @llvm.x86.avx512.gather3siv2.df(<2 x double> %x0, i8* %x1, <4 x i32> %x2, i8 %x3, i32 4)
  %res1 = call <2 x double> @llvm.x86.avx512.gather3siv2.df(<2 x double> %x0, i8* %x1, <4 x i32> %x2, i8 -1, i32 2)
  %res2 = fadd <2 x double> %res, %res1
  ret <2 x double> %res2
}

declare <4 x i32> @llvm.x86.avx512.gather3siv2.di(<2 x i64>, i8*, <4 x i32>, i8, i32)

define <4 x i32>@test_int_x86_avx512_gather3siv2_di(<2 x i64> %x0, i8* %x1, <4 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_gather3siv2_di:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovb %esi, %k1
; CHECK-NEXT:    vpgatherdq (%rdi,%xmm1,8), %xmm0 {%k1}
; CHECK-NEXT:    vpaddd %xmm0, %xmm0, %xmm0
; CHECK-NEXT:    retq
  %res = call <4 x i32> @llvm.x86.avx512.gather3siv2.di(<2 x i64> %x0, i8* %x1, <4 x i32> %x2, i8 %x3, i32 8)
  %res1 = call <4 x i32> @llvm.x86.avx512.gather3siv2.di(<2 x i64> %x0, i8* %x1, <4 x i32> %x2, i8 %x3, i32 8)
  %res2 = add <4 x i32> %res, %res1
  ret <4 x i32> %res2
}

declare <4 x double> @llvm.x86.avx512.gather3siv4.df(<4 x double>, i8*, <4 x i32>, i8, i32)

define <4 x double>@test_int_x86_avx512_gather3siv4_df(<4 x double> %x0, i8* %x1, <4 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_gather3siv4_df:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovb %esi, %k1
; CHECK-NEXT:    vmovaps %zmm0, %zmm2
; CHECK-NEXT:    vgatherdpd (%rdi,%xmm1,4), %ymm2 {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vgatherdpd (%rdi,%xmm1,2), %ymm0 {%k1}
; CHECK-NEXT:    vaddpd %ymm0, %ymm2, %ymm0
; CHECK-NEXT:    retq
  %res = call <4 x double> @llvm.x86.avx512.gather3siv4.df(<4 x double> %x0, i8* %x1, <4 x i32> %x2, i8 %x3, i32 4)
  %res1 = call <4 x double> @llvm.x86.avx512.gather3siv4.df(<4 x double> %x0, i8* %x1, <4 x i32> %x2, i8 -1, i32 2)
  %res2 = fadd <4 x double> %res, %res1
  ret <4 x double> %res2
}

declare <8 x i32> @llvm.x86.avx512.gather3siv4.di(<4 x i64>, i8*, <4 x i32>, i8, i32)

define <8 x i32>@test_int_x86_avx512_gather3siv4_di(<4 x i64> %x0, i8* %x1, <4 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_gather3siv4_di:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovb %esi, %k1
; CHECK-NEXT:    vpgatherdq (%rdi,%xmm1,8), %ymm0 {%k1}
; CHECK-NEXT:    vpaddd %ymm0, %ymm0, %ymm0
; CHECK-NEXT:    retq
  %res = call <8 x i32> @llvm.x86.avx512.gather3siv4.di(<4 x i64> %x0, i8* %x1, <4 x i32> %x2, i8 %x3, i32 8)
  %res1 = call <8 x i32> @llvm.x86.avx512.gather3siv4.di(<4 x i64> %x0, i8* %x1, <4 x i32> %x2, i8 %x3, i32 8)
  %res2 = add <8 x i32> %res, %res1
  ret <8 x i32> %res2
}

declare <4 x float> @llvm.x86.avx512.gather3siv4.sf(<4 x float>, i8*, <4 x i32>, i8, i32)

define <4 x float>@test_int_x86_avx512_gather3siv4_sf(<4 x float> %x0, i8* %x1, <4 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_gather3siv4_sf:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovb %esi, %k1
; CHECK-NEXT:    vmovaps %zmm0, %zmm2
; CHECK-NEXT:    vgatherdps (%rdi,%xmm1,4), %xmm2 {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vgatherdps (%rdi,%xmm1,2), %xmm0 {%k1}
; CHECK-NEXT:    vaddps %xmm0, %xmm2, %xmm0
; CHECK-NEXT:    retq
  %res = call <4 x float> @llvm.x86.avx512.gather3siv4.sf(<4 x float> %x0, i8* %x1, <4 x i32> %x2, i8 %x3, i32 4)
  %res1 = call <4 x float> @llvm.x86.avx512.gather3siv4.sf(<4 x float> %x0, i8* %x1, <4 x i32> %x2, i8 -1, i32 2)
  %res2 = fadd <4 x float> %res, %res1
  ret <4 x float> %res2
}

declare <4 x i32> @llvm.x86.avx512.gather3siv4.si(<4 x i32>, i8*, <4 x i32>, i8, i32)

define <4 x i32>@test_int_x86_avx512_gather3siv4_si(<4 x i32> %x0, i8* %x1, <4 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_gather3siv4_si:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovb %esi, %k1
; CHECK-NEXT:    kxnorw %k0, %k0, %k2
; CHECK-NEXT:    vmovaps %zmm0, %zmm2
; CHECK-NEXT:    vpgatherdd (%rdi,%xmm1,4), %xmm2 {%k2}
; CHECK-NEXT:    vpgatherdd (%rdi,%xmm1,2), %xmm0 {%k1}
; CHECK-NEXT:    vpaddd %xmm0, %xmm2, %xmm0
; CHECK-NEXT:    retq
  %res = call <4 x i32> @llvm.x86.avx512.gather3siv4.si(<4 x i32> %x0, i8* %x1, <4 x i32> %x2, i8 -1, i32 4)
  %res1 = call <4 x i32> @llvm.x86.avx512.gather3siv4.si(<4 x i32> %x0, i8* %x1, <4 x i32> %x2, i8 %x3, i32 2)
  %res2 = add <4 x i32> %res, %res1
  ret <4 x i32> %res2
}

declare <8 x float> @llvm.x86.avx512.gather3siv8.sf(<8 x float>, i8*, <8 x i32>, i8, i32)

define <8 x float>@test_int_x86_avx512_gather3siv8_sf(<8 x float> %x0, i8* %x1, <8 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_gather3siv8_sf:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovb %esi, %k1
; CHECK-NEXT:    vmovaps %zmm0, %zmm2
; CHECK-NEXT:    vgatherdps (%rdi,%ymm1,4), %ymm2 {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vgatherdps (%rdi,%ymm1,2), %ymm0 {%k1}
; CHECK-NEXT:    vaddps %ymm0, %ymm2, %ymm0
; CHECK-NEXT:    retq
  %res = call <8 x float> @llvm.x86.avx512.gather3siv8.sf(<8 x float> %x0, i8* %x1, <8 x i32> %x2, i8 %x3, i32 4)
  %res1 = call <8 x float> @llvm.x86.avx512.gather3siv8.sf(<8 x float> %x0, i8* %x1, <8 x i32> %x2, i8 -1, i32 2)
  %res2 = fadd <8 x float> %res, %res1
  ret <8 x float> %res2
}

declare <8 x i32> @llvm.x86.avx512.gather3siv8.si(<8 x i32>, i8*, <8 x i32>, i8, i32)

define <8 x i32>@test_int_x86_avx512_gather3siv8_si(<8 x i32> %x0, i8* %x1, <8 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_gather3siv8_si:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovb %esi, %k1
; CHECK-NEXT:    vmovaps %zmm0, %zmm2
; CHECK-NEXT:    kmovq %k1, %k2
; CHECK-NEXT:    vpgatherdd (%rdi,%ymm1,4), %ymm2 {%k2}
; CHECK-NEXT:    vpgatherdd (%rdi,%ymm1,2), %ymm0 {%k1}
; CHECK-NEXT:    vpaddd %ymm0, %ymm2, %ymm0
; CHECK-NEXT:    retq
  %res = call <8 x i32> @llvm.x86.avx512.gather3siv8.si(<8 x i32> %x0, i8* %x1, <8 x i32> %x2, i8 %x3, i32 4)
  %res1 = call <8 x i32> @llvm.x86.avx512.gather3siv8.si(<8 x i32> %x0, i8* %x1, <8 x i32> %x2, i8 %x3, i32 2)
  %res2 = add <8 x i32> %res, %res1
  ret <8 x i32> %res2
}
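
; The scatterdiv* intrinsics are the qword-indexed 128/256-bit scatters. Each
; test stores once under the incoming i8 mask and once under an all-ones mask
; (kxnorw), at different scales.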
declare void @llvm.x86.avx512.scatterdiv2.df(i8*, i8, <2 x i64>, <2 x double>, i32)

define void@test_int_x86_avx512_scatterdiv2_df(i8* %x0, i8 %x1, <2 x i64> %x2, <2 x double> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scatterdiv2_df:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovb %esi, %k1
; CHECK-NEXT:    kxnorw %k0, %k0, %k2
; CHECK-NEXT:    vscatterqpd %xmm1, (%rdi,%xmm0,2) {%k2}
; CHECK-NEXT:    vscatterqpd %xmm1, (%rdi,%xmm0,4) {%k1}
; CHECK-NEXT:    retq
  call void @llvm.x86.avx512.scatterdiv2.df(i8* %x0, i8 -1, <2 x i64> %x2, <2 x double> %x3, i32 2)
  call void @llvm.x86.avx512.scatterdiv2.df(i8* %x0, i8 %x1, <2 x i64> %x2, <2 x double> %x3, i32 4)
  ret void
}

declare void @llvm.x86.avx512.scatterdiv2.di(i8*, i8, <2 x i64>, <2 x i64>, i32)

define void@test_int_x86_avx512_scatterdiv2_di(i8* %x0, i8 %x1, <2 x i64> %x2, <2 x i64> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scatterdiv2_di:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovb %esi, %k1
; CHECK-NEXT:    vpscatterqq %xmm1, (%rdi,%xmm0,2) {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vpscatterqq %xmm1, (%rdi,%xmm0,4) {%k1}
; CHECK-NEXT:    retq
  call void @llvm.x86.avx512.scatterdiv2.di(i8* %x0, i8 %x1, <2 x i64> %x2, <2 x i64> %x3, i32 2)
  call void @llvm.x86.avx512.scatterdiv2.di(i8* %x0, i8 -1, <2 x i64> %x2, <2 x i64> %x3, i32 4)
  ret void
}

declare void @llvm.x86.avx512.scatterdiv4.df(i8*, i8, <4 x i64>, <4 x double>, i32)

define void@test_int_x86_avx512_scatterdiv4_df(i8* %x0, i8 %x1, <4 x i64> %x2, <4 x double> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scatterdiv4_df:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovb %esi, %k1
; CHECK-NEXT:    vscatterqpd %ymm1, (%rdi,%ymm0,2) {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vscatterqpd %ymm1, (%rdi,%ymm0,4) {%k1}
; CHECK-NEXT:    retq
  call void @llvm.x86.avx512.scatterdiv4.df(i8* %x0, i8 %x1, <4 x i64> %x2, <4 x double> %x3, i32 2)
  call void @llvm.x86.avx512.scatterdiv4.df(i8* %x0, i8 -1, <4 x i64> %x2, <4 x double> %x3, i32 4)
  ret void
}

declare void @llvm.x86.avx512.scatterdiv4.di(i8*, i8, <4 x i64>, <4 x i64>, i32)

define void@test_int_x86_avx512_scatterdiv4_di(i8* %x0, i8 %x1, <4 x i64> %x2, <4 x i64> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scatterdiv4_di:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovb %esi, %k1
; CHECK-NEXT:    vpscatterqq %ymm1, (%rdi,%ymm0,2) {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vpscatterqq %ymm1, (%rdi,%ymm0,4) {%k1}
; CHECK-NEXT:    retq
  call void @llvm.x86.avx512.scatterdiv4.di(i8* %x0, i8 %x1, <4 x i64> %x2, <4 x i64> %x3, i32 2)
  call void @llvm.x86.avx512.scatterdiv4.di(i8* %x0, i8 -1, <4 x i64> %x2, <4 x i64> %x3, i32 4)
  ret void
}

declare void @llvm.x86.avx512.scatterdiv4.sf(i8*, i8, <2 x i64>, <4 x float>, i32)

define void@test_int_x86_avx512_scatterdiv4_sf(i8* %x0, i8 %x1, <2 x i64> %x2, <4 x float> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scatterdiv4_sf:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovb %esi, %k1
; CHECK-NEXT:    vscatterqps %xmm1, (%rdi,%xmm0,2) {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vscatterqps %xmm1, (%rdi,%xmm0,4) {%k1}
; CHECK-NEXT:    retq
  call void @llvm.x86.avx512.scatterdiv4.sf(i8* %x0, i8 %x1, <2 x i64> %x2, <4 x float> %x3, i32 2)
  call void @llvm.x86.avx512.scatterdiv4.sf(i8* %x0, i8 -1, <2 x i64> %x2, <4 x float> %x3, i32 4)
  ret void
}

declare void @llvm.x86.avx512.scatterdiv4.si(i8*, i8, <2 x i64>, <4 x i32>, i32)

define void@test_int_x86_avx512_scatterdiv4_si(i8* %x0, i8 %x1, <2 x i64> %x2, <4 x i32> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scatterdiv4_si:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovb %esi, %k1
; CHECK-NEXT:    kxnorw %k0, %k0, %k2
; CHECK-NEXT:    vpscatterqd %xmm1, (%rdi,%xmm0,2) {%k2}
; CHECK-NEXT:    vpscatterqd %xmm1, (%rdi,%xmm0,4) {%k1}
; CHECK-NEXT:    retq
  call void @llvm.x86.avx512.scatterdiv4.si(i8* %x0, i8 -1, <2 x i64> %x2, <4 x i32> %x3, i32 2)
  call void @llvm.x86.avx512.scatterdiv4.si(i8* %x0, i8 %x1, <2 x i64> %x2, <4 x i32> %x3, i32 4)
  ret void
}

declare void @llvm.x86.avx512.scatterdiv8.sf(i8*, i8, <4 x i64>, <4 x float>, i32)

define void@test_int_x86_avx512_scatterdiv8_sf(i8* %x0, i8 %x1, <4 x i64> %x2, <4 x float> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scatterdiv8_sf:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovb %esi, %k1
; CHECK-NEXT:    vscatterqps %xmm1, (%rdi,%ymm0,2) {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vscatterqps %xmm1, (%rdi,%ymm0,4) {%k1}
; CHECK-NEXT:    retq
  call void @llvm.x86.avx512.scatterdiv8.sf(i8* %x0, i8 %x1, <4 x i64> %x2, <4 x float> %x3, i32 2)
  call void @llvm.x86.avx512.scatterdiv8.sf(i8* %x0, i8 -1, <4 x i64> %x2, <4 x float> %x3, i32 4)
  ret void
}

declare void @llvm.x86.avx512.scatterdiv8.si(i8*, i8, <4 x i64>, <4 x i32>, i32)

define void@test_int_x86_avx512_scatterdiv8_si(i8* %x0, i8 %x1, <4 x i64> %x2, <4 x i32> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scatterdiv8_si:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovb %esi, %k1
; CHECK-NEXT:    vpscatterqd %xmm1, (%rdi,%ymm0,2) {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vpscatterqd %xmm1, (%rdi,%ymm0,4) {%k1}
; CHECK-NEXT:    retq
  call void @llvm.x86.avx512.scatterdiv8.si(i8* %x0, i8 %x1, <4 x i64> %x2, <4 x i32> %x3, i32 2)
  call void @llvm.x86.avx512.scatterdiv8.si(i8* %x0, i8 -1, <4 x i64> %x2, <4 x i32> %x3, i32 4)
  ret void
}
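
; The scattersiv* intrinsics repeat the same pattern with dword index
; vectors, selecting the vscatterd*/vpscatterd* forms.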
declare void @llvm.x86.avx512.scattersiv2.df(i8*, i8, <4 x i32>, <2 x double>, i32)

define void@test_int_x86_avx512_scattersiv2_df(i8* %x0, i8 %x1, <4 x i32> %x2, <2 x double> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scattersiv2_df:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovb %esi, %k1
; CHECK-NEXT:    kxnorw %k0, %k0, %k2
; CHECK-NEXT:    vscatterdpd %xmm1, (%rdi,%xmm0,2) {%k2}
; CHECK-NEXT:    vscatterdpd %xmm1, (%rdi,%xmm0,4) {%k1}
; CHECK-NEXT:    retq
  call void @llvm.x86.avx512.scattersiv2.df(i8* %x0, i8 -1, <4 x i32> %x2, <2 x double> %x3, i32 2)
  call void @llvm.x86.avx512.scattersiv2.df(i8* %x0, i8 %x1, <4 x i32> %x2, <2 x double> %x3, i32 4)
  ret void
}

declare void @llvm.x86.avx512.scattersiv2.di(i8*, i8, <4 x i32>, <2 x i64>, i32)

define void@test_int_x86_avx512_scattersiv2_di(i8* %x0, i8 %x1, <4 x i32> %x2, <2 x i64> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scattersiv2_di:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovb %esi, %k1
; CHECK-NEXT:    kxnorw %k0, %k0, %k2
; CHECK-NEXT:    vpscatterdq %xmm1, (%rdi,%xmm0,2) {%k2}
; CHECK-NEXT:    vpscatterdq %xmm1, (%rdi,%xmm0,4) {%k1}
; CHECK-NEXT:    retq
  call void @llvm.x86.avx512.scattersiv2.di(i8* %x0, i8 -1, <4 x i32> %x2, <2 x i64> %x3, i32 2)
  call void @llvm.x86.avx512.scattersiv2.di(i8* %x0, i8 %x1, <4 x i32> %x2, <2 x i64> %x3, i32 4)
  ret void
}

declare void @llvm.x86.avx512.scattersiv4.df(i8*, i8, <4 x i32>, <4 x double>, i32)

define void@test_int_x86_avx512_scattersiv4_df(i8* %x0, i8 %x1, <4 x i32> %x2, <4 x double> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scattersiv4_df:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovb %esi, %k1
; CHECK-NEXT:    vscatterdpd %ymm1, (%rdi,%xmm0,2) {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vscatterdpd %ymm1, (%rdi,%xmm0,4) {%k1}
; CHECK-NEXT:    retq
  call void @llvm.x86.avx512.scattersiv4.df(i8* %x0, i8 %x1, <4 x i32> %x2, <4 x double> %x3, i32 2)
  call void @llvm.x86.avx512.scattersiv4.df(i8* %x0, i8 -1, <4 x i32> %x2, <4 x double> %x3, i32 4)
  ret void
}

declare void @llvm.x86.avx512.scattersiv4.di(i8*, i8, <4 x i32>, <4 x i64>, i32)

define void@test_int_x86_avx512_scattersiv4_di(i8* %x0, i8 %x1, <4 x i32> %x2, <4 x i64> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scattersiv4_di:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovb %esi, %k1
; CHECK-NEXT:    kxnorw %k0, %k0, %k2
; CHECK-NEXT:    vpscatterdq %ymm1, (%rdi,%xmm0,2) {%k2}
; CHECK-NEXT:    vpscatterdq %ymm1, (%rdi,%xmm0,4) {%k1}
; CHECK-NEXT:    retq
  call void @llvm.x86.avx512.scattersiv4.di(i8* %x0, i8 -1, <4 x i32> %x2, <4 x i64> %x3, i32 2)
  call void @llvm.x86.avx512.scattersiv4.di(i8* %x0, i8 %x1, <4 x i32> %x2, <4 x i64> %x3, i32 4)
  ret void
}

declare void @llvm.x86.avx512.scattersiv4.sf(i8*, i8, <4 x i32>, <4 x float>, i32)

define void@test_int_x86_avx512_scattersiv4_sf(i8* %x0, i8 %x1, <4 x i32> %x2, <4 x float> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scattersiv4_sf:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovb %esi, %k1
; CHECK-NEXT:    vscatterdps %xmm1, (%rdi,%xmm0,2) {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vscatterdps %xmm1, (%rdi,%xmm0,4) {%k1}
; CHECK-NEXT:    retq
  call void @llvm.x86.avx512.scattersiv4.sf(i8* %x0, i8 %x1, <4 x i32> %x2, <4 x float> %x3, i32 2)
  call void @llvm.x86.avx512.scattersiv4.sf(i8* %x0, i8 -1, <4 x i32> %x2, <4 x float> %x3, i32 4)
  ret void
}

declare void @llvm.x86.avx512.scattersiv4.si(i8*, i8, <4 x i32>, <4 x i32>, i32)

define void@test_int_x86_avx512_scattersiv4_si(i8* %x0, i8 %x1, <4 x i32> %x2, <4 x i32> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scattersiv4_si:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovb %esi, %k1
; CHECK-NEXT:    vpscatterdd %xmm1, (%rdi,%xmm0,2) {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vpscatterdd %xmm1, (%rdi,%xmm0,4) {%k1}
; CHECK-NEXT:    retq
  call void @llvm.x86.avx512.scattersiv4.si(i8* %x0, i8 %x1, <4 x i32> %x2, <4 x i32> %x3, i32 2)
  call void @llvm.x86.avx512.scattersiv4.si(i8* %x0, i8 -1, <4 x i32> %x2, <4 x i32> %x3, i32 4)
  ret void
}

declare void @llvm.x86.avx512.scattersiv8.sf(i8*, i8, <8 x i32>, <8 x float>, i32)

define void@test_int_x86_avx512_scattersiv8_sf(i8* %x0, i8 %x1, <8 x i32> %x2, <8 x float> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scattersiv8_sf:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovb %esi, %k1
; CHECK-NEXT:    vscatterdps %ymm1, (%rdi,%ymm0,2) {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vscatterdps %ymm1, (%rdi,%ymm0,4) {%k1}
; CHECK-NEXT:    retq
  call void @llvm.x86.avx512.scattersiv8.sf(i8* %x0, i8 %x1, <8 x i32> %x2, <8 x float> %x3, i32 2)
  call void @llvm.x86.avx512.scattersiv8.sf(i8* %x0, i8 -1, <8 x i32> %x2, <8 x float> %x3, i32 4)
  ret void
}

declare void @llvm.x86.avx512.scattersiv8.si(i8*, i8, <8 x i32>, <8 x i32>, i32)

define void@test_int_x86_avx512_scattersiv8_si(i8* %x0, i8 %x1, <8 x i32> %x2, <8 x i32> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scattersiv8_si:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kmovb %esi, %k1
; CHECK-NEXT:    vpscatterdd %ymm1, (%rdi,%ymm0,2) {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vpscatterdd %ymm1, (%rdi,%ymm0,4) {%k1}
; CHECK-NEXT:    retq
  call void @llvm.x86.avx512.scattersiv8.si(i8* %x0, i8 %x1, <8 x i32> %x2, <8 x i32> %x3, i32 2)
  call void @llvm.x86.avx512.scattersiv8.si(i8* %x0, i8 -1, <8 x i32> %x2, <8 x i32> %x3, i32 4)
  ret void
}
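
; scatter_mask_test exercises constant masks: i8 -1 becomes kxnorw (all
; ones), i8 0 becomes kxorw, and other immediates go through movb/kmovb.
; Note that the scatter is emitted even when the mask is known to be zero.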
define void @scatter_mask_test(i8* %x0, <8 x i32> %x2, <8 x i32> %x3) {
; CHECK-LABEL: scatter_mask_test:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vpscatterdd %ymm1, (%rdi,%ymm0,2) {%k1}
; CHECK-NEXT:    kxorw %k0, %k0, %k1
; CHECK-NEXT:    vpscatterdd %ymm1, (%rdi,%ymm0,4) {%k1}
; CHECK-NEXT:    movb $1, %al
; CHECK-NEXT:    kmovb %eax, %k1
; CHECK-NEXT:    vpscatterdd %ymm1, (%rdi,%ymm0,2) {%k1}
; CHECK-NEXT:    movb $96, %al
; CHECK-NEXT:    kmovb %eax, %k1
; CHECK-NEXT:    vpscatterdd %ymm1, (%rdi,%ymm0,4) {%k1}
; CHECK-NEXT:    retq
  call void @llvm.x86.avx512.scattersiv8.si(i8* %x0, i8 -1, <8 x i32> %x2, <8 x i32> %x3, i32 2)
  call void @llvm.x86.avx512.scattersiv8.si(i8* %x0, i8 0, <8 x i32> %x2, <8 x i32> %x3, i32 4)
  call void @llvm.x86.avx512.scattersiv8.si(i8* %x0, i8 1, <8 x i32> %x2, <8 x i32> %x3, i32 2)
  call void @llvm.x86.avx512.scattersiv8.si(i8* %x0, i8 96, <8 x i32> %x2, <8 x i32> %x3, i32 4)
  ret void
}
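
; gather_mask_test does the same for the 512-bit gather with constant i16
; masks (-1, 0, 1, 220). Each gather that must not clobber %src gets its own
; vmovaps copy of the pass-through register first.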
define <16 x float> @gather_mask_test(<16 x i32> %ind, <16 x float> %src, i8* %base) {
; CHECK-LABEL: gather_mask_test:
; CHECK:       ## BB#0:
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vmovaps %zmm1, %zmm2
; CHECK-NEXT:    vgatherdps (%rdi,%zmm0,4), %zmm2 {%k1}
; CHECK-NEXT:    kxorw %k0, %k0, %k1
; CHECK-NEXT:    vmovaps %zmm1, %zmm3
; CHECK-NEXT:    vgatherdps (%rdi,%zmm0,4), %zmm3 {%k1}
; CHECK-NEXT:    movw $1, %ax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vmovaps %zmm1, %zmm4
; CHECK-NEXT:    vgatherdps (%rdi,%zmm0,4), %zmm4 {%k1}
; CHECK-NEXT:    movw $220, %ax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
; CHECK-NEXT:    vaddps %zmm3, %zmm2, %zmm0
; CHECK-NEXT:    vaddps %zmm4, %zmm1, %zmm1
; CHECK-NEXT:    vaddps %zmm0, %zmm1, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.gather.dps.512 (<16 x float> %src, i8* %base, <16 x i32>%ind, i16 -1, i32 4)
  %res1 = call <16 x float> @llvm.x86.avx512.gather.dps.512 (<16 x float> %src, i8* %base, <16 x i32>%ind, i16 0, i32 4)
  %res2 = call <16 x float> @llvm.x86.avx512.gather.dps.512 (<16 x float> %src, i8* %base, <16 x i32>%ind, i16 1, i32 4)
  %res3 = call <16 x float> @llvm.x86.avx512.gather.dps.512 (<16 x float> %src, i8* %base, <16 x i32>%ind, i16 220, i32 4)

  %res4 = fadd <16 x float> %res, %res1
  %res5 = fadd <16 x float> %res3, %res2
  %res6 = fadd <16 x float> %res5, %res4
  ret <16 x float> %res6
}