; NOTE(review): This auto-generated FileCheck test had its newlines flattened
; into a few physical lines, with the original line numbers embedded in the
; text. That made the file invalid IR and hid every CHECK directive that no
; longer started its own line. The original line structure is restored below;
; the stale embedded line numbers are removed. CHECK lines are reproduced
; verbatim — they must match llc/opt output and should only be regenerated
; with utils/update_llc_test_checks.py, never edited by hand.
; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f < %s | FileCheck %s --check-prefix=KNL_64
; RUN: llc -mtriple=i386-unknown-linux-gnu -mattr=+avx512f < %s | FileCheck %s --check-prefix=KNL_32
; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512vl -mattr=+avx512dq < %s | FileCheck %s --check-prefix=SKX
; RUN: llc -mtriple=i386-unknown-linux-gnu -mattr=+avx512vl -mattr=+avx512dq < %s | FileCheck %s --check-prefix=SKX_32
; RUN: opt -mtriple=x86_64-apple-darwin -codegenprepare -mcpu=corei7-avx -S < %s | FileCheck %s -check-prefix=SCALAR
; RUN: llc -O0 -mtriple=x86_64-unknown-linux-gnu -mcpu=skx < %s -o /dev/null

target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"


; SCALAR-LABEL: test1
; SCALAR: extractelement <16 x float*>
; SCALAR-NEXT: load float
; SCALAR-NEXT: insertelement <16 x float>
; SCALAR-NEXT: extractelement <16 x float*>
; SCALAR-NEXT: load float

define <16 x float> @test1(float* %base, <16 x i32> %ind) {
; KNL_64-LABEL: test1:
; KNL_64: # BB#0:
; KNL_64-NEXT: kxnorw %k0, %k0, %k1
; KNL_64-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
; KNL_64-NEXT: vmovaps %zmm1, %zmm0
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test1:
; KNL_32: # BB#0:
; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT: kxnorw %k0, %k0, %k1
; KNL_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
; KNL_32-NEXT: vmovaps %zmm1, %zmm0
; KNL_32-NEXT: retl
;
; SKX-LABEL: test1:
; SKX: # BB#0:
; SKX-NEXT: kxnorw %k0, %k0, %k1
; SKX-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
; SKX-NEXT: vmovaps %zmm1, %zmm0
; SKX-NEXT: retq

  %broadcast.splatinsert = insertelement <16 x float*> undef, float* %base, i32 0
  %broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer

  %sext_ind = sext <16 x i32> %ind to <16 x i64>
  %gep.random = getelementptr float, <16 x float*> %broadcast.splat, <16 x i64> %sext_ind

  %res = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
  ret <16 x float>%res
}

declare <16 x i32> @llvm.masked.gather.v16i32(<16 x i32*>, i32, <16 x i1>, <16 x i32>)
declare <16 x float> @llvm.masked.gather.v16f32(<16 x float*>, i32, <16 x i1>, <16 x float>)
declare <8 x i32> @llvm.masked.gather.v8i32(<8 x i32*> , i32, <8 x i1> , <8 x i32> )


; SCALAR-LABEL: test2
; SCALAR: extractelement <16 x float*>
; SCALAR-NEXT: load float
; SCALAR-NEXT: insertelement <16 x float>
; SCALAR-NEXT: br label %else
; SCALAR: else:
; SCALAR-NEXT: %res.phi.else = phi
; SCALAR-NEXT: %Mask1 = extractelement <16 x i1> %imask, i32 1
; SCALAR-NEXT: %ToLoad1 = icmp eq i1 %Mask1, true
; SCALAR-NEXT: br i1 %ToLoad1, label %cond.load1, label %else2

define <16 x float> @test2(float* %base, <16 x i32> %ind, i16 %mask) {
; KNL_64-LABEL: test2:
; KNL_64: # BB#0:
; KNL_64-NEXT: kmovw %esi, %k1
; KNL_64-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
; KNL_64-NEXT: vmovaps %zmm1, %zmm0
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test2:
; KNL_32: # BB#0:
; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; KNL_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
; KNL_32-NEXT: vmovaps %zmm1, %zmm0
; KNL_32-NEXT: retl
;
; SKX-LABEL: test2:
; SKX: # BB#0:
; SKX-NEXT: kmovw %esi, %k1
; SKX-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
; SKX-NEXT: vmovaps %zmm1, %zmm0
; SKX-NEXT: retq

  %broadcast.splatinsert = insertelement <16 x float*> undef, float* %base, i32 0
  %broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer

  %sext_ind = sext <16 x i32> %ind to <16 x i64>
  %gep.random = getelementptr float, <16 x float*> %broadcast.splat, <16 x i64> %sext_ind
  %imask = bitcast i16 %mask to <16 x i1>
  %res = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %gep.random, i32 4, <16 x i1> %imask, <16 x float>undef)
  ret <16 x float> %res
}

define <16 x i32> @test3(i32* %base, <16 x i32> %ind, i16 %mask) {
; KNL_64-LABEL: test3:
; KNL_64: # BB#0:
; KNL_64-NEXT: kmovw %esi, %k1
; KNL_64-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k1}
; KNL_64-NEXT: vmovaps %zmm1, %zmm0
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test3:
; KNL_32: # BB#0:
; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; KNL_32-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm1 {%k1}
; KNL_32-NEXT: vmovaps %zmm1, %zmm0
; KNL_32-NEXT: retl
;
; SKX-LABEL: test3:
; SKX: # BB#0:
; SKX-NEXT: kmovw %esi, %k1
; SKX-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k1}
; SKX-NEXT: vmovaps %zmm1, %zmm0
; SKX-NEXT: retq

  %broadcast.splatinsert = insertelement <16 x i32*> undef, i32* %base, i32 0
  %broadcast.splat = shufflevector <16 x i32*> %broadcast.splatinsert, <16 x i32*> undef, <16 x i32> zeroinitializer

  %sext_ind = sext <16 x i32> %ind to <16 x i64>
  %gep.random = getelementptr i32, <16 x i32*> %broadcast.splat, <16 x i64> %sext_ind
  %imask = bitcast i16 %mask to <16 x i1>
  %res = call <16 x i32> @llvm.masked.gather.v16i32(<16 x i32*> %gep.random, i32 4, <16 x i1> %imask, <16 x i32>undef)
  ret <16 x i32> %res
}


define <16 x i32> @test4(i32* %base, <16 x i32> %ind, i16 %mask) {
; KNL_64-LABEL: test4:
; KNL_64: # BB#0:
; KNL_64-NEXT: kmovw %esi, %k1
; KNL_64-NEXT: kmovw %k1, %k2
; KNL_64-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k2}
; KNL_64-NEXT: vmovaps %zmm1, %zmm2
; KNL_64-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm2 {%k1}
; KNL_64-NEXT: vpaddd %zmm2, %zmm1, %zmm0
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test4:
; KNL_32: # BB#0:
; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; KNL_32-NEXT: kmovw %k1, %k2
; KNL_32-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm1 {%k2}
; KNL_32-NEXT: vmovaps %zmm1, %zmm2
; KNL_32-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm2 {%k1}
; KNL_32-NEXT: vpaddd %zmm2, %zmm1, %zmm0
; KNL_32-NEXT: retl
;
; SKX-LABEL: test4:
; SKX: # BB#0:
; SKX-NEXT: kmovw %esi, %k1
; SKX-NEXT: kmovw %k1, %k2
; SKX-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k2}
; SKX-NEXT: vmovaps %zmm1, %zmm2
; SKX-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm2 {%k1}
; SKX-NEXT: vpaddd %zmm2, %zmm1, %zmm0
; SKX-NEXT: retq

  %broadcast.splatinsert = insertelement <16 x i32*> undef, i32* %base, i32 0
  %broadcast.splat = shufflevector <16 x i32*> %broadcast.splatinsert, <16 x i32*> undef, <16 x i32> zeroinitializer

  %gep.random = getelementptr i32, <16 x i32*> %broadcast.splat, <16 x i32> %ind
  %imask = bitcast i16 %mask to <16 x i1>
  %gt1 = call <16 x i32> @llvm.masked.gather.v16i32(<16 x i32*> %gep.random, i32 4, <16 x i1> %imask, <16 x i32>undef)
  %gt2 = call <16 x i32> @llvm.masked.gather.v16i32(<16 x i32*> %gep.random, i32 4, <16 x i1> %imask, <16 x i32>%gt1)
  %res = add <16 x i32> %gt1, %gt2
  ret <16 x i32> %res
}


; SCALAR-LABEL: test5
; SCALAR: %Mask0 = extractelement <16 x i1> %imask, i32 0
; SCALAR-NEXT: %ToStore0 = icmp eq i1 %Mask0, true
; SCALAR-NEXT: br i1 %ToStore0, label %cond.store, label %else
; SCALAR: cond.store:
; SCALAR-NEXT: %Elt0 = extractelement <16 x i32> %val, i32 0
; SCALAR-NEXT: %Ptr0 = extractelement <16 x i32*> %gep.random, i32 0
; SCALAR-NEXT: store i32 %Elt0, i32* %Ptr0, align 4
; SCALAR-NEXT: br label %else
; SCALAR: else:
; SCALAR-NEXT: %Mask1 = extractelement <16 x i1> %imask, i32 1
; SCALAR-NEXT: %ToStore1 = icmp eq i1 %Mask1, true
; SCALAR-NEXT: br i1 %ToStore1, label %cond.store1, label %else2

define void @test5(i32* %base, <16 x i32> %ind, i16 %mask, <16 x i32>%val) {
; KNL_64-LABEL: test5:
; KNL_64: # BB#0:
; KNL_64-NEXT: kmovw %esi, %k1
; KNL_64-NEXT: kmovw %k1, %k2
; KNL_64-NEXT: vpscatterdd %zmm1, (%rdi,%zmm0,4) {%k2}
; KNL_64-NEXT: vpscatterdd %zmm1, (%rdi,%zmm0,4) {%k1}
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test5:
; KNL_32: # BB#0:
; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; KNL_32-NEXT: kmovw %k1, %k2
; KNL_32-NEXT: vpscatterdd %zmm1, (%eax,%zmm0,4) {%k2}
; KNL_32-NEXT: vpscatterdd %zmm1, (%eax,%zmm0,4) {%k1}
; KNL_32-NEXT: retl
;
; SKX-LABEL: test5:
; SKX: # BB#0:
; SKX-NEXT: kmovw %esi, %k1
; SKX-NEXT: kmovw %k1, %k2
; SKX-NEXT: vpscatterdd %zmm1, (%rdi,%zmm0,4) {%k2}
; SKX-NEXT: vpscatterdd %zmm1, (%rdi,%zmm0,4) {%k1}
; SKX-NEXT: retq

  %broadcast.splatinsert = insertelement <16 x i32*> undef, i32* %base, i32 0
  %broadcast.splat = shufflevector <16 x i32*> %broadcast.splatinsert, <16 x i32*> undef, <16 x i32> zeroinitializer

  %gep.random = getelementptr i32, <16 x i32*> %broadcast.splat, <16 x i32> %ind
  %imask = bitcast i16 %mask to <16 x i1>
  call void @llvm.masked.scatter.v16i32(<16 x i32>%val, <16 x i32*> %gep.random, i32 4, <16 x i1> %imask)
  call void @llvm.masked.scatter.v16i32(<16 x i32>%val, <16 x i32*> %gep.random, i32 4, <16 x i1> %imask)
  ret void
}

declare void @llvm.masked.scatter.v8i32(<8 x i32> , <8 x i32*> , i32 , <8 x i1> )
declare void @llvm.masked.scatter.v16i32(<16 x i32> , <16 x i32*> , i32 , <16 x i1> )


; SCALAR-LABEL: test6
; SCALAR: store i32 %Elt0, i32* %Ptr01, align 4
; SCALAR-NEXT: %Elt1 = extractelement <8 x i32> %a1, i32 1
; SCALAR-NEXT: %Ptr12 = extractelement <8 x i32*> %ptr, i32 1
; SCALAR-NEXT: store i32 %Elt1, i32* %Ptr12, align 4
; SCALAR-NEXT: %Elt2 = extractelement <8 x i32> %a1, i32 2
; SCALAR-NEXT: %Ptr23 = extractelement <8 x i32*> %ptr, i32 2
; SCALAR-NEXT: store i32 %Elt2, i32* %Ptr23, align 4

define <8 x i32> @test6(<8 x i32>%a1, <8 x i32*> %ptr) {
; KNL_64-LABEL: test6:
; KNL_64: # BB#0:
; KNL_64-NEXT: kxnorw %k0, %k0, %k1
; KNL_64-NEXT: kxnorw %k0, %k0, %k2
; KNL_64-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k2}
; KNL_64-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1}
; KNL_64-NEXT: vmovaps %zmm2, %zmm0
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test6:
; KNL_32: # BB#0:
; KNL_32-NEXT: kxnorw %k0, %k0, %k1
; KNL_32-NEXT: vpmovsxdq %ymm1, %zmm2
; KNL_32-NEXT: kxnorw %k0, %k0, %k2
; KNL_32-NEXT: vpgatherqd (,%zmm2), %ymm1 {%k2}
; KNL_32-NEXT: vpscatterqd %ymm0, (,%zmm2) {%k1}
; KNL_32-NEXT: vmovaps %zmm1, %zmm0
; KNL_32-NEXT: retl
;
; SKX-LABEL: test6:
; SKX: # BB#0:
; SKX-NEXT: kxnorw %k0, %k0, %k1
; SKX-NEXT: kxnorw %k0, %k0, %k2
; SKX-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k2}
; SKX-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1}
; SKX-NEXT: vmovaps %zmm2, %zmm0
; SKX-NEXT: retq

  %a = call <8 x i32> @llvm.masked.gather.v8i32(<8 x i32*> %ptr, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)

  call void @llvm.masked.scatter.v8i32(<8 x i32> %a1, <8 x i32*> %ptr, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  ret <8 x i32>%a
}

define <8 x i32> @test7(i32* %base, <8 x i32> %ind, i8 %mask) {
;
; KNL_64-LABEL: test7:
; KNL_64: # BB#0:
; KNL_64-NEXT: kmovw %esi, %k1
; KNL_64-NEXT: vpmovsxdq %ymm0, %zmm0
; KNL_64-NEXT: kmovw %k1, %k2
; KNL_64-NEXT: vpgatherqd (%rdi,%zmm0,4), %ymm1 {%k2}
; KNL_64-NEXT: vmovaps %zmm1, %zmm2
; KNL_64-NEXT: vpgatherqd (%rdi,%zmm0,4), %ymm2 {%k1}
; KNL_64-NEXT: vpaddd %ymm2, %ymm1, %ymm0
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test7:
; KNL_32: # BB#0:
; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
; KNL_32-NEXT: kmovw %ecx, %k1
; KNL_32-NEXT: vpmovsxdq %ymm0, %zmm0
; KNL_32-NEXT: kmovw %k1, %k2
; KNL_32-NEXT: vpgatherqd (%eax,%zmm0,4), %ymm1 {%k2}
; KNL_32-NEXT: vmovaps %zmm1, %zmm2
; KNL_32-NEXT: vpgatherqd (%eax,%zmm0,4), %ymm2 {%k1}
; KNL_32-NEXT: vpaddd %ymm2, %ymm1, %ymm0
; KNL_32-NEXT: retl
;
; SKX-LABEL: test7:
; SKX: # BB#0:
; SKX-NEXT: kmovb %esi, %k1
; SKX-NEXT: kmovw %k1, %k2
; SKX-NEXT: vpgatherdd (%rdi,%ymm0,4), %ymm1 {%k2}
; SKX-NEXT: vmovaps %zmm1, %zmm2
; SKX-NEXT: vpgatherdd (%rdi,%ymm0,4), %ymm2 {%k1}
; SKX-NEXT: vpaddd %ymm2, %ymm1, %ymm0
; SKX-NEXT: retq

  %broadcast.splatinsert = insertelement <8 x i32*> undef, i32* %base, i32 0
  %broadcast.splat = shufflevector <8 x i32*> %broadcast.splatinsert, <8 x i32*> undef, <8 x i32> zeroinitializer

  %gep.random = getelementptr i32, <8 x i32*> %broadcast.splat, <8 x i32> %ind
  %imask = bitcast i8 %mask to <8 x i1>
  %gt1 = call <8 x i32> @llvm.masked.gather.v8i32(<8 x i32*> %gep.random, i32 4, <8 x i1> %imask, <8 x i32>undef)
  %gt2 = call <8 x i32> @llvm.masked.gather.v8i32(<8 x i32*> %gep.random, i32 4, <8 x i1> %imask, <8 x i32>%gt1)
  %res = add <8 x i32> %gt1, %gt2
  ret <8 x i32> %res
}

; No uniform base in this case, index <8 x i64> contains addresses,
; each gather call will be split into two
define <16 x i32> @test8(<16 x i32*> %ptr.random, <16 x i32> %ind, i16 %mask) {
; KNL_64-LABEL: test8:
; KNL_64: # BB#0:
; KNL_64-NEXT: kmovw %edi, %k1
; KNL_64-NEXT: kshiftrw $8, %k1, %k2
; KNL_64-NEXT: kmovw %k2, %k3
; KNL_64-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k3}
; KNL_64-NEXT: kmovw %k1, %k3
; KNL_64-NEXT: vpgatherqd (,%zmm0), %ymm3 {%k3}
; KNL_64-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm4
; KNL_64-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k2}
; KNL_64-NEXT: vpgatherqd (,%zmm0), %ymm3 {%k1}
; KNL_64-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm0
; KNL_64-NEXT: vpaddd %zmm0, %zmm4, %zmm0
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test8:
; KNL_32: # BB#0:
; KNL_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; KNL_32-NEXT: kmovw %k1, %k2
; KNL_32-NEXT: vpgatherdd (,%zmm0), %zmm1 {%k2}
; KNL_32-NEXT: vmovaps %zmm1, %zmm2
; KNL_32-NEXT: vpgatherdd (,%zmm0), %zmm2 {%k1}
; KNL_32-NEXT: vpaddd %zmm2, %zmm1, %zmm0
; KNL_32-NEXT: retl
;
; SKX-LABEL: test8:
; SKX: # BB#0:
; SKX-NEXT: kmovw %edi, %k1
; SKX-NEXT: kshiftrw $8, %k1, %k2
; SKX-NEXT: kmovw %k2, %k3
; SKX-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k3}
; SKX-NEXT: kmovw %k1, %k3
; SKX-NEXT: vpgatherqd (,%zmm0), %ymm3 {%k3}
; SKX-NEXT: vinserti32x8 $1, %ymm2, %zmm3, %zmm4
; SKX-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k2}
; SKX-NEXT: vpgatherqd (,%zmm0), %ymm3 {%k1}
; SKX-NEXT: vinserti32x8 $1, %ymm2, %zmm3, %zmm0
; SKX-NEXT: vpaddd %zmm0, %zmm4, %zmm0
; SKX-NEXT: retq
;
; SKX_32-LABEL: test8:
; SKX_32: # BB#0:
; SKX_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; SKX_32-NEXT: kmovw %k1, %k2
; SKX_32-NEXT: vpgatherdd (,%zmm0), %zmm1 {%k2}
; SKX_32-NEXT: vmovaps %zmm1, %zmm2
; SKX_32-NEXT: vpgatherdd (,%zmm0), %zmm2 {%k1}
; SKX_32-NEXT: vpaddd %zmm2, %zmm1, %zmm0
; SKX_32-NEXT: retl

  %imask = bitcast i16 %mask to <16 x i1>
  %gt1 = call <16 x i32> @llvm.masked.gather.v16i32(<16 x i32*> %ptr.random, i32 4, <16 x i1> %imask, <16 x i32>undef)
  %gt2 = call <16 x i32> @llvm.masked.gather.v16i32(<16 x i32*> %ptr.random, i32 4, <16 x i1> %imask, <16 x i32>%gt1)
  %res = add <16 x i32> %gt1, %gt2
  ret <16 x i32> %res
}

%struct.RT = type { i8, [10 x [20 x i32]], i8 }
%struct.ST = type { i32, double, %struct.RT }

; Masked gather for agregate types
; Test9 and Test10 should give the same result (scalar and vector indices in GEP)


define <8 x i32> @test9(%struct.ST* %base, <8 x i64> %ind1, <8 x i32>%ind5) {
; KNL_64-LABEL: test9:
; KNL_64: # BB#0: # %entry
; KNL_64-NEXT: vpbroadcastq %rdi, %zmm2
; KNL_64-NEXT: vpmovsxdq %ymm1, %zmm1
; KNL_64-NEXT: vpbroadcastq {{.*}}(%rip), %zmm3
; KNL_64-NEXT: vpmuludq %zmm3, %zmm1, %zmm4
; KNL_64-NEXT: vpsrlq $32, %zmm1, %zmm1
; KNL_64-NEXT: vpmuludq %zmm3, %zmm1, %zmm1
; KNL_64-NEXT: vpsllq $32, %zmm1, %zmm1
; KNL_64-NEXT: vpaddq %zmm1, %zmm4, %zmm1
; KNL_64-NEXT: vpbroadcastq {{.*}}(%rip), %zmm3
; KNL_64-NEXT: vpmuludq %zmm3, %zmm0, %zmm4
; KNL_64-NEXT: vpsrlq $32, %zmm0, %zmm0
; KNL_64-NEXT: vpmuludq %zmm3, %zmm0, %zmm0
; KNL_64-NEXT: vpsllq $32, %zmm0, %zmm0
; KNL_64-NEXT: vpaddq %zmm0, %zmm4, %zmm0
; KNL_64-NEXT: vpaddq %zmm0, %zmm2, %zmm0
; KNL_64-NEXT: vpaddq %zmm1, %zmm0, %zmm0
; KNL_64-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm1
; KNL_64-NEXT: kxnorw %k0, %k0, %k1
; KNL_64-NEXT: vpgatherqd (,%zmm1), %ymm0 {%k1}
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test9:
; KNL_32: # BB#0: # %entry
; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %ymm2
; KNL_32-NEXT: vpbroadcastd .LCPI8_0, %ymm3
; KNL_32-NEXT: vpmulld %ymm3, %ymm1, %ymm1
; KNL_32-NEXT: vpmovqd %zmm0, %ymm0
; KNL_32-NEXT: vpbroadcastd .LCPI8_1, %ymm3
; KNL_32-NEXT: vpmulld %ymm3, %ymm0, %ymm0
; KNL_32-NEXT: vpaddd %ymm0, %ymm2, %ymm0
; KNL_32-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; KNL_32-NEXT: vpbroadcastd .LCPI8_2, %ymm1
; KNL_32-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; KNL_32-NEXT: vpmovsxdq %ymm0, %zmm1
; KNL_32-NEXT: kxnorw %k0, %k0, %k1
; KNL_32-NEXT: vpgatherqd (,%zmm1), %ymm0 {%k1}
; KNL_32-NEXT: retl
;
; SKX-LABEL: test9:
; SKX: # BB#0: # %entry
; SKX-NEXT: vpbroadcastq %rdi, %zmm2
; SKX-NEXT: vpmullq {{.*}}(%rip){1to8}, %zmm0, %zmm0
; SKX-NEXT: vpaddq %zmm0, %zmm2, %zmm0
; SKX-NEXT: vpmovsxdq %ymm1, %zmm1
; SKX-NEXT: vpmullq {{.*}}(%rip){1to8}, %zmm1, %zmm1
; SKX-NEXT: vpaddq %zmm1, %zmm0, %zmm0
; SKX-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm1
; SKX-NEXT: kxnorw %k0, %k0, %k1
; SKX-NEXT: vpgatherqd (,%zmm1), %ymm0 {%k1}
; SKX-NEXT: retq
entry:
  %broadcast.splatinsert = insertelement <8 x %struct.ST*> undef, %struct.ST* %base, i32 0
  %broadcast.splat = shufflevector <8 x %struct.ST*> %broadcast.splatinsert, <8 x %struct.ST*> undef, <8 x i32> zeroinitializer

  %arrayidx = getelementptr %struct.ST, <8 x %struct.ST*> %broadcast.splat, <8 x i64> %ind1, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>, <8 x i32><i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, <8 x i32> %ind5, <8 x i64> <i64 13, i64 13, i64 13, i64 13, i64 13, i64 13, i64 13, i64 13>
  %res = call <8 x i32 > @llvm.masked.gather.v8i32(<8 x i32*>%arrayidx, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
  ret <8 x i32> %res
}

define <8 x i32> @test10(%struct.ST* %base, <8 x i64> %i1, <8 x i32>%ind5) {
; KNL_64-LABEL: test10:
; KNL_64: # BB#0: # %entry
; KNL_64-NEXT: vpbroadcastq %rdi, %zmm2
; KNL_64-NEXT: vpmovsxdq %ymm1, %zmm1
; KNL_64-NEXT: vpbroadcastq {{.*}}(%rip), %zmm3
; KNL_64-NEXT: vpmuludq %zmm3, %zmm1, %zmm4
; KNL_64-NEXT: vpsrlq $32, %zmm1, %zmm1
; KNL_64-NEXT: vpmuludq %zmm3, %zmm1, %zmm1
; KNL_64-NEXT: vpsllq $32, %zmm1, %zmm1
; KNL_64-NEXT: vpaddq %zmm1, %zmm4, %zmm1
; KNL_64-NEXT: vpbroadcastq {{.*}}(%rip), %zmm3
; KNL_64-NEXT: vpmuludq %zmm3, %zmm0, %zmm4
; KNL_64-NEXT: vpsrlq $32, %zmm0, %zmm0
; KNL_64-NEXT: vpmuludq %zmm3, %zmm0, %zmm0
; KNL_64-NEXT: vpsllq $32, %zmm0, %zmm0
; KNL_64-NEXT: vpaddq %zmm0, %zmm4, %zmm0
; KNL_64-NEXT: vpaddq %zmm0, %zmm2, %zmm0
; KNL_64-NEXT: vpaddq %zmm1, %zmm0, %zmm0
; KNL_64-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm1
; KNL_64-NEXT: kxnorw %k0, %k0, %k1
; KNL_64-NEXT: vpgatherqd (,%zmm1), %ymm0 {%k1}
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test10:
; KNL_32: # BB#0: # %entry
; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %ymm2
; KNL_32-NEXT: vpbroadcastd .LCPI9_0, %ymm3
; KNL_32-NEXT: vpmulld %ymm3, %ymm1, %ymm1
; KNL_32-NEXT: vpmovqd %zmm0, %ymm0
; KNL_32-NEXT: vpbroadcastd .LCPI9_1, %ymm3
; KNL_32-NEXT: vpmulld %ymm3, %ymm0, %ymm0
; KNL_32-NEXT: vpaddd %ymm0, %ymm2, %ymm0
; KNL_32-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; KNL_32-NEXT: vpbroadcastd .LCPI9_2, %ymm1
; KNL_32-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; KNL_32-NEXT: vpmovsxdq %ymm0, %zmm1
; KNL_32-NEXT: kxnorw %k0, %k0, %k1
; KNL_32-NEXT: vpgatherqd (,%zmm1), %ymm0 {%k1}
; KNL_32-NEXT: retl
;
; SKX-LABEL: test10:
; SKX: # BB#0: # %entry
; SKX-NEXT: vpbroadcastq %rdi, %zmm2
; SKX-NEXT: vpmullq {{.*}}(%rip){1to8}, %zmm0, %zmm0
; SKX-NEXT: vpaddq %zmm0, %zmm2, %zmm0
; SKX-NEXT: vpmovsxdq %ymm1, %zmm1
; SKX-NEXT: vpmullq {{.*}}(%rip){1to8}, %zmm1, %zmm1
; SKX-NEXT: vpaddq %zmm1, %zmm0, %zmm0
; SKX-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm1
; SKX-NEXT: kxnorw %k0, %k0, %k1
; SKX-NEXT: vpgatherqd (,%zmm1), %ymm0 {%k1}
; SKX-NEXT: retq
entry:
  %broadcast.splatinsert = insertelement <8 x %struct.ST*> undef, %struct.ST* %base, i32 0
  %broadcast.splat = shufflevector <8 x %struct.ST*> %broadcast.splatinsert, <8 x %struct.ST*> undef, <8 x i32> zeroinitializer

  %arrayidx = getelementptr %struct.ST, <8 x %struct.ST*> %broadcast.splat, <8 x i64> %i1, i32 2, i32 1, <8 x i32> %ind5, i64 13
  %res = call <8 x i32 > @llvm.masked.gather.v8i32(<8 x i32*>%arrayidx, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
  ret <8 x i32> %res
}

; Splat index in GEP, requires broadcast
define <16 x float> @test11(float* %base, i32 %ind) {
; KNL_64-LABEL: test11:
; KNL_64: # BB#0:
; KNL_64-NEXT: vpbroadcastd %esi, %zmm1
; KNL_64-NEXT: kxnorw %k0, %k0, %k1
; KNL_64-NEXT: vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1}
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test11:
; KNL_32: # BB#0:
; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %zmm1
; KNL_32-NEXT: kxnorw %k0, %k0, %k1
; KNL_32-NEXT: vgatherdps (%eax,%zmm1,4), %zmm0 {%k1}
; KNL_32-NEXT: retl
;
; SKX-LABEL: test11:
; SKX: # BB#0:
; SKX-NEXT: vpbroadcastd %esi, %zmm1
; SKX-NEXT: kxnorw %k0, %k0, %k1
; SKX-NEXT: vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1}
; SKX-NEXT: retq

  %broadcast.splatinsert = insertelement <16 x float*> undef, float* %base, i32 0
  %broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer

  %gep.random = getelementptr float, <16 x float*> %broadcast.splat, i32 %ind

  %res = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
  ret <16 x float>%res
}

; We are checking the uniform base here. It is taken directly from input to vgatherdps
define <16 x float> @test12(float* %base, <16 x i32> %ind) {
; KNL_64-LABEL: test12:
; KNL_64: # BB#0:
; KNL_64-NEXT: kxnorw %k0, %k0, %k1
; KNL_64-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
; KNL_64-NEXT: vmovaps %zmm1, %zmm0
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test12:
; KNL_32: # BB#0:
; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT: kxnorw %k0, %k0, %k1
; KNL_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
; KNL_32-NEXT: vmovaps %zmm1, %zmm0
; KNL_32-NEXT: retl
;
; SKX-LABEL: test12:
; SKX: # BB#0:
; SKX-NEXT: kxnorw %k0, %k0, %k1
; SKX-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
; SKX-NEXT: vmovaps %zmm1, %zmm0
; SKX-NEXT: retq

  %sext_ind = sext <16 x i32> %ind to <16 x i64>
  %gep.random = getelementptr float, float *%base, <16 x i64> %sext_ind

  %res = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
  ret <16 x float>%res
}

; The same as the previous, but the mask is undefined
define <16 x float> @test13(float* %base, <16 x i32> %ind) {
; KNL_64-LABEL: test13:
; KNL_64: # BB#0:
; KNL_64-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
; KNL_64-NEXT: vmovaps %zmm1, %zmm0
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test13:
; KNL_32: # BB#0:
; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
; KNL_32-NEXT: vmovaps %zmm1, %zmm0
; KNL_32-NEXT: retl
;
; SKX-LABEL: test13:
; SKX: # BB#0:
; SKX-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
; SKX-NEXT: vmovaps %zmm1, %zmm0
; SKX-NEXT: retq

  %sext_ind = sext <16 x i32> %ind to <16 x i64>
  %gep.random = getelementptr float, float *%base, <16 x i64> %sext_ind

  %res = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %gep.random, i32 4, <16 x i1> undef, <16 x float> undef)
  ret <16 x float>%res
}

; The base pointer is not splat, can't find unform base
define <16 x float> @test14(float* %base, i32 %ind, <16 x float*> %vec) {
; KNL_64-LABEL: test14:
; KNL_64: # BB#0:
; KNL_64-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm1
; KNL_64-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0
; KNL_64-NEXT: vpbroadcastq %xmm0, %zmm0
; KNL_64-NEXT: vmovd %esi, %xmm1
; KNL_64-NEXT: vpbroadcastd %xmm1, %ymm1
; KNL_64-NEXT: vpmovsxdq %ymm1, %zmm1
; KNL_64-NEXT: vpsllq $2, %zmm1, %zmm1
; KNL_64-NEXT: vpaddq %zmm1, %zmm0, %zmm0
; KNL_64-NEXT: kshiftrw $8, %k0, %k1
; KNL_64-NEXT: vgatherqps (,%zmm0), %ymm1 {%k1}
; KNL_64-NEXT: vgatherqps (,%zmm0), %ymm2 {%k1}
; KNL_64-NEXT: vinsertf64x4 $1, %ymm1, %zmm2, %zmm0
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test14:
; KNL_32: # BB#0:
; KNL_32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm1
; KNL_32-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0
; KNL_32-NEXT: vpbroadcastd %xmm0, %zmm0
; KNL_32-NEXT: vpslld $2, {{[0-9]+}}(%esp){1to16}, %zmm1
; KNL_32-NEXT: vpaddd %zmm1, %zmm0, %zmm1
; KNL_32-NEXT: vgatherdps (,%zmm1), %zmm0 {%k1}
; KNL_32-NEXT: retl
;
; SKX-LABEL: test14:
; SKX: # BB#0:
; SKX-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm1
; SKX-NEXT: vinserti64x2 $0, %xmm1, %zmm0, %zmm0
; SKX-NEXT: vpbroadcastq %xmm0, %zmm0
; SKX-NEXT: vpbroadcastd %esi, %ymm1
; SKX-NEXT: vpmovsxdq %ymm1, %zmm1
; SKX-NEXT: vpsllq $2, %zmm1, %zmm1
; SKX-NEXT: vpaddq %zmm1, %zmm0, %zmm0
; SKX-NEXT: kshiftrw $8, %k0, %k1
; SKX-NEXT: vgatherqps (,%zmm0), %ymm1 {%k1}
; SKX-NEXT: vgatherqps (,%zmm0), %ymm2 {%k1}
; SKX-NEXT: vinsertf32x8 $1, %ymm1, %zmm2, %zmm0
; SKX-NEXT: retq
;
; SKX_32-LABEL: test14:
; SKX_32: # BB#0:
; SKX_32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm1
; SKX_32-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0
; SKX_32-NEXT: vpbroadcastd %xmm0, %zmm0
; SKX_32-NEXT: vpslld $2, {{[0-9]+}}(%esp){1to16}, %zmm1
; SKX_32-NEXT: vpaddd %zmm1, %zmm0, %zmm1
; SKX_32-NEXT: vgatherdps (,%zmm1), %zmm0 {%k1}
; SKX_32-NEXT: retl

  %broadcast.splatinsert = insertelement <16 x float*> %vec, float* %base, i32 1
  %broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer

  %gep.random = getelementptr float, <16 x float*> %broadcast.splat, i32 %ind

  %res = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %gep.random, i32 4, <16 x i1> undef, <16 x float> undef)
  ret <16 x float>%res
}

declare <4 x float> @llvm.masked.gather.v4f32(<4 x float*>, i32, <4 x i1>, <4 x float>)
declare <4 x double> @llvm.masked.gather.v4f64(<4 x double*>, i32, <4 x i1>, <4 x double>)
declare <2 x double> @llvm.masked.gather.v2f64(<2 x double*>, i32, <2 x i1>, <2 x double>)

; Gather smaller than existing instruction
define <4 x float> @test15(float* %base, <4 x i32> %ind, <4 x i1> %mask) {
;
; KNL_64-LABEL: test15:
; KNL_64: # BB#0:
; KNL_64: vpxor %ymm2, %ymm2, %ymm2
; KNL_64-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
; KNL_64-NEXT: vpmovsxdq %ymm0, %zmm2
; KNL_64-NEXT: vpslld $31, %ymm1, %ymm0
; KNL_64-NEXT: vptestmd %zmm0, %zmm0, %k1
; KNL_64-NEXT: vgatherqps (%rdi,%zmm2,4), %ymm0 {%k1}
; KNL_64-NEXT: # kill
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test15:
; KNL_32: # BB#0:
; KNL_32: vpxor %ymm2, %ymm2, %ymm2
; KNL_32-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT: vpmovsxdq %ymm0, %zmm2
; KNL_32-NEXT: vpslld $31, %ymm1, %ymm0
; KNL_32-NEXT: vptestmd %zmm0, %zmm0, %k1
; KNL_32-NEXT: vgatherqps (%eax,%zmm2,4), %ymm0 {%k1}
; KNL_32-NEXT: # kill
; KNL_32-NEXT: retl
;
; SKX-LABEL: test15:
; SKX: # BB#0:
; SKX-NEXT: vpslld $31, %xmm1, %xmm1
; SKX-NEXT: vptestmd %xmm1, %xmm1, %k1
; SKX-NEXT: vgatherdps (%rdi,%xmm0,4), %xmm1 {%k1}
; SKX-NEXT: vmovaps %zmm1, %zmm0
; SKX-NEXT: retq
;
; SKX_32-LABEL: test15:
; SKX_32: # BB#0:
; SKX_32-NEXT: vpslld $31, %xmm1, %xmm1
; SKX_32-NEXT: vptestmd %xmm1, %xmm1, %k1
; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; SKX_32-NEXT: vgatherdps (%eax,%xmm0,4), %xmm1 {%k1}
; SKX_32-NEXT: vmovaps %zmm1, %zmm0
; SKX_32-NEXT: retl

  %sext_ind = sext <4 x i32> %ind to <4 x i64>
  %gep.random = getelementptr float, float* %base, <4 x i64> %sext_ind
  %res = call <4 x float> @llvm.masked.gather.v4f32(<4 x float*> %gep.random, i32 4, <4 x i1> %mask, <4 x float> undef)
  ret <4 x float>%res
}

; Gather smaller than existing instruction
define <4 x double> @test16(double* %base, <4 x i32> %ind, <4 x i1> %mask, <4 x double> %src0) {
;
; KNL_64-LABEL: test16:
; KNL_64: # BB#0:
; KNL_64: vpslld $31, %xmm1, %xmm1
; KNL_64-NEXT: vpsrad $31, %xmm1, %xmm1
; KNL_64-NEXT: vpmovsxdq %xmm1, %ymm1
; KNL_64-NEXT: vpxord %zmm3, %zmm3, %zmm3
; KNL_64-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
; KNL_64-NEXT: vpmovsxdq %ymm0, %zmm0
; KNL_64-NEXT: vpsllq $63, %zmm1, %zmm1
; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k1
; KNL_64-NEXT: vgatherqpd (%rdi,%zmm0,8), %zmm2 {%k1}
; KNL_64-NEXT: vmovaps %zmm2, %zmm0
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test16:
; KNL_32: # BB#0:
; KNL_32: vpslld $31, %xmm1, %xmm1
; KNL_32-NEXT: vpsrad $31, %xmm1, %xmm1
; KNL_32-NEXT: vpmovsxdq %xmm1, %ymm1
; KNL_32-NEXT: vpxord %zmm3, %zmm3, %zmm3
; KNL_32-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT: vpmovsxdq %ymm0, %zmm0
; KNL_32-NEXT: vpsllvq .LCPI15_0, %zmm1, %zmm1
; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k1
; KNL_32-NEXT: vgatherqpd (%eax,%zmm0,8), %zmm2 {%k1}
; KNL_32-NEXT: vmovaps %zmm2, %zmm0
; KNL_32-NEXT: retl
;
; SKX-LABEL: test16:
; SKX: # BB#0:
; SKX-NEXT: vpslld $31, %xmm1, %xmm1
; SKX-NEXT: vptestmd %xmm1, %xmm1, %k1
; SKX-NEXT: vgatherdpd (%rdi,%xmm0,8), %ymm2 {%k1}
; SKX-NEXT: vmovaps %zmm2, %zmm0
; SKX-NEXT: retq
;
; SKX_32-LABEL: test16:
; SKX_32: # BB#0:
; SKX_32-NEXT: vpslld $31, %xmm1, %xmm1
; SKX_32-NEXT: vptestmd %xmm1, %xmm1, %k1
; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; SKX_32-NEXT: vgatherdpd (%eax,%xmm0,8), %ymm2 {%k1}
; SKX_32-NEXT: vmovaps %zmm2, %zmm0
; SKX_32-NEXT: retl

  %sext_ind = sext <4 x i32> %ind to <4 x i64>
  %gep.random = getelementptr double, double* %base, <4 x i64> %sext_ind
  %res = call <4 x double> @llvm.masked.gather.v4f64(<4 x double*> %gep.random, i32 4, <4 x i1> %mask, <4 x double> %src0)
  ret <4 x double>%res
}

define <2 x double> @test17(double* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x double> %src0) {
;
; KNL_64-LABEL: test17:
; KNL_64: # BB#0:
; KNL_64: vpxord %zmm3, %zmm3, %zmm3
; KNL_64-NEXT: vinserti32x4 $0, %xmm1, %zmm3, %zmm1
; KNL_64-NEXT: vpsllq $63, %zmm1, %zmm1
; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k1
; KNL_64-NEXT: vgatherqpd (%rdi,%zmm0,8), %zmm2 {%k1}
; KNL_64-NEXT: vmovaps %zmm2, %zmm0
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test17:
; KNL_32: # BB#0:
; KNL_32: vpxord %zmm3, %zmm3, %zmm3
; KNL_32-NEXT: vinserti32x4 $0, %xmm1, %zmm3, %zmm1
; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT: vpsllvq .LCPI16_0, %zmm1, %zmm1
; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k1
; KNL_32-NEXT: vgatherqpd (%eax,%zmm0,8), %zmm2 {%k1}
; KNL_32-NEXT: vmovaps %zmm2, %zmm0
; KNL_32-NEXT: retl
;
; SKX-LABEL: test17:
; SKX: # BB#0:
; SKX-NEXT: vpsllq $63, %xmm1, %xmm1
; SKX-NEXT: vptestmq %xmm1, %xmm1, %k1
; SKX-NEXT: vgatherqpd (%rdi,%xmm0,8), %xmm2 {%k1}
; SKX-NEXT: vmovaps %zmm2, %zmm0
; SKX-NEXT: retq
;
; SKX_32-LABEL: test17:
; SKX_32: # BB#0:
; SKX_32-NEXT: vpsllq $63, %xmm1, %xmm1
; SKX_32-NEXT: vptestmq %xmm1, %xmm1, %k1
; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; SKX_32-NEXT: vgatherqpd (%eax,%xmm0,8), %xmm2 {%k1}
; SKX_32-NEXT: vmovaps %zmm2, %zmm0
; SKX_32-NEXT: retl

  %sext_ind = sext <2 x i32> %ind to <2 x i64>
  %gep.random = getelementptr double, double* %base, <2 x i64> %sext_ind
  %res = call <2 x double> @llvm.masked.gather.v2f64(<2 x double*> %gep.random, i32 4, <2 x i1> %mask, <2 x double> %src0)
  ret <2 x double>%res
}

declare void @llvm.masked.scatter.v4i32(<4 x i32> , <4 x i32*> , i32 , <4 x i1> )
declare void @llvm.masked.scatter.v4f64(<4 x double> , <4 x double*> , i32 , <4 x i1> )
declare void @llvm.masked.scatter.v2i64(<2 x i64> , <2 x i64*> , i32 , <2 x i1> )
declare void @llvm.masked.scatter.v2i32(<2 x i32> , <2 x i32*> , i32 , <2 x i1> )
declare void @llvm.masked.scatter.v2f32(<2 x float> , <2 x float*> , i32 , <2 x i1> )

define void @test18(<4 x i32>%a1, <4 x i32*> %ptr, <4 x i1>%mask) {
;
; KNL_64-LABEL: test18:
; KNL_64: # BB#0:
; KNL_64: vpxor %ymm3, %ymm3, %ymm3
; KNL_64-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
; KNL_64-NEXT: vpslld $31, %ymm2, %ymm2
; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1
; KNL_64-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1}
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test18:
; KNL_32: # BB#0:
; KNL_32: vpxor %ymm3, %ymm3, %ymm3
; KNL_32-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
; KNL_32-NEXT: vpmovsxdq %ymm1, %zmm1
; KNL_32-NEXT: vpslld $31, %ymm2, %ymm2
; KNL_32-NEXT: vptestmd %zmm2, %zmm2, %k1
; KNL_32-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1}
; KNL_32-NEXT: retl
;
; SKX-LABEL: test18:
; SKX: # BB#0:
; SKX-NEXT: vpslld $31, %xmm2, %xmm2
; SKX-NEXT: vptestmd %xmm2, %xmm2, %k1
; SKX-NEXT: vpscatterqd %xmm0, (,%ymm1) {%k1}
; SKX-NEXT: retq
;
; SKX_32-LABEL: test18:
; SKX_32: # BB#0:
; SKX_32-NEXT: vpslld $31, %xmm2, %xmm2
; SKX_32-NEXT: vptestmd %xmm2, %xmm2, %k1
; SKX_32-NEXT: vpscatterdd %xmm0, (,%xmm1) {%k1}
; SKX_32-NEXT: retl
  call void @llvm.masked.scatter.v4i32(<4 x i32> %a1, <4 x i32*> %ptr, i32 4, <4 x i1> %mask)
  ret void
}

define void @test19(<4 x double>%a1, double* %ptr, <4 x i1>%mask, <4 x i64> %ind) {
;
; KNL_64-LABEL: test19:
; KNL_64: # BB#0:
; KNL_64: vpslld $31, %xmm1, %xmm1
; KNL_64-NEXT: vpsrad $31, %xmm1, %xmm1
; KNL_64-NEXT: vpmovsxdq %xmm1, %ymm1
; KNL_64-NEXT: vpxord %zmm3, %zmm3, %zmm3
; KNL_64-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
; KNL_64-NEXT: vpsllq $63, %zmm1, %zmm1
; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k1
; KNL_64-NEXT: vscatterqpd %zmm0, (%rdi,%zmm2,8) {%k1}
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test19:
; KNL_32: # BB#0:
; KNL_32: vpslld $31, %xmm1, %xmm1
; KNL_32-NEXT: vpsrad $31, %xmm1, %xmm1
; KNL_32-NEXT: vpmovsxdq %xmm1, %ymm1
; KNL_32-NEXT: vpxord %zmm3, %zmm3, %zmm3
; KNL_32-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT: vpsllvq .LCPI18_0, %zmm1, %zmm1
; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k1
; KNL_32-NEXT: vscatterqpd %zmm0, (%eax,%zmm2,8) {%k1}
; KNL_32-NEXT: retl
;
; SKX-LABEL: test19:
; SKX: # BB#0:
; SKX-NEXT: vpslld $31, %xmm1, %xmm1
; SKX-NEXT: vptestmd %xmm1, %xmm1, %k1
; SKX-NEXT: vscatterqpd %ymm0, (%rdi,%ymm2,8) {%k1}
; SKX-NEXT: retq
;
; SKX_32-LABEL: test19:
; SKX_32: # BB#0:
; SKX_32-NEXT: vpslld $31, %xmm1, %xmm1
; SKX_32-NEXT: vptestmd %xmm1, %xmm1, %k1
; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; SKX_32-NEXT: vscatterqpd %ymm0, (%eax,%ymm2,8) {%k1}
; SKX_32-NEXT: retl
  %gep = getelementptr double, double* %ptr, <4 x i64> %ind
  call void @llvm.masked.scatter.v4f64(<4 x double> %a1, <4 x double*> %gep, i32 8, <4 x i1> %mask)
  ret
void 911} 912 913; Data type requires widening 914define void @test20(<2 x float>%a1, <2 x float*> %ptr, <2 x i1> %mask) { 915; 916; KNL_64-LABEL: test20: 917; KNL_64: # BB#0: 918; KNL_64: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] 919; KNL_64-NEXT: vmovq {{.*#+}} xmm2 = xmm2[0],zero 920; KNL_64-NEXT: vpxor %ymm3, %ymm3, %ymm3 921; KNL_64-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] 922; KNL_64-NEXT: vpslld $31, %ymm2, %ymm2 923; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1 924; KNL_64-NEXT: vscatterqps %ymm0, (,%zmm1) {%k1} 925; KNL_64-NEXT: retq 926; 927; KNL_32-LABEL: test20: 928; KNL_32: # BB#0: 929; KNL_32: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] 930; KNL_32-NEXT: vmovq {{.*#+}} xmm2 = xmm2[0],zero 931; KNL_32-NEXT: vpxor %ymm3, %ymm3, %ymm3 932; KNL_32-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] 933; KNL_32-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 934; KNL_32-NEXT: vpmovsxdq %ymm1, %zmm1 935; KNL_32-NEXT: vpslld $31, %ymm2, %ymm2 936; KNL_32-NEXT: vptestmd %zmm2, %zmm2, %k1 937; KNL_32-NEXT: vscatterqps %ymm0, (,%zmm1) {%k1} 938; KNL_32-NEXT: retl 939; 940; SKX-LABEL: test20: 941; SKX: # BB#0: 942; SKX: vpsllq $63, %xmm2, %xmm2 943; SKX-NEXT: vptestmq %xmm2, %xmm2, %k0 944; SKX-NEXT: kshiftlb $6, %k0, %k0 945; SKX-NEXT: kshiftrb $6, %k0, %k1 946; SKX-NEXT: vscatterqps %xmm0, (,%ymm1) {%k1} 947; SKX-NEXT: retq 948; 949; SKX_32-LABEL: test20: 950; SKX_32: # BB#0: 951; SKX_32-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 952; SKX_32-NEXT: vpsllq $63, %xmm2, %xmm2 953; SKX_32-NEXT: vptestmq %xmm2, %xmm2, %k0 954; SKX_32-NEXT: kshiftlb $6, %k0, %k0 955; SKX_32-NEXT: kshiftrb $6, %k0, %k1 956; SKX_32-NEXT: vscatterdps %xmm0, (,%xmm1) {%k1} 957; SKX_32-NEXT: retl 958 call void @llvm.masked.scatter.v2f32(<2 x float> %a1, <2 x float*> %ptr, i32 4, <2 x i1> %mask) 959 ret void 960} 961 962; Data type requires promotion 963define void @test21(<2 x i32>%a1, <2 x i32*> %ptr, <2 x i1>%mask) { 964; 965; KNL_64-LABEL: test21: 966; KNL_64: # BB#0: 967; 
KNL_64: vpxord %zmm3, %zmm3, %zmm3 968; KNL_64-NEXT: vinserti32x4 $0, %xmm2, %zmm3, %zmm2 969; KNL_64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 970; KNL_64-NEXT: vpsllq $63, %zmm2, %zmm2 971; KNL_64-NEXT: vptestmq %zmm2, %zmm2, %k1 972; KNL_64-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1} 973; KNL_64-NEXT: retq 974; 975; KNL_32-LABEL: test21: 976; KNL_32: # BB#0: 977; KNL_32: vpxord %zmm3, %zmm3, %zmm3 978; KNL_32-NEXT: vinserti32x4 $0, %xmm2, %zmm3, %zmm2 979; KNL_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 980; KNL_32-NEXT: vpsllvq .LCPI20_0, %zmm2, %zmm2 981; KNL_32-NEXT: vptestmq %zmm2, %zmm2, %k1 982; KNL_32-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1} 983; KNL_32-NEXT: retl 984; 985; SKX-LABEL: test21: 986; SKX: # BB#0: 987; SKX: vpsllq $63, %xmm2, %xmm2 988; SKX-NEXT: vptestmq %xmm2, %xmm2, %k0 989; SKX-NEXT: kshiftlb $6, %k0, %k0 990; SKX-NEXT: kshiftrb $6, %k0, %k1 991; SKX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 992; SKX-NEXT: vpscatterqd %xmm0, (,%ymm1) {%k1} 993; SKX-NEXT: retq 994; 995; SKX_32-LABEL: test21: 996; SKX_32: # BB#0: 997; SKX_32: vpsllq $63, %xmm2, %xmm2 998; SKX_32-NEXT: vptestmq %xmm2, %xmm2, %k0 999; SKX_32-NEXT: kshiftlb $6, %k0, %k0 1000; SKX_32-NEXT: kshiftrb $6, %k0, %k1 1001; SKX_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 1002; SKX_32-NEXT: vpscatterqd %xmm0, (,%ymm1) {%k1} 1003; SKX_32-NEXT: retl 1004 call void @llvm.masked.scatter.v2i32(<2 x i32> %a1, <2 x i32*> %ptr, i32 4, <2 x i1> %mask) 1005 ret void 1006} 1007 1008; The result type requires widening 1009declare <2 x float> @llvm.masked.gather.v2f32(<2 x float*>, i32, <2 x i1>, <2 x float>) 1010 1011define <2 x float> @test22(float* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x float> %src0) { 1012; 1013; 1014; KNL_64-LABEL: test22: 1015; KNL_64: # BB#0: 1016; KNL_64: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 1017; KNL_64-NEXT: vmovq {{.*#+}} xmm1 = xmm1[0],zero 1018; KNL_64-NEXT: vpxor %ymm3, %ymm3, %ymm3 1019; KNL_64-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] 
1020; KNL_64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 1021; KNL_64-NEXT: vpmovsxdq %ymm0, %zmm0 1022; KNL_64-NEXT: vpslld $31, %ymm1, %ymm1 1023; KNL_64-NEXT: vptestmd %zmm1, %zmm1, %k1 1024; KNL_64-NEXT: vgatherqps (%rdi,%zmm0,4), %ymm2 {%k1} 1025; KNL_64-NEXT: vmovaps %zmm2, %zmm0 1026; KNL_64-NEXT: retq 1027; 1028; KNL_32-LABEL: test22: 1029; KNL_32: # BB#0: 1030; KNL_32: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 1031; KNL_32-NEXT: vmovq {{.*#+}} xmm1 = xmm1[0],zero 1032; KNL_32-NEXT: vpxor %ymm3, %ymm3, %ymm3 1033; KNL_32-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] 1034; KNL_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 1035; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax 1036; KNL_32-NEXT: vpmovsxdq %ymm0, %zmm0 1037; KNL_32-NEXT: vpslld $31, %ymm1, %ymm1 1038; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1 1039; KNL_32-NEXT: vgatherqps (%eax,%zmm0,4), %ymm2 {%k1} 1040; KNL_32-NEXT: vmovaps %zmm2, %zmm0 1041; KNL_32-NEXT: retl 1042; 1043; SKX-LABEL: test22: 1044; SKX: # BB#0: 1045; SKX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 1046; SKX-NEXT: vpsllq $63, %xmm1, %xmm1 1047; SKX-NEXT: vptestmq %xmm1, %xmm1, %k0 1048; SKX-NEXT: kshiftlb $6, %k0, %k0 1049; SKX-NEXT: kshiftrb $6, %k0, %k1 1050; SKX-NEXT: vgatherdps (%rdi,%xmm0,4), %xmm2 {%k1} 1051; SKX-NEXT: vmovaps %zmm2, %zmm0 1052; SKX-NEXT: retq 1053; 1054; SKX_32-LABEL: test22: 1055; SKX_32: # BB#0: 1056; SKX_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 1057; SKX_32-NEXT: vpsllq $63, %xmm1, %xmm1 1058; SKX_32-NEXT: vptestmq %xmm1, %xmm1, %k0 1059; SKX_32-NEXT: kshiftlb $6, %k0, %k0 1060; SKX_32-NEXT: kshiftrb $6, %k0, %k1 1061; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax 1062; SKX_32-NEXT: vgatherdps (%eax,%xmm0,4), %xmm2 {%k1} 1063; SKX_32-NEXT: vmovaps %zmm2, %zmm0 1064; SKX_32-NEXT: retl 1065 %sext_ind = sext <2 x i32> %ind to <2 x i64> 1066 %gep.random = getelementptr float, float* %base, <2 x i64> %sext_ind 1067 %res = call <2 x float> @llvm.masked.gather.v2f32(<2 x float*> %gep.random, i32 4, <2 x i1> 
%mask, <2 x float> %src0) 1068 ret <2 x float>%res 1069} 1070 1071declare <2 x i32> @llvm.masked.gather.v2i32(<2 x i32*>, i32, <2 x i1>, <2 x i32>) 1072declare <2 x i64> @llvm.masked.gather.v2i64(<2 x i64*>, i32, <2 x i1>, <2 x i64>) 1073 1074define <2 x i32> @test23(i32* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x i32> %src0) { 1075; 1076; KNL_64-LABEL: test23: 1077; KNL_64: # BB#0: 1078; KNL_64: vpxord %zmm3, %zmm3, %zmm3 1079; KNL_64-NEXT: vinserti32x4 $0, %xmm1, %zmm3, %zmm1 1080; KNL_64-NEXT: vpsllq $63, %zmm1, %zmm1 1081; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k1 1082; KNL_64-NEXT: vpgatherqq (%rdi,%zmm0,8), %zmm2 {%k1} 1083; KNL_64-NEXT: vmovaps %zmm2, %zmm0 1084; KNL_64-NEXT: retq 1085; 1086; KNL_32-LABEL: test23: 1087; KNL_32: # BB#0: 1088; KNL_32: vpxord %zmm3, %zmm3, %zmm3 1089; KNL_32-NEXT: vinserti32x4 $0, %xmm1, %zmm3, %zmm1 1090; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax 1091; KNL_32-NEXT: vpsllvq .LCPI22_0, %zmm1, %zmm1 1092; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k1 1093; KNL_32-NEXT: vpgatherqq (%eax,%zmm0,8), %zmm2 {%k1} 1094; KNL_32-NEXT: vmovaps %zmm2, %zmm0 1095; KNL_32-NEXT: retl 1096; 1097; SKX-LABEL: test23: 1098; SKX: # BB#0: 1099; SKX-NEXT: vpsllq $63, %xmm1, %xmm1 1100; SKX-NEXT: vptestmq %xmm1, %xmm1, %k1 1101; SKX-NEXT: vpgatherqq (%rdi,%xmm0,8), %xmm2 {%k1} 1102; SKX-NEXT: vmovaps %zmm2, %zmm0 1103; SKX-NEXT: retq 1104; 1105; SKX_32-LABEL: test23: 1106; SKX_32: # BB#0: 1107; SKX_32-NEXT: vpsllq $63, %xmm1, %xmm1 1108; SKX_32-NEXT: vptestmq %xmm1, %xmm1, %k1 1109; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax 1110; SKX_32-NEXT: vpgatherqq (%eax,%xmm0,8), %xmm2 {%k1} 1111; SKX_32-NEXT: vmovaps %zmm2, %zmm0 1112; SKX_32-NEXT: retl 1113 %sext_ind = sext <2 x i32> %ind to <2 x i64> 1114 %gep.random = getelementptr i32, i32* %base, <2 x i64> %sext_ind 1115 %res = call <2 x i32> @llvm.masked.gather.v2i32(<2 x i32*> %gep.random, i32 4, <2 x i1> %mask, <2 x i32> %src0) 1116 ret <2 x i32>%res 1117} 1118 1119define <2 x i32> @test24(i32* %base, <2 x i32> 
%ind) { 1120; KNL_64-LABEL: test24: 1121; KNL_64: # BB#0: 1122; KNL_64: movb $3, %al 1123; KNL_64-NEXT: kmovw %eax, %k1 1124; KNL_64-NEXT: vpgatherqq (%rdi,%zmm0,8), %zmm1 {%k1} 1125; KNL_64-NEXT: vmovaps %zmm1, %zmm0 1126; KNL_64-NEXT: retq 1127; 1128; KNL_32-LABEL: test24: 1129; KNL_32: # BB#0: 1130; KNL_32: movl {{[0-9]+}}(%esp), %eax 1131; KNL_32-NEXT: vpxord %zmm1, %zmm1, %zmm1 1132; KNL_32-NEXT: vinserti32x4 $0, .LCPI23_0, %zmm1, %zmm1 1133; KNL_32-NEXT: vpsllvq .LCPI23_1, %zmm1, %zmm1 1134; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k1 1135; KNL_32-NEXT: vpgatherqq (%eax,%zmm0,8), %zmm1 {%k1} 1136; KNL_32-NEXT: vmovaps %zmm1, %zmm0 1137; KNL_32-NEXT: retl 1138; 1139; SKX-LABEL: test24: 1140; SKX: # BB#0: 1141; SKX-NEXT: kxnorw %k0, %k0, %k1 1142; SKX-NEXT: vpgatherqq (%rdi,%xmm0,8), %xmm1 {%k1} 1143; SKX-NEXT: vmovaps %zmm1, %zmm0 1144; SKX-NEXT: retq 1145; 1146; SKX_32-LABEL: test24: 1147; SKX_32: # BB#0: 1148; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax 1149; SKX_32-NEXT: kxnorw %k0, %k0, %k1 1150; SKX_32-NEXT: vpgatherqq (%eax,%xmm0,8), %xmm1 {%k1} 1151; SKX_32-NEXT: vmovaps %zmm1, %zmm0 1152; SKX_32-NEXT: retl 1153 %sext_ind = sext <2 x i32> %ind to <2 x i64> 1154 %gep.random = getelementptr i32, i32* %base, <2 x i64> %sext_ind 1155 %res = call <2 x i32> @llvm.masked.gather.v2i32(<2 x i32*> %gep.random, i32 4, <2 x i1> <i1 true, i1 true>, <2 x i32> undef) 1156 ret <2 x i32>%res 1157} 1158 1159define <2 x i64> @test25(i64* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x i64> %src0) { 1160; 1161; KNL_64-LABEL: test25: 1162; KNL_64: # BB#0: 1163; KNL_64: vpxord %zmm3, %zmm3, %zmm3 1164; KNL_64-NEXT: vinserti32x4 $0, %xmm1, %zmm3, %zmm1 1165; KNL_64-NEXT: vpsllq $63, %zmm1, %zmm1 1166; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k1 1167; KNL_64-NEXT: vpgatherqq (%rdi,%zmm0,8), %zmm2 {%k1} 1168; KNL_64-NEXT: vmovaps %zmm2, %zmm0 1169; KNL_64-NEXT: retq 1170; 1171; KNL_32-LABEL: test25: 1172; KNL_32: # BB#0: 1173; KNL_32: vpxord %zmm3, %zmm3, %zmm3 1174; KNL_32-NEXT: vinserti32x4 
$0, %xmm1, %zmm3, %zmm1 1175; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax 1176; KNL_32-NEXT: vpsllvq .LCPI24_0, %zmm1, %zmm1 1177; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k1 1178; KNL_32-NEXT: vpgatherqq (%eax,%zmm0,8), %zmm2 {%k1} 1179; KNL_32-NEXT: vmovaps %zmm2, %zmm0 1180; KNL_32-NEXT: retl 1181; 1182; SKX-LABEL: test25: 1183; SKX: # BB#0: 1184; SKX-NEXT: vpsllq $63, %xmm1, %xmm1 1185; SKX-NEXT: vptestmq %xmm1, %xmm1, %k1 1186; SKX-NEXT: vpgatherqq (%rdi,%xmm0,8), %xmm2 {%k1} 1187; SKX-NEXT: vmovaps %zmm2, %zmm0 1188; SKX-NEXT: retq 1189; 1190; SKX_32-LABEL: test25: 1191; SKX_32: # BB#0: 1192; SKX_32-NEXT: vpsllq $63, %xmm1, %xmm1 1193; SKX_32-NEXT: vptestmq %xmm1, %xmm1, %k1 1194; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax 1195; SKX_32-NEXT: vpgatherqq (%eax,%xmm0,8), %xmm2 {%k1} 1196; SKX_32-NEXT: vmovaps %zmm2, %zmm0 1197; SKX_32-NEXT: retl 1198 %sext_ind = sext <2 x i32> %ind to <2 x i64> 1199 %gep.random = getelementptr i64, i64* %base, <2 x i64> %sext_ind 1200 %res = call <2 x i64> @llvm.masked.gather.v2i64(<2 x i64*> %gep.random, i32 8, <2 x i1> %mask, <2 x i64> %src0) 1201 ret <2 x i64>%res 1202} 1203 1204define <2 x i64> @test26(i64* %base, <2 x i32> %ind, <2 x i64> %src0) { 1205; 1206; KNL_64-LABEL: test26: 1207; KNL_64: # BB#0: 1208; KNL_64: movb $3, %al 1209; KNL_64-NEXT: kmovw %eax, %k1 1210; KNL_64-NEXT: vpgatherqq (%rdi,%zmm0,8), %zmm1 {%k1} 1211; KNL_64-NEXT: vmovaps %zmm1, %zmm0 1212; KNL_64-NEXT: retq 1213; 1214; KNL_32-LABEL: test26: 1215; KNL_32: # BB#0: 1216; KNL_32: movl {{[0-9]+}}(%esp), %eax 1217; KNL_32-NEXT: vpxord %zmm2, %zmm2, %zmm2 1218; KNL_32-NEXT: vinserti32x4 $0, .LCPI25_0, %zmm2, %zmm2 1219; KNL_32-NEXT: vpsllvq .LCPI25_1, %zmm2, %zmm2 1220; KNL_32-NEXT: vptestmq %zmm2, %zmm2, %k1 1221; KNL_32-NEXT: vpgatherqq (%eax,%zmm0,8), %zmm1 {%k1} 1222; KNL_32-NEXT: vmovaps %zmm1, %zmm0 1223; KNL_32-NEXT: retl 1224; 1225; SKX-LABEL: test26: 1226; SKX: # BB#0: 1227; SKX-NEXT: kxnorw %k0, %k0, %k1 1228; SKX-NEXT: vpgatherqq (%rdi,%xmm0,8), %xmm1 
{%k1} 1229; SKX-NEXT: vmovaps %zmm1, %zmm0 1230; SKX-NEXT: retq 1231; 1232; SKX_32-LABEL: test26: 1233; SKX_32: # BB#0: 1234; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax 1235; SKX_32-NEXT: kxnorw %k0, %k0, %k1 1236; SKX_32-NEXT: vpgatherqq (%eax,%xmm0,8), %xmm1 {%k1} 1237; SKX_32-NEXT: vmovaps %zmm1, %zmm0 1238; SKX_32-NEXT: retl 1239 %sext_ind = sext <2 x i32> %ind to <2 x i64> 1240 %gep.random = getelementptr i64, i64* %base, <2 x i64> %sext_ind 1241 %res = call <2 x i64> @llvm.masked.gather.v2i64(<2 x i64*> %gep.random, i32 8, <2 x i1> <i1 true, i1 true>, <2 x i64> %src0) 1242 ret <2 x i64>%res 1243} 1244 1245; Result type requires widening; all-ones mask 1246define <2 x float> @test27(float* %base, <2 x i32> %ind) { 1247; 1248; KNL_64-LABEL: test27: 1249; KNL_64: # BB#0: 1250; KNL_64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 1251; KNL_64-NEXT: vpmovsxdq %ymm0, %zmm1 1252; KNL_64-NEXT: movb $3, %al 1253; KNL_64-NEXT: kmovw %eax, %k1 1254; KNL_64-NEXT: vgatherqps (%rdi,%zmm1,4), %ymm0 {%k1} 1255; KNL_64-NEXT: # kill 1256; KNL_64-NEXT: retq 1257; 1258; KNL_32-LABEL: test27: 1259; KNL_32: # BB#0: 1260; KNL_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 1261; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax 1262; KNL_32-NEXT: vpmovsxdq %ymm0, %zmm1 1263; KNL_32-NEXT: movb $3, %cl 1264; KNL_32-NEXT: kmovw %ecx, %k1 1265; KNL_32-NEXT: vgatherqps (%eax,%zmm1,4), %ymm0 {%k1} 1266; KNL_32-NEXT: # kill 1267; KNL_32-NEXT: retl 1268; 1269; SKX-LABEL: test27: 1270; SKX: # BB#0: 1271; SKX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,2,2,3] 1272; SKX-NEXT: movb $3, %al 1273; SKX-NEXT: kmovb %eax, %k1 1274; SKX-NEXT: vgatherdps (%rdi,%xmm1,4), %xmm0 {%k1} 1275; SKX-NEXT: retq 1276 %sext_ind = sext <2 x i32> %ind to <2 x i64> 1277 %gep.random = getelementptr float, float* %base, <2 x i64> %sext_ind 1278 %res = call <2 x float> @llvm.masked.gather.v2f32(<2 x float*> %gep.random, i32 4, <2 x i1> <i1 true, i1 true>, <2 x float> undef) 1279 ret <2 x float>%res 1280} 1281 1282; Data type requires 
promotion, mask is all-ones 1283define void @test28(<2 x i32>%a1, <2 x i32*> %ptr) { 1284; 1285; 1286; KNL_64-LABEL: test28: 1287; KNL_64: # BB#0: 1288; KNL_64: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 1289; KNL_64-NEXT: movb $3, %al 1290; KNL_64-NEXT: kmovw %eax, %k1 1291; KNL_64-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1} 1292; KNL_64-NEXT: retq 1293; 1294; KNL_32-LABEL: test28: 1295; KNL_32: # BB#0: 1296; KNL_32: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 1297; KNL_32-NEXT: vpxord %zmm2, %zmm2, %zmm2 1298; KNL_32-NEXT: vinserti32x4 $0, .LCPI27_0, %zmm2, %zmm2 1299; KNL_32-NEXT: vpsllvq .LCPI27_1, %zmm2, %zmm2 1300; KNL_32-NEXT: vptestmq %zmm2, %zmm2, %k1 1301; KNL_32-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1} 1302; KNL_32-NEXT: retl 1303; 1304; SKX-LABEL: test28: 1305; SKX: # BB#0: 1306; SKX: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 1307; SKX-NEXT: movb $3, %al 1308; SKX-NEXT: kmovb %eax, %k1 1309; SKX-NEXT: vpscatterqd %xmm0, (,%ymm1) {%k1} 1310; SKX-NEXT: retq 1311; 1312; SKX_32-LABEL: test28: 1313; SKX_32: # BB#0: 1314; SKX_32: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 1315; SKX_32-NEXT: movb $3, %al 1316; SKX_32-NEXT: kmovb %eax, %k1 1317; SKX_32-NEXT: vpscatterqd %xmm0, (,%ymm1) {%k1} 1318; SKX_32-NEXT: retl 1319 call void @llvm.masked.scatter.v2i32(<2 x i32> %a1, <2 x i32*> %ptr, i32 4, <2 x i1> <i1 true, i1 true>) 1320 ret void 1321} 1322 1323 1324; SCALAR-LABEL: test29 1325; SCALAR: extractelement <16 x float*> 1326; SCALAR-NEXT: load float 1327; SCALAR-NEXT: insertelement <16 x float> 1328; SCALAR-NEXT: extractelement <16 x float*> 1329; SCALAR-NEXT: load float 1330 1331define <16 x float> @test29(float* %base, <16 x i32> %ind) { 1332; KNL_64-LABEL: test29: 1333; KNL_64: # BB#0: 1334; KNL_64-NEXT: movw $44, %ax 1335; KNL_64-NEXT: kmovw %eax, %k1 1336; KNL_64-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1} 1337; KNL_64-NEXT: vmovaps %zmm1, %zmm0 1338; KNL_64-NEXT: retq 1339; 1340; KNL_32-LABEL: test29: 1341; KNL_32: # BB#0: 1342; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax 1343; 
KNL_32-NEXT: movw $44, %cx 1344; KNL_32-NEXT: kmovw %ecx, %k1 1345; KNL_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1} 1346; KNL_32-NEXT: vmovaps %zmm1, %zmm0 1347; KNL_32-NEXT: retl 1348; 1349; SKX-LABEL: test29: 1350; SKX: # BB#0: 1351; SKX-NEXT: movw $44, %ax 1352; SKX-NEXT: kmovw %eax, %k1 1353; SKX-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1} 1354; SKX-NEXT: vmovaps %zmm1, %zmm0 1355; SKX-NEXT: retq 1356 1357 %broadcast.splatinsert = insertelement <16 x float*> undef, float* %base, i32 0 1358 %broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer 1359 1360 %sext_ind = sext <16 x i32> %ind to <16 x i64> 1361 %gep.random = getelementptr float, <16 x float*> %broadcast.splat, <16 x i64> %sext_ind 1362 1363 %res = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 false, i1 false, i1 true, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false>, <16 x float> undef) 1364 ret <16 x float>%res 1365} 1366 1367; Check non-power-of-2 case. It should be scalarized. 
1368declare <3 x i32> @llvm.masked.gather.v3i32(<3 x i32*>, i32, <3 x i1>, <3 x i32>) 1369define <3 x i32> @test30(<3 x i32*> %base, <3 x i32> %ind, <3 x i1> %mask, <3 x i32> %src0) { 1370; KNL_64-LABEL: test30: 1371; KNL_64: # BB#0: 1372; KNL_64-NEXT: andl $1, %edx 1373; KNL_64-NEXT: andl $1, %esi 1374; KNL_64-NEXT: movl %edi, %eax 1375; KNL_64-NEXT: andl $1, %eax 1376; KNL_64-NEXT: vpmovsxdq %xmm1, %ymm1 1377; KNL_64-NEXT: vpsllq $2, %ymm1, %ymm1 1378; KNL_64-NEXT: vpaddq %ymm1, %ymm0, %ymm1 1379; KNL_64-NEXT: # implicit-def: %XMM0 1380; KNL_64-NEXT: testb $1, %dil 1381; KNL_64-NEXT: je .LBB29_2 1382; KNL_64-NEXT: # BB#1: # %cond.load 1383; KNL_64-NEXT: vmovq %xmm1, %rcx 1384; KNL_64-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero 1385; KNL_64-NEXT: .LBB29_2: # %else 1386; KNL_64-NEXT: testb %sil, %sil 1387; KNL_64-NEXT: je .LBB29_4 1388; KNL_64-NEXT: # BB#3: # %cond.load1 1389; KNL_64-NEXT: vpextrq $1, %xmm1, %rcx 1390; KNL_64-NEXT: vpinsrd $1, (%rcx), %xmm0, %xmm0 1391; KNL_64-NEXT: .LBB29_4: # %else2 1392; KNL_64-NEXT: testb %dl, %dl 1393; KNL_64-NEXT: je .LBB29_6 1394; KNL_64-NEXT: # BB#5: # %cond.load4 1395; KNL_64-NEXT: vextracti128 $1, %ymm1, %xmm1 1396; KNL_64-NEXT: vmovq %xmm1, %rcx 1397; KNL_64-NEXT: vpinsrd $2, (%rcx), %xmm0, %xmm0 1398; KNL_64-NEXT: .LBB29_6: # %else5 1399; KNL_64-NEXT: vmovd %eax, %xmm1 1400; KNL_64-NEXT: vpinsrd $1, %esi, %xmm1, %xmm1 1401; KNL_64-NEXT: vpinsrd $2, %edx, %xmm1, %xmm1 1402; KNL_64-NEXT: vpslld $31, %xmm1, %xmm1 1403; KNL_64-NEXT: vblendvps %xmm1, %xmm0, %xmm2, %xmm0 1404; KNL_64-NEXT: retq 1405; 1406; KNL_32-LABEL: test30: 1407; KNL_32: # BB#0: 1408; KNL_32-NEXT: pushl %ebx 1409; KNL_32-NEXT: .Ltmp0: 1410; KNL_32-NEXT: .cfi_def_cfa_offset 8 1411; KNL_32-NEXT: pushl %esi 1412; KNL_32-NEXT: .Ltmp1: 1413; KNL_32-NEXT: .cfi_def_cfa_offset 12 1414; KNL_32-NEXT: .Ltmp2: 1415; KNL_32-NEXT: .cfi_offset %esi, -12 1416; KNL_32-NEXT: .Ltmp3: 1417; KNL_32-NEXT: .cfi_offset %ebx, -8 1418; KNL_32-NEXT: movl {{[0-9]+}}(%esp), 
%eax 1419; KNL_32-NEXT: andl $1, %eax 1420; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %ecx 1421; KNL_32-NEXT: andl $1, %ecx 1422; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %ebx 1423; KNL_32-NEXT: movl %ebx, %edx 1424; KNL_32-NEXT: andl $1, %edx 1425; KNL_32-NEXT: vpslld $2, %xmm1, %xmm1 1426; KNL_32-NEXT: vpaddd %xmm1, %xmm0, %xmm1 1427; KNL_32-NEXT: # implicit-def: %XMM0 1428; KNL_32-NEXT: testb $1, %bl 1429; KNL_32-NEXT: je .LBB29_2 1430; KNL_32-NEXT: # BB#1: # %cond.load 1431; KNL_32-NEXT: vmovd %xmm1, %esi 1432; KNL_32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero 1433; KNL_32-NEXT: .LBB29_2: # %else 1434; KNL_32-NEXT: testb %cl, %cl 1435; KNL_32-NEXT: je .LBB29_4 1436; KNL_32-NEXT: # BB#3: # %cond.load1 1437; KNL_32-NEXT: vpextrd $1, %xmm1, %esi 1438; KNL_32-NEXT: vpinsrd $1, (%esi), %xmm0, %xmm0 1439; KNL_32-NEXT: .LBB29_4: # %else2 1440; KNL_32-NEXT: testb %al, %al 1441; KNL_32-NEXT: je .LBB29_6 1442; KNL_32-NEXT: # BB#5: # %cond.load4 1443; KNL_32-NEXT: vpextrd $2, %xmm1, %esi 1444; KNL_32-NEXT: vpinsrd $2, (%esi), %xmm0, %xmm0 1445; KNL_32-NEXT: .LBB29_6: # %else5 1446; KNL_32-NEXT: vmovd %edx, %xmm1 1447; KNL_32-NEXT: vpinsrd $1, %ecx, %xmm1, %xmm1 1448; KNL_32-NEXT: vpinsrd $2, %eax, %xmm1, %xmm1 1449; KNL_32-NEXT: vpslld $31, %xmm1, %xmm1 1450; KNL_32-NEXT: vblendvps %xmm1, %xmm0, %xmm2, %xmm0 1451; KNL_32-NEXT: popl %esi 1452; KNL_32-NEXT: popl %ebx 1453; KNL_32-NEXT: retl 1454; 1455; SKX-LABEL: test30: 1456; SKX: # BB#0: 1457; SKX-NEXT: vpslld $31, %xmm2, %xmm2 1458; SKX-NEXT: vptestmd %xmm2, %xmm2, %k1 1459; SKX-NEXT: kmovb %k1, -{{[0-9]+}}(%rsp) 1460; SKX-NEXT: vpmovsxdq %xmm1, %ymm1 1461; SKX-NEXT: vpsllq $2, %ymm1, %ymm1 1462; SKX-NEXT: vpaddq %ymm1, %ymm0, %ymm1 1463; SKX-NEXT: movb -{{[0-9]+}}(%rsp), %al 1464; SKX-NEXT: # implicit-def: %XMM0 1465; SKX-NEXT: testb %al, %al 1466; SKX-NEXT: je .LBB29_2 1467; SKX-NEXT: # BB#1: # %cond.load 1468; SKX-NEXT: vmovq %xmm1, %rax 1469; SKX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero 1470; SKX-NEXT: .LBB29_2: 
# %else 1471; SKX-NEXT: kmovb %k1, -{{[0-9]+}}(%rsp) 1472; SKX-NEXT: movb -{{[0-9]+}}(%rsp), %al 1473; SKX-NEXT: testb %al, %al 1474; SKX-NEXT: je .LBB29_4 1475; SKX-NEXT: # BB#3: # %cond.load1 1476; SKX-NEXT: vpextrq $1, %xmm1, %rax 1477; SKX-NEXT: vpinsrd $1, (%rax), %xmm0, %xmm0 1478; SKX-NEXT: .LBB29_4: # %else2 1479; SKX-NEXT: kmovb %k1, -{{[0-9]+}}(%rsp) 1480; SKX-NEXT: movb -{{[0-9]+}}(%rsp), %al 1481; SKX-NEXT: testb %al, %al 1482; SKX-NEXT: je .LBB29_6 1483; SKX-NEXT: # BB#5: # %cond.load4 1484; SKX-NEXT: vextracti64x2 $1, %ymm1, %xmm1 1485; SKX-NEXT: vmovq %xmm1, %rax 1486; SKX-NEXT: vpinsrd $2, (%rax), %xmm0, %xmm0 1487; SKX-NEXT: .LBB29_6: # %else5 1488; SKX-NEXT: vpblendmd %xmm0, %xmm3, %xmm0 {%k1} 1489; SKX-NEXT: retq 1490; 1491; SKX_32-LABEL: test30: 1492; SKX_32: # BB#0: 1493; SKX_32-NEXT: subl $12, %esp 1494; SKX_32-NEXT: .Ltmp0: 1495; SKX_32-NEXT: .cfi_def_cfa_offset 16 1496; SKX_32-NEXT: vpslld $31, %xmm2, %xmm2 1497; SKX_32-NEXT: vptestmd %xmm2, %xmm2, %k1 1498; SKX_32-NEXT: kmovb %k1, {{[0-9]+}}(%esp) 1499; SKX_32-NEXT: vpslld $2, %xmm1, %xmm1 1500; SKX_32-NEXT: vpaddd %xmm1, %xmm0, %xmm1 1501; SKX_32-NEXT: movb {{[0-9]+}}(%esp), %al 1502; SKX_32-NEXT: # implicit-def: %XMM0 1503; SKX_32-NEXT: testb %al, %al 1504; SKX_32-NEXT: je .LBB29_2 1505; SKX_32-NEXT: # BB#1: # %cond.load 1506; SKX_32-NEXT: vmovd %xmm1, %eax 1507; SKX_32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero 1508; SKX_32-NEXT: .LBB29_2: # %else 1509; SKX_32-NEXT: kmovb %k1, {{[0-9]+}}(%esp) 1510; SKX_32-NEXT: movb {{[0-9]+}}(%esp), %al 1511; SKX_32-NEXT: testb %al, %al 1512; SKX_32-NEXT: je .LBB29_4 1513; SKX_32-NEXT: # BB#3: # %cond.load1 1514; SKX_32-NEXT: vpextrd $1, %xmm1, %eax 1515; SKX_32-NEXT: vpinsrd $1, (%eax), %xmm0, %xmm0 1516; SKX_32-NEXT: .LBB29_4: # %else2 1517; SKX_32-NEXT: vmovdqa32 {{[0-9]+}}(%esp), %xmm2 1518; SKX_32-NEXT: kmovb %k1, (%esp) 1519; SKX_32-NEXT: movb (%esp), %al 1520; SKX_32-NEXT: testb %al, %al 1521; SKX_32-NEXT: je .LBB29_6 1522; SKX_32-NEXT: # 
BB#5: # %cond.load4 1523; SKX_32-NEXT: vpextrd $2, %xmm1, %eax 1524; SKX_32-NEXT: vpinsrd $2, (%eax), %xmm0, %xmm0 1525; SKX_32-NEXT: .LBB29_6: # %else5 1526; SKX_32-NEXT: vpblendmd %xmm0, %xmm2, %xmm0 {%k1} 1527; SKX_32-NEXT: addl $12, %esp 1528; SKX_32-NEXT: retl 1529 1530 %sext_ind = sext <3 x i32> %ind to <3 x i64> 1531 %gep.random = getelementptr i32, <3 x i32*> %base, <3 x i64> %sext_ind 1532 %res = call <3 x i32> @llvm.masked.gather.v3i32(<3 x i32*> %gep.random, i32 4, <3 x i1> %mask, <3 x i32> %src0) 1533 ret <3 x i32>%res 1534} 1535 1536declare <16 x float*> @llvm.masked.gather.v16p0f32(<16 x float**>, i32, <16 x i1>, <16 x float*>) 1537 1538; KNL-LABEL: test31 1539; KNL: vpgatherqq 1540; KNL: vpgatherqq 1541define <16 x float*> @test31(<16 x float**> %ptrs) { 1542; KNL_64-LABEL: test31: 1543; KNL_64: # BB#0: 1544; KNL_64-NEXT: kxnorw %k0, %k0, %k1 1545; KNL_64-NEXT: kxnorw %k0, %k0, %k2 1546; KNL_64-NEXT: vpgatherqq (,%zmm0), %zmm2 {%k2} 1547; KNL_64-NEXT: kshiftrw $8, %k1, %k1 1548; KNL_64-NEXT: vpgatherqq (,%zmm1), %zmm3 {%k1} 1549; KNL_64-NEXT: vmovaps %zmm2, %zmm0 1550; KNL_64-NEXT: vmovaps %zmm3, %zmm1 1551; KNL_64-NEXT: retq 1552; 1553; KNL_32-LABEL: test31: 1554; KNL_32: # BB#0: 1555; KNL_32-NEXT: kxnorw %k0, %k0, %k1 1556; KNL_32-NEXT: vpgatherdd (,%zmm0), %zmm1 {%k1} 1557; KNL_32-NEXT: vmovaps %zmm1, %zmm0 1558; KNL_32-NEXT: retl 1559; 1560; SKX-LABEL: test31: 1561; SKX: # BB#0: 1562; SKX-NEXT: kxnorw %k0, %k0, %k1 1563; SKX-NEXT: kxnorw %k0, %k0, %k2 1564; SKX-NEXT: vpgatherqq (,%zmm0), %zmm2 {%k2} 1565; SKX-NEXT: kshiftrw $8, %k1, %k1 1566; SKX-NEXT: vpgatherqq (,%zmm1), %zmm3 {%k1} 1567; SKX-NEXT: vmovaps %zmm2, %zmm0 1568; SKX-NEXT: vmovaps %zmm3, %zmm1 1569; SKX-NEXT: retq 1570; 1571; SKX_32-LABEL: test31: 1572; SKX_32: # BB#0: 1573; SKX_32-NEXT: kxnorw %k0, %k0, %k1 1574; SKX_32-NEXT: vpgatherdd (,%zmm0), %zmm1 {%k1} 1575; SKX_32-NEXT: vmovaps %zmm1, %zmm0 1576; SKX_32-NEXT: retl 1577 1578 %res = call <16 x float*> 
@llvm.masked.gather.v16p0f32(<16 x float**> %ptrs, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float*> undef) 1579 ret <16 x float*>%res 1580} 1581 1582define <16 x i32> @test_gather_16i32(<16 x i32*> %ptrs, <16 x i1> %mask, <16 x i32> %src0) { 1583; KNL_64-LABEL: test_gather_16i32: 1584; KNL_64: # BB#0: 1585; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2 1586; KNL_64-NEXT: vpslld $31, %zmm2, %zmm2 1587; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1 1588; KNL_64-NEXT: vextracti64x4 $1, %zmm3, %ymm2 1589; KNL_64-NEXT: kshiftrw $8, %k1, %k2 1590; KNL_64-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k2} 1591; KNL_64-NEXT: vpgatherqd (,%zmm0), %ymm3 {%k1} 1592; KNL_64-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm0 1593; KNL_64-NEXT: retq 1594; 1595; KNL_32-LABEL: test_gather_16i32: 1596; KNL_32: # BB#0: 1597; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1 1598; KNL_32-NEXT: vpslld $31, %zmm1, %zmm1 1599; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1 1600; KNL_32-NEXT: vpgatherdd (,%zmm0), %zmm2 {%k1} 1601; KNL_32-NEXT: vmovaps %zmm2, %zmm0 1602; KNL_32-NEXT: retl 1603; 1604; SKX-LABEL: test_gather_16i32: 1605; SKX: # BB#0: 1606; SKX-NEXT: vpmovsxbd %xmm2, %zmm2 1607; SKX-NEXT: vpslld $31, %zmm2, %zmm2 1608; SKX-NEXT: vptestmd %zmm2, %zmm2, %k1 1609; SKX-NEXT: vextracti32x8 $1, %zmm3, %ymm2 1610; SKX-NEXT: kshiftrw $8, %k1, %k2 1611; SKX-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k2} 1612; SKX-NEXT: vpgatherqd (,%zmm0), %ymm3 {%k1} 1613; SKX-NEXT: vinserti32x8 $1, %ymm2, %zmm3, %zmm0 1614; SKX-NEXT: retq 1615; 1616; SKX_32-LABEL: test_gather_16i32: 1617; SKX_32: # BB#0: 1618; SKX_32-NEXT: vpmovsxbd %xmm1, %zmm1 1619; SKX_32-NEXT: vpslld $31, %zmm1, %zmm1 1620; SKX_32-NEXT: vptestmd %zmm1, %zmm1, %k1 1621; SKX_32-NEXT: vpgatherdd (,%zmm0), %zmm2 {%k1} 1622; SKX_32-NEXT: vmovaps %zmm2, %zmm0 1623; SKX_32-NEXT: retl 1624 %res = call <16 x i32> @llvm.masked.gather.v16i32(<16 x i32*> %ptrs, i32 4, <16 x i1> 
%mask, <16 x i32> %src0) 1625 ret <16 x i32> %res 1626} 1627define <16 x i64> @test_gather_16i64(<16 x i64*> %ptrs, <16 x i1> %mask, <16 x i64> %src0) { 1628; KNL_64-LABEL: test_gather_16i64: 1629; KNL_64: # BB#0: 1630; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2 1631; KNL_64-NEXT: vpslld $31, %zmm2, %zmm2 1632; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1 1633; KNL_64-NEXT: kshiftrw $8, %k1, %k2 1634; KNL_64-NEXT: vpgatherqq (,%zmm0), %zmm3 {%k1} 1635; KNL_64-NEXT: vpgatherqq (,%zmm1), %zmm4 {%k2} 1636; KNL_64-NEXT: vmovaps %zmm3, %zmm0 1637; KNL_64-NEXT: vmovaps %zmm4, %zmm1 1638; KNL_64-NEXT: retq 1639; 1640; KNL_32-LABEL: test_gather_16i64: 1641; KNL_32: # BB#0: 1642; KNL_32-NEXT: pushl %ebp 1643; KNL_32-NEXT: .Ltmp4: 1644; KNL_32-NEXT: .cfi_def_cfa_offset 8 1645; KNL_32-NEXT: .Ltmp5: 1646; KNL_32-NEXT: .cfi_offset %ebp, -8 1647; KNL_32-NEXT: movl %esp, %ebp 1648; KNL_32-NEXT: .Ltmp6: 1649; KNL_32-NEXT: .cfi_def_cfa_register %ebp 1650; KNL_32-NEXT: andl $-64, %esp 1651; KNL_32-NEXT: subl $64, %esp 1652; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1 1653; KNL_32-NEXT: vpslld $31, %zmm1, %zmm1 1654; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1 1655; KNL_32-NEXT: vmovdqa64 8(%ebp), %zmm1 1656; KNL_32-NEXT: kshiftrw $8, %k1, %k2 1657; KNL_32-NEXT: vpgatherdq (,%ymm0), %zmm2 {%k1} 1658; KNL_32-NEXT: vextracti64x4 $1, %zmm0, %ymm0 1659; KNL_32-NEXT: vpgatherdq (,%ymm0), %zmm1 {%k2} 1660; KNL_32-NEXT: vmovaps %zmm2, %zmm0 1661; KNL_32-NEXT: movl %ebp, %esp 1662; KNL_32-NEXT: popl %ebp 1663; KNL_32-NEXT: retl 1664; 1665; SKX-LABEL: test_gather_16i64: 1666; SKX: # BB#0: 1667; SKX-NEXT: vpmovsxbd %xmm2, %zmm2 1668; SKX-NEXT: vpslld $31, %zmm2, %zmm2 1669; SKX-NEXT: vptestmd %zmm2, %zmm2, %k1 1670; SKX-NEXT: kshiftrw $8, %k1, %k2 1671; SKX-NEXT: vpgatherqq (,%zmm0), %zmm3 {%k1} 1672; SKX-NEXT: vpgatherqq (,%zmm1), %zmm4 {%k2} 1673; SKX-NEXT: vmovaps %zmm3, %zmm0 1674; SKX-NEXT: vmovaps %zmm4, %zmm1 1675; SKX-NEXT: retq 1676; 1677; SKX_32-LABEL: test_gather_16i64: 1678; SKX_32: # BB#0: 1679; 
SKX_32-NEXT: pushl %ebp 1680; SKX_32-NEXT: .Ltmp1: 1681; SKX_32-NEXT: .cfi_def_cfa_offset 8 1682; SKX_32-NEXT: .Ltmp2: 1683; SKX_32-NEXT: .cfi_offset %ebp, -8 1684; SKX_32-NEXT: movl %esp, %ebp 1685; SKX_32-NEXT: .Ltmp3: 1686; SKX_32-NEXT: .cfi_def_cfa_register %ebp 1687; SKX_32-NEXT: andl $-64, %esp 1688; SKX_32-NEXT: subl $64, %esp 1689; SKX_32-NEXT: vpmovsxbd %xmm1, %zmm1 1690; SKX_32-NEXT: vpslld $31, %zmm1, %zmm1 1691; SKX_32-NEXT: vptestmd %zmm1, %zmm1, %k1 1692; SKX_32-NEXT: vmovdqa64 8(%ebp), %zmm1 1693; SKX_32-NEXT: kshiftrw $8, %k1, %k2 1694; SKX_32-NEXT: vpgatherdq (,%ymm0), %zmm2 {%k1} 1695; SKX_32-NEXT: vextracti32x8 $1, %zmm0, %ymm0 1696; SKX_32-NEXT: vpgatherdq (,%ymm0), %zmm1 {%k2} 1697; SKX_32-NEXT: vmovaps %zmm2, %zmm0 1698; SKX_32-NEXT: movl %ebp, %esp 1699; SKX_32-NEXT: popl %ebp 1700; SKX_32-NEXT: retl 1701 %res = call <16 x i64> @llvm.masked.gather.v16i64(<16 x i64*> %ptrs, i32 4, <16 x i1> %mask, <16 x i64> %src0) 1702 ret <16 x i64> %res 1703} 1704declare <16 x i64> @llvm.masked.gather.v16i64(<16 x i64*> %ptrs, i32, <16 x i1> %mask, <16 x i64> %src0) 1705define <16 x float> @test_gather_16f32(<16 x float*> %ptrs, <16 x i1> %mask, <16 x float> %src0) { 1706; KNL_64-LABEL: test_gather_16f32: 1707; KNL_64: # BB#0: 1708; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2 1709; KNL_64-NEXT: vpslld $31, %zmm2, %zmm2 1710; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1 1711; KNL_64-NEXT: vextractf64x4 $1, %zmm3, %ymm2 1712; KNL_64-NEXT: kshiftrw $8, %k1, %k2 1713; KNL_64-NEXT: vgatherqps (,%zmm1), %ymm2 {%k2} 1714; KNL_64-NEXT: vgatherqps (,%zmm0), %ymm3 {%k1} 1715; KNL_64-NEXT: vinsertf64x4 $1, %ymm2, %zmm3, %zmm0 1716; KNL_64-NEXT: retq 1717; 1718; KNL_32-LABEL: test_gather_16f32: 1719; KNL_32: # BB#0: 1720; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1 1721; KNL_32-NEXT: vpslld $31, %zmm1, %zmm1 1722; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1 1723; KNL_32-NEXT: vgatherdps (,%zmm0), %zmm2 {%k1} 1724; KNL_32-NEXT: vmovaps %zmm2, %zmm0 1725; KNL_32-NEXT: retl 1726; 1727; SKX-LABEL: 
test_gather_16f32: 1728; SKX: # BB#0: 1729; SKX-NEXT: vpmovsxbd %xmm2, %zmm2 1730; SKX-NEXT: vpslld $31, %zmm2, %zmm2 1731; SKX-NEXT: vptestmd %zmm2, %zmm2, %k1 1732; SKX-NEXT: vextractf32x8 $1, %zmm3, %ymm2 1733; SKX-NEXT: kshiftrw $8, %k1, %k2 1734; SKX-NEXT: vgatherqps (,%zmm1), %ymm2 {%k2} 1735; SKX-NEXT: vgatherqps (,%zmm0), %ymm3 {%k1} 1736; SKX-NEXT: vinsertf32x8 $1, %ymm2, %zmm3, %zmm0 1737; SKX-NEXT: retq 1738; 1739; SKX_32-LABEL: test_gather_16f32: 1740; SKX_32: # BB#0: 1741; SKX_32-NEXT: vpmovsxbd %xmm1, %zmm1 1742; SKX_32-NEXT: vpslld $31, %zmm1, %zmm1 1743; SKX_32-NEXT: vptestmd %zmm1, %zmm1, %k1 1744; SKX_32-NEXT: vgatherdps (,%zmm0), %zmm2 {%k1} 1745; SKX_32-NEXT: vmovaps %zmm2, %zmm0 1746; SKX_32-NEXT: retl 1747 %res = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %ptrs, i32 4, <16 x i1> %mask, <16 x float> %src0) 1748 ret <16 x float> %res 1749} 1750define <16 x double> @test_gather_16f64(<16 x double*> %ptrs, <16 x i1> %mask, <16 x double> %src0) { 1751; KNL_64-LABEL: test_gather_16f64: 1752; KNL_64: # BB#0: 1753; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2 1754; KNL_64-NEXT: vpslld $31, %zmm2, %zmm2 1755; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1 1756; KNL_64-NEXT: kshiftrw $8, %k1, %k2 1757; KNL_64-NEXT: vgatherqpd (,%zmm0), %zmm3 {%k1} 1758; KNL_64-NEXT: vgatherqpd (,%zmm1), %zmm4 {%k2} 1759; KNL_64-NEXT: vmovaps %zmm3, %zmm0 1760; KNL_64-NEXT: vmovaps %zmm4, %zmm1 1761; KNL_64-NEXT: retq 1762; 1763; KNL_32-LABEL: test_gather_16f64: 1764; KNL_32: # BB#0: 1765; KNL_32-NEXT: pushl %ebp 1766; KNL_32-NEXT: .Ltmp7: 1767; KNL_32-NEXT: .cfi_def_cfa_offset 8 1768; KNL_32-NEXT: .Ltmp8: 1769; KNL_32-NEXT: .cfi_offset %ebp, -8 1770; KNL_32-NEXT: movl %esp, %ebp 1771; KNL_32-NEXT: .Ltmp9: 1772; KNL_32-NEXT: .cfi_def_cfa_register %ebp 1773; KNL_32-NEXT: andl $-64, %esp 1774; KNL_32-NEXT: subl $64, %esp 1775; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1 1776; KNL_32-NEXT: vpslld $31, %zmm1, %zmm1 1777; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1 1778; KNL_32-NEXT: 
vmovapd 8(%ebp), %zmm1 1779; KNL_32-NEXT: kshiftrw $8, %k1, %k2 1780; KNL_32-NEXT: vgatherdpd (,%ymm0), %zmm2 {%k1} 1781; KNL_32-NEXT: vextracti64x4 $1, %zmm0, %ymm0 1782; KNL_32-NEXT: vgatherdpd (,%ymm0), %zmm1 {%k2} 1783; KNL_32-NEXT: vmovaps %zmm2, %zmm0 1784; KNL_32-NEXT: movl %ebp, %esp 1785; KNL_32-NEXT: popl %ebp 1786; KNL_32-NEXT: retl 1787; 1788; SKX-LABEL: test_gather_16f64: 1789; SKX: # BB#0: 1790; SKX-NEXT: vpmovsxbd %xmm2, %zmm2 1791; SKX-NEXT: vpslld $31, %zmm2, %zmm2 1792; SKX-NEXT: vptestmd %zmm2, %zmm2, %k1 1793; SKX-NEXT: kshiftrw $8, %k1, %k2 1794; SKX-NEXT: vgatherqpd (,%zmm0), %zmm3 {%k1} 1795; SKX-NEXT: vgatherqpd (,%zmm1), %zmm4 {%k2} 1796; SKX-NEXT: vmovaps %zmm3, %zmm0 1797; SKX-NEXT: vmovaps %zmm4, %zmm1 1798; SKX-NEXT: retq 1799; 1800; SKX_32-LABEL: test_gather_16f64: 1801; SKX_32: # BB#0: 1802; SKX_32-NEXT: pushl %ebp 1803; SKX_32-NEXT: .Ltmp4: 1804; SKX_32-NEXT: .cfi_def_cfa_offset 8 1805; SKX_32-NEXT: .Ltmp5: 1806; SKX_32-NEXT: .cfi_offset %ebp, -8 1807; SKX_32-NEXT: movl %esp, %ebp 1808; SKX_32-NEXT: .Ltmp6: 1809; SKX_32-NEXT: .cfi_def_cfa_register %ebp 1810; SKX_32-NEXT: andl $-64, %esp 1811; SKX_32-NEXT: subl $64, %esp 1812; SKX_32-NEXT: vpmovsxbd %xmm1, %zmm1 1813; SKX_32-NEXT: vpslld $31, %zmm1, %zmm1 1814; SKX_32-NEXT: vptestmd %zmm1, %zmm1, %k1 1815; SKX_32-NEXT: vmovapd 8(%ebp), %zmm1 1816; SKX_32-NEXT: kshiftrw $8, %k1, %k2 1817; SKX_32-NEXT: vgatherdpd (,%ymm0), %zmm2 {%k1} 1818; SKX_32-NEXT: vextracti32x8 $1, %zmm0, %ymm0 1819; SKX_32-NEXT: vgatherdpd (,%ymm0), %zmm1 {%k2} 1820; SKX_32-NEXT: vmovaps %zmm2, %zmm0 1821; SKX_32-NEXT: movl %ebp, %esp 1822; SKX_32-NEXT: popl %ebp 1823; SKX_32-NEXT: retl 1824 %res = call <16 x double> @llvm.masked.gather.v16f64(<16 x double*> %ptrs, i32 4, <16 x i1> %mask, <16 x double> %src0) 1825 ret <16 x double> %res 1826} 1827declare <16 x double> @llvm.masked.gather.v16f64(<16 x double*> %ptrs, i32, <16 x i1> %mask, <16 x double> %src0) 1828define void @test_scatter_16i32(<16 x i32*> %ptrs, 
<16 x i1> %mask, <16 x i32> %src0) { 1829; KNL_64-LABEL: test_scatter_16i32: 1830; KNL_64: # BB#0: 1831; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2 1832; KNL_64-NEXT: vpslld $31, %zmm2, %zmm2 1833; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1 1834; KNL_64-NEXT: kshiftrw $8, %k1, %k2 1835; KNL_64-NEXT: vpscatterqd %ymm3, (,%zmm0) {%k1} 1836; KNL_64-NEXT: vextracti64x4 $1, %zmm3, %ymm0 1837; KNL_64-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k2} 1838; KNL_64-NEXT: retq 1839; 1840; KNL_32-LABEL: test_scatter_16i32: 1841; KNL_32: # BB#0: 1842; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1 1843; KNL_32-NEXT: vpslld $31, %zmm1, %zmm1 1844; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1 1845; KNL_32-NEXT: vpscatterdd %zmm2, (,%zmm0) {%k1} 1846; KNL_32-NEXT: retl 1847; 1848; SKX-LABEL: test_scatter_16i32: 1849; SKX: # BB#0: 1850; SKX-NEXT: vpmovsxbd %xmm2, %zmm2 1851; SKX-NEXT: vpslld $31, %zmm2, %zmm2 1852; SKX-NEXT: vptestmd %zmm2, %zmm2, %k1 1853; SKX-NEXT: kshiftrw $8, %k1, %k2 1854; SKX-NEXT: vpscatterqd %ymm3, (,%zmm0) {%k1} 1855; SKX-NEXT: vextracti32x8 $1, %zmm3, %ymm0 1856; SKX-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k2} 1857; SKX-NEXT: retq 1858; 1859; SKX_32-LABEL: test_scatter_16i32: 1860; SKX_32: # BB#0: 1861; SKX_32-NEXT: vpmovsxbd %xmm1, %zmm1 1862; SKX_32-NEXT: vpslld $31, %zmm1, %zmm1 1863; SKX_32-NEXT: vptestmd %zmm1, %zmm1, %k1 1864; SKX_32-NEXT: vpscatterdd %zmm2, (,%zmm0) {%k1} 1865; SKX_32-NEXT: retl 1866 call void @llvm.masked.scatter.v16i32(<16 x i32> %src0, <16 x i32*> %ptrs, i32 4, <16 x i1> %mask) 1867 ret void 1868} 1869define void @test_scatter_16i64(<16 x i64*> %ptrs, <16 x i1> %mask, <16 x i64> %src0) { 1870; KNL_64-LABEL: test_scatter_16i64: 1871; KNL_64: # BB#0: 1872; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2 1873; KNL_64-NEXT: vpslld $31, %zmm2, %zmm2 1874; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1 1875; KNL_64-NEXT: kshiftrw $8, %k1, %k2 1876; KNL_64-NEXT: vpscatterqq %zmm3, (,%zmm0) {%k1} 1877; KNL_64-NEXT: vpscatterqq %zmm4, (,%zmm1) {%k2} 1878; KNL_64-NEXT: retq 1879; 1880; 
KNL_32-LABEL: test_scatter_16i64: 1881; KNL_32: # BB#0: 1882; KNL_32-NEXT: pushl %ebp 1883; KNL_32-NEXT: .Ltmp10: 1884; KNL_32-NEXT: .cfi_def_cfa_offset 8 1885; KNL_32-NEXT: .Ltmp11: 1886; KNL_32-NEXT: .cfi_offset %ebp, -8 1887; KNL_32-NEXT: movl %esp, %ebp 1888; KNL_32-NEXT: .Ltmp12: 1889; KNL_32-NEXT: .cfi_def_cfa_register %ebp 1890; KNL_32-NEXT: andl $-64, %esp 1891; KNL_32-NEXT: subl $64, %esp 1892; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1 1893; KNL_32-NEXT: vpslld $31, %zmm1, %zmm1 1894; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1 1895; KNL_32-NEXT: vmovdqa64 8(%ebp), %zmm1 1896; KNL_32-NEXT: kshiftrw $8, %k1, %k2 1897; KNL_32-NEXT: vpscatterdq %zmm2, (,%ymm0) {%k1} 1898; KNL_32-NEXT: vextracti64x4 $1, %zmm0, %ymm0 1899; KNL_32-NEXT: vpscatterdq %zmm1, (,%ymm0) {%k2} 1900; KNL_32-NEXT: movl %ebp, %esp 1901; KNL_32-NEXT: popl %ebp 1902; KNL_32-NEXT: retl 1903; 1904; SKX-LABEL: test_scatter_16i64: 1905; SKX: # BB#0: 1906; SKX-NEXT: vpmovsxbd %xmm2, %zmm2 1907; SKX-NEXT: vpslld $31, %zmm2, %zmm2 1908; SKX-NEXT: vptestmd %zmm2, %zmm2, %k1 1909; SKX-NEXT: kshiftrw $8, %k1, %k2 1910; SKX-NEXT: vpscatterqq %zmm3, (,%zmm0) {%k1} 1911; SKX-NEXT: vpscatterqq %zmm4, (,%zmm1) {%k2} 1912; SKX-NEXT: retq 1913; 1914; SKX_32-LABEL: test_scatter_16i64: 1915; SKX_32: # BB#0: 1916; SKX_32-NEXT: pushl %ebp 1917; SKX_32-NEXT: .Ltmp7: 1918; SKX_32-NEXT: .cfi_def_cfa_offset 8 1919; SKX_32-NEXT: .Ltmp8: 1920; SKX_32-NEXT: .cfi_offset %ebp, -8 1921; SKX_32-NEXT: movl %esp, %ebp 1922; SKX_32-NEXT: .Ltmp9: 1923; SKX_32-NEXT: .cfi_def_cfa_register %ebp 1924; SKX_32-NEXT: andl $-64, %esp 1925; SKX_32-NEXT: subl $64, %esp 1926; SKX_32-NEXT: vpmovsxbd %xmm1, %zmm1 1927; SKX_32-NEXT: vpslld $31, %zmm1, %zmm1 1928; SKX_32-NEXT: vptestmd %zmm1, %zmm1, %k1 1929; SKX_32-NEXT: vmovdqa64 8(%ebp), %zmm1 1930; SKX_32-NEXT: kshiftrw $8, %k1, %k2 1931; SKX_32-NEXT: vpscatterdq %zmm2, (,%ymm0) {%k1} 1932; SKX_32-NEXT: vextracti32x8 $1, %zmm0, %ymm0 1933; SKX_32-NEXT: vpscatterdq %zmm1, (,%ymm0) {%k2} 1934; 
SKX_32-NEXT: movl %ebp, %esp 1935; SKX_32-NEXT: popl %ebp 1936; SKX_32-NEXT: retl 1937 call void @llvm.masked.scatter.v16i64(<16 x i64> %src0, <16 x i64*> %ptrs, i32 4, <16 x i1> %mask) 1938 ret void 1939} 1940declare void @llvm.masked.scatter.v16i64(<16 x i64> %src0, <16 x i64*> %ptrs, i32, <16 x i1> %mask) 1941define void @test_scatter_16f32(<16 x float*> %ptrs, <16 x i1> %mask, <16 x float> %src0) { 1942; KNL_64-LABEL: test_scatter_16f32: 1943; KNL_64: # BB#0: 1944; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2 1945; KNL_64-NEXT: vpslld $31, %zmm2, %zmm2 1946; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1 1947; KNL_64-NEXT: kshiftrw $8, %k1, %k2 1948; KNL_64-NEXT: vscatterqps %ymm3, (,%zmm0) {%k1} 1949; KNL_64-NEXT: vextractf64x4 $1, %zmm3, %ymm0 1950; KNL_64-NEXT: vscatterqps %ymm0, (,%zmm1) {%k2} 1951; KNL_64-NEXT: retq 1952; 1953; KNL_32-LABEL: test_scatter_16f32: 1954; KNL_32: # BB#0: 1955; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1 1956; KNL_32-NEXT: vpslld $31, %zmm1, %zmm1 1957; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1 1958; KNL_32-NEXT: vscatterdps %zmm2, (,%zmm0) {%k1} 1959; KNL_32-NEXT: retl 1960; 1961; SKX-LABEL: test_scatter_16f32: 1962; SKX: # BB#0: 1963; SKX-NEXT: vpmovsxbd %xmm2, %zmm2 1964; SKX-NEXT: vpslld $31, %zmm2, %zmm2 1965; SKX-NEXT: vptestmd %zmm2, %zmm2, %k1 1966; SKX-NEXT: kshiftrw $8, %k1, %k2 1967; SKX-NEXT: vscatterqps %ymm3, (,%zmm0) {%k1} 1968; SKX-NEXT: vextractf32x8 $1, %zmm3, %ymm0 1969; SKX-NEXT: vscatterqps %ymm0, (,%zmm1) {%k2} 1970; SKX-NEXT: retq 1971; 1972; SKX_32-LABEL: test_scatter_16f32: 1973; SKX_32: # BB#0: 1974; SKX_32-NEXT: vpmovsxbd %xmm1, %zmm1 1975; SKX_32-NEXT: vpslld $31, %zmm1, %zmm1 1976; SKX_32-NEXT: vptestmd %zmm1, %zmm1, %k1 1977; SKX_32-NEXT: vscatterdps %zmm2, (,%zmm0) {%k1} 1978; SKX_32-NEXT: retl 1979 call void @llvm.masked.scatter.v16f32(<16 x float> %src0, <16 x float*> %ptrs, i32 4, <16 x i1> %mask) 1980 ret void 1981} 1982declare void @llvm.masked.scatter.v16f32(<16 x float> %src0, <16 x float*> %ptrs, i32, <16 x i1> 
%mask) 1983define void @test_scatter_16f64(<16 x double*> %ptrs, <16 x i1> %mask, <16 x double> %src0) { 1984; KNL_64-LABEL: test_scatter_16f64: 1985; KNL_64: # BB#0: 1986; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2 1987; KNL_64-NEXT: vpslld $31, %zmm2, %zmm2 1988; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1 1989; KNL_64-NEXT: kshiftrw $8, %k1, %k2 1990; KNL_64-NEXT: vscatterqpd %zmm3, (,%zmm0) {%k1} 1991; KNL_64-NEXT: vscatterqpd %zmm4, (,%zmm1) {%k2} 1992; KNL_64-NEXT: retq 1993; 1994; KNL_32-LABEL: test_scatter_16f64: 1995; KNL_32: # BB#0: 1996; KNL_32-NEXT: pushl %ebp 1997; KNL_32-NEXT: .Ltmp13: 1998; KNL_32-NEXT: .cfi_def_cfa_offset 8 1999; KNL_32-NEXT: .Ltmp14: 2000; KNL_32-NEXT: .cfi_offset %ebp, -8 2001; KNL_32-NEXT: movl %esp, %ebp 2002; KNL_32-NEXT: .Ltmp15: 2003; KNL_32-NEXT: .cfi_def_cfa_register %ebp 2004; KNL_32-NEXT: andl $-64, %esp 2005; KNL_32-NEXT: subl $64, %esp 2006; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1 2007; KNL_32-NEXT: vpslld $31, %zmm1, %zmm1 2008; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1 2009; KNL_32-NEXT: vmovapd 8(%ebp), %zmm1 2010; KNL_32-NEXT: kshiftrw $8, %k1, %k2 2011; KNL_32-NEXT: vscatterdpd %zmm2, (,%ymm0) {%k1} 2012; KNL_32-NEXT: vextracti64x4 $1, %zmm0, %ymm0 2013; KNL_32-NEXT: vscatterdpd %zmm1, (,%ymm0) {%k2} 2014; KNL_32-NEXT: movl %ebp, %esp 2015; KNL_32-NEXT: popl %ebp 2016; KNL_32-NEXT: retl 2017; 2018; SKX-LABEL: test_scatter_16f64: 2019; SKX: # BB#0: 2020; SKX-NEXT: vpmovsxbd %xmm2, %zmm2 2021; SKX-NEXT: vpslld $31, %zmm2, %zmm2 2022; SKX-NEXT: vptestmd %zmm2, %zmm2, %k1 2023; SKX-NEXT: kshiftrw $8, %k1, %k2 2024; SKX-NEXT: vscatterqpd %zmm3, (,%zmm0) {%k1} 2025; SKX-NEXT: vscatterqpd %zmm4, (,%zmm1) {%k2} 2026; SKX-NEXT: retq 2027; 2028; SKX_32-LABEL: test_scatter_16f64: 2029; SKX_32: # BB#0: 2030; SKX_32-NEXT: pushl %ebp 2031; SKX_32-NEXT: .Ltmp10: 2032; SKX_32-NEXT: .cfi_def_cfa_offset 8 2033; SKX_32-NEXT: .Ltmp11: 2034; SKX_32-NEXT: .cfi_offset %ebp, -8 2035; SKX_32-NEXT: movl %esp, %ebp 2036; SKX_32-NEXT: .Ltmp12: 2037; 
SKX_32-NEXT: .cfi_def_cfa_register %ebp 2038; SKX_32-NEXT: andl $-64, %esp 2039; SKX_32-NEXT: subl $64, %esp 2040; SKX_32-NEXT: vpmovsxbd %xmm1, %zmm1 2041; SKX_32-NEXT: vpslld $31, %zmm1, %zmm1 2042; SKX_32-NEXT: vptestmd %zmm1, %zmm1, %k1 2043; SKX_32-NEXT: vmovapd 8(%ebp), %zmm1 2044; SKX_32-NEXT: kshiftrw $8, %k1, %k2 2045; SKX_32-NEXT: vscatterdpd %zmm2, (,%ymm0) {%k1} 2046; SKX_32-NEXT: vextracti32x8 $1, %zmm0, %ymm0 2047; SKX_32-NEXT: vscatterdpd %zmm1, (,%ymm0) {%k2} 2048; SKX_32-NEXT: movl %ebp, %esp 2049; SKX_32-NEXT: popl %ebp 2050; SKX_32-NEXT: retl 2051 call void @llvm.masked.scatter.v16f64(<16 x double> %src0, <16 x double*> %ptrs, i32 4, <16 x i1> %mask) 2052 ret void 2053} 2054declare void @llvm.masked.scatter.v16f64(<16 x double> %src0, <16 x double*> %ptrs, i32, <16 x i1> %mask) 2055