1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f < %s | FileCheck %s --check-prefix=KNL_64 3; RUN: llc -mtriple=i386-unknown-linux-gnu -mattr=+avx512f < %s | FileCheck %s --check-prefix=KNL_32 4; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512vl -mattr=+avx512dq < %s | FileCheck %s --check-prefix=SKX 5; RUN: llc -mtriple=i386-unknown-linux-gnu -mattr=+avx512vl -mattr=+avx512dq < %s | FileCheck %s --check-prefix=SKX_32 6; RUN: opt -mtriple=x86_64-apple-darwin -codegenprepare -mcpu=corei7-avx -S < %s | FileCheck %s -check-prefix=SCALAR 7 8 9target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" 10target triple = "x86_64-unknown-linux-gnu" 11 12 13; SCALAR-LABEL: test1 14; SCALAR: extractelement <16 x float*> 15; SCALAR-NEXT: load float 16; SCALAR-NEXT: insertelement <16 x float> 17; SCALAR-NEXT: extractelement <16 x float*> 18; SCALAR-NEXT: load float 19 20define <16 x float> @test1(float* %base, <16 x i32> %ind) { 21; KNL_64-LABEL: test1: 22; KNL_64: # BB#0: 23; KNL_64-NEXT: kxnorw %k1, %k1, %k1 24; KNL_64-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1} 25; KNL_64-NEXT: vmovaps %zmm1, %zmm0 26; KNL_64-NEXT: retq 27; 28; KNL_32-LABEL: test1: 29; KNL_32: # BB#0: 30; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax 31; KNL_32-NEXT: kxnorw %k1, %k1, %k1 32; KNL_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1} 33; KNL_32-NEXT: vmovaps %zmm1, %zmm0 34; KNL_32-NEXT: retl 35; 36; SKX-LABEL: test1: 37; SKX: # BB#0: 38; SKX-NEXT: kxnorw %k1, %k1, %k1 39; SKX-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1} 40; SKX-NEXT: vmovaps %zmm1, %zmm0 41; SKX-NEXT: retq 42 43 %broadcast.splatinsert = insertelement <16 x float*> undef, float* %base, i32 0 44 %broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer 45 46 %sext_ind = sext <16 x i32> %ind to <16 x i64> 47 %gep.random = getelementptr float, <16 x float*> %broadcast.splat, 
<16 x i64> %sext_ind 48 49 %res = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef) 50 ret <16 x float>%res 51} 52 53declare <16 x i32> @llvm.masked.gather.v16i32(<16 x i32*>, i32, <16 x i1>, <16 x i32>) 54declare <16 x float> @llvm.masked.gather.v16f32(<16 x float*>, i32, <16 x i1>, <16 x float>) 55declare <8 x i32> @llvm.masked.gather.v8i32(<8 x i32*> , i32, <8 x i1> , <8 x i32> ) 56 57 58; SCALAR-LABEL: test2 59; SCALAR: extractelement <16 x float*> 60; SCALAR-NEXT: load float 61; SCALAR-NEXT: insertelement <16 x float> 62; SCALAR-NEXT: br label %else 63; SCALAR: else: 64; SCALAR-NEXT: %res.phi.else = phi 65; SCALAR-NEXT: %Mask1 = extractelement <16 x i1> %imask, i32 1 66; SCALAR-NEXT: %ToLoad1 = icmp eq i1 %Mask1, true 67; SCALAR-NEXT: br i1 %ToLoad1, label %cond.load1, label %else2 68 69define <16 x float> @test2(float* %base, <16 x i32> %ind, i16 %mask) { 70; KNL_64-LABEL: test2: 71; KNL_64: # BB#0: 72; KNL_64-NEXT: kmovw %esi, %k1 73; KNL_64-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1} 74; KNL_64-NEXT: vmovaps %zmm1, %zmm0 75; KNL_64-NEXT: retq 76; 77; KNL_32-LABEL: test2: 78; KNL_32: # BB#0: 79; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax 80; KNL_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1 81; KNL_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1} 82; KNL_32-NEXT: vmovaps %zmm1, %zmm0 83; KNL_32-NEXT: retl 84; 85; SKX-LABEL: test2: 86; SKX: # BB#0: 87; SKX-NEXT: kmovw %esi, %k1 88; SKX-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1} 89; SKX-NEXT: vmovaps %zmm1, %zmm0 90; SKX-NEXT: retq 91 92 %broadcast.splatinsert = insertelement <16 x float*> undef, float* %base, i32 0 93 %broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer 94 95 %sext_ind = sext <16 x i32> %ind to <16 x i64> 96 %gep.random = getelementptr 
float, <16 x float*> %broadcast.splat, <16 x i64> %sext_ind 97 %imask = bitcast i16 %mask to <16 x i1> 98 %res = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %gep.random, i32 4, <16 x i1> %imask, <16 x float>undef) 99 ret <16 x float> %res 100} 101 102define <16 x i32> @test3(i32* %base, <16 x i32> %ind, i16 %mask) { 103; KNL_64-LABEL: test3: 104; KNL_64: # BB#0: 105; KNL_64-NEXT: kmovw %esi, %k1 106; KNL_64-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k1} 107; KNL_64-NEXT: vmovaps %zmm1, %zmm0 108; KNL_64-NEXT: retq 109; 110; KNL_32-LABEL: test3: 111; KNL_32: # BB#0: 112; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax 113; KNL_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1 114; KNL_32-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm1 {%k1} 115; KNL_32-NEXT: vmovaps %zmm1, %zmm0 116; KNL_32-NEXT: retl 117; 118; SKX-LABEL: test3: 119; SKX: # BB#0: 120; SKX-NEXT: kmovw %esi, %k1 121; SKX-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k1} 122; SKX-NEXT: vmovaps %zmm1, %zmm0 123; SKX-NEXT: retq 124 125 %broadcast.splatinsert = insertelement <16 x i32*> undef, i32* %base, i32 0 126 %broadcast.splat = shufflevector <16 x i32*> %broadcast.splatinsert, <16 x i32*> undef, <16 x i32> zeroinitializer 127 128 %sext_ind = sext <16 x i32> %ind to <16 x i64> 129 %gep.random = getelementptr i32, <16 x i32*> %broadcast.splat, <16 x i64> %sext_ind 130 %imask = bitcast i16 %mask to <16 x i1> 131 %res = call <16 x i32> @llvm.masked.gather.v16i32(<16 x i32*> %gep.random, i32 4, <16 x i1> %imask, <16 x i32>undef) 132 ret <16 x i32> %res 133} 134 135 136define <16 x i32> @test4(i32* %base, <16 x i32> %ind, i16 %mask) { 137; KNL_64-LABEL: test4: 138; KNL_64: # BB#0: 139; KNL_64-NEXT: kmovw %esi, %k1 140; KNL_64-NEXT: kmovw %k1, %k2 141; KNL_64-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k2} 142; KNL_64-NEXT: vmovaps %zmm1, %zmm2 143; KNL_64-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm2 {%k1} 144; KNL_64-NEXT: vpaddd %zmm2, %zmm1, %zmm0 145; KNL_64-NEXT: retq 146; 147; KNL_32-LABEL: test4: 148; KNL_32: # BB#0: 149; 
KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax 150; KNL_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1 151; KNL_32-NEXT: kmovw %k1, %k2 152; KNL_32-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm1 {%k2} 153; KNL_32-NEXT: vmovaps %zmm1, %zmm2 154; KNL_32-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm2 {%k1} 155; KNL_32-NEXT: vpaddd %zmm2, %zmm1, %zmm0 156; KNL_32-NEXT: retl 157; 158; SKX-LABEL: test4: 159; SKX: # BB#0: 160; SKX-NEXT: kmovw %esi, %k1 161; SKX-NEXT: kmovw %k1, %k2 162; SKX-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k2} 163; SKX-NEXT: vmovaps %zmm1, %zmm2 164; SKX-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm2 {%k1} 165; SKX-NEXT: vpaddd %zmm2, %zmm1, %zmm0 166; SKX-NEXT: retq 167 168 %broadcast.splatinsert = insertelement <16 x i32*> undef, i32* %base, i32 0 169 %broadcast.splat = shufflevector <16 x i32*> %broadcast.splatinsert, <16 x i32*> undef, <16 x i32> zeroinitializer 170 171 %gep.random = getelementptr i32, <16 x i32*> %broadcast.splat, <16 x i32> %ind 172 %imask = bitcast i16 %mask to <16 x i1> 173 %gt1 = call <16 x i32> @llvm.masked.gather.v16i32(<16 x i32*> %gep.random, i32 4, <16 x i1> %imask, <16 x i32>undef) 174 %gt2 = call <16 x i32> @llvm.masked.gather.v16i32(<16 x i32*> %gep.random, i32 4, <16 x i1> %imask, <16 x i32>%gt1) 175 %res = add <16 x i32> %gt1, %gt2 176 ret <16 x i32> %res 177} 178 179 180; SCALAR-LABEL: test5 181; SCALAR: %Mask0 = extractelement <16 x i1> %imask, i32 0 182; SCALAR-NEXT: %ToStore0 = icmp eq i1 %Mask0, true 183; SCALAR-NEXT: br i1 %ToStore0, label %cond.store, label %else 184; SCALAR: cond.store: 185; SCALAR-NEXT: %Elt0 = extractelement <16 x i32> %val, i32 0 186; SCALAR-NEXT: %Ptr0 = extractelement <16 x i32*> %gep.random, i32 0 187; SCALAR-NEXT: store i32 %Elt0, i32* %Ptr0, align 4 188; SCALAR-NEXT: br label %else 189; SCALAR: else: 190; SCALAR-NEXT: %Mask1 = extractelement <16 x i1> %imask, i32 1 191; SCALAR-NEXT: %ToStore1 = icmp eq i1 %Mask1, true 192; SCALAR-NEXT: br i1 %ToStore1, label %cond.store1, label %else2 193 194define void @test5(i32* 
%base, <16 x i32> %ind, i16 %mask, <16 x i32>%val) { 195; KNL_64-LABEL: test5: 196; KNL_64: # BB#0: 197; KNL_64-NEXT: kmovw %esi, %k1 198; KNL_64-NEXT: kmovw %k1, %k2 199; KNL_64-NEXT: vpscatterdd %zmm1, (%rdi,%zmm0,4) {%k2} 200; KNL_64-NEXT: vpscatterdd %zmm1, (%rdi,%zmm0,4) {%k1} 201; KNL_64-NEXT: retq 202; 203; KNL_32-LABEL: test5: 204; KNL_32: # BB#0: 205; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax 206; KNL_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1 207; KNL_32-NEXT: kmovw %k1, %k2 208; KNL_32-NEXT: vpscatterdd %zmm1, (%eax,%zmm0,4) {%k2} 209; KNL_32-NEXT: vpscatterdd %zmm1, (%eax,%zmm0,4) {%k1} 210; KNL_32-NEXT: retl 211; 212; SKX-LABEL: test5: 213; SKX: # BB#0: 214; SKX-NEXT: kmovw %esi, %k1 215; SKX-NEXT: kmovw %k1, %k2 216; SKX-NEXT: vpscatterdd %zmm1, (%rdi,%zmm0,4) {%k2} 217; SKX-NEXT: vpscatterdd %zmm1, (%rdi,%zmm0,4) {%k1} 218; SKX-NEXT: retq 219 220 %broadcast.splatinsert = insertelement <16 x i32*> undef, i32* %base, i32 0 221 %broadcast.splat = shufflevector <16 x i32*> %broadcast.splatinsert, <16 x i32*> undef, <16 x i32> zeroinitializer 222 223 %gep.random = getelementptr i32, <16 x i32*> %broadcast.splat, <16 x i32> %ind 224 %imask = bitcast i16 %mask to <16 x i1> 225 call void @llvm.masked.scatter.v16i32(<16 x i32>%val, <16 x i32*> %gep.random, i32 4, <16 x i1> %imask) 226 call void @llvm.masked.scatter.v16i32(<16 x i32>%val, <16 x i32*> %gep.random, i32 4, <16 x i1> %imask) 227 ret void 228} 229 230declare void @llvm.masked.scatter.v8i32(<8 x i32> , <8 x i32*> , i32 , <8 x i1> ) 231declare void @llvm.masked.scatter.v16i32(<16 x i32> , <16 x i32*> , i32 , <16 x i1> ) 232 233 234; SCALAR-LABEL: test6 235; SCALAR: store i32 %Elt0, i32* %Ptr01, align 4 236; SCALAR-NEXT: %Elt1 = extractelement <8 x i32> %a1, i32 1 237; SCALAR-NEXT: %Ptr12 = extractelement <8 x i32*> %ptr, i32 1 238; SCALAR-NEXT: store i32 %Elt1, i32* %Ptr12, align 4 239; SCALAR-NEXT: %Elt2 = extractelement <8 x i32> %a1, i32 2 240; SCALAR-NEXT: %Ptr23 = extractelement <8 x i32*> %ptr, i32 2 
241; SCALAR-NEXT: store i32 %Elt2, i32* %Ptr23, align 4 242 243define <8 x i32> @test6(<8 x i32>%a1, <8 x i32*> %ptr) { 244; KNL_64-LABEL: test6: 245; KNL_64: # BB#0: 246; KNL_64-NEXT: kxnorw %k1, %k1, %k1 247; KNL_64-NEXT: kxnorw %k2, %k2, %k2 248; KNL_64-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k2} 249; KNL_64-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1} 250; KNL_64-NEXT: vmovaps %zmm2, %zmm0 251; KNL_64-NEXT: retq 252; 253; KNL_32-LABEL: test6: 254; KNL_32: # BB#0: 255; KNL_32-NEXT: kxnorw %k1, %k1, %k1 256; KNL_32-NEXT: vpmovsxdq %ymm1, %zmm2 257; KNL_32-NEXT: kxnorw %k2, %k2, %k2 258; KNL_32-NEXT: vpgatherqd (,%zmm2), %ymm1 {%k2} 259; KNL_32-NEXT: vpscatterqd %ymm0, (,%zmm2) {%k1} 260; KNL_32-NEXT: vmovaps %zmm1, %zmm0 261; KNL_32-NEXT: retl 262; 263; SKX-LABEL: test6: 264; SKX: # BB#0: 265; SKX-NEXT: kxnorw %k1, %k1, %k1 266; SKX-NEXT: kxnorw %k2, %k2, %k2 267; SKX-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k2} 268; SKX-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1} 269; SKX-NEXT: vmovaps %zmm2, %zmm0 270; SKX-NEXT: retq 271 272 %a = call <8 x i32> @llvm.masked.gather.v8i32(<8 x i32*> %ptr, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef) 273 274 call void @llvm.masked.scatter.v8i32(<8 x i32> %a1, <8 x i32*> %ptr, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>) 275 ret <8 x i32>%a 276} 277 278define <8 x i32> @test7(i32* %base, <8 x i32> %ind, i8 %mask) { 279; 280; KNL_64-LABEL: test7: 281; KNL_64: # BB#0: 282; KNL_64-NEXT: movzbl %sil, %eax 283; KNL_64-NEXT: kmovw %eax, %k1 284; KNL_64-NEXT: vpmovsxdq %ymm0, %zmm0 285; KNL_64-NEXT: kmovw %k1, %k2 286; KNL_64-NEXT: vpgatherqd (%rdi,%zmm0,4), %ymm1 {%k2} 287; KNL_64-NEXT: vmovaps %zmm1, %zmm2 288; KNL_64-NEXT: vpgatherqd (%rdi,%zmm0,4), %ymm2 {%k1} 289; KNL_64-NEXT: vpaddd %ymm2, %ymm1, %ymm0 290; KNL_64-NEXT: retq 291; 292; KNL_32-LABEL: test7: 293; KNL_32: # BB#0: 294; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax 295; KNL_32-NEXT: kmovw 
{{[0-9]+}}(%esp), %k1 296; KNL_32-NEXT: vpmovsxdq %ymm0, %zmm0 297; KNL_32-NEXT: kmovw %k1, %k2 298; KNL_32-NEXT: vpgatherqd (%eax,%zmm0,4), %ymm1 {%k2} 299; KNL_32-NEXT: vmovaps %zmm1, %zmm2 300; KNL_32-NEXT: vpgatherqd (%eax,%zmm0,4), %ymm2 {%k1} 301; KNL_32-NEXT: vpaddd %ymm2, %ymm1, %ymm0 302; KNL_32-NEXT: retl 303; 304; SKX-LABEL: test7: 305; SKX: # BB#0: 306; SKX-NEXT: kmovb %esi, %k1 307; SKX-NEXT: kmovw %k1, %k2 308; SKX-NEXT: vpgatherdd (%rdi,%ymm0,4), %ymm1 {%k2} 309; SKX-NEXT: vmovaps %zmm1, %zmm2 310; SKX-NEXT: vpgatherdd (%rdi,%ymm0,4), %ymm2 {%k1} 311; SKX-NEXT: vpaddd %ymm2, %ymm1, %ymm0 312; SKX-NEXT: retq 313 314 %broadcast.splatinsert = insertelement <8 x i32*> undef, i32* %base, i32 0 315 %broadcast.splat = shufflevector <8 x i32*> %broadcast.splatinsert, <8 x i32*> undef, <8 x i32> zeroinitializer 316 317 %gep.random = getelementptr i32, <8 x i32*> %broadcast.splat, <8 x i32> %ind 318 %imask = bitcast i8 %mask to <8 x i1> 319 %gt1 = call <8 x i32> @llvm.masked.gather.v8i32(<8 x i32*> %gep.random, i32 4, <8 x i1> %imask, <8 x i32>undef) 320 %gt2 = call <8 x i32> @llvm.masked.gather.v8i32(<8 x i32*> %gep.random, i32 4, <8 x i1> %imask, <8 x i32>%gt1) 321 %res = add <8 x i32> %gt1, %gt2 322 ret <8 x i32> %res 323} 324 325; No uniform base in this case, index <8 x i64> contains addresses, 326; each gather call will be split into two 327define <16 x i32> @test8(<16 x i32*> %ptr.random, <16 x i32> %ind, i16 %mask) { 328; KNL_64-LABEL: test8: 329; KNL_64: # BB#0: 330; KNL_64-NEXT: kmovw %edi, %k1 331; KNL_64-NEXT: kshiftrw $8, %k1, %k2 332; KNL_64-NEXT: kmovw %k2, %k3 333; KNL_64-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k3} 334; KNL_64-NEXT: kmovw %k1, %k3 335; KNL_64-NEXT: vpgatherqd (,%zmm0), %ymm3 {%k3} 336; KNL_64-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm4 337; KNL_64-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k2} 338; KNL_64-NEXT: vpgatherqd (,%zmm0), %ymm3 {%k1} 339; KNL_64-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm0 340; KNL_64-NEXT: vpaddd %zmm0, %zmm4, %zmm0 
341; KNL_64-NEXT: retq 342; 343; KNL_32-LABEL: test8: 344; KNL_32: # BB#0: 345; KNL_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1 346; KNL_32-NEXT: kmovw %k1, %k2 347; KNL_32-NEXT: vpgatherdd (,%zmm0), %zmm1 {%k2} 348; KNL_32-NEXT: vmovaps %zmm1, %zmm2 349; KNL_32-NEXT: vpgatherdd (,%zmm0), %zmm2 {%k1} 350; KNL_32-NEXT: vpaddd %zmm2, %zmm1, %zmm0 351; KNL_32-NEXT: retl 352; 353; SKX-LABEL: test8: 354; SKX: # BB#0: 355; SKX-NEXT: kmovw %edi, %k1 356; SKX-NEXT: kshiftrw $8, %k1, %k2 357; SKX-NEXT: kmovw %k2, %k3 358; SKX-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k3} 359; SKX-NEXT: kmovw %k1, %k3 360; SKX-NEXT: vpgatherqd (,%zmm0), %ymm3 {%k3} 361; SKX-NEXT: vinserti32x8 $1, %ymm2, %zmm3, %zmm4 362; SKX-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k2} 363; SKX-NEXT: vpgatherqd (,%zmm0), %ymm3 {%k1} 364; SKX-NEXT: vinserti32x8 $1, %ymm2, %zmm3, %zmm0 365; SKX-NEXT: vpaddd %zmm0, %zmm4, %zmm0 366; SKX-NEXT: retq 367; 368; SKX_32-LABEL: test8: 369; SKX_32: # BB#0: 370; SKX_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1 371; SKX_32-NEXT: kmovw %k1, %k2 372; SKX_32-NEXT: vpgatherdd (,%zmm0), %zmm1 {%k2} 373; SKX_32-NEXT: vmovaps %zmm1, %zmm2 374; SKX_32-NEXT: vpgatherdd (,%zmm0), %zmm2 {%k1} 375; SKX_32-NEXT: vpaddd %zmm2, %zmm1, %zmm0 376; SKX_32-NEXT: retl 377 378 %imask = bitcast i16 %mask to <16 x i1> 379 %gt1 = call <16 x i32> @llvm.masked.gather.v16i32(<16 x i32*> %ptr.random, i32 4, <16 x i1> %imask, <16 x i32>undef) 380 %gt2 = call <16 x i32> @llvm.masked.gather.v16i32(<16 x i32*> %ptr.random, i32 4, <16 x i1> %imask, <16 x i32>%gt1) 381 %res = add <16 x i32> %gt1, %gt2 382 ret <16 x i32> %res 383} 384 385%struct.RT = type { i8, [10 x [20 x i32]], i8 } 386%struct.ST = type { i32, double, %struct.RT } 387 388; Masked gather for agregate types 389; Test9 and Test10 should give the same result (scalar and vector indices in GEP) 390 391 392define <8 x i32> @test9(%struct.ST* %base, <8 x i64> %ind1, <8 x i32>%ind5) { 393; KNL_64-LABEL: test9: 394; KNL_64: # BB#0: # %entry 395; KNL_64-NEXT: vpbroadcastq 
%rdi, %zmm2 396; KNL_64-NEXT: vpmovsxdq %ymm1, %zmm1 397; KNL_64-NEXT: vpbroadcastq {{.*}}(%rip), %zmm3 398; KNL_64-NEXT: vpmuludq %zmm3, %zmm1, %zmm4 399; KNL_64-NEXT: vpsrlq $32, %zmm1, %zmm1 400; KNL_64-NEXT: vpmuludq %zmm3, %zmm1, %zmm1 401; KNL_64-NEXT: vpsllq $32, %zmm1, %zmm1 402; KNL_64-NEXT: vpaddq %zmm1, %zmm4, %zmm1 403; KNL_64-NEXT: vpbroadcastq {{.*}}(%rip), %zmm3 404; KNL_64-NEXT: vpmuludq %zmm3, %zmm0, %zmm4 405; KNL_64-NEXT: vpsrlq $32, %zmm0, %zmm0 406; KNL_64-NEXT: vpmuludq %zmm3, %zmm0, %zmm0 407; KNL_64-NEXT: vpsllq $32, %zmm0, %zmm0 408; KNL_64-NEXT: vpaddq %zmm0, %zmm4, %zmm0 409; KNL_64-NEXT: vpaddq %zmm0, %zmm2, %zmm0 410; KNL_64-NEXT: vpaddq %zmm1, %zmm0, %zmm0 411; KNL_64-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm1 412; KNL_64-NEXT: kxnorw %k1, %k1, %k1 413; KNL_64-NEXT: vpgatherqd (,%zmm1), %ymm0 {%k1} 414; KNL_64-NEXT: retq 415; 416; KNL_32-LABEL: test9: 417; KNL_32: # BB#0: # %entry 418; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %ymm2 419; KNL_32-NEXT: vpbroadcastd .LCPI8_0, %ymm3 420; KNL_32-NEXT: vpmulld %ymm3, %ymm1, %ymm1 421; KNL_32-NEXT: vpmovqd %zmm0, %ymm0 422; KNL_32-NEXT: vpbroadcastd .LCPI8_1, %ymm3 423; KNL_32-NEXT: vpmulld %ymm3, %ymm0, %ymm0 424; KNL_32-NEXT: vpaddd %ymm0, %ymm2, %ymm0 425; KNL_32-NEXT: vpaddd %ymm1, %ymm0, %ymm0 426; KNL_32-NEXT: vpbroadcastd .LCPI8_2, %ymm1 427; KNL_32-NEXT: vpaddd %ymm1, %ymm0, %ymm0 428; KNL_32-NEXT: vpmovsxdq %ymm0, %zmm1 429; KNL_32-NEXT: kxnorw %k1, %k1, %k1 430; KNL_32-NEXT: vpgatherqd (,%zmm1), %ymm0 {%k1} 431; KNL_32-NEXT: retl 432; 433; SKX-LABEL: test9: 434; SKX: # BB#0: # %entry 435; SKX-NEXT: vpbroadcastq %rdi, %zmm2 436; SKX-NEXT: vpmullq {{.*}}(%rip){1to8}, %zmm0, %zmm0 437; SKX-NEXT: vpaddq %zmm0, %zmm2, %zmm0 438; SKX-NEXT: vpmovsxdq %ymm1, %zmm1 439; SKX-NEXT: vpmullq {{.*}}(%rip){1to8}, %zmm1, %zmm1 440; SKX-NEXT: vpaddq %zmm1, %zmm0, %zmm0 441; SKX-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm1 442; SKX-NEXT: kxnorw %k1, %k1, %k1 443; SKX-NEXT: vpgatherqd (,%zmm1), 
%ymm0 {%k1} 444; SKX-NEXT: retq 445entry: 446 %broadcast.splatinsert = insertelement <8 x %struct.ST*> undef, %struct.ST* %base, i32 0 447 %broadcast.splat = shufflevector <8 x %struct.ST*> %broadcast.splatinsert, <8 x %struct.ST*> undef, <8 x i32> zeroinitializer 448 449 %arrayidx = getelementptr %struct.ST, <8 x %struct.ST*> %broadcast.splat, <8 x i64> %ind1, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>, <8 x i32><i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, <8 x i32> %ind5, <8 x i64> <i64 13, i64 13, i64 13, i64 13, i64 13, i64 13, i64 13, i64 13> 450 %res = call <8 x i32 > @llvm.masked.gather.v8i32(<8 x i32*>%arrayidx, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef) 451 ret <8 x i32> %res 452} 453 454define <8 x i32> @test10(%struct.ST* %base, <8 x i64> %i1, <8 x i32>%ind5) { 455; KNL_64-LABEL: test10: 456; KNL_64: # BB#0: # %entry 457; KNL_64-NEXT: vpbroadcastq %rdi, %zmm2 458; KNL_64-NEXT: vpmovsxdq %ymm1, %zmm1 459; KNL_64-NEXT: vpbroadcastq {{.*}}(%rip), %zmm3 460; KNL_64-NEXT: vpmuludq %zmm3, %zmm1, %zmm4 461; KNL_64-NEXT: vpsrlq $32, %zmm1, %zmm1 462; KNL_64-NEXT: vpmuludq %zmm3, %zmm1, %zmm1 463; KNL_64-NEXT: vpsllq $32, %zmm1, %zmm1 464; KNL_64-NEXT: vpaddq %zmm1, %zmm4, %zmm1 465; KNL_64-NEXT: vpbroadcastq {{.*}}(%rip), %zmm3 466; KNL_64-NEXT: vpmuludq %zmm3, %zmm0, %zmm4 467; KNL_64-NEXT: vpsrlq $32, %zmm0, %zmm0 468; KNL_64-NEXT: vpmuludq %zmm3, %zmm0, %zmm0 469; KNL_64-NEXT: vpsllq $32, %zmm0, %zmm0 470; KNL_64-NEXT: vpaddq %zmm0, %zmm4, %zmm0 471; KNL_64-NEXT: vpaddq %zmm0, %zmm2, %zmm0 472; KNL_64-NEXT: vpaddq %zmm1, %zmm0, %zmm0 473; KNL_64-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm1 474; KNL_64-NEXT: kxnorw %k1, %k1, %k1 475; KNL_64-NEXT: vpgatherqd (,%zmm1), %ymm0 {%k1} 476; KNL_64-NEXT: retq 477; 478; KNL_32-LABEL: test10: 479; KNL_32: # BB#0: # %entry 480; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %ymm2 481; KNL_32-NEXT: vpbroadcastd .LCPI9_0, 
%ymm3 482; KNL_32-NEXT: vpmulld %ymm3, %ymm1, %ymm1 483; KNL_32-NEXT: vpmovqd %zmm0, %ymm0 484; KNL_32-NEXT: vpbroadcastd .LCPI9_1, %ymm3 485; KNL_32-NEXT: vpmulld %ymm3, %ymm0, %ymm0 486; KNL_32-NEXT: vpaddd %ymm0, %ymm2, %ymm0 487; KNL_32-NEXT: vpaddd %ymm1, %ymm0, %ymm0 488; KNL_32-NEXT: vpbroadcastd .LCPI9_2, %ymm1 489; KNL_32-NEXT: vpaddd %ymm1, %ymm0, %ymm0 490; KNL_32-NEXT: vpmovsxdq %ymm0, %zmm1 491; KNL_32-NEXT: kxnorw %k1, %k1, %k1 492; KNL_32-NEXT: vpgatherqd (,%zmm1), %ymm0 {%k1} 493; KNL_32-NEXT: retl 494; 495; SKX-LABEL: test10: 496; SKX: # BB#0: # %entry 497; SKX-NEXT: vpbroadcastq %rdi, %zmm2 498; SKX-NEXT: vpmullq {{.*}}(%rip){1to8}, %zmm0, %zmm0 499; SKX-NEXT: vpaddq %zmm0, %zmm2, %zmm0 500; SKX-NEXT: vpmovsxdq %ymm1, %zmm1 501; SKX-NEXT: vpmullq {{.*}}(%rip){1to8}, %zmm1, %zmm1 502; SKX-NEXT: vpaddq %zmm1, %zmm0, %zmm0 503; SKX-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm1 504; SKX-NEXT: kxnorw %k1, %k1, %k1 505; SKX-NEXT: vpgatherqd (,%zmm1), %ymm0 {%k1} 506; SKX-NEXT: retq 507entry: 508 %broadcast.splatinsert = insertelement <8 x %struct.ST*> undef, %struct.ST* %base, i32 0 509 %broadcast.splat = shufflevector <8 x %struct.ST*> %broadcast.splatinsert, <8 x %struct.ST*> undef, <8 x i32> zeroinitializer 510 511 %arrayidx = getelementptr %struct.ST, <8 x %struct.ST*> %broadcast.splat, <8 x i64> %i1, i32 2, i32 1, <8 x i32> %ind5, i64 13 512 %res = call <8 x i32 > @llvm.masked.gather.v8i32(<8 x i32*>%arrayidx, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef) 513 ret <8 x i32> %res 514} 515 516; Splat index in GEP, requires broadcast 517define <16 x float> @test11(float* %base, i32 %ind) { 518; KNL_64-LABEL: test11: 519; KNL_64: # BB#0: 520; KNL_64-NEXT: vpbroadcastd %esi, %zmm1 521; KNL_64-NEXT: kxnorw %k1, %k1, %k1 522; KNL_64-NEXT: vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1} 523; KNL_64-NEXT: retq 524; 525; KNL_32-LABEL: test11: 526; KNL_32: # BB#0: 527; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax 
528; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %zmm1 529; KNL_32-NEXT: kxnorw %k1, %k1, %k1 530; KNL_32-NEXT: vgatherdps (%eax,%zmm1,4), %zmm0 {%k1} 531; KNL_32-NEXT: retl 532; 533; SKX-LABEL: test11: 534; SKX: # BB#0: 535; SKX-NEXT: vpbroadcastd %esi, %zmm1 536; SKX-NEXT: kxnorw %k1, %k1, %k1 537; SKX-NEXT: vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1} 538; SKX-NEXT: retq 539 540 %broadcast.splatinsert = insertelement <16 x float*> undef, float* %base, i32 0 541 %broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer 542 543 %gep.random = getelementptr float, <16 x float*> %broadcast.splat, i32 %ind 544 545 %res = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef) 546 ret <16 x float>%res 547} 548 549; We are checking the uniform base here. 
It is taken directly from input to vgatherdps 550define <16 x float> @test12(float* %base, <16 x i32> %ind) { 551; KNL_64-LABEL: test12: 552; KNL_64: # BB#0: 553; KNL_64-NEXT: kxnorw %k1, %k1, %k1 554; KNL_64-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1} 555; KNL_64-NEXT: vmovaps %zmm1, %zmm0 556; KNL_64-NEXT: retq 557; 558; KNL_32-LABEL: test12: 559; KNL_32: # BB#0: 560; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax 561; KNL_32-NEXT: kxnorw %k1, %k1, %k1 562; KNL_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1} 563; KNL_32-NEXT: vmovaps %zmm1, %zmm0 564; KNL_32-NEXT: retl 565; 566; SKX-LABEL: test12: 567; SKX: # BB#0: 568; SKX-NEXT: kxnorw %k1, %k1, %k1 569; SKX-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1} 570; SKX-NEXT: vmovaps %zmm1, %zmm0 571; SKX-NEXT: retq 572 573 %sext_ind = sext <16 x i32> %ind to <16 x i64> 574 %gep.random = getelementptr float, float *%base, <16 x i64> %sext_ind 575 576 %res = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef) 577 ret <16 x float>%res 578} 579 580; The same as the previous, but the mask is undefined 581define <16 x float> @test13(float* %base, <16 x i32> %ind) { 582; KNL_64-LABEL: test13: 583; KNL_64: # BB#0: 584; KNL_64-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1} 585; KNL_64-NEXT: vmovaps %zmm1, %zmm0 586; KNL_64-NEXT: retq 587; 588; KNL_32-LABEL: test13: 589; KNL_32: # BB#0: 590; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax 591; KNL_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1} 592; KNL_32-NEXT: vmovaps %zmm1, %zmm0 593; KNL_32-NEXT: retl 594; 595; SKX-LABEL: test13: 596; SKX: # BB#0: 597; SKX-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1} 598; SKX-NEXT: vmovaps %zmm1, %zmm0 599; SKX-NEXT: retq 600 601 %sext_ind = sext <16 x i32> %ind to <16 x i64> 602 %gep.random = getelementptr float, float *%base, <16 x i64> %sext_ind 603 604 
%res = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %gep.random, i32 4, <16 x i1> undef, <16 x float> undef) 605 ret <16 x float>%res 606} 607 608; The base pointer is not splat, can't find unform base 609define <16 x float> @test14(float* %base, i32 %ind, <16 x float*> %vec) { 610; KNL_64-LABEL: test14: 611; KNL_64: # BB#0: 612; KNL_64-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm1 613; KNL_64-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0 614; KNL_64-NEXT: vpbroadcastq %xmm0, %zmm0 615; KNL_64-NEXT: vmovd %esi, %xmm1 616; KNL_64-NEXT: vpbroadcastd %xmm1, %ymm1 617; KNL_64-NEXT: vpmovsxdq %ymm1, %zmm1 618; KNL_64-NEXT: vpsllq $2, %zmm1, %zmm1 619; KNL_64-NEXT: vpaddq %zmm1, %zmm0, %zmm0 620; KNL_64-NEXT: kshiftrw $8, %k0, %k1 621; KNL_64-NEXT: vgatherqps (,%zmm0), %ymm1 {%k1} 622; KNL_64-NEXT: vgatherqps (,%zmm0), %ymm2 {%k1} 623; KNL_64-NEXT: vinsertf64x4 $1, %ymm1, %zmm2, %zmm0 624; KNL_64-NEXT: retq 625; 626; KNL_32-LABEL: test14: 627; KNL_32: # BB#0: 628; KNL_32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm1 629; KNL_32-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0 630; KNL_32-NEXT: vpbroadcastd %xmm0, %zmm0 631; KNL_32-NEXT: vpslld $2, {{[0-9]+}}(%esp){1to16}, %zmm1 632; KNL_32-NEXT: vpaddd %zmm1, %zmm0, %zmm1 633; KNL_32-NEXT: vgatherdps (,%zmm1), %zmm0 {%k1} 634; KNL_32-NEXT: retl 635; 636; SKX-LABEL: test14: 637; SKX: # BB#0: 638; SKX-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm1 639; SKX-NEXT: vinserti64x2 $0, %xmm1, %zmm0, %zmm0 640; SKX-NEXT: vpbroadcastq %xmm0, %zmm0 641; SKX-NEXT: vmovd %esi, %xmm1 642; SKX-NEXT: vpbroadcastd %xmm1, %ymm1 643; SKX-NEXT: vpmovsxdq %ymm1, %zmm1 644; SKX-NEXT: vpsllq $2, %zmm1, %zmm1 645; SKX-NEXT: vpaddq %zmm1, %zmm0, %zmm0 646; SKX-NEXT: kshiftrw $8, %k0, %k1 647; SKX-NEXT: vgatherqps (,%zmm0), %ymm1 {%k1} 648; SKX-NEXT: vgatherqps (,%zmm0), %ymm2 {%k1} 649; SKX-NEXT: vinsertf32x8 $1, %ymm1, %zmm2, %zmm0 650; SKX-NEXT: retq 651; 652; SKX_32-LABEL: test14: 653; SKX_32: # BB#0: 654; SKX_32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), 
%xmm0, %xmm1 655; SKX_32-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0 656; SKX_32-NEXT: vpbroadcastd %xmm0, %zmm0 657; SKX_32-NEXT: vpslld $2, {{[0-9]+}}(%esp){1to16}, %zmm1 658; SKX_32-NEXT: vpaddd %zmm1, %zmm0, %zmm1 659; SKX_32-NEXT: vgatherdps (,%zmm1), %zmm0 {%k1} 660; SKX_32-NEXT: retl 661 662 %broadcast.splatinsert = insertelement <16 x float*> %vec, float* %base, i32 1 663 %broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer 664 665 %gep.random = getelementptr float, <16 x float*> %broadcast.splat, i32 %ind 666 667 %res = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %gep.random, i32 4, <16 x i1> undef, <16 x float> undef) 668 ret <16 x float>%res 669} 670 671declare <4 x float> @llvm.masked.gather.v4f32(<4 x float*>, i32, <4 x i1>, <4 x float>) 672declare <4 x double> @llvm.masked.gather.v4f64(<4 x double*>, i32, <4 x i1>, <4 x double>) 673declare <2 x double> @llvm.masked.gather.v2f64(<2 x double*>, i32, <2 x i1>, <2 x double>) 674 675; Gather smaller than existing instruction 676define <4 x float> @test15(float* %base, <4 x i32> %ind, <4 x i1> %mask) { 677; 678; KNL_64-LABEL: test15: 679; KNL_64: # BB#0: 680; KNL_64-NEXT: vpxor %ymm2, %ymm2, %ymm2 681; KNL_64-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] 682; KNL_64-NEXT: vpmovsxdq %ymm0, %zmm2 683; KNL_64-NEXT: vpmovsxdq %ymm1, %zmm0 684; KNL_64-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm0, %zmm0 685; KNL_64-NEXT: vptestmq %zmm0, %zmm0, %k1 686; KNL_64-NEXT: vgatherqps (%rdi,%zmm2,4), %ymm0 {%k1} 687; KNL_64-NEXT: retq 688; 689; KNL_32-LABEL: test15: 690; KNL_32: # BB#0: 691; KNL_32-NEXT: vpxor %ymm2, %ymm2, %ymm2 692; KNL_32-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] 693; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax 694; KNL_32-NEXT: vpmovsxdq %ymm0, %zmm2 695; KNL_32-NEXT: vpmovsxdq %ymm1, %zmm0 696; KNL_32-NEXT: vpandq .LCPI14_0, %zmm0, %zmm0 697; KNL_32-NEXT: vptestmq %zmm0, %zmm0, %k1 698; KNL_32-NEXT: 
vgatherqps (%eax,%zmm2,4), %ymm0 {%k1} 699; KNL_32-NEXT: retl 700; 701; SKX-LABEL: test15: 702; SKX: # BB#0: 703; SKX-NEXT: vpmovd2m %xmm1, %k1 704; SKX-NEXT: vgatherdps (%rdi,%xmm0,4), %xmm1 {%k1} 705; SKX-NEXT: vmovaps %zmm1, %zmm0 706; SKX-NEXT: retq 707 708 %sext_ind = sext <4 x i32> %ind to <4 x i64> 709 %gep.random = getelementptr float, float* %base, <4 x i64> %sext_ind 710 %res = call <4 x float> @llvm.masked.gather.v4f32(<4 x float*> %gep.random, i32 4, <4 x i1> %mask, <4 x float> undef) 711 ret <4 x float>%res 712} 713 714; Gather smaller than existing instruction 715define <4 x double> @test16(double* %base, <4 x i32> %ind, <4 x i1> %mask, <4 x double> %src0) { 716; 717; KNL_64-LABEL: test16: 718; KNL_64: # BB#0: 719; KNL_64-NEXT: vpslld $31, %xmm1, %xmm1 720; KNL_64-NEXT: vpsrad $31, %xmm1, %xmm1 721; KNL_64-NEXT: vpmovsxdq %xmm1, %ymm1 722; KNL_64-NEXT: vpxord %zmm3, %zmm3, %zmm3 723; KNL_64-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 724; KNL_64-NEXT: vpmovsxdq %ymm0, %zmm0 725; KNL_64-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm1, %zmm1 726; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k1 727; KNL_64-NEXT: vgatherqpd (%rdi,%zmm0,8), %zmm2 {%k1} 728; KNL_64-NEXT: vmovaps %zmm2, %zmm0 729; KNL_64-NEXT: retq 730; 731; KNL_32-LABEL: test16: 732; KNL_32: # BB#0: 733; KNL_32-NEXT: vpslld $31, %xmm1, %xmm1 734; KNL_32-NEXT: vpsrad $31, %xmm1, %xmm1 735; KNL_32-NEXT: vpmovsxdq %xmm1, %ymm1 736; KNL_32-NEXT: vpxord %zmm3, %zmm3, %zmm3 737; KNL_32-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 738; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax 739; KNL_32-NEXT: vpmovsxdq %ymm0, %zmm0 740; KNL_32-NEXT: vpandq .LCPI15_0, %zmm1, %zmm1 741; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k1 742; KNL_32-NEXT: vgatherqpd (%eax,%zmm0,8), %zmm2 {%k1} 743; KNL_32-NEXT: vmovaps %zmm2, %zmm0 744; KNL_32-NEXT: retl 745; 746; SKX-LABEL: test16: 747; SKX: # BB#0: 748; SKX-NEXT: vpmovd2m %xmm1, %k1 749; SKX-NEXT: vgatherdpd (%rdi,%xmm0,8), %ymm2 {%k1} 750; SKX-NEXT: vmovaps %zmm2, %zmm0 751; SKX-NEXT: retq 752 
753 %sext_ind = sext <4 x i32> %ind to <4 x i64> 754 %gep.random = getelementptr double, double* %base, <4 x i64> %sext_ind 755 %res = call <4 x double> @llvm.masked.gather.v4f64(<4 x double*> %gep.random, i32 4, <4 x i1> %mask, <4 x double> %src0) 756 ret <4 x double>%res 757} 758 759define <2 x double> @test17(double* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x double> %src0) { 760; 761; KNL_64-LABEL: test17: 762; KNL_64: # BB#0: 763; KNL_64-NEXT: vpxord %zmm3, %zmm3, %zmm3 764; KNL_64-NEXT: vinserti32x4 $0, %xmm1, %zmm3, %zmm1 765; KNL_64-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm1, %zmm1 766; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k1 767; KNL_64-NEXT: vgatherqpd (%rdi,%zmm0,8), %zmm2 {%k1} 768; KNL_64-NEXT: vmovaps %zmm2, %zmm0 769; KNL_64-NEXT: retq 770; 771; KNL_32-LABEL: test17: 772; KNL_32: # BB#0: 773; KNL_32-NEXT: vpxord %zmm3, %zmm3, %zmm3 774; KNL_32-NEXT: vinserti32x4 $0, %xmm1, %zmm3, %zmm1 775; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax 776; KNL_32-NEXT: vpandq .LCPI16_0, %zmm1, %zmm1 777; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k1 778; KNL_32-NEXT: vgatherqpd (%eax,%zmm0,8), %zmm2 {%k1} 779; KNL_32-NEXT: vmovaps %zmm2, %zmm0 780; KNL_32-NEXT: retl 781; 782; SKX-LABEL: test17: 783; SKX: # BB#0: 784; SKX-NEXT: vpmovq2m %xmm1, %k1 785; SKX-NEXT: vgatherqpd (%rdi,%xmm0,8), %xmm2 {%k1} 786; SKX-NEXT: vmovaps %zmm2, %zmm0 787; SKX-NEXT: retq 788 789 %sext_ind = sext <2 x i32> %ind to <2 x i64> 790 %gep.random = getelementptr double, double* %base, <2 x i64> %sext_ind 791 %res = call <2 x double> @llvm.masked.gather.v2f64(<2 x double*> %gep.random, i32 4, <2 x i1> %mask, <2 x double> %src0) 792 ret <2 x double>%res 793} 794 795declare void @llvm.masked.scatter.v4i32(<4 x i32> , <4 x i32*> , i32 , <4 x i1> ) 796declare void @llvm.masked.scatter.v4f64(<4 x double> , <4 x double*> , i32 , <4 x i1> ) 797declare void @llvm.masked.scatter.v2i64(<2 x i64> , <2 x i64*> , i32 , <2 x i1> ) 798declare void @llvm.masked.scatter.v2i32(<2 x i32> , <2 x i32*> , i32 , <2 x i1> ) 
799declare void @llvm.masked.scatter.v2f32(<2 x float> , <2 x float*> , i32 , <2 x i1> ) 800 801define void @test18(<4 x i32>%a1, <4 x i32*> %ptr, <4 x i1>%mask) { 802; 803; KNL_64-LABEL: test18: 804; KNL_64: # BB#0: 805; KNL_64-NEXT: vpxor %ymm3, %ymm3, %ymm3 806; KNL_64-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] 807; KNL_64-NEXT: vpmovsxdq %ymm2, %zmm2 808; KNL_64-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 809; KNL_64-NEXT: vptestmq %zmm2, %zmm2, %k1 810; KNL_64-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1} 811; KNL_64-NEXT: retq 812; 813; KNL_32-LABEL: test18: 814; KNL_32: # BB#0: 815; KNL_32-NEXT: vpxor %ymm3, %ymm3, %ymm3 816; KNL_32-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] 817; KNL_32-NEXT: vpmovsxdq %ymm1, %zmm1 818; KNL_32-NEXT: vpmovsxdq %ymm2, %zmm2 819; KNL_32-NEXT: vpandq .LCPI17_0, %zmm2, %zmm2 820; KNL_32-NEXT: vptestmq %zmm2, %zmm2, %k1 821; KNL_32-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1} 822; KNL_32-NEXT: retl 823; 824; SKX-LABEL: test18: 825; SKX: # BB#0: 826; SKX-NEXT: vpmovd2m %xmm2, %k1 827; SKX-NEXT: vpscatterqd %xmm0, (,%ymm1) {%k1} 828; SKX-NEXT: retq 829 call void @llvm.masked.scatter.v4i32(<4 x i32> %a1, <4 x i32*> %ptr, i32 4, <4 x i1> %mask) 830 ret void 831} 832 833define void @test19(<4 x double>%a1, double* %ptr, <4 x i1>%mask, <4 x i64> %ind) { 834; 835; KNL_64-LABEL: test19: 836; KNL_64: # BB#0: 837; KNL_64-NEXT: vpslld $31, %xmm1, %xmm1 838; KNL_64-NEXT: vpsrad $31, %xmm1, %xmm1 839; KNL_64-NEXT: vpmovsxdq %xmm1, %ymm1 840; KNL_64-NEXT: vpxord %zmm3, %zmm3, %zmm3 841; KNL_64-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 842; KNL_64-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm1, %zmm1 843; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k1 844; KNL_64-NEXT: vscatterqpd %zmm0, (%rdi,%zmm2,8) {%k1} 845; KNL_64-NEXT: retq 846; 847; KNL_32-LABEL: test19: 848; KNL_32: # BB#0: 849; KNL_32-NEXT: vpslld $31, %xmm1, %xmm1 850; KNL_32-NEXT: vpsrad $31, %xmm1, %xmm1 851; KNL_32-NEXT: vpmovsxdq %xmm1, %ymm1 852; KNL_32-NEXT: vpxord 
%zmm3, %zmm3, %zmm3 853; KNL_32-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1 854; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax 855; KNL_32-NEXT: vpandq .LCPI18_0, %zmm1, %zmm1 856; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k1 857; KNL_32-NEXT: vscatterqpd %zmm0, (%eax,%zmm2,8) {%k1} 858; KNL_32-NEXT: retl 859; 860; SKX-LABEL: test19: 861; SKX: # BB#0: 862; SKX-NEXT: vpmovd2m %xmm1, %k1 863; SKX-NEXT: vscatterqpd %ymm0, (%rdi,%ymm2,8) {%k1} 864; SKX-NEXT: retq 865; 866; SKX_32-LABEL: test19: 867; SKX_32: # BB#0: 868; SKX_32-NEXT: vpmovd2m %xmm1, %k1 869; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax 870; SKX_32-NEXT: vscatterqpd %ymm0, (%eax,%ymm2,8) {%k1} 871; SKX_32-NEXT: retl 872 %gep = getelementptr double, double* %ptr, <4 x i64> %ind 873 call void @llvm.masked.scatter.v4f64(<4 x double> %a1, <4 x double*> %gep, i32 8, <4 x i1> %mask) 874 ret void 875} 876 877; Data type requires widening 878define void @test20(<2 x float>%a1, <2 x float*> %ptr, <2 x i1> %mask) { 879; 880; KNL_64-LABEL: test20: 881; KNL_64: # BB#0: 882; KNL_64-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] 883; KNL_64-NEXT: vmovq %xmm2, %xmm2 884; KNL_64-NEXT: vpxor %ymm3, %ymm3, %ymm3 885; KNL_64-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] 886; KNL_64-NEXT: vpmovsxdq %ymm2, %zmm2 887; KNL_64-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 888; KNL_64-NEXT: vptestmq %zmm2, %zmm2, %k1 889; KNL_64-NEXT: vscatterqps %ymm0, (,%zmm1) {%k1} 890; KNL_64-NEXT: retq 891; 892; KNL_32-LABEL: test20: 893; KNL_32: # BB#0: 894; KNL_32-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] 895; KNL_32-NEXT: vmovq %xmm2, %xmm2 896; KNL_32-NEXT: vpxor %ymm3, %ymm3, %ymm3 897; KNL_32-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] 898; KNL_32-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 899; KNL_32-NEXT: vpmovsxdq %ymm1, %zmm1 900; KNL_32-NEXT: vpmovsxdq %ymm2, %zmm2 901; KNL_32-NEXT: vpandq .LCPI19_0, %zmm2, %zmm2 902; KNL_32-NEXT: vptestmq %zmm2, %zmm2, %k1 903; KNL_32-NEXT: vscatterqps %ymm0, (,%zmm1) {%k1} 904; 
KNL_32-NEXT: retl 905; 906; SKX-LABEL: test20: 907; SKX: # BB#0: 908; SKX-NEXT: vpmovq2m %xmm2, %k0 909; SKX-NEXT: kshiftlw $2, %k0, %k0 910; SKX-NEXT: kshiftrw $2, %k0, %k1 911; SKX-NEXT: vscatterqps %xmm0, (,%ymm1) {%k1} 912; SKX-NEXT: retq 913 call void @llvm.masked.scatter.v2f32(<2 x float> %a1, <2 x float*> %ptr, i32 4, <2 x i1> %mask) 914 ret void 915} 916 917; Data type requires promotion 918define void @test21(<2 x i32>%a1, <2 x i32*> %ptr, <2 x i1>%mask) { 919; 920; KNL_64-LABEL: test21: 921; KNL_64: # BB#0: 922; KNL_64-NEXT: vpxord %zmm3, %zmm3, %zmm3 923; KNL_64-NEXT: vinserti32x4 $0, %xmm2, %zmm3, %zmm2 924; KNL_64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 925; KNL_64-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 926; KNL_64-NEXT: vptestmq %zmm2, %zmm2, %k1 927; KNL_64-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1} 928; KNL_64-NEXT: retq 929; 930; KNL_32-LABEL: test21: 931; KNL_32: # BB#0: 932; KNL_32-NEXT: vpxord %zmm3, %zmm3, %zmm3 933; KNL_32-NEXT: vinserti32x4 $0, %xmm2, %zmm3, %zmm2 934; KNL_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 935; KNL_32-NEXT: vpandq .LCPI20_0, %zmm2, %zmm2 936; KNL_32-NEXT: vptestmq %zmm2, %zmm2, %k1 937; KNL_32-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1} 938; KNL_32-NEXT: retl 939; 940; SKX-LABEL: test21: 941; SKX: # BB#0: 942; SKX-NEXT: vpmovq2m %xmm2, %k0 943; SKX-NEXT: kshiftlw $2, %k0, %k0 944; SKX-NEXT: kshiftrw $2, %k0, %k1 945; SKX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 946; SKX-NEXT: vpscatterqd %xmm0, (,%ymm1) {%k1} 947; SKX-NEXT: retq 948 call void @llvm.masked.scatter.v2i32(<2 x i32> %a1, <2 x i32*> %ptr, i32 4, <2 x i1> %mask) 949 ret void 950} 951 952; The result type requires widening 953declare <2 x float> @llvm.masked.gather.v2f32(<2 x float*>, i32, <2 x i1>, <2 x float>) 954 955define <2 x float> @test22(float* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x float> %src0) { 956; 957; 958; KNL_64-LABEL: test22: 959; KNL_64: # BB#0: 960; KNL_64-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 961; KNL_64-NEXT: vmovq 
%xmm1, %xmm1 962; KNL_64-NEXT: vpxor %ymm3, %ymm3, %ymm3 963; KNL_64-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] 964; KNL_64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 965; KNL_64-NEXT: vpmovsxdq %ymm0, %zmm0 966; KNL_64-NEXT: vpmovsxdq %ymm1, %zmm1 967; KNL_64-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm1, %zmm1 968; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k1 969; KNL_64-NEXT: vgatherqps (%rdi,%zmm0,4), %ymm2 {%k1} 970; KNL_64-NEXT: vmovaps %zmm2, %zmm0 971; KNL_64-NEXT: retq 972; 973; KNL_32-LABEL: test22: 974; KNL_32: # BB#0: 975; KNL_32-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 976; KNL_32-NEXT: vmovq %xmm1, %xmm1 977; KNL_32-NEXT: vpxor %ymm3, %ymm3, %ymm3 978; KNL_32-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] 979; KNL_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 980; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax 981; KNL_32-NEXT: vpmovsxdq %ymm0, %zmm0 982; KNL_32-NEXT: vpmovsxdq %ymm1, %zmm1 983; KNL_32-NEXT: vpandq .LCPI21_0, %zmm1, %zmm1 984; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k1 985; KNL_32-NEXT: vgatherqps (%eax,%zmm0,4), %ymm2 {%k1} 986; KNL_32-NEXT: vmovaps %zmm2, %zmm0 987; KNL_32-NEXT: retl 988; 989; SKX-LABEL: test22: 990; SKX: # BB#0: 991; SKX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 992; SKX-NEXT: vpmovq2m %xmm1, %k0 993; SKX-NEXT: kshiftlw $2, %k0, %k0 994; SKX-NEXT: kshiftrw $2, %k0, %k1 995; SKX-NEXT: vgatherdps (%rdi,%xmm0,4), %xmm2 {%k1} 996; SKX-NEXT: vmovaps %zmm2, %zmm0 997; SKX-NEXT: retq 998 %sext_ind = sext <2 x i32> %ind to <2 x i64> 999 %gep.random = getelementptr float, float* %base, <2 x i64> %sext_ind 1000 %res = call <2 x float> @llvm.masked.gather.v2f32(<2 x float*> %gep.random, i32 4, <2 x i1> %mask, <2 x float> %src0) 1001 ret <2 x float>%res 1002} 1003 1004declare <2 x i32> @llvm.masked.gather.v2i32(<2 x i32*>, i32, <2 x i1>, <2 x i32>) 1005declare <2 x i64> @llvm.masked.gather.v2i64(<2 x i64*>, i32, <2 x i1>, <2 x i64>) 1006 1007define <2 x i32> @test23(i32* %base, <2 x i32> %ind, <2 x i1> %mask, <2 
x i32> %src0) { 1008; 1009; KNL_64-LABEL: test23: 1010; KNL_64: # BB#0: 1011; KNL_64-NEXT: vpxord %zmm3, %zmm3, %zmm3 1012; KNL_64-NEXT: vinserti32x4 $0, %xmm1, %zmm3, %zmm1 1013; KNL_64-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm1, %zmm1 1014; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k1 1015; KNL_64-NEXT: vpgatherqq (%rdi,%zmm0,8), %zmm2 {%k1} 1016; KNL_64-NEXT: vmovaps %zmm2, %zmm0 1017; KNL_64-NEXT: retq 1018; 1019; KNL_32-LABEL: test23: 1020; KNL_32: # BB#0: 1021; KNL_32-NEXT: vpxord %zmm3, %zmm3, %zmm3 1022; KNL_32-NEXT: vinserti32x4 $0, %xmm1, %zmm3, %zmm1 1023; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax 1024; KNL_32-NEXT: vpandq .LCPI22_0, %zmm1, %zmm1 1025; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k1 1026; KNL_32-NEXT: vpgatherqq (%eax,%zmm0,8), %zmm2 {%k1} 1027; KNL_32-NEXT: vmovaps %zmm2, %zmm0 1028; KNL_32-NEXT: retl 1029; 1030; SKX-LABEL: test23: 1031; SKX: # BB#0: 1032; SKX-NEXT: vpmovq2m %xmm1, %k1 1033; SKX-NEXT: vpgatherqq (%rdi,%xmm0,8), %xmm2 {%k1} 1034; SKX-NEXT: vmovaps %zmm2, %zmm0 1035; SKX-NEXT: retq 1036 %sext_ind = sext <2 x i32> %ind to <2 x i64> 1037 %gep.random = getelementptr i32, i32* %base, <2 x i64> %sext_ind 1038 %res = call <2 x i32> @llvm.masked.gather.v2i32(<2 x i32*> %gep.random, i32 4, <2 x i1> %mask, <2 x i32> %src0) 1039 ret <2 x i32>%res 1040} 1041 1042define <2 x i32> @test24(i32* %base, <2 x i32> %ind) { 1043; 1044; 1045; KNL_64-LABEL: test24: 1046; KNL_64: # BB#0: 1047; KNL_64-NEXT: movb $3, %al 1048; KNL_64-NEXT: movzbl %al, %eax 1049; KNL_64-NEXT: kmovw %eax, %k1 1050; KNL_64-NEXT: vpgatherqq (%rdi,%zmm0,8), %zmm1 {%k1} 1051; KNL_64-NEXT: vmovaps %zmm1, %zmm0 1052; KNL_64-NEXT: retq 1053; 1054; KNL_32-LABEL: test24: 1055; KNL_32: # BB#0: 1056; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax 1057; KNL_32-NEXT: vpxord %zmm1, %zmm1, %zmm1 1058; KNL_32-NEXT: vinserti32x4 $0, .LCPI23_0, %zmm1, %zmm1 1059; KNL_32-NEXT: vpandq .LCPI23_1, %zmm1, %zmm1 1060; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k1 1061; KNL_32-NEXT: vpgatherqq (%eax,%zmm0,8), %zmm1 
{%k1} 1062; KNL_32-NEXT: vmovaps %zmm1, %zmm0 1063; KNL_32-NEXT: retl 1064; 1065; SKX-LABEL: test24: 1066; SKX: # BB#0: 1067; SKX-NEXT: kxnorw %k1, %k1, %k1 1068; SKX-NEXT: vpgatherqq (%rdi,%xmm0,8), %xmm1 {%k1} 1069; SKX-NEXT: vmovaps %zmm1, %zmm0 1070; SKX-NEXT: retq 1071 %sext_ind = sext <2 x i32> %ind to <2 x i64> 1072 %gep.random = getelementptr i32, i32* %base, <2 x i64> %sext_ind 1073 %res = call <2 x i32> @llvm.masked.gather.v2i32(<2 x i32*> %gep.random, i32 4, <2 x i1> <i1 true, i1 true>, <2 x i32> undef) 1074 ret <2 x i32>%res 1075} 1076 1077define <2 x i64> @test25(i64* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x i64> %src0) { 1078; 1079; KNL_64-LABEL: test25: 1080; KNL_64: # BB#0: 1081; KNL_64-NEXT: vpxord %zmm3, %zmm3, %zmm3 1082; KNL_64-NEXT: vinserti32x4 $0, %xmm1, %zmm3, %zmm1 1083; KNL_64-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm1, %zmm1 1084; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k1 1085; KNL_64-NEXT: vpgatherqq (%rdi,%zmm0,8), %zmm2 {%k1} 1086; KNL_64-NEXT: vmovaps %zmm2, %zmm0 1087; KNL_64-NEXT: retq 1088; 1089; KNL_32-LABEL: test25: 1090; KNL_32: # BB#0: 1091; KNL_32-NEXT: vpxord %zmm3, %zmm3, %zmm3 1092; KNL_32-NEXT: vinserti32x4 $0, %xmm1, %zmm3, %zmm1 1093; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax 1094; KNL_32-NEXT: vpandq .LCPI24_0, %zmm1, %zmm1 1095; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k1 1096; KNL_32-NEXT: vpgatherqq (%eax,%zmm0,8), %zmm2 {%k1} 1097; KNL_32-NEXT: vmovaps %zmm2, %zmm0 1098; KNL_32-NEXT: retl 1099; 1100; SKX-LABEL: test25: 1101; SKX: # BB#0: 1102; SKX-NEXT: vpmovq2m %xmm1, %k1 1103; SKX-NEXT: vpgatherqq (%rdi,%xmm0,8), %xmm2 {%k1} 1104; SKX-NEXT: vmovaps %zmm2, %zmm0 1105; SKX-NEXT: retq 1106 %sext_ind = sext <2 x i32> %ind to <2 x i64> 1107 %gep.random = getelementptr i64, i64* %base, <2 x i64> %sext_ind 1108 %res = call <2 x i64> @llvm.masked.gather.v2i64(<2 x i64*> %gep.random, i32 8, <2 x i1> %mask, <2 x i64> %src0) 1109 ret <2 x i64>%res 1110} 1111 1112define <2 x i64> @test26(i64* %base, <2 x i32> %ind, <2 x i64> %src0) { 
1113; 1114; KNL_64-LABEL: test26: 1115; KNL_64: # BB#0: 1116; KNL_64-NEXT: movb $3, %al 1117; KNL_64-NEXT: movzbl %al, %eax 1118; KNL_64-NEXT: kmovw %eax, %k1 1119; KNL_64-NEXT: vpgatherqq (%rdi,%zmm0,8), %zmm1 {%k1} 1120; KNL_64-NEXT: vmovaps %zmm1, %zmm0 1121; KNL_64-NEXT: retq 1122; 1123; KNL_32-LABEL: test26: 1124; KNL_32: # BB#0: 1125; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax 1126; KNL_32-NEXT: vpxord %zmm2, %zmm2, %zmm2 1127; KNL_32-NEXT: vinserti32x4 $0, .LCPI25_0, %zmm2, %zmm2 1128; KNL_32-NEXT: vpandq .LCPI25_1, %zmm2, %zmm2 1129; KNL_32-NEXT: vptestmq %zmm2, %zmm2, %k1 1130; KNL_32-NEXT: vpgatherqq (%eax,%zmm0,8), %zmm1 {%k1} 1131; KNL_32-NEXT: vmovaps %zmm1, %zmm0 1132; KNL_32-NEXT: retl 1133; 1134; SKX-LABEL: test26: 1135; SKX: # BB#0: 1136; SKX-NEXT: kxnorw %k1, %k1, %k1 1137; SKX-NEXT: vpgatherqq (%rdi,%xmm0,8), %xmm1 {%k1} 1138; SKX-NEXT: vmovaps %zmm1, %zmm0 1139; SKX-NEXT: retq 1140 %sext_ind = sext <2 x i32> %ind to <2 x i64> 1141 %gep.random = getelementptr i64, i64* %base, <2 x i64> %sext_ind 1142 %res = call <2 x i64> @llvm.masked.gather.v2i64(<2 x i64*> %gep.random, i32 8, <2 x i1> <i1 true, i1 true>, <2 x i64> %src0) 1143 ret <2 x i64>%res 1144} 1145 1146; Result type requires widening; all-ones mask 1147define <2 x float> @test27(float* %base, <2 x i32> %ind) { 1148; 1149; KNL_64-LABEL: test27: 1150; KNL_64: # BB#0: 1151; KNL_64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 1152; KNL_64-NEXT: vpmovsxdq %ymm0, %zmm1 1153; KNL_64-NEXT: movb $3, %al 1154; KNL_64-NEXT: movzbl %al, %eax 1155; KNL_64-NEXT: kmovw %eax, %k1 1156; KNL_64-NEXT: vgatherqps (%rdi,%zmm1,4), %ymm0 {%k1} 1157; KNL_64-NEXT: retq 1158; 1159; KNL_32-LABEL: test27: 1160; KNL_32: # BB#0: 1161; KNL_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 1162; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax 1163; KNL_32-NEXT: vpmovsxdq %ymm0, %zmm1 1164; KNL_32-NEXT: movb $3, %cl 1165; KNL_32-NEXT: movzbl %cl, %ecx 1166; KNL_32-NEXT: kmovw %ecx, %k1 1167; KNL_32-NEXT: vgatherqps (%eax,%zmm1,4), 
%ymm0 {%k1} 1168; KNL_32-NEXT: retl 1169; 1170; SKX-LABEL: test27: 1171; SKX: # BB#0: 1172; SKX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,2,2,3] 1173; SKX-NEXT: movb $3, %al 1174; SKX-NEXT: kmovb %eax, %k1 1175; SKX-NEXT: vgatherdps (%rdi,%xmm1,4), %xmm0 {%k1} 1176; SKX-NEXT: retq 1177 %sext_ind = sext <2 x i32> %ind to <2 x i64> 1178 %gep.random = getelementptr float, float* %base, <2 x i64> %sext_ind 1179 %res = call <2 x float> @llvm.masked.gather.v2f32(<2 x float*> %gep.random, i32 4, <2 x i1> <i1 true, i1 true>, <2 x float> undef) 1180 ret <2 x float>%res 1181} 1182 1183; Data type requires promotion, mask is all-ones 1184define void @test28(<2 x i32>%a1, <2 x i32*> %ptr) { 1185; 1186; 1187; KNL_64-LABEL: test28: 1188; KNL_64: # BB#0: 1189; KNL_64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 1190; KNL_64-NEXT: movb $3, %al 1191; KNL_64-NEXT: movzbl %al, %eax 1192; KNL_64-NEXT: kmovw %eax, %k1 1193; KNL_64-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1} 1194; KNL_64-NEXT: retq 1195; 1196; KNL_32-LABEL: test28: 1197; KNL_32: # BB#0: 1198; KNL_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 1199; KNL_32-NEXT: vpxord %zmm2, %zmm2, %zmm2 1200; KNL_32-NEXT: vinserti32x4 $0, .LCPI27_0, %zmm2, %zmm2 1201; KNL_32-NEXT: vpandq .LCPI27_1, %zmm2, %zmm2 1202; KNL_32-NEXT: vptestmq %zmm2, %zmm2, %k1 1203; KNL_32-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1} 1204; KNL_32-NEXT: retl 1205; 1206; SKX-LABEL: test28: 1207; SKX: # BB#0: 1208; SKX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 1209; SKX-NEXT: movb $3, %al 1210; SKX-NEXT: kmovb %eax, %k1 1211; SKX-NEXT: vpscatterqd %xmm0, (,%ymm1) {%k1} 1212; SKX-NEXT: retq 1213 call void @llvm.masked.scatter.v2i32(<2 x i32> %a1, <2 x i32*> %ptr, i32 4, <2 x i1> <i1 true, i1 true>) 1214 ret void 1215} 1216 1217 1218; SCALAR-LABEL: test29 1219; SCALAR: extractelement <16 x float*> 1220; SCALAR-NEXT: load float 1221; SCALAR-NEXT: insertelement <16 x float> 1222; SCALAR-NEXT: extractelement <16 x float*> 1223; SCALAR-NEXT: load float 1224 1225define <16 x 
float> @test29(float* %base, <16 x i32> %ind) { 1226; KNL_64-LABEL: test29: 1227; KNL_64: # BB#0: 1228; KNL_64-NEXT: movw $44, %ax 1229; KNL_64-NEXT: kmovw %eax, %k1 1230; KNL_64-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1} 1231; KNL_64-NEXT: vmovaps %zmm1, %zmm0 1232; KNL_64-NEXT: retq 1233; 1234; KNL_32-LABEL: test29: 1235; KNL_32: # BB#0: 1236; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax 1237; KNL_32-NEXT: movw $44, %cx 1238; KNL_32-NEXT: kmovw %ecx, %k1 1239; KNL_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1} 1240; KNL_32-NEXT: vmovaps %zmm1, %zmm0 1241; KNL_32-NEXT: retl 1242; 1243; SKX-LABEL: test29: 1244; SKX: # BB#0: 1245; SKX-NEXT: movw $44, %ax 1246; SKX-NEXT: kmovw %eax, %k1 1247; SKX-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1} 1248; SKX-NEXT: vmovaps %zmm1, %zmm0 1249; SKX-NEXT: retq 1250 1251 %broadcast.splatinsert = insertelement <16 x float*> undef, float* %base, i32 0 1252 %broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer 1253 1254 %sext_ind = sext <16 x i32> %ind to <16 x i64> 1255 %gep.random = getelementptr float, <16 x float*> %broadcast.splat, <16 x i64> %sext_ind 1256 1257 %res = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 false, i1 false, i1 true, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false>, <16 x float> undef) 1258 ret <16 x float>%res 1259} 1260 1261; Check non-power-of-2 case. It should be scalarized. 
1262declare <3 x i32> @llvm.masked.gather.v3i32(<3 x i32*>, i32, <3 x i1>, <3 x i32>) 1263define <3 x i32> @test30(<3 x i32*> %base, <3 x i32> %ind, <3 x i1> %mask, <3 x i32> %src0) { 1264; KNL_64-LABEL: test30: 1265; KNL_64: # BB#0: 1266; KNL_64-NEXT: andl $1, %edx 1267; KNL_64-NEXT: kmovw %edx, %k1 1268; KNL_64-NEXT: andl $1, %esi 1269; KNL_64-NEXT: kmovw %esi, %k2 1270; KNL_64-NEXT: movl %edi, %eax 1271; KNL_64-NEXT: andl $1, %eax 1272; KNL_64-NEXT: kmovw %eax, %k0 1273; KNL_64-NEXT: vpmovsxdq %xmm1, %ymm1 1274; KNL_64-NEXT: vpsllq $2, %ymm1, %ymm1 1275; KNL_64-NEXT: vpaddq %ymm1, %ymm0, %ymm1 1276; KNL_64-NEXT: # implicit-def: %XMM0 1277; KNL_64-NEXT: testb $1, %dil 1278; KNL_64-NEXT: je .LBB29_2 1279; KNL_64-NEXT: # BB#1: # %cond.load 1280; KNL_64-NEXT: vmovq %xmm1, %rax 1281; KNL_64-NEXT: vmovd (%rax), %xmm0 1282; KNL_64-NEXT: .LBB29_2: # %else 1283; KNL_64-NEXT: kmovw %k2, %eax 1284; KNL_64-NEXT: movl %eax, %ecx 1285; KNL_64-NEXT: andl $1, %ecx 1286; KNL_64-NEXT: testb %cl, %cl 1287; KNL_64-NEXT: je .LBB29_4 1288; KNL_64-NEXT: # BB#3: # %cond.load1 1289; KNL_64-NEXT: vpextrq $1, %xmm1, %rcx 1290; KNL_64-NEXT: vpinsrd $1, (%rcx), %xmm0, %xmm0 1291; KNL_64-NEXT: .LBB29_4: # %else2 1292; KNL_64-NEXT: kmovw %k1, %ecx 1293; KNL_64-NEXT: movl %ecx, %edx 1294; KNL_64-NEXT: andl $1, %edx 1295; KNL_64-NEXT: testb %dl, %dl 1296; KNL_64-NEXT: je .LBB29_6 1297; KNL_64-NEXT: # BB#5: # %cond.load4 1298; KNL_64-NEXT: vextracti128 $1, %ymm1, %xmm1 1299; KNL_64-NEXT: vmovq %xmm1, %rdx 1300; KNL_64-NEXT: vpinsrd $2, (%rdx), %xmm0, %xmm0 1301; KNL_64-NEXT: .LBB29_6: # %else5 1302; KNL_64-NEXT: kmovw %k0, %edx 1303; KNL_64-NEXT: vmovd %edx, %xmm1 1304; KNL_64-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1 1305; KNL_64-NEXT: vpinsrd $2, %ecx, %xmm1, %xmm1 1306; KNL_64-NEXT: vpslld $31, %xmm1, %xmm1 1307; KNL_64-NEXT: vblendvps %xmm1, %xmm0, %xmm2, %xmm0 1308; KNL_64-NEXT: retq 1309; 1310; KNL_32-LABEL: test30: 1311; KNL_32: # BB#0: 1312; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax 1313; 
KNL_32-NEXT: andl $1, %eax 1314; KNL_32-NEXT: kmovw %eax, %k1 1315; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax 1316; KNL_32-NEXT: andl $1, %eax 1317; KNL_32-NEXT: kmovw %eax, %k2 1318; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax 1319; KNL_32-NEXT: movl %eax, %ecx 1320; KNL_32-NEXT: andl $1, %ecx 1321; KNL_32-NEXT: kmovw %ecx, %k0 1322; KNL_32-NEXT: vpslld $2, %xmm1, %xmm1 1323; KNL_32-NEXT: vpaddd %xmm1, %xmm0, %xmm1 1324; KNL_32-NEXT: # implicit-def: %XMM0 1325; KNL_32-NEXT: testb $1, %al 1326; KNL_32-NEXT: je .LBB29_2 1327; KNL_32-NEXT: # BB#1: # %cond.load 1328; KNL_32-NEXT: vmovd %xmm1, %eax 1329; KNL_32-NEXT: vmovd (%eax), %xmm0 1330; KNL_32-NEXT: .LBB29_2: # %else 1331; KNL_32-NEXT: kmovw %k2, %eax 1332; KNL_32-NEXT: movl %eax, %ecx 1333; KNL_32-NEXT: andl $1, %ecx 1334; KNL_32-NEXT: testb %cl, %cl 1335; KNL_32-NEXT: je .LBB29_4 1336; KNL_32-NEXT: # BB#3: # %cond.load1 1337; KNL_32-NEXT: vpextrd $1, %xmm1, %ecx 1338; KNL_32-NEXT: vpinsrd $1, (%ecx), %xmm0, %xmm0 1339; KNL_32-NEXT: .LBB29_4: # %else2 1340; KNL_32-NEXT: kmovw %k1, %ecx 1341; KNL_32-NEXT: movl %ecx, %edx 1342; KNL_32-NEXT: andl $1, %edx 1343; KNL_32-NEXT: testb %dl, %dl 1344; KNL_32-NEXT: je .LBB29_6 1345; KNL_32-NEXT: # BB#5: # %cond.load4 1346; KNL_32-NEXT: vpextrd $2, %xmm1, %edx 1347; KNL_32-NEXT: vpinsrd $2, (%edx), %xmm0, %xmm0 1348; KNL_32-NEXT: .LBB29_6: # %else5 1349; KNL_32-NEXT: kmovw %k0, %edx 1350; KNL_32-NEXT: vmovd %edx, %xmm1 1351; KNL_32-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1 1352; KNL_32-NEXT: vpinsrd $2, %ecx, %xmm1, %xmm1 1353; KNL_32-NEXT: vpslld $31, %xmm1, %xmm1 1354; KNL_32-NEXT: vblendvps %xmm1, %xmm0, %xmm2, %xmm0 1355; KNL_32-NEXT: retl 1356; 1357; SKX-LABEL: test30: 1358; SKX: # BB#0: 1359; SKX-NEXT: vpmovd2m %xmm2, %k1 1360; SKX-NEXT: kmovb %k1, -{{[0-9]+}}(%rsp) 1361; SKX-NEXT: vpmovsxdq %xmm1, %ymm1 1362; SKX-NEXT: vpsllq $2, %ymm1, %ymm1 1363; SKX-NEXT: vpaddq %ymm1, %ymm0, %ymm1 1364; SKX-NEXT: movb -{{[0-9]+}}(%rsp), %al 1365; SKX-NEXT: # implicit-def: %XMM0 1366; 
SKX-NEXT: andb $1, %al 1367; SKX-NEXT: je .LBB29_2 1368; SKX-NEXT: # BB#1: # %cond.load 1369; SKX-NEXT: vmovq %xmm1, %rax 1370; SKX-NEXT: vmovd (%rax), %xmm0 1371; SKX-NEXT: .LBB29_2: # %else 1372; SKX-NEXT: kmovb %k1, -{{[0-9]+}}(%rsp) 1373; SKX-NEXT: movb -{{[0-9]+}}(%rsp), %al 1374; SKX-NEXT: andb $1, %al 1375; SKX-NEXT: je .LBB29_4 1376; SKX-NEXT: # BB#3: # %cond.load1 1377; SKX-NEXT: vpextrq $1, %xmm1, %rax 1378; SKX-NEXT: vpinsrd $1, (%rax), %xmm0, %xmm0 1379; SKX-NEXT: .LBB29_4: # %else2 1380; SKX-NEXT: kmovb %k1, -{{[0-9]+}}(%rsp) 1381; SKX-NEXT: movb -{{[0-9]+}}(%rsp), %al 1382; SKX-NEXT: andb $1, %al 1383; SKX-NEXT: je .LBB29_6 1384; SKX-NEXT: # BB#5: # %cond.load4 1385; SKX-NEXT: vextracti128 $1, %ymm1, %xmm1 1386; SKX-NEXT: vmovq %xmm1, %rax 1387; SKX-NEXT: vpinsrd $2, (%rax), %xmm0, %xmm0 1388; SKX-NEXT: .LBB29_6: # %else5 1389; SKX-NEXT: vmovdqa32 %xmm0, %xmm3 {%k1} 1390; SKX-NEXT: vmovaps %zmm3, %zmm0 1391; SKX-NEXT: retq 1392 1393 %sext_ind = sext <3 x i32> %ind to <3 x i64> 1394 %gep.random = getelementptr i32, <3 x i32*> %base, <3 x i64> %sext_ind 1395 %res = call <3 x i32> @llvm.masked.gather.v3i32(<3 x i32*> %gep.random, i32 4, <3 x i1> %mask, <3 x i32> %src0) 1396 ret <3 x i32>%res 1397} 1398 1399declare <16 x float*> @llvm.masked.gather.v16p0f32(<16 x float**>, i32, <16 x i1>, <16 x float*>) 1400 1401; KNL-LABEL: test31 1402; KNL: vpgatherqq 1403; KNL: vpgatherqq 1404define <16 x float*> @test31(<16 x float**> %ptrs) { 1405; KNL_64-LABEL: test31: 1406; KNL_64: # BB#0: 1407; KNL_64-NEXT: kxnorw %k1, %k1, %k1 1408; KNL_64-NEXT: kxnorw %k2, %k2, %k2 1409; KNL_64-NEXT: vpgatherqq (,%zmm0), %zmm2 {%k2} 1410; KNL_64-NEXT: kshiftrw $8, %k1, %k1 1411; KNL_64-NEXT: vpgatherqq (,%zmm1), %zmm3 {%k1} 1412; KNL_64-NEXT: vmovaps %zmm2, %zmm0 1413; KNL_64-NEXT: vmovaps %zmm3, %zmm1 1414; KNL_64-NEXT: retq 1415; 1416; KNL_32-LABEL: test31: 1417; KNL_32: # BB#0: 1418; KNL_32-NEXT: kxnorw %k1, %k1, %k1 1419; KNL_32-NEXT: vpgatherdd (,%zmm0), %zmm1 {%k1} 1420; 
KNL_32-NEXT: vmovaps %zmm1, %zmm0 1421; KNL_32-NEXT: retl 1422; 1423; SKX-LABEL: test31: 1424; SKX: # BB#0: 1425; SKX-NEXT: kxnorw %k1, %k1, %k1 1426; SKX-NEXT: kxnorw %k2, %k2, %k2 1427; SKX-NEXT: vpgatherqq (,%zmm0), %zmm2 {%k2} 1428; SKX-NEXT: kshiftrw $8, %k1, %k1 1429; SKX-NEXT: vpgatherqq (,%zmm1), %zmm3 {%k1} 1430; SKX-NEXT: vmovaps %zmm2, %zmm0 1431; SKX-NEXT: vmovaps %zmm3, %zmm1 1432; SKX-NEXT: retq 1433; 1434; SKX_32-LABEL: test31: 1435; SKX_32: # BB#0: 1436; SKX_32-NEXT: kxnorw %k1, %k1, %k1 1437; SKX_32-NEXT: vpgatherdd (,%zmm0), %zmm1 {%k1} 1438; SKX_32-NEXT: vmovaps %zmm1, %zmm0 1439; SKX_32-NEXT: retl 1440 1441 %res = call <16 x float*> @llvm.masked.gather.v16p0f32(<16 x float**> %ptrs, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float*> undef) 1442 ret <16 x float*>%res 1443} 1444 1445define <16 x i32> @test_gather_16i32(<16 x i32*> %ptrs, <16 x i1> %mask, <16 x i32> %src0) { 1446; KNL_64-LABEL: test_gather_16i32: 1447; KNL_64: # BB#0: 1448; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2 1449; KNL_64-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2 1450; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1 1451; KNL_64-NEXT: vextracti64x4 $1, %zmm3, %ymm2 1452; KNL_64-NEXT: kshiftrw $8, %k1, %k2 1453; KNL_64-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k2} 1454; KNL_64-NEXT: vpgatherqd (,%zmm0), %ymm3 {%k1} 1455; KNL_64-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm0 1456; KNL_64-NEXT: retq 1457; 1458; KNL_32-LABEL: test_gather_16i32: 1459; KNL_32: # BB#0: 1460; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1 1461; KNL_32-NEXT: vpandd .LCPI31_0{1to16}, %zmm1, %zmm1 1462; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1 1463; KNL_32-NEXT: vpgatherdd (,%zmm0), %zmm2 {%k1} 1464; KNL_32-NEXT: vmovaps %zmm2, %zmm0 1465; KNL_32-NEXT: retl 1466; 1467; SKX-LABEL: test_gather_16i32: 1468; SKX: # BB#0: 1469; SKX-NEXT: vpmovsxbd %xmm2, %zmm2 1470; SKX-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2 
1471; SKX-NEXT: vptestmd %zmm2, %zmm2, %k1 1472; SKX-NEXT: vextracti32x8 $1, %zmm3, %ymm2 1473; SKX-NEXT: kshiftrw $8, %k1, %k2 1474; SKX-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k2} 1475; SKX-NEXT: vpgatherqd (,%zmm0), %ymm3 {%k1} 1476; SKX-NEXT: vinserti32x8 $1, %ymm2, %zmm3, %zmm0 1477; SKX-NEXT: retq 1478; 1479; SKX_32-LABEL: test_gather_16i32: 1480; SKX_32: # BB#0: 1481; SKX_32-NEXT: vpmovsxbd %xmm1, %zmm1 1482; SKX_32-NEXT: vpandd .LCPI31_0{1to16}, %zmm1, %zmm1 1483; SKX_32-NEXT: vptestmd %zmm1, %zmm1, %k1 1484; SKX_32-NEXT: vpgatherdd (,%zmm0), %zmm2 {%k1} 1485; SKX_32-NEXT: vmovaps %zmm2, %zmm0 1486; SKX_32-NEXT: retl 1487 %res = call <16 x i32> @llvm.masked.gather.v16i32(<16 x i32*> %ptrs, i32 4, <16 x i1> %mask, <16 x i32> %src0) 1488 ret <16 x i32> %res 1489} 1490define <16 x i64> @test_gather_16i64(<16 x i64*> %ptrs, <16 x i1> %mask, <16 x i64> %src0) { 1491; KNL_64-LABEL: test_gather_16i64: 1492; KNL_64: # BB#0: 1493; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2 1494; KNL_64-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2 1495; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1 1496; KNL_64-NEXT: kshiftrw $8, %k1, %k2 1497; KNL_64-NEXT: vpgatherqq (,%zmm0), %zmm3 {%k1} 1498; KNL_64-NEXT: vpgatherqq (,%zmm1), %zmm4 {%k2} 1499; KNL_64-NEXT: vmovaps %zmm3, %zmm0 1500; KNL_64-NEXT: vmovaps %zmm4, %zmm1 1501; KNL_64-NEXT: retq 1502; 1503; KNL_32-LABEL: test_gather_16i64: 1504; KNL_32: # BB#0: 1505; KNL_32-NEXT: pushl %ebp 1506; KNL_32-NEXT: .Ltmp0: 1507; KNL_32-NEXT: .cfi_def_cfa_offset 8 1508; KNL_32-NEXT: .Ltmp1: 1509; KNL_32-NEXT: .cfi_offset %ebp, -8 1510; KNL_32-NEXT: movl %esp, %ebp 1511; KNL_32-NEXT: .Ltmp2: 1512; KNL_32-NEXT: .cfi_def_cfa_register %ebp 1513; KNL_32-NEXT: andl $-64, %esp 1514; KNL_32-NEXT: subl $64, %esp 1515; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1 1516; KNL_32-NEXT: vpandd .LCPI32_0{1to16}, %zmm1, %zmm1 1517; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1 1518; KNL_32-NEXT: vmovdqa64 8(%ebp), %zmm1 1519; KNL_32-NEXT: kshiftrw $8, %k1, %k2 1520; KNL_32-NEXT: vpgatherdq 
(,%ymm0), %zmm2 {%k1} 1521; KNL_32-NEXT: vextracti64x4 $1, %zmm0, %ymm0 1522; KNL_32-NEXT: vpgatherdq (,%ymm0), %zmm1 {%k2} 1523; KNL_32-NEXT: vmovaps %zmm2, %zmm0 1524; KNL_32-NEXT: movl %ebp, %esp 1525; KNL_32-NEXT: popl %ebp 1526; KNL_32-NEXT: retl 1527; 1528; SKX-LABEL: test_gather_16i64: 1529; SKX: # BB#0: 1530; SKX-NEXT: vpmovsxbd %xmm2, %zmm2 1531; SKX-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2 1532; SKX-NEXT: vptestmd %zmm2, %zmm2, %k1 1533; SKX-NEXT: kshiftrw $8, %k1, %k2 1534; SKX-NEXT: vpgatherqq (,%zmm0), %zmm3 {%k1} 1535; SKX-NEXT: vpgatherqq (,%zmm1), %zmm4 {%k2} 1536; SKX-NEXT: vmovaps %zmm3, %zmm0 1537; SKX-NEXT: vmovaps %zmm4, %zmm1 1538; SKX-NEXT: retq 1539 %res = call <16 x i64> @llvm.masked.gather.v16i64(<16 x i64*> %ptrs, i32 4, <16 x i1> %mask, <16 x i64> %src0) 1540 ret <16 x i64> %res 1541} 1542declare <16 x i64> @llvm.masked.gather.v16i64(<16 x i64*> %ptrs, i32, <16 x i1> %mask, <16 x i64> %src0) 1543define <16 x float> @test_gather_16f32(<16 x float*> %ptrs, <16 x i1> %mask, <16 x float> %src0) { 1544; KNL_64-LABEL: test_gather_16f32: 1545; KNL_64: # BB#0: 1546; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2 1547; KNL_64-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2 1548; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1 1549; KNL_64-NEXT: vextractf64x4 $1, %zmm3, %ymm2 1550; KNL_64-NEXT: kshiftrw $8, %k1, %k2 1551; KNL_64-NEXT: vgatherqps (,%zmm1), %ymm2 {%k2} 1552; KNL_64-NEXT: vgatherqps (,%zmm0), %ymm3 {%k1} 1553; KNL_64-NEXT: vinsertf64x4 $1, %ymm2, %zmm3, %zmm0 1554; KNL_64-NEXT: retq 1555; 1556; KNL_32-LABEL: test_gather_16f32: 1557; KNL_32: # BB#0: 1558; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1 1559; KNL_32-NEXT: vpandd .LCPI33_0{1to16}, %zmm1, %zmm1 1560; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1 1561; KNL_32-NEXT: vgatherdps (,%zmm0), %zmm2 {%k1} 1562; KNL_32-NEXT: vmovaps %zmm2, %zmm0 1563; KNL_32-NEXT: retl 1564; 1565; SKX-LABEL: test_gather_16f32: 1566; SKX: # BB#0: 1567; SKX-NEXT: vpmovsxbd %xmm2, %zmm2 1568; SKX-NEXT: vpandd {{.*}}(%rip){1to16}, 
%zmm2, %zmm2
; SKX-NEXT: vptestmd %zmm2, %zmm2, %k1
; SKX-NEXT: vextractf32x8 $1, %zmm3, %ymm2
; SKX-NEXT: kshiftrw $8, %k1, %k2
; SKX-NEXT: vgatherqps (,%zmm1), %ymm2 {%k2}
; SKX-NEXT: vgatherqps (,%zmm0), %ymm3 {%k1}
; SKX-NEXT: vinsertf32x8 $1, %ymm2, %zmm3, %zmm0
; SKX-NEXT: retq
  %res = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %ptrs, i32 4, <16 x i1> %mask, <16 x float> %src0)
  ret <16 x float> %res
}

; Masked gather of 16 doubles through a full <16 x double*> pointer vector.
; The <16 x i1> mask arrives packed in an xmm register, so codegen
; sign-extends it (vpmovsxbd), isolates bit 0 (vpandd {1to16}), and tests it
; into %k1. A zmm gather covers only 8 x i64 pointers, so the mask is split
; with kshiftrw and two vgatherqpd instructions are issued.
; NOTE: CHECK lines are autogenerated by update_llc_test_checks.py; do not
; hand-edit them.
define <16 x double> @test_gather_16f64(<16 x double*> %ptrs, <16 x i1> %mask, <16 x double> %src0) {
; KNL_64-LABEL: test_gather_16f64:
; KNL_64: # BB#0:
; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2
; KNL_64-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1
; KNL_64-NEXT: kshiftrw $8, %k1, %k2
; KNL_64-NEXT: vgatherqpd (,%zmm0), %zmm3 {%k1}
; KNL_64-NEXT: vgatherqpd (,%zmm1), %zmm4 {%k2}
; KNL_64-NEXT: vmovaps %zmm3, %zmm0
; KNL_64-NEXT: vmovaps %zmm4, %zmm1
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test_gather_16f64:
; KNL_32: # BB#0:
; KNL_32-NEXT: pushl %ebp
; KNL_32-NEXT: .Ltmp3:
; KNL_32-NEXT: .cfi_def_cfa_offset 8
; KNL_32-NEXT: .Ltmp4:
; KNL_32-NEXT: .cfi_offset %ebp, -8
; KNL_32-NEXT: movl %esp, %ebp
; KNL_32-NEXT: .Ltmp5:
; KNL_32-NEXT: .cfi_def_cfa_register %ebp
; KNL_32-NEXT: andl $-64, %esp
; KNL_32-NEXT: subl $64, %esp
; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1
; KNL_32-NEXT: vpandd .LCPI34_0{1to16}, %zmm1, %zmm1
; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1
; KNL_32-NEXT: vmovapd 8(%ebp), %zmm1
; KNL_32-NEXT: kshiftrw $8, %k1, %k2
; KNL_32-NEXT: vgatherdpd (,%ymm0), %zmm2 {%k1}
; KNL_32-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; KNL_32-NEXT: vgatherdpd (,%ymm0), %zmm1 {%k2}
; KNL_32-NEXT: vmovaps %zmm2, %zmm0
; KNL_32-NEXT: movl %ebp, %esp
; KNL_32-NEXT: popl %ebp
; KNL_32-NEXT: retl
;
; SKX-LABEL: test_gather_16f64:
; SKX: # BB#0:
; SKX-NEXT: vpmovsxbd %xmm2, %zmm2
; SKX-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
; SKX-NEXT: vptestmd %zmm2, %zmm2, %k1
; SKX-NEXT: kshiftrw $8, %k1, %k2
; SKX-NEXT: vgatherqpd (,%zmm0), %zmm3 {%k1}
; SKX-NEXT: vgatherqpd (,%zmm1), %zmm4 {%k2}
; SKX-NEXT: vmovaps %zmm3, %zmm0
; SKX-NEXT: vmovaps %zmm4, %zmm1
; SKX-NEXT: retq
  %res = call <16 x double> @llvm.masked.gather.v16f64(<16 x double*> %ptrs, i32 4, <16 x i1> %mask, <16 x double> %src0)
  ret <16 x double> %res
}
declare <16 x double> @llvm.masked.gather.v16f64(<16 x double*> %ptrs, i32, <16 x i1> %mask, <16 x double> %src0)

; Masked scatter of 16 i32 values. On 64-bit targets the i64 pointer vector
; occupies two zmm registers, so the mask is split (kshiftrw) and two
; vpscatterqd are emitted; on 32-bit targets the 16 i32 pointers fit one zmm
; and a single vpscatterdd suffices.
define void @test_scatter_16i32(<16 x i32*> %ptrs, <16 x i1> %mask, <16 x i32> %src0) {
; KNL_64-LABEL: test_scatter_16i32:
; KNL_64: # BB#0:
; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2
; KNL_64-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1
; KNL_64-NEXT: kshiftrw $8, %k1, %k2
; KNL_64-NEXT: vpscatterqd %ymm3, (,%zmm0) {%k1}
; KNL_64-NEXT: vextracti64x4 $1, %zmm3, %ymm0
; KNL_64-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k2}
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test_scatter_16i32:
; KNL_32: # BB#0:
; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1
; KNL_32-NEXT: vpandd .LCPI35_0{1to16}, %zmm1, %zmm1
; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1
; KNL_32-NEXT: vpscatterdd %zmm2, (,%zmm0) {%k1}
; KNL_32-NEXT: retl
;
; SKX-LABEL: test_scatter_16i32:
; SKX: # BB#0:
; SKX-NEXT: vpmovsxbd %xmm2, %zmm2
; SKX-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
; SKX-NEXT: vptestmd %zmm2, %zmm2, %k1
; SKX-NEXT: kshiftrw $8, %k1, %k2
; SKX-NEXT: vpscatterqd %ymm3, (,%zmm0) {%k1}
; SKX-NEXT: vextracti32x8 $1, %zmm3, %ymm0
; SKX-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k2}
; SKX-NEXT: retq
;
; SKX_32-LABEL: test_scatter_16i32:
; SKX_32: # BB#0:
; SKX_32-NEXT: vpmovsxbd %xmm1, %zmm1
; SKX_32-NEXT: vpandd .LCPI35_0{1to16}, %zmm1, %zmm1
; SKX_32-NEXT: vptestmd %zmm1, %zmm1, %k1
; SKX_32-NEXT: vpscatterdd %zmm2, (,%zmm0) {%k1}
; SKX_32-NEXT: retl
  call void @llvm.masked.scatter.v16i32(<16 x i32> %src0, <16 x i32*> %ptrs, i32 4, <16 x i1> %mask)
  ret void
}

; Masked scatter of 16 i64 values: both the pointers and the data need two
; zmm registers each, so the split mask drives two vpscatterqq. The KNL_32
; version realigns the stack (andl $-64) to load the zmm argument passed in
; memory at 8(%ebp).
define void @test_scatter_16i64(<16 x i64*> %ptrs, <16 x i1> %mask, <16 x i64> %src0) {
; KNL_64-LABEL: test_scatter_16i64:
; KNL_64: # BB#0:
; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2
; KNL_64-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1
; KNL_64-NEXT: kshiftrw $8, %k1, %k2
; KNL_64-NEXT: vpscatterqq %zmm3, (,%zmm0) {%k1}
; KNL_64-NEXT: vpscatterqq %zmm4, (,%zmm1) {%k2}
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test_scatter_16i64:
; KNL_32: # BB#0:
; KNL_32-NEXT: pushl %ebp
; KNL_32-NEXT: .Ltmp6:
; KNL_32-NEXT: .cfi_def_cfa_offset 8
; KNL_32-NEXT: .Ltmp7:
; KNL_32-NEXT: .cfi_offset %ebp, -8
; KNL_32-NEXT: movl %esp, %ebp
; KNL_32-NEXT: .Ltmp8:
; KNL_32-NEXT: .cfi_def_cfa_register %ebp
; KNL_32-NEXT: andl $-64, %esp
; KNL_32-NEXT: subl $64, %esp
; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1
; KNL_32-NEXT: vpandd .LCPI36_0{1to16}, %zmm1, %zmm1
; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1
; KNL_32-NEXT: vmovdqa64 8(%ebp), %zmm1
; KNL_32-NEXT: kshiftrw $8, %k1, %k2
; KNL_32-NEXT: vpscatterdq %zmm2, (,%ymm0) {%k1}
; KNL_32-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; KNL_32-NEXT: vpscatterdq %zmm1, (,%ymm0) {%k2}
; KNL_32-NEXT: movl %ebp, %esp
; KNL_32-NEXT: popl %ebp
; KNL_32-NEXT: retl
;
; SKX-LABEL: test_scatter_16i64:
; SKX: # BB#0:
; SKX-NEXT: vpmovsxbd %xmm2, %zmm2
; SKX-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
; SKX-NEXT: vptestmd %zmm2, %zmm2, %k1
; SKX-NEXT: kshiftrw $8, %k1, %k2
; SKX-NEXT: vpscatterqq %zmm3, (,%zmm0) {%k1}
; SKX-NEXT: vpscatterqq %zmm4, (,%zmm1) {%k2}
; SKX-NEXT: retq
  call void @llvm.masked.scatter.v16i64(<16 x i64> %src0, <16 x i64*> %ptrs, i32 4, <16 x i1> %mask)
  ret void
}
declare void @llvm.masked.scatter.v16i64(<16 x i64> %src0, <16 x i64*> %ptrs, i32, <16 x i1> %mask)

; Masked scatter of 16 floats: data fits one zmm but the 64-bit pointers need
; two, so the upper data half is extracted to a ymm and scattered with the
; shifted mask half (%k2).
define void @test_scatter_16f32(<16 x float*> %ptrs, <16 x i1> %mask, <16 x float> %src0) {
; KNL_64-LABEL: test_scatter_16f32:
; KNL_64: # BB#0:
; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2
; KNL_64-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1
; KNL_64-NEXT: kshiftrw $8, %k1, %k2
; KNL_64-NEXT: vscatterqps %ymm3, (,%zmm0) {%k1}
; KNL_64-NEXT: vextractf64x4 $1, %zmm3, %ymm0
; KNL_64-NEXT: vscatterqps %ymm0, (,%zmm1) {%k2}
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test_scatter_16f32:
; KNL_32: # BB#0:
; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1
; KNL_32-NEXT: vpandd .LCPI37_0{1to16}, %zmm1, %zmm1
; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1
; KNL_32-NEXT: vscatterdps %zmm2, (,%zmm0) {%k1}
; KNL_32-NEXT: retl
;
; SKX-LABEL: test_scatter_16f32:
; SKX: # BB#0:
; SKX-NEXT: vpmovsxbd %xmm2, %zmm2
; SKX-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
; SKX-NEXT: vptestmd %zmm2, %zmm2, %k1
; SKX-NEXT: kshiftrw $8, %k1, %k2
; SKX-NEXT: vscatterqps %ymm3, (,%zmm0) {%k1}
; SKX-NEXT: vextractf32x8 $1, %zmm3, %ymm0
; SKX-NEXT: vscatterqps %ymm0, (,%zmm1) {%k2}
; SKX-NEXT: retq
  call void @llvm.masked.scatter.v16f32(<16 x float> %src0, <16 x float*> %ptrs, i32 4, <16 x i1> %mask)
  ret void
}
declare void @llvm.masked.scatter.v16f32(<16 x float> %src0, <16 x float*> %ptrs, i32, <16 x i1> %mask)

; Masked scatter of 16 doubles: mirrors test_scatter_16i64 with vscatterqpd /
; vscatterdpd, including the realigned-stack zmm argument load on KNL_32.
define void @test_scatter_16f64(<16 x double*> %ptrs, <16 x i1> %mask, <16 x double> %src0) {
; KNL_64-LABEL: test_scatter_16f64:
; KNL_64: # BB#0:
; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2
; KNL_64-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1
; KNL_64-NEXT: kshiftrw $8, %k1, %k2
; KNL_64-NEXT: vscatterqpd %zmm3, (,%zmm0) {%k1}
; KNL_64-NEXT: vscatterqpd %zmm4, (,%zmm1) {%k2}
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test_scatter_16f64:
; KNL_32: # BB#0:
; KNL_32-NEXT: pushl %ebp
; KNL_32-NEXT: .Ltmp9:
; KNL_32-NEXT: .cfi_def_cfa_offset 8
; KNL_32-NEXT: .Ltmp10:
; KNL_32-NEXT: .cfi_offset %ebp, -8
; KNL_32-NEXT: movl %esp, %ebp
; KNL_32-NEXT: .Ltmp11:
; KNL_32-NEXT: .cfi_def_cfa_register %ebp
; KNL_32-NEXT: andl $-64, %esp
; KNL_32-NEXT: subl $64, %esp
; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1
; KNL_32-NEXT: vpandd .LCPI38_0{1to16}, %zmm1, %zmm1
; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1
; KNL_32-NEXT: vmovapd 8(%ebp), %zmm1
; KNL_32-NEXT: kshiftrw $8, %k1, %k2
; KNL_32-NEXT: vscatterdpd %zmm2, (,%ymm0) {%k1}
; KNL_32-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; KNL_32-NEXT: vscatterdpd %zmm1, (,%ymm0) {%k2}
; KNL_32-NEXT: movl %ebp, %esp
; KNL_32-NEXT: popl %ebp
; KNL_32-NEXT: retl
;
; SKX-LABEL: test_scatter_16f64:
; SKX: # BB#0:
; SKX-NEXT: vpmovsxbd %xmm2, %zmm2
; SKX-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
; SKX-NEXT: vptestmd %zmm2, %zmm2, %k1
; SKX-NEXT: kshiftrw $8, %k1, %k2
; SKX-NEXT: vscatterqpd %zmm3, (,%zmm0) {%k1}
; SKX-NEXT: vscatterqpd %zmm4, (,%zmm1) {%k2}
; SKX-NEXT: retq
  call void @llvm.masked.scatter.v16f64(<16 x double> %src0, <16 x double*> %ptrs, i32 4, <16 x i1> %mask)
  ret void
}
declare void @llvm.masked.scatter.v16f64(<16 x double> %src0, <16 x double*> %ptrs, i32, <16 x i1> %mask)