; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512vp2intersect --show-mc-encoding | FileCheck %s --check-prefix=X86
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vp2intersect --show-mc-encoding | FileCheck %s --check-prefix=X64

define void @test_mm512_2intersect_epi32(<8 x i64> %a, <8 x i64> %b, i16* nocapture %m0, i16* nocapture %m1) {
; X86-LABEL: test_mm512_2intersect_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x04]
; X86-NEXT:    vp2intersectd %zmm1, %zmm0, %k0 # encoding: [0x62,0xf2,0x7f,0x48,0x68,0xc1]
; X86-NEXT:    kmovw %k0, (%ecx) # encoding: [0xc5,0xf8,0x91,0x01]
; X86-NEXT:    kmovw %k1, (%eax) # encoding: [0xc5,0xf8,0x91,0x08]
; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm512_2intersect_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vp2intersectd %zmm1, %zmm0, %k0 # encoding: [0x62,0xf2,0x7f,0x48,0x68,0xc1]
; X64-NEXT:    kmovw %k0, (%rdi) # encoding: [0xc5,0xf8,0x91,0x07]
; X64-NEXT:    kmovw %k1, (%rsi) # encoding: [0xc5,0xf8,0x91,0x0e]
; X64-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %0 = bitcast <8 x i64> %a to <16 x i32>
  %1 = bitcast <8 x i64> %b to <16 x i32>
  %2 = tail call { <16 x i1>, <16 x i1> } @llvm.x86.avx512.vp2intersect.d.512(<16 x i32> %0, <16 x i32> %1)
  %3 = extractvalue { <16 x i1>, <16 x i1> } %2, 0
  %4 = bitcast i16* %m0 to <16 x i1>*
  store <16 x i1> %3, <16 x i1>* %4, align 16
  %5 = extractvalue { <16 x i1>, <16 x i1> } %2, 1
  %6 = bitcast i16* %m1 to <16 x i1>*
  store <16 x i1> %5, <16 x i1>* %6, align 16
  ret void
}

define void @test_mm512_2intersect_epi64(<8 x i64> %a, <8 x i64> %b, i8* nocapture %m0, i8* nocapture %m1) {
; X86-LABEL: test_mm512_2intersect_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
; X86-NEXT:    vp2intersectq %zmm1, %zmm0, %k0 # encoding: [0x62,0xf2,0xff,0x48,0x68,0xc1]
; X86-NEXT:    kmovw %k1, %ecx # encoding: [0xc5,0xf8,0x93,0xc9]
; X86-NEXT:    kmovw %k0, %edx # encoding: [0xc5,0xf8,0x93,0xd0]
; X86-NEXT:    movb %dl, (%eax) # encoding: [0x88,0x10]
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
; X86-NEXT:    movb %cl, (%eax) # encoding: [0x88,0x08]
; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm512_2intersect_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vp2intersectq %zmm1, %zmm0, %k0 # encoding: [0x62,0xf2,0xff,0x48,0x68,0xc1]
; X64-NEXT:    kmovw %k1, %eax # encoding: [0xc5,0xf8,0x93,0xc1]
; X64-NEXT:    kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8]
; X64-NEXT:    movb %cl, (%rdi) # encoding: [0x88,0x0f]
; X64-NEXT:    movb %al, (%rsi) # encoding: [0x88,0x06]
; X64-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %0 = tail call { <8 x i1>, <8 x i1> } @llvm.x86.avx512.vp2intersect.q.512(<8 x i64> %a, <8 x i64> %b)
  %1 = extractvalue { <8 x i1>, <8 x i1> } %0, 0
  %2 = bitcast i8* %m0 to <8 x i1>*
  store <8 x i1> %1, <8 x i1>* %2, align 8
  %3 = extractvalue { <8 x i1>, <8 x i1> } %0, 1
  %4 = bitcast i8* %m1 to <8 x i1>*
  store <8 x i1> %3, <8 x i1>* %4, align 8
  ret void
}
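
; Editorial note (not in the autogenerated assertions): the *_p variants below
; take both source vectors from memory. Per the checks, one operand is loaded
; with vmovaps and the other is folded into vp2intersect's memory operand.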
define void @test_mm512_2intersect_epi32_p(<8 x i64>* nocapture readonly %a, <8 x i64>* nocapture readonly %b, i16* nocapture %m0, i16* nocapture %m1) {
; X86-LABEL: test_mm512_2intersect_epi32_p:
; X86:       # %bb.0: # %entry
; X86-NEXT:    pushl %esi # encoding: [0x56]
; X86-NEXT:    .cfi_def_cfa_offset 8
; X86-NEXT:    .cfi_offset %esi, -8
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x14]
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x10]
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x0c]
; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi # encoding: [0x8b,0x74,0x24,0x08]
; X86-NEXT:    vmovaps (%esi), %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x28,0x06]
; X86-NEXT:    vp2intersectd (%edx), %zmm0, %k0 # encoding: [0x62,0xf2,0x7f,0x48,0x68,0x02]
; X86-NEXT:    kmovw %k0, (%ecx) # encoding: [0xc5,0xf8,0x91,0x01]
; X86-NEXT:    kmovw %k1, (%eax) # encoding: [0xc5,0xf8,0x91,0x08]
; X86-NEXT:    popl %esi # encoding: [0x5e]
; X86-NEXT:    .cfi_def_cfa_offset 4
; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm512_2intersect_epi32_p:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vmovaps (%rdi), %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x28,0x07]
; X64-NEXT:    vp2intersectd (%rsi), %zmm0, %k0 # encoding: [0x62,0xf2,0x7f,0x48,0x68,0x06]
; X64-NEXT:    kmovw %k0, (%rdx) # encoding: [0xc5,0xf8,0x91,0x02]
; X64-NEXT:    kmovw %k1, (%rcx) # encoding: [0xc5,0xf8,0x91,0x09]
; X64-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %0 = bitcast <8 x i64>* %a to <16 x i32>*
  %1 = load <16 x i32>, <16 x i32>* %0, align 64
  %2 = bitcast <8 x i64>* %b to <16 x i32>*
  %3 = load <16 x i32>, <16 x i32>* %2, align 64
  %4 = tail call { <16 x i1>, <16 x i1> } @llvm.x86.avx512.vp2intersect.d.512(<16 x i32> %1, <16 x i32> %3)
  %5 = extractvalue { <16 x i1>, <16 x i1> } %4, 0
  %6 = bitcast i16* %m0 to <16 x i1>*
  store <16 x i1> %5, <16 x i1>* %6, align 16
  %7 = extractvalue { <16 x i1>, <16 x i1> } %4, 1
  %8 = bitcast i16* %m1 to <16 x i1>*
  store <16 x i1> %7, <16 x i1>* %8, align 16
  ret void
}

define void @test_mm512_2intersect_epi64_p(<8 x i64>* nocapture readonly %a, <8 x i64>* nocapture readonly %b, i8* nocapture %m0, i8* nocapture %m1) {
; X86-LABEL: test_mm512_2intersect_epi64_p:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x0c]
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x08]
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x04]
; X86-NEXT:    vmovaps (%edx), %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x28,0x02]
; X86-NEXT:    vp2intersectq (%ecx), %zmm0, %k0 # encoding: [0x62,0xf2,0xff,0x48,0x68,0x01]
; X86-NEXT:    kmovw %k1, %ecx # encoding: [0xc5,0xf8,0x93,0xc9]
; X86-NEXT:    kmovw %k0, %edx # encoding: [0xc5,0xf8,0x93,0xd0]
; X86-NEXT:    movb %dl, (%eax) # encoding: [0x88,0x10]
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x10]
; X86-NEXT:    movb %cl, (%eax) # encoding: [0x88,0x08]
; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm512_2intersect_epi64_p:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vmovaps (%rdi), %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x28,0x07]
; X64-NEXT:    vp2intersectq (%rsi), %zmm0, %k0 # encoding: [0x62,0xf2,0xff,0x48,0x68,0x06]
; X64-NEXT:    kmovw %k1, %eax # encoding: [0xc5,0xf8,0x93,0xc1]
; X64-NEXT:    kmovw %k0, %esi # encoding: [0xc5,0xf8,0x93,0xf0]
; X64-NEXT:    movb %sil, (%rdx) # encoding: [0x40,0x88,0x32]
; X64-NEXT:    movb %al, (%rcx) # encoding: [0x88,0x01]
; X64-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %0 = load <8 x i64>, <8 x i64>* %a, align 64
  %1 = load <8 x i64>, <8 x i64>* %b, align 64
  %2 = tail call { <8 x i1>, <8 x i1> } @llvm.x86.avx512.vp2intersect.q.512(<8 x i64> %0, <8 x i64> %1)
  %3 = extractvalue { <8 x i1>, <8 x i1> } %2, 0
  %4 = bitcast i8* %m0 to <8 x i1>*
  store <8 x i1> %3, <8 x i1>* %4, align 8
  %5 = extractvalue { <8 x i1>, <8 x i1> } %2, 1
  %6 = bitcast i8* %m1 to <8 x i1>*
  store <8 x i1> %5, <8 x i1>* %6, align 8
  ret void
}
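
; Editorial note (not in the autogenerated assertions): the *_b variants below
; splat a scalar loaded from memory. Per the checks, one splat becomes a
; vbroadcastss/vbroadcastsd and the other folds into vp2intersect's
; embedded-broadcast form ({1to16} / {1to8}).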
define void @test_mm512_2intersect_epi32_b(i32* nocapture readonly %a, i32* nocapture readonly %b, i16* nocapture %m0, i16* nocapture %m1) {
; X86-LABEL: test_mm512_2intersect_epi32_b:
; X86:       # %bb.0: # %entry
; X86-NEXT:    pushl %esi # encoding: [0x56]
; X86-NEXT:    .cfi_def_cfa_offset 8
; X86-NEXT:    .cfi_offset %esi, -8
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x14]
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x10]
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x0c]
; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi # encoding: [0x8b,0x74,0x24,0x08]
; X86-NEXT:    vbroadcastss (%esi), %zmm0 # encoding: [0x62,0xf2,0x7d,0x48,0x18,0x06]
; X86-NEXT:    vp2intersectd (%edx){1to16}, %zmm0, %k0 # encoding: [0x62,0xf2,0x7f,0x58,0x68,0x02]
; X86-NEXT:    kmovw %k0, (%ecx) # encoding: [0xc5,0xf8,0x91,0x01]
; X86-NEXT:    kmovw %k1, (%eax) # encoding: [0xc5,0xf8,0x91,0x08]
; X86-NEXT:    popl %esi # encoding: [0x5e]
; X86-NEXT:    .cfi_def_cfa_offset 4
; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm512_2intersect_epi32_b:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vbroadcastss (%rdi), %zmm0 # encoding: [0x62,0xf2,0x7d,0x48,0x18,0x07]
; X64-NEXT:    vp2intersectd (%rsi){1to16}, %zmm0, %k0 # encoding: [0x62,0xf2,0x7f,0x58,0x68,0x06]
; X64-NEXT:    kmovw %k0, (%rdx) # encoding: [0xc5,0xf8,0x91,0x02]
; X64-NEXT:    kmovw %k1, (%rcx) # encoding: [0xc5,0xf8,0x91,0x09]
; X64-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %0 = load i32, i32* %a, align 4
  %vecinit.i = insertelement <16 x i32> undef, i32 %0, i32 0
  %vecinit15.i = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
  %1 = load i32, i32* %b, align 4
  %vecinit.i2 = insertelement <16 x i32> undef, i32 %1, i32 0
  %vecinit15.i3 = shufflevector <16 x i32> %vecinit.i2, <16 x i32> undef, <16 x i32> zeroinitializer
  %2 = tail call { <16 x i1>, <16 x i1> } @llvm.x86.avx512.vp2intersect.d.512(<16 x i32> %vecinit15.i, <16 x i32> %vecinit15.i3)
  %3 = extractvalue { <16 x i1>, <16 x i1> } %2, 0
  %4 = bitcast i16* %m0 to <16 x i1>*
  store <16 x i1> %3, <16 x i1>* %4, align 16
  %5 = extractvalue { <16 x i1>, <16 x i1> } %2, 1
  %6 = bitcast i16* %m1 to <16 x i1>*
  store <16 x i1> %5, <16 x i1>* %6, align 16
  ret void
}

define void @test_mm512_2intersect_epi64_b(i64* nocapture readonly %a, i64* nocapture readonly %b, i8* nocapture %m0, i8* nocapture %m1) {
; X86-LABEL: test_mm512_2intersect_epi64_b:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x0c]
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x08]
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x04]
; X86-NEXT:    vbroadcastsd (%edx), %zmm0 # encoding: [0x62,0xf2,0xfd,0x48,0x19,0x02]
; X86-NEXT:    vp2intersectq (%ecx){1to8}, %zmm0, %k0 # encoding: [0x62,0xf2,0xff,0x58,0x68,0x01]
; X86-NEXT:    kmovw %k1, %ecx # encoding: [0xc5,0xf8,0x93,0xc9]
; X86-NEXT:    kmovw %k0, %edx # encoding: [0xc5,0xf8,0x93,0xd0]
; X86-NEXT:    movb %dl, (%eax) # encoding: [0x88,0x10]
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x10]
; X86-NEXT:    movb %cl, (%eax) # encoding: [0x88,0x08]
; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm512_2intersect_epi64_b:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vbroadcastsd (%rdi), %zmm0 # encoding: [0x62,0xf2,0xfd,0x48,0x19,0x07]
; X64-NEXT:    vp2intersectq (%rsi){1to8}, %zmm0, %k0 # encoding: [0x62,0xf2,0xff,0x58,0x68,0x06]
; X64-NEXT:    kmovw %k1, %eax # encoding: [0xc5,0xf8,0x93,0xc1]
; X64-NEXT:    kmovw %k0, %esi # encoding: [0xc5,0xf8,0x93,0xf0]
; X64-NEXT:    movb %sil, (%rdx) # encoding: [0x40,0x88,0x32]
; X64-NEXT:    movb %al, (%rcx) # encoding: [0x88,0x01]
; X64-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %0 = load i64, i64* %a, align 8
  %vecinit.i = insertelement <8 x i64> undef, i64 %0, i32 0
  %vecinit7.i = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer
  %1 = load i64, i64* %b, align 8
  %vecinit.i2 = insertelement <8 x i64> undef, i64 %1, i32 0
  %vecinit7.i3 = shufflevector <8 x i64> %vecinit.i2, <8 x i64> undef, <8 x i32> zeroinitializer
  %2 = tail call { <8 x i1>, <8 x i1> } @llvm.x86.avx512.vp2intersect.q.512(<8 x i64> %vecinit7.i, <8 x i64> %vecinit7.i3)
  %3 = extractvalue { <8 x i1>, <8 x i1> } %2, 0
  %4 = bitcast i8* %m0 to <8 x i1>*
  store <8 x i1> %3, <8 x i1>* %4, align 8
  %5 = extractvalue { <8 x i1>, <8 x i1> } %2, 1
  %6 = bitcast i8* %m1 to <8 x i1>*
  store <8 x i1> %5, <8 x i1>* %6, align 8
  ret void
}
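
; Editorial note (not in the autogenerated assertions): each intrinsic returns
; a pair of <N x i1> masks; the hardware instruction writes an even/odd
; mask-register pair, which is why the checks read back both %k0 and %k1.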
declare { <16 x i1>, <16 x i1> } @llvm.x86.avx512.vp2intersect.d.512(<16 x i32>, <16 x i32>)
declare { <8 x i1>, <8 x i1> } @llvm.x86.avx512.vp2intersect.q.512(<8 x i64>, <8 x i64>)