; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -O3 -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx512vp2intersect,+avx512vl < %s | FileCheck %s
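
; This file tests stack folding for the AVX512VP2INTERSECT instructions
; (+avx512vl enables the 256- and 128-bit forms). In each test the inline asm
; "nop" clobbers every vector register it can (and takes one for its "=x"
; output), forcing %b to be spilled across the asm; the CHECK lines then
; verify that the reload is folded into the memory operand of
; vp2intersect{d,q} ("Folded Reload") rather than done by a separate load.
; vp2intersect writes an even/odd mask-register pair, which is why %k1 is
; read below even though only %k0 is named as the destination.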

target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-unknown"

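; 512-bit dword: each 16-bit result mask fills a k register and is stored
; directly to memory with kmovw.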
define void @stack_fold_vp2intersectd(<16 x i32>* %a, <16 x i32> %b, <16 x i1>* nocapture %m0, <16 x i1>* nocapture %m1) {
; CHECK-LABEL: stack_fold_vp2intersectd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %zmm0
; CHECK-NEXT:    vp2intersectd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %k0 # 64-byte Folded Reload
; CHECK-NEXT:    kmovw %k0, (%rsi)
; CHECK-NEXT:    kmovw %k1, (%rdx)
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = load <16 x i32>, <16 x i32>* %a
  %3 = tail call { <16 x i1>, <16 x i1> } @llvm.x86.avx512.vp2intersect.d.512(<16 x i32> %2, <16 x i32> %b)
  %4 = extractvalue { <16 x i1>, <16 x i1> } %3, 0
  store <16 x i1> %4, <16 x i1>* %m0
  %5 = extractvalue { <16 x i1>, <16 x i1> } %3, 1
  store <16 x i1> %5, <16 x i1>* %m1
  ret void
}
declare { <16 x i1>, <16 x i1> } @llvm.x86.avx512.vp2intersect.d.512(<16 x i32>, <16 x i32>)

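; 512-bit qword: the two <8 x i1> results are moved to GPRs and stored as
; single bytes.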
define void @stack_fold_vp2intersectq(<8 x i64>* %a, <8 x i64> %b, <8 x i1>* nocapture %m0, <8 x i1>* nocapture %m1) {
; CHECK-LABEL: stack_fold_vp2intersectq:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %zmm0
; CHECK-NEXT:    vp2intersectq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %k0 # 64-byte Folded Reload
; CHECK-NEXT:    kmovw %k1, %eax
; CHECK-NEXT:    kmovw %k0, %ecx
; CHECK-NEXT:    movb %cl, (%rsi)
; CHECK-NEXT:    movb %al, (%rdx)
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = load <8 x i64>, <8 x i64>* %a
  %3 = tail call { <8 x i1>, <8 x i1> } @llvm.x86.avx512.vp2intersect.q.512(<8 x i64> %2, <8 x i64> %b)
  %4 = extractvalue { <8 x i1>, <8 x i1> } %3, 0
  store <8 x i1> %4, <8 x i1>* %m0
  %5 = extractvalue { <8 x i1>, <8 x i1> } %3, 1
  store <8 x i1> %5, <8 x i1>* %m1
  ret void
}
declare { <8 x i1>, <8 x i1> } @llvm.x86.avx512.vp2intersect.q.512(<8 x i64>, <8 x i64>)

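; 256-bit dword: also produces <8 x i1> masks, stored bytewise as above.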
define void @stack_fold_vp2intersectd_256(<8 x i32>* %a, <8 x i32> %b, <8 x i1>* nocapture %m0, <8 x i1>* nocapture %m1) {
; CHECK-LABEL: stack_fold_vp2intersectd_256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %ymm0
; CHECK-NEXT:    vp2intersectd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %k0 # 32-byte Folded Reload
; CHECK-NEXT:    kmovw %k1, %eax
; CHECK-NEXT:    kmovw %k0, %ecx
; CHECK-NEXT:    movb %cl, (%rsi)
; CHECK-NEXT:    movb %al, (%rdx)
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = load <8 x i32>, <8 x i32>* %a
  %3 = tail call { <8 x i1>, <8 x i1> } @llvm.x86.avx512.vp2intersect.d.256(<8 x i32> %2, <8 x i32> %b)
  %4 = extractvalue { <8 x i1>, <8 x i1> } %3, 0
  store <8 x i1> %4, <8 x i1>* %m0
  %5 = extractvalue { <8 x i1>, <8 x i1> } %3, 1
  store <8 x i1> %5, <8 x i1>* %m1
  ret void
}
declare { <8 x i1>, <8 x i1> } @llvm.x86.avx512.vp2intersect.d.256(<8 x i32>, <8 x i32>)

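; 256-bit qword: kshiftlw/kshiftrw by 12 clear all but the low 4 bits of each
; mask before the byte store.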
define void @stack_fold_vp2intersectq_256(<4 x i64>* %a, <4 x i64> %b, <4 x i1>* nocapture %m0, <4 x i1>* nocapture %m1) {
; CHECK-LABEL: stack_fold_vp2intersectq_256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %ymm0
; CHECK-NEXT:    vp2intersectq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %k0 # 32-byte Folded Reload
; CHECK-NEXT:    kshiftlw $12, %k0, %k2
; CHECK-NEXT:    kshiftrw $12, %k2, %k2
; CHECK-NEXT:    kmovw %k2, %eax
; CHECK-NEXT:    movb %al, (%rsi)
; CHECK-NEXT:    kshiftlw $12, %k1, %k0
; CHECK-NEXT:    kshiftrw $12, %k0, %k0
; CHECK-NEXT:    kmovw %k0, %eax
; CHECK-NEXT:    movb %al, (%rdx)
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = load <4 x i64>, <4 x i64>* %a
  %3 = tail call { <4 x i1>, <4 x i1> } @llvm.x86.avx512.vp2intersect.q.256(<4 x i64> %2, <4 x i64> %b)
  %4 = extractvalue { <4 x i1>, <4 x i1> } %3, 0
  store <4 x i1> %4, <4 x i1>* %m0
  %5 = extractvalue { <4 x i1>, <4 x i1> } %3, 1
  store <4 x i1> %5, <4 x i1>* %m1
  ret void
}
declare { <4 x i1>, <4 x i1> } @llvm.x86.avx512.vp2intersect.q.256(<4 x i64>, <4 x i64>)

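; 128-bit dword: same <4 x i1> truncation via the shift-by-12 pairs.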
define void @stack_fold_vp2intersectd_128(<4 x i32>* %a, <4 x i32> %b, <4 x i1>* nocapture %m0, <4 x i1>* nocapture %m1) {
; CHECK-LABEL: stack_fold_vp2intersectd_128:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %xmm0
; CHECK-NEXT:    vp2intersectd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %k0 # 16-byte Folded Reload
; CHECK-NEXT:    kshiftlw $12, %k0, %k2
; CHECK-NEXT:    kshiftrw $12, %k2, %k2
; CHECK-NEXT:    kmovw %k2, %eax
; CHECK-NEXT:    movb %al, (%rsi)
; CHECK-NEXT:    kshiftlw $12, %k1, %k0
; CHECK-NEXT:    kshiftrw $12, %k0, %k0
; CHECK-NEXT:    kmovw %k0, %eax
; CHECK-NEXT:    movb %al, (%rdx)
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = load <4 x i32>, <4 x i32>* %a
  %3 = tail call { <4 x i1>, <4 x i1> } @llvm.x86.avx512.vp2intersect.d.128(<4 x i32> %2, <4 x i32> %b)
  %4 = extractvalue { <4 x i1>, <4 x i1> } %3, 0
  store <4 x i1> %4, <4 x i1>* %m0
  %5 = extractvalue { <4 x i1>, <4 x i1> } %3, 1
  store <4 x i1> %5, <4 x i1>* %m1
  ret void
}
declare { <4 x i1>, <4 x i1> } @llvm.x86.avx512.vp2intersect.d.128(<4 x i32>, <4 x i32>)

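; 128-bit qword: shifts by 14 keep only the low 2 mask bits; no vzeroupper is
; emitted since only xmm registers are used.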
define void @stack_fold_vp2intersectq_128(<2 x i64>* %a, <2 x i64> %b, <2 x i1>* nocapture %m0, <2 x i1>* nocapture %m1) {
; CHECK-LABEL: stack_fold_vp2intersectq_128:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %xmm0
; CHECK-NEXT:    vp2intersectq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %k0 # 16-byte Folded Reload
; CHECK-NEXT:    kshiftlw $14, %k0, %k2
; CHECK-NEXT:    kshiftrw $14, %k2, %k2
; CHECK-NEXT:    kmovw %k2, %eax
; CHECK-NEXT:    movb %al, (%rsi)
; CHECK-NEXT:    kshiftlw $14, %k1, %k0
; CHECK-NEXT:    kshiftrw $14, %k0, %k0
; CHECK-NEXT:    kmovw %k0, %eax
; CHECK-NEXT:    movb %al, (%rdx)
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = load <2 x i64>, <2 x i64>* %a
  %3 = tail call { <2 x i1>, <2 x i1> } @llvm.x86.avx512.vp2intersect.q.128(<2 x i64> %2, <2 x i64> %b)
  %4 = extractvalue { <2 x i1>, <2 x i1> } %3, 0
  store <2 x i1> %4, <2 x i1>* %m0
  %5 = extractvalue { <2 x i1>, <2 x i1> } %3, 1
  store <2 x i1> %5, <2 x i1>* %m1
  ret void
}
declare { <2 x i1>, <2 x i1> } @llvm.x86.avx512.vp2intersect.q.128(<2 x i64>, <2 x i64>)