; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -O3 -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx512vp2intersect,+avx512vl < %s | FileCheck %s

target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-unknown"

; Stack reload folding tests.
;
; By including a nop call with sideeffects we can force a partial register spill of the
; relevant registers and check that the reload is correctly folded into the instruction.

define void @stack_fold_vp2intersectd(<16 x i32>* %a, <16 x i32> %b, <16 x i1>* nocapture %m0, <16 x i1>* nocapture %m1) {
; CHECK-LABEL: stack_fold_vp2intersectd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %zmm0
; CHECK-NEXT:    vp2intersectd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %k0 # 64-byte Folded Reload
; CHECK-NEXT:    kmovw %k0, (%rsi)
; CHECK-NEXT:    kmovw %k1, (%rdx)
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = load <16 x i32>, <16 x i32>* %a
  %3 = tail call { <16 x i1>, <16 x i1> } @llvm.x86.avx512.vp2intersect.d.512(<16 x i32> %2, <16 x i32> %b)
  %4 = extractvalue { <16 x i1>, <16 x i1> } %3, 0
  store <16 x i1> %4, <16 x i1>* %m0
  %5 = extractvalue { <16 x i1>, <16 x i1> } %3, 1
  store <16 x i1> %5, <16 x i1>* %m1
  ret void
}
declare { <16 x i1>, <16 x i1> } @llvm.x86.avx512.vp2intersect.d.512(<16 x i32>, <16 x i32>)

define void @stack_fold_vp2intersectq(<8 x i64>* %a, <8 x i64> %b, <8 x i1>* nocapture %m0, <8 x i1>* nocapture %m1) {
; CHECK-LABEL: stack_fold_vp2intersectq:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %zmm0
; CHECK-NEXT:    vp2intersectq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %k0 # 64-byte Folded Reload
; CHECK-NEXT:    kmovw %k1, %eax
; CHECK-NEXT:    kmovw %k0, %ecx
; CHECK-NEXT:    movb %cl, (%rsi)
; CHECK-NEXT:    movb %al, (%rdx)
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = load <8 x i64>, <8 x i64>* %a
  %3 = tail call { <8 x i1>, <8 x i1> } @llvm.x86.avx512.vp2intersect.q.512(<8 x i64> %2, <8 x i64> %b)
  %4 = extractvalue { <8 x i1>, <8 x i1> } %3, 0
  store <8 x i1> %4, <8 x i1>* %m0
  %5 = extractvalue { <8 x i1>, <8 x i1> } %3, 1
  store <8 x i1> %5, <8 x i1>* %m1
  ret void
}
declare { <8 x i1>, <8 x i1> } @llvm.x86.avx512.vp2intersect.q.512(<8 x i64>, <8 x i64>)

define void @stack_fold_vp2intersectd_256(<8 x i32>* %a, <8 x i32> %b, <8 x i1>* nocapture %m0, <8 x i1>* nocapture %m1) {
; CHECK-LABEL: stack_fold_vp2intersectd_256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %ymm0
; CHECK-NEXT:    vp2intersectd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %k0 # 32-byte Folded Reload
; CHECK-NEXT:    kmovw %k1, %eax
; CHECK-NEXT:    kmovw %k0, %ecx
; CHECK-NEXT:    movb %cl, (%rsi)
; CHECK-NEXT:    movb %al, (%rdx)
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = load <8 x i32>, <8 x i32>* %a
  %3 = tail call { <8 x i1>, <8 x i1> } @llvm.x86.avx512.vp2intersect.d.256(<8 x i32> %2, <8 x i32> %b)
  %4 = extractvalue { <8 x i1>, <8 x i1> } %3, 0
  store <8 x i1> %4, <8 x i1>* %m0
  %5 = extractvalue { <8 x i1>, <8 x i1> } %3, 1
  store <8 x i1> %5, <8 x i1>* %m1
  ret void
}
declare { <8 x i1>, <8 x i1> } @llvm.x86.avx512.vp2intersect.d.256(<8 x i32>, <8 x i32>)

define void @stack_fold_vp2intersectq_256(<4 x i64>* %a, <4 x i64> %b, <4 x i1>* nocapture %m0, <4 x i1>* nocapture %m1) {
; CHECK-LABEL: stack_fold_vp2intersectq_256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %ymm0
; CHECK-NEXT:    vp2intersectq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %k0 # 32-byte Folded Reload
; CHECK-NEXT:    kshiftlw $12, %k0, %k2
; CHECK-NEXT:    kshiftrw $12, %k2, %k2
; CHECK-NEXT:    kmovw %k2, %eax
; CHECK-NEXT:    movb %al, (%rsi)
; CHECK-NEXT:    kshiftlw $12, %k1, %k0
; CHECK-NEXT:    kshiftrw $12, %k0, %k0
; CHECK-NEXT:    kmovw %k0, %eax
; CHECK-NEXT:    movb %al, (%rdx)
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = load <4 x i64>, <4 x i64>* %a
  %3 = tail call { <4 x i1>, <4 x i1> } @llvm.x86.avx512.vp2intersect.q.256(<4 x i64> %2, <4 x i64> %b)
  %4 = extractvalue { <4 x i1>, <4 x i1> } %3, 0
  store <4 x i1> %4, <4 x i1>* %m0
  %5 = extractvalue { <4 x i1>, <4 x i1> } %3, 1
  store <4 x i1> %5, <4 x i1>* %m1
  ret void
}
declare { <4 x i1>, <4 x i1> } @llvm.x86.avx512.vp2intersect.q.256(<4 x i64>, <4 x i64>)

define void @stack_fold_vp2intersectd_128(<4 x i32>* %a, <4 x i32> %b, <4 x i1>* nocapture %m0, <4 x i1>* nocapture %m1) {
; CHECK-LABEL: stack_fold_vp2intersectd_128:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %xmm0
; CHECK-NEXT:    vp2intersectd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %k0 # 16-byte Folded Reload
; CHECK-NEXT:    kshiftlw $12, %k0, %k2
; CHECK-NEXT:    kshiftrw $12, %k2, %k2
; CHECK-NEXT:    kmovw %k2, %eax
; CHECK-NEXT:    movb %al, (%rsi)
; CHECK-NEXT:    kshiftlw $12, %k1, %k0
; CHECK-NEXT:    kshiftrw $12, %k0, %k0
; CHECK-NEXT:    kmovw %k0, %eax
; CHECK-NEXT:    movb %al, (%rdx)
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = load <4 x i32>, <4 x i32>* %a
  %3 = tail call { <4 x i1>, <4 x i1> } @llvm.x86.avx512.vp2intersect.d.128(<4 x i32> %2, <4 x i32> %b)
  %4 = extractvalue { <4 x i1>, <4 x i1> } %3, 0
  store <4 x i1> %4, <4 x i1>* %m0
  %5 = extractvalue { <4 x i1>, <4 x i1> } %3, 1
  store <4 x i1> %5, <4 x i1>* %m1
  ret void
}
declare { <4 x i1>, <4 x i1> } @llvm.x86.avx512.vp2intersect.d.128(<4 x i32>, <4 x i32>)

define void @stack_fold_vp2intersectq_128(<2 x i64>* %a, <2 x i64> %b, <2 x i1>* nocapture %m0, <2 x i1>* nocapture %m1) {
; CHECK-LABEL: stack_fold_vp2intersectq_128:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %xmm0
; CHECK-NEXT:    vp2intersectq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %k0 # 16-byte Folded Reload
; CHECK-NEXT:    kshiftlw $14, %k0, %k2
; CHECK-NEXT:    kshiftrw $14, %k2, %k2
; CHECK-NEXT:    kmovw %k2, %eax
; CHECK-NEXT:    movb %al, (%rsi)
; CHECK-NEXT:    kshiftlw $14, %k1, %k0
; CHECK-NEXT:    kshiftrw $14, %k0, %k0
; CHECK-NEXT:    kmovw %k0, %eax
; CHECK-NEXT:    movb %al, (%rdx)
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = load <2 x i64>, <2 x i64>* %a
  %3 = tail call { <2 x i1>, <2 x i1> } @llvm.x86.avx512.vp2intersect.q.128(<2 x i64> %2, <2 x i64> %b)
  %4 = extractvalue { <2 x i1>, <2 x i1> } %3, 0
  store <2 x i1> %4, <2 x i1>* %m0
  %5 = extractvalue { <2 x i1>, <2 x i1> } %3, 1
  store <2 x i1> %5, <2 x i1>* %m1
  ret void
}
declare { <2 x i1>, <2 x i1> } @llvm.x86.avx512.vp2intersect.q.128(<2 x i64>, <2 x i64>)