; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -O3 -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx512bf16,+avx512vl < %s | FileCheck %s

target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-unknown"

; Stack reload folding tests.
;
; By including a nop call with sideeffects we can force a partial register spill of the
; relevant registers and check that the reload is correctly folded into the instruction.

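; 512-bit (zmm) tests: unmasked, merge-masked and zero-masked forms of
; vcvtne2ps2bf16, vcvtneps2bf16 and vdpbf16ps.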
define <32 x i16> @stack_fold_cvtne2ps2bf16(<16 x float> %a0, <16 x float> %a1) {
; CHECK-LABEL: stack_fold_cvtne2ps2bf16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vcvtne2ps2bf16 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <32 x i16> @llvm.x86.avx512bf16.cvtne2ps2bf16.512(<16 x float> %a0, <16 x float> %a1)
  ret <32 x i16> %2
}
declare <32 x i16> @llvm.x86.avx512bf16.cvtne2ps2bf16.512(<16 x float>, <16 x float>)

define <32 x i16> @stack_fold_cvtne2ps2bf16_mask(<16 x float> %a0, <16 x float> %a1, <32 x i16>* %passthru, i32 %U) {
; CHECK-LABEL: stack_fold_cvtne2ps2bf16_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vmovaps (%rdi), %zmm2
; CHECK-NEXT:    vcvtne2ps2bf16 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <32 x i16> @llvm.x86.avx512bf16.cvtne2ps2bf16.512(<16 x float> %a0, <16 x float> %a1)
  %3 = bitcast i32 %U to <32 x i1>
  ; load needed to keep the operation from being scheduled above the asm block
  %4 = load <32 x i16>, <32 x i16>* %passthru
  %5 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> %4
  ret <32 x i16> %5
}

define <32 x i16> @stack_fold_cvtne2ps2bf16_maskz(<16 x float> %a0, <16 x float> %a1, i32 %U) {
; CHECK-LABEL: stack_fold_cvtne2ps2bf16_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    vcvtne2ps2bf16 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <32 x i16> @llvm.x86.avx512bf16.cvtne2ps2bf16.512(<16 x float> %a0, <16 x float> %a1)
  %3 = bitcast i32 %U to <32 x i1>
  %4 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> zeroinitializer
  ret <32 x i16> %4
}

define <16 x i16> @stack_fold_cvtneps2bf16(<16 x float> %a0) {
; CHECK-LABEL: stack_fold_cvtneps2bf16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vcvtneps2bf16 {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = tail call <16 x i16> @llvm.x86.avx512bf16.cvtneps2bf16.512(<16 x float> %a0)
  ret <16 x i16> %2
}
declare <16 x i16> @llvm.x86.avx512bf16.cvtneps2bf16.512(<16 x float>)

define <16 x i16> @stack_fold_cvtneps2bf16_mask(<16 x float> %a0, <16 x i16>* %passthru, i16 %U) {
; CHECK-LABEL: stack_fold_cvtneps2bf16_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vmovaps (%rdi), %ymm1
; CHECK-NEXT:    vcvtneps2bf16 {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovaps %ymm1, %ymm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = tail call <16 x i16> @llvm.x86.avx512bf16.cvtneps2bf16.512(<16 x float> %a0)
  %3 = bitcast i16 %U to <16 x i1>
  ; load needed to keep the operation from being scheduled above the asm block
  %4 = load <16 x i16>, <16 x i16>* %passthru
  %5 = select <16 x i1> %3, <16 x i16> %2, <16 x i16> %4
  ret <16 x i16> %5
}

define <16 x i16> @stack_fold_cvtneps2bf16_maskz(<16 x float> %a0, i16 %U) {
; CHECK-LABEL: stack_fold_cvtneps2bf16_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    vcvtneps2bf16 {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = tail call <16 x i16> @llvm.x86.avx512bf16.cvtneps2bf16.512(<16 x float> %a0)
  %3 = bitcast i16 %U to <16 x i1>
  %4 = select <16 x i1> %3, <16 x i16> %2, <16 x i16> zeroinitializer
  ret <16 x i16> %4
}

define <16 x float> @stack_fold_vdpbf16ps(<16 x float> %a0, <16 x i32> %a1, <16 x i32> %a2) {
; CHECK-LABEL: stack_fold_vdpbf16ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vdpbf16ps {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = tail call <16 x float> @llvm.x86.avx512bf16.dpbf16ps.512(<16 x float> %a0, <16 x i32> %a1, <16 x i32> %a2)
  ret <16 x float> %2
}
declare <16 x float> @llvm.x86.avx512bf16.dpbf16ps.512(<16 x float>, <16 x i32>, <16 x i32>)

define <16 x float> @stack_fold_vdpbf16ps_mask(<16 x float>* %a0, <16 x i32> %a1, <16 x i32> %a2, <16 x float>* %passthru, i16 %U) {
; CHECK-LABEL: stack_fold_vdpbf16ps_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %zmm2
; CHECK-NEXT:    kmovd %edx, %k1
; CHECK-NEXT:    vdpbf16ps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  ; load needed to keep the operation from being scheduled above the asm block
  %2 = load <16 x float>, <16 x float>* %a0
  %3 = tail call <16 x float> @llvm.x86.avx512bf16.dpbf16ps.512(<16 x float> %2, <16 x i32> %a1, <16 x i32> %a2)
  %4 = bitcast i16 %U to <16 x i1>
  %5 = select <16 x i1> %4, <16 x float> %3, <16 x float> %2
  ret <16 x float> %5
}

define <16 x float> @stack_fold_vdpbf16ps_maskz(<16 x float> %a0, <16 x i32> %a1, <16 x i32> %a2, i16* %U) {
; CHECK-LABEL: stack_fold_vdpbf16ps_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovw (%rdi), %k1
; CHECK-NEXT:    vdpbf16ps {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = tail call <16 x float> @llvm.x86.avx512bf16.dpbf16ps.512(<16 x float> %a0, <16 x i32> %a1, <16 x i32> %a2)
  %3 = load i16, i16* %U
  %4 = bitcast i16 %3 to <16 x i1>
  %5 = select <16 x i1> %4, <16 x float> %2, <16 x float> zeroinitializer
  ret <16 x float> %5
}

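; 256-bit (ymm) variants of the same folding tests.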
define <16 x i16> @stack_fold_cvtne2ps2bf16_ymm(<8 x float> %a0, <8 x float> %a1) {
; CHECK-LABEL: stack_fold_cvtne2ps2bf16_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vcvtne2ps2bf16 {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <16 x i16> @llvm.x86.avx512bf16.cvtne2ps2bf16.256(<8 x float> %a0, <8 x float> %a1)
  ret <16 x i16> %2
}
declare <16 x i16> @llvm.x86.avx512bf16.cvtne2ps2bf16.256(<8 x float>, <8 x float>)

define <16 x i16> @stack_fold_cvtne2ps2bf16_mask_ymm(<8 x float> %a0, <8 x float> %a1, <16 x i16>* %passthru, i16 %U) {
; CHECK-LABEL: stack_fold_cvtne2ps2bf16_mask_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vmovaps (%rdi), %ymm2
; CHECK-NEXT:    vcvtne2ps2bf16 {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 {%k1} # 32-byte Folded Reload
; CHECK-NEXT:    vmovaps %ymm2, %ymm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <16 x i16> @llvm.x86.avx512bf16.cvtne2ps2bf16.256(<8 x float> %a0, <8 x float> %a1)
  %3 = bitcast i16 %U to <16 x i1>
  ; load needed to keep the operation from being scheduled above the asm block
  %4 = load <16 x i16>, <16 x i16>* %passthru
  %5 = select <16 x i1> %3, <16 x i16> %2, <16 x i16> %4
  ret <16 x i16> %5
}

define <16 x i16> @stack_fold_cvtne2ps2bf16_maskz_ymm(<8 x float> %a0, <8 x float> %a1, i16 %U) {
; CHECK-LABEL: stack_fold_cvtne2ps2bf16_maskz_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    vcvtne2ps2bf16 {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 {%k1} {z} # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <16 x i16> @llvm.x86.avx512bf16.cvtne2ps2bf16.256(<8 x float> %a0, <8 x float> %a1)
  %3 = bitcast i16 %U to <16 x i1>
  %4 = select <16 x i1> %3, <16 x i16> %2, <16 x i16> zeroinitializer
  ret <16 x i16> %4
}

define <8 x i16> @stack_fold_cvtneps2bf16_ymm(<8 x float> %a0) {
; CHECK-LABEL: stack_fold_cvtneps2bf16_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vcvtneps2bf16y {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 32-byte Folded Reload
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = tail call <8 x i16> @llvm.x86.avx512bf16.cvtneps2bf16.256(<8 x float> %a0)
  ret <8 x i16> %2
}
declare <8 x i16> @llvm.x86.avx512bf16.cvtneps2bf16.256(<8 x float>)

define <8 x i16> @stack_fold_cvtneps2bf16_mask_ymm(<8 x float> %a0, <8 x i16>* %passthru, i8 %U) {
; CHECK-LABEL: stack_fold_cvtneps2bf16_mask_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vmovaps (%rdi), %xmm1
; CHECK-NEXT:    vcvtneps2bf16y {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 {%k1} # 32-byte Folded Reload
; CHECK-NEXT:    vmovaps %xmm1, %xmm0
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = tail call <8 x i16> @llvm.x86.avx512bf16.cvtneps2bf16.256(<8 x float> %a0)
  %3 = bitcast i8 %U to <8 x i1>
  ; load needed to keep the operation from being scheduled above the asm block
  %4 = load <8 x i16>, <8 x i16>* %passthru
  %5 = select <8 x i1> %3, <8 x i16> %2, <8 x i16> %4
  ret <8 x i16> %5
}

define <8 x i16> @stack_fold_cvtneps2bf16_maskz_ymm(<8 x float> %a0, i8 %U) {
; CHECK-LABEL: stack_fold_cvtneps2bf16_maskz_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    vcvtneps2bf16y {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 {%k1} {z} # 32-byte Folded Reload
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = tail call <8 x i16> @llvm.x86.avx512bf16.cvtneps2bf16.256(<8 x float> %a0)
  %3 = bitcast i8 %U to <8 x i1>
  %4 = select <8 x i1> %3, <8 x i16> %2, <8 x i16> zeroinitializer
  ret <8 x i16> %4
}

define <8 x float> @stack_fold_vdpbf16ps_ymm(<8 x float> %a0, <8 x i32> %a1, <8 x i32> %a2) {
; CHECK-LABEL: stack_fold_vdpbf16ps_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vdpbf16ps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = tail call <8 x float> @llvm.x86.avx512bf16.dpbf16ps.256(<8 x float> %a0, <8 x i32> %a1, <8 x i32> %a2)
  ret <8 x float> %2
}
declare <8 x float> @llvm.x86.avx512bf16.dpbf16ps.256(<8 x float>, <8 x i32>, <8 x i32>)

define <8 x float> @stack_fold_vdpbf16ps_mask_ymm(<8 x float>* %a0, <8 x i32> %a1, <8 x i32> %a2, <8 x float>* %passthru, i8 %U) {
; CHECK-LABEL: stack_fold_vdpbf16ps_mask_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %ymm2
; CHECK-NEXT:    kmovd %edx, %k1
; CHECK-NEXT:    vdpbf16ps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 {%k1} # 32-byte Folded Reload
; CHECK-NEXT:    vmovaps %ymm2, %ymm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  ; load needed to keep the operation from being scheduled above the asm block
  %2 = load <8 x float>, <8 x float>* %a0
  %3 = tail call <8 x float> @llvm.x86.avx512bf16.dpbf16ps.256(<8 x float> %2, <8 x i32> %a1, <8 x i32> %a2)
  %4 = bitcast i8 %U to <8 x i1>
  %5 = select <8 x i1> %4, <8 x float> %3, <8 x float> %2
  ret <8 x float> %5
}

define <8 x float> @stack_fold_vdpbf16ps_maskz_ymm(<8 x float> %a0, <8 x i32> %a1, <8 x i32> %a2, i8* %U) {
; CHECK-LABEL: stack_fold_vdpbf16ps_maskz_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    movzbl (%rdi), %eax
; CHECK-NEXT:    kmovd %eax, %k1
; CHECK-NEXT:    vdpbf16ps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 {%k1} {z} # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = tail call <8 x float> @llvm.x86.avx512bf16.dpbf16ps.256(<8 x float> %a0, <8 x i32> %a1, <8 x i32> %a2)
  %3 = load i8, i8* %U
  %4 = bitcast i8 %3 to <8 x i1>
  %5 = select <8 x i1> %4, <8 x float> %2, <8 x float> zeroinitializer
  ret <8 x float> %5
}

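; 128-bit (xmm) variants of the same folding tests.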
define <8 x i16> @stack_fold_cvtne2ps2bf16_xmm(<4 x float> %a0, <4 x float> %a1) {
; CHECK-LABEL: stack_fold_cvtne2ps2bf16_xmm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vcvtne2ps2bf16 {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <8 x i16> @llvm.x86.avx512bf16.cvtne2ps2bf16.128(<4 x float> %a0, <4 x float> %a1)
  ret <8 x i16> %2
}
declare <8 x i16> @llvm.x86.avx512bf16.cvtne2ps2bf16.128(<4 x float>, <4 x float>)

define <8 x i16> @stack_fold_cvtne2ps2bf16_mask_xmm(<4 x float> %a0, <4 x float> %a1, <8 x i16>* %passthru, i8 %U) {
; CHECK-LABEL: stack_fold_cvtne2ps2bf16_mask_xmm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vmovaps (%rdi), %xmm2
; CHECK-NEXT:    vcvtne2ps2bf16 {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload
; CHECK-NEXT:    vmovaps %xmm2, %xmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <8 x i16> @llvm.x86.avx512bf16.cvtne2ps2bf16.128(<4 x float> %a0, <4 x float> %a1)
  %3 = bitcast i8 %U to <8 x i1>
  ; load needed to keep the operation from being scheduled above the asm block
  %4 = load <8 x i16>, <8 x i16>* %passthru
  %5 = select <8 x i1> %3, <8 x i16> %2, <8 x i16> %4
  ret <8 x i16> %5
}

define <8 x i16> @stack_fold_cvtne2ps2bf16_maskz_xmm(<4 x float> %a0, <4 x float> %a1, i8 %U) {
; CHECK-LABEL: stack_fold_cvtne2ps2bf16_maskz_xmm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    vcvtne2ps2bf16 {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 {%k1} {z} # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <8 x i16> @llvm.x86.avx512bf16.cvtne2ps2bf16.128(<4 x float> %a0, <4 x float> %a1)
  %3 = bitcast i8 %U to <8 x i1>
  %4 = select <8 x i1> %3, <8 x i16> %2, <8 x i16> zeroinitializer
  ret <8 x i16> %4
}

define <8 x i16> @stack_fold_cvtneps2bf16_xmm(<4 x float> %a0) {
; CHECK-LABEL: stack_fold_cvtneps2bf16_xmm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vcvtneps2bf16x {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = tail call <8 x i16> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> %a0, <8 x i16> undef, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
  ret <8 x i16> %2
}
declare <8 x i16> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float>, <8 x i16>, <4 x i1>)

define <8 x i16> @stack_fold_cvtneps2bf16_mask_xmm(<4 x float> %a0, <8 x i16>* %passthru, i8 %U) {
; CHECK-LABEL: stack_fold_cvtneps2bf16_mask_xmm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %xmm1
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vcvtneps2bf16x {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 {%k1} # 16-byte Folded Reload
; CHECK-NEXT:    vmovaps %xmm1, %xmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = load <8 x i16>, <8 x i16>* %passthru
  %3 = bitcast i8 %U to <8 x i1>
  %4 = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %5 = tail call <8 x i16> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> %a0, <8 x i16> %2, <4 x i1> %4)
  ret <8 x i16> %5
}

define <8 x i16> @stack_fold_cvtneps2bf16_maskz_xmm(<4 x float> %a0, i8 %U) {
; CHECK-LABEL: stack_fold_cvtneps2bf16_maskz_xmm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    vcvtneps2bf16x {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 {%k1} {z} # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = bitcast i8 %U to <8 x i1>
  %3 = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %4 = tail call <8 x i16> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> %a0, <8 x i16> zeroinitializer, <4 x i1> %3)
  ret <8 x i16> %4
}

define <4 x float> @stack_fold_vdpbf16ps_xmm(<4 x float> %a0, <4 x i32> %a1, <4 x i32> %a2) {
; CHECK-LABEL: stack_fold_vdpbf16ps_xmm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vdpbf16ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = tail call <4 x float> @llvm.x86.avx512bf16.dpbf16ps.128(<4 x float> %a0, <4 x i32> %a1, <4 x i32> %a2)
  ret <4 x float> %2
}
declare <4 x float> @llvm.x86.avx512bf16.dpbf16ps.128(<4 x float>, <4 x i32>, <4 x i32>)

define <4 x float> @stack_fold_vdpbf16ps_mask_xmm(<4 x float>* %a0, <4 x i32> %a1, <4 x i32> %a2, <4 x float>* %passthru, i8 %U) {
; CHECK-LABEL: stack_fold_vdpbf16ps_mask_xmm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %xmm2
; CHECK-NEXT:    kmovd %edx, %k1
; CHECK-NEXT:    vdpbf16ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload
; CHECK-NEXT:    vmovaps %xmm2, %xmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  ; load needed to keep the operation from being scheduled above the asm block
  %2 = load <4 x float>, <4 x float>* %a0
  %3 = tail call <4 x float> @llvm.x86.avx512bf16.dpbf16ps.128(<4 x float> %2, <4 x i32> %a1, <4 x i32> %a2)
  %4 = bitcast i8 %U to <8 x i1>
  %5 = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %6 = select <4 x i1> %5, <4 x float> %3, <4 x float> %2
  ret <4 x float> %6
}

define <4 x float> @stack_fold_vdpbf16ps_maskz_xmm(<4 x float> %a0, <4 x i32> %a1, <4 x i32> %a2, i8* %U) {
; CHECK-LABEL: stack_fold_vdpbf16ps_maskz_xmm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    movzbl (%rdi), %eax
; CHECK-NEXT:    kmovd %eax, %k1
; CHECK-NEXT:    vdpbf16ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = tail call <4 x float> @llvm.x86.avx512bf16.dpbf16ps.128(<4 x float> %a0, <4 x i32> %a1, <4 x i32> %a2)
  %3 = load i8, i8* %U
  %4 = bitcast i8 %3 to <8 x i1>
  %5 = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %6 = select <4 x i1> %5, <4 x float> %2, <4 x float> zeroinitializer
  ret <4 x float> %6
}