1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -O3 -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512dq,+avx512vbmi,+avx512cd,+avx512vpopcntdq,+avx512vnni < %s | FileCheck %s
3
4target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
5target triple = "x86_64-unknown-unknown"
6
7; Stack reload folding tests.
8;
9; By including a nop call with sideeffects we can force a partial register spill of the
10; relevant registers and check that the reload is correctly folded into the instruction.
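; In outline, each test below follows the same pattern: the vector inputs are spilled
; with vmovups, the inline asm "nop" clobbers nearly every vector register so the
; values must live on the stack across it, and FileCheck then verifies that the
; instruction under test consumes a spill slot directly as a memory operand
; (annotated "64-byte Folded Reload") rather than requiring a separate reload into
; a register first.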
11
12define <16 x i32> @stack_fold_valignd(<16 x i32> %a, <16 x i32> %b) {
13; CHECK-LABEL: stack_fold_valignd:
14; CHECK:       # %bb.0:
15; CHECK-NEXT:    subq $56, %rsp
16; CHECK-NEXT:    .cfi_def_cfa_offset 64
17; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19; CHECK-NEXT:    #APP
20; CHECK-NEXT:    nop
21; CHECK-NEXT:    #NO_APP
22; CHECK-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
23; CHECK-NEXT:    valignd $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
24; CHECK-NEXT:    # zmm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm0[0]
25; CHECK-NEXT:    addq $56, %rsp
26; CHECK-NEXT:    .cfi_def_cfa_offset 8
27; CHECK-NEXT:    retq
28  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
29  %2 = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16>
30  ret <16 x i32> %2
31}
32
33define <16 x i32> @stack_fold_valignd_mask(<16 x i32> %a, <16 x i32> %b, <16 x i32>* %passthru, i16 %mask) {
34; CHECK-LABEL: stack_fold_valignd_mask:
35; CHECK:       # %bb.0:
36; CHECK-NEXT:    subq $56, %rsp
37; CHECK-NEXT:    .cfi_def_cfa_offset 64
38; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
39; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
40; CHECK-NEXT:    #APP
41; CHECK-NEXT:    nop
42; CHECK-NEXT:    #NO_APP
43; CHECK-NEXT:    kmovd %esi, %k1
44; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm1
45; CHECK-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
46; CHECK-NEXT:    valignd $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 {%k1} # 64-byte Folded Reload
47; CHECK-NEXT:    # zmm1 {%k1} = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm0[0]
48; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
49; CHECK-NEXT:    addq $56, %rsp
50; CHECK-NEXT:    .cfi_def_cfa_offset 8
51; CHECK-NEXT:    retq
52  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
53  %2 = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16>
54  %3 = bitcast i16 %mask to <16 x i1>
55  %4 = load <16 x i32>, <16 x i32>* %passthru
56  %5 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> %4
57  ret <16 x i32> %5
58}
59
60define <16 x i32> @stack_fold_valignd_maskz(<16 x i32> %a, <16 x i32> %b, i16 %mask) {
61; CHECK-LABEL: stack_fold_valignd_maskz:
62; CHECK:       # %bb.0:
63; CHECK-NEXT:    subq $56, %rsp
64; CHECK-NEXT:    .cfi_def_cfa_offset 64
65; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
66; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
67; CHECK-NEXT:    #APP
68; CHECK-NEXT:    nop
69; CHECK-NEXT:    #NO_APP
70; CHECK-NEXT:    kmovd %edi, %k1
71; CHECK-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
72; CHECK-NEXT:    valignd $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
73; CHECK-NEXT:    # zmm0 {%k1} {z} = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm0[0]
74; CHECK-NEXT:    addq $56, %rsp
75; CHECK-NEXT:    .cfi_def_cfa_offset 8
76; CHECK-NEXT:    retq
77  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
78  %2 = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16>
79  %3 = bitcast i16 %mask to <16 x i1>
80  %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer
81  ret <16 x i32> %4
82}
83
84define <8 x i64> @stack_fold_valignq(<8 x i64> %a, <8 x i64> %b) {
85; CHECK-LABEL: stack_fold_valignq:
86; CHECK:       # %bb.0:
87; CHECK-NEXT:    subq $56, %rsp
88; CHECK-NEXT:    .cfi_def_cfa_offset 64
89; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
90; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
91; CHECK-NEXT:    #APP
92; CHECK-NEXT:    nop
93; CHECK-NEXT:    #NO_APP
94; CHECK-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
95; CHECK-NEXT:    valignq $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
96; CHECK-NEXT:    # zmm0 = mem[1,2,3,4,5,6,7],zmm0[0]
97; CHECK-NEXT:    addq $56, %rsp
98; CHECK-NEXT:    .cfi_def_cfa_offset 8
99; CHECK-NEXT:    retq
100  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
101  %2 = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
102  ret <8 x i64> %2
103}
104
105define <8 x i64> @stack_fold_valignq_mask(<8 x i64> %a, <8 x i64> %b, <8 x i64>* %passthru, i8 %mask) {
106; CHECK-LABEL: stack_fold_valignq_mask:
107; CHECK:       # %bb.0:
108; CHECK-NEXT:    subq $56, %rsp
109; CHECK-NEXT:    .cfi_def_cfa_offset 64
110; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
111; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
112; CHECK-NEXT:    #APP
113; CHECK-NEXT:    nop
114; CHECK-NEXT:    #NO_APP
115; CHECK-NEXT:    kmovd %esi, %k1
116; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm1
117; CHECK-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
118; CHECK-NEXT:    valignq $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 {%k1} # 64-byte Folded Reload
119; CHECK-NEXT:    # zmm1 {%k1} = mem[1,2,3,4,5,6,7],zmm0[0]
120; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
121; CHECK-NEXT:    addq $56, %rsp
122; CHECK-NEXT:    .cfi_def_cfa_offset 8
123; CHECK-NEXT:    retq
124  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
125  %2 = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
126  %3 = bitcast i8 %mask to <8 x i1>
127  %4 = load <8 x i64>, <8 x i64>* %passthru
128  %5 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> %4
129  ret <8 x i64> %5
130}
131
132define <8 x i64> @stack_fold_valignq_maskz(<8 x i64> %a, <8 x i64> %b, i8 %mask) {
133; CHECK-LABEL: stack_fold_valignq_maskz:
134; CHECK:       # %bb.0:
135; CHECK-NEXT:    subq $56, %rsp
136; CHECK-NEXT:    .cfi_def_cfa_offset 64
137; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
138; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
139; CHECK-NEXT:    #APP
140; CHECK-NEXT:    nop
141; CHECK-NEXT:    #NO_APP
142; CHECK-NEXT:    kmovd %edi, %k1
143; CHECK-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
144; CHECK-NEXT:    valignq $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
145; CHECK-NEXT:    # zmm0 {%k1} {z} = mem[1,2,3,4,5,6,7],zmm0[0]
146; CHECK-NEXT:    addq $56, %rsp
147; CHECK-NEXT:    .cfi_def_cfa_offset 8
148; CHECK-NEXT:    retq
149  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
150  %2 = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
151  %3 = bitcast i8 %mask to <8 x i1>
152  %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> zeroinitializer
153  ret <8 x i64> %4
154}
155
156define <64 x i8> @stack_fold_pavgb(<64 x i8> %a0, <64 x i8> %a1) {
157; CHECK-LABEL: stack_fold_pavgb:
158; CHECK:       # %bb.0:
159; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
160; CHECK-NEXT:    #APP
161; CHECK-NEXT:    nop
162; CHECK-NEXT:    #NO_APP
163; CHECK-NEXT:    vpavgb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
164; CHECK-NEXT:    retq
165  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
166  %2 = call <64 x i8> @llvm.x86.avx512.pavg.b.512(<64 x i8> %a0, <64 x i8> %a1)
167  ret <64 x i8> %2
168}
169declare <64 x i8> @llvm.x86.avx512.pavg.b.512(<64 x i8>, <64 x i8>)
170
171define <64 x i8> @stack_fold_pavgb_commuted(<64 x i8> %a0, <64 x i8> %a1) {
172; CHECK-LABEL: stack_fold_pavgb_commuted:
173; CHECK:       # %bb.0:
174; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
175; CHECK-NEXT:    #APP
176; CHECK-NEXT:    nop
177; CHECK-NEXT:    #NO_APP
178; CHECK-NEXT:    vpavgb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
179; CHECK-NEXT:    retq
180  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
181  %2 = call <64 x i8> @llvm.x86.avx512.pavg.b.512(<64 x i8> %a1, <64 x i8> %a0)
182  ret <64 x i8> %2
183}
184
185define <64 x i8> @stack_fold_pavgb_mask(<64 x i8> %a0, <64 x i8> %a1, <64 x i8>* %a2, i64 %mask) {
186; CHECK-LABEL: stack_fold_pavgb_mask:
187; CHECK:       # %bb.0:
188; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
189; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm1
190; CHECK-NEXT:    #APP
191; CHECK-NEXT:    nop
192; CHECK-NEXT:    #NO_APP
193; CHECK-NEXT:    kmovq %rsi, %k1
194; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm0
195; CHECK-NEXT:    vpavgb {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
196; CHECK-NEXT:    retq
197  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
198  %2 = call <64 x i8> @llvm.x86.avx512.pavg.b.512(<64 x i8> %a0, <64 x i8> %a1)
199  %3 = bitcast i64 %mask to <64 x i1>
200  ; load needed to keep the operation from being scheduled above the asm block
201  %4 = load <64 x i8>, <64 x i8>* %a2
202  %5 = select <64 x i1> %3, <64 x i8> %2, <64 x i8> %4
203  ret <64 x i8> %5
204}
205
206define <64 x i8> @stack_fold_pavgb_mask_commuted(<64 x i8> %a0, <64 x i8> %a1, <64 x i8>* %a2, i64 %mask) {
207; CHECK-LABEL: stack_fold_pavgb_mask_commuted:
208; CHECK:       # %bb.0:
209; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
210; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm1
211; CHECK-NEXT:    #APP
212; CHECK-NEXT:    nop
213; CHECK-NEXT:    #NO_APP
214; CHECK-NEXT:    kmovq %rsi, %k1
215; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm0
216; CHECK-NEXT:    vpavgb {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
217; CHECK-NEXT:    retq
218  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
219  %2 = call <64 x i8> @llvm.x86.avx512.pavg.b.512(<64 x i8> %a1, <64 x i8> %a0)
220  %3 = bitcast i64 %mask to <64 x i1>
221  ; load needed to keep the operation from being scheduled above the asm block
222  %4 = load <64 x i8>, <64 x i8>* %a2
223  %5 = select <64 x i1> %3, <64 x i8> %2, <64 x i8> %4
224  ret <64 x i8> %5
225}
226
227define <64 x i8> @stack_fold_pavgb_maskz(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
228; CHECK-LABEL: stack_fold_pavgb_maskz:
229; CHECK:       # %bb.0:
230; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
231; CHECK-NEXT:    #APP
232; CHECK-NEXT:    nop
233; CHECK-NEXT:    #NO_APP
234; CHECK-NEXT:    kmovq %rdi, %k1
235; CHECK-NEXT:    vpavgb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
236; CHECK-NEXT:    retq
237  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
238  %2 = call <64 x i8> @llvm.x86.avx512.pavg.b.512(<64 x i8> %a0, <64 x i8> %a1)
239  %3 = bitcast i64 %mask to <64 x i1>
240  %4 = select <64 x i1> %3, <64 x i8> %2, <64 x i8> zeroinitializer
241  ret <64 x i8> %4
242}
243
244define <64 x i8> @stack_fold_pavgb_maskz_commuted(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
245; CHECK-LABEL: stack_fold_pavgb_maskz_commuted:
246; CHECK:       # %bb.0:
247; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
248; CHECK-NEXT:    #APP
249; CHECK-NEXT:    nop
250; CHECK-NEXT:    #NO_APP
251; CHECK-NEXT:    kmovq %rdi, %k1
252; CHECK-NEXT:    vpavgb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
253; CHECK-NEXT:    retq
254  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
255  %2 = call <64 x i8> @llvm.x86.avx512.pavg.b.512(<64 x i8> %a1, <64 x i8> %a0)
256  %3 = bitcast i64 %mask to <64 x i1>
257  %4 = select <64 x i1> %3, <64 x i8> %2, <64 x i8> zeroinitializer
258  ret <64 x i8> %4
259}
260
261define <32 x i16> @stack_fold_pavgw(<32 x i16> %a0, <32 x i16> %a1) {
262; CHECK-LABEL: stack_fold_pavgw:
263; CHECK:       # %bb.0:
264; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
265; CHECK-NEXT:    #APP
266; CHECK-NEXT:    nop
267; CHECK-NEXT:    #NO_APP
268; CHECK-NEXT:    vpavgw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
269; CHECK-NEXT:    retq
270  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
271  %2 = call <32 x i16> @llvm.x86.avx512.pavg.w.512(<32 x i16> %a0, <32 x i16> %a1)
272  ret <32 x i16> %2
273}
274declare <32 x i16> @llvm.x86.avx512.pavg.w.512(<32 x i16>, <32 x i16>)
275
276define <32 x i16> @stack_fold_pavgw_commuted(<32 x i16> %a0, <32 x i16> %a1) {
277; CHECK-LABEL: stack_fold_pavgw_commuted:
278; CHECK:       # %bb.0:
279; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
280; CHECK-NEXT:    #APP
281; CHECK-NEXT:    nop
282; CHECK-NEXT:    #NO_APP
283; CHECK-NEXT:    vpavgw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
284; CHECK-NEXT:    retq
285  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
286  %2 = call <32 x i16> @llvm.x86.avx512.pavg.w.512(<32 x i16> %a1, <32 x i16> %a0)
287  ret <32 x i16> %2
288}
289
290define <32 x i16> @stack_fold_pavgw_mask(<32 x i16> %a0, <32 x i16> %a1, <32 x i16>* %a2, i32 %mask) {
291; CHECK-LABEL: stack_fold_pavgw_mask:
292; CHECK:       # %bb.0:
293; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
294; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm1
295; CHECK-NEXT:    #APP
296; CHECK-NEXT:    nop
297; CHECK-NEXT:    #NO_APP
298; CHECK-NEXT:    kmovd %esi, %k1
299; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm0
300; CHECK-NEXT:    vpavgw {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
301; CHECK-NEXT:    retq
302  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
303  %2 = call <32 x i16> @llvm.x86.avx512.pavg.w.512(<32 x i16> %a0, <32 x i16> %a1)
304  %3 = bitcast i32 %mask to <32 x i1>
305  ; load needed to keep the operation from being scheduled above the asm block
306  %4 = load <32 x i16>, <32 x i16>* %a2
307  %5 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> %4
308  ret <32 x i16> %5
309}
310
311define <32 x i16> @stack_fold_pavgw_mask_commuted(<32 x i16> %a0, <32 x i16> %a1, <32 x i16>* %a2, i32 %mask) {
312; CHECK-LABEL: stack_fold_pavgw_mask_commuted:
313; CHECK:       # %bb.0:
314; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
315; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm1
316; CHECK-NEXT:    #APP
317; CHECK-NEXT:    nop
318; CHECK-NEXT:    #NO_APP
319; CHECK-NEXT:    kmovd %esi, %k1
320; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm0
321; CHECK-NEXT:    vpavgw {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
322; CHECK-NEXT:    retq
323  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
324  %2 = call <32 x i16> @llvm.x86.avx512.pavg.w.512(<32 x i16> %a1, <32 x i16> %a0)
325  %3 = bitcast i32 %mask to <32 x i1>
326  ; load needed to keep the operation from being scheduled above the asm block
327  %4 = load <32 x i16>, <32 x i16>* %a2
328  %5 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> %4
329  ret <32 x i16> %5
330}
331
332define <32 x i16> @stack_fold_pavgw_maskz(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) {
333; CHECK-LABEL: stack_fold_pavgw_maskz:
334; CHECK:       # %bb.0:
335; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
336; CHECK-NEXT:    #APP
337; CHECK-NEXT:    nop
338; CHECK-NEXT:    #NO_APP
339; CHECK-NEXT:    kmovd %edi, %k1
340; CHECK-NEXT:    vpavgw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
341; CHECK-NEXT:    retq
342  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
343  %2 = call <32 x i16> @llvm.x86.avx512.pavg.w.512(<32 x i16> %a0, <32 x i16> %a1)
344  %3 = bitcast i32 %mask to <32 x i1>
345  %4 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> zeroinitializer
346  ret <32 x i16> %4
347}
348
349define <32 x i16> @stack_fold_pavgw_maskz_commuted(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) {
350; CHECK-LABEL: stack_fold_pavgw_maskz_commuted:
351; CHECK:       # %bb.0:
352; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
353; CHECK-NEXT:    #APP
354; CHECK-NEXT:    nop
355; CHECK-NEXT:    #NO_APP
356; CHECK-NEXT:    kmovd %edi, %k1
357; CHECK-NEXT:    vpavgw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
358; CHECK-NEXT:    retq
359  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
360  %2 = call <32 x i16> @llvm.x86.avx512.pavg.w.512(<32 x i16> %a1, <32 x i16> %a0)
361  %3 = bitcast i32 %mask to <32 x i1>
362  %4 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> zeroinitializer
363  ret <32 x i16> %4
364}
365
366define <4 x i32> @stack_fold_extracti32x4(<16 x i16> %a0, <16 x i32> %a1) {
367; CHECK-LABEL: stack_fold_extracti32x4:
368; CHECK:       # %bb.0:
369; CHECK-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
370; CHECK-NEXT:    vextracti32x4 $3, %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Folded Spill
371; CHECK-NEXT:    #APP
372; CHECK-NEXT:    nop
373; CHECK-NEXT:    #NO_APP
374; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
375; CHECK-NEXT:    vzeroupper
376; CHECK-NEXT:    retq
377  ; zext forces execution domain
378  %1 = zext <16 x i16> %a0 to <16 x i32>
379  %2 = shufflevector <16 x i32> %1, <16 x i32> %a1, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
380  %3 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
381  ret <4 x i32> %2
382}
383
384define <2 x i64> @stack_fold_extracti64x2(<8 x i32> %a0, <8 x i64> %a1) {
385; CHECK-LABEL: stack_fold_extracti64x2:
386; CHECK:       # %bb.0:
387; CHECK-NEXT:    vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
388; CHECK-NEXT:    vextracti32x4 $3, %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Folded Spill
389; CHECK-NEXT:    #APP
390; CHECK-NEXT:    nop
391; CHECK-NEXT:    #NO_APP
392; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
393; CHECK-NEXT:    vzeroupper
394; CHECK-NEXT:    retq
395  ; zext forces execution domain
396  %1 = zext <8 x i32> %a0 to <8 x i64>
397  %2 = shufflevector <8 x i64> %1, <8 x i64> %a1, <2 x i32> <i32 6, i32 7>
398  %3 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
399  ret <2 x i64> %2
400}
401
402define <8 x i32> @stack_fold_extracti32x8(<16 x i16> %a0, <16 x i32> %a1) {
403; CHECK-LABEL: stack_fold_extracti32x8:
404; CHECK:       # %bb.0:
405; CHECK-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
406; CHECK-NEXT:    vextracti64x4 $1, %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Folded Spill
407; CHECK-NEXT:    #APP
408; CHECK-NEXT:    nop
409; CHECK-NEXT:    #NO_APP
410; CHECK-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
411; CHECK-NEXT:    retq
412  ; zext forces execution domain
413  %1 = zext <16 x i16> %a0 to <16 x i32>
414  %2 = shufflevector <16 x i32> %1, <16 x i32> %a1, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
415  %3 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
416  ret <8 x i32> %2
417}
418
419define <4 x i64> @stack_fold_extracti64x4(<8 x i32> %a0, <8 x i64> %a1) {
420; CHECK-LABEL: stack_fold_extracti64x4:
421; CHECK:       # %bb.0:
422; CHECK-NEXT:    vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
423; CHECK-NEXT:    vextracti64x4 $1, %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Folded Spill
424; CHECK-NEXT:    #APP
425; CHECK-NEXT:    nop
426; CHECK-NEXT:    #NO_APP
427; CHECK-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
428; CHECK-NEXT:    retq
429  ; zext forces execution domain
430  %1 = zext <8 x i32> %a0 to <8 x i64>
431  %2 = shufflevector <8 x i64> %1, <8 x i64> %a1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
432  %3 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
433  ret <4 x i64> %2
434}
435
436define <16 x i32> @stack_fold_inserti32x8(<8 x i32> %a0, <8 x i32> %a1) {
437; CHECK-LABEL: stack_fold_inserti32x8:
438; CHECK:       # %bb.0:
439; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
440; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
441; CHECK-NEXT:    #APP
442; CHECK-NEXT:    nop
443; CHECK-NEXT:    #NO_APP
444; CHECK-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload
445; CHECK-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
446; CHECK-NEXT:    vpsubd %zmm1, %zmm0, %zmm0
447; CHECK-NEXT:    retq
448  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
449  %2 = shufflevector <8 x i32> %a0, <8 x i32> %a1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
450  ; add forces execution domain
451  %3 = add <16 x i32> %2, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
452  ret <16 x i32> %3
453}
454
455define <8 x i64> @stack_fold_inserti64x4(<4 x i64> %a0, <4 x i64> %a1) {
456; CHECK-LABEL: stack_fold_inserti64x4:
457; CHECK:       # %bb.0:
458; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
459; CHECK-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
460; CHECK-NEXT:    #APP
461; CHECK-NEXT:    nop
462; CHECK-NEXT:    #NO_APP
463; CHECK-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload
464; CHECK-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
465; CHECK-NEXT:    vpsubq %zmm1, %zmm0, %zmm0
466; CHECK-NEXT:    retq
467  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
468  %2 = shufflevector <4 x i64> %a0, <4 x i64> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
469  ; add forces execution domain
470  %3 = add <8 x i64> %2, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
471  ret <8 x i64> %3
472}
473
474define <64 x i8> @stack_fold_pabsb(<64 x i8> %a0) {
475; CHECK-LABEL: stack_fold_pabsb:
476; CHECK:       # %bb.0:
477; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
478; CHECK-NEXT:    #APP
479; CHECK-NEXT:    nop
480; CHECK-NEXT:    #NO_APP
481; CHECK-NEXT:    vpabsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
482; CHECK-NEXT:    retq
483  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
484  %2 = icmp sgt <64 x i8> %a0, zeroinitializer
485  %3 = sub <64 x i8> zeroinitializer, %a0
486  %4 = select <64 x i1> %2, <64 x i8> %a0, <64 x i8> %3
487  ret <64 x i8> %4
488}
489
490define <64 x i8> @stack_fold_pabsb_mask(<64 x i8> %passthru, <64 x i8> %a0, i64 %mask) {
491; CHECK-LABEL: stack_fold_pabsb_mask:
492; CHECK:       # %bb.0:
493; CHECK-NEXT:    subq $56, %rsp
494; CHECK-NEXT:    .cfi_def_cfa_offset 64
495; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
496; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
497; CHECK-NEXT:    #APP
498; CHECK-NEXT:    nop
499; CHECK-NEXT:    #NO_APP
500; CHECK-NEXT:    kmovq %rdi, %k1
501; CHECK-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
502; CHECK-NEXT:    vpabsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} # 64-byte Folded Reload
503; CHECK-NEXT:    addq $56, %rsp
504; CHECK-NEXT:    .cfi_def_cfa_offset 8
505; CHECK-NEXT:    retq
506  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
507  %2 = icmp sgt <64 x i8> %a0, zeroinitializer
508  %3 = sub <64 x i8> zeroinitializer, %a0
509  %4 = select <64 x i1> %2, <64 x i8> %a0, <64 x i8> %3
510  %5 = bitcast i64 %mask to <64 x i1>
511  %6 = select <64 x i1> %5, <64 x i8> %4, <64 x i8> %passthru
512  ret <64 x i8> %6
513}
514
515define <64 x i8> @stack_fold_pabsb_maskz(<64 x i8> %a0, i64 %mask) {
516; CHECK-LABEL: stack_fold_pabsb_maskz:
517; CHECK:       # %bb.0:
518; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
519; CHECK-NEXT:    #APP
520; CHECK-NEXT:    nop
521; CHECK-NEXT:    #NO_APP
522; CHECK-NEXT:    kmovq %rdi, %k1
523; CHECK-NEXT:    vpabsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} {z} # 64-byte Folded Reload
524; CHECK-NEXT:    retq
525  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
526  %2 = icmp sgt <64 x i8> %a0, zeroinitializer
527  %3 = sub <64 x i8> zeroinitializer, %a0
528  %4 = select <64 x i1> %2, <64 x i8> %a0, <64 x i8> %3
529  %5 = bitcast i64 %mask to <64 x i1>
530  %6 = select <64 x i1> %5, <64 x i8> %4, <64 x i8> zeroinitializer
531  ret <64 x i8> %6
532}
533
534define <16 x i32> @stack_fold_pabsd(<16 x i32> %a0) {
535; CHECK-LABEL: stack_fold_pabsd:
536; CHECK:       # %bb.0:
537; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
538; CHECK-NEXT:    #APP
539; CHECK-NEXT:    nop
540; CHECK-NEXT:    #NO_APP
541; CHECK-NEXT:    vpabsd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
542; CHECK-NEXT:    retq
543  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
544  %2 = icmp sgt <16 x i32> %a0, zeroinitializer
545  %3 = sub <16 x i32> zeroinitializer, %a0
546  %4 = select <16 x i1> %2, <16 x i32> %a0, <16 x i32> %3
547  ret <16 x i32> %4
548}
549
550define <16 x i32> @stack_fold_pabsd_mask(<16 x i32> %passthru, <16 x i32> %a0, i16 %mask) {
551; CHECK-LABEL: stack_fold_pabsd_mask:
552; CHECK:       # %bb.0:
553; CHECK-NEXT:    subq $56, %rsp
554; CHECK-NEXT:    .cfi_def_cfa_offset 64
555; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
556; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
557; CHECK-NEXT:    #APP
558; CHECK-NEXT:    nop
559; CHECK-NEXT:    #NO_APP
560; CHECK-NEXT:    kmovd %edi, %k1
561; CHECK-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
562; CHECK-NEXT:    vpabsd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} # 64-byte Folded Reload
563; CHECK-NEXT:    addq $56, %rsp
564; CHECK-NEXT:    .cfi_def_cfa_offset 8
565; CHECK-NEXT:    retq
566  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
567  %2 = icmp sgt <16 x i32> %a0, zeroinitializer
568  %3 = sub <16 x i32> zeroinitializer, %a0
569  %4 = select <16 x i1> %2, <16 x i32> %a0, <16 x i32> %3
570  %5 = bitcast i16 %mask to <16 x i1>
571  %6 = select <16 x i1> %5, <16 x i32> %4, <16 x i32> %passthru
572  ret <16 x i32> %6
573}
574
575define <16 x i32> @stack_fold_pabsd_maskz(<16 x i32> %a0, i16 %mask) {
576; CHECK-LABEL: stack_fold_pabsd_maskz:
577; CHECK:       # %bb.0:
578; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
579; CHECK-NEXT:    #APP
580; CHECK-NEXT:    nop
581; CHECK-NEXT:    #NO_APP
582; CHECK-NEXT:    kmovd %edi, %k1
583; CHECK-NEXT:    vpabsd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} {z} # 64-byte Folded Reload
584; CHECK-NEXT:    retq
585  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
586  %2 = icmp sgt <16 x i32> %a0, zeroinitializer
587  %3 = sub <16 x i32> zeroinitializer, %a0
588  %4 = select <16 x i1> %2, <16 x i32> %a0, <16 x i32> %3
589  %5 = bitcast i16 %mask to <16 x i1>
590  %6 = select <16 x i1> %5, <16 x i32> %4, <16 x i32> zeroinitializer
591  ret <16 x i32> %6
592}
593
594define <8 x i64> @stack_fold_pabsq(<8 x i64> %a0) {
595; CHECK-LABEL: stack_fold_pabsq:
596; CHECK:       # %bb.0:
597; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
598; CHECK-NEXT:    #APP
599; CHECK-NEXT:    nop
600; CHECK-NEXT:    #NO_APP
601; CHECK-NEXT:    vpabsq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
602; CHECK-NEXT:    retq
603  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
604  %2 = icmp sgt <8 x i64> %a0, zeroinitializer
605  %3 = sub <8 x i64> zeroinitializer, %a0
606  %4 = select <8 x i1> %2, <8 x i64> %a0, <8 x i64> %3
607  ret <8 x i64> %4
608}
609
610define <8 x i64> @stack_fold_pabsq_mask(<8 x i64> %passthru, <8 x i64> %a0, i8 %mask) {
611; CHECK-LABEL: stack_fold_pabsq_mask:
612; CHECK:       # %bb.0:
613; CHECK-NEXT:    subq $56, %rsp
614; CHECK-NEXT:    .cfi_def_cfa_offset 64
615; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
616; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
617; CHECK-NEXT:    #APP
618; CHECK-NEXT:    nop
619; CHECK-NEXT:    #NO_APP
620; CHECK-NEXT:    kmovd %edi, %k1
621; CHECK-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
622; CHECK-NEXT:    vpabsq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} # 64-byte Folded Reload
623; CHECK-NEXT:    addq $56, %rsp
624; CHECK-NEXT:    .cfi_def_cfa_offset 8
625; CHECK-NEXT:    retq
626  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
627  %2 = icmp sgt <8 x i64> %a0, zeroinitializer
628  %3 = sub <8 x i64> zeroinitializer, %a0
629  %4 = select <8 x i1> %2, <8 x i64> %a0, <8 x i64> %3
630  %5 = bitcast i8 %mask to <8 x i1>
631  %6 = select <8 x i1> %5, <8 x i64> %4, <8 x i64> %passthru
632  ret <8 x i64> %6
633}
634
635define <8 x i64> @stack_fold_pabsq_maskz(<8 x i64> %a0, i8 %mask) {
636; CHECK-LABEL: stack_fold_pabsq_maskz:
637; CHECK:       # %bb.0:
638; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
639; CHECK-NEXT:    #APP
640; CHECK-NEXT:    nop
641; CHECK-NEXT:    #NO_APP
642; CHECK-NEXT:    kmovd %edi, %k1
643; CHECK-NEXT:    vpabsq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} {z} # 64-byte Folded Reload
644; CHECK-NEXT:    retq
645  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
646  %2 = icmp sgt <8 x i64> %a0, zeroinitializer
647  %3 = sub <8 x i64> zeroinitializer, %a0
648  %4 = select <8 x i1> %2, <8 x i64> %a0, <8 x i64> %3
649  %5 = bitcast i8 %mask to <8 x i1>
650  %6 = select <8 x i1> %5, <8 x i64> %4, <8 x i64> zeroinitializer
651  ret <8 x i64> %6
652}
653
654define <32 x i16> @stack_fold_pabsw(<32 x i16> %a0) {
655; CHECK-LABEL: stack_fold_pabsw:
656; CHECK:       # %bb.0:
657; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
658; CHECK-NEXT:    #APP
659; CHECK-NEXT:    nop
660; CHECK-NEXT:    #NO_APP
661; CHECK-NEXT:    vpabsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
662; CHECK-NEXT:    retq
663  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
664  %2 = icmp sgt <32 x i16> %a0, zeroinitializer
665  %3 = sub <32 x i16> zeroinitializer, %a0
666  %4 = select <32 x i1> %2, <32 x i16> %a0, <32 x i16> %3
667  ret <32 x i16> %4
668}
669
670define <32 x i16> @stack_fold_pabsw_mask(<32 x i16> %passthru, <32 x i16> %a0, i32 %mask) {
671; CHECK-LABEL: stack_fold_pabsw_mask:
672; CHECK:       # %bb.0:
673; CHECK-NEXT:    subq $56, %rsp
674; CHECK-NEXT:    .cfi_def_cfa_offset 64
675; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
676; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
677; CHECK-NEXT:    #APP
678; CHECK-NEXT:    nop
679; CHECK-NEXT:    #NO_APP
680; CHECK-NEXT:    kmovd %edi, %k1
681; CHECK-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
682; CHECK-NEXT:    vpabsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} # 64-byte Folded Reload
683; CHECK-NEXT:    addq $56, %rsp
684; CHECK-NEXT:    .cfi_def_cfa_offset 8
685; CHECK-NEXT:    retq
686  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
687  %2 = icmp sgt <32 x i16> %a0, zeroinitializer
688  %3 = sub <32 x i16> zeroinitializer, %a0
689  %4 = select <32 x i1> %2, <32 x i16> %a0, <32 x i16> %3
690  %5 = bitcast i32 %mask to <32 x i1>
691  %6 = select <32 x i1> %5, <32 x i16> %4, <32 x i16> %passthru
692  ret <32 x i16> %6
693}
694
695define <32 x i16> @stack_fold_pabsw_maskz(<32 x i16> %a0, i32 %mask) {
696; CHECK-LABEL: stack_fold_pabsw_maskz:
697; CHECK:       # %bb.0:
698; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
699; CHECK-NEXT:    #APP
700; CHECK-NEXT:    nop
701; CHECK-NEXT:    #NO_APP
702; CHECK-NEXT:    kmovd %edi, %k1
703; CHECK-NEXT:    vpabsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} {z} # 64-byte Folded Reload
704; CHECK-NEXT:    retq
705  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
706  %2 = icmp sgt <32 x i16> %a0, zeroinitializer
707  %3 = sub <32 x i16> zeroinitializer, %a0
708  %4 = select <32 x i1> %2, <32 x i16> %a0, <32 x i16> %3
709  %5 = bitcast i32 %mask to <32 x i1>
710  %6 = select <32 x i1> %5, <32 x i16> %4, <32 x i16> zeroinitializer
711  ret <32 x i16> %6
712}
713
714define <32 x i16> @stack_fold_packssdw(<16 x i32> %a0, <16 x i32> %a1) {
715; CHECK-LABEL: stack_fold_packssdw:
716; CHECK:       # %bb.0:
717; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
718; CHECK-NEXT:    #APP
719; CHECK-NEXT:    nop
720; CHECK-NEXT:    #NO_APP
721; CHECK-NEXT:    vpackssdw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
722; CHECK-NEXT:    retq
723  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
724  %2 = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> %a0, <16 x i32> %a1)
725  ret <32 x i16> %2
726}
727declare <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32>, <16 x i32>) nounwind readnone
728
729define <64 x i8> @stack_fold_packsswb(<32 x i16> %a0, <32 x i16> %a1) {
730; CHECK-LABEL: stack_fold_packsswb:
731; CHECK:       # %bb.0:
732; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
733; CHECK-NEXT:    #APP
734; CHECK-NEXT:    nop
735; CHECK-NEXT:    #NO_APP
736; CHECK-NEXT:    vpacksswb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
737; CHECK-NEXT:    retq
738  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
739  %2 = call <64 x i8> @llvm.x86.avx512.packsswb.512(<32 x i16> %a0, <32 x i16> %a1)
740  ret <64 x i8> %2
741}
742declare <64 x i8> @llvm.x86.avx512.packsswb.512(<32 x i16>, <32 x i16>) nounwind readnone
743
744define <32 x i16> @stack_fold_packusdw(<16 x i32> %a0, <16 x i32> %a1) {
745; CHECK-LABEL: stack_fold_packusdw:
746; CHECK:       # %bb.0:
747; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
748; CHECK-NEXT:    #APP
749; CHECK-NEXT:    nop
750; CHECK-NEXT:    #NO_APP
751; CHECK-NEXT:    vpackusdw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
752; CHECK-NEXT:    retq
753  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
754  %2 = call <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32> %a0, <16 x i32> %a1)
755  ret <32 x i16> %2
756}
757declare <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32>, <16 x i32>) nounwind readnone
758
759define <32 x i16> @stack_fold_packusdw_mask(<32 x i16>* %passthru, <16 x i32> %a0, <16 x i32> %a1, i32 %mask) {
760; CHECK-LABEL: stack_fold_packusdw_mask:
761; CHECK:       # %bb.0:
762; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
763; CHECK-NEXT:    #APP
764; CHECK-NEXT:    nop
765; CHECK-NEXT:    #NO_APP
766; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm2
767; CHECK-NEXT:    kmovd %esi, %k1
768; CHECK-NEXT:    vpackusdw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
769; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
770; CHECK-NEXT:    retq
771  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
772  %2 = load <32 x i16>, <32 x i16>* %passthru
773  %3 = call <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32> %a0, <16 x i32> %a1)
774  %4 = bitcast i32 %mask to <32 x i1>
775  %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> %2
776  ret <32 x i16> %5
777}
778
779define <32 x i16> @stack_fold_packusdw_maskz(<16 x i32> %a0, <16 x i32> %a1, i32 %mask) {
780; CHECK-LABEL: stack_fold_packusdw_maskz:
781; CHECK:       # %bb.0:
782; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
783; CHECK-NEXT:    #APP
784; CHECK-NEXT:    nop
785; CHECK-NEXT:    #NO_APP
786; CHECK-NEXT:    kmovd %edi, %k1
787; CHECK-NEXT:    vpackusdw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
788; CHECK-NEXT:    retq
789  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
790  %2 = call <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32> %a0, <16 x i32> %a1)
791  %3 = bitcast i32 %mask to <32 x i1>
792  %4 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> zeroinitializer
793  ret <32 x i16> %4
794}
795
796define <64 x i8> @stack_fold_packuswb(<32 x i16> %a0, <32 x i16> %a1) {
797; CHECK-LABEL: stack_fold_packuswb:
798; CHECK:       # %bb.0:
799; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
800; CHECK-NEXT:    #APP
801; CHECK-NEXT:    nop
802; CHECK-NEXT:    #NO_APP
803; CHECK-NEXT:    vpackuswb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
804; CHECK-NEXT:    retq
805  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
806  %2 = call <64 x i8> @llvm.x86.avx512.packuswb.512(<32 x i16> %a0, <32 x i16> %a1)
807  ret <64 x i8> %2
808}
809declare <64 x i8> @llvm.x86.avx512.packuswb.512(<32 x i16>, <32 x i16>) nounwind readnone
810
811define <64 x i8> @stack_fold_paddb(<64 x i8> %a0, <64 x i8> %a1) {
812; CHECK-LABEL: stack_fold_paddb:
813; CHECK:       # %bb.0:
814; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
815; CHECK-NEXT:    #APP
816; CHECK-NEXT:    nop
817; CHECK-NEXT:    #NO_APP
818; CHECK-NEXT:    vpaddb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
819; CHECK-NEXT:    retq
820  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
821  %2 = add <64 x i8> %a0, %a1
822  ret <64 x i8> %2
823}
824
825define <64 x i8> @stack_fold_paddb_commuted(<64 x i8> %a0, <64 x i8> %a1) {
826; CHECK-LABEL: stack_fold_paddb_commuted:
827; CHECK:       # %bb.0:
828; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
829; CHECK-NEXT:    #APP
830; CHECK-NEXT:    nop
831; CHECK-NEXT:    #NO_APP
832; CHECK-NEXT:    vpaddb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
833; CHECK-NEXT:    retq
834  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
835  %2 = add <64 x i8> %a1, %a0
836  ret <64 x i8> %2
837}
838
839define <64 x i8> @stack_fold_paddb_mask(<64 x i8> %a0, <64 x i8> %a1, <64 x i8>* %a2, i64 %mask) {
840; CHECK-LABEL: stack_fold_paddb_mask:
841; CHECK:       # %bb.0:
842; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
843; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm1
844; CHECK-NEXT:    #APP
845; CHECK-NEXT:    nop
846; CHECK-NEXT:    #NO_APP
847; CHECK-NEXT:    kmovq %rsi, %k1
848; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm0
849; CHECK-NEXT:    vpaddb {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
850; CHECK-NEXT:    retq
851  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
852  %2 = add <64 x i8> %a0, %a1
853  %3 = bitcast i64 %mask to <64 x i1>
854  ; load needed to keep the operation from being scheduled above the asm block
855  %4 = load <64 x i8>, <64 x i8>* %a2
856  %5 = select <64 x i1> %3, <64 x i8> %2, <64 x i8> %4
857  ret <64 x i8> %5
858}
859
860define <64 x i8> @stack_fold_paddb_mask_commuted(<64 x i8> %a0, <64 x i8> %a1, <64 x i8>* %a2, i64 %mask) {
861; CHECK-LABEL: stack_fold_paddb_mask_commuted:
862; CHECK:       # %bb.0:
863; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
864; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm1
865; CHECK-NEXT:    #APP
866; CHECK-NEXT:    nop
867; CHECK-NEXT:    #NO_APP
868; CHECK-NEXT:    kmovq %rsi, %k1
869; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm0
870; CHECK-NEXT:    vpaddb {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
871; CHECK-NEXT:    retq
872  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
873  %2 = add <64 x i8> %a1, %a0
874  %3 = bitcast i64 %mask to <64 x i1>
875  ; load needed to keep the operation from being scheduled above the asm block
876  %4 = load <64 x i8>, <64 x i8>* %a2
877  %5 = select <64 x i1> %3, <64 x i8> %2, <64 x i8> %4
878  ret <64 x i8> %5
879}
880
881define <64 x i8> @stack_fold_paddb_maskz(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
882; CHECK-LABEL: stack_fold_paddb_maskz:
883; CHECK:       # %bb.0:
884; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
885; CHECK-NEXT:    #APP
886; CHECK-NEXT:    nop
887; CHECK-NEXT:    #NO_APP
888; CHECK-NEXT:    kmovq %rdi, %k1
889; CHECK-NEXT:    vpaddb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
890; CHECK-NEXT:    retq
891  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
892  %2 = add <64 x i8> %a0, %a1
893  %3 = bitcast i64 %mask to <64 x i1>
894  %4 = select <64 x i1> %3, <64 x i8> %2, <64 x i8> zeroinitializer
895  ret <64 x i8> %4
896}
897
898define <64 x i8> @stack_fold_paddb_maskz_commuted(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
899; CHECK-LABEL: stack_fold_paddb_maskz_commuted:
900; CHECK:       # %bb.0:
901; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
902; CHECK-NEXT:    #APP
903; CHECK-NEXT:    nop
904; CHECK-NEXT:    #NO_APP
905; CHECK-NEXT:    kmovq %rdi, %k1
906; CHECK-NEXT:    vpaddb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
907; CHECK-NEXT:    retq
908  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
909  %2 = add <64 x i8> %a1, %a0
910  %3 = bitcast i64 %mask to <64 x i1>
911  %4 = select <64 x i1> %3, <64 x i8> %2, <64 x i8> zeroinitializer
912  ret <64 x i8> %4
913}
914
915define <16 x i32> @stack_fold_paddd(<16 x i32> %a0, <16 x i32> %a1) {
916; CHECK-LABEL: stack_fold_paddd:
917; CHECK:       # %bb.0:
918; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
919; CHECK-NEXT:    #APP
920; CHECK-NEXT:    nop
921; CHECK-NEXT:    #NO_APP
922; CHECK-NEXT:    vpaddd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
923; CHECK-NEXT:    retq
924  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
925  %2 = add <16 x i32> %a0, %a1
926  ret <16 x i32> %2
927}
928
929define <16 x i32> @stack_fold_paddd_commuted(<16 x i32> %a0, <16 x i32> %a1) {
930; CHECK-LABEL: stack_fold_paddd_commuted:
931; CHECK:       # %bb.0:
932; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
933; CHECK-NEXT:    #APP
934; CHECK-NEXT:    nop
935; CHECK-NEXT:    #NO_APP
936; CHECK-NEXT:    vpaddd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
937; CHECK-NEXT:    retq
938  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
939  %2 = add <16 x i32> %a1, %a0
940  ret <16 x i32> %2
941}
942
943define <16 x i32> @stack_fold_paddd_mask(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %a2, i16 %mask) {
944; CHECK-LABEL: stack_fold_paddd_mask:
945; CHECK:       # %bb.0:
946; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
947; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm1
948; CHECK-NEXT:    #APP
949; CHECK-NEXT:    nop
950; CHECK-NEXT:    #NO_APP
951; CHECK-NEXT:    kmovd %esi, %k1
952; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm0
953; CHECK-NEXT:    vpaddd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
954; CHECK-NEXT:    retq
955  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
956  %2 = add <16 x i32> %a0, %a1
957  %3 = bitcast i16 %mask to <16 x i1>
958  ; load needed to keep the operation from being scheduled about the asm block
959  %4 = load <16 x i32>, <16 x i32>* %a2
960  %5 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> %4
961  ret <16 x i32> %5
962}
963
964define <16 x i32> @stack_fold_paddd_mask_commuted(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %a2, i16 %mask) {
965; CHECK-LABEL: stack_fold_paddd_mask_commuted:
966; CHECK:       # %bb.0:
967; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
968; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm1
969; CHECK-NEXT:    #APP
970; CHECK-NEXT:    nop
971; CHECK-NEXT:    #NO_APP
972; CHECK-NEXT:    kmovd %esi, %k1
973; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm0
974; CHECK-NEXT:    vpaddd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
975; CHECK-NEXT:    retq
976  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
977  %2 = add <16 x i32> %a1, %a0
978  %3 = bitcast i16 %mask to <16 x i1>
979  ; load needed to keep the operation from being scheduled about the asm block
980  %4 = load <16 x i32>, <16 x i32>* %a2
981  %5 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> %4
982  ret <16 x i32> %5
983}
984
985define <16 x i32> @stack_fold_paddd_maskz(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
986; CHECK-LABEL: stack_fold_paddd_maskz:
987; CHECK:       # %bb.0:
988; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
989; CHECK-NEXT:    #APP
990; CHECK-NEXT:    nop
991; CHECK-NEXT:    #NO_APP
992; CHECK-NEXT:    kmovd %edi, %k1
993; CHECK-NEXT:    vpaddd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
994; CHECK-NEXT:    retq
995  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
996  %2 = add <16 x i32> %a0, %a1
997  %3 = bitcast i16 %mask to <16 x i1>
998  %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer
999  ret <16 x i32> %4
1000}
1001
1002define <16 x i32> @stack_fold_paddd_maskz_commuted(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
1003; CHECK-LABEL: stack_fold_paddd_maskz_commuted:
1004; CHECK:       # %bb.0:
1005; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1006; CHECK-NEXT:    #APP
1007; CHECK-NEXT:    nop
1008; CHECK-NEXT:    #NO_APP
1009; CHECK-NEXT:    kmovd %edi, %k1
1010; CHECK-NEXT:    vpaddd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
1011; CHECK-NEXT:    retq
1012  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1013  %2 = add <16 x i32> %a1, %a0
1014  %3 = bitcast i16 %mask to <16 x i1>
1015  %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer
1016  ret <16 x i32> %4
1017}
1018
1019define <8 x i64> @stack_fold_paddq(<8 x i64> %a0, <8 x i64> %a1) {
1020; CHECK-LABEL: stack_fold_paddq:
1021; CHECK:       # %bb.0:
1022; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1023; CHECK-NEXT:    #APP
1024; CHECK-NEXT:    nop
1025; CHECK-NEXT:    #NO_APP
1026; CHECK-NEXT:    vpaddq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
1027; CHECK-NEXT:    retq
1028  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1029  %2 = add <8 x i64> %a0, %a1
1030  ret <8 x i64> %2
1031}
1032
1033define <8 x i64> @stack_fold_paddq_commuted(<8 x i64> %a0, <8 x i64> %a1) {
1034; CHECK-LABEL: stack_fold_paddq_commuted:
1035; CHECK:       # %bb.0:
1036; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1037; CHECK-NEXT:    #APP
1038; CHECK-NEXT:    nop
1039; CHECK-NEXT:    #NO_APP
1040; CHECK-NEXT:    vpaddq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
1041; CHECK-NEXT:    retq
1042  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1043  %2 = add <8 x i64> %a1, %a0
1044  ret <8 x i64> %2
1045}
1046
1047define <8 x i64> @stack_fold_paddq_mask(<8 x i64> %a0, <8 x i64> %a1, <8 x i64>* %a2, i8 %mask) {
1048; CHECK-LABEL: stack_fold_paddq_mask:
1049; CHECK:       # %bb.0:
1050; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1051; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm1
1052; CHECK-NEXT:    #APP
1053; CHECK-NEXT:    nop
1054; CHECK-NEXT:    #NO_APP
1055; CHECK-NEXT:    kmovd %esi, %k1
1056; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm0
1057; CHECK-NEXT:    vpaddq {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
1058; CHECK-NEXT:    retq
1059  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1060  %2 = add <8 x i64> %a0, %a1
1061  %3 = bitcast i8 %mask to <8 x i1>
1062  ; load needed to keep the operation from being scheduled about the asm block
1063  %4 = load <8 x i64>, <8 x i64>* %a2
1064  %5 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> %4
1065  ret <8 x i64> %5
1066}
1067
1068define <8 x i64> @stack_fold_paddq_mask_commuted(<8 x i64> %a0, <8 x i64> %a1, <8 x i64>* %a2, i8 %mask) {
1069; CHECK-LABEL: stack_fold_paddq_mask_commuted:
1070; CHECK:       # %bb.0:
1071; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1072; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm1
1073; CHECK-NEXT:    #APP
1074; CHECK-NEXT:    nop
1075; CHECK-NEXT:    #NO_APP
1076; CHECK-NEXT:    kmovd %esi, %k1
1077; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm0
1078; CHECK-NEXT:    vpaddq {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
1079; CHECK-NEXT:    retq
1080  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1081  %2 = add <8 x i64> %a1, %a0
1082  %3 = bitcast i8 %mask to <8 x i1>
1083  ; load needed to keep the operation from being scheduled about the asm block
1084  %4 = load <8 x i64>, <8 x i64>* %a2
1085  %5 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> %4
1086  ret <8 x i64> %5
1087}
1088
1089define <8 x i64> @stack_fold_paddq_maskz(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
1090; CHECK-LABEL: stack_fold_paddq_maskz:
1091; CHECK:       # %bb.0:
1092; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1093; CHECK-NEXT:    #APP
1094; CHECK-NEXT:    nop
1095; CHECK-NEXT:    #NO_APP
1096; CHECK-NEXT:    kmovd %edi, %k1
1097; CHECK-NEXT:    vpaddq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
1098; CHECK-NEXT:    retq
1099  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1100  %2 = add <8 x i64> %a0, %a1
1101  %3 = bitcast i8 %mask to <8 x i1>
1102  %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> zeroinitializer
1103  ret <8 x i64> %4
1104}
1105
1106define <8 x i64> @stack_fold_paddq_maskz_commuted(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
1107; CHECK-LABEL: stack_fold_paddq_maskz_commuted:
1108; CHECK:       # %bb.0:
1109; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1110; CHECK-NEXT:    #APP
1111; CHECK-NEXT:    nop
1112; CHECK-NEXT:    #NO_APP
1113; CHECK-NEXT:    kmovd %edi, %k1
1114; CHECK-NEXT:    vpaddq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
1115; CHECK-NEXT:    retq
1116  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1117  %2 = add <8 x i64> %a1, %a0
1118  %3 = bitcast i8 %mask to <8 x i1>
1119  %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> zeroinitializer
1120  ret <8 x i64> %4
1121}
1122
1123define <64 x i8> @stack_fold_paddsb(<64 x i8> %a0, <64 x i8> %a1) {
1124; CHECK-LABEL: stack_fold_paddsb:
1125; CHECK:       # %bb.0:
1126; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1127; CHECK-NEXT:    #APP
1128; CHECK-NEXT:    nop
1129; CHECK-NEXT:    #NO_APP
1130; CHECK-NEXT:    vpaddsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
1131; CHECK-NEXT:    retq
1132  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1133  %2 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> %a0, <64 x i8> %a1)
1134  ret <64 x i8> %2
1135}
1136
1137define <64 x i8> @stack_fold_paddsb_commuted(<64 x i8> %a0, <64 x i8> %a1) {
1138; CHECK-LABEL: stack_fold_paddsb_commuted:
1139; CHECK:       # %bb.0:
1140; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1141; CHECK-NEXT:    #APP
1142; CHECK-NEXT:    nop
1143; CHECK-NEXT:    #NO_APP
1144; CHECK-NEXT:    vpaddsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
1145; CHECK-NEXT:    retq
1146  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1147  %2 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> %a1, <64 x i8> %a0)
1148  ret <64 x i8> %2
1149}
1150
1151define <64 x i8> @stack_fold_paddsb_mask(<64 x i8> %a0, <64 x i8> %a1, <64 x i8>* %a2, i64 %mask) {
1152; CHECK-LABEL: stack_fold_paddsb_mask:
1153; CHECK:       # %bb.0:
1154; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1155; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm1
1156; CHECK-NEXT:    #APP
1157; CHECK-NEXT:    nop
1158; CHECK-NEXT:    #NO_APP
1159; CHECK-NEXT:    kmovq %rsi, %k1
1160; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm0
1161; CHECK-NEXT:    vpaddsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
1162; CHECK-NEXT:    retq
1163  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1164  %2 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> %a0, <64 x i8> %a1)
1165  %3 = bitcast i64 %mask to <64 x i1>
1166  ; load needed to keep the operation from being scheduled about the asm block
1167  %4 = load <64 x i8>, <64 x i8>* %a2
1168  %5 = select <64 x i1> %3, <64 x i8> %2, <64 x i8> %4
1169  ret <64 x i8> %5
1170}
1171
1172define <64 x i8> @stack_fold_paddsb_mask_commuted(<64 x i8> %a0, <64 x i8> %a1, <64 x i8>* %a2, i64 %mask) {
1173; CHECK-LABEL: stack_fold_paddsb_mask_commuted:
1174; CHECK:       # %bb.0:
1175; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1176; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm1
1177; CHECK-NEXT:    #APP
1178; CHECK-NEXT:    nop
1179; CHECK-NEXT:    #NO_APP
1180; CHECK-NEXT:    kmovq %rsi, %k1
1181; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm0
1182; CHECK-NEXT:    vpaddsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
1183; CHECK-NEXT:    retq
1184  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1185  %2 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> %a1, <64 x i8> %a0)
1186  %3 = bitcast i64 %mask to <64 x i1>
1187  ; load needed to keep the operation from being scheduled about the asm block
1188  %4 = load <64 x i8>, <64 x i8>* %a2
1189  %5 = select <64 x i1> %3, <64 x i8> %2, <64 x i8> %4
1190  ret <64 x i8> %5
1191}
1192
1193define <64 x i8> @stack_fold_paddsb_maskz(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
1194; CHECK-LABEL: stack_fold_paddsb_maskz:
1195; CHECK:       # %bb.0:
1196; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1197; CHECK-NEXT:    #APP
1198; CHECK-NEXT:    nop
1199; CHECK-NEXT:    #NO_APP
1200; CHECK-NEXT:    kmovq %rdi, %k1
1201; CHECK-NEXT:    vpaddsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
1202; CHECK-NEXT:    retq
1203  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1204  %2 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> %a0, <64 x i8> %a1)
1205  %3 = bitcast i64 %mask to <64 x i1>
1206  %4 = select <64 x i1> %3, <64 x i8> %2, <64 x i8> zeroinitializer
1207  ret <64 x i8> %4
1208}
1209
1210define <64 x i8> @stack_fold_paddsb_maskz_commuted(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
1211; CHECK-LABEL: stack_fold_paddsb_maskz_commuted:
1212; CHECK:       # %bb.0:
1213; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1214; CHECK-NEXT:    #APP
1215; CHECK-NEXT:    nop
1216; CHECK-NEXT:    #NO_APP
1217; CHECK-NEXT:    kmovq %rdi, %k1
1218; CHECK-NEXT:    vpaddsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
1219; CHECK-NEXT:    retq
1220  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1221  %2 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> %a1, <64 x i8> %a0)
1222  %3 = bitcast i64 %mask to <64 x i1>
1223  %4 = select <64 x i1> %3, <64 x i8> %2, <64 x i8> zeroinitializer
1224  ret <64 x i8> %4
1225}
1226
1227define <32 x i16> @stack_fold_paddsw(<32 x i16> %a0, <32 x i16> %a1) {
1228; CHECK-LABEL: stack_fold_paddsw:
1229; CHECK:       # %bb.0:
1230; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1231; CHECK-NEXT:    #APP
1232; CHECK-NEXT:    nop
1233; CHECK-NEXT:    #NO_APP
1234; CHECK-NEXT:    vpaddsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
1235; CHECK-NEXT:    retq
1236  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1237  %2 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> %a0, <32 x i16> %a1)
1238  ret <32 x i16> %2
1239}
1240
1241define <32 x i16> @stack_fold_paddsw_commuted(<32 x i16> %a0, <32 x i16> %a1) {
1242; CHECK-LABEL: stack_fold_paddsw_commuted:
1243; CHECK:       # %bb.0:
1244; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1245; CHECK-NEXT:    #APP
1246; CHECK-NEXT:    nop
1247; CHECK-NEXT:    #NO_APP
1248; CHECK-NEXT:    vpaddsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
1249; CHECK-NEXT:    retq
1250  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1251  %2 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> %a1, <32 x i16> %a0)
1252  ret <32 x i16> %2
1253}
1254
1255define <32 x i16> @stack_fold_paddsw_mask(<32 x i16> %a0, <32 x i16> %a1, <32 x i16>* %a2, i32 %mask) {
1256; CHECK-LABEL: stack_fold_paddsw_mask:
1257; CHECK:       # %bb.0:
1258; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1259; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm1
1260; CHECK-NEXT:    #APP
1261; CHECK-NEXT:    nop
1262; CHECK-NEXT:    #NO_APP
1263; CHECK-NEXT:    kmovd %esi, %k1
1264; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm0
1265; CHECK-NEXT:    vpaddsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
1266; CHECK-NEXT:    retq
1267  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1268  %2 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> %a0, <32 x i16> %a1)
1269  %3 = bitcast i32 %mask to <32 x i1>
1270  ; load needed to keep the operation from being scheduled about the asm block
1271  %4 = load <32 x i16>, <32 x i16>* %a2
1272  %5 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> %4
1273  ret <32 x i16> %5
1274}
1275
1276define <32 x i16> @stack_fold_paddsw_mask_commuted(<32 x i16> %a0, <32 x i16> %a1, <32 x i16>* %a2, i32 %mask) {
1277; CHECK-LABEL: stack_fold_paddsw_mask_commuted:
1278; CHECK:       # %bb.0:
1279; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1280; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm1
1281; CHECK-NEXT:    #APP
1282; CHECK-NEXT:    nop
1283; CHECK-NEXT:    #NO_APP
1284; CHECK-NEXT:    kmovd %esi, %k1
1285; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm0
1286; CHECK-NEXT:    vpaddsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
1287; CHECK-NEXT:    retq
1288  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1289  %2 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> %a1, <32 x i16> %a0)
1290  %3 = bitcast i32 %mask to <32 x i1>
1291  ; load needed to keep the operation from being scheduled about the asm block
1292  %4 = load <32 x i16>, <32 x i16>* %a2
1293  %5 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> %4
1294  ret <32 x i16> %5
1295}
1296
1297define <32 x i16> @stack_fold_paddsw_maskz(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) {
1298; CHECK-LABEL: stack_fold_paddsw_maskz:
1299; CHECK:       # %bb.0:
1300; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1301; CHECK-NEXT:    #APP
1302; CHECK-NEXT:    nop
1303; CHECK-NEXT:    #NO_APP
1304; CHECK-NEXT:    kmovd %edi, %k1
1305; CHECK-NEXT:    vpaddsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
1306; CHECK-NEXT:    retq
1307  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1308  %2 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> %a0, <32 x i16> %a1)
1309  %3 = bitcast i32 %mask to <32 x i1>
1310  %4 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> zeroinitializer
1311  ret <32 x i16> %4
1312}
1313
1314define <32 x i16> @stack_fold_paddsw_maskz_commuted(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) {
1315; CHECK-LABEL: stack_fold_paddsw_maskz_commuted:
1316; CHECK:       # %bb.0:
1317; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1318; CHECK-NEXT:    #APP
1319; CHECK-NEXT:    nop
1320; CHECK-NEXT:    #NO_APP
1321; CHECK-NEXT:    kmovd %edi, %k1
1322; CHECK-NEXT:    vpaddsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
1323; CHECK-NEXT:    retq
1324  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1325  %2 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> %a1, <32 x i16> %a0)
1326  %3 = bitcast i32 %mask to <32 x i1>
1327  %4 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> zeroinitializer
1328  ret <32 x i16> %4
1329}
1330
1331define <64 x i8> @stack_fold_paddusb(<64 x i8> %a0, <64 x i8> %a1) {
1332; CHECK-LABEL: stack_fold_paddusb:
1333; CHECK:       # %bb.0:
1334; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1335; CHECK-NEXT:    #APP
1336; CHECK-NEXT:    nop
1337; CHECK-NEXT:    #NO_APP
1338; CHECK-NEXT:    vpaddusb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
1339; CHECK-NEXT:    retq
1340  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1341  %2 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> %a0, <64 x i8> %a1)
1342  ret <64 x i8> %2
1343}
1344
1345define <64 x i8> @stack_fold_paddusb_commuted(<64 x i8> %a0, <64 x i8> %a1) {
1346; CHECK-LABEL: stack_fold_paddusb_commuted:
1347; CHECK:       # %bb.0:
1348; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1349; CHECK-NEXT:    #APP
1350; CHECK-NEXT:    nop
1351; CHECK-NEXT:    #NO_APP
1352; CHECK-NEXT:    vpaddusb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
1353; CHECK-NEXT:    retq
1354  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1355  %2 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> %a1, <64 x i8> %a0)
1356  ret <64 x i8> %2
1357}
1358
1359define <64 x i8> @stack_fold_paddusb_mask(<64 x i8> %a0, <64 x i8> %a1, <64 x i8>* %a2, i64 %mask) {
1360; CHECK-LABEL: stack_fold_paddusb_mask:
1361; CHECK:       # %bb.0:
1362; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1363; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm1
1364; CHECK-NEXT:    #APP
1365; CHECK-NEXT:    nop
1366; CHECK-NEXT:    #NO_APP
1367; CHECK-NEXT:    kmovq %rsi, %k1
1368; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm0
1369; CHECK-NEXT:    vpaddusb {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
1370; CHECK-NEXT:    retq
1371  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1372  %2 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> %a0, <64 x i8> %a1)
1373  %3 = bitcast i64 %mask to <64 x i1>
1374  ; load needed to keep the operation from being scheduled about the asm block
1375  %4 = load <64 x i8>, <64 x i8>* %a2
1376  %5 = select <64 x i1> %3, <64 x i8> %2, <64 x i8> %4
1377  ret <64 x i8> %5
1378}
1379
1380define <64 x i8> @stack_fold_paddusb_mask_commuted(<64 x i8> %a0, <64 x i8> %a1, <64 x i8>* %a2, i64 %mask) {
1381; CHECK-LABEL: stack_fold_paddusb_mask_commuted:
1382; CHECK:       # %bb.0:
1383; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1384; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm1
1385; CHECK-NEXT:    #APP
1386; CHECK-NEXT:    nop
1387; CHECK-NEXT:    #NO_APP
1388; CHECK-NEXT:    kmovq %rsi, %k1
1389; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm0
1390; CHECK-NEXT:    vpaddusb {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
1391; CHECK-NEXT:    retq
1392  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1393  %2 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> %a1, <64 x i8> %a0)
1394  %3 = bitcast i64 %mask to <64 x i1>
1395  ; load needed to keep the operation from being scheduled about the asm block
1396  %4 = load <64 x i8>, <64 x i8>* %a2
1397  %5 = select <64 x i1> %3, <64 x i8> %2, <64 x i8> %4
1398  ret <64 x i8> %5
1399}
1400
1401define <64 x i8> @stack_fold_paddusb_maskz(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
1402; CHECK-LABEL: stack_fold_paddusb_maskz:
1403; CHECK:       # %bb.0:
1404; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1405; CHECK-NEXT:    #APP
1406; CHECK-NEXT:    nop
1407; CHECK-NEXT:    #NO_APP
1408; CHECK-NEXT:    kmovq %rdi, %k1
1409; CHECK-NEXT:    vpaddusb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
1410; CHECK-NEXT:    retq
1411  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1412  %2 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> %a0, <64 x i8> %a1)
1413  %3 = bitcast i64 %mask to <64 x i1>
1414  %4 = select <64 x i1> %3, <64 x i8> %2, <64 x i8> zeroinitializer
1415  ret <64 x i8> %4
1416}
1417
1418define <64 x i8> @stack_fold_paddusb_maskz_commuted(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
1419; CHECK-LABEL: stack_fold_paddusb_maskz_commuted:
1420; CHECK:       # %bb.0:
1421; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1422; CHECK-NEXT:    #APP
1423; CHECK-NEXT:    nop
1424; CHECK-NEXT:    #NO_APP
1425; CHECK-NEXT:    kmovq %rdi, %k1
1426; CHECK-NEXT:    vpaddusb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
1427; CHECK-NEXT:    retq
1428  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1429  %2 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> %a1, <64 x i8> %a0)
1430  %3 = bitcast i64 %mask to <64 x i1>
1431  %4 = select <64 x i1> %3, <64 x i8> %2, <64 x i8> zeroinitializer
1432  ret <64 x i8> %4
1433}
1434
1435define <32 x i16> @stack_fold_paddusw(<32 x i16> %a0, <32 x i16> %a1) {
1436; CHECK-LABEL: stack_fold_paddusw:
1437; CHECK:       # %bb.0:
1438; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1439; CHECK-NEXT:    #APP
1440; CHECK-NEXT:    nop
1441; CHECK-NEXT:    #NO_APP
1442; CHECK-NEXT:    vpaddusw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
1443; CHECK-NEXT:    retq
1444  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1445  %2 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> %a0, <32 x i16> %a1)
1446  ret <32 x i16> %2
1447}
1448
1449define <32 x i16> @stack_fold_paddusw_commuted(<32 x i16> %a0, <32 x i16> %a1) {
1450; CHECK-LABEL: stack_fold_paddusw_commuted:
1451; CHECK:       # %bb.0:
1452; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1453; CHECK-NEXT:    #APP
1454; CHECK-NEXT:    nop
1455; CHECK-NEXT:    #NO_APP
1456; CHECK-NEXT:    vpaddusw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
1457; CHECK-NEXT:    retq
1458  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1459  %2 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> %a1, <32 x i16> %a0)
1460  ret <32 x i16> %2
1461}
1462
1463define <32 x i16> @stack_fold_paddusw_mask(<32 x i16> %a0, <32 x i16> %a1, <32 x i16>* %a2, i32 %mask) {
1464; CHECK-LABEL: stack_fold_paddusw_mask:
1465; CHECK:       # %bb.0:
1466; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1467; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm1
1468; CHECK-NEXT:    #APP
1469; CHECK-NEXT:    nop
1470; CHECK-NEXT:    #NO_APP
1471; CHECK-NEXT:    kmovd %esi, %k1
1472; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm0
1473; CHECK-NEXT:    vpaddusw {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
1474; CHECK-NEXT:    retq
1475  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1476  %2 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> %a0, <32 x i16> %a1)
1477  %3 = bitcast i32 %mask to <32 x i1>
1478  ; load needed to keep the operation from being scheduled about the asm block
1479  %4 = load <32 x i16>, <32 x i16>* %a2
1480  %5 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> %4
1481  ret <32 x i16> %5
1482}
1483
1484define <32 x i16> @stack_fold_paddusw_mask_commuted(<32 x i16> %a0, <32 x i16> %a1, <32 x i16>* %a2, i32 %mask) {
1485; CHECK-LABEL: stack_fold_paddusw_mask_commuted:
1486; CHECK:       # %bb.0:
1487; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1488; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm1
1489; CHECK-NEXT:    #APP
1490; CHECK-NEXT:    nop
1491; CHECK-NEXT:    #NO_APP
1492; CHECK-NEXT:    kmovd %esi, %k1
1493; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm0
1494; CHECK-NEXT:    vpaddusw {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
1495; CHECK-NEXT:    retq
1496  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1497  %2 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> %a1, <32 x i16> %a0)
1498  %3 = bitcast i32 %mask to <32 x i1>
1499  ; load needed to keep the operation from being scheduled about the asm block
1500  %4 = load <32 x i16>, <32 x i16>* %a2
1501  %5 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> %4
1502  ret <32 x i16> %5
1503}
1504
1505define <32 x i16> @stack_fold_paddusw_maskz(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) {
1506; CHECK-LABEL: stack_fold_paddusw_maskz:
1507; CHECK:       # %bb.0:
1508; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1509; CHECK-NEXT:    #APP
1510; CHECK-NEXT:    nop
1511; CHECK-NEXT:    #NO_APP
1512; CHECK-NEXT:    kmovd %edi, %k1
1513; CHECK-NEXT:    vpaddusw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
1514; CHECK-NEXT:    retq
1515  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1516  %2 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> %a0, <32 x i16> %a1)
1517  %3 = bitcast i32 %mask to <32 x i1>
1518  %4 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> zeroinitializer
1519  ret <32 x i16> %4
1520}
1521
1522define <32 x i16> @stack_fold_paddusw_maskz_commuted(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) {
1523; CHECK-LABEL: stack_fold_paddusw_maskz_commuted:
1524; CHECK:       # %bb.0:
1525; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1526; CHECK-NEXT:    #APP
1527; CHECK-NEXT:    nop
1528; CHECK-NEXT:    #NO_APP
1529; CHECK-NEXT:    kmovd %edi, %k1
1530; CHECK-NEXT:    vpaddusw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
1531; CHECK-NEXT:    retq
1532  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1533  %2 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> %a1, <32 x i16> %a0)
1534  %3 = bitcast i32 %mask to <32 x i1>
1535  %4 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> zeroinitializer
1536  ret <32 x i16> %4
1537}
1538
1539define <32 x i16> @stack_fold_paddw(<32 x i16> %a0, <32 x i16> %a1) {
1540; CHECK-LABEL: stack_fold_paddw:
1541; CHECK:       # %bb.0:
1542; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1543; CHECK-NEXT:    #APP
1544; CHECK-NEXT:    nop
1545; CHECK-NEXT:    #NO_APP
1546; CHECK-NEXT:    vpaddw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
1547; CHECK-NEXT:    retq
1548  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1549  %2 = add <32 x i16> %a0, %a1
1550  ret <32 x i16> %2
1551}
1552
1553define <32 x i16> @stack_fold_paddw_commuted(<32 x i16> %a0, <32 x i16> %a1) {
1554; CHECK-LABEL: stack_fold_paddw_commuted:
1555; CHECK:       # %bb.0:
1556; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1557; CHECK-NEXT:    #APP
1558; CHECK-NEXT:    nop
1559; CHECK-NEXT:    #NO_APP
1560; CHECK-NEXT:    vpaddw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
1561; CHECK-NEXT:    retq
1562  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1563  %2 = add <32 x i16> %a1, %a0
1564  ret <32 x i16> %2
1565}
1566
1567define <32 x i16> @stack_fold_paddw_mask(<32 x i16> %a0, <32 x i16> %a1, <32 x i16>* %a2, i32 %mask) {
1568; CHECK-LABEL: stack_fold_paddw_mask:
1569; CHECK:       # %bb.0:
1570; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1571; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm1
1572; CHECK-NEXT:    #APP
1573; CHECK-NEXT:    nop
1574; CHECK-NEXT:    #NO_APP
1575; CHECK-NEXT:    kmovd %esi, %k1
1576; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm0
1577; CHECK-NEXT:    vpaddw {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
1578; CHECK-NEXT:    retq
1579  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1580  %2 = add <32 x i16> %a0, %a1
1581  %3 = bitcast i32 %mask to <32 x i1>
1582  ; load needed to keep the operation from being scheduled about the asm block
1583  %4 = load <32 x i16>, <32 x i16>* %a2
1584  %5 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> %4
1585  ret <32 x i16> %5
1586}
1587
1588define <32 x i16> @stack_fold_paddw_mask_commuted(<32 x i16> %a0, <32 x i16> %a1, <32 x i16>* %a2, i32 %mask) {
1589; CHECK-LABEL: stack_fold_paddw_mask_commuted:
1590; CHECK:       # %bb.0:
1591; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1592; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm1
1593; CHECK-NEXT:    #APP
1594; CHECK-NEXT:    nop
1595; CHECK-NEXT:    #NO_APP
1596; CHECK-NEXT:    kmovd %esi, %k1
1597; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm0
1598; CHECK-NEXT:    vpaddw {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
1599; CHECK-NEXT:    retq
1600  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1601  %2 = add <32 x i16> %a1, %a0
1602  %3 = bitcast i32 %mask to <32 x i1>
1603  ; load needed to keep the operation from being scheduled about the asm block
1604  %4 = load <32 x i16>, <32 x i16>* %a2
1605  %5 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> %4
1606  ret <32 x i16> %5
1607}
1608
1609define <32 x i16> @stack_fold_paddw_maskz(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) {
1610; CHECK-LABEL: stack_fold_paddw_maskz:
1611; CHECK:       # %bb.0:
1612; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1613; CHECK-NEXT:    #APP
1614; CHECK-NEXT:    nop
1615; CHECK-NEXT:    #NO_APP
1616; CHECK-NEXT:    kmovd %edi, %k1
1617; CHECK-NEXT:    vpaddw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
1618; CHECK-NEXT:    retq
1619  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1620  %2 = add <32 x i16> %a0, %a1
1621  %3 = bitcast i32 %mask to <32 x i1>
1622  %4 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> zeroinitializer
1623  ret <32 x i16> %4
1624}
1625
1626define <32 x i16> @stack_fold_paddw_maskz_commuted(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) {
1627; CHECK-LABEL: stack_fold_paddw_maskz_commuted:
1628; CHECK:       # %bb.0:
1629; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1630; CHECK-NEXT:    #APP
1631; CHECK-NEXT:    nop
1632; CHECK-NEXT:    #NO_APP
1633; CHECK-NEXT:    kmovd %edi, %k1
1634; CHECK-NEXT:    vpaddw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
1635; CHECK-NEXT:    retq
1636  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1637  %2 = add <32 x i16> %a1, %a0
1638  %3 = bitcast i32 %mask to <32 x i1>
1639  %4 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> zeroinitializer
1640  ret <32 x i16> %4
1641}
1642
1643define <64 x i8> @stack_fold_palignr(<64 x i8> %a0, <64 x i8> %a1) {
1644; CHECK-LABEL: stack_fold_palignr:
1645; CHECK:       # %bb.0:
1646; CHECK-NEXT:    subq $56, %rsp
1647; CHECK-NEXT:    .cfi_def_cfa_offset 64
1648; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1649; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1650; CHECK-NEXT:    #APP
1651; CHECK-NEXT:    nop
1652; CHECK-NEXT:    #NO_APP
1653; CHECK-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
1654; CHECK-NEXT:    vpalignr $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
1655; CHECK-NEXT:    # zmm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm0[0],mem[17,18,19,20,21,22,23,24,25,26,27,28,29,30,31],zmm0[16],mem[33,34,35,36,37,38,39,40,41,42,43,44,45,46,47],zmm0[32],mem[49,50,51,52,53,54,55,56,57,58,59,60,61,62,63],zmm0[48]
1656; CHECK-NEXT:    addq $56, %rsp
1657; CHECK-NEXT:    .cfi_def_cfa_offset 8
1658; CHECK-NEXT:    retq
1659  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1660  %2 = shufflevector <64 x i8> %a1, <64 x i8> %a0, <64 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 64, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 80, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 96, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 112>
1661  ret <64 x i8> %2
1662}
1663
1664define <64 x i8> @stack_fold_palignr_mask(<64 x i8> %a0, <64 x i8> %a1, <64 x i8>* %passthru, i64 %mask) {
1665; CHECK-LABEL: stack_fold_palignr_mask:
1666; CHECK:       # %bb.0:
1667; CHECK-NEXT:    subq $56, %rsp
1668; CHECK-NEXT:    .cfi_def_cfa_offset 64
1669; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1670; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1671; CHECK-NEXT:    #APP
1672; CHECK-NEXT:    nop
1673; CHECK-NEXT:    #NO_APP
1674; CHECK-NEXT:    kmovq %rsi, %k1
1675; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm1
1676; CHECK-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
1677; CHECK-NEXT:    vpalignr $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 {%k1} # 64-byte Folded Reload
1678; CHECK-NEXT:    # zmm1 {%k1} = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm0[0],mem[17,18,19,20,21,22,23,24,25,26,27,28,29,30,31],zmm0[16],mem[33,34,35,36,37,38,39,40,41,42,43,44,45,46,47],zmm0[32],mem[49,50,51,52,53,54,55,56,57,58,59,60,61,62,63],zmm0[48]
1679; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
1680; CHECK-NEXT:    addq $56, %rsp
1681; CHECK-NEXT:    .cfi_def_cfa_offset 8
1682; CHECK-NEXT:    retq
1683  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1684  %2 = shufflevector <64 x i8> %a1, <64 x i8> %a0, <64 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 64, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 80, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 96, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 112>
1685  %3 = bitcast i64 %mask to <64 x i1>
1686  %4 = load <64 x i8>, <64 x i8>* %passthru
1687  %5 = select <64 x i1> %3, <64 x i8> %2, <64 x i8> %4
1688  ret <64 x i8> %5
1689}
1690
1691define <64 x i8> @stack_fold_palignr_maskz(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
1692; CHECK-LABEL: stack_fold_palignr_maskz:
1693; CHECK:       # %bb.0:
1694; CHECK-NEXT:    subq $56, %rsp
1695; CHECK-NEXT:    .cfi_def_cfa_offset 64
1696; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1697; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1698; CHECK-NEXT:    #APP
1699; CHECK-NEXT:    nop
1700; CHECK-NEXT:    #NO_APP
1701; CHECK-NEXT:    kmovq %rdi, %k1
1702; CHECK-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
1703; CHECK-NEXT:    vpalignr $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
1704; CHECK-NEXT:    # zmm0 {%k1} {z} = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm0[0],mem[17,18,19,20,21,22,23,24,25,26,27,28,29,30,31],zmm0[16],mem[33,34,35,36,37,38,39,40,41,42,43,44,45,46,47],zmm0[32],mem[49,50,51,52,53,54,55,56,57,58,59,60,61,62,63],zmm0[48]
1705; CHECK-NEXT:    addq $56, %rsp
1706; CHECK-NEXT:    .cfi_def_cfa_offset 8
1707; CHECK-NEXT:    retq
1708  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1709  %2 = shufflevector <64 x i8> %a1, <64 x i8> %a0, <64 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 64, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 80, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 96, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 112>
1710  %3 = bitcast i64 %mask to <64 x i1>
1711  %4 = select <64 x i1> %3, <64 x i8> %2, <64 x i8> zeroinitializer
1712  ret <64 x i8> %4
1713}
1714
1715define <16 x i32> @stack_fold_pandd(<16 x i32> %a0, <16 x i32> %a1) {
1716; CHECK-LABEL: stack_fold_pandd:
1717; CHECK:       # %bb.0:
1718; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1719; CHECK-NEXT:    #APP
1720; CHECK-NEXT:    nop
1721; CHECK-NEXT:    #NO_APP
1722; CHECK-NEXT:    vandps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
1723; CHECK-NEXT:    retq
1724  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1725  %2 = and <16 x i32> %a0, %a1
1726  ret <16 x i32> %2
1727}
1728
1729define <16 x i32> @stack_fold_pandd_commuted(<16 x i32> %a0, <16 x i32> %a1) {
1730; CHECK-LABEL: stack_fold_pandd_commuted:
1731; CHECK:       # %bb.0:
1732; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1733; CHECK-NEXT:    #APP
1734; CHECK-NEXT:    nop
1735; CHECK-NEXT:    #NO_APP
1736; CHECK-NEXT:    vandps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
1737; CHECK-NEXT:    retq
1738  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1739  %2 = and <16 x i32> %a1, %a0
1740  ret <16 x i32> %2
1741}
1742
1743define <16 x i32> @stack_fold_pandd_mask(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %a2, i16 %mask) {
1744; CHECK-LABEL: stack_fold_pandd_mask:
1745; CHECK:       # %bb.0:
1746; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1747; CHECK-NEXT:    vmovaps %zmm0, %zmm1
1748; CHECK-NEXT:    #APP
1749; CHECK-NEXT:    nop
1750; CHECK-NEXT:    #NO_APP
1751; CHECK-NEXT:    kmovd %esi, %k1
1752; CHECK-NEXT:    vmovaps (%rdi), %zmm0
1753; CHECK-NEXT:    vandps {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
1754; CHECK-NEXT:    retq
1755  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1756  %2 = and <16 x i32> %a0, %a1
1757  %3 = bitcast i16 %mask to <16 x i1>
1758  ; load needed to keep the operation from being scheduled about the asm block
1759  %4 = load <16 x i32>, <16 x i32>* %a2
1760  %5 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> %4
1761  ret <16 x i32> %5
1762}
1763
1764define <16 x i32> @stack_fold_pandd_mask_commuted(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %a2, i16 %mask) {
1765; CHECK-LABEL: stack_fold_pandd_mask_commuted:
1766; CHECK:       # %bb.0:
1767; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1768; CHECK-NEXT:    vmovaps %zmm0, %zmm1
1769; CHECK-NEXT:    #APP
1770; CHECK-NEXT:    nop
1771; CHECK-NEXT:    #NO_APP
1772; CHECK-NEXT:    kmovd %esi, %k1
1773; CHECK-NEXT:    vmovaps (%rdi), %zmm0
1774; CHECK-NEXT:    vandps {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
1775; CHECK-NEXT:    retq
1776  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1777  %2 = and <16 x i32> %a1, %a0
1778  %3 = bitcast i16 %mask to <16 x i1>
1779  ; load needed to keep the operation from being scheduled about the asm block
1780  %4 = load <16 x i32>, <16 x i32>* %a2
1781  %5 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> %4
1782  ret <16 x i32> %5
1783}
1784
1785define <16 x i32> @stack_fold_pandd_maskz(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
1786; CHECK-LABEL: stack_fold_pandd_maskz:
1787; CHECK:       # %bb.0:
1788; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1789; CHECK-NEXT:    #APP
1790; CHECK-NEXT:    nop
1791; CHECK-NEXT:    #NO_APP
1792; CHECK-NEXT:    kmovd %edi, %k1
1793; CHECK-NEXT:    vandps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
1794; CHECK-NEXT:    retq
1795  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1796  %2 = and <16 x i32> %a0, %a1
1797  %3 = bitcast i16 %mask to <16 x i1>
1798  %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer
1799  ret <16 x i32> %4
1800}
1801
1802define <16 x i32> @stack_fold_pandd_maskz_commuted(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
1803; CHECK-LABEL: stack_fold_pandd_maskz_commuted:
1804; CHECK:       # %bb.0:
1805; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1806; CHECK-NEXT:    #APP
1807; CHECK-NEXT:    nop
1808; CHECK-NEXT:    #NO_APP
1809; CHECK-NEXT:    kmovd %edi, %k1
1810; CHECK-NEXT:    vandps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
1811; CHECK-NEXT:    retq
1812  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1813  %2 = and <16 x i32> %a1, %a0
1814  %3 = bitcast i16 %mask to <16 x i1>
1815  %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer
1816  ret <16 x i32> %4
1817}
1818
1819define <8 x i64> @stack_fold_pandq(<8 x i64> %a0, <8 x i64> %a1) {
1820; CHECK-LABEL: stack_fold_pandq:
1821; CHECK:       # %bb.0:
1822; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1823; CHECK-NEXT:    #APP
1824; CHECK-NEXT:    nop
1825; CHECK-NEXT:    #NO_APP
1826; CHECK-NEXT:    vandps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
1827; CHECK-NEXT:    retq
1828  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1829  %2 = and <8 x i64> %a0, %a1
1830  ret <8 x i64> %2
1831}
1832
1833define <8 x i64> @stack_fold_pandq_commuted(<8 x i64> %a0, <8 x i64> %a1) {
1834; CHECK-LABEL: stack_fold_pandq_commuted:
1835; CHECK:       # %bb.0:
1836; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1837; CHECK-NEXT:    #APP
1838; CHECK-NEXT:    nop
1839; CHECK-NEXT:    #NO_APP
1840; CHECK-NEXT:    vandps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
1841; CHECK-NEXT:    retq
1842  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1843  %2 = and <8 x i64> %a1, %a0
1844  ret <8 x i64> %2
1845}
1846
1847define <8 x i64> @stack_fold_pandq_mask(<8 x i64> %a0, <8 x i64> %a1, <8 x i64>* %a2, i8 %mask) {
1848; CHECK-LABEL: stack_fold_pandq_mask:
1849; CHECK:       # %bb.0:
1850; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1851; CHECK-NEXT:    vmovapd %zmm0, %zmm1
1852; CHECK-NEXT:    #APP
1853; CHECK-NEXT:    nop
1854; CHECK-NEXT:    #NO_APP
1855; CHECK-NEXT:    kmovd %esi, %k1
1856; CHECK-NEXT:    vmovapd (%rdi), %zmm0
1857; CHECK-NEXT:    vandpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
1858; CHECK-NEXT:    retq
1859  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1860  %2 = and <8 x i64> %a0, %a1
1861  %3 = bitcast i8 %mask to <8 x i1>
1862  ; load needed to keep the operation from being scheduled about the asm block
1863  %4 = load <8 x i64>, <8 x i64>* %a2
1864  %5 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> %4
1865  ret <8 x i64> %5
1866}
1867
1868define <8 x i64> @stack_fold_pandq_mask_commuted(<8 x i64> %a0, <8 x i64> %a1, <8 x i64>* %a2, i8 %mask) {
1869; CHECK-LABEL: stack_fold_pandq_mask_commuted:
1870; CHECK:       # %bb.0:
1871; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1872; CHECK-NEXT:    vmovapd %zmm0, %zmm1
1873; CHECK-NEXT:    #APP
1874; CHECK-NEXT:    nop
1875; CHECK-NEXT:    #NO_APP
1876; CHECK-NEXT:    kmovd %esi, %k1
1877; CHECK-NEXT:    vmovapd (%rdi), %zmm0
1878; CHECK-NEXT:    vandpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
1879; CHECK-NEXT:    retq
1880  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1881  %2 = and <8 x i64> %a1, %a0
1882  %3 = bitcast i8 %mask to <8 x i1>
1883  ; load needed to keep the operation from being scheduled above the asm block
1884  %4 = load <8 x i64>, <8 x i64>* %a2
1885  %5 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> %4
1886  ret <8 x i64> %5
1887}
1888
1889define <8 x i64> @stack_fold_pandq_maskz(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
1890; CHECK-LABEL: stack_fold_pandq_maskz:
1891; CHECK:       # %bb.0:
1892; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1893; CHECK-NEXT:    #APP
1894; CHECK-NEXT:    nop
1895; CHECK-NEXT:    #NO_APP
1896; CHECK-NEXT:    kmovd %edi, %k1
1897; CHECK-NEXT:    vandpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
1898; CHECK-NEXT:    retq
1899  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1900  %2 = and <8 x i64> %a0, %a1
1901  %3 = bitcast i8 %mask to <8 x i1>
1902  %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> zeroinitializer
1903  ret <8 x i64> %4
1904}
1905
1906define <8 x i64> @stack_fold_pandq_maskz_commuted(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
1907; CHECK-LABEL: stack_fold_pandq_maskz_commuted:
1908; CHECK:       # %bb.0:
1909; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1910; CHECK-NEXT:    #APP
1911; CHECK-NEXT:    nop
1912; CHECK-NEXT:    #NO_APP
1913; CHECK-NEXT:    kmovd %edi, %k1
1914; CHECK-NEXT:    vandpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
1915; CHECK-NEXT:    retq
1916  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1917  %2 = and <8 x i64> %a1, %a0
1918  %3 = bitcast i8 %mask to <8 x i1>
1919  %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> zeroinitializer
1920  ret <8 x i64> %4
1921}
1922
1923define <16 x i32> @stack_fold_vpconflictd(<16 x i32> %a0) {
1924; CHECK-LABEL: stack_fold_vpconflictd:
1925; CHECK:       # %bb.0:
1926; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1927; CHECK-NEXT:    #APP
1928; CHECK-NEXT:    nop
1929; CHECK-NEXT:    #NO_APP
1930; CHECK-NEXT:    vpconflictd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
1931; CHECK-NEXT:    retq
1932  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1933  %2 = call <16 x i32> @llvm.x86.avx512.conflict.d.512(<16 x i32> %a0)
1934  ret <16 x i32> %2
1935}
1936declare <16 x i32> @llvm.x86.avx512.conflict.d.512(<16 x i32>) nounwind readonly
1937
1938define <8 x i64> @stack_fold_vpconflictq(<8 x i64> %a0) {
1939; CHECK-LABEL: stack_fold_vpconflictq:
1940; CHECK:       # %bb.0:
1941; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1942; CHECK-NEXT:    #APP
1943; CHECK-NEXT:    nop
1944; CHECK-NEXT:    #NO_APP
1945; CHECK-NEXT:    vpconflictq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
1946; CHECK-NEXT:    retq
1947  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1948  %2 = call <8 x i64> @llvm.x86.avx512.conflict.q.512(<8 x i64> %a0)
1949  ret <8 x i64> %2
1950}
1951declare <8 x i64> @llvm.x86.avx512.conflict.q.512(<8 x i64>) nounwind readnone
1952
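; Compare tests: the spilled operand is folded directly into the k-register
; compare (vpcmpeq*/vpcmple*); the resulting mask is then either moved to a
; general purpose register for the scalar return or used to select between the
; two extra vector arguments.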
1953define i64 @stack_fold_pcmpeqb(<64 x i8> %a0, <64 x i8> %a1) {
1954; CHECK-LABEL: stack_fold_pcmpeqb:
1955; CHECK:       # %bb.0:
1956; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1957; CHECK-NEXT:    #APP
1958; CHECK-NEXT:    nop
1959; CHECK-NEXT:    #NO_APP
1960; CHECK-NEXT:    vpcmpeqb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %k0 # 64-byte Folded Reload
1961; CHECK-NEXT:    kmovq %k0, %rax
1962; CHECK-NEXT:    vzeroupper
1963; CHECK-NEXT:    retq
1964  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1965  %2 = icmp eq <64 x i8> %a0, %a1
1966  %3 = bitcast <64 x i1> %2 to i64
1967  ret i64 %3
1968}
1969
1970define i16 @stack_fold_pcmpeqd(<16 x i32> %a0, <16 x i32> %a1) {
1971; CHECK-LABEL: stack_fold_pcmpeqd:
1972; CHECK:       # %bb.0:
1973; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1974; CHECK-NEXT:    #APP
1975; CHECK-NEXT:    nop
1976; CHECK-NEXT:    #NO_APP
1977; CHECK-NEXT:    vpcmpeqd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %k0 # 64-byte Folded Reload
1978; CHECK-NEXT:    kmovd %k0, %eax
1979; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
1980; CHECK-NEXT:    vzeroupper
1981; CHECK-NEXT:    retq
1982  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
1983  %2 = icmp eq <16 x i32> %a0, %a1
1984  %3 = bitcast <16 x i1> %2 to i16
1985  ret i16 %3
1986}
1987
1988define i8 @stack_fold_pcmpeqq(<8 x i64> %a0, <8 x i64> %a1) {
1989; CHECK-LABEL: stack_fold_pcmpeqq:
1990; CHECK:       # %bb.0:
1991; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
1992; CHECK-NEXT:    #APP
1993; CHECK-NEXT:    nop
1994; CHECK-NEXT:    #NO_APP
1995; CHECK-NEXT:    vpcmpeqq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %k0 # 64-byte Folded Reload
1996; CHECK-NEXT:    kmovd %k0, %eax
1997; CHECK-NEXT:    # kill: def $al killed $al killed $eax
1998; CHECK-NEXT:    vzeroupper
1999; CHECK-NEXT:    retq
2000  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2001  %2 = icmp eq <8 x i64> %a0, %a1
2002  %3 = bitcast <8 x i1> %2 to i8
2003  ret i8 %3
2004}
2005
2006define i32 @stack_fold_pcmpeqw(<32 x i16> %a0, <32 x i16> %a1) {
2007; CHECK-LABEL: stack_fold_pcmpeqw:
2008; CHECK:       # %bb.0:
2009; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2010; CHECK-NEXT:    #APP
2011; CHECK-NEXT:    nop
2012; CHECK-NEXT:    #NO_APP
2013; CHECK-NEXT:    vpcmpeqw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %k0 # 64-byte Folded Reload
2014; CHECK-NEXT:    kmovd %k0, %eax
2015; CHECK-NEXT:    vzeroupper
2016; CHECK-NEXT:    retq
2017  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2018  %2 = icmp eq <32 x i16> %a0, %a1
2019  %3 = bitcast <32 x i1> %2 to i32
2020  ret i32 %3
2021}
2022
2023define <16 x i32> @stack_fold_pcmpeqd_mask(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %a2, i16 %mask, <16 x i32> %b0, <16 x i32> %b1) {
2024; CHECK-LABEL: stack_fold_pcmpeqd_mask:
2025; CHECK:       # %bb.0:
2026; CHECK-NEXT:    subq $184, %rsp
2027; CHECK-NEXT:    .cfi_def_cfa_offset 192
2028; CHECK-NEXT:    vmovups %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2029; CHECK-NEXT:    vmovups %zmm2, (%rsp) # 64-byte Spill
2030; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2031; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2032; CHECK-NEXT:    #APP
2033; CHECK-NEXT:    nop
2034; CHECK-NEXT:    #NO_APP
2035; CHECK-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
2036; CHECK-NEXT:    vpaddd (%rdi), %zmm0, %zmm0
2037; CHECK-NEXT:    kmovd %esi, %k1
2038; CHECK-NEXT:    vpcmpeqd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %k1 {%k1} # 64-byte Folded Reload
2039; CHECK-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
2040; CHECK-NEXT:    vpblendmd (%rsp), %zmm0, %zmm0 {%k1} # 64-byte Folded Reload
2041; CHECK-NEXT:    addq $184, %rsp
2042; CHECK-NEXT:    .cfi_def_cfa_offset 8
2043; CHECK-NEXT:    retq
2044  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2045  ; load and add are here to keep the operations below the side-effecting block and to avoid folding the wrong load
2046  %2 = load <16 x i32>, <16 x i32>* %a2
2047  %3 = add <16 x i32> %a1, %2
2048  %4 = bitcast i16 %mask to <16 x i1>
2049  %5 = icmp eq <16 x i32> %3, %a0
2050  %6 = and <16 x i1> %4, %5
2051  %7 = select <16 x i1> %6, <16 x i32> %b0, <16 x i32> %b1
2052  ret <16 x i32> %7
2053}
2054
2055define <16 x i32> @stack_fold_pcmpeqd_mask_commuted(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %a2, i16 %mask, <16 x i32> %b0, <16 x i32> %b1) {
2056; CHECK-LABEL: stack_fold_pcmpeqd_mask_commuted:
2057; CHECK:       # %bb.0:
2058; CHECK-NEXT:    subq $184, %rsp
2059; CHECK-NEXT:    .cfi_def_cfa_offset 192
2060; CHECK-NEXT:    vmovups %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2061; CHECK-NEXT:    vmovups %zmm2, (%rsp) # 64-byte Spill
2062; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2063; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2064; CHECK-NEXT:    #APP
2065; CHECK-NEXT:    nop
2066; CHECK-NEXT:    #NO_APP
2067; CHECK-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
2068; CHECK-NEXT:    vpaddd (%rdi), %zmm0, %zmm0
2069; CHECK-NEXT:    kmovd %esi, %k1
2070; CHECK-NEXT:    vpcmpeqd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %k1 {%k1} # 64-byte Folded Reload
2071; CHECK-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
2072; CHECK-NEXT:    vpblendmd (%rsp), %zmm0, %zmm0 {%k1} # 64-byte Folded Reload
2073; CHECK-NEXT:    addq $184, %rsp
2074; CHECK-NEXT:    .cfi_def_cfa_offset 8
2075; CHECK-NEXT:    retq
2076  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2077  ; load and add are here to keep the operations below the side-effecting block and to avoid folding the wrong load
2078  %2 = load <16 x i32>, <16 x i32>* %a2
2079  %3 = add <16 x i32> %a1, %2
2080  %4 = bitcast i16 %mask to <16 x i1>
2081  %5 = icmp eq <16 x i32> %a0, %3
2082  %6 = and <16 x i1> %4, %5
2083  %7 = select <16 x i1> %6, <16 x i32> %b0, <16 x i32> %b1
2084  ret <16 x i32> %7
2085}
2086
2087define <16 x i32> @stack_fold_pcmpled_mask(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %a2, i16 %mask, <16 x i32> %b0, <16 x i32> %b1) {
2088; CHECK-LABEL: stack_fold_pcmpled_mask:
2089; CHECK:       # %bb.0:
2090; CHECK-NEXT:    subq $184, %rsp
2091; CHECK-NEXT:    .cfi_def_cfa_offset 192
2092; CHECK-NEXT:    vmovups %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2093; CHECK-NEXT:    vmovups %zmm2, (%rsp) # 64-byte Spill
2094; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2095; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2096; CHECK-NEXT:    #APP
2097; CHECK-NEXT:    nop
2098; CHECK-NEXT:    #NO_APP
2099; CHECK-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
2100; CHECK-NEXT:    vpaddd (%rdi), %zmm0, %zmm0
2101; CHECK-NEXT:    kmovd %esi, %k1
2102; CHECK-NEXT:    vpcmpled {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %k1 {%k1} # 64-byte Folded Reload
2103; CHECK-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
2104; CHECK-NEXT:    vpblendmd (%rsp), %zmm0, %zmm0 {%k1} # 64-byte Folded Reload
2105; CHECK-NEXT:    addq $184, %rsp
2106; CHECK-NEXT:    .cfi_def_cfa_offset 8
2107; CHECK-NEXT:    retq
2108  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2109  ; load and add are here to keep the operations below the side-effecting block and to avoid folding the wrong load
2110  %2 = load <16 x i32>, <16 x i32>* %a2
2111  %3 = add <16 x i32> %a1, %2
2112  %4 = bitcast i16 %mask to <16 x i1>
2113  %5 = icmp sge <16 x i32> %a0, %3
2114  %6 = and <16 x i1> %4, %5
2115  %7 = select <16 x i1> %6, <16 x i32> %b0, <16 x i32> %b1
2116  ret <16 x i32> %7
2117}
2118
2119define i16 @stack_fold_pcmpleud(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %a2, i16 %mask) {
2120; CHECK-LABEL: stack_fold_pcmpleud:
2121; CHECK:       # %bb.0:
2122; CHECK-NEXT:    subq $56, %rsp
2123; CHECK-NEXT:    .cfi_def_cfa_offset 64
2124; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2125; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2126; CHECK-NEXT:    #APP
2127; CHECK-NEXT:    nop
2128; CHECK-NEXT:    #NO_APP
2129; CHECK-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
2130; CHECK-NEXT:    vpaddd (%rdi), %zmm0, %zmm0
2131; CHECK-NEXT:    vpcmpleud {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %k0 # 64-byte Folded Reload
2132; CHECK-NEXT:    kmovd %k0, %eax
2133; CHECK-NEXT:    andl %esi, %eax
2134; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
2135; CHECK-NEXT:    addq $56, %rsp
2136; CHECK-NEXT:    .cfi_def_cfa_offset 8
2137; CHECK-NEXT:    vzeroupper
2138; CHECK-NEXT:    retq
2139  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2140  %2 = load <16 x i32>, <16 x i32>* %a2
2141  %3 = add <16 x i32> %a1, %2
2142  %4 = bitcast i16 %mask to <16 x i1>
2143  %5 = icmp uge <16 x i32> %a0, %3
2144  %6 = and <16 x i1> %5, %4
2145  %7 = bitcast <16 x i1> %6 to i16
2146  ret i16 %7
2147}
2148
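; Variable permute tests (vpermb/vpermw/vpermd/vpermq and the vpermi2*/vpermt2*
; forms): the index operand stays in a register while the spilled data operand
; is reloaded by the folded permute; where noted, a trailing add forces the
; integer execution domain.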
2149define <64 x i8> @stack_fold_permbvar(<64 x i8> %a0, <64 x i8> %a1) {
2150; CHECK-LABEL: stack_fold_permbvar:
2151; CHECK:       # %bb.0:
2152; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2153; CHECK-NEXT:    #APP
2154; CHECK-NEXT:    nop
2155; CHECK-NEXT:    #NO_APP
2156; CHECK-NEXT:    vpermb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
2157; CHECK-NEXT:    retq
2158  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2159  %2 = call <64 x i8> @llvm.x86.avx512.permvar.qi.512(<64 x i8> %a1, <64 x i8> %a0)
2160  ret <64 x i8> %2
2161}
2162declare <64 x i8> @llvm.x86.avx512.permvar.qi.512(<64 x i8>, <64 x i8>) nounwind readonly
2163
2164define <64 x i8> @stack_fold_permbvar_mask(<64 x i8>* %passthru, <64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
2165; CHECK-LABEL: stack_fold_permbvar_mask:
2166; CHECK:       # %bb.0:
2167; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2168; CHECK-NEXT:    #APP
2169; CHECK-NEXT:    nop
2170; CHECK-NEXT:    #NO_APP
2171; CHECK-NEXT:    kmovq %rsi, %k1
2172; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm2
2173; CHECK-NEXT:    vpermb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
2174; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
2175; CHECK-NEXT:    retq
2176  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2177  %2 = call <64 x i8> @llvm.x86.avx512.permvar.qi.512(<64 x i8> %a1, <64 x i8> %a0)
2178  %3 = bitcast i64 %mask to <64 x i1>
2179  ; load needed to keep the operation from being scheduled above the asm block
2180  %4 = load <64 x i8>, <64 x i8>* %passthru
2181  %5 = select <64 x i1> %3, <64 x i8> %2, <64 x i8> %4
2182  ret <64 x i8> %5
2183}
2184
2185define <64 x i8> @stack_fold_permbvar_maskz(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
2186; CHECK-LABEL: stack_fold_permbvar_maskz:
2187; CHECK:       # %bb.0:
2188; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2189; CHECK-NEXT:    #APP
2190; CHECK-NEXT:    nop
2191; CHECK-NEXT:    #NO_APP
2192; CHECK-NEXT:    kmovq %rdi, %k1
2193; CHECK-NEXT:    vpermb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
2194; CHECK-NEXT:    retq
2195  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2196  %2 = call <64 x i8> @llvm.x86.avx512.permvar.qi.512(<64 x i8> %a1, <64 x i8> %a0)
2197  %3 = bitcast i64 %mask to <64 x i1>
2198  %4 = select <64 x i1> %3, <64 x i8> %2, <64 x i8> zeroinitializer
2199  ret <64 x i8> %4
2200}
2201
2202define <16 x i32> @stack_fold_permd(<16 x i32> %a0, <16 x i32> %a1) {
2203; CHECK-LABEL: stack_fold_permd:
2204; CHECK:       # %bb.0:
2205; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2206; CHECK-NEXT:    #APP
2207; CHECK-NEXT:    nop
2208; CHECK-NEXT:    #NO_APP
2209; CHECK-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
2210; CHECK-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
2211; CHECK-NEXT:    vpsubd %zmm1, %zmm0, %zmm0
2212; CHECK-NEXT:    retq
2213  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2214  %2 = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> %a1, <16 x i32> %a0)
2215  ; add forces execution domain
2216  %3 = add <16 x i32> %2, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
2217  ret <16 x i32> %3
2218}
2219declare <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32>, <16 x i32>) nounwind readonly
2220
2221define <64 x i8> @stack_fold_vpermi2b(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2) {
2222; CHECK-LABEL: stack_fold_vpermi2b:
2223; CHECK:       # %bb.0:
2224; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2225; CHECK-NEXT:    #APP
2226; CHECK-NEXT:    nop
2227; CHECK-NEXT:    #NO_APP
2228; CHECK-NEXT:    vpermi2b {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
2229; CHECK-NEXT:    retq
2230  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2231  %2 = call <64 x i8> @llvm.x86.avx512.vpermi2var.qi.512(<64 x i8> %x1, <64 x i8> %x0, <64 x i8> %x2)
2232  ret <64 x i8> %2
2233}
2234
2235define <16 x i32> @stack_fold_vpermi2d(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) {
2236; CHECK-LABEL: stack_fold_vpermi2d:
2237; CHECK:       # %bb.0:
2238; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2239; CHECK-NEXT:    #APP
2240; CHECK-NEXT:    nop
2241; CHECK-NEXT:    #NO_APP
2242; CHECK-NEXT:    vpermi2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
2243; CHECK-NEXT:    retq
2244  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2245  %2 = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> %x1, <16 x i32> %x0, <16 x i32> %x2)
2246  ret <16 x i32> %2
2247}
2248
2249define <8 x i64> @stack_fold_vpermi2q(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) {
2250; CHECK-LABEL: stack_fold_vpermi2q:
2251; CHECK:       # %bb.0:
2252; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2253; CHECK-NEXT:    #APP
2254; CHECK-NEXT:    nop
2255; CHECK-NEXT:    #NO_APP
2256; CHECK-NEXT:    vpermi2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
2257; CHECK-NEXT:    retq
2258  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2259  %2 = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> %x1, <8 x i64> %x0, <8 x i64> %x2)
2260  ret <8 x i64> %2
2261}
2262
2263define <32 x i16> @stack_fold_vpermi2w(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2) {
2264; CHECK-LABEL: stack_fold_vpermi2w:
2265; CHECK:       # %bb.0:
2266; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2267; CHECK-NEXT:    #APP
2268; CHECK-NEXT:    nop
2269; CHECK-NEXT:    #NO_APP
2270; CHECK-NEXT:    vpermi2w {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
2271; CHECK-NEXT:    retq
2272  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2273  %2 = call <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16> %x1, <32 x i16> %x0, <32 x i16> %x2)
2274  ret <32 x i16> %2
2275}
2276
2277define <8 x i64> @stack_fold_permq(<8 x i64> %a0) {
2278; CHECK-LABEL: stack_fold_permq:
2279; CHECK:       # %bb.0:
2280; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2281; CHECK-NEXT:    #APP
2282; CHECK-NEXT:    nop
2283; CHECK-NEXT:    #NO_APP
2284; CHECK-NEXT:    vpermq $235, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
2285; CHECK-NEXT:    # zmm0 = mem[3,2,2,3,7,6,6,7]
2286; CHECK-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
2287; CHECK-NEXT:    vpsubq %zmm1, %zmm0, %zmm0
2288; CHECK-NEXT:    retq
2289  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2290  %2 = shufflevector <8 x i64> %a0, <8 x i64> undef, <8 x i32> <i32 3, i32 2, i32 2, i32 3, i32 7, i32 6, i32 6, i32 7>
2291  ; add forces execution domain
2292  %3 = add <8 x i64> %2, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
2293  ret <8 x i64> %3
2294}
2295
2296define <8 x i64> @stack_fold_permq_mask(<8 x i64>* %passthru, <8 x i64> %a0, i8 %mask) {
2297; CHECK-LABEL: stack_fold_permq_mask:
2298; CHECK:       # %bb.0:
2299; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2300; CHECK-NEXT:    #APP
2301; CHECK-NEXT:    nop
2302; CHECK-NEXT:    #NO_APP
2303; CHECK-NEXT:    kmovd %esi, %k1
2304; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm0
2305; CHECK-NEXT:    vpermq $235, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} # 64-byte Folded Reload
2306; CHECK-NEXT:    # zmm0 {%k1} = mem[3,2,2,3,7,6,6,7]
2307; CHECK-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
2308; CHECK-NEXT:    vpsubq %zmm1, %zmm0, %zmm0
2309; CHECK-NEXT:    retq
2310  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2311  %2 = shufflevector <8 x i64> %a0, <8 x i64> undef, <8 x i32> <i32 3, i32 2, i32 2, i32 3, i32 7, i32 6, i32 6, i32 7>
2312  %3 = bitcast i8 %mask to <8 x i1>
2313  ; load needed to keep the operation from being scheduled above the asm block
2314  %4 = load <8 x i64>, <8 x i64>* %passthru
2315  %5 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> %4
2316  ; add forces execution domain
2317  %6 = add <8 x i64> %5, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
2318  ret <8 x i64> %6
2319}
2320
2321define <8 x i64> @stack_fold_permq_maskz(<8 x i64>* %passthru, <8 x i64> %a0, i8 %mask) {
2322; CHECK-LABEL: stack_fold_permq_maskz:
2323; CHECK:       # %bb.0:
2324; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2325; CHECK-NEXT:    #APP
2326; CHECK-NEXT:    nop
2327; CHECK-NEXT:    #NO_APP
2328; CHECK-NEXT:    kmovd %esi, %k1
2329; CHECK-NEXT:    vpermq $235, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} {z} # 64-byte Folded Reload
2330; CHECK-NEXT:    # zmm0 {%k1} {z} = mem[3,2,2,3,7,6,6,7]
2331; CHECK-NEXT:    retq
2332  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2333  %2 = shufflevector <8 x i64> %a0, <8 x i64> undef, <8 x i32> <i32 3, i32 2, i32 2, i32 3, i32 7, i32 6, i32 6, i32 7>
2334  %3 = bitcast i8 %mask to <8 x i1>
2335  %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> zeroinitializer
2336  ret <8 x i64> %4
2337}
2338
2339define <8 x i64> @stack_fold_permqvar(<8 x i64> %a0, <8 x i64> %a1) {
2340; CHECK-LABEL: stack_fold_permqvar:
2341; CHECK:       # %bb.0:
2342; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2343; CHECK-NEXT:    #APP
2344; CHECK-NEXT:    nop
2345; CHECK-NEXT:    #NO_APP
2346; CHECK-NEXT:    vpermq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
2347; CHECK-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
2348; CHECK-NEXT:    vpsubq %zmm1, %zmm0, %zmm0
2349; CHECK-NEXT:    retq
2350  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2351  %2 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %a1, <8 x i64> %a0)
2352  ; add forces execution domain
2353  %3 = add <8 x i64> %2, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
2354  ret <8 x i64> %3
2355}
2356declare <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64>, <8 x i64>) nounwind readonly
2357
2358define <8 x i64> @stack_fold_permqvar_mask(<8 x i64>* %passthru, <8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
2359; CHECK-LABEL: stack_fold_permqvar_mask:
2360; CHECK:       # %bb.0:
2361; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2362; CHECK-NEXT:    #APP
2363; CHECK-NEXT:    nop
2364; CHECK-NEXT:    #NO_APP
2365; CHECK-NEXT:    kmovd %esi, %k1
2366; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm1
2367; CHECK-NEXT:    vpermq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 {%k1} # 64-byte Folded Reload
2368; CHECK-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0
2369; CHECK-NEXT:    vpsubq %zmm0, %zmm1, %zmm0
2370; CHECK-NEXT:    retq
2371  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2372  %2 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %a1, <8 x i64> %a0)
2373  %3 = bitcast i8 %mask to <8 x i1>
2374  ; load needed to keep the operation from being scheduled above the asm block
2375  %4 = load <8 x i64>, <8 x i64>* %passthru
2376  %5 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> %4
2377  ; add forces execution domain
2378  %6 = add <8 x i64> %5, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
2379  ret <8 x i64> %6
2380}
2381
2382define <64 x i8> @stack_fold_vpermt2b(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2) {
2383; CHECK-LABEL: stack_fold_vpermt2b:
2384; CHECK:       # %bb.0:
2385; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2386; CHECK-NEXT:    #APP
2387; CHECK-NEXT:    nop
2388; CHECK-NEXT:    #NO_APP
2389; CHECK-NEXT:    vpermt2b {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
2390; CHECK-NEXT:    retq
2391  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2392  %2 = call <64 x i8> @llvm.x86.avx512.vpermi2var.qi.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2)
2393  ret <64 x i8> %2
2394}
2395declare <64 x i8> @llvm.x86.avx512.vpermi2var.qi.512(<64 x i8>, <64 x i8>, <64 x i8>)
2396
2397define <16 x i32> @stack_fold_vpermt2d(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) {
2398; CHECK-LABEL: stack_fold_vpermt2d:
2399; CHECK:       # %bb.0:
2400; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2401; CHECK-NEXT:    #APP
2402; CHECK-NEXT:    nop
2403; CHECK-NEXT:    #NO_APP
2404; CHECK-NEXT:    vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
2405; CHECK-NEXT:    retq
2406  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2407  %2 = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2)
2408  ret <16 x i32> %2
2409}
2410declare <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32>, <16 x i32>, <16 x i32>)
2411
2412define <8 x i64> @stack_fold_vpermt2q(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) {
2413; CHECK-LABEL: stack_fold_vpermt2q:
2414; CHECK:       # %bb.0:
2415; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2416; CHECK-NEXT:    #APP
2417; CHECK-NEXT:    nop
2418; CHECK-NEXT:    #NO_APP
2419; CHECK-NEXT:    vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
2420; CHECK-NEXT:    retq
2421  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2422  %2 = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2)
2423  ret <8 x i64> %2
2424}
2425declare <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64>, <8 x i64>, <8 x i64>)
2426
2427define <32 x i16> @stack_fold_vpermt2w(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2) {
2428; CHECK-LABEL: stack_fold_vpermt2w:
2429; CHECK:       # %bb.0:
2430; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2431; CHECK-NEXT:    #APP
2432; CHECK-NEXT:    nop
2433; CHECK-NEXT:    #NO_APP
2434; CHECK-NEXT:    vpermt2w {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
2435; CHECK-NEXT:    retq
2436  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2437  %2 = call <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2)
2438  ret <32 x i16> %2
2439}
2440declare <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16>, <32 x i16>, <32 x i16>)
2441
2442define <32 x i16> @stack_fold_permwvar(<32 x i16> %a0, <32 x i16> %a1) {
2443; CHECK-LABEL: stack_fold_permwvar:
2444; CHECK:       # %bb.0:
2445; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2446; CHECK-NEXT:    #APP
2447; CHECK-NEXT:    nop
2448; CHECK-NEXT:    #NO_APP
2449; CHECK-NEXT:    vpermw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
2450; CHECK-NEXT:    retq
2451  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2452  %2 = call <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16> %a1, <32 x i16> %a0)
2453  ret <32 x i16> %2
2454}
2455declare <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16>, <32 x i16>) nounwind readonly
2456
2457define <32 x i16> @stack_fold_permwvar_mask(<32 x i16>* %passthru, <32 x i16> %a0, <32 x i16> %a1, i32 %mask) {
2458; CHECK-LABEL: stack_fold_permwvar_mask:
2459; CHECK:       # %bb.0:
2460; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2461; CHECK-NEXT:    #APP
2462; CHECK-NEXT:    nop
2463; CHECK-NEXT:    #NO_APP
2464; CHECK-NEXT:    kmovd %esi, %k1
2465; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm2
2466; CHECK-NEXT:    vpermw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
2467; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
2468; CHECK-NEXT:    retq
2469  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2470  %2 = call <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16> %a1, <32 x i16> %a0)
2471  %3 = bitcast i32 %mask to <32 x i1>
2472  ; load needed to keep the operation from being scheduled above the asm block
2473  %4 = load <32 x i16>, <32 x i16>* %passthru
2474  %5 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> %4
2475  ret <32 x i16> %5
2476}
2477
2478define <32 x i16> @stack_fold_permwvar_maskz(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) {
2479; CHECK-LABEL: stack_fold_permwvar_maskz:
2480; CHECK:       # %bb.0:
2481; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2482; CHECK-NEXT:    #APP
2483; CHECK-NEXT:    nop
2484; CHECK-NEXT:    #NO_APP
2485; CHECK-NEXT:    kmovd %edi, %k1
2486; CHECK-NEXT:    vpermw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
2487; CHECK-NEXT:    retq
2488  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2489  %2 = call <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16> %a1, <32 x i16> %a0)
2490  %3 = bitcast i32 %mask to <32 x i1>
2491  %4 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> zeroinitializer
2492  ret <32 x i16> %4
2493}
2494
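; The extract/insert tests clobber the general purpose registers rather than the
; vector registers, so the scalar value is spilled to a 4- or 8-byte stack slot
; and the fold is checked on the vpextr*/vpinsr* memory form.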
2495define i32 @stack_fold_pextrd(<4 x i32> %a0, <4 x i32> %a1) {
2496; CHECK-LABEL: stack_fold_pextrd:
2497; CHECK:       # %bb.0:
2498; CHECK-NEXT:    pushq %rbp
2499; CHECK-NEXT:    .cfi_def_cfa_offset 16
2500; CHECK-NEXT:    pushq %r15
2501; CHECK-NEXT:    .cfi_def_cfa_offset 24
2502; CHECK-NEXT:    pushq %r14
2503; CHECK-NEXT:    .cfi_def_cfa_offset 32
2504; CHECK-NEXT:    pushq %r13
2505; CHECK-NEXT:    .cfi_def_cfa_offset 40
2506; CHECK-NEXT:    pushq %r12
2507; CHECK-NEXT:    .cfi_def_cfa_offset 48
2508; CHECK-NEXT:    pushq %rbx
2509; CHECK-NEXT:    .cfi_def_cfa_offset 56
2510; CHECK-NEXT:    .cfi_offset %rbx, -56
2511; CHECK-NEXT:    .cfi_offset %r12, -48
2512; CHECK-NEXT:    .cfi_offset %r13, -40
2513; CHECK-NEXT:    .cfi_offset %r14, -32
2514; CHECK-NEXT:    .cfi_offset %r15, -24
2515; CHECK-NEXT:    .cfi_offset %rbp, -16
2516; CHECK-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
2517; CHECK-NEXT:    vpextrd $1, %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
2518; CHECK-NEXT:    #APP
2519; CHECK-NEXT:    nop
2520; CHECK-NEXT:    #NO_APP
2521; CHECK-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
2522; CHECK-NEXT:    popq %rbx
2523; CHECK-NEXT:    .cfi_def_cfa_offset 48
2524; CHECK-NEXT:    popq %r12
2525; CHECK-NEXT:    .cfi_def_cfa_offset 40
2526; CHECK-NEXT:    popq %r13
2527; CHECK-NEXT:    .cfi_def_cfa_offset 32
2528; CHECK-NEXT:    popq %r14
2529; CHECK-NEXT:    .cfi_def_cfa_offset 24
2530; CHECK-NEXT:    popq %r15
2531; CHECK-NEXT:    .cfi_def_cfa_offset 16
2532; CHECK-NEXT:    popq %rbp
2533; CHECK-NEXT:    .cfi_def_cfa_offset 8
2534; CHECK-NEXT:    retq
2535  ; add forces execution domain
2536  %1 = add <4 x i32> %a0, %a1
2537  %2 = extractelement <4 x i32> %1, i32 1
2538  %3 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
2539  ret i32 %2
2540}
2541
2542define i64 @stack_fold_pextrq(<2 x i64> %a0) {
2543; CHECK-LABEL: stack_fold_pextrq:
2544; CHECK:       # %bb.0:
2545; CHECK-NEXT:    pushq %rbp
2546; CHECK-NEXT:    .cfi_def_cfa_offset 16
2547; CHECK-NEXT:    pushq %r15
2548; CHECK-NEXT:    .cfi_def_cfa_offset 24
2549; CHECK-NEXT:    pushq %r14
2550; CHECK-NEXT:    .cfi_def_cfa_offset 32
2551; CHECK-NEXT:    pushq %r13
2552; CHECK-NEXT:    .cfi_def_cfa_offset 40
2553; CHECK-NEXT:    pushq %r12
2554; CHECK-NEXT:    .cfi_def_cfa_offset 48
2555; CHECK-NEXT:    pushq %rbx
2556; CHECK-NEXT:    .cfi_def_cfa_offset 56
2557; CHECK-NEXT:    .cfi_offset %rbx, -56
2558; CHECK-NEXT:    .cfi_offset %r12, -48
2559; CHECK-NEXT:    .cfi_offset %r13, -40
2560; CHECK-NEXT:    .cfi_offset %r14, -32
2561; CHECK-NEXT:    .cfi_offset %r15, -24
2562; CHECK-NEXT:    .cfi_offset %rbp, -16
2563; CHECK-NEXT:    vpextrq $1, %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
2564; CHECK-NEXT:    #APP
2565; CHECK-NEXT:    nop
2566; CHECK-NEXT:    #NO_APP
2567; CHECK-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
2568; CHECK-NEXT:    popq %rbx
2569; CHECK-NEXT:    .cfi_def_cfa_offset 48
2570; CHECK-NEXT:    popq %r12
2571; CHECK-NEXT:    .cfi_def_cfa_offset 40
2572; CHECK-NEXT:    popq %r13
2573; CHECK-NEXT:    .cfi_def_cfa_offset 32
2574; CHECK-NEXT:    popq %r14
2575; CHECK-NEXT:    .cfi_def_cfa_offset 24
2576; CHECK-NEXT:    popq %r15
2577; CHECK-NEXT:    .cfi_def_cfa_offset 16
2578; CHECK-NEXT:    popq %rbp
2579; CHECK-NEXT:    .cfi_def_cfa_offset 8
2580; CHECK-NEXT:    retq
2581  %1 = extractelement <2 x i64> %a0, i32 1
2582  %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
2583  ret i64 %1
2584}
2585
2586define <16 x i8> @stack_fold_pinsrb(<16 x i8> %a0, i8 %a1) {
2587; CHECK-LABEL: stack_fold_pinsrb:
2588; CHECK:       # %bb.0:
2589; CHECK-NEXT:    pushq %rbp
2590; CHECK-NEXT:    .cfi_def_cfa_offset 16
2591; CHECK-NEXT:    pushq %r15
2592; CHECK-NEXT:    .cfi_def_cfa_offset 24
2593; CHECK-NEXT:    pushq %r14
2594; CHECK-NEXT:    .cfi_def_cfa_offset 32
2595; CHECK-NEXT:    pushq %r13
2596; CHECK-NEXT:    .cfi_def_cfa_offset 40
2597; CHECK-NEXT:    pushq %r12
2598; CHECK-NEXT:    .cfi_def_cfa_offset 48
2599; CHECK-NEXT:    pushq %rbx
2600; CHECK-NEXT:    .cfi_def_cfa_offset 56
2601; CHECK-NEXT:    .cfi_offset %rbx, -56
2602; CHECK-NEXT:    .cfi_offset %r12, -48
2603; CHECK-NEXT:    .cfi_offset %r13, -40
2604; CHECK-NEXT:    .cfi_offset %r14, -32
2605; CHECK-NEXT:    .cfi_offset %r15, -24
2606; CHECK-NEXT:    .cfi_offset %rbp, -16
2607; CHECK-NEXT:    movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
2608; CHECK-NEXT:    #APP
2609; CHECK-NEXT:    nop
2610; CHECK-NEXT:    #NO_APP
2611; CHECK-NEXT:    vpinsrb $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload
2612; CHECK-NEXT:    popq %rbx
2613; CHECK-NEXT:    .cfi_def_cfa_offset 48
2614; CHECK-NEXT:    popq %r12
2615; CHECK-NEXT:    .cfi_def_cfa_offset 40
2616; CHECK-NEXT:    popq %r13
2617; CHECK-NEXT:    .cfi_def_cfa_offset 32
2618; CHECK-NEXT:    popq %r14
2619; CHECK-NEXT:    .cfi_def_cfa_offset 24
2620; CHECK-NEXT:    popq %r15
2621; CHECK-NEXT:    .cfi_def_cfa_offset 16
2622; CHECK-NEXT:    popq %rbp
2623; CHECK-NEXT:    .cfi_def_cfa_offset 8
2624; CHECK-NEXT:    retq
2625  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
2626  %2 = insertelement <16 x i8> %a0, i8 %a1, i32 1
2627  ret <16 x i8> %2
2628}
2629
2630define <4 x i32> @stack_fold_pinsrd(<4 x i32> %a0, i32 %a1) {
2631; CHECK-LABEL: stack_fold_pinsrd:
2632; CHECK:       # %bb.0:
2633; CHECK-NEXT:    pushq %rbp
2634; CHECK-NEXT:    .cfi_def_cfa_offset 16
2635; CHECK-NEXT:    pushq %r15
2636; CHECK-NEXT:    .cfi_def_cfa_offset 24
2637; CHECK-NEXT:    pushq %r14
2638; CHECK-NEXT:    .cfi_def_cfa_offset 32
2639; CHECK-NEXT:    pushq %r13
2640; CHECK-NEXT:    .cfi_def_cfa_offset 40
2641; CHECK-NEXT:    pushq %r12
2642; CHECK-NEXT:    .cfi_def_cfa_offset 48
2643; CHECK-NEXT:    pushq %rbx
2644; CHECK-NEXT:    .cfi_def_cfa_offset 56
2645; CHECK-NEXT:    .cfi_offset %rbx, -56
2646; CHECK-NEXT:    .cfi_offset %r12, -48
2647; CHECK-NEXT:    .cfi_offset %r13, -40
2648; CHECK-NEXT:    .cfi_offset %r14, -32
2649; CHECK-NEXT:    .cfi_offset %r15, -24
2650; CHECK-NEXT:    .cfi_offset %rbp, -16
2651; CHECK-NEXT:    movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
2652; CHECK-NEXT:    #APP
2653; CHECK-NEXT:    nop
2654; CHECK-NEXT:    #NO_APP
2655; CHECK-NEXT:    vpinsrd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload
2656; CHECK-NEXT:    popq %rbx
2657; CHECK-NEXT:    .cfi_def_cfa_offset 48
2658; CHECK-NEXT:    popq %r12
2659; CHECK-NEXT:    .cfi_def_cfa_offset 40
2660; CHECK-NEXT:    popq %r13
2661; CHECK-NEXT:    .cfi_def_cfa_offset 32
2662; CHECK-NEXT:    popq %r14
2663; CHECK-NEXT:    .cfi_def_cfa_offset 24
2664; CHECK-NEXT:    popq %r15
2665; CHECK-NEXT:    .cfi_def_cfa_offset 16
2666; CHECK-NEXT:    popq %rbp
2667; CHECK-NEXT:    .cfi_def_cfa_offset 8
2668; CHECK-NEXT:    retq
2669  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
2670  %2 = insertelement <4 x i32> %a0, i32 %a1, i32 1
2671  ret <4 x i32> %2
2672}
2673
2674define <2 x i64> @stack_fold_pinsrq(<2 x i64> %a0, i64 %a1) {
2675; CHECK-LABEL: stack_fold_pinsrq:
2676; CHECK:       # %bb.0:
2677; CHECK-NEXT:    pushq %rbp
2678; CHECK-NEXT:    .cfi_def_cfa_offset 16
2679; CHECK-NEXT:    pushq %r15
2680; CHECK-NEXT:    .cfi_def_cfa_offset 24
2681; CHECK-NEXT:    pushq %r14
2682; CHECK-NEXT:    .cfi_def_cfa_offset 32
2683; CHECK-NEXT:    pushq %r13
2684; CHECK-NEXT:    .cfi_def_cfa_offset 40
2685; CHECK-NEXT:    pushq %r12
2686; CHECK-NEXT:    .cfi_def_cfa_offset 48
2687; CHECK-NEXT:    pushq %rbx
2688; CHECK-NEXT:    .cfi_def_cfa_offset 56
2689; CHECK-NEXT:    .cfi_offset %rbx, -56
2690; CHECK-NEXT:    .cfi_offset %r12, -48
2691; CHECK-NEXT:    .cfi_offset %r13, -40
2692; CHECK-NEXT:    .cfi_offset %r14, -32
2693; CHECK-NEXT:    .cfi_offset %r15, -24
2694; CHECK-NEXT:    .cfi_offset %rbp, -16
2695; CHECK-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
2696; CHECK-NEXT:    #APP
2697; CHECK-NEXT:    nop
2698; CHECK-NEXT:    #NO_APP
2699; CHECK-NEXT:    vpinsrq $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 8-byte Folded Reload
2700; CHECK-NEXT:    popq %rbx
2701; CHECK-NEXT:    .cfi_def_cfa_offset 48
2702; CHECK-NEXT:    popq %r12
2703; CHECK-NEXT:    .cfi_def_cfa_offset 40
2704; CHECK-NEXT:    popq %r13
2705; CHECK-NEXT:    .cfi_def_cfa_offset 32
2706; CHECK-NEXT:    popq %r14
2707; CHECK-NEXT:    .cfi_def_cfa_offset 24
2708; CHECK-NEXT:    popq %r15
2709; CHECK-NEXT:    .cfi_def_cfa_offset 16
2710; CHECK-NEXT:    popq %rbp
2711; CHECK-NEXT:    .cfi_def_cfa_offset 8
2712; CHECK-NEXT:    retq
2713  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
2714  %2 = insertelement <2 x i64> %a0, i64 %a1, i32 1
2715  ret <2 x i64> %2
2716}
2717
2718define <8 x i16> @stack_fold_pinsrw(<8 x i16> %a0, i16 %a1) {
2719; CHECK-LABEL: stack_fold_pinsrw:
2720; CHECK:       # %bb.0:
2721; CHECK-NEXT:    pushq %rbp
2722; CHECK-NEXT:    .cfi_def_cfa_offset 16
2723; CHECK-NEXT:    pushq %r15
2724; CHECK-NEXT:    .cfi_def_cfa_offset 24
2725; CHECK-NEXT:    pushq %r14
2726; CHECK-NEXT:    .cfi_def_cfa_offset 32
2727; CHECK-NEXT:    pushq %r13
2728; CHECK-NEXT:    .cfi_def_cfa_offset 40
2729; CHECK-NEXT:    pushq %r12
2730; CHECK-NEXT:    .cfi_def_cfa_offset 48
2731; CHECK-NEXT:    pushq %rbx
2732; CHECK-NEXT:    .cfi_def_cfa_offset 56
2733; CHECK-NEXT:    .cfi_offset %rbx, -56
2734; CHECK-NEXT:    .cfi_offset %r12, -48
2735; CHECK-NEXT:    .cfi_offset %r13, -40
2736; CHECK-NEXT:    .cfi_offset %r14, -32
2737; CHECK-NEXT:    .cfi_offset %r15, -24
2738; CHECK-NEXT:    .cfi_offset %rbp, -16
2739; CHECK-NEXT:    movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
2740; CHECK-NEXT:    #APP
2741; CHECK-NEXT:    nop
2742; CHECK-NEXT:    #NO_APP
2743; CHECK-NEXT:    vpinsrw $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload
2744; CHECK-NEXT:    popq %rbx
2745; CHECK-NEXT:    .cfi_def_cfa_offset 48
2746; CHECK-NEXT:    popq %r12
2747; CHECK-NEXT:    .cfi_def_cfa_offset 40
2748; CHECK-NEXT:    popq %r13
2749; CHECK-NEXT:    .cfi_def_cfa_offset 32
2750; CHECK-NEXT:    popq %r14
2751; CHECK-NEXT:    .cfi_def_cfa_offset 24
2752; CHECK-NEXT:    popq %r15
2753; CHECK-NEXT:    .cfi_def_cfa_offset 16
2754; CHECK-NEXT:    popq %rbp
2755; CHECK-NEXT:    .cfi_def_cfa_offset 8
2756; CHECK-NEXT:    retq
2757  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
2758  %2 = insertelement <8 x i16> %a0, i16 %a1, i32 1
2759  ret <8 x i16> %2
2760}
2761
2762define <16 x i32> @stack_fold_vplzcntd(<16 x i32> %a0) {
2763; CHECK-LABEL: stack_fold_vplzcntd:
2764; CHECK:       # %bb.0:
2765; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2766; CHECK-NEXT:    #APP
2767; CHECK-NEXT:    nop
2768; CHECK-NEXT:    #NO_APP
2769; CHECK-NEXT:    vplzcntd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
2770; CHECK-NEXT:    retq
2771  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2772  %2 = call <16 x i32> @llvm.ctlz.v16i32(<16 x i32> %a0, i1 false)
2773  ret <16 x i32> %2
2774}
2775
2776define <8 x i64> @stack_fold_vplzcntq(<8 x i64> %a0) {
2777; CHECK-LABEL: stack_fold_vplzcntq:
2778; CHECK:       # %bb.0:
2779; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2780; CHECK-NEXT:    #APP
2781; CHECK-NEXT:    nop
2782; CHECK-NEXT:    #NO_APP
2783; CHECK-NEXT:    vplzcntq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
2784; CHECK-NEXT:    retq
2785  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2786  %2 = call <8 x i64> @llvm.ctlz.v8i64(<8 x i64> %a0, i1 false)
2787  ret <8 x i64> %2
2788}
2789
2790define <32 x i16> @stack_fold_pmaddubsw_zmm(<64 x i8> %a0, <64 x i8> %a1) {
2791; CHECK-LABEL: stack_fold_pmaddubsw_zmm:
2792; CHECK:       # %bb.0:
2793; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2794; CHECK-NEXT:    #APP
2795; CHECK-NEXT:    nop
2796; CHECK-NEXT:    #NO_APP
2797; CHECK-NEXT:    vpmaddubsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
2798; CHECK-NEXT:    retq
2799  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2800  %2 = call <32 x i16> @llvm.x86.avx512.pmaddubs.w.512(<64 x i8> %a0, <64 x i8> %a1)
2801  ret <32 x i16> %2
2802}
2803declare <32 x i16> @llvm.x86.avx512.pmaddubs.w.512(<64 x i8>, <64 x i8>) nounwind readnone
2804
2805define <32 x i16> @stack_fold_pmaddubsw_zmm_mask(<32 x i16>* %passthru, <64 x i8> %a0, <64 x i8> %a1, i32 %mask) {
2806; CHECK-LABEL: stack_fold_pmaddubsw_zmm_mask:
2807; CHECK:       # %bb.0:
2808; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2809; CHECK-NEXT:    #APP
2810; CHECK-NEXT:    nop
2811; CHECK-NEXT:    #NO_APP
2812; CHECK-NEXT:    kmovd %esi, %k1
2813; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm2
2814; CHECK-NEXT:    vpmaddubsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
2815; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
2816; CHECK-NEXT:    retq
2817  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2818  %2 = call <32 x i16> @llvm.x86.avx512.pmaddubs.w.512(<64 x i8> %a0, <64 x i8> %a1)
2819  %3 = bitcast i32 %mask to <32 x i1>
2820  ; load needed to keep the operation from being scheduled above the asm block
2821  %4 = load <32 x i16>, <32 x i16>* %passthru
2822  %5 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> %4
2823  ret <32 x i16> %5
2824}
2825
2826define <32 x i16> @stack_fold_pmaddubsw_zmm_maskz(<64 x i8> %a0, <64 x i8> %a1, i32 %mask) {
2827; CHECK-LABEL: stack_fold_pmaddubsw_zmm_maskz:
2828; CHECK:       # %bb.0:
2829; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2830; CHECK-NEXT:    #APP
2831; CHECK-NEXT:    nop
2832; CHECK-NEXT:    #NO_APP
2833; CHECK-NEXT:    kmovd %edi, %k1
2834; CHECK-NEXT:    vpmaddubsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
2835; CHECK-NEXT:    retq
2836  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2837  %2 = call <32 x i16> @llvm.x86.avx512.pmaddubs.w.512(<64 x i8> %a0, <64 x i8> %a1)
2838  %3 = bitcast i32 %mask to <32 x i1>
2839  %4 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> zeroinitializer
2840  ret <32 x i16> %4
2841}
2842
2843define <16 x i32> @stack_fold_pmaddwd_zmm(<32 x i16> %a0, <32 x i16> %a1) {
2844; CHECK-LABEL: stack_fold_pmaddwd_zmm:
2845; CHECK:       # %bb.0:
2846; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2847; CHECK-NEXT:    #APP
2848; CHECK-NEXT:    nop
2849; CHECK-NEXT:    #NO_APP
2850; CHECK-NEXT:    vpmaddwd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
2851; CHECK-NEXT:    retq
2852  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2853  %2 = call <16 x i32> @llvm.x86.avx512.pmaddw.d.512(<32 x i16> %a0, <32 x i16> %a1)
2854  ret <16 x i32> %2
2855}
2856declare <16 x i32> @llvm.x86.avx512.pmaddw.d.512(<32 x i16>, <32 x i16>) nounwind readnone
2857
2858define <16 x i32> @stack_fold_pmaddwd_zmm_commuted(<32 x i16> %a0, <32 x i16> %a1) {
2859; CHECK-LABEL: stack_fold_pmaddwd_zmm_commuted:
2860; CHECK:       # %bb.0:
2861; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2862; CHECK-NEXT:    #APP
2863; CHECK-NEXT:    nop
2864; CHECK-NEXT:    #NO_APP
2865; CHECK-NEXT:    vpmaddwd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
2866; CHECK-NEXT:    retq
2867  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2868  %2 = call <16 x i32> @llvm.x86.avx512.pmaddw.d.512(<32 x i16> %a1, <32 x i16> %a0)
2869  ret <16 x i32> %2
2870}
2871
2872define <16 x i32> @stack_fold_pmaddwd_zmm_mask(<16 x i32>* %passthru, <32 x i16> %a0, <32 x i16> %a1, i16 %mask) {
2873; CHECK-LABEL: stack_fold_pmaddwd_zmm_mask:
2874; CHECK:       # %bb.0:
2875; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2876; CHECK-NEXT:    #APP
2877; CHECK-NEXT:    nop
2878; CHECK-NEXT:    #NO_APP
2879; CHECK-NEXT:    kmovd %esi, %k1
2880; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm2
2881; CHECK-NEXT:    vpmaddwd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
2882; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
2883; CHECK-NEXT:    retq
2884  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2885  %2 = call <16 x i32> @llvm.x86.avx512.pmaddw.d.512(<32 x i16> %a0, <32 x i16> %a1)
2886  %3 = bitcast i16 %mask to <16 x i1>
2887  ; load needed to keep the operation from being scheduled above the asm block
2888  %4 = load <16 x i32>, <16 x i32>* %passthru
2889  %5 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> %4
2890  ret <16 x i32> %5
2891}
2892
2893define <16 x i32> @stack_fold_pmaddwd_zmm_mask_commuted(<16 x i32>* %passthru, <32 x i16> %a0, <32 x i16> %a1, i16 %mask) {
2894; CHECK-LABEL: stack_fold_pmaddwd_zmm_mask_commuted:
2895; CHECK:       # %bb.0:
2896; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2897; CHECK-NEXT:    #APP
2898; CHECK-NEXT:    nop
2899; CHECK-NEXT:    #NO_APP
2900; CHECK-NEXT:    kmovd %esi, %k1
2901; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm2
2902; CHECK-NEXT:    vpmaddwd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
2903; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
2904; CHECK-NEXT:    retq
2905  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2906  %2 = call <16 x i32> @llvm.x86.avx512.pmaddw.d.512(<32 x i16> %a1, <32 x i16> %a0)
2907  %3 = bitcast i16 %mask to <16 x i1>
2908  ; load needed to keep the operation from being scheduled above the asm block
2909  %4 = load <16 x i32>, <16 x i32>* %passthru
2910  %5 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> %4
2911  ret <16 x i32> %5
2912}
2913
2914define <16 x i32> @stack_fold_pmaddwd_zmm_maskz(<16 x i32>* %passthru, <32 x i16> %a0, <32 x i16> %a1, i16 %mask) {
2915; CHECK-LABEL: stack_fold_pmaddwd_zmm_maskz:
2916; CHECK:       # %bb.0:
2917; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2918; CHECK-NEXT:    #APP
2919; CHECK-NEXT:    nop
2920; CHECK-NEXT:    #NO_APP
2921; CHECK-NEXT:    kmovd %esi, %k1
2922; CHECK-NEXT:    vpmaddwd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
2923; CHECK-NEXT:    retq
2924  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2925  %2 = call <16 x i32> @llvm.x86.avx512.pmaddw.d.512(<32 x i16> %a0, <32 x i16> %a1)
2926  %3 = bitcast i16 %mask to <16 x i1>
2927  %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer
2928  ret <16 x i32> %4
2929}
2930
2931define <16 x i32> @stack_fold_pmaddwd_zmm_maskz_commuted(<16 x i32>* %passthru, <32 x i16> %a0, <32 x i16> %a1, i16 %mask) {
2932; CHECK-LABEL: stack_fold_pmaddwd_zmm_maskz_commuted:
2933; CHECK:       # %bb.0:
2934; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2935; CHECK-NEXT:    #APP
2936; CHECK-NEXT:    nop
2937; CHECK-NEXT:    #NO_APP
2938; CHECK-NEXT:    kmovd %esi, %k1
2939; CHECK-NEXT:    vpmaddwd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
2940; CHECK-NEXT:    retq
2941  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2942  %2 = call <16 x i32> @llvm.x86.avx512.pmaddw.d.512(<32 x i16> %a1, <32 x i16> %a0)
2943  %3 = bitcast i16 %mask to <16 x i1>
2944  %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer
2945  ret <16 x i32> %4
2946}
2947
2948define <64 x i8> @stack_fold_pmaxsb(<64 x i8> %a0, <64 x i8> %a1) {
2949; CHECK-LABEL: stack_fold_pmaxsb:
2950; CHECK:       # %bb.0:
2951; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2952; CHECK-NEXT:    #APP
2953; CHECK-NEXT:    nop
2954; CHECK-NEXT:    #NO_APP
2955; CHECK-NEXT:    vpmaxsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
2956; CHECK-NEXT:    retq
2957  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2958  %2 = icmp sgt <64 x i8> %a0, %a1
2959  %3 = select <64 x i1> %2, <64 x i8> %a0, <64 x i8> %a1
2960  ret <64 x i8> %3
2961}
2962
2963define <64 x i8> @stack_fold_pmaxsb_commuted(<64 x i8> %a0, <64 x i8> %a1) {
2964; CHECK-LABEL: stack_fold_pmaxsb_commuted:
2965; CHECK:       # %bb.0:
2966; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2967; CHECK-NEXT:    #APP
2968; CHECK-NEXT:    nop
2969; CHECK-NEXT:    #NO_APP
2970; CHECK-NEXT:    vpmaxsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
2971; CHECK-NEXT:    retq
2972  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2973  %2 = icmp sgt <64 x i8> %a1, %a0
2974  %3 = select <64 x i1> %2, <64 x i8> %a1, <64 x i8> %a0
2975  ret <64 x i8> %3
2976}
2977
2978define <64 x i8> @stack_fold_pmaxsb_mask(<64 x i8> %a0, <64 x i8> %a1, i64 %mask, <64 x i8>* %passthru) {
2979; CHECK-LABEL: stack_fold_pmaxsb_mask:
2980; CHECK:       # %bb.0:
2981; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
2982; CHECK-NEXT:    #APP
2983; CHECK-NEXT:    nop
2984; CHECK-NEXT:    #NO_APP
2985; CHECK-NEXT:    kmovq %rdi, %k1
2986; CHECK-NEXT:    vmovdqa64 (%rsi), %zmm2
2987; CHECK-NEXT:    vpmaxsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
2988; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
2989; CHECK-NEXT:    retq
2990  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
2991  %2 = icmp sgt <64 x i8> %a0, %a1
2992  %3 = select <64 x i1> %2, <64 x i8> %a0, <64 x i8> %a1
2993  %4 = bitcast i64 %mask to <64 x i1>
2994  ; load needed to keep the operation from being scheduled above the asm block
2995  %5 = load <64 x i8>, <64 x i8>* %passthru
2996  %6 = select <64 x i1> %4, <64 x i8> %3, <64 x i8> %5
2997  ret <64 x i8> %6
2998}
2999
3000define <64 x i8> @stack_fold_pmaxsb_mask_commuted(<64 x i8> %a0, <64 x i8> %a1, i64 %mask, <64 x i8>* %passthru) {
3001; CHECK-LABEL: stack_fold_pmaxsb_mask_commuted:
3002; CHECK:       # %bb.0:
3003; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3004; CHECK-NEXT:    #APP
3005; CHECK-NEXT:    nop
3006; CHECK-NEXT:    #NO_APP
3007; CHECK-NEXT:    kmovq %rdi, %k1
3008; CHECK-NEXT:    vmovdqa64 (%rsi), %zmm2
3009; CHECK-NEXT:    vpmaxsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
3010; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
3011; CHECK-NEXT:    retq
3012  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3013  %2 = icmp sgt <64 x i8> %a1, %a0
3014  %3 = select <64 x i1> %2, <64 x i8> %a1, <64 x i8> %a0
3015  %4 = bitcast i64 %mask to <64 x i1>
3016  ; load needed to keep the operation from being scheduled above the asm block
3017  %5 = load <64 x i8>, <64 x i8>* %passthru
3018  %6 = select <64 x i1> %4, <64 x i8> %3, <64 x i8> %5
3019  ret <64 x i8> %6
3020}
3021
3022define <64 x i8> @stack_fold_pmaxsb_maskz(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
3023; CHECK-LABEL: stack_fold_pmaxsb_maskz:
3024; CHECK:       # %bb.0:
3025; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3026; CHECK-NEXT:    #APP
3027; CHECK-NEXT:    nop
3028; CHECK-NEXT:    #NO_APP
3029; CHECK-NEXT:    kmovq %rdi, %k1
3030; CHECK-NEXT:    vpmaxsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
3031; CHECK-NEXT:    retq
3032  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3033  %2 = icmp sgt <64 x i8> %a0, %a1
3034  %3 = select <64 x i1> %2, <64 x i8> %a0, <64 x i8> %a1
3035  %4 = bitcast i64 %mask to <64 x i1>
3036  %5 = select <64 x i1> %4, <64 x i8> %3, <64 x i8> zeroinitializer
3037  ret <64 x i8> %5
3038}
3039
3040define <64 x i8> @stack_fold_pmaxsb_maskz_commuted(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
3041; CHECK-LABEL: stack_fold_pmaxsb_maskz_commuted:
3042; CHECK:       # %bb.0:
3043; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3044; CHECK-NEXT:    #APP
3045; CHECK-NEXT:    nop
3046; CHECK-NEXT:    #NO_APP
3047; CHECK-NEXT:    kmovq %rdi, %k1
3048; CHECK-NEXT:    vpmaxsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
3049; CHECK-NEXT:    retq
3050  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3051  %2 = icmp sgt <64 x i8> %a1, %a0
3052  %3 = select <64 x i1> %2, <64 x i8> %a1, <64 x i8> %a0
3053  %4 = bitcast i64 %mask to <64 x i1>
3054  %5 = select <64 x i1> %4, <64 x i8> %3, <64 x i8> zeroinitializer
3055  ret <64 x i8> %5
3056}
3057
3058define <16 x i32> @stack_fold_pmaxsd(<16 x i32> %a0, <16 x i32> %a1) {
3059; CHECK-LABEL: stack_fold_pmaxsd:
3060; CHECK:       # %bb.0:
3061; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3062; CHECK-NEXT:    #APP
3063; CHECK-NEXT:    nop
3064; CHECK-NEXT:    #NO_APP
3065; CHECK-NEXT:    vpmaxsd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
3066; CHECK-NEXT:    retq
3067  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3068  %2 = icmp sgt <16 x i32> %a0, %a1
3069  %3 = select <16 x i1> %2, <16 x i32> %a0, <16 x i32> %a1
3070  ret <16 x i32> %3
3071}
3072
3073define <16 x i32> @stack_fold_pmaxsd_commuted(<16 x i32> %a0, <16 x i32> %a1) {
3074; CHECK-LABEL: stack_fold_pmaxsd_commuted:
3075; CHECK:       # %bb.0:
3076; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3077; CHECK-NEXT:    #APP
3078; CHECK-NEXT:    nop
3079; CHECK-NEXT:    #NO_APP
3080; CHECK-NEXT:    vpmaxsd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
3081; CHECK-NEXT:    retq
3082  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3083  %2 = icmp sgt <16 x i32> %a1, %a0
3084  %3 = select <16 x i1> %2, <16 x i32> %a1, <16 x i32> %a0
3085  ret <16 x i32> %3
3086}
3087
3088define <16 x i32> @stack_fold_pmaxsd_mask(<16 x i32> %a0, <16 x i32> %a1, i16 %mask, <16 x i32>* %passthru) {
3089; CHECK-LABEL: stack_fold_pmaxsd_mask:
3090; CHECK:       # %bb.0:
3091; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3092; CHECK-NEXT:    #APP
3093; CHECK-NEXT:    nop
3094; CHECK-NEXT:    #NO_APP
3095; CHECK-NEXT:    kmovd %edi, %k1
3096; CHECK-NEXT:    vmovdqa64 (%rsi), %zmm2
3097; CHECK-NEXT:    vpmaxsd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
3098; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
3099; CHECK-NEXT:    retq
3100  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3101  %2 = icmp sgt <16 x i32> %a0, %a1
3102  %3 = select <16 x i1> %2, <16 x i32> %a0, <16 x i32> %a1
3103  %4 = bitcast i16 %mask to <16 x i1>
3104  ; load needed to keep the operation from being scheduled above the asm block
3105  %5 = load <16 x i32>, <16 x i32>* %passthru
3106  %6 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> %5
3107  ret <16 x i32> %6
3108}
3109
3110define <16 x i32> @stack_fold_pmaxsd_mask_commuted(<16 x i32> %a0, <16 x i32> %a1, i16 %mask, <16 x i32>* %passthru) {
3111; CHECK-LABEL: stack_fold_pmaxsd_mask_commuted:
3112; CHECK:       # %bb.0:
3113; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3114; CHECK-NEXT:    #APP
3115; CHECK-NEXT:    nop
3116; CHECK-NEXT:    #NO_APP
3117; CHECK-NEXT:    kmovd %edi, %k1
3118; CHECK-NEXT:    vmovdqa64 (%rsi), %zmm2
3119; CHECK-NEXT:    vpmaxsd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
3120; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
3121; CHECK-NEXT:    retq
3122  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3123  %2 = icmp sgt <16 x i32> %a1, %a0
3124  %3 = select <16 x i1> %2, <16 x i32> %a1, <16 x i32> %a0
3125  %4 = bitcast i16 %mask to <16 x i1>
3126  ; load needed to keep the operation from being scheduled above the asm block
3127  %5 = load <16 x i32>, <16 x i32>* %passthru
3128  %6 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> %5
3129  ret <16 x i32> %6
3130}
3131
3132define <16 x i32> @stack_fold_pmaxsd_maskz(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
3133; CHECK-LABEL: stack_fold_pmaxsd_maskz:
3134; CHECK:       # %bb.0:
3135; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3136; CHECK-NEXT:    #APP
3137; CHECK-NEXT:    nop
3138; CHECK-NEXT:    #NO_APP
3139; CHECK-NEXT:    kmovd %edi, %k1
3140; CHECK-NEXT:    vpmaxsd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
3141; CHECK-NEXT:    retq
3142  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3143  %2 = icmp sgt <16 x i32> %a0, %a1
3144  %3 = select <16 x i1> %2, <16 x i32> %a0, <16 x i32> %a1
3145  %4 = bitcast i16 %mask to <16 x i1>
3146  %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> zeroinitializer
3147  ret <16 x i32> %5
3148}
3149
3150define <16 x i32> @stack_fold_pmaxsd_maskz_commuted(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
3151; CHECK-LABEL: stack_fold_pmaxsd_maskz_commuted:
3152; CHECK:       # %bb.0:
3153; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3154; CHECK-NEXT:    #APP
3155; CHECK-NEXT:    nop
3156; CHECK-NEXT:    #NO_APP
3157; CHECK-NEXT:    kmovd %edi, %k1
3158; CHECK-NEXT:    vpmaxsd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
3159; CHECK-NEXT:    retq
3160  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3161  %2 = icmp sgt <16 x i32> %a1, %a0
3162  %3 = select <16 x i1> %2, <16 x i32> %a1, <16 x i32> %a0
3163  %4 = bitcast i16 %mask to <16 x i1>
3164  %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> zeroinitializer
3165  ret <16 x i32> %5
3166}
3167
3168define <8 x i64> @stack_fold_pmaxsq(<8 x i64> %a0, <8 x i64> %a1) {
3169; CHECK-LABEL: stack_fold_pmaxsq:
3170; CHECK:       # %bb.0:
3171; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3172; CHECK-NEXT:    #APP
3173; CHECK-NEXT:    nop
3174; CHECK-NEXT:    #NO_APP
3175; CHECK-NEXT:    vpmaxsq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
3176; CHECK-NEXT:    retq
3177  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3178  %2 = icmp sgt <8 x i64> %a0, %a1
3179  %3 = select <8 x i1> %2, <8 x i64> %a0, <8 x i64> %a1
3180  ret <8 x i64> %3
3181}
3182
3183define <8 x i64> @stack_fold_pmaxsq_commuted(<8 x i64> %a0, <8 x i64> %a1) {
3184; CHECK-LABEL: stack_fold_pmaxsq_commuted:
3185; CHECK:       # %bb.0:
3186; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3187; CHECK-NEXT:    #APP
3188; CHECK-NEXT:    nop
3189; CHECK-NEXT:    #NO_APP
3190; CHECK-NEXT:    vpmaxsq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
3191; CHECK-NEXT:    retq
3192  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3193  %2 = icmp sgt <8 x i64> %a1, %a0
3194  %3 = select <8 x i1> %2, <8 x i64> %a1, <8 x i64> %a0
3195  ret <8 x i64> %3
3196}
3197
3198define <8 x i64> @stack_fold_pmaxsq_mask(<8 x i64> %a0, <8 x i64> %a1, i8 %mask, <8 x i64>* %passthru) {
3199; CHECK-LABEL: stack_fold_pmaxsq_mask:
3200; CHECK:       # %bb.0:
3201; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3202; CHECK-NEXT:    #APP
3203; CHECK-NEXT:    nop
3204; CHECK-NEXT:    #NO_APP
3205; CHECK-NEXT:    kmovd %edi, %k1
3206; CHECK-NEXT:    vmovdqa64 (%rsi), %zmm2
3207; CHECK-NEXT:    vpmaxsq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
3208; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
3209; CHECK-NEXT:    retq
3210  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3211  %2 = icmp sgt <8 x i64> %a0, %a1
3212  %3 = select <8 x i1> %2, <8 x i64> %a0, <8 x i64> %a1
3213  %4 = bitcast i8 %mask to <8 x i1>
3214  ; load needed to keep the operation from being scheduled above the asm block
3215  %5 = load <8 x i64>, <8 x i64>* %passthru
3216  %6 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %5
3217  ret <8 x i64> %6
3218}
3219
3220define <8 x i64> @stack_fold_pmaxsq_mask_commuted(<8 x i64> %a0, <8 x i64> %a1, i8 %mask, <8 x i64>* %passthru) {
3221; CHECK-LABEL: stack_fold_pmaxsq_mask_commuted:
3222; CHECK:       # %bb.0:
3223; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3224; CHECK-NEXT:    #APP
3225; CHECK-NEXT:    nop
3226; CHECK-NEXT:    #NO_APP
3227; CHECK-NEXT:    kmovd %edi, %k1
3228; CHECK-NEXT:    vmovdqa64 (%rsi), %zmm2
3229; CHECK-NEXT:    vpmaxsq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
3230; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
3231; CHECK-NEXT:    retq
3232  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3233  %2 = icmp sgt <8 x i64> %a1, %a0
3234  %3 = select <8 x i1> %2, <8 x i64> %a1, <8 x i64> %a0
3235  %4 = bitcast i8 %mask to <8 x i1>
3236  ; load needed to keep the operation from being scheduled above the asm block
3237  %5 = load <8 x i64>, <8 x i64>* %passthru
3238  %6 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %5
3239  ret <8 x i64> %6
3240}
3241
3242define <8 x i64> @stack_fold_pmaxsq_maskz(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
3243; CHECK-LABEL: stack_fold_pmaxsq_maskz:
3244; CHECK:       # %bb.0:
3245; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3246; CHECK-NEXT:    #APP
3247; CHECK-NEXT:    nop
3248; CHECK-NEXT:    #NO_APP
3249; CHECK-NEXT:    kmovd %edi, %k1
3250; CHECK-NEXT:    vpmaxsq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
3251; CHECK-NEXT:    retq
3252  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3253  %2 = icmp sgt <8 x i64> %a0, %a1
3254  %3 = select <8 x i1> %2, <8 x i64> %a0, <8 x i64> %a1
3255  %4 = bitcast i8 %mask to <8 x i1>
3256  %5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> zeroinitializer
3257  ret <8 x i64> %5
3258}
3259
3260define <8 x i64> @stack_fold_pmaxsq_maskz_commuted(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
3261; CHECK-LABEL: stack_fold_pmaxsq_maskz_commuted:
3262; CHECK:       # %bb.0:
3263; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3264; CHECK-NEXT:    #APP
3265; CHECK-NEXT:    nop
3266; CHECK-NEXT:    #NO_APP
3267; CHECK-NEXT:    kmovd %edi, %k1
3268; CHECK-NEXT:    vpmaxsq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
3269; CHECK-NEXT:    retq
3270  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3271  %2 = icmp sgt <8 x i64> %a1, %a0
3272  %3 = select <8 x i1> %2, <8 x i64> %a1, <8 x i64> %a0
3273  %4 = bitcast i8 %mask to <8 x i1>
3274  %5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> zeroinitializer
3275  ret <8 x i64> %5
3276}
3277
3278define <32 x i16> @stack_fold_pmaxsw(<32 x i16> %a0, <32 x i16> %a1) {
3279; CHECK-LABEL: stack_fold_pmaxsw:
3280; CHECK:       # %bb.0:
3281; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3282; CHECK-NEXT:    #APP
3283; CHECK-NEXT:    nop
3284; CHECK-NEXT:    #NO_APP
3285; CHECK-NEXT:    vpmaxsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
3286; CHECK-NEXT:    retq
3287  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3288  %2 = icmp sgt <32 x i16> %a0, %a1
3289  %3 = select <32 x i1> %2, <32 x i16> %a0, <32 x i16> %a1
3290  ret <32 x i16> %3
3291}
3292
3293define <32 x i16> @stack_fold_pmaxsw_commuted(<32 x i16> %a0, <32 x i16> %a1) {
3294; CHECK-LABEL: stack_fold_pmaxsw_commuted:
3295; CHECK:       # %bb.0:
3296; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3297; CHECK-NEXT:    #APP
3298; CHECK-NEXT:    nop
3299; CHECK-NEXT:    #NO_APP
3300; CHECK-NEXT:    vpmaxsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
3301; CHECK-NEXT:    retq
3302  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3303  %2 = icmp sgt <32 x i16> %a1, %a0
3304  %3 = select <32 x i1> %2, <32 x i16> %a1, <32 x i16> %a0
3305  ret <32 x i16> %3
3306}
3307
3308define <32 x i16> @stack_fold_pmaxsw_mask(<32 x i16> %a0, <32 x i16> %a1, i32 %mask, <32 x i16>* %passthru) {
3309; CHECK-LABEL: stack_fold_pmaxsw_mask:
3310; CHECK:       # %bb.0:
3311; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3312; CHECK-NEXT:    #APP
3313; CHECK-NEXT:    nop
3314; CHECK-NEXT:    #NO_APP
3315; CHECK-NEXT:    kmovd %edi, %k1
3316; CHECK-NEXT:    vmovdqa64 (%rsi), %zmm2
3317; CHECK-NEXT:    vpmaxsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
3318; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
3319; CHECK-NEXT:    retq
3320  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3321  %2 = icmp sgt <32 x i16> %a0, %a1
3322  %3 = select <32 x i1> %2, <32 x i16> %a0, <32 x i16> %a1
3323  %4 = bitcast i32 %mask to <32 x i1>
3324  ; load needed to keep the operation from being scheduled above the asm block
3325  %5 = load <32 x i16>, <32 x i16>* %passthru
3326  %6 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> %5
3327  ret <32 x i16> %6
3328}
3329
3330define <32 x i16> @stack_fold_pmaxsw_mask_commuted(<32 x i16> %a0, <32 x i16> %a1, i32 %mask, <32 x i16>* %passthru) {
3331; CHECK-LABEL: stack_fold_pmaxsw_mask_commuted:
3332; CHECK:       # %bb.0:
3333; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3334; CHECK-NEXT:    #APP
3335; CHECK-NEXT:    nop
3336; CHECK-NEXT:    #NO_APP
3337; CHECK-NEXT:    kmovd %edi, %k1
3338; CHECK-NEXT:    vmovdqa64 (%rsi), %zmm2
3339; CHECK-NEXT:    vpmaxsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
3340; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
3341; CHECK-NEXT:    retq
3342  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3343  %2 = icmp sgt <32 x i16> %a1, %a0
3344  %3 = select <32 x i1> %2, <32 x i16> %a1, <32 x i16> %a0
3345  %4 = bitcast i32 %mask to <32 x i1>
3346  ; load needed to keep the operation from being scheduled above the asm block
3347  %5 = load <32 x i16>, <32 x i16>* %passthru
3348  %6 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> %5
3349  ret <32 x i16> %6
3350}
3351
3352define <32 x i16> @stack_fold_pmaxsw_maskz(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) {
3353; CHECK-LABEL: stack_fold_pmaxsw_maskz:
3354; CHECK:       # %bb.0:
3355; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3356; CHECK-NEXT:    #APP
3357; CHECK-NEXT:    nop
3358; CHECK-NEXT:    #NO_APP
3359; CHECK-NEXT:    kmovd %edi, %k1
3360; CHECK-NEXT:    vpmaxsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
3361; CHECK-NEXT:    retq
3362  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3363  %2 = icmp sgt <32 x i16> %a0, %a1
3364  %3 = select <32 x i1> %2, <32 x i16> %a0, <32 x i16> %a1
3365  %4 = bitcast i32 %mask to <32 x i1>
3366  %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> zeroinitializer
3367  ret <32 x i16> %5
3368}
3369
3370define <32 x i16> @stack_fold_pmaxsw_maskz_commuted(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) {
3371; CHECK-LABEL: stack_fold_pmaxsw_maskz_commuted:
3372; CHECK:       # %bb.0:
3373; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3374; CHECK-NEXT:    #APP
3375; CHECK-NEXT:    nop
3376; CHECK-NEXT:    #NO_APP
3377; CHECK-NEXT:    kmovd %edi, %k1
3378; CHECK-NEXT:    vpmaxsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
3379; CHECK-NEXT:    retq
3380  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3381  %2 = icmp sgt <32 x i16> %a1, %a0
3382  %3 = select <32 x i1> %2, <32 x i16> %a1, <32 x i16> %a0
3383  %4 = bitcast i32 %mask to <32 x i1>
3384  %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> zeroinitializer
3385  ret <32 x i16> %5
3386}
3387
3388define <64 x i8> @stack_fold_pmaxub(<64 x i8> %a0, <64 x i8> %a1) {
3389; CHECK-LABEL: stack_fold_pmaxub:
3390; CHECK:       # %bb.0:
3391; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3392; CHECK-NEXT:    #APP
3393; CHECK-NEXT:    nop
3394; CHECK-NEXT:    #NO_APP
3395; CHECK-NEXT:    vpmaxub {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
3396; CHECK-NEXT:    retq
3397  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3398  %2 = icmp ugt <64 x i8> %a0, %a1
3399  %3 = select <64 x i1> %2, <64 x i8> %a0, <64 x i8> %a1
3400  ret <64 x i8> %3
3401}
3402
3403define <64 x i8> @stack_fold_pmaxub_commuted(<64 x i8> %a0, <64 x i8> %a1) {
3404; CHECK-LABEL: stack_fold_pmaxub_commuted:
3405; CHECK:       # %bb.0:
3406; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3407; CHECK-NEXT:    #APP
3408; CHECK-NEXT:    nop
3409; CHECK-NEXT:    #NO_APP
3410; CHECK-NEXT:    vpmaxub {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
3411; CHECK-NEXT:    retq
3412  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3413  %2 = icmp ugt <64 x i8> %a1, %a0
3414  %3 = select <64 x i1> %2, <64 x i8> %a1, <64 x i8> %a0
3415  ret <64 x i8> %3
3416}
3417
3418define <64 x i8> @stack_fold_pmaxub_mask(<64 x i8> %a0, <64 x i8> %a1, i64 %mask, <64 x i8>* %passthru) {
3419; CHECK-LABEL: stack_fold_pmaxub_mask:
3420; CHECK:       # %bb.0:
3421; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3422; CHECK-NEXT:    #APP
3423; CHECK-NEXT:    nop
3424; CHECK-NEXT:    #NO_APP
3425; CHECK-NEXT:    kmovq %rdi, %k1
3426; CHECK-NEXT:    vmovdqa64 (%rsi), %zmm2
3427; CHECK-NEXT:    vpmaxub {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
3428; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
3429; CHECK-NEXT:    retq
3430  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3431  %2 = icmp ugt <64 x i8> %a0, %a1
3432  %3 = select <64 x i1> %2, <64 x i8> %a0, <64 x i8> %a1
3433  %4 = bitcast i64 %mask to <64 x i1>
3434  ; load needed to keep the operation from being scheduled above the asm block
3435  %5 = load <64 x i8>, <64 x i8>* %passthru
3436  %6 = select <64 x i1> %4, <64 x i8> %3, <64 x i8> %5
3437  ret <64 x i8> %6
3438}
3439
3440define <64 x i8> @stack_fold_pmaxub_mask_commuted(<64 x i8> %a0, <64 x i8> %a1, i64 %mask, <64 x i8>* %passthru) {
3441; CHECK-LABEL: stack_fold_pmaxub_mask_commuted:
3442; CHECK:       # %bb.0:
3443; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3444; CHECK-NEXT:    #APP
3445; CHECK-NEXT:    nop
3446; CHECK-NEXT:    #NO_APP
3447; CHECK-NEXT:    kmovq %rdi, %k1
3448; CHECK-NEXT:    vmovdqa64 (%rsi), %zmm2
3449; CHECK-NEXT:    vpmaxub {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
3450; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
3451; CHECK-NEXT:    retq
3452  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3453  %2 = icmp ugt <64 x i8> %a1, %a0
3454  %3 = select <64 x i1> %2, <64 x i8> %a1, <64 x i8> %a0
3455  %4 = bitcast i64 %mask to <64 x i1>
3456  ; load needed to keep the operation from being scheduled above the asm block
3457  %5 = load <64 x i8>, <64 x i8>* %passthru
3458  %6 = select <64 x i1> %4, <64 x i8> %3, <64 x i8> %5
3459  ret <64 x i8> %6
3460}
3461
3462define <64 x i8> @stack_fold_pmaxub_maskz(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
3463; CHECK-LABEL: stack_fold_pmaxub_maskz:
3464; CHECK:       # %bb.0:
3465; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3466; CHECK-NEXT:    #APP
3467; CHECK-NEXT:    nop
3468; CHECK-NEXT:    #NO_APP
3469; CHECK-NEXT:    kmovq %rdi, %k1
3470; CHECK-NEXT:    vpmaxub {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
3471; CHECK-NEXT:    retq
3472  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3473  %2 = icmp ugt <64 x i8> %a0, %a1
3474  %3 = select <64 x i1> %2, <64 x i8> %a0, <64 x i8> %a1
3475  %4 = bitcast i64 %mask to <64 x i1>
3476  %5 = select <64 x i1> %4, <64 x i8> %3, <64 x i8> zeroinitializer
3477  ret <64 x i8> %5
3478}
3479
3480define <64 x i8> @stack_fold_pmaxub_maskz_commuted(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
3481; CHECK-LABEL: stack_fold_pmaxub_maskz_commuted:
3482; CHECK:       # %bb.0:
3483; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3484; CHECK-NEXT:    #APP
3485; CHECK-NEXT:    nop
3486; CHECK-NEXT:    #NO_APP
3487; CHECK-NEXT:    kmovq %rdi, %k1
3488; CHECK-NEXT:    vpmaxub {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
3489; CHECK-NEXT:    retq
3490  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3491  %2 = icmp ugt <64 x i8> %a1, %a0
3492  %3 = select <64 x i1> %2, <64 x i8> %a1, <64 x i8> %a0
3493  %4 = bitcast i64 %mask to <64 x i1>
3494  %5 = select <64 x i1> %4, <64 x i8> %3, <64 x i8> zeroinitializer
3495  ret <64 x i8> %5
3496}
3497
3498define <16 x i32> @stack_fold_pmaxud(<16 x i32> %a0, <16 x i32> %a1) {
3499; CHECK-LABEL: stack_fold_pmaxud:
3500; CHECK:       # %bb.0:
3501; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3502; CHECK-NEXT:    #APP
3503; CHECK-NEXT:    nop
3504; CHECK-NEXT:    #NO_APP
3505; CHECK-NEXT:    vpmaxud {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
3506; CHECK-NEXT:    retq
3507  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3508  %2 = icmp ugt <16 x i32> %a0, %a1
3509  %3 = select <16 x i1> %2, <16 x i32> %a0, <16 x i32> %a1
3510  ret <16 x i32> %3
3511}
3512
3513define <16 x i32> @stack_fold_pmaxud_commuted(<16 x i32> %a0, <16 x i32> %a1) {
3514; CHECK-LABEL: stack_fold_pmaxud_commuted:
3515; CHECK:       # %bb.0:
3516; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3517; CHECK-NEXT:    #APP
3518; CHECK-NEXT:    nop
3519; CHECK-NEXT:    #NO_APP
3520; CHECK-NEXT:    vpmaxud {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
3521; CHECK-NEXT:    retq
3522  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3523  %2 = icmp ugt <16 x i32> %a1, %a0
3524  %3 = select <16 x i1> %2, <16 x i32> %a1, <16 x i32> %a0
3525  ret <16 x i32> %3
3526}
3527
3528define <16 x i32> @stack_fold_pmaxud_mask(<16 x i32> %a0, <16 x i32> %a1, i16 %mask, <16 x i32>* %passthru) {
3529; CHECK-LABEL: stack_fold_pmaxud_mask:
3530; CHECK:       # %bb.0:
3531; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3532; CHECK-NEXT:    #APP
3533; CHECK-NEXT:    nop
3534; CHECK-NEXT:    #NO_APP
3535; CHECK-NEXT:    kmovd %edi, %k1
3536; CHECK-NEXT:    vmovdqa64 (%rsi), %zmm2
3537; CHECK-NEXT:    vpmaxud {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
3538; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
3539; CHECK-NEXT:    retq
3540  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3541  %2 = icmp ugt <16 x i32> %a0, %a1
3542  %3 = select <16 x i1> %2, <16 x i32> %a0, <16 x i32> %a1
3543  %4 = bitcast i16 %mask to <16 x i1>
3544  ; load needed to keep the operation from being scheduled above the asm block
3545  %5 = load <16 x i32>, <16 x i32>* %passthru
3546  %6 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> %5
3547  ret <16 x i32> %6
3548}
3549
3550define <16 x i32> @stack_fold_pmaxud_mask_commuted(<16 x i32> %a0, <16 x i32> %a1, i16 %mask, <16 x i32>* %passthru) {
3551; CHECK-LABEL: stack_fold_pmaxud_mask_commuted:
3552; CHECK:       # %bb.0:
3553; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3554; CHECK-NEXT:    #APP
3555; CHECK-NEXT:    nop
3556; CHECK-NEXT:    #NO_APP
3557; CHECK-NEXT:    kmovd %edi, %k1
3558; CHECK-NEXT:    vmovdqa64 (%rsi), %zmm2
3559; CHECK-NEXT:    vpmaxud {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
3560; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
3561; CHECK-NEXT:    retq
3562  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3563  %2 = icmp ugt <16 x i32> %a1, %a0
3564  %3 = select <16 x i1> %2, <16 x i32> %a1, <16 x i32> %a0
3565  %4 = bitcast i16 %mask to <16 x i1>
3566  ; load needed to keep the operation from being scheduled above the asm block
3567  %5 = load <16 x i32>, <16 x i32>* %passthru
3568  %6 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> %5
3569  ret <16 x i32> %6
3570}
3571
3572define <16 x i32> @stack_fold_pmaxud_maskz(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
3573; CHECK-LABEL: stack_fold_pmaxud_maskz:
3574; CHECK:       # %bb.0:
3575; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3576; CHECK-NEXT:    #APP
3577; CHECK-NEXT:    nop
3578; CHECK-NEXT:    #NO_APP
3579; CHECK-NEXT:    kmovd %edi, %k1
3580; CHECK-NEXT:    vpmaxud {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
3581; CHECK-NEXT:    retq
3582  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3583  %2 = icmp ugt <16 x i32> %a0, %a1
3584  %3 = select <16 x i1> %2, <16 x i32> %a0, <16 x i32> %a1
3585  %4 = bitcast i16 %mask to <16 x i1>
3586  %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> zeroinitializer
3587  ret <16 x i32> %5
3588}
3589
3590define <16 x i32> @stack_fold_pmaxud_maskz_commuted(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
3591; CHECK-LABEL: stack_fold_pmaxud_maskz_commuted:
3592; CHECK:       # %bb.0:
3593; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3594; CHECK-NEXT:    #APP
3595; CHECK-NEXT:    nop
3596; CHECK-NEXT:    #NO_APP
3597; CHECK-NEXT:    kmovd %edi, %k1
3598; CHECK-NEXT:    vpmaxud {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
3599; CHECK-NEXT:    retq
3600  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3601  %2 = icmp ugt <16 x i32> %a1, %a0
3602  %3 = select <16 x i1> %2, <16 x i32> %a1, <16 x i32> %a0
3603  %4 = bitcast i16 %mask to <16 x i1>
3604  %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> zeroinitializer
3605  ret <16 x i32> %5
3606}
3607
3608define <8 x i64> @stack_fold_pmaxuq(<8 x i64> %a0, <8 x i64> %a1) {
3609; CHECK-LABEL: stack_fold_pmaxuq:
3610; CHECK:       # %bb.0:
3611; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3612; CHECK-NEXT:    #APP
3613; CHECK-NEXT:    nop
3614; CHECK-NEXT:    #NO_APP
3615; CHECK-NEXT:    vpmaxuq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
3616; CHECK-NEXT:    retq
3617  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3618  %2 = icmp ugt <8 x i64> %a0, %a1
3619  %3 = select <8 x i1> %2, <8 x i64> %a0, <8 x i64> %a1
3620  ret <8 x i64> %3
3621}
3622
3623define <8 x i64> @stack_fold_pmaxuq_commuted(<8 x i64> %a0, <8 x i64> %a1) {
3624; CHECK-LABEL: stack_fold_pmaxuq_commuted:
3625; CHECK:       # %bb.0:
3626; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3627; CHECK-NEXT:    #APP
3628; CHECK-NEXT:    nop
3629; CHECK-NEXT:    #NO_APP
3630; CHECK-NEXT:    vpmaxuq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
3631; CHECK-NEXT:    retq
3632  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3633  %2 = icmp ugt <8 x i64> %a1, %a0
3634  %3 = select <8 x i1> %2, <8 x i64> %a1, <8 x i64> %a0
3635  ret <8 x i64> %3
3636}
3637
3638define <8 x i64> @stack_fold_pmaxuq_mask(<8 x i64> %a0, <8 x i64> %a1, i8 %mask, <8 x i64>* %passthru) {
3639; CHECK-LABEL: stack_fold_pmaxuq_mask:
3640; CHECK:       # %bb.0:
3641; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3642; CHECK-NEXT:    #APP
3643; CHECK-NEXT:    nop
3644; CHECK-NEXT:    #NO_APP
3645; CHECK-NEXT:    kmovd %edi, %k1
3646; CHECK-NEXT:    vmovdqa64 (%rsi), %zmm2
3647; CHECK-NEXT:    vpmaxuq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
3648; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
3649; CHECK-NEXT:    retq
3650  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3651  %2 = icmp ugt <8 x i64> %a0, %a1
3652  %3 = select <8 x i1> %2, <8 x i64> %a0, <8 x i64> %a1
3653  %4 = bitcast i8 %mask to <8 x i1>
3654  ; load needed to keep the operation from being scheduled above the asm block
3655  %5 = load <8 x i64>, <8 x i64>* %passthru
3656  %6 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %5
3657  ret <8 x i64> %6
3658}
3659
3660define <8 x i64> @stack_fold_pmaxuq_mask_commuted(<8 x i64> %a0, <8 x i64> %a1, i8 %mask, <8 x i64>* %passthru) {
3661; CHECK-LABEL: stack_fold_pmaxuq_mask_commuted:
3662; CHECK:       # %bb.0:
3663; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3664; CHECK-NEXT:    #APP
3665; CHECK-NEXT:    nop
3666; CHECK-NEXT:    #NO_APP
3667; CHECK-NEXT:    kmovd %edi, %k1
3668; CHECK-NEXT:    vmovdqa64 (%rsi), %zmm2
3669; CHECK-NEXT:    vpmaxuq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
3670; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
3671; CHECK-NEXT:    retq
3672  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3673  %2 = icmp ugt <8 x i64> %a1, %a0
3674  %3 = select <8 x i1> %2, <8 x i64> %a1, <8 x i64> %a0
3675  %4 = bitcast i8 %mask to <8 x i1>
3676  ; load needed to keep the operation from being scheduled above the asm block
3677  %5 = load <8 x i64>, <8 x i64>* %passthru
3678  %6 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %5
3679  ret <8 x i64> %6
3680}
3681
3682define <8 x i64> @stack_fold_pmaxuq_maskz(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
3683; CHECK-LABEL: stack_fold_pmaxuq_maskz:
3684; CHECK:       # %bb.0:
3685; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3686; CHECK-NEXT:    #APP
3687; CHECK-NEXT:    nop
3688; CHECK-NEXT:    #NO_APP
3689; CHECK-NEXT:    kmovd %edi, %k1
3690; CHECK-NEXT:    vpmaxuq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
3691; CHECK-NEXT:    retq
3692  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3693  %2 = icmp ugt <8 x i64> %a0, %a1
3694  %3 = select <8 x i1> %2, <8 x i64> %a0, <8 x i64> %a1
3695  %4 = bitcast i8 %mask to <8 x i1>
3696  %5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> zeroinitializer
3697  ret <8 x i64> %5
3698}
3699
3700define <8 x i64> @stack_fold_pmaxuq_maskz_commuted(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
3701; CHECK-LABEL: stack_fold_pmaxuq_maskz_commuted:
3702; CHECK:       # %bb.0:
3703; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3704; CHECK-NEXT:    #APP
3705; CHECK-NEXT:    nop
3706; CHECK-NEXT:    #NO_APP
3707; CHECK-NEXT:    kmovd %edi, %k1
3708; CHECK-NEXT:    vpmaxuq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
3709; CHECK-NEXT:    retq
3710  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3711  %2 = icmp ugt <8 x i64> %a1, %a0
3712  %3 = select <8 x i1> %2, <8 x i64> %a1, <8 x i64> %a0
3713  %4 = bitcast i8 %mask to <8 x i1>
3714  %5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> zeroinitializer
3715  ret <8 x i64> %5
3716}
3717
3718define <32 x i16> @stack_fold_pmaxuw(<32 x i16> %a0, <32 x i16> %a1) {
3719; CHECK-LABEL: stack_fold_pmaxuw:
3720; CHECK:       # %bb.0:
3721; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3722; CHECK-NEXT:    #APP
3723; CHECK-NEXT:    nop
3724; CHECK-NEXT:    #NO_APP
3725; CHECK-NEXT:    vpmaxuw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
3726; CHECK-NEXT:    retq
3727  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3728  %2 = icmp ugt <32 x i16> %a0, %a1
3729  %3 = select <32 x i1> %2, <32 x i16> %a0, <32 x i16> %a1
3730  ret <32 x i16> %3
3731}
3732
3733define <32 x i16> @stack_fold_pmaxuw_commuted(<32 x i16> %a0, <32 x i16> %a1) {
3734; CHECK-LABEL: stack_fold_pmaxuw_commuted:
3735; CHECK:       # %bb.0:
3736; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3737; CHECK-NEXT:    #APP
3738; CHECK-NEXT:    nop
3739; CHECK-NEXT:    #NO_APP
3740; CHECK-NEXT:    vpmaxuw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
3741; CHECK-NEXT:    retq
3742  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3743  %2 = icmp ugt <32 x i16> %a1, %a0
3744  %3 = select <32 x i1> %2, <32 x i16> %a1, <32 x i16> %a0
3745  ret <32 x i16> %3
3746}
3747
3748define <32 x i16> @stack_fold_pmaxuw_mask(<32 x i16> %a0, <32 x i16> %a1, i32 %mask, <32 x i16>* %passthru) {
3749; CHECK-LABEL: stack_fold_pmaxuw_mask:
3750; CHECK:       # %bb.0:
3751; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3752; CHECK-NEXT:    #APP
3753; CHECK-NEXT:    nop
3754; CHECK-NEXT:    #NO_APP
3755; CHECK-NEXT:    kmovd %edi, %k1
3756; CHECK-NEXT:    vmovdqa64 (%rsi), %zmm2
3757; CHECK-NEXT:    vpmaxuw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
3758; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
3759; CHECK-NEXT:    retq
3760  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3761  %2 = icmp ugt <32 x i16> %a0, %a1
3762  %3 = select <32 x i1> %2, <32 x i16> %a0, <32 x i16> %a1
3763  %4 = bitcast i32 %mask to <32 x i1>
3764  ; load needed to keep the operation from being scheduled above the asm block
3765  %5 = load <32 x i16>, <32 x i16>* %passthru
3766  %6 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> %5
3767  ret <32 x i16> %6
3768}
3769
3770define <32 x i16> @stack_fold_pmaxuw_mask_commuted(<32 x i16> %a0, <32 x i16> %a1, i32 %mask, <32 x i16>* %passthru) {
3771; CHECK-LABEL: stack_fold_pmaxuw_mask_commuted:
3772; CHECK:       # %bb.0:
3773; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3774; CHECK-NEXT:    #APP
3775; CHECK-NEXT:    nop
3776; CHECK-NEXT:    #NO_APP
3777; CHECK-NEXT:    kmovd %edi, %k1
3778; CHECK-NEXT:    vmovdqa64 (%rsi), %zmm2
3779; CHECK-NEXT:    vpmaxuw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
3780; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
3781; CHECK-NEXT:    retq
3782  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3783  %2 = icmp ugt <32 x i16> %a1, %a0
3784  %3 = select <32 x i1> %2, <32 x i16> %a1, <32 x i16> %a0
3785  %4 = bitcast i32 %mask to <32 x i1>
3786  ; load needed to keep the operation from being scheduled above the asm block
3787  %5 = load <32 x i16>, <32 x i16>* %passthru
3788  %6 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> %5
3789  ret <32 x i16> %6
3790}
3791
3792define <32 x i16> @stack_fold_pmaxuw_maskz(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) {
3793; CHECK-LABEL: stack_fold_pmaxuw_maskz:
3794; CHECK:       # %bb.0:
3795; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3796; CHECK-NEXT:    #APP
3797; CHECK-NEXT:    nop
3798; CHECK-NEXT:    #NO_APP
3799; CHECK-NEXT:    kmovd %edi, %k1
3800; CHECK-NEXT:    vpmaxuw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
3801; CHECK-NEXT:    retq
3802  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3803  %2 = icmp ugt <32 x i16> %a0, %a1
3804  %3 = select <32 x i1> %2, <32 x i16> %a0, <32 x i16> %a1
3805  %4 = bitcast i32 %mask to <32 x i1>
3806  %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> zeroinitializer
3807  ret <32 x i16> %5
3808}
3809
3810define <32 x i16> @stack_fold_pmaxuw_maskz_commuted(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) {
3811; CHECK-LABEL: stack_fold_pmaxuw_maskz_commuted:
3812; CHECK:       # %bb.0:
3813; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3814; CHECK-NEXT:    #APP
3815; CHECK-NEXT:    nop
3816; CHECK-NEXT:    #NO_APP
3817; CHECK-NEXT:    kmovd %edi, %k1
3818; CHECK-NEXT:    vpmaxuw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
3819; CHECK-NEXT:    retq
3820  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3821  %2 = icmp ugt <32 x i16> %a1, %a0
3822  %3 = select <32 x i1> %2, <32 x i16> %a1, <32 x i16> %a0
3823  %4 = bitcast i32 %mask to <32 x i1>
3824  %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> zeroinitializer
3825  ret <32 x i16> %5
3826}
3827
3828define <64 x i8> @stack_fold_pminsb(<64 x i8> %a0, <64 x i8> %a1) {
3829; CHECK-LABEL: stack_fold_pminsb:
3830; CHECK:       # %bb.0:
3831; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3832; CHECK-NEXT:    #APP
3833; CHECK-NEXT:    nop
3834; CHECK-NEXT:    #NO_APP
3835; CHECK-NEXT:    vpminsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
3836; CHECK-NEXT:    retq
3837  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3838  %2 = icmp slt <64 x i8> %a0, %a1
3839  %3 = select <64 x i1> %2, <64 x i8> %a0, <64 x i8> %a1
3840  ret <64 x i8> %3
3841}
3842
3843define <64 x i8> @stack_fold_pminsb_commuted(<64 x i8> %a0, <64 x i8> %a1) {
3844; CHECK-LABEL: stack_fold_pminsb_commuted:
3845; CHECK:       # %bb.0:
3846; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3847; CHECK-NEXT:    #APP
3848; CHECK-NEXT:    nop
3849; CHECK-NEXT:    #NO_APP
3850; CHECK-NEXT:    vpminsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
3851; CHECK-NEXT:    retq
3852  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3853  %2 = icmp slt <64 x i8> %a1, %a0
3854  %3 = select <64 x i1> %2, <64 x i8> %a1, <64 x i8> %a0
3855  ret <64 x i8> %3
3856}
3857
3858define <64 x i8> @stack_fold_pminsb_mask(<64 x i8> %a0, <64 x i8> %a1, i64 %mask, <64 x i8>* %passthru) {
3859; CHECK-LABEL: stack_fold_pminsb_mask:
3860; CHECK:       # %bb.0:
3861; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3862; CHECK-NEXT:    #APP
3863; CHECK-NEXT:    nop
3864; CHECK-NEXT:    #NO_APP
3865; CHECK-NEXT:    kmovq %rdi, %k1
3866; CHECK-NEXT:    vmovdqa64 (%rsi), %zmm2
3867; CHECK-NEXT:    vpminsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
3868; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
3869; CHECK-NEXT:    retq
3870  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3871  %2 = icmp slt <64 x i8> %a0, %a1
3872  %3 = select <64 x i1> %2, <64 x i8> %a0, <64 x i8> %a1
3873  %4 = bitcast i64 %mask to <64 x i1>
3874  ; load needed to keep the operation from being scheduled about the asm block
3875  %5 = load <64 x i8>, <64 x i8>* %passthru
3876  %6 = select <64 x i1> %4, <64 x i8> %3, <64 x i8> %5
3877  ret <64 x i8> %6
3878}
3879
3880define <64 x i8> @stack_fold_pminsb_mask_commuted(<64 x i8> %a0, <64 x i8> %a1, i64 %mask, <64 x i8>* %passthru) {
3881; CHECK-LABEL: stack_fold_pminsb_mask_commuted:
3882; CHECK:       # %bb.0:
3883; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3884; CHECK-NEXT:    #APP
3885; CHECK-NEXT:    nop
3886; CHECK-NEXT:    #NO_APP
3887; CHECK-NEXT:    kmovq %rdi, %k1
3888; CHECK-NEXT:    vmovdqa64 (%rsi), %zmm2
3889; CHECK-NEXT:    vpminsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
3890; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
3891; CHECK-NEXT:    retq
3892  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3893  %2 = icmp slt <64 x i8> %a1, %a0
3894  %3 = select <64 x i1> %2, <64 x i8> %a1, <64 x i8> %a0
3895  %4 = bitcast i64 %mask to <64 x i1>
3896  ; load needed to keep the operation from being scheduled about the asm block
3897  %5 = load <64 x i8>, <64 x i8>* %passthru
3898  %6 = select <64 x i1> %4, <64 x i8> %3, <64 x i8> %5
3899  ret <64 x i8> %6
3900}
3901
3902define <64 x i8> @stack_fold_pminsb_maskz(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
3903; CHECK-LABEL: stack_fold_pminsb_maskz:
3904; CHECK:       # %bb.0:
3905; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3906; CHECK-NEXT:    #APP
3907; CHECK-NEXT:    nop
3908; CHECK-NEXT:    #NO_APP
3909; CHECK-NEXT:    kmovq %rdi, %k1
3910; CHECK-NEXT:    vpminsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
3911; CHECK-NEXT:    retq
3912  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3913  %2 = icmp slt <64 x i8> %a0, %a1
3914  %3 = select <64 x i1> %2, <64 x i8> %a0, <64 x i8> %a1
3915  %4 = bitcast i64 %mask to <64 x i1>
3916  %5 = select <64 x i1> %4, <64 x i8> %3, <64 x i8> zeroinitializer
3917  ret <64 x i8> %5
3918}
3919
3920define <64 x i8> @stack_fold_pminsb_maskz_commuted(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
3921; CHECK-LABEL: stack_fold_pminsb_maskz_commuted:
3922; CHECK:       # %bb.0:
3923; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3924; CHECK-NEXT:    #APP
3925; CHECK-NEXT:    nop
3926; CHECK-NEXT:    #NO_APP
3927; CHECK-NEXT:    kmovq %rdi, %k1
3928; CHECK-NEXT:    vpminsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
3929; CHECK-NEXT:    retq
3930  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3931  %2 = icmp slt <64 x i8> %a1, %a0
3932  %3 = select <64 x i1> %2, <64 x i8> %a1, <64 x i8> %a0
3933  %4 = bitcast i64 %mask to <64 x i1>
3934  %5 = select <64 x i1> %4, <64 x i8> %3, <64 x i8> zeroinitializer
3935  ret <64 x i8> %5
3936}
3937
3938define <16 x i32> @stack_fold_pminsd(<16 x i32> %a0, <16 x i32> %a1) {
3939; CHECK-LABEL: stack_fold_pminsd:
3940; CHECK:       # %bb.0:
3941; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3942; CHECK-NEXT:    #APP
3943; CHECK-NEXT:    nop
3944; CHECK-NEXT:    #NO_APP
3945; CHECK-NEXT:    vpminsd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
3946; CHECK-NEXT:    retq
3947  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3948  %2 = icmp slt <16 x i32> %a0, %a1
3949  %3 = select <16 x i1> %2, <16 x i32> %a0, <16 x i32> %a1
3950  ret <16 x i32> %3
3951}
3952
3953define <16 x i32> @stack_fold_pminsd_commuted(<16 x i32> %a0, <16 x i32> %a1) {
3954; CHECK-LABEL: stack_fold_pminsd_commuted:
3955; CHECK:       # %bb.0:
3956; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3957; CHECK-NEXT:    #APP
3958; CHECK-NEXT:    nop
3959; CHECK-NEXT:    #NO_APP
3960; CHECK-NEXT:    vpminsd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
3961; CHECK-NEXT:    retq
3962  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3963  %2 = icmp slt <16 x i32> %a1, %a0
3964  %3 = select <16 x i1> %2, <16 x i32> %a1, <16 x i32> %a0
3965  ret <16 x i32> %3
3966}
3967
3968define <16 x i32> @stack_fold_pminsd_mask(<16 x i32> %a0, <16 x i32> %a1, i16 %mask, <16 x i32>* %passthru) {
3969; CHECK-LABEL: stack_fold_pminsd_mask:
3970; CHECK:       # %bb.0:
3971; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3972; CHECK-NEXT:    #APP
3973; CHECK-NEXT:    nop
3974; CHECK-NEXT:    #NO_APP
3975; CHECK-NEXT:    kmovd %edi, %k1
3976; CHECK-NEXT:    vmovdqa64 (%rsi), %zmm2
3977; CHECK-NEXT:    vpminsd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
3978; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
3979; CHECK-NEXT:    retq
3980  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
3981  %2 = icmp slt <16 x i32> %a0, %a1
3982  %3 = select <16 x i1> %2, <16 x i32> %a0, <16 x i32> %a1
3983  %4 = bitcast i16 %mask to <16 x i1>
3984  ; load needed to keep the operation from being scheduled about the asm block
3985  %5 = load <16 x i32>, <16 x i32>* %passthru
3986  %6 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> %5
3987  ret <16 x i32> %6
3988}
3989
3990define <16 x i32> @stack_fold_pminsd_mask_commuted(<16 x i32> %a0, <16 x i32> %a1, i16 %mask, <16 x i32>* %passthru) {
3991; CHECK-LABEL: stack_fold_pminsd_mask_commuted:
3992; CHECK:       # %bb.0:
3993; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3994; CHECK-NEXT:    #APP
3995; CHECK-NEXT:    nop
3996; CHECK-NEXT:    #NO_APP
3997; CHECK-NEXT:    kmovd %edi, %k1
3998; CHECK-NEXT:    vmovdqa64 (%rsi), %zmm2
3999; CHECK-NEXT:    vpminsd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
4000; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
4001; CHECK-NEXT:    retq
4002  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4003  %2 = icmp slt <16 x i32> %a1, %a0
4004  %3 = select <16 x i1> %2, <16 x i32> %a1, <16 x i32> %a0
4005  %4 = bitcast i16 %mask to <16 x i1>
4006  ; load needed to keep the operation from being scheduled about the asm block
4007  %5 = load <16 x i32>, <16 x i32>* %passthru
4008  %6 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> %5
4009  ret <16 x i32> %6
4010}
4011
4012define <16 x i32> @stack_fold_pminsd_maskz(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
4013; CHECK-LABEL: stack_fold_pminsd_maskz:
4014; CHECK:       # %bb.0:
4015; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4016; CHECK-NEXT:    #APP
4017; CHECK-NEXT:    nop
4018; CHECK-NEXT:    #NO_APP
4019; CHECK-NEXT:    kmovd %edi, %k1
4020; CHECK-NEXT:    vpminsd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
4021; CHECK-NEXT:    retq
4022  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4023  %2 = icmp slt <16 x i32> %a0, %a1
4024  %3 = select <16 x i1> %2, <16 x i32> %a0, <16 x i32> %a1
4025  %4 = bitcast i16 %mask to <16 x i1>
4026  %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> zeroinitializer
4027  ret <16 x i32> %5
4028}
4029
4030define <16 x i32> @stack_fold_pminsd_maskz_commuted(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
4031; CHECK-LABEL: stack_fold_pminsd_maskz_commuted:
4032; CHECK:       # %bb.0:
4033; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4034; CHECK-NEXT:    #APP
4035; CHECK-NEXT:    nop
4036; CHECK-NEXT:    #NO_APP
4037; CHECK-NEXT:    kmovd %edi, %k1
4038; CHECK-NEXT:    vpminsd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
4039; CHECK-NEXT:    retq
4040  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4041  %2 = icmp slt <16 x i32> %a1, %a0
4042  %3 = select <16 x i1> %2, <16 x i32> %a1, <16 x i32> %a0
4043  %4 = bitcast i16 %mask to <16 x i1>
4044  %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> zeroinitializer
4045  ret <16 x i32> %5
4046}
4047
4048define <8 x i64> @stack_fold_pminsq(<8 x i64> %a0, <8 x i64> %a1) {
4049; CHECK-LABEL: stack_fold_pminsq:
4050; CHECK:       # %bb.0:
4051; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4052; CHECK-NEXT:    #APP
4053; CHECK-NEXT:    nop
4054; CHECK-NEXT:    #NO_APP
4055; CHECK-NEXT:    vpminsq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
4056; CHECK-NEXT:    retq
4057  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4058  %2 = icmp slt <8 x i64> %a0, %a1
4059  %3 = select <8 x i1> %2, <8 x i64> %a0, <8 x i64> %a1
4060  ret <8 x i64> %3
4061}
4062
4063define <8 x i64> @stack_fold_pminsq_commuted(<8 x i64> %a0, <8 x i64> %a1) {
4064; CHECK-LABEL: stack_fold_pminsq_commuted:
4065; CHECK:       # %bb.0:
4066; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4067; CHECK-NEXT:    #APP
4068; CHECK-NEXT:    nop
4069; CHECK-NEXT:    #NO_APP
4070; CHECK-NEXT:    vpminsq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
4071; CHECK-NEXT:    retq
4072  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4073  %2 = icmp slt <8 x i64> %a1, %a0
4074  %3 = select <8 x i1> %2, <8 x i64> %a1, <8 x i64> %a0
4075  ret <8 x i64> %3
4076}
4077
4078define <8 x i64> @stack_fold_pminsq_mask(<8 x i64> %a0, <8 x i64> %a1, i8 %mask, <8 x i64>* %passthru) {
4079; CHECK-LABEL: stack_fold_pminsq_mask:
4080; CHECK:       # %bb.0:
4081; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4082; CHECK-NEXT:    #APP
4083; CHECK-NEXT:    nop
4084; CHECK-NEXT:    #NO_APP
4085; CHECK-NEXT:    kmovd %edi, %k1
4086; CHECK-NEXT:    vmovdqa64 (%rsi), %zmm2
4087; CHECK-NEXT:    vpminsq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
4088; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
4089; CHECK-NEXT:    retq
4090  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4091  %2 = icmp slt <8 x i64> %a0, %a1
4092  %3 = select <8 x i1> %2, <8 x i64> %a0, <8 x i64> %a1
4093  %4 = bitcast i8 %mask to <8 x i1>
4094  ; load needed to keep the operation from being scheduled about the asm block
4095  %5 = load <8 x i64>, <8 x i64>* %passthru
4096  %6 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %5
4097  ret <8 x i64> %6
4098}
4099
4100define <8 x i64> @stack_fold_pminsq_mask_commuted(<8 x i64> %a0, <8 x i64> %a1, i8 %mask, <8 x i64>* %passthru) {
4101; CHECK-LABEL: stack_fold_pminsq_mask_commuted:
4102; CHECK:       # %bb.0:
4103; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4104; CHECK-NEXT:    #APP
4105; CHECK-NEXT:    nop
4106; CHECK-NEXT:    #NO_APP
4107; CHECK-NEXT:    kmovd %edi, %k1
4108; CHECK-NEXT:    vmovdqa64 (%rsi), %zmm2
4109; CHECK-NEXT:    vpminsq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
4110; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
4111; CHECK-NEXT:    retq
4112  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4113  %2 = icmp slt <8 x i64> %a1, %a0
4114  %3 = select <8 x i1> %2, <8 x i64> %a1, <8 x i64> %a0
4115  %4 = bitcast i8 %mask to <8 x i1>
4116  ; load needed to keep the operation from being scheduled about the asm block
4117  %5 = load <8 x i64>, <8 x i64>* %passthru
4118  %6 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %5
4119  ret <8 x i64> %6
4120}
4121
4122define <8 x i64> @stack_fold_pminsq_maskz(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
4123; CHECK-LABEL: stack_fold_pminsq_maskz:
4124; CHECK:       # %bb.0:
4125; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4126; CHECK-NEXT:    #APP
4127; CHECK-NEXT:    nop
4128; CHECK-NEXT:    #NO_APP
4129; CHECK-NEXT:    kmovd %edi, %k1
4130; CHECK-NEXT:    vpminsq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
4131; CHECK-NEXT:    retq
4132  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4133  %2 = icmp slt <8 x i64> %a0, %a1
4134  %3 = select <8 x i1> %2, <8 x i64> %a0, <8 x i64> %a1
4135  %4 = bitcast i8 %mask to <8 x i1>
4136  %5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> zeroinitializer
4137  ret <8 x i64> %5
4138}
4139
4140define <8 x i64> @stack_fold_pminsq_maskz_commuted(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
4141; CHECK-LABEL: stack_fold_pminsq_maskz_commuted:
4142; CHECK:       # %bb.0:
4143; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4144; CHECK-NEXT:    #APP
4145; CHECK-NEXT:    nop
4146; CHECK-NEXT:    #NO_APP
4147; CHECK-NEXT:    kmovd %edi, %k1
4148; CHECK-NEXT:    vpminsq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
4149; CHECK-NEXT:    retq
4150  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4151  %2 = icmp slt <8 x i64> %a1, %a0
4152  %3 = select <8 x i1> %2, <8 x i64> %a1, <8 x i64> %a0
4153  %4 = bitcast i8 %mask to <8 x i1>
4154  %5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> zeroinitializer
4155  ret <8 x i64> %5
4156}
4157
4158define <32 x i16> @stack_fold_pminsw(<32 x i16> %a0, <32 x i16> %a1) {
4159; CHECK-LABEL: stack_fold_pminsw:
4160; CHECK:       # %bb.0:
4161; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4162; CHECK-NEXT:    #APP
4163; CHECK-NEXT:    nop
4164; CHECK-NEXT:    #NO_APP
4165; CHECK-NEXT:    vpminsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
4166; CHECK-NEXT:    retq
4167  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4168  %2 = icmp slt <32 x i16> %a0, %a1
4169  %3 = select <32 x i1> %2, <32 x i16> %a0, <32 x i16> %a1
4170  ret <32 x i16> %3
4171}
4172
4173define <32 x i16> @stack_fold_pminsw_commuted(<32 x i16> %a0, <32 x i16> %a1) {
4174; CHECK-LABEL: stack_fold_pminsw_commuted:
4175; CHECK:       # %bb.0:
4176; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4177; CHECK-NEXT:    #APP
4178; CHECK-NEXT:    nop
4179; CHECK-NEXT:    #NO_APP
4180; CHECK-NEXT:    vpminsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
4181; CHECK-NEXT:    retq
4182  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4183  %2 = icmp slt <32 x i16> %a1, %a0
4184  %3 = select <32 x i1> %2, <32 x i16> %a1, <32 x i16> %a0
4185  ret <32 x i16> %3
4186}
4187
4188define <32 x i16> @stack_fold_pminsw_mask(<32 x i16> %a0, <32 x i16> %a1, i32 %mask, <32 x i16>* %passthru) {
4189; CHECK-LABEL: stack_fold_pminsw_mask:
4190; CHECK:       # %bb.0:
4191; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4192; CHECK-NEXT:    #APP
4193; CHECK-NEXT:    nop
4194; CHECK-NEXT:    #NO_APP
4195; CHECK-NEXT:    kmovd %edi, %k1
4196; CHECK-NEXT:    vmovdqa64 (%rsi), %zmm2
4197; CHECK-NEXT:    vpminsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
4198; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
4199; CHECK-NEXT:    retq
4200  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4201  %2 = icmp slt <32 x i16> %a0, %a1
4202  %3 = select <32 x i1> %2, <32 x i16> %a0, <32 x i16> %a1
4203  %4 = bitcast i32 %mask to <32 x i1>
4204  ; load needed to keep the operation from being scheduled about the asm block
4205  %5 = load <32 x i16>, <32 x i16>* %passthru
4206  %6 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> %5
4207  ret <32 x i16> %6
4208}
4209
4210define <32 x i16> @stack_fold_pminsw_mask_commuted(<32 x i16> %a0, <32 x i16> %a1, i32 %mask, <32 x i16>* %passthru) {
4211; CHECK-LABEL: stack_fold_pminsw_mask_commuted:
4212; CHECK:       # %bb.0:
4213; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4214; CHECK-NEXT:    #APP
4215; CHECK-NEXT:    nop
4216; CHECK-NEXT:    #NO_APP
4217; CHECK-NEXT:    kmovd %edi, %k1
4218; CHECK-NEXT:    vmovdqa64 (%rsi), %zmm2
4219; CHECK-NEXT:    vpminsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
4220; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
4221; CHECK-NEXT:    retq
4222  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4223  %2 = icmp slt <32 x i16> %a1, %a0
4224  %3 = select <32 x i1> %2, <32 x i16> %a1, <32 x i16> %a0
4225  %4 = bitcast i32 %mask to <32 x i1>
4226  ; load needed to keep the operation from being scheduled about the asm block
4227  %5 = load <32 x i16>, <32 x i16>* %passthru
4228  %6 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> %5
4229  ret <32 x i16> %6
4230}
4231
4232define <32 x i16> @stack_fold_pminsw_maskz(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) {
4233; CHECK-LABEL: stack_fold_pminsw_maskz:
4234; CHECK:       # %bb.0:
4235; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4236; CHECK-NEXT:    #APP
4237; CHECK-NEXT:    nop
4238; CHECK-NEXT:    #NO_APP
4239; CHECK-NEXT:    kmovd %edi, %k1
4240; CHECK-NEXT:    vpminsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
4241; CHECK-NEXT:    retq
4242  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4243  %2 = icmp slt <32 x i16> %a0, %a1
4244  %3 = select <32 x i1> %2, <32 x i16> %a0, <32 x i16> %a1
4245  %4 = bitcast i32 %mask to <32 x i1>
4246  %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> zeroinitializer
4247  ret <32 x i16> %5
4248}
4249
4250define <32 x i16> @stack_fold_pminsw_maskz_commuted(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) {
4251; CHECK-LABEL: stack_fold_pminsw_maskz_commuted:
4252; CHECK:       # %bb.0:
4253; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4254; CHECK-NEXT:    #APP
4255; CHECK-NEXT:    nop
4256; CHECK-NEXT:    #NO_APP
4257; CHECK-NEXT:    kmovd %edi, %k1
4258; CHECK-NEXT:    vpminsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
4259; CHECK-NEXT:    retq
4260  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4261  %2 = icmp slt <32 x i16> %a1, %a0
4262  %3 = select <32 x i1> %2, <32 x i16> %a1, <32 x i16> %a0
4263  %4 = bitcast i32 %mask to <32 x i1>
4264  %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> zeroinitializer
4265  ret <32 x i16> %5
4266}
4267
4268define <64 x i8> @stack_fold_pminub(<64 x i8> %a0, <64 x i8> %a1) {
4269; CHECK-LABEL: stack_fold_pminub:
4270; CHECK:       # %bb.0:
4271; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4272; CHECK-NEXT:    #APP
4273; CHECK-NEXT:    nop
4274; CHECK-NEXT:    #NO_APP
4275; CHECK-NEXT:    vpminub {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
4276; CHECK-NEXT:    retq
4277  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4278  %2 = icmp ult <64 x i8> %a0, %a1
4279  %3 = select <64 x i1> %2, <64 x i8> %a0, <64 x i8> %a1
4280  ret <64 x i8> %3
4281}
4282
4283define <64 x i8> @stack_fold_pminub_commuted(<64 x i8> %a0, <64 x i8> %a1) {
4284; CHECK-LABEL: stack_fold_pminub_commuted:
4285; CHECK:       # %bb.0:
4286; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4287; CHECK-NEXT:    #APP
4288; CHECK-NEXT:    nop
4289; CHECK-NEXT:    #NO_APP
4290; CHECK-NEXT:    vpminub {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
4291; CHECK-NEXT:    retq
4292  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4293  %2 = icmp ult <64 x i8> %a1, %a0
4294  %3 = select <64 x i1> %2, <64 x i8> %a1, <64 x i8> %a0
4295  ret <64 x i8> %3
4296}
4297
4298define <64 x i8> @stack_fold_pminub_mask(<64 x i8> %a0, <64 x i8> %a1, i64 %mask, <64 x i8>* %passthru) {
4299; CHECK-LABEL: stack_fold_pminub_mask:
4300; CHECK:       # %bb.0:
4301; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4302; CHECK-NEXT:    #APP
4303; CHECK-NEXT:    nop
4304; CHECK-NEXT:    #NO_APP
4305; CHECK-NEXT:    kmovq %rdi, %k1
4306; CHECK-NEXT:    vmovdqa64 (%rsi), %zmm2
4307; CHECK-NEXT:    vpminub {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
4308; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
4309; CHECK-NEXT:    retq
4310  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4311  %2 = icmp ult <64 x i8> %a0, %a1
4312  %3 = select <64 x i1> %2, <64 x i8> %a0, <64 x i8> %a1
4313  %4 = bitcast i64 %mask to <64 x i1>
4314  ; load needed to keep the operation from being scheduled about the asm block
4315  %5 = load <64 x i8>, <64 x i8>* %passthru
4316  %6 = select <64 x i1> %4, <64 x i8> %3, <64 x i8> %5
4317  ret <64 x i8> %6
4318}
4319
4320define <64 x i8> @stack_fold_pminub_mask_commuted(<64 x i8> %a0, <64 x i8> %a1, i64 %mask, <64 x i8>* %passthru) {
4321; CHECK-LABEL: stack_fold_pminub_mask_commuted:
4322; CHECK:       # %bb.0:
4323; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4324; CHECK-NEXT:    #APP
4325; CHECK-NEXT:    nop
4326; CHECK-NEXT:    #NO_APP
4327; CHECK-NEXT:    kmovq %rdi, %k1
4328; CHECK-NEXT:    vmovdqa64 (%rsi), %zmm2
4329; CHECK-NEXT:    vpminub {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
4330; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
4331; CHECK-NEXT:    retq
4332  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4333  %2 = icmp ult <64 x i8> %a1, %a0
4334  %3 = select <64 x i1> %2, <64 x i8> %a1, <64 x i8> %a0
4335  %4 = bitcast i64 %mask to <64 x i1>
4336  ; load needed to keep the operation from being scheduled about the asm block
4337  %5 = load <64 x i8>, <64 x i8>* %passthru
4338  %6 = select <64 x i1> %4, <64 x i8> %3, <64 x i8> %5
4339  ret <64 x i8> %6
4340}
4341
4342define <64 x i8> @stack_fold_pminub_maskz(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
4343; CHECK-LABEL: stack_fold_pminub_maskz:
4344; CHECK:       # %bb.0:
4345; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4346; CHECK-NEXT:    #APP
4347; CHECK-NEXT:    nop
4348; CHECK-NEXT:    #NO_APP
4349; CHECK-NEXT:    kmovq %rdi, %k1
4350; CHECK-NEXT:    vpminub {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
4351; CHECK-NEXT:    retq
4352  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4353  %2 = icmp ult <64 x i8> %a0, %a1
4354  %3 = select <64 x i1> %2, <64 x i8> %a0, <64 x i8> %a1
4355  %4 = bitcast i64 %mask to <64 x i1>
4356  %5 = select <64 x i1> %4, <64 x i8> %3, <64 x i8> zeroinitializer
4357  ret <64 x i8> %5
4358}
4359
4360define <64 x i8> @stack_fold_pminub_maskz_commuted(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
4361; CHECK-LABEL: stack_fold_pminub_maskz_commuted:
4362; CHECK:       # %bb.0:
4363; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4364; CHECK-NEXT:    #APP
4365; CHECK-NEXT:    nop
4366; CHECK-NEXT:    #NO_APP
4367; CHECK-NEXT:    kmovq %rdi, %k1
4368; CHECK-NEXT:    vpminub {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
4369; CHECK-NEXT:    retq
4370  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4371  %2 = icmp ult <64 x i8> %a1, %a0
4372  %3 = select <64 x i1> %2, <64 x i8> %a1, <64 x i8> %a0
4373  %4 = bitcast i64 %mask to <64 x i1>
4374  %5 = select <64 x i1> %4, <64 x i8> %3, <64 x i8> zeroinitializer
4375  ret <64 x i8> %5
4376}
4377
4378define <16 x i32> @stack_fold_pminud(<16 x i32> %a0, <16 x i32> %a1) {
4379; CHECK-LABEL: stack_fold_pminud:
4380; CHECK:       # %bb.0:
4381; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4382; CHECK-NEXT:    #APP
4383; CHECK-NEXT:    nop
4384; CHECK-NEXT:    #NO_APP
4385; CHECK-NEXT:    vpminud {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
4386; CHECK-NEXT:    retq
4387  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4388  %2 = icmp ult <16 x i32> %a0, %a1
4389  %3 = select <16 x i1> %2, <16 x i32> %a0, <16 x i32> %a1
4390  ret <16 x i32> %3
4391}
4392
4393define <16 x i32> @stack_fold_pminud_commuted(<16 x i32> %a0, <16 x i32> %a1) {
4394; CHECK-LABEL: stack_fold_pminud_commuted:
4395; CHECK:       # %bb.0:
4396; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4397; CHECK-NEXT:    #APP
4398; CHECK-NEXT:    nop
4399; CHECK-NEXT:    #NO_APP
4400; CHECK-NEXT:    vpminud {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
4401; CHECK-NEXT:    retq
4402  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4403  %2 = icmp ult <16 x i32> %a1, %a0
4404  %3 = select <16 x i1> %2, <16 x i32> %a1, <16 x i32> %a0
4405  ret <16 x i32> %3
4406}
4407
4408define <16 x i32> @stack_fold_pminud_mask(<16 x i32> %a0, <16 x i32> %a1, i16 %mask, <16 x i32>* %passthru) {
4409; CHECK-LABEL: stack_fold_pminud_mask:
4410; CHECK:       # %bb.0:
4411; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4412; CHECK-NEXT:    #APP
4413; CHECK-NEXT:    nop
4414; CHECK-NEXT:    #NO_APP
4415; CHECK-NEXT:    kmovd %edi, %k1
4416; CHECK-NEXT:    vmovdqa64 (%rsi), %zmm2
4417; CHECK-NEXT:    vpminud {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
4418; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
4419; CHECK-NEXT:    retq
4420  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4421  %2 = icmp ult <16 x i32> %a0, %a1
4422  %3 = select <16 x i1> %2, <16 x i32> %a0, <16 x i32> %a1
4423  %4 = bitcast i16 %mask to <16 x i1>
4424  ; load needed to keep the operation from being scheduled about the asm block
4425  %5 = load <16 x i32>, <16 x i32>* %passthru
4426  %6 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> %5
4427  ret <16 x i32> %6
4428}
4429
4430define <16 x i32> @stack_fold_pminud_mask_commuted(<16 x i32> %a0, <16 x i32> %a1, i16 %mask, <16 x i32>* %passthru) {
4431; CHECK-LABEL: stack_fold_pminud_mask_commuted:
4432; CHECK:       # %bb.0:
4433; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4434; CHECK-NEXT:    #APP
4435; CHECK-NEXT:    nop
4436; CHECK-NEXT:    #NO_APP
4437; CHECK-NEXT:    kmovd %edi, %k1
4438; CHECK-NEXT:    vmovdqa64 (%rsi), %zmm2
4439; CHECK-NEXT:    vpminud {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
4440; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
4441; CHECK-NEXT:    retq
4442  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4443  %2 = icmp ult <16 x i32> %a1, %a0
4444  %3 = select <16 x i1> %2, <16 x i32> %a1, <16 x i32> %a0
4445  %4 = bitcast i16 %mask to <16 x i1>
4446  ; load needed to keep the operation from being scheduled about the asm block
4447  %5 = load <16 x i32>, <16 x i32>* %passthru
4448  %6 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> %5
4449  ret <16 x i32> %6
4450}
4451
4452define <16 x i32> @stack_fold_pminud_maskz(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
4453; CHECK-LABEL: stack_fold_pminud_maskz:
4454; CHECK:       # %bb.0:
4455; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4456; CHECK-NEXT:    #APP
4457; CHECK-NEXT:    nop
4458; CHECK-NEXT:    #NO_APP
4459; CHECK-NEXT:    kmovd %edi, %k1
4460; CHECK-NEXT:    vpminud {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
4461; CHECK-NEXT:    retq
4462  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4463  %2 = icmp ult <16 x i32> %a0, %a1
4464  %3 = select <16 x i1> %2, <16 x i32> %a0, <16 x i32> %a1
4465  %4 = bitcast i16 %mask to <16 x i1>
4466  %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> zeroinitializer
4467  ret <16 x i32> %5
4468}
4469
4470define <16 x i32> @stack_fold_pminud_maskz_commuted(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
4471; CHECK-LABEL: stack_fold_pminud_maskz_commuted:
4472; CHECK:       # %bb.0:
4473; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4474; CHECK-NEXT:    #APP
4475; CHECK-NEXT:    nop
4476; CHECK-NEXT:    #NO_APP
4477; CHECK-NEXT:    kmovd %edi, %k1
4478; CHECK-NEXT:    vpminud {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
4479; CHECK-NEXT:    retq
4480  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4481  %2 = icmp ult <16 x i32> %a1, %a0
4482  %3 = select <16 x i1> %2, <16 x i32> %a1, <16 x i32> %a0
4483  %4 = bitcast i16 %mask to <16 x i1>
4484  %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> zeroinitializer
4485  ret <16 x i32> %5
4486}
4487
4488define <8 x i64> @stack_fold_pminuq(<8 x i64> %a0, <8 x i64> %a1) {
4489; CHECK-LABEL: stack_fold_pminuq:
4490; CHECK:       # %bb.0:
4491; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4492; CHECK-NEXT:    #APP
4493; CHECK-NEXT:    nop
4494; CHECK-NEXT:    #NO_APP
4495; CHECK-NEXT:    vpminuq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
4496; CHECK-NEXT:    retq
4497  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4498  %2 = icmp ult <8 x i64> %a0, %a1
4499  %3 = select <8 x i1> %2, <8 x i64> %a0, <8 x i64> %a1
4500  ret <8 x i64> %3
4501}
4502
4503define <8 x i64> @stack_fold_pminuq_commuted(<8 x i64> %a0, <8 x i64> %a1) {
4504; CHECK-LABEL: stack_fold_pminuq_commuted:
4505; CHECK:       # %bb.0:
4506; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4507; CHECK-NEXT:    #APP
4508; CHECK-NEXT:    nop
4509; CHECK-NEXT:    #NO_APP
4510; CHECK-NEXT:    vpminuq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
4511; CHECK-NEXT:    retq
4512  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4513  %2 = icmp ult <8 x i64> %a1, %a0
4514  %3 = select <8 x i1> %2, <8 x i64> %a1, <8 x i64> %a0
4515  ret <8 x i64> %3
4516}
4517
4518define <8 x i64> @stack_fold_pminuq_mask(<8 x i64> %a0, <8 x i64> %a1, i8 %mask, <8 x i64>* %passthru) {
4519; CHECK-LABEL: stack_fold_pminuq_mask:
4520; CHECK:       # %bb.0:
4521; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4522; CHECK-NEXT:    #APP
4523; CHECK-NEXT:    nop
4524; CHECK-NEXT:    #NO_APP
4525; CHECK-NEXT:    kmovd %edi, %k1
4526; CHECK-NEXT:    vmovdqa64 (%rsi), %zmm2
4527; CHECK-NEXT:    vpminuq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
4528; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
4529; CHECK-NEXT:    retq
4530  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4531  %2 = icmp ult <8 x i64> %a0, %a1
4532  %3 = select <8 x i1> %2, <8 x i64> %a0, <8 x i64> %a1
4533  %4 = bitcast i8 %mask to <8 x i1>
4534  ; load needed to keep the operation from being scheduled about the asm block
4535  %5 = load <8 x i64>, <8 x i64>* %passthru
4536  %6 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %5
4537  ret <8 x i64> %6
4538}
4539
4540define <8 x i64> @stack_fold_pminuq_mask_commuted(<8 x i64> %a0, <8 x i64> %a1, i8 %mask, <8 x i64>* %passthru) {
4541; CHECK-LABEL: stack_fold_pminuq_mask_commuted:
4542; CHECK:       # %bb.0:
4543; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4544; CHECK-NEXT:    #APP
4545; CHECK-NEXT:    nop
4546; CHECK-NEXT:    #NO_APP
4547; CHECK-NEXT:    kmovd %edi, %k1
4548; CHECK-NEXT:    vmovdqa64 (%rsi), %zmm2
4549; CHECK-NEXT:    vpminuq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
4550; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
4551; CHECK-NEXT:    retq
4552  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4553  %2 = icmp ult <8 x i64> %a1, %a0
4554  %3 = select <8 x i1> %2, <8 x i64> %a1, <8 x i64> %a0
4555  %4 = bitcast i8 %mask to <8 x i1>
4556  ; load needed to keep the operation from being scheduled about the asm block
4557  %5 = load <8 x i64>, <8 x i64>* %passthru
4558  %6 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %5
4559  ret <8 x i64> %6
4560}
4561
4562define <8 x i64> @stack_fold_pminuq_maskz(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
4563; CHECK-LABEL: stack_fold_pminuq_maskz:
4564; CHECK:       # %bb.0:
4565; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4566; CHECK-NEXT:    #APP
4567; CHECK-NEXT:    nop
4568; CHECK-NEXT:    #NO_APP
4569; CHECK-NEXT:    kmovd %edi, %k1
4570; CHECK-NEXT:    vpminuq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
4571; CHECK-NEXT:    retq
4572  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4573  %2 = icmp ult <8 x i64> %a0, %a1
4574  %3 = select <8 x i1> %2, <8 x i64> %a0, <8 x i64> %a1
4575  %4 = bitcast i8 %mask to <8 x i1>
4576  %5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> zeroinitializer
4577  ret <8 x i64> %5
4578}
4579
4580define <8 x i64> @stack_fold_pminuq_maskz_commuted(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
4581; CHECK-LABEL: stack_fold_pminuq_maskz_commuted:
4582; CHECK:       # %bb.0:
4583; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4584; CHECK-NEXT:    #APP
4585; CHECK-NEXT:    nop
4586; CHECK-NEXT:    #NO_APP
4587; CHECK-NEXT:    kmovd %edi, %k1
4588; CHECK-NEXT:    vpminuq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
4589; CHECK-NEXT:    retq
4590  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4591  %2 = icmp ult <8 x i64> %a1, %a0
4592  %3 = select <8 x i1> %2, <8 x i64> %a1, <8 x i64> %a0
4593  %4 = bitcast i8 %mask to <8 x i1>
4594  %5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> zeroinitializer
4595  ret <8 x i64> %5
4596}
4597
4598define <32 x i16> @stack_fold_pminuw(<32 x i16> %a0, <32 x i16> %a1) {
4599; CHECK-LABEL: stack_fold_pminuw:
4600; CHECK:       # %bb.0:
4601; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4602; CHECK-NEXT:    #APP
4603; CHECK-NEXT:    nop
4604; CHECK-NEXT:    #NO_APP
4605; CHECK-NEXT:    vpminuw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
4606; CHECK-NEXT:    retq
4607  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4608  %2 = icmp ult <32 x i16> %a0, %a1
4609  %3 = select <32 x i1> %2, <32 x i16> %a0, <32 x i16> %a1
4610  ret <32 x i16> %3
4611}
4612
4613define <32 x i16> @stack_fold_pminuw_commuted(<32 x i16> %a0, <32 x i16> %a1) {
4614; CHECK-LABEL: stack_fold_pminuw_commuted:
4615; CHECK:       # %bb.0:
4616; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4617; CHECK-NEXT:    #APP
4618; CHECK-NEXT:    nop
4619; CHECK-NEXT:    #NO_APP
4620; CHECK-NEXT:    vpminuw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
4621; CHECK-NEXT:    retq
4622  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4623  %2 = icmp ult <32 x i16> %a1, %a0
4624  %3 = select <32 x i1> %2, <32 x i16> %a1, <32 x i16> %a0
4625  ret <32 x i16> %3
4626}
4627
4628define <32 x i16> @stack_fold_pminuw_mask(<32 x i16> %a0, <32 x i16> %a1, i32 %mask, <32 x i16>* %passthru) {
4629; CHECK-LABEL: stack_fold_pminuw_mask:
4630; CHECK:       # %bb.0:
4631; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4632; CHECK-NEXT:    #APP
4633; CHECK-NEXT:    nop
4634; CHECK-NEXT:    #NO_APP
4635; CHECK-NEXT:    kmovd %edi, %k1
4636; CHECK-NEXT:    vmovdqa64 (%rsi), %zmm2
4637; CHECK-NEXT:    vpminuw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
4638; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
4639; CHECK-NEXT:    retq
4640  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4641  %2 = icmp ult <32 x i16> %a0, %a1
4642  %3 = select <32 x i1> %2, <32 x i16> %a0, <32 x i16> %a1
4643  %4 = bitcast i32 %mask to <32 x i1>
4644  ; load needed to keep the operation from being scheduled about the asm block
4645  %5 = load <32 x i16>, <32 x i16>* %passthru
4646  %6 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> %5
4647  ret <32 x i16> %6
4648}
4649
4650define <32 x i16> @stack_fold_pminuw_mask_commuted(<32 x i16> %a0, <32 x i16> %a1, i32 %mask, <32 x i16>* %passthru) {
4651; CHECK-LABEL: stack_fold_pminuw_mask_commuted:
4652; CHECK:       # %bb.0:
4653; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4654; CHECK-NEXT:    #APP
4655; CHECK-NEXT:    nop
4656; CHECK-NEXT:    #NO_APP
4657; CHECK-NEXT:    kmovd %edi, %k1
4658; CHECK-NEXT:    vmovdqa64 (%rsi), %zmm2
4659; CHECK-NEXT:    vpminuw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
4660; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
4661; CHECK-NEXT:    retq
4662  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4663  %2 = icmp ult <32 x i16> %a1, %a0
4664  %3 = select <32 x i1> %2, <32 x i16> %a1, <32 x i16> %a0
4665  %4 = bitcast i32 %mask to <32 x i1>
4666  ; load needed to keep the operation from being scheduled about the asm block
4667  %5 = load <32 x i16>, <32 x i16>* %passthru
4668  %6 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> %5
4669  ret <32 x i16> %6
4670}
4671
4672define <32 x i16> @stack_fold_pminuw_maskz(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) {
4673; CHECK-LABEL: stack_fold_pminuw_maskz:
4674; CHECK:       # %bb.0:
4675; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4676; CHECK-NEXT:    #APP
4677; CHECK-NEXT:    nop
4678; CHECK-NEXT:    #NO_APP
4679; CHECK-NEXT:    kmovd %edi, %k1
4680; CHECK-NEXT:    vpminuw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
4681; CHECK-NEXT:    retq
4682  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4683  %2 = icmp ult <32 x i16> %a0, %a1
4684  %3 = select <32 x i1> %2, <32 x i16> %a0, <32 x i16> %a1
4685  %4 = bitcast i32 %mask to <32 x i1>
4686  %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> zeroinitializer
4687  ret <32 x i16> %5
4688}
4689
4690define <32 x i16> @stack_fold_pminuw_maskz_commuted(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) {
4691; CHECK-LABEL: stack_fold_pminuw_maskz_commuted:
4692; CHECK:       # %bb.0:
4693; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4694; CHECK-NEXT:    #APP
4695; CHECK-NEXT:    nop
4696; CHECK-NEXT:    #NO_APP
4697; CHECK-NEXT:    kmovd %edi, %k1
4698; CHECK-NEXT:    vpminuw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
4699; CHECK-NEXT:    retq
4700  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4701  %2 = icmp ult <32 x i16> %a1, %a0
4702  %3 = select <32 x i1> %2, <32 x i16> %a1, <32 x i16> %a0
4703  %4 = bitcast i32 %mask to <32 x i1>
4704  %5 = select <32 x i1> %4, <32 x i16> %3, <32 x i16> zeroinitializer
4705  ret <32 x i16> %5
4706}
4707
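; Note on the vpmov* truncation tests that follow: here the truncating move
; itself is expected to fold into the spill, so the narrowed result is stored
; straight to the stack slot (e.g. "vpmovdb %zmm0, mem # 16-byte Folded
; Spill") and only a plain vector load is needed to reload it after the asm
; block.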
4708define <16 x i8> @stack_fold_vpmovdb(<16 x i32> %a0) {
4709; CHECK-LABEL: stack_fold_vpmovdb:
4710; CHECK:       # %bb.0:
4711; CHECK-NEXT:    vpmovdb %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Folded Spill
4712; CHECK-NEXT:    #APP
4713; CHECK-NEXT:    nop
4714; CHECK-NEXT:    #NO_APP
4715; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4716; CHECK-NEXT:    vzeroupper
4717; CHECK-NEXT:    retq
4718  %1 = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.512(<16 x i32> %a0, <16 x i8> undef, i16 -1)
4719  %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4720  ret <16 x i8> %1
4721}
4722declare <16 x i8> @llvm.x86.avx512.mask.pmov.db.512(<16 x i32>, <16 x i8>, i16)
4723
4724define <16 x i16> @stack_fold_vpmovdw(<16 x i32> %a0) {
4725; CHECK-LABEL: stack_fold_vpmovdw:
4726; CHECK:       # %bb.0:
4727; CHECK-NEXT:    vpmovdw %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Folded Spill
4728; CHECK-NEXT:    #APP
4729; CHECK-NEXT:    nop
4730; CHECK-NEXT:    #NO_APP
4731; CHECK-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4732; CHECK-NEXT:    retq
4733  %1 = call <16 x i16> @llvm.x86.avx512.mask.pmov.dw.512(<16 x i32> %a0, <16 x i16> undef, i16 -1)
4734  %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4735  ret <16 x i16> %1
4736}
4737declare <16 x i16> @llvm.x86.avx512.mask.pmov.dw.512(<16 x i32>, <16 x i16>, i16)
4738
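; Note on stack_fold_movq_load: shuffling the low quadword with a zero vector
; is the pattern a zero-extending VMOVQ load produces, so the reload is
; expected to fold into "vmovq mem, %xmm0" (see the CHECK lines below); the
; trailing add keeps the value in the integer execution domain.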
4739define <2 x i64> @stack_fold_movq_load(<2 x i64> %a0) {
4740; CHECK-LABEL: stack_fold_movq_load:
4741; CHECK:       # %bb.0:
4742; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4743; CHECK-NEXT:    #APP
4744; CHECK-NEXT:    nop
4745; CHECK-NEXT:    #NO_APP
4746; CHECK-NEXT:    vmovq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
4747; CHECK-NEXT:    # xmm0 = mem[0],zero
4748; CHECK-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
4749; CHECK-NEXT:    vpsubq %xmm1, %xmm0, %xmm0
4750; CHECK-NEXT:    retq
4751  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4752  %2 = shufflevector <2 x i64> %a0, <2 x i64> zeroinitializer, <2 x i32> <i32 0, i32 2>
4753  ; add forces execution domain
4754  %3 = add <2 x i64> %2, <i64 1, i64 1>
4755  ret <2 x i64> %3
4756}
4757
4758define <8 x i32> @stack_fold_vpmovqd(<8 x i64> %a0) {
4759; CHECK-LABEL: stack_fold_vpmovqd:
4760; CHECK:       # %bb.0:
4761; CHECK-NEXT:    vpmovqd %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Folded Spill
4762; CHECK-NEXT:    #APP
4763; CHECK-NEXT:    nop
4764; CHECK-NEXT:    #NO_APP
4765; CHECK-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4766; CHECK-NEXT:    retq
4767  %1 = trunc <8 x i64> %a0 to <8 x i32>
4768  %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4769  ret <8 x i32> %1
4770}
4771declare <8 x i32> @llvm.x86.avx512.mask.pmov.qd.512(<8 x i64>, <8 x i32>, i8)
4772
4773define <8 x i16> @stack_fold_vpmovqw(<8 x i64> %a0) {
4774; CHECK-LABEL: stack_fold_vpmovqw:
4775; CHECK:       # %bb.0:
4776; CHECK-NEXT:    vpmovqw %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Folded Spill
4777; CHECK-NEXT:    #APP
4778; CHECK-NEXT:    nop
4779; CHECK-NEXT:    #NO_APP
4780; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4781; CHECK-NEXT:    vzeroupper
4782; CHECK-NEXT:    retq
4783  %1 = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64> %a0, <8 x i16> undef, i8 -1)
4784  %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4785  ret <8 x i16> %1
4786}
4787declare <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64>, <8 x i16>, i8)
4788
4789define <32 x i8> @stack_fold_vpmovwb(<32 x i16> %a0) {
4790; CHECK-LABEL: stack_fold_vpmovwb:
4791; CHECK:       # %bb.0:
4792; CHECK-NEXT:    vpmovwb %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Folded Spill
4793; CHECK-NEXT:    #APP
4794; CHECK-NEXT:    nop
4795; CHECK-NEXT:    #NO_APP
4796; CHECK-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4797; CHECK-NEXT:    retq
4798  %1 = trunc <32 x i16> %a0 to <32 x i8>
4799  %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4800  ret <32 x i8> %1
4801}
4802declare <32 x i8> @llvm.x86.avx512.mask.pmov.wb.512(<32 x i16>, <32 x i8>, i32)
4803
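; The vpmovs* tests cover the signed-saturating truncations; the saturating store is again
; folded into the spill and a plain vector move performs the reload.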
4804define <16 x i8> @stack_fold_vpmovsdb(<16 x i32> %a0) {
4805; CHECK-LABEL: stack_fold_vpmovsdb:
4806; CHECK:       # %bb.0:
4807; CHECK-NEXT:    vpmovsdb %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Folded Spill
4808; CHECK-NEXT:    #APP
4809; CHECK-NEXT:    nop
4810; CHECK-NEXT:    #NO_APP
4811; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4812; CHECK-NEXT:    vzeroupper
4813; CHECK-NEXT:    retq
4814  %1 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.512(<16 x i32> %a0, <16 x i8> undef, i16 -1)
4815  %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4816  ret <16 x i8> %1
4817}
4818declare <16 x i8> @llvm.x86.avx512.mask.pmovs.db.512(<16 x i32>, <16 x i8>, i16)
4819
4820define <16 x i16> @stack_fold_vpmovsdw(<16 x i32> %a0) {
4821; CHECK-LABEL: stack_fold_vpmovsdw:
4822; CHECK:       # %bb.0:
4823; CHECK-NEXT:    vpmovsdw %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Folded Spill
4824; CHECK-NEXT:    #APP
4825; CHECK-NEXT:    nop
4826; CHECK-NEXT:    #NO_APP
4827; CHECK-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4828; CHECK-NEXT:    retq
4829  %1 = call <16 x i16> @llvm.x86.avx512.mask.pmovs.dw.512(<16 x i32> %a0, <16 x i16> undef, i16 -1)
4830  %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4831  ret <16 x i16> %1
4832}
4833declare <16 x i16> @llvm.x86.avx512.mask.pmovs.dw.512(<16 x i32>, <16 x i16>, i16)
4834
4835define <8 x i32> @stack_fold_vpmovsqd(<8 x i64> %a0) {
4836; CHECK-LABEL: stack_fold_vpmovsqd:
4837; CHECK:       # %bb.0:
4838; CHECK-NEXT:    vpmovsqd %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Folded Spill
4839; CHECK-NEXT:    #APP
4840; CHECK-NEXT:    nop
4841; CHECK-NEXT:    #NO_APP
4842; CHECK-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4843; CHECK-NEXT:    retq
4844  %1 = call <8 x i32> @llvm.x86.avx512.mask.pmovs.qd.512(<8 x i64> %a0, <8 x i32> undef, i8 -1)
4845  %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4846  ret <8 x i32> %1
4847}
4848declare <8 x i32> @llvm.x86.avx512.mask.pmovs.qd.512(<8 x i64>, <8 x i32>, i8)
4849
4850define <8 x i16> @stack_fold_vpmovsqw(<8 x i64> %a0) {
4851; CHECK-LABEL: stack_fold_vpmovsqw:
4852; CHECK:       # %bb.0:
4853; CHECK-NEXT:    vpmovsqw %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Folded Spill
4854; CHECK-NEXT:    #APP
4855; CHECK-NEXT:    nop
4856; CHECK-NEXT:    #NO_APP
4857; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4858; CHECK-NEXT:    vzeroupper
4859; CHECK-NEXT:    retq
4860  %1 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.512(<8 x i64> %a0, <8 x i16> undef, i8 -1)
4861  %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4862  ret <8 x i16> %1
4863}
4864declare <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.512(<8 x i64>, <8 x i16>, i8)
4865
4866define <32 x i8> @stack_fold_vpmovswb(<32 x i16> %a0) {
4867; CHECK-LABEL: stack_fold_vpmovswb:
4868; CHECK:       # %bb.0:
4869; CHECK-NEXT:    vpmovswb %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Folded Spill
4870; CHECK-NEXT:    #APP
4871; CHECK-NEXT:    nop
4872; CHECK-NEXT:    #NO_APP
4873; CHECK-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4874; CHECK-NEXT:    retq
4875  %1 = call <32 x i8> @llvm.x86.avx512.mask.pmovs.wb.512(<32 x i16> %a0, <32 x i8> undef, i32 -1)
4876  %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4877  ret <32 x i8> %1
4878}
4879declare <32 x i8> @llvm.x86.avx512.mask.pmovs.wb.512(<32 x i16>, <32 x i8>, i32)
4880
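; The pmovsx* tests check that the sign-extending loads read their source straight from the
; 16/32-byte stack slot (folded reload), including masked and zero-masked variants.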
4881define <16 x i32> @stack_fold_pmovsxbd_zmm(<16 x i8> %a0) {
4882; CHECK-LABEL: stack_fold_pmovsxbd_zmm:
4883; CHECK:       # %bb.0:
4884; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4885; CHECK-NEXT:    #APP
4886; CHECK-NEXT:    nop
4887; CHECK-NEXT:    #NO_APP
4888; CHECK-NEXT:    vpmovsxbd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 16-byte Folded Reload
4889; CHECK-NEXT:    retq
4890  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4891  %2 = sext <16 x i8> %a0 to <16 x i32>
4892  ret <16 x i32> %2
4893}
4894
4895define <8 x i64> @stack_fold_pmovsxbq_zmm(<16 x i8> %a0) {
4896; CHECK-LABEL: stack_fold_pmovsxbq_zmm:
4897; CHECK:       # %bb.0:
4898; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4899; CHECK-NEXT:    #APP
4900; CHECK-NEXT:    nop
4901; CHECK-NEXT:    #NO_APP
4902; CHECK-NEXT:    vpmovsxbq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 16-byte Folded Reload
4903; CHECK-NEXT:    retq
4904  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4905  %2 = shufflevector <16 x i8> %a0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
4906  %3 = sext <8 x i8> %2 to <8 x i64>
4907  ret <8 x i64> %3
4908}
4909
4910define <32 x i16> @stack_fold_pmovsxbw_zmm(<32 x i8> %a0) {
4911; CHECK-LABEL: stack_fold_pmovsxbw_zmm:
4912; CHECK:       # %bb.0:
4913; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4914; CHECK-NEXT:    #APP
4915; CHECK-NEXT:    nop
4916; CHECK-NEXT:    #NO_APP
4917; CHECK-NEXT:    vpmovsxbw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 32-byte Folded Reload
4918; CHECK-NEXT:    retq
4919  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4920  %2 = sext <32 x i8> %a0 to <32 x i16>
4921  ret <32 x i16> %2
4922}
4923
4924define <8 x i64> @stack_fold_pmovsxdq_zmm(<8 x i32> %a0) {
4925; CHECK-LABEL: stack_fold_pmovsxdq_zmm:
4926; CHECK:       # %bb.0:
4927; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4928; CHECK-NEXT:    #APP
4929; CHECK-NEXT:    nop
4930; CHECK-NEXT:    #NO_APP
4931; CHECK-NEXT:    vpmovsxdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 32-byte Folded Reload
4932; CHECK-NEXT:    retq
4933  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4934  %2 = sext <8 x i32> %a0 to <8 x i64>
4935  ret <8 x i64> %2
4936}
4937
4938define <16 x i32> @stack_fold_pmovsxwd_zmm(<16 x i16> %a0) {
4939; CHECK-LABEL: stack_fold_pmovsxwd_zmm:
4940; CHECK:       # %bb.0:
4941; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4942; CHECK-NEXT:    #APP
4943; CHECK-NEXT:    nop
4944; CHECK-NEXT:    #NO_APP
4945; CHECK-NEXT:    vpmovsxwd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 32-byte Folded Reload
4946; CHECK-NEXT:    retq
4947  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4948  %2 = sext <16 x i16> %a0 to <16 x i32>
4949  ret <16 x i32> %2
4950}
4951
4952define <8 x i64> @stack_fold_pmovsxwq_zmm(<8 x i16> %a0) {
4953; CHECK-LABEL: stack_fold_pmovsxwq_zmm:
4954; CHECK:       # %bb.0:
4955; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4956; CHECK-NEXT:    #APP
4957; CHECK-NEXT:    nop
4958; CHECK-NEXT:    #NO_APP
4959; CHECK-NEXT:    vpmovsxwq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 16-byte Folded Reload
4960; CHECK-NEXT:    retq
4961  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4962  %2 = sext <8 x i16> %a0 to <8 x i64>
4963  ret <8 x i64> %2
4964}
4965
4966define <8 x i64> @stack_fold_pmovsxwq_mask_zmm(<8 x i64> %passthru, <8 x i16> %a0, i8 %mask) {
4967; CHECK-LABEL: stack_fold_pmovsxwq_mask_zmm:
4968; CHECK:       # %bb.0:
4969; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4970; CHECK-NEXT:    #APP
4971; CHECK-NEXT:    nop
4972; CHECK-NEXT:    #NO_APP
4973; CHECK-NEXT:    kmovd %edi, %k1
4974; CHECK-NEXT:    vpmovsxwq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} # 16-byte Folded Reload
4975; CHECK-NEXT:    retq
4976  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4977  %2 = sext <8 x i16> %a0 to <8 x i64>
4978  %3 = bitcast i8 %mask to <8 x i1>
4979  %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> %passthru
4980  ret <8 x i64> %4
4981}
4982
4983define <8 x i64> @stack_fold_pmovsxwq_maskz_zmm(<8 x i16> %a0, i8 %mask) {
4984; CHECK-LABEL: stack_fold_pmovsxwq_maskz_zmm:
4985; CHECK:       # %bb.0:
4986; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4987; CHECK-NEXT:    #APP
4988; CHECK-NEXT:    nop
4989; CHECK-NEXT:    #NO_APP
4990; CHECK-NEXT:    kmovd %edi, %k1
4991; CHECK-NEXT:    vpmovsxwq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} {z} # 16-byte Folded Reload
4992; CHECK-NEXT:    retq
4993  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
4994  %2 = sext <8 x i16> %a0 to <8 x i64>
4995  %3 = bitcast i8 %mask to <8 x i1>
4996  %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> zeroinitializer
4997  ret <8 x i64> %4
4998}
4999
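; The vpmovus* tests cover the unsigned-saturating truncations, folded into the spill in the
; same way as the signed forms above.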
5000define <16 x i8> @stack_fold_vpmovusdb(<16 x i32> %a0) {
5001; CHECK-LABEL: stack_fold_vpmovusdb:
5002; CHECK:       # %bb.0:
5003; CHECK-NEXT:    vpmovusdb %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Folded Spill
5004; CHECK-NEXT:    #APP
5005; CHECK-NEXT:    nop
5006; CHECK-NEXT:    #NO_APP
5007; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5008; CHECK-NEXT:    vzeroupper
5009; CHECK-NEXT:    retq
5010  %1 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.512(<16 x i32> %a0, <16 x i8> undef, i16 -1)
5011  %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5012  ret <16 x i8> %1
5013}
5014declare <16 x i8> @llvm.x86.avx512.mask.pmovus.db.512(<16 x i32>, <16 x i8>, i16)
5015
5016define <16 x i16> @stack_fold_vpmovusdw(<16 x i32> %a0) {
5017; CHECK-LABEL: stack_fold_vpmovusdw:
5018; CHECK:       # %bb.0:
5019; CHECK-NEXT:    vpmovusdw %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Folded Spill
5020; CHECK-NEXT:    #APP
5021; CHECK-NEXT:    nop
5022; CHECK-NEXT:    #NO_APP
5023; CHECK-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5024; CHECK-NEXT:    retq
5025  %1 = call <16 x i16> @llvm.x86.avx512.mask.pmovus.dw.512(<16 x i32> %a0, <16 x i16> undef, i16 -1)
5026  %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5027  ret <16 x i16> %1
5028}
5029declare <16 x i16> @llvm.x86.avx512.mask.pmovus.dw.512(<16 x i32>, <16 x i16>, i16)
5030
5031define <8 x i32> @stack_fold_vpmovusqd(<8 x i64> %a0) {
5032; CHECK-LABEL: stack_fold_vpmovusqd:
5033; CHECK:       # %bb.0:
5034; CHECK-NEXT:    vpmovusqd %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Folded Spill
5035; CHECK-NEXT:    #APP
5036; CHECK-NEXT:    nop
5037; CHECK-NEXT:    #NO_APP
5038; CHECK-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5039; CHECK-NEXT:    retq
5040  %1 = call <8 x i32> @llvm.x86.avx512.mask.pmovus.qd.512(<8 x i64> %a0, <8 x i32> undef, i8 -1)
5041  %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5042  ret <8 x i32> %1
5043}
5044declare <8 x i32> @llvm.x86.avx512.mask.pmovus.qd.512(<8 x i64>, <8 x i32>, i8)
5045
5046define <8 x i16> @stack_fold_vpmovusqw(<8 x i64> %a0) {
5047; CHECK-LABEL: stack_fold_vpmovusqw:
5048; CHECK:       # %bb.0:
5049; CHECK-NEXT:    vpmovusqw %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Folded Spill
5050; CHECK-NEXT:    #APP
5051; CHECK-NEXT:    nop
5052; CHECK-NEXT:    #NO_APP
5053; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5054; CHECK-NEXT:    vzeroupper
5055; CHECK-NEXT:    retq
5056  %1 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.512(<8 x i64> %a0, <8 x i16> undef, i8 -1)
5057  %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5058  ret <8 x i16> %1
5059}
5060declare <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.512(<8 x i64>, <8 x i16>, i8)
5061
5062define <32 x i8> @stack_fold_vpmovuswb(<32 x i16> %a0) {
5063; CHECK-LABEL: stack_fold_vpmovuswb:
5064; CHECK:       # %bb.0:
5065; CHECK-NEXT:    vpmovuswb %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Folded Spill
5066; CHECK-NEXT:    #APP
5067; CHECK-NEXT:    nop
5068; CHECK-NEXT:    #NO_APP
5069; CHECK-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5070; CHECK-NEXT:    retq
5071  %1 = call <32 x i8> @llvm.x86.avx512.mask.pmovus.wb.512(<32 x i16> %a0, <32 x i8> undef, i32 -1)
5072  %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5073  ret <32 x i8> %1
5074}
5075declare <32 x i8> @llvm.x86.avx512.mask.pmovus.wb.512(<32 x i16>, <32 x i8>, i32)
5076
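; The pmovzx* tests mirror the pmovsx* ones for zero extension.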
5077define <16 x i32> @stack_fold_pmovzxbd_zmm(<16 x i8> %a0) {
5078; CHECK-LABEL: stack_fold_pmovzxbd_zmm:
5079; CHECK:       # %bb.0:
5080; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5081; CHECK-NEXT:    #APP
5082; CHECK-NEXT:    nop
5083; CHECK-NEXT:    #NO_APP
5084; CHECK-NEXT:    vpmovzxbd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 16-byte Folded Reload
5085; CHECK-NEXT:    # zmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
5086; CHECK-NEXT:    retq
5087  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5088  %2 = zext <16 x i8> %a0 to <16 x i32>
5089  ret <16 x i32> %2
5090}
5091
5092define <8 x i64> @stack_fold_pmovzxbq_zmm(<16 x i8> %a0) {
5093; CHECK-LABEL: stack_fold_pmovzxbq_zmm:
5094; CHECK:       # %bb.0:
5095; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5096; CHECK-NEXT:    #APP
5097; CHECK-NEXT:    nop
5098; CHECK-NEXT:    #NO_APP
5099; CHECK-NEXT:    vpmovzxbq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 16-byte Folded Reload
5100; CHECK-NEXT:    # zmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero,mem[4],zero,zero,zero,zero,zero,zero,zero,mem[5],zero,zero,zero,zero,zero,zero,zero,mem[6],zero,zero,zero,zero,zero,zero,zero,mem[7],zero,zero,zero,zero,zero,zero,zero
5101; CHECK-NEXT:    retq
5102  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5103  %2 = shufflevector <16 x i8> %a0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
5104  %3 = zext <8 x i8> %2 to <8 x i64>
5105  ret <8 x i64> %3
5106}
5107
5108define <32 x i16> @stack_fold_pmovzxbw_zmm(<32 x i8> %a0) {
5109; CHECK-LABEL: stack_fold_pmovzxbw_zmm:
5110; CHECK:       # %bb.0:
5111; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5112; CHECK-NEXT:    #APP
5113; CHECK-NEXT:    nop
5114; CHECK-NEXT:    #NO_APP
5115; CHECK-NEXT:    vpmovzxbw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 32-byte Folded Reload
5116; CHECK-NEXT:    # zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero,mem[16],zero,mem[17],zero,mem[18],zero,mem[19],zero,mem[20],zero,mem[21],zero,mem[22],zero,mem[23],zero,mem[24],zero,mem[25],zero,mem[26],zero,mem[27],zero,mem[28],zero,mem[29],zero,mem[30],zero,mem[31],zero
5117; CHECK-NEXT:    retq
5118  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5119  %2 = zext <32 x i8> %a0 to <32 x i16>
5120  ret <32 x i16> %2
5121}
5122
5123define <8 x i64> @stack_fold_pmovzxdq_zmm(<8 x i32> %a0) {
5124; CHECK-LABEL: stack_fold_pmovzxdq_zmm:
5125; CHECK:       # %bb.0:
5126; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5127; CHECK-NEXT:    #APP
5128; CHECK-NEXT:    nop
5129; CHECK-NEXT:    #NO_APP
5130; CHECK-NEXT:    vpmovzxdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 32-byte Folded Reload
5131; CHECK-NEXT:    # zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
5132; CHECK-NEXT:    retq
5133  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5134  %2 = zext <8 x i32> %a0 to <8 x i64>
5135  ret <8 x i64> %2
5136}
5137
5138define <16 x i32> @stack_fold_pmovzxwd_zmm(<16 x i16> %a0) {
5139; CHECK-LABEL: stack_fold_pmovzxwd_zmm:
5140; CHECK:       # %bb.0:
5141; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5142; CHECK-NEXT:    #APP
5143; CHECK-NEXT:    nop
5144; CHECK-NEXT:    #NO_APP
5145; CHECK-NEXT:    vpmovzxwd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 32-byte Folded Reload
5146; CHECK-NEXT:    # zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
5147; CHECK-NEXT:    retq
5148  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5149  %2 = zext <16 x i16> %a0 to <16 x i32>
5150  ret <16 x i32> %2
5151}
5152
5153define <8 x i64> @stack_fold_pmovzxwq_zmm(<8 x i16> %a0) {
5154; CHECK-LABEL: stack_fold_pmovzxwq_zmm:
5155; CHECK:       # %bb.0:
5156; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5157; CHECK-NEXT:    #APP
5158; CHECK-NEXT:    nop
5159; CHECK-NEXT:    #NO_APP
5160; CHECK-NEXT:    vpmovzxwq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 16-byte Folded Reload
5161; CHECK-NEXT:    # zmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
5162; CHECK-NEXT:    retq
5163  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5164  %2 = zext <8 x i16> %a0 to <8 x i64>
5165  ret <8 x i64> %2
5166}
5167
5168define <8 x i64> @stack_fold_pmovzxwq_mask_zmm(<8 x i64> %passthru, <8 x i16> %a0, i8 %mask) {
5169; CHECK-LABEL: stack_fold_pmovzxwq_mask_zmm:
5170; CHECK:       # %bb.0:
5171; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5172; CHECK-NEXT:    #APP
5173; CHECK-NEXT:    nop
5174; CHECK-NEXT:    #NO_APP
5175; CHECK-NEXT:    kmovd %edi, %k1
5176; CHECK-NEXT:    vpmovzxwq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} # 16-byte Folded Reload
5177; CHECK-NEXT:    # zmm0 {%k1} = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
5178; CHECK-NEXT:    retq
5179  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5180  %2 = zext <8 x i16> %a0 to <8 x i64>
5181  %3 = bitcast i8 %mask to <8 x i1>
5182  %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> %passthru
5183  ret <8 x i64> %4
5184}
5185
5186define <8 x i64> @stack_fold_pmovzxwq_maskz_zmm(<8 x i16> %a0, i8 %mask) {
5187; CHECK-LABEL: stack_fold_pmovzxwq_maskz_zmm:
5188; CHECK:       # %bb.0:
5189; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5190; CHECK-NEXT:    #APP
5191; CHECK-NEXT:    nop
5192; CHECK-NEXT:    #NO_APP
5193; CHECK-NEXT:    kmovd %edi, %k1
5194; CHECK-NEXT:    vpmovzxwq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} {z} # 16-byte Folded Reload
5195; CHECK-NEXT:    # zmm0 {%k1} {z} = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
5196; CHECK-NEXT:    retq
5197  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5198  %2 = zext <8 x i16> %a0 to <8 x i64>
5199  %3 = bitcast i8 %mask to <8 x i1>
5200  %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> zeroinitializer
5201  ret <8 x i64> %4
5202}
5203
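; vpmulld (32-bit multiply): the second multiplicand is reloaded from the stack, with
; commuted, masked and zero-masked variants.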
5204define <16 x i32> @stack_fold_pmulld(<16 x i32> %a0, <16 x i32> %a1) {
5205; CHECK-LABEL: stack_fold_pmulld:
5206; CHECK:       # %bb.0:
5207; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5208; CHECK-NEXT:    #APP
5209; CHECK-NEXT:    nop
5210; CHECK-NEXT:    #NO_APP
5211; CHECK-NEXT:    vpmulld {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
5212; CHECK-NEXT:    retq
5213  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5214  %2 = mul <16 x i32> %a0, %a1
5215  ret <16 x i32> %2
5216}
5217
5218define <16 x i32> @stack_fold_pmulld_commuted(<16 x i32> %a0, <16 x i32> %a1) {
5219; CHECK-LABEL: stack_fold_pmulld_commuted:
5220; CHECK:       # %bb.0:
5221; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5222; CHECK-NEXT:    #APP
5223; CHECK-NEXT:    nop
5224; CHECK-NEXT:    #NO_APP
5225; CHECK-NEXT:    vpmulld {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
5226; CHECK-NEXT:    retq
5227  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5228  %2 = mul <16 x i32> %a1, %a0
5229  ret <16 x i32> %2
5230}
5231
5232define <16 x i32> @stack_fold_pmulld_mask(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %a2, i16 %mask) {
5233; CHECK-LABEL: stack_fold_pmulld_mask:
5234; CHECK:       # %bb.0:
5235; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5236; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm1
5237; CHECK-NEXT:    #APP
5238; CHECK-NEXT:    nop
5239; CHECK-NEXT:    #NO_APP
5240; CHECK-NEXT:    kmovd %esi, %k1
5241; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm0
5242; CHECK-NEXT:    vpmulld {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
5243; CHECK-NEXT:    retq
5244  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5245  %2 = mul <16 x i32> %a0, %a1
5246  %3 = bitcast i16 %mask to <16 x i1>
5247  ; load needed to keep the operation from being scheduled above the asm block
5248  %4 = load <16 x i32>, <16 x i32>* %a2
5249  %5 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> %4
5250  ret <16 x i32> %5
5251}
5252
5253define <16 x i32> @stack_fold_pmulld_mask_commuted(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %a2, i16 %mask) {
5254; CHECK-LABEL: stack_fold_pmulld_mask_commuted:
5255; CHECK:       # %bb.0:
5256; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5257; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm1
5258; CHECK-NEXT:    #APP
5259; CHECK-NEXT:    nop
5260; CHECK-NEXT:    #NO_APP
5261; CHECK-NEXT:    kmovd %esi, %k1
5262; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm0
5263; CHECK-NEXT:    vpmulld {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
5264; CHECK-NEXT:    retq
5265  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5266  %2 = mul <16 x i32> %a1, %a0
5267  %3 = bitcast i16 %mask to <16 x i1>
5268  ; load needed to keep the operation from being scheduled above the asm block
5269  %4 = load <16 x i32>, <16 x i32>* %a2
5270  %5 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> %4
5271  ret <16 x i32> %5
5272}
5273
5274define <16 x i32> @stack_fold_pmulld_maskz(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
5275; CHECK-LABEL: stack_fold_pmulld_maskz:
5276; CHECK:       # %bb.0:
5277; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5278; CHECK-NEXT:    #APP
5279; CHECK-NEXT:    nop
5280; CHECK-NEXT:    #NO_APP
5281; CHECK-NEXT:    kmovd %edi, %k1
5282; CHECK-NEXT:    vpmulld {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
5283; CHECK-NEXT:    retq
5284  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5285  %2 = mul <16 x i32> %a0, %a1
5286  %3 = bitcast i16 %mask to <16 x i1>
5287  %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer
5288  ret <16 x i32> %4
5289}
5290
5291define <16 x i32> @stack_fold_pmulld_maskz_commuted(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
5292; CHECK-LABEL: stack_fold_pmulld_maskz_commuted:
5293; CHECK:       # %bb.0:
5294; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5295; CHECK-NEXT:    #APP
5296; CHECK-NEXT:    nop
5297; CHECK-NEXT:    #NO_APP
5298; CHECK-NEXT:    kmovd %edi, %k1
5299; CHECK-NEXT:    vpmulld {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
5300; CHECK-NEXT:    retq
5301  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5302  %2 = mul <16 x i32> %a1, %a0
5303  %3 = bitcast i16 %mask to <16 x i1>
5304  %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer
5305  ret <16 x i32> %4
5306}
5307
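; vpmullq (AVX512DQ 64-bit low multiply) folded-reload tests.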
5308define <8 x i64> @stack_fold_pmullq(<8 x i64> %a0, <8 x i64> %a1) {
5309; CHECK-LABEL: stack_fold_pmullq:
5310; CHECK:       # %bb.0:
5311; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5312; CHECK-NEXT:    #APP
5313; CHECK-NEXT:    nop
5314; CHECK-NEXT:    #NO_APP
5315; CHECK-NEXT:    vpmullq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
5316; CHECK-NEXT:    retq
5317  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5318  %2 = mul <8 x i64> %a0, %a1
5319  ret <8 x i64> %2
5320}
5321
5322define <8 x i64> @stack_fold_pmullq_commuted(<8 x i64> %a0, <8 x i64> %a1) {
5323; CHECK-LABEL: stack_fold_pmullq_commuted:
5324; CHECK:       # %bb.0:
5325; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5326; CHECK-NEXT:    #APP
5327; CHECK-NEXT:    nop
5328; CHECK-NEXT:    #NO_APP
5329; CHECK-NEXT:    vpmullq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
5330; CHECK-NEXT:    retq
5331  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5332  %2 = mul <8 x i64> %a1, %a0
5333  ret <8 x i64> %2
5334}
5335
5336define <8 x i64> @stack_fold_pmullq_mask(<8 x i64> %a0, <8 x i64> %a1, <8 x i64>* %a2, i8 %mask) {
5337; CHECK-LABEL: stack_fold_pmullq_mask:
5338; CHECK:       # %bb.0:
5339; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5340; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm1
5341; CHECK-NEXT:    #APP
5342; CHECK-NEXT:    nop
5343; CHECK-NEXT:    #NO_APP
5344; CHECK-NEXT:    kmovd %esi, %k1
5345; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm0
5346; CHECK-NEXT:    vpmullq {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
5347; CHECK-NEXT:    retq
5348  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5349  %2 = mul <8 x i64> %a0, %a1
5350  %3 = bitcast i8 %mask to <8 x i1>
5351  ; load needed to keep the operation from being scheduled above the asm block
5352  %4 = load <8 x i64>, <8 x i64>* %a2
5353  %5 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> %4
5354  ret <8 x i64> %5
5355}
5356
5357define <8 x i64> @stack_fold_pmullq_mask_commuted(<8 x i64> %a0, <8 x i64> %a1, <8 x i64>* %a2, i8 %mask) {
5358; CHECK-LABEL: stack_fold_pmullq_mask_commuted:
5359; CHECK:       # %bb.0:
5360; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5361; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm1
5362; CHECK-NEXT:    #APP
5363; CHECK-NEXT:    nop
5364; CHECK-NEXT:    #NO_APP
5365; CHECK-NEXT:    kmovd %esi, %k1
5366; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm0
5367; CHECK-NEXT:    vpmullq {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
5368; CHECK-NEXT:    retq
5369  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5370  %2 = mul <8 x i64> %a1, %a0
5371  %3 = bitcast i8 %mask to <8 x i1>
5372  ; load needed to keep the operation from being scheduled above the asm block
5373  %4 = load <8 x i64>, <8 x i64>* %a2
5374  %5 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> %4
5375  ret <8 x i64> %5
5376}
5377
5378define <8 x i64> @stack_fold_pmullq_maskz(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
5379; CHECK-LABEL: stack_fold_pmullq_maskz:
5380; CHECK:       # %bb.0:
5381; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5382; CHECK-NEXT:    #APP
5383; CHECK-NEXT:    nop
5384; CHECK-NEXT:    #NO_APP
5385; CHECK-NEXT:    kmovd %edi, %k1
5386; CHECK-NEXT:    vpmullq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
5387; CHECK-NEXT:    retq
5388  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5389  %2 = mul <8 x i64> %a0, %a1
5390  %3 = bitcast i8 %mask to <8 x i1>
5391  %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> zeroinitializer
5392  ret <8 x i64> %4
5393}
5394
5395define <8 x i64> @stack_fold_pmullq_maskz_commuted(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
5396; CHECK-LABEL: stack_fold_pmullq_maskz_commuted:
5397; CHECK:       # %bb.0:
5398; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5399; CHECK-NEXT:    #APP
5400; CHECK-NEXT:    nop
5401; CHECK-NEXT:    #NO_APP
5402; CHECK-NEXT:    kmovd %edi, %k1
5403; CHECK-NEXT:    vpmullq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
5404; CHECK-NEXT:    retq
5405  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5406  %2 = mul <8 x i64> %a1, %a0
5407  %3 = bitcast i8 %mask to <8 x i1>
5408  %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> zeroinitializer
5409  ret <8 x i64> %4
5410}
5411
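; vpmullw (AVX512BW 16-bit multiply) folded-reload tests.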
5412define <32 x i16> @stack_fold_pmullw(<32 x i16> %a0, <32 x i16> %a1) {
5413; CHECK-LABEL: stack_fold_pmullw:
5414; CHECK:       # %bb.0:
5415; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5416; CHECK-NEXT:    #APP
5417; CHECK-NEXT:    nop
5418; CHECK-NEXT:    #NO_APP
5419; CHECK-NEXT:    vpmullw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
5420; CHECK-NEXT:    retq
5421  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5422  %2 = mul <32 x i16> %a0, %a1
5423  ret <32 x i16> %2
5424}
5425
5426define <32 x i16> @stack_fold_pmullw_commuted(<32 x i16> %a0, <32 x i16> %a1) {
5427; CHECK-LABEL: stack_fold_pmullw_commuted:
5428; CHECK:       # %bb.0:
5429; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5430; CHECK-NEXT:    #APP
5431; CHECK-NEXT:    nop
5432; CHECK-NEXT:    #NO_APP
5433; CHECK-NEXT:    vpmullw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
5434; CHECK-NEXT:    retq
5435  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5436  %2 = mul <32 x i16> %a1, %a0
5437  ret <32 x i16> %2
5438}
5439
5440define <32 x i16> @stack_fold_pmullw_mask(<32 x i16> %a0, <32 x i16> %a1, <32 x i16>* %a2, i32 %mask) {
5441; CHECK-LABEL: stack_fold_pmullw_mask:
5442; CHECK:       # %bb.0:
5443; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5444; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm1
5445; CHECK-NEXT:    #APP
5446; CHECK-NEXT:    nop
5447; CHECK-NEXT:    #NO_APP
5448; CHECK-NEXT:    kmovd %esi, %k1
5449; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm0
5450; CHECK-NEXT:    vpmullw {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
5451; CHECK-NEXT:    retq
5452  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5453  %2 = mul <32 x i16> %a0, %a1
5454  %3 = bitcast i32 %mask to <32 x i1>
5455  ; load needed to keep the operation from being scheduled above the asm block
5456  %4 = load <32 x i16>, <32 x i16>* %a2
5457  %5 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> %4
5458  ret <32 x i16> %5
5459}
5460
5461define <32 x i16> @stack_fold_pmullw_mask_commuted(<32 x i16> %a0, <32 x i16> %a1, <32 x i16>* %a2, i32 %mask) {
5462; CHECK-LABEL: stack_fold_pmullw_mask_commuted:
5463; CHECK:       # %bb.0:
5464; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5465; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm1
5466; CHECK-NEXT:    #APP
5467; CHECK-NEXT:    nop
5468; CHECK-NEXT:    #NO_APP
5469; CHECK-NEXT:    kmovd %esi, %k1
5470; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm0
5471; CHECK-NEXT:    vpmullw {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
5472; CHECK-NEXT:    retq
5473  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5474  %2 = mul <32 x i16> %a1, %a0
5475  %3 = bitcast i32 %mask to <32 x i1>
5476  ; load needed to keep the operation from being scheduled above the asm block
5477  %4 = load <32 x i16>, <32 x i16>* %a2
5478  %5 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> %4
5479  ret <32 x i16> %5
5480}
5481
5482define <32 x i16> @stack_fold_pmullw_maskz(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) {
5483; CHECK-LABEL: stack_fold_pmullw_maskz:
5484; CHECK:       # %bb.0:
5485; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5486; CHECK-NEXT:    #APP
5487; CHECK-NEXT:    nop
5488; CHECK-NEXT:    #NO_APP
5489; CHECK-NEXT:    kmovd %edi, %k1
5490; CHECK-NEXT:    vpmullw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
5491; CHECK-NEXT:    retq
5492  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5493  %2 = mul <32 x i16> %a0, %a1
5494  %3 = bitcast i32 %mask to <32 x i1>
5495  %4 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> zeroinitializer
5496  ret <32 x i16> %4
5497}
5498
5499define <32 x i16> @stack_fold_pmullw_maskz_commuted(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) {
5500; CHECK-LABEL: stack_fold_pmullw_maskz_commuted:
5501; CHECK:       # %bb.0:
5502; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5503; CHECK-NEXT:    #APP
5504; CHECK-NEXT:    nop
5505; CHECK-NEXT:    #NO_APP
5506; CHECK-NEXT:    kmovd %edi, %k1
5507; CHECK-NEXT:    vpmullw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
5508; CHECK-NEXT:    retq
5509  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5510  %2 = mul <32 x i16> %a1, %a0
5511  %3 = bitcast i32 %mask to <32 x i1>
5512  %4 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> zeroinitializer
5513  ret <32 x i16> %4
5514}
5515
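; vpmuldq multiplies the sign-extended low 32 bits of each 64-bit element; the IR expresses
; this with a shl/ashr-by-32 pair on each operand before the mul.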
5516define <8 x i64> @stack_fold_pmuldq(<8 x i64> %a0, <8 x i64> %a1) {
5517; CHECK-LABEL: stack_fold_pmuldq:
5518; CHECK:       # %bb.0:
5519; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5520; CHECK-NEXT:    #APP
5521; CHECK-NEXT:    nop
5522; CHECK-NEXT:    #NO_APP
5523; CHECK-NEXT:    vpmuldq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
5524; CHECK-NEXT:    retq
5525  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5526  %2 = shl <8 x i64> %a0, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
5527  %3 = ashr <8 x i64> %2, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
5528  %4 = shl <8 x i64> %a1, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
5529  %5 = ashr <8 x i64> %4, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
5530  %6 = mul <8 x i64> %3, %5
5531  ret <8 x i64> %6
5532}
5533
5534define <8 x i64> @stack_fold_pmuldq_commuted(<8 x i64> %a0, <8 x i64> %a1) {
5535; CHECK-LABEL: stack_fold_pmuldq_commuted:
5536; CHECK:       # %bb.0:
5537; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5538; CHECK-NEXT:    #APP
5539; CHECK-NEXT:    nop
5540; CHECK-NEXT:    #NO_APP
5541; CHECK-NEXT:    vpmuldq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
5542; CHECK-NEXT:    retq
5543  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5544  %2 = shl <8 x i64> %a0, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
5545  %3 = ashr <8 x i64> %2, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
5546  %4 = shl <8 x i64> %a1, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
5547  %5 = ashr <8 x i64> %4, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
5548  %6 = mul <8 x i64> %5, %3
5549  ret <8 x i64> %6
5550}
5551
5552define <8 x i64> @stack_fold_pmuldq_mask(<8 x i64> %a0, <8 x i64> %a1, <8 x i64>* %a2, i8 %mask) {
5553; CHECK-LABEL: stack_fold_pmuldq_mask:
5554; CHECK:       # %bb.0:
5555; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5556; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm1
5557; CHECK-NEXT:    #APP
5558; CHECK-NEXT:    nop
5559; CHECK-NEXT:    #NO_APP
5560; CHECK-NEXT:    kmovd %esi, %k1
5561; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm0
5562; CHECK-NEXT:    vpmuldq {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
5563; CHECK-NEXT:    retq
5564  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5565  %2 = shl <8 x i64> %a0, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
5566  %3 = ashr <8 x i64> %2, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
5567  %4 = shl <8 x i64> %a1, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
5568  %5 = ashr <8 x i64> %4, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
5569  %6 = mul <8 x i64> %3, %5
5570  %7 = bitcast i8 %mask to <8 x i1>
5571  ; load needed to keep the operation from being scheduled above the asm block
5572  %8 = load <8 x i64>, <8 x i64>* %a2
5573  %9 = select <8 x i1> %7, <8 x i64> %6, <8 x i64> %8
5574  ret <8 x i64> %9
5575}
5576
5577define <8 x i64> @stack_fold_pmuldq_mask_commuted(<8 x i64> %a0, <8 x i64> %a1, <8 x i64>* %a2, i8 %mask) {
5578; CHECK-LABEL: stack_fold_pmuldq_mask_commuted:
5579; CHECK:       # %bb.0:
5580; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5581; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm1
5582; CHECK-NEXT:    #APP
5583; CHECK-NEXT:    nop
5584; CHECK-NEXT:    #NO_APP
5585; CHECK-NEXT:    kmovd %esi, %k1
5586; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm0
5587; CHECK-NEXT:    vpmuldq {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
5588; CHECK-NEXT:    retq
5589  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5590  %2 = shl <8 x i64> %a0, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
5591  %3 = ashr <8 x i64> %2, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
5592  %4 = shl <8 x i64> %a1, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
5593  %5 = ashr <8 x i64> %4, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
5594  %6 = mul <8 x i64> %5, %3
5595  %7 = bitcast i8 %mask to <8 x i1>
5596  ; load needed to keep the operation from being scheduled above the asm block
5597  %8 = load <8 x i64>, <8 x i64>* %a2
5598  %9 = select <8 x i1> %7, <8 x i64> %6, <8 x i64> %8
5599  ret <8 x i64> %9
5600}
5601
5602define <8 x i64> @stack_fold_pmuldq_maskz(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
5603; CHECK-LABEL: stack_fold_pmuldq_maskz:
5604; CHECK:       # %bb.0:
5605; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5606; CHECK-NEXT:    #APP
5607; CHECK-NEXT:    nop
5608; CHECK-NEXT:    #NO_APP
5609; CHECK-NEXT:    kmovd %edi, %k1
5610; CHECK-NEXT:    vpmuldq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
5611; CHECK-NEXT:    retq
5612  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5613  %2 = shl <8 x i64> %a0, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
5614  %3 = ashr <8 x i64> %2, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
5615  %4 = shl <8 x i64> %a1, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
5616  %5 = ashr <8 x i64> %4, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
5617  %6 = mul <8 x i64> %3, %5
5618  %7 = bitcast i8 %mask to <8 x i1>
5619  %8 = select <8 x i1> %7, <8 x i64> %6, <8 x i64> zeroinitializer
5620  ret <8 x i64> %8
5621}
5622
5623define <8 x i64> @stack_fold_pmuldq_maskz_commuted(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
5624; CHECK-LABEL: stack_fold_pmuldq_maskz_commuted:
5625; CHECK:       # %bb.0:
5626; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5627; CHECK-NEXT:    #APP
5628; CHECK-NEXT:    nop
5629; CHECK-NEXT:    #NO_APP
5630; CHECK-NEXT:    kmovd %edi, %k1
5631; CHECK-NEXT:    vpmuldq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
5632; CHECK-NEXT:    retq
5633  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5634  %2 = shl <8 x i64> %a0, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
5635  %3 = ashr <8 x i64> %2, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
5636  %4 = shl <8 x i64> %a1, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
5637  %5 = ashr <8 x i64> %4, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
5638  %6 = mul <8 x i64> %5, %3
5639  %7 = bitcast i8 %mask to <8 x i1>
5640  %8 = select <8 x i1> %7, <8 x i64> %6, <8 x i64> zeroinitializer
5641  ret <8 x i64> %8
5642}
5646
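; vpmuludq is the unsigned counterpart; the IR zero-extends the low 32 bits by masking each
; operand with 4294967295 before the mul.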
5647define <8 x i64> @stack_fold_pmuludq(<8 x i64> %a0, <8 x i64> %a1) {
5648; CHECK-LABEL: stack_fold_pmuludq:
5649; CHECK:       # %bb.0:
5650; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5651; CHECK-NEXT:    #APP
5652; CHECK-NEXT:    nop
5653; CHECK-NEXT:    #NO_APP
5654; CHECK-NEXT:    vpmuludq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
5655; CHECK-NEXT:    retq
5656  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5657  %2 = and <8 x i64> %a0, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
5658  %3 = and <8 x i64> %a1, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
5659  %4 = mul <8 x i64> %2, %3
5660  ret <8 x i64> %4
5661}
5662
5663define <8 x i64> @stack_fold_pmuludq_commuted(<8 x i64> %a0, <8 x i64> %a1) {
5664; CHECK-LABEL: stack_fold_pmuludq_commuted:
5665; CHECK:       # %bb.0:
5666; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5667; CHECK-NEXT:    #APP
5668; CHECK-NEXT:    nop
5669; CHECK-NEXT:    #NO_APP
5670; CHECK-NEXT:    vpmuludq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
5671; CHECK-NEXT:    retq
5672  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5673  %2 = and <8 x i64> %a0, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
5674  %3 = and <8 x i64> %a1, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
5675  %4 = mul <8 x i64> %3, %2
5676  ret <8 x i64> %4
5677}
5678
5679define <8 x i64> @stack_fold_pmuludq_mask(<8 x i64> %a0, <8 x i64> %a1, <8 x i64>* %a2, i8 %mask) {
5680; CHECK-LABEL: stack_fold_pmuludq_mask:
5681; CHECK:       # %bb.0:
5682; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5683; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm1
5684; CHECK-NEXT:    #APP
5685; CHECK-NEXT:    nop
5686; CHECK-NEXT:    #NO_APP
5687; CHECK-NEXT:    kmovd %esi, %k1
5688; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm0
5689; CHECK-NEXT:    vpmuludq {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
5690; CHECK-NEXT:    retq
5691  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5692  %2 = and <8 x i64> %a0, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
5693  %3 = and <8 x i64> %a1, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
5694  %4 = mul <8 x i64> %2, %3
5695  %5 = bitcast i8 %mask to <8 x i1>
5696  ; load needed to keep the operation from being scheduled above the asm block
5697  %6 = load <8 x i64>, <8 x i64>* %a2
5698  %7 = select <8 x i1> %5, <8 x i64> %4, <8 x i64> %6
5699  ret <8 x i64> %7
5700}
5701
5702define <8 x i64> @stack_fold_pmuludq_mask_commuted(<8 x i64> %a0, <8 x i64> %a1, <8 x i64>* %a2, i8 %mask) {
5703; CHECK-LABEL: stack_fold_pmuludq_mask_commuted:
5704; CHECK:       # %bb.0:
5705; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5706; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm1
5707; CHECK-NEXT:    #APP
5708; CHECK-NEXT:    nop
5709; CHECK-NEXT:    #NO_APP
5710; CHECK-NEXT:    kmovd %esi, %k1
5711; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm0
5712; CHECK-NEXT:    vpmuludq {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
5713; CHECK-NEXT:    retq
5714  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5715  %2 = and <8 x i64> %a0, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
5716  %3 = and <8 x i64> %a1, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
5717  %4 = mul <8 x i64> %3, %2
5718  %5 = bitcast i8 %mask to <8 x i1>
5719  ; load needed to keep the operation from being scheduled above the asm block
5720  %6 = load <8 x i64>, <8 x i64>* %a2
5721  %7 = select <8 x i1> %5, <8 x i64> %4, <8 x i64> %6
5722  ret <8 x i64> %7
5723}
5724
5725define <8 x i64> @stack_fold_pmuludq_maskz(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
5726; CHECK-LABEL: stack_fold_pmuludq_maskz:
5727; CHECK:       # %bb.0:
5728; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5729; CHECK-NEXT:    #APP
5730; CHECK-NEXT:    nop
5731; CHECK-NEXT:    #NO_APP
5732; CHECK-NEXT:    kmovd %edi, %k1
5733; CHECK-NEXT:    vpmuludq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
5734; CHECK-NEXT:    retq
5735  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5736  %2 = and <8 x i64> %a0, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
5737  %3 = and <8 x i64> %a1, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
5738  %4 = mul <8 x i64> %2, %3
5739  %5 = bitcast i8 %mask to <8 x i1>
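  ; selecting against zeroinitializer models the zero-masked ({%k1} {z}) form of the fold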
5740  %6 = select <8 x i1> %5, <8 x i64> %4, <8 x i64> zeroinitializer
5741  ret <8 x i64> %6
5742}
5743
5744define <8 x i64> @stack_fold_pmuludq_maskz_commuted(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
5745; CHECK-LABEL: stack_fold_pmuludq_maskz_commuted:
5746; CHECK:       # %bb.0:
5747; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5748; CHECK-NEXT:    #APP
5749; CHECK-NEXT:    nop
5750; CHECK-NEXT:    #NO_APP
5751; CHECK-NEXT:    kmovd %edi, %k1
5752; CHECK-NEXT:    vpmuludq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
5753; CHECK-NEXT:    retq
5754  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5755  %2 = and <8 x i64> %a0, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
5756  %3 = and <8 x i64> %a1, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
5757  %4 = mul <8 x i64> %3, %2
5758  %5 = bitcast i8 %mask to <8 x i1>
5759  %6 = select <8 x i1> %5, <8 x i64> %4, <8 x i64> zeroinitializer
5760  ret <8 x i64> %6
5761}
5762
5763define <16 x i32> @stack_fold_vpopcntd(<16 x i32> %a0) {
5764; CHECK-LABEL: stack_fold_vpopcntd:
5765; CHECK:       # %bb.0:
5766; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5767; CHECK-NEXT:    #APP
5768; CHECK-NEXT:    nop
5769; CHECK-NEXT:    #NO_APP
5770; CHECK-NEXT:    vpopcntd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
5771; CHECK-NEXT:    retq
5772  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5773  %2 = call <16 x i32> @llvm.ctpop.v16i32(<16 x i32> %a0)
5774  ret <16 x i32> %2
5775}
5776declare <16 x i32> @llvm.ctpop.v16i32(<16 x i32>) nounwind readnone
5777
5778define <8 x i64> @stack_fold_vpopcntq(<8 x i64> %a0) {
5779; CHECK-LABEL: stack_fold_vpopcntq:
5780; CHECK:       # %bb.0:
5781; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5782; CHECK-NEXT:    #APP
5783; CHECK-NEXT:    nop
5784; CHECK-NEXT:    #NO_APP
5785; CHECK-NEXT:    vpopcntq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
5786; CHECK-NEXT:    retq
5787  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5788  %2 = call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %a0)
5789  ret <8 x i64> %2
5790}
5791declare <8 x i64> @llvm.ctpop.v8i64(<8 x i64>) nounwind readnone
5792
5793define <16 x i32> @stack_fold_pord(<16 x i32> %a0, <16 x i32> %a1) {
5794; CHECK-LABEL: stack_fold_pord:
5795; CHECK:       # %bb.0:
5796; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5797; CHECK-NEXT:    #APP
5798; CHECK-NEXT:    nop
5799; CHECK-NEXT:    #NO_APP
5800; CHECK-NEXT:    vorps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
5801; CHECK-NEXT:    retq
5802  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5803  %2 = or <16 x i32> %a0, %a1
5804  ret <16 x i32> %2
5805}
5806
5807define <16 x i32> @stack_fold_pord_commuted(<16 x i32> %a0, <16 x i32> %a1) {
5808; CHECK-LABEL: stack_fold_pord_commuted:
5809; CHECK:       # %bb.0:
5810; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5811; CHECK-NEXT:    #APP
5812; CHECK-NEXT:    nop
5813; CHECK-NEXT:    #NO_APP
5814; CHECK-NEXT:    vorps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
5815; CHECK-NEXT:    retq
5816  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5817  %2 = or <16 x i32> %a1, %a0
5818  ret <16 x i32> %2
5819}
5820
5821define <16 x i32> @stack_fold_pord_mask(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %a2, i16 %mask) {
5822; CHECK-LABEL: stack_fold_pord_mask:
5823; CHECK:       # %bb.0:
5824; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5825; CHECK-NEXT:    vmovaps %zmm0, %zmm1
5826; CHECK-NEXT:    #APP
5827; CHECK-NEXT:    nop
5828; CHECK-NEXT:    #NO_APP
5829; CHECK-NEXT:    kmovd %esi, %k1
5830; CHECK-NEXT:    vmovaps (%rdi), %zmm0
5831; CHECK-NEXT:    vorps {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
5832; CHECK-NEXT:    retq
5833  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5834  %2 = or <16 x i32> %a0, %a1
5835  %3 = bitcast i16 %mask to <16 x i1>
5836  ; load needed to keep the operation from being scheduled above the asm block
5837  %4 = load <16 x i32>, <16 x i32>* %a2
5838  %5 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> %4
5839  ret <16 x i32> %5
5840}
5841
5842define <16 x i32> @stack_fold_pord_mask_commuted(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %a2, i16 %mask) {
5843; CHECK-LABEL: stack_fold_pord_mask_commuted:
5844; CHECK:       # %bb.0:
5845; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5846; CHECK-NEXT:    vmovaps %zmm0, %zmm1
5847; CHECK-NEXT:    #APP
5848; CHECK-NEXT:    nop
5849; CHECK-NEXT:    #NO_APP
5850; CHECK-NEXT:    kmovd %esi, %k1
5851; CHECK-NEXT:    vmovaps (%rdi), %zmm0
5852; CHECK-NEXT:    vorps {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
5853; CHECK-NEXT:    retq
5854  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5855  %2 = or <16 x i32> %a1, %a0
5856  %3 = bitcast i16 %mask to <16 x i1>
5857  ; load needed to keep the operation from being scheduled above the asm block
5858  %4 = load <16 x i32>, <16 x i32>* %a2
5859  %5 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> %4
5860  ret <16 x i32> %5
5861}
5862
5863define <16 x i32> @stack_fold_pord_maskz(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
5864; CHECK-LABEL: stack_fold_pord_maskz:
5865; CHECK:       # %bb.0:
5866; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5867; CHECK-NEXT:    #APP
5868; CHECK-NEXT:    nop
5869; CHECK-NEXT:    #NO_APP
5870; CHECK-NEXT:    kmovd %edi, %k1
5871; CHECK-NEXT:    vorps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
5872; CHECK-NEXT:    retq
5873  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5874  %2 = or <16 x i32> %a0, %a1
5875  %3 = bitcast i16 %mask to <16 x i1>
5876  %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer
5877  ret <16 x i32> %4
5878}
5879
5880define <16 x i32> @stack_fold_pord_maskz_commuted(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
5881; CHECK-LABEL: stack_fold_pord_maskz_commuted:
5882; CHECK:       # %bb.0:
5883; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5884; CHECK-NEXT:    #APP
5885; CHECK-NEXT:    nop
5886; CHECK-NEXT:    #NO_APP
5887; CHECK-NEXT:    kmovd %edi, %k1
5888; CHECK-NEXT:    vorps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
5889; CHECK-NEXT:    retq
5890  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5891  %2 = or <16 x i32> %a1, %a0
5892  %3 = bitcast i16 %mask to <16 x i1>
5893  %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer
5894  ret <16 x i32> %4
5895}
5896
5897define <8 x i64> @stack_fold_porq(<8 x i64> %a0, <8 x i64> %a1) {
5898; CHECK-LABEL: stack_fold_porq:
5899; CHECK:       # %bb.0:
5900; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5901; CHECK-NEXT:    #APP
5902; CHECK-NEXT:    nop
5903; CHECK-NEXT:    #NO_APP
5904; CHECK-NEXT:    vorps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
5905; CHECK-NEXT:    retq
5906  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5907  %2 = or <8 x i64> %a0, %a1
5908  ret <8 x i64> %2
5909}
5910
5911define <8 x i64> @stack_fold_porq_commuted(<8 x i64> %a0, <8 x i64> %a1) {
5912; CHECK-LABEL: stack_fold_porq_commuted:
5913; CHECK:       # %bb.0:
5914; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5915; CHECK-NEXT:    #APP
5916; CHECK-NEXT:    nop
5917; CHECK-NEXT:    #NO_APP
5918; CHECK-NEXT:    vorps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
5919; CHECK-NEXT:    retq
5920  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5921  %2 = or <8 x i64> %a1, %a0
5922  ret <8 x i64> %2
5923}
5924
5925define <8 x i64> @stack_fold_porq_mask(<8 x i64> %a0, <8 x i64> %a1, <8 x i64>* %a2, i8 %mask) {
5926; CHECK-LABEL: stack_fold_porq_mask:
5927; CHECK:       # %bb.0:
5928; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5929; CHECK-NEXT:    vmovapd %zmm0, %zmm1
5930; CHECK-NEXT:    #APP
5931; CHECK-NEXT:    nop
5932; CHECK-NEXT:    #NO_APP
5933; CHECK-NEXT:    kmovd %esi, %k1
5934; CHECK-NEXT:    vmovapd (%rdi), %zmm0
5935; CHECK-NEXT:    vorpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
5936; CHECK-NEXT:    retq
5937  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5938  %2 = or <8 x i64> %a0, %a1
5939  %3 = bitcast i8 %mask to <8 x i1>
5940  ; load needed to keep the operation from being scheduled above the asm block
5941  %4 = load <8 x i64>, <8 x i64>* %a2
5942  %5 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> %4
5943  ret <8 x i64> %5
5944}
5945
5946define <8 x i64> @stack_fold_porq_mask_commuted(<8 x i64> %a0, <8 x i64> %a1, <8 x i64>* %a2, i8 %mask) {
5947; CHECK-LABEL: stack_fold_porq_mask_commuted:
5948; CHECK:       # %bb.0:
5949; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5950; CHECK-NEXT:    vmovapd %zmm0, %zmm1
5951; CHECK-NEXT:    #APP
5952; CHECK-NEXT:    nop
5953; CHECK-NEXT:    #NO_APP
5954; CHECK-NEXT:    kmovd %esi, %k1
5955; CHECK-NEXT:    vmovapd (%rdi), %zmm0
5956; CHECK-NEXT:    vorpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
5957; CHECK-NEXT:    retq
5958  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5959  %2 = or <8 x i64> %a1, %a0
5960  %3 = bitcast i8 %mask to <8 x i1>
5961  ; load needed to keep the operation from being scheduled above the asm block
5962  %4 = load <8 x i64>, <8 x i64>* %a2
5963  %5 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> %4
5964  ret <8 x i64> %5
5965}
5966
5967define <8 x i64> @stack_fold_porq_maskz(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
5968; CHECK-LABEL: stack_fold_porq_maskz:
5969; CHECK:       # %bb.0:
5970; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5971; CHECK-NEXT:    #APP
5972; CHECK-NEXT:    nop
5973; CHECK-NEXT:    #NO_APP
5974; CHECK-NEXT:    kmovd %edi, %k1
5975; CHECK-NEXT:    vorpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
5976; CHECK-NEXT:    retq
5977  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5978  %2 = or <8 x i64> %a0, %a1
5979  %3 = bitcast i8 %mask to <8 x i1>
5980  %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> zeroinitializer
5981  ret <8 x i64> %4
5982}
5983
5984define <8 x i64> @stack_fold_porq_maskz_commuted(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
5985; CHECK-LABEL: stack_fold_porq_maskz_commuted:
5986; CHECK:       # %bb.0:
5987; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5988; CHECK-NEXT:    #APP
5989; CHECK-NEXT:    nop
5990; CHECK-NEXT:    #NO_APP
5991; CHECK-NEXT:    kmovd %edi, %k1
5992; CHECK-NEXT:    vorpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
5993; CHECK-NEXT:    retq
5994  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
5995  %2 = or <8 x i64> %a1, %a0
5996  %3 = bitcast i8 %mask to <8 x i1>
5997  %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> zeroinitializer
5998  ret <8 x i64> %4
5999}
6000
6001define <8 x i64> @stack_fold_psadbw(<64 x i8> %a0, <64 x i8> %a1) {
6002; CHECK-LABEL: stack_fold_psadbw:
6003; CHECK:       # %bb.0:
6004; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6005; CHECK-NEXT:    #APP
6006; CHECK-NEXT:    nop
6007; CHECK-NEXT:    #NO_APP
6008; CHECK-NEXT:    vpsadbw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
6009; CHECK-NEXT:    retq
6010  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6011  %2 = call <8 x i64> @llvm.x86.avx512.psad.bw.512(<64 x i8> %a0, <64 x i8> %a1)
6012  ret <8 x i64> %2
6013}
6014declare <8 x i64> @llvm.x86.avx512.psad.bw.512(<64 x i8>, <64 x i8>) nounwind readnone
6015
6016define <8 x i64> @stack_fold_psadbw_commute(<64 x i8> %a0, <64 x i8> %a1) {
6017; CHECK-LABEL: stack_fold_psadbw_commute:
6018; CHECK:       # %bb.0:
6019; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6020; CHECK-NEXT:    #APP
6021; CHECK-NEXT:    nop
6022; CHECK-NEXT:    #NO_APP
6023; CHECK-NEXT:    vpsadbw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
6024; CHECK-NEXT:    retq
6025  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
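  ; operand order is swapped here; psadbw is commutative, so the spilled operand still folds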
6026  %2 = call <8 x i64> @llvm.x86.avx512.psad.bw.512(<64 x i8> %a1, <64 x i8> %a0)
6027  ret <8 x i64> %2
6028}
6029
6030define <64 x i8> @stack_fold_pshufb_zmm(<64 x i8> %a0, <64 x i8> %a1) {
6031; CHECK-LABEL: stack_fold_pshufb_zmm:
6032; CHECK:       # %bb.0:
6033; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6034; CHECK-NEXT:    #APP
6035; CHECK-NEXT:    nop
6036; CHECK-NEXT:    #NO_APP
6037; CHECK-NEXT:    vpshufb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
6038; CHECK-NEXT:    retq
6039  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6040  %2 = call <64 x i8> @llvm.x86.avx512.pshuf.b.512(<64 x i8> %a0, <64 x i8> %a1)
6041  ret <64 x i8> %2
6042}
6043declare <64 x i8> @llvm.x86.avx512.pshuf.b.512(<64 x i8>, <64 x i8>)
6044
6045define <64 x i8> @stack_fold_pshufb_zmm_mask(<64 x i8>* %passthru, <64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
6046; CHECK-LABEL: stack_fold_pshufb_zmm_mask:
6047; CHECK:       # %bb.0:
6048; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6049; CHECK-NEXT:    #APP
6050; CHECK-NEXT:    nop
6051; CHECK-NEXT:    #NO_APP
6052; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm2
6053; CHECK-NEXT:    kmovq %rsi, %k1
6054; CHECK-NEXT:    vpshufb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
6055; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
6056; CHECK-NEXT:    retq
6057  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6058  %2 = load <64 x i8>, <64 x i8>* %passthru
6059  %3 = call <64 x i8> @llvm.x86.avx512.pshuf.b.512(<64 x i8> %a0, <64 x i8> %a1)
6060  %4 = bitcast i64 %mask to <64 x i1>
6061  %5 = select <64 x i1> %4, <64 x i8> %3, <64 x i8> %2
6062  ret <64 x i8> %5
6063}
6064
6065define <64 x i8> @stack_fold_pshufb_zmm_maskz(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
6066; CHECK-LABEL: stack_fold_pshufb_zmm_maskz:
6067; CHECK:       # %bb.0:
6068; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6069; CHECK-NEXT:    #APP
6070; CHECK-NEXT:    nop
6071; CHECK-NEXT:    #NO_APP
6072; CHECK-NEXT:    kmovq %rdi, %k1
6073; CHECK-NEXT:    vpshufb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
6074; CHECK-NEXT:    retq
6075  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6076  %2 = call <64 x i8> @llvm.x86.avx512.pshuf.b.512(<64 x i8> %a0, <64 x i8> %a1)
6077  %3 = bitcast i64 %mask to <64 x i1>
6078  %4 = select <64 x i1> %3, <64 x i8> %2, <64 x i8> zeroinitializer
6079  ret <64 x i8> %4
6080}
6081
6082define <16 x i32> @stack_fold_pshufd_zmm(<16 x i32> %a0) {
6083; CHECK-LABEL: stack_fold_pshufd_zmm:
6084; CHECK:       # %bb.0:
6085; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6086; CHECK-NEXT:    #APP
6087; CHECK-NEXT:    nop
6088; CHECK-NEXT:    #NO_APP
6089; CHECK-NEXT:    vpshufd $27, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
6090; CHECK-NEXT:    # zmm0 = mem[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
6091; CHECK-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
6092; CHECK-NEXT:    vpsubd %zmm1, %zmm0, %zmm0
6093; CHECK-NEXT:    retq
6094  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6095  %2 = shufflevector <16 x i32> %a0, <16 x i32> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
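  ; the add of splat(1) presumably forces the integer execution domain; llc materializes it above as vpternlogd all-ones followed by vpsubd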
6096  %3 = add <16 x i32> %2, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
6097  ret <16 x i32> %3
6098}
6099
6100define <16 x i32> @stack_fold_pshufd_zmm_mask(<16 x i32> %passthru, <16 x i32> %a0, i16 %mask) {
6101; CHECK-LABEL: stack_fold_pshufd_zmm_mask:
6102; CHECK:       # %bb.0:
6103; CHECK-NEXT:    subq $56, %rsp
6104; CHECK-NEXT:    .cfi_def_cfa_offset 64
6105; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6106; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6107; CHECK-NEXT:    #APP
6108; CHECK-NEXT:    nop
6109; CHECK-NEXT:    #NO_APP
6110; CHECK-NEXT:    kmovd %edi, %k1
6111; CHECK-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
6112; CHECK-NEXT:    vpshufd $27, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} # 64-byte Folded Reload
6113; CHECK-NEXT:    # zmm0 {%k1} = mem[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
6114; CHECK-NEXT:    addq $56, %rsp
6115; CHECK-NEXT:    .cfi_def_cfa_offset 8
6116; CHECK-NEXT:    retq
6117  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6118  %2 = shufflevector <16 x i32> %a0, <16 x i32> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
6119  %3 = bitcast i16 %mask to <16 x i1>
6120  %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> %passthru
6121  ret <16 x i32> %4
6122}
6123
6124define <16 x i32> @stack_fold_pshufd_zmm_maskz(<16 x i32> %a0, i16 %mask) {
6125; CHECK-LABEL: stack_fold_pshufd_zmm_maskz:
6126; CHECK:       # %bb.0:
6127; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6128; CHECK-NEXT:    #APP
6129; CHECK-NEXT:    nop
6130; CHECK-NEXT:    #NO_APP
6131; CHECK-NEXT:    kmovd %edi, %k1
6132; CHECK-NEXT:    vpshufd $27, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} {z} # 64-byte Folded Reload
6133; CHECK-NEXT:    # zmm0 {%k1} {z} = mem[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
6134; CHECK-NEXT:    retq
6135  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6136  %2 = shufflevector <16 x i32> %a0, <16 x i32> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
6137  %3 = bitcast i16 %mask to <16 x i1>
6138  %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer
6139  ret <16 x i32> %4
6140}
6141
6142define <32 x i16> @stack_fold_pshufhw_zmm(<32 x i16> %a0) {
6143; CHECK-LABEL: stack_fold_pshufhw_zmm:
6144; CHECK:       # %bb.0:
6145; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6146; CHECK-NEXT:    #APP
6147; CHECK-NEXT:    nop
6148; CHECK-NEXT:    #NO_APP
6149; CHECK-NEXT:    vpshufhw $27, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
6150; CHECK-NEXT:    # zmm0 = mem[0,1,2,3,7,6,5,4,8,9,10,11,15,14,13,12,16,17,18,19,23,22,21,20,24,25,26,27,31,30,29,28]
6151; CHECK-NEXT:    retq
6152  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6153  %2 = shufflevector <32 x i16> %a0, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 6, i32 5, i32 4, i32 8, i32 9, i32 10, i32 11, i32 15, i32 14, i32 13, i32 12, i32 16, i32 17, i32 18, i32 19, i32 23, i32 22, i32 21, i32 20, i32 24, i32 25, i32 26, i32 27, i32 31, i32 30, i32 29, i32 28>
6154  ret <32 x i16> %2
6155}
6156
6157define <32 x i16> @stack_fold_pshufhw_zmm_mask(<32 x i16> %passthru, <32 x i16> %a0, i32 %mask) {
6158; CHECK-LABEL: stack_fold_pshufhw_zmm_mask:
6159; CHECK:       # %bb.0:
6160; CHECK-NEXT:    subq $56, %rsp
6161; CHECK-NEXT:    .cfi_def_cfa_offset 64
6162; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6163; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6164; CHECK-NEXT:    #APP
6165; CHECK-NEXT:    nop
6166; CHECK-NEXT:    #NO_APP
6167; CHECK-NEXT:    kmovd %edi, %k1
6168; CHECK-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
6169; CHECK-NEXT:    vpshufhw $27, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} # 64-byte Folded Reload
6170; CHECK-NEXT:    # zmm0 {%k1} = mem[0,1,2,3,7,6,5,4,8,9,10,11,15,14,13,12,16,17,18,19,23,22,21,20,24,25,26,27,31,30,29,28]
6171; CHECK-NEXT:    addq $56, %rsp
6172; CHECK-NEXT:    .cfi_def_cfa_offset 8
6173; CHECK-NEXT:    retq
6174  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6175  %2 = shufflevector <32 x i16> %a0, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 6, i32 5, i32 4, i32 8, i32 9, i32 10, i32 11, i32 15, i32 14, i32 13, i32 12, i32 16, i32 17, i32 18, i32 19, i32 23, i32 22, i32 21, i32 20, i32 24, i32 25, i32 26, i32 27, i32 31, i32 30, i32 29, i32 28>
6176  %3 = bitcast i32 %mask to <32 x i1>
6177  %4 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> %passthru
6178  ret <32 x i16> %4
6179}
6180
6181define <32 x i16> @stack_fold_pshufhw_zmm_maskz(<32 x i16> %a0, i32 %mask) {
6182; CHECK-LABEL: stack_fold_pshufhw_zmm_maskz:
6183; CHECK:       # %bb.0:
6184; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6185; CHECK-NEXT:    #APP
6186; CHECK-NEXT:    nop
6187; CHECK-NEXT:    #NO_APP
6188; CHECK-NEXT:    kmovd %edi, %k1
6189; CHECK-NEXT:    vpshufhw $27, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} {z} # 64-byte Folded Reload
6190; CHECK-NEXT:    # zmm0 {%k1} {z} = mem[0,1,2,3,7,6,5,4,8,9,10,11,15,14,13,12,16,17,18,19,23,22,21,20,24,25,26,27,31,30,29,28]
6191; CHECK-NEXT:    retq
6192  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6193  %2 = shufflevector <32 x i16> %a0, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 6, i32 5, i32 4, i32 8, i32 9, i32 10, i32 11, i32 15, i32 14, i32 13, i32 12, i32 16, i32 17, i32 18, i32 19, i32 23, i32 22, i32 21, i32 20, i32 24, i32 25, i32 26, i32 27, i32 31, i32 30, i32 29, i32 28>
6194  %3 = bitcast i32 %mask to <32 x i1>
6195  %4 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> zeroinitializer
6196  ret <32 x i16> %4
6197}
6198
6199define <32 x i16> @stack_fold_pshuflw_zmm(<32 x i16> %a0) {
6200; CHECK-LABEL: stack_fold_pshuflw_zmm:
6201; CHECK:       # %bb.0:
6202; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6203; CHECK-NEXT:    #APP
6204; CHECK-NEXT:    nop
6205; CHECK-NEXT:    #NO_APP
6206; CHECK-NEXT:    vpshuflw $27, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
6207; CHECK-NEXT:    # zmm0 = mem[3,2,1,0,4,5,6,7,11,10,9,8,12,13,14,15,19,18,17,16,20,21,22,23,27,26,25,24,28,29,30,31]
6208; CHECK-NEXT:    retq
6209  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6210  %2 = shufflevector <32 x i16> %a0, <32 x i16> undef, <32 x i32> <i32 3, i32 2, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 9, i32 8, i32 12, i32 13, i32 14, i32 15, i32 19, i32 18, i32 17, i32 16, i32 20, i32 21, i32 22, i32 23, i32 27, i32 26, i32 25, i32 24, i32 28, i32 29, i32 30, i32 31>
6211  ret <32 x i16> %2
6212}
6213
6214define <32 x i16> @stack_fold_pshuflw_zmm_mask(<32 x i16> %passthru, <32 x i16> %a0, i32 %mask) {
6215; CHECK-LABEL: stack_fold_pshuflw_zmm_mask:
6216; CHECK:       # %bb.0:
6217; CHECK-NEXT:    subq $56, %rsp
6218; CHECK-NEXT:    .cfi_def_cfa_offset 64
6219; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6220; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6221; CHECK-NEXT:    #APP
6222; CHECK-NEXT:    nop
6223; CHECK-NEXT:    #NO_APP
6224; CHECK-NEXT:    kmovd %edi, %k1
6225; CHECK-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
6226; CHECK-NEXT:    vpshuflw $27, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} # 64-byte Folded Reload
6227; CHECK-NEXT:    # zmm0 {%k1} = mem[3,2,1,0,4,5,6,7,11,10,9,8,12,13,14,15,19,18,17,16,20,21,22,23,27,26,25,24,28,29,30,31]
6228; CHECK-NEXT:    addq $56, %rsp
6229; CHECK-NEXT:    .cfi_def_cfa_offset 8
6230; CHECK-NEXT:    retq
6231  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6232  %2 = shufflevector <32 x i16> %a0, <32 x i16> undef, <32 x i32> <i32 3, i32 2, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 9, i32 8, i32 12, i32 13, i32 14, i32 15, i32 19, i32 18, i32 17, i32 16, i32 20, i32 21, i32 22, i32 23, i32 27, i32 26, i32 25, i32 24, i32 28, i32 29, i32 30, i32 31>
6233  %3 = bitcast i32 %mask to <32 x i1>
6234  %4 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> %passthru
6235  ret <32 x i16> %4
6236}
6237
6238define <32 x i16> @stack_fold_pshuflw_zmm_maskz(<32 x i16> %a0, i32 %mask) {
6239; CHECK-LABEL: stack_fold_pshuflw_zmm_maskz:
6240; CHECK:       # %bb.0:
6241; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6242; CHECK-NEXT:    #APP
6243; CHECK-NEXT:    nop
6244; CHECK-NEXT:    #NO_APP
6245; CHECK-NEXT:    kmovd %edi, %k1
6246; CHECK-NEXT:    vpshuflw $27, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} {z} # 64-byte Folded Reload
6247; CHECK-NEXT:    # zmm0 {%k1} {z} = mem[3,2,1,0,4,5,6,7,11,10,9,8,12,13,14,15,19,18,17,16,20,21,22,23,27,26,25,24,28,29,30,31]
6248; CHECK-NEXT:    retq
6249  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6250  %2 = shufflevector <32 x i16> %a0, <32 x i16> undef, <32 x i32> <i32 3, i32 2, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 9, i32 8, i32 12, i32 13, i32 14, i32 15, i32 19, i32 18, i32 17, i32 16, i32 20, i32 21, i32 22, i32 23, i32 27, i32 26, i32 25, i32 24, i32 28, i32 29, i32 30, i32 31>
6251  %3 = bitcast i32 %mask to <32 x i1>
6252  %4 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> zeroinitializer
6253  ret <32 x i16> %4
6254}
6255
6256define <16 x i32> @stack_fold_pslld(<16 x i32> %a0, <4 x i32> %a1) {
6257; CHECK-LABEL: stack_fold_pslld:
6258; CHECK:       # %bb.0:
6259; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6260; CHECK-NEXT:    #APP
6261; CHECK-NEXT:    nop
6262; CHECK-NEXT:    #NO_APP
6263; CHECK-NEXT:    vpslld {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 16-byte Folded Reload
6264; CHECK-NEXT:    retq
6265  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
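  ; the shift count is a <4 x i32> xmm operand, so only a 16-byte spill and folded reload are expected here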
6266  %2 = call <16 x i32> @llvm.x86.avx512.psll.d.512(<16 x i32> %a0, <4 x i32> %a1)
6267  ret <16 x i32> %2
6268}
6269declare <16 x i32> @llvm.x86.avx512.psll.d.512(<16 x i32>, <4 x i32>) nounwind readnone
6270
6271define <16 x i32> @stack_fold_pslld_mask(<16 x i32>* %passthru, <16 x i32> %a0, <4 x i32> %a1, i16 %mask) {
6272; CHECK-LABEL: stack_fold_pslld_mask:
6273; CHECK:       # %bb.0:
6274; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6275; CHECK-NEXT:    #APP
6276; CHECK-NEXT:    nop
6277; CHECK-NEXT:    #NO_APP
6278; CHECK-NEXT:    kmovd %esi, %k1
6279; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm2
6280; CHECK-NEXT:    vpslld {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 16-byte Folded Reload
6281; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
6282; CHECK-NEXT:    retq
6283  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6284  %2 = call <16 x i32> @llvm.x86.avx512.psll.d.512(<16 x i32> %a0, <4 x i32> %a1)
6285  %3 = bitcast i16 %mask to <16 x i1>
6286  %4 = load <16 x i32>, <16 x i32>* %passthru
6287  %5 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> %4
6288  ret <16 x i32> %5
6289}
6290
6291define <16 x i32> @stack_fold_pslld_maskz(<16 x i32> %a0, <4 x i32> %a1, i16 %mask) {
6292; CHECK-LABEL: stack_fold_pslld_maskz:
6293; CHECK:       # %bb.0:
6294; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6295; CHECK-NEXT:    #APP
6296; CHECK-NEXT:    nop
6297; CHECK-NEXT:    #NO_APP
6298; CHECK-NEXT:    kmovd %edi, %k1
6299; CHECK-NEXT:    vpslld {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 16-byte Folded Reload
6300; CHECK-NEXT:    retq
6301  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6302  %2 = call <16 x i32> @llvm.x86.avx512.psll.d.512(<16 x i32> %a0, <4 x i32> %a1)
6303  %3 = bitcast i16 %mask to <16 x i1>
6304  %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer
6305  ret <16 x i32> %4
6306}
6307
6308define <16 x i32> @stack_fold_pslldi(<16 x i32> %a0) {
6309; CHECK-LABEL: stack_fold_pslldi:
6310; CHECK:       # %bb.0:
6311; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6312; CHECK-NEXT:    #APP
6313; CHECK-NEXT:    nop
6314; CHECK-NEXT:    #NO_APP
6315; CHECK-NEXT:    vpslld $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
6316; CHECK-NEXT:    retq
6317  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6318  %2 = call <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32> %a0, i32 1)
6319  ret <16 x i32> %2
6320}
6321declare <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32>, i32) nounwind readnone
6322
6323define <16 x i32> @stack_fold_pslldi_mask(<16 x i32>* %passthru, <16 x i32> %a0, i16 %mask) {
6324; CHECK-LABEL: stack_fold_pslldi_mask:
6325; CHECK:       # %bb.0:
6326; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6327; CHECK-NEXT:    #APP
6328; CHECK-NEXT:    nop
6329; CHECK-NEXT:    #NO_APP
6330; CHECK-NEXT:    kmovd %esi, %k1
6331; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm1
6332; CHECK-NEXT:    vpslld $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 {%k1} # 64-byte Folded Reload
6333; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
6334; CHECK-NEXT:    retq
6335  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6336  %2 = call <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32> %a0, i32 1)
6337  %3 = bitcast i16 %mask to <16 x i1>
6338  %4 = load <16 x i32>, <16 x i32>* %passthru
6339  %5 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> %4
6340  ret <16 x i32> %5
6341}
6342
6343define <16 x i32> @stack_fold_pslldi_maskz(<16 x i32> %a0, i16 %mask) {
6344; CHECK-LABEL: stack_fold_pslldi_maskz:
6345; CHECK:       # %bb.0:
6346; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6347; CHECK-NEXT:    #APP
6348; CHECK-NEXT:    nop
6349; CHECK-NEXT:    #NO_APP
6350; CHECK-NEXT:    kmovd %edi, %k1
6351; CHECK-NEXT:    vpslld $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} {z} # 64-byte Folded Reload
6352; CHECK-NEXT:    retq
6353  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6354  %2 = call <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32> %a0, i32 1)
6355  %3 = bitcast i16 %mask to <16 x i1>
6356  %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer
6357  ret <16 x i32> %4
6358}
6359
6360define <64 x i8> @stack_fold_pslldq(<64 x i8> %a, <64 x i8> %b) {
6361; CHECK-LABEL: stack_fold_pslldq:
6362; CHECK:       # %bb.0:
6363; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6364; CHECK-NEXT:    #APP
6365; CHECK-NEXT:    nop
6366; CHECK-NEXT:    #NO_APP
6367; CHECK-NEXT:    vpslldq $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
6368; CHECK-NEXT:    # zmm0 = zero,mem[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],zero,mem[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30],zero,mem[32,33,34,35,36,37,38,39,40,41,42,43,44,45,46],zero,mem[48,49,50,51,52,53,54,55,56,57,58,59,60,61,62]
6369; CHECK-NEXT:    retq
6370  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
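  ; indices 79, 95, 111 and 127 pick zero bytes from the second (zeroinitializer) operand, i.e. a 1-byte left shift within each 128-bit lane (vpslldq $1)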
6371  %2 = shufflevector <64 x i8> %a, <64 x i8> zeroinitializer, <64 x i32> <i32 79, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 95, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 111, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 127, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62>
6372  ret <64 x i8> %2
6373}
6374
6375define <8 x i64> @stack_fold_psllq(<8 x i64> %a0, <2 x i64> %a1) {
6376; CHECK-LABEL: stack_fold_psllq:
6377; CHECK:       # %bb.0:
6378; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6379; CHECK-NEXT:    #APP
6380; CHECK-NEXT:    nop
6381; CHECK-NEXT:    #NO_APP
6382; CHECK-NEXT:    vpsllq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 16-byte Folded Reload
6383; CHECK-NEXT:    retq
6384  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6385  %2 = call <8 x i64> @llvm.x86.avx512.psll.q.512(<8 x i64> %a0, <2 x i64> %a1)
6386  ret <8 x i64> %2
6387}
6388declare <8 x i64> @llvm.x86.avx512.psll.q.512(<8 x i64>, <2 x i64>) nounwind readnone
6389
6390define <8 x i64> @stack_fold_psllqi(<8 x i64> %a0) {
6391; CHECK-LABEL: stack_fold_psllqi:
6392; CHECK:       # %bb.0:
6393; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6394; CHECK-NEXT:    #APP
6395; CHECK-NEXT:    nop
6396; CHECK-NEXT:    #NO_APP
6397; CHECK-NEXT:    vpsllq $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
6398; CHECK-NEXT:    retq
6399  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6400  %2 = call <8 x i64> @llvm.x86.avx512.pslli.q.512(<8 x i64> %a0, i32 1)
6401  ret <8 x i64> %2
6402}
6403declare <8 x i64> @llvm.x86.avx512.pslli.q.512(<8 x i64>, i32) nounwind readnone
6404
6405define <16 x i32> @stack_fold_psllvd(<16 x i32> %a0, <16 x i32> %a1) {
6406; CHECK-LABEL: stack_fold_psllvd:
6407; CHECK:       # %bb.0:
6408; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6409; CHECK-NEXT:    #APP
6410; CHECK-NEXT:    nop
6411; CHECK-NEXT:    #NO_APP
6412; CHECK-NEXT:    vpsllvd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
6413; CHECK-NEXT:    retq
6414  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6415  %2 = call <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32> %a0, <16 x i32> %a1)
6416  ret <16 x i32> %2
6417}
6418declare <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32>, <16 x i32>) nounwind readnone
6419
6420define <16 x i32> @stack_fold_psllvd_mask(<16 x i32>* %passthru, <16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
6421; CHECK-LABEL: stack_fold_psllvd_mask:
6422; CHECK:       # %bb.0:
6423; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6424; CHECK-NEXT:    #APP
6425; CHECK-NEXT:    nop
6426; CHECK-NEXT:    #NO_APP
6427; CHECK-NEXT:    kmovd %esi, %k1
6428; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm2
6429; CHECK-NEXT:    vpsllvd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
6430; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
6431; CHECK-NEXT:    retq
6432  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6433  %2 = call <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32> %a0, <16 x i32> %a1)
6434  %3 = bitcast i16 %mask to <16 x i1>
6435  %4 = load <16 x i32>, <16 x i32>* %passthru
6436  %5 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> %4
6437  ret <16 x i32> %5
6438}
6439
6440define <16 x i32> @stack_fold_psllvd_maskz(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
6441; CHECK-LABEL: stack_fold_psllvd_maskz:
6442; CHECK:       # %bb.0:
6443; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6444; CHECK-NEXT:    #APP
6445; CHECK-NEXT:    nop
6446; CHECK-NEXT:    #NO_APP
6447; CHECK-NEXT:    kmovd %edi, %k1
6448; CHECK-NEXT:    vpsllvd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
6449; CHECK-NEXT:    retq
6450  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6451  %2 = call <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32> %a0, <16 x i32> %a1)
6452  %3 = bitcast i16 %mask to <16 x i1>
6453  %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer
6454  ret <16 x i32> %4
6455}
6456
6457define <8 x i64> @stack_fold_psllvq(<8 x i64> %a0, <8 x i64> %a1) {
6458; CHECK-LABEL: stack_fold_psllvq:
6459; CHECK:       # %bb.0:
6460; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6461; CHECK-NEXT:    #APP
6462; CHECK-NEXT:    nop
6463; CHECK-NEXT:    #NO_APP
6464; CHECK-NEXT:    vpsllvq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
6465; CHECK-NEXT:    retq
6466  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6467  %2 = call <8 x i64> @llvm.x86.avx512.psllv.q.512(<8 x i64> %a0, <8 x i64> %a1)
6468  ret <8 x i64> %2
6469}
6470declare <8 x i64> @llvm.x86.avx512.psllv.q.512(<8 x i64>, <8 x i64>) nounwind readnone
6471
6472define <32 x i16> @stack_fold_psllvw(<32 x i16> %a0, <32 x i16> %a1) {
6473; CHECK-LABEL: stack_fold_psllvw:
6474; CHECK:       # %bb.0:
6475; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6476; CHECK-NEXT:    #APP
6477; CHECK-NEXT:    nop
6478; CHECK-NEXT:    #NO_APP
6479; CHECK-NEXT:    vpsllvw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
6480; CHECK-NEXT:    retq
6481  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6482  %2 = call <32 x i16> @llvm.x86.avx512.psllv.w.512(<32 x i16> %a0, <32 x i16> %a1)
6483  ret <32 x i16> %2
6484}
6485declare <32 x i16> @llvm.x86.avx512.psllv.w.512(<32 x i16>, <32 x i16>) nounwind readnone
6486
6487define <32 x i16> @stack_fold_psllw(<32 x i16> %a0, <8 x i16> %a1) {
6488; CHECK-LABEL: stack_fold_psllw:
6489; CHECK:       # %bb.0:
6490; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6491; CHECK-NEXT:    #APP
6492; CHECK-NEXT:    nop
6493; CHECK-NEXT:    #NO_APP
6494; CHECK-NEXT:    vpsllw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 16-byte Folded Reload
6495; CHECK-NEXT:    retq
6496  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6497  %2 = call <32 x i16> @llvm.x86.avx512.psll.w.512(<32 x i16> %a0, <8 x i16> %a1)
6498  ret <32 x i16> %2
6499}
6500declare <32 x i16> @llvm.x86.avx512.psll.w.512(<32 x i16>, <8 x i16>) nounwind readnone
6501
6502define <32 x i16> @stack_fold_psllwi(<32 x i16> %a0) {
6503; CHECK-LABEL: stack_fold_psllwi:
6504; CHECK:       # %bb.0:
6505; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6506; CHECK-NEXT:    #APP
6507; CHECK-NEXT:    nop
6508; CHECK-NEXT:    #NO_APP
6509; CHECK-NEXT:    vpsllw $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
6510; CHECK-NEXT:    retq
6511  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6512  %2 = call <32 x i16> @llvm.x86.avx512.pslli.w.512(<32 x i16> %a0, i32 1)
6513  ret <32 x i16> %2
6514}
6515declare <32 x i16> @llvm.x86.avx512.pslli.w.512(<32 x i16>, i32) nounwind readnone
6516
6517define <16 x i32> @stack_fold_psrad(<16 x i32> %a0, <4 x i32> %a1) {
6518; CHECK-LABEL: stack_fold_psrad:
6519; CHECK:       # %bb.0:
6520; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6521; CHECK-NEXT:    #APP
6522; CHECK-NEXT:    nop
6523; CHECK-NEXT:    #NO_APP
6524; CHECK-NEXT:    vpsrad {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 16-byte Folded Reload
6525; CHECK-NEXT:    retq
6526  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6527  %2 = call <16 x i32> @llvm.x86.avx512.psra.d.512(<16 x i32> %a0, <4 x i32> %a1)
6528  ret <16 x i32> %2
6529}
6530declare <16 x i32> @llvm.x86.avx512.psra.d.512(<16 x i32>, <4 x i32>) nounwind readnone
6531
6532define <16 x i32> @stack_fold_psradi(<16 x i32> %a0) {
6533; CHECK-LABEL: stack_fold_psradi:
6534; CHECK:       # %bb.0:
6535; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6536; CHECK-NEXT:    #APP
6537; CHECK-NEXT:    nop
6538; CHECK-NEXT:    #NO_APP
6539; CHECK-NEXT:    vpsrad $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
6540; CHECK-NEXT:    retq
6541  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6542  %2 = call <16 x i32> @llvm.x86.avx512.psrai.d.512(<16 x i32> %a0, i32 1)
6543  ret <16 x i32> %2
6544}
6545declare <16 x i32> @llvm.x86.avx512.psrai.d.512(<16 x i32>, i32) nounwind readnone
6546
6547define <8 x i64> @stack_fold_psraq(<8 x i64> %a0, <2 x i64> %a1) {
6548; CHECK-LABEL: stack_fold_psraq:
6549; CHECK:       # %bb.0:
6550; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6551; CHECK-NEXT:    #APP
6552; CHECK-NEXT:    nop
6553; CHECK-NEXT:    #NO_APP
6554; CHECK-NEXT:    vpsraq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 16-byte Folded Reload
6555; CHECK-NEXT:    retq
6556  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6557  %2 = call <8 x i64> @llvm.x86.avx512.psra.q.512(<8 x i64> %a0, <2 x i64> %a1)
6558  ret <8 x i64> %2
6559}
6560declare <8 x i64> @llvm.x86.avx512.psra.q.512(<8 x i64>, <2 x i64>) nounwind readnone
6561
6562define <8 x i64> @stack_fold_psraqi(<8 x i64> %a0) {
6563; CHECK-LABEL: stack_fold_psraqi:
6564; CHECK:       # %bb.0:
6565; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6566; CHECK-NEXT:    #APP
6567; CHECK-NEXT:    nop
6568; CHECK-NEXT:    #NO_APP
6569; CHECK-NEXT:    vpsraq $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
6570; CHECK-NEXT:    retq
6571  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6572  %2 = call <8 x i64> @llvm.x86.avx512.psrai.q.512(<8 x i64> %a0, i32 1)
6573  ret <8 x i64> %2
6574}
6575declare <8 x i64> @llvm.x86.avx512.psrai.q.512(<8 x i64>, i32) nounwind readnone
6576
6577define <16 x i32> @stack_fold_psravd(<16 x i32> %a0, <16 x i32> %a1) {
6578; CHECK-LABEL: stack_fold_psravd:
6579; CHECK:       # %bb.0:
6580; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6581; CHECK-NEXT:    #APP
6582; CHECK-NEXT:    nop
6583; CHECK-NEXT:    #NO_APP
6584; CHECK-NEXT:    vpsravd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
6585; CHECK-NEXT:    retq
6586  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6587  %2 = call <16 x i32> @llvm.x86.avx512.psrav.d.512(<16 x i32> %a0, <16 x i32> %a1)
6588  ret <16 x i32> %2
6589}
6590declare <16 x i32> @llvm.x86.avx512.psrav.d.512(<16 x i32>, <16 x i32>) nounwind readnone
6591
6592define <8 x i64> @stack_fold_psravq(<8 x i64> %a0, <8 x i64> %a1) {
6593; CHECK-LABEL: stack_fold_psravq:
6594; CHECK:       # %bb.0:
6595; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6596; CHECK-NEXT:    #APP
6597; CHECK-NEXT:    nop
6598; CHECK-NEXT:    #NO_APP
6599; CHECK-NEXT:    vpsravq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
6600; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6602  %2 = call <8 x i64> @llvm.x86.avx512.psrav.q.512(<8 x i64> %a0, <8 x i64> %a1)
6603  ret <8 x i64> %2
6604}
6605declare <8 x i64> @llvm.x86.avx512.psrav.q.512(<8 x i64>, <8 x i64>) nounwind readnone
6606
6607define <32 x i16> @stack_fold_psravw(<32 x i16> %a0, <32 x i16> %a1) {
6608; CHECK-LABEL: stack_fold_psravw:
6609; CHECK:       # %bb.0:
6610; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6611; CHECK-NEXT:    #APP
6612; CHECK-NEXT:    nop
6613; CHECK-NEXT:    #NO_APP
6614; CHECK-NEXT:    vpsravw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
6615; CHECK-NEXT:    retq
6616  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6617  %2 = call <32 x i16> @llvm.x86.avx512.psrav.w.512(<32 x i16> %a0, <32 x i16> %a1)
6618  ret <32 x i16> %2
6619}
6620declare <32 x i16> @llvm.x86.avx512.psrav.w.512(<32 x i16>, <32 x i16>) nounwind readnone
6621
6622define <32 x i16> @stack_fold_psraw(<32 x i16> %a0, <8 x i16> %a1) {
6623; CHECK-LABEL: stack_fold_psraw:
6624; CHECK:       # %bb.0:
6625; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6626; CHECK-NEXT:    #APP
6627; CHECK-NEXT:    nop
6628; CHECK-NEXT:    #NO_APP
6629; CHECK-NEXT:    vpsraw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 16-byte Folded Reload
6630; CHECK-NEXT:    retq
6631  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6632  %2 = call <32 x i16> @llvm.x86.avx512.psra.w.512(<32 x i16> %a0, <8 x i16> %a1)
6633  ret <32 x i16> %2
6634}
6635declare <32 x i16> @llvm.x86.avx512.psra.w.512(<32 x i16>, <8 x i16>) nounwind readnone
6636
6637define <32 x i16> @stack_fold_psrawi(<32 x i16> %a0) {
6638; CHECK-LABEL: stack_fold_psrawi:
6639; CHECK:       # %bb.0:
6640; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6641; CHECK-NEXT:    #APP
6642; CHECK-NEXT:    nop
6643; CHECK-NEXT:    #NO_APP
6644; CHECK-NEXT:    vpsraw $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
6645; CHECK-NEXT:    retq
6646  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6647  %2 = call <32 x i16> @llvm.x86.avx512.psrai.w.512(<32 x i16> %a0, i32 1)
6648  ret <32 x i16> %2
6649}
6650declare <32 x i16> @llvm.x86.avx512.psrai.w.512(<32 x i16>, i32) nounwind readnone
6651
6652define <16 x i32> @stack_fold_psrld(<16 x i32> %a0, <4 x i32> %a1) {
6653; CHECK-LABEL: stack_fold_psrld:
6654; CHECK:       # %bb.0:
6655; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6656; CHECK-NEXT:    #APP
6657; CHECK-NEXT:    nop
6658; CHECK-NEXT:    #NO_APP
6659; CHECK-NEXT:    vpsrld {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 16-byte Folded Reload
6660; CHECK-NEXT:    retq
6661  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6662  %2 = call <16 x i32> @llvm.x86.avx512.psrl.d.512(<16 x i32> %a0, <4 x i32> %a1)
6663  ret <16 x i32> %2
6664}
6665declare <16 x i32> @llvm.x86.avx512.psrl.d.512(<16 x i32>, <4 x i32>) nounwind readnone
6666
6667define <16 x i32> @stack_fold_psrldi(<16 x i32> %a0) {
6668; CHECK-LABEL: stack_fold_psrldi:
6669; CHECK:       # %bb.0:
6670; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6671; CHECK-NEXT:    #APP
6672; CHECK-NEXT:    nop
6673; CHECK-NEXT:    #NO_APP
6674; CHECK-NEXT:    vpsrld $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
6675; CHECK-NEXT:    retq
6676  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6677  %2 = call <16 x i32> @llvm.x86.avx512.psrli.d.512(<16 x i32> %a0, i32 1)
6678  ret <16 x i32> %2
6679}
6680declare <16 x i32> @llvm.x86.avx512.psrli.d.512(<16 x i32>, i32) nounwind readnone
6681
6682define <64 x i8> @stack_fold_psrldq(<64 x i8> %a, <64 x i8> %b) {
6683; CHECK-LABEL: stack_fold_psrldq:
6684; CHECK:       # %bb.0:
6685; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6686; CHECK-NEXT:    #APP
6687; CHECK-NEXT:    nop
6688; CHECK-NEXT:    #NO_APP
6689; CHECK-NEXT:    vpsrldq $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
6690; CHECK-NEXT:    # zmm0 = mem[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,mem[18,19,20,21,22,23,24,25,26,27,28,29,30,31],zero,zero,mem[34,35,36,37,38,39,40,41,42,43,44,45,46,47],zero,zero,mem[50,51,52,53,54,55,56,57,58,59,60,61,62,63],zero,zero
6691; CHECK-NEXT:    retq
6692  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6693  %2 = shufflevector <64 x i8> %a, <64 x i8> zeroinitializer, <64 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 64, i32 64, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 64, i32 64, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 64, i32 64, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 64>
6694  ret <64 x i8> %2
6695}
6696
6697define <8 x i64> @stack_fold_psrlq(<8 x i64> %a0, <2 x i64> %a1) {
6698; CHECK-LABEL: stack_fold_psrlq:
6699; CHECK:       # %bb.0:
6700; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6701; CHECK-NEXT:    #APP
6702; CHECK-NEXT:    nop
6703; CHECK-NEXT:    #NO_APP
6704; CHECK-NEXT:    vpsrlq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 16-byte Folded Reload
6705; CHECK-NEXT:    retq
6706  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6707  %2 = call <8 x i64> @llvm.x86.avx512.psrl.q.512(<8 x i64> %a0, <2 x i64> %a1)
6708  ret <8 x i64> %2
6709}
6710declare <8 x i64> @llvm.x86.avx512.psrl.q.512(<8 x i64>, <2 x i64>) nounwind readnone
6711
6712define <8 x i64> @stack_fold_psrlqi(<8 x i64> %a0) {
6713; CHECK-LABEL: stack_fold_psrlqi:
6714; CHECK:       # %bb.0:
6715; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6716; CHECK-NEXT:    #APP
6717; CHECK-NEXT:    nop
6718; CHECK-NEXT:    #NO_APP
6719; CHECK-NEXT:    vpsrlq $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
6720; CHECK-NEXT:    retq
6721  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6722  %2 = call <8 x i64> @llvm.x86.avx512.psrli.q.512(<8 x i64> %a0, i32 1)
6723  ret <8 x i64> %2
6724}
6725declare <8 x i64> @llvm.x86.avx512.psrli.q.512(<8 x i64>, i32) nounwind readnone
6726
6727define <16 x i32> @stack_fold_psrlvd(<16 x i32> %a0, <16 x i32> %a1) {
6728; CHECK-LABEL: stack_fold_psrlvd:
6729; CHECK:       # %bb.0:
6730; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6731; CHECK-NEXT:    #APP
6732; CHECK-NEXT:    nop
6733; CHECK-NEXT:    #NO_APP
6734; CHECK-NEXT:    vpsrlvd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
6735; CHECK-NEXT:    retq
6736  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6737  %2 = call <16 x i32> @llvm.x86.avx512.psrlv.d.512(<16 x i32> %a0, <16 x i32> %a1)
6738  ret <16 x i32> %2
6739}
6740declare <16 x i32> @llvm.x86.avx512.psrlv.d.512(<16 x i32>, <16 x i32>) nounwind readnone
6741
6742define <8 x i64> @stack_fold_psrlvq(<8 x i64> %a0, <8 x i64> %a1) {
6743; CHECK-LABEL: stack_fold_psrlvq:
6744; CHECK:       # %bb.0:
6745; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6746; CHECK-NEXT:    #APP
6747; CHECK-NEXT:    nop
6748; CHECK-NEXT:    #NO_APP
6749; CHECK-NEXT:    vpsrlvq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
6750; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6752  %2 = call <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64> %a0, <8 x i64> %a1)
6753  ret <8 x i64> %2
6754}
6755declare <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64>, <8 x i64>) nounwind readnone
6756
6757define <32 x i16> @stack_fold_psrlvw(<32 x i16> %a0, <32 x i16> %a1) {
6758; CHECK-LABEL: stack_fold_psrlvw:
6759; CHECK:       # %bb.0:
6760; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6761; CHECK-NEXT:    #APP
6762; CHECK-NEXT:    nop
6763; CHECK-NEXT:    #NO_APP
6764; CHECK-NEXT:    vpsrlvw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
6765; CHECK-NEXT:    retq
6766  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6767  %2 = call <32 x i16> @llvm.x86.avx512.psrlv.w.512(<32 x i16> %a0, <32 x i16> %a1)
6768  ret <32 x i16> %2
6769}
6770declare <32 x i16> @llvm.x86.avx512.psrlv.w.512(<32 x i16>, <32 x i16>) nounwind readnone
6771
6772define <32 x i16> @stack_fold_psrlw(<32 x i16> %a0, <8 x i16> %a1) {
6773; CHECK-LABEL: stack_fold_psrlw:
6774; CHECK:       # %bb.0:
6775; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6776; CHECK-NEXT:    #APP
6777; CHECK-NEXT:    nop
6778; CHECK-NEXT:    #NO_APP
6779; CHECK-NEXT:    vpsrlw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 16-byte Folded Reload
6780; CHECK-NEXT:    retq
6781  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6782  %2 = call <32 x i16> @llvm.x86.avx512.psrl.w.512(<32 x i16> %a0, <8 x i16> %a1)
6783  ret <32 x i16> %2
6784}
6785declare <32 x i16> @llvm.x86.avx512.psrl.w.512(<32 x i16>, <8 x i16>) nounwind readnone
6786
6787define <32 x i16> @stack_fold_psrlwi(<32 x i16> %a0) {
6788; CHECK-LABEL: stack_fold_psrlwi:
6789; CHECK:       # %bb.0:
6790; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6791; CHECK-NEXT:    #APP
6792; CHECK-NEXT:    nop
6793; CHECK-NEXT:    #NO_APP
6794; CHECK-NEXT:    vpsrlw $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
6795; CHECK-NEXT:    retq
6796  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6797  %2 = call <32 x i16> @llvm.x86.avx512.psrli.w.512(<32 x i16> %a0, i32 1)
6798  ret <32 x i16> %2
6799}
6800declare <32 x i16> @llvm.x86.avx512.psrli.w.512(<32 x i16>, i32) nounwind readnone
6801
6802define <64 x i8> @stack_fold_psubb(<64 x i8> %a0, <64 x i8> %a1) {
6803; CHECK-LABEL: stack_fold_psubb:
6804; CHECK:       # %bb.0:
6805; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6806; CHECK-NEXT:    #APP
6807; CHECK-NEXT:    nop
6808; CHECK-NEXT:    #NO_APP
6809; CHECK-NEXT:    vpsubb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
6810; CHECK-NEXT:    retq
6811  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6812  %2 = sub <64 x i8> %a0, %a1
6813  ret <64 x i8> %2
6814}
6815
6816define <16 x i32> @stack_fold_psubd(<16 x i32> %a0, <16 x i32> %a1) {
6817; CHECK-LABEL: stack_fold_psubd:
6818; CHECK:       # %bb.0:
6819; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6820; CHECK-NEXT:    #APP
6821; CHECK-NEXT:    nop
6822; CHECK-NEXT:    #NO_APP
6823; CHECK-NEXT:    vpsubd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
6824; CHECK-NEXT:    retq
6825  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6826  %2 = sub <16 x i32> %a0, %a1
6827  ret <16 x i32> %2
6828}
6829
6830define <8 x i64> @stack_fold_psubq(<8 x i64> %a0, <8 x i64> %a1) {
6831; CHECK-LABEL: stack_fold_psubq:
6832; CHECK:       # %bb.0:
6833; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6834; CHECK-NEXT:    #APP
6835; CHECK-NEXT:    nop
6836; CHECK-NEXT:    #NO_APP
6837; CHECK-NEXT:    vpsubq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
6838; CHECK-NEXT:    retq
6839  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6840  %2 = sub <8 x i64> %a0, %a1
6841  ret <8 x i64> %2
6842}
6843
6844define <64 x i8> @stack_fold_psubsb(<64 x i8> %a0, <64 x i8> %a1) {
6845; CHECK-LABEL: stack_fold_psubsb:
6846; CHECK:       # %bb.0:
6847; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6848; CHECK-NEXT:    #APP
6849; CHECK-NEXT:    nop
6850; CHECK-NEXT:    #NO_APP
6851; CHECK-NEXT:    vpsubsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
6852; CHECK-NEXT:    retq
6853  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6854  %2 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> %a0, <64 x i8> %a1)
6855  ret <64 x i8> %2
6856}
6857
6858define <32 x i16> @stack_fold_psubsw(<32 x i16> %a0, <32 x i16> %a1) {
6859; CHECK-LABEL: stack_fold_psubsw:
6860; CHECK:       # %bb.0:
6861; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6862; CHECK-NEXT:    #APP
6863; CHECK-NEXT:    nop
6864; CHECK-NEXT:    #NO_APP
6865; CHECK-NEXT:    vpsubsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
6866; CHECK-NEXT:    retq
6867  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6868  %2 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> %a0, <32 x i16> %a1)
6869  ret <32 x i16> %2
6870}
6871
6872define <64 x i8> @stack_fold_psubusb(<64 x i8> %a0, <64 x i8> %a1) {
6873; CHECK-LABEL: stack_fold_psubusb:
6874; CHECK:       # %bb.0:
6875; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6876; CHECK-NEXT:    #APP
6877; CHECK-NEXT:    nop
6878; CHECK-NEXT:    #NO_APP
6879; CHECK-NEXT:    vpsubusb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
6880; CHECK-NEXT:    retq
6881  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6882  %2 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> %a0, <64 x i8> %a1)
6883  ret <64 x i8> %2
6884}
6885
6886define <32 x i16> @stack_fold_psubusw(<32 x i16> %a0, <32 x i16> %a1) {
6887; CHECK-LABEL: stack_fold_psubusw:
6888; CHECK:       # %bb.0:
6889; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6890; CHECK-NEXT:    #APP
6891; CHECK-NEXT:    nop
6892; CHECK-NEXT:    #NO_APP
6893; CHECK-NEXT:    vpsubusw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
6894; CHECK-NEXT:    retq
6895  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6896  %2 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> %a0, <32 x i16> %a1)
6897  ret <32 x i16> %2
6898}
6899
6900define <32 x i16> @stack_fold_psubw(<32 x i16> %a0, <32 x i16> %a1) {
6901; CHECK-LABEL: stack_fold_psubw:
6902; CHECK:       # %bb.0:
6903; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6904; CHECK-NEXT:    #APP
6905; CHECK-NEXT:    nop
6906; CHECK-NEXT:    #NO_APP
6907; CHECK-NEXT:    vpsubw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
6908; CHECK-NEXT:    retq
6909  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6910  %2 = sub <32 x i16> %a0, %a1
6911  ret <32 x i16> %2
6912}
6913
6914define <8 x i64> @stack_fold_shufi64x2(<8 x i64> %a, <8 x i64> %b) {
6915; CHECK-LABEL: stack_fold_shufi64x2:
6916; CHECK:       # %bb.0:
6917; CHECK-NEXT:    subq $56, %rsp
6918; CHECK-NEXT:    .cfi_def_cfa_offset 64
6919; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6920; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6921; CHECK-NEXT:    #APP
6922; CHECK-NEXT:    nop
6923; CHECK-NEXT:    #NO_APP
6924; CHECK-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
6925; CHECK-NEXT:    vshufi64x2 $24, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
6926; CHECK-NEXT:    # zmm0 = zmm0[0,1,4,5],mem[2,3,0,1]
6927; CHECK-NEXT:    addq $56, %rsp
6928; CHECK-NEXT:    .cfi_def_cfa_offset 8
6929; CHECK-NEXT:    retq
6930  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6931  %2 = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 10, i32 11, i32 8, i32 9>
6932  ret <8 x i64> %2
6933}
6934
6935define <8 x i64> @stack_fold_shufi64x2_mask(<8 x i64> %a, <8 x i64> %b, i8 %mask, <8 x i64>* %passthru) {
6936; CHECK-LABEL: stack_fold_shufi64x2_mask:
6937; CHECK:       # %bb.0:
6938; CHECK-NEXT:    subq $56, %rsp
6939; CHECK-NEXT:    .cfi_def_cfa_offset 64
6940; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6941; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6942; CHECK-NEXT:    #APP
6943; CHECK-NEXT:    nop
6944; CHECK-NEXT:    #NO_APP
6945; CHECK-NEXT:    kmovd %edi, %k1
6946; CHECK-NEXT:    vmovdqa64 (%rsi), %zmm1
6947; CHECK-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
6948; CHECK-NEXT:    vshufi64x2 $24, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 {%k1} # 64-byte Folded Reload
6949; CHECK-NEXT:    # zmm1 {%k1} = zmm0[0,1,4,5],mem[2,3,0,1]
6950; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
6951; CHECK-NEXT:    addq $56, %rsp
6952; CHECK-NEXT:    .cfi_def_cfa_offset 8
6953; CHECK-NEXT:    retq
6954  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6955  %2 = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 10, i32 11, i32 8, i32 9>
6956  %3 = bitcast i8 %mask to <8 x i1>
6957  ; load needed to keep the operation from being scheduled above the asm block
6958  %4 = load <8 x i64>, <8 x i64>* %passthru
6959  %5 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> %4
6960  ret <8 x i64> %5
6961}
6962
6963define <8 x i64> @stack_fold_shufi64x2_maskz(<8 x i64> %a, <8 x i64> %b, i8 %mask, <8 x i64>* %passthru) {
6964; CHECK-LABEL: stack_fold_shufi64x2_maskz:
6965; CHECK:       # %bb.0:
6966; CHECK-NEXT:    subq $56, %rsp
6967; CHECK-NEXT:    .cfi_def_cfa_offset 64
6968; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6969; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6970; CHECK-NEXT:    #APP
6971; CHECK-NEXT:    nop
6972; CHECK-NEXT:    #NO_APP
6973; CHECK-NEXT:    kmovd %edi, %k1
6974; CHECK-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
6975; CHECK-NEXT:    vshufi64x2 $24, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
6976; CHECK-NEXT:    # zmm0 {%k1} {z} = zmm0[0,1,4,5],mem[2,3,0,1]
6977; CHECK-NEXT:    addq $56, %rsp
6978; CHECK-NEXT:    .cfi_def_cfa_offset 8
6979; CHECK-NEXT:    retq
6980  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
6981  %2 = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 10, i32 11, i32 8, i32 9>
6982  %3 = bitcast i8 %mask to <8 x i1>
6983  %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> zeroinitializer
6984  ret <8 x i64> %4
6985}
6986
6987define <16 x i32> @stack_fold_shufi32x4_mask(<16 x i32> %a, <16 x i32> %b, i16 %mask, <16 x i32>* %passthru) {
6988; CHECK-LABEL: stack_fold_shufi32x4_mask:
6989; CHECK:       # %bb.0:
6990; CHECK-NEXT:    subq $56, %rsp
6991; CHECK-NEXT:    .cfi_def_cfa_offset 64
6992; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6993; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6994; CHECK-NEXT:    #APP
6995; CHECK-NEXT:    nop
6996; CHECK-NEXT:    #NO_APP
6997; CHECK-NEXT:    kmovd %edi, %k1
6998; CHECK-NEXT:    vmovdqa64 (%rsi), %zmm1
6999; CHECK-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7000; CHECK-NEXT:    vshufi32x4 $20, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 {%k1} # 64-byte Folded Reload
7001; CHECK-NEXT:    # zmm1 {%k1} = zmm0[0,1,2,3,4,5,6,7],mem[4,5,6,7,0,1,2,3]
7002; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
7003; CHECK-NEXT:    addq $56, %rsp
7004; CHECK-NEXT:    .cfi_def_cfa_offset 8
7005; CHECK-NEXT:    retq
7006  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
7007  %2 = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 16, i32 17, i32 18, i32 19>
7008  %3 = bitcast i16 %mask to <16 x i1>
7009  ; load needed to keep the operation from being scheduled above the asm block
7010  %4 = load <16 x i32>, <16 x i32>* %passthru
7011  %5 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> %4
7012  ret <16 x i32> %5
7013}
7014
7015define <16 x i32> @stack_fold_shufi32x4_maskz(<16 x i32> %a, <16 x i32> %b, i16 %mask) {
7016; CHECK-LABEL: stack_fold_shufi32x4_maskz:
7017; CHECK:       # %bb.0:
7018; CHECK-NEXT:    subq $56, %rsp
7019; CHECK-NEXT:    .cfi_def_cfa_offset 64
7020; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7021; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7022; CHECK-NEXT:    #APP
7023; CHECK-NEXT:    nop
7024; CHECK-NEXT:    #NO_APP
7025; CHECK-NEXT:    kmovd %edi, %k1
7026; CHECK-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7027; CHECK-NEXT:    vshufi32x4 $20, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
7028; CHECK-NEXT:    # zmm0 {%k1} {z} = zmm0[0,1,2,3,4,5,6,7],mem[4,5,6,7,0,1,2,3]
7029; CHECK-NEXT:    addq $56, %rsp
7030; CHECK-NEXT:    .cfi_def_cfa_offset 8
7031; CHECK-NEXT:    retq
7032  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
7033  %2 = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 16, i32 17, i32 18, i32 19>
7034  %3 = bitcast i16 %mask to <16 x i1>
7035  %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer
7036  ret <16 x i32> %4
7037}
7038
7039define <16 x i32> @stack_fold_ternlogd(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) {
7040; CHECK-LABEL: stack_fold_ternlogd:
7041; CHECK:       # %bb.0:
7042; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7043; CHECK-NEXT:    #APP
7044; CHECK-NEXT:    nop
7045; CHECK-NEXT:    #NO_APP
7046; CHECK-NEXT:    vpternlogd $33, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
7047; CHECK-NEXT:    retq
7048  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
7049  %2 = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 33)
7050  ret <16 x i32> %2
7051}
7052declare <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i32)
7053
7054define <8 x i64> @stack_fold_ternlogq(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) {
7055; CHECK-LABEL: stack_fold_ternlogq:
7056; CHECK:       # %bb.0:
7057; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7058; CHECK-NEXT:    #APP
7059; CHECK-NEXT:    nop
7060; CHECK-NEXT:    #NO_APP
7061; CHECK-NEXT:    vpternlogq $33, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
7062; CHECK-NEXT:    retq
7063  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
7064  %2 = call <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i32 33)
7065  ret <8 x i64> %2
7066}
7067
7068declare <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i32)
7069
7070define <64 x i8> @stack_fold_punpckhbw_zmm(<64 x i8> %a0, <64 x i8> %a1) {
7071; CHECK-LABEL: stack_fold_punpckhbw_zmm:
7072; CHECK:       # %bb.0:
7073; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7074; CHECK-NEXT:    #APP
7075; CHECK-NEXT:    nop
7076; CHECK-NEXT:    #NO_APP
7077; CHECK-NEXT:    vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
7078; CHECK-NEXT:    # zmm0 = zmm0[8],mem[8],zmm0[9],mem[9],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[12],mem[12],zmm0[13],mem[13],zmm0[14],mem[14],zmm0[15],mem[15],zmm0[24],mem[24],zmm0[25],mem[25],zmm0[26],mem[26],zmm0[27],mem[27],zmm0[28],mem[28],zmm0[29],mem[29],zmm0[30],mem[30],zmm0[31],mem[31],zmm0[40],mem[40],zmm0[41],mem[41],zmm0[42],mem[42],zmm0[43],mem[43],zmm0[44],mem[44],zmm0[45],mem[45],zmm0[46],mem[46],zmm0[47],mem[47],zmm0[56],mem[56],zmm0[57],mem[57],zmm0[58],mem[58],zmm0[59],mem[59],zmm0[60],mem[60],zmm0[61],mem[61],zmm0[62],mem[62],zmm0[63],mem[63]
7079; CHECK-NEXT:    retq
7080  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
7081  %2 = shufflevector <64 x i8> %a0, <64 x i8> %a1, <64 x i32> <i32 8, i32 72, i32 9, i32 73, i32 10, i32 74, i32 11, i32 75, i32 12, i32 76, i32 13, i32 77, i32 14, i32 78, i32 15, i32 79, i32 24, i32 88, i32 25, i32 89, i32 26, i32 90, i32 27, i32 91, i32 28, i32 92, i32 29, i32 93, i32 30, i32 94, i32 31, i32 95, i32 40, i32 104, i32 41, i32 105, i32 42, i32 106, i32 43, i32 107, i32 44, i32 108, i32 45, i32 109, i32 46, i32 110, i32 47, i32 111, i32 56, i32 120, i32 57, i32 121, i32 58, i32 122, i32 59, i32 123, i32 60, i32 124, i32 61, i32 125, i32 62, i32 126, i32 63, i32 127>
7082  ret <64 x i8> %2
7083}
7084
7085define <64 x i8> @stack_fold_punpckhbw_mask_zmm(<64 x i8>* %passthru, <64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
7086; CHECK-LABEL: stack_fold_punpckhbw_mask_zmm:
7087; CHECK:       # %bb.0:
7088; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7089; CHECK-NEXT:    #APP
7090; CHECK-NEXT:    nop
7091; CHECK-NEXT:    #NO_APP
7092; CHECK-NEXT:    kmovq %rsi, %k1
7093; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm2
7094; CHECK-NEXT:    vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
7095; CHECK-NEXT:    # zmm2 {%k1} = zmm0[8],mem[8],zmm0[9],mem[9],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[12],mem[12],zmm0[13],mem[13],zmm0[14],mem[14],zmm0[15],mem[15],zmm0[24],mem[24],zmm0[25],mem[25],zmm0[26],mem[26],zmm0[27],mem[27],zmm0[28],mem[28],zmm0[29],mem[29],zmm0[30],mem[30],zmm0[31],mem[31],zmm0[40],mem[40],zmm0[41],mem[41],zmm0[42],mem[42],zmm0[43],mem[43],zmm0[44],mem[44],zmm0[45],mem[45],zmm0[46],mem[46],zmm0[47],mem[47],zmm0[56],mem[56],zmm0[57],mem[57],zmm0[58],mem[58],zmm0[59],mem[59],zmm0[60],mem[60],zmm0[61],mem[61],zmm0[62],mem[62],zmm0[63],mem[63]
7096; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
7097; CHECK-NEXT:    retq
7098  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
7099  %2 = shufflevector <64 x i8> %a0, <64 x i8> %a1, <64 x i32> <i32 8, i32 72, i32 9, i32 73, i32 10, i32 74, i32 11, i32 75, i32 12, i32 76, i32 13, i32 77, i32 14, i32 78, i32 15, i32 79, i32 24, i32 88, i32 25, i32 89, i32 26, i32 90, i32 27, i32 91, i32 28, i32 92, i32 29, i32 93, i32 30, i32 94, i32 31, i32 95, i32 40, i32 104, i32 41, i32 105, i32 42, i32 106, i32 43, i32 107, i32 44, i32 108, i32 45, i32 109, i32 46, i32 110, i32 47, i32 111, i32 56, i32 120, i32 57, i32 121, i32 58, i32 122, i32 59, i32 123, i32 60, i32 124, i32 61, i32 125, i32 62, i32 126, i32 63, i32 127>
7100  %3 = bitcast i64 %mask to <64 x i1>
  ; load needed to keep the operation from being scheduled above the asm block
7102  %4 = load <64 x i8>, <64 x i8>* %passthru
7103  %5 = select <64 x i1> %3, <64 x i8> %2, <64 x i8> %4
7104  ret <64 x i8> %5
7105}
7106
7107define <64 x i8> @stack_fold_punpckhbw_maskz_zmm(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
7108; CHECK-LABEL: stack_fold_punpckhbw_maskz_zmm:
7109; CHECK:       # %bb.0:
7110; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7111; CHECK-NEXT:    #APP
7112; CHECK-NEXT:    nop
7113; CHECK-NEXT:    #NO_APP
7114; CHECK-NEXT:    kmovq %rdi, %k1
7115; CHECK-NEXT:    vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
7116; CHECK-NEXT:    # zmm0 {%k1} {z} = zmm0[8],mem[8],zmm0[9],mem[9],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[12],mem[12],zmm0[13],mem[13],zmm0[14],mem[14],zmm0[15],mem[15],zmm0[24],mem[24],zmm0[25],mem[25],zmm0[26],mem[26],zmm0[27],mem[27],zmm0[28],mem[28],zmm0[29],mem[29],zmm0[30],mem[30],zmm0[31],mem[31],zmm0[40],mem[40],zmm0[41],mem[41],zmm0[42],mem[42],zmm0[43],mem[43],zmm0[44],mem[44],zmm0[45],mem[45],zmm0[46],mem[46],zmm0[47],mem[47],zmm0[56],mem[56],zmm0[57],mem[57],zmm0[58],mem[58],zmm0[59],mem[59],zmm0[60],mem[60],zmm0[61],mem[61],zmm0[62],mem[62],zmm0[63],mem[63]
7117; CHECK-NEXT:    retq
7118  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
7119  %2 = shufflevector <64 x i8> %a0, <64 x i8> %a1, <64 x i32> <i32 8, i32 72, i32 9, i32 73, i32 10, i32 74, i32 11, i32 75, i32 12, i32 76, i32 13, i32 77, i32 14, i32 78, i32 15, i32 79, i32 24, i32 88, i32 25, i32 89, i32 26, i32 90, i32 27, i32 91, i32 28, i32 92, i32 29, i32 93, i32 30, i32 94, i32 31, i32 95, i32 40, i32 104, i32 41, i32 105, i32 42, i32 106, i32 43, i32 107, i32 44, i32 108, i32 45, i32 109, i32 46, i32 110, i32 47, i32 111, i32 56, i32 120, i32 57, i32 121, i32 58, i32 122, i32 59, i32 123, i32 60, i32 124, i32 61, i32 125, i32 62, i32 126, i32 63, i32 127>
7120  %3 = bitcast i64 %mask to <64 x i1>
7121  %4 = select <64 x i1> %3, <64 x i8> %2, <64 x i8> zeroinitializer
7122  ret <64 x i8> %4
7123}
7124
7125define <16 x i32> @stack_fold_pxord(<16 x i32> %a0, <16 x i32> %a1) {
7126; CHECK-LABEL: stack_fold_pxord:
7127; CHECK:       # %bb.0:
7128; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7129; CHECK-NEXT:    #APP
7130; CHECK-NEXT:    nop
7131; CHECK-NEXT:    #NO_APP
7132; CHECK-NEXT:    vxorps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
7133; CHECK-NEXT:    retq
7134  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
7135  %2 = xor <16 x i32> %a0, %a1
7136  ret <16 x i32> %2
7137}
7138
7139define <16 x i32> @stack_fold_pxord_commuted(<16 x i32> %a0, <16 x i32> %a1) {
7140; CHECK-LABEL: stack_fold_pxord_commuted:
7141; CHECK:       # %bb.0:
7142; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7143; CHECK-NEXT:    #APP
7144; CHECK-NEXT:    nop
7145; CHECK-NEXT:    #NO_APP
7146; CHECK-NEXT:    vxorps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
7147; CHECK-NEXT:    retq
7148  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
7149  %2 = xor <16 x i32> %a1, %a0
7150  ret <16 x i32> %2
7151}
7152
7153define <16 x i32> @stack_fold_pxord_mask(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %a2, i16 %mask) {
7154; CHECK-LABEL: stack_fold_pxord_mask:
7155; CHECK:       # %bb.0:
7156; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7157; CHECK-NEXT:    vmovaps %zmm0, %zmm1
7158; CHECK-NEXT:    #APP
7159; CHECK-NEXT:    nop
7160; CHECK-NEXT:    #NO_APP
7161; CHECK-NEXT:    kmovd %esi, %k1
7162; CHECK-NEXT:    vmovaps (%rdi), %zmm0
7163; CHECK-NEXT:    vxorps {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
7164; CHECK-NEXT:    retq
7165  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
7166  %2 = xor <16 x i32> %a0, %a1
7167  %3 = bitcast i16 %mask to <16 x i1>
  ; load needed to keep the operation from being scheduled above the asm block
7169  %4 = load <16 x i32>, <16 x i32>* %a2
7170  %5 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> %4
7171  ret <16 x i32> %5
7172}
7173
7174define <16 x i32> @stack_fold_pxord_mask_commuted(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %a2, i16 %mask) {
7175; CHECK-LABEL: stack_fold_pxord_mask_commuted:
7176; CHECK:       # %bb.0:
7177; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7178; CHECK-NEXT:    vmovaps %zmm0, %zmm1
7179; CHECK-NEXT:    #APP
7180; CHECK-NEXT:    nop
7181; CHECK-NEXT:    #NO_APP
7182; CHECK-NEXT:    kmovd %esi, %k1
7183; CHECK-NEXT:    vmovaps (%rdi), %zmm0
7184; CHECK-NEXT:    vxorps {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
7185; CHECK-NEXT:    retq
7186  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
7187  %2 = xor <16 x i32> %a1, %a0
7188  %3 = bitcast i16 %mask to <16 x i1>
  ; load needed to keep the operation from being scheduled above the asm block
7190  %4 = load <16 x i32>, <16 x i32>* %a2
7191  %5 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> %4
7192  ret <16 x i32> %5
7193}
7194
7195define <16 x i32> @stack_fold_pxord_maskz(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
7196; CHECK-LABEL: stack_fold_pxord_maskz:
7197; CHECK:       # %bb.0:
7198; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7199; CHECK-NEXT:    #APP
7200; CHECK-NEXT:    nop
7201; CHECK-NEXT:    #NO_APP
7202; CHECK-NEXT:    kmovd %edi, %k1
7203; CHECK-NEXT:    vxorps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
7204; CHECK-NEXT:    retq
7205  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
7206  %2 = xor <16 x i32> %a0, %a1
7207  %3 = bitcast i16 %mask to <16 x i1>
7208  %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer
7209  ret <16 x i32> %4
7210}
7211
7212define <16 x i32> @stack_fold_pxord_maskz_commuted(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
7213; CHECK-LABEL: stack_fold_pxord_maskz_commuted:
7214; CHECK:       # %bb.0:
7215; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7216; CHECK-NEXT:    #APP
7217; CHECK-NEXT:    nop
7218; CHECK-NEXT:    #NO_APP
7219; CHECK-NEXT:    kmovd %edi, %k1
7220; CHECK-NEXT:    vxorps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
7221; CHECK-NEXT:    retq
7222  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
7223  %2 = xor <16 x i32> %a1, %a0
7224  %3 = bitcast i16 %mask to <16 x i1>
7225  %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer
7226  ret <16 x i32> %4
7227}
7228
7229define <8 x i64> @stack_fold_pxorq(<8 x i64> %a0, <8 x i64> %a1) {
7230; CHECK-LABEL: stack_fold_pxorq:
7231; CHECK:       # %bb.0:
7232; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7233; CHECK-NEXT:    #APP
7234; CHECK-NEXT:    nop
7235; CHECK-NEXT:    #NO_APP
7236; CHECK-NEXT:    vxorps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
7237; CHECK-NEXT:    retq
7238  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
7239  %2 = xor <8 x i64> %a0, %a1
7240  ret <8 x i64> %2
7241}
7242
7243define <8 x i64> @stack_fold_pxorq_commuted(<8 x i64> %a0, <8 x i64> %a1) {
7244; CHECK-LABEL: stack_fold_pxorq_commuted:
7245; CHECK:       # %bb.0:
7246; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7247; CHECK-NEXT:    #APP
7248; CHECK-NEXT:    nop
7249; CHECK-NEXT:    #NO_APP
7250; CHECK-NEXT:    vxorps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
7251; CHECK-NEXT:    retq
7252  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
7253  %2 = xor <8 x i64> %a1, %a0
7254  ret <8 x i64> %2
7255}
7256
7257define <8 x i64> @stack_fold_pxorq_mask(<8 x i64> %a0, <8 x i64> %a1, <8 x i64>* %a2, i8 %mask) {
7258; CHECK-LABEL: stack_fold_pxorq_mask:
7259; CHECK:       # %bb.0:
7260; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7261; CHECK-NEXT:    vmovapd %zmm0, %zmm1
7262; CHECK-NEXT:    #APP
7263; CHECK-NEXT:    nop
7264; CHECK-NEXT:    #NO_APP
7265; CHECK-NEXT:    kmovd %esi, %k1
7266; CHECK-NEXT:    vmovapd (%rdi), %zmm0
7267; CHECK-NEXT:    vxorpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
7268; CHECK-NEXT:    retq
7269  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
7270  %2 = xor <8 x i64> %a0, %a1
7271  %3 = bitcast i8 %mask to <8 x i1>
  ; load needed to keep the operation from being scheduled above the asm block
7273  %4 = load <8 x i64>, <8 x i64>* %a2
7274  %5 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> %4
7275  ret <8 x i64> %5
7276}
7277
7278define <8 x i64> @stack_fold_pxorq_mask_commuted(<8 x i64> %a0, <8 x i64> %a1, <8 x i64>* %a2, i8 %mask) {
7279; CHECK-LABEL: stack_fold_pxorq_mask_commuted:
7280; CHECK:       # %bb.0:
7281; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7282; CHECK-NEXT:    vmovapd %zmm0, %zmm1
7283; CHECK-NEXT:    #APP
7284; CHECK-NEXT:    nop
7285; CHECK-NEXT:    #NO_APP
7286; CHECK-NEXT:    kmovd %esi, %k1
7287; CHECK-NEXT:    vmovapd (%rdi), %zmm0
7288; CHECK-NEXT:    vxorpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
7289; CHECK-NEXT:    retq
7290  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
7291  %2 = xor <8 x i64> %a1, %a0
7292  %3 = bitcast i8 %mask to <8 x i1>
  ; load needed to keep the operation from being scheduled above the asm block
7294  %4 = load <8 x i64>, <8 x i64>* %a2
7295  %5 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> %4
7296  ret <8 x i64> %5
7297}
7298
7299define <8 x i64> @stack_fold_pxorq_maskz(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
7300; CHECK-LABEL: stack_fold_pxorq_maskz:
7301; CHECK:       # %bb.0:
7302; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7303; CHECK-NEXT:    #APP
7304; CHECK-NEXT:    nop
7305; CHECK-NEXT:    #NO_APP
7306; CHECK-NEXT:    kmovd %edi, %k1
7307; CHECK-NEXT:    vxorpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
7308; CHECK-NEXT:    retq
7309  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
7310  %2 = xor <8 x i64> %a0, %a1
7311  %3 = bitcast i8 %mask to <8 x i1>
7312  %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> zeroinitializer
7313  ret <8 x i64> %4
7314}
7315
7316define <8 x i64> @stack_fold_pxorq_maskz_commuted(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
7317; CHECK-LABEL: stack_fold_pxorq_maskz_commuted:
7318; CHECK:       # %bb.0:
7319; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7320; CHECK-NEXT:    #APP
7321; CHECK-NEXT:    nop
7322; CHECK-NEXT:    #NO_APP
7323; CHECK-NEXT:    kmovd %edi, %k1
7324; CHECK-NEXT:    vxorpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
7325; CHECK-NEXT:    retq
7326  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
7327  %2 = xor <8 x i64> %a1, %a0
7328  %3 = bitcast i8 %mask to <8 x i1>
7329  %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> zeroinitializer
7330  ret <8 x i64> %4
7331}
7332
7333declare <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8>, <64 x i8>)
7334declare <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16>, <32 x i16>)
7335declare <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8>, <64 x i8>)
7336declare <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16>, <32 x i16>)
7337declare <16 x i32> @llvm.x86.avx512.conflict.d.512(<16 x i32>)
7338declare <8 x i64> @llvm.x86.avx512.conflict.q.512(<8 x i64>)
7339declare <16 x i32> @llvm.ctlz.v16i32(<16 x i32>, i1)
7340declare <8 x i64> @llvm.ctlz.v8i64(<8 x i64>, i1)
7341declare <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8>, <64 x i8>)
7342declare <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16>, <32 x i16>)
7343declare <64 x i8> @llvm.usub.sat.v64i8(<64 x i8>, <64 x i8>)
7344declare <32 x i16> @llvm.usub.sat.v32i16(<32 x i16>, <32 x i16>)
7345