• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512bw,+avx512vl --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,X86
3; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,X64
4
5define <8 x i16> @test_mask_packs_epi32_rr_128(<4 x i32> %a, <4 x i32> %b) {
6; CHECK-LABEL: test_mask_packs_epi32_rr_128:
7; CHECK:       # %bb.0:
8; CHECK-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6b,0xc1]
9; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
10  %1 = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %a, <4 x i32> %b)
11  ret <8 x i16> %1
12}
13
14define <8 x i16> @test_mask_packs_epi32_rrk_128(<4 x i32> %a, <4 x i32> %b, <8 x i16> %passThru, i8 %mask) {
15; X86-LABEL: test_mask_packs_epi32_rrk_128:
16; X86:       # %bb.0:
17; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
18; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
19; X86-NEXT:    vpackssdw %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x6b,0xd1]
20; X86-NEXT:    vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2]
21; X86-NEXT:    retl # encoding: [0xc3]
22;
23; X64-LABEL: test_mask_packs_epi32_rrk_128:
24; X64:       # %bb.0:
25; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
26; X64-NEXT:    vpackssdw %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x6b,0xd1]
27; X64-NEXT:    vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2]
28; X64-NEXT:    retq # encoding: [0xc3]
29  %1 = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %a, <4 x i32> %b)
30  %2 = bitcast i8 %mask to <8 x i1>
31  %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> %passThru
32  ret <8 x i16> %3
33}
34
35define <8 x i16> @test_mask_packs_epi32_rrkz_128(<4 x i32> %a, <4 x i32> %b, i8 %mask) {
36; X86-LABEL: test_mask_packs_epi32_rrkz_128:
37; X86:       # %bb.0:
38; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
39; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
40; X86-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0x6b,0xc1]
41; X86-NEXT:    retl # encoding: [0xc3]
42;
43; X64-LABEL: test_mask_packs_epi32_rrkz_128:
44; X64:       # %bb.0:
45; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
46; X64-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0x6b,0xc1]
47; X64-NEXT:    retq # encoding: [0xc3]
48  %1 = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %a, <4 x i32> %b)
49  %2 = bitcast i8 %mask to <8 x i1>
50  %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> zeroinitializer
51  ret <8 x i16> %3
52}
53
54define <8 x i16> @test_mask_packs_epi32_rm_128(<4 x i32> %a, <4 x i32>* %ptr_b) {
55; X86-LABEL: test_mask_packs_epi32_rm_128:
56; X86:       # %bb.0:
57; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
58; X86-NEXT:    vpackssdw (%eax), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6b,0x00]
59; X86-NEXT:    retl # encoding: [0xc3]
60;
61; X64-LABEL: test_mask_packs_epi32_rm_128:
62; X64:       # %bb.0:
63; X64-NEXT:    vpackssdw (%rdi), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6b,0x07]
64; X64-NEXT:    retq # encoding: [0xc3]
65  %b = load <4 x i32>, <4 x i32>* %ptr_b
66  %1 = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %a, <4 x i32> %b)
67  ret <8 x i16> %1
68}
69
70define <8 x i16> @test_mask_packs_epi32_rmk_128(<4 x i32> %a, <4 x i32>* %ptr_b, <8 x i16> %passThru, i8 %mask) {
71; X86-LABEL: test_mask_packs_epi32_rmk_128:
72; X86:       # %bb.0:
73; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
74; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
75; X86-NEXT:    kmovd %ecx, %k1 # encoding: [0xc5,0xfb,0x92,0xc9]
76; X86-NEXT:    vpackssdw (%eax), %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x6b,0x08]
77; X86-NEXT:    vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
78; X86-NEXT:    retl # encoding: [0xc3]
79;
80; X64-LABEL: test_mask_packs_epi32_rmk_128:
81; X64:       # %bb.0:
82; X64-NEXT:    kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce]
83; X64-NEXT:    vpackssdw (%rdi), %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x6b,0x0f]
84; X64-NEXT:    vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
85; X64-NEXT:    retq # encoding: [0xc3]
86  %b = load <4 x i32>, <4 x i32>* %ptr_b
87  %1 = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %a, <4 x i32> %b)
88  %2 = bitcast i8 %mask to <8 x i1>
89  %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> %passThru
90  ret <8 x i16> %3
91}
92
93define <8 x i16> @test_mask_packs_epi32_rmkz_128(<4 x i32> %a, <4 x i32>* %ptr_b, i8 %mask) {
94; X86-LABEL: test_mask_packs_epi32_rmkz_128:
95; X86:       # %bb.0:
96; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
97; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
98; X86-NEXT:    kmovd %ecx, %k1 # encoding: [0xc5,0xfb,0x92,0xc9]
99; X86-NEXT:    vpackssdw (%eax), %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0x6b,0x00]
100; X86-NEXT:    retl # encoding: [0xc3]
101;
102; X64-LABEL: test_mask_packs_epi32_rmkz_128:
103; X64:       # %bb.0:
104; X64-NEXT:    kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce]
105; X64-NEXT:    vpackssdw (%rdi), %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0x6b,0x07]
106; X64-NEXT:    retq # encoding: [0xc3]
107  %b = load <4 x i32>, <4 x i32>* %ptr_b
108  %1 = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %a, <4 x i32> %b)
109  %2 = bitcast i8 %mask to <8 x i1>
110  %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> zeroinitializer
111  ret <8 x i16> %3
112}
113
114define <8 x i16> @test_mask_packs_epi32_rmb_128(<4 x i32> %a, i32* %ptr_b) {
115; X86-LABEL: test_mask_packs_epi32_rmb_128:
116; X86:       # %bb.0:
117; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
118; X86-NEXT:    vpackssdw (%eax){1to4}, %xmm0, %xmm0 # encoding: [0x62,0xf1,0x7d,0x18,0x6b,0x00]
119; X86-NEXT:    retl # encoding: [0xc3]
120;
121; X64-LABEL: test_mask_packs_epi32_rmb_128:
122; X64:       # %bb.0:
123; X64-NEXT:    vpackssdw (%rdi){1to4}, %xmm0, %xmm0 # encoding: [0x62,0xf1,0x7d,0x18,0x6b,0x07]
124; X64-NEXT:    retq # encoding: [0xc3]
125  %q = load i32, i32* %ptr_b
126  %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
127  %b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer
128  %1 = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %a, <4 x i32> %b)
129  ret <8 x i16> %1
130}
131
132define <8 x i16> @test_mask_packs_epi32_rmbk_128(<4 x i32> %a, i32* %ptr_b, <8 x i16> %passThru, i8 %mask) {
133; X86-LABEL: test_mask_packs_epi32_rmbk_128:
134; X86:       # %bb.0:
135; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
136; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
137; X86-NEXT:    kmovd %ecx, %k1 # encoding: [0xc5,0xfb,0x92,0xc9]
138; X86-NEXT:    vpackssdw (%eax){1to4}, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x19,0x6b,0x08]
139; X86-NEXT:    vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
140; X86-NEXT:    retl # encoding: [0xc3]
141;
142; X64-LABEL: test_mask_packs_epi32_rmbk_128:
143; X64:       # %bb.0:
144; X64-NEXT:    kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce]
145; X64-NEXT:    vpackssdw (%rdi){1to4}, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x19,0x6b,0x0f]
146; X64-NEXT:    vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
147; X64-NEXT:    retq # encoding: [0xc3]
148  %q = load i32, i32* %ptr_b
149  %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
150  %b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer
151  %1 = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %a, <4 x i32> %b)
152  %2 = bitcast i8 %mask to <8 x i1>
153  %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> %passThru
154  ret <8 x i16> %3
155}
156
157define <8 x i16> @test_mask_packs_epi32_rmbkz_128(<4 x i32> %a, i32* %ptr_b, i8 %mask) {
158; X86-LABEL: test_mask_packs_epi32_rmbkz_128:
159; X86:       # %bb.0:
160; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
161; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
162; X86-NEXT:    kmovd %ecx, %k1 # encoding: [0xc5,0xfb,0x92,0xc9]
163; X86-NEXT:    vpackssdw (%eax){1to4}, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x99,0x6b,0x00]
164; X86-NEXT:    retl # encoding: [0xc3]
165;
166; X64-LABEL: test_mask_packs_epi32_rmbkz_128:
167; X64:       # %bb.0:
168; X64-NEXT:    kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce]
169; X64-NEXT:    vpackssdw (%rdi){1to4}, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x99,0x6b,0x07]
170; X64-NEXT:    retq # encoding: [0xc3]
171  %q = load i32, i32* %ptr_b
172  %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
173  %b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer
174  %1 = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %a, <4 x i32> %b)
175  %2 = bitcast i8 %mask to <8 x i1>
176  %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> zeroinitializer
177  ret <8 x i16> %3
178}
179
180declare <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32>, <4 x i32>)
181
182define <16 x i16> @test_mask_packs_epi32_rr_256(<8 x i32> %a, <8 x i32> %b) {
183; CHECK-LABEL: test_mask_packs_epi32_rr_256:
184; CHECK:       # %bb.0:
185; CHECK-NEXT:    vpackssdw %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6b,0xc1]
186; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
187  %1 = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %a, <8 x i32> %b)
188  ret <16 x i16> %1
189}
190
191define <16 x i16> @test_mask_packs_epi32_rrk_256(<8 x i32> %a, <8 x i32> %b, <16 x i16> %passThru, i16 %mask) {
192; X86-LABEL: test_mask_packs_epi32_rrk_256:
193; X86:       # %bb.0:
194; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
195; X86-NEXT:    vpackssdw %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x6b,0xd1]
196; X86-NEXT:    vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2]
197; X86-NEXT:    retl # encoding: [0xc3]
198;
199; X64-LABEL: test_mask_packs_epi32_rrk_256:
200; X64:       # %bb.0:
201; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
202; X64-NEXT:    vpackssdw %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x6b,0xd1]
203; X64-NEXT:    vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2]
204; X64-NEXT:    retq # encoding: [0xc3]
205  %1 = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %a, <8 x i32> %b)
206  %2 = bitcast i16 %mask to <16 x i1>
207  %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> %passThru
208  ret <16 x i16> %3
209}
210
211define <16 x i16> @test_mask_packs_epi32_rrkz_256(<8 x i32> %a, <8 x i32> %b, i16 %mask) {
212; X86-LABEL: test_mask_packs_epi32_rrkz_256:
213; X86:       # %bb.0:
214; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
215; X86-NEXT:    vpackssdw %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0x6b,0xc1]
216; X86-NEXT:    retl # encoding: [0xc3]
217;
218; X64-LABEL: test_mask_packs_epi32_rrkz_256:
219; X64:       # %bb.0:
220; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
221; X64-NEXT:    vpackssdw %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0x6b,0xc1]
222; X64-NEXT:    retq # encoding: [0xc3]
223  %1 = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %a, <8 x i32> %b)
224  %2 = bitcast i16 %mask to <16 x i1>
225  %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> zeroinitializer
226  ret <16 x i16> %3
227}
228
229define <16 x i16> @test_mask_packs_epi32_rm_256(<8 x i32> %a, <8 x i32>* %ptr_b) {
230; X86-LABEL: test_mask_packs_epi32_rm_256:
231; X86:       # %bb.0:
232; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
233; X86-NEXT:    vpackssdw (%eax), %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6b,0x00]
234; X86-NEXT:    retl # encoding: [0xc3]
235;
236; X64-LABEL: test_mask_packs_epi32_rm_256:
237; X64:       # %bb.0:
238; X64-NEXT:    vpackssdw (%rdi), %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6b,0x07]
239; X64-NEXT:    retq # encoding: [0xc3]
240  %b = load <8 x i32>, <8 x i32>* %ptr_b
241  %1 = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %a, <8 x i32> %b)
242  ret <16 x i16> %1
243}
244
245define <16 x i16> @test_mask_packs_epi32_rmk_256(<8 x i32> %a, <8 x i32>* %ptr_b, <16 x i16> %passThru, i16 %mask) {
246; X86-LABEL: test_mask_packs_epi32_rmk_256:
247; X86:       # %bb.0:
248; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
249; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08]
250; X86-NEXT:    vpackssdw (%eax), %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x6b,0x08]
251; X86-NEXT:    vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
252; X86-NEXT:    retl # encoding: [0xc3]
253;
254; X64-LABEL: test_mask_packs_epi32_rmk_256:
255; X64:       # %bb.0:
256; X64-NEXT:    kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce]
257; X64-NEXT:    vpackssdw (%rdi), %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x6b,0x0f]
258; X64-NEXT:    vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
259; X64-NEXT:    retq # encoding: [0xc3]
260  %b = load <8 x i32>, <8 x i32>* %ptr_b
261  %1 = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %a, <8 x i32> %b)
262  %2 = bitcast i16 %mask to <16 x i1>
263  %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> %passThru
264  ret <16 x i16> %3
265}
266
267define <16 x i16> @test_mask_packs_epi32_rmkz_256(<8 x i32> %a, <8 x i32>* %ptr_b, i16 %mask) {
268; X86-LABEL: test_mask_packs_epi32_rmkz_256:
269; X86:       # %bb.0:
270; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
271; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08]
272; X86-NEXT:    vpackssdw (%eax), %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0x6b,0x00]
273; X86-NEXT:    retl # encoding: [0xc3]
274;
275; X64-LABEL: test_mask_packs_epi32_rmkz_256:
276; X64:       # %bb.0:
277; X64-NEXT:    kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce]
278; X64-NEXT:    vpackssdw (%rdi), %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0x6b,0x07]
279; X64-NEXT:    retq # encoding: [0xc3]
280  %b = load <8 x i32>, <8 x i32>* %ptr_b
281  %1 = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %a, <8 x i32> %b)
282  %2 = bitcast i16 %mask to <16 x i1>
283  %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> zeroinitializer
284  ret <16 x i16> %3
285}
286
287define <16 x i16> @test_mask_packs_epi32_rmb_256(<8 x i32> %a, i32* %ptr_b) {
288; X86-LABEL: test_mask_packs_epi32_rmb_256:
289; X86:       # %bb.0:
290; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
291; X86-NEXT:    vpackssdw (%eax){1to8}, %ymm0, %ymm0 # encoding: [0x62,0xf1,0x7d,0x38,0x6b,0x00]
292; X86-NEXT:    retl # encoding: [0xc3]
293;
294; X64-LABEL: test_mask_packs_epi32_rmb_256:
295; X64:       # %bb.0:
296; X64-NEXT:    vpackssdw (%rdi){1to8}, %ymm0, %ymm0 # encoding: [0x62,0xf1,0x7d,0x38,0x6b,0x07]
297; X64-NEXT:    retq # encoding: [0xc3]
298  %q = load i32, i32* %ptr_b
299  %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
300  %b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer
301  %1 = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %a, <8 x i32> %b)
302  ret <16 x i16> %1
303}
304
305define <16 x i16> @test_mask_packs_epi32_rmbk_256(<8 x i32> %a, i32* %ptr_b, <16 x i16> %passThru, i16 %mask) {
306; X86-LABEL: test_mask_packs_epi32_rmbk_256:
307; X86:       # %bb.0:
308; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
309; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08]
310; X86-NEXT:    vpackssdw (%eax){1to8}, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x39,0x6b,0x08]
311; X86-NEXT:    vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
312; X86-NEXT:    retl # encoding: [0xc3]
313;
314; X64-LABEL: test_mask_packs_epi32_rmbk_256:
315; X64:       # %bb.0:
316; X64-NEXT:    kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce]
317; X64-NEXT:    vpackssdw (%rdi){1to8}, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x39,0x6b,0x0f]
318; X64-NEXT:    vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
319; X64-NEXT:    retq # encoding: [0xc3]
320  %q = load i32, i32* %ptr_b
321  %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
322  %b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer
323  %1 = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %a, <8 x i32> %b)
324  %2 = bitcast i16 %mask to <16 x i1>
325  %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> %passThru
326  ret <16 x i16> %3
327}
328
329define <16 x i16> @test_mask_packs_epi32_rmbkz_256(<8 x i32> %a, i32* %ptr_b, i16 %mask) {
330; X86-LABEL: test_mask_packs_epi32_rmbkz_256:
331; X86:       # %bb.0:
332; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
333; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08]
334; X86-NEXT:    vpackssdw (%eax){1to8}, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xb9,0x6b,0x00]
335; X86-NEXT:    retl # encoding: [0xc3]
336;
337; X64-LABEL: test_mask_packs_epi32_rmbkz_256:
338; X64:       # %bb.0:
339; X64-NEXT:    kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce]
340; X64-NEXT:    vpackssdw (%rdi){1to8}, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xb9,0x6b,0x07]
341; X64-NEXT:    retq # encoding: [0xc3]
342  %q = load i32, i32* %ptr_b
343  %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
344  %b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer
345  %1 = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %a, <8 x i32> %b)
346  %2 = bitcast i16 %mask to <16 x i1>
347  %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> zeroinitializer
348  ret <16 x i16> %3
349}
350
351declare <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32>, <8 x i32>)
352
353define <16 x i8> @test_mask_packs_epi16_rr_128(<8 x i16> %a, <8 x i16> %b) {
354; CHECK-LABEL: test_mask_packs_epi16_rr_128:
355; CHECK:       # %bb.0:
356; CHECK-NEXT:    vpacksswb %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x63,0xc1]
357; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
358  %1 = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> %a, <8 x i16> %b)
359  ret <16 x i8> %1
360}
361
362define <16 x i8> @test_mask_packs_epi16_rrk_128(<8 x i16> %a, <8 x i16> %b, <16 x i8> %passThru, i16 %mask) {
363; X86-LABEL: test_mask_packs_epi16_rrk_128:
364; X86:       # %bb.0:
365; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
366; X86-NEXT:    vpacksswb %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x63,0xd1]
367; X86-NEXT:    vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2]
368; X86-NEXT:    retl # encoding: [0xc3]
369;
370; X64-LABEL: test_mask_packs_epi16_rrk_128:
371; X64:       # %bb.0:
372; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
373; X64-NEXT:    vpacksswb %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x63,0xd1]
374; X64-NEXT:    vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2]
375; X64-NEXT:    retq # encoding: [0xc3]
376  %1 = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> %a, <8 x i16> %b)
377  %2 = bitcast i16 %mask to <16 x i1>
378  %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> %passThru
379  ret <16 x i8> %3
380}
381
382define <16 x i8> @test_mask_packs_epi16_rrkz_128(<8 x i16> %a, <8 x i16> %b, i16 %mask) {
383; X86-LABEL: test_mask_packs_epi16_rrkz_128:
384; X86:       # %bb.0:
385; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
386; X86-NEXT:    vpacksswb %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0x63,0xc1]
387; X86-NEXT:    retl # encoding: [0xc3]
388;
389; X64-LABEL: test_mask_packs_epi16_rrkz_128:
390; X64:       # %bb.0:
391; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
392; X64-NEXT:    vpacksswb %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0x63,0xc1]
393; X64-NEXT:    retq # encoding: [0xc3]
394  %1 = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> %a, <8 x i16> %b)
395  %2 = bitcast i16 %mask to <16 x i1>
396  %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> zeroinitializer
397  ret <16 x i8> %3
398}
399
400define <16 x i8> @test_mask_packs_epi16_rm_128(<8 x i16> %a, <8 x i16>* %ptr_b) {
401; X86-LABEL: test_mask_packs_epi16_rm_128:
402; X86:       # %bb.0:
403; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
404; X86-NEXT:    vpacksswb (%eax), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x63,0x00]
405; X86-NEXT:    retl # encoding: [0xc3]
406;
407; X64-LABEL: test_mask_packs_epi16_rm_128:
408; X64:       # %bb.0:
409; X64-NEXT:    vpacksswb (%rdi), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x63,0x07]
410; X64-NEXT:    retq # encoding: [0xc3]
411  %b = load <8 x i16>, <8 x i16>* %ptr_b
412  %1 = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> %a, <8 x i16> %b)
413  ret <16 x i8> %1
414}
415
416define <16 x i8> @test_mask_packs_epi16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b, <16 x i8> %passThru, i16 %mask) {
417; X86-LABEL: test_mask_packs_epi16_rmk_128:
418; X86:       # %bb.0:
419; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
420; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08]
421; X86-NEXT:    vpacksswb (%eax), %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x63,0x08]
422; X86-NEXT:    vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
423; X86-NEXT:    retl # encoding: [0xc3]
424;
425; X64-LABEL: test_mask_packs_epi16_rmk_128:
426; X64:       # %bb.0:
427; X64-NEXT:    kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce]
428; X64-NEXT:    vpacksswb (%rdi), %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x63,0x0f]
429; X64-NEXT:    vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
430; X64-NEXT:    retq # encoding: [0xc3]
431  %b = load <8 x i16>, <8 x i16>* %ptr_b
432  %1 = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> %a, <8 x i16> %b)
433  %2 = bitcast i16 %mask to <16 x i1>
434  %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> %passThru
435  ret <16 x i8> %3
436}
437
438define <16 x i8> @test_mask_packs_epi16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_b, i16 %mask) {
439; X86-LABEL: test_mask_packs_epi16_rmkz_128:
440; X86:       # %bb.0:
441; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
442; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08]
443; X86-NEXT:    vpacksswb (%eax), %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0x63,0x00]
444; X86-NEXT:    retl # encoding: [0xc3]
445;
446; X64-LABEL: test_mask_packs_epi16_rmkz_128:
447; X64:       # %bb.0:
448; X64-NEXT:    kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce]
449; X64-NEXT:    vpacksswb (%rdi), %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0x63,0x07]
450; X64-NEXT:    retq # encoding: [0xc3]
451  %b = load <8 x i16>, <8 x i16>* %ptr_b
452  %1 = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> %a, <8 x i16> %b)
453  %2 = bitcast i16 %mask to <16 x i1>
454  %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> zeroinitializer
455  ret <16 x i8> %3
456}
457
458declare <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16>, <8 x i16>)
459
460define <32 x i8> @test_mask_packs_epi16_rr_256(<16 x i16> %a, <16 x i16> %b) {
461; CHECK-LABEL: test_mask_packs_epi16_rr_256:
462; CHECK:       # %bb.0:
463; CHECK-NEXT:    vpacksswb %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x63,0xc1]
464; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
465  %1 = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> %a, <16 x i16> %b)
466  ret <32 x i8> %1
467}
468
469define <32 x i8> @test_mask_packs_epi16_rrk_256(<16 x i16> %a, <16 x i16> %b, <32 x i8> %passThru, i32 %mask) {
470; X86-LABEL: test_mask_packs_epi16_rrk_256:
471; X86:       # %bb.0:
472; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
473; X86-NEXT:    vpacksswb %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x63,0xd1]
474; X86-NEXT:    vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2]
475; X86-NEXT:    retl # encoding: [0xc3]
476;
477; X64-LABEL: test_mask_packs_epi16_rrk_256:
478; X64:       # %bb.0:
479; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
480; X64-NEXT:    vpacksswb %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x63,0xd1]
481; X64-NEXT:    vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2]
482; X64-NEXT:    retq # encoding: [0xc3]
483  %1 = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> %a, <16 x i16> %b)
484  %2 = bitcast i32 %mask to <32 x i1>
485  %3 = select <32 x i1> %2, <32 x i8> %1, <32 x i8> %passThru
486  ret <32 x i8> %3
487}
488
489define <32 x i8> @test_mask_packs_epi16_rrkz_256(<16 x i16> %a, <16 x i16> %b, i32 %mask) {
490; X86-LABEL: test_mask_packs_epi16_rrkz_256:
491; X86:       # %bb.0:
492; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
493; X86-NEXT:    vpacksswb %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0x63,0xc1]
494; X86-NEXT:    retl # encoding: [0xc3]
495;
496; X64-LABEL: test_mask_packs_epi16_rrkz_256:
497; X64:       # %bb.0:
498; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
499; X64-NEXT:    vpacksswb %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0x63,0xc1]
500; X64-NEXT:    retq # encoding: [0xc3]
501  %1 = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> %a, <16 x i16> %b)
502  %2 = bitcast i32 %mask to <32 x i1>
503  %3 = select <32 x i1> %2, <32 x i8> %1, <32 x i8> zeroinitializer
504  ret <32 x i8> %3
505}
506
507define <32 x i8> @test_mask_packs_epi16_rm_256(<16 x i16> %a, <16 x i16>* %ptr_b) {
508; X86-LABEL: test_mask_packs_epi16_rm_256:
509; X86:       # %bb.0:
510; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
511; X86-NEXT:    vpacksswb (%eax), %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x63,0x00]
512; X86-NEXT:    retl # encoding: [0xc3]
513;
514; X64-LABEL: test_mask_packs_epi16_rm_256:
515; X64:       # %bb.0:
516; X64-NEXT:    vpacksswb (%rdi), %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x63,0x07]
517; X64-NEXT:    retq # encoding: [0xc3]
518  %b = load <16 x i16>, <16 x i16>* %ptr_b
519  %1 = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> %a, <16 x i16> %b)
520  ret <32 x i8> %1
521}
522
523define <32 x i8> @test_mask_packs_epi16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr_b, <32 x i8> %passThru, i32 %mask) {
524; X86-LABEL: test_mask_packs_epi16_rmk_256:
525; X86:       # %bb.0:
526; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
527; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x08]
528; X86-NEXT:    vpacksswb (%eax), %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x63,0x08]
529; X86-NEXT:    vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
530; X86-NEXT:    retl # encoding: [0xc3]
531;
532; X64-LABEL: test_mask_packs_epi16_rmk_256:
533; X64:       # %bb.0:
534; X64-NEXT:    kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce]
535; X64-NEXT:    vpacksswb (%rdi), %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x63,0x0f]
536; X64-NEXT:    vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
537; X64-NEXT:    retq # encoding: [0xc3]
538  %b = load <16 x i16>, <16 x i16>* %ptr_b
539  %1 = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> %a, <16 x i16> %b)
540  %2 = bitcast i32 %mask to <32 x i1>
541  %3 = select <32 x i1> %2, <32 x i8> %1, <32 x i8> %passThru
542  ret <32 x i8> %3
543}
544
545define <32 x i8> @test_mask_packs_epi16_rmkz_256(<16 x i16> %a, <16 x i16>* %ptr_b, i32 %mask) {
546; X86-LABEL: test_mask_packs_epi16_rmkz_256:
547; X86:       # %bb.0:
548; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
549; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x08]
550; X86-NEXT:    vpacksswb (%eax), %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0x63,0x00]
551; X86-NEXT:    retl # encoding: [0xc3]
552;
553; X64-LABEL: test_mask_packs_epi16_rmkz_256:
554; X64:       # %bb.0:
555; X64-NEXT:    kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce]
556; X64-NEXT:    vpacksswb (%rdi), %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0x63,0x07]
557; X64-NEXT:    retq # encoding: [0xc3]
558  %b = load <16 x i16>, <16 x i16>* %ptr_b
559  %1 = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> %a, <16 x i16> %b)
560  %2 = bitcast i32 %mask to <32 x i1>
561  %3 = select <32 x i1> %2, <32 x i8> %1, <32 x i8> zeroinitializer
562  ret <32 x i8> %3
563}
564
565declare <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16>, <16 x i16>)
566
567
568define <8 x i16> @test_mask_packus_epi32_rr_128(<4 x i32> %a, <4 x i32> %b) {
569; CHECK-LABEL: test_mask_packus_epi32_rr_128:
570; CHECK:       # %bb.0:
571; CHECK-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x2b,0xc1]
572; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
573  %1 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a, <4 x i32> %b)
574  ret <8 x i16> %1
575}
576
577define <8 x i16> @test_mask_packus_epi32_rrk_128(<4 x i32> %a, <4 x i32> %b, <8 x i16> %passThru, i8 %mask) {
578; X86-LABEL: test_mask_packus_epi32_rrk_128:
579; X86:       # %bb.0:
580; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
581; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
582; X86-NEXT:    vpackusdw %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x2b,0xd1]
583; X86-NEXT:    vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2]
584; X86-NEXT:    retl # encoding: [0xc3]
585;
586; X64-LABEL: test_mask_packus_epi32_rrk_128:
587; X64:       # %bb.0:
588; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
589; X64-NEXT:    vpackusdw %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x2b,0xd1]
590; X64-NEXT:    vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2]
591; X64-NEXT:    retq # encoding: [0xc3]
592  %1 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a, <4 x i32> %b)
593  %2 = bitcast i8 %mask to <8 x i1>
594  %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> %passThru
595  ret <8 x i16> %3
596}
597
598define <8 x i16> @test_mask_packus_epi32_rrkz_128(<4 x i32> %a, <4 x i32> %b, i8 %mask) {
599; X86-LABEL: test_mask_packus_epi32_rrkz_128:
600; X86:       # %bb.0:
601; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
602; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
603; X86-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x2b,0xc1]
604; X86-NEXT:    retl # encoding: [0xc3]
605;
606; X64-LABEL: test_mask_packus_epi32_rrkz_128:
607; X64:       # %bb.0:
608; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
609; X64-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x2b,0xc1]
610; X64-NEXT:    retq # encoding: [0xc3]
611  %1 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a, <4 x i32> %b)
612  %2 = bitcast i8 %mask to <8 x i1>
613  %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> zeroinitializer
614  ret <8 x i16> %3
615}
616
617define <8 x i16> @test_mask_packus_epi32_rm_128(<4 x i32> %a, <4 x i32>* %ptr_b) {
618; X86-LABEL: test_mask_packus_epi32_rm_128:
619; X86:       # %bb.0:
620; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
621; X86-NEXT:    vpackusdw (%eax), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x2b,0x00]
622; X86-NEXT:    retl # encoding: [0xc3]
623;
624; X64-LABEL: test_mask_packus_epi32_rm_128:
625; X64:       # %bb.0:
626; X64-NEXT:    vpackusdw (%rdi), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x2b,0x07]
627; X64-NEXT:    retq # encoding: [0xc3]
628  %b = load <4 x i32>, <4 x i32>* %ptr_b
629  %1 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a, <4 x i32> %b)
630  ret <8 x i16> %1
631}
632
633define <8 x i16> @test_mask_packus_epi32_rmk_128(<4 x i32> %a, <4 x i32>* %ptr_b, <8 x i16> %passThru, i8 %mask) {
634; X86-LABEL: test_mask_packus_epi32_rmk_128:
635; X86:       # %bb.0:
636; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
637; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
638; X86-NEXT:    kmovd %ecx, %k1 # encoding: [0xc5,0xfb,0x92,0xc9]
639; X86-NEXT:    vpackusdw (%eax), %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x2b,0x08]
640; X86-NEXT:    vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
641; X86-NEXT:    retl # encoding: [0xc3]
642;
643; X64-LABEL: test_mask_packus_epi32_rmk_128:
644; X64:       # %bb.0:
645; X64-NEXT:    kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce]
646; X64-NEXT:    vpackusdw (%rdi), %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x2b,0x0f]
647; X64-NEXT:    vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
648; X64-NEXT:    retq # encoding: [0xc3]
649  %b = load <4 x i32>, <4 x i32>* %ptr_b
650  %1 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a, <4 x i32> %b)
651  %2 = bitcast i8 %mask to <8 x i1>
652  %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> %passThru
653  ret <8 x i16> %3
654}
655
656define <8 x i16> @test_mask_packus_epi32_rmkz_128(<4 x i32> %a, <4 x i32>* %ptr_b, i8 %mask) {
657; X86-LABEL: test_mask_packus_epi32_rmkz_128:
658; X86:       # %bb.0:
659; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
660; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
661; X86-NEXT:    kmovd %ecx, %k1 # encoding: [0xc5,0xfb,0x92,0xc9]
662; X86-NEXT:    vpackusdw (%eax), %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x2b,0x00]
663; X86-NEXT:    retl # encoding: [0xc3]
664;
665; X64-LABEL: test_mask_packus_epi32_rmkz_128:
666; X64:       # %bb.0:
667; X64-NEXT:    kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce]
668; X64-NEXT:    vpackusdw (%rdi), %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x2b,0x07]
669; X64-NEXT:    retq # encoding: [0xc3]
670  %b = load <4 x i32>, <4 x i32>* %ptr_b
671  %1 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a, <4 x i32> %b)
672  %2 = bitcast i8 %mask to <8 x i1>
673  %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> zeroinitializer
674  ret <8 x i16> %3
675}
676
677define <8 x i16> @test_mask_packus_epi32_rmb_128(<4 x i32> %a, i32* %ptr_b) {
678; X86-LABEL: test_mask_packus_epi32_rmb_128:
679; X86:       # %bb.0:
680; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
681; X86-NEXT:    vpackusdw (%eax){1to4}, %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7d,0x18,0x2b,0x00]
682; X86-NEXT:    retl # encoding: [0xc3]
683;
684; X64-LABEL: test_mask_packus_epi32_rmb_128:
685; X64:       # %bb.0:
686; X64-NEXT:    vpackusdw (%rdi){1to4}, %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7d,0x18,0x2b,0x07]
687; X64-NEXT:    retq # encoding: [0xc3]
688  %q = load i32, i32* %ptr_b
689  %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
690  %b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer
691  %1 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a, <4 x i32> %b)
692  ret <8 x i16> %1
693}
694
695define <8 x i16> @test_mask_packus_epi32_rmbk_128(<4 x i32> %a, i32* %ptr_b, <8 x i16> %passThru, i8 %mask) {
696; X86-LABEL: test_mask_packus_epi32_rmbk_128:
697; X86:       # %bb.0:
698; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
699; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
700; X86-NEXT:    kmovd %ecx, %k1 # encoding: [0xc5,0xfb,0x92,0xc9]
701; X86-NEXT:    vpackusdw (%eax){1to4}, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x19,0x2b,0x08]
702; X86-NEXT:    vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
703; X86-NEXT:    retl # encoding: [0xc3]
704;
705; X64-LABEL: test_mask_packus_epi32_rmbk_128:
706; X64:       # %bb.0:
707; X64-NEXT:    kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce]
708; X64-NEXT:    vpackusdw (%rdi){1to4}, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x19,0x2b,0x0f]
709; X64-NEXT:    vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
710; X64-NEXT:    retq # encoding: [0xc3]
711  %q = load i32, i32* %ptr_b
712  %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
713  %b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer
714  %1 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a, <4 x i32> %b)
715  %2 = bitcast i8 %mask to <8 x i1>
716  %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> %passThru
717  ret <8 x i16> %3
718}
719
720define <8 x i16> @test_mask_packus_epi32_rmbkz_128(<4 x i32> %a, i32* %ptr_b, i8 %mask) {
721; X86-LABEL: test_mask_packus_epi32_rmbkz_128:
722; X86:       # %bb.0:
723; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
724; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
725; X86-NEXT:    kmovd %ecx, %k1 # encoding: [0xc5,0xfb,0x92,0xc9]
726; X86-NEXT:    vpackusdw (%eax){1to4}, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x99,0x2b,0x00]
727; X86-NEXT:    retl # encoding: [0xc3]
728;
729; X64-LABEL: test_mask_packus_epi32_rmbkz_128:
730; X64:       # %bb.0:
731; X64-NEXT:    kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce]
732; X64-NEXT:    vpackusdw (%rdi){1to4}, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x99,0x2b,0x07]
733; X64-NEXT:    retq # encoding: [0xc3]
734  %q = load i32, i32* %ptr_b
735  %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
736  %b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer
737  %1 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a, <4 x i32> %b)
738  %2 = bitcast i8 %mask to <8 x i1>
739  %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> zeroinitializer
740  ret <8 x i16> %3
741}
742
743declare <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32>, <4 x i32>)
744
745define <16 x i16> @test_mask_packus_epi32_rr_256(<8 x i32> %a, <8 x i32> %b) {
746; CHECK-LABEL: test_mask_packus_epi32_rr_256:
747; CHECK:       # %bb.0:
748; CHECK-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x2b,0xc1]
749; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
750  %1 = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %a, <8 x i32> %b)
751  ret <16 x i16> %1
752}
753
754define <16 x i16> @test_mask_packus_epi32_rrk_256(<8 x i32> %a, <8 x i32> %b, <16 x i16> %passThru, i16 %mask) {
755; X86-LABEL: test_mask_packus_epi32_rrk_256:
756; X86:       # %bb.0:
757; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
758; X86-NEXT:    vpackusdw %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x2b,0xd1]
759; X86-NEXT:    vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2]
760; X86-NEXT:    retl # encoding: [0xc3]
761;
762; X64-LABEL: test_mask_packus_epi32_rrk_256:
763; X64:       # %bb.0:
764; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
765; X64-NEXT:    vpackusdw %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x2b,0xd1]
766; X64-NEXT:    vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2]
767; X64-NEXT:    retq # encoding: [0xc3]
768  %1 = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %a, <8 x i32> %b)
769  %2 = bitcast i16 %mask to <16 x i1>
770  %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> %passThru
771  ret <16 x i16> %3
772}
773
774define <16 x i16> @test_mask_packus_epi32_rrkz_256(<8 x i32> %a, <8 x i32> %b, i16 %mask) {
775; X86-LABEL: test_mask_packus_epi32_rrkz_256:
776; X86:       # %bb.0:
777; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
778; X86-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x2b,0xc1]
779; X86-NEXT:    retl # encoding: [0xc3]
780;
781; X64-LABEL: test_mask_packus_epi32_rrkz_256:
782; X64:       # %bb.0:
783; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
784; X64-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x2b,0xc1]
785; X64-NEXT:    retq # encoding: [0xc3]
786  %1 = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %a, <8 x i32> %b)
787  %2 = bitcast i16 %mask to <16 x i1>
788  %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> zeroinitializer
789  ret <16 x i16> %3
790}
791
792define <16 x i16> @test_mask_packus_epi32_rm_256(<8 x i32> %a, <8 x i32>* %ptr_b) {
793; X86-LABEL: test_mask_packus_epi32_rm_256:
794; X86:       # %bb.0:
795; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
796; X86-NEXT:    vpackusdw (%eax), %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x2b,0x00]
797; X86-NEXT:    retl # encoding: [0xc3]
798;
799; X64-LABEL: test_mask_packus_epi32_rm_256:
800; X64:       # %bb.0:
801; X64-NEXT:    vpackusdw (%rdi), %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x2b,0x07]
802; X64-NEXT:    retq # encoding: [0xc3]
803  %b = load <8 x i32>, <8 x i32>* %ptr_b
804  %1 = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %a, <8 x i32> %b)
805  ret <16 x i16> %1
806}
807
808define <16 x i16> @test_mask_packus_epi32_rmk_256(<8 x i32> %a, <8 x i32>* %ptr_b, <16 x i16> %passThru, i16 %mask) {
809; X86-LABEL: test_mask_packus_epi32_rmk_256:
810; X86:       # %bb.0:
811; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
812; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08]
813; X86-NEXT:    vpackusdw (%eax), %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x2b,0x08]
814; X86-NEXT:    vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
815; X86-NEXT:    retl # encoding: [0xc3]
816;
817; X64-LABEL: test_mask_packus_epi32_rmk_256:
818; X64:       # %bb.0:
819; X64-NEXT:    kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce]
820; X64-NEXT:    vpackusdw (%rdi), %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x2b,0x0f]
821; X64-NEXT:    vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
822; X64-NEXT:    retq # encoding: [0xc3]
823  %b = load <8 x i32>, <8 x i32>* %ptr_b
824  %1 = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %a, <8 x i32> %b)
825  %2 = bitcast i16 %mask to <16 x i1>
826  %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> %passThru
827  ret <16 x i16> %3
828}
829
830define <16 x i16> @test_mask_packus_epi32_rmkz_256(<8 x i32> %a, <8 x i32>* %ptr_b, i16 %mask) {
831; X86-LABEL: test_mask_packus_epi32_rmkz_256:
832; X86:       # %bb.0:
833; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
834; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08]
835; X86-NEXT:    vpackusdw (%eax), %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x2b,0x00]
836; X86-NEXT:    retl # encoding: [0xc3]
837;
838; X64-LABEL: test_mask_packus_epi32_rmkz_256:
839; X64:       # %bb.0:
840; X64-NEXT:    kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce]
841; X64-NEXT:    vpackusdw (%rdi), %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x2b,0x07]
842; X64-NEXT:    retq # encoding: [0xc3]
843  %b = load <8 x i32>, <8 x i32>* %ptr_b
844  %1 = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %a, <8 x i32> %b)
845  %2 = bitcast i16 %mask to <16 x i1>
846  %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> zeroinitializer
847  ret <16 x i16> %3
848}
849
850define <16 x i16> @test_mask_packus_epi32_rmb_256(<8 x i32> %a, i32* %ptr_b) {
851; X86-LABEL: test_mask_packus_epi32_rmb_256:
852; X86:       # %bb.0:
853; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
854; X86-NEXT:    vpackusdw (%eax){1to8}, %ymm0, %ymm0 # encoding: [0x62,0xf2,0x7d,0x38,0x2b,0x00]
855; X86-NEXT:    retl # encoding: [0xc3]
856;
857; X64-LABEL: test_mask_packus_epi32_rmb_256:
858; X64:       # %bb.0:
859; X64-NEXT:    vpackusdw (%rdi){1to8}, %ymm0, %ymm0 # encoding: [0x62,0xf2,0x7d,0x38,0x2b,0x07]
860; X64-NEXT:    retq # encoding: [0xc3]
861  %q = load i32, i32* %ptr_b
862  %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
863  %b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer
864  %1 = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %a, <8 x i32> %b)
865  ret <16 x i16> %1
866}
867
868define <16 x i16> @test_mask_packus_epi32_rmbk_256(<8 x i32> %a, i32* %ptr_b, <16 x i16> %passThru, i16 %mask) {
869; X86-LABEL: test_mask_packus_epi32_rmbk_256:
870; X86:       # %bb.0:
871; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
872; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08]
873; X86-NEXT:    vpackusdw (%eax){1to8}, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x39,0x2b,0x08]
874; X86-NEXT:    vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
875; X86-NEXT:    retl # encoding: [0xc3]
876;
877; X64-LABEL: test_mask_packus_epi32_rmbk_256:
878; X64:       # %bb.0:
879; X64-NEXT:    kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce]
880; X64-NEXT:    vpackusdw (%rdi){1to8}, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x39,0x2b,0x0f]
881; X64-NEXT:    vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
882; X64-NEXT:    retq # encoding: [0xc3]
883  %q = load i32, i32* %ptr_b
884  %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
885  %b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer
886  %1 = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %a, <8 x i32> %b)
887  %2 = bitcast i16 %mask to <16 x i1>
888  %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> %passThru
889  ret <16 x i16> %3
890}
891
892define <16 x i16> @test_mask_packus_epi32_rmbkz_256(<8 x i32> %a, i32* %ptr_b, i16 %mask) {
893; X86-LABEL: test_mask_packus_epi32_rmbkz_256:
894; X86:       # %bb.0:
895; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
896; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08]
897; X86-NEXT:    vpackusdw (%eax){1to8}, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xb9,0x2b,0x00]
898; X86-NEXT:    retl # encoding: [0xc3]
899;
900; X64-LABEL: test_mask_packus_epi32_rmbkz_256:
901; X64:       # %bb.0:
902; X64-NEXT:    kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce]
903; X64-NEXT:    vpackusdw (%rdi){1to8}, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xb9,0x2b,0x07]
904; X64-NEXT:    retq # encoding: [0xc3]
905  %q = load i32, i32* %ptr_b
906  %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
907  %b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer
908  %1 = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %a, <8 x i32> %b)
909  %2 = bitcast i16 %mask to <16 x i1>
910  %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> zeroinitializer
911  ret <16 x i16> %3
912}
913
914declare <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32>, <8 x i32>)
915
916define <16 x i8> @test_mask_packus_epi16_rr_128(<8 x i16> %a, <8 x i16> %b) {
917; CHECK-LABEL: test_mask_packus_epi16_rr_128:
918; CHECK:       # %bb.0:
919; CHECK-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x67,0xc1]
920; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
921  %1 = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %a, <8 x i16> %b)
922  ret <16 x i8> %1
923}
924
925define <16 x i8> @test_mask_packus_epi16_rrk_128(<8 x i16> %a, <8 x i16> %b, <16 x i8> %passThru, i16 %mask) {
926; X86-LABEL: test_mask_packus_epi16_rrk_128:
927; X86:       # %bb.0:
928; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
929; X86-NEXT:    vpackuswb %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x67,0xd1]
930; X86-NEXT:    vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2]
931; X86-NEXT:    retl # encoding: [0xc3]
932;
933; X64-LABEL: test_mask_packus_epi16_rrk_128:
934; X64:       # %bb.0:
935; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
936; X64-NEXT:    vpackuswb %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x67,0xd1]
937; X64-NEXT:    vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2]
938; X64-NEXT:    retq # encoding: [0xc3]
939  %1 = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %a, <8 x i16> %b)
940  %2 = bitcast i16 %mask to <16 x i1>
941  %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> %passThru
942  ret <16 x i8> %3
943}
944
945define <16 x i8> @test_mask_packus_epi16_rrkz_128(<8 x i16> %a, <8 x i16> %b, i16 %mask) {
946; X86-LABEL: test_mask_packus_epi16_rrkz_128:
947; X86:       # %bb.0:
948; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
949; X86-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0x67,0xc1]
950; X86-NEXT:    retl # encoding: [0xc3]
951;
952; X64-LABEL: test_mask_packus_epi16_rrkz_128:
953; X64:       # %bb.0:
954; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
955; X64-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0x67,0xc1]
956; X64-NEXT:    retq # encoding: [0xc3]
957  %1 = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %a, <8 x i16> %b)
958  %2 = bitcast i16 %mask to <16 x i1>
959  %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> zeroinitializer
960  ret <16 x i8> %3
961}
962
963define <16 x i8> @test_mask_packus_epi16_rm_128(<8 x i16> %a, <8 x i16>* %ptr_b) {
964; X86-LABEL: test_mask_packus_epi16_rm_128:
965; X86:       # %bb.0:
966; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
967; X86-NEXT:    vpackuswb (%eax), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x67,0x00]
968; X86-NEXT:    retl # encoding: [0xc3]
969;
970; X64-LABEL: test_mask_packus_epi16_rm_128:
971; X64:       # %bb.0:
972; X64-NEXT:    vpackuswb (%rdi), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x67,0x07]
973; X64-NEXT:    retq # encoding: [0xc3]
974  %b = load <8 x i16>, <8 x i16>* %ptr_b
975  %1 = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %a, <8 x i16> %b)
976  ret <16 x i8> %1
977}
978
979define <16 x i8> @test_mask_packus_epi16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b, <16 x i8> %passThru, i16 %mask) {
980; X86-LABEL: test_mask_packus_epi16_rmk_128:
981; X86:       # %bb.0:
982; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
983; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08]
984; X86-NEXT:    vpackuswb (%eax), %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x67,0x08]
985; X86-NEXT:    vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
986; X86-NEXT:    retl # encoding: [0xc3]
987;
988; X64-LABEL: test_mask_packus_epi16_rmk_128:
989; X64:       # %bb.0:
990; X64-NEXT:    kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce]
991; X64-NEXT:    vpackuswb (%rdi), %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x67,0x0f]
992; X64-NEXT:    vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
993; X64-NEXT:    retq # encoding: [0xc3]
994  %b = load <8 x i16>, <8 x i16>* %ptr_b
995  %1 = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %a, <8 x i16> %b)
996  %2 = bitcast i16 %mask to <16 x i1>
997  %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> %passThru
998  ret <16 x i8> %3
999}
1000
1001define <16 x i8> @test_mask_packus_epi16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_b, i16 %mask) {
1002; X86-LABEL: test_mask_packus_epi16_rmkz_128:
1003; X86:       # %bb.0:
1004; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
1005; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08]
1006; X86-NEXT:    vpackuswb (%eax), %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0x67,0x00]
1007; X86-NEXT:    retl # encoding: [0xc3]
1008;
1009; X64-LABEL: test_mask_packus_epi16_rmkz_128:
1010; X64:       # %bb.0:
1011; X64-NEXT:    kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce]
1012; X64-NEXT:    vpackuswb (%rdi), %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0x67,0x07]
1013; X64-NEXT:    retq # encoding: [0xc3]
1014  %b = load <8 x i16>, <8 x i16>* %ptr_b
1015  %1 = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %a, <8 x i16> %b)
1016  %2 = bitcast i16 %mask to <16 x i1>
1017  %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> zeroinitializer
1018  ret <16 x i8> %3
1019}
1020
1021declare <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16>, <8 x i16>)
1022
1023define <32 x i8> @test_mask_packus_epi16_rr_256(<16 x i16> %a, <16 x i16> %b) {
1024; CHECK-LABEL: test_mask_packus_epi16_rr_256:
1025; CHECK:       # %bb.0:
1026; CHECK-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x67,0xc1]
1027; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
1028  %1 = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %a, <16 x i16> %b)
1029  ret <32 x i8> %1
1030}
1031
1032define <32 x i8> @test_mask_packus_epi16_rrk_256(<16 x i16> %a, <16 x i16> %b, <32 x i8> %passThru, i32 %mask) {
1033; X86-LABEL: test_mask_packus_epi16_rrk_256:
1034; X86:       # %bb.0:
1035; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
1036; X86-NEXT:    vpackuswb %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x67,0xd1]
1037; X86-NEXT:    vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2]
1038; X86-NEXT:    retl # encoding: [0xc3]
1039;
1040; X64-LABEL: test_mask_packus_epi16_rrk_256:
1041; X64:       # %bb.0:
1042; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
1043; X64-NEXT:    vpackuswb %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x67,0xd1]
1044; X64-NEXT:    vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2]
1045; X64-NEXT:    retq # encoding: [0xc3]
1046  %1 = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %a, <16 x i16> %b)
1047  %2 = bitcast i32 %mask to <32 x i1>
1048  %3 = select <32 x i1> %2, <32 x i8> %1, <32 x i8> %passThru
1049  ret <32 x i8> %3
1050}
1051
1052define <32 x i8> @test_mask_packus_epi16_rrkz_256(<16 x i16> %a, <16 x i16> %b, i32 %mask) {
1053; X86-LABEL: test_mask_packus_epi16_rrkz_256:
1054; X86:       # %bb.0:
1055; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
1056; X86-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0x67,0xc1]
1057; X86-NEXT:    retl # encoding: [0xc3]
1058;
1059; X64-LABEL: test_mask_packus_epi16_rrkz_256:
1060; X64:       # %bb.0:
1061; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
1062; X64-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0x67,0xc1]
1063; X64-NEXT:    retq # encoding: [0xc3]
1064  %1 = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %a, <16 x i16> %b)
1065  %2 = bitcast i32 %mask to <32 x i1>
1066  %3 = select <32 x i1> %2, <32 x i8> %1, <32 x i8> zeroinitializer
1067  ret <32 x i8> %3
1068}
1069
1070define <32 x i8> @test_mask_packus_epi16_rm_256(<16 x i16> %a, <16 x i16>* %ptr_b) {
1071; X86-LABEL: test_mask_packus_epi16_rm_256:
1072; X86:       # %bb.0:
1073; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
1074; X86-NEXT:    vpackuswb (%eax), %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x67,0x00]
1075; X86-NEXT:    retl # encoding: [0xc3]
1076;
1077; X64-LABEL: test_mask_packus_epi16_rm_256:
1078; X64:       # %bb.0:
1079; X64-NEXT:    vpackuswb (%rdi), %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x67,0x07]
1080; X64-NEXT:    retq # encoding: [0xc3]
1081  %b = load <16 x i16>, <16 x i16>* %ptr_b
1082  %1 = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %a, <16 x i16> %b)
1083  ret <32 x i8> %1
1084}
1085
1086define <32 x i8> @test_mask_packus_epi16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr_b, <32 x i8> %passThru, i32 %mask) {
1087; X86-LABEL: test_mask_packus_epi16_rmk_256:
1088; X86:       # %bb.0:
1089; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
1090; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x08]
1091; X86-NEXT:    vpackuswb (%eax), %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x67,0x08]
1092; X86-NEXT:    vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
1093; X86-NEXT:    retl # encoding: [0xc3]
1094;
1095; X64-LABEL: test_mask_packus_epi16_rmk_256:
1096; X64:       # %bb.0:
1097; X64-NEXT:    kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce]
1098; X64-NEXT:    vpackuswb (%rdi), %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x67,0x0f]
1099; X64-NEXT:    vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
1100; X64-NEXT:    retq # encoding: [0xc3]
1101  %b = load <16 x i16>, <16 x i16>* %ptr_b
1102  %1 = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %a, <16 x i16> %b)
1103  %2 = bitcast i32 %mask to <32 x i1>
1104  %3 = select <32 x i1> %2, <32 x i8> %1, <32 x i8> %passThru
1105  ret <32 x i8> %3
1106}
1107
1108define <32 x i8> @test_mask_packus_epi16_rmkz_256(<16 x i16> %a, <16 x i16>* %ptr_b, i32 %mask) {
1109; X86-LABEL: test_mask_packus_epi16_rmkz_256:
1110; X86:       # %bb.0:
1111; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
1112; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x08]
1113; X86-NEXT:    vpackuswb (%eax), %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0x67,0x00]
1114; X86-NEXT:    retl # encoding: [0xc3]
1115;
1116; X64-LABEL: test_mask_packus_epi16_rmkz_256:
1117; X64:       # %bb.0:
1118; X64-NEXT:    kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce]
1119; X64-NEXT:    vpackuswb (%rdi), %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0x67,0x07]
1120; X64-NEXT:    retq # encoding: [0xc3]
1121  %b = load <16 x i16>, <16 x i16>* %ptr_b
1122  %1 = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %a, <16 x i16> %b)
1123  %2 = bitcast i32 %mask to <32 x i1>
1124  %3 = select <32 x i1> %2, <32 x i8> %1, <32 x i8> zeroinitializer
1125  ret <32 x i8> %3
1126}
1127
1128declare <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16>, <16 x i16>)
1129
1130define <8 x i16> @test_mask_adds_epi16_rr_128(<8 x i16> %a, <8 x i16> %b) {
1131; CHECK-LABEL: test_mask_adds_epi16_rr_128:
1132; CHECK:       # %bb.0:
1133; CHECK-NEXT:    vpaddsw %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xed,0xc1]
1134; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
1135  %res = call <8 x i16> @llvm.x86.avx512.mask.padds.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 -1)
1136  ret <8 x i16> %res
1137}
1138
1139define <8 x i16> @test_mask_adds_epi16_rrk_128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) {
1140; X86-LABEL: test_mask_adds_epi16_rrk_128:
1141; X86:       # %bb.0:
1142; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
1143; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
1144; X86-NEXT:    vpaddsw %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xed,0xd1]
1145; X86-NEXT:    vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2]
1146; X86-NEXT:    retl # encoding: [0xc3]
1147;
1148; X64-LABEL: test_mask_adds_epi16_rrk_128:
1149; X64:       # %bb.0:
1150; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
1151; X64-NEXT:    vpaddsw %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xed,0xd1]
1152; X64-NEXT:    vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2]
1153; X64-NEXT:    retq # encoding: [0xc3]
1154  %res = call <8 x i16> @llvm.x86.avx512.mask.padds.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask)
1155  ret <8 x i16> %res
1156}
1157
1158define <8 x i16> @test_mask_adds_epi16_rrkz_128(<8 x i16> %a, <8 x i16> %b, i8 %mask) {
1159; X86-LABEL: test_mask_adds_epi16_rrkz_128:
1160; X86:       # %bb.0:
1161; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
1162; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
1163; X86-NEXT:    vpaddsw %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0xed,0xc1]
1164; X86-NEXT:    retl # encoding: [0xc3]
1165;
1166; X64-LABEL: test_mask_adds_epi16_rrkz_128:
1167; X64:       # %bb.0:
1168; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
1169; X64-NEXT:    vpaddsw %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0xed,0xc1]
1170; X64-NEXT:    retq # encoding: [0xc3]
1171  %res = call <8 x i16> @llvm.x86.avx512.mask.padds.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 %mask)
1172  ret <8 x i16> %res
1173}
1174
1175define <8 x i16> @test_mask_adds_epi16_rm_128(<8 x i16> %a, <8 x i16>* %ptr_b) {
1176; X86-LABEL: test_mask_adds_epi16_rm_128:
1177; X86:       # %bb.0:
1178; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
1179; X86-NEXT:    vpaddsw (%eax), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xed,0x00]
1180; X86-NEXT:    retl # encoding: [0xc3]
1181;
1182; X64-LABEL: test_mask_adds_epi16_rm_128:
1183; X64:       # %bb.0:
1184; X64-NEXT:    vpaddsw (%rdi), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xed,0x07]
1185; X64-NEXT:    retq # encoding: [0xc3]
1186  %b = load <8 x i16>, <8 x i16>* %ptr_b
1187  %res = call <8 x i16> @llvm.x86.avx512.mask.padds.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 -1)
1188  ret <8 x i16> %res
1189}
1190
1191define <8 x i16> @test_mask_adds_epi16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b, <8 x i16> %passThru, i8 %mask) {
1192; X86-LABEL: test_mask_adds_epi16_rmk_128:
1193; X86:       # %bb.0:
1194; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
1195; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
1196; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
1197; X86-NEXT:    vpaddsw (%eax), %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xed,0x08]
1198; X86-NEXT:    vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
1199; X86-NEXT:    retl # encoding: [0xc3]
1200;
1201; X64-LABEL: test_mask_adds_epi16_rmk_128:
1202; X64:       # %bb.0:
1203; X64-NEXT:    kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce]
1204; X64-NEXT:    vpaddsw (%rdi), %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xed,0x0f]
1205; X64-NEXT:    vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
1206; X64-NEXT:    retq # encoding: [0xc3]
1207  %b = load <8 x i16>, <8 x i16>* %ptr_b
1208  %res = call <8 x i16> @llvm.x86.avx512.mask.padds.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask)
1209  ret <8 x i16> %res
1210}
1211
1212define <8 x i16> @test_mask_adds_epi16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_b, i8 %mask) {
1213; X86-LABEL: test_mask_adds_epi16_rmkz_128:
1214; X86:       # %bb.0:
1215; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
1216; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
1217; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
1218; X86-NEXT:    vpaddsw (%eax), %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0xed,0x00]
1219; X86-NEXT:    retl # encoding: [0xc3]
1220;
1221; X64-LABEL: test_mask_adds_epi16_rmkz_128:
1222; X64:       # %bb.0:
1223; X64-NEXT:    kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce]
1224; X64-NEXT:    vpaddsw (%rdi), %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0xed,0x07]
1225; X64-NEXT:    retq # encoding: [0xc3]
1226  %b = load <8 x i16>, <8 x i16>* %ptr_b
1227  %res = call <8 x i16> @llvm.x86.avx512.mask.padds.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 %mask)
1228  ret <8 x i16> %res
1229}
1230
1231declare <8 x i16> @llvm.x86.avx512.mask.padds.w.128(<8 x i16>, <8 x i16>, <8 x i16>, i8)
1232
1233define <16 x i16> @test_mask_adds_epi16_rr_256(<16 x i16> %a, <16 x i16> %b) {
1234; CHECK-LABEL: test_mask_adds_epi16_rr_256:
1235; CHECK:       # %bb.0:
1236; CHECK-NEXT:    vpaddsw %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xed,0xc1]
1237; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
1238  %res = call <16 x i16> @llvm.x86.avx512.mask.padds.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 -1)
1239  ret <16 x i16> %res
1240}
1241
1242define <16 x i16> @test_mask_adds_epi16_rrk_256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask) {
1243; X86-LABEL: test_mask_adds_epi16_rrk_256:
1244; X86:       # %bb.0:
1245; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
1246; X86-NEXT:    vpaddsw %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xed,0xd1]
1247; X86-NEXT:    vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2]
1248; X86-NEXT:    retl # encoding: [0xc3]
1249;
1250; X64-LABEL: test_mask_adds_epi16_rrk_256:
1251; X64:       # %bb.0:
1252; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
1253; X64-NEXT:    vpaddsw %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xed,0xd1]
1254; X64-NEXT:    vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2]
1255; X64-NEXT:    retq # encoding: [0xc3]
1256  %res = call <16 x i16> @llvm.x86.avx512.mask.padds.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask)
1257  ret <16 x i16> %res
1258}
1259
1260define <16 x i16> @test_mask_adds_epi16_rrkz_256(<16 x i16> %a, <16 x i16> %b, i16 %mask) {
1261; X86-LABEL: test_mask_adds_epi16_rrkz_256:
1262; X86:       # %bb.0:
1263; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
1264; X86-NEXT:    vpaddsw %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0xed,0xc1]
1265; X86-NEXT:    retl # encoding: [0xc3]
1266;
1267; X64-LABEL: test_mask_adds_epi16_rrkz_256:
1268; X64:       # %bb.0:
1269; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
1270; X64-NEXT:    vpaddsw %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0xed,0xc1]
1271; X64-NEXT:    retq # encoding: [0xc3]
1272  %res = call <16 x i16> @llvm.x86.avx512.mask.padds.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 %mask)
1273  ret <16 x i16> %res
1274}
1275
1276define <16 x i16> @test_mask_adds_epi16_rm_256(<16 x i16> %a, <16 x i16>* %ptr_b) {
1277; X86-LABEL: test_mask_adds_epi16_rm_256:
1278; X86:       # %bb.0:
1279; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
1280; X86-NEXT:    vpaddsw (%eax), %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xed,0x00]
1281; X86-NEXT:    retl # encoding: [0xc3]
1282;
1283; X64-LABEL: test_mask_adds_epi16_rm_256:
1284; X64:       # %bb.0:
1285; X64-NEXT:    vpaddsw (%rdi), %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xed,0x07]
1286; X64-NEXT:    retq # encoding: [0xc3]
1287  %b = load <16 x i16>, <16 x i16>* %ptr_b
1288  %res = call <16 x i16> @llvm.x86.avx512.mask.padds.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 -1)
1289  ret <16 x i16> %res
1290}
1291
1292define <16 x i16> @test_mask_adds_epi16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr_b, <16 x i16> %passThru, i16 %mask) {
1293; X86-LABEL: test_mask_adds_epi16_rmk_256:
1294; X86:       # %bb.0:
1295; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
1296; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08]
1297; X86-NEXT:    vpaddsw (%eax), %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xed,0x08]
1298; X86-NEXT:    vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
1299; X86-NEXT:    retl # encoding: [0xc3]
1300;
1301; X64-LABEL: test_mask_adds_epi16_rmk_256:
1302; X64:       # %bb.0:
1303; X64-NEXT:    kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce]
1304; X64-NEXT:    vpaddsw (%rdi), %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xed,0x0f]
1305; X64-NEXT:    vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
1306; X64-NEXT:    retq # encoding: [0xc3]
1307  %b = load <16 x i16>, <16 x i16>* %ptr_b
1308  %res = call <16 x i16> @llvm.x86.avx512.mask.padds.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask)
1309  ret <16 x i16> %res
1310}
1311
1312define <16 x i16> @test_mask_adds_epi16_rmkz_256(<16 x i16> %a, <16 x i16>* %ptr_b, i16 %mask) {
1313; X86-LABEL: test_mask_adds_epi16_rmkz_256:
1314; X86:       # %bb.0:
1315; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
1316; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08]
1317; X86-NEXT:    vpaddsw (%eax), %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0xed,0x00]
1318; X86-NEXT:    retl # encoding: [0xc3]
1319;
1320; X64-LABEL: test_mask_adds_epi16_rmkz_256:
1321; X64:       # %bb.0:
1322; X64-NEXT:    kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce]
1323; X64-NEXT:    vpaddsw (%rdi), %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0xed,0x07]
1324; X64-NEXT:    retq # encoding: [0xc3]
1325  %b = load <16 x i16>, <16 x i16>* %ptr_b
1326  %res = call <16 x i16> @llvm.x86.avx512.mask.padds.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 %mask)
1327  ret <16 x i16> %res
1328}
1329
1330declare <16 x i16> @llvm.x86.avx512.mask.padds.w.256(<16 x i16>, <16 x i16>, <16 x i16>, i16)
1331
1332define <8 x i16> @test_mask_subs_epi16_rr_128(<8 x i16> %a, <8 x i16> %b) {
1333; CHECK-LABEL: test_mask_subs_epi16_rr_128:
1334; CHECK:       # %bb.0:
1335; CHECK-NEXT:    vpsubsw %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe9,0xc1]
1336; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
1337  %res = call <8 x i16> @llvm.x86.avx512.mask.psubs.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 -1)
1338  ret <8 x i16> %res
1339}
1340
1341define <8 x i16> @test_mask_subs_epi16_rrk_128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) {
1342; X86-LABEL: test_mask_subs_epi16_rrk_128:
1343; X86:       # %bb.0:
1344; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
1345; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
1346; X86-NEXT:    vpsubsw %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xe9,0xd1]
1347; X86-NEXT:    vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2]
1348; X86-NEXT:    retl # encoding: [0xc3]
1349;
1350; X64-LABEL: test_mask_subs_epi16_rrk_128:
1351; X64:       # %bb.0:
1352; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
1353; X64-NEXT:    vpsubsw %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xe9,0xd1]
1354; X64-NEXT:    vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2]
1355; X64-NEXT:    retq # encoding: [0xc3]
1356  %res = call <8 x i16> @llvm.x86.avx512.mask.psubs.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask)
1357  ret <8 x i16> %res
1358}
1359
1360define <8 x i16> @test_mask_subs_epi16_rrkz_128(<8 x i16> %a, <8 x i16> %b, i8 %mask) {
1361; X86-LABEL: test_mask_subs_epi16_rrkz_128:
1362; X86:       # %bb.0:
1363; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
1364; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
1365; X86-NEXT:    vpsubsw %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0xe9,0xc1]
1366; X86-NEXT:    retl # encoding: [0xc3]
1367;
1368; X64-LABEL: test_mask_subs_epi16_rrkz_128:
1369; X64:       # %bb.0:
1370; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
1371; X64-NEXT:    vpsubsw %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0xe9,0xc1]
1372; X64-NEXT:    retq # encoding: [0xc3]
1373  %res = call <8 x i16> @llvm.x86.avx512.mask.psubs.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 %mask)
1374  ret <8 x i16> %res
1375}
1376
1377define <8 x i16> @test_mask_subs_epi16_rm_128(<8 x i16> %a, <8 x i16>* %ptr_b) {
1378; X86-LABEL: test_mask_subs_epi16_rm_128:
1379; X86:       # %bb.0:
1380; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
1381; X86-NEXT:    vpsubsw (%eax), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe9,0x00]
1382; X86-NEXT:    retl # encoding: [0xc3]
1383;
1384; X64-LABEL: test_mask_subs_epi16_rm_128:
1385; X64:       # %bb.0:
1386; X64-NEXT:    vpsubsw (%rdi), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe9,0x07]
1387; X64-NEXT:    retq # encoding: [0xc3]
1388  %b = load <8 x i16>, <8 x i16>* %ptr_b
1389  %res = call <8 x i16> @llvm.x86.avx512.mask.psubs.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 -1)
1390  ret <8 x i16> %res
1391}
1392
1393define <8 x i16> @test_mask_subs_epi16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b, <8 x i16> %passThru, i8 %mask) {
1394; X86-LABEL: test_mask_subs_epi16_rmk_128:
1395; X86:       # %bb.0:
1396; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
1397; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
1398; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
1399; X86-NEXT:    vpsubsw (%eax), %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xe9,0x08]
1400; X86-NEXT:    vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
1401; X86-NEXT:    retl # encoding: [0xc3]
1402;
1403; X64-LABEL: test_mask_subs_epi16_rmk_128:
1404; X64:       # %bb.0:
1405; X64-NEXT:    kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce]
1406; X64-NEXT:    vpsubsw (%rdi), %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xe9,0x0f]
1407; X64-NEXT:    vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
1408; X64-NEXT:    retq # encoding: [0xc3]
1409  %b = load <8 x i16>, <8 x i16>* %ptr_b
1410  %res = call <8 x i16> @llvm.x86.avx512.mask.psubs.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask)
1411  ret <8 x i16> %res
1412}
1413
1414define <8 x i16> @test_mask_subs_epi16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_b, i8 %mask) {
1415; X86-LABEL: test_mask_subs_epi16_rmkz_128:
1416; X86:       # %bb.0:
1417; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
1418; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
1419; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
1420; X86-NEXT:    vpsubsw (%eax), %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0xe9,0x00]
1421; X86-NEXT:    retl # encoding: [0xc3]
1422;
1423; X64-LABEL: test_mask_subs_epi16_rmkz_128:
1424; X64:       # %bb.0:
1425; X64-NEXT:    kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce]
1426; X64-NEXT:    vpsubsw (%rdi), %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0xe9,0x07]
1427; X64-NEXT:    retq # encoding: [0xc3]
1428  %b = load <8 x i16>, <8 x i16>* %ptr_b
1429  %res = call <8 x i16> @llvm.x86.avx512.mask.psubs.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 %mask)
1430  ret <8 x i16> %res
1431}
1432
1433declare <8 x i16> @llvm.x86.avx512.mask.psubs.w.128(<8 x i16>, <8 x i16>, <8 x i16>, i8)
1434
1435define <16 x i16> @test_mask_subs_epi16_rr_256(<16 x i16> %a, <16 x i16> %b) {
1436; CHECK-LABEL: test_mask_subs_epi16_rr_256:
1437; CHECK:       # %bb.0:
1438; CHECK-NEXT:    vpsubsw %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe9,0xc1]
1439; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
1440  %res = call <16 x i16> @llvm.x86.avx512.mask.psubs.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 -1)
1441  ret <16 x i16> %res
1442}
1443
1444define <16 x i16> @test_mask_subs_epi16_rrk_256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask) {
1445; X86-LABEL: test_mask_subs_epi16_rrk_256:
1446; X86:       # %bb.0:
1447; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
1448; X86-NEXT:    vpsubsw %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xe9,0xd1]
1449; X86-NEXT:    vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2]
1450; X86-NEXT:    retl # encoding: [0xc3]
1451;
1452; X64-LABEL: test_mask_subs_epi16_rrk_256:
1453; X64:       # %bb.0:
1454; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
1455; X64-NEXT:    vpsubsw %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xe9,0xd1]
1456; X64-NEXT:    vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2]
1457; X64-NEXT:    retq # encoding: [0xc3]
1458  %res = call <16 x i16> @llvm.x86.avx512.mask.psubs.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask)
1459  ret <16 x i16> %res
1460}
1461
1462define <16 x i16> @test_mask_subs_epi16_rrkz_256(<16 x i16> %a, <16 x i16> %b, i16 %mask) {
1463; X86-LABEL: test_mask_subs_epi16_rrkz_256:
1464; X86:       # %bb.0:
1465; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
1466; X86-NEXT:    vpsubsw %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0xe9,0xc1]
1467; X86-NEXT:    retl # encoding: [0xc3]
1468;
1469; X64-LABEL: test_mask_subs_epi16_rrkz_256:
1470; X64:       # %bb.0:
1471; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
1472; X64-NEXT:    vpsubsw %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0xe9,0xc1]
1473; X64-NEXT:    retq # encoding: [0xc3]
1474  %res = call <16 x i16> @llvm.x86.avx512.mask.psubs.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 %mask)
1475  ret <16 x i16> %res
1476}
1477
1478define <16 x i16> @test_mask_subs_epi16_rm_256(<16 x i16> %a, <16 x i16>* %ptr_b) {
1479; X86-LABEL: test_mask_subs_epi16_rm_256:
1480; X86:       # %bb.0:
1481; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
1482; X86-NEXT:    vpsubsw (%eax), %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe9,0x00]
1483; X86-NEXT:    retl # encoding: [0xc3]
1484;
1485; X64-LABEL: test_mask_subs_epi16_rm_256:
1486; X64:       # %bb.0:
1487; X64-NEXT:    vpsubsw (%rdi), %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe9,0x07]
1488; X64-NEXT:    retq # encoding: [0xc3]
1489  %b = load <16 x i16>, <16 x i16>* %ptr_b
1490  %res = call <16 x i16> @llvm.x86.avx512.mask.psubs.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 -1)
1491  ret <16 x i16> %res
1492}
1493
1494define <16 x i16> @test_mask_subs_epi16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr_b, <16 x i16> %passThru, i16 %mask) {
1495; X86-LABEL: test_mask_subs_epi16_rmk_256:
1496; X86:       # %bb.0:
1497; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
1498; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08]
1499; X86-NEXT:    vpsubsw (%eax), %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xe9,0x08]
1500; X86-NEXT:    vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
1501; X86-NEXT:    retl # encoding: [0xc3]
1502;
1503; X64-LABEL: test_mask_subs_epi16_rmk_256:
1504; X64:       # %bb.0:
1505; X64-NEXT:    kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce]
1506; X64-NEXT:    vpsubsw (%rdi), %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xe9,0x0f]
1507; X64-NEXT:    vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
1508; X64-NEXT:    retq # encoding: [0xc3]
1509  %b = load <16 x i16>, <16 x i16>* %ptr_b
1510  %res = call <16 x i16> @llvm.x86.avx512.mask.psubs.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask)
1511  ret <16 x i16> %res
1512}
1513
1514define <16 x i16> @test_mask_subs_epi16_rmkz_256(<16 x i16> %a, <16 x i16>* %ptr_b, i16 %mask) {
1515; X86-LABEL: test_mask_subs_epi16_rmkz_256:
1516; X86:       # %bb.0:
1517; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
1518; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08]
1519; X86-NEXT:    vpsubsw (%eax), %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0xe9,0x00]
1520; X86-NEXT:    retl # encoding: [0xc3]
1521;
1522; X64-LABEL: test_mask_subs_epi16_rmkz_256:
1523; X64:       # %bb.0:
1524; X64-NEXT:    kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce]
1525; X64-NEXT:    vpsubsw (%rdi), %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0xe9,0x07]
1526; X64-NEXT:    retq # encoding: [0xc3]
1527  %b = load <16 x i16>, <16 x i16>* %ptr_b
1528  %res = call <16 x i16> @llvm.x86.avx512.mask.psubs.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 %mask)
1529  ret <16 x i16> %res
1530}
1531
1532declare <16 x i16> @llvm.x86.avx512.mask.psubs.w.256(<16 x i16>, <16 x i16>, <16 x i16>, i16)
1533
1534define <8 x i16> @test_mask_adds_epu16_rr_128(<8 x i16> %a, <8 x i16> %b) {
1535; CHECK-LABEL: test_mask_adds_epu16_rr_128:
1536; CHECK:       # %bb.0:
1537; CHECK-NEXT:    vpaddusw %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdd,0xc1]
1538; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
1539  %res = call <8 x i16> @llvm.x86.avx512.mask.paddus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 -1)
1540  ret <8 x i16> %res
1541}
1542
1543define <8 x i16> @test_mask_adds_epu16_rrk_128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) {
1544; X86-LABEL: test_mask_adds_epu16_rrk_128:
1545; X86:       # %bb.0:
1546; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
1547; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
1548; X86-NEXT:    vpaddusw %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xdd,0xd1]
1549; X86-NEXT:    vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2]
1550; X86-NEXT:    retl # encoding: [0xc3]
1551;
1552; X64-LABEL: test_mask_adds_epu16_rrk_128:
1553; X64:       # %bb.0:
1554; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
1555; X64-NEXT:    vpaddusw %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xdd,0xd1]
1556; X64-NEXT:    vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2]
1557; X64-NEXT:    retq # encoding: [0xc3]
1558  %res = call <8 x i16> @llvm.x86.avx512.mask.paddus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask)
1559  ret <8 x i16> %res
1560}
1561
1562define <8 x i16> @test_mask_adds_epu16_rrkz_128(<8 x i16> %a, <8 x i16> %b, i8 %mask) {
1563; X86-LABEL: test_mask_adds_epu16_rrkz_128:
1564; X86:       # %bb.0:
1565; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
1566; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
1567; X86-NEXT:    vpaddusw %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0xdd,0xc1]
1568; X86-NEXT:    retl # encoding: [0xc3]
1569;
1570; X64-LABEL: test_mask_adds_epu16_rrkz_128:
1571; X64:       # %bb.0:
1572; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
1573; X64-NEXT:    vpaddusw %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0xdd,0xc1]
1574; X64-NEXT:    retq # encoding: [0xc3]
1575  %res = call <8 x i16> @llvm.x86.avx512.mask.paddus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 %mask)
1576  ret <8 x i16> %res
1577}
1578
1579define <8 x i16> @test_mask_adds_epu16_rm_128(<8 x i16> %a, <8 x i16>* %ptr_b) {
1580; X86-LABEL: test_mask_adds_epu16_rm_128:
1581; X86:       # %bb.0:
1582; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
1583; X86-NEXT:    vpaddusw (%eax), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdd,0x00]
1584; X86-NEXT:    retl # encoding: [0xc3]
1585;
1586; X64-LABEL: test_mask_adds_epu16_rm_128:
1587; X64:       # %bb.0:
1588; X64-NEXT:    vpaddusw (%rdi), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdd,0x07]
1589; X64-NEXT:    retq # encoding: [0xc3]
1590  %b = load <8 x i16>, <8 x i16>* %ptr_b
1591  %res = call <8 x i16> @llvm.x86.avx512.mask.paddus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 -1)
1592  ret <8 x i16> %res
1593}
1594
1595define <8 x i16> @test_mask_adds_epu16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b, <8 x i16> %passThru, i8 %mask) {
1596; X86-LABEL: test_mask_adds_epu16_rmk_128:
1597; X86:       # %bb.0:
1598; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
1599; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
1600; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
1601; X86-NEXT:    vpaddusw (%eax), %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xdd,0x08]
1602; X86-NEXT:    vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
1603; X86-NEXT:    retl # encoding: [0xc3]
1604;
1605; X64-LABEL: test_mask_adds_epu16_rmk_128:
1606; X64:       # %bb.0:
1607; X64-NEXT:    kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce]
1608; X64-NEXT:    vpaddusw (%rdi), %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xdd,0x0f]
1609; X64-NEXT:    vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
1610; X64-NEXT:    retq # encoding: [0xc3]
1611  %b = load <8 x i16>, <8 x i16>* %ptr_b
1612  %res = call <8 x i16> @llvm.x86.avx512.mask.paddus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask)
1613  ret <8 x i16> %res
1614}
1615
1616define <8 x i16> @test_mask_adds_epu16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_b, i8 %mask) {
1617; X86-LABEL: test_mask_adds_epu16_rmkz_128:
1618; X86:       # %bb.0:
1619; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
1620; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
1621; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
1622; X86-NEXT:    vpaddusw (%eax), %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0xdd,0x00]
1623; X86-NEXT:    retl # encoding: [0xc3]
1624;
1625; X64-LABEL: test_mask_adds_epu16_rmkz_128:
1626; X64:       # %bb.0:
1627; X64-NEXT:    kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce]
1628; X64-NEXT:    vpaddusw (%rdi), %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0xdd,0x07]
1629; X64-NEXT:    retq # encoding: [0xc3]
1630  %b = load <8 x i16>, <8 x i16>* %ptr_b
1631  %res = call <8 x i16> @llvm.x86.avx512.mask.paddus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 %mask)
1632  ret <8 x i16> %res
1633}
1634
1635declare <8 x i16> @llvm.x86.avx512.mask.paddus.w.128(<8 x i16>, <8 x i16>, <8 x i16>, i8)
1636
1637define <16 x i16> @test_mask_adds_epu16_rr_256(<16 x i16> %a, <16 x i16> %b) {
1638; CHECK-LABEL: test_mask_adds_epu16_rr_256:
1639; CHECK:       # %bb.0:
1640; CHECK-NEXT:    vpaddusw %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xdd,0xc1]
1641; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
1642  %res = call <16 x i16> @llvm.x86.avx512.mask.paddus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 -1)
1643  ret <16 x i16> %res
1644}
1645
1646define <16 x i16> @test_mask_adds_epu16_rrk_256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask) {
1647; X86-LABEL: test_mask_adds_epu16_rrk_256:
1648; X86:       # %bb.0:
1649; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
1650; X86-NEXT:    vpaddusw %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xdd,0xd1]
1651; X86-NEXT:    vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2]
1652; X86-NEXT:    retl # encoding: [0xc3]
1653;
1654; X64-LABEL: test_mask_adds_epu16_rrk_256:
1655; X64:       # %bb.0:
1656; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
1657; X64-NEXT:    vpaddusw %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xdd,0xd1]
1658; X64-NEXT:    vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2]
1659; X64-NEXT:    retq # encoding: [0xc3]
1660  %res = call <16 x i16> @llvm.x86.avx512.mask.paddus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask)
1661  ret <16 x i16> %res
1662}
1663
1664define <16 x i16> @test_mask_adds_epu16_rrkz_256(<16 x i16> %a, <16 x i16> %b, i16 %mask) {
1665; X86-LABEL: test_mask_adds_epu16_rrkz_256:
1666; X86:       # %bb.0:
1667; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
1668; X86-NEXT:    vpaddusw %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0xdd,0xc1]
1669; X86-NEXT:    retl # encoding: [0xc3]
1670;
1671; X64-LABEL: test_mask_adds_epu16_rrkz_256:
1672; X64:       # %bb.0:
1673; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
1674; X64-NEXT:    vpaddusw %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0xdd,0xc1]
1675; X64-NEXT:    retq # encoding: [0xc3]
1676  %res = call <16 x i16> @llvm.x86.avx512.mask.paddus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 %mask)
1677  ret <16 x i16> %res
1678}
1679
1680define <16 x i16> @test_mask_adds_epu16_rm_256(<16 x i16> %a, <16 x i16>* %ptr_b) {
1681; X86-LABEL: test_mask_adds_epu16_rm_256:
1682; X86:       # %bb.0:
1683; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
1684; X86-NEXT:    vpaddusw (%eax), %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xdd,0x00]
1685; X86-NEXT:    retl # encoding: [0xc3]
1686;
1687; X64-LABEL: test_mask_adds_epu16_rm_256:
1688; X64:       # %bb.0:
1689; X64-NEXT:    vpaddusw (%rdi), %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xdd,0x07]
1690; X64-NEXT:    retq # encoding: [0xc3]
1691  %b = load <16 x i16>, <16 x i16>* %ptr_b
1692  %res = call <16 x i16> @llvm.x86.avx512.mask.paddus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 -1)
1693  ret <16 x i16> %res
1694}
1695
1696define <16 x i16> @test_mask_adds_epu16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr_b, <16 x i16> %passThru, i16 %mask) {
1697; X86-LABEL: test_mask_adds_epu16_rmk_256:
1698; X86:       # %bb.0:
1699; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
1700; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08]
1701; X86-NEXT:    vpaddusw (%eax), %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xdd,0x08]
1702; X86-NEXT:    vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
1703; X86-NEXT:    retl # encoding: [0xc3]
1704;
1705; X64-LABEL: test_mask_adds_epu16_rmk_256:
1706; X64:       # %bb.0:
1707; X64-NEXT:    kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce]
1708; X64-NEXT:    vpaddusw (%rdi), %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xdd,0x0f]
1709; X64-NEXT:    vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
1710; X64-NEXT:    retq # encoding: [0xc3]
1711  %b = load <16 x i16>, <16 x i16>* %ptr_b
1712  %res = call <16 x i16> @llvm.x86.avx512.mask.paddus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask)
1713  ret <16 x i16> %res
1714}
1715
1716define <16 x i16> @test_mask_adds_epu16_rmkz_256(<16 x i16> %a, <16 x i16>* %ptr_b, i16 %mask) {
1717; X86-LABEL: test_mask_adds_epu16_rmkz_256:
1718; X86:       # %bb.0:
1719; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
1720; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08]
1721; X86-NEXT:    vpaddusw (%eax), %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0xdd,0x00]
1722; X86-NEXT:    retl # encoding: [0xc3]
1723;
1724; X64-LABEL: test_mask_adds_epu16_rmkz_256:
1725; X64:       # %bb.0:
1726; X64-NEXT:    kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce]
1727; X64-NEXT:    vpaddusw (%rdi), %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0xdd,0x07]
1728; X64-NEXT:    retq # encoding: [0xc3]
1729  %b = load <16 x i16>, <16 x i16>* %ptr_b
1730  %res = call <16 x i16> @llvm.x86.avx512.mask.paddus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 %mask)
1731  ret <16 x i16> %res
1732}
1733
1734declare <16 x i16> @llvm.x86.avx512.mask.paddus.w.256(<16 x i16>, <16 x i16>, <16 x i16>, i16)
1735
1736define <8 x i16> @test_mask_subs_epu16_rr_128(<8 x i16> %a, <8 x i16> %b) {
1737; CHECK-LABEL: test_mask_subs_epu16_rr_128:
1738; CHECK:       # %bb.0:
1739; CHECK-NEXT:    vpsubusw %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd9,0xc1]
1740; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
1741  %res = call <8 x i16> @llvm.x86.avx512.mask.psubus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 -1)
1742  ret <8 x i16> %res
1743}
1744
1745define <8 x i16> @test_mask_subs_epu16_rrk_128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) {
1746; X86-LABEL: test_mask_subs_epu16_rrk_128:
1747; X86:       # %bb.0:
1748; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
1749; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
1750; X86-NEXT:    vpsubusw %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xd9,0xd1]
1751; X86-NEXT:    vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2]
1752; X86-NEXT:    retl # encoding: [0xc3]
1753;
1754; X64-LABEL: test_mask_subs_epu16_rrk_128:
1755; X64:       # %bb.0:
1756; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
1757; X64-NEXT:    vpsubusw %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xd9,0xd1]
1758; X64-NEXT:    vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2]
1759; X64-NEXT:    retq # encoding: [0xc3]
1760  %res = call <8 x i16> @llvm.x86.avx512.mask.psubus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask)
1761  ret <8 x i16> %res
1762}
1763
1764define <8 x i16> @test_mask_subs_epu16_rrkz_128(<8 x i16> %a, <8 x i16> %b, i8 %mask) {
1765; X86-LABEL: test_mask_subs_epu16_rrkz_128:
1766; X86:       # %bb.0:
1767; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
1768; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
1769; X86-NEXT:    vpsubusw %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0xd9,0xc1]
1770; X86-NEXT:    retl # encoding: [0xc3]
1771;
1772; X64-LABEL: test_mask_subs_epu16_rrkz_128:
1773; X64:       # %bb.0:
1774; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
1775; X64-NEXT:    vpsubusw %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0xd9,0xc1]
1776; X64-NEXT:    retq # encoding: [0xc3]
1777  %res = call <8 x i16> @llvm.x86.avx512.mask.psubus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 %mask)
1778  ret <8 x i16> %res
1779}
1780
1781define <8 x i16> @test_mask_subs_epu16_rm_128(<8 x i16> %a, <8 x i16>* %ptr_b) {
1782; X86-LABEL: test_mask_subs_epu16_rm_128:
1783; X86:       # %bb.0:
1784; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
1785; X86-NEXT:    vpsubusw (%eax), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd9,0x00]
1786; X86-NEXT:    retl # encoding: [0xc3]
1787;
1788; X64-LABEL: test_mask_subs_epu16_rm_128:
1789; X64:       # %bb.0:
1790; X64-NEXT:    vpsubusw (%rdi), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd9,0x07]
1791; X64-NEXT:    retq # encoding: [0xc3]
1792  %b = load <8 x i16>, <8 x i16>* %ptr_b
1793  %res = call <8 x i16> @llvm.x86.avx512.mask.psubus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 -1)
1794  ret <8 x i16> %res
1795}
1796
1797define <8 x i16> @test_mask_subs_epu16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b, <8 x i16> %passThru, i8 %mask) {
1798; X86-LABEL: test_mask_subs_epu16_rmk_128:
1799; X86:       # %bb.0:
1800; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
1801; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
1802; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
1803; X86-NEXT:    vpsubusw (%eax), %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xd9,0x08]
1804; X86-NEXT:    vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
1805; X86-NEXT:    retl # encoding: [0xc3]
1806;
1807; X64-LABEL: test_mask_subs_epu16_rmk_128:
1808; X64:       # %bb.0:
1809; X64-NEXT:    kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce]
1810; X64-NEXT:    vpsubusw (%rdi), %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xd9,0x0f]
1811; X64-NEXT:    vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
1812; X64-NEXT:    retq # encoding: [0xc3]
1813  %b = load <8 x i16>, <8 x i16>* %ptr_b
1814  %res = call <8 x i16> @llvm.x86.avx512.mask.psubus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask)
1815  ret <8 x i16> %res
1816}
1817
1818define <8 x i16> @test_mask_subs_epu16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_b, i8 %mask) {
1819; X86-LABEL: test_mask_subs_epu16_rmkz_128:
1820; X86:       # %bb.0:
1821; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
1822; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
1823; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
1824; X86-NEXT:    vpsubusw (%eax), %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0xd9,0x00]
1825; X86-NEXT:    retl # encoding: [0xc3]
1826;
1827; X64-LABEL: test_mask_subs_epu16_rmkz_128:
1828; X64:       # %bb.0:
1829; X64-NEXT:    kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce]
1830; X64-NEXT:    vpsubusw (%rdi), %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0xd9,0x07]
1831; X64-NEXT:    retq # encoding: [0xc3]
1832  %b = load <8 x i16>, <8 x i16>* %ptr_b
1833  %res = call <8 x i16> @llvm.x86.avx512.mask.psubus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 %mask)
1834  ret <8 x i16> %res
1835}
1836
1837declare <8 x i16> @llvm.x86.avx512.mask.psubus.w.128(<8 x i16>, <8 x i16>, <8 x i16>, i8)
1838
1839define <16 x i16> @test_mask_subs_epu16_rr_256(<16 x i16> %a, <16 x i16> %b) {
1840; CHECK-LABEL: test_mask_subs_epu16_rr_256:
1841; CHECK:       # %bb.0:
1842; CHECK-NEXT:    vpsubusw %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd9,0xc1]
1843; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
1844  %res = call <16 x i16> @llvm.x86.avx512.mask.psubus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 -1)
1845  ret <16 x i16> %res
1846}
1847
1848define <16 x i16> @test_mask_subs_epu16_rrk_256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask) {
1849; X86-LABEL: test_mask_subs_epu16_rrk_256:
1850; X86:       # %bb.0:
1851; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
1852; X86-NEXT:    vpsubusw %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xd9,0xd1]
1853; X86-NEXT:    vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2]
1854; X86-NEXT:    retl # encoding: [0xc3]
1855;
1856; X64-LABEL: test_mask_subs_epu16_rrk_256:
1857; X64:       # %bb.0:
1858; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
1859; X64-NEXT:    vpsubusw %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xd9,0xd1]
1860; X64-NEXT:    vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2]
1861; X64-NEXT:    retq # encoding: [0xc3]
1862  %res = call <16 x i16> @llvm.x86.avx512.mask.psubus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask)
1863  ret <16 x i16> %res
1864}
1865
1866define <16 x i16> @test_mask_subs_epu16_rrkz_256(<16 x i16> %a, <16 x i16> %b, i16 %mask) {
1867; X86-LABEL: test_mask_subs_epu16_rrkz_256:
1868; X86:       # %bb.0:
1869; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
1870; X86-NEXT:    vpsubusw %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0xd9,0xc1]
1871; X86-NEXT:    retl # encoding: [0xc3]
1872;
1873; X64-LABEL: test_mask_subs_epu16_rrkz_256:
1874; X64:       # %bb.0:
1875; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
1876; X64-NEXT:    vpsubusw %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0xd9,0xc1]
1877; X64-NEXT:    retq # encoding: [0xc3]
1878  %res = call <16 x i16> @llvm.x86.avx512.mask.psubus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 %mask)
1879  ret <16 x i16> %res
1880}
1881
1882define <16 x i16> @test_mask_subs_epu16_rm_256(<16 x i16> %a, <16 x i16>* %ptr_b) {
1883; X86-LABEL: test_mask_subs_epu16_rm_256:
1884; X86:       # %bb.0:
1885; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
1886; X86-NEXT:    vpsubusw (%eax), %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd9,0x00]
1887; X86-NEXT:    retl # encoding: [0xc3]
1888;
1889; X64-LABEL: test_mask_subs_epu16_rm_256:
1890; X64:       # %bb.0:
1891; X64-NEXT:    vpsubusw (%rdi), %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd9,0x07]
1892; X64-NEXT:    retq # encoding: [0xc3]
1893  %b = load <16 x i16>, <16 x i16>* %ptr_b
1894  %res = call <16 x i16> @llvm.x86.avx512.mask.psubus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 -1)
1895  ret <16 x i16> %res
1896}
1897
1898define <16 x i16> @test_mask_subs_epu16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr_b, <16 x i16> %passThru, i16 %mask) {
1899; X86-LABEL: test_mask_subs_epu16_rmk_256:
1900; X86:       # %bb.0:
1901; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
1902; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08]
1903; X86-NEXT:    vpsubusw (%eax), %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xd9,0x08]
1904; X86-NEXT:    vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
1905; X86-NEXT:    retl # encoding: [0xc3]
1906;
1907; X64-LABEL: test_mask_subs_epu16_rmk_256:
1908; X64:       # %bb.0:
1909; X64-NEXT:    kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce]
1910; X64-NEXT:    vpsubusw (%rdi), %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xd9,0x0f]
1911; X64-NEXT:    vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
1912; X64-NEXT:    retq # encoding: [0xc3]
1913  %b = load <16 x i16>, <16 x i16>* %ptr_b
1914  %res = call <16 x i16> @llvm.x86.avx512.mask.psubus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask)
1915  ret <16 x i16> %res
1916}
1917
1918define <16 x i16> @test_mask_subs_epu16_rmkz_256(<16 x i16> %a, <16 x i16>* %ptr_b, i16 %mask) {
1919; X86-LABEL: test_mask_subs_epu16_rmkz_256:
1920; X86:       # %bb.0:
1921; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
1922; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08]
1923; X86-NEXT:    vpsubusw (%eax), %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0xd9,0x00]
1924; X86-NEXT:    retl # encoding: [0xc3]
1925;
1926; X64-LABEL: test_mask_subs_epu16_rmkz_256:
1927; X64:       # %bb.0:
1928; X64-NEXT:    kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce]
1929; X64-NEXT:    vpsubusw (%rdi), %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0xd9,0x07]
1930; X64-NEXT:    retq # encoding: [0xc3]
1931  %b = load <16 x i16>, <16 x i16>* %ptr_b
1932  %res = call <16 x i16> @llvm.x86.avx512.mask.psubus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 %mask)
1933  ret <16 x i16> %res
1934}
1935
1936declare <16 x i16> @llvm.x86.avx512.mask.psubus.w.256(<16 x i16>, <16 x i16>, <16 x i16>, i16)
1937
1938define <16 x i8> @test_mask_adds_epi8_rr_128(<16 x i8> %a, <16 x i8> %b) {
1939; CHECK-LABEL: test_mask_adds_epi8_rr_128:
1940; CHECK:       # %bb.0:
1941; CHECK-NEXT:    vpaddsb %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xec,0xc1]
1942; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
1943  %res = call <16 x i8> @llvm.x86.avx512.mask.padds.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 -1)
1944  ret <16 x i8> %res
1945}
1946
1947define <16 x i8> @test_mask_adds_epi8_rrk_128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask) {
1948; X86-LABEL: test_mask_adds_epi8_rrk_128:
1949; X86:       # %bb.0:
1950; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
1951; X86-NEXT:    vpaddsb %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xec,0xd1]
1952; X86-NEXT:    vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2]
1953; X86-NEXT:    retl # encoding: [0xc3]
1954;
1955; X64-LABEL: test_mask_adds_epi8_rrk_128:
1956; X64:       # %bb.0:
1957; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
1958; X64-NEXT:    vpaddsb %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xec,0xd1]
1959; X64-NEXT:    vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2]
1960; X64-NEXT:    retq # encoding: [0xc3]
1961  %res = call <16 x i8> @llvm.x86.avx512.mask.padds.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask)
1962  ret <16 x i8> %res
1963}
1964
1965define <16 x i8> @test_mask_adds_epi8_rrkz_128(<16 x i8> %a, <16 x i8> %b, i16 %mask) {
1966; X86-LABEL: test_mask_adds_epi8_rrkz_128:
1967; X86:       # %bb.0:
1968; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
1969; X86-NEXT:    vpaddsb %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0xec,0xc1]
1970; X86-NEXT:    retl # encoding: [0xc3]
1971;
1972; X64-LABEL: test_mask_adds_epi8_rrkz_128:
1973; X64:       # %bb.0:
1974; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
1975; X64-NEXT:    vpaddsb %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0xec,0xc1]
1976; X64-NEXT:    retq # encoding: [0xc3]
1977  %res = call <16 x i8> @llvm.x86.avx512.mask.padds.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 %mask)
1978  ret <16 x i8> %res
1979}
1980
1981define <16 x i8> @test_mask_adds_epi8_rm_128(<16 x i8> %a, <16 x i8>* %ptr_b) {
1982; X86-LABEL: test_mask_adds_epi8_rm_128:
1983; X86:       # %bb.0:
1984; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
1985; X86-NEXT:    vpaddsb (%eax), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xec,0x00]
1986; X86-NEXT:    retl # encoding: [0xc3]
1987;
1988; X64-LABEL: test_mask_adds_epi8_rm_128:
1989; X64:       # %bb.0:
1990; X64-NEXT:    vpaddsb (%rdi), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xec,0x07]
1991; X64-NEXT:    retq # encoding: [0xc3]
1992  %b = load <16 x i8>, <16 x i8>* %ptr_b
1993  %res = call <16 x i8> @llvm.x86.avx512.mask.padds.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 -1)
1994  ret <16 x i8> %res
1995}
1996
1997define <16 x i8> @test_mask_adds_epi8_rmk_128(<16 x i8> %a, <16 x i8>* %ptr_b, <16 x i8> %passThru, i16 %mask) {
1998; X86-LABEL: test_mask_adds_epi8_rmk_128:
1999; X86:       # %bb.0:
2000; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
2001; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08]
2002; X86-NEXT:    vpaddsb (%eax), %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xec,0x08]
2003; X86-NEXT:    vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
2004; X86-NEXT:    retl # encoding: [0xc3]
2005;
2006; X64-LABEL: test_mask_adds_epi8_rmk_128:
2007; X64:       # %bb.0:
2008; X64-NEXT:    kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce]
2009; X64-NEXT:    vpaddsb (%rdi), %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xec,0x0f]
2010; X64-NEXT:    vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
2011; X64-NEXT:    retq # encoding: [0xc3]
2012  %b = load <16 x i8>, <16 x i8>* %ptr_b
2013  %res = call <16 x i8> @llvm.x86.avx512.mask.padds.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask)
2014  ret <16 x i8> %res
2015}
2016
2017define <16 x i8> @test_mask_adds_epi8_rmkz_128(<16 x i8> %a, <16 x i8>* %ptr_b, i16 %mask) {
2018; X86-LABEL: test_mask_adds_epi8_rmkz_128:
2019; X86:       # %bb.0:
2020; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
2021; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08]
2022; X86-NEXT:    vpaddsb (%eax), %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0xec,0x00]
2023; X86-NEXT:    retl # encoding: [0xc3]
2024;
2025; X64-LABEL: test_mask_adds_epi8_rmkz_128:
2026; X64:       # %bb.0:
2027; X64-NEXT:    kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce]
2028; X64-NEXT:    vpaddsb (%rdi), %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0xec,0x07]
2029; X64-NEXT:    retq # encoding: [0xc3]
2030  %b = load <16 x i8>, <16 x i8>* %ptr_b
2031  %res = call <16 x i8> @llvm.x86.avx512.mask.padds.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 %mask)
2032  ret <16 x i8> %res
2033}
2034
2035declare <16 x i8> @llvm.x86.avx512.mask.padds.b.128(<16 x i8>, <16 x i8>, <16 x i8>, i16)
2036
2037define <32 x i8> @test_mask_adds_epi8_rr_256(<32 x i8> %a, <32 x i8> %b) {
2038; CHECK-LABEL: test_mask_adds_epi8_rr_256:
2039; CHECK:       # %bb.0:
2040; CHECK-NEXT:    vpaddsb %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xec,0xc1]
2041; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
2042  %res = call <32 x i8> @llvm.x86.avx512.mask.padds.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 -1)
2043  ret <32 x i8> %res
2044}
2045
2046define <32 x i8> @test_mask_adds_epi8_rrk_256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask) {
2047; X86-LABEL: test_mask_adds_epi8_rrk_256:
2048; X86:       # %bb.0:
2049; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
2050; X86-NEXT:    vpaddsb %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xec,0xd1]
2051; X86-NEXT:    vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2]
2052; X86-NEXT:    retl # encoding: [0xc3]
2053;
2054; X64-LABEL: test_mask_adds_epi8_rrk_256:
2055; X64:       # %bb.0:
2056; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
2057; X64-NEXT:    vpaddsb %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xec,0xd1]
2058; X64-NEXT:    vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2]
2059; X64-NEXT:    retq # encoding: [0xc3]
2060  %res = call <32 x i8> @llvm.x86.avx512.mask.padds.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask)
2061  ret <32 x i8> %res
2062}
2063
2064define <32 x i8> @test_mask_adds_epi8_rrkz_256(<32 x i8> %a, <32 x i8> %b, i32 %mask) {
2065; X86-LABEL: test_mask_adds_epi8_rrkz_256:
2066; X86:       # %bb.0:
2067; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
2068; X86-NEXT:    vpaddsb %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0xec,0xc1]
2069; X86-NEXT:    retl # encoding: [0xc3]
2070;
2071; X64-LABEL: test_mask_adds_epi8_rrkz_256:
2072; X64:       # %bb.0:
2073; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
2074; X64-NEXT:    vpaddsb %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0xec,0xc1]
2075; X64-NEXT:    retq # encoding: [0xc3]
2076  %res = call <32 x i8> @llvm.x86.avx512.mask.padds.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 %mask)
2077  ret <32 x i8> %res
2078}
2079
2080define <32 x i8> @test_mask_adds_epi8_rm_256(<32 x i8> %a, <32 x i8>* %ptr_b) {
2081; X86-LABEL: test_mask_adds_epi8_rm_256:
2082; X86:       # %bb.0:
2083; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
2084; X86-NEXT:    vpaddsb (%eax), %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xec,0x00]
2085; X86-NEXT:    retl # encoding: [0xc3]
2086;
2087; X64-LABEL: test_mask_adds_epi8_rm_256:
2088; X64:       # %bb.0:
2089; X64-NEXT:    vpaddsb (%rdi), %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xec,0x07]
2090; X64-NEXT:    retq # encoding: [0xc3]
2091  %b = load <32 x i8>, <32 x i8>* %ptr_b
2092  %res = call <32 x i8> @llvm.x86.avx512.mask.padds.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 -1)
2093  ret <32 x i8> %res
2094}
2095
2096define <32 x i8> @test_mask_adds_epi8_rmk_256(<32 x i8> %a, <32 x i8>* %ptr_b, <32 x i8> %passThru, i32 %mask) {
2097; X86-LABEL: test_mask_adds_epi8_rmk_256:
2098; X86:       # %bb.0:
2099; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
2100; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x08]
2101; X86-NEXT:    vpaddsb (%eax), %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xec,0x08]
2102; X86-NEXT:    vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
2103; X86-NEXT:    retl # encoding: [0xc3]
2104;
2105; X64-LABEL: test_mask_adds_epi8_rmk_256:
2106; X64:       # %bb.0:
2107; X64-NEXT:    kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce]
2108; X64-NEXT:    vpaddsb (%rdi), %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xec,0x0f]
2109; X64-NEXT:    vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
2110; X64-NEXT:    retq # encoding: [0xc3]
2111  %b = load <32 x i8>, <32 x i8>* %ptr_b
2112  %res = call <32 x i8> @llvm.x86.avx512.mask.padds.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask)
2113  ret <32 x i8> %res
2114}
2115
2116define <32 x i8> @test_mask_adds_epi8_rmkz_256(<32 x i8> %a, <32 x i8>* %ptr_b, i32 %mask) {
2117; X86-LABEL: test_mask_adds_epi8_rmkz_256:
2118; X86:       # %bb.0:
2119; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
2120; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x08]
2121; X86-NEXT:    vpaddsb (%eax), %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0xec,0x00]
2122; X86-NEXT:    retl # encoding: [0xc3]
2123;
2124; X64-LABEL: test_mask_adds_epi8_rmkz_256:
2125; X64:       # %bb.0:
2126; X64-NEXT:    kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce]
2127; X64-NEXT:    vpaddsb (%rdi), %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0xec,0x07]
2128; X64-NEXT:    retq # encoding: [0xc3]
2129  %b = load <32 x i8>, <32 x i8>* %ptr_b
2130  %res = call <32 x i8> @llvm.x86.avx512.mask.padds.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 %mask)
2131  ret <32 x i8> %res
2132}
2133
2134declare <32 x i8> @llvm.x86.avx512.mask.padds.b.256(<32 x i8>, <32 x i8>, <32 x i8>, i32)
2135
2136define <16 x i8> @test_mask_subs_epi8_rr_128(<16 x i8> %a, <16 x i8> %b) {
2137; CHECK-LABEL: test_mask_subs_epi8_rr_128:
2138; CHECK:       # %bb.0:
2139; CHECK-NEXT:    vpsubsb %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe8,0xc1]
2140; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
2141  %res = call <16 x i8> @llvm.x86.avx512.mask.psubs.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 -1)
2142  ret <16 x i8> %res
2143}
2144
2145define <16 x i8> @test_mask_subs_epi8_rrk_128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask) {
2146; X86-LABEL: test_mask_subs_epi8_rrk_128:
2147; X86:       # %bb.0:
2148; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
2149; X86-NEXT:    vpsubsb %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xe8,0xd1]
2150; X86-NEXT:    vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2]
2151; X86-NEXT:    retl # encoding: [0xc3]
2152;
2153; X64-LABEL: test_mask_subs_epi8_rrk_128:
2154; X64:       # %bb.0:
2155; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
2156; X64-NEXT:    vpsubsb %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xe8,0xd1]
2157; X64-NEXT:    vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2]
2158; X64-NEXT:    retq # encoding: [0xc3]
2159  %res = call <16 x i8> @llvm.x86.avx512.mask.psubs.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask)
2160  ret <16 x i8> %res
2161}
2162
2163define <16 x i8> @test_mask_subs_epi8_rrkz_128(<16 x i8> %a, <16 x i8> %b, i16 %mask) {
2164; X86-LABEL: test_mask_subs_epi8_rrkz_128:
2165; X86:       # %bb.0:
2166; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
2167; X86-NEXT:    vpsubsb %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0xe8,0xc1]
2168; X86-NEXT:    retl # encoding: [0xc3]
2169;
2170; X64-LABEL: test_mask_subs_epi8_rrkz_128:
2171; X64:       # %bb.0:
2172; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
2173; X64-NEXT:    vpsubsb %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0xe8,0xc1]
2174; X64-NEXT:    retq # encoding: [0xc3]
2175  %res = call <16 x i8> @llvm.x86.avx512.mask.psubs.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 %mask)
2176  ret <16 x i8> %res
2177}
2178
2179define <16 x i8> @test_mask_subs_epi8_rm_128(<16 x i8> %a, <16 x i8>* %ptr_b) {
2180; X86-LABEL: test_mask_subs_epi8_rm_128:
2181; X86:       # %bb.0:
2182; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
2183; X86-NEXT:    vpsubsb (%eax), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe8,0x00]
2184; X86-NEXT:    retl # encoding: [0xc3]
2185;
2186; X64-LABEL: test_mask_subs_epi8_rm_128:
2187; X64:       # %bb.0:
2188; X64-NEXT:    vpsubsb (%rdi), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe8,0x07]
2189; X64-NEXT:    retq # encoding: [0xc3]
2190  %b = load <16 x i8>, <16 x i8>* %ptr_b
2191  %res = call <16 x i8> @llvm.x86.avx512.mask.psubs.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 -1)
2192  ret <16 x i8> %res
2193}
2194
2195define <16 x i8> @test_mask_subs_epi8_rmk_128(<16 x i8> %a, <16 x i8>* %ptr_b, <16 x i8> %passThru, i16 %mask) {
2196; X86-LABEL: test_mask_subs_epi8_rmk_128:
2197; X86:       # %bb.0:
2198; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
2199; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08]
2200; X86-NEXT:    vpsubsb (%eax), %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xe8,0x08]
2201; X86-NEXT:    vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
2202; X86-NEXT:    retl # encoding: [0xc3]
2203;
2204; X64-LABEL: test_mask_subs_epi8_rmk_128:
2205; X64:       # %bb.0:
2206; X64-NEXT:    kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce]
2207; X64-NEXT:    vpsubsb (%rdi), %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xe8,0x0f]
2208; X64-NEXT:    vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
2209; X64-NEXT:    retq # encoding: [0xc3]
2210  %b = load <16 x i8>, <16 x i8>* %ptr_b
2211  %res = call <16 x i8> @llvm.x86.avx512.mask.psubs.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask)
2212  ret <16 x i8> %res
2213}
2214
2215define <16 x i8> @test_mask_subs_epi8_rmkz_128(<16 x i8> %a, <16 x i8>* %ptr_b, i16 %mask) {
2216; X86-LABEL: test_mask_subs_epi8_rmkz_128:
2217; X86:       # %bb.0:
2218; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
2219; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08]
2220; X86-NEXT:    vpsubsb (%eax), %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0xe8,0x00]
2221; X86-NEXT:    retl # encoding: [0xc3]
2222;
2223; X64-LABEL: test_mask_subs_epi8_rmkz_128:
2224; X64:       # %bb.0:
2225; X64-NEXT:    kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce]
2226; X64-NEXT:    vpsubsb (%rdi), %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0xe8,0x07]
2227; X64-NEXT:    retq # encoding: [0xc3]
2228  %b = load <16 x i8>, <16 x i8>* %ptr_b
2229  %res = call <16 x i8> @llvm.x86.avx512.mask.psubs.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 %mask)
2230  ret <16 x i8> %res
2231}
2232
2233declare <16 x i8> @llvm.x86.avx512.mask.psubs.b.128(<16 x i8>, <16 x i8>, <16 x i8>, i16)
2234
2235define <32 x i8> @test_mask_subs_epi8_rr_256(<32 x i8> %a, <32 x i8> %b) {
2236; CHECK-LABEL: test_mask_subs_epi8_rr_256:
2237; CHECK:       # %bb.0:
2238; CHECK-NEXT:    vpsubsb %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe8,0xc1]
2239; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
2240  %res = call <32 x i8> @llvm.x86.avx512.mask.psubs.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 -1)
2241  ret <32 x i8> %res
2242}
2243
2244define <32 x i8> @test_mask_subs_epi8_rrk_256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask) {
2245; X86-LABEL: test_mask_subs_epi8_rrk_256:
2246; X86:       # %bb.0:
2247; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
2248; X86-NEXT:    vpsubsb %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xe8,0xd1]
2249; X86-NEXT:    vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2]
2250; X86-NEXT:    retl # encoding: [0xc3]
2251;
2252; X64-LABEL: test_mask_subs_epi8_rrk_256:
2253; X64:       # %bb.0:
2254; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
2255; X64-NEXT:    vpsubsb %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xe8,0xd1]
2256; X64-NEXT:    vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2]
2257; X64-NEXT:    retq # encoding: [0xc3]
2258  %res = call <32 x i8> @llvm.x86.avx512.mask.psubs.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask)
2259  ret <32 x i8> %res
2260}
2261
2262define <32 x i8> @test_mask_subs_epi8_rrkz_256(<32 x i8> %a, <32 x i8> %b, i32 %mask) {
2263; X86-LABEL: test_mask_subs_epi8_rrkz_256:
2264; X86:       # %bb.0:
2265; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
2266; X86-NEXT:    vpsubsb %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0xe8,0xc1]
2267; X86-NEXT:    retl # encoding: [0xc3]
2268;
2269; X64-LABEL: test_mask_subs_epi8_rrkz_256:
2270; X64:       # %bb.0:
2271; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
2272; X64-NEXT:    vpsubsb %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0xe8,0xc1]
2273; X64-NEXT:    retq # encoding: [0xc3]
2274  %res = call <32 x i8> @llvm.x86.avx512.mask.psubs.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 %mask)
2275  ret <32 x i8> %res
2276}
2277
2278define <32 x i8> @test_mask_subs_epi8_rm_256(<32 x i8> %a, <32 x i8>* %ptr_b) {
2279; X86-LABEL: test_mask_subs_epi8_rm_256:
2280; X86:       # %bb.0:
2281; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
2282; X86-NEXT:    vpsubsb (%eax), %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe8,0x00]
2283; X86-NEXT:    retl # encoding: [0xc3]
2284;
2285; X64-LABEL: test_mask_subs_epi8_rm_256:
2286; X64:       # %bb.0:
2287; X64-NEXT:    vpsubsb (%rdi), %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe8,0x07]
2288; X64-NEXT:    retq # encoding: [0xc3]
2289  %b = load <32 x i8>, <32 x i8>* %ptr_b
2290  %res = call <32 x i8> @llvm.x86.avx512.mask.psubs.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 -1)
2291  ret <32 x i8> %res
2292}
2293
2294define <32 x i8> @test_mask_subs_epi8_rmk_256(<32 x i8> %a, <32 x i8>* %ptr_b, <32 x i8> %passThru, i32 %mask) {
2295; X86-LABEL: test_mask_subs_epi8_rmk_256:
2296; X86:       # %bb.0:
2297; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
2298; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x08]
2299; X86-NEXT:    vpsubsb (%eax), %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xe8,0x08]
2300; X86-NEXT:    vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
2301; X86-NEXT:    retl # encoding: [0xc3]
2302;
2303; X64-LABEL: test_mask_subs_epi8_rmk_256:
2304; X64:       # %bb.0:
2305; X64-NEXT:    kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce]
2306; X64-NEXT:    vpsubsb (%rdi), %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xe8,0x0f]
2307; X64-NEXT:    vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
2308; X64-NEXT:    retq # encoding: [0xc3]
2309  %b = load <32 x i8>, <32 x i8>* %ptr_b
2310  %res = call <32 x i8> @llvm.x86.avx512.mask.psubs.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask)
2311  ret <32 x i8> %res
2312}
2313
2314define <32 x i8> @test_mask_subs_epi8_rmkz_256(<32 x i8> %a, <32 x i8>* %ptr_b, i32 %mask) {
2315; X86-LABEL: test_mask_subs_epi8_rmkz_256:
2316; X86:       # %bb.0:
2317; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
2318; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x08]
2319; X86-NEXT:    vpsubsb (%eax), %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0xe8,0x00]
2320; X86-NEXT:    retl # encoding: [0xc3]
2321;
2322; X64-LABEL: test_mask_subs_epi8_rmkz_256:
2323; X64:       # %bb.0:
2324; X64-NEXT:    kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce]
2325; X64-NEXT:    vpsubsb (%rdi), %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0xe8,0x07]
2326; X64-NEXT:    retq # encoding: [0xc3]
2327  %b = load <32 x i8>, <32 x i8>* %ptr_b
2328  %res = call <32 x i8> @llvm.x86.avx512.mask.psubs.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 %mask)
2329  ret <32 x i8> %res
2330}
2331
2332declare <32 x i8> @llvm.x86.avx512.mask.psubs.b.256(<32 x i8>, <32 x i8>, <32 x i8>, i32)
2333
2334define <16 x i8> @test_mask_adds_epu8_rr_128(<16 x i8> %a, <16 x i8> %b) {
2335; CHECK-LABEL: test_mask_adds_epu8_rr_128:
2336; CHECK:       # %bb.0:
2337; CHECK-NEXT:    vpaddusb %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdc,0xc1]
2338; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
2339  %res = call <16 x i8> @llvm.x86.avx512.mask.paddus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 -1)
2340  ret <16 x i8> %res
2341}
2342
2343define <16 x i8> @test_mask_adds_epu8_rrk_128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask) {
2344; X86-LABEL: test_mask_adds_epu8_rrk_128:
2345; X86:       # %bb.0:
2346; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
2347; X86-NEXT:    vpaddusb %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xdc,0xd1]
2348; X86-NEXT:    vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2]
2349; X86-NEXT:    retl # encoding: [0xc3]
2350;
2351; X64-LABEL: test_mask_adds_epu8_rrk_128:
2352; X64:       # %bb.0:
2353; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
2354; X64-NEXT:    vpaddusb %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xdc,0xd1]
2355; X64-NEXT:    vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2]
2356; X64-NEXT:    retq # encoding: [0xc3]
2357  %res = call <16 x i8> @llvm.x86.avx512.mask.paddus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask)
2358  ret <16 x i8> %res
2359}
2360
2361define <16 x i8> @test_mask_adds_epu8_rrkz_128(<16 x i8> %a, <16 x i8> %b, i16 %mask) {
2362; X86-LABEL: test_mask_adds_epu8_rrkz_128:
2363; X86:       # %bb.0:
2364; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
2365; X86-NEXT:    vpaddusb %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0xdc,0xc1]
2366; X86-NEXT:    retl # encoding: [0xc3]
2367;
2368; X64-LABEL: test_mask_adds_epu8_rrkz_128:
2369; X64:       # %bb.0:
2370; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
2371; X64-NEXT:    vpaddusb %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0xdc,0xc1]
2372; X64-NEXT:    retq # encoding: [0xc3]
2373  %res = call <16 x i8> @llvm.x86.avx512.mask.paddus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 %mask)
2374  ret <16 x i8> %res
2375}
2376
2377define <16 x i8> @test_mask_adds_epu8_rm_128(<16 x i8> %a, <16 x i8>* %ptr_b) {
2378; X86-LABEL: test_mask_adds_epu8_rm_128:
2379; X86:       # %bb.0:
2380; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
2381; X86-NEXT:    vpaddusb (%eax), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdc,0x00]
2382; X86-NEXT:    retl # encoding: [0xc3]
2383;
2384; X64-LABEL: test_mask_adds_epu8_rm_128:
2385; X64:       # %bb.0:
2386; X64-NEXT:    vpaddusb (%rdi), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdc,0x07]
2387; X64-NEXT:    retq # encoding: [0xc3]
2388  %b = load <16 x i8>, <16 x i8>* %ptr_b
2389  %res = call <16 x i8> @llvm.x86.avx512.mask.paddus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 -1)
2390  ret <16 x i8> %res
2391}
2392
2393define <16 x i8> @test_mask_adds_epu8_rmk_128(<16 x i8> %a, <16 x i8>* %ptr_b, <16 x i8> %passThru, i16 %mask) {
2394; X86-LABEL: test_mask_adds_epu8_rmk_128:
2395; X86:       # %bb.0:
2396; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
2397; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08]
2398; X86-NEXT:    vpaddusb (%eax), %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xdc,0x08]
2399; X86-NEXT:    vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
2400; X86-NEXT:    retl # encoding: [0xc3]
2401;
2402; X64-LABEL: test_mask_adds_epu8_rmk_128:
2403; X64:       # %bb.0:
2404; X64-NEXT:    kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce]
2405; X64-NEXT:    vpaddusb (%rdi), %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xdc,0x0f]
2406; X64-NEXT:    vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
2407; X64-NEXT:    retq # encoding: [0xc3]
2408  %b = load <16 x i8>, <16 x i8>* %ptr_b
2409  %res = call <16 x i8> @llvm.x86.avx512.mask.paddus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask)
2410  ret <16 x i8> %res
2411}
2412
2413define <16 x i8> @test_mask_adds_epu8_rmkz_128(<16 x i8> %a, <16 x i8>* %ptr_b, i16 %mask) {
2414; X86-LABEL: test_mask_adds_epu8_rmkz_128:
2415; X86:       # %bb.0:
2416; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
2417; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08]
2418; X86-NEXT:    vpaddusb (%eax), %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0xdc,0x00]
2419; X86-NEXT:    retl # encoding: [0xc3]
2420;
2421; X64-LABEL: test_mask_adds_epu8_rmkz_128:
2422; X64:       # %bb.0:
2423; X64-NEXT:    kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce]
2424; X64-NEXT:    vpaddusb (%rdi), %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0xdc,0x07]
2425; X64-NEXT:    retq # encoding: [0xc3]
2426  %b = load <16 x i8>, <16 x i8>* %ptr_b
2427  %res = call <16 x i8> @llvm.x86.avx512.mask.paddus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 %mask)
2428  ret <16 x i8> %res
2429}
2430
2431declare <16 x i8> @llvm.x86.avx512.mask.paddus.b.128(<16 x i8>, <16 x i8>, <16 x i8>, i16)
2432
2433define <32 x i8> @test_mask_adds_epu8_rr_256(<32 x i8> %a, <32 x i8> %b) {
2434; CHECK-LABEL: test_mask_adds_epu8_rr_256:
2435; CHECK:       # %bb.0:
2436; CHECK-NEXT:    vpaddusb %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xdc,0xc1]
2437; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
2438  %res = call <32 x i8> @llvm.x86.avx512.mask.paddus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 -1)
2439  ret <32 x i8> %res
2440}
2441
2442define <32 x i8> @test_mask_adds_epu8_rrk_256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask) {
2443; X86-LABEL: test_mask_adds_epu8_rrk_256:
2444; X86:       # %bb.0:
2445; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
2446; X86-NEXT:    vpaddusb %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xdc,0xd1]
2447; X86-NEXT:    vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2]
2448; X86-NEXT:    retl # encoding: [0xc3]
2449;
2450; X64-LABEL: test_mask_adds_epu8_rrk_256:
2451; X64:       # %bb.0:
2452; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
2453; X64-NEXT:    vpaddusb %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xdc,0xd1]
2454; X64-NEXT:    vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2]
2455; X64-NEXT:    retq # encoding: [0xc3]
2456  %res = call <32 x i8> @llvm.x86.avx512.mask.paddus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask)
2457  ret <32 x i8> %res
2458}
2459
2460define <32 x i8> @test_mask_adds_epu8_rrkz_256(<32 x i8> %a, <32 x i8> %b, i32 %mask) {
2461; X86-LABEL: test_mask_adds_epu8_rrkz_256:
2462; X86:       # %bb.0:
2463; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
2464; X86-NEXT:    vpaddusb %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0xdc,0xc1]
2465; X86-NEXT:    retl # encoding: [0xc3]
2466;
2467; X64-LABEL: test_mask_adds_epu8_rrkz_256:
2468; X64:       # %bb.0:
2469; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
2470; X64-NEXT:    vpaddusb %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0xdc,0xc1]
2471; X64-NEXT:    retq # encoding: [0xc3]
2472  %res = call <32 x i8> @llvm.x86.avx512.mask.paddus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 %mask)
2473  ret <32 x i8> %res
2474}
2475
2476define <32 x i8> @test_mask_adds_epu8_rm_256(<32 x i8> %a, <32 x i8>* %ptr_b) {
2477; X86-LABEL: test_mask_adds_epu8_rm_256:
2478; X86:       # %bb.0:
2479; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
2480; X86-NEXT:    vpaddusb (%eax), %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xdc,0x00]
2481; X86-NEXT:    retl # encoding: [0xc3]
2482;
2483; X64-LABEL: test_mask_adds_epu8_rm_256:
2484; X64:       # %bb.0:
2485; X64-NEXT:    vpaddusb (%rdi), %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xdc,0x07]
2486; X64-NEXT:    retq # encoding: [0xc3]
2487  %b = load <32 x i8>, <32 x i8>* %ptr_b
2488  %res = call <32 x i8> @llvm.x86.avx512.mask.paddus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 -1)
2489  ret <32 x i8> %res
2490}
2491
2492define <32 x i8> @test_mask_adds_epu8_rmk_256(<32 x i8> %a, <32 x i8>* %ptr_b, <32 x i8> %passThru, i32 %mask) {
2493; X86-LABEL: test_mask_adds_epu8_rmk_256:
2494; X86:       # %bb.0:
2495; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
2496; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x08]
2497; X86-NEXT:    vpaddusb (%eax), %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xdc,0x08]
2498; X86-NEXT:    vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
2499; X86-NEXT:    retl # encoding: [0xc3]
2500;
2501; X64-LABEL: test_mask_adds_epu8_rmk_256:
2502; X64:       # %bb.0:
2503; X64-NEXT:    kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce]
2504; X64-NEXT:    vpaddusb (%rdi), %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xdc,0x0f]
2505; X64-NEXT:    vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
2506; X64-NEXT:    retq # encoding: [0xc3]
2507  %b = load <32 x i8>, <32 x i8>* %ptr_b
2508  %res = call <32 x i8> @llvm.x86.avx512.mask.paddus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask)
2509  ret <32 x i8> %res
2510}
2511
2512define <32 x i8> @test_mask_adds_epu8_rmkz_256(<32 x i8> %a, <32 x i8>* %ptr_b, i32 %mask) {
2513; X86-LABEL: test_mask_adds_epu8_rmkz_256:
2514; X86:       # %bb.0:
2515; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
2516; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x08]
2517; X86-NEXT:    vpaddusb (%eax), %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0xdc,0x00]
2518; X86-NEXT:    retl # encoding: [0xc3]
2519;
2520; X64-LABEL: test_mask_adds_epu8_rmkz_256:
2521; X64:       # %bb.0:
2522; X64-NEXT:    kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce]
2523; X64-NEXT:    vpaddusb (%rdi), %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0xdc,0x07]
2524; X64-NEXT:    retq # encoding: [0xc3]
2525  %b = load <32 x i8>, <32 x i8>* %ptr_b
2526  %res = call <32 x i8> @llvm.x86.avx512.mask.paddus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 %mask)
2527  ret <32 x i8> %res
2528}
2529
2530declare <32 x i8> @llvm.x86.avx512.mask.paddus.b.256(<32 x i8>, <32 x i8>, <32 x i8>, i32)
2531
2532define <16 x i8> @test_mask_subs_epu8_rr_128(<16 x i8> %a, <16 x i8> %b) {
2533; CHECK-LABEL: test_mask_subs_epu8_rr_128:
2534; CHECK:       # %bb.0:
2535; CHECK-NEXT:    vpsubusb %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd8,0xc1]
2536; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
2537  %res = call <16 x i8> @llvm.x86.avx512.mask.psubus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 -1)
2538  ret <16 x i8> %res
2539}
2540
2541define <16 x i8> @test_mask_subs_epu8_rrk_128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask) {
2542; X86-LABEL: test_mask_subs_epu8_rrk_128:
2543; X86:       # %bb.0:
2544; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
2545; X86-NEXT:    vpsubusb %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xd8,0xd1]
2546; X86-NEXT:    vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2]
2547; X86-NEXT:    retl # encoding: [0xc3]
2548;
2549; X64-LABEL: test_mask_subs_epu8_rrk_128:
2550; X64:       # %bb.0:
2551; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
2552; X64-NEXT:    vpsubusb %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xd8,0xd1]
2553; X64-NEXT:    vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2]
2554; X64-NEXT:    retq # encoding: [0xc3]
2555  %res = call <16 x i8> @llvm.x86.avx512.mask.psubus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask)
2556  ret <16 x i8> %res
2557}
2558
2559define <16 x i8> @test_mask_subs_epu8_rrkz_128(<16 x i8> %a, <16 x i8> %b, i16 %mask) {
2560; X86-LABEL: test_mask_subs_epu8_rrkz_128:
2561; X86:       # %bb.0:
2562; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
2563; X86-NEXT:    vpsubusb %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0xd8,0xc1]
2564; X86-NEXT:    retl # encoding: [0xc3]
2565;
2566; X64-LABEL: test_mask_subs_epu8_rrkz_128:
2567; X64:       # %bb.0:
2568; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
2569; X64-NEXT:    vpsubusb %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0xd8,0xc1]
2570; X64-NEXT:    retq # encoding: [0xc3]
2571  %res = call <16 x i8> @llvm.x86.avx512.mask.psubus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 %mask)
2572  ret <16 x i8> %res
2573}
2574
2575define <16 x i8> @test_mask_subs_epu8_rm_128(<16 x i8> %a, <16 x i8>* %ptr_b) {
2576; X86-LABEL: test_mask_subs_epu8_rm_128:
2577; X86:       # %bb.0:
2578; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
2579; X86-NEXT:    vpsubusb (%eax), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd8,0x00]
2580; X86-NEXT:    retl # encoding: [0xc3]
2581;
2582; X64-LABEL: test_mask_subs_epu8_rm_128:
2583; X64:       # %bb.0:
2584; X64-NEXT:    vpsubusb (%rdi), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd8,0x07]
2585; X64-NEXT:    retq # encoding: [0xc3]
2586  %b = load <16 x i8>, <16 x i8>* %ptr_b
2587  %res = call <16 x i8> @llvm.x86.avx512.mask.psubus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 -1)
2588  ret <16 x i8> %res
2589}
2590
2591define <16 x i8> @test_mask_subs_epu8_rmk_128(<16 x i8> %a, <16 x i8>* %ptr_b, <16 x i8> %passThru, i16 %mask) {
2592; X86-LABEL: test_mask_subs_epu8_rmk_128:
2593; X86:       # %bb.0:
2594; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
2595; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08]
2596; X86-NEXT:    vpsubusb (%eax), %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xd8,0x08]
2597; X86-NEXT:    vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
2598; X86-NEXT:    retl # encoding: [0xc3]
2599;
2600; X64-LABEL: test_mask_subs_epu8_rmk_128:
2601; X64:       # %bb.0:
2602; X64-NEXT:    kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce]
2603; X64-NEXT:    vpsubusb (%rdi), %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xd8,0x0f]
2604; X64-NEXT:    vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
2605; X64-NEXT:    retq # encoding: [0xc3]
2606  %b = load <16 x i8>, <16 x i8>* %ptr_b
2607  %res = call <16 x i8> @llvm.x86.avx512.mask.psubus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask)
2608  ret <16 x i8> %res
2609}
2610
2611define <16 x i8> @test_mask_subs_epu8_rmkz_128(<16 x i8> %a, <16 x i8>* %ptr_b, i16 %mask) {
2612; X86-LABEL: test_mask_subs_epu8_rmkz_128:
2613; X86:       # %bb.0:
2614; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
2615; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08]
2616; X86-NEXT:    vpsubusb (%eax), %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0xd8,0x00]
2617; X86-NEXT:    retl # encoding: [0xc3]
2618;
2619; X64-LABEL: test_mask_subs_epu8_rmkz_128:
2620; X64:       # %bb.0:
2621; X64-NEXT:    kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce]
2622; X64-NEXT:    vpsubusb (%rdi), %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0xd8,0x07]
2623; X64-NEXT:    retq # encoding: [0xc3]
2624  %b = load <16 x i8>, <16 x i8>* %ptr_b
2625  %res = call <16 x i8> @llvm.x86.avx512.mask.psubus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 %mask)
2626  ret <16 x i8> %res
2627}
2628
2629declare <16 x i8> @llvm.x86.avx512.mask.psubus.b.128(<16 x i8>, <16 x i8>, <16 x i8>, i16)
2630
2631define <32 x i8> @test_mask_subs_epu8_rr_256(<32 x i8> %a, <32 x i8> %b) {
2632; CHECK-LABEL: test_mask_subs_epu8_rr_256:
2633; CHECK:       # %bb.0:
2634; CHECK-NEXT:    vpsubusb %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd8,0xc1]
2635; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
2636  %res = call <32 x i8> @llvm.x86.avx512.mask.psubus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 -1)
2637  ret <32 x i8> %res
2638}
2639
2640define <32 x i8> @test_mask_subs_epu8_rrk_256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask) {
2641; X86-LABEL: test_mask_subs_epu8_rrk_256:
2642; X86:       # %bb.0:
2643; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
2644; X86-NEXT:    vpsubusb %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xd8,0xd1]
2645; X86-NEXT:    vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2]
2646; X86-NEXT:    retl # encoding: [0xc3]
2647;
2648; X64-LABEL: test_mask_subs_epu8_rrk_256:
2649; X64:       # %bb.0:
2650; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
2651; X64-NEXT:    vpsubusb %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xd8,0xd1]
2652; X64-NEXT:    vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2]
2653; X64-NEXT:    retq # encoding: [0xc3]
2654  %res = call <32 x i8> @llvm.x86.avx512.mask.psubus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask)
2655  ret <32 x i8> %res
2656}
2657
2658define <32 x i8> @test_mask_subs_epu8_rrkz_256(<32 x i8> %a, <32 x i8> %b, i32 %mask) {
2659; X86-LABEL: test_mask_subs_epu8_rrkz_256:
2660; X86:       # %bb.0:
2661; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
2662; X86-NEXT:    vpsubusb %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0xd8,0xc1]
2663; X86-NEXT:    retl # encoding: [0xc3]
2664;
2665; X64-LABEL: test_mask_subs_epu8_rrkz_256:
2666; X64:       # %bb.0:
2667; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
2668; X64-NEXT:    vpsubusb %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0xd8,0xc1]
2669; X64-NEXT:    retq # encoding: [0xc3]
2670  %res = call <32 x i8> @llvm.x86.avx512.mask.psubus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 %mask)
2671  ret <32 x i8> %res
2672}
2673
2674define <32 x i8> @test_mask_subs_epu8_rm_256(<32 x i8> %a, <32 x i8>* %ptr_b) {
2675; X86-LABEL: test_mask_subs_epu8_rm_256:
2676; X86:       # %bb.0:
2677; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
2678; X86-NEXT:    vpsubusb (%eax), %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd8,0x00]
2679; X86-NEXT:    retl # encoding: [0xc3]
2680;
2681; X64-LABEL: test_mask_subs_epu8_rm_256:
2682; X64:       # %bb.0:
2683; X64-NEXT:    vpsubusb (%rdi), %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd8,0x07]
2684; X64-NEXT:    retq # encoding: [0xc3]
2685  %b = load <32 x i8>, <32 x i8>* %ptr_b
2686  %res = call <32 x i8> @llvm.x86.avx512.mask.psubus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 -1)
2687  ret <32 x i8> %res
2688}
2689
2690define <32 x i8> @test_mask_subs_epu8_rmk_256(<32 x i8> %a, <32 x i8>* %ptr_b, <32 x i8> %passThru, i32 %mask) {
2691; X86-LABEL: test_mask_subs_epu8_rmk_256:
2692; X86:       # %bb.0:
2693; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
2694; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x08]
2695; X86-NEXT:    vpsubusb (%eax), %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xd8,0x08]
2696; X86-NEXT:    vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
2697; X86-NEXT:    retl # encoding: [0xc3]
2698;
2699; X64-LABEL: test_mask_subs_epu8_rmk_256:
2700; X64:       # %bb.0:
2701; X64-NEXT:    kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce]
2702; X64-NEXT:    vpsubusb (%rdi), %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xd8,0x0f]
2703; X64-NEXT:    vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
2704; X64-NEXT:    retq # encoding: [0xc3]
2705  %b = load <32 x i8>, <32 x i8>* %ptr_b
2706  %res = call <32 x i8> @llvm.x86.avx512.mask.psubus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask)
2707  ret <32 x i8> %res
2708}
2709
2710define <32 x i8> @test_mask_subs_epu8_rmkz_256(<32 x i8> %a, <32 x i8>* %ptr_b, i32 %mask) {
2711; X86-LABEL: test_mask_subs_epu8_rmkz_256:
2712; X86:       # %bb.0:
2713; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
2714; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x08]
2715; X86-NEXT:    vpsubusb (%eax), %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0xd8,0x00]
2716; X86-NEXT:    retl # encoding: [0xc3]
2717;
2718; X64-LABEL: test_mask_subs_epu8_rmkz_256:
2719; X64:       # %bb.0:
2720; X64-NEXT:    kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce]
2721; X64-NEXT:    vpsubusb (%rdi), %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0xd8,0x07]
2722; X64-NEXT:    retq # encoding: [0xc3]
2723  %b = load <32 x i8>, <32 x i8>* %ptr_b
2724  %res = call <32 x i8> @llvm.x86.avx512.mask.psubus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 %mask)
2725  ret <32 x i8> %res
2726}
2727
2728declare <32 x i8> @llvm.x86.avx512.mask.psubus.b.256(<32 x i8>, <32 x i8>, <32 x i8>, i32)
2729
2730define <8 x i16>@test_int_x86_avx512_mask_vpermt2var_hi_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
2731; X86-LABEL: test_int_x86_avx512_mask_vpermt2var_hi_128:
2732; X86:       # %bb.0:
2733; X86-NEXT:    vmovdqa %xmm1, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd9]
2734; X86-NEXT:    vpermt2w %xmm2, %xmm0, %xmm3 # encoding: [0x62,0xf2,0xfd,0x08,0x7d,0xda]
2735; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
2736; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
2737; X86-NEXT:    vpermt2w %xmm2, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0x7d,0xca]
2738; X86-NEXT:    vpaddw %xmm3, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc3]
2739; X86-NEXT:    retl # encoding: [0xc3]
2740;
2741; X64-LABEL: test_int_x86_avx512_mask_vpermt2var_hi_128:
2742; X64:       # %bb.0:
2743; X64-NEXT:    vmovdqa %xmm1, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd9]
2744; X64-NEXT:    vpermt2w %xmm2, %xmm0, %xmm3 # encoding: [0x62,0xf2,0xfd,0x08,0x7d,0xda]
2745; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
2746; X64-NEXT:    vpermt2w %xmm2, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0x7d,0xca]
2747; X64-NEXT:    vpaddw %xmm3, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc3]
2748; X64-NEXT:    retq # encoding: [0xc3]
2749  %1 = call <8 x i16> @llvm.x86.avx512.vpermi2var.hi.128(<8 x i16> %x1, <8 x i16> %x0, <8 x i16> %x2)
2750  %2 = bitcast i8 %x3 to <8 x i1>
2751  %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> %x1
2752  %4 = call <8 x i16> @llvm.x86.avx512.vpermi2var.hi.128(<8 x i16> %x1, <8 x i16> %x0, <8 x i16> %x2)
2753  %res2 = add <8 x i16> %3, %4
2754  ret <8 x i16> %res2
2755}
2756
2757define <8 x i16>@test_int_x86_avx512_maskz_vpermt2var_hi_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
2758; X86-LABEL: test_int_x86_avx512_maskz_vpermt2var_hi_128:
2759; X86:       # %bb.0:
2760; X86-NEXT:    vmovdqa %xmm1, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd9]
2761; X86-NEXT:    vpermt2w %xmm2, %xmm0, %xmm3 # encoding: [0x62,0xf2,0xfd,0x08,0x7d,0xda]
2762; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
2763; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
2764; X86-NEXT:    vpermt2w %xmm2, %xmm0, %xmm1 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0x89,0x7d,0xca]
2765; X86-NEXT:    vpaddw %xmm3, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc3]
2766; X86-NEXT:    retl # encoding: [0xc3]
2767;
2768; X64-LABEL: test_int_x86_avx512_maskz_vpermt2var_hi_128:
2769; X64:       # %bb.0:
2770; X64-NEXT:    vmovdqa %xmm1, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd9]
2771; X64-NEXT:    vpermt2w %xmm2, %xmm0, %xmm3 # encoding: [0x62,0xf2,0xfd,0x08,0x7d,0xda]
2772; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
2773; X64-NEXT:    vpermt2w %xmm2, %xmm0, %xmm1 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0x89,0x7d,0xca]
2774; X64-NEXT:    vpaddw %xmm3, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc3]
2775; X64-NEXT:    retq # encoding: [0xc3]
2776  %1 = call <8 x i16> @llvm.x86.avx512.vpermi2var.hi.128(<8 x i16> %x1, <8 x i16> %x0, <8 x i16> %x2)
2777  %2 = bitcast i8 %x3 to <8 x i1>
2778  %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> zeroinitializer
2779  %4 = call <8 x i16> @llvm.x86.avx512.vpermi2var.hi.128(<8 x i16> %x1, <8 x i16> %x0, <8 x i16> %x2)
2780  %res2 = add <8 x i16> %3, %4
2781  ret <8 x i16> %res2
2782}
2783
2784define <16 x i16>@test_int_x86_avx512_mask_vpermt2var_hi_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) {
2785; X86-LABEL: test_int_x86_avx512_mask_vpermt2var_hi_256:
2786; X86:       # %bb.0:
2787; X86-NEXT:    vmovdqa %ymm1, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd9]
2788; X86-NEXT:    vpermt2w %ymm2, %ymm0, %ymm3 # encoding: [0x62,0xf2,0xfd,0x28,0x7d,0xda]
2789; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
2790; X86-NEXT:    vpermt2w %ymm2, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0x7d,0xca]
2791; X86-NEXT:    vpaddw %ymm3, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc3]
2792; X86-NEXT:    retl # encoding: [0xc3]
2793;
2794; X64-LABEL: test_int_x86_avx512_mask_vpermt2var_hi_256:
2795; X64:       # %bb.0:
2796; X64-NEXT:    vmovdqa %ymm1, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd9]
2797; X64-NEXT:    vpermt2w %ymm2, %ymm0, %ymm3 # encoding: [0x62,0xf2,0xfd,0x28,0x7d,0xda]
2798; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
2799; X64-NEXT:    vpermt2w %ymm2, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0x7d,0xca]
2800; X64-NEXT:    vpaddw %ymm3, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc3]
2801; X64-NEXT:    retq # encoding: [0xc3]
2802  %1 = call <16 x i16> @llvm.x86.avx512.vpermi2var.hi.256(<16 x i16> %x1, <16 x i16> %x0, <16 x i16> %x2)
2803  %2 = bitcast i16 %x3 to <16 x i1>
2804  %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> %x1
2805  %4 = call <16 x i16> @llvm.x86.avx512.vpermi2var.hi.256(<16 x i16> %x1, <16 x i16> %x0, <16 x i16> %x2)
2806  %res2 = add <16 x i16> %3, %4
2807  ret <16 x i16> %res2
2808}
2809
2810define <16 x i16>@test_int_x86_avx512_maskz_vpermt2var_hi_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) {
2811; X86-LABEL: test_int_x86_avx512_maskz_vpermt2var_hi_256:
2812; X86:       # %bb.0:
2813; X86-NEXT:    vmovdqa %ymm1, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd9]
2814; X86-NEXT:    vpermt2w %ymm2, %ymm0, %ymm3 # encoding: [0x62,0xf2,0xfd,0x28,0x7d,0xda]
2815; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
2816; X86-NEXT:    vpermt2w %ymm2, %ymm0, %ymm1 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xa9,0x7d,0xca]
2817; X86-NEXT:    vpaddw %ymm3, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc3]
2818; X86-NEXT:    retl # encoding: [0xc3]
2819;
2820; X64-LABEL: test_int_x86_avx512_maskz_vpermt2var_hi_256:
2821; X64:       # %bb.0:
2822; X64-NEXT:    vmovdqa %ymm1, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd9]
2823; X64-NEXT:    vpermt2w %ymm2, %ymm0, %ymm3 # encoding: [0x62,0xf2,0xfd,0x28,0x7d,0xda]
2824; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
2825; X64-NEXT:    vpermt2w %ymm2, %ymm0, %ymm1 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xa9,0x7d,0xca]
2826; X64-NEXT:    vpaddw %ymm3, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc3]
2827; X64-NEXT:    retq # encoding: [0xc3]
2828  %1 = call <16 x i16> @llvm.x86.avx512.vpermi2var.hi.256(<16 x i16> %x1, <16 x i16> %x0, <16 x i16> %x2)
2829  %2 = bitcast i16 %x3 to <16 x i1>
2830  %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> zeroinitializer
2831  %4 = call <16 x i16> @llvm.x86.avx512.vpermi2var.hi.256(<16 x i16> %x1, <16 x i16> %x0, <16 x i16> %x2)
2832  %res2 = add <16 x i16> %3, %4
2833  ret <16 x i16> %res2
2834}
2835
2836declare <8 x i16> @llvm.x86.avx512.vpermi2var.hi.128(<8 x i16>, <8 x i16>, <8 x i16>)
2837
2838define <8 x i16>@test_int_x86_avx512_mask_vpermi2var_hi_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
2839; X86-LABEL: test_int_x86_avx512_mask_vpermi2var_hi_128:
2840; X86:       # %bb.0:
2841; X86-NEXT:    vmovdqa %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8]
2842; X86-NEXT:    vpermt2w %xmm2, %xmm1, %xmm3 # encoding: [0x62,0xf2,0xf5,0x08,0x7d,0xda]
2843; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
2844; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
2845; X86-NEXT:    vpermi2w %xmm2, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0x75,0xca]
2846; X86-NEXT:    vpaddw %xmm3, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc3]
2847; X86-NEXT:    retl # encoding: [0xc3]
2848;
2849; X64-LABEL: test_int_x86_avx512_mask_vpermi2var_hi_128:
2850; X64:       # %bb.0:
2851; X64-NEXT:    vmovdqa %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8]
2852; X64-NEXT:    vpermt2w %xmm2, %xmm1, %xmm3 # encoding: [0x62,0xf2,0xf5,0x08,0x7d,0xda]
2853; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
2854; X64-NEXT:    vpermi2w %xmm2, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0x75,0xca]
2855; X64-NEXT:    vpaddw %xmm3, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc3]
2856; X64-NEXT:    retq # encoding: [0xc3]
2857  %1 = call <8 x i16> @llvm.x86.avx512.vpermi2var.hi.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2)
2858  %2 = bitcast i8 %x3 to <8 x i1>
2859  %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> %x1
2860  %4 = call <8 x i16> @llvm.x86.avx512.vpermi2var.hi.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2)
2861  %res2 = add <8 x i16> %3, %4
2862  ret <8 x i16> %res2
2863}
2864
2865declare <16 x i16> @llvm.x86.avx512.vpermi2var.hi.256(<16 x i16>, <16 x i16>, <16 x i16>)
2866
2867define <16 x i16>@test_int_x86_avx512_mask_vpermi2var_hi_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) {
2868; X86-LABEL: test_int_x86_avx512_mask_vpermi2var_hi_256:
2869; X86:       # %bb.0:
2870; X86-NEXT:    vmovdqa %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8]
2871; X86-NEXT:    vpermt2w %ymm2, %ymm1, %ymm3 # encoding: [0x62,0xf2,0xf5,0x28,0x7d,0xda]
2872; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
2873; X86-NEXT:    vpermi2w %ymm2, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0x75,0xca]
2874; X86-NEXT:    vpaddw %ymm3, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc3]
2875; X86-NEXT:    retl # encoding: [0xc3]
2876;
2877; X64-LABEL: test_int_x86_avx512_mask_vpermi2var_hi_256:
2878; X64:       # %bb.0:
2879; X64-NEXT:    vmovdqa %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8]
2880; X64-NEXT:    vpermt2w %ymm2, %ymm1, %ymm3 # encoding: [0x62,0xf2,0xf5,0x28,0x7d,0xda]
2881; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
2882; X64-NEXT:    vpermi2w %ymm2, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0x75,0xca]
2883; X64-NEXT:    vpaddw %ymm3, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc3]
2884; X64-NEXT:    retq # encoding: [0xc3]
2885  %1 = call <16 x i16> @llvm.x86.avx512.vpermi2var.hi.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2)
2886  %2 = bitcast i16 %x3 to <16 x i1>
2887  %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> %x1
2888  %4 = call <16 x i16> @llvm.x86.avx512.vpermi2var.hi.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2)
2889  %res2 = add <16 x i16> %3, %4
2890  ret <16 x i16> %res2
2891}
2892
2893declare <8 x i16> @llvm.x86.sse2.pmulhu.w(<8 x i16>, <8 x i16>)
2894
2895define <8 x i16> @test_int_x86_avx512_mask_pmulhu_w_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
2896; X86-LABEL: test_int_x86_avx512_mask_pmulhu_w_128:
2897; X86:       # %bb.0:
2898; X86-NEXT:    vpmulhuw %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe4,0xd9]
2899; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
2900; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
2901; X86-NEXT:    vpmulhuw %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xe4,0xd1]
2902; X86-NEXT:    vpaddw %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc3]
2903; X86-NEXT:    retl # encoding: [0xc3]
2904;
2905; X64-LABEL: test_int_x86_avx512_mask_pmulhu_w_128:
2906; X64:       # %bb.0:
2907; X64-NEXT:    vpmulhuw %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe4,0xd9]
2908; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
2909; X64-NEXT:    vpmulhuw %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xe4,0xd1]
2910; X64-NEXT:    vpaddw %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc3]
2911; X64-NEXT:    retq # encoding: [0xc3]
2912  %1 = call <8 x i16> @llvm.x86.sse2.pmulhu.w(<8 x i16> %x0, <8 x i16> %x1)
2913  %2 = bitcast i8 %x3 to <8 x i1>
2914  %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> %x2
2915  %4 = call <8 x i16> @llvm.x86.sse2.pmulhu.w(<8 x i16> %x0, <8 x i16> %x1)
2916  %res2 = add <8 x i16> %3, %4
2917  ret <8 x i16> %res2
2918}
2919
2920declare <16 x i16> @llvm.x86.avx2.pmulhu.w(<16 x i16>, <16 x i16>)
2921
2922define <16 x i16> @test_int_x86_avx512_mask_pmulhu_w_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) {
2923; X86-LABEL: test_int_x86_avx512_mask_pmulhu_w_256:
2924; X86:       # %bb.0:
2925; X86-NEXT:    vpmulhuw %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe4,0xd9]
2926; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
2927; X86-NEXT:    vpmulhuw %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xe4,0xd1]
2928; X86-NEXT:    vpaddw %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc3]
2929; X86-NEXT:    retl # encoding: [0xc3]
2930;
2931; X64-LABEL: test_int_x86_avx512_mask_pmulhu_w_256:
2932; X64:       # %bb.0:
2933; X64-NEXT:    vpmulhuw %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe4,0xd9]
2934; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
2935; X64-NEXT:    vpmulhuw %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xe4,0xd1]
2936; X64-NEXT:    vpaddw %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc3]
2937; X64-NEXT:    retq # encoding: [0xc3]
2938  %1 = call <16 x i16> @llvm.x86.avx2.pmulhu.w(<16 x i16> %x0, <16 x i16> %x1)
2939  %2 = bitcast i16 %x3 to <16 x i1>
2940  %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> %x2
2941  %4 = call <16 x i16> @llvm.x86.avx2.pmulhu.w(<16 x i16> %x0, <16 x i16> %x1)
2942  %res2 = add <16 x i16> %3, %4
2943  ret <16 x i16> %res2
2944}
2945
2946declare <8 x i16> @llvm.x86.sse2.pmulh.w(<8 x i16>, <8 x i16>)
2947
2948define <8 x i16> @test_int_x86_avx512_mask_pmulh_w_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
2949; X86-LABEL: test_int_x86_avx512_mask_pmulh_w_128:
2950; X86:       # %bb.0:
2951; X86-NEXT:    vpmulhw %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe5,0xd9]
2952; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
2953; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
2954; X86-NEXT:    vpmulhw %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xe5,0xd1]
2955; X86-NEXT:    vpaddw %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc3]
2956; X86-NEXT:    retl # encoding: [0xc3]
2957;
2958; X64-LABEL: test_int_x86_avx512_mask_pmulh_w_128:
2959; X64:       # %bb.0:
2960; X64-NEXT:    vpmulhw %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe5,0xd9]
2961; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
2962; X64-NEXT:    vpmulhw %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xe5,0xd1]
2963; X64-NEXT:    vpaddw %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc3]
2964; X64-NEXT:    retq # encoding: [0xc3]
2965  %1 = call <8 x i16> @llvm.x86.sse2.pmulh.w(<8 x i16> %x0, <8 x i16> %x1)
2966  %2 = bitcast i8 %x3 to <8 x i1>
2967  %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> %x2
2968  %4 = call <8 x i16> @llvm.x86.sse2.pmulh.w(<8 x i16> %x0, <8 x i16> %x1)
2969  %res2 = add <8 x i16> %3, %4
2970  ret <8 x i16> %res2
2971}
2972
2973declare <16 x i16> @llvm.x86.avx2.pmulh.w(<16 x i16>, <16 x i16>)
2974
2975define <16 x i16> @test_int_x86_avx512_mask_pmulh_w_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) {
2976; X86-LABEL: test_int_x86_avx512_mask_pmulh_w_256:
2977; X86:       # %bb.0:
2978; X86-NEXT:    vpmulhw %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe5,0xd9]
2979; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
2980; X86-NEXT:    vpmulhw %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xe5,0xd1]
2981; X86-NEXT:    vpaddw %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc3]
2982; X86-NEXT:    retl # encoding: [0xc3]
2983;
2984; X64-LABEL: test_int_x86_avx512_mask_pmulh_w_256:
2985; X64:       # %bb.0:
2986; X64-NEXT:    vpmulhw %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe5,0xd9]
2987; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
2988; X64-NEXT:    vpmulhw %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xe5,0xd1]
2989; X64-NEXT:    vpaddw %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc3]
2990; X64-NEXT:    retq # encoding: [0xc3]
2991  %1 = call <16 x i16> @llvm.x86.avx2.pmulh.w(<16 x i16> %x0, <16 x i16> %x1)
2992  %2 = bitcast i16 %x3 to <16 x i1>
2993  %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> %x2
2994  %4 = call <16 x i16> @llvm.x86.avx2.pmulh.w(<16 x i16> %x0, <16 x i16> %x1)
2995  %res2 = add <16 x i16> %3, %4
2996  ret <16 x i16> %res2
2997}
2998
2999declare <8 x i16> @llvm.x86.ssse3.pmul.hr.sw.128(<8 x i16>, <8 x i16>)
3000
3001define <8 x i16> @test_int_x86_avx512_mask_pmulhr_sw_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
3002; X86-LABEL: test_int_x86_avx512_mask_pmulhr_sw_128:
3003; X86:       # %bb.0:
3004; X86-NEXT:    vpmulhrsw %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x0b,0xd9]
3005; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
3006; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
3007; X86-NEXT:    vpmulhrsw %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x0b,0xd1]
3008; X86-NEXT:    vpaddw %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc3]
3009; X86-NEXT:    retl # encoding: [0xc3]
3010;
3011; X64-LABEL: test_int_x86_avx512_mask_pmulhr_sw_128:
3012; X64:       # %bb.0:
3013; X64-NEXT:    vpmulhrsw %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x0b,0xd9]
3014; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
3015; X64-NEXT:    vpmulhrsw %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x0b,0xd1]
3016; X64-NEXT:    vpaddw %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc3]
3017; X64-NEXT:    retq # encoding: [0xc3]
3018  %1 = call <8 x i16> @llvm.x86.ssse3.pmul.hr.sw.128(<8 x i16> %x0, <8 x i16> %x1)
3019  %2 = bitcast i8 %x3 to <8 x i1>
3020  %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> %x2
3021  %4 = call <8 x i16> @llvm.x86.ssse3.pmul.hr.sw.128(<8 x i16> %x0, <8 x i16> %x1)
3022  %res2 = add <8 x i16> %3, %4
3023  ret <8 x i16> %res2
3024}
3025
3026declare <16 x i16> @llvm.x86.avx2.pmul.hr.sw(<16 x i16>, <16 x i16>)
3027
3028define <16 x i16> @test_int_x86_avx512_mask_pmulhr_sw_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) {
3029; X86-LABEL: test_int_x86_avx512_mask_pmulhr_sw_256:
3030; X86:       # %bb.0:
3031; X86-NEXT:    vpmulhrsw %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x0b,0xd9]
3032; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
3033; X86-NEXT:    vpmulhrsw %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x0b,0xd1]
3034; X86-NEXT:    vpaddw %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc3]
3035; X86-NEXT:    retl # encoding: [0xc3]
3036;
3037; X64-LABEL: test_int_x86_avx512_mask_pmulhr_sw_256:
3038; X64:       # %bb.0:
3039; X64-NEXT:    vpmulhrsw %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x0b,0xd9]
3040; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
3041; X64-NEXT:    vpmulhrsw %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x0b,0xd1]
3042; X64-NEXT:    vpaddw %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc3]
3043; X64-NEXT:    retq # encoding: [0xc3]
3044  %1 = call <16 x i16> @llvm.x86.avx2.pmul.hr.sw(<16 x i16> %x0, <16 x i16> %x1)
3045  %2 = bitcast i16 %x3 to <16 x i1>
3046  %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> %x2
3047  %4 = call <16 x i16> @llvm.x86.avx2.pmul.hr.sw(<16 x i16> %x0, <16 x i16> %x1)
3048  %res2 = add <16 x i16> %3, %4
3049  ret <16 x i16> %res2
3050}
3051
3052declare <16 x i8> @llvm.x86.avx512.mask.pmov.wb.128(<8 x i16>, <16 x i8>, i8)
3053
3054define <16 x i8>@test_int_x86_avx512_mask_pmov_wb_128(<8 x i16> %x0, <16 x i8> %x1, i8 %x2) {
3055; X86-LABEL: test_int_x86_avx512_mask_pmov_wb_128:
3056; X86:       # %bb.0:
3057; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
3058; X86-NEXT:    vpmovwb %xmm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x30,0xc2]
3059; X86-NEXT:    vpmovwb %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x30,0xc1]
3060; X86-NEXT:    vpaddb %xmm2, %xmm1, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfc,0xca]
3061; X86-NEXT:    vpmovwb %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x30,0xc0]
3062; X86-NEXT:    vpaddb %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfc,0xc1]
3063; X86-NEXT:    retl # encoding: [0xc3]
3064;
3065; X64-LABEL: test_int_x86_avx512_mask_pmov_wb_128:
3066; X64:       # %bb.0:
3067; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
3068; X64-NEXT:    vpmovwb %xmm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x30,0xc2]
3069; X64-NEXT:    vpmovwb %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x30,0xc1]
3070; X64-NEXT:    vpaddb %xmm2, %xmm1, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfc,0xca]
3071; X64-NEXT:    vpmovwb %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x30,0xc0]
3072; X64-NEXT:    vpaddb %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfc,0xc1]
3073; X64-NEXT:    retq # encoding: [0xc3]
3074    %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmov.wb.128(<8 x i16> %x0, <16 x i8> %x1, i8 -1)
3075    %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmov.wb.128(<8 x i16> %x0, <16 x i8> %x1, i8 %x2)
3076    %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmov.wb.128(<8 x i16> %x0, <16 x i8> zeroinitializer, i8 %x2)
3077    %res3 = add <16 x i8> %res0, %res1
3078    %res4 = add <16 x i8> %res3, %res2
3079    ret <16 x i8> %res4
3080}
3081
3082declare void @llvm.x86.avx512.mask.pmov.wb.mem.128(i8* %ptr, <8 x i16>, i8)
3083
3084define void @test_int_x86_avx512_mask_pmov_wb_mem_128(i8* %ptr, <8 x i16> %x1, i8 %x2) {
3085; X86-LABEL: test_int_x86_avx512_mask_pmov_wb_mem_128:
3086; X86:       # %bb.0:
3087; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
3088; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
3089; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
3090; X86-NEXT:    vpmovwb %xmm0, (%eax) # encoding: [0x62,0xf2,0x7e,0x08,0x30,0x00]
3091; X86-NEXT:    vpmovwb %xmm0, (%eax) {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x30,0x00]
3092; X86-NEXT:    retl # encoding: [0xc3]
3093;
3094; X64-LABEL: test_int_x86_avx512_mask_pmov_wb_mem_128:
3095; X64:       # %bb.0:
3096; X64-NEXT:    kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce]
3097; X64-NEXT:    vpmovwb %xmm0, (%rdi) # encoding: [0x62,0xf2,0x7e,0x08,0x30,0x07]
3098; X64-NEXT:    vpmovwb %xmm0, (%rdi) {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x30,0x07]
3099; X64-NEXT:    retq # encoding: [0xc3]
3100    call void @llvm.x86.avx512.mask.pmov.wb.mem.128(i8* %ptr, <8 x i16> %x1, i8 -1)
3101    call void @llvm.x86.avx512.mask.pmov.wb.mem.128(i8* %ptr, <8 x i16> %x1, i8 %x2)
3102    ret void
3103}
3104
3105declare <16 x i8> @llvm.x86.avx512.mask.pmovs.wb.128(<8 x i16>, <16 x i8>, i8)
3106
3107define <16 x i8> @test_int_x86_avx512_mask_pmovs_wb_128(<8 x i16> %x0, <16 x i8> %x1, i8 %x2) {
3108; X86-LABEL: test_int_x86_avx512_mask_pmovs_wb_128:
3109; X86:       # %bb.0:
3110; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
3111; X86-NEXT:    vpmovswb %xmm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x20,0xc2]
3112; X86-NEXT:    vpmovswb %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x20,0xc1]
3113; X86-NEXT:    vpaddb %xmm2, %xmm1, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfc,0xca]
3114; X86-NEXT:    vpmovswb %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x20,0xc0]
3115; X86-NEXT:    vpaddb %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfc,0xc1]
3116; X86-NEXT:    retl # encoding: [0xc3]
3117;
3118; X64-LABEL: test_int_x86_avx512_mask_pmovs_wb_128:
3119; X64:       # %bb.0:
3120; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
3121; X64-NEXT:    vpmovswb %xmm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x20,0xc2]
3122; X64-NEXT:    vpmovswb %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x20,0xc1]
3123; X64-NEXT:    vpaddb %xmm2, %xmm1, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfc,0xca]
3124; X64-NEXT:    vpmovswb %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x20,0xc0]
3125; X64-NEXT:    vpaddb %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfc,0xc1]
3126; X64-NEXT:    retq # encoding: [0xc3]
3127    %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.wb.128(<8 x i16> %x0, <16 x i8> %x1, i8 -1)
3128    %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.wb.128(<8 x i16> %x0, <16 x i8> %x1, i8 %x2)
3129    %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.wb.128(<8 x i16> %x0, <16 x i8> zeroinitializer, i8 %x2)
3130    %res3 = add <16 x i8> %res0, %res1
3131    %res4 = add <16 x i8> %res3, %res2
3132    ret <16 x i8> %res4
3133}
3134
3135declare void @llvm.x86.avx512.mask.pmovs.wb.mem.128(i8* %ptr, <8 x i16>, i8)
3136
3137define void @test_int_x86_avx512_mask_pmovs_wb_mem_128(i8* %ptr, <8 x i16> %x1, i8 %x2) {
3138; X86-LABEL: test_int_x86_avx512_mask_pmovs_wb_mem_128:
3139; X86:       # %bb.0:
3140; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
3141; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
3142; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
3143; X86-NEXT:    vpmovswb %xmm0, (%eax) # encoding: [0x62,0xf2,0x7e,0x08,0x20,0x00]
3144; X86-NEXT:    vpmovswb %xmm0, (%eax) {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x20,0x00]
3145; X86-NEXT:    retl # encoding: [0xc3]
3146;
3147; X64-LABEL: test_int_x86_avx512_mask_pmovs_wb_mem_128:
3148; X64:       # %bb.0:
3149; X64-NEXT:    kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce]
3150; X64-NEXT:    vpmovswb %xmm0, (%rdi) # encoding: [0x62,0xf2,0x7e,0x08,0x20,0x07]
3151; X64-NEXT:    vpmovswb %xmm0, (%rdi) {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x20,0x07]
3152; X64-NEXT:    retq # encoding: [0xc3]
3153    call void @llvm.x86.avx512.mask.pmovs.wb.mem.128(i8* %ptr, <8 x i16> %x1, i8 -1)
3154    call void @llvm.x86.avx512.mask.pmovs.wb.mem.128(i8* %ptr, <8 x i16> %x1, i8 %x2)
3155    ret void
3156}
3157
3158declare <16 x i8> @llvm.x86.avx512.mask.pmovus.wb.128(<8 x i16>, <16 x i8>, i8)
3159
3160define <16 x i8> @test_int_x86_avx512_mask_pmovus_wb_128(<8 x i16> %x0, <16 x i8> %x1, i8 %x2) {
3161; X86-LABEL: test_int_x86_avx512_mask_pmovus_wb_128:
3162; X86:       # %bb.0:
3163; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
3164; X86-NEXT:    vpmovuswb %xmm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x10,0xc2]
3165; X86-NEXT:    vpmovuswb %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x10,0xc1]
3166; X86-NEXT:    vpaddb %xmm2, %xmm1, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfc,0xca]
3167; X86-NEXT:    vpmovuswb %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x10,0xc0]
3168; X86-NEXT:    vpaddb %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfc,0xc1]
3169; X86-NEXT:    retl # encoding: [0xc3]
3170;
3171; X64-LABEL: test_int_x86_avx512_mask_pmovus_wb_128:
3172; X64:       # %bb.0:
3173; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
3174; X64-NEXT:    vpmovuswb %xmm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x10,0xc2]
3175; X64-NEXT:    vpmovuswb %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x10,0xc1]
3176; X64-NEXT:    vpaddb %xmm2, %xmm1, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfc,0xca]
3177; X64-NEXT:    vpmovuswb %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x10,0xc0]
3178; X64-NEXT:    vpaddb %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfc,0xc1]
3179; X64-NEXT:    retq # encoding: [0xc3]
3180    %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.wb.128(<8 x i16> %x0, <16 x i8> %x1, i8 -1)
3181    %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.wb.128(<8 x i16> %x0, <16 x i8> %x1, i8 %x2)
3182    %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.wb.128(<8 x i16> %x0, <16 x i8> zeroinitializer, i8 %x2)
3183    %res3 = add <16 x i8> %res0, %res1
3184    %res4 = add <16 x i8> %res3, %res2
3185    ret <16 x i8> %res4
3186}
3187
3188declare void @llvm.x86.avx512.mask.pmovus.wb.mem.128(i8* %ptr, <8 x i16>, i8)
3189
3190define void @test_int_x86_avx512_mask_pmovus_wb_mem_128(i8* %ptr, <8 x i16> %x1, i8 %x2) {
3191; X86-LABEL: test_int_x86_avx512_mask_pmovus_wb_mem_128:
3192; X86:       # %bb.0:
3193; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
3194; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
3195; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
3196; X86-NEXT:    vpmovuswb %xmm0, (%eax) # encoding: [0x62,0xf2,0x7e,0x08,0x10,0x00]
3197; X86-NEXT:    vpmovuswb %xmm0, (%eax) {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x10,0x00]
3198; X86-NEXT:    retl # encoding: [0xc3]
3199;
3200; X64-LABEL: test_int_x86_avx512_mask_pmovus_wb_mem_128:
3201; X64:       # %bb.0:
3202; X64-NEXT:    kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce]
3203; X64-NEXT:    vpmovuswb %xmm0, (%rdi) # encoding: [0x62,0xf2,0x7e,0x08,0x10,0x07]
3204; X64-NEXT:    vpmovuswb %xmm0, (%rdi) {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x10,0x07]
3205; X64-NEXT:    retq # encoding: [0xc3]
3206    call void @llvm.x86.avx512.mask.pmovus.wb.mem.128(i8* %ptr, <8 x i16> %x1, i8 -1)
3207    call void @llvm.x86.avx512.mask.pmovus.wb.mem.128(i8* %ptr, <8 x i16> %x1, i8 %x2)
3208    ret void
3209}
3210
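; The 256-bit down-conversions mirror the 128-bit tests above, but the source
; is a ymm register and the mask widens to i16. Since the generated code uses
; ymm state, a vzeroupper is expected before each return.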
3211declare <16 x i8> @llvm.x86.avx512.mask.pmov.wb.256(<16 x i16>, <16 x i8>, i16)
3212
3213define <16 x i8> @test_int_x86_avx512_mask_pmov_wb_256(<16 x i16> %x0, <16 x i8> %x1, i16 %x2) {
3214; X86-LABEL: test_int_x86_avx512_mask_pmov_wb_256:
3215; X86:       # %bb.0:
3216; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
3217; X86-NEXT:    vpmovwb %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x30,0xc1]
3218; X86-NEXT:    vpmovwb %ymm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x30,0xc2]
3219; X86-NEXT:    vpaddb %xmm2, %xmm1, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfc,0xca]
3220; X86-NEXT:    vpmovwb %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x30,0xc0]
3221; X86-NEXT:    vpaddb %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfc,0xc1]
3222; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
3223; X86-NEXT:    retl # encoding: [0xc3]
3224;
3225; X64-LABEL: test_int_x86_avx512_mask_pmov_wb_256:
3226; X64:       # %bb.0:
3227; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
3228; X64-NEXT:    vpmovwb %ymm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x30,0xc2]
3229; X64-NEXT:    vpmovwb %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x30,0xc1]
3230; X64-NEXT:    vpaddb %xmm2, %xmm1, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfc,0xca]
3231; X64-NEXT:    vpmovwb %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x30,0xc0]
3232; X64-NEXT:    vpaddb %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfc,0xc1]
3233; X64-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
3234; X64-NEXT:    retq # encoding: [0xc3]
3235    %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmov.wb.256(<16 x i16> %x0, <16 x i8> %x1, i16 -1)
3236    %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmov.wb.256(<16 x i16> %x0, <16 x i8> %x1, i16 %x2)
3237    %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmov.wb.256(<16 x i16> %x0, <16 x i8> zeroinitializer, i16 %x2)
3238    %res3 = add <16 x i8> %res0, %res1
3239    %res4 = add <16 x i8> %res3, %res2
3240    ret <16 x i8> %res4
3241}
3242
3243declare void @llvm.x86.avx512.mask.pmov.wb.mem.256(i8* %ptr, <16 x i16>, i16)
3244
3245define void @test_int_x86_avx512_mask_pmov_wb_mem_256(i8* %ptr, <16 x i16> %x1, i16 %x2) {
3246; X86-LABEL: test_int_x86_avx512_mask_pmov_wb_mem_256:
3247; X86:       # %bb.0:
3248; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08]
3249; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
3250; X86-NEXT:    vpmovwb %ymm0, (%eax) # encoding: [0x62,0xf2,0x7e,0x28,0x30,0x00]
3251; X86-NEXT:    vpmovwb %ymm0, (%eax) {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x30,0x00]
3252; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
3253; X86-NEXT:    retl # encoding: [0xc3]
3254;
3255; X64-LABEL: test_int_x86_avx512_mask_pmov_wb_mem_256:
3256; X64:       # %bb.0:
3257; X64-NEXT:    kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce]
3258; X64-NEXT:    vpmovwb %ymm0, (%rdi) # encoding: [0x62,0xf2,0x7e,0x28,0x30,0x07]
3259; X64-NEXT:    vpmovwb %ymm0, (%rdi) {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x30,0x07]
3260; X64-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
3261; X64-NEXT:    retq # encoding: [0xc3]
3262    call void @llvm.x86.avx512.mask.pmov.wb.mem.256(i8* %ptr, <16 x i16> %x1, i16 -1)
3263    call void @llvm.x86.avx512.mask.pmov.wb.mem.256(i8* %ptr, <16 x i16> %x1, i16 %x2)
3264    ret void
3265}
3266
3267declare <16 x i8> @llvm.x86.avx512.mask.pmovs.wb.256(<16 x i16>, <16 x i8>, i16)
3268
3269define <16 x i8> @test_int_x86_avx512_mask_pmovs_wb_256(<16 x i16> %x0, <16 x i8> %x1, i16 %x2) {
3270; X86-LABEL: test_int_x86_avx512_mask_pmovs_wb_256:
3271; X86:       # %bb.0:
3272; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
3273; X86-NEXT:    vpmovswb %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x20,0xc1]
3274; X86-NEXT:    vpmovswb %ymm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x20,0xc2]
3275; X86-NEXT:    vpaddb %xmm2, %xmm1, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfc,0xca]
3276; X86-NEXT:    vpmovswb %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x20,0xc0]
3277; X86-NEXT:    vpaddb %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfc,0xc1]
3278; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
3279; X86-NEXT:    retl # encoding: [0xc3]
3280;
3281; X64-LABEL: test_int_x86_avx512_mask_pmovs_wb_256:
3282; X64:       # %bb.0:
3283; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
3284; X64-NEXT:    vpmovswb %ymm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x20,0xc2]
3285; X64-NEXT:    vpmovswb %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x20,0xc1]
3286; X64-NEXT:    vpaddb %xmm2, %xmm1, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfc,0xca]
3287; X64-NEXT:    vpmovswb %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x20,0xc0]
3288; X64-NEXT:    vpaddb %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfc,0xc1]
3289; X64-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
3290; X64-NEXT:    retq # encoding: [0xc3]
3291    %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.wb.256(<16 x i16> %x0, <16 x i8> %x1, i16 -1)
3292    %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.wb.256(<16 x i16> %x0, <16 x i8> %x1, i16 %x2)
3293    %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.wb.256(<16 x i16> %x0, <16 x i8> zeroinitializer, i16 %x2)
3294    %res3 = add <16 x i8> %res0, %res1
3295    %res4 = add <16 x i8> %res3, %res2
3296    ret <16 x i8> %res4
3297}
3298
3299declare void @llvm.x86.avx512.mask.pmovs.wb.mem.256(i8* %ptr, <16 x i16>, i16)
3300
3301define void @test_int_x86_avx512_mask_pmovs_wb_mem_256(i8* %ptr, <16 x i16> %x1, i16 %x2) {
3302; X86-LABEL: test_int_x86_avx512_mask_pmovs_wb_mem_256:
3303; X86:       # %bb.0:
3304; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08]
3305; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
3306; X86-NEXT:    vpmovswb %ymm0, (%eax) # encoding: [0x62,0xf2,0x7e,0x28,0x20,0x00]
3307; X86-NEXT:    vpmovswb %ymm0, (%eax) {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x20,0x00]
3308; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
3309; X86-NEXT:    retl # encoding: [0xc3]
3310;
3311; X64-LABEL: test_int_x86_avx512_mask_pmovs_wb_mem_256:
3312; X64:       # %bb.0:
3313; X64-NEXT:    kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce]
3314; X64-NEXT:    vpmovswb %ymm0, (%rdi) # encoding: [0x62,0xf2,0x7e,0x28,0x20,0x07]
3315; X64-NEXT:    vpmovswb %ymm0, (%rdi) {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x20,0x07]
3316; X64-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
3317; X64-NEXT:    retq # encoding: [0xc3]
3318    call void @llvm.x86.avx512.mask.pmovs.wb.mem.256(i8* %ptr, <16 x i16> %x1, i16 -1)
3319    call void @llvm.x86.avx512.mask.pmovs.wb.mem.256(i8* %ptr, <16 x i16> %x1, i16 %x2)
3320    ret void
3321}
3322
3323declare <16 x i8> @llvm.x86.avx512.mask.pmovus.wb.256(<16 x i16>, <16 x i8>, i16)
3324
3325define <16 x i8> @test_int_x86_avx512_mask_pmovus_wb_256(<16 x i16> %x0, <16 x i8> %x1, i16 %x2) {
3326; X86-LABEL: test_int_x86_avx512_mask_pmovus_wb_256:
3327; X86:       # %bb.0:
3328; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
3329; X86-NEXT:    vpmovuswb %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x10,0xc1]
3330; X86-NEXT:    vpmovuswb %ymm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x10,0xc2]
3331; X86-NEXT:    vpaddb %xmm2, %xmm1, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfc,0xca]
3332; X86-NEXT:    vpmovuswb %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x10,0xc0]
3333; X86-NEXT:    vpaddb %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfc,0xc1]
3334; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
3335; X86-NEXT:    retl # encoding: [0xc3]
3336;
3337; X64-LABEL: test_int_x86_avx512_mask_pmovus_wb_256:
3338; X64:       # %bb.0:
3339; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
3340; X64-NEXT:    vpmovuswb %ymm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x10,0xc2]
3341; X64-NEXT:    vpmovuswb %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x10,0xc1]
3342; X64-NEXT:    vpaddb %xmm2, %xmm1, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfc,0xca]
3343; X64-NEXT:    vpmovuswb %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x10,0xc0]
3344; X64-NEXT:    vpaddb %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfc,0xc1]
3345; X64-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
3346; X64-NEXT:    retq # encoding: [0xc3]
3347    %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.wb.256(<16 x i16> %x0, <16 x i8> %x1, i16 -1)
3348    %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.wb.256(<16 x i16> %x0, <16 x i8> %x1, i16 %x2)
3349    %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.wb.256(<16 x i16> %x0, <16 x i8> zeroinitializer, i16 %x2)
3350    %res3 = add <16 x i8> %res0, %res1
3351    %res4 = add <16 x i8> %res3, %res2
3352    ret <16 x i8> %res4
3353}
3354
3355declare void @llvm.x86.avx512.mask.pmovus.wb.mem.256(i8* %ptr, <16 x i16>, i16)
3356
3357define void @test_int_x86_avx512_mask_pmovus_wb_mem_256(i8* %ptr, <16 x i16> %x1, i16 %x2) {
3358; X86-LABEL: test_int_x86_avx512_mask_pmovus_wb_mem_256:
3359; X86:       # %bb.0:
3360; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08]
3361; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
3362; X86-NEXT:    vpmovuswb %ymm0, (%eax) # encoding: [0x62,0xf2,0x7e,0x28,0x10,0x00]
3363; X86-NEXT:    vpmovuswb %ymm0, (%eax) {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x10,0x00]
3364; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
3365; X86-NEXT:    retl # encoding: [0xc3]
3366;
3367; X64-LABEL: test_int_x86_avx512_mask_pmovus_wb_mem_256:
3368; X64:       # %bb.0:
3369; X64-NEXT:    kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce]
3370; X64-NEXT:    vpmovuswb %ymm0, (%rdi) # encoding: [0x62,0xf2,0x7e,0x28,0x10,0x07]
3371; X64-NEXT:    vpmovuswb %ymm0, (%rdi) {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x10,0x07]
3372; X64-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
3373; X64-NEXT:    retq # encoding: [0xc3]
3374    call void @llvm.x86.avx512.mask.pmovus.wb.mem.256(i8* %ptr, <16 x i16> %x1, i16 -1)
3375    call void @llvm.x86.avx512.mask.pmovus.wb.mem.256(i8* %ptr, <16 x i16> %x1, i16 %x2)
3376    ret void
3377}
3378
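; Multiply-add tests: vpmaddwd (pairs of words summed into dwords) and
; vpmaddubsw (unsigned-by-signed byte products summed into words). Masking is
; expressed by bitcasting the scalar mask to a vector of i1 and selecting
; against %x2; the 128-bit vpmaddwd result only has four elements, so just
; the low four mask bits are extracted via shufflevector.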
3379declare <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16>, <8 x i16>)
3380
3381define <4 x i32> @test_int_x86_avx512_mask_pmaddw_d_128(<8 x i16> %x0, <8 x i16> %x1, <4 x i32> %x2, i8 %x3) {
3382; X86-LABEL: test_int_x86_avx512_mask_pmaddw_d_128:
3383; X86:       # %bb.0:
3384; X86-NEXT:    vpmaddwd %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xf5,0xd9]
3385; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
3386; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
3387; X86-NEXT:    vpmaddwd %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xf5,0xd1]
3388; X86-NEXT:    vpaddd %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc3]
3389; X86-NEXT:    retl # encoding: [0xc3]
3390;
3391; X64-LABEL: test_int_x86_avx512_mask_pmaddw_d_128:
3392; X64:       # %bb.0:
3393; X64-NEXT:    vpmaddwd %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xf5,0xd9]
3394; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
3395; X64-NEXT:    vpmaddwd %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xf5,0xd1]
3396; X64-NEXT:    vpaddd %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc3]
3397; X64-NEXT:    retq # encoding: [0xc3]
3398  %1 = call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %x0, <8 x i16> %x1)
3399  %2 = bitcast i8 %x3 to <8 x i1>
3400  %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
3401  %3 = select <4 x i1> %extract, <4 x i32> %1, <4 x i32> %x2
3402  %4 = call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %x0, <8 x i16> %x1)
3403  %res2 = add <4 x i32> %3, %4
3404  ret <4 x i32> %res2
3405}
3406
3407declare <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16>, <16 x i16>)
3408
3409define <8 x i32> @test_int_x86_avx512_mask_pmaddw_d_256(<16 x i16> %x0, <16 x i16> %x1, <8 x i32> %x2, i8 %x3) {
3410; X86-LABEL: test_int_x86_avx512_mask_pmaddw_d_256:
3411; X86:       # %bb.0:
3412; X86-NEXT:    vpmaddwd %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xf5,0xd9]
3413; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
3414; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
3415; X86-NEXT:    vpmaddwd %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xf5,0xd1]
3416; X86-NEXT:    vpaddd %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xc3]
3417; X86-NEXT:    retl # encoding: [0xc3]
3418;
3419; X64-LABEL: test_int_x86_avx512_mask_pmaddw_d_256:
3420; X64:       # %bb.0:
3421; X64-NEXT:    vpmaddwd %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xf5,0xd9]
3422; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
3423; X64-NEXT:    vpmaddwd %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xf5,0xd1]
3424; X64-NEXT:    vpaddd %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xc3]
3425; X64-NEXT:    retq # encoding: [0xc3]
3426  %1 = call <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16> %x0, <16 x i16> %x1)
3427  %2 = bitcast i8 %x3 to <8 x i1>
3428  %3 = select <8 x i1> %2, <8 x i32> %1, <8 x i32> %x2
3429  %4 = call <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16> %x0, <16 x i16> %x1)
3430  %res2 = add <8 x i32> %3, %4
3431  ret <8 x i32> %res2
3432}
3433
3434declare <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8>, <16 x i8>)
3435
3436define <8 x i16> @test_int_x86_avx512_mask_pmaddubs_w_128(<16 x i8> %x0, <16 x i8> %x1, <8 x i16> %x2, i8 %x3) {
3437; X86-LABEL: test_int_x86_avx512_mask_pmaddubs_w_128:
3438; X86:       # %bb.0:
3439; X86-NEXT:    vpmaddubsw %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x04,0xd9]
3440; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
3441; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
3442; X86-NEXT:    vpmaddubsw %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x04,0xd1]
3443; X86-NEXT:    vpaddw %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc3]
3444; X86-NEXT:    retl # encoding: [0xc3]
3445;
3446; X64-LABEL: test_int_x86_avx512_mask_pmaddubs_w_128:
3447; X64:       # %bb.0:
3448; X64-NEXT:    vpmaddubsw %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x04,0xd9]
3449; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
3450; X64-NEXT:    vpmaddubsw %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x04,0xd1]
3451; X64-NEXT:    vpaddw %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc3]
3452; X64-NEXT:    retq # encoding: [0xc3]
3453  %1 = call <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8> %x0, <16 x i8> %x1)
3454  %2 = bitcast i8 %x3 to <8 x i1>
3455  %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> %x2
3456  %4 = call <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8> %x0, <16 x i8> %x1)
3457  %res2 = add <8 x i16> %3, %4
3458  ret <8 x i16> %res2
3459}
3460
3461declare <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8>, <32 x i8>)
3462
3463define <16 x i16> @test_int_x86_avx512_mask_pmaddubs_w_256(<32 x i8> %x0, <32 x i8> %x1, <16 x i16> %x2, i16 %x3) {
3464; X86-LABEL: test_int_x86_avx512_mask_pmaddubs_w_256:
3465; X86:       # %bb.0:
3466; X86-NEXT:    vpmaddubsw %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x04,0xd9]
3467; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
3468; X86-NEXT:    vpmaddubsw %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x04,0xd1]
3469; X86-NEXT:    vpaddw %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc3]
3470; X86-NEXT:    retl # encoding: [0xc3]
3471;
3472; X64-LABEL: test_int_x86_avx512_mask_pmaddubs_w_256:
3473; X64:       # %bb.0:
3474; X64-NEXT:    vpmaddubsw %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x04,0xd9]
3475; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
3476; X64-NEXT:    vpmaddubsw %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x04,0xd1]
3477; X64-NEXT:    vpaddw %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc3]
3478; X64-NEXT:    retq # encoding: [0xc3]
3479  %1 = call <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8> %x0, <32 x i8> %x1)
3480  %2 = bitcast i16 %x3 to <16 x i1>
3481  %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> %x2
3482  %4 = call <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8> %x0, <32 x i8> %x1)
3483  %res2 = add <16 x i16> %3, %4
3484  ret <16 x i16> %res2
3485}
3486
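; vdbpsadbw (double-block packed sum of absolute differences) tests with an
; immediate control value of 2, checking the unmasked, merge-masked and
; zero-masked encodings and adding the three results together.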
3487declare <8 x i16> @llvm.x86.avx512.dbpsadbw.128(<16 x i8>, <16 x i8>, i32)
3488
3489define <8 x i16> @test_int_x86_avx512_mask_dbpsadbw_128(<16 x i8> %x0, <16 x i8> %x1, <8 x i16> %x3, i8 %x4) {
3490; X86-LABEL: test_int_x86_avx512_mask_dbpsadbw_128:
3491; X86:       # %bb.0:
3492; X86-NEXT:    vdbpsadbw $2, %xmm1, %xmm0, %xmm3 # encoding: [0x62,0xf3,0x7d,0x08,0x42,0xd9,0x02]
3493; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
3494; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
3495; X86-NEXT:    vdbpsadbw $2, %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x42,0xd1,0x02]
3496; X86-NEXT:    vdbpsadbw $2, %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0x89,0x42,0xc1,0x02]
3497; X86-NEXT:    vpaddw %xmm3, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc3]
3498; X86-NEXT:    vpaddw %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc0]
3499; X86-NEXT:    retl # encoding: [0xc3]
3500;
3501; X64-LABEL: test_int_x86_avx512_mask_dbpsadbw_128:
3502; X64:       # %bb.0:
3503; X64-NEXT:    vdbpsadbw $2, %xmm1, %xmm0, %xmm3 # encoding: [0x62,0xf3,0x7d,0x08,0x42,0xd9,0x02]
3504; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
3505; X64-NEXT:    vdbpsadbw $2, %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x42,0xd1,0x02]
3506; X64-NEXT:    vdbpsadbw $2, %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0x89,0x42,0xc1,0x02]
3507; X64-NEXT:    vpaddw %xmm3, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc3]
3508; X64-NEXT:    vpaddw %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc0]
3509; X64-NEXT:    retq # encoding: [0xc3]
3510  %1 = call <8 x i16> @llvm.x86.avx512.dbpsadbw.128(<16 x i8> %x0, <16 x i8> %x1, i32 2)
3511  %2 = bitcast i8 %x4 to <8 x i1>
3512  %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> %x3
3513  %4 = call <8 x i16> @llvm.x86.avx512.dbpsadbw.128(<16 x i8> %x0, <16 x i8> %x1, i32 2)
3514  %5 = bitcast i8 %x4 to <8 x i1>
3515  %6 = select <8 x i1> %5, <8 x i16> %4, <8 x i16> zeroinitializer
3516  %7 = call <8 x i16> @llvm.x86.avx512.dbpsadbw.128(<16 x i8> %x0, <16 x i8> %x1, i32 2)
3517  %res3 = add <8 x i16> %3, %6
3518  %res4 = add <8 x i16> %7, %res3
3519  ret <8 x i16> %res4
3520}
3521
3522declare <16 x i16> @llvm.x86.avx512.dbpsadbw.256(<32 x i8>, <32 x i8>, i32)
3523
3524define <16 x i16> @test_int_x86_avx512_mask_dbpsadbw_256(<32 x i8> %x0, <32 x i8> %x1, <16 x i16> %x3, i16 %x4) {
3525; X86-LABEL: test_int_x86_avx512_mask_dbpsadbw_256:
3526; X86:       # %bb.0:
3527; X86-NEXT:    vdbpsadbw $2, %ymm1, %ymm0, %ymm3 # encoding: [0x62,0xf3,0x7d,0x28,0x42,0xd9,0x02]
3528; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
3529; X86-NEXT:    vdbpsadbw $2, %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x42,0xd1,0x02]
3530; X86-NEXT:    vdbpsadbw $2, %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xa9,0x42,0xc1,0x02]
3531; X86-NEXT:    vpaddw %ymm3, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfd,0xc3]
3532; X86-NEXT:    vpaddw %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc0]
3533; X86-NEXT:    retl # encoding: [0xc3]
3534;
3535; X64-LABEL: test_int_x86_avx512_mask_dbpsadbw_256:
3536; X64:       # %bb.0:
3537; X64-NEXT:    vdbpsadbw $2, %ymm1, %ymm0, %ymm3 # encoding: [0x62,0xf3,0x7d,0x28,0x42,0xd9,0x02]
3538; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
3539; X64-NEXT:    vdbpsadbw $2, %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x42,0xd1,0x02]
3540; X64-NEXT:    vdbpsadbw $2, %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xa9,0x42,0xc1,0x02]
3541; X64-NEXT:    vpaddw %ymm3, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfd,0xc3]
3542; X64-NEXT:    vpaddw %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc0]
3543; X64-NEXT:    retq # encoding: [0xc3]
3544  %1 = call <16 x i16> @llvm.x86.avx512.dbpsadbw.256(<32 x i8> %x0, <32 x i8> %x1, i32 2)
3545  %2 = bitcast i16 %x4 to <16 x i1>
3546  %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> %x3
3547  %4 = call <16 x i16> @llvm.x86.avx512.dbpsadbw.256(<32 x i8> %x0, <32 x i8> %x1, i32 2)
3548  %5 = bitcast i16 %x4 to <16 x i1>
3549  %6 = select <16 x i1> %5, <16 x i16> %4, <16 x i16> zeroinitializer
3550  %7 = call <16 x i16> @llvm.x86.avx512.dbpsadbw.256(<32 x i8> %x0, <32 x i8> %x1, i32 2)
3551  %res3 = add <16 x i16> %3, %6
3552  %res4 = add <16 x i16> %res3, %7
3553  ret <16 x i16> %res4
3554}
3555
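; Variable per-element 16-bit shift tests: vpsrlvw, vpsravw and vpsllvw.
; Unlike the tests above, these still go through @llvm.x86.avx512.mask.*
; intrinsics that take the pass-through vector and the mask as explicit
; operands; each is exercised with a merge mask, a zeroing mask and an
; all-ones mask.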
3556declare <16 x i16> @llvm.x86.avx512.mask.psrlv16.hi(<16 x i16>, <16 x i16>, <16 x i16>, i16)
3557
3558define <16 x i16> @test_int_x86_avx512_mask_psrlv16_hi(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) {
3559; X86-LABEL: test_int_x86_avx512_mask_psrlv16_hi:
3560; X86:       # %bb.0:
3561; X86-NEXT:    vpsrlvw %ymm1, %ymm0, %ymm3 # encoding: [0x62,0xf2,0xfd,0x28,0x10,0xd9]
3562; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
3563; X86-NEXT:    vpsrlvw %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0x10,0xd1]
3564; X86-NEXT:    vpsrlvw %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xa9,0x10,0xc1]
3565; X86-NEXT:    vpaddw %ymm3, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfd,0xc3]
3566; X86-NEXT:    vpaddw %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc0]
3567; X86-NEXT:    retl # encoding: [0xc3]
3568;
3569; X64-LABEL: test_int_x86_avx512_mask_psrlv16_hi:
3570; X64:       # %bb.0:
3571; X64-NEXT:    vpsrlvw %ymm1, %ymm0, %ymm3 # encoding: [0x62,0xf2,0xfd,0x28,0x10,0xd9]
3572; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
3573; X64-NEXT:    vpsrlvw %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0x10,0xd1]
3574; X64-NEXT:    vpsrlvw %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xa9,0x10,0xc1]
3575; X64-NEXT:    vpaddw %ymm3, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfd,0xc3]
3576; X64-NEXT:    vpaddw %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc0]
3577; X64-NEXT:    retq # encoding: [0xc3]
3578  %res = call <16 x i16> @llvm.x86.avx512.mask.psrlv16.hi(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3)
3579  %res1 = call <16 x i16> @llvm.x86.avx512.mask.psrlv16.hi(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> zeroinitializer, i16 %x3)
3580  %res2 = call <16 x i16> @llvm.x86.avx512.mask.psrlv16.hi(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 -1)
3581  %res3 = add <16 x i16> %res, %res1
3582  %res4 = add <16 x i16> %res3, %res2
3583  ret <16 x i16> %res4
3584}
3585
3586declare <8 x i16> @llvm.x86.avx512.mask.psrlv8.hi(<8 x i16>, <8 x i16>, <8 x i16>, i8)
3587
3588define <8 x i16> @test_int_x86_avx512_mask_psrlv8_hi(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
3589; X86-LABEL: test_int_x86_avx512_mask_psrlv8_hi:
3590; X86:       # %bb.0:
3591; X86-NEXT:    vpsrlvw %xmm1, %xmm0, %xmm3 # encoding: [0x62,0xf2,0xfd,0x08,0x10,0xd9]
3592; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
3593; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
3594; X86-NEXT:    vpsrlvw %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0x10,0xd1]
3595; X86-NEXT:    vpsrlvw %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0x89,0x10,0xc1]
3596; X86-NEXT:    vpaddw %xmm3, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc3]
3597; X86-NEXT:    vpaddw %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc0]
3598; X86-NEXT:    retl # encoding: [0xc3]
3599;
3600; X64-LABEL: test_int_x86_avx512_mask_psrlv8_hi:
3601; X64:       # %bb.0:
3602; X64-NEXT:    vpsrlvw %xmm1, %xmm0, %xmm3 # encoding: [0x62,0xf2,0xfd,0x08,0x10,0xd9]
3603; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
3604; X64-NEXT:    vpsrlvw %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0x10,0xd1]
3605; X64-NEXT:    vpsrlvw %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0x89,0x10,0xc1]
3606; X64-NEXT:    vpaddw %xmm3, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc3]
3607; X64-NEXT:    vpaddw %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc0]
3608; X64-NEXT:    retq # encoding: [0xc3]
3609  %res = call <8 x i16> @llvm.x86.avx512.mask.psrlv8.hi(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3)
3610  %res1 = call <8 x i16> @llvm.x86.avx512.mask.psrlv8.hi(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> zeroinitializer, i8 %x3)
3611  %res2 = call <8 x i16> @llvm.x86.avx512.mask.psrlv8.hi(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1)
3612  %res3 = add <8 x i16> %res, %res1
3613  %res4 = add <8 x i16> %res3, %res2
3614  ret <8 x i16> %res4
3615}
3616
3617declare <16 x i16> @llvm.x86.avx512.mask.psrav16.hi(<16 x i16>, <16 x i16>, <16 x i16>, i16)
3618
3619define <16 x i16> @test_int_x86_avx512_mask_psrav16_hi(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) {
3620; X86-LABEL: test_int_x86_avx512_mask_psrav16_hi:
3621; X86:       # %bb.0:
3622; X86-NEXT:    vpsravw %ymm1, %ymm0, %ymm3 # encoding: [0x62,0xf2,0xfd,0x28,0x11,0xd9]
3623; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
3624; X86-NEXT:    vpsravw %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0x11,0xd1]
3625; X86-NEXT:    vpsravw %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xa9,0x11,0xc1]
3626; X86-NEXT:    vpaddw %ymm3, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfd,0xc3]
3627; X86-NEXT:    vpaddw %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc0]
3628; X86-NEXT:    retl # encoding: [0xc3]
3629;
3630; X64-LABEL: test_int_x86_avx512_mask_psrav16_hi:
3631; X64:       # %bb.0:
3632; X64-NEXT:    vpsravw %ymm1, %ymm0, %ymm3 # encoding: [0x62,0xf2,0xfd,0x28,0x11,0xd9]
3633; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
3634; X64-NEXT:    vpsravw %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0x11,0xd1]
3635; X64-NEXT:    vpsravw %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xa9,0x11,0xc1]
3636; X64-NEXT:    vpaddw %ymm3, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfd,0xc3]
3637; X64-NEXT:    vpaddw %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc0]
3638; X64-NEXT:    retq # encoding: [0xc3]
3639  %res = call <16 x i16> @llvm.x86.avx512.mask.psrav16.hi(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3)
3640  %res1 = call <16 x i16> @llvm.x86.avx512.mask.psrav16.hi(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> zeroinitializer, i16 %x3)
3641  %res2 = call <16 x i16> @llvm.x86.avx512.mask.psrav16.hi(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 -1)
3642  %res3 = add <16 x i16> %res, %res1
3643  %res4 = add <16 x i16> %res3, %res2
3644  ret <16 x i16> %res4
3645}
3646
3647declare <8 x i16> @llvm.x86.avx512.mask.psrav8.hi(<8 x i16>, <8 x i16>, <8 x i16>, i8)
3648
3649define <8 x i16> @test_int_x86_avx512_mask_psrav8_hi(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
3650; X86-LABEL: test_int_x86_avx512_mask_psrav8_hi:
3651; X86:       # %bb.0:
3652; X86-NEXT:    vpsravw %xmm1, %xmm0, %xmm3 # encoding: [0x62,0xf2,0xfd,0x08,0x11,0xd9]
3653; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
3654; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
3655; X86-NEXT:    vpsravw %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0x11,0xd1]
3656; X86-NEXT:    vpsravw %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0x89,0x11,0xc1]
3657; X86-NEXT:    vpaddw %xmm3, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc3]
3658; X86-NEXT:    vpaddw %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc0]
3659; X86-NEXT:    retl # encoding: [0xc3]
3660;
3661; X64-LABEL: test_int_x86_avx512_mask_psrav8_hi:
3662; X64:       # %bb.0:
3663; X64-NEXT:    vpsravw %xmm1, %xmm0, %xmm3 # encoding: [0x62,0xf2,0xfd,0x08,0x11,0xd9]
3664; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
3665; X64-NEXT:    vpsravw %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0x11,0xd1]
3666; X64-NEXT:    vpsravw %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0x89,0x11,0xc1]
3667; X64-NEXT:    vpaddw %xmm3, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc3]
3668; X64-NEXT:    vpaddw %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc0]
3669; X64-NEXT:    retq # encoding: [0xc3]
3670  %res = call <8 x i16> @llvm.x86.avx512.mask.psrav8.hi(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3)
3671  %res1 = call <8 x i16> @llvm.x86.avx512.mask.psrav8.hi(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> zeroinitializer, i8 %x3)
3672  %res2 = call <8 x i16> @llvm.x86.avx512.mask.psrav8.hi(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1)
3673  %res3 = add <8 x i16> %res, %res1
3674  %res4 = add <8 x i16> %res3, %res2
3675  ret <8 x i16> %res4
3676}
3677
3678declare <16 x i16> @llvm.x86.avx512.mask.psllv16.hi(<16 x i16>, <16 x i16>, <16 x i16>, i16)
3679
3680define <16 x i16> @test_int_x86_avx512_mask_psllv16_hi(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) {
3681; X86-LABEL: test_int_x86_avx512_mask_psllv16_hi:
3682; X86:       # %bb.0:
3683; X86-NEXT:    vpsllvw %ymm1, %ymm0, %ymm3 # encoding: [0x62,0xf2,0xfd,0x28,0x12,0xd9]
3684; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
3685; X86-NEXT:    vpsllvw %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0x12,0xd1]
3686; X86-NEXT:    vpsllvw %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xa9,0x12,0xc1]
3687; X86-NEXT:    vpaddw %ymm3, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfd,0xc3]
3688; X86-NEXT:    vpaddw %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc0]
3689; X86-NEXT:    retl # encoding: [0xc3]
3690;
3691; X64-LABEL: test_int_x86_avx512_mask_psllv16_hi:
3692; X64:       # %bb.0:
3693; X64-NEXT:    vpsllvw %ymm1, %ymm0, %ymm3 # encoding: [0x62,0xf2,0xfd,0x28,0x12,0xd9]
3694; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
3695; X64-NEXT:    vpsllvw %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0x12,0xd1]
3696; X64-NEXT:    vpsllvw %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xa9,0x12,0xc1]
3697; X64-NEXT:    vpaddw %ymm3, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfd,0xc3]
3698; X64-NEXT:    vpaddw %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc0]
3699; X64-NEXT:    retq # encoding: [0xc3]
3700  %res = call <16 x i16> @llvm.x86.avx512.mask.psllv16.hi(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3)
3701  %res1 = call <16 x i16> @llvm.x86.avx512.mask.psllv16.hi(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> zeroinitializer, i16 %x3)
3702  %res2 = call <16 x i16> @llvm.x86.avx512.mask.psllv16.hi(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 -1)
3703  %res3 = add <16 x i16> %res, %res1
3704  %res4 = add <16 x i16> %res3, %res2
3705  ret <16 x i16> %res4
3706}
3707
3708declare <8 x i16> @llvm.x86.avx512.mask.psllv8.hi(<8 x i16>, <8 x i16>, <8 x i16>, i8)
3709
3710define <8 x i16> @test_int_x86_avx512_mask_psllv8_hi(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
3711; X86-LABEL: test_int_x86_avx512_mask_psllv8_hi:
3712; X86:       # %bb.0:
3713; X86-NEXT:    vpsllvw %xmm1, %xmm0, %xmm3 # encoding: [0x62,0xf2,0xfd,0x08,0x12,0xd9]
3714; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
3715; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
3716; X86-NEXT:    vpsllvw %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0x12,0xd1]
3717; X86-NEXT:    vpsllvw %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0x89,0x12,0xc1]
3718; X86-NEXT:    vpaddw %xmm3, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc3]
3719; X86-NEXT:    vpaddw %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc0]
3720; X86-NEXT:    retl # encoding: [0xc3]
3721;
3722; X64-LABEL: test_int_x86_avx512_mask_psllv8_hi:
3723; X64:       # %bb.0:
3724; X64-NEXT:    vpsllvw %xmm1, %xmm0, %xmm3 # encoding: [0x62,0xf2,0xfd,0x08,0x12,0xd9]
3725; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
3726; X64-NEXT:    vpsllvw %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0x12,0xd1]
3727; X64-NEXT:    vpsllvw %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0x89,0x12,0xc1]
3728; X64-NEXT:    vpaddw %xmm3, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc3]
3729; X64-NEXT:    vpaddw %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc0]
3730; X64-NEXT:    retq # encoding: [0xc3]
3731  %res = call <8 x i16> @llvm.x86.avx512.mask.psllv8.hi(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3)
3732  %res1 = call <8 x i16> @llvm.x86.avx512.mask.psllv8.hi(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> zeroinitializer, i8 %x3)
3733  %res2 = call <8 x i16> @llvm.x86.avx512.mask.psllv8.hi(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1)
3734  %res3 = add <8 x i16> %res, %res1
3735  %res4 = add <8 x i16> %res3, %res2
3736  ret <8 x i16> %res4
3737}
3738
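; vpermw tests: a variable word permute of %x0 using the indices in %x1,
; checked in merge-masked, zero-masked and unmasked forms.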
3739declare <8 x i16> @llvm.x86.avx512.permvar.hi.128(<8 x i16>, <8 x i16>)
3740
3741define <8 x i16> @test_int_x86_avx512_mask_permvar_hi_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
3742; X86-LABEL: test_int_x86_avx512_mask_permvar_hi_128:
3743; X86:       # %bb.0:
3744; X86-NEXT:    vpermw %xmm0, %xmm1, %xmm3 # encoding: [0x62,0xf2,0xf5,0x08,0x8d,0xd8]
3745; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
3746; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
3747; X86-NEXT:    vpermw %xmm0, %xmm1, %xmm2 {%k1} # encoding: [0x62,0xf2,0xf5,0x09,0x8d,0xd0]
3748; X86-NEXT:    vpermw %xmm0, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0x89,0x8d,0xc0]
3749; X86-NEXT:    vpaddw %xmm3, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc3]
3750; X86-NEXT:    vpaddw %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc0]
3751; X86-NEXT:    retl # encoding: [0xc3]
3752;
3753; X64-LABEL: test_int_x86_avx512_mask_permvar_hi_128:
3754; X64:       # %bb.0:
3755; X64-NEXT:    vpermw %xmm0, %xmm1, %xmm3 # encoding: [0x62,0xf2,0xf5,0x08,0x8d,0xd8]
3756; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
3757; X64-NEXT:    vpermw %xmm0, %xmm1, %xmm2 {%k1} # encoding: [0x62,0xf2,0xf5,0x09,0x8d,0xd0]
3758; X64-NEXT:    vpermw %xmm0, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0x89,0x8d,0xc0]
3759; X64-NEXT:    vpaddw %xmm3, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc3]
3760; X64-NEXT:    vpaddw %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc0]
3761; X64-NEXT:    retq # encoding: [0xc3]
3762  %1 = call <8 x i16> @llvm.x86.avx512.permvar.hi.128(<8 x i16> %x0, <8 x i16> %x1)
3763  %2 = bitcast i8 %x3 to <8 x i1>
3764  %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> %x2
3765  %4 = call <8 x i16> @llvm.x86.avx512.permvar.hi.128(<8 x i16> %x0, <8 x i16> %x1)
3766  %5 = bitcast i8 %x3 to <8 x i1>
3767  %6 = select <8 x i1> %5, <8 x i16> %4, <8 x i16> zeroinitializer
3768  %7 = call <8 x i16> @llvm.x86.avx512.permvar.hi.128(<8 x i16> %x0, <8 x i16> %x1)
3769  %res3 = add <8 x i16> %3, %6
3770  %res4 = add <8 x i16> %res3, %7
3771  ret <8 x i16> %res4
3772}
3773
3774declare <16 x i16> @llvm.x86.avx512.permvar.hi.256(<16 x i16>, <16 x i16>)
3775
3776define <16 x i16> @test_int_x86_avx512_mask_permvar_hi_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) {
3777; X86-LABEL: test_int_x86_avx512_mask_permvar_hi_256:
3778; X86:       # %bb.0:
3779; X86-NEXT:    vpermw %ymm0, %ymm1, %ymm3 # encoding: [0x62,0xf2,0xf5,0x28,0x8d,0xd8]
3780; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
3781; X86-NEXT:    vpermw %ymm0, %ymm1, %ymm2 {%k1} # encoding: [0x62,0xf2,0xf5,0x29,0x8d,0xd0]
3782; X86-NEXT:    vpermw %ymm0, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xa9,0x8d,0xc0]
3783; X86-NEXT:    vpaddw %ymm3, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfd,0xc3]
3784; X86-NEXT:    vpaddw %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc0]
3785; X86-NEXT:    retl # encoding: [0xc3]
3786;
3787; X64-LABEL: test_int_x86_avx512_mask_permvar_hi_256:
3788; X64:       # %bb.0:
3789; X64-NEXT:    vpermw %ymm0, %ymm1, %ymm3 # encoding: [0x62,0xf2,0xf5,0x28,0x8d,0xd8]
3790; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
3791; X64-NEXT:    vpermw %ymm0, %ymm1, %ymm2 {%k1} # encoding: [0x62,0xf2,0xf5,0x29,0x8d,0xd0]
3792; X64-NEXT:    vpermw %ymm0, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xa9,0x8d,0xc0]
3793; X64-NEXT:    vpaddw %ymm3, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfd,0xc3]
3794; X64-NEXT:    vpaddw %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc0]
3795; X64-NEXT:    retq # encoding: [0xc3]
3796  %1 = call <16 x i16> @llvm.x86.avx512.permvar.hi.256(<16 x i16> %x0, <16 x i16> %x1)
3797  %2 = bitcast i16 %x3 to <16 x i1>
3798  %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> %x2
3799  %4 = call <16 x i16> @llvm.x86.avx512.permvar.hi.256(<16 x i16> %x0, <16 x i16> %x1)
3800  %5 = bitcast i16 %x3 to <16 x i1>
3801  %6 = select <16 x i1> %5, <16 x i16> %4, <16 x i16> zeroinitializer
3802  %7 = call <16 x i16> @llvm.x86.avx512.permvar.hi.256(<16 x i16> %x0, <16 x i16> %x1)
3803  %res3 = add <16 x i16> %3, %6
3804  %res4 = add <16 x i16> %res3, %7
3805  ret <16 x i16> %res4
3806}
3807
3808