; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=CHECK,SSE,SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx  | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2ORLATER,AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX,AVX2ORLATER,AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=CHECK,AVX,AVX2ORLATER,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+xop | FileCheck %s --check-prefixes=CHECK,AVX,XOP

; fold (sdiv x, 1) -> x
define i32 @combine_sdiv_by_one(i32 %x) {
; CHECK-LABEL: combine_sdiv_by_one:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movl %edi, %eax
; CHECK-NEXT:    retq
  %1 = sdiv i32 %x, 1
  ret i32 %1
}

define <4 x i32> @combine_vec_sdiv_by_one(<4 x i32> %x) {
; CHECK-LABEL: combine_vec_sdiv_by_one:
; CHECK:       # %bb.0:
; CHECK-NEXT:    retq
  %1 = sdiv <4 x i32> %x, <i32 1, i32 1, i32 1, i32 1>
  ret <4 x i32> %1
}

; fold (sdiv x, -1) -> 0 - x
define i32 @combine_sdiv_by_negone(i32 %x) {
; CHECK-LABEL: combine_sdiv_by_negone:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movl %edi, %eax
; CHECK-NEXT:    negl %eax
; CHECK-NEXT:    retq
  %1 = sdiv i32 %x, -1
  ret i32 %1
}

define <4 x i32> @combine_vec_sdiv_by_negone(<4 x i32> %x) {
; SSE-LABEL: combine_vec_sdiv_by_negone:
; SSE:       # %bb.0:
; SSE-NEXT:    pxor %xmm1, %xmm1
; SSE-NEXT:    psubd %xmm0, %xmm1
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_sdiv_by_negone:
; AVX:       # %bb.0:
; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vpsubd %xmm0, %xmm1, %xmm0
; AVX-NEXT:    retq
  %1 = sdiv <4 x i32> %x, <i32 -1, i32 -1, i32 -1, i32 -1>
  ret <4 x i32> %1
}

; fold (sdiv x, INT_MIN) -> select((icmp eq x, INT_MIN), 1, 0)
define i32 @combine_sdiv_by_minsigned(i32 %x) {
; CHECK-LABEL: combine_sdiv_by_minsigned:
; CHECK:       # %bb.0:
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    cmpl $-2147483648, %edi # imm = 0x80000000
; CHECK-NEXT:    sete %al
; CHECK-NEXT:    retq
  %1 = sdiv i32 %x, -2147483648
  ret i32 %1
}

define <4 x i32> @combine_vec_sdiv_by_minsigned(<4 x i32> %x) {
; SSE-LABEL: combine_vec_sdiv_by_minsigned:
; SSE:       # %bb.0:
; SSE-NEXT:    pcmpeqd {{.*}}(%rip), %xmm0
; SSE-NEXT:    psrld $31, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: combine_vec_sdiv_by_minsigned:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpcmpeqd {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vpsrld $31, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_vec_sdiv_by_minsigned:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648]
; AVX2-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpsrld $31, %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: combine_vec_sdiv_by_minsigned:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648]
; AVX512F-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
; AVX512F-NEXT:    vpsrld $31, %xmm0, %xmm0
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: combine_vec_sdiv_by_minsigned:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpcmpeqd {{.*}}(%rip){1to4}, %xmm0, %k1
; AVX512BW-NEXT:    vpbroadcastd {{.*}}(%rip), %xmm0 {%k1} {z}
; AVX512BW-NEXT:    retq
;
; XOP-LABEL: combine_vec_sdiv_by_minsigned:
; XOP:       # %bb.0:
; XOP-NEXT:    vpcomeqd {{.*}}(%rip), %xmm0, %xmm0
; XOP-NEXT:    vpsrld $31, %xmm0, %xmm0
; XOP-NEXT:    retq
  %1 = sdiv <4 x i32> %x, <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>
  ret <4 x i32> %1
}

; fold (sdiv 0, x) -> 0
define i32 @combine_sdiv_zero(i32 %x) {
; CHECK-LABEL: combine_sdiv_zero:
; CHECK:       # %bb.0:
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    retq
  %1 = sdiv i32 0, %x
  ret i32 %1
}

define <4 x i32> @combine_vec_sdiv_zero(<4 x i32> %x) {
; SSE-LABEL: combine_vec_sdiv_zero:
; SSE:       # %bb.0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_sdiv_zero:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = sdiv <4 x i32> zeroinitializer, %x
  ret <4 x i32> %1
}

; fold (sdiv x, x) -> 1
define i32 @combine_sdiv_dupe(i32 %x) {
; CHECK-LABEL: combine_sdiv_dupe:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movl $1, %eax
; CHECK-NEXT:    retq
  %1 = sdiv i32 %x, %x
  ret i32 %1
}

define <4 x i32> @combine_vec_sdiv_dupe(<4 x i32> %x) {
; SSE-LABEL: combine_vec_sdiv_dupe:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps {{.*#+}} xmm0 = [1,1,1,1]
; SSE-NEXT:    retq
;
; AVX1-LABEL: combine_vec_sdiv_dupe:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovaps {{.*#+}} xmm0 = [1,1,1,1]
; AVX1-NEXT:    retq
;
; AVX2ORLATER-LABEL: combine_vec_sdiv_dupe:
; AVX2ORLATER:       # %bb.0:
; AVX2ORLATER-NEXT:    vbroadcastss {{.*#+}} xmm0 = [1,1,1,1]
; AVX2ORLATER-NEXT:    retq
;
; XOP-LABEL: combine_vec_sdiv_dupe:
; XOP:       # %bb.0:
; XOP-NEXT:    vmovaps {{.*#+}} xmm0 = [1,1,1,1]
; XOP-NEXT:    retq
  %1 = sdiv <4 x i32> %x, %x
  ret <4 x i32> %1
}

; fold (sdiv x, y) -> (udiv x, y) iff x and y are positive
define <4 x i32> @combine_vec_sdiv_by_pos0(<4 x i32> %x) {
; SSE-LABEL: combine_vec_sdiv_by_pos0:
; SSE:       # %bb.0:
; SSE-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE-NEXT:    psrld $2, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_sdiv_by_pos0:
; AVX:       # %bb.0:
; AVX-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    vpsrld $2, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = and <4 x i32> %x, <i32 255, i32 255, i32 255, i32 255>
  %2 = sdiv <4 x i32> %1, <i32 4, i32 4, i32 4, i32 4>
  ret <4 x i32> %2
}

define <4 x i32> @combine_vec_sdiv_by_pos1(<4 x i32> %x) {
; SSE2-LABEL: combine_vec_sdiv_by_pos1:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrld $4, %xmm1
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    psrld $3, %xmm2
; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrld $2, %xmm1
; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,3],xmm2[0,3]
; SSE2-NEXT:    retq
;
; SSE41-LABEL: combine_vec_sdiv_by_pos1:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE41-NEXT:    movdqa %xmm0, %xmm2
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    psrld $3, %xmm1
; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT:    psrld $4, %xmm0
; SSE41-NEXT:    psrld $2, %xmm2
; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4,5,6,7]
; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
; SSE41-NEXT:    movdqa %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: combine_vec_sdiv_by_pos1:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vpsrld $4, %xmm0, %xmm1
; AVX1-NEXT:    vpsrld $2, %xmm0, %xmm2
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT:    vpsrld $3, %xmm0, %xmm2
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
; AVX1-NEXT:    retq
;
; AVX2ORLATER-LABEL: combine_vec_sdiv_by_pos1:
; AVX2ORLATER:       # %bb.0:
; AVX2ORLATER-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX2ORLATER-NEXT:    vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
; AVX2ORLATER-NEXT:    retq
;
; XOP-LABEL: combine_vec_sdiv_by_pos1:
; XOP:       # %bb.0:
; XOP-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
; XOP-NEXT:    vpshld {{.*}}(%rip), %xmm0, %xmm0
; XOP-NEXT:    retq
  %1 = and <4 x i32> %x, <i32 255, i32 255, i32 255, i32 255>
  %2 = sdiv <4 x i32> %1, <i32 1, i32 4, i32 8, i32 16>
  ret <4 x i32> %2
}

; fold (sdiv x, (1 << c)) -> x >>u c
define <4 x i32> @combine_vec_sdiv_by_pow2a(<4 x i32> %x) {
; SSE-LABEL: combine_vec_sdiv_by_pow2a:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    psrad $31, %xmm1
; SSE-NEXT:    psrld $30, %xmm1
; SSE-NEXT:    paddd %xmm0, %xmm1
; SSE-NEXT:    psrad $2, %xmm1
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_sdiv_by_pow2a:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsrad $31, %xmm0, %xmm1
; AVX-NEXT:    vpsrld $30, %xmm1, %xmm1
; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpsrad $2, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = sdiv <4 x i32> %x, <i32 4, i32 4, i32 4, i32 4>
  ret <4 x i32> %1
}

define <4 x i32> @combine_vec_sdiv_by_pow2a_neg(<4 x i32> %x) {
; SSE-LABEL: combine_vec_sdiv_by_pow2a_neg:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    psrad $31, %xmm1
; SSE-NEXT:    psrld $30, %xmm1
; SSE-NEXT:    paddd %xmm0, %xmm1
; SSE-NEXT:    psrad $2, %xmm1
; SSE-NEXT:    pxor %xmm0, %xmm0
; SSE-NEXT:    psubd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_sdiv_by_pow2a_neg:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsrad $31, %xmm0, %xmm1
; AVX-NEXT:    vpsrld $30, %xmm1, %xmm1
; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpsrad $2, %xmm0, %xmm0
; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vpsubd %xmm0, %xmm1, %xmm0
; AVX-NEXT:    retq
  %1 = sdiv <4 x i32> %x, <i32 -4, i32 -4, i32 -4, i32 -4>
  ret <4 x i32> %1
}

define <16 x i8> @combine_vec_sdiv_by_pow2b_v16i8(<16 x i8> %x) {
; SSE2-LABEL: combine_vec_sdiv_by_pow2b_v16i8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pxor %xmm1, %xmm1
; SSE2-NEXT:    pxor %xmm2, %xmm2
; SSE2-NEXT:    pcmpgtb %xmm0, %xmm2
; SSE2-NEXT:    movdqa %xmm2, %xmm3
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [256,4,2,16,8,32,64,2]
; SSE2-NEXT:    pmullw %xmm4, %xmm3
; SSE2-NEXT:    psrlw $8, %xmm3
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
; SSE2-NEXT:    pmullw %xmm4, %xmm2
; SSE2-NEXT:    psrlw $8, %xmm2
; SSE2-NEXT:    packuswb %xmm3, %xmm2
; SSE2-NEXT:    paddb %xmm0, %xmm2
; SSE2-NEXT:    movdqa %xmm2, %xmm1
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
; SSE2-NEXT:    psraw $8, %xmm1
; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [256,64,128,16,32,8,4,128]
; SSE2-NEXT:    pmullw %xmm3, %xmm1
; SSE2-NEXT:    psrlw $8, %xmm1
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    psraw $8, %xmm2
; SSE2-NEXT:    pmullw %xmm3, %xmm2
; SSE2-NEXT:    psrlw $8, %xmm2
; SSE2-NEXT:    packuswb %xmm1, %xmm2
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
; SSE2-NEXT:    pand %xmm1, %xmm2
; SSE2-NEXT:    pandn %xmm0, %xmm1
; SSE2-NEXT:    por %xmm2, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: combine_vec_sdiv_by_pow2b_v16i8:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    pxor %xmm0, %xmm0
; SSE41-NEXT:    pxor %xmm3, %xmm3
; SSE41-NEXT:    pcmpgtb %xmm1, %xmm3
; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
; SSE41-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [256,4,2,16,8,32,64,2]
; SSE41-NEXT:    pmullw %xmm0, %xmm3
; SSE41-NEXT:    psrlw $8, %xmm3
; SSE41-NEXT:    pmullw %xmm0, %xmm2
; SSE41-NEXT:    psrlw $8, %xmm2
; SSE41-NEXT:    packuswb %xmm3, %xmm2
; SSE41-NEXT:    paddb %xmm1, %xmm2
; SSE41-NEXT:    movdqa %xmm2, %xmm0
; SSE41-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
; SSE41-NEXT:    psraw $8, %xmm0
; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [256,64,128,16,32,8,4,128]
; SSE41-NEXT:    pmullw %xmm3, %xmm0
; SSE41-NEXT:    psrlw $8, %xmm0
; SSE41-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE41-NEXT:    psraw $8, %xmm2
; SSE41-NEXT:    pmullw %xmm3, %xmm2
; SSE41-NEXT:    psrlw $8, %xmm2
; SSE41-NEXT:    packuswb %xmm0, %xmm2
; SSE41-NEXT:    movaps {{.*#+}} xmm0 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
; SSE41-NEXT:    pblendvb %xmm0, %xmm2, %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v16i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpcmpgtb %xmm0, %xmm1, %xmm2
; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [256,4,2,16,8,32,64,2]
; AVX1-NEXT:    vpmullw %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vpsrlw $8, %xmm1, %xmm1
; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
; AVX1-NEXT:    vpmullw %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT:    vpackuswb %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vpaddb %xmm1, %xmm0, %xmm1
; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT:    vpsraw $8, %xmm2, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [256,64,128,16,32,8,4,128]
; AVX1-NEXT:    vpmullw %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX1-NEXT:    vpsraw $8, %xmm1, %xmm1
; AVX1-NEXT:    vpmullw %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vpsrlw $8, %xmm1, %xmm1
; AVX1-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
; AVX1-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_vec_sdiv_by_pow2b_v16i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vpcmpgtb %xmm0, %xmm1, %xmm1
; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX2-NEXT:    vpmullw {{.*}}(%rip), %ymm1, %ymm1
; AVX2-NEXT:    vpsrlw $8, %ymm1, %ymm1
; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX2-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpaddb %xmm1, %xmm0, %xmm1
; AVX2-NEXT:    vpmovsxbw %xmm1, %ymm1
; AVX2-NEXT:    vpmullw {{.*}}(%rip), %ymm1, %ymm1
; AVX2-NEXT:    vpsrlw $8, %ymm1, %ymm1
; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX2-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
; AVX2-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: combine_vec_sdiv_by_pow2b_v16i8:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512F-NEXT:    vpcmpgtb %xmm0, %xmm1, %xmm1
; AVX512F-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
; AVX512F-NEXT:    vpsrlvd {{.*}}(%rip), %zmm1, %zmm1
; AVX512F-NEXT:    vpmovdb %zmm1, %xmm1
; AVX512F-NEXT:    vpaddb %xmm1, %xmm0, %xmm1
; AVX512F-NEXT:    vpmovsxbd %xmm1, %zmm1
; AVX512F-NEXT:    vpsravd {{.*}}(%rip), %zmm1, %zmm1
; AVX512F-NEXT:    vpmovdb %zmm1, %xmm1
; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
; AVX512F-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: combine_vec_sdiv_by_pow2b_v16i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512BW-NEXT:    vpcmpgtb %xmm0, %xmm1, %xmm1
; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX512BW-NEXT:    vpsrlvw {{.*}}(%rip), %ymm1, %ymm1
; AVX512BW-NEXT:    vpmovwb %ymm1, %xmm1
; AVX512BW-NEXT:    vpaddb %xmm1, %xmm0, %xmm1
; AVX512BW-NEXT:    vpmovsxbw %xmm1, %ymm1
; AVX512BW-NEXT:    vpsravw {{.*}}(%rip), %ymm1, %ymm1
; AVX512BW-NEXT:    vpmovwb %ymm1, %xmm1
; AVX512BW-NEXT:    movw $257, %ax # imm = 0x101
; AVX512BW-NEXT:    kmovd %eax, %k1
; AVX512BW-NEXT:    vmovdqu8 %xmm0, %xmm1 {%k1}
; AVX512BW-NEXT:    vmovdqa %xmm1, %xmm0
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; XOP-LABEL: combine_vec_sdiv_by_pow2b_v16i8:
; XOP:       # %bb.0:
; XOP-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; XOP-NEXT:    vpcmpgtb %xmm0, %xmm1, %xmm1
; XOP-NEXT:    vpshlb {{.*}}(%rip), %xmm1, %xmm1
; XOP-NEXT:    vpaddb %xmm1, %xmm0, %xmm1
; XOP-NEXT:    vpshab {{.*}}(%rip), %xmm1, %xmm1
; XOP-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
; XOP-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; XOP-NEXT:    retq
  %1 = sdiv <16 x i8> %x, <i8 1, i8 4, i8 2, i8 16, i8 8, i8 32, i8 64, i8 2, i8 1, i8 4, i8 2, i8 16, i8 8, i8 32, i8 64, i8 2>
  ret <16 x i8> %1
}

define <8 x i16> @combine_vec_sdiv_by_pow2b_v8i16(<8 x i16> %x) {
; SSE2-LABEL: combine_vec_sdiv_by_pow2b_v8i16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psraw $15, %xmm1
; SSE2-NEXT:    pmulhuw {{.*}}(%rip), %xmm1
; SSE2-NEXT:    paddw %xmm0, %xmm1
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,65535,0,0,65535]
; SSE2-NEXT:    movdqa %xmm1, %xmm3
; SSE2-NEXT:    pand %xmm2, %xmm3
; SSE2-NEXT:    psraw $4, %xmm1
; SSE2-NEXT:    pandn %xmm1, %xmm2
; SSE2-NEXT:    por %xmm3, %xmm2
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [65535,0,65535,65535,0,65535,0,65535]
; SSE2-NEXT:    movdqa %xmm2, %xmm3
; SSE2-NEXT:    pand %xmm1, %xmm3
; SSE2-NEXT:    psraw $2, %xmm2
; SSE2-NEXT:    pandn %xmm2, %xmm1
; SSE2-NEXT:    por %xmm3, %xmm1
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [65535,65535,0,65535,0,0,65535,0]
; SSE2-NEXT:    movdqa %xmm1, %xmm3
; SSE2-NEXT:    pand %xmm2, %xmm3
; SSE2-NEXT:    psraw $1, %xmm1
; SSE2-NEXT:    pandn %xmm1, %xmm2
; SSE2-NEXT:    por %xmm3, %xmm2
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,65535]
; SSE2-NEXT:    pand %xmm1, %xmm2
; SSE2-NEXT:    pandn %xmm0, %xmm1
; SSE2-NEXT:    por %xmm2, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: combine_vec_sdiv_by_pow2b_v8i16:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    psraw $15, %xmm1
; SSE41-NEXT:    pmulhuw {{.*}}(%rip), %xmm1
; SSE41-NEXT:    paddw %xmm0, %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm2
; SSE41-NEXT:    psraw $1, %xmm2
; SSE41-NEXT:    pmulhw {{.*}}(%rip), %xmm1
; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3,4,5,6],xmm2[7]
; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
; SSE41-NEXT:    movdqa %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v8i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpsraw $15, %xmm0, %xmm1
; AVX1-NEXT:    vpmulhuw {{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vpaddw %xmm1, %xmm0, %xmm1
; AVX1-NEXT:    vpsraw $1, %xmm1, %xmm2
; AVX1-NEXT:    vpmulhw {{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3,4,5,6],xmm2[7]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_vec_sdiv_by_pow2b_v8i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsraw $15, %xmm0, %xmm1
; AVX2-NEXT:    vpmulhuw {{.*}}(%rip), %xmm1, %xmm1
; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm1
; AVX2-NEXT:    vpsraw $1, %xmm1, %xmm2
; AVX2-NEXT:    vpmulhw {{.*}}(%rip), %xmm1, %xmm1
; AVX2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3,4,5,6],xmm2[7]
; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: combine_vec_sdiv_by_pow2b_v8i16:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpsraw $15, %xmm0, %xmm1
; AVX512F-NEXT:    vpmulhuw {{.*}}(%rip), %xmm1, %xmm1
; AVX512F-NEXT:    vpaddw %xmm1, %xmm0, %xmm1
; AVX512F-NEXT:    vpmovsxwd %xmm1, %ymm1
; AVX512F-NEXT:    vpsravd {{.*}}(%rip), %ymm1, %ymm1
; AVX512F-NEXT:    vpmovdw %zmm1, %ymm1
; AVX512F-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: combine_vec_sdiv_by_pow2b_v8i16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpsraw $15, %xmm0, %xmm1
; AVX512BW-NEXT:    vpsrlvw {{.*}}(%rip), %xmm1, %xmm1
; AVX512BW-NEXT:    vpaddw %xmm1, %xmm0, %xmm1
; AVX512BW-NEXT:    vpsravw {{.*}}(%rip), %xmm1, %xmm1
; AVX512BW-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
; AVX512BW-NEXT:    retq
;
; XOP-LABEL: combine_vec_sdiv_by_pow2b_v8i16:
; XOP:       # %bb.0:
; XOP-NEXT:    vpsraw $15, %xmm0, %xmm1
; XOP-NEXT:    vpshlw {{.*}}(%rip), %xmm1, %xmm1
; XOP-NEXT:    vpaddw %xmm1, %xmm0, %xmm1
; XOP-NEXT:    vpshaw {{.*}}(%rip), %xmm1, %xmm1
; XOP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
; XOP-NEXT:    retq
  %1 = sdiv <8 x i16> %x, <i16 1, i16 4, i16 2, i16 16, i16 8, i16 32, i16 64, i16 2>
  ret <8 x i16> %1
}

define <16 x i16> @combine_vec_sdiv_by_pow2b_v16i16(<16 x i16> %x) {
; SSE2-LABEL: combine_vec_sdiv_by_pow2b_v16i16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm3
; SSE2-NEXT:    psraw $15, %xmm0
; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = <u,4,2,16,8,32,64,2>
; SSE2-NEXT:    pmulhuw %xmm8, %xmm0
; SSE2-NEXT:    paddw %xmm3, %xmm0
; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [65535,65535,65535,0,65535,0,0,65535]
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    pand %xmm4, %xmm2
; SSE2-NEXT:    psraw $4, %xmm0
; SSE2-NEXT:    movdqa %xmm4, %xmm6
; SSE2-NEXT:    pandn %xmm0, %xmm6
; SSE2-NEXT:    por %xmm2, %xmm6
; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [65535,0,65535,65535,0,65535,0,65535]
; SSE2-NEXT:    movdqa %xmm6, %xmm0
; SSE2-NEXT:    pand %xmm5, %xmm0
; SSE2-NEXT:    psraw $2, %xmm6
; SSE2-NEXT:    movdqa %xmm5, %xmm2
; SSE2-NEXT:    pandn %xmm6, %xmm2
; SSE2-NEXT:    por %xmm0, %xmm2
; SSE2-NEXT:    movdqa {{.*#+}} xmm7 = [65535,65535,0,65535,0,0,65535,0]
; SSE2-NEXT:    movdqa %xmm2, %xmm0
; SSE2-NEXT:    pand %xmm7, %xmm0
; SSE2-NEXT:    psraw $1, %xmm2
; SSE2-NEXT:    movdqa %xmm7, %xmm6
; SSE2-NEXT:    pandn %xmm2, %xmm6
; SSE2-NEXT:    por %xmm0, %xmm6
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,65535,65535,65535,65535]
; SSE2-NEXT:    pand %xmm2, %xmm6
; SSE2-NEXT:    movdqa %xmm2, %xmm0
; SSE2-NEXT:    pandn %xmm3, %xmm0
; SSE2-NEXT:    por %xmm6, %xmm0
; SSE2-NEXT:    movdqa %xmm1, %xmm3
; SSE2-NEXT:    psraw $15, %xmm3
; SSE2-NEXT:    pmulhuw %xmm8, %xmm3
; SSE2-NEXT:    paddw %xmm1, %xmm3
; SSE2-NEXT:    movdqa %xmm3, %xmm6
; SSE2-NEXT:    pand %xmm4, %xmm6
; SSE2-NEXT:    psraw $4, %xmm3
; SSE2-NEXT:    pandn %xmm3, %xmm4
; SSE2-NEXT:    por %xmm6, %xmm4
; SSE2-NEXT:    movdqa %xmm4, %xmm3
; SSE2-NEXT:    pand %xmm5, %xmm3
; SSE2-NEXT:    psraw $2, %xmm4
; SSE2-NEXT:    pandn %xmm4, %xmm5
; SSE2-NEXT:    por %xmm3, %xmm5
; SSE2-NEXT:    movdqa %xmm5, %xmm3
; SSE2-NEXT:    pand %xmm7, %xmm3
; SSE2-NEXT:    psraw $1, %xmm5
; SSE2-NEXT:    pandn %xmm5, %xmm7
; SSE2-NEXT:    por %xmm3, %xmm7
; SSE2-NEXT:    pand %xmm2, %xmm7
; SSE2-NEXT:    pandn %xmm1, %xmm2
; SSE2-NEXT:    por %xmm7, %xmm2
; SSE2-NEXT:    movdqa %xmm2, %xmm1
; SSE2-NEXT:    retq
;
; SSE41-LABEL: combine_vec_sdiv_by_pow2b_v16i16:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa %xmm0, %xmm2
; SSE41-NEXT:    psraw $15, %xmm2
; SSE41-NEXT:    movdqa {{.*#+}} xmm4 = <u,4,2,16,8,32,64,2>
; SSE41-NEXT:    pmulhuw %xmm4, %xmm2
; SSE41-NEXT:    paddw %xmm0, %xmm2
; SSE41-NEXT:    movdqa {{.*#+}} xmm5 = <u,16384,32768,4096,8192,2048,1024,32768>
; SSE41-NEXT:    movdqa %xmm2, %xmm3
; SSE41-NEXT:    pmulhw %xmm5, %xmm3
; SSE41-NEXT:    psraw $1, %xmm2
; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2],xmm3[3,4,5,6],xmm2[7]
; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3,4,5,6,7]
; SSE41-NEXT:    movdqa %xmm1, %xmm3
; SSE41-NEXT:    psraw $15, %xmm3
; SSE41-NEXT:    pmulhuw %xmm4, %xmm3
; SSE41-NEXT:    paddw %xmm1, %xmm3
; SSE41-NEXT:    pmulhw %xmm3, %xmm5
; SSE41-NEXT:    psraw $1, %xmm3
; SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm5[0,1],xmm3[2],xmm5[3,4,5,6],xmm3[7]
; SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm1[0],xmm3[1,2,3,4,5,6,7]
; SSE41-NEXT:    movdqa %xmm2, %xmm0
; SSE41-NEXT:    movdqa %xmm3, %xmm1
; SSE41-NEXT:    retq
;
; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v16i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpsraw $15, %xmm1, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,4,2,16,8,32,64,2>
; AVX1-NEXT:    vpmulhuw %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpaddw %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <u,16384,32768,4096,8192,2048,1024,32768>
; AVX1-NEXT:    vpmulhw %xmm2, %xmm1, %xmm4
; AVX1-NEXT:    vpsraw $1, %xmm1, %xmm1
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm4[0,1],xmm1[2],xmm4[3,4,5,6],xmm1[7]
; AVX1-NEXT:    vpsraw $15, %xmm0, %xmm4
; AVX1-NEXT:    vpmulhuw %xmm3, %xmm4, %xmm3
; AVX1-NEXT:    vpaddw %xmm3, %xmm0, %xmm3
; AVX1-NEXT:    vpmulhw %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpsraw $1, %xmm3, %xmm3
; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3,4,5,6],xmm3[7]
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
; AVX1-NEXT:    vbroadcastf128 {{.*#+}} ymm2 = [0,65535,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,65535]
; AVX1-NEXT:    # ymm2 = mem[0,1,0,1]
; AVX1-NEXT:    vandps %ymm2, %ymm1, %ymm1
; AVX1-NEXT:    vandnps %ymm0, %ymm2, %ymm0
; AVX1-NEXT:    vorps %ymm0, %ymm1, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_vec_sdiv_by_pow2b_v16i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsraw $15, %ymm0, %ymm1
; AVX2-NEXT:    vpmulhuw {{.*}}(%rip), %ymm1, %ymm1
; AVX2-NEXT:    vpaddw %ymm1, %ymm0, %ymm1
; AVX2-NEXT:    vpsraw $1, %ymm1, %ymm2
; AVX2-NEXT:    vpmulhw {{.*}}(%rip), %ymm1, %ymm1
; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5,6],ymm2[7],ymm1[8,9],ymm2[10],ymm1[11,12,13,14],ymm2[15]
; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15]
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: combine_vec_sdiv_by_pow2b_v16i16:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpsraw $15, %ymm0, %ymm1
; AVX512F-NEXT:    vpmulhuw {{.*}}(%rip), %ymm1, %ymm1
; AVX512F-NEXT:    vpaddw %ymm1, %ymm0, %ymm1
; AVX512F-NEXT:    vpmovsxwd %ymm1, %zmm1
; AVX512F-NEXT:    vpsravd {{.*}}(%rip), %zmm1, %zmm1
; AVX512F-NEXT:    vpmovdw %zmm1, %ymm1
; AVX512F-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15]
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: combine_vec_sdiv_by_pow2b_v16i16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpsraw $15, %ymm0, %ymm1
; AVX512BW-NEXT:    vpsrlvw {{.*}}(%rip), %ymm1, %ymm1
; AVX512BW-NEXT:    vpaddw %ymm1, %ymm0, %ymm1
; AVX512BW-NEXT:    vpsravw {{.*}}(%rip), %ymm1, %ymm1
; AVX512BW-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15]
; AVX512BW-NEXT:    retq
;
; XOP-LABEL: combine_vec_sdiv_by_pow2b_v16i16:
; XOP:       # %bb.0:
; XOP-NEXT:    vextractf128 $1, %ymm0, %xmm1
; XOP-NEXT:    vpsraw $15, %xmm1, %xmm2
; XOP-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,65522,65521,65524,65523,65525,65526,65521>
; XOP-NEXT:    vpshlw %xmm3, %xmm2, %xmm2
; XOP-NEXT:    vpaddw %xmm2, %xmm1, %xmm1
; XOP-NEXT:    vmovdqa {{.*#+}} xmm2 = <u,65534,65535,65532,65533,65531,65530,65535>
; XOP-NEXT:    vpshaw %xmm2, %xmm1, %xmm1
; XOP-NEXT:    vpsraw $15, %xmm0, %xmm4
; XOP-NEXT:    vpshlw %xmm3, %xmm4, %xmm3
; XOP-NEXT:    vpaddw %xmm3, %xmm0, %xmm3
; XOP-NEXT:    vpshaw %xmm2, %xmm3, %xmm2
; XOP-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
; XOP-NEXT:    vbroadcastf128 {{.*#+}} ymm2 = [0,65535,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,65535]
; XOP-NEXT:    # ymm2 = mem[0,1,0,1]
; XOP-NEXT:    vpcmov %ymm2, %ymm0, %ymm1, %ymm0
; XOP-NEXT:    retq
  %1 = sdiv <16 x i16> %x, <i16 1, i16 4, i16 2, i16 16, i16 8, i16 32, i16 64, i16 2, i16 1, i16 4, i16 2, i16 16, i16 8, i16 32, i16 64, i16 2>
  ret <16 x i16> %1
}

define <32 x i16> @combine_vec_sdiv_by_pow2b_v32i16(<32 x i16> %x) {
; SSE2-LABEL: combine_vec_sdiv_by_pow2b_v32i16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm1, %xmm8
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psraw $15, %xmm0
; SSE2-NEXT:    movdqa {{.*#+}} xmm9 = <u,4,2,16,8,32,64,2>
; SSE2-NEXT:    pmulhuw %xmm9, %xmm0
; SSE2-NEXT:    paddw %xmm1, %xmm0
; SSE2-NEXT:    movdqa {{.*#+}} xmm11 = [65535,65535,65535,0,65535,0,0,65535]
; SSE2-NEXT:    movdqa %xmm0, %xmm4
; SSE2-NEXT:    pand %xmm11, %xmm4
; SSE2-NEXT:    psraw $4, %xmm0
; SSE2-NEXT:    movdqa %xmm11, %xmm5
; SSE2-NEXT:    pandn %xmm0, %xmm5
; SSE2-NEXT:    por %xmm4, %xmm5
; SSE2-NEXT:    movdqa {{.*#+}} xmm7 = [65535,0,65535,65535,0,65535,0,65535]
; SSE2-NEXT:    movdqa %xmm5, %xmm0
; SSE2-NEXT:    pand %xmm7, %xmm0
; SSE2-NEXT:    psraw $2, %xmm5
; SSE2-NEXT:    movdqa %xmm7, %xmm4
; SSE2-NEXT:    pandn %xmm5, %xmm4
; SSE2-NEXT:    por %xmm0, %xmm4
; SSE2-NEXT:    movdqa {{.*#+}} xmm10 = [65535,65535,0,65535,0,0,65535,0]
; SSE2-NEXT:    movdqa %xmm4, %xmm0
; SSE2-NEXT:    pand %xmm10, %xmm0
; SSE2-NEXT:    psraw $1, %xmm4
; SSE2-NEXT:    movdqa %xmm10, %xmm5
; SSE2-NEXT:    pandn %xmm4, %xmm5
; SSE2-NEXT:    por %xmm0, %xmm5
; SSE2-NEXT:    movdqa {{.*#+}} xmm12 = [0,65535,65535,65535,65535,65535,65535,65535]
; SSE2-NEXT:    pand %xmm12, %xmm5
; SSE2-NEXT:    movdqa %xmm12, %xmm0
; SSE2-NEXT:    pandn %xmm1, %xmm0
; SSE2-NEXT:    por %xmm5, %xmm0
; SSE2-NEXT:    movdqa %xmm8, %xmm1
; SSE2-NEXT:    psraw $15, %xmm1
; SSE2-NEXT:    pmulhuw %xmm9, %xmm1
; SSE2-NEXT:    paddw %xmm8, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm5
; SSE2-NEXT:    pand %xmm11, %xmm5
; SSE2-NEXT:    psraw $4, %xmm1
; SSE2-NEXT:    movdqa %xmm11, %xmm6
; SSE2-NEXT:    pandn %xmm1, %xmm6
; SSE2-NEXT:    por %xmm5, %xmm6
; SSE2-NEXT:    movdqa %xmm6, %xmm1
; SSE2-NEXT:    pand %xmm7, %xmm1
; SSE2-NEXT:    psraw $2, %xmm6
; SSE2-NEXT:    movdqa %xmm7, %xmm5
; SSE2-NEXT:    pandn %xmm6, %xmm5
; SSE2-NEXT:    por %xmm1, %xmm5
; SSE2-NEXT:    movdqa %xmm5, %xmm1
; SSE2-NEXT:    pand %xmm10, %xmm1
; SSE2-NEXT:    psraw $1, %xmm5
; SSE2-NEXT:    movdqa %xmm10, %xmm6
; SSE2-NEXT:    pandn %xmm5, %xmm6
; SSE2-NEXT:    por %xmm1, %xmm6
; SSE2-NEXT:    pand %xmm12, %xmm6
; SSE2-NEXT:    movdqa %xmm12, %xmm1
; SSE2-NEXT:    pandn %xmm8, %xmm1
; SSE2-NEXT:    por %xmm6, %xmm1
; SSE2-NEXT:    movdqa %xmm2, %xmm5
; SSE2-NEXT:    psraw $15, %xmm5
; SSE2-NEXT:    pmulhuw %xmm9, %xmm5
; SSE2-NEXT:    paddw %xmm2, %xmm5
; SSE2-NEXT:    movdqa %xmm5, %xmm6
; SSE2-NEXT:    pand %xmm11, %xmm6
; SSE2-NEXT:    psraw $4, %xmm5
; SSE2-NEXT:    movdqa %xmm11, %xmm4
; SSE2-NEXT:    pandn %xmm5, %xmm4
; SSE2-NEXT:    por %xmm6, %xmm4
; SSE2-NEXT:    movdqa %xmm4, %xmm5
; SSE2-NEXT:    pand %xmm7, %xmm5
; SSE2-NEXT:    psraw $2, %xmm4
; SSE2-NEXT:    movdqa %xmm7, %xmm6
; SSE2-NEXT:    pandn %xmm4, %xmm6
; SSE2-NEXT:    por %xmm5, %xmm6
; SSE2-NEXT:    movdqa %xmm6, %xmm4
; SSE2-NEXT:    pand %xmm10, %xmm4
; SSE2-NEXT:    psraw $1, %xmm6
; SSE2-NEXT:    movdqa %xmm10, %xmm5
; SSE2-NEXT:    pandn %xmm6, %xmm5
; SSE2-NEXT:    por %xmm4, %xmm5
; SSE2-NEXT:    pand %xmm12, %xmm5
; SSE2-NEXT:    movdqa %xmm12, %xmm8
; SSE2-NEXT:    pandn %xmm2, %xmm8
; SSE2-NEXT:    por %xmm5, %xmm8
; SSE2-NEXT:    movdqa %xmm3, %xmm2
; SSE2-NEXT:    psraw $15, %xmm2
; SSE2-NEXT:    pmulhuw %xmm9, %xmm2
; SSE2-NEXT:    paddw %xmm3, %xmm2
; SSE2-NEXT:    movdqa %xmm2, %xmm4
; SSE2-NEXT:    pand %xmm11, %xmm4
; SSE2-NEXT:    psraw $4, %xmm2
; SSE2-NEXT:    pandn %xmm2, %xmm11
; SSE2-NEXT:    por %xmm4, %xmm11
; SSE2-NEXT:    movdqa %xmm11, %xmm2
; SSE2-NEXT:    pand %xmm7, %xmm2
; SSE2-NEXT:    psraw $2, %xmm11
; SSE2-NEXT:    pandn %xmm11, %xmm7
; SSE2-NEXT:    por %xmm2, %xmm7
; SSE2-NEXT:    movdqa %xmm7, %xmm2
; SSE2-NEXT:    pand %xmm10, %xmm2
; SSE2-NEXT:    psraw $1, %xmm7
; SSE2-NEXT:    pandn %xmm7, %xmm10
; SSE2-NEXT:    por %xmm2, %xmm10
; SSE2-NEXT:    pand %xmm12, %xmm10
; SSE2-NEXT:    pandn %xmm3, %xmm12
; SSE2-NEXT:    por %xmm10, %xmm12
; SSE2-NEXT:    movdqa %xmm8, %xmm2
; SSE2-NEXT:    movdqa %xmm12, %xmm3
; SSE2-NEXT:    retq
;
; SSE41-LABEL: combine_vec_sdiv_by_pow2b_v32i16:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa %xmm1, %xmm4
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    psraw $15, %xmm0
; SSE41-NEXT:    movdqa {{.*#+}} xmm7 = <u,4,2,16,8,32,64,2>
; SSE41-NEXT:    pmulhuw %xmm7, %xmm0
; SSE41-NEXT:    paddw %xmm1, %xmm0
; SSE41-NEXT:    movdqa {{.*#+}} xmm6 = <u,16384,32768,4096,8192,2048,1024,32768>
; SSE41-NEXT:    movdqa %xmm0, %xmm5
; SSE41-NEXT:    pmulhw %xmm6, %xmm5
; SSE41-NEXT:    psraw $1, %xmm0
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm5[0,1],xmm0[2],xmm5[3,4,5,6],xmm0[7]
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5,6,7]
; SSE41-NEXT:    movdqa %xmm4, %xmm1
; SSE41-NEXT:    psraw $15, %xmm1
; SSE41-NEXT:    pmulhuw %xmm7, %xmm1
; SSE41-NEXT:    paddw %xmm4, %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm5
; SSE41-NEXT:    pmulhw %xmm6, %xmm5
; SSE41-NEXT:    psraw $1, %xmm1
; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm5[0,1],xmm1[2],xmm5[3,4,5,6],xmm1[7]
; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm4[0],xmm1[1,2,3,4,5,6,7]
; SSE41-NEXT:    movdqa %xmm2, %xmm4
; SSE41-NEXT:    psraw $15, %xmm4
; SSE41-NEXT:    pmulhuw %xmm7, %xmm4
; SSE41-NEXT:    paddw %xmm2, %xmm4
; SSE41-NEXT:    movdqa %xmm4, %xmm5
; SSE41-NEXT:    pmulhw %xmm6, %xmm5
; SSE41-NEXT:    psraw $1, %xmm4
; SSE41-NEXT:    pblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2],xmm5[3,4,5,6],xmm4[7]
; SSE41-NEXT:    pblendw {{.*#+}} xmm4 = xmm2[0],xmm4[1,2,3,4,5,6,7]
; SSE41-NEXT:    movdqa %xmm3, %xmm5
; SSE41-NEXT:    psraw $15, %xmm5
; SSE41-NEXT:    pmulhuw %xmm7, %xmm5
; SSE41-NEXT:    paddw %xmm3, %xmm5
; SSE41-NEXT:    pmulhw %xmm5, %xmm6
; SSE41-NEXT:    psraw $1, %xmm5
; SSE41-NEXT:    pblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2],xmm6[3,4,5,6],xmm5[7]
; SSE41-NEXT:    pblendw {{.*#+}} xmm5 = xmm3[0],xmm5[1,2,3,4,5,6,7]
; SSE41-NEXT:    movdqa %xmm4, %xmm2
; SSE41-NEXT:    movdqa %xmm5, %xmm3
; SSE41-NEXT:    retq
;
; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v32i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vpsraw $15, %xmm2, %xmm3
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = <u,4,2,16,8,32,64,2>
; AVX1-NEXT:    vpmulhuw %xmm4, %xmm3, %xmm3
; AVX1-NEXT:    vpaddw %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,16384,32768,4096,8192,2048,1024,32768>
; AVX1-NEXT:    vpmulhw %xmm3, %xmm2, %xmm5
; AVX1-NEXT:    vpsraw $1, %xmm2, %xmm2
; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm5[0,1],xmm2[2],xmm5[3,4,5,6],xmm2[7]
; AVX1-NEXT:    vpsraw $15, %xmm0, %xmm5
; AVX1-NEXT:    vpmulhuw %xmm4, %xmm5, %xmm5
; AVX1-NEXT:    vpaddw %xmm5, %xmm0, %xmm5
; AVX1-NEXT:    vpmulhw %xmm3, %xmm5, %xmm6
; AVX1-NEXT:    vpsraw $1, %xmm5, %xmm5
; AVX1-NEXT:    vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2],xmm6[3,4,5,6],xmm5[7]
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm5, %ymm2
; AVX1-NEXT:    vbroadcastf128 {{.*#+}} ymm5 = [0,65535,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,65535]
; AVX1-NEXT:    # ymm5 = mem[0,1,0,1]
; AVX1-NEXT:    vandps %ymm5, %ymm2, %ymm2
; AVX1-NEXT:    vandnps %ymm0, %ymm5, %ymm0
; AVX1-NEXT:    vorps %ymm0, %ymm2, %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vpsraw $15, %xmm2, %xmm6
; AVX1-NEXT:    vpmulhuw %xmm4, %xmm6, %xmm6
; AVX1-NEXT:    vpaddw %xmm6, %xmm2, %xmm2
; AVX1-NEXT:    vpmulhw %xmm3, %xmm2, %xmm6
; AVX1-NEXT:    vpsraw $1, %xmm2, %xmm2
; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm6[0,1],xmm2[2],xmm6[3,4,5,6],xmm2[7]
; AVX1-NEXT:    vpsraw $15, %xmm1, %xmm6
; AVX1-NEXT:    vpmulhuw %xmm4, %xmm6, %xmm4
; AVX1-NEXT:    vpaddw %xmm4, %xmm1, %xmm4
; AVX1-NEXT:    vpmulhw %xmm3, %xmm4, %xmm3
; AVX1-NEXT:    vpsraw $1, %xmm4, %xmm4
; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2],xmm3[3,4,5,6],xmm4[7]
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
; AVX1-NEXT:    vandps %ymm5, %ymm2, %ymm2
; AVX1-NEXT:    vandnps %ymm1, %ymm5, %ymm1
; AVX1-NEXT:    vorps %ymm1, %ymm2, %ymm1
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_vec_sdiv_by_pow2b_v32i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsraw $15, %ymm0, %ymm2
; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,4,2,16,8,32,64,2,0,4,2,16,8,32,64,2]
; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
; AVX2-NEXT:    vpmulhuw %ymm3, %ymm2, %ymm2
; AVX2-NEXT:    vpaddw %ymm2, %ymm0, %ymm2
; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,16384,32768,4096,8192,2048,1024,32768,0,16384,32768,4096,8192,2048,1024,32768]
; AVX2-NEXT:    # ymm4 = mem[0,1,0,1]
; AVX2-NEXT:    vpmulhw %ymm4, %ymm2, %ymm5
; AVX2-NEXT:    vpsraw $1, %ymm2, %ymm2
; AVX2-NEXT:    vpblendw {{.*#+}} ymm2 = ymm5[0,1],ymm2[2],ymm5[3,4,5,6],ymm2[7],ymm5[8,9],ymm2[10],ymm5[11,12,13,14],ymm2[15]
; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3,4,5,6,7],ymm0[8],ymm2[9,10,11,12,13,14,15]
; AVX2-NEXT:    vpsraw $15, %ymm1, %ymm2
; AVX2-NEXT:    vpmulhuw %ymm3, %ymm2, %ymm2
; AVX2-NEXT:    vpaddw %ymm2, %ymm1, %ymm2
; AVX2-NEXT:    vpmulhw %ymm4, %ymm2, %ymm3
; AVX2-NEXT:    vpsraw $1, %ymm2, %ymm2
; AVX2-NEXT:    vpblendw {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4,5,6],ymm2[7],ymm3[8,9],ymm2[10],ymm3[11,12,13,14],ymm2[15]
; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3,4,5,6,7],ymm1[8],ymm2[9,10,11,12,13,14,15]
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: combine_vec_sdiv_by_pow2b_v32i16:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpsraw $15, %ymm0, %ymm1
; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [0,4,2,16,8,32,64,2,0,4,2,16,8,32,64,2]
; AVX512F-NEXT:    # ymm2 = mem[0,1,0,1]
; AVX512F-NEXT:    vpmulhuw %ymm2, %ymm1, %ymm1
; AVX512F-NEXT:    vpaddw %ymm1, %ymm0, %ymm1
; AVX512F-NEXT:    vpmovsxwd %ymm1, %zmm1
; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [0,2,1,4,3,5,6,1,0,2,1,4,3,5,6,1]
; AVX512F-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
; AVX512F-NEXT:    vpsravd %zmm3, %zmm1, %zmm1
; AVX512F-NEXT:    vpmovdw %zmm1, %ymm1
; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm4
; AVX512F-NEXT:    vpsraw $15, %ymm4, %ymm5
; AVX512F-NEXT:    vpmulhuw %ymm2, %ymm5, %ymm2
; AVX512F-NEXT:    vpaddw %ymm2, %ymm4, %ymm2
; AVX512F-NEXT:    vpmovsxwd %ymm2, %zmm2
; AVX512F-NEXT:    vpsravd %zmm3, %zmm2, %zmm2
; AVX512F-NEXT:    vpmovdw %zmm2, %ymm2
; AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm1
; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [0,65535,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,65535]
; AVX512F-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512F-NEXT:    vpternlogq $216, %zmm2, %zmm1, %zmm0
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: combine_vec_sdiv_by_pow2b_v32i16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpsraw $15, %zmm0, %zmm1
; AVX512BW-NEXT:    vpsrlvw {{.*}}(%rip), %zmm1, %zmm1
; AVX512BW-NEXT:    vpaddw %zmm1, %zmm0, %zmm1
; AVX512BW-NEXT:    vpsravw {{.*}}(%rip), %zmm1, %zmm1
; AVX512BW-NEXT:    movl $16843009, %eax # imm = 0x1010101
; AVX512BW-NEXT:    kmovd %eax, %k1
; AVX512BW-NEXT:    vmovdqu16 %zmm0, %zmm1 {%k1}
; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm0
; AVX512BW-NEXT:    retq
;
; XOP-LABEL: combine_vec_sdiv_by_pow2b_v32i16:
; XOP:       # %bb.0:
; XOP-NEXT:    vextractf128 $1, %ymm0, %xmm2
; XOP-NEXT:    vpsraw $15, %xmm2, %xmm3
; XOP-NEXT:    vmovdqa {{.*#+}} xmm4 = <u,65522,65521,65524,65523,65525,65526,65521>
; XOP-NEXT:    vpshlw %xmm4, %xmm3, %xmm3
; XOP-NEXT:    vpaddw %xmm3, %xmm2, %xmm2
; XOP-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,65534,65535,65532,65533,65531,65530,65535>
; XOP-NEXT:    vpshaw %xmm3, %xmm2, %xmm2
; XOP-NEXT:    vpsraw $15, %xmm0, %xmm5
; XOP-NEXT:    vpshlw %xmm4, %xmm5, %xmm5
; XOP-NEXT:    vpaddw %xmm5, %xmm0, %xmm5
; XOP-NEXT:    vpshaw %xmm3, %xmm5, %xmm5
; XOP-NEXT:    vinsertf128 $1, %xmm2, %ymm5, %ymm2
; XOP-NEXT:    vbroadcastf128 {{.*#+}} ymm5 = [0,65535,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,65535]
; XOP-NEXT:    # ymm5 = mem[0,1,0,1]
; XOP-NEXT:    vpcmov %ymm5, %ymm0, %ymm2, %ymm0
; XOP-NEXT:    vextractf128 $1, %ymm1, %xmm2
; XOP-NEXT:    vpsraw $15, %xmm2, %xmm6
; XOP-NEXT:    vpshlw %xmm4, %xmm6, %xmm6
; XOP-NEXT:    vpaddw %xmm6, %xmm2, %xmm2
; XOP-NEXT:    vpshaw %xmm3, %xmm2, %xmm2
; XOP-NEXT:    vpsraw $15, %xmm1, %xmm6
; XOP-NEXT:    vpshlw %xmm4, %xmm6, %xmm4
; XOP-NEXT:    vpaddw %xmm4, %xmm1, %xmm4
; XOP-NEXT:    vpshaw %xmm3, %xmm4, %xmm3
; XOP-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
; XOP-NEXT:    vpcmov %ymm5, %ymm1, %ymm2, %ymm1
; XOP-NEXT:    retq
  %1 = sdiv <32 x i16> %x, <i16 1, i16 4, i16 2, i16 16, i16 8, i16 32, i16 64, i16 2, i16 1, i16 4, i16 2, i16 16, i16 8, i16 32, i16 64, i16 2, i16 1, i16 4, i16 2, i16 16, i16 8, i16 32, i16 64, i16 2, i16 1, i16 4, i16 2, i16 16, i16 8, i16 32, i16 64, i16 2>
  ret <32 x i16> %1
}

define <4 x i32> @combine_vec_sdiv_by_pow2b_v4i32(<4 x i32> %x) {
; SSE2-LABEL: combine_vec_sdiv_by_pow2b_v4i32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrad $31, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm2
; SSE2-NEXT:    psrld $28, %xmm2
; SSE2-NEXT:    movdqa %xmm1, %xmm3
; SSE2-NEXT:    psrld $29, %xmm3
; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1]
; SSE2-NEXT:    psrld $30, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0,3]
; SSE2-NEXT:    paddd %xmm0, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm2
; SSE2-NEXT:    psrad $4, %xmm2
; SSE2-NEXT:    movdqa %xmm1, %xmm3
; SSE2-NEXT:    psrad $3, %xmm3
; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1]
; SSE2-NEXT:    psrad $2, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0,3]
; SSE2-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: combine_vec_sdiv_by_pow2b_v4i32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    psrad $31, %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm2
; SSE41-NEXT:    psrld $28, %xmm2
; SSE41-NEXT:    movdqa %xmm1, %xmm3
; SSE41-NEXT:    psrld $30, %xmm3
; SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7]
; SSE41-NEXT:    psrld $29, %xmm1
; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
; SSE41-NEXT:    paddd %xmm0, %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm2
; SSE41-NEXT:    psrad $4, %xmm2
; SSE41-NEXT:    movdqa %xmm1, %xmm3
; SSE41-NEXT:    psrad $2, %xmm3
; SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7]
; SSE41-NEXT:    psrad $3, %xmm1
; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7]
; SSE41-NEXT:    movdqa %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v4i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpsrad $31, %xmm0, %xmm1
; AVX1-NEXT:    vpsrld $28, %xmm1, %xmm2
; AVX1-NEXT:    vpsrld $30, %xmm1, %xmm3
; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT:    vpsrld $29, %xmm1, %xmm1
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
; AVX1-NEXT:    vpsrad $4, %xmm1, %xmm2
; AVX1-NEXT:    vpsrad $2, %xmm1, %xmm3
; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT:    vpsrad $3, %xmm1, %xmm1
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
; AVX1-NEXT:    retq
;
; AVX2ORLATER-LABEL: combine_vec_sdiv_by_pow2b_v4i32:
; AVX2ORLATER:       # %bb.0:
; AVX2ORLATER-NEXT:    vpsrad $31, %xmm0, %xmm1
; AVX2ORLATER-NEXT:    vpsrlvd {{.*}}(%rip), %xmm1, %xmm1
; AVX2ORLATER-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
; AVX2ORLATER-NEXT:    vpsravd {{.*}}(%rip), %xmm1, %xmm1
; AVX2ORLATER-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX2ORLATER-NEXT:    retq
;
; XOP-LABEL: combine_vec_sdiv_by_pow2b_v4i32:
; XOP:       # %bb.0:
; XOP-NEXT:    vpsrad $31, %xmm0, %xmm1
; XOP-NEXT:    vpshld {{.*}}(%rip), %xmm1, %xmm1
; XOP-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
; XOP-NEXT:    vpshad {{.*}}(%rip), %xmm1, %xmm1
; XOP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
; XOP-NEXT:    retq
  %1 = sdiv <4 x i32> %x, <i32 1, i32 4, i32 8, i32 16>
  ret <4 x i32> %1
}

define <8 x i32> @combine_vec_sdiv_by_pow2b_v8i32(<8 x i32> %x) {
; SSE2-LABEL: combine_vec_sdiv_by_pow2b_v8i32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    psrad $31, %xmm0
; SSE2-NEXT:    movdqa %xmm0, %xmm3
; SSE2-NEXT:    psrld $28, %xmm3
; SSE2-NEXT:    movdqa %xmm0, %xmm4
; SSE2-NEXT:    psrld $29, %xmm4
; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm3[1]
; SSE2-NEXT:    psrld $30, %xmm0
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm4[0,3]
; SSE2-NEXT:    paddd %xmm2, %xmm0
; SSE2-NEXT:    movdqa %xmm0, %xmm3
; SSE2-NEXT:    psrad $4, %xmm3
; SSE2-NEXT:    movdqa %xmm0, %xmm4
; SSE2-NEXT:    psrad $3, %xmm4
; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm3[1]
; SSE2-NEXT:    psrad $2, %xmm0
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm4[0,3]
; SSE2-NEXT:    movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
; SSE2-NEXT:    movdqa %xmm1, %xmm2
; SSE2-NEXT:    psrad $31, %xmm2
; SSE2-NEXT:    movdqa %xmm2, %xmm3
; SSE2-NEXT:    psrld $28, %xmm3
; SSE2-NEXT:    movdqa %xmm2, %xmm4
; SSE2-NEXT:    psrld $29, %xmm4
; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm3[1]
; SSE2-NEXT:    psrld $30, %xmm2
; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0,3]
; SSE2-NEXT:    paddd %xmm1, %xmm2
; SSE2-NEXT:    movdqa %xmm2, %xmm3
; SSE2-NEXT:    psrad $4, %xmm3
; SSE2-NEXT:    movdqa %xmm2, %xmm4
; SSE2-NEXT:    psrad $3, %xmm4
; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm3[1]
; SSE2-NEXT:    psrad $2, %xmm2
; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0,3]
; SSE2-NEXT:    movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
; SSE2-NEXT:    movaps %xmm2, %xmm1
; SSE2-NEXT:    retq
;
; SSE41-LABEL: combine_vec_sdiv_by_pow2b_v8i32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa %xmm0, %xmm2
; SSE41-NEXT:    psrad $31, %xmm0
; SSE41-NEXT:    movdqa %xmm0, %xmm3
; SSE41-NEXT:    psrld $28, %xmm3
; SSE41-NEXT:    movdqa %xmm0, %xmm4
; SSE41-NEXT:    psrld $30, %xmm4
; SSE41-NEXT:    pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm3[4,5,6,7]
; SSE41-NEXT:    psrld $29, %xmm0
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3],xmm0[4,5],xmm4[6,7]
; SSE41-NEXT:    paddd %xmm2, %xmm0
; SSE41-NEXT:    movdqa %xmm0, %xmm3
; SSE41-NEXT:    psrad $4, %xmm3
; SSE41-NEXT:    movdqa %xmm0, %xmm4
; SSE41-NEXT:    psrad $2, %xmm4
; SSE41-NEXT:    pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm3[4,5,6,7]
; SSE41-NEXT:    psrad $3, %xmm0
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3],xmm0[4,5],xmm4[6,7]
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3,4,5,6,7]
; SSE41-NEXT:    movdqa %xmm1, %xmm2
; SSE41-NEXT:    psrad $31, %xmm2
; SSE41-NEXT:    movdqa %xmm2, %xmm3
; SSE41-NEXT:    psrld $28, %xmm3
; SSE41-NEXT:    movdqa %xmm2, %xmm4
; SSE41-NEXT:    psrld $30, %xmm4
; SSE41-NEXT:    pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm3[4,5,6,7]
; SSE41-NEXT:    psrld $29, %xmm2
; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7]
; SSE41-NEXT:    paddd %xmm1, %xmm2
; SSE41-NEXT:    movdqa %xmm2, %xmm3
; SSE41-NEXT:    psrad $4, %xmm3
; SSE41-NEXT:    movdqa %xmm2, %xmm4
; SSE41-NEXT:    psrad $2, %xmm4
; SSE41-NEXT:    pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm3[4,5,6,7]
; SSE41-NEXT:    psrad $3, %xmm2
; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7]
; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3,4,5,6,7]
; SSE41-NEXT:    movdqa %xmm2, %xmm1
; SSE41-NEXT:    retq
;
; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v8i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpsrad $31, %xmm1, %xmm2
; AVX1-NEXT:    vpsrld $28, %xmm2, %xmm3
; AVX1-NEXT:    vpsrld $30, %xmm2, %xmm4
; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
; AVX1-NEXT:    vpsrld $29, %xmm2, %xmm2
; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
; AVX1-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpsrad $4, %xmm1, %xmm2
; AVX1-NEXT:    vpsrad $2, %xmm1, %xmm3
; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT:    vpsrad $3, %xmm1, %xmm1
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
; AVX1-NEXT:    vpsrad $31, %xmm0, %xmm2
; AVX1-NEXT:    vpsrld $28, %xmm2, %xmm3
; AVX1-NEXT:    vpsrld $30, %xmm2, %xmm4
; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
; AVX1-NEXT:    vpsrld $29, %xmm2, %xmm2
; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
; AVX1-NEXT:    vpaddd %xmm2, %xmm0, %xmm2
; AVX1-NEXT:    vpsrad $4, %xmm2, %xmm3
; AVX1-NEXT:    vpsrad $2, %xmm2, %xmm4
; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
; AVX1-NEXT:    vpsrad $3, %xmm2, %xmm2
; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
; AVX1-NEXT:    retq
;
; AVX2ORLATER-LABEL: combine_vec_sdiv_by_pow2b_v8i32:
; AVX2ORLATER:       # %bb.0:
; AVX2ORLATER-NEXT:    vpsrad $31, %ymm0, %ymm1
; AVX2ORLATER-NEXT:    vpsrlvd {{.*}}(%rip), %ymm1, %ymm1
; AVX2ORLATER-NEXT:    vpaddd %ymm1, %ymm0, %ymm1
; AVX2ORLATER-NEXT:    vpsravd {{.*}}(%rip), %ymm1, %ymm1
; AVX2ORLATER-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
; AVX2ORLATER-NEXT:    retq
;
; XOP-LABEL: combine_vec_sdiv_by_pow2b_v8i32:
; XOP:       # %bb.0:
; XOP-NEXT:    vextractf128 $1, %ymm0, %xmm1
; XOP-NEXT:    vpsrad $31, %xmm1, %xmm2
; XOP-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,4294967266,4294967267,4294967268>
; XOP-NEXT:    vpshld %xmm3, %xmm2, %xmm2
; XOP-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
; XOP-NEXT:    vmovdqa {{.*#+}} xmm2 = <u,4294967294,4294967293,4294967292>
; XOP-NEXT:    vpshad %xmm2, %xmm1, %xmm1
; XOP-NEXT:    vpsrad $31, %xmm0, %xmm4
; XOP-NEXT:    vpshld %xmm3, %xmm4, %xmm3
; XOP-NEXT:    vpaddd %xmm3, %xmm0, %xmm3
; XOP-NEXT:    vpshad %xmm2, %xmm3, %xmm2
; XOP-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
; XOP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
; XOP-NEXT:    retq
  %1 = sdiv <8 x i32> %x, <i32 1, i32 4, i32 8, i32 16, i32 1, i32 4, i32 8, i32 16>
  ret <8 x i32> %1
}

define <16 x i32> @combine_vec_sdiv_by_pow2b_v16i32(<16 x i32> %x) {
; SSE2-LABEL: combine_vec_sdiv_by_pow2b_v16i32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm1, %xmm4
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrad $31, %xmm0
; SSE2-NEXT:    movdqa %xmm0, %xmm5
; SSE2-NEXT:    psrld $28, %xmm5
; SSE2-NEXT:    movdqa %xmm0, %xmm6
; SSE2-NEXT:    psrld $29, %xmm6
; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm5[1]
; SSE2-NEXT:    psrld $30, %xmm0
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm6[0,3]
; SSE2-NEXT:    paddd %xmm1, %xmm0
; SSE2-NEXT:    movdqa %xmm0, %xmm5
; SSE2-NEXT:    psrad $4, %xmm5
; SSE2-NEXT:    movdqa %xmm0, %xmm6
; SSE2-NEXT:    psrad $3, %xmm6
; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm5[1]
; SSE2-NEXT:    psrad $2, %xmm0
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm6[0,3]
; SSE2-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; SSE2-NEXT:    movdqa %xmm4, %xmm1
; SSE2-NEXT:    psrad $31, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm5
; SSE2-NEXT:    psrld $28, %xmm5
; SSE2-NEXT:    movdqa %xmm1, %xmm6
; SSE2-NEXT:    psrld $29, %xmm6
; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm5[1]
; SSE2-NEXT:    psrld $30, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm6[0,3]
; SSE2-NEXT:    paddd %xmm4, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm5
; SSE2-NEXT:    psrad $4, %xmm5
; SSE2-NEXT:    movdqa %xmm1, %xmm6
; SSE2-NEXT:    psrad $3, %xmm6
; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm5[1]
; SSE2-NEXT:    psrad $2, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm6[0,3]
; SSE2-NEXT:    movss {{.*#+}} xmm1 = xmm4[0],xmm1[1,2,3]
; SSE2-NEXT:    movdqa %xmm2, %xmm4
; SSE2-NEXT:    psrad $31, %xmm4
; SSE2-NEXT:    movdqa %xmm4, %xmm5
; SSE2-NEXT:    psrld $28, %xmm5
; SSE2-NEXT:    movdqa %xmm4, %xmm6
; SSE2-NEXT:    psrld $29, %xmm6
; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm5[1]
; SSE2-NEXT:    psrld $30, %xmm4
; SSE2-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,1],xmm6[0,3]
; SSE2-NEXT:    paddd %xmm2, %xmm4
; SSE2-NEXT:    movdqa %xmm4, %xmm5
; SSE2-NEXT:    psrad $4, %xmm5
; SSE2-NEXT:    movdqa %xmm4, %xmm6
; SSE2-NEXT:    psrad $3, %xmm6
; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm5[1]
; SSE2-NEXT:    psrad $2, %xmm4
; SSE2-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,1],xmm6[0,3]
; SSE2-NEXT:    movss {{.*#+}} xmm4 = xmm2[0],xmm4[1,2,3]
; SSE2-NEXT:    movdqa %xmm3, %xmm5
; SSE2-NEXT:    psrad $31, %xmm5
; SSE2-NEXT:    movdqa %xmm5, %xmm2
; SSE2-NEXT:    psrld $28, %xmm2
; SSE2-NEXT:    movdqa %xmm5, %xmm6
; SSE2-NEXT:    psrld $29, %xmm6
; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm2[1]
; SSE2-NEXT:    psrld $30, %xmm5
; SSE2-NEXT:    shufps {{.*#+}} xmm5 = xmm5[0,1],xmm6[0,3]
; SSE2-NEXT:    paddd %xmm3, %xmm5
; SSE2-NEXT:    movdqa %xmm5, %xmm2
; SSE2-NEXT:    psrad $4, %xmm2
; SSE2-NEXT:    movdqa %xmm5, %xmm6
; SSE2-NEXT:    psrad $3, %xmm6
; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm2[1]
; SSE2-NEXT:    psrad $2, %xmm5
; SSE2-NEXT:    shufps {{.*#+}} xmm5 = xmm5[0,1],xmm6[0,3]
; SSE2-NEXT:    movss {{.*#+}} xmm5 = xmm3[0],xmm5[1,2,3]
; SSE2-NEXT:    movaps %xmm4, %xmm2
; SSE2-NEXT:    movaps %xmm5, %xmm3
; SSE2-NEXT:    retq
;
; SSE41-LABEL: combine_vec_sdiv_by_pow2b_v16i32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa %xmm1, %xmm4
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    psrad $31, %xmm0
; SSE41-NEXT:    movdqa %xmm0, %xmm5
; SSE41-NEXT:    psrld $28, %xmm5
; SSE41-NEXT:    movdqa %xmm0, %xmm6
; SSE41-NEXT:    psrld $30, %xmm6
; SSE41-NEXT:    pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm5[4,5,6,7]
; SSE41-NEXT:    psrld $29, %xmm0
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm6[2,3],xmm0[4,5],xmm6[6,7]
; SSE41-NEXT:    paddd %xmm1, %xmm0
; SSE41-NEXT:    movdqa %xmm0, %xmm5
; SSE41-NEXT:    psrad $4, %xmm5
; SSE41-NEXT:    movdqa %xmm0, %xmm6
; SSE41-NEXT:    psrad $2, %xmm6
; SSE41-NEXT:    pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm5[4,5,6,7]
; SSE41-NEXT:    psrad $3, %xmm0
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm6[2,3],xmm0[4,5],xmm6[6,7]
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
; SSE41-NEXT:    movdqa %xmm4, %xmm1
1334; SSE41-NEXT:    psrad $31, %xmm1
1335; SSE41-NEXT:    movdqa %xmm1, %xmm5
1336; SSE41-NEXT:    psrld $28, %xmm5
1337; SSE41-NEXT:    movdqa %xmm1, %xmm6
1338; SSE41-NEXT:    psrld $30, %xmm6
1339; SSE41-NEXT:    pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm5[4,5,6,7]
1340; SSE41-NEXT:    psrld $29, %xmm1
1341; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm6[2,3],xmm1[4,5],xmm6[6,7]
1342; SSE41-NEXT:    paddd %xmm4, %xmm1
1343; SSE41-NEXT:    movdqa %xmm1, %xmm5
1344; SSE41-NEXT:    psrad $4, %xmm5
1345; SSE41-NEXT:    movdqa %xmm1, %xmm6
1346; SSE41-NEXT:    psrad $2, %xmm6
1347; SSE41-NEXT:    pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm5[4,5,6,7]
1348; SSE41-NEXT:    psrad $3, %xmm1
1349; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm6[2,3],xmm1[4,5],xmm6[6,7]
1350; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3,4,5,6,7]
1351; SSE41-NEXT:    movdqa %xmm2, %xmm4
1352; SSE41-NEXT:    psrad $31, %xmm4
1353; SSE41-NEXT:    movdqa %xmm4, %xmm5
1354; SSE41-NEXT:    psrld $28, %xmm5
1355; SSE41-NEXT:    movdqa %xmm4, %xmm6
1356; SSE41-NEXT:    psrld $30, %xmm6
1357; SSE41-NEXT:    pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm5[4,5,6,7]
1358; SSE41-NEXT:    psrld $29, %xmm4
1359; SSE41-NEXT:    pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm6[2,3],xmm4[4,5],xmm6[6,7]
1360; SSE41-NEXT:    paddd %xmm2, %xmm4
1361; SSE41-NEXT:    movdqa %xmm4, %xmm5
1362; SSE41-NEXT:    psrad $4, %xmm5
1363; SSE41-NEXT:    movdqa %xmm4, %xmm6
1364; SSE41-NEXT:    psrad $2, %xmm6
1365; SSE41-NEXT:    pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm5[4,5,6,7]
1366; SSE41-NEXT:    psrad $3, %xmm4
1367; SSE41-NEXT:    pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm6[2,3],xmm4[4,5],xmm6[6,7]
1368; SSE41-NEXT:    pblendw {{.*#+}} xmm4 = xmm2[0,1],xmm4[2,3,4,5,6,7]
1369; SSE41-NEXT:    movdqa %xmm3, %xmm5
1370; SSE41-NEXT:    psrad $31, %xmm5
1371; SSE41-NEXT:    movdqa %xmm5, %xmm2
1372; SSE41-NEXT:    psrld $28, %xmm2
1373; SSE41-NEXT:    movdqa %xmm5, %xmm6
1374; SSE41-NEXT:    psrld $30, %xmm6
1375; SSE41-NEXT:    pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm2[4,5,6,7]
1376; SSE41-NEXT:    psrld $29, %xmm5
1377; SSE41-NEXT:    pblendw {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3],xmm5[4,5],xmm6[6,7]
1378; SSE41-NEXT:    paddd %xmm3, %xmm5
1379; SSE41-NEXT:    movdqa %xmm5, %xmm2
1380; SSE41-NEXT:    psrad $4, %xmm2
1381; SSE41-NEXT:    movdqa %xmm5, %xmm6
1382; SSE41-NEXT:    psrad $2, %xmm6
1383; SSE41-NEXT:    pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm2[4,5,6,7]
1384; SSE41-NEXT:    psrad $3, %xmm5
1385; SSE41-NEXT:    pblendw {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3],xmm5[4,5],xmm6[6,7]
1386; SSE41-NEXT:    pblendw {{.*#+}} xmm5 = xmm3[0,1],xmm5[2,3,4,5,6,7]
1387; SSE41-NEXT:    movdqa %xmm4, %xmm2
1388; SSE41-NEXT:    movdqa %xmm5, %xmm3
1389; SSE41-NEXT:    retq
1390;
1391; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v16i32:
1392; AVX1:       # %bb.0:
1393; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
1394; AVX1-NEXT:    vpsrad $31, %xmm2, %xmm3
1395; AVX1-NEXT:    vpsrld $28, %xmm3, %xmm4
1396; AVX1-NEXT:    vpsrld $30, %xmm3, %xmm5
1397; AVX1-NEXT:    vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7]
1398; AVX1-NEXT:    vpsrld $29, %xmm3, %xmm3
1399; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7]
1400; AVX1-NEXT:    vpaddd %xmm3, %xmm2, %xmm2
1401; AVX1-NEXT:    vpsrad $4, %xmm2, %xmm3
1402; AVX1-NEXT:    vpsrad $2, %xmm2, %xmm4
1403; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
1404; AVX1-NEXT:    vpsrad $3, %xmm2, %xmm2
1405; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
1406; AVX1-NEXT:    vpsrad $31, %xmm0, %xmm3
1407; AVX1-NEXT:    vpsrld $28, %xmm3, %xmm4
1408; AVX1-NEXT:    vpsrld $30, %xmm3, %xmm5
1409; AVX1-NEXT:    vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7]
1410; AVX1-NEXT:    vpsrld $29, %xmm3, %xmm3
1411; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7]
1412; AVX1-NEXT:    vpaddd %xmm3, %xmm0, %xmm3
1413; AVX1-NEXT:    vpsrad $4, %xmm3, %xmm4
1414; AVX1-NEXT:    vpsrad $2, %xmm3, %xmm5
1415; AVX1-NEXT:    vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7]
1416; AVX1-NEXT:    vpsrad $3, %xmm3, %xmm3
1417; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7]
1418; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
1419; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7]
1420; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
1421; AVX1-NEXT:    vpsrad $31, %xmm2, %xmm3
1422; AVX1-NEXT:    vpsrld $28, %xmm3, %xmm4
1423; AVX1-NEXT:    vpsrld $30, %xmm3, %xmm5
1424; AVX1-NEXT:    vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7]
1425; AVX1-NEXT:    vpsrld $29, %xmm3, %xmm3
1426; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7]
1427; AVX1-NEXT:    vpaddd %xmm3, %xmm2, %xmm2
1428; AVX1-NEXT:    vpsrad $4, %xmm2, %xmm3
1429; AVX1-NEXT:    vpsrad $2, %xmm2, %xmm4
1430; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
1431; AVX1-NEXT:    vpsrad $3, %xmm2, %xmm2
1432; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
1433; AVX1-NEXT:    vpsrad $31, %xmm1, %xmm3
1434; AVX1-NEXT:    vpsrld $28, %xmm3, %xmm4
1435; AVX1-NEXT:    vpsrld $30, %xmm3, %xmm5
1436; AVX1-NEXT:    vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7]
1437; AVX1-NEXT:    vpsrld $29, %xmm3, %xmm3
1438; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7]
1439; AVX1-NEXT:    vpaddd %xmm3, %xmm1, %xmm3
1440; AVX1-NEXT:    vpsrad $4, %xmm3, %xmm4
1441; AVX1-NEXT:    vpsrad $2, %xmm3, %xmm5
1442; AVX1-NEXT:    vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7]
1443; AVX1-NEXT:    vpsrad $3, %xmm3, %xmm3
1444; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7]
1445; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
1446; AVX1-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7]
1447; AVX1-NEXT:    retq
1448;
1449; AVX2-LABEL: combine_vec_sdiv_by_pow2b_v16i32:
1450; AVX2:       # %bb.0:
1451; AVX2-NEXT:    vpsrad $31, %ymm0, %ymm2
1452; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,30,29,28,0,30,29,28]
1453; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
1454; AVX2-NEXT:    vpsrlvd %ymm3, %ymm2, %ymm2
1455; AVX2-NEXT:    vpaddd %ymm2, %ymm0, %ymm2
1456; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,2,3,4,0,2,3,4]
1457; AVX2-NEXT:    # ymm4 = mem[0,1,0,1]
1458; AVX2-NEXT:    vpsravd %ymm4, %ymm2, %ymm2
1459; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7]
1460; AVX2-NEXT:    vpsrad $31, %ymm1, %ymm2
1461; AVX2-NEXT:    vpsrlvd %ymm3, %ymm2, %ymm2
1462; AVX2-NEXT:    vpaddd %ymm2, %ymm1, %ymm2
1463; AVX2-NEXT:    vpsravd %ymm4, %ymm2, %ymm2
1464; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7]
1465; AVX2-NEXT:    retq
1466;
1467; AVX512F-LABEL: combine_vec_sdiv_by_pow2b_v16i32:
1468; AVX512F:       # %bb.0:
1469; AVX512F-NEXT:    vpsrad $31, %zmm0, %zmm1
1470; AVX512F-NEXT:    vpsrlvd {{.*}}(%rip), %zmm1, %zmm1
1471; AVX512F-NEXT:    vpaddd %zmm1, %zmm0, %zmm1
1472; AVX512F-NEXT:    vpsravd {{.*}}(%rip), %zmm1, %zmm1
1473; AVX512F-NEXT:    movw $4369, %ax # imm = 0x1111
1474; AVX512F-NEXT:    kmovw %eax, %k1
1475; AVX512F-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
1476; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm0
1477; AVX512F-NEXT:    retq
1478;
1479; AVX512BW-LABEL: combine_vec_sdiv_by_pow2b_v16i32:
1480; AVX512BW:       # %bb.0:
1481; AVX512BW-NEXT:    vpsrad $31, %zmm0, %zmm1
1482; AVX512BW-NEXT:    vpsrlvd {{.*}}(%rip), %zmm1, %zmm1
1483; AVX512BW-NEXT:    vpaddd %zmm1, %zmm0, %zmm1
1484; AVX512BW-NEXT:    vpsravd {{.*}}(%rip), %zmm1, %zmm1
1485; AVX512BW-NEXT:    movw $4369, %ax # imm = 0x1111
1486; AVX512BW-NEXT:    kmovd %eax, %k1
1487; AVX512BW-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
1488; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm0
1489; AVX512BW-NEXT:    retq
1490;
1491; XOP-LABEL: combine_vec_sdiv_by_pow2b_v16i32:
1492; XOP:       # %bb.0:
1493; XOP-NEXT:    vextractf128 $1, %ymm0, %xmm2
1494; XOP-NEXT:    vpsrad $31, %xmm2, %xmm3
1495; XOP-NEXT:    vmovdqa {{.*#+}} xmm4 = <u,4294967266,4294967267,4294967268>
1496; XOP-NEXT:    vpshld %xmm4, %xmm3, %xmm3
1497; XOP-NEXT:    vpaddd %xmm3, %xmm2, %xmm2
1498; XOP-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,4294967294,4294967293,4294967292>
1499; XOP-NEXT:    vpshad %xmm3, %xmm2, %xmm2
1500; XOP-NEXT:    vpsrad $31, %xmm0, %xmm5
1501; XOP-NEXT:    vpshld %xmm4, %xmm5, %xmm5
1502; XOP-NEXT:    vpaddd %xmm5, %xmm0, %xmm5
1503; XOP-NEXT:    vpshad %xmm3, %xmm5, %xmm5
1504; XOP-NEXT:    vinsertf128 $1, %xmm2, %ymm5, %ymm2
1505; XOP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7]
1506; XOP-NEXT:    vextractf128 $1, %ymm1, %xmm2
1507; XOP-NEXT:    vpsrad $31, %xmm2, %xmm5
1508; XOP-NEXT:    vpshld %xmm4, %xmm5, %xmm5
1509; XOP-NEXT:    vpaddd %xmm5, %xmm2, %xmm2
1510; XOP-NEXT:    vpshad %xmm3, %xmm2, %xmm2
1511; XOP-NEXT:    vpsrad $31, %xmm1, %xmm5
1512; XOP-NEXT:    vpshld %xmm4, %xmm5, %xmm4
1513; XOP-NEXT:    vpaddd %xmm4, %xmm1, %xmm4
1514; XOP-NEXT:    vpshad %xmm3, %xmm4, %xmm3
1515; XOP-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
1516; XOP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7]
1517; XOP-NEXT:    retq
1518  %1 = sdiv <16 x i32> %x, <i32 1, i32 4, i32 8, i32 16, i32 1, i32 4, i32 8, i32 16, i32 1, i32 4, i32 8, i32 16, i32 1, i32 4, i32 8, i32 16>
1519  ret <16 x i32> %1
1520}
1521
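; The i64 power-of-2 cases below show the 64-bit arithmetic right shift being emulated
; (psrad/psrlq blends, or psrlq plus an xor/sub sign fixup) on pre-AVX512 targets;
; AVX512 (vpsraq) and XOP (vpshaq) can shift arithmetically in one instruction.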
1522define <2 x i64> @combine_vec_sdiv_by_pow2b_v2i64(<2 x i64> %x) {
1523; SSE2-LABEL: combine_vec_sdiv_by_pow2b_v2i64:
1524; SSE2:       # %bb.0:
1525; SSE2-NEXT:    movdqa %xmm0, %xmm1
1526; SSE2-NEXT:    psrad $31, %xmm1
1527; SSE2-NEXT:    psrlq $62, %xmm1
1528; SSE2-NEXT:    paddq %xmm0, %xmm1
1529; SSE2-NEXT:    movdqa %xmm1, %xmm2
1530; SSE2-NEXT:    psrad $2, %xmm2
1531; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
1532; SSE2-NEXT:    psrlq $2, %xmm1
1533; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
1534; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
1535; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
1536; SSE2-NEXT:    retq
1537;
1538; SSE41-LABEL: combine_vec_sdiv_by_pow2b_v2i64:
1539; SSE41:       # %bb.0:
1540; SSE41-NEXT:    movdqa %xmm0, %xmm1
1541; SSE41-NEXT:    psrad $31, %xmm1
1542; SSE41-NEXT:    psrlq $62, %xmm1
1543; SSE41-NEXT:    paddq %xmm0, %xmm1
1544; SSE41-NEXT:    movdqa %xmm1, %xmm2
1545; SSE41-NEXT:    psrad $2, %xmm2
1546; SSE41-NEXT:    psrlq $2, %xmm1
1547; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
1548; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7]
1549; SSE41-NEXT:    movdqa %xmm1, %xmm0
1550; SSE41-NEXT:    retq
1551;
1552; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v2i64:
1553; AVX1:       # %bb.0:
1554; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1555; AVX1-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm1
1556; AVX1-NEXT:    vpsrlq $62, %xmm1, %xmm1
1557; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm1
1558; AVX1-NEXT:    vpsrad $2, %xmm1, %xmm2
1559; AVX1-NEXT:    vpsrlq $2, %xmm1, %xmm1
1560; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
1561; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
1562; AVX1-NEXT:    retq
1563;
1564; AVX2-LABEL: combine_vec_sdiv_by_pow2b_v2i64:
1565; AVX2:       # %bb.0:
1566; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1567; AVX2-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm1
1568; AVX2-NEXT:    vpsrlq $62, %xmm1, %xmm1
1569; AVX2-NEXT:    vpaddq %xmm1, %xmm0, %xmm1
1570; AVX2-NEXT:    vpsrad $2, %xmm1, %xmm2
1571; AVX2-NEXT:    vpsrlq $2, %xmm1, %xmm1
1572; AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
1573; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
1574; AVX2-NEXT:    retq
1575;
1576; AVX512F-LABEL: combine_vec_sdiv_by_pow2b_v2i64:
1577; AVX512F:       # %bb.0:
1578; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
1579; AVX512F-NEXT:    vpsraq $63, %zmm0, %zmm1
1580; AVX512F-NEXT:    vpsrlq $62, %xmm1, %xmm1
1581; AVX512F-NEXT:    vpaddq %xmm1, %xmm0, %xmm1
1582; AVX512F-NEXT:    vpsraq $2, %zmm1, %zmm1
1583; AVX512F-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
1584; AVX512F-NEXT:    vzeroupper
1585; AVX512F-NEXT:    retq
1586;
1587; AVX512BW-LABEL: combine_vec_sdiv_by_pow2b_v2i64:
1588; AVX512BW:       # %bb.0:
1589; AVX512BW-NEXT:    vpsraq $63, %xmm0, %xmm1
1590; AVX512BW-NEXT:    vpsrlq $62, %xmm1, %xmm1
1591; AVX512BW-NEXT:    vpaddq %xmm1, %xmm0, %xmm1
1592; AVX512BW-NEXT:    vpsraq $2, %xmm1, %xmm1
1593; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
1594; AVX512BW-NEXT:    retq
1595;
1596; XOP-LABEL: combine_vec_sdiv_by_pow2b_v2i64:
1597; XOP:       # %bb.0:
1598; XOP-NEXT:    vpshaq {{.*}}(%rip), %xmm0, %xmm1
1599; XOP-NEXT:    vpsrlq $62, %xmm1, %xmm1
1600; XOP-NEXT:    vpaddq %xmm1, %xmm0, %xmm1
1601; XOP-NEXT:    vpshaq {{.*}}(%rip), %xmm1, %xmm1
1602; XOP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
1603; XOP-NEXT:    retq
1604  %1 = sdiv <2 x i64> %x, <i64 1, i64 4>
1605  ret <2 x i64> %1
1606}
1607
1608define <4 x i64> @combine_vec_sdiv_by_pow2b_v4i64(<4 x i64> %x) {
1609; SSE2-LABEL: combine_vec_sdiv_by_pow2b_v4i64:
1610; SSE2:       # %bb.0:
1611; SSE2-NEXT:    movdqa %xmm0, %xmm2
1612; SSE2-NEXT:    psrad $31, %xmm2
1613; SSE2-NEXT:    psrlq $62, %xmm2
1614; SSE2-NEXT:    paddq %xmm0, %xmm2
1615; SSE2-NEXT:    movdqa %xmm2, %xmm3
1616; SSE2-NEXT:    psrad $2, %xmm3
1617; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
1618; SSE2-NEXT:    psrlq $2, %xmm2
1619; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
1620; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
1621; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
1622; SSE2-NEXT:    movdqa %xmm1, %xmm2
1623; SSE2-NEXT:    psrad $31, %xmm2
1624; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
1625; SSE2-NEXT:    movdqa %xmm2, %xmm3
1626; SSE2-NEXT:    psrlq $61, %xmm3
1627; SSE2-NEXT:    psrlq $60, %xmm2
1628; SSE2-NEXT:    movsd {{.*#+}} xmm2 = xmm3[0],xmm2[1]
1629; SSE2-NEXT:    paddq %xmm1, %xmm2
1630; SSE2-NEXT:    movdqa %xmm2, %xmm1
1631; SSE2-NEXT:    psrlq $3, %xmm1
1632; SSE2-NEXT:    psrlq $4, %xmm2
1633; SSE2-NEXT:    movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
1634; SSE2-NEXT:    movapd {{.*#+}} xmm1 = [1152921504606846976,576460752303423488]
1635; SSE2-NEXT:    xorpd %xmm1, %xmm2
1636; SSE2-NEXT:    psubq %xmm1, %xmm2
1637; SSE2-NEXT:    movdqa %xmm2, %xmm1
1638; SSE2-NEXT:    retq
1639;
1640; SSE41-LABEL: combine_vec_sdiv_by_pow2b_v4i64:
1641; SSE41:       # %bb.0:
1642; SSE41-NEXT:    movdqa %xmm0, %xmm2
1643; SSE41-NEXT:    psrad $31, %xmm0
1644; SSE41-NEXT:    psrlq $62, %xmm0
1645; SSE41-NEXT:    paddq %xmm2, %xmm0
1646; SSE41-NEXT:    movdqa %xmm0, %xmm3
1647; SSE41-NEXT:    psrad $2, %xmm3
1648; SSE41-NEXT:    psrlq $2, %xmm0
1649; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7]
1650; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
1651; SSE41-NEXT:    movdqa %xmm1, %xmm2
1652; SSE41-NEXT:    psrad $31, %xmm2
1653; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
1654; SSE41-NEXT:    movdqa %xmm2, %xmm3
1655; SSE41-NEXT:    psrlq $60, %xmm3
1656; SSE41-NEXT:    psrlq $61, %xmm2
1657; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5,6,7]
1658; SSE41-NEXT:    paddq %xmm1, %xmm2
1659; SSE41-NEXT:    movdqa %xmm2, %xmm1
1660; SSE41-NEXT:    psrlq $4, %xmm1
1661; SSE41-NEXT:    psrlq $3, %xmm2
1662; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
1663; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [1152921504606846976,576460752303423488]
1664; SSE41-NEXT:    pxor %xmm1, %xmm2
1665; SSE41-NEXT:    psubq %xmm1, %xmm2
1666; SSE41-NEXT:    movdqa %xmm2, %xmm1
1667; SSE41-NEXT:    retq
1668;
1669; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v4i64:
1670; AVX1:       # %bb.0:
1671; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
1672; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
1673; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm2, %xmm3
1674; AVX1-NEXT:    vpsrlq $60, %xmm3, %xmm4
1675; AVX1-NEXT:    vpsrlq $61, %xmm3, %xmm3
1676; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4,5,6,7]
1677; AVX1-NEXT:    vpaddq %xmm3, %xmm1, %xmm1
1678; AVX1-NEXT:    vpsrlq $4, %xmm1, %xmm3
1679; AVX1-NEXT:    vpsrlq $3, %xmm1, %xmm1
1680; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7]
1681; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [1152921504606846976,576460752303423488]
1682; AVX1-NEXT:    vpxor %xmm3, %xmm1, %xmm1
1683; AVX1-NEXT:    vpsubq %xmm3, %xmm1, %xmm1
1684; AVX1-NEXT:    vpcmpgtq %xmm0, %xmm2, %xmm2
1685; AVX1-NEXT:    vpsrlq $62, %xmm2, %xmm2
1686; AVX1-NEXT:    vpaddq %xmm2, %xmm0, %xmm2
1687; AVX1-NEXT:    vpsrad $2, %xmm2, %xmm3
1688; AVX1-NEXT:    vpsrlq $2, %xmm2, %xmm2
1689; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
1690; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
1691; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
1692; AVX1-NEXT:    retq
1693;
1694; AVX2-LABEL: combine_vec_sdiv_by_pow2b_v4i64:
1695; AVX2:       # %bb.0:
1696; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1697; AVX2-NEXT:    vpcmpgtq %ymm0, %ymm1, %ymm1
1698; AVX2-NEXT:    vpsrlvq {{.*}}(%rip), %ymm1, %ymm1
1699; AVX2-NEXT:    vpaddq %ymm1, %ymm0, %ymm1
1700; AVX2-NEXT:    vpsrlvq {{.*}}(%rip), %ymm1, %ymm1
1701; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = <u,2305843009213693952,1152921504606846976,576460752303423488>
1702; AVX2-NEXT:    vpxor %ymm2, %ymm1, %ymm1
1703; AVX2-NEXT:    vpsubq %ymm2, %ymm1, %ymm1
1704; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
1705; AVX2-NEXT:    retq
1706;
1707; AVX512F-LABEL: combine_vec_sdiv_by_pow2b_v4i64:
1708; AVX512F:       # %bb.0:
1709; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
1710; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm1 = <u,2,3,4>
1711; AVX512F-NEXT:    vpsraq $63, %zmm0, %zmm2
1712; AVX512F-NEXT:    vpsrlvq {{.*}}(%rip), %ymm2, %ymm2
1713; AVX512F-NEXT:    vpaddq %ymm2, %ymm0, %ymm2
1714; AVX512F-NEXT:    vpsravq %zmm1, %zmm2, %zmm1
1715; AVX512F-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
1716; AVX512F-NEXT:    retq
1717;
1718; AVX512BW-LABEL: combine_vec_sdiv_by_pow2b_v4i64:
1719; AVX512BW:       # %bb.0:
1720; AVX512BW-NEXT:    vpsraq $63, %ymm0, %ymm1
1721; AVX512BW-NEXT:    vpsrlvq {{.*}}(%rip), %ymm1, %ymm1
1722; AVX512BW-NEXT:    vpaddq %ymm1, %ymm0, %ymm1
1723; AVX512BW-NEXT:    vpsravq {{.*}}(%rip), %ymm1, %ymm1
1724; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
1725; AVX512BW-NEXT:    retq
1726;
1727; XOP-LABEL: combine_vec_sdiv_by_pow2b_v4i64:
1728; XOP:       # %bb.0:
1729; XOP-NEXT:    vmovdqa {{.*#+}} xmm1 = [18446744073709551553,18446744073709551553]
1730; XOP-NEXT:    vpshaq %xmm1, %xmm0, %xmm2
1731; XOP-NEXT:    vpsrlq $62, %xmm2, %xmm2
1732; XOP-NEXT:    vpaddq %xmm2, %xmm0, %xmm2
1733; XOP-NEXT:    vpshaq {{.*}}(%rip), %xmm2, %xmm2
1734; XOP-NEXT:    vextractf128 $1, %ymm0, %xmm3
1735; XOP-NEXT:    vpshaq %xmm1, %xmm3, %xmm1
1736; XOP-NEXT:    vpshlq {{.*}}(%rip), %xmm1, %xmm1
1737; XOP-NEXT:    vpaddq %xmm1, %xmm3, %xmm1
1738; XOP-NEXT:    vpshaq {{.*}}(%rip), %xmm1, %xmm1
1739; XOP-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
1740; XOP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
1741; XOP-NEXT:    retq
1742  %1 = sdiv <4 x i64> %x, <i64 1, i64 4, i64 8, i64 16>
1743  ret <4 x i64> %1
1744}
1745
1746define <8 x i64> @combine_vec_sdiv_by_pow2b_v8i64(<8 x i64> %x) {
1747; SSE2-LABEL: combine_vec_sdiv_by_pow2b_v8i64:
1748; SSE2:       # %bb.0:
1749; SSE2-NEXT:    movdqa %xmm0, %xmm4
1750; SSE2-NEXT:    psrad $31, %xmm4
1751; SSE2-NEXT:    psrlq $62, %xmm4
1752; SSE2-NEXT:    paddq %xmm0, %xmm4
1753; SSE2-NEXT:    movdqa %xmm4, %xmm5
1754; SSE2-NEXT:    psrad $2, %xmm5
1755; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,3,2,3]
1756; SSE2-NEXT:    psrlq $2, %xmm4
1757; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
1758; SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
1759; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3]
1760; SSE2-NEXT:    movdqa %xmm2, %xmm4
1761; SSE2-NEXT:    psrad $31, %xmm4
1762; SSE2-NEXT:    psrlq $62, %xmm4
1763; SSE2-NEXT:    paddq %xmm2, %xmm4
1764; SSE2-NEXT:    movdqa %xmm4, %xmm5
1765; SSE2-NEXT:    psrad $2, %xmm5
1766; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,3,2,3]
1767; SSE2-NEXT:    psrlq $2, %xmm4
1768; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
1769; SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
1770; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3]
1771; SSE2-NEXT:    movdqa %xmm1, %xmm4
1772; SSE2-NEXT:    psrad $31, %xmm4
1773; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
1774; SSE2-NEXT:    movdqa %xmm4, %xmm5
1775; SSE2-NEXT:    psrlq $61, %xmm5
1776; SSE2-NEXT:    psrlq $60, %xmm4
1777; SSE2-NEXT:    movsd {{.*#+}} xmm4 = xmm5[0],xmm4[1]
1778; SSE2-NEXT:    paddq %xmm1, %xmm4
1779; SSE2-NEXT:    movdqa %xmm4, %xmm1
1780; SSE2-NEXT:    psrlq $3, %xmm1
1781; SSE2-NEXT:    psrlq $4, %xmm4
1782; SSE2-NEXT:    movsd {{.*#+}} xmm4 = xmm1[0],xmm4[1]
1783; SSE2-NEXT:    movapd {{.*#+}} xmm1 = [1152921504606846976,576460752303423488]
1784; SSE2-NEXT:    xorpd %xmm1, %xmm4
1785; SSE2-NEXT:    psubq %xmm1, %xmm4
1786; SSE2-NEXT:    movdqa %xmm3, %xmm5
1787; SSE2-NEXT:    psrad $31, %xmm5
1788; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
1789; SSE2-NEXT:    movdqa %xmm5, %xmm6
1790; SSE2-NEXT:    psrlq $61, %xmm6
1791; SSE2-NEXT:    psrlq $60, %xmm5
1792; SSE2-NEXT:    movsd {{.*#+}} xmm5 = xmm6[0],xmm5[1]
1793; SSE2-NEXT:    paddq %xmm3, %xmm5
1794; SSE2-NEXT:    movdqa %xmm5, %xmm3
1795; SSE2-NEXT:    psrlq $3, %xmm3
1796; SSE2-NEXT:    psrlq $4, %xmm5
1797; SSE2-NEXT:    movsd {{.*#+}} xmm5 = xmm3[0],xmm5[1]
1798; SSE2-NEXT:    xorpd %xmm1, %xmm5
1799; SSE2-NEXT:    psubq %xmm1, %xmm5
1800; SSE2-NEXT:    movdqa %xmm4, %xmm1
1801; SSE2-NEXT:    movdqa %xmm5, %xmm3
1802; SSE2-NEXT:    retq
1803;
1804; SSE41-LABEL: combine_vec_sdiv_by_pow2b_v8i64:
1805; SSE41:       # %bb.0:
1806; SSE41-NEXT:    movdqa %xmm2, %xmm5
1807; SSE41-NEXT:    movdqa %xmm1, %xmm4
1808; SSE41-NEXT:    movdqa %xmm0, %xmm1
1809; SSE41-NEXT:    psrad $31, %xmm0
1810; SSE41-NEXT:    psrlq $62, %xmm0
1811; SSE41-NEXT:    paddq %xmm1, %xmm0
1812; SSE41-NEXT:    movdqa %xmm0, %xmm2
1813; SSE41-NEXT:    psrad $2, %xmm2
1814; SSE41-NEXT:    psrlq $2, %xmm0
1815; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
1816; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
1817; SSE41-NEXT:    movdqa %xmm5, %xmm2
1818; SSE41-NEXT:    psrad $31, %xmm2
1819; SSE41-NEXT:    psrlq $62, %xmm2
1820; SSE41-NEXT:    paddq %xmm5, %xmm2
1821; SSE41-NEXT:    movdqa %xmm2, %xmm1
1822; SSE41-NEXT:    psrad $2, %xmm1
1823; SSE41-NEXT:    psrlq $2, %xmm2
1824; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
1825; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm5[0,1,2,3],xmm2[4,5,6,7]
1826; SSE41-NEXT:    movdqa %xmm4, %xmm1
1827; SSE41-NEXT:    psrad $31, %xmm1
1828; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
1829; SSE41-NEXT:    movdqa %xmm1, %xmm5
1830; SSE41-NEXT:    psrlq $60, %xmm5
1831; SSE41-NEXT:    psrlq $61, %xmm1
1832; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm5[4,5,6,7]
1833; SSE41-NEXT:    paddq %xmm4, %xmm1
1834; SSE41-NEXT:    movdqa %xmm1, %xmm4
1835; SSE41-NEXT:    psrlq $4, %xmm4
1836; SSE41-NEXT:    psrlq $3, %xmm1
1837; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4,5,6,7]
1838; SSE41-NEXT:    movdqa {{.*#+}} xmm5 = [1152921504606846976,576460752303423488]
1839; SSE41-NEXT:    pxor %xmm5, %xmm1
1840; SSE41-NEXT:    psubq %xmm5, %xmm1
1841; SSE41-NEXT:    movdqa %xmm3, %xmm4
1842; SSE41-NEXT:    psrad $31, %xmm4
1843; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
1844; SSE41-NEXT:    movdqa %xmm4, %xmm6
1845; SSE41-NEXT:    psrlq $60, %xmm6
1846; SSE41-NEXT:    psrlq $61, %xmm4
1847; SSE41-NEXT:    pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm6[4,5,6,7]
1848; SSE41-NEXT:    paddq %xmm3, %xmm4
1849; SSE41-NEXT:    movdqa %xmm4, %xmm3
1850; SSE41-NEXT:    psrlq $4, %xmm3
1851; SSE41-NEXT:    psrlq $3, %xmm4
1852; SSE41-NEXT:    pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm3[4,5,6,7]
1853; SSE41-NEXT:    pxor %xmm5, %xmm4
1854; SSE41-NEXT:    psubq %xmm5, %xmm4
1855; SSE41-NEXT:    movdqa %xmm4, %xmm3
1856; SSE41-NEXT:    retq
1857;
1858; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v8i64:
1859; AVX1:       # %bb.0:
1860; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
1861; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
1862; AVX1-NEXT:    vpcmpgtq %xmm3, %xmm2, %xmm4
1863; AVX1-NEXT:    vpsrlq $60, %xmm4, %xmm5
1864; AVX1-NEXT:    vpsrlq $61, %xmm4, %xmm4
1865; AVX1-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm5[4,5,6,7]
1866; AVX1-NEXT:    vpaddq %xmm4, %xmm3, %xmm3
1867; AVX1-NEXT:    vpsrlq $4, %xmm3, %xmm4
1868; AVX1-NEXT:    vpsrlq $3, %xmm3, %xmm3
1869; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4,5,6,7]
1870; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [1152921504606846976,576460752303423488]
1871; AVX1-NEXT:    vpxor %xmm4, %xmm3, %xmm3
1872; AVX1-NEXT:    vpsubq %xmm4, %xmm3, %xmm3
1873; AVX1-NEXT:    vpcmpgtq %xmm0, %xmm2, %xmm5
1874; AVX1-NEXT:    vpsrlq $62, %xmm5, %xmm5
1875; AVX1-NEXT:    vpaddq %xmm5, %xmm0, %xmm5
1876; AVX1-NEXT:    vpsrad $2, %xmm5, %xmm6
1877; AVX1-NEXT:    vpsrlq $2, %xmm5, %xmm5
1878; AVX1-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3],xmm5[4,5],xmm6[6,7]
1879; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm5, %ymm3
1880; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3,4,5,6,7]
1881; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
1882; AVX1-NEXT:    vpcmpgtq %xmm3, %xmm2, %xmm5
1883; AVX1-NEXT:    vpsrlq $60, %xmm5, %xmm6
1884; AVX1-NEXT:    vpsrlq $61, %xmm5, %xmm5
1885; AVX1-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm6[4,5,6,7]
1886; AVX1-NEXT:    vpaddq %xmm5, %xmm3, %xmm3
1887; AVX1-NEXT:    vpsrlq $4, %xmm3, %xmm5
1888; AVX1-NEXT:    vpsrlq $3, %xmm3, %xmm3
1889; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm5[4,5,6,7]
1890; AVX1-NEXT:    vpxor %xmm4, %xmm3, %xmm3
1891; AVX1-NEXT:    vpsubq %xmm4, %xmm3, %xmm3
1892; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm2, %xmm2
1893; AVX1-NEXT:    vpsrlq $62, %xmm2, %xmm2
1894; AVX1-NEXT:    vpaddq %xmm2, %xmm1, %xmm2
1895; AVX1-NEXT:    vpsrad $2, %xmm2, %xmm4
1896; AVX1-NEXT:    vpsrlq $2, %xmm2, %xmm2
1897; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7]
1898; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
1899; AVX1-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7]
1900; AVX1-NEXT:    retq
1901;
1902; AVX2-LABEL: combine_vec_sdiv_by_pow2b_v8i64:
1903; AVX2:       # %bb.0:
1904; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
1905; AVX2-NEXT:    vpcmpgtq %ymm0, %ymm2, %ymm3
1906; AVX2-NEXT:    vmovdqa {{.*#+}} ymm4 = <u,62,61,60>
1907; AVX2-NEXT:    vpsrlvq %ymm4, %ymm3, %ymm3
1908; AVX2-NEXT:    vpaddq %ymm3, %ymm0, %ymm3
1909; AVX2-NEXT:    vmovdqa {{.*#+}} ymm5 = <u,2,3,4>
1910; AVX2-NEXT:    vpsrlvq %ymm5, %ymm3, %ymm3
1911; AVX2-NEXT:    vmovdqa {{.*#+}} ymm6 = <u,2305843009213693952,1152921504606846976,576460752303423488>
1912; AVX2-NEXT:    vpxor %ymm6, %ymm3, %ymm3
1913; AVX2-NEXT:    vpsubq %ymm6, %ymm3, %ymm3
1914; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3,4,5,6,7]
1915; AVX2-NEXT:    vpcmpgtq %ymm1, %ymm2, %ymm2
1916; AVX2-NEXT:    vpsrlvq %ymm4, %ymm2, %ymm2
1917; AVX2-NEXT:    vpaddq %ymm2, %ymm1, %ymm2
1918; AVX2-NEXT:    vpsrlvq %ymm5, %ymm2, %ymm2
1919; AVX2-NEXT:    vpxor %ymm6, %ymm2, %ymm2
1920; AVX2-NEXT:    vpsubq %ymm6, %ymm2, %ymm2
1921; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7]
1922; AVX2-NEXT:    retq
1923;
1924; AVX512F-LABEL: combine_vec_sdiv_by_pow2b_v8i64:
1925; AVX512F:       # %bb.0:
1926; AVX512F-NEXT:    vpsraq $63, %zmm0, %zmm1
1927; AVX512F-NEXT:    vpsrlvq {{.*}}(%rip), %zmm1, %zmm1
1928; AVX512F-NEXT:    vpaddq %zmm1, %zmm0, %zmm1
1929; AVX512F-NEXT:    vpsravq {{.*}}(%rip), %zmm1, %zmm1
1930; AVX512F-NEXT:    movb $17, %al
1931; AVX512F-NEXT:    kmovw %eax, %k1
1932; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
1933; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm0
1934; AVX512F-NEXT:    retq
1935;
1936; AVX512BW-LABEL: combine_vec_sdiv_by_pow2b_v8i64:
1937; AVX512BW:       # %bb.0:
1938; AVX512BW-NEXT:    vpsraq $63, %zmm0, %zmm1
1939; AVX512BW-NEXT:    vpsrlvq {{.*}}(%rip), %zmm1, %zmm1
1940; AVX512BW-NEXT:    vpaddq %zmm1, %zmm0, %zmm1
1941; AVX512BW-NEXT:    vpsravq {{.*}}(%rip), %zmm1, %zmm1
1942; AVX512BW-NEXT:    movb $17, %al
1943; AVX512BW-NEXT:    kmovd %eax, %k1
1944; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
1945; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm0
1946; AVX512BW-NEXT:    retq
1947;
1948; XOP-LABEL: combine_vec_sdiv_by_pow2b_v8i64:
1949; XOP:       # %bb.0:
1950; XOP-NEXT:    vextractf128 $1, %ymm0, %xmm2
1951; XOP-NEXT:    vmovdqa {{.*#+}} xmm3 = [18446744073709551553,18446744073709551553]
1952; XOP-NEXT:    vpshaq %xmm3, %xmm2, %xmm4
1953; XOP-NEXT:    vmovdqa {{.*#+}} xmm5 = [18446744073709551555,18446744073709551556]
1954; XOP-NEXT:    vpshlq %xmm5, %xmm4, %xmm4
1955; XOP-NEXT:    vpaddq %xmm4, %xmm2, %xmm2
1956; XOP-NEXT:    vmovdqa {{.*#+}} xmm4 = [18446744073709551613,18446744073709551612]
1957; XOP-NEXT:    vpshaq %xmm4, %xmm2, %xmm2
1958; XOP-NEXT:    vpshaq %xmm3, %xmm0, %xmm6
1959; XOP-NEXT:    vpsrlq $62, %xmm6, %xmm6
1960; XOP-NEXT:    vpaddq %xmm6, %xmm0, %xmm6
1961; XOP-NEXT:    vmovdqa {{.*#+}} xmm7 = <u,18446744073709551614>
1962; XOP-NEXT:    vpshaq %xmm7, %xmm6, %xmm6
1963; XOP-NEXT:    vinsertf128 $1, %xmm2, %ymm6, %ymm2
1964; XOP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5,6,7]
1965; XOP-NEXT:    vextractf128 $1, %ymm1, %xmm2
1966; XOP-NEXT:    vpshaq %xmm3, %xmm2, %xmm6
1967; XOP-NEXT:    vpshlq %xmm5, %xmm6, %xmm5
1968; XOP-NEXT:    vpaddq %xmm5, %xmm2, %xmm2
1969; XOP-NEXT:    vpshaq %xmm4, %xmm2, %xmm2
1970; XOP-NEXT:    vpshaq %xmm3, %xmm1, %xmm3
1971; XOP-NEXT:    vpsrlq $62, %xmm3, %xmm3
1972; XOP-NEXT:    vpaddq %xmm3, %xmm1, %xmm3
1973; XOP-NEXT:    vpshaq %xmm7, %xmm3, %xmm3
1974; XOP-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
1975; XOP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7]
1976; XOP-NEXT:    retq
1977  %1 = sdiv <8 x i64> %x, <i64 1, i64 4, i64 8, i64 16, i64 1, i64 4, i64 8, i64 16>
1978  ret <8 x i64> %1
1979}
1980
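; Negative power-of-2 divisors: divide by the magnitude as above, then negate and
; blend in the lanes whose divisor is negative.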
1981define <4 x i32> @combine_vec_sdiv_by_pow2b_PosAndNeg(<4 x i32> %x) {
1982; SSE2-LABEL: combine_vec_sdiv_by_pow2b_PosAndNeg:
1983; SSE2:       # %bb.0:
1984; SSE2-NEXT:    movdqa %xmm0, %xmm1
1985; SSE2-NEXT:    psrad $31, %xmm0
1986; SSE2-NEXT:    movdqa %xmm0, %xmm2
1987; SSE2-NEXT:    psrld $28, %xmm2
1988; SSE2-NEXT:    movdqa %xmm0, %xmm3
1989; SSE2-NEXT:    psrld $29, %xmm3
1990; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1]
1991; SSE2-NEXT:    psrld $30, %xmm0
1992; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0,3]
1993; SSE2-NEXT:    paddd %xmm1, %xmm0
1994; SSE2-NEXT:    movdqa %xmm0, %xmm2
1995; SSE2-NEXT:    psrad $4, %xmm2
1996; SSE2-NEXT:    movdqa %xmm0, %xmm3
1997; SSE2-NEXT:    psrad $3, %xmm3
1998; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1]
1999; SSE2-NEXT:    psrad $2, %xmm0
2000; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0,3]
2001; SSE2-NEXT:    pxor %xmm2, %xmm2
2002; SSE2-NEXT:    psubd %xmm0, %xmm2
2003; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
2004; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
2005; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
2006; SSE2-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
2007; SSE2-NEXT:    retq
2008;
2009; SSE41-LABEL: combine_vec_sdiv_by_pow2b_PosAndNeg:
2010; SSE41:       # %bb.0:
2011; SSE41-NEXT:    movdqa %xmm0, %xmm1
2012; SSE41-NEXT:    psrad $31, %xmm1
2013; SSE41-NEXT:    movdqa %xmm1, %xmm2
2014; SSE41-NEXT:    psrld $28, %xmm2
2015; SSE41-NEXT:    movdqa %xmm1, %xmm3
2016; SSE41-NEXT:    psrld $30, %xmm3
2017; SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7]
2018; SSE41-NEXT:    psrld $29, %xmm1
2019; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
2020; SSE41-NEXT:    paddd %xmm0, %xmm1
2021; SSE41-NEXT:    movdqa %xmm1, %xmm2
2022; SSE41-NEXT:    psrad $4, %xmm2
2023; SSE41-NEXT:    movdqa %xmm1, %xmm3
2024; SSE41-NEXT:    psrad $2, %xmm3
2025; SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7]
2026; SSE41-NEXT:    pxor %xmm2, %xmm2
2027; SSE41-NEXT:    psubd %xmm3, %xmm2
2028; SSE41-NEXT:    psrad $3, %xmm1
2029; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7]
2030; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
2031; SSE41-NEXT:    movdqa %xmm1, %xmm0
2032; SSE41-NEXT:    retq
2033;
2034; AVX1-LABEL: combine_vec_sdiv_by_pow2b_PosAndNeg:
2035; AVX1:       # %bb.0:
2036; AVX1-NEXT:    vpsrad $31, %xmm0, %xmm1
2037; AVX1-NEXT:    vpsrld $28, %xmm1, %xmm2
2038; AVX1-NEXT:    vpsrld $30, %xmm1, %xmm3
2039; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
2040; AVX1-NEXT:    vpsrld $29, %xmm1, %xmm1
2041; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
2042; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
2043; AVX1-NEXT:    vpsrad $4, %xmm1, %xmm2
2044; AVX1-NEXT:    vpsrad $2, %xmm1, %xmm3
2045; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
2046; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
2047; AVX1-NEXT:    vpsubd %xmm2, %xmm3, %xmm2
2048; AVX1-NEXT:    vpsrad $3, %xmm1, %xmm1
2049; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
2050; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
2051; AVX1-NEXT:    retq
2052;
2053; AVX2ORLATER-LABEL: combine_vec_sdiv_by_pow2b_PosAndNeg:
2054; AVX2ORLATER:       # %bb.0:
2055; AVX2ORLATER-NEXT:    vpsrad $31, %xmm0, %xmm1
2056; AVX2ORLATER-NEXT:    vpsrlvd {{.*}}(%rip), %xmm1, %xmm1
2057; AVX2ORLATER-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
2058; AVX2ORLATER-NEXT:    vpsravd {{.*}}(%rip), %xmm1, %xmm1
2059; AVX2ORLATER-NEXT:    vpxor %xmm2, %xmm2, %xmm2
2060; AVX2ORLATER-NEXT:    vpsubd %xmm1, %xmm2, %xmm2
2061; AVX2ORLATER-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
2062; AVX2ORLATER-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
2063; AVX2ORLATER-NEXT:    retq
2064;
2065; XOP-LABEL: combine_vec_sdiv_by_pow2b_PosAndNeg:
2066; XOP:       # %bb.0:
2067; XOP-NEXT:    vpsrad $31, %xmm0, %xmm1
2068; XOP-NEXT:    vpshld {{.*}}(%rip), %xmm1, %xmm1
2069; XOP-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
2070; XOP-NEXT:    vpshad {{.*}}(%rip), %xmm1, %xmm1
2071; XOP-NEXT:    vpxor %xmm2, %xmm2, %xmm2
2072; XOP-NEXT:    vpsubd %xmm1, %xmm2, %xmm2
2073; XOP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
2074; XOP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
2075; XOP-NEXT:    retq
2076  %1 = sdiv <4 x i32> %x, <i32 1, i32 -4, i32 8, i32 -16>
2077  ret <4 x i32> %1
2078}
2079
2080define <4 x i32> @combine_vec_sdiv_by_pow2b_undef1(<4 x i32> %x) {
2081; CHECK-LABEL: combine_vec_sdiv_by_pow2b_undef1:
2082; CHECK:       # %bb.0:
2083; CHECK-NEXT:    retq
2084  %1 = sdiv <4 x i32> %x, <i32 undef, i32 -4, i32 undef, i32 -16>
2085  ret <4 x i32> %1
2086}
2087
2088define <4 x i32> @combine_vec_sdiv_by_pow2b_undef2(<4 x i32> %x) {
2089; CHECK-LABEL: combine_vec_sdiv_by_pow2b_undef2:
2090; CHECK:       # %bb.0:
2091; CHECK-NEXT:    retq
2092  %1 = sdiv <4 x i32> %x, <i32 undef, i32 4, i32 undef, i32 16>
2093  ret <4 x i32> %1
2094}
2095
2096define <4 x i32> @combine_vec_sdiv_by_pow2b_undef3(<4 x i32> %x) {
2097; CHECK-LABEL: combine_vec_sdiv_by_pow2b_undef3:
2098; CHECK:       # %bb.0:
2099; CHECK-NEXT:    retq
2100  %1 = sdiv <4 x i32> %x, <i32 undef, i32 -4, i32 undef, i32 16>
2101  ret <4 x i32> %1
2102}
2103
2104; PR37119
2105define <16 x i8> @non_splat_minus_one_divisor_0(<16 x i8> %A) {
2106; SSE-LABEL: non_splat_minus_one_divisor_0:
2107; SSE:       # %bb.0:
2108; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [255,255,0,255,255,255,0,255,255,0,0,0,0,0,0,0]
2109; SSE-NEXT:    pxor %xmm1, %xmm0
2110; SSE-NEXT:    psubb %xmm1, %xmm0
2111; SSE-NEXT:    retq
2112;
2113; AVX1-LABEL: non_splat_minus_one_divisor_0:
2114; AVX1:       # %bb.0:
2115; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [255,255,0,255,255,255,0,255,255,0,0,0,0,0,0,0]
2116; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
2117; AVX1-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
2118; AVX1-NEXT:    retq
2119;
2120; AVX2-LABEL: non_splat_minus_one_divisor_0:
2121; AVX2:       # %bb.0:
2122; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [255,255,0,255,255,255,0,255,255,0,0,0,0,0,0,0]
2123; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
2124; AVX2-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
2125; AVX2-NEXT:    retq
2126;
2127; AVX512F-LABEL: non_splat_minus_one_divisor_0:
2128; AVX512F:       # %bb.0:
2129; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm1 = [255,255,0,255,255,255,0,255,255,0,0,0,0,0,0,0]
2130; AVX512F-NEXT:    vpxor %xmm1, %xmm0, %xmm0
2131; AVX512F-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
2132; AVX512F-NEXT:    retq
2133;
2134; AVX512BW-LABEL: non_splat_minus_one_divisor_0:
2135; AVX512BW:       # %bb.0:
2136; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
2137; AVX512BW-NEXT:    movw $443, %ax # imm = 0x1BB
2138; AVX512BW-NEXT:    kmovd %eax, %k1
2139; AVX512BW-NEXT:    vpsubb %xmm0, %xmm1, %xmm0 {%k1}
2140; AVX512BW-NEXT:    retq
2141;
2142; XOP-LABEL: non_splat_minus_one_divisor_0:
2143; XOP:       # %bb.0:
2144; XOP-NEXT:    vmovdqa {{.*#+}} xmm1 = [255,255,0,255,255,255,0,255,255,0,0,0,0,0,0,0]
2145; XOP-NEXT:    vpxor %xmm1, %xmm0, %xmm0
2146; XOP-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
2147; XOP-NEXT:    retq
2148  %div = sdiv <16 x i8> %A, <i8 -1, i8 -1, i8 1, i8 -1, i8 -1, i8 -1, i8 1, i8 -1, i8 -1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
2149  ret <16 x i8> %div
2150}
2151
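; Mixed -1, 2 and -128 divisors: the power-of-2 magnitudes are handled with shift
; sequences, then the lanes with negative divisors are negated via a final xor/psub.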
2152define <16 x i8> @non_splat_minus_one_divisor_1(<16 x i8> %A) {
2153; SSE2-LABEL: non_splat_minus_one_divisor_1:
2154; SSE2:       # %bb.0:
2155; SSE2-NEXT:    pxor %xmm1, %xmm1
2156; SSE2-NEXT:    pxor %xmm2, %xmm2
2157; SSE2-NEXT:    pcmpgtb %xmm0, %xmm2
2158; SSE2-NEXT:    movdqa %xmm2, %xmm3
2159; SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
2160; SSE2-NEXT:    pmullw {{.*}}(%rip), %xmm3
2161; SSE2-NEXT:    psrlw $8, %xmm3
2162; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
2163; SSE2-NEXT:    pmullw {{.*}}(%rip), %xmm2
2164; SSE2-NEXT:    psrlw $8, %xmm2
2165; SSE2-NEXT:    packuswb %xmm3, %xmm2
2166; SSE2-NEXT:    paddb %xmm0, %xmm2
2167; SSE2-NEXT:    movdqa %xmm2, %xmm1
2168; SSE2-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
2169; SSE2-NEXT:    psraw $8, %xmm1
2170; SSE2-NEXT:    pmullw {{.*}}(%rip), %xmm1
2171; SSE2-NEXT:    psrlw $8, %xmm1
2172; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2173; SSE2-NEXT:    psraw $8, %xmm2
2174; SSE2-NEXT:    psllw $7, %xmm2
2175; SSE2-NEXT:    psrlw $8, %xmm2
2176; SSE2-NEXT:    packuswb %xmm1, %xmm2
2177; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [0,0,255,0,0,0,255,0,0,255,255,255,255,255,255,255]
2178; SSE2-NEXT:    pand %xmm1, %xmm2
2179; SSE2-NEXT:    pandn %xmm0, %xmm1
2180; SSE2-NEXT:    por %xmm2, %xmm1
2181; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [255,255,0,255,255,255,0,255,255,0,0,0,0,255,0,255]
2182; SSE2-NEXT:    pxor %xmm0, %xmm1
2183; SSE2-NEXT:    psubb %xmm0, %xmm1
2184; SSE2-NEXT:    movdqa %xmm1, %xmm0
2185; SSE2-NEXT:    retq
2186;
2187; SSE41-LABEL: non_splat_minus_one_divisor_1:
2188; SSE41:       # %bb.0:
2189; SSE41-NEXT:    movdqa %xmm0, %xmm1
2190; SSE41-NEXT:    pxor %xmm0, %xmm0
2191; SSE41-NEXT:    pxor %xmm3, %xmm3
2192; SSE41-NEXT:    pcmpgtb %xmm1, %xmm3
2193; SSE41-NEXT:    pxor %xmm4, %xmm4
2194; SSE41-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
2195; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
2196; SSE41-NEXT:    psllw $1, %xmm2
2197; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm4[0,1],xmm2[2],xmm4[3,4,5],xmm2[6],xmm4[7]
2198; SSE41-NEXT:    psrlw $8, %xmm2
2199; SSE41-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
2200; SSE41-NEXT:    pmullw {{.*}}(%rip), %xmm3
2201; SSE41-NEXT:    psrlw $8, %xmm3
2202; SSE41-NEXT:    packuswb %xmm3, %xmm2
2203; SSE41-NEXT:    paddb %xmm1, %xmm2
2204; SSE41-NEXT:    movdqa %xmm2, %xmm0
2205; SSE41-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
2206; SSE41-NEXT:    psraw $8, %xmm0
2207; SSE41-NEXT:    movdqa %xmm0, %xmm3
2208; SSE41-NEXT:    psllw $1, %xmm3
2209; SSE41-NEXT:    psllw $7, %xmm0
2210; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm3[5],xmm0[6],xmm3[7]
2211; SSE41-NEXT:    psrlw $8, %xmm0
2212; SSE41-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2213; SSE41-NEXT:    psraw $8, %xmm2
2214; SSE41-NEXT:    psllw $7, %xmm2
2215; SSE41-NEXT:    psrlw $8, %xmm2
2216; SSE41-NEXT:    packuswb %xmm0, %xmm2
2217; SSE41-NEXT:    movaps {{.*#+}} xmm0 = [0,0,255,0,0,0,255,0,0,255,255,255,255,255,255,255]
2218; SSE41-NEXT:    pblendvb %xmm0, %xmm2, %xmm1
2219; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [255,255,0,255,255,255,0,255,255,0,0,0,0,255,0,255]
2220; SSE41-NEXT:    pxor %xmm0, %xmm1
2221; SSE41-NEXT:    psubb %xmm0, %xmm1
2222; SSE41-NEXT:    movdqa %xmm1, %xmm0
2223; SSE41-NEXT:    retq
2224;
2225; AVX1-LABEL: non_splat_minus_one_divisor_1:
2226; AVX1:       # %bb.0:
2227; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
2228; AVX1-NEXT:    vpcmpgtb %xmm0, %xmm1, %xmm2
2229; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm3 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
2230; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
2231; AVX1-NEXT:    vpsllw $1, %xmm4, %xmm4
2232; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2],xmm3[3,4,5],xmm4[6],xmm3[7]
2233; AVX1-NEXT:    vpsrlw $8, %xmm3, %xmm3
2234; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
2235; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm1, %xmm1
2236; AVX1-NEXT:    vpsrlw $8, %xmm1, %xmm1
2237; AVX1-NEXT:    vpackuswb %xmm1, %xmm3, %xmm1
2238; AVX1-NEXT:    vpaddb %xmm1, %xmm0, %xmm1
2239; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
2240; AVX1-NEXT:    vpsraw $8, %xmm2, %xmm2
2241; AVX1-NEXT:    vpsllw $1, %xmm2, %xmm3
2242; AVX1-NEXT:    vpsllw $7, %xmm2, %xmm2
2243; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm3[5],xmm2[6],xmm3[7]
2244; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
2245; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2246; AVX1-NEXT:    vpsraw $8, %xmm1, %xmm1
2247; AVX1-NEXT:    vpsllw $7, %xmm1, %xmm1
2248; AVX1-NEXT:    vpsrlw $8, %xmm1, %xmm1
2249; AVX1-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
2250; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,0,255,0,0,0,255,0,0,255,255,255,255,255,255,255]
2251; AVX1-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
2252; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [255,255,0,255,255,255,0,255,255,0,0,0,0,255,0,255]
2253; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
2254; AVX1-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
2255; AVX1-NEXT:    retq
2256;
2257; AVX2-LABEL: non_splat_minus_one_divisor_1:
2258; AVX2:       # %bb.0:
2259; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
2260; AVX2-NEXT:    vpcmpgtb %xmm0, %xmm1, %xmm1
2261; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
2262; AVX2-NEXT:    vpmullw {{.*}}(%rip), %ymm1, %ymm1
2263; AVX2-NEXT:    vpsrlw $8, %ymm1, %ymm1
2264; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
2265; AVX2-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
2266; AVX2-NEXT:    vpaddb %xmm1, %xmm0, %xmm1
2267; AVX2-NEXT:    vpmovsxbw %xmm1, %ymm1
2268; AVX2-NEXT:    vpmullw {{.*}}(%rip), %ymm1, %ymm1
2269; AVX2-NEXT:    vpsrlw $8, %ymm1, %ymm1
2270; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
2271; AVX2-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
2272; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,0,255,0,0,0,255,0,0,255,255,255,255,255,255,255]
2273; AVX2-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
2274; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [255,255,0,255,255,255,0,255,255,0,0,0,0,255,0,255]
2275; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
2276; AVX2-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
2277; AVX2-NEXT:    vzeroupper
2278; AVX2-NEXT:    retq
2279;
2280; AVX512F-LABEL: non_splat_minus_one_divisor_1:
2281; AVX512F:       # %bb.0:
2282; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
2283; AVX512F-NEXT:    vpcmpgtb %xmm0, %xmm1, %xmm1
2284; AVX512F-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
2285; AVX512F-NEXT:    vpsrlvd {{.*}}(%rip), %zmm1, %zmm1
2286; AVX512F-NEXT:    vpmovdb %zmm1, %xmm1
2287; AVX512F-NEXT:    vpaddb %xmm1, %xmm0, %xmm1
2288; AVX512F-NEXT:    vpmovsxbd %xmm1, %zmm1
2289; AVX512F-NEXT:    vpsravd {{.*}}(%rip), %zmm1, %zmm1
2290; AVX512F-NEXT:    vpmovdb %zmm1, %xmm1
2291; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,0,255,0,0,0,255,0,0,255,255,255,255,255,255,255]
2292; AVX512F-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
2293; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm1 = [255,255,0,255,255,255,0,255,255,0,0,0,0,255,0,255]
2294; AVX512F-NEXT:    vpxor %xmm1, %xmm0, %xmm0
2295; AVX512F-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
2296; AVX512F-NEXT:    vzeroupper
2297; AVX512F-NEXT:    retq
2298;
2299; AVX512BW-LABEL: non_splat_minus_one_divisor_1:
2300; AVX512BW:       # %bb.0:
2301; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
2302; AVX512BW-NEXT:    vpcmpgtb %xmm0, %xmm1, %xmm2
2303; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
2304; AVX512BW-NEXT:    vpsrlvw {{.*}}(%rip), %ymm2, %ymm2
2305; AVX512BW-NEXT:    vpmovwb %ymm2, %xmm2
2306; AVX512BW-NEXT:    vpaddb %xmm2, %xmm0, %xmm2
2307; AVX512BW-NEXT:    vpmovsxbw %xmm2, %ymm2
2308; AVX512BW-NEXT:    vpsravw {{.*}}(%rip), %ymm2, %ymm2
2309; AVX512BW-NEXT:    vpmovwb %ymm2, %xmm2
2310; AVX512BW-NEXT:    movw $443, %ax # imm = 0x1BB
2311; AVX512BW-NEXT:    kmovd %eax, %k1
2312; AVX512BW-NEXT:    vmovdqu8 %xmm0, %xmm2 {%k1}
2313; AVX512BW-NEXT:    vpsubb %xmm2, %xmm1, %xmm0
2314; AVX512BW-NEXT:    movw $24132, %ax # imm = 0x5E44
2315; AVX512BW-NEXT:    kmovd %eax, %k1
2316; AVX512BW-NEXT:    vmovdqu8 %xmm2, %xmm0 {%k1}
2317; AVX512BW-NEXT:    vzeroupper
2318; AVX512BW-NEXT:    retq
2319;
2320; XOP-LABEL: non_splat_minus_one_divisor_1:
2321; XOP:       # %bb.0:
2322; XOP-NEXT:    vpxor %xmm1, %xmm1, %xmm1
2323; XOP-NEXT:    vpcmpgtb %xmm0, %xmm1, %xmm1
2324; XOP-NEXT:    vpshlb {{.*}}(%rip), %xmm1, %xmm1
2325; XOP-NEXT:    vpaddb %xmm1, %xmm0, %xmm1
2326; XOP-NEXT:    vpshab {{.*}}(%rip), %xmm1, %xmm1
2327; XOP-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,0,255,0,0,0,255,0,0,255,255,255,255,255,255,255]
2328; XOP-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
2329; XOP-NEXT:    vmovdqa {{.*#+}} xmm1 = [255,255,0,255,255,255,0,255,255,0,0,0,0,255,0,255]
2330; XOP-NEXT:    vpxor %xmm1, %xmm0, %xmm0
2331; XOP-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
2332; XOP-NEXT:    retq
2333  %div = sdiv <16 x i8> %A, <i8 -1, i8 -1, i8 2, i8 -1, i8 -1, i8 -1, i8 2, i8 -1, i8 -1, i8 2, i8 2, i8 2, i8 2, i8 -128, i8 2, i8 -128>
2334  ret <16 x i8> %div
2335}
2336
2337define <4 x i32> @non_splat_minus_one_divisor_2(<4 x i32> %A) {
2338; SSE2-LABEL: non_splat_minus_one_divisor_2:
2339; SSE2:       # %bb.0:
2340; SSE2-NEXT:    movdqa %xmm0, %xmm1
2341; SSE2-NEXT:    psrld $31, %xmm1
2342; SSE2-NEXT:    paddd %xmm0, %xmm1
2343; SSE2-NEXT:    psrad $1, %xmm1
2344; SSE2-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
2345; SSE2-NEXT:    pxor %xmm0, %xmm0
2346; SSE2-NEXT:    psubd %xmm1, %xmm0
2347; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,3],xmm1[1,2]
2348; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,3,1]
2349; SSE2-NEXT:    retq
2350;
2351; SSE41-LABEL: non_splat_minus_one_divisor_2:
2352; SSE41:       # %bb.0:
2353; SSE41-NEXT:    movdqa %xmm0, %xmm1
2354; SSE41-NEXT:    psrld $31, %xmm1
2355; SSE41-NEXT:    paddd %xmm0, %xmm1
2356; SSE41-NEXT:    psrad $1, %xmm1
2357; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7]
2358; SSE41-NEXT:    pxor %xmm0, %xmm0
2359; SSE41-NEXT:    psubd %xmm1, %xmm0
2360; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5],xmm0[6,7]
2361; SSE41-NEXT:    movdqa %xmm1, %xmm0
2362; SSE41-NEXT:    retq
2363;
2364; AVX1-LABEL: non_splat_minus_one_divisor_2:
2365; AVX1:       # %bb.0:
2366; AVX1-NEXT:    vpsrld $31, %xmm0, %xmm1
2367; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
2368; AVX1-NEXT:    vpsrad $1, %xmm1, %xmm1
2369; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
2370; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
2371; AVX1-NEXT:    vpsubd %xmm0, %xmm1, %xmm1
2372; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7]
2373; AVX1-NEXT:    retq
2374;
2375; AVX2ORLATER-LABEL: non_splat_minus_one_divisor_2:
2376; AVX2ORLATER:       # %bb.0:
2377; AVX2ORLATER-NEXT:    vpsrld $31, %xmm0, %xmm1
2378; AVX2ORLATER-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
2379; AVX2ORLATER-NEXT:    vpsrad $1, %xmm1, %xmm1
2380; AVX2ORLATER-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
2381; AVX2ORLATER-NEXT:    vpxor %xmm1, %xmm1, %xmm1
2382; AVX2ORLATER-NEXT:    vpsubd %xmm0, %xmm1, %xmm1
2383; AVX2ORLATER-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3]
2384; AVX2ORLATER-NEXT:    retq
2385;
2386; XOP-LABEL: non_splat_minus_one_divisor_2:
2387; XOP:       # %bb.0:
2388; XOP-NEXT:    vpsrld $31, %xmm0, %xmm1
2389; XOP-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
2390; XOP-NEXT:    vpsrad $1, %xmm1, %xmm1
2391; XOP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
2392; XOP-NEXT:    vpxor %xmm1, %xmm1, %xmm1
2393; XOP-NEXT:    vpsubd %xmm0, %xmm1, %xmm1
2394; XOP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7]
2395; XOP-NEXT:    retq
2396  %div = sdiv <4 x i32> %A, <i32 -1, i32 1, i32 2, i32 -2>
2397  ret <4 x i32> %div
2398}
2399
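; Non-uniform constant divisors: lowered via signed multiply-high (pmulhw) with
; per-lane magic constants, followed by shift and sign-bit (psrlw $15) fixups.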
define <8 x i16> @combine_vec_sdiv_nonuniform(<8 x i16> %x) {
; SSE-LABEL: combine_vec_sdiv_nonuniform:
; SSE:       # %bb.0:
; SSE-NEXT:    pmulhw {{.*}}(%rip), %xmm0
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    psrlw $15, %xmm1
; SSE-NEXT:    paddw %xmm0, %xmm1
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_sdiv_nonuniform:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmulhw {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    vpsrlw $15, %xmm0, %xmm1
; AVX-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = sdiv <8 x i16> %x, <i16 3, i16 3, i16 3, i16 3, i16 22, i16 22, i16 22, i16 22>
  ret <8 x i16> %1
}

define <8 x i16> @combine_vec_sdiv_nonuniform2(<8 x i16> %x) {
; SSE2-LABEL: combine_vec_sdiv_nonuniform2:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pmulhw {{.*}}(%rip), %xmm0
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psraw $2, %xmm1
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    psraw $1, %xmm2
; SSE2-NEXT:    movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
; SSE2-NEXT:    psrlw $15, %xmm0
; SSE2-NEXT:    paddw %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: combine_vec_sdiv_nonuniform2:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pmulhw {{.*}}(%rip), %xmm0
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    psraw $1, %xmm1
; SSE41-NEXT:    movdqa %xmm0, %xmm2
; SSE41-NEXT:    psraw $2, %xmm2
; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT:    psrlw $15, %xmm0
; SSE41-NEXT:    paddw %xmm2, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: combine_vec_sdiv_nonuniform2:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpmulhw {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vpsraw $1, %xmm0, %xmm1
; AVX1-NEXT:    vpsraw $2, %xmm0, %xmm2
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT:    vpsrlw $15, %xmm0, %xmm0
; AVX1-NEXT:    vpaddw %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_vec_sdiv_nonuniform2:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpmulhw {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT:    vpsraw $1, %xmm0, %xmm1
; AVX2-NEXT:    vpsraw $2, %xmm0, %xmm2
; AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
; AVX2-NEXT:    vpsrlw $15, %xmm0, %xmm0
; AVX2-NEXT:    vpaddw %xmm0, %xmm1, %xmm0
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: combine_vec_sdiv_nonuniform2:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpmulhw {{.*}}(%rip), %xmm0, %xmm0
; AVX512F-NEXT:    vpsraw $1, %xmm0, %xmm1
; AVX512F-NEXT:    vpsraw $2, %xmm0, %xmm2
; AVX512F-NEXT:    vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
; AVX512F-NEXT:    vpsrlw $15, %xmm0, %xmm0
; AVX512F-NEXT:    vpaddw %xmm0, %xmm1, %xmm0
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: combine_vec_sdiv_nonuniform2:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpmulhw {{.*}}(%rip), %xmm0, %xmm0
; AVX512BW-NEXT:    vpsrlw $15, %xmm0, %xmm1
; AVX512BW-NEXT:    vpsravw {{.*}}(%rip), %xmm0, %xmm0
; AVX512BW-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT:    retq
;
; XOP-LABEL: combine_vec_sdiv_nonuniform2:
; XOP:       # %bb.0:
; XOP-NEXT:    vpmulhw {{.*}}(%rip), %xmm0, %xmm0
; XOP-NEXT:    vpsrlw $15, %xmm0, %xmm1
; XOP-NEXT:    vpshaw {{.*}}(%rip), %xmm0, %xmm0
; XOP-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; XOP-NEXT:    retq
  %1 = sdiv <8 x i16> %x, <i16 24, i16 24, i16 24, i16 24, i16 25, i16 25, i16 25, i16 25>
  ret <8 x i16> %1
}

define <8 x i16> @combine_vec_sdiv_nonuniform3(<8 x i16> %x) {
; SSE2-LABEL: combine_vec_sdiv_nonuniform3:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [45591,45591,45591,45591,32833,32833,32833,32833]
; SSE2-NEXT:    pmulhw %xmm0, %xmm1
; SSE2-NEXT:    paddw %xmm0, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    psraw $4, %xmm0
; SSE2-NEXT:    movdqa %xmm1, %xmm2
; SSE2-NEXT:    psraw $8, %xmm2
; SSE2-NEXT:    movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
; SSE2-NEXT:    psrlw $15, %xmm1
; SSE2-NEXT:    paddw %xmm2, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: combine_vec_sdiv_nonuniform3:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [45591,45591,45591,45591,32833,32833,32833,32833]
; SSE41-NEXT:    pmulhw %xmm0, %xmm1
; SSE41-NEXT:    paddw %xmm0, %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm0
; SSE41-NEXT:    psraw $8, %xmm0
; SSE41-NEXT:    movdqa %xmm1, %xmm2
; SSE41-NEXT:    psraw $4, %xmm2
; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4,5,6,7]
; SSE41-NEXT:    psrlw $15, %xmm1
; SSE41-NEXT:    paddw %xmm2, %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: combine_vec_sdiv_nonuniform3:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpmulhw {{.*}}(%rip), %xmm0, %xmm1
; AVX1-NEXT:    vpaddw %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    vpsraw $8, %xmm0, %xmm1
; AVX1-NEXT:    vpsraw $4, %xmm0, %xmm2
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT:    vpsrlw $15, %xmm0, %xmm0
; AVX1-NEXT:    vpaddw %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_vec_sdiv_nonuniform3:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpmulhw {{.*}}(%rip), %xmm0, %xmm1
; AVX2-NEXT:    vpaddw %xmm0, %xmm1, %xmm0
; AVX2-NEXT:    vpsraw $8, %xmm0, %xmm1
; AVX2-NEXT:    vpsraw $4, %xmm0, %xmm2
; AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
; AVX2-NEXT:    vpsrlw $15, %xmm0, %xmm0
; AVX2-NEXT:    vpaddw %xmm0, %xmm1, %xmm0
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: combine_vec_sdiv_nonuniform3:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpmulhw {{.*}}(%rip), %xmm0, %xmm1
; AVX512F-NEXT:    vpaddw %xmm0, %xmm1, %xmm0
; AVX512F-NEXT:    vpsraw $8, %xmm0, %xmm1
; AVX512F-NEXT:    vpsraw $4, %xmm0, %xmm2
; AVX512F-NEXT:    vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
; AVX512F-NEXT:    vpsrlw $15, %xmm0, %xmm0
; AVX512F-NEXT:    vpaddw %xmm0, %xmm1, %xmm0
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: combine_vec_sdiv_nonuniform3:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpmulhw {{.*}}(%rip), %xmm0, %xmm1
; AVX512BW-NEXT:    vpaddw %xmm0, %xmm1, %xmm0
; AVX512BW-NEXT:    vpsrlw $15, %xmm0, %xmm1
; AVX512BW-NEXT:    vpsravw {{.*}}(%rip), %xmm0, %xmm0
; AVX512BW-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT:    retq
;
; XOP-LABEL: combine_vec_sdiv_nonuniform3:
; XOP:       # %bb.0:
; XOP-NEXT:    vpmulhw {{.*}}(%rip), %xmm0, %xmm1
; XOP-NEXT:    vpaddw %xmm0, %xmm1, %xmm0
; XOP-NEXT:    vpsrlw $15, %xmm0, %xmm1
; XOP-NEXT:    vpshaw {{.*}}(%rip), %xmm0, %xmm0
; XOP-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; XOP-NEXT:    retq
  %1 = sdiv <8 x i16> %x, <i16 23, i16 23, i16 23, i16 23, i16 511, i16 511, i16 511, i16 511>
  ret <8 x i16> %1
}

define <8 x i16> @combine_vec_sdiv_nonuniform4(<8 x i16> %x) {
; SSE2-LABEL: combine_vec_sdiv_nonuniform4:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [19945,19945,19945,19945,32639,32639,32639,32639]
; SSE2-NEXT:    pmulhw %xmm0, %xmm1
; SSE2-NEXT:    psubw %xmm0, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    psraw $4, %xmm0
; SSE2-NEXT:    movdqa %xmm1, %xmm2
; SSE2-NEXT:    psraw $8, %xmm2
; SSE2-NEXT:    movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
; SSE2-NEXT:    psrlw $15, %xmm1
; SSE2-NEXT:    paddw %xmm2, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: combine_vec_sdiv_nonuniform4:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [19945,19945,19945,19945,32639,32639,32639,32639]
; SSE41-NEXT:    pmulhw %xmm0, %xmm1
; SSE41-NEXT:    psubw %xmm0, %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm0
; SSE41-NEXT:    psraw $8, %xmm0
; SSE41-NEXT:    movdqa %xmm1, %xmm2
; SSE41-NEXT:    psraw $4, %xmm2
; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4,5,6,7]
; SSE41-NEXT:    psrlw $15, %xmm1
; SSE41-NEXT:    paddw %xmm2, %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: combine_vec_sdiv_nonuniform4:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpmulhw {{.*}}(%rip), %xmm0, %xmm1
; AVX1-NEXT:    vpsubw %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    vpsraw $8, %xmm0, %xmm1
; AVX1-NEXT:    vpsraw $4, %xmm0, %xmm2
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT:    vpsrlw $15, %xmm0, %xmm0
; AVX1-NEXT:    vpaddw %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_vec_sdiv_nonuniform4:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpmulhw {{.*}}(%rip), %xmm0, %xmm1
; AVX2-NEXT:    vpsubw %xmm0, %xmm1, %xmm0
; AVX2-NEXT:    vpsraw $8, %xmm0, %xmm1
; AVX2-NEXT:    vpsraw $4, %xmm0, %xmm2
; AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
; AVX2-NEXT:    vpsrlw $15, %xmm0, %xmm0
; AVX2-NEXT:    vpaddw %xmm0, %xmm1, %xmm0
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: combine_vec_sdiv_nonuniform4:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpmulhw {{.*}}(%rip), %xmm0, %xmm1
; AVX512F-NEXT:    vpsubw %xmm0, %xmm1, %xmm0
; AVX512F-NEXT:    vpsraw $8, %xmm0, %xmm1
; AVX512F-NEXT:    vpsraw $4, %xmm0, %xmm2
; AVX512F-NEXT:    vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
; AVX512F-NEXT:    vpsrlw $15, %xmm0, %xmm0
; AVX512F-NEXT:    vpaddw %xmm0, %xmm1, %xmm0
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: combine_vec_sdiv_nonuniform4:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpmulhw {{.*}}(%rip), %xmm0, %xmm1
; AVX512BW-NEXT:    vpsubw %xmm0, %xmm1, %xmm0
; AVX512BW-NEXT:    vpsrlw $15, %xmm0, %xmm1
; AVX512BW-NEXT:    vpsravw {{.*}}(%rip), %xmm0, %xmm0
; AVX512BW-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT:    retq
;
; XOP-LABEL: combine_vec_sdiv_nonuniform4:
; XOP:       # %bb.0:
; XOP-NEXT:    vpmulhw {{.*}}(%rip), %xmm0, %xmm1
; XOP-NEXT:    vpsubw %xmm0, %xmm1, %xmm0
; XOP-NEXT:    vpsrlw $15, %xmm0, %xmm1
; XOP-NEXT:    vpshaw {{.*}}(%rip), %xmm0, %xmm0
; XOP-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; XOP-NEXT:    retq
  %1 = sdiv <8 x i16> %x, <i16 -23, i16 -23, i16 -23, i16 -23, i16 -510, i16 -510, i16 -510, i16 -510>
  ret <8 x i16> %1
}

define <8 x i16> @combine_vec_sdiv_nonuniform5(<8 x i16> %x) {
; SSE2-LABEL: combine_vec_sdiv_nonuniform5:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [65535,0,65535,0,0,0,1,1]
; SSE2-NEXT:    pmullw %xmm0, %xmm1
; SSE2-NEXT:    pmulhw {{.*}}(%rip), %xmm0
; SSE2-NEXT:    paddw %xmm1, %xmm0
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,0]
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    pand %xmm1, %xmm2
; SSE2-NEXT:    movdqa %xmm0, %xmm3
; SSE2-NEXT:    psraw $8, %xmm3
; SSE2-NEXT:    pandn %xmm3, %xmm1
; SSE2-NEXT:    por %xmm2, %xmm1
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [65535,65535,0,65535,65535,65535,0,65535]
; SSE2-NEXT:    pand %xmm2, %xmm1
; SSE2-NEXT:    movdqa %xmm0, %xmm3
; SSE2-NEXT:    psraw $4, %xmm3
; SSE2-NEXT:    pandn %xmm3, %xmm2
; SSE2-NEXT:    por %xmm1, %xmm2
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [65535,0,65535,65535,65535,65535,0,65535]
; SSE2-NEXT:    movdqa %xmm2, %xmm3
; SSE2-NEXT:    pand %xmm1, %xmm3
; SSE2-NEXT:    psraw $2, %xmm2
; SSE2-NEXT:    pandn %xmm2, %xmm1
; SSE2-NEXT:    por %xmm3, %xmm1
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535,65535,0,0,65535]
; SSE2-NEXT:    movdqa %xmm1, %xmm3
; SSE2-NEXT:    pand %xmm2, %xmm3
; SSE2-NEXT:    psraw $1, %xmm1
; SSE2-NEXT:    pandn %xmm1, %xmm2
; SSE2-NEXT:    por %xmm3, %xmm2
; SSE2-NEXT:    psrlw $15, %xmm0
; SSE2-NEXT:    paddw %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: combine_vec_sdiv_nonuniform5:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [65535,0,65535,0,0,0,1,1]
; SSE41-NEXT:    pmullw %xmm0, %xmm1
; SSE41-NEXT:    pmulhw {{.*}}(%rip), %xmm0
; SSE41-NEXT:    paddw %xmm1, %xmm0
; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = <256,16384,4096,u,u,32768,512,256>
; SSE41-NEXT:    pmulhw %xmm0, %xmm1
; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7]
; SSE41-NEXT:    movdqa %xmm0, %xmm2
; SSE41-NEXT:    psraw $1, %xmm2
; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm1[0,1,2,3,4],xmm2[5],xmm1[6,7]
; SSE41-NEXT:    psrlw $15, %xmm0
; SSE41-NEXT:    paddw %xmm2, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: combine_vec_sdiv_nonuniform5:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm0, %xmm1
; AVX1-NEXT:    vpmulhw {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpmulhw {{.*}}(%rip), %xmm0, %xmm1
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7]
; AVX1-NEXT:    vpsraw $1, %xmm0, %xmm2
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5],xmm1[6,7]
; AVX1-NEXT:    vpsrlw $15, %xmm0, %xmm0
; AVX1-NEXT:    vpaddw %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_vec_sdiv_nonuniform5:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpmullw {{.*}}(%rip), %xmm0, %xmm1
; AVX2-NEXT:    vpmulhw {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpmulhw {{.*}}(%rip), %xmm0, %xmm1
; AVX2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7]
; AVX2-NEXT:    vpsraw $1, %xmm0, %xmm2
; AVX2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5],xmm1[6,7]
; AVX2-NEXT:    vpsrlw $15, %xmm0, %xmm0
; AVX2-NEXT:    vpaddw %xmm0, %xmm1, %xmm0
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: combine_vec_sdiv_nonuniform5:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpmullw {{.*}}(%rip), %xmm0, %xmm1
; AVX512F-NEXT:    vpmulhw {{.*}}(%rip), %xmm0, %xmm0
; AVX512F-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX512F-NEXT:    vpsrlw $15, %xmm0, %xmm1
; AVX512F-NEXT:    vpmovsxwd %xmm0, %ymm0
; AVX512F-NEXT:    vpsravd {{.*}}(%rip), %ymm0, %ymm0
; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512F-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: combine_vec_sdiv_nonuniform5:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpmullw {{.*}}(%rip), %xmm0, %xmm1
; AVX512BW-NEXT:    vpmulhw {{.*}}(%rip), %xmm0, %xmm0
; AVX512BW-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT:    vpsrlw $15, %xmm0, %xmm1
; AVX512BW-NEXT:    vpsravw {{.*}}(%rip), %xmm0, %xmm0
; AVX512BW-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT:    retq
;
; XOP-LABEL: combine_vec_sdiv_nonuniform5:
; XOP:       # %bb.0:
; XOP-NEXT:    vpmulhw {{.*}}(%rip), %xmm0, %xmm1
; XOP-NEXT:    vpmacsww %xmm1, {{.*}}(%rip), %xmm0, %xmm0
; XOP-NEXT:    vpsrlw $15, %xmm0, %xmm1
; XOP-NEXT:    vpshaw {{.*}}(%rip), %xmm0, %xmm0
; XOP-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; XOP-NEXT:    retq
  %1 = sdiv <8 x i16> %x, <i16 -510, i16 -24, i16 -23, i16 3, i16 22, i16 25, i16 255, i16 511>
  ret <8 x i16> %1
}

define <8 x i16> @combine_vec_sdiv_nonuniform6(<8 x i16> %x) {
; SSE2-LABEL: combine_vec_sdiv_nonuniform6:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,1,1,1,0]
; SSE2-NEXT:    pmullw %xmm0, %xmm1
; SSE2-NEXT:    pmulhw {{.*}}(%rip), %xmm0
; SSE2-NEXT:    paddw %xmm1, %xmm0
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [65535,65535,65535,0,0,0,65535,65535]
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    psraw $8, %xmm2
; SSE2-NEXT:    pand %xmm1, %xmm2
; SSE2-NEXT:    pandn %xmm0, %xmm1
; SSE2-NEXT:    por %xmm2, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm2
; SSE2-NEXT:    psraw $6, %xmm2
; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [0,65535,65535,65535,65535,0,65535,65535]
; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [0,65535,65535,65535,65535,0,65535,0]
; SSE2-NEXT:    pand %xmm4, %xmm1
; SSE2-NEXT:    movdqa %xmm0, %xmm5
; SSE2-NEXT:    psraw $12, %xmm5
; SSE2-NEXT:    pandn %xmm5, %xmm4
; SSE2-NEXT:    por %xmm1, %xmm4
; SSE2-NEXT:    pand %xmm3, %xmm4
; SSE2-NEXT:    pandn %xmm2, %xmm3
; SSE2-NEXT:    por %xmm4, %xmm3
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,65535,0,65535,0]
; SSE2-NEXT:    movdqa %xmm3, %xmm2
; SSE2-NEXT:    pand %xmm1, %xmm2
; SSE2-NEXT:    psraw $1, %xmm3
; SSE2-NEXT:    pandn %xmm3, %xmm1
; SSE2-NEXT:    por %xmm2, %xmm1
; SSE2-NEXT:    psrlw $15, %xmm0
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    paddw %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: combine_vec_sdiv_nonuniform6:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,1,1,1,0]
; SSE41-NEXT:    pmullw %xmm0, %xmm1
; SSE41-NEXT:    pmulhw {{.*}}(%rip), %xmm0
; SSE41-NEXT:    paddw %xmm1, %xmm0
; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = <4,256,256,u,u,512,256,8>
; SSE41-NEXT:    pmulhw %xmm0, %xmm2
; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm0[3,4],xmm2[5,6,7]
; SSE41-NEXT:    psrlw $15, %xmm0
; SSE41-NEXT:    pxor %xmm1, %xmm1
; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3,4],xmm0[5,6,7]
; SSE41-NEXT:    paddw %xmm2, %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: combine_vec_sdiv_nonuniform6:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm0, %xmm1
; AVX1-NEXT:    vpmulhw {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpmulhw {{.*}}(%rip), %xmm0, %xmm1
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7]
; AVX1-NEXT:    vpsrlw $15, %xmm0, %xmm0
; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3,4],xmm0[5,6,7]
; AVX1-NEXT:    vpaddw %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_vec_sdiv_nonuniform6:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpmullw {{.*}}(%rip), %xmm0, %xmm1
; AVX2-NEXT:    vpmulhw {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpmulhw {{.*}}(%rip), %xmm0, %xmm1
; AVX2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7]
; AVX2-NEXT:    vpsrlw $15, %xmm0, %xmm0
; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3,4],xmm0[5,6,7]
; AVX2-NEXT:    vpaddw %xmm0, %xmm1, %xmm0
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: combine_vec_sdiv_nonuniform6:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpmullw {{.*}}(%rip), %xmm0, %xmm1
; AVX512F-NEXT:    vpmulhw {{.*}}(%rip), %xmm0, %xmm0
; AVX512F-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX512F-NEXT:    vpsrlw $15, %xmm0, %xmm1
; AVX512F-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512F-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3,4],xmm1[5,6,7]
; AVX512F-NEXT:    vpmovsxwd %xmm0, %ymm0
; AVX512F-NEXT:    vpsravd {{.*}}(%rip), %ymm0, %ymm0
; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512F-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: combine_vec_sdiv_nonuniform6:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpmullw {{.*}}(%rip), %xmm0, %xmm1
; AVX512BW-NEXT:    vpmulhw {{.*}}(%rip), %xmm0, %xmm0
; AVX512BW-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT:    vpsrlw $15, %xmm0, %xmm1
; AVX512BW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512BW-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3,4],xmm1[5,6,7]
; AVX512BW-NEXT:    vpsravw {{.*}}(%rip), %xmm0, %xmm0
; AVX512BW-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT:    retq
;
; XOP-LABEL: combine_vec_sdiv_nonuniform6:
; XOP:       # %bb.0:
; XOP-NEXT:    vpmulhw {{.*}}(%rip), %xmm0, %xmm1
; XOP-NEXT:    vpmacsww %xmm1, {{.*}}(%rip), %xmm0, %xmm0
; XOP-NEXT:    vpsrlw $15, %xmm0, %xmm1
; XOP-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; XOP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3,4],xmm1[5,6,7]
; XOP-NEXT:    vpshaw {{.*}}(%rip), %xmm0, %xmm0
; XOP-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; XOP-NEXT:    retq
  %1 = sdiv <8 x i16> %x, <i16 -32768, i16 -512, i16 -511, i16 -1, i16 1, i16 255, i16 512, i16 32767>
  ret <8 x i16> %1
}

define <8 x i16> @combine_vec_sdiv_nonuniform7(<8 x i16> %x) {
; SSE2-LABEL: combine_vec_sdiv_nonuniform7:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pxor %xmm1, %xmm1
; SSE2-NEXT:    psubw %xmm0, %xmm1
; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE2-NEXT:    retq
;
; SSE41-LABEL: combine_vec_sdiv_nonuniform7:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pxor %xmm1, %xmm1
; SSE41-NEXT:    psubw %xmm0, %xmm1
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
; SSE41-NEXT:    retq
;
; AVX1-LABEL: combine_vec_sdiv_nonuniform7:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpsubw %xmm0, %xmm1, %xmm1
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
; AVX1-NEXT:    retq
;
; AVX2ORLATER-LABEL: combine_vec_sdiv_nonuniform7:
; AVX2ORLATER:       # %bb.0:
; AVX2ORLATER-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX2ORLATER-NEXT:    vpsubw %xmm0, %xmm1, %xmm1
; AVX2ORLATER-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX2ORLATER-NEXT:    retq
;
; XOP-LABEL: combine_vec_sdiv_nonuniform7:
; XOP:       # %bb.0:
; XOP-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; XOP-NEXT:    vpsubw %xmm0, %xmm1, %xmm1
; XOP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
; XOP-NEXT:    retq
  %1 = sdiv <8 x i16> %x, <i16 -1, i16 -1, i16 -1, i16 -1, i16 1, i16 1, i16 1, i16 1>
  ret <8 x i16> %1
}

define <16 x i8> @pr38658(<16 x i8> %x) {
; SSE2-LABEL: pr38658:
; SSE2:       # %bb.0:
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
; SSE2-NEXT:    psraw $8, %xmm2
; SSE2-NEXT:    pmullw {{.*}}(%rip), %xmm2
; SSE2-NEXT:    psrlw $8, %xmm2
; SSE2-NEXT:    pxor %xmm3, %xmm3
; SSE2-NEXT:    pxor %xmm1, %xmm1
; SSE2-NEXT:    packuswb %xmm2, %xmm1
; SSE2-NEXT:    paddb %xmm0, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
; SSE2-NEXT:    movdqa %xmm1, %xmm2
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; SSE2-NEXT:    psraw $8, %xmm2
; SSE2-NEXT:    pmullw {{.*}}(%rip), %xmm2
; SSE2-NEXT:    psrlw $8, %xmm2
; SSE2-NEXT:    packuswb %xmm2, %xmm0
; SSE2-NEXT:    psrlw $7, %xmm1
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
; SSE2-NEXT:    paddb %xmm0, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: pr38658:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE41-NEXT:    pmovsxbw %xmm1, %xmm2
; SSE41-NEXT:    pmullw {{.*}}(%rip), %xmm2
; SSE41-NEXT:    psrlw $8, %xmm2
; SSE41-NEXT:    pxor %xmm1, %xmm1
; SSE41-NEXT:    packuswb %xmm2, %xmm1
; SSE41-NEXT:    paddb %xmm0, %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm0
; SSE41-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
; SSE41-NEXT:    psraw $8, %xmm0
; SSE41-NEXT:    movdqa %xmm0, %xmm2
; SSE41-NEXT:    psllw $6, %xmm2
; SSE41-NEXT:    psllw $8, %xmm0
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
; SSE41-NEXT:    psrlw $8, %xmm0
; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; SSE41-NEXT:    packuswb %xmm0, %xmm2
; SSE41-NEXT:    psrlw $7, %xmm1
; SSE41-NEXT:    pand {{.*}}(%rip), %xmm1
; SSE41-NEXT:    paddb %xmm2, %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: pr38658:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT:    vpmovsxbw %xmm1, %xmm1
; AVX1-NEXT:    vpmullw {{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vpsrlw $8, %xmm1, %xmm1
; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpackuswb %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT:    vpsraw $8, %xmm1, %xmm1
; AVX1-NEXT:    vpsllw $6, %xmm1, %xmm2
; AVX1-NEXT:    vpsllw $8, %xmm1, %xmm1
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6],xmm2[7]
; AVX1-NEXT:    vpsrlw $8, %xmm1, %xmm1
; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX1-NEXT:    vpackuswb %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vpsrlw $7, %xmm0, %xmm0
; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: pr38658:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm1
; AVX2-NEXT:    vpmullw {{.*}}(%rip), %ymm1, %ymm1
; AVX2-NEXT:    vpsrlw $8, %ymm1, %ymm1
; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX2-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm1
; AVX2-NEXT:    vpmullw {{.*}}(%rip), %ymm1, %ymm1
; AVX2-NEXT:    vpsrlw $8, %ymm1, %ymm1
; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX2-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpsrlw $7, %xmm0, %xmm0
; AVX2-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: pr38658:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpmovsxbw %xmm0, %ymm1
; AVX512F-NEXT:    vpmullw {{.*}}(%rip), %ymm1, %ymm1
; AVX512F-NEXT:    vpsrlw $8, %ymm1, %ymm1
; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512F-NEXT:    vpmovdb %zmm1, %xmm1
; AVX512F-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
; AVX512F-NEXT:    vpsrlw $7, %xmm0, %xmm1
; AVX512F-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX512F-NEXT:    vpmovsxbd %xmm0, %zmm0
; AVX512F-NEXT:    vpsravd {{.*}}(%rip), %zmm0, %zmm0
; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512F-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: pr38658:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpmovsxbw %xmm0, %ymm1
; AVX512BW-NEXT:    vpmullw {{.*}}(%rip), %ymm1, %ymm1
; AVX512BW-NEXT:    vpsrlw $8, %ymm1, %ymm1
; AVX512BW-NEXT:    vpmovwb %ymm1, %xmm1
; AVX512BW-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
; AVX512BW-NEXT:    vpsrlw $7, %xmm0, %xmm1
; AVX512BW-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX512BW-NEXT:    vpmovsxbw %xmm0, %ymm0
; AVX512BW-NEXT:    vpsravw {{.*}}(%rip), %ymm0, %ymm0
; AVX512BW-NEXT:    vpmovwb %ymm0, %xmm0
; AVX512BW-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; XOP-LABEL: pr38658:
; XOP:       # %bb.0:
; XOP-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; XOP-NEXT:    vpmovsxbw %xmm1, %xmm1
; XOP-NEXT:    vpmullw {{.*}}(%rip), %xmm1, %xmm1
; XOP-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; XOP-NEXT:    vpperm {{.*#+}} xmm1 = xmm2[1,3,5,7,9,11,13,15],xmm1[1,3,5,7,9,11,13,15]
; XOP-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
; XOP-NEXT:    vpshab {{.*}}(%rip), %xmm0, %xmm1
; XOP-NEXT:    vpshlb {{.*}}(%rip), %xmm0, %xmm0
; XOP-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
; XOP-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
; XOP-NEXT:    retq
  %1 = sdiv <16 x i8> %x, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 7>
  ret <16 x i8> %1
}

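; bool division: the only well-defined divisor is true, so (sdiv x, y) -> x and no code is needed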
define i1 @bool_sdiv(i1 %x, i1 %y) {
; CHECK-LABEL: bool_sdiv:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movl %edi, %eax
; CHECK-NEXT:    # kill: def $al killed $al killed $eax
; CHECK-NEXT:    retq
  %r = sdiv i1 %x, %y
  ret i1 %r
}

define <4 x i1> @boolvec_sdiv(<4 x i1> %x, <4 x i1> %y) {
; CHECK-LABEL: boolvec_sdiv:
; CHECK:       # %bb.0:
; CHECK-NEXT:    retq
  %r = sdiv <4 x i1> %x, %y
  ret <4 x i1> %r
}

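; scalar sdiv by (negated) power-of-two constants -> shift/add (plus neg) sequences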
define i32 @combine_sdiv_two(i32 %x) {
; CHECK-LABEL: combine_sdiv_two:
; CHECK:       # %bb.0:
; CHECK-NEXT:    # kill: def $edi killed $edi def $rdi
; CHECK-NEXT:    movl %edi, %eax
; CHECK-NEXT:    shrl $31, %eax
; CHECK-NEXT:    addl %edi, %eax
; CHECK-NEXT:    sarl %eax
; CHECK-NEXT:    retq
  %1 = sdiv i32 %x, 2
  ret i32 %1
}

define i32 @combine_sdiv_negtwo(i32 %x) {
; CHECK-LABEL: combine_sdiv_negtwo:
; CHECK:       # %bb.0:
; CHECK-NEXT:    # kill: def $edi killed $edi def $rdi
; CHECK-NEXT:    movl %edi, %eax
; CHECK-NEXT:    shrl $31, %eax
; CHECK-NEXT:    addl %edi, %eax
; CHECK-NEXT:    sarl %eax
; CHECK-NEXT:    negl %eax
; CHECK-NEXT:    retq
  %1 = sdiv i32 %x, -2
  ret i32 %1
}

define i8 @combine_i8_sdiv_pow2(i8 %x) {
; CHECK-LABEL: combine_i8_sdiv_pow2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    # kill: def $edi killed $edi def $rdi
; CHECK-NEXT:    movl %edi, %eax
; CHECK-NEXT:    sarb $7, %al
; CHECK-NEXT:    shrb $4, %al
; CHECK-NEXT:    addl %edi, %eax
; CHECK-NEXT:    sarb $4, %al
; CHECK-NEXT:    # kill: def $al killed $al killed $eax
; CHECK-NEXT:    retq
  %1 = sdiv i8 %x, 16
  ret i8 %1
}

define i8 @combine_i8_sdiv_negpow2(i8 %x) {
; CHECK-LABEL: combine_i8_sdiv_negpow2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    # kill: def $edi killed $edi def $rdi
; CHECK-NEXT:    movl %edi, %eax
; CHECK-NEXT:    sarb $7, %al
; CHECK-NEXT:    shrb $2, %al
; CHECK-NEXT:    addl %edi, %eax
; CHECK-NEXT:    sarb $6, %al
; CHECK-NEXT:    negb %al
; CHECK-NEXT:    # kill: def $al killed $al killed $eax
; CHECK-NEXT:    retq
  %1 = sdiv i8 %x, -64
  ret i8 %1
}

define i16 @combine_i16_sdiv_pow2(i16 %x) {
; CHECK-LABEL: combine_i16_sdiv_pow2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    # kill: def $edi killed $edi def $rdi
; CHECK-NEXT:    leal 15(%rdi), %eax
; CHECK-NEXT:    testw %di, %di
; CHECK-NEXT:    cmovnsl %edi, %eax
; CHECK-NEXT:    cwtl
; CHECK-NEXT:    shrl $4, %eax
; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
; CHECK-NEXT:    retq
  %1 = sdiv i16 %x, 16
  ret i16 %1
}

define i16 @combine_i16_sdiv_negpow2(i16 %x) {
; CHECK-LABEL: combine_i16_sdiv_negpow2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    # kill: def $edi killed $edi def $rdi
; CHECK-NEXT:    leal 255(%rdi), %eax
; CHECK-NEXT:    testw %di, %di
; CHECK-NEXT:    cmovnsl %edi, %eax
; CHECK-NEXT:    cwtl
; CHECK-NEXT:    sarl $8, %eax
; CHECK-NEXT:    negl %eax
; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
; CHECK-NEXT:    retq
  %1 = sdiv i16 %x, -256
  ret i16 %1
}

define i32 @combine_i32_sdiv_pow2(i32 %x) {
; CHECK-LABEL: combine_i32_sdiv_pow2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    # kill: def $edi killed $edi def $rdi
; CHECK-NEXT:    leal 15(%rdi), %eax
; CHECK-NEXT:    testl %edi, %edi
; CHECK-NEXT:    cmovnsl %edi, %eax
; CHECK-NEXT:    sarl $4, %eax
; CHECK-NEXT:    retq
  %1 = sdiv i32 %x, 16
  ret i32 %1
}

define i32 @combine_i32_sdiv_negpow2(i32 %x) {
; CHECK-LABEL: combine_i32_sdiv_negpow2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    # kill: def $edi killed $edi def $rdi
; CHECK-NEXT:    leal 255(%rdi), %eax
; CHECK-NEXT:    testl %edi, %edi
; CHECK-NEXT:    cmovnsl %edi, %eax
; CHECK-NEXT:    sarl $8, %eax
; CHECK-NEXT:    negl %eax
; CHECK-NEXT:    retq
  %1 = sdiv i32 %x, -256
  ret i32 %1
}

define i64 @combine_i64_sdiv_pow2(i64 %x) {
; CHECK-LABEL: combine_i64_sdiv_pow2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    leaq 15(%rdi), %rax
; CHECK-NEXT:    testq %rdi, %rdi
; CHECK-NEXT:    cmovnsq %rdi, %rax
; CHECK-NEXT:    sarq $4, %rax
; CHECK-NEXT:    retq
  %1 = sdiv i64 %x, 16
  ret i64 %1
}

define i64 @combine_i64_sdiv_negpow2(i64 %x) {
; CHECK-LABEL: combine_i64_sdiv_negpow2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    leaq 255(%rdi), %rax
; CHECK-NEXT:    testq %rdi, %rdi
; CHECK-NEXT:    cmovnsq %rdi, %rax
; CHECK-NEXT:    sarq $8, %rax
; CHECK-NEXT:    negq %rax
; CHECK-NEXT:    retq
  %1 = sdiv i64 %x, -256
  ret i64 %1
}
