; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE,SSSE3
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX512

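; Test codegen for vector signed add-with-overflow (llvm.sadd.with.overflow).
; Each function stores the wrapped sum to %p2 and returns the per-lane
; overflow bits sign-extended to a mask. For plain i32/i64 vectors the
; lowering uses the identity: the signed add overflows iff
; (a1 < 0) != (a0 + a1 < a0), i.e. the pcmpgt/pxor (or AVX512 vpcmpgt/kxor)
; pattern repeated below.
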
declare {<1 x i32>, <1 x i1>} @llvm.sadd.with.overflow.v1i32(<1 x i32>, <1 x i32>)
declare {<2 x i32>, <2 x i1>} @llvm.sadd.with.overflow.v2i32(<2 x i32>, <2 x i32>)
declare {<3 x i32>, <3 x i1>} @llvm.sadd.with.overflow.v3i32(<3 x i32>, <3 x i32>)
declare {<4 x i32>, <4 x i1>} @llvm.sadd.with.overflow.v4i32(<4 x i32>, <4 x i32>)
declare {<6 x i32>, <6 x i1>} @llvm.sadd.with.overflow.v6i32(<6 x i32>, <6 x i32>)
declare {<8 x i32>, <8 x i1>} @llvm.sadd.with.overflow.v8i32(<8 x i32>, <8 x i32>)
declare {<16 x i32>, <16 x i1>} @llvm.sadd.with.overflow.v16i32(<16 x i32>, <16 x i32>)

declare {<16 x i8>, <16 x i1>} @llvm.sadd.with.overflow.v16i8(<16 x i8>, <16 x i8>)
declare {<8 x i16>, <8 x i1>} @llvm.sadd.with.overflow.v8i16(<8 x i16>, <8 x i16>)
declare {<2 x i64>, <2 x i1>} @llvm.sadd.with.overflow.v2i64(<2 x i64>, <2 x i64>)

declare {<4 x i24>, <4 x i1>} @llvm.sadd.with.overflow.v4i24(<4 x i24>, <4 x i24>)
declare {<4 x i1>, <4 x i1>} @llvm.sadd.with.overflow.v4i1(<4 x i1>, <4 x i1>)
declare {<2 x i128>, <2 x i1>} @llvm.sadd.with.overflow.v2i128(<2 x i128>, <2 x i128>)

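; <1 x i32> is scalarized: a GPR add with SETO, and the mask is materialized
; by negating the overflow byte.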
define <1 x i32> @saddo_v1i32(<1 x i32> %a0, <1 x i32> %a1, <1 x i32>* %p2) nounwind {
; SSE-LABEL: saddo_v1i32:
; SSE:       # %bb.0:
; SSE-NEXT:    xorl %eax, %eax
; SSE-NEXT:    addl %esi, %edi
; SSE-NEXT:    seto %al
; SSE-NEXT:    negl %eax
; SSE-NEXT:    movl %edi, (%rdx)
; SSE-NEXT:    retq
;
; AVX-LABEL: saddo_v1i32:
; AVX:       # %bb.0:
; AVX-NEXT:    xorl %eax, %eax
; AVX-NEXT:    addl %esi, %edi
; AVX-NEXT:    seto %al
; AVX-NEXT:    negl %eax
; AVX-NEXT:    movl %edi, (%rdx)
; AVX-NEXT:    retq
  %t = call {<1 x i32>, <1 x i1>} @llvm.sadd.with.overflow.v1i32(<1 x i32> %a0, <1 x i32> %a1)
  %val = extractvalue {<1 x i32>, <1 x i1>} %t, 0
  %obit = extractvalue {<1 x i32>, <1 x i1>} %t, 1
  %res = sext <1 x i1> %obit to <1 x i32>
  store <1 x i32> %val, <1 x i32>* %p2
  ret <1 x i32> %res
}

define <2 x i32> @saddo_v2i32(<2 x i32> %a0, <2 x i32> %a1, <2 x i32>* %p2) nounwind {
; SSE-LABEL: saddo_v2i32:
; SSE:       # %bb.0:
; SSE-NEXT:    pxor %xmm2, %xmm2
; SSE-NEXT:    pcmpgtd %xmm1, %xmm2
; SSE-NEXT:    paddd %xmm0, %xmm1
; SSE-NEXT:    pcmpgtd %xmm1, %xmm0
; SSE-NEXT:    pxor %xmm2, %xmm0
; SSE-NEXT:    movq %xmm1, (%rdi)
; SSE-NEXT:    retq
;
; AVX1-LABEL: saddo_v2i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpcmpgtd %xmm1, %xmm2, %xmm2
; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
; AVX1-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpxor %xmm0, %xmm2, %xmm0
; AVX1-NEXT:    vmovq %xmm1, (%rdi)
; AVX1-NEXT:    retq
;
; AVX2-LABEL: saddo_v2i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT:    vpcmpgtd %xmm1, %xmm2, %xmm2
; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
; AVX2-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpxor %xmm0, %xmm2, %xmm0
; AVX2-NEXT:    vmovq %xmm1, (%rdi)
; AVX2-NEXT:    retq
;
; AVX512-LABEL: saddo_v2i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512-NEXT:    vpcmpgtd %xmm1, %xmm2, %k0
; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
; AVX512-NEXT:    vpcmpgtd %xmm1, %xmm0, %k1
; AVX512-NEXT:    kxorw %k1, %k0, %k1
; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512-NEXT:    vmovq %xmm1, (%rdi)
; AVX512-NEXT:    retq
  %t = call {<2 x i32>, <2 x i1>} @llvm.sadd.with.overflow.v2i32(<2 x i32> %a0, <2 x i32> %a1)
  %val = extractvalue {<2 x i32>, <2 x i1>} %t, 0
  %obit = extractvalue {<2 x i32>, <2 x i1>} %t, 1
  %res = sext <2 x i1> %obit to <2 x i32>
  store <2 x i32> %val, <2 x i32>* %p2
  ret <2 x i32> %res
}

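; <3 x i32> is widened to <4 x i32> for the arithmetic; the 12-byte store is
; split into a movq plus a scalar store of element 2 (pextrd with SSE4.1,
; shuffle + movd before that).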
define <3 x i32> @saddo_v3i32(<3 x i32> %a0, <3 x i32> %a1, <3 x i32>* %p2) nounwind {
; SSE2-LABEL: saddo_v3i32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pxor %xmm2, %xmm2
; SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
; SSE2-NEXT:    paddd %xmm0, %xmm1
; SSE2-NEXT:    pcmpgtd %xmm1, %xmm0
; SSE2-NEXT:    pxor %xmm2, %xmm0
; SSE2-NEXT:    movq %xmm1, (%rdi)
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE2-NEXT:    movd %xmm1, 8(%rdi)
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: saddo_v3i32:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    pxor %xmm2, %xmm2
; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm2
; SSSE3-NEXT:    paddd %xmm0, %xmm1
; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm0
; SSSE3-NEXT:    pxor %xmm2, %xmm0
; SSSE3-NEXT:    movq %xmm1, (%rdi)
; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSSE3-NEXT:    movd %xmm1, 8(%rdi)
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: saddo_v3i32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pxor %xmm2, %xmm2
; SSE41-NEXT:    pcmpgtd %xmm1, %xmm2
; SSE41-NEXT:    paddd %xmm0, %xmm1
; SSE41-NEXT:    pcmpgtd %xmm1, %xmm0
; SSE41-NEXT:    pxor %xmm2, %xmm0
; SSE41-NEXT:    pextrd $2, %xmm1, 8(%rdi)
; SSE41-NEXT:    movq %xmm1, (%rdi)
; SSE41-NEXT:    retq
;
; AVX1-LABEL: saddo_v3i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpcmpgtd %xmm1, %xmm2, %xmm2
; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
; AVX1-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpxor %xmm0, %xmm2, %xmm0
; AVX1-NEXT:    vpextrd $2, %xmm1, 8(%rdi)
; AVX1-NEXT:    vmovq %xmm1, (%rdi)
; AVX1-NEXT:    retq
;
; AVX2-LABEL: saddo_v3i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT:    vpcmpgtd %xmm1, %xmm2, %xmm2
; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
; AVX2-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpxor %xmm0, %xmm2, %xmm0
; AVX2-NEXT:    vpextrd $2, %xmm1, 8(%rdi)
; AVX2-NEXT:    vmovq %xmm1, (%rdi)
; AVX2-NEXT:    retq
;
; AVX512-LABEL: saddo_v3i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512-NEXT:    vpcmpgtd %xmm1, %xmm2, %k0
; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
; AVX512-NEXT:    vpcmpgtd %xmm1, %xmm0, %k1
; AVX512-NEXT:    kxorw %k1, %k0, %k1
; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512-NEXT:    vpextrd $2, %xmm1, 8(%rdi)
; AVX512-NEXT:    vmovq %xmm1, (%rdi)
; AVX512-NEXT:    retq
  %t = call {<3 x i32>, <3 x i1>} @llvm.sadd.with.overflow.v3i32(<3 x i32> %a0, <3 x i32> %a1)
  %val = extractvalue {<3 x i32>, <3 x i1>} %t, 0
  %obit = extractvalue {<3 x i32>, <3 x i1>} %t, 1
  %res = sext <3 x i1> %obit to <3 x i32>
  store <3 x i32> %val, <3 x i32>* %p2
  ret <3 x i32> %res
}

define <4 x i32> @saddo_v4i32(<4 x i32> %a0, <4 x i32> %a1, <4 x i32>* %p2) nounwind {
; SSE-LABEL: saddo_v4i32:
; SSE:       # %bb.0:
; SSE-NEXT:    pxor %xmm2, %xmm2
; SSE-NEXT:    pcmpgtd %xmm1, %xmm2
; SSE-NEXT:    paddd %xmm0, %xmm1
; SSE-NEXT:    pcmpgtd %xmm1, %xmm0
; SSE-NEXT:    pxor %xmm2, %xmm0
; SSE-NEXT:    movdqa %xmm1, (%rdi)
; SSE-NEXT:    retq
;
; AVX1-LABEL: saddo_v4i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpcmpgtd %xmm1, %xmm2, %xmm2
; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
; AVX1-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpxor %xmm0, %xmm2, %xmm0
; AVX1-NEXT:    vmovdqa %xmm1, (%rdi)
; AVX1-NEXT:    retq
;
; AVX2-LABEL: saddo_v4i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT:    vpcmpgtd %xmm1, %xmm2, %xmm2
; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
; AVX2-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpxor %xmm0, %xmm2, %xmm0
; AVX2-NEXT:    vmovdqa %xmm1, (%rdi)
; AVX2-NEXT:    retq
;
; AVX512-LABEL: saddo_v4i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512-NEXT:    vpcmpgtd %xmm1, %xmm2, %k0
; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
; AVX512-NEXT:    vpcmpgtd %xmm1, %xmm0, %k1
; AVX512-NEXT:    kxorw %k1, %k0, %k1
; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512-NEXT:    vmovdqa %xmm1, (%rdi)
; AVX512-NEXT:    retq
  %t = call {<4 x i32>, <4 x i1>} @llvm.sadd.with.overflow.v4i32(<4 x i32> %a0, <4 x i32> %a1)
  %val = extractvalue {<4 x i32>, <4 x i1>} %t, 0
  %obit = extractvalue {<4 x i32>, <4 x i1>} %t, 1
  %res = sext <4 x i1> %obit to <4 x i32>
  store <4 x i32> %val, <4 x i32>* %p2
  ret <4 x i32> %res
}

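; <6 x i32> is an illegal type that is split by the calling convention, so the
; SSE variants first reassemble the operands from GPRs and the stack and
; return the result through the hidden pointer (movq %rdi, %rax); with AVX the
; value is handled as a single ymm vector.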
define <6 x i32> @saddo_v6i32(<6 x i32> %a0, <6 x i32> %a1, <6 x i32>* %p2) nounwind {
; SSE2-LABEL: saddo_v6i32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movq %rdi, %rax
; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
; SSE2-NEXT:    movd %r8d, %xmm0
; SSE2-NEXT:    movd %ecx, %xmm1
; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE2-NEXT:    movd %edx, %xmm0
; SSE2-NEXT:    movd %esi, %xmm3
; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0]
; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE2-NEXT:    movd %r9d, %xmm0
; SSE2-NEXT:    movd {{.*#+}} xmm4 = mem[0],zero,zero,zero
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
; SSE2-NEXT:    movdqa %xmm3, %xmm4
; SSE2-NEXT:    paddd %xmm2, %xmm4
; SSE2-NEXT:    pcmpgtd %xmm4, %xmm3
; SSE2-NEXT:    pxor %xmm5, %xmm5
; SSE2-NEXT:    pxor %xmm6, %xmm6
; SSE2-NEXT:    pcmpgtd %xmm2, %xmm6
; SSE2-NEXT:    pxor %xmm3, %xmm6
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    paddd %xmm1, %xmm2
; SSE2-NEXT:    pcmpgtd %xmm2, %xmm0
; SSE2-NEXT:    pcmpgtd %xmm1, %xmm5
; SSE2-NEXT:    pxor %xmm0, %xmm5
; SSE2-NEXT:    movq %xmm2, 16(%rcx)
; SSE2-NEXT:    movdqa %xmm4, (%rcx)
; SSE2-NEXT:    movq %xmm5, 16(%rdi)
; SSE2-NEXT:    movdqa %xmm6, (%rdi)
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: saddo_v6i32:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movq %rdi, %rax
; SSSE3-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSSE3-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSSE3-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSSE3-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
; SSSE3-NEXT:    movd %r8d, %xmm0
; SSSE3-NEXT:    movd %ecx, %xmm1
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSSE3-NEXT:    movd %edx, %xmm0
; SSSE3-NEXT:    movd %esi, %xmm3
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0]
; SSSE3-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSSE3-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSSE3-NEXT:    movd %r9d, %xmm0
; SSSE3-NEXT:    movd {{.*#+}} xmm4 = mem[0],zero,zero,zero
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
; SSSE3-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
; SSSE3-NEXT:    movdqa %xmm3, %xmm4
; SSSE3-NEXT:    paddd %xmm2, %xmm4
; SSSE3-NEXT:    pcmpgtd %xmm4, %xmm3
; SSSE3-NEXT:    pxor %xmm5, %xmm5
; SSSE3-NEXT:    pxor %xmm6, %xmm6
; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm6
; SSSE3-NEXT:    pxor %xmm3, %xmm6
; SSSE3-NEXT:    movdqa %xmm0, %xmm2
; SSSE3-NEXT:    paddd %xmm1, %xmm2
; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm0
; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm5
; SSSE3-NEXT:    pxor %xmm0, %xmm5
; SSSE3-NEXT:    movq %xmm2, 16(%rcx)
; SSSE3-NEXT:    movdqa %xmm4, (%rcx)
; SSSE3-NEXT:    movq %xmm5, 16(%rdi)
; SSSE3-NEXT:    movdqa %xmm6, (%rdi)
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: saddo_v6i32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movq %rdi, %rax
; SSE41-NEXT:    movd %esi, %xmm1
; SSE41-NEXT:    pinsrd $1, %edx, %xmm1
; SSE41-NEXT:    pinsrd $2, %ecx, %xmm1
; SSE41-NEXT:    pinsrd $3, %r8d, %xmm1
; SSE41-NEXT:    movd %r9d, %xmm0
; SSE41-NEXT:    pinsrd $1, {{[0-9]+}}(%rsp), %xmm0
; SSE41-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE41-NEXT:    pinsrd $1, {{[0-9]+}}(%rsp), %xmm2
; SSE41-NEXT:    movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
; SSE41-NEXT:    pinsrd $1, {{[0-9]+}}(%rsp), %xmm3
; SSE41-NEXT:    pinsrd $2, {{[0-9]+}}(%rsp), %xmm3
; SSE41-NEXT:    pinsrd $3, {{[0-9]+}}(%rsp), %xmm3
; SSE41-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
; SSE41-NEXT:    movdqa %xmm1, %xmm4
; SSE41-NEXT:    paddd %xmm3, %xmm4
; SSE41-NEXT:    pcmpgtd %xmm4, %xmm1
; SSE41-NEXT:    pxor %xmm5, %xmm5
; SSE41-NEXT:    pxor %xmm6, %xmm6
; SSE41-NEXT:    pcmpgtd %xmm3, %xmm6
; SSE41-NEXT:    pxor %xmm1, %xmm6
; SSE41-NEXT:    pcmpgtd %xmm2, %xmm5
; SSE41-NEXT:    paddd %xmm0, %xmm2
; SSE41-NEXT:    pcmpgtd %xmm2, %xmm0
; SSE41-NEXT:    pxor %xmm5, %xmm0
; SSE41-NEXT:    movq %xmm2, 16(%rcx)
; SSE41-NEXT:    movdqa %xmm4, (%rcx)
; SSE41-NEXT:    movq %xmm0, 16(%rdi)
; SSE41-NEXT:    movdqa %xmm6, (%rdi)
; SSE41-NEXT:    retq
;
; AVX1-LABEL: saddo_v6i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT:    vpcmpgtd %xmm2, %xmm3, %xmm4
; AVX1-NEXT:    vpcmpgtd %xmm1, %xmm3, %xmm3
; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm3, %ymm3
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
; AVX1-NEXT:    vpaddd %xmm2, %xmm4, %xmm2
; AVX1-NEXT:    vpcmpgtd %xmm2, %xmm4, %xmm4
; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
; AVX1-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
; AVX1-NEXT:    vxorps %ymm0, %ymm3, %ymm0
; AVX1-NEXT:    vmovq %xmm2, 16(%rdi)
; AVX1-NEXT:    vmovdqa %xmm1, (%rdi)
; AVX1-NEXT:    retq
;
; AVX2-LABEL: saddo_v6i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT:    vpcmpgtd %ymm1, %ymm2, %ymm2
; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm1
; AVX2-NEXT:    vpcmpgtd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpxor %ymm0, %ymm2, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX2-NEXT:    vmovq %xmm2, 16(%rdi)
; AVX2-NEXT:    vmovdqa %xmm1, (%rdi)
; AVX2-NEXT:    retq
;
; AVX512-LABEL: saddo_v6i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512-NEXT:    vpcmpgtd %ymm1, %ymm2, %k0
; AVX512-NEXT:    vpaddd %ymm1, %ymm0, %ymm1
; AVX512-NEXT:    vpcmpgtd %ymm1, %ymm0, %k1
; AVX512-NEXT:    kxorw %k1, %k0, %k1
; AVX512-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX512-NEXT:    vmovq %xmm2, 16(%rdi)
; AVX512-NEXT:    vmovdqa %xmm1, (%rdi)
; AVX512-NEXT:    retq
  %t = call {<6 x i32>, <6 x i1>} @llvm.sadd.with.overflow.v6i32(<6 x i32> %a0, <6 x i32> %a1)
  %val = extractvalue {<6 x i32>, <6 x i1>} %t, 0
  %obit = extractvalue {<6 x i32>, <6 x i1>} %t, 1
  %res = sext <6 x i1> %obit to <6 x i32>
  store <6 x i32> %val, <6 x i32>* %p2
  ret <6 x i32> %res
}

define <8 x i32> @saddo_v8i32(<8 x i32> %a0, <8 x i32> %a1, <8 x i32>* %p2) nounwind {
; SSE-LABEL: saddo_v8i32:
; SSE:       # %bb.0:
; SSE-NEXT:    pxor %xmm4, %xmm4
; SSE-NEXT:    pxor %xmm5, %xmm5
; SSE-NEXT:    pcmpgtd %xmm2, %xmm5
; SSE-NEXT:    paddd %xmm0, %xmm2
; SSE-NEXT:    pcmpgtd %xmm2, %xmm0
; SSE-NEXT:    pxor %xmm5, %xmm0
; SSE-NEXT:    pcmpgtd %xmm3, %xmm4
; SSE-NEXT:    paddd %xmm1, %xmm3
; SSE-NEXT:    pcmpgtd %xmm3, %xmm1
; SSE-NEXT:    pxor %xmm4, %xmm1
; SSE-NEXT:    movdqa %xmm3, 16(%rdi)
; SSE-NEXT:    movdqa %xmm2, (%rdi)
; SSE-NEXT:    retq
;
; AVX1-LABEL: saddo_v8i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT:    vpcmpgtd %xmm2, %xmm3, %xmm4
; AVX1-NEXT:    vpcmpgtd %xmm1, %xmm3, %xmm3
; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm3, %ymm3
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
; AVX1-NEXT:    vpaddd %xmm2, %xmm4, %xmm2
; AVX1-NEXT:    vpcmpgtd %xmm2, %xmm4, %xmm4
; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
; AVX1-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
; AVX1-NEXT:    vxorps %ymm0, %ymm3, %ymm0
; AVX1-NEXT:    vmovdqa %xmm2, 16(%rdi)
; AVX1-NEXT:    vmovdqa %xmm1, (%rdi)
; AVX1-NEXT:    retq
;
; AVX2-LABEL: saddo_v8i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT:    vpcmpgtd %ymm1, %ymm2, %ymm2
; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm1
; AVX2-NEXT:    vpcmpgtd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpxor %ymm0, %ymm2, %ymm0
; AVX2-NEXT:    vmovdqa %ymm1, (%rdi)
; AVX2-NEXT:    retq
;
; AVX512-LABEL: saddo_v8i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512-NEXT:    vpcmpgtd %ymm1, %ymm2, %k0
; AVX512-NEXT:    vpaddd %ymm1, %ymm0, %ymm1
; AVX512-NEXT:    vpcmpgtd %ymm1, %ymm0, %k1
; AVX512-NEXT:    kxorw %k1, %k0, %k1
; AVX512-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512-NEXT:    vmovdqa %ymm1, (%rdi)
; AVX512-NEXT:    retq
  %t = call {<8 x i32>, <8 x i1>} @llvm.sadd.with.overflow.v8i32(<8 x i32> %a0, <8 x i32> %a1)
  %val = extractvalue {<8 x i32>, <8 x i1>} %t, 0
  %obit = extractvalue {<8 x i32>, <8 x i1>} %t, 1
  %res = sext <8 x i1> %obit to <8 x i32>
  store <8 x i32> %val, <8 x i32>* %p2
  ret <8 x i32> %res
}

define <16 x i32> @saddo_v16i32(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %p2) nounwind {
; SSE-LABEL: saddo_v16i32:
; SSE:       # %bb.0:
; SSE-NEXT:    pxor %xmm8, %xmm8
; SSE-NEXT:    pxor %xmm9, %xmm9
; SSE-NEXT:    pcmpgtd %xmm4, %xmm9
; SSE-NEXT:    paddd %xmm0, %xmm4
; SSE-NEXT:    pcmpgtd %xmm4, %xmm0
; SSE-NEXT:    pxor %xmm9, %xmm0
; SSE-NEXT:    pxor %xmm9, %xmm9
; SSE-NEXT:    pcmpgtd %xmm5, %xmm9
; SSE-NEXT:    paddd %xmm1, %xmm5
; SSE-NEXT:    pcmpgtd %xmm5, %xmm1
; SSE-NEXT:    pxor %xmm9, %xmm1
; SSE-NEXT:    pxor %xmm9, %xmm9
; SSE-NEXT:    pcmpgtd %xmm6, %xmm9
; SSE-NEXT:    paddd %xmm2, %xmm6
; SSE-NEXT:    pcmpgtd %xmm6, %xmm2
; SSE-NEXT:    pxor %xmm9, %xmm2
; SSE-NEXT:    pcmpgtd %xmm7, %xmm8
; SSE-NEXT:    paddd %xmm3, %xmm7
; SSE-NEXT:    pcmpgtd %xmm7, %xmm3
; SSE-NEXT:    pxor %xmm8, %xmm3
; SSE-NEXT:    movdqa %xmm7, 48(%rdi)
; SSE-NEXT:    movdqa %xmm6, 32(%rdi)
; SSE-NEXT:    movdqa %xmm5, 16(%rdi)
; SSE-NEXT:    movdqa %xmm4, (%rdi)
; SSE-NEXT:    retq
;
; AVX1-LABEL: saddo_v16i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm4
; AVX1-NEXT:    vpxor %xmm5, %xmm5, %xmm5
; AVX1-NEXT:    vpcmpgtd %xmm4, %xmm5, %xmm6
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm7
; AVX1-NEXT:    vpaddd %xmm4, %xmm7, %xmm8
; AVX1-NEXT:    vpcmpgtd %xmm8, %xmm7, %xmm7
; AVX1-NEXT:    vpxor %xmm7, %xmm6, %xmm6
; AVX1-NEXT:    vpcmpgtd %xmm3, %xmm5, %xmm7
; AVX1-NEXT:    vpaddd %xmm3, %xmm1, %xmm3
; AVX1-NEXT:    vpcmpgtd %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vpxor %xmm1, %xmm7, %xmm1
; AVX1-NEXT:    vpackssdw %xmm6, %xmm1, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm6
; AVX1-NEXT:    vpcmpgtd %xmm6, %xmm5, %xmm7
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
; AVX1-NEXT:    vpaddd %xmm6, %xmm4, %xmm6
; AVX1-NEXT:    vpcmpgtd %xmm6, %xmm4, %xmm4
; AVX1-NEXT:    vpxor %xmm4, %xmm7, %xmm4
; AVX1-NEXT:    vpcmpgtd %xmm2, %xmm5, %xmm5
; AVX1-NEXT:    vpaddd %xmm2, %xmm0, %xmm2
; AVX1-NEXT:    vpcmpgtd %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpxor %xmm0, %xmm5, %xmm0
; AVX1-NEXT:    vpackssdw %xmm4, %xmm0, %xmm0
; AVX1-NEXT:    vpacksswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpmovsxbd %xmm0, %xmm4
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
; AVX1-NEXT:    vpmovsxbd %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm4, %ymm0
; AVX1-NEXT:    vpacksswb %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpmovsxbd %xmm1, %xmm4
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
; AVX1-NEXT:    vpmovsxbd %xmm1, %xmm1
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm4, %ymm1
; AVX1-NEXT:    vmovdqa %xmm8, 48(%rdi)
; AVX1-NEXT:    vmovdqa %xmm3, 32(%rdi)
; AVX1-NEXT:    vmovdqa %xmm6, 16(%rdi)
; AVX1-NEXT:    vmovdqa %xmm2, (%rdi)
; AVX1-NEXT:    retq
;
; AVX2-LABEL: saddo_v16i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpxor %xmm4, %xmm4, %xmm4
; AVX2-NEXT:    vpcmpgtd %ymm3, %ymm4, %ymm5
; AVX2-NEXT:    vpaddd %ymm3, %ymm1, %ymm3
; AVX2-NEXT:    vpcmpgtd %ymm3, %ymm1, %ymm1
; AVX2-NEXT:    vpxor %ymm1, %ymm5, %ymm1
; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm5
; AVX2-NEXT:    vpackssdw %xmm5, %xmm1, %xmm1
; AVX2-NEXT:    vpcmpgtd %ymm2, %ymm4, %ymm4
; AVX2-NEXT:    vpaddd %ymm2, %ymm0, %ymm2
; AVX2-NEXT:    vpcmpgtd %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpxor %ymm0, %ymm4, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm4
; AVX2-NEXT:    vpackssdw %xmm4, %xmm0, %xmm0
; AVX2-NEXT:    vpacksswb %xmm0, %xmm0, %xmm0
; AVX2-NEXT:    vpmovsxbd %xmm0, %ymm0
; AVX2-NEXT:    vpacksswb %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vpmovsxbd %xmm1, %ymm1
; AVX2-NEXT:    vmovdqa %ymm3, 32(%rdi)
; AVX2-NEXT:    vmovdqa %ymm2, (%rdi)
; AVX2-NEXT:    retq
;
; AVX512-LABEL: saddo_v16i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512-NEXT:    vpcmpgtd %zmm1, %zmm2, %k0
; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm1
; AVX512-NEXT:    vpcmpgtd %zmm1, %zmm0, %k1
; AVX512-NEXT:    kxorw %k1, %k0, %k1
; AVX512-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512-NEXT:    vmovdqa64 %zmm1, (%rdi)
; AVX512-NEXT:    retq
  %t = call {<16 x i32>, <16 x i1>} @llvm.sadd.with.overflow.v16i32(<16 x i32> %a0, <16 x i32> %a1)
  %val = extractvalue {<16 x i32>, <16 x i1>} %t, 0
  %obit = extractvalue {<16 x i32>, <16 x i1>} %t, 1
  %res = sext <16 x i1> %obit to <16 x i32>
  store <16 x i32> %val, <16 x i32>* %p2
  ret <16 x i32> %res
}

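; For i8 elements there is a cheaper check than the sign-compare identity:
; overflow is detected by comparing the wrapping add (paddb) against the
; saturating add (paddsb); the lanes differ exactly when the add overflowed.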
define <16 x i32> @saddo_v16i8(<16 x i8> %a0, <16 x i8> %a1, <16 x i8>* %p2) nounwind {
; SSE2-LABEL: saddo_v16i8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    paddsb %xmm1, %xmm2
; SSE2-NEXT:    paddb %xmm1, %xmm0
; SSE2-NEXT:    pcmpeqb %xmm0, %xmm2
; SSE2-NEXT:    pcmpeqd %xmm3, %xmm3
; SSE2-NEXT:    pxor %xmm2, %xmm3
; SSE2-NEXT:    movdqa %xmm3, %xmm1
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    movdqa %xmm1, %xmm4
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3]
; SSE2-NEXT:    pslld $31, %xmm4
; SSE2-NEXT:    psrad $31, %xmm4
; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
; SSE2-NEXT:    pslld $31, %xmm1
; SSE2-NEXT:    psrad $31, %xmm1
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT:    movdqa %xmm3, %xmm2
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3]
; SSE2-NEXT:    pslld $31, %xmm2
; SSE2-NEXT:    psrad $31, %xmm2
; SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7]
; SSE2-NEXT:    pslld $31, %xmm3
; SSE2-NEXT:    psrad $31, %xmm3
; SSE2-NEXT:    movdqa %xmm0, (%rdi)
; SSE2-NEXT:    movdqa %xmm4, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: saddo_v16i8:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movdqa %xmm0, %xmm2
; SSSE3-NEXT:    paddsb %xmm1, %xmm2
; SSSE3-NEXT:    paddb %xmm1, %xmm0
; SSSE3-NEXT:    pcmpeqb %xmm0, %xmm2
; SSSE3-NEXT:    pcmpeqd %xmm3, %xmm3
; SSSE3-NEXT:    pxor %xmm2, %xmm3
; SSSE3-NEXT:    movdqa %xmm3, %xmm1
; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSSE3-NEXT:    movdqa %xmm1, %xmm4
; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3]
; SSSE3-NEXT:    pslld $31, %xmm4
; SSSE3-NEXT:    psrad $31, %xmm4
; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
; SSSE3-NEXT:    pslld $31, %xmm1
; SSSE3-NEXT:    psrad $31, %xmm1
; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSSE3-NEXT:    movdqa %xmm3, %xmm2
; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3]
; SSSE3-NEXT:    pslld $31, %xmm2
; SSSE3-NEXT:    psrad $31, %xmm2
; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7]
; SSSE3-NEXT:    pslld $31, %xmm3
; SSSE3-NEXT:    psrad $31, %xmm3
; SSSE3-NEXT:    movdqa %xmm0, (%rdi)
; SSSE3-NEXT:    movdqa %xmm4, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: saddo_v16i8:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa %xmm0, %xmm2
; SSE41-NEXT:    paddsb %xmm1, %xmm2
; SSE41-NEXT:    paddb %xmm1, %xmm0
; SSE41-NEXT:    pcmpeqb %xmm0, %xmm2
; SSE41-NEXT:    pcmpeqd %xmm3, %xmm3
; SSE41-NEXT:    pxor %xmm2, %xmm3
; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm4 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
; SSE41-NEXT:    pslld $31, %xmm4
; SSE41-NEXT:    psrad $31, %xmm4
; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1]
; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SSE41-NEXT:    pslld $31, %xmm1
; SSE41-NEXT:    psrad $31, %xmm1
; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3]
; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
; SSE41-NEXT:    pslld $31, %xmm2
; SSE41-NEXT:    psrad $31, %xmm2
; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[3,3,3,3]
; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
; SSE41-NEXT:    pslld $31, %xmm3
; SSE41-NEXT:    psrad $31, %xmm3
; SSE41-NEXT:    movdqa %xmm0, (%rdi)
; SSE41-NEXT:    movdqa %xmm4, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: saddo_v16i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpaddsb %xmm1, %xmm0, %xmm2
; AVX1-NEXT:    vpaddb %xmm1, %xmm0, %xmm3
; AVX1-NEXT:    vpcmpeqb %xmm2, %xmm3, %xmm0
; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm1
; AVX1-NEXT:    vpmovsxbd %xmm1, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
; AVX1-NEXT:    vpmovsxbd %xmm2, %xmm2
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
; AVX1-NEXT:    vpmovsxbd %xmm2, %xmm2
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[3,3,3,3]
; AVX1-NEXT:    vpmovsxbd %xmm1, %xmm1
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
; AVX1-NEXT:    vmovdqa %xmm3, (%rdi)
; AVX1-NEXT:    retq
;
; AVX2-LABEL: saddo_v16i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpaddsb %xmm1, %xmm0, %xmm2
; AVX2-NEXT:    vpaddb %xmm1, %xmm0, %xmm3
; AVX2-NEXT:    vpcmpeqb %xmm2, %xmm3, %xmm0
; AVX2-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm1
; AVX2-NEXT:    vpmovsxbd %xmm1, %ymm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX2-NEXT:    vpmovsxbd %xmm1, %ymm1
; AVX2-NEXT:    vmovdqa %xmm3, (%rdi)
; AVX2-NEXT:    retq
;
; AVX512-LABEL: saddo_v16i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpaddsb %xmm1, %xmm0, %xmm2
; AVX512-NEXT:    vpaddb %xmm1, %xmm0, %xmm1
; AVX512-NEXT:    vpcmpneqb %xmm2, %xmm1, %k1
; AVX512-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512-NEXT:    vmovdqa %xmm1, (%rdi)
; AVX512-NEXT:    retq
  %t = call {<16 x i8>, <16 x i1>} @llvm.sadd.with.overflow.v16i8(<16 x i8> %a0, <16 x i8> %a1)
  %val = extractvalue {<16 x i8>, <16 x i1>} %t, 0
  %obit = extractvalue {<16 x i8>, <16 x i1>} %t, 1
  %res = sext <16 x i1> %obit to <16 x i32>
  store <16 x i8> %val, <16 x i8>* %p2
  ret <16 x i32> %res
}

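; Same saturating-add trick as v16i8, using paddsw/paddw on i16 lanes.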
define <8 x i32> @saddo_v8i16(<8 x i16> %a0, <8 x i16> %a1, <8 x i16>* %p2) nounwind {
; SSE2-LABEL: saddo_v8i16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    paddsw %xmm1, %xmm2
; SSE2-NEXT:    paddw %xmm1, %xmm0
; SSE2-NEXT:    pcmpeqw %xmm0, %xmm2
; SSE2-NEXT:    pcmpeqd %xmm1, %xmm1
; SSE2-NEXT:    pxor %xmm2, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm2
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3]
; SSE2-NEXT:    pslld $31, %xmm2
; SSE2-NEXT:    psrad $31, %xmm2
; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
; SSE2-NEXT:    pslld $31, %xmm1
; SSE2-NEXT:    psrad $31, %xmm1
; SSE2-NEXT:    movdqa %xmm0, (%rdi)
; SSE2-NEXT:    movdqa %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: saddo_v8i16:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movdqa %xmm0, %xmm2
; SSSE3-NEXT:    paddsw %xmm1, %xmm2
; SSSE3-NEXT:    paddw %xmm1, %xmm0
; SSSE3-NEXT:    pcmpeqw %xmm0, %xmm2
; SSSE3-NEXT:    pcmpeqd %xmm1, %xmm1
; SSSE3-NEXT:    pxor %xmm2, %xmm1
; SSSE3-NEXT:    movdqa %xmm1, %xmm2
; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3]
; SSSE3-NEXT:    pslld $31, %xmm2
; SSSE3-NEXT:    psrad $31, %xmm2
; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
; SSSE3-NEXT:    pslld $31, %xmm1
; SSSE3-NEXT:    psrad $31, %xmm1
; SSSE3-NEXT:    movdqa %xmm0, (%rdi)
; SSSE3-NEXT:    movdqa %xmm2, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: saddo_v8i16:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa %xmm0, %xmm2
; SSE41-NEXT:    paddsw %xmm1, %xmm2
; SSE41-NEXT:    paddw %xmm1, %xmm0
; SSE41-NEXT:    pcmpeqw %xmm0, %xmm2
; SSE41-NEXT:    pcmpeqd %xmm1, %xmm1
; SSE41-NEXT:    pxor %xmm2, %xmm1
; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; SSE41-NEXT:    pslld $31, %xmm2
; SSE41-NEXT:    psrad $31, %xmm2
; SSE41-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
; SSE41-NEXT:    pslld $31, %xmm1
; SSE41-NEXT:    psrad $31, %xmm1
; SSE41-NEXT:    movdqa %xmm0, (%rdi)
; SSE41-NEXT:    movdqa %xmm2, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: saddo_v8i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpaddsw %xmm1, %xmm0, %xmm2
; AVX1-NEXT:    vpaddw %xmm1, %xmm0, %xmm1
; AVX1-NEXT:    vpcmpeqw %xmm2, %xmm1, %xmm0
; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpxor %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm2
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
; AVX1-NEXT:    vmovdqa %xmm1, (%rdi)
; AVX1-NEXT:    retq
;
; AVX2-LABEL: saddo_v8i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpaddsw %xmm1, %xmm0, %xmm2
; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm1
; AVX2-NEXT:    vpcmpeqw %xmm2, %xmm1, %xmm0
; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpmovsxwd %xmm0, %ymm0
; AVX2-NEXT:    vmovdqa %xmm1, (%rdi)
; AVX2-NEXT:    retq
;
; AVX512-LABEL: saddo_v8i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpaddsw %xmm1, %xmm0, %xmm2
; AVX512-NEXT:    vpaddw %xmm1, %xmm0, %xmm1
; AVX512-NEXT:    vpcmpneqw %xmm2, %xmm1, %k1
; AVX512-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512-NEXT:    vmovdqa %xmm1, (%rdi)
; AVX512-NEXT:    retq
  %t = call {<8 x i16>, <8 x i1>} @llvm.sadd.with.overflow.v8i16(<8 x i16> %a0, <8 x i16> %a1)
  %val = extractvalue {<8 x i16>, <8 x i1>} %t, 0
  %obit = extractvalue {<8 x i16>, <8 x i1>} %t, 1
  %res = sext <8 x i1> %obit to <8 x i32>
  store <8 x i16> %val, <8 x i16>* %p2
  ret <8 x i32> %res
}

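; <2 x i64> uses the usual sign-compare identity, but before SSE4.2 there is
; no pcmpgtq, so the SSE lowering expands the 64-bit signed compare into
; 32-bit pcmpgtd/pcmpeqd operations after biasing the low dwords by
; 0x80000000.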
define <2 x i32> @saddo_v2i64(<2 x i64> %a0, <2 x i64> %a1, <2 x i64>* %p2) nounwind {
; SSE-LABEL: saddo_v2i64:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
; SSE-NEXT:    movdqa %xmm0, %xmm3
; SSE-NEXT:    pxor %xmm2, %xmm3
; SSE-NEXT:    paddq %xmm1, %xmm0
; SSE-NEXT:    pxor %xmm0, %xmm2
; SSE-NEXT:    movdqa %xmm3, %xmm4
; SSE-NEXT:    pcmpgtd %xmm2, %xmm4
; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
; SSE-NEXT:    pcmpeqd %xmm3, %xmm2
; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSE-NEXT:    pand %xmm5, %xmm2
; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
; SSE-NEXT:    por %xmm2, %xmm3
; SSE-NEXT:    pxor %xmm2, %xmm2
; SSE-NEXT:    pcmpgtd %xmm1, %xmm2
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
; SSE-NEXT:    pxor %xmm3, %xmm1
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE-NEXT:    movdqa %xmm0, (%rdi)
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: saddo_v2i64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm2, %xmm2
; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm1
; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpxor %xmm0, %xmm2, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX1-NEXT:    vmovdqa %xmm1, (%rdi)
; AVX1-NEXT:    retq
;
; AVX2-LABEL: saddo_v2i64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT:    vpcmpgtq %xmm1, %xmm2, %xmm2
; AVX2-NEXT:    vpaddq %xmm1, %xmm0, %xmm1
; AVX2-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpxor %xmm0, %xmm2, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX2-NEXT:    vmovdqa %xmm1, (%rdi)
; AVX2-NEXT:    retq
;
; AVX512-LABEL: saddo_v2i64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512-NEXT:    vpcmpgtq %xmm1, %xmm2, %k0
; AVX512-NEXT:    vpaddq %xmm1, %xmm0, %xmm1
; AVX512-NEXT:    vpcmpgtq %xmm1, %xmm0, %k1
; AVX512-NEXT:    kxorw %k1, %k0, %k1
; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512-NEXT:    vmovdqa %xmm1, (%rdi)
; AVX512-NEXT:    retq
  %t = call {<2 x i64>, <2 x i1>} @llvm.sadd.with.overflow.v2i64(<2 x i64> %a0, <2 x i64> %a1)
  %val = extractvalue {<2 x i64>, <2 x i1>} %t, 0
  %obit = extractvalue {<2 x i64>, <2 x i1>} %t, 1
  %res = sext <2 x i1> %obit to <2 x i32>
  store <2 x i64> %val, <2 x i64>* %p2
  ret <2 x i32> %res
}

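; i24 is an illegal element type: the operands are sign-extended in-register
; with pslld/psrad pairs, overflow is detected by checking whether the 32-bit
; sum still fits in 24 bits, and the packed 24-bit results are stored with
; scalar word + byte stores.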
define <4 x i32> @saddo_v4i24(<4 x i24> %a0, <4 x i24> %a1, <4 x i24>* %p2) nounwind {
; SSE2-LABEL: saddo_v4i24:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    pslld $8, %xmm1
; SSE2-NEXT:    psrad $8, %xmm1
; SSE2-NEXT:    pslld $8, %xmm2
; SSE2-NEXT:    psrad $8, %xmm2
; SSE2-NEXT:    paddd %xmm1, %xmm2
; SSE2-NEXT:    movdqa %xmm2, %xmm0
; SSE2-NEXT:    pslld $8, %xmm0
; SSE2-NEXT:    psrad $8, %xmm0
; SSE2-NEXT:    pcmpeqd %xmm2, %xmm0
; SSE2-NEXT:    pcmpeqd %xmm1, %xmm1
; SSE2-NEXT:    pxor %xmm1, %xmm0
; SSE2-NEXT:    movd %xmm2, %eax
; SSE2-NEXT:    movw %ax, (%rdi)
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[3,3,3,3]
; SSE2-NEXT:    movd %xmm1, %ecx
; SSE2-NEXT:    movw %cx, 9(%rdi)
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3]
; SSE2-NEXT:    movd %xmm1, %edx
; SSE2-NEXT:    movw %dx, 6(%rdi)
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1]
; SSE2-NEXT:    movd %xmm1, %esi
; SSE2-NEXT:    movw %si, 3(%rdi)
; SSE2-NEXT:    shrl $16, %eax
; SSE2-NEXT:    movb %al, 2(%rdi)
; SSE2-NEXT:    shrl $16, %ecx
; SSE2-NEXT:    movb %cl, 11(%rdi)
; SSE2-NEXT:    shrl $16, %edx
; SSE2-NEXT:    movb %dl, 8(%rdi)
; SSE2-NEXT:    shrl $16, %esi
; SSE2-NEXT:    movb %sil, 5(%rdi)
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: saddo_v4i24:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movdqa %xmm0, %xmm2
; SSSE3-NEXT:    pslld $8, %xmm1
; SSSE3-NEXT:    psrad $8, %xmm1
; SSSE3-NEXT:    pslld $8, %xmm2
; SSSE3-NEXT:    psrad $8, %xmm2
; SSSE3-NEXT:    paddd %xmm1, %xmm2
; SSSE3-NEXT:    movdqa %xmm2, %xmm0
; SSSE3-NEXT:    pslld $8, %xmm0
; SSSE3-NEXT:    psrad $8, %xmm0
; SSSE3-NEXT:    pcmpeqd %xmm2, %xmm0
; SSSE3-NEXT:    pcmpeqd %xmm1, %xmm1
; SSSE3-NEXT:    pxor %xmm1, %xmm0
; SSSE3-NEXT:    movd %xmm2, %eax
; SSSE3-NEXT:    movw %ax, (%rdi)
; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[3,3,3,3]
; SSSE3-NEXT:    movd %xmm1, %ecx
; SSSE3-NEXT:    movw %cx, 9(%rdi)
; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3]
; SSSE3-NEXT:    movd %xmm1, %edx
; SSSE3-NEXT:    movw %dx, 6(%rdi)
; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1]
; SSSE3-NEXT:    movd %xmm1, %esi
; SSSE3-NEXT:    movw %si, 3(%rdi)
; SSSE3-NEXT:    shrl $16, %eax
; SSSE3-NEXT:    movb %al, 2(%rdi)
; SSSE3-NEXT:    shrl $16, %ecx
; SSSE3-NEXT:    movb %cl, 11(%rdi)
; SSSE3-NEXT:    shrl $16, %edx
; SSSE3-NEXT:    movb %dl, 8(%rdi)
; SSSE3-NEXT:    shrl $16, %esi
; SSSE3-NEXT:    movb %sil, 5(%rdi)
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: saddo_v4i24:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa %xmm0, %xmm2
; SSE41-NEXT:    pslld $8, %xmm1
; SSE41-NEXT:    psrad $8, %xmm1
; SSE41-NEXT:    pslld $8, %xmm2
; SSE41-NEXT:    psrad $8, %xmm2
; SSE41-NEXT:    paddd %xmm1, %xmm2
; SSE41-NEXT:    movdqa %xmm2, %xmm0
; SSE41-NEXT:    pslld $8, %xmm0
; SSE41-NEXT:    psrad $8, %xmm0
; SSE41-NEXT:    pcmpeqd %xmm2, %xmm0
; SSE41-NEXT:    pcmpeqd %xmm1, %xmm1
; SSE41-NEXT:    pxor %xmm1, %xmm0
; SSE41-NEXT:    pextrd $3, %xmm2, %eax
; SSE41-NEXT:    movw %ax, 9(%rdi)
; SSE41-NEXT:    pextrd $2, %xmm2, %ecx
; SSE41-NEXT:    movw %cx, 6(%rdi)
; SSE41-NEXT:    pextrd $1, %xmm2, %edx
; SSE41-NEXT:    movw %dx, 3(%rdi)
; SSE41-NEXT:    movd %xmm2, %esi
; SSE41-NEXT:    movw %si, (%rdi)
; SSE41-NEXT:    shrl $16, %eax
; SSE41-NEXT:    movb %al, 11(%rdi)
; SSE41-NEXT:    shrl $16, %ecx
; SSE41-NEXT:    movb %cl, 8(%rdi)
; SSE41-NEXT:    shrl $16, %edx
; SSE41-NEXT:    movb %dl, 5(%rdi)
; SSE41-NEXT:    shrl $16, %esi
; SSE41-NEXT:    movb %sil, 2(%rdi)
; SSE41-NEXT:    retq
;
; AVX1-LABEL: saddo_v4i24:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpslld $8, %xmm1, %xmm1
; AVX1-NEXT:    vpsrad $8, %xmm1, %xmm1
; AVX1-NEXT:    vpslld $8, %xmm0, %xmm0
; AVX1-NEXT:    vpsrad $8, %xmm0, %xmm0
; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
; AVX1-NEXT:    vpslld $8, %xmm1, %xmm0
; AVX1-NEXT:    vpsrad $8, %xmm0, %xmm0
; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpxor %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpextrd $3, %xmm1, %eax
; AVX1-NEXT:    movw %ax, 9(%rdi)
; AVX1-NEXT:    vpextrd $2, %xmm1, %ecx
; AVX1-NEXT:    movw %cx, 6(%rdi)
; AVX1-NEXT:    vpextrd $1, %xmm1, %edx
; AVX1-NEXT:    movw %dx, 3(%rdi)
; AVX1-NEXT:    vmovd %xmm1, %esi
; AVX1-NEXT:    movw %si, (%rdi)
; AVX1-NEXT:    shrl $16, %eax
; AVX1-NEXT:    movb %al, 11(%rdi)
; AVX1-NEXT:    shrl $16, %ecx
; AVX1-NEXT:    movb %cl, 8(%rdi)
; AVX1-NEXT:    shrl $16, %edx
; AVX1-NEXT:    movb %dl, 5(%rdi)
; AVX1-NEXT:    shrl $16, %esi
; AVX1-NEXT:    movb %sil, 2(%rdi)
; AVX1-NEXT:    retq
;
; AVX2-LABEL: saddo_v4i24:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpslld $8, %xmm1, %xmm1
; AVX2-NEXT:    vpsrad $8, %xmm1, %xmm1
; AVX2-NEXT:    vpslld $8, %xmm0, %xmm0
; AVX2-NEXT:    vpsrad $8, %xmm0, %xmm0
; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
; AVX2-NEXT:    vpslld $8, %xmm1, %xmm0
; AVX2-NEXT:    vpsrad $8, %xmm0, %xmm0
; AVX2-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpextrd $3, %xmm1, %eax
; AVX2-NEXT:    movw %ax, 9(%rdi)
; AVX2-NEXT:    vpextrd $2, %xmm1, %ecx
; AVX2-NEXT:    movw %cx, 6(%rdi)
; AVX2-NEXT:    vpextrd $1, %xmm1, %edx
; AVX2-NEXT:    movw %dx, 3(%rdi)
; AVX2-NEXT:    vmovd %xmm1, %esi
; AVX2-NEXT:    movw %si, (%rdi)
; AVX2-NEXT:    shrl $16, %eax
; AVX2-NEXT:    movb %al, 11(%rdi)
; AVX2-NEXT:    shrl $16, %ecx
; AVX2-NEXT:    movb %cl, 8(%rdi)
; AVX2-NEXT:    shrl $16, %edx
; AVX2-NEXT:    movb %dl, 5(%rdi)
; AVX2-NEXT:    shrl $16, %esi
; AVX2-NEXT:    movb %sil, 2(%rdi)
; AVX2-NEXT:    retq
;
; AVX512-LABEL: saddo_v4i24:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpslld $8, %xmm1, %xmm1
; AVX512-NEXT:    vpsrad $8, %xmm1, %xmm1
; AVX512-NEXT:    vpslld $8, %xmm0, %xmm0
; AVX512-NEXT:    vpsrad $8, %xmm0, %xmm0
; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
; AVX512-NEXT:    vpslld $8, %xmm1, %xmm0
; AVX512-NEXT:    vpsrad $8, %xmm0, %xmm0
; AVX512-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpternlogq $15, %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    vpextrd $3, %xmm1, %eax
; AVX512-NEXT:    movw %ax, 9(%rdi)
; AVX512-NEXT:    vpextrd $2, %xmm1, %ecx
; AVX512-NEXT:    movw %cx, 6(%rdi)
; AVX512-NEXT:    vpextrd $1, %xmm1, %edx
; AVX512-NEXT:    movw %dx, 3(%rdi)
; AVX512-NEXT:    vmovd %xmm1, %esi
; AVX512-NEXT:    movw %si, (%rdi)
; AVX512-NEXT:    shrl $16, %eax
; AVX512-NEXT:    movb %al, 11(%rdi)
; AVX512-NEXT:    shrl $16, %ecx
; AVX512-NEXT:    movb %cl, 8(%rdi)
; AVX512-NEXT:    shrl $16, %edx
; AVX512-NEXT:    movb %dl, 5(%rdi)
; AVX512-NEXT:    shrl $16, %esi
; AVX512-NEXT:    movb %sil, 2(%rdi)
; AVX512-NEXT:    retq
  %t = call {<4 x i24>, <4 x i1>} @llvm.sadd.with.overflow.v4i24(<4 x i24> %a0, <4 x i24> %a1)
  %val = extractvalue {<4 x i24>, <4 x i1>} %t, 0
  %obit = extractvalue {<4 x i24>, <4 x i1>} %t, 1
  %res = sext <4 x i1> %obit to <4 x i32>
  store <4 x i24> %val, <4 x i24>* %p2
  ret <4 x i32> %res
}

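; For i1 lanes the add is performed on lanes sign-extended from bit 0 and
; overflow is checked by re-truncating the sum; the 4-bit result vector is
; stored via movmskps (SSE/AVX) or the mask registers (AVX512).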
define <4 x i32> @saddo_v4i1(<4 x i1> %a0, <4 x i1> %a1, <4 x i1>* %p2) nounwind {
; SSE-LABEL: saddo_v4i1:
; SSE:       # %bb.0:
; SSE-NEXT:    pslld $31, %xmm1
; SSE-NEXT:    psrad $31, %xmm1
; SSE-NEXT:    pslld $31, %xmm0
; SSE-NEXT:    psrad $31, %xmm0
; SSE-NEXT:    paddd %xmm1, %xmm0
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    pslld $31, %xmm1
; SSE-NEXT:    movmskps %xmm1, %eax
; SSE-NEXT:    psrad $31, %xmm1
; SSE-NEXT:    pcmpeqd %xmm0, %xmm1
; SSE-NEXT:    pcmpeqd %xmm0, %xmm0
; SSE-NEXT:    pxor %xmm0, %xmm1
; SSE-NEXT:    movb %al, (%rdi)
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: saddo_v4i1:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpslld $31, %xmm1, %xmm1
; AVX1-NEXT:    vpsrad $31, %xmm1, %xmm1
; AVX1-NEXT:    vpslld $31, %xmm0, %xmm0
; AVX1-NEXT:    vpsrad $31, %xmm0, %xmm0
; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpslld $31, %xmm0, %xmm1
; AVX1-NEXT:    vpsrad $31, %xmm1, %xmm2
; AVX1-NEXT:    vpcmpeqd %xmm0, %xmm2, %xmm0
; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpxor %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vmovmskps %xmm1, %eax
; AVX1-NEXT:    movb %al, (%rdi)
; AVX1-NEXT:    retq
;
; AVX2-LABEL: saddo_v4i1:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpslld $31, %xmm1, %xmm1
; AVX2-NEXT:    vpsrad $31, %xmm1, %xmm1
; AVX2-NEXT:    vpslld $31, %xmm0, %xmm0
; AVX2-NEXT:    vpsrad $31, %xmm0, %xmm0
; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpslld $31, %xmm0, %xmm1
; AVX2-NEXT:    vpsrad $31, %xmm1, %xmm2
; AVX2-NEXT:    vpcmpeqd %xmm0, %xmm2, %xmm0
; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vmovmskps %xmm1, %eax
; AVX2-NEXT:    movb %al, (%rdi)
; AVX2-NEXT:    retq
;
; AVX512-LABEL: saddo_v4i1:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpslld $31, %xmm1, %xmm1
; AVX512-NEXT:    vpslld $31, %xmm0, %xmm0
; AVX512-NEXT:    vpor %xmm1, %xmm0, %xmm2
; AVX512-NEXT:    vptestmd %xmm0, %xmm0, %k0
; AVX512-NEXT:    vptestmd %xmm1, %xmm1, %k1
; AVX512-NEXT:    vptestmd %xmm2, %xmm2, %k2
; AVX512-NEXT:    kxorw %k1, %k0, %k0
; AVX512-NEXT:    kxorw %k2, %k0, %k1
; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512-NEXT:    kshiftlw $12, %k0, %k0
; AVX512-NEXT:    kshiftrw $12, %k0, %k0
; AVX512-NEXT:    kmovd %k0, %eax
; AVX512-NEXT:    movb %al, (%rdi)
; AVX512-NEXT:    retq
  %t = call {<4 x i1>, <4 x i1>} @llvm.sadd.with.overflow.v4i1(<4 x i1> %a0, <4 x i1> %a1)
  %val = extractvalue {<4 x i1>, <4 x i1>} %t, 0
  %obit = extractvalue {<4 x i1>, <4 x i1>} %t, 1
  %res = sext <4 x i1> %obit to <4 x i32>
  store <4 x i1> %val, <4 x i1>* %p2
  ret <4 x i32> %res
}

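; <2 x i128> is scalarized to addq/adcq pairs, with SETO capturing the signed
; overflow of each 128-bit add.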
define <2 x i32> @saddo_v2i128(<2 x i128> %a0, <2 x i128> %a1, <2 x i128>* %p2) nounwind {
; SSE2-LABEL: saddo_v2i128:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %r10
; SSE2-NEXT:    addq %r8, %rdi
; SSE2-NEXT:    adcq %r9, %rsi
; SSE2-NEXT:    seto %r8b
; SSE2-NEXT:    addq {{[0-9]+}}(%rsp), %rdx
; SSE2-NEXT:    adcq {{[0-9]+}}(%rsp), %rcx
; SSE2-NEXT:    seto %al
; SSE2-NEXT:    movzbl %al, %eax
; SSE2-NEXT:    negl %eax
; SSE2-NEXT:    movd %eax, %xmm1
; SSE2-NEXT:    movzbl %r8b, %eax
; SSE2-NEXT:    negl %eax
; SSE2-NEXT:    movd %eax, %xmm0
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    movq %rdx, 16(%r10)
; SSE2-NEXT:    movq %rdi, (%r10)
; SSE2-NEXT:    movq %rcx, 24(%r10)
; SSE2-NEXT:    movq %rsi, 8(%r10)
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: saddo_v2i128:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movq {{[0-9]+}}(%rsp), %r10
; SSSE3-NEXT:    addq %r8, %rdi
; SSSE3-NEXT:    adcq %r9, %rsi
; SSSE3-NEXT:    seto %r8b
; SSSE3-NEXT:    addq {{[0-9]+}}(%rsp), %rdx
; SSSE3-NEXT:    adcq {{[0-9]+}}(%rsp), %rcx
; SSSE3-NEXT:    seto %al
; SSSE3-NEXT:    movzbl %al, %eax
; SSSE3-NEXT:    negl %eax
; SSSE3-NEXT:    movd %eax, %xmm1
; SSSE3-NEXT:    movzbl %r8b, %eax
; SSSE3-NEXT:    negl %eax
; SSSE3-NEXT:    movd %eax, %xmm0
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSSE3-NEXT:    movq %rdx, 16(%r10)
; SSSE3-NEXT:    movq %rdi, (%r10)
; SSSE3-NEXT:    movq %rcx, 24(%r10)
; SSSE3-NEXT:    movq %rsi, 8(%r10)
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: saddo_v2i128:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movq {{[0-9]+}}(%rsp), %r10
; SSE41-NEXT:    addq %r8, %rdi
; SSE41-NEXT:    adcq %r9, %rsi
; SSE41-NEXT:    seto %r8b
; SSE41-NEXT:    addq {{[0-9]+}}(%rsp), %rdx
; SSE41-NEXT:    adcq {{[0-9]+}}(%rsp), %rcx
; SSE41-NEXT:    seto %al
; SSE41-NEXT:    movzbl %al, %r9d
; SSE41-NEXT:    negl %r9d
; SSE41-NEXT:    movzbl %r8b, %eax
; SSE41-NEXT:    negl %eax
; SSE41-NEXT:    movd %eax, %xmm0
; SSE41-NEXT:    pinsrd $1, %r9d, %xmm0
; SSE41-NEXT:    movq %rdx, 16(%r10)
; SSE41-NEXT:    movq %rdi, (%r10)
; SSE41-NEXT:    movq %rcx, 24(%r10)
; SSE41-NEXT:    movq %rsi, 8(%r10)
; SSE41-NEXT:    retq
;
; AVX1-LABEL: saddo_v2i128:
; AVX1:       # %bb.0:
; AVX1-NEXT:    movq {{[0-9]+}}(%rsp), %r10
; AVX1-NEXT:    addq %r8, %rdi
; AVX1-NEXT:    adcq %r9, %rsi
; AVX1-NEXT:    seto %r8b
; AVX1-NEXT:    addq {{[0-9]+}}(%rsp), %rdx
; AVX1-NEXT:    adcq {{[0-9]+}}(%rsp), %rcx
; AVX1-NEXT:    seto %al
; AVX1-NEXT:    movzbl %al, %r9d
; AVX1-NEXT:    negl %r9d
; AVX1-NEXT:    movzbl %r8b, %eax
; AVX1-NEXT:    negl %eax
; AVX1-NEXT:    vmovd %eax, %xmm0
; AVX1-NEXT:    vpinsrd $1, %r9d, %xmm0, %xmm0
; AVX1-NEXT:    movq %rdx, 16(%r10)
; AVX1-NEXT:    movq %rdi, (%r10)
; AVX1-NEXT:    movq %rcx, 24(%r10)
; AVX1-NEXT:    movq %rsi, 8(%r10)
; AVX1-NEXT:    retq
;
; AVX2-LABEL: saddo_v2i128:
; AVX2:       # %bb.0:
; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r10
; AVX2-NEXT:    addq %r8, %rdi
; AVX2-NEXT:    adcq %r9, %rsi
; AVX2-NEXT:    seto %r8b
; AVX2-NEXT:    addq {{[0-9]+}}(%rsp), %rdx
; AVX2-NEXT:    adcq {{[0-9]+}}(%rsp), %rcx
; AVX2-NEXT:    seto %al
; AVX2-NEXT:    movzbl %al, %r9d
; AVX2-NEXT:    negl %r9d
; AVX2-NEXT:    movzbl %r8b, %eax
; AVX2-NEXT:    negl %eax
; AVX2-NEXT:    vmovd %eax, %xmm0
; AVX2-NEXT:    vpinsrd $1, %r9d, %xmm0, %xmm0
; AVX2-NEXT:    movq %rdx, 16(%r10)
; AVX2-NEXT:    movq %rdi, (%r10)
; AVX2-NEXT:    movq %rcx, 24(%r10)
; AVX2-NEXT:    movq %rsi, 8(%r10)
; AVX2-NEXT:    retq
;
; AVX512-LABEL: saddo_v2i128:
; AVX512:       # %bb.0:
; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %r10
; AVX512-NEXT:    addq {{[0-9]+}}(%rsp), %rdx
; AVX512-NEXT:    adcq {{[0-9]+}}(%rsp), %rcx
; AVX512-NEXT:    seto %al
; AVX512-NEXT:    kmovd %eax, %k0
; AVX512-NEXT:    addq %r8, %rdi
; AVX512-NEXT:    adcq %r9, %rsi
; AVX512-NEXT:    seto %al
; AVX512-NEXT:    andl $1, %eax
; AVX512-NEXT:    kmovw %eax, %k1
; AVX512-NEXT:    kshiftlw $1, %k0, %k0
; AVX512-NEXT:    korw %k0, %k1, %k1
; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512-NEXT:    movq %rdx, 16(%r10)
; AVX512-NEXT:    movq %rdi, (%r10)
; AVX512-NEXT:    movq %rcx, 24(%r10)
; AVX512-NEXT:    movq %rsi, 8(%r10)
; AVX512-NEXT:    retq
  %t = call {<2 x i128>, <2 x i1>} @llvm.sadd.with.overflow.v2i128(<2 x i128> %a0, <2 x i128> %a1)
  %val = extractvalue {<2 x i128>, <2 x i1>} %t, 0
  %obit = extractvalue {<2 x i128>, <2 x i1>} %t, 1
  %res = sext <2 x i1> %obit to <2 x i32>
  store <2 x i128> %val, <2 x i128>* %p2
  ret <2 x i32> %res
}