; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE,SSSE3
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX512

declare {<1 x i32>, <1 x i1>} @llvm.sadd.with.overflow.v1i32(<1 x i32>, <1 x i32>)
declare {<2 x i32>, <2 x i1>} @llvm.sadd.with.overflow.v2i32(<2 x i32>, <2 x i32>)
declare {<3 x i32>, <3 x i1>} @llvm.sadd.with.overflow.v3i32(<3 x i32>, <3 x i32>)
declare {<4 x i32>, <4 x i1>} @llvm.sadd.with.overflow.v4i32(<4 x i32>, <4 x i32>)
declare {<6 x i32>, <6 x i1>} @llvm.sadd.with.overflow.v6i32(<6 x i32>, <6 x i32>)
declare {<8 x i32>, <8 x i1>} @llvm.sadd.with.overflow.v8i32(<8 x i32>, <8 x i32>)
declare {<16 x i32>, <16 x i1>} @llvm.sadd.with.overflow.v16i32(<16 x i32>, <16 x i32>)

declare {<16 x i8>, <16 x i1>} @llvm.sadd.with.overflow.v16i8(<16 x i8>, <16 x i8>)
declare {<8 x i16>, <8 x i1>} @llvm.sadd.with.overflow.v8i16(<8 x i16>, <8 x i16>)
declare {<2 x i64>, <2 x i1>} @llvm.sadd.with.overflow.v2i64(<2 x i64>, <2 x i64>)

declare {<4 x i24>, <4 x i1>} @llvm.sadd.with.overflow.v4i24(<4 x i24>, <4 x i24>)
declare {<4 x i1>, <4 x i1>} @llvm.sadd.with.overflow.v4i1(<4 x i1>, <4 x i1>)
declare {<2 x i128>, <2 x i1>} @llvm.sadd.with.overflow.v2i128(<2 x i128>, <2 x i128>)

define <1 x i32> @saddo_v1i32(<1 x i32> %a0, <1 x i32> %a1, <1 x i32>* %p2) nounwind {
; SSE-LABEL: saddo_v1i32:
; SSE:       # %bb.0:
; SSE-NEXT:    xorl %eax, %eax
; SSE-NEXT:    addl %esi, %edi
; SSE-NEXT:    seto %al
; SSE-NEXT:    negl %eax
; SSE-NEXT:    movl %edi, (%rdx)
; SSE-NEXT:    retq
;
; AVX-LABEL: saddo_v1i32:
; AVX:       # %bb.0:
; AVX-NEXT:    xorl %eax, %eax
; AVX-NEXT:    addl %esi, %edi
; AVX-NEXT:    seto %al
; AVX-NEXT:    negl %eax
; AVX-NEXT:    movl %edi, (%rdx)
; AVX-NEXT:    retq
  %t = call {<1 x i32>, <1 x i1>} @llvm.sadd.with.overflow.v1i32(<1 x i32> %a0, <1 x i32> %a1)
  %val = extractvalue {<1 x i32>, <1 x i1>} %t, 0
  %obit = extractvalue {<1 x i32>, <1 x i1>} %t, 1
  %res = sext <1 x i1> %obit to <1 x i32>
  store <1 x i32> %val, <1 x i32>* %p2
  ret <1 x i32> %res
}
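; Signed overflow of %a0 + %a1 is computed without widening: with %sum the
; wrapping sum, a lane overflows iff (%a1 < 0) ^ (%a0 > %sum). That identity
; is the pcmpgtd/pcmpgtd/pxor sequence checked below; AVX512 runs the same
; two compares into mask registers and combines them with kxorw.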
define <2 x i32> @saddo_v2i32(<2 x i32> %a0, <2 x i32> %a1, <2 x i32>* %p2) nounwind {
; SSE-LABEL: saddo_v2i32:
; SSE:       # %bb.0:
; SSE-NEXT:    pxor %xmm2, %xmm2
; SSE-NEXT:    pcmpgtd %xmm1, %xmm2
; SSE-NEXT:    paddd %xmm0, %xmm1
; SSE-NEXT:    pcmpgtd %xmm1, %xmm0
; SSE-NEXT:    pxor %xmm2, %xmm0
; SSE-NEXT:    movq %xmm1, (%rdi)
; SSE-NEXT:    retq
;
; AVX1-LABEL: saddo_v2i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpcmpgtd %xmm1, %xmm2, %xmm2
; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
; AVX1-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpxor %xmm0, %xmm2, %xmm0
; AVX1-NEXT:    vmovq %xmm1, (%rdi)
; AVX1-NEXT:    retq
;
; AVX2-LABEL: saddo_v2i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT:    vpcmpgtd %xmm1, %xmm2, %xmm2
; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
; AVX2-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpxor %xmm0, %xmm2, %xmm0
; AVX2-NEXT:    vmovq %xmm1, (%rdi)
; AVX2-NEXT:    retq
;
; AVX512-LABEL: saddo_v2i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512-NEXT:    vpcmpgtd %xmm1, %xmm2, %k0
; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
; AVX512-NEXT:    vpcmpgtd %xmm1, %xmm0, %k1
; AVX512-NEXT:    kxorw %k1, %k0, %k1
; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512-NEXT:    vmovq %xmm1, (%rdi)
; AVX512-NEXT:    retq
  %t = call {<2 x i32>, <2 x i1>} @llvm.sadd.with.overflow.v2i32(<2 x i32> %a0, <2 x i32> %a1)
  %val = extractvalue {<2 x i32>, <2 x i1>} %t, 0
  %obit = extractvalue {<2 x i32>, <2 x i1>} %t, 1
  %res = sext <2 x i1> %obit to <2 x i32>
  store <2 x i32> %val, <2 x i32>* %p2
  ret <2 x i32> %res
}

define <3 x i32> @saddo_v3i32(<3 x i32> %a0, <3 x i32> %a1, <3 x i32>* %p2) nounwind {
; SSE2-LABEL: saddo_v3i32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pxor %xmm2, %xmm2
; SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
; SSE2-NEXT:    paddd %xmm0, %xmm1
; SSE2-NEXT:    pcmpgtd %xmm1, %xmm0
; SSE2-NEXT:    pxor %xmm2, %xmm0
; SSE2-NEXT:    movq %xmm1, (%rdi)
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE2-NEXT:    movd %xmm1, 8(%rdi)
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: saddo_v3i32:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    pxor %xmm2, %xmm2
; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm2
; SSSE3-NEXT:    paddd %xmm0, %xmm1
; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm0
; SSSE3-NEXT:    pxor %xmm2, %xmm0
; SSSE3-NEXT:    movq %xmm1, (%rdi)
; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSSE3-NEXT:    movd %xmm1, 8(%rdi)
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: saddo_v3i32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pxor %xmm2, %xmm2
; SSE41-NEXT:    pcmpgtd %xmm1, %xmm2
; SSE41-NEXT:    paddd %xmm0, %xmm1
; SSE41-NEXT:    pcmpgtd %xmm1, %xmm0
; SSE41-NEXT:    pxor %xmm2, %xmm0
; SSE41-NEXT:    pextrd $2, %xmm1, 8(%rdi)
; SSE41-NEXT:    movq %xmm1, (%rdi)
; SSE41-NEXT:    retq
;
; AVX1-LABEL: saddo_v3i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpcmpgtd %xmm1, %xmm2, %xmm2
; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
; AVX1-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpxor %xmm0, %xmm2, %xmm0
; AVX1-NEXT:    vpextrd $2, %xmm1, 8(%rdi)
; AVX1-NEXT:    vmovq %xmm1, (%rdi)
; AVX1-NEXT:    retq
;
; AVX2-LABEL: saddo_v3i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT:    vpcmpgtd %xmm1, %xmm2, %xmm2
; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
; AVX2-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpxor %xmm0, %xmm2, %xmm0
; AVX2-NEXT:    vpextrd $2, %xmm1, 8(%rdi)
; AVX2-NEXT:    vmovq %xmm1, (%rdi)
; AVX2-NEXT:    retq
;
; AVX512-LABEL: saddo_v3i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512-NEXT:    vpcmpgtd %xmm1, %xmm2, %k0
; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
; AVX512-NEXT:    vpcmpgtd %xmm1, %xmm0, %k1
; AVX512-NEXT:    kxorw %k1, %k0, %k1
; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512-NEXT:    vpextrd $2, %xmm1, 8(%rdi)
; AVX512-NEXT:    vmovq %xmm1, (%rdi)
; AVX512-NEXT:    retq
  %t = call {<3 x i32>, <3 x i1>} @llvm.sadd.with.overflow.v3i32(<3 x i32> %a0, <3 x i32> %a1)
  %val = extractvalue {<3 x i32>, <3 x i1>} %t, 0
  %obit = extractvalue {<3 x i32>, <3 x i1>} %t, 1
  %res = sext <3 x i1> %obit to <3 x i32>
  store <3 x i32> %val, <3 x i32>* %p2
  ret <3 x i32> %res
}

define <4 x i32> @saddo_v4i32(<4 x i32> %a0, <4 x i32> %a1, <4 x i32>* %p2) nounwind {
; SSE-LABEL: saddo_v4i32:
; SSE:       # %bb.0:
; SSE-NEXT:    pxor %xmm2, %xmm2
; SSE-NEXT:    pcmpgtd %xmm1, %xmm2
; SSE-NEXT:    paddd %xmm0, %xmm1
; SSE-NEXT:    pcmpgtd %xmm1, %xmm0
; SSE-NEXT:    pxor %xmm2, %xmm0
; SSE-NEXT:    movdqa %xmm1, (%rdi)
; SSE-NEXT:    retq
;
; AVX1-LABEL: saddo_v4i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpcmpgtd %xmm1, %xmm2, %xmm2
; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
; AVX1-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpxor %xmm0, %xmm2, %xmm0
; AVX1-NEXT:    vmovdqa %xmm1, (%rdi)
; AVX1-NEXT:    retq
;
; AVX2-LABEL: saddo_v4i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT:    vpcmpgtd %xmm1, %xmm2, %xmm2
; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
; AVX2-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpxor %xmm0, %xmm2, %xmm0
; AVX2-NEXT:    vmovdqa %xmm1, (%rdi)
; AVX2-NEXT:    retq
;
; AVX512-LABEL: saddo_v4i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512-NEXT:    vpcmpgtd %xmm1, %xmm2, %k0
; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
; AVX512-NEXT:    vpcmpgtd %xmm1, %xmm0, %k1
; AVX512-NEXT:    kxorw %k1, %k0, %k1
; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512-NEXT:    vmovdqa %xmm1, (%rdi)
; AVX512-NEXT:    retq
  %t = call {<4 x i32>, <4 x i1>} @llvm.sadd.with.overflow.v4i32(<4 x i32> %a0, <4 x i32> %a1)
  %val = extractvalue {<4 x i32>, <4 x i1>} %t, 0
  %obit = extractvalue {<4 x i32>, <4 x i1>} %t, 1
  %res = sext <4 x i1> %obit to <4 x i32>
  store <4 x i32> %val, <4 x i32>* %p2
  ret <4 x i32> %res
}
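; <6 x i32> is not a legal type: under SSE the operands arrive in GPRs and on
; the stack per the scalar ABI, the vectors are reassembled and checked as two
; v4i32 halves, and the result is returned through the hidden sret pointer
; (hence the movq %rdi, %rax). AVX widens to <8 x i32> and stores 16+8 bytes.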
define <6 x i32> @saddo_v6i32(<6 x i32> %a0, <6 x i32> %a1, <6 x i32>* %p2) nounwind {
; SSE2-LABEL: saddo_v6i32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movq %rdi, %rax
; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
; SSE2-NEXT:    movd %r8d, %xmm0
; SSE2-NEXT:    movd %ecx, %xmm1
; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE2-NEXT:    movd %edx, %xmm0
; SSE2-NEXT:    movd %esi, %xmm3
; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0]
; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE2-NEXT:    movd %r9d, %xmm0
; SSE2-NEXT:    movd {{.*#+}} xmm4 = mem[0],zero,zero,zero
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
; SSE2-NEXT:    movdqa %xmm3, %xmm4
; SSE2-NEXT:    paddd %xmm2, %xmm4
; SSE2-NEXT:    pcmpgtd %xmm4, %xmm3
; SSE2-NEXT:    pxor %xmm5, %xmm5
; SSE2-NEXT:    pxor %xmm6, %xmm6
; SSE2-NEXT:    pcmpgtd %xmm2, %xmm6
; SSE2-NEXT:    pxor %xmm3, %xmm6
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    paddd %xmm1, %xmm2
; SSE2-NEXT:    pcmpgtd %xmm2, %xmm0
; SSE2-NEXT:    pcmpgtd %xmm1, %xmm5
; SSE2-NEXT:    pxor %xmm0, %xmm5
; SSE2-NEXT:    movq %xmm2, 16(%rcx)
; SSE2-NEXT:    movdqa %xmm4, (%rcx)
; SSE2-NEXT:    movq %xmm5, 16(%rdi)
; SSE2-NEXT:    movdqa %xmm6, (%rdi)
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: saddo_v6i32:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movq %rdi, %rax
; SSSE3-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSSE3-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSSE3-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSSE3-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
; SSSE3-NEXT:    movd %r8d, %xmm0
; SSSE3-NEXT:    movd %ecx, %xmm1
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSSE3-NEXT:    movd %edx, %xmm0
; SSSE3-NEXT:    movd %esi, %xmm3
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0]
; SSSE3-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSSE3-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSSE3-NEXT:    movd %r9d, %xmm0
; SSSE3-NEXT:    movd {{.*#+}} xmm4 = mem[0],zero,zero,zero
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
; SSSE3-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
; SSSE3-NEXT:    movdqa %xmm3, %xmm4
; SSSE3-NEXT:    paddd %xmm2, %xmm4
; SSSE3-NEXT:    pcmpgtd %xmm4, %xmm3
; SSSE3-NEXT:    pxor %xmm5, %xmm5
; SSSE3-NEXT:    pxor %xmm6, %xmm6
; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm6
; SSSE3-NEXT:    pxor %xmm3, %xmm6
; SSSE3-NEXT:    movdqa %xmm0, %xmm2
; SSSE3-NEXT:    paddd %xmm1, %xmm2
; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm0
; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm5
; SSSE3-NEXT:    pxor %xmm0, %xmm5
; SSSE3-NEXT:    movq %xmm2, 16(%rcx)
; SSSE3-NEXT:    movdqa %xmm4, (%rcx)
; SSSE3-NEXT:    movq %xmm5, 16(%rdi)
; SSSE3-NEXT:    movdqa %xmm6, (%rdi)
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: saddo_v6i32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movq %rdi, %rax
; SSE41-NEXT:    movd %esi, %xmm1
; SSE41-NEXT:    pinsrd $1, %edx, %xmm1
; SSE41-NEXT:    pinsrd $2, %ecx, %xmm1
; SSE41-NEXT:    pinsrd $3, %r8d, %xmm1
; SSE41-NEXT:    movd %r9d, %xmm0
; SSE41-NEXT:    pinsrd $1, {{[0-9]+}}(%rsp), %xmm0
; SSE41-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE41-NEXT:    pinsrd $1, {{[0-9]+}}(%rsp), %xmm2
; SSE41-NEXT:    movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
; SSE41-NEXT:    pinsrd $1, {{[0-9]+}}(%rsp), %xmm3
; SSE41-NEXT:    pinsrd $2, {{[0-9]+}}(%rsp), %xmm3
; SSE41-NEXT:    pinsrd $3, {{[0-9]+}}(%rsp), %xmm3
; SSE41-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
; SSE41-NEXT:    movdqa %xmm1, %xmm4
; SSE41-NEXT:    paddd %xmm3, %xmm4
; SSE41-NEXT:    pcmpgtd %xmm4, %xmm1
; SSE41-NEXT:    pxor %xmm5, %xmm5
; SSE41-NEXT:    pxor %xmm6, %xmm6
; SSE41-NEXT:    pcmpgtd %xmm3, %xmm6
; SSE41-NEXT:    pxor %xmm1, %xmm6
; SSE41-NEXT:    pcmpgtd %xmm2, %xmm5
; SSE41-NEXT:    paddd %xmm0, %xmm2
; SSE41-NEXT:    pcmpgtd %xmm2, %xmm0
; SSE41-NEXT:    pxor %xmm5, %xmm0
; SSE41-NEXT:    movq %xmm2, 16(%rcx)
; SSE41-NEXT:    movdqa %xmm4, (%rcx)
; SSE41-NEXT:    movq %xmm0, 16(%rdi)
; SSE41-NEXT:    movdqa %xmm6, (%rdi)
; SSE41-NEXT:    retq
;
; AVX1-LABEL: saddo_v6i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT:    vpcmpgtd %xmm2, %xmm3, %xmm4
; AVX1-NEXT:    vpcmpgtd %xmm1, %xmm3, %xmm3
; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm3, %ymm3
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
; AVX1-NEXT:    vpaddd %xmm2, %xmm4, %xmm2
; AVX1-NEXT:    vpcmpgtd %xmm2, %xmm4, %xmm4
; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
; AVX1-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
; AVX1-NEXT:    vxorps %ymm0, %ymm3, %ymm0
; AVX1-NEXT:    vmovq %xmm2, 16(%rdi)
; AVX1-NEXT:    vmovdqa %xmm1, (%rdi)
; AVX1-NEXT:    retq
;
; AVX2-LABEL: saddo_v6i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT:    vpcmpgtd %ymm1, %ymm2, %ymm2
; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm1
; AVX2-NEXT:    vpcmpgtd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpxor %ymm0, %ymm2, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX2-NEXT:    vmovq %xmm2, 16(%rdi)
; AVX2-NEXT:    vmovdqa %xmm1, (%rdi)
; AVX2-NEXT:    retq
;
; AVX512-LABEL: saddo_v6i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512-NEXT:    vpcmpgtd %ymm1, %ymm2, %k0
; AVX512-NEXT:    vpaddd %ymm1, %ymm0, %ymm1
; AVX512-NEXT:    vpcmpgtd %ymm1, %ymm0, %k1
; AVX512-NEXT:    kxorw %k1, %k0, %k1
; AVX512-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX512-NEXT:    vmovq %xmm2, 16(%rdi)
; AVX512-NEXT:    vmovdqa %xmm1, (%rdi)
; AVX512-NEXT:    retq
  %t = call {<6 x i32>, <6 x i1>} @llvm.sadd.with.overflow.v6i32(<6 x i32> %a0, <6 x i32> %a1)
  %val = extractvalue {<6 x i32>, <6 x i1>} %t, 0
  %obit = extractvalue {<6 x i32>, <6 x i1>} %t, 1
  %res = sext <6 x i1> %obit to <6 x i32>
  store <6 x i32> %val, <6 x i32>* %p2
  ret <6 x i32> %res
}

define <8 x i32> @saddo_v8i32(<8 x i32> %a0, <8 x i32> %a1, <8 x i32>* %p2) nounwind {
; SSE-LABEL: saddo_v8i32:
; SSE:       # %bb.0:
; SSE-NEXT:    pxor %xmm4, %xmm4
; SSE-NEXT:    pxor %xmm5, %xmm5
; SSE-NEXT:    pcmpgtd %xmm2, %xmm5
; SSE-NEXT:    paddd %xmm0, %xmm2
; SSE-NEXT:    pcmpgtd %xmm2, %xmm0
; SSE-NEXT:    pxor %xmm5, %xmm0
; SSE-NEXT:    pcmpgtd %xmm3, %xmm4
; SSE-NEXT:    paddd %xmm1, %xmm3
; SSE-NEXT:    pcmpgtd %xmm3, %xmm1
; SSE-NEXT:    pxor %xmm4, %xmm1
; SSE-NEXT:    movdqa %xmm3, 16(%rdi)
; SSE-NEXT:    movdqa %xmm2, (%rdi)
; SSE-NEXT:    retq
;
; AVX1-LABEL: saddo_v8i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT:    vpcmpgtd %xmm2, %xmm3, %xmm4
; AVX1-NEXT:    vpcmpgtd %xmm1, %xmm3, %xmm3
; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm3, %ymm3
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
; AVX1-NEXT:    vpaddd %xmm2, %xmm4, %xmm2
; AVX1-NEXT:    vpcmpgtd %xmm2, %xmm4, %xmm4
; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
; AVX1-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
; AVX1-NEXT:    vxorps %ymm0, %ymm3, %ymm0
; AVX1-NEXT:    vmovdqa %xmm2, 16(%rdi)
; AVX1-NEXT:    vmovdqa %xmm1, (%rdi)
; AVX1-NEXT:    retq
;
; AVX2-LABEL: saddo_v8i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT:    vpcmpgtd %ymm1, %ymm2, %ymm2
; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm1
; AVX2-NEXT:    vpcmpgtd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpxor %ymm0, %ymm2, %ymm0
; AVX2-NEXT:    vmovdqa %ymm1, (%rdi)
; AVX2-NEXT:    retq
;
; AVX512-LABEL: saddo_v8i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512-NEXT:    vpcmpgtd %ymm1, %ymm2, %k0
; AVX512-NEXT:    vpaddd %ymm1, %ymm0, %ymm1
; AVX512-NEXT:    vpcmpgtd %ymm1, %ymm0, %k1
; AVX512-NEXT:    kxorw %k1, %k0, %k1
; AVX512-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512-NEXT:    vmovdqa %ymm1, (%rdi)
; AVX512-NEXT:    retq
  %t = call {<8 x i32>, <8 x i1>} @llvm.sadd.with.overflow.v8i32(<8 x i32> %a0, <8 x i32> %a1)
  %val = extractvalue {<8 x i32>, <8 x i1>} %t, 0
  %obit = extractvalue {<8 x i32>, <8 x i1>} %t, 1
  %res = sext <8 x i1> %obit to <8 x i32>
  store <8 x i32> %val, <8 x i32>* %p2
  ret <8 x i32> %res
}
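; v16i32 takes four xmm (SSE/AVX1) or two ymm (AVX2) additions; on AVX1/AVX2
; the <16 x i1> overflow mask is narrowed with vpackssdw/vpacksswb and then
; re-sign-extended with vpmovsxbd into the result vectors. AVX512 compares a
; single zmm pair into k-registers and materializes the mask with vpternlogd.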
define <16 x i32> @saddo_v16i32(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %p2) nounwind {
; SSE-LABEL: saddo_v16i32:
; SSE:       # %bb.0:
; SSE-NEXT:    pxor %xmm8, %xmm8
; SSE-NEXT:    pxor %xmm9, %xmm9
; SSE-NEXT:    pcmpgtd %xmm4, %xmm9
; SSE-NEXT:    paddd %xmm0, %xmm4
; SSE-NEXT:    pcmpgtd %xmm4, %xmm0
; SSE-NEXT:    pxor %xmm9, %xmm0
; SSE-NEXT:    pxor %xmm9, %xmm9
; SSE-NEXT:    pcmpgtd %xmm5, %xmm9
; SSE-NEXT:    paddd %xmm1, %xmm5
; SSE-NEXT:    pcmpgtd %xmm5, %xmm1
; SSE-NEXT:    pxor %xmm9, %xmm1
; SSE-NEXT:    pxor %xmm9, %xmm9
; SSE-NEXT:    pcmpgtd %xmm6, %xmm9
; SSE-NEXT:    paddd %xmm2, %xmm6
; SSE-NEXT:    pcmpgtd %xmm6, %xmm2
; SSE-NEXT:    pxor %xmm9, %xmm2
; SSE-NEXT:    pcmpgtd %xmm7, %xmm8
; SSE-NEXT:    paddd %xmm3, %xmm7
; SSE-NEXT:    pcmpgtd %xmm7, %xmm3
; SSE-NEXT:    pxor %xmm8, %xmm3
; SSE-NEXT:    movdqa %xmm7, 48(%rdi)
; SSE-NEXT:    movdqa %xmm6, 32(%rdi)
; SSE-NEXT:    movdqa %xmm5, 16(%rdi)
; SSE-NEXT:    movdqa %xmm4, (%rdi)
; SSE-NEXT:    retq
;
; AVX1-LABEL: saddo_v16i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm4
; AVX1-NEXT:    vpxor %xmm5, %xmm5, %xmm5
; AVX1-NEXT:    vpcmpgtd %xmm4, %xmm5, %xmm6
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm7
; AVX1-NEXT:    vpaddd %xmm4, %xmm7, %xmm8
; AVX1-NEXT:    vpcmpgtd %xmm8, %xmm7, %xmm7
; AVX1-NEXT:    vpxor %xmm7, %xmm6, %xmm6
; AVX1-NEXT:    vpcmpgtd %xmm3, %xmm5, %xmm7
; AVX1-NEXT:    vpaddd %xmm3, %xmm1, %xmm3
; AVX1-NEXT:    vpcmpgtd %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vpxor %xmm1, %xmm7, %xmm1
; AVX1-NEXT:    vpackssdw %xmm6, %xmm1, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm6
; AVX1-NEXT:    vpcmpgtd %xmm6, %xmm5, %xmm7
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
; AVX1-NEXT:    vpaddd %xmm6, %xmm4, %xmm6
; AVX1-NEXT:    vpcmpgtd %xmm6, %xmm4, %xmm4
; AVX1-NEXT:    vpxor %xmm4, %xmm7, %xmm4
; AVX1-NEXT:    vpcmpgtd %xmm2, %xmm5, %xmm5
; AVX1-NEXT:    vpaddd %xmm2, %xmm0, %xmm2
; AVX1-NEXT:    vpcmpgtd %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpxor %xmm0, %xmm5, %xmm0
; AVX1-NEXT:    vpackssdw %xmm4, %xmm0, %xmm0
; AVX1-NEXT:    vpacksswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpmovsxbd %xmm0, %xmm4
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
; AVX1-NEXT:    vpmovsxbd %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm4, %ymm0
; AVX1-NEXT:    vpacksswb %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpmovsxbd %xmm1, %xmm4
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
; AVX1-NEXT:    vpmovsxbd %xmm1, %xmm1
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm4, %ymm1
; AVX1-NEXT:    vmovdqa %xmm8, 48(%rdi)
; AVX1-NEXT:    vmovdqa %xmm3, 32(%rdi)
; AVX1-NEXT:    vmovdqa %xmm6, 16(%rdi)
; AVX1-NEXT:    vmovdqa %xmm2, (%rdi)
; AVX1-NEXT:    retq
;
; AVX2-LABEL: saddo_v16i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpxor %xmm4, %xmm4, %xmm4
; AVX2-NEXT:    vpcmpgtd %ymm3, %ymm4, %ymm5
; AVX2-NEXT:    vpaddd %ymm3, %ymm1, %ymm3
; AVX2-NEXT:    vpcmpgtd %ymm3, %ymm1, %ymm1
; AVX2-NEXT:    vpxor %ymm1, %ymm5, %ymm1
; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm5
; AVX2-NEXT:    vpackssdw %xmm5, %xmm1, %xmm1
; AVX2-NEXT:    vpcmpgtd %ymm2, %ymm4, %ymm4
; AVX2-NEXT:    vpaddd %ymm2, %ymm0, %ymm2
; AVX2-NEXT:    vpcmpgtd %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpxor %ymm0, %ymm4, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm4
; AVX2-NEXT:    vpackssdw %xmm4, %xmm0, %xmm0
; AVX2-NEXT:    vpacksswb %xmm0, %xmm0, %xmm0
; AVX2-NEXT:    vpmovsxbd %xmm0, %ymm0
; AVX2-NEXT:    vpacksswb %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vpmovsxbd %xmm1, %ymm1
; AVX2-NEXT:    vmovdqa %ymm3, 32(%rdi)
; AVX2-NEXT:    vmovdqa %ymm2, (%rdi)
; AVX2-NEXT:    retq
;
; AVX512-LABEL: saddo_v16i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512-NEXT:    vpcmpgtd %zmm1, %zmm2, %k0
; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm1
; AVX512-NEXT:    vpcmpgtd %zmm1, %zmm0, %k1
; AVX512-NEXT:    kxorw %k1, %k0, %k1
; AVX512-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512-NEXT:    vmovdqa64 %zmm1, (%rdi)
; AVX512-NEXT:    retq
  %t = call {<16 x i32>, <16 x i1>} @llvm.sadd.with.overflow.v16i32(<16 x i32> %a0, <16 x i32> %a1)
  %val = extractvalue {<16 x i32>, <16 x i1>} %t, 0
  %obit = extractvalue {<16 x i32>, <16 x i1>} %t, 1
  %res = sext <16 x i1> %obit to <16 x i32>
  store <16 x i32> %val, <16 x i32>* %p2
  ret <16 x i32> %res
}
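; For the small-element cases, overflow is detected by comparing the wrapping
; add (paddb) with the saturating add (paddsb): exactly the lanes where the
; two results differ have overflowed.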
define <16 x i32> @saddo_v16i8(<16 x i8> %a0, <16 x i8> %a1, <16 x i8>* %p2) nounwind {
; SSE2-LABEL: saddo_v16i8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    paddsb %xmm1, %xmm2
; SSE2-NEXT:    paddb %xmm1, %xmm0
; SSE2-NEXT:    pcmpeqb %xmm0, %xmm2
; SSE2-NEXT:    pcmpeqd %xmm3, %xmm3
; SSE2-NEXT:    pxor %xmm2, %xmm3
; SSE2-NEXT:    movdqa %xmm3, %xmm1
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    movdqa %xmm1, %xmm4
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3]
; SSE2-NEXT:    pslld $31, %xmm4
; SSE2-NEXT:    psrad $31, %xmm4
; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
; SSE2-NEXT:    pslld $31, %xmm1
; SSE2-NEXT:    psrad $31, %xmm1
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT:    movdqa %xmm3, %xmm2
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3]
; SSE2-NEXT:    pslld $31, %xmm2
; SSE2-NEXT:    psrad $31, %xmm2
; SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7]
; SSE2-NEXT:    pslld $31, %xmm3
; SSE2-NEXT:    psrad $31, %xmm3
; SSE2-NEXT:    movdqa %xmm0, (%rdi)
; SSE2-NEXT:    movdqa %xmm4, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: saddo_v16i8:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movdqa %xmm0, %xmm2
; SSSE3-NEXT:    paddsb %xmm1, %xmm2
; SSSE3-NEXT:    paddb %xmm1, %xmm0
; SSSE3-NEXT:    pcmpeqb %xmm0, %xmm2
; SSSE3-NEXT:    pcmpeqd %xmm3, %xmm3
; SSSE3-NEXT:    pxor %xmm2, %xmm3
; SSSE3-NEXT:    movdqa %xmm3, %xmm1
; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSSE3-NEXT:    movdqa %xmm1, %xmm4
; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3]
; SSSE3-NEXT:    pslld $31, %xmm4
; SSSE3-NEXT:    psrad $31, %xmm4
; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
; SSSE3-NEXT:    pslld $31, %xmm1
; SSSE3-NEXT:    psrad $31, %xmm1
; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSSE3-NEXT:    movdqa %xmm3, %xmm2
; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3]
; SSSE3-NEXT:    pslld $31, %xmm2
; SSSE3-NEXT:    psrad $31, %xmm2
; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7]
; SSSE3-NEXT:    pslld $31, %xmm3
; SSSE3-NEXT:    psrad $31, %xmm3
; SSSE3-NEXT:    movdqa %xmm0, (%rdi)
; SSSE3-NEXT:    movdqa %xmm4, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: saddo_v16i8:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa %xmm0, %xmm2
; SSE41-NEXT:    paddsb %xmm1, %xmm2
; SSE41-NEXT:    paddb %xmm1, %xmm0
; SSE41-NEXT:    pcmpeqb %xmm0, %xmm2
; SSE41-NEXT:    pcmpeqd %xmm3, %xmm3
; SSE41-NEXT:    pxor %xmm2, %xmm3
; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm4 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
; SSE41-NEXT:    pslld $31, %xmm4
; SSE41-NEXT:    psrad $31, %xmm4
; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1]
; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SSE41-NEXT:    pslld $31, %xmm1
; SSE41-NEXT:    psrad $31, %xmm1
; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3]
; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
; SSE41-NEXT:    pslld $31, %xmm2
; SSE41-NEXT:    psrad $31, %xmm2
; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[3,3,3,3]
; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
; SSE41-NEXT:    pslld $31, %xmm3
; SSE41-NEXT:    psrad $31, %xmm3
; SSE41-NEXT:    movdqa %xmm0, (%rdi)
; SSE41-NEXT:    movdqa %xmm4, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: saddo_v16i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpaddsb %xmm1, %xmm0, %xmm2
; AVX1-NEXT:    vpaddb %xmm1, %xmm0, %xmm3
; AVX1-NEXT:    vpcmpeqb %xmm2, %xmm3, %xmm0
; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm1
; AVX1-NEXT:    vpmovsxbd %xmm1, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
; AVX1-NEXT:    vpmovsxbd %xmm2, %xmm2
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
; AVX1-NEXT:    vpmovsxbd %xmm2, %xmm2
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[3,3,3,3]
; AVX1-NEXT:    vpmovsxbd %xmm1, %xmm1
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
; AVX1-NEXT:    vmovdqa %xmm3, (%rdi)
; AVX1-NEXT:    retq
;
; AVX2-LABEL: saddo_v16i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpaddsb %xmm1, %xmm0, %xmm2
; AVX2-NEXT:    vpaddb %xmm1, %xmm0, %xmm3
; AVX2-NEXT:    vpcmpeqb %xmm2, %xmm3, %xmm0
; AVX2-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm1
; AVX2-NEXT:    vpmovsxbd %xmm1, %ymm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX2-NEXT:    vpmovsxbd %xmm1, %ymm1
; AVX2-NEXT:    vmovdqa %xmm3, (%rdi)
; AVX2-NEXT:    retq
;
; AVX512-LABEL: saddo_v16i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpaddsb %xmm1, %xmm0, %xmm2
; AVX512-NEXT:    vpaddb %xmm1, %xmm0, %xmm1
; AVX512-NEXT:    vpcmpneqb %xmm2, %xmm1, %k1
; AVX512-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512-NEXT:    vmovdqa %xmm1, (%rdi)
; AVX512-NEXT:    retq
  %t = call {<16 x i8>, <16 x i1>} @llvm.sadd.with.overflow.v16i8(<16 x i8> %a0, <16 x i8> %a1)
  %val = extractvalue {<16 x i8>, <16 x i1>} %t, 0
  %obit = extractvalue {<16 x i8>, <16 x i1>} %t, 1
  %res = sext <16 x i1> %obit to <16 x i32>
  store <16 x i8> %val, <16 x i8>* %p2
  ret <16 x i32> %res
}
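; The same wrapping-vs-saturating comparison, on i16 lanes (paddw/paddsw).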
define <8 x i32> @saddo_v8i16(<8 x i16> %a0, <8 x i16> %a1, <8 x i16>* %p2) nounwind {
; SSE2-LABEL: saddo_v8i16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    paddsw %xmm1, %xmm2
; SSE2-NEXT:    paddw %xmm1, %xmm0
; SSE2-NEXT:    pcmpeqw %xmm0, %xmm2
; SSE2-NEXT:    pcmpeqd %xmm1, %xmm1
; SSE2-NEXT:    pxor %xmm2, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm2
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3]
; SSE2-NEXT:    pslld $31, %xmm2
; SSE2-NEXT:    psrad $31, %xmm2
; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
; SSE2-NEXT:    pslld $31, %xmm1
; SSE2-NEXT:    psrad $31, %xmm1
; SSE2-NEXT:    movdqa %xmm0, (%rdi)
; SSE2-NEXT:    movdqa %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: saddo_v8i16:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movdqa %xmm0, %xmm2
; SSSE3-NEXT:    paddsw %xmm1, %xmm2
; SSSE3-NEXT:    paddw %xmm1, %xmm0
; SSSE3-NEXT:    pcmpeqw %xmm0, %xmm2
; SSSE3-NEXT:    pcmpeqd %xmm1, %xmm1
; SSSE3-NEXT:    pxor %xmm2, %xmm1
; SSSE3-NEXT:    movdqa %xmm1, %xmm2
; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3]
; SSSE3-NEXT:    pslld $31, %xmm2
; SSSE3-NEXT:    psrad $31, %xmm2
; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
; SSSE3-NEXT:    pslld $31, %xmm1
; SSSE3-NEXT:    psrad $31, %xmm1
; SSSE3-NEXT:    movdqa %xmm0, (%rdi)
; SSSE3-NEXT:    movdqa %xmm2, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: saddo_v8i16:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa %xmm0, %xmm2
; SSE41-NEXT:    paddsw %xmm1, %xmm2
; SSE41-NEXT:    paddw %xmm1, %xmm0
; SSE41-NEXT:    pcmpeqw %xmm0, %xmm2
; SSE41-NEXT:    pcmpeqd %xmm1, %xmm1
; SSE41-NEXT:    pxor %xmm2, %xmm1
; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; SSE41-NEXT:    pslld $31, %xmm2
; SSE41-NEXT:    psrad $31, %xmm2
; SSE41-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
; SSE41-NEXT:    pslld $31, %xmm1
; SSE41-NEXT:    psrad $31, %xmm1
; SSE41-NEXT:    movdqa %xmm0, (%rdi)
; SSE41-NEXT:    movdqa %xmm2, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: saddo_v8i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpaddsw %xmm1, %xmm0, %xmm2
; AVX1-NEXT:    vpaddw %xmm1, %xmm0, %xmm1
; AVX1-NEXT:    vpcmpeqw %xmm2, %xmm1, %xmm0
; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpxor %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm2
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
; AVX1-NEXT:    vmovdqa %xmm1, (%rdi)
; AVX1-NEXT:    retq
;
; AVX2-LABEL: saddo_v8i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpaddsw %xmm1, %xmm0, %xmm2
; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm1
; AVX2-NEXT:    vpcmpeqw %xmm2, %xmm1, %xmm0
; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpmovsxwd %xmm0, %ymm0
; AVX2-NEXT:    vmovdqa %xmm1, (%rdi)
; AVX2-NEXT:    retq
;
; AVX512-LABEL: saddo_v8i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpaddsw %xmm1, %xmm0, %xmm2
; AVX512-NEXT:    vpaddw %xmm1, %xmm0, %xmm1
; AVX512-NEXT:    vpcmpneqw %xmm2, %xmm1, %k1
; AVX512-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512-NEXT:    vmovdqa %xmm1, (%rdi)
; AVX512-NEXT:    retq
  %t = call {<8 x i16>, <8 x i1>} @llvm.sadd.with.overflow.v8i16(<8 x i16> %a0, <8 x i16> %a1)
  %val = extractvalue {<8 x i16>, <8 x i1>} %t, 0
  %obit = extractvalue {<8 x i16>, <8 x i1>} %t, 1
  %res = sext <8 x i1> %obit to <8 x i32>
  store <8 x i16> %val, <8 x i16>* %p2
  ret <8 x i32> %res
}
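; SSE has no pcmpgtq, so the signed i64 compare is emulated: the low dwords
; are biased by 0x80000000 and the 64-bit greater-than is assembled as
; hi_gt | (hi_eq & lo_gt) out of pcmpgtd/pcmpeqd. AVX can use vpcmpgtq
; directly.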
define <2 x i32> @saddo_v2i64(<2 x i64> %a0, <2 x i64> %a1, <2 x i64>* %p2) nounwind {
; SSE-LABEL: saddo_v2i64:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
; SSE-NEXT:    movdqa %xmm0, %xmm3
; SSE-NEXT:    pxor %xmm2, %xmm3
; SSE-NEXT:    paddq %xmm1, %xmm0
; SSE-NEXT:    pxor %xmm0, %xmm2
; SSE-NEXT:    movdqa %xmm3, %xmm4
; SSE-NEXT:    pcmpgtd %xmm2, %xmm4
; SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
; SSE-NEXT:    pcmpeqd %xmm3, %xmm2
; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSE-NEXT:    pand %xmm5, %xmm2
; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
; SSE-NEXT:    por %xmm2, %xmm3
; SSE-NEXT:    pxor %xmm2, %xmm2
; SSE-NEXT:    pcmpgtd %xmm1, %xmm2
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
; SSE-NEXT:    pxor %xmm3, %xmm1
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE-NEXT:    movdqa %xmm0, (%rdi)
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: saddo_v2i64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm2, %xmm2
; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm1
; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpxor %xmm0, %xmm2, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX1-NEXT:    vmovdqa %xmm1, (%rdi)
; AVX1-NEXT:    retq
;
; AVX2-LABEL: saddo_v2i64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT:    vpcmpgtq %xmm1, %xmm2, %xmm2
; AVX2-NEXT:    vpaddq %xmm1, %xmm0, %xmm1
; AVX2-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpxor %xmm0, %xmm2, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX2-NEXT:    vmovdqa %xmm1, (%rdi)
; AVX2-NEXT:    retq
;
; AVX512-LABEL: saddo_v2i64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512-NEXT:    vpcmpgtq %xmm1, %xmm2, %k0
; AVX512-NEXT:    vpaddq %xmm1, %xmm0, %xmm1
; AVX512-NEXT:    vpcmpgtq %xmm1, %xmm0, %k1
; AVX512-NEXT:    kxorw %k1, %k0, %k1
; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512-NEXT:    vmovdqa %xmm1, (%rdi)
; AVX512-NEXT:    retq
  %t = call {<2 x i64>, <2 x i1>} @llvm.sadd.with.overflow.v2i64(<2 x i64> %a0, <2 x i64> %a1)
  %val = extractvalue {<2 x i64>, <2 x i1>} %t, 0
  %obit = extractvalue {<2 x i64>, <2 x i1>} %t, 1
  %res = sext <2 x i1> %obit to <2 x i32>
  store <2 x i64> %val, <2 x i64>* %p2
  ret <2 x i32> %res
}
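; i24 lanes live in i32 elements: the operands are sign-extended in place
; with pslld $8 / psrad $8, added as i32, and a lane overflows iff
; re-truncating the sum changes it. The 24-bit results are stored as a movw
; of the low half plus a movb of the high byte.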
define <4 x i32> @saddo_v4i24(<4 x i24> %a0, <4 x i24> %a1, <4 x i24>* %p2) nounwind {
; SSE2-LABEL: saddo_v4i24:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    pslld $8, %xmm1
; SSE2-NEXT:    psrad $8, %xmm1
; SSE2-NEXT:    pslld $8, %xmm2
; SSE2-NEXT:    psrad $8, %xmm2
; SSE2-NEXT:    paddd %xmm1, %xmm2
; SSE2-NEXT:    movdqa %xmm2, %xmm0
; SSE2-NEXT:    pslld $8, %xmm0
; SSE2-NEXT:    psrad $8, %xmm0
; SSE2-NEXT:    pcmpeqd %xmm2, %xmm0
; SSE2-NEXT:    pcmpeqd %xmm1, %xmm1
; SSE2-NEXT:    pxor %xmm1, %xmm0
; SSE2-NEXT:    movd %xmm2, %eax
; SSE2-NEXT:    movw %ax, (%rdi)
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[3,3,3,3]
; SSE2-NEXT:    movd %xmm1, %ecx
; SSE2-NEXT:    movw %cx, 9(%rdi)
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3]
; SSE2-NEXT:    movd %xmm1, %edx
; SSE2-NEXT:    movw %dx, 6(%rdi)
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1]
; SSE2-NEXT:    movd %xmm1, %esi
; SSE2-NEXT:    movw %si, 3(%rdi)
; SSE2-NEXT:    shrl $16, %eax
; SSE2-NEXT:    movb %al, 2(%rdi)
; SSE2-NEXT:    shrl $16, %ecx
; SSE2-NEXT:    movb %cl, 11(%rdi)
; SSE2-NEXT:    shrl $16, %edx
; SSE2-NEXT:    movb %dl, 8(%rdi)
; SSE2-NEXT:    shrl $16, %esi
; SSE2-NEXT:    movb %sil, 5(%rdi)
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: saddo_v4i24:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movdqa %xmm0, %xmm2
; SSSE3-NEXT:    pslld $8, %xmm1
; SSSE3-NEXT:    psrad $8, %xmm1
; SSSE3-NEXT:    pslld $8, %xmm2
; SSSE3-NEXT:    psrad $8, %xmm2
; SSSE3-NEXT:    paddd %xmm1, %xmm2
; SSSE3-NEXT:    movdqa %xmm2, %xmm0
; SSSE3-NEXT:    pslld $8, %xmm0
; SSSE3-NEXT:    psrad $8, %xmm0
; SSSE3-NEXT:    pcmpeqd %xmm2, %xmm0
; SSSE3-NEXT:    pcmpeqd %xmm1, %xmm1
; SSSE3-NEXT:    pxor %xmm1, %xmm0
; SSSE3-NEXT:    movd %xmm2, %eax
; SSSE3-NEXT:    movw %ax, (%rdi)
; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[3,3,3,3]
; SSSE3-NEXT:    movd %xmm1, %ecx
; SSSE3-NEXT:    movw %cx, 9(%rdi)
; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3]
; SSSE3-NEXT:    movd %xmm1, %edx
; SSSE3-NEXT:    movw %dx, 6(%rdi)
; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1]
; SSSE3-NEXT:    movd %xmm1, %esi
; SSSE3-NEXT:    movw %si, 3(%rdi)
; SSSE3-NEXT:    shrl $16, %eax
; SSSE3-NEXT:    movb %al, 2(%rdi)
; SSSE3-NEXT:    shrl $16, %ecx
; SSSE3-NEXT:    movb %cl, 11(%rdi)
; SSSE3-NEXT:    shrl $16, %edx
; SSSE3-NEXT:    movb %dl, 8(%rdi)
; SSSE3-NEXT:    shrl $16, %esi
; SSSE3-NEXT:    movb %sil, 5(%rdi)
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: saddo_v4i24:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa %xmm0, %xmm2
; SSE41-NEXT:    pslld $8, %xmm1
; SSE41-NEXT:    psrad $8, %xmm1
; SSE41-NEXT:    pslld $8, %xmm2
; SSE41-NEXT:    psrad $8, %xmm2
; SSE41-NEXT:    paddd %xmm1, %xmm2
; SSE41-NEXT:    movdqa %xmm2, %xmm0
; SSE41-NEXT:    pslld $8, %xmm0
; SSE41-NEXT:    psrad $8, %xmm0
; SSE41-NEXT:    pcmpeqd %xmm2, %xmm0
; SSE41-NEXT:    pcmpeqd %xmm1, %xmm1
; SSE41-NEXT:    pxor %xmm1, %xmm0
; SSE41-NEXT:    pextrd $3, %xmm2, %eax
; SSE41-NEXT:    movw %ax, 9(%rdi)
; SSE41-NEXT:    pextrd $2, %xmm2, %ecx
; SSE41-NEXT:    movw %cx, 6(%rdi)
; SSE41-NEXT:    pextrd $1, %xmm2, %edx
; SSE41-NEXT:    movw %dx, 3(%rdi)
; SSE41-NEXT:    movd %xmm2, %esi
; SSE41-NEXT:    movw %si, (%rdi)
; SSE41-NEXT:    shrl $16, %eax
; SSE41-NEXT:    movb %al, 11(%rdi)
; SSE41-NEXT:    shrl $16, %ecx
; SSE41-NEXT:    movb %cl, 8(%rdi)
; SSE41-NEXT:    shrl $16, %edx
; SSE41-NEXT:    movb %dl, 5(%rdi)
; SSE41-NEXT:    shrl $16, %esi
; SSE41-NEXT:    movb %sil, 2(%rdi)
; SSE41-NEXT:    retq
;
; AVX1-LABEL: saddo_v4i24:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpslld $8, %xmm1, %xmm1
; AVX1-NEXT:    vpsrad $8, %xmm1, %xmm1
; AVX1-NEXT:    vpslld $8, %xmm0, %xmm0
; AVX1-NEXT:    vpsrad $8, %xmm0, %xmm0
; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
; AVX1-NEXT:    vpslld $8, %xmm1, %xmm0
; AVX1-NEXT:    vpsrad $8, %xmm0, %xmm0
; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpxor %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpextrd $3, %xmm1, %eax
; AVX1-NEXT:    movw %ax, 9(%rdi)
; AVX1-NEXT:    vpextrd $2, %xmm1, %ecx
; AVX1-NEXT:    movw %cx, 6(%rdi)
; AVX1-NEXT:    vpextrd $1, %xmm1, %edx
; AVX1-NEXT:    movw %dx, 3(%rdi)
; AVX1-NEXT:    vmovd %xmm1, %esi
; AVX1-NEXT:    movw %si, (%rdi)
; AVX1-NEXT:    shrl $16, %eax
; AVX1-NEXT:    movb %al, 11(%rdi)
; AVX1-NEXT:    shrl $16, %ecx
; AVX1-NEXT:    movb %cl, 8(%rdi)
; AVX1-NEXT:    shrl $16, %edx
; AVX1-NEXT:    movb %dl, 5(%rdi)
; AVX1-NEXT:    shrl $16, %esi
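; i1 lanes are sign-extended with pslld $31 / psrad $31 and the 4-bit value
; is stored via the movmskps bitmask. AVX512 stays in mask registers, where
; the i1 sum is kxorw and the overflow bit is (a|b) ^ (a^b), i.e. a & b.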
; AVX1-NEXT:    movb %sil, 2(%rdi)
; AVX1-NEXT:    retq
;
; AVX2-LABEL: saddo_v4i24:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpslld $8, %xmm1, %xmm1
; AVX2-NEXT:    vpsrad $8, %xmm1, %xmm1
; AVX2-NEXT:    vpslld $8, %xmm0, %xmm0
; AVX2-NEXT:    vpsrad $8, %xmm0, %xmm0
; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
; AVX2-NEXT:    vpslld $8, %xmm1, %xmm0
; AVX2-NEXT:    vpsrad $8, %xmm0, %xmm0
; AVX2-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpextrd $3, %xmm1, %eax
; AVX2-NEXT:    movw %ax, 9(%rdi)
; AVX2-NEXT:    vpextrd $2, %xmm1, %ecx
; AVX2-NEXT:    movw %cx, 6(%rdi)
; AVX2-NEXT:    vpextrd $1, %xmm1, %edx
; AVX2-NEXT:    movw %dx, 3(%rdi)
; AVX2-NEXT:    vmovd %xmm1, %esi
; AVX2-NEXT:    movw %si, (%rdi)
; AVX2-NEXT:    shrl $16, %eax
; AVX2-NEXT:    movb %al, 11(%rdi)
; AVX2-NEXT:    shrl $16, %ecx
; AVX2-NEXT:    movb %cl, 8(%rdi)
; AVX2-NEXT:    shrl $16, %edx
; AVX2-NEXT:    movb %dl, 5(%rdi)
; AVX2-NEXT:    shrl $16, %esi
; AVX2-NEXT:    movb %sil, 2(%rdi)
; AVX2-NEXT:    retq
;
; AVX512-LABEL: saddo_v4i24:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpslld $8, %xmm1, %xmm1
; AVX512-NEXT:    vpsrad $8, %xmm1, %xmm1
; AVX512-NEXT:    vpslld $8, %xmm0, %xmm0
; AVX512-NEXT:    vpsrad $8, %xmm0, %xmm0
; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
; AVX512-NEXT:    vpslld $8, %xmm1, %xmm0
; AVX512-NEXT:    vpsrad $8, %xmm0, %xmm0
; AVX512-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpternlogq $15, %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    vpextrd $3, %xmm1, %eax
; AVX512-NEXT:    movw %ax, 9(%rdi)
; AVX512-NEXT:    vpextrd $2, %xmm1, %ecx
; AVX512-NEXT:    movw %cx, 6(%rdi)
; AVX512-NEXT:    vpextrd $1, %xmm1, %edx
; AVX512-NEXT:    movw %dx, 3(%rdi)
; AVX512-NEXT:    vmovd %xmm1, %esi
; AVX512-NEXT:    movw %si, (%rdi)
; AVX512-NEXT:    shrl $16, %eax
; AVX512-NEXT:    movb %al, 11(%rdi)
; AVX512-NEXT:    shrl $16, %ecx
; AVX512-NEXT:    movb %cl, 8(%rdi)
; AVX512-NEXT:    shrl $16, %edx
; AVX512-NEXT:    movb %dl, 5(%rdi)
; AVX512-NEXT:    shrl $16, %esi
; AVX512-NEXT:    movb %sil, 2(%rdi)
; AVX512-NEXT:    retq
  %t = call {<4 x i24>, <4 x i1>} @llvm.sadd.with.overflow.v4i24(<4 x i24> %a0, <4 x i24> %a1)
  %val = extractvalue {<4 x i24>, <4 x i1>} %t, 0
  %obit = extractvalue {<4 x i24>, <4 x i1>} %t, 1
  %res = sext <4 x i1> %obit to <4 x i32>
  store <4 x i24> %val, <4 x i24>* %p2
  ret <4 x i32> %res
}

define <4 x i32> @saddo_v4i1(<4 x i1> %a0, <4 x i1> %a1, <4 x i1>* %p2) nounwind {
; SSE-LABEL: saddo_v4i1:
; SSE:       # %bb.0:
; SSE-NEXT:    pslld $31, %xmm1
; SSE-NEXT:    psrad $31, %xmm1
; SSE-NEXT:    pslld $31, %xmm0
; SSE-NEXT:    psrad $31, %xmm0
; SSE-NEXT:    paddd %xmm1, %xmm0
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    pslld $31, %xmm1
; SSE-NEXT:    movmskps %xmm1, %eax
; SSE-NEXT:    psrad $31, %xmm1
; SSE-NEXT:    pcmpeqd %xmm0, %xmm1
; SSE-NEXT:    pcmpeqd %xmm0, %xmm0
; SSE-NEXT:    pxor %xmm0, %xmm1
; SSE-NEXT:    movb %al, (%rdi)
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: saddo_v4i1:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpslld $31, %xmm1, %xmm1
; AVX1-NEXT:    vpsrad $31, %xmm1, %xmm1
; AVX1-NEXT:    vpslld $31, %xmm0, %xmm0
; AVX1-NEXT:    vpsrad $31, %xmm0, %xmm0
; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpslld $31, %xmm0, %xmm1
; AVX1-NEXT:    vpsrad $31, %xmm1, %xmm2
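; v2i128 is scalarized: each lane is an addq/adcq pair with seto capturing
; the signed-overflow flag of the high half, and the 32-byte sum is stored
; through the out-pointer passed on the stack (%r10).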
; AVX1-NEXT:    vpcmpeqd %xmm0, %xmm2, %xmm0
; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpxor %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vmovmskps %xmm1, %eax
; AVX1-NEXT:    movb %al, (%rdi)
; AVX1-NEXT:    retq
;
; AVX2-LABEL: saddo_v4i1:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpslld $31, %xmm1, %xmm1
; AVX2-NEXT:    vpsrad $31, %xmm1, %xmm1
; AVX2-NEXT:    vpslld $31, %xmm0, %xmm0
; AVX2-NEXT:    vpsrad $31, %xmm0, %xmm0
; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpslld $31, %xmm0, %xmm1
; AVX2-NEXT:    vpsrad $31, %xmm1, %xmm2
; AVX2-NEXT:    vpcmpeqd %xmm0, %xmm2, %xmm0
; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vmovmskps %xmm1, %eax
; AVX2-NEXT:    movb %al, (%rdi)
; AVX2-NEXT:    retq
;
; AVX512-LABEL: saddo_v4i1:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpslld $31, %xmm1, %xmm1
; AVX512-NEXT:    vpslld $31, %xmm0, %xmm0
; AVX512-NEXT:    vpor %xmm1, %xmm0, %xmm2
; AVX512-NEXT:    vptestmd %xmm0, %xmm0, %k0
; AVX512-NEXT:    vptestmd %xmm1, %xmm1, %k1
; AVX512-NEXT:    vptestmd %xmm2, %xmm2, %k2
; AVX512-NEXT:    kxorw %k1, %k0, %k0
; AVX512-NEXT:    kxorw %k2, %k0, %k1
; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512-NEXT:    kshiftlw $12, %k0, %k0
; AVX512-NEXT:    kshiftrw $12, %k0, %k0
; AVX512-NEXT:    kmovd %k0, %eax
; AVX512-NEXT:    movb %al, (%rdi)
; AVX512-NEXT:    retq
  %t = call {<4 x i1>, <4 x i1>} @llvm.sadd.with.overflow.v4i1(<4 x i1> %a0, <4 x i1> %a1)
  %val = extractvalue {<4 x i1>, <4 x i1>} %t, 0
  %obit = extractvalue {<4 x i1>, <4 x i1>} %t, 1
  %res = sext <4 x i1> %obit to <4 x i32>
  store <4 x i1> %val, <4 x i1>* %p2
  ret <4 x i32> %res
}

define <2 x i32> @saddo_v2i128(<2 x i128> %a0, <2 x i128> %a1, <2 x i128>* %p2) nounwind {
; SSE2-LABEL: saddo_v2i128:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %r10
; SSE2-NEXT:    addq %r8, %rdi
; SSE2-NEXT:    adcq %r9, %rsi
; SSE2-NEXT:    seto %r8b
; SSE2-NEXT:    addq {{[0-9]+}}(%rsp), %rdx
; SSE2-NEXT:    adcq {{[0-9]+}}(%rsp), %rcx
; SSE2-NEXT:    seto %al
; SSE2-NEXT:    movzbl %al, %eax
; SSE2-NEXT:    negl %eax
; SSE2-NEXT:    movd %eax, %xmm1
; SSE2-NEXT:    movzbl %r8b, %eax
; SSE2-NEXT:    negl %eax
; SSE2-NEXT:    movd %eax, %xmm0
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    movq %rdx, 16(%r10)
; SSE2-NEXT:    movq %rdi, (%r10)
; SSE2-NEXT:    movq %rcx, 24(%r10)
; SSE2-NEXT:    movq %rsi, 8(%r10)
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: saddo_v2i128:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movq {{[0-9]+}}(%rsp), %r10
; SSSE3-NEXT:    addq %r8, %rdi
; SSSE3-NEXT:    adcq %r9, %rsi
; SSSE3-NEXT:    seto %r8b
; SSSE3-NEXT:    addq {{[0-9]+}}(%rsp), %rdx
; SSSE3-NEXT:    adcq {{[0-9]+}}(%rsp), %rcx
; SSSE3-NEXT:    seto %al
; SSSE3-NEXT:    movzbl %al, %eax
; SSSE3-NEXT:    negl %eax
; SSSE3-NEXT:    movd %eax, %xmm1
; SSSE3-NEXT:    movzbl %r8b, %eax
; SSSE3-NEXT:    negl %eax
; SSSE3-NEXT:    movd %eax, %xmm0
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSSE3-NEXT:    movq %rdx, 16(%r10)
; SSSE3-NEXT:    movq %rdi, (%r10)
; SSSE3-NEXT:    movq %rcx, 24(%r10)
; SSSE3-NEXT:    movq %rsi, 8(%r10)
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: saddo_v2i128:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movq {{[0-9]+}}(%rsp), %r10
; SSE41-NEXT:    addq %r8, %rdi
; SSE41-NEXT:    adcq %r9, %rsi
; SSE41-NEXT:    seto %r8b
; SSE41-NEXT:    addq {{[0-9]+}}(%rsp), %rdx
; SSE41-NEXT:    adcq {{[0-9]+}}(%rsp), %rcx
; SSE41-NEXT:    seto %al
; SSE41-NEXT:    movzbl %al, %r9d
; SSE41-NEXT:    negl %r9d
; SSE41-NEXT:    movzbl %r8b, %eax
; SSE41-NEXT:    negl %eax
; SSE41-NEXT:    movd %eax, %xmm0
; SSE41-NEXT:    pinsrd $1, %r9d, %xmm0
; SSE41-NEXT:    movq %rdx, 16(%r10)
; SSE41-NEXT:    movq %rdi, (%r10)
; SSE41-NEXT:    movq %rcx, 24(%r10)
; SSE41-NEXT:    movq %rsi, 8(%r10)
; SSE41-NEXT:    retq
;
; AVX1-LABEL: saddo_v2i128:
; AVX1:       # %bb.0:
; AVX1-NEXT:    movq {{[0-9]+}}(%rsp), %r10
; AVX1-NEXT:    addq %r8, %rdi
; AVX1-NEXT:    adcq %r9, %rsi
; AVX1-NEXT:    seto %r8b
; AVX1-NEXT:    addq {{[0-9]+}}(%rsp), %rdx
; AVX1-NEXT:    adcq {{[0-9]+}}(%rsp), %rcx
; AVX1-NEXT:    seto %al
; AVX1-NEXT:    movzbl %al, %r9d
; AVX1-NEXT:    negl %r9d
; AVX1-NEXT:    movzbl %r8b, %eax
; AVX1-NEXT:    negl %eax
; AVX1-NEXT:    vmovd %eax, %xmm0
; AVX1-NEXT:    vpinsrd $1, %r9d, %xmm0, %xmm0
; AVX1-NEXT:    movq %rdx, 16(%r10)
; AVX1-NEXT:    movq %rdi, (%r10)
; AVX1-NEXT:    movq %rcx, 24(%r10)
; AVX1-NEXT:    movq %rsi, 8(%r10)
; AVX1-NEXT:    retq
;
; AVX2-LABEL: saddo_v2i128:
; AVX2:       # %bb.0:
; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r10
; AVX2-NEXT:    addq %r8, %rdi
; AVX2-NEXT:    adcq %r9, %rsi
; AVX2-NEXT:    seto %r8b
; AVX2-NEXT:    addq {{[0-9]+}}(%rsp), %rdx
; AVX2-NEXT:    adcq {{[0-9]+}}(%rsp), %rcx
; AVX2-NEXT:    seto %al
; AVX2-NEXT:    movzbl %al, %r9d
; AVX2-NEXT:    negl %r9d
; AVX2-NEXT:    movzbl %r8b, %eax
; AVX2-NEXT:    negl %eax
; AVX2-NEXT:    vmovd %eax, %xmm0
; AVX2-NEXT:    vpinsrd $1, %r9d, %xmm0, %xmm0
; AVX2-NEXT:    movq %rdx, 16(%r10)
; AVX2-NEXT:    movq %rdi, (%r10)
; AVX2-NEXT:    movq %rcx, 24(%r10)
; AVX2-NEXT:    movq %rsi, 8(%r10)
; AVX2-NEXT:    retq
;
; AVX512-LABEL: saddo_v2i128:
; AVX512:       # %bb.0:
; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %r10
; AVX512-NEXT:    addq {{[0-9]+}}(%rsp), %rdx
; AVX512-NEXT:    adcq {{[0-9]+}}(%rsp), %rcx
; AVX512-NEXT:    seto %al
; AVX512-NEXT:    kmovd %eax, %k0
; AVX512-NEXT:    addq %r8, %rdi
; AVX512-NEXT:    adcq %r9, %rsi
; AVX512-NEXT:    seto %al
; AVX512-NEXT:    andl $1, %eax
; AVX512-NEXT:    kmovw %eax, %k1
; AVX512-NEXT:    kshiftlw $1, %k0, %k0
; AVX512-NEXT:    korw %k0, %k1, %k1
; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512-NEXT:    movq %rdx, 16(%r10)
; AVX512-NEXT:    movq %rdi, (%r10)
; AVX512-NEXT:    movq %rcx, 24(%r10)
; AVX512-NEXT:    movq %rsi, 8(%r10)
; AVX512-NEXT:    retq
  %t = call {<2 x i128>, <2 x i1>} @llvm.sadd.with.overflow.v2i128(<2 x i128> %a0, <2 x i128> %a1)
  %val = extractvalue {<2 x i128>, <2 x i1>} %t, 0
  %obit = extractvalue {<2 x i128>, <2 x i1>} %t, 1
  %res = sext <2 x i1> %obit to <2 x i32>
  store <2 x i128> %val, <2 x i128>* %p2
  ret <2 x i32> %res
}