; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX

; fold (add x, 0) -> x
define <4 x i32> @combine_vec_add_to_zero(<4 x i32> %a) {
; SSE-LABEL: combine_vec_add_to_zero:
; SSE:       # %bb.0:
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_add_to_zero:
; AVX:       # %bb.0:
; AVX-NEXT:    retq
  %1 = add <4 x i32> %a, zeroinitializer
  ret <4 x i32> %1
}

; fold ((c1-A)+c2) -> (c1+c2)-A
define <4 x i32> @combine_vec_add_constant_sub(<4 x i32> %a) {
; SSE-LABEL: combine_vec_add_constant_sub:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [0,2,4,6]
; SSE-NEXT:    psubd %xmm0, %xmm1
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_add_constant_sub:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,2,4,6]
; AVX-NEXT:    vpsubd %xmm0, %xmm1, %xmm0
; AVX-NEXT:    retq
  %1 = sub <4 x i32> <i32 0, i32 1, i32 2, i32 3>, %a
  %2 = add <4 x i32> <i32 0, i32 1, i32 2, i32 3>, %1
  ret <4 x i32> %2
}

; fold ((0-A) + B) -> B-A
define <4 x i32> @combine_vec_add_neg0(<4 x i32> %a, <4 x i32> %b) {
; SSE-LABEL: combine_vec_add_neg0:
; SSE:       # %bb.0:
; SSE-NEXT:    psubd %xmm0, %xmm1
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_add_neg0:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsubd %xmm0, %xmm1, %xmm0
; AVX-NEXT:    retq
  %1 = sub <4 x i32> zeroinitializer, %a
  %2 = add <4 x i32> %1, %b
  ret <4 x i32> %2
}

; fold (A + (0-B)) -> A-B
define <4 x i32> @combine_vec_add_neg1(<4 x i32> %a, <4 x i32> %b) {
; SSE-LABEL: combine_vec_add_neg1:
; SSE:       # %bb.0:
; SSE-NEXT:    psubd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_add_neg1:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = sub <4 x i32> zeroinitializer, %b
  %2 = add <4 x i32> %a, %1
  ret <4 x i32> %2
}

; fold (A+(B-A)) -> B
define <4 x i32> @combine_vec_add_sub0(<4 x i32> %a, <4 x i32> %b) {
; SSE-LABEL: combine_vec_add_sub0:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_add_sub0:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovaps %xmm1, %xmm0
; AVX-NEXT:    retq
  %1 = sub <4 x i32> %b, %a
  %2 = add <4 x i32> %a, %1
  ret <4 x i32> %2
}

; fold ((B-A)+A) -> B
define <4 x i32> @combine_vec_add_sub1(<4 x i32> %a, <4 x i32> %b) {
; SSE-LABEL: combine_vec_add_sub1:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_add_sub1:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovaps %xmm1, %xmm0
; AVX-NEXT:    retq
  %1 = sub <4 x i32> %b, %a
  %2 = add <4 x i32> %1, %a
  ret <4 x i32> %2
}

; fold (A+(B-(A+C))) to (B-C)
define <4 x i32> @combine_vec_add_sub_add0(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE-LABEL: combine_vec_add_sub_add0:
; SSE:       # %bb.0:
; SSE-NEXT:    psubd %xmm2, %xmm1
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_add_sub_add0:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsubd %xmm2, %xmm1, %xmm0
; AVX-NEXT:    retq
  %1 = add <4 x i32> %a, %c
  %2 = sub <4 x i32> %b, %1
  %3 = add <4 x i32> %a, %2
  ret <4 x i32> %3
}

; fold (A+(B-(C+A))) to (B-C)
define <4 x i32> @combine_vec_add_sub_add1(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE-LABEL: combine_vec_add_sub_add1:
; SSE:       # %bb.0:
; SSE-NEXT:    psubd %xmm2, %xmm1
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_add_sub_add1:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsubd %xmm2, %xmm1, %xmm0
; AVX-NEXT:    retq
  %1 = add <4 x i32> %c, %a
  %2 = sub <4 x i32> %b, %1
  %3 = add <4 x i32> %a, %2
  ret <4 x i32> %3
}

; fold (A+((B-A)+C)) to (B+C)
define <4 x i32> @combine_vec_add_sub_add2(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE-LABEL: combine_vec_add_sub_add2:
; SSE:       # %bb.0:
; SSE-NEXT:    paddd %xmm2, %xmm1
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_add_sub_add2:
; AVX:       # %bb.0:
; AVX-NEXT:    vpaddd %xmm2, %xmm1, %xmm0
; AVX-NEXT:    retq
  %1 = sub <4 x i32> %b, %a
  %2 = add <4 x i32> %1, %c
  %3 = add <4 x i32> %a, %2
  ret <4 x i32> %3
}

; fold (A+((B-A)-C)) to (B-C)
define <4 x i32> @combine_vec_add_sub_add3(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE-LABEL: combine_vec_add_sub_add3:
; SSE:       # %bb.0:
; SSE-NEXT:    psubd %xmm2, %xmm1
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_add_sub_add3:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsubd %xmm2, %xmm1, %xmm0
; AVX-NEXT:    retq
  %1 = sub <4 x i32> %b, %a
  %2 = sub <4 x i32> %1, %c
  %3 = add <4 x i32> %a, %2
  ret <4 x i32> %3
}

; fold (A-B)+(C-D) to (A+C)-(B+D) when A or C is constant
define <4 x i32> @combine_vec_add_sub_sub(<4 x i32> %a, <4 x i32> %b, <4 x i32> %d) {
; SSE-LABEL: combine_vec_add_sub_sub:
; SSE:       # %bb.0:
; SSE-NEXT:    paddd {{.*}}(%rip), %xmm0
; SSE-NEXT:    paddd %xmm2, %xmm1
; SSE-NEXT:    psubd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_add_sub_sub:
; AVX:       # %bb.0:
; AVX-NEXT:    vpaddd {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = sub <4 x i32> %a, %b
  %2 = sub <4 x i32> <i32 0, i32 1, i32 2, i32 3>, %d
  %3 = add <4 x i32> %1, %2
  ret <4 x i32> %3
}

; fold (a+b) -> (a|b) iff a and b share no bits.
define <4 x i32> @combine_vec_add_uniquebits(<4 x i32> %a, <4 x i32> %b) {
; SSE-LABEL: combine_vec_add_uniquebits:
; SSE:       # %bb.0:
; SSE-NEXT:    andps {{.*}}(%rip), %xmm0
; SSE-NEXT:    andps {{.*}}(%rip), %xmm1
; SSE-NEXT:    orps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_add_uniquebits:
; AVX:       # %bb.0:
; AVX-NEXT:    vbroadcastss {{.*}}(%rip), %xmm2
; AVX-NEXT:    vandps %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vbroadcastss {{.*}}(%rip), %xmm2
; AVX-NEXT:    vandps %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vorps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = and <4 x i32> %a, <i32 61680, i32 61680, i32 61680, i32 61680>
  %2 = and <4 x i32> %b, <i32 3855, i32 3855, i32 3855, i32 3855>
  %3 = add <4 x i32> %1, %2
  ret <4 x i32> %3
}

; fold (add x, shl(0 - y, n)) -> sub(x, shl(y, n))
define <4 x i32> @combine_vec_add_shl_neg0(<4 x i32> %x, <4 x i32> %y) {
; SSE-LABEL: combine_vec_add_shl_neg0:
; SSE:       # %bb.0:
; SSE-NEXT:    pslld $5, %xmm1
; SSE-NEXT:    psubd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_add_shl_neg0:
; AVX:       # %bb.0:
; AVX-NEXT:    vpslld $5, %xmm1, %xmm1
; AVX-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = sub <4 x i32> zeroinitializer, %y
  %2 = shl <4 x i32> %1, <i32 5, i32 5, i32 5, i32 5>
  %3 = add <4 x i32> %x, %2
  ret <4 x i32> %3
}

; fold (add shl(0 - y, n), x) -> sub(x, shl(y, n))
define <4 x i32> @combine_vec_add_shl_neg1(<4 x i32> %x, <4 x i32> %y) {
; SSE-LABEL: combine_vec_add_shl_neg1:
; SSE:       # %bb.0:
; SSE-NEXT:    pslld $5, %xmm1
; SSE-NEXT:    psubd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_add_shl_neg1:
; AVX:       # %bb.0:
; AVX-NEXT:    vpslld $5, %xmm1, %xmm1
; AVX-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = sub <4 x i32> zeroinitializer, %y
  %2 = shl <4 x i32> %1, <i32 5, i32 5, i32 5, i32 5>
  %3 = add <4 x i32> %2, %x
  ret <4 x i32> %3
}

; (add z, (and (sbbl x, x), 1)) -> (sub z, (sbbl x, x))
; and similar xforms where the inner op is either ~0 or 0.
define <4 x i32> @combine_vec_add_and_compare(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) {
; SSE-LABEL: combine_vec_add_and_compare:
; SSE:       # %bb.0:
; SSE-NEXT:    pcmpeqd %xmm2, %xmm1
; SSE-NEXT:    psubd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_add_and_compare:
; AVX:       # %bb.0:
; AVX-NEXT:    vpcmpeqd %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = icmp eq <4 x i32> %a1, %a2
  %2 = sext <4 x i1> %1 to <4 x i32>
  %3 = and <4 x i32> %2, <i32 1, i32 1, i32 1, i32 1>
  %4 = add <4 x i32> %a0, %3
  ret <4 x i32> %4
}

; add (sext i1), X -> sub X, (zext i1)
define <4 x i32> @combine_vec_add_sext(<4 x i1> %a0, <4 x i32> %a1) {
; SSE-LABEL: combine_vec_add_sext:
; SSE:       # %bb.0:
; SSE-NEXT:    pslld $31, %xmm0
; SSE-NEXT:    psrad $31, %xmm0
; SSE-NEXT:    paddd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_add_sext:
; AVX:       # %bb.0:
; AVX-NEXT:    vpslld $31, %xmm0, %xmm0
; AVX-NEXT:    vpsrad $31, %xmm0, %xmm0
; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = sext <4 x i1> %a0 to <4 x i32>
  %2 = add <4 x i32> %1, %a1
  ret <4 x i32> %2
}

; add (sext i1), X -> sub X, (zext i1)
define <4 x i32> @combine_vec_add_sextinreg(<4 x i32> %a0, <4 x i32> %a1) {
; SSE-LABEL: combine_vec_add_sextinreg:
; SSE:       # %bb.0:
; SSE-NEXT:    pslld $31, %xmm0
; SSE-NEXT:    psrad $31, %xmm0
; SSE-NEXT:    paddd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_add_sextinreg:
; AVX:       # %bb.0:
; AVX-NEXT:    vpslld $31, %xmm0, %xmm0
; AVX-NEXT:    vpsrad $31, %xmm0, %xmm0
; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = shl <4 x i32> %a0, <i32 31, i32 31, i32 31, i32 31>
  %2 = ashr <4 x i32> %1, <i32 31, i32 31, i32 31, i32 31>
  %3 = add <4 x i32> %2, %a1
  ret <4 x i32> %3
}