; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX

; First, check the generic pattern for any 2 vector constants. Then, check special cases where
; the constants are all off-by-one. Finally, check the extra special cases where the constants
; include 0 or -1.
; Each minimal select test is repeated with a more typical pattern that includes a compare to
; generate the condition value.

; TODO: If we don't have blendv, this can definitely be improved. There's also a selection of
; chips where it makes sense to transform the general case blendv to 2 bit-ops. That should be
; a uarch-specific transform. At some point (Ryzen?), the implementation should catch up to the
; architecture, so blendv is as fast as a single bit-op.

define <4 x i32> @sel_C1_or_C2_vec(<4 x i1> %cond) {
; SSE-LABEL: sel_C1_or_C2_vec:
; SSE:       # %bb.0:
; SSE-NEXT:    pslld $31, %xmm0
; SSE-NEXT:    psrad $31, %xmm0
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    pandn {{.*}}(%rip), %xmm1
; SSE-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE-NEXT:    por %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: sel_C1_or_C2_vec:
; AVX:       # %bb.0:
; AVX-NEXT:    vpslld $31, %xmm0, %xmm0
; AVX-NEXT:    vmovaps {{.*#+}} xmm1 = [42,0,4294967294,4294967295]
; AVX-NEXT:    vblendvps %xmm0, {{.*}}(%rip), %xmm1, %xmm0
; AVX-NEXT:    retq
  %add = select <4 x i1> %cond, <4 x i32> <i32 3000, i32 1, i32 -1, i32 0>, <4 x i32> <i32 42, i32 0, i32 -2, i32 -1>
  ret <4 x i32> %add
}

define <4 x i32> @cmp_sel_C1_or_C2_vec(<4 x i32> %x, <4 x i32> %y) {
; SSE-LABEL: cmp_sel_C1_or_C2_vec:
; SSE:       # %bb.0:
; SSE-NEXT:    pcmpeqd %xmm1, %xmm0
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    pandn {{.*}}(%rip), %xmm1
; SSE-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE-NEXT:    por %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: cmp_sel_C1_or_C2_vec:
; AVX:       # %bb.0:
; AVX-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovaps {{.*#+}} xmm1 = [42,0,4294967294,4294967295]
; AVX-NEXT:    vblendvps %xmm0, {{.*}}(%rip), %xmm1, %xmm0
; AVX-NEXT:    retq
  %cond = icmp eq <4 x i32> %x, %y
  %add = select <4 x i1> %cond, <4 x i32> <i32 3000, i32 1, i32 -1, i32 0>, <4 x i32> <i32 42, i32 0, i32 -2, i32 -1>
  ret <4 x i32> %add
}
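
; The off-by-one special case: when the two constant vectors differ by exactly
; 1 in every lane, the blend should fold to arithmetic on the condition, since
; select(cond, C+1, C) == C + zext(cond); with a 0/-1 compare mask, that is
; equivalently C - mask. An IR-level sketch of the folded form (illustrative
; only, not one of the test functions):
;   %z = zext <4 x i1> %cond to <4 x i32>
;   %r = add <4 x i32> %z, <i32 42, i32 0, i32 -2, i32 -1>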

define <4 x i32> @sel_Cplus1_or_C_vec(<4 x i1> %cond) {
; SSE-LABEL: sel_Cplus1_or_C_vec:
; SSE:       # %bb.0:
; SSE-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE-NEXT:    paddd {{.*}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: sel_Cplus1_or_C_vec:
; AVX:       # %bb.0:
; AVX-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    vpaddd {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
  %add = select <4 x i1> %cond, <4 x i32> <i32 43, i32 1, i32 -1, i32 0>, <4 x i32> <i32 42, i32 0, i32 -2, i32 -1>
  ret <4 x i32> %add
}

define <4 x i32> @cmp_sel_Cplus1_or_C_vec(<4 x i32> %x, <4 x i32> %y) {
; SSE-LABEL: cmp_sel_Cplus1_or_C_vec:
; SSE:       # %bb.0:
; SSE-NEXT:    pcmpeqd %xmm1, %xmm0
; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [42,0,4294967294,4294967295]
; SSE-NEXT:    psubd %xmm0, %xmm1
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: cmp_sel_Cplus1_or_C_vec:
; AVX:       # %bb.0:
; AVX-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [42,0,4294967294,4294967295]
; AVX-NEXT:    vpsubd %xmm0, %xmm1, %xmm0
; AVX-NEXT:    retq
  %cond = icmp eq <4 x i32> %x, %y
  %add = select <4 x i1> %cond, <4 x i32> <i32 43, i32 1, i32 -1, i32 0>, <4 x i32> <i32 42, i32 0, i32 -2, i32 -1>
  ret <4 x i32> %add
}

define <4 x i32> @sel_Cminus1_or_C_vec(<4 x i1> %cond) {
; SSE-LABEL: sel_Cminus1_or_C_vec:
; SSE:       # %bb.0:
; SSE-NEXT:    pslld $31, %xmm0
; SSE-NEXT:    psrad $31, %xmm0
; SSE-NEXT:    paddd {{.*}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: sel_Cminus1_or_C_vec:
; AVX:       # %bb.0:
; AVX-NEXT:    vpslld $31, %xmm0, %xmm0
; AVX-NEXT:    vpsrad $31, %xmm0, %xmm0
; AVX-NEXT:    vpaddd {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
  %add = select <4 x i1> %cond, <4 x i32> <i32 43, i32 1, i32 -1, i32 0>, <4 x i32> <i32 44, i32 2, i32 0, i32 1>
  ret <4 x i32> %add
}

define <4 x i32> @cmp_sel_Cminus1_or_C_vec(<4 x i32> %x, <4 x i32> %y) {
; SSE-LABEL: cmp_sel_Cminus1_or_C_vec:
; SSE:       # %bb.0:
; SSE-NEXT:    pcmpeqd %xmm1, %xmm0
; SSE-NEXT:    paddd {{.*}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: cmp_sel_Cminus1_or_C_vec:
; AVX:       # %bb.0:
; AVX-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpaddd {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
  %cond = icmp eq <4 x i32> %x, %y
  %add = select <4 x i1> %cond, <4 x i32> <i32 43, i32 1, i32 -1, i32 0>, <4 x i32> <i32 44, i32 2, i32 0, i32 1>
  ret <4 x i32> %add
}
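
; The extra special cases: constants of -1 and 0 mean the select is just the
; sign-extended condition, so a 0/-1 compare mask can be returned directly
; (swapping to 0 and -1 only costs a NOT, i.e. an xor with all-ones). For 1
; and 0, masking the condition to its low bit (or logically shifting the
; compare mask right by 31) is enough.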

define <4 x i32> @sel_minus1_or_0_vec(<4 x i1> %cond) {
; SSE-LABEL: sel_minus1_or_0_vec:
; SSE:       # %bb.0:
; SSE-NEXT:    pslld $31, %xmm0
; SSE-NEXT:    psrad $31, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: sel_minus1_or_0_vec:
; AVX:       # %bb.0:
; AVX-NEXT:    vpslld $31, %xmm0, %xmm0
; AVX-NEXT:    vpsrad $31, %xmm0, %xmm0
; AVX-NEXT:    retq
  %add = select <4 x i1> %cond, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
  ret <4 x i32> %add
}

define <4 x i32> @cmp_sel_minus1_or_0_vec(<4 x i32> %x, <4 x i32> %y) {
; SSE-LABEL: cmp_sel_minus1_or_0_vec:
; SSE:       # %bb.0:
; SSE-NEXT:    pcmpeqd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: cmp_sel_minus1_or_0_vec:
; AVX:       # %bb.0:
; AVX-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %cond = icmp eq <4 x i32> %x, %y
  %add = select <4 x i1> %cond, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
  ret <4 x i32> %add
}

define <4 x i32> @sel_0_or_minus1_vec(<4 x i1> %cond) {
; SSE-LABEL: sel_0_or_minus1_vec:
; SSE:       # %bb.0:
; SSE-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE-NEXT:    pcmpeqd %xmm1, %xmm1
; SSE-NEXT:    paddd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: sel_0_or_minus1_vec:
; AVX:       # %bb.0:
; AVX-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %add = select <4 x i1> %cond, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>
  ret <4 x i32> %add
}

define <4 x i32> @cmp_sel_0_or_minus1_vec(<4 x i32> %x, <4 x i32> %y) {
; SSE-LABEL: cmp_sel_0_or_minus1_vec:
; SSE:       # %bb.0:
; SSE-NEXT:    pcmpeqd %xmm1, %xmm0
; SSE-NEXT:    pcmpeqd %xmm1, %xmm1
; SSE-NEXT:    pxor %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: cmp_sel_0_or_minus1_vec:
; AVX:       # %bb.0:
; AVX-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %cond = icmp eq <4 x i32> %x, %y
  %add = select <4 x i1> %cond, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>
  ret <4 x i32> %add
}

define <4 x i32> @sel_1_or_0_vec(<4 x i1> %cond) {
; SSE-LABEL: sel_1_or_0_vec:
; SSE:       # %bb.0:
; SSE-NEXT:    andps {{.*}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: sel_1_or_0_vec:
; AVX:       # %bb.0:
; AVX-NEXT:    vandps {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
  %add = select <4 x i1> %cond, <4 x i32> <i32 1, i32 1, i32 1, i32 1>, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
  ret <4 x i32> %add
}

define <4 x i32> @cmp_sel_1_or_0_vec(<4 x i32> %x, <4 x i32> %y) {
; SSE-LABEL: cmp_sel_1_or_0_vec:
; SSE:       # %bb.0:
; SSE-NEXT:    pcmpeqd %xmm1, %xmm0
; SSE-NEXT:    psrld $31, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: cmp_sel_1_or_0_vec:
; AVX:       # %bb.0:
; AVX-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpsrld $31, %xmm0, %xmm0
; AVX-NEXT:    retq
  %cond = icmp eq <4 x i32> %x, %y
  %add = select <4 x i1> %cond, <4 x i32> <i32 1, i32 1, i32 1, i32 1>, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
  ret <4 x i32> %add
}

define <4 x i32> @sel_0_or_1_vec(<4 x i1> %cond) {
; SSE-LABEL: sel_0_or_1_vec:
; SSE:       # %bb.0:
; SSE-NEXT:    andnps {{.*}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: sel_0_or_1_vec:
; AVX:       # %bb.0:
; AVX-NEXT:    vandnps {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
  %add = select <4 x i1> %cond, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  ret <4 x i32> %add
}

define <4 x i32> @cmp_sel_0_or_1_vec(<4 x i32> %x, <4 x i32> %y) {
; SSE-LABEL: cmp_sel_0_or_1_vec:
; SSE:       # %bb.0:
; SSE-NEXT:    pcmpeqd %xmm1, %xmm0
; SSE-NEXT:    pandn {{.*}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: cmp_sel_0_or_1_vec:
; AVX:       # %bb.0:
; AVX-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpandn {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
  %cond = icmp eq <4 x i32> %x, %y
  %add = select <4 x i1> %cond, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  ret <4 x i32> %add
}