; Regression test: lowering of hand-expanded unsigned saturating subtraction
; (select/icmp/sub-or-xor-or-add patterns) to PSUBUSB/PSUBUSW on x86.
; Check lines follow the update_llc_test_checks.py format.
; RUN: llc < %s -mtriple=x86_64-apple-macosx10.8.0 -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-apple-macosx10.8.0 -mattr=+ssse3 | FileCheck %s --check-prefix=SSE --check-prefix=SSSE3
; RUN: llc < %s -mtriple=x86_64-apple-macosx10.8.0 -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-apple-macosx10.8.0 -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2

; usubsat(x, 32768) written as select(x s< 0, x ^ 0x8000, 0) on <8 x i16>.
define void @test1(i16* nocapture %head) nounwind {
; SSE-LABEL: test1:
; SSE:       ## BB#0: ## %vector.ph
; SSE-NEXT:    movdqu (%rdi), %xmm0
; SSE-NEXT:    psubusw {{.*}}(%rip), %xmm0
; SSE-NEXT:    movdqu %xmm0, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: test1:
; AVX:       ## BB#0: ## %vector.ph
; AVX-NEXT:    vmovdqu (%rdi), %xmm0
; AVX-NEXT:    vpsubusw {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    vmovdqu %xmm0, (%rdi)
; AVX-NEXT:    retq
vector.ph:
  %0 = getelementptr inbounds i16, i16* %head, i64 0
  %1 = bitcast i16* %0 to <8 x i16>*
  %2 = load <8 x i16>, <8 x i16>* %1, align 2
  %3 = icmp slt <8 x i16> %2, zeroinitializer
  %4 = xor <8 x i16> %2, <i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768>
  %5 = select <8 x i1> %3, <8 x i16> %4, <8 x i16> zeroinitializer
  store <8 x i16> %5, <8 x i16>* %1, align 2
  ret void
}

; usubsat(x, 32767) written as select(x u> 32766, x + (-32767), 0) on <8 x i16>.
define void @test2(i16* nocapture %head) nounwind {
; SSE-LABEL: test2:
; SSE:       ## BB#0: ## %vector.ph
; SSE-NEXT:    movdqu (%rdi), %xmm0
; SSE-NEXT:    psubusw {{.*}}(%rip), %xmm0
; SSE-NEXT:    movdqu %xmm0, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: test2:
; AVX:       ## BB#0: ## %vector.ph
; AVX-NEXT:    vmovdqu (%rdi), %xmm0
; AVX-NEXT:    vpsubusw {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    vmovdqu %xmm0, (%rdi)
; AVX-NEXT:    retq
vector.ph:
  %0 = getelementptr inbounds i16, i16* %head, i64 0
  %1 = bitcast i16* %0 to <8 x i16>*
  %2 = load <8 x i16>, <8 x i16>* %1, align 2
  %3 = icmp ugt <8 x i16> %2, <i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766>
  %4 = add <8 x i16> %2, <i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767>
  %5 = select <8 x i1> %3, <8 x i16> %4, <8 x i16> zeroinitializer
  store <8 x i16> %5, <8 x i16>* %1, align 2
  ret void
}

; usubsat(x, w) with a variable splat: select(x u< w, 0, x - w) on <8 x i16>.
define void @test3(i16* nocapture %head, i16 zeroext %w) nounwind {
; SSE-LABEL: test3:
; SSE:       ## BB#0: ## %vector.ph
; SSE-NEXT:    movd %esi, %xmm0
; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; SSE-NEXT:    movdqu (%rdi), %xmm1
; SSE-NEXT:    psubusw %xmm0, %xmm1
; SSE-NEXT:    movdqu %xmm1, (%rdi)
; SSE-NEXT:    retq
;
; AVX1-LABEL: test3:
; AVX1:       ## BB#0: ## %vector.ph
; AVX1-NEXT:    vmovd %esi, %xmm0
; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; AVX1-NEXT:    vmovdqu (%rdi), %xmm1
; AVX1-NEXT:    vpsubusw %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    vmovdqu %xmm0, (%rdi)
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test3:
; AVX2:       ## BB#0: ## %vector.ph
; AVX2-NEXT:    vmovd %esi, %xmm0
; AVX2-NEXT:    vpbroadcastw %xmm0, %xmm0
; AVX2-NEXT:    vmovdqu (%rdi), %xmm1
; AVX2-NEXT:    vpsubusw %xmm0, %xmm1, %xmm0
; AVX2-NEXT:    vmovdqu %xmm0, (%rdi)
; AVX2-NEXT:    retq
vector.ph:
  %0 = insertelement <8 x i16> undef, i16 %w, i32 0
  %broadcast15 = shufflevector <8 x i16> %0, <8 x i16> undef, <8 x i32> zeroinitializer
  %1 = getelementptr inbounds i16, i16* %head, i64 0
  %2 = bitcast i16* %1 to <8 x i16>*
  %3 = load <8 x i16>, <8 x i16>* %2, align 2
  %4 = icmp ult <8 x i16> %3, %broadcast15
  %5 = sub <8 x i16> %3, %broadcast15
  %6 = select <8 x i1> %4, <8 x i16> zeroinitializer, <8 x i16> %5
  store <8 x i16> %6, <8 x i16>* %2, align 2
  ret void
}

; Byte variant of test1: usubsat(x, 128) as select(x s< 0, x ^ 0x80, 0) on <16 x i8>.
define void @test4(i8* nocapture %head) nounwind {
; SSE-LABEL: test4:
; SSE:       ## BB#0: ## %vector.ph
; SSE-NEXT:    movdqu (%rdi), %xmm0
; SSE-NEXT:    psubusb {{.*}}(%rip), %xmm0
; SSE-NEXT:    movdqu %xmm0, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: test4:
; AVX:       ## BB#0: ## %vector.ph
; AVX-NEXT:    vmovdqu (%rdi), %xmm0
; AVX-NEXT:    vpsubusb {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    vmovdqu %xmm0, (%rdi)
; AVX-NEXT:    retq
vector.ph:
  %0 = getelementptr inbounds i8, i8* %head, i64 0
  %1 = bitcast i8* %0 to <16 x i8>*
  %2 = load <16 x i8>, <16 x i8>* %1, align 1
  %3 = icmp slt <16 x i8> %2, zeroinitializer
  %4 = xor <16 x i8> %2, <i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128>
  %5 = select <16 x i1> %3, <16 x i8> %4, <16 x i8> zeroinitializer
  store <16 x i8> %5, <16 x i8>* %1, align 1
  ret void
}

; Byte variant of test2: usubsat(x, 127) as select(x u> 126, x + (-127), 0) on <16 x i8>.
define void @test5(i8* nocapture %head) nounwind {
; SSE-LABEL: test5:
; SSE:       ## BB#0: ## %vector.ph
; SSE-NEXT:    movdqu (%rdi), %xmm0
; SSE-NEXT:    psubusb {{.*}}(%rip), %xmm0
; SSE-NEXT:    movdqu %xmm0, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: test5:
; AVX:       ## BB#0: ## %vector.ph
; AVX-NEXT:    vmovdqu (%rdi), %xmm0
; AVX-NEXT:    vpsubusb {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    vmovdqu %xmm0, (%rdi)
; AVX-NEXT:    retq
vector.ph:
  %0 = getelementptr inbounds i8, i8* %head, i64 0
  %1 = bitcast i8* %0 to <16 x i8>*
  %2 = load <16 x i8>, <16 x i8>* %1, align 1
  %3 = icmp ugt <16 x i8> %2, <i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126>
  %4 = add <16 x i8> %2, <i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127>
  %5 = select <16 x i1> %3, <16 x i8> %4, <16 x i8> zeroinitializer
  store <16 x i8> %5, <16 x i8>* %1, align 1
  ret void
}

; Byte variant of test3: usubsat(x, w) with a variable splat on <16 x i8>.
define void @test6(i8* nocapture %head, i8 zeroext %w) nounwind {
; SSE2-LABEL: test6:
; SSE2:       ## BB#0: ## %vector.ph
; SSE2-NEXT:    movd %esi, %xmm0
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; SSE2-NEXT:    movdqu (%rdi), %xmm1
; SSE2-NEXT:    psubusb %xmm0, %xmm1
; SSE2-NEXT:    movdqu %xmm1, (%rdi)
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: test6:
; SSSE3:       ## BB#0: ## %vector.ph
; SSSE3-NEXT:    movd %esi, %xmm0
; SSSE3-NEXT:    pxor %xmm1, %xmm1
; SSSE3-NEXT:    pshufb %xmm1, %xmm0
; SSSE3-NEXT:    movdqu (%rdi), %xmm1
; SSSE3-NEXT:    psubusb %xmm0, %xmm1
; SSSE3-NEXT:    movdqu %xmm1, (%rdi)
; SSSE3-NEXT:    retq
;
; AVX1-LABEL: test6:
; AVX1:       ## BB#0: ## %vector.ph
; AVX1-NEXT:    vmovd %esi, %xmm0
; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovdqu (%rdi), %xmm1
; AVX1-NEXT:    vpsubusb %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    vmovdqu %xmm0, (%rdi)
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test6:
; AVX2:       ## BB#0: ## %vector.ph
; AVX2-NEXT:    vmovd %esi, %xmm0
; AVX2-NEXT:    vpbroadcastb %xmm0, %xmm0
; AVX2-NEXT:    vmovdqu (%rdi), %xmm1
; AVX2-NEXT:    vpsubusb %xmm0, %xmm1, %xmm0
; AVX2-NEXT:    vmovdqu %xmm0, (%rdi)
; AVX2-NEXT:    retq
vector.ph:
  %0 = insertelement <16 x i8> undef, i8 %w, i32 0
  %broadcast15 = shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> zeroinitializer
  %1 = getelementptr inbounds i8, i8* %head, i64 0
  %2 = bitcast i8* %1 to <16 x i8>*
  %3 = load <16 x i8>, <16 x i8>* %2, align 1
  %4 = icmp ult <16 x i8> %3, %broadcast15
  %5 = sub <16 x i8> %3, %broadcast15
  %6 = select <16 x i1> %4, <16 x i8> zeroinitializer, <16 x i8> %5
  store <16 x i8> %6, <16 x i8>* %2, align 1
  ret void
}

; 256-bit variant of test1 on <16 x i16> (split into two XMM ops pre-AVX2).
define void @test7(i16* nocapture %head) nounwind {
; SSE-LABEL: test7:
; SSE:       ## BB#0: ## %vector.ph
; SSE-NEXT:    movdqu (%rdi), %xmm0
; SSE-NEXT:    movdqu 16(%rdi), %xmm1
; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
; SSE-NEXT:    psubusw %xmm2, %xmm0
; SSE-NEXT:    psubusw %xmm2, %xmm1
; SSE-NEXT:    movdqu %xmm1, 16(%rdi)
; SSE-NEXT:    movdqu %xmm0, (%rdi)
; SSE-NEXT:    retq
;
; AVX1-LABEL: test7:
; AVX1:       ## BB#0: ## %vector.ph
; AVX1-NEXT:    vmovups (%rdi), %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpcmpgtw %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vpcmpgtw %xmm0, %xmm2, %xmm2
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
; AVX1-NEXT:    vxorps {{.*}}(%rip), %ymm0, %ymm0
; AVX1-NEXT:    vandps %ymm0, %ymm1, %ymm0
; AVX1-NEXT:    vmovups %ymm0, (%rdi)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test7:
; AVX2:       ## BB#0: ## %vector.ph
; AVX2-NEXT:    vmovdqu (%rdi), %ymm0
; AVX2-NEXT:    vpsubusw {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    vmovdqu %ymm0, (%rdi)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
vector.ph:
  %0 = getelementptr inbounds i16, i16* %head, i64 0
  %1 = bitcast i16* %0 to <16 x i16>*
  %2 = load <16 x i16>, <16 x i16>* %1, align 2
  %3 = icmp slt <16 x i16> %2, zeroinitializer
  %4 = xor <16 x i16> %2, <i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768>
  %5 = select <16 x i1> %3, <16 x i16> %4, <16 x i16> zeroinitializer
  store <16 x i16> %5, <16 x i16>* %1, align 2
  ret void
}

; 256-bit variant of test2 on <16 x i16>.
define void @test8(i16* nocapture %head) nounwind {
; SSE-LABEL: test8:
; SSE:       ## BB#0: ## %vector.ph
; SSE-NEXT:    movdqu (%rdi), %xmm0
; SSE-NEXT:    movdqu 16(%rdi), %xmm1
; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [32767,32767,32767,32767,32767,32767,32767,32767]
; SSE-NEXT:    psubusw %xmm2, %xmm0
; SSE-NEXT:    psubusw %xmm2, %xmm1
; SSE-NEXT:    movdqu %xmm1, 16(%rdi)
; SSE-NEXT:    movdqu %xmm0, (%rdi)
; SSE-NEXT:    retq
;
; AVX1-LABEL: test8:
; AVX1:       ## BB#0: ## %vector.ph
; AVX1-NEXT:    vmovups (%rdi), %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vmovaps {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
; AVX1-NEXT:    vxorps %xmm2, %xmm1, %xmm3
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [65534,65534,65534,65534,65534,65534,65534,65534]
; AVX1-NEXT:    vpcmpgtw %xmm4, %xmm3, %xmm3
; AVX1-NEXT:    vxorps %xmm2, %xmm0, %xmm2
; AVX1-NEXT:    vpcmpgtw %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [32769,32769,32769,32769,32769,32769,32769,32769]
; AVX1-NEXT:    vpaddw %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vpaddw %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    vandps %ymm0, %ymm2, %ymm0
; AVX1-NEXT:    vmovups %ymm0, (%rdi)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test8:
; AVX2:       ## BB#0: ## %vector.ph
; AVX2-NEXT:    vmovdqu (%rdi), %ymm0
; AVX2-NEXT:    vpsubusw {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    vmovdqu %ymm0, (%rdi)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
vector.ph:
  %0 = getelementptr inbounds i16, i16* %head, i64 0
  %1 = bitcast i16* %0 to <16 x i16>*
  %2 = load <16 x i16>, <16 x i16>* %1, align 2
  %3 = icmp ugt <16 x i16> %2, <i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766>
  %4 = add <16 x i16> %2, <i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767>
  %5 = select <16 x i1> %3, <16 x i16> %4, <16 x i16> zeroinitializer
  store <16 x i16> %5, <16 x i16>* %1, align 2
  ret void

}

; 256-bit variant of test3 on <16 x i16>.
define void @test9(i16* nocapture %head, i16 zeroext %w) nounwind {
; SSE-LABEL: test9:
; SSE:       ## BB#0: ## %vector.ph
; SSE-NEXT:    movd %esi, %xmm0
; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; SSE-NEXT:    movdqu (%rdi), %xmm1
; SSE-NEXT:    movdqu 16(%rdi), %xmm2
; SSE-NEXT:    psubusw %xmm0, %xmm1
; SSE-NEXT:    psubusw %xmm0, %xmm2
; SSE-NEXT:    movdqu %xmm2, 16(%rdi)
; SSE-NEXT:    movdqu %xmm1, (%rdi)
; SSE-NEXT:    retq
;
; AVX1-LABEL: test9:
; AVX1:       ## BB#0: ## %vector.ph
; AVX1-NEXT:    vmovups (%rdi), %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vmovd %esi, %xmm2
; AVX1-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7]
; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,0,1,1]
; AVX1-NEXT:    vpsubw %xmm2, %xmm1, %xmm3
; AVX1-NEXT:    vpsubw %xmm2, %xmm0, %xmm4
; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm4, %ymm3
; AVX1-NEXT:    vpmaxuw %xmm2, %xmm1, %xmm4
; AVX1-NEXT:    vpcmpeqw %xmm4, %xmm1, %xmm1
; AVX1-NEXT:    vpmaxuw %xmm2, %xmm0, %xmm2
; AVX1-NEXT:    vpcmpeqw %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    vandps %ymm3, %ymm0, %ymm0
; AVX1-NEXT:    vmovups %ymm0, (%rdi)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test9:
; AVX2:       ## BB#0: ## %vector.ph
; AVX2-NEXT:    vmovd %esi, %xmm0
; AVX2-NEXT:    vpbroadcastw %xmm0, %ymm0
; AVX2-NEXT:    vmovdqu (%rdi), %ymm1
; AVX2-NEXT:    vpsubusw %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    vmovdqu %ymm0, (%rdi)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
vector.ph:
  %0 = insertelement <16 x i16> undef, i16 %w, i32 0
  %broadcast15 = shufflevector <16 x i16> %0, <16 x i16> undef, <16 x i32> zeroinitializer
  %1 = getelementptr inbounds i16, i16* %head, i64 0
  %2 = bitcast i16* %1 to <16 x i16>*
  %3 = load <16 x i16>, <16 x i16>* %2, align 2
  %4 = icmp ult <16 x i16> %3, %broadcast15
  %5 = sub <16 x i16> %3, %broadcast15
  %6 = select <16 x i1> %4, <16 x i16> zeroinitializer, <16 x i16> %5
  store <16 x i16> %6, <16 x i16>* %2, align 2
  ret void
}

; 256-bit variant of test4 on <32 x i8>.
define void @test10(i8* nocapture %head) nounwind {
; SSE-LABEL: test10:
; SSE:       ## BB#0: ## %vector.ph
; SSE-NEXT:    movdqu (%rdi), %xmm0
; SSE-NEXT:    movdqu 16(%rdi), %xmm1
; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
; SSE-NEXT:    psubusb %xmm2, %xmm0
; SSE-NEXT:    psubusb %xmm2, %xmm1
; SSE-NEXT:    movdqu %xmm1, 16(%rdi)
; SSE-NEXT:    movdqu %xmm0, (%rdi)
; SSE-NEXT:    retq
;
; AVX1-LABEL: test10:
; AVX1:       ## BB#0: ## %vector.ph
; AVX1-NEXT:    vmovups (%rdi), %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpcmpgtb %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vpcmpgtb %xmm0, %xmm2, %xmm2
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
; AVX1-NEXT:    vxorps {{.*}}(%rip), %ymm0, %ymm0
; AVX1-NEXT:    vandps %ymm0, %ymm1, %ymm0
; AVX1-NEXT:    vmovups %ymm0, (%rdi)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test10:
; AVX2:       ## BB#0: ## %vector.ph
; AVX2-NEXT:    vmovdqu (%rdi), %ymm0
; AVX2-NEXT:    vpsubusb {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    vmovdqu %ymm0, (%rdi)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
vector.ph:
  %0 = getelementptr inbounds i8, i8* %head, i64 0
  %1 = bitcast i8* %0 to <32 x i8>*
  %2 = load <32 x i8>, <32 x i8>* %1, align 1
  %3 = icmp slt <32 x i8> %2, zeroinitializer
  %4 = xor <32 x i8> %2, <i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128>
  %5 = select <32 x i1> %3, <32 x i8> %4, <32 x i8> zeroinitializer
  store <32 x i8> %5, <32 x i8>* %1, align 1
  ret void

}

; 256-bit variant of test5 on <32 x i8>.
define void @test11(i8* nocapture %head) nounwind {
; SSE-LABEL: test11:
; SSE:       ## BB#0: ## %vector.ph
; SSE-NEXT:    movdqu (%rdi), %xmm0
; SSE-NEXT:    movdqu 16(%rdi), %xmm1
; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; SSE-NEXT:    psubusb %xmm2, %xmm0
; SSE-NEXT:    psubusb %xmm2, %xmm1
; SSE-NEXT:    movdqu %xmm1, 16(%rdi)
; SSE-NEXT:    movdqu %xmm0, (%rdi)
; SSE-NEXT:    retq
;
; AVX1-LABEL: test11:
; AVX1:       ## BB#0: ## %vector.ph
; AVX1-NEXT:    vmovups (%rdi), %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vmovaps {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
; AVX1-NEXT:    vxorps %xmm2, %xmm1, %xmm3
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254]
; AVX1-NEXT:    vpcmpgtb %xmm4, %xmm3, %xmm3
; AVX1-NEXT:    vxorps %xmm2, %xmm0, %xmm2
; AVX1-NEXT:    vpcmpgtb %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129]
; AVX1-NEXT:    vpaddb %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vpaddb %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    vandps %ymm0, %ymm2, %ymm0
; AVX1-NEXT:    vmovups %ymm0, (%rdi)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test11:
; AVX2:       ## BB#0: ## %vector.ph
; AVX2-NEXT:    vmovdqu (%rdi), %ymm0
; AVX2-NEXT:    vpsubusb {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    vmovdqu %ymm0, (%rdi)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
vector.ph:
  %0 = getelementptr inbounds i8, i8* %head, i64 0
  %1 = bitcast i8* %0 to <32 x i8>*
  %2 = load <32 x i8>, <32 x i8>* %1, align 1
  %3 = icmp ugt <32 x i8> %2, <i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126>
  %4 = add <32 x i8> %2, <i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127>
  %5 = select <32 x i1> %3, <32 x i8> %4, <32 x i8> zeroinitializer
  store <32 x i8> %5, <32 x i8>* %1, align 1
  ret void
}

; 256-bit variant of test6 on <32 x i8>.
define void @test12(i8* nocapture %head, i8 zeroext %w) nounwind {
; SSE2-LABEL: test12:
; SSE2:       ## BB#0: ## %vector.ph
; SSE2-NEXT:    movd %esi, %xmm0
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; SSE2-NEXT:    movdqu (%rdi), %xmm1
; SSE2-NEXT:    movdqu 16(%rdi), %xmm2
; SSE2-NEXT:    psubusb %xmm0, %xmm1
; SSE2-NEXT:    psubusb %xmm0, %xmm2
; SSE2-NEXT:    movdqu %xmm2, 16(%rdi)
; SSE2-NEXT:    movdqu %xmm1, (%rdi)
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: test12:
; SSSE3:       ## BB#0: ## %vector.ph
; SSSE3-NEXT:    movd %esi, %xmm0
; SSSE3-NEXT:    pxor %xmm1, %xmm1
; SSSE3-NEXT:    pshufb %xmm1, %xmm0
; SSSE3-NEXT:    movdqu (%rdi), %xmm1
; SSSE3-NEXT:    movdqu 16(%rdi), %xmm2
; SSSE3-NEXT:    psubusb %xmm0, %xmm1
; SSSE3-NEXT:    psubusb %xmm0, %xmm2
; SSSE3-NEXT:    movdqu %xmm2, 16(%rdi)
; SSSE3-NEXT:    movdqu %xmm1, (%rdi)
; SSSE3-NEXT:    retq
;
; AVX1-LABEL: test12:
; AVX1:       ## BB#0: ## %vector.ph
; AVX1-NEXT:    vmovups (%rdi), %ymm0
; AVX1-NEXT:    vmovd %esi, %xmm1
; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vpsubb %xmm1, %xmm2, %xmm3
; AVX1-NEXT:    vpsubb %xmm1, %xmm0, %xmm4
; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm4, %ymm3
; AVX1-NEXT:    vpmaxub %xmm1, %xmm2, %xmm4
; AVX1-NEXT:    vpcmpeqb %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpmaxub %xmm1, %xmm0, %xmm1
; AVX1-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    vandps %ymm3, %ymm0, %ymm0
; AVX1-NEXT:    vmovups %ymm0, (%rdi)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test12:
; AVX2:       ## BB#0: ## %vector.ph
; AVX2-NEXT:    vmovd %esi, %xmm0
; AVX2-NEXT:    vpbroadcastb %xmm0, %ymm0
; AVX2-NEXT:    vmovdqu (%rdi), %ymm1
; AVX2-NEXT:    vpsubusb %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    vmovdqu %ymm0, (%rdi)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
vector.ph:
  %0 = insertelement <32 x i8> undef, i8 %w, i32 0
  %broadcast15 = shufflevector <32 x i8> %0, <32 x i8> undef, <32 x i32> zeroinitializer
  %1 = getelementptr inbounds i8, i8* %head, i64 0
  %2 = bitcast i8* %1 to <32 x i8>*
  %3 = load <32 x i8>, <32 x i8>* %2, align 1
  %4 = icmp ult <32 x i8> %3, %broadcast15
  %5 = sub <32 x i8> %3, %broadcast15
  %6 = select <32 x i1> %4, <32 x i8> zeroinitializer, <32 x i8> %5
  store <32 x i8> %6, <32 x i8>* %2, align 1
  ret void
}