; RUN: llc -mcpu=core2 < %s | FileCheck %s -check-prefix=SSE2
; RUN: llc -mcpu=corei7-avx < %s | FileCheck %s -check-prefix=AVX1
; RUN: llc -mcpu=core-avx2 < %s | FileCheck %s -check-prefix=AVX2

; Codegen test for unsigned saturating subtraction.
;
; Every function below is a counted loop over 16384 elements of %head whose
; body is a vector compare + arithmetic + select that is arithmetically equal
; to usubsat(x, y) = (x >u y) ? x - y : 0.  The check lines require llc to emit
; psubusw / psubusb (or their VEX-encoded forms) for that body.
;
;   test1  - test6 : 128-bit vectors, checked for all three CPUs above.
;   test7  - test12: 256-bit vectors, checked for core-avx2 only.
;
; In each loop the induction variable advances by the number of elements held
; in one vector, so successive iterations operate on disjoint chunks and the
; loop ends exactly at element 16384.

target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx10.8.0"

; test1: <8 x i16>, constant operand.
;   (x <s 0) ? (x ^ 0x8000) : 0   ==   usubsat(x, 0x8000)
define void @test1(i16* nocapture %head) nounwind {
vector.ph:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds i16* %head, i64 %index
  %1 = bitcast i16* %0 to <8 x i16>*
  %2 = load <8 x i16>* %1, align 2
  ; Sign bit set means x >= 0x8000 when viewed as unsigned.
  %3 = icmp slt <8 x i16> %2, zeroinitializer
  ; Flipping the sign bit of such an x is the same as subtracting 0x8000.
  %4 = xor <8 x i16> %2, <i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768>
  %5 = select <8 x i1> %3, <8 x i16> %4, <8 x i16> zeroinitializer
  store <8 x i16> %5, <8 x i16>* %1, align 2
  %index.next = add i64 %index, 8
  %6 = icmp eq i64 %index.next, 16384
  br i1 %6, label %for.end, label %vector.body

for.end:                                          ; preds = %vector.body
  ret void

; SSE2: @test1
; SSE2: psubusw LCPI0_0(%rip), %xmm0

; AVX1: @test1
; AVX1: vpsubusw LCPI0_0(%rip), %xmm0, %xmm0

; AVX2: @test1
; AVX2: vpsubusw LCPI0_0(%rip), %xmm0, %xmm0
}

; test2: <8 x i16>, constant operand.
;   (x >u 32766) ? (x + -32767) : 0   ==   usubsat(x, 32767)
define void @test2(i16* nocapture %head) nounwind {
vector.ph:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds i16* %head, i64 %index
  %1 = bitcast i16* %0 to <8 x i16>*
  %2 = load <8 x i16>* %1, align 2
  %3 = icmp ugt <8 x i16> %2, <i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766>
  ; Adding -32767 is the wrapped form of subtracting 32767.
  %4 = add <8 x i16> %2, <i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767>
  %5 = select <8 x i1> %3, <8 x i16> %4, <8 x i16> zeroinitializer
  store <8 x i16> %5, <8 x i16>* %1, align 2
  %index.next = add i64 %index, 8
  %6 = icmp eq i64 %index.next, 16384
  br i1 %6, label %for.end, label %vector.body

for.end:                                          ; preds = %vector.body
  ret void

; SSE2: @test2
; SSE2: psubusw LCPI1_0(%rip), %xmm0

; AVX1: @test2
; AVX1: vpsubusw LCPI1_0(%rip), %xmm0, %xmm0

; AVX2: @test2
; AVX2: vpsubusw LCPI1_0(%rip), %xmm0, %xmm0
}

; test3: <8 x i16>, variable operand (%w splatted to every lane).
;   (x <u w) ? 0 : (x - w)   ==   usubsat(x, w)
define void @test3(i16* nocapture %head, i16 zeroext %w) nounwind {
vector.ph:
  ; Broadcast %w: insert into lane 0, then shuffle lane 0 into all lanes.
  %0 = insertelement <8 x i16> undef, i16 %w, i32 0
  %broadcast15 = shufflevector <8 x i16> %0, <8 x i16> undef, <8 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %1 = getelementptr inbounds i16* %head, i64 %index
  %2 = bitcast i16* %1 to <8 x i16>*
  %3 = load <8 x i16>* %2, align 2
  %4 = icmp ult <8 x i16> %3, %broadcast15
  %5 = sub <8 x i16> %3, %broadcast15
  %6 = select <8 x i1> %4, <8 x i16> zeroinitializer, <8 x i16> %5
  store <8 x i16> %6, <8 x i16>* %2, align 2
  %index.next = add i64 %index, 8
  %7 = icmp eq i64 %index.next, 16384
  br i1 %7, label %for.end, label %vector.body

for.end:                                          ; preds = %vector.body
  ret void

; SSE2: @test3
; SSE2: psubusw %xmm0, %xmm1

; AVX1: @test3
; AVX1: vpsubusw %xmm0, %xmm1, %xmm1

; AVX2: @test3
; AVX2: vpsubusw %xmm0, %xmm1, %xmm1
}

; test4: <16 x i8>, constant operand.
;   (x <s 0) ? (x ^ 0x80) : 0   ==   usubsat(x, 0x80)
define void @test4(i8* nocapture %head) nounwind {
vector.ph:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds i8* %head, i64 %index
  %1 = bitcast i8* %0 to <16 x i8>*
  %2 = load <16 x i8>* %1, align 1
  %3 = icmp slt <16 x i8> %2, zeroinitializer
  %4 = xor <16 x i8> %2, <i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128>
  %5 = select <16 x i1> %3, <16 x i8> %4, <16 x i8> zeroinitializer
  store <16 x i8> %5, <16 x i8>* %1, align 1
  %index.next = add i64 %index, 16
  %6 = icmp eq i64 %index.next, 16384
  br i1 %6, label %for.end, label %vector.body

for.end:                                          ; preds = %vector.body
  ret void

; SSE2: @test4
; SSE2: psubusb LCPI3_0(%rip), %xmm0

; AVX1: @test4
; AVX1: vpsubusb LCPI3_0(%rip), %xmm0, %xmm0

; AVX2: @test4
; AVX2: vpsubusb LCPI3_0(%rip), %xmm0, %xmm0
}

; test5: <16 x i8>, constant operand.
;   (x >u 126) ? (x + -127) : 0   ==   usubsat(x, 127)
define void @test5(i8* nocapture %head) nounwind {
vector.ph:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds i8* %head, i64 %index
  %1 = bitcast i8* %0 to <16 x i8>*
  %2 = load <16 x i8>* %1, align 1
  %3 = icmp ugt <16 x i8> %2, <i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126>
  %4 = add <16 x i8> %2, <i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127>
  %5 = select <16 x i1> %3, <16 x i8> %4, <16 x i8> zeroinitializer
  store <16 x i8> %5, <16 x i8>* %1, align 1
  %index.next = add i64 %index, 16
  %6 = icmp eq i64 %index.next, 16384
  br i1 %6, label %for.end, label %vector.body

for.end:                                          ; preds = %vector.body
  ret void

; SSE2: @test5
; SSE2: psubusb LCPI4_0(%rip), %xmm0

; AVX1: @test5
; AVX1: vpsubusb LCPI4_0(%rip), %xmm0, %xmm0

; AVX2: @test5
; AVX2: vpsubusb LCPI4_0(%rip), %xmm0, %xmm0
}

; test6: <16 x i8>, variable operand (%w splatted to every lane).
;   (x <u w) ? 0 : (x - w)   ==   usubsat(x, w)
define void @test6(i8* nocapture %head, i8 zeroext %w) nounwind {
vector.ph:
  %0 = insertelement <16 x i8> undef, i8 %w, i32 0
  %broadcast15 = shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %1 = getelementptr inbounds i8* %head, i64 %index
  %2 = bitcast i8* %1 to <16 x i8>*
  %3 = load <16 x i8>* %2, align 1
  %4 = icmp ult <16 x i8> %3, %broadcast15
  %5 = sub <16 x i8> %3, %broadcast15
  %6 = select <16 x i1> %4, <16 x i8> zeroinitializer, <16 x i8> %5
  store <16 x i8> %6, <16 x i8>* %2, align 1
  %index.next = add i64 %index, 16
  %7 = icmp eq i64 %index.next, 16384
  br i1 %7, label %for.end, label %vector.body

for.end:                                          ; preds = %vector.body
  ret void

; SSE2: @test6
; SSE2: psubusb %xmm0, %xmm1

; AVX1: @test6
; AVX1: vpsubusb %xmm0, %xmm1, %xmm1

; AVX2: @test6
; AVX2: vpsubusb %xmm0, %xmm1, %xmm1
}

; test7: <16 x i16> (256-bit) variant of test1; only the core-avx2 run is checked.
;   (x <s 0) ? (x ^ 0x8000) : 0   ==   usubsat(x, 0x8000)
define void @test7(i16* nocapture %head) nounwind {
vector.ph:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds i16* %head, i64 %index
  %1 = bitcast i16* %0 to <16 x i16>*
  %2 = load <16 x i16>* %1, align 2
  %3 = icmp slt <16 x i16> %2, zeroinitializer
  %4 = xor <16 x i16> %2, <i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768>
  %5 = select <16 x i1> %3, <16 x i16> %4, <16 x i16> zeroinitializer
  store <16 x i16> %5, <16 x i16>* %1, align 2
  ; One vector holds 16 i16 elements, so step by 16 to avoid overlapping chunks.
  %index.next = add i64 %index, 16
  %6 = icmp eq i64 %index.next, 16384
  br i1 %6, label %for.end, label %vector.body

for.end:                                          ; preds = %vector.body
  ret void

; AVX2: @test7
; AVX2: vpsubusw LCPI6_0(%rip), %ymm0, %ymm0
}

; test8: <16 x i16> (256-bit) variant of test2; only the core-avx2 run is checked.
;   (x >u 32766) ? (x + -32767) : 0   ==   usubsat(x, 32767)
define void @test8(i16* nocapture %head) nounwind {
vector.ph:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds i16* %head, i64 %index
  %1 = bitcast i16* %0 to <16 x i16>*
  %2 = load <16 x i16>* %1, align 2
  %3 = icmp ugt <16 x i16> %2, <i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766>
  %4 = add <16 x i16> %2, <i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767>
  %5 = select <16 x i1> %3, <16 x i16> %4, <16 x i16> zeroinitializer
  store <16 x i16> %5, <16 x i16>* %1, align 2
  ; One vector holds 16 i16 elements, so step by 16 to avoid overlapping chunks.
  %index.next = add i64 %index, 16
  %6 = icmp eq i64 %index.next, 16384
  br i1 %6, label %for.end, label %vector.body

for.end:                                          ; preds = %vector.body
  ret void

; AVX2: @test8
; AVX2: vpsubusw LCPI7_0(%rip), %ymm0, %ymm0
}

; test9: <16 x i16> (256-bit) variant of test3; only the core-avx2 run is checked.
;   (x <u w) ? 0 : (x - w)   ==   usubsat(x, w)
define void @test9(i16* nocapture %head, i16 zeroext %w) nounwind {
vector.ph:
  %0 = insertelement <16 x i16> undef, i16 %w, i32 0
  %broadcast15 = shufflevector <16 x i16> %0, <16 x i16> undef, <16 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %1 = getelementptr inbounds i16* %head, i64 %index
  %2 = bitcast i16* %1 to <16 x i16>*
  %3 = load <16 x i16>* %2, align 2
  %4 = icmp ult <16 x i16> %3, %broadcast15
  %5 = sub <16 x i16> %3, %broadcast15
  %6 = select <16 x i1> %4, <16 x i16> zeroinitializer, <16 x i16> %5
  store <16 x i16> %6, <16 x i16>* %2, align 2
  ; One vector holds 16 i16 elements, so step by 16 to avoid overlapping chunks.
  %index.next = add i64 %index, 16
  %7 = icmp eq i64 %index.next, 16384
  br i1 %7, label %for.end, label %vector.body

for.end:                                          ; preds = %vector.body
  ret void

; AVX2: @test9
; AVX2: vpsubusw %ymm0, %ymm1, %ymm1
}

; test10: <32 x i8> (256-bit) variant of test4; only the core-avx2 run is checked.
;   (x <s 0) ? (x ^ 0x80) : 0   ==   usubsat(x, 0x80)
define void @test10(i8* nocapture %head) nounwind {
vector.ph:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds i8* %head, i64 %index
  %1 = bitcast i8* %0 to <32 x i8>*
  %2 = load <32 x i8>* %1, align 1
  %3 = icmp slt <32 x i8> %2, zeroinitializer
  %4 = xor <32 x i8> %2, <i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128>
  %5 = select <32 x i1> %3, <32 x i8> %4, <32 x i8> zeroinitializer
  store <32 x i8> %5, <32 x i8>* %1, align 1
  ; One vector holds 32 i8 elements, so step by 32 to avoid overlapping chunks.
  %index.next = add i64 %index, 32
  %6 = icmp eq i64 %index.next, 16384
  br i1 %6, label %for.end, label %vector.body

for.end:                                          ; preds = %vector.body
  ret void

; AVX2: @test10
; AVX2: vpsubusb LCPI9_0(%rip), %ymm0, %ymm0
}

; test11: <32 x i8> (256-bit) variant of test5; only the core-avx2 run is checked.
;   (x >u 126) ? (x + -127) : 0   ==   usubsat(x, 127)
define void @test11(i8* nocapture %head) nounwind {
vector.ph:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds i8* %head, i64 %index
  %1 = bitcast i8* %0 to <32 x i8>*
  %2 = load <32 x i8>* %1, align 1
  %3 = icmp ugt <32 x i8> %2, <i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126>
  %4 = add <32 x i8> %2, <i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127>
  %5 = select <32 x i1> %3, <32 x i8> %4, <32 x i8> zeroinitializer
  store <32 x i8> %5, <32 x i8>* %1, align 1
  ; One vector holds 32 i8 elements, so step by 32 to avoid overlapping chunks.
  %index.next = add i64 %index, 32
  %6 = icmp eq i64 %index.next, 16384
  br i1 %6, label %for.end, label %vector.body

for.end:                                          ; preds = %vector.body
  ret void

; AVX2: @test11
; AVX2: vpsubusb LCPI10_0(%rip), %ymm0, %ymm0
}

; test12: <32 x i8> (256-bit) variant of test6; only the core-avx2 run is checked.
;   (x <u w) ? 0 : (x - w)   ==   usubsat(x, w)
define void @test12(i8* nocapture %head, i8 zeroext %w) nounwind {
vector.ph:
  %0 = insertelement <32 x i8> undef, i8 %w, i32 0
  %broadcast15 = shufflevector <32 x i8> %0, <32 x i8> undef, <32 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %1 = getelementptr inbounds i8* %head, i64 %index
  %2 = bitcast i8* %1 to <32 x i8>*
  %3 = load <32 x i8>* %2, align 1
  %4 = icmp ult <32 x i8> %3, %broadcast15
  %5 = sub <32 x i8> %3, %broadcast15
  %6 = select <32 x i1> %4, <32 x i8> zeroinitializer, <32 x i8> %5
  store <32 x i8> %6, <32 x i8>* %2, align 1
  ; One vector holds 32 i8 elements, so step by 32 to avoid overlapping chunks.
  %index.next = add i64 %index, 32
  %7 = icmp eq i64 %index.next, 16384
  br i1 %7, label %for.end, label %vector.body

for.end:                                          ; preds = %vector.body
  ret void

; AVX2: @test12
; AVX2: vpsubusb %ymm0, %ymm1, %ymm1
}