; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse4.2 | FileCheck %s --check-prefix=WIDE

; Codegen test for a loop that loads an <8 x i8> vector, adds 1 to every lane,
; arithmetic-shifts every lane right by 2 and stores the result. The expected
; i686 + SSE4.2 output is matched under the WIDE check prefix.
;
; In the expected assembly below, the lane-wise add of 1 appears as a psubb of
; an all-ones register (pcmpeqd of a register with itself), and the lane-wise
; ashr by 2 appears as psrlw $2, a pand with 63 per byte, then a pxor and a
; psubb with 32 per byte, which restores the sign bits lost by the logical
; shift.

; FIXME: We shouldn't require both a movd and an insert in the wide version.
; NOTE(review): the expected output below loads and stores the vector with a
; single movq each and contains no movd or insert instruction, so this FIXME
; looks stale - confirm before removing it.

define void @update(i64* %dst_i, i64* %src_i, i32 %n) nounwind {
; WIDE-LABEL: update:
; WIDE: # %bb.0: # %entry
; WIDE-NEXT: subl $12, %esp
; WIDE-NEXT: movl $0, (%esp)
; WIDE-NEXT: pcmpeqd %xmm0, %xmm0
; WIDE-NEXT: movdqa {{.*#+}} xmm1 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
; WIDE-NEXT: movdqa {{.*#+}} xmm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
; WIDE-NEXT: .p2align 4, 0x90
; WIDE-NEXT: .LBB0_1: # %forcond
; WIDE-NEXT: # =>This Inner Loop Header: Depth=1
; WIDE-NEXT: movl (%esp), %eax
; WIDE-NEXT: cmpl {{[0-9]+}}(%esp), %eax
; WIDE-NEXT: jge .LBB0_3
; WIDE-NEXT: # %bb.2: # %forbody
; WIDE-NEXT: # in Loop: Header=BB0_1 Depth=1
; WIDE-NEXT: movl (%esp), %eax
; WIDE-NEXT: leal (,%eax,8), %ecx
; WIDE-NEXT: movl {{[0-9]+}}(%esp), %edx
; WIDE-NEXT: addl %ecx, %edx
; WIDE-NEXT: movl %edx, {{[0-9]+}}(%esp)
; WIDE-NEXT: addl {{[0-9]+}}(%esp), %ecx
; WIDE-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; WIDE-NEXT: movq {{.*#+}} xmm3 = mem[0],zero
; WIDE-NEXT: psubb %xmm0, %xmm3
; WIDE-NEXT: psrlw $2, %xmm3
; WIDE-NEXT: pand %xmm1, %xmm3
; WIDE-NEXT: pxor %xmm2, %xmm3
; WIDE-NEXT: psubb %xmm2, %xmm3
; WIDE-NEXT: movq %xmm3, (%edx,%eax,8)
; WIDE-NEXT: incl (%esp)
; WIDE-NEXT: jmp .LBB0_1
; WIDE-NEXT: .LBB0_3: # %afterfor
; WIDE-NEXT: addl $12, %esp
; WIDE-NEXT: retl
entry:
  ; Stack slots for the three arguments, the loop counter %i, and the two
  ; vector pointers (%dst, %src) that are recomputed on every iteration.
  %dst_i.addr = alloca i64*
  %src_i.addr = alloca i64*
  %n.addr = alloca i32
  %i = alloca i32, align 4
  %dst = alloca <8 x i8>*, align 4
  %src = alloca <8 x i8>*, align 4
  ; Spill the arguments and start the counter at 0.
  store i64* %dst_i, i64** %dst_i.addr
  store i64* %src_i, i64** %src_i.addr
  store i32 %n, i32* %n.addr
  store i32 0, i32* %i
  br label %forcond

forcond:
  ; Loop condition: keep iterating while i < n (signed comparison).
  %tmp = load i32, i32* %i
  %tmp1 = load i32, i32* %n.addr
  %cmp = icmp slt i32 %tmp, %tmp1
  br i1 %cmp, label %forbody, label %afterfor

forbody:
  ; dst = (<8 x i8>*)&dst_i[i]
  %tmp2 = load i32, i32* %i
  %tmp3 = load i64*, i64** %dst_i.addr
  %arrayidx = getelementptr i64, i64* %tmp3, i32 %tmp2
  %conv = bitcast i64* %arrayidx to <8 x i8>*
  store <8 x i8>* %conv, <8 x i8>** %dst
  ; src = (<8 x i8>*)&src_i[i]
  %tmp4 = load i32, i32* %i
  %tmp5 = load i64*, i64** %src_i.addr
  %arrayidx6 = getelementptr i64, i64* %tmp5, i32 %tmp4
  %conv7 = bitcast i64* %arrayidx6 to <8 x i8>*
  store <8 x i8>* %conv7, <8 x i8>** %src
  ; Both pointers are indexed by i a second time here, on top of the offset
  ; already folded into dst and src above, so the accessed vector lies 2*i
  ; eight-byte elements from the corresponding base argument. The expected
  ; assembly reflects this with the scaled-index movq store.
  %tmp8 = load i32, i32* %i
  %tmp9 = load <8 x i8>*, <8 x i8>** %dst
  %arrayidx10 = getelementptr <8 x i8>, <8 x i8>* %tmp9, i32 %tmp8
  %tmp11 = load i32, i32* %i
  %tmp12 = load <8 x i8>*, <8 x i8>** %src
  %arrayidx13 = getelementptr <8 x i8>, <8 x i8>* %tmp12, i32 %tmp11
  ; Operation under test: per-lane (x + 1) >> 2 with an arithmetic shift.
  %tmp14 = load <8 x i8>, <8 x i8>* %arrayidx13
  %add = add <8 x i8> %tmp14, < i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1 >
  %shr = ashr <8 x i8> %add, < i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2 >
  store <8 x i8> %shr, <8 x i8>* %arrayidx10
  br label %forinc

forinc:
  ; i = i + 1, then re-test the loop condition.
  %tmp15 = load i32, i32* %i
  %inc = add i32 %tmp15, 1
  store i32 %inc, i32* %i
  br label %forcond

afterfor:
  ret void
}