; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse4.2 | FileCheck %s --check-prefix=NARROW
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse4.2 -x86-experimental-vector-widening-legalization | FileCheck %s --check-prefix=WIDE
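; The two RUN lines differ only in -x86-experimental-vector-widening-legalization,
; so NARROW covers the default vector type legalization and WIDE covers the
; experimental widening path.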

; FIXME: We shouldn't require both a movd and an insert in the wide version.
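; (The FIXME refers to the WIDE loop body below, which assembles the 8-byte
; source vector with a movd of the low half followed by a pinsrd of the high
; half rather than a single 64-bit load.)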
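
; For reference, a rough C equivalent of the IR below. This is a reconstruction
; for readability, not the original source; the vec8 typedef name is made up:
;
;   typedef signed char vec8 __attribute__((vector_size(8)));
;
;   void update(long long *dst_i, long long *src_i, int n) {
;     vec8 *dst, *src;
;     for (int i = 0; i < n; i++) {
;       dst = (vec8 *)(dst_i + i);
;       src = (vec8 *)(src_i + i);
;       dst[i] = (src[i] + 1) >> 2; /* add 1, then arithmetic shift right */
;     }
;   }
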
define void @update(i64* %dst_i, i64* %src_i, i32 %n) nounwind {
; NARROW-LABEL: update:
; NARROW:       # %bb.0: # %entry
; NARROW-NEXT:    subl $12, %esp
; NARROW-NEXT:    movl $0, (%esp)
; NARROW-NEXT:    pcmpeqd %xmm0, %xmm0
; NARROW-NEXT:    movdqa {{.*#+}} xmm1 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; NARROW-NEXT:    jmp .LBB0_1
; NARROW-NEXT:    .p2align 4, 0x90
; NARROW-NEXT:  .LBB0_2: # %forbody
; NARROW-NEXT:    # in Loop: Header=BB0_1 Depth=1
; NARROW-NEXT:    movl (%esp), %eax
; NARROW-NEXT:    leal (,%eax,8), %ecx
; NARROW-NEXT:    movl {{[0-9]+}}(%esp), %edx
; NARROW-NEXT:    addl %ecx, %edx
; NARROW-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; NARROW-NEXT:    addl {{[0-9]+}}(%esp), %ecx
; NARROW-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; NARROW-NEXT:    pmovzxbw {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; NARROW-NEXT:    psubw %xmm0, %xmm2
; NARROW-NEXT:    psllw $8, %xmm2
; NARROW-NEXT:    psraw $8, %xmm2
; NARROW-NEXT:    psrlw $2, %xmm2
; NARROW-NEXT:    pshufb %xmm1, %xmm2
; NARROW-NEXT:    movq %xmm2, (%edx,%eax,8)
; NARROW-NEXT:    incl (%esp)
; NARROW-NEXT:  .LBB0_1: # %forcond
; NARROW-NEXT:    # =>This Inner Loop Header: Depth=1
; NARROW-NEXT:    movl (%esp), %eax
; NARROW-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
; NARROW-NEXT:    jl .LBB0_2
; NARROW-NEXT:  # %bb.3: # %afterfor
; NARROW-NEXT:    addl $12, %esp
; NARROW-NEXT:    retl
;
; WIDE-LABEL: update:
; WIDE:       # %bb.0: # %entry
; WIDE-NEXT:    subl $12, %esp
; WIDE-NEXT:    movl $0, (%esp)
; WIDE-NEXT:    pcmpeqd %xmm0, %xmm0
; WIDE-NEXT:    movdqa {{.*#+}} xmm1 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
; WIDE-NEXT:    movdqa {{.*#+}} xmm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
; WIDE-NEXT:    jmp .LBB0_1
; WIDE-NEXT:    .p2align 4, 0x90
; WIDE-NEXT:  .LBB0_2: # %forbody
; WIDE-NEXT:    # in Loop: Header=BB0_1 Depth=1
; WIDE-NEXT:    movl (%esp), %eax
; WIDE-NEXT:    leal (,%eax,8), %ecx
; WIDE-NEXT:    movl {{[0-9]+}}(%esp), %edx
; WIDE-NEXT:    addl %ecx, %edx
; WIDE-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; WIDE-NEXT:    addl {{[0-9]+}}(%esp), %ecx
; WIDE-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; WIDE-NEXT:    movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
; WIDE-NEXT:    pinsrd $1, 4(%ecx,%eax,8), %xmm3
; WIDE-NEXT:    psubb %xmm0, %xmm3
; WIDE-NEXT:    psrlw $2, %xmm3
; WIDE-NEXT:    pand %xmm1, %xmm3
; WIDE-NEXT:    pxor %xmm2, %xmm3
; WIDE-NEXT:    psubb %xmm2, %xmm3
; WIDE-NEXT:    pextrd $1, %xmm3, 4(%edx,%eax,8)
; WIDE-NEXT:    movd %xmm3, (%edx,%eax,8)
; WIDE-NEXT:    incl (%esp)
; WIDE-NEXT:  .LBB0_1: # %forcond
; WIDE-NEXT:    # =>This Inner Loop Header: Depth=1
; WIDE-NEXT:    movl (%esp), %eax
; WIDE-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
; WIDE-NEXT:    jl .LBB0_2
; WIDE-NEXT:  # %bb.3: # %afterfor
; WIDE-NEXT:    addl $12, %esp
; WIDE-NEXT:    retl
entry:
  %dst_i.addr = alloca i64*
  %src_i.addr = alloca i64*
  %n.addr = alloca i32
  %i = alloca i32, align 4
  %dst = alloca <8 x i8>*, align 4
  %src = alloca <8 x i8>*, align 4
  store i64* %dst_i, i64** %dst_i.addr
  store i64* %src_i, i64** %src_i.addr
  store i32 %n, i32* %n.addr
  store i32 0, i32* %i
  br label %forcond

forcond:
  %tmp = load i32, i32* %i
  %tmp1 = load i32, i32* %n.addr
  %cmp = icmp slt i32 %tmp, %tmp1
  br i1 %cmp, label %forbody, label %afterfor

forbody:
  %tmp2 = load i32, i32* %i
  %tmp3 = load i64*, i64** %dst_i.addr
  %arrayidx = getelementptr i64, i64* %tmp3, i32 %tmp2
  %conv = bitcast i64* %arrayidx to <8 x i8>*
  store <8 x i8>* %conv, <8 x i8>** %dst
  %tmp4 = load i32, i32* %i
  %tmp5 = load i64*, i64** %src_i.addr
  %arrayidx6 = getelementptr i64, i64* %tmp5, i32 %tmp4
  %conv7 = bitcast i64* %arrayidx6 to <8 x i8>*
  store <8 x i8>* %conv7, <8 x i8>** %src
  %tmp8 = load i32, i32* %i
  %tmp9 = load <8 x i8>*, <8 x i8>** %dst
  %arrayidx10 = getelementptr <8 x i8>, <8 x i8>* %tmp9, i32 %tmp8
  %tmp11 = load i32, i32* %i
  %tmp12 = load <8 x i8>*, <8 x i8>** %src
  %arrayidx13 = getelementptr <8 x i8>, <8 x i8>* %tmp12, i32 %tmp11
  %tmp14 = load <8 x i8>, <8 x i8>* %arrayidx13
  %add = add <8 x i8> %tmp14, < i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1 >
  %shr = ashr <8 x i8> %add, < i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2 >
  store <8 x i8> %shr, <8 x i8>* %arrayidx10
  br label %forinc

forinc:
  %tmp15 = load i32, i32* %i
  %inc = add i32 %tmp15, 1
  store i32 %inc, i32* %i
  br label %forcond

afterfor:
  ret void
}