; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+xop | FileCheck %s --check-prefix=XOP

; PR37428 - https://bugs.llvm.org/show_bug.cgi?id=37428
; This is a larger-than-usual regression test to verify that several backend
; transforms are working together. We want to hoist the expansion of non-uniform
; vector shifts out of a loop if we do not have real vector shift instructions.
; See test/Transforms/CodeGenPrepare/X86/vec-shift.ll for the 1st step in that
; sequence.

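; For reference, a rough C equivalent of the scalar loop below (a hedged
; reconstruction for readability, not necessarily the exact source that
; PR37428 was reduced from):
;
;   void vector_variable_shift_left_loop(int *arr, const char *control,
;                                        int count, int amt0, int amt1) {
;     // Each element is shifted by one of two loop-invariant amounts,
;     // chosen per element by the control byte.
;     for (int i = 0; i < count; ++i)
;       arr[i] <<= (control[i] == 0) ? amt0 : amt1;
;   }
;
; The IR below is the already-vectorized form, so the interesting codegen
; question is whether the splatted shift amounts stay hoisted into the loop
; preheader instead of being rematerialized inside the vector body.
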
define void @vector_variable_shift_left_loop(i32* nocapture %arr, i8* nocapture readonly %control, i32 %count, i32 %amt0, i32 %amt1) nounwind {
; SSE-LABEL: vector_variable_shift_left_loop:
; SSE:       # %bb.0: # %entry
; SSE-NEXT:    testl %edx, %edx
; SSE-NEXT:    jle .LBB0_9
; SSE-NEXT:  # %bb.1: # %for.body.preheader
; SSE-NEXT:    movl %ecx, %r9d
; SSE-NEXT:    movl %edx, %eax
; SSE-NEXT:    cmpl $31, %edx
; SSE-NEXT:    ja .LBB0_3
; SSE-NEXT:  # %bb.2:
; SSE-NEXT:    xorl %edx, %edx
; SSE-NEXT:    jmp .LBB0_6
; SSE-NEXT:  .LBB0_3: # %vector.ph
; SSE-NEXT:    movl %eax, %edx
; SSE-NEXT:    andl $-32, %edx
; SSE-NEXT:    movd %r9d, %xmm0
; SSE-NEXT:    movd %r8d, %xmm2
; SSE-NEXT:    xorl %ecx, %ecx
; SSE-NEXT:    pmovzxdq {{.*#+}} xmm14 = xmm0[0],zero,xmm0[1],zero
; SSE-NEXT:    pmovzxdq {{.*#+}} xmm15 = xmm2[0],zero,xmm2[1],zero
; SSE-NEXT:    .p2align 4, 0x90
; SSE-NEXT:  .LBB0_4: # %vector.body
; SSE-NEXT:    # =>This Inner Loop Header: Depth=1
; SSE-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; SSE-NEXT:    movq {{.*#+}} xmm3 = mem[0],zero
; SSE-NEXT:    movq {{.*#+}} xmm4 = mem[0],zero
; SSE-NEXT:    movq {{.*#+}} xmm5 = mem[0],zero
; SSE-NEXT:    pxor %xmm1, %xmm1
; SSE-NEXT:    pcmpeqb %xmm1, %xmm0
; SSE-NEXT:    pmovsxbd %xmm0, %xmm7
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE-NEXT:    pmovsxbd %xmm0, %xmm0
; SSE-NEXT:    pcmpeqb %xmm1, %xmm3
; SSE-NEXT:    pmovsxbd %xmm3, %xmm13
; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,1,1]
; SSE-NEXT:    pmovsxbd %xmm3, %xmm6
; SSE-NEXT:    pcmpeqb %xmm1, %xmm4
; SSE-NEXT:    pmovsxbd %xmm4, %xmm11
; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[1,1,1,1]
; SSE-NEXT:    pmovsxbd %xmm3, %xmm2
; SSE-NEXT:    pcmpeqb %xmm1, %xmm5
; SSE-NEXT:    pmovsxbd %xmm5, %xmm9
; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm5[1,1,1,1]
; SSE-NEXT:    pmovsxbd %xmm3, %xmm10
; SSE-NEXT:    movdqu 16(%rdi,%rcx,4), %xmm3
; SSE-NEXT:    movdqa %xmm3, %xmm4
; SSE-NEXT:    pslld %xmm14, %xmm4
; SSE-NEXT:    pslld %xmm15, %xmm3
; SSE-NEXT:    blendvps %xmm0, %xmm4, %xmm3
; SSE-NEXT:    movdqu (%rdi,%rcx,4), %xmm8
; SSE-NEXT:    movdqa %xmm8, %xmm5
; SSE-NEXT:    pslld %xmm14, %xmm5
; SSE-NEXT:    pslld %xmm15, %xmm8
; SSE-NEXT:    movdqa %xmm7, %xmm0
; SSE-NEXT:    blendvps %xmm0, %xmm5, %xmm8
; SSE-NEXT:    movdqu 48(%rdi,%rcx,4), %xmm12
; SSE-NEXT:    movdqa %xmm12, %xmm5
; SSE-NEXT:    pslld %xmm14, %xmm5
; SSE-NEXT:    pslld %xmm15, %xmm12
; SSE-NEXT:    movdqa %xmm6, %xmm0
; SSE-NEXT:    blendvps %xmm0, %xmm5, %xmm12
; SSE-NEXT:    movdqu 32(%rdi,%rcx,4), %xmm6
; SSE-NEXT:    movdqa %xmm6, %xmm5
; SSE-NEXT:    pslld %xmm14, %xmm5
; SSE-NEXT:    pslld %xmm15, %xmm6
; SSE-NEXT:    movdqa %xmm13, %xmm0
; SSE-NEXT:    blendvps %xmm0, %xmm5, %xmm6
; SSE-NEXT:    movdqu 80(%rdi,%rcx,4), %xmm1
; SSE-NEXT:    movdqa %xmm1, %xmm5
; SSE-NEXT:    pslld %xmm14, %xmm5
; SSE-NEXT:    pslld %xmm15, %xmm1
; SSE-NEXT:    movdqa %xmm2, %xmm0
; SSE-NEXT:    blendvps %xmm0, %xmm5, %xmm1
; SSE-NEXT:    movdqu 64(%rdi,%rcx,4), %xmm5
; SSE-NEXT:    movdqa %xmm5, %xmm2
; SSE-NEXT:    pslld %xmm14, %xmm2
; SSE-NEXT:    pslld %xmm15, %xmm5
; SSE-NEXT:    movdqa %xmm11, %xmm0
; SSE-NEXT:    blendvps %xmm0, %xmm2, %xmm5
; SSE-NEXT:    movdqu 112(%rdi,%rcx,4), %xmm2
; SSE-NEXT:    movdqa %xmm2, %xmm4
; SSE-NEXT:    pslld %xmm14, %xmm4
; SSE-NEXT:    pslld %xmm15, %xmm2
; SSE-NEXT:    movdqa %xmm10, %xmm0
; SSE-NEXT:    blendvps %xmm0, %xmm4, %xmm2
; SSE-NEXT:    movdqu 96(%rdi,%rcx,4), %xmm4
; SSE-NEXT:    movdqa %xmm4, %xmm7
; SSE-NEXT:    pslld %xmm14, %xmm7
; SSE-NEXT:    pslld %xmm15, %xmm4
; SSE-NEXT:    movdqa %xmm9, %xmm0
; SSE-NEXT:    blendvps %xmm0, %xmm7, %xmm4
; SSE-NEXT:    movups %xmm8, (%rdi,%rcx,4)
; SSE-NEXT:    movups %xmm3, 16(%rdi,%rcx,4)
; SSE-NEXT:    movups %xmm6, 32(%rdi,%rcx,4)
; SSE-NEXT:    movups %xmm12, 48(%rdi,%rcx,4)
; SSE-NEXT:    movups %xmm5, 64(%rdi,%rcx,4)
; SSE-NEXT:    movups %xmm1, 80(%rdi,%rcx,4)
; SSE-NEXT:    movups %xmm4, 96(%rdi,%rcx,4)
; SSE-NEXT:    movups %xmm2, 112(%rdi,%rcx,4)
; SSE-NEXT:    addq $32, %rcx
; SSE-NEXT:    cmpq %rcx, %rdx
; SSE-NEXT:    jne .LBB0_4
; SSE-NEXT:  # %bb.5: # %middle.block
; SSE-NEXT:    cmpq %rax, %rdx
; SSE-NEXT:    jne .LBB0_6
; SSE-NEXT:  .LBB0_9: # %for.cond.cleanup
; SSE-NEXT:    retq
; SSE-NEXT:    .p2align 4, 0x90
; SSE-NEXT:  .LBB0_8: # %for.body
; SSE-NEXT:    # in Loop: Header=BB0_6 Depth=1
; SSE-NEXT:    # kill: def $cl killed $cl killed $ecx
; SSE-NEXT:    shll %cl, (%rdi,%rdx,4)
; SSE-NEXT:    incq %rdx
; SSE-NEXT:    cmpq %rdx, %rax
; SSE-NEXT:    je .LBB0_9
; SSE-NEXT:  .LBB0_6: # %for.body
; SSE-NEXT:    # =>This Inner Loop Header: Depth=1
; SSE-NEXT:    cmpb $0, (%rsi,%rdx)
; SSE-NEXT:    movl %r9d, %ecx
; SSE-NEXT:    je .LBB0_8
; SSE-NEXT:  # %bb.7: # %for.body
; SSE-NEXT:    # in Loop: Header=BB0_6 Depth=1
; SSE-NEXT:    movl %r8d, %ecx
; SSE-NEXT:    jmp .LBB0_8
;
; AVX1-LABEL: vector_variable_shift_left_loop:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    testl %edx, %edx
; AVX1-NEXT:    jle .LBB0_9
; AVX1-NEXT:  # %bb.1: # %for.body.preheader
; AVX1-NEXT:    movl %ecx, %r9d
; AVX1-NEXT:    movl %edx, %eax
; AVX1-NEXT:    cmpl $31, %edx
; AVX1-NEXT:    ja .LBB0_3
; AVX1-NEXT:  # %bb.2:
; AVX1-NEXT:    xorl %edx, %edx
; AVX1-NEXT:    jmp .LBB0_6
; AVX1-NEXT:  .LBB0_3: # %vector.ph
; AVX1-NEXT:    movl %eax, %edx
; AVX1-NEXT:    andl $-32, %edx
; AVX1-NEXT:    vmovd %r9d, %xmm0
; AVX1-NEXT:    vmovd %r8d, %xmm1
; AVX1-NEXT:    xorl %ecx, %ecx
; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero
; AVX1-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero
; AVX1-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm13 = xmm0[0],zero,xmm0[1],zero
; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm14 = xmm1[0],zero,xmm1[1],zero
; AVX1-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm15 = xmm0[0],zero,xmm0[1],zero
; AVX1-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm11 = xmm1[0],zero,xmm1[1],zero
; AVX1-NEXT:    .p2align 4, 0x90
; AVX1-NEXT:  .LBB0_4: # %vector.body
; AVX1-NEXT:    # =>This Inner Loop Header: Depth=1
; AVX1-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX1-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX1-NEXT:    vmovq {{.*#+}} xmm3 = mem[0],zero
; AVX1-NEXT:    vmovq {{.*#+}} xmm4 = mem[0],zero
; AVX1-NEXT:    vpxor %xmm12, %xmm12, %xmm12
; AVX1-NEXT:    vpcmpeqb %xmm1, %xmm12, %xmm1
; AVX1-NEXT:    vpmovsxbd %xmm1, %xmm5
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
; AVX1-NEXT:    vpmovsxbd %xmm1, %xmm1
; AVX1-NEXT:    vpcmpeqb %xmm2, %xmm12, %xmm2
; AVX1-NEXT:    vpmovsxbd %xmm2, %xmm6
; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[1,1,1,1]
; AVX1-NEXT:    vpmovsxbd %xmm2, %xmm2
; AVX1-NEXT:    vpcmpeqb %xmm3, %xmm12, %xmm3
; AVX1-NEXT:    vpmovzxdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm7 = mem[0],zero,mem[1],zero
; AVX1-NEXT:    vmovdqu (%rdi,%rcx,4), %xmm8
; AVX1-NEXT:    vpslld %xmm7, %xmm8, %xmm9
; AVX1-NEXT:    vpmovzxdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm10 = mem[0],zero,mem[1],zero
; AVX1-NEXT:    vpslld %xmm10, %xmm8, %xmm0
; AVX1-NEXT:    vblendvps %xmm5, %xmm9, %xmm0, %xmm8
; AVX1-NEXT:    vpmovsxbd %xmm3, %xmm5
; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[1,1,1,1]
; AVX1-NEXT:    vpmovsxbd %xmm3, %xmm3
; AVX1-NEXT:    vpcmpeqb %xmm4, %xmm12, %xmm4
; AVX1-NEXT:    vmovdqu 16(%rdi,%rcx,4), %xmm0
; AVX1-NEXT:    vpslld %xmm7, %xmm0, %xmm7
; AVX1-NEXT:    vpslld %xmm10, %xmm0, %xmm0
; AVX1-NEXT:    vpmovsxbd %xmm4, %xmm9
; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[1,1,1,1]
; AVX1-NEXT:    vpmovsxbd %xmm4, %xmm12
; AVX1-NEXT:    vblendvps %xmm1, %xmm7, %xmm0, %xmm10
; AVX1-NEXT:    vmovdqu 32(%rdi,%rcx,4), %xmm1
; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-NEXT:    vpslld %xmm0, %xmm1, %xmm7
; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
; AVX1-NEXT:    vpslld %xmm4, %xmm1, %xmm1
; AVX1-NEXT:    vblendvps %xmm6, %xmm7, %xmm1, %xmm1
; AVX1-NEXT:    vmovdqu 48(%rdi,%rcx,4), %xmm6
; AVX1-NEXT:    vpslld %xmm0, %xmm6, %xmm7
; AVX1-NEXT:    vpslld %xmm4, %xmm6, %xmm6
; AVX1-NEXT:    vblendvps %xmm2, %xmm7, %xmm6, %xmm2
; AVX1-NEXT:    vmovdqu 64(%rdi,%rcx,4), %xmm6
; AVX1-NEXT:    vpslld %xmm13, %xmm6, %xmm7
; AVX1-NEXT:    vpslld %xmm14, %xmm6, %xmm6
; AVX1-NEXT:    vblendvps %xmm5, %xmm7, %xmm6, %xmm5
; AVX1-NEXT:    vmovdqu 80(%rdi,%rcx,4), %xmm6
; AVX1-NEXT:    vpslld %xmm13, %xmm6, %xmm7
; AVX1-NEXT:    vpslld %xmm14, %xmm6, %xmm6
; AVX1-NEXT:    vblendvps %xmm3, %xmm7, %xmm6, %xmm3
; AVX1-NEXT:    vmovdqu 96(%rdi,%rcx,4), %xmm6
; AVX1-NEXT:    vpslld %xmm15, %xmm6, %xmm7
; AVX1-NEXT:    vpslld %xmm11, %xmm6, %xmm6
; AVX1-NEXT:    vblendvps %xmm9, %xmm7, %xmm6, %xmm6
; AVX1-NEXT:    vmovdqu 112(%rdi,%rcx,4), %xmm7
; AVX1-NEXT:    vpslld %xmm15, %xmm7, %xmm0
; AVX1-NEXT:    vpslld %xmm11, %xmm7, %xmm7
; AVX1-NEXT:    vblendvps %xmm12, %xmm0, %xmm7, %xmm0
; AVX1-NEXT:    vmovups %xmm8, (%rdi,%rcx,4)
; AVX1-NEXT:    vmovups %xmm10, 16(%rdi,%rcx,4)
; AVX1-NEXT:    vmovups %xmm1, 32(%rdi,%rcx,4)
; AVX1-NEXT:    vmovups %xmm2, 48(%rdi,%rcx,4)
; AVX1-NEXT:    vmovups %xmm5, 64(%rdi,%rcx,4)
; AVX1-NEXT:    vmovups %xmm3, 80(%rdi,%rcx,4)
; AVX1-NEXT:    vmovups %xmm6, 96(%rdi,%rcx,4)
; AVX1-NEXT:    vmovups %xmm0, 112(%rdi,%rcx,4)
; AVX1-NEXT:    addq $32, %rcx
; AVX1-NEXT:    cmpq %rcx, %rdx
; AVX1-NEXT:    jne .LBB0_4
; AVX1-NEXT:  # %bb.5: # %middle.block
; AVX1-NEXT:    cmpq %rax, %rdx
; AVX1-NEXT:    jne .LBB0_6
; AVX1-NEXT:  .LBB0_9: # %for.cond.cleanup
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
; AVX1-NEXT:    .p2align 4, 0x90
; AVX1-NEXT:  .LBB0_8: # %for.body
; AVX1-NEXT:    # in Loop: Header=BB0_6 Depth=1
; AVX1-NEXT:    # kill: def $cl killed $cl killed $ecx
; AVX1-NEXT:    shll %cl, (%rdi,%rdx,4)
; AVX1-NEXT:    incq %rdx
; AVX1-NEXT:    cmpq %rdx, %rax
; AVX1-NEXT:    je .LBB0_9
; AVX1-NEXT:  .LBB0_6: # %for.body
; AVX1-NEXT:    # =>This Inner Loop Header: Depth=1
; AVX1-NEXT:    cmpb $0, (%rsi,%rdx)
; AVX1-NEXT:    movl %r9d, %ecx
; AVX1-NEXT:    je .LBB0_8
; AVX1-NEXT:  # %bb.7: # %for.body
; AVX1-NEXT:    # in Loop: Header=BB0_6 Depth=1
; AVX1-NEXT:    movl %r8d, %ecx
; AVX1-NEXT:    jmp .LBB0_8
;
; AVX2-LABEL: vector_variable_shift_left_loop:
; AVX2:       # %bb.0: # %entry
; AVX2-NEXT:    testl %edx, %edx
; AVX2-NEXT:    jle .LBB0_9
; AVX2-NEXT:  # %bb.1: # %for.body.preheader
; AVX2-NEXT:    movl %ecx, %r9d
; AVX2-NEXT:    movl %edx, %eax
; AVX2-NEXT:    cmpl $31, %edx
; AVX2-NEXT:    ja .LBB0_3
; AVX2-NEXT:  # %bb.2:
; AVX2-NEXT:    xorl %edx, %edx
; AVX2-NEXT:    jmp .LBB0_6
; AVX2-NEXT:  .LBB0_3: # %vector.ph
; AVX2-NEXT:    movl %eax, %edx
; AVX2-NEXT:    andl $-32, %edx
; AVX2-NEXT:    vmovd %r9d, %xmm0
; AVX2-NEXT:    vpbroadcastd %xmm0, %ymm0
; AVX2-NEXT:    vmovd %r8d, %xmm1
; AVX2-NEXT:    vpbroadcastd %xmm1, %ymm1
; AVX2-NEXT:    xorl %ecx, %ecx
; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT:    .p2align 4, 0x90
; AVX2-NEXT:  .LBB0_4: # %vector.body
; AVX2-NEXT:    # =>This Inner Loop Header: Depth=1
; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; AVX2-NEXT:    vpcmpeqd %ymm2, %ymm3, %ymm3
; AVX2-NEXT:    vblendvps %ymm3, %ymm0, %ymm1, %ymm3
; AVX2-NEXT:    vpcmpeqd %ymm2, %ymm4, %ymm4
; AVX2-NEXT:    vblendvps %ymm4, %ymm0, %ymm1, %ymm4
; AVX2-NEXT:    vpcmpeqd %ymm2, %ymm5, %ymm5
; AVX2-NEXT:    vblendvps %ymm5, %ymm0, %ymm1, %ymm5
; AVX2-NEXT:    vpcmpeqd %ymm2, %ymm6, %ymm6
; AVX2-NEXT:    vblendvps %ymm6, %ymm0, %ymm1, %ymm6
; AVX2-NEXT:    vmovdqu (%rdi,%rcx,4), %ymm7
; AVX2-NEXT:    vpsllvd %ymm3, %ymm7, %ymm3
; AVX2-NEXT:    vmovdqu 32(%rdi,%rcx,4), %ymm7
; AVX2-NEXT:    vpsllvd %ymm4, %ymm7, %ymm4
; AVX2-NEXT:    vmovdqu 64(%rdi,%rcx,4), %ymm7
; AVX2-NEXT:    vpsllvd %ymm5, %ymm7, %ymm5
; AVX2-NEXT:    vmovdqu 96(%rdi,%rcx,4), %ymm7
; AVX2-NEXT:    vpsllvd %ymm6, %ymm7, %ymm6
; AVX2-NEXT:    vmovdqu %ymm3, (%rdi,%rcx,4)
; AVX2-NEXT:    vmovdqu %ymm4, 32(%rdi,%rcx,4)
; AVX2-NEXT:    vmovdqu %ymm5, 64(%rdi,%rcx,4)
; AVX2-NEXT:    vmovdqu %ymm6, 96(%rdi,%rcx,4)
; AVX2-NEXT:    addq $32, %rcx
; AVX2-NEXT:    cmpq %rcx, %rdx
; AVX2-NEXT:    jne .LBB0_4
; AVX2-NEXT:  # %bb.5: # %middle.block
; AVX2-NEXT:    cmpq %rax, %rdx
; AVX2-NEXT:    jne .LBB0_6
; AVX2-NEXT:  .LBB0_9: # %for.cond.cleanup
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
; AVX2-NEXT:    .p2align 4, 0x90
; AVX2-NEXT:  .LBB0_8: # %for.body
; AVX2-NEXT:    # in Loop: Header=BB0_6 Depth=1
; AVX2-NEXT:    # kill: def $cl killed $cl killed $ecx
; AVX2-NEXT:    shll %cl, (%rdi,%rdx,4)
; AVX2-NEXT:    incq %rdx
; AVX2-NEXT:    cmpq %rdx, %rax
; AVX2-NEXT:    je .LBB0_9
; AVX2-NEXT:  .LBB0_6: # %for.body
; AVX2-NEXT:    # =>This Inner Loop Header: Depth=1
; AVX2-NEXT:    cmpb $0, (%rsi,%rdx)
; AVX2-NEXT:    movl %r9d, %ecx
; AVX2-NEXT:    je .LBB0_8
; AVX2-NEXT:  # %bb.7: # %for.body
; AVX2-NEXT:    # in Loop: Header=BB0_6 Depth=1
; AVX2-NEXT:    movl %r8d, %ecx
; AVX2-NEXT:    jmp .LBB0_8
;
; XOP-LABEL: vector_variable_shift_left_loop:
; XOP:       # %bb.0: # %entry
; XOP-NEXT:    testl %edx, %edx
; XOP-NEXT:    jle .LBB0_9
; XOP-NEXT:  # %bb.1: # %for.body.preheader
; XOP-NEXT:    movl %ecx, %r9d
; XOP-NEXT:    movl %edx, %eax
; XOP-NEXT:    cmpl $31, %edx
; XOP-NEXT:    ja .LBB0_3
; XOP-NEXT:  # %bb.2:
; XOP-NEXT:    xorl %edx, %edx
; XOP-NEXT:    jmp .LBB0_6
; XOP-NEXT:  .LBB0_3: # %vector.ph
; XOP-NEXT:    movl %eax, %edx
; XOP-NEXT:    andl $-32, %edx
; XOP-NEXT:    vmovd %r9d, %xmm0
; XOP-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; XOP-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm9
; XOP-NEXT:    vmovd %r8d, %xmm1
; XOP-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; XOP-NEXT:    vinsertf128 $1, %xmm1, %ymm1, %ymm14
; XOP-NEXT:    xorl %ecx, %ecx
; XOP-NEXT:    vpxor %xmm8, %xmm8, %xmm8
; XOP-NEXT:    vextractf128 $1, %ymm9, %xmm15
; XOP-NEXT:    vextractf128 $1, %ymm14, %xmm4
; XOP-NEXT:    .p2align 4, 0x90
; XOP-NEXT:  .LBB0_4: # %vector.body
; XOP-NEXT:    # =>This Inner Loop Header: Depth=1
; XOP-NEXT:    vmovq {{.*#+}} xmm5 = mem[0],zero
; XOP-NEXT:    vmovq {{.*#+}} xmm6 = mem[0],zero
; XOP-NEXT:    vmovq {{.*#+}} xmm7 = mem[0],zero
; XOP-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
; XOP-NEXT:    vpcomeqb %xmm8, %xmm5, %xmm5
; XOP-NEXT:    vpmovsxbd %xmm5, %xmm0
; XOP-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[1,1,1,1]
; XOP-NEXT:    vpmovsxbd %xmm5, %xmm5
; XOP-NEXT:    vpcomeqb %xmm8, %xmm6, %xmm6
; XOP-NEXT:    vpmovsxbd %xmm6, %xmm10
; XOP-NEXT:    vpshufd {{.*#+}} xmm6 = xmm6[1,1,1,1]
; XOP-NEXT:    vpmovsxbd %xmm6, %xmm6
; XOP-NEXT:    vpcomeqb %xmm8, %xmm7, %xmm7
; XOP-NEXT:    vpmovsxbd %xmm7, %xmm11
; XOP-NEXT:    vpshufd {{.*#+}} xmm7 = xmm7[1,1,1,1]
; XOP-NEXT:    vpmovsxbd %xmm7, %xmm7
; XOP-NEXT:    vpcomeqb %xmm8, %xmm2, %xmm2
; XOP-NEXT:    vpmovsxbd %xmm2, %xmm12
; XOP-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[1,1,1,1]
; XOP-NEXT:    vpmovsxbd %xmm2, %xmm2
; XOP-NEXT:    vblendvps %xmm5, %xmm15, %xmm4, %xmm5
; XOP-NEXT:    vpshld %xmm5, 16(%rdi,%rcx,4), %xmm13
; XOP-NEXT:    vblendvps %xmm0, %xmm9, %xmm14, %xmm0
; XOP-NEXT:    vpshld %xmm0, (%rdi,%rcx,4), %xmm0
; XOP-NEXT:    vblendvps %xmm6, %xmm15, %xmm4, %xmm6
; XOP-NEXT:    vpshld %xmm6, 48(%rdi,%rcx,4), %xmm6
; XOP-NEXT:    vblendvps %xmm10, %xmm9, %xmm14, %xmm5
; XOP-NEXT:    vpshld %xmm5, 32(%rdi,%rcx,4), %xmm5
; XOP-NEXT:    vblendvps %xmm7, %xmm15, %xmm4, %xmm7
; XOP-NEXT:    vpshld %xmm7, 80(%rdi,%rcx,4), %xmm7
; XOP-NEXT:    vblendvps %xmm11, %xmm9, %xmm14, %xmm1
; XOP-NEXT:    vpshld %xmm1, 64(%rdi,%rcx,4), %xmm1
; XOP-NEXT:    vblendvps %xmm2, %xmm15, %xmm4, %xmm2
; XOP-NEXT:    vpshld %xmm2, 112(%rdi,%rcx,4), %xmm2
; XOP-NEXT:    vblendvps %xmm12, %xmm9, %xmm14, %xmm3
; XOP-NEXT:    vpshld %xmm3, 96(%rdi,%rcx,4), %xmm3
; XOP-NEXT:    vmovdqu %xmm0, (%rdi,%rcx,4)
; XOP-NEXT:    vmovdqu %xmm13, 16(%rdi,%rcx,4)
; XOP-NEXT:    vmovdqu %xmm5, 32(%rdi,%rcx,4)
; XOP-NEXT:    vmovdqu %xmm6, 48(%rdi,%rcx,4)
; XOP-NEXT:    vmovdqu %xmm1, 64(%rdi,%rcx,4)
; XOP-NEXT:    vmovdqu %xmm7, 80(%rdi,%rcx,4)
; XOP-NEXT:    vmovdqu %xmm3, 96(%rdi,%rcx,4)
; XOP-NEXT:    vmovdqu %xmm2, 112(%rdi,%rcx,4)
; XOP-NEXT:    addq $32, %rcx
; XOP-NEXT:    cmpq %rcx, %rdx
; XOP-NEXT:    jne .LBB0_4
; XOP-NEXT:  # %bb.5: # %middle.block
; XOP-NEXT:    cmpq %rax, %rdx
; XOP-NEXT:    jne .LBB0_6
; XOP-NEXT:  .LBB0_9: # %for.cond.cleanup
; XOP-NEXT:    vzeroupper
; XOP-NEXT:    retq
; XOP-NEXT:    .p2align 4, 0x90
; XOP-NEXT:  .LBB0_8: # %for.body
; XOP-NEXT:    # in Loop: Header=BB0_6 Depth=1
; XOP-NEXT:    # kill: def $cl killed $cl killed $ecx
; XOP-NEXT:    shll %cl, (%rdi,%rdx,4)
; XOP-NEXT:    incq %rdx
; XOP-NEXT:    cmpq %rdx, %rax
; XOP-NEXT:    je .LBB0_9
; XOP-NEXT:  .LBB0_6: # %for.body
; XOP-NEXT:    # =>This Inner Loop Header: Depth=1
; XOP-NEXT:    cmpb $0, (%rsi,%rdx)
; XOP-NEXT:    movl %r9d, %ecx
; XOP-NEXT:    je .LBB0_8
; XOP-NEXT:  # %bb.7: # %for.body
; XOP-NEXT:    # in Loop: Header=BB0_6 Depth=1
; XOP-NEXT:    movl %r8d, %ecx
; XOP-NEXT:    jmp .LBB0_8
entry:
  %cmp12 = icmp sgt i32 %count, 0
  br i1 %cmp12, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader:
  %wide.trip.count = zext i32 %count to i64
  %min.iters.check = icmp ult i32 %count, 32
  br i1 %min.iters.check, label %for.body.preheader40, label %vector.ph

for.body.preheader40:
  %indvars.iv.ph = phi i64 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
  br label %for.body

vector.ph:
  %n.vec = and i64 %wide.trip.count, 4294967264
  %broadcast.splatinsert20 = insertelement <8 x i32> undef, i32 %amt0, i32 0
  %broadcast.splat21 = shufflevector <8 x i32> %broadcast.splatinsert20, <8 x i32> undef, <8 x i32> zeroinitializer
  %broadcast.splatinsert22 = insertelement <8 x i32> undef, i32 %amt1, i32 0
  %broadcast.splat23 = shufflevector <8 x i32> %broadcast.splatinsert22, <8 x i32> undef, <8 x i32> zeroinitializer
  %broadcast.splatinsert24 = insertelement <8 x i32> undef, i32 %amt0, i32 0
  %broadcast.splat25 = shufflevector <8 x i32> %broadcast.splatinsert24, <8 x i32> undef, <8 x i32> zeroinitializer
  %broadcast.splatinsert26 = insertelement <8 x i32> undef, i32 %amt1, i32 0
  %broadcast.splat27 = shufflevector <8 x i32> %broadcast.splatinsert26, <8 x i32> undef, <8 x i32> zeroinitializer
  %broadcast.splatinsert28 = insertelement <8 x i32> undef, i32 %amt0, i32 0
  %broadcast.splat29 = shufflevector <8 x i32> %broadcast.splatinsert28, <8 x i32> undef, <8 x i32> zeroinitializer
  %broadcast.splatinsert30 = insertelement <8 x i32> undef, i32 %amt1, i32 0
  %broadcast.splat31 = shufflevector <8 x i32> %broadcast.splatinsert30, <8 x i32> undef, <8 x i32> zeroinitializer
  %broadcast.splatinsert32 = insertelement <8 x i32> undef, i32 %amt0, i32 0
  %broadcast.splat33 = shufflevector <8 x i32> %broadcast.splatinsert32, <8 x i32> undef, <8 x i32> zeroinitializer
  %broadcast.splatinsert34 = insertelement <8 x i32> undef, i32 %amt1, i32 0
  %broadcast.splat35 = shufflevector <8 x i32> %broadcast.splatinsert34, <8 x i32> undef, <8 x i32> zeroinitializer
  br label %vector.body

vector.body:
  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds i8, i8* %control, i64 %index
  %1 = bitcast i8* %0 to <8 x i8>*
  %wide.load = load <8 x i8>, <8 x i8>* %1, align 1
  %2 = getelementptr inbounds i8, i8* %0, i64 8
  %3 = bitcast i8* %2 to <8 x i8>*
  %wide.load17 = load <8 x i8>, <8 x i8>* %3, align 1
  %4 = getelementptr inbounds i8, i8* %0, i64 16
  %5 = bitcast i8* %4 to <8 x i8>*
  %wide.load18 = load <8 x i8>, <8 x i8>* %5, align 1
  %6 = getelementptr inbounds i8, i8* %0, i64 24
  %7 = bitcast i8* %6 to <8 x i8>*
  %wide.load19 = load <8 x i8>, <8 x i8>* %7, align 1
  %8 = icmp eq <8 x i8> %wide.load, zeroinitializer
  %9 = icmp eq <8 x i8> %wide.load17, zeroinitializer
  %10 = icmp eq <8 x i8> %wide.load18, zeroinitializer
  %11 = icmp eq <8 x i8> %wide.load19, zeroinitializer
  %12 = select <8 x i1> %8, <8 x i32> %broadcast.splat21, <8 x i32> %broadcast.splat23
  %13 = select <8 x i1> %9, <8 x i32> %broadcast.splat25, <8 x i32> %broadcast.splat27
  %14 = select <8 x i1> %10, <8 x i32> %broadcast.splat29, <8 x i32> %broadcast.splat31
  %15 = select <8 x i1> %11, <8 x i32> %broadcast.splat33, <8 x i32> %broadcast.splat35
  %16 = getelementptr inbounds i32, i32* %arr, i64 %index
  %17 = bitcast i32* %16 to <8 x i32>*
  %wide.load36 = load <8 x i32>, <8 x i32>* %17, align 4
  %18 = getelementptr inbounds i32, i32* %16, i64 8
  %19 = bitcast i32* %18 to <8 x i32>*
  %wide.load37 = load <8 x i32>, <8 x i32>* %19, align 4
  %20 = getelementptr inbounds i32, i32* %16, i64 16
  %21 = bitcast i32* %20 to <8 x i32>*
  %wide.load38 = load <8 x i32>, <8 x i32>* %21, align 4
  %22 = getelementptr inbounds i32, i32* %16, i64 24
  %23 = bitcast i32* %22 to <8 x i32>*
  %wide.load39 = load <8 x i32>, <8 x i32>* %23, align 4
  %24 = shl <8 x i32> %wide.load36, %12
  %25 = shl <8 x i32> %wide.load37, %13
  %26 = shl <8 x i32> %wide.load38, %14
  %27 = shl <8 x i32> %wide.load39, %15
  %28 = bitcast i32* %16 to <8 x i32>*
  store <8 x i32> %24, <8 x i32>* %28, align 4
  %29 = bitcast i32* %18 to <8 x i32>*
  store <8 x i32> %25, <8 x i32>* %29, align 4
  %30 = bitcast i32* %20 to <8 x i32>*
  store <8 x i32> %26, <8 x i32>* %30, align 4
  %31 = bitcast i32* %22 to <8 x i32>*
  store <8 x i32> %27, <8 x i32>* %31, align 4
  %index.next = add i64 %index, 32
  %32 = icmp eq i64 %index.next, %n.vec
  br i1 %32, label %middle.block, label %vector.body

middle.block:
  %cmp.n = icmp eq i64 %n.vec, %wide.trip.count
  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader40

for.cond.cleanup:
  ret void

for.body:
  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ %indvars.iv.ph, %for.body.preheader40 ]
  %arrayidx = getelementptr inbounds i8, i8* %control, i64 %indvars.iv
  %33 = load i8, i8* %arrayidx, align 1
  %tobool = icmp eq i8 %33, 0
  %cond = select i1 %tobool, i32 %amt0, i32 %amt1
  %arrayidx2 = getelementptr inbounds i32, i32* %arr, i64 %indvars.iv
  %34 = load i32, i32* %arrayidx2, align 4
  %shl = shl i32 %34, %cond
  store i32 %shl, i32* %arrayidx2, align 4
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
  br i1 %exitcond, label %for.cond.cleanup, label %for.body
}

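; A rough C equivalent of the simpler variant below (again a hedged sketch):
; the shifted value %x is loop-invariant, so only the shift amount varies per
; element and the stores are the only memory side effect on %arr. Note the IR
; contains only the vectorized body, so elements beyond a multiple of 4 are
; not processed here.
;
;   void vector_variable_shift_left_loop_simpler(int *arr, const char *control,
;                                                int count, int amt0, int amt1,
;                                                int x) {
;     for (int i = 0; i < count; ++i)
;       arr[i] = x << ((control[i] == 0) ? amt0 : amt1);
;   }
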
define void @vector_variable_shift_left_loop_simpler(i32* nocapture %arr, i8* nocapture readonly %control, i32 %count, i32 %amt0, i32 %amt1, i32 %x) nounwind {
; SSE-LABEL: vector_variable_shift_left_loop_simpler:
; SSE:       # %bb.0: # %entry
; SSE-NEXT:    testl %edx, %edx
; SSE-NEXT:    jle .LBB1_3
; SSE-NEXT:  # %bb.1: # %vector.ph
; SSE-NEXT:    movl %edx, %eax
; SSE-NEXT:    andl $-4, %eax
; SSE-NEXT:    movd %ecx, %xmm0
; SSE-NEXT:    movd %r8d, %xmm3
; SSE-NEXT:    movd %r9d, %xmm1
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; SSE-NEXT:    xorl %ecx, %ecx
; SSE-NEXT:    pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; SSE-NEXT:    movdqa %xmm1, %xmm2
; SSE-NEXT:    pslld %xmm0, %xmm2
; SSE-NEXT:    pmovzxdq {{.*#+}} xmm0 = xmm3[0],zero,xmm3[1],zero
; SSE-NEXT:    pslld %xmm0, %xmm1
; SSE-NEXT:    pxor %xmm3, %xmm3
; SSE-NEXT:    .p2align 4, 0x90
; SSE-NEXT:  .LBB1_2: # %vector.body
; SSE-NEXT:    # =>This Inner Loop Header: Depth=1
; SSE-NEXT:    pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; SSE-NEXT:    pcmpeqd %xmm3, %xmm0
; SSE-NEXT:    movdqa %xmm1, %xmm4
; SSE-NEXT:    blendvps %xmm0, %xmm2, %xmm4
; SSE-NEXT:    movups %xmm4, (%rdi,%rcx,4)
; SSE-NEXT:    addq $4, %rcx
; SSE-NEXT:    cmpq %rcx, %rax
; SSE-NEXT:    jne .LBB1_2
; SSE-NEXT:  .LBB1_3: # %exit
; SSE-NEXT:    retq
;
; AVX1-LABEL: vector_variable_shift_left_loop_simpler:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    testl %edx, %edx
; AVX1-NEXT:    jle .LBB1_3
; AVX1-NEXT:  # %bb.1: # %vector.ph
; AVX1-NEXT:    movl %edx, %eax
; AVX1-NEXT:    andl $-4, %eax
; AVX1-NEXT:    vmovd %ecx, %xmm0
; AVX1-NEXT:    vmovd %r8d, %xmm1
; AVX1-NEXT:    vmovd %r9d, %xmm2
; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
; AVX1-NEXT:    xorl %ecx, %ecx
; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX1-NEXT:    vpslld %xmm0, %xmm2, %xmm0
; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; AVX1-NEXT:    vpslld %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    .p2align 4, 0x90
; AVX1-NEXT:  .LBB1_2: # %vector.body
; AVX1-NEXT:    # =>This Inner Loop Header: Depth=1
; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm3, %xmm3
; AVX1-NEXT:    vblendvps %xmm3, %xmm0, %xmm1, %xmm3
; AVX1-NEXT:    vmovups %xmm3, (%rdi,%rcx,4)
; AVX1-NEXT:    addq $4, %rcx
; AVX1-NEXT:    cmpq %rcx, %rax
; AVX1-NEXT:    jne .LBB1_2
; AVX1-NEXT:  .LBB1_3: # %exit
; AVX1-NEXT:    retq
;
; AVX2-LABEL: vector_variable_shift_left_loop_simpler:
; AVX2:       # %bb.0: # %entry
; AVX2-NEXT:    testl %edx, %edx
; AVX2-NEXT:    jle .LBB1_3
; AVX2-NEXT:  # %bb.1: # %vector.ph
; AVX2-NEXT:    movl %edx, %eax
; AVX2-NEXT:    andl $-4, %eax
; AVX2-NEXT:    vmovd %ecx, %xmm0
; AVX2-NEXT:    vpbroadcastd %xmm0, %xmm0
; AVX2-NEXT:    vmovd %r8d, %xmm1
; AVX2-NEXT:    vpbroadcastd %xmm1, %xmm1
; AVX2-NEXT:    vmovd %r9d, %xmm2
; AVX2-NEXT:    vpbroadcastd %xmm2, %xmm2
; AVX2-NEXT:    xorl %ecx, %ecx
; AVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; AVX2-NEXT:    .p2align 4, 0x90
; AVX2-NEXT:  .LBB1_2: # %vector.body
; AVX2-NEXT:    # =>This Inner Loop Header: Depth=1
; AVX2-NEXT:    vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX2-NEXT:    vpcmpeqd %xmm3, %xmm4, %xmm4
; AVX2-NEXT:    vblendvps %xmm4, %xmm0, %xmm1, %xmm4
; AVX2-NEXT:    vpsllvd %xmm4, %xmm2, %xmm4
; AVX2-NEXT:    vmovdqu %xmm4, (%rdi,%rcx,4)
; AVX2-NEXT:    addq $4, %rcx
; AVX2-NEXT:    cmpq %rcx, %rax
; AVX2-NEXT:    jne .LBB1_2
; AVX2-NEXT:  .LBB1_3: # %exit
; AVX2-NEXT:    retq
;
; XOP-LABEL: vector_variable_shift_left_loop_simpler:
; XOP:       # %bb.0: # %entry
; XOP-NEXT:    testl %edx, %edx
; XOP-NEXT:    jle .LBB1_3
; XOP-NEXT:  # %bb.1: # %vector.ph
; XOP-NEXT:    movl %edx, %eax
; XOP-NEXT:    andl $-4, %eax
; XOP-NEXT:    vmovd %ecx, %xmm0
; XOP-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; XOP-NEXT:    vmovd %r8d, %xmm1
; XOP-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; XOP-NEXT:    vmovd %r9d, %xmm2
; XOP-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
; XOP-NEXT:    xorl %ecx, %ecx
; XOP-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; XOP-NEXT:    .p2align 4, 0x90
; XOP-NEXT:  .LBB1_2: # %vector.body
; XOP-NEXT:    # =>This Inner Loop Header: Depth=1
; XOP-NEXT:    vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; XOP-NEXT:    vpcomeqd %xmm3, %xmm4, %xmm4
; XOP-NEXT:    vblendvps %xmm4, %xmm0, %xmm1, %xmm4
; XOP-NEXT:    vpshld %xmm4, %xmm2, %xmm4
; XOP-NEXT:    vmovdqu %xmm4, (%rdi,%rcx,4)
; XOP-NEXT:    addq $4, %rcx
; XOP-NEXT:    cmpq %rcx, %rax
; XOP-NEXT:    jne .LBB1_2
; XOP-NEXT:  .LBB1_3: # %exit
; XOP-NEXT:    retq
entry:
  %cmp16 = icmp sgt i32 %count, 0
  %wide.trip.count = zext i32 %count to i64
  br i1 %cmp16, label %vector.ph, label %exit

vector.ph:
  %n.vec = and i64 %wide.trip.count, 4294967292
  %splatinsert18 = insertelement <4 x i32> undef, i32 %amt0, i32 0
  %splat1 = shufflevector <4 x i32> %splatinsert18, <4 x i32> undef, <4 x i32> zeroinitializer
  %splatinsert20 = insertelement <4 x i32> undef, i32 %amt1, i32 0
  %splat2 = shufflevector <4 x i32> %splatinsert20, <4 x i32> undef, <4 x i32> zeroinitializer
  %splatinsert22 = insertelement <4 x i32> undef, i32 %x, i32 0
  %splat3 = shufflevector <4 x i32> %splatinsert22, <4 x i32> undef, <4 x i32> zeroinitializer
  br label %vector.body

vector.body:
  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds i8, i8* %control, i64 %index
  %1 = bitcast i8* %0 to <4 x i8>*
  %wide.load = load <4 x i8>, <4 x i8>* %1, align 1
  %2 = icmp eq <4 x i8> %wide.load, zeroinitializer
  %3 = select <4 x i1> %2, <4 x i32> %splat1, <4 x i32> %splat2
  %4 = shl <4 x i32> %splat3, %3
  %5 = getelementptr inbounds i32, i32* %arr, i64 %index
  %6 = bitcast i32* %5 to <4 x i32>*
  store <4 x i32> %4, <4 x i32>* %6, align 4
  %index.next = add i64 %index, 4
  %7 = icmp eq i64 %index.next, %n.vec
  br i1 %7, label %exit, label %vector.body

exit:
  ret void
}