; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+xop | FileCheck %s --check-prefix=XOP

; PR37428 - https://bugs.llvm.org/show_bug.cgi?id=37428
; This is a larger-than-usual regression test to verify that several backend
; transforms are working together. We want to hoist the expansion of non-uniform
; vector shifts out of a loop if we do not have real vector shift instructions.
; See test/Transforms/CodeGenPrepare/X86/vec-shift.ll for the 1st step in that
; sequence.
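;
; For reference, the two functions below roughly correspond to C source like
; the following (a reconstruction from the scalar loop IR, not the verbatim
; code from the bug report; note that the second function's IR contains only
; the vectorized part of its loop):
;
;   void vector_variable_shift_left_loop(int *arr, const char *control,
;                                        int count, int amt0, int amt1) {
;     for (int i = 0; i < count; ++i)
;       arr[i] <<= control[i] ? amt1 : amt0;
;   }
;
;   void vector_variable_shift_left_loop_simpler(int *arr, const char *control,
;                                                int count, int amt0, int amt1,
;                                                int x) {
;     for (int i = 0; i < count; ++i)
;       arr[i] = x << (control[i] ? amt1 : amt0);
;   }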

define void @vector_variable_shift_left_loop(i32* nocapture %arr, i8* nocapture readonly %control, i32 %count, i32 %amt0, i32 %amt1) nounwind {
; SSE-LABEL: vector_variable_shift_left_loop:
; SSE:       # %bb.0: # %entry
; SSE-NEXT:    testl %edx, %edx
; SSE-NEXT:    jle .LBB0_9
; SSE-NEXT:  # %bb.1: # %for.body.preheader
; SSE-NEXT:    movl %ecx, %r9d
; SSE-NEXT:    movl %edx, %eax
; SSE-NEXT:    cmpl $31, %edx
; SSE-NEXT:    ja .LBB0_3
; SSE-NEXT:  # %bb.2:
; SSE-NEXT:    xorl %edx, %edx
; SSE-NEXT:    jmp .LBB0_6
; SSE-NEXT:  .LBB0_3: # %vector.ph
; SSE-NEXT:    movl %eax, %edx
; SSE-NEXT:    andl $-32, %edx
; SSE-NEXT:    movd %r9d, %xmm0
; SSE-NEXT:    movd %r8d, %xmm2
; SSE-NEXT:    xorl %ecx, %ecx
; SSE-NEXT:    pmovzxdq {{.*#+}} xmm14 = xmm0[0],zero,xmm0[1],zero
; SSE-NEXT:    pmovzxdq {{.*#+}} xmm15 = xmm2[0],zero,xmm2[1],zero
; SSE-NEXT:    .p2align 4, 0x90
; SSE-NEXT:  .LBB0_4: # %vector.body
; SSE-NEXT:    # =>This Inner Loop Header: Depth=1
; SSE-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; SSE-NEXT:    movq {{.*#+}} xmm3 = mem[0],zero
; SSE-NEXT:    movq {{.*#+}} xmm4 = mem[0],zero
; SSE-NEXT:    movq {{.*#+}} xmm5 = mem[0],zero
; SSE-NEXT:    pxor %xmm1, %xmm1
; SSE-NEXT:    pcmpeqb %xmm1, %xmm0
; SSE-NEXT:    pmovsxbd %xmm0, %xmm7
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE-NEXT:    pmovsxbd %xmm0, %xmm0
; SSE-NEXT:    pcmpeqb %xmm1, %xmm3
; SSE-NEXT:    pmovsxbd %xmm3, %xmm13
; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,1,1]
; SSE-NEXT:    pmovsxbd %xmm3, %xmm6
; SSE-NEXT:    pcmpeqb %xmm1, %xmm4
; SSE-NEXT:    pmovsxbd %xmm4, %xmm11
; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[1,1,1,1]
; SSE-NEXT:    pmovsxbd %xmm3, %xmm2
; SSE-NEXT:    pcmpeqb %xmm1, %xmm5
; SSE-NEXT:    pmovsxbd %xmm5, %xmm9
; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm5[1,1,1,1]
; SSE-NEXT:    pmovsxbd %xmm3, %xmm10
; SSE-NEXT:    movdqu 16(%rdi,%rcx,4), %xmm3
; SSE-NEXT:    movdqa %xmm3, %xmm4
; SSE-NEXT:    pslld %xmm14, %xmm4
; SSE-NEXT:    pslld %xmm15, %xmm3
; SSE-NEXT:    blendvps %xmm0, %xmm4, %xmm3
; SSE-NEXT:    movdqu (%rdi,%rcx,4), %xmm8
; SSE-NEXT:    movdqa %xmm8, %xmm5
; SSE-NEXT:    pslld %xmm14, %xmm5
; SSE-NEXT:    pslld %xmm15, %xmm8
; SSE-NEXT:    movdqa %xmm7, %xmm0
; SSE-NEXT:    blendvps %xmm0, %xmm5, %xmm8
; SSE-NEXT:    movdqu 48(%rdi,%rcx,4), %xmm12
; SSE-NEXT:    movdqa %xmm12, %xmm5
; SSE-NEXT:    pslld %xmm14, %xmm5
; SSE-NEXT:    pslld %xmm15, %xmm12
; SSE-NEXT:    movdqa %xmm6, %xmm0
; SSE-NEXT:    blendvps %xmm0, %xmm5, %xmm12
; SSE-NEXT:    movdqu 32(%rdi,%rcx,4), %xmm6
; SSE-NEXT:    movdqa %xmm6, %xmm5
; SSE-NEXT:    pslld %xmm14, %xmm5
; SSE-NEXT:    pslld %xmm15, %xmm6
; SSE-NEXT:    movdqa %xmm13, %xmm0
; SSE-NEXT:    blendvps %xmm0, %xmm5, %xmm6
; SSE-NEXT:    movdqu 80(%rdi,%rcx,4), %xmm1
; SSE-NEXT:    movdqa %xmm1, %xmm5
; SSE-NEXT:    pslld %xmm14, %xmm5
; SSE-NEXT:    pslld %xmm15, %xmm1
; SSE-NEXT:    movdqa %xmm2, %xmm0
; SSE-NEXT:    blendvps %xmm0, %xmm5, %xmm1
; SSE-NEXT:    movdqu 64(%rdi,%rcx,4), %xmm5
; SSE-NEXT:    movdqa %xmm5, %xmm2
; SSE-NEXT:    pslld %xmm14, %xmm2
; SSE-NEXT:    pslld %xmm15, %xmm5
; SSE-NEXT:    movdqa %xmm11, %xmm0
; SSE-NEXT:    blendvps %xmm0, %xmm2, %xmm5
; SSE-NEXT:    movdqu 112(%rdi,%rcx,4), %xmm2
; SSE-NEXT:    movdqa %xmm2, %xmm4
; SSE-NEXT:    pslld %xmm14, %xmm4
; SSE-NEXT:    pslld %xmm15, %xmm2
; SSE-NEXT:    movdqa %xmm10, %xmm0
; SSE-NEXT:    blendvps %xmm0, %xmm4, %xmm2
; SSE-NEXT:    movdqu 96(%rdi,%rcx,4), %xmm4
; SSE-NEXT:    movdqa %xmm4, %xmm7
; SSE-NEXT:    pslld %xmm14, %xmm7
; SSE-NEXT:    pslld %xmm15, %xmm4
; SSE-NEXT:    movdqa %xmm9, %xmm0
; SSE-NEXT:    blendvps %xmm0, %xmm7, %xmm4
; SSE-NEXT:    movups %xmm8, (%rdi,%rcx,4)
; SSE-NEXT:    movups %xmm3, 16(%rdi,%rcx,4)
; SSE-NEXT:    movups %xmm6, 32(%rdi,%rcx,4)
; SSE-NEXT:    movups %xmm12, 48(%rdi,%rcx,4)
; SSE-NEXT:    movups %xmm5, 64(%rdi,%rcx,4)
; SSE-NEXT:    movups %xmm1, 80(%rdi,%rcx,4)
; SSE-NEXT:    movups %xmm4, 96(%rdi,%rcx,4)
; SSE-NEXT:    movups %xmm2, 112(%rdi,%rcx,4)
; SSE-NEXT:    addq $32, %rcx
; SSE-NEXT:    cmpq %rcx, %rdx
; SSE-NEXT:    jne .LBB0_4
; SSE-NEXT:  # %bb.5: # %middle.block
; SSE-NEXT:    cmpq %rax, %rdx
; SSE-NEXT:    jne .LBB0_6
; SSE-NEXT:  .LBB0_9: # %for.cond.cleanup
; SSE-NEXT:    retq
; SSE-NEXT:    .p2align 4, 0x90
; SSE-NEXT:  .LBB0_8: # %for.body
; SSE-NEXT:    # in Loop: Header=BB0_6 Depth=1
; SSE-NEXT:    # kill: def $cl killed $cl killed $ecx
; SSE-NEXT:    shll %cl, (%rdi,%rdx,4)
; SSE-NEXT:    incq %rdx
; SSE-NEXT:    cmpq %rdx, %rax
; SSE-NEXT:    je .LBB0_9
; SSE-NEXT:  .LBB0_6: # %for.body
; SSE-NEXT:    # =>This Inner Loop Header: Depth=1
; SSE-NEXT:    cmpb $0, (%rsi,%rdx)
; SSE-NEXT:    movl %r9d, %ecx
; SSE-NEXT:    je .LBB0_8
; SSE-NEXT:  # %bb.7: # %for.body
; SSE-NEXT:    # in Loop: Header=BB0_6 Depth=1
; SSE-NEXT:    movl %r8d, %ecx
; SSE-NEXT:    jmp .LBB0_8
;
; AVX1-LABEL: vector_variable_shift_left_loop:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    testl %edx, %edx
; AVX1-NEXT:    jle .LBB0_9
; AVX1-NEXT:  # %bb.1: # %for.body.preheader
; AVX1-NEXT:    movl %ecx, %r9d
; AVX1-NEXT:    movl %edx, %eax
; AVX1-NEXT:    cmpl $31, %edx
; AVX1-NEXT:    ja .LBB0_3
; AVX1-NEXT:  # %bb.2:
; AVX1-NEXT:    xorl %edx, %edx
; AVX1-NEXT:    jmp .LBB0_6
; AVX1-NEXT:  .LBB0_3: # %vector.ph
; AVX1-NEXT:    movl %eax, %edx
; AVX1-NEXT:    andl $-32, %edx
; AVX1-NEXT:    vmovd %r9d, %xmm0
; AVX1-NEXT:    vmovd %r8d, %xmm1
; AVX1-NEXT:    xorl %ecx, %ecx
; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero
; AVX1-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero
; AVX1-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm13 = xmm0[0],zero,xmm0[1],zero
; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm14 = xmm1[0],zero,xmm1[1],zero
; AVX1-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm15 = xmm0[0],zero,xmm0[1],zero
; AVX1-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm11 = xmm1[0],zero,xmm1[1],zero
; AVX1-NEXT:    .p2align 4, 0x90
; AVX1-NEXT:  .LBB0_4: # %vector.body
; AVX1-NEXT:    # =>This Inner Loop Header: Depth=1
; AVX1-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX1-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX1-NEXT:    vmovq {{.*#+}} xmm3 = mem[0],zero
; AVX1-NEXT:    vmovq {{.*#+}} xmm4 = mem[0],zero
; AVX1-NEXT:    vpxor %xmm12, %xmm12, %xmm12
; AVX1-NEXT:    vpcmpeqb %xmm1, %xmm12, %xmm1
; AVX1-NEXT:    vpmovsxbd %xmm1, %xmm5
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
; AVX1-NEXT:    vpmovsxbd %xmm1, %xmm1
; AVX1-NEXT:    vpcmpeqb %xmm2, %xmm12, %xmm2
; AVX1-NEXT:    vpmovsxbd %xmm2, %xmm6
; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[1,1,1,1]
; AVX1-NEXT:    vpmovsxbd %xmm2, %xmm2
; AVX1-NEXT:    vpcmpeqb %xmm3, %xmm12, %xmm3
; AVX1-NEXT:    vpmovzxdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm7 = mem[0],zero,mem[1],zero
; AVX1-NEXT:    vmovdqu (%rdi,%rcx,4), %xmm8
; AVX1-NEXT:    vpslld %xmm7, %xmm8, %xmm9
; AVX1-NEXT:    vpmovzxdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm10 = mem[0],zero,mem[1],zero
; AVX1-NEXT:    vpslld %xmm10, %xmm8, %xmm0
; AVX1-NEXT:    vblendvps %xmm5, %xmm9, %xmm0, %xmm8
; AVX1-NEXT:    vpmovsxbd %xmm3, %xmm5
; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[1,1,1,1]
; AVX1-NEXT:    vpmovsxbd %xmm3, %xmm3
; AVX1-NEXT:    vpcmpeqb %xmm4, %xmm12, %xmm4
; AVX1-NEXT:    vmovdqu 16(%rdi,%rcx,4), %xmm0
; AVX1-NEXT:    vpslld %xmm7, %xmm0, %xmm7
; AVX1-NEXT:    vpslld %xmm10, %xmm0, %xmm0
; AVX1-NEXT:    vpmovsxbd %xmm4, %xmm9
; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[1,1,1,1]
; AVX1-NEXT:    vpmovsxbd %xmm4, %xmm12
; AVX1-NEXT:    vblendvps %xmm1, %xmm7, %xmm0, %xmm10
; AVX1-NEXT:    vmovdqu 32(%rdi,%rcx,4), %xmm1
; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-NEXT:    vpslld %xmm0, %xmm1, %xmm7
; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
; AVX1-NEXT:    vpslld %xmm4, %xmm1, %xmm1
; AVX1-NEXT:    vblendvps %xmm6, %xmm7, %xmm1, %xmm1
; AVX1-NEXT:    vmovdqu 48(%rdi,%rcx,4), %xmm6
; AVX1-NEXT:    vpslld %xmm0, %xmm6, %xmm7
; AVX1-NEXT:    vpslld %xmm4, %xmm6, %xmm6
; AVX1-NEXT:    vblendvps %xmm2, %xmm7, %xmm6, %xmm2
; AVX1-NEXT:    vmovdqu 64(%rdi,%rcx,4), %xmm6
; AVX1-NEXT:    vpslld %xmm13, %xmm6, %xmm7
; AVX1-NEXT:    vpslld %xmm14, %xmm6, %xmm6
; AVX1-NEXT:    vblendvps %xmm5, %xmm7, %xmm6, %xmm5
; AVX1-NEXT:    vmovdqu 80(%rdi,%rcx,4), %xmm6
; AVX1-NEXT:    vpslld %xmm13, %xmm6, %xmm7
; AVX1-NEXT:    vpslld %xmm14, %xmm6, %xmm6
; AVX1-NEXT:    vblendvps %xmm3, %xmm7, %xmm6, %xmm3
; AVX1-NEXT:    vmovdqu 96(%rdi,%rcx,4), %xmm6
; AVX1-NEXT:    vpslld %xmm15, %xmm6, %xmm7
; AVX1-NEXT:    vpslld %xmm11, %xmm6, %xmm6
; AVX1-NEXT:    vblendvps %xmm9, %xmm7, %xmm6, %xmm6
; AVX1-NEXT:    vmovdqu 112(%rdi,%rcx,4), %xmm7
; AVX1-NEXT:    vpslld %xmm15, %xmm7, %xmm0
; AVX1-NEXT:    vpslld %xmm11, %xmm7, %xmm7
; AVX1-NEXT:    vblendvps %xmm12, %xmm0, %xmm7, %xmm0
; AVX1-NEXT:    vmovups %xmm8, (%rdi,%rcx,4)
; AVX1-NEXT:    vmovups %xmm10, 16(%rdi,%rcx,4)
; AVX1-NEXT:    vmovups %xmm1, 32(%rdi,%rcx,4)
; AVX1-NEXT:    vmovups %xmm2, 48(%rdi,%rcx,4)
; AVX1-NEXT:    vmovups %xmm5, 64(%rdi,%rcx,4)
; AVX1-NEXT:    vmovups %xmm3, 80(%rdi,%rcx,4)
; AVX1-NEXT:    vmovups %xmm6, 96(%rdi,%rcx,4)
; AVX1-NEXT:    vmovups %xmm0, 112(%rdi,%rcx,4)
; AVX1-NEXT:    addq $32, %rcx
; AVX1-NEXT:    cmpq %rcx, %rdx
; AVX1-NEXT:    jne .LBB0_4
; AVX1-NEXT:  # %bb.5: # %middle.block
; AVX1-NEXT:    cmpq %rax, %rdx
; AVX1-NEXT:    jne .LBB0_6
; AVX1-NEXT:  .LBB0_9: # %for.cond.cleanup
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
; AVX1-NEXT:    .p2align 4, 0x90
; AVX1-NEXT:  .LBB0_8: # %for.body
; AVX1-NEXT:    # in Loop: Header=BB0_6 Depth=1
; AVX1-NEXT:    # kill: def $cl killed $cl killed $ecx
; AVX1-NEXT:    shll %cl, (%rdi,%rdx,4)
; AVX1-NEXT:    incq %rdx
; AVX1-NEXT:    cmpq %rdx, %rax
; AVX1-NEXT:    je .LBB0_9
; AVX1-NEXT:  .LBB0_6: # %for.body
; AVX1-NEXT:    # =>This Inner Loop Header: Depth=1
; AVX1-NEXT:    cmpb $0, (%rsi,%rdx)
; AVX1-NEXT:    movl %r9d, %ecx
; AVX1-NEXT:    je .LBB0_8
; AVX1-NEXT:  # %bb.7: # %for.body
; AVX1-NEXT:    # in Loop: Header=BB0_6 Depth=1
; AVX1-NEXT:    movl %r8d, %ecx
; AVX1-NEXT:    jmp .LBB0_8
;
; AVX2-LABEL: vector_variable_shift_left_loop:
; AVX2:       # %bb.0: # %entry
; AVX2-NEXT:    testl %edx, %edx
; AVX2-NEXT:    jle .LBB0_9
; AVX2-NEXT:  # %bb.1: # %for.body.preheader
; AVX2-NEXT:    movl %ecx, %r9d
; AVX2-NEXT:    movl %edx, %eax
; AVX2-NEXT:    cmpl $31, %edx
; AVX2-NEXT:    ja .LBB0_3
; AVX2-NEXT:  # %bb.2:
; AVX2-NEXT:    xorl %edx, %edx
; AVX2-NEXT:    jmp .LBB0_6
; AVX2-NEXT:  .LBB0_3: # %vector.ph
; AVX2-NEXT:    movl %eax, %edx
; AVX2-NEXT:    andl $-32, %edx
; AVX2-NEXT:    vmovd %r9d, %xmm0
; AVX2-NEXT:    vpbroadcastd %xmm0, %ymm0
; AVX2-NEXT:    vmovd %r8d, %xmm1
; AVX2-NEXT:    vpbroadcastd %xmm1, %ymm1
; AVX2-NEXT:    xorl %ecx, %ecx
; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT:    .p2align 4, 0x90
; AVX2-NEXT:  .LBB0_4: # %vector.body
; AVX2-NEXT:    # =>This Inner Loop Header: Depth=1
; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; AVX2-NEXT:    vpcmpeqd %ymm2, %ymm3, %ymm3
; AVX2-NEXT:    vblendvps %ymm3, %ymm0, %ymm1, %ymm3
; AVX2-NEXT:    vpcmpeqd %ymm2, %ymm4, %ymm4
; AVX2-NEXT:    vblendvps %ymm4, %ymm0, %ymm1, %ymm4
; AVX2-NEXT:    vpcmpeqd %ymm2, %ymm5, %ymm5
; AVX2-NEXT:    vblendvps %ymm5, %ymm0, %ymm1, %ymm5
; AVX2-NEXT:    vpcmpeqd %ymm2, %ymm6, %ymm6
; AVX2-NEXT:    vblendvps %ymm6, %ymm0, %ymm1, %ymm6
; AVX2-NEXT:    vmovdqu (%rdi,%rcx,4), %ymm7
; AVX2-NEXT:    vpsllvd %ymm3, %ymm7, %ymm3
; AVX2-NEXT:    vmovdqu 32(%rdi,%rcx,4), %ymm7
; AVX2-NEXT:    vpsllvd %ymm4, %ymm7, %ymm4
; AVX2-NEXT:    vmovdqu 64(%rdi,%rcx,4), %ymm7
; AVX2-NEXT:    vpsllvd %ymm5, %ymm7, %ymm5
; AVX2-NEXT:    vmovdqu 96(%rdi,%rcx,4), %ymm7
; AVX2-NEXT:    vpsllvd %ymm6, %ymm7, %ymm6
; AVX2-NEXT:    vmovdqu %ymm3, (%rdi,%rcx,4)
; AVX2-NEXT:    vmovdqu %ymm4, 32(%rdi,%rcx,4)
; AVX2-NEXT:    vmovdqu %ymm5, 64(%rdi,%rcx,4)
; AVX2-NEXT:    vmovdqu %ymm6, 96(%rdi,%rcx,4)
; AVX2-NEXT:    addq $32, %rcx
; AVX2-NEXT:    cmpq %rcx, %rdx
; AVX2-NEXT:    jne .LBB0_4
; AVX2-NEXT:  # %bb.5: # %middle.block
; AVX2-NEXT:    cmpq %rax, %rdx
; AVX2-NEXT:    jne .LBB0_6
; AVX2-NEXT:  .LBB0_9: # %for.cond.cleanup
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
; AVX2-NEXT:    .p2align 4, 0x90
; AVX2-NEXT:  .LBB0_8: # %for.body
; AVX2-NEXT:    # in Loop: Header=BB0_6 Depth=1
; AVX2-NEXT:    # kill: def $cl killed $cl killed $ecx
; AVX2-NEXT:    shll %cl, (%rdi,%rdx,4)
; AVX2-NEXT:    incq %rdx
; AVX2-NEXT:    cmpq %rdx, %rax
; AVX2-NEXT:    je .LBB0_9
; AVX2-NEXT:  .LBB0_6: # %for.body
; AVX2-NEXT:    # =>This Inner Loop Header: Depth=1
; AVX2-NEXT:    cmpb $0, (%rsi,%rdx)
; AVX2-NEXT:    movl %r9d, %ecx
; AVX2-NEXT:    je .LBB0_8
; AVX2-NEXT:  # %bb.7: # %for.body
; AVX2-NEXT:    # in Loop: Header=BB0_6 Depth=1
; AVX2-NEXT:    movl %r8d, %ecx
; AVX2-NEXT:    jmp .LBB0_8
;
; XOP-LABEL: vector_variable_shift_left_loop:
; XOP:       # %bb.0: # %entry
; XOP-NEXT:    testl %edx, %edx
; XOP-NEXT:    jle .LBB0_9
; XOP-NEXT:  # %bb.1: # %for.body.preheader
; XOP-NEXT:    movl %ecx, %r9d
; XOP-NEXT:    movl %edx, %eax
; XOP-NEXT:    cmpl $31, %edx
; XOP-NEXT:    ja .LBB0_3
; XOP-NEXT:  # %bb.2:
; XOP-NEXT:    xorl %edx, %edx
; XOP-NEXT:    jmp .LBB0_6
; XOP-NEXT:  .LBB0_3: # %vector.ph
; XOP-NEXT:    movl %eax, %edx
; XOP-NEXT:    andl $-32, %edx
; XOP-NEXT:    vmovd %r9d, %xmm0
; XOP-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; XOP-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm9
; XOP-NEXT:    vmovd %r8d, %xmm1
; XOP-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; XOP-NEXT:    vinsertf128 $1, %xmm1, %ymm1, %ymm14
; XOP-NEXT:    xorl %ecx, %ecx
; XOP-NEXT:    vpxor %xmm8, %xmm8, %xmm8
; XOP-NEXT:    vextractf128 $1, %ymm9, %xmm15
; XOP-NEXT:    vextractf128 $1, %ymm14, %xmm4
; XOP-NEXT:    .p2align 4, 0x90
; XOP-NEXT:  .LBB0_4: # %vector.body
; XOP-NEXT:    # =>This Inner Loop Header: Depth=1
; XOP-NEXT:    vmovq {{.*#+}} xmm5 = mem[0],zero
; XOP-NEXT:    vmovq {{.*#+}} xmm6 = mem[0],zero
; XOP-NEXT:    vmovq {{.*#+}} xmm7 = mem[0],zero
; XOP-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
; XOP-NEXT:    vpcomeqb %xmm8, %xmm5, %xmm5
; XOP-NEXT:    vpmovsxbd %xmm5, %xmm0
; XOP-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[1,1,1,1]
; XOP-NEXT:    vpmovsxbd %xmm5, %xmm5
; XOP-NEXT:    vpcomeqb %xmm8, %xmm6, %xmm6
; XOP-NEXT:    vpmovsxbd %xmm6, %xmm10
; XOP-NEXT:    vpshufd {{.*#+}} xmm6 = xmm6[1,1,1,1]
; XOP-NEXT:    vpmovsxbd %xmm6, %xmm6
; XOP-NEXT:    vpcomeqb %xmm8, %xmm7, %xmm7
; XOP-NEXT:    vpmovsxbd %xmm7, %xmm11
; XOP-NEXT:    vpshufd {{.*#+}} xmm7 = xmm7[1,1,1,1]
; XOP-NEXT:    vpmovsxbd %xmm7, %xmm7
; XOP-NEXT:    vpcomeqb %xmm8, %xmm2, %xmm2
; XOP-NEXT:    vpmovsxbd %xmm2, %xmm12
; XOP-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[1,1,1,1]
; XOP-NEXT:    vpmovsxbd %xmm2, %xmm2
; XOP-NEXT:    vblendvps %xmm5, %xmm15, %xmm4, %xmm5
; XOP-NEXT:    vpshld %xmm5, 16(%rdi,%rcx,4), %xmm13
; XOP-NEXT:    vblendvps %xmm0, %xmm9, %xmm14, %xmm0
; XOP-NEXT:    vpshld %xmm0, (%rdi,%rcx,4), %xmm0
; XOP-NEXT:    vblendvps %xmm6, %xmm15, %xmm4, %xmm6
; XOP-NEXT:    vpshld %xmm6, 48(%rdi,%rcx,4), %xmm6
; XOP-NEXT:    vblendvps %xmm10, %xmm9, %xmm14, %xmm5
; XOP-NEXT:    vpshld %xmm5, 32(%rdi,%rcx,4), %xmm5
; XOP-NEXT:    vblendvps %xmm7, %xmm15, %xmm4, %xmm7
; XOP-NEXT:    vpshld %xmm7, 80(%rdi,%rcx,4), %xmm7
; XOP-NEXT:    vblendvps %xmm11, %xmm9, %xmm14, %xmm1
; XOP-NEXT:    vpshld %xmm1, 64(%rdi,%rcx,4), %xmm1
; XOP-NEXT:    vblendvps %xmm2, %xmm15, %xmm4, %xmm2
; XOP-NEXT:    vpshld %xmm2, 112(%rdi,%rcx,4), %xmm2
; XOP-NEXT:    vblendvps %xmm12, %xmm9, %xmm14, %xmm3
; XOP-NEXT:    vpshld %xmm3, 96(%rdi,%rcx,4), %xmm3
; XOP-NEXT:    vmovdqu %xmm0, (%rdi,%rcx,4)
; XOP-NEXT:    vmovdqu %xmm13, 16(%rdi,%rcx,4)
; XOP-NEXT:    vmovdqu %xmm5, 32(%rdi,%rcx,4)
; XOP-NEXT:    vmovdqu %xmm6, 48(%rdi,%rcx,4)
; XOP-NEXT:    vmovdqu %xmm1, 64(%rdi,%rcx,4)
; XOP-NEXT:    vmovdqu %xmm7, 80(%rdi,%rcx,4)
; XOP-NEXT:    vmovdqu %xmm3, 96(%rdi,%rcx,4)
; XOP-NEXT:    vmovdqu %xmm2, 112(%rdi,%rcx,4)
; XOP-NEXT:    addq $32, %rcx
; XOP-NEXT:    cmpq %rcx, %rdx
; XOP-NEXT:    jne .LBB0_4
; XOP-NEXT:  # %bb.5: # %middle.block
; XOP-NEXT:    cmpq %rax, %rdx
; XOP-NEXT:    jne .LBB0_6
; XOP-NEXT:  .LBB0_9: # %for.cond.cleanup
; XOP-NEXT:    vzeroupper
; XOP-NEXT:    retq
; XOP-NEXT:    .p2align 4, 0x90
; XOP-NEXT:  .LBB0_8: # %for.body
; XOP-NEXT:    # in Loop: Header=BB0_6 Depth=1
; XOP-NEXT:    # kill: def $cl killed $cl killed $ecx
; XOP-NEXT:    shll %cl, (%rdi,%rdx,4)
; XOP-NEXT:    incq %rdx
; XOP-NEXT:    cmpq %rdx, %rax
; XOP-NEXT:    je .LBB0_9
; XOP-NEXT:  .LBB0_6: # %for.body
; XOP-NEXT:    # =>This Inner Loop Header: Depth=1
; XOP-NEXT:    cmpb $0, (%rsi,%rdx)
; XOP-NEXT:    movl %r9d, %ecx
; XOP-NEXT:    je .LBB0_8
; XOP-NEXT:  # %bb.7: # %for.body
; XOP-NEXT:    # in Loop: Header=BB0_6 Depth=1
; XOP-NEXT:    movl %r8d, %ecx
; XOP-NEXT:    jmp .LBB0_8
entry:
  %cmp12 = icmp sgt i32 %count, 0
  br i1 %cmp12, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader:
  %wide.trip.count = zext i32 %count to i64
  %min.iters.check = icmp ult i32 %count, 32
  br i1 %min.iters.check, label %for.body.preheader40, label %vector.ph

for.body.preheader40:
  %indvars.iv.ph = phi i64 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
  br label %for.body

vector.ph:
  %n.vec = and i64 %wide.trip.count, 4294967264
  %broadcast.splatinsert20 = insertelement <8 x i32> undef, i32 %amt0, i32 0
  %broadcast.splat21 = shufflevector <8 x i32> %broadcast.splatinsert20, <8 x i32> undef, <8 x i32> zeroinitializer
  %broadcast.splatinsert22 = insertelement <8 x i32> undef, i32 %amt1, i32 0
  %broadcast.splat23 = shufflevector <8 x i32> %broadcast.splatinsert22, <8 x i32> undef, <8 x i32> zeroinitializer
  %broadcast.splatinsert24 = insertelement <8 x i32> undef, i32 %amt0, i32 0
  %broadcast.splat25 = shufflevector <8 x i32> %broadcast.splatinsert24, <8 x i32> undef, <8 x i32> zeroinitializer
  %broadcast.splatinsert26 = insertelement <8 x i32> undef, i32 %amt1, i32 0
  %broadcast.splat27 = shufflevector <8 x i32> %broadcast.splatinsert26, <8 x i32> undef, <8 x i32> zeroinitializer
  %broadcast.splatinsert28 = insertelement <8 x i32> undef, i32 %amt0, i32 0
  %broadcast.splat29 = shufflevector <8 x i32> %broadcast.splatinsert28, <8 x i32> undef, <8 x i32> zeroinitializer
  %broadcast.splatinsert30 = insertelement <8 x i32> undef, i32 %amt1, i32 0
  %broadcast.splat31 = shufflevector <8 x i32> %broadcast.splatinsert30, <8 x i32> undef, <8 x i32> zeroinitializer
  %broadcast.splatinsert32 = insertelement <8 x i32> undef, i32 %amt0, i32 0
  %broadcast.splat33 = shufflevector <8 x i32> %broadcast.splatinsert32, <8 x i32> undef, <8 x i32> zeroinitializer
  %broadcast.splatinsert34 = insertelement <8 x i32> undef, i32 %amt1, i32 0
  %broadcast.splat35 = shufflevector <8 x i32> %broadcast.splatinsert34, <8 x i32> undef, <8 x i32> zeroinitializer
  br label %vector.body

vector.body:
  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds i8, i8* %control, i64 %index
  %1 = bitcast i8* %0 to <8 x i8>*
  %wide.load = load <8 x i8>, <8 x i8>* %1, align 1
  %2 = getelementptr inbounds i8, i8* %0, i64 8
  %3 = bitcast i8* %2 to <8 x i8>*
  %wide.load17 = load <8 x i8>, <8 x i8>* %3, align 1
  %4 = getelementptr inbounds i8, i8* %0, i64 16
  %5 = bitcast i8* %4 to <8 x i8>*
  %wide.load18 = load <8 x i8>, <8 x i8>* %5, align 1
  %6 = getelementptr inbounds i8, i8* %0, i64 24
  %7 = bitcast i8* %6 to <8 x i8>*
  %wide.load19 = load <8 x i8>, <8 x i8>* %7, align 1
  %8 = icmp eq <8 x i8> %wide.load, zeroinitializer
  %9 = icmp eq <8 x i8> %wide.load17, zeroinitializer
  %10 = icmp eq <8 x i8> %wide.load18, zeroinitializer
  %11 = icmp eq <8 x i8> %wide.load19, zeroinitializer
  %12 = select <8 x i1> %8, <8 x i32> %broadcast.splat21, <8 x i32> %broadcast.splat23
  %13 = select <8 x i1> %9, <8 x i32> %broadcast.splat25, <8 x i32> %broadcast.splat27
  %14 = select <8 x i1> %10, <8 x i32> %broadcast.splat29, <8 x i32> %broadcast.splat31
  %15 = select <8 x i1> %11, <8 x i32> %broadcast.splat33, <8 x i32> %broadcast.splat35
  %16 = getelementptr inbounds i32, i32* %arr, i64 %index
  %17 = bitcast i32* %16 to <8 x i32>*
  %wide.load36 = load <8 x i32>, <8 x i32>* %17, align 4
  %18 = getelementptr inbounds i32, i32* %16, i64 8
  %19 = bitcast i32* %18 to <8 x i32>*
  %wide.load37 = load <8 x i32>, <8 x i32>* %19, align 4
  %20 = getelementptr inbounds i32, i32* %16, i64 16
  %21 = bitcast i32* %20 to <8 x i32>*
  %wide.load38 = load <8 x i32>, <8 x i32>* %21, align 4
  %22 = getelementptr inbounds i32, i32* %16, i64 24
  %23 = bitcast i32* %22 to <8 x i32>*
  %wide.load39 = load <8 x i32>, <8 x i32>* %23, align 4
  %24 = shl <8 x i32> %wide.load36, %12
  %25 = shl <8 x i32> %wide.load37, %13
  %26 = shl <8 x i32> %wide.load38, %14
  %27 = shl <8 x i32> %wide.load39, %15
  %28 = bitcast i32* %16 to <8 x i32>*
  store <8 x i32> %24, <8 x i32>* %28, align 4
  %29 = bitcast i32* %18 to <8 x i32>*
  store <8 x i32> %25, <8 x i32>* %29, align 4
  %30 = bitcast i32* %20 to <8 x i32>*
  store <8 x i32> %26, <8 x i32>* %30, align 4
  %31 = bitcast i32* %22 to <8 x i32>*
  store <8 x i32> %27, <8 x i32>* %31, align 4
  %index.next = add i64 %index, 32
  %32 = icmp eq i64 %index.next, %n.vec
  br i1 %32, label %middle.block, label %vector.body

middle.block:
  %cmp.n = icmp eq i64 %n.vec, %wide.trip.count
  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader40

for.cond.cleanup:
  ret void

for.body:
  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ %indvars.iv.ph, %for.body.preheader40 ]
  %arrayidx = getelementptr inbounds i8, i8* %control, i64 %indvars.iv
  %33 = load i8, i8* %arrayidx, align 1
  %tobool = icmp eq i8 %33, 0
  %cond = select i1 %tobool, i32 %amt0, i32 %amt1
  %arrayidx2 = getelementptr inbounds i32, i32* %arr, i64 %indvars.iv
  %34 = load i32, i32* %arrayidx2, align 4
  %shl = shl i32 %34, %cond
  store i32 %shl, i32* %arrayidx2, align 4
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
  br i1 %exitcond, label %for.cond.cleanup, label %for.body
}

define void @vector_variable_shift_left_loop_simpler(i32* nocapture %arr, i8* nocapture readonly %control, i32 %count, i32 %amt0, i32 %amt1, i32 %x) nounwind {
; SSE-LABEL: vector_variable_shift_left_loop_simpler:
; SSE:       # %bb.0: # %entry
; SSE-NEXT:    testl %edx, %edx
; SSE-NEXT:    jle .LBB1_3
; SSE-NEXT:  # %bb.1: # %vector.ph
; SSE-NEXT:    movl %edx, %eax
; SSE-NEXT:    andl $-4, %eax
; SSE-NEXT:    movd %ecx, %xmm0
; SSE-NEXT:    movd %r8d, %xmm3
; SSE-NEXT:    movd %r9d, %xmm1
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; SSE-NEXT:    xorl %ecx, %ecx
; SSE-NEXT:    pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; SSE-NEXT:    movdqa %xmm1, %xmm2
; SSE-NEXT:    pslld %xmm0, %xmm2
; SSE-NEXT:    pmovzxdq {{.*#+}} xmm0 = xmm3[0],zero,xmm3[1],zero
; SSE-NEXT:    pslld %xmm0, %xmm1
; SSE-NEXT:    pxor %xmm3, %xmm3
; SSE-NEXT:    .p2align 4, 0x90
; SSE-NEXT:  .LBB1_2: # %vector.body
; SSE-NEXT:    # =>This Inner Loop Header: Depth=1
; SSE-NEXT:    pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; SSE-NEXT:    pcmpeqd %xmm3, %xmm0
; SSE-NEXT:    movdqa %xmm1, %xmm4
; SSE-NEXT:    blendvps %xmm0, %xmm2, %xmm4
; SSE-NEXT:    movups %xmm4, (%rdi,%rcx,4)
; SSE-NEXT:    addq $4, %rcx
; SSE-NEXT:    cmpq %rcx, %rax
; SSE-NEXT:    jne .LBB1_2
; SSE-NEXT:  .LBB1_3: # %exit
; SSE-NEXT:    retq
;
; AVX1-LABEL: vector_variable_shift_left_loop_simpler:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    testl %edx, %edx
; AVX1-NEXT:    jle .LBB1_3
; AVX1-NEXT:  # %bb.1: # %vector.ph
; AVX1-NEXT:    movl %edx, %eax
; AVX1-NEXT:    andl $-4, %eax
; AVX1-NEXT:    vmovd %ecx, %xmm0
; AVX1-NEXT:    vmovd %r8d, %xmm1
; AVX1-NEXT:    vmovd %r9d, %xmm2
; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
; AVX1-NEXT:    xorl %ecx, %ecx
; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX1-NEXT:    vpslld %xmm0, %xmm2, %xmm0
; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; AVX1-NEXT:    vpslld %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    .p2align 4, 0x90
; AVX1-NEXT:  .LBB1_2: # %vector.body
; AVX1-NEXT:    # =>This Inner Loop Header: Depth=1
; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm3, %xmm3
; AVX1-NEXT:    vblendvps %xmm3, %xmm0, %xmm1, %xmm3
; AVX1-NEXT:    vmovups %xmm3, (%rdi,%rcx,4)
; AVX1-NEXT:    addq $4, %rcx
; AVX1-NEXT:    cmpq %rcx, %rax
; AVX1-NEXT:    jne .LBB1_2
; AVX1-NEXT:  .LBB1_3: # %exit
; AVX1-NEXT:    retq
;
; AVX2-LABEL: vector_variable_shift_left_loop_simpler:
; AVX2:       # %bb.0: # %entry
; AVX2-NEXT:    testl %edx, %edx
; AVX2-NEXT:    jle .LBB1_3
; AVX2-NEXT:  # %bb.1: # %vector.ph
; AVX2-NEXT:    movl %edx, %eax
; AVX2-NEXT:    andl $-4, %eax
; AVX2-NEXT:    vmovd %ecx, %xmm0
; AVX2-NEXT:    vpbroadcastd %xmm0, %xmm0
; AVX2-NEXT:    vmovd %r8d, %xmm1
; AVX2-NEXT:    vpbroadcastd %xmm1, %xmm1
; AVX2-NEXT:    vmovd %r9d, %xmm2
; AVX2-NEXT:    vpbroadcastd %xmm2, %xmm2
; AVX2-NEXT:    xorl %ecx, %ecx
; AVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; AVX2-NEXT:    .p2align 4, 0x90
; AVX2-NEXT:  .LBB1_2: # %vector.body
; AVX2-NEXT:    # =>This Inner Loop Header: Depth=1
; AVX2-NEXT:    vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX2-NEXT:    vpcmpeqd %xmm3, %xmm4, %xmm4
; AVX2-NEXT:    vblendvps %xmm4, %xmm0, %xmm1, %xmm4
; AVX2-NEXT:    vpsllvd %xmm4, %xmm2, %xmm4
; AVX2-NEXT:    vmovdqu %xmm4, (%rdi,%rcx,4)
; AVX2-NEXT:    addq $4, %rcx
; AVX2-NEXT:    cmpq %rcx, %rax
; AVX2-NEXT:    jne .LBB1_2
; AVX2-NEXT:  .LBB1_3: # %exit
; AVX2-NEXT:    retq
;
; XOP-LABEL: vector_variable_shift_left_loop_simpler:
; XOP:       # %bb.0: # %entry
; XOP-NEXT:    testl %edx, %edx
; XOP-NEXT:    jle .LBB1_3
; XOP-NEXT:  # %bb.1: # %vector.ph
; XOP-NEXT:    movl %edx, %eax
; XOP-NEXT:    andl $-4, %eax
; XOP-NEXT:    vmovd %ecx, %xmm0
; XOP-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; XOP-NEXT:    vmovd %r8d, %xmm1
; XOP-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; XOP-NEXT:    vmovd %r9d, %xmm2
; XOP-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
; XOP-NEXT:    xorl %ecx, %ecx
; XOP-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; XOP-NEXT:    .p2align 4, 0x90
; XOP-NEXT:  .LBB1_2: # %vector.body
; XOP-NEXT:    # =>This Inner Loop Header: Depth=1
; XOP-NEXT:    vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; XOP-NEXT:    vpcomeqd %xmm3, %xmm4, %xmm4
; XOP-NEXT:    vblendvps %xmm4, %xmm0, %xmm1, %xmm4
; XOP-NEXT:    vpshld %xmm4, %xmm2, %xmm4
; XOP-NEXT:    vmovdqu %xmm4, (%rdi,%rcx,4)
; XOP-NEXT:    addq $4, %rcx
; XOP-NEXT:    cmpq %rcx, %rax
; XOP-NEXT:    jne .LBB1_2
; XOP-NEXT:  .LBB1_3: # %exit
; XOP-NEXT:    retq
entry:
  %cmp16 = icmp sgt i32 %count, 0
  %wide.trip.count = zext i32 %count to i64
  br i1 %cmp16, label %vector.ph, label %exit

vector.ph:
  %n.vec = and i64 %wide.trip.count, 4294967292
  %splatinsert18 = insertelement <4 x i32> undef, i32 %amt0, i32 0
  %splat1 = shufflevector <4 x i32> %splatinsert18, <4 x i32> undef, <4 x i32> zeroinitializer
  %splatinsert20 = insertelement <4 x i32> undef, i32 %amt1, i32 0
  %splat2 = shufflevector <4 x i32> %splatinsert20, <4 x i32> undef, <4 x i32> zeroinitializer
  %splatinsert22 = insertelement <4 x i32> undef, i32 %x, i32 0
  %splat3 = shufflevector <4 x i32> %splatinsert22, <4 x i32> undef, <4 x i32> zeroinitializer
  br label %vector.body

vector.body:
  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds i8, i8* %control, i64 %index
  %1 = bitcast i8* %0 to <4 x i8>*
  %wide.load = load <4 x i8>, <4 x i8>* %1, align 1
  %2 = icmp eq <4 x i8> %wide.load, zeroinitializer
  %3 = select <4 x i1> %2, <4 x i32> %splat1, <4 x i32> %splat2
  %4 = shl <4 x i32> %splat3, %3
  %5 = getelementptr inbounds i32, i32* %arr, i64 %index
  %6 = bitcast i32* %5 to <4 x i32>*
  store <4 x i32> %4, <4 x i32>* %6, align 4
  %index.next = add i64 %index, 4
  %7 = icmp eq i64 %index.next, %n.vec
  br i1 %7, label %exit, label %vector.body

exit:
  ret void
}