; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx,slow-unaligned-mem-32 | FileCheck %s
; RUN: llc -O0 < %s -mtriple=x86_64-unknown-unknown -mattr=avx,slow-unaligned-mem-32 | FileCheck %s -check-prefix=CHECK_O0

define void @test_256_load(double* nocapture %d, float* nocapture %f, <4 x i64>* nocapture %i) nounwind {
; CHECK-LABEL: test_256_load:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    pushq %r15
; CHECK-NEXT:    pushq %r14
; CHECK-NEXT:    pushq %rbx
; CHECK-NEXT:    subq $96, %rsp
; CHECK-NEXT:    movq %rdx, %r14
; CHECK-NEXT:    movq %rsi, %r15
; CHECK-NEXT:    movq %rdi, %rbx
; CHECK-NEXT:    vmovaps (%rdi), %ymm0
; CHECK-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) # 32-byte Spill
; CHECK-NEXT:    vmovaps (%rsi), %ymm1
; CHECK-NEXT:    vmovups %ymm1, {{[0-9]+}}(%rsp) # 32-byte Spill
; CHECK-NEXT:    vmovaps (%rdx), %ymm2
; CHECK-NEXT:    vmovups %ymm2, (%rsp) # 32-byte Spill
; CHECK-NEXT:    callq dummy
; CHECK-NEXT:    vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
; CHECK-NEXT:    vmovaps %ymm0, (%rbx)
; CHECK-NEXT:    vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
; CHECK-NEXT:    vmovaps %ymm0, (%r15)
; CHECK-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
; CHECK-NEXT:    vmovaps %ymm0, (%r14)
; CHECK-NEXT:    addq $96, %rsp
; CHECK-NEXT:    popq %rbx
; CHECK-NEXT:    popq %r14
; CHECK-NEXT:    popq %r15
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
;
; CHECK_O0-LABEL: test_256_load:
; CHECK_O0:       # %bb.0: # %entry
; CHECK_O0-NEXT:    subq $152, %rsp
; CHECK_O0-NEXT:    vmovapd (%rdi), %ymm0
; CHECK_O0-NEXT:    vmovaps (%rsi), %ymm1
; CHECK_O0-NEXT:    vmovdqa (%rdx), %ymm2
; CHECK_O0-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) # 32-byte Spill
; CHECK_O0-NEXT:    vmovups %ymm1, {{[0-9]+}}(%rsp) # 32-byte Spill
; CHECK_O0-NEXT:    vmovups %ymm2, {{[0-9]+}}(%rsp) # 32-byte Spill
; CHECK_O0-NEXT:    movq %rsi, {{[0-9]+}}(%rsp) # 8-byte Spill
; CHECK_O0-NEXT:    movq %rdi, {{[0-9]+}}(%rsp) # 8-byte Spill
; CHECK_O0-NEXT:    movq %rdx, {{[0-9]+}}(%rsp) # 8-byte Spill
; CHECK_O0-NEXT:    callq dummy
; CHECK_O0-NEXT:    movq {{[0-9]+}}(%rsp), %rdx # 8-byte Reload
; CHECK_O0-NEXT:    vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
; CHECK_O0-NEXT:    vmovapd %ymm0, (%rdx)
; CHECK_O0-NEXT:    movq {{[0-9]+}}(%rsp), %rsi # 8-byte Reload
; CHECK_O0-NEXT:    vmovups {{[0-9]+}}(%rsp), %ymm1 # 32-byte Reload
; CHECK_O0-NEXT:    vmovaps %ymm1, (%rsi)
; CHECK_O0-NEXT:    movq {{[0-9]+}}(%rsp), %rdi # 8-byte Reload
; CHECK_O0-NEXT:    vmovups {{[0-9]+}}(%rsp), %ymm2 # 32-byte Reload
; CHECK_O0-NEXT:    vmovdqa %ymm2, (%rdi)
; CHECK_O0-NEXT:    addq $152, %rsp
; CHECK_O0-NEXT:    vzeroupper
; CHECK_O0-NEXT:    retq
entry:
  %0 = bitcast double* %d to <4 x double>*
  %tmp1.i = load <4 x double>, <4 x double>* %0, align 32
  %1 = bitcast float* %f to <8 x float>*
  %tmp1.i17 = load <8 x float>, <8 x float>* %1, align 32
  %tmp1.i16 = load <4 x i64>, <4 x i64>* %i, align 32
  tail call void @dummy(<4 x double> %tmp1.i, <8 x float> %tmp1.i17, <4 x i64> %tmp1.i16) nounwind
  store <4 x double> %tmp1.i, <4 x double>* %0, align 32
  store <8 x float> %tmp1.i17, <8 x float>* %1, align 32
  store <4 x i64> %tmp1.i16, <4 x i64>* %i, align 32
  ret void
}

declare void @dummy(<4 x double>, <8 x float>, <4 x i64>)

;;
;; The two tests below check that a load + scalar_to_vector + insert_subvector
;; + zext sequence is folded into a single vmovss, vmovsd, or vinsertps from memory.

define <8 x float> @mov00(<8 x float> %v, float * %ptr) nounwind {
; CHECK-LABEL: mov00:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT:    retq
;
; CHECK_O0-LABEL: mov00:
; CHECK_O0:       # %bb.0:
; CHECK_O0-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK_O0-NEXT:    # implicit-def: $ymm1
; CHECK_O0-NEXT:    vmovaps %xmm0, %xmm1
; CHECK_O0-NEXT:    vmovaps %xmm1, %xmm0
; CHECK_O0-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; CHECK_O0-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
; CHECK_O0-NEXT:    # kill: def $ymm0 killed $xmm0
; CHECK_O0-NEXT:    retq
  %val = load float, float* %ptr
  %i0 = insertelement <8 x float> zeroinitializer, float %val, i32 0
  ret <8 x float> %i0
}

define <4 x double> @mov01(<4 x double> %v, double * %ptr) nounwind {
; CHECK-LABEL: mov01:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT:    retq
;
; CHECK_O0-LABEL: mov01:
; CHECK_O0:       # %bb.0:
; CHECK_O0-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; CHECK_O0-NEXT:    # implicit-def: $ymm1
; CHECK_O0-NEXT:    vmovaps %xmm0, %xmm1
; CHECK_O0-NEXT:    vmovaps %xmm1, %xmm0
; CHECK_O0-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; CHECK_O0-NEXT:    vblendpd {{.*#+}} xmm0 = xmm0[0],xmm2[1]
; CHECK_O0-NEXT:    # kill: def $ymm0 killed $xmm0
; CHECK_O0-NEXT:    retq
  %val = load double, double* %ptr
  %i0 = insertelement <4 x double> zeroinitializer, double %val, i32 0
  ret <4 x double> %i0
}

define void @storev16i16(<16 x i16> %a) nounwind {
; CHECK-LABEL: storev16i16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %ymm0, (%rax)
;
; CHECK_O0-LABEL: storev16i16:
; CHECK_O0:       # %bb.0:
; CHECK_O0-NEXT:    # implicit-def: $rax
; CHECK_O0-NEXT:    vmovdqa %ymm0, (%rax)
  store <16 x i16> %a, <16 x i16>* undef, align 32
  unreachable
}

define void @storev16i16_01(<16 x i16> %a) nounwind {
; CHECK-LABEL: storev16i16_01:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vextractf128 $1, %ymm0, (%rax)
; CHECK-NEXT:    vmovups %xmm0, (%rax)
;
; CHECK_O0-LABEL: storev16i16_01:
; CHECK_O0:       # %bb.0:
; CHECK_O0-NEXT:    # implicit-def: $rax
; CHECK_O0-NEXT:    vmovdqu %ymm0, (%rax)
  store <16 x i16> %a, <16 x i16>* undef, align 4
  unreachable
}

define void @storev32i8(<32 x i8> %a) nounwind {
; CHECK-LABEL: storev32i8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %ymm0, (%rax)
;
; CHECK_O0-LABEL: storev32i8:
; CHECK_O0:       # %bb.0:
; CHECK_O0-NEXT:    # implicit-def: $rax
; CHECK_O0-NEXT:    vmovdqa %ymm0, (%rax)
  store <32 x i8> %a, <32 x i8>* undef, align 32
  unreachable
}

define void @storev32i8_01(<32 x i8> %a) nounwind {
; CHECK-LABEL: storev32i8_01:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vextractf128 $1, %ymm0, (%rax)
; CHECK-NEXT:    vmovups %xmm0, (%rax)
;
; CHECK_O0-LABEL: storev32i8_01:
; CHECK_O0:       # %bb.0:
; CHECK_O0-NEXT:    # implicit-def: $rax
; CHECK_O0-NEXT:    vmovdqu %ymm0, (%rax)
  store <32 x i8> %a, <32 x i8>* undef, align 4
  unreachable
}

; It is faster to do two 16-byte stores if the data is already in xmm
; registers, for example after an integer operation.
define void @double_save(<4 x i32> %A, <4 x i32> %B, <8 x i32>* %P) nounwind ssp {
; CHECK-LABEL: double_save:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, 16(%rdi)
; CHECK-NEXT:    vmovaps %xmm0, (%rdi)
; CHECK-NEXT:    retq
;
; CHECK_O0-LABEL: double_save:
; CHECK_O0:       # %bb.0:
; CHECK_O0-NEXT:    # implicit-def: $ymm2
; CHECK_O0-NEXT:    vmovaps %xmm0, %xmm2
; CHECK_O0-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm2
; CHECK_O0-NEXT:    vmovdqu %ymm2, (%rdi)
; CHECK_O0-NEXT:    vzeroupper
; CHECK_O0-NEXT:    retq
  %Z = shufflevector <4 x i32>%A, <4 x i32>%B, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  store <8 x i32> %Z, <8 x i32>* %P, align 16
  ret void
}

declare void @llvm.x86.avx.maskstore.ps.256(i8*, <8 x i32>, <8 x float>) nounwind

define void @f_f() nounwind {
; CHECK-LABEL: f_f:
; CHECK:       # %bb.0: # %allocas
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    testb %al, %al
; CHECK-NEXT:    jne .LBB8_2
; CHECK-NEXT:  # %bb.1: # %cif_mask_all
; CHECK-NEXT:  .LBB8_2: # %cif_mask_mixed
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    testb %al, %al
; CHECK-NEXT:    jne .LBB8_4
; CHECK-NEXT:  # %bb.3: # %cif_mixed_test_all
; CHECK-NEXT:    movl $-1, %eax
; CHECK-NEXT:    vmovd %eax, %xmm0
; CHECK-NEXT:    vmaskmovps %ymm0, %ymm0, (%rax)
; CHECK-NEXT:  .LBB8_4: # %cif_mixed_test_any_check
;
; CHECK_O0-LABEL: f_f:
; CHECK_O0:       # %bb.0: # %allocas
; CHECK_O0-NEXT:    # implicit-def: $al
; CHECK_O0-NEXT:    testb $1, %al
; CHECK_O0-NEXT:    jne .LBB8_1
; CHECK_O0-NEXT:    jmp .LBB8_2
; CHECK_O0-NEXT:  .LBB8_1: # %cif_mask_all
; CHECK_O0-NEXT:  .LBB8_2: # %cif_mask_mixed
; CHECK_O0-NEXT:    # implicit-def: $al
; CHECK_O0-NEXT:    testb $1, %al
; CHECK_O0-NEXT:    jne .LBB8_3
; CHECK_O0-NEXT:    jmp .LBB8_4
; CHECK_O0-NEXT:  .LBB8_3: # %cif_mixed_test_all
; CHECK_O0-NEXT:    movl $-1, %eax
; CHECK_O0-NEXT:    vmovd %eax, %xmm0
; CHECK_O0-NEXT:    vmovaps %xmm0, %xmm1
; CHECK_O0-NEXT:    # implicit-def: $rcx
; CHECK_O0-NEXT:    # implicit-def: $ymm2
; CHECK_O0-NEXT:    vmaskmovps %ymm2, %ymm1, (%rcx)
; CHECK_O0-NEXT:  .LBB8_4: # %cif_mixed_test_any_check
allocas:
  br i1 undef, label %cif_mask_all, label %cif_mask_mixed

cif_mask_all:
  unreachable

cif_mask_mixed:
  br i1 undef, label %cif_mixed_test_all, label %cif_mixed_test_any_check

cif_mixed_test_all:
  call void @llvm.x86.avx.maskstore.ps.256(i8* undef, <8 x i32> <i32 -1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>, <8 x float> undef) nounwind
  unreachable

cif_mixed_test_any_check:
  unreachable
}

define void @add8i32(<8 x i32>* %ret, <8 x i32>* %bp) nounwind {
; CHECK-LABEL: add8i32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups (%rsi), %xmm0
; CHECK-NEXT:    vmovups 16(%rsi), %xmm1
; CHECK-NEXT:    vmovups %xmm1, 16(%rdi)
; CHECK-NEXT:    vmovups %xmm0, (%rdi)
; CHECK-NEXT:    retq
;
; CHECK_O0-LABEL: add8i32:
; CHECK_O0:       # %bb.0:
; CHECK_O0-NEXT:    vmovdqu (%rsi), %xmm0
; CHECK_O0-NEXT:    vmovdqu 16(%rsi), %xmm1
; CHECK_O0-NEXT:    # implicit-def: $ymm2
; CHECK_O0-NEXT:    vmovaps %xmm0, %xmm2
; CHECK_O0-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm2
; CHECK_O0-NEXT:    vmovdqu %ymm2, (%rdi)
; CHECK_O0-NEXT:    vzeroupper
; CHECK_O0-NEXT:    retq
  %b = load <8 x i32>, <8 x i32>* %bp, align 1
  %x = add <8 x i32> zeroinitializer, %b
  store <8 x i32> %x, <8 x i32>* %ret, align 1
  ret void
}

define void @add4i64a64(<4 x i64>* %ret, <4 x i64>* %bp) nounwind {
; CHECK-LABEL: add4i64a64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps (%rsi), %ymm0
; CHECK-NEXT:    vmovaps %ymm0, (%rdi)
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
;
; CHECK_O0-LABEL: add4i64a64:
; CHECK_O0:       # %bb.0:
; CHECK_O0-NEXT:    vmovaps (%rsi), %ymm0
; CHECK_O0-NEXT:    vmovdqa %ymm0, (%rdi)
; CHECK_O0-NEXT:    vzeroupper
; CHECK_O0-NEXT:    retq
  %b = load <4 x i64>, <4 x i64>* %bp, align 64
  %x = add <4 x i64> zeroinitializer, %b
  store <4 x i64> %x, <4 x i64>* %ret, align 64
  ret void
}

define void @add4i64a16(<4 x i64>* %ret, <4 x i64>* %bp) nounwind {
; CHECK-LABEL: add4i64a16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps (%rsi), %xmm0
; CHECK-NEXT:    vmovaps 16(%rsi), %xmm1
; CHECK-NEXT:    vmovaps %xmm1, 16(%rdi)
; CHECK-NEXT:    vmovaps %xmm0, (%rdi)
; CHECK-NEXT:    retq
;
; CHECK_O0-LABEL: add4i64a16:
; CHECK_O0:       # %bb.0:
; CHECK_O0-NEXT:    vmovdqa (%rsi), %xmm0
; CHECK_O0-NEXT:    vmovdqa 16(%rsi), %xmm1
; CHECK_O0-NEXT:    # implicit-def: $ymm2
; CHECK_O0-NEXT:    vmovaps %xmm0, %xmm2
; CHECK_O0-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm2
; CHECK_O0-NEXT:    vmovdqu %ymm2, (%rdi)
; CHECK_O0-NEXT:    vzeroupper
; CHECK_O0-NEXT:    retq
  %b = load <4 x i64>, <4 x i64>* %bp, align 16
  %x = add <4 x i64> zeroinitializer, %b
  store <4 x i64> %x, <4 x i64>* %ret, align 16
  ret void
}
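
;;
;; The CHECK and CHECK_O0 assertion blocks above follow the output format of
;; utils/update_llc_test_checks.py (see the NOTE on the first line). A sketch of a
;; regeneration command after codegen changes, assuming a built llc is available;
;; the binary path and test path are illustrative, not taken from this file:
;;
;;   utils/update_llc_test_checks.py --llc-binary=build/bin/llc <path-to-this-test>.ll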