; RUN: llc -mtriple x86_64-apple-macosx -mcpu=corei7-avx -combiner-stress-load-slicing < %s -o - | FileCheck %s --check-prefix=STRESS
; RUN: llc -mtriple x86_64-apple-macosx -mcpu=corei7-avx < %s -o - | FileCheck %s --check-prefix=REGULAR
;
; <rdar://problem/14477220>

%class.Complex = type { float, float }


; Check that independent slices lead to independent loads, and that the slices
; end up in a different register file.
;
; The layout is:
;    LSB 0 1 2 3 | 4 5 6 7 MSB
;          Low      High
; The base address points to 0 and is 8-byte aligned.
; Low slice starts at 0 (base) and is 8-byte aligned.
; High slice starts at 4 (base + 4 bytes) and is 4-byte aligned.
;
; STRESS-LABEL: t1:
; Load out[out_start + 8].real, this is base + 8 * 8 + 0.
; STRESS: vmovss 64([[BASE:[^)]+]]), [[OUT_Real:%xmm[0-9]+]]
; Add low slice: out[out_start].real, this is base + 0.
; STRESS-NEXT: vaddss ([[BASE]]), [[OUT_Real]], [[RES_Real:%xmm[0-9]+]]
; Load out[out_start + 8].imm, this is base + 8 * 8 + 4.
; STRESS-NEXT: vmovss 68([[BASE]]), [[OUT_Imm:%xmm[0-9]+]]
; Add high slice: out[out_start].imm, this is base + 4.
; STRESS-NEXT: vaddss 4([[BASE]]), [[OUT_Imm]], [[RES_Imm:%xmm[0-9]+]]
; Pack the two results into one vector: Real in lane 0, Imm in lane 1.
; STRESS-NEXT: vinsertps $16, [[RES_Imm]], [[RES_Real]], [[RES_Vec:%xmm[0-9]+]]
; Put the results back into out[out_start].
; STRESS-NEXT: vmovq [[RES_Vec]], ([[BASE]])
;
; Same for the REGULAR heuristic: slicing removes the register bank copy for
; each slice.
; REGULAR-LABEL: t1:
; Load out[out_start + 8].real, this is base + 8 * 8 + 0.
; REGULAR: vmovss 64([[BASE:[^)]+]]), [[OUT_Real:%xmm[0-9]+]]
; Add low slice: out[out_start].real, this is base + 0.
; REGULAR-NEXT: vaddss ([[BASE]]), [[OUT_Real]], [[RES_Real:%xmm[0-9]+]]
; Load out[out_start + 8].imm, this is base + 8 * 8 + 4.
; REGULAR-NEXT: vmovss 68([[BASE]]), [[OUT_Imm:%xmm[0-9]+]]
; Add high slice: out[out_start].imm, this is base + 4.
; REGULAR-NEXT: vaddss 4([[BASE]]), [[OUT_Imm]], [[RES_Imm:%xmm[0-9]+]]
; Pack the two results into one vector: Real in lane 0, Imm in lane 1.
; REGULAR-NEXT: vinsertps $16, [[RES_Imm]], [[RES_Real]], [[RES_Vec:%xmm[0-9]+]]
; Put the results back into out[out_start].
; REGULAR-NEXT: vmovq [[RES_Vec]], ([[BASE]])
define void @t1(%class.Complex* nocapture %out, i64 %out_start) {
entry:
  ; Read out[out_start] as a single 64-bit integer; the two floats are then
  ; carved out of it with trunc / lshr+trunc, which is the pattern under test.
  %arrayidx = getelementptr inbounds %class.Complex* %out, i64 %out_start
  %tmp = bitcast %class.Complex* %arrayidx to i64*
  %tmp1 = load i64* %tmp, align 8
  ; Low slice: bits 0-31, reinterpreted as float.
  %t0.sroa.0.0.extract.trunc = trunc i64 %tmp1 to i32
  %tmp2 = bitcast i32 %t0.sroa.0.0.extract.trunc to float
  ; High slice: bits 32-63, reinterpreted as float.
  %t0.sroa.2.0.extract.shift = lshr i64 %tmp1, 32
  %t0.sroa.2.0.extract.trunc = trunc i64 %t0.sroa.2.0.extract.shift to i32
  %tmp3 = bitcast i32 %t0.sroa.2.0.extract.trunc to float
  ; Add field 0 of out[out_start + 8] to the low slice.
  %add = add i64 %out_start, 8
  %arrayidx2 = getelementptr inbounds %class.Complex* %out, i64 %add
  %i.i = getelementptr inbounds %class.Complex* %arrayidx2, i64 0, i32 0
  %tmp4 = load float* %i.i, align 4
  %add.i = fadd float %tmp4, %tmp2
  %retval.sroa.0.0.vec.insert.i = insertelement <2 x float> undef, float %add.i, i32 0
  ; Add field 1 of out[out_start + 8] to the high slice.
  %r.i = getelementptr inbounds %class.Complex* %arrayidx2, i64 0, i32 1
  %tmp5 = load float* %r.i, align 4
  %add5.i = fadd float %tmp5, %tmp3
  %retval.sroa.0.4.vec.insert.i = insertelement <2 x float> %retval.sroa.0.0.vec.insert.i, float %add5.i, i32 1
  ; Store both sums back over out[out_start] as one <2 x float>.
  %ref.tmp.sroa.0.0.cast = bitcast %class.Complex* %arrayidx to <2 x float>*
  store <2 x float> %retval.sroa.0.4.vec.insert.i, <2 x float>* %ref.tmp.sroa.0.0.cast, align 4
  ret void
}

; Function Attrs: nounwind
declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i32, i1) nounwind

; Function Attrs: nounwind
declare void @llvm.lifetime.start(i64, i8* nocapture)

; Function Attrs: nounwind
declare void @llvm.lifetime.end(i64, i8* nocapture)

; Check that we do not read outside of the chunk of bits of the original loads.
;
; The 64-bit load is expected to be split into one 32-bit slice and one
; 16-bit slice. The 16-bit slice must be zero extended to match the final type.
;
; The memory layout is:
;    LSB 0 1 2 3 | 4 5 | 6 7 MSB
;          Low            High
; The base address points to 0 and is 8-byte aligned.
; Low slice starts at 0 (base) and is 8-byte aligned.
; High slice starts at 6 (base + 6 bytes) and is 2-byte aligned.
;
; STRESS-LABEL: t2:
; STRESS: movzwl 6([[BASE:[^)]+]]), %eax
; STRESS-NEXT: addl ([[BASE]]), %eax
; STRESS-NEXT: ret
;
; Under the REGULAR heuristic it is not profitable to slice pieces that are not
; adjacent in memory. Here bytes #4-5 form a hole between the two slices.
; REGULAR-LABEL: t2:
; REGULAR: shrq $48
define i32 @t2(%class.Complex* nocapture %out, i64 %out_start) {
entry:
  ; View out[out_start] as one 64-bit integer.
  %elt.addr = getelementptr inbounds %class.Complex* %out, i64 %out_start
  %elt.as.i64 = bitcast %class.Complex* %elt.addr to i64*
  %wide = load i64* %elt.as.i64, align 8
  ; Low slice: bits 0-31.
  %lo32 = trunc i64 %wide to i32
  ; High slice: bits 48-63, left in the low half of an i32 by the shift.
  %hi.shifted = lshr i64 %wide, 48
  %hi16 = trunc i64 %hi.shifted to i32
  %sum = add i32 %hi16, %lo32
  ret i32 %sum
}

; Check that overlapping slices are left alone.
;
; The 64-bit load must NOT be split, because the slices overlap:
; First slice uses bytes numbered 0 to 3.
; Second slice uses bytes numbered 6 and 7.
; Third slice uses bytes numbered 4 to 7.
;
; STRESS-LABEL: t3:
; STRESS: shrq $48
; STRESS: shrq $32
;
; REGULAR-LABEL: t3:
; REGULAR: shrq $48
; REGULAR: shrq $32
define i32 @t3(%class.Complex* nocapture %out, i64 %out_start) {
entry:
  ; View out[out_start] as one 64-bit integer.
  %elt.addr = getelementptr inbounds %class.Complex* %out, i64 %out_start
  %elt.as.i64 = bitcast %class.Complex* %elt.addr to i64*
  %wide = load i64* %elt.as.i64, align 8
  ; Bits 0-31.
  %lo32 = trunc i64 %wide to i32
  ; Bits 48-63.
  %shr48 = lshr i64 %wide, 48
  %top16 = trunc i64 %shr48 to i32
  ; Bits 32-63; this range shares bits 48-63 with the previous slice.
  %shr32 = lshr i64 %wide, 32
  %hi32 = trunc i64 %shr32 to i32
  %partial = add i32 %top16, %lo32
  %total = add i32 %hi32, %partial
  ret i32 %total
}