; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mattr=sse2 | FileCheck %s

; Source file looks something like this:
;
; typedef int AAA[100][100];
;
; void testCombineMultiplies(AAA a, int lll)
; {
;   int LOC = lll + 5;
;
;   a[LOC][LOC] = 11;
;
;   a[LOC][20] = 22;
;   a[LOC+20][20] = 33;
; }
;
; We want to make sure we don't generate 2 multiply instructions,
; one for a[LOC][] and one for a[LOC+20][]. visitMUL in DAGCombiner.cpp
; should combine the instructions in such a way to avoid the extra
; multiply.
;
; Output looks roughly like this:
;
;   movl      8(%esp), %eax
;   movl      12(%esp), %ecx
;   imull     $400, %ecx, %edx        # imm = 0x190
;   leal      (%edx,%eax), %esi
;   movl      $11, 2020(%esi,%ecx,4)
;   movl      $22, 2080(%edx,%eax)
;   movl      $33, 10080(%edx,%eax)

; Function Attrs: nounwind
define void @testCombineMultiplies([100 x i32]* nocapture %a, i32 %lll) nounwind {
; CHECK-LABEL: testCombineMultiplies:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    pushl %esi
; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; CHECK-NEXT:    imull $400, %ecx, %edx # imm = 0x190
; CHECK-NEXT:    leal (%edx,%eax), %esi
; CHECK-NEXT:    movl $11, 2020(%esi,%ecx,4)
; CHECK-NEXT:    movl $22, 2080(%edx,%eax)
; CHECK-NEXT:    movl $33, 10080(%edx,%eax)
; CHECK-NEXT:    popl %esi
; CHECK-NEXT:    retl
entry:
  %add = add nsw i32 %lll, 5
  %arrayidx1 = getelementptr inbounds [100 x i32], [100 x i32]* %a, i32 %add, i32 %add
  store i32 11, i32* %arrayidx1, align 4
  %arrayidx3 = getelementptr inbounds [100 x i32], [100 x i32]* %a, i32 %add, i32 20
  store i32 22, i32* %arrayidx3, align 4
  %add4 = add nsw i32 %lll, 25
  %arrayidx6 = getelementptr inbounds [100 x i32], [100 x i32]* %a, i32 %add4, i32 20
  store i32 33, i32* %arrayidx6, align 4
  ret void
}


; Test for the same optimization on vector multiplies.
;
; Source looks something like this:
;
; typedef int v4int __attribute__((__vector_size__(16)));
;
; v4int x;
; v4int v2, v3;
; void testCombineMultiplies_splat(v4int v1) {
;   v2 = (v1 + (v4int){ 11, 11, 11, 11 }) * (v4int){ 22, 22, 22, 22 };
;   v3 = (v1 + (v4int){ 33, 33, 33, 33 }) * (v4int){ 22, 22, 22, 22 };
;   x  = (v1 + (v4int){ 11, 11, 11, 11 });
; }
;
; Output looks something like this:
;
; testCombineMultiplies_splat:            # @testCombineMultiplies_splat
; # %bb.0:                                # %entry
;   movdqa    .LCPI1_0, %xmm1             # xmm1 = [11,11,11,11]
;   paddd     %xmm0, %xmm1
;   movdqa    .LCPI1_1, %xmm2             # xmm2 = [22,22,22,22]
;   pshufd    $245, %xmm0, %xmm3          # xmm3 = xmm0[1,1,3,3]
;   pmuludq   %xmm2, %xmm0
;   pshufd    $232, %xmm0, %xmm0          # xmm0 = xmm0[0,2,2,3]
;   pmuludq   %xmm2, %xmm3
;   pshufd    $232, %xmm3, %xmm2          # xmm2 = xmm3[0,2,2,3]
;   punpckldq %xmm2, %xmm0                # xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
;   movdqa    .LCPI1_2, %xmm2             # xmm2 = [242,242,242,242]
;   paddd     %xmm0, %xmm2
;   paddd     .LCPI1_3, %xmm0
;   movdqa    %xmm2, v2
;   movdqa    %xmm0, v3
;   movdqa    %xmm1, x
;   retl
;
; Again, we want to make sure we don't generate two different multiplies.
; We should have a single multiply for "v1 * {22, 22, 22, 22}" (made up of two
; pmuludq instructions), followed by two adds. Without this optimization, we'd
; do 2 adds, followed by 2 multiplies (i.e. 4 pmuludq instructions).
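;
; Roughly speaking (this is a sketch, not the exact DAG node order), the fold
; exercised here is:
;   (mul (add x, c1), c2) --> (add (mul x, c2), c1*c2)
; With c1 = {11,...} / {33,...} and c2 = {22,...}, both results reuse the
; single v1 * {22,22,22,22} product, and the folded constants become
; 11*22 = 242 (the [242,242,242,242] operand below) and 33*22 = 726 (the
; remaining constant-pool operand). The scalar test above relies on the same
; idea: all three stores reuse the one imull $400 product.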

@v2 = common global <4 x i32> zeroinitializer, align 16
@v3 = common global <4 x i32> zeroinitializer, align 16
@x = common global <4 x i32> zeroinitializer, align 16

; Function Attrs: nounwind
define void @testCombineMultiplies_splat(<4 x i32> %v1) nounwind {
; CHECK-LABEL: testCombineMultiplies_splat:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    movdqa {{.*#+}} xmm1 = [11,11,11,11]
; CHECK-NEXT:    paddd %xmm0, %xmm1
; CHECK-NEXT:    movdqa {{.*#+}} xmm2 = [22,22,22,22]
; CHECK-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; CHECK-NEXT:    pmuludq %xmm2, %xmm0
; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; CHECK-NEXT:    pmuludq %xmm2, %xmm3
; CHECK-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3]
; CHECK-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; CHECK-NEXT:    movdqa {{.*#+}} xmm2 = [242,242,242,242]
; CHECK-NEXT:    paddd %xmm0, %xmm2
; CHECK-NEXT:    paddd {{\.LCPI.*}}, %xmm0
; CHECK-NEXT:    movdqa %xmm2, v2
; CHECK-NEXT:    movdqa %xmm0, v3
; CHECK-NEXT:    movdqa %xmm1, x
; CHECK-NEXT:    retl
entry:
  %add1 = add <4 x i32> %v1, <i32 11, i32 11, i32 11, i32 11>
  %mul1 = mul <4 x i32> %add1, <i32 22, i32 22, i32 22, i32 22>
  %add2 = add <4 x i32> %v1, <i32 33, i32 33, i32 33, i32 33>
  %mul2 = mul <4 x i32> %add2, <i32 22, i32 22, i32 22, i32 22>
  store <4 x i32> %mul1, <4 x i32>* @v2, align 16
  store <4 x i32> %mul2, <4 x i32>* @v3, align 16
  store <4 x i32> %add1, <4 x i32>* @x, align 16
  ret void
}

; Finally, check the non-splatted vector case. This is very similar
; to the previous test case, except for the vector values.

; Function Attrs: nounwind
define void @testCombineMultiplies_non_splat(<4 x i32> %v1) nounwind {
; CHECK-LABEL: testCombineMultiplies_non_splat:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    movdqa {{.*#+}} xmm1 = [11,22,33,44]
; CHECK-NEXT:    paddd %xmm0, %xmm1
; CHECK-NEXT:    movdqa {{.*#+}} xmm2 = [22,33,44,55]
; CHECK-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; CHECK-NEXT:    pmuludq %xmm2, %xmm0
; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; CHECK-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; CHECK-NEXT:    pmuludq %xmm3, %xmm2
; CHECK-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; CHECK-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; CHECK-NEXT:    movdqa {{.*#+}} xmm2 = [242,726,1452,2420]
; CHECK-NEXT:    paddd %xmm0, %xmm2
; CHECK-NEXT:    paddd {{\.LCPI.*}}, %xmm0
; CHECK-NEXT:    movdqa %xmm2, v2
; CHECK-NEXT:    movdqa %xmm0, v3
; CHECK-NEXT:    movdqa %xmm1, x
; CHECK-NEXT:    retl
entry:
  %add1 = add <4 x i32> %v1, <i32 11, i32 22, i32 33, i32 44>
  %mul1 = mul <4 x i32> %add1, <i32 22, i32 33, i32 44, i32 55>
  %add2 = add <4 x i32> %v1, <i32 33, i32 44, i32 55, i32 66>
  %mul2 = mul <4 x i32> %add2, <i32 22, i32 33, i32 44, i32 55>
  store <4 x i32> %mul1, <4 x i32>* @v2, align 16
  store <4 x i32> %mul2, <4 x i32>* @v3, align 16
  store <4 x i32> %add1, <4 x i32>* @x, align 16
  ret void
}
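
; For the non-splat case, the same combine should fold the constants
; lane-by-lane: {11,22,33,44} * {22,33,44,55} = {242,726,1452,2420} (the
; vector visible in the CHECK lines above), and {33,44,55,66} * {22,33,44,55}
; = {726,1452,2420,3630}, which is what the constant-pool operand of the
; second paddd is expected to hold.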