; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -mtriple=x86_64-apple-macosx10.11.0 -slp-vectorizer -S -mattr=+sse2 | FileCheck %s --check-prefix=SSE
; RUN: opt < %s -mtriple=x86_64-apple-macosx10.11.0 -slp-vectorizer -S -mattr=+avx | FileCheck %s --check-prefix=AVX
; RUN: opt < %s -mtriple=x86_64-apple-macosx10.11.0 -slp-vectorizer -S -mattr=+avx2 | FileCheck %s --check-prefix=AVX

; Verify that the SLP vectorizer is able to figure out that commutativity
; offers the possibility to splat/broadcast %c and thus make it profitable
; to vectorize this case.

@cle = external unnamed_addr global [32 x i8], align 16
@cle32 = external unnamed_addr global [32 x i32], align 16


; Check that we correctly detect a splat/broadcast by leveraging the
; commutativity property of `xor`.

define void @splat(i8 %a, i8 %b, i8 %c) {
; SSE-LABEL: @splat(
; SSE-NEXT:    [[TMP1:%.*]] = xor i8 [[C:%.*]], [[A:%.*]]
; SSE-NEXT:    store i8 [[TMP1]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 0), align 16
; SSE-NEXT:    [[TMP2:%.*]] = xor i8 [[A]], [[C]]
; SSE-NEXT:    store i8 [[TMP2]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 1)
; SSE-NEXT:    [[TMP3:%.*]] = xor i8 [[A]], [[C]]
; SSE-NEXT:    store i8 [[TMP3]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 2)
; SSE-NEXT:    [[TMP4:%.*]] = xor i8 [[A]], [[C]]
; SSE-NEXT:    store i8 [[TMP4]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 3)
; SSE-NEXT:    [[TMP5:%.*]] = xor i8 [[C]], [[A]]
; SSE-NEXT:    store i8 [[TMP5]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 4)
; SSE-NEXT:    [[TMP6:%.*]] = xor i8 [[C]], [[B:%.*]]
; SSE-NEXT:    store i8 [[TMP6]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 5)
; SSE-NEXT:    [[TMP7:%.*]] = xor i8 [[C]], [[A]]
; SSE-NEXT:    store i8 [[TMP7]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 6)
; SSE-NEXT:    [[TMP8:%.*]] = xor i8 [[C]], [[B]]
; SSE-NEXT:    store i8 [[TMP8]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 7)
; SSE-NEXT:    [[TMP9:%.*]] = xor i8 [[A]], [[C]]
; SSE-NEXT:    store i8 [[TMP9]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 8)
; SSE-NEXT:    [[TMP10:%.*]] = xor i8 [[A]], [[C]]
; SSE-NEXT:    store i8 [[TMP10]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 9)
; SSE-NEXT:    [[TMP11:%.*]] = xor i8 [[A]], [[C]]
; SSE-NEXT:    store i8 [[TMP11]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 10)
; SSE-NEXT:    [[TMP12:%.*]] = xor i8 [[A]], [[C]]
; SSE-NEXT:    store i8 [[TMP12]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 11)
; SSE-NEXT:    [[TMP13:%.*]] = xor i8 [[A]], [[C]]
; SSE-NEXT:    store i8 [[TMP13]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 12)
; SSE-NEXT:    [[TMP14:%.*]] = xor i8 [[A]], [[C]]
; SSE-NEXT:    store i8 [[TMP14]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 13)
; SSE-NEXT:    [[TMP15:%.*]] = xor i8 [[A]], [[C]]
; SSE-NEXT:    store i8 [[TMP15]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 14)
; SSE-NEXT:    [[TMP16:%.*]] = xor i8 [[A]], [[C]]
; SSE-NEXT:    store i8 [[TMP16]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 15)
; SSE-NEXT:    ret void
;
; AVX-LABEL: @splat(
; AVX-NEXT:    [[TMP1:%.*]] = insertelement <16 x i8> undef, i8 [[C:%.*]], i32 0
; AVX-NEXT:    [[TMP2:%.*]] = insertelement <16 x i8> [[TMP1]], i8 [[C]], i32 1
; AVX-NEXT:    [[TMP3:%.*]] = insertelement <16 x i8> [[TMP2]], i8 [[C]], i32 2
; AVX-NEXT:    [[TMP4:%.*]] = insertelement <16 x i8> [[TMP3]], i8 [[C]], i32 3
; AVX-NEXT:    [[TMP5:%.*]] = insertelement <16 x i8> [[TMP4]], i8 [[C]], i32 4
; AVX-NEXT:    [[TMP6:%.*]] = insertelement <16 x i8> [[TMP5]], i8 [[C]], i32 5
; AVX-NEXT:    [[TMP7:%.*]] = insertelement <16 x i8> [[TMP6]], i8 [[C]], i32 6
; AVX-NEXT:    [[TMP8:%.*]] = insertelement <16 x i8> [[TMP7]], i8 [[C]], i32 7
; AVX-NEXT:    [[TMP9:%.*]] = insertelement <16 x i8> [[TMP8]], i8 [[C]], i32 8
; AVX-NEXT:    [[TMP10:%.*]] = insertelement <16 x i8> [[TMP9]], i8 [[C]], i32 9
; AVX-NEXT:    [[TMP11:%.*]] = insertelement <16 x i8> [[TMP10]], i8 [[C]], i32 10
; AVX-NEXT:    [[TMP12:%.*]] = insertelement <16 x i8> [[TMP11]], i8 [[C]], i32 11
; AVX-NEXT:    [[TMP13:%.*]] = insertelement <16 x i8> [[TMP12]], i8 [[C]], i32 12
; AVX-NEXT:    [[TMP14:%.*]] = insertelement <16 x i8> [[TMP13]], i8 [[C]], i32 13
; AVX-NEXT:    [[TMP15:%.*]] = insertelement <16 x i8> [[TMP14]], i8 [[C]], i32 14
; AVX-NEXT:    [[TMP16:%.*]] = insertelement <16 x i8> [[TMP15]], i8 [[C]], i32 15
; AVX-NEXT:    [[TMP17:%.*]] = insertelement <2 x i8> undef, i8 [[A:%.*]], i32 0
; AVX-NEXT:    [[TMP18:%.*]] = insertelement <2 x i8> [[TMP17]], i8 [[B:%.*]], i32 1
; AVX-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i8> [[TMP18]], <2 x i8> undef, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
; AVX-NEXT:    [[TMP19:%.*]] = xor <16 x i8> [[TMP16]], [[SHUFFLE]]
; AVX-NEXT:    store <16 x i8> [[TMP19]], <16 x i8>* bitcast ([32 x i8]* @cle to <16 x i8>*), align 16
; AVX-NEXT:    ret void
;
  %1 = xor i8 %c, %a
  store i8 %1, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 0), align 16
  %2 = xor i8 %a, %c
  store i8 %2, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 1)
  %3 = xor i8 %a, %c
  store i8 %3, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 2)
  %4 = xor i8 %a, %c
  store i8 %4, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 3)
  %5 = xor i8 %c, %a
  store i8 %5, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 4)
  %6 = xor i8 %c, %b
  store i8 %6, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 5)
  %7 = xor i8 %c, %a
  store i8 %7, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 6)
  %8 = xor i8 %c, %b
  store i8 %8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 7)
  %9 = xor i8 %a, %c
  store i8 %9, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 8)
  %10 = xor i8 %a, %c
  store i8 %10, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 9)
  %11 = xor i8 %a, %c
  store i8 %11, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 10)
  %12 = xor i8 %a, %c
  store i8 %12, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 11)
  %13 = xor i8 %a, %c
  store i8 %13, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 12)
  %14 = xor i8 %a, %c
  store i8 %14, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 13)
  %15 = xor i8 %a, %c
  store i8 %15, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 14)
  %16 = xor i8 %a, %c
  store i8 %16, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 15)
  ret void
}

; Check that we correctly detect that we can have the same opcode on one side by
; leveraging the commutativity property of `xor`.

define void @same_opcode_on_one_side(i32 %a, i32 %b, i32 %c) {
; SSE-LABEL: @same_opcode_on_one_side(
; SSE-NEXT:    [[ADD1:%.*]] = add i32 [[C:%.*]], [[A:%.*]]
; SSE-NEXT:    [[ADD2:%.*]] = add i32 [[C]], [[A]]
; SSE-NEXT:    [[ADD3:%.*]] = add i32 [[A]], [[C]]
; SSE-NEXT:    [[ADD4:%.*]] = add i32 [[C]], [[A]]
; SSE-NEXT:    [[TMP1:%.*]] = xor i32 [[ADD1]], [[A]]
; SSE-NEXT:    store i32 [[TMP1]], i32* getelementptr inbounds ([32 x i32], [32 x i32]* @cle32, i64 0, i64 0), align 16
; SSE-NEXT:    [[TMP2:%.*]] = xor i32 [[B:%.*]], [[ADD2]]
; SSE-NEXT:    store i32 [[TMP2]], i32* getelementptr inbounds ([32 x i32], [32 x i32]* @cle32, i64 0, i64 1)
; SSE-NEXT:    [[TMP3:%.*]] = xor i32 [[C]], [[ADD3]]
; SSE-NEXT:    store i32 [[TMP3]], i32* getelementptr inbounds ([32 x i32], [32 x i32]* @cle32, i64 0, i64 2)
; SSE-NEXT:    [[TMP4:%.*]] = xor i32 [[A]], [[ADD4]]
; SSE-NEXT:    store i32 [[TMP4]], i32* getelementptr inbounds ([32 x i32], [32 x i32]* @cle32, i64 0, i64 3)
; SSE-NEXT:    ret void
;
; AVX-LABEL: @same_opcode_on_one_side(
; AVX-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32> undef, i32 [[C:%.*]], i32 0
; AVX-NEXT:    [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[C]], i32 1
; AVX-NEXT:    [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[C]], i32 2
; AVX-NEXT:    [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[C]], i32 3
; AVX-NEXT:    [[TMP5:%.*]] = insertelement <4 x i32> undef, i32 [[A:%.*]], i32 0
; AVX-NEXT:    [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[A]], i32 1
; AVX-NEXT:    [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[A]], i32 2
; AVX-NEXT:    [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[A]], i32 3
; AVX-NEXT:    [[TMP9:%.*]] = add <4 x i32> [[TMP4]], [[TMP8]]
; AVX-NEXT:    [[TMP10:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[B:%.*]], i32 1
; AVX-NEXT:    [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[C]], i32 2
; AVX-NEXT:    [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[A]], i32 3
; AVX-NEXT:    [[TMP13:%.*]] = xor <4 x i32> [[TMP9]], [[TMP12]]
; AVX-NEXT:    store <4 x i32> [[TMP13]], <4 x i32>* bitcast ([32 x i32]* @cle32 to <4 x i32>*), align 16
; AVX-NEXT:    ret void
;
  %add1 = add i32 %c, %a
  %add2 = add i32 %c, %a
  %add3 = add i32 %a, %c
  %add4 = add i32 %c, %a
  %1 = xor i32 %add1, %a
  store i32 %1, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @cle32, i64 0, i64 0), align 16
  %2 = xor i32 %b, %add2
  store i32 %2, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @cle32, i64 0, i64 1)
  %3 = xor i32 %c, %add3
  store i32 %3, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @cle32, i64 0, i64 2)
  %4 = xor i32 %a, %add4
  store i32 %4, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @cle32, i64 0, i64 3)
  ret void
}