; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN:  opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux -mattr=+sse2     | FileCheck %s --check-prefixes=CHECK,SSE
; RUN:  opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx      | FileCheck %s --check-prefixes=CHECK,AVX
; RUN:  opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx2     | FileCheck %s --check-prefixes=CHECK,AVX2
; RUN:  opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx512f  | FileCheck %s --check-prefixes=CHECK,AVX512
; RUN:  opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx512vl | FileCheck %s --check-prefixes=CHECK,AVX512

; Scalar loads from %1 at offsets 0, 11, 4, 1; each value is incremented by
; 1..4 and the results are stored contiguously through %0 — all targets are
; expected to SLP-vectorize this into one <4 x i32> add + store.
define void @gather_load(i32* noalias nocapture %0, i32* noalias nocapture readonly %1) {
; CHECK-LABEL: @gather_load(
; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 1
; CHECK-NEXT:    [[TMP4:%.*]] = load i32, i32* [[TMP1]], align 4, [[TBAA0:!tbaa !.*]]
; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 11
; CHECK-NEXT:    [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4, [[TBAA0]]
; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4
; CHECK-NEXT:    [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4, [[TBAA0]]
; CHECK-NEXT:    [[TMP9:%.*]] = load i32, i32* [[TMP3]], align 4, [[TBAA0]]
; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <4 x i32> undef, i32 [[TMP4]], i32 0
; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[TMP6]], i32 1
; CHECK-NEXT:    [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP8]], i32 2
; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i32 3
; CHECK-NEXT:    [[TMP14:%.*]] = add nsw <4 x i32> [[TMP13]], <i32 1, i32 2, i32 3, i32 4>
; CHECK-NEXT:    [[TMP15:%.*]] = bitcast i32* [[TMP0:%.*]] to <4 x i32>*
; CHECK-NEXT:    store <4 x i32> [[TMP14]], <4 x i32>* [[TMP15]], align 4, [[TBAA0]]
; CHECK-NEXT:    ret void
;
  %3 = getelementptr inbounds i32, i32* %1, i64 1
  %4 = load i32, i32* %1, align 4, !tbaa !2
  %5 = getelementptr inbounds i32, i32* %0, i64 1
  %6 = getelementptr inbounds i32, i32* %1, i64 11
  %7 = load i32, i32* %6, align 4, !tbaa !2
  %8 = getelementptr inbounds i32, i32* %0, i64 2
  %9 = getelementptr inbounds i32, i32* %1, i64 4
  %10 = load i32, i32* %9, align 4, !tbaa !2
  %11 = getelementptr inbounds i32, i32* %0, i64 3
  %12 = load i32, i32* %3, align 4, !tbaa !2
  %13 = insertelement <4 x i32> undef, i32 %4, i32 0
  %14 = insertelement <4 x i32> %13, i32 %7, i32 1
  %15 = insertelement <4 x i32> %14, i32 %10, i32 2
  %16 = insertelement <4 x i32> %15, i32 %12, i32 3
  %17 = add nsw <4 x i32> %16, <i32 1, i32 2, i32 3, i32 4>
  %18 = bitcast i32* %0 to <4 x i32>*
  store <4 x i32> %17, <4 x i32>* %18, align 4, !tbaa !2
  ret void
}
; Loads from %1 at offsets 1, 10, 3, 5, adds 1..4, stores to %0[0..3].
; SSE/AVX keep this scalar; AVX2/AVX512 turn it into a masked gather plus a
; vector add and a single <4 x i32> store.
define void @gather_load_2(i32* noalias nocapture %0, i32* noalias nocapture readonly %1) {
; SSE-LABEL: @gather_load_2(
; SSE-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 1
; SSE-NEXT:    [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4, [[TBAA0:!tbaa !.*]]
; SSE-NEXT:    [[TMP5:%.*]] = add nsw i32 [[TMP4]], 1
; SSE-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 1
; SSE-NEXT:    store i32 [[TMP5]], i32* [[TMP0]], align 4, [[TBAA0]]
; SSE-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 10
; SSE-NEXT:    [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4, [[TBAA0]]
; SSE-NEXT:    [[TMP9:%.*]] = add nsw i32 [[TMP8]], 2
; SSE-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 2
; SSE-NEXT:    store i32 [[TMP9]], i32* [[TMP6]], align 4, [[TBAA0]]
; SSE-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 3
; SSE-NEXT:    [[TMP12:%.*]] = load i32, i32* [[TMP11]], align 4, [[TBAA0]]
; SSE-NEXT:    [[TMP13:%.*]] = add nsw i32 [[TMP12]], 3
; SSE-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 3
; SSE-NEXT:    store i32 [[TMP13]], i32* [[TMP10]], align 4, [[TBAA0]]
; SSE-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 5
; SSE-NEXT:    [[TMP16:%.*]] = load i32, i32* [[TMP15]], align 4, [[TBAA0]]
; SSE-NEXT:    [[TMP17:%.*]] = add nsw i32 [[TMP16]], 4
; SSE-NEXT:    store i32 [[TMP17]], i32* [[TMP14]], align 4, [[TBAA0]]
; SSE-NEXT:    ret void
;
; AVX-LABEL: @gather_load_2(
; AVX-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 1
; AVX-NEXT:    [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4, [[TBAA0:!tbaa !.*]]
; AVX-NEXT:    [[TMP5:%.*]] = add nsw i32 [[TMP4]], 1
; AVX-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 1
; AVX-NEXT:    store i32 [[TMP5]], i32* [[TMP0]], align 4, [[TBAA0]]
; AVX-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 10
; AVX-NEXT:    [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4, [[TBAA0]]
; AVX-NEXT:    [[TMP9:%.*]] = add nsw i32 [[TMP8]], 2
; AVX-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 2
; AVX-NEXT:    store i32 [[TMP9]], i32* [[TMP6]], align 4, [[TBAA0]]
; AVX-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 3
; AVX-NEXT:    [[TMP12:%.*]] = load i32, i32* [[TMP11]], align 4, [[TBAA0]]
; AVX-NEXT:    [[TMP13:%.*]] = add nsw i32 [[TMP12]], 3
; AVX-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 3
; AVX-NEXT:    store i32 [[TMP13]], i32* [[TMP10]], align 4, [[TBAA0]]
; AVX-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 5
; AVX-NEXT:    [[TMP16:%.*]] = load i32, i32* [[TMP15]], align 4, [[TBAA0]]
; AVX-NEXT:    [[TMP17:%.*]] = add nsw i32 [[TMP16]], 4
; AVX-NEXT:    store i32 [[TMP17]], i32* [[TMP14]], align 4, [[TBAA0]]
; AVX-NEXT:    ret void
;
; AVX2-LABEL: @gather_load_2(
; AVX2-NEXT:    [[TMP3:%.*]] = insertelement <4 x i32*> undef, i32* [[TMP1:%.*]], i32 0
; AVX2-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32*> [[TMP3]], <4 x i32*> undef, <4 x i32> zeroinitializer
; AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i32, <4 x i32*> [[TMP4]], <4 x i64> <i64 1, i64 10, i64 3, i64 5>
; AVX2-NEXT:    [[TMP6:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP5]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef), [[TBAA0:!tbaa !.*]]
; AVX2-NEXT:    [[TMP7:%.*]] = add nsw <4 x i32> [[TMP6]], <i32 1, i32 2, i32 3, i32 4>
; AVX2-NEXT:    [[TMP8:%.*]] = bitcast i32* [[TMP0:%.*]] to <4 x i32>*
; AVX2-NEXT:    store <4 x i32> [[TMP7]], <4 x i32>* [[TMP8]], align 4, [[TBAA0]]
; AVX2-NEXT:    ret void
;
; AVX512-LABEL: @gather_load_2(
; AVX512-NEXT:    [[TMP3:%.*]] = insertelement <4 x i32*> undef, i32* [[TMP1:%.*]], i32 0
; AVX512-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32*> [[TMP3]], <4 x i32*> undef, <4 x i32> zeroinitializer
; AVX512-NEXT:    [[TMP5:%.*]] = getelementptr i32, <4 x i32*> [[TMP4]], <4 x i64> <i64 1, i64 10, i64 3, i64 5>
; AVX512-NEXT:    [[TMP6:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP5]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef), [[TBAA0:!tbaa !.*]]
; AVX512-NEXT:    [[TMP7:%.*]] = add nsw <4 x i32> [[TMP6]], <i32 1, i32 2, i32 3, i32 4>
; AVX512-NEXT:    [[TMP8:%.*]] = bitcast i32* [[TMP0:%.*]] to <4 x i32>*
; AVX512-NEXT:    store <4 x i32> [[TMP7]], <4 x i32>* [[TMP8]], align 4, [[TBAA0]]
; AVX512-NEXT:    ret void
;
  %3 = getelementptr inbounds i32, i32* %1, i64 1
  %4 = load i32, i32* %3, align 4, !tbaa !2
  %5 = add nsw i32 %4, 1
  %6 = getelementptr inbounds i32, i32* %0, i64 1
  store i32 %5, i32* %0, align 4, !tbaa !2
  %7 = getelementptr inbounds i32, i32* %1, i64 10
  %8 = load i32, i32* %7, align 4, !tbaa !2
  %9 = add nsw i32 %8, 2
  %10 = getelementptr inbounds i32, i32* %0, i64 2
  store i32 %9, i32* %6, align 4, !tbaa !2
  %11 = getelementptr inbounds i32, i32* %1, i64 3
  %12 = load i32, i32* %11, align 4, !tbaa !2
  %13 = add nsw i32 %12, 3
  %14 = getelementptr inbounds i32, i32* %0, i64 3
  store i32 %13, i32* %10, align 4, !tbaa !2
  %15 = getelementptr inbounds i32, i32* %1, i64 5
  %16 = load i32, i32* %15, align 4, !tbaa !2
  %17 = add nsw i32 %16, 4
  store i32 %17, i32* %14, align 4, !tbaa !2
  ret void
}


; Eight scalar load/add/store chains: %1 is read at offsets 0, 11, 4, 15, 18,
; 9, 6, 21 with addends 1,2,3,4,1,2,3,4, stored to %0[0..7]. SSE/AVX stay
; scalar; AVX2/AVX512 vectorize the middle four lanes via a masked gather.
define void @gather_load_3(i32* noalias nocapture %0, i32* noalias nocapture readonly %1) {
; SSE-LABEL: @gather_load_3(
; SSE-NEXT:    [[TMP3:%.*]] = load i32, i32* [[TMP1:%.*]], align 4, [[TBAA0]]
; SSE-NEXT:    [[TMP4:%.*]] = add i32 [[TMP3]], 1
; SSE-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 1
; SSE-NEXT:    store i32 [[TMP4]], i32* [[TMP0]], align 4, [[TBAA0]]
; SSE-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 11
; SSE-NEXT:    [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4, [[TBAA0]]
; SSE-NEXT:    [[TMP8:%.*]] = add i32 [[TMP7]], 2
; SSE-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 2
; SSE-NEXT:    store i32 [[TMP8]], i32* [[TMP5]], align 4, [[TBAA0]]
; SSE-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4
; SSE-NEXT:    [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4, [[TBAA0]]
; SSE-NEXT:    [[TMP12:%.*]] = add i32 [[TMP11]], 3
; SSE-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 3
; SSE-NEXT:    store i32 [[TMP12]], i32* [[TMP9]], align 4, [[TBAA0]]
; SSE-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 15
; SSE-NEXT:    [[TMP15:%.*]] = load i32, i32* [[TMP14]], align 4, [[TBAA0]]
; SSE-NEXT:    [[TMP16:%.*]] = add i32 [[TMP15]], 4
; SSE-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 4
; SSE-NEXT:    store i32 [[TMP16]], i32* [[TMP13]], align 4, [[TBAA0]]
; SSE-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 18
; SSE-NEXT:    [[TMP19:%.*]] = load i32, i32* [[TMP18]], align 4, [[TBAA0]]
; SSE-NEXT:    [[TMP20:%.*]] = add i32 [[TMP19]], 1
; SSE-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 5
; SSE-NEXT:    store i32 [[TMP20]], i32* [[TMP17]], align 4, [[TBAA0]]
; SSE-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9
; SSE-NEXT:    [[TMP23:%.*]] = load i32, i32* [[TMP22]], align 4, [[TBAA0]]
; SSE-NEXT:    [[TMP24:%.*]] = add i32 [[TMP23]], 2
; SSE-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 6
; SSE-NEXT:    store i32 [[TMP24]], i32* [[TMP21]], align 4, [[TBAA0]]
; SSE-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6
; SSE-NEXT:    [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4, [[TBAA0]]
; SSE-NEXT:    [[TMP28:%.*]] = add i32 [[TMP27]], 3
; SSE-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 7
; SSE-NEXT:    store i32 [[TMP28]], i32* [[TMP25]], align 4, [[TBAA0]]
; SSE-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21
; SSE-NEXT:    [[TMP31:%.*]] = load i32, i32* [[TMP30]], align 4, [[TBAA0]]
; SSE-NEXT:    [[TMP32:%.*]] = add i32 [[TMP31]], 4
; SSE-NEXT:    store i32 [[TMP32]], i32* [[TMP29]], align 4, [[TBAA0]]
; SSE-NEXT:    ret void
;
; AVX-LABEL: @gather_load_3(
; AVX-NEXT:    [[TMP3:%.*]] = load i32, i32* [[TMP1:%.*]], align 4, [[TBAA0]]
; AVX-NEXT:    [[TMP4:%.*]] = add i32 [[TMP3]], 1
; AVX-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 1
; AVX-NEXT:    store i32 [[TMP4]], i32* [[TMP0]], align 4, [[TBAA0]]
; AVX-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 11
; AVX-NEXT:    [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4, [[TBAA0]]
; AVX-NEXT:    [[TMP8:%.*]] = add i32 [[TMP7]], 2
; AVX-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 2
; AVX-NEXT:    store i32 [[TMP8]], i32* [[TMP5]], align 4, [[TBAA0]]
; AVX-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4
; AVX-NEXT:    [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4, [[TBAA0]]
; AVX-NEXT:    [[TMP12:%.*]] = add i32 [[TMP11]], 3
; AVX-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 3
; AVX-NEXT:    store i32 [[TMP12]], i32* [[TMP9]], align 4, [[TBAA0]]
; AVX-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 15
; AVX-NEXT:    [[TMP15:%.*]] = load i32, i32* [[TMP14]], align 4, [[TBAA0]]
; AVX-NEXT:    [[TMP16:%.*]] = add i32 [[TMP15]], 4
; AVX-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 4
; AVX-NEXT:    store i32 [[TMP16]], i32* [[TMP13]], align 4, [[TBAA0]]
; AVX-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 18
; AVX-NEXT:    [[TMP19:%.*]] = load i32, i32* [[TMP18]], align 4, [[TBAA0]]
; AVX-NEXT:    [[TMP20:%.*]] = add i32 [[TMP19]], 1
; AVX-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 5
; AVX-NEXT:    store i32 [[TMP20]], i32* [[TMP17]], align 4, [[TBAA0]]
; AVX-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9
; AVX-NEXT:    [[TMP23:%.*]] = load i32, i32* [[TMP22]], align 4, [[TBAA0]]
; AVX-NEXT:    [[TMP24:%.*]] = add i32 [[TMP23]], 2
; AVX-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 6
; AVX-NEXT:    store i32 [[TMP24]], i32* [[TMP21]], align 4, [[TBAA0]]
; AVX-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6
; AVX-NEXT:    [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4, [[TBAA0]]
; AVX-NEXT:    [[TMP28:%.*]] = add i32 [[TMP27]], 3
; AVX-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 7
; AVX-NEXT:    store i32 [[TMP28]], i32* [[TMP25]], align 4, [[TBAA0]]
; AVX-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21
; AVX-NEXT:    [[TMP31:%.*]] = load i32, i32* [[TMP30]], align 4, [[TBAA0]]
; AVX-NEXT:    [[TMP32:%.*]] = add i32 [[TMP31]], 4
; AVX-NEXT:    store i32 [[TMP32]], i32* [[TMP29]], align 4, [[TBAA0]]
; AVX-NEXT:    ret void
;
; AVX2-LABEL: @gather_load_3(
; AVX2-NEXT:    [[TMP3:%.*]] = load i32, i32* [[TMP1:%.*]], align 4, [[TBAA0]]
; AVX2-NEXT:    [[TMP4:%.*]] = add i32 [[TMP3]], 1
; AVX2-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 1
; AVX2-NEXT:    store i32 [[TMP4]], i32* [[TMP0]], align 4, [[TBAA0]]
; AVX2-NEXT:    [[TMP6:%.*]] = insertelement <4 x i32*> undef, i32* [[TMP1]], i32 0
; AVX2-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i32*> [[TMP6]], <4 x i32*> undef, <4 x i32> zeroinitializer
; AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i32, <4 x i32*> [[TMP7]], <4 x i64> <i64 11, i64 4, i64 15, i64 18>
; AVX2-NEXT:    [[TMP9:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP8]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef), [[TBAA0]]
; AVX2-NEXT:    [[TMP10:%.*]] = add <4 x i32> [[TMP9]], <i32 2, i32 3, i32 4, i32 1>
; AVX2-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 5
; AVX2-NEXT:    [[TMP12:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>*
; AVX2-NEXT:    store <4 x i32> [[TMP10]], <4 x i32>* [[TMP12]], align 4, [[TBAA0]]
; AVX2-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9
; AVX2-NEXT:    [[TMP14:%.*]] = load i32, i32* [[TMP13]], align 4, [[TBAA0]]
; AVX2-NEXT:    [[TMP15:%.*]] = add i32 [[TMP14]], 2
; AVX2-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 6
; AVX2-NEXT:    store i32 [[TMP15]], i32* [[TMP11]], align 4, [[TBAA0]]
; AVX2-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6
; AVX2-NEXT:    [[TMP18:%.*]] = load i32, i32* [[TMP17]], align 4, [[TBAA0]]
; AVX2-NEXT:    [[TMP19:%.*]] = add i32 [[TMP18]], 3
; AVX2-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 7
; AVX2-NEXT:    store i32 [[TMP19]], i32* [[TMP16]], align 4, [[TBAA0]]
; AVX2-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21
; AVX2-NEXT:    [[TMP22:%.*]] = load i32, i32* [[TMP21]], align 4, [[TBAA0]]
; AVX2-NEXT:    [[TMP23:%.*]] = add i32 [[TMP22]], 4
; AVX2-NEXT:    store i32 [[TMP23]], i32* [[TMP20]], align 4, [[TBAA0]]
; AVX2-NEXT:    ret void
;
; AVX512-LABEL: @gather_load_3(
; AVX512-NEXT:    [[TMP3:%.*]] = load i32, i32* [[TMP1:%.*]], align 4, [[TBAA0]]
; AVX512-NEXT:    [[TMP4:%.*]] = add i32 [[TMP3]], 1
; AVX512-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 1
; AVX512-NEXT:    store i32 [[TMP4]], i32* [[TMP0]], align 4, [[TBAA0]]
; AVX512-NEXT:    [[TMP6:%.*]] = insertelement <4 x i32*> undef, i32* [[TMP1]], i32 0
; AVX512-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i32*> [[TMP6]], <4 x i32*> undef, <4 x i32> zeroinitializer
; AVX512-NEXT:    [[TMP8:%.*]] = getelementptr i32, <4 x i32*> [[TMP7]], <4 x i64> <i64 11, i64 4, i64 15, i64 18>
; AVX512-NEXT:    [[TMP9:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP8]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef), [[TBAA0]]
; AVX512-NEXT:    [[TMP10:%.*]] = add <4 x i32> [[TMP9]], <i32 2, i32 3, i32 4, i32 1>
; AVX512-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 5
; AVX512-NEXT:    [[TMP12:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>*
; AVX512-NEXT:    store <4 x i32> [[TMP10]], <4 x i32>* [[TMP12]], align 4, [[TBAA0]]
; AVX512-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9
; AVX512-NEXT:    [[TMP14:%.*]] = load i32, i32* [[TMP13]], align 4, [[TBAA0]]
; AVX512-NEXT:    [[TMP15:%.*]] = add i32 [[TMP14]], 2
; AVX512-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 6
; AVX512-NEXT:    store i32 [[TMP15]], i32* [[TMP11]], align 4, [[TBAA0]]
; AVX512-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6
; AVX512-NEXT:    [[TMP18:%.*]] = load i32, i32* [[TMP17]], align 4, [[TBAA0]]
; AVX512-NEXT:    [[TMP19:%.*]] = add i32 [[TMP18]], 3
; AVX512-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 7
; AVX512-NEXT:    store i32 [[TMP19]], i32* [[TMP16]], align 4, [[TBAA0]]
; AVX512-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21
; AVX512-NEXT:    [[TMP22:%.*]] = load i32, i32* [[TMP21]], align 4, [[TBAA0]]
; AVX512-NEXT:    [[TMP23:%.*]] = add i32 [[TMP22]], 4
; AVX512-NEXT:    store i32 [[TMP23]], i32* [[TMP20]], align 4, [[TBAA0]]
; AVX512-NEXT:    ret void
;
  %3 = load i32, i32* %1, align 4, !tbaa !2
  %4 = add i32 %3, 1
  %5 = getelementptr inbounds i32, i32* %0, i64 1
  store i32 %4, i32* %0, align 4, !tbaa !2
  %6 = getelementptr inbounds i32, i32* %1, i64 11
  %7 = load i32, i32* %6, align 4, !tbaa !2
  %8 = add i32 %7, 2
  %9 = getelementptr inbounds i32, i32* %0, i64 2
  store i32 %8, i32* %5, align 4, !tbaa !2
  %10 = getelementptr inbounds i32, i32* %1, i64 4
  %11 = load i32, i32* %10, align 4, !tbaa !2
  %12 = add i32 %11, 3
  %13 = getelementptr inbounds i32, i32* %0, i64 3
  store i32 %12, i32* %9, align 4, !tbaa !2
  %14 = getelementptr inbounds i32, i32* %1, i64 15
  %15 = load i32, i32* %14, align 4, !tbaa !2
  %16 = add i32 %15, 4
  %17 = getelementptr inbounds i32, i32* %0, i64 4
  store i32 %16, i32* %13, align 4, !tbaa !2
  %18 = getelementptr inbounds i32, i32* %1, i64 18
  %19 = load i32, i32* %18, align 4, !tbaa !2
  %20 = add i32 %19, 1
  %21 = getelementptr inbounds i32, i32* %0, i64 5
  store i32 %20, i32* %17, align 4, !tbaa !2
  %22 = getelementptr inbounds i32, i32* %1, i64 9
  %23 = load i32, i32* %22, align 4, !tbaa !2
  %24 = add i32 %23, 2
  %25 = getelementptr inbounds i32, i32* %0, i64 6
  store i32 %24, i32* %21, align 4, !tbaa !2
  %26 = getelementptr inbounds i32, i32* %1, i64 6
  %27 = load i32, i32* %26, align 4, !tbaa !2
  %28 = add i32 %27, 3
  %29 = getelementptr inbounds i32, i32* %0, i64 7
  store i32 %28, i32* %25, align 4, !tbaa !2
  %30 = getelementptr inbounds i32, i32* %1, i64 21
  %31 = load i32, i32* %30, align 4, !tbaa !2
  %32 = add i32 %31, 4
  store i32 %32, i32* %29, align 4, !tbaa !2
  ret void
}
; Same access pattern as @gather_load_3 (loads at 0,11,4,15,18,9,6,21; addends
; 1,2,3,4,1,2,3,4; stores to %t0[0..7]) but with all GEPs hoisted ahead of the
; loads and named %t values, exercising SLP on a pre-scheduled input.
define void @gather_load_4(i32* noalias nocapture %t0, i32* noalias nocapture readonly %t1) {
; SSE-LABEL: @gather_load_4(
; SSE-NEXT:    [[T5:%.*]] = getelementptr inbounds i32, i32* [[T0:%.*]], i64 1
; SSE-NEXT:    [[T6:%.*]] = getelementptr inbounds i32, i32* [[T1:%.*]], i64 11
; SSE-NEXT:    [[T9:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 2
; SSE-NEXT:    [[T10:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 4
; SSE-NEXT:    [[T13:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 3
; SSE-NEXT:    [[T14:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 15
; SSE-NEXT:    [[T17:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 4
; SSE-NEXT:    [[T18:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 18
; SSE-NEXT:    [[T21:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 5
; SSE-NEXT:    [[T22:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 9
; SSE-NEXT:    [[T25:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 6
; SSE-NEXT:    [[T26:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 6
; SSE-NEXT:    [[T29:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 7
; SSE-NEXT:    [[T30:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 21
; SSE-NEXT:    [[T3:%.*]] = load i32, i32* [[T1]], align 4, [[TBAA0]]
; SSE-NEXT:    [[T7:%.*]] = load i32, i32* [[T6]], align 4, [[TBAA0]]
; SSE-NEXT:    [[T11:%.*]] = load i32, i32* [[T10]], align 4, [[TBAA0]]
; SSE-NEXT:    [[T15:%.*]] = load i32, i32* [[T14]], align 4, [[TBAA0]]
; SSE-NEXT:    [[T19:%.*]] = load i32, i32* [[T18]], align 4, [[TBAA0]]
; SSE-NEXT:    [[T23:%.*]] = load i32, i32* [[T22]], align 4, [[TBAA0]]
; SSE-NEXT:    [[T27:%.*]] = load i32, i32* [[T26]], align 4, [[TBAA0]]
; SSE-NEXT:    [[T31:%.*]] = load i32, i32* [[T30]], align 4, [[TBAA0]]
; SSE-NEXT:    [[T4:%.*]] = add i32 [[T3]], 1
; SSE-NEXT:    [[T8:%.*]] = add i32 [[T7]], 2
; SSE-NEXT:    [[T12:%.*]] = add i32 [[T11]], 3
; SSE-NEXT:    [[T16:%.*]] = add i32 [[T15]], 4
; SSE-NEXT:    [[T20:%.*]] = add i32 [[T19]], 1
; SSE-NEXT:    [[T24:%.*]] = add i32 [[T23]], 2
; SSE-NEXT:    [[T28:%.*]] = add i32 [[T27]], 3
; SSE-NEXT:    [[T32:%.*]] = add i32 [[T31]], 4
; SSE-NEXT:    store i32 [[T4]], i32* [[T0]], align 4, [[TBAA0]]
; SSE-NEXT:    store i32 [[T8]], i32* [[T5]], align 4, [[TBAA0]]
; SSE-NEXT:    store i32 [[T12]], i32* [[T9]], align 4, [[TBAA0]]
; SSE-NEXT:    store i32 [[T16]], i32* [[T13]], align 4, [[TBAA0]]
; SSE-NEXT:    store i32 [[T20]], i32* [[T17]], align 4, [[TBAA0]]
; SSE-NEXT:    store i32 [[T24]], i32* [[T21]], align 4, [[TBAA0]]
; SSE-NEXT:    store i32 [[T28]], i32* [[T25]], align 4, [[TBAA0]]
; SSE-NEXT:    store i32 [[T32]], i32* [[T29]], align 4, [[TBAA0]]
; SSE-NEXT:    ret void
;
; AVX-LABEL: @gather_load_4(
; AVX-NEXT:    [[T5:%.*]] = getelementptr inbounds i32, i32* [[T0:%.*]], i64 1
; AVX-NEXT:    [[T6:%.*]] = getelementptr inbounds i32, i32* [[T1:%.*]], i64 11
; AVX-NEXT:    [[T9:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 2
; AVX-NEXT:    [[T10:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 4
; AVX-NEXT:    [[T13:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 3
; AVX-NEXT:    [[T14:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 15
; AVX-NEXT:    [[T17:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 4
; AVX-NEXT:    [[T18:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 18
; AVX-NEXT:    [[T21:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 5
; AVX-NEXT:    [[T22:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 9
; AVX-NEXT:    [[T25:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 6
; AVX-NEXT:    [[T26:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 6
; AVX-NEXT:    [[T29:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 7
; AVX-NEXT:    [[T30:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 21
; AVX-NEXT:    [[T3:%.*]] = load i32, i32* [[T1]], align 4, [[TBAA0]]
; AVX-NEXT:    [[T7:%.*]] = load i32, i32* [[T6]], align 4, [[TBAA0]]
; AVX-NEXT:    [[T11:%.*]] = load i32, i32* [[T10]], align 4, [[TBAA0]]
; AVX-NEXT:    [[T15:%.*]] = load i32, i32* [[T14]], align 4, [[TBAA0]]
; AVX-NEXT:    [[T19:%.*]] = load i32, i32* [[T18]], align 4, [[TBAA0]]
; AVX-NEXT:    [[T23:%.*]] = load i32, i32* [[T22]], align 4, [[TBAA0]]
; AVX-NEXT:    [[T27:%.*]] = load i32, i32* [[T26]], align 4, [[TBAA0]]
; AVX-NEXT:    [[T31:%.*]] = load i32, i32* [[T30]], align 4, [[TBAA0]]
; AVX-NEXT:    [[T4:%.*]] = add i32 [[T3]], 1
; AVX-NEXT:    [[T8:%.*]] = add i32 [[T7]], 2
; AVX-NEXT:    [[T12:%.*]] = add i32 [[T11]], 3
; AVX-NEXT:    [[T16:%.*]] = add i32 [[T15]], 4
; AVX-NEXT:    [[T20:%.*]] = add i32 [[T19]], 1
; AVX-NEXT:    [[T24:%.*]] = add i32 [[T23]], 2
; AVX-NEXT:    [[T28:%.*]] = add i32 [[T27]], 3
; AVX-NEXT:    [[T32:%.*]] = add i32 [[T31]], 4
; AVX-NEXT:    store i32 [[T4]], i32* [[T0]], align 4, [[TBAA0]]
; AVX-NEXT:    store i32 [[T8]], i32* [[T5]], align 4, [[TBAA0]]
; AVX-NEXT:    store i32 [[T12]], i32* [[T9]], align 4, [[TBAA0]]
; AVX-NEXT:    store i32 [[T16]], i32* [[T13]], align 4, [[TBAA0]]
; AVX-NEXT:    store i32 [[T20]], i32* [[T17]], align 4, [[TBAA0]]
; AVX-NEXT:    store i32 [[T24]], i32* [[T21]], align 4, [[TBAA0]]
; AVX-NEXT:    store i32 [[T28]], i32* [[T25]], align 4, [[TBAA0]]
; AVX-NEXT:    store i32 [[T32]], i32* [[T29]], align 4, [[TBAA0]]
; AVX-NEXT:    ret void
;
; AVX2-LABEL: @gather_load_4(
; AVX2-NEXT:    [[T5:%.*]] = getelementptr inbounds i32, i32* [[T0:%.*]], i64 1
; AVX2-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32*> undef, i32* [[T1:%.*]], i32 0
; AVX2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32*> [[TMP1]], <4 x i32*> undef, <4 x i32> zeroinitializer
; AVX2-NEXT:    [[TMP3:%.*]] = getelementptr i32, <4 x i32*> [[TMP2]], <4 x i64> <i64 11, i64 4, i64 15, i64 18>
; AVX2-NEXT:    [[T21:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 5
; AVX2-NEXT:    [[T22:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 9
; AVX2-NEXT:    [[T25:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 6
; AVX2-NEXT:    [[T26:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 6
; AVX2-NEXT:    [[T29:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 7
; AVX2-NEXT:    [[T30:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 21
; AVX2-NEXT:    [[T3:%.*]] = load i32, i32* [[T1]], align 4, [[TBAA0]]
; AVX2-NEXT:    [[TMP4:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP3]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef), [[TBAA0]]
; AVX2-NEXT:    [[T23:%.*]] = load i32, i32* [[T22]], align 4, [[TBAA0]]
; AVX2-NEXT:    [[T27:%.*]] = load i32, i32* [[T26]], align 4, [[TBAA0]]
; AVX2-NEXT:    [[T31:%.*]] = load i32, i32* [[T30]], align 4, [[TBAA0]]
; AVX2-NEXT:    [[T4:%.*]] = add i32 [[T3]], 1
; AVX2-NEXT:    [[TMP5:%.*]] = add <4 x i32> [[TMP4]], <i32 2, i32 3, i32 4, i32 1>
; AVX2-NEXT:    [[T24:%.*]] = add i32 [[T23]], 2
; AVX2-NEXT:    [[T28:%.*]] = add i32 [[T27]], 3
; AVX2-NEXT:    [[T32:%.*]] = add i32 [[T31]], 4
; AVX2-NEXT:    store i32 [[T4]], i32* [[T0]], align 4, [[TBAA0]]
; AVX2-NEXT:    [[TMP6:%.*]] = bitcast i32* [[T5]] to <4 x i32>*
; AVX2-NEXT:    store <4 x i32> [[TMP5]], <4 x i32>* [[TMP6]], align 4, [[TBAA0]]
; AVX2-NEXT:    store i32 [[T24]], i32* [[T21]], align 4, [[TBAA0]]
; AVX2-NEXT:    store i32 [[T28]], i32* [[T25]], align 4, [[TBAA0]]
; AVX2-NEXT:    store i32 [[T32]], i32* [[T29]], align 4, [[TBAA0]]
; AVX2-NEXT:    ret void
;
; AVX512-LABEL: @gather_load_4(
; AVX512-NEXT:    [[T5:%.*]] = getelementptr inbounds i32, i32* [[T0:%.*]], i64 1
; AVX512-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32*> undef, i32* [[T1:%.*]], i32 0
; AVX512-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32*> [[TMP1]], <4 x i32*> undef, <4 x i32> zeroinitializer
; AVX512-NEXT:    [[TMP3:%.*]] = getelementptr i32, <4 x i32*> [[TMP2]], <4 x i64> <i64 11, i64 4, i64 15, i64 18>
; AVX512-NEXT:    [[T21:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 5
; AVX512-NEXT:    [[T22:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 9
; AVX512-NEXT:    [[T25:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 6
; AVX512-NEXT:    [[T26:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 6
; AVX512-NEXT:    [[T29:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 7
; AVX512-NEXT:    [[T30:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 21
; AVX512-NEXT:    [[T3:%.*]] = load i32, i32* [[T1]], align 4, [[TBAA0]]
; AVX512-NEXT:    [[TMP4:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP3]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef), [[TBAA0]]
; AVX512-NEXT:    [[T23:%.*]] = load i32, i32* [[T22]], align 4, [[TBAA0]]
; AVX512-NEXT:    [[T27:%.*]] = load i32, i32* [[T26]], align 4, [[TBAA0]]
; AVX512-NEXT:    [[T31:%.*]] = load i32, i32* [[T30]], align 4, [[TBAA0]]
; AVX512-NEXT:    [[T4:%.*]] = add i32 [[T3]], 1
; AVX512-NEXT:    [[TMP5:%.*]] = add <4 x i32> [[TMP4]], <i32 2, i32 3, i32 4, i32 1>
; AVX512-NEXT:    [[T24:%.*]] = add i32 [[T23]], 2
; AVX512-NEXT:    [[T28:%.*]] = add i32 [[T27]], 3
; AVX512-NEXT:    [[T32:%.*]] = add i32 [[T31]], 4
; AVX512-NEXT:    store i32 [[T4]], i32* [[T0]], align 4, [[TBAA0]]
; AVX512-NEXT:    [[TMP6:%.*]] = bitcast i32* [[T5]] to <4 x i32>*
; AVX512-NEXT:    store <4 x i32> [[TMP5]], <4 x i32>* [[TMP6]], align 4, [[TBAA0]]
; AVX512-NEXT:    store i32 [[T24]], i32* [[T21]], align 4, [[TBAA0]]
; AVX512-NEXT:    store i32 [[T28]], i32* [[T25]], align 4, [[TBAA0]]
; AVX512-NEXT:    store i32 [[T32]], i32* [[T29]], align 4, [[TBAA0]]
; AVX512-NEXT:    ret void
;
  %t5 = getelementptr inbounds i32, i32* %t0, i64 1
  %t6 = getelementptr inbounds i32, i32* %t1, i64 11
  %t9 = getelementptr inbounds i32, i32* %t0, i64 2
  %t10 = getelementptr inbounds i32, i32* %t1, i64 4
  %t13 = getelementptr inbounds i32, i32* %t0, i64 3
  %t14 = getelementptr inbounds i32, i32* %t1, i64 15
  %t17 = getelementptr inbounds i32, i32* %t0, i64 4
  %t18 = getelementptr inbounds i32, i32* %t1, i64 18
  %t21 = getelementptr inbounds i32, i32* %t0, i64 5
  %t22 = getelementptr inbounds i32, i32* %t1, i64 9
  %t25 = getelementptr inbounds i32, i32* %t0, i64 6
  %t26 = getelementptr inbounds i32, i32* %t1, i64 6
  %t29 = getelementptr inbounds i32, i32* %t0, i64 7
  %t30 = getelementptr inbounds i32, i32* %t1, i64 21

  %t3 = load i32, i32* %t1, align 4, !tbaa !2
  %t7 = load i32, i32* %t6, align 4, !tbaa !2
  %t11 = load i32, i32* %t10, align 4, !tbaa !2
  %t15 = load i32, i32* %t14, align 4, !tbaa !2
  %t19 = load i32, i32* %t18, align 4, !tbaa !2
  %t23 = load i32, i32* %t22, align 4, !tbaa !2
  %t27 = load i32, i32* %t26, align 4, !tbaa !2
  %t31 = load i32, i32* %t30, align 4, !tbaa !2

  %t4 = add i32 %t3, 1
  %t8 = add i32 %t7, 2
  %t12 = add i32 %t11, 3
  %t16 = add i32 %t15, 4
  %t20 = add i32 %t19, 1
  %t24 = add i32 %t23, 2
  %t28 = add i32 %t27, 3
  %t32 = add i32 %t31, 4

  store i32 %t4, i32* %t0, align 4, !tbaa !2
  store i32 %t8, i32* %t5, align 4, !tbaa !2
  store i32 %t12, i32* %t9, align 4, !tbaa !2
  store i32 %t16, i32* %t13, align 4, !tbaa !2
  store i32 %t20, i32* %t17, align 4, !tbaa !2
  store i32 %t24, i32* %t21, align 4, !tbaa !2
  store i32 %t28, i32* %t25, align 4, !tbaa !2
  store i32 %t32, i32* %t29, align 4, !tbaa !2

  ret void
}
501
502
; Verify SLP vectorization of scattered loads feeding fdiv. The eight scalar
; numerator/denominator loads read floats from non-consecutive offsets of %1,
; so the vectorizer emits llvm.masked.gather calls (4-wide under SSE, 8-wide
; under AVX/AVX2/AVX512) and vector stores to the contiguous destination %0.
; NOTE(review): the CHECK lines below are autogenerated by
; utils/update_test_checks.py -- regenerate with that script rather than
; hand-editing them.
define void @gather_load_div(float* noalias nocapture %0, float* noalias nocapture readonly %1) {
; SSE-LABEL: @gather_load_div(
; SSE-NEXT:    [[TMP3:%.*]] = getelementptr inbounds float, float* [[TMP1:%.*]], i64 10
; SSE-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 3
; SSE-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 14
; SSE-NEXT:    [[TMP6:%.*]] = insertelement <4 x float*> undef, float* [[TMP1]], i32 0
; SSE-NEXT:    [[TMP7:%.*]] = insertelement <4 x float*> [[TMP6]], float* [[TMP3]], i32 1
; SSE-NEXT:    [[TMP8:%.*]] = insertelement <4 x float*> [[TMP7]], float* [[TMP4]], i32 2
; SSE-NEXT:    [[TMP9:%.*]] = insertelement <4 x float*> [[TMP8]], float* [[TMP5]], i32 3
; SSE-NEXT:    [[TMP10:%.*]] = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> [[TMP9]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef), [[TBAA0]]
; SSE-NEXT:    [[TMP11:%.*]] = shufflevector <4 x float*> [[TMP6]], <4 x float*> undef, <4 x i32> zeroinitializer
; SSE-NEXT:    [[TMP12:%.*]] = getelementptr float, <4 x float*> [[TMP11]], <4 x i64> <i64 4, i64 13, i64 11, i64 44>
; SSE-NEXT:    [[TMP13:%.*]] = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> [[TMP12]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef), [[TBAA0]]
; SSE-NEXT:    [[TMP14:%.*]] = fdiv <4 x float> [[TMP10]], [[TMP13]]
; SSE-NEXT:    [[TMP15:%.*]] = getelementptr inbounds float, float* [[TMP0:%.*]], i64 4
; SSE-NEXT:    [[TMP16:%.*]] = bitcast float* [[TMP0]] to <4 x float>*
; SSE-NEXT:    store <4 x float> [[TMP14]], <4 x float>* [[TMP16]], align 4, [[TBAA0]]
; SSE-NEXT:    [[TMP17:%.*]] = getelementptr float, <4 x float*> [[TMP11]], <4 x i64> <i64 17, i64 8, i64 5, i64 20>
; SSE-NEXT:    [[TMP18:%.*]] = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> [[TMP17]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef), [[TBAA0]]
; SSE-NEXT:    [[TMP19:%.*]] = getelementptr float, <4 x float*> [[TMP11]], <4 x i64> <i64 33, i64 30, i64 27, i64 23>
; SSE-NEXT:    [[TMP20:%.*]] = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> [[TMP19]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef), [[TBAA0]]
; SSE-NEXT:    [[TMP21:%.*]] = fdiv <4 x float> [[TMP18]], [[TMP20]]
; SSE-NEXT:    [[TMP22:%.*]] = bitcast float* [[TMP15]] to <4 x float>*
; SSE-NEXT:    store <4 x float> [[TMP21]], <4 x float>* [[TMP22]], align 4, [[TBAA0]]
; SSE-NEXT:    ret void
;
; AVX-LABEL: @gather_load_div(
; AVX-NEXT:    [[TMP3:%.*]] = getelementptr inbounds float, float* [[TMP1:%.*]], i64 10
; AVX-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 3
; AVX-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 14
; AVX-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 17
; AVX-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 8
; AVX-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 5
; AVX-NEXT:    [[TMP9:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 20
; AVX-NEXT:    [[TMP10:%.*]] = insertelement <8 x float*> undef, float* [[TMP1]], i32 0
; AVX-NEXT:    [[TMP11:%.*]] = insertelement <8 x float*> [[TMP10]], float* [[TMP3]], i32 1
; AVX-NEXT:    [[TMP12:%.*]] = insertelement <8 x float*> [[TMP11]], float* [[TMP4]], i32 2
; AVX-NEXT:    [[TMP13:%.*]] = insertelement <8 x float*> [[TMP12]], float* [[TMP5]], i32 3
; AVX-NEXT:    [[TMP14:%.*]] = insertelement <8 x float*> [[TMP13]], float* [[TMP6]], i32 4
; AVX-NEXT:    [[TMP15:%.*]] = insertelement <8 x float*> [[TMP14]], float* [[TMP7]], i32 5
; AVX-NEXT:    [[TMP16:%.*]] = insertelement <8 x float*> [[TMP15]], float* [[TMP8]], i32 6
; AVX-NEXT:    [[TMP17:%.*]] = insertelement <8 x float*> [[TMP16]], float* [[TMP9]], i32 7
; AVX-NEXT:    [[TMP18:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP17]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> undef), [[TBAA0]]
; AVX-NEXT:    [[TMP19:%.*]] = shufflevector <8 x float*> [[TMP10]], <8 x float*> undef, <8 x i32> zeroinitializer
; AVX-NEXT:    [[TMP20:%.*]] = getelementptr float, <8 x float*> [[TMP19]], <8 x i64> <i64 4, i64 13, i64 11, i64 44, i64 33, i64 30, i64 27, i64 23>
; AVX-NEXT:    [[TMP21:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP20]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> undef), [[TBAA0]]
; AVX-NEXT:    [[TMP22:%.*]] = fdiv <8 x float> [[TMP18]], [[TMP21]]
; AVX-NEXT:    [[TMP23:%.*]] = bitcast float* [[TMP0:%.*]] to <8 x float>*
; AVX-NEXT:    store <8 x float> [[TMP22]], <8 x float>* [[TMP23]], align 4, [[TBAA0]]
; AVX-NEXT:    ret void
;
; AVX2-LABEL: @gather_load_div(
; AVX2-NEXT:    [[TMP3:%.*]] = getelementptr inbounds float, float* [[TMP1:%.*]], i64 10
; AVX2-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 3
; AVX2-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 14
; AVX2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 17
; AVX2-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 8
; AVX2-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 5
; AVX2-NEXT:    [[TMP9:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 20
; AVX2-NEXT:    [[TMP10:%.*]] = insertelement <8 x float*> undef, float* [[TMP1]], i32 0
; AVX2-NEXT:    [[TMP11:%.*]] = insertelement <8 x float*> [[TMP10]], float* [[TMP3]], i32 1
; AVX2-NEXT:    [[TMP12:%.*]] = insertelement <8 x float*> [[TMP11]], float* [[TMP4]], i32 2
; AVX2-NEXT:    [[TMP13:%.*]] = insertelement <8 x float*> [[TMP12]], float* [[TMP5]], i32 3
; AVX2-NEXT:    [[TMP14:%.*]] = insertelement <8 x float*> [[TMP13]], float* [[TMP6]], i32 4
; AVX2-NEXT:    [[TMP15:%.*]] = insertelement <8 x float*> [[TMP14]], float* [[TMP7]], i32 5
; AVX2-NEXT:    [[TMP16:%.*]] = insertelement <8 x float*> [[TMP15]], float* [[TMP8]], i32 6
; AVX2-NEXT:    [[TMP17:%.*]] = insertelement <8 x float*> [[TMP16]], float* [[TMP9]], i32 7
; AVX2-NEXT:    [[TMP18:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP17]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> undef), [[TBAA0]]
; AVX2-NEXT:    [[TMP19:%.*]] = shufflevector <8 x float*> [[TMP10]], <8 x float*> undef, <8 x i32> zeroinitializer
; AVX2-NEXT:    [[TMP20:%.*]] = getelementptr float, <8 x float*> [[TMP19]], <8 x i64> <i64 4, i64 13, i64 11, i64 44, i64 33, i64 30, i64 27, i64 23>
; AVX2-NEXT:    [[TMP21:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP20]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> undef), [[TBAA0]]
; AVX2-NEXT:    [[TMP22:%.*]] = fdiv <8 x float> [[TMP18]], [[TMP21]]
; AVX2-NEXT:    [[TMP23:%.*]] = bitcast float* [[TMP0:%.*]] to <8 x float>*
; AVX2-NEXT:    store <8 x float> [[TMP22]], <8 x float>* [[TMP23]], align 4, [[TBAA0]]
; AVX2-NEXT:    ret void
;
; AVX512-LABEL: @gather_load_div(
; AVX512-NEXT:    [[TMP3:%.*]] = getelementptr inbounds float, float* [[TMP1:%.*]], i64 10
; AVX512-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 3
; AVX512-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 14
; AVX512-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 17
; AVX512-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 8
; AVX512-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 5
; AVX512-NEXT:    [[TMP9:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 20
; AVX512-NEXT:    [[TMP10:%.*]] = insertelement <8 x float*> undef, float* [[TMP1]], i32 0
; AVX512-NEXT:    [[TMP11:%.*]] = insertelement <8 x float*> [[TMP10]], float* [[TMP3]], i32 1
; AVX512-NEXT:    [[TMP12:%.*]] = insertelement <8 x float*> [[TMP11]], float* [[TMP4]], i32 2
; AVX512-NEXT:    [[TMP13:%.*]] = insertelement <8 x float*> [[TMP12]], float* [[TMP5]], i32 3
; AVX512-NEXT:    [[TMP14:%.*]] = insertelement <8 x float*> [[TMP13]], float* [[TMP6]], i32 4
; AVX512-NEXT:    [[TMP15:%.*]] = insertelement <8 x float*> [[TMP14]], float* [[TMP7]], i32 5
; AVX512-NEXT:    [[TMP16:%.*]] = insertelement <8 x float*> [[TMP15]], float* [[TMP8]], i32 6
; AVX512-NEXT:    [[TMP17:%.*]] = insertelement <8 x float*> [[TMP16]], float* [[TMP9]], i32 7
; AVX512-NEXT:    [[TMP18:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP17]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> undef), [[TBAA0]]
; AVX512-NEXT:    [[TMP19:%.*]] = shufflevector <8 x float*> [[TMP10]], <8 x float*> undef, <8 x i32> zeroinitializer
; AVX512-NEXT:    [[TMP20:%.*]] = getelementptr float, <8 x float*> [[TMP19]], <8 x i64> <i64 4, i64 13, i64 11, i64 44, i64 33, i64 30, i64 27, i64 23>
; AVX512-NEXT:    [[TMP21:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP20]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> undef), [[TBAA0]]
; AVX512-NEXT:    [[TMP22:%.*]] = fdiv <8 x float> [[TMP18]], [[TMP21]]
; AVX512-NEXT:    [[TMP23:%.*]] = bitcast float* [[TMP0:%.*]] to <8 x float>*
; AVX512-NEXT:    store <8 x float> [[TMP22]], <8 x float>* [[TMP23]], align 4, [[TBAA0]]
; AVX512-NEXT:    ret void
;
; Scalar input IR: each fdiv pairs a numerator and denominator loaded from
; scattered offsets of %1 (e.g. %1[0]/%1[4], %1[10]/%1[13], ...); the eight
; quotients are stored contiguously to %0[0..7].
  %3 = load float, float* %1, align 4, !tbaa !2
  %4 = getelementptr inbounds float, float* %1, i64 4
  %5 = load float, float* %4, align 4, !tbaa !2
  %6 = fdiv float %3, %5
  %7 = getelementptr inbounds float, float* %0, i64 1
  store float %6, float* %0, align 4, !tbaa !2
  %8 = getelementptr inbounds float, float* %1, i64 10
  %9 = load float, float* %8, align 4, !tbaa !2
  %10 = getelementptr inbounds float, float* %1, i64 13
  %11 = load float, float* %10, align 4, !tbaa !2
  %12 = fdiv float %9, %11
  %13 = getelementptr inbounds float, float* %0, i64 2
  store float %12, float* %7, align 4, !tbaa !2
  %14 = getelementptr inbounds float, float* %1, i64 3
  %15 = load float, float* %14, align 4, !tbaa !2
  %16 = getelementptr inbounds float, float* %1, i64 11
  %17 = load float, float* %16, align 4, !tbaa !2
  %18 = fdiv float %15, %17
  %19 = getelementptr inbounds float, float* %0, i64 3
  store float %18, float* %13, align 4, !tbaa !2
  %20 = getelementptr inbounds float, float* %1, i64 14
  %21 = load float, float* %20, align 4, !tbaa !2
  %22 = getelementptr inbounds float, float* %1, i64 44
  %23 = load float, float* %22, align 4, !tbaa !2
  %24 = fdiv float %21, %23
  %25 = getelementptr inbounds float, float* %0, i64 4
  store float %24, float* %19, align 4, !tbaa !2
  %26 = getelementptr inbounds float, float* %1, i64 17
  %27 = load float, float* %26, align 4, !tbaa !2
  %28 = getelementptr inbounds float, float* %1, i64 33
  %29 = load float, float* %28, align 4, !tbaa !2
  %30 = fdiv float %27, %29
  %31 = getelementptr inbounds float, float* %0, i64 5
  store float %30, float* %25, align 4, !tbaa !2
  %32 = getelementptr inbounds float, float* %1, i64 8
  %33 = load float, float* %32, align 4, !tbaa !2
  %34 = getelementptr inbounds float, float* %1, i64 30
  %35 = load float, float* %34, align 4, !tbaa !2
  %36 = fdiv float %33, %35
  %37 = getelementptr inbounds float, float* %0, i64 6
  store float %36, float* %31, align 4, !tbaa !2
  %38 = getelementptr inbounds float, float* %1, i64 5
  %39 = load float, float* %38, align 4, !tbaa !2
  %40 = getelementptr inbounds float, float* %1, i64 27
  %41 = load float, float* %40, align 4, !tbaa !2
  %42 = fdiv float %39, %41
  %43 = getelementptr inbounds float, float* %0, i64 7
  store float %42, float* %37, align 4, !tbaa !2
  %44 = getelementptr inbounds float, float* %1, i64 20
  %45 = load float, float* %44, align 4, !tbaa !2
  %46 = getelementptr inbounds float, float* %1, i64 23
  %47 = load float, float* %46, align 4, !tbaa !2
  %48 = fdiv float %45, %47
  store float %48, float* %43, align 4, !tbaa !2
  ret void
}
660
; TBAA metadata shared by all loads/stores above: access tag !2 points at
; scalar type node !3, whose parent chain is "omnipotent char" (!4) under the
; "Simple C++ TBAA" root (!5).
; NOTE(review): the scalar type node is named "short" while the accesses in
; this file are i32 and float -- presumably harmless for these tests, but
; confirm the mismatch is intentional before reusing this tag elsewhere.
!2 = !{!3, !3, i64 0}
!3 = !{!"short", !4, i64 0}
!4 = !{!"omnipotent char", !5, i64 0}
!5 = !{!"Simple C++ TBAA"}
665