; RUN: opt -codegenprepare -mtriple=thumbv7-apple-ios %s -o - -mattr=+neon -S | FileCheck --check-prefix=IR-BOTH --check-prefix=IR-NORMAL %s
; RUN: opt -codegenprepare -mtriple=thumbv7-apple-ios %s -o - -mattr=+neon -S -stress-cgp-store-extract | FileCheck --check-prefix=IR-BOTH --check-prefix=IR-STRESS %s
; RUN: llc -mtriple=thumbv7-apple-ios %s -o - -mattr=+neon | FileCheck --check-prefix=ASM %s

; Basic case: a single scalar 'or' sits between an extractelement and a store.
; Both opt configurations expect the 'or' to be done on the whole <2 x i32>
; vector, with the extract placed right before the store.
; IR-BOTH-LABEL: @simpleOneInstructionPromotion
; IR-BOTH: [[LOAD:%[a-zA-Z_0-9-]+]] = load <2 x i32>, <2 x i32>* %addr1
; IR-BOTH-NEXT: [[VECTOR_OR:%[a-zA-Z_0-9-]+]] = or <2 x i32> [[LOAD]], <i32 undef, i32 1>
; IR-BOTH-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <2 x i32> [[VECTOR_OR]], i32 1
; IR-BOTH-NEXT: store i32 [[EXTRACT]], i32* %dest
; IR-BOTH-NEXT: ret
;
; Make sure we got rid of any expensive vmov.32 instructions.
; ASM-LABEL: simpleOneInstructionPromotion:
; ASM: vldr [[LOAD:d[0-9]+]], [r0]
; ASM-NEXT: vorr.i32 [[LOAD]], #0x1
; ASM-NEXT: vst1.32 {[[LOAD]][1]}, [r1:32]
; ASM-NEXT: bx
define void @simpleOneInstructionPromotion(<2 x i32>* %addr1, i32* %dest) {
  %vec = load <2 x i32>, <2 x i32>* %addr1, align 8
  %lane = extractelement <2 x i32> %vec, i32 1
  %res = or i32 %lane, 1
  store i32 %res, i32* %dest, align 4
  ret void
}

; The extracted lane feeds an icmp. The expected IR keeps the scalar
; extract + icmp sequence in both opt configurations, and the assembly is
; expected to contain a vmov.32 from the loaded D register to a core register.
; IR-BOTH-LABEL: @unsupportedInstructionForPromotion
; IR-BOTH: [[LOAD:%[a-zA-Z_0-9-]+]] = load <2 x i32>, <2 x i32>* %addr1
; IR-BOTH-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <2 x i32> [[LOAD]], i32 0
; IR-BOTH-NEXT: [[CMP:%[a-zA-Z_0-9-]+]] = icmp eq i32 [[EXTRACT]], %in2
; IR-BOTH-NEXT: store i1 [[CMP]], i1* %dest
; IR-BOTH-NEXT: ret
;
; ASM-LABEL: unsupportedInstructionForPromotion:
; ASM: vldr [[LOAD:d[0-9]+]], [r0]
; ASM: vmov.32 {{r[0-9]+}}, [[LOAD]]
; ASM: bx
define void @unsupportedInstructionForPromotion(<2 x i32>* %addr1, i32 %in2, i1* %dest) {
  %vec = load <2 x i32>, <2 x i32>* %addr1, align 8
  %lane = extractelement <2 x i32> %vec, i32 0
  %res = icmp eq i32 %lane, %in2
  store i1 %res, i1* %dest, align 4
  ret void
}

; The extract is in bb1 while its user 'or' and the store are in bb2.
; The expected IR keeps the scalar 'or' in bb2 in both opt configurations.
; IR-BOTH-LABEL: @unsupportedChainInDifferentBBs
; IR-BOTH: [[LOAD:%[a-zA-Z_0-9-]+]] = load <2 x i32>, <2 x i32>* %addr1
; IR-BOTH-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <2 x i32> [[LOAD]], i32 0
; IR-BOTH-NEXT: br i1 %bool, label %bb2, label %end
; BB2
; IR-BOTH: [[OR:%[a-zA-Z_0-9-]+]] = or i32 [[EXTRACT]], 1
; IR-BOTH-NEXT: store i32 [[OR]], i32* %dest, align 4
; IR-BOTH: ret
;
; ASM-LABEL: unsupportedChainInDifferentBBs:
; ASM: vldr [[LOAD:d[0-9]+]], [r0]
; ASM: vmov.32 {{r[0-9]+}}, [[LOAD]]
; ASM: bx
define void @unsupportedChainInDifferentBBs(<2 x i32>* %addr1, i32* %dest, i1 %bool) {
bb1:
  %vec = load <2 x i32>, <2 x i32>* %addr1, align 8
  %lane = extractelement <2 x i32> %vec, i32 0
  br i1 %bool, label %bb2, label %end
bb2:
  %res = or i32 %lane, 1
  store i32 %res, i32* %dest, align 4
  br label %end
end:
  ret void
}

; A chain of seven scalar 'or' instructions between the extract and the store.
; Both opt configurations expect every 'or' in the chain to be performed on
; the <2 x i32> vector, with a single extract right before the store.
; The label below uses the prefix shared by both opt RUN lines; a label
; written with a prefix that no RUN line enables is never checked by FileCheck.
; IR-BOTH-LABEL: @chainOfInstructionsToPromote
; IR-BOTH: [[LOAD:%[a-zA-Z_0-9-]+]] = load <2 x i32>, <2 x i32>* %addr1
; IR-BOTH-NEXT: [[VECTOR_OR1:%[a-zA-Z_0-9-]+]] = or <2 x i32> [[LOAD]], <i32 1, i32 undef>
; IR-BOTH-NEXT: [[VECTOR_OR2:%[a-zA-Z_0-9-]+]] = or <2 x i32> [[VECTOR_OR1]], <i32 1, i32 undef>
; IR-BOTH-NEXT: [[VECTOR_OR3:%[a-zA-Z_0-9-]+]] = or <2 x i32> [[VECTOR_OR2]], <i32 1, i32 undef>
; IR-BOTH-NEXT: [[VECTOR_OR4:%[a-zA-Z_0-9-]+]] = or <2 x i32> [[VECTOR_OR3]], <i32 1, i32 undef>
; IR-BOTH-NEXT: [[VECTOR_OR5:%[a-zA-Z_0-9-]+]] = or <2 x i32> [[VECTOR_OR4]], <i32 1, i32 undef>
; IR-BOTH-NEXT: [[VECTOR_OR6:%[a-zA-Z_0-9-]+]] = or <2 x i32> [[VECTOR_OR5]], <i32 1, i32 undef>
; IR-BOTH-NEXT: [[VECTOR_OR7:%[a-zA-Z_0-9-]+]] = or <2 x i32> [[VECTOR_OR6]], <i32 1, i32 undef>
; IR-BOTH-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <2 x i32> [[VECTOR_OR7]], i32 0
; IR-BOTH-NEXT: store i32 [[EXTRACT]], i32* %dest
; IR-BOTH-NEXT: ret
;
; ASM-LABEL: chainOfInstructionsToPromote:
; ASM: vldr [[LOAD:d[0-9]+]], [r0]
; ASM-NOT: vmov.32 {{r[0-9]+}}, [[LOAD]]
; ASM: bx
define void @chainOfInstructionsToPromote(<2 x i32>* %addr1, i32* %dest) {
  %vec = load <2 x i32>, <2 x i32>* %addr1, align 8
  %lane = extractelement <2 x i32> %vec, i32 0
  %acc1 = or i32 %lane, 1
  %acc2 = or i32 %acc1, 1
  %acc3 = or i32 %acc2, 1
  %acc4 = or i32 %acc3, 1
  %acc5 = or i32 %acc4, 1
  %acc6 = or i32 %acc5, 1
  %acc7 = or i32 %acc6, 1
  store i32 %acc7, i32* %dest, align 4
  ret void
}

; The scalar 'or' result is both stored and returned. The expected IR keeps
; the scalar extract + 'or' sequence in both opt configurations.
; IR-BOTH-LABEL: @unsupportedMultiUses
; IR-BOTH: [[LOAD:%[a-zA-Z_0-9-]+]] = load <2 x i32>, <2 x i32>* %addr1
; IR-BOTH-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <2 x i32> [[LOAD]], i32 1
; IR-BOTH-NEXT: [[OR:%[a-zA-Z_0-9-]+]] = or i32 [[EXTRACT]], 1
; IR-BOTH-NEXT: store i32 [[OR]], i32* %dest
; IR-BOTH-NEXT: ret i32 [[OR]]
;
; ASM-LABEL: unsupportedMultiUses:
; ASM: vldr [[LOAD:d[0-9]+]], [r0]
; ASM: vmov.32 {{r[0-9]+}}, [[LOAD]]
; ASM: bx
define i32 @unsupportedMultiUses(<2 x i32>* %addr1, i32* %dest) {
  %vec = load <2 x i32>, <2 x i32>* %addr1, align 8
  %lane = extractelement <2 x i32> %vec, i32 1
  %res = or i32 %lane, 1
  store i32 %res, i32* %dest, align 4
  ret i32 %res
}

; Check that we promote with a splat constant when the operation is a division.
; The NORMAL mode does not promote anything, as divisions are not legal.
; IR-BOTH-LABEL: @udivCase
; IR-BOTH: [[LOAD:%[a-zA-Z_0-9-]+]] = load <2 x i32>, <2 x i32>* %addr1
; Scalar version:
; IR-NORMAL-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <2 x i32> [[LOAD]], i32 1
; IR-NORMAL-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = udiv i32 [[EXTRACT]], 7
; Vector version:
; IR-STRESS-NEXT: [[DIV:%[a-zA-Z_0-9-]+]] = udiv <2 x i32> [[LOAD]], <i32 7, i32 7>
; IR-STRESS-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = extractelement <2 x i32> [[DIV]], i32 1
;
; IR-BOTH-NEXT: store i32 [[RES]], i32* %dest
; IR-BOTH-NEXT: ret
define void @udivCase(<2 x i32>* %addr1, i32* %dest) {
  %vec = load <2 x i32>, <2 x i32>* %addr1, align 8
  %lane = extractelement <2 x i32> %vec, i32 1
  %res = udiv i32 %lane, 7
  store i32 %res, i32* %dest, align 4
  ret void
}

; urem by a constant: the STRESS mode expects a vector urem by a <7, 7> splat,
; the NORMAL mode expects the scalar extract + urem to be left alone.
; IR-BOTH-LABEL: @uremCase
; IR-BOTH: [[LOAD:%[a-zA-Z_0-9-]+]] = load <2 x i32>, <2 x i32>* %addr1
; Scalar version:
; IR-NORMAL-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <2 x i32> [[LOAD]], i32 1
; IR-NORMAL-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = urem i32 [[EXTRACT]], 7
; Vector version:
; IR-STRESS-NEXT: [[DIV:%[a-zA-Z_0-9-]+]] = urem <2 x i32> [[LOAD]], <i32 7, i32 7>
; IR-STRESS-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = extractelement <2 x i32> [[DIV]], i32 1
;
; IR-BOTH-NEXT: store i32 [[RES]], i32* %dest
; IR-BOTH-NEXT: ret
define void @uremCase(<2 x i32>* %addr1, i32* %dest) {
  %vec = load <2 x i32>, <2 x i32>* %addr1, align 8
  %lane = extractelement <2 x i32> %vec, i32 1
  %res = urem i32 %lane, 7
  store i32 %res, i32* %dest, align 4
  ret void
}

; sdiv by a constant: the STRESS mode expects a vector sdiv by a <7, 7> splat,
; the NORMAL mode expects the scalar extract + sdiv to be left alone.
; IR-BOTH-LABEL: @sdivCase
; IR-BOTH: [[LOAD:%[a-zA-Z_0-9-]+]] = load <2 x i32>, <2 x i32>* %addr1
; Scalar version:
; IR-NORMAL-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <2 x i32> [[LOAD]], i32 1
; IR-NORMAL-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = sdiv i32 [[EXTRACT]], 7
; Vector version:
; IR-STRESS-NEXT: [[DIV:%[a-zA-Z_0-9-]+]] = sdiv <2 x i32> [[LOAD]], <i32 7, i32 7>
; IR-STRESS-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = extractelement <2 x i32> [[DIV]], i32 1
;
; IR-BOTH-NEXT: store i32 [[RES]], i32* %dest
; IR-BOTH-NEXT: ret
define void @sdivCase(<2 x i32>* %addr1, i32* %dest) {
  %vec = load <2 x i32>, <2 x i32>* %addr1, align 8
  %lane = extractelement <2 x i32> %vec, i32 1
  %res = sdiv i32 %lane, 7
  store i32 %res, i32* %dest, align 4
  ret void
}

; srem by a constant: the STRESS mode expects a vector srem by a <7, 7> splat,
; the NORMAL mode expects the scalar extract + srem to be left alone.
; IR-BOTH-LABEL: @sremCase
; IR-BOTH: [[LOAD:%[a-zA-Z_0-9-]+]] = load <2 x i32>, <2 x i32>* %addr1
; Scalar version:
; IR-NORMAL-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <2 x i32> [[LOAD]], i32 1
; IR-NORMAL-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = srem i32 [[EXTRACT]], 7
; Vector version:
; IR-STRESS-NEXT: [[DIV:%[a-zA-Z_0-9-]+]] = srem <2 x i32> [[LOAD]], <i32 7, i32 7>
; IR-STRESS-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = extractelement <2 x i32> [[DIV]], i32 1
;
; IR-BOTH-NEXT: store i32 [[RES]], i32* %dest
; IR-BOTH-NEXT: ret
define void @sremCase(<2 x i32>* %addr1, i32* %dest) {
  %vec = load <2 x i32>, <2 x i32>* %addr1, align 8
  %lane = extractelement <2 x i32> %vec, i32 1
  %res = srem i32 %lane, 7
  store i32 %res, i32* %dest, align 4
  ret void
}

; fdiv by a constant: the STRESS mode expects a vector fdiv by a <7.0, 7.0>
; splat, the NORMAL mode expects the scalar extract + fdiv to be left alone.
; IR-BOTH-LABEL: @fdivCase
; IR-BOTH: [[LOAD:%[a-zA-Z_0-9-]+]] = load <2 x float>, <2 x float>* %addr1
; Scalar version:
; IR-NORMAL-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <2 x float> [[LOAD]], i32 1
; IR-NORMAL-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = fdiv float [[EXTRACT]], 7.0
; Vector version:
; IR-STRESS-NEXT: [[DIV:%[a-zA-Z_0-9-]+]] = fdiv <2 x float> [[LOAD]], <float 7.000000e+00, float 7.000000e+00>
; IR-STRESS-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = extractelement <2 x float> [[DIV]], i32 1
;
; IR-BOTH-NEXT: store float [[RES]], float* %dest
; IR-BOTH-NEXT: ret
define void @fdivCase(<2 x float>* %addr1, float* %dest) {
  %vec = load <2 x float>, <2 x float>* %addr1, align 8
  %lane = extractelement <2 x float> %vec, i32 1
  %res = fdiv float %lane, 7.0
  store float %res, float* %dest, align 4
  ret void
}

; frem by a constant: the STRESS mode expects a vector frem by a <7.0, 7.0>
; splat, the NORMAL mode expects the scalar extract + frem to be left alone.
; IR-BOTH-LABEL: @fremCase
; IR-BOTH: [[LOAD:%[a-zA-Z_0-9-]+]] = load <2 x float>, <2 x float>* %addr1
; Scalar version:
; IR-NORMAL-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <2 x float> [[LOAD]], i32 1
; IR-NORMAL-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = frem float [[EXTRACT]], 7.0
; Vector version:
; IR-STRESS-NEXT: [[DIV:%[a-zA-Z_0-9-]+]] = frem <2 x float> [[LOAD]], <float 7.000000e+00, float 7.000000e+00>
; IR-STRESS-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = extractelement <2 x float> [[DIV]], i32 1
;
; IR-BOTH-NEXT: store float [[RES]], float* %dest
; IR-BOTH-NEXT: ret
define void @fremCase(<2 x float>* %addr1, float* %dest) {
  %vec = load <2 x float>, <2 x float>* %addr1, align 8
  %lane = extractelement <2 x float> %vec, i32 1
  %res = frem float %lane, 7.0
  store float %res, float* %dest, align 4
  ret void
}

; Check that we do not promote when doing so may introduce undefined
; behavior, like a division by zero. Here the extracted lane is the divisor
; of the udiv, and the scalar form is expected in both opt configurations.
; IR-BOTH-LABEL: @undefDivCase
; IR-BOTH: [[LOAD:%[a-zA-Z_0-9-]+]] = load <2 x i32>, <2 x i32>* %addr1
; IR-BOTH-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <2 x i32> [[LOAD]], i32 1
; IR-BOTH-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = udiv i32 7, [[EXTRACT]]
; IR-BOTH-NEXT: store i32 [[RES]], i32* %dest
; IR-BOTH-NEXT: ret
define void @undefDivCase(<2 x i32>* %addr1, i32* %dest) {
  %vec = load <2 x i32>, <2 x i32>* %addr1, align 8
  %lane = extractelement <2 x i32> %vec, i32 1
  %res = udiv i32 7, %lane
  store i32 %res, i32* %dest, align 4
  ret void
}

; Check that we do not promote when doing so may introduce undefined
; behavior, like a division by zero. Here the extracted lane is the divisor
; of the srem, and the scalar form is expected in both opt configurations.
; IR-BOTH-LABEL: @undefRemCase
; IR-BOTH: [[LOAD:%[a-zA-Z_0-9-]+]] = load <2 x i32>, <2 x i32>* %addr1
; IR-BOTH-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <2 x i32> [[LOAD]], i32 1
; IR-BOTH-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = srem i32 7, [[EXTRACT]]
; IR-BOTH-NEXT: store i32 [[RES]], i32* %dest
; IR-BOTH-NEXT: ret
define void @undefRemCase(<2 x i32>* %addr1, i32* %dest) {
  %vec = load <2 x i32>, <2 x i32>* %addr1, align 8
  %lane = extractelement <2 x i32> %vec, i32 1
  %res = srem i32 7, %lane
  store i32 %res, i32* %dest, align 4
  ret void
}

; Check that we use an undef mask for undefined behavior if the fast-math
; flag is set. Here the constant 7.0 is the right-hand operand of the frem.
; IR-BOTH-LABEL: @undefConstantFRemCaseWithFastMath
; IR-BOTH: [[LOAD:%[a-zA-Z_0-9-]+]] = load <2 x float>, <2 x float>* %addr1
; Scalar version:
; IR-NORMAL-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <2 x float> [[LOAD]], i32 1
; IR-NORMAL-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = frem nnan float [[EXTRACT]], 7.0
; Vector version:
; IR-STRESS-NEXT: [[DIV:%[a-zA-Z_0-9-]+]] = frem nnan <2 x float> [[LOAD]], <float undef, float 7.000000e+00>
; IR-STRESS-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = extractelement <2 x float> [[DIV]], i32 1
;
; IR-BOTH-NEXT: store float [[RES]], float* %dest
; IR-BOTH-NEXT: ret
define void @undefConstantFRemCaseWithFastMath(<2 x float>* %addr1, float* %dest) {
  %vec = load <2 x float>, <2 x float>* %addr1, align 8
  %lane = extractelement <2 x float> %vec, i32 1
  %res = frem nnan float %lane, 7.0
  store float %res, float* %dest, align 4
  ret void
}

; Check that we use an undef mask for undefined behavior if the fast-math
; flag is set. Here the constant 7.0 is the left-hand operand of the frem
; and the extracted lane is the right-hand one.
; IR-BOTH-LABEL: @undefVectorFRemCaseWithFastMath
; IR-BOTH: [[LOAD:%[a-zA-Z_0-9-]+]] = load <2 x float>, <2 x float>* %addr1
; Scalar version:
; IR-NORMAL-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <2 x float> [[LOAD]], i32 1
; IR-NORMAL-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = frem nnan float 7.000000e+00, [[EXTRACT]]
; Vector version:
; IR-STRESS-NEXT: [[DIV:%[a-zA-Z_0-9-]+]] = frem nnan <2 x float> <float undef, float 7.000000e+00>, [[LOAD]]
; IR-STRESS-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = extractelement <2 x float> [[DIV]], i32 1
;
; IR-BOTH-NEXT: store float [[RES]], float* %dest
; IR-BOTH-NEXT: ret
define void @undefVectorFRemCaseWithFastMath(<2 x float>* %addr1, float* %dest) {
  %vec = load <2 x float>, <2 x float>* %addr1, align 8
  %lane = extractelement <2 x float> %vec, i32 1
  %res = frem nnan float 7.0, %lane
  store float %res, float* %dest, align 4
  ret void
}

; Check that we are able to promote floating point values.
; This requires the STRESS mode, as floating point values are
; not promoted on armv7.
; IR-BOTH-LABEL: @simpleOneInstructionPromotionFloat
; IR-BOTH: [[LOAD:%[a-zA-Z_0-9-]+]] = load <2 x float>, <2 x float>* %addr1
; Scalar version:
; IR-NORMAL-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <2 x float> [[LOAD]], i32 1
; IR-NORMAL-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = fadd float [[EXTRACT]], 1.0
; Vector version:
; IR-STRESS-NEXT: [[DIV:%[a-zA-Z_0-9-]+]] = fadd <2 x float> [[LOAD]], <float undef, float 1.000000e+00>
; IR-STRESS-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = extractelement <2 x float> [[DIV]], i32 1
;
; IR-BOTH-NEXT: store float [[RES]], float* %dest
; IR-BOTH-NEXT: ret
define void @simpleOneInstructionPromotionFloat(<2 x float>* %addr1, float* %dest) {
  %vec = load <2 x float>, <2 x float>* %addr1, align 8
  %lane = extractelement <2 x float> %vec, i32 1
  %res = fadd float %lane, 1.0
  store float %res, float* %dest, align 4
  ret void
}

; Check that we correctly use a splat constant when the index of the
; extract cannot be determined at compile time.
; This requires the STRESS mode, as variable indices are expensive
; to lower.
; IR-BOTH-LABEL: @simpleOneInstructionPromotionVariableIdx
; IR-BOTH: [[LOAD:%[a-zA-Z_0-9-]+]] = load <2 x i32>, <2 x i32>* %addr1
; Scalar version:
; IR-NORMAL-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <2 x i32> [[LOAD]], i32 %idx
; IR-NORMAL-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = or i32 [[EXTRACT]], 1
; Vector version:
; IR-STRESS-NEXT: [[OR:%[a-zA-Z_0-9-]+]] = or <2 x i32> [[LOAD]], <i32 1, i32 1>
; IR-STRESS-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = extractelement <2 x i32> [[OR]], i32 %idx
;
; IR-BOTH-NEXT: store i32 [[RES]], i32* %dest
; IR-BOTH-NEXT: ret
define void @simpleOneInstructionPromotionVariableIdx(<2 x i32>* %addr1, i32* %dest, i32 %idx) {
  %vec = load <2 x i32>, <2 x i32>* %addr1, align 8
  %lane = extractelement <2 x i32> %vec, i32 %idx
  %res = or i32 %lane, 1
  store i32 %res, i32* %dest, align 4
  ret void
}

; Check a vector with more than 2 elements.
; This requires the STRESS mode because 'or v8i8' is currently not marked
; as legal or custom, although the actual assembly would be better if we
; promoted it.
; IR-BOTH-LABEL: @simpleOneInstructionPromotion8x8
; IR-BOTH: [[LOAD:%[a-zA-Z_0-9-]+]] = load <8 x i8>, <8 x i8>* %addr1
; Scalar version:
; IR-NORMAL-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <8 x i8> [[LOAD]], i32 1
; IR-NORMAL-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = or i8 [[EXTRACT]], 1
; Vector version:
; IR-STRESS-NEXT: [[OR:%[a-zA-Z_0-9-]+]] = or <8 x i8> [[LOAD]], <i8 undef, i8 1, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>
; IR-STRESS-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = extractelement <8 x i8> [[OR]], i32 1
;
; IR-BOTH-NEXT: store i8 [[RES]], i8* %dest
; IR-BOTH-NEXT: ret
define void @simpleOneInstructionPromotion8x8(<8 x i8>* %addr1, i8* %dest) {
  %vec = load <8 x i8>, <8 x i8>* %addr1, align 8
  %lane = extractelement <8 x i8> %vec, i32 1
  %res = or i8 %lane, 1
  store i8 %res, i8* %dest, align 4
  ret void
}

; Check that we optimized the sequence correctly when it can be
; lowered on a Q register.
; The IR label spells out the full function name so that it cannot match one
; of the other functions whose names start with the same prefix, and so that
; it agrees with the assembly label below.
; IR-BOTH-LABEL: @simpleOneInstructionPromotion4x32
; IR-BOTH: [[LOAD:%[a-zA-Z_0-9-]+]] = load <4 x i32>, <4 x i32>* %addr1
; IR-BOTH-NEXT: [[VECTOR_OR:%[a-zA-Z_0-9-]+]] = or <4 x i32> [[LOAD]], <i32 undef, i32 1, i32 undef, i32 undef>
; IR-BOTH-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <4 x i32> [[VECTOR_OR]], i32 1
; IR-BOTH-NEXT: store i32 [[EXTRACT]], i32* %dest
; IR-BOTH-NEXT: ret
;
; Make sure we got rid of any expensive vmov.32 instructions.
; ASM-LABEL: simpleOneInstructionPromotion4x32:
; ASM: vld1.64 {[[LOAD:d[0-9]+]], d{{[0-9]+}}}, [r0]
; The Q register used here must be [[LOAD]] / 2, but we cannot express that.
; ASM-NEXT: vorr.i32 q{{[0-9]+}}, #0x1
; ASM-NEXT: vst1.32 {[[LOAD]][1]}, [r1]
; ASM-NEXT: bx
define void @simpleOneInstructionPromotion4x32(<4 x i32>* %addr1, i32* %dest) {
  %vec = load <4 x i32>, <4 x i32>* %addr1, align 8
  %lane = extractelement <4 x i32> %vec, i32 1
  %res = or i32 %lane, 1
  store i32 %res, i32* %dest, align 1
  ret void
}
