; RUN: llc -aarch64-sve-vector-bits-min=128  -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=16 -check-prefix=NO_SVE
; RUN: llc -aarch64-sve-vector-bits-min=256  -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK,VBITS_EQ_256
; RUN: llc -aarch64-sve-vector-bits-min=384  -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK
; RUN: llc -aarch64-sve-vector-bits-min=512  -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=640  -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=768  -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=896  -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=1024 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1152 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1280 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1408 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1536 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1664 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1792 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1920 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=2048 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=256 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_GE_2048
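
; The RUN lines sweep every supported multiple of 128 bits up to 2048. The
; -D#VBYTES value handed to FileCheck is the byte width of the widest SVE
; vector the compiler will actually use, i.e. the requested minimum rounded
; down to a power of two (which is why 384 bits still gives VBYTES=32), and
; each extra VBITS_GE_* prefix enables the checks that need at least that
; register size.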

target triple = "aarch64-unknown-linux-gnu"

; Don't use SVE when its registers are no bigger than NEON.
; NO_SVE-NOT: ptrue
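; (With a 128-bit minimum, fixed-length SVE offers no more lanes than NEON,
; so ordinary NEON code is expected instead. Every SVE sequence in this file
; starts by materialising a predicate, so the absence of ptrue is a reasonable
; proxy for "no SVE instructions were emitted".)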

;
; ANDV
;

; No single instruction NEON ANDV support. Use SVE.
define i8 @andv_v8i8(<8 x i8> %a) #0 {
; CHECK-LABEL: andv_v8i8:
; CHECK: ptrue [[PG:p[0-9]+]].b, vl8
; CHECK: andv b[[REDUCE:[0-9]+]], [[PG]], z0.b
; CHECK: fmov w0, s[[REDUCE]]
; CHECK: ret
  %res = call i8 @llvm.vector.reduce.and.v8i8(<8 x i8> %a)
  ret i8 %res
}
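
; Note the 64-bit NEON argument arrives in v0, which aliases the low bits of
; the SVE register z0, so the vl8-predicated reduction can read z0.b directly
; without any copy.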

; No single instruction NEON ANDV support. Use SVE.
define i8 @andv_v16i8(<16 x i8> %a) #0 {
; CHECK-LABEL: andv_v16i8:
; CHECK: ptrue [[PG:p[0-9]+]].b, vl16
; CHECK: andv b[[REDUCE:[0-9]+]], [[PG]], z0.b
; CHECK: fmov w0, s[[REDUCE]]
; CHECK: ret
  %res = call i8 @llvm.vector.reduce.and.v16i8(<16 x i8> %a)
  ret i8 %res
}

define i8 @andv_v32i8(<32 x i8>* %a) #0 {
; CHECK-LABEL: andv_v32i8:
; CHECK: ptrue [[PG:p[0-9]+]].b, vl32
; CHECK-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
; CHECK-NEXT: andv b[[REDUCE:[0-9]+]], [[PG]], [[OP]].b
; CHECK-NEXT: fmov w0, s[[REDUCE]]
; CHECK-NEXT: ret
  %op = load <32 x i8>, <32 x i8>* %a
  %res = call i8 @llvm.vector.reduce.and.v32i8(<32 x i8> %op)
  ret i8 %res
}

define i8 @andv_v64i8(<64 x i8>* %a) #0 {
; CHECK-LABEL: andv_v64i8:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].b, vl64
; VBITS_GE_512-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
; VBITS_GE_512-NEXT: andv b[[REDUCE:[0-9]+]], [[PG]], [[OP]].b
; VBITS_GE_512-NEXT: fmov w0, s[[REDUCE]]
; VBITS_GE_512-NEXT: ret

; Ensure sensible type legalisation.
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].b, vl32
; VBITS_EQ_256-DAG: mov w[[A_HI:[0-9]+]], #32
; VBITS_EQ_256-DAG: ld1b { [[LO:z[0-9]+]].b }, [[PG]]/z, [x0]
; VBITS_EQ_256-DAG: ld1b { [[HI:z[0-9]+]].b }, [[PG]]/z, [x0, x[[A_HI]]]
; VBITS_EQ_256-DAG: and [[AND:z[0-9]+]].d, [[LO]].d, [[HI]].d
; VBITS_EQ_256-DAG: andv b[[REDUCE:[0-9]+]], [[PG]], [[AND]].b
; VBITS_EQ_256-NEXT: fmov w0, s[[REDUCE]]
; VBITS_EQ_256-NEXT: ret

  %op = load <64 x i8>, <64 x i8>* %a
  %res = call i8 @llvm.vector.reduce.and.v64i8(<64 x i8> %op)
  ret i8 %res
}
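
; In the 256-bit case above, the 512-bit operand is split into two in-register
; halves. Since AND is associative and commutative, the halves are first
; combined with a single bitwise AND (the element-size-agnostic .d form) so
; that only one ANDV reduction is needed. The byte-element loads reach the
; upper half with a register offset; the wider-element splits below add #32
; to the base pointer instead.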

define i8 @andv_v128i8(<128 x i8>* %a) #0 {
; CHECK-LABEL: andv_v128i8:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].b, vl128
; VBITS_GE_1024-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
; VBITS_GE_1024-NEXT: andv b[[REDUCE:[0-9]+]], [[PG]], [[OP]].b
; VBITS_GE_1024-NEXT: fmov w0, s[[REDUCE]]
; VBITS_GE_1024-NEXT: ret
  %op = load <128 x i8>, <128 x i8>* %a
  %res = call i8 @llvm.vector.reduce.and.v128i8(<128 x i8> %op)
  ret i8 %res
}

define i8 @andv_v256i8(<256 x i8>* %a) #0 {
; CHECK-LABEL: andv_v256i8:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].b, vl256
; VBITS_GE_2048-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
; VBITS_GE_2048-NEXT: andv b[[REDUCE:[0-9]+]], [[PG]], [[OP]].b
; VBITS_GE_2048-NEXT: fmov w0, s[[REDUCE]]
; VBITS_GE_2048-NEXT: ret
  %op = load <256 x i8>, <256 x i8>* %a
  %res = call i8 @llvm.vector.reduce.and.v256i8(<256 x i8> %op)
  ret i8 %res
}

; No single instruction NEON ANDV support. Use SVE.
define i16 @andv_v4i16(<4 x i16> %a) #0 {
; CHECK-LABEL: andv_v4i16:
; CHECK: ptrue [[PG:p[0-9]+]].h, vl4
; CHECK: andv h[[REDUCE:[0-9]+]], [[PG]], z0.h
; CHECK: fmov w0, s[[REDUCE]]
; CHECK: ret
  %res = call i16 @llvm.vector.reduce.and.v4i16(<4 x i16> %a)
  ret i16 %res
}

; No single instruction NEON ANDV support. Use SVE.
define i16 @andv_v8i16(<8 x i16> %a) #0 {
; CHECK-LABEL: andv_v8i16:
; CHECK: ptrue [[PG:p[0-9]+]].h, vl8
; CHECK: andv h[[REDUCE:[0-9]+]], [[PG]], z0.h
; CHECK: fmov w0, s[[REDUCE]]
; CHECK: ret
  %res = call i16 @llvm.vector.reduce.and.v8i16(<8 x i16> %a)
  ret i16 %res
}

define i16 @andv_v16i16(<16 x i16>* %a) #0 {
; CHECK-LABEL: andv_v16i16:
; CHECK: ptrue [[PG:p[0-9]+]].h, vl16
; CHECK-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
; CHECK-NEXT: andv h[[REDUCE:[0-9]+]], [[PG]], [[OP]].h
; CHECK-NEXT: fmov w0, s[[REDUCE]]
; CHECK-NEXT: ret
  %op = load <16 x i16>, <16 x i16>* %a
  %res = call i16 @llvm.vector.reduce.and.v16i16(<16 x i16> %op)
  ret i16 %res
}

define i16 @andv_v32i16(<32 x i16>* %a) #0 {
; CHECK-LABEL: andv_v32i16:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32
; VBITS_GE_512-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_512-NEXT: andv h[[REDUCE:[0-9]+]], [[PG]], [[OP]].h
; VBITS_GE_512-NEXT: fmov w0, s[[REDUCE]]
; VBITS_GE_512-NEXT: ret

; Ensure sensible type legalisation.
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16
; VBITS_EQ_256-DAG: add x[[A_HI:[0-9]+]], x0, #32
; VBITS_EQ_256-DAG: ld1h { [[LO:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_EQ_256-DAG: ld1h { [[HI:z[0-9]+]].h }, [[PG]]/z, [x[[A_HI]]]
; VBITS_EQ_256-DAG: and [[AND:z[0-9]+]].d, [[LO]].d, [[HI]].d
; VBITS_EQ_256-DAG: andv h[[REDUCE:[0-9]+]], [[PG]], [[AND]].h
; VBITS_EQ_256-NEXT: fmov w0, s[[REDUCE]]
; VBITS_EQ_256-NEXT: ret
  %op = load <32 x i16>, <32 x i16>* %a
  %res = call i16 @llvm.vector.reduce.and.v32i16(<32 x i16> %op)
  ret i16 %res
}

define i16 @andv_v64i16(<64 x i16>* %a) #0 {
; CHECK-LABEL: andv_v64i16:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64
; VBITS_GE_1024-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_1024-NEXT: andv h[[REDUCE:[0-9]+]], [[PG]], [[OP]].h
; VBITS_GE_1024-NEXT: fmov w0, s[[REDUCE]]
; VBITS_GE_1024-NEXT: ret
  %op = load <64 x i16>, <64 x i16>* %a
  %res = call i16 @llvm.vector.reduce.and.v64i16(<64 x i16> %op)
  ret i16 %res
}

define i16 @andv_v128i16(<128 x i16>* %a) #0 {
; CHECK-LABEL: andv_v128i16:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl128
; VBITS_GE_2048-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_2048-NEXT: andv h[[REDUCE:[0-9]+]], [[PG]], [[OP]].h
; VBITS_GE_2048-NEXT: fmov w0, s[[REDUCE]]
; VBITS_GE_2048-NEXT: ret
  %op = load <128 x i16>, <128 x i16>* %a
  %res = call i16 @llvm.vector.reduce.and.v128i16(<128 x i16> %op)
  ret i16 %res
}

; No single instruction NEON ANDV support. Use SVE.
define i32 @andv_v2i32(<2 x i32> %a) #0 {
; CHECK-LABEL: andv_v2i32:
; CHECK: ptrue [[PG:p[0-9]+]].s, vl2
; CHECK: andv [[REDUCE:s[0-9]+]], [[PG]], z0.s
; CHECK: fmov w0, [[REDUCE]]
; CHECK: ret
  %res = call i32 @llvm.vector.reduce.and.v2i32(<2 x i32> %a)
  ret i32 %res
}

; No single instruction NEON ANDV support. Use SVE.
define i32 @andv_v4i32(<4 x i32> %a) #0 {
; CHECK-LABEL: andv_v4i32:
; CHECK: ptrue [[PG:p[0-9]+]].s, vl4
; CHECK: andv [[REDUCE:s[0-9]+]], [[PG]], z0.s
; CHECK: fmov w0, [[REDUCE]]
; CHECK: ret
  %res = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> %a)
  ret i32 %res
}

define i32 @andv_v8i32(<8 x i32>* %a) #0 {
; CHECK-LABEL: andv_v8i32:
; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
; CHECK-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
; CHECK-NEXT: andv [[REDUCE:s[0-9]+]], [[PG]], [[OP]].s
; CHECK-NEXT: fmov w0, [[REDUCE]]
; CHECK-NEXT: ret
  %op = load <8 x i32>, <8 x i32>* %a
  %res = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> %op)
  ret i32 %res
}

define i32 @andv_v16i32(<16 x i32>* %a) #0 {
; CHECK-LABEL: andv_v16i32:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
; VBITS_GE_512-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_512-NEXT: andv [[REDUCE:s[0-9]+]], [[PG]], [[OP]].s
; VBITS_GE_512-NEXT: fmov w0, [[REDUCE]]
; VBITS_GE_512-NEXT: ret

; Ensure sensible type legalisation.
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8
; VBITS_EQ_256-DAG: add x[[A_HI:[0-9]+]], x0, #32
; VBITS_EQ_256-DAG: ld1w { [[LO:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_EQ_256-DAG: ld1w { [[HI:z[0-9]+]].s }, [[PG]]/z, [x[[A_HI]]]
; VBITS_EQ_256-DAG: and [[AND:z[0-9]+]].d, [[LO]].d, [[HI]].d
; VBITS_EQ_256-DAG: andv [[REDUCE:s[0-9]+]], [[PG]], [[AND]].s
; VBITS_EQ_256-NEXT: fmov w0, [[REDUCE]]
; VBITS_EQ_256-NEXT: ret
  %op = load <16 x i32>, <16 x i32>* %a
  %res = call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> %op)
  ret i32 %res
}

define i32 @andv_v32i32(<32 x i32>* %a) #0 {
; CHECK-LABEL: andv_v32i32:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32
; VBITS_GE_1024-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_1024-NEXT: andv [[REDUCE:s[0-9]+]], [[PG]], [[OP]].s
; VBITS_GE_1024-NEXT: fmov w0, [[REDUCE]]
; VBITS_GE_1024-NEXT: ret
  %op = load <32 x i32>, <32 x i32>* %a
  %res = call i32 @llvm.vector.reduce.and.v32i32(<32 x i32> %op)
  ret i32 %res
}

define i32 @andv_v64i32(<64 x i32>* %a) #0 {
; CHECK-LABEL: andv_v64i32:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64
; VBITS_GE_2048-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_2048-NEXT: andv [[REDUCE:s[0-9]+]], [[PG]], [[OP]].s
; VBITS_GE_2048-NEXT: fmov w0, [[REDUCE]]
; VBITS_GE_2048-NEXT: ret
  %op = load <64 x i32>, <64 x i32>* %a
  %res = call i32 @llvm.vector.reduce.and.v64i32(<64 x i32> %op)
  ret i32 %res
}

; Nothing to do for single element vectors.
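; The whole vector already lives in d0, so the "reduction" is just a move of
; the element into the return register.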
define i64 @andv_v1i64(<1 x i64> %a) #0 {
; CHECK-LABEL: andv_v1i64:
; CHECK: fmov x0, d0
; CHECK: ret
  %res = call i64 @llvm.vector.reduce.and.v1i64(<1 x i64> %a)
  ret i64 %res
}

; Use SVE for 128-bit vectors.
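; (NEON has no across-vector bitwise reductions at any element size, so even a
; fully NEON-sized vector is reduced with SVE under a vl2 predicate.)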
define i64 @andv_v2i64(<2 x i64> %a) #0 {
; CHECK-LABEL: andv_v2i64:
; CHECK: ptrue [[PG:p[0-9]+]].d, vl2
; CHECK: andv [[REDUCE:d[0-9]+]], [[PG]], z0.d
; CHECK: fmov x0, [[REDUCE]]
; CHECK: ret
  %res = call i64 @llvm.vector.reduce.and.v2i64(<2 x i64> %a)
  ret i64 %res
}

define i64 @andv_v4i64(<4 x i64>* %a) #0 {
; CHECK-LABEL: andv_v4i64:
; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
; CHECK-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
; CHECK-NEXT: andv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].d
; CHECK-NEXT: fmov x0, [[REDUCE]]
; CHECK-NEXT: ret
  %op = load <4 x i64>, <4 x i64>* %a
  %res = call i64 @llvm.vector.reduce.and.v4i64(<4 x i64> %op)
  ret i64 %res
}

define i64 @andv_v8i64(<8 x i64>* %a) #0 {
; CHECK-LABEL: andv_v8i64:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
; VBITS_GE_512-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_512-NEXT: andv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].d
; VBITS_GE_512-NEXT: fmov x0, [[REDUCE]]
; VBITS_GE_512-NEXT: ret

; Ensure sensible type legalisation.
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4
; VBITS_EQ_256-DAG: add x[[A_HI:[0-9]+]], x0, #32
; VBITS_EQ_256-DAG: ld1d { [[LO:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_EQ_256-DAG: ld1d { [[HI:z[0-9]+]].d }, [[PG]]/z, [x[[A_HI]]]
; VBITS_EQ_256-DAG: and [[AND:z[0-9]+]].d, [[LO]].d, [[HI]].d
; VBITS_EQ_256-DAG: andv [[REDUCE:d[0-9]+]], [[PG]], [[AND]].d
; VBITS_EQ_256-NEXT: fmov x0, [[REDUCE]]
; VBITS_EQ_256-NEXT: ret
  %op = load <8 x i64>, <8 x i64>* %a
  %res = call i64 @llvm.vector.reduce.and.v8i64(<8 x i64> %op)
  ret i64 %res
}

define i64 @andv_v16i64(<16 x i64>* %a) #0 {
; CHECK-LABEL: andv_v16i64:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
; VBITS_GE_1024-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_1024-NEXT: andv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].d
; VBITS_GE_1024-NEXT: fmov x0, [[REDUCE]]
; VBITS_GE_1024-NEXT: ret
  %op = load <16 x i64>, <16 x i64>* %a
  %res = call i64 @llvm.vector.reduce.and.v16i64(<16 x i64> %op)
  ret i64 %res
}

define i64 @andv_v32i64(<32 x i64>* %a) #0 {
; CHECK-LABEL: andv_v32i64:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
; VBITS_GE_2048-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_2048-NEXT: andv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].d
; VBITS_GE_2048-NEXT: fmov x0, [[REDUCE]]
; VBITS_GE_2048-NEXT: ret
  %op = load <32 x i64>, <32 x i64>* %a
  %res = call i64 @llvm.vector.reduce.and.v32i64(<32 x i64> %op)
  ret i64 %res
}

;
; EORV
;

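; The EORV and ORV sections below mirror the ANDV cases above: only the
; reduction instruction and the eor/orr used to combine split halves differ.
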
; No single instruction NEON EORV support. Use SVE.
define i8 @eorv_v8i8(<8 x i8> %a) #0 {
; CHECK-LABEL: eorv_v8i8:
; CHECK: ptrue [[PG:p[0-9]+]].b, vl8
; CHECK: eorv b[[REDUCE:[0-9]+]], [[PG]], z0.b
; CHECK: fmov w0, s[[REDUCE]]
; CHECK: ret
  %res = call i8 @llvm.vector.reduce.xor.v8i8(<8 x i8> %a)
  ret i8 %res
}

; No single instruction NEON EORV support. Use SVE.
define i8 @eorv_v16i8(<16 x i8> %a) #0 {
; CHECK-LABEL: eorv_v16i8:
; CHECK: ptrue [[PG:p[0-9]+]].b, vl16
; CHECK: eorv b[[REDUCE:[0-9]+]], [[PG]], z0.b
; CHECK: fmov w0, s[[REDUCE]]
; CHECK: ret
  %res = call i8 @llvm.vector.reduce.xor.v16i8(<16 x i8> %a)
  ret i8 %res
}

define i8 @eorv_v32i8(<32 x i8>* %a) #0 {
; CHECK-LABEL: eorv_v32i8:
; CHECK: ptrue [[PG:p[0-9]+]].b, vl32
; CHECK-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
; CHECK-NEXT: eorv b[[REDUCE:[0-9]+]], [[PG]], [[OP]].b
; CHECK-NEXT: fmov w0, s[[REDUCE]]
; CHECK-NEXT: ret
  %op = load <32 x i8>, <32 x i8>* %a
  %res = call i8 @llvm.vector.reduce.xor.v32i8(<32 x i8> %op)
  ret i8 %res
}

define i8 @eorv_v64i8(<64 x i8>* %a) #0 {
; CHECK-LABEL: eorv_v64i8:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].b, vl64
; VBITS_GE_512-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
; VBITS_GE_512-NEXT: eorv b[[REDUCE:[0-9]+]], [[PG]], [[OP]].b
; VBITS_GE_512-NEXT: fmov w0, s[[REDUCE]]
; VBITS_GE_512-NEXT: ret

; Ensure sensible type legalisation.
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].b, vl32
; VBITS_EQ_256-DAG: mov w[[A_HI:[0-9]+]], #32
; VBITS_EQ_256-DAG: ld1b { [[LO:z[0-9]+]].b }, [[PG]]/z, [x0]
; VBITS_EQ_256-DAG: ld1b { [[HI:z[0-9]+]].b }, [[PG]]/z, [x0, x[[A_HI]]]
; VBITS_EQ_256-DAG: eor [[EOR:z[0-9]+]].d, [[LO]].d, [[HI]].d
; VBITS_EQ_256-DAG: eorv b[[REDUCE:[0-9]+]], [[PG]], [[EOR]].b
; VBITS_EQ_256-NEXT: fmov w0, s[[REDUCE]]
; VBITS_EQ_256-NEXT: ret

  %op = load <64 x i8>, <64 x i8>* %a
  %res = call i8 @llvm.vector.reduce.xor.v64i8(<64 x i8> %op)
  ret i8 %res
}

define i8 @eorv_v128i8(<128 x i8>* %a) #0 {
; CHECK-LABEL: eorv_v128i8:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].b, vl128
; VBITS_GE_1024-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
; VBITS_GE_1024-NEXT: eorv b[[REDUCE:[0-9]+]], [[PG]], [[OP]].b
; VBITS_GE_1024-NEXT: fmov w0, s[[REDUCE]]
; VBITS_GE_1024-NEXT: ret
  %op = load <128 x i8>, <128 x i8>* %a
  %res = call i8 @llvm.vector.reduce.xor.v128i8(<128 x i8> %op)
  ret i8 %res
}

define i8 @eorv_v256i8(<256 x i8>* %a) #0 {
; CHECK-LABEL: eorv_v256i8:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].b, vl256
; VBITS_GE_2048-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
; VBITS_GE_2048-NEXT: eorv b[[REDUCE:[0-9]+]], [[PG]], [[OP]].b
; VBITS_GE_2048-NEXT: fmov w0, s[[REDUCE]]
; VBITS_GE_2048-NEXT: ret
  %op = load <256 x i8>, <256 x i8>* %a
  %res = call i8 @llvm.vector.reduce.xor.v256i8(<256 x i8> %op)
  ret i8 %res
}

; No single instruction NEON EORV support. Use SVE.
define i16 @eorv_v4i16(<4 x i16> %a) #0 {
; CHECK-LABEL: eorv_v4i16:
; CHECK: ptrue [[PG:p[0-9]+]].h, vl4
; CHECK: eorv h[[REDUCE:[0-9]+]], [[PG]], z0.h
; CHECK: fmov w0, s[[REDUCE]]
; CHECK: ret
  %res = call i16 @llvm.vector.reduce.xor.v4i16(<4 x i16> %a)
  ret i16 %res
}

; No single instruction NEON EORV support. Use SVE.
define i16 @eorv_v8i16(<8 x i16> %a) #0 {
; CHECK-LABEL: eorv_v8i16:
; CHECK: ptrue [[PG:p[0-9]+]].h, vl8
; CHECK: eorv h[[REDUCE:[0-9]+]], [[PG]], z0.h
; CHECK: fmov w0, s[[REDUCE]]
; CHECK: ret
  %res = call i16 @llvm.vector.reduce.xor.v8i16(<8 x i16> %a)
  ret i16 %res
}

define i16 @eorv_v16i16(<16 x i16>* %a) #0 {
; CHECK-LABEL: eorv_v16i16:
; CHECK: ptrue [[PG:p[0-9]+]].h, vl16
; CHECK-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
; CHECK-NEXT: eorv h[[REDUCE:[0-9]+]], [[PG]], [[OP]].h
; CHECK-NEXT: fmov w0, s[[REDUCE]]
; CHECK-NEXT: ret
  %op = load <16 x i16>, <16 x i16>* %a
  %res = call i16 @llvm.vector.reduce.xor.v16i16(<16 x i16> %op)
  ret i16 %res
}

define i16 @eorv_v32i16(<32 x i16>* %a) #0 {
; CHECK-LABEL: eorv_v32i16:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32
; VBITS_GE_512-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_512-NEXT: eorv h[[REDUCE:[0-9]+]], [[PG]], [[OP]].h
; VBITS_GE_512-NEXT: fmov w0, s[[REDUCE]]
; VBITS_GE_512-NEXT: ret

; Ensure sensible type legalisation.
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16
; VBITS_EQ_256-DAG: add x[[A_HI:[0-9]+]], x0, #32
; VBITS_EQ_256-DAG: ld1h { [[LO:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_EQ_256-DAG: ld1h { [[HI:z[0-9]+]].h }, [[PG]]/z, [x[[A_HI]]]
; VBITS_EQ_256-DAG: eor [[EOR:z[0-9]+]].d, [[LO]].d, [[HI]].d
; VBITS_EQ_256-DAG: eorv h[[REDUCE:[0-9]+]], [[PG]], [[EOR]].h
; VBITS_EQ_256-NEXT: fmov w0, s[[REDUCE]]
; VBITS_EQ_256-NEXT: ret
  %op = load <32 x i16>, <32 x i16>* %a
  %res = call i16 @llvm.vector.reduce.xor.v32i16(<32 x i16> %op)
  ret i16 %res
}

define i16 @eorv_v64i16(<64 x i16>* %a) #0 {
; CHECK-LABEL: eorv_v64i16:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64
; VBITS_GE_1024-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_1024-NEXT: eorv h[[REDUCE:[0-9]+]], [[PG]], [[OP]].h
; VBITS_GE_1024-NEXT: fmov w0, s[[REDUCE]]
; VBITS_GE_1024-NEXT: ret
  %op = load <64 x i16>, <64 x i16>* %a
  %res = call i16 @llvm.vector.reduce.xor.v64i16(<64 x i16> %op)
  ret i16 %res
}

define i16 @eorv_v128i16(<128 x i16>* %a) #0 {
; CHECK-LABEL: eorv_v128i16:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl128
; VBITS_GE_2048-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_2048-NEXT: eorv h[[REDUCE:[0-9]+]], [[PG]], [[OP]].h
; VBITS_GE_2048-NEXT: fmov w0, s[[REDUCE]]
; VBITS_GE_2048-NEXT: ret
  %op = load <128 x i16>, <128 x i16>* %a
  %res = call i16 @llvm.vector.reduce.xor.v128i16(<128 x i16> %op)
  ret i16 %res
}

; No single instruction NEON EORV support. Use SVE.
define i32 @eorv_v2i32(<2 x i32> %a) #0 {
; CHECK-LABEL: eorv_v2i32:
; CHECK: ptrue [[PG:p[0-9]+]].s, vl2
; CHECK: eorv [[REDUCE:s[0-9]+]], [[PG]], z0.s
; CHECK: fmov w0, [[REDUCE]]
; CHECK: ret
  %res = call i32 @llvm.vector.reduce.xor.v2i32(<2 x i32> %a)
  ret i32 %res
}

; No single instruction NEON EORV support. Use SVE.
define i32 @eorv_v4i32(<4 x i32> %a) #0 {
; CHECK-LABEL: eorv_v4i32:
; CHECK: ptrue [[PG:p[0-9]+]].s, vl4
; CHECK: eorv [[REDUCE:s[0-9]+]], [[PG]], z0.s
; CHECK: fmov w0, [[REDUCE]]
; CHECK: ret
  %res = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> %a)
  ret i32 %res
}

define i32 @eorv_v8i32(<8 x i32>* %a) #0 {
; CHECK-LABEL: eorv_v8i32:
; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
; CHECK-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
; CHECK-NEXT: eorv [[REDUCE:s[0-9]+]], [[PG]], [[OP]].s
; CHECK-NEXT: fmov w0, [[REDUCE]]
; CHECK-NEXT: ret
  %op = load <8 x i32>, <8 x i32>* %a
  %res = call i32 @llvm.vector.reduce.xor.v8i32(<8 x i32> %op)
  ret i32 %res
}

define i32 @eorv_v16i32(<16 x i32>* %a) #0 {
; CHECK-LABEL: eorv_v16i32:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
; VBITS_GE_512-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_512-NEXT: eorv [[REDUCE:s[0-9]+]], [[PG]], [[OP]].s
; VBITS_GE_512-NEXT: fmov w0, [[REDUCE]]
; VBITS_GE_512-NEXT: ret

; Ensure sensible type legalisation.
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8
; VBITS_EQ_256-DAG: add x[[A_HI:[0-9]+]], x0, #32
; VBITS_EQ_256-DAG: ld1w { [[LO:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_EQ_256-DAG: ld1w { [[HI:z[0-9]+]].s }, [[PG]]/z, [x[[A_HI]]]
; VBITS_EQ_256-DAG: eor [[EOR:z[0-9]+]].d, [[LO]].d, [[HI]].d
; VBITS_EQ_256-DAG: eorv [[REDUCE:s[0-9]+]], [[PG]], [[EOR]].s
; VBITS_EQ_256-NEXT: fmov w0, [[REDUCE]]
; VBITS_EQ_256-NEXT: ret
  %op = load <16 x i32>, <16 x i32>* %a
  %res = call i32 @llvm.vector.reduce.xor.v16i32(<16 x i32> %op)
  ret i32 %res
}

define i32 @eorv_v32i32(<32 x i32>* %a) #0 {
; CHECK-LABEL: eorv_v32i32:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32
; VBITS_GE_1024-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_1024-NEXT: eorv [[REDUCE:s[0-9]+]], [[PG]], [[OP]].s
; VBITS_GE_1024-NEXT: fmov w0, [[REDUCE]]
; VBITS_GE_1024-NEXT: ret
  %op = load <32 x i32>, <32 x i32>* %a
  %res = call i32 @llvm.vector.reduce.xor.v32i32(<32 x i32> %op)
  ret i32 %res
}

define i32 @eorv_v64i32(<64 x i32>* %a) #0 {
; CHECK-LABEL: eorv_v64i32:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64
; VBITS_GE_2048-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_2048-NEXT: eorv [[REDUCE:s[0-9]+]], [[PG]], [[OP]].s
; VBITS_GE_2048-NEXT: fmov w0, [[REDUCE]]
; VBITS_GE_2048-NEXT: ret
  %op = load <64 x i32>, <64 x i32>* %a
  %res = call i32 @llvm.vector.reduce.xor.v64i32(<64 x i32> %op)
  ret i32 %res
}

; Nothing to do for single element vectors.
define i64 @eorv_v1i64(<1 x i64> %a) #0 {
; CHECK-LABEL: eorv_v1i64:
; CHECK: fmov x0, d0
; CHECK: ret
  %res = call i64 @llvm.vector.reduce.xor.v1i64(<1 x i64> %a)
  ret i64 %res
}

; Use SVE for 128-bit vectors.
define i64 @eorv_v2i64(<2 x i64> %a) #0 {
; CHECK-LABEL: eorv_v2i64:
; CHECK: ptrue [[PG:p[0-9]+]].d, vl2
; CHECK: eorv [[REDUCE:d[0-9]+]], [[PG]], z0.d
; CHECK: fmov x0, [[REDUCE]]
; CHECK: ret
  %res = call i64 @llvm.vector.reduce.xor.v2i64(<2 x i64> %a)
  ret i64 %res
}

define i64 @eorv_v4i64(<4 x i64>* %a) #0 {
; CHECK-LABEL: eorv_v4i64:
; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
; CHECK-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
; CHECK-NEXT: eorv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].d
; CHECK-NEXT: fmov x0, [[REDUCE]]
; CHECK-NEXT: ret
  %op = load <4 x i64>, <4 x i64>* %a
  %res = call i64 @llvm.vector.reduce.xor.v4i64(<4 x i64> %op)
  ret i64 %res
}

define i64 @eorv_v8i64(<8 x i64>* %a) #0 {
; CHECK-LABEL: eorv_v8i64:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
; VBITS_GE_512-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_512-NEXT: eorv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].d
; VBITS_GE_512-NEXT: fmov x0, [[REDUCE]]
; VBITS_GE_512-NEXT: ret

; Ensure sensible type legalisation.
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4
; VBITS_EQ_256-DAG: add x[[A_HI:[0-9]+]], x0, #32
; VBITS_EQ_256-DAG: ld1d { [[LO:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_EQ_256-DAG: ld1d { [[HI:z[0-9]+]].d }, [[PG]]/z, [x[[A_HI]]]
; VBITS_EQ_256-DAG: eor [[EOR:z[0-9]+]].d, [[LO]].d, [[HI]].d
; VBITS_EQ_256-DAG: eorv [[REDUCE:d[0-9]+]], [[PG]], [[EOR]].d
; VBITS_EQ_256-NEXT: fmov x0, [[REDUCE]]
; VBITS_EQ_256-NEXT: ret
  %op = load <8 x i64>, <8 x i64>* %a
  %res = call i64 @llvm.vector.reduce.xor.v8i64(<8 x i64> %op)
  ret i64 %res
}

define i64 @eorv_v16i64(<16 x i64>* %a) #0 {
; CHECK-LABEL: eorv_v16i64:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
; VBITS_GE_1024-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_1024-NEXT: eorv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].d
; VBITS_GE_1024-NEXT: fmov x0, [[REDUCE]]
; VBITS_GE_1024-NEXT: ret
  %op = load <16 x i64>, <16 x i64>* %a
  %res = call i64 @llvm.vector.reduce.xor.v16i64(<16 x i64> %op)
  ret i64 %res
}

define i64 @eorv_v32i64(<32 x i64>* %a) #0 {
; CHECK-LABEL: eorv_v32i64:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
; VBITS_GE_2048-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_2048-NEXT: eorv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].d
; VBITS_GE_2048-NEXT: fmov x0, [[REDUCE]]
; VBITS_GE_2048-NEXT: ret
  %op = load <32 x i64>, <32 x i64>* %a
  %res = call i64 @llvm.vector.reduce.xor.v32i64(<32 x i64> %op)
  ret i64 %res
}

;
; ORV
;

; No single instruction NEON ORV support. Use SVE.
define i8 @orv_v8i8(<8 x i8> %a) #0 {
; CHECK-LABEL: orv_v8i8:
; CHECK: ptrue [[PG:p[0-9]+]].b, vl8
; CHECK: orv b[[REDUCE:[0-9]+]], [[PG]], z0.b
; CHECK: fmov w0, s[[REDUCE]]
; CHECK: ret
  %res = call i8 @llvm.vector.reduce.or.v8i8(<8 x i8> %a)
  ret i8 %res
}

; No single instruction NEON ORV support. Use SVE.
define i8 @orv_v16i8(<16 x i8> %a) #0 {
; CHECK-LABEL: orv_v16i8:
; CHECK: ptrue [[PG:p[0-9]+]].b, vl16
; CHECK: orv b[[REDUCE:[0-9]+]], [[PG]], z0.b
; CHECK: fmov w0, s[[REDUCE]]
; CHECK: ret
  %res = call i8 @llvm.vector.reduce.or.v16i8(<16 x i8> %a)
  ret i8 %res
}

define i8 @orv_v32i8(<32 x i8>* %a) #0 {
; CHECK-LABEL: orv_v32i8:
; CHECK: ptrue [[PG:p[0-9]+]].b, vl32
; CHECK-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
; CHECK-NEXT: orv b[[REDUCE:[0-9]+]], [[PG]], [[OP]].b
; CHECK-NEXT: fmov w0, s[[REDUCE]]
; CHECK-NEXT: ret
  %op = load <32 x i8>, <32 x i8>* %a
  %res = call i8 @llvm.vector.reduce.or.v32i8(<32 x i8> %op)
  ret i8 %res
}

define i8 @orv_v64i8(<64 x i8>* %a) #0 {
; CHECK-LABEL: orv_v64i8:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].b, vl64
; VBITS_GE_512-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
; VBITS_GE_512-NEXT: orv b[[REDUCE:[0-9]+]], [[PG]], [[OP]].b
; VBITS_GE_512-NEXT: fmov w0, s[[REDUCE]]
; VBITS_GE_512-NEXT: ret

; Ensure sensible type legalisation.
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].b, vl32
; VBITS_EQ_256-DAG: mov w[[A_HI:[0-9]+]], #32
; VBITS_EQ_256-DAG: ld1b { [[LO:z[0-9]+]].b }, [[PG]]/z, [x0]
; VBITS_EQ_256-DAG: ld1b { [[HI:z[0-9]+]].b }, [[PG]]/z, [x0, x[[A_HI]]]
; VBITS_EQ_256-DAG: orr [[OR:z[0-9]+]].d, [[LO]].d, [[HI]].d
; VBITS_EQ_256-DAG: orv b[[REDUCE:[0-9]+]], [[PG]], [[OR]].b
; VBITS_EQ_256-NEXT: fmov w0, s[[REDUCE]]
; VBITS_EQ_256-NEXT: ret

  %op = load <64 x i8>, <64 x i8>* %a
  %res = call i8 @llvm.vector.reduce.or.v64i8(<64 x i8> %op)
  ret i8 %res
}

define i8 @orv_v128i8(<128 x i8>* %a) #0 {
; CHECK-LABEL: orv_v128i8:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].b, vl128
; VBITS_GE_1024-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
; VBITS_GE_1024-NEXT: orv b[[REDUCE:[0-9]+]], [[PG]], [[OP]].b
; VBITS_GE_1024-NEXT: fmov w0, s[[REDUCE]]
; VBITS_GE_1024-NEXT: ret
  %op = load <128 x i8>, <128 x i8>* %a
  %res = call i8 @llvm.vector.reduce.or.v128i8(<128 x i8> %op)
  ret i8 %res
}

define i8 @orv_v256i8(<256 x i8>* %a) #0 {
; CHECK-LABEL: orv_v256i8:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].b, vl256
; VBITS_GE_2048-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
; VBITS_GE_2048-NEXT: orv b[[REDUCE:[0-9]+]], [[PG]], [[OP]].b
; VBITS_GE_2048-NEXT: fmov w0, s[[REDUCE]]
; VBITS_GE_2048-NEXT: ret
  %op = load <256 x i8>, <256 x i8>* %a
  %res = call i8 @llvm.vector.reduce.or.v256i8(<256 x i8> %op)
  ret i8 %res
}

; No single instruction NEON ORV support. Use SVE.
define i16 @orv_v4i16(<4 x i16> %a) #0 {
; CHECK-LABEL: orv_v4i16:
; CHECK: ptrue [[PG:p[0-9]+]].h, vl4
; CHECK: orv h[[REDUCE:[0-9]+]], [[PG]], z0.h
; CHECK: fmov w0, s[[REDUCE]]
; CHECK: ret
  %res = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> %a)
  ret i16 %res
}

; No single instruction NEON ORV support. Use SVE.
define i16 @orv_v8i16(<8 x i16> %a) #0 {
; CHECK-LABEL: orv_v8i16:
; CHECK: ptrue [[PG:p[0-9]+]].h, vl8
; CHECK: orv h[[REDUCE:[0-9]+]], [[PG]], z0.h
; CHECK: fmov w0, s[[REDUCE]]
; CHECK: ret
  %res = call i16 @llvm.vector.reduce.or.v8i16(<8 x i16> %a)
  ret i16 %res
}

define i16 @orv_v16i16(<16 x i16>* %a) #0 {
; CHECK-LABEL: orv_v16i16:
; CHECK: ptrue [[PG:p[0-9]+]].h, vl16
; CHECK-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
; CHECK-NEXT: orv h[[REDUCE:[0-9]+]], [[PG]], [[OP]].h
; CHECK-NEXT: fmov w0, s[[REDUCE]]
; CHECK-NEXT: ret
  %op = load <16 x i16>, <16 x i16>* %a
  %res = call i16 @llvm.vector.reduce.or.v16i16(<16 x i16> %op)
  ret i16 %res
}

define i16 @orv_v32i16(<32 x i16>* %a) #0 {
; CHECK-LABEL: orv_v32i16:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32
; VBITS_GE_512-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_512-NEXT: orv h[[REDUCE:[0-9]+]], [[PG]], [[OP]].h
; VBITS_GE_512-NEXT: fmov w0, s[[REDUCE]]
; VBITS_GE_512-NEXT: ret

; Ensure sensible type legalisation.
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16
; VBITS_EQ_256-DAG: add x[[A_HI:[0-9]+]], x0, #32
; VBITS_EQ_256-DAG: ld1h { [[LO:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_EQ_256-DAG: ld1h { [[HI:z[0-9]+]].h }, [[PG]]/z, [x[[A_HI]]]
; VBITS_EQ_256-DAG: orr [[OR:z[0-9]+]].d, [[LO]].d, [[HI]].d
; VBITS_EQ_256-DAG: orv h[[REDUCE:[0-9]+]], [[PG]], [[OR]].h
; VBITS_EQ_256-NEXT: fmov w0, s[[REDUCE]]
; VBITS_EQ_256-NEXT: ret
  %op = load <32 x i16>, <32 x i16>* %a
  %res = call i16 @llvm.vector.reduce.or.v32i16(<32 x i16> %op)
  ret i16 %res
}

define i16 @orv_v64i16(<64 x i16>* %a) #0 {
; CHECK-LABEL: orv_v64i16:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64
; VBITS_GE_1024-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_1024-NEXT: orv h[[REDUCE:[0-9]+]], [[PG]], [[OP]].h
; VBITS_GE_1024-NEXT: fmov w0, s[[REDUCE]]
; VBITS_GE_1024-NEXT: ret
  %op = load <64 x i16>, <64 x i16>* %a
  %res = call i16 @llvm.vector.reduce.or.v64i16(<64 x i16> %op)
  ret i16 %res
}

define i16 @orv_v128i16(<128 x i16>* %a) #0 {
; CHECK-LABEL: orv_v128i16:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl128
; VBITS_GE_2048-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_2048-NEXT: orv h[[REDUCE:[0-9]+]], [[PG]], [[OP]].h
; VBITS_GE_2048-NEXT: fmov w0, s[[REDUCE]]
; VBITS_GE_2048-NEXT: ret
  %op = load <128 x i16>, <128 x i16>* %a
  %res = call i16 @llvm.vector.reduce.or.v128i16(<128 x i16> %op)
  ret i16 %res
}

; No single instruction NEON ORV support. Use SVE.
define i32 @orv_v2i32(<2 x i32> %a) #0 {
; CHECK-LABEL: orv_v2i32:
; CHECK: ptrue [[PG:p[0-9]+]].s, vl2
; CHECK: orv [[REDUCE:s[0-9]+]], [[PG]], z0.s
; CHECK: fmov w0, [[REDUCE]]
; CHECK: ret
  %res = call i32 @llvm.vector.reduce.or.v2i32(<2 x i32> %a)
  ret i32 %res
}

; No single instruction NEON ORV support. Use SVE.
define i32 @orv_v4i32(<4 x i32> %a) #0 {
; CHECK-LABEL: orv_v4i32:
; CHECK: ptrue [[PG:p[0-9]+]].s, vl4
; CHECK: orv [[REDUCE:s[0-9]+]], [[PG]], z0.s
; CHECK: fmov w0, [[REDUCE]]
; CHECK: ret
  %res = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %a)
  ret i32 %res
}

define i32 @orv_v8i32(<8 x i32>* %a) #0 {
; CHECK-LABEL: orv_v8i32:
; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
; CHECK-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
; CHECK-NEXT: orv [[REDUCE:s[0-9]+]], [[PG]], [[OP]].s
; CHECK-NEXT: fmov w0, [[REDUCE]]
; CHECK-NEXT: ret
  %op = load <8 x i32>, <8 x i32>* %a
  %res = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> %op)
  ret i32 %res
}

define i32 @orv_v16i32(<16 x i32>* %a) #0 {
; CHECK-LABEL: orv_v16i32:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
; VBITS_GE_512-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_512-NEXT: orv [[REDUCE:s[0-9]+]], [[PG]], [[OP]].s
; VBITS_GE_512-NEXT: fmov w0, [[REDUCE]]
; VBITS_GE_512-NEXT: ret

; Ensure sensible type legalisation.
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8
; VBITS_EQ_256-DAG: add x[[A_HI:[0-9]+]], x0, #32
; VBITS_EQ_256-DAG: ld1w { [[LO:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_EQ_256-DAG: ld1w { [[HI:z[0-9]+]].s }, [[PG]]/z, [x[[A_HI]]]
; VBITS_EQ_256-DAG: orr [[OR:z[0-9]+]].d, [[LO]].d, [[HI]].d
; VBITS_EQ_256-DAG: orv [[REDUCE:s[0-9]+]], [[PG]], [[OR]].s
; VBITS_EQ_256-NEXT: fmov w0, [[REDUCE]]
; VBITS_EQ_256-NEXT: ret
  %op = load <16 x i32>, <16 x i32>* %a
  %res = call i32 @llvm.vector.reduce.or.v16i32(<16 x i32> %op)
  ret i32 %res
}

define i32 @orv_v32i32(<32 x i32>* %a) #0 {
; CHECK-LABEL: orv_v32i32:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32
; VBITS_GE_1024-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_1024-NEXT: orv [[REDUCE:s[0-9]+]], [[PG]], [[OP]].s
; VBITS_GE_1024-NEXT: fmov w0, [[REDUCE]]
; VBITS_GE_1024-NEXT: ret
  %op = load <32 x i32>, <32 x i32>* %a
  %res = call i32 @llvm.vector.reduce.or.v32i32(<32 x i32> %op)
  ret i32 %res
}

define i32 @orv_v64i32(<64 x i32>* %a) #0 {
; CHECK-LABEL: orv_v64i32:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64
; VBITS_GE_2048-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_2048-NEXT: orv [[REDUCE:s[0-9]+]], [[PG]], [[OP]].s
; VBITS_GE_2048-NEXT: fmov w0, [[REDUCE]]
; VBITS_GE_2048-NEXT: ret
  %op = load <64 x i32>, <64 x i32>* %a
  %res = call i32 @llvm.vector.reduce.or.v64i32(<64 x i32> %op)
  ret i32 %res
}

; Nothing to do for single element vectors.
define i64 @orv_v1i64(<1 x i64> %a) #0 {
; CHECK-LABEL: orv_v1i64:
; CHECK: fmov x0, d0
; CHECK: ret
  %res = call i64 @llvm.vector.reduce.or.v1i64(<1 x i64> %a)
  ret i64 %res
}

; Use SVE for 128-bit vectors.
define i64 @orv_v2i64(<2 x i64> %a) #0 {
; CHECK-LABEL: orv_v2i64:
; CHECK: ptrue [[PG:p[0-9]+]].d, vl2
; CHECK: orv [[REDUCE:d[0-9]+]], [[PG]], z0.d
; CHECK: fmov x0, [[REDUCE]]
; CHECK: ret
  %res = call i64 @llvm.vector.reduce.or.v2i64(<2 x i64> %a)
  ret i64 %res
}

define i64 @orv_v4i64(<4 x i64>* %a) #0 {
; CHECK-LABEL: orv_v4i64:
; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
; CHECK-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
; CHECK-NEXT: orv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].d
; CHECK-NEXT: fmov x0, [[REDUCE]]
; CHECK-NEXT: ret
  %op = load <4 x i64>, <4 x i64>* %a
  %res = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> %op)
  ret i64 %res
}

define i64 @orv_v8i64(<8 x i64>* %a) #0 {
; CHECK-LABEL: orv_v8i64:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
; VBITS_GE_512-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_512-NEXT: orv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].d
; VBITS_GE_512-NEXT: fmov x0, [[REDUCE]]
; VBITS_GE_512-NEXT: ret

; Ensure sensible type legalisation.
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4
; VBITS_EQ_256-DAG: add x[[A_HI:[0-9]+]], x0, #32
; VBITS_EQ_256-DAG: ld1d { [[LO:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_EQ_256-DAG: ld1d { [[HI:z[0-9]+]].d }, [[PG]]/z, [x[[A_HI]]]
; VBITS_EQ_256-DAG: orr [[OR:z[0-9]+]].d, [[LO]].d, [[HI]].d
; VBITS_EQ_256-DAG: orv [[REDUCE:d[0-9]+]], [[PG]], [[OR]].d
; VBITS_EQ_256-NEXT: fmov x0, [[REDUCE]]
; VBITS_EQ_256-NEXT: ret
  %op = load <8 x i64>, <8 x i64>* %a
  %res = call i64 @llvm.vector.reduce.or.v8i64(<8 x i64> %op)
  ret i64 %res
}

define i64 @orv_v16i64(<16 x i64>* %a) #0 {
; CHECK-LABEL: orv_v16i64:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
; VBITS_GE_1024-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_1024-NEXT: orv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].d
; VBITS_GE_1024-NEXT: fmov x0, [[REDUCE]]
; VBITS_GE_1024-NEXT: ret
  %op = load <16 x i64>, <16 x i64>* %a
  %res = call i64 @llvm.vector.reduce.or.v16i64(<16 x i64> %op)
  ret i64 %res
}

define i64 @orv_v32i64(<32 x i64>* %a) #0 {
; CHECK-LABEL: orv_v32i64:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
; VBITS_GE_2048-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_2048-NEXT: orv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].d
; VBITS_GE_2048-NEXT: fmov x0, [[REDUCE]]
; VBITS_GE_2048-NEXT: ret
  %op = load <32 x i64>, <32 x i64>* %a
  %res = call i64 @llvm.vector.reduce.or.v32i64(<32 x i64> %op)
  ret i64 %res
}

attributes #0 = { "target-features"="+sve" }
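; All functions carry attribute #0 so SVE is available throughout; the RUN
; lines above control the vector length the compiler may assume.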

declare i8 @llvm.vector.reduce.and.v8i8(<8 x i8>)
declare i8 @llvm.vector.reduce.and.v16i8(<16 x i8>)
declare i8 @llvm.vector.reduce.and.v32i8(<32 x i8>)
declare i8 @llvm.vector.reduce.and.v64i8(<64 x i8>)
declare i8 @llvm.vector.reduce.and.v128i8(<128 x i8>)
declare i8 @llvm.vector.reduce.and.v256i8(<256 x i8>)

declare i16 @llvm.vector.reduce.and.v4i16(<4 x i16>)
declare i16 @llvm.vector.reduce.and.v8i16(<8 x i16>)
declare i16 @llvm.vector.reduce.and.v16i16(<16 x i16>)
declare i16 @llvm.vector.reduce.and.v32i16(<32 x i16>)
declare i16 @llvm.vector.reduce.and.v64i16(<64 x i16>)
declare i16 @llvm.vector.reduce.and.v128i16(<128 x i16>)

declare i32 @llvm.vector.reduce.and.v2i32(<2 x i32>)
declare i32 @llvm.vector.reduce.and.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.and.v8i32(<8 x i32>)
declare i32 @llvm.vector.reduce.and.v16i32(<16 x i32>)
declare i32 @llvm.vector.reduce.and.v32i32(<32 x i32>)
declare i32 @llvm.vector.reduce.and.v64i32(<64 x i32>)

declare i64 @llvm.vector.reduce.and.v1i64(<1 x i64>)
declare i64 @llvm.vector.reduce.and.v2i64(<2 x i64>)
declare i64 @llvm.vector.reduce.and.v4i64(<4 x i64>)
declare i64 @llvm.vector.reduce.and.v8i64(<8 x i64>)
declare i64 @llvm.vector.reduce.and.v16i64(<16 x i64>)
declare i64 @llvm.vector.reduce.and.v32i64(<32 x i64>)

declare i8 @llvm.vector.reduce.or.v8i8(<8 x i8>)
declare i8 @llvm.vector.reduce.or.v16i8(<16 x i8>)
declare i8 @llvm.vector.reduce.or.v32i8(<32 x i8>)
declare i8 @llvm.vector.reduce.or.v64i8(<64 x i8>)
declare i8 @llvm.vector.reduce.or.v128i8(<128 x i8>)
declare i8 @llvm.vector.reduce.or.v256i8(<256 x i8>)

declare i16 @llvm.vector.reduce.or.v4i16(<4 x i16>)
declare i16 @llvm.vector.reduce.or.v8i16(<8 x i16>)
declare i16 @llvm.vector.reduce.or.v16i16(<16 x i16>)
declare i16 @llvm.vector.reduce.or.v32i16(<32 x i16>)
declare i16 @llvm.vector.reduce.or.v64i16(<64 x i16>)
declare i16 @llvm.vector.reduce.or.v128i16(<128 x i16>)

declare i32 @llvm.vector.reduce.or.v2i32(<2 x i32>)
declare i32 @llvm.vector.reduce.or.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.or.v8i32(<8 x i32>)
declare i32 @llvm.vector.reduce.or.v16i32(<16 x i32>)
declare i32 @llvm.vector.reduce.or.v32i32(<32 x i32>)
declare i32 @llvm.vector.reduce.or.v64i32(<64 x i32>)

declare i64 @llvm.vector.reduce.or.v1i64(<1 x i64>)
declare i64 @llvm.vector.reduce.or.v2i64(<2 x i64>)
declare i64 @llvm.vector.reduce.or.v4i64(<4 x i64>)
declare i64 @llvm.vector.reduce.or.v8i64(<8 x i64>)
declare i64 @llvm.vector.reduce.or.v16i64(<16 x i64>)
declare i64 @llvm.vector.reduce.or.v32i64(<32 x i64>)

declare i8 @llvm.vector.reduce.xor.v8i8(<8 x i8>)
declare i8 @llvm.vector.reduce.xor.v16i8(<16 x i8>)
declare i8 @llvm.vector.reduce.xor.v32i8(<32 x i8>)
declare i8 @llvm.vector.reduce.xor.v64i8(<64 x i8>)
declare i8 @llvm.vector.reduce.xor.v128i8(<128 x i8>)
declare i8 @llvm.vector.reduce.xor.v256i8(<256 x i8>)

declare i16 @llvm.vector.reduce.xor.v4i16(<4 x i16>)
declare i16 @llvm.vector.reduce.xor.v8i16(<8 x i16>)
declare i16 @llvm.vector.reduce.xor.v16i16(<16 x i16>)
declare i16 @llvm.vector.reduce.xor.v32i16(<32 x i16>)
declare i16 @llvm.vector.reduce.xor.v64i16(<64 x i16>)
declare i16 @llvm.vector.reduce.xor.v128i16(<128 x i16>)

declare i32 @llvm.vector.reduce.xor.v2i32(<2 x i32>)
declare i32 @llvm.vector.reduce.xor.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.xor.v8i32(<8 x i32>)
declare i32 @llvm.vector.reduce.xor.v16i32(<16 x i32>)
declare i32 @llvm.vector.reduce.xor.v32i32(<32 x i32>)
declare i32 @llvm.vector.reduce.xor.v64i32(<64 x i32>)

declare i64 @llvm.vector.reduce.xor.v1i64(<1 x i64>)
declare i64 @llvm.vector.reduce.xor.v2i64(<2 x i64>)
declare i64 @llvm.vector.reduce.xor.v4i64(<4 x i64>)
declare i64 @llvm.vector.reduce.xor.v8i64(<8 x i64>)
declare i64 @llvm.vector.reduce.xor.v16i64(<16 x i64>)
declare i64 @llvm.vector.reduce.xor.v32i64(<32 x i64>)