; RUN: llc -aarch64-sve-vector-bits-min=128  -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=16 -check-prefix=NO_SVE
; RUN: llc -aarch64-sve-vector-bits-min=256  -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK,VBITS_EQ_256
; RUN: llc -aarch64-sve-vector-bits-min=384  -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK
; RUN: llc -aarch64-sve-vector-bits-min=512  -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=640  -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=768  -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=896  -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=1024 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1152 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1280 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1408 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1536 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1664 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1792 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1920 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=2048 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=256 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_GE_2048

target triple = "aarch64-unknown-linux-gnu"

; Don't use SVE when its registers are no bigger than NEON.
; NO_SVE-NOT: ptrue

;
; SDIV
;

; Vector vXi8 sdiv are not legal for NEON so use SVE when available.
define <8 x i8> @sdiv_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 {
; CHECK-LABEL: sdiv_v8i8:
; CHECK: sunpkhi [[OP2_HI:z[0-9]+]].h, z1.b
; CHECK-NEXT: sunpkhi [[OP1_HI:z[0-9]+]].h, z0.b
; CHECK-NEXT: ptrue [[PG:p[0-9]+]].s, vl[[#min(VBYTES,2)]]
; CHECK-NEXT: sunpklo [[OP2_LO:z[0-9]+]].h, z1.b
; CHECK-NEXT: sunpklo [[OP1_LO:z[0-9]+]].h, z0.b
; CHECK-NEXT: sunpkhi [[OP2_HI_HI:z[0-9]+]].s, [[OP2_HI]].h
; CHECK-NEXT: sunpkhi [[OP1_HI_HI:z[0-9]+]].s, [[OP1_HI]].h
; CHECK-NEXT: sunpklo [[OP2_HI_LO:z[0-9]+]].s, [[OP2_HI]].h
; CHECK-NEXT: sunpklo [[OP1_HI_LO:z[0-9]+]].s, [[OP1_HI]].h
; CHECK-NEXT: sdivr [[RES_HI_HI:z[0-9]+]].s, [[PG]]/m, [[OP2_HI_HI]].s, [[OP1_HI_HI]].s
; CHECK-NEXT: sunpkhi [[OP2_LO_HI:z[0-9]+]].s, [[OP2_LO]].h
; CHECK-NEXT: sdivr [[RES_HI_LO:z[0-9]+]].s, [[PG]]/m, [[OP2_HI_LO]].s, [[OP1_HI_LO]].s
; CHECK-NEXT: sunpkhi [[OP1_LO_HI:z[0-9]+]].s, z0.h
; CHECK-NEXT: sunpklo [[OP2_LO_LO:z[0-9]+]].s, z1.h
; CHECK-NEXT: sunpklo [[OP1_LO_LO:z[0-9]+]].s, z0.h
; CHECK-NEXT: sdiv [[RES_LO_HI:z[0-9]+]].s, [[PG]]/m, [[OP1_LO_HI]].s, [[OP2_LO_HI]].s
; CHECK-NEXT: sdiv [[RES_LO_LO:z[0-9]+]].s, [[PG]]/m, [[OP1_LO_LO]].s, [[OP2_LO_LO]].s
; CHECK-NEXT: uzp1 [[RES_HI:z[0-9]+]].h, [[RES_HI_LO]].h, [[RES_HI_HI]].h
; CHECK-NEXT: uzp1 [[RES_LO:z[0-9]+]].h, [[RES_LO_LO]].h, [[RES_LO_HI]].h
; CHECK-NEXT: uzp1 z0.b, [[RES_LO]].b, [[RES_HI]].b
; CHECK:    ret
  %res = sdiv <8 x i8> %op1, %op2
  ret <8 x i8> %res
}

define <16 x i8> @sdiv_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 {
; CHECK-LABEL: sdiv_v16i8:
; CHECK: sunpkhi [[OP2_HI:z[0-9]+]].h, z1.b
; CHECK-NEXT: sunpkhi [[OP1_HI:z[0-9]+]].h, z0.b
; CHECK-NEXT: ptrue [[PG:p[0-9]+]].s, vl[[#min(VBYTES,4)]]
; CHECK-NEXT: sunpklo [[OP2_LO:z[0-9]+]].h, z1.b
; CHECK-NEXT: sunpklo [[OP1_LO:z[0-9]+]].h, z0.b
; CHECK-NEXT: sunpkhi [[OP2_HI_HI:z[0-9]+]].s, [[OP2_HI]].h
; CHECK-NEXT: sunpkhi [[OP1_HI_HI:z[0-9]+]].s, [[OP1_HI]].h
; CHECK-NEXT: sunpklo [[OP2_HI_LO:z[0-9]+]].s, [[OP2_HI]].h
; CHECK-NEXT: sunpklo [[OP1_HI_LO:z[0-9]+]].s, [[OP1_HI]].h
; CHECK-NEXT: sdivr [[RES_HI_HI:z[0-9]+]].s, [[PG]]/m, [[OP2_HI_HI]].s, [[OP1_HI_HI]].s
; CHECK-NEXT: sunpkhi [[OP2_LO_HI:z[0-9]+]].s, [[OP2_LO]].h
; CHECK-NEXT: sdivr [[RES_HI_LO:z[0-9]+]].s, [[PG]]/m, [[OP2_HI_LO]].s, [[OP1_HI_LO]].s
; CHECK-NEXT: sunpkhi [[OP1_LO_HI:z[0-9]+]].s, z0.h
; CHECK-NEXT: sunpklo [[OP2_LO_LO:z[0-9]+]].s, z1.h
; CHECK-NEXT: sunpklo [[OP1_LO_LO:z[0-9]+]].s, z0.h
; CHECK-NEXT: sdiv [[RES_LO_HI:z[0-9]+]].s, [[PG]]/m, [[OP1_LO_HI]].s, [[OP2_LO_HI]].s
; CHECK-NEXT: sdiv [[RES_LO_LO:z[0-9]+]].s, [[PG]]/m, [[OP1_LO_LO]].s, [[OP2_LO_LO]].s
; CHECK-NEXT: uzp1 [[RES_HI:z[0-9]+]].h, [[RES_HI_LO]].h, [[RES_HI_HI]].h
; CHECK-NEXT: uzp1 [[RES_LO:z[0-9]+]].h, [[RES_LO_LO]].h, [[RES_LO_HI]].h
; CHECK-NEXT: uzp1 z0.b, [[RES_LO]].b, [[RES_HI]].b
; CHECK: ret
  %res = sdiv <16 x i8> %op1, %op2
  ret <16 x i8> %res
}

define void @sdiv_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 {
; CHECK-LABEL: sdiv_v32i8:
; CHECK: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,32)]]
; CHECK-NEXT: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
; CHECK-NEXT: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
; CHECK-NEXT: ptrue [[PG1:p[0-9]+]].s, vl[[#min(VBYTES,8)]]
; CHECK-NEXT: sunpkhi [[OP1_HI:z[0-9]+]].h, [[OP1]].b
; CHECK-NEXT: sunpkhi [[OP2_HI:z[0-9]+]].h, [[OP2]].b
; CHECK-NEXT: sunpklo [[OP2_LO:z[0-9]+]].h, [[OP2]].b
; CHECK-NEXT: sunpklo [[OP1_LO:z[0-9]+]].h, [[OP1]].b
; CHECK-NEXT: sunpkhi [[OP2_HI_HI:z[0-9]]].s, [[OP2_HI]].h
; CHECK-NEXT: sunpkhi [[OP1_HI_HI:z[0-9]]].s, [[OP1_HI]].h
; CHECK-NEXT: sunpklo [[OP2_HI_LO:z[0-9]+]].s, [[OP2_HI]].h
; CHECK-NEXT: sunpklo [[OP1_HI_LO:z[0-9]+]].s, [[OP1_HI]].h
; CHECK-NEXT: sdivr   [[RES_HI_HI:z[0-9]+]].s, [[PG1]]/m, [[OP2_HI_HI]].s, [[OP1_HI_HI]].s
; CHECK-NEXT: sunpkhi [[OP2_LO_HI:z[0-9]+]].s, [[OP2_LO]].h
; CHECK-NEXT: sdivr   [[RES_HI_LO:z[0-9]+]].s, [[PG1]]/m, [[OP2_HI_LO]].s, [[OP1_HI_LO]].s
; CHECK-NEXT: sunpkhi [[OP1_LO_HI:z[0-9]+]].s, [[OP1_LO]].h
; CHECK-NEXT: sunpklo [[OP2_LO_LO:z[0-9]+]].s, [[OP2_LO]].h
; CHECK-NEXT: sunpklo [[OP1_LO_LO:z[0-9]+]].s, [[OP1_LO]].h
; CHECK-NEXT: sdiv    [[RES_LO_HI:z[0-9]+]].s, [[PG1]]/m, [[OP1_LO_HI]].s, [[OP2_LO_HI]].s
; CHECK-NEXT: sdiv    [[RES_LO_LO:z[0-9]+]].s, [[PG1]]/m, [[OP1_LO_LO]].s, [[OP2_LO_LO]].s
; CHECK-NEXT: uzp1    [[RES_HI:z[0-9]+]].h, [[RES_HI_LO]].h, [[RES_HI_HI]].h
; CHECK-NEXT: uzp1    [[RES_LO:z[0-9]+]].h, [[RES_LO_LO]].h, [[RES_LO_HI]].h
; CHECK-NEXT: uzp1    [[RES:z[0-9]+]].b, [[RES_LO]].b, [[RES_HI]].b
; CHECK-NEXT: st1b    { [[RES]].b }, [[PG]], [x0]
; CHECK-NEXT: ret
  %op1 = load <32 x i8>, <32 x i8>* %a
  %op2 = load <32 x i8>, <32 x i8>* %b
  %res = sdiv <32 x i8> %op1, %op2
  store <32 x i8> %res, <32 x i8>* %a
  ret void
}

define void @sdiv_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 {
; CHECK-LABEL: sdiv_v64i8:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,64)]]
; VBITS_GE_512-NEXT: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
; VBITS_GE_512-NEXT: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
; VBITS_GE_512-NEXT: ptrue [[PG1:p[0-9]+]].s,  vl[[#min(VBYTES,16)]]
; VBITS_GE_512-NEXT: sunpkhi [[OP1_HI:z[0-9]+]].h, [[OP1]].b
; VBITS_GE_512-NEXT: sunpkhi [[OP2_HI:z[0-9]+]].h, [[OP2]].b
; VBITS_GE_512-NEXT: sunpklo [[OP2_LO:z[0-9]+]].h, [[OP2]].b
; VBITS_GE_512-NEXT: sunpklo [[OP1_LO:z[0-9]+]].h, [[OP1]].b
; VBITS_GE_512-NEXT: sunpkhi [[OP2_HI_HI:z[0-9]]].s, [[OP2_HI]].h
; VBITS_GE_512-NEXT: sunpkhi [[OP1_HI_HI:z[0-9]]].s, [[OP1_HI]].h
; VBITS_GE_512-NEXT: sunpklo [[OP2_HI_LO:z[0-9]+]].s, [[OP2_HI]].h
; VBITS_GE_512-NEXT: sunpklo [[OP1_HI_LO:z[0-9]+]].s, [[OP1_HI]].h
; VBITS_GE_512-NEXT: sdivr   [[RES_HI_HI:z[0-9]+]].s, [[PG1]]/m, [[OP2_HI_HI]].s, [[OP1_HI_HI]].s
; VBITS_GE_512-NEXT: sunpkhi [[OP2_LO_HI:z[0-9]+]].s, [[OP2_LO]].h
; VBITS_GE_512-NEXT: sdivr   [[RES_HI_LO:z[0-9]+]].s, [[PG1]]/m, [[OP2_HI_LO]].s, [[OP1_HI_LO]].s
; VBITS_GE_512-NEXT: sunpkhi [[OP1_LO_HI:z[0-9]+]].s, [[OP1_LO]].h
; VBITS_GE_512-NEXT: sunpklo [[OP2_LO_LO:z[0-9]+]].s, [[OP2_LO]].h
; VBITS_GE_512-NEXT: sunpklo [[OP1_LO_LO:z[0-9]+]].s, [[OP1_LO]].h
; VBITS_GE_512-NEXT: sdiv    [[RES_LO_HI:z[0-9]+]].s, [[PG1]]/m, [[OP1_LO_HI]].s, [[OP2_LO_HI]].s
; VBITS_GE_512-NEXT: sdiv    [[RES_LO_LO:z[0-9]+]].s, [[PG1]]/m, [[OP1_LO_LO]].s, [[OP2_LO_LO]].s
; VBITS_GE_512-NEXT: uzp1    [[RES_HI:z[0-9]+]].h, [[RES_HI_LO]].h, [[RES_HI_HI]].h
; VBITS_GE_512-NEXT: uzp1    [[RES_LO:z[0-9]+]].h, [[RES_LO_LO]].h, [[RES_LO_HI]].h
; VBITS_GE_512-NEXT: uzp1    [[RES:z[0-9]+]].b, [[RES_LO]].b, [[RES_HI]].b
; VBITS_GE_512-NEXT: st1b    { [[RES]].b }, [[PG]], [x0]
; VBITS_GE_512-NEXT: ret
  %op1 = load <64 x i8>, <64 x i8>* %a
  %op2 = load <64 x i8>, <64 x i8>* %b
  %res = sdiv <64 x i8> %op1, %op2
  store <64 x i8> %res, <64 x i8>* %a
  ret void
}

define void @sdiv_v128i8(<128 x i8>* %a, <128 x i8>* %b) #0 {
; CHECK-LABEL: sdiv_v128i8:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,128)]]
; VBITS_GE_1024-NEXT: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
; VBITS_GE_1024-NEXT: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
; VBITS_GE_1024-NEXT: ptrue [[PG1:p[0-9]+]].s,  vl[[#min(VBYTES,32)]]
; VBITS_GE_1024-NEXT: sunpkhi [[OP1_HI:z[0-9]+]].h, [[OP1]].b
; VBITS_GE_1024-NEXT: sunpkhi [[OP2_HI:z[0-9]+]].h, [[OP2]].b
; VBITS_GE_1024-NEXT: sunpklo [[OP2_LO:z[0-9]+]].h, [[OP2]].b
; VBITS_GE_1024-NEXT: sunpklo [[OP1_LO:z[0-9]+]].h, [[OP1]].b
; VBITS_GE_1024-NEXT: sunpkhi [[OP2_HI_HI:z[0-9]]].s, [[OP2_HI]].h
; VBITS_GE_1024-NEXT: sunpkhi [[OP1_HI_HI:z[0-9]]].s, [[OP1_HI]].h
; VBITS_GE_1024-NEXT: sunpklo [[OP2_HI_LO:z[0-9]+]].s, [[OP2_HI]].h
; VBITS_GE_1024-NEXT: sunpklo [[OP1_HI_LO:z[0-9]+]].s, [[OP1_HI]].h
; VBITS_GE_1024-NEXT: sdivr   [[RES_HI_HI:z[0-9]+]].s, [[PG1]]/m, [[OP2_HI_HI]].s, [[OP1_HI_HI]].s
; VBITS_GE_1024-NEXT: sunpkhi [[OP2_LO_HI:z[0-9]+]].s, [[OP2_LO]].h
; VBITS_GE_1024-NEXT: sdivr   [[RES_HI_LO:z[0-9]+]].s, [[PG1]]/m, [[OP2_HI_LO]].s, [[OP1_HI_LO]].s
; VBITS_GE_1024-NEXT: sunpkhi [[OP1_LO_HI:z[0-9]+]].s, [[OP1_LO]].h
; VBITS_GE_1024-NEXT: sunpklo [[OP2_LO_LO:z[0-9]+]].s, [[OP2_LO]].h
; VBITS_GE_1024-NEXT: sunpklo [[OP1_LO_LO:z[0-9]+]].s, [[OP1_LO]].h
; VBITS_GE_1024-NEXT: sdiv    [[RES_LO_HI:z[0-9]+]].s, [[PG1]]/m, [[OP1_LO_HI]].s, [[OP2_LO_HI]].s
; VBITS_GE_1024-NEXT: sdiv    [[RES_LO_LO:z[0-9]+]].s, [[PG1]]/m, [[OP1_LO_LO]].s, [[OP2_LO_LO]].s
; VBITS_GE_1024-NEXT: uzp1    [[RES_HI:z[0-9]+]].h, [[RES_HI_LO]].h, [[RES_HI_HI]].h
; VBITS_GE_1024-NEXT: uzp1    [[RES_LO:z[0-9]+]].h, [[RES_LO_LO]].h, [[RES_LO_HI]].h
; VBITS_GE_1024-NEXT: uzp1    [[RES:z[0-9]+]].b, [[RES_LO]].b, [[RES_HI]].b
; VBITS_GE_1024-NEXT: st1b    { [[RES]].b }, [[PG]], [x0]
; VBITS_GE_1024-NEXT: ret
  %op1 = load <128 x i8>, <128 x i8>* %a
  %op2 = load <128 x i8>, <128 x i8>* %b
  %res = sdiv <128 x i8> %op1, %op2
  store <128 x i8> %res, <128 x i8>* %a
  ret void
}

define void @sdiv_v256i8(<256 x i8>* %a, <256 x i8>* %b) #0 {
; CHECK-LABEL: sdiv_v256i8:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,256)]]
; VBITS_GE_2048-NEXT: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
; VBITS_GE_2048-NEXT: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].s, vl[[#min(VBYTES,64)]]
; VBITS_GE_2048-NEXT: sunpkhi [[OP1_HI:z[0-9]+]].h, [[OP1]].b
; VBITS_GE_2048-NEXT: sunpkhi [[OP2_HI:z[0-9]+]].h, [[OP2]].b
; VBITS_GE_2048-NEXT: sunpklo [[OP2_LO:z[0-9]+]].h, [[OP2]].b
; VBITS_GE_2048-NEXT: sunpklo [[OP1_LO:z[0-9]+]].h, [[OP1]].b
; VBITS_GE_2048-NEXT: sunpkhi [[OP2_HI_HI:z[0-9]]].s, [[OP2_HI]].h
; VBITS_GE_2048-NEXT: sunpkhi [[OP1_HI_HI:z[0-9]]].s, [[OP1_HI]].h
; VBITS_GE_2048-NEXT: sunpklo [[OP2_HI_LO:z[0-9]+]].s, [[OP2_HI]].h
; VBITS_GE_2048-NEXT: sunpklo [[OP1_HI_LO:z[0-9]+]].s, [[OP1_HI]].h
; VBITS_GE_2048-NEXT: sdivr   [[RES_HI_HI:z[0-9]+]].s, [[PG1]]/m, [[OP2_HI_HI]].s, [[OP1_HI_HI]].s
; VBITS_GE_2048-NEXT: sunpkhi [[OP2_LO_HI:z[0-9]+]].s, [[OP2_LO]].h
; VBITS_GE_2048-NEXT: sdivr   [[RES_HI_LO:z[0-9]+]].s, [[PG1]]/m, [[OP2_HI_LO]].s, [[OP1_HI_LO]].s
; VBITS_GE_2048-NEXT: sunpkhi [[OP1_LO_HI:z[0-9]+]].s, [[OP1_LO]].h
; VBITS_GE_2048-NEXT: sunpklo [[OP2_LO_LO:z[0-9]+]].s, [[OP2_LO]].h
; VBITS_GE_2048-NEXT: sunpklo [[OP1_LO_LO:z[0-9]+]].s, [[OP1_LO]].h
; VBITS_GE_2048-NEXT: sdiv    [[RES_LO_HI:z[0-9]+]].s, [[PG1]]/m, [[OP1_LO_HI]].s, [[OP2_LO_HI]].s
; VBITS_GE_2048-NEXT: sdiv    [[RES_LO_LO:z[0-9]+]].s, [[PG1]]/m, [[OP1_LO_LO]].s, [[OP2_LO_LO]].s
; VBITS_GE_2048-NEXT: uzp1    [[RES_HI:z[0-9]+]].h, [[RES_HI_LO]].h, [[RES_HI_HI]].h
; VBITS_GE_2048-NEXT: uzp1    [[RES_LO:z[0-9]+]].h, [[RES_LO_LO]].h, [[RES_LO_HI]].h
; VBITS_GE_2048-NEXT: uzp1    [[RES:z[0-9]+]].b, [[RES_LO]].b, [[RES_HI]].b
; VBITS_GE_2048-NEXT: st1b    { [[RES]].b }, [[PG]], [x0]
; VBITS_GE_2048-NEXT: ret
  %op1 = load <256 x i8>, <256 x i8>* %a
  %op2 = load <256 x i8>, <256 x i8>* %b
  %res = sdiv <256 x i8> %op1, %op2
  store <256 x i8> %res, <256 x i8>* %a
  ret void
}

; Vector vXi16 sdiv are not legal for NEON so use SVE when available.
define <4 x i16> @sdiv_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 {
; CHECK-LABEL: sdiv_v4i16:
; CHECK: sunpkhi [[OP2_HI:z[0-9]+]].s, z1.h
; CHECK-NEXT: sunpkhi [[OP1_HI:z[0-9]+]].s, z0.h
; CHECK-NEXT: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,2),2)]]
; CHECK-NEXT: sunpklo [[OP2_LO:z[0-9]+]].s, z1.h
; CHECK-NEXT: sunpklo [[OP1_LO:z[0-9]+]].s, z0.h
; CHECK-NEXT: sdivr   [[RES_HI:z[0-9]+]].s, [[PG]]/m, [[OP2_HI]].s, [[OP1_HI]].s
; CHECK-NEXT: sdiv    [[RES_LO:z[0-9]+]].s, [[PG]]/m, [[OP1_LO]].s, [[OP2_LO]].s
; CHECK-NEXT: uzp1 [[RES:z[0-9]+]].h, [[RES_LO]].h, [[RES_HI]].h
; CHECK-NEXT: ret
  %res = sdiv <4 x i16> %op1, %op2
  ret <4 x i16> %res
}

define <8 x i16> @sdiv_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 {
; CHECK-LABEL: sdiv_v8i16:
; CHECK: sunpkhi [[OP2_HI:z[0-9]+]].s, z1.h
; CHECK-NEXT: sunpkhi [[OP1_HI:z[0-9]+]].s, z0.h
; CHECK-NEXT: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,2),4)]]
; CHECK-NEXT: sunpklo [[OP2_LO:z[0-9]+]].s, z1.h
; CHECK-NEXT: sunpklo [[OP1_LO:z[0-9]+]].s, z0.h
; CHECK-NEXT: sdivr   [[RES_HI:z[0-9]+]].s, [[PG]]/m, [[OP2_HI]].s, [[OP1_HI]].s
; CHECK-NEXT: sdiv    [[RES_LO:z[0-9]+]].s, [[PG]]/m, [[OP1_LO]].s, [[OP2_LO]].s
; CHECK-NEXT: uzp1 [[RES:z[0-9]+]].h, [[RES_LO]].h, [[RES_HI]].h
; CHECK-NEXT: ret
  %res = sdiv <8 x i16> %op1, %op2
  ret <8 x i16> %res
}

define void @sdiv_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 {
; CHECK-LABEL: sdiv_v16i16:
; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),16)]]
; CHECK-NEXT: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
; CHECK-NEXT: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
; CHECK-NEXT: ptrue [[PG1:p[0-9]+]].s, vl[[#min(div(VBYTES,2),8)]]
; CHECK-NEXT: sunpkhi [[OP1_HI:z[0-9]+]].s, [[OP1]].h
; CHECK-NEXT: sunpkhi [[OP2_HI:z[0-9]+]].s, [[OP2]].h
; CHECK-NEXT: sunpklo [[OP2_LO:z[0-9]+]].s, [[OP2]].h
; CHECK-NEXT: sunpklo [[OP1_LO:z[0-9]+]].s, [[OP1]].h
; CHECK-NEXT: sdivr   [[RES_HI:z[0-9]+]].s, [[PG1]]/m, [[OP2_HI]].s, [[OP1_HI]].s
; CHECK-NEXT: sdiv    [[RES_LO:z[0-9]+]].s, [[PG1]]/m, [[OP1_LO]].s, [[OP2_LO]].s
; CHECK-NEXT: uzp1 [[RES:z[0-9]+]].h, [[RES_LO]].h, [[RES_HI]].h
; CHECK-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
; CHECK-NEXT: ret
  %op1 = load <16 x i16>, <16 x i16>* %a
  %op2 = load <16 x i16>, <16 x i16>* %b
  %res = sdiv <16 x i16> %op1, %op2
  store <16 x i16> %res, <16 x i16>* %a
  ret void
}

define void @sdiv_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 {
; CHECK-LABEL: sdiv_v32i16:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),32)]]
; VBITS_GE_512-NEXT: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_512-NEXT: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
; VBITS_GE_512-NEXT: ptrue [[PG1:p[0-9]+]].s, vl[[#min(div(VBYTES,2),16)]]
; VBITS_GE_512-NEXT: sunpkhi [[OP1_HI:z[0-9]+]].s, [[OP1]].h
; VBITS_GE_512-NEXT: sunpkhi [[OP2_HI:z[0-9]+]].s, [[OP2]].h
; VBITS_GE_512-NEXT: sunpklo [[OP2_LO:z[0-9]+]].s, [[OP2]].h
; VBITS_GE_512-NEXT: sunpklo [[OP1_LO:z[0-9]+]].s, [[OP1]].h
; VBITS_GE_512-NEXT: sdivr   [[RES_HI:z[0-9]+]].s, [[PG1]]/m, [[OP2_HI]].s, [[OP1_HI]].s
; VBITS_GE_512-NEXT: sdiv    [[RES_LO:z[0-9]+]].s, [[PG1]]/m, [[OP1_LO]].s, [[OP2_LO]].s
; VBITS_GE_512-NEXT: uzp1 [[RES:z[0-9]+]].h, [[RES_LO]].h, [[RES_HI]].h
; VBITS_GE_512-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
; VBITS_GE_512-NEXT: ret
  %op1 = load <32 x i16>, <32 x i16>* %a
  %op2 = load <32 x i16>, <32 x i16>* %b
  %res = sdiv <32 x i16> %op1, %op2
  store <32 x i16> %res, <32 x i16>* %a
  ret void
}

define void @sdiv_v64i16(<64 x i16>* %a, <64 x i16>* %b) #0 {
; CHECK-LABEL: sdiv_v64i16:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),64)]]
; VBITS_GE_1024-NEXT: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_1024-NEXT: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
; VBITS_GE_1024-NEXT: ptrue [[PG1:p[0-9]+]].s, vl[[#min(div(VBYTES,2),32)]]
; VBITS_GE_1024-NEXT: sunpkhi [[OP1_HI:z[0-9]+]].s, [[OP1]].h
; VBITS_GE_1024-NEXT: sunpkhi [[OP2_HI:z[0-9]+]].s, [[OP2]].h
; VBITS_GE_1024-NEXT: sunpklo [[OP2_LO:z[0-9]+]].s, [[OP2]].h
; VBITS_GE_1024-NEXT: sunpklo [[OP1_LO:z[0-9]+]].s, [[OP1]].h
; VBITS_GE_1024-NEXT: sdivr   [[RES_HI:z[0-9]+]].s, [[PG1]]/m, [[OP2_HI]].s, [[OP1_HI]].s
; VBITS_GE_1024-NEXT: sdiv    [[RES_LO:z[0-9]+]].s, [[PG1]]/m, [[OP1_LO]].s, [[OP2_LO]].s
; VBITS_GE_1024-NEXT: uzp1 [[RES:z[0-9]+]].h, [[RES_LO]].h, [[RES_HI]].h
; VBITS_GE_1024-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
; VBITS_GE_1024-NEXT: ret
  %op1 = load <64 x i16>, <64 x i16>* %a
  %op2 = load <64 x i16>, <64 x i16>* %b
  %res = sdiv <64 x i16> %op1, %op2
  store <64 x i16> %res, <64 x i16>* %a
  ret void
}

define void @sdiv_v128i16(<128 x i16>* %a, <128 x i16>* %b) #0 {
; CHECK-LABEL: sdiv_v128i16:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),128)]]
; VBITS_GE_2048-NEXT: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_2048-NEXT: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].s, vl[[#min(div(VBYTES,2),64)]]
; VBITS_GE_2048-NEXT: sunpkhi [[OP1_HI:z[0-9]+]].s, [[OP1]].h
; VBITS_GE_2048-NEXT: sunpkhi [[OP2_HI:z[0-9]+]].s, [[OP2]].h
; VBITS_GE_2048-NEXT: sunpklo [[OP2_LO:z[0-9]+]].s, [[OP2]].h
; VBITS_GE_2048-NEXT: sunpklo [[OP1_LO:z[0-9]+]].s, [[OP1]].h
; VBITS_GE_2048-NEXT: sdivr   [[RES_HI:z[0-9]+]].s, [[PG1]]/m, [[OP2_HI]].s, [[OP1_HI]].s
; VBITS_GE_2048-NEXT: sdiv    [[RES_LO:z[0-9]+]].s, [[PG1]]/m, [[OP1_LO]].s, [[OP2_LO]].s
; VBITS_GE_2048-NEXT: uzp1 [[RES:z[0-9]+]].h, [[RES_LO]].h, [[RES_HI]].h
; VBITS_GE_2048-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
; VBITS_GE_2048-NEXT: ret
  %op1 = load <128 x i16>, <128 x i16>* %a
  %op2 = load <128 x i16>, <128 x i16>* %b
  %res = sdiv <128 x i16> %op1, %op2
  store <128 x i16> %res, <128 x i16>* %a
  ret void
}

; Vector v2i32 sdiv are not legal for NEON so use SVE when available.
define <2 x i32> @sdiv_v2i32(<2 x i32> %op1, <2 x i32> %op2) #0 {
; CHECK-LABEL: sdiv_v2i32:
; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),2)]]
; CHECK: sdiv z0.s, [[PG]]/m, z0.s, z1.s
; CHECK: ret
  %res = sdiv <2 x i32> %op1, %op2
  ret <2 x i32> %res
}

; Vector v4i32 sdiv are not legal for NEON so use SVE when available.
define <4 x i32> @sdiv_v4i32(<4 x i32> %op1, <4 x i32> %op2) #0 {
; CHECK-LABEL: sdiv_v4i32:
; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),4)]]
; CHECK: sdiv z0.s, [[PG]]/m, z0.s, z1.s
; CHECK: ret
  %res = sdiv <4 x i32> %op1, %op2
  ret <4 x i32> %res
}

define void @sdiv_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 {
; CHECK-LABEL: sdiv_v8i32:
; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),8)]]
; CHECK-NEXT: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
; CHECK-NEXT: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
; CHECK-NEXT: sdiv [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
; CHECK-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
; CHECK-NEXT: ret
  %op1 = load <8 x i32>, <8 x i32>* %a
  %op2 = load <8 x i32>, <8 x i32>* %b
  %res = sdiv <8 x i32> %op1, %op2
  store <8 x i32> %res, <8 x i32>* %a
  ret void
}

define void @sdiv_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 {
; CHECK-LABEL: sdiv_v16i32:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),16)]]
; VBITS_GE_512-NEXT: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_512-NEXT: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
; VBITS_GE_512-NEXT: sdiv [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
; VBITS_GE_512-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
; VBITS_GE_512-NEXT: ret
  %op1 = load <16 x i32>, <16 x i32>* %a
  %op2 = load <16 x i32>, <16 x i32>* %b
  %res = sdiv <16 x i32> %op1, %op2
  store <16 x i32> %res, <16 x i32>* %a
  ret void
}

define void @sdiv_v32i32(<32 x i32>* %a, <32 x i32>* %b) #0 {
; CHECK-LABEL: sdiv_v32i32:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),32)]]
; VBITS_GE_1024-NEXT: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_1024-NEXT: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
; VBITS_GE_1024-NEXT: sdiv [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
; VBITS_GE_1024-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
; VBITS_GE_1024-NEXT: ret
  %op1 = load <32 x i32>, <32 x i32>* %a
  %op2 = load <32 x i32>, <32 x i32>* %b
  %res = sdiv <32 x i32> %op1, %op2
  store <32 x i32> %res, <32 x i32>* %a
  ret void
}

define void @sdiv_v64i32(<64 x i32>* %a, <64 x i32>* %b) #0 {
; CHECK-LABEL: sdiv_v64i32:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),64)]]
; VBITS_GE_2048-NEXT: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_2048-NEXT: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
; VBITS_GE_2048-NEXT: sdiv [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
; VBITS_GE_2048-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
; VBITS_GE_2048-NEXT: ret
  %op1 = load <64 x i32>, <64 x i32>* %a
  %op2 = load <64 x i32>, <64 x i32>* %b
  %res = sdiv <64 x i32> %op1, %op2
  store <64 x i32> %res, <64 x i32>* %a
  ret void
}

; Vector i64 sdiv are not legal for NEON so use SVE when available.
define <1 x i64> @sdiv_v1i64(<1 x i64> %op1, <1 x i64> %op2) #0 {
; CHECK-LABEL: sdiv_v1i64:
; CHECK: ptrue [[PG:p[0-9]+]].d, vl1
; CHECK: sdiv z0.d, [[PG]]/m, z0.d, z1.d
; CHECK: ret
  %res = sdiv <1 x i64> %op1, %op2
  ret <1 x i64> %res
}

; Vector i64 sdiv are not legal for NEON so use SVE when available.
define <2 x i64> @sdiv_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 {
; CHECK-LABEL: sdiv_v2i64:
; CHECK: ptrue [[PG:p[0-9]+]].d, vl2
; CHECK: sdiv z0.d, [[PG]]/m, z0.d, z1.d
; CHECK: ret
  %res = sdiv <2 x i64> %op1, %op2
  ret <2 x i64> %res
}

define void @sdiv_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 {
; CHECK-LABEL: sdiv_v4i64:
; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),4)]]
; CHECK-NEXT: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
; CHECK-NEXT: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
; CHECK-NEXT: sdiv [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
; CHECK-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
; CHECK-NEXT: ret
  %op1 = load <4 x i64>, <4 x i64>* %a
  %op2 = load <4 x i64>, <4 x i64>* %b
  %res = sdiv <4 x i64> %op1, %op2
  store <4 x i64> %res, <4 x i64>* %a
  ret void
}

define void @sdiv_v8i64(<8 x i64>* %a, <8 x i64>* %b) #0 {
; CHECK-LABEL: sdiv_v8i64:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),8)]]
; VBITS_GE_512-NEXT: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_512-NEXT: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
; VBITS_GE_512-NEXT: sdiv [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
; VBITS_GE_512-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
; VBITS_GE_512-NEXT: ret
  %op1 = load <8 x i64>, <8 x i64>* %a
  %op2 = load <8 x i64>, <8 x i64>* %b
  %res = sdiv <8 x i64> %op1, %op2
  store <8 x i64> %res, <8 x i64>* %a
  ret void
}

define void @sdiv_v16i64(<16 x i64>* %a, <16 x i64>* %b) #0 {
; CHECK-LABEL: sdiv_v16i64:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),16)]]
; VBITS_GE_1024-NEXT: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_1024-NEXT: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
; VBITS_GE_1024-NEXT: sdiv [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
; VBITS_GE_1024-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
; VBITS_GE_1024-NEXT: ret
  %op1 = load <16 x i64>, <16 x i64>* %a
  %op2 = load <16 x i64>, <16 x i64>* %b
  %res = sdiv <16 x i64> %op1, %op2
  store <16 x i64> %res, <16 x i64>* %a
  ret void
}

define void @sdiv_v32i64(<32 x i64>* %a, <32 x i64>* %b) #0 {
; CHECK-LABEL: sdiv_v32i64:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),32)]]
; VBITS_GE_2048-NEXT: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_2048-NEXT: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
; VBITS_GE_2048-NEXT: sdiv [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
; VBITS_GE_2048-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
; VBITS_GE_2048-NEXT: ret
  %op1 = load <32 x i64>, <32 x i64>* %a
  %op2 = load <32 x i64>, <32 x i64>* %b
  %res = sdiv <32 x i64> %op1, %op2
  store <32 x i64> %res, <32 x i64>* %a
  ret void
}

;
; UDIV
;

; Vector vXi8 udiv are not legal for NEON so use SVE when available.
define <8 x i8> @udiv_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 {
; CHECK-LABEL: udiv_v8i8:
; CHECK: uunpkhi [[OP2_HI:z[0-9]+]].h, z1.b
; CHECK-NEXT: uunpkhi [[OP1_HI:z[0-9]+]].h, z0.b
; CHECK-NEXT: ptrue [[PG:p[0-9]+]].s, vl[[#min(VBYTES,2)]]
; CHECK-NEXT: uunpklo [[OP2_LO:z[0-9]+]].h, z1.b
; CHECK-NEXT: uunpklo [[OP1_LO:z[0-9]+]].h, z0.b
; CHECK-NEXT: uunpkhi [[OP2_HI_HI:z[0-9]+]].s, [[OP2_HI]].h
; CHECK-NEXT: uunpkhi [[OP1_HI_HI:z[0-9]+]].s, [[OP1_HI]].h
; CHECK-NEXT: uunpklo [[OP2_HI_LO:z[0-9]+]].s, [[OP2_HI]].h
; CHECK-NEXT: uunpklo [[OP1_HI_LO:z[0-9]+]].s, [[OP1_HI]].h
; CHECK-NEXT: udivr [[RES_HI_HI:z[0-9]+]].s, [[PG]]/m, [[OP2_HI_HI]].s, [[OP1_HI_HI]].s
; CHECK-NEXT: uunpkhi [[OP2_LO_HI:z[0-9]+]].s, [[OP2_LO]].h
; CHECK-NEXT: udivr [[RES_HI_LO:z[0-9]+]].s, [[PG]]/m, [[OP2_HI_LO]].s, [[OP1_HI_LO]].s
; CHECK-NEXT: uunpkhi [[OP1_LO_HI:z[0-9]+]].s, z0.h
; CHECK-NEXT: uunpklo [[OP2_LO_LO:z[0-9]+]].s, z1.h
; CHECK-NEXT: uunpklo [[OP1_LO_LO:z[0-9]+]].s, z0.h
; CHECK-NEXT: udiv [[RES_LO_HI:z[0-9]+]].s, [[PG]]/m, [[OP1_LO_HI]].s, [[OP2_LO_HI]].s
; CHECK-NEXT: udiv [[RES_LO_LO:z[0-9]+]].s, [[PG]]/m, [[OP1_LO_LO]].s, [[OP2_LO_LO]].s
; CHECK-NEXT: uzp1 [[RES_HI:z[0-9]+]].h, [[RES_HI_LO]].h, [[RES_HI_HI]].h
; CHECK-NEXT: uzp1 [[RES_LO:z[0-9]+]].h, [[RES_LO_LO]].h, [[RES_LO_HI]].h
; CHECK-NEXT: uzp1 z0.b, [[RES_LO]].b, [[RES_HI]].b
; CHECK:    ret
  %res = udiv <8 x i8> %op1, %op2
  ret <8 x i8> %res
}

define <16 x i8> @udiv_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 {
; CHECK-LABEL: udiv_v16i8:
; CHECK: uunpkhi [[OP2_HI:z[0-9]+]].h, z1.b
; CHECK-NEXT: uunpkhi [[OP1_HI:z[0-9]+]].h, z0.b
; CHECK-NEXT: ptrue [[PG:p[0-9]+]].s, vl[[#min(VBYTES,4)]]
; CHECK-NEXT: uunpklo [[OP2_LO:z[0-9]+]].h, z1.b
; CHECK-NEXT: uunpklo [[OP1_LO:z[0-9]+]].h, z0.b
; CHECK-NEXT: uunpkhi [[OP2_HI_HI:z[0-9]+]].s, [[OP2_HI]].h
; CHECK-NEXT: uunpkhi [[OP1_HI_HI:z[0-9]+]].s, [[OP1_HI]].h
; CHECK-NEXT: uunpklo [[OP2_HI_LO:z[0-9]+]].s, [[OP2_HI]].h
; CHECK-NEXT: uunpklo [[OP1_HI_LO:z[0-9]+]].s, [[OP1_HI]].h
; CHECK-NEXT: udivr [[RES_HI_HI:z[0-9]+]].s, [[PG]]/m, [[OP2_HI_HI]].s, [[OP1_HI_HI]].s
; CHECK-NEXT: uunpkhi [[OP2_LO_HI:z[0-9]+]].s, [[OP2_LO]].h
; CHECK-NEXT: udivr [[RES_HI_LO:z[0-9]+]].s, [[PG]]/m, [[OP2_HI_LO]].s, [[OP1_HI_LO]].s
; CHECK-NEXT: uunpkhi [[OP1_LO_HI:z[0-9]+]].s, z0.h
; CHECK-NEXT: uunpklo [[OP2_LO_LO:z[0-9]+]].s, z1.h
; CHECK-NEXT: uunpklo [[OP1_LO_LO:z[0-9]+]].s, z0.h
; CHECK-NEXT: udiv [[RES_LO_HI:z[0-9]+]].s, [[PG]]/m, [[OP1_LO_HI]].s, [[OP2_LO_HI]].s
; CHECK-NEXT: udiv [[RES_LO_LO:z[0-9]+]].s, [[PG]]/m, [[OP1_LO_LO]].s, [[OP2_LO_LO]].s
; CHECK-NEXT: uzp1 [[RES_HI:z[0-9]+]].h, [[RES_HI_LO]].h, [[RES_HI_HI]].h
; CHECK-NEXT: uzp1 [[RES_LO:z[0-9]+]].h, [[RES_LO_LO]].h, [[RES_LO_HI]].h
; CHECK-NEXT: uzp1 z0.b, [[RES_LO]].b, [[RES_HI]].b
; CHECK: ret
  %res = udiv <16 x i8> %op1, %op2
  ret <16 x i8> %res
}

define void @udiv_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 {
; CHECK-LABEL: udiv_v32i8:
; CHECK: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,32)]]
; CHECK-NEXT: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
; CHECK-NEXT: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
; CHECK-NEXT: ptrue [[PG1:p[0-9]+]].s, vl[[#min(VBYTES,8)]]
; CHECK-NEXT: uunpkhi [[OP1_HI:z[0-9]+]].h, [[OP1]].b
; CHECK-NEXT: uunpkhi [[OP2_HI:z[0-9]+]].h, [[OP2]].b
; CHECK-NEXT: uunpklo [[OP2_LO:z[0-9]+]].h, [[OP2]].b
; CHECK-NEXT: uunpklo [[OP1_LO:z[0-9]+]].h, [[OP1]].b
; CHECK-NEXT: uunpkhi [[OP2_HI_HI:z[0-9]]].s, [[OP2_HI]].h
; CHECK-NEXT: uunpkhi [[OP1_HI_HI:z[0-9]]].s, [[OP1_HI]].h
; CHECK-NEXT: uunpklo [[OP2_HI_LO:z[0-9]+]].s, [[OP2_HI]].h
; CHECK-NEXT: uunpklo [[OP1_HI_LO:z[0-9]+]].s, [[OP1_HI]].h
; CHECK-NEXT: udivr   [[RES_HI_HI:z[0-9]+]].s, [[PG1]]/m, [[OP2_HI_HI]].s, [[OP1_HI_HI]].s
; CHECK-NEXT: uunpkhi [[OP2_LO_HI:z[0-9]+]].s, [[OP2_LO]].h
; CHECK-NEXT: udivr   [[RES_HI_LO:z[0-9]+]].s, [[PG1]]/m, [[OP2_HI_LO]].s, [[OP1_HI_LO]].s
; CHECK-NEXT: uunpkhi [[OP1_LO_HI:z[0-9]+]].s, [[OP1_LO]].h
; CHECK-NEXT: uunpklo [[OP2_LO_LO:z[0-9]+]].s, [[OP2_LO]].h
; CHECK-NEXT: uunpklo [[OP1_LO_LO:z[0-9]+]].s, [[OP1_LO]].h
; CHECK-NEXT: udiv    [[RES_LO_HI:z[0-9]+]].s, [[PG1]]/m, [[OP1_LO_HI]].s, [[OP2_LO_HI]].s
; CHECK-NEXT: udiv    [[RES_LO_LO:z[0-9]+]].s, [[PG1]]/m, [[OP1_LO_LO]].s, [[OP2_LO_LO]].s
; CHECK-NEXT: uzp1    [[RES_HI:z[0-9]+]].h, [[RES_HI_LO]].h, [[RES_HI_HI]].h
; CHECK-NEXT: uzp1    [[RES_LO:z[0-9]+]].h, [[RES_LO_LO]].h, [[RES_LO_HI]].h
; CHECK-NEXT: uzp1    [[RES:z[0-9]+]].b, [[RES_LO]].b, [[RES_HI]].b
; CHECK-NEXT: st1b    { [[RES]].b }, [[PG]], [x0]
; CHECK-NEXT: ret
  %op1 = load <32 x i8>, <32 x i8>* %a
  %op2 = load <32 x i8>, <32 x i8>* %b
  %res = udiv <32 x i8> %op1, %op2
  store <32 x i8> %res, <32 x i8>* %a
  ret void
}

define void @udiv_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 {
; CHECK-LABEL: udiv_v64i8:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,64)]]
; VBITS_GE_512-NEXT: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
; VBITS_GE_512-NEXT: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
; VBITS_GE_512-NEXT: ptrue [[PG1:p[0-9]+]].s,  vl[[#min(VBYTES,16)]]
; VBITS_GE_512-NEXT: uunpkhi [[OP1_HI:z[0-9]+]].h, [[OP1]].b
; VBITS_GE_512-NEXT: uunpkhi [[OP2_HI:z[0-9]+]].h, [[OP2]].b
; VBITS_GE_512-NEXT: uunpklo [[OP2_LO:z[0-9]+]].h, [[OP2]].b
; VBITS_GE_512-NEXT: uunpklo [[OP1_LO:z[0-9]+]].h, [[OP1]].b
; VBITS_GE_512-NEXT: uunpkhi [[OP2_HI_HI:z[0-9]]].s, [[OP2_HI]].h
; VBITS_GE_512-NEXT: uunpkhi [[OP1_HI_HI:z[0-9]]].s, [[OP1_HI]].h
; VBITS_GE_512-NEXT: uunpklo [[OP2_HI_LO:z[0-9]+]].s, [[OP2_HI]].h
; VBITS_GE_512-NEXT: uunpklo [[OP1_HI_LO:z[0-9]+]].s, [[OP1_HI]].h
; VBITS_GE_512-NEXT: udivr   [[RES_HI_HI:z[0-9]+]].s, [[PG1]]/m, [[OP2_HI_HI]].s, [[OP1_HI_HI]].s
; VBITS_GE_512-NEXT: uunpkhi [[OP2_LO_HI:z[0-9]+]].s, [[OP2_LO]].h
; VBITS_GE_512-NEXT: udivr   [[RES_HI_LO:z[0-9]+]].s, [[PG1]]/m, [[OP2_HI_LO]].s, [[OP1_HI_LO]].s
; VBITS_GE_512-NEXT: uunpkhi [[OP1_LO_HI:z[0-9]+]].s, [[OP1_LO]].h
; VBITS_GE_512-NEXT: uunpklo [[OP2_LO_LO:z[0-9]+]].s, [[OP2_LO]].h
; VBITS_GE_512-NEXT: uunpklo [[OP1_LO_LO:z[0-9]+]].s, [[OP1_LO]].h
; VBITS_GE_512-NEXT: udiv    [[RES_LO_HI:z[0-9]+]].s, [[PG1]]/m, [[OP1_LO_HI]].s, [[OP2_LO_HI]].s
; VBITS_GE_512-NEXT: udiv    [[RES_LO_LO:z[0-9]+]].s, [[PG1]]/m, [[OP1_LO_LO]].s, [[OP2_LO_LO]].s
; VBITS_GE_512-NEXT: uzp1    [[RES_HI:z[0-9]+]].h, [[RES_HI_LO]].h, [[RES_HI_HI]].h
; VBITS_GE_512-NEXT: uzp1    [[RES_LO:z[0-9]+]].h, [[RES_LO_LO]].h, [[RES_LO_HI]].h
; VBITS_GE_512-NEXT: uzp1    [[RES:z[0-9]+]].b, [[RES_LO]].b, [[RES_HI]].b
; VBITS_GE_512-NEXT: st1b    { [[RES]].b }, [[PG]], [x0]
; VBITS_GE_512-NEXT: ret
  %op1 = load <64 x i8>, <64 x i8>* %a
  %op2 = load <64 x i8>, <64 x i8>* %b
  %res = udiv <64 x i8> %op1, %op2
  store <64 x i8> %res, <64 x i8>* %a
  ret void
}

; Unsigned divide of 128 x i8, same widening strategy as udiv_v64i8 but only
; checked for >= 1024-bit vectors.
; NOTE: the HI_HI captures previously used "z[0-9]" (single digit only); fixed
; to "z[0-9]+" so registers z10-z31 also match, as in the other captures.
define void @udiv_v128i8(<128 x i8>* %a, <128 x i8>* %b) #0 {
; CHECK-LABEL: udiv_v128i8:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,128)]]
; VBITS_GE_1024-NEXT: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
; VBITS_GE_1024-NEXT: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
; VBITS_GE_1024-NEXT: ptrue [[PG1:p[0-9]+]].s, vl[[#min(VBYTES,32)]]
; VBITS_GE_1024-NEXT: uunpkhi [[OP1_HI:z[0-9]+]].h, [[OP1]].b
; VBITS_GE_1024-NEXT: uunpkhi [[OP2_HI:z[0-9]+]].h, [[OP2]].b
; VBITS_GE_1024-NEXT: uunpklo [[OP2_LO:z[0-9]+]].h, [[OP2]].b
; VBITS_GE_1024-NEXT: uunpklo [[OP1_LO:z[0-9]+]].h, [[OP1]].b
; VBITS_GE_1024-NEXT: uunpkhi [[OP2_HI_HI:z[0-9]+]].s, [[OP2_HI]].h
; VBITS_GE_1024-NEXT: uunpkhi [[OP1_HI_HI:z[0-9]+]].s, [[OP1_HI]].h
; VBITS_GE_1024-NEXT: uunpklo [[OP2_HI_LO:z[0-9]+]].s, [[OP2_HI]].h
; VBITS_GE_1024-NEXT: uunpklo [[OP1_HI_LO:z[0-9]+]].s, [[OP1_HI]].h
; VBITS_GE_1024-NEXT: udivr   [[RES_HI_HI:z[0-9]+]].s, [[PG1]]/m, [[OP2_HI_HI]].s, [[OP1_HI_HI]].s
; VBITS_GE_1024-NEXT: uunpkhi [[OP2_LO_HI:z[0-9]+]].s, [[OP2_LO]].h
; VBITS_GE_1024-NEXT: udivr   [[RES_HI_LO:z[0-9]+]].s, [[PG1]]/m, [[OP2_HI_LO]].s, [[OP1_HI_LO]].s
; VBITS_GE_1024-NEXT: uunpkhi [[OP1_LO_HI:z[0-9]+]].s, [[OP1_LO]].h
; VBITS_GE_1024-NEXT: uunpklo [[OP2_LO_LO:z[0-9]+]].s, [[OP2_LO]].h
; VBITS_GE_1024-NEXT: uunpklo [[OP1_LO_LO:z[0-9]+]].s, [[OP1_LO]].h
; VBITS_GE_1024-NEXT: udiv    [[RES_LO_HI:z[0-9]+]].s, [[PG1]]/m, [[OP1_LO_HI]].s, [[OP2_LO_HI]].s
; VBITS_GE_1024-NEXT: udiv    [[RES_LO_LO:z[0-9]+]].s, [[PG1]]/m, [[OP1_LO_LO]].s, [[OP2_LO_LO]].s
; VBITS_GE_1024-NEXT: uzp1    [[RES_HI:z[0-9]+]].h, [[RES_HI_LO]].h, [[RES_HI_HI]].h
; VBITS_GE_1024-NEXT: uzp1    [[RES_LO:z[0-9]+]].h, [[RES_LO_LO]].h, [[RES_LO_HI]].h
; VBITS_GE_1024-NEXT: uzp1    [[RES:z[0-9]+]].b, [[RES_LO]].b, [[RES_HI]].b
; VBITS_GE_1024-NEXT: st1b    { [[RES]].b }, [[PG]], [x0]
; VBITS_GE_1024-NEXT: ret
  %op1 = load <128 x i8>, <128 x i8>* %a
  %op2 = load <128 x i8>, <128 x i8>* %b
  %res = udiv <128 x i8> %op1, %op2
  store <128 x i8> %res, <128 x i8>* %a
  ret void
}
657
; Unsigned divide of 256 x i8, same widening strategy as udiv_v64i8; checked
; under the VBITS_GE_2048 prefix (the matching RUN line is presumably beyond
; the visible header).
; NOTE: the HI_HI captures previously used "z[0-9]" (single digit only); fixed
; to "z[0-9]+" so registers z10-z31 also match, as in the other captures.
define void @udiv_v256i8(<256 x i8>* %a, <256 x i8>* %b) #0 {
; CHECK-LABEL: udiv_v256i8:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,256)]]
; VBITS_GE_2048-NEXT: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
; VBITS_GE_2048-NEXT: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].s, vl[[#min(VBYTES,64)]]
; VBITS_GE_2048-NEXT: uunpkhi [[OP1_HI:z[0-9]+]].h, [[OP1]].b
; VBITS_GE_2048-NEXT: uunpkhi [[OP2_HI:z[0-9]+]].h, [[OP2]].b
; VBITS_GE_2048-NEXT: uunpklo [[OP2_LO:z[0-9]+]].h, [[OP2]].b
; VBITS_GE_2048-NEXT: uunpklo [[OP1_LO:z[0-9]+]].h, [[OP1]].b
; VBITS_GE_2048-NEXT: uunpkhi [[OP2_HI_HI:z[0-9]+]].s, [[OP2_HI]].h
; VBITS_GE_2048-NEXT: uunpkhi [[OP1_HI_HI:z[0-9]+]].s, [[OP1_HI]].h
; VBITS_GE_2048-NEXT: uunpklo [[OP2_HI_LO:z[0-9]+]].s, [[OP2_HI]].h
; VBITS_GE_2048-NEXT: uunpklo [[OP1_HI_LO:z[0-9]+]].s, [[OP1_HI]].h
; VBITS_GE_2048-NEXT: udivr   [[RES_HI_HI:z[0-9]+]].s, [[PG1]]/m, [[OP2_HI_HI]].s, [[OP1_HI_HI]].s
; VBITS_GE_2048-NEXT: uunpkhi [[OP2_LO_HI:z[0-9]+]].s, [[OP2_LO]].h
; VBITS_GE_2048-NEXT: udivr   [[RES_HI_LO:z[0-9]+]].s, [[PG1]]/m, [[OP2_HI_LO]].s, [[OP1_HI_LO]].s
; VBITS_GE_2048-NEXT: uunpkhi [[OP1_LO_HI:z[0-9]+]].s, [[OP1_LO]].h
; VBITS_GE_2048-NEXT: uunpklo [[OP2_LO_LO:z[0-9]+]].s, [[OP2_LO]].h
; VBITS_GE_2048-NEXT: uunpklo [[OP1_LO_LO:z[0-9]+]].s, [[OP1_LO]].h
; VBITS_GE_2048-NEXT: udiv    [[RES_LO_HI:z[0-9]+]].s, [[PG1]]/m, [[OP1_LO_HI]].s, [[OP2_LO_HI]].s
; VBITS_GE_2048-NEXT: udiv    [[RES_LO_LO:z[0-9]+]].s, [[PG1]]/m, [[OP1_LO_LO]].s, [[OP2_LO_LO]].s
; VBITS_GE_2048-NEXT: uzp1    [[RES_HI:z[0-9]+]].h, [[RES_HI_LO]].h, [[RES_HI_HI]].h
; VBITS_GE_2048-NEXT: uzp1    [[RES_LO:z[0-9]+]].h, [[RES_LO_LO]].h, [[RES_LO_HI]].h
; VBITS_GE_2048-NEXT: uzp1    [[RES:z[0-9]+]].b, [[RES_LO]].b, [[RES_HI]].b
; VBITS_GE_2048-NEXT: st1b    { [[RES]].b }, [[PG]], [x0]
; VBITS_GE_2048-NEXT: ret
  %op1 = load <256 x i8>, <256 x i8>* %a
  %op2 = load <256 x i8>, <256 x i8>* %b
  %res = udiv <256 x i8> %op1, %op2
  store <256 x i8> %res, <256 x i8>* %a
  ret void
}
691
; Vector vXi16 udiv are not legal for NEON so use SVE when available.
; i16 halves are widened to .s (the CHECK lines show uunpklo/uunpkhi before
; the divides), divided, then the two .s results are narrowed back with uzp1.
define <4 x i16> @udiv_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 {
; CHECK-LABEL: udiv_v4i16:
; CHECK: uunpkhi [[OP2_HI:z[0-9]+]].s, z1.h
; CHECK-NEXT: uunpkhi [[OP1_HI:z[0-9]+]].s, z0.h
; CHECK-NEXT: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,2),2)]]
; CHECK-NEXT: uunpklo [[OP2_LO:z[0-9]+]].s, z1.h
; CHECK-NEXT: uunpklo [[OP1_LO:z[0-9]+]].s, z0.h
; CHECK-NEXT: udivr   [[RES_HI:z[0-9]+]].s, [[PG]]/m, [[OP2_HI]].s, [[OP1_HI]].s
; CHECK-NEXT: udiv    [[RES_LO:z[0-9]+]].s, [[PG]]/m, [[OP1_LO]].s, [[OP2_LO]].s
; CHECK-NEXT: uzp1 [[RES:z[0-9]+]].h, [[RES_LO]].h, [[RES_HI]].h
; CHECK-NEXT: ret
  %res = udiv <4 x i16> %op1, %op2
  ret <4 x i16> %res
}
707
; Same widen-to-.s / divide / uzp1-narrow pattern as udiv_v4i16, for a full
; 128-bit NEON register's worth of i16 lanes.
define <8 x i16> @udiv_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 {
; CHECK-LABEL: udiv_v8i16:
; CHECK: uunpkhi [[OP2_HI:z[0-9]+]].s, z1.h
; CHECK-NEXT: uunpkhi [[OP1_HI:z[0-9]+]].s, z0.h
; CHECK-NEXT: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,2),4)]]
; CHECK-NEXT: uunpklo [[OP2_LO:z[0-9]+]].s, z1.h
; CHECK-NEXT: uunpklo [[OP1_LO:z[0-9]+]].s, z0.h
; CHECK-NEXT: udivr   [[RES_HI:z[0-9]+]].s, [[PG]]/m, [[OP2_HI]].s, [[OP1_HI]].s
; CHECK-NEXT: udiv    [[RES_LO:z[0-9]+]].s, [[PG]]/m, [[OP1_LO]].s, [[OP2_LO]].s
; CHECK-NEXT: uzp1 [[RES:z[0-9]+]].h, [[RES_LO]].h, [[RES_HI]].h
; CHECK-NEXT: ret
  %res = udiv <8 x i16> %op1, %op2
  ret <8 x i16> %res
}
722
; In-memory variant: the i16 data is loaded with ld1h under a .h predicate,
; widened to .s for the divide, narrowed with uzp1, and stored back to %a.
define void @udiv_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 {
; CHECK-LABEL: udiv_v16i16:
; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),16)]]
; CHECK-NEXT: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
; CHECK-NEXT: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
; CHECK-NEXT: ptrue [[PG1:p[0-9]+]].s, vl[[#min(div(VBYTES,2),8)]]
; CHECK-NEXT: uunpkhi [[OP1_HI:z[0-9]+]].s, [[OP1]].h
; CHECK-NEXT: uunpkhi [[OP2_HI:z[0-9]+]].s, [[OP2]].h
; CHECK-NEXT: uunpklo [[OP2_LO:z[0-9]+]].s, [[OP2]].h
; CHECK-NEXT: uunpklo [[OP1_LO:z[0-9]+]].s, [[OP1]].h
; CHECK-NEXT: udivr   [[RES_HI:z[0-9]+]].s, [[PG1]]/m, [[OP2_HI]].s, [[OP1_HI]].s
; CHECK-NEXT: udiv    [[RES_LO:z[0-9]+]].s, [[PG1]]/m, [[OP1_LO]].s, [[OP2_LO]].s
; CHECK-NEXT: uzp1 [[RES:z[0-9]+]].h, [[RES_LO]].h, [[RES_HI]].h
; CHECK-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
; CHECK-NEXT: ret
  %op1 = load <16 x i16>, <16 x i16>* %a
  %op2 = load <16 x i16>, <16 x i16>* %b
  %res = udiv <16 x i16> %op1, %op2
  store <16 x i16> %res, <16 x i16>* %a
  ret void
}
744
; As udiv_v16i16 but twice the data; only checked for >= 512-bit vectors.
define void @udiv_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 {
; CHECK-LABEL: udiv_v32i16:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),32)]]
; VBITS_GE_512-NEXT: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_512-NEXT: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
; VBITS_GE_512-NEXT: ptrue [[PG1:p[0-9]+]].s, vl[[#min(div(VBYTES,2),16)]]
; VBITS_GE_512-NEXT: uunpkhi [[OP1_HI:z[0-9]+]].s, [[OP1]].h
; VBITS_GE_512-NEXT: uunpkhi [[OP2_HI:z[0-9]+]].s, [[OP2]].h
; VBITS_GE_512-NEXT: uunpklo [[OP2_LO:z[0-9]+]].s, [[OP2]].h
; VBITS_GE_512-NEXT: uunpklo [[OP1_LO:z[0-9]+]].s, [[OP1]].h
; VBITS_GE_512-NEXT: udivr   [[RES_HI:z[0-9]+]].s, [[PG1]]/m, [[OP2_HI]].s, [[OP1_HI]].s
; VBITS_GE_512-NEXT: udiv    [[RES_LO:z[0-9]+]].s, [[PG1]]/m, [[OP1_LO]].s, [[OP2_LO]].s
; VBITS_GE_512-NEXT: uzp1 [[RES:z[0-9]+]].h, [[RES_LO]].h, [[RES_HI]].h
; VBITS_GE_512-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
; VBITS_GE_512-NEXT: ret
  %op1 = load <32 x i16>, <32 x i16>* %a
  %op2 = load <32 x i16>, <32 x i16>* %b
  %res = udiv <32 x i16> %op1, %op2
  store <32 x i16> %res, <32 x i16>* %a
  ret void
}
766
; As udiv_v16i16 but for 64 lanes; only checked for >= 1024-bit vectors.
define void @udiv_v64i16(<64 x i16>* %a, <64 x i16>* %b) #0 {
; CHECK-LABEL: udiv_v64i16:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),64)]]
; VBITS_GE_1024-NEXT: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_1024-NEXT: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
; VBITS_GE_1024-NEXT: ptrue [[PG1:p[0-9]+]].s, vl[[#min(div(VBYTES,2),32)]]
; VBITS_GE_1024-NEXT: uunpkhi [[OP1_HI:z[0-9]+]].s, [[OP1]].h
; VBITS_GE_1024-NEXT: uunpkhi [[OP2_HI:z[0-9]+]].s, [[OP2]].h
; VBITS_GE_1024-NEXT: uunpklo [[OP2_LO:z[0-9]+]].s, [[OP2]].h
; VBITS_GE_1024-NEXT: uunpklo [[OP1_LO:z[0-9]+]].s, [[OP1]].h
; VBITS_GE_1024-NEXT: udivr   [[RES_HI:z[0-9]+]].s, [[PG1]]/m, [[OP2_HI]].s, [[OP1_HI]].s
; VBITS_GE_1024-NEXT: udiv    [[RES_LO:z[0-9]+]].s, [[PG1]]/m, [[OP1_LO]].s, [[OP2_LO]].s
; VBITS_GE_1024-NEXT: uzp1 [[RES:z[0-9]+]].h, [[RES_LO]].h, [[RES_HI]].h
; VBITS_GE_1024-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
; VBITS_GE_1024-NEXT: ret
  %op1 = load <64 x i16>, <64 x i16>* %a
  %op2 = load <64 x i16>, <64 x i16>* %b
  %res = udiv <64 x i16> %op1, %op2
  store <64 x i16> %res, <64 x i16>* %a
  ret void
}
788
; As udiv_v16i16 but for 128 lanes; checked under the VBITS_GE_2048 prefix.
define void @udiv_v128i16(<128 x i16>* %a, <128 x i16>* %b) #0 {
; CHECK-LABEL: udiv_v128i16:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),128)]]
; VBITS_GE_2048-NEXT: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_2048-NEXT: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].s, vl[[#min(div(VBYTES,2),64)]]
; VBITS_GE_2048-NEXT: uunpkhi [[OP1_HI:z[0-9]+]].s, [[OP1]].h
; VBITS_GE_2048-NEXT: uunpkhi [[OP2_HI:z[0-9]+]].s, [[OP2]].h
; VBITS_GE_2048-NEXT: uunpklo [[OP2_LO:z[0-9]+]].s, [[OP2]].h
; VBITS_GE_2048-NEXT: uunpklo [[OP1_LO:z[0-9]+]].s, [[OP1]].h
; VBITS_GE_2048-NEXT: udivr   [[RES_HI:z[0-9]+]].s, [[PG1]]/m, [[OP2_HI]].s, [[OP1_HI]].s
; VBITS_GE_2048-NEXT: udiv    [[RES_LO:z[0-9]+]].s, [[PG1]]/m, [[OP1_LO]].s, [[OP2_LO]].s
; VBITS_GE_2048-NEXT: uzp1 [[RES:z[0-9]+]].h, [[RES_LO]].h, [[RES_HI]].h
; VBITS_GE_2048-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
; VBITS_GE_2048-NEXT: ret
  %op1 = load <128 x i16>, <128 x i16>* %a
  %op2 = load <128 x i16>, <128 x i16>* %b
  %res = udiv <128 x i16> %op1, %op2
  store <128 x i16> %res, <128 x i16>* %a
  ret void
}
810
; Vector v2i32 udiv are not legal for NEON so use SVE when available.
; udiv is available directly on .s elements, so a single predicated udiv is
; expected -- no widening/narrowing as in the i8/i16 cases above.
define <2 x i32> @udiv_v2i32(<2 x i32> %op1, <2 x i32> %op2) #0 {
; CHECK-LABEL: udiv_v2i32:
; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),2)]]
; CHECK: udiv z0.s, [[PG]]/m, z0.s, z1.s
; CHECK: ret
  %res = udiv <2 x i32> %op1, %op2
  ret <2 x i32> %res
}
820
; Vector v4i32 udiv are not legal for NEON so use SVE when available.
; Same direct .s udiv as udiv_v2i32, for four lanes.
define <4 x i32> @udiv_v4i32(<4 x i32> %op1, <4 x i32> %op2) #0 {
; CHECK-LABEL: udiv_v4i32:
; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),4)]]
; CHECK: udiv z0.s, [[PG]]/m, z0.s, z1.s
; CHECK: ret
  %res = udiv <4 x i32> %op1, %op2
  ret <4 x i32> %res
}
830
; In-memory i32 udiv: predicated ld1w / udiv / st1w, no widening needed.
define void @udiv_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 {
; CHECK-LABEL: udiv_v8i32:
; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),8)]]
; CHECK-NEXT: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
; CHECK-NEXT: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
; CHECK-NEXT: udiv [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
; CHECK-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
; CHECK-NEXT: ret
  %op1 = load <8 x i32>, <8 x i32>* %a
  %op2 = load <8 x i32>, <8 x i32>* %b
  %res = udiv <8 x i32> %op1, %op2
  store <8 x i32> %res, <8 x i32>* %a
  ret void
}
845
; As udiv_v8i32; only checked for >= 512-bit vectors.
define void @udiv_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 {
; CHECK-LABEL: udiv_v16i32:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),16)]]
; VBITS_GE_512-NEXT: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_512-NEXT: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
; VBITS_GE_512-NEXT: udiv [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
; VBITS_GE_512-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
; VBITS_GE_512-NEXT: ret
  %op1 = load <16 x i32>, <16 x i32>* %a
  %op2 = load <16 x i32>, <16 x i32>* %b
  %res = udiv <16 x i32> %op1, %op2
  store <16 x i32> %res, <16 x i32>* %a
  ret void
}
860
; As udiv_v8i32; only checked for >= 1024-bit vectors.
define void @udiv_v32i32(<32 x i32>* %a, <32 x i32>* %b) #0 {
; CHECK-LABEL: udiv_v32i32:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),32)]]
; VBITS_GE_1024-NEXT: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_1024-NEXT: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
; VBITS_GE_1024-NEXT: udiv [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
; VBITS_GE_1024-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
; VBITS_GE_1024-NEXT: ret
  %op1 = load <32 x i32>, <32 x i32>* %a
  %op2 = load <32 x i32>, <32 x i32>* %b
  %res = udiv <32 x i32> %op1, %op2
  store <32 x i32> %res, <32 x i32>* %a
  ret void
}
875
; As udiv_v8i32; checked under the VBITS_GE_2048 prefix.
define void @udiv_v64i32(<64 x i32>* %a, <64 x i32>* %b) #0 {
; CHECK-LABEL: udiv_v64i32:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),64)]]
; VBITS_GE_2048-NEXT: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_2048-NEXT: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
; VBITS_GE_2048-NEXT: udiv [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
; VBITS_GE_2048-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
; VBITS_GE_2048-NEXT: ret
  %op1 = load <64 x i32>, <64 x i32>* %a
  %op2 = load <64 x i32>, <64 x i32>* %b
  %res = udiv <64 x i32> %op1, %op2
  store <64 x i32> %res, <64 x i32>* %a
  ret void
}
890
; Vector i64 udiv are not legal for NEON so use SVE when available.
; A fixed vl1 predicate is expected here (element count is a constant, not
; derived from VBYTES).
define <1 x i64> @udiv_v1i64(<1 x i64> %op1, <1 x i64> %op2) #0 {
; CHECK-LABEL: udiv_v1i64:
; CHECK: ptrue [[PG:p[0-9]+]].d, vl1
; CHECK: udiv z0.d, [[PG]]/m, z0.d, z1.d
; CHECK: ret
  %res = udiv <1 x i64> %op1, %op2
  ret <1 x i64> %res
}
900
; Vector i64 udiv are not legal for NEON so use SVE when available.
; As udiv_v1i64 but with a fixed vl2 predicate.
define <2 x i64> @udiv_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 {
; CHECK-LABEL: udiv_v2i64:
; CHECK: ptrue [[PG:p[0-9]+]].d, vl2
; CHECK: udiv z0.d, [[PG]]/m, z0.d, z1.d
; CHECK: ret
  %res = udiv <2 x i64> %op1, %op2
  ret <2 x i64> %res
}
910
; In-memory i64 udiv: predicated ld1d / udiv / st1d.
define void @udiv_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 {
; CHECK-LABEL: udiv_v4i64:
; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),4)]]
; CHECK-NEXT: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
; CHECK-NEXT: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
; CHECK-NEXT: udiv [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
; CHECK-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
; CHECK-NEXT: ret
  %op1 = load <4 x i64>, <4 x i64>* %a
  %op2 = load <4 x i64>, <4 x i64>* %b
  %res = udiv <4 x i64> %op1, %op2
  store <4 x i64> %res, <4 x i64>* %a
  ret void
}
925
; As udiv_v4i64; only checked for >= 512-bit vectors.
define void @udiv_v8i64(<8 x i64>* %a, <8 x i64>* %b) #0 {
; CHECK-LABEL: udiv_v8i64:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),8)]]
; VBITS_GE_512-NEXT: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_512-NEXT: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
; VBITS_GE_512-NEXT: udiv [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
; VBITS_GE_512-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
; VBITS_GE_512-NEXT: ret
  %op1 = load <8 x i64>, <8 x i64>* %a
  %op2 = load <8 x i64>, <8 x i64>* %b
  %res = udiv <8 x i64> %op1, %op2
  store <8 x i64> %res, <8 x i64>* %a
  ret void
}
940
; As udiv_v4i64; only checked for >= 1024-bit vectors.
define void @udiv_v16i64(<16 x i64>* %a, <16 x i64>* %b) #0 {
; CHECK-LABEL: udiv_v16i64:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),16)]]
; VBITS_GE_1024-NEXT: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_1024-NEXT: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
; VBITS_GE_1024-NEXT: udiv [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
; VBITS_GE_1024-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
; VBITS_GE_1024-NEXT: ret
  %op1 = load <16 x i64>, <16 x i64>* %a
  %op2 = load <16 x i64>, <16 x i64>* %b
  %res = udiv <16 x i64> %op1, %op2
  store <16 x i64> %res, <16 x i64>* %a
  ret void
}
955
; As udiv_v4i64; checked under the VBITS_GE_2048 prefix.
define void @udiv_v32i64(<32 x i64>* %a, <32 x i64>* %b) #0 {
; CHECK-LABEL: udiv_v32i64:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),32)]]
; VBITS_GE_2048-NEXT: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_2048-NEXT: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
; VBITS_GE_2048-NEXT: udiv [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
; VBITS_GE_2048-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
; VBITS_GE_2048-NEXT: ret
  %op1 = load <32 x i64>, <32 x i64>* %a
  %op2 = load <32 x i64>, <32 x i64>* %b
  %res = udiv <32 x i64> %op1, %op2
  store <32 x i64> %res, <32 x i64>* %a
  ret void
}
970
; Every function above is tagged #0: SVE must be enabled for the expected
; code generation to apply.
attributes #0 = { "target-features"="+sve" }
972