; RUN: llc -aarch64-sve-vector-bits-min=128  -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=16  -check-prefix=NO_SVE
; RUN: llc -aarch64-sve-vector-bits-min=256  -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=32  -check-prefixes=CHECK,VBITS_EQ_256
; RUN: llc -aarch64-sve-vector-bits-min=384  -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=32  -check-prefixes=CHECK
; RUN: llc -aarch64-sve-vector-bits-min=512  -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=64  -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=640  -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=64  -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=768  -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=64  -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=896  -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=64  -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=1024 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1152 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1280 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1408 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1536 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1664 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1792 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1920 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=2048 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=256 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_GE_2048

target triple = "aarch64-unknown-linux-gnu"

; Don't use SVE when its registers are no bigger than NEON.
; NO_SVE-NOT: ptrue

;
; SDIV
;

; Vector vXi8 sdiv are not legal for NEON so use SVE when available.
; The checks mirror the lowering visible below: i8 elements are widened to i32
; in two sunpk steps, divided at .s width, then narrowed back with uzp1.
; NOTE: register captures use z[0-9]+ so they also match z10-z31.
define <8 x i8> @sdiv_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 {
; CHECK-LABEL: sdiv_v8i8:
; CHECK: sunpkhi [[OP2_HI:z[0-9]+]].h, z1.b
; CHECK-NEXT: sunpkhi [[OP1_HI:z[0-9]+]].h, z0.b
; CHECK-NEXT: ptrue [[PG:p[0-9]+]].s, vl[[#min(VBYTES,2)]]
; CHECK-NEXT: sunpklo [[OP2_LO:z[0-9]+]].h, z1.b
; CHECK-NEXT: sunpklo [[OP1_LO:z[0-9]+]].h, z0.b
; CHECK-NEXT: sunpkhi [[OP2_HI_HI:z[0-9]+]].s, [[OP2_HI]].h
; CHECK-NEXT: sunpkhi [[OP1_HI_HI:z[0-9]+]].s, [[OP1_HI]].h
; CHECK-NEXT: sunpklo [[OP2_HI_LO:z[0-9]+]].s, [[OP2_HI]].h
; CHECK-NEXT: sunpklo [[OP1_HI_LO:z[0-9]+]].s, [[OP1_HI]].h
; CHECK-NEXT: sdivr [[RES_HI_HI:z[0-9]+]].s, [[PG]]/m, [[OP2_HI_HI]].s, [[OP1_HI_HI]].s
; CHECK-NEXT: sunpkhi [[OP2_LO_HI:z[0-9]+]].s, [[OP2_LO]].h
; CHECK-NEXT: sdivr [[RES_HI_LO:z[0-9]+]].s, [[PG]]/m, [[OP2_HI_LO]].s, [[OP1_HI_LO]].s
; CHECK-NEXT: sunpkhi [[OP1_LO_HI:z[0-9]+]].s, z0.h
; CHECK-NEXT: sunpklo [[OP2_LO_LO:z[0-9]+]].s, z1.h
; CHECK-NEXT: sunpklo [[OP1_LO_LO:z[0-9]+]].s, z0.h
; CHECK-NEXT: sdiv [[RES_LO_HI:z[0-9]+]].s, [[PG]]/m, [[OP1_LO_HI]].s, [[OP2_LO_HI]].s
; CHECK-NEXT: sdiv [[RES_LO_LO:z[0-9]+]].s, [[PG]]/m, [[OP1_LO_LO]].s, [[OP2_LO_LO]].s
; CHECK-NEXT: uzp1 [[RES_HI:z[0-9]+]].h, [[RES_HI_LO]].h, [[RES_HI_HI]].h
; CHECK-NEXT: uzp1 [[RES_LO:z[0-9]+]].h, [[RES_LO_LO]].h, [[RES_LO_HI]].h
; CHECK-NEXT: uzp1 z0.b, [[RES_LO]].b, [[RES_HI]].b
; CHECK: ret
  %res = sdiv <8 x i8> %op1, %op2
  ret <8 x i8> %res
}

define <16 x i8> @sdiv_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 {
; CHECK-LABEL: sdiv_v16i8:
; CHECK: sunpkhi [[OP2_HI:z[0-9]+]].h, z1.b
; CHECK-NEXT: sunpkhi [[OP1_HI:z[0-9]+]].h, z0.b
; CHECK-NEXT: ptrue [[PG:p[0-9]+]].s, vl[[#min(VBYTES,4)]]
; CHECK-NEXT: sunpklo [[OP2_LO:z[0-9]+]].h, z1.b
; CHECK-NEXT: sunpklo [[OP1_LO:z[0-9]+]].h, z0.b
; CHECK-NEXT: sunpkhi [[OP2_HI_HI:z[0-9]+]].s, [[OP2_HI]].h
; CHECK-NEXT: sunpkhi [[OP1_HI_HI:z[0-9]+]].s, [[OP1_HI]].h
; CHECK-NEXT: sunpklo [[OP2_HI_LO:z[0-9]+]].s, [[OP2_HI]].h
; CHECK-NEXT: sunpklo [[OP1_HI_LO:z[0-9]+]].s, [[OP1_HI]].h
; CHECK-NEXT: sdivr [[RES_HI_HI:z[0-9]+]].s, [[PG]]/m, [[OP2_HI_HI]].s, [[OP1_HI_HI]].s
; CHECK-NEXT: sunpkhi [[OP2_LO_HI:z[0-9]+]].s, [[OP2_LO]].h
; CHECK-NEXT: sdivr [[RES_HI_LO:z[0-9]+]].s, [[PG]]/m, [[OP2_HI_LO]].s, [[OP1_HI_LO]].s
; CHECK-NEXT: sunpkhi [[OP1_LO_HI:z[0-9]+]].s, z0.h
; CHECK-NEXT: sunpklo [[OP2_LO_LO:z[0-9]+]].s, z1.h
; CHECK-NEXT: sunpklo [[OP1_LO_LO:z[0-9]+]].s, z0.h
; CHECK-NEXT: sdiv [[RES_LO_HI:z[0-9]+]].s, [[PG]]/m, [[OP1_LO_HI]].s, [[OP2_LO_HI]].s
; CHECK-NEXT: sdiv [[RES_LO_LO:z[0-9]+]].s, [[PG]]/m, [[OP1_LO_LO]].s, [[OP2_LO_LO]].s
; CHECK-NEXT: uzp1 [[RES_HI:z[0-9]+]].h, [[RES_HI_LO]].h, [[RES_HI_HI]].h
; CHECK-NEXT: uzp1 [[RES_LO:z[0-9]+]].h, [[RES_LO_LO]].h, [[RES_LO_HI]].h
; CHECK-NEXT: uzp1 z0.b, [[RES_LO]].b, [[RES_HI]].b
; CHECK: ret
  %res = sdiv <16 x i8> %op1, %op2
  ret <16 x i8> %res
}

define void @sdiv_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 {
; CHECK-LABEL: sdiv_v32i8:
; CHECK: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,32)]]
; CHECK-NEXT: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
; CHECK-NEXT: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
; CHECK-NEXT: ptrue [[PG1:p[0-9]+]].s, vl[[#min(VBYTES,8)]]
; CHECK-NEXT: sunpkhi [[OP1_HI:z[0-9]+]].h, [[OP1]].b
; CHECK-NEXT: sunpkhi [[OP2_HI:z[0-9]+]].h, [[OP2]].b
; CHECK-NEXT: sunpklo [[OP2_LO:z[0-9]+]].h, [[OP2]].b
; CHECK-NEXT: sunpklo [[OP1_LO:z[0-9]+]].h, [[OP1]].b
; CHECK-NEXT: sunpkhi [[OP2_HI_HI:z[0-9]+]].s, [[OP2_HI]].h
; CHECK-NEXT: sunpkhi [[OP1_HI_HI:z[0-9]+]].s, [[OP1_HI]].h
; CHECK-NEXT: sunpklo [[OP2_HI_LO:z[0-9]+]].s, [[OP2_HI]].h
; CHECK-NEXT: sunpklo [[OP1_HI_LO:z[0-9]+]].s, [[OP1_HI]].h
; CHECK-NEXT: sdivr [[RES_HI_HI:z[0-9]+]].s, [[PG1]]/m, [[OP2_HI_HI]].s, [[OP1_HI_HI]].s
; CHECK-NEXT: sunpkhi [[OP2_LO_HI:z[0-9]+]].s, [[OP2_LO]].h
; CHECK-NEXT: sdivr [[RES_HI_LO:z[0-9]+]].s, [[PG1]]/m, [[OP2_HI_LO]].s, [[OP1_HI_LO]].s
; CHECK-NEXT: sunpkhi [[OP1_LO_HI:z[0-9]+]].s, [[OP1_LO]].h
; CHECK-NEXT: sunpklo [[OP2_LO_LO:z[0-9]+]].s, [[OP2_LO]].h
; CHECK-NEXT: sunpklo [[OP1_LO_LO:z[0-9]+]].s, [[OP1_LO]].h
; CHECK-NEXT: sdiv [[RES_LO_HI:z[0-9]+]].s, [[PG1]]/m, [[OP1_LO_HI]].s, [[OP2_LO_HI]].s
; CHECK-NEXT: sdiv [[RES_LO_LO:z[0-9]+]].s, [[PG1]]/m, [[OP1_LO_LO]].s, [[OP2_LO_LO]].s
; CHECK-NEXT: uzp1 [[RES_HI:z[0-9]+]].h, [[RES_HI_LO]].h, [[RES_HI_HI]].h
; CHECK-NEXT: uzp1 [[RES_LO:z[0-9]+]].h, [[RES_LO_LO]].h, [[RES_LO_HI]].h
; CHECK-NEXT: uzp1 [[RES:z[0-9]+]].b, [[RES_LO]].b, [[RES_HI]].b
; CHECK-NEXT: st1b { [[RES]].b }, [[PG]], [x0]
; CHECK-NEXT: ret
  %op1 = load <32 x i8>, <32 x i8>* %a
  %op2 = load <32 x i8>, <32 x i8>* %b
  %res = sdiv <32 x i8> %op1, %op2
  store <32 x i8> %res, <32 x i8>* %a
  ret void
}

define void @sdiv_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 {
; CHECK-LABEL: sdiv_v64i8:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,64)]]
; VBITS_GE_512-NEXT: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
; VBITS_GE_512-NEXT: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
; VBITS_GE_512-NEXT: ptrue [[PG1:p[0-9]+]].s, vl[[#min(VBYTES,16)]]
; VBITS_GE_512-NEXT: sunpkhi [[OP1_HI:z[0-9]+]].h, [[OP1]].b
; VBITS_GE_512-NEXT: sunpkhi [[OP2_HI:z[0-9]+]].h, [[OP2]].b
; VBITS_GE_512-NEXT: sunpklo [[OP2_LO:z[0-9]+]].h, [[OP2]].b
; VBITS_GE_512-NEXT: sunpklo [[OP1_LO:z[0-9]+]].h, [[OP1]].b
; VBITS_GE_512-NEXT: sunpkhi [[OP2_HI_HI:z[0-9]+]].s, [[OP2_HI]].h
; VBITS_GE_512-NEXT: sunpkhi [[OP1_HI_HI:z[0-9]+]].s, [[OP1_HI]].h
; VBITS_GE_512-NEXT: sunpklo [[OP2_HI_LO:z[0-9]+]].s, [[OP2_HI]].h
; VBITS_GE_512-NEXT: sunpklo [[OP1_HI_LO:z[0-9]+]].s, [[OP1_HI]].h
; VBITS_GE_512-NEXT: sdivr [[RES_HI_HI:z[0-9]+]].s, [[PG1]]/m, [[OP2_HI_HI]].s, [[OP1_HI_HI]].s
; VBITS_GE_512-NEXT: sunpkhi [[OP2_LO_HI:z[0-9]+]].s, [[OP2_LO]].h
; VBITS_GE_512-NEXT: sdivr [[RES_HI_LO:z[0-9]+]].s, [[PG1]]/m, [[OP2_HI_LO]].s, [[OP1_HI_LO]].s
; VBITS_GE_512-NEXT: sunpkhi [[OP1_LO_HI:z[0-9]+]].s, [[OP1_LO]].h
; VBITS_GE_512-NEXT: sunpklo [[OP2_LO_LO:z[0-9]+]].s, [[OP2_LO]].h
; VBITS_GE_512-NEXT: sunpklo [[OP1_LO_LO:z[0-9]+]].s, [[OP1_LO]].h
; VBITS_GE_512-NEXT: sdiv [[RES_LO_HI:z[0-9]+]].s, [[PG1]]/m, [[OP1_LO_HI]].s, [[OP2_LO_HI]].s
; VBITS_GE_512-NEXT: sdiv [[RES_LO_LO:z[0-9]+]].s, [[PG1]]/m, [[OP1_LO_LO]].s, [[OP2_LO_LO]].s
; VBITS_GE_512-NEXT: uzp1 [[RES_HI:z[0-9]+]].h, [[RES_HI_LO]].h, [[RES_HI_HI]].h
; VBITS_GE_512-NEXT: uzp1 [[RES_LO:z[0-9]+]].h, [[RES_LO_LO]].h, [[RES_LO_HI]].h
; VBITS_GE_512-NEXT: uzp1 [[RES:z[0-9]+]].b, [[RES_LO]].b, [[RES_HI]].b
; VBITS_GE_512-NEXT: st1b { [[RES]].b }, [[PG]], [x0]
; VBITS_GE_512-NEXT: ret
  %op1 = load <64 x i8>, <64 x i8>* %a
  %op2 = load <64 x i8>, <64 x i8>* %b
  %res = sdiv <64 x i8> %op1, %op2
  store <64 x i8> %res, <64 x i8>* %a
  ret void
}

define void @sdiv_v128i8(<128 x i8>* %a, <128 x i8>* %b) #0 {
; CHECK-LABEL: sdiv_v128i8:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,128)]]
; VBITS_GE_1024-NEXT: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
; VBITS_GE_1024-NEXT: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
; VBITS_GE_1024-NEXT: ptrue [[PG1:p[0-9]+]].s, vl[[#min(VBYTES,32)]]
; VBITS_GE_1024-NEXT: sunpkhi [[OP1_HI:z[0-9]+]].h, [[OP1]].b
; VBITS_GE_1024-NEXT: sunpkhi [[OP2_HI:z[0-9]+]].h, [[OP2]].b
; VBITS_GE_1024-NEXT: sunpklo [[OP2_LO:z[0-9]+]].h, [[OP2]].b
; VBITS_GE_1024-NEXT: sunpklo [[OP1_LO:z[0-9]+]].h, [[OP1]].b
; VBITS_GE_1024-NEXT: sunpkhi [[OP2_HI_HI:z[0-9]+]].s, [[OP2_HI]].h
; VBITS_GE_1024-NEXT: sunpkhi [[OP1_HI_HI:z[0-9]+]].s, [[OP1_HI]].h
; VBITS_GE_1024-NEXT: sunpklo [[OP2_HI_LO:z[0-9]+]].s, [[OP2_HI]].h
; VBITS_GE_1024-NEXT: sunpklo [[OP1_HI_LO:z[0-9]+]].s, [[OP1_HI]].h
; VBITS_GE_1024-NEXT: sdivr [[RES_HI_HI:z[0-9]+]].s, [[PG1]]/m, [[OP2_HI_HI]].s, [[OP1_HI_HI]].s
; VBITS_GE_1024-NEXT: sunpkhi [[OP2_LO_HI:z[0-9]+]].s, [[OP2_LO]].h
; VBITS_GE_1024-NEXT: sdivr [[RES_HI_LO:z[0-9]+]].s, [[PG1]]/m, [[OP2_HI_LO]].s, [[OP1_HI_LO]].s
; VBITS_GE_1024-NEXT: sunpkhi [[OP1_LO_HI:z[0-9]+]].s, [[OP1_LO]].h
; VBITS_GE_1024-NEXT: sunpklo [[OP2_LO_LO:z[0-9]+]].s, [[OP2_LO]].h
; VBITS_GE_1024-NEXT: sunpklo [[OP1_LO_LO:z[0-9]+]].s, [[OP1_LO]].h
; VBITS_GE_1024-NEXT: sdiv [[RES_LO_HI:z[0-9]+]].s, [[PG1]]/m, [[OP1_LO_HI]].s, [[OP2_LO_HI]].s
; VBITS_GE_1024-NEXT: sdiv [[RES_LO_LO:z[0-9]+]].s, [[PG1]]/m, [[OP1_LO_LO]].s, [[OP2_LO_LO]].s
; VBITS_GE_1024-NEXT: uzp1 [[RES_HI:z[0-9]+]].h, [[RES_HI_LO]].h, [[RES_HI_HI]].h
; VBITS_GE_1024-NEXT: uzp1 [[RES_LO:z[0-9]+]].h, [[RES_LO_LO]].h, [[RES_LO_HI]].h
; VBITS_GE_1024-NEXT: uzp1 [[RES:z[0-9]+]].b, [[RES_LO]].b, [[RES_HI]].b
; VBITS_GE_1024-NEXT: st1b { [[RES]].b }, [[PG]], [x0]
; VBITS_GE_1024-NEXT: ret
  %op1 = load <128 x i8>, <128 x i8>* %a
  %op2 = load <128 x i8>, <128 x i8>* %b
  %res = sdiv <128 x i8> %op1, %op2
  store <128 x i8> %res, <128 x i8>* %a
  ret void
}

define void @sdiv_v256i8(<256 x i8>* %a, <256 x i8>* %b) #0 {
; CHECK-LABEL: sdiv_v256i8:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,256)]]
; VBITS_GE_2048-NEXT: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
; VBITS_GE_2048-NEXT: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].s, vl[[#min(VBYTES,64)]]
; VBITS_GE_2048-NEXT: sunpkhi [[OP1_HI:z[0-9]+]].h, [[OP1]].b
; VBITS_GE_2048-NEXT: sunpkhi [[OP2_HI:z[0-9]+]].h, [[OP2]].b
; VBITS_GE_2048-NEXT: sunpklo [[OP2_LO:z[0-9]+]].h, [[OP2]].b
; VBITS_GE_2048-NEXT: sunpklo [[OP1_LO:z[0-9]+]].h, [[OP1]].b
; VBITS_GE_2048-NEXT: sunpkhi [[OP2_HI_HI:z[0-9]+]].s, [[OP2_HI]].h
; VBITS_GE_2048-NEXT: sunpkhi [[OP1_HI_HI:z[0-9]+]].s, [[OP1_HI]].h
; VBITS_GE_2048-NEXT: sunpklo [[OP2_HI_LO:z[0-9]+]].s, [[OP2_HI]].h
; VBITS_GE_2048-NEXT: sunpklo [[OP1_HI_LO:z[0-9]+]].s, [[OP1_HI]].h
; VBITS_GE_2048-NEXT: sdivr [[RES_HI_HI:z[0-9]+]].s, [[PG1]]/m, [[OP2_HI_HI]].s, [[OP1_HI_HI]].s
; VBITS_GE_2048-NEXT: sunpkhi [[OP2_LO_HI:z[0-9]+]].s, [[OP2_LO]].h
; VBITS_GE_2048-NEXT: sdivr [[RES_HI_LO:z[0-9]+]].s, [[PG1]]/m, [[OP2_HI_LO]].s, [[OP1_HI_LO]].s
; VBITS_GE_2048-NEXT: sunpkhi [[OP1_LO_HI:z[0-9]+]].s, [[OP1_LO]].h
; VBITS_GE_2048-NEXT: sunpklo [[OP2_LO_LO:z[0-9]+]].s, [[OP2_LO]].h
; VBITS_GE_2048-NEXT: sunpklo [[OP1_LO_LO:z[0-9]+]].s, [[OP1_LO]].h
; VBITS_GE_2048-NEXT: sdiv [[RES_LO_HI:z[0-9]+]].s, [[PG1]]/m, [[OP1_LO_HI]].s, [[OP2_LO_HI]].s
; VBITS_GE_2048-NEXT: sdiv [[RES_LO_LO:z[0-9]+]].s, [[PG1]]/m, [[OP1_LO_LO]].s, [[OP2_LO_LO]].s
; VBITS_GE_2048-NEXT: uzp1 [[RES_HI:z[0-9]+]].h, [[RES_HI_LO]].h, [[RES_HI_HI]].h
; VBITS_GE_2048-NEXT: uzp1 [[RES_LO:z[0-9]+]].h, [[RES_LO_LO]].h, [[RES_LO_HI]].h
; VBITS_GE_2048-NEXT: uzp1 [[RES:z[0-9]+]].b, [[RES_LO]].b, [[RES_HI]].b
; VBITS_GE_2048-NEXT: st1b { [[RES]].b }, [[PG]], [x0]
; VBITS_GE_2048-NEXT: ret
  %op1 = load <256 x i8>, <256 x i8>* %a
  %op2 = load <256 x i8>, <256 x i8>* %b
  %res = sdiv <256 x i8> %op1, %op2
  store <256 x i8> %res, <256 x i8>* %a
  ret void
}

; Vector vXi16 sdiv are not legal for NEON so use SVE when available.
define <4 x i16> @sdiv_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 {
; CHECK-LABEL: sdiv_v4i16:
; CHECK: sunpkhi [[OP2_HI:z[0-9]+]].s, z1.h
; CHECK-NEXT: sunpkhi [[OP1_HI:z[0-9]+]].s, z0.h
; CHECK-NEXT: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,2),2)]]
; CHECK-NEXT: sunpklo [[OP2_LO:z[0-9]+]].s, z1.h
; CHECK-NEXT: sunpklo [[OP1_LO:z[0-9]+]].s, z0.h
; CHECK-NEXT: sdivr [[RES_HI:z[0-9]+]].s, [[PG]]/m, [[OP2_HI]].s, [[OP1_HI]].s
; CHECK-NEXT: sdiv [[RES_LO:z[0-9]+]].s, [[PG]]/m, [[OP1_LO]].s, [[OP2_LO]].s
; CHECK-NEXT: uzp1 [[RES:z[0-9]+]].h, [[RES_LO]].h, [[RES_HI]].h
; CHECK-NEXT: ret
  %res = sdiv <4 x i16> %op1, %op2
  ret <4 x i16> %res
}

define <8 x i16> @sdiv_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 {
; CHECK-LABEL: sdiv_v8i16:
; CHECK: sunpkhi [[OP2_HI:z[0-9]+]].s, z1.h
; CHECK-NEXT: sunpkhi [[OP1_HI:z[0-9]+]].s, z0.h
; CHECK-NEXT: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,2),4)]]
; CHECK-NEXT: sunpklo [[OP2_LO:z[0-9]+]].s, z1.h
; CHECK-NEXT: sunpklo [[OP1_LO:z[0-9]+]].s, z0.h
; CHECK-NEXT: sdivr [[RES_HI:z[0-9]+]].s, [[PG]]/m, [[OP2_HI]].s, [[OP1_HI]].s
; CHECK-NEXT: sdiv [[RES_LO:z[0-9]+]].s, [[PG]]/m, [[OP1_LO]].s, [[OP2_LO]].s
; CHECK-NEXT: uzp1 [[RES:z[0-9]+]].h, [[RES_LO]].h, [[RES_HI]].h
; CHECK-NEXT: ret
  %res = sdiv <8 x i16> %op1, %op2
  ret <8 x i16> %res
}

define void @sdiv_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 {
; CHECK-LABEL: sdiv_v16i16:
; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),16)]]
; CHECK-NEXT: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
; CHECK-NEXT: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
; CHECK-NEXT: ptrue [[PG1:p[0-9]+]].s, vl[[#min(div(VBYTES,2),8)]]
; CHECK-NEXT: sunpkhi [[OP1_HI:z[0-9]+]].s, [[OP1]].h
; CHECK-NEXT: sunpkhi [[OP2_HI:z[0-9]+]].s, [[OP2]].h
; CHECK-NEXT: sunpklo [[OP2_LO:z[0-9]+]].s, [[OP2]].h
; CHECK-NEXT: sunpklo [[OP1_LO:z[0-9]+]].s, [[OP1]].h
; CHECK-NEXT: sdivr [[RES_HI:z[0-9]+]].s, [[PG1]]/m, [[OP2_HI]].s, [[OP1_HI]].s
; CHECK-NEXT: sdiv [[RES_LO:z[0-9]+]].s, [[PG1]]/m, [[OP1_LO]].s, [[OP2_LO]].s
; CHECK-NEXT: uzp1 [[RES:z[0-9]+]].h, [[RES_LO]].h, [[RES_HI]].h
; CHECK-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
; CHECK-NEXT: ret
  %op1 = load <16 x i16>, <16 x i16>* %a
  %op2 = load <16 x i16>, <16 x i16>* %b
  %res = sdiv <16 x i16> %op1, %op2
  store <16 x i16> %res, <16 x i16>* %a
  ret void
}

define void @sdiv_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 {
; CHECK-LABEL: sdiv_v32i16:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),32)]]
; VBITS_GE_512-NEXT: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_512-NEXT: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
; VBITS_GE_512-NEXT: ptrue [[PG1:p[0-9]+]].s, vl[[#min(div(VBYTES,2),16)]]
; VBITS_GE_512-NEXT: sunpkhi [[OP1_HI:z[0-9]+]].s, [[OP1]].h
; VBITS_GE_512-NEXT: sunpkhi [[OP2_HI:z[0-9]+]].s, [[OP2]].h
; VBITS_GE_512-NEXT: sunpklo [[OP2_LO:z[0-9]+]].s, [[OP2]].h
; VBITS_GE_512-NEXT: sunpklo [[OP1_LO:z[0-9]+]].s, [[OP1]].h
; VBITS_GE_512-NEXT: sdivr [[RES_HI:z[0-9]+]].s, [[PG1]]/m, [[OP2_HI]].s, [[OP1_HI]].s
; VBITS_GE_512-NEXT: sdiv [[RES_LO:z[0-9]+]].s, [[PG1]]/m, [[OP1_LO]].s, [[OP2_LO]].s
; VBITS_GE_512-NEXT: uzp1 [[RES:z[0-9]+]].h, [[RES_LO]].h, [[RES_HI]].h
; VBITS_GE_512-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
; VBITS_GE_512-NEXT: ret
  %op1 = load <32 x i16>, <32 x i16>* %a
  %op2 = load <32 x i16>, <32 x i16>* %b
  %res = sdiv <32 x i16> %op1, %op2
  store <32 x i16> %res, <32 x i16>* %a
  ret void
}

define void @sdiv_v64i16(<64 x i16>* %a, <64 x i16>* %b) #0 {
; CHECK-LABEL: sdiv_v64i16:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),64)]]
; VBITS_GE_1024-NEXT: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_1024-NEXT: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
; VBITS_GE_1024-NEXT: ptrue [[PG1:p[0-9]+]].s, vl[[#min(div(VBYTES,2),32)]]
; VBITS_GE_1024-NEXT: sunpkhi [[OP1_HI:z[0-9]+]].s, [[OP1]].h
; VBITS_GE_1024-NEXT: sunpkhi [[OP2_HI:z[0-9]+]].s, [[OP2]].h
; VBITS_GE_1024-NEXT: sunpklo [[OP2_LO:z[0-9]+]].s, [[OP2]].h
; VBITS_GE_1024-NEXT: sunpklo [[OP1_LO:z[0-9]+]].s, [[OP1]].h
; VBITS_GE_1024-NEXT: sdivr [[RES_HI:z[0-9]+]].s, [[PG1]]/m, [[OP2_HI]].s, [[OP1_HI]].s
; VBITS_GE_1024-NEXT: sdiv [[RES_LO:z[0-9]+]].s, [[PG1]]/m, [[OP1_LO]].s, [[OP2_LO]].s
; VBITS_GE_1024-NEXT: uzp1 [[RES:z[0-9]+]].h, [[RES_LO]].h, [[RES_HI]].h
; VBITS_GE_1024-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
; VBITS_GE_1024-NEXT: ret
  %op1 = load <64 x i16>, <64 x i16>* %a
  %op2 = load <64 x i16>, <64 x i16>* %b
  %res = sdiv <64 x i16> %op1, %op2
  store <64 x i16> %res, <64 x i16>* %a
  ret void
}

define void @sdiv_v128i16(<128 x i16>* %a, <128 x i16>* %b) #0 {
; CHECK-LABEL: sdiv_v128i16:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),128)]]
; VBITS_GE_2048-NEXT: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_2048-NEXT: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].s, vl[[#min(div(VBYTES,2),64)]]
; VBITS_GE_2048-NEXT: sunpkhi [[OP1_HI:z[0-9]+]].s, [[OP1]].h
; VBITS_GE_2048-NEXT: sunpkhi [[OP2_HI:z[0-9]+]].s, [[OP2]].h
; VBITS_GE_2048-NEXT: sunpklo [[OP2_LO:z[0-9]+]].s, [[OP2]].h
; VBITS_GE_2048-NEXT: sunpklo [[OP1_LO:z[0-9]+]].s, [[OP1]].h
; VBITS_GE_2048-NEXT: sdivr [[RES_HI:z[0-9]+]].s, [[PG1]]/m, [[OP2_HI]].s, [[OP1_HI]].s
; VBITS_GE_2048-NEXT: sdiv [[RES_LO:z[0-9]+]].s, [[PG1]]/m, [[OP1_LO]].s, [[OP2_LO]].s
; VBITS_GE_2048-NEXT: uzp1 [[RES:z[0-9]+]].h, [[RES_LO]].h, [[RES_HI]].h
; VBITS_GE_2048-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
; VBITS_GE_2048-NEXT: ret
  %op1 = load <128 x i16>, <128 x i16>* %a
  %op2 = load <128 x i16>, <128 x i16>* %b
  %res = sdiv <128 x i16> %op1, %op2
  store <128 x i16> %res, <128 x i16>* %a
  ret void
}

; Vector v2i32 sdiv are not legal for NEON so use SVE when available.
define <2 x i32> @sdiv_v2i32(<2 x i32> %op1, <2 x i32> %op2) #0 {
; CHECK-LABEL: sdiv_v2i32:
; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),2)]]
; CHECK: sdiv z0.s, [[PG]]/m, z0.s, z1.s
; CHECK: ret
  %res = sdiv <2 x i32> %op1, %op2
  ret <2 x i32> %res
}

; Vector v4i32 sdiv are not legal for NEON so use SVE when available.
define <4 x i32> @sdiv_v4i32(<4 x i32> %op1, <4 x i32> %op2) #0 {
; CHECK-LABEL: sdiv_v4i32:
; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),4)]]
; CHECK: sdiv z0.s, [[PG]]/m, z0.s, z1.s
; CHECK: ret
  %res = sdiv <4 x i32> %op1, %op2
  ret <4 x i32> %res
}

define void @sdiv_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 {
; CHECK-LABEL: sdiv_v8i32:
; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),8)]]
; CHECK-NEXT: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
; CHECK-NEXT: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
; CHECK-NEXT: sdiv [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
; CHECK-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
; CHECK-NEXT: ret
  %op1 = load <8 x i32>, <8 x i32>* %a
  %op2 = load <8 x i32>, <8 x i32>* %b
  %res = sdiv <8 x i32> %op1, %op2
  store <8 x i32> %res, <8 x i32>* %a
  ret void
}

define void @sdiv_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 {
; CHECK-LABEL: sdiv_v16i32:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),16)]]
; VBITS_GE_512-NEXT: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_512-NEXT: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
; VBITS_GE_512-NEXT: sdiv [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
; VBITS_GE_512-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
; VBITS_GE_512-NEXT: ret
  %op1 = load <16 x i32>, <16 x i32>* %a
  %op2 = load <16 x i32>, <16 x i32>* %b
  %res = sdiv <16 x i32> %op1, %op2
  store <16 x i32> %res, <16 x i32>* %a
  ret void
}

define void @sdiv_v32i32(<32 x i32>* %a, <32 x i32>* %b) #0 {
; CHECK-LABEL: sdiv_v32i32:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),32)]]
; VBITS_GE_1024-NEXT: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_1024-NEXT: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
; VBITS_GE_1024-NEXT: sdiv [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
; VBITS_GE_1024-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
; VBITS_GE_1024-NEXT: ret
  %op1 = load <32 x i32>, <32 x i32>* %a
  %op2 = load <32 x i32>, <32 x i32>* %b
  %res = sdiv <32 x i32> %op1, %op2
  store <32 x i32> %res, <32 x i32>* %a
  ret void
}

define void @sdiv_v64i32(<64 x i32>* %a, <64 x i32>* %b) #0 {
; CHECK-LABEL: sdiv_v64i32:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),64)]]
; VBITS_GE_2048-NEXT: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_2048-NEXT: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
; VBITS_GE_2048-NEXT: sdiv [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
; VBITS_GE_2048-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
; VBITS_GE_2048-NEXT: ret
  %op1 = load <64 x i32>, <64 x i32>* %a
  %op2 = load <64 x i32>, <64 x i32>* %b
  %res = sdiv <64 x i32> %op1, %op2
  store <64 x i32> %res, <64 x i32>* %a
  ret void
}

; Vector i64 sdiv are not legal for NEON so use SVE when available.
define <1 x i64> @sdiv_v1i64(<1 x i64> %op1, <1 x i64> %op2) #0 {
; CHECK-LABEL: sdiv_v1i64:
; CHECK: ptrue [[PG:p[0-9]+]].d, vl1
; CHECK: sdiv z0.d, [[PG]]/m, z0.d, z1.d
; CHECK: ret
  %res = sdiv <1 x i64> %op1, %op2
  ret <1 x i64> %res
}

; Vector i64 sdiv are not legal for NEON so use SVE when available.
define <2 x i64> @sdiv_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 {
; CHECK-LABEL: sdiv_v2i64:
; CHECK: ptrue [[PG:p[0-9]+]].d, vl2
; CHECK: sdiv z0.d, [[PG]]/m, z0.d, z1.d
; CHECK: ret
  %res = sdiv <2 x i64> %op1, %op2
  ret <2 x i64> %res
}

define void @sdiv_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 {
; CHECK-LABEL: sdiv_v4i64:
; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),4)]]
; CHECK-NEXT: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
; CHECK-NEXT: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
; CHECK-NEXT: sdiv [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
; CHECK-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
; CHECK-NEXT: ret
  %op1 = load <4 x i64>, <4 x i64>* %a
  %op2 = load <4 x i64>, <4 x i64>* %b
  %res = sdiv <4 x i64> %op1, %op2
  store <4 x i64> %res, <4 x i64>* %a
  ret void
}

define void @sdiv_v8i64(<8 x i64>* %a, <8 x i64>* %b) #0 {
; CHECK-LABEL: sdiv_v8i64:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),8)]]
; VBITS_GE_512-NEXT: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_512-NEXT: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
; VBITS_GE_512-NEXT: sdiv [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
; VBITS_GE_512-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
; VBITS_GE_512-NEXT: ret
  %op1 = load <8 x i64>, <8 x i64>* %a
  %op2 = load <8 x i64>, <8 x i64>* %b
  %res = sdiv <8 x i64> %op1, %op2
  store <8 x i64> %res, <8 x i64>* %a
  ret void
}

define void @sdiv_v16i64(<16 x i64>* %a, <16 x i64>* %b) #0 {
; CHECK-LABEL: sdiv_v16i64:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),16)]]
; VBITS_GE_1024-NEXT: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_1024-NEXT: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
; VBITS_GE_1024-NEXT: sdiv [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
; VBITS_GE_1024-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
; VBITS_GE_1024-NEXT: ret
  %op1 = load <16 x i64>, <16 x i64>* %a
  %op2 = load <16 x i64>, <16 x i64>* %b
  %res = sdiv <16 x i64> %op1, %op2
  store <16 x i64> %res, <16 x i64>* %a
  ret void
}

define void @sdiv_v32i64(<32 x i64>* %a, <32 x i64>* %b) #0 {
; CHECK-LABEL: sdiv_v32i64:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),32)]]
; VBITS_GE_2048-NEXT: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_2048-NEXT: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
; VBITS_GE_2048-NEXT: sdiv [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
; VBITS_GE_2048-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
; VBITS_GE_2048-NEXT: ret
  %op1 = load <32 x i64>, <32 x i64>* %a
  %op2 = load <32 x i64>, <32 x i64>* %b
  %res = sdiv <32 x i64> %op1, %op2
  store <32 x i64> %res, <32 x i64>* %a
  ret void
}

;
; UDIV
;

; Vector vXi8 udiv are not legal for NEON so use SVE when available.
; Lowering mirrors the SDIV cases above with unsigned unpacks (uunpkhi/uunpklo)
; and udiv/udivr. NOTE: register captures use z[0-9]+ so they also match z10-z31.
define <8 x i8> @udiv_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 {
; CHECK-LABEL: udiv_v8i8:
; CHECK: uunpkhi [[OP2_HI:z[0-9]+]].h, z1.b
; CHECK-NEXT: uunpkhi [[OP1_HI:z[0-9]+]].h, z0.b
; CHECK-NEXT: ptrue [[PG:p[0-9]+]].s, vl[[#min(VBYTES,2)]]
; CHECK-NEXT: uunpklo [[OP2_LO:z[0-9]+]].h, z1.b
; CHECK-NEXT: uunpklo [[OP1_LO:z[0-9]+]].h, z0.b
; CHECK-NEXT: uunpkhi [[OP2_HI_HI:z[0-9]+]].s, [[OP2_HI]].h
; CHECK-NEXT: uunpkhi [[OP1_HI_HI:z[0-9]+]].s, [[OP1_HI]].h
; CHECK-NEXT: uunpklo [[OP2_HI_LO:z[0-9]+]].s, [[OP2_HI]].h
; CHECK-NEXT: uunpklo [[OP1_HI_LO:z[0-9]+]].s, [[OP1_HI]].h
; CHECK-NEXT: udivr [[RES_HI_HI:z[0-9]+]].s, [[PG]]/m, [[OP2_HI_HI]].s, [[OP1_HI_HI]].s
; CHECK-NEXT: uunpkhi [[OP2_LO_HI:z[0-9]+]].s, [[OP2_LO]].h
; CHECK-NEXT: udivr [[RES_HI_LO:z[0-9]+]].s, [[PG]]/m, [[OP2_HI_LO]].s, [[OP1_HI_LO]].s
; CHECK-NEXT: uunpkhi [[OP1_LO_HI:z[0-9]+]].s, z0.h
; CHECK-NEXT: uunpklo [[OP2_LO_LO:z[0-9]+]].s, z1.h
; CHECK-NEXT: uunpklo [[OP1_LO_LO:z[0-9]+]].s, z0.h
; CHECK-NEXT: udiv [[RES_LO_HI:z[0-9]+]].s, [[PG]]/m, [[OP1_LO_HI]].s, [[OP2_LO_HI]].s
; CHECK-NEXT: udiv [[RES_LO_LO:z[0-9]+]].s, [[PG]]/m, [[OP1_LO_LO]].s, [[OP2_LO_LO]].s
; CHECK-NEXT: uzp1 [[RES_HI:z[0-9]+]].h, [[RES_HI_LO]].h, [[RES_HI_HI]].h
; CHECK-NEXT: uzp1 [[RES_LO:z[0-9]+]].h, [[RES_LO_LO]].h, [[RES_LO_HI]].h
; CHECK-NEXT: uzp1 z0.b, [[RES_LO]].b, [[RES_HI]].b
; CHECK: ret
  %res = udiv <8 x i8> %op1, %op2
  ret <8 x i8> %res
}

define <16 x i8> @udiv_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 {
; CHECK-LABEL: udiv_v16i8:
; CHECK: uunpkhi [[OP2_HI:z[0-9]+]].h, z1.b
; CHECK-NEXT: uunpkhi [[OP1_HI:z[0-9]+]].h, z0.b
; CHECK-NEXT: ptrue [[PG:p[0-9]+]].s, vl[[#min(VBYTES,4)]]
; CHECK-NEXT: uunpklo [[OP2_LO:z[0-9]+]].h, z1.b
; CHECK-NEXT: uunpklo [[OP1_LO:z[0-9]+]].h, z0.b
; CHECK-NEXT: uunpkhi [[OP2_HI_HI:z[0-9]+]].s, [[OP2_HI]].h
; CHECK-NEXT: uunpkhi [[OP1_HI_HI:z[0-9]+]].s, [[OP1_HI]].h
; CHECK-NEXT: uunpklo [[OP2_HI_LO:z[0-9]+]].s, [[OP2_HI]].h
; CHECK-NEXT: uunpklo [[OP1_HI_LO:z[0-9]+]].s, [[OP1_HI]].h
; CHECK-NEXT: udivr [[RES_HI_HI:z[0-9]+]].s, [[PG]]/m, [[OP2_HI_HI]].s, [[OP1_HI_HI]].s
; CHECK-NEXT: uunpkhi [[OP2_LO_HI:z[0-9]+]].s, [[OP2_LO]].h
; CHECK-NEXT: udivr [[RES_HI_LO:z[0-9]+]].s, [[PG]]/m, [[OP2_HI_LO]].s, [[OP1_HI_LO]].s
; CHECK-NEXT: uunpkhi [[OP1_LO_HI:z[0-9]+]].s, z0.h
; CHECK-NEXT: uunpklo [[OP2_LO_LO:z[0-9]+]].s, z1.h
; CHECK-NEXT: uunpklo [[OP1_LO_LO:z[0-9]+]].s, z0.h
; CHECK-NEXT: udiv [[RES_LO_HI:z[0-9]+]].s, [[PG]]/m, [[OP1_LO_HI]].s, [[OP2_LO_HI]].s
; CHECK-NEXT: udiv [[RES_LO_LO:z[0-9]+]].s, [[PG]]/m, [[OP1_LO_LO]].s, [[OP2_LO_LO]].s
; CHECK-NEXT: uzp1 [[RES_HI:z[0-9]+]].h, [[RES_HI_LO]].h, [[RES_HI_HI]].h
; CHECK-NEXT: uzp1 [[RES_LO:z[0-9]+]].h, [[RES_LO_LO]].h, [[RES_LO_HI]].h
; CHECK-NEXT: uzp1 z0.b, [[RES_LO]].b, [[RES_HI]].b
; CHECK: ret
  %res = udiv <16 x i8> %op1, %op2
  ret <16 x i8> %res
}

define void @udiv_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 {
; CHECK-LABEL: udiv_v32i8:
; CHECK: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,32)]]
; CHECK-NEXT: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
; CHECK-NEXT: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
; CHECK-NEXT: ptrue [[PG1:p[0-9]+]].s, vl[[#min(VBYTES,8)]]
; CHECK-NEXT: uunpkhi [[OP1_HI:z[0-9]+]].h, [[OP1]].b
; CHECK-NEXT: uunpkhi [[OP2_HI:z[0-9]+]].h, [[OP2]].b
; CHECK-NEXT: uunpklo [[OP2_LO:z[0-9]+]].h, [[OP2]].b
; CHECK-NEXT: uunpklo [[OP1_LO:z[0-9]+]].h, [[OP1]].b
; CHECK-NEXT: uunpkhi [[OP2_HI_HI:z[0-9]+]].s, [[OP2_HI]].h
; CHECK-NEXT: uunpkhi [[OP1_HI_HI:z[0-9]+]].s, [[OP1_HI]].h
; CHECK-NEXT: uunpklo [[OP2_HI_LO:z[0-9]+]].s, [[OP2_HI]].h
; CHECK-NEXT: uunpklo [[OP1_HI_LO:z[0-9]+]].s, [[OP1_HI]].h
; CHECK-NEXT: udivr [[RES_HI_HI:z[0-9]+]].s, [[PG1]]/m, [[OP2_HI_HI]].s, [[OP1_HI_HI]].s
; CHECK-NEXT: uunpkhi [[OP2_LO_HI:z[0-9]+]].s, [[OP2_LO]].h
; CHECK-NEXT: udivr [[RES_HI_LO:z[0-9]+]].s, [[PG1]]/m, [[OP2_HI_LO]].s, [[OP1_HI_LO]].s
; CHECK-NEXT: uunpkhi [[OP1_LO_HI:z[0-9]+]].s, [[OP1_LO]].h
; CHECK-NEXT: uunpklo [[OP2_LO_LO:z[0-9]+]].s, [[OP2_LO]].h
; CHECK-NEXT: uunpklo [[OP1_LO_LO:z[0-9]+]].s, [[OP1_LO]].h
; CHECK-NEXT: udiv [[RES_LO_HI:z[0-9]+]].s, [[PG1]]/m, [[OP1_LO_HI]].s, [[OP2_LO_HI]].s
; CHECK-NEXT: udiv [[RES_LO_LO:z[0-9]+]].s, [[PG1]]/m, [[OP1_LO_LO]].s, [[OP2_LO_LO]].s
; CHECK-NEXT: uzp1 [[RES_HI:z[0-9]+]].h, [[RES_HI_LO]].h, [[RES_HI_HI]].h
; CHECK-NEXT: uzp1 [[RES_LO:z[0-9]+]].h, [[RES_LO_LO]].h, [[RES_LO_HI]].h
; CHECK-NEXT: uzp1 [[RES:z[0-9]+]].b, [[RES_LO]].b, [[RES_HI]].b
; CHECK-NEXT: st1b { [[RES]].b }, [[PG]], [x0]
; CHECK-NEXT: ret
  %op1 = load <32 x i8>, <32 x i8>* %a
  %op2 = load <32 x i8>, <32 x i8>* %b
  %res = udiv <32 x i8> %op1, %op2
  store <32 x i8> %res, <32 x i8>* %a
  ret void
}

define void @udiv_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 {
; CHECK-LABEL: udiv_v64i8:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,64)]]
; VBITS_GE_512-NEXT: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
; VBITS_GE_512-NEXT: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
; VBITS_GE_512-NEXT: ptrue [[PG1:p[0-9]+]].s, vl[[#min(VBYTES,16)]]
; VBITS_GE_512-NEXT: uunpkhi [[OP1_HI:z[0-9]+]].h, [[OP1]].b
; VBITS_GE_512-NEXT: uunpkhi [[OP2_HI:z[0-9]+]].h, [[OP2]].b
; VBITS_GE_512-NEXT: uunpklo [[OP2_LO:z[0-9]+]].h, [[OP2]].b
; VBITS_GE_512-NEXT: uunpklo [[OP1_LO:z[0-9]+]].h, [[OP1]].b
; VBITS_GE_512-NEXT: uunpkhi [[OP2_HI_HI:z[0-9]+]].s, [[OP2_HI]].h
; VBITS_GE_512-NEXT: uunpkhi [[OP1_HI_HI:z[0-9]+]].s, [[OP1_HI]].h
; VBITS_GE_512-NEXT: uunpklo [[OP2_HI_LO:z[0-9]+]].s, [[OP2_HI]].h
; VBITS_GE_512-NEXT: uunpklo [[OP1_HI_LO:z[0-9]+]].s, [[OP1_HI]].h
; VBITS_GE_512-NEXT: udivr [[RES_HI_HI:z[0-9]+]].s, [[PG1]]/m, [[OP2_HI_HI]].s, [[OP1_HI_HI]].s
; VBITS_GE_512-NEXT: uunpkhi [[OP2_LO_HI:z[0-9]+]].s, [[OP2_LO]].h
; VBITS_GE_512-NEXT: udivr [[RES_HI_LO:z[0-9]+]].s, [[PG1]]/m, [[OP2_HI_LO]].s, [[OP1_HI_LO]].s
; VBITS_GE_512-NEXT: uunpkhi [[OP1_LO_HI:z[0-9]+]].s, [[OP1_LO]].h
; VBITS_GE_512-NEXT: uunpklo [[OP2_LO_LO:z[0-9]+]].s, [[OP2_LO]].h
; VBITS_GE_512-NEXT: uunpklo [[OP1_LO_LO:z[0-9]+]].s, [[OP1_LO]].h
; VBITS_GE_512-NEXT: udiv [[RES_LO_HI:z[0-9]+]].s, [[PG1]]/m, [[OP1_LO_HI]].s, [[OP2_LO_HI]].s
; VBITS_GE_512-NEXT: udiv [[RES_LO_LO:z[0-9]+]].s, [[PG1]]/m, [[OP1_LO_LO]].s, [[OP2_LO_LO]].s
; VBITS_GE_512-NEXT: uzp1 [[RES_HI:z[0-9]+]].h, [[RES_HI_LO]].h, [[RES_HI_HI]].h
; VBITS_GE_512-NEXT: uzp1 [[RES_LO:z[0-9]+]].h, [[RES_LO_LO]].h, [[RES_LO_HI]].h
; VBITS_GE_512-NEXT: uzp1 [[RES:z[0-9]+]].b, [[RES_LO]].b, [[RES_HI]].b
; VBITS_GE_512-NEXT: st1b { [[RES]].b }, [[PG]], [x0]
; VBITS_GE_512-NEXT: ret
  %op1 = load <64 x i8>, <64 x i8>* %a
  %op2 = load <64 x i8>, <64 x i8>* %b
  %res = udiv <64 x i8> %op1, %op2
  store <64 x i8> %res, <64 x i8>* %a
  ret void
}

define void @udiv_v128i8(<128 x i8>* %a, <128 x i8>* %b) #0 {
; CHECK-LABEL: udiv_v128i8:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,128)]]
; VBITS_GE_1024-NEXT: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
; VBITS_GE_1024-NEXT: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
; VBITS_GE_1024-NEXT: ptrue [[PG1:p[0-9]+]].s, vl[[#min(VBYTES,32)]]
; VBITS_GE_1024-NEXT: uunpkhi [[OP1_HI:z[0-9]+]].h, [[OP1]].b
; VBITS_GE_1024-NEXT: uunpkhi [[OP2_HI:z[0-9]+]].h, [[OP2]].b
; VBITS_GE_1024-NEXT: uunpklo [[OP2_LO:z[0-9]+]].h, [[OP2]].b
; VBITS_GE_1024-NEXT: uunpklo [[OP1_LO:z[0-9]+]].h, [[OP1]].b
; VBITS_GE_1024-NEXT: uunpkhi [[OP2_HI_HI:z[0-9]+]].s, [[OP2_HI]].h
; VBITS_GE_1024-NEXT: uunpkhi [[OP1_HI_HI:z[0-9]+]].s, [[OP1_HI]].h
; VBITS_GE_1024-NEXT: uunpklo [[OP2_HI_LO:z[0-9]+]].s, [[OP2_HI]].h
; VBITS_GE_1024-NEXT: uunpklo [[OP1_HI_LO:z[0-9]+]].s, [[OP1_HI]].h
; VBITS_GE_1024-NEXT: udivr [[RES_HI_HI:z[0-9]+]].s, [[PG1]]/m, [[OP2_HI_HI]].s, [[OP1_HI_HI]].s
; VBITS_GE_1024-NEXT: uunpkhi [[OP2_LO_HI:z[0-9]+]].s, [[OP2_LO]].h
; VBITS_GE_1024-NEXT: udivr [[RES_HI_LO:z[0-9]+]].s, [[PG1]]/m, [[OP2_HI_LO]].s, [[OP1_HI_LO]].s
; VBITS_GE_1024-NEXT: uunpkhi [[OP1_LO_HI:z[0-9]+]].s, [[OP1_LO]].h
; VBITS_GE_1024-NEXT: uunpklo [[OP2_LO_LO:z[0-9]+]].s, [[OP2_LO]].h
; VBITS_GE_1024-NEXT: uunpklo [[OP1_LO_LO:z[0-9]+]].s, [[OP1_LO]].h
; VBITS_GE_1024-NEXT: udiv [[RES_LO_HI:z[0-9]+]].s, [[PG1]]/m, [[OP1_LO_HI]].s, [[OP2_LO_HI]].s
; VBITS_GE_1024-NEXT: udiv [[RES_LO_LO:z[0-9]+]].s, [[PG1]]/m, [[OP1_LO_LO]].s, [[OP2_LO_LO]].s
; VBITS_GE_1024-NEXT: uzp1 [[RES_HI:z[0-9]+]].h, [[RES_HI_LO]].h, [[RES_HI_HI]].h
; VBITS_GE_1024-NEXT: uzp1 [[RES_LO:z[0-9]+]].h, [[RES_LO_LO]].h, [[RES_LO_HI]].h
; VBITS_GE_1024-NEXT: uzp1 [[RES:z[0-9]+]].b, [[RES_LO]].b, [[RES_HI]].b
; VBITS_GE_1024-NEXT: st1b { [[RES]].b }, [[PG]], [x0]
; VBITS_GE_1024-NEXT: ret
  %op1 = load <128 x i8>, <128 x i8>* %a
  %op2 = load <128 x i8>, <128 x i8>* %b
  %res = udiv <128 x i8> %op1, %op2
  store <128 x i8> %res, <128 x i8>* %a
  ret void
}

define void @udiv_v256i8(<256 x i8>* %a, <256 x i8>* %b) #0 {
; CHECK-LABEL: udiv_v256i8:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,256)]]
; VBITS_GE_2048-NEXT: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
; VBITS_GE_2048-NEXT: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].s, vl[[#min(VBYTES,64)]]
; VBITS_GE_2048-NEXT: uunpkhi [[OP1_HI:z[0-9]+]].h, [[OP1]].b
; VBITS_GE_2048-NEXT: uunpkhi [[OP2_HI:z[0-9]+]].h, [[OP2]].b
; VBITS_GE_2048-NEXT: uunpklo [[OP2_LO:z[0-9]+]].h, [[OP2]].b
; VBITS_GE_2048-NEXT: uunpklo [[OP1_LO:z[0-9]+]].h, [[OP1]].b
; VBITS_GE_2048-NEXT: uunpkhi [[OP2_HI_HI:z[0-9]+]].s, [[OP2_HI]].h
; VBITS_GE_2048-NEXT: uunpkhi [[OP1_HI_HI:z[0-9]+]].s, [[OP1_HI]].h
; VBITS_GE_2048-NEXT: uunpklo [[OP2_HI_LO:z[0-9]+]].s, [[OP2_HI]].h
; VBITS_GE_2048-NEXT: uunpklo [[OP1_HI_LO:z[0-9]+]].s, [[OP1_HI]].h
; VBITS_GE_2048-NEXT: udivr [[RES_HI_HI:z[0-9]+]].s, [[PG1]]/m, [[OP2_HI_HI]].s, [[OP1_HI_HI]].s
; VBITS_GE_2048-NEXT: uunpkhi [[OP2_LO_HI:z[0-9]+]].s, [[OP2_LO]].h
; VBITS_GE_2048-NEXT: udivr [[RES_HI_LO:z[0-9]+]].s, [[PG1]]/m, [[OP2_HI_LO]].s, [[OP1_HI_LO]].s
; VBITS_GE_2048-NEXT: uunpkhi [[OP1_LO_HI:z[0-9]+]].s, [[OP1_LO]].h
; VBITS_GE_2048-NEXT: uunpklo [[OP2_LO_LO:z[0-9]+]].s, [[OP2_LO]].h
; VBITS_GE_2048-NEXT: uunpklo [[OP1_LO_LO:z[0-9]+]].s, [[OP1_LO]].h
; VBITS_GE_2048-NEXT: udiv [[RES_LO_HI:z[0-9]+]].s, [[PG1]]/m, [[OP1_LO_HI]].s, [[OP2_LO_HI]].s
; VBITS_GE_2048-NEXT: udiv [[RES_LO_LO:z[0-9]+]].s, [[PG1]]/m, [[OP1_LO_LO]].s, [[OP2_LO_LO]].s
; VBITS_GE_2048-NEXT: uzp1 [[RES_HI:z[0-9]+]].h, [[RES_HI_LO]].h, [[RES_HI_HI]].h
; VBITS_GE_2048-NEXT: uzp1 [[RES_LO:z[0-9]+]].h, [[RES_LO_LO]].h, [[RES_LO_HI]].h
; VBITS_GE_2048-NEXT: uzp1 [[RES:z[0-9]+]].b, [[RES_LO]].b, [[RES_HI]].b
; VBITS_GE_2048-NEXT: st1b { [[RES]].b }, [[PG]], [x0]
; VBITS_GE_2048-NEXT: ret
  %op1 = load <256 x i8>, <256 x
i8>* %a 686 %op2 = load <256 x i8>, <256 x i8>* %b 687 %res = udiv <256 x i8> %op1, %op2 688 store <256 x i8> %res, <256 x i8>* %a 689 ret void 690} 691 692; Vector vXi16 udiv are not legal for NEON so use SVE when available. 693define <4 x i16> @udiv_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 { 694; CHECK-LABEL: udiv_v4i16: 695; CHECK: uunpkhi [[OP2_HI:z[0-9]+]].s, z1.h 696; CHECK-NEXT: uunpkhi [[OP1_HI:z[0-9]+]].s, z0.h 697; CHECK-NEXT: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,2),2)]] 698; CHECK-NEXT: uunpklo [[OP2_LO:z[0-9]+]].s, z1.h 699; CHECK-NEXT: uunpklo [[OP1_LO:z[0-9]+]].s, z0.h 700; CHECK-NEXT: udivr [[RES_HI:z[0-9]+]].s, [[PG]]/m, [[OP2_HI]].s, [[OP1_HI]].s 701; CHECK-NEXT: udiv [[RES_LO:z[0-9]+]].s, [[PG]]/m, [[OP1_LO]].s, [[OP2_LO]].s 702; CHECK-NEXT: uzp1 [[RES:z[0-9]+]].h, [[RES_LO]].h, [[RES_HI]].h 703; CHECK-NEXT: ret 704 %res = udiv <4 x i16> %op1, %op2 705 ret <4 x i16> %res 706} 707 708define <8 x i16> @udiv_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 { 709; CHECK-LABEL: udiv_v8i16: 710; CHECK: uunpkhi [[OP2_HI:z[0-9]+]].s, z1.h 711; CHECK-NEXT: uunpkhi [[OP1_HI:z[0-9]+]].s, z0.h 712; CHECK-NEXT: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,2),4)]] 713; CHECK-NEXT: uunpklo [[OP2_LO:z[0-9]+]].s, z1.h 714; CHECK-NEXT: uunpklo [[OP1_LO:z[0-9]+]].s, z0.h 715; CHECK-NEXT: udivr [[RES_HI:z[0-9]+]].s, [[PG]]/m, [[OP2_HI]].s, [[OP1_HI]].s 716; CHECK-NEXT: udiv [[RES_LO:z[0-9]+]].s, [[PG]]/m, [[OP1_LO]].s, [[OP2_LO]].s 717; CHECK-NEXT: uzp1 [[RES:z[0-9]+]].h, [[RES_LO]].h, [[RES_HI]].h 718; CHECK-NEXT: ret 719 %res = udiv <8 x i16> %op1, %op2 720 ret <8 x i16> %res 721} 722 723define void @udiv_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 { 724; CHECK-LABEL: udiv_v16i16: 725; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),16)]] 726; CHECK-NEXT: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0] 727; CHECK-NEXT: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1] 728; CHECK-NEXT: ptrue [[PG1:p[0-9]+]].s, vl[[#min(div(VBYTES,2),8)]] 729; CHECK-NEXT: uunpkhi 
[[OP1_HI:z[0-9]+]].s, [[OP1]].h 730; CHECK-NEXT: uunpkhi [[OP2_HI:z[0-9]+]].s, [[OP2]].h 731; CHECK-NEXT: uunpklo [[OP2_LO:z[0-9]+]].s, [[OP2]].h 732; CHECK-NEXT: uunpklo [[OP1_LO:z[0-9]+]].s, [[OP1]].h 733; CHECK-NEXT: udivr [[RES_HI:z[0-9]+]].s, [[PG1]]/m, [[OP2_HI]].s, [[OP1_HI]].s 734; CHECK-NEXT: udiv [[RES_LO:z[0-9]+]].s, [[PG1]]/m, [[OP1_LO]].s, [[OP2_LO]].s 735; CHECK-NEXT: uzp1 [[RES:z[0-9]+]].h, [[RES_LO]].h, [[RES_HI]].h 736; CHECK-NEXT: st1h { [[RES]].h }, [[PG]], [x0] 737; CHECK-NEXT: ret 738 %op1 = load <16 x i16>, <16 x i16>* %a 739 %op2 = load <16 x i16>, <16 x i16>* %b 740 %res = udiv <16 x i16> %op1, %op2 741 store <16 x i16> %res, <16 x i16>* %a 742 ret void 743} 744 745define void @udiv_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 { 746; CHECK-LABEL: udiv_v32i16: 747; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),32)]] 748; VBITS_GE_512-NEXT: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0] 749; VBITS_GE_512-NEXT: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1] 750; VBITS_GE_512-NEXT: ptrue [[PG1:p[0-9]+]].s, vl[[#min(div(VBYTES,2),16)]] 751; VBITS_GE_512-NEXT: uunpkhi [[OP1_HI:z[0-9]+]].s, [[OP1]].h 752; VBITS_GE_512-NEXT: uunpkhi [[OP2_HI:z[0-9]+]].s, [[OP2]].h 753; VBITS_GE_512-NEXT: uunpklo [[OP2_LO:z[0-9]+]].s, [[OP2]].h 754; VBITS_GE_512-NEXT: uunpklo [[OP1_LO:z[0-9]+]].s, [[OP1]].h 755; VBITS_GE_512-NEXT: udivr [[RES_HI:z[0-9]+]].s, [[PG1]]/m, [[OP2_HI]].s, [[OP1_HI]].s 756; VBITS_GE_512-NEXT: udiv [[RES_LO:z[0-9]+]].s, [[PG1]]/m, [[OP1_LO]].s, [[OP2_LO]].s 757; VBITS_GE_512-NEXT: uzp1 [[RES:z[0-9]+]].h, [[RES_LO]].h, [[RES_HI]].h 758; VBITS_GE_512-NEXT: st1h { [[RES]].h }, [[PG]], [x0] 759; VBITS_GE_512-NEXT: ret 760 %op1 = load <32 x i16>, <32 x i16>* %a 761 %op2 = load <32 x i16>, <32 x i16>* %b 762 %res = udiv <32 x i16> %op1, %op2 763 store <32 x i16> %res, <32 x i16>* %a 764 ret void 765} 766 767define void @udiv_v64i16(<64 x i16>* %a, <64 x i16>* %b) #0 { 768; CHECK-LABEL: udiv_v64i16: 769; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, 
vl[[#min(div(VBYTES,2),64)]] 770; VBITS_GE_1024-NEXT: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0] 771; VBITS_GE_1024-NEXT: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1] 772; VBITS_GE_1024-NEXT: ptrue [[PG1:p[0-9]+]].s, vl[[#min(div(VBYTES,2),32)]] 773; VBITS_GE_1024-NEXT: uunpkhi [[OP1_HI:z[0-9]+]].s, [[OP1]].h 774; VBITS_GE_1024-NEXT: uunpkhi [[OP2_HI:z[0-9]+]].s, [[OP2]].h 775; VBITS_GE_1024-NEXT: uunpklo [[OP2_LO:z[0-9]+]].s, [[OP2]].h 776; VBITS_GE_1024-NEXT: uunpklo [[OP1_LO:z[0-9]+]].s, [[OP1]].h 777; VBITS_GE_1024-NEXT: udivr [[RES_HI:z[0-9]+]].s, [[PG1]]/m, [[OP2_HI]].s, [[OP1_HI]].s 778; VBITS_GE_1024-NEXT: udiv [[RES_LO:z[0-9]+]].s, [[PG1]]/m, [[OP1_LO]].s, [[OP2_LO]].s 779; VBITS_GE_1024-NEXT: uzp1 [[RES:z[0-9]+]].h, [[RES_LO]].h, [[RES_HI]].h 780; VBITS_GE_1024-NEXT: st1h { [[RES]].h }, [[PG]], [x0] 781; VBITS_GE_1024-NEXT: ret 782 %op1 = load <64 x i16>, <64 x i16>* %a 783 %op2 = load <64 x i16>, <64 x i16>* %b 784 %res = udiv <64 x i16> %op1, %op2 785 store <64 x i16> %res, <64 x i16>* %a 786 ret void 787} 788 789define void @udiv_v128i16(<128 x i16>* %a, <128 x i16>* %b) #0 { 790; CHECK-LABEL: udiv_v128i16: 791; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),128)]] 792; VBITS_GE_2048-NEXT: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0] 793; VBITS_GE_2048-NEXT: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1] 794; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].s, vl[[#min(div(VBYTES,2),64)]] 795; VBITS_GE_2048-NEXT: uunpkhi [[OP1_HI:z[0-9]+]].s, [[OP1]].h 796; VBITS_GE_2048-NEXT: uunpkhi [[OP2_HI:z[0-9]+]].s, [[OP2]].h 797; VBITS_GE_2048-NEXT: uunpklo [[OP2_LO:z[0-9]+]].s, [[OP2]].h 798; VBITS_GE_2048-NEXT: uunpklo [[OP1_LO:z[0-9]+]].s, [[OP1]].h 799; VBITS_GE_2048-NEXT: udivr [[RES_HI:z[0-9]+]].s, [[PG1]]/m, [[OP2_HI]].s, [[OP1_HI]].s 800; VBITS_GE_2048-NEXT: udiv [[RES_LO:z[0-9]+]].s, [[PG1]]/m, [[OP1_LO]].s, [[OP2_LO]].s 801; VBITS_GE_2048-NEXT: uzp1 [[RES:z[0-9]+]].h, [[RES_LO]].h, [[RES_HI]].h 802; VBITS_GE_2048-NEXT: st1h { [[RES]].h }, [[PG]], [x0] 
803; VBITS_GE_2048-NEXT: ret 804 %op1 = load <128 x i16>, <128 x i16>* %a 805 %op2 = load <128 x i16>, <128 x i16>* %b 806 %res = udiv <128 x i16> %op1, %op2 807 store <128 x i16> %res, <128 x i16>* %a 808 ret void 809} 810 811; Vector v2i32 udiv are not legal for NEON so use SVE when available. 812define <2 x i32> @udiv_v2i32(<2 x i32> %op1, <2 x i32> %op2) #0 { 813; CHECK-LABEL: udiv_v2i32: 814; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),2)]] 815; CHECK: udiv z0.s, [[PG]]/m, z0.s, z1.s 816; CHECK: ret 817 %res = udiv <2 x i32> %op1, %op2 818 ret <2 x i32> %res 819} 820 821; Vector v4i32 udiv are not legal for NEON so use SVE when available. 822define <4 x i32> @udiv_v4i32(<4 x i32> %op1, <4 x i32> %op2) #0 { 823; CHECK-LABEL: udiv_v4i32: 824; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),4)]] 825; CHECK: udiv z0.s, [[PG]]/m, z0.s, z1.s 826; CHECK: ret 827 %res = udiv <4 x i32> %op1, %op2 828 ret <4 x i32> %res 829} 830 831define void @udiv_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 { 832; CHECK-LABEL: udiv_v8i32: 833; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),8)]] 834; CHECK-NEXT: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0] 835; CHECK-NEXT: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1] 836; CHECK-NEXT: udiv [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s 837; CHECK-NEXT: st1w { [[RES]].s }, [[PG]], [x0] 838; CHECK-NEXT: ret 839 %op1 = load <8 x i32>, <8 x i32>* %a 840 %op2 = load <8 x i32>, <8 x i32>* %b 841 %res = udiv <8 x i32> %op1, %op2 842 store <8 x i32> %res, <8 x i32>* %a 843 ret void 844} 845 846define void @udiv_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 { 847; CHECK-LABEL: udiv_v16i32: 848; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),16)]] 849; VBITS_GE_512-NEXT: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0] 850; VBITS_GE_512-NEXT: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1] 851; VBITS_GE_512-NEXT: udiv [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s 852; VBITS_GE_512-NEXT: st1w { [[RES]].s }, [[PG]], [x0] 853; 
VBITS_GE_512-NEXT: ret 854 %op1 = load <16 x i32>, <16 x i32>* %a 855 %op2 = load <16 x i32>, <16 x i32>* %b 856 %res = udiv <16 x i32> %op1, %op2 857 store <16 x i32> %res, <16 x i32>* %a 858 ret void 859} 860 861define void @udiv_v32i32(<32 x i32>* %a, <32 x i32>* %b) #0 { 862; CHECK-LABEL: udiv_v32i32: 863; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),32)]] 864; VBITS_GE_1024-NEXT: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0] 865; VBITS_GE_1024-NEXT: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1] 866; VBITS_GE_1024-NEXT: udiv [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s 867; VBITS_GE_1024-NEXT: st1w { [[RES]].s }, [[PG]], [x0] 868; VBITS_GE_1024-NEXT: ret 869 %op1 = load <32 x i32>, <32 x i32>* %a 870 %op2 = load <32 x i32>, <32 x i32>* %b 871 %res = udiv <32 x i32> %op1, %op2 872 store <32 x i32> %res, <32 x i32>* %a 873 ret void 874} 875 876define void @udiv_v64i32(<64 x i32>* %a, <64 x i32>* %b) #0 { 877; CHECK-LABEL: udiv_v64i32: 878; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),64)]] 879; VBITS_GE_2048-NEXT: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0] 880; VBITS_GE_2048-NEXT: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1] 881; VBITS_GE_2048-NEXT: udiv [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s 882; VBITS_GE_2048-NEXT: st1w { [[RES]].s }, [[PG]], [x0] 883; VBITS_GE_2048-NEXT: ret 884 %op1 = load <64 x i32>, <64 x i32>* %a 885 %op2 = load <64 x i32>, <64 x i32>* %b 886 %res = udiv <64 x i32> %op1, %op2 887 store <64 x i32> %res, <64 x i32>* %a 888 ret void 889} 890 891; Vector i64 udiv are not legal for NEON so use SVE when available. 892define <1 x i64> @udiv_v1i64(<1 x i64> %op1, <1 x i64> %op2) #0 { 893; CHECK-LABEL: udiv_v1i64: 894; CHECK: ptrue [[PG:p[0-9]+]].d, vl1 895; CHECK: udiv z0.d, [[PG]]/m, z0.d, z1.d 896; CHECK: ret 897 %res = udiv <1 x i64> %op1, %op2 898 ret <1 x i64> %res 899} 900 901; Vector i64 udiv are not legal for NEON so use SVE when available. 
902define <2 x i64> @udiv_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 { 903; CHECK-LABEL: udiv_v2i64: 904; CHECK: ptrue [[PG:p[0-9]+]].d, vl2 905; CHECK: udiv z0.d, [[PG]]/m, z0.d, z1.d 906; CHECK: ret 907 %res = udiv <2 x i64> %op1, %op2 908 ret <2 x i64> %res 909} 910 911define void @udiv_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 { 912; CHECK-LABEL: udiv_v4i64: 913; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),4)]] 914; CHECK-NEXT: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0] 915; CHECK-NEXT: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1] 916; CHECK-NEXT: udiv [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d 917; CHECK-NEXT: st1d { [[RES]].d }, [[PG]], [x0] 918; CHECK-NEXT: ret 919 %op1 = load <4 x i64>, <4 x i64>* %a 920 %op2 = load <4 x i64>, <4 x i64>* %b 921 %res = udiv <4 x i64> %op1, %op2 922 store <4 x i64> %res, <4 x i64>* %a 923 ret void 924} 925 926define void @udiv_v8i64(<8 x i64>* %a, <8 x i64>* %b) #0 { 927; CHECK-LABEL: udiv_v8i64: 928; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),8)]] 929; VBITS_GE_512-NEXT: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0] 930; VBITS_GE_512-NEXT: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1] 931; VBITS_GE_512-NEXT: udiv [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d 932; VBITS_GE_512-NEXT: st1d { [[RES]].d }, [[PG]], [x0] 933; VBITS_GE_512-NEXT: ret 934 %op1 = load <8 x i64>, <8 x i64>* %a 935 %op2 = load <8 x i64>, <8 x i64>* %b 936 %res = udiv <8 x i64> %op1, %op2 937 store <8 x i64> %res, <8 x i64>* %a 938 ret void 939} 940 941define void @udiv_v16i64(<16 x i64>* %a, <16 x i64>* %b) #0 { 942; CHECK-LABEL: udiv_v16i64: 943; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),16)]] 944; VBITS_GE_1024-NEXT: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0] 945; VBITS_GE_1024-NEXT: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1] 946; VBITS_GE_1024-NEXT: udiv [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d 947; VBITS_GE_1024-NEXT: st1d { [[RES]].d }, [[PG]], [x0] 948; VBITS_GE_1024-NEXT: ret 949 %op1 = load 
<16 x i64>, <16 x i64>* %a 950 %op2 = load <16 x i64>, <16 x i64>* %b 951 %res = udiv <16 x i64> %op1, %op2 952 store <16 x i64> %res, <16 x i64>* %a 953 ret void 954} 955 956define void @udiv_v32i64(<32 x i64>* %a, <32 x i64>* %b) #0 { 957; CHECK-LABEL: udiv_v32i64: 958; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),32)]] 959; VBITS_GE_2048-NEXT: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0] 960; VBITS_GE_2048-NEXT: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1] 961; VBITS_GE_2048-NEXT: udiv [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d 962; VBITS_GE_2048-NEXT: st1d { [[RES]].d }, [[PG]], [x0] 963; VBITS_GE_2048-NEXT: ret 964 %op1 = load <32 x i64>, <32 x i64>* %a 965 %op2 = load <32 x i64>, <32 x i64>* %b 966 %res = udiv <32 x i64> %op1, %op2 967 store <32 x i64> %res, <32 x i64>* %a 968 ret void 969} 970 971attributes #0 = { "target-features"="+sve" } 972