1; RUN: llc -aarch64-sve-vector-bits-min=128 -asm-verbose=0 < %s | FileCheck %s -check-prefix=NO_SVE 2; RUN: llc -aarch64-sve-vector-bits-min=256 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_EQ_256 3; RUN: llc -aarch64-sve-vector-bits-min=384 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK 4; RUN: llc -aarch64-sve-vector-bits-min=512 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512 5; RUN: llc -aarch64-sve-vector-bits-min=640 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512 6; RUN: llc -aarch64-sve-vector-bits-min=768 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512 7; RUN: llc -aarch64-sve-vector-bits-min=896 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512 8; RUN: llc -aarch64-sve-vector-bits-min=1024 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024 9; RUN: llc -aarch64-sve-vector-bits-min=1152 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024 10; RUN: llc -aarch64-sve-vector-bits-min=1280 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024 11; RUN: llc -aarch64-sve-vector-bits-min=1408 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024 12; RUN: llc -aarch64-sve-vector-bits-min=1536 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024 13; RUN: llc -aarch64-sve-vector-bits-min=1664 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024 14; RUN: llc -aarch64-sve-vector-bits-min=1792 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024 15; RUN: llc -aarch64-sve-vector-bits-min=1920 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024 16; RUN: llc -aarch64-sve-vector-bits-min=2048 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_GE_2048 17 18target triple = "aarch64-unknown-linux-gnu" 19 20; Don't use SVE when its registers are no bigger than NEON. 21; NO_SVE-NOT: ptrue 22 23; 24; SMAX 25; 26 27; Don't use SVE for 64-bit vectors. 28define <8 x i8> @smax_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 { 29; CHECK-LABEL: smax_v8i8: 30; CHECK: smax v0.8b, v0.8b, v1.8b 31; CHECK: ret 32 %res = call <8 x i8> @llvm.smax.v8i8(<8 x i8> %op1, <8 x i8> %op2) 33 ret <8 x i8> %res 34} 35 36; Don't use SVE for 128-bit vectors. 37define <16 x i8> @smax_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 { 38; CHECK-LABEL: smax_v16i8: 39; CHECK: smax v0.16b, v0.16b, v1.16b 40; CHECK: ret 41 %res = call <16 x i8> @llvm.smax.v16i8(<16 x i8> %op1, <16 x i8> %op2) 42 ret <16 x i8> %res 43} 44 45define void @smax_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 { 46; CHECK-LABEL: smax_v32i8: 47; CHECK: ptrue [[PG:p[0-9]+]].b, vl32 48; CHECK-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0] 49; CHECK-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1] 50; CHECK-NEXT: smax [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b 51; CHECK-NEXT: st1b { [[RES]].b }, [[PG]], [x0] 52; CHECK-NEXT: ret 53 %op1 = load <32 x i8>, <32 x i8>* %a 54 %op2 = load <32 x i8>, <32 x i8>* %b 55 %res = call <32 x i8> @llvm.smax.v32i8(<32 x i8> %op1, <32 x i8> %op2) 56 store <32 x i8> %res, <32 x i8>* %a 57 ret void 58} 59 60define void @smax_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 { 61; CHECK-LABEL: smax_v64i8: 62; VBITS_GE_512: ptrue [[PG:p[0-9]+]].b, vl64 63; VBITS_GE_512-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0] 64; VBITS_GE_512-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1] 65; VBITS_GE_512-NEXT: smax [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b 66; VBITS_GE_512-NEXT: st1b { [[RES]].b }, [[PG]], [x0] 67; VBITS_GE_512-NEXT: ret 68; 69; Ensure sensible type legalisation. 70; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].b, vl32 71; VBITS_EQ_256-DAG: mov w[[A:[0-9]+]], #32 72; VBITS_EQ_256-DAG: ld1b { [[OP1_LO:z[0-9]+]].b }, [[PG]]/z, [x0, x[[A]]] 73; VBITS_EQ_256-DAG: ld1b { [[OP1_HI:z[0-9]+]].b }, [[PG]]/z, [x0] 74; VBITS_EQ_256-DAG: ld1b { [[OP2_LO:z[0-9]+]].b }, [[PG]]/z, [x1, x[[A]]] 75; VBITS_EQ_256-DAG: ld1b { [[OP2_HI:z[0-9]+]].b }, [[PG]]/z, [x1] 76; VBITS_EQ_256-DAG: smax [[RES_LO:z[0-9]+]].b, [[PG]]/m, [[OP1_LO]].b, [[OP2_LO]].b 77; VBITS_EQ_256-DAG: smax [[RES_HI:z[0-9]+]].b, [[PG]]/m, [[OP1_HI]].b, [[OP2_HI]].b 78; VBITS_EQ_256-DAG: st1b { [[RES_LO]].b }, [[PG]], [x0, x[[A]]] 79; VBITS_EQ_256-DAG: st1b { [[RES_HI]].b }, [[PG]], [x0] 80; VBITS_EQ_256-NEXT: ret 81 %op1 = load <64 x i8>, <64 x i8>* %a 82 %op2 = load <64 x i8>, <64 x i8>* %b 83 %res = call <64 x i8> @llvm.smax.v64i8(<64 x i8> %op1, <64 x i8> %op2) 84 store <64 x i8> %res, <64 x i8>* %a 85 ret void 86} 87 88define void @smax_v128i8(<128 x i8>* %a, <128 x i8>* %b) #0 { 89; CHECK-LABEL: smax_v128i8: 90; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].b, vl128 91; VBITS_GE_1024-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0] 92; VBITS_GE_1024-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1] 93; VBITS_GE_1024-NEXT: smax [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b 94; VBITS_GE_1024-NEXT: st1b { [[RES]].b }, [[PG]], [x0] 95; VBITS_GE_1024-NEXT: ret 96 %op1 = load <128 x i8>, <128 x i8>* %a 97 %op2 = load <128 x i8>, <128 x i8>* %b 98 %res = call <128 x i8> @llvm.smax.v128i8(<128 x i8> %op1, <128 x i8> %op2) 99 store <128 x i8> %res, <128 x i8>* %a 100 ret void 101} 102 103define void @smax_v256i8(<256 x i8>* %a, <256 x i8>* %b) #0 { 104; CHECK-LABEL: smax_v256i8: 105; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].b, vl256 106; VBITS_GE_2048-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0] 107; VBITS_GE_2048-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1] 108; VBITS_GE_2048-NEXT: smax [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b 109; VBITS_GE_2048-NEXT: st1b { [[RES]].b }, [[PG]], [x0] 110; VBITS_GE_2048-NEXT: ret 111 %op1 = load <256 x i8>, <256 x i8>* %a 112 %op2 = load <256 x i8>, <256 x i8>* %b 113 %res = call <256 x i8> @llvm.smax.v256i8(<256 x i8> %op1, <256 x i8> %op2) 114 store <256 x i8> %res, <256 x i8>* %a 115 ret void 116} 117 118; Don't use SVE for 64-bit vectors. 119define <4 x i16> @smax_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 { 120; CHECK-LABEL: smax_v4i16: 121; CHECK: smax v0.4h, v0.4h, v1.4h 122; CHECK-NEXT: ret 123 %res = call <4 x i16> @llvm.smax.v4i16(<4 x i16> %op1, <4 x i16> %op2) 124 ret <4 x i16> %res 125} 126 127; Don't use SVE for 128-bit vectors. 128define <8 x i16> @smax_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 { 129; CHECK-LABEL: smax_v8i16: 130; CHECK: smax v0.8h, v0.8h, v1.8h 131; CHECK-NEXT: ret 132 %res = call <8 x i16> @llvm.smax.v8i16(<8 x i16> %op1, <8 x i16> %op2) 133 ret <8 x i16> %res 134} 135 136define void @smax_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 { 137; CHECK-LABEL: smax_v16i16: 138; CHECK: ptrue [[PG:p[0-9]+]].h, vl16 139; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0] 140; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1] 141; CHECK-NEXT: smax [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h 142; CHECK-NEXT: st1h { [[RES]].h }, [[PG]], [x0] 143; CHECK-NEXT: ret 144 %op1 = load <16 x i16>, <16 x i16>* %a 145 %op2 = load <16 x i16>, <16 x i16>* %b 146 %res = call <16 x i16> @llvm.smax.v16i16(<16 x i16> %op1, <16 x i16> %op2) 147 store <16 x i16> %res, <16 x i16>* %a 148 ret void 149} 150 151define void @smax_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 { 152; CHECK-LABEL: smax_v32i16: 153; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32 154; VBITS_GE_512-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0] 155; VBITS_GE_512-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1] 156; VBITS_GE_512-NEXT: smax [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h 157; VBITS_GE_512-NEXT: st1h { [[RES]].h }, [[PG]], [x0] 158; VBITS_GE_512-NEXT: ret 159 160; Ensure sensible type legalisation. 161; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16 162; VBITS_EQ_256-DAG: add x[[A_HI:[0-9]+]], x0, #32 163; VBITS_EQ_256-DAG: add x[[B_HI:[0-9]+]], x1, #32 164; VBITS_EQ_256-DAG: ld1h { [[OP1_LO:z[0-9]+]].h }, [[PG]]/z, [x0] 165; VBITS_EQ_256-DAG: ld1h { [[OP1_HI:z[0-9]+]].h }, [[PG]]/z, [x[[A_HI]]] 166; VBITS_EQ_256-DAG: ld1h { [[OP2_LO:z[0-9]+]].h }, [[PG]]/z, [x1] 167; VBITS_EQ_256-DAG: ld1h { [[OP2_HI:z[0-9]+]].h }, [[PG]]/z, [x[[B_HI]]] 168; VBITS_EQ_256-DAG: smax [[RES_LO:z[0-9]+]].h, [[PG]]/m, [[OP1_LO]].h, [[OP2_LO]].h 169; VBITS_EQ_256-DAG: smax [[RES_HI:z[0-9]+]].h, [[PG]]/m, [[OP1_HI]].h, [[OP2_HI]].h 170; VBITS_EQ_256-DAG: st1h { [[RES_LO]].h }, [[PG]], [x0] 171; VBITS_EQ_256-DAG: st1h { [[RES_HI]].h }, [[PG]], [x[[A_HI]] 172; VBITS_EQ_256-NEXT: ret 173 %op1 = load <32 x i16>, <32 x i16>* %a 174 %op2 = load <32 x i16>, <32 x i16>* %b 175 %res = call <32 x i16> @llvm.smax.v32i16(<32 x i16> %op1, <32 x i16> %op2) 176 store <32 x i16> %res, <32 x i16>* %a 177 ret void 178} 179 180define void @smax_v64i16(<64 x i16>* %a, <64 x i16>* %b) #0 { 181; CHECK-LABEL: smax_v64i16: 182; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64 183; VBITS_GE_1024-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0] 184; VBITS_GE_1024-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1] 185; VBITS_GE_1024-NEXT: smax [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h 186; VBITS_GE_1024-NEXT: st1h { [[RES]].h }, [[PG]], [x0] 187; VBITS_GE_1024-NEXT: ret 188 %op1 = load <64 x i16>, <64 x i16>* %a 189 %op2 = load <64 x i16>, <64 x i16>* %b 190 %res = call <64 x i16> @llvm.smax.v64i16(<64 x i16> %op1, <64 x i16> %op2) 191 store <64 x i16> %res, <64 x i16>* %a 192 ret void 193} 194 195define void @smax_v128i16(<128 x i16>* %a, <128 x i16>* %b) #0 { 196; CHECK-LABEL: smax_v128i16: 197; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl128 198; VBITS_GE_2048-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0] 199; VBITS_GE_2048-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1] 200; VBITS_GE_2048-NEXT: smax [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h 201; VBITS_GE_2048-NEXT: st1h { [[RES]].h }, [[PG]], [x0] 202; VBITS_GE_2048-NEXT: ret 203 %op1 = load <128 x i16>, <128 x i16>* %a 204 %op2 = load <128 x i16>, <128 x i16>* %b 205 %res = call <128 x i16> @llvm.smax.v128i16(<128 x i16> %op1, <128 x i16> %op2) 206 store <128 x i16> %res, <128 x i16>* %a 207 ret void 208} 209 210; Don't use SVE for 64-bit vectors. 211define <2 x i32> @smax_v2i32(<2 x i32> %op1, <2 x i32> %op2) #0 { 212; CHECK-LABEL: smax_v2i32: 213; CHECK: smax v0.2s, v0.2s, v1.2s 214; CHECK-NEXT: ret 215 %res = call <2 x i32> @llvm.smax.v2i32(<2 x i32> %op1, <2 x i32> %op2) 216 ret <2 x i32> %res 217} 218 219; Don't use SVE for 128-bit vectors. 220define <4 x i32> @smax_v4i32(<4 x i32> %op1, <4 x i32> %op2) #0 { 221; CHECK-LABEL: smax_v4i32: 222; CHECK: smax v0.4s, v0.4s, v1.4s 223; CHECK-NEXT: ret 224 %res = call <4 x i32> @llvm.smax.v4i32(<4 x i32> %op1, <4 x i32> %op2) 225 ret <4 x i32> %res 226} 227 228define void @smax_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 { 229; CHECK-LABEL: smax_v8i32: 230; CHECK: ptrue [[PG:p[0-9]+]].s, vl8 231; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0] 232; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1] 233; CHECK-NEXT: smax [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s 234; CHECK-NEXT: st1w { [[RES]].s }, [[PG]], [x0] 235; CHECK-NEXT: ret 236 %op1 = load <8 x i32>, <8 x i32>* %a 237 %op2 = load <8 x i32>, <8 x i32>* %b 238 %res = call <8 x i32> @llvm.smax.v8i32(<8 x i32> %op1, <8 x i32> %op2) 239 store <8 x i32> %res, <8 x i32>* %a 240 ret void 241} 242 243define void @smax_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 { 244; CHECK-LABEL: smax_v16i32: 245; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16 246; VBITS_GE_512-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0] 247; VBITS_GE_512-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1] 248; VBITS_GE_512-NEXT: smax [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s 249; VBITS_GE_512-NEXT: st1w { [[RES]].s }, [[PG]], [x0] 250; VBITS_GE_512-NEXT: ret 251 252; Ensure sensible type legalisation. 253; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8 254; VBITS_EQ_256-DAG: add x[[A_HI:[0-9]+]], x0, #32 255; VBITS_EQ_256-DAG: add x[[B_HI:[0-9]+]], x1, #32 256; VBITS_EQ_256-DAG: ld1w { [[OP1_LO:z[0-9]+]].s }, [[PG]]/z, [x0] 257; VBITS_EQ_256-DAG: ld1w { [[OP1_HI:z[0-9]+]].s }, [[PG]]/z, [x[[A_HI]]] 258; VBITS_EQ_256-DAG: ld1w { [[OP2_LO:z[0-9]+]].s }, [[PG]]/z, [x1] 259; VBITS_EQ_256-DAG: ld1w { [[OP2_HI:z[0-9]+]].s }, [[PG]]/z, [x[[B_HI]]] 260; VBITS_EQ_256-DAG: smax [[RES_LO:z[0-9]+]].s, [[PG]]/m, [[OP1_LO]].s, [[OP2_LO]].s 261; VBITS_EQ_256-DAG: smax [[RES_HI:z[0-9]+]].s, [[PG]]/m, [[OP1_HI]].s, [[OP2_HI]].s 262; VBITS_EQ_256-DAG: st1w { [[RES_LO]].s }, [[PG]], [x0] 263; VBITS_EQ_256-DAG: st1w { [[RES_HI]].s }, [[PG]], [x[[A_HI]] 264; VBITS_EQ_256-NEXT: ret 265 %op1 = load <16 x i32>, <16 x i32>* %a 266 %op2 = load <16 x i32>, <16 x i32>* %b 267 %res = call <16 x i32> @llvm.smax.v16i32(<16 x i32> %op1, <16 x i32> %op2) 268 store <16 x i32> %res, <16 x i32>* %a 269 ret void 270} 271 272define void @smax_v32i32(<32 x i32>* %a, <32 x i32>* %b) #0 { 273; CHECK-LABEL: smax_v32i32: 274; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32 275; VBITS_GE_1024-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0] 276; VBITS_GE_1024-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1] 277; VBITS_GE_1024-NEXT: smax [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s 278; VBITS_GE_1024-NEXT: st1w { [[RES]].s }, [[PG]], [x0] 279; VBITS_GE_1024-NEXT: ret 280 %op1 = load <32 x i32>, <32 x i32>* %a 281 %op2 = load <32 x i32>, <32 x i32>* %b 282 %res = call <32 x i32> @llvm.smax.v32i32(<32 x i32> %op1, <32 x i32> %op2) 283 store <32 x i32> %res, <32 x i32>* %a 284 ret void 285} 286 287define void @smax_v64i32(<64 x i32>* %a, <64 x i32>* %b) #0 { 288; CHECK-LABEL: smax_v64i32: 289; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64 290; VBITS_GE_2048-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0] 291; VBITS_GE_2048-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1] 292; VBITS_GE_2048-NEXT: smax [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s 293; VBITS_GE_2048-NEXT: st1w { [[RES]].s }, [[PG]], [x0] 294; VBITS_GE_2048-NEXT: ret 295 %op1 = load <64 x i32>, <64 x i32>* %a 296 %op2 = load <64 x i32>, <64 x i32>* %b 297 %res = call <64 x i32> @llvm.smax.v64i32(<64 x i32> %op1, <64 x i32> %op2) 298 store <64 x i32> %res, <64 x i32>* %a 299 ret void 300} 301 302; Vector i64 max are not legal for NEON so use SVE when available. 303define <1 x i64> @smax_v1i64(<1 x i64> %op1, <1 x i64> %op2) #0 { 304; CHECK-LABEL: smax_v1i64: 305; CHECK: ptrue [[PG:p[0-9]+]].d, vl1 306; CHECK-NEXT: smax z0.d, [[PG]]/m, z0.d, z1.d 307; CHECK-NEXT: ret 308 %res = call <1 x i64> @llvm.smax.v1i64(<1 x i64> %op1, <1 x i64> %op2) 309 ret <1 x i64> %res 310} 311 312; Vector i64 max are not legal for NEON so use SVE when available. 313define <2 x i64> @smax_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 { 314; CHECK-LABEL: smax_v2i64: 315; CHECK: ptrue [[PG:p[0-9]+]].d, vl2 316; CHECK-NEXT: smax z0.d, [[PG]]/m, z0.d, z1.d 317; CHECK-NEXT: ret 318 %res = call <2 x i64> @llvm.smax.v2i64(<2 x i64> %op1, <2 x i64> %op2) 319 ret <2 x i64> %res 320} 321 322define void @smax_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 { 323; CHECK-LABEL: smax_v4i64: 324; CHECK: ptrue [[PG:p[0-9]+]].d, vl4 325; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0] 326; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1] 327; CHECK-NEXT: smax [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d 328; CHECK-NEXT: st1d { [[RES]].d }, [[PG]], [x0] 329; CHECK-NEXT: ret 330 %op1 = load <4 x i64>, <4 x i64>* %a 331 %op2 = load <4 x i64>, <4 x i64>* %b 332 %res = call <4 x i64> @llvm.smax.v4i64(<4 x i64> %op1, <4 x i64> %op2) 333 store <4 x i64> %res, <4 x i64>* %a 334 ret void 335} 336 337define void @smax_v8i64(<8 x i64>* %a, <8 x i64>* %b) #0 { 338; CHECK-LABEL: smax_v8i64: 339; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8 340; VBITS_GE_512-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0] 341; VBITS_GE_512-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1] 342; VBITS_GE_512-NEXT: smax [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d 343; VBITS_GE_512-NEXT: st1d { [[RES]].d }, [[PG]], [x0] 344; VBITS_GE_512-NEXT: ret 345 346; Ensure sensible type legalisation. 347; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4 348; VBITS_EQ_256-DAG: add x[[A_HI:[0-9]+]], x0, #32 349; VBITS_EQ_256-DAG: add x[[B_HI:[0-9]+]], x1, #32 350; VBITS_EQ_256-DAG: ld1d { [[OP1_LO:z[0-9]+]].d }, [[PG]]/z, [x0] 351; VBITS_EQ_256-DAG: ld1d { [[OP1_HI:z[0-9]+]].d }, [[PG]]/z, [x[[A_HI]]] 352; VBITS_EQ_256-DAG: ld1d { [[OP2_LO:z[0-9]+]].d }, [[PG]]/z, [x1] 353; VBITS_EQ_256-DAG: ld1d { [[OP2_HI:z[0-9]+]].d }, [[PG]]/z, [x[[B_HI]]] 354; VBITS_EQ_256-DAG: smax [[RES_LO:z[0-9]+]].d, [[PG]]/m, [[OP1_LO]].d, [[OP2_LO]].d 355; VBITS_EQ_256-DAG: smax [[RES_HI:z[0-9]+]].d, [[PG]]/m, [[OP1_HI]].d, [[OP2_HI]].d 356; VBITS_EQ_256-DAG: st1d { [[RES_LO]].d }, [[PG]], [x0] 357; VBITS_EQ_256-DAG: st1d { [[RES_HI]].d }, [[PG]], [x[[A_HI]] 358; VBITS_EQ_256-NEXT: ret 359 %op1 = load <8 x i64>, <8 x i64>* %a 360 %op2 = load <8 x i64>, <8 x i64>* %b 361 %res = call <8 x i64> @llvm.smax.v8i64(<8 x i64> %op1, <8 x i64> %op2) 362 store <8 x i64> %res, <8 x i64>* %a 363 ret void 364} 365 366define void @smax_v16i64(<16 x i64>* %a, <16 x i64>* %b) #0 { 367; CHECK-LABEL: smax_v16i64: 368; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16 369; VBITS_GE_1024-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0] 370; VBITS_GE_1024-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1] 371; VBITS_GE_1024-NEXT: smax [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d 372; VBITS_GE_1024-NEXT: st1d { [[RES]].d }, [[PG]], [x0] 373; VBITS_GE_1024-NEXT: ret 374 %op1 = load <16 x i64>, <16 x i64>* %a 375 %op2 = load <16 x i64>, <16 x i64>* %b 376 %res = call <16 x i64> @llvm.smax.v16i64(<16 x i64> %op1, <16 x i64> %op2) 377 store <16 x i64> %res, <16 x i64>* %a 378 ret void 379} 380 381define void @smax_v32i64(<32 x i64>* %a, <32 x i64>* %b) #0 { 382; CHECK-LABEL: smax_v32i64: 383; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32 384; VBITS_GE_2048-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0] 385; VBITS_GE_2048-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1] 386; VBITS_GE_2048-NEXT: smax [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d 387; VBITS_GE_2048-NEXT: st1d { [[RES]].d }, [[PG]], [x0] 388; VBITS_GE_2048-NEXT: ret 389 %op1 = load <32 x i64>, <32 x i64>* %a 390 %op2 = load <32 x i64>, <32 x i64>* %b 391 %res = call <32 x i64> @llvm.smax.v32i64(<32 x i64> %op1, <32 x i64> %op2) 392 store <32 x i64> %res, <32 x i64>* %a 393 ret void 394} 395 396; 397; SMIN 398; 399 400; Don't use SVE for 64-bit vectors. 401define <8 x i8> @smin_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 { 402; CHECK-LABEL: smin_v8i8: 403; CHECK: smin v0.8b, v0.8b, v1.8b 404; CHECK-NEXT: ret 405 %res = call <8 x i8> @llvm.smin.v8i8(<8 x i8> %op1, <8 x i8> %op2) 406 ret <8 x i8> %res 407} 408 409; Don't use SVE for 128-bit vectors. 410define <16 x i8> @smin_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 { 411; CHECK-LABEL: smin_v16i8: 412; CHECK: smin v0.16b, v0.16b, v1.16b 413; CHECK: ret 414 %res = call <16 x i8> @llvm.smin.v16i8(<16 x i8> %op1, <16 x i8> %op2) 415 ret <16 x i8> %res 416} 417 418define void @smin_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 { 419; CHECK-LABEL: smin_v32i8: 420; CHECK: ptrue [[PG:p[0-9]+]].b, vl32 421; CHECK-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0] 422; CHECK-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1] 423; CHECK-NEXT: smin [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b 424; CHECK-NEXT: st1b { [[RES]].b }, [[PG]], [x0] 425; CHECK-NEXT: ret 426 %op1 = load <32 x i8>, <32 x i8>* %a 427 %op2 = load <32 x i8>, <32 x i8>* %b 428 %res = call <32 x i8> @llvm.smin.v32i8(<32 x i8> %op1, <32 x i8> %op2) 429 store <32 x i8> %res, <32 x i8>* %a 430 ret void 431} 432 433define void @smin_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 { 434; CHECK-LABEL: smin_v64i8: 435; VBITS_GE_512: ptrue [[PG:p[0-9]+]].b, vl64 436; VBITS_GE_512-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0] 437; VBITS_GE_512-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1] 438; VBITS_GE_512-NEXT: smin [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b 439; VBITS_GE_512-NEXT: st1b { [[RES]].b }, [[PG]], [x0] 440; VBITS_GE_512-NEXT: ret 441; 442; Ensure sensible type legalisation. 443; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].b, vl32 444; VBITS_EQ_256-DAG: mov w[[A:[0-9]+]], #32 445; VBITS_EQ_256-DAG: ld1b { [[OP1_LO:z[0-9]+]].b }, [[PG]]/z, [x0, x[[A]]] 446; VBITS_EQ_256-DAG: ld1b { [[OP1_HI:z[0-9]+]].b }, [[PG]]/z, [x0] 447; VBITS_EQ_256-DAG: ld1b { [[OP2_LO:z[0-9]+]].b }, [[PG]]/z, [x1, x[[A]]] 448; VBITS_EQ_256-DAG: ld1b { [[OP2_HI:z[0-9]+]].b }, [[PG]]/z, [x1] 449; VBITS_EQ_256-DAG: smin [[RES_LO:z[0-9]+]].b, [[PG]]/m, [[OP1_LO]].b, [[OP2_LO]].b 450; VBITS_EQ_256-DAG: smin [[RES_HI:z[0-9]+]].b, [[PG]]/m, [[OP1_HI]].b, [[OP2_HI]].b 451; VBITS_EQ_256-DAG: st1b { [[RES_LO]].b }, [[PG]], [x0, x[[A]]] 452; VBITS_EQ_256-DAG: st1b { [[RES_HI]].b }, [[PG]], [x0] 453 %op1 = load <64 x i8>, <64 x i8>* %a 454 %op2 = load <64 x i8>, <64 x i8>* %b 455 %res = call <64 x i8> @llvm.smin.v64i8(<64 x i8> %op1, <64 x i8> %op2) 456 store <64 x i8> %res, <64 x i8>* %a 457 ret void 458} 459 460define void @smin_v128i8(<128 x i8>* %a, <128 x i8>* %b) #0 { 461; CHECK-LABEL: smin_v128i8: 462; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].b, vl128 463; VBITS_GE_1024-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0] 464; VBITS_GE_1024-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1] 465; VBITS_GE_1024-NEXT: smin [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b 466; VBITS_GE_1024-NEXT: st1b { [[RES]].b }, [[PG]], [x0] 467; VBITS_GE_1024-NEXT: ret 468 %op1 = load <128 x i8>, <128 x i8>* %a 469 %op2 = load <128 x i8>, <128 x i8>* %b 470 %res = call <128 x i8> @llvm.smin.v128i8(<128 x i8> %op1, <128 x i8> %op2) 471 store <128 x i8> %res, <128 x i8>* %a 472 ret void 473} 474 475define void @smin_v256i8(<256 x i8>* %a, <256 x i8>* %b) #0 { 476; CHECK-LABEL: smin_v256i8: 477; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].b, vl256 478; VBITS_GE_2048-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0] 479; VBITS_GE_2048-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1] 480; VBITS_GE_2048-NEXT: smin [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b 481; VBITS_GE_2048-NEXT: st1b { [[RES]].b }, [[PG]], [x0] 482; VBITS_GE_2048-NEXT: ret 483 %op1 = load <256 x i8>, <256 x i8>* %a 484 %op2 = load <256 x i8>, <256 x i8>* %b 485 %res = call <256 x i8> @llvm.smin.v256i8(<256 x i8> %op1, <256 x i8> %op2) 486 store <256 x i8> %res, <256 x i8>* %a 487 ret void 488} 489 490; Don't use SVE for 64-bit vectors. 491define <4 x i16> @smin_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 { 492; CHECK-LABEL: smin_v4i16: 493; CHECK: smin v0.4h, v0.4h, v1.4h 494; CHECK-NEXT: ret 495 %res = call <4 x i16> @llvm.smin.v4i16(<4 x i16> %op1, <4 x i16> %op2) 496 ret <4 x i16> %res 497} 498 499; Don't use SVE for 128-bit vectors. 500define <8 x i16> @smin_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 { 501; CHECK-LABEL: smin_v8i16: 502; CHECK: smin v0.8h, v0.8h, v1.8h 503; CHECK-NEXT: ret 504 %res = call <8 x i16> @llvm.smin.v8i16(<8 x i16> %op1, <8 x i16> %op2) 505 ret <8 x i16> %res 506} 507 508define void @smin_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 { 509; CHECK-LABEL: smin_v16i16: 510; CHECK: ptrue [[PG:p[0-9]+]].h, vl16 511; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0] 512; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1] 513; CHECK-NEXT: smin [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h 514; CHECK-NEXT: st1h { [[RES]].h }, [[PG]], [x0] 515; CHECK-NEXT: ret 516 %op1 = load <16 x i16>, <16 x i16>* %a 517 %op2 = load <16 x i16>, <16 x i16>* %b 518 %res = call <16 x i16> @llvm.smin.v16i16(<16 x i16> %op1, <16 x i16> %op2) 519 store <16 x i16> %res, <16 x i16>* %a 520 ret void 521} 522 523define void @smin_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 { 524; CHECK-LABEL: smin_v32i16: 525; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32 526; VBITS_GE_512-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0] 527; VBITS_GE_512-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1] 528; VBITS_GE_512-NEXT: smin [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h 529; VBITS_GE_512-NEXT: st1h { [[RES]].h }, [[PG]], [x0] 530; VBITS_GE_512-NEXT: ret 531 532; Ensure sensible type legalisation. 533; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16 534; VBITS_EQ_256-DAG: add x[[A_HI:[0-9]+]], x0, #32 535; VBITS_EQ_256-DAG: add x[[B_HI:[0-9]+]], x1, #32 536; VBITS_EQ_256-DAG: ld1h { [[OP1_LO:z[0-9]+]].h }, [[PG]]/z, [x0] 537; VBITS_EQ_256-DAG: ld1h { [[OP1_HI:z[0-9]+]].h }, [[PG]]/z, [x[[A_HI]]] 538; VBITS_EQ_256-DAG: ld1h { [[OP2_LO:z[0-9]+]].h }, [[PG]]/z, [x1] 539; VBITS_EQ_256-DAG: ld1h { [[OP2_HI:z[0-9]+]].h }, [[PG]]/z, [x[[B_HI]]] 540; VBITS_EQ_256-DAG: smin [[RES_LO:z[0-9]+]].h, [[PG]]/m, [[OP1_LO]].h, [[OP2_LO]].h 541; VBITS_EQ_256-DAG: smin [[RES_HI:z[0-9]+]].h, [[PG]]/m, [[OP1_HI]].h, [[OP2_HI]].h 542; VBITS_EQ_256-DAG: st1h { [[RES_LO]].h }, [[PG]], [x0] 543; VBITS_EQ_256-DAG: st1h { [[RES_HI]].h }, [[PG]], [x[[A_HI]] 544; VBITS_EQ_256-NEXT: ret 545 %op1 = load <32 x i16>, <32 x i16>* %a 546 %op2 = load <32 x i16>, <32 x i16>* %b 547 %res = call <32 x i16> @llvm.smin.v32i16(<32 x i16> %op1, <32 x i16> %op2) 548 store <32 x i16> %res, <32 x i16>* %a 549 ret void 550} 551 552define void @smin_v64i16(<64 x i16>* %a, <64 x i16>* %b) #0 { 553; CHECK-LABEL: smin_v64i16: 554; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64 555; VBITS_GE_1024-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0] 556; VBITS_GE_1024-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1] 557; VBITS_GE_1024-NEXT: smin [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h 558; VBITS_GE_1024-NEXT: st1h { [[RES]].h }, [[PG]], [x0] 559; VBITS_GE_1024-NEXT: ret 560 %op1 = load <64 x i16>, <64 x i16>* %a 561 %op2 = load <64 x i16>, <64 x i16>* %b 562 %res = call <64 x i16> @llvm.smin.v64i16(<64 x i16> %op1, <64 x i16> %op2) 563 store <64 x i16> %res, <64 x i16>* %a 564 ret void 565} 566 567define void @smin_v128i16(<128 x i16>* %a, <128 x i16>* %b) #0 { 568; CHECK-LABEL: smin_v128i16: 569; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl128 570; VBITS_GE_2048-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0] 571; VBITS_GE_2048-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1] 572; VBITS_GE_2048-NEXT: smin [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h 573; VBITS_GE_2048-NEXT: st1h { [[RES]].h }, [[PG]], [x0] 574; VBITS_GE_2048-NEXT: ret 575 %op1 = load <128 x i16>, <128 x i16>* %a 576 %op2 = load <128 x i16>, <128 x i16>* %b 577 %res = call <128 x i16> @llvm.smin.v128i16(<128 x i16> %op1, <128 x i16> %op2) 578 store <128 x i16> %res, <128 x i16>* %a 579 ret void 580} 581 582; Don't use SVE for 64-bit vectors. 583define <2 x i32> @smin_v2i32(<2 x i32> %op1, <2 x i32> %op2) #0 { 584; CHECK-LABEL: smin_v2i32: 585; CHECK: smin v0.2s, v0.2s, v1.2s 586; CHECK-NEXT: ret 587 %res = call <2 x i32> @llvm.smin.v2i32(<2 x i32> %op1, <2 x i32> %op2) 588 ret <2 x i32> %res 589} 590 591; Don't use SVE for 128-bit vectors. 592define <4 x i32> @smin_v4i32(<4 x i32> %op1, <4 x i32> %op2) #0 { 593; CHECK-LABEL: smin_v4i32: 594; CHECK: smin v0.4s, v0.4s, v1.4s 595; CHECK-NEXT: ret 596 %res = call <4 x i32> @llvm.smin.v4i32(<4 x i32> %op1, <4 x i32> %op2) 597 ret <4 x i32> %res 598} 599 600define void @smin_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 { 601; CHECK-LABEL: smin_v8i32: 602; CHECK: ptrue [[PG:p[0-9]+]].s, vl8 603; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0] 604; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1] 605; CHECK-NEXT: smin [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s 606; CHECK-NEXT: st1w { [[RES]].s }, [[PG]], [x0] 607; CHECK-NEXT: ret 608 %op1 = load <8 x i32>, <8 x i32>* %a 609 %op2 = load <8 x i32>, <8 x i32>* %b 610 %res = call <8 x i32> @llvm.smin.v8i32(<8 x i32> %op1, <8 x i32> %op2) 611 store <8 x i32> %res, <8 x i32>* %a 612 ret void 613} 614 615define void @smin_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 { 616; CHECK-LABEL: smin_v16i32: 617; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16 618; VBITS_GE_512-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0] 619; VBITS_GE_512-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1] 620; VBITS_GE_512-NEXT: smin [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s 621; VBITS_GE_512-NEXT: st1w { [[RES]].s }, [[PG]], [x0] 622; VBITS_GE_512-NEXT: ret 623 624; Ensure sensible type legalisation. 625; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8 626; VBITS_EQ_256-DAG: add x[[A_HI:[0-9]+]], x0, #32 627; VBITS_EQ_256-DAG: add x[[B_HI:[0-9]+]], x1, #32 628; VBITS_EQ_256-DAG: ld1w { [[OP1_LO:z[0-9]+]].s }, [[PG]]/z, [x0] 629; VBITS_EQ_256-DAG: ld1w { [[OP1_HI:z[0-9]+]].s }, [[PG]]/z, [x[[A_HI]]] 630; VBITS_EQ_256-DAG: ld1w { [[OP2_LO:z[0-9]+]].s }, [[PG]]/z, [x1] 631; VBITS_EQ_256-DAG: ld1w { [[OP2_HI:z[0-9]+]].s }, [[PG]]/z, [x[[B_HI]]] 632; VBITS_EQ_256-DAG: smin [[RES_LO:z[0-9]+]].s, [[PG]]/m, [[OP1_LO]].s, [[OP2_LO]].s 633; VBITS_EQ_256-DAG: smin [[RES_HI:z[0-9]+]].s, [[PG]]/m, [[OP1_HI]].s, [[OP2_HI]].s 634; VBITS_EQ_256-DAG: st1w { [[RES_LO]].s }, [[PG]], [x0] 635; VBITS_EQ_256-DAG: st1w { [[RES_HI]].s }, [[PG]], [x[[A_HI]] 636; VBITS_EQ_256-NEXT: ret 637 %op1 = load <16 x i32>, <16 x i32>* %a 638 %op2 = load <16 x i32>, <16 x i32>* %b 639 %res = call <16 x i32> @llvm.smin.v16i32(<16 x i32> %op1, <16 x i32> %op2) 640 store <16 x i32> %res, <16 x i32>* %a 641 ret void 642} 643 644define void @smin_v32i32(<32 x i32>* %a, <32 x i32>* %b) #0 { 645; CHECK-LABEL: smin_v32i32: 646; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32 647; VBITS_GE_1024-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0] 648; VBITS_GE_1024-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1] 649; VBITS_GE_1024-NEXT: smin [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s 650; VBITS_GE_1024-NEXT: st1w { [[RES]].s }, [[PG]], [x0] 651; VBITS_GE_1024-NEXT: ret 652 %op1 = load <32 x i32>, <32 x i32>* %a 653 %op2 = load <32 x i32>, <32 x i32>* %b 654 %res = call <32 x i32> @llvm.smin.v32i32(<32 x i32> %op1, <32 x i32> %op2) 655 store <32 x i32> %res, <32 x i32>* %a 656 ret void 657} 658 659define void @smin_v64i32(<64 x i32>* %a, <64 x i32>* %b) #0 { 660; CHECK-LABEL: smin_v64i32: 661; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64 662; VBITS_GE_2048-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0] 663; VBITS_GE_2048-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1] 664; VBITS_GE_2048-NEXT: smin [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s 665; VBITS_GE_2048-NEXT: st1w { [[RES]].s }, [[PG]], [x0] 666; VBITS_GE_2048-NEXT: ret 667 %op1 = load <64 x i32>, <64 x i32>* %a 668 %op2 = load <64 x i32>, <64 x i32>* %b 669 %res = call <64 x i32> @llvm.smin.v64i32(<64 x i32> %op1, <64 x i32> %op2) 670 store <64 x i32> %res, <64 x i32>* %a 671 ret void 672} 673 674; Vector i64 min are not legal for NEON so use SVE when available. 675define <1 x i64> @smin_v1i64(<1 x i64> %op1, <1 x i64> %op2) #0 { 676; CHECK-LABEL: smin_v1i64: 677; CHECK: ptrue [[PG:p[0-9]+]].d, vl1 678; CHECK-NEXT: smin z0.d, [[PG]]/m, z0.d, z1.d 679; CHECK-NEXT: ret 680 %res = call <1 x i64> @llvm.smin.v1i64(<1 x i64> %op1, <1 x i64> %op2) 681 ret <1 x i64> %res 682} 683 684; Vector i64 min are not legal for NEON so use SVE when available. 685define <2 x i64> @smin_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 { 686; CHECK-LABEL: smin_v2i64: 687; CHECK: ptrue [[PG:p[0-9]+]].d, vl2 688; CHECK-NEXT: smin z0.d, [[PG]]/m, z0.d, z1.d 689; CHECK-NEXT: ret 690 %res = call <2 x i64> @llvm.smin.v2i64(<2 x i64> %op1, <2 x i64> %op2) 691 ret <2 x i64> %res 692} 693 694define void @smin_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 { 695; CHECK-LABEL: smin_v4i64: 696; CHECK: ptrue [[PG:p[0-9]+]].d, vl4 697; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0] 698; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1] 699; CHECK-NEXT: smin [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d 700; CHECK-NEXT: st1d { [[RES]].d }, [[PG]], [x0] 701; CHECK-NEXT: ret 702 %op1 = load <4 x i64>, <4 x i64>* %a 703 %op2 = load <4 x i64>, <4 x i64>* %b 704 %res = call <4 x i64> @llvm.smin.v4i64(<4 x i64> %op1, <4 x i64> %op2) 705 store <4 x i64> %res, <4 x i64>* %a 706 ret void 707} 708 709define void @smin_v8i64(<8 x i64>* %a, <8 x i64>* %b) #0 { 710; CHECK-LABEL: smin_v8i64: 711; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8 712; VBITS_GE_512-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0] 713; VBITS_GE_512-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1] 714; VBITS_GE_512-NEXT: smin [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d 715; VBITS_GE_512-NEXT: st1d { [[RES]].d }, [[PG]], [x0] 716; VBITS_GE_512-NEXT: ret 717 718; Ensure sensible type legalisation. 719; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4 720; VBITS_EQ_256-DAG: add x[[A_HI:[0-9]+]], x0, #32 721; VBITS_EQ_256-DAG: add x[[B_HI:[0-9]+]], x1, #32 722; VBITS_EQ_256-DAG: ld1d { [[OP1_LO:z[0-9]+]].d }, [[PG]]/z, [x0] 723; VBITS_EQ_256-DAG: ld1d { [[OP1_HI:z[0-9]+]].d }, [[PG]]/z, [x[[A_HI]]] 724; VBITS_EQ_256-DAG: ld1d { [[OP2_LO:z[0-9]+]].d }, [[PG]]/z, [x1] 725; VBITS_EQ_256-DAG: ld1d { [[OP2_HI:z[0-9]+]].d }, [[PG]]/z, [x[[B_HI]]] 726; VBITS_EQ_256-DAG: smin [[RES_LO:z[0-9]+]].d, [[PG]]/m, [[OP1_LO]].d, [[OP2_LO]].d 727; VBITS_EQ_256-DAG: smin [[RES_HI:z[0-9]+]].d, [[PG]]/m, [[OP1_HI]].d, [[OP2_HI]].d 728; VBITS_EQ_256-DAG: st1d { [[RES_LO]].d }, [[PG]], [x0] 729; VBITS_EQ_256-DAG: st1d { [[RES_HI]].d }, [[PG]], [x[[A_HI]] 730; VBITS_EQ_256-NEXT: ret 731 %op1 = load <8 x i64>, <8 x i64>* %a 732 %op2 = load <8 x i64>, <8 x i64>* %b 733 %res = call <8 x i64> @llvm.smin.v8i64(<8 x i64> %op1, <8 x i64> %op2) 734 store <8 x i64> %res, <8 x i64>* %a 735 ret void 736} 737 738define void @smin_v16i64(<16 x i64>* %a, <16 x i64>* %b) #0 { 739; CHECK-LABEL: smin_v16i64: 740; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16 741; VBITS_GE_1024-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0] 742; VBITS_GE_1024-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1] 743; VBITS_GE_1024-NEXT: smin [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d 744; VBITS_GE_1024-NEXT: st1d { [[RES]].d }, [[PG]], [x0] 745; VBITS_GE_1024-NEXT: ret 746 %op1 = load <16 x i64>, <16 x i64>* %a 747 %op2 = load <16 x i64>, <16 x i64>* %b 748 %res = call <16 x i64> @llvm.smin.v16i64(<16 x i64> %op1, <16 x i64> %op2) 749 store <16 x i64> %res, <16 x i64>* %a 750 ret void 751} 752 753define void @smin_v32i64(<32 x i64>* %a, <32 x i64>* %b) #0 { 754; CHECK-LABEL: smin_v32i64: 755; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32 756; VBITS_GE_2048-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0] 757; VBITS_GE_2048-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1] 758; VBITS_GE_2048-NEXT: smin [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d 759; VBITS_GE_2048-NEXT: st1d { [[RES]].d }, [[PG]], [x0] 760; VBITS_GE_2048-NEXT: ret 761 %op1 = load <32 x i64>, <32 x i64>* %a 762 %op2 = load <32 x i64>, <32 x i64>* %b 763 %res = call <32 x i64> @llvm.smin.v32i64(<32 x i64> %op1, <32 x i64> %op2) 764 store <32 x i64> %res, <32 x i64>* %a 765 ret void 766} 767 768; 769; UMAX 770; 771 772; Don't use SVE for 64-bit vectors. 773define <8 x i8> @umax_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 { 774; CHECK-LABEL: umax_v8i8: 775; CHECK: umax v0.8b, v0.8b, v1.8b 776; CHECK: ret 777 %res = call <8 x i8> @llvm.umax.v8i8(<8 x i8> %op1, <8 x i8> %op2) 778 ret <8 x i8> %res 779} 780 781; Don't use SVE for 128-bit vectors. 782define <16 x i8> @umax_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 { 783; CHECK-LABEL: umax_v16i8: 784; CHECK: umax v0.16b, v0.16b, v1.16b 785; CHECK: ret 786 %res = call <16 x i8> @llvm.umax.v16i8(<16 x i8> %op1, <16 x i8> %op2) 787 ret <16 x i8> %res 788} 789 790define void @umax_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 { 791; CHECK-LABEL: umax_v32i8: 792; CHECK: ptrue [[PG:p[0-9]+]].b, vl32 793; CHECK-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0] 794; CHECK-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1] 795; CHECK-NEXT: umax [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b 796; CHECK-NEXT: st1b { [[RES]].b }, [[PG]], [x0] 797; CHECK-NEXT: ret 798 %op1 = load <32 x i8>, <32 x i8>* %a 799 %op2 = load <32 x i8>, <32 x i8>* %b 800 %res = call <32 x i8> @llvm.umax.v32i8(<32 x i8> %op1, <32 x i8> %op2) 801 store <32 x i8> %res, <32 x i8>* %a 802 ret void 803} 804 805define void @umax_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 { 806; CHECK-LABEL: umax_v64i8: 807; VBITS_GE_512: ptrue [[PG:p[0-9]+]].b, vl64 808; VBITS_GE_512-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0] 809; VBITS_GE_512-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1] 810; VBITS_GE_512-NEXT: umax [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b 811; VBITS_GE_512-NEXT: st1b { [[RES]].b }, [[PG]], [x0] 812; VBITS_GE_512-NEXT: ret 813; 814; Ensure sensible type legalisation. 815; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].b, vl32 816; VBITS_EQ_256-DAG: mov w[[A:[0-9]+]], #32 817; VBITS_EQ_256-DAG: ld1b { [[OP1_LO:z[0-9]+]].b }, [[PG]]/z, [x0, x[[A]]] 818; VBITS_EQ_256-DAG: ld1b { [[OP1_HI:z[0-9]+]].b }, [[PG]]/z, [x0] 819; VBITS_EQ_256-DAG: ld1b { [[OP2_LO:z[0-9]+]].b }, [[PG]]/z, [x1, x[[A]]] 820; VBITS_EQ_256-DAG: ld1b { [[OP2_HI:z[0-9]+]].b }, [[PG]]/z, [x1] 821; VBITS_EQ_256-DAG: umax [[RES_LO:z[0-9]+]].b, [[PG]]/m, [[OP1_LO]].b, [[OP2_LO]].b 822; VBITS_EQ_256-DAG: umax [[RES_HI:z[0-9]+]].b, [[PG]]/m, [[OP1_HI]].b, [[OP2_HI]].b 823; VBITS_EQ_256-DAG: st1b { [[RES_LO]].b }, [[PG]], [x0, x[[A]]] 824; VBITS_EQ_256-DAG: st1b { [[RES_HI]].b }, [[PG]], [x0] 825; VBITS_EQ_256-NEXT: ret 826 %op1 = load <64 x i8>, <64 x i8>* %a 827 %op2 = load <64 x i8>, <64 x i8>* %b 828 %res = call <64 x i8> @llvm.umax.v64i8(<64 x i8> %op1, <64 x i8> %op2) 829 store <64 x i8> %res, <64 x i8>* %a 830 ret void 831} 832 833define void @umax_v128i8(<128 x i8>* %a, <128 x i8>* %b) #0 { 834; CHECK-LABEL: umax_v128i8: 835; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].b, vl128 836; VBITS_GE_1024-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0] 837; VBITS_GE_1024-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1] 838; VBITS_GE_1024-NEXT: umax [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b 839; VBITS_GE_1024-NEXT: st1b { [[RES]].b }, [[PG]], [x0] 840; VBITS_GE_1024-NEXT: ret 841 %op1 = load <128 x i8>, <128 x i8>* %a 842 %op2 = load <128 x i8>, <128 x i8>* %b 843 %res = call <128 x i8> @llvm.umax.v128i8(<128 x i8> %op1, <128 x i8> %op2) 844 store <128 x i8> %res, <128 x i8>* %a 845 ret void 846} 847 848define void @umax_v256i8(<256 x i8>* %a, <256 x i8>* %b) #0 { 849; CHECK-LABEL: umax_v256i8: 850; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].b, vl256 851; VBITS_GE_2048-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0] 852; VBITS_GE_2048-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1] 853; VBITS_GE_2048-NEXT: umax [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b 854; VBITS_GE_2048-NEXT: st1b { [[RES]].b }, [[PG]], [x0] 855; VBITS_GE_2048-NEXT: ret 856 %op1 = load <256 x i8>, <256 x i8>* %a 857 %op2 = load <256 x i8>, <256 x i8>* %b 858 %res = call <256 x i8> @llvm.umax.v256i8(<256 x i8> %op1, <256 x i8> %op2) 859 store <256 x i8> %res, <256 x i8>* %a 860 ret void 861} 862 863; Don't use SVE for 64-bit vectors. 864define <4 x i16> @umax_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 { 865; CHECK-LABEL: umax_v4i16: 866; CHECK: umax v0.4h, v0.4h, v1.4h 867; CHECK-NEXT: ret 868 %res = call <4 x i16> @llvm.umax.v4i16(<4 x i16> %op1, <4 x i16> %op2) 869 ret <4 x i16> %res 870} 871 872; Don't use SVE for 128-bit vectors. 873define <8 x i16> @umax_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 { 874; CHECK-LABEL: umax_v8i16: 875; CHECK: umax v0.8h, v0.8h, v1.8h 876; CHECK-NEXT: ret 877 %res = call <8 x i16> @llvm.umax.v8i16(<8 x i16> %op1, <8 x i16> %op2) 878 ret <8 x i16> %res 879} 880 881define void @umax_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 { 882; CHECK-LABEL: umax_v16i16: 883; CHECK: ptrue [[PG:p[0-9]+]].h, vl16 884; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0] 885; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1] 886; CHECK-NEXT: umax [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h 887; CHECK-NEXT: st1h { [[RES]].h }, [[PG]], [x0] 888; CHECK-NEXT: ret 889 %op1 = load <16 x i16>, <16 x i16>* %a 890 %op2 = load <16 x i16>, <16 x i16>* %b 891 %res = call <16 x i16> @llvm.umax.v16i16(<16 x i16> %op1, <16 x i16> %op2) 892 store <16 x i16> %res, <16 x i16>* %a 893 ret void 894} 895 896define void @umax_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 { 897; CHECK-LABEL: umax_v32i16: 898; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32 899; VBITS_GE_512-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0] 900; VBITS_GE_512-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1] 901; VBITS_GE_512-NEXT: umax [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h 902; VBITS_GE_512-NEXT: st1h { [[RES]].h }, [[PG]], [x0] 903; VBITS_GE_512-NEXT: ret 904 905; Ensure sensible type legalisation. 906; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16 907; VBITS_EQ_256-DAG: add x[[A_HI:[0-9]+]], x0, #32 908; VBITS_EQ_256-DAG: add x[[B_HI:[0-9]+]], x1, #32 909; VBITS_EQ_256-DAG: ld1h { [[OP1_LO:z[0-9]+]].h }, [[PG]]/z, [x0] 910; VBITS_EQ_256-DAG: ld1h { [[OP1_HI:z[0-9]+]].h }, [[PG]]/z, [x[[A_HI]]] 911; VBITS_EQ_256-DAG: ld1h { [[OP2_LO:z[0-9]+]].h }, [[PG]]/z, [x1] 912; VBITS_EQ_256-DAG: ld1h { [[OP2_HI:z[0-9]+]].h }, [[PG]]/z, [x[[B_HI]]] 913; VBITS_EQ_256-DAG: umax [[RES_LO:z[0-9]+]].h, [[PG]]/m, [[OP1_LO]].h, [[OP2_LO]].h 914; VBITS_EQ_256-DAG: umax [[RES_HI:z[0-9]+]].h, [[PG]]/m, [[OP1_HI]].h, [[OP2_HI]].h 915; VBITS_EQ_256-DAG: st1h { [[RES_LO]].h }, [[PG]], [x0] 916; VBITS_EQ_256-DAG: st1h { [[RES_HI]].h }, [[PG]], [x[[A_HI]] 917; VBITS_EQ_256-NEXT: ret 918 %op1 = load <32 x i16>, <32 x i16>* %a 919 %op2 = load <32 x i16>, <32 x i16>* %b 920 %res = call <32 x i16> @llvm.umax.v32i16(<32 x i16> %op1, <32 x i16> %op2) 921 store <32 x i16> %res, <32 x i16>* %a 922 ret void 923} 924 925define void @umax_v64i16(<64 x i16>* %a, <64 x i16>* %b) #0 { 926; CHECK-LABEL: umax_v64i16: 927; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64 928; VBITS_GE_1024-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0] 929; VBITS_GE_1024-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1] 930; VBITS_GE_1024-NEXT: umax [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h 931; VBITS_GE_1024-NEXT: st1h { [[RES]].h }, [[PG]], [x0] 932; VBITS_GE_1024-NEXT: ret 933 %op1 = load <64 x i16>, <64 x i16>* %a 934 %op2 = load <64 x i16>, <64 x i16>* %b 935 %res = call <64 x i16> @llvm.umax.v64i16(<64 x i16> %op1, <64 x i16> %op2) 936 store <64 x i16> %res, <64 x i16>* %a 937 ret void 938} 939 940define void @umax_v128i16(<128 x i16>* %a, <128 x i16>* %b) #0 { 941; CHECK-LABEL: umax_v128i16: 942; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl128 943; VBITS_GE_2048-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0] 944; VBITS_GE_2048-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1] 945; VBITS_GE_2048-NEXT: umax [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h 946; VBITS_GE_2048-NEXT: st1h { [[RES]].h }, [[PG]], [x0] 947; VBITS_GE_2048-NEXT: ret 948 %op1 = load <128 x i16>, <128 x i16>* %a 949 %op2 = load <128 x i16>, <128 x i16>* %b 950 %res = call <128 x i16> @llvm.umax.v128i16(<128 x i16> %op1, <128 x i16> %op2) 951 store <128 x i16> %res, <128 x i16>* %a 952 ret void 953} 954 955; Don't use SVE for 64-bit vectors. 956define <2 x i32> @umax_v2i32(<2 x i32> %op1, <2 x i32> %op2) #0 { 957; CHECK-LABEL: umax_v2i32: 958; CHECK: umax v0.2s, v0.2s, v1.2s 959; CHECK-NEXT: ret 960 %res = call <2 x i32> @llvm.umax.v2i32(<2 x i32> %op1, <2 x i32> %op2) 961 ret <2 x i32> %res 962} 963 964; Don't use SVE for 128-bit vectors. 965define <4 x i32> @umax_v4i32(<4 x i32> %op1, <4 x i32> %op2) #0 { 966; CHECK-LABEL: umax_v4i32: 967; CHECK: umax v0.4s, v0.4s, v1.4s 968; CHECK-NEXT: ret 969 %res = call <4 x i32> @llvm.umax.v4i32(<4 x i32> %op1, <4 x i32> %op2) 970 ret <4 x i32> %res 971} 972 973define void @umax_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 { 974; CHECK-LABEL: umax_v8i32: 975; CHECK: ptrue [[PG:p[0-9]+]].s, vl8 976; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0] 977; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1] 978; CHECK-NEXT: umax [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s 979; CHECK-NEXT: st1w { [[RES]].s }, [[PG]], [x0] 980; CHECK-NEXT: ret 981 %op1 = load <8 x i32>, <8 x i32>* %a 982 %op2 = load <8 x i32>, <8 x i32>* %b 983 %res = call <8 x i32> @llvm.umax.v8i32(<8 x i32> %op1, <8 x i32> %op2) 984 store <8 x i32> %res, <8 x i32>* %a 985 ret void 986} 987 988define void @umax_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 { 989; CHECK-LABEL: umax_v16i32: 990; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16 991; VBITS_GE_512-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0] 992; VBITS_GE_512-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1] 993; VBITS_GE_512-NEXT: umax [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s 994; VBITS_GE_512-NEXT: st1w { [[RES]].s }, [[PG]], [x0] 995; VBITS_GE_512-NEXT: ret 996 997; Ensure sensible type legalisation. 998; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8 999; VBITS_EQ_256-DAG: add x[[A_HI:[0-9]+]], x0, #32 1000; VBITS_EQ_256-DAG: add x[[B_HI:[0-9]+]], x1, #32 1001; VBITS_EQ_256-DAG: ld1w { [[OP1_LO:z[0-9]+]].s }, [[PG]]/z, [x0] 1002; VBITS_EQ_256-DAG: ld1w { [[OP1_HI:z[0-9]+]].s }, [[PG]]/z, [x[[A_HI]]] 1003; VBITS_EQ_256-DAG: ld1w { [[OP2_LO:z[0-9]+]].s }, [[PG]]/z, [x1] 1004; VBITS_EQ_256-DAG: ld1w { [[OP2_HI:z[0-9]+]].s }, [[PG]]/z, [x[[B_HI]]] 1005; VBITS_EQ_256-DAG: umax [[RES_LO:z[0-9]+]].s, [[PG]]/m, [[OP1_LO]].s, [[OP2_LO]].s 1006; VBITS_EQ_256-DAG: umax [[RES_HI:z[0-9]+]].s, [[PG]]/m, [[OP1_HI]].s, [[OP2_HI]].s 1007; VBITS_EQ_256-DAG: st1w { [[RES_LO]].s }, [[PG]], [x0] 1008; VBITS_EQ_256-DAG: st1w { [[RES_HI]].s }, [[PG]], [x[[A_HI]] 1009; VBITS_EQ_256-NEXT: ret 1010 %op1 = load <16 x i32>, <16 x i32>* %a 1011 %op2 = load <16 x i32>, <16 x i32>* %b 1012 %res = call <16 x i32> @llvm.umax.v16i32(<16 x i32> %op1, <16 x i32> %op2) 1013 store <16 x i32> %res, <16 x i32>* %a 1014 ret void 1015} 1016 1017define void @umax_v32i32(<32 x i32>* %a, <32 x i32>* %b) #0 { 1018; CHECK-LABEL: umax_v32i32: 1019; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32 1020; VBITS_GE_1024-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0] 1021; VBITS_GE_1024-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1] 1022; VBITS_GE_1024-NEXT: umax [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s 1023; VBITS_GE_1024-NEXT: st1w { [[RES]].s }, [[PG]], [x0] 1024; VBITS_GE_1024-NEXT: ret 1025 %op1 = load <32 x i32>, <32 x i32>* %a 1026 %op2 = load <32 x i32>, <32 x i32>* %b 1027 %res = call <32 x i32> @llvm.umax.v32i32(<32 x i32> %op1, <32 x i32> %op2) 1028 store <32 x i32> %res, <32 x i32>* %a 1029 ret void 1030} 1031 1032define void @umax_v64i32(<64 x i32>* %a, <64 x i32>* %b) #0 { 1033; CHECK-LABEL: umax_v64i32: 1034; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64 1035; VBITS_GE_2048-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0] 1036; VBITS_GE_2048-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1] 1037; VBITS_GE_2048-NEXT: umax [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s 1038; VBITS_GE_2048-NEXT: st1w { [[RES]].s }, [[PG]], [x0] 1039; VBITS_GE_2048-NEXT: ret 1040 %op1 = load <64 x i32>, <64 x i32>* %a 1041 %op2 = load <64 x i32>, <64 x i32>* %b 1042 %res = call <64 x i32> @llvm.umax.v64i32(<64 x i32> %op1, <64 x i32> %op2) 1043 store <64 x i32> %res, <64 x i32>* %a 1044 ret void 1045} 1046 1047; Vector i64 max are not legal for NEON so use SVE when available. 1048define <1 x i64> @umax_v1i64(<1 x i64> %op1, <1 x i64> %op2) #0 { 1049; CHECK-LABEL: umax_v1i64: 1050; CHECK: ptrue [[PG:p[0-9]+]].d, vl1 1051; CHECK-NEXT: umax z0.d, [[PG]]/m, z0.d, z1.d 1052; CHECK-NEXT: ret 1053 %res = call <1 x i64> @llvm.umax.v1i64(<1 x i64> %op1, <1 x i64> %op2) 1054 ret <1 x i64> %res 1055} 1056 1057; Vector i64 max are not legal for NEON so use SVE when available. 1058define <2 x i64> @umax_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 { 1059; CHECK-LABEL: umax_v2i64: 1060; CHECK: ptrue [[PG:p[0-9]+]].d, vl2 1061; CHECK-NEXT: umax z0.d, [[PG]]/m, z0.d, z1.d 1062; CHECK-NEXT: ret 1063 %res = call <2 x i64> @llvm.umax.v2i64(<2 x i64> %op1, <2 x i64> %op2) 1064 ret <2 x i64> %res 1065} 1066 1067define void @umax_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 { 1068; CHECK-LABEL: umax_v4i64: 1069; CHECK: ptrue [[PG:p[0-9]+]].d, vl4 1070; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0] 1071; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1] 1072; CHECK-NEXT: umax [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d 1073; CHECK-NEXT: st1d { [[RES]].d }, [[PG]], [x0] 1074; CHECK-NEXT: ret 1075 %op1 = load <4 x i64>, <4 x i64>* %a 1076 %op2 = load <4 x i64>, <4 x i64>* %b 1077 %res = call <4 x i64> @llvm.umax.v4i64(<4 x i64> %op1, <4 x i64> %op2) 1078 store <4 x i64> %res, <4 x i64>* %a 1079 ret void 1080} 1081 1082define void @umax_v8i64(<8 x i64>* %a, <8 x i64>* %b) #0 { 1083; CHECK-LABEL: umax_v8i64: 1084; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8 1085; VBITS_GE_512-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0] 1086; VBITS_GE_512-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1] 1087; VBITS_GE_512-NEXT: umax [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d 1088; VBITS_GE_512-NEXT: st1d { [[RES]].d }, [[PG]], [x0] 1089; VBITS_GE_512-NEXT: ret 1090 1091; Ensure sensible type legalisation. 1092; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4 1093; VBITS_EQ_256-DAG: add x[[A_HI:[0-9]+]], x0, #32 1094; VBITS_EQ_256-DAG: add x[[B_HI:[0-9]+]], x1, #32 1095; VBITS_EQ_256-DAG: ld1d { [[OP1_LO:z[0-9]+]].d }, [[PG]]/z, [x0] 1096; VBITS_EQ_256-DAG: ld1d { [[OP1_HI:z[0-9]+]].d }, [[PG]]/z, [x[[A_HI]]] 1097; VBITS_EQ_256-DAG: ld1d { [[OP2_LO:z[0-9]+]].d }, [[PG]]/z, [x1] 1098; VBITS_EQ_256-DAG: ld1d { [[OP2_HI:z[0-9]+]].d }, [[PG]]/z, [x[[B_HI]]] 1099; VBITS_EQ_256-DAG: umax [[RES_LO:z[0-9]+]].d, [[PG]]/m, [[OP1_LO]].d, [[OP2_LO]].d 1100; VBITS_EQ_256-DAG: umax [[RES_HI:z[0-9]+]].d, [[PG]]/m, [[OP1_HI]].d, [[OP2_HI]].d 1101; VBITS_EQ_256-DAG: st1d { [[RES_LO]].d }, [[PG]], [x0] 1102; VBITS_EQ_256-DAG: st1d { [[RES_HI]].d }, [[PG]], [x[[A_HI]] 1103; VBITS_EQ_256-NEXT: ret 1104 %op1 = load <8 x i64>, <8 x i64>* %a 1105 %op2 = load <8 x i64>, <8 x i64>* %b 1106 %res = call <8 x i64> @llvm.umax.v8i64(<8 x i64> %op1, <8 x i64> %op2) 1107 store <8 x i64> %res, <8 x i64>* %a 1108 ret void 1109} 1110 1111define void @umax_v16i64(<16 x i64>* %a, <16 x i64>* %b) #0 { 1112; CHECK-LABEL: umax_v16i64: 1113; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16 1114; VBITS_GE_1024-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0] 1115; VBITS_GE_1024-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1] 1116; VBITS_GE_1024-NEXT: umax [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d 1117; VBITS_GE_1024-NEXT: st1d { [[RES]].d }, [[PG]], [x0] 1118; VBITS_GE_1024-NEXT: ret 1119 %op1 = load <16 x i64>, <16 x i64>* %a 1120 %op2 = load <16 x i64>, <16 x i64>* %b 1121 %res = call <16 x i64> @llvm.umax.v16i64(<16 x i64> %op1, <16 x i64> %op2) 1122 store <16 x i64> %res, <16 x i64>* %a 1123 ret void 1124} 1125 1126define void @umax_v32i64(<32 x i64>* %a, <32 x i64>* %b) #0 { 1127; CHECK-LABEL: umax_v32i64: 1128; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32 1129; VBITS_GE_2048-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0] 1130; VBITS_GE_2048-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1] 1131; VBITS_GE_2048-NEXT: umax [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d 1132; VBITS_GE_2048-NEXT: st1d { [[RES]].d }, [[PG]], [x0] 1133; VBITS_GE_2048-NEXT: ret 1134 %op1 = load <32 x i64>, <32 x i64>* %a 1135 %op2 = load <32 x i64>, <32 x i64>* %b 1136 %res = call <32 x i64> @llvm.umax.v32i64(<32 x i64> %op1, <32 x i64> %op2) 1137 store <32 x i64> %res, <32 x i64>* %a 1138 ret void 1139} 1140 1141; 1142; UMIN 1143; 1144 1145; Don't use SVE for 64-bit vectors. 1146define <8 x i8> @umin_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 { 1147; CHECK-LABEL: umin_v8i8: 1148; CHECK: umin v0.8b, v0.8b, v1.8b 1149; CHECK-NEXT: ret 1150 %res = call <8 x i8> @llvm.umin.v8i8(<8 x i8> %op1, <8 x i8> %op2) 1151 ret <8 x i8> %res 1152} 1153 1154; Don't use SVE for 128-bit vectors. 1155define <16 x i8> @umin_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 { 1156; CHECK-LABEL: umin_v16i8: 1157; CHECK: umin v0.16b, v0.16b, v1.16b 1158; CHECK: ret 1159 %res = call <16 x i8> @llvm.umin.v16i8(<16 x i8> %op1, <16 x i8> %op2) 1160 ret <16 x i8> %res 1161} 1162 1163define void @umin_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 { 1164; CHECK-LABEL: umin_v32i8: 1165; CHECK: ptrue [[PG:p[0-9]+]].b, vl32 1166; CHECK-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0] 1167; CHECK-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1] 1168; CHECK-NEXT: umin [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b 1169; CHECK-NEXT: st1b { [[RES]].b }, [[PG]], [x0] 1170; CHECK-NEXT: ret 1171 %op1 = load <32 x i8>, <32 x i8>* %a 1172 %op2 = load <32 x i8>, <32 x i8>* %b 1173 %res = call <32 x i8> @llvm.umin.v32i8(<32 x i8> %op1, <32 x i8> %op2) 1174 store <32 x i8> %res, <32 x i8>* %a 1175 ret void 1176} 1177 1178define void @umin_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 { 1179; CHECK-LABEL: umin_v64i8: 1180; VBITS_GE_512: ptrue [[PG:p[0-9]+]].b, vl64 1181; VBITS_GE_512-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0] 1182; VBITS_GE_512-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1] 1183; VBITS_GE_512-NEXT: umin [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b 1184; VBITS_GE_512-NEXT: st1b { [[RES]].b }, [[PG]], [x0] 1185; VBITS_GE_512-NEXT: ret 1186; 1187; Ensure sensible type legalisation. 1188; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].b, vl32 1189; VBITS_EQ_256-DAG: mov w[[A:[0-9]+]], #32 1190; VBITS_EQ_256-DAG: ld1b { [[OP1_LO:z[0-9]+]].b }, [[PG]]/z, [x0, x[[A]]] 1191; VBITS_EQ_256-DAG: ld1b { [[OP1_HI:z[0-9]+]].b }, [[PG]]/z, [x0] 1192; VBITS_EQ_256-DAG: ld1b { [[OP2_LO:z[0-9]+]].b }, [[PG]]/z, [x1, x[[A]]] 1193; VBITS_EQ_256-DAG: ld1b { [[OP2_HI:z[0-9]+]].b }, [[PG]]/z, [x1] 1194; VBITS_EQ_256-DAG: umin [[RES_LO:z[0-9]+]].b, [[PG]]/m, [[OP1_LO]].b, [[OP2_LO]].b 1195; VBITS_EQ_256-DAG: umin [[RES_HI:z[0-9]+]].b, [[PG]]/m, [[OP1_HI]].b, [[OP2_HI]].b 1196; VBITS_EQ_256-DAG: st1b { [[RES_LO]].b }, [[PG]], [x0, x[[A]]] 1197; VBITS_EQ_256-DAG: st1b { [[RES_HI]].b }, [[PG]], [x0] 1198 %op1 = load <64 x i8>, <64 x i8>* %a 1199 %op2 = load <64 x i8>, <64 x i8>* %b 1200 %res = call <64 x i8> @llvm.umin.v64i8(<64 x i8> %op1, <64 x i8> %op2) 1201 store <64 x i8> %res, <64 x i8>* %a 1202 ret void 1203} 1204 1205define void @umin_v128i8(<128 x i8>* %a, <128 x i8>* %b) #0 { 1206; CHECK-LABEL: umin_v128i8: 1207; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].b, vl128 1208; VBITS_GE_1024-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0] 1209; VBITS_GE_1024-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1] 1210; VBITS_GE_1024-NEXT: umin [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b 1211; VBITS_GE_1024-NEXT: st1b { [[RES]].b }, [[PG]], [x0] 1212; VBITS_GE_1024-NEXT: ret 1213 %op1 = load <128 x i8>, <128 x i8>* %a 1214 %op2 = load <128 x i8>, <128 x i8>* %b 1215 %res = call <128 x i8> @llvm.umin.v128i8(<128 x i8> %op1, <128 x i8> %op2) 1216 store <128 x i8> %res, <128 x i8>* %a 1217 ret void 1218} 1219 1220define void @umin_v256i8(<256 x i8>* %a, <256 x i8>* %b) #0 { 1221; CHECK-LABEL: umin_v256i8: 1222; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].b, vl256 1223; VBITS_GE_2048-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0] 1224; VBITS_GE_2048-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1] 1225; VBITS_GE_2048-NEXT: umin [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b 1226; VBITS_GE_2048-NEXT: st1b { [[RES]].b }, [[PG]], [x0] 1227; VBITS_GE_2048-NEXT: ret 1228 %op1 = load <256 x i8>, <256 x i8>* %a 1229 %op2 = load <256 x i8>, <256 x i8>* %b 1230 %res = call <256 x i8> @llvm.umin.v256i8(<256 x i8> %op1, <256 x i8> %op2) 1231 store <256 x i8> %res, <256 x i8>* %a 1232 ret void 1233} 1234 1235; Don't use SVE for 64-bit vectors. 1236define <4 x i16> @umin_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 { 1237; CHECK-LABEL: umin_v4i16: 1238; CHECK: umin v0.4h, v0.4h, v1.4h 1239; CHECK-NEXT: ret 1240 %res = call <4 x i16> @llvm.umin.v4i16(<4 x i16> %op1, <4 x i16> %op2) 1241 ret <4 x i16> %res 1242} 1243 1244; Don't use SVE for 128-bit vectors. 1245define <8 x i16> @umin_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 { 1246; CHECK-LABEL: umin_v8i16: 1247; CHECK: umin v0.8h, v0.8h, v1.8h 1248; CHECK-NEXT: ret 1249 %res = call <8 x i16> @llvm.umin.v8i16(<8 x i16> %op1, <8 x i16> %op2) 1250 ret <8 x i16> %res 1251} 1252 1253define void @umin_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 { 1254; CHECK-LABEL: umin_v16i16: 1255; CHECK: ptrue [[PG:p[0-9]+]].h, vl16 1256; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0] 1257; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1] 1258; CHECK-NEXT: umin [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h 1259; CHECK-NEXT: st1h { [[RES]].h }, [[PG]], [x0] 1260; CHECK-NEXT: ret 1261 %op1 = load <16 x i16>, <16 x i16>* %a 1262 %op2 = load <16 x i16>, <16 x i16>* %b 1263 %res = call <16 x i16> @llvm.umin.v16i16(<16 x i16> %op1, <16 x i16> %op2) 1264 store <16 x i16> %res, <16 x i16>* %a 1265 ret void 1266} 1267 1268define void @umin_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 { 1269; CHECK-LABEL: umin_v32i16: 1270; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32 1271; VBITS_GE_512-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0] 1272; VBITS_GE_512-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1] 1273; VBITS_GE_512-NEXT: umin [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h 1274; VBITS_GE_512-NEXT: st1h { [[RES]].h }, [[PG]], [x0] 1275; VBITS_GE_512-NEXT: ret 1276 1277; Ensure sensible type legalisation. 1278; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16 1279; VBITS_EQ_256-DAG: add x[[A_HI:[0-9]+]], x0, #32 1280; VBITS_EQ_256-DAG: add x[[B_HI:[0-9]+]], x1, #32 1281; VBITS_EQ_256-DAG: ld1h { [[OP1_LO:z[0-9]+]].h }, [[PG]]/z, [x0] 1282; VBITS_EQ_256-DAG: ld1h { [[OP1_HI:z[0-9]+]].h }, [[PG]]/z, [x[[A_HI]]] 1283; VBITS_EQ_256-DAG: ld1h { [[OP2_LO:z[0-9]+]].h }, [[PG]]/z, [x1] 1284; VBITS_EQ_256-DAG: ld1h { [[OP2_HI:z[0-9]+]].h }, [[PG]]/z, [x[[B_HI]]] 1285; VBITS_EQ_256-DAG: umin [[RES_LO:z[0-9]+]].h, [[PG]]/m, [[OP1_LO]].h, [[OP2_LO]].h 1286; VBITS_EQ_256-DAG: umin [[RES_HI:z[0-9]+]].h, [[PG]]/m, [[OP1_HI]].h, [[OP2_HI]].h 1287; VBITS_EQ_256-DAG: st1h { [[RES_LO]].h }, [[PG]], [x0] 1288; VBITS_EQ_256-DAG: st1h { [[RES_HI]].h }, [[PG]], [x[[A_HI]] 1289; VBITS_EQ_256-NEXT: ret 1290 %op1 = load <32 x i16>, <32 x i16>* %a 1291 %op2 = load <32 x i16>, <32 x i16>* %b 1292 %res = call <32 x i16> @llvm.umin.v32i16(<32 x i16> %op1, <32 x i16> %op2) 1293 store <32 x i16> %res, <32 x i16>* %a 1294 ret void 1295} 1296 1297define void @umin_v64i16(<64 x i16>* %a, <64 x i16>* %b) #0 { 1298; CHECK-LABEL: umin_v64i16: 1299; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64 1300; VBITS_GE_1024-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0] 1301; VBITS_GE_1024-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1] 1302; VBITS_GE_1024-NEXT: umin [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h 1303; VBITS_GE_1024-NEXT: st1h { [[RES]].h }, [[PG]], [x0] 1304; VBITS_GE_1024-NEXT: ret 1305 %op1 = load <64 x i16>, <64 x i16>* %a 1306 %op2 = load <64 x i16>, <64 x i16>* %b 1307 %res = call <64 x i16> @llvm.umin.v64i16(<64 x i16> %op1, <64 x i16> %op2) 1308 store <64 x i16> %res, <64 x i16>* %a 1309 ret void 1310} 1311 1312define void @umin_v128i16(<128 x i16>* %a, <128 x i16>* %b) #0 { 1313; CHECK-LABEL: umin_v128i16: 1314; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl128 1315; VBITS_GE_2048-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0] 1316; VBITS_GE_2048-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1] 1317; VBITS_GE_2048-NEXT: umin [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h 1318; VBITS_GE_2048-NEXT: st1h { [[RES]].h }, [[PG]], [x0] 1319; VBITS_GE_2048-NEXT: ret 1320 %op1 = load <128 x i16>, <128 x i16>* %a 1321 %op2 = load <128 x i16>, <128 x i16>* %b 1322 %res = call <128 x i16> @llvm.umin.v128i16(<128 x i16> %op1, <128 x i16> %op2) 1323 store <128 x i16> %res, <128 x i16>* %a 1324 ret void 1325} 1326 1327; Don't use SVE for 64-bit vectors. 1328define <2 x i32> @umin_v2i32(<2 x i32> %op1, <2 x i32> %op2) #0 { 1329; CHECK-LABEL: umin_v2i32: 1330; CHECK: umin v0.2s, v0.2s, v1.2s 1331; CHECK-NEXT: ret 1332 %res = call <2 x i32> @llvm.umin.v2i32(<2 x i32> %op1, <2 x i32> %op2) 1333 ret <2 x i32> %res 1334} 1335 1336; Don't use SVE for 128-bit vectors. 1337define <4 x i32> @umin_v4i32(<4 x i32> %op1, <4 x i32> %op2) #0 { 1338; CHECK-LABEL: umin_v4i32: 1339; CHECK: umin v0.4s, v0.4s, v1.4s 1340; CHECK-NEXT: ret 1341 %res = call <4 x i32> @llvm.umin.v4i32(<4 x i32> %op1, <4 x i32> %op2) 1342 ret <4 x i32> %res 1343} 1344 1345define void @umin_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 { 1346; CHECK-LABEL: umin_v8i32: 1347; CHECK: ptrue [[PG:p[0-9]+]].s, vl8 1348; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0] 1349; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1] 1350; CHECK-NEXT: umin [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s 1351; CHECK-NEXT: st1w { [[RES]].s }, [[PG]], [x0] 1352; CHECK-NEXT: ret 1353 %op1 = load <8 x i32>, <8 x i32>* %a 1354 %op2 = load <8 x i32>, <8 x i32>* %b 1355 %res = call <8 x i32> @llvm.umin.v8i32(<8 x i32> %op1, <8 x i32> %op2) 1356 store <8 x i32> %res, <8 x i32>* %a 1357 ret void 1358} 1359 1360define void @umin_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 { 1361; CHECK-LABEL: umin_v16i32: 1362; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16 1363; VBITS_GE_512-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0] 1364; VBITS_GE_512-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1] 1365; VBITS_GE_512-NEXT: umin [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s 1366; VBITS_GE_512-NEXT: st1w { [[RES]].s }, [[PG]], [x0] 1367; VBITS_GE_512-NEXT: ret 1368 1369; Ensure sensible type legalisation. 1370; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8 1371; VBITS_EQ_256-DAG: add x[[A_HI:[0-9]+]], x0, #32 1372; VBITS_EQ_256-DAG: add x[[B_HI:[0-9]+]], x1, #32 1373; VBITS_EQ_256-DAG: ld1w { [[OP1_LO:z[0-9]+]].s }, [[PG]]/z, [x0] 1374; VBITS_EQ_256-DAG: ld1w { [[OP1_HI:z[0-9]+]].s }, [[PG]]/z, [x[[A_HI]]] 1375; VBITS_EQ_256-DAG: ld1w { [[OP2_LO:z[0-9]+]].s }, [[PG]]/z, [x1] 1376; VBITS_EQ_256-DAG: ld1w { [[OP2_HI:z[0-9]+]].s }, [[PG]]/z, [x[[B_HI]]] 1377; VBITS_EQ_256-DAG: umin [[RES_LO:z[0-9]+]].s, [[PG]]/m, [[OP1_LO]].s, [[OP2_LO]].s 1378; VBITS_EQ_256-DAG: umin [[RES_HI:z[0-9]+]].s, [[PG]]/m, [[OP1_HI]].s, [[OP2_HI]].s 1379; VBITS_EQ_256-DAG: st1w { [[RES_LO]].s }, [[PG]], [x0] 1380; VBITS_EQ_256-DAG: st1w { [[RES_HI]].s }, [[PG]], [x[[A_HI]] 1381; VBITS_EQ_256-NEXT: ret 1382 %op1 = load <16 x i32>, <16 x i32>* %a 1383 %op2 = load <16 x i32>, <16 x i32>* %b 1384 %res = call <16 x i32> @llvm.umin.v16i32(<16 x i32> %op1, <16 x i32> %op2) 1385 store <16 x i32> %res, <16 x i32>* %a 1386 ret void 1387} 1388 1389define void @umin_v32i32(<32 x i32>* %a, <32 x i32>* %b) #0 { 1390; CHECK-LABEL: umin_v32i32: 1391; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32 1392; VBITS_GE_1024-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0] 1393; VBITS_GE_1024-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1] 1394; VBITS_GE_1024-NEXT: umin [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s 1395; VBITS_GE_1024-NEXT: st1w { [[RES]].s }, [[PG]], [x0] 1396; VBITS_GE_1024-NEXT: ret 1397 %op1 = load <32 x i32>, <32 x i32>* %a 1398 %op2 = load <32 x i32>, <32 x i32>* %b 1399 %res = call <32 x i32> @llvm.umin.v32i32(<32 x i32> %op1, <32 x i32> %op2) 1400 store <32 x i32> %res, <32 x i32>* %a 1401 ret void 1402} 1403 1404define void @umin_v64i32(<64 x i32>* %a, <64 x i32>* %b) #0 { 1405; CHECK-LABEL: umin_v64i32: 1406; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64 1407; VBITS_GE_2048-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0] 1408; VBITS_GE_2048-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1] 1409; VBITS_GE_2048-NEXT: umin [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s 1410; VBITS_GE_2048-NEXT: st1w { [[RES]].s }, [[PG]], [x0] 1411; VBITS_GE_2048-NEXT: ret 1412 %op1 = load <64 x i32>, <64 x i32>* %a 1413 %op2 = load <64 x i32>, <64 x i32>* %b 1414 %res = call <64 x i32> @llvm.umin.v64i32(<64 x i32> %op1, <64 x i32> %op2) 1415 store <64 x i32> %res, <64 x i32>* %a 1416 ret void 1417} 1418 1419; Vector i64 min are not legal for NEON so use SVE when available. 1420define <1 x i64> @umin_v1i64(<1 x i64> %op1, <1 x i64> %op2) #0 { 1421; CHECK-LABEL: umin_v1i64: 1422; CHECK: ptrue [[PG:p[0-9]+]].d, vl1 1423; CHECK-NEXT: umin z0.d, [[PG]]/m, z0.d, z1.d 1424; CHECK-NEXT: ret 1425 %res = call <1 x i64> @llvm.umin.v1i64(<1 x i64> %op1, <1 x i64> %op2) 1426 ret <1 x i64> %res 1427} 1428 1429; Vector i64 min are not legal for NEON so use SVE when available. 1430define <2 x i64> @umin_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 { 1431; CHECK-LABEL: umin_v2i64: 1432; CHECK: ptrue [[PG:p[0-9]+]].d, vl2 1433; CHECK-NEXT: umin z0.d, [[PG]]/m, z0.d, z1.d 1434; CHECK-NEXT: ret 1435 %res = call <2 x i64> @llvm.umin.v2i64(<2 x i64> %op1, <2 x i64> %op2) 1436 ret <2 x i64> %res 1437} 1438 1439define void @umin_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 { 1440; CHECK-LABEL: umin_v4i64: 1441; CHECK: ptrue [[PG:p[0-9]+]].d, vl4 1442; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0] 1443; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1] 1444; CHECK-NEXT: umin [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d 1445; CHECK-NEXT: st1d { [[RES]].d }, [[PG]], [x0] 1446; CHECK-NEXT: ret 1447 %op1 = load <4 x i64>, <4 x i64>* %a 1448 %op2 = load <4 x i64>, <4 x i64>* %b 1449 %res = call <4 x i64> @llvm.umin.v4i64(<4 x i64> %op1, <4 x i64> %op2) 1450 store <4 x i64> %res, <4 x i64>* %a 1451 ret void 1452} 1453 1454define void @umin_v8i64(<8 x i64>* %a, <8 x i64>* %b) #0 { 1455; CHECK-LABEL: umin_v8i64: 1456; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8 1457; VBITS_GE_512-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0] 1458; VBITS_GE_512-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1] 1459; VBITS_GE_512-NEXT: umin [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d 1460; VBITS_GE_512-NEXT: st1d { [[RES]].d }, [[PG]], [x0] 1461; VBITS_GE_512-NEXT: ret 1462 1463; Ensure sensible type legalisation. 1464; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4 1465; VBITS_EQ_256-DAG: add x[[A_HI:[0-9]+]], x0, #32 1466; VBITS_EQ_256-DAG: add x[[B_HI:[0-9]+]], x1, #32 1467; VBITS_EQ_256-DAG: ld1d { [[OP1_LO:z[0-9]+]].d }, [[PG]]/z, [x0] 1468; VBITS_EQ_256-DAG: ld1d { [[OP1_HI:z[0-9]+]].d }, [[PG]]/z, [x[[A_HI]]] 1469; VBITS_EQ_256-DAG: ld1d { [[OP2_LO:z[0-9]+]].d }, [[PG]]/z, [x1] 1470; VBITS_EQ_256-DAG: ld1d { [[OP2_HI:z[0-9]+]].d }, [[PG]]/z, [x[[B_HI]]] 1471; VBITS_EQ_256-DAG: umin [[RES_LO:z[0-9]+]].d, [[PG]]/m, [[OP1_LO]].d, [[OP2_LO]].d 1472; VBITS_EQ_256-DAG: umin [[RES_HI:z[0-9]+]].d, [[PG]]/m, [[OP1_HI]].d, [[OP2_HI]].d 1473; VBITS_EQ_256-DAG: st1d { [[RES_LO]].d }, [[PG]], [x0] 1474; VBITS_EQ_256-DAG: st1d { [[RES_HI]].d }, [[PG]], [x[[A_HI]] 1475; VBITS_EQ_256-NEXT: ret 1476 %op1 = load <8 x i64>, <8 x i64>* %a 1477 %op2 = load <8 x i64>, <8 x i64>* %b 1478 %res = call <8 x i64> @llvm.umin.v8i64(<8 x i64> %op1, <8 x i64> %op2) 1479 store <8 x i64> %res, <8 x i64>* %a 1480 ret void 1481} 1482 1483define void @umin_v16i64(<16 x i64>* %a, <16 x i64>* %b) #0 { 1484; CHECK-LABEL: umin_v16i64: 1485; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16 1486; VBITS_GE_1024-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0] 1487; VBITS_GE_1024-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1] 1488; VBITS_GE_1024-NEXT: umin [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d 1489; VBITS_GE_1024-NEXT: st1d { [[RES]].d }, [[PG]], [x0] 1490; VBITS_GE_1024-NEXT: ret 1491 %op1 = load <16 x i64>, <16 x i64>* %a 1492 %op2 = load <16 x i64>, <16 x i64>* %b 1493 %res = call <16 x i64> @llvm.umin.v16i64(<16 x i64> %op1, <16 x i64> %op2) 1494 store <16 x i64> %res, <16 x i64>* %a 1495 ret void 1496} 1497 1498define void @umin_v32i64(<32 x i64>* %a, <32 x i64>* %b) #0 { 1499; CHECK-LABEL: umin_v32i64: 1500; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32 1501; VBITS_GE_2048-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0] 1502; VBITS_GE_2048-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1] 1503; VBITS_GE_2048-NEXT: umin [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d 1504; VBITS_GE_2048-NEXT: st1d { [[RES]].d }, [[PG]], [x0] 1505; VBITS_GE_2048-NEXT: ret 1506 %op1 = load <32 x i64>, <32 x i64>* %a 1507 %op2 = load <32 x i64>, <32 x i64>* %b 1508 %res = call <32 x i64> @llvm.umin.v32i64(<32 x i64> %op1, <32 x i64> %op2) 1509 store <32 x i64> %res, <32 x i64>* %a 1510 ret void 1511} 1512 1513attributes #0 = { "target-features"="+sve" } 1514 1515declare <8 x i8> @llvm.smin.v8i8(<8 x i8>, <8 x i8>) 1516declare <16 x i8> @llvm.smin.v16i8(<16 x i8>, <16 x i8>) 1517declare <32 x i8> @llvm.smin.v32i8(<32 x i8>, <32 x i8>) 1518declare <64 x i8> @llvm.smin.v64i8(<64 x i8>, <64 x i8>) 1519declare <128 x i8> @llvm.smin.v128i8(<128 x i8>, <128 x i8>) 1520declare <256 x i8> @llvm.smin.v256i8(<256 x i8>, <256 x i8>) 1521declare <4 x i16> @llvm.smin.v4i16(<4 x i16>, <4 x i16>) 1522declare <8 x i16> @llvm.smin.v8i16(<8 x i16>, <8 x i16>) 1523declare <16 x i16> @llvm.smin.v16i16(<16 x i16>, <16 x i16>) 1524declare <32 x i16> @llvm.smin.v32i16(<32 x i16>, <32 x i16>) 1525declare <64 x i16> @llvm.smin.v64i16(<64 x i16>, <64 x i16>) 1526declare <128 x i16> @llvm.smin.v128i16(<128 x i16>, <128 x i16>) 1527declare <2 x i32> @llvm.smin.v2i32(<2 x i32>, <2 x i32>) 1528declare <4 x i32> @llvm.smin.v4i32(<4 x i32>, <4 x i32>) 1529declare <8 x i32> @llvm.smin.v8i32(<8 x i32>, <8 x i32>) 1530declare <16 x i32> @llvm.smin.v16i32(<16 x i32>, <16 x i32>) 1531declare <32 x i32> @llvm.smin.v32i32(<32 x i32>, <32 x i32>) 1532declare <64 x i32> @llvm.smin.v64i32(<64 x i32>, <64 x i32>) 1533declare <1 x i64> @llvm.smin.v1i64(<1 x i64>, <1 x i64>) 1534declare <2 x i64> @llvm.smin.v2i64(<2 x i64>, <2 x i64>) 1535declare <4 x i64> @llvm.smin.v4i64(<4 x i64>, <4 x i64>) 1536declare <8 x i64> @llvm.smin.v8i64(<8 x i64>, <8 x i64>) 1537declare <16 x i64> @llvm.smin.v16i64(<16 x i64>, <16 x i64>) 1538declare <32 x i64> @llvm.smin.v32i64(<32 x i64>, <32 x i64>) 1539 1540declare <8 x i8> @llvm.smax.v8i8(<8 x i8>, <8 x i8>) 1541declare <16 x i8> @llvm.smax.v16i8(<16 x i8>, <16 x i8>) 1542declare <32 x i8> @llvm.smax.v32i8(<32 x i8>, <32 x i8>) 1543declare <64 x i8> @llvm.smax.v64i8(<64 x i8>, <64 x i8>) 1544declare <128 x i8> @llvm.smax.v128i8(<128 x i8>, <128 x i8>) 1545declare <256 x i8> @llvm.smax.v256i8(<256 x i8>, <256 x i8>) 1546declare <4 x i16> @llvm.smax.v4i16(<4 x i16>, <4 x i16>) 1547declare <8 x i16> @llvm.smax.v8i16(<8 x i16>, <8 x i16>) 1548declare <16 x i16> @llvm.smax.v16i16(<16 x i16>, <16 x i16>) 1549declare <32 x i16> @llvm.smax.v32i16(<32 x i16>, <32 x i16>) 1550declare <64 x i16> @llvm.smax.v64i16(<64 x i16>, <64 x i16>) 1551declare <128 x i16> @llvm.smax.v128i16(<128 x i16>, <128 x i16>) 1552declare <2 x i32> @llvm.smax.v2i32(<2 x i32>, <2 x i32>) 1553declare <4 x i32> @llvm.smax.v4i32(<4 x i32>, <4 x i32>) 1554declare <8 x i32> @llvm.smax.v8i32(<8 x i32>, <8 x i32>) 1555declare <16 x i32> @llvm.smax.v16i32(<16 x i32>, <16 x i32>) 1556declare <32 x i32> @llvm.smax.v32i32(<32 x i32>, <32 x i32>) 1557declare <64 x i32> @llvm.smax.v64i32(<64 x i32>, <64 x i32>) 1558declare <1 x i64> @llvm.smax.v1i64(<1 x i64>, <1 x i64>) 1559declare <2 x i64> @llvm.smax.v2i64(<2 x i64>, <2 x i64>) 1560declare <4 x i64> @llvm.smax.v4i64(<4 x i64>, <4 x i64>) 1561declare <8 x i64> @llvm.smax.v8i64(<8 x i64>, <8 x i64>) 1562declare <16 x i64> @llvm.smax.v16i64(<16 x i64>, <16 x i64>) 1563declare <32 x i64> @llvm.smax.v32i64(<32 x i64>, <32 x i64>) 1564 1565declare <8 x i8> @llvm.umin.v8i8(<8 x i8>, <8 x i8>) 1566declare <16 x i8> @llvm.umin.v16i8(<16 x i8>, <16 x i8>) 1567declare <32 x i8> @llvm.umin.v32i8(<32 x i8>, <32 x i8>) 1568declare <64 x i8> @llvm.umin.v64i8(<64 x i8>, <64 x i8>) 1569declare <128 x i8> @llvm.umin.v128i8(<128 x i8>, <128 x i8>) 1570declare <256 x i8> @llvm.umin.v256i8(<256 x i8>, <256 x i8>) 1571declare <4 x i16> @llvm.umin.v4i16(<4 x i16>, <4 x i16>) 1572declare <8 x i16> @llvm.umin.v8i16(<8 x i16>, <8 x i16>) 1573declare <16 x i16> @llvm.umin.v16i16(<16 x i16>, <16 x i16>) 1574declare <32 x i16> @llvm.umin.v32i16(<32 x i16>, <32 x i16>) 1575declare <64 x i16> @llvm.umin.v64i16(<64 x i16>, <64 x i16>) 1576declare <128 x i16> @llvm.umin.v128i16(<128 x i16>, <128 x i16>) 1577declare <2 x i32> @llvm.umin.v2i32(<2 x i32>, <2 x i32>) 1578declare <4 x i32> @llvm.umin.v4i32(<4 x i32>, <4 x i32>) 1579declare <8 x i32> @llvm.umin.v8i32(<8 x i32>, <8 x i32>) 1580declare <16 x i32> @llvm.umin.v16i32(<16 x i32>, <16 x i32>) 1581declare <32 x i32> @llvm.umin.v32i32(<32 x i32>, <32 x i32>) 1582declare <64 x i32> @llvm.umin.v64i32(<64 x i32>, <64 x i32>) 1583declare <1 x i64> @llvm.umin.v1i64(<1 x i64>, <1 x i64>) 1584declare <2 x i64> @llvm.umin.v2i64(<2 x i64>, <2 x i64>) 1585declare <4 x i64> @llvm.umin.v4i64(<4 x i64>, <4 x i64>) 1586declare <8 x i64> @llvm.umin.v8i64(<8 x i64>, <8 x i64>) 1587declare <16 x i64> @llvm.umin.v16i64(<16 x i64>, <16 x i64>) 1588declare <32 x i64> @llvm.umin.v32i64(<32 x i64>, <32 x i64>) 1589 1590declare <8 x i8> @llvm.umax.v8i8(<8 x i8>, <8 x i8>) 1591declare <16 x i8> @llvm.umax.v16i8(<16 x i8>, <16 x i8>) 1592declare <32 x i8> @llvm.umax.v32i8(<32 x i8>, <32 x i8>) 1593declare <64 x i8> @llvm.umax.v64i8(<64 x i8>, <64 x i8>) 1594declare <128 x i8> @llvm.umax.v128i8(<128 x i8>, <128 x i8>) 1595declare <256 x i8> @llvm.umax.v256i8(<256 x i8>, <256 x i8>) 1596declare <4 x i16> @llvm.umax.v4i16(<4 x i16>, <4 x i16>) 1597declare <8 x i16> @llvm.umax.v8i16(<8 x i16>, <8 x i16>) 1598declare <16 x i16> @llvm.umax.v16i16(<16 x i16>, <16 x i16>) 1599declare <32 x i16> @llvm.umax.v32i16(<32 x i16>, <32 x i16>) 1600declare <64 x i16> @llvm.umax.v64i16(<64 x i16>, <64 x i16>) 1601declare <128 x i16> @llvm.umax.v128i16(<128 x i16>, <128 x i16>) 1602declare <2 x i32> @llvm.umax.v2i32(<2 x i32>, <2 x i32>) 1603declare <4 x i32> @llvm.umax.v4i32(<4 x i32>, <4 x i32>) 1604declare <8 x i32> @llvm.umax.v8i32(<8 x i32>, <8 x i32>) 1605declare <16 x i32> @llvm.umax.v16i32(<16 x i32>, <16 x i32>) 1606declare <32 x i32> @llvm.umax.v32i32(<32 x i32>, <32 x i32>) 1607declare <64 x i32> @llvm.umax.v64i32(<64 x i32>, <64 x i32>) 1608declare <1 x i64> @llvm.umax.v1i64(<1 x i64>, <1 x i64>) 1609declare <2 x i64> @llvm.umax.v2i64(<2 x i64>, <2 x i64>) 1610declare <4 x i64> @llvm.umax.v4i64(<4 x i64>, <4 x i64>) 1611declare <8 x i64> @llvm.umax.v8i64(<8 x i64>, <8 x i64>) 1612declare <16 x i64> @llvm.umax.v16i64(<16 x i64>, <16 x i64>) 1613declare <32 x i64> @llvm.umax.v32i64(<32 x i64>, <32 x i64>) 1614 1615