; RUN: llc -aarch64-sve-vector-bits-min=128 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=16 -check-prefix=NO_SVE
; RUN: llc -aarch64-sve-vector-bits-min=256 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK,VBITS_EQ_256
; RUN: llc -aarch64-sve-vector-bits-min=384 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK
; RUN: llc -aarch64-sve-vector-bits-min=512 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=640 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=768 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=896 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=1024 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1152 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1280 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1408 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1536 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1664 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1792 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1920 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=2048 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=256 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_GE_2048

target triple = "aarch64-unknown-linux-gnu"

; Don't use SVE when its registers are no bigger than NEON.
; NO_SVE-NOT: ptrue

;
; ANDV
;

; No single instruction NEON ANDV support. Use SVE.
define i8 @andv_v8i8(<8 x i8> %a) #0 {
; CHECK-LABEL: andv_v8i8:
; CHECK: ptrue [[PG:p[0-9]+]].b, vl8
; CHECK: andv b[[REDUCE:[0-9]+]], [[PG]], z0.b
; CHECK: fmov w0, s[[REDUCE]]
; CHECK: ret
  %res = call i8 @llvm.vector.reduce.and.v8i8(<8 x i8> %a)
  ret i8 %res
}

; No single instruction NEON ANDV support. Use SVE.
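; The argument arrives in the low 128 bits of z0, so the vl16 predicate below
; restricts the SVE ANDV to exactly the NEON-sized lanes.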
define i8 @andv_v16i8(<16 x i8> %a) #0 {
; CHECK-LABEL: andv_v16i8:
; CHECK: ptrue [[PG:p[0-9]+]].b, vl16
; CHECK: andv b[[REDUCE:[0-9]+]], [[PG]], z0.b
; CHECK: fmov w0, s[[REDUCE]]
; CHECK: ret
  %res = call i8 @llvm.vector.reduce.and.v16i8(<16 x i8> %a)
  ret i8 %res
}

define i8 @andv_v32i8(<32 x i8>* %a) #0 {
; CHECK-LABEL: andv_v32i8:
; CHECK: ptrue [[PG:p[0-9]+]].b, vl32
; CHECK-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
; CHECK-NEXT: andv b[[REDUCE:[0-9]+]], [[PG]], [[OP]].b
; CHECK-NEXT: fmov w0, s[[REDUCE]]
; CHECK-NEXT: ret
  %op = load <32 x i8>, <32 x i8>* %a
  %res = call i8 @llvm.vector.reduce.and.v32i8(<32 x i8> %op)
  ret i8 %res
}

define i8 @andv_v64i8(<64 x i8>* %a) #0 {
; CHECK-LABEL: andv_v64i8:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].b, vl64
; VBITS_GE_512-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
; VBITS_GE_512-NEXT: andv b[[REDUCE:[0-9]+]], [[PG]], [[OP]].b
; VBITS_GE_512-NEXT: fmov w0, s[[REDUCE]]
; VBITS_GE_512-NEXT: ret

; Ensure sensible type legalisation.
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].b, vl32
; VBITS_EQ_256-DAG: mov w[[A_HI:[0-9]+]], #32
; VBITS_EQ_256-DAG: ld1b { [[LO:z[0-9]+]].b }, [[PG]]/z, [x0]
; VBITS_EQ_256-DAG: ld1b { [[HI:z[0-9]+]].b }, [[PG]]/z, [x0, x[[A_HI]]]
; VBITS_EQ_256-DAG: and [[AND:z[0-9]+]].d, [[LO]].d, [[HI]].d
; VBITS_EQ_256-DAG: andv b[[REDUCE:[0-9]+]], [[PG]], [[AND]].b
; VBITS_EQ_256-NEXT: fmov w0, s[[REDUCE]]
; VBITS_EQ_256-NEXT: ret

  %op = load <64 x i8>, <64 x i8>* %a
  %res = call i8 @llvm.vector.reduce.and.v64i8(<64 x i8> %op)
  ret i8 %res
}

define i8 @andv_v128i8(<128 x i8>* %a) #0 {
; CHECK-LABEL: andv_v128i8:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].b, vl128
; VBITS_GE_1024-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
; VBITS_GE_1024-NEXT: andv b[[REDUCE:[0-9]+]], [[PG]], [[OP]].b
; VBITS_GE_1024-NEXT: fmov w0, s[[REDUCE]]
; VBITS_GE_1024-NEXT: ret
  %op = load <128 x i8>, <128 x i8>* %a
  %res = call i8 @llvm.vector.reduce.and.v128i8(<128 x i8> %op)
  ret i8 %res
}

define i8 @andv_v256i8(<256 x i8>* %a) #0 {
; CHECK-LABEL: andv_v256i8:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].b, vl256
; VBITS_GE_2048-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
; VBITS_GE_2048-NEXT: andv b[[REDUCE:[0-9]+]], [[PG]], [[OP]].b
; VBITS_GE_2048-NEXT: fmov w0, s[[REDUCE]]
; VBITS_GE_2048-NEXT: ret
  %op = load <256 x i8>, <256 x i8>* %a
  %res = call i8 @llvm.vector.reduce.and.v256i8(<256 x i8> %op)
  ret i8 %res
}

; No single instruction NEON ANDV support. Use SVE.
define i16 @andv_v4i16(<4 x i16> %a) #0 {
; CHECK-LABEL: andv_v4i16:
; CHECK: ptrue [[PG:p[0-9]+]].h, vl4
; CHECK: andv h[[REDUCE:[0-9]+]], [[PG]], z0.h
; CHECK: fmov w0, s[[REDUCE]]
; CHECK: ret
  %res = call i16 @llvm.vector.reduce.and.v4i16(<4 x i16> %a)
  ret i16 %res
}

; No single instruction NEON ANDV support. Use SVE.
define i16 @andv_v8i16(<8 x i16> %a) #0 {
; CHECK-LABEL: andv_v8i16:
; CHECK: ptrue [[PG:p[0-9]+]].h, vl8
; CHECK: andv h[[REDUCE:[0-9]+]], [[PG]], z0.h
; CHECK: fmov w0, s[[REDUCE]]
; CHECK: ret
  %res = call i16 @llvm.vector.reduce.and.v8i16(<8 x i16> %a)
  ret i16 %res
}

define i16 @andv_v16i16(<16 x i16>* %a) #0 {
; CHECK-LABEL: andv_v16i16:
; CHECK: ptrue [[PG:p[0-9]+]].h, vl16
; CHECK-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
; CHECK-NEXT: andv h[[REDUCE:[0-9]+]], [[PG]], [[OP]].h
; CHECK-NEXT: fmov w0, s[[REDUCE]]
; CHECK-NEXT: ret
  %op = load <16 x i16>, <16 x i16>* %a
  %res = call i16 @llvm.vector.reduce.and.v16i16(<16 x i16> %op)
  ret i16 %res
}

define i16 @andv_v32i16(<32 x i16>* %a) #0 {
; CHECK-LABEL: andv_v32i16:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32
; VBITS_GE_512-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_512-NEXT: andv h[[REDUCE:[0-9]+]], [[PG]], [[OP]].h
; VBITS_GE_512-NEXT: fmov w0, s[[REDUCE]]
; VBITS_GE_512-NEXT: ret

; Ensure sensible type legalisation.
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16
; VBITS_EQ_256-DAG: add x[[A_HI:[0-9]+]], x0, #32
; VBITS_EQ_256-DAG: ld1h { [[LO:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_EQ_256-DAG: ld1h { [[HI:z[0-9]+]].h }, [[PG]]/z, [x[[A_HI]]]
; VBITS_EQ_256-DAG: and [[AND:z[0-9]+]].d, [[LO]].d, [[HI]].d
; VBITS_EQ_256-DAG: andv h[[REDUCE:[0-9]+]], [[PG]], [[AND]].h
; VBITS_EQ_256-NEXT: fmov w0, s[[REDUCE]]
; VBITS_EQ_256-NEXT: ret
  %op = load <32 x i16>, <32 x i16>* %a
  %res = call i16 @llvm.vector.reduce.and.v32i16(<32 x i16> %op)
  ret i16 %res
}

define i16 @andv_v64i16(<64 x i16>* %a) #0 {
; CHECK-LABEL: andv_v64i16:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64
; VBITS_GE_1024-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_1024-NEXT: andv h[[REDUCE:[0-9]+]], [[PG]], [[OP]].h
; VBITS_GE_1024-NEXT: fmov w0, s[[REDUCE]]
; VBITS_GE_1024-NEXT: ret
  %op = load <64 x i16>, <64 x i16>* %a
  %res = call i16 @llvm.vector.reduce.and.v64i16(<64 x i16> %op)
  ret i16 %res
}

define i16 @andv_v128i16(<128 x i16>* %a) #0 {
; CHECK-LABEL: andv_v128i16:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl128
; VBITS_GE_2048-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_2048-NEXT: andv h[[REDUCE:[0-9]+]], [[PG]], [[OP]].h
; VBITS_GE_2048-NEXT: fmov w0, s[[REDUCE]]
; VBITS_GE_2048-NEXT: ret
  %op = load <128 x i16>, <128 x i16>* %a
  %res = call i16 @llvm.vector.reduce.and.v128i16(<128 x i16> %op)
  ret i16 %res
}

; No single instruction NEON ANDV support. Use SVE.
define i32 @andv_v2i32(<2 x i32> %a) #0 {
; CHECK-LABEL: andv_v2i32:
; CHECK: ptrue [[PG:p[0-9]+]].s, vl2
; CHECK: andv [[REDUCE:s[0-9]+]], [[PG]], z0.s
; CHECK: fmov w0, [[REDUCE]]
; CHECK: ret
  %res = call i32 @llvm.vector.reduce.and.v2i32(<2 x i32> %a)
  ret i32 %res
}

; No single instruction NEON ANDV support. Use SVE.
define i32 @andv_v4i32(<4 x i32> %a) #0 {
; CHECK-LABEL: andv_v4i32:
; CHECK: ptrue [[PG:p[0-9]+]].s, vl4
; CHECK: andv [[REDUCE:s[0-9]+]], [[PG]], z0.s
; CHECK: fmov w0, [[REDUCE]]
; CHECK: ret
  %res = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> %a)
  ret i32 %res
}

define i32 @andv_v8i32(<8 x i32>* %a) #0 {
; CHECK-LABEL: andv_v8i32:
; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
; CHECK-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
; CHECK-NEXT: andv [[REDUCE:s[0-9]+]], [[PG]], [[OP]].s
; CHECK-NEXT: fmov w0, [[REDUCE]]
; CHECK-NEXT: ret
  %op = load <8 x i32>, <8 x i32>* %a
  %res = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> %op)
  ret i32 %res
}

define i32 @andv_v16i32(<16 x i32>* %a) #0 {
; CHECK-LABEL: andv_v16i32:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
; VBITS_GE_512-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_512-NEXT: andv [[REDUCE:s[0-9]+]], [[PG]], [[OP]].s
; VBITS_GE_512-NEXT: fmov w0, [[REDUCE]]
; VBITS_GE_512-NEXT: ret

; Ensure sensible type legalisation.
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8
; VBITS_EQ_256-DAG: add x[[A_HI:[0-9]+]], x0, #32
; VBITS_EQ_256-DAG: ld1w { [[LO:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_EQ_256-DAG: ld1w { [[HI:z[0-9]+]].s }, [[PG]]/z, [x[[A_HI]]]
; VBITS_EQ_256-DAG: and [[AND:z[0-9]+]].d, [[LO]].d, [[HI]].d
; VBITS_EQ_256-DAG: andv [[REDUCE:s[0-9]+]], [[PG]], [[AND]].s
; VBITS_EQ_256-NEXT: fmov w0, [[REDUCE]]
; VBITS_EQ_256-NEXT: ret
  %op = load <16 x i32>, <16 x i32>* %a
  %res = call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> %op)
  ret i32 %res
}

define i32 @andv_v32i32(<32 x i32>* %a) #0 {
; CHECK-LABEL: andv_v32i32:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32
; VBITS_GE_1024-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_1024-NEXT: andv [[REDUCE:s[0-9]+]], [[PG]], [[OP]].s
; VBITS_GE_1024-NEXT: fmov w0, [[REDUCE]]
; VBITS_GE_1024-NEXT: ret
  %op = load <32 x i32>, <32 x i32>* %a
  %res = call i32 @llvm.vector.reduce.and.v32i32(<32 x i32> %op)
  ret i32 %res
}

define i32 @andv_v64i32(<64 x i32>* %a) #0 {
; CHECK-LABEL: andv_v64i32:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64
; VBITS_GE_2048-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_2048-NEXT: andv [[REDUCE:s[0-9]+]], [[PG]], [[OP]].s
; VBITS_GE_2048-NEXT: fmov w0, [[REDUCE]]
; VBITS_GE_2048-NEXT: ret
  %op = load <64 x i32>, <64 x i32>* %a
  %res = call i32 @llvm.vector.reduce.and.v64i32(<64 x i32> %op)
  ret i32 %res
}

; Nothing to do for single element vectors.
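; The i64 element of a <1 x i64> already lives in d0, so a single fmov to x0 suffices.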
define i64 @andv_v1i64(<1 x i64> %a) #0 {
; CHECK-LABEL: andv_v1i64:
; CHECK: fmov x0, d0
; CHECK: ret
  %res = call i64 @llvm.vector.reduce.and.v1i64(<1 x i64> %a)
  ret i64 %res
}

; Use SVE for 128-bit vectors.
define i64 @andv_v2i64(<2 x i64> %a) #0 {
; CHECK-LABEL: andv_v2i64:
; CHECK: ptrue [[PG:p[0-9]+]].d, vl2
; CHECK: andv [[REDUCE:d[0-9]+]], [[PG]], z0.d
; CHECK: fmov x0, [[REDUCE]]
; CHECK: ret
  %res = call i64 @llvm.vector.reduce.and.v2i64(<2 x i64> %a)
  ret i64 %res
}

define i64 @andv_v4i64(<4 x i64>* %a) #0 {
; CHECK-LABEL: andv_v4i64:
; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
; CHECK-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
; CHECK-NEXT: andv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].d
; CHECK-NEXT: fmov x0, [[REDUCE]]
; CHECK-NEXT: ret
  %op = load <4 x i64>, <4 x i64>* %a
  %res = call i64 @llvm.vector.reduce.and.v4i64(<4 x i64> %op)
  ret i64 %res
}

define i64 @andv_v8i64(<8 x i64>* %a) #0 {
; CHECK-LABEL: andv_v8i64:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
; VBITS_GE_512-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_512-NEXT: andv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].d
; VBITS_GE_512-NEXT: fmov x0, [[REDUCE]]
; VBITS_GE_512-NEXT: ret

; Ensure sensible type legalisation.
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4
; VBITS_EQ_256-DAG: add x[[A_HI:[0-9]+]], x0, #32
; VBITS_EQ_256-DAG: ld1d { [[LO:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_EQ_256-DAG: ld1d { [[HI:z[0-9]+]].d }, [[PG]]/z, [x[[A_HI]]]
; VBITS_EQ_256-DAG: and [[AND:z[0-9]+]].d, [[LO]].d, [[HI]].d
; VBITS_EQ_256-DAG: andv [[REDUCE:d[0-9]+]], [[PG]], [[AND]].d
; VBITS_EQ_256-NEXT: fmov x0, [[REDUCE]]
; VBITS_EQ_256-NEXT: ret
  %op = load <8 x i64>, <8 x i64>* %a
  %res = call i64 @llvm.vector.reduce.and.v8i64(<8 x i64> %op)
  ret i64 %res
}

define i64 @andv_v16i64(<16 x i64>* %a) #0 {
; CHECK-LABEL: andv_v16i64:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
; VBITS_GE_1024-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_1024-NEXT: andv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].d
; VBITS_GE_1024-NEXT: fmov x0, [[REDUCE]]
; VBITS_GE_1024-NEXT: ret
  %op = load <16 x i64>, <16 x i64>* %a
  %res = call i64 @llvm.vector.reduce.and.v16i64(<16 x i64> %op)
  ret i64 %res
}

define i64 @andv_v32i64(<32 x i64>* %a) #0 {
; CHECK-LABEL: andv_v32i64:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
; VBITS_GE_2048-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_2048-NEXT: andv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].d
; VBITS_GE_2048-NEXT: fmov x0, [[REDUCE]]
; VBITS_GE_2048-NEXT: ret
  %op = load <32 x i64>, <32 x i64>* %a
  %res = call i64 @llvm.vector.reduce.and.v32i64(<32 x i64> %op)
  ret i64 %res
}

;
; EORV
;

; No single instruction NEON EORV support. Use SVE.
define i8 @eorv_v8i8(<8 x i8> %a) #0 {
; CHECK-LABEL: eorv_v8i8:
; CHECK: ptrue [[PG:p[0-9]+]].b, vl8
; CHECK: eorv b[[REDUCE:[0-9]+]], [[PG]], z0.b
; CHECK: fmov w0, s[[REDUCE]]
; CHECK: ret
  %res = call i8 @llvm.vector.reduce.xor.v8i8(<8 x i8> %a)
  ret i8 %res
}

; No single instruction NEON EORV support. Use SVE.
define i8 @eorv_v16i8(<16 x i8> %a) #0 {
; CHECK-LABEL: eorv_v16i8:
; CHECK: ptrue [[PG:p[0-9]+]].b, vl16
; CHECK: eorv b[[REDUCE:[0-9]+]], [[PG]], z0.b
; CHECK: fmov w0, s[[REDUCE]]
; CHECK: ret
  %res = call i8 @llvm.vector.reduce.xor.v16i8(<16 x i8> %a)
  ret i8 %res
}

define i8 @eorv_v32i8(<32 x i8>* %a) #0 {
; CHECK-LABEL: eorv_v32i8:
; CHECK: ptrue [[PG:p[0-9]+]].b, vl32
; CHECK-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
; CHECK-NEXT: eorv b[[REDUCE:[0-9]+]], [[PG]], [[OP]].b
; CHECK-NEXT: fmov w0, s[[REDUCE]]
; CHECK-NEXT: ret
  %op = load <32 x i8>, <32 x i8>* %a
  %res = call i8 @llvm.vector.reduce.xor.v32i8(<32 x i8> %op)
  ret i8 %res
}

define i8 @eorv_v64i8(<64 x i8>* %a) #0 {
; CHECK-LABEL: eorv_v64i8:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].b, vl64
; VBITS_GE_512-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
; VBITS_GE_512-NEXT: eorv b[[REDUCE:[0-9]+]], [[PG]], [[OP]].b
; VBITS_GE_512-NEXT: fmov w0, s[[REDUCE]]
; VBITS_GE_512-NEXT: ret

; Ensure sensible type legalisation.
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].b, vl32
; VBITS_EQ_256-DAG: mov w[[A_HI:[0-9]+]], #32
; VBITS_EQ_256-DAG: ld1b { [[LO:z[0-9]+]].b }, [[PG]]/z, [x0]
; VBITS_EQ_256-DAG: ld1b { [[HI:z[0-9]+]].b }, [[PG]]/z, [x0, x[[A_HI]]]
; VBITS_EQ_256-DAG: eor [[EOR:z[0-9]+]].d, [[LO]].d, [[HI]].d
; VBITS_EQ_256-DAG: eorv b[[REDUCE:[0-9]+]], [[PG]], [[EOR]].b
; VBITS_EQ_256-NEXT: fmov w0, s[[REDUCE]]
; VBITS_EQ_256-NEXT: ret

  %op = load <64 x i8>, <64 x i8>* %a
  %res = call i8 @llvm.vector.reduce.xor.v64i8(<64 x i8> %op)
  ret i8 %res
}

define i8 @eorv_v128i8(<128 x i8>* %a) #0 {
; CHECK-LABEL: eorv_v128i8:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].b, vl128
; VBITS_GE_1024-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
; VBITS_GE_1024-NEXT: eorv b[[REDUCE:[0-9]+]], [[PG]], [[OP]].b
; VBITS_GE_1024-NEXT: fmov w0, s[[REDUCE]]
; VBITS_GE_1024-NEXT: ret
  %op = load <128 x i8>, <128 x i8>* %a
  %res = call i8 @llvm.vector.reduce.xor.v128i8(<128 x i8> %op)
  ret i8 %res
}

define i8 @eorv_v256i8(<256 x i8>* %a) #0 {
; CHECK-LABEL: eorv_v256i8:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].b, vl256
; VBITS_GE_2048-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
; VBITS_GE_2048-NEXT: eorv b[[REDUCE:[0-9]+]], [[PG]], [[OP]].b
; VBITS_GE_2048-NEXT: fmov w0, s[[REDUCE]]
; VBITS_GE_2048-NEXT: ret
  %op = load <256 x i8>, <256 x i8>* %a
  %res = call i8 @llvm.vector.reduce.xor.v256i8(<256 x i8> %op)
  ret i8 %res
}

; No single instruction NEON EORV support. Use SVE.
define i16 @eorv_v4i16(<4 x i16> %a) #0 {
; CHECK-LABEL: eorv_v4i16:
; CHECK: ptrue [[PG:p[0-9]+]].h, vl4
; CHECK: eorv h[[REDUCE:[0-9]+]], [[PG]], z0.h
; CHECK: fmov w0, s[[REDUCE]]
; CHECK: ret
  %res = call i16 @llvm.vector.reduce.xor.v4i16(<4 x i16> %a)
  ret i16 %res
}

; No single instruction NEON EORV support. Use SVE.
define i16 @eorv_v8i16(<8 x i16> %a) #0 {
; CHECK-LABEL: eorv_v8i16:
; CHECK: ptrue [[PG:p[0-9]+]].h, vl8
; CHECK: eorv h[[REDUCE:[0-9]+]], [[PG]], z0.h
; CHECK: fmov w0, s[[REDUCE]]
; CHECK: ret
  %res = call i16 @llvm.vector.reduce.xor.v8i16(<8 x i16> %a)
  ret i16 %res
}

define i16 @eorv_v16i16(<16 x i16>* %a) #0 {
; CHECK-LABEL: eorv_v16i16:
; CHECK: ptrue [[PG:p[0-9]+]].h, vl16
; CHECK-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
; CHECK-NEXT: eorv h[[REDUCE:[0-9]+]], [[PG]], [[OP]].h
; CHECK-NEXT: fmov w0, s[[REDUCE]]
; CHECK-NEXT: ret
  %op = load <16 x i16>, <16 x i16>* %a
  %res = call i16 @llvm.vector.reduce.xor.v16i16(<16 x i16> %op)
  ret i16 %res
}

define i16 @eorv_v32i16(<32 x i16>* %a) #0 {
; CHECK-LABEL: eorv_v32i16:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32
; VBITS_GE_512-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_512-NEXT: eorv h[[REDUCE:[0-9]+]], [[PG]], [[OP]].h
; VBITS_GE_512-NEXT: fmov w0, s[[REDUCE]]
; VBITS_GE_512-NEXT: ret

; Ensure sensible type legalisation.
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16
; VBITS_EQ_256-DAG: add x[[A_HI:[0-9]+]], x0, #32
; VBITS_EQ_256-DAG: ld1h { [[LO:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_EQ_256-DAG: ld1h { [[HI:z[0-9]+]].h }, [[PG]]/z, [x[[A_HI]]]
; VBITS_EQ_256-DAG: eor [[EOR:z[0-9]+]].d, [[LO]].d, [[HI]].d
; VBITS_EQ_256-DAG: eorv h[[REDUCE:[0-9]+]], [[PG]], [[EOR]].h
; VBITS_EQ_256-NEXT: fmov w0, s[[REDUCE]]
; VBITS_EQ_256-NEXT: ret
  %op = load <32 x i16>, <32 x i16>* %a
  %res = call i16 @llvm.vector.reduce.xor.v32i16(<32 x i16> %op)
  ret i16 %res
}

define i16 @eorv_v64i16(<64 x i16>* %a) #0 {
; CHECK-LABEL: eorv_v64i16:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64
; VBITS_GE_1024-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_1024-NEXT: eorv h[[REDUCE:[0-9]+]], [[PG]], [[OP]].h
; VBITS_GE_1024-NEXT: fmov w0, s[[REDUCE]]
; VBITS_GE_1024-NEXT: ret
  %op = load <64 x i16>, <64 x i16>* %a
  %res = call i16 @llvm.vector.reduce.xor.v64i16(<64 x i16> %op)
  ret i16 %res
}

define i16 @eorv_v128i16(<128 x i16>* %a) #0 {
; CHECK-LABEL: eorv_v128i16:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl128
; VBITS_GE_2048-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_2048-NEXT: eorv h[[REDUCE:[0-9]+]], [[PG]], [[OP]].h
; VBITS_GE_2048-NEXT: fmov w0, s[[REDUCE]]
; VBITS_GE_2048-NEXT: ret
  %op = load <128 x i16>, <128 x i16>* %a
  %res = call i16 @llvm.vector.reduce.xor.v128i16(<128 x i16> %op)
  ret i16 %res
}

; No single instruction NEON EORV support. Use SVE.
define i32 @eorv_v2i32(<2 x i32> %a) #0 {
; CHECK-LABEL: eorv_v2i32:
; CHECK: ptrue [[PG:p[0-9]+]].s, vl2
; CHECK: eorv [[REDUCE:s[0-9]+]], [[PG]], z0.s
; CHECK: fmov w0, [[REDUCE]]
; CHECK: ret
  %res = call i32 @llvm.vector.reduce.xor.v2i32(<2 x i32> %a)
  ret i32 %res
}

; No single instruction NEON EORV support. Use SVE.
define i32 @eorv_v4i32(<4 x i32> %a) #0 {
; CHECK-LABEL: eorv_v4i32:
; CHECK: ptrue [[PG:p[0-9]+]].s, vl4
; CHECK: eorv [[REDUCE:s[0-9]+]], [[PG]], z0.s
; CHECK: fmov w0, [[REDUCE]]
; CHECK: ret
  %res = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> %a)
  ret i32 %res
}

define i32 @eorv_v8i32(<8 x i32>* %a) #0 {
; CHECK-LABEL: eorv_v8i32:
; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
; CHECK-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
; CHECK-NEXT: eorv [[REDUCE:s[0-9]+]], [[PG]], [[OP]].s
; CHECK-NEXT: fmov w0, [[REDUCE]]
; CHECK-NEXT: ret
  %op = load <8 x i32>, <8 x i32>* %a
  %res = call i32 @llvm.vector.reduce.xor.v8i32(<8 x i32> %op)
  ret i32 %res
}

define i32 @eorv_v16i32(<16 x i32>* %a) #0 {
; CHECK-LABEL: eorv_v16i32:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
; VBITS_GE_512-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_512-NEXT: eorv [[REDUCE:s[0-9]+]], [[PG]], [[OP]].s
; VBITS_GE_512-NEXT: fmov w0, [[REDUCE]]
; VBITS_GE_512-NEXT: ret

; Ensure sensible type legalisation.
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8
; VBITS_EQ_256-DAG: add x[[A_HI:[0-9]+]], x0, #32
; VBITS_EQ_256-DAG: ld1w { [[LO:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_EQ_256-DAG: ld1w { [[HI:z[0-9]+]].s }, [[PG]]/z, [x[[A_HI]]]
; VBITS_EQ_256-DAG: eor [[EOR:z[0-9]+]].d, [[LO]].d, [[HI]].d
; VBITS_EQ_256-DAG: eorv [[REDUCE:s[0-9]+]], [[PG]], [[EOR]].s
; VBITS_EQ_256-NEXT: fmov w0, [[REDUCE]]
; VBITS_EQ_256-NEXT: ret
  %op = load <16 x i32>, <16 x i32>* %a
  %res = call i32 @llvm.vector.reduce.xor.v16i32(<16 x i32> %op)
  ret i32 %res
}

define i32 @eorv_v32i32(<32 x i32>* %a) #0 {
; CHECK-LABEL: eorv_v32i32:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32
; VBITS_GE_1024-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_1024-NEXT: eorv [[REDUCE:s[0-9]+]], [[PG]], [[OP]].s
; VBITS_GE_1024-NEXT: fmov w0, [[REDUCE]]
; VBITS_GE_1024-NEXT: ret
  %op = load <32 x i32>, <32 x i32>* %a
  %res = call i32 @llvm.vector.reduce.xor.v32i32(<32 x i32> %op)
  ret i32 %res
}

define i32 @eorv_v64i32(<64 x i32>* %a) #0 {
; CHECK-LABEL: eorv_v64i32:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64
; VBITS_GE_2048-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_2048-NEXT: eorv [[REDUCE:s[0-9]+]], [[PG]], [[OP]].s
; VBITS_GE_2048-NEXT: fmov w0, [[REDUCE]]
; VBITS_GE_2048-NEXT: ret
  %op = load <64 x i32>, <64 x i32>* %a
  %res = call i32 @llvm.vector.reduce.xor.v64i32(<64 x i32> %op)
  ret i32 %res
}

; Nothing to do for single element vectors.
define i64 @eorv_v1i64(<1 x i64> %a) #0 {
; CHECK-LABEL: eorv_v1i64:
; CHECK: fmov x0, d0
; CHECK: ret
  %res = call i64 @llvm.vector.reduce.xor.v1i64(<1 x i64> %a)
  ret i64 %res
}

; Use SVE for 128-bit vectors.
define i64 @eorv_v2i64(<2 x i64> %a) #0 {
; CHECK-LABEL: eorv_v2i64:
; CHECK: ptrue [[PG:p[0-9]+]].d, vl2
; CHECK: eorv [[REDUCE:d[0-9]+]], [[PG]], z0.d
; CHECK: fmov x0, [[REDUCE]]
; CHECK: ret
  %res = call i64 @llvm.vector.reduce.xor.v2i64(<2 x i64> %a)
  ret i64 %res
}

define i64 @eorv_v4i64(<4 x i64>* %a) #0 {
; CHECK-LABEL: eorv_v4i64:
; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
; CHECK-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
; CHECK-NEXT: eorv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].d
; CHECK-NEXT: fmov x0, [[REDUCE]]
; CHECK-NEXT: ret
  %op = load <4 x i64>, <4 x i64>* %a
  %res = call i64 @llvm.vector.reduce.xor.v4i64(<4 x i64> %op)
  ret i64 %res
}

define i64 @eorv_v8i64(<8 x i64>* %a) #0 {
; CHECK-LABEL: eorv_v8i64:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
; VBITS_GE_512-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_512-NEXT: eorv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].d
; VBITS_GE_512-NEXT: fmov x0, [[REDUCE]]
; VBITS_GE_512-NEXT: ret

; Ensure sensible type legalisation.
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4
; VBITS_EQ_256-DAG: add x[[A_HI:[0-9]+]], x0, #32
; VBITS_EQ_256-DAG: ld1d { [[LO:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_EQ_256-DAG: ld1d { [[HI:z[0-9]+]].d }, [[PG]]/z, [x[[A_HI]]]
; VBITS_EQ_256-DAG: eor [[EOR:z[0-9]+]].d, [[LO]].d, [[HI]].d
; VBITS_EQ_256-DAG: eorv [[REDUCE:d[0-9]+]], [[PG]], [[EOR]].d
; VBITS_EQ_256-NEXT: fmov x0, [[REDUCE]]
; VBITS_EQ_256-NEXT: ret
  %op = load <8 x i64>, <8 x i64>* %a
  %res = call i64 @llvm.vector.reduce.xor.v8i64(<8 x i64> %op)
  ret i64 %res
}

define i64 @eorv_v16i64(<16 x i64>* %a) #0 {
; CHECK-LABEL: eorv_v16i64:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
; VBITS_GE_1024-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_1024-NEXT: eorv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].d
; VBITS_GE_1024-NEXT: fmov x0, [[REDUCE]]
; VBITS_GE_1024-NEXT: ret
  %op = load <16 x i64>, <16 x i64>* %a
  %res = call i64 @llvm.vector.reduce.xor.v16i64(<16 x i64> %op)
  ret i64 %res
}

define i64 @eorv_v32i64(<32 x i64>* %a) #0 {
; CHECK-LABEL: eorv_v32i64:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
; VBITS_GE_2048-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_2048-NEXT: eorv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].d
; VBITS_GE_2048-NEXT: fmov x0, [[REDUCE]]
; VBITS_GE_2048-NEXT: ret
  %op = load <32 x i64>, <32 x i64>* %a
  %res = call i64 @llvm.vector.reduce.xor.v32i64(<32 x i64> %op)
  ret i64 %res
}

;
; ORV
;

; No single instruction NEON ORV support. Use SVE.
define i8 @orv_v8i8(<8 x i8> %a) #0 {
; CHECK-LABEL: orv_v8i8:
; CHECK: ptrue [[PG:p[0-9]+]].b, vl8
; CHECK: orv b[[REDUCE:[0-9]+]], [[PG]], z0.b
; CHECK: fmov w0, s[[REDUCE]]
; CHECK: ret
  %res = call i8 @llvm.vector.reduce.or.v8i8(<8 x i8> %a)
  ret i8 %res
}

; No single instruction NEON ORV support. Use SVE.
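; NEON's across-lane reductions (ADDV, UMAXV, etc.) have no logical variants,
; so ORV also goes through an SVE predicated reduction.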
define i8 @orv_v16i8(<16 x i8> %a) #0 {
; CHECK-LABEL: orv_v16i8:
; CHECK: ptrue [[PG:p[0-9]+]].b, vl16
; CHECK: orv b[[REDUCE:[0-9]+]], [[PG]], z0.b
; CHECK: fmov w0, s[[REDUCE]]
; CHECK: ret
  %res = call i8 @llvm.vector.reduce.or.v16i8(<16 x i8> %a)
  ret i8 %res
}

define i8 @orv_v32i8(<32 x i8>* %a) #0 {
; CHECK-LABEL: orv_v32i8:
; CHECK: ptrue [[PG:p[0-9]+]].b, vl32
; CHECK-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
; CHECK-NEXT: orv b[[REDUCE:[0-9]+]], [[PG]], [[OP]].b
; CHECK-NEXT: fmov w0, s[[REDUCE]]
; CHECK-NEXT: ret
  %op = load <32 x i8>, <32 x i8>* %a
  %res = call i8 @llvm.vector.reduce.or.v32i8(<32 x i8> %op)
  ret i8 %res
}

define i8 @orv_v64i8(<64 x i8>* %a) #0 {
; CHECK-LABEL: orv_v64i8:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].b, vl64
; VBITS_GE_512-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
; VBITS_GE_512-NEXT: orv b[[REDUCE:[0-9]+]], [[PG]], [[OP]].b
; VBITS_GE_512-NEXT: fmov w0, s[[REDUCE]]
; VBITS_GE_512-NEXT: ret

; Ensure sensible type legalisation.
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].b, vl32
; VBITS_EQ_256-DAG: mov w[[A_HI:[0-9]+]], #32
; VBITS_EQ_256-DAG: ld1b { [[LO:z[0-9]+]].b }, [[PG]]/z, [x0]
; VBITS_EQ_256-DAG: ld1b { [[HI:z[0-9]+]].b }, [[PG]]/z, [x0, x[[A_HI]]]
; VBITS_EQ_256-DAG: orr [[OR:z[0-9]+]].d, [[LO]].d, [[HI]].d
; VBITS_EQ_256-DAG: orv b[[REDUCE:[0-9]+]], [[PG]], [[OR]].b
; VBITS_EQ_256-NEXT: fmov w0, s[[REDUCE]]
; VBITS_EQ_256-NEXT: ret

  %op = load <64 x i8>, <64 x i8>* %a
  %res = call i8 @llvm.vector.reduce.or.v64i8(<64 x i8> %op)
  ret i8 %res
}

define i8 @orv_v128i8(<128 x i8>* %a) #0 {
; CHECK-LABEL: orv_v128i8:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].b, vl128
; VBITS_GE_1024-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
; VBITS_GE_1024-NEXT: orv b[[REDUCE:[0-9]+]], [[PG]], [[OP]].b
; VBITS_GE_1024-NEXT: fmov w0, s[[REDUCE]]
; VBITS_GE_1024-NEXT: ret
  %op = load <128 x i8>, <128 x i8>* %a
  %res = call i8 @llvm.vector.reduce.or.v128i8(<128 x i8> %op)
  ret i8 %res
}

define i8 @orv_v256i8(<256 x i8>* %a) #0 {
; CHECK-LABEL: orv_v256i8:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].b, vl256
; VBITS_GE_2048-NEXT: ld1b { [[OP:z[0-9]+]].b }, [[PG]]/z, [x0]
; VBITS_GE_2048-NEXT: orv b[[REDUCE:[0-9]+]], [[PG]], [[OP]].b
; VBITS_GE_2048-NEXT: fmov w0, s[[REDUCE]]
; VBITS_GE_2048-NEXT: ret
  %op = load <256 x i8>, <256 x i8>* %a
  %res = call i8 @llvm.vector.reduce.or.v256i8(<256 x i8> %op)
  ret i8 %res
}

; No single instruction NEON ORV support. Use SVE.
define i16 @orv_v4i16(<4 x i16> %a) #0 {
; CHECK-LABEL: orv_v4i16:
; CHECK: ptrue [[PG:p[0-9]+]].h, vl4
; CHECK: orv h[[REDUCE:[0-9]+]], [[PG]], z0.h
; CHECK: fmov w0, s[[REDUCE]]
; CHECK: ret
  %res = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> %a)
  ret i16 %res
}

; No single instruction NEON ORV support. Use SVE.
define i16 @orv_v8i16(<8 x i16> %a) #0 {
; CHECK-LABEL: orv_v8i16:
; CHECK: ptrue [[PG:p[0-9]+]].h, vl8
; CHECK: orv h[[REDUCE:[0-9]+]], [[PG]], z0.h
; CHECK: fmov w0, s[[REDUCE]]
; CHECK: ret
  %res = call i16 @llvm.vector.reduce.or.v8i16(<8 x i16> %a)
  ret i16 %res
}

define i16 @orv_v16i16(<16 x i16>* %a) #0 {
; CHECK-LABEL: orv_v16i16:
; CHECK: ptrue [[PG:p[0-9]+]].h, vl16
; CHECK-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
; CHECK-NEXT: orv h[[REDUCE:[0-9]+]], [[PG]], [[OP]].h
; CHECK-NEXT: fmov w0, s[[REDUCE]]
; CHECK-NEXT: ret
  %op = load <16 x i16>, <16 x i16>* %a
  %res = call i16 @llvm.vector.reduce.or.v16i16(<16 x i16> %op)
  ret i16 %res
}

define i16 @orv_v32i16(<32 x i16>* %a) #0 {
; CHECK-LABEL: orv_v32i16:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32
; VBITS_GE_512-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_512-NEXT: orv h[[REDUCE:[0-9]+]], [[PG]], [[OP]].h
; VBITS_GE_512-NEXT: fmov w0, s[[REDUCE]]
; VBITS_GE_512-NEXT: ret

; Ensure sensible type legalisation.
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16
; VBITS_EQ_256-DAG: add x[[A_HI:[0-9]+]], x0, #32
; VBITS_EQ_256-DAG: ld1h { [[LO:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_EQ_256-DAG: ld1h { [[HI:z[0-9]+]].h }, [[PG]]/z, [x[[A_HI]]]
; VBITS_EQ_256-DAG: orr [[OR:z[0-9]+]].d, [[LO]].d, [[HI]].d
; VBITS_EQ_256-DAG: orv h[[REDUCE:[0-9]+]], [[PG]], [[OR]].h
; VBITS_EQ_256-NEXT: fmov w0, s[[REDUCE]]
; VBITS_EQ_256-NEXT: ret
  %op = load <32 x i16>, <32 x i16>* %a
  %res = call i16 @llvm.vector.reduce.or.v32i16(<32 x i16> %op)
  ret i16 %res
}

define i16 @orv_v64i16(<64 x i16>* %a) #0 {
; CHECK-LABEL: orv_v64i16:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64
; VBITS_GE_1024-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_1024-NEXT: orv h[[REDUCE:[0-9]+]], [[PG]], [[OP]].h
; VBITS_GE_1024-NEXT: fmov w0, s[[REDUCE]]
; VBITS_GE_1024-NEXT: ret
  %op = load <64 x i16>, <64 x i16>* %a
  %res = call i16 @llvm.vector.reduce.or.v64i16(<64 x i16> %op)
  ret i16 %res
}

define i16 @orv_v128i16(<128 x i16>* %a) #0 {
; CHECK-LABEL: orv_v128i16:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl128
; VBITS_GE_2048-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_2048-NEXT: orv h[[REDUCE:[0-9]+]], [[PG]], [[OP]].h
; VBITS_GE_2048-NEXT: fmov w0, s[[REDUCE]]
; VBITS_GE_2048-NEXT: ret
  %op = load <128 x i16>, <128 x i16>* %a
  %res = call i16 @llvm.vector.reduce.or.v128i16(<128 x i16> %op)
  ret i16 %res
}

; No single instruction NEON ORV support. Use SVE.
define i32 @orv_v2i32(<2 x i32> %a) #0 {
; CHECK-LABEL: orv_v2i32:
; CHECK: ptrue [[PG:p[0-9]+]].s, vl2
; CHECK: orv [[REDUCE:s[0-9]+]], [[PG]], z0.s
; CHECK: fmov w0, [[REDUCE]]
; CHECK: ret
  %res = call i32 @llvm.vector.reduce.or.v2i32(<2 x i32> %a)
  ret i32 %res
}

; No single instruction NEON ORV support. Use SVE.
define i32 @orv_v4i32(<4 x i32> %a) #0 {
; CHECK-LABEL: orv_v4i32:
; CHECK: ptrue [[PG:p[0-9]+]].s, vl4
; CHECK: orv [[REDUCE:s[0-9]+]], [[PG]], z0.s
; CHECK: fmov w0, [[REDUCE]]
; CHECK: ret
  %res = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %a)
  ret i32 %res
}

define i32 @orv_v8i32(<8 x i32>* %a) #0 {
; CHECK-LABEL: orv_v8i32:
; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
; CHECK-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
; CHECK-NEXT: orv [[REDUCE:s[0-9]+]], [[PG]], [[OP]].s
; CHECK-NEXT: fmov w0, [[REDUCE]]
; CHECK-NEXT: ret
  %op = load <8 x i32>, <8 x i32>* %a
  %res = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> %op)
  ret i32 %res
}

define i32 @orv_v16i32(<16 x i32>* %a) #0 {
; CHECK-LABEL: orv_v16i32:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
; VBITS_GE_512-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_512-NEXT: orv [[REDUCE:s[0-9]+]], [[PG]], [[OP]].s
; VBITS_GE_512-NEXT: fmov w0, [[REDUCE]]
; VBITS_GE_512-NEXT: ret

; Ensure sensible type legalisation.
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8
; VBITS_EQ_256-DAG: add x[[A_HI:[0-9]+]], x0, #32
; VBITS_EQ_256-DAG: ld1w { [[LO:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_EQ_256-DAG: ld1w { [[HI:z[0-9]+]].s }, [[PG]]/z, [x[[A_HI]]]
; VBITS_EQ_256-DAG: orr [[OR:z[0-9]+]].d, [[LO]].d, [[HI]].d
; VBITS_EQ_256-DAG: orv [[REDUCE:s[0-9]+]], [[PG]], [[OR]].s
; VBITS_EQ_256-NEXT: fmov w0, [[REDUCE]]
; VBITS_EQ_256-NEXT: ret
  %op = load <16 x i32>, <16 x i32>* %a
  %res = call i32 @llvm.vector.reduce.or.v16i32(<16 x i32> %op)
  ret i32 %res
}

define i32 @orv_v32i32(<32 x i32>* %a) #0 {
; CHECK-LABEL: orv_v32i32:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32
; VBITS_GE_1024-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_1024-NEXT: orv [[REDUCE:s[0-9]+]], [[PG]], [[OP]].s
; VBITS_GE_1024-NEXT: fmov w0, [[REDUCE]]
; VBITS_GE_1024-NEXT: ret
  %op = load <32 x i32>, <32 x i32>* %a
  %res = call i32 @llvm.vector.reduce.or.v32i32(<32 x i32> %op)
  ret i32 %res
}

define i32 @orv_v64i32(<64 x i32>* %a) #0 {
; CHECK-LABEL: orv_v64i32:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64
; VBITS_GE_2048-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_2048-NEXT: orv [[REDUCE:s[0-9]+]], [[PG]], [[OP]].s
; VBITS_GE_2048-NEXT: fmov w0, [[REDUCE]]
; VBITS_GE_2048-NEXT: ret
  %op = load <64 x i32>, <64 x i32>* %a
  %res = call i32 @llvm.vector.reduce.or.v64i32(<64 x i32> %op)
  ret i32 %res
}

; Nothing to do for single element vectors.
define i64 @orv_v1i64(<1 x i64> %a) #0 {
; CHECK-LABEL: orv_v1i64:
; CHECK: fmov x0, d0
; CHECK: ret
  %res = call i64 @llvm.vector.reduce.or.v1i64(<1 x i64> %a)
  ret i64 %res
}

; Use SVE for 128-bit vectors.
define i64 @orv_v2i64(<2 x i64> %a) #0 {
; CHECK-LABEL: orv_v2i64:
; CHECK: ptrue [[PG:p[0-9]+]].d, vl2
; CHECK: orv [[REDUCE:d[0-9]+]], [[PG]], z0.d
; CHECK: fmov x0, [[REDUCE]]
; CHECK: ret
  %res = call i64 @llvm.vector.reduce.or.v2i64(<2 x i64> %a)
  ret i64 %res
}

define i64 @orv_v4i64(<4 x i64>* %a) #0 {
; CHECK-LABEL: orv_v4i64:
; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
; CHECK-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
; CHECK-NEXT: orv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].d
; CHECK-NEXT: fmov x0, [[REDUCE]]
; CHECK-NEXT: ret
  %op = load <4 x i64>, <4 x i64>* %a
  %res = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> %op)
  ret i64 %res
}

define i64 @orv_v8i64(<8 x i64>* %a) #0 {
; CHECK-LABEL: orv_v8i64:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
; VBITS_GE_512-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_512-NEXT: orv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].d
; VBITS_GE_512-NEXT: fmov x0, [[REDUCE]]
; VBITS_GE_512-NEXT: ret

; Ensure sensible type legalisation.
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4
; VBITS_EQ_256-DAG: add x[[A_HI:[0-9]+]], x0, #32
; VBITS_EQ_256-DAG: ld1d { [[LO:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_EQ_256-DAG: ld1d { [[HI:z[0-9]+]].d }, [[PG]]/z, [x[[A_HI]]]
; VBITS_EQ_256-DAG: orr [[OR:z[0-9]+]].d, [[LO]].d, [[HI]].d
; VBITS_EQ_256-DAG: orv [[REDUCE:d[0-9]+]], [[PG]], [[OR]].d
; VBITS_EQ_256-NEXT: fmov x0, [[REDUCE]]
; VBITS_EQ_256-NEXT: ret
  %op = load <8 x i64>, <8 x i64>* %a
  %res = call i64 @llvm.vector.reduce.or.v8i64(<8 x i64> %op)
  ret i64 %res
}

define i64 @orv_v16i64(<16 x i64>* %a) #0 {
; CHECK-LABEL: orv_v16i64:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
; VBITS_GE_1024-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_1024-NEXT: orv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].d
; VBITS_GE_1024-NEXT: fmov x0, [[REDUCE]]
; VBITS_GE_1024-NEXT: ret
  %op = load <16 x i64>, <16 x i64>* %a
  %res = call i64 @llvm.vector.reduce.or.v16i64(<16 x i64> %op)
  ret i64 %res
}

define i64 @orv_v32i64(<32 x i64>* %a) #0 {
; CHECK-LABEL: orv_v32i64:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
; VBITS_GE_2048-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_2048-NEXT: orv [[REDUCE:d[0-9]+]], [[PG]], [[OP]].d
; VBITS_GE_2048-NEXT: fmov x0, [[REDUCE]]
; VBITS_GE_2048-NEXT: ret
  %op = load <32 x i64>, <32 x i64>* %a
  %res = call i64 @llvm.vector.reduce.or.v32i64(<32 x i64> %op)
  ret i64 %res
}

attributes #0 = { "target-features"="+sve" }

declare i8 @llvm.vector.reduce.and.v8i8(<8 x i8>)
declare i8 @llvm.vector.reduce.and.v16i8(<16 x i8>)
declare i8 @llvm.vector.reduce.and.v32i8(<32 x i8>)
declare i8 @llvm.vector.reduce.and.v64i8(<64 x i8>)
declare i8 @llvm.vector.reduce.and.v128i8(<128 x i8>)
declare i8 @llvm.vector.reduce.and.v256i8(<256 x i8>)

declare i16 @llvm.vector.reduce.and.v4i16(<4 x i16>)
declare i16 @llvm.vector.reduce.and.v8i16(<8 x i16>)
declare i16 @llvm.vector.reduce.and.v16i16(<16 x i16>)
declare i16 @llvm.vector.reduce.and.v32i16(<32 x i16>)
declare i16 @llvm.vector.reduce.and.v64i16(<64 x i16>)
declare i16 @llvm.vector.reduce.and.v128i16(<128 x i16>)

declare i32 @llvm.vector.reduce.and.v2i32(<2 x i32>)
declare i32 @llvm.vector.reduce.and.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.and.v8i32(<8 x i32>)
declare i32 @llvm.vector.reduce.and.v16i32(<16 x i32>)
declare i32 @llvm.vector.reduce.and.v32i32(<32 x i32>)
declare i32 @llvm.vector.reduce.and.v64i32(<64 x i32>)

declare i64 @llvm.vector.reduce.and.v1i64(<1 x i64>)
declare i64 @llvm.vector.reduce.and.v2i64(<2 x i64>)
declare i64 @llvm.vector.reduce.and.v4i64(<4 x i64>)
declare i64 @llvm.vector.reduce.and.v8i64(<8 x i64>)
declare i64 @llvm.vector.reduce.and.v16i64(<16 x i64>)
declare i64 @llvm.vector.reduce.and.v32i64(<32 x i64>)

declare i8 @llvm.vector.reduce.or.v8i8(<8 x i8>)
declare i8 @llvm.vector.reduce.or.v16i8(<16 x i8>)
declare i8 @llvm.vector.reduce.or.v32i8(<32 x i8>)
declare i8 @llvm.vector.reduce.or.v64i8(<64 x i8>)
declare i8 @llvm.vector.reduce.or.v128i8(<128 x i8>)
declare i8 @llvm.vector.reduce.or.v256i8(<256 x i8>)

declare i16 @llvm.vector.reduce.or.v4i16(<4 x i16>)
declare i16 @llvm.vector.reduce.or.v8i16(<8 x i16>)
declare i16 @llvm.vector.reduce.or.v16i16(<16 x i16>)
declare i16 @llvm.vector.reduce.or.v32i16(<32 x i16>)
declare i16 @llvm.vector.reduce.or.v64i16(<64 x i16>)
declare i16 @llvm.vector.reduce.or.v128i16(<128 x i16>)

declare i32 @llvm.vector.reduce.or.v2i32(<2 x i32>)
declare i32 @llvm.vector.reduce.or.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.or.v8i32(<8 x i32>)
declare i32 @llvm.vector.reduce.or.v16i32(<16 x i32>)
declare i32 @llvm.vector.reduce.or.v32i32(<32 x i32>)
declare i32 @llvm.vector.reduce.or.v64i32(<64 x i32>)

declare i64 @llvm.vector.reduce.or.v1i64(<1 x i64>)
declare i64 @llvm.vector.reduce.or.v2i64(<2 x i64>)
declare i64 @llvm.vector.reduce.or.v4i64(<4 x i64>)
declare i64 @llvm.vector.reduce.or.v8i64(<8 x i64>)
declare i64 @llvm.vector.reduce.or.v16i64(<16 x i64>)
declare i64 @llvm.vector.reduce.or.v32i64(<32 x i64>)

declare i8 @llvm.vector.reduce.xor.v8i8(<8 x i8>)
declare i8 @llvm.vector.reduce.xor.v16i8(<16 x i8>)
declare i8 @llvm.vector.reduce.xor.v32i8(<32 x i8>)
declare i8 @llvm.vector.reduce.xor.v64i8(<64 x i8>)
declare i8 @llvm.vector.reduce.xor.v128i8(<128 x i8>)
declare i8 @llvm.vector.reduce.xor.v256i8(<256 x i8>)

declare i16 @llvm.vector.reduce.xor.v4i16(<4 x i16>)
declare i16 @llvm.vector.reduce.xor.v8i16(<8 x i16>)
declare i16 @llvm.vector.reduce.xor.v16i16(<16 x i16>)
declare i16 @llvm.vector.reduce.xor.v32i16(<32 x i16>)
declare i16 @llvm.vector.reduce.xor.v64i16(<64 x i16>)
declare i16 @llvm.vector.reduce.xor.v128i16(<128 x i16>)

declare i32 @llvm.vector.reduce.xor.v2i32(<2 x i32>)
declare i32 @llvm.vector.reduce.xor.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.xor.v8i32(<8 x i32>)
declare i32 @llvm.vector.reduce.xor.v16i32(<16 x i32>)
declare i32 @llvm.vector.reduce.xor.v32i32(<32 x i32>)
declare i32 @llvm.vector.reduce.xor.v64i32(<64 x i32>)

declare i64 @llvm.vector.reduce.xor.v1i64(<1 x i64>)
declare i64 @llvm.vector.reduce.xor.v2i64(<2 x i64>)
declare i64 @llvm.vector.reduce.xor.v4i64(<4 x i64>)
declare i64 @llvm.vector.reduce.xor.v8i64(<8 x i64>)
declare i64 @llvm.vector.reduce.xor.v16i64(<16 x i64>)
declare i64 @llvm.vector.reduce.xor.v32i64(<32 x i64>)