1 // Copyright 2021 Google LLC 2 // 3 // This source code is licensed under the BSD-style license found in the 4 // LICENSE file in the root directory of this source tree. 5 6 #include <xnnpack/allocator.h> 7 #include <xnnpack/assembler.h> 8 9 #include <cstddef> 10 #include <cstdint> 11 #include <initializer_list> 12 13 namespace xnnpack { 14 namespace aarch32 { 15 16 enum class SpecialFPRegister { 17 kFPSCR = 1, 18 }; 19 20 constexpr SpecialFPRegister FPSCR = SpecialFPRegister::kFPSCR; 21 22 struct CoreRegister { 23 uint8_t code; 24 }; 25 26 constexpr CoreRegister r0{0}; 27 constexpr CoreRegister r1{1}; 28 constexpr CoreRegister r2{2}; 29 constexpr CoreRegister r3{3}; 30 constexpr CoreRegister r4{4}; 31 constexpr CoreRegister r5{5}; 32 constexpr CoreRegister r6{6}; 33 constexpr CoreRegister r7{7}; 34 constexpr CoreRegister r8{8}; 35 constexpr CoreRegister r9{9}; 36 constexpr CoreRegister r10{10}; 37 constexpr CoreRegister r11{11}; 38 constexpr CoreRegister r12{12}; 39 constexpr CoreRegister r13{13}; 40 constexpr CoreRegister r14{14}; 41 constexpr CoreRegister r15{15}; 42 constexpr CoreRegister sp = r13; 43 constexpr CoreRegister lr = r14; 44 constexpr CoreRegister pc = r15; 45 constexpr CoreRegister APSR_nzcv = r15; 46 47 static inline bool operator==(const CoreRegister lhs, const CoreRegister rhs) { 48 return lhs.code == rhs.code; 49 } 50 51 struct CoreRegisterList { CoreRegisterListCoreRegisterList52 CoreRegisterList(std::initializer_list<CoreRegister> rs) { 53 for (auto r : rs) { 54 list |= 1 << r.code; 55 } 56 } 57 has_more_than_one_registerCoreRegisterList58 bool has_more_than_one_register() { return (list & (list - 1)) != 0; } 59 60 // Bit i is set if CoreRegister is in the list. 
61 uint16_t list = 0; 62 }; 63 64 static inline bool operator==(int i, CoreRegisterList registers) { 65 return i == registers.list; 66 } 67 68 struct SRegister { 69 uint8_t code; dSRegister70 uint8_t d() const { return code & 0x1; } vdSRegister71 uint8_t vd() const { return (code & 0x1e) >> 1; } 72 }; 73 74 static inline bool operator==(const SRegister lhs, const SRegister rhs) { 75 return lhs.code == rhs.code; 76 } 77 78 constexpr SRegister s0{0}; 79 constexpr SRegister s1{1}; 80 constexpr SRegister s2{2}; 81 constexpr SRegister s3{3}; 82 constexpr SRegister s4{4}; 83 constexpr SRegister s5{5}; 84 constexpr SRegister s6{6}; 85 constexpr SRegister s7{7}; 86 constexpr SRegister s8{8}; 87 constexpr SRegister s9{9}; 88 constexpr SRegister s10{10}; 89 constexpr SRegister s11{11}; 90 constexpr SRegister s12{12}; 91 constexpr SRegister s13{13}; 92 constexpr SRegister s14{14}; 93 constexpr SRegister s15{15}; 94 constexpr SRegister s16{16}; 95 constexpr SRegister s17{17}; 96 constexpr SRegister s18{18}; 97 constexpr SRegister s19{19}; 98 constexpr SRegister s20{20}; 99 constexpr SRegister s21{21}; 100 constexpr SRegister s22{22}; 101 constexpr SRegister s23{23}; 102 constexpr SRegister s24{24}; 103 constexpr SRegister s25{25}; 104 constexpr SRegister s26{26}; 105 constexpr SRegister s27{27}; 106 constexpr SRegister s28{28}; 107 constexpr SRegister s29{29}; 108 constexpr SRegister s30{30}; 109 constexpr SRegister s31{31}; 110 111 // Define DRegisterLane before DRegister so that we can have the operator[] overloading for nice syntax. 
// One lane of a doubleword register: the register number plus a lane index.
struct DRegisterLane {
  uint8_t code;
  uint8_t lane;

  // Bit 4 of the register number.
  constexpr uint8_t d() const { return (code & 0x10) >> 4; }
  // Low four bits of the register number.
  constexpr uint8_t vd() const { return code & 0xf; }
};

// Lanes are equal when both the register number and the lane index match.
static inline bool operator==(const DRegisterLane lhs, const DRegisterLane rhs) {
  return lhs.code == rhs.code && lhs.lane == rhs.lane;
}

// A doubleword SIMD/floating-point register, identified by its register number.
struct DRegister {
  uint8_t code;

  // Bit 4 of the register number.
  constexpr uint8_t d() const { return (code & 0x10) >> 4; }
  // Low four bits of the register number.
  constexpr uint8_t vd() const { return code & 0xf; }

  // `dN[pos]` names lane `pos` of this register. `pos` is truncated to 8 bits
  // and is not range-checked here.
  constexpr const DRegisterLane operator[](std::size_t pos) const {
    return DRegisterLane{code, static_cast<uint8_t>(pos)};
  }
};

// Two D registers are equal when their register numbers match.
static inline bool operator==(const DRegister lhs, const DRegister rhs) {
  return lhs.code == rhs.code;
}

constexpr DRegister d0{0};
constexpr DRegister d1{1};
constexpr DRegister d2{2};
constexpr DRegister d3{3};
constexpr DRegister d4{4};
constexpr DRegister d5{5};
constexpr DRegister d6{6};
constexpr DRegister d7{7};
constexpr DRegister d8{8};
constexpr DRegister d9{9};
constexpr DRegister d10{10};
constexpr DRegister d11{11};
constexpr DRegister d12{12};
constexpr DRegister d13{13};
constexpr DRegister d14{14};
constexpr DRegister d15{15};
constexpr DRegister d16{16};
constexpr DRegister d17{17};
constexpr DRegister d18{18};
constexpr DRegister d19{19};
constexpr DRegister d20{20};
constexpr DRegister d21{21};
constexpr DRegister d22{22};
constexpr DRegister d23{23};
constexpr DRegister d24{24};
constexpr DRegister d25{25};
constexpr DRegister d26{26};
constexpr DRegister d27{27};
constexpr DRegister d28{28};
constexpr DRegister d29{29};
constexpr DRegister d30{30};
constexpr DRegister d31{31};

// A quadword SIMD register, identified by its register number. Qn overlays the
// doubleword pair D(2n) / D(2n+1), as exposed by low() and high().
struct QRegister {
  uint8_t code;
  // Encode code * 2.
  // d() is bit 4 and vd() the low four bits of the doubled register number.
  constexpr uint8_t d() const { return (code & 0x8) >> 3; }
  constexpr uint8_t vd() const { return (code & 0x7) << 1; }
  // Lower doubleword half, D(2 * code).
  constexpr DRegister low() const { return DRegister{static_cast<uint8_t>(code * 2)}; }
  // Upper doubleword half, D(2 * code + 1).
  constexpr DRegister high() const { return DRegister{static_cast<uint8_t>(code * 2 + 1)}; }
};

// Two Q registers are equal when their register numbers match.
static inline bool operator==(const QRegister lhs, const QRegister rhs) {
  return lhs.code == rhs.code;
}

constexpr QRegister q0{0};
constexpr QRegister q1{1};
constexpr QRegister q2{2};
constexpr QRegister q3{3};
constexpr QRegister q4{4};
constexpr QRegister q5{5};
constexpr QRegister q6{6};
constexpr QRegister q7{7};
constexpr QRegister q8{8};
constexpr QRegister q9{9};
constexpr QRegister q10{10};
constexpr QRegister q11{11};
constexpr QRegister q12{12};
constexpr QRegister q13{13};
constexpr QRegister q14{14};
constexpr QRegister q15{15};

// SIMD register lists are used in a more restrictive way, compared to core
// registers, only consecutive registers are used as an operand to instruction.
// A list is therefore stored as its first register plus a register count.
template <typename RegType>
struct ConsecutiveRegisterList {
  // Inclusive range [s, end]. End must be >= start; this is not checked, and a
  // smaller `end` wraps around when the count is narrowed to uint8_t.
  ConsecutiveRegisterList(RegType s, RegType end)
      : start(s),
        length(static_cast<uint8_t>(end.code - s.code + 1)) {}
  // `len` registers starting at `s`; `len` is narrowed to uint8_t.
  explicit ConsecutiveRegisterList(RegType s, int len)
      : start(s),
        length(static_cast<uint8_t>(len)) {}
  // Intentionally implicit so a single register can be passed where a list is
  // expected; yields a list of length 1.
  ConsecutiveRegisterList(RegType start)
      : ConsecutiveRegisterList(start, start) {}

  RegType start;
  uint8_t length;
};

// Specific struct for VLD2 and VLD3 register list operand.
221 struct VLoadStoreRegList { VLoadStoreRegListVLoadStoreRegList222 VLoadStoreRegList(DRegister reg1, DRegister reg2) 223 : reg1(reg1), reg2(reg2) { 224 if (reg1.code == reg2.code - 2) { 225 double_spaced = true; 226 } else { 227 double_spaced = false; 228 } 229 } VLoadStoreRegListVLoadStoreRegList230 VLoadStoreRegList(DRegister reg1, DRegister reg2, DRegister reg3) 231 : reg1(reg1), reg2(reg2), reg3(reg3) { 232 if (reg1.code == reg2.code - 2) { 233 double_spaced = true; 234 } else { 235 double_spaced = false; 236 } 237 } 238 239 DRegister reg1; 240 DRegister reg2; 241 DRegister reg3; 242 bool double_spaced; 243 }; 244 245 using SRegisterList = ConsecutiveRegisterList<SRegister>; 246 using DRegisterList = ConsecutiveRegisterList<DRegister>; 247 248 static inline SRegisterList operator-(const SRegister lhs, const SRegister rhs) { 249 return SRegisterList(lhs, rhs); 250 } 251 252 static inline DRegisterList operator-(const DRegister lhs, const DRegister rhs) { 253 return DRegisterList(lhs, rhs); 254 } 255 256 struct QRegisterList { QRegisterListQRegisterList257 QRegisterList(QRegister s) : start(s), length(1) {} QRegisterListQRegisterList258 QRegisterList(QRegister s, QRegister end) : start(s), length(end.code - s.code + 1) {} 259 // Explicit conversion to DRegisterList. DRegisterListQRegisterList260 explicit operator DRegisterList() const { 261 return DRegisterList({static_cast<uint8_t>(start.code * 2)}, length * 2); 262 } 263 264 QRegister start; 265 uint8_t length; 266 }; 267 268 static inline QRegisterList operator-(const QRegister lhs, const QRegister rhs) { 269 return QRegisterList(lhs, rhs); 270 } 271 272 // A8.5 Addressing modes for memory access. 273 enum class AddressingMode { 274 // [<Rn>, <offset>], offset applied to address in Rn. 275 kOffset, 276 // Pre-indexed not used, so not implemented. 277 // [<Rn>], <offset>, address from Rn, offset applied, written back to Rn. 
278 kPostIndexed, 279 }; 280 281 // Memory operands, operands for memory access instructions. See 282 // "MemOperandHelper mem" for a nicer syntax that is closer to assembly. 283 class MemOperand { 284 public: MemOperand(CoreRegister rn,int32_t offset)285 MemOperand(CoreRegister rn, int32_t offset) 286 : mode_(AddressingMode::kOffset), 287 rn_(rn), 288 offset_(offset) {} 289 MemOperand(CoreRegister rn,int32_t offset,AddressingMode mode)290 MemOperand(CoreRegister rn, int32_t offset, AddressingMode mode) 291 : mode_(mode), 292 rn_(rn), 293 offset_(offset) {} 294 base()295 CoreRegister base() const { return rn_; } offset()296 int32_t offset() const { return offset_; } mode()297 AddressingMode mode() const { return mode_; } 298 299 // These are bits used for encoding, named based on the encoding description. u()300 int32_t u() { return offset_ >= 0; } p()301 int32_t p() { return mode_ != AddressingMode::kPostIndexed; } 302 // Note, kPostIndexed will write back, but doesn't need to set bit w. w()303 int32_t w() { return 0; } 304 305 // Overload postfix increment to indicate a post-indexed addressing mode for load/stores. 306 MemOperand operator++(int) { 307 mode_ = AddressingMode::kPostIndexed; 308 return *this; 309 } 310 311 private: 312 AddressingMode mode_; 313 CoreRegister rn_; 314 int32_t offset_; 315 }; 316 317 static inline bool operator==(const MemOperand lhs, const MemOperand rhs) { 318 return lhs.mode() == rhs.mode() && lhs.base() == rhs.base() && lhs.offset() == rhs.offset(); 319 } 320 321 static inline MemOperand operator,(CoreRegister r, int32_t offset) { 322 return MemOperand(r, offset); 323 } 324 325 // Helper struct for some syntax sugar to look like native assembly, see mem. 
326 struct MemOperandHelper { 327 const MemOperand operator[](MemOperand op) const { return op; } 328 MemOperand operator[](CoreRegister r) const { return MemOperand(r, 0); } 329 }; 330 331 // Use "mem" (and its overload of array subscript operator) to get some syntax 332 // that looks closer to native assembly when accessing memory. For example: 333 // - ldr(r0, mem[rn, offset]); // offset 334 // - ldr(r0, mem[rn], offset); // post-indexed 335 constexpr MemOperandHelper mem; 336 337 // Conditional execution, only support AL (always) for now. 338 enum Condition : uint32_t { 339 kEQ = 0x00000000, 340 kNE = 0x10000000, 341 kCS = 0x20000000, 342 kCC = 0x30000000, 343 kMI = 0x40000000, 344 kPL = 0x50000000, 345 kVS = 0x60000000, 346 kVC = 0x70000000, 347 kHI = 0x80000000, 348 kLS = 0x90000000, 349 kGE = 0xa0000000, 350 kLT = 0xB0000000, 351 kGT = 0xC0000000, 352 kLE = 0xD0000000, 353 kAL = 0xE0000000, 354 kHS = kCS, 355 kLO = kCC, 356 }; 357 358 enum DataSize { 359 k8 = 0, 360 k16 = 1, 361 k32 = 2, 362 }; 363 364 // A simple AAarch32 assembler. 365 class Assembler : public AssemblerBase { 366 public: 367 using AssemblerBase::AssemblerBase; 368 add(CoreRegister rn,CoreRegister rm)369 void add(CoreRegister rn, CoreRegister rm) { add(rn, rn, rm); } 370 void add(CoreRegister rd, CoreRegister rn, CoreRegister rm); 371 // Only support uint8_t immediates for now, it simplifies encoding. 
372 void add(CoreRegister rd, CoreRegister rn, uint8_t imm); 373 void adds(CoreRegister rd, CoreRegister rn, uint8_t imm); 374 void and_(CoreRegister rd, CoreRegister rn, uint8_t imm); b(Label & l)375 void b(Label& l) { b(kAL, l); } beq(Label & l)376 void beq(Label& l) { b(kEQ, l); } bne(Label & l)377 void bne(Label& l) { b(kNE, l); } bhi(Label & l)378 void bhi(Label& l) { b(kHI, l); } bhs(Label & l)379 void bhs(Label& l) { b(kHS, l); } blo(Label & l)380 void blo(Label& l) { b(kLO, l); } 381 void bic(CoreRegister rd, CoreRegister rn, uint8_t imm); 382 void bx(CoreRegister rm); 383 // Cmp supports a subset of uint32_t offsets, see "A5.2.4 Modified immediate 384 // constants in ARM instructions", for simplicity we start with uint8_t, which 385 // is fully representation using a "rotation" of 0. 386 void cmp(CoreRegister rn, uint8_t imm); 387 void cmp(CoreRegister rn, CoreRegister rm); 388 void ldr(CoreRegister rt, MemOperand operand, int32_t offset); 389 void ldr(CoreRegister rt, MemOperand operand); 390 // LDRD <Rt>, <Rt2>, [<Rn>{, #+/-<imm>}]. 391 void ldrd(CoreRegister rt, CoreRegister rt2, MemOperand op); 392 void mov(CoreRegister rd, CoreRegister rm); moveq(CoreRegister rd,CoreRegister rm)393 void moveq(CoreRegister rd, CoreRegister rm) { mov(kEQ, rd, rm); } movlo(CoreRegister rd,CoreRegister rm)394 void movlo(CoreRegister rd, CoreRegister rm) { mov(kLO, rd, rm); } movls(CoreRegister rd,CoreRegister rm)395 void movls(CoreRegister rd, CoreRegister rm) { mov(kLS, rd, rm); } 396 void nop(); 397 void pld(MemOperand operand); 398 void pop(CoreRegisterList regs); 399 void push(CoreRegisterList regs); 400 void str(CoreRegister rt, MemOperand op); 401 void sub(CoreRegister rd, CoreRegister rn, uint8_t imm); 402 void sub(CoreRegister rd, CoreRegister rn, CoreRegister rm); 403 // Only support uint8_t immediates for now, it simplifies encoding. 
404 void subs(CoreRegister rd, CoreRegister rn, uint8_t imm); 405 void tst(CoreRegister rn, uint8_t imm); 406 407 // SIMD instructions. 408 void vabs_f32(QRegister qd, QRegister qm); 409 void vadd_f32(QRegister qd, QRegister qn, QRegister qm); 410 void vcmpe_f32(SRegister sd, SRegister sm); 411 void vcvt_f32_s32(QRegister qd, QRegister qm); 412 void vcvt_s32_f32(QRegister qd, QRegister qm); 413 void vcvtn_s32_f32(QRegister qd, QRegister qm); vdup_8(QRegister qd,DRegisterLane dm)414 void vdup_8(QRegister qd, DRegisterLane dm) { vdup(k8, qd, dm); } vdup_16(QRegister qd,DRegisterLane dm)415 void vdup_16(QRegister qd, DRegisterLane dm) { vdup(k16, qd, dm); } vdup_32(QRegister qd,DRegisterLane dm)416 void vdup_32(QRegister qd, DRegisterLane dm) { vdup(k32, qd, dm); } 417 void vext_8(QRegister qd, QRegister qn, QRegister qm, uint8_t imm4); 418 // VLD1.8 <list>, [<Rn>]{!} (multiple single elements). vld1_8(DRegisterList regs,MemOperand op)419 void vld1_8(DRegisterList regs, MemOperand op) { vld1(k8, regs, op); } vld1_8(DRegisterList regs,MemOperand op,CoreRegister rm)420 void vld1_8(DRegisterList regs, MemOperand op, CoreRegister rm) { vld1(k8, regs, op, rm); } vld1_8(QRegisterList regs,MemOperand op)421 void vld1_8(QRegisterList regs, MemOperand op) { vld1(k8, static_cast<DRegisterList>(regs), op); } 422 // VLD1.32 <list>, [<Rn>]{!} (multiple single elements). vld1_32(DRegisterList regs,MemOperand op)423 void vld1_32(DRegisterList regs, MemOperand op) { vld1(k32, regs, op); } vld1_32(QRegisterList regs,MemOperand op)424 void vld1_32(QRegisterList regs, MemOperand op) { vld1(k32, static_cast<DRegisterList>(regs), op); } 425 // VLD1.32 <list>, [<Rn>]{!} (single element to one lane). 426 void vld1_32(DRegisterLane dd, MemOperand op); 427 // VLD1.32 <list>, [<Rn>]{!} (single element to all lanes). 428 // We cannot differentiate the register list in C++ syntax, so use an instruction name similar to AArch64 LD1R. 
429 void vld1r_32(DRegisterList regs, MemOperand op); 430 void vld2r_32(VLoadStoreRegList regs, MemOperand op); 431 void vld3r_32(VLoadStoreRegList regs, MemOperand op); 432 // VLDM <Rn>{!}, <list> (IA). 433 void vldm(MemOperand rn, SRegisterList regs); 434 void vldm(MemOperand rn, DRegisterList regs); 435 void vldr(SRegister sd, MemOperand op); 436 void vldr(DRegister dd, MemOperand op); 437 void vmax_f32(QRegister qd, QRegister qn, QRegister qm); 438 void vmax_s8(QRegister qd, QRegister qn, QRegister qm); 439 void vmin_f32(QRegister qd, QRegister qn, QRegister qm); 440 void vmin_s8(QRegister qd, QRegister qn, QRegister qm); 441 // VMLA.F32 <Sd>, <Sn>, <Sm> 442 void vmla_f32(SRegister sd, SRegister sn, SRegister sm); 443 // VMLA.F32 <Qd>, <Qn>, <Dm[x]> 444 void vmla_f32(QRegister qd, QRegister qn, DRegisterLane dm); 445 // VMLAL.S16 <Qd>, <Dn>, <Dm[x]> 446 void vmlal_s16(QRegister qd, DRegister dn, DRegisterLane dm); 447 // VMOV.F32 <Qd>, #<imm>; encoding A1 448 void vmov(QRegister qd, uint8_t imm); 449 // VMOV.F32 <Sd>, <Sm>; encoding A2. 450 void vmov(SRegister sd, SRegister sm); 451 // VMOV <Dm>, <Rt>, <Rt2>; encoding A1. 452 void vmov(DRegister dm, CoreRegister rt, CoreRegister rt2); 453 // VMOV <Dd>, <Dm>; encoding A1. 454 void vmov(DRegister dd, DRegister dm); 455 // VMOV <Qd>, <Qm>; encoding A1. 
456 void vmov(QRegister qd, QRegister qm); 457 // VMOV_F32 <Sd>, <Sm> vmov_f32(SRegister sd,SRegister sm)458 void vmov_f32(SRegister sd, SRegister sm) { vmov_f32(kAL, sd, sm); } vmovpl_f32(SRegister sd,SRegister sm)459 void vmovpl_f32(SRegister sd, SRegister sm) { vmov_f32(kPL, sd, sm); } vmovmi_f32(SRegister sd,SRegister sm)460 void vmovmi_f32(SRegister sd, SRegister sm) { vmov_f32(kMI, sd, sm); } 461 // VMOV_F64 <Dd>, <Dm> 462 void vmov_f64(DRegister dd, DRegister dm); 463 // VMOVL.S8 <Qd>, <Dm> 464 void vmovl_s8(QRegister qd, DRegister dm); 465 void vmrs(CoreRegister rt, SpecialFPRegister spec_reg); 466 void vmul_f32(QRegister qd, QRegister qn, QRegister qm); 467 void vneg_f32(QRegister qd, QRegister qm); 468 void vpop(DRegisterList regs); 469 void vpush(DRegisterList regs); 470 void vpush(SRegisterList regs); 471 void vqadd_s16(QRegister qd, QRegister qn, QRegister qm); 472 void vqdmulh_s32(QRegister qd, QRegister qn, DRegisterLane dm); 473 void vqmovn_s16(DRegister dd, QRegister qm); 474 void vqmovn_s32(DRegister dd, QRegister qm); 475 void vqshl_s32(QRegister qd, QRegister qm, QRegister qn); 476 void vrshl_s32(QRegister qd, QRegister qm, QRegister qn); 477 void vsdot_s8(QRegister qd, QRegister qn, DRegisterLane dm); 478 // VST1.8 <list>, [<Rn>]{!} (multiple single elements). vst1_8(DRegisterList regs,MemOperand op)479 void vst1_8(DRegisterList regs, MemOperand op) { vst1(k8, regs, op); } 480 // VST1.8 <list>, [<Rn>]{!}, <Rm> (multiple single elements). vst1_8(DRegisterList regs,MemOperand op,CoreRegister rm)481 void vst1_8(DRegisterList regs, MemOperand op, CoreRegister rm) { vst1(k8, regs, op, rm); } 482 // VST1.8 <list>, [<Rn>]{!} (single element form one lane). vst1_8(DRegisterLane dd,MemOperand op)483 void vst1_8(DRegisterLane dd, MemOperand op) { vst1(k8, dd, op); } 484 // VST1.16 <list>, [<Rn>]{!} (multiple single elements). 
vst1_16(DRegisterList regs,MemOperand op)485 void vst1_16(DRegisterList regs, MemOperand op) { vst1(k16, regs, op); } 486 // VST1.16 <list>, [<Rn>]{!}, <Rm> (multiple single elements). vst1_16(DRegisterList regs,MemOperand op,CoreRegister rm)487 void vst1_16(DRegisterList regs, MemOperand op, CoreRegister rm) { vst1(k16, regs, op, rm); } 488 // VST1.16 <list>, [<Rn>]{!} (single element form one lane). vst1_16(DRegisterLane dd,MemOperand op)489 void vst1_16(DRegisterLane dd, MemOperand op) { vst1(k16, dd, op); } 490 // VST1.32 <list>, [<Rn>]{!} (multiple single elements). vst1_32(DRegisterList regs,MemOperand op)491 void vst1_32(DRegisterList regs, MemOperand op) { vst1(k32, regs, op); } 492 // VST1.32 <list>, [<Rn>]{!}, <Rm> (multiple single elements). vst1_32(DRegisterList regs,MemOperand op,CoreRegister rm)493 void vst1_32(DRegisterList regs, MemOperand op, CoreRegister rm) { vst1(k32, regs, op, rm); } 494 // VST1.32 <list>, [<Rn>]{!} (single element form one lane). vst1_32(DRegisterLane dd,MemOperand op)495 void vst1_32(DRegisterLane dd, MemOperand op) { vst1(k32, dd, op); } 496 // VSTM <Rn>{!}, <list>, consecutive 64-bit registers. 497 void vstm(MemOperand rn, DRegisterList regs); 498 // VSTR <Sd>, [Rn{, #+/-<imm>}], store single extension register to memory. 499 void vstr(SRegister rn, MemOperand op); 500 501 // Binds Label l to the current location in the code buffer. 502 void bind(Label& l); 503 // Align the cursor to specified number of bytes, `n` must be a power of 2. 
504 void align(uint8_t n); 505 506 private: 507 void mov(Condition c, CoreRegister rd, CoreRegister rm); 508 void b(Condition c, Label& l); 509 void vdup(DataSize size, QRegister qd, DRegisterLane dm); 510 void vmov_f32(Condition c, SRegister sd, SRegister sm); 511 void vld1(DataSize size, DRegisterList regs, MemOperand op); 512 void vld1(DataSize size, DRegisterList regs, MemOperand op, CoreRegister rm); 513 void vst1(DataSize size, DRegisterList regs, MemOperand op); 514 void vst1(DataSize size, DRegisterList regs, MemOperand op, CoreRegister rm); 515 void vst1(DataSize size, DRegisterLane dd, MemOperand op); 516 }; 517 518 } // namespace aarch32 519 } // namespace xnnpack 520