1 // Copyright 2021 Google LLC 2 // 3 // This source code is licensed under the BSD-style license found in the 4 // LICENSE file in the root directory of this source tree. 5 6 #include <xnnpack/allocator.h> 7 #include <xnnpack/assembler.h> 8 9 #include <cstddef> 10 #include <cstdint> 11 #include <initializer_list> 12 13 namespace xnnpack { 14 namespace aarch32 { 15 16 enum class SpecialFPRegister { 17 kFPSCR = 1, 18 }; 19 20 constexpr SpecialFPRegister FPSCR = SpecialFPRegister::kFPSCR; 21 22 struct CoreRegister { 23 uint8_t code; 24 }; 25 26 constexpr CoreRegister r0{0}; 27 constexpr CoreRegister r1{1}; 28 constexpr CoreRegister r2{2}; 29 constexpr CoreRegister r3{3}; 30 constexpr CoreRegister r4{4}; 31 constexpr CoreRegister r5{5}; 32 constexpr CoreRegister r6{6}; 33 constexpr CoreRegister r7{7}; 34 constexpr CoreRegister r8{8}; 35 constexpr CoreRegister r9{9}; 36 constexpr CoreRegister r10{10}; 37 constexpr CoreRegister r11{11}; 38 constexpr CoreRegister r12{12}; 39 constexpr CoreRegister r13{13}; 40 constexpr CoreRegister r14{14}; 41 constexpr CoreRegister r15{15}; 42 constexpr CoreRegister sp = r13; 43 constexpr CoreRegister lr = r14; 44 constexpr CoreRegister pc = r15; 45 constexpr CoreRegister APSR_nzcv = r15; 46 47 static inline bool operator==(const CoreRegister lhs, const CoreRegister rhs) { 48 return lhs.code == rhs.code; 49 } 50 51 struct CoreRegisterList { CoreRegisterListCoreRegisterList52 CoreRegisterList(std::initializer_list<CoreRegister> rs) { 53 for (auto r : rs) { 54 list |= 1 << r.code; 55 } 56 } 57 has_more_than_one_registerCoreRegisterList58 bool has_more_than_one_register() { return (list & (list - 1)) != 0; } 59 60 // Bit i is set if CoreRegister is in the list. 61 uint16_t list = 0; 62 }; 63 64 static inline bool operator==(int i, CoreRegisterList registers) { 65 return i == registers.list; 66 } 67 68 struct SRegister { 69 uint8_t code; dSRegister70 uint8_t d() const { return code & 0x1; } vdSRegister71 uint8_t vd() const { return (code & 0x1e) >> 1; } 72 }; 73 74 static inline bool operator==(const SRegister lhs, const SRegister rhs) { 75 return lhs.code == rhs.code; 76 } 77 78 constexpr SRegister s0{0}; 79 constexpr SRegister s1{1}; 80 constexpr SRegister s2{2}; 81 constexpr SRegister s3{3}; 82 constexpr SRegister s4{4}; 83 constexpr SRegister s5{5}; 84 constexpr SRegister s6{6}; 85 constexpr SRegister s7{7}; 86 constexpr SRegister s8{8}; 87 constexpr SRegister s9{9}; 88 constexpr SRegister s10{10}; 89 constexpr SRegister s11{11}; 90 constexpr SRegister s12{12}; 91 constexpr SRegister s13{13}; 92 constexpr SRegister s14{14}; 93 constexpr SRegister s15{15}; 94 constexpr SRegister s16{16}; 95 constexpr SRegister s17{17}; 96 constexpr SRegister s18{18}; 97 constexpr SRegister s19{19}; 98 constexpr SRegister s20{20}; 99 constexpr SRegister s21{21}; 100 constexpr SRegister s22{22}; 101 constexpr SRegister s23{23}; 102 constexpr SRegister s24{24}; 103 constexpr SRegister s25{25}; 104 constexpr SRegister s26{26}; 105 constexpr SRegister s27{27}; 106 constexpr SRegister s28{28}; 107 constexpr SRegister s29{29}; 108 constexpr SRegister s30{30}; 109 constexpr SRegister s31{31}; 110 111 // Define DRegisterLane before DRegister so that we can have the operator[] overloading for nice syntax. 112 struct DRegisterLane { 113 uint8_t code; 114 uint8_t lane; 115 dDRegisterLane116 uint8_t d() const { return (code & 0x10) >> 4; } vdDRegisterLane117 uint8_t vd() const { return code & 0xf; } 118 }; 119 120 static inline bool operator==(const DRegisterLane lhs, const DRegisterLane rhs) { 121 return lhs.code == rhs.code && lhs.lane == rhs.lane; 122 } 123 124 struct DRegister { 125 uint8_t code; 126 dDRegister127 uint8_t d() const { return (code & 0x10) >> 4; } vdDRegister128 uint8_t vd() const { return code & 0xf; } 129 130 const DRegisterLane operator[](std::size_t pos) const { 131 return DRegisterLane{code, static_cast<uint8_t>(pos)}; 132 } 133 }; 134 135 static inline bool operator==(const DRegister lhs, const DRegister rhs) { 136 return lhs.code == rhs.code; 137 } 138 139 constexpr DRegister d0{0}; 140 constexpr DRegister d1{1}; 141 constexpr DRegister d2{2}; 142 constexpr DRegister d3{3}; 143 constexpr DRegister d4{4}; 144 constexpr DRegister d5{5}; 145 constexpr DRegister d6{6}; 146 constexpr DRegister d7{7}; 147 constexpr DRegister d8{8}; 148 constexpr DRegister d9{9}; 149 constexpr DRegister d10{10}; 150 constexpr DRegister d11{11}; 151 constexpr DRegister d12{12}; 152 constexpr DRegister d13{13}; 153 constexpr DRegister d14{14}; 154 constexpr DRegister d15{15}; 155 constexpr DRegister d16{16}; 156 constexpr DRegister d17{17}; 157 constexpr DRegister d18{18}; 158 constexpr DRegister d19{19}; 159 constexpr DRegister d20{20}; 160 constexpr DRegister d21{21}; 161 constexpr DRegister d22{22}; 162 constexpr DRegister d23{23}; 163 constexpr DRegister d24{24}; 164 constexpr DRegister d25{25}; 165 constexpr DRegister d26{26}; 166 constexpr DRegister d27{27}; 167 constexpr DRegister d28{28}; 168 constexpr DRegister d29{29}; 169 constexpr DRegister d30{30}; 170 constexpr DRegister d31{31}; 171 172 struct QRegister { 173 uint8_t code; 174 // Encode code * 2. dQRegister175 uint8_t d() const { return (code & 0x8) >> 3; } vdQRegister176 uint8_t vd() const { return (code & 0x7) << 1; } 177 }; 178 179 static inline bool operator==(const QRegister lhs, const QRegister rhs) { 180 return lhs.code == rhs.code; 181 } 182 183 constexpr QRegister q0{0}; 184 constexpr QRegister q1{1}; 185 constexpr QRegister q2{2}; 186 constexpr QRegister q3{3}; 187 constexpr QRegister q4{4}; 188 constexpr QRegister q5{5}; 189 constexpr QRegister q6{6}; 190 constexpr QRegister q7{7}; 191 constexpr QRegister q8{8}; 192 constexpr QRegister q9{9}; 193 constexpr QRegister q10{10}; 194 constexpr QRegister q11{11}; 195 constexpr QRegister q12{12}; 196 constexpr QRegister q13{13}; 197 constexpr QRegister q14{14}; 198 constexpr QRegister q15{15}; 199 200 // SIMD register lists are used in a more restrictive way, compared to core 201 // registers, only consecutive registers are used as an operand to instruction. 202 template <typename RegType> 203 struct ConsecutiveRegisterList { 204 // End must be >= start. ConsecutiveRegisterListConsecutiveRegisterList205 ConsecutiveRegisterList(RegType s, RegType end) 206 : start(s), 207 length(end.code - s.code + 1) {} ConsecutiveRegisterListConsecutiveRegisterList208 explicit ConsecutiveRegisterList(RegType s, int len) 209 : start(s), 210 length(len) {} ConsecutiveRegisterListConsecutiveRegisterList211 ConsecutiveRegisterList(RegType start) 212 : ConsecutiveRegisterList(start, start) {} 213 214 RegType start; 215 uint8_t length; 216 }; 217 218 using SRegisterList = ConsecutiveRegisterList<SRegister>; 219 using DRegisterList = ConsecutiveRegisterList<DRegister>; 220 221 static inline SRegisterList operator-(const SRegister lhs, const SRegister rhs) { 222 return SRegisterList(lhs, rhs); 223 } 224 225 static inline DRegisterList operator-(const DRegister lhs, const DRegister rhs) { 226 return DRegisterList(lhs, rhs); 227 } 228 229 struct QRegisterList { QRegisterListQRegisterList230 QRegisterList(QRegister s) : start(s), length(1) {} QRegisterListQRegisterList231 QRegisterList(QRegister s, QRegister end) : start(s), length(end.code - s.code + 1) {} 232 // Explicit conversion to DRegisterList. DRegisterListQRegisterList233 explicit operator DRegisterList() const { 234 return DRegisterList({static_cast<uint8_t>(start.code * 2)}, length * 2); 235 } 236 237 QRegister start; 238 uint8_t length; 239 }; 240 241 static inline QRegisterList operator-(const QRegister lhs, const QRegister rhs) { 242 return QRegisterList(lhs, rhs); 243 } 244 245 // A8.5 Addressing modes for memory access. 246 enum class AddressingMode { 247 // [<Rn>, <offset>], offset applied to address in Rn. 248 kOffset, 249 // Pre-indexed not used, so not implemented. 250 // [<Rn>], <offset>, address from Rn, offset applied, written back to Rn. 251 kPostIndexed, 252 }; 253 254 // Memory operands, operands for memory access instructions. See 255 // "MemOperandHelper mem" for a nicer syntax that is closer to assembly. 256 class MemOperand { 257 public: MemOperand(CoreRegister rn,int32_t offset)258 MemOperand(CoreRegister rn, int32_t offset) 259 : mode_(AddressingMode::kOffset), 260 rn_(rn), 261 offset_(offset) {} 262 MemOperand(CoreRegister rn,int32_t offset,AddressingMode mode)263 MemOperand(CoreRegister rn, int32_t offset, AddressingMode mode) 264 : mode_(mode), 265 rn_(rn), 266 offset_(offset) {} 267 base()268 CoreRegister base() const { return rn_; } offset()269 int32_t offset() const { return offset_; } mode()270 AddressingMode mode() const { return mode_; } 271 272 // These are bits used for encoding, named based on the encoding description. u()273 int32_t u() { return offset_ >= 0; } p()274 int32_t p() { return mode_ != AddressingMode::kPostIndexed; } 275 // Note, kPostIndexed will write back, but doesn't need to set bit w. w()276 int32_t w() { return 0; } 277 278 // Overload postfix increment to indicate a post-indexed addressing mode for load/stores. 279 MemOperand operator++(int) { 280 mode_ = AddressingMode::kPostIndexed; 281 return *this; 282 } 283 284 private: 285 AddressingMode mode_; 286 CoreRegister rn_; 287 int32_t offset_; 288 }; 289 290 static inline bool operator==(const MemOperand lhs, const MemOperand rhs) { 291 return lhs.mode() == rhs.mode() && lhs.base() == rhs.base() && lhs.offset() == rhs.offset(); 292 } 293 294 static inline MemOperand operator,(CoreRegister r, int32_t offset) { 295 return MemOperand(r, offset); 296 } 297 298 // Helper struct for some syntax sugar to look like native assembly, see mem. 299 struct MemOperandHelper { 300 const MemOperand operator[](MemOperand op) const { return op; } 301 MemOperand operator[](CoreRegister r) const { return MemOperand(r, 0); } 302 }; 303 304 // Use "mem" (and its overload of array subscript operator) to get some syntax 305 // that looks closer to native assembly when accessing memory. For example: 306 // - ldr(r0, mem[rn, offset]); // offset 307 // - ldr(r0, mem[rn], offset); // post-indexed 308 constexpr MemOperandHelper mem; 309 310 // Conditional execution, only support AL (always) for now. 311 enum Condition : uint32_t { 312 kEQ = 0x00000000, 313 kNE = 0x10000000, 314 kCS = 0x20000000, 315 kCC = 0x30000000, 316 kMI = 0x40000000, 317 kPL = 0x50000000, 318 kVS = 0x60000000, 319 kVC = 0x70000000, 320 kHI = 0x80000000, 321 kLS = 0x90000000, 322 kGE = 0xa0000000, 323 kLT = 0xB0000000, 324 kGT = 0xC0000000, 325 kLE = 0xD0000000, 326 kAL = 0xE0000000, 327 kHS = kCS, 328 kLO = kCC, 329 }; 330 331 enum DataSize { 332 k8 = 0, 333 k16 = 1, 334 k32 = 2, 335 }; 336 337 // A simple AAarch32 assembler. 338 class Assembler : public AssemblerBase { 339 public: 340 using AssemblerBase::AssemblerBase; 341 add(CoreRegister rn,CoreRegister rm)342 void add(CoreRegister rn, CoreRegister rm) { add(rn, rn, rm); } 343 void add(CoreRegister rd, CoreRegister rn, CoreRegister rm); 344 // Only support uint8_t immediates for now, it simplifies encoding. 345 void add(CoreRegister rd, CoreRegister rn, uint8_t imm); 346 void adds(CoreRegister rd, CoreRegister rn, uint8_t imm); 347 void and_(CoreRegister rd, CoreRegister rn, uint8_t imm); b(Label & l)348 void b(Label& l) { b(kAL, l); } beq(Label & l)349 void beq(Label& l) { b(kEQ, l); } bne(Label & l)350 void bne(Label& l) { b(kNE, l); } bhi(Label & l)351 void bhi(Label& l) { b(kHI, l); } bhs(Label & l)352 void bhs(Label& l) { b(kHS, l); } blo(Label & l)353 void blo(Label& l) { b(kLO, l); } 354 void bic(CoreRegister rd, CoreRegister rn, uint8_t imm); 355 void bx(CoreRegister rm); 356 // Cmp supports a subset of uint32_t offsets, see "A5.2.4 Modified immediate 357 // constants in ARM instructions", for simplicity we start with uint8_t, which 358 // is fully representation using a "rotation" of 0. 359 void cmp(CoreRegister rn, uint8_t imm); 360 void cmp(CoreRegister rn, CoreRegister rm); 361 void ldr(CoreRegister rt, MemOperand operand, int32_t offset); 362 void ldr(CoreRegister rt, MemOperand operand); 363 // LDRD <Rt>, <Rt2>, [<Rn>{, #+/-<imm>}]. 364 void ldrd(CoreRegister rt, CoreRegister rt2, MemOperand op); 365 void mov(CoreRegister rd, CoreRegister rm); moveq(CoreRegister rd,CoreRegister rm)366 void moveq(CoreRegister rd, CoreRegister rm) { mov(kEQ, rd, rm); } movlo(CoreRegister rd,CoreRegister rm)367 void movlo(CoreRegister rd, CoreRegister rm) { mov(kLO, rd, rm); } movls(CoreRegister rd,CoreRegister rm)368 void movls(CoreRegister rd, CoreRegister rm) { mov(kLS, rd, rm); } 369 void nop(); 370 void pld(MemOperand operand); 371 void pop(CoreRegisterList regs); 372 void push(CoreRegisterList regs); 373 void str(CoreRegister rt, MemOperand op); 374 void sub(CoreRegister rd, CoreRegister rn, uint8_t imm); 375 void sub(CoreRegister rd, CoreRegister rn, CoreRegister rm); 376 // Only support uint8_t immediates for now, it simplifies encoding. 377 void subs(CoreRegister rd, CoreRegister rn, uint8_t imm); 378 void tst(CoreRegister rn, uint8_t imm); 379 380 // SIMD instructions. 381 void vcmpe_f32(SRegister sd, SRegister sm); 382 void vcvt_f32_s32(QRegister qd, QRegister qm); 383 void vcvt_s32_f32(QRegister qd, QRegister qm); 384 void vcvtn_s32_f32(QRegister qd, QRegister qm); vdup_8(QRegister qd,DRegisterLane dm)385 void vdup_8(QRegister qd, DRegisterLane dm) { vdup(k8, qd, dm); } vdup_16(QRegister qd,DRegisterLane dm)386 void vdup_16(QRegister qd, DRegisterLane dm) { vdup(k16, qd, dm); } vdup_32(QRegister qd,DRegisterLane dm)387 void vdup_32(QRegister qd, DRegisterLane dm) { vdup(k32, qd, dm); } 388 void vext_8(QRegister qd, QRegister qn, QRegister qm, uint8_t imm4); 389 // VLD1.8 <list>, [<Rn>]{!} (multiple single elements). vld1_8(DRegisterList regs,MemOperand op)390 void vld1_8(DRegisterList regs, MemOperand op) { vld1(k8, regs, op); } vld1_8(DRegisterList regs,MemOperand op,CoreRegister rm)391 void vld1_8(DRegisterList regs, MemOperand op, CoreRegister rm) { vld1(k8, regs, op, rm); } vld1_8(QRegisterList regs,MemOperand op)392 void vld1_8(QRegisterList regs, MemOperand op) { vld1(k8, static_cast<DRegisterList>(regs), op); } 393 // VLD1.32 <list>, [<Rn>]{!} (multiple single elements). vld1_32(DRegisterList regs,MemOperand op)394 void vld1_32(DRegisterList regs, MemOperand op) { vld1(k32, regs, op); } vld1_32(QRegisterList regs,MemOperand op)395 void vld1_32(QRegisterList regs, MemOperand op) { vld1(k32, static_cast<DRegisterList>(regs), op); } 396 // VLD1.32 <list>, [<Rn>]{!} (single element to one lane). 397 void vld1_32(DRegisterLane dd, MemOperand op); 398 // VLD1.32 <list>, [<Rn>]{!} (single element to all lanes). 399 // We cannot differentiate the register list in C++ syntax, so use an instruction name similar to AArch64 LD1R. 400 void vld1r_32(DRegisterList regs, MemOperand op); 401 // VLDM <Rn>{!}, <list> (IA). 402 void vldm(MemOperand rn, SRegisterList regs); 403 void vldm(MemOperand rn, DRegisterList regs); 404 void vldr(SRegister sd, MemOperand op); 405 void vldr(DRegister dd, MemOperand op); 406 void vmax_f32(QRegister qd, QRegister qn, QRegister qm); 407 void vmax_s8(QRegister qd, QRegister qn, QRegister qm); 408 void vmin_f32(QRegister qd, QRegister qn, QRegister qm); 409 void vmin_s8(QRegister qd, QRegister qn, QRegister qm); 410 // VMLA.F32 <Sd>, <Sn>, <Sm> 411 void vmla_f32(SRegister sd, SRegister sn, SRegister sm); 412 // VMLA.F32 <Qd>, <Qn>, <Dm[x]> 413 void vmla_f32(QRegister qd, QRegister qn, DRegisterLane dm); 414 // VMLAL.S16 <Qd>, <Dn>, <Dm[x]> 415 void vmlal_s16(QRegister qd, DRegister dn, DRegisterLane dm); 416 // VMOV.F32 <Sd>, <Sm>; encoding A2. 417 void vmov(SRegister sd, SRegister sm); 418 // VMOV <Dm>, <Rt>, <Rt2>; encoding A1. 419 void vmov(DRegister dm, CoreRegister rt, CoreRegister rt2); 420 // VMOV <Dd>, <Dm>; encoding A1. 421 void vmov(DRegister dd, DRegister dm); 422 // VMOV <Qd>, <Qm>; encoding A1. 423 void vmov(QRegister qd, QRegister qm); 424 // VMOV_F32 <Sd>, <Sm> vmov_f32(SRegister sd,SRegister sm)425 void vmov_f32(SRegister sd, SRegister sm) { vmov_f32(kAL, sd, sm); } vmovpl_f32(SRegister sd,SRegister sm)426 void vmovpl_f32(SRegister sd, SRegister sm) { vmov_f32(kPL, sd, sm); } vmovmi_f32(SRegister sd,SRegister sm)427 void vmovmi_f32(SRegister sd, SRegister sm) { vmov_f32(kMI, sd, sm); } 428 // VMOV_F64 <Dd>, <Dm> 429 void vmov_f64(DRegister dd, DRegister dm); 430 // VMOVL.S8 <Qd>, <Dm> 431 void vmovl_s8(QRegister qd, DRegister dm); 432 void vmrs(CoreRegister rt, SpecialFPRegister spec_reg); 433 void vmul_f32(QRegister qd, QRegister qn, QRegister qm); 434 void vpop(DRegisterList regs); 435 void vpush(DRegisterList regs); 436 void vpush(SRegisterList regs); 437 void vqadd_s16(QRegister qd, QRegister qn, QRegister qm); 438 void vqdmulh_s32(QRegister qd, QRegister qn, DRegisterLane dm); 439 void vqmovn_s16(DRegister dd, QRegister qm); 440 void vqmovn_s32(DRegister dd, QRegister qm); 441 void vqshl_s32(QRegister qd, QRegister qm, QRegister qn); 442 void vrshl_s32(QRegister qd, QRegister qm, QRegister qn); 443 void vsdot_s8(QRegister qd, QRegister qn, DRegisterLane dm); 444 // VST1.8 <list>, [<Rn>]{!} (multiple single elements). vst1_8(DRegisterList regs,MemOperand op)445 void vst1_8(DRegisterList regs, MemOperand op) { vst1(k8, regs, op); } 446 // VST1.8 <list>, [<Rn>]{!}, <Rm> (multiple single elements). vst1_8(DRegisterList regs,MemOperand op,CoreRegister rm)447 void vst1_8(DRegisterList regs, MemOperand op, CoreRegister rm) { vst1(k8, regs, op, rm); } 448 // VST1.8 <list>, [<Rn>]{!} (single element form one lane). vst1_8(DRegisterLane dd,MemOperand op)449 void vst1_8(DRegisterLane dd, MemOperand op) { vst1(k8, dd, op); } 450 // VST1.16 <list>, [<Rn>]{!} (multiple single elements). vst1_16(DRegisterList regs,MemOperand op)451 void vst1_16(DRegisterList regs, MemOperand op) { vst1(k16, regs, op); } 452 // VST1.16 <list>, [<Rn>]{!}, <Rm> (multiple single elements). vst1_16(DRegisterList regs,MemOperand op,CoreRegister rm)453 void vst1_16(DRegisterList regs, MemOperand op, CoreRegister rm) { vst1(k16, regs, op, rm); } 454 // VST1.16 <list>, [<Rn>]{!} (single element form one lane). vst1_16(DRegisterLane dd,MemOperand op)455 void vst1_16(DRegisterLane dd, MemOperand op) { vst1(k16, dd, op); } 456 // VST1.32 <list>, [<Rn>]{!} (multiple single elements). vst1_32(DRegisterList regs,MemOperand op)457 void vst1_32(DRegisterList regs, MemOperand op) { vst1(k32, regs, op); } 458 // VST1.32 <list>, [<Rn>]{!}, <Rm> (multiple single elements). vst1_32(DRegisterList regs,MemOperand op,CoreRegister rm)459 void vst1_32(DRegisterList regs, MemOperand op, CoreRegister rm) { vst1(k32, regs, op, rm); } 460 // VST1.32 <list>, [<Rn>]{!} (single element form one lane). vst1_32(DRegisterLane dd,MemOperand op)461 void vst1_32(DRegisterLane dd, MemOperand op) { vst1(k32, dd, op); } 462 // VSTM <Rn>{!}, <list>, consecutive 64-bit registers. 463 void vstm(MemOperand rn, DRegisterList regs); 464 // VSTR <Sd>, [Rn{, #+/-<imm>}], store single extension register to memory. 465 void vstr(SRegister rn, MemOperand op); 466 467 // Binds Label l to the current location in the code buffer. 468 void bind(Label& l); 469 // Align the cursor to specified number of bytes, `n` must be a power of 2. 470 void align(uint8_t n); 471 472 private: 473 void mov(Condition c, CoreRegister rd, CoreRegister rm); 474 void b(Condition c, Label& l); 475 void vdup(DataSize size, QRegister qd, DRegisterLane dm); 476 void vmov_f32(Condition c, SRegister sd, SRegister sm); 477 void vld1(DataSize size, DRegisterList regs, MemOperand op); 478 void vld1(DataSize size, DRegisterList regs, MemOperand op, CoreRegister rm); 479 void vst1(DataSize size, DRegisterList regs, MemOperand op); 480 void vst1(DataSize size, DRegisterList regs, MemOperand op, CoreRegister rm); 481 void vst1(DataSize size, DRegisterLane dd, MemOperand op); 482 }; 483 484 } // namespace aarch32 485 } // namespace xnnpack 486