// Copyright 2021 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/allocator.h>
#include <xnnpack/assembler.h>

#include <cstddef>
#include <cstdint>
#include <initializer_list>

namespace xnnpack {
namespace aarch32 {

enum class SpecialFPRegister {
  kFPSCR = 1,
};

constexpr SpecialFPRegister FPSCR = SpecialFPRegister::kFPSCR;

struct CoreRegister {
  uint8_t code;
};

constexpr CoreRegister r0{0};
constexpr CoreRegister r1{1};
constexpr CoreRegister r2{2};
constexpr CoreRegister r3{3};
constexpr CoreRegister r4{4};
constexpr CoreRegister r5{5};
constexpr CoreRegister r6{6};
constexpr CoreRegister r7{7};
constexpr CoreRegister r8{8};
constexpr CoreRegister r9{9};
constexpr CoreRegister r10{10};
constexpr CoreRegister r11{11};
constexpr CoreRegister r12{12};
constexpr CoreRegister r13{13};
constexpr CoreRegister r14{14};
constexpr CoreRegister r15{15};
constexpr CoreRegister sp = r13;
constexpr CoreRegister lr = r14;
constexpr CoreRegister pc = r15;
constexpr CoreRegister APSR_nzcv = r15;

static inline bool operator==(const CoreRegister lhs, const CoreRegister rhs) {
  return lhs.code == rhs.code;
}

struct CoreRegisterList {
  CoreRegisterList(std::initializer_list<CoreRegister> rs) {
    for (auto r : rs) {
      list |= 1 << r.code;
    }
  }

  bool has_more_than_one_register() { return (list & (list - 1)) != 0; }

  // Bit i is set if CoreRegister i is in the list.
  uint16_t list = 0;
};
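
// For example, CoreRegisterList({r4, r5, lr}) sets bits 4, 5, and 14
// (list == 0x4030); push() and pop() below take such a list.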

static inline bool operator==(int i, CoreRegisterList registers) {
  return i == registers.list;
}

struct SRegister {
  uint8_t code;
  uint8_t d() const { return code & 0x1; }
  uint8_t vd() const { return (code & 0x1e) >> 1; }
};
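
// For example, s15 (code 15 == 0b01111) encodes as vd() == 7 (upper four bits
// of the register number) and d() == 1 (lowest bit).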

static inline bool operator==(const SRegister lhs, const SRegister rhs) {
  return lhs.code == rhs.code;
}

constexpr SRegister s0{0};
constexpr SRegister s1{1};
constexpr SRegister s2{2};
constexpr SRegister s3{3};
constexpr SRegister s4{4};
constexpr SRegister s5{5};
constexpr SRegister s6{6};
constexpr SRegister s7{7};
constexpr SRegister s8{8};
constexpr SRegister s9{9};
constexpr SRegister s10{10};
constexpr SRegister s11{11};
constexpr SRegister s12{12};
constexpr SRegister s13{13};
constexpr SRegister s14{14};
constexpr SRegister s15{15};
constexpr SRegister s16{16};
constexpr SRegister s17{17};
constexpr SRegister s18{18};
constexpr SRegister s19{19};
constexpr SRegister s20{20};
constexpr SRegister s21{21};
constexpr SRegister s22{22};
constexpr SRegister s23{23};
constexpr SRegister s24{24};
constexpr SRegister s25{25};
constexpr SRegister s26{26};
constexpr SRegister s27{27};
constexpr SRegister s28{28};
constexpr SRegister s29{29};
constexpr SRegister s30{30};
constexpr SRegister s31{31};

// Define DRegisterLane before DRegister so that we can have the operator[] overloading for nice syntax.
struct DRegisterLane {
  uint8_t code;
  uint8_t lane;

  uint8_t d() const { return (code & 0x10) >> 4; }
  uint8_t vd() const { return code & 0xf; }
};

static inline bool operator==(const DRegisterLane lhs, const DRegisterLane rhs) {
  return lhs.code == rhs.code && lhs.lane == rhs.lane;
}

struct DRegister {
  uint8_t code;

  uint8_t d() const { return (code & 0x10) >> 4; }
  uint8_t vd() const { return code & 0xf; }

  const DRegisterLane operator[](std::size_t pos) const {
    return DRegisterLane{code, static_cast<uint8_t>(pos)};
  }
};
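
// For example, d4[1] constructs DRegisterLane{4, 1}, which the by-lane forms
// such as vmla_f32(Qd, Qn, Dm[x]) below accept as their last operand.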

static inline bool operator==(const DRegister lhs, const DRegister rhs) {
  return lhs.code == rhs.code;
}

constexpr DRegister d0{0};
constexpr DRegister d1{1};
constexpr DRegister d2{2};
constexpr DRegister d3{3};
constexpr DRegister d4{4};
constexpr DRegister d5{5};
constexpr DRegister d6{6};
constexpr DRegister d7{7};
constexpr DRegister d8{8};
constexpr DRegister d9{9};
constexpr DRegister d10{10};
constexpr DRegister d11{11};
constexpr DRegister d12{12};
constexpr DRegister d13{13};
constexpr DRegister d14{14};
constexpr DRegister d15{15};
constexpr DRegister d16{16};
constexpr DRegister d17{17};
constexpr DRegister d18{18};
constexpr DRegister d19{19};
constexpr DRegister d20{20};
constexpr DRegister d21{21};
constexpr DRegister d22{22};
constexpr DRegister d23{23};
constexpr DRegister d24{24};
constexpr DRegister d25{25};
constexpr DRegister d26{26};
constexpr DRegister d27{27};
constexpr DRegister d28{28};
constexpr DRegister d29{29};
constexpr DRegister d30{30};
constexpr DRegister d31{31};

struct QRegister {
  uint8_t code;
  // d() and vd() encode the number of the low D register of the pair, i.e. code * 2.
  uint8_t d() const { return (code & 0x8) >> 3; }
  uint8_t vd() const { return (code & 0x7) << 1; }
  DRegister low() const { return DRegister{uint8_t(code * 2)}; }
  DRegister high() const { return DRegister{uint8_t(code * 2 + 1)}; }
};
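
// For example, q9 corresponds to the pair d18/d19: low() == d18, high() == d19,
// and its encoding bits d() == 1, vd() == 2 match d18's own D:Vd split.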

static inline bool operator==(const QRegister lhs, const QRegister rhs) {
  return lhs.code == rhs.code;
}

constexpr QRegister q0{0};
constexpr QRegister q1{1};
constexpr QRegister q2{2};
constexpr QRegister q3{3};
constexpr QRegister q4{4};
constexpr QRegister q5{5};
constexpr QRegister q6{6};
constexpr QRegister q7{7};
constexpr QRegister q8{8};
constexpr QRegister q9{9};
constexpr QRegister q10{10};
constexpr QRegister q11{11};
constexpr QRegister q12{12};
constexpr QRegister q13{13};
constexpr QRegister q14{14};
constexpr QRegister q15{15};

// SIMD register lists are used in a more restrictive way than core register
// lists: only consecutive registers can be used as an instruction operand.
template <typename RegType>
struct ConsecutiveRegisterList {
  // End must be >= start.
  ConsecutiveRegisterList(RegType s, RegType end)
      : start(s),
        length(end.code - s.code + 1) {}
  explicit ConsecutiveRegisterList(RegType s, int len)
      : start(s),
        length(len) {}
  ConsecutiveRegisterList(RegType start)
      : ConsecutiveRegisterList(start, start) {}

  RegType start;
  uint8_t length;
};

// Specific struct for VLD2 and VLD3 register list operand.
struct VLoadStoreRegList {
  VLoadStoreRegList(DRegister reg1, DRegister reg2)
      : reg1(reg1), reg2(reg2) {
    if (reg1.code == reg2.code - 2) {
      double_spaced = true;
    } else {
      double_spaced = false;
    }
  }
  VLoadStoreRegList(DRegister reg1, DRegister reg2, DRegister reg3)
      : reg1(reg1), reg2(reg2), reg3(reg3) {
    if (reg1.code == reg2.code - 2) {
      double_spaced = true;
    } else {
      double_spaced = false;
    }
  }

  DRegister reg1;
  DRegister reg2;
  DRegister reg3;
  bool double_spaced;
};
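
// For example, VLoadStoreRegList(d0, d1) describes the single-spaced list
// {d0, d1}, while VLoadStoreRegList(d0, d2) is treated as double-spaced
// ({d0, d2}).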

using SRegisterList = ConsecutiveRegisterList<SRegister>;
using DRegisterList = ConsecutiveRegisterList<DRegister>;

static inline SRegisterList operator-(const SRegister lhs, const SRegister rhs) {
  return SRegisterList(lhs, rhs);
}

static inline DRegisterList operator-(const DRegister lhs, const DRegister rhs) {
  return DRegisterList(lhs, rhs);
}
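
// For example, d0-d3 yields a DRegisterList starting at d0 with length 4,
// matching the assembly register list {d0-d3}.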

struct QRegisterList {
  QRegisterList(QRegister s) : start(s), length(1) {}
  QRegisterList(QRegister s, QRegister end) : start(s), length(end.code - s.code + 1) {}
  // Explicit conversion to DRegisterList.
  explicit operator DRegisterList() const {
    return DRegisterList({static_cast<uint8_t>(start.code * 2)}, length * 2);
  }

  QRegister start;
  uint8_t length;
};

static inline QRegisterList operator-(const QRegister lhs, const QRegister rhs) {
  return QRegisterList(lhs, rhs);
}
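
// For example, q4-q5 is a QRegisterList of length 2; converting it to
// DRegisterList yields the list starting at d8 with length 4, i.e. {d8-d11}.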

// A8.5 Addressing modes for memory access.
enum class AddressingMode {
  // [<Rn>, <offset>], offset applied to address in Rn.
  kOffset,
  // Pre-indexed not used, so not implemented.
  // [<Rn>], <offset>, address from Rn, offset applied, written back to Rn.
  kPostIndexed,
};

// Memory operands, operands for memory access instructions. See
// "MemOperandHelper mem" for a nicer syntax that is closer to assembly.
class MemOperand {
 public:
  MemOperand(CoreRegister rn, int32_t offset)
      : mode_(AddressingMode::kOffset),
        rn_(rn),
        offset_(offset) {}

  MemOperand(CoreRegister rn, int32_t offset, AddressingMode mode)
      : mode_(mode),
        rn_(rn),
        offset_(offset) {}

  CoreRegister base() const { return rn_; }
  int32_t offset() const { return offset_; }
  AddressingMode mode() const { return mode_; }

  // These are bits used for encoding, named based on the encoding description.
  int32_t u() { return offset_ >= 0; }
  int32_t p() { return mode_ != AddressingMode::kPostIndexed; }
  // Note, kPostIndexed will write back, but doesn't need to set bit w.
  int32_t w() { return 0; }

  // Overload postfix increment to indicate a post-indexed addressing mode for load/stores.
  MemOperand operator++(int) {
    mode_ = AddressingMode::kPostIndexed;
    return *this;
  }

 private:
  AddressingMode mode_;
  CoreRegister rn_;
  int32_t offset_;
};

static inline bool operator==(const MemOperand lhs, const MemOperand rhs) {
  return lhs.mode() == rhs.mode() && lhs.base() == rhs.base() && lhs.offset() == rhs.offset();
}

static inline MemOperand operator,(CoreRegister r, int32_t offset) {
  return MemOperand(r, offset);
}

// Helper struct for syntax sugar that looks like native assembly; see mem.
struct MemOperandHelper {
  const MemOperand operator[](MemOperand op) const { return op; }
  MemOperand operator[](CoreRegister r) const { return MemOperand(r, 0); }
};

// Use "mem" (and its overload of array subscript operator) to get some syntax
// that looks closer to native assembly when accessing memory. For example:
// - ldr(r0, mem[rn, offset]); // offset
// - ldr(r0, mem[rn], offset); // post-indexed
constexpr MemOperandHelper mem;
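
// The postfix increment on a memory operand selects post-indexed addressing
// with writeback to the base register, for example:
// - vld1_32({d0}, mem[r0]++); // post-indexed, r0 is written back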

// Condition codes, which are encoded in the top four bits of an instruction
// and used by the conditional instructions below (e.g. beq, movlo).
enum Condition : uint32_t {
  kEQ = 0x00000000,
  kNE = 0x10000000,
  kCS = 0x20000000,
  kCC = 0x30000000,
  kMI = 0x40000000,
  kPL = 0x50000000,
  kVS = 0x60000000,
  kVC = 0x70000000,
  kHI = 0x80000000,
  kLS = 0x90000000,
  kGE = 0xA0000000,
  kLT = 0xB0000000,
  kGT = 0xC0000000,
  kLE = 0xD0000000,
  kAL = 0xE0000000,
  kHS = kCS,
  kLO = kCC,
};

enum DataSize {
  k8 = 0,
  k16 = 1,
  k32 = 2,
};

// A simple AArch32 assembler.
class Assembler : public AssemblerBase {
 public:
  using AssemblerBase::AssemblerBase;

  void add(CoreRegister rn, CoreRegister rm) { add(rn, rn, rm); }
  void add(CoreRegister rd, CoreRegister rn, CoreRegister rm);
  // Only support uint8_t immediates for now; it simplifies encoding.
  void add(CoreRegister rd, CoreRegister rn, uint8_t imm);
  void adds(CoreRegister rd, CoreRegister rn, uint8_t imm);
  void and_(CoreRegister rd, CoreRegister rn, uint8_t imm);
  void b(Label& l) { b(kAL, l); }
  void beq(Label& l) { b(kEQ, l); }
  void bne(Label& l) { b(kNE, l); }
  void bhi(Label& l) { b(kHI, l); }
  void bhs(Label& l) { b(kHS, l); }
  void blo(Label& l) { b(kLO, l); }
  void bic(CoreRegister rd, CoreRegister rn, uint8_t imm);
  void bx(CoreRegister rm);
  // CMP supports a subset of uint32_t immediates, see "A5.2.4 Modified immediate
  // constants in ARM instructions"; for simplicity we start with uint8_t, which
  // is fully representable using a "rotation" of 0.
  void cmp(CoreRegister rn, uint8_t imm);
  void cmp(CoreRegister rn, CoreRegister rm);
  void ldr(CoreRegister rt, MemOperand operand, int32_t offset);
  void ldr(CoreRegister rt, MemOperand operand);
  // LDRD <Rt>, <Rt2>, [<Rn>{, #+/-<imm>}].
  void ldrd(CoreRegister rt, CoreRegister rt2, MemOperand op);
  void mov(CoreRegister rd, CoreRegister rm);
  void moveq(CoreRegister rd, CoreRegister rm) { mov(kEQ, rd, rm); }
  void movlo(CoreRegister rd, CoreRegister rm) { mov(kLO, rd, rm); }
  void movls(CoreRegister rd, CoreRegister rm) { mov(kLS, rd, rm); }
  void nop();
  void pld(MemOperand operand);
  void pop(CoreRegisterList regs);
  void push(CoreRegisterList regs);
  void str(CoreRegister rt, MemOperand op);
  void sub(CoreRegister rd, CoreRegister rn, uint8_t imm);
  void sub(CoreRegister rd, CoreRegister rn, CoreRegister rm);
  // Only support uint8_t immediates for now; it simplifies encoding.
  void subs(CoreRegister rd, CoreRegister rn, uint8_t imm);
  void tst(CoreRegister rn, uint8_t imm);

  // SIMD instructions.
  void vabs_f32(QRegister qd, QRegister qm);
  void vadd_f32(QRegister qd, QRegister qn, QRegister qm);
  void vcmpe_f32(SRegister sd, SRegister sm);
  void vcvt_f32_s32(QRegister qd, QRegister qm);
  void vcvt_s32_f32(QRegister qd, QRegister qm);
  void vcvtn_s32_f32(QRegister qd, QRegister qm);
  void vdup_8(QRegister qd, DRegisterLane dm) { vdup(k8, qd, dm); }
  void vdup_16(QRegister qd, DRegisterLane dm) { vdup(k16, qd, dm); }
  void vdup_32(QRegister qd, DRegisterLane dm) { vdup(k32, qd, dm); }
  void vext_8(QRegister qd, QRegister qn, QRegister qm, uint8_t imm4);
  // VLD1.8 <list>, [<Rn>]{!} (multiple single elements).
  void vld1_8(DRegisterList regs, MemOperand op) { vld1(k8, regs, op); }
  void vld1_8(DRegisterList regs, MemOperand op, CoreRegister rm) { vld1(k8, regs, op, rm); }
  void vld1_8(QRegisterList regs, MemOperand op) { vld1(k8, static_cast<DRegisterList>(regs), op); }
  // VLD1.32 <list>, [<Rn>]{!} (multiple single elements).
  void vld1_32(DRegisterList regs, MemOperand op) { vld1(k32, regs, op); }
  void vld1_32(QRegisterList regs, MemOperand op) { vld1(k32, static_cast<DRegisterList>(regs), op); }
  // VLD1.32 <list>, [<Rn>]{!} (single element to one lane).
  void vld1_32(DRegisterLane dd, MemOperand op);
  // VLD1.32 <list>, [<Rn>]{!} (single element to all lanes).
  // We cannot differentiate the register list in C++ syntax, so use an instruction name similar to AArch64 LD1R.
  void vld1r_32(DRegisterList regs, MemOperand op);
  void vld2r_32(VLoadStoreRegList regs, MemOperand op);
  void vld3r_32(VLoadStoreRegList regs, MemOperand op);
  // VLDM <Rn>{!}, <list> (IA).
  void vldm(MemOperand rn, SRegisterList regs);
  void vldm(MemOperand rn, DRegisterList regs);
  void vldr(SRegister sd, MemOperand op);
  void vldr(DRegister dd, MemOperand op);
  void vmax_f32(QRegister qd, QRegister qn, QRegister qm);
  void vmax_s8(QRegister qd, QRegister qn, QRegister qm);
  void vmin_f32(QRegister qd, QRegister qn, QRegister qm);
  void vmin_s8(QRegister qd, QRegister qn, QRegister qm);
  // VMLA.F32 <Sd>, <Sn>, <Sm>
  void vmla_f32(SRegister sd, SRegister sn, SRegister sm);
  // VMLA.F32 <Qd>, <Qn>, <Dm[x]>
  void vmla_f32(QRegister qd, QRegister qn, DRegisterLane dm);
  // VMLAL.S16 <Qd>, <Dn>, <Dm[x]>
  void vmlal_s16(QRegister qd, DRegister dn, DRegisterLane dm);
  // VMOV.F32 <Qd>, #<imm>; encoding A1
  void vmov(QRegister qd, uint8_t imm);
  // VMOV.F32 <Sd>, <Sm>; encoding A2.
  void vmov(SRegister sd, SRegister sm);
  // VMOV <Dm>, <Rt>, <Rt2>; encoding A1.
  void vmov(DRegister dm, CoreRegister rt, CoreRegister rt2);
  // VMOV <Dd>, <Dm>; encoding A1.
  void vmov(DRegister dd, DRegister dm);
  // VMOV <Qd>, <Qm>; encoding A1.
  void vmov(QRegister qd, QRegister qm);
  // VMOV_F32 <Sd>, <Sm>
  void vmov_f32(SRegister sd, SRegister sm) { vmov_f32(kAL, sd, sm); }
  void vmovpl_f32(SRegister sd, SRegister sm) { vmov_f32(kPL, sd, sm); }
  void vmovmi_f32(SRegister sd, SRegister sm) { vmov_f32(kMI, sd, sm); }
  // VMOV_F64 <Dd>, <Dm>
  void vmov_f64(DRegister dd, DRegister dm);
  // VMOVL.S8 <Qd>, <Dm>
  void vmovl_s8(QRegister qd, DRegister dm);
  void vmrs(CoreRegister rt, SpecialFPRegister spec_reg);
  void vmul_f32(QRegister qd, QRegister qn, QRegister qm);
  void vneg_f32(QRegister qd, QRegister qm);
  void vpop(DRegisterList regs);
  void vpush(DRegisterList regs);
  void vpush(SRegisterList regs);
  void vqadd_s16(QRegister qd, QRegister qn, QRegister qm);
  void vqdmulh_s32(QRegister qd, QRegister qn, DRegisterLane dm);
  void vqmovn_s16(DRegister dd, QRegister qm);
  void vqmovn_s32(DRegister dd, QRegister qm);
  void vqshl_s32(QRegister qd, QRegister qm, QRegister qn);
  void vrshl_s32(QRegister qd, QRegister qm, QRegister qn);
  void vsdot_s8(QRegister qd, QRegister qn, DRegisterLane dm);
  // VST1.8 <list>, [<Rn>]{!} (multiple single elements).
  void vst1_8(DRegisterList regs, MemOperand op) { vst1(k8, regs, op); }
  // VST1.8 <list>, [<Rn>]{!}, <Rm> (multiple single elements).
  void vst1_8(DRegisterList regs, MemOperand op, CoreRegister rm) { vst1(k8, regs, op, rm); }
  // VST1.8 <list>, [<Rn>]{!} (single element from one lane).
  void vst1_8(DRegisterLane dd, MemOperand op) { vst1(k8, dd, op); }
  // VST1.16 <list>, [<Rn>]{!} (multiple single elements).
  void vst1_16(DRegisterList regs, MemOperand op) { vst1(k16, regs, op); }
  // VST1.16 <list>, [<Rn>]{!}, <Rm> (multiple single elements).
  void vst1_16(DRegisterList regs, MemOperand op, CoreRegister rm) { vst1(k16, regs, op, rm); }
  // VST1.16 <list>, [<Rn>]{!} (single element from one lane).
  void vst1_16(DRegisterLane dd, MemOperand op) { vst1(k16, dd, op); }
  // VST1.32 <list>, [<Rn>]{!} (multiple single elements).
  void vst1_32(DRegisterList regs, MemOperand op) { vst1(k32, regs, op); }
  // VST1.32 <list>, [<Rn>]{!}, <Rm> (multiple single elements).
  void vst1_32(DRegisterList regs, MemOperand op, CoreRegister rm) { vst1(k32, regs, op, rm); }
  // VST1.32 <list>, [<Rn>]{!} (single element from one lane).
  void vst1_32(DRegisterLane dd, MemOperand op) { vst1(k32, dd, op); }
  // VSTM <Rn>{!}, <list>, consecutive 64-bit registers.
  void vstm(MemOperand rn, DRegisterList regs);
  // VSTR <Sd>, [Rn{, #+/-<imm>}], store single extension register to memory.
  void vstr(SRegister rn, MemOperand op);

  // Binds Label l to the current location in the code buffer.
  void bind(Label& l);
  // Align the cursor to the specified number of bytes; `n` must be a power of 2.
  void align(uint8_t n);

 private:
  void mov(Condition c, CoreRegister rd, CoreRegister rm);
  void b(Condition c, Label& l);
  void vdup(DataSize size, QRegister qd, DRegisterLane dm);
  void vmov_f32(Condition c, SRegister sd, SRegister sm);
  void vld1(DataSize size, DRegisterList regs, MemOperand op);
  void vld1(DataSize size, DRegisterList regs, MemOperand op, CoreRegister rm);
  void vst1(DataSize size, DRegisterList regs, MemOperand op);
  void vst1(DataSize size, DRegisterList regs, MemOperand op, CoreRegister rm);
  void vst1(DataSize size, DRegisterLane dd, MemOperand op);
};
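
// A minimal usage sketch (illustrative only, assuming AssemblerBase is
// constructed over a code buffer as declared in <xnnpack/assembler.h>):
//
//   Assembler a(&code_buffer);  // hypothetical buffer; constructor inherited from AssemblerBase
//   Label loop;
//   a.push({r4, r5, lr});
//   a.bind(loop);
//   a.subs(r2, r2, 1);
//   a.bne(loop);
//   a.pop({r4, r5, lr});
//   a.bx(lr);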

}  // namespace aarch32
}  // namespace xnnpack