• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2021 Google LLC
2 //
3 // This source code is licensed under the BSD-style license found in the
4 // LICENSE file in the root directory of this source tree.
5 
6 #include "xnnpack/aarch32-assembler.h"
7 #include "xnnpack/assembler.h"
8 #include "xnnpack/math.h"
9 
10 #include <cmath>
11 #include <cstddef>
12 
13 namespace xnnpack {
14 namespace aarch32 {
// Largest offset magnitude accepted for vldr/vstr: the instruction holds an
// imm8, and the offset is shifted right by 2 when encoded.
constexpr int32_t kUint10Max = (1 << 10) - 1;
// Largest offset magnitude accepted for ldr/str: the instruction holds an
// imm12, with a separate bit for the sign.
constexpr int32_t kUint12Max = (1 << 12) - 1;
19 
// PC register contains current address of instruction + 8 (2 instructions).
constexpr ptrdiff_t kPCDelta = 8;
// Inclusive bounds of a signed 24-bit value, used for checking branch offsets.
constexpr ptrdiff_t kInt24Max = 8388607;
constexpr ptrdiff_t kInt24Min = -8388608;

// Returns true if |offset| fits in a signed 24-bit field, i.e. lies in
// [kInt24Min, kInt24Max]. Both end points are themselves representable in 24
// bits, so the comparison is inclusive.
bool branch_offset_valid(ptrdiff_t offset) {
  return offset >= kInt24Min && offset <= kInt24Max;
}
30 
invalid_register_list(DRegisterList regs)31 bool invalid_register_list(DRegisterList regs) {
32   return regs.length == 0 || regs.length > 16 || regs.start.code + regs.length > 32;
33 }
34 
invalid_register_list(SRegisterList regs)35 bool invalid_register_list(SRegisterList regs) {
36   return regs.length == 0 || regs.start.code + regs.length > 32;
37 }
38 
encode(SRegister r,uint32_t single_bit_pos,uint32_t four_bits_pos)39 uint32_t encode(SRegister r, uint32_t single_bit_pos, uint32_t four_bits_pos) {
40   return r.d() << single_bit_pos | r.vd() << four_bits_pos;
41 }
42 
encode(DRegister r,uint32_t single_bit_pos,uint32_t four_bits_pos)43 uint32_t encode(DRegister r, uint32_t single_bit_pos, uint32_t four_bits_pos) {
44   return r.d() << single_bit_pos | r.vd() << four_bits_pos;
45 }
46 
encode(DRegisterLane r,uint32_t single_bit_pos,uint32_t four_bits_pos)47 uint32_t encode(DRegisterLane r, uint32_t single_bit_pos, uint32_t four_bits_pos) {
48   return r.d() << single_bit_pos | r.vd() << four_bits_pos;
49 }
50 
encode(QRegister r,uint32_t single_bit_pos,uint32_t four_bits_pos)51 uint32_t encode(QRegister r, uint32_t single_bit_pos, uint32_t four_bits_pos) {
52   return r.d() << single_bit_pos | r.vd() << four_bits_pos;
53 }
54 
encode(SRegisterList regs,uint32_t single_bit_pos,uint32_t four_bits_pos)55 uint32_t encode(SRegisterList regs, uint32_t single_bit_pos, uint32_t four_bits_pos) {
56   const SRegister r = regs.start;
57   return r.d() << single_bit_pos | r.vd() << four_bits_pos | regs.length;
58 }
59 
encode(DRegisterList regs,uint32_t single_bit_pos,uint32_t four_bits_pos)60 uint32_t encode(DRegisterList regs, uint32_t single_bit_pos, uint32_t four_bits_pos) {
61   const DRegister r = regs.start;
62   return r.d() << single_bit_pos | r.vd() << four_bits_pos | regs.length * 2;
63 }
64 
encode_mem_puw(MemOperand op)65 uint32_t encode_mem_puw(MemOperand op) {
66   return op.p() << 24 | op.u() << 23 | op.w() << 21 | op.base().code << 16;
67 }
68 
69 // Return value of 0 is invalid, indicates error.
encode_regs_length_to_type(DRegisterList regs)70 uint32_t encode_regs_length_to_type(DRegisterList regs) {
71   switch (regs.length) {
72     case 1:
73       return 0x7;
74     case 2:
75       return 0xA;
76     case 3:
77       return 0x6;
78     case 4:
79       return 0x2;
80   }
81   return 0;
82 }
83 
add(CoreRegister rd,CoreRegister rn,CoreRegister rm)84 void Assembler::add(CoreRegister rd, CoreRegister rn, CoreRegister rm) {
85   emit32(kAL | 0x8 << 20 | rn.code << 16 | rd.code << 12 | rm.code);
86 }
87 
add(CoreRegister rd,CoreRegister rn,uint8_t imm)88 void Assembler::add(CoreRegister rd, CoreRegister rn, uint8_t imm) {
89   // Rotation = 0, since imm is limited to 8 bits and fits in encoding.
90   emit32(kAL | 0x28 << 20 | rn.code << 16 | rd.code << 12 | imm);
91 }
92 
adds(CoreRegister rd,CoreRegister rn,uint8_t imm)93 void Assembler::adds(CoreRegister rd, CoreRegister rn, uint8_t imm) {
94   // Rotation = 0, since imm is limited to 8 bits and fits in encoding.
95   emit32(kAL | 0x29 << 20 | rn.code << 16 | rd.code << 12 | imm);
96 }
97 
and_(CoreRegister rd,CoreRegister rn,uint8_t imm)98 void Assembler::and_(CoreRegister rd, CoreRegister rn, uint8_t imm) {
99   // Rotation = 0, since imm is limited to 8 bits and fits in encoding.
100   emit32(kAL | 1 << 25 | rn.code << 16 | rd.code << 12 | imm);
101 }
102 
b(Condition c,Label & l)103 void Assembler::b(Condition c, Label& l) {
104   if (l.bound) {
105     // Offset is relative to after this b instruction + kPCDelta.
106     const ptrdiff_t offset = l.offset - cursor_ - kPCDelta;
107     if (!branch_offset_valid(offset)) {
108       error_ = Error::kLabelOffsetOutOfBounds;
109       return;
110     }
111 
112     // No need to shift by 2 since our offset is already in terms of uint32_t.
113     emit32(c | 0xA << 24 | ((offset >> kInstructionSizeInBytesLog2) & 0x00FFFFFF));
114   } else {
115     if (!l.add_use(cursor_)) {
116       error_ = Error::kLabelHasTooManyUsers;
117       return;
118     }
119     // Emit 0 offset first, will patch it up when label is bound later.
120     emit32(c | 0xA << 24);
121   }
122 }
123 
bind(Label & l)124 void Assembler::bind(Label& l) {
125   if (l.bound) {
126     error_ = Error::kLabelAlreadyBound;
127     return;
128   }
129 
130   l.bound = true;
131   l.offset = cursor_;
132 
133   // Patch all users.
134   for (size_t i = 0; i < l.num_users; i++) {
135     byte* user = l.users[i];
136     const ptrdiff_t offset = l.offset - user - kPCDelta;
137     uint32_t* instr = reinterpret_cast<uint32_t*>(user);
138 
139     if (!branch_offset_valid(offset)) {
140       error_ = Error::kLabelOffsetOutOfBounds;
141       return;
142     }
143 
144     *instr |= (offset >> kInstructionSizeInBytesLog2) & 0x00FFFFFF;
145   }
146 }
147 
bic(CoreRegister rd,CoreRegister rn,uint8_t imm)148 void Assembler::bic(CoreRegister rd, CoreRegister rn, uint8_t imm) {
149   emit32(kAL | 0x03C00000 | rn.code << 16 | rd.code << 12 | imm);
150 }
151 
bx(CoreRegister rm)152 void Assembler::bx(CoreRegister rm) {
153   emit32(kAL | 0x12fff10 | rm.code);
154 }
155 
cmp(CoreRegister rn,uint8_t imm)156 void Assembler::cmp(CoreRegister rn, uint8_t imm) {
157   emit32(kAL | 0x35 << 20 | rn.code << 16 | imm);
158 }
159 
cmp(CoreRegister rn,CoreRegister rm)160 void Assembler::cmp(CoreRegister rn, CoreRegister rm) {
161   emit32(kAL | 0x01500000 | rn.code << 16 | rm.code);
162 }
163 
ldr(CoreRegister rt,MemOperand op,int32_t offset)164 void Assembler::ldr(CoreRegister rt, MemOperand op, int32_t offset) {
165   ldr(rt, MemOperand(op.base(), offset, AddressingMode::kPostIndexed));
166 }
167 
ldr(CoreRegister rt,MemOperand op)168 void Assembler::ldr(CoreRegister rt, MemOperand op) {
169   const int32_t offset = op.offset();
170   if (std::abs(offset) > kUint12Max) {
171     error_ = Error::kInvalidOperand;
172     return;
173   }
174 
175   emit32(kAL | 0x41 << 20 | encode_mem_puw(op) | rt.code << 12 | offset);
176 }
177 
ldrd(CoreRegister rt,CoreRegister rt2,MemOperand op)178 void Assembler::ldrd(CoreRegister rt, CoreRegister rt2, MemOperand op) {
179   const int32_t offset = op.offset();
180   if ((std::abs(op.offset()) > UINT8_MAX) || (rt.code + 1 != rt2.code)) {
181     error_ = Error::kInvalidOperand;
182     return;
183   }
184   const uint32_t offset_top = (offset & 0xF0) << 4;
185   const uint32_t offset_bot = (offset & 0xF);
186 
187   emit32(kAL | 0x004000D0 | encode_mem_puw(op) | rt.code << 12 | offset_top | offset_bot);
188 }
189 
mov(CoreRegister rd,CoreRegister rm)190 void Assembler::mov(CoreRegister rd, CoreRegister rm) {
191   mov(kAL, rd, rm);
192 }
193 
mov(Condition c,CoreRegister Rd,CoreRegister Rm)194 void Assembler::mov(Condition c, CoreRegister Rd, CoreRegister Rm) {
195   emit32(c | 0x1A << 20 | Rd.code << 12 | Rm.code);
196 }
197 
nop()198 void Assembler::nop() {
199   emit32(kAL | 0x0320F000);
200 }
201 
pld(MemOperand op)202 void Assembler::pld(MemOperand op) {
203   emit32(0xF550F000 | op.u() << 23 | op.base().code << 16 | op.offset());
204 }
205 
pop(CoreRegisterList regs)206 void Assembler::pop(CoreRegisterList regs) {
207   if (!regs.has_more_than_one_register()) {
208     // TODO(zhin): there is a different valid encoding for single register.
209     error_ = Error::kInvalidOperand;
210     return;
211   }
212 
213   emit32(kAL | 0x8BD << 16 | regs.list);
214 }
215 
push(CoreRegisterList regs)216 void Assembler::push(CoreRegisterList regs) {
217   if (!regs.has_more_than_one_register()) {
218     // TODO(zhin): there is a different valid encoding for single register.
219     error_ = Error::kInvalidOperand;
220     return;
221   }
222 
223   emit32(kAL | 0x92D << 16 | regs.list);
224 }
225 
str(CoreRegister rt,MemOperand op)226 void Assembler::str(CoreRegister rt, MemOperand op) {
227   const int32_t offset = op.offset();
228   if (std::abs(offset) > kUint12Max) {
229     error_ = Error::kInvalidOperand;
230     return;
231   }
232   emit32(kAL | 1 << 26 | encode_mem_puw(op) | rt.code << 12 | offset);
233 }
234 
sub(CoreRegister rd,CoreRegister rn,uint8_t imm)235 void Assembler::sub(CoreRegister rd, CoreRegister rn, uint8_t imm) {
236   emit32(kAL | 0x24 << 20 | rn.code << 16 | rd.code << 12 | imm);
237 }
238 
sub(CoreRegister rd,CoreRegister rn,CoreRegister rm)239 void Assembler::sub(CoreRegister rd, CoreRegister rn, CoreRegister rm) {
240   emit32(kAL | 0x4 << 20 | rn.code << 16 | rd.code << 12 | rm.code);
241 }
242 
subs(CoreRegister rd,CoreRegister rn,uint8_t imm)243 void Assembler::subs(CoreRegister rd, CoreRegister rn, uint8_t imm) {
244   // Rotation = 0, since imm is limited to 8 bits and fits in encoding.
245   emit32(kAL | 0x25 << 20 | rn.code << 16 | rd.code << 12 | imm);
246 }
247 
tst(CoreRegister rn,uint8_t imm)248 void Assembler::tst(CoreRegister rn, uint8_t imm) {
249   // Rotation = 0, since imm is limited to 8 bits and fits in encoding.
250   emit32(kAL | 0x31 << 20 | rn.code << 16 | imm);
251 }
252 
vcmpe_f32(SRegister sd,SRegister sm)253 void Assembler::vcmpe_f32(SRegister sd, SRegister sm) {
254   emit32(kAL | 0x0EB40AC0 | encode(sd, 22, 12) | encode(sm, 5, 0));
255 }
256 
vcvt_f32_s32(QRegister qd,QRegister qm)257 void Assembler::vcvt_f32_s32(QRegister qd, QRegister qm) {
258   emit32(0xF3BB0640 | encode(qd, 22, 12) | encode(qm, 5, 0));
259 }
260 
vcvt_s32_f32(QRegister qd,QRegister qm)261 void Assembler::vcvt_s32_f32(QRegister qd, QRegister qm) {
262   emit32(0xF3BB0740 | encode(qd, 22, 12) | encode(qm, 5, 0));
263 }
264 
vcvtn_s32_f32(QRegister qd,QRegister qm)265 void Assembler::vcvtn_s32_f32(QRegister qd, QRegister qm) {
266   emit32(0xF3BB0140 | encode(qd, 22, 12) | encode(qm, 5, 0));
267 }
268 
vdup(DataSize size,QRegister qd,DRegisterLane dm)269 void Assembler::vdup(DataSize size, QRegister qd, DRegisterLane dm) {
270   uint8_t imm4 = 0;
271   switch (size) {
272     case k8:
273       if (dm.lane > 7) {
274         error_ = Error::kInvalidLaneIndex;
275         return;
276       }
277       imm4 = 1 | ((dm.lane & 0x7) << 1);
278       break;
279     case k16:
280       if (dm.lane > 3) {
281         error_ = Error::kInvalidLaneIndex;
282         return;
283       }
284       imm4 = 2 | ((dm.lane & 0x3) << 2);
285       break;
286     case k32:
287       if (dm.lane > 1) {
288         error_ = Error::kInvalidLaneIndex;
289         return;
290       }
291       imm4 = 4 | ((dm.lane & 0x1) << 3);
292       break;
293   }
294   emit32(0xF3B00C40 | imm4 << 16 | encode(qd, 22, 12) | encode(dm, 5, 0));
295 }
296 
vext_8(QRegister qd,QRegister qn,QRegister qm,uint8_t imm4)297 void Assembler::vext_8(QRegister qd, QRegister qn, QRegister qm, uint8_t imm4) {
298   if (imm4 > 15) {
299     error_ = Error::kInvalidOperand;
300     return;
301   }
302   emit32(0xF2B00040 | encode(qd, 22, 12) | encode(qn, 7, 16) | encode(qm, 5, 0) | imm4 << 8);
303 }
304 
vld1(DataSize size,DRegisterList regs,MemOperand op)305 void Assembler::vld1(DataSize size, DRegisterList regs, MemOperand op) {
306   const uint8_t rm = op.mode() == AddressingMode::kPostIndexed ? 0xD : 0xF;
307   vld1(size, regs, op, CoreRegister{rm});
308 }
309 
vld1(DataSize size,DRegisterList regs,MemOperand op,CoreRegister rm)310 void Assembler::vld1(DataSize size, DRegisterList regs, MemOperand op, CoreRegister rm) {
311   const uint8_t type = encode_regs_length_to_type(regs);
312   if (!type) {
313     error_ = Error::kInvalidRegisterListLength;
314     return;
315   }
316 
317   emit32(0xF4200000 | encode(regs.start, 22, 12) | op.base().code << 16 | type << 8 | size << 6 | rm.code);
318 }
319 
vld1_32(DRegisterLane dd,MemOperand op)320 void Assembler::vld1_32(DRegisterLane dd, MemOperand op) {
321   if (dd.lane > 1) {
322     error_ = Error::kInvalidLaneIndex;
323     return;
324   }
325   const uint32_t rm = op.mode() == AddressingMode::kPostIndexed ? 0xD : 0xF;
326   emit32(kAL | 0xF4A00800 | dd.lane << 7 | encode(dd, 22, 12) | op.base().code << 16 | rm);
327 }
328 
vld1r_32(DRegisterList regs,MemOperand op)329 void Assembler::vld1r_32(DRegisterList regs, MemOperand op) {
330   if ((op.mode() == AddressingMode::kOffset && op.offset() != 0) || regs.length > 2) {
331     error_ = Error::kInvalidOperand;
332     return;
333   }
334 
335   const uint32_t rm = op.mode() == AddressingMode::kPostIndexed ? 0xD : 0xF;
336   emit32(0xF4A00C80 | encode(regs.start, 22, 12) | op.base().code << 16 | (regs.length - 1) << 5 | rm);
337 }
338 
vldm(MemOperand rn,SRegisterList regs)339 void Assembler::vldm(MemOperand rn, SRegisterList regs) {
340   if (invalid_register_list(regs)) {
341     error_ = Error::kInvalidRegisterListLength;
342     return;
343   }
344   uint32_t w = (rn.mode() == AddressingMode::kOffset ? 0 : 1) << 21;
345   emit32(kAL | 0x0C900A00 | w | rn.base().code << 16 | encode(regs, 22, 12));
346 }
347 
vldm(MemOperand rn,DRegisterList regs)348 void Assembler::vldm(MemOperand rn, DRegisterList regs) {
349   if (invalid_register_list(regs)) {
350     error_ = Error::kInvalidRegisterListLength;
351     return;
352   }
353   uint32_t w = (rn.mode() == AddressingMode::kOffset ? 0 : 1) << 21;
354   emit32(kAL | 0x0C900B00 | w | rn.base().code << 16 | encode(regs, 22, 12));
355 }
356 
vldr(SRegister sd,MemOperand op)357 void Assembler::vldr(SRegister sd, MemOperand op) {
358   const uint32_t offset = std::abs(op.offset());
359   if (op.mode() != AddressingMode::kOffset || offset > kUint10Max || offset % 4 != 0) {
360     error_ = Error::kInvalidOperand;
361     return;
362   }
363 
364   emit32(kAL | 0x0D100A00 | op.u() << 23 | encode(sd, 22, 12) | op.base().code << 16 | offset >> 2);
365 }
366 
vldr(DRegister dd,MemOperand op)367 void Assembler::vldr(DRegister dd, MemOperand op) {
368   const uint32_t offset = std::abs(op.offset());
369   if (op.mode() != AddressingMode::kOffset || offset > kUint10Max || offset % 4 != 0) {
370     error_ = Error::kInvalidOperand;
371     return;
372   }
373 
374   emit32(kAL | 0x0D100B00 | op.u() << 23 | encode(dd, 22, 12) | op.base().code << 16 | offset >> 2);
375 }
376 
vmax_f32(QRegister qd,QRegister qn,QRegister qm)377 void Assembler::vmax_f32(QRegister qd, QRegister qn, QRegister qm) {
378   emit32(0xF2000F40 | encode(qd, 22, 12) | encode(qn, 7, 16) | encode(qm, 5, 0));
379 }
380 
vmax_s8(QRegister qd,QRegister qn,QRegister qm)381 void Assembler::vmax_s8(QRegister qd, QRegister qn, QRegister qm) {
382  emit32(0xF2000640 | encode(qd, 22, 12) | encode(qn, 7, 16) | encode(qm, 5, 0));
383 }
384 
vmin_f32(QRegister qd,QRegister qn,QRegister qm)385 void Assembler::vmin_f32(QRegister qd, QRegister qn, QRegister qm) {
386   emit32(0xF2200F40 | encode(qd, 22, 12) | encode(qn, 7, 16) | encode(qm, 5, 0));
387 }
388 
vmin_s8(QRegister qd,QRegister qn,QRegister qm)389 void Assembler::vmin_s8(QRegister qd, QRegister qn, QRegister qm) {
390  emit32(0xF2000650 | encode(qd, 22, 12) | encode(qn, 7, 16) | encode(qm, 5, 0));
391 }
392 
vmla_f32(SRegister sd,SRegister sn,SRegister sm)393 void Assembler::vmla_f32(SRegister sd, SRegister sn, SRegister sm) {
394   emit32(kAL | 0x0E000A00 | encode(sd, 22, 12) | encode (sn, 7, 16) | encode(sm, 5, 0));
395 }
396 
vmla_f32(QRegister qd,QRegister qn,DRegisterLane dm)397 void Assembler::vmla_f32(QRegister qd, QRegister qn, DRegisterLane dm) {
398   if (dm.lane > 1) {
399     error_ = Error::kInvalidLaneIndex;
400     return;
401   }
402   emit32(0xF3A00140 | encode(qd, 22, 12) | encode(qn, 7, 16) | dm.lane << 5 | dm.code);
403 }
404 
vmlal_s16(QRegister qd,DRegister dn,DRegisterLane dm)405 void Assembler::vmlal_s16(QRegister qd, DRegister dn, DRegisterLane dm) {
406   if (dm.lane > 3) {
407     error_ = Error::kInvalidLaneIndex;
408     return;
409   }
410   if (dm.code > 7) {
411     error_ = Error::kInvalidOperand;
412     return;
413   }
414 
415   uint8_t lane_top = dm.lane >> 1;
416   uint8_t lane_bot = dm.lane & 1;
417   emit32(0xF2900240 | encode(qd, 22, 12) | encode(dn, 7, 16) | lane_top << 5 | lane_bot << 3 | dm.code);
418 }
419 
vmov(SRegister sd,SRegister sm)420 void Assembler::vmov(SRegister sd, SRegister sm) {
421   emit32(kAL | 0x0EB00A40 | encode(sd, 22, 12) | encode(sm, 5, 0));
422 }
423 
vmov(DRegister dm,CoreRegister rt,CoreRegister rt2)424 void Assembler::vmov(DRegister dm, CoreRegister rt, CoreRegister rt2) {
425   emit32(kAL | 0x0C400B10 | rt2.code << 16 | rt.code << 12 | encode(dm, 5, 0));
426 }
427 
vmov(DRegister dd,DRegister dm)428 void Assembler::vmov(DRegister dd, DRegister dm) {
429   emit32(0xF2600110 | encode(dd, 22, 12) | encode(dm, 7, 16) | encode(dm, 5, 0));
430 }
431 
vmov(QRegister qd,QRegister qm)432 void Assembler::vmov(QRegister qd, QRegister qm) {
433   emit32(0xF2200150 | encode(qd, 22, 12) | encode(qm, 7, 16) | encode(qm, 5, 0));
434 }
435 
vmov_f32(Condition c,SRegister sd,SRegister sm)436 void Assembler::vmov_f32(Condition c, SRegister sd, SRegister sm) {
437   emit32(c | 0x0EB00A40 | encode(sd, 22, 12) | encode(sm, 5, 0));
438 }
439 
vmov_f64(DRegister dd,DRegister dm)440 void Assembler::vmov_f64(DRegister dd, DRegister dm) {
441   emit32(kAL | 0x0EB00B40 | encode(dd, 22, 12) | encode(dm, 5, 0));
442 }
443 
vmovl_s8(QRegister qd,DRegister dm)444 void Assembler::vmovl_s8(QRegister qd, DRegister dm) {
445   emit32(0xF2880A10 | encode(qd, 22, 12) | encode(dm, 5, 0));
446 }
447 
vmrs(CoreRegister rt,SpecialFPRegister spec_reg)448 void Assembler::vmrs(CoreRegister rt, SpecialFPRegister spec_reg) {
449   emit32(kAL | 0x0EF00A10 | static_cast<uint32_t>(spec_reg) << 16 | rt.code << 12);
450 }
451 
vmul_f32(QRegister qd,QRegister qn,QRegister qm)452 void Assembler::vmul_f32(QRegister qd, QRegister qn, QRegister qm) {
453   emit32(0xF3000D50 | encode(qd, 22, 12) | encode(qn, 7, 16) | encode(qm, 5, 0));
454 }
455 
vpop(DRegisterList regs)456 void Assembler::vpop(DRegisterList regs) {
457   if (invalid_register_list(regs)) {
458     error_ = Error::kInvalidRegisterListLength;
459     return;
460   }
461   emit32(kAL | encode(regs, 22, 12) | 0xCBD << 16 | 0xB << 8);
462 }
463 
vpush(DRegisterList regs)464 void Assembler::vpush(DRegisterList regs) {
465   if (invalid_register_list(regs)) {
466     error_ = Error::kInvalidRegisterListLength;
467     return;
468   }
469   emit32(kAL | encode(regs, 22, 12) | 0xD2D << 16 | 0xB << 8);
470 }
471 
vpush(SRegisterList regs)472 void Assembler::vpush(SRegisterList regs) {
473   if (invalid_register_list(regs)) {
474     error_ = Error::kInvalidRegisterListLength;
475     return;
476   }
477   emit32(kAL | encode(regs, 22, 12) | 0xD2D << 16 | 0xA << 8);
478 }
479 
vqadd_s16(QRegister qd,QRegister qn,QRegister qm)480 void Assembler::vqadd_s16(QRegister qd, QRegister qn, QRegister qm) {
481   emit32(0xF2100050 | encode(qd, 22, 12) | encode(qn, 7, 16) | encode(qm, 5, 0));
482 }
483 
vqdmulh_s32(QRegister qd,QRegister qn,DRegisterLane dm)484 void Assembler::vqdmulh_s32(QRegister qd, QRegister qn, DRegisterLane dm) {
485   if (dm.code > 15) {
486     error_ = Error::kInvalidOperand;
487     return;
488   }
489   if (dm.lane > 1) {
490     error_ = Error::kInvalidLaneIndex;
491     return;
492   }
493   emit32(0xF3A00C40 | encode(qd, 22, 12) | encode(qn, 7, 16) | dm.lane << 5 | dm.code);
494 }
495 
vqmovn_s16(DRegister dd,QRegister qm)496 void Assembler::vqmovn_s16(DRegister dd, QRegister qm) {
497   emit32(0xF3B20280 | encode(dd, 22, 12) | encode(qm, 5, 0));
498 }
499 
vqmovn_s32(DRegister dd,QRegister qm)500 void Assembler::vqmovn_s32(DRegister dd, QRegister qm) {
501   emit32(0xF3B60280 | encode(dd, 22, 12) | encode(qm, 5, 0));
502 }
503 
vqshl_s32(QRegister qd,QRegister qm,QRegister qn)504 void Assembler::vqshl_s32(QRegister qd, QRegister qm, QRegister qn) {
505   emit32(0xF2200450 | encode(qd, 22, 12) | encode(qm, 5, 0) | encode(qn, 7, 16));
506 }
507 
vrshl_s32(QRegister qd,QRegister qm,QRegister qn)508 void Assembler::vrshl_s32(QRegister qd, QRegister qm, QRegister qn) {
509   emit32(0xF2200540 | encode(qd, 22, 12) | encode(qm, 5, 0) | encode(qn, 7, 16));
510 }
511 
vsdot_s8(QRegister qd,QRegister qn,DRegisterLane dm)512 void Assembler::vsdot_s8(QRegister qd, QRegister qn, DRegisterLane dm) {
513   if (dm.lane > 1) {
514     error_ = Error::kInvalidLaneIndex;
515     return;
516   }
517   emit32(0xFE200D40 | encode(qd, 22, 12) | encode(qn, 7, 16) | dm.lane << 5 | dm.code);
518 }
519 
vst1(DataSize size,DRegisterList regs,MemOperand op)520 void Assembler::vst1(DataSize size, DRegisterList regs, MemOperand op) {
521   const uint8_t type = encode_regs_length_to_type(regs);
522   if (!type) {
523     error_ = Error::kInvalidRegisterListLength;
524     return;
525   }
526 
527   const uint32_t rm = op.mode() == AddressingMode::kPostIndexed ? 0xD : 0xF;
528   emit32(0xF4000000 | encode(regs.start, 22, 12) | op.base().code << 16 | type << 8 | size << 6 | rm);
529 }
530 
vst1(DataSize size,DRegisterList regs,MemOperand op,CoreRegister rm)531 void Assembler::vst1(DataSize size, DRegisterList regs, MemOperand op, CoreRegister rm) {
532   if (rm.code == 0b1101 || rm.code == 0b1111) {
533     error_ = Error::kInvalidOperand;
534     return;
535   }
536 
537   const uint8_t type = encode_regs_length_to_type(regs);
538   if (!type) {
539     error_ = Error::kInvalidRegisterListLength;
540     return;
541   }
542 
543   emit32(0xF4000000 | encode(regs.start, 22, 12) | op.base().code << 16 | type << 8 | size << 6 | rm.code);
544 }
545 
vst1(DataSize size,DRegisterLane dd,MemOperand op)546 void Assembler::vst1(DataSize size, DRegisterLane dd, MemOperand op) {
547   if ((size == k8 && dd.lane > 7) || (size == k16 && dd.lane > 3) || (size == k32 && dd.lane > 1)) {
548     error_ = Error::kInvalidLaneIndex;
549     return;
550   }
551 
552   const uint8_t shift = size == k8 ? 5 : size == k16 ? 6 : 7;
553   const uint32_t rm = op.mode() == AddressingMode::kPostIndexed ? 0xD : 0xF;
554   emit32(0xF4800000 | encode(dd, 22, 12) | op.base().code << 16 | size << 10 | dd.lane << shift | rm);
555 }
556 
vstm(MemOperand rn,DRegisterList regs)557 void Assembler::vstm(MemOperand rn, DRegisterList regs) {
558   if (invalid_register_list(regs)) {
559     error_ = Error::kInvalidRegisterListLength;
560     return;
561   }
562   uint32_t w = (rn.mode() == AddressingMode::kOffset ? 0 : 1) << 21;
563   emit32(kAL | 0x0C800B00 | w | rn.base().code << 16 |  encode(regs.start, 22, 12) | regs.length << 1);
564 }
565 
vstr(SRegister rn,MemOperand op)566 void Assembler::vstr(SRegister rn, MemOperand op) {
567   const uint32_t offset = std::abs(op.offset());
568   if (op.mode() != AddressingMode::kOffset || offset > kUint10Max || offset % 4 != 0) {
569     error_ = Error::kInvalidOperand;
570     return;
571   }
572   emit32(kAL | 0x0D000A00 | op.u() << 23 | op.base().code << 16 | encode(rn, 22, 12) | offset >> 2);
573 }
574 
align(uint8_t n)575 void Assembler::align(uint8_t n) {
576   if (!is_po2(n) || (n % kInstructionSizeInBytes != 0)) {
577     error_ = Error::kInvalidOperand;
578     return;
579   }
580 
581   uintptr_t cursor = reinterpret_cast<uintptr_t>(cursor_);
582   const uintptr_t target = round_up_po2(cursor, n);
583   while (cursor < target) {
584     nop();
585     cursor += kInstructionSizeInBytes;
586   }
587 }
588 
589 }  // namespace aarch32
590 }  // namespace xnnpack
591