// Copyright 2019, VIXL authors
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
//   * Redistributions of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//   * Redistributions in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//   * Neither the name of ARM Limited nor the names of its contributors may be
//     used to endorse or promote products derived from this software without
//     specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
// ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#include "macro-assembler-aarch64.h"

namespace vixl {
namespace aarch64 {

void MacroAssembler::AddSubHelper(AddSubHelperOption option,
                                  const ZRegister& zd,
                                  const ZRegister& zn,
                                  IntegerOperand imm) {
  VIXL_ASSERT(imm.FitsInLane(zd));

  // Simple, encodable cases.
  if (TrySingleAddSub(option, zd, zn, imm)) return;

  VIXL_ASSERT((option == kAddImmediate) || (option == kSubImmediate));
  bool add_imm = (option == kAddImmediate);

  // Try to translate Add(..., -imm) to Sub(..., imm) if we can encode it in one
  // instruction. Also interpret the immediate as signed, so we can convert
  // Add(zd.VnH(), zn.VnH(), 0xffff...) to Sub(..., 1), etc.
  IntegerOperand signed_imm(imm.AsIntN(zd.GetLaneSizeInBits()));
  if (signed_imm.IsNegative()) {
    AddSubHelperOption n_option = add_imm ? kSubImmediate : kAddImmediate;
    IntegerOperand n_imm(signed_imm.GetMagnitude());
    // IntegerOperand can represent -INT_MIN, so this is always safe.
    VIXL_ASSERT(n_imm.IsPositiveOrZero());
    if (TrySingleAddSub(n_option, zd, zn, n_imm)) return;
  }

  // Otherwise, fall back to dup + ADD_z_z/SUB_z_z.
  UseScratchRegisterScope temps(this);
  ZRegister scratch = temps.AcquireZ().WithLaneSize(zn.GetLaneSizeInBits());
  Dup(scratch, imm);

  SingleEmissionCheckScope guard(this);
  if (add_imm) {
    add(zd, zn, scratch);
  } else {
    sub(zd, zn, scratch);
  }
}
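
// For example (illustrative note, not part of the original source): an
// immediate with no single-instruction encoding, such as
// `Add(z0.VnS(), z1.VnS(), 0x12345678)`, takes the fallback path above and
// roughly expands to a `Mov` of the constant into a scratch X register, a
// `dup` into a scratch Z register, and an unpredicated
// `add z0.s, z1.s, <scratch>.s`; the scratch registers are whatever the
// UseScratchRegisterScope hands out.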

bool MacroAssembler::TrySingleAddSub(AddSubHelperOption option,
                                     const ZRegister& zd,
                                     const ZRegister& zn,
                                     IntegerOperand imm) {
  VIXL_ASSERT(imm.FitsInLane(zd));

  int imm8;
  int shift = -1;
  if (imm.TryEncodeAsShiftedUintNForLane<8, 0>(zd, &imm8, &shift) ||
      imm.TryEncodeAsShiftedUintNForLane<8, 8>(zd, &imm8, &shift)) {
    MovprfxHelperScope guard(this, zd, zn);
    switch (option) {
      case kAddImmediate:
        add(zd, zd, imm8, shift);
        return true;
      case kSubImmediate:
        sub(zd, zd, imm8, shift);
        return true;
    }
  }
  return false;
}
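
// Illustrative note (not in the original source): with H lanes, an immediate
// such as 0x3f00 is encodable here because TryEncodeAsShiftedUintNForLane<8, 8>
// expresses it as #0x3f, LSL #8. So `Add(z0.VnH(), z1.VnH(), 0x3f00)` becomes
// `add z0.h, z0.h, #0x3f, lsl #8`, preceded by a `movprfx` only when zd and zn
// differ.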

void MacroAssembler::IntWideImmHelper(IntWideImmFn imm_fn,
                                      SVEArithPredicatedFn reg_macro,
                                      const ZRegister& zd,
                                      const ZRegister& zn,
                                      IntegerOperand imm,
                                      bool is_signed) {
  if (is_signed) {
    // E.g. MUL_z_zi, SMIN_z_zi, SMAX_z_zi
    if (imm.IsInt8()) {
      MovprfxHelperScope guard(this, zd, zn);
      (this->*imm_fn)(zd, zd, imm.AsInt8());
      return;
    }
  } else {
    // E.g. UMIN_z_zi, UMAX_z_zi
    if (imm.IsUint8()) {
      MovprfxHelperScope guard(this, zd, zn);
      (this->*imm_fn)(zd, zd, imm.AsUint8());
      return;
    }
  }

  UseScratchRegisterScope temps(this);
  PRegister pg = temps.AcquireGoverningP();
  Ptrue(pg.WithSameLaneSizeAs(zd));

  // Try to re-use zd if we can, so we can avoid a movprfx.
  ZRegister scratch =
      zd.Aliases(zn) ? temps.AcquireZ().WithLaneSize(zn.GetLaneSizeInBits())
                     : zd;
  Dup(scratch, imm);

  // The vector-form macro for commutative operations will swap the arguments to
  // avoid movprfx, if necessary.
  (this->*reg_macro)(zd, pg.Merging(), zn, scratch);
}
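
// For instance (illustrative): `Smin(z0.VnS(), z1.VnS(), 1000)` cannot use the
// signed-8-bit immediate form, so it takes the fallback above: a governing
// predicate is set up with `ptrue`, the immediate is broadcast with `Dup`, and
// the predicated vector-form `Smin` macro finishes the job. The exact scratch
// register choices depend on the surrounding scopes.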

void MacroAssembler::Mul(const ZRegister& zd,
                         const ZRegister& zn,
                         IntegerOperand imm) {
  VIXL_ASSERT(allow_macro_instructions_);
  IntWideImmFn imm_fn = &Assembler::mul;
  SVEArithPredicatedFn reg_fn = &MacroAssembler::Mul;
  IntWideImmHelper(imm_fn, reg_fn, zd, zn, imm, true);
}

void MacroAssembler::Smin(const ZRegister& zd,
                          const ZRegister& zn,
                          IntegerOperand imm) {
  VIXL_ASSERT(allow_macro_instructions_);
  VIXL_ASSERT(imm.FitsInSignedLane(zd));
  IntWideImmFn imm_fn = &Assembler::smin;
  SVEArithPredicatedFn reg_fn = &MacroAssembler::Smin;
  IntWideImmHelper(imm_fn, reg_fn, zd, zn, imm, true);
}

void MacroAssembler::Smax(const ZRegister& zd,
                          const ZRegister& zn,
                          IntegerOperand imm) {
  VIXL_ASSERT(allow_macro_instructions_);
  VIXL_ASSERT(imm.FitsInSignedLane(zd));
  IntWideImmFn imm_fn = &Assembler::smax;
  SVEArithPredicatedFn reg_fn = &MacroAssembler::Smax;
  IntWideImmHelper(imm_fn, reg_fn, zd, zn, imm, true);
}

void MacroAssembler::Umax(const ZRegister& zd,
                          const ZRegister& zn,
                          IntegerOperand imm) {
  VIXL_ASSERT(allow_macro_instructions_);
  VIXL_ASSERT(imm.FitsInUnsignedLane(zd));
  IntWideImmFn imm_fn = &Assembler::umax;
  SVEArithPredicatedFn reg_fn = &MacroAssembler::Umax;
  IntWideImmHelper(imm_fn, reg_fn, zd, zn, imm, false);
}

void MacroAssembler::Umin(const ZRegister& zd,
                          const ZRegister& zn,
                          IntegerOperand imm) {
  VIXL_ASSERT(allow_macro_instructions_);
  VIXL_ASSERT(imm.FitsInUnsignedLane(zd));
  IntWideImmFn imm_fn = &Assembler::umin;
  SVEArithPredicatedFn reg_fn = &MacroAssembler::Umin;
  IntWideImmHelper(imm_fn, reg_fn, zd, zn, imm, false);
}

void MacroAssembler::Addpl(const Register& xd,
                           const Register& xn,
                           int64_t multiplier) {
  VIXL_ASSERT(allow_macro_instructions_);

  // This macro relies on `Rdvl` to handle some out-of-range cases. Check that
  // `VL * multiplier` cannot overflow, for any possible value of VL.
  VIXL_ASSERT(multiplier <= (INT64_MAX / kZRegMaxSizeInBytes));
  VIXL_ASSERT(multiplier >= (INT64_MIN / kZRegMaxSizeInBytes));

  if (xd.IsZero()) return;
  if (xn.IsZero() && xd.IsSP()) {
    // TODO: This operation doesn't make much sense, but we could support it
    // with a scratch register if necessary.
    VIXL_UNIMPLEMENTED();
  }

  // Handling xzr requires an extra move, so defer it until later so we can try
  // to use `rdvl` instead (via `Addvl`).
  if (IsInt6(multiplier) && !xn.IsZero()) {
    SingleEmissionCheckScope guard(this);
    addpl(xd, xn, static_cast<int>(multiplier));
    return;
  }

  // If `multiplier` is a multiple of 8, we can use `Addvl` instead.
  if ((multiplier % kZRegBitsPerPRegBit) == 0) {
    Addvl(xd, xn, multiplier / kZRegBitsPerPRegBit);
    return;
  }

  if (IsInt6(multiplier)) {
    VIXL_ASSERT(xn.IsZero());  // Other cases were handled with `addpl`.
    // There is no simple `rdpl` instruction, and `addpl` cannot accept xzr, so
    // materialise a zero.
    MacroEmissionCheckScope guard(this);
    movz(xd, 0);
    addpl(xd, xd, static_cast<int>(multiplier));
    return;
  }

  // TODO: Some probable cases result in rather long sequences. For example,
  // `Addpl(sp, sp, 33)` requires five instructions, even though it's only just
  // outside the encodable range. We should look for ways to cover such cases
  // without drastically increasing the complexity of this logic.

  // For other cases, calculate xn + (PL * multiplier) using discrete
  // instructions. This requires two scratch registers in the general case, so
  // try to re-use the destination as a scratch register.
  UseScratchRegisterScope temps(this);
  temps.Include(xd);
  temps.Exclude(xn);

  Register scratch = temps.AcquireX();
  // There is no `rdpl`, so we have to calculate PL from VL. We can't scale the
  // multiplier because (we already know) it isn't a multiple of 8.
  Rdvl(scratch, multiplier);

  MacroEmissionCheckScope guard(this);
  if (xn.IsZero()) {
    asr(xd, scratch, kZRegBitsPerPRegBitLog2);
  } else if (xd.IsSP() || xn.IsSP()) {
    // TODO: MacroAssembler::Add should be able to handle this.
    asr(scratch, scratch, kZRegBitsPerPRegBitLog2);
    add(xd, xn, scratch);
  } else {
    add(xd, xn, Operand(scratch, ASR, kZRegBitsPerPRegBitLog2));
  }
}
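
// A rough worked example (not part of the original source): PL is VL in bytes
// divided by 8, so `Addpl(x0, x1, 3)` is directly encodable and emits
// `addpl x0, x1, #3`, whereas `Addpl(x0, x1, 40)` is out of the 6-bit range
// but is a multiple of 8, so it is forwarded to `Addvl(x0, x1, 5)`.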

void MacroAssembler::Addvl(const Register& xd,
                           const Register& xn,
                           int64_t multiplier) {
  VIXL_ASSERT(allow_macro_instructions_);
  VIXL_ASSERT(xd.IsX());
  VIXL_ASSERT(xn.IsX());

  // Check that `VL * multiplier` cannot overflow, for any possible value of VL.
  VIXL_ASSERT(multiplier <= (INT64_MAX / kZRegMaxSizeInBytes));
  VIXL_ASSERT(multiplier >= (INT64_MIN / kZRegMaxSizeInBytes));

  if (xd.IsZero()) return;
  if (xn.IsZero() && xd.IsSP()) {
    // TODO: This operation doesn't make much sense, but we could support it
    // with a scratch register if necessary. `rdvl` cannot write into `sp`.
    VIXL_UNIMPLEMENTED();
  }

  if (IsInt6(multiplier)) {
    SingleEmissionCheckScope guard(this);
    if (xn.IsZero()) {
      rdvl(xd, static_cast<int>(multiplier));
    } else {
      addvl(xd, xn, static_cast<int>(multiplier));
    }
    return;
  }

  // TODO: Some probable cases result in rather long sequences. For example,
  // `Addvl(sp, sp, 42)` requires four instructions, even though it's only just
  // outside the encodable range. We should look for ways to cover such cases
  // without drastically increasing the complexity of this logic.

  // For other cases, calculate xn + (VL * multiplier) using discrete
  // instructions. This requires two scratch registers in the general case, so
  // we try to re-use the destination as a scratch register.
  UseScratchRegisterScope temps(this);
  temps.Include(xd);
  temps.Exclude(xn);

  Register a = temps.AcquireX();
  Mov(a, multiplier);

  MacroEmissionCheckScope guard(this);
  Register b = temps.AcquireX();
  rdvl(b, 1);
  if (xn.IsZero()) {
    mul(xd, a, b);
  } else if (xd.IsSP() || xn.IsSP()) {
    mul(a, a, b);
    add(xd, xn, a);
  } else {
    madd(xd, a, b, xn);
  }
}
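
// For example (illustrative): `Addvl(x2, x3, 100)` cannot be encoded in a
// single `addvl` (the multiplier must fit in a signed 6-bit field), so the
// fallback above emits roughly `mov <a>, #100; rdvl <b>, #1;
// madd x2, <a>, <b>, x3`, where <a> and <b> are scratch registers.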

void MacroAssembler::CalculateSVEAddress(const Register& xd,
                                         const SVEMemOperand& addr,
                                         int vl_divisor_log2) {
  VIXL_ASSERT(allow_macro_instructions_);
  VIXL_ASSERT(!addr.IsScatterGather());
  VIXL_ASSERT(xd.IsX());

  // The lower bound is where a whole Z register is accessed.
  VIXL_ASSERT(!addr.IsMulVl() || (vl_divisor_log2 >= 0));
  // The upper bound is for P register accesses, and for instructions like
  // "st1b { z0.d } [...]", where one byte is accessed for every D-sized lane.
  VIXL_ASSERT(vl_divisor_log2 <= static_cast<int>(kZRegBitsPerPRegBitLog2));

  SVEOffsetModifier mod = addr.GetOffsetModifier();
  Register base = addr.GetScalarBase();

  if (addr.IsEquivalentToScalar()) {
    // For example:
    //   [x0]
    //   [x0, #0]
    //   [x0, xzr, LSL 2]
    Mov(xd, base);
  } else if (addr.IsScalarPlusImmediate()) {
    // For example:
    //   [x0, #42]
    //   [x0, #42, MUL VL]
    int64_t offset = addr.GetImmediateOffset();
    VIXL_ASSERT(offset != 0);  // Handled by IsEquivalentToScalar.
    if (addr.IsMulVl()) {
      int vl_divisor = 1 << vl_divisor_log2;
      // For all possible values of vl_divisor, we can simply use `Addpl`. This
      // will select `addvl` if necessary.
      VIXL_ASSERT((kZRegBitsPerPRegBit % vl_divisor) == 0);
      Addpl(xd, base, offset * (kZRegBitsPerPRegBit / vl_divisor));
    } else {
      // IsScalarPlusImmediate() ensures that no other modifiers can occur.
      VIXL_ASSERT(mod == NO_SVE_OFFSET_MODIFIER);
      Add(xd, base, offset);
    }
  } else if (addr.IsScalarPlusScalar()) {
    // For example:
    //   [x0, x1]
    //   [x0, x1, LSL #4]
    Register offset = addr.GetScalarOffset();
    VIXL_ASSERT(!offset.IsZero());  // Handled by IsEquivalentToScalar.
    if (mod == SVE_LSL) {
      Add(xd, base, Operand(offset, LSL, addr.GetShiftAmount()));
    } else {
      // IsScalarPlusScalar() ensures that no other modifiers can occur.
      VIXL_ASSERT(mod == NO_SVE_OFFSET_MODIFIER);
      Add(xd, base, offset);
    }
  } else {
    // All other forms are scatter-gather addresses, which cannot be evaluated
    // into an X register.
    VIXL_UNREACHABLE();
  }
}
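
// A rough worked example (not in the original source): for the operand
// `[x0, #3, MUL VL]` with vl_divisor_log2 == 0 (a whole-Z-register access),
// the scaling above computes 3 * (8 / 1) = 24 units of PL, so the call is
// `Addpl(xd, x0, 24)`, which is encodable and emits `addpl xd, x0, #24`.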

void MacroAssembler::Cpy(const ZRegister& zd,
                         const PRegister& pg,
                         IntegerOperand imm) {
  VIXL_ASSERT(allow_macro_instructions_);
  VIXL_ASSERT(imm.FitsInLane(zd));
  int imm8;
  int shift;
  if (imm.TryEncodeAsShiftedIntNForLane<8, 0>(zd, &imm8, &shift) ||
      imm.TryEncodeAsShiftedIntNForLane<8, 8>(zd, &imm8, &shift)) {
    SingleEmissionCheckScope guard(this);
    cpy(zd, pg, imm8, shift);
    return;
  }

  // The fallbacks rely on `cpy` variants that only support merging predication.
  // If zeroing predication was requested, zero the destination first.
  if (pg.IsZeroing()) {
    SingleEmissionCheckScope guard(this);
    dup(zd, 0);
  }
  PRegisterM pg_m = pg.Merging();

  // Try to encode the immediate using fcpy.
  VIXL_ASSERT(imm.FitsInLane(zd));
  if (zd.GetLaneSizeInBits() >= kHRegSize) {
    double fp_imm = 0.0;
    switch (zd.GetLaneSizeInBits()) {
      case kHRegSize:
        fp_imm =
            FPToDouble(RawbitsToFloat16(imm.AsUint16()), kIgnoreDefaultNaN);
        break;
      case kSRegSize:
        fp_imm = RawbitsToFloat(imm.AsUint32());
        break;
      case kDRegSize:
        fp_imm = RawbitsToDouble(imm.AsUint64());
        break;
      default:
        VIXL_UNREACHABLE();
        break;
    }
    // IsImmFP64 is equivalent to IsImmFP<n> for the same arithmetic value, so
    // we can use IsImmFP64 for all lane sizes.
    if (IsImmFP64(fp_imm)) {
      SingleEmissionCheckScope guard(this);
      fcpy(zd, pg_m, fp_imm);
      return;
    }
  }

  // Fall back to using a scratch register.
  UseScratchRegisterScope temps(this);
  Register scratch = temps.AcquireRegisterToHoldLane(zd);
  Mov(scratch, imm);

  SingleEmissionCheckScope guard(this);
  cpy(zd, pg_m, scratch);
}
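
// Illustrative example (not in the original source): for D lanes,
// `Cpy(z0.VnD(), p0, 0x3ff0000000000000)` has no 8-bit integer encoding, but
// its bit pattern is the double 1.0, which is a valid FP immediate, so the
// helper above emits `fcpy z0.d, p0/m, #1.0` rather than going through a
// scratch register.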

// TODO: We implement Fcpy (amongst other things) for all FP types because it
// allows us to preserve user-specified NaNs. We should come up with some
// FPImmediate type to abstract this, and avoid all the duplication below (and
// elsewhere).

void MacroAssembler::Fcpy(const ZRegister& zd,
                          const PRegisterM& pg,
                          double imm) {
  VIXL_ASSERT(allow_macro_instructions_);
  VIXL_ASSERT(pg.IsMerging());

  if (IsImmFP64(imm)) {
    SingleEmissionCheckScope guard(this);
    fcpy(zd, pg, imm);
    return;
  }

  // As a fall-back, cast the immediate to the required lane size, and try to
  // encode the bit pattern using `Cpy`.
  Cpy(zd, pg, FPToRawbitsWithSize(zd.GetLaneSizeInBits(), imm));
}

void MacroAssembler::Fcpy(const ZRegister& zd,
                          const PRegisterM& pg,
                          float imm) {
  VIXL_ASSERT(allow_macro_instructions_);
  VIXL_ASSERT(pg.IsMerging());

  if (IsImmFP32(imm)) {
    SingleEmissionCheckScope guard(this);
    fcpy(zd, pg, imm);
    return;
  }

  // As a fall-back, cast the immediate to the required lane size, and try to
  // encode the bit pattern using `Cpy`.
  Cpy(zd, pg, FPToRawbitsWithSize(zd.GetLaneSizeInBits(), imm));
}

void MacroAssembler::Fcpy(const ZRegister& zd,
                          const PRegisterM& pg,
                          Float16 imm) {
  VIXL_ASSERT(allow_macro_instructions_);
  VIXL_ASSERT(pg.IsMerging());

  if (IsImmFP16(imm)) {
    SingleEmissionCheckScope guard(this);
    fcpy(zd, pg, imm);
    return;
  }

  // As a fall-back, cast the immediate to the required lane size, and try to
  // encode the bit pattern using `Cpy`.
  Cpy(zd, pg, FPToRawbitsWithSize(zd.GetLaneSizeInBits(), imm));
}

void MacroAssembler::Dup(const ZRegister& zd, IntegerOperand imm) {
  VIXL_ASSERT(allow_macro_instructions_);
  VIXL_ASSERT(imm.FitsInLane(zd));
  unsigned lane_size = zd.GetLaneSizeInBits();
  int imm8;
  int shift;
  if (imm.TryEncodeAsShiftedIntNForLane<8, 0>(zd, &imm8, &shift) ||
      imm.TryEncodeAsShiftedIntNForLane<8, 8>(zd, &imm8, &shift)) {
    SingleEmissionCheckScope guard(this);
    dup(zd, imm8, shift);
  } else if (IsImmLogical(imm.AsUintN(lane_size), lane_size)) {
    SingleEmissionCheckScope guard(this);
    dupm(zd, imm.AsUintN(lane_size));
  } else {
    UseScratchRegisterScope temps(this);
    Register scratch = temps.AcquireRegisterToHoldLane(zd);
    Mov(scratch, imm);

    SingleEmissionCheckScope guard(this);
    dup(zd, scratch);
  }
}
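
// For example (illustrative): `Dup(z0.VnS(), 0xff00ff00)` is not expressible
// as a shifted 8-bit immediate, but it is a valid logical (bitmask) immediate
// for 32-bit lanes, so the middle branch above emits `dupm z0.s, #0xff00ff00`.
// An arbitrary constant such as 0x12345678 would instead be moved into a
// general-purpose scratch register and broadcast with `dup`.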

void MacroAssembler::NoncommutativeArithmeticHelper(
    const ZRegister& zd,
    const PRegisterM& pg,
    const ZRegister& zn,
    const ZRegister& zm,
    SVEArithPredicatedFn fn,
    SVEArithPredicatedFn rev_fn) {
  if (zd.Aliases(zn)) {
    // E.g. zd = zd / zm
    SingleEmissionCheckScope guard(this);
    (this->*fn)(zd, pg, zn, zm);
  } else if (zd.Aliases(zm)) {
    // E.g. zd = zn / zd
    SingleEmissionCheckScope guard(this);
    (this->*rev_fn)(zd, pg, zm, zn);
  } else {
    // E.g. zd = zn / zm
    MovprfxHelperScope guard(this, zd, pg, zn);
    (this->*fn)(zd, pg, zd, zm);
  }
}

void MacroAssembler::FPCommutativeArithmeticHelper(
    const ZRegister& zd,
    const PRegisterM& pg,
    const ZRegister& zn,
    const ZRegister& zm,
    SVEArithPredicatedFn fn,
    FPMacroNaNPropagationOption nan_option) {
  ResolveFPNaNPropagationOption(&nan_option);

  if (zd.Aliases(zn)) {
    SingleEmissionCheckScope guard(this);
    (this->*fn)(zd, pg, zd, zm);
  } else if (zd.Aliases(zm)) {
    switch (nan_option) {
      case FastNaNPropagation: {
        // Swap the arguments.
        SingleEmissionCheckScope guard(this);
        (this->*fn)(zd, pg, zd, zn);
        return;
      }
      case StrictNaNPropagation: {
        UseScratchRegisterScope temps(this);
        // Use a scratch register to keep the argument order exactly as
        // specified.
        ZRegister scratch = temps.AcquireZ().WithSameLaneSizeAs(zn);
        {
          MovprfxHelperScope guard(this, scratch, pg, zn);
          (this->*fn)(scratch, pg, scratch, zm);
        }
        Mov(zd, scratch);
        return;
      }
      case NoFPMacroNaNPropagationSelected:
        VIXL_UNREACHABLE();
        return;
    }
  } else {
    MovprfxHelperScope guard(this, zd, pg, zn);
    (this->*fn)(zd, pg, zd, zm);
  }
}
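
// Illustrative example (not in the original source): with strict NaN
// propagation, `Fadd(z0, p0.Merging(), z1, z0, StrictNaNPropagation)` cannot
// simply swap the operands because z0 aliases zm, so the helper above builds
// the result in a scratch Z register (movprfx from z1, then the operation with
// z0 as the second operand) and copies it back to z0 with `Mov`.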

void MacroAssembler::Asr(const ZRegister& zd,
                         const PRegisterM& pg,
                         const ZRegister& zn,
                         const ZRegister& zm) {
  VIXL_ASSERT(allow_macro_instructions_);
  NoncommutativeArithmeticHelper(zd,
                                 pg,
                                 zn,
                                 zm,
                                 static_cast<SVEArithPredicatedFn>(
                                     &Assembler::asr),
                                 static_cast<SVEArithPredicatedFn>(
                                     &Assembler::asrr));
}

void MacroAssembler::Lsl(const ZRegister& zd,
                         const PRegisterM& pg,
                         const ZRegister& zn,
                         const ZRegister& zm) {
  VIXL_ASSERT(allow_macro_instructions_);
  NoncommutativeArithmeticHelper(zd,
                                 pg,
                                 zn,
                                 zm,
                                 static_cast<SVEArithPredicatedFn>(
                                     &Assembler::lsl),
                                 static_cast<SVEArithPredicatedFn>(
                                     &Assembler::lslr));
}

void MacroAssembler::Lsr(const ZRegister& zd,
                         const PRegisterM& pg,
                         const ZRegister& zn,
                         const ZRegister& zm) {
  VIXL_ASSERT(allow_macro_instructions_);
  NoncommutativeArithmeticHelper(zd,
                                 pg,
                                 zn,
                                 zm,
                                 static_cast<SVEArithPredicatedFn>(
                                     &Assembler::lsr),
                                 static_cast<SVEArithPredicatedFn>(
                                     &Assembler::lsrr));
}

void MacroAssembler::Fdiv(const ZRegister& zd,
                          const PRegisterM& pg,
                          const ZRegister& zn,
                          const ZRegister& zm) {
  VIXL_ASSERT(allow_macro_instructions_);
  NoncommutativeArithmeticHelper(zd,
                                 pg,
                                 zn,
                                 zm,
                                 static_cast<SVEArithPredicatedFn>(
                                     &Assembler::fdiv),
                                 static_cast<SVEArithPredicatedFn>(
                                     &Assembler::fdivr));
}

void MacroAssembler::Fsub(const ZRegister& zd,
                          const PRegisterM& pg,
                          const ZRegister& zn,
                          const ZRegister& zm) {
  VIXL_ASSERT(allow_macro_instructions_);
  NoncommutativeArithmeticHelper(zd,
                                 pg,
                                 zn,
                                 zm,
                                 static_cast<SVEArithPredicatedFn>(
                                     &Assembler::fsub),
                                 static_cast<SVEArithPredicatedFn>(
                                     &Assembler::fsubr));
}

void MacroAssembler::Fadd(const ZRegister& zd,
                          const PRegisterM& pg,
                          const ZRegister& zn,
                          const ZRegister& zm,
                          FPMacroNaNPropagationOption nan_option) {
  VIXL_ASSERT(allow_macro_instructions_);
  FPCommutativeArithmeticHelper(zd,
                                pg,
                                zn,
                                zm,
                                static_cast<SVEArithPredicatedFn>(
                                    &Assembler::fadd),
                                nan_option);
}

void MacroAssembler::Fabd(const ZRegister& zd,
                          const PRegisterM& pg,
                          const ZRegister& zn,
                          const ZRegister& zm,
                          FPMacroNaNPropagationOption nan_option) {
  VIXL_ASSERT(allow_macro_instructions_);
  FPCommutativeArithmeticHelper(zd,
                                pg,
                                zn,
                                zm,
                                static_cast<SVEArithPredicatedFn>(
                                    &Assembler::fabd),
                                nan_option);
}

void MacroAssembler::Fmul(const ZRegister& zd,
                          const PRegisterM& pg,
                          const ZRegister& zn,
                          const ZRegister& zm,
                          FPMacroNaNPropagationOption nan_option) {
  VIXL_ASSERT(allow_macro_instructions_);
  FPCommutativeArithmeticHelper(zd,
                                pg,
                                zn,
                                zm,
                                static_cast<SVEArithPredicatedFn>(
                                    &Assembler::fmul),
                                nan_option);
}

void MacroAssembler::Fmulx(const ZRegister& zd,
                           const PRegisterM& pg,
                           const ZRegister& zn,
                           const ZRegister& zm,
                           FPMacroNaNPropagationOption nan_option) {
  VIXL_ASSERT(allow_macro_instructions_);
  FPCommutativeArithmeticHelper(zd,
                                pg,
                                zn,
                                zm,
                                static_cast<SVEArithPredicatedFn>(
                                    &Assembler::fmulx),
                                nan_option);
}

void MacroAssembler::Fmax(const ZRegister& zd,
                          const PRegisterM& pg,
                          const ZRegister& zn,
                          const ZRegister& zm,
                          FPMacroNaNPropagationOption nan_option) {
  VIXL_ASSERT(allow_macro_instructions_);
  FPCommutativeArithmeticHelper(zd,
                                pg,
                                zn,
                                zm,
                                static_cast<SVEArithPredicatedFn>(
                                    &Assembler::fmax),
                                nan_option);
}

void MacroAssembler::Fmin(const ZRegister& zd,
                          const PRegisterM& pg,
                          const ZRegister& zn,
                          const ZRegister& zm,
                          FPMacroNaNPropagationOption nan_option) {
  VIXL_ASSERT(allow_macro_instructions_);
  FPCommutativeArithmeticHelper(zd,
                                pg,
                                zn,
                                zm,
                                static_cast<SVEArithPredicatedFn>(
                                    &Assembler::fmin),
                                nan_option);
}

void MacroAssembler::Fmaxnm(const ZRegister& zd,
                            const PRegisterM& pg,
                            const ZRegister& zn,
                            const ZRegister& zm,
                            FPMacroNaNPropagationOption nan_option) {
  VIXL_ASSERT(allow_macro_instructions_);
  FPCommutativeArithmeticHelper(zd,
                                pg,
                                zn,
                                zm,
                                static_cast<SVEArithPredicatedFn>(
                                    &Assembler::fmaxnm),
                                nan_option);
}

void MacroAssembler::Fminnm(const ZRegister& zd,
                            const PRegisterM& pg,
                            const ZRegister& zn,
                            const ZRegister& zm,
                            FPMacroNaNPropagationOption nan_option) {
  VIXL_ASSERT(allow_macro_instructions_);
  FPCommutativeArithmeticHelper(zd,
                                pg,
                                zn,
                                zm,
                                static_cast<SVEArithPredicatedFn>(
                                    &Assembler::fminnm),
                                nan_option);
}

void MacroAssembler::Fdup(const ZRegister& zd, double imm) {
  VIXL_ASSERT(allow_macro_instructions_);

  switch (zd.GetLaneSizeInBits()) {
    case kHRegSize:
      Fdup(zd, Float16(imm));
      break;
    case kSRegSize:
      Fdup(zd, static_cast<float>(imm));
      break;
    case kDRegSize:
      if (IsImmFP64(imm)) {
        SingleEmissionCheckScope guard(this);
        fdup(zd, imm);
      } else {
        Dup(zd, DoubleToRawbits(imm));
      }
      break;
  }
}

void MacroAssembler::Fdup(const ZRegister& zd, float imm) {
  VIXL_ASSERT(allow_macro_instructions_);

  switch (zd.GetLaneSizeInBits()) {
    case kHRegSize:
      Fdup(zd, Float16(imm));
      break;
    case kSRegSize:
      if (IsImmFP32(imm)) {
        SingleEmissionCheckScope guard(this);
        fdup(zd, imm);
      } else {
        Dup(zd, FloatToRawbits(imm));
      }
      break;
    case kDRegSize:
      Fdup(zd, static_cast<double>(imm));
      break;
  }
}

void MacroAssembler::Fdup(const ZRegister& zd, Float16 imm) {
  VIXL_ASSERT(allow_macro_instructions_);

  switch (zd.GetLaneSizeInBits()) {
    case kHRegSize:
      if (IsImmFP16(imm)) {
        SingleEmissionCheckScope guard(this);
        fdup(zd, imm);
      } else {
        Dup(zd, Float16ToRawbits(imm));
      }
      break;
    case kSRegSize:
      Fdup(zd, FPToFloat(imm, kIgnoreDefaultNaN));
      break;
    case kDRegSize:
      Fdup(zd, FPToDouble(imm, kIgnoreDefaultNaN));
      break;
  }
}

void MacroAssembler::Index(const ZRegister& zd,
                           const Operand& start,
                           const Operand& step) {
  class IndexOperand : public Operand {
   public:
    static IndexOperand Prepare(MacroAssembler* masm,
                                UseScratchRegisterScope* temps,
                                const Operand& op,
                                const ZRegister& zd) {
      // Look for encodable immediates.
      int imm;
      if (op.IsImmediate()) {
        if (IntegerOperand(op).TryEncodeAsIntNForLane<5>(zd, &imm)) {
          return IndexOperand(imm);
        }
        Register scratch = temps->AcquireRegisterToHoldLane(zd);
        masm->Mov(scratch, op);
        return IndexOperand(scratch);
      } else {
        // Plain registers can be encoded directly.
        VIXL_ASSERT(op.IsPlainRegister());
        return IndexOperand(op.GetRegister());
      }
    }

    int GetImm5() const {
      int64_t imm = GetImmediate();
      VIXL_ASSERT(IsInt5(imm));
      return static_cast<int>(imm);
    }

   private:
    explicit IndexOperand(const Register& reg) : Operand(reg) {}
    explicit IndexOperand(int64_t imm) : Operand(imm) {}
  };

  UseScratchRegisterScope temps(this);
  IndexOperand start_enc = IndexOperand::Prepare(this, &temps, start, zd);
  IndexOperand step_enc = IndexOperand::Prepare(this, &temps, step, zd);

  SingleEmissionCheckScope guard(this);
  if (start_enc.IsImmediate()) {
    if (step_enc.IsImmediate()) {
      index(zd, start_enc.GetImm5(), step_enc.GetImm5());
    } else {
      index(zd, start_enc.GetImm5(), step_enc.GetRegister());
    }
  } else {
    if (step_enc.IsImmediate()) {
      index(zd, start_enc.GetRegister(), step_enc.GetImm5());
    } else {
      index(zd, start_enc.GetRegister(), step_enc.GetRegister());
    }
  }
}

void MacroAssembler::Insr(const ZRegister& zdn, IntegerOperand imm) {
  VIXL_ASSERT(allow_macro_instructions_);
  VIXL_ASSERT(imm.FitsInLane(zdn));

  if (imm.IsZero()) {
    SingleEmissionCheckScope guard(this);
    insr(zdn, xzr);
    return;
  }

  UseScratchRegisterScope temps(this);
  Register scratch = temps.AcquireRegisterToHoldLane(zdn);

  // TODO: There are many cases where we could optimise immediates, such as by
  // detecting repeating patterns or FP immediates. We should optimise and
  // abstract this for use in other SVE mov-immediate-like macros.
  Mov(scratch, imm);

  SingleEmissionCheckScope guard(this);
  insr(zdn, scratch);
}

void MacroAssembler::Mla(const ZRegister& zd,
                         const PRegisterM& pg,
                         const ZRegister& za,
                         const ZRegister& zn,
                         const ZRegister& zm) {
  VIXL_ASSERT(allow_macro_instructions_);
  if (zd.Aliases(za)) {
    // zda = zda + (zn * zm)
    SingleEmissionCheckScope guard(this);
    mla(zd, pg, zn, zm);
  } else if (zd.Aliases(zn)) {
    // zdn = za + (zdn * zm)
    SingleEmissionCheckScope guard(this);
    mad(zd, pg, zm, za);
  } else if (zd.Aliases(zm)) {
    // Multiplication is commutative, so we can swap zn and zm.
    // zdm = za + (zdm * zn)
    SingleEmissionCheckScope guard(this);
    mad(zd, pg, zn, za);
  } else {
    // zd = za + (zn * zm)
    ExactAssemblyScope guard(this, 2 * kInstructionSize);
    movprfx(zd, pg, za);
    mla(zd, pg, zn, zm);
  }
}
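
// For example (illustrative): `Mla(z0, p0.Merging(), z1, z0, z2)` has the
// destination aliasing the first multiplicand, so the helper above selects the
// multiply-accumulate-destructive form `mad z0, p0/m, z2, z1` instead of
// needing a movprfx.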

void MacroAssembler::Mls(const ZRegister& zd,
                         const PRegisterM& pg,
                         const ZRegister& za,
                         const ZRegister& zn,
                         const ZRegister& zm) {
  VIXL_ASSERT(allow_macro_instructions_);
  if (zd.Aliases(za)) {
    // zda = zda - (zn * zm)
    SingleEmissionCheckScope guard(this);
    mls(zd, pg, zn, zm);
  } else if (zd.Aliases(zn)) {
    // zdn = za - (zdn * zm)
    SingleEmissionCheckScope guard(this);
    msb(zd, pg, zm, za);
  } else if (zd.Aliases(zm)) {
    // Multiplication is commutative, so we can swap zn and zm.
    // zdm = za - (zdm * zn)
    SingleEmissionCheckScope guard(this);
    msb(zd, pg, zn, za);
  } else {
    // zd = za - (zn * zm)
    ExactAssemblyScope guard(this, 2 * kInstructionSize);
    movprfx(zd, pg, za);
    mls(zd, pg, zn, zm);
  }
}

void MacroAssembler::CompareHelper(Condition cond,
                                   const PRegisterWithLaneSize& pd,
                                   const PRegisterZ& pg,
                                   const ZRegister& zn,
                                   IntegerOperand imm) {
  UseScratchRegisterScope temps(this);
  ZRegister zm = temps.AcquireZ().WithLaneSize(zn.GetLaneSizeInBits());
  Dup(zm, imm);
  SingleEmissionCheckScope guard(this);
  cmp(cond, pd, pg, zn, zm);
}

void MacroAssembler::Pfirst(const PRegisterWithLaneSize& pd,
                            const PRegister& pg,
                            const PRegisterWithLaneSize& pn) {
  VIXL_ASSERT(allow_macro_instructions_);
  VIXL_ASSERT(pd.IsLaneSizeB());
  VIXL_ASSERT(pn.IsLaneSizeB());
  if (pd.Is(pn)) {
    SingleEmissionCheckScope guard(this);
    pfirst(pd, pg, pn);
  } else {
    UseScratchRegisterScope temps(this);
    PRegister temp_pg = pg;
    if (pd.Aliases(pg)) {
      temp_pg = temps.AcquireP();
      Mov(temp_pg.VnB(), pg.VnB());
    }
    Mov(pd, pn);
    SingleEmissionCheckScope guard(this);
    pfirst(pd, temp_pg, pd);
  }
}

void MacroAssembler::Pnext(const PRegisterWithLaneSize& pd,
                           const PRegister& pg,
                           const PRegisterWithLaneSize& pn) {
  VIXL_ASSERT(allow_macro_instructions_);
  VIXL_ASSERT(AreSameFormat(pd, pn));
  if (pd.Is(pn)) {
    SingleEmissionCheckScope guard(this);
    pnext(pd, pg, pn);
  } else {
    UseScratchRegisterScope temps(this);
    PRegister temp_pg = pg;
    if (pd.Aliases(pg)) {
      temp_pg = temps.AcquireP();
      Mov(temp_pg.VnB(), pg.VnB());
    }
    Mov(pd.VnB(), pn.VnB());
    SingleEmissionCheckScope guard(this);
    pnext(pd, temp_pg, pd);
  }
}

void MacroAssembler::Ptrue(const PRegisterWithLaneSize& pd,
                           SVEPredicateConstraint pattern,
                           FlagsUpdate s) {
  VIXL_ASSERT(allow_macro_instructions_);
  switch (s) {
    case LeaveFlags:
      Ptrue(pd, pattern);
      return;
    case SetFlags:
      Ptrues(pd, pattern);
      return;
  }
  VIXL_UNREACHABLE();
}

void MacroAssembler::Sdiv(const ZRegister& zd,
                          const PRegisterM& pg,
                          const ZRegister& zn,
                          const ZRegister& zm) {
  VIXL_ASSERT(allow_macro_instructions_);
  NoncommutativeArithmeticHelper(zd,
                                 pg,
                                 zn,
                                 zm,
                                 static_cast<SVEArithPredicatedFn>(
                                     &Assembler::sdiv),
                                 static_cast<SVEArithPredicatedFn>(
                                     &Assembler::sdivr));
}

void MacroAssembler::Sub(const ZRegister& zd,
                         IntegerOperand imm,
                         const ZRegister& zm) {
  VIXL_ASSERT(allow_macro_instructions_);

  int imm8;
  int shift = -1;
  if (imm.TryEncodeAsShiftedUintNForLane<8, 0>(zd, &imm8, &shift) ||
      imm.TryEncodeAsShiftedUintNForLane<8, 8>(zd, &imm8, &shift)) {
    MovprfxHelperScope guard(this, zd, zm);
    subr(zd, zd, imm8, shift);
  } else {
    UseScratchRegisterScope temps(this);
    ZRegister scratch = temps.AcquireZ().WithLaneSize(zm.GetLaneSizeInBits());
    Dup(scratch, imm);

    SingleEmissionCheckScope guard(this);
    sub(zd, scratch, zm);
  }
}

void MacroAssembler::Sub(const ZRegister& zd,
                         const PRegisterM& pg,
                         const ZRegister& zn,
                         const ZRegister& zm) {
  VIXL_ASSERT(allow_macro_instructions_);
  NoncommutativeArithmeticHelper(zd,
                                 pg,
                                 zn,
                                 zm,
                                 static_cast<SVEArithPredicatedFn>(
                                     &Assembler::sub),
                                 static_cast<SVEArithPredicatedFn>(
                                     &Assembler::subr));
}

void MacroAssembler::Udiv(const ZRegister& zd,
                          const PRegisterM& pg,
                          const ZRegister& zn,
                          const ZRegister& zm) {
  VIXL_ASSERT(allow_macro_instructions_);
  NoncommutativeArithmeticHelper(zd,
                                 pg,
                                 zn,
                                 zm,
                                 static_cast<SVEArithPredicatedFn>(
                                     &Assembler::udiv),
                                 static_cast<SVEArithPredicatedFn>(
                                     &Assembler::udivr));
}

void MacroAssembler::SVELoadBroadcastImmHelper(const ZRegister& zt,
                                               const PRegisterZ& pg,
                                               const SVEMemOperand& addr,
                                               SVELoadBroadcastFn fn,
                                               int divisor) {
  VIXL_ASSERT(addr.IsScalarPlusImmediate());
  int64_t imm = addr.GetImmediateOffset();
  if ((imm % divisor == 0) && IsUint6(imm / divisor)) {
    SingleEmissionCheckScope guard(this);
    (this->*fn)(zt, pg, addr);
  } else {
    UseScratchRegisterScope temps(this);
    Register scratch = temps.AcquireX();
    CalculateSVEAddress(scratch, addr, zt);
    SingleEmissionCheckScope guard(this);
    (this->*fn)(zt, pg, SVEMemOperand(scratch));
  }
}

void MacroAssembler::SVELoadStoreScalarImmHelper(const CPURegister& rt,
                                                 const SVEMemOperand& addr,
                                                 SVELoadStoreFn fn) {
  VIXL_ASSERT(allow_macro_instructions_);
  VIXL_ASSERT(rt.IsZRegister() || rt.IsPRegister());

  if (addr.IsPlainScalar() ||
      (addr.IsScalarPlusImmediate() && IsInt9(addr.GetImmediateOffset()) &&
       addr.IsMulVl())) {
    SingleEmissionCheckScope guard(this);
    (this->*fn)(rt, addr);
    return;
  }

  if (addr.IsEquivalentToScalar()) {
    SingleEmissionCheckScope guard(this);
    (this->*fn)(rt, SVEMemOperand(addr.GetScalarBase()));
    return;
  }

  UseScratchRegisterScope temps(this);
  Register scratch = temps.AcquireX();
  CalculateSVEAddress(scratch, addr, rt);
  SingleEmissionCheckScope guard(this);
  (this->*fn)(rt, SVEMemOperand(scratch));
}

template <typename Tg, typename Tf>
void MacroAssembler::SVELoadStoreScalarImmHelper(
    const ZRegister& zt,
    const Tg& pg,
    const SVEMemOperand& addr,
    Tf fn,
    int imm_bits,
    int shift_amount,
    SVEOffsetModifier supported_modifier,
    int vl_divisor_log2) {
  VIXL_ASSERT(allow_macro_instructions_);
  int imm_divisor = 1 << shift_amount;

  if (addr.IsPlainScalar() ||
      (addr.IsScalarPlusImmediate() &&
       IsIntN(imm_bits, addr.GetImmediateOffset() / imm_divisor) &&
       ((addr.GetImmediateOffset() % imm_divisor) == 0) &&
       (addr.GetOffsetModifier() == supported_modifier))) {
    SingleEmissionCheckScope guard(this);
    (this->*fn)(zt, pg, addr);
    return;
  }

  if (addr.IsEquivalentToScalar()) {
    SingleEmissionCheckScope guard(this);
    (this->*fn)(zt, pg, SVEMemOperand(addr.GetScalarBase()));
    return;
  }

  if (addr.IsMulVl() && (supported_modifier != SVE_MUL_VL) &&
      (vl_divisor_log2 == -1)) {
    // We don't handle [x0, #imm, MUL VL] if the in-memory access size is not VL
    // dependent.
    VIXL_UNIMPLEMENTED();
  }

  UseScratchRegisterScope temps(this);
  Register scratch = temps.AcquireX();
  CalculateSVEAddress(scratch, addr, vl_divisor_log2);
  SingleEmissionCheckScope guard(this);
  (this->*fn)(zt, pg, SVEMemOperand(scratch));
}

template <typename Tg, typename Tf>
void MacroAssembler::SVELoadStore1Helper(int msize_in_bytes_log2,
                                         const ZRegister& zt,
                                         const Tg& pg,
                                         const SVEMemOperand& addr,
                                         Tf fn) {
  if (addr.IsPlainScalar() ||
      (addr.IsScalarPlusScalar() && !addr.GetScalarOffset().IsZero() &&
       addr.IsEquivalentToLSL(msize_in_bytes_log2)) ||
      (addr.IsScalarPlusImmediate() && IsInt4(addr.GetImmediateOffset()) &&
       addr.IsMulVl())) {
    SingleEmissionCheckScope guard(this);
    (this->*fn)(zt, pg, addr);
    return;
  }

  if (addr.IsEquivalentToScalar()) {
    SingleEmissionCheckScope guard(this);
    (this->*fn)(zt, pg, SVEMemOperand(addr.GetScalarBase()));
    return;
  }

  if (addr.IsVectorPlusImmediate()) {
    uint64_t offset = addr.GetImmediateOffset();
    if (IsMultiple(offset, (1 << msize_in_bytes_log2)) &&
        IsUint5(offset >> msize_in_bytes_log2)) {
      SingleEmissionCheckScope guard(this);
      (this->*fn)(zt, pg, addr);
      return;
    }
  }

  if (addr.IsScalarPlusVector()) {
    VIXL_ASSERT(addr.IsScatterGather());
    SingleEmissionCheckScope guard(this);
    (this->*fn)(zt, pg, addr);
    return;
  }

  UseScratchRegisterScope temps(this);
  if (addr.IsScatterGather()) {
    // In scatter-gather modes, zt and zn/zm have the same lane size. However,
    // for 32-bit accesses, the result of each lane's address calculation still
    // requires 64 bits; we can't naively use `Adr` for the address calculation
    // because it would truncate each address to 32 bits.

    if (addr.IsVectorPlusImmediate()) {
      // Synthesise the immediate in an X register, then use a
      // scalar-plus-vector access with the original vector.
      Register scratch = temps.AcquireX();
      Mov(scratch, addr.GetImmediateOffset());
      SingleEmissionCheckScope guard(this);
      SVEOffsetModifier om =
          zt.IsLaneSizeS() ? SVE_UXTW : NO_SVE_OFFSET_MODIFIER;
      (this->*fn)(zt, pg, SVEMemOperand(scratch, addr.GetVectorBase(), om));
      return;
    }

    VIXL_UNIMPLEMENTED();
  } else {
    Register scratch = temps.AcquireX();
    // TODO: If we have an immediate offset that is a multiple of
    // msize_in_bytes, we can use Rdvl/Rdpl and a scalar-plus-scalar form to
    // save an instruction.
    int vl_divisor_log2 = zt.GetLaneSizeInBytesLog2() - msize_in_bytes_log2;
    CalculateSVEAddress(scratch, addr, vl_divisor_log2);
    SingleEmissionCheckScope guard(this);
    (this->*fn)(zt, pg, SVEMemOperand(scratch));
  }
}
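
// Illustrative example (not in the original source): a contiguous access such
// as `Ld1w(z0.VnS(), p0.Zeroing(), SVEMemOperand(x0, 20, SVE_MUL_VL))` has an
// immediate outside the signed 4-bit range, so the address is first evaluated
// into a scratch X register via CalculateSVEAddress and the load is then
// issued as `ld1w { z0.s }, p0/z, [<scratch>]`.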

template <typename Tf>
void MacroAssembler::SVELoadFFHelper(int msize_in_bytes_log2,
                                     const ZRegister& zt,
                                     const PRegisterZ& pg,
                                     const SVEMemOperand& addr,
                                     Tf fn) {
  if (addr.IsScatterGather()) {
    // Scatter-gather first-fault loads share encodings with normal loads.
    SVELoadStore1Helper(msize_in_bytes_log2, zt, pg, addr, fn);
    return;
  }

  // Contiguous first-faulting loads have no scalar-plus-immediate form at all,
  // so we don't do immediate synthesis.

  // We cannot currently distinguish "[x0]" from "[x0, #0]", and this
  // is not "scalar-plus-scalar", so we have to permit `IsPlainScalar()` here.
  if (addr.IsPlainScalar() || (addr.IsScalarPlusScalar() &&
                               addr.IsEquivalentToLSL(msize_in_bytes_log2))) {
    SingleEmissionCheckScope guard(this);
    (this->*fn)(zt, pg, addr);
    return;
  }

  VIXL_UNIMPLEMENTED();
}
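
// Note (illustrative, not in the original source): because contiguous
// first-faulting loads have no immediate-offset form, something like
// `Ldff1b(z0.VnB(), p0.Zeroing(), SVEMemOperand(x0, 16))` is deliberately left
// unimplemented here rather than silently synthesised; callers are expected to
// use a register offset or a plain scalar base.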

void MacroAssembler::Ld1b(const ZRegister& zt,
                          const PRegisterZ& pg,
                          const SVEMemOperand& addr) {
  VIXL_ASSERT(allow_macro_instructions_);
  SVELoadStore1Helper(kBRegSizeInBytesLog2,
                      zt,
                      pg,
                      addr,
                      static_cast<SVELoad1Fn>(&Assembler::ld1b));
}

void MacroAssembler::Ld1h(const ZRegister& zt,
                          const PRegisterZ& pg,
                          const SVEMemOperand& addr) {
  VIXL_ASSERT(allow_macro_instructions_);
  SVELoadStore1Helper(kHRegSizeInBytesLog2,
                      zt,
                      pg,
                      addr,
                      static_cast<SVELoad1Fn>(&Assembler::ld1h));
}

void MacroAssembler::Ld1w(const ZRegister& zt,
                          const PRegisterZ& pg,
                          const SVEMemOperand& addr) {
  VIXL_ASSERT(allow_macro_instructions_);
  SVELoadStore1Helper(kWRegSizeInBytesLog2,
                      zt,
                      pg,
                      addr,
                      static_cast<SVELoad1Fn>(&Assembler::ld1w));
}

void MacroAssembler::Ld1d(const ZRegister& zt,
                          const PRegisterZ& pg,
                          const SVEMemOperand& addr) {
  VIXL_ASSERT(allow_macro_instructions_);
  SVELoadStore1Helper(kDRegSizeInBytesLog2,
                      zt,
                      pg,
                      addr,
                      static_cast<SVELoad1Fn>(&Assembler::ld1d));
}

void MacroAssembler::Ld1sb(const ZRegister& zt,
                           const PRegisterZ& pg,
                           const SVEMemOperand& addr) {
  VIXL_ASSERT(allow_macro_instructions_);
  SVELoadStore1Helper(kBRegSizeInBytesLog2,
                      zt,
                      pg,
                      addr,
                      static_cast<SVELoad1Fn>(&Assembler::ld1sb));
}

void MacroAssembler::Ld1sh(const ZRegister& zt,
                           const PRegisterZ& pg,
                           const SVEMemOperand& addr) {
  VIXL_ASSERT(allow_macro_instructions_);
  SVELoadStore1Helper(kHRegSizeInBytesLog2,
                      zt,
                      pg,
                      addr,
                      static_cast<SVELoad1Fn>(&Assembler::ld1sh));
}

void MacroAssembler::Ld1sw(const ZRegister& zt,
                           const PRegisterZ& pg,
                           const SVEMemOperand& addr) {
  VIXL_ASSERT(allow_macro_instructions_);
  SVELoadStore1Helper(kSRegSizeInBytesLog2,
                      zt,
                      pg,
                      addr,
                      static_cast<SVELoad1Fn>(&Assembler::ld1sw));
}

void MacroAssembler::St1b(const ZRegister& zt,
                          const PRegister& pg,
                          const SVEMemOperand& addr) {
  VIXL_ASSERT(allow_macro_instructions_);
  SVELoadStore1Helper(kBRegSizeInBytesLog2,
                      zt,
                      pg,
                      addr,
                      static_cast<SVEStore1Fn>(&Assembler::st1b));
}

void MacroAssembler::St1h(const ZRegister& zt,
                          const PRegister& pg,
                          const SVEMemOperand& addr) {
  VIXL_ASSERT(allow_macro_instructions_);
  SVELoadStore1Helper(kHRegSizeInBytesLog2,
                      zt,
                      pg,
                      addr,
                      static_cast<SVEStore1Fn>(&Assembler::st1h));
}

void MacroAssembler::St1w(const ZRegister& zt,
                          const PRegister& pg,
                          const SVEMemOperand& addr) {
  VIXL_ASSERT(allow_macro_instructions_);
  SVELoadStore1Helper(kSRegSizeInBytesLog2,
                      zt,
                      pg,
                      addr,
                      static_cast<SVEStore1Fn>(&Assembler::st1w));
}

void MacroAssembler::St1d(const ZRegister& zt,
                          const PRegister& pg,
                          const SVEMemOperand& addr) {
  VIXL_ASSERT(allow_macro_instructions_);
  SVELoadStore1Helper(kDRegSizeInBytesLog2,
                      zt,
                      pg,
                      addr,
                      static_cast<SVEStore1Fn>(&Assembler::st1d));
}

void MacroAssembler::Ldff1b(const ZRegister& zt,
                            const PRegisterZ& pg,
                            const SVEMemOperand& addr) {
  VIXL_ASSERT(allow_macro_instructions_);
  SVELoadFFHelper(kBRegSizeInBytesLog2,
                  zt,
                  pg,
                  addr,
                  static_cast<SVELoad1Fn>(&Assembler::ldff1b));
}

void MacroAssembler::Ldff1h(const ZRegister& zt,
                            const PRegisterZ& pg,
                            const SVEMemOperand& addr) {
  VIXL_ASSERT(allow_macro_instructions_);
  SVELoadFFHelper(kHRegSizeInBytesLog2,
                  zt,
                  pg,
                  addr,
                  static_cast<SVELoad1Fn>(&Assembler::ldff1h));
}

void MacroAssembler::Ldff1w(const ZRegister& zt,
                            const PRegisterZ& pg,
                            const SVEMemOperand& addr) {
  VIXL_ASSERT(allow_macro_instructions_);
  SVELoadFFHelper(kSRegSizeInBytesLog2,
                  zt,
                  pg,
                  addr,
                  static_cast<SVELoad1Fn>(&Assembler::ldff1w));
}

void MacroAssembler::Ldff1d(const ZRegister& zt,
                            const PRegisterZ& pg,
                            const SVEMemOperand& addr) {
  VIXL_ASSERT(allow_macro_instructions_);
  SVELoadFFHelper(kDRegSizeInBytesLog2,
                  zt,
                  pg,
                  addr,
                  static_cast<SVELoad1Fn>(&Assembler::ldff1d));
}

void MacroAssembler::Ldff1sb(const ZRegister& zt,
                             const PRegisterZ& pg,
                             const SVEMemOperand& addr) {
  VIXL_ASSERT(allow_macro_instructions_);
  SVELoadFFHelper(kBRegSizeInBytesLog2,
                  zt,
                  pg,
                  addr,
                  static_cast<SVELoad1Fn>(&Assembler::ldff1sb));
}

void MacroAssembler::Ldff1sh(const ZRegister& zt,
                             const PRegisterZ& pg,
                             const SVEMemOperand& addr) {
  VIXL_ASSERT(allow_macro_instructions_);
  SVELoadFFHelper(kHRegSizeInBytesLog2,
                  zt,
                  pg,
                  addr,
                  static_cast<SVELoad1Fn>(&Assembler::ldff1sh));
}

void MacroAssembler::Ldff1sw(const ZRegister& zt,
                             const PRegisterZ& pg,
                             const SVEMemOperand& addr) {
  VIXL_ASSERT(allow_macro_instructions_);
  SVELoadFFHelper(kSRegSizeInBytesLog2,
                  zt,
                  pg,
                  addr,
                  static_cast<SVELoad1Fn>(&Assembler::ldff1sw));
}

void MacroAssembler::Ld1rqb(const ZRegister& zt,
                            const PRegisterZ& pg,
                            const SVEMemOperand& addr) {
  VIXL_ASSERT(allow_macro_instructions_);
  SVELoadStoreScalarImmHelper(zt,
                              pg,
                              addr,
                              &MacroAssembler::ld1rqb,
                              4,
                              4,
                              NO_SVE_OFFSET_MODIFIER,
                              -1);
}

void MacroAssembler::Ld1rqd(const ZRegister& zt,
                            const PRegisterZ& pg,
                            const SVEMemOperand& addr) {
  VIXL_ASSERT(allow_macro_instructions_);
  SVELoadStoreScalarImmHelper(zt,
                              pg,
                              addr,
                              &MacroAssembler::ld1rqd,
                              4,
                              4,
                              NO_SVE_OFFSET_MODIFIER,
                              -1);
}

void MacroAssembler::Ld1rqh(const ZRegister& zt,
                            const PRegisterZ& pg,
                            const SVEMemOperand& addr) {
  VIXL_ASSERT(allow_macro_instructions_);
  SVELoadStoreScalarImmHelper(zt,
                              pg,
                              addr,
                              &MacroAssembler::ld1rqh,
                              4,
                              4,
                              NO_SVE_OFFSET_MODIFIER,
                              -1);
}

void MacroAssembler::Ld1rqw(const ZRegister& zt,
                            const PRegisterZ& pg,
                            const SVEMemOperand& addr) {
  VIXL_ASSERT(allow_macro_instructions_);
  SVELoadStoreScalarImmHelper(zt,
                              pg,
                              addr,
                              &MacroAssembler::ld1rqw,
                              4,
                              4,
                              NO_SVE_OFFSET_MODIFIER,
                              -1);
}

void MacroAssembler::Ldnt1b(const ZRegister& zt,
                            const PRegisterZ& pg,
                            const SVEMemOperand& addr) {
  VIXL_ASSERT(allow_macro_instructions_);
  SVELoadStoreScalarImmHelper(zt,
                              pg,
                              addr,
                              &MacroAssembler::ldnt1b,
                              4,
                              0,
                              SVE_MUL_VL);
}

void MacroAssembler::Ldnt1d(const ZRegister& zt,
                            const PRegisterZ& pg,
                            const SVEMemOperand& addr) {
  VIXL_ASSERT(allow_macro_instructions_);
  SVELoadStoreScalarImmHelper(zt,
                              pg,
                              addr,
                              &MacroAssembler::ldnt1d,
                              4,
                              0,
                              SVE_MUL_VL);
}

void MacroAssembler::Ldnt1h(const ZRegister& zt,
                            const PRegisterZ& pg,
                            const SVEMemOperand& addr) {
  VIXL_ASSERT(allow_macro_instructions_);
  SVELoadStoreScalarImmHelper(zt,
                              pg,
                              addr,
                              &MacroAssembler::ldnt1h,
                              4,
                              0,
                              SVE_MUL_VL);
}

void MacroAssembler::Ldnt1w(const ZRegister& zt,
                            const PRegisterZ& pg,
                            const SVEMemOperand& addr) {
  VIXL_ASSERT(allow_macro_instructions_);
  SVELoadStoreScalarImmHelper(zt,
                              pg,
                              addr,
                              &MacroAssembler::ldnt1w,
                              4,
                              0,
                              SVE_MUL_VL);
}

void MacroAssembler::Stnt1b(const ZRegister& zt,
                            const PRegister& pg,
                            const SVEMemOperand& addr) {
  VIXL_ASSERT(allow_macro_instructions_);
  SVELoadStoreScalarImmHelper(zt,
                              pg,
                              addr,
                              &MacroAssembler::stnt1b,
                              4,
                              0,
                              SVE_MUL_VL);
}

void MacroAssembler::Stnt1d(const ZRegister& zt,
                            const PRegister& pg,
                            const SVEMemOperand& addr) {
  VIXL_ASSERT(allow_macro_instructions_);
  SVELoadStoreScalarImmHelper(zt,
                              pg,
                              addr,
                              &MacroAssembler::stnt1d,
                              4,
                              0,
                              SVE_MUL_VL);
}

void MacroAssembler::Stnt1h(const ZRegister& zt,
                            const PRegister& pg,
                            const SVEMemOperand& addr) {
  VIXL_ASSERT(allow_macro_instructions_);
  SVELoadStoreScalarImmHelper(zt,
                              pg,
                              addr,
                              &MacroAssembler::stnt1h,
                              4,
                              0,
                              SVE_MUL_VL);
}

void MacroAssembler::Stnt1w(const ZRegister& zt,
                            const PRegister& pg,
                            const SVEMemOperand& addr) {
  VIXL_ASSERT(allow_macro_instructions_);
  SVELoadStoreScalarImmHelper(zt,
                              pg,
                              addr,
                              &MacroAssembler::stnt1w,
                              4,
                              0,
                              SVE_MUL_VL);
}
1632
void MacroAssembler::SVESdotUdotIndexHelper(IntArithIndexFn fn,
                                            const ZRegister& zd,
                                            const ZRegister& za,
                                            const ZRegister& zn,
                                            const ZRegister& zm,
                                            int index) {
  if (zd.Aliases(za)) {
    // zda = zda + (zn . zm[index])
    SingleEmissionCheckScope guard(this);
    (this->*fn)(zd, zn, zm, index);

  } else if (zd.Aliases(zn) || zd.Aliases(zm)) {
    // zdn = za + (zdn . zm[index])
    // zdm = za + (zn . zdm[index])
    // zdnm = za + (zdnm . zdnm[index])
    UseScratchRegisterScope temps(this);
    ZRegister scratch = temps.AcquireZ().WithSameLaneSizeAs(zd);
    {
      MovprfxHelperScope guard(this, scratch, za);
      (this->*fn)(scratch, zn, zm, index);
    }

    Mov(zd, scratch);
  } else {
    // zd = za + (zn . zm[index])
    MovprfxHelperScope guard(this, zd, za);
    (this->*fn)(zd, zn, zm, index);
  }
}

void MacroAssembler::SVESdotUdotHelper(IntArithFn fn,
                                       const ZRegister& zd,
                                       const ZRegister& za,
                                       const ZRegister& zn,
                                       const ZRegister& zm) {
  if (zd.Aliases(za)) {
    // zda = zda + (zn . zm)
    SingleEmissionCheckScope guard(this);
    (this->*fn)(zd, zn, zm);

  } else if (zd.Aliases(zn) || zd.Aliases(zm)) {
    // zdn = za + (zdn . zm)
    // zdm = za + (zn . zdm)
    // zdnm = za + (zdnm . zdnm)
    UseScratchRegisterScope temps(this);
    ZRegister scratch = temps.AcquireZ().WithSameLaneSizeAs(zd);
    {
      MovprfxHelperScope guard(this, scratch, za);
      (this->*fn)(scratch, zn, zm);
    }

    Mov(zd, scratch);
  } else {
    // zd = za + (zn . zm)
    MovprfxHelperScope guard(this, zd, za);
    (this->*fn)(zd, zn, zm);
  }
}

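// FSCALE is a destructive, predicated instruction (fscale zd, pg/m, zd, zm).
// When the macro's zd aliases zm but not zn, prefixing zd with zn would
// overwrite zm, so zm is copied to a scratch register first; every other case
// can use the destructive form directly.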
void MacroAssembler::Fscale(const ZRegister& zd,
                            const PRegisterM& pg,
                            const ZRegister& zn,
                            const ZRegister& zm) {
  VIXL_ASSERT(allow_macro_instructions_);
  if (zd.Aliases(zm) && !zd.Aliases(zn)) {
    UseScratchRegisterScope temps(this);
    ZRegister scratch = temps.AcquireZ().WithSameLaneSizeAs(zm);
    Mov(scratch, zm);
    MovprfxHelperScope guard(this, zd, pg, zn);
    fscale(zd, pg, zd, scratch);
  } else {
    MovprfxHelperScope guard(this, zd, pg, zn);
    fscale(zd, pg, zd, zm);
  }
}

void MacroAssembler::Sdot(const ZRegister& zd,
                          const ZRegister& za,
                          const ZRegister& zn,
                          const ZRegister& zm) {
  VIXL_ASSERT(allow_macro_instructions_);
  SVESdotUdotHelper(&Assembler::sdot, zd, za, zn, zm);
}

void MacroAssembler::Sdot(const ZRegister& zd,
                          const ZRegister& za,
                          const ZRegister& zn,
                          const ZRegister& zm,
                          int index) {
  VIXL_ASSERT(allow_macro_instructions_);
  SVESdotUdotIndexHelper(&Assembler::sdot, zd, za, zn, zm, index);
}

void MacroAssembler::Udot(const ZRegister& zd,
                          const ZRegister& za,
                          const ZRegister& zn,
                          const ZRegister& zm) {
  VIXL_ASSERT(allow_macro_instructions_);
  SVESdotUdotHelper(&Assembler::udot, zd, za, zn, zm);
}

void MacroAssembler::Udot(const ZRegister& zd,
                          const ZRegister& za,
                          const ZRegister& zn,
                          const ZRegister& zm,
                          int index) {
  VIXL_ASSERT(allow_macro_instructions_);
  SVESdotUdotIndexHelper(&Assembler::udot, zd, za, zn, zm, index);
}

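// The predicated FP multiply-add macros choose between two destructive
// encodings: the accumulator form (fmla/fmls/fnmla/fnmls, which overwrites the
// addend) and the multiplicand form (fmad/fmsb/fnmad/fnmsb, which overwrites
// one of the factors). The helper below picks whichever form lets zd alias an
// input directly. When zd aliases zm, the multiplicand form is only usable by
// swapping zn and zm, which is acceptable under FastNaNPropagation but not
// under StrictNaNPropagation, where a scratch register preserves the operand
// order instead.
//
// For illustration only (register and predicate choices are arbitrary), with
// no aliasing, Fmla(z0.VnD(), p0.Merging(), z1.VnD(), z2.VnD(), z3.VnD(),
// FastNaNPropagation) is expected to expand to roughly:
//   movprfx z0.d, p0/m, z1.d
//   fmla    z0.d, p0/m, z2.d, z3.d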
void MacroAssembler::FPMulAddHelper(const ZRegister& zd,
                                    const PRegisterM& pg,
                                    const ZRegister& za,
                                    const ZRegister& zn,
                                    const ZRegister& zm,
                                    SVEMulAddPredicatedZdaFn fn_zda,
                                    SVEMulAddPredicatedZdnFn fn_zdn,
                                    FPMacroNaNPropagationOption nan_option) {
  ResolveFPNaNPropagationOption(&nan_option);

  if (zd.Aliases(za)) {
    // zda = (-)zda + ((-)zn * zm) for fmla, fmls, fnmla and fnmls.
    SingleEmissionCheckScope guard(this);
    (this->*fn_zda)(zd, pg, zn, zm);
  } else if (zd.Aliases(zn)) {
    // zdn = (-)za + ((-)zdn * zm) for fmad, fmsb, fnmad and fnmsb.
    SingleEmissionCheckScope guard(this);
    (this->*fn_zdn)(zd, pg, zm, za);
  } else if (zd.Aliases(zm)) {
    switch (nan_option) {
      case FastNaNPropagation: {
        // We treat multiplication as commutative in the fast mode, so we can
        // swap zn and zm.
        // zdm = (-)za + ((-)zdm * zn) for fmad, fmsb, fnmad and fnmsb.
        SingleEmissionCheckScope guard(this);
        (this->*fn_zdn)(zd, pg, zn, za);
        return;
      }
      case StrictNaNPropagation: {
        UseScratchRegisterScope temps(this);
        // Use a scratch register to keep the argument order exactly as
        // specified.
        ZRegister scratch = temps.AcquireZ().WithSameLaneSizeAs(zn);
        {
          MovprfxHelperScope guard(this, scratch, pg, za);
          // scratch = (-)za + ((-)zn * zm)
          (this->*fn_zda)(scratch, pg, zn, zm);
        }
        Mov(zd, scratch);
        return;
      }
      case NoFPMacroNaNPropagationSelected:
        VIXL_UNREACHABLE();
        return;
    }
  } else {
    // zd = (-)za + ((-)zn * zm) for fmla, fmls, fnmla and fnmls.
    MovprfxHelperScope guard(this, zd, pg, za);
    (this->*fn_zda)(zd, pg, zn, zm);
  }
}

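// The indexed FP multiply-add form is unpredicated and always accumulates into
// its destination, so the expansion below only has to handle register
// aliasing; there is no NaN propagation option. For illustration only, with
// no aliasing, Fmla(z0.VnS(), z1.VnS(), z2.VnS(), z3.VnS(), 2) is expected to
// expand to roughly:
//   movprfx z0, z1
//   fmla    z0.s, z2.s, z3.s[2]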
void MacroAssembler::FPMulAddIndexHelper(SVEMulAddIndexFn fn,
                                         const ZRegister& zd,
                                         const ZRegister& za,
                                         const ZRegister& zn,
                                         const ZRegister& zm,
                                         int index) {
  if (zd.Aliases(za)) {
    // zda = zda + (zn * zm[i])
    SingleEmissionCheckScope guard(this);
    (this->*fn)(zd, zn, zm, index);

  } else if (zd.Aliases(zn) || zd.Aliases(zm)) {
    // zdn = za + (zdn * zm[i])
    // zdm = za + (zn * zdm[i])
    // zdnm = za + (zdnm * zdnm[i])
    UseScratchRegisterScope temps(this);
    ZRegister scratch = temps.AcquireZ().WithSameLaneSizeAs(zd);
    {
      MovprfxHelperScope guard(this, scratch, za);
      (this->*fn)(scratch, zn, zm, index);
    }
    Mov(zd, scratch);
  } else {
    // zd = za + (zn * zm[i])
    MovprfxHelperScope guard(this, zd, za);
    (this->*fn)(zd, zn, zm, index);
  }
}

void MacroAssembler::Fmla(const ZRegister& zd,
                          const PRegisterM& pg,
                          const ZRegister& za,
                          const ZRegister& zn,
                          const ZRegister& zm,
                          FPMacroNaNPropagationOption nan_option) {
  VIXL_ASSERT(allow_macro_instructions_);
  FPMulAddHelper(zd,
                 pg,
                 za,
                 zn,
                 zm,
                 &Assembler::fmla,
                 &Assembler::fmad,
                 nan_option);
}

void MacroAssembler::Fmla(const ZRegister& zd,
                          const ZRegister& za,
                          const ZRegister& zn,
                          const ZRegister& zm,
                          int index) {
  VIXL_ASSERT(allow_macro_instructions_);
  FPMulAddIndexHelper(&Assembler::fmla, zd, za, zn, zm, index);
}

void MacroAssembler::Fmls(const ZRegister& zd,
                          const PRegisterM& pg,
                          const ZRegister& za,
                          const ZRegister& zn,
                          const ZRegister& zm,
                          FPMacroNaNPropagationOption nan_option) {
  VIXL_ASSERT(allow_macro_instructions_);
  FPMulAddHelper(zd,
                 pg,
                 za,
                 zn,
                 zm,
                 &Assembler::fmls,
                 &Assembler::fmsb,
                 nan_option);
}

void MacroAssembler::Fmls(const ZRegister& zd,
                          const ZRegister& za,
                          const ZRegister& zn,
                          const ZRegister& zm,
                          int index) {
  VIXL_ASSERT(allow_macro_instructions_);
  FPMulAddIndexHelper(&Assembler::fmls, zd, za, zn, zm, index);
}

void MacroAssembler::Fnmla(const ZRegister& zd,
                           const PRegisterM& pg,
                           const ZRegister& za,
                           const ZRegister& zn,
                           const ZRegister& zm,
                           FPMacroNaNPropagationOption nan_option) {
  VIXL_ASSERT(allow_macro_instructions_);
  FPMulAddHelper(zd,
                 pg,
                 za,
                 zn,
                 zm,
                 &Assembler::fnmla,
                 &Assembler::fnmad,
                 nan_option);
}

void MacroAssembler::Fnmls(const ZRegister& zd,
                           const PRegisterM& pg,
                           const ZRegister& za,
                           const ZRegister& zn,
                           const ZRegister& zm,
                           FPMacroNaNPropagationOption nan_option) {
  VIXL_ASSERT(allow_macro_instructions_);
  FPMulAddHelper(zd,
                 pg,
                 za,
                 zn,
                 zm,
                 &Assembler::fnmls,
                 &Assembler::fnmsb,
                 nan_option);
}

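// FTMAD is destructive in its first source, and its two vector operands play
// different roles, so they cannot simply be swapped. When zd aliases zm but
// not zn, zm is copied to a scratch register before the movprfx-prefixed form
// is emitted.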
void MacroAssembler::Ftmad(const ZRegister& zd,
                           const ZRegister& zn,
                           const ZRegister& zm,
                           int imm3) {
  VIXL_ASSERT(allow_macro_instructions_);
  if (zd.Aliases(zm) && !zd.Aliases(zn)) {
    UseScratchRegisterScope temps(this);
    ZRegister scratch = temps.AcquireZ().WithSameLaneSizeAs(zm);
    Mov(scratch, zm);
    MovprfxHelperScope guard(this, zd, zn);
    ftmad(zd, zd, scratch, imm3);
  } else {
    MovprfxHelperScope guard(this, zd, zn);
    ftmad(zd, zd, zm, imm3);
  }
}

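// FCADD (complex add with the second source rotated by 90 or 270 degrees)
// exists only in a destructive, predicated form. When zd aliases zm but not
// zn, the whole operation is performed in a scratch register and the result
// is then moved into zd.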
void MacroAssembler::Fcadd(const ZRegister& zd,
                           const PRegisterM& pg,
                           const ZRegister& zn,
                           const ZRegister& zm,
                           int rot) {
  VIXL_ASSERT(allow_macro_instructions_);
  if (zd.Aliases(zm) && !zd.Aliases(zn)) {
    UseScratchRegisterScope temps(this);
    ZRegister scratch = temps.AcquireZ().WithSameLaneSizeAs(zd);
    {
      MovprfxHelperScope guard(this, scratch, pg, zn);
      fcadd(scratch, pg, scratch, zm, rot);
    }
    Mov(zd, scratch);
  } else {
    MovprfxHelperScope guard(this, zd, pg, zn);
    fcadd(zd, pg, zd, zm, rot);
  }
}

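// EXT concatenates the two source vectors and extracts a result starting at
// the given byte offset; it is destructive in its first source, so when zd
// aliases zm but not zn the extraction is done in a scratch register.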
void MacroAssembler::Ext(const ZRegister& zd,
                         const ZRegister& zn,
                         const ZRegister& zm,
                         unsigned offset) {
  VIXL_ASSERT(allow_macro_instructions_);
  if (zd.Aliases(zm) && !zd.Aliases(zn)) {
    // zd = ext(zn, zd, offset)
    UseScratchRegisterScope temps(this);
    ZRegister scratch = temps.AcquireZ().WithSameLaneSizeAs(zd);
    {
      MovprfxHelperScope guard(this, scratch, zn);
      ext(scratch, scratch, zm, offset);
    }
    Mov(zd, scratch);
  } else {
    // zd = ext(zn, zm, offset)
    // zd = ext(zd, zd, offset)
    MovprfxHelperScope guard(this, zd, zn);
    ext(zd, zd, zm, offset);
  }
}

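// SPLICE, CLASTA and CLASTB are likewise destructive in their first vector
// source. The three macros below share the same expansion strategy: use a
// scratch register when zd aliases zm (but not zn), and otherwise prefix zd
// with zn and emit the destructive form directly.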
void MacroAssembler::Splice(const ZRegister& zd,
                            const PRegister& pg,
                            const ZRegister& zn,
                            const ZRegister& zm) {
  VIXL_ASSERT(allow_macro_instructions_);
  if (zd.Aliases(zm) && !zd.Aliases(zn)) {
    UseScratchRegisterScope temps(this);
    ZRegister scratch = temps.AcquireZ().WithSameLaneSizeAs(zd);
    {
      MovprfxHelperScope guard(this, scratch, zn);
      splice(scratch, pg, scratch, zm);
    }
    Mov(zd, scratch);
  } else {
    MovprfxHelperScope guard(this, zd, zn);
    splice(zd, pg, zd, zm);
  }
}

void MacroAssembler::Clasta(const ZRegister& zd,
                            const PRegister& pg,
                            const ZRegister& zn,
                            const ZRegister& zm) {
  VIXL_ASSERT(allow_macro_instructions_);
  if (zd.Aliases(zm) && !zd.Aliases(zn)) {
    UseScratchRegisterScope temps(this);
    ZRegister scratch = temps.AcquireZ().WithSameLaneSizeAs(zd);
    {
      MovprfxHelperScope guard(this, scratch, zn);
      clasta(scratch, pg, scratch, zm);
    }
    Mov(zd, scratch);
  } else {
    MovprfxHelperScope guard(this, zd, zn);
    clasta(zd, pg, zd, zm);
  }
}

void MacroAssembler::Clastb(const ZRegister& zd,
                            const PRegister& pg,
                            const ZRegister& zn,
                            const ZRegister& zm) {
  VIXL_ASSERT(allow_macro_instructions_);
  if (zd.Aliases(zm) && !zd.Aliases(zn)) {
    UseScratchRegisterScope temps(this);
    ZRegister scratch = temps.AcquireZ().WithSameLaneSizeAs(zd);
    {
      MovprfxHelperScope guard(this, scratch, zn);
      clastb(scratch, pg, scratch, zm);
    }
    Mov(zd, scratch);
  } else {
    MovprfxHelperScope guard(this, zd, zn);
    clastb(zd, pg, zd, zm);
  }
}

}  // namespace aarch64
}  // namespace vixl