1 // Copyright 2019, VIXL authors
2 // All rights reserved.
3 //
4 // Redistribution and use in source and binary forms, with or without
5 // modification, are permitted provided that the following conditions are met:
6 //
7 // * Redistributions of source code must retain the above copyright notice,
8 // this list of conditions and the following disclaimer.
9 // * Redistributions in binary form must reproduce the above copyright notice,
10 // this list of conditions and the following disclaimer in the documentation
11 // and/or other materials provided with the distribution.
12 // * Neither the name of ARM Limited nor the names of its contributors may be
13 // used to endorse or promote products derived from this software without
14 // specific prior written permission.
15 //
16 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17 // ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18 // WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
20 // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22 // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23 // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24 // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26
27 #include "macro-assembler-aarch64.h"
28
29 namespace vixl {
30 namespace aarch64 {
31
32 void MacroAssembler::AddSubHelper(AddSubHelperOption option,
33 const ZRegister& zd,
34 const ZRegister& zn,
35 IntegerOperand imm) {
36 VIXL_ASSERT(imm.FitsInLane(zd));
37
38 // Simple, encodable cases.
39 if (TrySingleAddSub(option, zd, zn, imm)) return;
40
41 VIXL_ASSERT((option == kAddImmediate) || (option == kSubImmediate));
42 bool add_imm = (option == kAddImmediate);
43
44 // Try to translate Add(..., -imm) to Sub(..., imm) if we can encode it in one
45 // instruction. Also interpret the immediate as signed, so we can convert
46 // Add(zd.VnH(), zn.VnH(), 0xffff...) to Sub(..., 1), etc.
47 IntegerOperand signed_imm(imm.AsIntN(zd.GetLaneSizeInBits()));
48 if (signed_imm.IsNegative()) {
49 AddSubHelperOption n_option = add_imm ? kSubImmediate : kAddImmediate;
50 IntegerOperand n_imm(signed_imm.GetMagnitude());
51 // IntegerOperand can represent -INT_MIN, so this is always safe.
52 VIXL_ASSERT(n_imm.IsPositiveOrZero());
53 if (TrySingleAddSub(n_option, zd, zn, n_imm)) return;
54 }
55
56 // Otherwise, fall back to dup + ADD_z_z/SUB_z_z.
57 UseScratchRegisterScope temps(this);
58 ZRegister scratch = temps.AcquireZ().WithLaneSize(zn.GetLaneSizeInBits());
59 Dup(scratch, imm);
60
61 SingleEmissionCheckScope guard(this);
62 if (add_imm) {
63 add(zd, zn, scratch);
64 } else {
65 sub(zd, zn, scratch);
66 }
67 }
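// Illustrative expansions (a sketch; these assume the public Add/Sub
// Z-register-with-immediate macros route through AddSubHelper, and the exact
// sequence depends on the operands and on scratch allocation; <zt> is a
// placeholder scratch register):
//   Add(z0.VnS(), z1.VnS(), 42)   ->  movprfx z0, z1; add z0.s, z0.s, #42
//   Add(z0.VnS(), z1.VnS(), -1)   ->  movprfx z0, z1; sub z0.s, z0.s, #1
//   Add(z0.VnS(), z1.VnS(), 12345678)
//       ->  Dup(<zt>.VnS(), 12345678); add z0.s, z1.s, <zt>.s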
68
69 bool MacroAssembler::TrySingleAddSub(AddSubHelperOption option,
70 const ZRegister& zd,
71 const ZRegister& zn,
72 IntegerOperand imm) {
73 VIXL_ASSERT(imm.FitsInLane(zd));
74
75 int imm8;
76 int shift = -1;
77 if (imm.TryEncodeAsShiftedUintNForLane<8, 0>(zd, &imm8, &shift) ||
78 imm.TryEncodeAsShiftedUintNForLane<8, 8>(zd, &imm8, &shift)) {
79 MovprfxHelperScope guard(this, zd, zn);
80 switch (option) {
81 case kAddImmediate:
82 add(zd, zd, imm8, shift);
83 return true;
84 case kSubImmediate:
85 sub(zd, zd, imm8, shift);
86 return true;
87 }
88 }
89 return false;
90 }
91
92 void MacroAssembler::IntWideImmHelper(IntArithImmFn imm_fn,
93 SVEArithPredicatedFn reg_macro,
94 const ZRegister& zd,
95 const ZRegister& zn,
96 IntegerOperand imm,
97 bool is_signed) {
98 if (is_signed) {
99 // E.g. MUL_z_zi, SMIN_z_zi, SMAX_z_zi
100 if (imm.IsInt8()) {
101 MovprfxHelperScope guard(this, zd, zn);
102 (this->*imm_fn)(zd, zd, imm.AsInt8());
103 return;
104 }
105 } else {
106 // E.g. UMIN_z_zi, UMAX_z_zi
107 if (imm.IsUint8()) {
108 MovprfxHelperScope guard(this, zd, zn);
109 (this->*imm_fn)(zd, zd, imm.AsUint8());
110 return;
111 }
112 }
113
114 UseScratchRegisterScope temps(this);
115 PRegister pg = temps.AcquireGoverningP();
116 Ptrue(pg.WithSameLaneSizeAs(zd));
117
118 // Try to re-use zd if we can, so we can avoid a movprfx.
119 ZRegister scratch =
120 zd.Aliases(zn) ? temps.AcquireZ().WithLaneSize(zn.GetLaneSizeInBits())
121 : zd;
122 Dup(scratch, imm);
123
124 // The vector-form macro for commutative operations will swap the arguments to
125 // avoid movprfx, if necessary.
126 (this->*reg_macro)(zd, pg.Merging(), zn, scratch);
127 }
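// Usage sketch (illustrative; the Mul/Smin/Smax/Umax/Umin wrappers below all
// funnel into IntWideImmHelper; <pt> is a placeholder scratch predicate):
//   Mul(z0.VnH(), z1.VnH(), 3)
//       ->  movprfx z0, z1; mul z0.h, z0.h, #3
//   Mul(z0.VnH(), z1.VnH(), 300)   // does not fit the signed 8-bit form
//       ->  ptrue <pt>.h; Dup(z0.VnH(), 300); mul z0.h, <pt>/m, z0.h, z1.h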
128
129 void MacroAssembler::Mul(const ZRegister& zd,
130 const ZRegister& zn,
131 IntegerOperand imm) {
132 VIXL_ASSERT(allow_macro_instructions_);
133 IntArithImmFn imm_fn = &Assembler::mul;
134 SVEArithPredicatedFn reg_fn = &MacroAssembler::Mul;
135 IntWideImmHelper(imm_fn, reg_fn, zd, zn, imm, true);
136 }
137
138 void MacroAssembler::Smin(const ZRegister& zd,
139 const ZRegister& zn,
140 IntegerOperand imm) {
141 VIXL_ASSERT(allow_macro_instructions_);
142 VIXL_ASSERT(imm.FitsInSignedLane(zd));
143 IntArithImmFn imm_fn = &Assembler::smin;
144 SVEArithPredicatedFn reg_fn = &MacroAssembler::Smin;
145 IntWideImmHelper(imm_fn, reg_fn, zd, zn, imm, true);
146 }
147
148 void MacroAssembler::Smax(const ZRegister& zd,
149 const ZRegister& zn,
150 IntegerOperand imm) {
151 VIXL_ASSERT(allow_macro_instructions_);
152 VIXL_ASSERT(imm.FitsInSignedLane(zd));
153 IntArithImmFn imm_fn = &Assembler::smax;
154 SVEArithPredicatedFn reg_fn = &MacroAssembler::Smax;
155 IntWideImmHelper(imm_fn, reg_fn, zd, zn, imm, true);
156 }
157
158 void MacroAssembler::Umax(const ZRegister& zd,
159 const ZRegister& zn,
160 IntegerOperand imm) {
161 VIXL_ASSERT(allow_macro_instructions_);
162 VIXL_ASSERT(imm.FitsInUnsignedLane(zd));
163 IntArithImmFn imm_fn = &Assembler::umax;
164 SVEArithPredicatedFn reg_fn = &MacroAssembler::Umax;
165 IntWideImmHelper(imm_fn, reg_fn, zd, zn, imm, false);
166 }
167
168 void MacroAssembler::Umin(const ZRegister& zd,
169 const ZRegister& zn,
170 IntegerOperand imm) {
171 VIXL_ASSERT(allow_macro_instructions_);
172 VIXL_ASSERT(imm.FitsInUnsignedLane(zd));
173 IntArithImmFn imm_fn = &Assembler::umin;
174 SVEArithPredicatedFn reg_fn = &MacroAssembler::Umin;
175 IntWideImmHelper(imm_fn, reg_fn, zd, zn, imm, false);
176 }
177
178 void MacroAssembler::Addpl(const Register& xd,
179 const Register& xn,
180 int64_t multiplier) {
181 VIXL_ASSERT(allow_macro_instructions_);
182
183 // This macro relies on `Rdvl` to handle some out-of-range cases. Check that
184 // `VL * multiplier` cannot overflow, for any possible value of VL.
185 VIXL_ASSERT(multiplier <= (INT64_MAX / kZRegMaxSizeInBytes));
186 VIXL_ASSERT(multiplier >= (INT64_MIN / kZRegMaxSizeInBytes));
187
188 if (xd.IsZero()) return;
189 if (xn.IsZero() && xd.IsSP()) {
190 // TODO: This operation doesn't make much sense, but we could support it
191 // with a scratch register if necessary.
192 VIXL_UNIMPLEMENTED();
193 }
194
195 // Handling xzr requires an extra move, so defer it until later so we can try
196 // to use `rdvl` instead (via `Addvl`).
197 if (IsInt6(multiplier) && !xn.IsZero()) {
198 SingleEmissionCheckScope guard(this);
199 addpl(xd, xn, static_cast<int>(multiplier));
200 return;
201 }
202
203 // If `multiplier` is a multiple of 8, we can use `Addvl` instead.
204 if ((multiplier % kZRegBitsPerPRegBit) == 0) {
205 Addvl(xd, xn, multiplier / kZRegBitsPerPRegBit);
206 return;
207 }
208
209 if (IsInt6(multiplier)) {
210 VIXL_ASSERT(xn.IsZero()); // Other cases were handled with `addpl`.
211 // There is no simple `rdpl` instruction, and `addpl` cannot accept xzr, so
212 // materialise a zero.
213 MacroEmissionCheckScope guard(this);
214 movz(xd, 0);
215 addpl(xd, xd, static_cast<int>(multiplier));
216 return;
217 }
218
219 // TODO: Some plausible cases result in rather long sequences. For example,
220 // `Addpl(sp, sp, 33)` requires five instructions, even though it's only just
221 // outside the encodable range. We should look for ways to cover such cases
222 // without drastically increasing the complexity of this logic.
223
224 // For other cases, calculate xn + (PL * multiplier) using discrete
225 // instructions. This requires two scratch registers in the general case, so
226 // try to re-use the destination as a scratch register.
227 UseScratchRegisterScope temps(this);
228 temps.Include(xd);
229 temps.Exclude(xn);
230
231 Register scratch = temps.AcquireX();
232 // There is no `rdpl`, so we have to calculate PL from VL. We can't
233 // scale the multiplier because (we already know) it isn't a multiple of 8.
234 Rdvl(scratch, multiplier);
235
236 MacroEmissionCheckScope guard(this);
237 if (xn.IsZero()) {
238 asr(xd, scratch, kZRegBitsPerPRegBitLog2);
239 } else if (xd.IsSP() || xn.IsSP()) {
240 // TODO: MacroAssembler::Add should be able to handle this.
241 asr(scratch, scratch, kZRegBitsPerPRegBitLog2);
242 add(xd, xn, scratch);
243 } else {
244 add(xd, xn, Operand(scratch, ASR, kZRegBitsPerPRegBitLog2));
245 }
246 }
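// Illustrative expansions (a sketch; PL is VL / 8, and <xt> is a placeholder
// scratch register):
//   Addpl(x0, x1, 3)    ->  addpl x0, x1, #3
//   Addpl(x0, x1, 40)   ->  addvl x0, x1, #5            (40 PL == 5 VL)
//   Addpl(x0, xzr, 3)   ->  movz x0, #0; addpl x0, x0, #3
//   Addpl(x0, x1, 33)   ->  Rdvl(<xt>, 33), then add x0, x1, <xt>, asr #3
//                           (several instructions; see the TODO above)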
247
248 void MacroAssembler::Addvl(const Register& xd,
249 const Register& xn,
250 int64_t multiplier) {
251 VIXL_ASSERT(allow_macro_instructions_);
252 VIXL_ASSERT(xd.IsX());
253 VIXL_ASSERT(xn.IsX());
254
255 // Check that `VL * multiplier` cannot overflow, for any possible value of VL.
256 VIXL_ASSERT(multiplier <= (INT64_MAX / kZRegMaxSizeInBytes));
257 VIXL_ASSERT(multiplier >= (INT64_MIN / kZRegMaxSizeInBytes));
258
259 if (xd.IsZero()) return;
260 if (xn.IsZero() && xd.IsSP()) {
261 // TODO: This operation doesn't make much sense, but we could support it
262 // with a scratch register if necessary. `rdvl` cannot write into `sp`.
263 VIXL_UNIMPLEMENTED();
264 }
265
266 if (IsInt6(multiplier)) {
267 SingleEmissionCheckScope guard(this);
268 if (xn.IsZero()) {
269 rdvl(xd, static_cast<int>(multiplier));
270 } else {
271 addvl(xd, xn, static_cast<int>(multiplier));
272 }
273 return;
274 }
275
276 // TODO: Some plausible cases result in rather long sequences. For example,
277 // `Addvl(sp, sp, 42)` requires four instructions, even though it's only just
278 // outside the encodable range. We should look for ways to cover such cases
279 // without drastically increasing the complexity of this logic.
280
281 // For other cases, calculate xn + (VL * multiplier) using discrete
282 // instructions. This requires two scratch registers in the general case, so
283 // we try to re-use the destination as a scratch register.
284 UseScratchRegisterScope temps(this);
285 temps.Include(xd);
286 temps.Exclude(xn);
287
288 Register a = temps.AcquireX();
289 Mov(a, multiplier);
290
291 MacroEmissionCheckScope guard(this);
292 Register b = temps.AcquireX();
293 rdvl(b, 1);
294 if (xn.IsZero()) {
295 mul(xd, a, b);
296 } else if (xd.IsSP() || xn.IsSP()) {
297 mul(a, a, b);
298 add(xd, xn, a);
299 } else {
300 madd(xd, a, b, xn);
301 }
302 }
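// Illustrative expansions (a sketch; <t0>/<t1> are placeholder scratch
// registers):
//   Addvl(x0, x1, 2)    ->  addvl x0, x1, #2
//   Addvl(x0, xzr, -4)  ->  rdvl x0, #-4
//   Addvl(x0, x1, 42)   ->  mov <t0>, #42; rdvl <t1>, #1; madd x0, <t0>, <t1>, x1
//   Addvl(sp, sp, 42)   ->  mov <t0>, #42; rdvl <t1>, #1; mul <t0>, <t0>, <t1>;
//                           add sp, sp, <t0>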
303
304 void MacroAssembler::CalculateSVEAddress(const Register& xd,
305 const SVEMemOperand& addr,
306 int vl_divisor_log2) {
307 VIXL_ASSERT(allow_macro_instructions_);
308 VIXL_ASSERT(!addr.IsScatterGather());
309 VIXL_ASSERT(xd.IsX());
310
311 // The lower bound is where a whole Z register is accessed.
312 VIXL_ASSERT(!addr.IsMulVl() || (vl_divisor_log2 >= 0));
313 // The upper bound is for P register accesses, and for instructions like
314 // "st1b { z0.d } [...]", where one byte is accessed for every D-sized lane.
315 VIXL_ASSERT(vl_divisor_log2 <= static_cast<int>(kZRegBitsPerPRegBitLog2));
316
317 SVEOffsetModifier mod = addr.GetOffsetModifier();
318 Register base = addr.GetScalarBase();
319
320 if (addr.IsEquivalentToScalar()) {
321 // For example:
322 // [x0]
323 // [x0, #0]
324 // [x0, xzr, LSL 2]
325 Mov(xd, base);
326 } else if (addr.IsScalarPlusImmediate()) {
327 // For example:
328 // [x0, #42]
329 // [x0, #42, MUL VL]
330 int64_t offset = addr.GetImmediateOffset();
331 VIXL_ASSERT(offset != 0); // Handled by IsEquivalentToScalar.
332 if (addr.IsMulVl()) {
333 int vl_divisor = 1 << vl_divisor_log2;
334 // For all possible values of vl_divisor, we can simply use `Addpl`. This
335 // will select `addvl` if necessary.
336 VIXL_ASSERT((kZRegBitsPerPRegBit % vl_divisor) == 0);
337 Addpl(xd, base, offset * (kZRegBitsPerPRegBit / vl_divisor));
338 } else {
339 // IsScalarPlusImmediate() ensures that no other modifiers can occur.
340 VIXL_ASSERT(mod == NO_SVE_OFFSET_MODIFIER);
341 Add(xd, base, offset);
342 }
343 } else if (addr.IsScalarPlusScalar()) {
344 // For example:
345 // [x0, x1]
346 // [x0, x1, LSL #4]
347 Register offset = addr.GetScalarOffset();
348 VIXL_ASSERT(!offset.IsZero()); // Handled by IsEquivalentToScalar.
349 if (mod == SVE_LSL) {
350 Add(xd, base, Operand(offset, LSL, addr.GetShiftAmount()));
351 } else {
352 // IsScalarPlusScalar() ensures that no other modifiers can occur.
353 VIXL_ASSERT(mod == NO_SVE_OFFSET_MODIFIER);
354 Add(xd, base, offset);
355 }
356 } else {
357 // All other forms are scatter-gather addresses, which cannot be evaluated
358 // into an X register.
359 VIXL_UNREACHABLE();
360 }
361 }
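// Illustrative results (a sketch; each line shows the address form being
// evaluated into xd, assuming vl_divisor_log2 == 0 for the MUL VL case):
//   [x0]               ->  mov xd, x0
//   [x0, #42]          ->  add xd, x0, #42
//   [x0, #3, MUL VL]   ->  addpl xd, x0, #24      (3 * VL expressed in PL units)
//   [x0, x1, LSL #2]   ->  add xd, x0, x1, lsl #2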
362
363 void MacroAssembler::Cpy(const ZRegister& zd,
364 const PRegister& pg,
365 IntegerOperand imm) {
366 VIXL_ASSERT(allow_macro_instructions_);
367 VIXL_ASSERT(imm.FitsInLane(zd));
368 int imm8;
369 int shift;
370 if (imm.TryEncodeAsShiftedIntNForLane<8, 0>(zd, &imm8, &shift) ||
371 imm.TryEncodeAsShiftedIntNForLane<8, 8>(zd, &imm8, &shift)) {
372 SingleEmissionCheckScope guard(this);
373 cpy(zd, pg, imm8, shift);
374 return;
375 }
376
377 // The fallbacks rely on `cpy` variants that only support merging predication.
378 // If zeroing predication was requested, zero the destination first.
379 if (pg.IsZeroing()) {
380 SingleEmissionCheckScope guard(this);
381 dup(zd, 0);
382 }
383 PRegisterM pg_m = pg.Merging();
384
385 // Try to encode the immediate using fcpy.
386 VIXL_ASSERT(imm.FitsInLane(zd));
387 if (zd.GetLaneSizeInBits() >= kHRegSize) {
388 double fp_imm = 0.0;
389 switch (zd.GetLaneSizeInBits()) {
390 case kHRegSize:
391 fp_imm =
392 FPToDouble(RawbitsToFloat16(imm.AsUint16()), kIgnoreDefaultNaN);
393 break;
394 case kSRegSize:
395 fp_imm = RawbitsToFloat(imm.AsUint32());
396 break;
397 case kDRegSize:
398 fp_imm = RawbitsToDouble(imm.AsUint64());
399 break;
400 default:
401 VIXL_UNREACHABLE();
402 break;
403 }
404 // IsImmFP64 is equivalent to IsImmFP<n> for the same arithmetic value, so
405 // we can use IsImmFP64 for all lane sizes.
406 if (IsImmFP64(fp_imm)) {
407 SingleEmissionCheckScope guard(this);
408 fcpy(zd, pg_m, fp_imm);
409 return;
410 }
411 }
412
413 // Fall back to using a scratch register.
414 UseScratchRegisterScope temps(this);
415 Register scratch = temps.AcquireRegisterToHoldLane(zd);
416 Mov(scratch, imm);
417
418 SingleEmissionCheckScope guard(this);
419 cpy(zd, pg_m, scratch);
420 }
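// Usage sketch (illustrative; <xt> is a placeholder scratch register):
//   Cpy(z0.VnH(), p0.Merging(), 0x1200)
//       ->  cpy z0.h, p0/m, #18, lsl #8
//   Cpy(z0.VnS(), p0.Zeroing(), 0x3f800000)      // the raw bits of 1.0f
//       ->  dup z0.s, #0; fcpy z0.s, p0/m, #1.0
//   Cpy(z0.VnD(), p0.Merging(), 0x123456789)
//       ->  mov <xt>, #0x123456789 (movz/movk as needed); cpy z0.d, p0/m, <xt>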
421
422 // TODO: We implement Fcpy (amongst other things) for all FP types because it
423 // allows us to preserve user-specified NaNs. We should come up with some
424 // FPImmediate type to abstract this, and avoid all the duplication below (and
425 // elsewhere).
426
427 void MacroAssembler::Fcpy(const ZRegister& zd,
428 const PRegisterM& pg,
429 double imm) {
430 VIXL_ASSERT(allow_macro_instructions_);
431 VIXL_ASSERT(pg.IsMerging());
432
433 if (IsImmFP64(imm)) {
434 SingleEmissionCheckScope guard(this);
435 fcpy(zd, pg, imm);
436 return;
437 }
438
439 // As a fall-back, cast the immediate to the required lane size, and try to
440 // encode the bit pattern using `Cpy`.
441 Cpy(zd, pg, FPToRawbitsWithSize(zd.GetLaneSizeInBits(), imm));
442 }
443
444 void MacroAssembler::Fcpy(const ZRegister& zd,
445 const PRegisterM& pg,
446 float imm) {
447 VIXL_ASSERT(allow_macro_instructions_);
448 VIXL_ASSERT(pg.IsMerging());
449
450 if (IsImmFP32(imm)) {
451 SingleEmissionCheckScope guard(this);
452 fcpy(zd, pg, imm);
453 return;
454 }
455
456 // As a fall-back, cast the immediate to the required lane size, and try to
457 // encode the bit pattern using `Cpy`.
458 Cpy(zd, pg, FPToRawbitsWithSize(zd.GetLaneSizeInBits(), imm));
459 }
460
461 void MacroAssembler::Fcpy(const ZRegister& zd,
462 const PRegisterM& pg,
463 Float16 imm) {
464 VIXL_ASSERT(allow_macro_instructions_);
465 VIXL_ASSERT(pg.IsMerging());
466
467 if (IsImmFP16(imm)) {
468 SingleEmissionCheckScope guard(this);
469 fcpy(zd, pg, imm);
470 return;
471 }
472
473 // As a fall-back, cast the immediate to the required lane size, and try to
474 // encode the bit pattern using `Cpy`.
475 Cpy(zd, pg, FPToRawbitsWithSize(zd.GetLaneSizeInBits(), imm));
476 }
477
478 void MacroAssembler::Dup(const ZRegister& zd, IntegerOperand imm) {
479 VIXL_ASSERT(allow_macro_instructions_);
480 VIXL_ASSERT(imm.FitsInLane(zd));
481 unsigned lane_size = zd.GetLaneSizeInBits();
482 int imm8;
483 int shift;
484 if (imm.TryEncodeAsShiftedIntNForLane<8, 0>(zd, &imm8, &shift) ||
485 imm.TryEncodeAsShiftedIntNForLane<8, 8>(zd, &imm8, &shift)) {
486 SingleEmissionCheckScope guard(this);
487 dup(zd, imm8, shift);
488 } else if (IsImmLogical(imm.AsUintN(lane_size), lane_size)) {
489 SingleEmissionCheckScope guard(this);
490 dupm(zd, imm.AsUintN(lane_size));
491 } else {
492 UseScratchRegisterScope temps(this);
493 Register scratch = temps.AcquireRegisterToHoldLane(zd);
494 Mov(scratch, imm);
495
496 SingleEmissionCheckScope guard(this);
497 dup(zd, scratch);
498 }
499 }
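// Illustrative expansions of the three paths above (a sketch; <xt> is a
// placeholder scratch register):
//   Dup(z0.VnB(), 0x55)        ->  dup z0.b, #85                (8-bit immediate)
//   Dup(z0.VnH(), 0x7f00)      ->  dup z0.h, #127, lsl #8
//   Dup(z0.VnS(), 0x00ff00ff)  ->  dupm z0.s, #0xff00ff         (bitmask immediate)
//   Dup(z0.VnD(), 0x12d687)    ->  mov <xt>, #0x12d687 (movz/movk); dup z0.d, <xt>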
500
501 void MacroAssembler::NoncommutativeArithmeticHelper(
502 const ZRegister& zd,
503 const PRegisterM& pg,
504 const ZRegister& zn,
505 const ZRegister& zm,
506 SVEArithPredicatedFn fn,
507 SVEArithPredicatedFn rev_fn) {
508 if (zd.Aliases(zn)) {
509 // E.g. zd = zd / zm
510 SingleEmissionCheckScope guard(this);
511 (this->*fn)(zd, pg, zn, zm);
512 } else if (zd.Aliases(zm)) {
513 // E.g. zd = zn / zd
514 SingleEmissionCheckScope guard(this);
515 (this->*rev_fn)(zd, pg, zm, zn);
516 } else {
517 // E.g. zd = zn / zm
518 MovprfxHelperScope guard(this, zd, pg, zn);
519 (this->*fn)(zd, pg, zd, zm);
520 }
521 }
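// Usage sketch (illustrative, via the Fsub wrapper generated further below):
//   Fsub(z0.VnS(), p0.Merging(), z0.VnS(), z1.VnS())   // zd aliases zn
//       ->  fsub z0.s, p0/m, z0.s, z1.s
//   Fsub(z0.VnS(), p0.Merging(), z1.VnS(), z0.VnS())   // zd aliases zm
//       ->  fsubr z0.s, p0/m, z0.s, z1.s
//   Fsub(z0.VnS(), p0.Merging(), z1.VnS(), z2.VnS())
//       ->  movprfx z0.s, p0/m, z1.s; fsub z0.s, p0/m, z0.s, z2.s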
522
523 void MacroAssembler::FPCommutativeArithmeticHelper(
524 const ZRegister& zd,
525 const PRegisterM& pg,
526 const ZRegister& zn,
527 const ZRegister& zm,
528 SVEArithPredicatedFn fn,
529 FPMacroNaNPropagationOption nan_option) {
530 ResolveFPNaNPropagationOption(&nan_option);
531
532 if (zd.Aliases(zn)) {
533 SingleEmissionCheckScope guard(this);
534 (this->*fn)(zd, pg, zd, zm);
535 } else if (zd.Aliases(zm)) {
536 switch (nan_option) {
537 case FastNaNPropagation: {
538 // Swap the arguments.
539 SingleEmissionCheckScope guard(this);
540 (this->*fn)(zd, pg, zd, zn);
541 return;
542 }
543 case StrictNaNPropagation: {
544 UseScratchRegisterScope temps(this);
545 // Use a scratch register to keep the argument order exactly as
546 // specified.
547 ZRegister scratch = temps.AcquireZ().WithSameLaneSizeAs(zn);
548 {
549 MovprfxHelperScope guard(this, scratch, pg, zn);
550 (this->*fn)(scratch, pg, scratch, zm);
551 }
552 Mov(zd, scratch);
553 return;
554 }
555 case NoFPMacroNaNPropagationSelected:
556 VIXL_UNREACHABLE();
557 return;
558 }
559 } else {
560 MovprfxHelperScope guard(this, zd, pg, zn);
561 (this->*fn)(zd, pg, zd, zm);
562 }
563 }
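// Usage sketch (illustrative, via the Fmul wrapper below; <zt> is a placeholder
// scratch register). When zd aliases zm, FastNaNPropagation allows the operands
// to be swapped, while StrictNaNPropagation preserves the written operand order:
//   Fmul(z0.VnS(), p0.Merging(), z1.VnS(), z0.VnS(), FastNaNPropagation)
//       ->  fmul z0.s, p0/m, z0.s, z1.s
//   Fmul(z0.VnS(), p0.Merging(), z1.VnS(), z0.VnS(), StrictNaNPropagation)
//       ->  movprfx <zt>.s, p0/m, z1.s; fmul <zt>.s, p0/m, <zt>.s, z0.s;
//           mov z0.d, <zt>.d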
564
565 // Instructions of the form "inst zda, zn, zm, #num", which are non-commutative
566 // and for which no reversed form is provided.
567 #define VIXL_SVE_NONCOMM_ARITH_ZZZZI_LIST(V) \
568 V(Cmla, cmla) \
569 V(Sqrdcmlah, sqrdcmlah)
570
571 #define VIXL_DEFINE_MASM_FUNC(MASMFN, ASMFN) \
572 void MacroAssembler::MASMFN(const ZRegister& zd, \
573 const ZRegister& za, \
574 const ZRegister& zn, \
575 const ZRegister& zm, \
576 int imm) { \
577 if ((zd.Aliases(zn) || zd.Aliases(zm)) && !zd.Aliases(za)) { \
578 UseScratchRegisterScope temps(this); \
579 VIXL_ASSERT(AreSameLaneSize(zn, zm)); \
580 ZRegister ztmp = temps.AcquireZ().WithSameLaneSizeAs(zn); \
581 Mov(ztmp, zd.Aliases(zn) ? zn : zm); \
582 MovprfxHelperScope guard(this, zd, za); \
583 ASMFN(zd, \
584 (zd.Aliases(zn) ? ztmp : zn), \
585 (zd.Aliases(zm) ? ztmp : zm), \
586 imm); \
587 } else { \
588 MovprfxHelperScope guard(this, zd, za); \
589 ASMFN(zd, zn, zm, imm); \
590 } \
591 }
592 VIXL_SVE_NONCOMM_ARITH_ZZZZI_LIST(VIXL_DEFINE_MASM_FUNC)
593 #undef VIXL_DEFINE_MASM_FUNC
594
595 // Instructions of the form "inst zda, zn, zm, #num, #num", which are
596 // non-commutative and for which no reversed form is provided.
597 #define VIXL_SVE_NONCOMM_ARITH_ZZZZII_LIST(V) \
598 V(Cmla, cmla) \
599 V(Sqrdcmlah, sqrdcmlah)
600
601 // This doesn't handle zm when it is outside the range that can be encoded in
602 // the instruction; the range depends on the element size: z0-z7 for H, z0-z15 for S.
603 #define VIXL_DEFINE_MASM_FUNC(MASMFN, ASMFN) \
604 void MacroAssembler::MASMFN(const ZRegister& zd, \
605 const ZRegister& za, \
606 const ZRegister& zn, \
607 const ZRegister& zm, \
608 int index, \
609 int rot) { \
610 if ((zd.Aliases(zn) || zd.Aliases(zm)) && !zd.Aliases(za)) { \
611 UseScratchRegisterScope temps(this); \
612 ZRegister ztmp = temps.AcquireZ().WithSameLaneSizeAs(zd); \
613 { \
614 MovprfxHelperScope guard(this, ztmp, za); \
615 ASMFN(ztmp, zn, zm, index, rot); \
616 } \
617 Mov(zd, ztmp); \
618 } else { \
619 MovprfxHelperScope guard(this, zd, za); \
620 ASMFN(zd, zn, zm, index, rot); \
621 } \
622 }
623 VIXL_SVE_NONCOMM_ARITH_ZZZZII_LIST(VIXL_DEFINE_MASM_FUNC)
624 #undef VIXL_DEFINE_MASM_FUNC
625
626 // Instructions of the form "inst zda, pg, zda, zn", which are non-commutative
627 // and for which no reversed form is provided.
628 #define VIXL_SVE_NONCOMM_ARITH_ZPZZ_LIST(V) \
629 V(Addp, addp) \
630 V(Bic, bic) \
631 V(Faddp, faddp) \
632 V(Fmaxnmp, fmaxnmp) \
633 V(Fminnmp, fminnmp) \
634 V(Fmaxp, fmaxp) \
635 V(Fminp, fminp) \
636 V(Fscale, fscale) \
637 V(Smaxp, smaxp) \
638 V(Sminp, sminp) \
639 V(Suqadd, suqadd) \
640 V(Umaxp, umaxp) \
641 V(Uminp, uminp) \
642 V(Usqadd, usqadd)
643
644 #define VIXL_DEFINE_MASM_FUNC(MASMFN, ASMFN) \
645 void MacroAssembler::MASMFN(const ZRegister& zd, \
646 const PRegisterM& pg, \
647 const ZRegister& zn, \
648 const ZRegister& zm) { \
649 VIXL_ASSERT(allow_macro_instructions_); \
650 if (zd.Aliases(zm) && !zd.Aliases(zn)) { \
651 UseScratchRegisterScope temps(this); \
652 ZRegister scratch = temps.AcquireZ().WithSameLaneSizeAs(zm); \
653 Mov(scratch, zm); \
654 MovprfxHelperScope guard(this, zd, pg, zn); \
655 ASMFN(zd, pg, zd, scratch); \
656 } else { \
657 MovprfxHelperScope guard(this, zd, pg, zn); \
658 ASMFN(zd, pg, zd, zm); \
659 } \
660 }
661 VIXL_SVE_NONCOMM_ARITH_ZPZZ_LIST(VIXL_DEFINE_MASM_FUNC)
662 #undef VIXL_DEFINE_MASM_FUNC
663
664 // Instructions of the form "inst zda, pg, zda, zn", which are non-commutative
665 // and for which a reversed form is provided.
666 #define VIXL_SVE_NONCOMM_ARITH_REVERSE_ZPZZ_LIST(V) \
667 V(Asr, asr) \
668 V(Fdiv, fdiv) \
669 V(Fsub, fsub) \
670 V(Lsl, lsl) \
671 V(Lsr, lsr) \
672 V(Sdiv, sdiv) \
673 V(Shsub, shsub) \
674 V(Sqrshl, sqrshl) \
675 V(Sqshl, sqshl) \
676 V(Sqsub, sqsub) \
677 V(Srshl, srshl) \
678 V(Sub, sub) \
679 V(Udiv, udiv) \
680 V(Uhsub, uhsub) \
681 V(Uqrshl, uqrshl) \
682 V(Uqshl, uqshl) \
683 V(Uqsub, uqsub) \
684 V(Urshl, urshl)
685
686 #define VIXL_DEFINE_MASM_FUNC(MASMFN, ASMFN) \
687 void MacroAssembler::MASMFN(const ZRegister& zd, \
688 const PRegisterM& pg, \
689 const ZRegister& zn, \
690 const ZRegister& zm) { \
691 VIXL_ASSERT(allow_macro_instructions_); \
692 NoncommutativeArithmeticHelper(zd, \
693 pg, \
694 zn, \
695 zm, \
696 static_cast<SVEArithPredicatedFn>( \
697 &Assembler::ASMFN), \
698 static_cast<SVEArithPredicatedFn>( \
699 &Assembler::ASMFN##r)); \
700 }
701 VIXL_SVE_NONCOMM_ARITH_REVERSE_ZPZZ_LIST(VIXL_DEFINE_MASM_FUNC)
702 #undef VIXL_DEFINE_MASM_FUNC
703
704 void MacroAssembler::Fadd(const ZRegister& zd,
705 const PRegisterM& pg,
706 const ZRegister& zn,
707 const ZRegister& zm,
708 FPMacroNaNPropagationOption nan_option) {
709 VIXL_ASSERT(allow_macro_instructions_);
710 FPCommutativeArithmeticHelper(zd,
711 pg,
712 zn,
713 zm,
714 static_cast<SVEArithPredicatedFn>(
715 &Assembler::fadd),
716 nan_option);
717 }
718
719 void MacroAssembler::Fabd(const ZRegister& zd,
720 const PRegisterM& pg,
721 const ZRegister& zn,
722 const ZRegister& zm,
723 FPMacroNaNPropagationOption nan_option) {
724 VIXL_ASSERT(allow_macro_instructions_);
725 FPCommutativeArithmeticHelper(zd,
726 pg,
727 zn,
728 zm,
729 static_cast<SVEArithPredicatedFn>(
730 &Assembler::fabd),
731 nan_option);
732 }
733
734 void MacroAssembler::Fmul(const ZRegister& zd,
735 const PRegisterM& pg,
736 const ZRegister& zn,
737 const ZRegister& zm,
738 FPMacroNaNPropagationOption nan_option) {
739 VIXL_ASSERT(allow_macro_instructions_);
740 FPCommutativeArithmeticHelper(zd,
741 pg,
742 zn,
743 zm,
744 static_cast<SVEArithPredicatedFn>(
745 &Assembler::fmul),
746 nan_option);
747 }
748
749 void MacroAssembler::Fmulx(const ZRegister& zd,
750 const PRegisterM& pg,
751 const ZRegister& zn,
752 const ZRegister& zm,
753 FPMacroNaNPropagationOption nan_option) {
754 VIXL_ASSERT(allow_macro_instructions_);
755 FPCommutativeArithmeticHelper(zd,
756 pg,
757 zn,
758 zm,
759 static_cast<SVEArithPredicatedFn>(
760 &Assembler::fmulx),
761 nan_option);
762 }
763
764 void MacroAssembler::Fmax(const ZRegister& zd,
765 const PRegisterM& pg,
766 const ZRegister& zn,
767 const ZRegister& zm,
768 FPMacroNaNPropagationOption nan_option) {
769 VIXL_ASSERT(allow_macro_instructions_);
770 FPCommutativeArithmeticHelper(zd,
771 pg,
772 zn,
773 zm,
774 static_cast<SVEArithPredicatedFn>(
775 &Assembler::fmax),
776 nan_option);
777 }
778
779 void MacroAssembler::Fmin(const ZRegister& zd,
780 const PRegisterM& pg,
781 const ZRegister& zn,
782 const ZRegister& zm,
783 FPMacroNaNPropagationOption nan_option) {
784 VIXL_ASSERT(allow_macro_instructions_);
785 FPCommutativeArithmeticHelper(zd,
786 pg,
787 zn,
788 zm,
789 static_cast<SVEArithPredicatedFn>(
790 &Assembler::fmin),
791 nan_option);
792 }
793
794 void MacroAssembler::Fmaxnm(const ZRegister& zd,
795 const PRegisterM& pg,
796 const ZRegister& zn,
797 const ZRegister& zm,
798 FPMacroNaNPropagationOption nan_option) {
799 VIXL_ASSERT(allow_macro_instructions_);
800 FPCommutativeArithmeticHelper(zd,
801 pg,
802 zn,
803 zm,
804 static_cast<SVEArithPredicatedFn>(
805 &Assembler::fmaxnm),
806 nan_option);
807 }
808
809 void MacroAssembler::Fminnm(const ZRegister& zd,
810 const PRegisterM& pg,
811 const ZRegister& zn,
812 const ZRegister& zm,
813 FPMacroNaNPropagationOption nan_option) {
814 VIXL_ASSERT(allow_macro_instructions_);
815 FPCommutativeArithmeticHelper(zd,
816 pg,
817 zn,
818 zm,
819 static_cast<SVEArithPredicatedFn>(
820 &Assembler::fminnm),
821 nan_option);
822 }
823
824 void MacroAssembler::Fdup(const ZRegister& zd, double imm) {
825 VIXL_ASSERT(allow_macro_instructions_);
826
827 switch (zd.GetLaneSizeInBits()) {
828 case kHRegSize:
829 Fdup(zd, Float16(imm));
830 break;
831 case kSRegSize:
832 Fdup(zd, static_cast<float>(imm));
833 break;
834 case kDRegSize:
835 uint64_t bits = DoubleToRawbits(imm);
836 if (IsImmFP64(bits)) {
837 SingleEmissionCheckScope guard(this);
838 fdup(zd, imm);
839 } else {
840 Dup(zd, bits);
841 }
842 break;
843 }
844 }
845
846 void MacroAssembler::Fdup(const ZRegister& zd, float imm) {
847 VIXL_ASSERT(allow_macro_instructions_);
848
849 switch (zd.GetLaneSizeInBits()) {
850 case kHRegSize:
851 Fdup(zd, Float16(imm));
852 break;
853 case kSRegSize:
854 if (IsImmFP32(imm)) {
855 SingleEmissionCheckScope guard(this);
856 fdup(zd, imm);
857 } else {
858 Dup(zd, FloatToRawbits(imm));
859 }
860 break;
861 case kDRegSize:
862 Fdup(zd, static_cast<double>(imm));
863 break;
864 }
865 }
866
867 void MacroAssembler::Fdup(const ZRegister& zd, Float16 imm) {
868 VIXL_ASSERT(allow_macro_instructions_);
869
870 switch (zd.GetLaneSizeInBits()) {
871 case kHRegSize:
872 if (IsImmFP16(imm)) {
873 SingleEmissionCheckScope guard(this);
874 fdup(zd, imm);
875 } else {
876 Dup(zd, Float16ToRawbits(imm));
877 }
878 break;
879 case kSRegSize:
880 Fdup(zd, FPToFloat(imm, kIgnoreDefaultNaN));
881 break;
882 case kDRegSize:
883 Fdup(zd, FPToDouble(imm, kIgnoreDefaultNaN));
884 break;
885 }
886 }
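// Illustrative expansions (a sketch; <wt> is a placeholder scratch register):
//   Fdup(z0.VnD(), 1.0)            ->  fdup z0.d, #1.0
//   Fdup(z0.VnH(), Float16(2.0))   ->  fdup z0.h, #2.0
//   Fdup(z0.VnS(), 1.1f)           ->  mov <wt>, #0x3f8ccccd (movz/movk);
//                                      dup z0.s, <wt>
//   (1.1f is not encodable as an FP8 immediate, so its raw bits are broadcast.)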
887
888 void MacroAssembler::Index(const ZRegister& zd,
889 const Operand& start,
890 const Operand& step) {
891 class IndexOperand : public Operand {
892 public:
893 static IndexOperand Prepare(MacroAssembler* masm,
894 UseScratchRegisterScope* temps,
895 const Operand& op,
896 const ZRegister& zd_inner) {
897 // Look for encodable immediates.
898 int imm;
899 if (op.IsImmediate()) {
900 if (IntegerOperand(op).TryEncodeAsIntNForLane<5>(zd_inner, &imm)) {
901 return IndexOperand(imm);
902 }
903 Register scratch = temps->AcquireRegisterToHoldLane(zd_inner);
904 masm->Mov(scratch, op);
905 return IndexOperand(scratch);
906 } else {
907 // Plain registers can be encoded directly.
908 VIXL_ASSERT(op.IsPlainRegister());
909 return IndexOperand(op.GetRegister());
910 }
911 }
912
913 int GetImm5() const {
914 int64_t imm = GetImmediate();
915 VIXL_ASSERT(IsInt5(imm));
916 return static_cast<int>(imm);
917 }
918
919 private:
920 explicit IndexOperand(const Register& reg) : Operand(reg) {}
921 explicit IndexOperand(int64_t imm) : Operand(imm) {}
922 };
923
924 UseScratchRegisterScope temps(this);
925 IndexOperand start_enc = IndexOperand::Prepare(this, &temps, start, zd);
926 IndexOperand step_enc = IndexOperand::Prepare(this, &temps, step, zd);
927
928 SingleEmissionCheckScope guard(this);
929 if (start_enc.IsImmediate()) {
930 if (step_enc.IsImmediate()) {
931 index(zd, start_enc.GetImm5(), step_enc.GetImm5());
932 } else {
933 index(zd, start_enc.GetImm5(), step_enc.GetRegister());
934 }
935 } else {
936 if (step_enc.IsImmediate()) {
937 index(zd, start_enc.GetRegister(), step_enc.GetImm5());
938 } else {
939 index(zd, start_enc.GetRegister(), step_enc.GetRegister());
940 }
941 }
942 }
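// Illustrative expansions (a sketch; <wt> is a placeholder scratch register):
//   Index(z0.VnB(), 0, 1)     ->  index z0.b, #0, #1
//   Index(z0.VnD(), x1, -2)   ->  index z0.d, x1, #-2
//   Index(z0.VnH(), 100, 2)   ->  mov <wt>, #100; index z0.h, <wt>, #2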
943
944 void MacroAssembler::Insr(const ZRegister& zdn, IntegerOperand imm) {
945 VIXL_ASSERT(allow_macro_instructions_);
946 VIXL_ASSERT(imm.FitsInLane(zdn));
947
948 if (imm.IsZero()) {
949 SingleEmissionCheckScope guard(this);
950 insr(zdn, xzr);
951 return;
952 }
953
954 UseScratchRegisterScope temps(this);
955 Register scratch = temps.AcquireRegisterToHoldLane(zdn);
956
957 // TODO: There are many cases where we could optimise immediates, such as by
958 // detecting repeating patterns or FP immediates. We should optimise and
959 // abstract this for use in other SVE mov-immediate-like macros.
960 Mov(scratch, imm);
961
962 SingleEmissionCheckScope guard(this);
963 insr(zdn, scratch);
964 }
965
966 void MacroAssembler::Mla(const ZRegister& zd,
967 const PRegisterM& pg,
968 const ZRegister& za,
969 const ZRegister& zn,
970 const ZRegister& zm) {
971 VIXL_ASSERT(allow_macro_instructions_);
972 if (zd.Aliases(za)) {
973 // zda = zda + (zn * zm)
974 SingleEmissionCheckScope guard(this);
975 mla(zd, pg, zn, zm);
976 } else if (zd.Aliases(zn)) {
977 // zdn = za + (zdn * zm)
978 SingleEmissionCheckScope guard(this);
979 mad(zd, pg, zm, za);
980 } else if (zd.Aliases(zm)) {
981 // Multiplication is commutative, so we can swap zn and zm.
982 // zdm = za + (zdm * zn)
983 SingleEmissionCheckScope guard(this);
984 mad(zd, pg, zn, za);
985 } else {
986 // zd = za + (zn * zm)
987 ExactAssemblyScope guard(this, 2 * kInstructionSize);
988 movprfx(zd, pg, za);
989 mla(zd, pg, zn, zm);
990 }
991 }
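// Illustrative selections (a sketch):
//   Mla(z0.VnS(), p0.Merging(), z0.VnS(), z1.VnS(), z2.VnS())   // accumulate in place
//       ->  mla z0.s, p0/m, z1.s, z2.s
//   Mla(z0.VnS(), p0.Merging(), z3.VnS(), z0.VnS(), z1.VnS())   // zd aliases zn
//       ->  mad z0.s, p0/m, z1.s, z3.s
//   Mla(z0.VnS(), p0.Merging(), z3.VnS(), z1.VnS(), z2.VnS())
//       ->  movprfx z0.s, p0/m, z3.s; mla z0.s, p0/m, z1.s, z2.s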
992
993 void MacroAssembler::Mls(const ZRegister& zd,
994 const PRegisterM& pg,
995 const ZRegister& za,
996 const ZRegister& zn,
997 const ZRegister& zm) {
998 VIXL_ASSERT(allow_macro_instructions_);
999 if (zd.Aliases(za)) {
1000 // zda = zda - (zn * zm)
1001 SingleEmissionCheckScope guard(this);
1002 mls(zd, pg, zn, zm);
1003 } else if (zd.Aliases(zn)) {
1004 // zdn = za - (zdn * zm)
1005 SingleEmissionCheckScope guard(this);
1006 msb(zd, pg, zm, za);
1007 } else if (zd.Aliases(zm)) {
1008 // Multiplication is commutative, so we can swap zn and zm.
1009 // zdm = za - (zdm * zn)
1010 SingleEmissionCheckScope guard(this);
1011 msb(zd, pg, zn, za);
1012 } else {
1013 // zd = za - (zn * zm)
1014 ExactAssemblyScope guard(this, 2 * kInstructionSize);
1015 movprfx(zd, pg, za);
1016 mls(zd, pg, zn, zm);
1017 }
1018 }
1019
1020 void MacroAssembler::CompareHelper(Condition cond,
1021 const PRegisterWithLaneSize& pd,
1022 const PRegisterZ& pg,
1023 const ZRegister& zn,
1024 IntegerOperand imm) {
1025 UseScratchRegisterScope temps(this);
1026 ZRegister zm = temps.AcquireZ().WithLaneSize(zn.GetLaneSizeInBits());
1027 Dup(zm, imm);
1028 SingleEmissionCheckScope guard(this);
1029 cmp(cond, pd, pg, zn, zm);
1030 }
1031
1032 void MacroAssembler::Pfirst(const PRegisterWithLaneSize& pd,
1033 const PRegister& pg,
1034 const PRegisterWithLaneSize& pn) {
1035 VIXL_ASSERT(allow_macro_instructions_);
1036 VIXL_ASSERT(pd.IsLaneSizeB());
1037 VIXL_ASSERT(pn.IsLaneSizeB());
1038 if (pd.Is(pn)) {
1039 SingleEmissionCheckScope guard(this);
1040 pfirst(pd, pg, pn);
1041 } else {
1042 UseScratchRegisterScope temps(this);
1043 PRegister temp_pg = pg;
1044 if (pd.Aliases(pg)) {
1045 temp_pg = temps.AcquireP();
1046 Mov(temp_pg.VnB(), pg.VnB());
1047 }
1048 Mov(pd, pn);
1049 SingleEmissionCheckScope guard(this);
1050 pfirst(pd, temp_pg, pd);
1051 }
1052 }
1053
1054 void MacroAssembler::Pnext(const PRegisterWithLaneSize& pd,
1055 const PRegister& pg,
1056 const PRegisterWithLaneSize& pn) {
1057 VIXL_ASSERT(allow_macro_instructions_);
1058 VIXL_ASSERT(AreSameFormat(pd, pn));
1059 if (pd.Is(pn)) {
1060 SingleEmissionCheckScope guard(this);
1061 pnext(pd, pg, pn);
1062 } else {
1063 UseScratchRegisterScope temps(this);
1064 PRegister temp_pg = pg;
1065 if (pd.Aliases(pg)) {
1066 temp_pg = temps.AcquireP();
1067 Mov(temp_pg.VnB(), pg.VnB());
1068 }
1069 Mov(pd.VnB(), pn.VnB());
1070 SingleEmissionCheckScope guard(this);
1071 pnext(pd, temp_pg, pd);
1072 }
1073 }
1074
1075 void MacroAssembler::Ptrue(const PRegisterWithLaneSize& pd,
1076 SVEPredicateConstraint pattern,
1077 FlagsUpdate s) {
1078 VIXL_ASSERT(allow_macro_instructions_);
1079 switch (s) {
1080 case LeaveFlags:
1081 Ptrue(pd, pattern);
1082 return;
1083 case SetFlags:
1084 Ptrues(pd, pattern);
1085 return;
1086 }
1087 VIXL_UNREACHABLE();
1088 }
1089
1090 void MacroAssembler::Sub(const ZRegister& zd,
1091 IntegerOperand imm,
1092 const ZRegister& zm) {
1093 VIXL_ASSERT(allow_macro_instructions_);
1094
1095 int imm8;
1096 int shift = -1;
1097 if (imm.TryEncodeAsShiftedUintNForLane<8, 0>(zd, &imm8, &shift) ||
1098 imm.TryEncodeAsShiftedUintNForLane<8, 8>(zd, &imm8, &shift)) {
1099 MovprfxHelperScope guard(this, zd, zm);
1100 subr(zd, zd, imm8, shift);
1101 } else {
1102 UseScratchRegisterScope temps(this);
1103 ZRegister scratch = temps.AcquireZ().WithLaneSize(zm.GetLaneSizeInBits());
1104 Dup(scratch, imm);
1105
1106 SingleEmissionCheckScope guard(this);
1107 sub(zd, scratch, zm);
1108 }
1109 }
1110
1111 void MacroAssembler::SVELoadBroadcastImmHelper(const ZRegister& zt,
1112 const PRegisterZ& pg,
1113 const SVEMemOperand& addr,
1114 SVELoadBroadcastFn fn,
1115 int divisor) {
1116 VIXL_ASSERT(addr.IsScalarPlusImmediate());
1117 int64_t imm = addr.GetImmediateOffset();
1118 if ((imm % divisor == 0) && IsUint6(imm / divisor)) {
1119 SingleEmissionCheckScope guard(this);
1120 (this->*fn)(zt, pg, addr);
1121 } else {
1122 UseScratchRegisterScope temps(this);
1123 Register scratch = temps.AcquireX();
1124 CalculateSVEAddress(scratch, addr, zt);
1125 SingleEmissionCheckScope guard(this);
1126 (this->*fn)(zt, pg, SVEMemOperand(scratch));
1127 }
1128 }
1129
1130 void MacroAssembler::SVELoadStoreScalarImmHelper(const CPURegister& rt,
1131 const SVEMemOperand& addr,
1132 SVELoadStoreFn fn) {
1133 VIXL_ASSERT(allow_macro_instructions_);
1134 VIXL_ASSERT(rt.IsZRegister() || rt.IsPRegister());
1135
1136 if (addr.IsPlainScalar() ||
1137 (addr.IsScalarPlusImmediate() && IsInt9(addr.GetImmediateOffset()) &&
1138 addr.IsMulVl())) {
1139 SingleEmissionCheckScope guard(this);
1140 (this->*fn)(rt, addr);
1141 return;
1142 }
1143
1144 if (addr.IsEquivalentToScalar()) {
1145 SingleEmissionCheckScope guard(this);
1146 (this->*fn)(rt, SVEMemOperand(addr.GetScalarBase()));
1147 return;
1148 }
1149
1150 UseScratchRegisterScope temps(this);
1151 Register scratch = temps.AcquireX();
1152 CalculateSVEAddress(scratch, addr, rt);
1153 SingleEmissionCheckScope guard(this);
1154 (this->*fn)(rt, SVEMemOperand(scratch));
1155 }
1156
1157 template <typename Tg, typename Tf>
1158 void MacroAssembler::SVELoadStoreNTBroadcastQOHelper(
1159 const ZRegister& zt,
1160 const Tg& pg,
1161 const SVEMemOperand& addr,
1162 Tf fn,
1163 int imm_bits,
1164 int shift_amount,
1165 SVEOffsetModifier supported_modifier,
1166 int vl_divisor_log2) {
1167 VIXL_ASSERT(allow_macro_instructions_);
1168 int imm_divisor = 1 << shift_amount;
1169
1170 if (addr.IsPlainScalar() ||
1171 (addr.IsScalarPlusImmediate() &&
1172 IsIntN(imm_bits, addr.GetImmediateOffset() / imm_divisor) &&
1173 ((addr.GetImmediateOffset() % imm_divisor) == 0) &&
1174 (addr.GetOffsetModifier() == supported_modifier))) {
1175 SingleEmissionCheckScope guard(this);
1176 (this->*fn)(zt, pg, addr);
1177 return;
1178 }
1179
1180 if (addr.IsScalarPlusScalar() && !addr.GetScalarOffset().IsZero() &&
1181 addr.IsEquivalentToLSL(zt.GetLaneSizeInBytesLog2())) {
1182 SingleEmissionCheckScope guard(this);
1183 (this->*fn)(zt, pg, addr);
1184 return;
1185 }
1186
1187 if (addr.IsEquivalentToScalar()) {
1188 SingleEmissionCheckScope guard(this);
1189 (this->*fn)(zt, pg, SVEMemOperand(addr.GetScalarBase()));
1190 return;
1191 }
1192
1193 if (addr.IsMulVl() && (supported_modifier != SVE_MUL_VL) &&
1194 (vl_divisor_log2 == -1)) {
1195 // We don't handle [x0, #imm, MUL VL] if the in-memory access size is not VL
1196 // dependent.
1197 VIXL_UNIMPLEMENTED();
1198 }
1199
1200 UseScratchRegisterScope temps(this);
1201 Register scratch = temps.AcquireX();
1202 CalculateSVEAddress(scratch, addr, vl_divisor_log2);
1203 SingleEmissionCheckScope guard(this);
1204 (this->*fn)(zt, pg, SVEMemOperand(scratch));
1205 }
1206
1207 template <typename Tg, typename Tf>
1208 void MacroAssembler::SVELoadStore1Helper(int msize_in_bytes_log2,
1209 const ZRegister& zt,
1210 const Tg& pg,
1211 const SVEMemOperand& addr,
1212 Tf fn) {
1213 if (addr.IsPlainScalar() ||
1214 (addr.IsScalarPlusScalar() && !addr.GetScalarOffset().IsZero() &&
1215 addr.IsEquivalentToLSL(msize_in_bytes_log2)) ||
1216 (addr.IsScalarPlusImmediate() && IsInt4(addr.GetImmediateOffset()) &&
1217 addr.IsMulVl())) {
1218 SingleEmissionCheckScope guard(this);
1219 (this->*fn)(zt, pg, addr);
1220 return;
1221 }
1222
1223 if (addr.IsEquivalentToScalar()) {
1224 SingleEmissionCheckScope guard(this);
1225 (this->*fn)(zt, pg, SVEMemOperand(addr.GetScalarBase()));
1226 return;
1227 }
1228
1229 if (addr.IsVectorPlusImmediate()) {
1230 uint64_t offset = addr.GetImmediateOffset();
1231 if (IsMultiple(offset, (1 << msize_in_bytes_log2)) &&
1232 IsUint5(offset >> msize_in_bytes_log2)) {
1233 SingleEmissionCheckScope guard(this);
1234 (this->*fn)(zt, pg, addr);
1235 return;
1236 }
1237 }
1238
1239 if (addr.IsScalarPlusVector()) {
1240 VIXL_ASSERT(addr.IsScatterGather());
1241 SingleEmissionCheckScope guard(this);
1242 (this->*fn)(zt, pg, addr);
1243 return;
1244 }
1245
1246 UseScratchRegisterScope temps(this);
1247 if (addr.IsScatterGather()) {
1248 // In scatter-gather modes, zt and zn/zm have the same lane size. However,
1249 // for 32-bit accesses, the result of each lane's address calculation still
1250 // requires 64 bits; we can't naively use `Adr` for the address calculation
1251 // because it would truncate each address to 32 bits.
1252
1253 if (addr.IsVectorPlusImmediate()) {
1254 // Synthesise the immediate in an X register, then use a
1255 // scalar-plus-vector access with the original vector.
1256 Register scratch = temps.AcquireX();
1257 Mov(scratch, addr.GetImmediateOffset());
1258 SingleEmissionCheckScope guard(this);
1259 SVEOffsetModifier om =
1260 zt.IsLaneSizeS() ? SVE_UXTW : NO_SVE_OFFSET_MODIFIER;
1261 (this->*fn)(zt, pg, SVEMemOperand(scratch, addr.GetVectorBase(), om));
1262 return;
1263 }
1264
1265 VIXL_UNIMPLEMENTED();
1266 } else {
1267 Register scratch = temps.AcquireX();
1268 // TODO: If we have an immediate offset that is a multiple of
1269 // msize_in_bytes, we can use Rdvl/Rdpl and a scalar-plus-scalar form to
1270 // save an instruction.
1271 int vl_divisor_log2 = zt.GetLaneSizeInBytesLog2() - msize_in_bytes_log2;
1272 CalculateSVEAddress(scratch, addr, vl_divisor_log2);
1273 SingleEmissionCheckScope guard(this);
1274 (this->*fn)(zt, pg, SVEMemOperand(scratch));
1275 }
1276 }
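// Usage sketch (illustrative, via the Ld1b wrapper below; <xt> is a placeholder
// scratch register):
//   Ld1b(z0.VnB(), p0.Zeroing(), SVEMemOperand(x0))
//       ->  ld1b { z0.b }, p0/z, [x0]
//   Ld1b(z0.VnB(), p0.Zeroing(), SVEMemOperand(x0, 3, SVE_MUL_VL))
//       ->  ld1b { z0.b }, p0/z, [x0, #3, mul vl]
//   Ld1b(z0.VnB(), p0.Zeroing(), SVEMemOperand(x0, 1024))
//       ->  add <xt>, x0, #1024; ld1b { z0.b }, p0/z, [<xt>]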
1277
1278 template <typename Tf>
1279 void MacroAssembler::SVELoadFFHelper(int msize_in_bytes_log2,
1280 const ZRegister& zt,
1281 const PRegisterZ& pg,
1282 const SVEMemOperand& addr,
1283 Tf fn) {
1284 if (addr.IsScatterGather()) {
1285 // Scatter-gather first-fault loads share encodings with normal loads.
1286 SVELoadStore1Helper(msize_in_bytes_log2, zt, pg, addr, fn);
1287 return;
1288 }
1289
1290 // Contiguous first-faulting loads have no scalar-plus-immediate form at all,
1291 // so we don't do immediate synthesis.
1292
1293 // We cannot currently distinguish "[x0]" from "[x0, #0]", and this
1294 // is not "scalar-plus-scalar", so we have to permit `IsPlainScalar()` here.
1295 if (addr.IsPlainScalar() || (addr.IsScalarPlusScalar() &&
1296 addr.IsEquivalentToLSL(msize_in_bytes_log2))) {
1297 SingleEmissionCheckScope guard(this);
1298 (this->*fn)(zt, pg, addr);
1299 return;
1300 }
1301
1302 VIXL_UNIMPLEMENTED();
1303 }
1304
1305 void MacroAssembler::Ld1b(const ZRegister& zt,
1306 const PRegisterZ& pg,
1307 const SVEMemOperand& addr) {
1308 VIXL_ASSERT(allow_macro_instructions_);
1309 SVELoadStore1Helper(kBRegSizeInBytesLog2,
1310 zt,
1311 pg,
1312 addr,
1313 static_cast<SVELoad1Fn>(&Assembler::ld1b));
1314 }
1315
1316 void MacroAssembler::Ld1h(const ZRegister& zt,
1317 const PRegisterZ& pg,
1318 const SVEMemOperand& addr) {
1319 VIXL_ASSERT(allow_macro_instructions_);
1320 SVELoadStore1Helper(kHRegSizeInBytesLog2,
1321 zt,
1322 pg,
1323 addr,
1324 static_cast<SVELoad1Fn>(&Assembler::ld1h));
1325 }
1326
1327 void MacroAssembler::Ld1w(const ZRegister& zt,
1328 const PRegisterZ& pg,
1329 const SVEMemOperand& addr) {
1330 VIXL_ASSERT(allow_macro_instructions_);
1331 SVELoadStore1Helper(kWRegSizeInBytesLog2,
1332 zt,
1333 pg,
1334 addr,
1335 static_cast<SVELoad1Fn>(&Assembler::ld1w));
1336 }
1337
1338 void MacroAssembler::Ld1d(const ZRegister& zt,
1339 const PRegisterZ& pg,
1340 const SVEMemOperand& addr) {
1341 VIXL_ASSERT(allow_macro_instructions_);
1342 SVELoadStore1Helper(kDRegSizeInBytesLog2,
1343 zt,
1344 pg,
1345 addr,
1346 static_cast<SVELoad1Fn>(&Assembler::ld1d));
1347 }
1348
1349 void MacroAssembler::Ld1sb(const ZRegister& zt,
1350 const PRegisterZ& pg,
1351 const SVEMemOperand& addr) {
1352 VIXL_ASSERT(allow_macro_instructions_);
1353 SVELoadStore1Helper(kBRegSizeInBytesLog2,
1354 zt,
1355 pg,
1356 addr,
1357 static_cast<SVELoad1Fn>(&Assembler::ld1sb));
1358 }
1359
1360 void MacroAssembler::Ld1sh(const ZRegister& zt,
1361 const PRegisterZ& pg,
1362 const SVEMemOperand& addr) {
1363 VIXL_ASSERT(allow_macro_instructions_);
1364 SVELoadStore1Helper(kHRegSizeInBytesLog2,
1365 zt,
1366 pg,
1367 addr,
1368 static_cast<SVELoad1Fn>(&Assembler::ld1sh));
1369 }
1370
1371 void MacroAssembler::Ld1sw(const ZRegister& zt,
1372 const PRegisterZ& pg,
1373 const SVEMemOperand& addr) {
1374 VIXL_ASSERT(allow_macro_instructions_);
1375 SVELoadStore1Helper(kSRegSizeInBytesLog2,
1376 zt,
1377 pg,
1378 addr,
1379 static_cast<SVELoad1Fn>(&Assembler::ld1sw));
1380 }
1381
1382 void MacroAssembler::St1b(const ZRegister& zt,
1383 const PRegister& pg,
1384 const SVEMemOperand& addr) {
1385 VIXL_ASSERT(allow_macro_instructions_);
1386 SVELoadStore1Helper(kBRegSizeInBytesLog2,
1387 zt,
1388 pg,
1389 addr,
1390 static_cast<SVEStore1Fn>(&Assembler::st1b));
1391 }
1392
1393 void MacroAssembler::St1h(const ZRegister& zt,
1394 const PRegister& pg,
1395 const SVEMemOperand& addr) {
1396 VIXL_ASSERT(allow_macro_instructions_);
1397 SVELoadStore1Helper(kHRegSizeInBytesLog2,
1398 zt,
1399 pg,
1400 addr,
1401 static_cast<SVEStore1Fn>(&Assembler::st1h));
1402 }
1403
1404 void MacroAssembler::St1w(const ZRegister& zt,
1405 const PRegister& pg,
1406 const SVEMemOperand& addr) {
1407 VIXL_ASSERT(allow_macro_instructions_);
1408 SVELoadStore1Helper(kSRegSizeInBytesLog2,
1409 zt,
1410 pg,
1411 addr,
1412 static_cast<SVEStore1Fn>(&Assembler::st1w));
1413 }
1414
1415 void MacroAssembler::St1d(const ZRegister& zt,
1416 const PRegister& pg,
1417 const SVEMemOperand& addr) {
1418 VIXL_ASSERT(allow_macro_instructions_);
1419 SVELoadStore1Helper(kDRegSizeInBytesLog2,
1420 zt,
1421 pg,
1422 addr,
1423 static_cast<SVEStore1Fn>(&Assembler::st1d));
1424 }
1425
1426 void MacroAssembler::Ldff1b(const ZRegister& zt,
1427 const PRegisterZ& pg,
1428 const SVEMemOperand& addr) {
1429 VIXL_ASSERT(allow_macro_instructions_);
1430 SVELoadFFHelper(kBRegSizeInBytesLog2,
1431 zt,
1432 pg,
1433 addr,
1434 static_cast<SVELoad1Fn>(&Assembler::ldff1b));
1435 }
1436
1437 void MacroAssembler::Ldff1h(const ZRegister& zt,
1438 const PRegisterZ& pg,
1439 const SVEMemOperand& addr) {
1440 VIXL_ASSERT(allow_macro_instructions_);
1441 SVELoadFFHelper(kHRegSizeInBytesLog2,
1442 zt,
1443 pg,
1444 addr,
1445 static_cast<SVELoad1Fn>(&Assembler::ldff1h));
1446 }
1447
1448 void MacroAssembler::Ldff1w(const ZRegister& zt,
1449 const PRegisterZ& pg,
1450 const SVEMemOperand& addr) {
1451 VIXL_ASSERT(allow_macro_instructions_);
1452 SVELoadFFHelper(kSRegSizeInBytesLog2,
1453 zt,
1454 pg,
1455 addr,
1456 static_cast<SVELoad1Fn>(&Assembler::ldff1w));
1457 }
1458
1459 void MacroAssembler::Ldff1d(const ZRegister& zt,
1460 const PRegisterZ& pg,
1461 const SVEMemOperand& addr) {
1462 VIXL_ASSERT(allow_macro_instructions_);
1463 SVELoadFFHelper(kDRegSizeInBytesLog2,
1464 zt,
1465 pg,
1466 addr,
1467 static_cast<SVELoad1Fn>(&Assembler::ldff1d));
1468 }
1469
1470 void MacroAssembler::Ldff1sb(const ZRegister& zt,
1471 const PRegisterZ& pg,
1472 const SVEMemOperand& addr) {
1473 VIXL_ASSERT(allow_macro_instructions_);
1474 SVELoadFFHelper(kBRegSizeInBytesLog2,
1475 zt,
1476 pg,
1477 addr,
1478 static_cast<SVELoad1Fn>(&Assembler::ldff1sb));
1479 }
1480
1481 void MacroAssembler::Ldff1sh(const ZRegister& zt,
1482 const PRegisterZ& pg,
1483 const SVEMemOperand& addr) {
1484 VIXL_ASSERT(allow_macro_instructions_);
1485 SVELoadFFHelper(kHRegSizeInBytesLog2,
1486 zt,
1487 pg,
1488 addr,
1489 static_cast<SVELoad1Fn>(&Assembler::ldff1sh));
1490 }
1491
1492 void MacroAssembler::Ldff1sw(const ZRegister& zt,
1493 const PRegisterZ& pg,
1494 const SVEMemOperand& addr) {
1495 VIXL_ASSERT(allow_macro_instructions_);
1496 SVELoadFFHelper(kSRegSizeInBytesLog2,
1497 zt,
1498 pg,
1499 addr,
1500 static_cast<SVELoad1Fn>(&Assembler::ldff1sw));
1501 }
1502
1503 #define VIXL_SVE_LD1R_LIST(V) \
1504 V(qb, 4) V(qh, 4) V(qw, 4) V(qd, 4) V(ob, 5) V(oh, 5) V(ow, 5) V(od, 5)
1505
1506 #define VIXL_DEFINE_MASM_FUNC(SZ, SH) \
1507 void MacroAssembler::Ld1r##SZ(const ZRegister& zt, \
1508 const PRegisterZ& pg, \
1509 const SVEMemOperand& addr) { \
1510 VIXL_ASSERT(allow_macro_instructions_); \
1511 SVELoadStoreNTBroadcastQOHelper(zt, \
1512 pg, \
1513 addr, \
1514 &MacroAssembler::ld1r##SZ, \
1515 4, \
1516 SH, \
1517 NO_SVE_OFFSET_MODIFIER, \
1518 -1); \
1519 }
1520
1521 VIXL_SVE_LD1R_LIST(VIXL_DEFINE_MASM_FUNC)
1522
1523 #undef VIXL_DEFINE_MASM_FUNC
1524 #undef VIXL_SVE_LD1R_LIST
1525
1526 void MacroAssembler::Ldnt1b(const ZRegister& zt,
1527 const PRegisterZ& pg,
1528 const SVEMemOperand& addr) {
1529 VIXL_ASSERT(allow_macro_instructions_);
1530 if (addr.IsVectorPlusScalar()) {
1531 SingleEmissionCheckScope guard(this);
1532 ldnt1b(zt, pg, addr);
1533 } else {
1534 SVELoadStoreNTBroadcastQOHelper(zt,
1535 pg,
1536 addr,
1537 &MacroAssembler::ldnt1b,
1538 4,
1539 0,
1540 SVE_MUL_VL);
1541 }
1542 }
1543
1544 void MacroAssembler::Ldnt1d(const ZRegister& zt,
1545 const PRegisterZ& pg,
1546 const SVEMemOperand& addr) {
1547 VIXL_ASSERT(allow_macro_instructions_);
1548 if (addr.IsVectorPlusScalar()) {
1549 SingleEmissionCheckScope guard(this);
1550 ldnt1d(zt, pg, addr);
1551 } else {
1552 SVELoadStoreNTBroadcastQOHelper(zt,
1553 pg,
1554 addr,
1555 &MacroAssembler::ldnt1d,
1556 4,
1557 0,
1558 SVE_MUL_VL);
1559 }
1560 }
1561
1562 void MacroAssembler::Ldnt1h(const ZRegister& zt,
1563 const PRegisterZ& pg,
1564 const SVEMemOperand& addr) {
1565 VIXL_ASSERT(allow_macro_instructions_);
1566 if (addr.IsVectorPlusScalar()) {
1567 SingleEmissionCheckScope guard(this);
1568 ldnt1h(zt, pg, addr);
1569 } else {
1570 SVELoadStoreNTBroadcastQOHelper(zt,
1571 pg,
1572 addr,
1573 &MacroAssembler::ldnt1h,
1574 4,
1575 0,
1576 SVE_MUL_VL);
1577 }
1578 }
1579
1580 void MacroAssembler::Ldnt1w(const ZRegister& zt,
1581 const PRegisterZ& pg,
1582 const SVEMemOperand& addr) {
1583 VIXL_ASSERT(allow_macro_instructions_);
1584 if (addr.IsVectorPlusScalar()) {
1585 SingleEmissionCheckScope guard(this);
1586 ldnt1w(zt, pg, addr);
1587 } else {
1588 SVELoadStoreNTBroadcastQOHelper(zt,
1589 pg,
1590 addr,
1591 &MacroAssembler::ldnt1w,
1592 4,
1593 0,
1594 SVE_MUL_VL);
1595 }
1596 }
1597
1598 void MacroAssembler::Stnt1b(const ZRegister& zt,
1599 const PRegister& pg,
1600 const SVEMemOperand& addr) {
1601 VIXL_ASSERT(allow_macro_instructions_);
1602 if (addr.IsVectorPlusScalar()) {
1603 SingleEmissionCheckScope guard(this);
1604 stnt1b(zt, pg, addr);
1605 } else {
1606 SVELoadStoreNTBroadcastQOHelper(zt,
1607 pg,
1608 addr,
1609 &MacroAssembler::stnt1b,
1610 4,
1611 0,
1612 SVE_MUL_VL);
1613 }
1614 }
1615 void MacroAssembler::Stnt1d(const ZRegister& zt,
1616 const PRegister& pg,
1617 const SVEMemOperand& addr) {
1618 VIXL_ASSERT(allow_macro_instructions_);
1619 if (addr.IsVectorPlusScalar()) {
1620 SingleEmissionCheckScope guard(this);
1621 stnt1d(zt, pg, addr);
1622 } else {
1623 SVELoadStoreNTBroadcastQOHelper(zt,
1624 pg,
1625 addr,
1626 &MacroAssembler::stnt1d,
1627 4,
1628 0,
1629 SVE_MUL_VL);
1630 }
1631 }
1632 void MacroAssembler::Stnt1h(const ZRegister& zt,
1633 const PRegister& pg,
1634 const SVEMemOperand& addr) {
1635 VIXL_ASSERT(allow_macro_instructions_);
1636 if (addr.IsVectorPlusScalar()) {
1637 SingleEmissionCheckScope guard(this);
1638 stnt1h(zt, pg, addr);
1639 } else {
1640 SVELoadStoreNTBroadcastQOHelper(zt,
1641 pg,
1642 addr,
1643 &MacroAssembler::stnt1h,
1644 4,
1645 0,
1646 SVE_MUL_VL);
1647 }
1648 }
1649 void MacroAssembler::Stnt1w(const ZRegister& zt,
1650 const PRegister& pg,
1651 const SVEMemOperand& addr) {
1652 VIXL_ASSERT(allow_macro_instructions_);
1653 if (addr.IsVectorPlusScalar()) {
1654 SingleEmissionCheckScope guard(this);
1655 stnt1w(zt, pg, addr);
1656 } else {
1657 SVELoadStoreNTBroadcastQOHelper(zt,
1658 pg,
1659 addr,
1660 &MacroAssembler::stnt1w,
1661 4,
1662 0,
1663 SVE_MUL_VL);
1664 }
1665 }
1666
1667 void MacroAssembler::SVEDotIndexHelper(ZZZImmFn fn,
1668 const ZRegister& zd,
1669 const ZRegister& za,
1670 const ZRegister& zn,
1671 const ZRegister& zm,
1672 int index) {
1673 if (zd.Aliases(za)) {
1674 // zda = zda + (zn . zm)
1675 SingleEmissionCheckScope guard(this);
1676 (this->*fn)(zd, zn, zm, index);
1677
1678 } else if (zd.Aliases(zn) || zd.Aliases(zm)) {
1679 // zdn = za + (zdn . zm[index])
1680 // zdm = za + (zn . zdm[index])
1681 // zdnm = za + (zdnm . zdnm[index])
1682 UseScratchRegisterScope temps(this);
1683 ZRegister scratch = temps.AcquireZ().WithSameLaneSizeAs(zd);
1684 {
1685 MovprfxHelperScope guard(this, scratch, za);
1686 (this->*fn)(scratch, zn, zm, index);
1687 }
1688
1689 Mov(zd, scratch);
1690 } else {
1691 // zd = za + (zn . zm)
1692 MovprfxHelperScope guard(this, zd, za);
1693 (this->*fn)(zd, zn, zm, index);
1694 }
1695 }
1696
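// Note (illustrative, not part of the original source): SVEDotIndexHelper is
// the common back end for the indexed dot-product macros below (Sdot, Udot,
// Sudot and Usdot with an index argument). For example, a hypothetical call
//
//   __ Sdot(z0.VnS(), z1.VnS(), z2.VnB(), z3.VnB(), 1);
//
// with zd distinct from all inputs reduces to a movprfx of z1 into z0
// followed by sdot(z0, z2, z3, 1); if zd had aliased zn or zm, the result
// would be built in a scratch Z register and copied back with Mov.
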
void MacroAssembler::FourRegDestructiveHelper(Int3ArithFn fn,
                                              const ZRegister& zd,
                                              const ZRegister& za,
                                              const ZRegister& zn,
                                              const ZRegister& zm) {
  if (!zd.Aliases(za) && (zd.Aliases(zn) || zd.Aliases(zm))) {
    // zd = za . zd . zm
    // zd = za . zn . zd
    // zd = za . zd . zd
    UseScratchRegisterScope temps(this);
    ZRegister scratch = temps.AcquireZ().WithSameLaneSizeAs(zd);
    {
      MovprfxHelperScope guard(this, scratch, za);
      (this->*fn)(scratch, zn, zm);
    }

    Mov(zd, scratch);
  } else {
    MovprfxHelperScope guard(this, zd, za);
    (this->*fn)(zd, zn, zm);
  }
}

void MacroAssembler::FourRegDestructiveHelper(Int4ArithFn fn,
                                              const ZRegister& zd,
                                              const ZRegister& za,
                                              const ZRegister& zn,
                                              const ZRegister& zm) {
  if (!zd.Aliases(za) && (zd.Aliases(zn) || zd.Aliases(zm))) {
    // zd = za . zd . zm
    // zd = za . zn . zd
    // zd = za . zd . zd
    UseScratchRegisterScope temps(this);
    ZRegister scratch = temps.AcquireZ().WithSameLaneSizeAs(zd);
    {
      MovprfxHelperScope guard(this, scratch, za);
      (this->*fn)(scratch, scratch, zn, zm);
    }

    Mov(zd, scratch);
  } else {
    MovprfxHelperScope guard(this, zd, za);
    (this->*fn)(zd, zd, zn, zm);
  }
}

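// Note (illustrative, not part of the original source): both
// FourRegDestructiveHelper overloads implement the "zd = za op zn op zm"
// pattern on top of destructive instructions. When zd aliases an input but
// not the accumulator, e.g.
//
//   __ Eor3(z0.VnD(), z1.VnD(), z0.VnD(), z2.VnD());
//
// the helper builds the result in a scratch register (movprfx from za, then
// the destructive op) and copies it back with Mov, so the aliased input is
// still read before it is overwritten.
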
void MacroAssembler::FourRegOneImmDestructiveHelper(ZZZImmFn fn,
                                                    const ZRegister& zd,
                                                    const ZRegister& za,
                                                    const ZRegister& zn,
                                                    const ZRegister& zm,
                                                    int imm) {
  if (!zd.Aliases(za) && (zd.Aliases(zn) || zd.Aliases(zm))) {
    // zd = za . zd . zm[i]
    // zd = za . zn . zd[i]
    // zd = za . zd . zd[i]
    UseScratchRegisterScope temps(this);
    ZRegister scratch = temps.AcquireZ().WithSameLaneSizeAs(zd);
    {
      MovprfxHelperScope guard(this, scratch, za);
      (this->*fn)(scratch, zn, zm, imm);
    }

    Mov(zd, scratch);
  } else {
    // zd = za . zn . zm[i]
    MovprfxHelperScope guard(this, zd, za);
    (this->*fn)(zd, zn, zm, imm);
  }
}

void MacroAssembler::AbsoluteDifferenceAccumulate(Int3ArithFn fn,
                                                  const ZRegister& zd,
                                                  const ZRegister& za,
                                                  const ZRegister& zn,
                                                  const ZRegister& zm) {
  if (zn.Aliases(zm)) {
    // If zn == zm, the difference is zero.
    if (!zd.Aliases(za)) {
      Mov(zd, za);
    }
  } else if (zd.Aliases(za)) {
    SingleEmissionCheckScope guard(this);
    (this->*fn)(zd, zn, zm);
  } else if (zd.Aliases(zn)) {
    UseScratchRegisterScope temps(this);
    ZRegister ztmp = temps.AcquireZ().WithLaneSize(zn.GetLaneSizeInBits());
    Mov(ztmp, zn);
    MovprfxHelperScope guard(this, zd, za);
    (this->*fn)(zd, ztmp, zm);
  } else if (zd.Aliases(zm)) {
    UseScratchRegisterScope temps(this);
    ZRegister ztmp = temps.AcquireZ().WithLaneSize(zn.GetLaneSizeInBits());
    Mov(ztmp, zm);
    MovprfxHelperScope guard(this, zd, za);
    (this->*fn)(zd, zn, ztmp);
  } else {
    MovprfxHelperScope guard(this, zd, za);
    (this->*fn)(zd, zn, zm);
  }
}

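// Note (illustrative, not part of the original source): the zn.Aliases(zm)
// short-circuit above means a call such as
//
//   __ Saba(z0.VnB(), z1.VnB(), z2.VnB(), z2.VnB());
//
// emits no saba at all: the absolute difference of equal inputs is zero, so
// only Mov(z0, z1) is needed (and nothing at all if zd already aliases za).
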
#define VIXL_SVE_4REG_LIST(V)                    \
  V(Saba, saba, AbsoluteDifferenceAccumulate)    \
  V(Uaba, uaba, AbsoluteDifferenceAccumulate)    \
  V(Sabalb, sabalb, AbsoluteDifferenceAccumulate) \
  V(Sabalt, sabalt, AbsoluteDifferenceAccumulate) \
  V(Uabalb, uabalb, AbsoluteDifferenceAccumulate) \
  V(Uabalt, uabalt, AbsoluteDifferenceAccumulate) \
  V(Sdot, sdot, FourRegDestructiveHelper)        \
  V(Udot, udot, FourRegDestructiveHelper)        \
  V(Adclb, adclb, FourRegDestructiveHelper)      \
  V(Adclt, adclt, FourRegDestructiveHelper)      \
  V(Sbclb, sbclb, FourRegDestructiveHelper)      \
  V(Sbclt, sbclt, FourRegDestructiveHelper)      \
  V(Smlalb, smlalb, FourRegDestructiveHelper)    \
  V(Smlalt, smlalt, FourRegDestructiveHelper)    \
  V(Smlslb, smlslb, FourRegDestructiveHelper)    \
  V(Smlslt, smlslt, FourRegDestructiveHelper)    \
  V(Umlalb, umlalb, FourRegDestructiveHelper)    \
  V(Umlalt, umlalt, FourRegDestructiveHelper)    \
  V(Umlslb, umlslb, FourRegDestructiveHelper)    \
  V(Umlslt, umlslt, FourRegDestructiveHelper)    \
  V(Bcax, bcax, FourRegDestructiveHelper)        \
  V(Bsl, bsl, FourRegDestructiveHelper)          \
  V(Bsl1n, bsl1n, FourRegDestructiveHelper)      \
  V(Bsl2n, bsl2n, FourRegDestructiveHelper)      \
  V(Eor3, eor3, FourRegDestructiveHelper)        \
  V(Nbsl, nbsl, FourRegDestructiveHelper)        \
  V(Fmlalb, fmlalb, FourRegDestructiveHelper)    \
  V(Fmlalt, fmlalt, FourRegDestructiveHelper)    \
  V(Fmlslb, fmlslb, FourRegDestructiveHelper)    \
  V(Fmlslt, fmlslt, FourRegDestructiveHelper)    \
  V(Sqdmlalb, sqdmlalb, FourRegDestructiveHelper) \
  V(Sqdmlalbt, sqdmlalbt, FourRegDestructiveHelper) \
  V(Sqdmlalt, sqdmlalt, FourRegDestructiveHelper) \
  V(Sqdmlslb, sqdmlslb, FourRegDestructiveHelper) \
  V(Sqdmlslbt, sqdmlslbt, FourRegDestructiveHelper) \
  V(Sqdmlslt, sqdmlslt, FourRegDestructiveHelper) \
  V(Sqrdmlah, sqrdmlah, FourRegDestructiveHelper) \
  V(Sqrdmlsh, sqrdmlsh, FourRegDestructiveHelper) \
  V(Fmmla, fmmla, FourRegDestructiveHelper)      \
  V(Smmla, smmla, FourRegDestructiveHelper)      \
  V(Ummla, ummla, FourRegDestructiveHelper)      \
  V(Usmmla, usmmla, FourRegDestructiveHelper)    \
  V(Usdot, usdot, FourRegDestructiveHelper)

#define VIXL_DEFINE_MASM_FUNC(MASMFN, ASMFN, HELPER) \
  void MacroAssembler::MASMFN(const ZRegister& zd,   \
                              const ZRegister& za,   \
                              const ZRegister& zn,   \
                              const ZRegister& zm) { \
    VIXL_ASSERT(allow_macro_instructions_);          \
    HELPER(&Assembler::ASMFN, zd, za, zn, zm);       \
  }
VIXL_SVE_4REG_LIST(VIXL_DEFINE_MASM_FUNC)
#undef VIXL_DEFINE_MASM_FUNC

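// For reference, each entry in VIXL_SVE_4REG_LIST expands to a thin wrapper
// around the named helper; the Saba entry, for example, becomes (expansion
// shown for illustration only):
//
//   void MacroAssembler::Saba(const ZRegister& zd,
//                             const ZRegister& za,
//                             const ZRegister& zn,
//                             const ZRegister& zm) {
//     VIXL_ASSERT(allow_macro_instructions_);
//     AbsoluteDifferenceAccumulate(&Assembler::saba, zd, za, zn, zm);
//   }
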
#define VIXL_SVE_4REG_1IMM_LIST(V)                     \
  V(Fmla, fmla, FourRegOneImmDestructiveHelper)        \
  V(Fmls, fmls, FourRegOneImmDestructiveHelper)        \
  V(Fmlalb, fmlalb, FourRegOneImmDestructiveHelper)    \
  V(Fmlalt, fmlalt, FourRegOneImmDestructiveHelper)    \
  V(Fmlslb, fmlslb, FourRegOneImmDestructiveHelper)    \
  V(Fmlslt, fmlslt, FourRegOneImmDestructiveHelper)    \
  V(Mla, mla, FourRegOneImmDestructiveHelper)          \
  V(Mls, mls, FourRegOneImmDestructiveHelper)          \
  V(Smlalb, smlalb, FourRegOneImmDestructiveHelper)    \
  V(Smlalt, smlalt, FourRegOneImmDestructiveHelper)    \
  V(Smlslb, smlslb, FourRegOneImmDestructiveHelper)    \
  V(Smlslt, smlslt, FourRegOneImmDestructiveHelper)    \
  V(Sqdmlalb, sqdmlalb, FourRegOneImmDestructiveHelper) \
  V(Sqdmlalt, sqdmlalt, FourRegOneImmDestructiveHelper) \
  V(Sqdmlslb, sqdmlslb, FourRegOneImmDestructiveHelper) \
  V(Sqdmlslt, sqdmlslt, FourRegOneImmDestructiveHelper) \
  V(Sqrdmlah, sqrdmlah, FourRegOneImmDestructiveHelper) \
  V(Sqrdmlsh, sqrdmlsh, FourRegOneImmDestructiveHelper) \
  V(Umlalb, umlalb, FourRegOneImmDestructiveHelper)    \
  V(Umlalt, umlalt, FourRegOneImmDestructiveHelper)    \
  V(Umlslb, umlslb, FourRegOneImmDestructiveHelper)    \
  V(Umlslt, umlslt, FourRegOneImmDestructiveHelper)

#define VIXL_DEFINE_MASM_FUNC(MASMFN, ASMFN, HELPER) \
  void MacroAssembler::MASMFN(const ZRegister& zd,   \
                              const ZRegister& za,   \
                              const ZRegister& zn,   \
                              const ZRegister& zm,   \
                              int imm) {             \
    VIXL_ASSERT(allow_macro_instructions_);          \
    HELPER(&Assembler::ASMFN, zd, za, zn, zm, imm);  \
  }
VIXL_SVE_4REG_1IMM_LIST(VIXL_DEFINE_MASM_FUNC)
#undef VIXL_DEFINE_MASM_FUNC

void MacroAssembler::Sdot(const ZRegister& zd,
                          const ZRegister& za,
                          const ZRegister& zn,
                          const ZRegister& zm,
                          int index) {
  VIXL_ASSERT(allow_macro_instructions_);
  SVEDotIndexHelper(&Assembler::sdot, zd, za, zn, zm, index);
}

void MacroAssembler::Udot(const ZRegister& zd,
                          const ZRegister& za,
                          const ZRegister& zn,
                          const ZRegister& zm,
                          int index) {
  VIXL_ASSERT(allow_macro_instructions_);
  SVEDotIndexHelper(&Assembler::udot, zd, za, zn, zm, index);
}

void MacroAssembler::Sudot(const ZRegister& zd,
                           const ZRegister& za,
                           const ZRegister& zn,
                           const ZRegister& zm,
                           int index) {
  VIXL_ASSERT(allow_macro_instructions_);
  SVEDotIndexHelper(&Assembler::sudot, zd, za, zn, zm, index);
}

void MacroAssembler::Usdot(const ZRegister& zd,
                           const ZRegister& za,
                           const ZRegister& zn,
                           const ZRegister& zm,
                           int index) {
  VIXL_ASSERT(allow_macro_instructions_);
  SVEDotIndexHelper(&Assembler::usdot, zd, za, zn, zm, index);
}

void MacroAssembler::Cdot(const ZRegister& zd,
                          const ZRegister& za,
                          const ZRegister& zn,
                          const ZRegister& zm,
                          int index,
                          int rot) {
  // This doesn't handle zm when it's out of the range that can be encoded in
  // the instruction. The range depends on the element size: z0-z7 for B,
  // z0-z15 for H.
  if ((zd.Aliases(zn) || zd.Aliases(zm)) && !zd.Aliases(za)) {
    UseScratchRegisterScope temps(this);
    ZRegister ztmp = temps.AcquireZ().WithSameLaneSizeAs(zd);
    {
      MovprfxHelperScope guard(this, ztmp, za);
      cdot(ztmp, zn, zm, index, rot);
    }
    Mov(zd, ztmp);
  } else {
    MovprfxHelperScope guard(this, zd, za);
    cdot(zd, zn, zm, index, rot);
  }
}

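// Note (illustrative, not part of the original source): because of the
// encoding restriction mentioned above, something like
//
//   __ Cdot(z0.VnS(), z1.VnS(), z2.VnB(), z3.VnB(), 0, 90);
//
// is encodable (zm in z0-z7 for B source elements), whereas the same call
// with, say, z20 as zm would require the caller to move the operand into a
// low register first; this macro does not do that on the caller's behalf.
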
void MacroAssembler::Cdot(const ZRegister& zd,
                          const ZRegister& za,
                          const ZRegister& zn,
                          const ZRegister& zm,
                          int rot) {
  if ((zd.Aliases(zn) || zd.Aliases(zm)) && !zd.Aliases(za)) {
    UseScratchRegisterScope temps(this);
    VIXL_ASSERT(AreSameLaneSize(zn, zm));
    ZRegister ztmp = temps.AcquireZ().WithSameLaneSizeAs(zn);
    Mov(ztmp, zd.Aliases(zn) ? zn : zm);
    MovprfxHelperScope guard(this, zd, za);
    cdot(zd, (zd.Aliases(zn) ? ztmp : zn), (zd.Aliases(zm) ? ztmp : zm), rot);
  } else {
    MovprfxHelperScope guard(this, zd, za);
    cdot(zd, zn, zm, rot);
  }
}

void MacroAssembler::FPMulAddHelper(const ZRegister& zd,
                                    const PRegisterM& pg,
                                    const ZRegister& za,
                                    const ZRegister& zn,
                                    const ZRegister& zm,
                                    SVEMulAddPredicatedZdaFn fn_zda,
                                    SVEMulAddPredicatedZdnFn fn_zdn,
                                    FPMacroNaNPropagationOption nan_option) {
  ResolveFPNaNPropagationOption(&nan_option);

  if (zd.Aliases(za)) {
    // zda = (-)zda + ((-)zn * zm) for fmla, fmls, fnmla and fnmls.
    SingleEmissionCheckScope guard(this);
    (this->*fn_zda)(zd, pg, zn, zm);
  } else if (zd.Aliases(zn)) {
    // zdn = (-)za + ((-)zdn * zm) for fmad, fmsb, fnmad and fnmsb.
    SingleEmissionCheckScope guard(this);
    (this->*fn_zdn)(zd, pg, zm, za);
  } else if (zd.Aliases(zm)) {
    switch (nan_option) {
      case FastNaNPropagation: {
        // We treat multiplication as commutative in the fast mode, so we can
        // swap zn and zm.
        // zdm = (-)za + ((-)zdm * zn) for fmad, fmsb, fnmad and fnmsb.
        SingleEmissionCheckScope guard(this);
        (this->*fn_zdn)(zd, pg, zn, za);
        return;
      }
      case StrictNaNPropagation: {
        UseScratchRegisterScope temps(this);
        // Use a scratch register to keep the argument order exactly as
        // specified.
        ZRegister scratch = temps.AcquireZ().WithSameLaneSizeAs(zn);
        {
          MovprfxHelperScope guard(this, scratch, pg, za);
          // scratch = (-)za + ((-)zn * zm)
          (this->*fn_zda)(scratch, pg, zn, zm);
        }
        Mov(zd, scratch);
        return;
      }
      case NoFPMacroNaNPropagationSelected:
        VIXL_UNREACHABLE();
        return;
    }
  } else {
    // zd = (-)za + ((-)zn * zm) for fmla, fmls, fnmla and fnmls.
    MovprfxHelperScope guard(this, zd, pg, za);
    (this->*fn_zda)(zd, pg, zn, zm);
  }
}

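// Note (illustrative, not part of the original source): the nan_option only
// changes the generated code when zd aliases zm, because that is the one
// case where the helper would otherwise have to swap the multiplicands. For
// example, assuming a configured MacroAssembler,
//
//   __ Fmla(z0.VnD(), p0.Merging(), z1.VnD(), z2.VnD(), z0.VnD(),
//           FastNaNPropagation);
//
// can be emitted as a single fmad (treating the multiply as commutative),
// while StrictNaNPropagation forces the movprfx + fmla + Mov sequence so the
// operand order, and hence NaN selection, is preserved exactly.
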
void MacroAssembler::Fmla(const ZRegister& zd,
                          const PRegisterM& pg,
                          const ZRegister& za,
                          const ZRegister& zn,
                          const ZRegister& zm,
                          FPMacroNaNPropagationOption nan_option) {
  VIXL_ASSERT(allow_macro_instructions_);
  FPMulAddHelper(zd,
                 pg,
                 za,
                 zn,
                 zm,
                 &Assembler::fmla,
                 &Assembler::fmad,
                 nan_option);
}

void MacroAssembler::Fmls(const ZRegister& zd,
                          const PRegisterM& pg,
                          const ZRegister& za,
                          const ZRegister& zn,
                          const ZRegister& zm,
                          FPMacroNaNPropagationOption nan_option) {
  VIXL_ASSERT(allow_macro_instructions_);
  FPMulAddHelper(zd,
                 pg,
                 za,
                 zn,
                 zm,
                 &Assembler::fmls,
                 &Assembler::fmsb,
                 nan_option);
}

void MacroAssembler::Fnmla(const ZRegister& zd,
                           const PRegisterM& pg,
                           const ZRegister& za,
                           const ZRegister& zn,
                           const ZRegister& zm,
                           FPMacroNaNPropagationOption nan_option) {
  VIXL_ASSERT(allow_macro_instructions_);
  FPMulAddHelper(zd,
                 pg,
                 za,
                 zn,
                 zm,
                 &Assembler::fnmla,
                 &Assembler::fnmad,
                 nan_option);
}

void MacroAssembler::Fnmls(const ZRegister& zd,
                           const PRegisterM& pg,
                           const ZRegister& za,
                           const ZRegister& zn,
                           const ZRegister& zm,
                           FPMacroNaNPropagationOption nan_option) {
  VIXL_ASSERT(allow_macro_instructions_);
  FPMulAddHelper(zd,
                 pg,
                 za,
                 zn,
                 zm,
                 &Assembler::fnmls,
                 &Assembler::fnmsb,
                 nan_option);
}

void MacroAssembler::Ftmad(const ZRegister& zd,
                           const ZRegister& zn,
                           const ZRegister& zm,
                           int imm3) {
  VIXL_ASSERT(allow_macro_instructions_);
  if (zd.Aliases(zm) && !zd.Aliases(zn)) {
    UseScratchRegisterScope temps(this);
    ZRegister scratch = temps.AcquireZ().WithSameLaneSizeAs(zm);
    Mov(scratch, zm);
    MovprfxHelperScope guard(this, zd, zn);
    ftmad(zd, zd, scratch, imm3);
  } else {
    MovprfxHelperScope guard(this, zd, zn);
    ftmad(zd, zd, zm, imm3);
  }
}

void MacroAssembler::Fcadd(const ZRegister& zd,
                           const PRegisterM& pg,
                           const ZRegister& zn,
                           const ZRegister& zm,
                           int rot) {
  VIXL_ASSERT(allow_macro_instructions_);
  if (zd.Aliases(zm) && !zd.Aliases(zn)) {
    UseScratchRegisterScope temps(this);
    ZRegister scratch = temps.AcquireZ().WithSameLaneSizeAs(zd);
    {
      MovprfxHelperScope guard(this, scratch, pg, zn);
      fcadd(scratch, pg, scratch, zm, rot);
    }
    Mov(zd, scratch);
  } else {
    MovprfxHelperScope guard(this, zd, pg, zn);
    fcadd(zd, pg, zd, zm, rot);
  }
}

void MacroAssembler::Fcmla(const ZRegister& zd,
                           const PRegisterM& pg,
                           const ZRegister& za,
                           const ZRegister& zn,
                           const ZRegister& zm,
                           int rot) {
  VIXL_ASSERT(allow_macro_instructions_);
  if ((zd.Aliases(zn) || zd.Aliases(zm)) && !zd.Aliases(za)) {
    UseScratchRegisterScope temps(this);
    ZRegister ztmp = temps.AcquireZ().WithSameLaneSizeAs(zd);
    {
      MovprfxHelperScope guard(this, ztmp, za);
      fcmla(ztmp, pg, zn, zm, rot);
    }
    Mov(zd, pg, ztmp);
  } else {
    MovprfxHelperScope guard(this, zd, pg, za);
    fcmla(zd, pg, zn, zm, rot);
  }
}

void MacroAssembler::Splice(const ZRegister& zd,
                            const PRegister& pg,
                            const ZRegister& zn,
                            const ZRegister& zm) {
  VIXL_ASSERT(allow_macro_instructions_);
  if (CPUHas(CPUFeatures::kSVE2) && AreConsecutive(zn, zm) && !zd.Aliases(zn)) {
    SingleEmissionCheckScope guard(this);
    splice(zd, pg, zn, zm);
  } else if (zd.Aliases(zm) && !zd.Aliases(zn)) {
    UseScratchRegisterScope temps(this);
    ZRegister scratch = temps.AcquireZ().WithSameLaneSizeAs(zd);
    {
      MovprfxHelperScope guard(this, scratch, zn);
      splice(scratch, pg, scratch, zm);
    }
    Mov(zd, scratch);
  } else {
    MovprfxHelperScope guard(this, zd, zn);
    splice(zd, pg, zd, zm);
  }
}

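// Note (illustrative, not part of the original source): Splice only uses the
// SVE2 constructive form when zn and zm are consecutive registers (e.g. z4
// and z5) and zd does not alias zn, since that form reads a register pair
// { Zn, Zn+1 }. Otherwise it falls back to the destructive SVE form through
// movprfx, using a scratch register when zd aliases zm.
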
void MacroAssembler::Clasta(const ZRegister& zd,
                            const PRegister& pg,
                            const ZRegister& zn,
                            const ZRegister& zm) {
  VIXL_ASSERT(allow_macro_instructions_);
  if (zd.Aliases(zm) && !zd.Aliases(zn)) {
    UseScratchRegisterScope temps(this);
    ZRegister scratch = temps.AcquireZ().WithSameLaneSizeAs(zd);
    {
      MovprfxHelperScope guard(this, scratch, zn);
      clasta(scratch, pg, scratch, zm);
    }
    Mov(zd, scratch);
  } else {
    MovprfxHelperScope guard(this, zd, zn);
    clasta(zd, pg, zd, zm);
  }
}

void MacroAssembler::Clastb(const ZRegister& zd,
                            const PRegister& pg,
                            const ZRegister& zn,
                            const ZRegister& zm) {
  VIXL_ASSERT(allow_macro_instructions_);
  if (zd.Aliases(zm) && !zd.Aliases(zn)) {
    UseScratchRegisterScope temps(this);
    ZRegister scratch = temps.AcquireZ().WithSameLaneSizeAs(zd);
    {
      MovprfxHelperScope guard(this, scratch, zn);
      clastb(scratch, pg, scratch, zm);
    }
    Mov(zd, scratch);
  } else {
    MovprfxHelperScope guard(this, zd, zn);
    clastb(zd, pg, zd, zm);
  }
}

void MacroAssembler::ShiftRightAccumulate(IntArithImmFn fn,
                                          const ZRegister& zd,
                                          const ZRegister& za,
                                          const ZRegister& zn,
                                          int shift) {
  VIXL_ASSERT(allow_macro_instructions_);
  if (!zd.Aliases(za) && zd.Aliases(zn)) {
    UseScratchRegisterScope temps(this);
    ZRegister ztmp = temps.AcquireZ().WithSameLaneSizeAs(zn);
    Mov(ztmp, zn);
    {
      MovprfxHelperScope guard(this, zd, za);
      (this->*fn)(zd, ztmp, shift);
    }
  } else {
    MovprfxHelperScope guard(this, zd, za);
    (this->*fn)(zd, zn, shift);
  }
}

void MacroAssembler::Srsra(const ZRegister& zd,
                           const ZRegister& za,
                           const ZRegister& zn,
                           int shift) {
  ShiftRightAccumulate(&Assembler::srsra, zd, za, zn, shift);
}

void MacroAssembler::Ssra(const ZRegister& zd,
                          const ZRegister& za,
                          const ZRegister& zn,
                          int shift) {
  ShiftRightAccumulate(&Assembler::ssra, zd, za, zn, shift);
}

void MacroAssembler::Ursra(const ZRegister& zd,
                           const ZRegister& za,
                           const ZRegister& zn,
                           int shift) {
  ShiftRightAccumulate(&Assembler::ursra, zd, za, zn, shift);
}

void MacroAssembler::Usra(const ZRegister& zd,
                          const ZRegister& za,
                          const ZRegister& zn,
                          int shift) {
  ShiftRightAccumulate(&Assembler::usra, zd, za, zn, shift);
}

void MacroAssembler::ComplexAddition(ZZZImmFn fn,
                                     const ZRegister& zd,
                                     const ZRegister& zn,
                                     const ZRegister& zm,
                                     int rot) {
  VIXL_ASSERT(allow_macro_instructions_);
  if (!zd.Aliases(zn) && zd.Aliases(zm)) {
    UseScratchRegisterScope temps(this);
    ZRegister ztmp = temps.AcquireZ().WithSameLaneSizeAs(zm);
    Mov(ztmp, zm);
    {
      MovprfxHelperScope guard(this, zd, zn);
      (this->*fn)(zd, zd, ztmp, rot);
    }
  } else {
    MovprfxHelperScope guard(this, zd, zn);
    (this->*fn)(zd, zd, zm, rot);
  }
}

void MacroAssembler::Cadd(const ZRegister& zd,
                          const ZRegister& zn,
                          const ZRegister& zm,
                          int rot) {
  ComplexAddition(&Assembler::cadd, zd, zn, zm, rot);
}

void MacroAssembler::Sqcadd(const ZRegister& zd,
                            const ZRegister& zn,
                            const ZRegister& zm,
                            int rot) {
  ComplexAddition(&Assembler::sqcadd, zd, zn, zm, rot);
}

}  // namespace aarch64
}  // namespace vixl