1 // Copyright 2017, VIXL authors
2 // All rights reserved.
3 //
4 // Redistribution and use in source and binary forms, with or without
5 // modification, are permitted provided that the following conditions are met:
6 //
7 // * Redistributions of source code must retain the above copyright notice,
8 // this list of conditions and the following disclaimer.
9 // * Redistributions in binary form must reproduce the above copyright notice,
10 // this list of conditions and the following disclaimer in the documentation
11 // and/or other materials provided with the distribution.
12 // * Neither the name of ARM Limited nor the names of its contributors may be
13 // used to endorse or promote products derived from this software without
14 // specific prior written permission.
15 //
16 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS CONTRIBUTORS "AS IS" AND
17 // ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18 // WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
20 // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22 // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23 // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24 // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26
27 #include <cstdio>
28 #include <cstring>
29 #include <string>
30
31 #include "test-runner.h"
32 #include "test-utils.h"
33 #include "aarch64/test-utils-aarch64.h"
34
35 #include "aarch64/assembler-aarch64.h"
36 #include "aarch64/instructions-aarch64.h"
37
38 #define __ assm.
39 #define TEST(name) TEST_(AARCH64_API_##name)
40
41 namespace vixl {
42 namespace aarch64 {
43
44 class InstructionReporter : public DecoderVisitor {
45 public:
InstructionReporter()46 InstructionReporter() : DecoderVisitor(kNonConstVisitor) {}
47
Visit(Metadata * metadata,const Instruction * instr)48 void Visit(Metadata* metadata, const Instruction* instr) VIXL_OVERRIDE {
49 USE(instr);
50 instr_form_ = (*metadata)["form"];
51 }
52
MoveForm()53 std::string MoveForm() { return std::move(instr_form_); }
54
55 private:
56 std::string instr_form_;
57 };
58
CheckAndMaybeDisassembleMovprfxPairs(const CodeBuffer * buffer,bool can_take_movprfx)59 static void CheckAndMaybeDisassembleMovprfxPairs(const CodeBuffer* buffer,
60 bool can_take_movprfx) {
61 const Instruction* pair = buffer->GetStartAddress<Instruction*>();
62 const Instruction* end = buffer->GetEndAddress<Instruction*>();
63 bool any_failures = false;
64 PrintDisassembler print_disasm(stdout);
65 Decoder decoder;
66 InstructionReporter reporter;
67 decoder.AppendVisitor(&reporter);
68
69 while (pair < end) {
70 const Instruction* movprfx = pair;
71 const Instruction* candidate = pair->GetNextInstruction();
72 const Instruction* next_pair = candidate->GetNextInstruction();
73 VIXL_ASSERT(candidate < end);
74
75 Instr inst = candidate->GetInstructionBits();
76 decoder.Decode(reinterpret_cast<Instruction*>(&inst));
77 std::string form = reporter.MoveForm();
78 bool failed =
79 can_take_movprfx != candidate->CanTakeSVEMovprfx(form.c_str(), movprfx);
80 any_failures = any_failures || failed;
81
82 if (failed || Test::disassemble()) {
83 printf("----\n");
84 if (failed) {
85 printf("# ERROR: Expected %sCanTakeSVEMovprfx(movprfx):\n",
86 can_take_movprfx ? "" : "!");
87 }
88 print_disasm.DisassembleBuffer(pair, next_pair);
89 }
90
91 pair = next_pair;
92 }
93 // Abort only at the end, so we can see the individual failures.
94 VIXL_CHECK(!any_failures);
95 }
96
TEST(movprfx_negative_aliasing) {
  // Test that CanTakeSVEMovprfx() checks that the movprfx destination does not
  // alias an input to the prefixed instruction.
  //
  // Every pair below prefixes an instruction whose destination register also
  // appears as a source, so CanTakeSVEMovprfx() must return false for each
  // pair; this is asserted by passing `false` to
  // CheckAndMaybeDisassembleMovprfxPairs() at the end.
  Assembler assm;
  assm.GetCPUFeatures()->Combine(CPUFeatures::kSVE, CPUFeatures::kSVEI8MM);
  {
    // We have to use the Assembler directly to generate movprfx, so we need
    // to manually reserve space for the code we're about to emit.
    static const size_t kPairCount = 79;
    CodeBufferCheckScope guard(&assm, kPairCount * 2 * kInstructionSize);

    __ movprfx(z0.VnB(), p0.Merging(), z9.VnB());
    __ abs(z0.VnB(), p0.Merging(), z0.VnB());

    __ movprfx(z1, z17);
    __ add(z1.VnH(), p2.Merging(), z1.VnH(), z1.VnH());

    __ movprfx(z12, z13);
    __ and_(z12.VnD(), p5.Merging(), z12.VnD(), z12.VnD());

    __ movprfx(z2, z4);
    __ asr(z2.VnS(), p2.Merging(), z2.VnS(), z2.VnS());

    // Shift with a wide (D-sized) shift-amount operand that aliases zd.
    __ movprfx(z10, z18);
    __ asr(z10.VnH(), p2.Merging(), z10.VnH(), z10.VnD());

    __ movprfx(z17.VnD(), p5.Zeroing(), z20.VnD());
    __ asr(z17.VnD(), p5.Merging(), z17.VnD(), z17.VnD());

    __ movprfx(z22, z9);
    __ asrr(z22.VnH(), p1.Merging(), z22.VnH(), z22.VnH());

    __ movprfx(z0.VnS(), p6.Zeroing(), z6.VnS());
    __ bic(z0.VnS(), p6.Merging(), z0.VnS(), z0.VnS());

    __ movprfx(z12, z16);
    __ clasta(z12.VnD(), p5, z12.VnD(), z12.VnD());

    __ movprfx(z7, z15);
    __ clastb(z7.VnS(), p7, z7.VnS(), z7.VnS());

    __ movprfx(z10, z29);
    __ cls(z10.VnH(), p2.Merging(), z10.VnH());

    __ movprfx(z6, z13);
    __ clz(z6.VnB(), p4.Merging(), z6.VnB());

    __ movprfx(z14.VnS(), p6.Zeroing(), z3.VnS());
    __ cnot(z14.VnS(), p6.Merging(), z14.VnS());

    __ movprfx(z5.VnD(), p6.Merging(), z4.VnD());
    __ cnt(z5.VnD(), p6.Merging(), z5.VnD());

    __ movprfx(z19.VnB(), p6.Zeroing(), z4.VnB());
    __ eor(z19.VnB(), p6.Merging(), z19.VnB(), z19.VnB());

    __ movprfx(z27, z2);
    __ ext(z27.VnB(), z27.VnB(), z27.VnB(), 42);

    __ movprfx(z4.VnS(), p1.Zeroing(), z22.VnS());
    __ lsl(z4.VnS(), p1.Merging(), z4.VnS(), z4.VnS());

    __ movprfx(z4, z5);
    __ lsl(z4.VnB(), p5.Merging(), z4.VnB(), z4.VnD());

    __ movprfx(z11.VnD(), p4.Merging(), z29.VnD());
    __ lsl(z11.VnD(), p4.Merging(), z11.VnD(), z11.VnD());

    __ movprfx(z12.VnD(), p6.Merging(), z3.VnD());
    __ lslr(z12.VnD(), p6.Merging(), z12.VnD(), z12.VnD());

    __ movprfx(z7, z2);
    __ lsr(z7.VnB(), p4.Merging(), z7.VnB(), z7.VnB());

    __ movprfx(z25.VnH(), p6.Merging(), z28.VnH());
    __ lsr(z25.VnH(), p6.Merging(), z25.VnH(), z25.VnD());

    __ movprfx(z14.VnD(), p6.Merging(), z6.VnD());
    __ lsr(z14.VnD(), p6.Merging(), z14.VnD(), z14.VnD());

    __ movprfx(z26.VnH(), p6.Zeroing(), z27.VnH());
    __ lsrr(z26.VnH(), p6.Merging(), z26.VnH(), z26.VnH());

    // Multiply-accumulate forms: the alias may be either multiplicand
    // (mad/msb) or either multiplier input (mla/mls).
    __ movprfx(z17.VnS(), p4.Zeroing(), z29.VnS());
    __ mad(z17.VnS(), p4.Merging(), z17.VnS(), z23.VnS());

    __ movprfx(z7, z17);
    __ mad(z7.VnD(), p5.Merging(), z4.VnD(), z7.VnD());

    __ movprfx(z11, z7);
    __ mla(z11.VnS(), p1.Merging(), z11.VnS(), z27.VnS());

    __ movprfx(z7, z5);
    __ mla(z7.VnH(), p0.Merging(), z5.VnH(), z7.VnH());

    __ movprfx(z1.VnH(), p0.Merging(), z17.VnH());
    __ mls(z1.VnH(), p0.Merging(), z1.VnH(), z31.VnH());

    __ movprfx(z22.VnB(), p3.Merging(), z18.VnB());
    __ mls(z22.VnB(), p3.Merging(), z18.VnB(), z22.VnB());

    __ movprfx(z7.VnS(), p0.Merging(), z10.VnS());
    __ msb(z7.VnS(), p0.Merging(), z7.VnS(), z10.VnS());

    __ movprfx(z12, z6);
    __ msb(z12.VnH(), p7.Merging(), z6.VnH(), z12.VnH());

    __ movprfx(z8.VnB(), p4.Merging(), z3.VnB());
    __ mul(z8.VnB(), p4.Merging(), z8.VnB(), z8.VnB());

    __ movprfx(z9, z26);
    __ neg(z9.VnS(), p7.Merging(), z9.VnS());

    __ movprfx(z16, z8);
    __ not_(z16.VnH(), p6.Merging(), z16.VnH());

    __ movprfx(z25.VnH(), p5.Zeroing(), z11.VnH());
    __ orr(z25.VnH(), p5.Merging(), z25.VnH(), z25.VnH());

    __ movprfx(z17.VnH(), p1.Merging(), z22.VnH());
    __ rbit(z17.VnH(), p1.Merging(), z17.VnH());

    __ movprfx(z11, z25);
    __ revb(z11.VnD(), p6.Merging(), z11.VnD());

    __ movprfx(z13, z27);
    __ revh(z13.VnS(), p2.Merging(), z13.VnS());

    __ movprfx(z30.VnD(), p6.Merging(), z20.VnD());
    __ revw(z30.VnD(), p6.Merging(), z30.VnD());

    __ movprfx(z2.VnD(), p2.Merging(), z21.VnD());
    __ sabd(z2.VnD(), p2.Merging(), z2.VnD(), z2.VnD());

    __ movprfx(z0, z7);
    __ sdiv(z0.VnD(), p0.Merging(), z0.VnD(), z0.VnD());

    __ movprfx(z19, z28);
    __ sdivr(z19.VnS(), p1.Merging(), z19.VnS(), z19.VnS());

    // Dot products, in both indexed and vector forms, where the destination
    // aliases one of the source operands.
    __ movprfx(z5, z18);
    __ sdot(z5.VnS(), z18.VnB(), z5.VnB(), 1);

    __ movprfx(z15, z11);
    __ sdot(z15.VnD(), z2.VnH(), z15.VnH(), 1);

    __ movprfx(z30, z13);
    __ sdot(z30.VnD(), z30.VnH(), z13.VnH(), 1);

    __ movprfx(z8, z9);
    __ sdot(z8.VnS(), z8.VnB(), z9.VnB());

    __ movprfx(z23, z14);
    __ sdot(z23.VnS(), z14.VnB(), z23.VnB());

    __ movprfx(z26, z5);
    __ sdot(z26.VnS(), z26.VnB(), z5.VnB(), 1);

    __ movprfx(z14, z15);
    __ smax(z14.VnB(), p2.Merging(), z14.VnB(), z14.VnB());

    __ movprfx(z26.VnS(), p0.Merging(), z10.VnS());
    __ smin(z26.VnS(), p0.Merging(), z26.VnS(), z26.VnS());

    __ movprfx(z22, z18);
    __ smulh(z22.VnB(), p2.Merging(), z22.VnB(), z22.VnB());

    __ movprfx(z8, z19);
    __ splice(z8.VnD(), p2, z8.VnD(), z8.VnD());

    __ movprfx(z23.VnH(), p6.Zeroing(), z2.VnH());
    __ sub(z23.VnH(), p6.Merging(), z23.VnH(), z23.VnH());

    __ movprfx(z25.VnS(), p2.Merging(), z21.VnS());
    __ subr(z25.VnS(), p2.Merging(), z25.VnS(), z25.VnS());

    __ movprfx(z28, z31);
    __ sxtb(z28.VnS(), p6.Merging(), z28.VnS());

    __ movprfx(z14.VnD(), p6.Merging(), z17.VnD());
    __ sxth(z14.VnD(), p6.Merging(), z14.VnD());

    __ movprfx(z21.VnD(), p0.Zeroing(), z28.VnD());
    __ sxtw(z21.VnD(), p0.Merging(), z21.VnD());

    __ movprfx(z25, z30);
    __ uabd(z25.VnB(), p5.Merging(), z25.VnB(), z25.VnB());

    __ movprfx(z13.VnD(), p2.Merging(), z30.VnD());
    __ udiv(z13.VnD(), p2.Merging(), z13.VnD(), z13.VnD());

    __ movprfx(z19.VnD(), p4.Zeroing(), z6.VnD());
    __ udivr(z19.VnD(), p4.Merging(), z19.VnD(), z19.VnD());

    __ movprfx(z1, z20);
    __ udot(z1.VnS(), z18.VnB(), z1.VnB(), 1);

    __ movprfx(z8, z2);
    __ udot(z8.VnD(), z2.VnH(), z8.VnH(), 1);

    __ movprfx(z28, z10);
    __ udot(z28.VnD(), z28.VnH(), z7.VnH(), 1);

    __ movprfx(z21, z11);
    __ udot(z21.VnD(), z21.VnH(), z11.VnH());

    __ movprfx(z1, z22);
    __ udot(z1.VnD(), z10.VnH(), z1.VnH());

    __ movprfx(z8, z23);
    __ udot(z8.VnS(), z8.VnB(), z0.VnB(), 1);

    __ movprfx(z10.VnB(), p5.Zeroing(), z0.VnB());
    __ umax(z10.VnB(), p5.Merging(), z10.VnB(), z10.VnB());

    __ movprfx(z0.VnS(), p2.Zeroing(), z30.VnS());
    __ umin(z0.VnS(), p2.Merging(), z0.VnS(), z0.VnS());

    __ movprfx(z26.VnD(), p6.Zeroing(), z29.VnD());
    __ umulh(z26.VnD(), p6.Merging(), z26.VnD(), z26.VnD());

    __ movprfx(z23, z25);
    __ uxtb(z23.VnS(), p7.Merging(), z23.VnS());

    __ movprfx(z14.VnS(), p3.Zeroing(), z5.VnS());
    __ uxth(z14.VnS(), p3.Merging(), z14.VnS());

    __ movprfx(z14, z5);
    __ uxtw(z14.VnD(), p3.Merging(), z14.VnD());

    // Int8 matrix multiply (I8MM) instructions, aliasing one source each.
    __ movprfx(z22, z5);
    __ smmla(z22.VnS(), z22.VnB(), z0.VnB());

    __ movprfx(z1, z5);
    __ ummla(z1.VnS(), z10.VnB(), z1.VnB());

    __ movprfx(z30, z5);
    __ usmmla(z30.VnS(), z30.VnB(), z18.VnB());

    __ movprfx(z4, z5);
    __ usdot(z4.VnS(), z3.VnB(), z4.VnB());

    __ movprfx(z10, z5);
    __ usdot(z10.VnS(), z10.VnB(), z0.VnB(), 0);

    __ movprfx(z1, z5);
    __ sudot(z1.VnS(), z10.VnB(), z1.VnB(), 1);
  }
  assm.FinalizeCode();

  CheckAndMaybeDisassembleMovprfxPairs(assm.GetBuffer(), false);
}
349
TEST(movprfx_negative_aliasing_fp) {
  // Test that CanTakeSVEMovprfx() checks that the movprfx destination does not
  // alias an input to the prefixed instruction.
  //
  // Floating-point counterpart of movprfx_negative_aliasing: each pair's
  // destination aliases a source, so all pairs are expected to fail (the
  // `false` argument at the end).
  Assembler assm;
  assm.GetCPUFeatures()->Combine(CPUFeatures::kSVE,
                                 CPUFeatures::kSVEF32MM,
                                 CPUFeatures::kSVEF64MM);
  {
    // We have to use the Assembler directly to generate movprfx, so we need
    // to manually reserve space for the code we're about to emit.
    static const size_t kPairCount = 80;
    CodeBufferCheckScope guard(&assm, kPairCount * 2 * kInstructionSize);

    __ movprfx(z17.VnS(), p1.Zeroing(), z12.VnS());
    __ fabd(z17.VnS(), p1.Merging(), z17.VnS(), z17.VnS());

    __ movprfx(z13, z23);
    __ fabs(z13.VnS(), p4.Merging(), z13.VnS());

    __ movprfx(z24.VnS(), p5.Merging(), z15.VnS());
    __ fadd(z24.VnS(), p5.Merging(), z24.VnS(), z24.VnS());

    __ movprfx(z28.VnD(), p5.Zeroing(), z14.VnD());
    __ fcadd(z28.VnD(), p5.Merging(), z28.VnD(), z28.VnD(), 90);

    // Indexed complex multiply-accumulate, aliasing either source operand.
    __ movprfx(z5, z0);
    __ fcmla(z5.VnH(), z0.VnH(), z5.VnH(), 2, 180);

    __ movprfx(z10, z4);
    __ fcmla(z10.VnS(), z8.VnS(), z10.VnS(), 1, 270);

    __ movprfx(z12, z26);
    __ fcmla(z12.VnH(), z12.VnH(), z3.VnH(), 2, 180);

    __ movprfx(z8, z1);
    __ fcmla(z8.VnS(), z8.VnS(), z1.VnS(), 1, 270);

    // FP conversions: source and destination lane sizes differ, but the
    // underlying register still aliases.
    __ movprfx(z16.VnD(), p0.Merging(), z13.VnD());
    __ fcvt(z16.VnD(), p0.Merging(), z16.VnH());

    __ movprfx(z12.VnD(), p7.Zeroing(), z13.VnD());
    __ fcvt(z12.VnD(), p7.Merging(), z12.VnS());

    __ movprfx(z14, z26);
    __ fcvt(z14.VnS(), p5.Merging(), z14.VnD());

    __ movprfx(z26, z2);
    __ fcvt(z26.VnH(), p7.Merging(), z26.VnD());

    __ movprfx(z25.VnD(), p2.Merging(), z13.VnD());
    __ fcvtzs(z25.VnD(), p2.Merging(), z25.VnH());

    __ movprfx(z31, z2);
    __ fcvtzs(z31.VnH(), p7.Merging(), z31.VnH());

    __ movprfx(z21.VnD(), p1.Merging(), z7.VnD());
    __ fcvtzs(z21.VnD(), p1.Merging(), z21.VnS());

    __ movprfx(z5, z17);
    __ fcvtzs(z5.VnS(), p5.Merging(), z5.VnD());

    __ movprfx(z19.VnD(), p1.Zeroing(), z16.VnD());
    __ fcvtzu(z19.VnD(), p1.Merging(), z19.VnH());

    __ movprfx(z2.VnH(), p7.Zeroing(), z28.VnH());
    __ fcvtzu(z2.VnH(), p7.Merging(), z2.VnH());

    __ movprfx(z21.VnD(), p7.Zeroing(), z27.VnD());
    __ fcvtzu(z21.VnD(), p7.Merging(), z21.VnS());

    __ movprfx(z22.VnD(), p4.Zeroing(), z8.VnD());
    __ fcvtzu(z22.VnS(), p4.Merging(), z22.VnD());

    __ movprfx(z0.VnS(), p5.Merging(), z5.VnS());
    __ fdiv(z0.VnS(), p5.Merging(), z0.VnS(), z0.VnS());

    __ movprfx(z12, z24);
    __ fdivr(z12.VnD(), p7.Merging(), z12.VnD(), z12.VnD());

    __ movprfx(z14.VnD(), p6.Zeroing(), z21.VnD());
    __ fmad(z14.VnD(), p6.Merging(), z14.VnD(), z3.VnD());

    __ movprfx(z2.VnS(), p5.Zeroing(), z10.VnS());
    __ fmad(z2.VnS(), p5.Merging(), z14.VnS(), z2.VnS());

    __ movprfx(z24, z5);
    __ fmax(z24.VnS(), p1.Merging(), z24.VnS(), z24.VnS());

    __ movprfx(z15.VnD(), p2.Merging(), z26.VnD());
    __ fmaxnm(z15.VnD(), p2.Merging(), z15.VnD(), z15.VnD());

    __ movprfx(z20, z22);
    __ fmin(z20.VnH(), p0.Merging(), z20.VnH(), z20.VnH());

    __ movprfx(z24.VnS(), p6.Zeroing(), z30.VnS());
    __ fminnm(z24.VnS(), p6.Merging(), z24.VnS(), z24.VnS());

    // FP multiply-accumulate, in indexed and predicated forms, aliasing
    // either multiplier input.
    __ movprfx(z4, z24);
    __ fmla(z4.VnH(), z24.VnH(), z4.VnH(), 7);

    __ movprfx(z4, z7);
    __ fmla(z4.VnS(), z24.VnS(), z4.VnS(), 3);

    __ movprfx(z5, z28);
    __ fmla(z5.VnD(), z28.VnD(), z5.VnD(), 1);

    __ movprfx(z24, z2);
    __ fmla(z24.VnD(), z24.VnD(), z2.VnD(), 1);

    __ movprfx(z7, z21);
    __ fmla(z7.VnH(), p2.Merging(), z7.VnH(), z31.VnH());

    __ movprfx(z25.VnH(), p5.Zeroing(), z29.VnH());
    __ fmla(z25.VnH(), p5.Merging(), z29.VnH(), z25.VnH());

    __ movprfx(z31, z25);
    __ fmla(z31.VnH(), z31.VnH(), z2.VnH(), 7);

    __ movprfx(z15, z4);
    __ fmla(z15.VnS(), z15.VnS(), z4.VnS(), 3);

    __ movprfx(z7, z11);
    __ fmls(z7.VnH(), z11.VnH(), z7.VnH(), 4);

    __ movprfx(z3, z10);
    __ fmls(z3.VnS(), z10.VnS(), z3.VnS(), 3);

    __ movprfx(z5, z16);
    __ fmls(z5.VnD(), z16.VnD(), z5.VnD(), 1);

    __ movprfx(z31, z26);
    __ fmls(z31.VnD(), z31.VnD(), z8.VnD(), 1);

    __ movprfx(z5.VnH(), p3.Merging(), z2.VnH());
    __ fmls(z5.VnH(), p3.Merging(), z5.VnH(), z2.VnH());

    __ movprfx(z22.VnS(), p3.Zeroing(), z17.VnS());
    __ fmls(z22.VnS(), p3.Merging(), z21.VnS(), z22.VnS());

    __ movprfx(z17, z2);
    __ fmls(z17.VnH(), z17.VnH(), z2.VnH(), 4);

    __ movprfx(z28, z11);
    __ fmls(z28.VnS(), z28.VnS(), z0.VnS(), 3);

    __ movprfx(z15.VnD(), p1.Merging(), z31.VnD());
    __ fmsb(z15.VnD(), p1.Merging(), z15.VnD(), z31.VnD());

    __ movprfx(z21.VnD(), p0.Zeroing(), z5.VnD());
    __ fmsb(z21.VnD(), p0.Merging(), z19.VnD(), z21.VnD());

    __ movprfx(z0.VnH(), p3.Merging(), z31.VnH());
    __ fmul(z0.VnH(), p3.Merging(), z0.VnH(), z0.VnH());

    __ movprfx(z31.VnH(), p6.Merging(), z8.VnH());
    __ fmulx(z31.VnH(), p6.Merging(), z31.VnH(), z31.VnH());

    __ movprfx(z17.VnH(), p1.Zeroing(), z10.VnH());
    __ fneg(z17.VnH(), p1.Merging(), z17.VnH());

    __ movprfx(z22, z31);
    __ fnmad(z22.VnH(), p1.Merging(), z22.VnH(), z23.VnH());

    __ movprfx(z14.VnD(), p0.Zeroing(), z26.VnD());
    __ fnmad(z14.VnD(), p0.Merging(), z2.VnD(), z14.VnD());

    __ movprfx(z13.VnH(), p6.Zeroing(), z29.VnH());
    __ fnmla(z13.VnH(), p6.Merging(), z13.VnH(), z26.VnH());

    __ movprfx(z19.VnH(), p7.Zeroing(), z25.VnH());
    __ fnmla(z19.VnH(), p7.Merging(), z25.VnH(), z19.VnH());

    __ movprfx(z27.VnH(), p5.Merging(), z24.VnH());
    __ fnmls(z27.VnH(), p5.Merging(), z27.VnH(), z24.VnH());

    __ movprfx(z6.VnH(), p6.Zeroing(), z21.VnH());
    __ fnmls(z6.VnH(), p6.Merging(), z21.VnH(), z6.VnH());

    __ movprfx(z7.VnS(), p3.Merging(), z23.VnS());
    __ fnmsb(z7.VnS(), p3.Merging(), z7.VnS(), z23.VnS());

    __ movprfx(z29.VnH(), p2.Zeroing(), z24.VnH());
    __ fnmsb(z29.VnH(), p2.Merging(), z24.VnH(), z29.VnH());

    __ movprfx(z7.VnH(), p6.Merging(), z23.VnH());
    __ frecpx(z7.VnH(), p6.Merging(), z7.VnH());

    __ movprfx(z17.VnS(), p5.Zeroing(), z2.VnS());
    __ frinta(z17.VnS(), p5.Merging(), z17.VnS());

    __ movprfx(z0.VnS(), p2.Zeroing(), z7.VnS());
    __ frinti(z0.VnS(), p2.Merging(), z0.VnS());

    __ movprfx(z8.VnH(), p3.Merging(), z20.VnH());
    __ frintm(z8.VnH(), p3.Merging(), z8.VnH());

    __ movprfx(z3.VnD(), p2.Zeroing(), z20.VnD());
    __ frintn(z3.VnD(), p2.Merging(), z3.VnD());

    __ movprfx(z11, z3);
    __ frintp(z11.VnS(), p4.Merging(), z11.VnS());

    __ movprfx(z23, z29);
    __ frintx(z23.VnD(), p4.Merging(), z23.VnD());

    __ movprfx(z4.VnH(), p4.Zeroing(), z14.VnH());
    __ frintz(z4.VnH(), p4.Merging(), z4.VnH());

    __ movprfx(z18.VnH(), p3.Zeroing(), z0.VnH());
    __ fscale(z18.VnH(), p3.Merging(), z18.VnH(), z18.VnH());

    __ movprfx(z2.VnS(), p6.Zeroing(), z4.VnS());
    __ fsqrt(z2.VnS(), p6.Merging(), z2.VnS());

    __ movprfx(z14.VnD(), p4.Zeroing(), z31.VnD());
    __ fsub(z14.VnD(), p4.Merging(), z14.VnD(), z14.VnD());

    __ movprfx(z31.VnH(), p2.Merging(), z6.VnH());
    __ fsubr(z31.VnH(), p2.Merging(), z31.VnH(), z31.VnH());

    __ movprfx(z4, z30);
    __ ftmad(z4.VnH(), z4.VnH(), z4.VnH(), 2);

    __ movprfx(z25.VnD(), p6.Zeroing(), z2.VnD());
    __ scvtf(z25.VnD(), p6.Merging(), z25.VnS());

    __ movprfx(z0.VnD(), p3.Merging(), z16.VnD());
    __ scvtf(z0.VnD(), p3.Merging(), z0.VnD());

    __ movprfx(z19, z23);
    __ scvtf(z19.VnS(), p7.Merging(), z19.VnD());

    __ movprfx(z19, z4);
    __ scvtf(z19.VnH(), p4.Merging(), z19.VnD());

    __ movprfx(z13.VnD(), p4.Zeroing(), z6.VnD());
    __ ucvtf(z13.VnD(), p4.Merging(), z13.VnS());

    __ movprfx(z6.VnH(), p0.Zeroing(), z14.VnH());
    __ ucvtf(z6.VnH(), p0.Merging(), z6.VnH());

    __ movprfx(z19.VnS(), p4.Merging(), z12.VnS());
    __ ucvtf(z19.VnH(), p4.Merging(), z19.VnS());

    __ movprfx(z0.VnD(), p5.Zeroing(), z12.VnD());
    __ ucvtf(z0.VnH(), p5.Merging(), z0.VnD());

    // FP matrix multiply (F32MM/F64MM) instructions, aliasing one source.
    __ movprfx(z30, z5);
    __ fmmla(z30.VnS(), z30.VnS(), z18.VnS());

    __ movprfx(z31, z5);
    __ fmmla(z31.VnD(), z31.VnD(), z18.VnD());
  }
  assm.FinalizeCode();

  CheckAndMaybeDisassembleMovprfxPairs(assm.GetBuffer(), false);
}
607
TEST(movprfx_negative_instructions) {
  // Test that CanTakeSVEMovprfx() rejects instructions (or instruction forms)
  // that can never be prefixed by movprfx, even when their operands would
  // otherwise be acceptable. All pairs below are expected to fail (the `false`
  // argument at the end).
  Assembler assm;
  assm.GetCPUFeatures()->Combine(CPUFeatures::kSVE);
  {
    // We have to use the Assembler directly to generate movprfx, so we need
    // to manually reserve space for the code we're about to emit.
    static const size_t kPairCount = 13;
    CodeBufferCheckScope guard(&assm, kPairCount * 2 * kInstructionSize);

    // Unpredicated add is not a destructive operation, so cannot be prefixed.
    __ movprfx(z26, z11);
    __ add(z26.VnB(), z11.VnB(), z4.VnB());

    // The merging form can take movprfx, but the zeroing form cannot.
    __ movprfx(z29.VnB(), p3.Zeroing(), z7.VnB());
    __ cpy(z29.VnB(), p3.Zeroing(), -42);

    // Frecpx can take movprfx, but frecpe and frecps cannot.
    __ movprfx(z13, z15);
    __ frecpe(z13.VnD(), z26.VnD());

    __ movprfx(z19, z1);
    __ frecps(z19.VnD(), z1.VnD(), z12.VnD());

    __ movprfx(z6, z12);
    __ frsqrte(z6.VnS(), z12.VnS());

    __ movprfx(z29, z5);
    __ frsqrts(z29.VnH(), z5.VnH(), z20.VnH());

    // Ftmad can take movprfx, but ftsmul and ftssel cannot.
    __ movprfx(z1, z31);
    __ ftsmul(z1.VnD(), z31.VnD(), z16.VnD());

    __ movprfx(z8, z27);
    __ ftssel(z8.VnH(), z27.VnH(), z1.VnH());

    // This looks like a merging unary operation, but it's actually an alias of
    // sel, which isn't destructive.
    __ movprfx(z0, z18);
    __ mov(z0.VnS(), p6.Merging(), z18.VnS());

    // The merging form can take movprfx, but the zeroing form cannot.
    __ movprfx(z12.VnS(), p2.Merging(), z11.VnS());
    __ mov(z12.VnS(), p2.Zeroing(), -42);

    __ movprfx(z13, z6);
    __ movprfx(z13, z2);

    // Movprfx can never prefix itself.
    __ movprfx(z3.VnD(), p5.Zeroing(), z8.VnD());
    __ movprfx(z3.VnD(), p5.Merging(), z8.VnD());

    __ movprfx(z1.VnD(), p3.Zeroing(), z14.VnD());
    __ movprfx(z1.VnD(), p3.Zeroing(), z18.VnD());
  }
  assm.FinalizeCode();

  CheckAndMaybeDisassembleMovprfxPairs(assm.GetBuffer(), false);
}
667
TEST(movprfx_negative_lane_size) {
  // Test that CanTakeSVEMovprfx() checks that the (predicated) movprfx lane
  // size is compatible with the prefixed instruction.
  //
  // In each pair, the movprfx lane size (or the implied size of an
  // unpredicated movprfx destination) disagrees with the lane size of the
  // prefixed instruction, so every pair is expected to fail (the `false`
  // argument at the end).
  Assembler assm;
  assm.GetCPUFeatures()->Combine(CPUFeatures::kSVE);
  {
    // We have to use the Assembler directly to generate movprfx, so we need
    // to manually reserve space for the code we're about to emit.
    static const size_t kPairCount = 63;
    CodeBufferCheckScope guard(&assm, kPairCount * 2 * kInstructionSize);

    __ movprfx(z0.VnH(), p2.Zeroing(), z17.VnH());
    __ abs(z0.VnS(), p2.Merging(), z17.VnS());

    __ movprfx(z10.VnD(), p0.Zeroing(), z4.VnD());
    __ add(z10.VnS(), p0.Merging(), z10.VnS(), z2.VnS());

    __ movprfx(z25.VnS(), p4.Zeroing(), z26.VnS());
    __ and_(z25.VnB(), p4.Merging(), z25.VnB(), z27.VnB());

    __ movprfx(z26.VnD(), p5.Merging(), z23.VnD());
    __ asr(z26.VnB(), p5.Merging(), z26.VnB(), 3);

    __ movprfx(z25.VnS(), p7.Zeroing(), z14.VnS());
    __ asr(z25.VnH(), p7.Merging(), z25.VnH(), z14.VnH());

    __ movprfx(z12.VnS(), p7.Zeroing(), z23.VnS());
    __ asr(z12.VnH(), p7.Merging(), z12.VnH(), z23.VnD());

    __ movprfx(z3.VnH(), p4.Zeroing(), z18.VnH());
    __ asr(z3.VnD(), p4.Merging(), z3.VnD(), z15.VnD());

    __ movprfx(z29.VnH(), p4.Merging(), z31.VnH());
    __ asrd(z29.VnB(), p4.Merging(), z29.VnB(), 3);

    __ movprfx(z31.VnH(), p5.Zeroing(), z14.VnH());
    __ asrr(z31.VnB(), p5.Merging(), z31.VnB(), z5.VnB());

    __ movprfx(z0.VnS(), p6.Zeroing(), z18.VnS());
    __ bic(z0.VnB(), p6.Merging(), z0.VnB(), z23.VnB());

    __ movprfx(z19.VnH(), p2.Zeroing(), z24.VnH());
    __ cls(z19.VnB(), p2.Merging(), z24.VnB());

    __ movprfx(z14.VnS(), p5.Zeroing(), z4.VnS());
    __ clz(z14.VnD(), p5.Merging(), z10.VnD());

    __ movprfx(z0.VnD(), p5.Merging(), z2.VnD());
    __ cnot(z0.VnH(), p5.Merging(), z2.VnH());

    __ movprfx(z0.VnB(), p3.Zeroing(), z19.VnB());
    __ cnt(z0.VnH(), p3.Merging(), z8.VnH());

    // Immediate, general-register and FP-register forms of cpy.
    __ movprfx(z29.VnS(), p0.Merging(), z7.VnS());
    __ cpy(z29.VnD(), p0.Merging(), -42);

    __ movprfx(z13.VnB(), p2.Merging(), z31.VnB());
    __ cpy(z13.VnS(), p2.Merging(), w13);

    __ movprfx(z0.VnS(), p3.Merging(), z15.VnS());
    __ cpy(z0.VnH(), p3.Merging(), h0);

    __ movprfx(z2.VnD(), p6.Zeroing(), z26.VnD());
    __ eor(z2.VnB(), p6.Merging(), z2.VnB(), z26.VnB());

    __ movprfx(z7.VnS(), p7.Zeroing(), z30.VnS());
    __ lsl(z7.VnD(), p7.Merging(), z7.VnD(), 3);

    __ movprfx(z11.VnH(), p3.Merging(), z23.VnH());
    __ lsl(z11.VnB(), p3.Merging(), z11.VnB(), z21.VnB());

    __ movprfx(z31.VnS(), p7.Zeroing(), z21.VnS());
    __ lsl(z31.VnH(), p7.Merging(), z31.VnH(), z21.VnD());

    __ movprfx(z26.VnH(), p0.Merging(), z0.VnH());
    __ lsl(z26.VnD(), p0.Merging(), z26.VnD(), z24.VnD());

    __ movprfx(z1.VnS(), p2.Zeroing(), z6.VnS());
    __ lslr(z1.VnB(), p2.Merging(), z1.VnB(), z6.VnB());

    __ movprfx(z4.VnD(), p4.Zeroing(), z6.VnD());
    __ lsr(z4.VnH(), p4.Merging(), z4.VnH(), 3);

    __ movprfx(z27.VnH(), p0.Zeroing(), z29.VnH());
    __ lsr(z27.VnS(), p0.Merging(), z27.VnS(), z29.VnS());

    __ movprfx(z5.VnD(), p2.Zeroing(), z16.VnD());
    __ lsr(z5.VnH(), p2.Merging(), z5.VnH(), z2.VnD());

    __ movprfx(z27.VnB(), p4.Zeroing(), z5.VnB());
    __ lsr(z27.VnD(), p4.Merging(), z27.VnD(), z5.VnD());

    __ movprfx(z27.VnS(), p3.Merging(), z13.VnS());
    __ lsrr(z27.VnD(), p3.Merging(), z27.VnD(), z13.VnD());

    __ movprfx(z30.VnS(), p2.Zeroing(), z14.VnS());
    __ mad(z30.VnB(), p2.Merging(), z20.VnB(), z14.VnB());

    __ movprfx(z14.VnB(), p6.Merging(), z11.VnB());
    __ mla(z14.VnD(), p6.Merging(), z28.VnD(), z11.VnD());

    __ movprfx(z28.VnH(), p2.Zeroing(), z22.VnH());
    __ mls(z28.VnS(), p2.Merging(), z3.VnS(), z22.VnS());

    // Aliases of cpy.
    __ movprfx(z18.VnH(), p6.Zeroing(), z25.VnH());
    __ mov(z18.VnD(), p6.Merging(), -42);

    __ movprfx(z22.VnD(), p2.Zeroing(), z6.VnD());
    __ mov(z22.VnS(), p2.Merging(), w22);

    __ movprfx(z3.VnH(), p0.Zeroing(), z13.VnH());
    __ mov(z3.VnB(), p0.Merging(), b0);

    __ movprfx(z31.VnS(), p7.Zeroing(), z12.VnS());
    __ msb(z31.VnH(), p7.Merging(), z14.VnH(), z12.VnH());

    __ movprfx(z16.VnS(), p7.Zeroing(), z6.VnS());
    __ mul(z16.VnB(), p7.Merging(), z16.VnB(), z30.VnB());

    __ movprfx(z17.VnD(), p7.Merging(), z1.VnD());
    __ neg(z17.VnB(), p7.Merging(), z1.VnB());

    __ movprfx(z31.VnH(), p4.Zeroing(), z12.VnH());
    __ not_(z31.VnB(), p4.Merging(), z12.VnB());

    __ movprfx(z9.VnH(), p3.Zeroing(), z23.VnH());
    __ orr(z9.VnS(), p3.Merging(), z9.VnS(), z13.VnS());

    __ movprfx(z25.VnD(), p2.Zeroing(), z21.VnD());
    __ rbit(z25.VnS(), p2.Merging(), z21.VnS());

    __ movprfx(z26.VnH(), p3.Merging(), z13.VnH());
    __ revb(z26.VnD(), p3.Merging(), z13.VnD());

    __ movprfx(z8.VnH(), p5.Merging(), z20.VnH());
    __ revh(z8.VnS(), p5.Merging(), z0.VnS());

    __ movprfx(z22.VnH(), p6.Merging(), z15.VnH());
    __ revw(z22.VnD(), p6.Merging(), z10.VnD());

    __ movprfx(z1.VnD(), p3.Merging(), z15.VnD());
    __ sabd(z1.VnB(), p3.Merging(), z1.VnB(), z15.VnB());

    __ movprfx(z25.VnD(), p1.Zeroing(), z30.VnD());
    __ sdiv(z25.VnS(), p1.Merging(), z25.VnS(), z30.VnS());

    __ movprfx(z19.VnS(), p3.Zeroing(), z11.VnS());
    __ sdivr(z19.VnD(), p3.Merging(), z19.VnD(), z24.VnD());

    __ movprfx(z12.VnH(), p2.Merging(), z2.VnH());
    __ smax(z12.VnS(), p2.Merging(), z12.VnS(), z24.VnS());

    __ movprfx(z3.VnD(), p1.Merging(), z15.VnD());
    __ smin(z3.VnS(), p1.Merging(), z3.VnS(), z20.VnS());

    __ movprfx(z13.VnS(), p5.Merging(), z22.VnS());
    __ smulh(z13.VnB(), p5.Merging(), z13.VnB(), z27.VnB());

    __ movprfx(z11.VnH(), p5.Zeroing(), z25.VnH());
    __ sub(z11.VnB(), p5.Merging(), z11.VnB(), z7.VnB());

    __ movprfx(z3.VnB(), p6.Merging(), z13.VnB());
    __ subr(z3.VnS(), p6.Merging(), z3.VnS(), z13.VnS());

    __ movprfx(z26.VnH(), p5.Merging(), z1.VnH());
    __ sxtb(z26.VnS(), p5.Merging(), z17.VnS());

    __ movprfx(z11.VnB(), p7.Zeroing(), z26.VnB());
    __ sxth(z11.VnS(), p7.Merging(), z26.VnS());

    __ movprfx(z1.VnS(), p2.Merging(), z21.VnS());
    __ sxtw(z1.VnD(), p2.Merging(), z21.VnD());

    __ movprfx(z4.VnS(), p6.Zeroing(), z6.VnS());
    __ uabd(z4.VnH(), p6.Merging(), z4.VnH(), z6.VnH());

    __ movprfx(z26.VnB(), p2.Zeroing(), z11.VnB());
    __ udiv(z26.VnD(), p2.Merging(), z26.VnD(), z11.VnD());

    __ movprfx(z19.VnB(), p5.Merging(), z6.VnB());
    __ udivr(z19.VnS(), p5.Merging(), z19.VnS(), z9.VnS());

    __ movprfx(z16.VnB(), p4.Merging(), z6.VnB());
    __ umax(z16.VnH(), p4.Merging(), z16.VnH(), z6.VnH());

    __ movprfx(z1.VnD(), p0.Zeroing(), z4.VnD());
    __ umin(z1.VnS(), p0.Merging(), z1.VnS(), z28.VnS());

    __ movprfx(z25.VnD(), p7.Merging(), z4.VnD());
    __ umulh(z25.VnB(), p7.Merging(), z25.VnB(), z16.VnB());

    __ movprfx(z29.VnB(), p4.Merging(), z2.VnB());
    __ uxtb(z29.VnS(), p4.Merging(), z31.VnS());

    __ movprfx(z27.VnH(), p5.Merging(), z21.VnH());
    __ uxth(z27.VnD(), p5.Merging(), z1.VnD());

    __ movprfx(z29.VnB(), p2.Merging(), z7.VnB());
    __ uxtw(z29.VnD(), p2.Merging(), z7.VnD());
  }
  assm.FinalizeCode();

  CheckAndMaybeDisassembleMovprfxPairs(assm.GetBuffer(), false);
}
873
TEST(movprfx_negative_lane_size_fp)874 TEST(movprfx_negative_lane_size_fp) {
875 // Test that CanTakeSVEMovprfx() checks that the (predicated) movprfx lane
876 // size is compatible with the prefixed instruction.
877 Assembler assm;
878 assm.GetCPUFeatures()->Combine(CPUFeatures::kSVE);
879 {
880 // We have to use the Assembler directly to generate movprfx, so we need
881 // to manually reserve space for the code we're about to emit.
882 static const size_t kPairCount = 64;
883 CodeBufferCheckScope guard(&assm, kPairCount * 2 * kInstructionSize);
884
885 __ movprfx(z29.VnD(), p5.Zeroing(), z8.VnD());
886 __ fabd(z29.VnS(), p5.Merging(), z29.VnS(), z26.VnS());
887
888 __ movprfx(z9.VnB(), p0.Zeroing(), z1.VnB());
889 __ fabs(z9.VnS(), p0.Merging(), z15.VnS());
890
891 __ movprfx(z24.VnD(), p0.Zeroing(), z8.VnD());
892 __ fadd(z24.VnH(), p0.Merging(), z24.VnH(), 0.5);
893
894 __ movprfx(z24.VnB(), p1.Zeroing(), z27.VnB());
895 __ fadd(z24.VnH(), p1.Merging(), z24.VnH(), z27.VnH());
896
897 __ movprfx(z14.VnH(), p7.Merging(), z12.VnH());
898 __ fcadd(z14.VnD(), p7.Merging(), z14.VnD(), z12.VnD(), 90);
899
900 __ movprfx(z10.VnB(), p6.Merging(), z11.VnB());
901 __ fcpy(z10.VnH(), p6.Merging(), 1.25);
902
903 __ movprfx(z12.VnB(), p6.Merging(), z18.VnB());
904 __ fcvt(z12.VnD(), p6.Merging(), z18.VnH());
905
906 __ movprfx(z18.VnH(), p7.Zeroing(), z2.VnH());
907 __ fcvt(z18.VnD(), p7.Merging(), z0.VnS());
908
909 __ movprfx(z3.VnH(), p5.Merging(), z14.VnH());
910 __ fcvt(z3.VnS(), p5.Merging(), z21.VnD());
911
912 __ movprfx(z15.VnH(), p1.Zeroing(), z12.VnH());
913 __ fcvt(z15.VnH(), p1.Merging(), z12.VnD());
914
915 __ movprfx(z3.VnH(), p2.Merging(), z22.VnH());
916 __ fcvtzs(z3.VnD(), p2.Merging(), z7.VnH());
917
918 __ movprfx(z17.VnS(), p3.Merging(), z14.VnS());
919 __ fcvtzs(z17.VnD(), p3.Merging(), z14.VnD());
920
921 __ movprfx(z2.VnH(), p1.Zeroing(), z16.VnH());
922 __ fcvtzs(z2.VnS(), p1.Merging(), z31.VnH());
923
924 __ movprfx(z13.VnB(), p2.Merging(), z9.VnB());
925 __ fcvtzs(z13.VnS(), p2.Merging(), z23.VnD());
926
927 __ movprfx(z19.VnB(), p1.Merging(), z4.VnB());
928 __ fcvtzu(z19.VnD(), p1.Merging(), z14.VnH());
929
930 __ movprfx(z29.VnS(), p2.Merging(), z19.VnS());
931 __ fcvtzu(z29.VnD(), p2.Merging(), z19.VnD());
932
933 __ movprfx(z21.VnS(), p4.Zeroing(), z17.VnS());
934 __ fcvtzu(z21.VnD(), p4.Merging(), z17.VnS());
935
936 __ movprfx(z19.VnH(), p4.Zeroing(), z30.VnH());
937 __ fcvtzu(z19.VnS(), p4.Merging(), z16.VnD());
938
939 __ movprfx(z10.VnS(), p7.Zeroing(), z27.VnS());
940 __ fdiv(z10.VnH(), p7.Merging(), z10.VnH(), z27.VnH());
941
942 __ movprfx(z7.VnD(), p7.Zeroing(), z17.VnD());
943 __ fdivr(z7.VnH(), p7.Merging(), z7.VnH(), z28.VnH());
944
945 __ movprfx(z22.VnB(), p0.Merging(), z27.VnB());
946 __ fmad(z22.VnH(), p0.Merging(), z27.VnH(), z15.VnH());
947
948 __ movprfx(z14.VnD(), p1.Zeroing(), z11.VnD());
949 __ fmax(z14.VnS(), p1.Merging(), z14.VnS(), 0.0);
950
951 __ movprfx(z27.VnB(), p5.Merging(), z14.VnB());
952 __ fmax(z27.VnD(), p5.Merging(), z27.VnD(), z14.VnD());
953
954 __ movprfx(z31.VnH(), p7.Merging(), z24.VnH());
955 __ fmaxnm(z31.VnD(), p7.Merging(), z31.VnD(), 0.0);
956
957 __ movprfx(z11.VnD(), p7.Zeroing(), z25.VnD());
958 __ fmaxnm(z11.VnS(), p7.Merging(), z11.VnS(), z28.VnS());
959
960 __ movprfx(z31.VnD(), p6.Merging(), z19.VnD());
961 __ fmin(z31.VnH(), p6.Merging(), z31.VnH(), 0.0);
962
963 __ movprfx(z20.VnS(), p3.Zeroing(), z15.VnS());
964 __ fmin(z20.VnH(), p3.Merging(), z20.VnH(), z8.VnH());
965
966 __ movprfx(z6.VnS(), p0.Merging(), z30.VnS());
967 __ fminnm(z6.VnH(), p0.Merging(), z6.VnH(), 0.0);
968
969 __ movprfx(z1.VnH(), p1.Zeroing(), z14.VnH());
970 __ fminnm(z1.VnS(), p1.Merging(), z1.VnS(), z14.VnS());
971
972 __ movprfx(z13.VnB(), p3.Zeroing(), z21.VnB());
973 __ fmla(z13.VnD(), p3.Merging(), z12.VnD(), z21.VnD());
974
975 __ movprfx(z15.VnS(), p1.Zeroing(), z20.VnS());
976 __ fmls(z15.VnH(), p1.Merging(), z28.VnH(), z20.VnH());
977
978 __ movprfx(z19.VnD(), p3.Zeroing(), z31.VnD());
979 __ fmov(z19.VnH(), p3.Merging(), 0.0);
980
981 __ movprfx(z16.VnS(), p7.Merging(), z30.VnS());
982 __ fmov(z16.VnH(), p7.Merging(), 2.5);
983
984 __ movprfx(z21.VnB(), p1.Merging(), z28.VnB());
985 __ fmsb(z21.VnH(), p1.Merging(), z30.VnH(), z28.VnH());
986
987 __ movprfx(z21.VnS(), p1.Zeroing(), z19.VnS());
988 __ fmul(z21.VnH(), p1.Merging(), z21.VnH(), 2.0);
989
990 __ movprfx(z28.VnB(), p7.Zeroing(), z8.VnB());
991 __ fmul(z28.VnS(), p7.Merging(), z28.VnS(), z26.VnS());
992
993 __ movprfx(z2.VnB(), p4.Merging(), z31.VnB());
994 __ fmulx(z2.VnH(), p4.Merging(), z2.VnH(), z31.VnH());
995
996 __ movprfx(z6.VnB(), p2.Zeroing(), z0.VnB());
997 __ fneg(z6.VnS(), p2.Merging(), z28.VnS());
998
999 __ movprfx(z26.VnB(), p0.Zeroing(), z21.VnB());
1000 __ fnmad(z26.VnH(), p0.Merging(), z21.VnH(), z18.VnH());
1001
1002 __ movprfx(z15.VnB(), p1.Zeroing(), z26.VnB());
1003 __ fnmla(z15.VnH(), p1.Merging(), z26.VnH(), z18.VnH());
1004
1005 __ movprfx(z16.VnS(), p0.Merging(), z1.VnS());
1006 __ fnmls(z16.VnD(), p0.Merging(), z1.VnD(), z13.VnD());
1007
1008 __ movprfx(z4.VnH(), p0.Zeroing(), z16.VnH());
1009 __ fnmsb(z4.VnS(), p0.Merging(), z30.VnS(), z3.VnS());
1010
1011 // Note that frecpe and frecps _cannot_ take movprfx.
1012 __ movprfx(z9.VnH(), p0.Zeroing(), z21.VnH());
1013 __ frecpx(z9.VnS(), p0.Merging(), z14.VnS());
1014
1015 __ movprfx(z6.VnH(), p2.Zeroing(), z28.VnH());
1016 __ frinta(z6.VnD(), p2.Merging(), z28.VnD());
1017
1018 __ movprfx(z12.VnS(), p4.Zeroing(), z7.VnS());
1019 __ frinti(z12.VnH(), p4.Merging(), z7.VnH());
1020
1021 __ movprfx(z6.VnB(), p5.Merging(), z20.VnB());
1022 __ frintm(z6.VnD(), p5.Merging(), z20.VnD());
1023
1024 __ movprfx(z7.VnB(), p6.Merging(), z19.VnB());
1025 __ frintn(z7.VnH(), p6.Merging(), z11.VnH());
1026
1027 __ movprfx(z12.VnD(), p2.Merging(), z31.VnD());
1028 __ frintp(z12.VnS(), p2.Merging(), z31.VnS());
1029
1030 __ movprfx(z1.VnS(), p5.Merging(), z10.VnS());
1031 __ frintx(z1.VnD(), p5.Merging(), z0.VnD());
1032
1033 __ movprfx(z6.VnH(), p0.Merging(), z12.VnH());
1034 __ frintz(z6.VnS(), p0.Merging(), z7.VnS());
1035
1036 __ movprfx(z8.VnH(), p2.Merging(), z6.VnH());
1037 __ fscale(z8.VnD(), p2.Merging(), z8.VnD(), z6.VnD());
1038
1039 __ movprfx(z20.VnH(), p2.Zeroing(), z2.VnH());
1040 __ fsqrt(z20.VnD(), p2.Merging(), z15.VnD());
1041
1042 __ movprfx(z28.VnS(), p6.Zeroing(), z19.VnS());
1043 __ fsub(z28.VnD(), p6.Merging(), z28.VnD(), 1.0);
1044
1045 __ movprfx(z6.VnB(), p0.Zeroing(), z12.VnB());
1046 __ fsub(z6.VnD(), p0.Merging(), z6.VnD(), z20.VnD());
1047
1048 __ movprfx(z6.VnS(), p7.Zeroing(), z11.VnS());
1049 __ fsubr(z6.VnH(), p7.Merging(), z6.VnH(), 1.0);
1050
1051 __ movprfx(z28.VnB(), p3.Merging(), z10.VnB());
1052 __ fsubr(z28.VnS(), p3.Merging(), z28.VnS(), z9.VnS());
1053
1054 __ movprfx(z22.VnB(), p3.Zeroing(), z14.VnB());
1055 __ scvtf(z22.VnD(), p3.Merging(), z24.VnS());
1056
1057 __ movprfx(z20.VnS(), p2.Merging(), z9.VnS());
1058 __ scvtf(z20.VnH(), p2.Merging(), z9.VnH());
1059
1060 __ movprfx(z19.VnH(), p1.Merging(), z21.VnH());
1061 __ scvtf(z19.VnS(), p1.Merging(), z6.VnD());
1062
1063 __ movprfx(z31.VnS(), p3.Merging(), z22.VnS());
1064 __ scvtf(z31.VnH(), p3.Merging(), z22.VnD());
1065
1066 __ movprfx(z8.VnS(), p3.Merging(), z3.VnS());
1067 __ ucvtf(z8.VnD(), p3.Merging(), z1.VnS());
1068
1069 __ movprfx(z0.VnB(), p0.Merging(), z23.VnB());
1070 __ ucvtf(z0.VnH(), p0.Merging(), z12.VnH());
1071
1072 __ movprfx(z8.VnH(), p3.Zeroing(), z4.VnH());
1073 __ ucvtf(z8.VnH(), p3.Merging(), z4.VnS());
1074
1075 __ movprfx(z20.VnH(), p2.Zeroing(), z10.VnH());
1076 __ ucvtf(z20.VnH(), p2.Merging(), z11.VnD());
1077 }
1078 assm.FinalizeCode();
1079
1080 CheckAndMaybeDisassembleMovprfxPairs(assm.GetBuffer(), false);
1081 }
1082
TEST(movprfx_negative_predication)1083 TEST(movprfx_negative_predication) {
1084   // Test that CanTakeSVEMovprfx() is false when a predicated movprfx appears
1085   // before an unpredicated instruction.
// Each pair below is a predicated movprfx followed by an *unpredicated* form of
// an instruction; the final check is called with `false`, i.e. every pair is
// expected to be rejected as an invalid movprfx combination.
1086   Assembler assm;
1087   assm.GetCPUFeatures()->Combine(CPUFeatures::kSVE, CPUFeatures::kSVEI8MM);
1088   {
1089     // We have to use the Assembler directly to generate movprfx, so we need
1090     // to manually reserve space for the code we're about to emit.
// kPairCount must match the number of movprfx/instruction pairs emitted below
// (60), since the scope reserves exactly 2 instructions per pair.
1091     static const size_t kPairCount = 60;
1092     CodeBufferCheckScope guard(&assm, kPairCount * 2 * kInstructionSize);
1093
1094     __ movprfx(z27.VnS(), p1.Zeroing(), z12.VnS());
1095     __ add(z27.VnS(), z27.VnS(), 42);
1096
1097     __ movprfx(z31.VnS(), p6.Zeroing(), z1.VnS());
1098     __ and_(z31.VnS(), z31.VnS(), 4);
1099
1100     __ movprfx(z27.VnS(), p5.Merging(), z24.VnS());
1101     __ bic(z27.VnS(), z27.VnS(), 4);
1102
1103     __ movprfx(z6.VnH(), p7.Merging(), z30.VnH());
1104     __ clasta(z6.VnH(), p7, z6.VnH(), z14.VnH());
1105
1106     __ movprfx(z11.VnB(), p6.Merging(), z5.VnB());
1107     __ clastb(z11.VnB(), p6, z11.VnB(), z29.VnB());
1108
1109     __ movprfx(z5.VnD(), p0.Merging(), z1.VnD());
1110     __ decd(z5.VnD(), SVE_MUL3);
1111
1112     __ movprfx(z11.VnH(), p7.Zeroing(), z28.VnH());
1113     __ dech(z11.VnH(), SVE_VL2);
1114
1115     __ movprfx(z14.VnS(), p5.Zeroing(), z6.VnS());
1116     __ decp(z14.VnS(), p5);
1117
1118     __ movprfx(z6.VnS(), p5.Merging(), z10.VnS());
1119     __ decw(z6.VnS(), SVE_ALL);
1120
1121     __ movprfx(z27.VnH(), p7.Zeroing(), z9.VnH());
1122     __ eon(z27.VnH(), z27.VnH(), 4);
1123
1124     __ movprfx(z3.VnS(), p3.Zeroing(), z2.VnS());
1125     __ eor(z3.VnS(), z3.VnS(), 4);
1126
1127     __ movprfx(z30.VnB(), p2.Zeroing(), z25.VnB());
1128     __ ext(z30.VnB(), z30.VnB(), z25.VnB(), 42);
1129
1130     __ movprfx(z22.VnD(), p0.Merging(), z0.VnD());
1131     __ incd(z22.VnD(), SVE_MUL3);
1132
1133     __ movprfx(z7.VnH(), p3.Merging(), z3.VnH());
1134     __ inch(z7.VnH(), SVE_VL2);
1135
1136     __ movprfx(z9.VnD(), p1.Zeroing(), z28.VnD());
1137     __ incp(z9.VnD(), p1);
1138
1139     __ movprfx(z30.VnS(), p3.Merging(), z4.VnS());
1140     __ incw(z30.VnS(), SVE_ALL);
1141
1142     __ movprfx(z30.VnB(), p7.Zeroing(), z21.VnB());
1143     __ insr(z30.VnB(), w30);
1144
1145     __ movprfx(z2.VnB(), p4.Zeroing(), z26.VnB());
1146     __ insr(z2.VnB(), b0);
1147
1148     __ movprfx(z27.VnS(), p5.Zeroing(), z5.VnS());
1149     __ mul(z27.VnS(), z27.VnS(), 42);
1150
1151     __ movprfx(z5.VnS(), p0.Merging(), z26.VnS());
1152     __ orn(z5.VnS(), z5.VnS(), 4);
1153
// NOTE(review): this pair is byte-identical to the previous orn pair —
// presumably the second occurrence was meant to exercise orr instead;
// verify against the intended instruction list.
1154     __ movprfx(z5.VnS(), p0.Merging(), z26.VnS());
1155     __ orn(z5.VnS(), z5.VnS(), 4);
1156
1157     __ movprfx(z16.VnD(), p1.Merging(), z13.VnD());
1158     __ sdot(z16.VnD(), z11.VnH(), z7.VnH(), 1);
1159
1160     __ movprfx(z27.VnD(), p5.Merging(), z18.VnD());
1161     __ sdot(z27.VnD(), z18.VnH(), z0.VnH());
1162
1163     __ movprfx(z20.VnS(), p6.Merging(), z1.VnS());
1164     __ sdot(z20.VnS(), z10.VnB(), z1.VnB(), 1);
1165
1166     __ movprfx(z19.VnD(), p0.Zeroing(), z7.VnD());
1167     __ smax(z19.VnD(), z19.VnD(), 42);
1168
1169     __ movprfx(z15.VnD(), p1.Zeroing(), z7.VnD());
1170     __ smin(z15.VnD(), z15.VnD(), 42);
1171
1172     __ movprfx(z15.VnB(), p5.Merging(), z3.VnB());
1173     __ splice(z15.VnB(), p5, z15.VnB(), z3.VnB());
1174
1175     __ movprfx(z5.VnB(), p6.Zeroing(), z4.VnB());
1176     __ sqadd(z5.VnB(), z5.VnB(), 42);
1177
1178     __ movprfx(z16.VnD(), p0.Zeroing(), z18.VnD());
1179     __ sqdecd(z16.VnD(), SVE_MUL3);
1180
1181     __ movprfx(z7.VnH(), p3.Merging(), z28.VnH());
1182     __ sqdech(z7.VnH(), SVE_VL2);
1183
1184     __ movprfx(z7.VnS(), p2.Merging(), z13.VnS());
1185     __ sqdecp(z7.VnS(), p2);
1186
1187     __ movprfx(z22.VnS(), p7.Zeroing(), z20.VnS());
1188     __ sqdecw(z22.VnS(), SVE_ALL);
1189
1190     __ movprfx(z26.VnD(), p1.Zeroing(), z0.VnD());
1191     __ sqincd(z26.VnD(), SVE_MUL3);
1192
1193     __ movprfx(z15.VnH(), p7.Zeroing(), z27.VnH());
1194     __ sqinch(z15.VnH(), SVE_VL2);
1195
1196     __ movprfx(z4.VnD(), p7.Merging(), z13.VnD());
1197     __ sqincp(z4.VnD(), p7);
1198
1199     __ movprfx(z29.VnS(), p6.Merging(), z14.VnS());
1200     __ sqincw(z29.VnS(), SVE_ALL);
1201
1202     __ movprfx(z17.VnB(), p1.Merging(), z24.VnB());
1203     __ sqsub(z17.VnB(), z17.VnB(), 42);
1204
1205     __ movprfx(z26.VnS(), p5.Zeroing(), z19.VnS());
1206     __ sub(z26.VnS(), z26.VnS(), 42);
1207
1208     __ movprfx(z15.VnD(), p1.Merging(), z3.VnD());
1209     __ subr(z15.VnD(), z15.VnD(), 42);
1210
1211     __ movprfx(z4.VnD(), p2.Zeroing(), z14.VnD());
1212     __ udot(z4.VnD(), z15.VnH(), z7.VnH(), 1);
1213
1214     __ movprfx(z29.VnD(), p4.Zeroing(), z28.VnD());
1215     __ udot(z29.VnD(), z2.VnH(), z17.VnH());
1216
1217     __ movprfx(z7.VnS(), p6.Merging(), z3.VnS());
1218     __ udot(z7.VnS(), z14.VnB(), z1.VnB(), 1);
1219
1220     __ movprfx(z14.VnB(), p3.Merging(), z5.VnB());
1221     __ umax(z14.VnB(), z14.VnB(), 42);
1222
1223     __ movprfx(z4.VnD(), p1.Zeroing(), z2.VnD());
1224     __ umin(z4.VnD(), z4.VnD(), 42);
1225
1226     __ movprfx(z19.VnB(), p0.Zeroing(), z27.VnB());
1227     __ uqadd(z19.VnB(), z19.VnB(), 42);
1228
1229     __ movprfx(z24.VnD(), p7.Zeroing(), z11.VnD());
1230     __ uqdecd(z24.VnD(), SVE_MUL3);
1231
1232     __ movprfx(z24.VnH(), p4.Zeroing(), z18.VnH());
1233     __ uqdech(z24.VnH(), SVE_VL2);
1234
1235     __ movprfx(z31.VnS(), p5.Zeroing(), z2.VnS());
1236     __ uqdecp(z31.VnS(), p5);
1237
1238     __ movprfx(z19.VnS(), p6.Merging(), z21.VnS());
1239     __ uqdecw(z19.VnS(), SVE_ALL);
1240
1241     __ movprfx(z27.VnD(), p0.Merging(), z21.VnD());
1242     __ uqincd(z27.VnD(), SVE_MUL3);
1243
1244     __ movprfx(z13.VnH(), p4.Zeroing(), z12.VnH());
1245     __ uqinch(z13.VnH(), SVE_VL2);
1246
1247     __ movprfx(z0.VnD(), p4.Zeroing(), z1.VnD());
1248     __ uqincp(z0.VnD(), p4);
1249
1250     __ movprfx(z12.VnS(), p4.Merging(), z21.VnS());
1251     __ uqincw(z12.VnS(), SVE_ALL);
1252
1253     __ movprfx(z9.VnD(), p0.Zeroing(), z16.VnD());
1254     __ uqsub(z9.VnD(), z9.VnD(), 42);
1255
// The remaining pairs cover the unpredicated SVE I8MM matrix/dot instructions
// (hence kSVEI8MM in the feature set above).
1256     __ movprfx(z22.VnS(), p0.Zeroing(), z5.VnS());
1257     __ smmla(z22.VnS(), z21.VnB(), z0.VnB());
1258
1259     __ movprfx(z1.VnS(), p0.Zeroing(), z5.VnS());
1260     __ ummla(z1.VnS(), z10.VnB(), z2.VnB());
1261
1262     __ movprfx(z30.VnS(), p0.Zeroing(), z5.VnS());
1263     __ usmmla(z30.VnS(), z29.VnB(), z18.VnB());
1264
1265     __ movprfx(z4.VnS(), p0.Zeroing(), z5.VnS());
1266     __ usdot(z4.VnS(), z3.VnB(), z4.VnB());
1267
1268     __ movprfx(z10.VnS(), p0.Zeroing(), z5.VnS());
1269     __ usdot(z10.VnS(), z10.VnB(), z0.VnB(), 0);
1270
1271     __ movprfx(z1.VnS(), p0.Zeroing(), z5.VnS());
1272     __ sudot(z1.VnS(), z10.VnB(), z1.VnB(), 1);
1273   }
1274   assm.FinalizeCode();
1275
// `false`: none of the emitted pairs may be accepted as a valid movprfx
// combination.
1276   CheckAndMaybeDisassembleMovprfxPairs(assm.GetBuffer(), false);
1277 }
1278
TEST(movprfx_negative_predication_fp)1279 TEST(movprfx_negative_predication_fp) {
1280   // Test that CanTakeSVEMovprfx() is false when a predicated movprfx appears
1281   // before an unpredicated instruction.
// Floating-point counterpart of movprfx_negative_predication: every pair is a
// predicated movprfx followed by an *unpredicated* FP instruction (indexed
// fcmla/fmla/fmls, ftmad, fmmla), and the checker is called with `false`.
1282   Assembler assm;
1283   assm.GetCPUFeatures()->Combine(CPUFeatures::kSVE,
1284                                  CPUFeatures::kSVEF32MM,
1285                                  CPUFeatures::kSVEF64MM);
1286   {
1287     // We have to use the Assembler directly to generate movprfx, so we need
1288     // to manually reserve space for the code we're about to emit.
// kPairCount must match the number of movprfx/instruction pairs below (11).
1289     static const size_t kPairCount = 11;
1290     CodeBufferCheckScope guard(&assm, kPairCount * 2 * kInstructionSize);
1291
1292     __ movprfx(z10.VnH(), p3.Zeroing(), z3.VnH());
1293     __ fcmla(z10.VnH(), z22.VnH(), z3.VnH(), 2, 180);
1294
1295     __ movprfx(z12.VnS(), p4.Merging(), z14.VnS());
1296     __ fcmla(z12.VnS(), z3.VnS(), z10.VnS(), 1, 270);
1297
1298     __ movprfx(z16.VnD(), p3.Zeroing(), z24.VnD());
1299     __ fmla(z16.VnD(), z24.VnD(), z8.VnD(), 1);
1300
1301     __ movprfx(z9.VnH(), p7.Zeroing(), z0.VnH());
1302     __ fmla(z9.VnH(), z8.VnH(), z0.VnH(), 7);
1303
1304     __ movprfx(z23.VnS(), p5.Merging(), z5.VnS());
1305     __ fmla(z23.VnS(), z7.VnS(), z5.VnS(), 3);
1306
1307     __ movprfx(z19.VnD(), p6.Zeroing(), z8.VnD());
1308     __ fmls(z19.VnD(), z27.VnD(), z13.VnD(), 1);
1309
1310     __ movprfx(z25.VnH(), p7.Merging(), z24.VnH());
1311     __ fmls(z25.VnH(), z24.VnH(), z4.VnH(), 4);
1312
1313     __ movprfx(z2.VnS(), p1.Zeroing(), z0.VnS());
1314     __ fmls(z2.VnS(), z9.VnS(), z0.VnS(), 3);
1315
1316     // Note that ftsmul and ftssel cannot take movprfx.
1317     __ movprfx(z22.VnD(), p6.Merging(), z16.VnD());
1318     __ ftmad(z22.VnD(), z22.VnD(), z20.VnD(), 2);
1319
// fmmla requires the F32MM/F64MM features enabled above.
1320     __ movprfx(z30.VnS(), p0.Zeroing(), z5.VnS());
1321     __ fmmla(z30.VnS(), z29.VnS(), z18.VnS());
1322
1323     __ movprfx(z31.VnD(), p1.Merging(), z5.VnD());
1324     __ fmmla(z31.VnD(), z30.VnD(), z18.VnD());
1325   }
1326   assm.FinalizeCode();
1327
// `false`: none of the emitted pairs may be accepted as a valid movprfx
// combination.
1328   CheckAndMaybeDisassembleMovprfxPairs(assm.GetBuffer(), false);
1329 }
1330
TEST(movprfx_positive)1331 TEST(movprfx_positive) {
// Test that CanTakeSVEMovprfx() accepts valid movprfx/instruction pairs: the
// checker below is called with `true`, so every pair emitted here must be
// recognised as a legal combination. Both unpredicated movprfx (`movprfx(zd,
// zn)`) and predicated movprfx with matching lane size/governing predicate
// are exercised.
1332   Assembler assm;
1333   assm.GetCPUFeatures()->Combine(CPUFeatures::kSVE, CPUFeatures::kSVEI8MM);
1334   {
1335     // We have to use the Assembler directly to generate movprfx, so we need
1336     // to manually reserve space for the code we're about to emit.
// kPairCount must match the number of movprfx/instruction pairs below.
1337     static const size_t kPairCount = 123;
1338     CodeBufferCheckScope guard(&assm, kPairCount * 2 * kInstructionSize);
1339
1340     __ movprfx(z17, z28);
1341     __ abs(z17.VnB(), p6.Merging(), z28.VnB());
1342
1343     __ movprfx(z9, z7);
1344     __ add(z9.VnB(), p5.Merging(), z9.VnB(), z29.VnB());
1345
1346     __ movprfx(z11, z0);
1347     __ add(z11.VnD(), z11.VnD(), 42);
1348
1349     __ movprfx(z8.VnS(), p3.Zeroing(), z28.VnS());
1350     __ and_(z8.VnS(), p3.Merging(), z8.VnS(), z31.VnS());
1351
1352     __ movprfx(z20, z23);
1353     __ and_(z20.VnS(), z20.VnS(), 4);
1354
1355     __ movprfx(z24.VnD(), p5.Merging(), z11.VnD());
1356     __ asr(z24.VnD(), p5.Merging(), z24.VnD(), 3);
1357
1358     __ movprfx(z1, z13);
1359     __ asr(z1.VnH(), p3.Merging(), z1.VnH(), z4.VnH());
1360
1361     __ movprfx(z0.VnB(), p7.Zeroing(), z28.VnB());
1362     __ asr(z0.VnB(), p7.Merging(), z0.VnB(), z28.VnD());
1363
1364     __ movprfx(z15, z5);
1365     __ asr(z15.VnD(), p3.Merging(), z15.VnD(), z5.VnD());
1366
1367     __ movprfx(z24.VnH(), p3.Merging(), z22.VnH());
1368     __ asrd(z24.VnH(), p3.Merging(), z24.VnH(), 3);
1369
1370     __ movprfx(z2.VnS(), p3.Zeroing(), z20.VnS());
1371     __ asrr(z2.VnS(), p3.Merging(), z2.VnS(), z15.VnS());
1372
1373     __ movprfx(z17.VnB(), p7.Merging(), z6.VnB());
1374     __ bic(z17.VnB(), p7.Merging(), z17.VnB(), z25.VnB());
1375
1376     __ movprfx(z31, z6);
1377     __ bic(z31.VnD(), z31.VnD(), 4);
1378
1379     __ movprfx(z20, z2);
1380     __ clasta(z20.VnB(), p4, z20.VnB(), z15.VnB());
1381
1382     __ movprfx(z27, z11);
1383     __ clastb(z27.VnB(), p5, z27.VnB(), z6.VnB());
1384
1385     __ movprfx(z3.VnS(), p7.Zeroing(), z17.VnS());
1386     __ cls(z3.VnS(), p7.Merging(), z0.VnS());
1387
1388     __ movprfx(z29.VnB(), p0.Zeroing(), z24.VnB());
1389     __ clz(z29.VnB(), p0.Merging(), z7.VnB());
1390
1391     __ movprfx(z2.VnH(), p7.Zeroing(), z29.VnH());
1392     __ cnot(z2.VnH(), p7.Merging(), z28.VnH());
1393
1394     __ movprfx(z23, z5);
1395     __ cnt(z23.VnH(), p0.Merging(), z12.VnH());
1396
1397     __ movprfx(z5, z3);
1398     __ cpy(z5.VnD(), p1.Merging(), -42);
1399
1400     __ movprfx(z0, z12);
1401     __ cpy(z0.VnB(), p1.Merging(), w0);
1402
1403     __ movprfx(z27, z8);
1404     __ cpy(z27.VnB(), p0.Merging(), b0);
1405
1406     __ movprfx(z20, z24);
1407     __ decd(z20.VnD(), SVE_MUL3);
1408
1409     __ movprfx(z5, z28);
1410     __ dech(z5.VnH(), SVE_VL2);
1411
1412     __ movprfx(z7, z3);
1413     __ decp(z7.VnD(), p2);
1414
1415     __ movprfx(z4, z7);
1416     __ decw(z4.VnS(), SVE_ALL);
1417
1418     __ movprfx(z3, z18);
1419     __ eon(z3.VnS(), z3.VnS(), 4);
1420
1421     __ movprfx(z4.VnD(), p0.Merging(), z10.VnD());
1422     __ eor(z4.VnD(), p0.Merging(), z4.VnD(), z10.VnD());
1423
1424     __ movprfx(z15, z18);
1425     __ eor(z15.VnH(), z15.VnH(), 4);
1426
1427     __ movprfx(z19, z28);
1428     __ incd(z19.VnD(), SVE_MUL3);
1429
1430     __ movprfx(z13, z7);
1431     __ inch(z13.VnH(), SVE_VL2);
1432
1433     __ movprfx(z14, z21);
1434     __ incp(z14.VnD(), p1);
1435
1436     __ movprfx(z26, z12);
1437     __ incw(z26.VnS(), SVE_ALL);
1438
1439     __ movprfx(z16, z2);
1440     __ insr(z16.VnB(), w16);
1441
1442     __ movprfx(z20, z26);
1443     __ insr(z20.VnB(), b0);
1444
1445     __ movprfx(z30.VnD(), p0.Merging(), z23.VnD());
1446     __ lsl(z30.VnD(), p0.Merging(), z30.VnD(), 3);
1447
1448     __ movprfx(z28.VnS(), p2.Zeroing(), z6.VnS());
1449     __ lsl(z28.VnS(), p2.Merging(), z28.VnS(), z6.VnS());
1450
1451     __ movprfx(z15.VnH(), p6.Zeroing(), z3.VnH());
1452     __ lsl(z15.VnH(), p6.Merging(), z15.VnH(), z3.VnD());
1453
1454     __ movprfx(z13.VnD(), p4.Zeroing(), z14.VnD());
1455     __ lsl(z13.VnD(), p4.Merging(), z13.VnD(), z25.VnD());
1456
1457     __ movprfx(z14, z5);
1458     __ lslr(z14.VnS(), p0.Merging(), z14.VnS(), z17.VnS());
1459
1460     __ movprfx(z21, z1);
1461     __ lsr(z21.VnH(), p5.Merging(), z21.VnH(), 3);
1462
1463     __ movprfx(z11.VnH(), p0.Zeroing(), z13.VnH());
1464     __ lsr(z11.VnH(), p0.Merging(), z11.VnH(), z9.VnH());
1465
1466     __ movprfx(z24, z29);
1467     __ lsr(z24.VnS(), p4.Merging(), z24.VnS(), z1.VnD());
1468
1469     __ movprfx(z1.VnD(), p6.Merging(), z9.VnD());
1470     __ lsr(z1.VnD(), p6.Merging(), z1.VnD(), z9.VnD());
1471
1472     __ movprfx(z22, z3);
1473     __ lsrr(z22.VnB(), p3.Merging(), z22.VnB(), z3.VnB());
1474
1475     __ movprfx(z24.VnB(), p2.Zeroing(), z5.VnB());
1476     __ mad(z24.VnB(), p2.Merging(), z5.VnB(), z10.VnB());
1477
1478     __ movprfx(z8, z4);
1479     __ mla(z8.VnS(), p6.Merging(), z4.VnS(), z26.VnS());
1480
1481     __ movprfx(z10, z8);
1482     __ mls(z10.VnS(), p4.Merging(), z23.VnS(), z16.VnS());
1483
1484     // Aliases of cpy.
1485     __ movprfx(z4.VnH(), p5.Zeroing(), z2.VnH());
1486     __ mov(z4.VnH(), p5.Merging(), -42);
1487
1488     __ movprfx(z2.VnB(), p3.Zeroing(), z24.VnB());
1489     __ mov(z2.VnB(), p3.Merging(), w2);
1490
1491     __ movprfx(z27, z13);
1492     __ mov(z27.VnD(), p3.Merging(), d0);
1493
1494     __ movprfx(z18.VnB(), p5.Zeroing(), z11.VnB());
1495     __ msb(z18.VnB(), p5.Merging(), z3.VnB(), z11.VnB());
1496
1497     __ movprfx(z29, z16);
1498     __ mul(z29.VnS(), p6.Merging(), z29.VnS(), z9.VnS());
1499
1500     __ movprfx(z21, z23);
1501     __ mul(z21.VnH(), z21.VnH(), 42);
1502
1503     __ movprfx(z7.VnS(), p4.Merging(), z14.VnS());
1504     __ neg(z7.VnS(), p4.Merging(), z14.VnS());
1505
1506     __ movprfx(z8.VnD(), p4.Zeroing(), z5.VnD());
1507     __ not_(z8.VnD(), p4.Merging(), z5.VnD());
1508
1509     __ movprfx(z14, z13);
1510     __ orn(z14.VnS(), z14.VnS(), 4);
1511
// NOTE(review): this pair is byte-identical to the previous orn pair —
// presumably the second occurrence was meant to exercise orr instead;
// verify against the intended instruction list.
1512     __ movprfx(z14, z13);
1513     __ orn(z14.VnS(), z14.VnS(), 4);
1514
1515     __ movprfx(z27, z17);
1516     __ orr(z27.VnD(), p2.Merging(), z27.VnD(), z17.VnD());
1517
1518     __ movprfx(z13.VnH(), p2.Zeroing(), z27.VnH());
1519     __ rbit(z13.VnH(), p2.Merging(), z1.VnH());
1520
1521     __ movprfx(z1, z29);
1522     __ revb(z1.VnS(), p4.Merging(), z6.VnS());
1523
1524     __ movprfx(z18.VnD(), p2.Zeroing(), z10.VnD());
1525     __ revh(z18.VnD(), p2.Merging(), z16.VnD());
1526
1527     __ movprfx(z2.VnD(), p1.Merging(), z10.VnD());
1528     __ revw(z2.VnD(), p1.Merging(), z1.VnD());
1529
1530     __ movprfx(z28.VnS(), p7.Merging(), z11.VnS());
1531     __ sabd(z28.VnS(), p7.Merging(), z28.VnS(), z11.VnS());
1532
1533     __ movprfx(z22.VnS(), p0.Merging(), z20.VnS());
1534     __ sdiv(z22.VnS(), p0.Merging(), z22.VnS(), z6.VnS());
1535
1536     __ movprfx(z13.VnS(), p7.Merging(), z0.VnS());
1537     __ sdivr(z13.VnS(), p7.Merging(), z13.VnS(), z2.VnS());
1538
1539     __ movprfx(z0, z12);
1540     __ sdot(z0.VnD(), z10.VnH(), z12.VnH(), 1);
1541
1542     __ movprfx(z8, z15);
1543     __ sdot(z8.VnS(), z15.VnB(), z12.VnB());
1544
1545     __ movprfx(z13, z0);
1546     __ sdot(z13.VnS(), z10.VnB(), z0.VnB(), 1);
1547
1548     __ movprfx(z11, z13);
1549     __ smax(z11.VnB(), p5.Merging(), z11.VnB(), z24.VnB());
1550
1551     __ movprfx(z3, z17);
1552     __ smax(z3.VnD(), z3.VnD(), 42);
1553
1554     __ movprfx(z10, z29);
1555     __ smin(z10.VnD(), p4.Merging(), z10.VnD(), z29.VnD());
1556
1557     __ movprfx(z13, z29);
1558     __ smin(z13.VnD(), z13.VnD(), 42);
1559
1560     __ movprfx(z6, z17);
1561     __ smulh(z6.VnS(), p7.Merging(), z6.VnS(), z31.VnS());
1562
1563     __ movprfx(z19, z20);
1564     __ splice(z19.VnB(), p3, z19.VnB(), z20.VnB());
1565
1566     __ movprfx(z0, z3);
1567     __ sqadd(z0.VnD(), z0.VnD(), 42);
1568
1569     __ movprfx(z29, z5);
1570     __ sqdecd(z29.VnD(), SVE_MUL3);
1571
1572     __ movprfx(z25, z11);
1573     __ sqdech(z25.VnH(), SVE_VL2);
1574
1575     __ movprfx(z16, z9);
1576     __ sqdecp(z16.VnS(), p1);
1577
1578     __ movprfx(z8, z17);
1579     __ sqdecw(z8.VnS(), SVE_ALL);
1580
1581     __ movprfx(z4, z5);
1582     __ sqincd(z4.VnD(), SVE_MUL3);
1583
1584     __ movprfx(z0, z17);
1585     __ sqinch(z0.VnH(), SVE_VL2);
1586
1587     __ movprfx(z7, z27);
1588     __ sqincp(z7.VnS(), p6);
1589
1590     __ movprfx(z10, z9);
1591     __ sqincw(z10.VnS(), SVE_ALL);
1592
1593     __ movprfx(z31, z22);
1594     __ sqsub(z31.VnB(), z31.VnB(), 42);
1595
1596     __ movprfx(z12.VnH(), p7.Zeroing(), z23.VnH());
1597     __ sub(z12.VnH(), p7.Merging(), z12.VnH(), z23.VnH());
1598
1599     __ movprfx(z10, z1);
1600     __ sub(z10.VnH(), z10.VnH(), 42);
1601
1602     __ movprfx(z15.VnB(), p0.Merging(), z0.VnB());
1603     __ subr(z15.VnB(), p0.Merging(), z15.VnB(), z0.VnB());
1604
1605     __ movprfx(z17, z2);
1606     __ subr(z17.VnH(), z17.VnH(), 42);
1607
1608     __ movprfx(z5, z3);
1609     __ sxtb(z5.VnD(), p6.Merging(), z20.VnD());
1610
1611     __ movprfx(z11, z17);
1612     __ sxth(z11.VnD(), p6.Merging(), z25.VnD());
1613
1614     __ movprfx(z26, z4);
1615     __ sxtw(z26.VnD(), p5.Merging(), z4.VnD());
1616
1617     __ movprfx(z15.VnD(), p0.Zeroing(), z8.VnD());
1618     __ uabd(z15.VnD(), p0.Merging(), z15.VnD(), z20.VnD());
1619
1620     __ movprfx(z21, z24);
1621     __ udiv(z21.VnD(), p3.Merging(), z21.VnD(), z24.VnD());
1622
1623     __ movprfx(z22, z10);
1624     __ udivr(z22.VnD(), p7.Merging(), z22.VnD(), z27.VnD());
1625
1626     __ movprfx(z27, z25);
1627     __ udot(z27.VnD(), z29.VnH(), z3.VnH(), 1);
1628
1629     __ movprfx(z29, z10);
1630     __ udot(z29.VnS(), z10.VnB(), z21.VnB());
1631
1632     __ movprfx(z18, z0);
1633     __ udot(z18.VnS(), z14.VnB(), z0.VnB(), 1);
1634
1635     __ movprfx(z6, z30);
1636     __ umax(z6.VnS(), p2.Merging(), z6.VnS(), z27.VnS());
1637
1638     __ movprfx(z31, z17);
1639     __ umax(z31.VnD(), z31.VnD(), 42);
1640
1641     __ movprfx(z27.VnS(), p0.Merging(), z20.VnS());
1642     __ umin(z27.VnS(), p0.Merging(), z27.VnS(), z8.VnS());
1643
1644     __ movprfx(z0, z11);
1645     __ umin(z0.VnH(), z0.VnH(), 42);
1646
1647     __ movprfx(z21, z17);
1648     __ umulh(z21.VnB(), p0.Merging(), z21.VnB(), z30.VnB());
1649
1650     __ movprfx(z9, z24);
1651     __ uqadd(z9.VnD(), z9.VnD(), 42);
1652
1653     __ movprfx(z18, z13);
1654     __ uqdecd(z18.VnD(), SVE_MUL3);
1655
1656     __ movprfx(z20, z23);
1657     __ uqdech(z20.VnH(), SVE_VL2);
1658
1659     __ movprfx(z12, z29);
1660     __ uqdecp(z12.VnS(), p7);
1661
1662     __ movprfx(z24, z25);
1663     __ uqdecw(z24.VnS(), SVE_ALL);
1664
1665     __ movprfx(z13, z1);
1666     __ uqincd(z13.VnD(), SVE_MUL3);
1667
1668     __ movprfx(z5, z19);
1669     __ uqinch(z5.VnH(), SVE_VL2);
1670
1671     __ movprfx(z6, z25);
1672     __ uqincp(z6.VnS(), p5);
1673
1674     __ movprfx(z12, z14);
1675     __ uqincw(z12.VnS(), SVE_ALL);
1676
1677     __ movprfx(z13, z6);
1678     __ uqsub(z13.VnH(), z13.VnH(), 42);
1679
1680     __ movprfx(z31, z3);
1681     __ uxtb(z31.VnS(), p0.Merging(), z3.VnS());
1682
1683     __ movprfx(z18.VnD(), p4.Merging(), z25.VnD());
1684     __ uxth(z18.VnD(), p4.Merging(), z25.VnD());
1685
1686     __ movprfx(z18.VnD(), p7.Merging(), z25.VnD());
1687     __ uxtw(z18.VnD(), p7.Merging(), z25.VnD());
1688
// The remaining pairs use unpredicated movprfx with the SVE I8MM matrix/dot
// instructions (hence kSVEI8MM in the feature set above).
1689     __ movprfx(z22, z5);
1690     __ smmla(z22.VnS(), z21.VnB(), z0.VnB());
1691
1692     __ movprfx(z1, z5);
1693     __ ummla(z1.VnS(), z10.VnB(), z0.VnB());
1694
1695     __ movprfx(z30, z5);
1696     __ usmmla(z30.VnS(), z31.VnB(), z18.VnB());
1697
1698     __ movprfx(z4, z5);
1699     __ usdot(z4.VnS(), z3.VnB(), z3.VnB());
1700
1701     __ movprfx(z10, z5);
1702     __ usdot(z10.VnS(), z9.VnB(), z0.VnB(), 0);
1703
1704     __ movprfx(z1, z5);
1705     __ sudot(z1.VnS(), z10.VnB(), z2.VnB(), 1);
1706   }
1707   assm.FinalizeCode();
1708
// `true`: every emitted pair must be accepted as a valid movprfx combination.
1709   CheckAndMaybeDisassembleMovprfxPairs(assm.GetBuffer(), true);
1710 }
1711
TEST(movprfx_positive_fp)1712 TEST(movprfx_positive_fp) {
// Floating-point counterpart of movprfx_positive: each pair is a movprfx
// (predicated or unpredicated) followed by a predicated FP instruction that
// legally accepts the prefix; the checker below is called with `true`.
1713   Assembler assm;
1714   assm.GetCPUFeatures()->Combine(CPUFeatures::kSVE,
1715                                  CPUFeatures::kSVEF32MM,
1716                                  CPUFeatures::kSVEF64MM);
1717   {
1718     // We have to use the Assembler directly to generate movprfx, so we need
1719     // to manually reserve space for the code we're about to emit.
// kPairCount must match the number of movprfx/instruction pairs below.
1720     static const size_t kPairCount = 75;
1721     CodeBufferCheckScope guard(&assm, kPairCount * 2 * kInstructionSize);
1722
1723     __ movprfx(z18.VnS(), p6.Zeroing(), z20.VnS());
1724     __ fabd(z18.VnS(), p6.Merging(), z18.VnS(), z19.VnS());
1725
1726     __ movprfx(z28.VnD(), p4.Zeroing(), z24.VnD());
1727     __ fabs(z28.VnD(), p4.Merging(), z24.VnD());
1728
1729     __ movprfx(z12, z8);
1730     __ fadd(z12.VnS(), p2.Merging(), z12.VnS(), 0.5);
1731
1732     __ movprfx(z0.VnS(), p1.Merging(), z9.VnS());
1733     __ fadd(z0.VnS(), p1.Merging(), z0.VnS(), z9.VnS());
1734
1735     __ movprfx(z10.VnH(), p2.Merging(), z2.VnH());
1736     __ fcadd(z10.VnH(), p2.Merging(), z10.VnH(), z20.VnH(), 90);
1737
1738     __ movprfx(z21, z6);
1739     __ fcmla(z21.VnH(), z31.VnH(), z6.VnH(), 2, 180);
1740
1741     __ movprfx(z16, z6);
1742     __ fcmla(z16.VnS(), z11.VnS(), z6.VnS(), 1, 270);
1743
1744     __ movprfx(z15.VnH(), p6.Merging(), z16.VnH());
1745     __ fcpy(z15.VnH(), p6.Merging(), 1.25);
1746
1747     __ movprfx(z1, z14);
1748     __ fcvt(z1.VnD(), p2.Merging(), z4.VnH());
1749
1750     __ movprfx(z25.VnD(), p6.Merging(), z1.VnD());
1751     __ fcvt(z25.VnD(), p6.Merging(), z1.VnS());
1752
1753     __ movprfx(z18.VnS(), p2.Merging(), z2.VnS());
1754     __ fcvt(z18.VnH(), p2.Merging(), z7.VnS());
1755
1756     __ movprfx(z21.VnD(), p5.Zeroing(), z26.VnD());
1757     __ fcvt(z21.VnH(), p5.Merging(), z26.VnD());
1758
1759     __ movprfx(z12.VnD(), p1.Merging(), z18.VnD());
1760     __ fcvtzs(z12.VnD(), p1.Merging(), z18.VnH());
1761
1762     __ movprfx(z3.VnS(), p2.Merging(), z0.VnS());
1763     __ fcvtzs(z3.VnS(), p2.Merging(), z26.VnS());
1764
1765     __ movprfx(z21.VnS(), p4.Merging(), z7.VnS());
1766     __ fcvtzs(z21.VnS(), p4.Merging(), z7.VnH());
1767
1768     __ movprfx(z16.VnD(), p3.Zeroing(), z4.VnD());
1769     __ fcvtzs(z16.VnS(), p3.Merging(), z28.VnD());
1770
1771     __ movprfx(z31.VnD(), p4.Merging(), z1.VnD());
1772     __ fcvtzu(z31.VnD(), p4.Merging(), z1.VnH());
1773
1774     __ movprfx(z23.VnH(), p0.Zeroing(), z28.VnH());
1775     __ fcvtzu(z23.VnH(), p0.Merging(), z28.VnH());
1776
1777     __ movprfx(z2, z12);
1778     __ fcvtzu(z2.VnD(), p3.Merging(), z28.VnS());
1779
1780     __ movprfx(z4, z7);
1781     __ fcvtzu(z4.VnS(), p7.Merging(), z16.VnD());
1782
1783     __ movprfx(z13.VnS(), p3.Zeroing(), z23.VnS());
1784     __ fdiv(z13.VnS(), p3.Merging(), z13.VnS(), z23.VnS());
1785
1786     __ movprfx(z6.VnD(), p1.Zeroing(), z16.VnD());
1787     __ fdivr(z6.VnD(), p1.Merging(), z6.VnD(), z5.VnD());
1788
1789     __ movprfx(z31, z23);
1790     __ fmad(z31.VnS(), p5.Merging(), z23.VnS(), z11.VnS());
1791
1792     __ movprfx(z14.VnH(), p7.Merging(), z21.VnH());
1793     __ fmax(z14.VnH(), p7.Merging(), z14.VnH(), 0.0);
1794
1795     __ movprfx(z17.VnS(), p4.Merging(), z9.VnS());
1796     __ fmax(z17.VnS(), p4.Merging(), z17.VnS(), z9.VnS());
1797
1798     __ movprfx(z1.VnS(), p3.Zeroing(), z30.VnS());
1799     __ fmaxnm(z1.VnS(), p3.Merging(), z1.VnS(), 0.0);
1800
1801     __ movprfx(z10.VnD(), p1.Zeroing(), z17.VnD());
1802     __ fmaxnm(z10.VnD(), p1.Merging(), z10.VnD(), z17.VnD());
1803
1804     __ movprfx(z3, z13);
1805     __ fmin(z3.VnS(), p0.Merging(), z3.VnS(), 0.0);
1806
1807     __ movprfx(z15, z21);
1808     __ fmin(z15.VnS(), p4.Merging(), z15.VnS(), z21.VnS());
1809
1810     __ movprfx(z30.VnH(), p7.Zeroing(), z25.VnH());
1811     __ fminnm(z30.VnH(), p7.Merging(), z30.VnH(), 0.0);
1812
1813     __ movprfx(z31, z15);
1814     __ fminnm(z31.VnD(), p5.Merging(), z31.VnD(), z25.VnD());
1815
1816     __ movprfx(z27, z28);
1817     __ fmla(z27.VnD(), z28.VnD(), z12.VnD(), 1);
1818
1819     __ movprfx(z26.VnH(), p6.Zeroing(), z13.VnH());
1820     __ fmla(z26.VnH(), p6.Merging(), z13.VnH(), z7.VnH());
1821
1822     __ movprfx(z26, z10);
1823     __ fmla(z26.VnH(), z10.VnH(), z1.VnH(), 7);
1824
1825     __ movprfx(z0, z1);
1826     __ fmla(z0.VnS(), z25.VnS(), z1.VnS(), 3);
1827
1828     __ movprfx(z7, z3);
1829     __ fmls(z7.VnD(), z30.VnD(), z3.VnD(), 1);
1830
1831     __ movprfx(z1, z24);
1832     __ fmls(z1.VnD(), p5.Merging(), z20.VnD(), z24.VnD());
1833
1834     __ movprfx(z19, z18);
1835     __ fmls(z19.VnH(), z18.VnH(), z7.VnH(), 4);
1836
1837     __ movprfx(z0, z26);
1838     __ fmls(z0.VnS(), z17.VnS(), z4.VnS(), 3);
1839
1840     __ movprfx(z19.VnS(), p7.Zeroing(), z6.VnS());
1841     __ fmov(z19.VnS(), p7.Merging(), 0.0);
1842
1843     __ movprfx(z21, z15);
1844     __ fmov(z21.VnH(), p7.Merging(), 2.5);
1845
1846     __ movprfx(z23, z18);
1847     __ fmsb(z23.VnS(), p4.Merging(), z1.VnS(), z7.VnS());
1848
1849     __ movprfx(z8, z28);
1850     __ fmul(z8.VnS(), p4.Merging(), z8.VnS(), 2.0);
1851
1852     __ movprfx(z6.VnD(), p6.Merging(), z27.VnD());
1853     __ fmul(z6.VnD(), p6.Merging(), z6.VnD(), z27.VnD());
1854
1855     __ movprfx(z6.VnH(), p0.Merging(), z19.VnH());
1856     __ fmulx(z6.VnH(), p0.Merging(), z6.VnH(), z19.VnH());
1857
1858     __ movprfx(z5.VnH(), p0.Merging(), z1.VnH());
1859     __ fneg(z5.VnH(), p0.Merging(), z1.VnH());
1860
1861     __ movprfx(z22.VnD(), p4.Zeroing(), z24.VnD());
1862     __ fnmad(z22.VnD(), p4.Merging(), z24.VnD(), z12.VnD());
1863
1864     __ movprfx(z5.VnS(), p0.Merging(), z29.VnS());
1865     __ fnmla(z5.VnS(), p0.Merging(), z17.VnS(), z29.VnS());
1866
1867     __ movprfx(z5, z3);
1868     __ fnmls(z5.VnD(), p5.Merging(), z3.VnD(), z2.VnD());
1869
1870     __ movprfx(z9.VnD(), p2.Zeroing(), z7.VnD());
1871     __ fnmsb(z9.VnD(), p2.Merging(), z7.VnD(), z23.VnD());
1872
1873     // Note that frecpe and frecps _cannot_ take movprfx.
1874     __ movprfx(z12.VnH(), p1.Zeroing(), z17.VnH());
1875     __ frecpx(z12.VnH(), p1.Merging(), z4.VnH());
1876
1877     __ movprfx(z28.VnS(), p4.Zeroing(), z27.VnS());
1878     __ frinta(z28.VnS(), p4.Merging(), z24.VnS());
1879
1880     __ movprfx(z7.VnD(), p7.Merging(), z25.VnD());
1881     __ frinti(z7.VnD(), p7.Merging(), z25.VnD());
1882
1883     __ movprfx(z10, z21);
1884     __ frintm(z10.VnD(), p5.Merging(), z26.VnD());
1885
1886     __ movprfx(z25, z21);
1887     __ frintn(z25.VnH(), p4.Merging(), z1.VnH());
1888
1889     __ movprfx(z25, z9);
1890     __ frintp(z25.VnH(), p1.Merging(), z9.VnH());
1891
1892     __ movprfx(z30, z16);
1893     __ frintx(z30.VnS(), p1.Merging(), z16.VnS());
1894
1895     __ movprfx(z0.VnD(), p5.Merging(), z9.VnD());
1896     __ frintz(z0.VnD(), p5.Merging(), z23.VnD());
1897
1898     __ movprfx(z11.VnD(), p7.Merging(), z2.VnD());
1899     __ fscale(z11.VnD(), p7.Merging(), z11.VnD(), z2.VnD());
1900
1901     __ movprfx(z23.VnS(), p4.Merging(), z17.VnS());
1902     __ fsqrt(z23.VnS(), p4.Merging(), z10.VnS());
1903
1904     __ movprfx(z0.VnD(), p2.Merging(), z26.VnD());
1905     __ fsub(z0.VnD(), p2.Merging(), z0.VnD(), 1.0);
1906
1907     __ movprfx(z28.VnD(), p1.Zeroing(), z16.VnD());
1908     __ fsub(z28.VnD(), p1.Merging(), z28.VnD(), z16.VnD());
1909
1910     __ movprfx(z22, z27);
1911     __ fsubr(z22.VnD(), p4.Merging(), z22.VnD(), 1.0);
1912
1913     __ movprfx(z4.VnS(), p2.Merging(), z26.VnS());
1914     __ fsubr(z4.VnS(), p2.Merging(), z4.VnS(), z26.VnS());
1915
1916     // Note that ftsmul and ftssel _cannot_ take movprfx.
1917     __ movprfx(z10, z4);
1918     __ ftmad(z10.VnS(), z10.VnS(), z4.VnS(), 2);
1919
1920     __ movprfx(z2, z16);
1921     __ scvtf(z2.VnD(), p1.Merging(), z16.VnS());
1922
1923     __ movprfx(z10, z20);
1924     __ scvtf(z10.VnD(), p5.Merging(), z20.VnD());
1925
1926     __ movprfx(z29, z28);
1927     __ scvtf(z29.VnS(), p0.Merging(), z31.VnD());
1928
1929     __ movprfx(z26.VnD(), p3.Merging(), z13.VnD());
1930     __ scvtf(z26.VnH(), p3.Merging(), z5.VnD());
1931
1932     __ movprfx(z7.VnD(), p3.Zeroing(), z26.VnD());
1933     __ ucvtf(z7.VnD(), p3.Merging(), z26.VnS());
1934
1935     __ movprfx(z13, z17);
1936     __ ucvtf(z13.VnD(), p7.Merging(), z17.VnD());
1937
1938     __ movprfx(z24.VnD(), p1.Merging(), z31.VnD());
1939     __ ucvtf(z24.VnS(), p1.Merging(), z18.VnD());
1940
1941     __ movprfx(z17.VnD(), p4.Merging(), z22.VnD());
1942     __ ucvtf(z17.VnH(), p4.Merging(), z4.VnD());
1943
// fmmla requires the F32MM/F64MM features enabled above; unpredicated movprfx
// is the only legal prefix form for it.
1944     __ movprfx(z30, z5);
1945     __ fmmla(z30.VnS(), z29.VnS(), z18.VnS());
1946
1947     __ movprfx(z31, z5);
1948     __ fmmla(z31.VnD(), z30.VnD(), z18.VnD());
1949   }
1950   assm.FinalizeCode();
1951
// `true`: every emitted pair must be accepted as a valid movprfx combination.
1952   CheckAndMaybeDisassembleMovprfxPairs(assm.GetBuffer(), true);
1953 }
1954
// Check that every SVE2 destructive instruction below forms a *valid* pair
// with an immediately-preceding movprfx (the `true` argument to the checker
// asserts that each pair is accepted). Each pair follows the movprfx rules
// visible here: the prefixed register is the destination of the following
// instruction, and movprfx's source is never an input of that instruction.
TEST(movprfx_positive_sve2) {
  Assembler assm;
  assm.GetCPUFeatures()->Combine(CPUFeatures::kSVE, CPUFeatures::kSVE2);
  {
    // We have to use the Assembler directly to generate movprfx, so we need
    // to manually reserve space for the code we're about to emit.
    static const size_t kPairCount = 145;
    CodeBufferCheckScope guard(&assm, kPairCount * 2 * kInstructionSize);

    __ movprfx(z25, z26);
    __ adclb(z25.VnS(), z17.VnS(), z24.VnS());

    __ movprfx(z0, z1);
    __ adclt(z0.VnS(), z2.VnS(), z15.VnS());

    __ movprfx(z3, z4);
    __ addp(z3.VnB(), p1.Merging(), z3.VnB(), z0.VnB());

    __ movprfx(z6, z7);
    __ bcax(z6.VnD(), z6.VnD(), z12.VnD(), z1.VnD());

    __ movprfx(z18, z19);
    __ bsl1n(z18.VnD(), z18.VnD(), z8.VnD(), z7.VnD());

    __ movprfx(z7, z8);
    __ bsl2n(z7.VnD(), z7.VnD(), z3.VnD(), z19.VnD());

    __ movprfx(z21, z22);
    __ bsl(z21.VnD(), z21.VnD(), z2.VnD(), z2.VnD());

    __ movprfx(z5, z6);
    __ cadd(z5.VnB(), z5.VnB(), z12.VnB(), 90);

    __ movprfx(z7, z8);
    __ cdot(z7.VnS(), z4.VnB(), z10.VnB(), 0);

    __ movprfx(z7, z8);
    __ cdot(z7.VnS(), z4.VnB(), z0.VnB(), 0, 0);

    __ movprfx(z7, z8);
    __ cdot(z7.VnD(), z4.VnH(), z0.VnH(), 0, 0);

    __ movprfx(z19, z20);
    __ cmla(z19.VnB(), z7.VnB(), z2.VnB(), 0);

    __ movprfx(z19, z20);
    __ cmla(z19.VnS(), z7.VnS(), z2.VnS(), 0, 0);

    __ movprfx(z19, z20);
    __ cmla(z19.VnH(), z7.VnH(), z2.VnH(), 0, 0);

    __ movprfx(z10, z11);
    __ eor3(z10.VnD(), z10.VnD(), z24.VnD(), z23.VnD());

    __ movprfx(z3, z4);
    __ eorbt(z3.VnB(), z10.VnB(), z8.VnB());

    __ movprfx(z20, z22);
    __ eortb(z20.VnB(), z21.VnB(), z15.VnB());

    __ movprfx(z14, z15);
    __ faddp(z14.VnD(), p1.Merging(), z14.VnD(), z26.VnD());

    __ movprfx(z14.VnD(), p4.Merging(), z15.VnD());
    __ fcvtx(z14.VnS(), p4.Merging(), z0.VnD());

    __ movprfx(z15.VnH(), p0.Merging(), z16.VnH());
    __ flogb(z15.VnH(), p0.Merging(), z3.VnH());

    __ movprfx(z2, z3);
    __ fmaxnmp(z2.VnD(), p1.Merging(), z2.VnD(), z14.VnD());

    __ movprfx(z22, z23);
    __ fmaxp(z22.VnD(), p1.Merging(), z22.VnD(), z3.VnD());

    __ movprfx(z1, z2);
    __ fminnmp(z1.VnD(), p0.Merging(), z1.VnD(), z14.VnD());

    __ movprfx(z16, z17);
    __ fminp(z16.VnD(), p3.Merging(), z16.VnD(), z11.VnD());

    __ movprfx(z16, z17);
    __ fmlalb(z16.VnS(), z18.VnH(), z29.VnH());

    __ movprfx(z16, z17);
    __ fmlalb(z16.VnS(), z18.VnH(), z2.VnH(), 0);

    __ movprfx(z18, z19);
    __ fmlalt(z18.VnS(), z13.VnH(), z5.VnH());

    __ movprfx(z18, z19);
    __ fmlalt(z18.VnS(), z13.VnH(), z5.VnH(), 0);

    __ movprfx(z16, z17);
    __ fmlslb(z16.VnS(), z10.VnH(), z1.VnH());

    __ movprfx(z16, z17);
    __ fmlslb(z16.VnS(), z10.VnH(), z1.VnH(), 0);

    __ movprfx(z3, z4);
    __ fmlslt(z3.VnS(), z17.VnH(), z14.VnH());

    __ movprfx(z3, z4);
    __ fmlslt(z3.VnS(), z17.VnH(), z1.VnH(), 0);

    __ movprfx(z2, z3);
    __ mla(z2.VnH(), z0.VnH(), z1.VnH(), 0);

    __ movprfx(z2, z3);
    __ mla(z2.VnS(), z0.VnS(), z1.VnS(), 0);

    __ movprfx(z2, z3);
    __ mla(z2.VnD(), z0.VnD(), z1.VnD(), 0);

    __ movprfx(z2, z3);
    __ mls(z2.VnH(), z0.VnH(), z1.VnH(), 0);

    __ movprfx(z2, z3);
    __ mls(z2.VnS(), z0.VnS(), z1.VnS(), 0);

    __ movprfx(z2, z3);
    __ mls(z2.VnD(), z0.VnD(), z1.VnD(), 0);

    __ movprfx(z17, z18);
    __ nbsl(z17.VnD(), z17.VnD(), z21.VnD(), z27.VnD());

    __ movprfx(z13, z14);
    __ saba(z13.VnB(), z2.VnB(), z31.VnB());

    __ movprfx(z13, z14);
    __ sabalb(z13.VnD(), z20.VnS(), z26.VnS());

    __ movprfx(z14, z15);
    __ sabalt(z14.VnD(), z19.VnS(), z10.VnS());

    __ movprfx(z19.VnD(), p5.Merging(), z20.VnD());
    __ sadalp(z19.VnD(), p5.Merging(), z9.VnS());

    __ movprfx(z17, z18);
    __ sbclb(z17.VnS(), z10.VnS(), z8.VnS());

    __ movprfx(z20, z21);
    __ sbclt(z20.VnS(), z0.VnS(), z13.VnS());

    __ movprfx(z20.VnB(), p3.Merging(), z21.VnB());
    __ shadd(z20.VnB(), p3.Merging(), z20.VnB(), z7.VnB());

    __ movprfx(z21.VnB(), p0.Merging(), z22.VnB());
    __ shsub(z21.VnB(), p0.Merging(), z21.VnB(), z0.VnB());

    __ movprfx(z1.VnB(), p0.Merging(), z2.VnB());
    __ shsubr(z1.VnB(), p0.Merging(), z1.VnB(), z2.VnB());

    __ movprfx(z5, z6);
    __ smaxp(z5.VnB(), p4.Merging(), z5.VnB(), z10.VnB());

    __ movprfx(z27, z28);
    __ sminp(z27.VnB(), p3.Merging(), z27.VnB(), z1.VnB());

    __ movprfx(z1, z2);
    __ smlalb(z1.VnD(), z3.VnS(), z23.VnS());

    __ movprfx(z1, z2);
    __ smlalb(z1.VnD(), z3.VnS(), z2.VnS(), 0);

    __ movprfx(z1, z2);
    __ smlalb(z1.VnS(), z3.VnH(), z2.VnH(), 0);

    __ movprfx(z1, z2);
    __ smlalt(z1.VnD(), z3.VnS(), z23.VnS());

    __ movprfx(z1, z2);
    __ smlalt(z1.VnD(), z3.VnS(), z2.VnS(), 0);

    __ movprfx(z1, z2);
    __ smlalt(z1.VnS(), z3.VnH(), z2.VnH(), 0);

    __ movprfx(z1, z2);
    __ smlslb(z1.VnD(), z3.VnS(), z23.VnS());

    __ movprfx(z1, z2);
    __ smlslb(z1.VnD(), z3.VnS(), z2.VnS(), 0);

    __ movprfx(z1, z2);
    __ smlslb(z1.VnS(), z3.VnH(), z2.VnH(), 0);

    __ movprfx(z1, z2);
    __ smlslt(z1.VnD(), z3.VnS(), z23.VnS());

    __ movprfx(z1, z2);
    __ smlslt(z1.VnD(), z3.VnS(), z2.VnS(), 0);

    __ movprfx(z1, z2);
    __ smlslt(z1.VnS(), z3.VnH(), z2.VnH(), 0);

    __ movprfx(z29.VnB(), p1.Merging(), z30.VnB());
    __ sqabs(z29.VnB(), p1.Merging(), z18.VnB());

    __ movprfx(z28.VnB(), p0.Merging(), z29.VnB());
    __ sqadd(z28.VnB(), p0.Merging(), z28.VnB(), z3.VnB());

    __ movprfx(z20, z21);
    __ sqcadd(z20.VnB(), z20.VnB(), z23.VnB(), 90);

    __ movprfx(z6, z7);
    __ sqdmlalb(z6.VnD(), z19.VnS(), z25.VnS());

    __ movprfx(z6, z7);
    __ sqdmlalb(z6.VnD(), z19.VnS(), z2.VnS(), 0);

    __ movprfx(z6, z7);
    __ sqdmlalb(z6.VnS(), z19.VnH(), z2.VnH(), 0);

    __ movprfx(z23, z24);
    __ sqdmlalbt(z23.VnD(), z29.VnS(), z26.VnS());

    __ movprfx(z11, z12);
    __ sqdmlalt(z11.VnD(), z0.VnS(), z0.VnS());

    __ movprfx(z11, z12);
    __ sqdmlalt(z11.VnD(), z0.VnS(), z0.VnS(), 0);

    __ movprfx(z11, z12);
    __ sqdmlalt(z11.VnS(), z0.VnH(), z0.VnH(), 0);

    __ movprfx(z16, z17);
    __ sqdmlslb(z16.VnD(), z26.VnS(), z25.VnS());

    __ movprfx(z16, z17);
    __ sqdmlslb(z16.VnD(), z26.VnS(), z2.VnS(), 0);

    __ movprfx(z16, z17);
    __ sqdmlslb(z16.VnS(), z26.VnH(), z2.VnH(), 0);

    __ movprfx(z26, z27);
    __ sqdmlslbt(z26.VnD(), z23.VnS(), z4.VnS());

    __ movprfx(z21, z22);
    __ sqdmlslt(z21.VnD(), z23.VnS(), z9.VnS());

    __ movprfx(z21, z22);
    __ sqdmlslt(z21.VnD(), z23.VnS(), z0.VnS(), 0);

    __ movprfx(z21, z22);
    __ sqdmlslt(z21.VnS(), z23.VnH(), z0.VnH(), 0);

    __ movprfx(z21.VnB(), p0.Merging(), z22.VnB());
    __ sqneg(z21.VnB(), p0.Merging(), z17.VnB());

    __ movprfx(z31, z0);
    __ sqrdcmlah(z31.VnB(), z15.VnB(), z20.VnB(), 0);

    __ movprfx(z31, z0);
    __ sqrdcmlah(z31.VnH(), z15.VnH(), z2.VnH(), 0, 0);

    __ movprfx(z31, z0);
    __ sqrdcmlah(z31.VnS(), z15.VnS(), z2.VnS(), 0, 0);

    __ movprfx(z27, z28);
    __ sqrdmlah(z27.VnB(), z28.VnB(), z19.VnB());

    __ movprfx(z27, z28);
    __ sqrdmlah(z27.VnH(), z28.VnH(), z1.VnH(), 0);

    __ movprfx(z27, z28);
    __ sqrdmlah(z27.VnS(), z28.VnS(), z1.VnS(), 0);

    __ movprfx(z27, z28);
    __ sqrdmlah(z27.VnD(), z28.VnD(), z1.VnD(), 0);

    __ movprfx(z11, z12);
    __ sqrdmlsh(z11.VnB(), z16.VnB(), z31.VnB());

    __ movprfx(z11, z12);
    __ sqrdmlsh(z11.VnH(), z16.VnH(), z1.VnH(), 0);

    __ movprfx(z11, z12);
    __ sqrdmlsh(z11.VnS(), z16.VnS(), z1.VnS(), 0);

    __ movprfx(z11, z12);
    __ sqrdmlsh(z11.VnD(), z16.VnD(), z1.VnD(), 0);

    __ movprfx(z31.VnB(), p5.Merging(), z0.VnB());
    __ sqrshl(z31.VnB(), p5.Merging(), z31.VnB(), z27.VnB());

    __ movprfx(z25.VnB(), p6.Merging(), z26.VnB());
    __ sqrshlr(z25.VnB(), p6.Merging(), z25.VnB(), z7.VnB());

    __ movprfx(z0.VnB(), p5.Merging(), z1.VnB());
    __ sqshl(z0.VnB(), p5.Merging(), z0.VnB(), 0);

    __ movprfx(z0.VnB(), p5.Merging(), z1.VnB());
    __ sqshl(z0.VnB(), p5.Merging(), z0.VnB(), z2.VnB());

    __ movprfx(z7.VnB(), p3.Merging(), z8.VnB());
    __ sqshlr(z7.VnB(), p3.Merging(), z7.VnB(), z5.VnB());

    __ movprfx(z10.VnB(), p1.Merging(), z11.VnB());
    __ sqshlu(z10.VnB(), p1.Merging(), z10.VnB(), 0);

    __ movprfx(z16.VnB(), p7.Merging(), z17.VnB());
    __ sqsub(z16.VnB(), p7.Merging(), z16.VnB(), z22.VnB());

    __ movprfx(z16.VnB(), p7.Merging(), z17.VnB());
    __ sqsubr(z16.VnB(), p7.Merging(), z16.VnB(), z22.VnB());

    __ movprfx(z23.VnB(), p4.Merging(), z24.VnB());
    __ srhadd(z23.VnB(), p4.Merging(), z23.VnB(), z14.VnB());

    __ movprfx(z31.VnB(), p7.Merging(), z0.VnB());
    __ srshl(z31.VnB(), p7.Merging(), z31.VnB(), z3.VnB());

    __ movprfx(z16.VnB(), p7.Merging(), z17.VnB());
    __ srshlr(z16.VnB(), p7.Merging(), z16.VnB(), z29.VnB());

    __ movprfx(z12.VnB(), p0.Merging(), z13.VnB());
    __ srshr(z12.VnB(), p0.Merging(), z12.VnB(), 1);

    __ movprfx(z0, z1);
    __ srsra(z0.VnB(), z8.VnB(), 1);

    __ movprfx(z0, z1);
    __ ssra(z0.VnB(), z8.VnB(), 1);

    __ movprfx(z26.VnB(), p2.Merging(), z27.VnB());
    __ suqadd(z26.VnB(), p2.Merging(), z26.VnB(), z28.VnB());

    __ movprfx(z23, z24);
    __ uaba(z23.VnB(), z22.VnB(), z20.VnB());

    __ movprfx(z11, z12);
    __ uabalb(z11.VnD(), z25.VnS(), z12.VnS());

    __ movprfx(z4, z5);
    __ uabalt(z4.VnD(), z2.VnS(), z31.VnS());

    __ movprfx(z20.VnD(), p4.Merging(), z21.VnD());
    __ uadalp(z20.VnD(), p4.Merging(), z5.VnS());

    __ movprfx(z21.VnB(), p2.Merging(), z22.VnB());
    __ uhadd(z21.VnB(), p2.Merging(), z21.VnB(), z19.VnB());

    __ movprfx(z1.VnB(), p4.Merging(), z2.VnB());
    __ uhsub(z1.VnB(), p4.Merging(), z1.VnB(), z9.VnB());

    __ movprfx(z18.VnB(), p0.Merging(), z19.VnB());
    __ uhsubr(z18.VnB(), p0.Merging(), z18.VnB(), z1.VnB());

    __ movprfx(z7, z8);
    __ umaxp(z7.VnB(), p2.Merging(), z7.VnB(), z23.VnB());

    __ movprfx(z10, z11);
    __ uminp(z10.VnB(), p0.Merging(), z10.VnB(), z22.VnB());

    __ movprfx(z31, z0);
    __ umlalb(z31.VnD(), z9.VnS(), z21.VnS());

    __ movprfx(z31, z0);
    __ umlalb(z31.VnD(), z9.VnS(), z1.VnS(), 0);

    __ movprfx(z31, z0);
    __ umlalb(z31.VnS(), z9.VnH(), z1.VnH(), 0);

    __ movprfx(z11, z12);
    __ umlalt(z11.VnD(), z5.VnS(), z22.VnS());

    __ movprfx(z11, z12);
    __ umlalt(z11.VnD(), z5.VnS(), z2.VnS(), 0);

    __ movprfx(z11, z12);
    __ umlalt(z11.VnS(), z5.VnH(), z2.VnH(), 0);

    __ movprfx(z28, z29);
    __ umlslb(z28.VnD(), z13.VnS(), z9.VnS());

    __ movprfx(z28, z29);
    __ umlslb(z28.VnD(), z13.VnS(), z1.VnS(), 0);

    __ movprfx(z28, z29);
    __ umlslb(z28.VnS(), z13.VnH(), z1.VnH(), 0);

    __ movprfx(z9, z10);
    __ umlslt(z9.VnD(), z12.VnS(), z30.VnS());

    __ movprfx(z9, z10);
    __ umlslt(z9.VnD(), z12.VnS(), z0.VnS(), 0);

    __ movprfx(z9, z10);
    __ umlslt(z9.VnS(), z12.VnH(), z0.VnH(), 0);

    __ movprfx(z24.VnB(), p7.Merging(), z25.VnB());
    // This statement previously ended with a stray comma, fusing it with the
    // following movprfx via the comma operator; fixed to a semicolon.
    __ uqadd(z24.VnB(), p7.Merging(), z24.VnB(), z1.VnB());

    __ movprfx(z20.VnB(), p1.Merging(), z21.VnB());
    __ uqrshl(z20.VnB(), p1.Merging(), z20.VnB(), z30.VnB());

    __ movprfx(z8.VnB(), p5.Merging(), z9.VnB());
    __ uqrshlr(z8.VnB(), p5.Merging(), z8.VnB(), z9.VnB());

    __ movprfx(z29.VnB(), p7.Merging(), z30.VnB());
    __ uqshl(z29.VnB(), p7.Merging(), z29.VnB(), 0);

    __ movprfx(z29.VnB(), p7.Merging(), z30.VnB());
    __ uqshl(z29.VnB(), p7.Merging(), z29.VnB(), z30.VnB());

    __ movprfx(z12.VnB(), p1.Merging(), z13.VnB());
    __ uqshlr(z12.VnB(), p1.Merging(), z12.VnB(), z13.VnB());

    __ movprfx(z20.VnB(), p0.Merging(), z21.VnB());
    __ uqsub(z20.VnB(), p0.Merging(), z20.VnB(), z6.VnB());

    __ movprfx(z20.VnB(), p0.Merging(), z21.VnB());
    __ uqsubr(z20.VnB(), p0.Merging(), z20.VnB(), z6.VnB());

    __ movprfx(z25.VnS(), p7.Merging(), z26.VnS());
    __ urecpe(z25.VnS(), p7.Merging(), z2.VnS());

    __ movprfx(z29.VnB(), p4.Merging(), z30.VnB());
    __ urhadd(z29.VnB(), p4.Merging(), z29.VnB(), z10.VnB());

    __ movprfx(z15.VnB(), p2.Merging(), z16.VnB());
    __ urshl(z15.VnB(), p2.Merging(), z15.VnB(), z3.VnB());

    __ movprfx(z27.VnB(), p1.Merging(), z28.VnB());
    __ urshlr(z27.VnB(), p1.Merging(), z27.VnB(), z30.VnB());

    __ movprfx(z31.VnB(), p2.Merging(), z0.VnB());
    __ urshr(z31.VnB(), p2.Merging(), z31.VnB(), 1);

    __ movprfx(z4.VnS(), p3.Merging(), z5.VnS());
    __ ursqrte(z4.VnS(), p3.Merging(), z3.VnS());

    __ movprfx(z0, z1);
    __ ursra(z0.VnB(), z8.VnB(), 1);

    __ movprfx(z25.VnB(), p4.Merging(), z26.VnB());
    __ usqadd(z25.VnB(), p4.Merging(), z25.VnB(), z6.VnB());

    __ movprfx(z0, z1);
    __ usra(z0.VnB(), z8.VnB(), 1);

    __ movprfx(z16, z17);
    __ xar(z16.VnB(), z16.VnB(), z13.VnB(), 1);
  }
  assm.FinalizeCode();

  // `true`: every movprfx/instruction pair above must be accepted as valid.
  CheckAndMaybeDisassembleMovprfxPairs(assm.GetBuffer(), true);
}
2403
// Check that none of the SVE2 (and SVEBitPerm) instructions below may be
// prefixed by movprfx (the `false` argument to the checker asserts that every
// pair is rejected). The exact instruction sequence and pair count are
// load-bearing: kPairCount sizes the code-buffer reservation below.
TEST(movprfx_negative_instructions_sve2) {
  Assembler assm;
  assm.GetCPUFeatures()->Combine(CPUFeatures::kSVE,
                                 CPUFeatures::kSVE2,
                                 CPUFeatures::kSVEBitPerm);
  {
    // We have to use the Assembler directly to generate movprfx, so we need
    // to manually reserve space for the code we're about to emit.
    static const size_t kPairCount = 133;
    CodeBufferCheckScope guard(&assm, kPairCount * 2 * kInstructionSize);

    __ movprfx(z29, z30);
    __ addhnb(z29.VnS(), z19.VnD(), z2.VnD());

    __ movprfx(z8, z9);
    __ addhnt(z8.VnS(), z12.VnD(), z6.VnD());

    __ movprfx(z18, z19);
    __ bdep(z18.VnB(), z10.VnB(), z0.VnB());

    __ movprfx(z6, z7);
    __ bext(z6.VnB(), z2.VnB(), z5.VnB());

    __ movprfx(z24, z25);
    __ bgrp(z24.VnB(), z9.VnB(), z5.VnB());

    __ movprfx(z1, z2);
    __ fcvtlt(z1.VnD(), p1.Merging(), z28.VnS());

    __ movprfx(z1, z2);
    __ fcvtlt(z1.VnS(), p1.Merging(), z28.VnH());

    __ movprfx(z4, z5);
    __ fcvtnt(z4.VnH(), p7.Merging(), z0.VnS());

    __ movprfx(z4, z5);
    __ fcvtnt(z4.VnS(), p7.Merging(), z0.VnD());

    __ movprfx(z27, z28);
    __ fcvtxnt(z27.VnS(), p0.Merging(), z17.VnD());

    __ movprfx(z24, z25);
    __ histcnt(z24.VnS(), p6.Zeroing(), z3.VnS(), z10.VnS());

    __ movprfx(z22, z23);
    __ histseg(z22.VnB(), z14.VnB(), z8.VnB());

    // Loads (and stores, further down) can never take a movprfx.
    __ movprfx(z21, z22);
    __ ldnt1b(z21.VnS(), p5.Zeroing(), SVEMemOperand(z21.VnS(), x23));

    __ movprfx(z21, z22);
    __ ldnt1b(z21.VnD(), p5.Zeroing(), SVEMemOperand(z1.VnD(), x23));

    __ movprfx(z10, z11);
    __ ldnt1d(z10.VnD(), p0.Zeroing(), SVEMemOperand(z23.VnD(), x6));

    __ movprfx(z30, z31);
    __ ldnt1h(z30.VnS(), p4.Zeroing(), SVEMemOperand(z6.VnS(), x11));

    __ movprfx(z30, z31);
    __ ldnt1h(z30.VnD(), p4.Zeroing(), SVEMemOperand(z6.VnD(), x11));

    __ movprfx(z7, z8);
    __ ldnt1sb(z7.VnS(), p3.Zeroing(), SVEMemOperand(z18.VnS(), x11));

    __ movprfx(z7, z8);
    __ ldnt1sb(z7.VnD(), p3.Zeroing(), SVEMemOperand(z18.VnD(), x11));

    __ movprfx(z17, z18);
    __ ldnt1sh(z17.VnS(), p5.Zeroing(), SVEMemOperand(z31.VnS(), x19));

    __ movprfx(z17, z18);
    __ ldnt1sh(z17.VnD(), p5.Zeroing(), SVEMemOperand(z31.VnD(), x19));

    __ movprfx(z3, z4);
    __ ldnt1sw(z3.VnD(), p7.Zeroing(), SVEMemOperand(z1.VnD(), x10));

    __ movprfx(z0, z1);
    __ ldnt1w(z0.VnS(), p4.Zeroing(), SVEMemOperand(z11.VnS(), x1));

    __ movprfx(z0, z1);
    __ ldnt1w(z0.VnD(), p4.Zeroing(), SVEMemOperand(z11.VnD(), x1));

    // match/nmatch (and the while* forms below) write a predicate register,
    // not the prefixed vector register.
    __ movprfx(z18, z19);
    __ match(p15.VnB(), p1.Zeroing(), z18.VnB(), z5.VnB());

    __ movprfx(z15, z16);
    __ mul(z15.VnB(), z15.VnB(), z15.VnB());

    __ movprfx(z15, z16);
    __ mul(z15.VnH(), z15.VnH(), z1.VnH(), 0);

    __ movprfx(z15, z16);
    __ mul(z15.VnS(), z15.VnS(), z1.VnS(), 0);

    __ movprfx(z15, z16);
    __ mul(z15.VnD(), z15.VnD(), z1.VnD(), 0);

    __ movprfx(z20, z21);
    __ nmatch(p1.VnB(), p1.Zeroing(), z20.VnB(), z17.VnB());

    __ movprfx(z0, z1);
    __ pmul(z0.VnB(), z5.VnB(), z5.VnB());

    __ movprfx(z12, z13);
    __ pmullb(z12.VnD(), z21.VnS(), z12.VnS());

    __ movprfx(z31, z0);
    __ pmullt(z31.VnD(), z30.VnS(), z26.VnS());

    __ movprfx(z0, z1);
    __ raddhnb(z0.VnS(), z11.VnD(), z10.VnD());

    __ movprfx(z23, z24);
    __ raddhnt(z23.VnS(), z27.VnD(), z9.VnD());

    __ movprfx(z5, z6);
    __ rshrnb(z5.VnB(), z1.VnH(), 1);

    __ movprfx(z5, z6);
    __ rshrnt(z5.VnB(), z1.VnH(), 8);

    __ movprfx(z30, z31);
    __ rsubhnb(z30.VnS(), z29.VnD(), z11.VnD());

    __ movprfx(z25, z26);
    __ rsubhnt(z25.VnS(), z7.VnD(), z18.VnD());

    __ movprfx(z2, z3);
    __ sabdlb(z2.VnD(), z21.VnS(), z3.VnS());

    __ movprfx(z25, z26);
    __ sabdlt(z25.VnD(), z23.VnS(), z17.VnS());

    __ movprfx(z24, z25);
    __ saddlb(z24.VnD(), z30.VnS(), z16.VnS());

    __ movprfx(z15, z16);
    __ saddlbt(z15.VnD(), z6.VnS(), z18.VnS());

    __ movprfx(z21, z22);
    __ saddlt(z21.VnD(), z29.VnS(), z31.VnS());

    __ movprfx(z12, z13);
    __ saddwb(z12.VnD(), z8.VnD(), z8.VnS());

    __ movprfx(z24, z25);
    __ saddwt(z24.VnD(), z0.VnD(), z3.VnS());

    __ movprfx(z7, z8);
    __ shrnb(z7.VnB(), z4.VnH(), 1);

    __ movprfx(z21, z22);
    __ shrnt(z21.VnB(), z29.VnH(), 1);

    __ movprfx(z29, z30);
    __ sli(z29.VnB(), z7.VnB(), 0);

    __ movprfx(z23, z24);
    __ smulh(z23.VnB(), z23.VnB(), z3.VnB());

    __ movprfx(z10, z11);
    __ smullb(z10.VnD(), z4.VnS(), z4.VnS());

    __ movprfx(z10, z11);
    __ smullb(z10.VnS(), z4.VnH(), z4.VnH(), 0);

    __ movprfx(z10, z11);
    __ smullb(z10.VnD(), z4.VnS(), z4.VnS(), 0);

    __ movprfx(z31, z0);
    __ smullt(z31.VnD(), z26.VnS(), z5.VnS());

    __ movprfx(z31, z0);
    __ smullt(z31.VnS(), z26.VnH(), z5.VnH(), 0);

    __ movprfx(z31, z0);
    __ smullt(z31.VnD(), z26.VnS(), z5.VnS(), 0);

    __ movprfx(z18, z19);
    __ sqdmulh(z18.VnB(), z25.VnB(), z1.VnB());

    __ movprfx(z18, z19);
    __ sqdmulh(z18.VnH(), z25.VnH(), z1.VnH(), 0);

    __ movprfx(z18, z19);
    __ sqdmulh(z18.VnS(), z25.VnS(), z1.VnS(), 0);

    __ movprfx(z18, z19);
    __ sqdmulh(z18.VnD(), z25.VnD(), z1.VnD(), 0);

    __ movprfx(z1, z2);
    __ sqdmullb(z1.VnD(), z31.VnS(), z21.VnS());

    __ movprfx(z1, z2);
    __ sqdmullb(z1.VnS(), z31.VnH(), z1.VnH(), 0);

    __ movprfx(z1, z2);
    __ sqdmullb(z1.VnD(), z31.VnS(), z1.VnS(), 0);

    __ movprfx(z2, z3);
    __ sqdmullt(z2.VnD(), z1.VnS(), z5.VnS());

    __ movprfx(z2, z3);
    __ sqdmullt(z2.VnS(), z1.VnH(), z5.VnH(), 0);

    __ movprfx(z2, z3);
    __ sqdmullt(z2.VnD(), z1.VnS(), z5.VnS(), 0);

    __ movprfx(z21, z22);
    __ sqrdmulh(z21.VnB(), z21.VnB(), z27.VnB());

    __ movprfx(z21, z22);
    __ sqrdmulh(z21.VnH(), z21.VnH(), z2.VnH(), 0);

    __ movprfx(z21, z22);
    __ sqrdmulh(z21.VnS(), z21.VnS(), z2.VnS(), 0);

    __ movprfx(z21, z22);
    __ sqrdmulh(z21.VnD(), z21.VnD(), z2.VnD(), 0);

    __ movprfx(z1, z2);
    __ sqrshrnb(z1.VnB(), z1.VnH(), 1);

    __ movprfx(z24, z25);
    __ sqrshrnt(z24.VnB(), z19.VnH(), 8);

    __ movprfx(z23, z24);
    __ sqrshrunb(z23.VnB(), z28.VnH(), 1);

    __ movprfx(z9, z10);
    __ sqrshrunt(z9.VnB(), z15.VnH(), 8);

    __ movprfx(z25, z26);
    __ sqshrnb(z25.VnB(), z1.VnH(), 1);

    __ movprfx(z0, z1);
    __ sqshrnt(z0.VnB(), z25.VnH(), 8);

    __ movprfx(z25, z26);
    __ sqshrunb(z25.VnB(), z10.VnH(), 1);

    __ movprfx(z20, z21);
    __ sqshrunt(z20.VnB(), z3.VnH(), 8);

    __ movprfx(z2, z3);
    __ sqxtnb(z2.VnB(), z0.VnH());

    __ movprfx(z31, z0);
    __ sqxtnt(z31.VnB(), z18.VnH());

    __ movprfx(z28, z29);
    __ sqxtunb(z28.VnB(), z6.VnH());

    __ movprfx(z14, z15);
    __ sqxtunt(z14.VnB(), z31.VnH());

    __ movprfx(z6, z7);
    __ sri(z6.VnB(), z9.VnB(), 1);

    __ movprfx(z2, z3);
    __ sshllb(z2.VnH(), z20.VnB(), 0);

    __ movprfx(z27, z28);
    __ sshllt(z27.VnH(), z8.VnB(), 0);

    __ movprfx(z4, z5);
    __ ssublb(z4.VnD(), z23.VnS(), z7.VnS());

    __ movprfx(z6, z7);
    __ ssublbt(z6.VnD(), z28.VnS(), z12.VnS());

    __ movprfx(z12, z13);
    __ ssublt(z12.VnD(), z13.VnS(), z6.VnS());

    __ movprfx(z11, z12);
    __ ssubltb(z11.VnD(), z18.VnS(), z19.VnS());

    __ movprfx(z7, z8);
    __ ssubwb(z7.VnD(), z28.VnD(), z11.VnS());

    __ movprfx(z29, z30);
    __ ssubwt(z29.VnD(), z25.VnD(), z20.VnS());

    __ movprfx(z21, z22);
    __ stnt1b(z21.VnS(), p5.Zeroing(), SVEMemOperand(z1.VnS(), x23));

    __ movprfx(z21, z22);
    __ stnt1b(z21.VnD(), p5.Zeroing(), SVEMemOperand(z1.VnD(), x23));

    __ movprfx(z10, z11);
    __ stnt1d(z10.VnD(), p0.Zeroing(), SVEMemOperand(z1.VnD(), x23));

    __ movprfx(z30, z31);
    __ stnt1h(z30.VnS(), p4.Zeroing(), SVEMemOperand(z6.VnS(), x6));

    __ movprfx(z30, z31);
    __ stnt1h(z30.VnD(), p4.Zeroing(), SVEMemOperand(z6.VnD(), x6));

    __ movprfx(z0, z1);
    __ stnt1w(z0.VnS(), p4.Zeroing(), SVEMemOperand(z11.VnS(), x1));

    __ movprfx(z0, z1);
    __ stnt1w(z0.VnD(), p4.Zeroing(), SVEMemOperand(z11.VnD(), x1));

    __ movprfx(z31, z0);
    __ subhnb(z31.VnS(), z31.VnD(), z7.VnD());

    __ movprfx(z31, z0);
    __ subhnt(z31.VnS(), z22.VnD(), z27.VnD());

    __ movprfx(z24, z25);
    __ tbl(z24.VnB(), z29.VnB(), z30.VnB(), z0.VnB());

    __ movprfx(z22, z23);
    __ tbx(z22.VnB(), z15.VnB(), z19.VnB());

    __ movprfx(z1, z2);
    __ uabdlb(z1.VnD(), z26.VnS(), z12.VnS());

    __ movprfx(z25, z26);
    __ uabdlt(z25.VnD(), z29.VnS(), z14.VnS());

    __ movprfx(z3, z4);
    __ uaddlb(z3.VnD(), z5.VnS(), z2.VnS());

    __ movprfx(z15, z16);
    __ uaddlt(z15.VnD(), z28.VnS(), z20.VnS());

    __ movprfx(z31, z0);
    __ uaddwb(z31.VnD(), z8.VnD(), z25.VnS());

    __ movprfx(z17, z18);
    __ uaddwt(z17.VnD(), z15.VnD(), z2.VnS());

    __ movprfx(z12, z13);
    __ umulh(z12.VnB(), z12.VnB(), z17.VnB());

    __ movprfx(z12, z13);
    __ umullb(z12.VnD(), z5.VnS(), z2.VnS());

    __ movprfx(z12, z13);
    __ umullb(z12.VnS(), z5.VnH(), z2.VnH(), 0);

    __ movprfx(z12, z13);
    __ umullb(z12.VnD(), z5.VnS(), z2.VnS(), 0);

    __ movprfx(z24, z25);
    __ umullt(z24.VnD(), z6.VnS(), z6.VnS());

    __ movprfx(z24, z25);
    __ umullt(z24.VnS(), z6.VnH(), z1.VnH(), 0);

    __ movprfx(z24, z25);
    __ umullt(z24.VnD(), z6.VnS(), z1.VnS(), 0);

    __ movprfx(z30, z31);
    __ uqrshrnb(z30.VnB(), z25.VnH(), 1);

    __ movprfx(z3, z4);
    __ uqrshrnt(z3.VnB(), z25.VnH(), 8);

    __ movprfx(z17, z18);
    __ uqshrnb(z17.VnB(), z4.VnH(), 1);

    __ movprfx(z28, z29);
    __ uqshrnt(z28.VnB(), z18.VnH(), 8);

    __ movprfx(z28, z29);
    __ uqxtnb(z28.VnB(), z4.VnH());

    __ movprfx(z19, z20);
    __ uqxtnt(z19.VnB(), z7.VnH());

    __ movprfx(z8, z9);
    __ ushllb(z8.VnH(), z31.VnB(), 0);

    __ movprfx(z3, z4);
    __ ushllt(z3.VnH(), z21.VnB(), 0);

    __ movprfx(z25, z26);
    __ usublb(z25.VnD(), z9.VnS(), z17.VnS());

    __ movprfx(z5, z6);
    __ usublt(z5.VnD(), z11.VnS(), z15.VnS());

    __ movprfx(z10, z11);
    __ usubwb(z10.VnD(), z13.VnD(), z20.VnS());

    __ movprfx(z15, z16);
    __ usubwt(z15.VnD(), z8.VnD(), z23.VnS());

    __ movprfx(z20, z21);
    __ whilege(p0.VnB(), w20, w29);

    __ movprfx(z24, z25);
    __ whilegt(p11.VnB(), w24, w3);

    __ movprfx(z20, z21);
    __ whilehi(p2.VnB(), x20, x8);

    __ movprfx(z22, z23);
    __ whilehs(p4.VnB(), w22, w9);

    __ movprfx(z25, z26);
    __ whilerw(p7.VnB(), x25, x27);

    __ movprfx(z14, z15);
    __ whilewr(p8.VnB(), x14, x14);
  }
  assm.FinalizeCode();

  // `false`: every movprfx/instruction pair above must be rejected.
  CheckAndMaybeDisassembleMovprfxPairs(assm.GetBuffer(), false);
}
2818
TEST(movprfx_negative_predication_sve2)2819 TEST(movprfx_negative_predication_sve2) {
2820 Assembler assm;
2821 assm.GetCPUFeatures()->Combine(CPUFeatures::kSVE, CPUFeatures::kSVE2);
2822 {
2823 // We have to use the Assembler directly to generate movprfx, so we need
2824 // to manually reserve space for the code we're about to emit.
2825 static const size_t kPairCount = 140;
2826 CodeBufferCheckScope guard(&assm, kPairCount * 2 * kInstructionSize);
2827
2828 __ movprfx(z25.VnS(), p0.Zeroing(), z26.VnS());
2829 __ adclb(z25.VnS(), z17.VnS(), z24.VnS());
2830
2831 __ movprfx(z0.VnS(), p0.Zeroing(), z1.VnS());
2832 __ adclt(z0.VnS(), z2.VnS(), z15.VnS());
2833
2834 __ movprfx(z6.VnD(), p0.Zeroing(), z7.VnD());
2835 __ bcax(z6.VnD(), z6.VnD(), z12.VnD(), z1.VnD());
2836
2837 __ movprfx(z18.VnD(), p0.Zeroing(), z19.VnD());
2838 __ bsl1n(z18.VnD(), z18.VnD(), z8.VnD(), z7.VnD());
2839
2840 __ movprfx(z7.VnD(), p0.Zeroing(), z8.VnD());
2841 __ bsl2n(z7.VnD(), z7.VnD(), z3.VnD(), z19.VnD());
2842
2843 __ movprfx(z21.VnD(), p0.Zeroing(), z22.VnD());
2844 __ bsl(z21.VnD(), z21.VnD(), z2.VnD(), z2.VnD());
2845
2846 __ movprfx(z5.VnB(), p0.Zeroing(), z6.VnB());
2847 __ cadd(z5.VnB(), z5.VnB(), z12.VnB(), 90);
2848
2849 __ movprfx(z7.VnS(), p0.Zeroing(), z8.VnS());
2850 __ cdot(z7.VnS(), z4.VnB(), z10.VnB(), 0);
2851
2852 __ movprfx(z7.VnS(), p0.Zeroing(), z8.VnS());
2853 __ cdot(z7.VnS(), z4.VnB(), z0.VnB(), 0, 0);
2854
2855 __ movprfx(z7.VnD(), p0.Zeroing(), z8.VnD());
2856 __ cdot(z7.VnD(), z4.VnH(), z0.VnH(), 0, 0);
2857
2858 __ movprfx(z19.VnB(), p0.Zeroing(), z20.VnB());
2859 __ cmla(z19.VnB(), z7.VnB(), z2.VnB(), 0);
2860
2861 __ movprfx(z19.VnS(), p0.Zeroing(), z20.VnS());
2862 __ cmla(z19.VnS(), z7.VnS(), z2.VnS(), 0, 0);
2863
2864 __ movprfx(z19.VnH(), p0.Zeroing(), z20.VnH());
2865 __ cmla(z19.VnH(), z7.VnH(), z2.VnH(), 0, 0);
2866
2867 __ movprfx(z10.VnD(), p0.Zeroing(), z11.VnD());
2868 __ eor3(z10.VnD(), z10.VnD(), z24.VnD(), z23.VnD());
2869
2870 __ movprfx(z3.VnB(), p0.Zeroing(), z4.VnB());
2871 __ eorbt(z3.VnB(), z10.VnB(), z8.VnB());
2872
2873 __ movprfx(z20.VnB(), p0.Zeroing(), z22.VnB());
2874 __ eortb(z20.VnB(), z21.VnB(), z15.VnB());
2875
2876 __ movprfx(z14.VnD(), p0.Zeroing(), z15.VnD());
2877 __ faddp(z14.VnD(), p1.Merging(), z14.VnD(), z26.VnD());
2878
2879 __ movprfx(z2.VnD(), p0.Zeroing(), z3.VnD());
2880 __ fmaxnmp(z2.VnD(), p1.Merging(), z2.VnD(), z14.VnD());
2881
2882 __ movprfx(z22.VnD(), p0.Zeroing(), z23.VnD());
2883 __ fmaxp(z22.VnD(), p1.Merging(), z22.VnD(), z3.VnD());
2884
2885 __ movprfx(z1.VnD(), p0.Zeroing(), z2.VnD());
2886 __ fminnmp(z1.VnD(), p0.Merging(), z1.VnD(), z14.VnD());
2887
2888 __ movprfx(z16.VnD(), p0.Zeroing(), z17.VnD());
2889 __ fminp(z16.VnD(), p3.Merging(), z16.VnD(), z11.VnD());
2890
2891 __ movprfx(z16.VnS(), p0.Zeroing(), z17.VnS());
2892 __ fmlalb(z16.VnS(), z18.VnH(), z29.VnH());
2893
2894 __ movprfx(z16.VnS(), p0.Zeroing(), z17.VnS());
2895 __ fmlalb(z16.VnS(), z18.VnH(), z2.VnH(), 0);
2896
2897 __ movprfx(z18.VnS(), p0.Zeroing(), z19.VnS());
2898 __ fmlalt(z18.VnS(), z13.VnH(), z5.VnH());
2899
2900 __ movprfx(z18.VnS(), p0.Zeroing(), z19.VnS());
2901 __ fmlalt(z18.VnS(), z13.VnH(), z5.VnH(), 0);
2902
2903 __ movprfx(z16.VnS(), p0.Zeroing(), z17.VnS());
2904 __ fmlslb(z16.VnS(), z10.VnH(), z1.VnH());
2905
2906 __ movprfx(z16.VnS(), p0.Zeroing(), z17.VnS());
2907 __ fmlslb(z16.VnS(), z10.VnH(), z1.VnH(), 0);
2908
2909 __ movprfx(z3.VnS(), p0.Zeroing(), z4.VnS());
2910 __ fmlslt(z3.VnS(), z17.VnH(), z14.VnH());
2911
2912 __ movprfx(z3.VnS(), p0.Zeroing(), z4.VnS());
2913 __ fmlslt(z3.VnS(), z17.VnH(), z1.VnH(), 0);
2914
2915 __ movprfx(z2.VnH(), p0.Zeroing(), z3.VnH());
2916 __ mla(z2.VnH(), z0.VnH(), z1.VnH(), 0);
2917
2918 __ movprfx(z2.VnS(), p0.Zeroing(), z3.VnS());
2919 __ mla(z2.VnS(), z0.VnS(), z1.VnS(), 0);
2920
2921 __ movprfx(z2.VnD(), p0.Zeroing(), z3.VnD());
2922 __ mla(z2.VnD(), z0.VnD(), z1.VnD(), 0);
2923
2924 __ movprfx(z2.VnH(), p0.Zeroing(), z3.VnH());
2925 __ mls(z2.VnH(), z0.VnH(), z1.VnH(), 0);
2926
2927 __ movprfx(z2.VnS(), p0.Zeroing(), z3.VnS());
2928 __ mls(z2.VnS(), z0.VnS(), z1.VnS(), 0);
2929
2930 __ movprfx(z2.VnD(), p0.Zeroing(), z3.VnD());
2931 __ mls(z2.VnD(), z0.VnD(), z1.VnD(), 0);
2932
2933 __ movprfx(z17.VnD(), p0.Zeroing(), z18.VnD());
2934 __ nbsl(z17.VnD(), z17.VnD(), z21.VnD(), z27.VnD());
2935
2936 __ movprfx(z13.VnB(), p0.Zeroing(), z14.VnB());
2937 __ saba(z13.VnB(), z2.VnB(), z31.VnB());
2938
2939 __ movprfx(z13.VnD(), p0.Zeroing(), z14.VnD());
2940 __ sabalb(z13.VnD(), z20.VnS(), z26.VnS());
2941
2942 __ movprfx(z14.VnD(), p0.Zeroing(), z15.VnD());
2943 __ sabalt(z14.VnD(), z19.VnS(), z10.VnS());
2944
2945 __ movprfx(z17.VnS(), p0.Zeroing(), z18.VnS());
2946 __ sbclb(z17.VnS(), z10.VnS(), z8.VnS());
2947
2948 __ movprfx(z20.VnS(), p0.Zeroing(), z21.VnS());
2949 __ sbclt(z20.VnS(), z0.VnS(), z13.VnS());
2950
2951 __ movprfx(z5.VnB(), p0.Zeroing(), z6.VnB());
2952 __ smaxp(z5.VnB(), p4.Merging(), z5.VnB(), z10.VnB());
2953
2954 __ movprfx(z27.VnB(), p0.Zeroing(), z28.VnB());
2955 __ sminp(z27.VnB(), p3.Merging(), z27.VnB(), z1.VnB());
2956
2957 __ movprfx(z1.VnD(), p0.Zeroing(), z2.VnD());
2958 __ smlalb(z1.VnD(), z3.VnS(), z23.VnS());
2959
2960 __ movprfx(z1.VnD(), p0.Zeroing(), z2.VnD());
2961 __ smlalb(z1.VnD(), z3.VnS(), z2.VnS(), 0);
2962
2963 __ movprfx(z1.VnS(), p0.Zeroing(), z2.VnS());
2964 __ smlalb(z1.VnS(), z3.VnH(), z2.VnH(), 0);
2965
2966 __ movprfx(z1.VnD(), p0.Zeroing(), z2.VnD());
2967 __ smlalt(z1.VnD(), z3.VnS(), z23.VnS());
2968
2969 __ movprfx(z1.VnD(), p0.Zeroing(), z2.VnD());
2970 __ smlalt(z1.VnD(), z3.VnS(), z2.VnS(), 0);
2971
2972 __ movprfx(z1.VnS(), p0.Zeroing(), z2.VnS());
2973 __ smlalt(z1.VnS(), z3.VnH(), z2.VnH(), 0);
2974
2975 __ movprfx(z1.VnD(), p0.Zeroing(), z2.VnD());
2976 __ smlslb(z1.VnD(), z3.VnS(), z23.VnS());
2977
2978 __ movprfx(z1.VnD(), p0.Zeroing(), z2.VnD());
2979 __ smlslb(z1.VnD(), z3.VnS(), z2.VnS(), 0);
2980
2981 __ movprfx(z1.VnS(), p0.Zeroing(), z2.VnS());
2982 __ smlslb(z1.VnS(), z3.VnH(), z2.VnH(), 0);
2983
2984 __ movprfx(z1.VnD(), p0.Zeroing(), z2.VnD());
2985 __ smlslt(z1.VnD(), z3.VnS(), z23.VnS());
2986
2987 __ movprfx(z1.VnD(), p0.Zeroing(), z2.VnD());
2988 __ smlslt(z1.VnD(), z3.VnS(), z2.VnS(), 0);
2989
2990 __ movprfx(z1.VnS(), p0.Zeroing(), z2.VnS());
2991 __ smlslt(z1.VnS(), z3.VnH(), z2.VnH(), 0);
2992
2993 __ movprfx(z20.VnB(), p0.Zeroing(), z21.VnB());
2994 __ sqcadd(z20.VnB(), z20.VnB(), z23.VnB(), 90);
2995
2996 __ movprfx(z6.VnD(), p0.Zeroing(), z7.VnD());
2997 __ sqdmlalb(z6.VnD(), z19.VnS(), z25.VnS());
2998
2999 __ movprfx(z6.VnD(), p0.Zeroing(), z7.VnD());
3000 __ sqdmlalb(z6.VnD(), z19.VnS(), z2.VnS(), 0);
3001
3002 __ movprfx(z6.VnS(), p0.Zeroing(), z7.VnS());
3003 __ sqdmlalb(z6.VnS(), z19.VnH(), z2.VnH(), 0);
3004
3005 __ movprfx(z23.VnD(), p0.Zeroing(), z24.VnD());
3006 __ sqdmlalbt(z23.VnD(), z29.VnS(), z26.VnS());
3007
3008 __ movprfx(z11.VnD(), p0.Zeroing(), z12.VnD());
3009 __ sqdmlalt(z11.VnD(), z0.VnS(), z0.VnS());
3010
3011 __ movprfx(z11.VnD(), p0.Zeroing(), z12.VnD());
3012 __ sqdmlalt(z11.VnD(), z0.VnS(), z0.VnS(), 0);
3013
3014 __ movprfx(z11.VnS(), p0.Zeroing(), z12.VnS());
3015 __ sqdmlalt(z11.VnS(), z0.VnH(), z0.VnH(), 0);
3016
3017 __ movprfx(z16.VnD(), p0.Zeroing(), z17.VnD());
3018 __ sqdmlslb(z16.VnD(), z26.VnS(), z25.VnS());
3019
3020 __ movprfx(z16.VnD(), p0.Zeroing(), z17.VnD());
3021 __ sqdmlslb(z16.VnD(), z26.VnS(), z2.VnS(), 0);
3022
3023 __ movprfx(z16.VnS(), p0.Zeroing(), z17.VnS());
3024 __ sqdmlslb(z16.VnS(), z26.VnH(), z2.VnH(), 0);
3025
3026 __ movprfx(z26.VnD(), p0.Zeroing(), z27.VnD());
3027 __ sqdmlslbt(z26.VnD(), z23.VnS(), z4.VnS());
3028
3029 __ movprfx(z21.VnD(), p0.Zeroing(), z22.VnD());
3030 __ sqdmlslt(z21.VnD(), z23.VnS(), z9.VnS());
3031
3032 __ movprfx(z21.VnD(), p0.Zeroing(), z22.VnD());
3033 __ sqdmlslt(z21.VnD(), z23.VnS(), z0.VnS(), 0);
3034
3035 __ movprfx(z21.VnS(), p0.Zeroing(), z22.VnS());
3036 __ sqdmlslt(z21.VnS(), z23.VnH(), z0.VnH(), 0);
3037
3038 __ movprfx(z31.VnB(), p0.Zeroing(), z0.VnB());
3039 __ sqrdcmlah(z31.VnB(), z15.VnB(), z20.VnB(), 0);
3040
3041 __ movprfx(z31.VnH(), p0.Zeroing(), z0.VnH());
3042 __ sqrdcmlah(z31.VnH(), z15.VnH(), z2.VnH(), 0, 0);
3043
3044 __ movprfx(z31.VnS(), p0.Zeroing(), z0.VnS());
3045 __ sqrdcmlah(z31.VnS(), z15.VnS(), z2.VnS(), 0, 0);
3046
3047 __ movprfx(z27.VnB(), p0.Zeroing(), z28.VnB());
3048 __ sqrdmlah(z27.VnB(), z28.VnB(), z19.VnB());
3049
3050 __ movprfx(z27.VnH(), p0.Zeroing(), z28.VnH());
3051 __ sqrdmlah(z27.VnH(), z28.VnH(), z1.VnH(), 0);
3052
3053 __ movprfx(z27.VnS(), p0.Zeroing(), z28.VnS());
3054 __ sqrdmlah(z27.VnS(), z28.VnS(), z1.VnS(), 0);
3055
3056 __ movprfx(z27.VnD(), p0.Zeroing(), z28.VnD());
3057 __ sqrdmlah(z27.VnD(), z28.VnD(), z1.VnD(), 0);
3058
3059 __ movprfx(z11.VnB(), p0.Zeroing(), z12.VnB());
3060 __ sqrdmlsh(z11.VnB(), z16.VnB(), z31.VnB());
3061
3062 __ movprfx(z11.VnH(), p0.Zeroing(), z12.VnH());
3063 __ sqrdmlsh(z11.VnH(), z16.VnH(), z1.VnH(), 0);
3064
3065 __ movprfx(z11.VnS(), p0.Zeroing(), z12.VnS());
3066 __ sqrdmlsh(z11.VnS(), z16.VnS(), z1.VnS(), 0);
3067
3068 __ movprfx(z11.VnD(), p0.Zeroing(), z12.VnD());
3069 __ sqrdmlsh(z11.VnD(), z16.VnD(), z1.VnD(), 0);
3070
3071 __ movprfx(z0.VnB(), p0.Zeroing(), z1.VnB());
3072 __ srsra(z0.VnB(), z8.VnB(), 1);
3073
3074 __ movprfx(z0.VnB(), p0.Zeroing(), z1.VnB());
3075 __ ssra(z0.VnB(), z8.VnB(), 1);
3076
3077 __ movprfx(z23.VnB(), p0.Zeroing(), z24.VnB());
3078 __ uaba(z23.VnB(), z22.VnB(), z20.VnB());
3079
3080 __ movprfx(z11.VnD(), p0.Zeroing(), z12.VnD());
3081 __ uabalb(z11.VnD(), z25.VnS(), z12.VnS());
3082
3083 __ movprfx(z4.VnD(), p0.Zeroing(), z5.VnD());
3084 __ uabalt(z4.VnD(), z2.VnS(), z31.VnS());
3085
3086 __ movprfx(z7.VnB(), p0.Zeroing(), z8.VnB());
3087 __ umaxp(z7.VnB(), p2.Merging(), z7.VnB(), z23.VnB());
3088
3089 __ movprfx(z10.VnB(), p0.Zeroing(), z11.VnB());
3090 __ uminp(z10.VnB(), p0.Merging(), z10.VnB(), z22.VnB());
3091
3092 __ movprfx(z31.VnD(), p0.Zeroing(), z0.VnD());
3093 __ umlalb(z31.VnD(), z9.VnS(), z21.VnS());
3094
3095 __ movprfx(z31.VnD(), p0.Zeroing(), z0.VnD());
3096 __ umlalb(z31.VnD(), z9.VnS(), z1.VnS(), 0);
3097
3098 __ movprfx(z31.VnS(), p0.Zeroing(), z0.VnS());
3099 __ umlalb(z31.VnS(), z9.VnH(), z1.VnH(), 0);
3100
3101 __ movprfx(z11.VnD(), p0.Zeroing(), z12.VnD());
3102 __ umlalt(z11.VnD(), z5.VnS(), z22.VnS());
3103
3104 __ movprfx(z11.VnD(), p0.Zeroing(), z12.VnD());
3105 __ umlalt(z11.VnD(), z5.VnS(), z2.VnS(), 0);
3106
3107 __ movprfx(z11.VnS(), p0.Zeroing(), z12.VnS());
3108 __ umlalt(z11.VnS(), z5.VnH(), z2.VnH(), 0);
3109
3110 __ movprfx(z28.VnD(), p0.Zeroing(), z29.VnD());
3111 __ umlslb(z28.VnD(), z13.VnS(), z9.VnS());
3112
3113 __ movprfx(z28.VnD(), p0.Zeroing(), z29.VnD());
3114 __ umlslb(z28.VnD(), z13.VnS(), z1.VnS(), 0);
3115
3116 __ movprfx(z28.VnS(), p0.Zeroing(), z29.VnS());
3117 __ umlslb(z28.VnS(), z13.VnH(), z1.VnH(), 0);
3118
3119 __ movprfx(z9.VnD(), p0.Zeroing(), z10.VnD());
3120 __ umlslt(z9.VnD(), z12.VnS(), z30.VnS());
3121
3122 __ movprfx(z9.VnD(), p0.Zeroing(), z10.VnD());
3123 __ umlslt(z9.VnD(), z12.VnS(), z0.VnS(), 0);
3124
3125 __ movprfx(z9.VnS(), p0.Zeroing(), z10.VnS());
3126 __ umlslt(z9.VnS(), z12.VnH(), z0.VnH(), 0);
3127
3128 __ movprfx(z0.VnB(), p0.Zeroing(), z1.VnB());
3129 __ ursra(z0.VnB(), z8.VnB(), 1);
3130
3131 __ movprfx(z0.VnB(), p0.Zeroing(), z1.VnB());
3132 __ usra(z0.VnB(), z8.VnB(), 1);
3133
3134 __ movprfx(z16.VnB(), p0.Zeroing(), z17.VnB());
3135 __ xar(z16.VnB(), z16.VnB(), z13.VnB(), 1);
3136 }
3137 assm.FinalizeCode();
3138
3139 CheckAndMaybeDisassembleMovprfxPairs(assm.GetBuffer(), false);
3140 }
3141
// Check that movprfx + SVE2 instruction pairs in which the prefixed
// destination register also appears as a (non-destructive) source operand are
// rejected. The architecture forbids the instruction following a movprfx from
// reading the register that movprfx wrote, so none of these pairs may be
// treated as a valid movprfx combination. The final `false` argument to
// CheckAndMaybeDisassembleMovprfxPairs asserts exactly that.
TEST(movprfx_negative_aliasing_sve2) {
  Assembler assm;
  assm.GetCPUFeatures()->Combine(CPUFeatures::kSVE, CPUFeatures::kSVE2);
  {
    // We have to use the Assembler directly to generate movprfx, so we need
    // to manually reserve space for the code we're about to emit.
    static const size_t kPairCount = 140;
    CodeBufferCheckScope guard(&assm, kPairCount * 2 * kInstructionSize);

    __ movprfx(z25, z26);
    __ adclb(z25.VnS(), z17.VnS(), z25.VnS());

    __ movprfx(z0, z1);
    __ adclt(z0.VnS(), z2.VnS(), z0.VnS());

    __ movprfx(z3, z4);
    __ addp(z3.VnB(), p1.Merging(), z3.VnB(), z3.VnB());

    __ movprfx(z6, z7);
    __ bcax(z6.VnD(), z6.VnD(), z12.VnD(), z6.VnD());

    __ movprfx(z18, z19);
    __ bsl1n(z18.VnD(), z18.VnD(), z8.VnD(), z18.VnD());

    __ movprfx(z7, z8);
    __ bsl2n(z7.VnD(), z7.VnD(), z3.VnD(), z7.VnD());

    __ movprfx(z21, z22);
    __ bsl(z21.VnD(), z21.VnD(), z2.VnD(), z21.VnD());

    __ movprfx(z5, z6);
    __ cadd(z5.VnB(), z5.VnB(), z5.VnB(), 90);

    __ movprfx(z7, z8);
    __ cdot(z7.VnS(), z4.VnB(), z7.VnB(), 0);

    __ movprfx(z7, z8);
    __ cdot(z7.VnS(), z4.VnB(), z7.VnB(), 0, 0);

    __ movprfx(z7, z8);
    __ cdot(z7.VnD(), z7.VnH(), z0.VnH(), 0, 0);

    __ movprfx(z19, z20);
    __ cmla(z19.VnB(), z19.VnB(), z2.VnB(), 0);

    __ movprfx(z19, z20);
    __ cmla(z19.VnS(), z19.VnS(), z2.VnS(), 0, 0);

    __ movprfx(z1, z20);
    __ cmla(z1.VnH(), z7.VnH(), z1.VnH(), 0, 0);

    __ movprfx(z10, z11);
    __ eor3(z10.VnD(), z10.VnD(), z10.VnD(), z23.VnD());

    __ movprfx(z3, z4);
    __ eorbt(z3.VnB(), z10.VnB(), z3.VnB());

    __ movprfx(z20, z22);
    __ eortb(z20.VnB(), z21.VnB(), z20.VnB());

    __ movprfx(z14, z15);
    __ faddp(z14.VnD(), p1.Merging(), z14.VnD(), z14.VnD());

    __ movprfx(z14.VnD(), p4.Merging(), z15.VnD());
    __ fcvtx(z14.VnS(), p4.Merging(), z14.VnD());

    __ movprfx(z15.VnH(), p0.Merging(), z16.VnH());
    __ flogb(z15.VnH(), p0.Merging(), z15.VnH());

    __ movprfx(z2, z3);
    __ fmaxnmp(z2.VnD(), p1.Merging(), z2.VnD(), z2.VnD());

    __ movprfx(z22, z23);
    __ fmaxp(z22.VnD(), p1.Merging(), z22.VnD(), z22.VnD());

    __ movprfx(z1, z2);
    __ fminnmp(z1.VnD(), p0.Merging(), z1.VnD(), z1.VnD());

    __ movprfx(z16, z17);
    __ fminp(z16.VnD(), p3.Merging(), z16.VnD(), z16.VnD());

    __ movprfx(z16, z17);
    __ fmlalb(z16.VnS(), z18.VnH(), z16.VnH());

    __ movprfx(z16, z17);
    __ fmlalb(z16.VnS(), z16.VnH(), z2.VnH(), 0);

    __ movprfx(z18, z19);
    __ fmlalt(z18.VnS(), z13.VnH(), z18.VnH());

    __ movprfx(z18, z19);
    __ fmlalt(z18.VnS(), z18.VnH(), z5.VnH(), 0);

    __ movprfx(z16, z17);
    __ fmlslb(z16.VnS(), z16.VnH(), z1.VnH());

    __ movprfx(z16, z17);
    __ fmlslb(z16.VnS(), z16.VnH(), z1.VnH(), 0);

    __ movprfx(z3, z4);
    __ fmlslt(z3.VnS(), z17.VnH(), z3.VnH());

    __ movprfx(z3, z4);
    __ fmlslt(z3.VnS(), z17.VnH(), z3.VnH(), 0);

    __ movprfx(z2, z3);
    __ mla(z2.VnH(), z0.VnH(), z2.VnH(), 0);

    __ movprfx(z2, z3);
    __ mla(z2.VnS(), z0.VnS(), z2.VnS(), 0);

    __ movprfx(z2, z3);
    __ mla(z2.VnD(), z0.VnD(), z2.VnD(), 0);

    __ movprfx(z2, z3);
    __ mls(z2.VnH(), z0.VnH(), z2.VnH(), 0);

    __ movprfx(z2, z3);
    __ mls(z2.VnS(), z0.VnS(), z2.VnS(), 0);

    __ movprfx(z2, z3);
    __ mls(z2.VnD(), z0.VnD(), z2.VnD(), 0);

    __ movprfx(z17, z18);
    __ nbsl(z17.VnD(), z17.VnD(), z21.VnD(), z17.VnD());

    __ movprfx(z13, z14);
    __ saba(z13.VnB(), z2.VnB(), z13.VnB());

    __ movprfx(z13, z14);
    __ sabalb(z13.VnD(), z13.VnS(), z26.VnS());

    __ movprfx(z14, z15);
    __ sabalt(z14.VnD(), z14.VnS(), z10.VnS());

    __ movprfx(z19.VnD(), p5.Merging(), z20.VnD());
    __ sadalp(z19.VnD(), p5.Merging(), z19.VnS());

    __ movprfx(z17, z18);
    __ sbclb(z17.VnS(), z17.VnS(), z8.VnS());

    __ movprfx(z20, z21);
    __ sbclt(z20.VnS(), z20.VnS(), z13.VnS());

    __ movprfx(z20.VnB(), p3.Merging(), z21.VnB());
    __ shadd(z20.VnB(), p3.Merging(), z20.VnB(), z20.VnB());

    __ movprfx(z21.VnB(), p0.Merging(), z22.VnB());
    __ shsub(z21.VnB(), p0.Merging(), z21.VnB(), z21.VnB());

    __ movprfx(z1.VnB(), p0.Merging(), z2.VnB());
    __ shsubr(z1.VnB(), p0.Merging(), z1.VnB(), z1.VnB());

    __ movprfx(z5, z6);
    __ smaxp(z5.VnB(), p4.Merging(), z5.VnB(), z5.VnB());

    __ movprfx(z27, z28);
    __ sminp(z27.VnB(), p3.Merging(), z27.VnB(), z27.VnB());

    __ movprfx(z1, z2);
    __ smlalb(z1.VnD(), z3.VnS(), z1.VnS());

    __ movprfx(z1, z2);
    __ smlalb(z1.VnD(), z3.VnS(), z1.VnS(), 0);

    __ movprfx(z1, z2);
    __ smlalb(z1.VnS(), z1.VnH(), z2.VnH(), 0);

    __ movprfx(z1, z2);
    __ smlalt(z1.VnD(), z1.VnS(), z23.VnS());

    __ movprfx(z1, z2);
    __ smlalt(z1.VnD(), z3.VnS(), z1.VnS(), 0);

    __ movprfx(z1, z2);
    __ smlalt(z1.VnS(), z1.VnH(), z2.VnH(), 0);

    __ movprfx(z1, z2);
    __ smlslb(z1.VnD(), z1.VnS(), z23.VnS());

    __ movprfx(z1, z2);
    __ smlslb(z1.VnD(), z3.VnS(), z1.VnS(), 0);

    __ movprfx(z1, z2);
    __ smlslb(z1.VnS(), z3.VnH(), z1.VnH(), 0);

    __ movprfx(z1, z2);
    __ smlslt(z1.VnD(), z1.VnS(), z23.VnS());

    __ movprfx(z1, z2);
    __ smlslt(z1.VnD(), z3.VnS(), z1.VnS(), 0);

    __ movprfx(z1, z2);
    __ smlslt(z1.VnS(), z1.VnH(), z2.VnH(), 0);

    __ movprfx(z29.VnB(), p1.Merging(), z30.VnB());
    __ sqabs(z29.VnB(), p1.Merging(), z29.VnB());

    __ movprfx(z28.VnB(), p0.Merging(), z29.VnB());
    __ sqadd(z28.VnB(), p0.Merging(), z28.VnB(), z28.VnB());

    __ movprfx(z20, z21);
    __ sqcadd(z20.VnB(), z20.VnB(), z20.VnB(), 90);

    __ movprfx(z6, z7);
    __ sqdmlalb(z6.VnD(), z6.VnS(), z25.VnS());

    __ movprfx(z6, z7);
    __ sqdmlalb(z6.VnD(), z6.VnS(), z2.VnS(), 0);

    __ movprfx(z6, z7);
    __ sqdmlalb(z6.VnS(), z6.VnH(), z2.VnH(), 0);

    __ movprfx(z23, z24);
    __ sqdmlalbt(z23.VnD(), z23.VnS(), z26.VnS());

    __ movprfx(z11, z12);
    __ sqdmlalt(z11.VnD(), z11.VnS(), z0.VnS());

    __ movprfx(z11, z12);
    __ sqdmlalt(z11.VnD(), z11.VnS(), z0.VnS(), 0);

    __ movprfx(z1, z12);
    __ sqdmlalt(z1.VnS(), z0.VnH(), z1.VnH(), 0);

    __ movprfx(z16, z17);
    __ sqdmlslb(z16.VnD(), z26.VnS(), z16.VnS());

    __ movprfx(z16, z17);
    __ sqdmlslb(z16.VnD(), z16.VnS(), z2.VnS(), 0);

    __ movprfx(z16, z17);
    __ sqdmlslb(z16.VnS(), z16.VnH(), z2.VnH(), 0);

    __ movprfx(z26, z27);
    __ sqdmlslbt(z26.VnD(), z26.VnS(), z4.VnS());

    __ movprfx(z21, z22);
    __ sqdmlslt(z21.VnD(), z23.VnS(), z21.VnS());

    __ movprfx(z21, z22);
    __ sqdmlslt(z21.VnD(), z21.VnS(), z0.VnS(), 0);

    __ movprfx(z1, z22);
    __ sqdmlslt(z21.VnS(), z23.VnH(), z1.VnH(), 0);

    __ movprfx(z21.VnB(), p0.Merging(), z22.VnB());
    __ sqneg(z21.VnB(), p0.Merging(), z21.VnB());

    __ movprfx(z31, z0);
    __ sqrdcmlah(z31.VnB(), z15.VnB(), z31.VnB(), 0);

    __ movprfx(z31, z0);
    __ sqrdcmlah(z31.VnH(), z31.VnH(), z2.VnH(), 0, 0);

    __ movprfx(z31, z0);
    __ sqrdcmlah(z31.VnS(), z31.VnS(), z2.VnS(), 0, 0);

    __ movprfx(z27, z28);
    __ sqrdmlah(z27.VnB(), z27.VnB(), z19.VnB());

    __ movprfx(z27, z28);
    __ sqrdmlah(z27.VnH(), z27.VnH(), z1.VnH(), 0);

    __ movprfx(z27, z28);
    __ sqrdmlah(z27.VnS(), z27.VnS(), z1.VnS(), 0);

    __ movprfx(z27, z28);
    __ sqrdmlah(z27.VnD(), z27.VnD(), z1.VnD(), 0);

    __ movprfx(z11, z12);
    __ sqrdmlsh(z11.VnB(), z16.VnB(), z11.VnB());

    __ movprfx(z11, z12);
    __ sqrdmlsh(z11.VnH(), z11.VnH(), z1.VnH(), 0);

    __ movprfx(z11, z12);
    __ sqrdmlsh(z11.VnS(), z11.VnS(), z1.VnS(), 0);

    __ movprfx(z11, z12);
    __ sqrdmlsh(z11.VnD(), z11.VnD(), z1.VnD(), 0);

    __ movprfx(z31.VnB(), p5.Merging(), z0.VnB());
    __ sqrshl(z31.VnB(), p5.Merging(), z31.VnB(), z31.VnB());

    __ movprfx(z25.VnB(), p6.Merging(), z26.VnB());
    __ sqrshlr(z25.VnB(), p6.Merging(), z25.VnB(), z25.VnB());

    __ movprfx(z0.VnB(), p5.Merging(), z1.VnB());
    __ sqshl(z0.VnB(), p5.Merging(), z0.VnB(), z0.VnB());

    __ movprfx(z7.VnB(), p3.Merging(), z8.VnB());
    __ sqshlr(z7.VnB(), p3.Merging(), z7.VnB(), z7.VnB());

    __ movprfx(z16.VnB(), p7.Merging(), z17.VnB());
    __ sqsub(z16.VnB(), p7.Merging(), z16.VnB(), z16.VnB());

    __ movprfx(z16.VnB(), p7.Merging(), z17.VnB());
    __ sqsubr(z16.VnB(), p7.Merging(), z16.VnB(), z16.VnB());

    __ movprfx(z23.VnB(), p4.Merging(), z24.VnB());
    __ srhadd(z23.VnB(), p4.Merging(), z23.VnB(), z23.VnB());

    __ movprfx(z31.VnB(), p7.Merging(), z0.VnB());
    __ srshl(z31.VnB(), p7.Merging(), z31.VnB(), z31.VnB());

    __ movprfx(z16.VnB(), p7.Merging(), z17.VnB());
    __ srshlr(z16.VnB(), p7.Merging(), z16.VnB(), z16.VnB());

    __ movprfx(z0, z1);
    __ srsra(z0.VnB(), z0.VnB(), 1);

    __ movprfx(z0, z1);
    __ ssra(z0.VnB(), z0.VnB(), 1);

    __ movprfx(z26.VnB(), p2.Merging(), z27.VnB());
    __ suqadd(z26.VnB(), p2.Merging(), z26.VnB(), z26.VnB());

    __ movprfx(z23, z24);
    __ uaba(z23.VnB(), z22.VnB(), z23.VnB());

    __ movprfx(z11, z12);
    __ uabalb(z11.VnD(), z25.VnS(), z11.VnS());

    __ movprfx(z4, z5);
    __ uabalt(z4.VnD(), z4.VnS(), z31.VnS());

    __ movprfx(z20.VnD(), p4.Merging(), z21.VnD());
    __ uadalp(z20.VnD(), p4.Merging(), z20.VnS());

    __ movprfx(z21.VnB(), p2.Merging(), z22.VnB());
    __ uhadd(z21.VnB(), p2.Merging(), z21.VnB(), z21.VnB());

    __ movprfx(z1.VnB(), p4.Merging(), z2.VnB());
    __ uhsub(z1.VnB(), p4.Merging(), z1.VnB(), z1.VnB());

    __ movprfx(z18.VnB(), p0.Merging(), z19.VnB());
    __ uhsubr(z18.VnB(), p0.Merging(), z18.VnB(), z18.VnB());

    __ movprfx(z7, z8);
    __ umaxp(z7.VnB(), p2.Merging(), z7.VnB(), z7.VnB());

    __ movprfx(z10, z11);
    __ uminp(z10.VnB(), p0.Merging(), z10.VnB(), z10.VnB());

    __ movprfx(z31, z0);
    __ umlalb(z31.VnD(), z9.VnS(), z31.VnS());

    __ movprfx(z31, z0);
    __ umlalb(z31.VnD(), z31.VnS(), z1.VnS(), 0);

    __ movprfx(z31, z0);
    __ umlalb(z31.VnS(), z31.VnH(), z1.VnH(), 0);

    __ movprfx(z11, z12);
    __ umlalt(z11.VnD(), z11.VnS(), z22.VnS());

    __ movprfx(z11, z12);
    __ umlalt(z11.VnD(), z11.VnS(), z2.VnS(), 0);

    __ movprfx(z1, z12);
    __ umlalt(z1.VnS(), z5.VnH(), z1.VnH(), 0);

    __ movprfx(z28, z29);
    __ umlslb(z28.VnD(), z28.VnS(), z9.VnS());

    __ movprfx(z28, z29);
    __ umlslb(z28.VnD(), z28.VnS(), z1.VnS(), 0);

    __ movprfx(z28, z29);
    __ umlslb(z28.VnS(), z28.VnH(), z1.VnH(), 0);

    __ movprfx(z9, z10);
    __ umlslt(z9.VnD(), z9.VnS(), z30.VnS());

    __ movprfx(z9, z10);
    __ umlslt(z9.VnD(), z9.VnS(), z0.VnS(), 0);

    __ movprfx(z9, z10);
    __ umlslt(z9.VnS(), z9.VnH(), z0.VnH(), 0);

    __ movprfx(z24.VnB(), p7.Merging(), z25.VnB());
    // Note: this statement previously ended with a stray comma, silently
    // fusing it with the following movprfx via the comma operator.
    __ uqadd(z24.VnB(), p7.Merging(), z24.VnB(), z24.VnB());

    __ movprfx(z20.VnB(), p1.Merging(), z21.VnB());
    __ uqrshl(z20.VnB(), p1.Merging(), z20.VnB(), z20.VnB());

    __ movprfx(z8.VnB(), p5.Merging(), z9.VnB());
    __ uqrshlr(z8.VnB(), p5.Merging(), z8.VnB(), z8.VnB());

    __ movprfx(z29.VnB(), p7.Merging(), z30.VnB());
    __ uqshl(z29.VnB(), p7.Merging(), z29.VnB(), z29.VnB());

    __ movprfx(z12.VnB(), p1.Merging(), z13.VnB());
    __ uqshlr(z12.VnB(), p1.Merging(), z12.VnB(), z12.VnB());

    __ movprfx(z20.VnB(), p0.Merging(), z21.VnB());
    __ uqsub(z20.VnB(), p0.Merging(), z20.VnB(), z20.VnB());

    __ movprfx(z20.VnB(), p0.Merging(), z21.VnB());
    __ uqsubr(z20.VnB(), p0.Merging(), z20.VnB(), z20.VnB());

    __ movprfx(z25.VnS(), p7.Merging(), z26.VnS());
    __ urecpe(z25.VnS(), p7.Merging(), z25.VnS());

    __ movprfx(z29.VnB(), p4.Merging(), z30.VnB());
    __ urhadd(z29.VnB(), p4.Merging(), z29.VnB(), z29.VnB());

    __ movprfx(z15.VnB(), p2.Merging(), z16.VnB());
    __ urshl(z15.VnB(), p2.Merging(), z15.VnB(), z15.VnB());

    __ movprfx(z27.VnB(), p1.Merging(), z28.VnB());
    __ urshlr(z27.VnB(), p1.Merging(), z27.VnB(), z27.VnB());

    __ movprfx(z4.VnS(), p3.Merging(), z5.VnS());
    __ ursqrte(z4.VnS(), p3.Merging(), z4.VnS());

    __ movprfx(z0, z1);
    __ ursra(z0.VnB(), z0.VnB(), 1);

    __ movprfx(z25.VnB(), p4.Merging(), z26.VnB());
    __ usqadd(z25.VnB(), p4.Merging(), z25.VnB(), z25.VnB());

    __ movprfx(z0, z1);
    __ usra(z0.VnB(), z0.VnB(), 1);

    __ movprfx(z16, z17);
    __ xar(z16.VnB(), z16.VnB(), z16.VnB(), 1);
  }
  assm.FinalizeCode();

  CheckAndMaybeDisassembleMovprfxPairs(assm.GetBuffer(), false);
}
3575
// Check that movprfx + SVE2 instruction pairs in which the movprfx lane size
// does not match the lane size required by the following predicated
// instruction are rejected. The final `false` argument to
// CheckAndMaybeDisassembleMovprfxPairs asserts that none of these pairs may
// be treated as a valid movprfx combination.
TEST(movprfx_negative_lane_size_sve2) {
  Assembler assm;
  assm.GetCPUFeatures()->Combine(CPUFeatures::kSVE, CPUFeatures::kSVE2);
  {
    // We have to use the Assembler directly to generate movprfx, so we need
    // to manually reserve space for the code we're about to emit.
    static const size_t kPairCount = 140;
    CodeBufferCheckScope guard(&assm, kPairCount * 2 * kInstructionSize);

    __ movprfx(z14.VnS(), p4.Merging(), z15.VnS());
    __ fcvtx(z14.VnS(), p4.Merging(), z0.VnD());

    __ movprfx(z15.VnS(), p0.Merging(), z16.VnS());
    __ flogb(z15.VnH(), p0.Merging(), z3.VnH());

    __ movprfx(z19.VnB(), p5.Merging(), z20.VnB());
    __ sadalp(z19.VnD(), p5.Merging(), z9.VnS());

    __ movprfx(z20.VnH(), p3.Merging(), z21.VnH());
    __ shadd(z20.VnB(), p3.Merging(), z20.VnB(), z7.VnB());

    __ movprfx(z21.VnH(), p0.Merging(), z22.VnH());
    __ shsub(z21.VnB(), p0.Merging(), z21.VnB(), z0.VnB());

    __ movprfx(z1.VnS(), p0.Merging(), z2.VnS());
    __ shsubr(z1.VnB(), p0.Merging(), z1.VnB(), z2.VnB());

    __ movprfx(z29.VnD(), p1.Merging(), z30.VnD());
    __ sqabs(z29.VnB(), p1.Merging(), z18.VnB());

    __ movprfx(z28.VnH(), p0.Merging(), z29.VnH());
    __ sqadd(z28.VnB(), p0.Merging(), z28.VnB(), z3.VnB());

    __ movprfx(z21.VnH(), p0.Merging(), z22.VnH());
    __ sqneg(z21.VnB(), p0.Merging(), z17.VnB());

    __ movprfx(z31.VnS(), p5.Merging(), z0.VnS());
    __ sqrshl(z31.VnB(), p5.Merging(), z31.VnB(), z27.VnB());

    __ movprfx(z25.VnD(), p6.Merging(), z26.VnD());
    __ sqrshlr(z25.VnB(), p6.Merging(), z25.VnB(), z7.VnB());

    __ movprfx(z0.VnH(), p5.Merging(), z1.VnH());
    __ sqshl(z0.VnB(), p5.Merging(), z0.VnB(), 0);

    __ movprfx(z0.VnS(), p5.Merging(), z1.VnS());
    __ sqshl(z0.VnB(), p5.Merging(), z0.VnB(), z2.VnB());

    __ movprfx(z7.VnD(), p3.Merging(), z8.VnD());
    __ sqshlr(z7.VnB(), p3.Merging(), z7.VnB(), z5.VnB());

    __ movprfx(z10.VnH(), p1.Merging(), z11.VnH());
    __ sqshlu(z10.VnB(), p1.Merging(), z10.VnB(), 0);

    __ movprfx(z16.VnH(), p7.Merging(), z17.VnH());
    __ sqsub(z16.VnB(), p7.Merging(), z16.VnB(), z22.VnB());

    __ movprfx(z16.VnS(), p7.Merging(), z17.VnS());
    __ sqsubr(z16.VnB(), p7.Merging(), z16.VnB(), z22.VnB());

    __ movprfx(z23.VnD(), p4.Merging(), z24.VnD());
    __ srhadd(z23.VnB(), p4.Merging(), z23.VnB(), z14.VnB());

    __ movprfx(z31.VnH(), p7.Merging(), z0.VnH());
    __ srshl(z31.VnB(), p7.Merging(), z31.VnB(), z3.VnB());

    __ movprfx(z16.VnH(), p7.Merging(), z17.VnH());
    __ srshlr(z16.VnB(), p7.Merging(), z16.VnB(), z29.VnB());

    __ movprfx(z12.VnH(), p0.Merging(), z13.VnH());
    __ srshr(z12.VnB(), p0.Merging(), z12.VnB(), 1);

    __ movprfx(z26.VnH(), p2.Merging(), z27.VnH());
    __ suqadd(z26.VnB(), p2.Merging(), z26.VnB(), z28.VnB());

    __ movprfx(z20.VnB(), p4.Merging(), z21.VnB());
    __ uadalp(z20.VnD(), p4.Merging(), z5.VnS());

    __ movprfx(z21.VnH(), p2.Merging(), z22.VnH());
    __ uhadd(z21.VnB(), p2.Merging(), z21.VnB(), z19.VnB());

    __ movprfx(z1.VnH(), p4.Merging(), z2.VnH());
    __ uhsub(z1.VnB(), p4.Merging(), z1.VnB(), z9.VnB());

    __ movprfx(z18.VnH(), p0.Merging(), z19.VnH());
    __ uhsubr(z18.VnB(), p0.Merging(), z18.VnB(), z1.VnB());

    __ movprfx(z24.VnH(), p7.Merging(), z25.VnH());
    // Note: this statement previously ended with a stray comma, silently
    // fusing it with the following movprfx via the comma operator.
    __ uqadd(z24.VnB(), p7.Merging(), z24.VnB(), z1.VnB());

    __ movprfx(z20.VnS(), p1.Merging(), z21.VnS());
    __ uqrshl(z20.VnB(), p1.Merging(), z20.VnB(), z30.VnB());

    __ movprfx(z8.VnS(), p5.Merging(), z9.VnS());
    __ uqrshlr(z8.VnB(), p5.Merging(), z8.VnB(), z9.VnB());

    __ movprfx(z29.VnS(), p7.Merging(), z30.VnS());
    __ uqshl(z29.VnB(), p7.Merging(), z29.VnB(), 0);

    __ movprfx(z29.VnS(), p7.Merging(), z30.VnS());
    __ uqshl(z29.VnB(), p7.Merging(), z29.VnB(), z30.VnB());

    __ movprfx(z12.VnS(), p1.Merging(), z13.VnS());
    __ uqshlr(z12.VnB(), p1.Merging(), z12.VnB(), z13.VnB());

    __ movprfx(z20.VnS(), p0.Merging(), z21.VnS());
    __ uqsub(z20.VnB(), p0.Merging(), z20.VnB(), z6.VnB());

    __ movprfx(z20.VnS(), p0.Merging(), z21.VnS());
    __ uqsubr(z20.VnB(), p0.Merging(), z20.VnB(), z6.VnB());

    __ movprfx(z25.VnB(), p7.Merging(), z26.VnB());
    __ urecpe(z25.VnS(), p7.Merging(), z2.VnS());

    __ movprfx(z29.VnD(), p4.Merging(), z30.VnD());
    __ urhadd(z29.VnB(), p4.Merging(), z29.VnB(), z10.VnB());

    __ movprfx(z15.VnD(), p2.Merging(), z16.VnD());
    __ urshl(z15.VnB(), p2.Merging(), z15.VnB(), z3.VnB());

    __ movprfx(z27.VnD(), p1.Merging(), z28.VnD());
    __ urshlr(z27.VnB(), p1.Merging(), z27.VnB(), z30.VnB());

    __ movprfx(z31.VnD(), p2.Merging(), z0.VnD());
    __ urshr(z31.VnB(), p2.Merging(), z31.VnB(), 1);

    __ movprfx(z4.VnH(), p3.Merging(), z5.VnH());
    __ ursqrte(z4.VnS(), p3.Merging(), z3.VnS());

    __ movprfx(z25.VnD(), p4.Merging(), z26.VnD());
    __ usqadd(z25.VnB(), p4.Merging(), z25.VnB(), z6.VnB());
  }
  assm.FinalizeCode();

  CheckAndMaybeDisassembleMovprfxPairs(assm.GetBuffer(), false);
}
3712
3713 } // namespace aarch64
3714 } // namespace vixl
3715