1 // Copyright 2016, VIXL authors
2 // All rights reserved.
3 //
4 // Redistribution and use in source and binary forms, with or without
5 // modification, are permitted provided that the following conditions are met:
6 //
7 // * Redistributions of source code must retain the above copyright notice,
8 // this list of conditions and the following disclaimer.
9 // * Redistributions in binary form must reproduce the above copyright notice,
10 // this list of conditions and the following disclaimer in the documentation
11 // and/or other materials provided with the distribution.
12 // * Neither the name of ARM Limited nor the names of its contributors may be
13 // used to endorse or promote products derived from this software without
14 // specific prior written permission.
15 //
16 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS CONTRIBUTORS "AS IS" AND
17 // ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18 // WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
20 // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22 // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23 // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24 // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26
27 #include <cfloat>
28 #include <cmath>
29 #include <cstdio>
30 #include <cstdlib>
31 #include <cstring>
32
33 #include <fstream>
34 #include <regex>
35
36 #include "test-runner.h"
37 #include "test-utils-aarch64.h"
38
39 #include "aarch64/cpu-aarch64.h"
40 #include "aarch64/disasm-aarch64.h"
41 #include "aarch64/macro-assembler-aarch64.h"
42 #include "aarch64/simulator-aarch64.h"
43
44 namespace vixl {
45 namespace aarch64 {
46
// Shorthand used by the generator functions below: `__ foo(...)` expands to
// `masm->foo(...)`, so a pointer named `masm` must be in scope at the use site.
#define __ masm->
// Declares a test by forwarding to TEST_ with the name prefixed by `TRACE_`.
// NOTE(review): TEST_ is not defined in this part of the file; presumably it
// comes from "test-runner.h" - confirm.
#define TEST(name) TEST_(TRACE_##name)

// Prepends the reference-trace directory to `name`. This relies on adjacent
// string-literal concatenation, so `name` must itself be a string literal.
#define REF(name) "test/test-trace-reference/" name
51
GenerateTestSequenceBase(MacroAssembler * masm)52 static void GenerateTestSequenceBase(MacroAssembler* masm) {
53 ExactAssemblyScope guard(masm,
54 masm->GetBuffer()->GetRemainingBytes(),
55 ExactAssemblyScope::kMaximumSize);
56
57 __ adc(w3, w4, w5);
58 __ adc(x6, x7, x8);
59 __ adcs(w9, w10, w11);
60 __ adcs(x12, x13, x14);
61 __ add(w15, w16, w17);
62 __ add(x18, x19, x20);
63 __ adds(w21, w22, w23);
64 __ adds(x24, x25, x26);
65 __ and_(w27, w28, w29);
66 __ and_(x2, x3, x4);
67 __ ands(w5, w6, w7);
68 __ ands(x8, x9, x10);
69 __ asr(w11, w12, 0);
70 __ asr(x13, x14, 1);
71 __ asrv(w15, w16, w17);
72 __ asrv(x18, x19, x20);
73 __ bfm(w21, w22, 5, 6);
74 __ bfm(x23, x24, 7, 8);
75 __ bic(w25, w26, w27);
76 __ bic(x28, x29, x2);
77 __ bics(w3, w4, w5);
78 __ bics(x6, x7, x8);
79 __ ccmn(w9, w10, NoFlag, al);
80 __ ccmn(w9, w10, NoFlag, eq);
81 __ ccmn(w9, w10, NoFlag, ne);
82 __ ccmn(x11, x12, CFlag, al);
83 __ ccmn(x11, x12, CFlag, cc);
84 __ ccmn(x11, x12, CFlag, cs);
85 __ ccmp(w13, w14, VFlag, al);
86 __ ccmp(w13, w14, VFlag, hi);
87 __ ccmp(w13, w14, VFlag, ls);
88 __ ccmp(x15, x16, CVFlag, al);
89 __ ccmp(x15, x16, CVFlag, eq);
90 __ ccmp(x15, x16, CVFlag, ne);
91 __ cinc(w17, w18, cc);
92 __ cinc(w17, w18, cs);
93 __ cinc(x19, x20, hi);
94 __ cinc(x19, x20, ls);
95 __ cinv(w21, w22, eq);
96 __ cinv(w21, w22, ne);
97 __ cinv(x23, x24, cc);
98 __ cinv(x23, x24, cs);
99 __ clrex();
100 __ cls(w25, w26);
101 __ cls(x27, x28);
102 __ clz(w29, w2);
103 __ clz(x3, x4);
104 __ cmn(w5, w6);
105 __ cmn(x7, x8);
106 __ cmp(w9, w10);
107 __ cmp(x11, x12);
108 __ cneg(w13, w14, hi);
109 __ cneg(w13, w14, ls);
110 __ cneg(x15, x16, eq);
111 __ cneg(x15, x16, ne);
112 __ crc32b(w17, w18, w19);
113 __ crc32cb(w20, w21, w22);
114 __ crc32ch(w23, w24, w25);
115 __ crc32cw(w26, w27, w28);
116 __ crc32h(w4, w5, w6);
117 __ crc32w(w7, w8, w9);
118 __ csel(w13, w14, w15, cc);
119 __ csel(w13, w14, w15, cs);
120 __ csel(x16, x17, x18, hi);
121 __ csel(x16, x17, x18, ls);
122 __ cset(w19, eq);
123 __ cset(w19, ne);
124 __ cset(x20, cc);
125 __ cset(x20, cs);
126 __ csetm(w21, hi);
127 __ csetm(w21, ls);
128 __ csetm(x22, eq);
129 __ csetm(x22, ne);
130 __ csinc(w23, w24, w25, cc);
131 __ csinc(w23, w24, w25, cs);
132 __ csinc(x26, x27, x28, hi);
133 __ csinc(x26, x27, x28, ls);
134 __ csinv(w29, w2, w3, eq);
135 __ csinv(w29, w2, w3, ne);
136 __ csinv(x4, x5, x6, cc);
137 __ csinv(x4, x5, x6, cs);
138 __ csneg(w7, w8, w9, hi);
139 __ csneg(w7, w8, w9, ls);
140 __ csneg(x10, x11, x12, eq);
141 __ csneg(x10, x11, x12, ne);
142 __ dc(CVAC, x0);
143 __ dmb(InnerShareable, BarrierAll);
144 __ dsb(InnerShareable, BarrierAll);
145 __ eon(w13, w14, w15);
146 __ eon(x16, x17, x18);
147 __ eor(w19, w20, w21);
148 __ eor(x22, x23, x24);
149 __ extr(w25, w26, w27, 9);
150 __ extr(x28, x29, x2, 10);
151 __ hint(NOP);
152 __ ic(IVAU, x0);
153 __ isb();
154 __ ldar(w3, MemOperand(x0));
155 __ ldar(x4, MemOperand(x0));
156 __ ldarb(w5, MemOperand(x0));
157 __ ldarb(x6, MemOperand(x0));
158 __ ldarh(w7, MemOperand(x0));
159 __ ldarh(x8, MemOperand(x0));
160 __ ldaxp(w9, w10, MemOperand(x0));
161 __ ldaxp(x11, x12, MemOperand(x0));
162 __ ldaxr(w13, MemOperand(x0));
163 __ ldaxr(x14, MemOperand(x0));
164 __ ldaxrb(w15, MemOperand(x0));
165 __ ldaxrb(x16, MemOperand(x0));
166 __ ldaxrh(w17, MemOperand(x0));
167 __ ldaxrh(x18, MemOperand(x0));
168 __ ldnp(w19, w20, MemOperand(x0));
169 __ ldnp(x21, x22, MemOperand(x0));
170 __ ldp(w23, w24, MemOperand(x0));
171 __ ldp(w23, w24, MemOperand(x1, 8, PostIndex));
172 __ ldp(w23, w24, MemOperand(x1, 8, PreIndex));
173 __ ldp(x25, x26, MemOperand(x0));
174 __ ldp(x25, x26, MemOperand(x1, 16, PostIndex));
175 __ ldp(x25, x26, MemOperand(x1, 16, PreIndex));
176 __ ldpsw(x27, x28, MemOperand(x0));
177 __ ldpsw(x27, x28, MemOperand(x1, 8, PostIndex));
178 __ ldpsw(x27, x28, MemOperand(x1, 8, PreIndex));
179 __ ldr(w29, MemOperand(x0));
180 __ ldr(w29, MemOperand(x1, 4, PostIndex));
181 __ ldr(w29, MemOperand(x1, 4, PreIndex));
182 __ ldr(x2, MemOperand(x0));
183 __ ldr(x2, MemOperand(x1, 8, PostIndex));
184 __ ldr(x2, MemOperand(x1, 8, PreIndex));
185 __ ldrb(w3, MemOperand(x0));
186 __ ldrb(w3, MemOperand(x1, 1, PostIndex));
187 __ ldrb(w3, MemOperand(x1, 1, PreIndex));
188 __ ldrb(x4, MemOperand(x0));
189 __ ldrb(x4, MemOperand(x1, 1, PostIndex));
190 __ ldrb(x4, MemOperand(x1, 1, PreIndex));
191 __ ldrh(w5, MemOperand(x0));
192 __ ldrh(w5, MemOperand(x1, 2, PostIndex));
193 __ ldrh(w5, MemOperand(x1, 2, PreIndex));
194 __ ldrh(x6, MemOperand(x0));
195 __ ldrh(x6, MemOperand(x1, 2, PostIndex));
196 __ ldrh(x6, MemOperand(x1, 2, PreIndex));
197 __ ldrsb(w7, MemOperand(x0));
198 __ ldrsb(w7, MemOperand(x1, 1, PostIndex));
199 __ ldrsb(w7, MemOperand(x1, 1, PreIndex));
200 __ ldrsb(x8, MemOperand(x0));
201 __ ldrsb(x8, MemOperand(x1, 1, PostIndex));
202 __ ldrsb(x8, MemOperand(x1, 1, PreIndex));
203 __ ldrsh(w9, MemOperand(x0));
204 __ ldrsh(w9, MemOperand(x1, 2, PostIndex));
205 __ ldrsh(w9, MemOperand(x1, 2, PreIndex));
206 __ ldrsh(x10, MemOperand(x0));
207 __ ldrsh(x10, MemOperand(x1, 2, PostIndex));
208 __ ldrsh(x10, MemOperand(x1, 2, PreIndex));
209 __ ldrsw(x11, MemOperand(x0));
210 __ ldrsw(x11, MemOperand(x1, 4, PostIndex));
211 __ ldrsw(x11, MemOperand(x1, 4, PreIndex));
212 __ ldur(w12, MemOperand(x0, 7));
213 __ ldur(x13, MemOperand(x0, 15));
214 __ ldurb(w14, MemOperand(x0, 1));
215 __ ldurb(x15, MemOperand(x0, 1));
216 __ ldurh(w16, MemOperand(x0, 3));
217 __ ldurh(x17, MemOperand(x0, 3));
218 __ ldursb(w18, MemOperand(x0, 1));
219 __ ldursb(x19, MemOperand(x0, 1));
220 __ ldursh(w20, MemOperand(x0, 3));
221 __ ldursh(x21, MemOperand(x0, 3));
222 __ ldursw(x22, MemOperand(x0, 7));
223 __ ldxp(w23, w24, MemOperand(x0));
224 __ ldxp(x25, x26, MemOperand(x0));
225 __ ldxr(w27, MemOperand(x0));
226 __ ldxr(x28, MemOperand(x0));
227 __ ldxrb(w29, MemOperand(x0));
228 __ ldxrb(x2, MemOperand(x0));
229 __ ldxrh(w3, MemOperand(x0));
230 __ ldxrh(x4, MemOperand(x0));
231 __ lsl(w5, w6, 2);
232 __ lsl(x7, x8, 3);
233 __ lslv(w9, w10, w11);
234 __ lslv(x12, x13, x14);
235 __ lsr(w15, w16, 4);
236 __ lsr(x17, x18, 5);
237 __ lsrv(w19, w20, w21);
238 __ lsrv(x22, x23, x24);
239 __ madd(w25, w26, w27, w28);
240 __ madd(x29, x2, x3, x4);
241 __ mneg(w5, w6, w7);
242 __ mneg(x8, x9, x10);
243 __ mov(w11, w12);
244 __ mov(x13, x14);
245 __ movk(w15, 130);
246 __ movk(x16, 131);
247 __ movn(w17, 132);
248 __ movn(x18, 133);
249 __ movz(w19, 134);
250 __ movz(x20, 135);
251 __ msub(w22, w23, w24, w25);
252 __ msub(x26, x27, x28, x29);
253 __ mul(w2, w3, w4);
254 __ mul(x5, x6, x7);
255 __ mvn(w8, w9);
256 __ mvn(x10, x11);
257 __ neg(w12, w13);
258 __ neg(x14, x15);
259 __ negs(w16, w17);
260 __ negs(x18, x19);
261 __ ngc(w20, w21);
262 __ ngc(x22, x23);
263 __ ngcs(w24, w25);
264 __ ngcs(x26, x27);
265 __ nop();
266 __ orn(w28, w29, w2);
267 __ orn(x3, x4, x5);
268 __ orr(w6, w7, w8);
269 __ orr(x9, x10, x11);
270 __ prfm(PLDL1KEEP, MemOperand(x0, 4));
271 __ prfum(PLDL1KEEP, MemOperand(x0, 1));
272 __ rbit(w12, w13);
273 __ rbit(x14, x15);
274 __ rev(w16, w17);
275 __ rev(x18, x19);
276 __ rev16(w20, w21);
277 __ rev16(x22, x23);
278 __ rev32(x24, x25);
279 __ rorv(w26, w27, w28);
280 __ rorv(x29, x2, x3);
281 __ sbc(w4, w5, w6);
282 __ sbc(x7, x8, x9);
283 __ sbcs(w10, w11, w12);
284 __ sbcs(x13, x14, x15);
285 __ sbfiz(w16, w17, 2, 3);
286 __ sbfiz(x18, x19, 4, 5);
287 __ sbfx(w22, w23, 6, 7);
288 __ sbfx(x24, x25, 8, 9);
289 __ sdiv(w26, w27, w28);
290 __ sdiv(x29, x2, x3);
291 __ smulh(x12, x13, x14);
292 __ stlr(w18, MemOperand(x0));
293 __ stlr(x19, MemOperand(x0));
294 __ stlrb(w20, MemOperand(x0));
295 __ stlrb(x21, MemOperand(x0));
296 __ stlrh(w22, MemOperand(x0));
297 __ stlrh(x23, MemOperand(x0));
298 __ stlxp(w24, w25, w26, MemOperand(x0));
299 __ stlxp(x27, x28, x29, MemOperand(x0));
300 __ stlxr(w2, w3, MemOperand(x0));
301 __ stlxr(x4, x5, MemOperand(x0));
302 __ stlxrb(w6, w7, MemOperand(x0));
303 __ stlxrb(x8, x9, MemOperand(x0));
304 __ stlxrh(w10, w11, MemOperand(x0));
305 __ stlxrh(x12, x13, MemOperand(x0));
306 __ stnp(w14, w15, MemOperand(x0));
307 __ stnp(x16, x17, MemOperand(x0));
308 __ stp(w18, w19, MemOperand(x0));
309 __ stp(w18, w19, MemOperand(x1, 8, PostIndex));
310 __ stp(w18, w19, MemOperand(x1, 8, PreIndex));
311 __ stp(x20, x21, MemOperand(x0));
312 __ stp(x20, x21, MemOperand(x1, 16, PostIndex));
313 __ stp(x20, x21, MemOperand(x1, 16, PreIndex));
314 __ str(w22, MemOperand(x0));
315 __ str(w22, MemOperand(x1, 4, PostIndex));
316 __ str(w22, MemOperand(x1, 4, PreIndex));
317 __ str(x23, MemOperand(x0));
318 __ str(x23, MemOperand(x1, 8, PostIndex));
319 __ str(x23, MemOperand(x1, 8, PreIndex));
320 __ strb(w24, MemOperand(x0));
321 __ strb(w24, MemOperand(x1, 1, PostIndex));
322 __ strb(w24, MemOperand(x1, 1, PreIndex));
323 __ strb(x25, MemOperand(x0));
324 __ strb(x25, MemOperand(x1, 1, PostIndex));
325 __ strb(x25, MemOperand(x1, 1, PreIndex));
326 __ strh(w26, MemOperand(x0));
327 __ strh(w26, MemOperand(x1, 2, PostIndex));
328 __ strh(w26, MemOperand(x1, 2, PreIndex));
329 __ strh(x27, MemOperand(x0));
330 __ strh(x27, MemOperand(x1, 2, PostIndex));
331 __ strh(x27, MemOperand(x1, 2, PreIndex));
332 __ stur(w28, MemOperand(x0, 7));
333 __ stur(x29, MemOperand(x0, 15));
334 __ sturb(w2, MemOperand(x0, 1));
335 __ sturb(x3, MemOperand(x0, 1));
336 __ sturh(w4, MemOperand(x0, 3));
337 __ sturh(x5, MemOperand(x0, 3));
338 __ stxp(w6, w7, w8, MemOperand(x0));
339 __ stxp(x9, x10, x11, MemOperand(x0));
340 __ stxr(w12, w13, MemOperand(x0));
341 __ stxr(x14, x15, MemOperand(x0));
342 __ stxrb(w16, w17, MemOperand(x0));
343 __ stxrb(x18, x19, MemOperand(x0));
344 __ stxrh(w20, w21, MemOperand(x0));
345 __ stxrh(x22, x23, MemOperand(x0));
346 __ sub(w24, w25, w26);
347 __ sub(x27, x28, x29);
348 __ subs(w2, w3, w4);
349 __ subs(x5, x6, x7);
350 __ sxtb(w8, w9);
351 __ sxtb(x10, x11);
352 __ sxth(w12, w13);
353 __ sxth(x14, x15);
354 __ sxtw(w16, w17);
355 __ sxtw(x18, x19);
356 __ tst(w20, w21);
357 __ tst(x22, x23);
358 __ ubfiz(w24, w25, 10, 11);
359 __ ubfiz(x26, x27, 12, 13);
360 __ ubfm(w28, w29, 14, 15);
361 __ ubfm(x2, x3, 1, 2);
362 __ ubfx(w4, w5, 3, 4);
363 __ ubfx(x6, x7, 5, 6);
364 __ udiv(w8, w9, w10);
365 __ udiv(x11, x12, x13);
366 __ umulh(x22, x23, x24);
367 __ uxtb(w28, w29);
368 __ uxtb(x2, x3);
369 __ uxth(w4, w5);
370 __ uxth(x6, x7);
371 __ uxtw(w8, w9);
372 __ uxtw(x10, x11);
373
374 // Branch tests.
375 {
376 Label end;
377 // Branch to the next instruction.
378 __ b(&end);
379 __ bind(&end);
380 }
381 {
382 Label loop, end;
383 __ subs(x3, x3, x3);
384 __ bind(&loop);
385 // Not-taken branch (the first time).
386 // Taken branch (the second time).
387 __ b(&end, ne);
388 __ cmp(x3, 1);
389 // Backwards branch.
390 __ b(&loop);
391 __ bind(&end);
392 }
393 }
394
395
GenerateTestSequenceFP(MacroAssembler * masm)396 static void GenerateTestSequenceFP(MacroAssembler* masm) {
397 ExactAssemblyScope guard(masm,
398 masm->GetBuffer()->GetRemainingBytes(),
399 ExactAssemblyScope::kMaximumSize);
400
401 // Scalar floating point instructions.
402 __ fabd(d13, d2, d19);
403 __ fabd(s8, s10, s30);
404 __ fabs(d1, d1);
405 __ fabs(s25, s7);
406 __ facge(d1, d23, d16);
407 __ facge(s4, s17, s1);
408 __ facgt(d2, d21, d24);
409 __ facgt(s12, s26, s12);
410 __ fadd(d13, d11, d22);
411 __ fadd(s27, s19, s8);
412 __ fccmp(d6, d10, NoFlag, hs);
413 __ fccmp(s29, s20, NZVFlag, ne);
414 __ fccmpe(d10, d2, NZCFlag, al);
415 __ fccmpe(s3, s3, NZVFlag, pl);
416 __ fcmeq(d19, d8, d10);
417 __ fcmeq(d0, d18, 0.0);
418 __ fcmeq(s1, s4, s30);
419 __ fcmeq(s22, s29, 0.0);
420 __ fcmge(d27, d18, d1);
421 __ fcmge(d31, d28, 0.0);
422 __ fcmge(s31, s19, s9);
423 __ fcmge(s1, s25, 0.0);
424 __ fcmgt(d18, d1, d15);
425 __ fcmgt(d3, d31, 0.0);
426 __ fcmgt(s11, s25, s2);
427 __ fcmgt(s17, s16, 0.0);
428 __ fcmle(d24, d17, 0.0);
429 __ fcmle(s11, s8, 0.0);
430 __ fcmlt(d5, d31, 0.0);
431 __ fcmlt(s18, s23, 0.0);
432 __ fcmp(d10, d24);
433 __ fcmp(d13, 0.0);
434 __ fcmp(s18, s6);
435 __ fcmp(s16, 0.0);
436 __ fcmpe(d9, d17);
437 __ fcmpe(d29, 0.0);
438 __ fcmpe(s16, s17);
439 __ fcmpe(s22, 0.0);
440 __ fcsel(d10, d14, d19, gt);
441 __ fcsel(s22, s18, s2, ge);
442 __ fcvt(d4, h24);
443 __ fcvt(d11, s2);
444 __ fcvt(h8, d9);
445 __ fcvt(h12, s1);
446 __ fcvt(s12, d31);
447 __ fcvt(s27, h25);
448 __ fcvtas(d28, d16);
449 __ fcvtas(s3, s5);
450 __ fcvtas(w18, d31);
451 __ fcvtas(w29, s24);
452 __ fcvtas(x9, d1);
453 __ fcvtas(x30, s2);
454 __ fcvtau(d14, d0);
455 __ fcvtau(s31, s14);
456 __ fcvtau(w16, d2);
457 __ fcvtau(w18, s0);
458 __ fcvtau(x26, d7);
459 __ fcvtau(x25, s19);
460 __ fcvtms(d30, d25);
461 __ fcvtms(s12, s15);
462 __ fcvtms(w9, d7);
463 __ fcvtms(w19, s6);
464 __ fcvtms(x6, d6);
465 __ fcvtms(x22, s7);
466 __ fcvtmu(d27, d0);
467 __ fcvtmu(s8, s22);
468 __ fcvtmu(w29, d19);
469 __ fcvtmu(w26, s0);
470 __ fcvtmu(x13, d5);
471 __ fcvtmu(x5, s18);
472 __ fcvtns(d30, d15);
473 __ fcvtns(s10, s11);
474 __ fcvtns(w21, d15);
475 __ fcvtns(w18, s10);
476 __ fcvtns(x8, d17);
477 __ fcvtns(x17, s12);
478 __ fcvtnu(d0, d21);
479 __ fcvtnu(s6, s25);
480 __ fcvtnu(w29, d11);
481 __ fcvtnu(w25, s31);
482 __ fcvtnu(x30, d11);
483 __ fcvtnu(x27, s18);
484 __ fcvtps(d11, d22);
485 __ fcvtps(s29, s20);
486 __ fcvtps(w15, d25);
487 __ fcvtps(w16, s7);
488 __ fcvtps(x13, d20);
489 __ fcvtps(x3, s23);
490 __ fcvtpu(d24, d1);
491 __ fcvtpu(s14, s24);
492 __ fcvtpu(w26, d29);
493 __ fcvtpu(wzr, s26);
494 __ fcvtpu(x27, d6);
495 __ fcvtpu(x29, s14);
496 __ fcvtxn(s12, d12);
497 __ fcvtzs(d15, d0);
498 __ fcvtzs(d13, d4, 42);
499 __ fcvtzs(s8, s11);
500 __ fcvtzs(s31, s6, 25);
501 __ fcvtzs(w6, d9);
502 __ fcvtzs(w25, d10, 20);
503 __ fcvtzs(w9, s1);
504 __ fcvtzs(w17, s29, 30);
505 __ fcvtzs(x19, d2);
506 __ fcvtzs(x22, d14, 1);
507 __ fcvtzs(x14, s20);
508 __ fcvtzs(x3, s30, 33);
509 __ fcvtzu(d28, d15);
510 __ fcvtzu(d0, d4, 3);
511 __ fcvtzu(s2, s5);
512 __ fcvtzu(s4, s0, 30);
513 __ fcvtzu(w11, d4);
514 __ fcvtzu(w7, d24, 32);
515 __ fcvtzu(w18, s24);
516 __ fcvtzu(w14, s27, 4);
517 __ fcvtzu(x22, d11);
518 __ fcvtzu(x8, d27, 52);
519 __ fcvtzu(x7, s20);
520 __ fcvtzu(x22, s7, 44);
521 __ fdiv(d6, d14, d15);
522 __ fdiv(s26, s5, s25);
523 __ fmadd(d18, d26, d12, d30);
524 __ fmadd(s13, s9, s28, s4);
525 __ fmax(d12, d5, d5);
526 __ fmax(s12, s28, s6);
527 __ fmaxnm(d28, d4, d2);
528 __ fmaxnm(s6, s10, s8);
529 __ fmin(d20, d20, d18);
530 __ fmin(s7, s13, s16);
531 __ fminnm(d19, d14, d30);
532 __ fminnm(s0, s1, s1);
533 __ fmov(d13, d6);
534 __ fmov(d2, x17);
535 __ fmov(d8, -2.5000);
536 __ fmov(s5, s3);
537 __ fmov(s25, w20);
538 __ fmov(s21, 2.8750f);
539 __ fmov(w18, s24);
540 __ fmov(x18, d2);
541 __ fmsub(d20, d30, d3, d19);
542 __ fmsub(s5, s19, s4, s12);
543 __ fmul(d30, d27, d23);
544 __ fmul(s25, s17, s15);
545 __ fmulx(d4, d17, d1);
546 __ fmulx(s14, s25, s4);
547 __ fneg(d15, d0);
548 __ fneg(s14, s15);
549 __ fnmadd(d0, d16, d22, d31);
550 __ fnmadd(s0, s18, s26, s18);
551 __ fnmsub(d19, d12, d15, d21);
552 __ fnmsub(s29, s0, s11, s26);
553 __ fnmul(d31, d19, d1);
554 __ fnmul(s18, s3, s17);
555 __ frecpe(d7, d21);
556 __ frecpe(s29, s17);
557 __ frecps(d11, d26, d17);
558 __ frecps(s18, s27, s1);
559 __ frecpx(d15, d18);
560 __ frecpx(s5, s10);
561 __ frinta(d16, d30);
562 __ frinta(s1, s22);
563 __ frinti(d19, d29);
564 __ frinti(s14, s21);
565 __ frintm(d20, d30);
566 __ frintm(s1, s16);
567 __ frintn(d30, d1);
568 __ frintn(s24, s10);
569 __ frintp(d4, d20);
570 __ frintp(s13, s3);
571 __ frintx(d13, d20);
572 __ frintx(s17, s7);
573 __ frintz(d0, d8);
574 __ frintz(s15, s29);
575 __ frsqrte(d21, d10);
576 __ frsqrte(s17, s25);
577 __ frsqrts(d4, d29, d17);
578 __ frsqrts(s14, s3, s24);
579 __ fsqrt(d14, d17);
580 __ fsqrt(s4, s14);
581 __ fsub(d13, d19, d7);
582 __ fsub(s3, s21, s27);
583 __ scvtf(d31, d16);
584 __ scvtf(d26, d31, 24);
585 __ scvtf(d6, w16);
586 __ scvtf(d5, w20, 6);
587 __ scvtf(d16, x8);
588 __ scvtf(d15, x8, 10);
589 __ scvtf(s7, s4);
590 __ scvtf(s8, s15, 14);
591 __ scvtf(s29, w10);
592 __ scvtf(s15, w21, 11);
593 __ scvtf(s27, x26);
594 __ scvtf(s26, x12, 38);
595 __ ucvtf(d0, d9);
596 __ ucvtf(d5, d22, 47);
597 __ ucvtf(d30, w27);
598 __ ucvtf(d3, w19, 1);
599 __ ucvtf(d28, x21);
600 __ ucvtf(d27, x30, 35);
601 __ ucvtf(s11, s5);
602 __ ucvtf(s0, s23, 14);
603 __ ucvtf(s20, w19);
604 __ ucvtf(s21, w22, 18);
605 __ ucvtf(s6, x13);
606 __ ucvtf(s7, x2, 21);
607 }
608
609
GenerateTestSequenceNEON(MacroAssembler * masm)610 static void GenerateTestSequenceNEON(MacroAssembler* masm) {
611 ExactAssemblyScope guard(masm,
612 masm->GetBuffer()->GetRemainingBytes(),
613 ExactAssemblyScope::kMaximumSize);
614
615 // NEON integer instructions.
616 __ abs(d19, d0);
617 __ abs(v16.V16B(), v11.V16B());
618 __ abs(v0.V2D(), v31.V2D());
619 __ abs(v27.V2S(), v25.V2S());
620 __ abs(v21.V4H(), v27.V4H());
621 __ abs(v16.V4S(), v1.V4S());
622 __ abs(v31.V8B(), v5.V8B());
623 __ abs(v29.V8H(), v13.V8H());
624 __ add(d10, d5, d17);
625 __ add(v31.V16B(), v15.V16B(), v23.V16B());
626 __ add(v10.V2D(), v31.V2D(), v14.V2D());
627 __ add(v15.V2S(), v14.V2S(), v19.V2S());
628 __ add(v27.V4H(), v23.V4H(), v17.V4H());
629 __ add(v25.V4S(), v28.V4S(), v29.V4S());
630 __ add(v13.V8B(), v7.V8B(), v18.V8B());
631 __ add(v4.V8H(), v2.V8H(), v1.V8H());
632 __ addhn(v10.V2S(), v14.V2D(), v15.V2D());
633 __ addhn(v10.V4H(), v30.V4S(), v26.V4S());
634 __ addhn(v31.V8B(), v12.V8H(), v22.V8H());
635 __ addhn2(v16.V16B(), v21.V8H(), v20.V8H());
636 __ addhn2(v0.V4S(), v2.V2D(), v17.V2D());
637 __ addhn2(v31.V8H(), v7.V4S(), v17.V4S());
638 __ addp(d14, v19.V2D());
639 __ addp(v3.V16B(), v8.V16B(), v28.V16B());
640 __ addp(v8.V2D(), v5.V2D(), v17.V2D());
641 __ addp(v22.V2S(), v30.V2S(), v26.V2S());
642 __ addp(v29.V4H(), v24.V4H(), v14.V4H());
643 __ addp(v30.V4S(), v26.V4S(), v24.V4S());
644 __ addp(v12.V8B(), v26.V8B(), v7.V8B());
645 __ addp(v17.V8H(), v8.V8H(), v12.V8H());
646 __ addv(b27, v23.V16B());
647 __ addv(b12, v20.V8B());
648 __ addv(h27, v30.V4H());
649 __ addv(h19, v14.V8H());
650 __ addv(s14, v27.V4S());
651 __ and_(v10.V16B(), v8.V16B(), v27.V16B());
652 __ and_(v5.V8B(), v1.V8B(), v16.V8B());
653 __ bic(v26.V16B(), v3.V16B(), v24.V16B());
654 __ bic(v7.V2S(), 0xe4, 16);
655 __ bic(v28.V4H(), 0x23, 8);
656 __ bic(v29.V4S(), 0xac);
657 __ bic(v12.V8B(), v31.V8B(), v21.V8B());
658 __ bic(v18.V8H(), 0x98);
659 __ bif(v12.V16B(), v26.V16B(), v8.V16B());
660 __ bif(v2.V8B(), v23.V8B(), v27.V8B());
661 __ bit(v8.V16B(), v3.V16B(), v13.V16B());
662 __ bit(v5.V8B(), v5.V8B(), v23.V8B());
663 __ bsl(v9.V16B(), v31.V16B(), v23.V16B());
664 __ bsl(v14.V8B(), v7.V8B(), v3.V8B());
665 __ cls(v29.V16B(), v5.V16B());
666 __ cls(v21.V2S(), v0.V2S());
667 __ cls(v1.V4H(), v12.V4H());
668 __ cls(v27.V4S(), v10.V4S());
669 __ cls(v19.V8B(), v4.V8B());
670 __ cls(v15.V8H(), v14.V8H());
671 __ clz(v1.V16B(), v4.V16B());
672 __ clz(v27.V2S(), v17.V2S());
673 __ clz(v9.V4H(), v9.V4H());
674 __ clz(v31.V4S(), v15.V4S());
675 __ clz(v14.V8B(), v19.V8B());
676 __ clz(v6.V8H(), v11.V8H());
677 __ cmeq(d18, d5, d29);
678 __ cmeq(d14, d31, 0);
679 __ cmeq(v19.V16B(), v3.V16B(), v22.V16B());
680 __ cmeq(v15.V16B(), v9.V16B(), 0);
681 __ cmeq(v12.V2D(), v16.V2D(), v10.V2D());
682 __ cmeq(v8.V2D(), v22.V2D(), 0);
683 __ cmeq(v2.V2S(), v3.V2S(), v9.V2S());
684 __ cmeq(v16.V2S(), v25.V2S(), 0);
685 __ cmeq(v6.V4H(), v23.V4H(), v20.V4H());
686 __ cmeq(v16.V4H(), v13.V4H(), 0);
687 __ cmeq(v21.V4S(), v17.V4S(), v2.V4S());
688 __ cmeq(v6.V4S(), v25.V4S(), 0);
689 __ cmeq(v16.V8B(), v13.V8B(), v2.V8B());
690 __ cmeq(v21.V8B(), v16.V8B(), 0);
691 __ cmeq(v20.V8H(), v7.V8H(), v25.V8H());
692 __ cmeq(v26.V8H(), v8.V8H(), 0);
693 __ cmge(d16, d13, d31);
694 __ cmge(d25, d24, 0);
695 __ cmge(v17.V16B(), v19.V16B(), v17.V16B());
696 __ cmge(v22.V16B(), v30.V16B(), 0);
697 __ cmge(v28.V2D(), v20.V2D(), v26.V2D());
698 __ cmge(v6.V2D(), v23.V2D(), 0);
699 __ cmge(v25.V2S(), v22.V2S(), v3.V2S());
700 __ cmge(v21.V2S(), v11.V2S(), 0);
701 __ cmge(v16.V4H(), v3.V4H(), v12.V4H());
702 __ cmge(v23.V4H(), v9.V4H(), 0);
703 __ cmge(v7.V4S(), v2.V4S(), v11.V4S());
704 __ cmge(v0.V4S(), v22.V4S(), 0);
705 __ cmge(v10.V8B(), v30.V8B(), v9.V8B());
706 __ cmge(v21.V8B(), v8.V8B(), 0);
707 __ cmge(v2.V8H(), v7.V8H(), v26.V8H());
708 __ cmge(v19.V8H(), v10.V8H(), 0);
709 __ cmgt(d6, d13, d1);
710 __ cmgt(d30, d24, 0);
711 __ cmgt(v20.V16B(), v25.V16B(), v27.V16B());
712 __ cmgt(v0.V16B(), v25.V16B(), 0);
713 __ cmgt(v22.V2D(), v25.V2D(), v1.V2D());
714 __ cmgt(v16.V2D(), v16.V2D(), 0);
715 __ cmgt(v5.V2S(), v9.V2S(), v15.V2S());
716 __ cmgt(v12.V2S(), v18.V2S(), 0);
717 __ cmgt(v28.V4H(), v18.V4H(), v11.V4H());
718 __ cmgt(v22.V4H(), v3.V4H(), 0);
719 __ cmgt(v5.V4S(), v11.V4S(), v27.V4S());
720 __ cmgt(v13.V4S(), v20.V4S(), 0);
721 __ cmgt(v27.V8B(), v31.V8B(), v7.V8B());
722 __ cmgt(v5.V8B(), v0.V8B(), 0);
723 __ cmgt(v22.V8H(), v28.V8H(), v13.V8H());
724 __ cmgt(v6.V8H(), v2.V8H(), 0);
725 __ cmhi(d21, d8, d22);
726 __ cmhi(v18.V16B(), v19.V16B(), v19.V16B());
727 __ cmhi(v7.V2D(), v0.V2D(), v21.V2D());
728 __ cmhi(v15.V2S(), v19.V2S(), v0.V2S());
729 __ cmhi(v31.V4H(), v7.V4H(), v12.V4H());
730 __ cmhi(v9.V4S(), v16.V4S(), v22.V4S());
731 __ cmhi(v7.V8B(), v24.V8B(), v28.V8B());
732 __ cmhi(v11.V8H(), v10.V8H(), v25.V8H());
733 __ cmhs(d1, d12, d17);
734 __ cmhs(v21.V16B(), v25.V16B(), v30.V16B());
735 __ cmhs(v8.V2D(), v2.V2D(), v26.V2D());
736 __ cmhs(v1.V2S(), v22.V2S(), v29.V2S());
737 __ cmhs(v26.V4H(), v30.V4H(), v30.V4H());
738 __ cmhs(v19.V4S(), v20.V4S(), v16.V4S());
739 __ cmhs(v1.V8B(), v3.V8B(), v26.V8B());
740 __ cmhs(v20.V8H(), v28.V8H(), v8.V8H());
741 __ cmle(d30, d24, 0);
742 __ cmle(v0.V16B(), v3.V16B(), 0);
743 __ cmle(v2.V2D(), v30.V2D(), 0);
744 __ cmle(v7.V2S(), v10.V2S(), 0);
745 __ cmle(v9.V4H(), v31.V4H(), 0);
746 __ cmle(v9.V4S(), v18.V4S(), 0);
747 __ cmle(v21.V8B(), v31.V8B(), 0);
748 __ cmle(v29.V8H(), v21.V8H(), 0);
749 __ cmlt(d25, d23, 0);
750 __ cmlt(v7.V16B(), v21.V16B(), 0);
751 __ cmlt(v7.V2D(), v30.V2D(), 0);
752 __ cmlt(v25.V2S(), v28.V2S(), 0);
753 __ cmlt(v0.V4H(), v11.V4H(), 0);
754 __ cmlt(v24.V4S(), v5.V4S(), 0);
755 __ cmlt(v26.V8B(), v11.V8B(), 0);
756 __ cmlt(v1.V8H(), v21.V8H(), 0);
757 __ cmtst(d28, d23, d30);
758 __ cmtst(v26.V16B(), v6.V16B(), v31.V16B());
759 __ cmtst(v1.V2D(), v21.V2D(), v4.V2D());
760 __ cmtst(v27.V2S(), v26.V2S(), v20.V2S());
761 __ cmtst(v26.V4H(), v0.V4H(), v18.V4H());
762 __ cmtst(v25.V4S(), v16.V4S(), v4.V4S());
763 __ cmtst(v11.V8B(), v10.V8B(), v9.V8B());
764 __ cmtst(v0.V8H(), v2.V8H(), v1.V8H());
765 __ cnt(v25.V16B(), v15.V16B());
766 __ cnt(v28.V8B(), v6.V8B());
767 __ dup(v6.V16B(), v7.B(), 7);
768 __ dup(v9.V16B(), w20);
769 __ dup(v12.V2D(), v13.D(), 1);
770 __ dup(v9.V2D(), xzr);
771 __ dup(v4.V2S(), v26.S(), 2);
772 __ dup(v3.V2S(), w12);
773 __ dup(v22.V4H(), v5.H(), 7);
774 __ dup(v16.V4H(), w25);
775 __ dup(v20.V4S(), v10.S(), 2);
776 __ dup(v10.V4S(), w7);
777 __ dup(v30.V8B(), v30.B(), 2);
778 __ dup(v31.V8B(), w15);
779 __ dup(v28.V8H(), v17.H(), 4);
780 __ dup(v2.V8H(), w3);
781 __ eor(v29.V16B(), v25.V16B(), v3.V16B());
782 __ eor(v3.V8B(), v16.V8B(), v28.V8B());
783 __ ext(v1.V16B(), v26.V16B(), v6.V16B(), 1);
784 __ ext(v2.V8B(), v30.V8B(), v1.V8B(), 1);
785 __ ld1(v18.V16B(), v19.V16B(), v20.V16B(), v21.V16B(), MemOperand(x0));
786 __ ld1(v23.V16B(),
787 v24.V16B(),
788 v25.V16B(),
789 v26.V16B(),
790 MemOperand(x1, x2, PostIndex));
791 __ ld1(v5.V16B(),
792 v6.V16B(),
793 v7.V16B(),
794 v8.V16B(),
795 MemOperand(x1, 64, PostIndex));
796 __ ld1(v18.V16B(), v19.V16B(), v20.V16B(), MemOperand(x0));
797 __ ld1(v13.V16B(), v14.V16B(), v15.V16B(), MemOperand(x1, x2, PostIndex));
798 __ ld1(v19.V16B(), v20.V16B(), v21.V16B(), MemOperand(x1, 48, PostIndex));
799 __ ld1(v17.V16B(), v18.V16B(), MemOperand(x0));
800 __ ld1(v20.V16B(), v21.V16B(), MemOperand(x1, x2, PostIndex));
801 __ ld1(v28.V16B(), v29.V16B(), MemOperand(x1, 32, PostIndex));
802 __ ld1(v29.V16B(), MemOperand(x0));
803 __ ld1(v21.V16B(), MemOperand(x1, x2, PostIndex));
804 __ ld1(v4.V16B(), MemOperand(x1, 16, PostIndex));
805 __ ld1(v4.V1D(), v5.V1D(), v6.V1D(), v7.V1D(), MemOperand(x0));
806 __ ld1(v17.V1D(),
807 v18.V1D(),
808 v19.V1D(),
809 v20.V1D(),
810 MemOperand(x1, x2, PostIndex));
811 __ ld1(v28.V1D(),
812 v29.V1D(),
813 v30.V1D(),
814 v31.V1D(),
815 MemOperand(x1, 32, PostIndex));
816 __ ld1(v20.V1D(), v21.V1D(), v22.V1D(), MemOperand(x0));
817 __ ld1(v19.V1D(), v20.V1D(), v21.V1D(), MemOperand(x1, x2, PostIndex));
818 __ ld1(v12.V1D(), v13.V1D(), v14.V1D(), MemOperand(x1, 24, PostIndex));
819 __ ld1(v29.V1D(), v30.V1D(), MemOperand(x0));
820 __ ld1(v31.V1D(), v0.V1D(), MemOperand(x1, x2, PostIndex));
821 __ ld1(v3.V1D(), v4.V1D(), MemOperand(x1, 16, PostIndex));
822 __ ld1(v28.V1D(), MemOperand(x0));
823 __ ld1(v11.V1D(), MemOperand(x1, x2, PostIndex));
824 __ ld1(v29.V1D(), MemOperand(x1, 8, PostIndex));
825 __ ld1(v28.V2D(), v29.V2D(), v30.V2D(), v31.V2D(), MemOperand(x0));
826 __ ld1(v8.V2D(),
827 v9.V2D(),
828 v10.V2D(),
829 v11.V2D(),
830 MemOperand(x1, x2, PostIndex));
831 __ ld1(v14.V2D(),
832 v15.V2D(),
833 v16.V2D(),
834 v17.V2D(),
835 MemOperand(x1, 64, PostIndex));
836 __ ld1(v26.V2D(), v27.V2D(), v28.V2D(), MemOperand(x0));
837 __ ld1(v5.V2D(), v6.V2D(), v7.V2D(), MemOperand(x1, x2, PostIndex));
838 __ ld1(v26.V2D(), v27.V2D(), v28.V2D(), MemOperand(x1, 48, PostIndex));
839 __ ld1(v18.V2D(), v19.V2D(), MemOperand(x0));
840 __ ld1(v21.V2D(), v22.V2D(), MemOperand(x1, x2, PostIndex));
841 __ ld1(v17.V2D(), v18.V2D(), MemOperand(x1, 32, PostIndex));
842 __ ld1(v5.V2D(), MemOperand(x0));
843 __ ld1(v6.V2D(), MemOperand(x1, x2, PostIndex));
844 __ ld1(v15.V2D(), MemOperand(x1, 16, PostIndex));
845 __ ld1(v30.V2S(), v31.V2S(), v0.V2S(), v1.V2S(), MemOperand(x0));
846 __ ld1(v24.V2S(),
847 v25.V2S(),
848 v26.V2S(),
849 v27.V2S(),
850 MemOperand(x1, x2, PostIndex));
851 __ ld1(v27.V2S(),
852 v28.V2S(),
853 v29.V2S(),
854 v30.V2S(),
855 MemOperand(x1, 32, PostIndex));
856 __ ld1(v11.V2S(), v12.V2S(), v13.V2S(), MemOperand(x0));
857 __ ld1(v8.V2S(), v9.V2S(), v10.V2S(), MemOperand(x1, x2, PostIndex));
858 __ ld1(v31.V2S(), v0.V2S(), v1.V2S(), MemOperand(x1, 24, PostIndex));
859 __ ld1(v0.V2S(), v1.V2S(), MemOperand(x0));
860 __ ld1(v13.V2S(), v14.V2S(), MemOperand(x1, x2, PostIndex));
861 __ ld1(v3.V2S(), v4.V2S(), MemOperand(x1, 16, PostIndex));
862 __ ld1(v26.V2S(), MemOperand(x0));
863 __ ld1(v0.V2S(), MemOperand(x1, x2, PostIndex));
864 __ ld1(v11.V2S(), MemOperand(x1, 8, PostIndex));
865 __ ld1(v16.V4H(), v17.V4H(), v18.V4H(), v19.V4H(), MemOperand(x0));
866 __ ld1(v24.V4H(),
867 v25.V4H(),
868 v26.V4H(),
869 v27.V4H(),
870 MemOperand(x1, x2, PostIndex));
871 __ ld1(v1.V4H(), v2.V4H(), v3.V4H(), v4.V4H(), MemOperand(x1, 32, PostIndex));
872 __ ld1(v30.V4H(), v31.V4H(), v0.V4H(), MemOperand(x0));
873 __ ld1(v25.V4H(), v26.V4H(), v27.V4H(), MemOperand(x1, x2, PostIndex));
874 __ ld1(v3.V4H(), v4.V4H(), v5.V4H(), MemOperand(x1, 24, PostIndex));
875 __ ld1(v3.V4H(), v4.V4H(), MemOperand(x0));
876 __ ld1(v3.V4H(), v4.V4H(), MemOperand(x1, x2, PostIndex));
877 __ ld1(v23.V4H(), v24.V4H(), MemOperand(x1, 16, PostIndex));
878 __ ld1(v26.V4H(), MemOperand(x0));
879 __ ld1(v1.V4H(), MemOperand(x1, x2, PostIndex));
880 __ ld1(v14.V4H(), MemOperand(x1, 8, PostIndex));
881 __ ld1(v26.V4S(), v27.V4S(), v28.V4S(), v29.V4S(), MemOperand(x0));
882 __ ld1(v28.V4S(),
883 v29.V4S(),
884 v30.V4S(),
885 v31.V4S(),
886 MemOperand(x1, x2, PostIndex));
887 __ ld1(v4.V4S(), v5.V4S(), v6.V4S(), v7.V4S(), MemOperand(x1, 64, PostIndex));
888 __ ld1(v2.V4S(), v3.V4S(), v4.V4S(), MemOperand(x0));
889 __ ld1(v22.V4S(), v23.V4S(), v24.V4S(), MemOperand(x1, x2, PostIndex));
890 __ ld1(v15.V4S(), v16.V4S(), v17.V4S(), MemOperand(x1, 48, PostIndex));
891 __ ld1(v20.V4S(), v21.V4S(), MemOperand(x0));
892 __ ld1(v30.V4S(), v31.V4S(), MemOperand(x1, x2, PostIndex));
893 __ ld1(v11.V4S(), v12.V4S(), MemOperand(x1, 32, PostIndex));
894 __ ld1(v15.V4S(), MemOperand(x0));
895 __ ld1(v12.V4S(), MemOperand(x1, x2, PostIndex));
896 __ ld1(v0.V4S(), MemOperand(x1, 16, PostIndex));
897 __ ld1(v17.V8B(), v18.V8B(), v19.V8B(), v20.V8B(), MemOperand(x0));
898 __ ld1(v5.V8B(), v6.V8B(), v7.V8B(), v8.V8B(), MemOperand(x1, x2, PostIndex));
899 __ ld1(v9.V8B(),
900 v10.V8B(),
901 v11.V8B(),
902 v12.V8B(),
903 MemOperand(x1, 32, PostIndex));
904 __ ld1(v4.V8B(), v5.V8B(), v6.V8B(), MemOperand(x0));
905 __ ld1(v2.V8B(), v3.V8B(), v4.V8B(), MemOperand(x1, x2, PostIndex));
906 __ ld1(v12.V8B(), v13.V8B(), v14.V8B(), MemOperand(x1, 24, PostIndex));
907 __ ld1(v10.V8B(), v11.V8B(), MemOperand(x0));
908 __ ld1(v11.V8B(), v12.V8B(), MemOperand(x1, x2, PostIndex));
909 __ ld1(v27.V8B(), v28.V8B(), MemOperand(x1, 16, PostIndex));
910 __ ld1(v31.V8B(), MemOperand(x0));
911 __ ld1(v10.V8B(), MemOperand(x1, x2, PostIndex));
912 __ ld1(v28.V8B(), MemOperand(x1, 8, PostIndex));
913 __ ld1(v5.V8H(), v6.V8H(), v7.V8H(), v8.V8H(), MemOperand(x0));
914 __ ld1(v2.V8H(), v3.V8H(), v4.V8H(), v5.V8H(), MemOperand(x1, x2, PostIndex));
915 __ ld1(v10.V8H(),
916 v11.V8H(),
917 v12.V8H(),
918 v13.V8H(),
919 MemOperand(x1, 64, PostIndex));
920 __ ld1(v26.V8H(), v27.V8H(), v28.V8H(), MemOperand(x0));
921 __ ld1(v3.V8H(), v4.V8H(), v5.V8H(), MemOperand(x1, x2, PostIndex));
922 __ ld1(v17.V8H(), v18.V8H(), v19.V8H(), MemOperand(x1, 48, PostIndex));
923 __ ld1(v4.V8H(), v5.V8H(), MemOperand(x0));
924 __ ld1(v21.V8H(), v22.V8H(), MemOperand(x1, x2, PostIndex));
925 __ ld1(v4.V8H(), v5.V8H(), MemOperand(x1, 32, PostIndex));
926 __ ld1(v9.V8H(), MemOperand(x0));
927 __ ld1(v27.V8H(), MemOperand(x1, x2, PostIndex));
928 __ ld1(v26.V8H(), MemOperand(x1, 16, PostIndex));
929 __ ld1(v19.B(), 1, MemOperand(x0));
930 __ ld1(v12.B(), 3, MemOperand(x1, x2, PostIndex));
931 __ ld1(v27.B(), 12, MemOperand(x1, 1, PostIndex));
932 __ ld1(v10.D(), 1, MemOperand(x0));
933 __ ld1(v26.D(), 1, MemOperand(x1, x2, PostIndex));
934 __ ld1(v7.D(), 1, MemOperand(x1, 8, PostIndex));
935 __ ld1(v19.H(), 5, MemOperand(x0));
936 __ ld1(v10.H(), 1, MemOperand(x1, x2, PostIndex));
937 __ ld1(v5.H(), 4, MemOperand(x1, 2, PostIndex));
938 __ ld1(v21.S(), 2, MemOperand(x0));
939 __ ld1(v13.S(), 2, MemOperand(x1, x2, PostIndex));
940 __ ld1(v1.S(), 2, MemOperand(x1, 4, PostIndex));
941 __ ld1r(v2.V16B(), MemOperand(x0));
942 __ ld1r(v2.V16B(), MemOperand(x1, x2, PostIndex));
943 __ ld1r(v22.V16B(), MemOperand(x1, 1, PostIndex));
944 __ ld1r(v25.V1D(), MemOperand(x0));
945 __ ld1r(v9.V1D(), MemOperand(x1, x2, PostIndex));
946 __ ld1r(v23.V1D(), MemOperand(x1, 8, PostIndex));
947 __ ld1r(v19.V2D(), MemOperand(x0));
948 __ ld1r(v21.V2D(), MemOperand(x1, x2, PostIndex));
949 __ ld1r(v30.V2D(), MemOperand(x1, 8, PostIndex));
950 __ ld1r(v24.V2S(), MemOperand(x0));
951 __ ld1r(v26.V2S(), MemOperand(x1, x2, PostIndex));
952 __ ld1r(v28.V2S(), MemOperand(x1, 4, PostIndex));
953 __ ld1r(v19.V4H(), MemOperand(x0));
954 __ ld1r(v1.V4H(), MemOperand(x1, x2, PostIndex));
955 __ ld1r(v21.V4H(), MemOperand(x1, 2, PostIndex));
956 __ ld1r(v15.V4S(), MemOperand(x0));
957 __ ld1r(v21.V4S(), MemOperand(x1, x2, PostIndex));
958 __ ld1r(v23.V4S(), MemOperand(x1, 4, PostIndex));
959 __ ld1r(v26.V8B(), MemOperand(x0));
960 __ ld1r(v14.V8B(), MemOperand(x1, x2, PostIndex));
961 __ ld1r(v19.V8B(), MemOperand(x1, 1, PostIndex));
962 __ ld1r(v13.V8H(), MemOperand(x0));
963 __ ld1r(v30.V8H(), MemOperand(x1, x2, PostIndex));
964 __ ld1r(v27.V8H(), MemOperand(x1, 2, PostIndex));
965 __ ld2(v21.V16B(), v22.V16B(), MemOperand(x0));
966 __ ld2(v21.V16B(), v22.V16B(), MemOperand(x1, x2, PostIndex));
967 __ ld2(v12.V16B(), v13.V16B(), MemOperand(x1, 32, PostIndex));
968 __ ld2(v14.V2D(), v15.V2D(), MemOperand(x0));
969 __ ld2(v0.V2D(), v1.V2D(), MemOperand(x1, x2, PostIndex));
970 __ ld2(v12.V2D(), v13.V2D(), MemOperand(x1, 32, PostIndex));
971 __ ld2(v27.V2S(), v28.V2S(), MemOperand(x0));
972 __ ld2(v2.V2S(), v3.V2S(), MemOperand(x1, x2, PostIndex));
973 __ ld2(v12.V2S(), v13.V2S(), MemOperand(x1, 16, PostIndex));
974 __ ld2(v9.V4H(), v10.V4H(), MemOperand(x0));
975 __ ld2(v23.V4H(), v24.V4H(), MemOperand(x1, x2, PostIndex));
976 __ ld2(v1.V4H(), v2.V4H(), MemOperand(x1, 16, PostIndex));
977 __ ld2(v20.V4S(), v21.V4S(), MemOperand(x0));
978 __ ld2(v10.V4S(), v11.V4S(), MemOperand(x1, x2, PostIndex));
979 __ ld2(v24.V4S(), v25.V4S(), MemOperand(x1, 32, PostIndex));
980 __ ld2(v17.V8B(), v18.V8B(), MemOperand(x0));
981 __ ld2(v13.V8B(), v14.V8B(), MemOperand(x1, x2, PostIndex));
982 __ ld2(v7.V8B(), v8.V8B(), MemOperand(x1, 16, PostIndex));
983 __ ld2(v30.V8H(), v31.V8H(), MemOperand(x0));
984 __ ld2(v4.V8H(), v5.V8H(), MemOperand(x1, x2, PostIndex));
985 __ ld2(v13.V8H(), v14.V8H(), MemOperand(x1, 32, PostIndex));
986 __ ld2(v5.B(), v6.B(), 12, MemOperand(x0));
987 __ ld2(v16.B(), v17.B(), 7, MemOperand(x1, x2, PostIndex));
988 __ ld2(v29.B(), v30.B(), 2, MemOperand(x1, 2, PostIndex));
989 __ ld2(v11.D(), v12.D(), 1, MemOperand(x0));
990 __ ld2(v26.D(), v27.D(), 0, MemOperand(x1, x2, PostIndex));
991 __ ld2(v25.D(), v26.D(), 0, MemOperand(x1, 16, PostIndex));
992 __ ld2(v18.H(), v19.H(), 7, MemOperand(x0));
993 __ ld2(v17.H(), v18.H(), 5, MemOperand(x1, x2, PostIndex));
994 __ ld2(v30.H(), v31.H(), 2, MemOperand(x1, 4, PostIndex));
995 __ ld2(v29.S(), v30.S(), 3, MemOperand(x0));
996 __ ld2(v28.S(), v29.S(), 0, MemOperand(x1, x2, PostIndex));
997 __ ld2(v6.S(), v7.S(), 1, MemOperand(x1, 8, PostIndex));
998 __ ld2r(v26.V16B(), v27.V16B(), MemOperand(x0));
999 __ ld2r(v21.V16B(), v22.V16B(), MemOperand(x1, x2, PostIndex));
1000 __ ld2r(v5.V16B(), v6.V16B(), MemOperand(x1, 2, PostIndex));
1001 __ ld2r(v26.V1D(), v27.V1D(), MemOperand(x0));
1002 __ ld2r(v14.V1D(), v15.V1D(), MemOperand(x1, x2, PostIndex));
1003 __ ld2r(v23.V1D(), v24.V1D(), MemOperand(x1, 16, PostIndex));
1004 __ ld2r(v11.V2D(), v12.V2D(), MemOperand(x0));
1005 __ ld2r(v29.V2D(), v30.V2D(), MemOperand(x1, x2, PostIndex));
1006 __ ld2r(v15.V2D(), v16.V2D(), MemOperand(x1, 16, PostIndex));
1007 __ ld2r(v26.V2S(), v27.V2S(), MemOperand(x0));
1008 __ ld2r(v22.V2S(), v23.V2S(), MemOperand(x1, x2, PostIndex));
1009 __ ld2r(v2.V2S(), v3.V2S(), MemOperand(x1, 8, PostIndex));
1010 __ ld2r(v2.V4H(), v3.V4H(), MemOperand(x0));
1011 __ ld2r(v9.V4H(), v10.V4H(), MemOperand(x1, x2, PostIndex));
1012 __ ld2r(v6.V4H(), v7.V4H(), MemOperand(x1, 4, PostIndex));
1013 __ ld2r(v7.V4S(), v8.V4S(), MemOperand(x0));
1014 __ ld2r(v19.V4S(), v20.V4S(), MemOperand(x1, x2, PostIndex));
1015 __ ld2r(v21.V4S(), v22.V4S(), MemOperand(x1, 8, PostIndex));
1016 __ ld2r(v26.V8B(), v27.V8B(), MemOperand(x0));
1017 __ ld2r(v20.V8B(), v21.V8B(), MemOperand(x1, x2, PostIndex));
1018 __ ld2r(v11.V8B(), v12.V8B(), MemOperand(x1, 2, PostIndex));
1019 __ ld2r(v12.V8H(), v13.V8H(), MemOperand(x0));
1020 __ ld2r(v6.V8H(), v7.V8H(), MemOperand(x1, x2, PostIndex));
1021 __ ld2r(v25.V8H(), v26.V8H(), MemOperand(x1, 4, PostIndex));
1022 __ ld3(v20.V16B(), v21.V16B(), v22.V16B(), MemOperand(x0));
1023 __ ld3(v28.V16B(), v29.V16B(), v30.V16B(), MemOperand(x1, x2, PostIndex));
1024 __ ld3(v20.V16B(), v21.V16B(), v22.V16B(), MemOperand(x1, 48, PostIndex));
1025 __ ld3(v21.V2D(), v22.V2D(), v23.V2D(), MemOperand(x0));
1026 __ ld3(v18.V2D(), v19.V2D(), v20.V2D(), MemOperand(x1, x2, PostIndex));
1027 __ ld3(v27.V2D(), v28.V2D(), v29.V2D(), MemOperand(x1, 48, PostIndex));
1028 __ ld3(v7.V2S(), v8.V2S(), v9.V2S(), MemOperand(x0));
1029 __ ld3(v20.V2S(), v21.V2S(), v22.V2S(), MemOperand(x1, x2, PostIndex));
1030 __ ld3(v26.V2S(), v27.V2S(), v28.V2S(), MemOperand(x1, 24, PostIndex));
1031 __ ld3(v27.V4H(), v28.V4H(), v29.V4H(), MemOperand(x0));
1032 __ ld3(v28.V4H(), v29.V4H(), v30.V4H(), MemOperand(x1, x2, PostIndex));
1033 __ ld3(v7.V4H(), v8.V4H(), v9.V4H(), MemOperand(x1, 24, PostIndex));
1034 __ ld3(v2.V4S(), v3.V4S(), v4.V4S(), MemOperand(x0));
1035 __ ld3(v24.V4S(), v25.V4S(), v26.V4S(), MemOperand(x1, x2, PostIndex));
1036 __ ld3(v11.V4S(), v12.V4S(), v13.V4S(), MemOperand(x1, 48, PostIndex));
1037 __ ld3(v29.V8B(), v30.V8B(), v31.V8B(), MemOperand(x0));
1038 __ ld3(v1.V8B(), v2.V8B(), v3.V8B(), MemOperand(x1, x2, PostIndex));
1039 __ ld3(v12.V8B(), v13.V8B(), v14.V8B(), MemOperand(x1, 24, PostIndex));
1040 __ ld3(v22.V8H(), v23.V8H(), v24.V8H(), MemOperand(x0));
1041 __ ld3(v13.V8H(), v14.V8H(), v15.V8H(), MemOperand(x1, x2, PostIndex));
1042 __ ld3(v28.V8H(), v29.V8H(), v30.V8H(), MemOperand(x1, 48, PostIndex));
1043 __ ld3(v21.B(), v22.B(), v23.B(), 11, MemOperand(x0));
1044 __ ld3(v5.B(), v6.B(), v7.B(), 9, MemOperand(x1, x2, PostIndex));
1045 __ ld3(v23.B(), v24.B(), v25.B(), 0, MemOperand(x1, 3, PostIndex));
1046 __ ld3(v16.D(), v17.D(), v18.D(), 0, MemOperand(x0));
1047 __ ld3(v30.D(), v31.D(), v0.D(), 0, MemOperand(x1, x2, PostIndex));
1048 __ ld3(v28.D(), v29.D(), v30.D(), 1, MemOperand(x1, 24, PostIndex));
1049 __ ld3(v13.H(), v14.H(), v15.H(), 2, MemOperand(x0));
1050 __ ld3(v22.H(), v23.H(), v24.H(), 7, MemOperand(x1, x2, PostIndex));
1051 __ ld3(v14.H(), v15.H(), v16.H(), 3, MemOperand(x1, 6, PostIndex));
1052 __ ld3(v22.S(), v23.S(), v24.S(), 3, MemOperand(x0));
1053 __ ld3(v30.S(), v31.S(), v0.S(), 2, MemOperand(x1, x2, PostIndex));
1054 __ ld3(v12.S(), v13.S(), v14.S(), 1, MemOperand(x1, 12, PostIndex));
1055 __ ld3r(v24.V16B(), v25.V16B(), v26.V16B(), MemOperand(x0));
1056 __ ld3r(v24.V16B(), v25.V16B(), v26.V16B(), MemOperand(x1, x2, PostIndex));
1057 __ ld3r(v3.V16B(), v4.V16B(), v5.V16B(), MemOperand(x1, 3, PostIndex));
1058 __ ld3r(v4.V1D(), v5.V1D(), v6.V1D(), MemOperand(x0));
1059 __ ld3r(v7.V1D(), v8.V1D(), v9.V1D(), MemOperand(x1, x2, PostIndex));
1060 __ ld3r(v17.V1D(), v18.V1D(), v19.V1D(), MemOperand(x1, 24, PostIndex));
1061 __ ld3r(v16.V2D(), v17.V2D(), v18.V2D(), MemOperand(x0));
1062 __ ld3r(v20.V2D(), v21.V2D(), v22.V2D(), MemOperand(x1, x2, PostIndex));
1063 __ ld3r(v14.V2D(), v15.V2D(), v16.V2D(), MemOperand(x1, 24, PostIndex));
1064 __ ld3r(v10.V2S(), v11.V2S(), v12.V2S(), MemOperand(x0));
1065 __ ld3r(v0.V2S(), v1.V2S(), v2.V2S(), MemOperand(x1, x2, PostIndex));
1066 __ ld3r(v23.V2S(), v24.V2S(), v25.V2S(), MemOperand(x1, 12, PostIndex));
1067 __ ld3r(v22.V4H(), v23.V4H(), v24.V4H(), MemOperand(x0));
1068 __ ld3r(v6.V4H(), v7.V4H(), v8.V4H(), MemOperand(x1, x2, PostIndex));
1069 __ ld3r(v7.V4H(), v8.V4H(), v9.V4H(), MemOperand(x1, 6, PostIndex));
1070 __ ld3r(v26.V4S(), v27.V4S(), v28.V4S(), MemOperand(x0));
1071 __ ld3r(v0.V4S(), v1.V4S(), v2.V4S(), MemOperand(x1, x2, PostIndex));
1072 __ ld3r(v30.V4S(), v31.V4S(), v0.V4S(), MemOperand(x1, 12, PostIndex));
1073 __ ld3r(v2.V8B(), v3.V8B(), v4.V8B(), MemOperand(x0));
1074 __ ld3r(v10.V8B(), v11.V8B(), v12.V8B(), MemOperand(x1, x2, PostIndex));
1075 __ ld3r(v28.V8B(), v29.V8B(), v30.V8B(), MemOperand(x1, 3, PostIndex));
1076 __ ld3r(v6.V8H(), v7.V8H(), v8.V8H(), MemOperand(x0));
1077 __ ld3r(v29.V8H(), v30.V8H(), v31.V8H(), MemOperand(x1, x2, PostIndex));
1078 __ ld3r(v7.V8H(), v8.V8H(), v9.V8H(), MemOperand(x1, 6, PostIndex));
1079 __ ld4(v3.V16B(), v4.V16B(), v5.V16B(), v6.V16B(), MemOperand(x0));
1080 __ ld4(v2.V16B(),
1081 v3.V16B(),
1082 v4.V16B(),
1083 v5.V16B(),
1084 MemOperand(x1, x2, PostIndex));
1085 __ ld4(v5.V16B(),
1086 v6.V16B(),
1087 v7.V16B(),
1088 v8.V16B(),
1089 MemOperand(x1, 64, PostIndex));
1090 __ ld4(v18.V2D(), v19.V2D(), v20.V2D(), v21.V2D(), MemOperand(x0));
1091 __ ld4(v4.V2D(), v5.V2D(), v6.V2D(), v7.V2D(), MemOperand(x1, x2, PostIndex));
1092 __ ld4(v29.V2D(),
1093 v30.V2D(),
1094 v31.V2D(),
1095 v0.V2D(),
1096 MemOperand(x1, 64, PostIndex));
1097 __ ld4(v27.V2S(), v28.V2S(), v29.V2S(), v30.V2S(), MemOperand(x0));
1098 __ ld4(v24.V2S(),
1099 v25.V2S(),
1100 v26.V2S(),
1101 v27.V2S(),
1102 MemOperand(x1, x2, PostIndex));
1103 __ ld4(v4.V2S(), v5.V2S(), v6.V2S(), v7.V2S(), MemOperand(x1, 32, PostIndex));
1104 __ ld4(v16.V4H(), v17.V4H(), v18.V4H(), v19.V4H(), MemOperand(x0));
1105 __ ld4(v23.V4H(),
1106 v24.V4H(),
1107 v25.V4H(),
1108 v26.V4H(),
1109 MemOperand(x1, x2, PostIndex));
1110 __ ld4(v2.V4H(), v3.V4H(), v4.V4H(), v5.V4H(), MemOperand(x1, 32, PostIndex));
1111 __ ld4(v7.V4S(), v8.V4S(), v9.V4S(), v10.V4S(), MemOperand(x0));
1112 __ ld4(v28.V4S(),
1113 v29.V4S(),
1114 v30.V4S(),
1115 v31.V4S(),
1116 MemOperand(x1, x2, PostIndex));
1117 __ ld4(v29.V4S(),
1118 v30.V4S(),
1119 v31.V4S(),
1120 v0.V4S(),
1121 MemOperand(x1, 64, PostIndex));
1122 __ ld4(v15.V8B(), v16.V8B(), v17.V8B(), v18.V8B(), MemOperand(x0));
1123 __ ld4(v27.V8B(),
1124 v28.V8B(),
1125 v29.V8B(),
1126 v30.V8B(),
1127 MemOperand(x1, x2, PostIndex));
1128 __ ld4(v5.V8B(), v6.V8B(), v7.V8B(), v8.V8B(), MemOperand(x1, 32, PostIndex));
1129 __ ld4(v25.V8H(), v26.V8H(), v27.V8H(), v28.V8H(), MemOperand(x0));
1130 __ ld4(v2.V8H(), v3.V8H(), v4.V8H(), v5.V8H(), MemOperand(x1, x2, PostIndex));
1131 __ ld4(v20.V8H(),
1132 v21.V8H(),
1133 v22.V8H(),
1134 v23.V8H(),
1135 MemOperand(x1, 64, PostIndex));
1136 __ ld4(v20.B(), v21.B(), v22.B(), v23.B(), 3, MemOperand(x0));
1137 __ ld4(v12.B(), v13.B(), v14.B(), v15.B(), 3, MemOperand(x1, x2, PostIndex));
1138 __ ld4(v27.B(), v28.B(), v29.B(), v30.B(), 6, MemOperand(x1, 4, PostIndex));
1139 __ ld4(v28.D(), v29.D(), v30.D(), v31.D(), 1, MemOperand(x0));
1140 __ ld4(v15.D(), v16.D(), v17.D(), v18.D(), 1, MemOperand(x1, x2, PostIndex));
1141 __ ld4(v16.D(), v17.D(), v18.D(), v19.D(), 1, MemOperand(x1, 32, PostIndex));
1142 __ ld4(v2.H(), v3.H(), v4.H(), v5.H(), 6, MemOperand(x0));
1143 __ ld4(v5.H(), v6.H(), v7.H(), v8.H(), 3, MemOperand(x1, x2, PostIndex));
1144 __ ld4(v7.H(), v8.H(), v9.H(), v10.H(), 6, MemOperand(x1, 8, PostIndex));
1145 __ ld4(v6.S(), v7.S(), v8.S(), v9.S(), 1, MemOperand(x0));
1146 __ ld4(v25.S(), v26.S(), v27.S(), v28.S(), 2, MemOperand(x1, x2, PostIndex));
1147 __ ld4(v8.S(), v9.S(), v10.S(), v11.S(), 3, MemOperand(x1, 16, PostIndex));
1148 __ ld4r(v14.V16B(), v15.V16B(), v16.V16B(), v17.V16B(), MemOperand(x0));
1149 __ ld4r(v13.V16B(),
1150 v14.V16B(),
1151 v15.V16B(),
1152 v16.V16B(),
1153 MemOperand(x1, x2, PostIndex));
1154 __ ld4r(v9.V16B(),
1155 v10.V16B(),
1156 v11.V16B(),
1157 v12.V16B(),
1158 MemOperand(x1, 4, PostIndex));
1159 __ ld4r(v8.V1D(), v9.V1D(), v10.V1D(), v11.V1D(), MemOperand(x0));
1160 __ ld4r(v4.V1D(),
1161 v5.V1D(),
1162 v6.V1D(),
1163 v7.V1D(),
1164 MemOperand(x1, x2, PostIndex));
1165 __ ld4r(v26.V1D(),
1166 v27.V1D(),
1167 v28.V1D(),
1168 v29.V1D(),
1169 MemOperand(x1, 32, PostIndex));
1170 __ ld4r(v19.V2D(), v20.V2D(), v21.V2D(), v22.V2D(), MemOperand(x0));
1171 __ ld4r(v28.V2D(),
1172 v29.V2D(),
1173 v30.V2D(),
1174 v31.V2D(),
1175 MemOperand(x1, x2, PostIndex));
1176 __ ld4r(v15.V2D(),
1177 v16.V2D(),
1178 v17.V2D(),
1179 v18.V2D(),
1180 MemOperand(x1, 32, PostIndex));
1181 __ ld4r(v31.V2S(), v0.V2S(), v1.V2S(), v2.V2S(), MemOperand(x0));
1182 __ ld4r(v28.V2S(),
1183 v29.V2S(),
1184 v30.V2S(),
1185 v31.V2S(),
1186 MemOperand(x1, x2, PostIndex));
1187 __ ld4r(v11.V2S(),
1188 v12.V2S(),
1189 v13.V2S(),
1190 v14.V2S(),
1191 MemOperand(x1, 16, PostIndex));
1192 __ ld4r(v19.V4H(), v20.V4H(), v21.V4H(), v22.V4H(), MemOperand(x0));
1193 __ ld4r(v22.V4H(),
1194 v23.V4H(),
1195 v24.V4H(),
1196 v25.V4H(),
1197 MemOperand(x1, x2, PostIndex));
1198 __ ld4r(v20.V4H(),
1199 v21.V4H(),
1200 v22.V4H(),
1201 v23.V4H(),
1202 MemOperand(x1, 8, PostIndex));
1203 __ ld4r(v16.V4S(), v17.V4S(), v18.V4S(), v19.V4S(), MemOperand(x0));
1204 __ ld4r(v25.V4S(),
1205 v26.V4S(),
1206 v27.V4S(),
1207 v28.V4S(),
1208 MemOperand(x1, x2, PostIndex));
1209 __ ld4r(v23.V4S(),
1210 v24.V4S(),
1211 v25.V4S(),
1212 v26.V4S(),
1213 MemOperand(x1, 16, PostIndex));
1214 __ ld4r(v22.V8B(), v23.V8B(), v24.V8B(), v25.V8B(), MemOperand(x0));
1215 __ ld4r(v27.V8B(),
1216 v28.V8B(),
1217 v29.V8B(),
1218 v30.V8B(),
1219 MemOperand(x1, x2, PostIndex));
1220 __ ld4r(v29.V8B(),
1221 v30.V8B(),
1222 v31.V8B(),
1223 v0.V8B(),
1224 MemOperand(x1, 4, PostIndex));
1225 __ ld4r(v28.V8H(), v29.V8H(), v30.V8H(), v31.V8H(), MemOperand(x0));
1226 __ ld4r(v25.V8H(),
1227 v26.V8H(),
1228 v27.V8H(),
1229 v28.V8H(),
1230 MemOperand(x1, x2, PostIndex));
1231 __ ld4r(v22.V8H(),
1232 v23.V8H(),
1233 v24.V8H(),
1234 v25.V8H(),
1235 MemOperand(x1, 8, PostIndex));
1236 __ mla(v29.V16B(), v7.V16B(), v26.V16B());
1237 __ mla(v6.V2S(), v4.V2S(), v14.V2S());
1238 __ mla(v9.V2S(), v11.V2S(), v0.S(), 2);
1239 __ mla(v5.V4H(), v17.V4H(), v25.V4H());
1240 __ mla(v24.V4H(), v7.V4H(), v11.H(), 3);
1241 __ mla(v12.V4S(), v3.V4S(), v4.V4S());
1242 __ mla(v10.V4S(), v7.V4S(), v7.S(), 3);
1243 __ mla(v3.V8B(), v16.V8B(), v9.V8B());
1244 __ mla(v19.V8H(), v22.V8H(), v18.V8H());
1245 __ mla(v6.V8H(), v2.V8H(), v0.H(), 0);
1246 __ mls(v23.V16B(), v10.V16B(), v11.V16B());
1247 __ mls(v14.V2S(), v31.V2S(), v22.V2S());
1248 __ mls(v28.V2S(), v13.V2S(), v1.S(), 3);
1249 __ mls(v2.V4H(), v19.V4H(), v13.V4H());
1250 __ mls(v18.V4H(), v15.V4H(), v12.H(), 6);
1251 __ mls(v6.V4S(), v11.V4S(), v16.V4S());
1252 __ mls(v23.V4S(), v16.V4S(), v10.S(), 2);
1253 __ mls(v26.V8B(), v13.V8B(), v23.V8B());
1254 __ mls(v10.V8H(), v10.V8H(), v12.V8H());
1255 __ mls(v14.V8H(), v0.V8H(), v14.H(), 7);
1256 __ mov(b22, v1.B(), 3);
1257 __ mov(d7, v13.D(), 1);
1258 __ mov(h26, v21.H(), 2);
1259 __ mov(s26, v19.S(), 0);
1260 __ mov(v26.V16B(), v11.V16B());
1261 __ mov(v20.V8B(), v0.V8B());
1262 __ mov(v19.B(), 13, v6.B(), 4);
1263 __ mov(v4.B(), 13, w19);
1264 __ mov(v11.D(), 1, v8.D(), 0);
1265 __ mov(v3.D(), 0, x30);
1266 __ mov(v29.H(), 4, v11.H(), 7);
1267 __ mov(v2.H(), 6, w6);
1268 __ mov(v22.S(), 0, v5.S(), 2);
1269 __ mov(v24.S(), 3, w8);
1270 __ mov(w18, v1.S(), 3);
1271 __ mov(x28, v21.D(), 0);
1272 __ movi(d24, 0xffff0000ffffff);
1273 __ movi(v29.V16B(), 0x80);
1274 __ movi(v12.V2D(), 0xffff00ff00ffff00);
1275 __ movi(v12.V2S(), 0xec, LSL, 24);
1276 __ movi(v10.V2S(), 0x4c, MSL, 16);
1277 __ movi(v26.V4H(), 0xc0, LSL);
1278 __ movi(v24.V4S(), 0x98, LSL, 16);
1279 __ movi(v1.V4S(), 0xde, MSL, 16);
1280 __ movi(v21.V8B(), 0x4d);
1281 __ movi(v29.V8H(), 0x69, LSL);
1282 __ mul(v1.V16B(), v15.V16B(), v17.V16B());
1283 __ mul(v21.V2S(), v19.V2S(), v29.V2S());
1284 __ mul(v19.V2S(), v5.V2S(), v3.S(), 0);
1285 __ mul(v29.V4H(), v11.V4H(), v2.V4H());
1286 __ mul(v2.V4H(), v7.V4H(), v0.H(), 0);
1287 __ mul(v25.V4S(), v26.V4S(), v16.V4S());
1288 __ mul(v26.V4S(), v6.V4S(), v15.S(), 2);
1289 __ mul(v11.V8B(), v15.V8B(), v31.V8B());
1290 __ mul(v20.V8H(), v31.V8H(), v15.V8H());
1291 __ mul(v29.V8H(), v5.V8H(), v9.H(), 4);
1292 __ mvn(v13.V16B(), v21.V16B());
1293 __ mvn(v28.V8B(), v19.V8B());
1294 __ mvni(v25.V2S(), 0xb8, LSL, 8);
1295 __ mvni(v17.V2S(), 0x6c, MSL, 16);
1296 __ mvni(v29.V4H(), 0x48, LSL);
1297 __ mvni(v20.V4S(), 0x7a, LSL, 16);
1298 __ mvni(v0.V4S(), 0x1e, MSL, 8);
1299 __ mvni(v31.V8H(), 0x3e, LSL);
1300 __ neg(d25, d11);
1301 __ neg(v4.V16B(), v9.V16B());
1302 __ neg(v11.V2D(), v25.V2D());
1303 __ neg(v7.V2S(), v18.V2S());
1304 __ neg(v7.V4H(), v15.V4H());
1305 __ neg(v17.V4S(), v18.V4S());
1306 __ neg(v20.V8B(), v17.V8B());
1307 __ neg(v0.V8H(), v11.V8H());
1308 __ orn(v13.V16B(), v11.V16B(), v31.V16B());
1309 __ orn(v22.V8B(), v16.V8B(), v22.V8B());
1310 __ orr(v17.V16B(), v17.V16B(), v23.V16B());
1311 __ orr(v8.V2S(), 0xe3);
1312 __ orr(v11.V4H(), 0x97, 8);
1313 __ orr(v7.V4S(), 0xab);
1314 __ orr(v8.V8B(), v4.V8B(), v3.V8B());
1315 __ orr(v31.V8H(), 0xb0, 8);
1316 __ pmul(v11.V16B(), v18.V16B(), v23.V16B());
1317 __ pmul(v8.V8B(), v24.V8B(), v5.V8B());
1318 __ pmull(v24.V8H(), v18.V8B(), v22.V8B());
1319 __ pmull2(v13.V8H(), v3.V16B(), v21.V16B());
1320 __ raddhn(v22.V2S(), v10.V2D(), v21.V2D());
1321 __ raddhn(v5.V4H(), v13.V4S(), v13.V4S());
1322 __ raddhn(v10.V8B(), v17.V8H(), v26.V8H());
1323 __ raddhn2(v9.V16B(), v29.V8H(), v13.V8H());
1324 __ raddhn2(v27.V4S(), v23.V2D(), v26.V2D());
1325 __ raddhn2(v0.V8H(), v29.V4S(), v7.V4S());
1326 __ rbit(v22.V16B(), v15.V16B());
1327 __ rbit(v30.V8B(), v3.V8B());
1328 __ rev16(v31.V16B(), v27.V16B());
1329 __ rev16(v12.V8B(), v26.V8B());
1330 __ rev32(v5.V16B(), v4.V16B());
1331 __ rev32(v16.V4H(), v26.V4H());
1332 __ rev32(v20.V8B(), v3.V8B());
1333 __ rev32(v20.V8H(), v28.V8H());
1334 __ rev64(v9.V16B(), v19.V16B());
1335 __ rev64(v5.V2S(), v16.V2S());
1336 __ rev64(v7.V4H(), v31.V4H());
1337 __ rev64(v15.V4S(), v26.V4S());
1338 __ rev64(v25.V8B(), v9.V8B());
1339 __ rev64(v11.V8H(), v5.V8H());
1340 __ rshrn(v18.V2S(), v13.V2D(), 1);
1341 __ rshrn(v25.V4H(), v30.V4S(), 2);
1342 __ rshrn(v13.V8B(), v9.V8H(), 8);
1343 __ rshrn2(v3.V16B(), v6.V8H(), 8);
1344 __ rshrn2(v0.V4S(), v29.V2D(), 25);
1345 __ rshrn2(v27.V8H(), v26.V4S(), 15);
1346 __ rsubhn(v15.V2S(), v25.V2D(), v4.V2D());
1347 __ rsubhn(v23.V4H(), v9.V4S(), v3.V4S());
1348 __ rsubhn(v6.V8B(), v30.V8H(), v24.V8H());
1349 __ rsubhn2(v4.V16B(), v24.V8H(), v20.V8H());
1350 __ rsubhn2(v1.V4S(), v23.V2D(), v22.V2D());
1351 __ rsubhn2(v19.V8H(), v2.V4S(), v20.V4S());
1352 __ saba(v28.V16B(), v9.V16B(), v25.V16B());
1353 __ saba(v9.V2S(), v28.V2S(), v20.V2S());
1354 __ saba(v17.V4H(), v22.V4H(), v22.V4H());
1355 __ saba(v29.V4S(), v5.V4S(), v27.V4S());
1356 __ saba(v20.V8B(), v21.V8B(), v18.V8B());
1357 __ saba(v27.V8H(), v17.V8H(), v30.V8H());
1358 __ sabal(v20.V2D(), v13.V2S(), v7.V2S());
1359 __ sabal(v4.V4S(), v12.V4H(), v4.V4H());
1360 __ sabal(v23.V8H(), v24.V8B(), v20.V8B());
1361 __ sabal2(v26.V2D(), v21.V4S(), v18.V4S());
1362 __ sabal2(v27.V4S(), v28.V8H(), v8.V8H());
1363 __ sabal2(v12.V8H(), v16.V16B(), v21.V16B());
1364 __ sabd(v0.V16B(), v15.V16B(), v13.V16B());
1365 __ sabd(v15.V2S(), v7.V2S(), v30.V2S());
1366 __ sabd(v17.V4H(), v17.V4H(), v12.V4H());
1367 __ sabd(v7.V4S(), v4.V4S(), v22.V4S());
1368 __ sabd(v23.V8B(), v3.V8B(), v26.V8B());
1369 __ sabd(v20.V8H(), v28.V8H(), v5.V8H());
1370 __ sabdl(v27.V2D(), v22.V2S(), v20.V2S());
1371 __ sabdl(v31.V4S(), v20.V4H(), v23.V4H());
1372 __ sabdl(v0.V8H(), v20.V8B(), v27.V8B());
1373 __ sabdl2(v31.V2D(), v11.V4S(), v3.V4S());
1374 __ sabdl2(v26.V4S(), v11.V8H(), v27.V8H());
1375 __ sabdl2(v6.V8H(), v8.V16B(), v18.V16B());
1376 __ sadalp(v8.V1D(), v26.V2S());
1377 __ sadalp(v12.V2D(), v26.V4S());
1378 __ sadalp(v12.V2S(), v26.V4H());
1379 __ sadalp(v4.V4H(), v1.V8B());
1380 __ sadalp(v15.V4S(), v17.V8H());
1381 __ sadalp(v21.V8H(), v25.V16B());
1382 __ saddl(v5.V2D(), v10.V2S(), v14.V2S());
1383 __ saddl(v18.V4S(), v3.V4H(), v15.V4H());
1384 __ saddl(v15.V8H(), v2.V8B(), v23.V8B());
1385 __ saddl2(v16.V2D(), v16.V4S(), v27.V4S());
1386 __ saddl2(v6.V4S(), v24.V8H(), v0.V8H());
1387 __ saddl2(v7.V8H(), v20.V16B(), v28.V16B());
1388 __ saddlp(v10.V1D(), v25.V2S());
1389 __ saddlp(v15.V2D(), v16.V4S());
1390 __ saddlp(v18.V2S(), v10.V4H());
1391 __ saddlp(v29.V4H(), v26.V8B());
1392 __ saddlp(v10.V4S(), v1.V8H());
1393 __ saddlp(v0.V8H(), v21.V16B());
1394 __ saddlv(d12, v7.V4S());
1395 __ saddlv(h14, v28.V16B());
1396 __ saddlv(h30, v30.V8B());
1397 __ saddlv(s27, v3.V4H());
1398 __ saddlv(s16, v16.V8H());
1399 __ saddw(v24.V2D(), v11.V2D(), v18.V2S());
1400 __ saddw(v13.V4S(), v12.V4S(), v6.V4H());
1401 __ saddw(v19.V8H(), v19.V8H(), v7.V8B());
1402 __ saddw2(v27.V2D(), v9.V2D(), v26.V4S());
1403 __ saddw2(v19.V4S(), v23.V4S(), v21.V8H());
1404 __ saddw2(v15.V8H(), v25.V8H(), v30.V16B());
1405 __ shadd(v7.V16B(), v4.V16B(), v9.V16B());
1406 __ shadd(v29.V2S(), v25.V2S(), v24.V2S());
1407 __ shadd(v31.V4H(), v10.V4H(), v13.V4H());
1408 __ shadd(v21.V4S(), v16.V4S(), v8.V4S());
1409 __ shadd(v14.V8B(), v29.V8B(), v22.V8B());
1410 __ shadd(v19.V8H(), v24.V8H(), v20.V8H());
1411 __ shl(d22, d25, 23);
1412 __ shl(v5.V16B(), v17.V16B(), 7);
1413 __ shl(v2.V2D(), v4.V2D(), 21);
1414 __ shl(v4.V2S(), v3.V2S(), 26);
1415 __ shl(v3.V4H(), v28.V4H(), 8);
1416 __ shl(v4.V4S(), v31.V4S(), 24);
1417 __ shl(v18.V8B(), v16.V8B(), 2);
1418 __ shl(v0.V8H(), v11.V8H(), 3);
1419 __ shll(v5.V2D(), v24.V2S(), 32);
1420 __ shll(v26.V4S(), v20.V4H(), 16);
1421 __ shll(v5.V8H(), v9.V8B(), 8);
1422 __ shll2(v21.V2D(), v28.V4S(), 32);
1423 __ shll2(v22.V4S(), v1.V8H(), 16);
1424 __ shll2(v30.V8H(), v25.V16B(), 8);
1425 __ shrn(v5.V2S(), v1.V2D(), 28);
1426 __ shrn(v29.V4H(), v18.V4S(), 7);
1427 __ shrn(v17.V8B(), v29.V8H(), 2);
1428 __ shrn2(v5.V16B(), v30.V8H(), 3);
1429 __ shrn2(v24.V4S(), v1.V2D(), 1);
1430 __ shrn2(v5.V8H(), v14.V4S(), 16);
1431 __ shsub(v30.V16B(), v22.V16B(), v23.V16B());
1432 __ shsub(v22.V2S(), v27.V2S(), v25.V2S());
1433 __ shsub(v13.V4H(), v22.V4H(), v1.V4H());
1434 __ shsub(v10.V4S(), v8.V4S(), v23.V4S());
1435 __ shsub(v6.V8B(), v9.V8B(), v31.V8B());
1436 __ shsub(v8.V8H(), v31.V8H(), v8.V8H());
1437 __ sli(d19, d29, 20);
1438 __ sli(v9.V16B(), v24.V16B(), 0);
1439 __ sli(v22.V2D(), v9.V2D(), 10);
1440 __ sli(v11.V2S(), v27.V2S(), 20);
1441 __ sli(v16.V4H(), v15.V4H(), 5);
1442 __ sli(v8.V4S(), v8.V4S(), 25);
1443 __ sli(v10.V8B(), v30.V8B(), 0);
1444 __ sli(v7.V8H(), v28.V8H(), 6);
1445 __ smax(v18.V16B(), v8.V16B(), v1.V16B());
1446 __ smax(v30.V2S(), v5.V2S(), v1.V2S());
1447 __ smax(v17.V4H(), v25.V4H(), v19.V4H());
1448 __ smax(v1.V4S(), v24.V4S(), v31.V4S());
1449 __ smax(v17.V8B(), v24.V8B(), v24.V8B());
1450 __ smax(v11.V8H(), v26.V8H(), v10.V8H());
1451 __ smaxp(v12.V16B(), v14.V16B(), v7.V16B());
1452 __ smaxp(v31.V2S(), v24.V2S(), v6.V2S());
1453 __ smaxp(v10.V4H(), v29.V4H(), v10.V4H());
1454 __ smaxp(v18.V4S(), v11.V4S(), v7.V4S());
1455 __ smaxp(v21.V8B(), v0.V8B(), v18.V8B());
1456 __ smaxp(v26.V8H(), v8.V8H(), v15.V8H());
1457 __ smaxv(b4, v5.V16B());
1458 __ smaxv(b23, v0.V8B());
1459 __ smaxv(h6, v0.V4H());
1460 __ smaxv(h24, v8.V8H());
1461 __ smaxv(s3, v16.V4S());
1462 __ smin(v24.V16B(), v8.V16B(), v18.V16B());
1463 __ smin(v29.V2S(), v8.V2S(), v23.V2S());
1464 __ smin(v6.V4H(), v11.V4H(), v21.V4H());
1465 __ smin(v24.V4S(), v23.V4S(), v15.V4S());
1466 __ smin(v8.V8B(), v16.V8B(), v4.V8B());
1467 __ smin(v12.V8H(), v1.V8H(), v10.V8H());
1468 __ sminp(v13.V16B(), v18.V16B(), v28.V16B());
1469 __ sminp(v22.V2S(), v28.V2S(), v16.V2S());
1470 __ sminp(v15.V4H(), v12.V4H(), v5.V4H());
1471 __ sminp(v15.V4S(), v17.V4S(), v8.V4S());
1472 __ sminp(v21.V8B(), v2.V8B(), v6.V8B());
1473 __ sminp(v21.V8H(), v12.V8H(), v6.V8H());
1474 __ sminv(b8, v6.V16B());
1475 __ sminv(b6, v18.V8B());
1476 __ sminv(h20, v1.V4H());
1477 __ sminv(h7, v17.V8H());
1478 __ sminv(s21, v4.V4S());
1479 __ smlal(v24.V2D(), v14.V2S(), v21.V2S());
1480 __ smlal(v31.V2D(), v3.V2S(), v14.S(), 2);
1481 __ smlal(v7.V4S(), v20.V4H(), v21.V4H());
1482 __ smlal(v19.V4S(), v16.V4H(), v9.H(), 3);
1483 __ smlal(v29.V8H(), v14.V8B(), v1.V8B());
1484 __ smlal2(v30.V2D(), v26.V4S(), v16.V4S());
1485 __ smlal2(v31.V2D(), v30.V4S(), v1.S(), 0);
1486 __ smlal2(v17.V4S(), v6.V8H(), v3.V8H());
1487 __ smlal2(v11.V4S(), v31.V8H(), v5.H(), 7);
1488 __ smlal2(v30.V8H(), v16.V16B(), v29.V16B());
1489 __ smlsl(v1.V2D(), v20.V2S(), v17.V2S());
1490 __ smlsl(v29.V2D(), v12.V2S(), v5.S(), 3);
1491 __ smlsl(v0.V4S(), v26.V4H(), v1.V4H());
1492 __ smlsl(v3.V4S(), v5.V4H(), v6.H(), 5);
1493 __ smlsl(v4.V8H(), v0.V8B(), v26.V8B());
1494 __ smlsl2(v14.V2D(), v14.V4S(), v5.V4S());
1495 __ smlsl2(v15.V2D(), v5.V4S(), v0.S(), 1);
1496 __ smlsl2(v29.V4S(), v17.V8H(), v31.V8H());
1497 __ smlsl2(v6.V4S(), v15.V8H(), v9.H(), 6);
1498 __ smlsl2(v30.V8H(), v15.V16B(), v15.V16B());
1499 __ smov(w21, v6.B(), 3);
1500 __ smov(w13, v26.H(), 7);
1501 __ smov(x24, v16.B(), 7);
1502 __ smov(x7, v4.H(), 3);
1503 __ smov(x29, v7.S(), 1);
1504 __ smull(v4.V2D(), v29.V2S(), v17.V2S());
1505 __ smull(v30.V2D(), v21.V2S(), v6.S(), 2);
1506 __ smull(v23.V4S(), v5.V4H(), v23.V4H());
1507 __ smull(v8.V4S(), v9.V4H(), v2.H(), 1);
1508 __ smull(v31.V8H(), v17.V8B(), v1.V8B());
1509 __ smull2(v3.V2D(), v3.V4S(), v23.V4S());
1510 __ smull2(v15.V2D(), v29.V4S(), v6.S(), 1);
1511 __ smull2(v19.V4S(), v20.V8H(), v30.V8H());
1512 __ smull2(v6.V4S(), v10.V8H(), v7.H(), 4);
1513 __ smull2(v25.V8H(), v8.V16B(), v27.V16B());
1514 __ sqabs(b3, b15);
1515 __ sqabs(d14, d9);
1516 __ sqabs(h31, h28);
1517 __ sqabs(s8, s0);
1518 __ sqabs(v14.V16B(), v7.V16B());
1519 __ sqabs(v23.V2D(), v19.V2D());
1520 __ sqabs(v10.V2S(), v24.V2S());
1521 __ sqabs(v31.V4H(), v19.V4H());
1522 __ sqabs(v23.V4S(), v0.V4S());
1523 __ sqabs(v29.V8B(), v23.V8B());
1524 __ sqabs(v17.V8H(), v21.V8H());
1525 __ sqadd(b9, b23, b13);
1526 __ sqadd(d2, d25, d26);
1527 __ sqadd(h7, h29, h25);
1528 __ sqadd(s11, s7, s24);
1529 __ sqadd(v20.V16B(), v16.V16B(), v29.V16B());
1530 __ sqadd(v23.V2D(), v30.V2D(), v28.V2D());
1531 __ sqadd(v8.V2S(), v19.V2S(), v2.V2S());
1532 __ sqadd(v20.V4H(), v12.V4H(), v31.V4H());
1533 __ sqadd(v14.V4S(), v15.V4S(), v17.V4S());
1534 __ sqadd(v2.V8B(), v29.V8B(), v13.V8B());
1535 __ sqadd(v7.V8H(), v19.V8H(), v14.V8H());
1536 __ sqdmlal(d15, s5, s30);
1537 __ sqdmlal(d24, s10, v2.S(), 3);
1538 __ sqdmlal(s9, h19, h8);
1539 __ sqdmlal(s14, h1, v12.H(), 3);
1540 __ sqdmlal(v30.V2D(), v5.V2S(), v31.V2S());
1541 __ sqdmlal(v25.V2D(), v14.V2S(), v10.S(), 1);
1542 __ sqdmlal(v19.V4S(), v17.V4H(), v16.V4H());
1543 __ sqdmlal(v8.V4S(), v5.V4H(), v8.H(), 1);
1544 __ sqdmlal2(v1.V2D(), v23.V4S(), v3.V4S());
1545 __ sqdmlal2(v19.V2D(), v0.V4S(), v9.S(), 0);
1546 __ sqdmlal2(v26.V4S(), v22.V8H(), v11.V8H());
1547 __ sqdmlal2(v6.V4S(), v28.V8H(), v13.H(), 4);
1548 __ sqdmlsl(d10, s29, s20);
1549 __ sqdmlsl(d10, s9, v10.S(), 1);
1550 __ sqdmlsl(s30, h9, h24);
1551 __ sqdmlsl(s13, h24, v6.H(), 1);
1552 __ sqdmlsl(v27.V2D(), v10.V2S(), v20.V2S());
1553 __ sqdmlsl(v23.V2D(), v23.V2S(), v3.S(), 3);
1554 __ sqdmlsl(v7.V4S(), v17.V4H(), v29.V4H());
1555 __ sqdmlsl(v22.V4S(), v21.V4H(), v3.H(), 4);
1556 __ sqdmlsl2(v12.V2D(), v7.V4S(), v22.V4S());
1557 __ sqdmlsl2(v20.V2D(), v25.V4S(), v8.S(), 0);
1558 __ sqdmlsl2(v25.V4S(), v26.V8H(), v18.V8H());
1559 __ sqdmlsl2(v25.V4S(), v19.V8H(), v5.H(), 0);
1560 __ sqdmulh(h17, h27, h12);
1561 __ sqdmulh(h16, h5, v11.H(), 0);
1562 __ sqdmulh(s1, s19, s16);
1563 __ sqdmulh(s1, s16, v2.S(), 0);
1564 __ sqdmulh(v28.V2S(), v1.V2S(), v8.V2S());
1565 __ sqdmulh(v28.V2S(), v8.V2S(), v3.S(), 0);
1566 __ sqdmulh(v11.V4H(), v25.V4H(), v5.V4H());
1567 __ sqdmulh(v30.V4H(), v14.V4H(), v8.H(), 5);
1568 __ sqdmulh(v25.V4S(), v21.V4S(), v13.V4S());
1569 __ sqdmulh(v23.V4S(), v2.V4S(), v10.S(), 3);
1570 __ sqdmulh(v26.V8H(), v5.V8H(), v23.V8H());
1571 __ sqdmulh(v4.V8H(), v22.V8H(), v4.H(), 3);
1572 __ sqdmull(d25, s2, s26);
1573 __ sqdmull(d30, s14, v5.S(), 1);
1574 __ sqdmull(s29, h18, h11);
1575 __ sqdmull(s11, h13, v7.H(), 6);
1576 __ sqdmull(v23.V2D(), v9.V2S(), v8.V2S());
1577 __ sqdmull(v18.V2D(), v29.V2S(), v4.S(), 1);
1578 __ sqdmull(v17.V4S(), v24.V4H(), v7.V4H());
1579 __ sqdmull(v8.V4S(), v15.V4H(), v5.H(), 1);
1580 __ sqdmull2(v28.V2D(), v14.V4S(), v2.V4S());
1581 __ sqdmull2(v1.V2D(), v24.V4S(), v13.S(), 2);
1582 __ sqdmull2(v11.V4S(), v17.V8H(), v31.V8H());
1583 __ sqdmull2(v1.V4S(), v20.V8H(), v11.H(), 3);
1584 __ sqneg(b2, b0);
1585 __ sqneg(d24, d2);
1586 __ sqneg(h29, h3);
1587 __ sqneg(s4, s9);
1588 __ sqneg(v14.V16B(), v29.V16B());
1589 __ sqneg(v30.V2D(), v12.V2D());
1590 __ sqneg(v28.V2S(), v26.V2S());
1591 __ sqneg(v4.V4H(), v4.V4H());
1592 __ sqneg(v9.V4S(), v8.V4S());
1593 __ sqneg(v20.V8B(), v20.V8B());
1594 __ sqneg(v27.V8H(), v10.V8H());
1595 __ sqrdmulh(h7, h24, h0);
1596 __ sqrdmulh(h14, h3, v4.H(), 6);
1597 __ sqrdmulh(s27, s19, s24);
1598 __ sqrdmulh(s31, s21, v4.S(), 0);
1599 __ sqrdmulh(v18.V2S(), v25.V2S(), v1.V2S());
1600 __ sqrdmulh(v22.V2S(), v5.V2S(), v13.S(), 0);
1601 __ sqrdmulh(v22.V4H(), v24.V4H(), v9.V4H());
1602 __ sqrdmulh(v13.V4H(), v2.V4H(), v12.H(), 6);
1603 __ sqrdmulh(v9.V4S(), v27.V4S(), v2.V4S());
1604 __ sqrdmulh(v3.V4S(), v23.V4S(), v7.S(), 1);
1605 __ sqrdmulh(v2.V8H(), v0.V8H(), v7.V8H());
1606 __ sqrdmulh(v16.V8H(), v9.V8H(), v8.H(), 2);
1607 __ sqrshl(b8, b21, b13);
1608 __ sqrshl(d29, d7, d20);
1609 __ sqrshl(h28, h14, h10);
1610 __ sqrshl(s26, s18, s2);
1611 __ sqrshl(v18.V16B(), v31.V16B(), v26.V16B());
1612 __ sqrshl(v28.V2D(), v4.V2D(), v0.V2D());
1613 __ sqrshl(v3.V2S(), v6.V2S(), v0.V2S());
1614 __ sqrshl(v1.V4H(), v18.V4H(), v22.V4H());
1615 __ sqrshl(v16.V4S(), v25.V4S(), v7.V4S());
1616 __ sqrshl(v0.V8B(), v21.V8B(), v5.V8B());
1617 __ sqrshl(v30.V8H(), v19.V8H(), v8.V8H());
1618 __ sqrshrn(b6, h21, 4);
1619 __ sqrshrn(h14, s17, 11);
1620 __ sqrshrn(s25, d27, 10);
1621 __ sqrshrn(v6.V2S(), v13.V2D(), 18);
1622 __ sqrshrn(v5.V4H(), v9.V4S(), 15);
1623 __ sqrshrn(v19.V8B(), v12.V8H(), 1);
1624 __ sqrshrn2(v19.V16B(), v21.V8H(), 7);
1625 __ sqrshrn2(v29.V4S(), v24.V2D(), 13);
1626 __ sqrshrn2(v12.V8H(), v2.V4S(), 10);
1627 __ sqrshrun(b16, h9, 5);
1628 __ sqrshrun(h3, s24, 15);
1629 __ sqrshrun(s16, d18, 8);
1630 __ sqrshrun(v28.V2S(), v23.V2D(), 8);
1631 __ sqrshrun(v31.V4H(), v25.V4S(), 10);
1632 __ sqrshrun(v19.V8B(), v23.V8H(), 2);
1633 __ sqrshrun2(v24.V16B(), v0.V8H(), 8);
1634 __ sqrshrun2(v22.V4S(), v1.V2D(), 23);
1635 __ sqrshrun2(v28.V8H(), v21.V4S(), 13);
1636 __ sqshl(b6, b21, b8);
1637 __ sqshl(b11, b26, 2);
1638 __ sqshl(d29, d0, d4);
1639 __ sqshl(d21, d7, 35);
1640 __ sqshl(h20, h25, h17);
1641 __ sqshl(h20, h0, 8);
1642 __ sqshl(s29, s13, s4);
1643 __ sqshl(s10, s11, 20);
1644 __ sqshl(v8.V16B(), v18.V16B(), v28.V16B());
1645 __ sqshl(v29.V16B(), v29.V16B(), 2);
1646 __ sqshl(v8.V2D(), v31.V2D(), v16.V2D());
1647 __ sqshl(v7.V2D(), v14.V2D(), 37);
1648 __ sqshl(v0.V2S(), v26.V2S(), v7.V2S());
1649 __ sqshl(v5.V2S(), v11.V2S(), 19);
1650 __ sqshl(v11.V4H(), v30.V4H(), v0.V4H());
1651 __ sqshl(v1.V4H(), v18.V4H(), 7);
1652 __ sqshl(v22.V4S(), v3.V4S(), v30.V4S());
1653 __ sqshl(v16.V4S(), v15.V4S(), 28);
1654 __ sqshl(v6.V8B(), v28.V8B(), v25.V8B());
1655 __ sqshl(v0.V8B(), v15.V8B(), 0);
1656 __ sqshl(v6.V8H(), v16.V8H(), v30.V8H());
1657 __ sqshl(v3.V8H(), v20.V8H(), 14);
1658 __ sqshlu(b13, b14, 6);
1659 __ sqshlu(d0, d16, 44);
1660 __ sqshlu(h5, h29, 15);
1661 __ sqshlu(s29, s8, 13);
1662 __ sqshlu(v27.V16B(), v20.V16B(), 2);
1663 __ sqshlu(v24.V2D(), v12.V2D(), 11);
1664 __ sqshlu(v12.V2S(), v19.V2S(), 22);
1665 __ sqshlu(v8.V4H(), v12.V4H(), 11);
1666 __ sqshlu(v18.V4S(), v3.V4S(), 8);
1667 __ sqshlu(v3.V8B(), v10.V8B(), 1);
1668 __ sqshlu(v30.V8H(), v24.V8H(), 4);
1669 __ sqshrn(b1, h28, 1);
1670 __ sqshrn(h31, s7, 10);
1671 __ sqshrn(s4, d10, 24);
1672 __ sqshrn(v10.V2S(), v1.V2D(), 29);
1673 __ sqshrn(v3.V4H(), v13.V4S(), 14);
1674 __ sqshrn(v27.V8B(), v6.V8H(), 7);
1675 __ sqshrn2(v14.V16B(), v23.V8H(), 1);
1676 __ sqshrn2(v25.V4S(), v22.V2D(), 27);
1677 __ sqshrn2(v31.V8H(), v12.V4S(), 10);
1678 __ sqshrun(b9, h0, 1);
1679 __ sqshrun(h11, s6, 7);
1680 __ sqshrun(s13, d12, 13);
1681 __ sqshrun(v10.V2S(), v30.V2D(), 1);
1682 __ sqshrun(v31.V4H(), v3.V4S(), 11);
1683 __ sqshrun(v28.V8B(), v30.V8H(), 8);
1684 __ sqshrun2(v16.V16B(), v27.V8H(), 3);
1685 __ sqshrun2(v27.V4S(), v14.V2D(), 18);
1686 __ sqshrun2(v23.V8H(), v14.V4S(), 1);
1687 __ sqsub(b19, b29, b11);
1688 __ sqsub(d21, d31, d6);
1689 __ sqsub(h18, h10, h19);
1690 __ sqsub(s6, s5, s0);
1691 __ sqsub(v21.V16B(), v22.V16B(), v0.V16B());
1692 __ sqsub(v22.V2D(), v10.V2D(), v17.V2D());
1693 __ sqsub(v8.V2S(), v21.V2S(), v2.V2S());
1694 __ sqsub(v18.V4H(), v25.V4H(), v27.V4H());
1695 __ sqsub(v13.V4S(), v3.V4S(), v6.V4S());
1696 __ sqsub(v28.V8B(), v29.V8B(), v16.V8B());
1697 __ sqsub(v17.V8H(), v6.V8H(), v10.V8H());
1698 __ sqxtn(b27, h26);
1699 __ sqxtn(h17, s11);
1700 __ sqxtn(s22, d31);
1701 __ sqxtn(v26.V2S(), v5.V2D());
1702 __ sqxtn(v13.V4H(), v7.V4S());
1703 __ sqxtn(v19.V8B(), v19.V8H());
1704 __ sqxtn2(v19.V16B(), v3.V8H());
1705 __ sqxtn2(v23.V4S(), v1.V2D());
1706 __ sqxtn2(v13.V8H(), v3.V4S());
1707 __ sqxtun(b26, h9);
1708 __ sqxtun(h19, s12);
1709 __ sqxtun(s3, d6);
1710 __ sqxtun(v29.V2S(), v26.V2D());
1711 __ sqxtun(v26.V4H(), v10.V4S());
1712 __ sqxtun(v7.V8B(), v29.V8H());
1713 __ sqxtun2(v21.V16B(), v14.V8H());
1714 __ sqxtun2(v24.V4S(), v15.V2D());
1715 __ sqxtun2(v30.V8H(), v1.V4S());
1716 __ srhadd(v21.V16B(), v17.V16B(), v15.V16B());
1717 __ srhadd(v28.V2S(), v21.V2S(), v29.V2S());
1718 __ srhadd(v9.V4H(), v1.V4H(), v30.V4H());
1719 __ srhadd(v24.V4S(), v0.V4S(), v2.V4S());
1720 __ srhadd(v6.V8B(), v17.V8B(), v15.V8B());
1721 __ srhadd(v5.V8H(), v7.V8H(), v21.V8H());
1722 __ sri(d14, d14, 49);
1723 __ sri(v23.V16B(), v8.V16B(), 4);
1724 __ sri(v20.V2D(), v13.V2D(), 20);
1725 __ sri(v16.V2S(), v2.V2S(), 24);
1726 __ sri(v5.V4H(), v23.V4H(), 11);
1727 __ sri(v27.V4S(), v15.V4S(), 23);
1728 __ sri(v19.V8B(), v29.V8B(), 4);
1729 __ sri(v7.V8H(), v29.V8H(), 3);
1730 __ srshl(d2, d9, d26);
1731 __ srshl(v29.V16B(), v17.V16B(), v11.V16B());
1732 __ srshl(v8.V2D(), v15.V2D(), v4.V2D());
1733 __ srshl(v25.V2S(), v17.V2S(), v8.V2S());
1734 __ srshl(v19.V4H(), v7.V4H(), v7.V4H());
1735 __ srshl(v13.V4S(), v2.V4S(), v17.V4S());
1736 __ srshl(v22.V8B(), v6.V8B(), v21.V8B());
1737 __ srshl(v10.V8H(), v17.V8H(), v4.V8H());
1738 __ srshr(d21, d18, 45);
1739 __ srshr(v3.V16B(), v11.V16B(), 7);
1740 __ srshr(v21.V2D(), v26.V2D(), 53);
1741 __ srshr(v11.V2S(), v5.V2S(), 28);
1742 __ srshr(v7.V4H(), v18.V4H(), 12);
1743 __ srshr(v7.V4S(), v3.V4S(), 30);
1744 __ srshr(v14.V8B(), v2.V8B(), 6);
1745 __ srshr(v21.V8H(), v20.V8H(), 3);
1746 __ srsra(d21, d30, 63);
1747 __ srsra(v27.V16B(), v30.V16B(), 6);
1748 __ srsra(v20.V2D(), v12.V2D(), 27);
1749 __ srsra(v0.V2S(), v17.V2S(), 5);
1750 __ srsra(v14.V4H(), v16.V4H(), 15);
1751 __ srsra(v18.V4S(), v3.V4S(), 20);
1752 __ srsra(v21.V8B(), v1.V8B(), 1);
1753 __ srsra(v31.V8H(), v25.V8H(), 2);
1754 __ sshl(d1, d13, d9);
1755 __ sshl(v17.V16B(), v31.V16B(), v15.V16B());
1756 __ sshl(v13.V2D(), v16.V2D(), v0.V2D());
1757 __ sshl(v0.V2S(), v7.V2S(), v22.V2S());
1758 __ sshl(v23.V4H(), v19.V4H(), v4.V4H());
1759 __ sshl(v5.V4S(), v5.V4S(), v11.V4S());
1760 __ sshl(v23.V8B(), v27.V8B(), v7.V8B());
1761 __ sshl(v29.V8H(), v10.V8H(), v5.V8H());
1762 __ sshll(v0.V2D(), v2.V2S(), 23);
1763 __ sshll(v11.V4S(), v8.V4H(), 8);
1764 __ sshll(v4.V8H(), v29.V8B(), 1);
1765 __ sshll2(v10.V2D(), v4.V4S(), 14);
1766 __ sshll2(v26.V4S(), v31.V8H(), 6);
1767 __ sshll2(v3.V8H(), v26.V16B(), 4);
1768 __ sshr(d19, d21, 20);
1769 __ sshr(v15.V16B(), v23.V16B(), 5);
1770 __ sshr(v17.V2D(), v14.V2D(), 38);
1771 __ sshr(v3.V2S(), v29.V2S(), 23);
1772 __ sshr(v23.V4H(), v27.V4H(), 4);
1773 __ sshr(v28.V4S(), v3.V4S(), 4);
1774 __ sshr(v14.V8B(), v2.V8B(), 6);
1775 __ sshr(v3.V8H(), v8.V8H(), 6);
1776 __ ssra(d12, d28, 44);
1777 __ ssra(v29.V16B(), v31.V16B(), 4);
1778 __ ssra(v3.V2D(), v0.V2D(), 24);
1779 __ ssra(v14.V2S(), v28.V2S(), 6);
1780 __ ssra(v18.V4H(), v8.V4H(), 7);
1781 __ ssra(v31.V4S(), v14.V4S(), 24);
1782 __ ssra(v28.V8B(), v26.V8B(), 5);
1783 __ ssra(v9.V8H(), v9.V8H(), 14);
1784 __ ssubl(v13.V2D(), v14.V2S(), v3.V2S());
1785 __ ssubl(v5.V4S(), v16.V4H(), v8.V4H());
1786 __ ssubl(v0.V8H(), v28.V8B(), v6.V8B());
1787 __ ssubl2(v5.V2D(), v13.V4S(), v25.V4S());
1788 __ ssubl2(v3.V4S(), v15.V8H(), v17.V8H());
1789 __ ssubl2(v15.V8H(), v15.V16B(), v14.V16B());
1790 __ ssubw(v25.V2D(), v23.V2D(), v26.V2S());
1791 __ ssubw(v21.V4S(), v18.V4S(), v24.V4H());
1792 __ ssubw(v30.V8H(), v22.V8H(), v3.V8B());
1793 __ ssubw2(v16.V2D(), v24.V2D(), v28.V4S());
1794 __ ssubw2(v31.V4S(), v11.V4S(), v15.V8H());
1795 __ ssubw2(v4.V8H(), v8.V8H(), v16.V16B());
1796 __ st1(v18.V16B(), v19.V16B(), v20.V16B(), v21.V16B(), MemOperand(x0));
1797 __ st1(v10.V16B(),
1798 v11.V16B(),
1799 v12.V16B(),
1800 v13.V16B(),
1801 MemOperand(x1, x2, PostIndex));
1802 __ st1(v27.V16B(),
1803 v28.V16B(),
1804 v29.V16B(),
1805 v30.V16B(),
1806 MemOperand(x1, 64, PostIndex));
1807 __ st1(v16.V16B(), v17.V16B(), v18.V16B(), MemOperand(x0));
1808 __ st1(v21.V16B(), v22.V16B(), v23.V16B(), MemOperand(x1, x2, PostIndex));
1809 __ st1(v9.V16B(), v10.V16B(), v11.V16B(), MemOperand(x1, 48, PostIndex));
1810 __ st1(v7.V16B(), v8.V16B(), MemOperand(x0));
1811 __ st1(v26.V16B(), v27.V16B(), MemOperand(x1, x2, PostIndex));
1812 __ st1(v22.V16B(), v23.V16B(), MemOperand(x1, 32, PostIndex));
1813 __ st1(v23.V16B(), MemOperand(x0));
1814 __ st1(v28.V16B(), MemOperand(x1, x2, PostIndex));
1815 __ st1(v2.V16B(), MemOperand(x1, 16, PostIndex));
1816 __ st1(v29.V1D(), v30.V1D(), v31.V1D(), v0.V1D(), MemOperand(x0));
1817 __ st1(v12.V1D(),
1818 v13.V1D(),
1819 v14.V1D(),
1820 v15.V1D(),
1821 MemOperand(x1, x2, PostIndex));
1822 __ st1(v30.V1D(),
1823 v31.V1D(),
1824 v0.V1D(),
1825 v1.V1D(),
1826 MemOperand(x1, 32, PostIndex));
1827 __ st1(v16.V1D(), v17.V1D(), v18.V1D(), MemOperand(x0));
1828 __ st1(v3.V1D(), v4.V1D(), v5.V1D(), MemOperand(x1, x2, PostIndex));
1829 __ st1(v14.V1D(), v15.V1D(), v16.V1D(), MemOperand(x1, 24, PostIndex));
1830 __ st1(v18.V1D(), v19.V1D(), MemOperand(x0));
1831 __ st1(v5.V1D(), v6.V1D(), MemOperand(x1, x2, PostIndex));
1832 __ st1(v2.V1D(), v3.V1D(), MemOperand(x1, 16, PostIndex));
1833 __ st1(v4.V1D(), MemOperand(x0));
1834 __ st1(v27.V1D(), MemOperand(x1, x2, PostIndex));
1835 __ st1(v23.V1D(), MemOperand(x1, 8, PostIndex));
1836 __ st1(v2.V2D(), v3.V2D(), v4.V2D(), v5.V2D(), MemOperand(x0));
1837 __ st1(v22.V2D(),
1838 v23.V2D(),
1839 v24.V2D(),
1840 v25.V2D(),
1841 MemOperand(x1, x2, PostIndex));
1842 __ st1(v28.V2D(),
1843 v29.V2D(),
1844 v30.V2D(),
1845 v31.V2D(),
1846 MemOperand(x1, 64, PostIndex));
1847 __ st1(v17.V2D(), v18.V2D(), v19.V2D(), MemOperand(x0));
1848 __ st1(v16.V2D(), v17.V2D(), v18.V2D(), MemOperand(x1, x2, PostIndex));
1849 __ st1(v22.V2D(), v23.V2D(), v24.V2D(), MemOperand(x1, 48, PostIndex));
1850 __ st1(v21.V2D(), v22.V2D(), MemOperand(x0));
1851 __ st1(v6.V2D(), v7.V2D(), MemOperand(x1, x2, PostIndex));
1852 __ st1(v27.V2D(), v28.V2D(), MemOperand(x1, 32, PostIndex));
1853 __ st1(v21.V2D(), MemOperand(x0));
1854 __ st1(v29.V2D(), MemOperand(x1, x2, PostIndex));
1855 __ st1(v20.V2D(), MemOperand(x1, 16, PostIndex));
1856 __ st1(v22.V2S(), v23.V2S(), v24.V2S(), v25.V2S(), MemOperand(x0));
1857 __ st1(v8.V2S(),
1858 v9.V2S(),
1859 v10.V2S(),
1860 v11.V2S(),
1861 MemOperand(x1, x2, PostIndex));
1862 __ st1(v15.V2S(),
1863 v16.V2S(),
1864 v17.V2S(),
1865 v18.V2S(),
1866 MemOperand(x1, 32, PostIndex));
1867 __ st1(v2.V2S(), v3.V2S(), v4.V2S(), MemOperand(x0));
1868 __ st1(v23.V2S(), v24.V2S(), v25.V2S(), MemOperand(x1, x2, PostIndex));
1869 __ st1(v7.V2S(), v8.V2S(), v9.V2S(), MemOperand(x1, 24, PostIndex));
1870 __ st1(v28.V2S(), v29.V2S(), MemOperand(x0));
1871 __ st1(v29.V2S(), v30.V2S(), MemOperand(x1, x2, PostIndex));
1872 __ st1(v23.V2S(), v24.V2S(), MemOperand(x1, 16, PostIndex));
1873 __ st1(v6.V2S(), MemOperand(x0));
1874 __ st1(v11.V2S(), MemOperand(x1, x2, PostIndex));
1875 __ st1(v17.V2S(), MemOperand(x1, 8, PostIndex));
1876 __ st1(v6.V4H(), v7.V4H(), v8.V4H(), v9.V4H(), MemOperand(x0));
1877 __ st1(v9.V4H(),
1878 v10.V4H(),
1879 v11.V4H(),
1880 v12.V4H(),
1881 MemOperand(x1, x2, PostIndex));
1882 __ st1(v25.V4H(),
1883 v26.V4H(),
1884 v27.V4H(),
1885 v28.V4H(),
1886 MemOperand(x1, 32, PostIndex));
1887 __ st1(v11.V4H(), v12.V4H(), v13.V4H(), MemOperand(x0));
1888 __ st1(v10.V4H(), v11.V4H(), v12.V4H(), MemOperand(x1, x2, PostIndex));
1889 __ st1(v12.V4H(), v13.V4H(), v14.V4H(), MemOperand(x1, 24, PostIndex));
1890 __ st1(v13.V4H(), v14.V4H(), MemOperand(x0));
1891 __ st1(v15.V4H(), v16.V4H(), MemOperand(x1, x2, PostIndex));
1892 __ st1(v21.V4H(), v22.V4H(), MemOperand(x1, 16, PostIndex));
1893 __ st1(v16.V4H(), MemOperand(x0));
1894 __ st1(v8.V4H(), MemOperand(x1, x2, PostIndex));
1895 __ st1(v30.V4H(), MemOperand(x1, 8, PostIndex));
1896 __ st1(v3.V4S(), v4.V4S(), v5.V4S(), v6.V4S(), MemOperand(x0));
1897 __ st1(v25.V4S(),
1898 v26.V4S(),
1899 v27.V4S(),
1900 v28.V4S(),
1901 MemOperand(x1, x2, PostIndex));
1902 __ st1(v5.V4S(), v6.V4S(), v7.V4S(), v8.V4S(), MemOperand(x1, 64, PostIndex));
1903 __ st1(v31.V4S(), v0.V4S(), v1.V4S(), MemOperand(x0));
1904 __ st1(v30.V4S(), v31.V4S(), v0.V4S(), MemOperand(x1, x2, PostIndex));
1905 __ st1(v6.V4S(), v7.V4S(), v8.V4S(), MemOperand(x1, 48, PostIndex));
1906 __ st1(v17.V4S(), v18.V4S(), MemOperand(x0));
1907 __ st1(v31.V4S(), v0.V4S(), MemOperand(x1, x2, PostIndex));
1908 __ st1(v1.V4S(), v2.V4S(), MemOperand(x1, 32, PostIndex));
1909 __ st1(v26.V4S(), MemOperand(x0));
1910 __ st1(v15.V4S(), MemOperand(x1, x2, PostIndex));
1911 __ st1(v13.V4S(), MemOperand(x1, 16, PostIndex));
1912 __ st1(v26.V8B(), v27.V8B(), v28.V8B(), v29.V8B(), MemOperand(x0));
1913 __ st1(v10.V8B(),
1914 v11.V8B(),
1915 v12.V8B(),
1916 v13.V8B(),
1917 MemOperand(x1, x2, PostIndex));
1918 __ st1(v15.V8B(),
1919 v16.V8B(),
1920 v17.V8B(),
1921 v18.V8B(),
1922 MemOperand(x1, 32, PostIndex));
1923 __ st1(v19.V8B(), v20.V8B(), v21.V8B(), MemOperand(x0));
1924 __ st1(v31.V8B(), v0.V8B(), v1.V8B(), MemOperand(x1, x2, PostIndex));
1925 __ st1(v9.V8B(), v10.V8B(), v11.V8B(), MemOperand(x1, 24, PostIndex));
1926 __ st1(v12.V8B(), v13.V8B(), MemOperand(x0));
1927 __ st1(v2.V8B(), v3.V8B(), MemOperand(x1, x2, PostIndex));
1928 __ st1(v0.V8B(), v1.V8B(), MemOperand(x1, 16, PostIndex));
1929 __ st1(v16.V8B(), MemOperand(x0));
1930 __ st1(v25.V8B(), MemOperand(x1, x2, PostIndex));
1931 __ st1(v31.V8B(), MemOperand(x1, 8, PostIndex));
1932 __ st1(v4.V8H(), v5.V8H(), v6.V8H(), v7.V8H(), MemOperand(x0));
1933 __ st1(v3.V8H(), v4.V8H(), v5.V8H(), v6.V8H(), MemOperand(x1, x2, PostIndex));
1934 __ st1(v26.V8H(),
1935 v27.V8H(),
1936 v28.V8H(),
1937 v29.V8H(),
1938 MemOperand(x1, 64, PostIndex));
1939 __ st1(v10.V8H(), v11.V8H(), v12.V8H(), MemOperand(x0));
1940 __ st1(v21.V8H(), v22.V8H(), v23.V8H(), MemOperand(x1, x2, PostIndex));
1941 __ st1(v18.V8H(), v19.V8H(), v20.V8H(), MemOperand(x1, 48, PostIndex));
1942 __ st1(v26.V8H(), v27.V8H(), MemOperand(x0));
1943 __ st1(v24.V8H(), v25.V8H(), MemOperand(x1, x2, PostIndex));
1944 __ st1(v17.V8H(), v18.V8H(), MemOperand(x1, 32, PostIndex));
1945 __ st1(v29.V8H(), MemOperand(x0));
1946 __ st1(v19.V8H(), MemOperand(x1, x2, PostIndex));
1947 __ st1(v23.V8H(), MemOperand(x1, 16, PostIndex));
1948 __ st1(v19.B(), 15, MemOperand(x0));
1949 __ st1(v25.B(), 9, MemOperand(x1, x2, PostIndex));
1950 __ st1(v4.B(), 8, MemOperand(x1, 1, PostIndex));
1951 __ st1(v13.D(), 0, MemOperand(x0));
1952 __ st1(v30.D(), 0, MemOperand(x1, x2, PostIndex));
1953 __ st1(v3.D(), 0, MemOperand(x1, 8, PostIndex));
1954 __ st1(v22.H(), 0, MemOperand(x0));
1955 __ st1(v31.H(), 7, MemOperand(x1, x2, PostIndex));
1956 __ st1(v23.H(), 3, MemOperand(x1, 2, PostIndex));
1957 __ st1(v0.S(), 0, MemOperand(x0));
1958 __ st1(v11.S(), 3, MemOperand(x1, x2, PostIndex));
1959 __ st1(v24.S(), 3, MemOperand(x1, 4, PostIndex));
1960 __ st2(v7.V16B(), v8.V16B(), MemOperand(x0));
1961 __ st2(v5.V16B(), v6.V16B(), MemOperand(x1, x2, PostIndex));
1962 __ st2(v18.V16B(), v19.V16B(), MemOperand(x1, 32, PostIndex));
1963 __ st2(v14.V2D(), v15.V2D(), MemOperand(x0));
1964 __ st2(v7.V2D(), v8.V2D(), MemOperand(x1, x2, PostIndex));
1965 __ st2(v24.V2D(), v25.V2D(), MemOperand(x1, 32, PostIndex));
1966 __ st2(v22.V2S(), v23.V2S(), MemOperand(x0));
1967 __ st2(v4.V2S(), v5.V2S(), MemOperand(x1, x2, PostIndex));
1968 __ st2(v2.V2S(), v3.V2S(), MemOperand(x1, 16, PostIndex));
1969 __ st2(v23.V4H(), v24.V4H(), MemOperand(x0));
1970 __ st2(v8.V4H(), v9.V4H(), MemOperand(x1, x2, PostIndex));
1971 __ st2(v7.V4H(), v8.V4H(), MemOperand(x1, 16, PostIndex));
1972 __ st2(v17.V4S(), v18.V4S(), MemOperand(x0));
1973 __ st2(v6.V4S(), v7.V4S(), MemOperand(x1, x2, PostIndex));
1974 __ st2(v26.V4S(), v27.V4S(), MemOperand(x1, 32, PostIndex));
1975 __ st2(v31.V8B(), v0.V8B(), MemOperand(x0));
1976 __ st2(v0.V8B(), v1.V8B(), MemOperand(x1, x2, PostIndex));
1977 __ st2(v21.V8B(), v22.V8B(), MemOperand(x1, 16, PostIndex));
1978 __ st2(v7.V8H(), v8.V8H(), MemOperand(x0));
1979 __ st2(v22.V8H(), v23.V8H(), MemOperand(x1, x2, PostIndex));
1980 __ st2(v4.V8H(), v5.V8H(), MemOperand(x1, 32, PostIndex));
1981 __ st2(v8.B(), v9.B(), 15, MemOperand(x0));
1982 __ st2(v8.B(), v9.B(), 15, MemOperand(x1, x2, PostIndex));
1983 __ st2(v7.B(), v8.B(), 4, MemOperand(x1, 2, PostIndex));
1984 __ st2(v25.D(), v26.D(), 0, MemOperand(x0));
1985 __ st2(v17.D(), v18.D(), 1, MemOperand(x1, x2, PostIndex));
1986 __ st2(v3.D(), v4.D(), 1, MemOperand(x1, 16, PostIndex));
1987 __ st2(v4.H(), v5.H(), 3, MemOperand(x0));
1988 __ st2(v0.H(), v1.H(), 5, MemOperand(x1, x2, PostIndex));
1989 __ st2(v22.H(), v23.H(), 2, MemOperand(x1, 4, PostIndex));
1990 __ st2(v14.S(), v15.S(), 3, MemOperand(x0));
1991 __ st2(v23.S(), v24.S(), 3, MemOperand(x1, x2, PostIndex));
1992 __ st2(v0.S(), v1.S(), 2, MemOperand(x1, 8, PostIndex));
1993 __ st3(v26.V16B(), v27.V16B(), v28.V16B(), MemOperand(x0));
1994 __ st3(v21.V16B(), v22.V16B(), v23.V16B(), MemOperand(x1, x2, PostIndex));
1995 __ st3(v24.V16B(), v25.V16B(), v26.V16B(), MemOperand(x1, 48, PostIndex));
1996 __ st3(v17.V2D(), v18.V2D(), v19.V2D(), MemOperand(x0));
1997 __ st3(v23.V2D(), v24.V2D(), v25.V2D(), MemOperand(x1, x2, PostIndex));
1998 __ st3(v10.V2D(), v11.V2D(), v12.V2D(), MemOperand(x1, 48, PostIndex));
1999 __ st3(v9.V2S(), v10.V2S(), v11.V2S(), MemOperand(x0));
2000 __ st3(v13.V2S(), v14.V2S(), v15.V2S(), MemOperand(x1, x2, PostIndex));
2001 __ st3(v22.V2S(), v23.V2S(), v24.V2S(), MemOperand(x1, 24, PostIndex));
2002 __ st3(v31.V4H(), v0.V4H(), v1.V4H(), MemOperand(x0));
2003 __ st3(v8.V4H(), v9.V4H(), v10.V4H(), MemOperand(x1, x2, PostIndex));
2004 __ st3(v19.V4H(), v20.V4H(), v21.V4H(), MemOperand(x1, 24, PostIndex));
2005 __ st3(v18.V4S(), v19.V4S(), v20.V4S(), MemOperand(x0));
2006 __ st3(v25.V4S(), v26.V4S(), v27.V4S(), MemOperand(x1, x2, PostIndex));
2007 __ st3(v16.V4S(), v17.V4S(), v18.V4S(), MemOperand(x1, 48, PostIndex));
2008 __ st3(v27.V8B(), v28.V8B(), v29.V8B(), MemOperand(x0));
2009 __ st3(v29.V8B(), v30.V8B(), v31.V8B(), MemOperand(x1, x2, PostIndex));
2010 __ st3(v30.V8B(), v31.V8B(), v0.V8B(), MemOperand(x1, 24, PostIndex));
2011 __ st3(v8.V8H(), v9.V8H(), v10.V8H(), MemOperand(x0));
2012 __ st3(v18.V8H(), v19.V8H(), v20.V8H(), MemOperand(x1, x2, PostIndex));
2013 __ st3(v18.V8H(), v19.V8H(), v20.V8H(), MemOperand(x1, 48, PostIndex));
2014 __ st3(v31.B(), v0.B(), v1.B(), 10, MemOperand(x0));
2015 __ st3(v4.B(), v5.B(), v6.B(), 5, MemOperand(x1, x2, PostIndex));
2016 __ st3(v5.B(), v6.B(), v7.B(), 1, MemOperand(x1, 3, PostIndex));
2017 __ st3(v5.D(), v6.D(), v7.D(), 0, MemOperand(x0));
2018 __ st3(v6.D(), v7.D(), v8.D(), 0, MemOperand(x1, x2, PostIndex));
2019 __ st3(v0.D(), v1.D(), v2.D(), 0, MemOperand(x1, 24, PostIndex));
2020 __ st3(v31.H(), v0.H(), v1.H(), 2, MemOperand(x0));
2021 __ st3(v14.H(), v15.H(), v16.H(), 5, MemOperand(x1, x2, PostIndex));
2022 __ st3(v21.H(), v22.H(), v23.H(), 6, MemOperand(x1, 6, PostIndex));
2023 __ st3(v21.S(), v22.S(), v23.S(), 0, MemOperand(x0));
2024 __ st3(v11.S(), v12.S(), v13.S(), 1, MemOperand(x1, x2, PostIndex));
2025 __ st3(v15.S(), v16.S(), v17.S(), 0, MemOperand(x1, 12, PostIndex));
2026 __ st4(v22.V16B(), v23.V16B(), v24.V16B(), v25.V16B(), MemOperand(x0));
2027 __ st4(v24.V16B(),
2028 v25.V16B(),
2029 v26.V16B(),
2030 v27.V16B(),
2031 MemOperand(x1, x2, PostIndex));
2032 __ st4(v15.V16B(),
2033 v16.V16B(),
2034 v17.V16B(),
2035 v18.V16B(),
2036 MemOperand(x1, 64, PostIndex));
2037 __ st4(v16.V2D(), v17.V2D(), v18.V2D(), v19.V2D(), MemOperand(x0));
2038 __ st4(v17.V2D(),
2039 v18.V2D(),
2040 v19.V2D(),
2041 v20.V2D(),
2042 MemOperand(x1, x2, PostIndex));
2043 __ st4(v9.V2D(),
2044 v10.V2D(),
2045 v11.V2D(),
2046 v12.V2D(),
2047 MemOperand(x1, 64, PostIndex));
2048 __ st4(v23.V2S(), v24.V2S(), v25.V2S(), v26.V2S(), MemOperand(x0));
2049 __ st4(v15.V2S(),
2050 v16.V2S(),
2051 v17.V2S(),
2052 v18.V2S(),
2053 MemOperand(x1, x2, PostIndex));
2054 __ st4(v24.V2S(),
2055 v25.V2S(),
2056 v26.V2S(),
2057 v27.V2S(),
2058 MemOperand(x1, 32, PostIndex));
2059 __ st4(v14.V4H(), v15.V4H(), v16.V4H(), v17.V4H(), MemOperand(x0));
2060 __ st4(v18.V4H(),
2061 v19.V4H(),
2062 v20.V4H(),
2063 v21.V4H(),
2064 MemOperand(x1, x2, PostIndex));
2065 __ st4(v1.V4H(), v2.V4H(), v3.V4H(), v4.V4H(), MemOperand(x1, 32, PostIndex));
2066 __ st4(v13.V4S(), v14.V4S(), v15.V4S(), v16.V4S(), MemOperand(x0));
2067 __ st4(v6.V4S(), v7.V4S(), v8.V4S(), v9.V4S(), MemOperand(x1, x2, PostIndex));
2068 __ st4(v15.V4S(),
2069 v16.V4S(),
2070 v17.V4S(),
2071 v18.V4S(),
2072 MemOperand(x1, 64, PostIndex));
2073 __ st4(v26.V8B(), v27.V8B(), v28.V8B(), v29.V8B(), MemOperand(x0));
2074 __ st4(v25.V8B(),
2075 v26.V8B(),
2076 v27.V8B(),
2077 v28.V8B(),
2078 MemOperand(x1, x2, PostIndex));
2079 __ st4(v19.V8B(),
2080 v20.V8B(),
2081 v21.V8B(),
2082 v22.V8B(),
2083 MemOperand(x1, 32, PostIndex));
2084 __ st4(v19.V8H(), v20.V8H(), v21.V8H(), v22.V8H(), MemOperand(x0));
2085 __ st4(v15.V8H(),
2086 v16.V8H(),
2087 v17.V8H(),
2088 v18.V8H(),
2089 MemOperand(x1, x2, PostIndex));
2090 __ st4(v31.V8H(),
2091 v0.V8H(),
2092 v1.V8H(),
2093 v2.V8H(),
2094 MemOperand(x1, 64, PostIndex));
2095 __ st4(v0.B(), v1.B(), v2.B(), v3.B(), 13, MemOperand(x0));
2096 __ st4(v4.B(), v5.B(), v6.B(), v7.B(), 10, MemOperand(x1, x2, PostIndex));
2097 __ st4(v9.B(), v10.B(), v11.B(), v12.B(), 9, MemOperand(x1, 4, PostIndex));
2098 __ st4(v2.D(), v3.D(), v4.D(), v5.D(), 1, MemOperand(x0));
2099 __ st4(v7.D(), v8.D(), v9.D(), v10.D(), 0, MemOperand(x1, x2, PostIndex));
2100 __ st4(v31.D(), v0.D(), v1.D(), v2.D(), 1, MemOperand(x1, 32, PostIndex));
2101 __ st4(v2.H(), v3.H(), v4.H(), v5.H(), 1, MemOperand(x0));
2102 __ st4(v27.H(), v28.H(), v29.H(), v30.H(), 3, MemOperand(x1, x2, PostIndex));
2103 __ st4(v24.H(), v25.H(), v26.H(), v27.H(), 4, MemOperand(x1, 8, PostIndex));
2104 __ st4(v18.S(), v19.S(), v20.S(), v21.S(), 2, MemOperand(x0));
2105 __ st4(v6.S(), v7.S(), v8.S(), v9.S(), 2, MemOperand(x1, x2, PostIndex));
2106 __ st4(v25.S(), v26.S(), v27.S(), v28.S(), 1, MemOperand(x1, 16, PostIndex));
2107 __ sub(d12, d17, d2);
2108 __ sub(v20.V16B(), v24.V16B(), v8.V16B());
2109 __ sub(v8.V2D(), v29.V2D(), v5.V2D());
2110 __ sub(v2.V2S(), v28.V2S(), v24.V2S());
2111 __ sub(v24.V4H(), v10.V4H(), v4.V4H());
2112 __ sub(v28.V4S(), v4.V4S(), v17.V4S());
2113 __ sub(v16.V8B(), v27.V8B(), v2.V8B());
2114 __ sub(v20.V8H(), v10.V8H(), v13.V8H());
2115 __ subhn(v5.V2S(), v14.V2D(), v13.V2D());
2116 __ subhn(v10.V4H(), v5.V4S(), v8.V4S());
2117 __ subhn(v6.V8B(), v10.V8H(), v22.V8H());
2118 __ subhn2(v11.V16B(), v6.V8H(), v9.V8H());
2119 __ subhn2(v25.V4S(), v18.V2D(), v24.V2D());
2120 __ subhn2(v20.V8H(), v21.V4S(), v1.V4S());
2121 __ suqadd(b25, b11);
2122 __ suqadd(d13, d1);
2123 __ suqadd(h0, h9);
2124 __ suqadd(s22, s8);
2125 __ suqadd(v24.V16B(), v27.V16B());
2126 __ suqadd(v26.V2D(), v14.V2D());
2127 __ suqadd(v7.V2S(), v10.V2S());
2128 __ suqadd(v25.V4H(), v12.V4H());
2129 __ suqadd(v4.V4S(), v3.V4S());
2130 __ suqadd(v14.V8B(), v18.V8B());
2131 __ suqadd(v31.V8H(), v8.V8H());
2132 __ sxtl(v16.V2D(), v20.V2S());
2133 __ sxtl(v27.V4S(), v28.V4H());
2134 __ sxtl(v0.V8H(), v22.V8B());
2135 __ sxtl2(v6.V2D(), v7.V4S());
2136 __ sxtl2(v9.V4S(), v27.V8H());
2137 __ sxtl2(v16.V8H(), v16.V16B());
2138 __ tbl(v25.V16B(),
2139 v17.V16B(),
2140 v18.V16B(),
2141 v19.V16B(),
2142 v20.V16B(),
2143 v22.V16B());
2144 __ tbl(v28.V16B(), v13.V16B(), v14.V16B(), v15.V16B(), v4.V16B());
2145 __ tbl(v3.V16B(), v0.V16B(), v1.V16B(), v2.V16B());
2146 __ tbl(v20.V16B(), v15.V16B(), v4.V16B());
2147 __ tbl(v7.V8B(), v23.V16B(), v24.V16B(), v25.V16B(), v26.V16B(), v20.V8B());
2148 __ tbl(v8.V8B(), v1.V16B(), v2.V16B(), v3.V16B(), v31.V8B());
2149 __ tbl(v8.V8B(), v25.V16B(), v26.V16B(), v16.V8B());
2150 __ tbl(v11.V8B(), v19.V16B(), v30.V8B());
2151 __ tbx(v25.V16B(), v25.V16B(), v26.V16B(), v27.V16B(), v28.V16B(), v5.V16B());
2152 __ tbx(v21.V16B(), v29.V16B(), v30.V16B(), v31.V16B(), v24.V16B());
2153 __ tbx(v6.V16B(), v16.V16B(), v17.V16B(), v1.V16B());
2154 __ tbx(v13.V16B(), v3.V16B(), v20.V16B());
2155 __ tbx(v24.V8B(), v29.V16B(), v30.V16B(), v31.V16B(), v0.V16B(), v9.V8B());
2156 __ tbx(v17.V8B(), v9.V16B(), v10.V16B(), v11.V16B(), v26.V8B());
2157 __ tbx(v5.V8B(), v3.V16B(), v4.V16B(), v21.V8B());
2158 __ tbx(v16.V8B(), v11.V16B(), v29.V8B());
2159 __ trn1(v19.V16B(), v24.V16B(), v12.V16B());
2160 __ trn1(v2.V2D(), v7.V2D(), v10.V2D());
2161 __ trn1(v22.V2S(), v0.V2S(), v21.V2S());
2162 __ trn1(v12.V4H(), v15.V4H(), v20.V4H());
2163 __ trn1(v30.V4S(), v17.V4S(), v9.V4S());
2164 __ trn1(v12.V8B(), v19.V8B(), v29.V8B());
2165 __ trn1(v23.V8H(), v8.V8H(), v9.V8H());
2166 __ trn2(v28.V16B(), v30.V16B(), v25.V16B());
2167 __ trn2(v7.V2D(), v27.V2D(), v7.V2D());
2168 __ trn2(v30.V2S(), v16.V2S(), v19.V2S());
2169 __ trn2(v24.V4H(), v6.V4H(), v25.V4H());
2170 __ trn2(v2.V4S(), v19.V4S(), v11.V4S());
2171 __ trn2(v25.V8B(), v27.V8B(), v18.V8B());
2172 __ trn2(v12.V8H(), v4.V8H(), v15.V8H());
2173 __ uaba(v31.V16B(), v12.V16B(), v28.V16B());
2174 __ uaba(v18.V2S(), v5.V2S(), v14.V2S());
2175 __ uaba(v9.V4H(), v20.V4H(), v21.V4H());
2176 __ uaba(v6.V4S(), v20.V4S(), v2.V4S());
2177 __ uaba(v16.V8B(), v12.V8B(), v5.V8B());
2178 __ uaba(v15.V8H(), v26.V8H(), v30.V8H());
2179 __ uabal(v10.V2D(), v18.V2S(), v15.V2S());
2180 __ uabal(v30.V4S(), v19.V4H(), v7.V4H());
2181 __ uabal(v4.V8H(), v27.V8B(), v0.V8B());
2182 __ uabal2(v19.V2D(), v12.V4S(), v2.V4S());
2183 __ uabal2(v26.V4S(), v5.V8H(), v12.V8H());
2184 __ uabal2(v19.V8H(), v20.V16B(), v28.V16B());
2185 __ uabd(v18.V16B(), v4.V16B(), v21.V16B());
2186 __ uabd(v30.V2S(), v21.V2S(), v16.V2S());
2187 __ uabd(v8.V4H(), v28.V4H(), v25.V4H());
2188 __ uabd(v28.V4S(), v12.V4S(), v21.V4S());
2189 __ uabd(v19.V8B(), v16.V8B(), v28.V8B());
2190 __ uabd(v9.V8H(), v12.V8H(), v29.V8H());
2191 __ uabdl(v26.V2D(), v0.V2S(), v8.V2S());
2192 __ uabdl(v29.V4S(), v31.V4H(), v25.V4H());
2193 __ uabdl(v27.V8H(), v29.V8B(), v14.V8B());
2194 __ uabdl2(v20.V2D(), v20.V4S(), v8.V4S());
2195 __ uabdl2(v22.V4S(), v15.V8H(), v18.V8H());
2196 __ uabdl2(v9.V8H(), v18.V16B(), v23.V16B());
2197 __ uadalp(v9.V1D(), v15.V2S());
2198 __ uadalp(v14.V2D(), v12.V4S());
2199 __ uadalp(v28.V2S(), v12.V4H());
2200 __ uadalp(v0.V4H(), v17.V8B());
2201 __ uadalp(v1.V4S(), v29.V8H());
2202 __ uadalp(v15.V8H(), v22.V16B());
2203 __ uaddl(v1.V2D(), v20.V2S(), v27.V2S());
2204 __ uaddl(v31.V4S(), v25.V4H(), v5.V4H());
2205 __ uaddl(v12.V8H(), v3.V8B(), v3.V8B());
2206 __ uaddl2(v5.V2D(), v23.V4S(), v6.V4S());
2207 __ uaddl2(v1.V4S(), v5.V8H(), v25.V8H());
2208 __ uaddl2(v22.V8H(), v30.V16B(), v28.V16B());
2209 __ uaddlp(v7.V1D(), v9.V2S());
2210 __ uaddlp(v26.V2D(), v4.V4S());
2211 __ uaddlp(v28.V2S(), v1.V4H());
2212 __ uaddlp(v20.V4H(), v31.V8B());
2213 __ uaddlp(v16.V4S(), v17.V8H());
2214 __ uaddlp(v6.V8H(), v2.V16B());
2215 __ uaddlv(d28, v22.V4S());
2216 __ uaddlv(h0, v19.V16B());
2217 __ uaddlv(h30, v30.V8B());
2218 __ uaddlv(s24, v18.V4H());
2219 __ uaddlv(s10, v0.V8H());
2220 __ uaddw(v9.V2D(), v17.V2D(), v14.V2S());
2221 __ uaddw(v9.V4S(), v25.V4S(), v3.V4H());
2222 __ uaddw(v18.V8H(), v1.V8H(), v0.V8B());
2223 __ uaddw2(v18.V2D(), v5.V2D(), v6.V4S());
2224 __ uaddw2(v17.V4S(), v15.V4S(), v11.V8H());
2225 __ uaddw2(v29.V8H(), v11.V8H(), v7.V16B());
2226 __ uhadd(v13.V16B(), v9.V16B(), v3.V16B());
2227 __ uhadd(v17.V2S(), v25.V2S(), v24.V2S());
2228 __ uhadd(v25.V4H(), v23.V4H(), v13.V4H());
2229 __ uhadd(v0.V4S(), v20.V4S(), v16.V4S());
2230 __ uhadd(v5.V8B(), v5.V8B(), v25.V8B());
2231 __ uhadd(v3.V8H(), v29.V8H(), v18.V8H());
2232 __ uhsub(v1.V16B(), v22.V16B(), v13.V16B());
2233 __ uhsub(v14.V2S(), v30.V2S(), v30.V2S());
2234 __ uhsub(v29.V4H(), v14.V4H(), v17.V4H());
2235 __ uhsub(v26.V4S(), v5.V4S(), v18.V4S());
2236 __ uhsub(v3.V8B(), v7.V8B(), v12.V8B());
2237 __ uhsub(v25.V8H(), v21.V8H(), v5.V8H());
2238 __ umax(v28.V16B(), v12.V16B(), v6.V16B());
2239 __ umax(v20.V2S(), v19.V2S(), v26.V2S());
2240 __ umax(v0.V4H(), v31.V4H(), v18.V4H());
2241 __ umax(v6.V4S(), v21.V4S(), v28.V4S());
2242 __ umax(v0.V8B(), v2.V8B(), v20.V8B());
2243 __ umax(v4.V8H(), v11.V8H(), v22.V8H());
2244 __ umaxp(v1.V16B(), v6.V16B(), v29.V16B());
2245 __ umaxp(v19.V2S(), v17.V2S(), v27.V2S());
2246 __ umaxp(v21.V4H(), v16.V4H(), v7.V4H());
2247 __ umaxp(v9.V4S(), v20.V4S(), v29.V4S());
2248 __ umaxp(v13.V8B(), v1.V8B(), v16.V8B());
2249 __ umaxp(v19.V8H(), v23.V8H(), v26.V8H());
2250 __ umaxv(b17, v30.V16B());
2251 __ umaxv(b23, v12.V8B());
2252 __ umaxv(h31, v15.V4H());
2253 __ umaxv(h15, v25.V8H());
2254 __ umaxv(s18, v21.V4S());
2255 __ umin(v22.V16B(), v0.V16B(), v18.V16B());
2256 __ umin(v1.V2S(), v21.V2S(), v16.V2S());
2257 __ umin(v17.V4H(), v4.V4H(), v25.V4H());
2258 __ umin(v24.V4S(), v26.V4S(), v13.V4S());
2259 __ umin(v20.V8B(), v1.V8B(), v5.V8B());
2260 __ umin(v26.V8H(), v25.V8H(), v23.V8H());
2261 __ uminp(v5.V16B(), v1.V16B(), v23.V16B());
2262 __ uminp(v7.V2S(), v26.V2S(), v30.V2S());
2263 __ uminp(v9.V4H(), v5.V4H(), v25.V4H());
2264 __ uminp(v23.V4S(), v10.V4S(), v1.V4S());
2265 __ uminp(v4.V8B(), v29.V8B(), v14.V8B());
2266 __ uminp(v21.V8H(), v0.V8H(), v14.V8H());
2267 __ uminv(b0, v17.V16B());
2268 __ uminv(b0, v31.V8B());
2269 __ uminv(h24, v0.V4H());
2270 __ uminv(h29, v14.V8H());
2271 __ uminv(s30, v3.V4S());
2272 __ umlal(v11.V2D(), v11.V2S(), v24.V2S());
2273 __ umlal(v30.V2D(), v16.V2S(), v11.S(), 3);
2274 __ umlal(v0.V4S(), v9.V4H(), v26.V4H());
2275 __ umlal(v20.V4S(), v24.V4H(), v12.H(), 4);
2276 __ umlal(v16.V8H(), v21.V8B(), v6.V8B());
2277 __ umlal2(v17.V2D(), v19.V4S(), v23.V4S());
2278 __ umlal2(v5.V2D(), v30.V4S(), v8.S(), 0);
2279 __ umlal2(v16.V4S(), v8.V8H(), v15.V8H());
2280 __ umlal2(v15.V4S(), v26.V8H(), v1.H(), 5);
2281 __ umlal2(v30.V8H(), v1.V16B(), v17.V16B());
2282 __ umlsl(v18.V2D(), v19.V2S(), v28.V2S());
2283 __ umlsl(v7.V2D(), v7.V2S(), v8.S(), 0);
2284 __ umlsl(v24.V4S(), v8.V4H(), v4.V4H());
2285 __ umlsl(v18.V4S(), v22.V4H(), v12.H(), 4);
2286 __ umlsl(v28.V8H(), v14.V8B(), v20.V8B());
2287 __ umlsl2(v11.V2D(), v0.V4S(), v9.V4S());
2288 __ umlsl2(v26.V2D(), v16.V4S(), v9.S(), 2);
2289 __ umlsl2(v3.V4S(), v11.V8H(), v9.V8H());
2290 __ umlsl2(v10.V4S(), v25.V8H(), v9.H(), 4);
2291 __ umlsl2(v24.V8H(), v16.V16B(), v28.V16B());
2292 __ umov(x30, v25.D(), 1);
2293 __ umull(v12.V2D(), v10.V2S(), v29.V2S());
2294 __ umull(v22.V2D(), v30.V2S(), v5.S(), 3);
2295 __ umull(v7.V4S(), v0.V4H(), v25.V4H());
2296 __ umull(v11.V4S(), v13.V4H(), v3.H(), 2);
2297 __ umull(v25.V8H(), v16.V8B(), v10.V8B());
2298 __ umull2(v17.V2D(), v3.V4S(), v26.V4S());
2299 __ umull2(v26.V2D(), v11.V4S(), v2.S(), 3);
2300 __ umull2(v12.V4S(), v17.V8H(), v23.V8H());
2301 __ umull2(v4.V4S(), v31.V8H(), v1.H(), 2);
2302 __ umull2(v5.V8H(), v12.V16B(), v17.V16B());
2303 __ uqadd(b30, b4, b28);
2304 __ uqadd(d27, d20, d16);
2305 __ uqadd(h7, h14, h28);
2306 __ uqadd(s28, s17, s4);
2307 __ uqadd(v19.V16B(), v22.V16B(), v21.V16B());
2308 __ uqadd(v16.V2D(), v4.V2D(), v11.V2D());
2309 __ uqadd(v20.V2S(), v14.V2S(), v4.V2S());
2310 __ uqadd(v5.V4H(), v0.V4H(), v16.V4H());
2311 __ uqadd(v21.V4S(), v31.V4S(), v9.V4S());
2312 __ uqadd(v23.V8B(), v24.V8B(), v3.V8B());
2313 __ uqadd(v17.V8H(), v27.V8H(), v11.V8H());
2314 __ uqrshl(b10, b22, b10);
2315 __ uqrshl(d29, d5, d11);
2316 __ uqrshl(h27, h24, h30);
2317 __ uqrshl(s10, s13, s8);
2318 __ uqrshl(v9.V16B(), v18.V16B(), v14.V16B());
2319 __ uqrshl(v24.V2D(), v15.V2D(), v17.V2D());
2320 __ uqrshl(v4.V2S(), v14.V2S(), v27.V2S());
2321 __ uqrshl(v15.V4H(), v5.V4H(), v8.V4H());
2322 __ uqrshl(v21.V4S(), v29.V4S(), v0.V4S());
2323 __ uqrshl(v16.V8B(), v24.V8B(), v9.V8B());
2324 __ uqrshl(v2.V8H(), v0.V8H(), v15.V8H());
2325 __ uqrshrn(b11, h26, 4);
2326 __ uqrshrn(h7, s30, 5);
2327 __ uqrshrn(s10, d8, 21);
2328 __ uqrshrn(v15.V2S(), v6.V2D(), 11);
2329 __ uqrshrn(v5.V4H(), v26.V4S(), 12);
2330 __ uqrshrn(v28.V8B(), v25.V8H(), 5);
2331 __ uqrshrn2(v25.V16B(), v30.V8H(), 2);
2332 __ uqrshrn2(v21.V4S(), v14.V2D(), 32);
2333 __ uqrshrn2(v13.V8H(), v7.V4S(), 2);
2334 __ uqshl(b13, b0, b23);
2335 __ uqshl(b9, b17, 4);
2336 __ uqshl(d23, d6, d4);
2337 __ uqshl(d8, d11, 44);
2338 __ uqshl(h19, h13, h15);
2339 __ uqshl(h25, h26, 6);
2340 __ uqshl(s4, s24, s10);
2341 __ uqshl(s19, s14, 1);
2342 __ uqshl(v14.V16B(), v30.V16B(), v25.V16B());
2343 __ uqshl(v6.V16B(), v10.V16B(), 5);
2344 __ uqshl(v18.V2D(), v8.V2D(), v7.V2D());
2345 __ uqshl(v25.V2D(), v14.V2D(), 18);
2346 __ uqshl(v25.V2S(), v16.V2S(), v23.V2S());
2347 __ uqshl(v13.V2S(), v15.V2S(), 31);
2348 __ uqshl(v28.V4H(), v24.V4H(), v15.V4H());
2349 __ uqshl(v4.V4H(), v17.V4H(), 1);
2350 __ uqshl(v9.V4S(), v31.V4S(), v23.V4S());
2351 __ uqshl(v18.V4S(), v28.V4S(), 31);
2352 __ uqshl(v31.V8B(), v21.V8B(), v15.V8B());
2353 __ uqshl(v6.V8B(), v21.V8B(), 1);
2354 __ uqshl(v28.V8H(), v2.V8H(), v17.V8H());
2355 __ uqshl(v24.V8H(), v8.V8H(), 14);
2356 __ uqshrn(b21, h27, 7);
2357 __ uqshrn(h28, s26, 11);
2358 __ uqshrn(s13, d31, 17);
2359 __ uqshrn(v21.V2S(), v16.V2D(), 8);
2360 __ uqshrn(v24.V4H(), v24.V4S(), 2);
2361 __ uqshrn(v5.V8B(), v1.V8H(), 8);
2362 __ uqshrn2(v16.V16B(), v29.V8H(), 6);
2363 __ uqshrn2(v2.V4S(), v6.V2D(), 1);
2364 __ uqshrn2(v16.V8H(), v10.V4S(), 14);
2365 __ uqsub(b28, b20, b26);
2366 __ uqsub(d0, d7, d10);
2367 __ uqsub(h26, h24, h7);
2368 __ uqsub(s23, s23, s16);
2369 __ uqsub(v14.V16B(), v16.V16B(), v24.V16B());
2370 __ uqsub(v11.V2D(), v17.V2D(), v6.V2D());
2371 __ uqsub(v10.V2S(), v10.V2S(), v8.V2S());
2372 __ uqsub(v9.V4H(), v15.V4H(), v12.V4H());
2373 __ uqsub(v23.V4S(), v18.V4S(), v7.V4S());
2374 __ uqsub(v9.V8B(), v19.V8B(), v17.V8B());
2375 __ uqsub(v20.V8H(), v2.V8H(), v6.V8H());
2376 __ uqxtn(b29, h19);
2377 __ uqxtn(h0, s13);
2378 __ uqxtn(s26, d22);
2379 __ uqxtn(v5.V2S(), v31.V2D());
2380 __ uqxtn(v30.V4H(), v19.V4S());
2381 __ uqxtn(v15.V8B(), v2.V8H());
2382 __ uqxtn2(v29.V16B(), v3.V8H());
2383 __ uqxtn2(v13.V4S(), v17.V2D());
2384 __ uqxtn2(v28.V8H(), v11.V4S());
2385 __ urecpe(v23.V2S(), v15.V2S());
2386 __ urecpe(v27.V4S(), v7.V4S());
2387 __ urhadd(v2.V16B(), v15.V16B(), v27.V16B());
2388 __ urhadd(v15.V2S(), v1.V2S(), v18.V2S());
2389 __ urhadd(v17.V4H(), v4.V4H(), v26.V4H());
2390 __ urhadd(v2.V4S(), v27.V4S(), v14.V4S());
2391 __ urhadd(v5.V8B(), v17.V8B(), v14.V8B());
2392 __ urhadd(v30.V8H(), v2.V8H(), v25.V8H());
2393 __ urshl(d4, d28, d30);
2394 __ urshl(v13.V16B(), v31.V16B(), v19.V16B());
2395 __ urshl(v14.V2D(), v23.V2D(), v21.V2D());
2396 __ urshl(v10.V2S(), v7.V2S(), v8.V2S());
2397 __ urshl(v15.V4H(), v21.V4H(), v28.V4H());
2398 __ urshl(v30.V4S(), v8.V4S(), v23.V4S());
2399 __ urshl(v31.V8B(), v20.V8B(), v5.V8B());
2400 __ urshl(v30.V8H(), v27.V8H(), v30.V8H());
2401 __ urshr(d4, d13, 49);
2402 __ urshr(v2.V16B(), v20.V16B(), 1);
2403 __ urshr(v13.V2D(), v11.V2D(), 51);
2404 __ urshr(v21.V2S(), v31.V2S(), 10);
2405 __ urshr(v21.V4H(), v17.V4H(), 11);
2406 __ urshr(v4.V4S(), v22.V4S(), 1);
2407 __ urshr(v0.V8B(), v1.V8B(), 7);
2408 __ urshr(v13.V8H(), v20.V8H(), 1);
2409 __ ursqrte(v20.V2S(), v16.V2S());
2410 __ ursqrte(v28.V4S(), v8.V4S());
2411 __ ursra(d27, d16, 45);
2412 __ ursra(v18.V16B(), v17.V16B(), 3);
2413 __ ursra(v26.V2D(), v28.V2D(), 58);
2414 __ ursra(v8.V2S(), v22.V2S(), 31);
2415 __ ursra(v31.V4H(), v4.V4H(), 7);
2416 __ ursra(v31.V4S(), v15.V4S(), 2);
2417 __ ursra(v3.V8B(), v1.V8B(), 5);
2418 __ ursra(v18.V8H(), v14.V8H(), 13);
2419 __ ushl(d31, d0, d16);
2420 __ ushl(v0.V16B(), v6.V16B(), v2.V16B());
2421 __ ushl(v18.V2D(), v1.V2D(), v18.V2D());
2422 __ ushl(v27.V2S(), v7.V2S(), v29.V2S());
2423 __ ushl(v14.V4H(), v14.V4H(), v13.V4H());
2424 __ ushl(v22.V4S(), v4.V4S(), v9.V4S());
2425 __ ushl(v23.V8B(), v22.V8B(), v27.V8B());
2426 __ ushl(v21.V8H(), v25.V8H(), v8.V8H());
2427 __ ushll(v11.V2D(), v0.V2S(), 21);
2428 __ ushll(v2.V4S(), v17.V4H(), 8);
2429 __ ushll(v11.V8H(), v14.V8B(), 1);
2430 __ ushll2(v8.V2D(), v29.V4S(), 7);
2431 __ ushll2(v29.V4S(), v9.V8H(), 2);
2432 __ ushll2(v5.V8H(), v24.V16B(), 6);
2433 __ ushr(d28, d27, 53);
2434 __ ushr(v1.V16B(), v9.V16B(), 7);
2435 __ ushr(v2.V2D(), v24.V2D(), 43);
2436 __ ushr(v30.V2S(), v25.V2S(), 11);
2437 __ ushr(v10.V4H(), v26.V4H(), 12);
2438 __ ushr(v4.V4S(), v5.V4S(), 30);
2439 __ ushr(v30.V8B(), v2.V8B(), 1);
2440 __ ushr(v6.V8H(), v12.V8H(), 2);
2441 __ usqadd(b19, b5);
2442 __ usqadd(d9, d2);
2443 __ usqadd(h2, h16);
2444 __ usqadd(s16, s3);
2445 __ usqadd(v31.V16B(), v29.V16B());
2446 __ usqadd(v8.V2D(), v10.V2D());
2447 __ usqadd(v18.V2S(), v9.V2S());
2448 __ usqadd(v24.V4H(), v14.V4H());
2449 __ usqadd(v10.V4S(), v30.V4S());
2450 __ usqadd(v16.V8B(), v20.V8B());
2451 __ usqadd(v12.V8H(), v16.V8H());
2452 __ usra(d28, d27, 37);
2453 __ usra(v5.V16B(), v22.V16B(), 5);
2454 __ usra(v2.V2D(), v19.V2D(), 33);
2455 __ usra(v0.V2S(), v0.V2S(), 21);
2456 __ usra(v7.V4H(), v6.V4H(), 12);
2457 __ usra(v4.V4S(), v17.V4S(), 9);
2458 __ usra(v9.V8B(), v12.V8B(), 7);
2459 __ usra(v3.V8H(), v27.V8H(), 14);
2460 __ usubl(v29.V2D(), v12.V2S(), v30.V2S());
2461 __ usubl(v29.V4S(), v28.V4H(), v6.V4H());
2462 __ usubl(v12.V8H(), v4.V8B(), v14.V8B());
2463 __ usubl2(v1.V2D(), v24.V4S(), v17.V4S());
2464 __ usubl2(v4.V4S(), v1.V8H(), v3.V8H());
2465 __ usubl2(v23.V8H(), v4.V16B(), v7.V16B());
2466 __ usubw(v9.V2D(), v20.V2D(), v30.V2S());
2467 __ usubw(v20.V4S(), v16.V4S(), v23.V4H());
2468 __ usubw(v25.V8H(), v8.V8H(), v29.V8B());
2469 __ usubw2(v18.V2D(), v29.V2D(), v6.V4S());
2470 __ usubw2(v6.V4S(), v6.V4S(), v20.V8H());
2471 __ usubw2(v18.V8H(), v4.V8H(), v16.V16B());
2472 __ uxtl(v27.V2D(), v21.V2S());
2473 __ uxtl(v0.V4S(), v31.V4H());
2474 __ uxtl(v27.V8H(), v10.V8B());
2475 __ uxtl2(v6.V2D(), v16.V4S());
2476 __ uxtl2(v22.V4S(), v20.V8H());
2477 __ uxtl2(v20.V8H(), v21.V16B());
2478 __ uzp1(v30.V16B(), v9.V16B(), v17.V16B());
2479 __ uzp1(v7.V2D(), v26.V2D(), v28.V2D());
2480 __ uzp1(v26.V2S(), v16.V2S(), v22.V2S());
2481 __ uzp1(v14.V4H(), v19.V4H(), v6.V4H());
2482 __ uzp1(v17.V4S(), v23.V4S(), v30.V4S());
2483 __ uzp1(v28.V8B(), v27.V8B(), v13.V8B());
2484 __ uzp1(v17.V8H(), v1.V8H(), v12.V8H());
2485 __ uzp2(v8.V16B(), v18.V16B(), v26.V16B());
2486 __ uzp2(v21.V2D(), v22.V2D(), v24.V2D());
2487 __ uzp2(v20.V2S(), v21.V2S(), v2.V2S());
2488 __ uzp2(v16.V4H(), v31.V4H(), v6.V4H());
2489 __ uzp2(v25.V4S(), v11.V4S(), v8.V4S());
2490 __ uzp2(v31.V8B(), v31.V8B(), v13.V8B());
2491 __ uzp2(v8.V8H(), v17.V8H(), v1.V8H());
2492 __ xtn(v17.V2S(), v26.V2D());
2493 __ xtn(v3.V4H(), v0.V4S());
2494 __ xtn(v18.V8B(), v8.V8H());
2495 __ xtn2(v0.V16B(), v0.V8H());
2496 __ xtn2(v15.V4S(), v4.V2D());
2497 __ xtn2(v31.V8H(), v18.V4S());
2498 __ zip1(v22.V16B(), v9.V16B(), v6.V16B());
2499 __ zip1(v23.V2D(), v11.V2D(), v2.V2D());
2500 __ zip1(v26.V2S(), v16.V2S(), v9.V2S());
2501 __ zip1(v1.V4H(), v9.V4H(), v7.V4H());
2502 __ zip1(v0.V4S(), v30.V4S(), v20.V4S());
2503 __ zip1(v30.V8B(), v17.V8B(), v15.V8B());
2504 __ zip1(v17.V8H(), v8.V8H(), v2.V8H());
2505 __ zip2(v23.V16B(), v10.V16B(), v11.V16B());
2506 __ zip2(v30.V2D(), v6.V2D(), v14.V2D());
2507 __ zip2(v9.V2S(), v10.V2S(), v21.V2S());
2508 __ zip2(v8.V4H(), v24.V4H(), v29.V4H());
2509 __ zip2(v0.V4S(), v21.V4S(), v23.V4S());
2510 __ zip2(v25.V8B(), v23.V8B(), v30.V8B());
2511 __ zip2(v7.V8H(), v10.V8H(), v30.V8H());
2512 } // NOLINT(readability/fn_size)
2513
2514
// Emits a fixed sequence of NEON floating-point instructions into `masm`.
//
// The whole sequence is generated inside an ExactAssemblyScope sized to the
// bytes remaining in the buffer (kMaximumSize). Instructions are listed in
// alphabetical order of mnemonic. TraceTestHelper and
// PrintDisassemblerTestHelper both call this function and diff the output they
// produce against a reference file, so adding, removing or reordering
// instructions changes the expected output.
//
// NOTE(review): `__` is a macro defined outside this chunk; presumably it
// forwards to `masm->` - confirm.
static void GenerateTestSequenceNEONFP(MacroAssembler* masm) {
  ExactAssemblyScope guard(masm,
                           masm->GetBuffer()->GetRemainingBytes(),
                           ExactAssemblyScope::kMaximumSize);

  // NEON floating point instructions.
  __ fabd(v3.V2D(), v25.V2D(), v8.V2D());
  __ fabd(v14.V2S(), v27.V2S(), v11.V2S());
  __ fabd(v9.V4S(), v22.V4S(), v18.V4S());
  __ fabs(v1.V2D(), v29.V2D());
  __ fabs(v6.V2S(), v21.V2S());
  __ fabs(v12.V4S(), v25.V4S());
  __ facge(v18.V2D(), v5.V2D(), v0.V2D());
  __ facge(v15.V2S(), v11.V2S(), v6.V2S());
  __ facge(v30.V4S(), v10.V4S(), v25.V4S());
  __ facgt(v28.V2D(), v16.V2D(), v31.V2D());
  __ facgt(v15.V2S(), v1.V2S(), v4.V2S());
  __ facgt(v22.V4S(), v3.V4S(), v10.V4S());
  __ fadd(v7.V2D(), v10.V2D(), v24.V2D());
  __ fadd(v10.V2S(), v23.V2S(), v7.V2S());
  __ fadd(v16.V4S(), v22.V4S(), v11.V4S());
  __ faddp(d27, v28.V2D());
  __ faddp(s20, v23.V2S());
  __ faddp(v21.V2D(), v4.V2D(), v11.V2D());
  __ faddp(v31.V2S(), v26.V2S(), v1.V2S());
  __ faddp(v13.V4S(), v27.V4S(), v28.V4S());
  // Comparisons appear in register-register form and in compare-with-0.0 form.
  __ fcmeq(v17.V2D(), v13.V2D(), v20.V2D());
  __ fcmeq(v24.V2D(), v16.V2D(), 0.0);
  __ fcmeq(v26.V2S(), v17.V2S(), v10.V2S());
  __ fcmeq(v24.V2S(), v4.V2S(), 0.0);
  __ fcmeq(v8.V4S(), v4.V4S(), v14.V4S());
  __ fcmeq(v26.V4S(), v25.V4S(), 0.0);
  __ fcmge(v27.V2D(), v0.V2D(), v0.V2D());
  __ fcmge(v22.V2D(), v30.V2D(), 0.0);
  __ fcmge(v7.V2S(), v21.V2S(), v25.V2S());
  __ fcmge(v15.V2S(), v15.V2S(), 0.0);
  __ fcmge(v29.V4S(), v4.V4S(), v27.V4S());
  __ fcmge(v22.V4S(), v21.V4S(), 0.0);
  __ fcmgt(v1.V2D(), v26.V2D(), v15.V2D());
  __ fcmgt(v15.V2D(), v23.V2D(), 0.0);
  __ fcmgt(v21.V2S(), v16.V2S(), v6.V2S());
  __ fcmgt(v1.V2S(), v13.V2S(), 0.0);
  __ fcmgt(v14.V4S(), v0.V4S(), v25.V4S());
  __ fcmgt(v13.V4S(), v8.V4S(), 0.0);
  __ fcmle(v4.V2D(), v6.V2D(), 0.0);
  __ fcmle(v24.V2S(), v31.V2S(), 0.0);
  __ fcmle(v8.V4S(), v23.V4S(), 0.0);
  __ fcmlt(v7.V2D(), v3.V2D(), 0.0);
  __ fcmlt(v15.V2S(), v21.V2S(), 0.0);
  __ fcmlt(v1.V4S(), v2.V4S(), 0.0);
  __ fcvtas(v6.V2D(), v8.V2D());
  __ fcvtas(v1.V2S(), v9.V2S());
  __ fcvtas(v8.V4S(), v19.V4S());
  __ fcvtau(v5.V2D(), v31.V2D());
  __ fcvtau(v28.V2S(), v29.V2S());
  __ fcvtau(v11.V4S(), v26.V4S());
  __ fcvtl(v8.V2D(), v25.V2S());
  __ fcvtl(v27.V4S(), v14.V4H());
  __ fcvtl2(v1.V2D(), v6.V4S());
  __ fcvtl2(v24.V4S(), v9.V8H());
  __ fcvtms(v9.V2D(), v24.V2D());
  __ fcvtms(v7.V2S(), v11.V2S());
  __ fcvtms(v23.V4S(), v21.V4S());
  __ fcvtmu(v13.V2D(), v1.V2D());
  __ fcvtmu(v26.V2S(), v12.V2S());
  __ fcvtmu(v21.V4S(), v21.V4S());
  __ fcvtn(v11.V2S(), v1.V2D());
  __ fcvtn(v8.V4H(), v2.V4S());
  __ fcvtn2(v24.V4S(), v29.V2D());
  __ fcvtn2(v4.V8H(), v10.V4S());
  __ fcvtns(v25.V2D(), v10.V2D());
  __ fcvtns(v4.V2S(), v8.V2S());
  __ fcvtns(v29.V4S(), v27.V4S());
  __ fcvtnu(v18.V2D(), v27.V2D());
  __ fcvtnu(v11.V2S(), v14.V2S());
  __ fcvtnu(v27.V4S(), v21.V4S());
  __ fcvtps(v23.V2D(), v5.V2D());
  __ fcvtps(v24.V2S(), v15.V2S());
  __ fcvtps(v5.V4S(), v19.V4S());
  __ fcvtpu(v3.V2D(), v21.V2D());
  __ fcvtpu(v3.V2S(), v21.V2S());
  __ fcvtpu(v0.V4S(), v7.V4S());
  __ fcvtxn(v29.V2S(), v11.V2D());
  __ fcvtxn2(v31.V4S(), v25.V2D());
  // fcvtzs/fcvtzu are emitted both without and with a trailing integer
  // argument.
  __ fcvtzs(v19.V2D(), v17.V2D());
  __ fcvtzs(v12.V2D(), v24.V2D(), 64);
  __ fcvtzs(v9.V2S(), v2.V2S());
  __ fcvtzs(v5.V2S(), v20.V2S(), 29);
  __ fcvtzs(v21.V4S(), v25.V4S());
  __ fcvtzs(v26.V4S(), v1.V4S(), 6);
  __ fcvtzu(v13.V2D(), v25.V2D());
  __ fcvtzu(v28.V2D(), v13.V2D(), 32);
  __ fcvtzu(v26.V2S(), v6.V2S());
  __ fcvtzu(v9.V2S(), v10.V2S(), 15);
  __ fcvtzu(v30.V4S(), v6.V4S());
  __ fcvtzu(v19.V4S(), v22.V4S(), 18);
  __ fdiv(v15.V2D(), v8.V2D(), v15.V2D());
  __ fdiv(v12.V2S(), v9.V2S(), v26.V2S());
  __ fdiv(v19.V4S(), v22.V4S(), v19.V4S());
  __ fmax(v19.V2D(), v7.V2D(), v8.V2D());
  __ fmax(v25.V2S(), v12.V2S(), v29.V2S());
  __ fmax(v6.V4S(), v15.V4S(), v5.V4S());
  __ fmaxnm(v16.V2D(), v8.V2D(), v20.V2D());
  __ fmaxnm(v15.V2S(), v26.V2S(), v25.V2S());
  __ fmaxnm(v23.V4S(), v14.V4S(), v16.V4S());
  __ fmaxnmp(d6, v19.V2D());
  __ fmaxnmp(s27, v26.V2S());
  __ fmaxnmp(v8.V2D(), v12.V2D(), v23.V2D());
  __ fmaxnmp(v13.V2S(), v25.V2S(), v22.V2S());
  __ fmaxnmp(v15.V4S(), v11.V4S(), v17.V4S());
  __ fmaxnmv(s27, v19.V4S());
  __ fmaxp(d20, v14.V2D());
  __ fmaxp(s18, v2.V2S());
  __ fmaxp(v9.V2D(), v23.V2D(), v31.V2D());
  __ fmaxp(v7.V2S(), v22.V2S(), v31.V2S());
  __ fmaxp(v18.V4S(), v7.V4S(), v29.V4S());
  __ fmaxv(s31, v29.V4S());
  __ fmin(v2.V2D(), v5.V2D(), v2.V2D());
  __ fmin(v31.V2S(), v17.V2S(), v10.V2S());
  __ fmin(v10.V4S(), v4.V4S(), v16.V4S());
  __ fminnm(v21.V2D(), v6.V2D(), v5.V2D());
  __ fminnm(v22.V2S(), v18.V2S(), v14.V2S());
  __ fminnm(v25.V4S(), v31.V4S(), v3.V4S());
  __ fminnmp(d9, v1.V2D());
  __ fminnmp(s21, v20.V2S());
  __ fminnmp(v16.V2D(), v21.V2D(), v19.V2D());
  __ fminnmp(v16.V2S(), v31.V2S(), v25.V2S());
  __ fminnmp(v26.V4S(), v16.V4S(), v15.V4S());
  __ fminnmv(s3, v4.V4S());
  __ fminp(d24, v26.V2D());
  __ fminp(s7, v17.V2S());
  __ fminp(v23.V2D(), v19.V2D(), v3.V2D());
  __ fminp(v29.V2S(), v21.V2S(), v9.V2S());
  __ fminp(v0.V4S(), v24.V4S(), v21.V4S());
  __ fminv(s25, v8.V4S());
  // fmla/fmls/fmul/fmulx are emitted in vector form and in forms that take a
  // single-lane operand (v<n>.D() / v<n>.S()) plus a lane index.
  __ fmla(d23, d0, v9.D(), 1);
  __ fmla(s23, s15, v7.S(), 0);
  __ fmla(v17.V2D(), v11.V2D(), v6.V2D());
  __ fmla(v30.V2D(), v30.V2D(), v11.D(), 0);
  __ fmla(v19.V2S(), v12.V2S(), v6.V2S());
  __ fmla(v24.V2S(), v17.V2S(), v9.S(), 0);
  __ fmla(v16.V4S(), v11.V4S(), v11.V4S());
  __ fmla(v27.V4S(), v23.V4S(), v9.S(), 2);
  __ fmls(d27, d30, v6.D(), 0);
  __ fmls(s21, s16, v2.S(), 0);
  __ fmls(v5.V2D(), v19.V2D(), v21.V2D());
  __ fmls(v18.V2D(), v30.V2D(), v12.D(), 0);
  __ fmls(v5.V2S(), v16.V2S(), v7.V2S());
  __ fmls(v3.V2S(), v18.V2S(), v11.S(), 1);
  __ fmls(v27.V4S(), v5.V4S(), v30.V4S());
  __ fmls(v26.V4S(), v20.V4S(), v4.S(), 3);
  __ fmov(v14.V2D(), -0.34375);
  __ fmov(v26.V2S(), 0.90625f);
  __ fmov(v31.V4S(), -5.0000f);
  __ fmov(v28.D(), 1, x25);
  __ fmov(x18, v2.D(), 1);
  __ fmul(d12, d4, v1.D(), 1);
  __ fmul(s30, s1, v15.S(), 3);
  __ fmul(v25.V2D(), v0.V2D(), v21.V2D());
  __ fmul(v10.V2D(), v24.V2D(), v10.D(), 1);
  __ fmul(v7.V2S(), v24.V2S(), v16.V2S());
  __ fmul(v1.V2S(), v16.V2S(), v4.S(), 2);
  __ fmul(v5.V4S(), v28.V4S(), v25.V4S());
  __ fmul(v11.V4S(), v3.V4S(), v8.S(), 0);
  __ fmulx(d28, d9, v3.D(), 1);
  __ fmulx(s25, s21, v15.S(), 1);
  __ fmulx(v31.V2D(), v28.V2D(), v8.V2D());
  __ fmulx(v3.V2D(), v21.V2D(), v6.D(), 0);
  __ fmulx(v9.V2S(), v1.V2S(), v0.V2S());
  __ fmulx(v16.V2S(), v27.V2S(), v6.S(), 0);
  __ fmulx(v2.V4S(), v4.V4S(), v5.V4S());
  __ fmulx(v18.V4S(), v7.V4S(), v4.S(), 0);
  __ fneg(v1.V2D(), v25.V2D());
  __ fneg(v14.V2S(), v31.V2S());
  __ fneg(v5.V4S(), v4.V4S());
  __ frecpe(v18.V2D(), v12.V2D());
  __ frecpe(v10.V2S(), v22.V2S());
  __ frecpe(v5.V4S(), v6.V4S());
  __ frecps(v22.V2D(), v7.V2D(), v26.V2D());
  __ frecps(v31.V2S(), v27.V2S(), v2.V2S());
  __ frecps(v18.V4S(), v6.V4S(), v27.V4S());
  __ frinta(v26.V2D(), v13.V2D());
  __ frinta(v15.V2S(), v26.V2S());
  __ frinta(v13.V4S(), v16.V4S());
  __ frinti(v9.V2D(), v12.V2D());
  __ frinti(v5.V2S(), v19.V2S());
  __ frinti(v15.V4S(), v11.V4S());
  __ frintm(v17.V2D(), v29.V2D());
  __ frintm(v30.V2S(), v11.V2S());
  __ frintm(v1.V4S(), v20.V4S());
  __ frintn(v24.V2D(), v6.V2D());
  __ frintn(v12.V2S(), v17.V2S());
  __ frintn(v29.V4S(), v11.V4S());
  __ frintp(v10.V2D(), v7.V2D());
  __ frintp(v12.V2S(), v18.V2S());
  __ frintp(v26.V4S(), v31.V4S());
  __ frintx(v24.V2D(), v13.V2D());
  __ frintx(v7.V2S(), v9.V2S());
  __ frintx(v18.V4S(), v21.V4S());
  __ frintz(v19.V2D(), v25.V2D());
  __ frintz(v15.V2S(), v8.V2S());
  __ frintz(v20.V4S(), v3.V4S());
  __ frsqrte(v23.V2D(), v5.V2D());
  __ frsqrte(v9.V2S(), v7.V2S());
  __ frsqrte(v3.V4S(), v9.V4S());
  __ frsqrts(v25.V2D(), v28.V2D(), v15.V2D());
  __ frsqrts(v9.V2S(), v26.V2S(), v10.V2S());
  __ frsqrts(v5.V4S(), v1.V4S(), v10.V4S());
  __ fsqrt(v6.V2D(), v18.V2D());
  __ fsqrt(v6.V2S(), v18.V2S());
  __ fsqrt(v0.V4S(), v31.V4S());
  __ fsub(v31.V2D(), v30.V2D(), v31.V2D());
  __ fsub(v11.V2S(), v8.V2S(), v6.V2S());
  __ fsub(v16.V4S(), v0.V4S(), v31.V4S());
  // scvtf/ucvtf are emitted both without and with a trailing integer argument.
  __ scvtf(v25.V2D(), v31.V2D());
  __ scvtf(v10.V2D(), v13.V2D(), 45);
  __ scvtf(v10.V2S(), v15.V2S());
  __ scvtf(v18.V2S(), v4.V2S(), 27);
  __ scvtf(v17.V4S(), v5.V4S());
  __ scvtf(v11.V4S(), v25.V4S(), 24);
  __ ucvtf(v9.V2D(), v3.V2D());
  __ ucvtf(v26.V2D(), v30.V2D(), 46);
  __ ucvtf(v11.V2S(), v4.V2S());
  __ ucvtf(v29.V2S(), v3.V2S(), 25);
  __ ucvtf(v22.V4S(), v23.V4S());
  __ ucvtf(v18.V4S(), v9.V4S(), 25);
}
2742
2743
// Emits a fixed sequence of SVE load and store instructions into `masm`.
//
// The sequence is generated inside an ExactAssemblyScope sized to the bytes
// remaining in the buffer, with CPUFeatures::kSVE enabled for the duration of
// the function. Every memory operand uses x0 as its base register;
// TraceTestHelper writes the address of its scratch buffer to x0 before
// running the generated code. The output produced from this sequence is diffed
// against reference files by the test helpers, so the instruction order is
// significant.
//
// NOTE(review): `__` is a macro defined outside this chunk; presumably it
// forwards to `masm->` - confirm.
static void GenerateTestSequenceSVE(MacroAssembler* masm) {
  ExactAssemblyScope guard(masm,
                           masm->GetBuffer()->GetRemainingBytes(),
                           ExactAssemblyScope::kMaximumSize);
  CPUFeaturesScope feature_guard(masm, CPUFeatures::kSVE);

  // Simple, unpredicated loads and stores.
  __ str(p12.VnD(), SVEMemOperand(x0, 11, SVE_MUL_VL));
  __ str(p13.VnS(), SVEMemOperand(x0, 11, SVE_MUL_VL));
  __ str(p14.VnH(), SVEMemOperand(x0, 11, SVE_MUL_VL));
  __ str(p15.VnB(), SVEMemOperand(x0, 11, SVE_MUL_VL));
  __ ldr(p8.VnD(), SVEMemOperand(x0, 11, SVE_MUL_VL));
  __ ldr(p9.VnS(), SVEMemOperand(x0, 11, SVE_MUL_VL));
  __ ldr(p10.VnH(), SVEMemOperand(x0, 11, SVE_MUL_VL));
  __ ldr(p11.VnB(), SVEMemOperand(x0, 11, SVE_MUL_VL));

  __ str(z0.VnD(), SVEMemOperand(x0, 11, SVE_MUL_VL));
  __ str(z1.VnS(), SVEMemOperand(x0, 11, SVE_MUL_VL));
  __ str(z2.VnH(), SVEMemOperand(x0, 11, SVE_MUL_VL));
  __ str(z3.VnB(), SVEMemOperand(x0, 11, SVE_MUL_VL));
  __ ldr(z20.VnD(), SVEMemOperand(x0, 11, SVE_MUL_VL));
  __ ldr(z21.VnS(), SVEMemOperand(x0, 11, SVE_MUL_VL));
  __ ldr(z22.VnH(), SVEMemOperand(x0, 11, SVE_MUL_VL));
  __ ldr(z23.VnB(), SVEMemOperand(x0, 11, SVE_MUL_VL));

  // Structured accesses.
  __ st1b(z0.VnB(), p2, SVEMemOperand(x0, 3, SVE_MUL_VL));
  __ st1h(z1.VnH(), p1, SVEMemOperand(x0, 3, SVE_MUL_VL));
  // NOTE(review): this is the only access in this function indexed by x3; the
  // other register-offset forms use x2. x3's value is set outside this
  // function - confirm that x3 is intended here.
  __ st1w(z2.VnS(), p1, SVEMemOperand(x0, x3, LSL, 2));
  __ st1d(z3.VnD(), p2, SVEMemOperand(x0, 3, SVE_MUL_VL));
  __ ld1b(z20.VnB(), p1.Zeroing(), SVEMemOperand(x0, 3, SVE_MUL_VL));
  __ ld1h(z21.VnH(), p2.Zeroing(), SVEMemOperand(x0, x2, LSL, 1));
  __ ld1w(z22.VnS(), p1.Zeroing(), SVEMemOperand(x0, 3, SVE_MUL_VL));
  __ ld1d(z23.VnD(), p1.Zeroing(), SVEMemOperand(x0, 3, SVE_MUL_VL));

  // Structured, packed accesses.
  __ st1b(z2.VnH(), p1, SVEMemOperand(x0, 3, SVE_MUL_VL));
  __ st1b(z3.VnS(), p2, SVEMemOperand(x0, 3, SVE_MUL_VL));
  __ st1b(z4.VnD(), p2, SVEMemOperand(x0, 3, SVE_MUL_VL));
  __ st1h(z0.VnS(), p1, SVEMemOperand(x0, 3, SVE_MUL_VL));
  __ st1h(z1.VnD(), p1, SVEMemOperand(x0, x2, LSL, 1));
  __ st1w(z2.VnD(), p1, SVEMemOperand(x0, 3, SVE_MUL_VL));
  __ ld1b(z20.VnH(), p1.Zeroing(), SVEMemOperand(x0, x2));
  __ ld1b(z21.VnS(), p1.Zeroing(), SVEMemOperand(x0, 3, SVE_MUL_VL));
  __ ld1b(z22.VnD(), p1.Zeroing(), SVEMemOperand(x0, 3, SVE_MUL_VL));
  __ ld1h(z23.VnS(), p2.Zeroing(), SVEMemOperand(x0, 3, SVE_MUL_VL));
  __ ld1h(z24.VnD(), p2.Zeroing(), SVEMemOperand(x0, 3, SVE_MUL_VL));
  __ ld1w(z20.VnD(), p1.Zeroing(), SVEMemOperand(x0, 3, SVE_MUL_VL));
  __ ld1sb(z21.VnH(), p1.Zeroing(), SVEMemOperand(x0, 3, SVE_MUL_VL));
  __ ld1sb(z22.VnS(), p1.Zeroing(), SVEMemOperand(x0, 3, SVE_MUL_VL));
  __ ld1sb(z23.VnD(), p2.Zeroing(), SVEMemOperand(x0, x2));
  __ ld1sh(z24.VnS(), p2.Zeroing(), SVEMemOperand(x0, 3, SVE_MUL_VL));
  __ ld1sh(z20.VnD(), p1.Zeroing(), SVEMemOperand(x0, 3, SVE_MUL_VL));
  __ ld1sw(z21.VnD(), p1.Zeroing(), SVEMemOperand(x0, 3, SVE_MUL_VL));

  // Structured, interleaved accesses.
  __ st2b(z0.VnB(), z1.VnB(), p4, SVEMemOperand(x0, 4, SVE_MUL_VL));
  __ st2h(z1.VnH(), z2.VnH(), p4, SVEMemOperand(x0, 4, SVE_MUL_VL));
  __ st2w(z2.VnS(), z3.VnS(), p3, SVEMemOperand(x0, x2, LSL, 2));
  __ st2d(z3.VnD(), z4.VnD(), p4, SVEMemOperand(x0, 4, SVE_MUL_VL));
  __ ld2b(z20.VnB(), z21.VnB(), p5.Zeroing(), SVEMemOperand(x0, x2));
  __ ld2h(z21.VnH(), z22.VnH(), p6.Zeroing(), SVEMemOperand(x0, 4, SVE_MUL_VL));
  __ ld2w(z22.VnS(), z23.VnS(), p6.Zeroing(), SVEMemOperand(x0, 4, SVE_MUL_VL));
  __ ld2d(z23.VnD(), z24.VnD(), p5.Zeroing(), SVEMemOperand(x0, 4, SVE_MUL_VL));

  // Three-register interleaved forms.
  __ st3b(z4.VnB(), z5.VnB(), z6.VnB(), p4, SVEMemOperand(x0, 3, SVE_MUL_VL));
  __ st3h(z5.VnH(), z6.VnH(), z7.VnH(), p4, SVEMemOperand(x0, 3, SVE_MUL_VL));
  __ st3w(z6.VnS(), z7.VnS(), z8.VnS(), p3, SVEMemOperand(x0, 3, SVE_MUL_VL));
  __ st3d(z7.VnD(), z8.VnD(), z9.VnD(), p4, SVEMemOperand(x0, x2, LSL, 3));
  __ ld3b(z24.VnB(),
          z25.VnB(),
          z26.VnB(),
          p5.Zeroing(),
          SVEMemOperand(x0, 3, SVE_MUL_VL));
  __ ld3h(z25.VnH(),
          z26.VnH(),
          z27.VnH(),
          p6.Zeroing(),
          SVEMemOperand(x0, x2, LSL, 1));
  __ ld3w(z26.VnS(),
          z27.VnS(),
          z28.VnS(),
          p6.Zeroing(),
          SVEMemOperand(x0, 3, SVE_MUL_VL));
  __ ld3d(z27.VnD(),
          z28.VnD(),
          z29.VnD(),
          p5.Zeroing(),
          SVEMemOperand(x0, 3, SVE_MUL_VL));

  // Four-register interleaved forms. The first store uses z31, z0, z1, z2, so
  // its register list wraps from z31 back to z0.
  __ st4b(z31.VnB(),
          z0.VnB(),
          z1.VnB(),
          z2.VnB(),
          p4,
          SVEMemOperand(x0, 4, SVE_MUL_VL));
  __ st4h(z0.VnH(),
          z1.VnH(),
          z2.VnH(),
          z3.VnH(),
          p4,
          SVEMemOperand(x0, 4, SVE_MUL_VL));
  __ st4w(z1.VnS(),
          z2.VnS(),
          z3.VnS(),
          z4.VnS(),
          p3,
          SVEMemOperand(x0, 4, SVE_MUL_VL));
  __ st4d(z2.VnD(),
          z3.VnD(),
          z4.VnD(),
          z5.VnD(),
          p4,
          SVEMemOperand(x0, x2, LSL, 3));
  __ ld4b(z25.VnB(),
          z26.VnB(),
          z27.VnB(),
          z28.VnB(),
          p5.Zeroing(),
          SVEMemOperand(x0, 4, SVE_MUL_VL));
  __ ld4h(z26.VnH(),
          z27.VnH(),
          z28.VnH(),
          z29.VnH(),
          p6.Zeroing(),
          SVEMemOperand(x0, 4, SVE_MUL_VL));
  __ ld4w(z27.VnS(),
          z28.VnS(),
          z29.VnS(),
          z30.VnS(),
          p6.Zeroing(),
          SVEMemOperand(x0, x2, LSL, 2));
  __ ld4d(z28.VnD(),
          z29.VnD(),
          z30.VnD(),
          z31.VnD(),
          p5.Zeroing(),
          SVEMemOperand(x0, 4, SVE_MUL_VL));
}
2883
// Emits a fixed sequence of atomic memory instructions into `masm`.
//
// The sequence is generated inside an ExactAssemblyScope sized to the bytes
// remaining in the buffer, with CPUFeatures::kAtomics enabled for the duration
// of the function. All accesses target 16 bytes claimed on the stack, which
// are released again before the function's code ends. Note that x0 is
// overwritten with a bit pattern here, so it no longer holds whatever the
// caller placed in it.
//
// NOTE(review): `__` is a macro defined outside this chunk; presumably it
// forwards to `masm->` - confirm.
static void GenerateTestSequenceAtomics(MacroAssembler* masm) {
  ExactAssemblyScope guard(masm,
                           masm->GetBuffer()->GetRemainingBytes(),
                           ExactAssemblyScope::kMaximumSize);
  CPUFeaturesScope feature_guard(masm, CPUFeatures::kAtomics);
  __ sub(sp, sp, 16);  // Claim some working space on the stack.
  __ mov(x0, 0x5555555555555555);
  __ str(x0, MemOperand(sp));  // Initialise working space.

// For an operation name OP, emit every ld<OP>* and st<OP>* variant listed
// below, all addressing [sp]:
//  - ld<OP> with the suffixes (none), a, l, al, for the b and h forms and for
//    W and X registers (16 instructions);
//  - st<OP> with the suffixes (none), l, for the b and h forms and for W and X
//    registers (8 instructions).
// The names are built by token pasting, so OP must be a bare identifier.
#define INST_LIST(OP)                      \
  __ ld##OP##b(w0, w0, MemOperand(sp));    \
  __ ld##OP##ab(w0, w1, MemOperand(sp));   \
  __ ld##OP##lb(w0, w2, MemOperand(sp));   \
  __ ld##OP##alb(w0, w3, MemOperand(sp));  \
  __ ld##OP##h(w0, w0, MemOperand(sp));    \
  __ ld##OP##ah(w0, w1, MemOperand(sp));   \
  __ ld##OP##lh(w0, w2, MemOperand(sp));   \
  __ ld##OP##alh(w0, w3, MemOperand(sp));  \
  __ ld##OP(w0, w0, MemOperand(sp));       \
  __ ld##OP##a(w0, w1, MemOperand(sp));    \
  __ ld##OP##l(w0, w2, MemOperand(sp));    \
  __ ld##OP##al(w0, w3, MemOperand(sp));   \
  __ ld##OP(x0, x0, MemOperand(sp));       \
  __ ld##OP##a(x0, x1, MemOperand(sp));    \
  __ ld##OP##l(x0, x2, MemOperand(sp));    \
  __ ld##OP##al(x0, x3, MemOperand(sp));   \
  __ st##OP##b(w0, MemOperand(sp));        \
  __ st##OP##lb(w0, MemOperand(sp));       \
  __ st##OP##h(w0, MemOperand(sp));        \
  __ st##OP##lh(w0, MemOperand(sp));       \
  __ st##OP(w0, MemOperand(sp));           \
  __ st##OP##l(w0, MemOperand(sp));        \
  __ st##OP(x0, MemOperand(sp));           \
  __ st##OP##l(x0, MemOperand(sp));

  // One expansion per atomic operation; the order here is the order in which
  // the instructions are emitted.
  INST_LIST(add);
  INST_LIST(set);
  INST_LIST(eor);
  INST_LIST(smin);
  INST_LIST(smax);
  INST_LIST(umin);
  INST_LIST(umax);
  INST_LIST(clr);

#undef INST_LIST

  __ add(sp, sp, 16);  // Restore stack pointer.
}
2932
// Rewrites the text file at path `trace` in place, masking values that change
// from run to run so that the file can be compared against a reference.
//
// Each line is passed through every pattern below in order; the hexadecimal
// digits matched after the first capture group are replaced with sixteen '~'
// characters. Every output line is terminated with "\n", including the last
// one even if the input lacked a final newline.
//
// If the file cannot be opened for reading, a message is printed to stderr and
// the file is left untouched.
static void MaskAddresses(const char* trace) {
// Optional ANSI colour escape sequence that may precede a value.
#define VIXL_COLOUR "(\x1b\\[[01];([0-9][0-9])?m)?"
  // All patterns are replaced with "$1~~~~~~~~~~~~~~~~".
  const std::regex patterns[] =
      {// Mask registers that hold addresses that change from run to run.
       std::regex("((x0|x1|x2|sp): " VIXL_COLOUR "0x)[0-9a-f]{16}"),
       // Mask accessed memory addresses.
       std::regex("((<-|->) " VIXL_COLOUR "0x)[0-9a-f]{16}"),
       // Mask instruction addresses.
       std::regex("^(0x)[0-9a-f]{16}"),
       // Mask branch targets.
       std::regex("(Branch" VIXL_COLOUR " to 0x)[0-9a-f]{16}"),
       // Mask explicit address annotations.
       std::regex("(addr 0x)[0-9a-f]+")};
#undef VIXL_COLOUR

  std::ifstream in(trace);
  if (!in.is_open()) {
    // A stream that failed to open never reaches end-of-file, so a loop on
    // `eof()` would spin forever; report the problem and give up instead.
    fprintf(stderr, "MaskAddresses: cannot open '%s'\n", trace);
    return;
  }

  // The whole file is buffered because it is rewritten in place below.
  std::vector<std::string> lines;
  std::string line;
  // Looping on `getline` itself stops at the first failed read, so no phantom
  // empty line is produced after a terminal "\n".
  while (std::getline(in, line)) {
    for (const std::regex& pattern : patterns) {
      line = std::regex_replace(line, pattern, "$1~~~~~~~~~~~~~~~~");
    }
    lines.push_back(line);
  }
  in.close();

  std::ofstream out(trace, std::ofstream::trunc);
  for (const std::string& masked : lines) {
    out << masked << "\n";
  }
}
2969
// Copies the contents of the file `name` to stdout.
//
// If the file cannot be opened, a message is printed to stderr and nothing is
// written to stdout.
static void PrintFile(const char* name) {
  FILE* file = fopen(name, "r");
  if (file == NULL) {
    // Passing a null stream to fgets/fclose is undefined behaviour.
    fprintf(stderr, "PrintFile: cannot open '%s'\n", name);
    return;
  }
  char buffer[1024];  // The buffer size is arbitrary.
  while (fgets(buffer, sizeof(buffer), file) != NULL) fputs(buffer, stdout);
  fclose(file);
}
2976
CheckOrGenerateTrace(const char * filename,const char * ref_file)2977 static bool CheckOrGenerateTrace(const char* filename, const char* ref_file) {
2978 bool trace_matched_reference;
2979 if (Test::generate_test_trace()) {
2980 // Copy trace_stream to stdout.
2981 FILE* trace_stream = fopen(filename, "r");
2982 VIXL_ASSERT(trace_stream != NULL);
2983 fseek(trace_stream, 0, SEEK_SET);
2984 int c;
2985 while (1) {
2986 c = getc(trace_stream);
2987 if (c == EOF) break;
2988 putc(c, stdout);
2989 }
2990 fclose(trace_stream);
2991 trace_matched_reference = true;
2992 } else {
2993 // Check trace_stream against ref_file.
2994 char command[1024];
2995 size_t length =
2996 snprintf(command, sizeof(command), "diff -u %s %s", ref_file, filename);
2997 VIXL_CHECK(length < sizeof(command));
2998 trace_matched_reference = (system(command) == 0);
2999 }
3000 return trace_matched_reference;
3001 }
3002
3003
3004 // Trace tests can only work with the simulator.
3005 #ifdef VIXL_INCLUDE_SIMULATOR_AARCH64
3006
TraceTestHelper(bool coloured_trace,TraceParameters trace_parameters,const char * ref_file)3007 static void TraceTestHelper(bool coloured_trace,
3008 TraceParameters trace_parameters,
3009 const char* ref_file) {
3010 MacroAssembler masm(12 * KBytes);
3011
3012 char trace_stream_filename[] = "/tmp/vixl-test-trace-XXXXXX";
3013 FILE* trace_stream = fdopen(mkstemp(trace_stream_filename), "w");
3014
3015 Decoder decoder;
3016 Simulator simulator(&decoder, trace_stream);
3017 simulator.SetColouredTrace(coloured_trace);
3018 simulator.SetTraceParameters(trace_parameters);
3019 simulator.SilenceExclusiveAccessWarning();
3020
3021 const int vl_in_bytes = 5 * kZRegMinSizeInBytes;
3022 const int vl_in_bits = vl_in_bytes * kBitsPerByte;
3023 const int pl_in_bits = vl_in_bits / kZRegBitsPerPRegBit;
3024 simulator.SetVectorLengthInBits(vl_in_bits);
3025
3026 // Set up a scratch buffer so we can test loads and stores.
3027 const int kScratchSize = vl_in_bytes * 1024;
3028 const int kScratchGuardSize = vl_in_bytes;
3029 char scratch_buffer[kScratchSize + kScratchGuardSize];
3030 for (size_t i = 0; i < (sizeof(scratch_buffer) / sizeof(scratch_buffer[0]));
3031 i++) {
3032 scratch_buffer[i] = i & 0xff;
3033 }
3034 // Used for offset addressing.
3035 simulator.WriteXRegister(0, reinterpret_cast<uintptr_t>(scratch_buffer));
3036 // Used for pre-/post-index addressing.
3037 simulator.WriteXRegister(1, reinterpret_cast<uintptr_t>(scratch_buffer));
3038
3039 const int kPostIndexRegisterStep = 13; // Arbitrary interesting value.
3040 // Used for post-index offsets.
3041 simulator.WriteXRegister(2, kPostIndexRegisterStep);
3042
3043 // Initialize the other registers with unique values.
3044 uint64_t initial_base_u64 = 0x0100001000100101;
3045 for (unsigned i = 3; i < kNumberOfRegisters; i++) {
3046 if (i == kLinkRegCode) continue;
3047 if (i == kZeroRegCode) continue;
3048 // NoRegLog suppresses the log now, but the registers will still be logged
3049 // before the first instruction is executed since they have been written but
3050 // not printed.
3051 simulator.WriteRegister(i, initial_base_u64 * i, Simulator::NoRegLog);
3052 }
3053 for (unsigned r = 0; r < kNumberOfVRegisters; r++) {
3054 LogicVRegister reg(simulator.ReadVRegister(r));
3055 // Try to initialise Z registers with reasonable FP values. We prioritise
3056 // setting double values, then floats and half-precision values. The lanes
3057 // overlap, so this is a compromise, but d0, s0 and h0 views all see similar
3058 // arithmetic values.
3059 //
3060 // The exponent of each value is set to the (biased) register number. We set
3061 // the double, float and half-precision exponents where we can.
3062 uint64_t base = 0x3ff000003f803c00 + (0x0010000000800400 * (0x7f + r));
3063 for (unsigned lane = 0; lane < (vl_in_bytes / kDRegSizeInBytes); lane++) {
3064 uint64_t mantissas = 0x0000000100010001 * (lane & 0x7f);
3065 reg.SetUint(kFormatVnD, lane, base | mantissas);
3066 }
3067 }
3068 for (unsigned r = 0; r < kNumberOfPRegisters; r++) {
3069 LogicPRegister reg(simulator.ReadPRegister(r));
3070 // Set `r` active lanes between each inactive lane.
3071 for (unsigned bit = 0; bit < pl_in_bits; bit++) {
3072 reg.SetActive(kFormatVnB, bit, ((bit + 1) % (r + 2)) != 0);
3073 }
3074 // Completely clear some Q-sized blocks. The trace will completely omit
3075 // these for stores.
3076 for (unsigned chunk = 0; chunk < (vl_in_bits / kQRegSize); chunk++) {
3077 if (((chunk + 1) % (r + 2)) == 0) {
3078 reg.SetActiveMask(chunk, static_cast<uint16_t>(0));
3079 }
3080 }
3081 }
3082
3083 GenerateTestSequenceBase(&masm);
3084 GenerateTestSequenceFP(&masm);
3085 GenerateTestSequenceNEON(&masm);
3086 GenerateTestSequenceNEONFP(&masm);
3087 GenerateTestSequenceSVE(&masm);
3088 GenerateTestSequenceAtomics(&masm);
3089 masm.Ret();
3090 masm.FinalizeCode();
3091
3092 if (Test::disassemble()) {
3093 PrintDisassembler disasm(stdout);
3094 Instruction* start = masm.GetBuffer()->GetStartAddress<Instruction*>();
3095 Instruction* end = masm.GetBuffer()->GetEndAddress<Instruction*>();
3096 disasm.DisassembleBuffer(start, end);
3097 }
3098
3099 simulator.RunFrom(masm.GetBuffer()->GetStartAddress<Instruction*>());
3100
3101 fclose(trace_stream);
3102
3103 // We already traced into the temporary file, so just print the file.
3104 // Note that these tests need to control the trace flags, so we ignore all
3105 // --trace-* options here except for --trace-sim.
3106 if (Test::trace_sim()) PrintFile(trace_stream_filename);
3107
3108 MaskAddresses(trace_stream_filename);
3109
3110 bool trace_matched_reference =
3111 CheckOrGenerateTrace(trace_stream_filename, ref_file);
3112 remove(trace_stream_filename); // Clean up before checking the result.
3113 VIXL_CHECK(trace_matched_reference);
3114
3115 uint64_t offset_base = simulator.ReadRegister<uint64_t>(0);
3116 uint64_t index_base = simulator.ReadRegister<uint64_t>(1);
3117
3118 VIXL_CHECK(index_base >= offset_base);
3119 VIXL_CHECK((index_base - offset_base) <= kScratchSize);
3120 }
3121
3122
// Test individual options.
// Each test runs TraceTestHelper without colour, with a single trace parameter,
// and compares the result against the reference file named by REF(...).
TEST(disasm) { TraceTestHelper(false, LOG_DISASM, REF("log-disasm")); }
TEST(regs) { TraceTestHelper(false, LOG_REGS, REF("log-regs")); }
TEST(vregs) { TraceTestHelper(false, LOG_VREGS, REF("log-vregs")); }
TEST(sysregs) { TraceTestHelper(false, LOG_SYSREGS, REF("log-sysregs")); }
TEST(write) { TraceTestHelper(false, LOG_WRITE, REF("log-write")); }
// NOTE(review): this passes LOG_WRITE, the same parameter as TEST(write),
// although the test and its reference file are named "branch". Confirm whether
// a branch-specific trace parameter was intended; changing it would also
// require regenerating the reference file.
TEST(branch) { TraceTestHelper(false, LOG_WRITE, REF("log-branch")); }
3130
// Test standard combinations.
// Each test runs TraceTestHelper without colour, using one of the combined
// trace parameter values (LOG_NONE, LOG_STATE, LOG_ALL).
TEST(none) { TraceTestHelper(false, LOG_NONE, REF("log-none")); }
TEST(state) { TraceTestHelper(false, LOG_STATE, REF("log-state")); }
TEST(all) { TraceTestHelper(false, LOG_ALL, REF("log-all")); }
3135
3136
// Test individual options (with colour).
// Each test runs TraceTestHelper with coloured output enabled and a single
// trace parameter, and compares against a "-colour" reference file.
TEST(disasm_colour) {
  TraceTestHelper(true, LOG_DISASM, REF("log-disasm-colour"));
}
TEST(regs_colour) { TraceTestHelper(true, LOG_REGS, REF("log-regs-colour")); }
TEST(vregs_colour) {
  TraceTestHelper(true, LOG_VREGS, REF("log-vregs-colour"));
}
TEST(sysregs_colour) {
  TraceTestHelper(true, LOG_SYSREGS, REF("log-sysregs-colour"));
}
TEST(write_colour) {
  TraceTestHelper(true, LOG_WRITE, REF("log-write-colour"));
}
// NOTE(review): this passes LOG_WRITE, the same parameter as
// TEST(write_colour), although the test and its reference file are named
// "branch". Confirm whether a branch-specific trace parameter was intended;
// changing it would also require regenerating the reference file.
TEST(branch_colour) {
  TraceTestHelper(true, LOG_WRITE, REF("log-branch-colour"));
}
3154
// Test standard combinations (with colour).
// Each test runs TraceTestHelper with coloured output enabled, using one of
// the combined trace parameter values (LOG_NONE, LOG_STATE, LOG_ALL).
TEST(none_colour) { TraceTestHelper(true, LOG_NONE, REF("log-none-colour")); }
TEST(state_colour) {
  TraceTestHelper(true, LOG_STATE, REF("log-state-colour"));
}
TEST(all_colour) { TraceTestHelper(true, LOG_ALL, REF("log-all-colour")); }
3161
3162 #endif // VIXL_INCLUDE_SIMULATOR_AARCH64
3163
PrintDisassemblerTestHelper(const char * prefix,const char * suffix,const char * ref_file)3164 static void PrintDisassemblerTestHelper(const char* prefix,
3165 const char* suffix,
3166 const char* ref_file) {
3167 MacroAssembler masm(12 * KBytes);
3168
3169 char trace_stream_filename[] = "/tmp/vixl-test-trace-XXXXXX";
3170 FILE* trace_stream = fdopen(mkstemp(trace_stream_filename), "w");
3171
3172 // We don't need to execute this code so there's no need for the execution
3173 // environment setup from TraceTestHelper.
3174
3175 GenerateTestSequenceBase(&masm);
3176 GenerateTestSequenceFP(&masm);
3177 GenerateTestSequenceNEON(&masm);
3178 GenerateTestSequenceNEONFP(&masm);
3179 GenerateTestSequenceSVE(&masm);
3180 GenerateTestSequenceAtomics(&masm);
3181 masm.FinalizeCode();
3182
3183 Decoder decoder;
3184 CPUFeaturesAuditor auditor(&decoder);
3185 PrintDisassembler disasm(trace_stream);
3186 if (prefix != NULL) disasm.SetCPUFeaturesPrefix(prefix);
3187 if (suffix != NULL) disasm.SetCPUFeaturesSuffix(suffix);
3188 disasm.RegisterCPUFeaturesAuditor(&auditor);
3189 decoder.AppendVisitor(&disasm);
3190
3191 Instruction* instruction = masm.GetBuffer()->GetStartAddress<Instruction*>();
3192 Instruction* end = masm.GetCursorAddress<Instruction*>();
3193 while (instruction != end) {
3194 decoder.Decode(instruction);
3195 instruction += kInstructionSize;
3196 }
3197
3198 fclose(trace_stream);
3199
3200 // We already disassembled into the temporary file, so just print the file.
3201 if (Test::disassemble()) PrintFile(trace_stream_filename);
3202
3203 MaskAddresses(trace_stream_filename);
3204
3205 bool trace_matched_reference =
3206 CheckOrGenerateTrace(trace_stream_filename, ref_file);
3207 remove(trace_stream_filename); // Clean up before checking the result.
3208 VIXL_CHECK(trace_matched_reference);
3209 }
3210
3211
// Test CPUFeatures disassembly annotations.
// The three tests differ only in the prefix/suffix strings handed to
// PrintDisassemblerTestHelper: none (NULL, NULL), a plain-text pair, and a
// pair of terminal escape sequences.
TEST(cpufeatures) {
  PrintDisassemblerTestHelper(NULL, NULL, REF("log-cpufeatures"));
}
TEST(cpufeatures_custom) {
  PrintDisassemblerTestHelper("### {", "} ###", REF("log-cpufeatures-custom"));
}
TEST(cpufeatures_colour) {
  // The colour chosen is arbitrary.
  PrintDisassemblerTestHelper("\033[1;35m",  // Prefix: Bold magenta.
                              "\033[0;m",    // Suffix: Reset colour.
                              REF("log-cpufeatures-colour"));
}
3225 } // namespace aarch64
3226 } // namespace vixl
3227