1 // Copyright 2021 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "src/codegen/shared-ia32-x64/macro-assembler-shared-ia32-x64.h"
6
7 #include "src/codegen/assembler.h"
8 #include "src/codegen/cpu-features.h"
9 #include "src/codegen/register.h"
10
11 #if V8_TARGET_ARCH_IA32
12 #include "src/codegen/ia32/register-ia32.h"
13 #elif V8_TARGET_ARCH_X64
14 #include "src/codegen/x64/register-x64.h"
15 #else
16 #error Unsupported target architecture.
17 #endif
18
// Operand on IA32 can be a wrapper for a single register, in which case
// callers should use the overload that takes |src| as a Register instead
// (e.g. I8x16Splat).
21 #if V8_TARGET_ARCH_IA32
22 #define DCHECK_OPERAND_IS_NOT_REG(op) DCHECK(!op.is_reg_only());
23 #else
24 #define DCHECK_OPERAND_IS_NOT_REG(op)
25 #endif
26
27 namespace v8 {
28 namespace internal {
29
void SharedTurboAssembler::Move(Register dst, uint32_t src) {
31 // Helper to paper over the different assembler function names.
32 #if V8_TARGET_ARCH_IA32
33 mov(dst, Immediate(src));
34 #elif V8_TARGET_ARCH_X64
35 movl(dst, Immediate(src));
36 #else
37 #error Unsupported target architecture.
38 #endif
39 }
40
void SharedTurboAssembler::Move(Register dst, Register src) {
42 // Helper to paper over the different assembler function names.
43 if (dst != src) {
44 #if V8_TARGET_ARCH_IA32
45 mov(dst, src);
46 #elif V8_TARGET_ARCH_X64
47 movq(dst, src);
48 #else
49 #error Unsupported target architecture.
50 #endif
51 }
52 }
53
void SharedTurboAssembler::Add(Register dst, Immediate src) {
55 // Helper to paper over the different assembler function names.
56 #if V8_TARGET_ARCH_IA32
57 add(dst, src);
58 #elif V8_TARGET_ARCH_X64
59 addq(dst, src);
60 #else
61 #error Unsupported target architecture.
62 #endif
63 }
64
void SharedTurboAssembler::And(Register dst, Immediate src) {
66 // Helper to paper over the different assembler function names.
67 #if V8_TARGET_ARCH_IA32
68 and_(dst, src);
69 #elif V8_TARGET_ARCH_X64
70 if (is_uint32(src.value())) {
71 andl(dst, src);
72 } else {
73 andq(dst, src);
74 }
75 #else
76 #error Unsupported target architecture.
77 #endif
78 }
79
void SharedTurboAssembler::Movhps(XMMRegister dst, XMMRegister src1,
    Operand src2) {
82 if (CpuFeatures::IsSupported(AVX)) {
83 CpuFeatureScope scope(this, AVX);
84 vmovhps(dst, src1, src2);
85 } else {
86 if (dst != src1) {
87 movaps(dst, src1);
88 }
89 movhps(dst, src2);
90 }
91 }
92
void SharedTurboAssembler::Movlps(XMMRegister dst, XMMRegister src1,
    Operand src2) {
95 if (CpuFeatures::IsSupported(AVX)) {
96 CpuFeatureScope scope(this, AVX);
97 vmovlps(dst, src1, src2);
98 } else {
99 if (dst != src1) {
100 movaps(dst, src1);
101 }
102 movlps(dst, src2);
103 }
104 }
105
void SharedTurboAssembler::Pblendvb(XMMRegister dst, XMMRegister src1,
    XMMRegister src2, XMMRegister mask) {
108 if (CpuFeatures::IsSupported(AVX)) {
109 CpuFeatureScope scope(this, AVX);
110 vpblendvb(dst, src1, src2, mask);
111 } else {
112 CpuFeatureScope scope(this, SSE4_1);
113 DCHECK_EQ(mask, xmm0);
114 DCHECK_EQ(dst, src1);
115 pblendvb(dst, src2);
116 }
117 }
118
void SharedTurboAssembler::Shufps(XMMRegister dst, XMMRegister src1,
    XMMRegister src2, uint8_t imm8) {
121 if (CpuFeatures::IsSupported(AVX)) {
122 CpuFeatureScope avx_scope(this, AVX);
123 vshufps(dst, src1, src2, imm8);
124 } else {
125 if (dst != src1) {
126 movaps(dst, src1);
127 }
128 shufps(dst, src2, imm8);
129 }
130 }
131
void SharedTurboAssembler::F64x2ExtractLane(DoubleRegister dst, XMMRegister src,
    uint8_t lane) {
134 ASM_CODE_COMMENT(this);
135 if (lane == 0) {
136 if (dst != src) {
137 Movaps(dst, src);
138 }
139 } else {
140 DCHECK_EQ(1, lane);
141 if (CpuFeatures::IsSupported(AVX)) {
142 CpuFeatureScope avx_scope(this, AVX);
143 // Pass src as operand to avoid false-dependency on dst.
144 vmovhlps(dst, src, src);
145 } else {
146 movhlps(dst, src);
147 }
148 }
149 }
150
void SharedTurboAssembler::F64x2ReplaceLane(XMMRegister dst, XMMRegister src,
    DoubleRegister rep, uint8_t lane) {
153 ASM_CODE_COMMENT(this);
154 if (CpuFeatures::IsSupported(AVX)) {
155 CpuFeatureScope scope(this, AVX);
156 if (lane == 0) {
157 vmovsd(dst, src, rep);
158 } else {
159 vmovlhps(dst, src, rep);
160 }
161 } else {
162 CpuFeatureScope scope(this, SSE4_1);
163 if (dst != src) {
164 DCHECK_NE(dst, rep); // Ensure rep is not overwritten.
165 movaps(dst, src);
166 }
167 if (lane == 0) {
168 movsd(dst, rep);
169 } else {
170 movlhps(dst, rep);
171 }
172 }
173 }
174
void SharedTurboAssembler::F32x4Min(XMMRegister dst, XMMRegister lhs,
    XMMRegister rhs, XMMRegister scratch) {
177 ASM_CODE_COMMENT(this);
178 // The minps instruction doesn't propagate NaNs and +0's in its first
179 // operand. Perform minps in both orders, merge the results, and adjust.
180 if (CpuFeatures::IsSupported(AVX)) {
181 CpuFeatureScope scope(this, AVX);
182 vminps(scratch, lhs, rhs);
183 vminps(dst, rhs, lhs);
184 } else if (dst == lhs || dst == rhs) {
185 XMMRegister src = dst == lhs ? rhs : lhs;
186 movaps(scratch, src);
187 minps(scratch, dst);
188 minps(dst, src);
189 } else {
190 movaps(scratch, lhs);
191 minps(scratch, rhs);
192 movaps(dst, rhs);
193 minps(dst, lhs);
194 }
195 // Propagate -0's and NaNs, which may be non-canonical.
196 Orps(scratch, dst);
197 // Canonicalize NaNs by quieting and clearing the payload.
198 Cmpunordps(dst, dst, scratch);
199 Orps(scratch, dst);
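// The unordered-compare mask is all ones in NaN lanes; shifting it right by
// 10 and using andnps keeps only the sign, the 8 exponent bits, and the
// quiet bit, i.e. a canonical quiet NaN.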
200 Psrld(dst, dst, byte{10});
201 Andnps(dst, dst, scratch);
202 }
203
void SharedTurboAssembler::F32x4Max(XMMRegister dst, XMMRegister lhs,
    XMMRegister rhs, XMMRegister scratch) {
206 ASM_CODE_COMMENT(this);
207 // The maxps instruction doesn't propagate NaNs and +0's in its first
208 // operand. Perform maxps in both orders, merge the results, and adjust.
209 if (CpuFeatures::IsSupported(AVX)) {
210 CpuFeatureScope scope(this, AVX);
211 vmaxps(scratch, lhs, rhs);
212 vmaxps(dst, rhs, lhs);
213 } else if (dst == lhs || dst == rhs) {
214 XMMRegister src = dst == lhs ? rhs : lhs;
215 movaps(scratch, src);
216 maxps(scratch, dst);
217 maxps(dst, src);
218 } else {
219 movaps(scratch, lhs);
220 maxps(scratch, rhs);
221 movaps(dst, rhs);
222 maxps(dst, lhs);
223 }
224 // Find discrepancies.
225 Xorps(dst, scratch);
226 // Propagate NaNs, which may be non-canonical.
227 Orps(scratch, dst);
228 // Propagate sign discrepancy and (subtle) quiet NaNs.
229 Subps(scratch, scratch, dst);
230 // Canonicalize NaNs by clearing the payload. Sign is non-deterministic.
231 Cmpunordps(dst, dst, scratch);
232 Psrld(dst, dst, byte{10});
233 Andnps(dst, dst, scratch);
234 }
235
void SharedTurboAssembler::F64x2Min(XMMRegister dst, XMMRegister lhs,
    XMMRegister rhs, XMMRegister scratch) {
238 ASM_CODE_COMMENT(this);
239 if (CpuFeatures::IsSupported(AVX)) {
240 CpuFeatureScope scope(this, AVX);
241 // The minpd instruction doesn't propagate NaNs and +0's in its first
// operand. Perform minpd in both orders, merge the results, and adjust.
243 vminpd(scratch, lhs, rhs);
244 vminpd(dst, rhs, lhs);
// Propagate -0's and NaNs, which may be non-canonical.
246 vorpd(scratch, scratch, dst);
247 // Canonicalize NaNs by quieting and clearing the payload.
248 vcmpunordpd(dst, dst, scratch);
249 vorpd(scratch, scratch, dst);
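// The NaN mask in dst is all ones per NaN lane; shifting it right by 13 and
// and-not'ing against scratch keeps only the sign, the 11 exponent bits, and
// the quiet bit, producing a canonical quiet NaN.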
250 vpsrlq(dst, dst, byte{13});
251 vandnpd(dst, dst, scratch);
252 } else {
253 // Compare lhs with rhs, and rhs with lhs, and have the results in scratch
254 // and dst. If dst overlaps with lhs or rhs, we can save a move.
255 if (dst == lhs || dst == rhs) {
256 XMMRegister src = dst == lhs ? rhs : lhs;
257 movaps(scratch, src);
258 minpd(scratch, dst);
259 minpd(dst, src);
260 } else {
261 movaps(scratch, lhs);
262 movaps(dst, rhs);
263 minpd(scratch, rhs);
264 minpd(dst, lhs);
265 }
266 orpd(scratch, dst);
267 cmpunordpd(dst, scratch);
268 orpd(scratch, dst);
269 psrlq(dst, byte{13});
270 andnpd(dst, scratch);
271 }
272 }
273
void SharedTurboAssembler::F64x2Max(XMMRegister dst, XMMRegister lhs,
    XMMRegister rhs, XMMRegister scratch) {
276 ASM_CODE_COMMENT(this);
277 if (CpuFeatures::IsSupported(AVX)) {
278 CpuFeatureScope scope(this, AVX);
279 // The maxpd instruction doesn't propagate NaNs and +0's in its first
// operand. Perform maxpd in both orders, merge the results, and adjust.
281 vmaxpd(scratch, lhs, rhs);
282 vmaxpd(dst, rhs, lhs);
283 // Find discrepancies.
284 vxorpd(dst, dst, scratch);
285 // Propagate NaNs, which may be non-canonical.
286 vorpd(scratch, scratch, dst);
287 // Propagate sign discrepancy and (subtle) quiet NaNs.
288 vsubpd(scratch, scratch, dst);
289 // Canonicalize NaNs by clearing the payload. Sign is non-deterministic.
290 vcmpunordpd(dst, dst, scratch);
291 vpsrlq(dst, dst, byte{13});
292 vandnpd(dst, dst, scratch);
293 } else {
294 if (dst == lhs || dst == rhs) {
295 XMMRegister src = dst == lhs ? rhs : lhs;
296 movaps(scratch, src);
297 maxpd(scratch, dst);
298 maxpd(dst, src);
299 } else {
300 movaps(scratch, lhs);
301 movaps(dst, rhs);
302 maxpd(scratch, rhs);
303 maxpd(dst, lhs);
304 }
305 xorpd(dst, scratch);
306 orpd(scratch, dst);
307 subpd(scratch, dst);
308 cmpunordpd(dst, scratch);
309 psrlq(dst, byte{13});
310 andnpd(dst, scratch);
311 }
312 }
313
void SharedTurboAssembler::F32x4Splat(XMMRegister dst, DoubleRegister src) {
315 ASM_CODE_COMMENT(this);
316 if (CpuFeatures::IsSupported(AVX2)) {
317 CpuFeatureScope avx2_scope(this, AVX2);
318 vbroadcastss(dst, src);
319 } else if (CpuFeatures::IsSupported(AVX)) {
320 CpuFeatureScope avx_scope(this, AVX);
321 vshufps(dst, src, src, 0);
322 } else {
323 if (dst == src) {
324 // 1 byte shorter than pshufd.
325 shufps(dst, src, 0);
326 } else {
327 pshufd(dst, src, 0);
328 }
329 }
330 }
331
void SharedTurboAssembler::F32x4ExtractLane(FloatRegister dst, XMMRegister src,
    uint8_t lane) {
334 ASM_CODE_COMMENT(this);
335 DCHECK_LT(lane, 4);
336 // These instructions are shorter than insertps, but will leave junk in
337 // the top lanes of dst.
338 if (lane == 0) {
339 if (dst != src) {
340 Movaps(dst, src);
341 }
342 } else if (lane == 1) {
343 Movshdup(dst, src);
344 } else if (lane == 2 && dst == src) {
345 // Check dst == src to avoid false dependency on dst.
346 Movhlps(dst, src);
347 } else if (dst == src) {
348 Shufps(dst, src, src, lane);
349 } else {
350 Pshufd(dst, src, lane);
351 }
352 }
353
void SharedTurboAssembler::S128Store32Lane(Operand dst, XMMRegister src,
    uint8_t laneidx) {
356 ASM_CODE_COMMENT(this);
357 if (laneidx == 0) {
358 Movss(dst, src);
359 } else {
360 DCHECK_GE(3, laneidx);
361 Extractps(dst, src, laneidx);
362 }
363 }
364
365 template <typename Op>
void SharedTurboAssembler::I8x16SplatPreAvx2(XMMRegister dst, Op src,
    XMMRegister scratch) {
368 ASM_CODE_COMMENT(this);
369 DCHECK(!CpuFeatures::IsSupported(AVX2));
370 CpuFeatureScope ssse3_scope(this, SSSE3);
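// Move the value into lane 0 of dst; a pshufb with an all-zero shuffle mask
// then replicates byte 0 of dst into every byte lane.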
371 Movd(dst, src);
372 Xorps(scratch, scratch);
373 Pshufb(dst, scratch);
374 }
375
void SharedTurboAssembler::I8x16Splat(XMMRegister dst, Register src,
    XMMRegister scratch) {
378 ASM_CODE_COMMENT(this);
379 if (CpuFeatures::IsSupported(AVX2)) {
380 CpuFeatureScope avx2_scope(this, AVX2);
381 Movd(scratch, src);
382 vpbroadcastb(dst, scratch);
383 } else {
384 I8x16SplatPreAvx2(dst, src, scratch);
385 }
386 }
387
void SharedTurboAssembler::I8x16Splat(XMMRegister dst, Operand src,
    XMMRegister scratch) {
390 ASM_CODE_COMMENT(this);
391 DCHECK_OPERAND_IS_NOT_REG(src);
392 if (CpuFeatures::IsSupported(AVX2)) {
393 CpuFeatureScope avx2_scope(this, AVX2);
394 vpbroadcastb(dst, src);
395 } else {
396 I8x16SplatPreAvx2(dst, src, scratch);
397 }
398 }
399
void SharedTurboAssembler::I8x16Shl(XMMRegister dst, XMMRegister src1,
    uint8_t src2, Register tmp1,
    XMMRegister tmp2) {
403 ASM_CODE_COMMENT(this);
404 DCHECK_NE(dst, tmp2);
405 // Perform 16-bit shift, then mask away low bits.
406 if (!CpuFeatures::IsSupported(AVX) && (dst != src1)) {
407 movaps(dst, src1);
408 src1 = dst;
409 }
410
411 uint8_t shift = truncate_to_int3(src2);
412 Psllw(dst, src1, byte{shift});
413
414 uint8_t bmask = static_cast<uint8_t>(0xff << shift);
415 uint32_t mask = bmask << 24 | bmask << 16 | bmask << 8 | bmask;
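// e.g. shift == 3 gives bmask == 0xF8 and mask == 0xF8F8F8F8; the pand below
// clears the low bits of every byte, which psllw filled with bits shifted in
// from the lower byte of each word.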
416 Move(tmp1, mask);
417 Movd(tmp2, tmp1);
418 Pshufd(tmp2, tmp2, uint8_t{0});
419 Pand(dst, tmp2);
420 }
421
void SharedTurboAssembler::I8x16Shl(XMMRegister dst, XMMRegister src1,
    Register src2, Register tmp1,
    XMMRegister tmp2, XMMRegister tmp3) {
425 ASM_CODE_COMMENT(this);
426 DCHECK(!AreAliased(dst, tmp2, tmp3));
427 DCHECK(!AreAliased(src1, tmp2, tmp3));
428
429 // Take shift value modulo 8.
430 Move(tmp1, src2);
431 And(tmp1, Immediate(7));
432 Add(tmp1, Immediate(8));
433 // Create a mask to unset high bits.
434 Movd(tmp3, tmp1);
435 Pcmpeqd(tmp2, tmp2);
436 Psrlw(tmp2, tmp2, tmp3);
437 Packuswb(tmp2, tmp2);
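// tmp2 now holds (0xFF >> shift) in every byte: shifting the all-ones words
// right by shift + 8 leaves that value in each word, and packuswb replicates
// it into bytes.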
438 if (!CpuFeatures::IsSupported(AVX) && (dst != src1)) {
439 movaps(dst, src1);
440 src1 = dst;
441 }
442 // Mask off the unwanted bits before word-shifting.
443 Pand(dst, src1, tmp2);
444 Add(tmp1, Immediate(-8));
445 Movd(tmp3, tmp1);
446 Psllw(dst, dst, tmp3);
447 }
448
void SharedTurboAssembler::I8x16ShrS(XMMRegister dst, XMMRegister src1,
    uint8_t src2, XMMRegister tmp) {
451 ASM_CODE_COMMENT(this);
452 // Unpack bytes into words, do word (16-bit) shifts, and repack.
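// Unpacking places each source byte in the high byte of a word, so an
// arithmetic shift by the byte shift amount plus 8 both applies the shift and
// sign-extends the result into the low byte; packsswb then re-packs.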
453 DCHECK_NE(dst, tmp);
454 uint8_t shift = truncate_to_int3(src2) + 8;
455
456 Punpckhbw(tmp, src1);
457 Punpcklbw(dst, src1);
458 Psraw(tmp, shift);
459 Psraw(dst, shift);
460 Packsswb(dst, tmp);
461 }
462
void SharedTurboAssembler::I8x16ShrS(XMMRegister dst, XMMRegister src1,
    Register src2, Register tmp1,
    XMMRegister tmp2, XMMRegister tmp3) {
466 ASM_CODE_COMMENT(this);
467 DCHECK(!AreAliased(dst, tmp2, tmp3));
468 DCHECK_NE(src1, tmp2);
469
470 // Unpack the bytes into words, do arithmetic shifts, and repack.
471 Punpckhbw(tmp2, src1);
472 Punpcklbw(dst, src1);
473 // Prepare shift value
474 Move(tmp1, src2);
475 // Take shift value modulo 8.
476 And(tmp1, Immediate(7));
477 Add(tmp1, Immediate(8));
478 Movd(tmp3, tmp1);
479 Psraw(tmp2, tmp3);
480 Psraw(dst, tmp3);
481 Packsswb(dst, tmp2);
482 }
483
void SharedTurboAssembler::I8x16ShrU(XMMRegister dst, XMMRegister src1,
    uint8_t src2, Register tmp1,
    XMMRegister tmp2) {
487 ASM_CODE_COMMENT(this);
488 DCHECK_NE(dst, tmp2);
489 if (!CpuFeatures::IsSupported(AVX) && (dst != src1)) {
490 movaps(dst, src1);
491 src1 = dst;
492 }
493
494 // Perform 16-bit shift, then mask away high bits.
495 uint8_t shift = truncate_to_int3(src2);
496 Psrlw(dst, src1, shift);
497
498 uint8_t bmask = 0xff >> shift;
499 uint32_t mask = bmask << 24 | bmask << 16 | bmask << 8 | bmask;
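// e.g. shift == 3 gives bmask == 0x1F and mask == 0x1F1F1F1F; the pand below
// clears the high bits of every byte, which psrlw shifted in from the upper
// byte of each word.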
500 Move(tmp1, mask);
501 Movd(tmp2, tmp1);
502 Pshufd(tmp2, tmp2, byte{0});
503 Pand(dst, tmp2);
504 }
505
void SharedTurboAssembler::I8x16ShrU(XMMRegister dst, XMMRegister src1,
    Register src2, Register tmp1,
    XMMRegister tmp2, XMMRegister tmp3) {
509 ASM_CODE_COMMENT(this);
510 DCHECK(!AreAliased(dst, tmp2, tmp3));
511 DCHECK_NE(src1, tmp2);
512
513 // Unpack the bytes into words, do logical shifts, and repack.
514 Punpckhbw(tmp2, src1);
515 Punpcklbw(dst, src1);
516 // Prepare shift value.
517 Move(tmp1, src2);
518 // Take shift value modulo 8.
519 And(tmp1, Immediate(7));
520 Add(tmp1, Immediate(8));
521 Movd(tmp3, tmp1);
522 Psrlw(tmp2, tmp3);
523 Psrlw(dst, tmp3);
524 Packuswb(dst, tmp2);
525 }
526
527 template <typename Op>
void SharedTurboAssembler::I16x8SplatPreAvx2(XMMRegister dst, Op src) {
529 DCHECK(!CpuFeatures::IsSupported(AVX2));
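// Move the value into word 0, broadcast it across the low four words with
// pshuflw, then duplicate the low qword into the high qword.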
530 Movd(dst, src);
531 Pshuflw(dst, dst, uint8_t{0x0});
532 Punpcklqdq(dst, dst);
533 }
534
void SharedTurboAssembler::I16x8Splat(XMMRegister dst, Register src) {
536 ASM_CODE_COMMENT(this);
537 if (CpuFeatures::IsSupported(AVX2)) {
538 CpuFeatureScope avx2_scope(this, AVX2);
539 Movd(dst, src);
540 vpbroadcastw(dst, dst);
541 } else {
542 I16x8SplatPreAvx2(dst, src);
543 }
544 }
545
void SharedTurboAssembler::I16x8Splat(XMMRegister dst, Operand src) {
547 ASM_CODE_COMMENT(this);
548 DCHECK_OPERAND_IS_NOT_REG(src);
549 if (CpuFeatures::IsSupported(AVX2)) {
550 CpuFeatureScope avx2_scope(this, AVX2);
551 vpbroadcastw(dst, src);
552 } else {
553 I16x8SplatPreAvx2(dst, src);
554 }
555 }
556
void SharedTurboAssembler::I16x8ExtMulLow(XMMRegister dst, XMMRegister src1,
    XMMRegister src2, XMMRegister scratch,
    bool is_signed) {
560 ASM_CODE_COMMENT(this);
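// Widen the low eight bytes of each input to 16 bits (signed or unsigned);
// since the operands fit in 8 bits, the 16-bit pmullw result is the exact
// extended product.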
561 is_signed ? Pmovsxbw(scratch, src1) : Pmovzxbw(scratch, src1);
562 is_signed ? Pmovsxbw(dst, src2) : Pmovzxbw(dst, src2);
563 Pmullw(dst, scratch);
564 }
565
void SharedTurboAssembler::I16x8ExtMulHighS(XMMRegister dst, XMMRegister src1,
    XMMRegister src2,
    XMMRegister scratch) {
569 ASM_CODE_COMMENT(this);
570 if (CpuFeatures::IsSupported(AVX)) {
571 CpuFeatureScope avx_scope(this, AVX);
572 vpunpckhbw(scratch, src1, src1);
573 vpsraw(scratch, scratch, 8);
574 vpunpckhbw(dst, src2, src2);
575 vpsraw(dst, dst, 8);
576 vpmullw(dst, dst, scratch);
577 } else {
578 if (dst != src1) {
579 movaps(dst, src1);
580 }
581 movaps(scratch, src2);
582 punpckhbw(dst, dst);
583 psraw(dst, 8);
584 punpckhbw(scratch, scratch);
585 psraw(scratch, 8);
586 pmullw(dst, scratch);
587 }
588 }
589
void SharedTurboAssembler::I16x8ExtMulHighU(XMMRegister dst, XMMRegister src1,
    XMMRegister src2,
    XMMRegister scratch) {
593 ASM_CODE_COMMENT(this);
594 // The logic here is slightly complicated to handle all the cases of register
595 // aliasing. This allows flexibility for callers in TurboFan and Liftoff.
596 if (CpuFeatures::IsSupported(AVX)) {
597 CpuFeatureScope avx_scope(this, AVX);
598 if (src1 == src2) {
599 vpxor(scratch, scratch, scratch);
600 vpunpckhbw(dst, src1, scratch);
601 vpmullw(dst, dst, dst);
602 } else {
603 if (dst == src2) {
604 // We overwrite dst, then use src2, so swap src1 and src2.
605 std::swap(src1, src2);
606 }
607 vpxor(scratch, scratch, scratch);
608 vpunpckhbw(dst, src1, scratch);
609 vpunpckhbw(scratch, src2, scratch);
610 vpmullw(dst, dst, scratch);
611 }
612 } else {
613 if (src1 == src2) {
614 xorps(scratch, scratch);
615 if (dst != src1) {
616 movaps(dst, src1);
617 }
618 punpckhbw(dst, scratch);
pmullw(dst, dst);
620 } else {
621 // When dst == src1, nothing special needs to be done.
622 // When dst == src2, swap src1 and src2, since we overwrite dst.
623 // When dst is unique, copy src1 to dst first.
624 if (dst == src2) {
625 std::swap(src1, src2);
626 // Now, dst == src1.
627 } else if (dst != src1) {
628 // dst != src1 && dst != src2.
629 movaps(dst, src1);
630 }
631 xorps(scratch, scratch);
632 punpckhbw(dst, scratch);
633 punpckhbw(scratch, src2);
634 psrlw(scratch, 8);
635 pmullw(dst, scratch);
636 }
637 }
638 }
639
void SharedTurboAssembler::I16x8SConvertI8x16High(XMMRegister dst,
    XMMRegister src) {
642 ASM_CODE_COMMENT(this);
643 if (CpuFeatures::IsSupported(AVX)) {
644 CpuFeatureScope avx_scope(this, AVX);
645 // src = |a|b|c|d|e|f|g|h|i|j|k|l|m|n|o|p| (high)
646 // dst = |i|i|j|j|k|k|l|l|m|m|n|n|o|o|p|p|
647 vpunpckhbw(dst, src, src);
648 vpsraw(dst, dst, 8);
649 } else {
650 CpuFeatureScope sse_scope(this, SSE4_1);
651 if (dst == src) {
// 2 bytes shorter than pshufd, but has a dependency on dst.
653 movhlps(dst, src);
654 pmovsxbw(dst, dst);
655 } else {
656 // No dependency on dst.
657 pshufd(dst, src, 0xEE);
658 pmovsxbw(dst, dst);
659 }
660 }
661 }
662
void SharedTurboAssembler::I16x8UConvertI8x16High(XMMRegister dst,
    XMMRegister src,
    XMMRegister scratch) {
666 ASM_CODE_COMMENT(this);
667 if (CpuFeatures::IsSupported(AVX)) {
668 CpuFeatureScope avx_scope(this, AVX);
669 // tmp = |0|0|0|0|0|0|0|0 | 0|0|0|0|0|0|0|0|
670 // src = |a|b|c|d|e|f|g|h | i|j|k|l|m|n|o|p|
671 // dst = |0|a|0|b|0|c|0|d | 0|e|0|f|0|g|0|h|
672 XMMRegister tmp = dst == src ? scratch : dst;
673 vpxor(tmp, tmp, tmp);
674 vpunpckhbw(dst, src, tmp);
675 } else {
676 CpuFeatureScope sse_scope(this, SSE4_1);
677 if (dst == src) {
678 // xorps can be executed on more ports than pshufd.
679 xorps(scratch, scratch);
680 punpckhbw(dst, scratch);
681 } else {
682 // No dependency on dst.
683 pshufd(dst, src, 0xEE);
684 pmovzxbw(dst, dst);
685 }
686 }
687 }
688
void SharedTurboAssembler::I16x8Q15MulRSatS(XMMRegister dst, XMMRegister src1,
    XMMRegister src2,
    XMMRegister scratch) {
692 ASM_CODE_COMMENT(this);
693 // k = i16x8.splat(0x8000)
694 Pcmpeqd(scratch, scratch);
695 Psllw(scratch, scratch, byte{15});
696
697 if (!CpuFeatures::IsSupported(AVX) && (dst != src1)) {
698 movaps(dst, src1);
699 src1 = dst;
700 }
701
702 Pmulhrsw(dst, src1, src2);
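// pmulhrsw yields 0x8000 only in the overflow case 0x8000 * 0x8000, which
// must saturate to 0x7FFF. Compare against the 0x8000 splat in scratch and
// xor the resulting all-ones mask into dst to flip exactly those lanes.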
703 Pcmpeqw(scratch, dst);
704 Pxor(dst, scratch);
705 }
706
void SharedTurboAssembler::I32x4ExtAddPairwiseI16x8U(XMMRegister dst,
    XMMRegister src,
    XMMRegister tmp) {
710 ASM_CODE_COMMENT(this);
711 if (CpuFeatures::IsSupported(AVX)) {
712 CpuFeatureScope avx_scope(this, AVX);
713 // src = |a|b|c|d|e|f|g|h| (low)
714 // scratch = |0|a|0|c|0|e|0|g|
715 vpsrld(tmp, src, 16);
716 // dst = |0|b|0|d|0|f|0|h|
717 vpblendw(dst, src, tmp, 0xAA);
718 // dst = |a+b|c+d|e+f|g+h|
719 vpaddd(dst, tmp, dst);
720 } else if (CpuFeatures::IsSupported(SSE4_1)) {
721 CpuFeatureScope sse_scope(this, SSE4_1);
722 // There is a potentially better lowering if we get rip-relative
723 // constants, see https://github.com/WebAssembly/simd/pull/380.
724 movaps(tmp, src);
725 psrld(tmp, 16);
726 if (dst != src) {
727 movaps(dst, src);
728 }
729 pblendw(dst, tmp, 0xAA);
730 paddd(dst, tmp);
731 } else {
732 // src = |a|b|c|d|e|f|g|h|
733 // tmp = i32x4.splat(0x0000FFFF)
734 pcmpeqd(tmp, tmp);
735 psrld(tmp, byte{16});
736 // tmp =|0|b|0|d|0|f|0|h|
737 andps(tmp, src);
738 // dst = |0|a|0|c|0|e|0|g|
739 if (dst != src) {
740 movaps(dst, src);
741 }
742 psrld(dst, byte{16});
743 // dst = |a+b|c+d|e+f|g+h|
744 paddd(dst, tmp);
745 }
746 }
747
748 // 1. Multiply low word into scratch.
749 // 2. Multiply high word (can be signed or unsigned) into dst.
750 // 3. Unpack and interleave scratch and dst into dst.
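// pmullw and pmulhw/pmulhuw produce the low and high 16 bits of each 16 x 16
// product; interleaving them with punpcklwd/punpckhwd reassembles the full
// 32-bit products.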
void SharedTurboAssembler::I32x4ExtMul(XMMRegister dst, XMMRegister src1,
    XMMRegister src2, XMMRegister scratch,
    bool low, bool is_signed) {
754 ASM_CODE_COMMENT(this);
755 if (CpuFeatures::IsSupported(AVX)) {
756 CpuFeatureScope avx_scope(this, AVX);
757 vpmullw(scratch, src1, src2);
758 is_signed ? vpmulhw(dst, src1, src2) : vpmulhuw(dst, src1, src2);
759 low ? vpunpcklwd(dst, scratch, dst) : vpunpckhwd(dst, scratch, dst);
760 } else {
761 DCHECK_EQ(dst, src1);
762 movaps(scratch, src1);
763 pmullw(dst, src2);
764 is_signed ? pmulhw(scratch, src2) : pmulhuw(scratch, src2);
765 low ? punpcklwd(dst, scratch) : punpckhwd(dst, scratch);
766 }
767 }
768
void SharedTurboAssembler::I32x4SConvertI16x8High(XMMRegister dst,
    XMMRegister src) {
771 ASM_CODE_COMMENT(this);
772 if (CpuFeatures::IsSupported(AVX)) {
773 CpuFeatureScope avx_scope(this, AVX);
774 // src = |a|b|c|d|e|f|g|h| (high)
775 // dst = |e|e|f|f|g|g|h|h|
776 vpunpckhwd(dst, src, src);
777 vpsrad(dst, dst, 16);
778 } else {
779 CpuFeatureScope sse_scope(this, SSE4_1);
780 if (dst == src) {
// 2 bytes shorter than pshufd, but has a dependency on dst.
782 movhlps(dst, src);
783 pmovsxwd(dst, dst);
784 } else {
785 // No dependency on dst.
786 pshufd(dst, src, 0xEE);
787 pmovsxwd(dst, dst);
788 }
789 }
790 }
791
void SharedTurboAssembler::I32x4UConvertI16x8High(XMMRegister dst,
    XMMRegister src,
    XMMRegister scratch) {
795 ASM_CODE_COMMENT(this);
796 if (CpuFeatures::IsSupported(AVX)) {
797 CpuFeatureScope avx_scope(this, AVX);
798 // scratch = |0|0|0|0|0|0|0|0|
799 // src = |a|b|c|d|e|f|g|h|
800 // dst = |0|a|0|b|0|c|0|d|
801 XMMRegister tmp = dst == src ? scratch : dst;
802 vpxor(tmp, tmp, tmp);
803 vpunpckhwd(dst, src, tmp);
804 } else {
805 if (dst == src) {
806 // xorps can be executed on more ports than pshufd.
807 xorps(scratch, scratch);
808 punpckhwd(dst, scratch);
809 } else {
810 CpuFeatureScope sse_scope(this, SSE4_1);
811 // No dependency on dst.
812 pshufd(dst, src, 0xEE);
813 pmovzxwd(dst, dst);
814 }
815 }
816 }
817
void SharedTurboAssembler::I64x2Neg(XMMRegister dst, XMMRegister src,
    XMMRegister scratch) {
820 ASM_CODE_COMMENT(this);
821 if (CpuFeatures::IsSupported(AVX)) {
822 CpuFeatureScope scope(this, AVX);
823 vpxor(scratch, scratch, scratch);
824 vpsubq(dst, scratch, src);
825 } else {
826 if (dst == src) {
827 movaps(scratch, src);
828 std::swap(src, scratch);
829 }
830 pxor(dst, dst);
831 psubq(dst, src);
832 }
833 }
834
void SharedTurboAssembler::I64x2Abs(XMMRegister dst, XMMRegister src,
    XMMRegister scratch) {
837 ASM_CODE_COMMENT(this);
838 if (CpuFeatures::IsSupported(AVX)) {
839 CpuFeatureScope avx_scope(this, AVX);
840 XMMRegister tmp = dst == src ? scratch : dst;
841 vpxor(tmp, tmp, tmp);
842 vpsubq(tmp, tmp, src);
843 vblendvpd(dst, src, tmp, src);
844 } else {
845 CpuFeatureScope sse_scope(this, SSE3);
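// movshdup copies the high dword of each qword into both halves, and psrad
// by 31 turns that into an all-ones/zero sign mask m; (src ^ m) - m then
// negates exactly the negative lanes.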
846 movshdup(scratch, src);
847 if (dst != src) {
848 movaps(dst, src);
849 }
850 psrad(scratch, 31);
851 xorps(dst, scratch);
852 psubq(dst, scratch);
853 }
854 }
855
void SharedTurboAssembler::I64x2GtS(XMMRegister dst, XMMRegister src0,
    XMMRegister src1, XMMRegister scratch) {
858 ASM_CODE_COMMENT(this);
859 if (CpuFeatures::IsSupported(AVX)) {
860 CpuFeatureScope avx_scope(this, AVX);
861 vpcmpgtq(dst, src0, src1);
862 } else if (CpuFeatures::IsSupported(SSE4_2)) {
863 CpuFeatureScope sse_scope(this, SSE4_2);
864 if (dst == src0) {
865 pcmpgtq(dst, src1);
866 } else if (dst == src1) {
867 movaps(scratch, src0);
868 pcmpgtq(scratch, src1);
869 movaps(dst, scratch);
870 } else {
871 movaps(dst, src0);
872 pcmpgtq(dst, src1);
873 }
874 } else {
875 CpuFeatureScope sse_scope(this, SSE3);
876 DCHECK_NE(dst, src0);
877 DCHECK_NE(dst, src1);
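// Without pcmpgtq, build the 64-bit signed compare from 32-bit ops:
// src0 > src1 iff the high dwords compare greater (pcmpgtd), or the high
// dwords are equal and the low dwords borrow in src1 - src0 (psubq), which
// makes the high dword of the difference all ones. movshdup then copies each
// qword's high-dword result into both halves.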
878 movaps(dst, src1);
879 movaps(scratch, src0);
880 psubq(dst, src0);
881 pcmpeqd(scratch, src1);
882 andps(dst, scratch);
883 movaps(scratch, src0);
884 pcmpgtd(scratch, src1);
885 orps(dst, scratch);
886 movshdup(dst, dst);
887 }
888 }
889
void SharedTurboAssembler::I64x2GeS(XMMRegister dst, XMMRegister src0,
    XMMRegister src1, XMMRegister scratch) {
892 ASM_CODE_COMMENT(this);
893 if (CpuFeatures::IsSupported(AVX)) {
894 CpuFeatureScope avx_scope(this, AVX);
895 vpcmpgtq(dst, src1, src0);
896 vpcmpeqd(scratch, scratch, scratch);
897 vpxor(dst, dst, scratch);
898 } else if (CpuFeatures::IsSupported(SSE4_2)) {
899 CpuFeatureScope sse_scope(this, SSE4_2);
900 DCHECK_NE(dst, src0);
901 if (dst != src1) {
902 movaps(dst, src1);
903 }
904 pcmpgtq(dst, src0);
905 pcmpeqd(scratch, scratch);
906 xorps(dst, scratch);
907 } else {
908 CpuFeatureScope sse_scope(this, SSE3);
909 DCHECK_NE(dst, src0);
910 DCHECK_NE(dst, src1);
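// Compute src1 > src0 as in I64x2GtS above, then invert the result, since
// src0 >= src1 == !(src1 > src0).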
911 movaps(dst, src0);
912 movaps(scratch, src1);
913 psubq(dst, src1);
914 pcmpeqd(scratch, src0);
915 andps(dst, scratch);
916 movaps(scratch, src1);
917 pcmpgtd(scratch, src0);
918 orps(dst, scratch);
919 movshdup(dst, dst);
920 pcmpeqd(scratch, scratch);
921 xorps(dst, scratch);
922 }
923 }
924
void SharedTurboAssembler::I64x2ShrS(XMMRegister dst, XMMRegister src,
    uint8_t shift, XMMRegister xmm_tmp) {
927 ASM_CODE_COMMENT(this);
928 DCHECK_GT(64, shift);
929 DCHECK_NE(xmm_tmp, dst);
930 DCHECK_NE(xmm_tmp, src);
// Use logical right shift to emulate arithmetic right shifts:
// Given:
//   signed >> c
//   == (signed + 2^63 - 2^63) >> c
//   == ((signed + 2^63) >> c) - (2^63 >> c)
//                                ^^^^^^^^^
//                                 xmm_tmp
938 // signed + 2^63 is an unsigned number, so we can use logical right shifts.
939
940 // xmm_tmp = wasm_i64x2_const(0x80000000'00000000).
941 Pcmpeqd(xmm_tmp, xmm_tmp);
942 Psllq(xmm_tmp, byte{63});
943
944 if (!CpuFeatures::IsSupported(AVX) && (dst != src)) {
945 movaps(dst, src);
946 src = dst;
947 }
948 // Add a bias of 2^63 to convert signed to unsigned.
// Since only the highest bit changes, use pxor instead of paddq.
950 Pxor(dst, src, xmm_tmp);
951 // Logically shift both value and bias.
952 Psrlq(dst, shift);
953 Psrlq(xmm_tmp, shift);
954 // Subtract shifted bias to convert back to signed value.
955 Psubq(dst, xmm_tmp);
956 }
957
void SharedTurboAssembler::I64x2ShrS(XMMRegister dst, XMMRegister src,
    Register shift, XMMRegister xmm_tmp,
    XMMRegister xmm_shift,
    Register tmp_shift) {
962 ASM_CODE_COMMENT(this);
963 DCHECK_NE(xmm_tmp, dst);
964 DCHECK_NE(xmm_tmp, src);
965 DCHECK_NE(xmm_shift, dst);
966 DCHECK_NE(xmm_shift, src);
967 // tmp_shift can alias shift since we don't use shift after masking it.
968
969 // See I64x2ShrS with constant shift for explanation of this algorithm.
970 Pcmpeqd(xmm_tmp, xmm_tmp);
971 Psllq(xmm_tmp, byte{63});
972
973 // Shift modulo 64.
974 Move(tmp_shift, shift);
975 And(tmp_shift, Immediate(0x3F));
976 Movd(xmm_shift, tmp_shift);
977
978 if (!CpuFeatures::IsSupported(AVX) && (dst != src)) {
979 movaps(dst, src);
980 src = dst;
981 }
982 Pxor(dst, src, xmm_tmp);
983 Psrlq(dst, xmm_shift);
984 Psrlq(xmm_tmp, xmm_shift);
985 Psubq(dst, xmm_tmp);
986 }
987
void SharedTurboAssembler::I64x2Mul(XMMRegister dst, XMMRegister lhs,
    XMMRegister rhs, XMMRegister tmp1,
    XMMRegister tmp2) {
991 ASM_CODE_COMMENT(this);
992 DCHECK(!AreAliased(dst, tmp1, tmp2));
993 DCHECK(!AreAliased(lhs, tmp1, tmp2));
994 DCHECK(!AreAliased(rhs, tmp1, tmp2));
995
996 if (CpuFeatures::IsSupported(AVX)) {
997 CpuFeatureScope avx_scope(this, AVX);
998 // 1. Multiply high dword of each qword of left with right.
999 vpsrlq(tmp1, lhs, byte{32});
1000 vpmuludq(tmp1, tmp1, rhs);
1001 // 2. Multiply high dword of each qword of right with left.
1002 vpsrlq(tmp2, rhs, byte{32});
1003 vpmuludq(tmp2, tmp2, lhs);
1004 // 3. Add 1 and 2, then shift left by 32 (this is the high dword of result).
1005 vpaddq(tmp2, tmp2, tmp1);
1006 vpsllq(tmp2, tmp2, byte{32});
1007 // 4. Multiply low dwords (this is the low dword of result).
1008 vpmuludq(dst, lhs, rhs);
1009 // 5. Add 3 and 4.
1010 vpaddq(dst, dst, tmp2);
1011 } else {
1012 // Same algorithm as AVX version, but with moves to not overwrite inputs.
1013 movaps(tmp1, lhs);
1014 movaps(tmp2, rhs);
1015 psrlq(tmp1, byte{32});
1016 pmuludq(tmp1, rhs);
1017 psrlq(tmp2, byte{32});
1018 pmuludq(tmp2, lhs);
1019 paddq(tmp2, tmp1);
1020 psllq(tmp2, byte{32});
1021 if (dst == rhs) {
1022 // pmuludq is commutative
1023 pmuludq(dst, lhs);
1024 } else {
1025 if (dst != lhs) {
1026 movaps(dst, lhs);
1027 }
1028 pmuludq(dst, rhs);
1029 }
1030 paddq(dst, tmp2);
1031 }
1032 }
1033
// 1. Unpack src1 with itself so the dwords to be multiplied land in the
//    even-numbered (low) dword of each qword of scratch.
// 2. Do the same for src2 into dst.
// 3. Multiply scratch with dst; pmuldq/pmuludq read only the even dwords.
// For non-AVX, use non-destructive pshufd instead of punpckldq/punpckhdq.
void SharedTurboAssembler::I64x2ExtMul(XMMRegister dst, XMMRegister src1,
    XMMRegister src2, XMMRegister scratch,
    bool low, bool is_signed) {
1041 ASM_CODE_COMMENT(this);
1042 if (CpuFeatures::IsSupported(AVX)) {
1043 CpuFeatureScope avx_scope(this, AVX);
1044 if (low) {
1045 vpunpckldq(scratch, src1, src1);
1046 vpunpckldq(dst, src2, src2);
1047 } else {
1048 vpunpckhdq(scratch, src1, src1);
1049 vpunpckhdq(dst, src2, src2);
1050 }
1051 if (is_signed) {
1052 vpmuldq(dst, scratch, dst);
1053 } else {
1054 vpmuludq(dst, scratch, dst);
1055 }
1056 } else {
1057 uint8_t mask = low ? 0x50 : 0xFA;
1058 pshufd(scratch, src1, mask);
1059 pshufd(dst, src2, mask);
1060 if (is_signed) {
1061 CpuFeatureScope sse4_scope(this, SSE4_1);
1062 pmuldq(dst, scratch);
1063 } else {
1064 pmuludq(dst, scratch);
1065 }
1066 }
1067 }
1068
void SharedTurboAssembler::I64x2SConvertI32x4High(XMMRegister dst,
    XMMRegister src) {
1071 ASM_CODE_COMMENT(this);
1072 if (CpuFeatures::IsSupported(AVX)) {
1073 CpuFeatureScope avx_scope(this, AVX);
1074 vpunpckhqdq(dst, src, src);
1075 vpmovsxdq(dst, dst);
1076 } else {
1077 CpuFeatureScope sse_scope(this, SSE4_1);
1078 if (dst == src) {
1079 movhlps(dst, src);
1080 } else {
1081 pshufd(dst, src, 0xEE);
1082 }
1083 pmovsxdq(dst, dst);
1084 }
1085 }
1086
void SharedTurboAssembler::I64x2UConvertI32x4High(XMMRegister dst,
    XMMRegister src,
    XMMRegister scratch) {
1090 ASM_CODE_COMMENT(this);
1091 if (CpuFeatures::IsSupported(AVX)) {
1092 CpuFeatureScope avx_scope(this, AVX);
1093 vpxor(scratch, scratch, scratch);
1094 vpunpckhdq(dst, src, scratch);
1095 } else {
1096 if (dst == src) {
1097 // xorps can be executed on more ports than pshufd.
1098 xorps(scratch, scratch);
1099 punpckhdq(dst, scratch);
1100 } else {
1101 CpuFeatureScope sse_scope(this, SSE4_1);
1102 // No dependency on dst.
1103 pshufd(dst, src, 0xEE);
1104 pmovzxdq(dst, dst);
1105 }
1106 }
1107 }
1108
void SharedTurboAssembler::S128Not(XMMRegister dst, XMMRegister src,
    XMMRegister scratch) {
1111 ASM_CODE_COMMENT(this);
1112 if (dst == src) {
1113 Pcmpeqd(scratch, scratch);
1114 Pxor(dst, scratch);
1115 } else {
1116 Pcmpeqd(dst, dst);
1117 Pxor(dst, src);
1118 }
1119 }
1120
void SharedTurboAssembler::S128Select(XMMRegister dst, XMMRegister mask,
    XMMRegister src1, XMMRegister src2,
    XMMRegister scratch) {
1124 ASM_CODE_COMMENT(this);
1125 // v128.select = v128.or(v128.and(v1, c), v128.andnot(v2, c)).
// pandn(x, y) = ~x & y, so we have to flip the mask and input.
1127 if (CpuFeatures::IsSupported(AVX)) {
1128 CpuFeatureScope avx_scope(this, AVX);
1129 vpandn(scratch, mask, src2);
1130 vpand(dst, src1, mask);
1131 vpor(dst, dst, scratch);
1132 } else {
1133 DCHECK_EQ(dst, mask);
1134 // Use float ops as they are 1 byte shorter than int ops.
1135 movaps(scratch, mask);
1136 andnps(scratch, src2);
1137 andps(dst, src1);
1138 orps(dst, scratch);
1139 }
1140 }
1141
void SharedTurboAssembler::S128Load8Splat(XMMRegister dst, Operand src,
    XMMRegister scratch) {
1144 ASM_CODE_COMMENT(this);
// The trap handler uses the current pc to create a landing pad, so that it
// can determine whether a trap occurred in Wasm code due to an OOB load.
// Make sure the first instruction in each case below is the one that loads.
1148 if (CpuFeatures::IsSupported(AVX2)) {
1149 CpuFeatureScope avx2_scope(this, AVX2);
1150 vpbroadcastb(dst, src);
1151 } else if (CpuFeatures::IsSupported(AVX)) {
1152 CpuFeatureScope avx_scope(this, AVX);
1153 // Avoid dependency on previous value of dst.
1154 vpinsrb(dst, scratch, src, uint8_t{0});
1155 vpxor(scratch, scratch, scratch);
1156 vpshufb(dst, dst, scratch);
1157 } else {
1158 CpuFeatureScope ssse4_scope(this, SSE4_1);
1159 pinsrb(dst, src, uint8_t{0});
1160 xorps(scratch, scratch);
1161 pshufb(dst, scratch);
1162 }
1163 }
1164
void SharedTurboAssembler::S128Load16Splat(XMMRegister dst, Operand src,
    XMMRegister scratch) {
1167 ASM_CODE_COMMENT(this);
// The trap handler uses the current pc to create a landing pad, so that it
// can determine whether a trap occurred in Wasm code due to an OOB load.
// Make sure the first instruction in each case below is the one that loads.
1171 if (CpuFeatures::IsSupported(AVX2)) {
1172 CpuFeatureScope avx2_scope(this, AVX2);
1173 vpbroadcastw(dst, src);
1174 } else if (CpuFeatures::IsSupported(AVX)) {
1175 CpuFeatureScope avx_scope(this, AVX);
1176 // Avoid dependency on previous value of dst.
1177 vpinsrw(dst, scratch, src, uint8_t{0});
1178 vpshuflw(dst, dst, uint8_t{0});
1179 vpunpcklqdq(dst, dst, dst);
1180 } else {
1181 pinsrw(dst, src, uint8_t{0});
1182 pshuflw(dst, dst, uint8_t{0});
1183 movlhps(dst, dst);
1184 }
1185 }
1186
void SharedTurboAssembler::S128Load32Splat(XMMRegister dst, Operand src) {
1188 ASM_CODE_COMMENT(this);
// The trap handler uses the current pc to create a landing pad, so that it
// can determine whether a trap occurred in Wasm code due to an OOB load.
// Make sure the first instruction in each case below is the one that loads.
1192 if (CpuFeatures::IsSupported(AVX)) {
1193 CpuFeatureScope avx_scope(this, AVX);
1194 vbroadcastss(dst, src);
1195 } else {
1196 movss(dst, src);
1197 shufps(dst, dst, byte{0});
1198 }
1199 }
1200
void SharedTurboAssembler::S128Store64Lane(Operand dst, XMMRegister src,
    uint8_t laneidx) {
1203 ASM_CODE_COMMENT(this);
1204 if (laneidx == 0) {
1205 Movlps(dst, src);
1206 } else {
1207 DCHECK_EQ(1, laneidx);
1208 Movhps(dst, src);
1209 }
1210 }
1211
1212 // Helper macro to define qfma macro-assembler. This takes care of every
1213 // possible case of register aliasing to minimize the number of instructions.
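// With FMA3, vfmadd231(a, b, c) computes a = b * c + a, vfmadd132(a, b, c)
// computes a = a * c + b, and vfmadd213(a, b, c) computes a = b * a + c; each
// branch below picks the form whose accumulator is dst, so dst ends up as
// src1 + src2 * src3 without an extra move.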
1214 #define QFMA(ps_or_pd) \
1215 if (CpuFeatures::IsSupported(FMA3)) { \
1216 CpuFeatureScope fma3_scope(this, FMA3); \
1217 if (dst == src1) { \
1218 vfmadd231##ps_or_pd(dst, src2, src3); \
1219 } else if (dst == src2) { \
1220 vfmadd132##ps_or_pd(dst, src1, src3); \
1221 } else if (dst == src3) { \
1222 vfmadd213##ps_or_pd(dst, src2, src1); \
1223 } else { \
1224 CpuFeatureScope avx_scope(this, AVX); \
1225 vmovups(dst, src1); \
1226 vfmadd231##ps_or_pd(dst, src2, src3); \
1227 } \
1228 } else if (CpuFeatures::IsSupported(AVX)) { \
1229 CpuFeatureScope avx_scope(this, AVX); \
1230 vmul##ps_or_pd(tmp, src2, src3); \
1231 vadd##ps_or_pd(dst, src1, tmp); \
1232 } else { \
1233 if (dst == src1) { \
1234 movaps(tmp, src2); \
1235 mul##ps_or_pd(tmp, src3); \
1236 add##ps_or_pd(dst, tmp); \
1237 } else if (dst == src2) { \
1238 DCHECK_NE(src2, src1); \
1239 mul##ps_or_pd(src2, src3); \
1240 add##ps_or_pd(src2, src1); \
1241 } else if (dst == src3) { \
1242 DCHECK_NE(src3, src1); \
1243 mul##ps_or_pd(src3, src2); \
1244 add##ps_or_pd(src3, src1); \
1245 } else { \
1246 movaps(dst, src2); \
1247 mul##ps_or_pd(dst, src3); \
1248 add##ps_or_pd(dst, src1); \
1249 } \
1250 }
1251
1252 // Helper macro to define qfms macro-assembler. This takes care of every
1253 // possible case of register aliasing to minimize the number of instructions.
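// The vfnmadd* forms negate the product, so the same operand-ordering trick
// as in QFMA yields dst = src1 - src2 * src3.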
1254 #define QFMS(ps_or_pd) \
1255 if (CpuFeatures::IsSupported(FMA3)) { \
1256 CpuFeatureScope fma3_scope(this, FMA3); \
1257 if (dst == src1) { \
1258 vfnmadd231##ps_or_pd(dst, src2, src3); \
1259 } else if (dst == src2) { \
1260 vfnmadd132##ps_or_pd(dst, src1, src3); \
1261 } else if (dst == src3) { \
1262 vfnmadd213##ps_or_pd(dst, src2, src1); \
1263 } else { \
1264 CpuFeatureScope avx_scope(this, AVX); \
1265 vmovups(dst, src1); \
1266 vfnmadd231##ps_or_pd(dst, src2, src3); \
1267 } \
1268 } else if (CpuFeatures::IsSupported(AVX)) { \
1269 CpuFeatureScope avx_scope(this, AVX); \
1270 vmul##ps_or_pd(tmp, src2, src3); \
1271 vsub##ps_or_pd(dst, src1, tmp); \
1272 } else { \
1273 movaps(tmp, src2); \
1274 mul##ps_or_pd(tmp, src3); \
1275 if (dst != src1) { \
1276 movaps(dst, src1); \
1277 } \
1278 sub##ps_or_pd(dst, tmp); \
1279 }
1280
void SharedTurboAssembler::F32x4Qfma(XMMRegister dst, XMMRegister src1,
    XMMRegister src2, XMMRegister src3,
    XMMRegister tmp) {
1284 QFMA(ps)
1285 }
1286
void SharedTurboAssembler::F32x4Qfms(XMMRegister dst, XMMRegister src1,
    XMMRegister src2, XMMRegister src3,
    XMMRegister tmp) {
1290 QFMS(ps)
1291 }
1292
void SharedTurboAssembler::F64x2Qfma(XMMRegister dst, XMMRegister src1,
    XMMRegister src2, XMMRegister src3,
    XMMRegister tmp) {
1296 QFMA(pd);
1297 }
1298
void SharedTurboAssembler::F64x2Qfms(XMMRegister dst, XMMRegister src1,
    XMMRegister src2, XMMRegister src3,
    XMMRegister tmp) {
1302 QFMS(pd);
1303 }
1304
#undef QFMA
#undef QFMS
1306
1307 } // namespace internal
1308 } // namespace v8
1309
1310 #undef DCHECK_OPERAND_IS_NOT_REG
1311