1 // Copyright 2021 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #include "src/codegen/shared-ia32-x64/macro-assembler-shared-ia32-x64.h"
6 
7 #include "src/codegen/assembler.h"
8 #include "src/codegen/cpu-features.h"
9 #include "src/codegen/register.h"
10 
11 #if V8_TARGET_ARCH_IA32
12 #include "src/codegen/ia32/register-ia32.h"
13 #elif V8_TARGET_ARCH_X64
14 #include "src/codegen/x64/register-x64.h"
15 #else
16 #error Unsupported target architecture.
17 #endif
18 
19 // Operand on IA32 can be a wrapper for a single register, in which case
20 // callers should use the I8x16Splat overload that takes |src| as a Register.
21 #if V8_TARGET_ARCH_IA32
22 #define DCHECK_OPERAND_IS_NOT_REG(op) DCHECK(!op.is_reg_only());
23 #else
24 #define DCHECK_OPERAND_IS_NOT_REG(op)
25 #endif
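// Illustrative call sites for the guard above (a sketch with assumed uses, not
// code from this file): on IA32,
//   I8x16Splat(dst, Operand(eax), scratch);  // would trip the DCHECK
//   I8x16Splat(dst, eax, scratch);           // preferred Register overload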
26 
27 namespace v8 {
28 namespace internal {
29 
30 void SharedTurboAssembler::Move(Register dst, uint32_t src) {
31   // Helper to paper over the different assembler function names.
32 #if V8_TARGET_ARCH_IA32
33   mov(dst, Immediate(src));
34 #elif V8_TARGET_ARCH_X64
35   movl(dst, Immediate(src));
36 #else
37 #error Unsupported target architecture.
38 #endif
39 }
40 
41 void SharedTurboAssembler::Move(Register dst, Register src) {
42   // Helper to paper over the different assembler function names.
43   if (dst != src) {
44 #if V8_TARGET_ARCH_IA32
45     mov(dst, src);
46 #elif V8_TARGET_ARCH_X64
47     movq(dst, src);
48 #else
49 #error Unsupported target architecture.
50 #endif
51   }
52 }
53 
54 void SharedTurboAssembler::Add(Register dst, Immediate src) {
55   // Helper to paper over the different assembler function names.
56 #if V8_TARGET_ARCH_IA32
57   add(dst, src);
58 #elif V8_TARGET_ARCH_X64
59   addq(dst, src);
60 #else
61 #error Unsupported target architecture.
62 #endif
63 }
64 
65 void SharedTurboAssembler::And(Register dst, Immediate src) {
66   // Helper to paper over the different assembler function names.
67 #if V8_TARGET_ARCH_IA32
68   and_(dst, src);
69 #elif V8_TARGET_ARCH_X64
70   if (is_uint32(src.value())) {
71     andl(dst, src);
72   } else {
73     andq(dst, src);
74   }
75 #else
76 #error Unsupported target architecture.
77 #endif
78 }
79 
80 void SharedTurboAssembler::Movhps(XMMRegister dst, XMMRegister src1,
81                                   Operand src2) {
82   if (CpuFeatures::IsSupported(AVX)) {
83     CpuFeatureScope scope(this, AVX);
84     vmovhps(dst, src1, src2);
85   } else {
86     if (dst != src1) {
87       movaps(dst, src1);
88     }
89     movhps(dst, src2);
90   }
91 }
92 
93 void SharedTurboAssembler::Movlps(XMMRegister dst, XMMRegister src1,
94                                   Operand src2) {
95   if (CpuFeatures::IsSupported(AVX)) {
96     CpuFeatureScope scope(this, AVX);
97     vmovlps(dst, src1, src2);
98   } else {
99     if (dst != src1) {
100       movaps(dst, src1);
101     }
102     movlps(dst, src2);
103   }
104 }
105 
106 void SharedTurboAssembler::Pblendvb(XMMRegister dst, XMMRegister src1,
107                                     XMMRegister src2, XMMRegister mask) {
108   if (CpuFeatures::IsSupported(AVX)) {
109     CpuFeatureScope scope(this, AVX);
110     vpblendvb(dst, src1, src2, mask);
111   } else {
112     CpuFeatureScope scope(this, SSE4_1);
113     DCHECK_EQ(mask, xmm0);
114     DCHECK_EQ(dst, src1);
115     pblendvb(dst, src2);
116   }
117 }
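// Per-byte semantics implemented above (illustrative sketch, not code from
// this file): for each byte index i,
//   dst[i] = (mask[i] & 0x80) ? src2[i] : src1[i];
// which is why the SSE4.1 fallback requires mask to live in xmm0 and dst to
// alias src1.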
118 
119 void SharedTurboAssembler::Shufps(XMMRegister dst, XMMRegister src1,
120                                   XMMRegister src2, uint8_t imm8) {
121   if (CpuFeatures::IsSupported(AVX)) {
122     CpuFeatureScope avx_scope(this, AVX);
123     vshufps(dst, src1, src2, imm8);
124   } else {
125     if (dst != src1) {
126       movaps(dst, src1);
127     }
128     shufps(dst, src2, imm8);
129   }
130 }
131 
132 void SharedTurboAssembler::F64x2ExtractLane(DoubleRegister dst, XMMRegister src,
133                                             uint8_t lane) {
134   ASM_CODE_COMMENT(this);
135   if (lane == 0) {
136     if (dst != src) {
137       Movaps(dst, src);
138     }
139   } else {
140     DCHECK_EQ(1, lane);
141     if (CpuFeatures::IsSupported(AVX)) {
142       CpuFeatureScope avx_scope(this, AVX);
143       // Pass src as operand to avoid false-dependency on dst.
144       vmovhlps(dst, src, src);
145     } else {
146       movhlps(dst, src);
147     }
148   }
149 }
150 
151 void SharedTurboAssembler::F64x2ReplaceLane(XMMRegister dst, XMMRegister src,
152                                             DoubleRegister rep, uint8_t lane) {
153   ASM_CODE_COMMENT(this);
154   if (CpuFeatures::IsSupported(AVX)) {
155     CpuFeatureScope scope(this, AVX);
156     if (lane == 0) {
157       vmovsd(dst, src, rep);
158     } else {
159       vmovlhps(dst, src, rep);
160     }
161   } else {
162     CpuFeatureScope scope(this, SSE4_1);
163     if (dst != src) {
164       DCHECK_NE(dst, rep);  // Ensure rep is not overwritten.
165       movaps(dst, src);
166     }
167     if (lane == 0) {
168       movsd(dst, rep);
169     } else {
170       movlhps(dst, rep);
171     }
172   }
173 }
174 
175 void SharedTurboAssembler::F32x4Min(XMMRegister dst, XMMRegister lhs,
176                                     XMMRegister rhs, XMMRegister scratch) {
177   ASM_CODE_COMMENT(this);
178   // The minps instruction doesn't propagate NaNs and +0's in its first
179   // operand. Perform minps in both orders, merge the results, and adjust.
180   if (CpuFeatures::IsSupported(AVX)) {
181     CpuFeatureScope scope(this, AVX);
182     vminps(scratch, lhs, rhs);
183     vminps(dst, rhs, lhs);
184   } else if (dst == lhs || dst == rhs) {
185     XMMRegister src = dst == lhs ? rhs : lhs;
186     movaps(scratch, src);
187     minps(scratch, dst);
188     minps(dst, src);
189   } else {
190     movaps(scratch, lhs);
191     minps(scratch, rhs);
192     movaps(dst, rhs);
193     minps(dst, lhs);
194   }
195   // Propagate -0's and NaNs, which may be non-canonical.
196   Orps(scratch, dst);
197   // Canonicalize NaNs by quieting and clearing the payload.
198   Cmpunordps(dst, dst, scratch);
199   Orps(scratch, dst);
200   Psrld(dst, dst, byte{10});
201   Andnps(dst, dst, scratch);
202 }
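// Illustrative scalar sketch of the lane semantics produced above (assumed
// helper name, not code from this file; the sign of the canonical NaN is
// unspecified):
//   float F32MinWasm(float a, float b) {
//     if (std::isnan(a) || std::isnan(b)) {
//       return std::numeric_limits<float>::quiet_NaN();  // canonicalized NaN
//     }
//     if (a == 0.0f && b == 0.0f) {
//       return std::signbit(a) ? a : b;  // -0.0 is treated as less than +0.0
//     }
//     return a < b ? a : b;
//   }
// F32x4Max and the F64x2 variants below follow the mirrored logic.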
203 
204 void SharedTurboAssembler::F32x4Max(XMMRegister dst, XMMRegister lhs,
205                                     XMMRegister rhs, XMMRegister scratch) {
206   ASM_CODE_COMMENT(this);
207   // The maxps instruction doesn't propagate NaNs and +0's in its first
208   // operand. Perform maxps in both orders, merge the results, and adjust.
209   if (CpuFeatures::IsSupported(AVX)) {
210     CpuFeatureScope scope(this, AVX);
211     vmaxps(scratch, lhs, rhs);
212     vmaxps(dst, rhs, lhs);
213   } else if (dst == lhs || dst == rhs) {
214     XMMRegister src = dst == lhs ? rhs : lhs;
215     movaps(scratch, src);
216     maxps(scratch, dst);
217     maxps(dst, src);
218   } else {
219     movaps(scratch, lhs);
220     maxps(scratch, rhs);
221     movaps(dst, rhs);
222     maxps(dst, lhs);
223   }
224   // Find discrepancies.
225   Xorps(dst, scratch);
226   // Propagate NaNs, which may be non-canonical.
227   Orps(scratch, dst);
228   // Propagate sign discrepancy and (subtle) quiet NaNs.
229   Subps(scratch, scratch, dst);
230   // Canonicalize NaNs by clearing the payload. Sign is non-deterministic.
231   Cmpunordps(dst, dst, scratch);
232   Psrld(dst, dst, byte{10});
233   Andnps(dst, dst, scratch);
234 }
235 
236 void SharedTurboAssembler::F64x2Min(XMMRegister dst, XMMRegister lhs,
237                                     XMMRegister rhs, XMMRegister scratch) {
238   ASM_CODE_COMMENT(this);
239   if (CpuFeatures::IsSupported(AVX)) {
240     CpuFeatureScope scope(this, AVX);
241     // The minpd instruction doesn't propagate NaNs and +0's in its first
242     // operand. Perform minpd in both orders, merge the results, and adjust.
243     vminpd(scratch, lhs, rhs);
244     vminpd(dst, rhs, lhs);
245     // Propagate -0's and NaNs, which may be non-canonical.
246     vorpd(scratch, scratch, dst);
247     // Canonicalize NaNs by quieting and clearing the payload.
248     vcmpunordpd(dst, dst, scratch);
249     vorpd(scratch, scratch, dst);
250     vpsrlq(dst, dst, byte{13});
251     vandnpd(dst, dst, scratch);
252   } else {
253     // Compare lhs with rhs, and rhs with lhs, and have the results in scratch
254     // and dst. If dst overlaps with lhs or rhs, we can save a move.
255     if (dst == lhs || dst == rhs) {
256       XMMRegister src = dst == lhs ? rhs : lhs;
257       movaps(scratch, src);
258       minpd(scratch, dst);
259       minpd(dst, src);
260     } else {
261       movaps(scratch, lhs);
262       movaps(dst, rhs);
263       minpd(scratch, rhs);
264       minpd(dst, lhs);
265     }
266     orpd(scratch, dst);
267     cmpunordpd(dst, scratch);
268     orpd(scratch, dst);
269     psrlq(dst, byte{13});
270     andnpd(dst, scratch);
271   }
272 }
273 
274 void SharedTurboAssembler::F64x2Max(XMMRegister dst, XMMRegister lhs,
275                                     XMMRegister rhs, XMMRegister scratch) {
276   ASM_CODE_COMMENT(this);
277   if (CpuFeatures::IsSupported(AVX)) {
278     CpuFeatureScope scope(this, AVX);
279     // The maxpd instruction doesn't propagate NaNs and +0's in its first
280     // operand. Perform maxpd in both orders, merge the results, and adjust.
281     vmaxpd(scratch, lhs, rhs);
282     vmaxpd(dst, rhs, lhs);
283     // Find discrepancies.
284     vxorpd(dst, dst, scratch);
285     // Propagate NaNs, which may be non-canonical.
286     vorpd(scratch, scratch, dst);
287     // Propagate sign discrepancy and (subtle) quiet NaNs.
288     vsubpd(scratch, scratch, dst);
289     // Canonicalize NaNs by clearing the payload. Sign is non-deterministic.
290     vcmpunordpd(dst, dst, scratch);
291     vpsrlq(dst, dst, byte{13});
292     vandnpd(dst, dst, scratch);
293   } else {
294     if (dst == lhs || dst == rhs) {
295       XMMRegister src = dst == lhs ? rhs : lhs;
296       movaps(scratch, src);
297       maxpd(scratch, dst);
298       maxpd(dst, src);
299     } else {
300       movaps(scratch, lhs);
301       movaps(dst, rhs);
302       maxpd(scratch, rhs);
303       maxpd(dst, lhs);
304     }
305     xorpd(dst, scratch);
306     orpd(scratch, dst);
307     subpd(scratch, dst);
308     cmpunordpd(dst, scratch);
309     psrlq(dst, byte{13});
310     andnpd(dst, scratch);
311   }
312 }
313 
314 void SharedTurboAssembler::F32x4Splat(XMMRegister dst, DoubleRegister src) {
315   ASM_CODE_COMMENT(this);
316   if (CpuFeatures::IsSupported(AVX2)) {
317     CpuFeatureScope avx2_scope(this, AVX2);
318     vbroadcastss(dst, src);
319   } else if (CpuFeatures::IsSupported(AVX)) {
320     CpuFeatureScope avx_scope(this, AVX);
321     vshufps(dst, src, src, 0);
322   } else {
323     if (dst == src) {
324       // 1 byte shorter than pshufd.
325       shufps(dst, src, 0);
326     } else {
327       pshufd(dst, src, 0);
328     }
329   }
330 }
331 
332 void SharedTurboAssembler::F32x4ExtractLane(FloatRegister dst, XMMRegister src,
333                                             uint8_t lane) {
334   ASM_CODE_COMMENT(this);
335   DCHECK_LT(lane, 4);
336   // These instructions are shorter than insertps, but will leave junk in
337   // the top lanes of dst.
338   if (lane == 0) {
339     if (dst != src) {
340       Movaps(dst, src);
341     }
342   } else if (lane == 1) {
343     Movshdup(dst, src);
344   } else if (lane == 2 && dst == src) {
345     // Check dst == src to avoid false dependency on dst.
346     Movhlps(dst, src);
347   } else if (dst == src) {
348     Shufps(dst, src, src, lane);
349   } else {
350     Pshufd(dst, src, lane);
351   }
352 }
353 
354 void SharedTurboAssembler::S128Store32Lane(Operand dst, XMMRegister src,
355                                            uint8_t laneidx) {
356   ASM_CODE_COMMENT(this);
357   if (laneidx == 0) {
358     Movss(dst, src);
359   } else {
360     DCHECK_GE(3, laneidx);
361     Extractps(dst, src, laneidx);
362   }
363 }
364 
365 template <typename Op>
366 void SharedTurboAssembler::I8x16SplatPreAvx2(XMMRegister dst, Op src,
367                                              XMMRegister scratch) {
368   ASM_CODE_COMMENT(this);
369   DCHECK(!CpuFeatures::IsSupported(AVX2));
370   CpuFeatureScope ssse3_scope(this, SSSE3);
371   Movd(dst, src);
372   Xorps(scratch, scratch);
373   Pshufb(dst, scratch);
374 }
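// Note (illustrative): pshufb with an all-zero control vector selects byte 0
// of the source for every destination byte, so Movd followed by Pshufb with a
// zeroed |scratch| broadcasts the low byte of |src| into all 16 lanes.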
375 
376 void SharedTurboAssembler::I8x16Splat(XMMRegister dst, Register src,
377                                       XMMRegister scratch) {
378   ASM_CODE_COMMENT(this);
379   if (CpuFeatures::IsSupported(AVX2)) {
380     CpuFeatureScope avx2_scope(this, AVX2);
381     Movd(scratch, src);
382     vpbroadcastb(dst, scratch);
383   } else {
384     I8x16SplatPreAvx2(dst, src, scratch);
385   }
386 }
387 
388 void SharedTurboAssembler::I8x16Splat(XMMRegister dst, Operand src,
389                                       XMMRegister scratch) {
390   ASM_CODE_COMMENT(this);
391   DCHECK_OPERAND_IS_NOT_REG(src);
392   if (CpuFeatures::IsSupported(AVX2)) {
393     CpuFeatureScope avx2_scope(this, AVX2);
394     vpbroadcastb(dst, src);
395   } else {
396     I8x16SplatPreAvx2(dst, src, scratch);
397   }
398 }
399 
400 void SharedTurboAssembler::I8x16Shl(XMMRegister dst, XMMRegister src1,
401                                     uint8_t src2, Register tmp1,
402                                     XMMRegister tmp2) {
403   ASM_CODE_COMMENT(this);
404   DCHECK_NE(dst, tmp2);
405   // Perform 16-bit shift, then mask away low bits.
406   if (!CpuFeatures::IsSupported(AVX) && (dst != src1)) {
407     movaps(dst, src1);
408     src1 = dst;
409   }
410 
411   uint8_t shift = truncate_to_int3(src2);
412   Psllw(dst, src1, byte{shift});
413 
414   uint8_t bmask = static_cast<uint8_t>(0xff << shift);
415   uint32_t mask = bmask << 24 | bmask << 16 | bmask << 8 | bmask;
416   Move(tmp1, mask);
417   Movd(tmp2, tmp1);
418   Pshufd(tmp2, tmp2, uint8_t{0});
419   Pand(dst, tmp2);
420 }
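// Worked example of the mask above (illustrative): for shift == 3,
//   bmask = uint8_t(0xff << 3) = 0xF8 and mask = 0xF8F8F8F8,
// so after the 16-bit psllw every byte keeps only its own shifted bits and the
// bits that spilled in from the neighbouring byte are cleared.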
421 
422 void SharedTurboAssembler::I8x16Shl(XMMRegister dst, XMMRegister src1,
423                                     Register src2, Register tmp1,
424                                     XMMRegister tmp2, XMMRegister tmp3) {
425   ASM_CODE_COMMENT(this);
426   DCHECK(!AreAliased(dst, tmp2, tmp3));
427   DCHECK(!AreAliased(src1, tmp2, tmp3));
428 
429   // Take shift value modulo 8.
430   Move(tmp1, src2);
431   And(tmp1, Immediate(7));
432   Add(tmp1, Immediate(8));
433   // Create a mask to unset high bits.
434   Movd(tmp3, tmp1);
435   Pcmpeqd(tmp2, tmp2);
436   Psrlw(tmp2, tmp2, tmp3);
437   Packuswb(tmp2, tmp2);
438   if (!CpuFeatures::IsSupported(AVX) && (dst != src1)) {
439     movaps(dst, src1);
440     src1 = dst;
441   }
442   // Mask off the unwanted bits before word-shifting.
443   Pand(dst, src1, tmp2);
444   Add(tmp1, Immediate(-8));
445   Movd(tmp3, tmp1);
446   Psllw(dst, dst, tmp3);
447 }
448 
449 void SharedTurboAssembler::I8x16ShrS(XMMRegister dst, XMMRegister src1,
450                                      uint8_t src2, XMMRegister tmp) {
451   ASM_CODE_COMMENT(this);
452   // Unpack bytes into words, do word (16-bit) shifts, and repack.
453   DCHECK_NE(dst, tmp);
454   uint8_t shift = truncate_to_int3(src2) + 8;
455 
456   Punpckhbw(tmp, src1);
457   Punpcklbw(dst, src1);
458   Psraw(tmp, shift);
459   Psraw(dst, shift);
460   Packsswb(dst, tmp);
461 }
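// Illustrative per-byte identity behind the unpack/shift/pack above (sketch,
// not code from this file): with the byte x placed in the high half of a
// 16-bit lane, shifting arithmetically by shift + 8 discards the junk low
// byte and equals an 8-bit arithmetic shift:
//   (int16_t(x) * 256) >> (shift + 8)  ==  int8_t(x) >> shift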
462 
463 void SharedTurboAssembler::I8x16ShrS(XMMRegister dst, XMMRegister src1,
464                                      Register src2, Register tmp1,
465                                      XMMRegister tmp2, XMMRegister tmp3) {
466   ASM_CODE_COMMENT(this);
467   DCHECK(!AreAliased(dst, tmp2, tmp3));
468   DCHECK_NE(src1, tmp2);
469 
470   // Unpack the bytes into words, do arithmetic shifts, and repack.
471   Punpckhbw(tmp2, src1);
472   Punpcklbw(dst, src1);
473   // Prepare shift value
474   Move(tmp1, src2);
475   // Take shift value modulo 8.
476   And(tmp1, Immediate(7));
477   Add(tmp1, Immediate(8));
478   Movd(tmp3, tmp1);
479   Psraw(tmp2, tmp3);
480   Psraw(dst, tmp3);
481   Packsswb(dst, tmp2);
482 }
483 
484 void SharedTurboAssembler::I8x16ShrU(XMMRegister dst, XMMRegister src1,
485                                      uint8_t src2, Register tmp1,
486                                      XMMRegister tmp2) {
487   ASM_CODE_COMMENT(this);
488   DCHECK_NE(dst, tmp2);
489   if (!CpuFeatures::IsSupported(AVX) && (dst != src1)) {
490     movaps(dst, src1);
491     src1 = dst;
492   }
493 
494   // Perform 16-bit shift, then mask away high bits.
495   uint8_t shift = truncate_to_int3(src2);
496   Psrlw(dst, src1, shift);
497 
498   uint8_t bmask = 0xff >> shift;
499   uint32_t mask = bmask << 24 | bmask << 16 | bmask << 8 | bmask;
500   Move(tmp1, mask);
501   Movd(tmp2, tmp1);
502   Pshufd(tmp2, tmp2, byte{0});
503   Pand(dst, tmp2);
504 }
505 
506 void SharedTurboAssembler::I8x16ShrU(XMMRegister dst, XMMRegister src1,
507                                      Register src2, Register tmp1,
508                                      XMMRegister tmp2, XMMRegister tmp3) {
509   ASM_CODE_COMMENT(this);
510   DCHECK(!AreAliased(dst, tmp2, tmp3));
511   DCHECK_NE(src1, tmp2);
512 
513   // Unpack the bytes into words, do logical shifts, and repack.
514   Punpckhbw(tmp2, src1);
515   Punpcklbw(dst, src1);
516   // Prepare shift value.
517   Move(tmp1, src2);
518   // Take shift value modulo 8.
519   And(tmp1, Immediate(7));
520   Add(tmp1, Immediate(8));
521   Movd(tmp3, tmp1);
522   Psrlw(tmp2, tmp3);
523   Psrlw(dst, tmp3);
524   Packuswb(dst, tmp2);
525 }
526 
527 template <typename Op>
528 void SharedTurboAssembler::I16x8SplatPreAvx2(XMMRegister dst, Op src) {
529   DCHECK(!CpuFeatures::IsSupported(AVX2));
530   Movd(dst, src);
531   Pshuflw(dst, dst, uint8_t{0x0});
532   Punpcklqdq(dst, dst);
533 }
534 
535 void SharedTurboAssembler::I16x8Splat(XMMRegister dst, Register src) {
536   ASM_CODE_COMMENT(this);
537   if (CpuFeatures::IsSupported(AVX2)) {
538     CpuFeatureScope avx2_scope(this, AVX2);
539     Movd(dst, src);
540     vpbroadcastw(dst, dst);
541   } else {
542     I16x8SplatPreAvx2(dst, src);
543   }
544 }
545 
546 void SharedTurboAssembler::I16x8Splat(XMMRegister dst, Operand src) {
547   ASM_CODE_COMMENT(this);
548   DCHECK_OPERAND_IS_NOT_REG(src);
549   if (CpuFeatures::IsSupported(AVX2)) {
550     CpuFeatureScope avx2_scope(this, AVX2);
551     vpbroadcastw(dst, src);
552   } else {
553     I16x8SplatPreAvx2(dst, src);
554   }
555 }
556 
557 void SharedTurboAssembler::I16x8ExtMulLow(XMMRegister dst, XMMRegister src1,
558                                           XMMRegister src2, XMMRegister scratch,
559                                           bool is_signed) {
560   ASM_CODE_COMMENT(this);
561   is_signed ? Pmovsxbw(scratch, src1) : Pmovzxbw(scratch, src1);
562   is_signed ? Pmovsxbw(dst, src2) : Pmovzxbw(dst, src2);
563   Pmullw(dst, scratch);
564 }
565 
566 void SharedTurboAssembler::I16x8ExtMulHighS(XMMRegister dst, XMMRegister src1,
567                                             XMMRegister src2,
568                                             XMMRegister scratch) {
569   ASM_CODE_COMMENT(this);
570   if (CpuFeatures::IsSupported(AVX)) {
571     CpuFeatureScope avx_scope(this, AVX);
572     vpunpckhbw(scratch, src1, src1);
573     vpsraw(scratch, scratch, 8);
574     vpunpckhbw(dst, src2, src2);
575     vpsraw(dst, dst, 8);
576     vpmullw(dst, dst, scratch);
577   } else {
578     if (dst != src1) {
579       movaps(dst, src1);
580     }
581     movaps(scratch, src2);
582     punpckhbw(dst, dst);
583     psraw(dst, 8);
584     punpckhbw(scratch, scratch);
585     psraw(scratch, 8);
586     pmullw(dst, scratch);
587   }
588 }
589 
590 void SharedTurboAssembler::I16x8ExtMulHighU(XMMRegister dst, XMMRegister src1,
591                                             XMMRegister src2,
592                                             XMMRegister scratch) {
593   ASM_CODE_COMMENT(this);
594   // The logic here is slightly complicated to handle all the cases of register
595   // aliasing. This allows flexibility for callers in TurboFan and Liftoff.
596   if (CpuFeatures::IsSupported(AVX)) {
597     CpuFeatureScope avx_scope(this, AVX);
598     if (src1 == src2) {
599       vpxor(scratch, scratch, scratch);
600       vpunpckhbw(dst, src1, scratch);
601       vpmullw(dst, dst, dst);
602     } else {
603       if (dst == src2) {
604         // We overwrite dst, then use src2, so swap src1 and src2.
605         std::swap(src1, src2);
606       }
607       vpxor(scratch, scratch, scratch);
608       vpunpckhbw(dst, src1, scratch);
609       vpunpckhbw(scratch, src2, scratch);
610       vpmullw(dst, dst, scratch);
611     }
612   } else {
613     if (src1 == src2) {
614       xorps(scratch, scratch);
615       if (dst != src1) {
616         movaps(dst, src1);
617       }
618       punpckhbw(dst, scratch);
619       pmullw(dst, dst);  // Square, matching the AVX src1 == src2 path.
620     } else {
621       // When dst == src1, nothing special needs to be done.
622       // When dst == src2, swap src1 and src2, since we overwrite dst.
623       // When dst is unique, copy src1 to dst first.
624       if (dst == src2) {
625         std::swap(src1, src2);
626         // Now, dst == src1.
627       } else if (dst != src1) {
628         // dst != src1 && dst != src2.
629         movaps(dst, src1);
630       }
631       xorps(scratch, scratch);
632       punpckhbw(dst, scratch);
633       punpckhbw(scratch, src2);
634       psrlw(scratch, 8);
635       pmullw(dst, scratch);
636     }
637   }
638 }
639 
640 void SharedTurboAssembler::I16x8SConvertI8x16High(XMMRegister dst,
641                                                   XMMRegister src) {
642   ASM_CODE_COMMENT(this);
643   if (CpuFeatures::IsSupported(AVX)) {
644     CpuFeatureScope avx_scope(this, AVX);
645     // src = |a|b|c|d|e|f|g|h|i|j|k|l|m|n|o|p| (high)
646     // dst = |i|i|j|j|k|k|l|l|m|m|n|n|o|o|p|p|
647     vpunpckhbw(dst, src, src);
648     vpsraw(dst, dst, 8);
649   } else {
650     CpuFeatureScope sse_scope(this, SSE4_1);
651     if (dst == src) {
652       // 2 bytes shorter than pshufd, but has a dependency on dst.
653       movhlps(dst, src);
654       pmovsxbw(dst, dst);
655     } else {
656       // No dependency on dst.
657       pshufd(dst, src, 0xEE);
658       pmovsxbw(dst, dst);
659     }
660   }
661 }
662 
663 void SharedTurboAssembler::I16x8UConvertI8x16High(XMMRegister dst,
664                                                   XMMRegister src,
665                                                   XMMRegister scratch) {
666   ASM_CODE_COMMENT(this);
667   if (CpuFeatures::IsSupported(AVX)) {
668     CpuFeatureScope avx_scope(this, AVX);
669     // tmp = |0|0|0|0|0|0|0|0 | 0|0|0|0|0|0|0|0|
670     // src = |a|b|c|d|e|f|g|h | i|j|k|l|m|n|o|p|
671     // dst = |0|a|0|b|0|c|0|d | 0|e|0|f|0|g|0|h|
672     XMMRegister tmp = dst == src ? scratch : dst;
673     vpxor(tmp, tmp, tmp);
674     vpunpckhbw(dst, src, tmp);
675   } else {
676     CpuFeatureScope sse_scope(this, SSE4_1);
677     if (dst == src) {
678       // xorps can be executed on more ports than pshufd.
679       xorps(scratch, scratch);
680       punpckhbw(dst, scratch);
681     } else {
682       // No dependency on dst.
683       pshufd(dst, src, 0xEE);
684       pmovzxbw(dst, dst);
685     }
686   }
687 }
688 
689 void SharedTurboAssembler::I16x8Q15MulRSatS(XMMRegister dst, XMMRegister src1,
690                                             XMMRegister src2,
691                                             XMMRegister scratch) {
692   ASM_CODE_COMMENT(this);
693   // k = i16x8.splat(0x8000)
694   Pcmpeqd(scratch, scratch);
695   Psllw(scratch, scratch, byte{15});
696 
697   if (!CpuFeatures::IsSupported(AVX) && (dst != src1)) {
698     movaps(dst, src1);
699     src1 = dst;
700   }
701 
702   Pmulhrsw(dst, src1, src2);
703   Pcmpeqw(scratch, dst);
704   Pxor(dst, scratch);
705 }
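// Illustrative per-lane semantics (sketch, assumed helper name, not code from
// this file):
//   int16_t Q15MulRSatS(int16_t a, int16_t b) {
//     int32_t r = (int32_t{a} * b + (1 << 14)) >> 15;  // what pmulhrsw computes
//     return static_cast<int16_t>(r > 0x7FFF ? 0x7FFF : r);
//   }
// The only overflowing case is a == b == INT16_MIN, for which pmulhrsw yields
// 0x8000; the pcmpeqw/pxor pair above flips exactly that lane to 0x7FFF.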
706 
707 void SharedTurboAssembler::I32x4ExtAddPairwiseI16x8U(XMMRegister dst,
708                                                      XMMRegister src,
709                                                      XMMRegister tmp) {
710   ASM_CODE_COMMENT(this);
711   if (CpuFeatures::IsSupported(AVX)) {
712     CpuFeatureScope avx_scope(this, AVX);
713     // src = |a|b|c|d|e|f|g|h| (low)
714     // scratch = |0|a|0|c|0|e|0|g|
715     vpsrld(tmp, src, 16);
716     // dst = |0|b|0|d|0|f|0|h|
717     vpblendw(dst, src, tmp, 0xAA);
718     // dst = |a+b|c+d|e+f|g+h|
719     vpaddd(dst, tmp, dst);
720   } else if (CpuFeatures::IsSupported(SSE4_1)) {
721     CpuFeatureScope sse_scope(this, SSE4_1);
722     // There is a potentially better lowering if we get rip-relative
723     // constants, see https://github.com/WebAssembly/simd/pull/380.
724     movaps(tmp, src);
725     psrld(tmp, 16);
726     if (dst != src) {
727       movaps(dst, src);
728     }
729     pblendw(dst, tmp, 0xAA);
730     paddd(dst, tmp);
731   } else {
732     // src = |a|b|c|d|e|f|g|h|
733     // tmp = i32x4.splat(0x0000FFFF)
734     pcmpeqd(tmp, tmp);
735     psrld(tmp, byte{16});
736     // tmp =|0|b|0|d|0|f|0|h|
737     andps(tmp, src);
738     // dst = |0|a|0|c|0|e|0|g|
739     if (dst != src) {
740       movaps(dst, src);
741     }
742     psrld(dst, byte{16});
743     // dst = |a+b|c+d|e+f|g+h|
744     paddd(dst, tmp);
745   }
746 }
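// Illustrative per-dword computation (sketch, not code from this file): each
// 32-bit lane x of src becomes the sum of its two unsigned 16-bit halves,
//   uint32_t lane = (x >> 16) + (x & 0xFFFF);
// regardless of which of the three code paths is taken.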
747 
748 // 1. Multiply low word into scratch.
749 // 2. Multiply high word (can be signed or unsigned) into dst.
750 // 3. Unpack and interleave scratch and dst into dst.
751 void SharedTurboAssembler::I32x4ExtMul(XMMRegister dst, XMMRegister src1,
752                                        XMMRegister src2, XMMRegister scratch,
753                                        bool low, bool is_signed) {
754   ASM_CODE_COMMENT(this);
755   if (CpuFeatures::IsSupported(AVX)) {
756     CpuFeatureScope avx_scope(this, AVX);
757     vpmullw(scratch, src1, src2);
758     is_signed ? vpmulhw(dst, src1, src2) : vpmulhuw(dst, src1, src2);
759     low ? vpunpcklwd(dst, scratch, dst) : vpunpckhwd(dst, scratch, dst);
760   } else {
761     DCHECK_EQ(dst, src1);
762     movaps(scratch, src1);
763     pmullw(dst, src2);
764     is_signed ? pmulhw(scratch, src2) : pmulhuw(scratch, src2);
765     low ? punpcklwd(dst, scratch) : punpckhwd(dst, scratch);
766   }
767 }
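// Illustrative per-lane computation (sketch, not code from this file): for
// 16-bit lanes a and b, the full widening product is
//   int32_t{a} * int32_t{b}   (or uint32_t{a} * uint32_t{b} when unsigned),
// where pmullw supplies its low 16 bits, pmulhw/pmulhuw its high 16 bits, and
// punpcklwd/punpckhwd stitch the two halves back into 32-bit lanes.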
768 
769 void SharedTurboAssembler::I32x4SConvertI16x8High(XMMRegister dst,
770                                                   XMMRegister src) {
771   ASM_CODE_COMMENT(this);
772   if (CpuFeatures::IsSupported(AVX)) {
773     CpuFeatureScope avx_scope(this, AVX);
774     // src = |a|b|c|d|e|f|g|h| (high)
775     // dst = |e|e|f|f|g|g|h|h|
776     vpunpckhwd(dst, src, src);
777     vpsrad(dst, dst, 16);
778   } else {
779     CpuFeatureScope sse_scope(this, SSE4_1);
780     if (dst == src) {
781       // 2 bytes shorter than pshufd, but has a dependency on dst.
782       movhlps(dst, src);
783       pmovsxwd(dst, dst);
784     } else {
785       // No dependency on dst.
786       pshufd(dst, src, 0xEE);
787       pmovsxwd(dst, dst);
788     }
789   }
790 }
791 
792 void SharedTurboAssembler::I32x4UConvertI16x8High(XMMRegister dst,
793                                                   XMMRegister src,
794                                                   XMMRegister scratch) {
795   ASM_CODE_COMMENT(this);
796   if (CpuFeatures::IsSupported(AVX)) {
797     CpuFeatureScope avx_scope(this, AVX);
798     // scratch = |0|0|0|0|0|0|0|0|
799     // src     = |a|b|c|d|e|f|g|h|
800     // dst     = |0|a|0|b|0|c|0|d|
801     XMMRegister tmp = dst == src ? scratch : dst;
802     vpxor(tmp, tmp, tmp);
803     vpunpckhwd(dst, src, tmp);
804   } else {
805     if (dst == src) {
806       // xorps can be executed on more ports than pshufd.
807       xorps(scratch, scratch);
808       punpckhwd(dst, scratch);
809     } else {
810       CpuFeatureScope sse_scope(this, SSE4_1);
811       // No dependency on dst.
812       pshufd(dst, src, 0xEE);
813       pmovzxwd(dst, dst);
814     }
815   }
816 }
817 
818 void SharedTurboAssembler::I64x2Neg(XMMRegister dst, XMMRegister src,
819                                     XMMRegister scratch) {
820   ASM_CODE_COMMENT(this);
821   if (CpuFeatures::IsSupported(AVX)) {
822     CpuFeatureScope scope(this, AVX);
823     vpxor(scratch, scratch, scratch);
824     vpsubq(dst, scratch, src);
825   } else {
826     if (dst == src) {
827       movaps(scratch, src);
828       std::swap(src, scratch);
829     }
830     pxor(dst, dst);
831     psubq(dst, src);
832   }
833 }
834 
835 void SharedTurboAssembler::I64x2Abs(XMMRegister dst, XMMRegister src,
836                                     XMMRegister scratch) {
837   ASM_CODE_COMMENT(this);
838   if (CpuFeatures::IsSupported(AVX)) {
839     CpuFeatureScope avx_scope(this, AVX);
840     XMMRegister tmp = dst == src ? scratch : dst;
841     vpxor(tmp, tmp, tmp);
842     vpsubq(tmp, tmp, src);
843     vblendvpd(dst, src, tmp, src);
844   } else {
845     CpuFeatureScope sse_scope(this, SSE3);
846     movshdup(scratch, src);
847     if (dst != src) {
848       movaps(dst, src);
849     }
850     psrad(scratch, 31);
851     xorps(dst, scratch);
852     psubq(dst, scratch);
853   }
854 }
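// Illustrative per-lane identity used by the SSE3 path above (sketch):
//   int64_t sign = x >> 63;            // 0 for x >= 0, -1 for x < 0
//   int64_t abs_x = (x ^ sign) - sign;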
855 
856 void SharedTurboAssembler::I64x2GtS(XMMRegister dst, XMMRegister src0,
857                                     XMMRegister src1, XMMRegister scratch) {
858   ASM_CODE_COMMENT(this);
859   if (CpuFeatures::IsSupported(AVX)) {
860     CpuFeatureScope avx_scope(this, AVX);
861     vpcmpgtq(dst, src0, src1);
862   } else if (CpuFeatures::IsSupported(SSE4_2)) {
863     CpuFeatureScope sse_scope(this, SSE4_2);
864     if (dst == src0) {
865       pcmpgtq(dst, src1);
866     } else if (dst == src1) {
867       movaps(scratch, src0);
868       pcmpgtq(scratch, src1);
869       movaps(dst, scratch);
870     } else {
871       movaps(dst, src0);
872       pcmpgtq(dst, src1);
873     }
874   } else {
875     CpuFeatureScope sse_scope(this, SSE3);
876     DCHECK_NE(dst, src0);
877     DCHECK_NE(dst, src1);
878     movaps(dst, src1);
879     movaps(scratch, src0);
880     psubq(dst, src0);
881     pcmpeqd(scratch, src1);
882     andps(dst, scratch);
883     movaps(scratch, src0);
884     pcmpgtd(scratch, src1);
885     orps(dst, scratch);
886     movshdup(dst, dst);
887   }
888 }
889 
890 void SharedTurboAssembler::I64x2GeS(XMMRegister dst, XMMRegister src0,
891                                     XMMRegister src1, XMMRegister scratch) {
892   ASM_CODE_COMMENT(this);
893   if (CpuFeatures::IsSupported(AVX)) {
894     CpuFeatureScope avx_scope(this, AVX);
895     vpcmpgtq(dst, src1, src0);
896     vpcmpeqd(scratch, scratch, scratch);
897     vpxor(dst, dst, scratch);
898   } else if (CpuFeatures::IsSupported(SSE4_2)) {
899     CpuFeatureScope sse_scope(this, SSE4_2);
900     DCHECK_NE(dst, src0);
901     if (dst != src1) {
902       movaps(dst, src1);
903     }
904     pcmpgtq(dst, src0);
905     pcmpeqd(scratch, scratch);
906     xorps(dst, scratch);
907   } else {
908     CpuFeatureScope sse_scope(this, SSE3);
909     DCHECK_NE(dst, src0);
910     DCHECK_NE(dst, src1);
911     movaps(dst, src0);
912     movaps(scratch, src1);
913     psubq(dst, src1);
914     pcmpeqd(scratch, src0);
915     andps(dst, scratch);
916     movaps(scratch, src1);
917     pcmpgtd(scratch, src0);
918     orps(dst, scratch);
919     movshdup(dst, dst);
920     pcmpeqd(scratch, scratch);
921     xorps(dst, scratch);
922   }
923 }
924 
925 void SharedTurboAssembler::I64x2ShrS(XMMRegister dst, XMMRegister src,
926                                      uint8_t shift, XMMRegister xmm_tmp) {
927   ASM_CODE_COMMENT(this);
928   DCHECK_GT(64, shift);
929   DCHECK_NE(xmm_tmp, dst);
930   DCHECK_NE(xmm_tmp, src);
931   // Use logical right shift to emulate arithmetic right shifts:
932   // Given:
933   // signed >> c
934   //   == (signed + 2^63 - 2^63) >> c
935   //   == ((signed + 2^63) >> c) - (2^63 >> c)
936   //                                ^^^^^^^^^
937   //                                 xmm_tmp
938   // signed + 2^63 is an unsigned number, so we can use logical right shifts.
939 
940   // xmm_tmp = wasm_i64x2_const(0x80000000'00000000).
941   Pcmpeqd(xmm_tmp, xmm_tmp);
942   Psllq(xmm_tmp, byte{63});
943 
944   if (!CpuFeatures::IsSupported(AVX) && (dst != src)) {
945     movaps(dst, src);
946     src = dst;
947   }
948   // Add a bias of 2^63 to convert signed to unsigned.
949   // Since only highest bit changes, use pxor instead of paddq.
950   Pxor(dst, src, xmm_tmp);
951   // Logically shift both value and bias.
952   Psrlq(dst, shift);
953   Psrlq(xmm_tmp, shift);
954   // Subtract shifted bias to convert back to signed value.
955   Psubq(dst, xmm_tmp);
956 }
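// Illustrative scalar equivalent of the bias trick above (sketch, assumed
// helper name, not code from this file):
//   int64_t I64ShrS(int64_t x, uint8_t s) {  // s < 64
//     uint64_t bias = uint64_t{1} << 63;
//     uint64_t biased = static_cast<uint64_t>(x) ^ bias;  // x + 2^63
//     return static_cast<int64_t>((biased >> s) - (bias >> s));
//   }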
957 
958 void SharedTurboAssembler::I64x2ShrS(XMMRegister dst, XMMRegister src,
959                                      Register shift, XMMRegister xmm_tmp,
960                                      XMMRegister xmm_shift,
961                                      Register tmp_shift) {
962   ASM_CODE_COMMENT(this);
963   DCHECK_NE(xmm_tmp, dst);
964   DCHECK_NE(xmm_tmp, src);
965   DCHECK_NE(xmm_shift, dst);
966   DCHECK_NE(xmm_shift, src);
967   // tmp_shift can alias shift since we don't use shift after masking it.
968 
969   // See I64x2ShrS with constant shift for explanation of this algorithm.
970   Pcmpeqd(xmm_tmp, xmm_tmp);
971   Psllq(xmm_tmp, byte{63});
972 
973   // Shift modulo 64.
974   Move(tmp_shift, shift);
975   And(tmp_shift, Immediate(0x3F));
976   Movd(xmm_shift, tmp_shift);
977 
978   if (!CpuFeatures::IsSupported(AVX) && (dst != src)) {
979     movaps(dst, src);
980     src = dst;
981   }
982   Pxor(dst, src, xmm_tmp);
983   Psrlq(dst, xmm_shift);
984   Psrlq(xmm_tmp, xmm_shift);
985   Psubq(dst, xmm_tmp);
986 }
987 
988 void SharedTurboAssembler::I64x2Mul(XMMRegister dst, XMMRegister lhs,
989                                     XMMRegister rhs, XMMRegister tmp1,
990                                     XMMRegister tmp2) {
991   ASM_CODE_COMMENT(this);
992   DCHECK(!AreAliased(dst, tmp1, tmp2));
993   DCHECK(!AreAliased(lhs, tmp1, tmp2));
994   DCHECK(!AreAliased(rhs, tmp1, tmp2));
995 
996   if (CpuFeatures::IsSupported(AVX)) {
997     CpuFeatureScope avx_scope(this, AVX);
998     // 1. Multiply high dword of each qword of left with right.
999     vpsrlq(tmp1, lhs, byte{32});
1000     vpmuludq(tmp1, tmp1, rhs);
1001     // 2. Multiply high dword of each qword of right with left.
1002     vpsrlq(tmp2, rhs, byte{32});
1003     vpmuludq(tmp2, tmp2, lhs);
1004     // 3. Add 1 and 2, then shift left by 32 (this is the high dword of result).
1005     vpaddq(tmp2, tmp2, tmp1);
1006     vpsllq(tmp2, tmp2, byte{32});
1007     // 4. Multiply low dwords (this is the low dword of result).
1008     vpmuludq(dst, lhs, rhs);
1009     // 5. Add 3 and 4.
1010     vpaddq(dst, dst, tmp2);
1011   } else {
1012     // Same algorithm as AVX version, but with moves to not overwrite inputs.
1013     movaps(tmp1, lhs);
1014     movaps(tmp2, rhs);
1015     psrlq(tmp1, byte{32});
1016     pmuludq(tmp1, rhs);
1017     psrlq(tmp2, byte{32});
1018     pmuludq(tmp2, lhs);
1019     paddq(tmp2, tmp1);
1020     psllq(tmp2, byte{32});
1021     if (dst == rhs) {
1022       // pmuludq is commutative
1023       pmuludq(dst, lhs);
1024     } else {
1025       if (dst != lhs) {
1026         movaps(dst, lhs);
1027       }
1028       pmuludq(dst, rhs);
1029     }
1030     paddq(dst, tmp2);
1031   }
1032 }
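// Illustrative scalar equivalent of the per-lane decomposition above (sketch,
// assumed helper name, not code from this file):
//   uint64_t I64MulLane(uint64_t a, uint64_t b) {
//     uint64_t a_lo = a & 0xFFFFFFFF, a_hi = a >> 32;
//     uint64_t b_lo = b & 0xFFFFFFFF, b_hi = b >> 32;
//     uint64_t cross = (a_hi * b_lo + a_lo * b_hi) << 32;  // steps 1-3
//     return a_lo * b_lo + cross;                          // steps 4-5, mod 2^64
//   }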
1033 
1034 // 1. Unpack src1 with itself into the even-numbered dword elements of scratch.
1035 // 2. Unpack src2 with itself into the even-numbered dword elements of dst.
1036 // 3. Multiply 1. with 2. (pmuldq/pmuludq use the even dwords of each operand).
1037 // For non-AVX, use non-destructive pshufd instead of punpckldq/punpckhdq.
1038 void SharedTurboAssembler::I64x2ExtMul(XMMRegister dst, XMMRegister src1,
1039                                        XMMRegister src2, XMMRegister scratch,
1040                                        bool low, bool is_signed) {
1041   ASM_CODE_COMMENT(this);
1042   if (CpuFeatures::IsSupported(AVX)) {
1043     CpuFeatureScope avx_scope(this, AVX);
1044     if (low) {
1045       vpunpckldq(scratch, src1, src1);
1046       vpunpckldq(dst, src2, src2);
1047     } else {
1048       vpunpckhdq(scratch, src1, src1);
1049       vpunpckhdq(dst, src2, src2);
1050     }
1051     if (is_signed) {
1052       vpmuldq(dst, scratch, dst);
1053     } else {
1054       vpmuludq(dst, scratch, dst);
1055     }
1056   } else {
1057     uint8_t mask = low ? 0x50 : 0xFA;
1058     pshufd(scratch, src1, mask);
1059     pshufd(dst, src2, mask);
1060     if (is_signed) {
1061       CpuFeatureScope sse4_scope(this, SSE4_1);
1062       pmuldq(dst, scratch);
1063     } else {
1064       pmuludq(dst, scratch);
1065     }
1066   }
1067 }
1068 
1069 void SharedTurboAssembler::I64x2SConvertI32x4High(XMMRegister dst,
1070                                                   XMMRegister src) {
1071   ASM_CODE_COMMENT(this);
1072   if (CpuFeatures::IsSupported(AVX)) {
1073     CpuFeatureScope avx_scope(this, AVX);
1074     vpunpckhqdq(dst, src, src);
1075     vpmovsxdq(dst, dst);
1076   } else {
1077     CpuFeatureScope sse_scope(this, SSE4_1);
1078     if (dst == src) {
1079       movhlps(dst, src);
1080     } else {
1081       pshufd(dst, src, 0xEE);
1082     }
1083     pmovsxdq(dst, dst);
1084   }
1085 }
1086 
1087 void SharedTurboAssembler::I64x2UConvertI32x4High(XMMRegister dst,
1088                                                   XMMRegister src,
1089                                                   XMMRegister scratch) {
1090   ASM_CODE_COMMENT(this);
1091   if (CpuFeatures::IsSupported(AVX)) {
1092     CpuFeatureScope avx_scope(this, AVX);
1093     vpxor(scratch, scratch, scratch);
1094     vpunpckhdq(dst, src, scratch);
1095   } else {
1096     if (dst == src) {
1097       // xorps can be executed on more ports than pshufd.
1098       xorps(scratch, scratch);
1099       punpckhdq(dst, scratch);
1100     } else {
1101       CpuFeatureScope sse_scope(this, SSE4_1);
1102       // No dependency on dst.
1103       pshufd(dst, src, 0xEE);
1104       pmovzxdq(dst, dst);
1105     }
1106   }
1107 }
1108 
1109 void SharedTurboAssembler::S128Not(XMMRegister dst, XMMRegister src,
1110                                    XMMRegister scratch) {
1111   ASM_CODE_COMMENT(this);
1112   if (dst == src) {
1113     Pcmpeqd(scratch, scratch);
1114     Pxor(dst, scratch);
1115   } else {
1116     Pcmpeqd(dst, dst);
1117     Pxor(dst, src);
1118   }
1119 }
1120 
1121 void SharedTurboAssembler::S128Select(XMMRegister dst, XMMRegister mask,
1122                                       XMMRegister src1, XMMRegister src2,
1123                                       XMMRegister scratch) {
1124   ASM_CODE_COMMENT(this);
1125   // v128.select = v128.or(v128.and(v1, c), v128.andnot(v2, c)).
1126   // pandn(x, y) = !x & y, so we have to flip the mask and input.
1127   if (CpuFeatures::IsSupported(AVX)) {
1128     CpuFeatureScope avx_scope(this, AVX);
1129     vpandn(scratch, mask, src2);
1130     vpand(dst, src1, mask);
1131     vpor(dst, dst, scratch);
1132   } else {
1133     DCHECK_EQ(dst, mask);
1134     // Use float ops as they are 1 byte shorter than int ops.
1135     movaps(scratch, mask);
1136     andnps(scratch, src2);
1137     andps(dst, src1);
1138     orps(dst, scratch);
1139   }
1140 }
1141 
1142 void SharedTurboAssembler::S128Load8Splat(XMMRegister dst, Operand src,
1143                                           XMMRegister scratch) {
1144   ASM_CODE_COMMENT(this);
1145   // The trap handler uses the current pc to create a landing pad, so that it
1146   // can determine if a trap occurred in Wasm code due to an OOB load. Make sure
1147   // the first instruction in each case below is the one that loads.
1148   if (CpuFeatures::IsSupported(AVX2)) {
1149     CpuFeatureScope avx2_scope(this, AVX2);
1150     vpbroadcastb(dst, src);
1151   } else if (CpuFeatures::IsSupported(AVX)) {
1152     CpuFeatureScope avx_scope(this, AVX);
1153     // Avoid dependency on previous value of dst.
1154     vpinsrb(dst, scratch, src, uint8_t{0});
1155     vpxor(scratch, scratch, scratch);
1156     vpshufb(dst, dst, scratch);
1157   } else {
1158     CpuFeatureScope ssse4_scope(this, SSE4_1);
1159     pinsrb(dst, src, uint8_t{0});
1160     xorps(scratch, scratch);
1161     pshufb(dst, scratch);
1162   }
1163 }
1164 
1165 void SharedTurboAssembler::S128Load16Splat(XMMRegister dst, Operand src,
1166                                            XMMRegister scratch) {
1167   ASM_CODE_COMMENT(this);
1168   // The trap handler uses the current pc to create a landing pad, so that it
1169   // can determine if a trap occurred in Wasm code due to an OOB load. Make sure
1170   // the first instruction in each case below is the one that loads.
1171   if (CpuFeatures::IsSupported(AVX2)) {
1172     CpuFeatureScope avx2_scope(this, AVX2);
1173     vpbroadcastw(dst, src);
1174   } else if (CpuFeatures::IsSupported(AVX)) {
1175     CpuFeatureScope avx_scope(this, AVX);
1176     // Avoid dependency on previous value of dst.
1177     vpinsrw(dst, scratch, src, uint8_t{0});
1178     vpshuflw(dst, dst, uint8_t{0});
1179     vpunpcklqdq(dst, dst, dst);
1180   } else {
1181     pinsrw(dst, src, uint8_t{0});
1182     pshuflw(dst, dst, uint8_t{0});
1183     movlhps(dst, dst);
1184   }
1185 }
1186 
1187 void SharedTurboAssembler::S128Load32Splat(XMMRegister dst, Operand src) {
1188   ASM_CODE_COMMENT(this);
1189   // The trap handler uses the current pc to create a landing pad, so that it
1190   // can determine if a trap occurred in Wasm code due to an OOB load. Make sure
1191   // the first instruction in each case below is the one that loads.
1192   if (CpuFeatures::IsSupported(AVX)) {
1193     CpuFeatureScope avx_scope(this, AVX);
1194     vbroadcastss(dst, src);
1195   } else {
1196     movss(dst, src);
1197     shufps(dst, dst, byte{0});
1198   }
1199 }
1200 
1201 void SharedTurboAssembler::S128Store64Lane(Operand dst, XMMRegister src,
1202                                            uint8_t laneidx) {
1203   ASM_CODE_COMMENT(this);
1204   if (laneidx == 0) {
1205     Movlps(dst, src);
1206   } else {
1207     DCHECK_EQ(1, laneidx);
1208     Movhps(dst, src);
1209   }
1210 }
1211 
1212 // Helper macro to define qfma macro-assembler. This takes care of every
1213 // possible case of register aliasing to minimize the number of instructions.
1214 #define QFMA(ps_or_pd)                        \
1215   if (CpuFeatures::IsSupported(FMA3)) {       \
1216     CpuFeatureScope fma3_scope(this, FMA3);   \
1217     if (dst == src1) {                        \
1218       vfmadd231##ps_or_pd(dst, src2, src3);   \
1219     } else if (dst == src2) {                 \
1220       vfmadd132##ps_or_pd(dst, src1, src3);   \
1221     } else if (dst == src3) {                 \
1222       vfmadd213##ps_or_pd(dst, src2, src1);   \
1223     } else {                                  \
1224       CpuFeatureScope avx_scope(this, AVX);   \
1225       vmovups(dst, src1);                     \
1226       vfmadd231##ps_or_pd(dst, src2, src3);   \
1227     }                                         \
1228   } else if (CpuFeatures::IsSupported(AVX)) { \
1229     CpuFeatureScope avx_scope(this, AVX);     \
1230     vmul##ps_or_pd(tmp, src2, src3);          \
1231     vadd##ps_or_pd(dst, src1, tmp);           \
1232   } else {                                    \
1233     if (dst == src1) {                        \
1234       movaps(tmp, src2);                      \
1235       mul##ps_or_pd(tmp, src3);               \
1236       add##ps_or_pd(dst, tmp);                \
1237     } else if (dst == src2) {                 \
1238       DCHECK_NE(src2, src1);                  \
1239       mul##ps_or_pd(src2, src3);              \
1240       add##ps_or_pd(src2, src1);              \
1241     } else if (dst == src3) {                 \
1242       DCHECK_NE(src3, src1);                  \
1243       mul##ps_or_pd(src3, src2);              \
1244       add##ps_or_pd(src3, src1);              \
1245     } else {                                  \
1246       movaps(dst, src2);                      \
1247       mul##ps_or_pd(dst, src3);               \
1248       add##ps_or_pd(dst, src1);               \
1249     }                                         \
1250   }
1251 
1252 // Helper macro to define qfms macro-assembler. This takes care of every
1253 // possible case of register aliasing to minimize the number of instructions.
1254 #define QFMS(ps_or_pd)                        \
1255   if (CpuFeatures::IsSupported(FMA3)) {       \
1256     CpuFeatureScope fma3_scope(this, FMA3);   \
1257     if (dst == src1) {                        \
1258       vfnmadd231##ps_or_pd(dst, src2, src3);  \
1259     } else if (dst == src2) {                 \
1260       vfnmadd132##ps_or_pd(dst, src1, src3);  \
1261     } else if (dst == src3) {                 \
1262       vfnmadd213##ps_or_pd(dst, src2, src1);  \
1263     } else {                                  \
1264       CpuFeatureScope avx_scope(this, AVX);   \
1265       vmovups(dst, src1);                     \
1266       vfnmadd231##ps_or_pd(dst, src2, src3);  \
1267     }                                         \
1268   } else if (CpuFeatures::IsSupported(AVX)) { \
1269     CpuFeatureScope avx_scope(this, AVX);     \
1270     vmul##ps_or_pd(tmp, src2, src3);          \
1271     vsub##ps_or_pd(dst, src1, tmp);           \
1272   } else {                                    \
1273     movaps(tmp, src2);                        \
1274     mul##ps_or_pd(tmp, src3);                 \
1275     if (dst != src1) {                        \
1276       movaps(dst, src1);                      \
1277     }                                         \
1278     sub##ps_or_pd(dst, tmp);                  \
1279   }
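// Illustrative per-lane semantics of the two macros above (sketch, not code
// from this file): for every float/double lane,
//   QFMA: dst = src1 + src2 * src3   (fused on FMA3, mul then add otherwise)
//   QFMS: dst = src1 - src2 * src3   (fused on FMA3, mul then sub otherwise)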
1280 
1281 void SharedTurboAssembler::F32x4Qfma(XMMRegister dst, XMMRegister src1,
1282                                      XMMRegister src2, XMMRegister src3,
1283                                      XMMRegister tmp) {
1284   QFMA(ps)
1285 }
1286 
1287 void SharedTurboAssembler::F32x4Qfms(XMMRegister dst, XMMRegister src1,
1288                                      XMMRegister src2, XMMRegister src3,
1289                                      XMMRegister tmp) {
1290   QFMS(ps)
1291 }
1292 
1293 void SharedTurboAssembler::F64x2Qfma(XMMRegister dst, XMMRegister src1,
1294                                      XMMRegister src2, XMMRegister src3,
1295                                      XMMRegister tmp) {
1296   QFMA(pd);
1297 }
1298 
1299 void SharedTurboAssembler::F64x2Qfms(XMMRegister dst, XMMRegister src1,
1300                                      XMMRegister src2, XMMRegister src3,
1301                                      XMMRegister tmp) {
1302   QFMS(pd);
1303 }
1304 
1305 #undef QFMA
#undef QFMS
1306 
1307 }  // namespace internal
1308 }  // namespace v8
1309 
1310 #undef DCHECK_OPERAND_IS_NOT_REG
1311