• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2013 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #include "src/base/overflowing-math.h"
6 #include "src/codegen/assembler-inl.h"
7 #include "src/codegen/callable.h"
8 #include "src/codegen/ia32/assembler-ia32.h"
9 #include "src/codegen/macro-assembler.h"
10 #include "src/codegen/optimized-compilation-info.h"
11 #include "src/compiler/backend/code-generator-impl.h"
12 #include "src/compiler/backend/code-generator.h"
13 #include "src/compiler/backend/gap-resolver.h"
14 #include "src/compiler/node-matchers.h"
15 #include "src/compiler/osr.h"
16 #include "src/execution/frame-constants.h"
17 #include "src/execution/frames.h"
18 #include "src/heap/memory-chunk.h"
19 #include "src/objects/smi.h"
20 #include "src/wasm/wasm-code-manager.h"
21 #include "src/wasm/wasm-objects.h"
22 
23 namespace v8 {
24 namespace internal {
25 namespace compiler {
26 
27 #define __ tasm()->
28 
29 #define kScratchDoubleReg xmm0
30 
31 // Adds IA-32 specific methods for decoding operands.
32 class IA32OperandConverter : public InstructionOperandConverter {
33  public:
IA32OperandConverter(CodeGenerator * gen,Instruction * instr)34   IA32OperandConverter(CodeGenerator* gen, Instruction* instr)
35       : InstructionOperandConverter(gen, instr) {}
36 
InputOperand(size_t index,int extra=0)37   Operand InputOperand(size_t index, int extra = 0) {
38     return ToOperand(instr_->InputAt(index), extra);
39   }
40 
InputImmediate(size_t index)41   Immediate InputImmediate(size_t index) {
42     return ToImmediate(instr_->InputAt(index));
43   }
44 
OutputOperand()45   Operand OutputOperand() { return ToOperand(instr_->Output()); }
46 
ToOperand(InstructionOperand * op,int extra=0)47   Operand ToOperand(InstructionOperand* op, int extra = 0) {
48     if (op->IsRegister()) {
49       DCHECK_EQ(0, extra);
50       return Operand(ToRegister(op));
51     } else if (op->IsFPRegister()) {
52       DCHECK_EQ(0, extra);
53       return Operand(ToDoubleRegister(op));
54     }
55     DCHECK(op->IsStackSlot() || op->IsFPStackSlot());
56     return SlotToOperand(AllocatedOperand::cast(op)->index(), extra);
57   }
58 
SlotToOperand(int slot,int extra=0)59   Operand SlotToOperand(int slot, int extra = 0) {
60     FrameOffset offset = frame_access_state()->GetFrameOffset(slot);
61     return Operand(offset.from_stack_pointer() ? esp : ebp,
62                    offset.offset() + extra);
63   }
64 
ToImmediate(InstructionOperand * operand)65   Immediate ToImmediate(InstructionOperand* operand) {
66     Constant constant = ToConstant(operand);
67     if (constant.type() == Constant::kInt32 &&
68         RelocInfo::IsWasmReference(constant.rmode())) {
69       return Immediate(static_cast<Address>(constant.ToInt32()),
70                        constant.rmode());
71     }
72     switch (constant.type()) {
73       case Constant::kInt32:
74         return Immediate(constant.ToInt32());
75       case Constant::kFloat32:
76         return Immediate::EmbeddedNumber(constant.ToFloat32());
77       case Constant::kFloat64:
78         return Immediate::EmbeddedNumber(constant.ToFloat64().value());
79       case Constant::kExternalReference:
80         return Immediate(constant.ToExternalReference());
81       case Constant::kHeapObject:
82         return Immediate(constant.ToHeapObject());
83       case Constant::kCompressedHeapObject:
84         break;
85       case Constant::kDelayedStringConstant:
86         return Immediate::EmbeddedStringConstant(
87             constant.ToDelayedStringConstant());
88       case Constant::kInt64:
89         break;
90       case Constant::kRpoNumber:
91         return Immediate::CodeRelativeOffset(ToLabel(operand));
92     }
93     UNREACHABLE();
94   }
95 
NextOffset(size_t * offset)96   static size_t NextOffset(size_t* offset) {
97     size_t i = *offset;
98     (*offset)++;
99     return i;
100   }
101 
ScaleFor(AddressingMode one,AddressingMode mode)102   static ScaleFactor ScaleFor(AddressingMode one, AddressingMode mode) {
103     STATIC_ASSERT(0 == static_cast<int>(times_1));
104     STATIC_ASSERT(1 == static_cast<int>(times_2));
105     STATIC_ASSERT(2 == static_cast<int>(times_4));
106     STATIC_ASSERT(3 == static_cast<int>(times_8));
107     int scale = static_cast<int>(mode - one);
108     DCHECK(scale >= 0 && scale < 4);
109     return static_cast<ScaleFactor>(scale);
110   }
111 
MemoryOperand(size_t * offset)112   Operand MemoryOperand(size_t* offset) {
113     AddressingMode mode = AddressingModeField::decode(instr_->opcode());
114     switch (mode) {
115       case kMode_MR: {
116         Register base = InputRegister(NextOffset(offset));
117         int32_t disp = 0;
118         return Operand(base, disp);
119       }
120       case kMode_MRI: {
121         Register base = InputRegister(NextOffset(offset));
122         Constant ctant = ToConstant(instr_->InputAt(NextOffset(offset)));
123         return Operand(base, ctant.ToInt32(), ctant.rmode());
124       }
125       case kMode_MR1:
126       case kMode_MR2:
127       case kMode_MR4:
128       case kMode_MR8: {
129         Register base = InputRegister(NextOffset(offset));
130         Register index = InputRegister(NextOffset(offset));
131         ScaleFactor scale = ScaleFor(kMode_MR1, mode);
132         int32_t disp = 0;
133         return Operand(base, index, scale, disp);
134       }
135       case kMode_MR1I:
136       case kMode_MR2I:
137       case kMode_MR4I:
138       case kMode_MR8I: {
139         Register base = InputRegister(NextOffset(offset));
140         Register index = InputRegister(NextOffset(offset));
141         ScaleFactor scale = ScaleFor(kMode_MR1I, mode);
142         Constant ctant = ToConstant(instr_->InputAt(NextOffset(offset)));
143         return Operand(base, index, scale, ctant.ToInt32(), ctant.rmode());
144       }
145       case kMode_M1:
146       case kMode_M2:
147       case kMode_M4:
148       case kMode_M8: {
149         Register index = InputRegister(NextOffset(offset));
150         ScaleFactor scale = ScaleFor(kMode_M1, mode);
151         int32_t disp = 0;
152         return Operand(index, scale, disp);
153       }
154       case kMode_M1I:
155       case kMode_M2I:
156       case kMode_M4I:
157       case kMode_M8I: {
158         Register index = InputRegister(NextOffset(offset));
159         ScaleFactor scale = ScaleFor(kMode_M1I, mode);
160         Constant ctant = ToConstant(instr_->InputAt(NextOffset(offset)));
161         return Operand(index, scale, ctant.ToInt32(), ctant.rmode());
162       }
163       case kMode_MI: {
164         Constant ctant = ToConstant(instr_->InputAt(NextOffset(offset)));
165         return Operand(ctant.ToInt32(), ctant.rmode());
166       }
167       case kMode_Root: {
168         Register base = kRootRegister;
169         int32_t disp = InputInt32(NextOffset(offset));
170         return Operand(base, disp);
171       }
172       case kMode_None:
173         UNREACHABLE();
174     }
175     UNREACHABLE();
176   }
177 
MemoryOperand(size_t first_input=0)178   Operand MemoryOperand(size_t first_input = 0) {
179     return MemoryOperand(&first_input);
180   }
181 
NextMemoryOperand(size_t offset=0)182   Operand NextMemoryOperand(size_t offset = 0) {
183     AddressingMode mode = AddressingModeField::decode(instr_->opcode());
184     Register base = InputRegister(NextOffset(&offset));
185     const int32_t disp = 4;
186     if (mode == kMode_MR1) {
187       Register index = InputRegister(NextOffset(&offset));
188       ScaleFactor scale = ScaleFor(kMode_MR1, kMode_MR1);
189       return Operand(base, index, scale, disp);
190     } else if (mode == kMode_MRI) {
191       Constant ctant = ToConstant(instr_->InputAt(NextOffset(&offset)));
192       return Operand(base, ctant.ToInt32() + disp, ctant.rmode());
193     } else {
194       UNREACHABLE();
195     }
196   }
197 
MoveInstructionOperandToRegister(Register destination,InstructionOperand * op)198   void MoveInstructionOperandToRegister(Register destination,
199                                         InstructionOperand* op) {
200     if (op->IsImmediate() || op->IsConstant()) {
201       gen_->tasm()->mov(destination, ToImmediate(op));
202     } else if (op->IsRegister()) {
203       gen_->tasm()->Move(destination, ToRegister(op));
204     } else {
205       gen_->tasm()->mov(destination, ToOperand(op));
206     }
207   }
208 };
209 
210 namespace {
211 
HasAddressingMode(Instruction * instr)212 bool HasAddressingMode(Instruction* instr) {
213   return instr->addressing_mode() != kMode_None;
214 }
215 
HasImmediateInput(Instruction * instr,size_t index)216 bool HasImmediateInput(Instruction* instr, size_t index) {
217   return instr->InputAt(index)->IsImmediate();
218 }
219 
HasRegisterInput(Instruction * instr,size_t index)220 bool HasRegisterInput(Instruction* instr, size_t index) {
221   return instr->InputAt(index)->IsRegister();
222 }
223 
224 class OutOfLineLoadFloat32NaN final : public OutOfLineCode {
225  public:
OutOfLineLoadFloat32NaN(CodeGenerator * gen,XMMRegister result)226   OutOfLineLoadFloat32NaN(CodeGenerator* gen, XMMRegister result)
227       : OutOfLineCode(gen), result_(result) {}
228 
Generate()229   void Generate() final {
230     __ xorps(result_, result_);
231     __ divss(result_, result_);
232   }
233 
234  private:
235   XMMRegister const result_;
236 };
237 
238 class OutOfLineLoadFloat64NaN final : public OutOfLineCode {
239  public:
OutOfLineLoadFloat64NaN(CodeGenerator * gen,XMMRegister result)240   OutOfLineLoadFloat64NaN(CodeGenerator* gen, XMMRegister result)
241       : OutOfLineCode(gen), result_(result) {}
242 
Generate()243   void Generate() final {
244     __ xorpd(result_, result_);
245     __ divsd(result_, result_);
246   }
247 
248  private:
249   XMMRegister const result_;
250 };
251 
252 class OutOfLineTruncateDoubleToI final : public OutOfLineCode {
253  public:
OutOfLineTruncateDoubleToI(CodeGenerator * gen,Register result,XMMRegister input,StubCallMode stub_mode)254   OutOfLineTruncateDoubleToI(CodeGenerator* gen, Register result,
255                              XMMRegister input, StubCallMode stub_mode)
256       : OutOfLineCode(gen),
257         result_(result),
258         input_(input),
259         stub_mode_(stub_mode),
260         isolate_(gen->isolate()),
261         zone_(gen->zone()) {}
262 
Generate()263   void Generate() final {
264     __ AllocateStackSpace(kDoubleSize);
265     __ movsd(MemOperand(esp, 0), input_);
266     if (stub_mode_ == StubCallMode::kCallWasmRuntimeStub) {
267       // A direct call to a wasm runtime stub defined in this module.
268       // Just encode the stub index. This will be patched when the code
269       // is added to the native module and copied into wasm code space.
270       __ wasm_call(wasm::WasmCode::kDoubleToI, RelocInfo::WASM_STUB_CALL);
271     } else if (tasm()->options().inline_offheap_trampolines) {
272       __ CallBuiltin(Builtins::kDoubleToI);
273     } else {
274       __ Call(BUILTIN_CODE(isolate_, DoubleToI), RelocInfo::CODE_TARGET);
275     }
276     __ mov(result_, MemOperand(esp, 0));
277     __ add(esp, Immediate(kDoubleSize));
278   }
279 
280  private:
281   Register const result_;
282   XMMRegister const input_;
283   StubCallMode stub_mode_;
284   Isolate* isolate_;
285   Zone* zone_;
286 };
287 
288 class OutOfLineRecordWrite final : public OutOfLineCode {
289  public:
OutOfLineRecordWrite(CodeGenerator * gen,Register object,Operand operand,Register value,Register scratch0,Register scratch1,RecordWriteMode mode,StubCallMode stub_mode)290   OutOfLineRecordWrite(CodeGenerator* gen, Register object, Operand operand,
291                        Register value, Register scratch0, Register scratch1,
292                        RecordWriteMode mode, StubCallMode stub_mode)
293       : OutOfLineCode(gen),
294         object_(object),
295         operand_(operand),
296         value_(value),
297         scratch0_(scratch0),
298         scratch1_(scratch1),
299         mode_(mode),
300         stub_mode_(stub_mode),
301         zone_(gen->zone()) {}
302 
Generate()303   void Generate() final {
304     if (mode_ > RecordWriteMode::kValueIsPointer) {
305       __ JumpIfSmi(value_, exit());
306     }
307     __ CheckPageFlag(value_, scratch0_,
308                      MemoryChunk::kPointersToHereAreInterestingMask, zero,
309                      exit());
310     __ lea(scratch1_, operand_);
311     RememberedSetAction const remembered_set_action =
312         mode_ > RecordWriteMode::kValueIsMap ? EMIT_REMEMBERED_SET
313                                              : OMIT_REMEMBERED_SET;
314     SaveFPRegsMode const save_fp_mode =
315         frame()->DidAllocateDoubleRegisters() ? kSaveFPRegs : kDontSaveFPRegs;
316     if (mode_ == RecordWriteMode::kValueIsEphemeronKey) {
317       __ CallEphemeronKeyBarrier(object_, scratch1_, save_fp_mode);
318     } else if (stub_mode_ == StubCallMode::kCallWasmRuntimeStub) {
319       // A direct call to a wasm runtime stub defined in this module.
320       // Just encode the stub index. This will be patched when the code
321       // is added to the native module and copied into wasm code space.
322       __ CallRecordWriteStub(object_, scratch1_, remembered_set_action,
323                              save_fp_mode, wasm::WasmCode::kRecordWrite);
324     } else {
325       __ CallRecordWriteStub(object_, scratch1_, remembered_set_action,
326                              save_fp_mode);
327     }
328   }
329 
330  private:
331   Register const object_;
332   Operand const operand_;
333   Register const value_;
334   Register const scratch0_;
335   Register const scratch1_;
336   RecordWriteMode const mode_;
337   StubCallMode const stub_mode_;
338   Zone* zone_;
339 };
340 
341 }  // namespace
342 
// Emits a two-operand compare, dispatching on the operand shapes:
// memory vs. imm/reg, then reg/op vs. imm, then reg vs. reg/op.
#define ASSEMBLE_COMPARE(asm_instr)                              \
  do {                                                           \
    if (HasAddressingMode(instr)) {                              \
      size_t index = 0;                                          \
      Operand left = i.MemoryOperand(&index);                    \
      if (HasImmediateInput(instr, index)) {                     \
        __ asm_instr(left, i.InputImmediate(index));             \
      } else {                                                   \
        __ asm_instr(left, i.InputRegister(index));              \
      }                                                          \
    } else {                                                     \
      if (HasImmediateInput(instr, 1)) {                         \
        if (HasRegisterInput(instr, 0)) {                        \
          __ asm_instr(i.InputRegister(0), i.InputImmediate(1)); \
        } else {                                                 \
          __ asm_instr(i.InputOperand(0), i.InputImmediate(1));  \
        }                                                        \
      } else {                                                   \
        if (HasRegisterInput(instr, 1)) {                        \
          __ asm_instr(i.InputRegister(0), i.InputRegister(1));  \
        } else {                                                 \
          __ asm_instr(i.InputRegister(0), i.InputOperand(1));   \
        }                                                        \
      }                                                          \
    }                                                            \
  } while (0)
369 
// Calls the C implementation of the ieee754 binary op |name|, passing both
// doubles on the stack and transferring the x87 st(0) result back into the
// output XMM register through a temporary stack slot.
#define ASSEMBLE_IEEE754_BINOP(name)                                     \
  do {                                                                   \
    /* Pass two doubles as arguments on the stack. */                    \
    __ PrepareCallCFunction(4, eax);                                     \
    __ movsd(Operand(esp, 0 * kDoubleSize), i.InputDoubleRegister(0));   \
    __ movsd(Operand(esp, 1 * kDoubleSize), i.InputDoubleRegister(1));   \
    __ CallCFunction(ExternalReference::ieee754_##name##_function(), 4); \
    /* Return value is in st(0) on ia32. */                              \
    /* Store it into the result register. */                             \
    __ AllocateStackSpace(kDoubleSize);                                  \
    __ fstp_d(Operand(esp, 0));                                          \
    __ movsd(i.OutputDoubleRegister(), Operand(esp, 0));                 \
    __ add(esp, Immediate(kDoubleSize));                                 \
  } while (false)
384 
// Calls the C implementation of the ieee754 unary op |name|; see
// ASSEMBLE_IEEE754_BINOP for the calling convention details.
#define ASSEMBLE_IEEE754_UNOP(name)                                      \
  do {                                                                   \
    /* Pass one double as argument on the stack. */                      \
    __ PrepareCallCFunction(2, eax);                                     \
    __ movsd(Operand(esp, 0 * kDoubleSize), i.InputDoubleRegister(0));   \
    __ CallCFunction(ExternalReference::ieee754_##name##_function(), 2); \
    /* Return value is in st(0) on ia32. */                              \
    /* Store it into the result register. */                             \
    __ AllocateStackSpace(kDoubleSize);                                  \
    __ fstp_d(Operand(esp, 0));                                          \
    __ movsd(i.OutputDoubleRegister(), Operand(esp, 0));                 \
    __ add(esp, Immediate(kDoubleSize));                                 \
  } while (false)
398 
// Emits a two-operand binary op whose right operand may be memory,
// immediate, or register/slot.
#define ASSEMBLE_BINOP(asm_instr)                             \
  do {                                                        \
    if (HasAddressingMode(instr)) {                           \
      size_t index = 1;                                       \
      Operand right = i.MemoryOperand(&index);                \
      __ asm_instr(i.InputRegister(0), right);                \
    } else {                                                  \
      if (HasImmediateInput(instr, 1)) {                      \
        __ asm_instr(i.InputOperand(0), i.InputImmediate(1)); \
      } else {                                                \
        __ asm_instr(i.InputRegister(0), i.InputOperand(1));  \
      }                                                       \
    }                                                         \
  } while (0)
413 
// Emits a compare-exchange loop implementing a 32-bit (or narrower)
// atomic read-modify-write: load the old value into eax, apply |bin_inst|
// on a temp, then publish with lock cmpxchg, retrying if another writer
// intervened.
#define ASSEMBLE_ATOMIC_BINOP(bin_inst, mov_inst, cmpxchg_inst) \
  do {                                                          \
    Label binop;                                                \
    __ bind(&binop);                                            \
    __ mov_inst(eax, i.MemoryOperand(1));                       \
    __ Move(i.TempRegister(0), eax);                            \
    __ bin_inst(i.TempRegister(0), i.InputRegister(0));         \
    __ lock();                                                  \
    __ cmpxchg_inst(i.MemoryOperand(1), i.TempRegister(0));     \
    __ j(not_equal, &binop);                                    \
  } while (false)
425 
// Emits a cmpxchg8b loop implementing a 64-bit atomic read-modify-write
// from two 32-bit ops: |instr1| on the low word, |instr2| (with carry/
// borrow) on the high word. ebx and InputRegister(1) hold the replacement
// pair during the loop and are saved/restored around it; the pops do not
// affect flags, so the branch still sees cmpxchg8b's result.
// NOTE(review): unlike its sibling macros this one expands with its own
// trailing ';' — call sites may depend on that, confirm before
// normalizing.
#define ASSEMBLE_I64ATOMIC_BINOP(instr1, instr2)                \
  do {                                                          \
    Label binop;                                                \
    __ bind(&binop);                                            \
    __ mov(eax, i.MemoryOperand(2));                            \
    __ mov(edx, i.NextMemoryOperand(2));                        \
    __ push(ebx);                                               \
    frame_access_state()->IncreaseSPDelta(1);                   \
    i.MoveInstructionOperandToRegister(ebx, instr->InputAt(0)); \
    __ push(i.InputRegister(1));                                \
    __ instr1(ebx, eax);                                        \
    __ instr2(i.InputRegister(1), edx);                         \
    __ lock();                                                  \
    __ cmpxchg8b(i.MemoryOperand(2));                           \
    __ pop(i.InputRegister(1));                                 \
    __ pop(ebx);                                                \
    frame_access_state()->IncreaseSPDelta(-1);                  \
    __ j(not_equal, &binop);                                    \
  } while (false);
445 
// Emits an extending move (movsx/movzx family) from memory, register, or
// slot into the output register.
#define ASSEMBLE_MOVX(mov_instr)                            \
  do {                                                      \
    if (HasAddressingMode(instr)) {                         \
      __ mov_instr(i.OutputRegister(), i.MemoryOperand());  \
    } else if (HasRegisterInput(instr, 0)) {                \
      __ mov_instr(i.OutputRegister(), i.InputRegister(0)); \
    } else {                                                \
      __ mov_instr(i.OutputRegister(), i.InputOperand(0));  \
    }                                                       \
  } while (0)
456 
// Emits a punpck-style shuffle, using the 3-operand AVX form when
// available, otherwise the destructive SSE form (dst must equal src0).
// With a single input, the operand is unpacked against itself.
#define ASSEMBLE_SIMD_PUNPCK_SHUFFLE(opcode)                         \
  do {                                                               \
    XMMRegister src0 = i.InputSimd128Register(0);                    \
    Operand src1 = i.InputOperand(instr->InputCount() == 2 ? 1 : 0); \
    if (CpuFeatures::IsSupported(AVX)) {                             \
      CpuFeatureScope avx_scope(tasm(), AVX);                        \
      __ v##opcode(i.OutputSimd128Register(), src0, src1);           \
    } else {                                                         \
      DCHECK_EQ(i.OutputSimd128Register(), src0);                    \
      __ opcode(i.OutputSimd128Register(), src1);                    \
    }                                                                \
  } while (false)
469 
// Emits a shuffle controlled by an immediate byte, using the 3-operand
// AVX form when available, otherwise the destructive SSE form guarded by
// |SSELevel| (dst must equal input 0). Wrapped in do/while so the macro
// expands to a single statement — matching every other ASSEMBLE_* macro
// in this file and avoiding the dangling-else hazard of a bare if/else
// expansion.
#define ASSEMBLE_SIMD_IMM_SHUFFLE(opcode, SSELevel, imm)                 \
  do {                                                                   \
    if (CpuFeatures::IsSupported(AVX)) {                                 \
      CpuFeatureScope avx_scope(tasm(), AVX);                            \
      __ v##opcode(i.OutputSimd128Register(), i.InputSimd128Register(0), \
                   i.InputOperand(1), imm);                              \
    } else {                                                             \
      CpuFeatureScope sse_scope(tasm(), SSELevel);                       \
      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));   \
      __ opcode(i.OutputSimd128Register(), i.InputOperand(1), imm);      \
    }                                                                    \
  } while (false)
480 
// Sets the output register to 1 if every lane of the input is non-zero,
// else 0: compare the input against zero lane-wise with |opcode|, then
// ptest — all lanes non-zero iff the comparison result is all-zero.
#define ASSEMBLE_SIMD_ALL_TRUE(opcode)               \
  do {                                               \
    Register dst = i.OutputRegister();               \
    Operand src = i.InputOperand(0);                 \
    Register tmp = i.TempRegister(0);                \
    XMMRegister tmp_simd = i.TempSimd128Register(1); \
    __ mov(tmp, Immediate(1));                       \
    __ xor_(dst, dst);                               \
    __ Pxor(tmp_simd, tmp_simd);                     \
    __ opcode(tmp_simd, src);                        \
    __ Ptest(tmp_simd, tmp_simd);                    \
    __ cmov(zero, dst, tmp);                         \
  } while (false)
494 
// Emits a SIMD shift of dst, either by an immediate or by a register
// shift amount masked to the lane width (wasm semantics: the count is
// taken modulo the lane size).
#define ASSEMBLE_SIMD_SHIFT(opcode, width)             \
  do {                                                 \
    XMMRegister dst = i.OutputSimd128Register();       \
    DCHECK_EQ(dst, i.InputSimd128Register(0));         \
    if (HasImmediateInput(instr, 1)) {                 \
      __ opcode(dst, dst, byte{i.InputInt##width(1)}); \
    } else {                                           \
      XMMRegister tmp = i.TempSimd128Register(0);      \
      Register tmp_shift = i.TempRegister(1);          \
      constexpr int mask = (1 << width) - 1;           \
      __ mov(tmp_shift, i.InputRegister(1));           \
      __ and_(tmp_shift, Immediate(mask));             \
      __ Movd(tmp, tmp_shift);                         \
      __ opcode(dst, dst, tmp);                        \
    }                                                  \
  } while (false)
511 
AssembleDeconstructFrame()512 void CodeGenerator::AssembleDeconstructFrame() {
513   __ mov(esp, ebp);
514   __ pop(ebp);
515 }
516 
AssemblePrepareTailCall()517 void CodeGenerator::AssemblePrepareTailCall() {
518   if (frame_access_state()->has_frame()) {
519     __ mov(ebp, MemOperand(ebp, 0));
520   }
521   frame_access_state()->SetFrameAccessToSP();
522 }
523 
AssemblePopArgumentsAdaptorFrame(Register args_reg,Register,Register,Register)524 void CodeGenerator::AssemblePopArgumentsAdaptorFrame(Register args_reg,
525                                                      Register, Register,
526                                                      Register) {
527   // There are not enough temp registers left on ia32 for a call instruction
528   // so we pick some scratch registers and save/restore them manually here.
529   int scratch_count = 3;
530   Register scratch1 = esi;
531   Register scratch2 = ecx;
532   Register scratch3 = edx;
533   DCHECK(!AreAliased(args_reg, scratch1, scratch2, scratch3));
534   Label done;
535 
536   // Check if current frame is an arguments adaptor frame.
537   __ cmp(Operand(ebp, StandardFrameConstants::kContextOffset),
538          Immediate(StackFrame::TypeToMarker(StackFrame::ARGUMENTS_ADAPTOR)));
539   __ j(not_equal, &done, Label::kNear);
540 
541   __ push(scratch1);
542   __ push(scratch2);
543   __ push(scratch3);
544 
545   // Load arguments count from current arguments adaptor frame (note, it
546   // does not include receiver).
547   Register caller_args_count_reg = scratch1;
548   __ mov(caller_args_count_reg,
549          Operand(ebp, ArgumentsAdaptorFrameConstants::kLengthOffset));
550   __ SmiUntag(caller_args_count_reg);
551 
552   __ PrepareForTailCall(args_reg, caller_args_count_reg, scratch2, scratch3,
553                         scratch_count);
554   __ pop(scratch3);
555   __ pop(scratch2);
556   __ pop(scratch1);
557 
558   __ bind(&done);
559 }
560 
561 namespace {
562 
AdjustStackPointerForTailCall(TurboAssembler * tasm,FrameAccessState * state,int new_slot_above_sp,bool allow_shrinkage=true)563 void AdjustStackPointerForTailCall(TurboAssembler* tasm,
564                                    FrameAccessState* state,
565                                    int new_slot_above_sp,
566                                    bool allow_shrinkage = true) {
567   int current_sp_offset = state->GetSPToFPSlotCount() +
568                           StandardFrameConstants::kFixedSlotCountAboveFp;
569   int stack_slot_delta = new_slot_above_sp - current_sp_offset;
570   if (stack_slot_delta > 0) {
571     tasm->AllocateStackSpace(stack_slot_delta * kSystemPointerSize);
572     state->IncreaseSPDelta(stack_slot_delta);
573   } else if (allow_shrinkage && stack_slot_delta < 0) {
574     tasm->add(esp, Immediate(-stack_slot_delta * kSystemPointerSize));
575     state->IncreaseSPDelta(stack_slot_delta);
576   }
577 }
578 
579 #ifdef DEBUG
VerifyOutputOfAtomicPairInstr(IA32OperandConverter * converter,const Instruction * instr)580 bool VerifyOutputOfAtomicPairInstr(IA32OperandConverter* converter,
581                                    const Instruction* instr) {
582   if (instr->OutputCount() == 2) {
583     return (converter->OutputRegister(0) == eax &&
584             converter->OutputRegister(1) == edx);
585   }
586   if (instr->OutputCount() == 1) {
587     return (converter->OutputRegister(0) == eax &&
588             converter->TempRegister(0) == edx) ||
589            (converter->OutputRegister(0) == edx &&
590             converter->TempRegister(0) == eax);
591   }
592   DCHECK_EQ(instr->OutputCount(), 0);
593   return (converter->TempRegister(0) == eax &&
594           converter->TempRegister(1) == edx);
595 }
596 #endif
597 
598 }  // namespace
599 
AssembleTailCallBeforeGap(Instruction * instr,int first_unused_stack_slot)600 void CodeGenerator::AssembleTailCallBeforeGap(Instruction* instr,
601                                               int first_unused_stack_slot) {
602   CodeGenerator::PushTypeFlags flags(kImmediatePush | kScalarPush);
603   ZoneVector<MoveOperands*> pushes(zone());
604   GetPushCompatibleMoves(instr, flags, &pushes);
605 
606   if (!pushes.empty() &&
607       (LocationOperand::cast(pushes.back()->destination()).index() + 1 ==
608        first_unused_stack_slot)) {
609     IA32OperandConverter g(this, instr);
610     for (auto move : pushes) {
611       LocationOperand destination_location(
612           LocationOperand::cast(move->destination()));
613       InstructionOperand source(move->source());
614       AdjustStackPointerForTailCall(tasm(), frame_access_state(),
615                                     destination_location.index());
616       if (source.IsStackSlot()) {
617         LocationOperand source_location(LocationOperand::cast(source));
618         __ push(g.SlotToOperand(source_location.index()));
619       } else if (source.IsRegister()) {
620         LocationOperand source_location(LocationOperand::cast(source));
621         __ push(source_location.GetRegister());
622       } else if (source.IsImmediate()) {
623         __ Push(Immediate(ImmediateOperand::cast(source).inline_value()));
624       } else {
625         // Pushes of non-scalar data types is not supported.
626         UNIMPLEMENTED();
627       }
628       frame_access_state()->IncreaseSPDelta(1);
629       move->Eliminate();
630     }
631   }
632   AdjustStackPointerForTailCall(tasm(), frame_access_state(),
633                                 first_unused_stack_slot, false);
634 }
635 
AssembleTailCallAfterGap(Instruction * instr,int first_unused_stack_slot)636 void CodeGenerator::AssembleTailCallAfterGap(Instruction* instr,
637                                              int first_unused_stack_slot) {
638   AdjustStackPointerForTailCall(tasm(), frame_access_state(),
639                                 first_unused_stack_slot);
640 }
641 
642 // Check that {kJavaScriptCallCodeStartRegister} is correct.
AssembleCodeStartRegisterCheck()643 void CodeGenerator::AssembleCodeStartRegisterCheck() {
644   __ push(eax);  // Push eax so we can use it as a scratch register.
645   __ ComputeCodeStartAddress(eax);
646   __ cmp(eax, kJavaScriptCallCodeStartRegister);
647   __ Assert(equal, AbortReason::kWrongFunctionCodeStart);
648   __ pop(eax);  // Restore eax.
649 }
650 
651 // Check if the code object is marked for deoptimization. If it is, then it
652 // jumps to the CompileLazyDeoptimizedCode builtin. In order to do this we need
653 // to:
654 //    1. read from memory the word that contains that bit, which can be found in
655 //       the flags in the referenced {CodeDataContainer} object;
656 //    2. test kMarkedForDeoptimizationBit in those flags; and
657 //    3. if it is not zero then it jumps to the builtin.
BailoutIfDeoptimized()658 void CodeGenerator::BailoutIfDeoptimized() {
659   int offset = Code::kCodeDataContainerOffset - Code::kHeaderSize;
660   __ push(eax);  // Push eax so we can use it as a scratch register.
661   __ mov(eax, Operand(kJavaScriptCallCodeStartRegister, offset));
662   __ test(FieldOperand(eax, CodeDataContainer::kKindSpecificFlagsOffset),
663           Immediate(1 << Code::kMarkedForDeoptimizationBit));
664   __ pop(eax);  // Restore eax.
665 
666   Label skip;
667   __ j(zero, &skip, Label::kNear);
668   __ Jump(BUILTIN_CODE(isolate(), CompileLazyDeoptimizedCode),
669           RelocInfo::CODE_TARGET);
670   __ bind(&skip);
671 }
672 
void CodeGenerator::GenerateSpeculationPoisonFromCodeStartRegister() {
  // Speculation poisoning is not supported on ia32, so this hook must never
  // be invoked by the instruction selector / code generator pipeline.
  // TODO(860429): Remove remaining poisoning infrastructure on ia32.
  UNREACHABLE();
}
677 
void CodeGenerator::AssembleRegisterArgumentPoisoning() {
  // Speculation poisoning is not supported on ia32, so this hook must never
  // be invoked by the instruction selector / code generator pipeline.
  // TODO(860429): Remove remaining poisoning infrastructure on ia32.
  UNREACHABLE();
}
682 
683 // Assembles an instruction after register allocation, producing machine code.
AssembleArchInstruction(Instruction * instr)684 CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
685     Instruction* instr) {
686   IA32OperandConverter i(this, instr);
687   InstructionCode opcode = instr->opcode();
688   ArchOpcode arch_opcode = ArchOpcodeField::decode(opcode);
689   switch (arch_opcode) {
690     case kArchCallCodeObject: {
691       InstructionOperand* op = instr->InputAt(0);
692       if (op->IsImmediate()) {
693         Handle<Code> code = i.InputCode(0);
694         __ Call(code, RelocInfo::CODE_TARGET);
695       } else {
696         Register reg = i.InputRegister(0);
697         DCHECK_IMPLIES(
698             instr->HasCallDescriptorFlag(CallDescriptor::kFixedTargetRegister),
699             reg == kJavaScriptCallCodeStartRegister);
700         __ LoadCodeObjectEntry(reg, reg);
701         if (instr->HasCallDescriptorFlag(CallDescriptor::kRetpoline)) {
702           __ RetpolineCall(reg);
703         } else {
704           __ call(reg);
705         }
706       }
707       RecordCallPosition(instr);
708       frame_access_state()->ClearSPDelta();
709       break;
710     }
711     case kArchCallBuiltinPointer: {
712       DCHECK(!HasImmediateInput(instr, 0));
713       Register builtin_index = i.InputRegister(0);
714       __ CallBuiltinByIndex(builtin_index);
715       RecordCallPosition(instr);
716       frame_access_state()->ClearSPDelta();
717       break;
718     }
719     case kArchCallWasmFunction: {
720       if (HasImmediateInput(instr, 0)) {
721         Constant constant = i.ToConstant(instr->InputAt(0));
722         Address wasm_code = static_cast<Address>(constant.ToInt32());
723         if (DetermineStubCallMode() == StubCallMode::kCallWasmRuntimeStub) {
724           __ wasm_call(wasm_code, constant.rmode());
725         } else {
726           if (instr->HasCallDescriptorFlag(CallDescriptor::kRetpoline)) {
727             __ RetpolineCall(wasm_code, constant.rmode());
728           } else {
729             __ call(wasm_code, constant.rmode());
730           }
731         }
732       } else {
733         Register reg = i.InputRegister(0);
734         if (instr->HasCallDescriptorFlag(CallDescriptor::kRetpoline)) {
735           __ RetpolineCall(reg);
736         } else {
737           __ call(reg);
738         }
739       }
740       RecordCallPosition(instr);
741       frame_access_state()->ClearSPDelta();
742       break;
743     }
744     case kArchTailCallCodeObjectFromJSFunction:
745     case kArchTailCallCodeObject: {
746       if (arch_opcode == kArchTailCallCodeObjectFromJSFunction) {
747         AssemblePopArgumentsAdaptorFrame(kJavaScriptCallArgCountRegister,
748                                          no_reg, no_reg, no_reg);
749       }
750       if (HasImmediateInput(instr, 0)) {
751         Handle<Code> code = i.InputCode(0);
752         __ Jump(code, RelocInfo::CODE_TARGET);
753       } else {
754         Register reg = i.InputRegister(0);
755         DCHECK_IMPLIES(
756             instr->HasCallDescriptorFlag(CallDescriptor::kFixedTargetRegister),
757             reg == kJavaScriptCallCodeStartRegister);
758         __ LoadCodeObjectEntry(reg, reg);
759         if (instr->HasCallDescriptorFlag(CallDescriptor::kRetpoline)) {
760           __ RetpolineJump(reg);
761         } else {
762           __ jmp(reg);
763         }
764       }
765       frame_access_state()->ClearSPDelta();
766       frame_access_state()->SetFrameAccessToDefault();
767       break;
768     }
769     case kArchTailCallWasm: {
770       if (HasImmediateInput(instr, 0)) {
771         Constant constant = i.ToConstant(instr->InputAt(0));
772         Address wasm_code = static_cast<Address>(constant.ToInt32());
773         __ jmp(wasm_code, constant.rmode());
774       } else {
775         Register reg = i.InputRegister(0);
776         if (instr->HasCallDescriptorFlag(CallDescriptor::kRetpoline)) {
777           __ RetpolineJump(reg);
778         } else {
779           __ jmp(reg);
780         }
781       }
782       frame_access_state()->ClearSPDelta();
783       frame_access_state()->SetFrameAccessToDefault();
784       break;
785     }
786     case kArchTailCallAddress: {
787       CHECK(!HasImmediateInput(instr, 0));
788       Register reg = i.InputRegister(0);
789       DCHECK_IMPLIES(
790           instr->HasCallDescriptorFlag(CallDescriptor::kFixedTargetRegister),
791           reg == kJavaScriptCallCodeStartRegister);
792       if (instr->HasCallDescriptorFlag(CallDescriptor::kRetpoline)) {
793         __ RetpolineJump(reg);
794       } else {
795         __ jmp(reg);
796       }
797       frame_access_state()->ClearSPDelta();
798       frame_access_state()->SetFrameAccessToDefault();
799       break;
800     }
801     case kArchCallJSFunction: {
802       Register func = i.InputRegister(0);
803       if (FLAG_debug_code) {
804         // Check the function's context matches the context argument.
805         __ cmp(esi, FieldOperand(func, JSFunction::kContextOffset));
806         __ Assert(equal, AbortReason::kWrongFunctionContext);
807       }
808       static_assert(kJavaScriptCallCodeStartRegister == ecx, "ABI mismatch");
809       __ mov(ecx, FieldOperand(func, JSFunction::kCodeOffset));
810       __ CallCodeObject(ecx);
811       RecordCallPosition(instr);
812       frame_access_state()->ClearSPDelta();
813       break;
814     }
815     case kArchPrepareCallCFunction: {
816       // Frame alignment requires using FP-relative frame addressing.
817       frame_access_state()->SetFrameAccessToFP();
818       int const num_parameters = MiscField::decode(instr->opcode());
819       __ PrepareCallCFunction(num_parameters, i.TempRegister(0));
820       break;
821     }
822     case kArchSaveCallerRegisters: {
823       fp_mode_ =
824           static_cast<SaveFPRegsMode>(MiscField::decode(instr->opcode()));
825       DCHECK(fp_mode_ == kDontSaveFPRegs || fp_mode_ == kSaveFPRegs);
826       // kReturnRegister0 should have been saved before entering the stub.
827       int bytes = __ PushCallerSaved(fp_mode_, kReturnRegister0);
828       DCHECK(IsAligned(bytes, kSystemPointerSize));
829       DCHECK_EQ(0, frame_access_state()->sp_delta());
830       frame_access_state()->IncreaseSPDelta(bytes / kSystemPointerSize);
831       DCHECK(!caller_registers_saved_);
832       caller_registers_saved_ = true;
833       break;
834     }
835     case kArchRestoreCallerRegisters: {
836       DCHECK(fp_mode_ ==
837              static_cast<SaveFPRegsMode>(MiscField::decode(instr->opcode())));
838       DCHECK(fp_mode_ == kDontSaveFPRegs || fp_mode_ == kSaveFPRegs);
839       // Don't overwrite the returned value.
840       int bytes = __ PopCallerSaved(fp_mode_, kReturnRegister0);
841       frame_access_state()->IncreaseSPDelta(-(bytes / kSystemPointerSize));
842       DCHECK_EQ(0, frame_access_state()->sp_delta());
843       DCHECK(caller_registers_saved_);
844       caller_registers_saved_ = false;
845       break;
846     }
847     case kArchPrepareTailCall:
848       AssemblePrepareTailCall();
849       break;
850     case kArchCallCFunction: {
851       int const num_parameters = MiscField::decode(instr->opcode());
852       Label return_location;
853       if (linkage()->GetIncomingDescriptor()->IsWasmCapiFunction()) {
854         // Put the return address in a stack slot.
855         Register scratch = eax;
856         __ push(scratch);
857         __ PushPC();
858         int pc = __ pc_offset();
859         __ pop(scratch);
860         __ sub(scratch, Immediate(pc + Code::kHeaderSize - kHeapObjectTag));
861         __ add(scratch, Immediate::CodeRelativeOffset(&return_location));
862         __ mov(MemOperand(ebp, WasmExitFrameConstants::kCallingPCOffset),
863                scratch);
864         __ pop(scratch);
865       }
866       if (HasImmediateInput(instr, 0)) {
867         ExternalReference ref = i.InputExternalReference(0);
868         __ CallCFunction(ref, num_parameters);
869       } else {
870         Register func = i.InputRegister(0);
871         __ CallCFunction(func, num_parameters);
872       }
873       __ bind(&return_location);
874       if (linkage()->GetIncomingDescriptor()->IsWasmCapiFunction()) {
875         RecordSafepoint(instr->reference_map(), Safepoint::kNoLazyDeopt);
876       }
877       frame_access_state()->SetFrameAccessToDefault();
878       // Ideally, we should decrement SP delta to match the change of stack
879       // pointer in CallCFunction. However, for certain architectures (e.g.
880       // ARM), there may be more strict alignment requirement, causing old SP
881       // to be saved on the stack. In those cases, we can not calculate the SP
882       // delta statically.
883       frame_access_state()->ClearSPDelta();
884       if (caller_registers_saved_) {
885         // Need to re-sync SP delta introduced in kArchSaveCallerRegisters.
886         // Here, we assume the sequence to be:
887         //   kArchSaveCallerRegisters;
888         //   kArchCallCFunction;
889         //   kArchRestoreCallerRegisters;
890         int bytes =
891             __ RequiredStackSizeForCallerSaved(fp_mode_, kReturnRegister0);
892         frame_access_state()->IncreaseSPDelta(bytes / kSystemPointerSize);
893       }
894       break;
895     }
896     case kArchJmp:
897       AssembleArchJump(i.InputRpo(0));
898       break;
899     case kArchBinarySearchSwitch:
900       AssembleArchBinarySearchSwitch(instr);
901       break;
902     case kArchTableSwitch:
903       AssembleArchTableSwitch(instr);
904       break;
905     case kArchComment:
906       __ RecordComment(reinterpret_cast<const char*>(i.InputInt32(0)));
907       break;
908     case kArchAbortCSAAssert:
909       DCHECK(i.InputRegister(0) == edx);
910       {
911         // We don't actually want to generate a pile of code for this, so just
912         // claim there is a stack frame, without generating one.
913         FrameScope scope(tasm(), StackFrame::NONE);
914         __ Call(
915             isolate()->builtins()->builtin_handle(Builtins::kAbortCSAAssert),
916             RelocInfo::CODE_TARGET);
917       }
918       __ int3();
919       break;
920     case kArchDebugBreak:
921       __ DebugBreak();
922       break;
923     case kArchNop:
924     case kArchThrowTerminator:
925       // don't emit code for nops.
926       break;
927     case kArchDeoptimize: {
928       DeoptimizationExit* exit =
929           BuildTranslation(instr, -1, 0, OutputFrameStateCombine::Ignore());
930       __ jmp(exit->label());
931       break;
932     }
933     case kArchRet:
934       AssembleReturn(instr->InputAt(0));
935       break;
936     case kArchFramePointer:
937       __ mov(i.OutputRegister(), ebp);
938       break;
939     case kArchParentFramePointer:
940       if (frame_access_state()->has_frame()) {
941         __ mov(i.OutputRegister(), Operand(ebp, 0));
942       } else {
943         __ mov(i.OutputRegister(), ebp);
944       }
945       break;
946     case kArchStackPointerGreaterThan: {
947       // Potentially apply an offset to the current stack pointer before the
948       // comparison to consider the size difference of an optimized frame versus
949       // the contained unoptimized frames.
950       Register lhs_register = esp;
951       uint32_t offset;
952 
953       if (ShouldApplyOffsetToStackCheck(instr, &offset)) {
954         lhs_register = i.TempRegister(0);
955         __ lea(lhs_register, Operand(esp, -1 * static_cast<int32_t>(offset)));
956       }
957 
958       constexpr size_t kValueIndex = 0;
959       if (HasAddressingMode(instr)) {
960         __ cmp(lhs_register, i.MemoryOperand(kValueIndex));
961       } else {
962         __ cmp(lhs_register, i.InputRegister(kValueIndex));
963       }
964       break;
965     }
966     case kArchStackCheckOffset:
967       __ Move(i.OutputRegister(), Smi::FromInt(GetStackCheckOffset()));
968       break;
969     case kArchTruncateDoubleToI: {
970       auto result = i.OutputRegister();
971       auto input = i.InputDoubleRegister(0);
972       auto ool = zone()->New<OutOfLineTruncateDoubleToI>(
973           this, result, input, DetermineStubCallMode());
974       __ cvttsd2si(result, Operand(input));
975       __ cmp(result, 1);
976       __ j(overflow, ool->entry());
977       __ bind(ool->exit());
978       break;
979     }
980     case kArchStoreWithWriteBarrier: {
981       RecordWriteMode mode =
982           static_cast<RecordWriteMode>(MiscField::decode(instr->opcode()));
983       Register object = i.InputRegister(0);
984       size_t index = 0;
985       Operand operand = i.MemoryOperand(&index);
986       Register value = i.InputRegister(index);
987       Register scratch0 = i.TempRegister(0);
988       Register scratch1 = i.TempRegister(1);
989       auto ool = zone()->New<OutOfLineRecordWrite>(this, object, operand, value,
990                                                    scratch0, scratch1, mode,
991                                                    DetermineStubCallMode());
992       __ mov(operand, value);
993       __ CheckPageFlag(object, scratch0,
994                        MemoryChunk::kPointersFromHereAreInterestingMask,
995                        not_zero, ool->entry());
996       __ bind(ool->exit());
997       break;
998     }
999     case kArchStackSlot: {
1000       FrameOffset offset =
1001           frame_access_state()->GetFrameOffset(i.InputInt32(0));
1002       Register base = offset.from_stack_pointer() ? esp : ebp;
1003       __ lea(i.OutputRegister(), Operand(base, offset.offset()));
1004       break;
1005     }
1006     case kIeee754Float64Acos:
1007       ASSEMBLE_IEEE754_UNOP(acos);
1008       break;
1009     case kIeee754Float64Acosh:
1010       ASSEMBLE_IEEE754_UNOP(acosh);
1011       break;
1012     case kIeee754Float64Asin:
1013       ASSEMBLE_IEEE754_UNOP(asin);
1014       break;
1015     case kIeee754Float64Asinh:
1016       ASSEMBLE_IEEE754_UNOP(asinh);
1017       break;
1018     case kIeee754Float64Atan:
1019       ASSEMBLE_IEEE754_UNOP(atan);
1020       break;
1021     case kIeee754Float64Atanh:
1022       ASSEMBLE_IEEE754_UNOP(atanh);
1023       break;
1024     case kIeee754Float64Atan2:
1025       ASSEMBLE_IEEE754_BINOP(atan2);
1026       break;
1027     case kIeee754Float64Cbrt:
1028       ASSEMBLE_IEEE754_UNOP(cbrt);
1029       break;
1030     case kIeee754Float64Cos:
1031       ASSEMBLE_IEEE754_UNOP(cos);
1032       break;
1033     case kIeee754Float64Cosh:
1034       ASSEMBLE_IEEE754_UNOP(cosh);
1035       break;
1036     case kIeee754Float64Expm1:
1037       ASSEMBLE_IEEE754_UNOP(expm1);
1038       break;
1039     case kIeee754Float64Exp:
1040       ASSEMBLE_IEEE754_UNOP(exp);
1041       break;
1042     case kIeee754Float64Log:
1043       ASSEMBLE_IEEE754_UNOP(log);
1044       break;
1045     case kIeee754Float64Log1p:
1046       ASSEMBLE_IEEE754_UNOP(log1p);
1047       break;
1048     case kIeee754Float64Log2:
1049       ASSEMBLE_IEEE754_UNOP(log2);
1050       break;
1051     case kIeee754Float64Log10:
1052       ASSEMBLE_IEEE754_UNOP(log10);
1053       break;
1054     case kIeee754Float64Pow:
1055       ASSEMBLE_IEEE754_BINOP(pow);
1056       break;
1057     case kIeee754Float64Sin:
1058       ASSEMBLE_IEEE754_UNOP(sin);
1059       break;
1060     case kIeee754Float64Sinh:
1061       ASSEMBLE_IEEE754_UNOP(sinh);
1062       break;
1063     case kIeee754Float64Tan:
1064       ASSEMBLE_IEEE754_UNOP(tan);
1065       break;
1066     case kIeee754Float64Tanh:
1067       ASSEMBLE_IEEE754_UNOP(tanh);
1068       break;
1069     case kIA32Add:
1070       ASSEMBLE_BINOP(add);
1071       break;
1072     case kIA32And:
1073       ASSEMBLE_BINOP(and_);
1074       break;
1075     case kIA32Cmp:
1076       ASSEMBLE_COMPARE(cmp);
1077       break;
1078     case kIA32Cmp16:
1079       ASSEMBLE_COMPARE(cmpw);
1080       break;
1081     case kIA32Cmp8:
1082       ASSEMBLE_COMPARE(cmpb);
1083       break;
1084     case kIA32Test:
1085       ASSEMBLE_COMPARE(test);
1086       break;
1087     case kIA32Test16:
1088       ASSEMBLE_COMPARE(test_w);
1089       break;
1090     case kIA32Test8:
1091       ASSEMBLE_COMPARE(test_b);
1092       break;
1093     case kIA32Imul:
1094       if (HasImmediateInput(instr, 1)) {
1095         __ imul(i.OutputRegister(), i.InputOperand(0), i.InputInt32(1));
1096       } else {
1097         __ imul(i.OutputRegister(), i.InputOperand(1));
1098       }
1099       break;
1100     case kIA32ImulHigh:
1101       __ imul(i.InputRegister(1));
1102       break;
1103     case kIA32UmulHigh:
1104       __ mul(i.InputRegister(1));
1105       break;
1106     case kIA32Idiv:
1107       __ cdq();
1108       __ idiv(i.InputOperand(1));
1109       break;
1110     case kIA32Udiv:
1111       __ Move(edx, Immediate(0));
1112       __ div(i.InputOperand(1));
1113       break;
1114     case kIA32Not:
1115       __ not_(i.OutputOperand());
1116       break;
1117     case kIA32Neg:
1118       __ neg(i.OutputOperand());
1119       break;
1120     case kIA32Or:
1121       ASSEMBLE_BINOP(or_);
1122       break;
1123     case kIA32Xor:
1124       ASSEMBLE_BINOP(xor_);
1125       break;
1126     case kIA32Sub:
1127       ASSEMBLE_BINOP(sub);
1128       break;
1129     case kIA32Shl:
1130       if (HasImmediateInput(instr, 1)) {
1131         __ shl(i.OutputOperand(), i.InputInt5(1));
1132       } else {
1133         __ shl_cl(i.OutputOperand());
1134       }
1135       break;
1136     case kIA32Shr:
1137       if (HasImmediateInput(instr, 1)) {
1138         __ shr(i.OutputOperand(), i.InputInt5(1));
1139       } else {
1140         __ shr_cl(i.OutputOperand());
1141       }
1142       break;
1143     case kIA32Sar:
1144       if (HasImmediateInput(instr, 1)) {
1145         __ sar(i.OutputOperand(), i.InputInt5(1));
1146       } else {
1147         __ sar_cl(i.OutputOperand());
1148       }
1149       break;
1150     case kIA32AddPair: {
1151       // i.OutputRegister(0) == i.InputRegister(0) ... left low word.
1152       // i.InputRegister(1) ... left high word.
1153       // i.InputRegister(2) ... right low word.
1154       // i.InputRegister(3) ... right high word.
1155       bool use_temp = false;
1156       if ((HasRegisterInput(instr, 1) &&
1157            i.OutputRegister(0).code() == i.InputRegister(1).code()) ||
1158           i.OutputRegister(0).code() == i.InputRegister(3).code()) {
1159         // We cannot write to the output register directly, because it would
1160         // overwrite an input for adc. We have to use the temp register.
1161         use_temp = true;
1162         __ Move(i.TempRegister(0), i.InputRegister(0));
1163         __ add(i.TempRegister(0), i.InputRegister(2));
1164       } else {
1165         __ add(i.OutputRegister(0), i.InputRegister(2));
1166       }
1167       i.MoveInstructionOperandToRegister(i.OutputRegister(1),
1168                                          instr->InputAt(1));
1169       __ adc(i.OutputRegister(1), Operand(i.InputRegister(3)));
1170       if (use_temp) {
1171         __ Move(i.OutputRegister(0), i.TempRegister(0));
1172       }
1173       break;
1174     }
1175     case kIA32SubPair: {
1176       // i.OutputRegister(0) == i.InputRegister(0) ... left low word.
1177       // i.InputRegister(1) ... left high word.
1178       // i.InputRegister(2) ... right low word.
1179       // i.InputRegister(3) ... right high word.
1180       bool use_temp = false;
1181       if ((HasRegisterInput(instr, 1) &&
1182            i.OutputRegister(0).code() == i.InputRegister(1).code()) ||
1183           i.OutputRegister(0).code() == i.InputRegister(3).code()) {
1184         // We cannot write to the output register directly, because it would
1185         // overwrite an input for adc. We have to use the temp register.
1186         use_temp = true;
1187         __ Move(i.TempRegister(0), i.InputRegister(0));
1188         __ sub(i.TempRegister(0), i.InputRegister(2));
1189       } else {
1190         __ sub(i.OutputRegister(0), i.InputRegister(2));
1191       }
1192       i.MoveInstructionOperandToRegister(i.OutputRegister(1),
1193                                          instr->InputAt(1));
1194       __ sbb(i.OutputRegister(1), Operand(i.InputRegister(3)));
1195       if (use_temp) {
1196         __ Move(i.OutputRegister(0), i.TempRegister(0));
1197       }
1198       break;
1199     }
1200     case kIA32MulPair: {
1201       __ imul(i.OutputRegister(1), i.InputOperand(0));
1202       i.MoveInstructionOperandToRegister(i.TempRegister(0), instr->InputAt(1));
1203       __ imul(i.TempRegister(0), i.InputOperand(2));
1204       __ add(i.OutputRegister(1), i.TempRegister(0));
1205       __ mov(i.OutputRegister(0), i.InputOperand(0));
1206       // Multiplies the low words and stores them in eax and edx.
1207       __ mul(i.InputRegister(2));
1208       __ add(i.OutputRegister(1), i.TempRegister(0));
1209 
1210       break;
1211     }
1212     case kIA32ShlPair:
1213       if (HasImmediateInput(instr, 2)) {
1214         __ ShlPair(i.InputRegister(1), i.InputRegister(0), i.InputInt6(2));
1215       } else {
1216         // Shift has been loaded into CL by the register allocator.
1217         __ ShlPair_cl(i.InputRegister(1), i.InputRegister(0));
1218       }
1219       break;
1220     case kIA32ShrPair:
1221       if (HasImmediateInput(instr, 2)) {
1222         __ ShrPair(i.InputRegister(1), i.InputRegister(0), i.InputInt6(2));
1223       } else {
1224         // Shift has been loaded into CL by the register allocator.
1225         __ ShrPair_cl(i.InputRegister(1), i.InputRegister(0));
1226       }
1227       break;
1228     case kIA32SarPair:
1229       if (HasImmediateInput(instr, 2)) {
1230         __ SarPair(i.InputRegister(1), i.InputRegister(0), i.InputInt6(2));
1231       } else {
1232         // Shift has been loaded into CL by the register allocator.
1233         __ SarPair_cl(i.InputRegister(1), i.InputRegister(0));
1234       }
1235       break;
1236     case kIA32Rol:
1237       if (HasImmediateInput(instr, 1)) {
1238         __ rol(i.OutputOperand(), i.InputInt5(1));
1239       } else {
1240         __ rol_cl(i.OutputOperand());
1241       }
1242       break;
1243     case kIA32Ror:
1244       if (HasImmediateInput(instr, 1)) {
1245         __ ror(i.OutputOperand(), i.InputInt5(1));
1246       } else {
1247         __ ror_cl(i.OutputOperand());
1248       }
1249       break;
1250     case kIA32Lzcnt:
1251       __ Lzcnt(i.OutputRegister(), i.InputOperand(0));
1252       break;
1253     case kIA32Tzcnt:
1254       __ Tzcnt(i.OutputRegister(), i.InputOperand(0));
1255       break;
1256     case kIA32Popcnt:
1257       __ Popcnt(i.OutputRegister(), i.InputOperand(0));
1258       break;
1259     case kIA32Bswap:
1260       __ bswap(i.OutputRegister());
1261       break;
1262     case kArchWordPoisonOnSpeculation:
1263       // TODO(860429): Remove remaining poisoning infrastructure on ia32.
1264       UNREACHABLE();
1265     case kIA32MFence:
1266       __ mfence();
1267       break;
1268     case kIA32LFence:
1269       __ lfence();
1270       break;
1271     case kSSEFloat32Cmp:
1272       __ ucomiss(i.InputDoubleRegister(0), i.InputOperand(1));
1273       break;
1274     case kSSEFloat32Add:
1275       __ addss(i.InputDoubleRegister(0), i.InputOperand(1));
1276       break;
1277     case kSSEFloat32Sub:
1278       __ subss(i.InputDoubleRegister(0), i.InputOperand(1));
1279       break;
1280     case kSSEFloat32Mul:
1281       __ mulss(i.InputDoubleRegister(0), i.InputOperand(1));
1282       break;
1283     case kSSEFloat32Div:
1284       __ divss(i.InputDoubleRegister(0), i.InputOperand(1));
1285       // Don't delete this mov. It may improve performance on some CPUs,
1286       // when there is a (v)mulss depending on the result.
1287       __ movaps(i.OutputDoubleRegister(), i.OutputDoubleRegister());
1288       break;
1289     case kSSEFloat32Sqrt:
1290       __ sqrtss(i.OutputDoubleRegister(), i.InputOperand(0));
1291       break;
1292     case kSSEFloat32Abs: {
1293       // TODO(bmeurer): Use 128-bit constants.
1294       XMMRegister tmp = i.TempSimd128Register(0);
1295       __ pcmpeqd(tmp, tmp);
1296       __ psrlq(tmp, 33);
1297       __ andps(i.OutputDoubleRegister(), tmp);
1298       break;
1299     }
1300     case kSSEFloat32Neg: {
1301       // TODO(bmeurer): Use 128-bit constants.
1302       XMMRegister tmp = i.TempSimd128Register(0);
1303       __ pcmpeqd(tmp, tmp);
1304       __ psllq(tmp, 31);
1305       __ xorps(i.OutputDoubleRegister(), tmp);
1306       break;
1307     }
1308     case kSSEFloat32Round: {
1309       CpuFeatureScope sse_scope(tasm(), SSE4_1);
1310       RoundingMode const mode =
1311           static_cast<RoundingMode>(MiscField::decode(instr->opcode()));
1312       __ roundss(i.OutputDoubleRegister(), i.InputDoubleRegister(0), mode);
1313       break;
1314     }
1315     case kSSEFloat64Cmp:
1316       __ ucomisd(i.InputDoubleRegister(0), i.InputOperand(1));
1317       break;
1318     case kSSEFloat64Add:
1319       __ addsd(i.InputDoubleRegister(0), i.InputOperand(1));
1320       break;
1321     case kSSEFloat64Sub:
1322       __ subsd(i.InputDoubleRegister(0), i.InputOperand(1));
1323       break;
1324     case kSSEFloat64Mul:
1325       __ mulsd(i.InputDoubleRegister(0), i.InputOperand(1));
1326       break;
1327     case kSSEFloat64Div:
1328       __ divsd(i.InputDoubleRegister(0), i.InputOperand(1));
1329       // Don't delete this mov. It may improve performance on some CPUs,
1330       // when there is a (v)mulsd depending on the result.
1331       __ movaps(i.OutputDoubleRegister(), i.OutputDoubleRegister());
1332       break;
1333     case kSSEFloat32Max: {
1334       Label compare_swap, done_compare;
1335       if (instr->InputAt(1)->IsFPRegister()) {
1336         __ ucomiss(i.InputDoubleRegister(0), i.InputDoubleRegister(1));
1337       } else {
1338         __ ucomiss(i.InputDoubleRegister(0), i.InputOperand(1));
1339       }
1340       auto ool =
1341           zone()->New<OutOfLineLoadFloat32NaN>(this, i.OutputDoubleRegister());
1342       __ j(parity_even, ool->entry());
1343       __ j(above, &done_compare, Label::kNear);
1344       __ j(below, &compare_swap, Label::kNear);
1345       __ movmskps(i.TempRegister(0), i.InputDoubleRegister(0));
1346       __ test(i.TempRegister(0), Immediate(1));
1347       __ j(zero, &done_compare, Label::kNear);
1348       __ bind(&compare_swap);
1349       if (instr->InputAt(1)->IsFPRegister()) {
1350         __ movss(i.InputDoubleRegister(0), i.InputDoubleRegister(1));
1351       } else {
1352         __ movss(i.InputDoubleRegister(0), i.InputOperand(1));
1353       }
1354       __ bind(&done_compare);
1355       __ bind(ool->exit());
1356       break;
1357     }
1358 
1359     case kSSEFloat64Max: {
1360       Label compare_swap, done_compare;
1361       if (instr->InputAt(1)->IsFPRegister()) {
1362         __ ucomisd(i.InputDoubleRegister(0), i.InputDoubleRegister(1));
1363       } else {
1364         __ ucomisd(i.InputDoubleRegister(0), i.InputOperand(1));
1365       }
1366       auto ool =
1367           zone()->New<OutOfLineLoadFloat64NaN>(this, i.OutputDoubleRegister());
1368       __ j(parity_even, ool->entry());
1369       __ j(above, &done_compare, Label::kNear);
1370       __ j(below, &compare_swap, Label::kNear);
1371       __ movmskpd(i.TempRegister(0), i.InputDoubleRegister(0));
1372       __ test(i.TempRegister(0), Immediate(1));
1373       __ j(zero, &done_compare, Label::kNear);
1374       __ bind(&compare_swap);
1375       if (instr->InputAt(1)->IsFPRegister()) {
1376         __ movsd(i.InputDoubleRegister(0), i.InputDoubleRegister(1));
1377       } else {
1378         __ movsd(i.InputDoubleRegister(0), i.InputOperand(1));
1379       }
1380       __ bind(&done_compare);
1381       __ bind(ool->exit());
1382       break;
1383     }
1384     case kSSEFloat32Min: {
1385       Label compare_swap, done_compare;
1386       if (instr->InputAt(1)->IsFPRegister()) {
1387         __ ucomiss(i.InputDoubleRegister(0), i.InputDoubleRegister(1));
1388       } else {
1389         __ ucomiss(i.InputDoubleRegister(0), i.InputOperand(1));
1390       }
1391       auto ool =
1392           zone()->New<OutOfLineLoadFloat32NaN>(this, i.OutputDoubleRegister());
1393       __ j(parity_even, ool->entry());
1394       __ j(below, &done_compare, Label::kNear);
1395       __ j(above, &compare_swap, Label::kNear);
1396       if (instr->InputAt(1)->IsFPRegister()) {
1397         __ movmskps(i.TempRegister(0), i.InputDoubleRegister(1));
1398       } else {
1399         __ movss(kScratchDoubleReg, i.InputOperand(1));
1400         __ movmskps(i.TempRegister(0), kScratchDoubleReg);
1401       }
1402       __ test(i.TempRegister(0), Immediate(1));
1403       __ j(zero, &done_compare, Label::kNear);
1404       __ bind(&compare_swap);
1405       if (instr->InputAt(1)->IsFPRegister()) {
1406         __ movss(i.InputDoubleRegister(0), i.InputDoubleRegister(1));
1407       } else {
1408         __ movss(i.InputDoubleRegister(0), i.InputOperand(1));
1409       }
1410       __ bind(&done_compare);
1411       __ bind(ool->exit());
1412       break;
1413     }
1414     case kSSEFloat64Min: {
1415       Label compare_swap, done_compare;
1416       if (instr->InputAt(1)->IsFPRegister()) {
1417         __ ucomisd(i.InputDoubleRegister(0), i.InputDoubleRegister(1));
1418       } else {
1419         __ ucomisd(i.InputDoubleRegister(0), i.InputOperand(1));
1420       }
1421       auto ool =
1422           zone()->New<OutOfLineLoadFloat64NaN>(this, i.OutputDoubleRegister());
1423       __ j(parity_even, ool->entry());
1424       __ j(below, &done_compare, Label::kNear);
1425       __ j(above, &compare_swap, Label::kNear);
1426       if (instr->InputAt(1)->IsFPRegister()) {
1427         __ movmskpd(i.TempRegister(0), i.InputDoubleRegister(1));
1428       } else {
1429         __ movsd(kScratchDoubleReg, i.InputOperand(1));
1430         __ movmskpd(i.TempRegister(0), kScratchDoubleReg);
1431       }
1432       __ test(i.TempRegister(0), Immediate(1));
1433       __ j(zero, &done_compare, Label::kNear);
1434       __ bind(&compare_swap);
1435       if (instr->InputAt(1)->IsFPRegister()) {
1436         __ movsd(i.InputDoubleRegister(0), i.InputDoubleRegister(1));
1437       } else {
1438         __ movsd(i.InputDoubleRegister(0), i.InputOperand(1));
1439       }
1440       __ bind(&done_compare);
1441       __ bind(ool->exit());
1442       break;
1443     }
    case kSSEFloat64Mod: {
      // Computes fmod(input0, input1) on the x87 FPU: fprem produces a
      // partial remainder and must be re-run until the FPU's C2 status flag
      // is clear.
      Register tmp = i.TempRegister(1);
      __ mov(tmp, esp);  // Save esp so it can be restored after realignment.
      __ AllocateStackSpace(kDoubleSize);
      __ and_(esp, -8);  // align to 8 byte boundary.
      // Move values to st(0) and st(1).
      __ movsd(Operand(esp, 0), i.InputDoubleRegister(1));
      __ fld_d(Operand(esp, 0));
      __ movsd(Operand(esp, 0), i.InputDoubleRegister(0));
      __ fld_d(Operand(esp, 0));
      // Loop while fprem isn't done.
      Label mod_loop;
      __ bind(&mod_loop);
      // This instruction traps on all kinds of inputs, but we are assuming the
      // floating point control word is set to ignore them all.
      __ fprem();
      // fnstsw_ax clobbers eax.
      DCHECK_EQ(eax, i.TempRegister(0));
      __ fnstsw_ax();
      // sahf copies AH into the low EFLAGS byte; C2 ends up in the parity
      // flag, so parity_even means "fprem not finished yet".
      __ sahf();
      __ j(parity_even, &mod_loop);
      // Move output to stack and clean up.
      __ fstp(1);
      __ fstp_d(Operand(esp, 0));
      __ movsd(i.OutputDoubleRegister(), Operand(esp, 0));
      __ mov(esp, tmp);
      break;
    }
    case kSSEFloat64Abs: {
      // TODO(bmeurer): Use 128-bit constants.
      // Build the mask 0x7fffffffffffffff per lane (all-ones shifted right
      // by one) and AND it in to clear the sign bit.
      XMMRegister tmp = i.TempSimd128Register(0);
      __ pcmpeqd(tmp, tmp);
      __ psrlq(tmp, 1);
      __ andpd(i.OutputDoubleRegister(), tmp);
      break;
    }
    case kSSEFloat64Neg: {
      // TODO(bmeurer): Use 128-bit constants.
      // Build the mask 0x8000000000000000 per lane (only the sign bit) and
      // XOR it in to flip the sign.
      XMMRegister tmp = i.TempSimd128Register(0);
      __ pcmpeqd(tmp, tmp);
      __ psllq(tmp, 63);
      __ xorpd(i.OutputDoubleRegister(), tmp);
      break;
    }
    case kSSEFloat64Sqrt:
      __ sqrtsd(i.OutputDoubleRegister(), i.InputOperand(0));
      break;
    case kSSEFloat64Round: {
      // Rounding mode (up/down/nearest/truncate) is encoded in the opcode's
      // MiscField by the instruction selector.
      CpuFeatureScope sse_scope(tasm(), SSE4_1);
      RoundingMode const mode =
          static_cast<RoundingMode>(MiscField::decode(instr->opcode()));
      __ roundsd(i.OutputDoubleRegister(), i.InputDoubleRegister(0), mode);
      break;
    }
    // Scalar conversions between float32/float64 and (un)signed int32.
    case kSSEFloat32ToFloat64:
      __ cvtss2sd(i.OutputDoubleRegister(), i.InputOperand(0));
      break;
    case kSSEFloat64ToFloat32:
      __ cvtsd2ss(i.OutputDoubleRegister(), i.InputOperand(0));
      break;
    case kSSEFloat32ToInt32:
      __ cvttss2si(i.OutputRegister(), i.InputOperand(0));
      break;
    case kSSEFloat32ToUint32:
      // IA-32 has no direct float->uint32 instruction; the macro-assembler
      // helper emits a multi-instruction sequence using the SIMD temp.
      __ Cvttss2ui(i.OutputRegister(), i.InputOperand(0),
                   i.TempSimd128Register(0));
      break;
    case kSSEFloat64ToInt32:
      __ cvttsd2si(i.OutputRegister(), i.InputOperand(0));
      break;
    case kSSEFloat64ToUint32:
      __ Cvttsd2ui(i.OutputRegister(), i.InputOperand(0),
                   i.TempSimd128Register(0));
      break;
    case kSSEInt32ToFloat32:
      __ cvtsi2ss(i.OutputDoubleRegister(), i.InputOperand(0));
      break;
    case kSSEUint32ToFloat32:
      // uint32->float likewise needs a helper sequence with a GP temp.
      __ Cvtui2ss(i.OutputDoubleRegister(), i.InputOperand(0),
                  i.TempRegister(0));
      break;
    case kSSEInt32ToFloat64:
      __ cvtsi2sd(i.OutputDoubleRegister(), i.InputOperand(0));
      break;
    case kSSEUint32ToFloat64:
      __ Cvtui2sd(i.OutputDoubleRegister(), i.InputOperand(0),
                  i.TempRegister(0));
      break;
    // Bitwise access to the two 32-bit halves of a float64.
    case kSSEFloat64ExtractLowWord32:
      if (instr->InputAt(0)->IsFPStackSlot()) {
        __ mov(i.OutputRegister(), i.InputOperand(0));
      } else {
        __ movd(i.OutputRegister(), i.InputDoubleRegister(0));
      }
      break;
    case kSSEFloat64ExtractHighWord32:
      if (instr->InputAt(0)->IsFPStackSlot()) {
        // High word lives kDoubleSize / 2 bytes above the slot base.
        __ mov(i.OutputRegister(), i.InputOperand(0, kDoubleSize / 2));
      } else {
        __ Pextrd(i.OutputRegister(), i.InputDoubleRegister(0), 1);
      }
      break;
    case kSSEFloat64InsertLowWord32:
      __ Pinsrd(i.OutputDoubleRegister(), i.InputOperand(1), 0);
      break;
    case kSSEFloat64InsertHighWord32:
      __ Pinsrd(i.OutputDoubleRegister(), i.InputOperand(1), 1);
      break;
    case kSSEFloat64LoadLowWord32:
      __ movd(i.OutputDoubleRegister(), i.InputOperand(0));
      break;
    // AVX three-operand scalar float32/float64 arithmetic. The non-destructive
    // encoding lets the register allocator keep input 0 live past the op.
    case kAVXFloat32Add: {
      CpuFeatureScope avx_scope(tasm(), AVX);
      __ vaddss(i.OutputDoubleRegister(), i.InputDoubleRegister(0),
                i.InputOperand(1));
      break;
    }
    case kAVXFloat32Sub: {
      CpuFeatureScope avx_scope(tasm(), AVX);
      __ vsubss(i.OutputDoubleRegister(), i.InputDoubleRegister(0),
                i.InputOperand(1));
      break;
    }
    case kAVXFloat32Mul: {
      CpuFeatureScope avx_scope(tasm(), AVX);
      __ vmulss(i.OutputDoubleRegister(), i.InputDoubleRegister(0),
                i.InputOperand(1));
      break;
    }
    case kAVXFloat32Div: {
      CpuFeatureScope avx_scope(tasm(), AVX);
      __ vdivss(i.OutputDoubleRegister(), i.InputDoubleRegister(0),
                i.InputOperand(1));
      // Don't delete this mov. It may improve performance on some CPUs,
      // when there is a (v)mulss depending on the result.
      __ movaps(i.OutputDoubleRegister(), i.OutputDoubleRegister());
      break;
    }
    case kAVXFloat64Add: {
      CpuFeatureScope avx_scope(tasm(), AVX);
      __ vaddsd(i.OutputDoubleRegister(), i.InputDoubleRegister(0),
                i.InputOperand(1));
      break;
    }
    case kAVXFloat64Sub: {
      CpuFeatureScope avx_scope(tasm(), AVX);
      __ vsubsd(i.OutputDoubleRegister(), i.InputDoubleRegister(0),
                i.InputOperand(1));
      break;
    }
    case kAVXFloat64Mul: {
      CpuFeatureScope avx_scope(tasm(), AVX);
      __ vmulsd(i.OutputDoubleRegister(), i.InputDoubleRegister(0),
                i.InputOperand(1));
      break;
    }
    case kAVXFloat64Div: {
      CpuFeatureScope avx_scope(tasm(), AVX);
      __ vdivsd(i.OutputDoubleRegister(), i.InputDoubleRegister(0),
                i.InputOperand(1));
      // Don't delete this mov. It may improve performance on some CPUs,
      // when there is a (v)mulsd depending on the result.
      __ movaps(i.OutputDoubleRegister(), i.OutputDoubleRegister());
      break;
    }
    case kAVXFloat32Abs: {
      // TODO(bmeurer): Use RIP relative 128-bit constants.
      // All-ones >> 33 yields 0x7fffffff in each 32-bit lane; AND clears the
      // float32 sign bit.
      XMMRegister tmp = i.TempSimd128Register(0);
      __ pcmpeqd(tmp, tmp);
      __ psrlq(tmp, 33);
      CpuFeatureScope avx_scope(tasm(), AVX);
      __ vandps(i.OutputDoubleRegister(), tmp, i.InputOperand(0));
      break;
    }
    case kAVXFloat32Neg: {
      // TODO(bmeurer): Use RIP relative 128-bit constants.
      // Shifting all-ones left by 31 (per 64-bit lane) leaves only the
      // float32 sign bit set; XOR flips the sign.
      XMMRegister tmp = i.TempSimd128Register(0);
      __ pcmpeqd(tmp, tmp);
      __ psllq(tmp, 31);
      CpuFeatureScope avx_scope(tasm(), AVX);
      __ vxorps(i.OutputDoubleRegister(), tmp, i.InputOperand(0));
      break;
    }
    case kAVXFloat64Abs: {
      // TODO(bmeurer): Use RIP relative 128-bit constants.
      // 0x7fffffffffffffff mask clears the float64 sign bit.
      XMMRegister tmp = i.TempSimd128Register(0);
      __ pcmpeqd(tmp, tmp);
      __ psrlq(tmp, 1);
      CpuFeatureScope avx_scope(tasm(), AVX);
      __ vandpd(i.OutputDoubleRegister(), tmp, i.InputOperand(0));
      break;
    }
    case kAVXFloat64Neg: {
      // TODO(bmeurer): Use RIP relative 128-bit constants.
      // 0x8000000000000000 mask flips the float64 sign bit.
      XMMRegister tmp = i.TempSimd128Register(0);
      __ pcmpeqd(tmp, tmp);
      __ psllq(tmp, 63);
      CpuFeatureScope avx_scope(tasm(), AVX);
      __ vxorpd(i.OutputDoubleRegister(), tmp, i.InputOperand(0));
      break;
    }
    case kSSEFloat64SilenceNaN:
      // Subtracting 0.0 quiets a signaling NaN. NOTE(review): the result is
      // written into InputDoubleRegister(0) — presumably the output is
      // constrained to alias input 0; confirm against the instruction
      // selector.
      __ xorpd(kScratchDoubleReg, kScratchDoubleReg);
      __ subsd(i.InputDoubleRegister(0), kScratchDoubleReg);
      break;
    // Memory loads/stores of various widths. The kIA32Mov* opcodes act as a
    // load when the instruction has an output and as a store otherwise.
    case kIA32Movsxbl:
      ASSEMBLE_MOVX(movsx_b);
      break;
    case kIA32Movzxbl:
      ASSEMBLE_MOVX(movzx_b);
      break;
    case kIA32Movb: {
      // Byte store; the value input follows the memory-operand inputs.
      size_t index = 0;
      Operand operand = i.MemoryOperand(&index);
      if (HasImmediateInput(instr, index)) {
        __ mov_b(operand, i.InputInt8(index));
      } else {
        __ mov_b(operand, i.InputRegister(index));
      }
      break;
    }
    case kIA32Movsxwl:
      ASSEMBLE_MOVX(movsx_w);
      break;
    case kIA32Movzxwl:
      ASSEMBLE_MOVX(movzx_w);
      break;
    case kIA32Movw: {
      // 16-bit store.
      size_t index = 0;
      Operand operand = i.MemoryOperand(&index);
      if (HasImmediateInput(instr, index)) {
        __ mov_w(operand, i.InputInt16(index));
      } else {
        __ mov_w(operand, i.InputRegister(index));
      }
      break;
    }
    case kIA32Movl:
      if (instr->HasOutput()) {
        __ mov(i.OutputRegister(), i.MemoryOperand());
      } else {
        size_t index = 0;
        Operand operand = i.MemoryOperand(&index);
        if (HasImmediateInput(instr, index)) {
          __ mov(operand, i.InputImmediate(index));
        } else {
          __ mov(operand, i.InputRegister(index));
        }
      }
      break;
    case kIA32Movsd:
      if (instr->HasOutput()) {
        __ movsd(i.OutputDoubleRegister(), i.MemoryOperand());
      } else {
        size_t index = 0;
        Operand operand = i.MemoryOperand(&index);
        __ movsd(operand, i.InputDoubleRegister(index));
      }
      break;
    case kIA32Movss:
      if (instr->HasOutput()) {
        __ movss(i.OutputDoubleRegister(), i.MemoryOperand());
      } else {
        size_t index = 0;
        Operand operand = i.MemoryOperand(&index);
        __ movss(operand, i.InputDoubleRegister(index));
      }
      break;
    case kIA32Movdqu:
      if (instr->HasOutput()) {
        __ Movdqu(i.OutputSimd128Register(), i.MemoryOperand());
      } else {
        size_t index = 0;
        Operand operand = i.MemoryOperand(&index);
        __ Movdqu(operand, i.InputSimd128Register(index));
      }
      break;
    // Reinterpret the bits of a float32 as int32 and vice versa (no
    // conversion of the value).
    case kIA32BitcastFI:
      if (instr->InputAt(0)->IsFPStackSlot()) {
        __ mov(i.OutputRegister(), i.InputOperand(0));
      } else {
        __ movd(i.OutputRegister(), i.InputDoubleRegister(0));
      }
      break;
    case kIA32BitcastIF:
      if (HasRegisterInput(instr, 0)) {
        __ movd(i.OutputDoubleRegister(), i.InputRegister(0));
      } else {
        __ movss(i.OutputDoubleRegister(), i.InputOperand(0));
      }
      break;
    case kIA32Lea: {
      AddressingMode mode = AddressingModeField::decode(instr->opcode());
      // Shorten "leal" to "addl", "subl" or "shll" if the register allocation
      // and addressing mode just happens to work out. The "addl"/"subl" forms
      // in these cases are faster based on measurements.
      if (mode == kMode_MI) {
        // Pure immediate: just materialize the constant.
        __ Move(i.OutputRegister(), Immediate(i.InputInt32(0)));
      } else if (i.InputRegister(0) == i.OutputRegister()) {
        if (mode == kMode_MRI) {
          int32_t constant_summand = i.InputInt32(1);
          if (constant_summand > 0) {
            __ add(i.OutputRegister(), Immediate(constant_summand));
          } else if (constant_summand < 0) {
            // Negate with explicit wraparound to avoid UB on INT32_MIN.
            __ sub(i.OutputRegister(),
                   Immediate(base::NegateWithWraparound(constant_summand)));
          }
        } else if (mode == kMode_MR1) {
          if (i.InputRegister(1) == i.OutputRegister()) {
            // dst = dst + dst is the same as dst << 1.
            __ shl(i.OutputRegister(), 1);
          } else {
            __ add(i.OutputRegister(), i.InputRegister(1));
          }
        } else if (mode == kMode_M2) {
          __ shl(i.OutputRegister(), 1);
        } else if (mode == kMode_M4) {
          __ shl(i.OutputRegister(), 2);
        } else if (mode == kMode_M8) {
          __ shl(i.OutputRegister(), 3);
        } else {
          __ lea(i.OutputRegister(), i.MemoryOperand());
        }
      } else if (mode == kMode_MR1 &&
                 i.InputRegister(1) == i.OutputRegister()) {
        __ add(i.OutputRegister(), i.InputRegister(0));
      } else {
        __ lea(i.OutputRegister(), i.MemoryOperand());
      }
      break;
    }
    // FP pushes: allocate the slot first, then store. Non-register sources
    // are staged through the scratch XMM register.
    case kIA32PushFloat32:
      if (instr->InputAt(0)->IsFPRegister()) {
        __ AllocateStackSpace(kFloatSize);
        __ movss(Operand(esp, 0), i.InputDoubleRegister(0));
        frame_access_state()->IncreaseSPDelta(kFloatSize / kSystemPointerSize);
      } else if (HasImmediateInput(instr, 0)) {
        __ Move(kScratchDoubleReg, i.InputFloat32(0));
        __ AllocateStackSpace(kFloatSize);
        __ movss(Operand(esp, 0), kScratchDoubleReg);
        frame_access_state()->IncreaseSPDelta(kFloatSize / kSystemPointerSize);
      } else {
        // Load from the stack slot before moving esp, then store.
        __ movss(kScratchDoubleReg, i.InputOperand(0));
        __ AllocateStackSpace(kFloatSize);
        __ movss(Operand(esp, 0), kScratchDoubleReg);
        frame_access_state()->IncreaseSPDelta(kFloatSize / kSystemPointerSize);
      }
      break;
    case kIA32PushFloat64:
      if (instr->InputAt(0)->IsFPRegister()) {
        __ AllocateStackSpace(kDoubleSize);
        __ movsd(Operand(esp, 0), i.InputDoubleRegister(0));
        frame_access_state()->IncreaseSPDelta(kDoubleSize / kSystemPointerSize);
      } else if (HasImmediateInput(instr, 0)) {
        __ Move(kScratchDoubleReg, i.InputDouble(0));
        __ AllocateStackSpace(kDoubleSize);
        __ movsd(Operand(esp, 0), kScratchDoubleReg);
        frame_access_state()->IncreaseSPDelta(kDoubleSize / kSystemPointerSize);
      } else {
        __ movsd(kScratchDoubleReg, i.InputOperand(0));
        __ AllocateStackSpace(kDoubleSize);
        __ movsd(Operand(esp, 0), kScratchDoubleReg);
        frame_access_state()->IncreaseSPDelta(kDoubleSize / kSystemPointerSize);
      }
      break;
    case kIA32PushSimd128:
      if (instr->InputAt(0)->IsFPRegister()) {
        __ AllocateStackSpace(kSimd128Size);
        __ movups(Operand(esp, 0), i.InputSimd128Register(0));
      } else {
        __ movups(kScratchDoubleReg, i.InputOperand(0));
        __ AllocateStackSpace(kSimd128Size);
        __ movups(Operand(esp, 0), kScratchDoubleReg);
      }
      frame_access_state()->IncreaseSPDelta(kSimd128Size / kSystemPointerSize);
      break;
    case kIA32Push:
      // Generic push of one stack argument; the SP delta is tracked so the
      // frame access state stays consistent with the emitted code.
      if (HasAddressingMode(instr)) {
        size_t index = 0;
        Operand operand = i.MemoryOperand(&index);
        __ push(operand);
        frame_access_state()->IncreaseSPDelta(kFloatSize / kSystemPointerSize);
      } else if (instr->InputAt(0)->IsFPRegister()) {
        // NOTE(review): allocates kFloatSize (4 bytes) but movsd stores 8
        // bytes — this relies on the wider store being harmless here; confirm
        // against the instruction selector's use of kIA32Push for FP values.
        __ AllocateStackSpace(kFloatSize);
        __ movsd(Operand(esp, 0), i.InputDoubleRegister(0));
        frame_access_state()->IncreaseSPDelta(kFloatSize / kSystemPointerSize);
      } else if (HasImmediateInput(instr, 0)) {
        __ push(i.InputImmediate(0));
        frame_access_state()->IncreaseSPDelta(1);
      } else {
        __ push(i.InputOperand(0));
        frame_access_state()->IncreaseSPDelta(1);
      }
      break;
    case kIA32Poke: {
      // Store a value into an already-reserved outgoing-argument slot.
      int slot = MiscField::decode(instr->opcode());
      if (HasImmediateInput(instr, 0)) {
        __ mov(Operand(esp, slot * kSystemPointerSize), i.InputImmediate(0));
      } else {
        __ mov(Operand(esp, slot * kSystemPointerSize), i.InputRegister(0));
      }
      break;
    }
    case kIA32Peek: {
      // Read an incoming stack parameter, addressed frame-relative (ebp) by
      // its reverse slot index.
      int reverse_slot = i.InputInt32(0);
      int offset =
          FrameSlotToFPOffset(frame()->GetTotalFrameSlotCount() - reverse_slot);
      if (instr->OutputAt(0)->IsFPRegister()) {
        LocationOperand* op = LocationOperand::cast(instr->OutputAt(0));
        if (op->representation() == MachineRepresentation::kFloat64) {
          __ movsd(i.OutputDoubleRegister(), Operand(ebp, offset));
        } else if (op->representation() == MachineRepresentation::kFloat32) {
          __ movss(i.OutputFloatRegister(), Operand(ebp, offset));
        } else {
          DCHECK_EQ(MachineRepresentation::kSimd128, op->representation());
          __ movdqu(i.OutputSimd128Register(), Operand(ebp, offset));
        }
      } else {
        __ mov(i.OutputRegister(), Operand(ebp, offset));
      }
      break;
    }
    case kSSEF64x2Splat: {
      // Duplicate the low double into both lanes (SSE form is destructive,
      // hence the same-register DCHECK).
      DCHECK_EQ(i.OutputDoubleRegister(), i.InputDoubleRegister(0));
      XMMRegister dst = i.OutputSimd128Register();
      __ shufpd(dst, dst, 0x0);
      break;
    }
    case kAVXF64x2Splat: {
      CpuFeatureScope avx_scope(tasm(), AVX);
      XMMRegister src = i.InputDoubleRegister(0);
      __ vshufpd(i.OutputSimd128Register(), src, src, 0x0);
      break;
    }
    case kSSEF64x2ExtractLane: {
      DCHECK_EQ(i.OutputDoubleRegister(), i.InputDoubleRegister(0));
      XMMRegister dst = i.OutputDoubleRegister();
      int8_t lane = i.InputInt8(1);
      if (lane != 0) {
        // NOTE(review): bound of 4 looks inherited from the F32x4 case;
        // F64x2 only has 2 lanes — confirm whether this should be 2.
        DCHECK_LT(lane, 4);
        __ shufpd(dst, dst, lane);
      }
      break;
    }
    case kAVXF64x2ExtractLane: {
      CpuFeatureScope avx_scope(tasm(), AVX);
      XMMRegister dst = i.OutputDoubleRegister();
      XMMRegister src = i.InputSimd128Register(0);
      int8_t lane = i.InputInt8(1);
      if (lane == 0) {
        if (dst != src) __ vmovapd(dst, src);
      } else {
        // NOTE(review): same lane-bound question as the SSE case above.
        DCHECK_LT(lane, 4);
        __ vshufpd(dst, src, src, lane);
      }
      break;
    }
    case kSSEF64x2ReplaceLane: {
      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
      CpuFeatureScope sse_scope(tasm(), SSE4_1);
      XMMRegister dst = i.OutputSimd128Register();
      int8_t lane = i.InputInt8(1);
      DoubleRegister rep = i.InputDoubleRegister(2);

      // insertps takes a mask which contains (high to low):
      // - 2 bit specifying source float element to copy
      // - 2 bit specifying destination float element to write to
      // - 4 bits specifying which elements of the destination to zero
      // A double is moved as its two 32-bit halves, so each lane needs two
      // insertps instructions.
      DCHECK_LT(lane, 2);
      if (lane == 0) {
        __ insertps(dst, rep, 0b00000000);
        __ insertps(dst, rep, 0b01010000);
      } else {
        __ insertps(dst, rep, 0b00100000);
        __ insertps(dst, rep, 0b01110000);
      }
      break;
    }
    case kAVXF64x2ReplaceLane: {
      CpuFeatureScope avx_scope(tasm(), AVX);
      XMMRegister dst = i.OutputSimd128Register();
      XMMRegister src = i.InputSimd128Register(0);
      int8_t lane = i.InputInt8(1);
      DoubleRegister rep = i.InputDoubleRegister(2);
      // The second insertps reads dst, so rep must not alias it.
      DCHECK_NE(dst, rep);

      DCHECK_LT(lane, 2);
      if (lane == 0) {
        __ vinsertps(dst, src, rep, 0b00000000);
        __ vinsertps(dst, dst, rep, 0b01010000);
      } else {
        __ vinsertps(dst, src, rep, 0b00100000);
        __ vinsertps(dst, dst, rep, 0b01110000);
      }
      break;
    }
    case kIA32F64x2Sqrt: {
      __ Sqrtpd(i.OutputSimd128Register(), i.InputOperand(0));
      break;
    }
    case kIA32F64x2Add: {
      __ Addpd(i.OutputDoubleRegister(), i.InputDoubleRegister(0),
               i.InputOperand(1));
      break;
    }
    case kIA32F64x2Sub: {
      __ Subpd(i.OutputDoubleRegister(), i.InputDoubleRegister(0),
               i.InputOperand(1));
      break;
    }
    case kIA32F64x2Mul: {
      __ Mulpd(i.OutputDoubleRegister(), i.InputDoubleRegister(0),
               i.InputOperand(1));
      break;
    }
    case kIA32F64x2Div: {
      __ Divpd(i.OutputDoubleRegister(), i.InputDoubleRegister(0),
               i.InputOperand(1));
      break;
    }
    case kIA32F64x2Min: {
      Operand src1 = i.InputOperand(1);
      XMMRegister dst = i.OutputSimd128Register(),
                  src = i.InputSimd128Register(0),
                  tmp = i.TempSimd128Register(0);
      // The minpd instruction doesn't propagate NaNs and +0's in its first
      // operand. Perform minpd in both orders, merge the results, and adjust.
      __ Movupd(tmp, src1);
      __ Minpd(tmp, tmp, src);
      __ Minpd(dst, src, src1);
      // propagate -0's and NaNs, which may be non-canonical.
      __ Orpd(tmp, dst);
      // Canonicalize NaNs by quieting and clearing the payload.
      __ Cmpunordpd(dst, dst, tmp);
      __ Orpd(tmp, dst);
      // For NaN lanes dst is all-ones; shifting right by 13 leaves a mask of
      // the low 51 payload bits (13 = sign + 11 exponent + quiet bit), which
      // andnpd then clears, producing a canonical quiet NaN.
      __ Psrlq(dst, 13);
      __ Andnpd(dst, tmp);
      break;
    }
    case kIA32F64x2Max: {
      Operand src1 = i.InputOperand(1);
      XMMRegister dst = i.OutputSimd128Register(),
                  src = i.InputSimd128Register(0),
                  tmp = i.TempSimd128Register(0);
      // The maxpd instruction doesn't propagate NaNs and +0's in its first
      // operand. Perform maxpd in both orders, merge the results, and adjust.
      __ Movupd(tmp, src1);
      __ Maxpd(tmp, tmp, src);
      __ Maxpd(dst, src, src1);
      // Find discrepancies.
      __ Xorpd(dst, tmp);
      // Propagate NaNs, which may be non-canonical.
      __ Orpd(tmp, dst);
      // Propagate sign discrepancy and (subtle) quiet NaNs.
      __ Subpd(tmp, tmp, dst);
      // Canonicalize NaNs by clearing the payload. Sign is non-deterministic.
      __ Cmpunordpd(dst, dst, tmp);
      // See the matching comment in kIA32F64x2Min for the shift-by-13 mask.
      __ Psrlq(dst, 13);
      __ Andnpd(dst, tmp);
      break;
    }
    // Packed float64 comparisons; each lane is all-ones on true, zero on
    // false.
    case kIA32F64x2Eq: {
      __ Cmpeqpd(i.OutputSimd128Register(), i.InputSimd128Register(0),
                 i.InputOperand(1));
      break;
    }
    case kIA32F64x2Ne: {
      __ Cmpneqpd(i.OutputSimd128Register(), i.InputSimd128Register(0),
                  i.InputOperand(1));
      break;
    }
    case kIA32F64x2Lt: {
      __ Cmpltpd(i.OutputSimd128Register(), i.InputSimd128Register(0),
                 i.InputOperand(1));
      break;
    }
    case kIA32F64x2Le: {
      __ Cmplepd(i.OutputSimd128Register(), i.InputSimd128Register(0),
                 i.InputOperand(1));
      break;
    }
    // Pseudo-min/max: raw minpd/maxpd semantics (asymmetric NaN/±0
    // handling), unlike the full Min/Max above.
    case kIA32F64x2Pmin: {
      XMMRegister dst = i.OutputSimd128Register();
      DCHECK_EQ(dst, i.InputSimd128Register(0));
      __ Minpd(dst, dst, i.InputSimd128Register(1));
      break;
    }
    case kIA32F64x2Pmax: {
      XMMRegister dst = i.OutputSimd128Register();
      DCHECK_EQ(dst, i.InputSimd128Register(0));
      __ Maxpd(dst, dst, i.InputSimd128Register(1));
      break;
    }
    case kIA32F64x2Round: {
      RoundingMode const mode =
          static_cast<RoundingMode>(MiscField::decode(instr->opcode()));
      __ Roundpd(i.OutputSimd128Register(), i.InputDoubleRegister(0), mode);
      break;
    }
    case kIA32I64x2SplatI32Pair: {
      // Build a 64-bit value from two 32-bit inputs in the low qword, then
      // duplicate it into the high qword (pshufd 0x44 = [lo, hi, lo, hi]).
      XMMRegister dst = i.OutputSimd128Register();
      __ Pinsrd(dst, i.InputRegister(0), 0);
      __ Pinsrd(dst, i.InputOperand(1), 1);
      __ Pshufd(dst, dst, 0x44);
      break;
    }
    case kIA32I64x2ReplaceLaneI32Pair: {
      // Replace one 64-bit lane as two 32-bit halves.
      int8_t lane = i.InputInt8(1);
      __ Pinsrd(i.OutputSimd128Register(), i.InputOperand(2), lane * 2);
      __ Pinsrd(i.OutputSimd128Register(), i.InputOperand(3), lane * 2 + 1);
      break;
    }
    case kIA32I64x2Neg: {
      // Negate as 0 - src.
      XMMRegister dst = i.OutputSimd128Register();
      Operand src = i.InputOperand(0);
      __ Pxor(dst, dst);
      __ Psubq(dst, src);
      break;
    }
    case kIA32I64x2Shl: {
      ASSEMBLE_SIMD_SHIFT(Psllq, 6);
      break;
    }
    case kIA32I64x2ShrS: {
      // There is no psraq; emulate a 64-bit arithmetic right shift as
      // ((x >>> s) ^ m) - m where m = 0x8000000000000000 >>> s, which
      // sign-extends the shifted value.
      XMMRegister dst = i.OutputSimd128Register();
      XMMRegister src = i.InputSimd128Register(0);
      XMMRegister tmp = i.TempSimd128Register(0);
      XMMRegister tmp2 = i.TempSimd128Register(1);
      Operand shift = i.InputOperand(1);

      // Take shift value modulo 64.
      // NOTE(review): this and_ mutates the shift input operand in place —
      // presumably the register allocator gives it a fresh slot/register;
      // confirm against the instruction selector.
      __ and_(shift, Immediate(63));
      __ Movd(tmp, shift);

      // Set up a mask [0x80000000,0,0x80000000,0].
      __ Pcmpeqb(tmp2, tmp2);
      __ Psllq(tmp2, tmp2, 63);

      __ Psrlq(tmp2, tmp2, tmp);
      __ Psrlq(dst, src, tmp);
      __ Pxor(dst, tmp2);
      __ Psubq(dst, tmp2);
      break;
    }
    case kIA32I64x2Add: {
      __ Paddq(i.OutputSimd128Register(), i.InputSimd128Register(0),
               i.InputOperand(1));
      break;
    }
    case kIA32I64x2Sub: {
      __ Psubq(i.OutputSimd128Register(), i.InputSimd128Register(0),
               i.InputOperand(1));
      break;
    }
    case kIA32I64x2Mul: {
      // 64x64->64 multiply from 32x32->64 pieces (no pmullq before AVX-512):
      // lo(a)*lo(b) + ((hi(a)*lo(b) + lo(a)*hi(b)) << 32).
      XMMRegister dst = i.OutputSimd128Register();
      XMMRegister left = i.InputSimd128Register(0);
      XMMRegister right = i.InputSimd128Register(1);
      XMMRegister tmp1 = i.TempSimd128Register(0);
      XMMRegister tmp2 = i.TempSimd128Register(1);

      __ Movaps(tmp1, left);
      __ Movaps(tmp2, right);

      // Multiply high dword of each qword of left with right.
      __ Psrlq(tmp1, 32);
      __ Pmuludq(tmp1, tmp1, right);

      // Multiply high dword of each qword of right with left.
      __ Psrlq(tmp2, 32);
      __ Pmuludq(tmp2, tmp2, left);

      __ Paddq(tmp2, tmp2, tmp1);
      __ Psllq(tmp2, tmp2, 32);

      __ Pmuludq(dst, left, right);
      __ Paddq(dst, dst, tmp2);
      break;
    }
    case kIA32I64x2ShrU: {
      ASSEMBLE_SIMD_SHIFT(Psrlq, 6);
      break;
    }
    case kSSEF32x4Splat: {
      // Duplicate the low float into all four lanes.
      DCHECK_EQ(i.OutputDoubleRegister(), i.InputDoubleRegister(0));
      XMMRegister dst = i.OutputSimd128Register();
      __ shufps(dst, dst, 0x0);
      break;
    }
    case kAVXF32x4Splat: {
      CpuFeatureScope avx_scope(tasm(), AVX);
      XMMRegister src = i.InputFloatRegister(0);
      __ vshufps(i.OutputSimd128Register(), src, src, 0x0);
      break;
    }
    case kSSEF32x4ExtractLane: {
      // Move the requested lane into lane 0 of dst; lane 0 needs no shuffle.
      DCHECK_EQ(i.OutputDoubleRegister(), i.InputDoubleRegister(0));
      XMMRegister dst = i.OutputFloatRegister();
      int8_t lane = i.InputInt8(1);
      if (lane != 0) {
        DCHECK_LT(lane, 4);
        __ shufps(dst, dst, lane);
      }
      break;
    }
    case kAVXF32x4ExtractLane: {
      CpuFeatureScope avx_scope(tasm(), AVX);
      XMMRegister dst = i.OutputFloatRegister();
      XMMRegister src = i.InputSimd128Register(0);
      int8_t lane = i.InputInt8(1);
      if (lane == 0) {
        if (dst != src) __ vmovaps(dst, src);
      } else {
        DCHECK_LT(lane, 4);
        __ vshufps(dst, src, src, lane);
      }
      break;
    }
    case kSSEF32x4ReplaceLane: {
      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
      CpuFeatureScope sse_scope(tasm(), SSE4_1);
      // insertps destination-lane selector sits in imm8 bits [5:4].
      __ insertps(i.OutputSimd128Register(), i.InputOperand(2),
                  i.InputInt8(1) << 4);
      break;
    }
    case kAVXF32x4ReplaceLane: {
      CpuFeatureScope avx_scope(tasm(), AVX);
      __ vinsertps(i.OutputSimd128Register(), i.InputSimd128Register(0),
                   i.InputOperand(2), i.InputInt8(1) << 4);
      break;
    }
    case kIA32F32x4SConvertI32x4: {
      __ Cvtdq2ps(i.OutputSimd128Register(), i.InputOperand(0));
      break;
    }
    case kSSEF32x4UConvertI32x4: {
      // cvtdq2ps is signed-only, so split each uint32 into an exactly
      // representable low half and a halved high half, convert both, and
      // recombine.
      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
      CpuFeatureScope sse_scope(tasm(), SSE4_1);
      XMMRegister dst = i.OutputSimd128Register();
      __ pxor(kScratchDoubleReg, kScratchDoubleReg);      // zeros
      __ pblendw(kScratchDoubleReg, dst, 0x55);           // get lo 16 bits
      __ psubd(dst, kScratchDoubleReg);                   // get hi 16 bits
      __ cvtdq2ps(kScratchDoubleReg, kScratchDoubleReg);  // convert lo exactly
      __ psrld(dst, 1);                  // divide by 2 to get in unsigned range
      __ cvtdq2ps(dst, dst);             // convert hi exactly
      __ addps(dst, dst);                // double hi, exactly
      __ addps(dst, kScratchDoubleReg);  // add hi and lo, may round.
      break;
    }
    case kAVXF32x4UConvertI32x4: {
      CpuFeatureScope avx_scope(tasm(), AVX);
      XMMRegister dst = i.OutputSimd128Register();
      XMMRegister src = i.InputSimd128Register(0);
      __ vpxor(kScratchDoubleReg, kScratchDoubleReg,
               kScratchDoubleReg);  // zeros
      __ vpblendw(kScratchDoubleReg, kScratchDoubleReg, src,
                  0x55);                                   // get lo 16 bits
      __ vpsubd(dst, src, kScratchDoubleReg);              // get hi 16 bits
      __ vcvtdq2ps(kScratchDoubleReg, kScratchDoubleReg);  // convert lo exactly
      __ vpsrld(dst, dst, 1);    // divide by 2 to get in unsigned range
      __ vcvtdq2ps(dst, dst);    // convert hi exactly
      __ vaddps(dst, dst, dst);  // double hi, exactly
      __ vaddps(dst, dst, kScratchDoubleReg);  // add hi and lo, may round.
      break;
    }
    case kSSEF32x4Abs: {
      // AND with 0x7fffffff per lane to clear sign bits.
      XMMRegister dst = i.OutputSimd128Register();
      DCHECK_EQ(i.InputSimd128Register(0), dst);
      __ pcmpeqd(kScratchDoubleReg, kScratchDoubleReg);
      __ psrld(kScratchDoubleReg, 1);
      __ andps(dst, kScratchDoubleReg);
      break;
    }
    case kAVXF32x4Abs: {
      CpuFeatureScope avx_scope(tasm(), AVX);
      __ vpcmpeqd(kScratchDoubleReg, kScratchDoubleReg, kScratchDoubleReg);
      __ vpsrld(kScratchDoubleReg, kScratchDoubleReg, 1);
      __ vandps(i.OutputSimd128Register(), kScratchDoubleReg,
                i.InputOperand(0));
      break;
    }
    case kSSEF32x4Neg: {
      // XOR with 0x80000000 per lane to flip sign bits.
      XMMRegister dst = i.OutputSimd128Register();
      DCHECK_EQ(dst, i.InputSimd128Register(0));
      __ pcmpeqd(kScratchDoubleReg, kScratchDoubleReg);
      __ pslld(kScratchDoubleReg, 31);
      __ xorps(dst, kScratchDoubleReg);
      break;
    }
    case kAVXF32x4Neg: {
      CpuFeatureScope avx_scope(tasm(), AVX);
      __ vpcmpeqd(kScratchDoubleReg, kScratchDoubleReg, kScratchDoubleReg);
      __ vpslld(kScratchDoubleReg, kScratchDoubleReg, 31);
      __ vxorps(i.OutputSimd128Register(), kScratchDoubleReg,
                i.InputOperand(0));
      break;
    }
    case kSSEF32x4Sqrt: {
      __ sqrtps(i.OutputSimd128Register(), i.InputSimd128Register(0));
      break;
    }
    case kAVXF32x4Sqrt: {
      CpuFeatureScope avx_scope(tasm(), AVX);
      __ vsqrtps(i.OutputSimd128Register(), i.InputOperand(0));
      break;
    }
    // Hardware approximations (rcpps/rsqrtps); results are not exact.
    case kIA32F32x4RecipApprox: {
      __ Rcpps(i.OutputSimd128Register(), i.InputOperand(0));
      break;
    }
    case kIA32F32x4RecipSqrtApprox: {
      __ Rsqrtps(i.OutputSimd128Register(), i.InputOperand(0));
      break;
    }
    // F32x4 binary arithmetic. Each op comes in a destructive SSE form
    // (dst must alias input 0, enforced by the DCHECKs) and a three-operand
    // AVX form.
    case kSSEF32x4Add: {
      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
      __ addps(i.OutputSimd128Register(), i.InputOperand(1));
      break;
    }
    case kAVXF32x4Add: {
      CpuFeatureScope avx_scope(tasm(), AVX);
      __ vaddps(i.OutputSimd128Register(), i.InputSimd128Register(0),
                i.InputOperand(1));
      break;
    }
    case kSSEF32x4AddHoriz: {
      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
      // haddps is an SSE3 instruction, hence the extra feature scope.
      CpuFeatureScope sse_scope(tasm(), SSE3);
      __ haddps(i.OutputSimd128Register(), i.InputOperand(1));
      break;
    }
    case kAVXF32x4AddHoriz: {
      CpuFeatureScope avx_scope(tasm(), AVX);
      __ vhaddps(i.OutputSimd128Register(), i.InputSimd128Register(0),
                 i.InputOperand(1));
      break;
    }
    case kSSEF32x4Sub: {
      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
      __ subps(i.OutputSimd128Register(), i.InputOperand(1));
      break;
    }
    case kAVXF32x4Sub: {
      CpuFeatureScope avx_scope(tasm(), AVX);
      __ vsubps(i.OutputSimd128Register(), i.InputSimd128Register(0),
                i.InputOperand(1));
      break;
    }
    case kSSEF32x4Mul: {
      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
      __ mulps(i.OutputSimd128Register(), i.InputOperand(1));
      break;
    }
    case kAVXF32x4Mul: {
      CpuFeatureScope avx_scope(tasm(), AVX);
      __ vmulps(i.OutputSimd128Register(), i.InputSimd128Register(0),
                i.InputOperand(1));
      break;
    }
    case kSSEF32x4Div: {
      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
      __ divps(i.OutputSimd128Register(), i.InputOperand(1));
      break;
    }
    case kAVXF32x4Div: {
      CpuFeatureScope avx_scope(tasm(), AVX);
      __ vdivps(i.OutputSimd128Register(), i.InputSimd128Register(0),
                i.InputOperand(1));
      break;
    }
2313     case kSSEF32x4Min: {
2314       XMMRegister src1 = i.InputSimd128Register(1),
2315                   dst = i.OutputSimd128Register();
2316       DCHECK_EQ(dst, i.InputSimd128Register(0));
2317       // The minps instruction doesn't propagate NaNs and +0's in its first
2318       // operand. Perform minps in both orders, merge the resuls, and adjust.
2319       __ movaps(kScratchDoubleReg, src1);
2320       __ minps(kScratchDoubleReg, dst);
2321       __ minps(dst, src1);
2322       // propagate -0's and NaNs, which may be non-canonical.
2323       __ orps(kScratchDoubleReg, dst);
2324       // Canonicalize NaNs by quieting and clearing the payload.
2325       __ cmpps(dst, kScratchDoubleReg, 3);
2326       __ orps(kScratchDoubleReg, dst);
2327       __ psrld(dst, 10);
2328       __ andnps(dst, kScratchDoubleReg);
2329       break;
2330     }
2331     case kAVXF32x4Min: {
2332       CpuFeatureScope avx_scope(tasm(), AVX);
2333       XMMRegister dst = i.OutputSimd128Register();
2334       XMMRegister src0 = i.InputSimd128Register(0);
2335       Operand src1 = i.InputOperand(1);
2336       // See comment above for correction of minps.
2337       __ movups(kScratchDoubleReg, src1);
2338       __ vminps(kScratchDoubleReg, kScratchDoubleReg, src0);
2339       __ vminps(dst, src0, src1);
2340       __ vorps(dst, dst, kScratchDoubleReg);
2341       __ vcmpneqps(kScratchDoubleReg, dst, dst);
2342       __ vorps(dst, dst, kScratchDoubleReg);
2343       __ vpsrld(kScratchDoubleReg, kScratchDoubleReg, 10);
2344       __ vandnps(dst, kScratchDoubleReg, dst);
2345       break;
2346     }
2347     case kSSEF32x4Max: {
2348       XMMRegister src1 = i.InputSimd128Register(1),
2349                   dst = i.OutputSimd128Register();
2350       DCHECK_EQ(dst, i.InputSimd128Register(0));
2351       // The maxps instruction doesn't propagate NaNs and +0's in its first
2352       // operand. Perform maxps in both orders, merge the resuls, and adjust.
2353       __ movaps(kScratchDoubleReg, src1);
2354       __ maxps(kScratchDoubleReg, dst);
2355       __ maxps(dst, src1);
2356       // Find discrepancies.
2357       __ xorps(dst, kScratchDoubleReg);
2358       // Propagate NaNs, which may be non-canonical.
2359       __ orps(kScratchDoubleReg, dst);
2360       // Propagate sign discrepancy and (subtle) quiet NaNs.
2361       __ subps(kScratchDoubleReg, dst);
2362       // Canonicalize NaNs by clearing the payload.
2363       __ cmpps(dst, kScratchDoubleReg, 3);
2364       __ psrld(dst, 10);
2365       __ andnps(dst, kScratchDoubleReg);
2366       break;
2367     }
2368     case kAVXF32x4Max: {
2369       CpuFeatureScope avx_scope(tasm(), AVX);
2370       XMMRegister dst = i.OutputSimd128Register();
2371       XMMRegister src0 = i.InputSimd128Register(0);
2372       Operand src1 = i.InputOperand(1);
2373       // See comment above for correction of maxps.
2374       __ vmovups(kScratchDoubleReg, src1);
2375       __ vmaxps(kScratchDoubleReg, kScratchDoubleReg, src0);
2376       __ vmaxps(dst, src0, src1);
2377       __ vxorps(dst, dst, kScratchDoubleReg);
2378       __ vorps(kScratchDoubleReg, kScratchDoubleReg, dst);
2379       __ vsubps(kScratchDoubleReg, kScratchDoubleReg, dst);
2380       __ vcmpneqps(dst, kScratchDoubleReg, kScratchDoubleReg);
2381       __ vpsrld(dst, dst, 10);
2382       __ vandnps(dst, dst, kScratchDoubleReg);
2383       break;
2384     }
    // F32x4 comparisons produce all-ones / all-zeros lane masks.
    case kSSEF32x4Eq: {
      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
      __ cmpeqps(i.OutputSimd128Register(), i.InputOperand(1));
      break;
    }
    case kAVXF32x4Eq: {
      CpuFeatureScope avx_scope(tasm(), AVX);
      __ vcmpeqps(i.OutputSimd128Register(), i.InputSimd128Register(0),
                  i.InputOperand(1));
      break;
    }
    case kSSEF32x4Ne: {
      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
      __ cmpneqps(i.OutputSimd128Register(), i.InputOperand(1));
      break;
    }
    case kAVXF32x4Ne: {
      CpuFeatureScope avx_scope(tasm(), AVX);
      __ vcmpneqps(i.OutputSimd128Register(), i.InputSimd128Register(0),
                   i.InputOperand(1));
      break;
    }
    case kSSEF32x4Lt: {
      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
      __ cmpltps(i.OutputSimd128Register(), i.InputOperand(1));
      break;
    }
    case kAVXF32x4Lt: {
      CpuFeatureScope avx_scope(tasm(), AVX);
      __ vcmpltps(i.OutputSimd128Register(), i.InputSimd128Register(0),
                  i.InputOperand(1));
      break;
    }
    case kSSEF32x4Le: {
      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
      __ cmpleps(i.OutputSimd128Register(), i.InputOperand(1));
      break;
    }
    case kAVXF32x4Le: {
      CpuFeatureScope avx_scope(tasm(), AVX);
      __ vcmpleps(i.OutputSimd128Register(), i.InputSimd128Register(0),
                  i.InputOperand(1));
      break;
    }
    case kIA32F32x4Pmin: {
      XMMRegister dst = i.OutputSimd128Register();
      DCHECK_EQ(dst, i.InputSimd128Register(0));
      // Pseudo-minimum: a single minps, exposing the instruction's own
      // NaN/+-0 behavior (contrast the corrected F32x4Min above).
      __ Minps(dst, dst, i.InputSimd128Register(1));
      break;
    }
    case kIA32F32x4Pmax: {
      XMMRegister dst = i.OutputSimd128Register();
      DCHECK_EQ(dst, i.InputSimd128Register(0));
      // Pseudo-maximum: a single maxps, see Pmin above.
      __ Maxps(dst, dst, i.InputSimd128Register(1));
      break;
    }
    case kIA32F32x4Round: {
      // The rounding mode is encoded in the opcode's MiscField.
      RoundingMode const mode =
          static_cast<RoundingMode>(MiscField::decode(instr->opcode()));
      // NOTE(review): input fetched via InputDoubleRegister — presumably
      // fine since doubles and simd128 share XMM registers on ia32; confirm.
      __ Roundps(i.OutputSimd128Register(), i.InputDoubleRegister(0), mode);
      break;
    }
    // I32x4 lane construction/extraction, sign-extending conversions,
    // negation and shifts.
    case kIA32I32x4Splat: {
      XMMRegister dst = i.OutputSimd128Register();
      __ Movd(dst, i.InputOperand(0));
      // Shuffle immediate 0 broadcasts lane 0 into all four lanes.
      __ Pshufd(dst, dst, 0x0);
      break;
    }
    case kIA32I32x4ExtractLane: {
      __ Pextrd(i.OutputRegister(), i.InputSimd128Register(0), i.InputInt8(1));
      break;
    }
    case kSSEI32x4ReplaceLane: {
      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
      // pinsrd requires SSE4.1.
      CpuFeatureScope sse_scope(tasm(), SSE4_1);
      __ pinsrd(i.OutputSimd128Register(), i.InputOperand(2), i.InputInt8(1));
      break;
    }
    case kAVXI32x4ReplaceLane: {
      CpuFeatureScope avx_scope(tasm(), AVX);
      __ vpinsrd(i.OutputSimd128Register(), i.InputSimd128Register(0),
                 i.InputOperand(2), i.InputInt8(1));
      break;
    }
    case kSSEI32x4SConvertF32x4: {
      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
      XMMRegister dst = i.OutputSimd128Register();
      // cvttps2dq returns 0x80000000 for NaN and out-of-range lanes; the
      // surrounding code maps NaN to 0 and positive overflow to INT32_MAX.
      // NAN->0
      __ movaps(kScratchDoubleReg, dst);
      __ cmpeqps(kScratchDoubleReg, kScratchDoubleReg);
      __ pand(dst, kScratchDoubleReg);
      // Set top bit if >= 0 (but not -0.0!)
      __ pxor(kScratchDoubleReg, dst);
      // Convert
      __ cvttps2dq(dst, dst);
      // Set top bit if >=0 is now < 0
      __ pand(kScratchDoubleReg, dst);
      __ psrad(kScratchDoubleReg, 31);
      // Set positive overflow lanes to 0x7FFFFFFF
      __ pxor(dst, kScratchDoubleReg);
      break;
    }
    case kAVXI32x4SConvertF32x4: {
      CpuFeatureScope avx_scope(tasm(), AVX);
      XMMRegister dst = i.OutputSimd128Register();
      XMMRegister src = i.InputSimd128Register(0);
      // Same NaN/overflow correction as the SSE variant above.
      // NAN->0
      __ vcmpeqps(kScratchDoubleReg, src, src);
      __ vpand(dst, src, kScratchDoubleReg);
      // Set top bit if >= 0 (but not -0.0!)
      __ vpxor(kScratchDoubleReg, kScratchDoubleReg, dst);
      // Convert
      __ vcvttps2dq(dst, dst);
      // Set top bit if >=0 is now < 0
      __ vpand(kScratchDoubleReg, kScratchDoubleReg, dst);
      __ vpsrad(kScratchDoubleReg, kScratchDoubleReg, 31);
      // Set positive overflow lanes to 0x7FFFFFFF
      __ vpxor(dst, dst, kScratchDoubleReg);
      break;
    }
    case kIA32I32x4SConvertI16x8Low: {
      __ Pmovsxwd(i.OutputSimd128Register(), i.InputOperand(0));
      break;
    }
    case kIA32I32x4SConvertI16x8High: {
      XMMRegister dst = i.OutputSimd128Register();
      // palignr by 8 moves the upper 8 bytes of the input into the low half,
      // then sign-extend those four i16 lanes to i32.
      __ Palignr(dst, i.InputOperand(0), 8);
      __ Pmovsxwd(dst, dst);
      break;
    }
    case kIA32I32x4Neg: {
      XMMRegister dst = i.OutputSimd128Register();
      Operand src = i.InputOperand(0);
      if (src.is_reg(dst)) {
        // In-place: psignd against all-ones negates each lane.
        __ Pcmpeqd(kScratchDoubleReg, kScratchDoubleReg);
        __ Psignd(dst, kScratchDoubleReg);
      } else {
        // Otherwise compute 0 - src.
        __ Pxor(dst, dst);
        __ Psubd(dst, src);
      }
      break;
    }
    case kIA32I32x4Shl: {
      // 5 = log2(32-bit lane width); presumably the macro masks the shift
      // amount to that many bits (macro defined earlier in this file).
      ASSEMBLE_SIMD_SHIFT(Pslld, 5);
      break;
    }
    case kIA32I32x4ShrS: {
      ASSEMBLE_SIMD_SHIFT(Psrad, 5);
      break;
    }
    // I32x4 lane-wise arithmetic; SSE forms are destructive (dst == input 0).
    case kSSEI32x4Add: {
      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
      __ paddd(i.OutputSimd128Register(), i.InputOperand(1));
      break;
    }
    case kAVXI32x4Add: {
      CpuFeatureScope avx_scope(tasm(), AVX);
      __ vpaddd(i.OutputSimd128Register(), i.InputSimd128Register(0),
                i.InputOperand(1));
      break;
    }
    case kSSEI32x4AddHoriz: {
      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
      // phaddd requires SSSE3.
      CpuFeatureScope sse_scope(tasm(), SSSE3);
      __ phaddd(i.OutputSimd128Register(), i.InputOperand(1));
      break;
    }
    case kAVXI32x4AddHoriz: {
      CpuFeatureScope avx_scope(tasm(), AVX);
      __ vphaddd(i.OutputSimd128Register(), i.InputSimd128Register(0),
                 i.InputOperand(1));
      break;
    }
    case kSSEI32x4Sub: {
      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
      __ psubd(i.OutputSimd128Register(), i.InputOperand(1));
      break;
    }
    case kAVXI32x4Sub: {
      CpuFeatureScope avx_scope(tasm(), AVX);
      __ vpsubd(i.OutputSimd128Register(), i.InputSimd128Register(0),
                i.InputOperand(1));
      break;
    }
    case kSSEI32x4Mul: {
      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
      // pmulld (32x32->low 32) requires SSE4.1.
      CpuFeatureScope sse_scope(tasm(), SSE4_1);
      __ pmulld(i.OutputSimd128Register(), i.InputOperand(1));
      break;
    }
    case kAVXI32x4Mul: {
      CpuFeatureScope avx_scope(tasm(), AVX);
      __ vpmulld(i.OutputSimd128Register(), i.InputSimd128Register(0),
                 i.InputOperand(1));
      break;
    }
    case kSSEI32x4MinS: {
      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
      CpuFeatureScope sse_scope(tasm(), SSE4_1);
      __ pminsd(i.OutputSimd128Register(), i.InputOperand(1));
      break;
    }
    case kAVXI32x4MinS: {
      CpuFeatureScope avx_scope(tasm(), AVX);
      __ vpminsd(i.OutputSimd128Register(), i.InputSimd128Register(0),
                 i.InputOperand(1));
      break;
    }
    case kSSEI32x4MaxS: {
      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
      CpuFeatureScope sse_scope(tasm(), SSE4_1);
      __ pmaxsd(i.OutputSimd128Register(), i.InputOperand(1));
      break;
    }
    case kAVXI32x4MaxS: {
      CpuFeatureScope avx_scope(tasm(), AVX);
      __ vpmaxsd(i.OutputSimd128Register(), i.InputSimd128Register(0),
                 i.InputOperand(1));
      break;
    }
    // I32x4 signed comparisons; results are all-ones / all-zeros lane masks.
    case kSSEI32x4Eq: {
      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
      __ pcmpeqd(i.OutputSimd128Register(), i.InputOperand(1));
      break;
    }
    case kAVXI32x4Eq: {
      CpuFeatureScope avx_scope(tasm(), AVX);
      __ vpcmpeqd(i.OutputSimd128Register(), i.InputSimd128Register(0),
                  i.InputOperand(1));
      break;
    }
    case kSSEI32x4Ne: {
      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
      // There is no pcmpneqd: compute equality, then invert by xor-ing with
      // an all-ones mask.
      __ pcmpeqd(i.OutputSimd128Register(), i.InputOperand(1));
      __ pcmpeqd(kScratchDoubleReg, kScratchDoubleReg);
      __ pxor(i.OutputSimd128Register(), kScratchDoubleReg);
      break;
    }
    case kAVXI32x4Ne: {
      CpuFeatureScope avx_scope(tasm(), AVX);
      __ vpcmpeqd(i.OutputSimd128Register(), i.InputSimd128Register(0),
                  i.InputOperand(1));
      __ vpcmpeqd(kScratchDoubleReg, kScratchDoubleReg, kScratchDoubleReg);
      __ vpxor(i.OutputSimd128Register(), i.OutputSimd128Register(),
               kScratchDoubleReg);
      break;
    }
    case kSSEI32x4GtS: {
      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
      __ pcmpgtd(i.OutputSimd128Register(), i.InputOperand(1));
      break;
    }
    case kAVXI32x4GtS: {
      CpuFeatureScope avx_scope(tasm(), AVX);
      __ vpcmpgtd(i.OutputSimd128Register(), i.InputSimd128Register(0),
                  i.InputOperand(1));
      break;
    }
    case kSSEI32x4GeS: {
      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
      CpuFeatureScope sse_scope(tasm(), SSE4_1);
      XMMRegister dst = i.OutputSimd128Register();
      Operand src = i.InputOperand(1);
      // a >= b  <=>  min(a, b) == b.
      __ pminsd(dst, src);
      __ pcmpeqd(dst, src);
      break;
    }
    case kAVXI32x4GeS: {
      CpuFeatureScope avx_scope(tasm(), AVX);
      XMMRegister src1 = i.InputSimd128Register(0);
      Operand src2 = i.InputOperand(1);
      // a >= b  <=>  min(a, b) == b (min kept in the scratch register).
      __ vpminsd(kScratchDoubleReg, src1, src2);
      __ vpcmpeqd(i.OutputSimd128Register(), kScratchDoubleReg, src2);
      break;
    }
    // I32x4 unsigned conversions. cvttps2dq only converts to *signed* i32,
    // so the f32->u32 cases split the value range around max_signed and fix
    // up overflow lanes.
    case kSSEI32x4UConvertF32x4: {
      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
      CpuFeatureScope sse_scope(tasm(), SSE4_1);
      XMMRegister dst = i.OutputSimd128Register();
      XMMRegister tmp = i.TempSimd128Register(0);
      // NAN->0, negative->0
      __ pxor(kScratchDoubleReg, kScratchDoubleReg);
      __ maxps(dst, kScratchDoubleReg);
      // scratch: float representation of max_signed
      __ pcmpeqd(kScratchDoubleReg, kScratchDoubleReg);
      __ psrld(kScratchDoubleReg, 1);                     // 0x7fffffff
      __ cvtdq2ps(kScratchDoubleReg, kScratchDoubleReg);  // 0x4f000000
      // tmp: convert (src-max_signed).
      // Positive overflow lanes -> 0x7FFFFFFF
      // Negative lanes -> 0
      __ movaps(tmp, dst);
      __ subps(tmp, kScratchDoubleReg);
      __ cmpleps(kScratchDoubleReg, tmp);
      __ cvttps2dq(tmp, tmp);
      __ pxor(tmp, kScratchDoubleReg);
      __ pxor(kScratchDoubleReg, kScratchDoubleReg);
      __ pmaxsd(tmp, kScratchDoubleReg);
      // convert. Overflow lanes above max_signed will be 0x80000000
      __ cvttps2dq(dst, dst);
      // Add (src-max_signed) for overflow lanes.
      __ paddd(dst, tmp);
      break;
    }
    case kAVXI32x4UConvertF32x4: {
      // Note: destructive even in the AVX variant — dst aliases input 0.
      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
      CpuFeatureScope avx_scope(tasm(), AVX);
      XMMRegister dst = i.OutputSimd128Register();
      XMMRegister tmp = i.TempSimd128Register(0);
      // NAN->0, negative->0
      __ vpxor(kScratchDoubleReg, kScratchDoubleReg, kScratchDoubleReg);
      __ vmaxps(dst, dst, kScratchDoubleReg);
      // scratch: float representation of max_signed
      __ vpcmpeqd(kScratchDoubleReg, kScratchDoubleReg, kScratchDoubleReg);
      __ vpsrld(kScratchDoubleReg, kScratchDoubleReg, 1);  // 0x7fffffff
      __ vcvtdq2ps(kScratchDoubleReg, kScratchDoubleReg);  // 0x4f000000
      // tmp: convert (src-max_signed).
      // Positive overflow lanes -> 0x7FFFFFFF
      // Negative lanes -> 0
      __ vsubps(tmp, dst, kScratchDoubleReg);
      __ vcmpleps(kScratchDoubleReg, kScratchDoubleReg, tmp);
      __ vcvttps2dq(tmp, tmp);
      __ vpxor(tmp, tmp, kScratchDoubleReg);
      __ vpxor(kScratchDoubleReg, kScratchDoubleReg, kScratchDoubleReg);
      __ vpmaxsd(tmp, tmp, kScratchDoubleReg);
      // convert. Overflow lanes above max_signed will be 0x80000000
      __ vcvttps2dq(dst, dst);
      // Add (src-max_signed) for overflow lanes.
      __ vpaddd(dst, dst, tmp);
      break;
    }
    case kIA32I32x4UConvertI16x8Low: {
      __ Pmovzxwd(i.OutputSimd128Register(), i.InputOperand(0));
      break;
    }
    case kIA32I32x4UConvertI16x8High: {
      XMMRegister dst = i.OutputSimd128Register();
      // Bring the upper 8 bytes into the low half, then zero-extend.
      __ Palignr(dst, i.InputOperand(0), 8);
      __ Pmovzxwd(dst, dst);
      break;
    }
    case kIA32I32x4ShrU: {
      // 5 = log2(32-bit lane width); see the signed shift cases above.
      ASSEMBLE_SIMD_SHIFT(Psrld, 5);
      break;
    }
    // I32x4 unsigned min/max and comparisons. x86 has no unsigned integer
    // compare, so GtU/GeU are synthesized from pmaxud/pminud + equality.
    case kSSEI32x4MinU: {
      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
      CpuFeatureScope sse_scope(tasm(), SSE4_1);
      __ pminud(i.OutputSimd128Register(), i.InputOperand(1));
      break;
    }
    case kAVXI32x4MinU: {
      CpuFeatureScope avx_scope(tasm(), AVX);
      __ vpminud(i.OutputSimd128Register(), i.InputSimd128Register(0),
                 i.InputOperand(1));
      break;
    }
    case kSSEI32x4MaxU: {
      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
      CpuFeatureScope sse_scope(tasm(), SSE4_1);
      __ pmaxud(i.OutputSimd128Register(), i.InputOperand(1));
      break;
    }
    case kAVXI32x4MaxU: {
      CpuFeatureScope avx_scope(tasm(), AVX);
      __ vpmaxud(i.OutputSimd128Register(), i.InputSimd128Register(0),
                 i.InputOperand(1));
      break;
    }
    case kSSEI32x4GtU: {
      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
      CpuFeatureScope sse_scope(tasm(), SSE4_1);
      XMMRegister dst = i.OutputSimd128Register();
      Operand src = i.InputOperand(1);
      // a > b  <=>  !(max(a, b) == b); invert with an all-ones xor.
      __ pmaxud(dst, src);
      __ pcmpeqd(dst, src);
      __ pcmpeqd(kScratchDoubleReg, kScratchDoubleReg);
      __ pxor(dst, kScratchDoubleReg);
      break;
    }
    case kAVXI32x4GtU: {
      CpuFeatureScope avx_scope(tasm(), AVX);
      XMMRegister dst = i.OutputSimd128Register();
      XMMRegister src1 = i.InputSimd128Register(0);
      Operand src2 = i.InputOperand(1);
      // a > b  <=>  !(max(a, b) == b).
      __ vpmaxud(kScratchDoubleReg, src1, src2);
      __ vpcmpeqd(dst, kScratchDoubleReg, src2);
      __ vpcmpeqd(kScratchDoubleReg, kScratchDoubleReg, kScratchDoubleReg);
      __ vpxor(dst, dst, kScratchDoubleReg);
      break;
    }
    case kSSEI32x4GeU: {
      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
      CpuFeatureScope sse_scope(tasm(), SSE4_1);
      XMMRegister dst = i.OutputSimd128Register();
      Operand src = i.InputOperand(1);
      // a >= b  <=>  min(a, b) == b.
      __ pminud(dst, src);
      __ pcmpeqd(dst, src);
      break;
    }
    case kAVXI32x4GeU: {
      CpuFeatureScope avx_scope(tasm(), AVX);
      XMMRegister src1 = i.InputSimd128Register(0);
      Operand src2 = i.InputOperand(1);
      // a >= b  <=>  min(a, b) == b.
      __ vpminud(kScratchDoubleReg, src1, src2);
      __ vpcmpeqd(i.OutputSimd128Register(), kScratchDoubleReg, src2);
      break;
    }
    case kIA32I32x4Abs: {
      __ Pabsd(i.OutputSimd128Register(), i.InputSimd128Register(0));
      break;
    }
    case kIA32I32x4BitMask: {
      // movmskps gathers the four lane sign bits into a GP register.
      __ Movmskps(i.OutputRegister(), i.InputSimd128Register(0));
      break;
    }
    case kIA32I32x4DotI16x8S: {
      // pmaddwd: multiply adjacent i16 pairs and add horizontally into i32.
      __ Pmaddwd(i.OutputSimd128Register(), i.InputSimd128Register(0),
                 i.InputSimd128Register(1));
      break;
    }
    // I16x8 lane construction/extraction, sign-extending conversions,
    // negation and shifts.
    case kIA32I16x8Splat: {
      XMMRegister dst = i.OutputSimd128Register();
      __ Movd(dst, i.InputOperand(0));
      // Broadcast word 0 across the low quadword, then that dword across
      // all four dword lanes.
      __ Pshuflw(dst, dst, 0x0);
      __ Pshufd(dst, dst, 0x0);
      break;
    }
    case kIA32I16x8ExtractLaneU: {
      Register dst = i.OutputRegister();
      // pextrw zero-extends the extracted word.
      __ Pextrw(dst, i.InputSimd128Register(0), i.InputInt8(1));
      break;
    }
    case kIA32I16x8ExtractLaneS: {
      Register dst = i.OutputRegister();
      // pextrw zero-extends, so sign-extend the low word afterwards.
      __ Pextrw(dst, i.InputSimd128Register(0), i.InputInt8(1));
      __ movsx_w(dst, dst);
      break;
    }
    case kSSEI16x8ReplaceLane: {
      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
      __ pinsrw(i.OutputSimd128Register(), i.InputOperand(2), i.InputInt8(1));
      break;
    }
    case kAVXI16x8ReplaceLane: {
      CpuFeatureScope avx_scope(tasm(), AVX);
      __ vpinsrw(i.OutputSimd128Register(), i.InputSimd128Register(0),
                 i.InputOperand(2), i.InputInt8(1));
      break;
    }
    case kIA32I16x8SConvertI8x16Low: {
      __ Pmovsxbw(i.OutputSimd128Register(), i.InputOperand(0));
      break;
    }
    case kIA32I16x8SConvertI8x16High: {
      XMMRegister dst = i.OutputSimd128Register();
      // Bring the upper 8 bytes into the low half, then sign-extend.
      __ Palignr(dst, i.InputOperand(0), 8);
      __ Pmovsxbw(dst, dst);
      break;
    }
    case kIA32I16x8Neg: {
      XMMRegister dst = i.OutputSimd128Register();
      Operand src = i.InputOperand(0);
      if (src.is_reg(dst)) {
        // In-place: psignw against all-ones negates each lane.
        __ Pcmpeqd(kScratchDoubleReg, kScratchDoubleReg);
        __ Psignw(dst, kScratchDoubleReg);
      } else {
        // Otherwise compute 0 - src.
        __ Pxor(dst, dst);
        __ Psubw(dst, src);
      }
      break;
    }
    case kIA32I16x8Shl: {
      // 4 = log2(16-bit lane width); see the I32x4 shift cases above.
      ASSEMBLE_SIMD_SHIFT(Psllw, 4);
      break;
    }
    case kIA32I16x8ShrS: {
      ASSEMBLE_SIMD_SHIFT(Psraw, 4);
      break;
    }
    // I16x8 narrowing conversion and lane-wise arithmetic (plain and
    // signed-saturating variants).
    case kSSEI16x8SConvertI32x4: {
      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
      // packssdw narrows i32 lanes to i16 with signed saturation.
      __ packssdw(i.OutputSimd128Register(), i.InputSimd128Register(1));
      break;
    }
    case kAVXI16x8SConvertI32x4: {
      CpuFeatureScope avx_scope(tasm(), AVX);
      __ vpackssdw(i.OutputSimd128Register(), i.InputSimd128Register(0),
                   i.InputOperand(1));
      break;
    }
    case kSSEI16x8Add: {
      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
      __ paddw(i.OutputSimd128Register(), i.InputOperand(1));
      break;
    }
    case kAVXI16x8Add: {
      CpuFeatureScope avx_scope(tasm(), AVX);
      __ vpaddw(i.OutputSimd128Register(), i.InputSimd128Register(0),
                i.InputOperand(1));
      break;
    }
    case kSSEI16x8AddSatS: {
      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
      // paddsw adds with signed saturation.
      __ paddsw(i.OutputSimd128Register(), i.InputOperand(1));
      break;
    }
    case kAVXI16x8AddSatS: {
      CpuFeatureScope avx_scope(tasm(), AVX);
      __ vpaddsw(i.OutputSimd128Register(), i.InputSimd128Register(0),
                 i.InputOperand(1));
      break;
    }
    case kSSEI16x8AddHoriz: {
      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
      // phaddw requires SSSE3.
      CpuFeatureScope sse_scope(tasm(), SSSE3);
      __ phaddw(i.OutputSimd128Register(), i.InputOperand(1));
      break;
    }
    case kAVXI16x8AddHoriz: {
      CpuFeatureScope avx_scope(tasm(), AVX);
      __ vphaddw(i.OutputSimd128Register(), i.InputSimd128Register(0),
                 i.InputOperand(1));
      break;
    }
    case kSSEI16x8Sub: {
      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
      __ psubw(i.OutputSimd128Register(), i.InputOperand(1));
      break;
    }
    case kAVXI16x8Sub: {
      CpuFeatureScope avx_scope(tasm(), AVX);
      __ vpsubw(i.OutputSimd128Register(), i.InputSimd128Register(0),
                i.InputOperand(1));
      break;
    }
    case kSSEI16x8SubSatS: {
      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
      // psubsw subtracts with signed saturation.
      __ psubsw(i.OutputSimd128Register(), i.InputOperand(1));
      break;
    }
    case kAVXI16x8SubSatS: {
      CpuFeatureScope avx_scope(tasm(), AVX);
      __ vpsubsw(i.OutputSimd128Register(), i.InputSimd128Register(0),
                 i.InputOperand(1));
      break;
    }
    case kSSEI16x8Mul: {
      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
      // pmullw keeps the low 16 bits of each product.
      __ pmullw(i.OutputSimd128Register(), i.InputOperand(1));
      break;
    }
    case kAVXI16x8Mul: {
      CpuFeatureScope avx_scope(tasm(), AVX);
      __ vpmullw(i.OutputSimd128Register(), i.InputSimd128Register(0),
                 i.InputOperand(1));
      break;
    }
2941     }
2942     case kSSEI16x8MinS: {
2943       DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
2944       __ pminsw(i.OutputSimd128Register(), i.InputOperand(1));
2945       break;
2946     }
2947     case kAVXI16x8MinS: {
2948       CpuFeatureScope avx_scope(tasm(), AVX);
2949       __ vpminsw(i.OutputSimd128Register(), i.InputSimd128Register(0),
2950                  i.InputOperand(1));
2951       break;
2952     }
2953     case kSSEI16x8MaxS: {
2954       DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
2955       __ pmaxsw(i.OutputSimd128Register(), i.InputOperand(1));
2956       break;
2957     }
2958     case kAVXI16x8MaxS: {
2959       CpuFeatureScope avx_scope(tasm(), AVX);
2960       __ vpmaxsw(i.OutputSimd128Register(), i.InputSimd128Register(0),
2961                  i.InputOperand(1));
2962       break;
2963     }
2964     case kSSEI16x8Eq: {
2965       DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
2966       __ pcmpeqw(i.OutputSimd128Register(), i.InputOperand(1));
2967       break;
2968     }
2969     case kAVXI16x8Eq: {
2970       CpuFeatureScope avx_scope(tasm(), AVX);
2971       __ vpcmpeqw(i.OutputSimd128Register(), i.InputSimd128Register(0),
2972                   i.InputOperand(1));
2973       break;
2974     }
2975     case kSSEI16x8Ne: {
2976       DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
2977       __ pcmpeqw(i.OutputSimd128Register(), i.InputOperand(1));
2978       __ pcmpeqw(kScratchDoubleReg, kScratchDoubleReg);
2979       __ pxor(i.OutputSimd128Register(), kScratchDoubleReg);
2980       break;
2981     }
2982     case kAVXI16x8Ne: {
2983       CpuFeatureScope avx_scope(tasm(), AVX);
2984       __ vpcmpeqw(i.OutputSimd128Register(), i.InputSimd128Register(0),
2985                   i.InputOperand(1));
2986       __ vpcmpeqw(kScratchDoubleReg, kScratchDoubleReg, kScratchDoubleReg);
2987       __ vpxor(i.OutputSimd128Register(), i.OutputSimd128Register(),
2988                kScratchDoubleReg);
2989       break;
2990     }
2991     case kSSEI16x8GtS: {
2992       DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
2993       __ pcmpgtw(i.OutputSimd128Register(), i.InputOperand(1));
2994       break;
2995     }
2996     case kAVXI16x8GtS: {
2997       CpuFeatureScope avx_scope(tasm(), AVX);
2998       __ vpcmpgtw(i.OutputSimd128Register(), i.InputSimd128Register(0),
2999                   i.InputOperand(1));
3000       break;
3001     }
3002     case kSSEI16x8GeS: {
3003       DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
3004       XMMRegister dst = i.OutputSimd128Register();
3005       Operand src = i.InputOperand(1);
3006       __ pminsw(dst, src);
3007       __ pcmpeqw(dst, src);
3008       break;
3009     }
3010     case kAVXI16x8GeS: {
3011       CpuFeatureScope avx_scope(tasm(), AVX);
3012       XMMRegister src1 = i.InputSimd128Register(0);
3013       Operand src2 = i.InputOperand(1);
3014       __ vpminsw(kScratchDoubleReg, src1, src2);
3015       __ vpcmpeqw(i.OutputSimd128Register(), kScratchDoubleReg, src2);
3016       break;
3017     }
    // I16x8 unsigned conversions, unsigned-saturating arithmetic, and
    // unsigned min.
    case kIA32I16x8UConvertI8x16Low: {
      __ Pmovzxbw(i.OutputSimd128Register(), i.InputOperand(0));
      break;
    }
    case kIA32I16x8UConvertI8x16High: {
      XMMRegister dst = i.OutputSimd128Register();
      // Bring the upper 8 bytes into the low half, then zero-extend.
      __ Palignr(dst, i.InputOperand(0), 8);
      __ Pmovzxbw(dst, dst);
      break;
    }
    case kIA32I16x8ShrU: {
      // 4 = log2(16-bit lane width); see the signed shift cases above.
      ASSEMBLE_SIMD_SHIFT(Psrlw, 4);
      break;
    }
    case kSSEI16x8UConvertI32x4: {
      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
      // packusdw (narrow i32 to i16 with unsigned saturation) needs SSE4.1.
      CpuFeatureScope sse_scope(tasm(), SSE4_1);
      __ packusdw(i.OutputSimd128Register(), i.InputSimd128Register(1));
      break;
    }
    case kAVXI16x8UConvertI32x4: {
      // Note: destructive even in the AVX variant — dst aliases input 0.
      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
      CpuFeatureScope avx_scope(tasm(), AVX);
      XMMRegister dst = i.OutputSimd128Register();
      __ vpackusdw(dst, dst, i.InputSimd128Register(1));
      break;
    }
    case kSSEI16x8AddSatU: {
      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
      // paddusw adds with unsigned saturation.
      __ paddusw(i.OutputSimd128Register(), i.InputOperand(1));
      break;
    }
    case kAVXI16x8AddSatU: {
      CpuFeatureScope avx_scope(tasm(), AVX);
      __ vpaddusw(i.OutputSimd128Register(), i.InputSimd128Register(0),
                  i.InputOperand(1));
      break;
    }
    case kSSEI16x8SubSatU: {
      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
      // psubusw subtracts with unsigned saturation.
      __ psubusw(i.OutputSimd128Register(), i.InputOperand(1));
      break;
    }
    case kAVXI16x8SubSatU: {
      CpuFeatureScope avx_scope(tasm(), AVX);
      __ vpsubusw(i.OutputSimd128Register(), i.InputSimd128Register(0),
                  i.InputOperand(1));
      break;
    }
    case kSSEI16x8MinU: {
      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
      // pminuw requires SSE4.1.
      CpuFeatureScope sse_scope(tasm(), SSE4_1);
      __ pminuw(i.OutputSimd128Register(), i.InputOperand(1));
      break;
    }
    case kAVXI16x8MinU: {
      CpuFeatureScope avx_scope(tasm(), AVX);
      __ vpminuw(i.OutputSimd128Register(), i.InputSimd128Register(0),
                 i.InputOperand(1));
      break;
    }
3079     case kSSEI16x8MaxU: {
3080       DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
3081       CpuFeatureScope sse_scope(tasm(), SSE4_1);
3082       __ pmaxuw(i.OutputSimd128Register(), i.InputOperand(1));
3083       break;
3084     }
3085     case kAVXI16x8MaxU: {
3086       CpuFeatureScope avx_scope(tasm(), AVX);
3087       __ vpmaxuw(i.OutputSimd128Register(), i.InputSimd128Register(0),
3088                  i.InputOperand(1));
3089       break;
3090     }
    case kSSEI16x8GtU: {
      // Unsigned greater-than has no direct instruction; use
      // a > b  <=>  !(max(a, b) == b).
      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
      CpuFeatureScope sse_scope(tasm(), SSE4_1);
      XMMRegister dst = i.OutputSimd128Register();
      Operand src = i.InputOperand(1);
      __ pmaxuw(dst, src);
      __ pcmpeqw(dst, src);
      // Invert the comparison: build an all-ones register and xor with it.
      __ pcmpeqw(kScratchDoubleReg, kScratchDoubleReg);
      __ pxor(dst, kScratchDoubleReg);
      break;
    }
    case kAVXI16x8GtU: {
      // Same !(max(a, b) == b) trick, non-destructive via the scratch
      // register.
      CpuFeatureScope avx_scope(tasm(), AVX);
      XMMRegister dst = i.OutputSimd128Register();
      XMMRegister src1 = i.InputSimd128Register(0);
      Operand src2 = i.InputOperand(1);
      __ vpmaxuw(kScratchDoubleReg, src1, src2);
      __ vpcmpeqw(dst, kScratchDoubleReg, src2);
      __ vpcmpeqw(kScratchDoubleReg, kScratchDoubleReg, kScratchDoubleReg);
      __ vpxor(dst, dst, kScratchDoubleReg);
      break;
    }
    case kSSEI16x8GeU: {
      // Unsigned a >= b  <=>  min(a, b) == b.
      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
      CpuFeatureScope sse_scope(tasm(), SSE4_1);
      XMMRegister dst = i.OutputSimd128Register();
      Operand src = i.InputOperand(1);
      __ pminuw(dst, src);
      __ pcmpeqw(dst, src);
      break;
    }
    case kAVXI16x8GeU: {
      CpuFeatureScope avx_scope(tasm(), AVX);
      XMMRegister src1 = i.InputSimd128Register(0);
      Operand src2 = i.InputOperand(1);
      __ vpminuw(kScratchDoubleReg, src1, src2);
      __ vpcmpeqw(i.OutputSimd128Register(), kScratchDoubleReg, src2);
      break;
    }
3130     case kIA32I16x8RoundingAverageU: {
3131       __ Pavgw(i.OutputSimd128Register(), i.InputSimd128Register(0),
3132                i.InputOperand(1));
3133       break;
3134     }
3135     case kIA32I16x8Abs: {
3136       __ Pabsw(i.OutputSimd128Register(), i.InputSimd128Register(0));
3137       break;
3138     }
    case kIA32I16x8BitMask: {
      // Collect the sign bit of each of the 8 word lanes into a scalar.
      Register dst = i.OutputRegister();
      XMMRegister tmp = i.TempSimd128Register(0);
      // Saturating-pack words to bytes; saturation preserves each lane's
      // sign. The low 8 bytes of tmp come from tmp's previous (undefined)
      // contents and are discarded below.
      __ Packsswb(tmp, i.InputSimd128Register(0));
      __ Pmovmskb(dst, tmp);
      // Drop the 8 mask bits that came from tmp's undefined low half.
      __ shr(dst, 8);
      break;
    }
3147     case kIA32I8x16Splat: {
3148       XMMRegister dst = i.OutputSimd128Register();
3149       __ Movd(dst, i.InputOperand(0));
3150       __ Pxor(kScratchDoubleReg, kScratchDoubleReg);
3151       __ Pshufb(dst, kScratchDoubleReg);
3152       break;
3153     }
3154     case kIA32I8x16ExtractLaneU: {
3155       Register dst = i.OutputRegister();
3156       __ Pextrb(dst, i.InputSimd128Register(0), i.InputInt8(1));
3157       break;
3158     }
3159     case kIA32I8x16ExtractLaneS: {
3160       Register dst = i.OutputRegister();
3161       __ Pextrb(dst, i.InputSimd128Register(0), i.InputInt8(1));
3162       __ movsx_b(dst, dst);
3163       break;
3164     }
3165     case kSSEI8x16ReplaceLane: {
3166       DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
3167       CpuFeatureScope sse_scope(tasm(), SSE4_1);
3168       __ pinsrb(i.OutputSimd128Register(), i.InputOperand(2), i.InputInt8(1));
3169       break;
3170     }
3171     case kAVXI8x16ReplaceLane: {
3172       CpuFeatureScope avx_scope(tasm(), AVX);
3173       __ vpinsrb(i.OutputSimd128Register(), i.InputSimd128Register(0),
3174                  i.InputOperand(2), i.InputInt8(1));
3175       break;
3176     }
3177     case kSSEI8x16SConvertI16x8: {
3178       DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
3179       __ packsswb(i.OutputSimd128Register(), i.InputOperand(1));
3180       break;
3181     }
3182     case kAVXI8x16SConvertI16x8: {
3183       CpuFeatureScope avx_scope(tasm(), AVX);
3184       __ vpacksswb(i.OutputSimd128Register(), i.InputSimd128Register(0),
3185                    i.InputOperand(1));
3186       break;
3187     }
3188     case kIA32I8x16Neg: {
3189       XMMRegister dst = i.OutputSimd128Register();
3190       Operand src = i.InputOperand(0);
3191       if (src.is_reg(dst)) {
3192         __ Pcmpeqd(kScratchDoubleReg, kScratchDoubleReg);
3193         __ Psignb(dst, kScratchDoubleReg);
3194       } else {
3195         __ Pxor(dst, dst);
3196         __ Psubb(dst, src);
3197       }
3198       break;
3199     }
    case kIA32I8x16Shl: {
      // x86 has no 8-bit SIMD shift; emulate with a 16-bit word shift plus a
      // byte mask that clears bits shifted across byte boundaries.
      XMMRegister dst = i.OutputSimd128Register();
      DCHECK_EQ(dst, i.InputSimd128Register(0));
      Register tmp = i.ToRegister(instr->TempAt(0));
      XMMRegister tmp_simd = i.TempSimd128Register(1);

      if (HasImmediateInput(instr, 1)) {
        // Perform 16-bit shift, then mask away low bits.
        uint8_t shift = i.InputInt3(1);
        __ Psllw(dst, dst, byte{shift});

        // Broadcast the byte mask (0xff << shift) into every byte lane.
        uint8_t bmask = static_cast<uint8_t>(0xff << shift);
        uint32_t mask = bmask << 24 | bmask << 16 | bmask << 8 | bmask;
        __ mov(tmp, mask);
        __ Movd(tmp_simd, tmp);
        __ Pshufd(tmp_simd, tmp_simd, 0);
        __ Pand(dst, tmp_simd);
      } else {
        // Take shift value modulo 8.
        __ mov(tmp, i.InputRegister(1));
        __ and_(tmp, 7);
        // Mask off the unwanted bits before word-shifting.
        // Build (0xff >> shift) in every byte: shift an all-ones register
        // right by (shift + 8) per word, then pack words to bytes.
        __ Pcmpeqw(kScratchDoubleReg, kScratchDoubleReg);
        __ add(tmp, Immediate(8));
        __ Movd(tmp_simd, tmp);
        __ Psrlw(kScratchDoubleReg, kScratchDoubleReg, tmp_simd);
        __ Packuswb(kScratchDoubleReg, kScratchDoubleReg);
        __ Pand(dst, kScratchDoubleReg);
        // TODO(zhin): sub here to avoid asking for another temporary register,
        // examine codegen for other i8x16 shifts, they use less instructions.
        __ sub(tmp, Immediate(8));
        __ Movd(tmp_simd, tmp);
        __ Psllw(dst, dst, tmp_simd);
      }
      break;
    }
    case kIA32I8x16ShrS: {
      // No 8-bit arithmetic shift exists; widen bytes to words by duplicating
      // each byte into both halves of a word (punpck*bw with itself), do a
      // word arithmetic shift by (shift + 8), and narrow back with signed
      // saturation.
      XMMRegister dst = i.OutputSimd128Register();
      DCHECK_EQ(dst, i.InputSimd128Register(0));
      if (HasImmediateInput(instr, 1)) {
        // High half goes to the scratch register, low half stays in dst.
        __ Punpckhbw(kScratchDoubleReg, dst);
        __ Punpcklbw(dst, dst);
        uint8_t shift = i.InputInt3(1) + 8;
        __ Psraw(kScratchDoubleReg, shift);
        __ Psraw(dst, shift);
        __ Packsswb(dst, kScratchDoubleReg);
      } else {
        Register tmp = i.ToRegister(instr->TempAt(0));
        XMMRegister tmp_simd = i.TempSimd128Register(1);
        // Unpack the bytes into words, do arithmetic shifts, and repack.
        __ Punpckhbw(kScratchDoubleReg, dst);
        __ Punpcklbw(dst, dst);
        __ mov(tmp, i.InputRegister(1));
        // Take shift value modulo 8.
        __ and_(tmp, 7);
        __ add(tmp, Immediate(8));
        __ Movd(tmp_simd, tmp);
        __ Psraw(kScratchDoubleReg, kScratchDoubleReg, tmp_simd);
        __ Psraw(dst, dst, tmp_simd);
        __ Packsswb(dst, kScratchDoubleReg);
      }
      break;
    }
3263     case kSSEI8x16Add: {
3264       DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
3265       __ paddb(i.OutputSimd128Register(), i.InputOperand(1));
3266       break;
3267     }
3268     case kAVXI8x16Add: {
3269       CpuFeatureScope avx_scope(tasm(), AVX);
3270       __ vpaddb(i.OutputSimd128Register(), i.InputSimd128Register(0),
3271                 i.InputOperand(1));
3272       break;
3273     }
3274     case kSSEI8x16AddSatS: {
3275       DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
3276       __ paddsb(i.OutputSimd128Register(), i.InputOperand(1));
3277       break;
3278     }
3279     case kAVXI8x16AddSatS: {
3280       CpuFeatureScope avx_scope(tasm(), AVX);
3281       __ vpaddsb(i.OutputSimd128Register(), i.InputSimd128Register(0),
3282                  i.InputOperand(1));
3283       break;
3284     }
3285     case kSSEI8x16Sub: {
3286       DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
3287       __ psubb(i.OutputSimd128Register(), i.InputOperand(1));
3288       break;
3289     }
3290     case kAVXI8x16Sub: {
3291       CpuFeatureScope avx_scope(tasm(), AVX);
3292       __ vpsubb(i.OutputSimd128Register(), i.InputSimd128Register(0),
3293                 i.InputOperand(1));
3294       break;
3295     }
3296     case kSSEI8x16SubSatS: {
3297       DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
3298       __ psubsb(i.OutputSimd128Register(), i.InputOperand(1));
3299       break;
3300     }
3301     case kAVXI8x16SubSatS: {
3302       CpuFeatureScope avx_scope(tasm(), AVX);
3303       __ vpsubsb(i.OutputSimd128Register(), i.InputSimd128Register(0),
3304                  i.InputOperand(1));
3305       break;
3306     }
3307     case kSSEI8x16Mul: {
3308       XMMRegister dst = i.OutputSimd128Register();
3309       DCHECK_EQ(dst, i.InputSimd128Register(0));
3310       XMMRegister right = i.InputSimd128Register(1);
3311       XMMRegister tmp = i.TempSimd128Register(0);
3312 
3313       // I16x8 view of I8x16
3314       // left = AAaa AAaa ... AAaa AAaa
3315       // right= BBbb BBbb ... BBbb BBbb
3316 
3317       // t = 00AA 00AA ... 00AA 00AA
3318       // s = 00BB 00BB ... 00BB 00BB
3319       __ movaps(tmp, dst);
3320       __ movaps(kScratchDoubleReg, right);
3321       __ psrlw(tmp, 8);
3322       __ psrlw(kScratchDoubleReg, 8);
3323       // dst = left * 256
3324       __ psllw(dst, 8);
3325 
3326       // t = I16x8Mul(t, s)
3327       //    => __PP __PP ...  __PP  __PP
3328       __ pmullw(tmp, kScratchDoubleReg);
3329       // dst = I16x8Mul(left * 256, right)
3330       //    => pp__ pp__ ...  pp__  pp__
3331       __ pmullw(dst, right);
3332 
3333       // t = I16x8Shl(t, 8)
3334       //    => PP00 PP00 ...  PP00  PP00
3335       __ psllw(tmp, 8);
3336 
3337       // dst = I16x8Shr(dst, 8)
3338       //    => 00pp 00pp ...  00pp  00pp
3339       __ psrlw(dst, 8);
3340 
3341       // dst = I16x8Or(dst, t)
3342       //    => PPpp PPpp ...  PPpp  PPpp
3343       __ por(dst, tmp);
3344       break;
3345     }
3346     case kAVXI8x16Mul: {
3347       CpuFeatureScope avx_scope(tasm(), AVX);
3348       XMMRegister dst = i.OutputSimd128Register();
3349       XMMRegister left = i.InputSimd128Register(0);
3350       XMMRegister right = i.InputSimd128Register(1);
3351       XMMRegister tmp = i.TempSimd128Register(0);
3352 
3353       // I16x8 view of I8x16
3354       // left = AAaa AAaa ... AAaa AAaa
3355       // right= BBbb BBbb ... BBbb BBbb
3356 
3357       // t = 00AA 00AA ... 00AA 00AA
3358       // s = 00BB 00BB ... 00BB 00BB
3359       __ vpsrlw(tmp, left, 8);
3360       __ vpsrlw(kScratchDoubleReg, right, 8);
3361 
3362       // t = I16x8Mul(t0, t1)
3363       //    => __PP __PP ...  __PP  __PP
3364       __ vpmullw(tmp, tmp, kScratchDoubleReg);
3365 
3366       // s = left * 256
3367       __ vpsllw(kScratchDoubleReg, left, 8);
3368 
3369       // dst = I16x8Mul(left * 256, right)
3370       //    => pp__ pp__ ...  pp__  pp__
3371       __ vpmullw(dst, kScratchDoubleReg, right);
3372 
3373       // dst = I16x8Shr(dst, 8)
3374       //    => 00pp 00pp ...  00pp  00pp
3375       __ vpsrlw(dst, dst, 8);
3376 
3377       // t = I16x8Shl(t, 8)
3378       //    => PP00 PP00 ...  PP00  PP00
3379       __ vpsllw(tmp, tmp, 8);
3380 
3381       // dst = I16x8Or(dst, t)
3382       //    => PPpp PPpp ...  PPpp  PPpp
3383       __ vpor(dst, dst, tmp);
3384       break;
3385     }
3386     case kSSEI8x16MinS: {
3387       DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
3388       CpuFeatureScope sse_scope(tasm(), SSE4_1);
3389       __ pminsb(i.OutputSimd128Register(), i.InputOperand(1));
3390       break;
3391     }
3392     case kAVXI8x16MinS: {
3393       CpuFeatureScope avx_scope(tasm(), AVX);
3394       __ vpminsb(i.OutputSimd128Register(), i.InputSimd128Register(0),
3395                  i.InputOperand(1));
3396       break;
3397     }
3398     case kSSEI8x16MaxS: {
3399       DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
3400       CpuFeatureScope sse_scope(tasm(), SSE4_1);
3401       __ pmaxsb(i.OutputSimd128Register(), i.InputOperand(1));
3402       break;
3403     }
3404     case kAVXI8x16MaxS: {
3405       CpuFeatureScope avx_scope(tasm(), AVX);
3406       __ vpmaxsb(i.OutputSimd128Register(), i.InputSimd128Register(0),
3407                  i.InputOperand(1));
3408       break;
3409     }
3410     case kSSEI8x16Eq: {
3411       DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
3412       __ pcmpeqb(i.OutputSimd128Register(), i.InputOperand(1));
3413       break;
3414     }
3415     case kAVXI8x16Eq: {
3416       CpuFeatureScope avx_scope(tasm(), AVX);
3417       __ vpcmpeqb(i.OutputSimd128Register(), i.InputSimd128Register(0),
3418                   i.InputOperand(1));
3419       break;
3420     }
3421     case kSSEI8x16Ne: {
3422       DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
3423       __ pcmpeqb(i.OutputSimd128Register(), i.InputOperand(1));
3424       __ pcmpeqb(kScratchDoubleReg, kScratchDoubleReg);
3425       __ pxor(i.OutputSimd128Register(), kScratchDoubleReg);
3426       break;
3427     }
3428     case kAVXI8x16Ne: {
3429       CpuFeatureScope avx_scope(tasm(), AVX);
3430       __ vpcmpeqb(i.OutputSimd128Register(), i.InputSimd128Register(0),
3431                   i.InputOperand(1));
3432       __ vpcmpeqb(kScratchDoubleReg, kScratchDoubleReg, kScratchDoubleReg);
3433       __ vpxor(i.OutputSimd128Register(), i.OutputSimd128Register(),
3434                kScratchDoubleReg);
3435       break;
3436     }
3437     case kSSEI8x16GtS: {
3438       DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
3439       __ pcmpgtb(i.OutputSimd128Register(), i.InputOperand(1));
3440       break;
3441     }
3442     case kAVXI8x16GtS: {
3443       CpuFeatureScope avx_scope(tasm(), AVX);
3444       __ vpcmpgtb(i.OutputSimd128Register(), i.InputSimd128Register(0),
3445                   i.InputOperand(1));
3446       break;
3447     }
3448     case kSSEI8x16GeS: {
3449       DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
3450       CpuFeatureScope sse_scope(tasm(), SSE4_1);
3451       XMMRegister dst = i.OutputSimd128Register();
3452       Operand src = i.InputOperand(1);
3453       __ pminsb(dst, src);
3454       __ pcmpeqb(dst, src);
3455       break;
3456     }
3457     case kAVXI8x16GeS: {
3458       CpuFeatureScope avx_scope(tasm(), AVX);
3459       XMMRegister src1 = i.InputSimd128Register(0);
3460       Operand src2 = i.InputOperand(1);
3461       __ vpminsb(kScratchDoubleReg, src1, src2);
3462       __ vpcmpeqb(i.OutputSimd128Register(), kScratchDoubleReg, src2);
3463       break;
3464     }
3465     case kSSEI8x16UConvertI16x8: {
3466       DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
3467       CpuFeatureScope sse_scope(tasm(), SSE4_1);
3468       XMMRegister dst = i.OutputSimd128Register();
3469       __ packuswb(dst, i.InputOperand(1));
3470       break;
3471     }
3472     case kAVXI8x16UConvertI16x8: {
3473       DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
3474       CpuFeatureScope avx_scope(tasm(), AVX);
3475       XMMRegister dst = i.OutputSimd128Register();
3476       __ vpackuswb(dst, dst, i.InputOperand(1));
3477       break;
3478     }
3479     case kSSEI8x16AddSatU: {
3480       DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
3481       __ paddusb(i.OutputSimd128Register(), i.InputOperand(1));
3482       break;
3483     }
3484     case kAVXI8x16AddSatU: {
3485       CpuFeatureScope avx_scope(tasm(), AVX);
3486       __ vpaddusb(i.OutputSimd128Register(), i.InputSimd128Register(0),
3487                   i.InputOperand(1));
3488       break;
3489     }
3490     case kSSEI8x16SubSatU: {
3491       DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
3492       __ psubusb(i.OutputSimd128Register(), i.InputOperand(1));
3493       break;
3494     }
3495     case kAVXI8x16SubSatU: {
3496       CpuFeatureScope avx_scope(tasm(), AVX);
3497       __ vpsubusb(i.OutputSimd128Register(), i.InputSimd128Register(0),
3498                   i.InputOperand(1));
3499       break;
3500     }
    case kIA32I8x16ShrU: {
      // No 8-bit logical shift exists. Immediate case: 16-bit shift plus a
      // byte mask clearing bits that crossed byte boundaries. Variable case:
      // widen to words, shift, and repack.
      XMMRegister dst = i.OutputSimd128Register();
      DCHECK_EQ(dst, i.InputSimd128Register(0));
      Register tmp = i.ToRegister(instr->TempAt(0));
      XMMRegister tmp_simd = i.TempSimd128Register(1);

      if (HasImmediateInput(instr, 1)) {
        // Perform 16-bit shift, then mask away high bits.
        uint8_t shift = i.InputInt3(1);
        __ Psrlw(dst, dst, byte{shift});

        // Broadcast the byte mask (0xff >> shift) into every byte lane.
        uint8_t bmask = 0xff >> shift;
        uint32_t mask = bmask << 24 | bmask << 16 | bmask << 8 | bmask;
        __ mov(tmp, mask);
        __ Movd(tmp_simd, tmp);
        __ Pshufd(tmp_simd, tmp_simd, 0);
        __ Pand(dst, tmp_simd);
      } else {
        // Unpack the bytes into words, do logical shifts, and repack.
        __ Punpckhbw(kScratchDoubleReg, dst);
        __ Punpcklbw(dst, dst);
        __ mov(tmp, i.InputRegister(1));
        // Take shift value modulo 8.
        __ and_(tmp, 7);
        // Each word holds the byte duplicated in both halves ((b << 8) | b),
        // so shifting right by (shift + 8) leaves exactly (b >> shift).
        __ add(tmp, Immediate(8));
        __ Movd(tmp_simd, tmp);
        __ Psrlw(kScratchDoubleReg, kScratchDoubleReg, tmp_simd);
        __ Psrlw(dst, dst, tmp_simd);
        __ Packuswb(dst, kScratchDoubleReg);
      }
      break;
    }
3533     case kSSEI8x16MinU: {
3534       XMMRegister dst = i.OutputSimd128Register();
3535       DCHECK_EQ(dst, i.InputSimd128Register(0));
3536       __ pminub(dst, i.InputOperand(1));
3537       break;
3538     }
3539     case kAVXI8x16MinU: {
3540       CpuFeatureScope avx_scope(tasm(), AVX);
3541       __ vpminub(i.OutputSimd128Register(), i.InputSimd128Register(0),
3542                  i.InputOperand(1));
3543       break;
3544     }
3545     case kSSEI8x16MaxU: {
3546       DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
3547       __ pmaxub(i.OutputSimd128Register(), i.InputOperand(1));
3548       break;
3549     }
3550     case kAVXI8x16MaxU: {
3551       CpuFeatureScope avx_scope(tasm(), AVX);
3552       __ vpmaxub(i.OutputSimd128Register(), i.InputSimd128Register(0),
3553                  i.InputOperand(1));
3554       break;
3555     }
    case kSSEI8x16GtU: {
      // Unsigned greater-than via a > b  <=>  !(max(a, b) == b).
      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
      XMMRegister dst = i.OutputSimd128Register();
      Operand src = i.InputOperand(1);
      __ pmaxub(dst, src);
      __ pcmpeqb(dst, src);
      // Invert the comparison: build an all-ones register and xor with it.
      __ pcmpeqb(kScratchDoubleReg, kScratchDoubleReg);
      __ pxor(dst, kScratchDoubleReg);
      break;
    }
    case kAVXI8x16GtU: {
      // Same !(max(a, b) == b) trick, non-destructive via the scratch
      // register.
      CpuFeatureScope avx_scope(tasm(), AVX);
      XMMRegister dst = i.OutputSimd128Register();
      XMMRegister src1 = i.InputSimd128Register(0);
      Operand src2 = i.InputOperand(1);
      __ vpmaxub(kScratchDoubleReg, src1, src2);
      __ vpcmpeqb(dst, kScratchDoubleReg, src2);
      __ vpcmpeqb(kScratchDoubleReg, kScratchDoubleReg, kScratchDoubleReg);
      __ vpxor(dst, dst, kScratchDoubleReg);
      break;
    }
    case kSSEI8x16GeU: {
      // Unsigned a >= b  <=>  min(a, b) == b.
      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
      XMMRegister dst = i.OutputSimd128Register();
      Operand src = i.InputOperand(1);
      __ pminub(dst, src);
      __ pcmpeqb(dst, src);
      break;
    }
    case kAVXI8x16GeU: {
      CpuFeatureScope avx_scope(tasm(), AVX);
      XMMRegister src1 = i.InputSimd128Register(0);
      Operand src2 = i.InputOperand(1);
      __ vpminub(kScratchDoubleReg, src1, src2);
      __ vpcmpeqb(i.OutputSimd128Register(), kScratchDoubleReg, src2);
      break;
    }
3593     case kIA32I8x16RoundingAverageU: {
3594       __ Pavgb(i.OutputSimd128Register(), i.InputSimd128Register(0),
3595                i.InputOperand(1));
3596       break;
3597     }
3598     case kIA32I8x16Abs: {
3599       __ Pabsb(i.OutputSimd128Register(), i.InputSimd128Register(0));
3600       break;
3601     }
3602     case kIA32I8x16BitMask: {
3603       __ Pmovmskb(i.OutputRegister(), i.InputSimd128Register(0));
3604       break;
3605     }
    case kIA32S128Const: {
      // Materialize a 128-bit constant from four 32-bit instruction inputs:
      // the low 64 bits are moved at once, the two high 32-bit words are
      // inserted individually via a temp GPR.
      XMMRegister dst = i.OutputSimd128Register();
      Register tmp = i.TempRegister(0);
      uint64_t low_qword = make_uint64(i.InputUint32(1), i.InputUint32(0));
      __ Move(dst, low_qword);
      __ Move(tmp, Immediate(i.InputUint32(2)));
      __ Pinsrd(dst, tmp, 2);
      __ Move(tmp, Immediate(i.InputUint32(3)));
      __ Pinsrd(dst, tmp, 3);
      break;
    }
3617     case kIA32S128Zero: {
3618       XMMRegister dst = i.OutputSimd128Register();
3619       __ Pxor(dst, dst);
3620       break;
3621     }
3622     case kIA32S128AllOnes: {
3623       XMMRegister dst = i.OutputSimd128Register();
3624       __ Pcmpeqd(dst, dst);
3625       break;
3626     }
3627     case kSSES128Not: {
3628       XMMRegister dst = i.OutputSimd128Register();
3629       DCHECK_EQ(dst, i.InputSimd128Register(0));
3630       __ pcmpeqd(kScratchDoubleReg, kScratchDoubleReg);
3631       __ pxor(dst, kScratchDoubleReg);
3632       break;
3633     }
3634     case kAVXS128Not: {
3635       CpuFeatureScope avx_scope(tasm(), AVX);
3636       __ vpcmpeqd(kScratchDoubleReg, kScratchDoubleReg, kScratchDoubleReg);
3637       __ vpxor(i.OutputSimd128Register(), kScratchDoubleReg, i.InputOperand(0));
3638       break;
3639     }
3640     case kSSES128And: {
3641       DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
3642       __ pand(i.OutputSimd128Register(), i.InputOperand(1));
3643       break;
3644     }
3645     case kAVXS128And: {
3646       CpuFeatureScope avx_scope(tasm(), AVX);
3647       __ vpand(i.OutputSimd128Register(), i.InputSimd128Register(0),
3648                i.InputOperand(1));
3649       break;
3650     }
3651     case kSSES128Or: {
3652       DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
3653       __ por(i.OutputSimd128Register(), i.InputOperand(1));
3654       break;
3655     }
3656     case kAVXS128Or: {
3657       CpuFeatureScope avx_scope(tasm(), AVX);
3658       __ vpor(i.OutputSimd128Register(), i.InputSimd128Register(0),
3659               i.InputOperand(1));
3660       break;
3661     }
3662     case kSSES128Xor: {
3663       DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
3664       __ pxor(i.OutputSimd128Register(), i.InputOperand(1));
3665       break;
3666     }
3667     case kAVXS128Xor: {
3668       CpuFeatureScope avx_scope(tasm(), AVX);
3669       __ vpxor(i.OutputSimd128Register(), i.InputSimd128Register(0),
3670                i.InputOperand(1));
3671       break;
3672     }
    case kSSES128Select: {
      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
      // Mask used here is stored in dst.
      // Bitwise select: dst = (mask & (v1 ^ v2)) ^ v2, i.e. each result bit
      // comes from v1 where the mask bit is set, else from v2.
      XMMRegister dst = i.OutputSimd128Register();
      __ movaps(kScratchDoubleReg, i.InputSimd128Register(1));
      __ xorps(kScratchDoubleReg, i.InputSimd128Register(2));
      __ andps(dst, kScratchDoubleReg);
      __ xorps(dst, i.InputSimd128Register(2));
      break;
    }
    case kAVXS128Select: {
      // Same (mask & (v1 ^ v2)) ^ v2 bitwise select as the SSE case, but
      // non-destructive thanks to three-operand AVX forms.
      CpuFeatureScope avx_scope(tasm(), AVX);
      XMMRegister dst = i.OutputSimd128Register();
      __ vxorps(kScratchDoubleReg, i.InputSimd128Register(2),
                i.InputOperand(1));
      __ vandps(kScratchDoubleReg, kScratchDoubleReg, i.InputOperand(0));
      __ vxorps(dst, kScratchDoubleReg, i.InputSimd128Register(2));
      break;
    }
3692     case kIA32S128AndNot: {
3693       XMMRegister dst = i.OutputSimd128Register();
3694       DCHECK_EQ(dst, i.InputSimd128Register(0));
3695       // The inputs have been inverted by instruction selector, so we can call
3696       // andnps here without any modifications.
3697       XMMRegister src1 = i.InputSimd128Register(1);
3698       __ Andnps(dst, src1);
3699       break;
3700     }
3701     case kIA32I8x16Swizzle: {
3702       DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
3703       XMMRegister dst = i.OutputSimd128Register();
3704       XMMRegister mask = i.TempSimd128Register(0);
3705 
3706       // Out-of-range indices should return 0, add 112 so that any value > 15
3707       // saturates to 128 (top bit set), so pshufb will zero that lane.
3708       __ Move(mask, uint32_t{0x70707070});
3709       __ Pshufd(mask, mask, 0x0);
3710       __ Paddusb(mask, i.InputSimd128Register(1));
3711       __ Pshufb(dst, mask);
3712       break;
3713     }
    case kIA32I8x16Shuffle: {
      XMMRegister dst = i.OutputSimd128Register();
      Operand src0 = i.InputOperand(0);
      Register tmp = i.TempRegister(0);
      // Prepare 16 byte aligned buffer for shuffle control mask
      // (pshufb reads its control mask from memory here; esp is saved in tmp
      // and restored at the end).
      __ mov(tmp, esp);
      __ and_(esp, -16);
      if (instr->InputCount() == 5) {  // only one input operand
        DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
        // Push the four 32-bit mask words in reverse so the full mask sits in
        // lane order at Operand(esp, 0).
        for (int j = 4; j > 0; j--) {
          uint32_t mask = i.InputUint32(j);
          __ push(Immediate(mask));
        }
        __ Pshufb(dst, Operand(esp, 0));
      } else {  // two input operands
        DCHECK_EQ(6, instr->InputCount());
        __ movups(kScratchDoubleReg, src0);
        // First pass: shuffle src0, zeroing (index 0x80) every lane whose
        // shuffle index selects from src1 (i.e. index >= 16).
        for (int j = 5; j > 1; j--) {
          uint32_t lanes = i.InputUint32(j);
          uint32_t mask = 0;
          for (int k = 0; k < 32; k += 8) {
            uint8_t lane = lanes >> k;
            mask |= (lane < kSimd128Size ? lane : 0x80) << k;
          }
          __ push(Immediate(mask));
        }
        __ Pshufb(kScratchDoubleReg, Operand(esp, 0));
        Operand src1 = i.InputOperand(1);
        if (!src1.is_reg(dst)) __ movups(dst, src1);
        // Second pass: shuffle src1, zeroing lanes that select from src0 and
        // rebasing indices >= 16 into 0..15 with (lane & 0xF).
        for (int j = 5; j > 1; j--) {
          uint32_t lanes = i.InputUint32(j);
          uint32_t mask = 0;
          for (int k = 0; k < 32; k += 8) {
            uint8_t lane = lanes >> k;
            mask |= (lane >= kSimd128Size ? (lane & 0xF) : 0x80) << k;
          }
          __ push(Immediate(mask));
        }
        __ Pshufb(dst, Operand(esp, 0));
        // Combine the two half-shuffles; zeroed lanes don't overlap.
        __ por(dst, kScratchDoubleReg);
      }
      // Restore the original stack pointer.
      __ mov(esp, tmp);
      break;
    }
3758     case kIA32S128Load8Splat: {
3759       __ Pinsrb(i.OutputSimd128Register(), i.MemoryOperand(), 0);
3760       __ Pxor(kScratchDoubleReg, kScratchDoubleReg);
3761       __ Pshufb(i.OutputSimd128Register(), kScratchDoubleReg);
3762       break;
3763     }
3764     case kIA32S128Load16Splat: {
3765       __ Pinsrw(i.OutputSimd128Register(), i.MemoryOperand(), 0);
3766       __ Pshuflw(i.OutputSimd128Register(), i.OutputSimd128Register(),
3767                  uint8_t{0});
3768       __ Punpcklqdq(i.OutputSimd128Register(), i.OutputSimd128Register());
3769       break;
3770     }
3771     case kIA32S128Load32Splat: {
3772       __ Vbroadcastss(i.OutputSimd128Register(), i.MemoryOperand());
3773       break;
3774     }
3775     case kIA32S128Load64Splat: {
3776       __ Movddup(i.OutputSimd128Register(), i.MemoryOperand());
3777       break;
3778     }
3779     case kIA32S128Load8x8S: {
3780       __ Pmovsxbw(i.OutputSimd128Register(), i.MemoryOperand());
3781       break;
3782     }
3783     case kIA32S128Load8x8U: {
3784       __ Pmovzxbw(i.OutputSimd128Register(), i.MemoryOperand());
3785       break;
3786     }
3787     case kIA32S128Load16x4S: {
3788       __ Pmovsxwd(i.OutputSimd128Register(), i.MemoryOperand());
3789       break;
3790     }
3791     case kIA32S128Load16x4U: {
3792       __ Pmovzxwd(i.OutputSimd128Register(), i.MemoryOperand());
3793       break;
3794     }
3795     case kIA32S128Load32x2S: {
3796       __ Pmovsxdq(i.OutputSimd128Register(), i.MemoryOperand());
3797       break;
3798     }
3799     case kIA32S128Load32x2U: {
3800       __ Pmovzxdq(i.OutputSimd128Register(), i.MemoryOperand());
3801       break;
3802     }
3803     case kIA32S32x4Swizzle: {
3804       DCHECK_EQ(2, instr->InputCount());
3805       __ Pshufd(i.OutputSimd128Register(), i.InputOperand(0), i.InputInt8(1));
3806       break;
3807     }
3808     case kIA32S32x4Shuffle: {
3809       DCHECK_EQ(4, instr->InputCount());  // Swizzles should be handled above.
3810       int8_t shuffle = i.InputInt8(2);
3811       DCHECK_NE(0xe4, shuffle);  // A simple blend should be handled below.
3812       __ Pshufd(kScratchDoubleReg, i.InputOperand(1), shuffle);
3813       __ Pshufd(i.OutputSimd128Register(), i.InputOperand(0), shuffle);
3814       __ Pblendw(i.OutputSimd128Register(), kScratchDoubleReg, i.InputInt8(3));
3815       break;
3816     }
3817     case kIA32S16x8Blend:
3818       ASSEMBLE_SIMD_IMM_SHUFFLE(pblendw, SSE4_1, i.InputInt8(2));
3819       break;
3820     case kIA32S16x8HalfShuffle1: {
3821       XMMRegister dst = i.OutputSimd128Register();
3822       __ Pshuflw(dst, i.InputOperand(0), i.InputInt8(1));
3823       __ Pshufhw(dst, dst, i.InputInt8(2));
3824       break;
3825     }
3826     case kIA32S16x8HalfShuffle2: {
3827       XMMRegister dst = i.OutputSimd128Register();
3828       __ Pshuflw(kScratchDoubleReg, i.InputOperand(1), i.InputInt8(2));
3829       __ Pshufhw(kScratchDoubleReg, kScratchDoubleReg, i.InputInt8(3));
3830       __ Pshuflw(dst, i.InputOperand(0), i.InputInt8(2));
3831       __ Pshufhw(dst, dst, i.InputInt8(3));
3832       __ Pblendw(dst, kScratchDoubleReg, i.InputInt8(4));
3833       break;
3834     }
3835     case kIA32S8x16Alignr:
3836       ASSEMBLE_SIMD_IMM_SHUFFLE(palignr, SSSE3, i.InputInt8(2));
3837       break;
3838     case kIA32S16x8Dup: {
3839       XMMRegister dst = i.OutputSimd128Register();
3840       Operand src = i.InputOperand(0);
3841       int8_t lane = i.InputInt8(1) & 0x7;
3842       int8_t lane4 = lane & 0x3;
3843       int8_t half_dup = lane4 | (lane4 << 2) | (lane4 << 4) | (lane4 << 6);
3844       if (lane < 4) {
3845         __ Pshuflw(dst, src, half_dup);
3846         __ Pshufd(dst, dst, 0);
3847       } else {
3848         __ Pshufhw(dst, src, half_dup);
3849         __ Pshufd(dst, dst, 0xaa);
3850       }
3851       break;
3852     }
3853     case kIA32S8x16Dup: {
3854       XMMRegister dst = i.OutputSimd128Register();
3855       XMMRegister src = i.InputSimd128Register(0);
3856       int8_t lane = i.InputInt8(1) & 0xf;
3857       if (CpuFeatures::IsSupported(AVX)) {
3858         CpuFeatureScope avx_scope(tasm(), AVX);
3859         if (lane < 8) {
3860           __ vpunpcklbw(dst, src, src);
3861         } else {
3862           __ vpunpckhbw(dst, src, src);
3863         }
3864       } else {
3865         DCHECK_EQ(dst, src);
3866         if (lane < 8) {
3867           __ punpcklbw(dst, dst);
3868         } else {
3869           __ punpckhbw(dst, dst);
3870         }
3871       }
3872       lane &= 0x7;
3873       int8_t lane4 = lane & 0x3;
3874       int8_t half_dup = lane4 | (lane4 << 2) | (lane4 << 4) | (lane4 << 6);
3875       if (lane < 4) {
3876         __ Pshuflw(dst, dst, half_dup);
3877         __ Pshufd(dst, dst, 0);
3878       } else {
3879         __ Pshufhw(dst, dst, half_dup);
3880         __ Pshufd(dst, dst, 0xaa);
3881       }
3882       break;
3883     }
3884     case kIA32S64x2UnpackHigh:
3885       ASSEMBLE_SIMD_PUNPCK_SHUFFLE(punpckhqdq);
3886       break;
3887     case kIA32S32x4UnpackHigh:
3888       ASSEMBLE_SIMD_PUNPCK_SHUFFLE(punpckhdq);
3889       break;
3890     case kIA32S16x8UnpackHigh:
3891       ASSEMBLE_SIMD_PUNPCK_SHUFFLE(punpckhwd);
3892       break;
3893     case kIA32S8x16UnpackHigh:
3894       ASSEMBLE_SIMD_PUNPCK_SHUFFLE(punpckhbw);
3895       break;
3896     case kIA32S64x2UnpackLow:
3897       ASSEMBLE_SIMD_PUNPCK_SHUFFLE(punpcklqdq);
3898       break;
3899     case kIA32S32x4UnpackLow:
3900       ASSEMBLE_SIMD_PUNPCK_SHUFFLE(punpckldq);
3901       break;
3902     case kIA32S16x8UnpackLow:
3903       ASSEMBLE_SIMD_PUNPCK_SHUFFLE(punpcklwd);
3904       break;
3905     case kIA32S8x16UnpackLow:
3906       ASSEMBLE_SIMD_PUNPCK_SHUFFLE(punpcklbw);
3907       break;
3908     case kSSES16x8UnzipHigh: {
3909       CpuFeatureScope sse_scope(tasm(), SSE4_1);
3910       XMMRegister dst = i.OutputSimd128Register();
3911       XMMRegister src2 = dst;
3912       DCHECK_EQ(dst, i.InputSimd128Register(0));
3913       if (instr->InputCount() == 2) {
3914         __ movups(kScratchDoubleReg, i.InputOperand(1));
3915         __ psrld(kScratchDoubleReg, 16);
3916         src2 = kScratchDoubleReg;
3917       }
3918       __ psrld(dst, 16);
3919       __ packusdw(dst, src2);
3920       break;
3921     }
3922     case kAVXS16x8UnzipHigh: {
3923       CpuFeatureScope avx_scope(tasm(), AVX);
3924       XMMRegister dst = i.OutputSimd128Register();
3925       XMMRegister src2 = dst;
3926       if (instr->InputCount() == 2) {
3927         __ vpsrld(kScratchDoubleReg, i.InputSimd128Register(1), 16);
3928         src2 = kScratchDoubleReg;
3929       }
3930       __ vpsrld(dst, i.InputSimd128Register(0), 16);
3931       __ vpackusdw(dst, dst, src2);
3932       break;
3933     }
3934     case kSSES16x8UnzipLow: {
3935       CpuFeatureScope sse_scope(tasm(), SSE4_1);
3936       XMMRegister dst = i.OutputSimd128Register();
3937       XMMRegister src2 = dst;
3938       DCHECK_EQ(dst, i.InputSimd128Register(0));
3939       __ pxor(kScratchDoubleReg, kScratchDoubleReg);
3940       if (instr->InputCount() == 2) {
3941         __ pblendw(kScratchDoubleReg, i.InputOperand(1), 0x55);
3942         src2 = kScratchDoubleReg;
3943       }
3944       __ pblendw(dst, kScratchDoubleReg, 0xaa);
3945       __ packusdw(dst, src2);
3946       break;
3947     }
3948     case kAVXS16x8UnzipLow: {
3949       CpuFeatureScope avx_scope(tasm(), AVX);
3950       XMMRegister dst = i.OutputSimd128Register();
3951       XMMRegister src2 = dst;
3952       __ vpxor(kScratchDoubleReg, kScratchDoubleReg, kScratchDoubleReg);
3953       if (instr->InputCount() == 2) {
3954         __ vpblendw(kScratchDoubleReg, kScratchDoubleReg, i.InputOperand(1),
3955                     0x55);
3956         src2 = kScratchDoubleReg;
3957       }
3958       __ vpblendw(dst, kScratchDoubleReg, i.InputSimd128Register(0), 0x55);
3959       __ vpackusdw(dst, dst, src2);
3960       break;
3961     }
3962     case kSSES8x16UnzipHigh: {
3963       XMMRegister dst = i.OutputSimd128Register();
3964       XMMRegister src2 = dst;
3965       DCHECK_EQ(dst, i.InputSimd128Register(0));
3966       if (instr->InputCount() == 2) {
3967         __ movups(kScratchDoubleReg, i.InputOperand(1));
3968         __ psrlw(kScratchDoubleReg, 8);
3969         src2 = kScratchDoubleReg;
3970       }
3971       __ psrlw(dst, 8);
3972       __ packuswb(dst, src2);
3973       break;
3974     }
3975     case kAVXS8x16UnzipHigh: {
3976       CpuFeatureScope avx_scope(tasm(), AVX);
3977       XMMRegister dst = i.OutputSimd128Register();
3978       XMMRegister src2 = dst;
3979       if (instr->InputCount() == 2) {
3980         __ vpsrlw(kScratchDoubleReg, i.InputSimd128Register(1), 8);
3981         src2 = kScratchDoubleReg;
3982       }
3983       __ vpsrlw(dst, i.InputSimd128Register(0), 8);
3984       __ vpackuswb(dst, dst, src2);
3985       break;
3986     }
3987     case kSSES8x16UnzipLow: {
3988       XMMRegister dst = i.OutputSimd128Register();
3989       XMMRegister src2 = dst;
3990       DCHECK_EQ(dst, i.InputSimd128Register(0));
3991       if (instr->InputCount() == 2) {
3992         __ movups(kScratchDoubleReg, i.InputOperand(1));
3993         __ psllw(kScratchDoubleReg, 8);
3994         __ psrlw(kScratchDoubleReg, 8);
3995         src2 = kScratchDoubleReg;
3996       }
3997       __ psllw(dst, 8);
3998       __ psrlw(dst, 8);
3999       __ packuswb(dst, src2);
4000       break;
4001     }
4002     case kAVXS8x16UnzipLow: {
4003       CpuFeatureScope avx_scope(tasm(), AVX);
4004       XMMRegister dst = i.OutputSimd128Register();
4005       XMMRegister src2 = dst;
4006       if (instr->InputCount() == 2) {
4007         __ vpsllw(kScratchDoubleReg, i.InputSimd128Register(1), 8);
4008         __ vpsrlw(kScratchDoubleReg, kScratchDoubleReg, 8);
4009         src2 = kScratchDoubleReg;
4010       }
4011       __ vpsllw(dst, i.InputSimd128Register(0), 8);
4012       __ vpsrlw(dst, dst, 8);
4013       __ vpackuswb(dst, dst, src2);
4014       break;
4015     }
4016     case kSSES8x16TransposeLow: {
4017       XMMRegister dst = i.OutputSimd128Register();
4018       DCHECK_EQ(dst, i.InputSimd128Register(0));
4019       __ psllw(dst, 8);
4020       if (instr->InputCount() == 1) {
4021         __ movups(kScratchDoubleReg, dst);
4022       } else {
4023         DCHECK_EQ(2, instr->InputCount());
4024         __ movups(kScratchDoubleReg, i.InputOperand(1));
4025         __ psllw(kScratchDoubleReg, 8);
4026       }
4027       __ psrlw(dst, 8);
4028       __ por(dst, kScratchDoubleReg);
4029       break;
4030     }
4031     case kAVXS8x16TransposeLow: {
4032       CpuFeatureScope avx_scope(tasm(), AVX);
4033       XMMRegister dst = i.OutputSimd128Register();
4034       if (instr->InputCount() == 1) {
4035         __ vpsllw(kScratchDoubleReg, i.InputSimd128Register(0), 8);
4036         __ vpsrlw(dst, kScratchDoubleReg, 8);
4037       } else {
4038         DCHECK_EQ(2, instr->InputCount());
4039         __ vpsllw(kScratchDoubleReg, i.InputSimd128Register(1), 8);
4040         __ vpsllw(dst, i.InputSimd128Register(0), 8);
4041         __ vpsrlw(dst, dst, 8);
4042       }
4043       __ vpor(dst, dst, kScratchDoubleReg);
4044       break;
4045     }
4046     case kSSES8x16TransposeHigh: {
4047       XMMRegister dst = i.OutputSimd128Register();
4048       DCHECK_EQ(dst, i.InputSimd128Register(0));
4049       __ psrlw(dst, 8);
4050       if (instr->InputCount() == 1) {
4051         __ movups(kScratchDoubleReg, dst);
4052       } else {
4053         DCHECK_EQ(2, instr->InputCount());
4054         __ movups(kScratchDoubleReg, i.InputOperand(1));
4055         __ psrlw(kScratchDoubleReg, 8);
4056       }
4057       __ psllw(kScratchDoubleReg, 8);
4058       __ por(dst, kScratchDoubleReg);
4059       break;
4060     }
4061     case kAVXS8x16TransposeHigh: {
4062       CpuFeatureScope avx_scope(tasm(), AVX);
4063       XMMRegister dst = i.OutputSimd128Register();
4064       if (instr->InputCount() == 1) {
4065         __ vpsrlw(dst, i.InputSimd128Register(0), 8);
4066         __ vpsllw(kScratchDoubleReg, dst, 8);
4067       } else {
4068         DCHECK_EQ(2, instr->InputCount());
4069         __ vpsrlw(kScratchDoubleReg, i.InputSimd128Register(1), 8);
4070         __ vpsrlw(dst, i.InputSimd128Register(0), 8);
4071         __ vpsllw(kScratchDoubleReg, kScratchDoubleReg, 8);
4072       }
4073       __ vpor(dst, dst, kScratchDoubleReg);
4074       break;
4075     }
4076     case kSSES8x8Reverse:
4077     case kSSES8x4Reverse:
4078     case kSSES8x2Reverse: {
4079       DCHECK_EQ(1, instr->InputCount());
4080       XMMRegister dst = i.OutputSimd128Register();
4081       DCHECK_EQ(dst, i.InputSimd128Register(0));
4082       if (arch_opcode != kSSES8x2Reverse) {
4083         // First shuffle words into position.
4084         int8_t shuffle_mask = arch_opcode == kSSES8x4Reverse ? 0xB1 : 0x1B;
4085         __ pshuflw(dst, dst, shuffle_mask);
4086         __ pshufhw(dst, dst, shuffle_mask);
4087       }
4088       __ movaps(kScratchDoubleReg, dst);
4089       __ psrlw(kScratchDoubleReg, 8);
4090       __ psllw(dst, 8);
4091       __ por(dst, kScratchDoubleReg);
4092       break;
4093     }
4094     case kAVXS8x2Reverse:
4095     case kAVXS8x4Reverse:
4096     case kAVXS8x8Reverse: {
4097       DCHECK_EQ(1, instr->InputCount());
4098       CpuFeatureScope avx_scope(tasm(), AVX);
4099       XMMRegister dst = i.OutputSimd128Register();
4100       XMMRegister src = dst;
4101       if (arch_opcode != kAVXS8x2Reverse) {
4102         // First shuffle words into position.
4103         int8_t shuffle_mask = arch_opcode == kAVXS8x4Reverse ? 0xB1 : 0x1B;
4104         __ vpshuflw(dst, i.InputOperand(0), shuffle_mask);
4105         __ vpshufhw(dst, dst, shuffle_mask);
4106       } else {
4107         src = i.InputSimd128Register(0);
4108       }
4109       // Reverse each 16 bit lane.
4110       __ vpsrlw(kScratchDoubleReg, src, 8);
4111       __ vpsllw(dst, src, 8);
4112       __ vpor(dst, dst, kScratchDoubleReg);
4113       break;
4114     }
4115     case kIA32V32x4AnyTrue:
4116     case kIA32V16x8AnyTrue:
4117     case kIA32V8x16AnyTrue: {
4118       Register dst = i.OutputRegister();
4119       XMMRegister src = i.InputSimd128Register(0);
4120       Register tmp = i.TempRegister(0);
4121       __ xor_(tmp, tmp);
4122       __ mov(dst, Immediate(1));
4123       __ Ptest(src, src);
4124       __ cmov(zero, dst, tmp);
4125       break;
4126     }
4127     // Need to split up all the different lane structures because the
4128     // comparison instruction used matters, e.g. given 0xff00, pcmpeqb returns
4129     // 0x0011, pcmpeqw returns 0x0000, ptest will set ZF to 0 and 1
4130     // respectively.
4131     case kIA32V32x4AllTrue:
4132       ASSEMBLE_SIMD_ALL_TRUE(Pcmpeqd);
4133       break;
4134     case kIA32V16x8AllTrue:
4135       ASSEMBLE_SIMD_ALL_TRUE(pcmpeqw);
4136       break;
4137     case kIA32V8x16AllTrue: {
4138       ASSEMBLE_SIMD_ALL_TRUE(pcmpeqb);
4139       break;
4140     }
4141     case kIA32Word32AtomicPairLoad: {
4142       XMMRegister tmp = i.ToDoubleRegister(instr->TempAt(0));
4143       __ movq(tmp, i.MemoryOperand());
4144       __ Pextrd(i.OutputRegister(0), tmp, 0);
4145       __ Pextrd(i.OutputRegister(1), tmp, 1);
4146       break;
4147     }
4148     case kIA32Word32AtomicPairStore: {
4149       Label store;
4150       __ bind(&store);
4151       __ mov(i.TempRegister(0), i.MemoryOperand(2));
4152       __ mov(i.TempRegister(1), i.NextMemoryOperand(2));
4153       __ push(ebx);
4154       frame_access_state()->IncreaseSPDelta(1);
4155       i.MoveInstructionOperandToRegister(ebx, instr->InputAt(0));
4156       __ lock();
4157       __ cmpxchg8b(i.MemoryOperand(2));
4158       __ pop(ebx);
4159       frame_access_state()->IncreaseSPDelta(-1);
4160       __ j(not_equal, &store);
4161       break;
4162     }
4163     case kWord32AtomicExchangeInt8: {
4164       __ xchg_b(i.InputRegister(0), i.MemoryOperand(1));
4165       __ movsx_b(i.InputRegister(0), i.InputRegister(0));
4166       break;
4167     }
4168     case kWord32AtomicExchangeUint8: {
4169       __ xchg_b(i.InputRegister(0), i.MemoryOperand(1));
4170       __ movzx_b(i.InputRegister(0), i.InputRegister(0));
4171       break;
4172     }
4173     case kWord32AtomicExchangeInt16: {
4174       __ xchg_w(i.InputRegister(0), i.MemoryOperand(1));
4175       __ movsx_w(i.InputRegister(0), i.InputRegister(0));
4176       break;
4177     }
4178     case kWord32AtomicExchangeUint16: {
4179       __ xchg_w(i.InputRegister(0), i.MemoryOperand(1));
4180       __ movzx_w(i.InputRegister(0), i.InputRegister(0));
4181       break;
4182     }
4183     case kWord32AtomicExchangeWord32: {
4184       __ xchg(i.InputRegister(0), i.MemoryOperand(1));
4185       break;
4186     }
4187     case kIA32Word32AtomicPairExchange: {
4188       DCHECK(VerifyOutputOfAtomicPairInstr(&i, instr));
4189       Label exchange;
4190       __ bind(&exchange);
4191       __ mov(eax, i.MemoryOperand(2));
4192       __ mov(edx, i.NextMemoryOperand(2));
4193       __ push(ebx);
4194       frame_access_state()->IncreaseSPDelta(1);
4195       i.MoveInstructionOperandToRegister(ebx, instr->InputAt(0));
4196       __ lock();
4197       __ cmpxchg8b(i.MemoryOperand(2));
4198       __ pop(ebx);
4199       frame_access_state()->IncreaseSPDelta(-1);
4200       __ j(not_equal, &exchange);
4201       break;
4202     }
4203     case kWord32AtomicCompareExchangeInt8: {
4204       __ lock();
4205       __ cmpxchg_b(i.MemoryOperand(2), i.InputRegister(1));
4206       __ movsx_b(eax, eax);
4207       break;
4208     }
4209     case kWord32AtomicCompareExchangeUint8: {
4210       __ lock();
4211       __ cmpxchg_b(i.MemoryOperand(2), i.InputRegister(1));
4212       __ movzx_b(eax, eax);
4213       break;
4214     }
4215     case kWord32AtomicCompareExchangeInt16: {
4216       __ lock();
4217       __ cmpxchg_w(i.MemoryOperand(2), i.InputRegister(1));
4218       __ movsx_w(eax, eax);
4219       break;
4220     }
4221     case kWord32AtomicCompareExchangeUint16: {
4222       __ lock();
4223       __ cmpxchg_w(i.MemoryOperand(2), i.InputRegister(1));
4224       __ movzx_w(eax, eax);
4225       break;
4226     }
4227     case kWord32AtomicCompareExchangeWord32: {
4228       __ lock();
4229       __ cmpxchg(i.MemoryOperand(2), i.InputRegister(1));
4230       break;
4231     }
4232     case kIA32Word32AtomicPairCompareExchange: {
4233       __ push(ebx);
4234       frame_access_state()->IncreaseSPDelta(1);
4235       i.MoveInstructionOperandToRegister(ebx, instr->InputAt(2));
4236       __ lock();
4237       __ cmpxchg8b(i.MemoryOperand(4));
4238       __ pop(ebx);
4239       frame_access_state()->IncreaseSPDelta(-1);
4240       break;
4241     }
4242 #define ATOMIC_BINOP_CASE(op, inst)                \
4243   case kWord32Atomic##op##Int8: {                  \
4244     ASSEMBLE_ATOMIC_BINOP(inst, mov_b, cmpxchg_b); \
4245     __ movsx_b(eax, eax);                          \
4246     break;                                         \
4247   }                                                \
4248   case kWord32Atomic##op##Uint8: {                 \
4249     ASSEMBLE_ATOMIC_BINOP(inst, mov_b, cmpxchg_b); \
4250     __ movzx_b(eax, eax);                          \
4251     break;                                         \
4252   }                                                \
4253   case kWord32Atomic##op##Int16: {                 \
4254     ASSEMBLE_ATOMIC_BINOP(inst, mov_w, cmpxchg_w); \
4255     __ movsx_w(eax, eax);                          \
4256     break;                                         \
4257   }                                                \
4258   case kWord32Atomic##op##Uint16: {                \
4259     ASSEMBLE_ATOMIC_BINOP(inst, mov_w, cmpxchg_w); \
4260     __ movzx_w(eax, eax);                          \
4261     break;                                         \
4262   }                                                \
4263   case kWord32Atomic##op##Word32: {                \
4264     ASSEMBLE_ATOMIC_BINOP(inst, mov, cmpxchg);     \
4265     break;                                         \
4266   }
4267       ATOMIC_BINOP_CASE(Add, add)
4268       ATOMIC_BINOP_CASE(Sub, sub)
4269       ATOMIC_BINOP_CASE(And, and_)
4270       ATOMIC_BINOP_CASE(Or, or_)
4271       ATOMIC_BINOP_CASE(Xor, xor_)
4272 #undef ATOMIC_BINOP_CASE
4273 #define ATOMIC_BINOP_CASE(op, instr1, instr2)         \
4274   case kIA32Word32AtomicPair##op: {                   \
4275     DCHECK(VerifyOutputOfAtomicPairInstr(&i, instr)); \
4276     ASSEMBLE_I64ATOMIC_BINOP(instr1, instr2)          \
4277     break;                                            \
4278   }
4279       ATOMIC_BINOP_CASE(Add, add, adc)
4280       ATOMIC_BINOP_CASE(And, and_, and_)
4281       ATOMIC_BINOP_CASE(Or, or_, or_)
4282       ATOMIC_BINOP_CASE(Xor, xor_, xor_)
4283 #undef ATOMIC_BINOP_CASE
4284     case kIA32Word32AtomicPairSub: {
4285       DCHECK(VerifyOutputOfAtomicPairInstr(&i, instr));
4286       Label binop;
4287       __ bind(&binop);
4288       // Move memory operand into edx:eax
4289       __ mov(eax, i.MemoryOperand(2));
4290       __ mov(edx, i.NextMemoryOperand(2));
4291       // Save input registers temporarily on the stack.
4292       __ push(ebx);
4293       frame_access_state()->IncreaseSPDelta(1);
4294       i.MoveInstructionOperandToRegister(ebx, instr->InputAt(0));
4295       __ push(i.InputRegister(1));
4296       // Negate input in place
4297       __ neg(ebx);
4298       __ adc(i.InputRegister(1), 0);
4299       __ neg(i.InputRegister(1));
4300       // Add memory operand, negated input.
4301       __ add(ebx, eax);
4302       __ adc(i.InputRegister(1), edx);
4303       __ lock();
4304       __ cmpxchg8b(i.MemoryOperand(2));
4305       // Restore input registers
4306       __ pop(i.InputRegister(1));
4307       __ pop(ebx);
4308       frame_access_state()->IncreaseSPDelta(-1);
4309       __ j(not_equal, &binop);
4310       break;
4311     }
4312     case kWord32AtomicLoadInt8:
4313     case kWord32AtomicLoadUint8:
4314     case kWord32AtomicLoadInt16:
4315     case kWord32AtomicLoadUint16:
4316     case kWord32AtomicLoadWord32:
4317     case kWord32AtomicStoreWord8:
4318     case kWord32AtomicStoreWord16:
4319     case kWord32AtomicStoreWord32:
4320       UNREACHABLE();  // Won't be generated by instruction selector.
4321       break;
4322   }
4323   return kSuccess;
4324 }  // NOLINT(readability/fn_size)
4325 
FlagsConditionToCondition(FlagsCondition condition)4326 static Condition FlagsConditionToCondition(FlagsCondition condition) {
4327   switch (condition) {
4328     case kUnorderedEqual:
4329     case kEqual:
4330       return equal;
4331       break;
4332     case kUnorderedNotEqual:
4333     case kNotEqual:
4334       return not_equal;
4335       break;
4336     case kSignedLessThan:
4337       return less;
4338       break;
4339     case kSignedGreaterThanOrEqual:
4340       return greater_equal;
4341       break;
4342     case kSignedLessThanOrEqual:
4343       return less_equal;
4344       break;
4345     case kSignedGreaterThan:
4346       return greater;
4347       break;
4348     case kUnsignedLessThan:
4349       return below;
4350       break;
4351     case kUnsignedGreaterThanOrEqual:
4352       return above_equal;
4353       break;
4354     case kUnsignedLessThanOrEqual:
4355       return below_equal;
4356       break;
4357     case kUnsignedGreaterThan:
4358       return above;
4359       break;
4360     case kOverflow:
4361       return overflow;
4362       break;
4363     case kNotOverflow:
4364       return no_overflow;
4365       break;
4366     default:
4367       UNREACHABLE();
4368   }
4369 }
4370 
4371 // Assembles a branch after an instruction.
AssembleArchBranch(Instruction * instr,BranchInfo * branch)4372 void CodeGenerator::AssembleArchBranch(Instruction* instr, BranchInfo* branch) {
4373   Label::Distance flabel_distance =
4374       branch->fallthru ? Label::kNear : Label::kFar;
4375   Label* tlabel = branch->true_label;
4376   Label* flabel = branch->false_label;
4377   if (branch->condition == kUnorderedEqual) {
4378     __ j(parity_even, flabel, flabel_distance);
4379   } else if (branch->condition == kUnorderedNotEqual) {
4380     __ j(parity_even, tlabel);
4381   }
4382   __ j(FlagsConditionToCondition(branch->condition), tlabel);
4383 
4384   // Add a jump if not falling through to the next block.
4385   if (!branch->fallthru) __ jmp(flabel);
4386 }
4387 
// Branch poisoning is not supported on ia32, so this hook must never be
// reached; the remaining infrastructure exists only for interface parity
// with other architectures.
void CodeGenerator::AssembleBranchPoisoning(FlagsCondition condition,
                                            Instruction* instr) {
  // TODO(860429): Remove remaining poisoning infrastructure on ia32.
  UNREACHABLE();
}
4393 
// Deoptimization branches need no special handling on ia32; emit them exactly
// like ordinary architecture branches.
void CodeGenerator::AssembleArchDeoptBranch(Instruction* instr,
                                            BranchInfo* branch) {
  AssembleArchBranch(instr, branch);
}
4398 
AssembleArchJump(RpoNumber target)4399 void CodeGenerator::AssembleArchJump(RpoNumber target) {
4400   if (!IsNextInAssemblyOrder(target)) __ jmp(GetLabel(target));
4401 }
4402 
// Assembles a conditional WebAssembly trap: when `condition` holds, control
// transfers to an out-of-line stub call that raises the trap; otherwise
// execution continues after this instruction. The trap id is encoded as the
// instruction's last input.
void CodeGenerator::AssembleArchTrap(Instruction* instr,
                                     FlagsCondition condition) {
  // Out-of-line code holding the actual trap call, so the hot path only pays
  // for a conditional jump.
  class OutOfLineTrap final : public OutOfLineCode {
   public:
    OutOfLineTrap(CodeGenerator* gen, Instruction* instr)
        : OutOfLineCode(gen), instr_(instr), gen_(gen) {}

    void Generate() final {
      IA32OperandConverter i(gen_, instr_);
      // The trap id is the last input operand of the instruction.
      TrapId trap_id =
          static_cast<TrapId>(i.InputInt32(instr_->InputCount() - 1));
      GenerateCallToTrap(trap_id);
    }

   private:
    void GenerateCallToTrap(TrapId trap_id) {
      if (trap_id == TrapId::kInvalid) {
        // We cannot test calls to the runtime in cctest/test-run-wasm.
        // Therefore we emit a call to C here instead of a call to the runtime.
        __ PrepareCallCFunction(0, esi);
        __ CallCFunction(
            ExternalReference::wasm_call_trap_callback_for_testing(), 0);
        __ LeaveFrame(StackFrame::WASM);
        auto call_descriptor = gen_->linkage()->GetIncomingDescriptor();
        // Pop the incoming stack parameters before returning to the caller.
        size_t pop_size =
            call_descriptor->StackParameterCount() * kSystemPointerSize;
        // Use ecx as a scratch register, we return anyways immediately.
        __ Ret(static_cast<int>(pop_size), ecx);
      } else {
        gen_->AssembleSourcePosition(instr_);
        // A direct call to a wasm runtime stub defined in this module.
        // Just encode the stub index. This will be patched when the code
        // is added to the native module and copied into wasm code space.
        __ wasm_call(static_cast<Address>(trap_id), RelocInfo::WASM_STUB_CALL);
        // Record an empty safepoint for the stub call, then assert that the
        // trap stub does not return here.
        ReferenceMap* reference_map =
            gen_->zone()->New<ReferenceMap>(gen_->zone());
        gen_->RecordSafepoint(reference_map, Safepoint::kNoLazyDeopt);
        __ AssertUnreachable(AbortReason::kUnexpectedReturnFromWasmTrap);
      }
    }

    Instruction* instr_;
    CodeGenerator* gen_;
  };
  auto ool = zone()->New<OutOfLineTrap>(this, instr);
  Label* tlabel = ool->entry();
  Label end;
  // Unordered (NaN) float-compare results set the parity flag; route them to
  // the correct outcome before testing the main condition.
  if (condition == kUnorderedEqual) {
    __ j(parity_even, &end, Label::kNear);
  } else if (condition == kUnorderedNotEqual) {
    __ j(parity_even, tlabel);
  }
  __ j(FlagsConditionToCondition(condition), tlabel);
  __ bind(&end);
}
4458 
// Assembles boolean materializations after an instruction: sets the output
// register to 1 if `condition` holds over the current flags, else to 0.
void CodeGenerator::AssembleArchBoolean(Instruction* instr,
                                        FlagsCondition condition) {
  IA32OperandConverter i(this, instr);
  Label done;

  // Materialize a full 32-bit 1 or 0 value. The result register is always the
  // last output of the instruction.
  Label check;
  DCHECK_NE(0u, instr->OutputCount());
  Register reg = i.OutputRegister(instr->OutputCount() - 1);
  // Unordered (NaN) float-compare results set the parity flag; handle that
  // outcome first, then fall through to the ordinary condition check.
  if (condition == kUnorderedEqual) {
    // Unordered => not equal: result is 0.
    __ j(parity_odd, &check, Label::kNear);
    __ Move(reg, Immediate(0));
    __ jmp(&done, Label::kNear);
  } else if (condition == kUnorderedNotEqual) {
    // Unordered => not equal: result is 1.
    __ j(parity_odd, &check, Label::kNear);
    __ mov(reg, Immediate(1));
    __ jmp(&done, Label::kNear);
  }
  Condition cc = FlagsConditionToCondition(condition);

  __ bind(&check);
  if (reg.is_byte_register()) {
    // setcc for byte registers (al, bl, cl, dl).
    __ setcc(cc, reg);
    __ movzx_b(reg, reg);
  } else {
    // Emit a branch to set a register to either 1 or 0.
    Label set;
    __ j(cc, &set, Label::kNear);
    __ Move(reg, Immediate(0));
    __ jmp(&done, Label::kNear);
    __ bind(&set);
    __ mov(reg, Immediate(1));
  }
  __ bind(&done);
}
4497 
AssembleArchBinarySearchSwitch(Instruction * instr)4498 void CodeGenerator::AssembleArchBinarySearchSwitch(Instruction* instr) {
4499   IA32OperandConverter i(this, instr);
4500   Register input = i.InputRegister(0);
4501   std::vector<std::pair<int32_t, Label*>> cases;
4502   for (size_t index = 2; index < instr->InputCount(); index += 2) {
4503     cases.push_back({i.InputInt32(index + 0), GetLabel(i.InputRpo(index + 1))});
4504   }
4505   AssembleArchBinarySearchSwitchRange(input, i.InputRpo(1), cases.data(),
4506                                       cases.data() + cases.size());
4507 }
4508 
AssembleArchTableSwitch(Instruction * instr)4509 void CodeGenerator::AssembleArchTableSwitch(Instruction* instr) {
4510   IA32OperandConverter i(this, instr);
4511   Register input = i.InputRegister(0);
4512   size_t const case_count = instr->InputCount() - 2;
4513   Label** cases = zone()->NewArray<Label*>(case_count);
4514   for (size_t index = 0; index < case_count; ++index) {
4515     cases[index] = GetLabel(i.InputRpo(index + 2));
4516   }
4517   Label* const table = AddJumpTable(cases, case_count);
4518   __ cmp(input, Immediate(case_count));
4519   __ j(above_equal, GetLabel(i.InputRpo(1)));
4520   __ jmp(Operand::JumpTable(input, times_system_pointer_size, table));
4521 }
4522 
4523 // The calling convention for JSFunctions on IA32 passes arguments on the
4524 // stack and the JSFunction and context in EDI and ESI, respectively, thus
4525 // the steps of the call look as follows:
4526 
4527 // --{ before the call instruction }--------------------------------------------
4528 //                                                         |  caller frame |
4529 //                                                         ^ esp           ^ ebp
4530 
4531 // --{ push arguments and setup ESI, EDI }--------------------------------------
4532 //                                       | args + receiver |  caller frame |
4533 //                                       ^ esp                             ^ ebp
4534 //                 [edi = JSFunction, esi = context]
4535 
4536 // --{ call [edi + kCodeEntryOffset] }------------------------------------------
4537 //                                 | RET | args + receiver |  caller frame |
4538 //                                 ^ esp                                   ^ ebp
4539 
4540 // =={ prologue of called function }============================================
4541 // --{ push ebp }---------------------------------------------------------------
4542 //                            | FP | RET | args + receiver |  caller frame |
4543 //                            ^ esp                                        ^ ebp
4544 
4545 // --{ mov ebp, esp }-----------------------------------------------------------
4546 //                            | FP | RET | args + receiver |  caller frame |
4547 //                            ^ ebp,esp
4548 
4549 // --{ push esi }---------------------------------------------------------------
4550 //                      | CTX | FP | RET | args + receiver |  caller frame |
4551 //                      ^esp  ^ ebp
4552 
4553 // --{ push edi }---------------------------------------------------------------
4554 //                | FNC | CTX | FP | RET | args + receiver |  caller frame |
4555 //                ^esp        ^ ebp
4556 
4557 // --{ subi esp, #N }-----------------------------------------------------------
4558 // | callee frame | FNC | CTX | FP | RET | args + receiver |  caller frame |
4559 // ^esp                       ^ ebp
4560 
4561 // =={ body of called function }================================================
4562 
4563 // =={ epilogue of called function }============================================
4564 // --{ mov esp, ebp }-----------------------------------------------------------
4565 //                            | FP | RET | args + receiver |  caller frame |
4566 //                            ^ esp,ebp
4567 
4568 // --{ pop ebp }-----------------------------------------------------------
4569 // |                               | RET | args + receiver |  caller frame |
4570 //                                 ^ esp                                   ^ ebp
4571 
4572 // --{ ret #A+1 }-----------------------------------------------------------
4573 // |                                                       |  caller frame |
4574 //                                                         ^ esp           ^ ebp
4575 
4576 // Runtime function calls are accomplished by doing a stub call to the
// CEntry (a real code object). On IA32, arguments are passed on the
// stack, the number of arguments in EAX, the address of the runtime function
// in EBX, and the context in ESI.
4580 
4581 // --{ before the call instruction }--------------------------------------------
4582 //                                                         |  caller frame |
4583 //                                                         ^ esp           ^ ebp
4584 
4585 // --{ push arguments and setup EAX, EBX, and ESI }-----------------------------
4586 //                                       | args + receiver |  caller frame |
4587 //                                       ^ esp                             ^ ebp
4588 //              [eax = #args, ebx = runtime function, esi = context]
4589 
4590 // --{ call #CEntry }-----------------------------------------------------------
4591 //                                 | RET | args + receiver |  caller frame |
4592 //                                 ^ esp                                   ^ ebp
4593 
4594 // =={ body of runtime function }===============================================
4595 
4596 // --{ runtime returns }--------------------------------------------------------
4597 //                                                         |  caller frame |
4598 //                                                         ^ esp           ^ ebp
4599 
4600 // Other custom linkages (e.g. for calling directly into and out of C++) may
4601 // need to save callee-saved registers on the stack, which is done in the
4602 // function prologue of generated code.
4603 
4604 // --{ before the call instruction }--------------------------------------------
4605 //                                                         |  caller frame |
4606 //                                                         ^ esp           ^ ebp
4607 
4608 // --{ set up arguments in registers on stack }---------------------------------
4609 //                                                  | args |  caller frame |
4610 //                                                  ^ esp                  ^ ebp
4611 //                  [r0 = arg0, r1 = arg1, ...]
4612 
4613 // --{ call code }--------------------------------------------------------------
4614 //                                            | RET | args |  caller frame |
4615 //                                            ^ esp                        ^ ebp
4616 
4617 // =={ prologue of called function }============================================
4618 // --{ push ebp }---------------------------------------------------------------
4619 //                                       | FP | RET | args |  caller frame |
4620 //                                       ^ esp                             ^ ebp
4621 
4622 // --{ mov ebp, esp }-----------------------------------------------------------
4623 //                                       | FP | RET | args |  caller frame |
4624 //                                       ^ ebp,esp
4625 
4626 // --{ save registers }---------------------------------------------------------
4627 //                                | regs | FP | RET | args |  caller frame |
4628 //                                ^ esp  ^ ebp
4629 
4630 // --{ subi esp, #N }-----------------------------------------------------------
4631 //                 | callee frame | regs | FP | RET | args |  caller frame |
4632 //                 ^esp                  ^ ebp
4633 
4634 // =={ body of called function }================================================
4635 
4636 // =={ epilogue of called function }============================================
4637 // --{ restore registers }------------------------------------------------------
4638 //                                | regs | FP | RET | args |  caller frame |
4639 //                                ^ esp  ^ ebp
4640 
4641 // --{ mov esp, ebp }-----------------------------------------------------------
4642 //                                       | FP | RET | args |  caller frame |
4643 //                                       ^ esp,ebp
4644 
4645 // --{ pop ebp }----------------------------------------------------------------
4646 //                                            | RET | args |  caller frame |
4647 //                                            ^ esp                        ^ ebp
4648 
FinishFrame(Frame * frame)4649 void CodeGenerator::FinishFrame(Frame* frame) {
4650   auto call_descriptor = linkage()->GetIncomingDescriptor();
4651   const RegList saves = call_descriptor->CalleeSavedRegisters();
4652   if (saves != 0) {  // Save callee-saved registers.
4653     DCHECK(!info()->is_osr());
4654     int pushed = 0;
4655     for (int i = Register::kNumRegisters - 1; i >= 0; i--) {
4656       if (!((1 << i) & saves)) continue;
4657       ++pushed;
4658     }
4659     frame->AllocateSavedCalleeRegisterSlots(pushed);
4660   }
4661 }
4662 
// Emits the frame prologue for the current function: sets up the frame
// pointer (or stub/JS prologue), unpacks wasm wrapper tuples, performs the
// wasm stack-overflow check for large frames, allocates spill slots, pushes
// callee-saved registers, and finally allocates return slots. The emission
// order here must match the frame layout diagrams above and the teardown in
// AssembleReturn.
void CodeGenerator::AssembleConstructFrame() {
  auto call_descriptor = linkage()->GetIncomingDescriptor();
  if (frame_access_state()->has_frame()) {
    if (call_descriptor->IsCFunctionCall()) {
      // Plain C frame: push ebp; mov ebp, esp (see diagrams above).
      __ push(ebp);
      __ mov(ebp, esp);
      if (info()->GetOutputStackFrameType() == StackFrame::C_WASM_ENTRY) {
        __ Push(Immediate(StackFrame::TypeToMarker(StackFrame::C_WASM_ENTRY)));
        // Reserve stack space for saving the c_entry_fp later.
        __ AllocateStackSpace(kSystemPointerSize);
      }
    } else if (call_descriptor->IsJSFunctionCall()) {
      __ Prologue();
    } else {
      __ StubPrologue(info()->GetOutputStackFrameType());
      if (call_descriptor->IsWasmFunctionCall()) {
        __ push(kWasmInstanceRegister);
      } else if (call_descriptor->IsWasmImportWrapper() ||
                 call_descriptor->IsWasmCapiFunction()) {
        // Wasm import wrappers are passed a tuple in the place of the instance.
        // Unpack the tuple into the instance and the target callable.
        // This must be done here in the codegen because it cannot be expressed
        // properly in the graph.
        __ mov(kJSFunctionRegister,
               Operand(kWasmInstanceRegister,
                       Tuple2::kValue2Offset - kHeapObjectTag));
        __ mov(kWasmInstanceRegister,
               Operand(kWasmInstanceRegister,
                       Tuple2::kValue1Offset - kHeapObjectTag));
        __ push(kWasmInstanceRegister);
        if (call_descriptor->IsWasmCapiFunction()) {
          // Reserve space for saving the PC later.
          __ AllocateStackSpace(kSystemPointerSize);
        }
      }
    }
  }

  // Slots still to be allocated beyond the fixed frame header.
  int required_slots =
      frame()->GetTotalFrameSlotCount() - frame()->GetFixedSlotCount();

  if (info()->is_osr()) {
    // TurboFan OSR-compiled functions cannot be entered directly.
    __ Abort(AbortReason::kShouldNotDirectlyEnterOsrFunction);

    // Unoptimized code jumps directly to this entrypoint while the unoptimized
    // frame is still on the stack. Optimized code uses OSR values directly from
    // the unoptimized frame. Thus, all that needs to be done is to allocate the
    // remaining stack slots.
    if (FLAG_code_comments) __ RecordComment("-- OSR entrypoint --");
    osr_pc_offset_ = __ pc_offset();
    required_slots -= osr_helper()->UnoptimizedFrameSlots();
  }

  const RegList saves = call_descriptor->CalleeSavedRegisters();
  if (required_slots > 0) {
    DCHECK(frame_access_state()->has_frame());
    if (info()->IsWasm() && required_slots > 128) {
      // For WebAssembly functions with big frames we have to do the stack
      // overflow check before we construct the frame. Otherwise we may not
      // have enough space on the stack to call the runtime for the stack
      // overflow.
      Label done;

      // If the frame is bigger than the stack, we throw the stack overflow
      // exception unconditionally. Thereby we can avoid the integer overflow
      // check in the condition code.
      if (required_slots * kSystemPointerSize < FLAG_stack_size * 1024) {
        // The scratch register is pushed/popped around the check, so any GP
        // register works here.
        Register scratch = esi;
        __ push(scratch);
        __ mov(scratch,
               FieldOperand(kWasmInstanceRegister,
                            WasmInstanceObject::kRealStackLimitAddressOffset));
        __ mov(scratch, Operand(scratch, 0));
        // scratch = real stack limit + frame size; the frame fits iff
        // esp >= scratch.
        __ add(scratch, Immediate(required_slots * kSystemPointerSize));
        __ cmp(esp, scratch);
        __ pop(scratch);
        __ j(above_equal, &done, Label::kNear);
      }

      __ wasm_call(wasm::WasmCode::kWasmStackOverflow,
                   RelocInfo::WASM_STUB_CALL);
      // An empty reference map suffices; the trap call does not return.
      ReferenceMap* reference_map = zone()->New<ReferenceMap>(zone());
      RecordSafepoint(reference_map, Safepoint::kNoLazyDeopt);
      __ AssertUnreachable(AbortReason::kUnexpectedReturnFromWasmTrap);
      __ bind(&done);
    }

    // Skip callee-saved and return slots, which are created below.
    required_slots -= base::bits::CountPopulation(saves);
    required_slots -= frame()->GetReturnSlotCount();
    if (required_slots > 0) {
      __ AllocateStackSpace(required_slots * kSystemPointerSize);
    }
  }

  if (saves != 0) {  // Save callee-saved registers.
    DCHECK(!info()->is_osr());
    // Push in descending register-code order; AssembleReturn pops in
    // ascending order to match.
    for (int i = Register::kNumRegisters - 1; i >= 0; i--) {
      if (((1 << i) & saves)) __ push(Register::from_code(i));
    }
  }

  // Allocate return slots (located after callee-saved).
  if (frame()->GetReturnSlotCount() > 0) {
    __ AllocateStackSpace(frame()->GetReturnSlotCount() * kSystemPointerSize);
  }
}
4771 
// Emits the frame epilogue and return sequence: restores callee-saved
// registers, tears down the frame, then pops stack arguments. The number of
// slots popped is either the static parameter count, parameter_count plus an
// additional immediate/register count, or (for JS calls without the arguments
// adaptor) max(argc + 1, parameter_count).
void CodeGenerator::AssembleReturn(InstructionOperand* additional_pop_count) {
  auto call_descriptor = linkage()->GetIncomingDescriptor();

  const RegList saves = call_descriptor->CalleeSavedRegisters();
  // Restore registers.
  if (saves != 0) {
    // Return slots were allocated above (i.e. at lower addresses than) the
    // callee-saved registers; drop them first.
    const int returns = frame()->GetReturnSlotCount();
    if (returns != 0) {
      __ add(esp, Immediate(returns * kSystemPointerSize));
    }
    // Pop in ascending register-code order — the reverse of the pushes in
    // AssembleConstructFrame.
    for (int i = 0; i < Register::kNumRegisters; i++) {
      if (!((1 << i) & saves)) continue;
      __ pop(Register::from_code(i));
    }
  }

  // We might need ecx and edx for scratch.
  DCHECK_EQ(0u, call_descriptor->CalleeSavedRegisters() & edx.bit());
  DCHECK_EQ(0u, call_descriptor->CalleeSavedRegisters() & ecx.bit());
  IA32OperandConverter g(this, nullptr);
  int parameter_count =
      static_cast<int>(call_descriptor->StackParameterCount());

  // {additional_pop_count} is only greater than zero if {parameter_count = 0}.
  // Check RawMachineAssembler::PopAndReturn.
  if (parameter_count != 0) {
    if (additional_pop_count->IsImmediate()) {
      DCHECK_EQ(g.ToConstant(additional_pop_count).ToInt32(), 0);
    } else if (__ emit_debug_code()) {
      __ cmp(g.ToRegister(additional_pop_count), Immediate(0));
      __ Assert(equal, AbortReason::kUnexpectedAdditionalPopValue);
    }
  }

  Register argc_reg = ecx;
#ifdef V8_NO_ARGUMENTS_ADAPTOR
  // Functions with JS linkage have at least one parameter (the receiver).
  // If {parameter_count} == 0, it means it is a builtin with
  // kDontAdaptArgumentsSentinel, which takes care of JS arguments popping
  // itself.
  const bool drop_jsargs = frame_access_state()->has_frame() &&
                           call_descriptor->IsJSFunctionCall() &&
                           parameter_count != 0;
#else
  const bool drop_jsargs = false;
#endif
  if (call_descriptor->IsCFunctionCall()) {
    AssembleDeconstructFrame();
  } else if (frame_access_state()->has_frame()) {
    // Canonicalize JSFunction return sites for now if they always have the same
    // number of return args.
    if (additional_pop_count->IsImmediate() &&
        g.ToConstant(additional_pop_count).ToInt32() == 0) {
      if (return_label_.is_bound()) {
        __ jmp(&return_label_);
        return;
      } else {
        __ bind(&return_label_);
      }
    }
    if (drop_jsargs) {
      // Get the actual argument count. Read it from the frame while ebp is
      // still valid, i.e. before the frame is deconstructed below.
      __ mov(argc_reg, Operand(ebp, StandardFrameConstants::kArgCOffset));
    }
    AssembleDeconstructFrame();
  }

  if (drop_jsargs) {
    // We must pop all arguments from the stack (including the receiver). This
    // number of arguments is given by max(1 + argc_reg, parameter_count).
    int parameter_count_without_receiver =
        parameter_count - 1;  // Exclude the receiver to simplify the
                              // computation. We'll account for it at the end.
    Label mismatch_return;
    Register scratch_reg = edx;
    DCHECK_NE(argc_reg, scratch_reg);
    __ cmp(argc_reg, Immediate(parameter_count_without_receiver));
    __ j(greater, &mismatch_return, Label::kNear);
    // The static parameter count covers the dynamic argc: drop it.
    __ Ret(parameter_count * kSystemPointerSize, scratch_reg);
    __ bind(&mismatch_return);
    // More arguments were passed than declared: drop argc_reg slots plus one
    // for the receiver.
    __ PopReturnAddressTo(scratch_reg);
    __ lea(esp, Operand(esp, argc_reg, times_system_pointer_size,
                        kSystemPointerSize));  // Also pop the receiver.
    // We use a return instead of a jump for better return address prediction.
    __ PushReturnAddressFrom(scratch_reg);
    __ Ret();
  } else if (additional_pop_count->IsImmediate()) {
    Register scratch_reg = ecx;
    int additional_count = g.ToConstant(additional_pop_count).ToInt32();
    size_t pop_size = (parameter_count + additional_count) * kSystemPointerSize;
    CHECK_LE(pop_size, static_cast<size_t>(std::numeric_limits<int>::max()));
    __ Ret(static_cast<int>(pop_size), scratch_reg);
  } else {
    // Dynamic pop count in a register: drop parameter_count static slots plus
    // pop_reg dynamic slots.
    Register pop_reg = g.ToRegister(additional_pop_count);
    Register scratch_reg = pop_reg == ecx ? edx : ecx;
    int pop_size = static_cast<int>(parameter_count * kSystemPointerSize);
    __ PopReturnAddressTo(scratch_reg);
    __ lea(esp, Operand(esp, pop_reg, times_system_pointer_size,
                        static_cast<int>(pop_size)));
    __ PushReturnAddressFrom(scratch_reg);
    __ Ret();
  }
}
4875 
// No architecture-specific code finalization is required on IA32.
void CodeGenerator::FinishCode() {}
4877 
// IA32 needs no preparation before emitting deoptimization exits.
void CodeGenerator::PrepareForDeoptimizationExits(
    ZoneDeque<DeoptimizationExit*>* exits) {}
4880 
AssembleMove(InstructionOperand * source,InstructionOperand * destination)4881 void CodeGenerator::AssembleMove(InstructionOperand* source,
4882                                  InstructionOperand* destination) {
4883   IA32OperandConverter g(this, nullptr);
4884   // Dispatch on the source and destination operand kinds.
4885   switch (MoveType::InferMove(source, destination)) {
4886     case MoveType::kRegisterToRegister:
4887       if (source->IsRegister()) {
4888         __ mov(g.ToRegister(destination), g.ToRegister(source));
4889       } else {
4890         DCHECK(source->IsFPRegister());
4891         __ movaps(g.ToDoubleRegister(destination), g.ToDoubleRegister(source));
4892       }
4893       return;
4894     case MoveType::kRegisterToStack: {
4895       Operand dst = g.ToOperand(destination);
4896       if (source->IsRegister()) {
4897         __ mov(dst, g.ToRegister(source));
4898       } else {
4899         DCHECK(source->IsFPRegister());
4900         XMMRegister src = g.ToDoubleRegister(source);
4901         MachineRepresentation rep =
4902             LocationOperand::cast(source)->representation();
4903         if (rep == MachineRepresentation::kFloat32) {
4904           __ movss(dst, src);
4905         } else if (rep == MachineRepresentation::kFloat64) {
4906           __ movsd(dst, src);
4907         } else {
4908           DCHECK_EQ(MachineRepresentation::kSimd128, rep);
4909           __ movups(dst, src);
4910         }
4911       }
4912       return;
4913     }
4914     case MoveType::kStackToRegister: {
4915       Operand src = g.ToOperand(source);
4916       if (source->IsStackSlot()) {
4917         __ mov(g.ToRegister(destination), src);
4918       } else {
4919         DCHECK(source->IsFPStackSlot());
4920         XMMRegister dst = g.ToDoubleRegister(destination);
4921         MachineRepresentation rep =
4922             LocationOperand::cast(source)->representation();
4923         if (rep == MachineRepresentation::kFloat32) {
4924           __ movss(dst, src);
4925         } else if (rep == MachineRepresentation::kFloat64) {
4926           __ movsd(dst, src);
4927         } else {
4928           DCHECK_EQ(MachineRepresentation::kSimd128, rep);
4929           __ movups(dst, src);
4930         }
4931       }
4932       return;
4933     }
4934     case MoveType::kStackToStack: {
4935       Operand src = g.ToOperand(source);
4936       Operand dst = g.ToOperand(destination);
4937       if (source->IsStackSlot()) {
4938         __ push(src);
4939         __ pop(dst);
4940       } else {
4941         MachineRepresentation rep =
4942             LocationOperand::cast(source)->representation();
4943         if (rep == MachineRepresentation::kFloat32) {
4944           __ movss(kScratchDoubleReg, src);
4945           __ movss(dst, kScratchDoubleReg);
4946         } else if (rep == MachineRepresentation::kFloat64) {
4947           __ movsd(kScratchDoubleReg, src);
4948           __ movsd(dst, kScratchDoubleReg);
4949         } else {
4950           DCHECK_EQ(MachineRepresentation::kSimd128, rep);
4951           __ movups(kScratchDoubleReg, src);
4952           __ movups(dst, kScratchDoubleReg);
4953         }
4954       }
4955       return;
4956     }
4957     case MoveType::kConstantToRegister: {
4958       Constant src = g.ToConstant(source);
4959       if (destination->IsRegister()) {
4960         Register dst = g.ToRegister(destination);
4961         if (src.type() == Constant::kHeapObject) {
4962           __ Move(dst, src.ToHeapObject());
4963         } else {
4964           __ Move(dst, g.ToImmediate(source));
4965         }
4966       } else {
4967         DCHECK(destination->IsFPRegister());
4968         XMMRegister dst = g.ToDoubleRegister(destination);
4969         if (src.type() == Constant::kFloat32) {
4970           // TODO(turbofan): Can we do better here?
4971           __ Move(dst, src.ToFloat32AsInt());
4972         } else {
4973           DCHECK_EQ(src.type(), Constant::kFloat64);
4974           __ Move(dst, src.ToFloat64().AsUint64());
4975         }
4976       }
4977       return;
4978     }
4979     case MoveType::kConstantToStack: {
4980       Constant src = g.ToConstant(source);
4981       Operand dst = g.ToOperand(destination);
4982       if (destination->IsStackSlot()) {
4983         __ Move(dst, g.ToImmediate(source));
4984       } else {
4985         DCHECK(destination->IsFPStackSlot());
4986         if (src.type() == Constant::kFloat32) {
4987           __ Move(dst, Immediate(src.ToFloat32AsInt()));
4988         } else {
4989           DCHECK_EQ(src.type(), Constant::kFloat64);
4990           uint64_t constant_value = src.ToFloat64().AsUint64();
4991           uint32_t lower = static_cast<uint32_t>(constant_value);
4992           uint32_t upper = static_cast<uint32_t>(constant_value >> 32);
4993           Operand dst0 = dst;
4994           Operand dst1 = g.ToOperand(destination, kSystemPointerSize);
4995           __ Move(dst0, Immediate(lower));
4996           __ Move(dst1, Immediate(upper));
4997         }
4998       }
4999       return;
5000     }
5001   }
5002   UNREACHABLE();
5003 }
5004 
// Emits a swap of two operands for the gap resolver. GP swaps go through the
// machine stack (no GP scratch register is reserved on IA32); FP swaps use
// the XMM scratch register, falling back to push/pop for the upper words of
// float64/simd128 stack slots.
void CodeGenerator::AssembleSwap(InstructionOperand* source,
                                 InstructionOperand* destination) {
  IA32OperandConverter g(this, nullptr);
  // Dispatch on the source and destination operand kinds.  Not all
  // combinations are possible.
  switch (MoveType::InferSwap(source, destination)) {
    case MoveType::kRegisterToRegister: {
      if (source->IsRegister()) {
        Register src = g.ToRegister(source);
        Register dst = g.ToRegister(destination);
        // src is parked on the stack while dst is copied over it.
        __ push(src);
        __ mov(src, dst);
        __ pop(dst);
      } else {
        DCHECK(source->IsFPRegister());
        XMMRegister src = g.ToDoubleRegister(source);
        XMMRegister dst = g.ToDoubleRegister(destination);
        // Classic three-move swap through the XMM scratch register.
        __ movaps(kScratchDoubleReg, src);
        __ movaps(src, dst);
        __ movaps(dst, kScratchDoubleReg);
      }
      return;
    }
    case MoveType::kRegisterToStack: {
      if (source->IsRegister()) {
        Register src = g.ToRegister(source);
        __ push(src);
        // The push moved esp, so the SP delta is bumped and the slot operand
        // re-resolved to keep esp-relative addressing correct; it is resolved
        // again after the delta is undone for the pop below.
        frame_access_state()->IncreaseSPDelta(1);
        Operand dst = g.ToOperand(destination);
        __ mov(src, dst);
        frame_access_state()->IncreaseSPDelta(-1);
        dst = g.ToOperand(destination);
        __ pop(dst);
      } else {
        DCHECK(source->IsFPRegister());
        XMMRegister src = g.ToDoubleRegister(source);
        Operand dst = g.ToOperand(destination);
        MachineRepresentation rep =
            LocationOperand::cast(source)->representation();
        // Swap via the XMM scratch register: scratch <- slot, slot <- reg,
        // reg <- scratch.
        if (rep == MachineRepresentation::kFloat32) {
          __ movss(kScratchDoubleReg, dst);
          __ movss(dst, src);
          __ movaps(src, kScratchDoubleReg);
        } else if (rep == MachineRepresentation::kFloat64) {
          __ movsd(kScratchDoubleReg, dst);
          __ movsd(dst, src);
          __ movaps(src, kScratchDoubleReg);
        } else {
          DCHECK_EQ(MachineRepresentation::kSimd128, rep);
          __ movups(kScratchDoubleReg, dst);
          __ movups(dst, src);
          __ movups(src, kScratchDoubleReg);
        }
      }
      return;
    }
    case MoveType::kStackToStack: {
      if (source->IsStackSlot()) {
        // Swap two GP slots entirely through the machine stack. Operands are
        // re-resolved after each esp-moving push/pop (tracked via SP delta).
        Operand dst1 = g.ToOperand(destination);
        __ push(dst1);
        frame_access_state()->IncreaseSPDelta(1);
        Operand src1 = g.ToOperand(source);
        __ push(src1);
        Operand dst2 = g.ToOperand(destination);
        __ pop(dst2);
        frame_access_state()->IncreaseSPDelta(-1);
        Operand src2 = g.ToOperand(source);
        __ pop(src2);
      } else {
        DCHECK(source->IsFPStackSlot());
        Operand src0 = g.ToOperand(source);
        Operand dst0 = g.ToOperand(destination);
        MachineRepresentation rep =
            LocationOperand::cast(source)->representation();
        if (rep == MachineRepresentation::kFloat32) {
          __ movss(kScratchDoubleReg, dst0);  // Save dst in scratch register.
          __ push(src0);  // Then use stack to copy src to destination.
          __ pop(dst0);
          __ movss(src0, kScratchDoubleReg);
        } else if (rep == MachineRepresentation::kFloat64) {
          // Only the low 4 bytes fit through movss-style scratch handling;
          // the second word is swapped with an extra push/pop pair.
          __ movsd(kScratchDoubleReg, dst0);  // Save dst in scratch register.
          __ push(src0);  // Then use stack to copy src to destination.
          __ pop(dst0);
          __ push(g.ToOperand(source, kSystemPointerSize));
          __ pop(g.ToOperand(destination, kSystemPointerSize));
          __ movsd(src0, kScratchDoubleReg);
        } else {
          DCHECK_EQ(MachineRepresentation::kSimd128, rep);
          // 16-byte slot: copy words 1..3 via push/pop, word 0 plus the saved
          // destination via the scratch register.
          __ movups(kScratchDoubleReg, dst0);  // Save dst in scratch register.
          __ push(src0);  // Then use stack to copy src to destination.
          __ pop(dst0);
          __ push(g.ToOperand(source, kSystemPointerSize));
          __ pop(g.ToOperand(destination, kSystemPointerSize));
          __ push(g.ToOperand(source, 2 * kSystemPointerSize));
          __ pop(g.ToOperand(destination, 2 * kSystemPointerSize));
          __ push(g.ToOperand(source, 3 * kSystemPointerSize));
          __ pop(g.ToOperand(destination, 3 * kSystemPointerSize));
          __ movups(src0, kScratchDoubleReg);
        }
      }
      return;
    }
    default:
      UNREACHABLE();
  }
}
5111 
AssembleJumpTable(Label ** targets,size_t target_count)5112 void CodeGenerator::AssembleJumpTable(Label** targets, size_t target_count) {
5113   for (size_t index = 0; index < target_count; ++index) {
5114     __ dd(targets[index]);
5115   }
5116 }
5117 
5118 #undef __
5119 #undef kScratchDoubleReg
5120 #undef ASSEMBLE_COMPARE
5121 #undef ASSEMBLE_IEEE754_BINOP
5122 #undef ASSEMBLE_IEEE754_UNOP
5123 #undef ASSEMBLE_BINOP
5124 #undef ASSEMBLE_ATOMIC_BINOP
5125 #undef ASSEMBLE_I64ATOMIC_BINOP
5126 #undef ASSEMBLE_MOVX
5127 #undef ASSEMBLE_SIMD_PUNPCK_SHUFFLE
5128 #undef ASSEMBLE_SIMD_IMM_SHUFFLE
5129 #undef ASSEMBLE_SIMD_ALL_TRUE
5130 #undef ASSEMBLE_SIMD_SHIFT
5131 
5132 }  // namespace compiler
5133 }  // namespace internal
5134 }  // namespace v8
5135