1 /*
2 * Copyright (C) 2014 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 // Assembler to produce x86-64 instructions. Somewhat influenced by V8 assembler.
18
19 #ifndef BERBERIS_ASSEMBLER_X86_64_H_
20 #define BERBERIS_ASSEMBLER_X86_64_H_
21
22 #include <type_traits> // std::is_same
23
24 #include "berberis/assembler/common_x86.h"
25 #include "berberis/base/logging.h"
26 #include "berberis/base/macros.h" // DISALLOW_IMPLICIT_CONSTRUCTORS
27
28 namespace berberis {
29
// Forward declaration of the type Assembler's constructor takes a pointer to.
class MachineCode;
31
32 namespace x86_64 {
33
34 class Assembler : public AssemblerX86<Assembler> {
35 public:
// Creates an assembler for |code|; the pointer is passed unchanged to the AssemblerX86 base.
explicit Assembler(MachineCode* code) : AssemblerX86(code) {}
37
// General-purpose registers. The braced value is the register number: the encoding helpers in
// this class place its low three bits into the instruction (see EmitModRM / EmitOperandOp) and
// test bit 3 to decide whether a prefix extension bit is needed (see Rex), which is why
// r8..r15 are numbered 8..15.
//
// no_register is a marker value that EmitOperandOp compares against to detect an operand
// without a base and/or index register.
static constexpr Register no_register{0x80};
static constexpr Register rax{0};
static constexpr Register rcx{1};
static constexpr Register rdx{2};
static constexpr Register rbx{3};
static constexpr Register rsp{4};
static constexpr Register rbp{5};
static constexpr Register rsi{6};
static constexpr Register rdi{7};
static constexpr Register r8{8};
static constexpr Register r9{9};
static constexpr Register r10{10};
static constexpr Register r11{11};
static constexpr Register r12{12};
static constexpr Register r13{13};
static constexpr Register r14{14};
static constexpr Register r15{15};
55
// Vector (xmm) registers, numbered 0..15 in the same way as the general-purpose registers.
// no_xmm_register uses the same 0x80 value as no_register; presumably it plays the same
// "no register" role for vector operands (it is not referenced in the visible part of this header).
static constexpr XMMRegister no_xmm_register{0x80};
static constexpr XMMRegister xmm0{0};
static constexpr XMMRegister xmm1{1};
static constexpr XMMRegister xmm2{2};
static constexpr XMMRegister xmm3{3};
static constexpr XMMRegister xmm4{4};
static constexpr XMMRegister xmm5{5};
static constexpr XMMRegister xmm6{6};
static constexpr XMMRegister xmm7{7};
static constexpr XMMRegister xmm8{8};
static constexpr XMMRegister xmm9{9};
static constexpr XMMRegister xmm10{10};
static constexpr XMMRegister xmm11{11};
static constexpr XMMRegister xmm12{12};
static constexpr XMMRegister xmm13{13};
static constexpr XMMRegister xmm14{14};
static constexpr XMMRegister xmm15{15};
73
// Macroassembler uses these names to support both x86-32 and x86-64 modes.
// Each alias carries the same register number as one of the constants above:
// gpr_a == rax (0), gpr_c == rcx (1), gpr_d == rdx (2), gpr_s == rsp (4).
static constexpr Register gpr_a{0};
static constexpr Register gpr_c{1};
static constexpr Register gpr_d{2};
static constexpr Register gpr_s{4};
79
80 // Instructions.
81 #include "berberis/assembler/gen_assembler_x86_64-inl.h" // NOLINT generated file!
82
// Historical curiosity: x86-32 mode already has a Movq encoding for memory-to-xmm operations.
// x86-64 added another one with a different opcode, but since the two are functionally
// equivalent, GNU Assembler and Clang use the old one in both 32-bit and 64-bit modes, and
// we do the same.
87
// A member function declared in this class hides every base-class overload with the same
// name. The using-declarations below bring the AssemblerX86 overloads back into scope so that
// both the overloads declared here and the inherited ones can be called on an Assembler.

// Unhide Decl(Mem) hidden by Decl(Reg).
using AssemblerX86::Decl;

// Unhide Decw(Mem) hidden by Decw(Reg).
using AssemblerX86::Decw;

// Unhide Incl(Mem) hidden by Incl(Reg).
using AssemblerX86::Incl;

// Unhide Incw(Mem) hidden by Incw(Reg).
using AssemblerX86::Incw;

// Unhide Movq(Mem, XMMReg) and Movq(XMMReg, Mem) hidden by Movq(Reg, Imm) and many others.
using AssemblerX86::Movq;

// Unhide Xchgl(Mem, Reg) hidden by the modified Xchgl(Reg, Reg) declared in this class.
using AssemblerX86::Xchgl;

// Unhide Vmov*(Mem, Reg) hidden by Vmov*(Reg, Reg).
using AssemblerX86::Vmovapd;
using AssemblerX86::Vmovaps;
using AssemblerX86::Vmovdqa;
using AssemblerX86::Vmovdqu;
using AssemblerX86::Vmovq;
using AssemblerX86::Vmovsd;
using AssemblerX86::Vmovss;
114
Xchgl(Register dest,Register src)115 void Xchgl(Register dest, Register src) {
116 // In 32-bit mode "xchgl %eax, %eax" did nothing and was often reused as "nop".
117 //
118 // On x86-64 "xchgl %eax, %eax" clears top half of %eax register, but having single-byte nop
119 // is too convenient, thus, as special exception, 0x90 is not interpreted as "xchgl %eax, %eax",
120 // but was kept as "nop" - thus longer encoding for "xchgl %eax, %eax" must be used.
121
122 if (IsAccumulator(src) && IsAccumulator(dest)) {
123 Emit16(0xc087);
124 } else {
125 AssemblerX86::Xchgl(dest, src);
126 }
127 }
128
129 // TODO(b/127356868): decide what to do with these functions when cross-arch assembler is used.
130
131 #ifdef __amd64__
132
133 // Unhide Call(Reg), hidden by special version below.
134 using AssemblerX86::Call;
135
Call(const void * target)136 void Call(const void* target) {
137 // There are no call instruction with properties we need thus we emulate it.
138 // This is what the following code looks like when decoded with objdump (if
139 // target address is 0x123456789abcdef0):
140 // 0: ff 15 02 00 00 00 callq *0x2(%rip) # 0x8
141 // 6: eb 08 jmp 0x10
142 // 8: f0 de bc 9a 78 56 34 12 lock fidivrs 0x12345678(%rdx,%rbx,4)
143 // First we do call - with address taken from last 8 bytes, then we jump over
144 // these 8 bytes.
145 Emit64(0x08eb0000000215ff);
146 Emit64(bit_cast<int64_t>(target));
147 }
148
149 // Unhide Jcc(Label), hidden by special version below.
150 using AssemblerX86::Jcc;
151
152 // Make sure only type void* can be passed to function below, not Label* or any other type.
153 template <typename T>
154 auto Jcc(Condition cc, T* target) -> void = delete;
155
Jcc(Condition cc,const void * target)156 void Jcc(Condition cc, const void* target) {
157 if (cc == Condition::kAlways) {
158 Jmp(target);
159 return;
160 } else if (cc == Condition::kNever) {
161 return;
162 }
163 CHECK_EQ(0, static_cast<uint8_t>(cc) & 0xF0);
164 // There are no Jcc instruction with properties we need thus we emulate it.
165 // This is what the following code looks like when decoded with objdump (if
166 // target address is 0x123456789abcdef0):
167 // 0: 75 0e jne 0x10
168 // 2: ff 25 00 00 00 00 jmpq *0x0(%rip) # 0x8
169 // 8: f0 de bc 9a 78 56 34 12 lock fidivrs 0x12345678(%rdx,%rbx,4)
170 // We are doing relative jump for the inverted condition (because Jcc could
171 // only jump ±2GiB and in 64 bit mode which is not enough to reach arbitrary
172 // address), then jmpq with address stored right after jmpq.
173 Emit64(0x0000000025ff'0e70 | static_cast<int8_t>(ToReverseCond(cc)));
174 Emit64(bit_cast<int64_t>(target));
175 }
176
177 // Unhide Jmp(Reg), hidden by special version below.
178 using AssemblerX86::Jmp;
179
180 // Make sure only type void* can be passed to function below, not Label* or any other type.
181 template <typename T>
182 auto Jmp(T* target) -> void = delete;
183
184 void Jmp(const void* target) {
185 // There are no jump instruction with properties we need thus we emulate it.
186 // This is what the following code looks like when decoded with objdump (if
187 // target address is 0x123456789abcdef0):
188 // 0: ff 25 00 00 00 00 jmpq *0x0(%rip) # 0x6
189 // 6: f0 de bc 9a 78 56 34 12 lock fidivrs 0x12345678(%rdx,%rbx,4)
190 // We are doing jump to the address stored right after jmpq using %rip-relative
191 // addressing (with offset 0).
192 Emit16(0x25ff);
193 Emit32(0x00000000);
194 Emit64(bit_cast<int64_t>(target));
195 }
196
197 #endif
198
199 private:
// Macro from berberis/base/macros.h; see its definition for exactly which constructors it
// suppresses.
DISALLOW_IMPLICIT_CONSTRUCTORS(Assembler);

// The accumulator register for this assembler: rax.
static Register Accumulator() { return rax; }
// Returns true iff |reg| compares equal to rax.
static bool IsAccumulator(Register reg) { return reg == rax; }
204
205 struct Register64Bit {
206 explicit constexpr Register64Bit(Register reg) : num(reg.num) {}
207 uint8_t num;
208 };
209
210 struct Memory64Bit {
211 explicit Memory64Bit(const Operand& op) : operand(op) {}
212 Operand operand;
213 };
214
215 struct Label64Bit {
216 explicit Label64Bit(const LabelOperand& l) : label(l.label) {}
217 const Label& label;
218 };
219
// 128-bit operand types are plain aliases of their 64-bit counterparts, so they are encoded
// exactly like 64-bit operands (in particular Rex(Memory64Bit) applies to them).
// According to the original note these are only used by CmpXchg16b.
using Memory128Bit = Memory64Bit;
using Label128Bit = Label64Bit;
223
224 // Check if a given type is "a register with size" (for EmitInstruction).
225 template <typename ArgumentType>
226 struct IsRegister {
227 static constexpr bool value = std::is_same_v<ArgumentType, Register8Bit> ||
228 std::is_same_v<ArgumentType, Register32Bit> ||
229 std::is_same_v<ArgumentType, Register64Bit>;
230 };
231
232 // Check if a given type is "a memory operand with size" (for EmitInstruction).
233 template <typename ArgumentType>
234 struct IsMemoryOperand {
235 static constexpr bool value =
236 std::is_same_v<ArgumentType, Memory32Bit> || std::is_same_v<ArgumentType, Memory64Bit>;
237 };
238
239 template <typename ArgumentType>
240 struct IsLabelOperand {
241 static constexpr bool value =
242 std::is_same_v<ArgumentType, Label32Bit> || std::is_same_v<ArgumentType, Label64Bit>;
243 };
244
245 template <typename... ArgumentsTypes>
246 void EmitRex(ArgumentsTypes... arguments) {
247 constexpr auto registers_count = kCountArguments<IsRegister, ArgumentsTypes...>;
248 constexpr auto operands_count = kCountArguments<IsMemoryOperand, ArgumentsTypes...>;
249 static_assert(registers_count + operands_count <= 2,
250 "Only two-arguments instructions are supported, not VEX or EVEX");
251 uint8_t rex = 0;
252 if constexpr (registers_count == 2) {
253 rex = Rex<0b0100>(ArgumentByType<0, IsRegister>(arguments...)) |
254 Rex<0b0001>(ArgumentByType<1, IsRegister>(arguments...));
255 } else if constexpr (registers_count == 1 && operands_count == 1) {
256 rex = Rex<0b0100>(ArgumentByType<0, IsRegister>(arguments...)) |
257 Rex(ArgumentByType<0, IsMemoryOperand>(arguments...));
258 } else if constexpr (registers_count == 1) {
259 rex = Rex<0b0001>(ArgumentByType<0, IsRegister>(arguments...));
260 } else if constexpr (operands_count == 1) {
261 rex = Rex(ArgumentByType<0, IsMemoryOperand>(arguments...));
262 }
263 if (rex) {
264 Emit8(rex);
265 }
266 }
267
268 template <uint8_t base_rex, typename ArgumentType>
269 uint8_t Rex(ArgumentType argument) {
270 if (argument.num & 0b1000) {
271 // 64-bit argument requires REX.W bit
272 if (std::is_same_v<ArgumentType, Register64Bit>) {
273 return 0b0100'1000 | base_rex;
274 }
275 return 0b0100'0000 | base_rex;
276 }
277 // 8-bit argument requires REX (even if without any bits).
278 if (std::is_same_v<ArgumentType, Register8Bit> && argument.num > 3) {
279 return 0b0100'0000;
280 }
281 if (std::is_same_v<ArgumentType, Register64Bit>) {
282 return 0b0100'1000;
283 }
284 return 0;
285 }
286
Rex(Operand operand)287 uint8_t Rex(Operand operand) {
288 // REX.B and REX.X always come from operand.
289 uint8_t rex = ((operand.base.num & 0b1000) >> 3) | ((operand.index.num & 0b1000) >> 2);
290 if (rex) {
291 // We actually need rex byte here.
292 return 0b0100'0000 | rex;
293 } else {
294 return 0;
295 }
296 }
297
Rex(Memory32Bit operand)298 uint8_t Rex(Memory32Bit operand) { return Rex(operand.operand); }
299
Rex(Memory64Bit operand)300 uint8_t Rex(Memory64Bit operand) {
301 // 64-bit argument requires REX.W bit - and thus REX itself.
302 return 0b0100'1000 | Rex(operand.operand);
303 }
304
// In 64-bit mode a more compact encoding is available when the operand encoded in rm is a
// low register. Returns true if that can be achieved by swapping the two arguments, i.e.
// when |rm_arg| is a high register (number >= 8) while |vex_arg| is a low one (number < 8).
template <typename RegisterType>
[[nodiscard]] static bool IsSwapProfitable(RegisterType rm_arg, RegisterType vex_arg) {
  const bool rm_is_high = rm_arg.num >= 8;
  const bool vex_is_low = vex_arg.num < 8;
  return rm_is_high && vex_is_low;
}
311
// Emits the prefix bytes for a VEX-style instruction.
//
// Template parameters:
//   byte1, byte2, byte3      - the three prefix bytes as specified by the opcode description;
//                              bits of byte2/byte3 are adjusted below from the arguments.
//   reg_is_opcode_extension  - true when the "reg" slot carries an opcode extension rather
//                              than a register argument.
// Arguments are classified with IsRegister / IsMemoryOperand / IsLabelOperand; other
// argument types are ignored here.
//
// NOTE(review): the bit names used in the comments below (R/X/B, vvvv, W) follow the usual
// VEX prefix layout from the Intel SDM; they are not spelled out in this file - confirm.
template <uint8_t byte1,
          uint8_t byte2,
          uint8_t byte3,
          bool reg_is_opcode_extension,
          typename... ArgumentsTypes>
void EmitVex(ArgumentsTypes... arguments) {
  constexpr auto registers_count = kCountArguments<IsRegister, ArgumentsTypes...>;
  constexpr auto operands_count = kCountArguments<IsMemoryOperand, ArgumentsTypes...>;
  constexpr auto labels_count = kCountArguments<IsLabelOperand, ArgumentsTypes...>;
  // Index (among the register arguments) of the register that goes into the vvvv field:
  // it is the one that follows the registers consumed by the reg and rm slots.
  constexpr auto vvvv_parameter = 2 - reg_is_opcode_extension - operands_count - labels_count;
  int vvvv = 0;
  if constexpr (registers_count > vvvv_parameter) {
    vvvv = ArgumentByType<vvvv_parameter, IsRegister>(arguments...).num;
  }
  // Start with the three top bits of the second byte set; each one is flipped (XOR) below
  // when the corresponding register number has bit 3 set, i.e. the bits are stored inverted.
  auto vex2 = byte2 | 0b111'00000;
  if constexpr (operands_count == 1) {
    // Memory form: base register controls bit 5, index register controls bit 6, and the
    // register argument (if reg is not an opcode extension) controls bit 7.
    auto operand = ArgumentByType<0, IsMemoryOperand>(arguments...);
    vex2 ^= (operand.operand.base.num & 0b1000) << 2;
    vex2 ^= (operand.operand.index.num & 0b1000) << 3;
    if constexpr (!reg_is_opcode_extension) {
      vex2 ^= (ArgumentByType<0, IsRegister>(arguments...).num & 0b1000) << 4;
    }
  } else if constexpr (labels_count == 1) {
    // Label form: only the register argument (if any) can require an extension bit.
    if constexpr (!reg_is_opcode_extension) {
      vex2 ^= (ArgumentByType<0, IsRegister>(arguments...).num & 0b1000) << 4;
    }
  } else if constexpr (registers_count > 0) {
    // Register-only form.
    if constexpr (reg_is_opcode_extension) {
      // The only encoded register sits in the rm slot: it controls bit 5.
      vex2 ^= (ArgumentByType<0, IsRegister>(arguments...).num & 0b1000) << 2;
    } else {
      // First register is in the reg slot (bit 7), second one in the rm slot (bit 5).
      vex2 ^= (ArgumentByType<0, IsRegister>(arguments...).num & 0b1000) << 4;
      vex2 ^= (ArgumentByType<1, IsRegister>(arguments...).num & 0b1000) << 2;
    }
  }
  // The short two-byte prefix (first byte 0xc5) can be used when the requested prefix is
  // 0xC4, bits 5 and 6 of the second byte are still set, its low five bits equal 0b00001 and
  // the top bit of byte3 is clear. Only bit 7 of vex2 survives into the short form.
  if (byte1 == 0xC4 && (vex2 & 0b0'1'1'11111) == 0b0'1'1'00001 && (byte3 & 0b1'0000'0'00) == 0) {
    // vvvv is stored inverted: set all four bits, then XOR the register number in.
    Emit16((0xc5 | ((vex2 & 0b1'0'0'00000) << 8) | (byte3 << 8) |
            0b0'1111'000'00000000) ^ (vvvv << 11));
  } else {
    // Full three-byte prefix: byte1, then vex2 and byte3 (with inverted vvvv) as one 16-bit
    // value.
    Emit8(byte1);
    Emit16((vex2 | (byte3 << 8) | 0b0'1111'000'00000000) ^ (vvvv << 11));
  }
}
354
// Emits a one-byte opcode that has the low three bits of the register number folded into it.
template <typename ArgumentType>
void EmitRegisterInOpcode(uint8_t opcode, ArgumentType argument) {
  const auto register_low_bits = argument.num & 0b111;
  Emit8(opcode | register_low_bits);
}
359
// Emits a register-to-register ModR/M byte (top two bits set): the low three bits of
// |argument1| go into bits 3..5 and the low three bits of |argument2| into bits 0..2.
template <typename ArgumentType1, typename ArgumentType2>
void EmitModRM(ArgumentType1 argument1, ArgumentType2 argument2) {
  const auto reg_field = (argument1.num & 0b111) << 3;
  const auto rm_field = argument2.num & 0b111;
  Emit8(0xC0 | reg_field | rm_field);
}
364
365 template <typename ArgumentType>
EmitModRM(uint8_t opcode_extension,ArgumentType argument)366 void EmitModRM(uint8_t opcode_extension, ArgumentType argument) {
367 CHECK_LE(opcode_extension, 0b111);
368 Emit8(0xC0 | (opcode_extension << 3) | (argument.num & 0b111));
369 }
370
371 template <typename ArgumentType>
EmitOperandOp(ArgumentType argument,Operand operand)372 void EmitOperandOp(ArgumentType argument, Operand operand) {
373 EmitOperandOp(static_cast<int>(argument.num & 0b111), operand);
374 }
375
376 template <size_t kImmediatesSize, typename ArgumentType>
EmitRipOp(ArgumentType argument,const Label & label)377 void EmitRipOp(ArgumentType argument, const Label& label) {
378 EmitRipOp<kImmediatesSize>(static_cast<int>(argument.num) & 0b111, label);
379 }
380
// Emit the ModR/M byte, and optionally the SIB byte and
// 1- or 4-byte offset for a memory operand. Also used to encode
// a three-bit opcode extension into the ModR/M byte.
// |number| is the value for the "reg" field and must be in the range 0..7.
void EmitOperandOp(int number, const Operand& addr);
// Helper functions to handle various ModR/M and SIB combinations.
// Should *only* be called from EmitOperandOp!
// EmitIndexDispOperand handles operands that have an index register but no base register.
void EmitIndexDispOperand(int reg, const Operand& addr);
// EmitBaseIndexDispOperand picks the displacement size (none, 8-bit or 32-bit) and emits
// |base_modrm_and_sib| through the member function given as the second template argument.
template <typename ArgType, void (AssemblerBase::*)(ArgType)>
void EmitBaseIndexDispOperand(int base_modrm_and_sib, const Operand& addr);
// Emit ModR/M for rip-relative addressing of |label|.
template <size_t kImmediatesSize>
void EmitRipOp(int num, const Label& label);
393
394 friend AssemblerX86<Assembler>;
395 };
396
397 // This function looks big, but when we are emitting Operand with fixed registers
398 // (which is the most common case) all "if"s below are calculated statically which
399 // makes effective size of that function very small.
400 //
401 // But for this to happen function have to be inline and in header.
EmitOperandOp(int number,const Operand & addr)402 inline void Assembler::EmitOperandOp(int number, const Operand& addr) {
403 // Additional info (register number, etc) is limited to 3 bits.
404 CHECK_LE(unsigned(number), 7);
405
406 // Reg field must be shifted by 3 bits.
407 int reg = number << 3;
408
409 // On x86 %rsp cannot be index, only base.
410 CHECK(addr.index != rsp);
411
412 // If base is not %rsp/r12 and we don't have index, then we don't have SIB byte.
413 // All other cases have "ModR/M" and SIB bytes.
414 if (addr.base != rsp && addr.base != r12 && addr.index == no_register) {
415 // If we have base register then we could use the same logic as for other common cases.
416 if (addr.base != no_register) {
417 EmitBaseIndexDispOperand<uint8_t, &Assembler::Emit8>((addr.base.num & 7) | reg, addr);
418 } else {
419 Emit16(0x2504 | reg);
420 Emit32(addr.disp);
421 }
422 } else if (addr.index == no_register) {
423 // Note: when ModR/M and SIB are used "no index" is encoded as if %rsp is used in place of
424 // index (that's why %rsp couldn't be used as index - see check above).
425 EmitBaseIndexDispOperand<int16_t, &Assembler::Emit16>(0x2004 | ((addr.base.num & 7) << 8) | reg,
426 addr);
427 } else if (addr.base == no_register) {
428 EmitIndexDispOperand(reg, addr);
429 } else {
430 EmitBaseIndexDispOperand<int16_t, &Assembler::Emit16>(
431 0x04 | (addr.scale << 14) | ((addr.index.num & 7) << 11) | ((addr.base.num & 7) << 8) | reg,
432 addr);
433 }
434 }
435
EmitIndexDispOperand(int reg,const Operand & addr)436 inline void Assembler::EmitIndexDispOperand(int reg, const Operand& addr) {
437 // We only have index here, no base, use SIB but put %rbp in "base" field.
438 Emit16(0x0504 | (addr.scale << 14) | ((addr.index.num & 7) << 11) | reg);
439 Emit32(addr.disp);
440 }
441
442 template <size_t kImmediatesSize>
EmitRipOp(int num,const Label & label)443 inline void Assembler::EmitRipOp(int num, const Label& label) {
444 Emit8(0x05 | (num << 3));
445 jumps_.push_back(Jump{&label, pc(), false});
446 Emit32(0xfffffffc - kImmediatesSize);
447 }
448
449 template <typename ArgType, void (AssemblerBase::*EmitBase)(ArgType)>
EmitBaseIndexDispOperand(int base_modrm_and_sib,const Operand & addr)450 inline void Assembler::EmitBaseIndexDispOperand(int base_modrm_and_sib, const Operand& addr) {
451 if (addr.disp == 0 && addr.base != rbp && addr.base != r13) {
452 // We can omit zero displacement only if base isn't %rbp/%r13
453 (this->*EmitBase)(base_modrm_and_sib);
454 } else if (IsInRange<int8_t>(addr.disp)) {
455 // If disp could it in byte then use byte-disp.
456 (this->*EmitBase)(base_modrm_and_sib | 0x40);
457 Emit8(addr.disp);
458 } else {
459 // Otherwise use full-disp.
460 (this->*EmitBase)(base_modrm_and_sib | 0x80);
461 Emit32(addr.disp);
462 }
463 }
464
Movq(Register dest,int64_t imm64)465 inline void Assembler::Movq(Register dest, int64_t imm64) {
466 if (IsInRange<uint32_t>(imm64)) {
467 // Shorter encoding.
468 Movl(dest, static_cast<uint32_t>(imm64));
469 } else if (IsInRange<int32_t>(imm64)) {
470 // Slightly longer encoding.
471 EmitInstruction<Opcodes<0xc7, 0x00>>(Register64Bit(dest), static_cast<int32_t>(imm64));
472 } else {
473 // Longest encoding.
474 EmitInstruction<Opcodes<0xb8>>(Register64Bit(dest), imm64);
475 }
476 }
477
Vmovapd(XMMRegister arg0,XMMRegister arg1)478 inline void Assembler::Vmovapd(XMMRegister arg0, XMMRegister arg1) {
479 if (arg0.num < 8 && arg1.num >= 8) {
480 return EmitInstruction<Opcodes<0xc4, 0x01, 0x01, 0x29>>(VectorRegister128Bit(arg1),
481 VectorRegister128Bit(arg0));
482 }
483 EmitInstruction<Opcodes<0xc4, 0x01, 0x01, 0x28>>(VectorRegister128Bit(arg0),
484 VectorRegister128Bit(arg1));
485 }
486
Vmovaps(XMMRegister arg0,XMMRegister arg1)487 inline void Assembler::Vmovaps(XMMRegister arg0, XMMRegister arg1) {
488 if (arg0.num < 8 && arg1.num >= 8) {
489 return EmitInstruction<Opcodes<0xc4, 0x01, 0x00, 0x29>>(VectorRegister128Bit(arg1),
490 VectorRegister128Bit(arg0));
491 }
492 EmitInstruction<Opcodes<0xc4, 0x01, 0x00, 0x28>>(VectorRegister128Bit(arg0),
493 VectorRegister128Bit(arg1));
494 }
495
Vmovdqa(XMMRegister arg0,XMMRegister arg1)496 inline void Assembler::Vmovdqa(XMMRegister arg0, XMMRegister arg1) {
497 if (arg0.num < 8 && arg1.num >= 8) {
498 return EmitInstruction<Opcodes<0xc4, 0x01, 0x01, 0x7F>>(VectorRegister128Bit(arg1),
499 VectorRegister128Bit(arg0));
500 }
501 EmitInstruction<Opcodes<0xc4, 0x01, 0x01, 0x6F>>(VectorRegister128Bit(arg0),
502 VectorRegister128Bit(arg1));
503 }
504
Vmovdqu(XMMRegister arg0,XMMRegister arg1)505 inline void Assembler::Vmovdqu(XMMRegister arg0, XMMRegister arg1) {
506 if (arg0.num < 8 && arg1.num >= 8) {
507 return EmitInstruction<Opcodes<0xc4, 0x01, 0x02, 0x7F>>(VectorRegister128Bit(arg1),
508 VectorRegister128Bit(arg0));
509 }
510 EmitInstruction<Opcodes<0xc4, 0x01, 0x02, 0x6F>>(VectorRegister128Bit(arg0),
511 VectorRegister128Bit(arg1));
512 }
513
Vmovsd(XMMRegister arg0,XMMRegister arg1,XMMRegister arg2)514 inline void Assembler::Vmovsd(XMMRegister arg0, XMMRegister arg1, XMMRegister arg2) {
515 if (arg0.num < 8 && arg2.num >= 8) {
516 return EmitInstruction<Opcodes<0xc4, 0x01, 0x03, 0x11>>(
517 VectorRegister128Bit(arg2), VectorRegister128Bit(arg0), VectorRegister128Bit(arg1));
518 }
519 EmitInstruction<Opcodes<0xc4, 0x01, 0x03, 0x10>>(
520 VectorRegister128Bit(arg0), VectorRegister128Bit(arg2), VectorRegister128Bit(arg1));
521 }
522
Vmovss(XMMRegister arg0,XMMRegister arg1,XMMRegister arg2)523 inline void Assembler::Vmovss(XMMRegister arg0, XMMRegister arg1, XMMRegister arg2) {
524 if (arg0.num < 8 && arg2.num >= 8) {
525 return EmitInstruction<Opcodes<0xc4, 0x01, 0x02, 0x11>>(
526 VectorRegister128Bit(arg2), VectorRegister128Bit(arg0), VectorRegister128Bit(arg1));
527 }
528 EmitInstruction<Opcodes<0xc4, 0x01, 0x02, 0x10>>(
529 VectorRegister128Bit(arg0), VectorRegister128Bit(arg2), VectorRegister128Bit(arg1));
530 }
531
Xchgq(Register dest,Register src)532 inline void Assembler::Xchgq(Register dest, Register src) {
533 // We compare output to that from clang and thus want to produce the same code.
534 // 0x48 0x90 is suboptimal encoding for that operation (pure 0x90 does the same
535 // and this is what gcc + gas are producing), but this is what clang <= 8 does.
536 if (IsAccumulator(src) && IsAccumulator(dest)) {
537 Emit8(0x90);
538 } else if (IsAccumulator(src) || IsAccumulator(dest)) {
539 Register other = IsAccumulator(src) ? dest : src;
540 EmitInstruction<Opcodes<0x90>>(Register64Bit(other));
541 } else {
542 // Clang 8 (after r330298) puts dest before src. We are comparing output
543 // to clang in exhaustive test thus we want to match clang behavior exactly.
544 EmitInstruction<Opcodes<0x87>>(Register64Bit(dest), Register64Bit(src));
545 }
546 }
547
548 } // namespace x86_64
549
550 } // namespace berberis
551
552 #endif // BERBERIS_ASSEMBLER_X86_64_H_
553