1 /*
2 * Copyright (C) 2014 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 // Assembler to produce x86-64 instructions. Somewhat influenced by V8 assembler.
18
19 #ifndef BERBERIS_ASSEMBLER_X86_64_H_
20 #define BERBERIS_ASSEMBLER_X86_64_H_
21
22 #include <type_traits> // std::is_same
23
24 #include "berberis/assembler/x86_32_and_x86_64.h"
25 #include "berberis/base/logging.h"
26
27 namespace berberis {
28
// Forward declaration; full definition comes from berberis machine-code
// infrastructure. (Fixes the old "MachindeCode" typo, which declared an
// unrelated, never-used class name.)
class MachineCode;
30
31 namespace x86_64 {
32
// Assembler producing x86-64 machine code. Builds on the implementation shared
// between 32-bit and 64-bit x86 (x86_32_and_x86_64::Assembler) and adds the
// 64-bit register set, REX/VEX prefix generation, and 64-bit-only instruction
// forms.
class Assembler : public x86_32_and_x86_64::Assembler<Assembler> {
 public:
  using BaseAssembler = x86_32_and_x86_64::Assembler<Assembler>;
  using FinalAssembler = Assembler;

  explicit Assembler(MachineCode* code) : BaseAssembler(code) {}

  // General-purpose registers (hardware numbers 0-15). no_register is a
  // sentinel: the 0x80 bit never appears in a real register number.
  static constexpr Register no_register{0x80};
  static constexpr Register rax{0};
  static constexpr Register rcx{1};
  static constexpr Register rdx{2};
  static constexpr Register rbx{3};
  static constexpr Register rsp{4};
  static constexpr Register rbp{5};
  static constexpr Register rsi{6};
  static constexpr Register rdi{7};
  static constexpr Register r8{8};
  static constexpr Register r9{9};
  static constexpr Register r10{10};
  static constexpr Register r11{11};
  static constexpr Register r12{12};
  static constexpr Register r13{13};
  static constexpr Register r14{14};
  static constexpr Register r15{15};

  // 128-bit SSE registers.
  static constexpr XMMRegister no_xmm_register{0x80};
  static constexpr XMMRegister xmm0{0};
  static constexpr XMMRegister xmm1{1};
  static constexpr XMMRegister xmm2{2};
  static constexpr XMMRegister xmm3{3};
  static constexpr XMMRegister xmm4{4};
  static constexpr XMMRegister xmm5{5};
  static constexpr XMMRegister xmm6{6};
  static constexpr XMMRegister xmm7{7};
  static constexpr XMMRegister xmm8{8};
  static constexpr XMMRegister xmm9{9};
  static constexpr XMMRegister xmm10{10};
  static constexpr XMMRegister xmm11{11};
  static constexpr XMMRegister xmm12{12};
  static constexpr XMMRegister xmm13{13};
  static constexpr XMMRegister xmm14{14};
  static constexpr XMMRegister xmm15{15};

  // 256-bit AVX registers.
  static constexpr YMMRegister no_ymm_register{0x80};
  static constexpr YMMRegister ymm0{0};
  static constexpr YMMRegister ymm1{1};
  static constexpr YMMRegister ymm2{2};
  static constexpr YMMRegister ymm3{3};
  static constexpr YMMRegister ymm4{4};
  static constexpr YMMRegister ymm5{5};
  static constexpr YMMRegister ymm6{6};
  static constexpr YMMRegister ymm7{7};
  static constexpr YMMRegister ymm8{8};
  static constexpr YMMRegister ymm9{9};
  static constexpr YMMRegister ymm10{10};
  static constexpr YMMRegister ymm11{11};
  static constexpr YMMRegister ymm12{12};
  static constexpr YMMRegister ymm13{13};
  static constexpr YMMRegister ymm14{14};
  static constexpr YMMRegister ymm15{15};

  // Macroassembler uses these names to support both x86-32 and x86-64 modes.
  static constexpr Register gpr_a{0};
  static constexpr Register gpr_c{1};
  static constexpr Register gpr_d{2};
  static constexpr Register gpr_b{3};
  static constexpr Register gpr_s{4};

  // Instructions (generated from the instruction tables at build time).
#include "berberis/assembler/gen_assembler_x86_64-inl.h"  // NOLINT generated file!

  // Historical curiosity: x86-32 mode has Movq for memory-to-xmm operations.
  // x86-64 added another one, with different opcode but since they are functionally equivalent
  // GNU Assembler and Clang use old one both in 32-bit mode and 64-bit mode thus we are doing
  // the same.

  // Unhide Decl(Mem) hidden by Decl(Reg).
  using BaseAssembler::Decl;

  // Unhide Decw(Mem) hidden by Decw(Reg).
  using BaseAssembler::Decw;

  // Unhide Incl(Mem) hidden by Incl(Reg).
  using BaseAssembler::Incl;

  // Unhide Incw(Mem) hidden by Incw(Reg).
  using BaseAssembler::Incw;

  // Unhide Movq(Mem, XMMReg) and Movq(XMMReg, Mem) hidden by Movq(Reg, Imm) and many others.
  using BaseAssembler::Movq;

  // Unhide Xchgl(Mem, Reg) hidden by modified version below.
  using BaseAssembler::Xchgl;

  // Unhide Vmov*(Mem, Reg) hidden by Vmov*(Reg, Reg).
  using BaseAssembler::Vmovapd;
  using BaseAssembler::Vmovaps;
  using BaseAssembler::Vmovdqa;
  using BaseAssembler::Vmovdqu;
  using BaseAssembler::Vmovq;
  using BaseAssembler::Vmovsd;
  using BaseAssembler::Vmovss;

  void Xchgl(Register dest, Register src) {
    // In 32-bit mode "xchgl %eax, %eax" did nothing and was often reused as "nop".
    //
    // On x86-64 "xchgl %eax, %eax" clears top half of %eax register, but having single-byte nop
    // is too convenient, thus, as special exception, 0x90 is not interpreted as "xchgl %eax, %eax",
    // but was kept as "nop" - thus longer encoding for "xchgl %eax, %eax" must be used.

    if (IsAccumulator(src) && IsAccumulator(dest)) {
      // Explicit ModR/M form: 0x87 0xc0 (bytes emitted little-endian).
      Emit16(0xc087);
    } else {
      BaseAssembler::Xchgl(dest, src);
    }
  }

  // TODO(b/127356868): decide what to do with these functions when cross-arch assembler is used.

#ifdef __amd64__

  // Unhide Call(Reg), hidden by special version below.
  using BaseAssembler::Call;

  // Call an arbitrary 64-bit absolute address.
  void Call(const void* target) {
    // There are no call instruction with properties we need thus we emulate it.
    // This is what the following code looks like when decoded with objdump (if
    // target address is 0x123456789abcdef0):
    //    0: ff 15 02 00 00 00        callq  *0x2(%rip)  # 0x8
    //    6: eb 08                    jmp    0x10
    //    8: f0 de bc 9a 78 56 34 12  lock fidivrs 0x12345678(%rdx,%rbx,4)
    // First we do call - with address taken from last 8 bytes, then we jump over
    // these 8 bytes.
    Emit64(0x08eb0000000215ff);
    Emit64(bit_cast<int64_t>(target));
  }

  // Unhide Jcc(Label), hidden by special version below.
  using BaseAssembler::Jcc;

  // Make sure only type void* can be passed to function below, not Label* or any other pointer.
  template <typename T>
  auto Jcc(Condition cc, T* target) -> void = delete;

  // Reject integral arguments wider than uintptr_t (would silently truncate).
  template <typename T>
  auto Jcc(Condition cc, T target)
      -> std::enable_if_t<std::is_integral_v<T> && sizeof(uintptr_t) < sizeof(T)> = delete;

  // Conditional jump to an arbitrary 64-bit absolute address.
  void Jcc(Condition cc, uintptr_t target) {
    if (cc == Condition::kAlways) {
      Jmp(target);
      return;
    } else if (cc == Condition::kNever) {
      return;
    }
    CHECK_EQ(0, static_cast<uint8_t>(cc) & 0xF0);
    // There are no Jcc instruction with properties we need thus we emulate it.
    // This is what the following code looks like when decoded with objdump (if
    // target address is 0x123456789abcdef0):
    //    0: 75 0e                    jne    0x10
    //    2: ff 25 00 00 00 00        jmpq   *0x0(%rip)  # 0x8
    //    8: f0 de bc 9a 78 56 34 12  lock fidivrs 0x12345678(%rdx,%rbx,4)
    // We are doing relative jump for the inverted condition (because Jcc could
    // only jump ±2GiB in 64 bit mode which is not enough to reach arbitrary
    // address), then jmpq with address stored right after jmpq.
    Emit64(0x0000000025ff'0e70 | static_cast<int8_t>(ToReverseCond(cc)));
    Emit64(bit_cast<int64_t>(target));
  }

  // Emit short relative jcc to an absolute address.
  //
  // This is used to shorten jcc in the code installed in lower 2G address space.
  // Use this if the target is also within this address space.
  void Jcc32(Condition cc, uintptr_t target) {
    if (cc == Condition::kAlways) {
      Jmp32(target);
      return;
    } else if (cc == Condition::kNever) {
      return;
    }
    CHECK_EQ(static_cast<uint8_t>(cc) & 0xf0, 0);
    Emit8(0x0f);
    Emit8(0x80 | static_cast<uint8_t>(cc));
    // 0xcccc'cccc is a placeholder, patched by the relocation below.
    Emit32(0xcccc'cccc);
    // Set last 4 bytes to displacement from current pc to 'target'.
    AddRelocation(pc() - 4, RelocationType::RelocAbsToDisp32, pc(), bit_cast<intptr_t>(target));
  }

  void Jcc(Condition cc, const void* target) { Jcc(cc, bit_cast<uintptr_t>(target)); }

  // Unhide Jmp(Reg), hidden by special version below.
  using BaseAssembler::Jmp;

  // Make sure only type void* can be passed to function below, not Label* or any other pointer.
  template <typename T>
  auto Jmp(T* target) -> void = delete;

  // Reject integral arguments wider than uintptr_t (would silently truncate).
  template <typename T>
  auto Jmp(T target)
      -> std::enable_if_t<std::is_integral_v<T> && sizeof(uintptr_t) < sizeof(T)> = delete;

  // Unconditional jump to an arbitrary 64-bit absolute address.
  void Jmp(uintptr_t target) {
    // There are no jump instruction with properties we need thus we emulate it.
    // This is what the following code looks like when decoded with objdump (if
    // target address is 0x123456789abcdef0):
    //    0: ff 25 00 00 00 00        jmpq   *0x0(%rip)  # 0x6
    //    6: f0 de bc 9a 78 56 34 12  lock fidivrs 0x12345678(%rdx,%rbx,4)
    // We are doing jump to the address stored right after jmpq using %rip-relative
    // addressing (with offset 0).
    Emit16(0x25ff);
    Emit32(0x00000000);
    Emit64(bit_cast<int64_t>(target));
  }

  // Emit short relative jump to an absolute address.
  //
  // This is used to shorten jmps in the code installed in lower 2G address space.
  // Use this if the target is also within this address space.
  void Jmp32(uintptr_t target) {
    Emit8(0xe9);
    // 0xcccc'cccc is a placeholder, patched by the relocation below.
    Emit32(0xcccc'cccc);
    AddRelocation(pc() - 4, RelocationType::RelocAbsToDisp32, pc(), target);
  }

  void Jmp(const void* target) { Jmp(bit_cast<uintptr_t>(target)); }

#endif

 private:
  // Non-copyable, non-movable; must be constructed around a MachineCode.
  Assembler() = delete;
  Assembler(const Assembler&) = delete;
  Assembler(Assembler&&) = delete;
  void operator=(const Assembler&) = delete;
  void operator=(Assembler&&) = delete;
  using DerivedAssemblerType = Assembler;

  static Register Accumulator() { return rax; }
  static bool IsAccumulator(Register reg) { return reg == rax; }

  // Tag type: register used as a 64-bit operand (selects REX.W in Rex()).
  struct Register64Bit {
    explicit constexpr Register64Bit(Register reg) : num_(reg.num_) {}
    uint8_t num_;
  };

  // Tag type: memory operand accessed as 64 bits (selects REX.W in Rex()).
  struct Memory64Bit {
    explicit Memory64Bit(const Operand& op) : operand(op) {}
    Operand operand;
  };

  // Tag type: label operand used by instructions with 64-bit label arguments.
  struct Label64Bit {
    explicit Label64Bit(const LabelOperand& l) : label(l.label) {}
    const Label& label;
  };

  // This type is only used by CmpXchg16b and acts similarly to Memory64Bit there.
  using Memory128Bit = Memory64Bit;
  using Label128Bit = Label64Bit;

  // Check if a given type is "a register with size" (for EmitInstruction).
  template <typename ArgumentType>
  struct IsRegister {
    static constexpr bool value = std::is_same_v<ArgumentType, Register8Bit> ||
                                  std::is_same_v<ArgumentType, Register32Bit> ||
                                  std::is_same_v<ArgumentType, Register64Bit>;
  };

  // Check if a given type is "a memory operand with size" (for EmitInstruction).
  template <typename ArgumentType>
  struct IsMemoryOperand {
    static constexpr bool value =
        std::is_same_v<ArgumentType, Memory32Bit> || std::is_same_v<ArgumentType, Memory64Bit>;
  };

  // Check if a given type is "a label operand with size" (for EmitInstruction).
  template <typename ArgumentType>
  struct IsLabelOperand {
    static constexpr bool value =
        std::is_same_v<ArgumentType, Label32Bit> || std::is_same_v<ArgumentType, Label64Bit>;
  };

  // Compute and emit the REX prefix (if one is needed) for up to two
  // register/memory arguments of a legacy-encoded instruction.
  template <typename... ArgumentsTypes>
  void EmitRex(ArgumentsTypes... arguments) {
    constexpr auto registers_count = kCountArguments<IsRegister, ArgumentsTypes...>;
    constexpr auto operands_count = kCountArguments<IsMemoryOperand, ArgumentsTypes...>;
    static_assert(registers_count + operands_count <= 2,
                  "Only two-arguments instructions are supported, not VEX or EVEX");
    uint8_t rex = 0;
    if constexpr (registers_count == 2) {
      // First register goes to ModRM.reg (REX.R), second to ModRM.rm (REX.B).
      rex = Rex<0b0100>(ArgumentByType<0, IsRegister>(arguments...)) |
            Rex<0b0001>(ArgumentByType<1, IsRegister>(arguments...));
    } else if constexpr (registers_count == 1 && operands_count == 1) {
      rex = Rex<0b0100>(ArgumentByType<0, IsRegister>(arguments...)) |
            Rex(ArgumentByType<0, IsMemoryOperand>(arguments...));
    } else if constexpr (registers_count == 1) {
      rex = Rex<0b0001>(ArgumentByType<0, IsRegister>(arguments...));
    } else if constexpr (operands_count == 1) {
      rex = Rex(ArgumentByType<0, IsMemoryOperand>(arguments...));
    }
    if (rex) {
      Emit8(rex);
    }
  }

  // REX contribution of one register argument. base_rex is the REX bit the
  // register's high bit (num_ & 0b1000) maps to: 0b0100 (REX.R) for the reg
  // field, 0b0001 (REX.B) for the rm field.
  template <uint8_t base_rex, typename ArgumentType>
  uint8_t Rex(ArgumentType argument) {
    if (argument.num_ & 0b1000) {
      // 64-bit argument requires REX.W bit
      if (std::is_same_v<ArgumentType, Register64Bit>) {
        return 0b0100'1000 | base_rex;
      }
      return 0b0100'0000 | base_rex;
    }
    // 8-bit argument requires REX (even if without any bits).
    if (std::is_same_v<ArgumentType, Register8Bit> && argument.num_ > 3) {
      return 0b0100'0000;
    }
    if (std::is_same_v<ArgumentType, Register64Bit>) {
      return 0b0100'1000;
    }
    return 0;
  }

  // REX contribution of a memory operand's base/index registers.
  uint8_t Rex(Operand operand) {
    // REX.B and REX.X always come from operand.
    uint8_t rex = ((operand.base.num_ & 0b1000) >> 3) | ((operand.index.num_ & 0b1000) >> 2);
    if (rex) {
      // We actually need rex byte here.
      return 0b0100'0000 | rex;
    } else {
      return 0;
    }
  }

  uint8_t Rex(Memory32Bit operand) { return Rex(operand.operand); }

  uint8_t Rex(Memory64Bit operand) {
    // 64-bit argument requires REX.W bit - and thus REX itself.
    return 0b0100'1000 | Rex(operand.operand);
  }

  template <typename RegisterType>
  [[nodiscard]] static bool IsSwapProfitable(RegisterType rm_arg, RegisterType vex_arg) {
    // In 64bit mode we may use more compact encoding if operand encoded in rm is low register.
    // Return true if we may achieve that by swapping arguments.
    return rm_arg.num_ >= 8 && vex_arg.num_ < 8;
  }

  // Emit a VEX prefix. Uses the 3-byte 0xC4 form in general, but falls back to
  // the shorter 2-byte 0xC5 form when X and B are not needed, the opcode map
  // is 0F, and W is 0.
  template <uint8_t byte1,
            uint8_t byte2,
            uint8_t byte3,
            bool reg_is_opcode_extension,
            typename... ArgumentsTypes>
  void EmitVex(ArgumentsTypes... arguments) {
    constexpr auto registers_count = kCountArguments<IsRegister, ArgumentsTypes...>;
    constexpr auto operands_count = kCountArguments<IsMemoryOperand, ArgumentsTypes...>;
    constexpr auto labels_count = kCountArguments<IsLabelOperand, ArgumentsTypes...>;
    // Position of the argument encoded in VEX.vvvv (if any).
    constexpr auto vvvv_parameter = 2 - reg_is_opcode_extension - operands_count - labels_count;
    int vvvv = 0;
    if constexpr (registers_count > vvvv_parameter) {
      vvvv = ArgumentByType<vvvv_parameter, IsRegister>(arguments...).num_;
    }
    // R/X/B bits are stored inverted in VEX, hence the xor below.
    auto vex2 = byte2 | 0b111'00000;
    if constexpr (operands_count == 1) {
      auto operand = ArgumentByType<0, IsMemoryOperand>(arguments...);
      vex2 ^= (operand.operand.base.num_ & 0b1000) << 2;
      vex2 ^= (operand.operand.index.num_ & 0b1000) << 3;
      if constexpr (!reg_is_opcode_extension) {
        vex2 ^= (ArgumentByType<0, IsRegister>(arguments...).num_ & 0b1000) << 4;
      }
    } else if constexpr (labels_count == 1) {
      if constexpr (!reg_is_opcode_extension) {
        vex2 ^= (ArgumentByType<0, IsRegister>(arguments...).num_ & 0b1000) << 4;
      }
    } else if constexpr (registers_count > 0) {
      if constexpr (reg_is_opcode_extension) {
        vex2 ^= (ArgumentByType<0, IsRegister>(arguments...).num_ & 0b1000) << 2;
      } else {
        vex2 ^= (ArgumentByType<0, IsRegister>(arguments...).num_ & 0b1000) << 4;
        vex2 ^= (ArgumentByType<1, IsRegister>(arguments...).num_ & 0b1000) << 2;
      }
    }
    if (byte1 == 0xC4 && (vex2 & 0b0'1'1'11111) == 0b0'1'1'00001 && (byte3 & 0b1'0000'0'00) == 0) {
      Emit16((0xc5 | ((vex2 & 0b1'0'0'00000) << 8) | (byte3 << 8) |
              0b0'1111'000'00000000) ^ (vvvv << 11));
    } else {
      Emit8(byte1);
      Emit16((vex2 | (byte3 << 8) | 0b0'1111'000'00000000) ^ (vvvv << 11));
    }
  }

  // Encode a register number directly into the low 3 bits of the opcode byte.
  template <typename ArgumentType>
  void EmitRegisterInOpcode(uint8_t opcode, ArgumentType argument) {
    Emit8(opcode | (argument.num_ & 0b111));
  }

  // ModR/M byte for register-register form (mod = 11).
  template <typename ArgumentType1, typename ArgumentType2>
  void EmitModRM(ArgumentType1 argument1, ArgumentType2 argument2) {
    Emit8(0xC0 | ((argument1.num_ & 0b111) << 3) | (argument2.num_ & 0b111));
  }

  // ModR/M byte with an opcode extension in the reg field (mod = 11).
  template <typename ArgumentType>
  void EmitModRM(uint8_t opcode_extension, ArgumentType argument) {
    CHECK_LE(opcode_extension, 0b111);
    Emit8(0xC0 | (opcode_extension << 3) | (argument.num_ & 0b111));
  }

  template <typename ArgumentType>
  constexpr void EmitOperandOp(ArgumentType argument, Operand operand) {
    EmitOperandOp(static_cast<int>(argument.num_ & 0b111), operand);
  }

  template <size_t kImmediatesSize, typename ArgumentType>
  constexpr void EmitRipOp(ArgumentType argument, const Label& label) {
    EmitRipOp<kImmediatesSize>(static_cast<int>(argument.num_) & 0b111, label);
  }

  // Emit the ModR/M byte, and optionally the SIB byte and
  // 1- or 4-byte offset for a memory operand.  Also used to encode
  // a three-bit opcode extension into the ModR/M byte.
  constexpr void EmitOperandOp(int num_ber, const Operand& addr);
  // Helper functions to handle various ModR/M and SIB combinations.
  // Should *only* be called from EmitOperandOp!
  constexpr void EmitIndexDispOperand(int reg, const Operand& addr);
  template <typename ArgType, void (AssemblerBase::*)(ArgType)>
  constexpr void EmitBaseIndexDispOperand(int base_modrm_and_sib, const Operand& addr);
  // Emit ModR/M for rip-addressig.
  template <size_t kImmediatesSize>
  constexpr void EmitRipOp(int num_, const Label& label);

  friend BaseAssembler;
};
463
// This function looks big, but when we are emitting Operand with fixed registers
// (which is the most common case) all "if"s below are calculated statically which
// makes effective size of that function very small.
//
// But for this to happen function have to be inline and in header.
constexpr inline void Assembler::EmitOperandOp(int num_ber, const Operand& addr) {
  // Additional info (register number or opcode extension) is limited to 3 bits.
  CHECK_LE(unsigned(num_ber), 7);

  // Reg field must be shifted by 3 bits.
  int reg = num_ber << 3;

  // On x86 %rsp cannot be index, only base.
  CHECK(addr.index != rsp);

  // If base is not %rsp/r12 and we don't have index, then we don't have SIB byte.
  // All other cases have "ModR/M" and SIB bytes.
  if (addr.base != rsp && addr.base != r12 && addr.index == no_register) {
    // If we have base register then we could use the same logic as for other common cases.
    if (addr.base != no_register) {
      EmitBaseIndexDispOperand<uint8_t, &Assembler::Emit8>((addr.base.num_ & 7) | reg, addr);
    } else {
      // Neither base nor index: absolute disp32, encoded via the SIB
      // "no base, no index" form (ModRM 0x04 + SIB 0x25).
      Emit16(0x2504 | reg);
      Emit32(addr.disp);
    }
  } else if (addr.index == no_register) {
    // Note: when ModR/M and SIB are used "no index" is encoded as if %rsp is used in place of
    // index (that's why %rsp couldn't be used as index - see check above).
    EmitBaseIndexDispOperand<int16_t, &Assembler::Emit16>(
        0x2004 | ((addr.base.num_ & 7) << 8) | reg, addr);
  } else if (addr.base == no_register) {
    EmitIndexDispOperand(reg, addr);
  } else {
    // General case: base + scaled index (+ displacement), ModR/M + SIB.
    EmitBaseIndexDispOperand<int16_t, &Assembler::Emit16>(0x04 | (addr.scale << 14) |
                                                              ((addr.index.num_ & 7) << 11) |
                                                              ((addr.base.num_ & 7) << 8) | reg,
                                                          addr);
  }
}
503
// Encode a scaled-index-plus-disp32 operand (index present, no base).
constexpr inline void Assembler::EmitIndexDispOperand(int reg, const Operand& addr) {
  // We only have index here, no base, use SIB but put %rbp in "base" field.
  // With mod = 00, a SIB base of %rbp means "no base, disp32 follows".
  Emit16(0x0504 | (addr.scale << 14) | ((addr.index.num_ & 7) << 11) | reg);
  Emit32(addr.disp);
}
509
// Emit ModR/M (+disp32 placeholder) for %rip-relative addressing of `label`.
// kImmediatesSize is the size of immediates that follow the displacement.
template <size_t kImmediatesSize>
constexpr inline void Assembler::EmitRipOp(int num_, const Label& label) {
  // mod = 00, rm = 101 selects %rip-relative addressing with disp32.
  Emit8(0x05 | (num_ << 3));
  // Record a fixup so the displacement is patched when the label is bound.
  jumps_.push_back(Jump{&label, pc(), false});
  // Placeholder: -4 - kImmediatesSize, since the displacement is relative to
  // the end of the instruction (presumably combined with the label address at
  // fixup time - see jumps_ processing in the base assembler).
  Emit32(0xfffffffc - kImmediatesSize);
}
516
517 template <typename ArgType, void (AssemblerBase::* EmitBase)(ArgType)>
EmitBaseIndexDispOperand(int base_modrm_and_sib,const Operand & addr)518 constexpr inline void Assembler::EmitBaseIndexDispOperand(int base_modrm_and_sib,
519 const Operand& addr) {
520 if (addr.disp == 0 && addr.base != rbp && addr.base != r13) {
521 // We can omit zero displacement only if base isn't %rbp/%r13
522 (this->*EmitBase)(base_modrm_and_sib);
523 } else if (IsInRange<int8_t>(addr.disp)) {
524 // If disp could it in byte then use byte-disp.
525 (this->*EmitBase)(base_modrm_and_sib | 0x40);
526 Emit8(addr.disp);
527 } else {
528 // Otherwise use full-disp.
529 (this->*EmitBase)(base_modrm_and_sib | 0x80);
530 Emit32(addr.disp);
531 }
532 }
533
Movq(Register dest,int64_t imm64)534 constexpr inline void Assembler::Movq(Register dest, int64_t imm64) {
535 if (IsInRange<uint32_t>(imm64)) {
536 // Shorter encoding.
537 Movl(dest, static_cast<uint32_t>(imm64));
538 } else if (IsInRange<int32_t>(imm64)) {
539 // Slightly longer encoding.
540 EmitInstruction<0xc7, 0x00>(Register64Bit(dest), static_cast<int32_t>(imm64));
541 } else {
542 // Longest encoding.
543 EmitInstruction<0xb8>(Register64Bit(dest), imm64);
544 }
545 }
546
Vmovapd(XMMRegister arg0,XMMRegister arg1)547 constexpr inline void Assembler::Vmovapd(XMMRegister arg0, XMMRegister arg1) {
548 if (IsSwapProfitable(arg1, arg0)) {
549 return EmitInstruction<0xc4, 0x01, 0x01, 0x29>(VectorRegister128Bit(arg1),
550 VectorRegister128Bit(arg0));
551 }
552 EmitInstruction<0xc4, 0x01, 0x01, 0x28>(VectorRegister128Bit(arg0), VectorRegister128Bit(arg1));
553 }
554
Vmovapd(YMMRegister arg0,YMMRegister arg1)555 constexpr inline void Assembler::Vmovapd(YMMRegister arg0, YMMRegister arg1) {
556 if (IsSwapProfitable(arg1, arg0)) {
557 return EmitInstruction<0xc4, 0x01, 0x05, 0x29>(VectorRegister256Bit(arg1),
558 VectorRegister256Bit(arg0));
559 }
560 EmitInstruction<0xc4, 0x01, 0x05, 0x28>(VectorRegister256Bit(arg0), VectorRegister256Bit(arg1));
561 }
562
Vmovaps(XMMRegister arg0,XMMRegister arg1)563 constexpr inline void Assembler::Vmovaps(XMMRegister arg0, XMMRegister arg1) {
564 if (IsSwapProfitable(arg1, arg0)) {
565 return EmitInstruction<0xc4, 0x01, 0x00, 0x29>(VectorRegister128Bit(arg1),
566 VectorRegister128Bit(arg0));
567 }
568 EmitInstruction<0xc4, 0x01, 0x00, 0x28>(VectorRegister128Bit(arg0), VectorRegister128Bit(arg1));
569 }
570
Vmovaps(YMMRegister arg0,YMMRegister arg1)571 constexpr inline void Assembler::Vmovaps(YMMRegister arg0, YMMRegister arg1) {
572 if (IsSwapProfitable(arg1, arg0)) {
573 return EmitInstruction<0xc4, 0x01, 0x04, 0x29>(VectorRegister256Bit(arg1),
574 VectorRegister256Bit(arg0));
575 }
576 EmitInstruction<0xc4, 0x01, 0x04, 0x28>(VectorRegister256Bit(arg0), VectorRegister256Bit(arg1));
577 }
578
Vmovdqa(XMMRegister arg0,XMMRegister arg1)579 constexpr inline void Assembler::Vmovdqa(XMMRegister arg0, XMMRegister arg1) {
580 if (IsSwapProfitable(arg1, arg0)) {
581 return EmitInstruction<0xc4, 0x01, 0x01, 0x7F>(VectorRegister128Bit(arg1),
582 VectorRegister128Bit(arg0));
583 }
584 EmitInstruction<0xc4, 0x01, 0x01, 0x6F>(VectorRegister128Bit(arg0), VectorRegister128Bit(arg1));
585 }
586
Vmovdqa(YMMRegister arg0,YMMRegister arg1)587 constexpr inline void Assembler::Vmovdqa(YMMRegister arg0, YMMRegister arg1) {
588 if (IsSwapProfitable(arg1, arg0)) {
589 return EmitInstruction<0xc4, 0x01, 0x05, 0x7F>(VectorRegister256Bit(arg1),
590 VectorRegister256Bit(arg0));
591 }
592 EmitInstruction<0xc4, 0x01, 0x05, 0x6F>(VectorRegister256Bit(arg0), VectorRegister256Bit(arg1));
593 }
594
Vmovdqu(XMMRegister arg0,XMMRegister arg1)595 constexpr inline void Assembler::Vmovdqu(XMMRegister arg0, XMMRegister arg1) {
596 if (IsSwapProfitable(arg1, arg0)) {
597 return EmitInstruction<0xc4, 0x01, 0x02, 0x7F>(VectorRegister128Bit(arg1),
598 VectorRegister128Bit(arg0));
599 }
600 EmitInstruction<0xc4, 0x01, 0x02, 0x6F>(VectorRegister128Bit(arg0), VectorRegister128Bit(arg1));
601 }
602
Vmovdqu(YMMRegister arg0,YMMRegister arg1)603 constexpr inline void Assembler::Vmovdqu(YMMRegister arg0, YMMRegister arg1) {
604 if (IsSwapProfitable(arg1, arg0)) {
605 return EmitInstruction<0xc4, 0x01, 0x06, 0x7F>(VectorRegister256Bit(arg1),
606 VectorRegister256Bit(arg0));
607 }
608 EmitInstruction<0xc4, 0x01, 0x06, 0x6F>(VectorRegister256Bit(arg0), VectorRegister256Bit(arg1));
609 }
610
Vmovsd(XMMRegister arg0,XMMRegister arg1,XMMRegister arg2)611 constexpr inline void Assembler::Vmovsd(XMMRegister arg0, XMMRegister arg1, XMMRegister arg2) {
612 if (IsSwapProfitable(arg2, arg0)) {
613 return EmitInstruction<0xc4, 0x01, 0x03, 0x11>(
614 VectorRegister128Bit(arg2), VectorRegister128Bit(arg0), VectorRegister128Bit(arg1));
615 }
616 EmitInstruction<0xc4, 0x01, 0x03, 0x10>(
617 VectorRegister128Bit(arg0), VectorRegister128Bit(arg2), VectorRegister128Bit(arg1));
618 }
619
Vmovss(XMMRegister arg0,XMMRegister arg1,XMMRegister arg2)620 constexpr inline void Assembler::Vmovss(XMMRegister arg0, XMMRegister arg1, XMMRegister arg2) {
621 if (IsSwapProfitable(arg2, arg0)) {
622 return EmitInstruction<0xc4, 0x01, 0x02, 0x11>(
623 VectorRegister128Bit(arg2), VectorRegister128Bit(arg0), VectorRegister128Bit(arg1));
624 }
625 EmitInstruction<0xc4, 0x01, 0x02, 0x10>(
626 VectorRegister128Bit(arg0), VectorRegister128Bit(arg2), VectorRegister128Bit(arg1));
627 }
628
Xchgq(Register dest,Register src)629 constexpr inline void Assembler::Xchgq(Register dest, Register src) {
630 // We compare output to that from clang and thus want to produce the same code.
631 // 0x48 0x90 is suboptimal encoding for that operation (pure 0x90 does the same
632 // and this is what gcc + gas are producing), but this is what clang <= 8 does.
633 if (IsAccumulator(src) && IsAccumulator(dest)) {
634 Emit8(0x90);
635 } else if (IsAccumulator(src) || IsAccumulator(dest)) {
636 Register other = IsAccumulator(src) ? dest : src;
637 EmitInstruction<0x90>(Register64Bit(other));
638 } else {
639 // Clang 8 (after r330298) puts dest before src. We are comparing output
640 // to clang in exhaustive test thus we want to match clang behavior exactly.
641 EmitInstruction<0x87>(Register64Bit(dest), Register64Bit(src));
642 }
643 }
644
645 } // namespace x86_64
646
647 } // namespace berberis
648
649 #endif // BERBERIS_ASSEMBLER_X86_64_H_
650