1 /*
2 * Copyright (C) 2015 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include "intrinsics_x86_64.h"
18
19 #include <limits>
20
21 #include "arch/x86_64/instruction_set_features_x86_64.h"
22 #include "art_method.h"
23 #include "base/bit_utils.h"
24 #include "code_generator_x86_64.h"
25 #include "entrypoints/quick/quick_entrypoints.h"
26 #include "heap_poisoning.h"
27 #include "intrinsics.h"
28 #include "intrinsics_utils.h"
29 #include "lock_word.h"
30 #include "mirror/array-inl.h"
31 #include "mirror/object_array-inl.h"
32 #include "mirror/reference.h"
33 #include "mirror/string.h"
34 #include "scoped_thread_state_change-inl.h"
35 #include "thread-current-inl.h"
36 #include "utils/x86_64/assembler_x86_64.h"
37 #include "utils/x86_64/constants_x86_64.h"
38
39 namespace art {
40
41 namespace x86_64 {
42
43 IntrinsicLocationsBuilderX86_64::IntrinsicLocationsBuilderX86_64(CodeGeneratorX86_64* codegen)
44 : allocator_(codegen->GetGraph()->GetAllocator()), codegen_(codegen) {
45 }
46
47 X86_64Assembler* IntrinsicCodeGeneratorX86_64::GetAssembler() {
48 return down_cast<X86_64Assembler*>(codegen_->GetAssembler());
49 }
50
51 ArenaAllocator* IntrinsicCodeGeneratorX86_64::GetAllocator() {
52 return codegen_->GetGraph()->GetAllocator();
53 }
54
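// Returns true if `invoke` was recognized as an intrinsic and intrinsified
// locations were created for it, so the caller may skip the default call setup.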
55 bool IntrinsicLocationsBuilderX86_64::TryDispatch(HInvoke* invoke) {
56 Dispatch(invoke);
57 LocationSummary* res = invoke->GetLocations();
58 if (res == nullptr) {
59 return false;
60 }
61 return res->Intrinsified();
62 }
63
64 static void MoveArguments(HInvoke* invoke, CodeGeneratorX86_64* codegen) {
65 InvokeDexCallingConventionVisitorX86_64 calling_convention_visitor;
66 IntrinsicVisitor::MoveArguments(invoke, codegen, &calling_convention_visitor);
67 }
68
69 using IntrinsicSlowPathX86_64 = IntrinsicSlowPath<InvokeDexCallingConventionVisitorX86_64>;
70
71 // NOLINT on __ macro to suppress wrong warning/fix (misc-macro-parentheses) from clang-tidy.
72 #define __ down_cast<X86_64Assembler*>(codegen->GetAssembler())-> // NOLINT
73
74 // Slow path implementing the SystemArrayCopy intrinsic copy loop with read barriers.
75 class ReadBarrierSystemArrayCopySlowPathX86_64 : public SlowPathCode {
76 public:
77 explicit ReadBarrierSystemArrayCopySlowPathX86_64(HInstruction* instruction)
78 : SlowPathCode(instruction) {
79 DCHECK(kEmitCompilerReadBarrier);
80 DCHECK(kUseBakerReadBarrier);
81 }
82
83 void EmitNativeCode(CodeGenerator* codegen) OVERRIDE {
84 CodeGeneratorX86_64* x86_64_codegen = down_cast<CodeGeneratorX86_64*>(codegen);
85 LocationSummary* locations = instruction_->GetLocations();
86 DCHECK(locations->CanCall());
87 DCHECK(instruction_->IsInvokeStaticOrDirect())
88 << "Unexpected instruction in read barrier arraycopy slow path: "
89 << instruction_->DebugName();
90 DCHECK(instruction_->GetLocations()->Intrinsified());
91 DCHECK_EQ(instruction_->AsInvoke()->GetIntrinsic(), Intrinsics::kSystemArrayCopy);
92
93 int32_t element_size = DataType::Size(DataType::Type::kReference);
94
95 CpuRegister src_curr_addr = locations->GetTemp(0).AsRegister<CpuRegister>();
96 CpuRegister dst_curr_addr = locations->GetTemp(1).AsRegister<CpuRegister>();
97 CpuRegister src_stop_addr = locations->GetTemp(2).AsRegister<CpuRegister>();
98
99 __ Bind(GetEntryLabel());
100 NearLabel loop;
101 __ Bind(&loop);
102 __ movl(CpuRegister(TMP), Address(src_curr_addr, 0));
103 __ MaybeUnpoisonHeapReference(CpuRegister(TMP));
104 // TODO: Inline the mark bit check before calling the runtime?
105 // TMP = ReadBarrier::Mark(TMP);
106 // No need to save live registers; it's taken care of by the
107 // entrypoint. Also, there is no need to update the stack mask,
108 // as this runtime call will not trigger a garbage collection.
109 int32_t entry_point_offset = Thread::ReadBarrierMarkEntryPointsOffset<kX86_64PointerSize>(TMP);
110 // This runtime call does not require a stack map.
111 x86_64_codegen->InvokeRuntimeWithoutRecordingPcInfo(entry_point_offset, instruction_, this);
112 __ MaybePoisonHeapReference(CpuRegister(TMP));
113 __ movl(Address(dst_curr_addr, 0), CpuRegister(TMP));
114 __ addl(src_curr_addr, Immediate(element_size));
115 __ addl(dst_curr_addr, Immediate(element_size));
116 __ cmpl(src_curr_addr, src_stop_addr);
117 __ j(kNotEqual, &loop);
118 __ jmp(GetExitLabel());
119 }
120
121 const char* GetDescription() const OVERRIDE { return "ReadBarrierSystemArrayCopySlowPathX86_64"; }
122
123 private:
124 DISALLOW_COPY_AND_ASSIGN(ReadBarrierSystemArrayCopySlowPathX86_64);
125 };
126
127 #undef __
128
129 #define __ assembler->
130
131 static void CreateFPToIntLocations(ArenaAllocator* allocator, HInvoke* invoke) {
132 LocationSummary* locations =
133 new (allocator) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
134 locations->SetInAt(0, Location::RequiresFpuRegister());
135 locations->SetOut(Location::RequiresRegister());
136 }
137
138 static void CreateIntToFPLocations(ArenaAllocator* allocator, HInvoke* invoke) {
139 LocationSummary* locations =
140 new (allocator) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
141 locations->SetInAt(0, Location::RequiresRegister());
142 locations->SetOut(Location::RequiresFpuRegister());
143 }
144
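// Note: the `is64bit` flag passed to movd below is assumed to select the 64-bit
// (movq) form of the GPR<->XMM transfer for the Double variants and the 32-bit
// form for the Float variants.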
145 static void MoveFPToInt(LocationSummary* locations, bool is64bit, X86_64Assembler* assembler) {
146 Location input = locations->InAt(0);
147 Location output = locations->Out();
148 __ movd(output.AsRegister<CpuRegister>(), input.AsFpuRegister<XmmRegister>(), is64bit);
149 }
150
151 static void MoveIntToFP(LocationSummary* locations, bool is64bit, X86_64Assembler* assembler) {
152 Location input = locations->InAt(0);
153 Location output = locations->Out();
154 __ movd(output.AsFpuRegister<XmmRegister>(), input.AsRegister<CpuRegister>(), is64bit);
155 }
156
157 void IntrinsicLocationsBuilderX86_64::VisitDoubleDoubleToRawLongBits(HInvoke* invoke) {
158 CreateFPToIntLocations(allocator_, invoke);
159 }
160 void IntrinsicLocationsBuilderX86_64::VisitDoubleLongBitsToDouble(HInvoke* invoke) {
161 CreateIntToFPLocations(allocator_, invoke);
162 }
163
164 void IntrinsicCodeGeneratorX86_64::VisitDoubleDoubleToRawLongBits(HInvoke* invoke) {
165 MoveFPToInt(invoke->GetLocations(), /* is64bit */ true, GetAssembler());
166 }
167 void IntrinsicCodeGeneratorX86_64::VisitDoubleLongBitsToDouble(HInvoke* invoke) {
168 MoveIntToFP(invoke->GetLocations(), /* is64bit */ true, GetAssembler());
169 }
170
171 void IntrinsicLocationsBuilderX86_64::VisitFloatFloatToRawIntBits(HInvoke* invoke) {
172 CreateFPToIntLocations(allocator_, invoke);
173 }
174 void IntrinsicLocationsBuilderX86_64::VisitFloatIntBitsToFloat(HInvoke* invoke) {
175 CreateIntToFPLocations(allocator_, invoke);
176 }
177
178 void IntrinsicCodeGeneratorX86_64::VisitFloatFloatToRawIntBits(HInvoke* invoke) {
179 MoveFPToInt(invoke->GetLocations(), /* is64bit */ false, GetAssembler());
180 }
181 void IntrinsicCodeGeneratorX86_64::VisitFloatIntBitsToFloat(HInvoke* invoke) {
182 MoveIntToFP(invoke->GetLocations(), /* is64bit */ false, GetAssembler());
183 }
184
185 static void CreateIntToIntLocations(ArenaAllocator* allocator, HInvoke* invoke) {
186 LocationSummary* locations =
187 new (allocator) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
188 locations->SetInAt(0, Location::RequiresRegister());
189 locations->SetOut(Location::SameAsFirstInput());
190 }
191
192 static void GenReverseBytes(LocationSummary* locations,
193 DataType::Type size,
194 X86_64Assembler* assembler) {
195 CpuRegister out = locations->Out().AsRegister<CpuRegister>();
196
197 switch (size) {
198 case DataType::Type::kInt16:
199 // TODO: Can be done with an xchg of 8b registers. This is straight from Quick.
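// bswapl reverses all four bytes, leaving the swapped 16-bit value in the
// upper half of the register; the arithmetic shift then moves it back down
// and sign-extends it, as required for a Java short.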
200 __ bswapl(out);
201 __ sarl(out, Immediate(16));
202 break;
203 case DataType::Type::kInt32:
204 __ bswapl(out);
205 break;
206 case DataType::Type::kInt64:
207 __ bswapq(out);
208 break;
209 default:
210 LOG(FATAL) << "Unexpected size for reverse-bytes: " << size;
211 UNREACHABLE();
212 }
213 }
214
215 void IntrinsicLocationsBuilderX86_64::VisitIntegerReverseBytes(HInvoke* invoke) {
216 CreateIntToIntLocations(allocator_, invoke);
217 }
218
219 void IntrinsicCodeGeneratorX86_64::VisitIntegerReverseBytes(HInvoke* invoke) {
220 GenReverseBytes(invoke->GetLocations(), DataType::Type::kInt32, GetAssembler());
221 }
222
223 void IntrinsicLocationsBuilderX86_64::VisitLongReverseBytes(HInvoke* invoke) {
224 CreateIntToIntLocations(allocator_, invoke);
225 }
226
227 void IntrinsicCodeGeneratorX86_64::VisitLongReverseBytes(HInvoke* invoke) {
228 GenReverseBytes(invoke->GetLocations(), DataType::Type::kInt64, GetAssembler());
229 }
230
231 void IntrinsicLocationsBuilderX86_64::VisitShortReverseBytes(HInvoke* invoke) {
232 CreateIntToIntLocations(allocator_, invoke);
233 }
234
235 void IntrinsicCodeGeneratorX86_64::VisitShortReverseBytes(HInvoke* invoke) {
236 GenReverseBytes(invoke->GetLocations(), DataType::Type::kInt16, GetAssembler());
237 }
238
239
240 // TODO: Consider Quick's way of doing Double abs through integer operations, as the immediate we
241 // need is 64b.
242
243 static void CreateFloatToFloatPlusTemps(ArenaAllocator* allocator, HInvoke* invoke) {
244 // TODO: Enable memory operations when the assembler supports them.
245 LocationSummary* locations =
246 new (allocator) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
247 locations->SetInAt(0, Location::RequiresFpuRegister());
248 locations->SetOut(Location::SameAsFirstInput());
249 locations->AddTemp(Location::RequiresFpuRegister()); // FP reg to hold mask.
250 }
251
252 static void MathAbsFP(LocationSummary* locations,
253 bool is64bit,
254 X86_64Assembler* assembler,
255 CodeGeneratorX86_64* codegen) {
256 Location output = locations->Out();
257
258 DCHECK(output.IsFpuRegister());
259 XmmRegister xmm_temp = locations->GetTemp(0).AsFpuRegister<XmmRegister>();
260
261 // TODO: Can mask directly with constant area using pand if we can guarantee
262 // that the literal is aligned on a 16 byte boundary. This will avoid a
263 // temporary.
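// abs() is implemented by clearing the sign bit: AND with 0x7FFFFFFFFFFFFFFF
// for doubles or 0x7FFFFFFF for floats leaves the magnitude unchanged and
// also handles -0.0 and NaN inputs.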
264 if (is64bit) {
265 __ movsd(xmm_temp, codegen->LiteralInt64Address(INT64_C(0x7FFFFFFFFFFFFFFF)));
266 __ andpd(output.AsFpuRegister<XmmRegister>(), xmm_temp);
267 } else {
268 __ movss(xmm_temp, codegen->LiteralInt32Address(INT32_C(0x7FFFFFFF)));
269 __ andps(output.AsFpuRegister<XmmRegister>(), xmm_temp);
270 }
271 }
272
273 void IntrinsicLocationsBuilderX86_64::VisitMathAbsDouble(HInvoke* invoke) {
274 CreateFloatToFloatPlusTemps(allocator_, invoke);
275 }
276
277 void IntrinsicCodeGeneratorX86_64::VisitMathAbsDouble(HInvoke* invoke) {
278 MathAbsFP(invoke->GetLocations(), /* is64bit */ true, GetAssembler(), codegen_);
279 }
280
281 void IntrinsicLocationsBuilderX86_64::VisitMathAbsFloat(HInvoke* invoke) {
282 CreateFloatToFloatPlusTemps(allocator_, invoke);
283 }
284
285 void IntrinsicCodeGeneratorX86_64::VisitMathAbsFloat(HInvoke* invoke) {
286 MathAbsFP(invoke->GetLocations(), /* is64bit */ false, GetAssembler(), codegen_);
287 }
288
289 static void CreateIntToIntPlusTemp(ArenaAllocator* allocator, HInvoke* invoke) {
290 LocationSummary* locations =
291 new (allocator) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
292 locations->SetInAt(0, Location::RequiresRegister());
293 locations->SetOut(Location::SameAsFirstInput());
294 locations->AddTemp(Location::RequiresRegister());
295 }
296
297 static void GenAbsInteger(LocationSummary* locations, bool is64bit, X86_64Assembler* assembler) {
298 Location output = locations->Out();
299 CpuRegister out = output.AsRegister<CpuRegister>();
300 CpuRegister mask = locations->GetTemp(0).AsRegister<CpuRegister>();
301
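// Branch-free abs: mask = value >> 31 (or 63) is all ones for negative inputs
// and zero otherwise, so (value + mask) ^ mask negates negative values via
// two's complement and leaves non-negative values untouched.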
302 if (is64bit) {
303 // Create mask.
304 __ movq(mask, out);
305 __ sarq(mask, Immediate(63));
306 // Add mask.
307 __ addq(out, mask);
308 __ xorq(out, mask);
309 } else {
310 // Create mask.
311 __ movl(mask, out);
312 __ sarl(mask, Immediate(31));
313 // Add mask.
314 __ addl(out, mask);
315 __ xorl(out, mask);
316 }
317 }
318
319 void IntrinsicLocationsBuilderX86_64::VisitMathAbsInt(HInvoke* invoke) {
320 CreateIntToIntPlusTemp(allocator_, invoke);
321 }
322
323 void IntrinsicCodeGeneratorX86_64::VisitMathAbsInt(HInvoke* invoke) {
324 GenAbsInteger(invoke->GetLocations(), /* is64bit */ false, GetAssembler());
325 }
326
327 void IntrinsicLocationsBuilderX86_64::VisitMathAbsLong(HInvoke* invoke) {
328 CreateIntToIntPlusTemp(allocator_, invoke);
329 }
330
331 void IntrinsicCodeGeneratorX86_64::VisitMathAbsLong(HInvoke* invoke) {
332 GenAbsInteger(invoke->GetLocations(), /* is64bit */ true, GetAssembler());
333 }
334
335 static void GenMinMaxFP(LocationSummary* locations,
336 bool is_min,
337 bool is_double,
338 X86_64Assembler* assembler,
339 CodeGeneratorX86_64* codegen) {
340 Location op1_loc = locations->InAt(0);
341 Location op2_loc = locations->InAt(1);
342 Location out_loc = locations->Out();
343 XmmRegister out = out_loc.AsFpuRegister<XmmRegister>();
344
345 // Shortcut for same input locations.
346 if (op1_loc.Equals(op2_loc)) {
347 DCHECK(out_loc.Equals(op1_loc));
348 return;
349 }
350
351 // (out := op1)
352 // out <=? op2
353 // if NaN jmp NaN_label
354 // if out is min jmp done
355 // if op2 is min jmp op2_label
356 // handle -0/+0
357 // jmp done
358 // NaN_label:
359 // out := NaN
360 // op2_label:
361 // out := op2
362 // done:
363 //
364 // This removes one jmp, but needs to copy one input (op1) to out.
365 //
366 // TODO: This is straight from Quick. Make NaN an out-of-line slowpath?
367
368 XmmRegister op2 = op2_loc.AsFpuRegister<XmmRegister>();
369
370 NearLabel nan, done, op2_label;
371 if (is_double) {
372 __ ucomisd(out, op2);
373 } else {
374 __ ucomiss(out, op2);
375 }
376
377 __ j(Condition::kParityEven, &nan);
378
379 __ j(is_min ? Condition::kAbove : Condition::kBelow, &op2_label);
380 __ j(is_min ? Condition::kBelow : Condition::kAbove, &done);
381
382 // Handle 0.0/-0.0.
383 if (is_min) {
384 if (is_double) {
385 __ orpd(out, op2);
386 } else {
387 __ orps(out, op2);
388 }
389 } else {
390 if (is_double) {
391 __ andpd(out, op2);
392 } else {
393 __ andps(out, op2);
394 }
395 }
396 __ jmp(&done);
397
398 // NaN handling.
399 __ Bind(&nan);
400 if (is_double) {
401 __ movsd(out, codegen->LiteralInt64Address(INT64_C(0x7FF8000000000000)));
402 } else {
403 __ movss(out, codegen->LiteralInt32Address(INT32_C(0x7FC00000)));
404 }
405 __ jmp(&done);
406
407 // out := op2;
408 __ Bind(&op2_label);
409 if (is_double) {
410 __ movsd(out, op2);
411 } else {
412 __ movss(out, op2);
413 }
414
415 // Done.
416 __ Bind(&done);
417 }
418
419 static void CreateFPFPToFP(ArenaAllocator* allocator, HInvoke* invoke) {
420 LocationSummary* locations =
421 new (allocator) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
422 locations->SetInAt(0, Location::RequiresFpuRegister());
423 locations->SetInAt(1, Location::RequiresFpuRegister());
424 // The following is sub-optimal, but all we can do for now. It would be fine to also accept
425 // the second input to be the output (we can simply swap inputs).
426 locations->SetOut(Location::SameAsFirstInput());
427 }
428
429 void IntrinsicLocationsBuilderX86_64::VisitMathMinDoubleDouble(HInvoke* invoke) {
430 CreateFPFPToFP(allocator_, invoke);
431 }
432
433 void IntrinsicCodeGeneratorX86_64::VisitMathMinDoubleDouble(HInvoke* invoke) {
434 GenMinMaxFP(
435 invoke->GetLocations(), /* is_min */ true, /* is_double */ true, GetAssembler(), codegen_);
436 }
437
438 void IntrinsicLocationsBuilderX86_64::VisitMathMinFloatFloat(HInvoke* invoke) {
439 CreateFPFPToFP(allocator_, invoke);
440 }
441
442 void IntrinsicCodeGeneratorX86_64::VisitMathMinFloatFloat(HInvoke* invoke) {
443 GenMinMaxFP(
444 invoke->GetLocations(), /* is_min */ true, /* is_double */ false, GetAssembler(), codegen_);
445 }
446
447 void IntrinsicLocationsBuilderX86_64::VisitMathMaxDoubleDouble(HInvoke* invoke) {
448 CreateFPFPToFP(allocator_, invoke);
449 }
450
451 void IntrinsicCodeGeneratorX86_64::VisitMathMaxDoubleDouble(HInvoke* invoke) {
452 GenMinMaxFP(
453 invoke->GetLocations(), /* is_min */ false, /* is_double */ true, GetAssembler(), codegen_);
454 }
455
456 void IntrinsicLocationsBuilderX86_64::VisitMathMaxFloatFloat(HInvoke* invoke) {
457 CreateFPFPToFP(allocator_, invoke);
458 }
459
460 void IntrinsicCodeGeneratorX86_64::VisitMathMaxFloatFloat(HInvoke* invoke) {
461 GenMinMaxFP(
462 invoke->GetLocations(), /* is_min */ false, /* is_double */ false, GetAssembler(), codegen_);
463 }
464
465 static void GenMinMax(LocationSummary* locations, bool is_min, bool is_long,
466 X86_64Assembler* assembler) {
467 Location op1_loc = locations->InAt(0);
468 Location op2_loc = locations->InAt(1);
469
470 // Shortcut for same input locations.
471 if (op1_loc.Equals(op2_loc)) {
472 // Can return immediately, as op1_loc == out_loc.
473 // Note: if we ever support separate registers, e.g., output into memory, we need to check for
474 // a copy here.
475 DCHECK(locations->Out().Equals(op1_loc));
476 return;
477 }
478
479 CpuRegister out = locations->Out().AsRegister<CpuRegister>();
480 CpuRegister op2 = op2_loc.AsRegister<CpuRegister>();
481
482 // (out := op1)
483 // out <=? op2
484 // if out is min jmp done
485 // out := op2
486 // done:
487
488 if (is_long) {
489 __ cmpq(out, op2);
490 } else {
491 __ cmpl(out, op2);
492 }
493
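// `out` already holds op1, so the condition is inverted: for min, op2 replaces
// out only when out > op2 (and only when out < op2 for max). The conditional
// move keeps this branch-free.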
494 __ cmov(is_min ? Condition::kGreater : Condition::kLess, out, op2, is_long);
495 }
496
497 static void CreateIntIntToIntLocations(ArenaAllocator* allocator, HInvoke* invoke) {
498 LocationSummary* locations =
499 new (allocator) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
500 locations->SetInAt(0, Location::RequiresRegister());
501 locations->SetInAt(1, Location::RequiresRegister());
502 locations->SetOut(Location::SameAsFirstInput());
503 }
504
505 void IntrinsicLocationsBuilderX86_64::VisitMathMinIntInt(HInvoke* invoke) {
506 CreateIntIntToIntLocations(allocator_, invoke);
507 }
508
509 void IntrinsicCodeGeneratorX86_64::VisitMathMinIntInt(HInvoke* invoke) {
510 GenMinMax(invoke->GetLocations(), /* is_min */ true, /* is_long */ false, GetAssembler());
511 }
512
513 void IntrinsicLocationsBuilderX86_64::VisitMathMinLongLong(HInvoke* invoke) {
514 CreateIntIntToIntLocations(allocator_, invoke);
515 }
516
517 void IntrinsicCodeGeneratorX86_64::VisitMathMinLongLong(HInvoke* invoke) {
518 GenMinMax(invoke->GetLocations(), /* is_min */ true, /* is_long */ true, GetAssembler());
519 }
520
521 void IntrinsicLocationsBuilderX86_64::VisitMathMaxIntInt(HInvoke* invoke) {
522 CreateIntIntToIntLocations(allocator_, invoke);
523 }
524
525 void IntrinsicCodeGeneratorX86_64::VisitMathMaxIntInt(HInvoke* invoke) {
526 GenMinMax(invoke->GetLocations(), /* is_min */ false, /* is_long */ false, GetAssembler());
527 }
528
529 void IntrinsicLocationsBuilderX86_64::VisitMathMaxLongLong(HInvoke* invoke) {
530 CreateIntIntToIntLocations(allocator_, invoke);
531 }
532
533 void IntrinsicCodeGeneratorX86_64::VisitMathMaxLongLong(HInvoke* invoke) {
534 GenMinMax(invoke->GetLocations(), /* is_min */ false, /* is_long */ true, GetAssembler());
535 }
536
537 static void CreateFPToFPLocations(ArenaAllocator* allocator, HInvoke* invoke) {
538 LocationSummary* locations =
539 new (allocator) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
540 locations->SetInAt(0, Location::RequiresFpuRegister());
541 locations->SetOut(Location::RequiresFpuRegister());
542 }
543
544 void IntrinsicLocationsBuilderX86_64::VisitMathSqrt(HInvoke* invoke) {
545 CreateFPToFPLocations(allocator_, invoke);
546 }
547
548 void IntrinsicCodeGeneratorX86_64::VisitMathSqrt(HInvoke* invoke) {
549 LocationSummary* locations = invoke->GetLocations();
550 XmmRegister in = locations->InAt(0).AsFpuRegister<XmmRegister>();
551 XmmRegister out = locations->Out().AsFpuRegister<XmmRegister>();
552
553 GetAssembler()->sqrtsd(out, in);
554 }
555
556 static void InvokeOutOfLineIntrinsic(CodeGeneratorX86_64* codegen, HInvoke* invoke) {
557 MoveArguments(invoke, codegen);
558
559 DCHECK(invoke->IsInvokeStaticOrDirect());
560 codegen->GenerateStaticOrDirectCall(
561 invoke->AsInvokeStaticOrDirect(), Location::RegisterLocation(RDI));
562
563 // Copy the result back to the expected output.
564 Location out = invoke->GetLocations()->Out();
565 if (out.IsValid()) {
566 DCHECK(out.IsRegister());
567 codegen->MoveFromReturnRegister(out, invoke->GetType());
568 }
569 }
570
571 static void CreateSSE41FPToFPLocations(ArenaAllocator* allocator,
572 HInvoke* invoke,
573 CodeGeneratorX86_64* codegen) {
574 // Do we have instruction support?
575 if (codegen->GetInstructionSetFeatures().HasSSE4_1()) {
576 CreateFPToFPLocations(allocator, invoke);
577 return;
578 }
579
580 // We have to fall back to a call to the intrinsic.
581 LocationSummary* locations =
582 new (allocator) LocationSummary(invoke, LocationSummary::kCallOnMainOnly);
583 InvokeRuntimeCallingConvention calling_convention;
584 locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetFpuRegisterAt(0)));
585 locations->SetOut(Location::FpuRegisterLocation(XMM0));
586 // Needs to be RDI for the invoke.
587 locations->AddTemp(Location::RegisterLocation(RDI));
588 }
589
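// The `round_mode` immediate is the SSE4.1 rounding control used by roundsd:
// 0 rounds to nearest even (rint), 1 rounds toward negative infinity (floor),
// and 2 rounds toward positive infinity (ceil).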
590 static void GenSSE41FPToFPIntrinsic(CodeGeneratorX86_64* codegen,
591 HInvoke* invoke,
592 X86_64Assembler* assembler,
593 int round_mode) {
594 LocationSummary* locations = invoke->GetLocations();
595 if (locations->WillCall()) {
596 InvokeOutOfLineIntrinsic(codegen, invoke);
597 } else {
598 XmmRegister in = locations->InAt(0).AsFpuRegister<XmmRegister>();
599 XmmRegister out = locations->Out().AsFpuRegister<XmmRegister>();
600 __ roundsd(out, in, Immediate(round_mode));
601 }
602 }
603
604 void IntrinsicLocationsBuilderX86_64::VisitMathCeil(HInvoke* invoke) {
605 CreateSSE41FPToFPLocations(allocator_, invoke, codegen_);
606 }
607
608 void IntrinsicCodeGeneratorX86_64::VisitMathCeil(HInvoke* invoke) {
609 GenSSE41FPToFPIntrinsic(codegen_, invoke, GetAssembler(), 2);
610 }
611
612 void IntrinsicLocationsBuilderX86_64::VisitMathFloor(HInvoke* invoke) {
613 CreateSSE41FPToFPLocations(allocator_, invoke, codegen_);
614 }
615
616 void IntrinsicCodeGeneratorX86_64::VisitMathFloor(HInvoke* invoke) {
617 GenSSE41FPToFPIntrinsic(codegen_, invoke, GetAssembler(), 1);
618 }
619
620 void IntrinsicLocationsBuilderX86_64::VisitMathRint(HInvoke* invoke) {
621 CreateSSE41FPToFPLocations(allocator_, invoke, codegen_);
622 }
623
624 void IntrinsicCodeGeneratorX86_64::VisitMathRint(HInvoke* invoke) {
625 GenSSE41FPToFPIntrinsic(codegen_, invoke, GetAssembler(), 0);
626 }
627
628 static void CreateSSE41FPToIntLocations(ArenaAllocator* allocator,
629 HInvoke* invoke,
630 CodeGeneratorX86_64* codegen) {
631 // Do we have instruction support?
632 if (codegen->GetInstructionSetFeatures().HasSSE4_1()) {
633 LocationSummary* locations =
634 new (allocator) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
635 locations->SetInAt(0, Location::RequiresFpuRegister());
636 locations->SetOut(Location::RequiresRegister());
637 locations->AddTemp(Location::RequiresFpuRegister());
638 locations->AddTemp(Location::RequiresFpuRegister());
639 return;
640 }
641
642 // We have to fall back to a call to the intrinsic.
643 LocationSummary* locations =
644 new (allocator) LocationSummary(invoke, LocationSummary::kCallOnMainOnly);
645 InvokeRuntimeCallingConvention calling_convention;
646 locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetFpuRegisterAt(0)));
647 locations->SetOut(Location::RegisterLocation(RAX));
648 // Needs to be RDI for the invoke.
649 locations->AddTemp(Location::RegisterLocation(RDI));
650 }
651
652 void IntrinsicLocationsBuilderX86_64::VisitMathRoundFloat(HInvoke* invoke) {
653 CreateSSE41FPToIntLocations(allocator_, invoke, codegen_);
654 }
655
656 void IntrinsicCodeGeneratorX86_64::VisitMathRoundFloat(HInvoke* invoke) {
657 LocationSummary* locations = invoke->GetLocations();
658 if (locations->WillCall()) {
659 InvokeOutOfLineIntrinsic(codegen_, invoke);
660 return;
661 }
662
663 XmmRegister in = locations->InAt(0).AsFpuRegister<XmmRegister>();
664 CpuRegister out = locations->Out().AsRegister<CpuRegister>();
665 XmmRegister t1 = locations->GetTemp(0).AsFpuRegister<XmmRegister>();
666 XmmRegister t2 = locations->GetTemp(1).AsFpuRegister<XmmRegister>();
667 NearLabel skip_incr, done;
668 X86_64Assembler* assembler = GetAssembler();
669
670 // Since no direct x86 rounding instruction matches the required semantics,
671 // this intrinsic is implemented as follows:
672 // result = floor(in);
673 // if (in - result >= 0.5f)
674 // result = result + 1.0f;
675 __ movss(t2, in);
676 __ roundss(t1, in, Immediate(1));
677 __ subss(t2, t1);
678 __ comiss(t2, codegen_->LiteralFloatAddress(0.5f));
679 __ j(kBelow, &skip_incr);
680 __ addss(t1, codegen_->LiteralFloatAddress(1.0f));
681 __ Bind(&skip_incr);
682
683 // Final conversion to an integer. Unfortunately this also does not have a
684 // direct x86 instruction, since NaN should map to 0 and large positive
685 // values need to be clipped to the extreme value.
686 codegen_->Load32BitValue(out, kPrimIntMax);
687 __ cvtsi2ss(t2, out);
688 __ comiss(t1, t2);
689 __ j(kAboveEqual, &done); // clipped to max (already in out), does not jump on unordered
690 __ movl(out, Immediate(0)); // does not change flags
691 __ j(kUnordered, &done); // NaN mapped to 0 (just moved in out)
692 __ cvttss2si(out, t1);
693 __ Bind(&done);
694 }
695
696 void IntrinsicLocationsBuilderX86_64::VisitMathRoundDouble(HInvoke* invoke) {
697 CreateSSE41FPToIntLocations(allocator_, invoke, codegen_);
698 }
699
700 void IntrinsicCodeGeneratorX86_64::VisitMathRoundDouble(HInvoke* invoke) {
701 LocationSummary* locations = invoke->GetLocations();
702 if (locations->WillCall()) {
703 InvokeOutOfLineIntrinsic(codegen_, invoke);
704 return;
705 }
706
707 XmmRegister in = locations->InAt(0).AsFpuRegister<XmmRegister>();
708 CpuRegister out = locations->Out().AsRegister<CpuRegister>();
709 XmmRegister t1 = locations->GetTemp(0).AsFpuRegister<XmmRegister>();
710 XmmRegister t2 = locations->GetTemp(1).AsFpuRegister<XmmRegister>();
711 NearLabel skip_incr, done;
712 X86_64Assembler* assembler = GetAssembler();
713
714 // Since no direct x86 rounding instruction matches the required semantics,
715 // this intrinsic is implemented as follows:
716 // result = floor(in);
717 // if (in - result >= 0.5)
718 // result = result + 1.0;
719 __ movsd(t2, in);
720 __ roundsd(t1, in, Immediate(1));
721 __ subsd(t2, t1);
722 __ comisd(t2, codegen_->LiteralDoubleAddress(0.5));
723 __ j(kBelow, &skip_incr);
724 __ addsd(t1, codegen_->LiteralDoubleAddress(1.0));
725 __ Bind(&skip_incr);
726
727 // Final conversion to an integer. Unfortunately this also does not have a
728 // direct x86 instruction, since NaN should map to 0 and large positive
729 // values need to be clipped to the extreme value.
730 codegen_->Load64BitValue(out, kPrimLongMax);
731 __ cvtsi2sd(t2, out, /* is64bit */ true);
732 __ comisd(t1, t2);
733 __ j(kAboveEqual, &done); // clipped to max (already in out), does not jump on unordered
734 __ movl(out, Immediate(0)); // does not change flags, implicit zero extension to 64-bit
735 __ j(kUnordered, &done); // NaN mapped to 0 (just moved in out)
736 __ cvttsd2si(out, t1, /* is64bit */ true);
737 __ Bind(&done);
738 }
739
740 static void CreateFPToFPCallLocations(ArenaAllocator* allocator, HInvoke* invoke) {
741 LocationSummary* locations =
742 new (allocator) LocationSummary(invoke, LocationSummary::kCallOnMainOnly, kIntrinsified);
743 InvokeRuntimeCallingConvention calling_convention;
744 locations->SetInAt(0, Location::FpuRegisterLocation(calling_convention.GetFpuRegisterAt(0)));
745 locations->SetOut(Location::FpuRegisterLocation(XMM0));
746
747 // We have to ensure that the native code doesn't clobber the XMM registers which are
748 // non-volatile for ART, but volatile for Native calls. This will ensure that they are
749 // saved in the prologue and properly restored.
750 for (FloatRegister fp_reg : non_volatile_xmm_regs) {
751 locations->AddTemp(Location::FpuRegisterLocation(fp_reg));
752 }
753 }
754
755 static void GenFPToFPCall(HInvoke* invoke, CodeGeneratorX86_64* codegen,
756 QuickEntrypointEnum entry) {
757 LocationSummary* locations = invoke->GetLocations();
758 DCHECK(locations->WillCall());
759 DCHECK(invoke->IsInvokeStaticOrDirect());
760
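// The locations set up in CreateFP(FP)ToFPCallLocations already match the
// runtime calling convention (inputs in the FP argument registers, result in
// XMM0), so no argument moves are needed before the call.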
761 codegen->InvokeRuntime(entry, invoke, invoke->GetDexPc());
762 }
763
764 void IntrinsicLocationsBuilderX86_64::VisitMathCos(HInvoke* invoke) {
765 CreateFPToFPCallLocations(allocator_, invoke);
766 }
767
768 void IntrinsicCodeGeneratorX86_64::VisitMathCos(HInvoke* invoke) {
769 GenFPToFPCall(invoke, codegen_, kQuickCos);
770 }
771
772 void IntrinsicLocationsBuilderX86_64::VisitMathSin(HInvoke* invoke) {
773 CreateFPToFPCallLocations(allocator_, invoke);
774 }
775
776 void IntrinsicCodeGeneratorX86_64::VisitMathSin(HInvoke* invoke) {
777 GenFPToFPCall(invoke, codegen_, kQuickSin);
778 }
779
780 void IntrinsicLocationsBuilderX86_64::VisitMathAcos(HInvoke* invoke) {
781 CreateFPToFPCallLocations(allocator_, invoke);
782 }
783
784 void IntrinsicCodeGeneratorX86_64::VisitMathAcos(HInvoke* invoke) {
785 GenFPToFPCall(invoke, codegen_, kQuickAcos);
786 }
787
788 void IntrinsicLocationsBuilderX86_64::VisitMathAsin(HInvoke* invoke) {
789 CreateFPToFPCallLocations(allocator_, invoke);
790 }
791
792 void IntrinsicCodeGeneratorX86_64::VisitMathAsin(HInvoke* invoke) {
793 GenFPToFPCall(invoke, codegen_, kQuickAsin);
794 }
795
796 void IntrinsicLocationsBuilderX86_64::VisitMathAtan(HInvoke* invoke) {
797 CreateFPToFPCallLocations(allocator_, invoke);
798 }
799
800 void IntrinsicCodeGeneratorX86_64::VisitMathAtan(HInvoke* invoke) {
801 GenFPToFPCall(invoke, codegen_, kQuickAtan);
802 }
803
804 void IntrinsicLocationsBuilderX86_64::VisitMathCbrt(HInvoke* invoke) {
805 CreateFPToFPCallLocations(allocator_, invoke);
806 }
807
808 void IntrinsicCodeGeneratorX86_64::VisitMathCbrt(HInvoke* invoke) {
809 GenFPToFPCall(invoke, codegen_, kQuickCbrt);
810 }
811
812 void IntrinsicLocationsBuilderX86_64::VisitMathCosh(HInvoke* invoke) {
813 CreateFPToFPCallLocations(allocator_, invoke);
814 }
815
816 void IntrinsicCodeGeneratorX86_64::VisitMathCosh(HInvoke* invoke) {
817 GenFPToFPCall(invoke, codegen_, kQuickCosh);
818 }
819
820 void IntrinsicLocationsBuilderX86_64::VisitMathExp(HInvoke* invoke) {
821 CreateFPToFPCallLocations(allocator_, invoke);
822 }
823
824 void IntrinsicCodeGeneratorX86_64::VisitMathExp(HInvoke* invoke) {
825 GenFPToFPCall(invoke, codegen_, kQuickExp);
826 }
827
828 void IntrinsicLocationsBuilderX86_64::VisitMathExpm1(HInvoke* invoke) {
829 CreateFPToFPCallLocations(allocator_, invoke);
830 }
831
832 void IntrinsicCodeGeneratorX86_64::VisitMathExpm1(HInvoke* invoke) {
833 GenFPToFPCall(invoke, codegen_, kQuickExpm1);
834 }
835
836 void IntrinsicLocationsBuilderX86_64::VisitMathLog(HInvoke* invoke) {
837 CreateFPToFPCallLocations(allocator_, invoke);
838 }
839
840 void IntrinsicCodeGeneratorX86_64::VisitMathLog(HInvoke* invoke) {
841 GenFPToFPCall(invoke, codegen_, kQuickLog);
842 }
843
844 void IntrinsicLocationsBuilderX86_64::VisitMathLog10(HInvoke* invoke) {
845 CreateFPToFPCallLocations(allocator_, invoke);
846 }
847
848 void IntrinsicCodeGeneratorX86_64::VisitMathLog10(HInvoke* invoke) {
849 GenFPToFPCall(invoke, codegen_, kQuickLog10);
850 }
851
852 void IntrinsicLocationsBuilderX86_64::VisitMathSinh(HInvoke* invoke) {
853 CreateFPToFPCallLocations(allocator_, invoke);
854 }
855
856 void IntrinsicCodeGeneratorX86_64::VisitMathSinh(HInvoke* invoke) {
857 GenFPToFPCall(invoke, codegen_, kQuickSinh);
858 }
859
860 void IntrinsicLocationsBuilderX86_64::VisitMathTan(HInvoke* invoke) {
861 CreateFPToFPCallLocations(allocator_, invoke);
862 }
863
864 void IntrinsicCodeGeneratorX86_64::VisitMathTan(HInvoke* invoke) {
865 GenFPToFPCall(invoke, codegen_, kQuickTan);
866 }
867
868 void IntrinsicLocationsBuilderX86_64::VisitMathTanh(HInvoke* invoke) {
869 CreateFPToFPCallLocations(allocator_, invoke);
870 }
871
872 void IntrinsicCodeGeneratorX86_64::VisitMathTanh(HInvoke* invoke) {
873 GenFPToFPCall(invoke, codegen_, kQuickTanh);
874 }
875
876 static void CreateFPFPToFPCallLocations(ArenaAllocator* allocator, HInvoke* invoke) {
877 LocationSummary* locations =
878 new (allocator) LocationSummary(invoke, LocationSummary::kCallOnMainOnly, kIntrinsified);
879 InvokeRuntimeCallingConvention calling_convention;
880 locations->SetInAt(0, Location::FpuRegisterLocation(calling_convention.GetFpuRegisterAt(0)));
881 locations->SetInAt(1, Location::FpuRegisterLocation(calling_convention.GetFpuRegisterAt(1)));
882 locations->SetOut(Location::FpuRegisterLocation(XMM0));
883
884 // We have to ensure that the native code doesn't clobber the XMM registers which are
885 // non-volatile for ART, but volatile for Native calls. This will ensure that they are
886 // saved in the prologue and properly restored.
887 for (FloatRegister fp_reg : non_volatile_xmm_regs) {
888 locations->AddTemp(Location::FpuRegisterLocation(fp_reg));
889 }
890 }
891
892 void IntrinsicLocationsBuilderX86_64::VisitMathAtan2(HInvoke* invoke) {
893 CreateFPFPToFPCallLocations(allocator_, invoke);
894 }
895
896 void IntrinsicCodeGeneratorX86_64::VisitMathAtan2(HInvoke* invoke) {
897 GenFPToFPCall(invoke, codegen_, kQuickAtan2);
898 }
899
900 void IntrinsicLocationsBuilderX86_64::VisitMathPow(HInvoke* invoke) {
901 CreateFPFPToFPCallLocations(allocator_, invoke);
902 }
903
904 void IntrinsicCodeGeneratorX86_64::VisitMathPow(HInvoke* invoke) {
905 GenFPToFPCall(invoke, codegen_, kQuickPow);
906 }
907
908 void IntrinsicLocationsBuilderX86_64::VisitMathHypot(HInvoke* invoke) {
909 CreateFPFPToFPCallLocations(allocator_, invoke);
910 }
911
912 void IntrinsicCodeGeneratorX86_64::VisitMathHypot(HInvoke* invoke) {
913 GenFPToFPCall(invoke, codegen_, kQuickHypot);
914 }
915
916 void IntrinsicLocationsBuilderX86_64::VisitMathNextAfter(HInvoke* invoke) {
917 CreateFPFPToFPCallLocations(allocator_, invoke);
918 }
919
920 void IntrinsicCodeGeneratorX86_64::VisitMathNextAfter(HInvoke* invoke) {
921 GenFPToFPCall(invoke, codegen_, kQuickNextAfter);
922 }
923
924 void IntrinsicLocationsBuilderX86_64::VisitSystemArrayCopyChar(HInvoke* invoke) {
925 // Check to see if we have known failures that will cause us to have to bail out
926 // to the runtime, and just generate the runtime call directly.
927 HIntConstant* src_pos = invoke->InputAt(1)->AsIntConstant();
928 HIntConstant* dest_pos = invoke->InputAt(3)->AsIntConstant();
929
930 // The positions must be non-negative.
931 if ((src_pos != nullptr && src_pos->GetValue() < 0) ||
932 (dest_pos != nullptr && dest_pos->GetValue() < 0)) {
933 // We will have to fail anyway.
934 return;
935 }
936
937 // The length must be >= 0.
938 HIntConstant* length = invoke->InputAt(4)->AsIntConstant();
939 if (length != nullptr) {
940 int32_t len = length->GetValue();
941 if (len < 0) {
942 // Just call as normal.
943 return;
944 }
945 }
946
947 LocationSummary* locations =
948 new (allocator_) LocationSummary(invoke, LocationSummary::kCallOnSlowPath, kIntrinsified);
949 // arraycopy(Object src, int src_pos, Object dest, int dest_pos, int length).
950 locations->SetInAt(0, Location::RequiresRegister());
951 locations->SetInAt(1, Location::RegisterOrConstant(invoke->InputAt(1)));
952 locations->SetInAt(2, Location::RequiresRegister());
953 locations->SetInAt(3, Location::RegisterOrConstant(invoke->InputAt(3)));
954 locations->SetInAt(4, Location::RegisterOrConstant(invoke->InputAt(4)));
955
956 // And we need some temporaries. We will use REP MOVSW, so we need fixed registers.
957 locations->AddTemp(Location::RegisterLocation(RSI));
958 locations->AddTemp(Location::RegisterLocation(RDI));
959 locations->AddTemp(Location::RegisterLocation(RCX));
960 }
961
962 static void CheckPosition(X86_64Assembler* assembler,
963 Location pos,
964 CpuRegister input,
965 Location length,
966 SlowPathCode* slow_path,
967 CpuRegister temp,
968 bool length_is_input_length = false) {
969 // Where is the length in the Array?
970 const uint32_t length_offset = mirror::Array::LengthOffset().Uint32Value();
971
972 if (pos.IsConstant()) {
973 int32_t pos_const = pos.GetConstant()->AsIntConstant()->GetValue();
974 if (pos_const == 0) {
975 if (!length_is_input_length) {
976 // Check that length(input) >= length.
977 if (length.IsConstant()) {
978 __ cmpl(Address(input, length_offset),
979 Immediate(length.GetConstant()->AsIntConstant()->GetValue()));
980 } else {
981 __ cmpl(Address(input, length_offset), length.AsRegister<CpuRegister>());
982 }
983 __ j(kLess, slow_path->GetEntryLabel());
984 }
985 } else {
986 // Check that length(input) >= pos.
987 __ movl(temp, Address(input, length_offset));
988 __ subl(temp, Immediate(pos_const));
989 __ j(kLess, slow_path->GetEntryLabel());
990
991 // Check that (length(input) - pos) >= length.
992 if (length.IsConstant()) {
993 __ cmpl(temp, Immediate(length.GetConstant()->AsIntConstant()->GetValue()));
994 } else {
995 __ cmpl(temp, length.AsRegister<CpuRegister>());
996 }
997 __ j(kLess, slow_path->GetEntryLabel());
998 }
999 } else if (length_is_input_length) {
1000 // The only way the copy can succeed is if pos is zero.
1001 CpuRegister pos_reg = pos.AsRegister<CpuRegister>();
1002 __ testl(pos_reg, pos_reg);
1003 __ j(kNotEqual, slow_path->GetEntryLabel());
1004 } else {
1005 // Check that pos >= 0.
1006 CpuRegister pos_reg = pos.AsRegister<CpuRegister>();
1007 __ testl(pos_reg, pos_reg);
1008 __ j(kLess, slow_path->GetEntryLabel());
1009
1010 // Check that pos <= length(input).
1011 __ cmpl(Address(input, length_offset), pos_reg);
1012 __ j(kLess, slow_path->GetEntryLabel());
1013
1014 // Check that (length(input) - pos) >= length.
1015 __ movl(temp, Address(input, length_offset));
1016 __ subl(temp, pos_reg);
1017 if (length.IsConstant()) {
1018 __ cmpl(temp, Immediate(length.GetConstant()->AsIntConstant()->GetValue()));
1019 } else {
1020 __ cmpl(temp, length.AsRegister<CpuRegister>());
1021 }
1022 __ j(kLess, slow_path->GetEntryLabel());
1023 }
1024 }
1025
1026 void IntrinsicCodeGeneratorX86_64::VisitSystemArrayCopyChar(HInvoke* invoke) {
1027 X86_64Assembler* assembler = GetAssembler();
1028 LocationSummary* locations = invoke->GetLocations();
1029
1030 CpuRegister src = locations->InAt(0).AsRegister<CpuRegister>();
1031 Location src_pos = locations->InAt(1);
1032 CpuRegister dest = locations->InAt(2).AsRegister<CpuRegister>();
1033 Location dest_pos = locations->InAt(3);
1034 Location length = locations->InAt(4);
1035
1036 // Temporaries that we need for MOVSW.
1037 CpuRegister src_base = locations->GetTemp(0).AsRegister<CpuRegister>();
1038 DCHECK_EQ(src_base.AsRegister(), RSI);
1039 CpuRegister dest_base = locations->GetTemp(1).AsRegister<CpuRegister>();
1040 DCHECK_EQ(dest_base.AsRegister(), RDI);
1041 CpuRegister count = locations->GetTemp(2).AsRegister<CpuRegister>();
1042 DCHECK_EQ(count.AsRegister(), RCX);
1043
1044 SlowPathCode* slow_path = new (codegen_->GetScopedAllocator()) IntrinsicSlowPathX86_64(invoke);
1045 codegen_->AddSlowPath(slow_path);
1046
1047 // Bail out if the source and destination are the same.
1048 __ cmpl(src, dest);
1049 __ j(kEqual, slow_path->GetEntryLabel());
1050
1051 // Bail out if the source is null.
1052 __ testl(src, src);
1053 __ j(kEqual, slow_path->GetEntryLabel());
1054
1055 // Bail out if the destination is null.
1056 __ testl(dest, dest);
1057 __ j(kEqual, slow_path->GetEntryLabel());
1058
1059 // If the length is negative, bail out.
1060 // We have already checked in the LocationsBuilder for the constant case.
1061 if (!length.IsConstant()) {
1062 __ testl(length.AsRegister<CpuRegister>(), length.AsRegister<CpuRegister>());
1063 __ j(kLess, slow_path->GetEntryLabel());
1064 }
1065
1066 // Validity checks: source. Use src_base as a temporary register.
1067 CheckPosition(assembler, src_pos, src, length, slow_path, src_base);
1068
1069 // Validity checks: dest. Use src_base as a temporary register.
1070 CheckPosition(assembler, dest_pos, dest, length, slow_path, src_base);
1071
1072 // We need the count in RCX.
1073 if (length.IsConstant()) {
1074 __ movl(count, Immediate(length.GetConstant()->AsIntConstant()->GetValue()));
1075 } else {
1076 __ movl(count, length.AsRegister<CpuRegister>());
1077 }
1078
1079 // Okay, everything checks out. Finally time to do the copy.
1080 // Check assumption that sizeof(Char) is 2 (used in scaling below).
1081 const size_t char_size = DataType::Size(DataType::Type::kUint16);
1082 DCHECK_EQ(char_size, 2u);
1083
1084 const uint32_t data_offset = mirror::Array::DataOffset(char_size).Uint32Value();
1085
1086 if (src_pos.IsConstant()) {
1087 int32_t src_pos_const = src_pos.GetConstant()->AsIntConstant()->GetValue();
1088 __ leal(src_base, Address(src, char_size * src_pos_const + data_offset));
1089 } else {
1090 __ leal(src_base, Address(src, src_pos.AsRegister<CpuRegister>(),
1091 ScaleFactor::TIMES_2, data_offset));
1092 }
1093 if (dest_pos.IsConstant()) {
1094 int32_t dest_pos_const = dest_pos.GetConstant()->AsIntConstant()->GetValue();
1095 __ leal(dest_base, Address(dest, char_size * dest_pos_const + data_offset));
1096 } else {
1097 __ leal(dest_base, Address(dest, dest_pos.AsRegister<CpuRegister>(),
1098 ScaleFactor::TIMES_2, data_offset));
1099 }
1100
1101 // Do the move.
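// REP MOVSW copies RCX 16-bit units from [RSI] to [RDI], which is why the
// three temporaries above were pinned to RSI, RDI, and RCX.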
1102 __ rep_movsw();
1103
1104 __ Bind(slow_path->GetExitLabel());
1105 }
1106
1107
1108 void IntrinsicLocationsBuilderX86_64::VisitSystemArrayCopy(HInvoke* invoke) {
1109 // The only read barrier implementation supporting the
1110 // SystemArrayCopy intrinsic is the Baker-style read barriers.
1111 if (kEmitCompilerReadBarrier && !kUseBakerReadBarrier) {
1112 return;
1113 }
1114
1115 CodeGenerator::CreateSystemArrayCopyLocationSummary(invoke);
1116 }
1117
1118 // Compute base source address, base destination address, and end
1119 // source address for the System.arraycopy intrinsic in `src_base`,
1120 // `dst_base` and `src_end` respectively.
1121 static void GenSystemArrayCopyAddresses(X86_64Assembler* assembler,
1122 DataType::Type type,
1123 const CpuRegister& src,
1124 const Location& src_pos,
1125 const CpuRegister& dst,
1126 const Location& dst_pos,
1127 const Location& copy_length,
1128 const CpuRegister& src_base,
1129 const CpuRegister& dst_base,
1130 const CpuRegister& src_end) {
1131 // This routine is only used by the SystemArrayCopy intrinsic.
1132 DCHECK_EQ(type, DataType::Type::kReference);
1133 const int32_t element_size = DataType::Size(type);
1134 const ScaleFactor scale_factor = static_cast<ScaleFactor>(DataType::SizeShift(type));
1135 const uint32_t data_offset = mirror::Array::DataOffset(element_size).Uint32Value();
1136
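// src_base/dst_base are computed as array + data_offset + pos * element_size,
// and src_end as src_base + length * element_size; leal does the arithmetic
// without clobbering the flags.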
1137 if (src_pos.IsConstant()) {
1138 int32_t constant = src_pos.GetConstant()->AsIntConstant()->GetValue();
1139 __ leal(src_base, Address(src, element_size * constant + data_offset));
1140 } else {
1141 __ leal(src_base, Address(src, src_pos.AsRegister<CpuRegister>(), scale_factor, data_offset));
1142 }
1143
1144 if (dst_pos.IsConstant()) {
1145 int32_t constant = dst_pos.GetConstant()->AsIntConstant()->GetValue();
1146 __ leal(dst_base, Address(dst, element_size * constant + data_offset));
1147 } else {
1148 __ leal(dst_base, Address(dst, dst_pos.AsRegister<CpuRegister>(), scale_factor, data_offset));
1149 }
1150
1151 if (copy_length.IsConstant()) {
1152 int32_t constant = copy_length.GetConstant()->AsIntConstant()->GetValue();
1153 __ leal(src_end, Address(src_base, element_size * constant));
1154 } else {
1155 __ leal(src_end, Address(src_base, copy_length.AsRegister<CpuRegister>(), scale_factor, 0));
1156 }
1157 }
1158
1159 void IntrinsicCodeGeneratorX86_64::VisitSystemArrayCopy(HInvoke* invoke) {
1160 // The only read barrier implementation supporting the
1161 // SystemArrayCopy intrinsic is the Baker-style read barriers.
1162 DCHECK(!kEmitCompilerReadBarrier || kUseBakerReadBarrier);
1163
1164 X86_64Assembler* assembler = GetAssembler();
1165 LocationSummary* locations = invoke->GetLocations();
1166
1167 uint32_t class_offset = mirror::Object::ClassOffset().Int32Value();
1168 uint32_t super_offset = mirror::Class::SuperClassOffset().Int32Value();
1169 uint32_t component_offset = mirror::Class::ComponentTypeOffset().Int32Value();
1170 uint32_t primitive_offset = mirror::Class::PrimitiveTypeOffset().Int32Value();
1171 uint32_t monitor_offset = mirror::Object::MonitorOffset().Int32Value();
1172
1173 CpuRegister src = locations->InAt(0).AsRegister<CpuRegister>();
1174 Location src_pos = locations->InAt(1);
1175 CpuRegister dest = locations->InAt(2).AsRegister<CpuRegister>();
1176 Location dest_pos = locations->InAt(3);
1177 Location length = locations->InAt(4);
1178 Location temp1_loc = locations->GetTemp(0);
1179 CpuRegister temp1 = temp1_loc.AsRegister<CpuRegister>();
1180 Location temp2_loc = locations->GetTemp(1);
1181 CpuRegister temp2 = temp2_loc.AsRegister<CpuRegister>();
1182 Location temp3_loc = locations->GetTemp(2);
1183 CpuRegister temp3 = temp3_loc.AsRegister<CpuRegister>();
1184 Location TMP_loc = Location::RegisterLocation(TMP);
1185
1186 SlowPathCode* intrinsic_slow_path =
1187 new (codegen_->GetScopedAllocator()) IntrinsicSlowPathX86_64(invoke);
1188 codegen_->AddSlowPath(intrinsic_slow_path);
1189
1190 NearLabel conditions_on_positions_validated;
1191 SystemArrayCopyOptimizations optimizations(invoke);
1192
1193 // If source and destination are the same, we go to slow path if we need to do
1194 // forward copying.
1195 if (src_pos.IsConstant()) {
1196 int32_t src_pos_constant = src_pos.GetConstant()->AsIntConstant()->GetValue();
1197 if (dest_pos.IsConstant()) {
1198 int32_t dest_pos_constant = dest_pos.GetConstant()->AsIntConstant()->GetValue();
1199 if (optimizations.GetDestinationIsSource()) {
1200 // Checked when building locations.
1201 DCHECK_GE(src_pos_constant, dest_pos_constant);
1202 } else if (src_pos_constant < dest_pos_constant) {
1203 __ cmpl(src, dest);
1204 __ j(kEqual, intrinsic_slow_path->GetEntryLabel());
1205 }
1206 } else {
1207 if (!optimizations.GetDestinationIsSource()) {
1208 __ cmpl(src, dest);
1209 __ j(kNotEqual, &conditions_on_positions_validated);
1210 }
1211 __ cmpl(dest_pos.AsRegister<CpuRegister>(), Immediate(src_pos_constant));
1212 __ j(kGreater, intrinsic_slow_path->GetEntryLabel());
1213 }
1214 } else {
1215 if (!optimizations.GetDestinationIsSource()) {
1216 __ cmpl(src, dest);
1217 __ j(kNotEqual, &conditions_on_positions_validated);
1218 }
1219 if (dest_pos.IsConstant()) {
1220 int32_t dest_pos_constant = dest_pos.GetConstant()->AsIntConstant()->GetValue();
1221 __ cmpl(src_pos.AsRegister<CpuRegister>(), Immediate(dest_pos_constant));
1222 __ j(kLess, intrinsic_slow_path->GetEntryLabel());
1223 } else {
1224 __ cmpl(src_pos.AsRegister<CpuRegister>(), dest_pos.AsRegister<CpuRegister>());
1225 __ j(kLess, intrinsic_slow_path->GetEntryLabel());
1226 }
1227 }
1228
1229 __ Bind(&conditions_on_positions_validated);
1230
1231 if (!optimizations.GetSourceIsNotNull()) {
1232 // Bail out if the source is null.
1233 __ testl(src, src);
1234 __ j(kEqual, intrinsic_slow_path->GetEntryLabel());
1235 }
1236
1237 if (!optimizations.GetDestinationIsNotNull() && !optimizations.GetDestinationIsSource()) {
1238 // Bail out if the destination is null.
1239 __ testl(dest, dest);
1240 __ j(kEqual, intrinsic_slow_path->GetEntryLabel());
1241 }
1242
1243 // If the length is negative, bail out.
1244 // We have already checked in the LocationsBuilder for the constant case.
1245 if (!length.IsConstant() &&
1246 !optimizations.GetCountIsSourceLength() &&
1247 !optimizations.GetCountIsDestinationLength()) {
1248 __ testl(length.AsRegister<CpuRegister>(), length.AsRegister<CpuRegister>());
1249 __ j(kLess, intrinsic_slow_path->GetEntryLabel());
1250 }
1251
1252 // Validity checks: source.
1253 CheckPosition(assembler,
1254 src_pos,
1255 src,
1256 length,
1257 intrinsic_slow_path,
1258 temp1,
1259 optimizations.GetCountIsSourceLength());
1260
1261 // Validity checks: dest.
1262 CheckPosition(assembler,
1263 dest_pos,
1264 dest,
1265 length,
1266 intrinsic_slow_path,
1267 temp1,
1268 optimizations.GetCountIsDestinationLength());
1269
1270 if (!optimizations.GetDoesNotNeedTypeCheck()) {
1271 // Check whether all elements of the source array are assignable to the component
1272 // type of the destination array. We do two checks: the classes are the same,
1273 // or the destination is Object[]. If none of these checks succeed, we go to the
1274 // slow path.
1275
1276 bool did_unpoison = false;
1277 if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
1278 // /* HeapReference<Class> */ temp1 = dest->klass_
1279 codegen_->GenerateFieldLoadWithBakerReadBarrier(
1280 invoke, temp1_loc, dest, class_offset, /* needs_null_check */ false);
1281 // Register `temp1` is not trashed by the read barrier emitted
1282 // by GenerateFieldLoadWithBakerReadBarrier below, as that
1283 // method produces a call to a ReadBarrierMarkRegX entry point,
1284 // which saves all potentially live registers, including
1285 // temporaries such as `temp1`.
1286 // /* HeapReference<Class> */ temp2 = src->klass_
1287 codegen_->GenerateFieldLoadWithBakerReadBarrier(
1288 invoke, temp2_loc, src, class_offset, /* needs_null_check */ false);
1289 // If heap poisoning is enabled, `temp1` and `temp2` have been
1290 // unpoisoned by the previous calls to
1291 // GenerateFieldLoadWithBakerReadBarrier.
1292 } else {
1293 // /* HeapReference<Class> */ temp1 = dest->klass_
1294 __ movl(temp1, Address(dest, class_offset));
1295 // /* HeapReference<Class> */ temp2 = src->klass_
1296 __ movl(temp2, Address(src, class_offset));
1297 if (!optimizations.GetDestinationIsNonPrimitiveArray() ||
1298 !optimizations.GetSourceIsNonPrimitiveArray()) {
1299 // One or two of the references need to be unpoisoned. Unpoison them
1300 // both to make the identity check valid.
1301 __ MaybeUnpoisonHeapReference(temp1);
1302 __ MaybeUnpoisonHeapReference(temp2);
1303 did_unpoison = true;
1304 }
1305 }
1306
1307 if (!optimizations.GetDestinationIsNonPrimitiveArray()) {
1308 // Bail out if the destination is not a non primitive array.
1309 if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
1310 // /* HeapReference<Class> */ TMP = temp1->component_type_
1311 codegen_->GenerateFieldLoadWithBakerReadBarrier(
1312 invoke, TMP_loc, temp1, component_offset, /* needs_null_check */ false);
1313 __ testl(CpuRegister(TMP), CpuRegister(TMP));
1314 __ j(kEqual, intrinsic_slow_path->GetEntryLabel());
1315 // If heap poisoning is enabled, `TMP` has been unpoisoned by
1316 // the previous call to GenerateFieldLoadWithBakerReadBarrier.
1317 } else {
1318 // /* HeapReference<Class> */ TMP = temp1->component_type_
1319 __ movl(CpuRegister(TMP), Address(temp1, component_offset));
1320 __ testl(CpuRegister(TMP), CpuRegister(TMP));
1321 __ j(kEqual, intrinsic_slow_path->GetEntryLabel());
1322 __ MaybeUnpoisonHeapReference(CpuRegister(TMP));
1323 }
1324 __ cmpw(Address(CpuRegister(TMP), primitive_offset), Immediate(Primitive::kPrimNot));
1325 __ j(kNotEqual, intrinsic_slow_path->GetEntryLabel());
1326 }
1327
1328 if (!optimizations.GetSourceIsNonPrimitiveArray()) {
1329 // Bail out if the source is not a non-primitive array.
1330 if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
1331 // For the same reason given earlier, `temp1` is not trashed by the
1332 // read barrier emitted by GenerateFieldLoadWithBakerReadBarrier below.
1333 // /* HeapReference<Class> */ TMP = temp2->component_type_
1334 codegen_->GenerateFieldLoadWithBakerReadBarrier(
1335 invoke, TMP_loc, temp2, component_offset, /* needs_null_check */ false);
1336 __ testl(CpuRegister(TMP), CpuRegister(TMP));
1337 __ j(kEqual, intrinsic_slow_path->GetEntryLabel());
1338 // If heap poisoning is enabled, `TMP` has been unpoisoned by
1339 // the previous call to GenerateFieldLoadWithBakerReadBarrier.
1340 } else {
1341 // /* HeapReference<Class> */ TMP = temp2->component_type_
1342 __ movl(CpuRegister(TMP), Address(temp2, component_offset));
1343 __ testl(CpuRegister(TMP), CpuRegister(TMP));
1344 __ j(kEqual, intrinsic_slow_path->GetEntryLabel());
1345 __ MaybeUnpoisonHeapReference(CpuRegister(TMP));
1346 }
1347 __ cmpw(Address(CpuRegister(TMP), primitive_offset), Immediate(Primitive::kPrimNot));
1348 __ j(kNotEqual, intrinsic_slow_path->GetEntryLabel());
1349 }
1350
1351 __ cmpl(temp1, temp2);
1352
1353 if (optimizations.GetDestinationIsTypedObjectArray()) {
1354 NearLabel do_copy;
1355 __ j(kEqual, &do_copy);
1356 if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
1357 // /* HeapReference<Class> */ temp1 = temp1->component_type_
1358 codegen_->GenerateFieldLoadWithBakerReadBarrier(
1359 invoke, temp1_loc, temp1, component_offset, /* needs_null_check */ false);
1360 // We do not need to emit a read barrier for the following
1361 // heap reference load, as `temp1` is only used in a
1362 // comparison with null below, and this reference is not
1363 // kept afterwards.
1364 __ cmpl(Address(temp1, super_offset), Immediate(0));
1365 } else {
1366 if (!did_unpoison) {
1367 __ MaybeUnpoisonHeapReference(temp1);
1368 }
1369 // /* HeapReference<Class> */ temp1 = temp1->component_type_
1370 __ movl(temp1, Address(temp1, component_offset));
1371 __ MaybeUnpoisonHeapReference(temp1);
1372 // No need to unpoison the following heap reference load, as
1373 // we're comparing against null.
1374 __ cmpl(Address(temp1, super_offset), Immediate(0));
1375 }
1376 __ j(kNotEqual, intrinsic_slow_path->GetEntryLabel());
1377 __ Bind(&do_copy);
1378 } else {
1379 __ j(kNotEqual, intrinsic_slow_path->GetEntryLabel());
1380 }
1381 } else if (!optimizations.GetSourceIsNonPrimitiveArray()) {
1382 DCHECK(optimizations.GetDestinationIsNonPrimitiveArray());
1383 // Bail out if the source is not a non-primitive array.
1384 if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
1385 // /* HeapReference<Class> */ temp1 = src->klass_
1386 codegen_->GenerateFieldLoadWithBakerReadBarrier(
1387 invoke, temp1_loc, src, class_offset, /* needs_null_check */ false);
1388 // /* HeapReference<Class> */ TMP = temp1->component_type_
1389 codegen_->GenerateFieldLoadWithBakerReadBarrier(
1390 invoke, TMP_loc, temp1, component_offset, /* needs_null_check */ false);
1391 __ testl(CpuRegister(TMP), CpuRegister(TMP));
1392 __ j(kEqual, intrinsic_slow_path->GetEntryLabel());
1393 } else {
1394 // /* HeapReference<Class> */ temp1 = src->klass_
1395 __ movl(temp1, Address(src, class_offset));
1396 __ MaybeUnpoisonHeapReference(temp1);
1397 // /* HeapReference<Class> */ TMP = temp1->component_type_
1398 __ movl(CpuRegister(TMP), Address(temp1, component_offset));
1399 // No need to unpoison `TMP` now, as we're comparing against null.
1400 __ testl(CpuRegister(TMP), CpuRegister(TMP));
1401 __ j(kEqual, intrinsic_slow_path->GetEntryLabel());
1402 __ MaybeUnpoisonHeapReference(CpuRegister(TMP));
1403 }
1404 __ cmpw(Address(CpuRegister(TMP), primitive_offset), Immediate(Primitive::kPrimNot));
1405 __ j(kNotEqual, intrinsic_slow_path->GetEntryLabel());
1406 }
1407
1408 const DataType::Type type = DataType::Type::kReference;
1409 const int32_t element_size = DataType::Size(type);
1410
1411 // Compute base source address, base destination address, and end
1412 // source address in `temp1`, `temp2` and `temp3` respectively.
1413 GenSystemArrayCopyAddresses(
1414 GetAssembler(), type, src, src_pos, dest, dest_pos, length, temp1, temp2, temp3);
1415
1416 if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
1417 // SystemArrayCopy implementation for Baker read barriers (see
1418 // also CodeGeneratorX86_64::GenerateReferenceLoadWithBakerReadBarrier):
1419 //
1420 // if (src_ptr != end_ptr) {
1421 // uint32_t rb_state = LockWord(src->monitor_).ReadBarrierState();
1422 // lfence; // Load fence or artificial data dependency to prevent load-load reordering
1423 // bool is_gray = (rb_state == ReadBarrier::GrayState());
1424 // if (is_gray) {
1425 // // Slow-path copy.
1426 // do {
1427 // *dest_ptr++ = MaybePoison(ReadBarrier::Mark(MaybeUnpoison(*src_ptr++)));
1428 // } while (src_ptr != end_ptr)
1429 // } else {
1430 // // Fast-path copy.
1431 // do {
1432 // *dest_ptr++ = *src_ptr++;
1433 // } while (src_ptr != end_ptr)
1434 // }
1435 // }
1436
1437 NearLabel loop, done;
1438
1439 // Don't enter copy loop if `length == 0`.
1440 __ cmpl(temp1, temp3);
1441 __ j(kEqual, &done);
1442
1443 // Given the numeric representation, it's enough to check the low bit of the rb_state.
1444 static_assert(ReadBarrier::WhiteState() == 0, "Expecting white to have value 0");
1445 static_assert(ReadBarrier::GrayState() == 1, "Expecting gray to have value 1");
1446 constexpr uint32_t gray_byte_position = LockWord::kReadBarrierStateShift / kBitsPerByte;
1447 constexpr uint32_t gray_bit_position = LockWord::kReadBarrierStateShift % kBitsPerByte;
1448 constexpr int32_t test_value = static_cast<int8_t>(1 << gray_bit_position);
1449
1450 // if (rb_state == ReadBarrier::GrayState())
1451 // goto slow_path;
1452 // At this point, just do the "if" and make sure that flags are preserved until the branch.
1453 __ testb(Address(src, monitor_offset + gray_byte_position), Immediate(test_value));
1454
1455 // Load fence to prevent load-load reordering.
1456 // Note that this is a no-op, thanks to the x86-64 memory model.
1457 codegen_->GenerateMemoryBarrier(MemBarrierKind::kLoadAny);
1458
1459 // Slow path used to copy array when `src` is gray.
1460 SlowPathCode* read_barrier_slow_path =
1461 new (codegen_->GetScopedAllocator()) ReadBarrierSystemArrayCopySlowPathX86_64(invoke);
1462 codegen_->AddSlowPath(read_barrier_slow_path);
1463
1464 // We have done the "if" of the gray bit check above, now branch based on the flags.
1465 __ j(kNotZero, read_barrier_slow_path->GetEntryLabel());
1466
1467 // Fast-path copy.
1468 // Iterate over the arrays and do a raw copy of the objects. We don't need to
1469 // poison/unpoison.
1470 __ Bind(&loop);
1471 __ movl(CpuRegister(TMP), Address(temp1, 0));
1472 __ movl(Address(temp2, 0), CpuRegister(TMP));
1473 __ addl(temp1, Immediate(element_size));
1474 __ addl(temp2, Immediate(element_size));
1475 __ cmpl(temp1, temp3);
1476 __ j(kNotEqual, &loop);
1477
1478 __ Bind(read_barrier_slow_path->GetExitLabel());
1479 __ Bind(&done);
1480 } else {
1481 // Non read barrier code.
1482
1483 // Iterate over the arrays and do a raw copy of the objects. We don't need to
1484 // poison/unpoison.
1485 NearLabel loop, done;
1486 __ cmpl(temp1, temp3);
1487 __ j(kEqual, &done);
1488 __ Bind(&loop);
1489 __ movl(CpuRegister(TMP), Address(temp1, 0));
1490 __ movl(Address(temp2, 0), CpuRegister(TMP));
1491 __ addl(temp1, Immediate(element_size));
1492 __ addl(temp2, Immediate(element_size));
1493 __ cmpl(temp1, temp3);
1494 __ j(kNotEqual, &loop);
1495 __ Bind(&done);
1496 }
1497
1498 // We only need one card marking on the destination array.
1499 codegen_->MarkGCCard(temp1, temp2, dest, CpuRegister(kNoRegister), /* value_can_be_null */ false);
1500
1501 __ Bind(intrinsic_slow_path->GetExitLabel());
1502 }
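
// Illustrative sketch (not emitted or compiled; the helper name is hypothetical) of what
// the fast path above amounts to once the null, range and type checks pass and the source
// object is not gray: a raw 32-bit copy of the heap reference slots, followed by a single
// card mark for the destination array.
//
//   static void FastPathCopy(uint32_t* src_ptr, uint32_t* dst_ptr, uint32_t* end_ptr) {
//     while (src_ptr != end_ptr) {
//       *dst_ptr++ = *src_ptr++;  // Bitwise reference copy; no (un)poisoning needed.
//     }
//   }
//
// When the source is gray, the read barrier slow path runs the same loop with each
// reference routed through ReadBarrier::Mark() instead (see the pseudocode above).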
1503
VisitStringCompareTo(HInvoke * invoke)1504 void IntrinsicLocationsBuilderX86_64::VisitStringCompareTo(HInvoke* invoke) {
1505 LocationSummary* locations = new (allocator_) LocationSummary(
1506 invoke, LocationSummary::kCallOnMainAndSlowPath, kIntrinsified);
1507 InvokeRuntimeCallingConvention calling_convention;
1508 locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetRegisterAt(0)));
1509 locations->SetInAt(1, Location::RegisterLocation(calling_convention.GetRegisterAt(1)));
1510 locations->SetOut(Location::RegisterLocation(RAX));
1511 }
1512
VisitStringCompareTo(HInvoke * invoke)1513 void IntrinsicCodeGeneratorX86_64::VisitStringCompareTo(HInvoke* invoke) {
1514 X86_64Assembler* assembler = GetAssembler();
1515 LocationSummary* locations = invoke->GetLocations();
1516
1517 // Note that the null check must have been done earlier.
1518 DCHECK(!invoke->CanDoImplicitNullCheckOn(invoke->InputAt(0)));
1519
1520 CpuRegister argument = locations->InAt(1).AsRegister<CpuRegister>();
1521 __ testl(argument, argument);
1522 SlowPathCode* slow_path = new (codegen_->GetScopedAllocator()) IntrinsicSlowPathX86_64(invoke);
1523 codegen_->AddSlowPath(slow_path);
1524 __ j(kEqual, slow_path->GetEntryLabel());
1525
1526 codegen_->InvokeRuntime(kQuickStringCompareTo, invoke, invoke->GetDexPc(), slow_path);
1527 __ Bind(slow_path->GetExitLabel());
1528 }
1529
VisitStringEquals(HInvoke * invoke)1530 void IntrinsicLocationsBuilderX86_64::VisitStringEquals(HInvoke* invoke) {
1531 if (kEmitCompilerReadBarrier &&
1532 !StringEqualsOptimizations(invoke).GetArgumentIsString() &&
1533 !StringEqualsOptimizations(invoke).GetNoReadBarrierForStringClass()) {
1534 // No support for this odd case (String class is moveable, not in the boot image).
1535 return;
1536 }
1537
1538 LocationSummary* locations =
1539 new (allocator_) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
1540 locations->SetInAt(0, Location::RequiresRegister());
1541 locations->SetInAt(1, Location::RequiresRegister());
1542
1543 // Request temporary registers, RCX and RDI needed for repe_cmpsq instruction.
1544 locations->AddTemp(Location::RegisterLocation(RCX));
1545 locations->AddTemp(Location::RegisterLocation(RDI));
1546
1547 // Set output, RSI needed for repe_cmpsq instruction anyways.
1548 locations->SetOut(Location::RegisterLocation(RSI), Location::kOutputOverlap);
1549 }
1550
VisitStringEquals(HInvoke * invoke)1551 void IntrinsicCodeGeneratorX86_64::VisitStringEquals(HInvoke* invoke) {
1552 X86_64Assembler* assembler = GetAssembler();
1553 LocationSummary* locations = invoke->GetLocations();
1554
1555 CpuRegister str = locations->InAt(0).AsRegister<CpuRegister>();
1556 CpuRegister arg = locations->InAt(1).AsRegister<CpuRegister>();
1557 CpuRegister rcx = locations->GetTemp(0).AsRegister<CpuRegister>();
1558 CpuRegister rdi = locations->GetTemp(1).AsRegister<CpuRegister>();
1559 CpuRegister rsi = locations->Out().AsRegister<CpuRegister>();
1560
1561 NearLabel end, return_true, return_false;
1562
1563 // Get offsets of count, value, and class fields within a string object.
1564 const uint32_t count_offset = mirror::String::CountOffset().Uint32Value();
1565 const uint32_t value_offset = mirror::String::ValueOffset().Uint32Value();
1566 const uint32_t class_offset = mirror::Object::ClassOffset().Uint32Value();
1567
1568 // Note that the null check must have been done earlier.
1569 DCHECK(!invoke->CanDoImplicitNullCheckOn(invoke->InputAt(0)));
1570
1571 StringEqualsOptimizations optimizations(invoke);
1572 if (!optimizations.GetArgumentNotNull()) {
1573 // Check if input is null, return false if it is.
1574 __ testl(arg, arg);
1575 __ j(kEqual, &return_false);
1576 }
1577
1578 if (!optimizations.GetArgumentIsString()) {
1579 // Instanceof check for the argument by comparing class fields.
1580 // All string objects must have the same type since String cannot be subclassed.
1581 // Receiver must be a string object, so its class field is equal to all strings' class fields.
1582 // If the argument is a string object, its class field must be equal to receiver's class field.
1583 __ movl(rcx, Address(str, class_offset));
1584 __ cmpl(rcx, Address(arg, class_offset));
1585 __ j(kNotEqual, &return_false);
1586 }
1587
1588 // Reference equality check, return true if same reference.
1589 __ cmpl(str, arg);
1590 __ j(kEqual, &return_true);
1591
1592 // Load length and compression flag of receiver string.
1593 __ movl(rcx, Address(str, count_offset));
1594 // Check if lengths and compression flags are equal; return false if they're not.
1595 // Two identical strings will always have the same compression style since
1596 // compression style is decided on alloc.
1597 __ cmpl(rcx, Address(arg, count_offset));
1598 __ j(kNotEqual, &return_false);
1599 // Return true if both strings are empty. Even with string compression `count == 0` means empty.
1600 static_assert(static_cast<uint32_t>(mirror::StringCompressionFlag::kCompressed) == 0u,
1601 "Expecting 0=compressed, 1=uncompressed");
1602 __ jrcxz(&return_true);
1603
1604 if (mirror::kUseStringCompression) {
1605 NearLabel string_uncompressed;
1606 // Extract the length and differentiate between both-compressed and both-uncompressed.
1607 // Strings with different compression styles were already rejected above.
1608 __ shrl(rcx, Immediate(1));
1609 __ j(kCarrySet, &string_uncompressed);
1610 // Divide string length by 2, rounding up, and continue as if uncompressed.
1611 // Merge clearing the compression flag with +1 for rounding.
1612 __ addl(rcx, Immediate(1));
1613 __ shrl(rcx, Immediate(1));
1614 __ Bind(&string_uncompressed);
1615 }
1616 // Load starting addresses of string values into RSI/RDI as required for repe_cmpsq instruction.
1617 __ leal(rsi, Address(str, value_offset));
1618 __ leal(rdi, Address(arg, value_offset));
1619
1620 // Divide string length by 4 and adjust for lengths not divisible by 4.
1621 __ addl(rcx, Immediate(3));
1622 __ shrl(rcx, Immediate(2));
1623
1624 // Assertions that must hold in order to compare strings 4 characters (uncompressed)
1625 // or 8 characters (compressed) at a time.
1626 DCHECK_ALIGNED(value_offset, 8);
1627 static_assert(IsAligned<8>(kObjectAlignment), "String is not zero padded");
1628
1629 // Loop to compare strings four characters at a time starting at the beginning of the string.
1630 __ repe_cmpsq();
1631 // If strings are not equal, zero flag will be cleared.
1632 __ j(kNotEqual, &return_false);
1633
1634 // Return true and exit the function.
1635 // If loop does not result in returning false, we return true.
1636 __ Bind(&return_true);
1637 __ movl(rsi, Immediate(1));
1638 __ jmp(&end);
1639
1640 // Return false and exit the function.
1641 __ Bind(&return_false);
1642 __ xorl(rsi, rsi);
1643 __ Bind(&end);
1644 }
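
// Illustrative sketch (variable names are hypothetical) of how the quadword count for
// `repe cmpsq` is derived, assuming the count encoding asserted above: bit 0 is the
// compression flag (0 = compressed), the remaining bits are the character count.
//
//   int32_t count = /* String.count field */;
//   bool compressed = (count & 1) == 0;
//   int32_t length = count >> 1;
//   // Normalize to 16-bit units so both cases compare 8 bytes per cmpsq iteration:
//   int32_t units = compressed ? (length + 1) >> 1 : length;
//   int32_t quadwords = (units + 3) >> 2;  // Value left in RCX before repe_cmpsq().
//
// Comparing whole quadwords is safe because string data is zero padded up to the 8-byte
// object alignment (see the static_assert above).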
1645
CreateStringIndexOfLocations(HInvoke * invoke,ArenaAllocator * allocator,bool start_at_zero)1646 static void CreateStringIndexOfLocations(HInvoke* invoke,
1647 ArenaAllocator* allocator,
1648 bool start_at_zero) {
1649 LocationSummary* locations = new (allocator) LocationSummary(invoke,
1650 LocationSummary::kCallOnSlowPath,
1651 kIntrinsified);
1652 // The data needs to be in RDI for scasw. So request that the string is there, anyways.
1653 locations->SetInAt(0, Location::RegisterLocation(RDI));
1654 // If we look for a constant char, we'll still have to copy it into RAX. So just request the
1655 // allocator to do that, anyways. We can still do the constant check by checking the parameter
1656 // of the instruction explicitly.
1657 // Note: This works as we don't clobber RAX anywhere.
1658 locations->SetInAt(1, Location::RegisterLocation(RAX));
1659 if (!start_at_zero) {
1660 locations->SetInAt(2, Location::RequiresRegister()); // The starting index.
1661 }
1662 // As we clobber RDI during execution anyways, also use it as the output.
1663 locations->SetOut(Location::SameAsFirstInput());
1664
1665 // repne scasw uses RCX as the counter.
1666 locations->AddTemp(Location::RegisterLocation(RCX));
1667 // Need another temporary to be able to compute the result.
1668 locations->AddTemp(Location::RequiresRegister());
1669 }
1670
GenerateStringIndexOf(HInvoke * invoke,X86_64Assembler * assembler,CodeGeneratorX86_64 * codegen,bool start_at_zero)1671 static void GenerateStringIndexOf(HInvoke* invoke,
1672 X86_64Assembler* assembler,
1673 CodeGeneratorX86_64* codegen,
1674 bool start_at_zero) {
1675 LocationSummary* locations = invoke->GetLocations();
1676
1677 // Note that the null check must have been done earlier.
1678 DCHECK(!invoke->CanDoImplicitNullCheckOn(invoke->InputAt(0)));
1679
1680 CpuRegister string_obj = locations->InAt(0).AsRegister<CpuRegister>();
1681 CpuRegister search_value = locations->InAt(1).AsRegister<CpuRegister>();
1682 CpuRegister counter = locations->GetTemp(0).AsRegister<CpuRegister>();
1683 CpuRegister string_length = locations->GetTemp(1).AsRegister<CpuRegister>();
1684 CpuRegister out = locations->Out().AsRegister<CpuRegister>();
1685
1686 // Check our assumptions for registers.
1687 DCHECK_EQ(string_obj.AsRegister(), RDI);
1688 DCHECK_EQ(search_value.AsRegister(), RAX);
1689 DCHECK_EQ(counter.AsRegister(), RCX);
1690 DCHECK_EQ(out.AsRegister(), RDI);
1691
1692 // Check for code points > 0xFFFF. Either a slow-path check when we don't know statically,
1693 // or directly dispatch for a large constant, or omit slow-path for a small constant or a char.
1694 SlowPathCode* slow_path = nullptr;
1695 HInstruction* code_point = invoke->InputAt(1);
1696 if (code_point->IsIntConstant()) {
1697 if (static_cast<uint32_t>(code_point->AsIntConstant()->GetValue()) >
1698 std::numeric_limits<uint16_t>::max()) {
1699 // Always needs the slow-path. We could directly dispatch to it, but this case should be
1700 // rare, so for simplicity just put the full slow-path down and branch unconditionally.
1701 slow_path = new (codegen->GetScopedAllocator()) IntrinsicSlowPathX86_64(invoke);
1702 codegen->AddSlowPath(slow_path);
1703 __ jmp(slow_path->GetEntryLabel());
1704 __ Bind(slow_path->GetExitLabel());
1705 return;
1706 }
1707 } else if (code_point->GetType() != DataType::Type::kUint16) {
1708 __ cmpl(search_value, Immediate(std::numeric_limits<uint16_t>::max()));
1709 slow_path = new (codegen->GetScopedAllocator()) IntrinsicSlowPathX86_64(invoke);
1710 codegen->AddSlowPath(slow_path);
1711 __ j(kAbove, slow_path->GetEntryLabel());
1712 }
1713
1714 // From here down, we know that we are looking for a char that fits in
1715 // 16 bits (uncompressed) or 8 bits (compressed).
1716 // Location of reference to data array within the String object.
1717 int32_t value_offset = mirror::String::ValueOffset().Int32Value();
1718 // Location of count within the String object.
1719 int32_t count_offset = mirror::String::CountOffset().Int32Value();
1720
1721 // Load the count field of the string containing the length and compression flag.
1722 __ movl(string_length, Address(string_obj, count_offset));
1723
1724 // Do a zero-length check. Even with string compression `count == 0` means empty.
1725 // TODO: Support jecxz.
1726 NearLabel not_found_label;
1727 __ testl(string_length, string_length);
1728 __ j(kEqual, &not_found_label);
1729
1730 if (mirror::kUseStringCompression) {
1731 // Use TMP to keep string_length_flagged.
1732 __ movl(CpuRegister(TMP), string_length);
1733 // Mask out first bit used as compression flag.
1734 __ shrl(string_length, Immediate(1));
1735 }
1736
1737 if (start_at_zero) {
1738 // Number of chars to scan is the same as the string length.
1739 __ movl(counter, string_length);
1740 // Move to the start of the string.
1741 __ addq(string_obj, Immediate(value_offset));
1742 } else {
1743 CpuRegister start_index = locations->InAt(2).AsRegister<CpuRegister>();
1744
1745 // Do a start_index check.
1746 __ cmpl(start_index, string_length);
1747 __ j(kGreaterEqual, &not_found_label);
1748
1749 // Ensure we have a start index >= 0.
1750 __ xorl(counter, counter);
1751 __ cmpl(start_index, Immediate(0));
1752 __ cmov(kGreater, counter, start_index, /* is64bit */ false); // 32-bit copy is enough.
1753
1754 if (mirror::kUseStringCompression) {
1755 NearLabel modify_counter, offset_uncompressed_label;
1756 __ testl(CpuRegister(TMP), Immediate(1));
1757 __ j(kNotZero, &offset_uncompressed_label);
1758 __ leaq(string_obj, Address(string_obj, counter, ScaleFactor::TIMES_1, value_offset));
1759 __ jmp(&modify_counter);
1760 // Move to the start of the string: string_obj + value_offset + 2 * start_index.
1761 __ Bind(&offset_uncompressed_label);
1762 __ leaq(string_obj, Address(string_obj, counter, ScaleFactor::TIMES_2, value_offset));
1763 __ Bind(&modify_counter);
1764 } else {
1765 __ leaq(string_obj, Address(string_obj, counter, ScaleFactor::TIMES_2, value_offset));
1766 }
1767 // Now update the work counter in RCX: it becomes string.length - start_index.
1768 __ negq(counter); // Needs to be 64-bit negation, as the address computation is 64-bit.
1769 __ leaq(counter, Address(string_length, counter, ScaleFactor::TIMES_1, 0));
1770 }
1771
1772 if (mirror::kUseStringCompression) {
1773 NearLabel uncompressed_string_comparison;
1774 NearLabel comparison_done;
1775 __ testl(CpuRegister(TMP), Immediate(1));
1776 __ j(kNotZero, &uncompressed_string_comparison);
1777 // Check if RAX (search_value) is ASCII.
1778 __ cmpl(search_value, Immediate(127));
1779 __ j(kGreater, &not_found_label);
1780 // Comparing byte-per-byte.
1781 __ repne_scasb();
1782 __ jmp(&comparison_done);
1783 // Everything is set up for repne scasw:
1784 // * Comparison address in RDI.
1785 // * Counter in ECX.
1786 __ Bind(&uncompressed_string_comparison);
1787 __ repne_scasw();
1788 __ Bind(&comparison_done);
1789 } else {
1790 __ repne_scasw();
1791 }
1792 // Did we find a match?
1793 __ j(kNotEqual, &not_found_label);
1794
1795 // Yes, we matched. Compute the index of the result.
1796 __ subl(string_length, counter);
1797 __ leal(out, Address(string_length, -1));
1798
1799 NearLabel done;
1800 __ jmp(&done);
1801
1802 // Failed to match; return -1.
1803 __ Bind(&not_found_label);
1804 __ movl(out, Immediate(-1));
1805
1806 // And join up at the end.
1807 __ Bind(&done);
1808 if (slow_path != nullptr) {
1809 __ Bind(slow_path->GetExitLabel());
1810 }
1811 }
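
// Illustrative scalar equivalent of the search above (sketch only; the helper and its
// parameters are hypothetical), shown for the uncompressed case:
//
//   static int32_t IndexOfSketch(const uint16_t* chars, int32_t length,
//                                int32_t from_index, uint16_t ch) {
//     for (int32_t i = (from_index > 0 ? from_index : 0); i < length; ++i) {
//       if (chars[i] == ch) return i;
//     }
//     return -1;
//   }
//
// The generated code recovers the same index from the registers left by repne scasw/scasb:
// RCX is decremented once per element examined and the scan stops one element past a
// match, hence the `string_length - counter - 1` computed by the subl/leal pair above.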
1812
VisitStringIndexOf(HInvoke * invoke)1813 void IntrinsicLocationsBuilderX86_64::VisitStringIndexOf(HInvoke* invoke) {
1814 CreateStringIndexOfLocations(invoke, allocator_, /* start_at_zero */ true);
1815 }
1816
VisitStringIndexOf(HInvoke * invoke)1817 void IntrinsicCodeGeneratorX86_64::VisitStringIndexOf(HInvoke* invoke) {
1818 GenerateStringIndexOf(invoke, GetAssembler(), codegen_, /* start_at_zero */ true);
1819 }
1820
VisitStringIndexOfAfter(HInvoke * invoke)1821 void IntrinsicLocationsBuilderX86_64::VisitStringIndexOfAfter(HInvoke* invoke) {
1822 CreateStringIndexOfLocations(invoke, allocator_, /* start_at_zero */ false);
1823 }
1824
VisitStringIndexOfAfter(HInvoke * invoke)1825 void IntrinsicCodeGeneratorX86_64::VisitStringIndexOfAfter(HInvoke* invoke) {
1826 GenerateStringIndexOf(invoke, GetAssembler(), codegen_, /* start_at_zero */ false);
1827 }
1828
VisitStringNewStringFromBytes(HInvoke * invoke)1829 void IntrinsicLocationsBuilderX86_64::VisitStringNewStringFromBytes(HInvoke* invoke) {
1830 LocationSummary* locations = new (allocator_) LocationSummary(
1831 invoke, LocationSummary::kCallOnMainAndSlowPath, kIntrinsified);
1832 InvokeRuntimeCallingConvention calling_convention;
1833 locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetRegisterAt(0)));
1834 locations->SetInAt(1, Location::RegisterLocation(calling_convention.GetRegisterAt(1)));
1835 locations->SetInAt(2, Location::RegisterLocation(calling_convention.GetRegisterAt(2)));
1836 locations->SetInAt(3, Location::RegisterLocation(calling_convention.GetRegisterAt(3)));
1837 locations->SetOut(Location::RegisterLocation(RAX));
1838 }
1839
VisitStringNewStringFromBytes(HInvoke * invoke)1840 void IntrinsicCodeGeneratorX86_64::VisitStringNewStringFromBytes(HInvoke* invoke) {
1841 X86_64Assembler* assembler = GetAssembler();
1842 LocationSummary* locations = invoke->GetLocations();
1843
1844 CpuRegister byte_array = locations->InAt(0).AsRegister<CpuRegister>();
1845 __ testl(byte_array, byte_array);
1846 SlowPathCode* slow_path = new (codegen_->GetScopedAllocator()) IntrinsicSlowPathX86_64(invoke);
1847 codegen_->AddSlowPath(slow_path);
1848 __ j(kEqual, slow_path->GetEntryLabel());
1849
1850 codegen_->InvokeRuntime(kQuickAllocStringFromBytes, invoke, invoke->GetDexPc());
1851 CheckEntrypointTypes<kQuickAllocStringFromBytes, void*, void*, int32_t, int32_t, int32_t>();
1852 __ Bind(slow_path->GetExitLabel());
1853 }
1854
VisitStringNewStringFromChars(HInvoke * invoke)1855 void IntrinsicLocationsBuilderX86_64::VisitStringNewStringFromChars(HInvoke* invoke) {
1856 LocationSummary* locations =
1857 new (allocator_) LocationSummary(invoke, LocationSummary::kCallOnMainOnly, kIntrinsified);
1858 InvokeRuntimeCallingConvention calling_convention;
1859 locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetRegisterAt(0)));
1860 locations->SetInAt(1, Location::RegisterLocation(calling_convention.GetRegisterAt(1)));
1861 locations->SetInAt(2, Location::RegisterLocation(calling_convention.GetRegisterAt(2)));
1862 locations->SetOut(Location::RegisterLocation(RAX));
1863 }
1864
VisitStringNewStringFromChars(HInvoke * invoke)1865 void IntrinsicCodeGeneratorX86_64::VisitStringNewStringFromChars(HInvoke* invoke) {
1866 // No need to emit code checking whether `locations->InAt(2)` is a null
1867 // pointer, as callers of the native method
1868 //
1869 // java.lang.StringFactory.newStringFromChars(int offset, int charCount, char[] data)
1870 //
1871 // all include a null check on `data` before calling that method.
1872 codegen_->InvokeRuntime(kQuickAllocStringFromChars, invoke, invoke->GetDexPc());
1873 CheckEntrypointTypes<kQuickAllocStringFromChars, void*, int32_t, int32_t, void*>();
1874 }
1875
VisitStringNewStringFromString(HInvoke * invoke)1876 void IntrinsicLocationsBuilderX86_64::VisitStringNewStringFromString(HInvoke* invoke) {
1877 LocationSummary* locations = new (allocator_) LocationSummary(
1878 invoke, LocationSummary::kCallOnMainAndSlowPath, kIntrinsified);
1879 InvokeRuntimeCallingConvention calling_convention;
1880 locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetRegisterAt(0)));
1881 locations->SetOut(Location::RegisterLocation(RAX));
1882 }
1883
VisitStringNewStringFromString(HInvoke * invoke)1884 void IntrinsicCodeGeneratorX86_64::VisitStringNewStringFromString(HInvoke* invoke) {
1885 X86_64Assembler* assembler = GetAssembler();
1886 LocationSummary* locations = invoke->GetLocations();
1887
1888 CpuRegister string_to_copy = locations->InAt(0).AsRegister<CpuRegister>();
1889 __ testl(string_to_copy, string_to_copy);
1890 SlowPathCode* slow_path = new (codegen_->GetScopedAllocator()) IntrinsicSlowPathX86_64(invoke);
1891 codegen_->AddSlowPath(slow_path);
1892 __ j(kEqual, slow_path->GetEntryLabel());
1893
1894 codegen_->InvokeRuntime(kQuickAllocStringFromString, invoke, invoke->GetDexPc());
1895 CheckEntrypointTypes<kQuickAllocStringFromString, void*, void*>();
1896 __ Bind(slow_path->GetExitLabel());
1897 }
1898
VisitStringGetCharsNoCheck(HInvoke * invoke)1899 void IntrinsicLocationsBuilderX86_64::VisitStringGetCharsNoCheck(HInvoke* invoke) {
1900 // public void getChars(int srcBegin, int srcEnd, char[] dst, int dstBegin);
1901 LocationSummary* locations =
1902 new (allocator_) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
1903 locations->SetInAt(0, Location::RequiresRegister());
1904 locations->SetInAt(1, Location::RegisterOrConstant(invoke->InputAt(1)));
1905 locations->SetInAt(2, Location::RequiresRegister());
1906 locations->SetInAt(3, Location::RequiresRegister());
1907 locations->SetInAt(4, Location::RequiresRegister());
1908
1909 // And we need some temporaries. We will use REP MOVSW, so we need fixed registers.
1910 locations->AddTemp(Location::RegisterLocation(RSI));
1911 locations->AddTemp(Location::RegisterLocation(RDI));
1912 locations->AddTemp(Location::RegisterLocation(RCX));
1913 }
1914
VisitStringGetCharsNoCheck(HInvoke * invoke)1915 void IntrinsicCodeGeneratorX86_64::VisitStringGetCharsNoCheck(HInvoke* invoke) {
1916 X86_64Assembler* assembler = GetAssembler();
1917 LocationSummary* locations = invoke->GetLocations();
1918
1919 size_t char_component_size = DataType::Size(DataType::Type::kUint16);
1920 // Location of data in char array buffer.
1921 const uint32_t data_offset = mirror::Array::DataOffset(char_component_size).Uint32Value();
1922 // Location of char array data in string.
1923 const uint32_t value_offset = mirror::String::ValueOffset().Uint32Value();
1924
1925 // public void getChars(int srcBegin, int srcEnd, char[] dst, int dstBegin);
1926 CpuRegister obj = locations->InAt(0).AsRegister<CpuRegister>();
1927 Location srcBegin = locations->InAt(1);
1928 int srcBegin_value =
1929 srcBegin.IsConstant() ? srcBegin.GetConstant()->AsIntConstant()->GetValue() : 0;
1930 CpuRegister srcEnd = locations->InAt(2).AsRegister<CpuRegister>();
1931 CpuRegister dst = locations->InAt(3).AsRegister<CpuRegister>();
1932 CpuRegister dstBegin = locations->InAt(4).AsRegister<CpuRegister>();
1933
1934 // Check assumption that sizeof(Char) is 2 (used in scaling below).
1935 const size_t char_size = DataType::Size(DataType::Type::kUint16);
1936 DCHECK_EQ(char_size, 2u);
1937
1938 NearLabel done;
1939 // Compute the number of chars (words) to move.
1940 __ movl(CpuRegister(RCX), srcEnd);
1941 if (srcBegin.IsConstant()) {
1942 __ subl(CpuRegister(RCX), Immediate(srcBegin_value));
1943 } else {
1944 DCHECK(srcBegin.IsRegister());
1945 __ subl(CpuRegister(RCX), srcBegin.AsRegister<CpuRegister>());
1946 }
1947 if (mirror::kUseStringCompression) {
1948 NearLabel copy_uncompressed, copy_loop;
1949 const size_t c_char_size = DataType::Size(DataType::Type::kInt8);
1950 DCHECK_EQ(c_char_size, 1u);
1951 // Location of count in string.
1952 const uint32_t count_offset = mirror::String::CountOffset().Uint32Value();
1953
1954 __ testl(Address(obj, count_offset), Immediate(1));
1955 static_assert(static_cast<uint32_t>(mirror::StringCompressionFlag::kCompressed) == 0u,
1956 "Expecting 0=compressed, 1=uncompressed");
1957 __ j(kNotZero, &copy_uncompressed);
1958 // Compute the address of the source string by adding the number of chars from
1959 // the source beginning to the value offset of a string.
1960 __ leaq(CpuRegister(RSI),
1961 CodeGeneratorX86_64::ArrayAddress(obj, srcBegin, TIMES_1, value_offset));
1962 // Start the loop to copy String's value to Array of Char.
1963 __ leaq(CpuRegister(RDI), Address(dst, dstBegin, ScaleFactor::TIMES_2, data_offset));
1964
1965 __ Bind(&copy_loop);
1966 __ jrcxz(&done);
1967 // Use TMP as temporary (convert byte from RSI to word).
1968 // TODO: Selecting RAX as the temporary and using LODSB/STOSW.
1969 __ movzxb(CpuRegister(TMP), Address(CpuRegister(RSI), 0));
1970 __ movw(Address(CpuRegister(RDI), 0), CpuRegister(TMP));
1971 __ leaq(CpuRegister(RDI), Address(CpuRegister(RDI), char_size));
1972 __ leaq(CpuRegister(RSI), Address(CpuRegister(RSI), c_char_size));
1973 // TODO: Add support for LOOP to X86_64Assembler.
1974 __ subl(CpuRegister(RCX), Immediate(1));
1975 __ jmp(&copy_loop);
1976
1977 __ Bind(&copy_uncompressed);
1978 }
1979
1980 __ leaq(CpuRegister(RSI),
1981 CodeGeneratorX86_64::ArrayAddress(obj, srcBegin, TIMES_2, value_offset));
1982 // Compute the address of the destination buffer.
1983 __ leaq(CpuRegister(RDI), Address(dst, dstBegin, ScaleFactor::TIMES_2, data_offset));
1984 // Do the move.
1985 __ rep_movsw();
1986
1987 __ Bind(&done);
1988 }
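
// Illustrative sketch (hypothetical helper) of the compressed-string branch above, which
// widens each stored 8-bit unit to a 16-bit char via movzxb/movw:
//
//   static void WidenChars(const uint8_t* src, uint16_t* dst, int32_t n) {
//     for (int32_t i = 0; i < n; ++i) {
//       dst[i] = src[i];
//     }
//   }
//
// For uncompressed strings the same range is moved verbatim by a single `rep movsw`,
// i.e. effectively memcpy(dst, src, n * sizeof(uint16_t)) with RCX holding n.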
1989
GenPeek(LocationSummary * locations,DataType::Type size,X86_64Assembler * assembler)1990 static void GenPeek(LocationSummary* locations, DataType::Type size, X86_64Assembler* assembler) {
1991 CpuRegister address = locations->InAt(0).AsRegister<CpuRegister>();
1992 CpuRegister out = locations->Out().AsRegister<CpuRegister>(); // == address, here for clarity.
1993 // x86 allows unaligned access. We do not have to check the input or use specific instructions
1994 // to avoid a SIGBUS.
1995 switch (size) {
1996 case DataType::Type::kInt8:
1997 __ movsxb(out, Address(address, 0));
1998 break;
1999 case DataType::Type::kInt16:
2000 __ movsxw(out, Address(address, 0));
2001 break;
2002 case DataType::Type::kInt32:
2003 __ movl(out, Address(address, 0));
2004 break;
2005 case DataType::Type::kInt64:
2006 __ movq(out, Address(address, 0));
2007 break;
2008 default:
2009 LOG(FATAL) << "Type not recognized for peek: " << size;
2010 UNREACHABLE();
2011 }
2012 }
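
// Illustrative sketch only. Because x86-64 tolerates unaligned accesses, each peek is a
// single (sign-extending) load, roughly equivalent to:
//
//   static int32_t PeekInt32(intptr_t address) {
//     return *reinterpret_cast<const int32_t*>(address);  // movl out, [address]
//   }
//
// with movsxb/movsxw/movq used for the kInt8/kInt16/kInt64 variants above.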
2013
VisitMemoryPeekByte(HInvoke * invoke)2014 void IntrinsicLocationsBuilderX86_64::VisitMemoryPeekByte(HInvoke* invoke) {
2015 CreateIntToIntLocations(allocator_, invoke);
2016 }
2017
VisitMemoryPeekByte(HInvoke * invoke)2018 void IntrinsicCodeGeneratorX86_64::VisitMemoryPeekByte(HInvoke* invoke) {
2019 GenPeek(invoke->GetLocations(), DataType::Type::kInt8, GetAssembler());
2020 }
2021
VisitMemoryPeekIntNative(HInvoke * invoke)2022 void IntrinsicLocationsBuilderX86_64::VisitMemoryPeekIntNative(HInvoke* invoke) {
2023 CreateIntToIntLocations(allocator_, invoke);
2024 }
2025
VisitMemoryPeekIntNative(HInvoke * invoke)2026 void IntrinsicCodeGeneratorX86_64::VisitMemoryPeekIntNative(HInvoke* invoke) {
2027 GenPeek(invoke->GetLocations(), DataType::Type::kInt32, GetAssembler());
2028 }
2029
VisitMemoryPeekLongNative(HInvoke * invoke)2030 void IntrinsicLocationsBuilderX86_64::VisitMemoryPeekLongNative(HInvoke* invoke) {
2031 CreateIntToIntLocations(allocator_, invoke);
2032 }
2033
VisitMemoryPeekLongNative(HInvoke * invoke)2034 void IntrinsicCodeGeneratorX86_64::VisitMemoryPeekLongNative(HInvoke* invoke) {
2035 GenPeek(invoke->GetLocations(), DataType::Type::kInt64, GetAssembler());
2036 }
2037
VisitMemoryPeekShortNative(HInvoke * invoke)2038 void IntrinsicLocationsBuilderX86_64::VisitMemoryPeekShortNative(HInvoke* invoke) {
2039 CreateIntToIntLocations(allocator_, invoke);
2040 }
2041
VisitMemoryPeekShortNative(HInvoke * invoke)2042 void IntrinsicCodeGeneratorX86_64::VisitMemoryPeekShortNative(HInvoke* invoke) {
2043 GenPeek(invoke->GetLocations(), DataType::Type::kInt16, GetAssembler());
2044 }
2045
CreateIntIntToVoidLocations(ArenaAllocator * allocator,HInvoke * invoke)2046 static void CreateIntIntToVoidLocations(ArenaAllocator* allocator, HInvoke* invoke) {
2047 LocationSummary* locations =
2048 new (allocator) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
2049 locations->SetInAt(0, Location::RequiresRegister());
2050 locations->SetInAt(1, Location::RegisterOrInt32Constant(invoke->InputAt(1)));
2051 }
2052
GenPoke(LocationSummary * locations,DataType::Type size,X86_64Assembler * assembler)2053 static void GenPoke(LocationSummary* locations, DataType::Type size, X86_64Assembler* assembler) {
2054 CpuRegister address = locations->InAt(0).AsRegister<CpuRegister>();
2055 Location value = locations->InAt(1);
2056 // x86 allows unaligned access. We do not have to check the input or use specific instructions
2057 // to avoid a SIGBUS.
2058 switch (size) {
2059 case DataType::Type::kInt8:
2060 if (value.IsConstant()) {
2061 __ movb(Address(address, 0),
2062 Immediate(CodeGenerator::GetInt32ValueOf(value.GetConstant())));
2063 } else {
2064 __ movb(Address(address, 0), value.AsRegister<CpuRegister>());
2065 }
2066 break;
2067 case DataType::Type::kInt16:
2068 if (value.IsConstant()) {
2069 __ movw(Address(address, 0),
2070 Immediate(CodeGenerator::GetInt32ValueOf(value.GetConstant())));
2071 } else {
2072 __ movw(Address(address, 0), value.AsRegister<CpuRegister>());
2073 }
2074 break;
2075 case DataType::Type::kInt32:
2076 if (value.IsConstant()) {
2077 __ movl(Address(address, 0),
2078 Immediate(CodeGenerator::GetInt32ValueOf(value.GetConstant())));
2079 } else {
2080 __ movl(Address(address, 0), value.AsRegister<CpuRegister>());
2081 }
2082 break;
2083 case DataType::Type::kInt64:
2084 if (value.IsConstant()) {
2085 int64_t v = value.GetConstant()->AsLongConstant()->GetValue();
2086 DCHECK(IsInt<32>(v));
2087 int32_t v_32 = v;
2088 __ movq(Address(address, 0), Immediate(v_32));
2089 } else {
2090 __ movq(Address(address, 0), value.AsRegister<CpuRegister>());
2091 }
2092 break;
2093 default:
2094 LOG(FATAL) << "Type not recognized for poke: " << size;
2095 UNREACHABLE();
2096 }
2097 }
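
// Illustrative sketch only; each poke is a single store that relies on x86-64's tolerance
// for unaligned accesses:
//
//   static void PokeInt64(intptr_t address, int64_t value) {
//     *reinterpret_cast<int64_t*>(address) = value;  // movq [address], value
//   }
//
// Note that `movq mem, imm` only accepts a sign-extended 32-bit immediate, which is why
// the constant kInt64 path above DCHECKs IsInt<32>(v) before using the immediate form.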
2098
VisitMemoryPokeByte(HInvoke * invoke)2099 void IntrinsicLocationsBuilderX86_64::VisitMemoryPokeByte(HInvoke* invoke) {
2100 CreateIntIntToVoidLocations(allocator_, invoke);
2101 }
2102
VisitMemoryPokeByte(HInvoke * invoke)2103 void IntrinsicCodeGeneratorX86_64::VisitMemoryPokeByte(HInvoke* invoke) {
2104 GenPoke(invoke->GetLocations(), DataType::Type::kInt8, GetAssembler());
2105 }
2106
VisitMemoryPokeIntNative(HInvoke * invoke)2107 void IntrinsicLocationsBuilderX86_64::VisitMemoryPokeIntNative(HInvoke* invoke) {
2108 CreateIntIntToVoidLocations(allocator_, invoke);
2109 }
2110
VisitMemoryPokeIntNative(HInvoke * invoke)2111 void IntrinsicCodeGeneratorX86_64::VisitMemoryPokeIntNative(HInvoke* invoke) {
2112 GenPoke(invoke->GetLocations(), DataType::Type::kInt32, GetAssembler());
2113 }
2114
VisitMemoryPokeLongNative(HInvoke * invoke)2115 void IntrinsicLocationsBuilderX86_64::VisitMemoryPokeLongNative(HInvoke* invoke) {
2116 CreateIntIntToVoidLocations(allocator_, invoke);
2117 }
2118
VisitMemoryPokeLongNative(HInvoke * invoke)2119 void IntrinsicCodeGeneratorX86_64::VisitMemoryPokeLongNative(HInvoke* invoke) {
2120 GenPoke(invoke->GetLocations(), DataType::Type::kInt64, GetAssembler());
2121 }
2122
VisitMemoryPokeShortNative(HInvoke * invoke)2123 void IntrinsicLocationsBuilderX86_64::VisitMemoryPokeShortNative(HInvoke* invoke) {
2124 CreateIntIntToVoidLocations(allocator_, invoke);
2125 }
2126
VisitMemoryPokeShortNative(HInvoke * invoke)2127 void IntrinsicCodeGeneratorX86_64::VisitMemoryPokeShortNative(HInvoke* invoke) {
2128 GenPoke(invoke->GetLocations(), DataType::Type::kInt16, GetAssembler());
2129 }
2130
VisitThreadCurrentThread(HInvoke * invoke)2131 void IntrinsicLocationsBuilderX86_64::VisitThreadCurrentThread(HInvoke* invoke) {
2132 LocationSummary* locations =
2133 new (allocator_) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
2134 locations->SetOut(Location::RequiresRegister());
2135 }
2136
VisitThreadCurrentThread(HInvoke * invoke)2137 void IntrinsicCodeGeneratorX86_64::VisitThreadCurrentThread(HInvoke* invoke) {
2138 CpuRegister out = invoke->GetLocations()->Out().AsRegister<CpuRegister>();
2139 GetAssembler()->gs()->movl(out, Address::Absolute(Thread::PeerOffset<kX86_64PointerSize>(),
2140 /* no_rip */ true));
2141 }
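
// Sketch only: the java.lang.Thread peer is read straight out of the runtime Thread's
// thread-local block, addressed through the %gs segment. Conceptually (with `gs_base` a
// hypothetical name for the %gs segment base):
//
//   uint32_t peer = *reinterpret_cast<const uint32_t*>(
//       gs_base + Thread::PeerOffset<kX86_64PointerSize>().Int32Value());
//
// The 32-bit load yields the compressed heap reference that is placed in `out`.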
2142
GenUnsafeGet(HInvoke * invoke,DataType::Type type,bool is_volatile ATTRIBUTE_UNUSED,CodeGeneratorX86_64 * codegen)2143 static void GenUnsafeGet(HInvoke* invoke,
2144 DataType::Type type,
2145 bool is_volatile ATTRIBUTE_UNUSED,
2146 CodeGeneratorX86_64* codegen) {
2147 X86_64Assembler* assembler = down_cast<X86_64Assembler*>(codegen->GetAssembler());
2148 LocationSummary* locations = invoke->GetLocations();
2149 Location base_loc = locations->InAt(1);
2150 CpuRegister base = base_loc.AsRegister<CpuRegister>();
2151 Location offset_loc = locations->InAt(2);
2152 CpuRegister offset = offset_loc.AsRegister<CpuRegister>();
2153 Location output_loc = locations->Out();
2154 CpuRegister output = output_loc.AsRegister<CpuRegister>();
2155
2156 switch (type) {
2157 case DataType::Type::kInt32:
2158 __ movl(output, Address(base, offset, ScaleFactor::TIMES_1, 0));
2159 break;
2160
2161 case DataType::Type::kReference: {
2162 if (kEmitCompilerReadBarrier) {
2163 if (kUseBakerReadBarrier) {
2164 Address src(base, offset, ScaleFactor::TIMES_1, 0);
2165 codegen->GenerateReferenceLoadWithBakerReadBarrier(
2166 invoke, output_loc, base, src, /* needs_null_check */ false);
2167 } else {
2168 __ movl(output, Address(base, offset, ScaleFactor::TIMES_1, 0));
2169 codegen->GenerateReadBarrierSlow(
2170 invoke, output_loc, output_loc, base_loc, 0U, offset_loc);
2171 }
2172 } else {
2173 __ movl(output, Address(base, offset, ScaleFactor::TIMES_1, 0));
2174 __ MaybeUnpoisonHeapReference(output);
2175 }
2176 break;
2177 }
2178
2179 case DataType::Type::kInt64:
2180 __ movq(output, Address(base, offset, ScaleFactor::TIMES_1, 0));
2181 break;
2182
2183 default:
2184 LOG(FATAL) << "Unsupported op size " << type;
2185 UNREACHABLE();
2186 }
2187 }
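
// Illustrative sketch (hypothetical helper). Ignoring read barriers, an Unsafe get is a
// plain load at `base + offset` with no alignment handling:
//
//   static int32_t UnsafeGetInt32(const uint8_t* base, int64_t offset) {
//     return *reinterpret_cast<const int32_t*>(base + offset);  // movl, TIMES_1 addressing
//   }
//
// For kReference the same 32-bit load is additionally routed through the (Baker or
// slow-path) read barrier and unpoisoned when heap poisoning is enabled. The volatile
// variants emit no extra fence, which is why `is_volatile` is ATTRIBUTE_UNUSED above:
// the x86-64 memory model already gives loads the required ordering.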
2188
CreateIntIntIntToIntLocations(ArenaAllocator * allocator,HInvoke * invoke)2189 static void CreateIntIntIntToIntLocations(ArenaAllocator* allocator, HInvoke* invoke) {
2190 bool can_call = kEmitCompilerReadBarrier &&
2191 (invoke->GetIntrinsic() == Intrinsics::kUnsafeGetObject ||
2192 invoke->GetIntrinsic() == Intrinsics::kUnsafeGetObjectVolatile);
2193 LocationSummary* locations =
2194 new (allocator) LocationSummary(invoke,
2195 can_call
2196 ? LocationSummary::kCallOnSlowPath
2197 : LocationSummary::kNoCall,
2198 kIntrinsified);
2199 if (can_call && kUseBakerReadBarrier) {
2200 locations->SetCustomSlowPathCallerSaves(RegisterSet::Empty()); // No caller-save registers.
2201 }
2202 locations->SetInAt(0, Location::NoLocation()); // Unused receiver.
2203 locations->SetInAt(1, Location::RequiresRegister());
2204 locations->SetInAt(2, Location::RequiresRegister());
2205 locations->SetOut(Location::RequiresRegister(),
2206 (can_call ? Location::kOutputOverlap : Location::kNoOutputOverlap));
2207 }
2208
VisitUnsafeGet(HInvoke * invoke)2209 void IntrinsicLocationsBuilderX86_64::VisitUnsafeGet(HInvoke* invoke) {
2210 CreateIntIntIntToIntLocations(allocator_, invoke);
2211 }
VisitUnsafeGetVolatile(HInvoke * invoke)2212 void IntrinsicLocationsBuilderX86_64::VisitUnsafeGetVolatile(HInvoke* invoke) {
2213 CreateIntIntIntToIntLocations(allocator_, invoke);
2214 }
VisitUnsafeGetLong(HInvoke * invoke)2215 void IntrinsicLocationsBuilderX86_64::VisitUnsafeGetLong(HInvoke* invoke) {
2216 CreateIntIntIntToIntLocations(allocator_, invoke);
2217 }
VisitUnsafeGetLongVolatile(HInvoke * invoke)2218 void IntrinsicLocationsBuilderX86_64::VisitUnsafeGetLongVolatile(HInvoke* invoke) {
2219 CreateIntIntIntToIntLocations(allocator_, invoke);
2220 }
VisitUnsafeGetObject(HInvoke * invoke)2221 void IntrinsicLocationsBuilderX86_64::VisitUnsafeGetObject(HInvoke* invoke) {
2222 CreateIntIntIntToIntLocations(allocator_, invoke);
2223 }
VisitUnsafeGetObjectVolatile(HInvoke * invoke)2224 void IntrinsicLocationsBuilderX86_64::VisitUnsafeGetObjectVolatile(HInvoke* invoke) {
2225 CreateIntIntIntToIntLocations(allocator_, invoke);
2226 }
2227
2228
VisitUnsafeGet(HInvoke * invoke)2229 void IntrinsicCodeGeneratorX86_64::VisitUnsafeGet(HInvoke* invoke) {
2230 GenUnsafeGet(invoke, DataType::Type::kInt32, /* is_volatile */ false, codegen_);
2231 }
VisitUnsafeGetVolatile(HInvoke * invoke)2232 void IntrinsicCodeGeneratorX86_64::VisitUnsafeGetVolatile(HInvoke* invoke) {
2233 GenUnsafeGet(invoke, DataType::Type::kInt32, /* is_volatile */ true, codegen_);
2234 }
VisitUnsafeGetLong(HInvoke * invoke)2235 void IntrinsicCodeGeneratorX86_64::VisitUnsafeGetLong(HInvoke* invoke) {
2236 GenUnsafeGet(invoke, DataType::Type::kInt64, /* is_volatile */ false, codegen_);
2237 }
VisitUnsafeGetLongVolatile(HInvoke * invoke)2238 void IntrinsicCodeGeneratorX86_64::VisitUnsafeGetLongVolatile(HInvoke* invoke) {
2239 GenUnsafeGet(invoke, DataType::Type::kInt64, /* is_volatile */ true, codegen_);
2240 }
VisitUnsafeGetObject(HInvoke * invoke)2241 void IntrinsicCodeGeneratorX86_64::VisitUnsafeGetObject(HInvoke* invoke) {
2242 GenUnsafeGet(invoke, DataType::Type::kReference, /* is_volatile */ false, codegen_);
2243 }
VisitUnsafeGetObjectVolatile(HInvoke * invoke)2244 void IntrinsicCodeGeneratorX86_64::VisitUnsafeGetObjectVolatile(HInvoke* invoke) {
2245 GenUnsafeGet(invoke, DataType::Type::kReference, /* is_volatile */ true, codegen_);
2246 }
2247
2248
CreateIntIntIntIntToVoidPlusTempsLocations(ArenaAllocator * allocator,DataType::Type type,HInvoke * invoke)2249 static void CreateIntIntIntIntToVoidPlusTempsLocations(ArenaAllocator* allocator,
2250 DataType::Type type,
2251 HInvoke* invoke) {
2252 LocationSummary* locations =
2253 new (allocator) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
2254 locations->SetInAt(0, Location::NoLocation()); // Unused receiver.
2255 locations->SetInAt(1, Location::RequiresRegister());
2256 locations->SetInAt(2, Location::RequiresRegister());
2257 locations->SetInAt(3, Location::RequiresRegister());
2258 if (type == DataType::Type::kReference) {
2259 // Need temp registers for card-marking.
2260 locations->AddTemp(Location::RequiresRegister()); // Possibly used for reference poisoning too.
2261 locations->AddTemp(Location::RequiresRegister());
2262 }
2263 }
2264
VisitUnsafePut(HInvoke * invoke)2265 void IntrinsicLocationsBuilderX86_64::VisitUnsafePut(HInvoke* invoke) {
2266 CreateIntIntIntIntToVoidPlusTempsLocations(allocator_, DataType::Type::kInt32, invoke);
2267 }
VisitUnsafePutOrdered(HInvoke * invoke)2268 void IntrinsicLocationsBuilderX86_64::VisitUnsafePutOrdered(HInvoke* invoke) {
2269 CreateIntIntIntIntToVoidPlusTempsLocations(allocator_, DataType::Type::kInt32, invoke);
2270 }
VisitUnsafePutVolatile(HInvoke * invoke)2271 void IntrinsicLocationsBuilderX86_64::VisitUnsafePutVolatile(HInvoke* invoke) {
2272 CreateIntIntIntIntToVoidPlusTempsLocations(allocator_, DataType::Type::kInt32, invoke);
2273 }
VisitUnsafePutObject(HInvoke * invoke)2274 void IntrinsicLocationsBuilderX86_64::VisitUnsafePutObject(HInvoke* invoke) {
2275 CreateIntIntIntIntToVoidPlusTempsLocations(allocator_, DataType::Type::kReference, invoke);
2276 }
VisitUnsafePutObjectOrdered(HInvoke * invoke)2277 void IntrinsicLocationsBuilderX86_64::VisitUnsafePutObjectOrdered(HInvoke* invoke) {
2278 CreateIntIntIntIntToVoidPlusTempsLocations(allocator_, DataType::Type::kReference, invoke);
2279 }
VisitUnsafePutObjectVolatile(HInvoke * invoke)2280 void IntrinsicLocationsBuilderX86_64::VisitUnsafePutObjectVolatile(HInvoke* invoke) {
2281 CreateIntIntIntIntToVoidPlusTempsLocations(allocator_, DataType::Type::kReference, invoke);
2282 }
VisitUnsafePutLong(HInvoke * invoke)2283 void IntrinsicLocationsBuilderX86_64::VisitUnsafePutLong(HInvoke* invoke) {
2284 CreateIntIntIntIntToVoidPlusTempsLocations(allocator_, DataType::Type::kInt64, invoke);
2285 }
VisitUnsafePutLongOrdered(HInvoke * invoke)2286 void IntrinsicLocationsBuilderX86_64::VisitUnsafePutLongOrdered(HInvoke* invoke) {
2287 CreateIntIntIntIntToVoidPlusTempsLocations(allocator_, DataType::Type::kInt64, invoke);
2288 }
VisitUnsafePutLongVolatile(HInvoke * invoke)2289 void IntrinsicLocationsBuilderX86_64::VisitUnsafePutLongVolatile(HInvoke* invoke) {
2290 CreateIntIntIntIntToVoidPlusTempsLocations(allocator_, DataType::Type::kInt64, invoke);
2291 }
2292
2293 // We don't care for ordered: it requires an AnyStore barrier, which is already given by the x86
2294 // memory model.
GenUnsafePut(LocationSummary * locations,DataType::Type type,bool is_volatile,CodeGeneratorX86_64 * codegen)2295 static void GenUnsafePut(LocationSummary* locations, DataType::Type type, bool is_volatile,
2296 CodeGeneratorX86_64* codegen) {
2297 X86_64Assembler* assembler = down_cast<X86_64Assembler*>(codegen->GetAssembler());
2298 CpuRegister base = locations->InAt(1).AsRegister<CpuRegister>();
2299 CpuRegister offset = locations->InAt(2).AsRegister<CpuRegister>();
2300 CpuRegister value = locations->InAt(3).AsRegister<CpuRegister>();
2301
2302 if (type == DataType::Type::kInt64) {
2303 __ movq(Address(base, offset, ScaleFactor::TIMES_1, 0), value);
2304 } else if (kPoisonHeapReferences && type == DataType::Type::kReference) {
2305 CpuRegister temp = locations->GetTemp(0).AsRegister<CpuRegister>();
2306 __ movl(temp, value);
2307 __ PoisonHeapReference(temp);
2308 __ movl(Address(base, offset, ScaleFactor::TIMES_1, 0), temp);
2309 } else {
2310 __ movl(Address(base, offset, ScaleFactor::TIMES_1, 0), value);
2311 }
2312
2313 if (is_volatile) {
2314 codegen->MemoryFence();
2315 }
2316
2317 if (type == DataType::Type::kReference) {
2318 bool value_can_be_null = true; // TODO: Worth finding out this information?
2319 codegen->MarkGCCard(locations->GetTemp(0).AsRegister<CpuRegister>(),
2320 locations->GetTemp(1).AsRegister<CpuRegister>(),
2321 base,
2322 value,
2323 value_can_be_null);
2324 }
2325 }
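
// Illustrative sketch (hypothetical helper) of the sequence generated above for a volatile
// reference store, ignoring heap poisoning:
//
//   static void UnsafePutObjectVolatile(uint8_t* base, int64_t offset, uint32_t ref) {
//     *reinterpret_cast<uint32_t*>(base + offset) = ref;  // movl [base + offset], value
//     // codegen->MemoryFence(): a StoreLoad barrier, needed only for the volatile case
//     // since plain and ordered x86-64 stores are already ordered with other stores.
//     // codegen->MarkGCCard(...): dirty the card covering `base` so the GC sees the
//     // newly stored reference.
//   }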
2326
VisitUnsafePut(HInvoke * invoke)2327 void IntrinsicCodeGeneratorX86_64::VisitUnsafePut(HInvoke* invoke) {
2328 GenUnsafePut(invoke->GetLocations(), DataType::Type::kInt32, /* is_volatile */ false, codegen_);
2329 }
VisitUnsafePutOrdered(HInvoke * invoke)2330 void IntrinsicCodeGeneratorX86_64::VisitUnsafePutOrdered(HInvoke* invoke) {
2331 GenUnsafePut(invoke->GetLocations(), DataType::Type::kInt32, /* is_volatile */ false, codegen_);
2332 }
VisitUnsafePutVolatile(HInvoke * invoke)2333 void IntrinsicCodeGeneratorX86_64::VisitUnsafePutVolatile(HInvoke* invoke) {
2334 GenUnsafePut(invoke->GetLocations(), DataType::Type::kInt32, /* is_volatile */ true, codegen_);
2335 }
VisitUnsafePutObject(HInvoke * invoke)2336 void IntrinsicCodeGeneratorX86_64::VisitUnsafePutObject(HInvoke* invoke) {
2337 GenUnsafePut(
2338 invoke->GetLocations(), DataType::Type::kReference, /* is_volatile */ false, codegen_);
2339 }
VisitUnsafePutObjectOrdered(HInvoke * invoke)2340 void IntrinsicCodeGeneratorX86_64::VisitUnsafePutObjectOrdered(HInvoke* invoke) {
2341 GenUnsafePut(
2342 invoke->GetLocations(), DataType::Type::kReference, /* is_volatile */ false, codegen_);
2343 }
VisitUnsafePutObjectVolatile(HInvoke * invoke)2344 void IntrinsicCodeGeneratorX86_64::VisitUnsafePutObjectVolatile(HInvoke* invoke) {
2345 GenUnsafePut(
2346 invoke->GetLocations(), DataType::Type::kReference, /* is_volatile */ true, codegen_);
2347 }
VisitUnsafePutLong(HInvoke * invoke)2348 void IntrinsicCodeGeneratorX86_64::VisitUnsafePutLong(HInvoke* invoke) {
2349 GenUnsafePut(invoke->GetLocations(), DataType::Type::kInt64, /* is_volatile */ false, codegen_);
2350 }
VisitUnsafePutLongOrdered(HInvoke * invoke)2351 void IntrinsicCodeGeneratorX86_64::VisitUnsafePutLongOrdered(HInvoke* invoke) {
2352 GenUnsafePut(invoke->GetLocations(), DataType::Type::kInt64, /* is_volatile */ false, codegen_);
2353 }
VisitUnsafePutLongVolatile(HInvoke * invoke)2354 void IntrinsicCodeGeneratorX86_64::VisitUnsafePutLongVolatile(HInvoke* invoke) {
2355 GenUnsafePut(invoke->GetLocations(), DataType::Type::kInt64, /* is_volatile */ true, codegen_);
2356 }
2357
CreateIntIntIntIntIntToInt(ArenaAllocator * allocator,DataType::Type type,HInvoke * invoke)2358 static void CreateIntIntIntIntIntToInt(ArenaAllocator* allocator,
2359 DataType::Type type,
2360 HInvoke* invoke) {
2361 bool can_call = kEmitCompilerReadBarrier &&
2362 kUseBakerReadBarrier &&
2363 (invoke->GetIntrinsic() == Intrinsics::kUnsafeCASObject);
2364 LocationSummary* locations =
2365 new (allocator) LocationSummary(invoke,
2366 can_call
2367 ? LocationSummary::kCallOnSlowPath
2368 : LocationSummary::kNoCall,
2369 kIntrinsified);
2370 locations->SetInAt(0, Location::NoLocation()); // Unused receiver.
2371 locations->SetInAt(1, Location::RequiresRegister());
2372 locations->SetInAt(2, Location::RequiresRegister());
2373 // expected value must be in EAX/RAX.
2374 locations->SetInAt(3, Location::RegisterLocation(RAX));
2375 locations->SetInAt(4, Location::RequiresRegister());
2376
2377 locations->SetOut(Location::RequiresRegister());
2378 if (type == DataType::Type::kReference) {
2379 // Need temporary registers for card-marking, and possibly for
2380 // (Baker) read barrier.
2381 locations->AddTemp(Location::RequiresRegister()); // Possibly used for reference poisoning too.
2382 locations->AddTemp(Location::RequiresRegister());
2383 }
2384 }
2385
VisitUnsafeCASInt(HInvoke * invoke)2386 void IntrinsicLocationsBuilderX86_64::VisitUnsafeCASInt(HInvoke* invoke) {
2387 CreateIntIntIntIntIntToInt(allocator_, DataType::Type::kInt32, invoke);
2388 }
2389
VisitUnsafeCASLong(HInvoke * invoke)2390 void IntrinsicLocationsBuilderX86_64::VisitUnsafeCASLong(HInvoke* invoke) {
2391 CreateIntIntIntIntIntToInt(allocator_, DataType::Type::kInt64, invoke);
2392 }
2393
VisitUnsafeCASObject(HInvoke * invoke)2394 void IntrinsicLocationsBuilderX86_64::VisitUnsafeCASObject(HInvoke* invoke) {
2395 // The only read barrier implementation supporting the
2396 // UnsafeCASObject intrinsic is the Baker-style read barriers.
2397 if (kEmitCompilerReadBarrier && !kUseBakerReadBarrier) {
2398 return;
2399 }
2400
2401 CreateIntIntIntIntIntToInt(allocator_, DataType::Type::kReference, invoke);
2402 }
2403
GenCAS(DataType::Type type,HInvoke * invoke,CodeGeneratorX86_64 * codegen)2404 static void GenCAS(DataType::Type type, HInvoke* invoke, CodeGeneratorX86_64* codegen) {
2405 X86_64Assembler* assembler = down_cast<X86_64Assembler*>(codegen->GetAssembler());
2406 LocationSummary* locations = invoke->GetLocations();
2407
2408 CpuRegister base = locations->InAt(1).AsRegister<CpuRegister>();
2409 CpuRegister offset = locations->InAt(2).AsRegister<CpuRegister>();
2410 CpuRegister expected = locations->InAt(3).AsRegister<CpuRegister>();
2411 // Ensure `expected` is in RAX (required by the CMPXCHG instruction).
2412 DCHECK_EQ(expected.AsRegister(), RAX);
2413 CpuRegister value = locations->InAt(4).AsRegister<CpuRegister>();
2414 Location out_loc = locations->Out();
2415 CpuRegister out = out_loc.AsRegister<CpuRegister>();
2416
2417 if (type == DataType::Type::kReference) {
2418 // The only read barrier implementation supporting the
2419 // UnsafeCASObject intrinsic is the Baker-style read barriers.
2420 DCHECK(!kEmitCompilerReadBarrier || kUseBakerReadBarrier);
2421
2422 CpuRegister temp1 = locations->GetTemp(0).AsRegister<CpuRegister>();
2423 CpuRegister temp2 = locations->GetTemp(1).AsRegister<CpuRegister>();
2424
2425 // Mark card for object assuming new value is stored.
2426 bool value_can_be_null = true; // TODO: Worth finding out this information?
2427 codegen->MarkGCCard(temp1, temp2, base, value, value_can_be_null);
2428
2429 // The address of the field within the holding object.
2430 Address field_addr(base, offset, ScaleFactor::TIMES_1, 0);
2431
2432 if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
2433 // Need to make sure the reference stored in the field is a to-space
2434 // one before attempting the CAS or the CAS could fail incorrectly.
2435 codegen->GenerateReferenceLoadWithBakerReadBarrier(
2436 invoke,
2437 out_loc, // Unused, used only as a "temporary" within the read barrier.
2438 base,
2439 field_addr,
2440 /* needs_null_check */ false,
2441 /* always_update_field */ true,
2442 &temp1,
2443 &temp2);
2444 }
2445
2446 bool base_equals_value = (base.AsRegister() == value.AsRegister());
2447 Register value_reg = value.AsRegister();
2448 if (kPoisonHeapReferences) {
2449 if (base_equals_value) {
2450 // If `base` and `value` are the same register location, move
2451 // `value_reg` to a temporary register. This way, poisoning
2452 // `value_reg` won't invalidate `base`.
2453 value_reg = temp1.AsRegister();
2454 __ movl(CpuRegister(value_reg), base);
2455 }
2456
2457 // Check that the register allocator did not assign the location
2458 // of `expected` (RAX) to `value` nor to `base`, so that heap
2459 // poisoning (when enabled) works as intended below.
2460 // - If `value` were equal to `expected`, both references would
2461 // be poisoned twice, meaning they would not be poisoned at
2462 // all, as heap poisoning uses address negation.
2463 // - If `base` were equal to `expected`, poisoning `expected`
2464 // would invalidate `base`.
2465 DCHECK_NE(value_reg, expected.AsRegister());
2466 DCHECK_NE(base.AsRegister(), expected.AsRegister());
2467
2468 __ PoisonHeapReference(expected);
2469 __ PoisonHeapReference(CpuRegister(value_reg));
2470 }
2471
2472 __ LockCmpxchgl(field_addr, CpuRegister(value_reg));
2473
2474 // LOCK CMPXCHG has full barrier semantics, and we don't need
2475 // scheduling barriers at this time.
2476
2477 // Convert ZF into the Boolean result.
2478 __ setcc(kZero, out);
2479 __ movzxb(out, out);
2480
2481 // If heap poisoning is enabled, we need to unpoison the values
2482 // that were poisoned earlier.
2483 if (kPoisonHeapReferences) {
2484 if (base_equals_value) {
2485 // `value_reg` has been moved to a temporary register, no need
2486 // to unpoison it.
2487 } else {
2488 // Ensure `value` is different from `out`, so that unpoisoning
2489 // the former does not invalidate the latter.
2490 DCHECK_NE(value_reg, out.AsRegister());
2491 __ UnpoisonHeapReference(CpuRegister(value_reg));
2492 }
2493 // Ensure `expected` is different from `out`, so that unpoisoning
2494 // the former does not invalidate the latter.
2495 DCHECK_NE(expected.AsRegister(), out.AsRegister());
2496 __ UnpoisonHeapReference(expected);
2497 }
2498 } else {
2499 if (type == DataType::Type::kInt32) {
2500 __ LockCmpxchgl(Address(base, offset, TIMES_1, 0), value);
2501 } else if (type == DataType::Type::kInt64) {
2502 __ LockCmpxchgq(Address(base, offset, TIMES_1, 0), value);
2503 } else {
2504 LOG(FATAL) << "Unexpected CAS type " << type;
2505 }
2506
2507 // LOCK CMPXCHG has full barrier semantics, and we don't need
2508 // scheduling barriers at this time.
2509
2510 // Convert ZF into the Boolean result.
2511 __ setcc(kZero, out);
2512 __ movzxb(out, out);
2513 }
2514 }
2515
2516 void IntrinsicCodeGeneratorX86_64::VisitUnsafeCASInt(HInvoke* invoke) {
2517 GenCAS(DataType::Type::kInt32, invoke, codegen_);
2518 }
2519
2520 void IntrinsicCodeGeneratorX86_64::VisitUnsafeCASLong(HInvoke* invoke) {
2521 GenCAS(DataType::Type::kInt64, invoke, codegen_);
2522 }
2523
2524 void IntrinsicCodeGeneratorX86_64::VisitUnsafeCASObject(HInvoke* invoke) {
2525 // The only read barrier implementation supporting the
2526 // UnsafeCASObject intrinsic is the Baker-style read barrier.
2527 DCHECK(!kEmitCompilerReadBarrier || kUseBakerReadBarrier);
2528
2529 GenCAS(DataType::Type::kReference, invoke, codegen_);
2530 }
2531
2532 void IntrinsicLocationsBuilderX86_64::VisitIntegerReverse(HInvoke* invoke) {
2533 LocationSummary* locations =
2534 new (allocator_) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
2535 locations->SetInAt(0, Location::RequiresRegister());
2536 locations->SetOut(Location::SameAsFirstInput());
2537 locations->AddTemp(Location::RequiresRegister());
2538 }
2539
2540 static void SwapBits(CpuRegister reg, CpuRegister temp, int32_t shift, int32_t mask,
2541 X86_64Assembler* assembler) {
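// Computes reg = ((reg >> shift) & mask) | ((reg & mask) << shift), i.e. swaps the
// bit groups selected by `mask` with their neighbours `shift` bits away.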
2542 Immediate imm_shift(shift);
2543 Immediate imm_mask(mask);
2544 __ movl(temp, reg);
2545 __ shrl(reg, imm_shift);
2546 __ andl(temp, imm_mask);
2547 __ andl(reg, imm_mask);
2548 __ shll(temp, imm_shift);
2549 __ orl(reg, temp);
2550 }
2551
2552 void IntrinsicCodeGeneratorX86_64::VisitIntegerReverse(HInvoke* invoke) {
2553 X86_64Assembler* assembler = GetAssembler();
2554 LocationSummary* locations = invoke->GetLocations();
2555
2556 CpuRegister reg = locations->InAt(0).AsRegister<CpuRegister>();
2557 CpuRegister temp = locations->GetTemp(0).AsRegister<CpuRegister>();
2558
2559 /*
2560 * Reverse the byte order with one bswap instruction first, then use 3 rounds of
2561 * bit swapping to reverse the bits in number x. Using bswap saves instructions
2562 * compared to the generic luni (libcore) implementation, which needs 5 rounds of bit swapping.
2563 * x = bswap x
2564 * x = (x & 0x55555555) << 1 | (x >> 1) & 0x55555555;
2565 * x = (x & 0x33333333) << 2 | (x >> 2) & 0x33333333;
2566 * x = (x & 0x0F0F0F0F) << 4 | (x >> 4) & 0x0F0F0F0F;
2567 */
2568 __ bswapl(reg);
2569 SwapBits(reg, temp, 1, 0x55555555, assembler);
2570 SwapBits(reg, temp, 2, 0x33333333, assembler);
2571 SwapBits(reg, temp, 4, 0x0f0f0f0f, assembler);
2572 }
2573
2574 void IntrinsicLocationsBuilderX86_64::VisitLongReverse(HInvoke* invoke) {
2575 LocationSummary* locations =
2576 new (allocator_) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
2577 locations->SetInAt(0, Location::RequiresRegister());
2578 locations->SetOut(Location::SameAsFirstInput());
2579 locations->AddTemp(Location::RequiresRegister());
2580 locations->AddTemp(Location::RequiresRegister());
2581 }
2582
2583 static void SwapBits64(CpuRegister reg, CpuRegister temp, CpuRegister temp_mask,
2584 int32_t shift, int64_t mask, X86_64Assembler* assembler) {
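// Same as SwapBits, but for 64-bit values: a 64-bit mask does not fit in an x86-64
// immediate operand, so it is materialized in `temp_mask` first.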
2585 Immediate imm_shift(shift);
2586 __ movq(temp_mask, Immediate(mask));
2587 __ movq(temp, reg);
2588 __ shrq(reg, imm_shift);
2589 __ andq(temp, temp_mask);
2590 __ andq(reg, temp_mask);
2591 __ shlq(temp, imm_shift);
2592 __ orq(reg, temp);
2593 }
2594
2595 void IntrinsicCodeGeneratorX86_64::VisitLongReverse(HInvoke* invoke) {
2596 X86_64Assembler* assembler = GetAssembler();
2597 LocationSummary* locations = invoke->GetLocations();
2598
2599 CpuRegister reg = locations->InAt(0).AsRegister<CpuRegister>();
2600 CpuRegister temp1 = locations->GetTemp(0).AsRegister<CpuRegister>();
2601 CpuRegister temp2 = locations->GetTemp(1).AsRegister<CpuRegister>();
2602
2603 /*
2604 * Reverse the byte order with one bswap instruction first, then use 3 rounds of
2605 * bit swapping to reverse the bits in long number x. Using bswap saves instructions
2606 * compared to the generic luni (libcore) implementation, which needs 5 rounds of bit swapping.
2607 * x = bswap x
2608 * x = (x & 0x5555555555555555) << 1 | (x >> 1) & 0x5555555555555555;
2609 * x = (x & 0x3333333333333333) << 2 | (x >> 2) & 0x3333333333333333;
2610 * x = (x & 0x0F0F0F0F0F0F0F0F) << 4 | (x >> 4) & 0x0F0F0F0F0F0F0F0F;
2611 */
2612 __ bswapq(reg);
2613 SwapBits64(reg, temp1, temp2, 1, INT64_C(0x5555555555555555), assembler);
2614 SwapBits64(reg, temp1, temp2, 2, INT64_C(0x3333333333333333), assembler);
2615 SwapBits64(reg, temp1, temp2, 4, INT64_C(0x0f0f0f0f0f0f0f0f), assembler);
2616 }
2617
2618 static void CreateBitCountLocations(
2619 ArenaAllocator* allocator, CodeGeneratorX86_64* codegen, HInvoke* invoke) {
2620 if (!codegen->GetInstructionSetFeatures().HasPopCnt()) {
2621 // Do nothing if there is no popcnt support. This results in generating
2622 // a call for the intrinsic rather than direct code.
2623 return;
2624 }
2625 LocationSummary* locations =
2626 new (allocator) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
2627 locations->SetInAt(0, Location::Any());
2628 locations->SetOut(Location::RequiresRegister());
2629 }
2630
2631 static void GenBitCount(X86_64Assembler* assembler,
2632 CodeGeneratorX86_64* codegen,
2633 HInvoke* invoke,
2634 bool is_long) {
2635 LocationSummary* locations = invoke->GetLocations();
2636 Location src = locations->InAt(0);
2637 CpuRegister out = locations->Out().AsRegister<CpuRegister>();
2638
2639 if (invoke->InputAt(0)->IsConstant()) {
2640 // Evaluate this at compile time.
2641 int64_t value = Int64FromConstant(invoke->InputAt(0)->AsConstant());
2642 int32_t result = is_long
2643 ? POPCOUNT(static_cast<uint64_t>(value))
2644 : POPCOUNT(static_cast<uint32_t>(value));
2645 codegen->Load32BitValue(out, result);
2646 return;
2647 }
2648
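// Non-constant input: a single POPCNT computes the bit count, reading the operand
// either from a register or directly from its stack slot.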
2649 if (src.IsRegister()) {
2650 if (is_long) {
2651 __ popcntq(out, src.AsRegister<CpuRegister>());
2652 } else {
2653 __ popcntl(out, src.AsRegister<CpuRegister>());
2654 }
2655 } else if (is_long) {
2656 DCHECK(src.IsDoubleStackSlot());
2657 __ popcntq(out, Address(CpuRegister(RSP), src.GetStackIndex()));
2658 } else {
2659 DCHECK(src.IsStackSlot());
2660 __ popcntl(out, Address(CpuRegister(RSP), src.GetStackIndex()));
2661 }
2662 }
2663
2664 void IntrinsicLocationsBuilderX86_64::VisitIntegerBitCount(HInvoke* invoke) {
2665 CreateBitCountLocations(allocator_, codegen_, invoke);
2666 }
2667
2668 void IntrinsicCodeGeneratorX86_64::VisitIntegerBitCount(HInvoke* invoke) {
2669 GenBitCount(GetAssembler(), codegen_, invoke, /* is_long */ false);
2670 }
2671
2672 void IntrinsicLocationsBuilderX86_64::VisitLongBitCount(HInvoke* invoke) {
2673 CreateBitCountLocations(allocator_, codegen_, invoke);
2674 }
2675
2676 void IntrinsicCodeGeneratorX86_64::VisitLongBitCount(HInvoke* invoke) {
2677 GenBitCount(GetAssembler(), codegen_, invoke, /* is_long */ true);
2678 }
2679
2680 static void CreateOneBitLocations(ArenaAllocator* allocator, HInvoke* invoke, bool is_high) {
2681 LocationSummary* locations =
2682 new (allocator) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
2683 locations->SetInAt(0, Location::Any());
2684 locations->SetOut(Location::RequiresRegister());
2685 locations->AddTemp(is_high ? Location::RegisterLocation(RCX) // needs CL
2686 : Location::RequiresRegister()); // any will do
2687 }
2688
2689 static void GenOneBit(X86_64Assembler* assembler,
2690 CodeGeneratorX86_64* codegen,
2691 HInvoke* invoke,
2692 bool is_high, bool is_long) {
2693 LocationSummary* locations = invoke->GetLocations();
2694 Location src = locations->InAt(0);
2695 CpuRegister out = locations->Out().AsRegister<CpuRegister>();
2696
2697 if (invoke->InputAt(0)->IsConstant()) {
2698 // Evaluate this at compile time.
2699 int64_t value = Int64FromConstant(invoke->InputAt(0)->AsConstant());
2700 if (value == 0) {
2701 __ xorl(out, out); // Clears upper bits too.
2702 return;
2703 }
2704 // Nonzero value.
2705 if (is_high) {
2706 value = is_long ? 63 - CLZ(static_cast<uint64_t>(value))
2707 : 31 - CLZ(static_cast<uint32_t>(value));
2708 } else {
2709 value = is_long ? CTZ(static_cast<uint64_t>(value))
2710 : CTZ(static_cast<uint32_t>(value));
2711 }
2712 if (is_long) {
2713 codegen->Load64BitValue(out, 1ULL << value);
2714 } else {
2715 codegen->Load32BitValue(out, 1 << value);
2716 }
2717 return;
2718 }
2719
2720 // Handle the non-constant cases.
2721 CpuRegister tmp = locations->GetTemp(0).AsRegister<CpuRegister>();
2722 if (is_high) {
2723 // Use architectural support: basically 1 << bsr.
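// For example, BSR of 0b0001'0100 yields 4, and 1 << 4 == 0b0001'0000, the highest one bit.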
2724 if (src.IsRegister()) {
2725 if (is_long) {
2726 __ bsrq(tmp, src.AsRegister<CpuRegister>());
2727 } else {
2728 __ bsrl(tmp, src.AsRegister<CpuRegister>());
2729 }
2730 } else if (is_long) {
2731 DCHECK(src.IsDoubleStackSlot());
2732 __ bsrq(tmp, Address(CpuRegister(RSP), src.GetStackIndex()));
2733 } else {
2734 DCHECK(src.IsStackSlot());
2735 __ bsrl(tmp, Address(CpuRegister(RSP), src.GetStackIndex()));
2736 }
2737 // BSR sets ZF if the input was zero.
2738 NearLabel is_zero, done;
2739 __ j(kEqual, &is_zero);
2740 __ movl(out, Immediate(1)); // Clears upper bits too.
2741 if (is_long) {
2742 __ shlq(out, tmp);
2743 } else {
2744 __ shll(out, tmp);
2745 }
2746 __ jmp(&done);
2747 __ Bind(&is_zero);
2748 __ xorl(out, out); // Clears upper bits too.
2749 __ Bind(&done);
2750 } else {
2751 // Copy input into temporary.
2752 if (src.IsRegister()) {
2753 if (is_long) {
2754 __ movq(tmp, src.AsRegister<CpuRegister>());
2755 } else {
2756 __ movl(tmp, src.AsRegister<CpuRegister>());
2757 }
2758 } else if (is_long) {
2759 DCHECK(src.IsDoubleStackSlot());
2760 __ movq(tmp, Address(CpuRegister(RSP), src.GetStackIndex()));
2761 } else {
2762 DCHECK(src.IsStackSlot());
2763 __ movl(tmp, Address(CpuRegister(RSP), src.GetStackIndex()));
2764 }
2765 // Do the bit twiddling: isolate the lowest set bit, basically out = tmp & -tmp.
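// For example, 0b0001'0100 & -0b0001'0100 == 0b0000'0100, the lowest one bit.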
2766 if (is_long) {
2767 __ movq(out, tmp);
2768 __ negq(tmp);
2769 __ andq(out, tmp);
2770 } else {
2771 __ movl(out, tmp);
2772 __ negl(tmp);
2773 __ andl(out, tmp);
2774 }
2775 }
2776 }
2777
2778 void IntrinsicLocationsBuilderX86_64::VisitIntegerHighestOneBit(HInvoke* invoke) {
2779 CreateOneBitLocations(allocator_, invoke, /* is_high */ true);
2780 }
2781
2782 void IntrinsicCodeGeneratorX86_64::VisitIntegerHighestOneBit(HInvoke* invoke) {
2783 GenOneBit(GetAssembler(), codegen_, invoke, /* is_high */ true, /* is_long */ false);
2784 }
2785
2786 void IntrinsicLocationsBuilderX86_64::VisitLongHighestOneBit(HInvoke* invoke) {
2787 CreateOneBitLocations(allocator_, invoke, /* is_high */ true);
2788 }
2789
2790 void IntrinsicCodeGeneratorX86_64::VisitLongHighestOneBit(HInvoke* invoke) {
2791 GenOneBit(GetAssembler(), codegen_, invoke, /* is_high */ true, /* is_long */ true);
2792 }
2793
2794 void IntrinsicLocationsBuilderX86_64::VisitIntegerLowestOneBit(HInvoke* invoke) {
2795 CreateOneBitLocations(allocator_, invoke, /* is_high */ false);
2796 }
2797
2798 void IntrinsicCodeGeneratorX86_64::VisitIntegerLowestOneBit(HInvoke* invoke) {
2799 GenOneBit(GetAssembler(), codegen_, invoke, /* is_high */ false, /* is_long */ false);
2800 }
2801
2802 void IntrinsicLocationsBuilderX86_64::VisitLongLowestOneBit(HInvoke* invoke) {
2803 CreateOneBitLocations(allocator_, invoke, /* is_high */ false);
2804 }
2805
2806 void IntrinsicCodeGeneratorX86_64::VisitLongLowestOneBit(HInvoke* invoke) {
2807 GenOneBit(GetAssembler(), codegen_, invoke, /* is_high */ false, /* is_long */ true);
2808 }
2809
2810 static void CreateLeadingZeroLocations(ArenaAllocator* allocator, HInvoke* invoke) {
2811 LocationSummary* locations =
2812 new (allocator) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
2813 locations->SetInAt(0, Location::Any());
2814 locations->SetOut(Location::RequiresRegister());
2815 }
2816
2817 static void GenLeadingZeros(X86_64Assembler* assembler,
2818 CodeGeneratorX86_64* codegen,
2819 HInvoke* invoke, bool is_long) {
2820 LocationSummary* locations = invoke->GetLocations();
2821 Location src = locations->InAt(0);
2822 CpuRegister out = locations->Out().AsRegister<CpuRegister>();
2823
2824 int zero_value_result = is_long ? 64 : 32;
2825 if (invoke->InputAt(0)->IsConstant()) {
2826 // Evaluate this at compile time.
2827 int64_t value = Int64FromConstant(invoke->InputAt(0)->AsConstant());
2828 if (value == 0) {
2829 value = zero_value_result;
2830 } else {
2831 value = is_long ? CLZ(static_cast<uint64_t>(value)) : CLZ(static_cast<uint32_t>(value));
2832 }
2833 codegen->Load32BitValue(out, value);
2834 return;
2835 }
2836
2837 // Handle the non-constant cases.
2838 if (src.IsRegister()) {
2839 if (is_long) {
2840 __ bsrq(out, src.AsRegister<CpuRegister>());
2841 } else {
2842 __ bsrl(out, src.AsRegister<CpuRegister>());
2843 }
2844 } else if (is_long) {
2845 DCHECK(src.IsDoubleStackSlot());
2846 __ bsrq(out, Address(CpuRegister(RSP), src.GetStackIndex()));
2847 } else {
2848 DCHECK(src.IsStackSlot());
2849 __ bsrl(out, Address(CpuRegister(RSP), src.GetStackIndex()));
2850 }
2851
2852 // BSR sets ZF if the input was zero, and the output is undefined.
2853 NearLabel is_zero, done;
2854 __ j(kEqual, &is_zero);
2855
2856 // Correct the result from BSR to get the CLZ result.
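// CLZ(x) == (bit_width - 1) - BSR(x); since BSR(x) lies in [0, bit_width - 1],
// the subtraction can be done as an XOR with (bit_width - 1).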
2857 __ xorl(out, Immediate(zero_value_result - 1));
2858 __ jmp(&done);
2859
2860 // Fix the zero case with the expected result.
2861 __ Bind(&is_zero);
2862 __ movl(out, Immediate(zero_value_result));
2863
2864 __ Bind(&done);
2865 }
2866
2867 void IntrinsicLocationsBuilderX86_64::VisitIntegerNumberOfLeadingZeros(HInvoke* invoke) {
2868 CreateLeadingZeroLocations(allocator_, invoke);
2869 }
2870
2871 void IntrinsicCodeGeneratorX86_64::VisitIntegerNumberOfLeadingZeros(HInvoke* invoke) {
2872 GenLeadingZeros(GetAssembler(), codegen_, invoke, /* is_long */ false);
2873 }
2874
2875 void IntrinsicLocationsBuilderX86_64::VisitLongNumberOfLeadingZeros(HInvoke* invoke) {
2876 CreateLeadingZeroLocations(allocator_, invoke);
2877 }
2878
2879 void IntrinsicCodeGeneratorX86_64::VisitLongNumberOfLeadingZeros(HInvoke* invoke) {
2880 GenLeadingZeros(GetAssembler(), codegen_, invoke, /* is_long */ true);
2881 }
2882
2883 static void CreateTrailingZeroLocations(ArenaAllocator* allocator, HInvoke* invoke) {
2884 LocationSummary* locations =
2885 new (allocator) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
2886 locations->SetInAt(0, Location::Any());
2887 locations->SetOut(Location::RequiresRegister());
2888 }
2889
2890 static void GenTrailingZeros(X86_64Assembler* assembler,
2891 CodeGeneratorX86_64* codegen,
2892 HInvoke* invoke, bool is_long) {
2893 LocationSummary* locations = invoke->GetLocations();
2894 Location src = locations->InAt(0);
2895 CpuRegister out = locations->Out().AsRegister<CpuRegister>();
2896
2897 int zero_value_result = is_long ? 64 : 32;
2898 if (invoke->InputAt(0)->IsConstant()) {
2899 // Evaluate this at compile time.
2900 int64_t value = Int64FromConstant(invoke->InputAt(0)->AsConstant());
2901 if (value == 0) {
2902 value = zero_value_result;
2903 } else {
2904 value = is_long ? CTZ(static_cast<uint64_t>(value)) : CTZ(static_cast<uint32_t>(value));
2905 }
2906 codegen->Load32BitValue(out, value);
2907 return;
2908 }
2909
2910 // Handle the non-constant cases.
2911 if (src.IsRegister()) {
2912 if (is_long) {
2913 __ bsfq(out, src.AsRegister<CpuRegister>());
2914 } else {
2915 __ bsfl(out, src.AsRegister<CpuRegister>());
2916 }
2917 } else if (is_long) {
2918 DCHECK(src.IsDoubleStackSlot());
2919 __ bsfq(out, Address(CpuRegister(RSP), src.GetStackIndex()));
2920 } else {
2921 DCHECK(src.IsStackSlot());
2922 __ bsfl(out, Address(CpuRegister(RSP), src.GetStackIndex()));
2923 }
2924
2925 // BSF sets ZF if the input was zero, and the output is undefined.
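// For a nonzero input, BSF already yields the trailing-zero count, so no further correction is needed.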
2926 NearLabel done;
2927 __ j(kNotEqual, &done);
2928
2929 // Fix the zero case with the expected result.
2930 __ movl(out, Immediate(zero_value_result));
2931
2932 __ Bind(&done);
2933 }
2934
2935 void IntrinsicLocationsBuilderX86_64::VisitIntegerNumberOfTrailingZeros(HInvoke* invoke) {
2936 CreateTrailingZeroLocations(allocator_, invoke);
2937 }
2938
2939 void IntrinsicCodeGeneratorX86_64::VisitIntegerNumberOfTrailingZeros(HInvoke* invoke) {
2940 GenTrailingZeros(GetAssembler(), codegen_, invoke, /* is_long */ false);
2941 }
2942
2943 void IntrinsicLocationsBuilderX86_64::VisitLongNumberOfTrailingZeros(HInvoke* invoke) {
2944 CreateTrailingZeroLocations(allocator_, invoke);
2945 }
2946
2947 void IntrinsicCodeGeneratorX86_64::VisitLongNumberOfTrailingZeros(HInvoke* invoke) {
2948 GenTrailingZeros(GetAssembler(), codegen_, invoke, /* is_long */ true);
2949 }
2950
2951 void IntrinsicLocationsBuilderX86_64::VisitIntegerValueOf(HInvoke* invoke) {
2952 InvokeRuntimeCallingConvention calling_convention;
2953 IntrinsicVisitor::ComputeIntegerValueOfLocations(
2954 invoke,
2955 codegen_,
2956 Location::RegisterLocation(RAX),
2957 Location::RegisterLocation(calling_convention.GetRegisterAt(0)));
2958 }
2959
2960 void IntrinsicCodeGeneratorX86_64::VisitIntegerValueOf(HInvoke* invoke) {
2961 IntrinsicVisitor::IntegerValueOfInfo info = IntrinsicVisitor::ComputeIntegerValueOfInfo();
2962 LocationSummary* locations = invoke->GetLocations();
2963 X86_64Assembler* assembler = GetAssembler();
2964
2965 CpuRegister out = locations->Out().AsRegister<CpuRegister>();
2966 InvokeRuntimeCallingConvention calling_convention;
2967 if (invoke->InputAt(0)->IsConstant()) {
2968 int32_t value = invoke->InputAt(0)->AsIntConstant()->GetValue();
2969 if (value >= info.low && value <= info.high) {
2970 // Just embed the j.l.Integer in the code.
2971 ScopedObjectAccess soa(Thread::Current());
2972 mirror::Object* boxed = info.cache->Get(value + (-info.low));
2973 DCHECK(boxed != nullptr && Runtime::Current()->GetHeap()->ObjectIsInBootImageSpace(boxed));
2974 uint32_t address = dchecked_integral_cast<uint32_t>(reinterpret_cast<uintptr_t>(boxed));
2975 __ movl(out, Immediate(static_cast<int32_t>(address)));
2976 } else {
2977 // Allocate and initialize a new j.l.Integer.
2978 // TODO: If we JIT, we could allocate the j.l.Integer now, and store it in the
2979 // JIT object table.
2980 CpuRegister argument = CpuRegister(calling_convention.GetRegisterAt(0));
2981 uint32_t address = dchecked_integral_cast<uint32_t>(reinterpret_cast<uintptr_t>(info.integer));
2982 __ movl(argument, Immediate(static_cast<int32_t>(address)));
2983 codegen_->InvokeRuntime(kQuickAllocObjectInitialized, invoke, invoke->GetDexPc());
2984 CheckEntrypointTypes<kQuickAllocObjectWithChecks, void*, mirror::Class*>();
2985 __ movl(Address(out, info.value_offset), Immediate(value));
2986 }
2987 } else {
2988 CpuRegister in = locations->InAt(0).AsRegister<CpuRegister>();
2989 // Check bounds of our cache.
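// Subtracting info.low and comparing unsigned against the cache length folds the
// lower- and upper-bound checks into a single branch.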
2990 __ leal(out, Address(in, -info.low));
2991 __ cmpl(out, Immediate(info.high - info.low + 1));
2992 NearLabel allocate, done;
2993 __ j(kAboveEqual, &allocate);
2994 // If the value is within the bounds, load the j.l.Integer directly from the array.
2995 uint32_t data_offset = mirror::Array::DataOffset(kHeapReferenceSize).Uint32Value();
2996 uint32_t address = dchecked_integral_cast<uint32_t>(reinterpret_cast<uintptr_t>(info.cache));
2997 if (data_offset + address <= std::numeric_limits<int32_t>::max()) {
2998 __ movl(out, Address(out, TIMES_4, data_offset + address));
2999 } else {
3000 CpuRegister temp = CpuRegister(calling_convention.GetRegisterAt(0));
3001 __ movl(temp, Immediate(static_cast<int32_t>(data_offset + address)));
3002 __ movl(out, Address(temp, out, TIMES_4, 0));
3003 }
3004 __ MaybeUnpoisonHeapReference(out);
3005 __ jmp(&done);
3006 __ Bind(&allocate);
3007 // Otherwise allocate and initialize a new j.l.Integer.
3008 CpuRegister argument = CpuRegister(calling_convention.GetRegisterAt(0));
3009 address = dchecked_integral_cast<uint32_t>(reinterpret_cast<uintptr_t>(info.integer));
3010 __ movl(argument, Immediate(static_cast<int32_t>(address)));
3011 codegen_->InvokeRuntime(kQuickAllocObjectInitialized, invoke, invoke->GetDexPc());
3012 CheckEntrypointTypes<kQuickAllocObjectWithChecks, void*, mirror::Class*>();
3013 __ movl(Address(out, info.value_offset), in);
3014 __ Bind(&done);
3015 }
3016 }
3017
3018 void IntrinsicLocationsBuilderX86_64::VisitThreadInterrupted(HInvoke* invoke) {
3019 LocationSummary* locations =
3020 new (allocator_) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
3021 locations->SetOut(Location::RequiresRegister());
3022 }
3023
3024 void IntrinsicCodeGeneratorX86_64::VisitThreadInterrupted(HInvoke* invoke) {
3025 X86_64Assembler* assembler = GetAssembler();
3026 CpuRegister out = invoke->GetLocations()->Out().AsRegister<CpuRegister>();
3027 Address address = Address::Absolute(
3028 Thread::InterruptedOffset<kX86_64PointerSize>().Int32Value(), /* no_rip */ true);
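// Read the thread-local `interrupted` flag through the GS segment; if it is set,
// clear it and emit a memory fence to order the clearing store with subsequent accesses.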
3029 NearLabel done;
3030 __ gs()->movl(out, address);
3031 __ testl(out, out);
3032 __ j(kEqual, &done);
3033 __ gs()->movl(address, Immediate(0));
3034 codegen_->MemoryFence();
3035 __ Bind(&done);
3036 }
3037
3038 void IntrinsicLocationsBuilderX86_64::VisitReachabilityFence(HInvoke* invoke) {
3039 LocationSummary* locations =
3040 new (allocator_) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
3041 locations->SetInAt(0, Location::Any());
3042 }
3043
3044 void IntrinsicCodeGeneratorX86_64::VisitReachabilityFence(HInvoke* invoke ATTRIBUTE_UNUSED) { }
3045
3046 UNIMPLEMENTED_INTRINSIC(X86_64, ReferenceGetReferent)
3047 UNIMPLEMENTED_INTRINSIC(X86_64, FloatIsInfinite)
3048 UNIMPLEMENTED_INTRINSIC(X86_64, DoubleIsInfinite)
3049
3050 UNIMPLEMENTED_INTRINSIC(X86_64, StringStringIndexOf);
3051 UNIMPLEMENTED_INTRINSIC(X86_64, StringStringIndexOfAfter);
3052 UNIMPLEMENTED_INTRINSIC(X86_64, StringBufferAppend);
3053 UNIMPLEMENTED_INTRINSIC(X86_64, StringBufferLength);
3054 UNIMPLEMENTED_INTRINSIC(X86_64, StringBufferToString);
3055 UNIMPLEMENTED_INTRINSIC(X86_64, StringBuilderAppend);
3056 UNIMPLEMENTED_INTRINSIC(X86_64, StringBuilderLength);
3057 UNIMPLEMENTED_INTRINSIC(X86_64, StringBuilderToString);
3058
3059 // 1.8 (Java 8) Unsafe intrinsics.
3060 UNIMPLEMENTED_INTRINSIC(X86_64, UnsafeGetAndAddInt)
3061 UNIMPLEMENTED_INTRINSIC(X86_64, UnsafeGetAndAddLong)
3062 UNIMPLEMENTED_INTRINSIC(X86_64, UnsafeGetAndSetInt)
3063 UNIMPLEMENTED_INTRINSIC(X86_64, UnsafeGetAndSetLong)
3064 UNIMPLEMENTED_INTRINSIC(X86_64, UnsafeGetAndSetObject)
3065
3066 UNREACHABLE_INTRINSICS(X86_64)
3067
3068 #undef __
3069
3070 } // namespace x86_64
3071 } // namespace art
3072