/*
 * Copyright (C) 2015 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "intrinsics_arm64.h"

#include "arch/arm64/instruction_set_features_arm64.h"
#include "art_method.h"
#include "code_generator_arm64.h"
#include "common_arm64.h"
#include "entrypoints/quick/quick_entrypoints.h"
#include "heap_poisoning.h"
#include "intrinsics.h"
#include "intrinsics_utils.h"
#include "lock_word.h"
#include "mirror/array-inl.h"
#include "mirror/object_array-inl.h"
#include "mirror/reference.h"
#include "mirror/string-inl.h"
#include "scoped_thread_state_change-inl.h"
#include "thread-current-inl.h"
#include "utils/arm64/assembler_arm64.h"

using namespace vixl::aarch64;  // NOLINT(build/namespaces)

// TODO(VIXL): Make VIXL compile with -Wshadow.
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wshadow"
#include "aarch64/disasm-aarch64.h"
#include "aarch64/macro-assembler-aarch64.h"
#pragma GCC diagnostic pop

namespace art {

namespace arm64 {

using helpers::DRegisterFrom;
using helpers::HeapOperand;
using helpers::LocationFrom;
using helpers::OperandFrom;
using helpers::RegisterFrom;
using helpers::SRegisterFrom;
using helpers::WRegisterFrom;
using helpers::XRegisterFrom;
using helpers::HRegisterFrom;
using helpers::InputRegisterAt;
using helpers::OutputRegister;

namespace {

ALWAYS_INLINE inline MemOperand AbsoluteHeapOperandFrom(Location location, size_t offset = 0) {
  return MemOperand(XRegisterFrom(location), offset);
}

}  // namespace

MacroAssembler* IntrinsicCodeGeneratorARM64::GetVIXLAssembler() {
  return codegen_->GetVIXLAssembler();
}

ArenaAllocator* IntrinsicCodeGeneratorARM64::GetAllocator() {
  return codegen_->GetGraph()->GetAllocator();
}

using IntrinsicSlowPathARM64 = IntrinsicSlowPath<InvokeDexCallingConventionVisitorARM64,
                                                 SlowPathCodeARM64,
                                                 Arm64Assembler>;

#define __ codegen->GetVIXLAssembler()->

// Slow path implementing the SystemArrayCopy intrinsic copy loop with read barriers.
class ReadBarrierSystemArrayCopySlowPathARM64 : public SlowPathCodeARM64 {
 public:
  ReadBarrierSystemArrayCopySlowPathARM64(HInstruction* instruction, Location tmp)
      : SlowPathCodeARM64(instruction), tmp_(tmp) {
    DCHECK(kEmitCompilerReadBarrier);
    DCHECK(kUseBakerReadBarrier);
  }

  void EmitNativeCode(CodeGenerator* codegen_in) override {
    CodeGeneratorARM64* codegen = down_cast<CodeGeneratorARM64*>(codegen_in);
    LocationSummary* locations = instruction_->GetLocations();
    DCHECK(locations->CanCall());
    DCHECK(instruction_->IsInvokeStaticOrDirect())
        << "Unexpected instruction in read barrier arraycopy slow path: "
        << instruction_->DebugName();
    DCHECK(instruction_->GetLocations()->Intrinsified());
    DCHECK_EQ(instruction_->AsInvoke()->GetIntrinsic(), Intrinsics::kSystemArrayCopy);

    const int32_t element_size = DataType::Size(DataType::Type::kReference);

    Register src_curr_addr = XRegisterFrom(locations->GetTemp(0));
    Register dst_curr_addr = XRegisterFrom(locations->GetTemp(1));
    Register src_stop_addr = XRegisterFrom(locations->GetTemp(2));
    Register tmp_reg = WRegisterFrom(tmp_);

    __ Bind(GetEntryLabel());
    vixl::aarch64::Label slow_copy_loop;
    __ Bind(&slow_copy_loop);
    __ Ldr(tmp_reg, MemOperand(src_curr_addr, element_size, PostIndex));
    codegen->GetAssembler()->MaybeUnpoisonHeapReference(tmp_reg);
    // TODO: Inline the mark bit check before calling the runtime?
    // tmp_reg = ReadBarrier::Mark(tmp_reg);
    // No need to save live registers; it's taken care of by the
    // entrypoint. Also, there is no need to update the stack mask,
    // as this runtime call will not trigger a garbage collection.
    // (See ReadBarrierMarkSlowPathARM64::EmitNativeCode for more
    // explanations.)
    DCHECK_NE(tmp_.reg(), LR);
    DCHECK_NE(tmp_.reg(), WSP);
    DCHECK_NE(tmp_.reg(), WZR);
    // IP0 is used internally by the ReadBarrierMarkRegX entry point
    // as a temporary (and not preserved). It thus cannot be used by
    // any live register in this slow path.
    DCHECK_NE(LocationFrom(src_curr_addr).reg(), IP0);
    DCHECK_NE(LocationFrom(dst_curr_addr).reg(), IP0);
    DCHECK_NE(LocationFrom(src_stop_addr).reg(), IP0);
    DCHECK_NE(tmp_.reg(), IP0);
    DCHECK(0 <= tmp_.reg() && tmp_.reg() < kNumberOfWRegisters) << tmp_.reg();
    // TODO: Load the entrypoint once before the loop, instead of
    // loading it at every iteration.
    int32_t entry_point_offset =
        Thread::ReadBarrierMarkEntryPointsOffset<kArm64PointerSize>(tmp_.reg());
    // This runtime call does not require a stack map.
    codegen->InvokeRuntimeWithoutRecordingPcInfo(entry_point_offset, instruction_, this);
    codegen->GetAssembler()->MaybePoisonHeapReference(tmp_reg);
    __ Str(tmp_reg, MemOperand(dst_curr_addr, element_size, PostIndex));
    __ Cmp(src_curr_addr, src_stop_addr);
    __ B(&slow_copy_loop, ne);
    __ B(GetExitLabel());
  }

  const char* GetDescription() const override { return "ReadBarrierSystemArrayCopySlowPathARM64"; }

 private:
  Location tmp_;

  DISALLOW_COPY_AND_ASSIGN(ReadBarrierSystemArrayCopySlowPathARM64);
};
#undef __

bool IntrinsicLocationsBuilderARM64::TryDispatch(HInvoke* invoke) {
  Dispatch(invoke);
  LocationSummary* res = invoke->GetLocations();
  if (res == nullptr) {
    return false;
  }
  return res->Intrinsified();
}

#define __ masm->

static void CreateFPToIntLocations(ArenaAllocator* allocator, HInvoke* invoke) {
  LocationSummary* locations =
      new (allocator) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
  locations->SetInAt(0, Location::RequiresFpuRegister());
  locations->SetOut(Location::RequiresRegister());
}

static void CreateIntToFPLocations(ArenaAllocator* allocator, HInvoke* invoke) {
  LocationSummary* locations =
      new (allocator) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
  locations->SetInAt(0, Location::RequiresRegister());
  locations->SetOut(Location::RequiresFpuRegister());
}

static void MoveFPToInt(LocationSummary* locations, bool is64bit, MacroAssembler* masm) {
  Location input = locations->InAt(0);
  Location output = locations->Out();
  __ Fmov(is64bit ? XRegisterFrom(output) : WRegisterFrom(output),
          is64bit ? DRegisterFrom(input) : SRegisterFrom(input));
}

static void MoveIntToFP(LocationSummary* locations, bool is64bit, MacroAssembler* masm) {
  Location input = locations->InAt(0);
  Location output = locations->Out();
  __ Fmov(is64bit ? DRegisterFrom(output) : SRegisterFrom(output),
          is64bit ? XRegisterFrom(input) : WRegisterFrom(input));
}

void IntrinsicLocationsBuilderARM64::VisitDoubleDoubleToRawLongBits(HInvoke* invoke) {
  CreateFPToIntLocations(allocator_, invoke);
}
void IntrinsicLocationsBuilderARM64::VisitDoubleLongBitsToDouble(HInvoke* invoke) {
  CreateIntToFPLocations(allocator_, invoke);
}

void IntrinsicCodeGeneratorARM64::VisitDoubleDoubleToRawLongBits(HInvoke* invoke) {
  MoveFPToInt(invoke->GetLocations(), /* is64bit= */ true, GetVIXLAssembler());
}
void IntrinsicCodeGeneratorARM64::VisitDoubleLongBitsToDouble(HInvoke* invoke) {
  MoveIntToFP(invoke->GetLocations(), /* is64bit= */ true, GetVIXLAssembler());
}

void IntrinsicLocationsBuilderARM64::VisitFloatFloatToRawIntBits(HInvoke* invoke) {
  CreateFPToIntLocations(allocator_, invoke);
}
void IntrinsicLocationsBuilderARM64::VisitFloatIntBitsToFloat(HInvoke* invoke) {
  CreateIntToFPLocations(allocator_, invoke);
}

void IntrinsicCodeGeneratorARM64::VisitFloatFloatToRawIntBits(HInvoke* invoke) {
  MoveFPToInt(invoke->GetLocations(), /* is64bit= */ false, GetVIXLAssembler());
}
void IntrinsicCodeGeneratorARM64::VisitFloatIntBitsToFloat(HInvoke* invoke) {
  MoveIntToFP(invoke->GetLocations(), /* is64bit= */ false, GetVIXLAssembler());
}

static void CreateIntToIntLocations(ArenaAllocator* allocator, HInvoke* invoke) {
  LocationSummary* locations =
      new (allocator) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
  locations->SetInAt(0, Location::RequiresRegister());
  locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap);
}

static void CreateIntIntToIntLocations(ArenaAllocator* allocator, HInvoke* invoke) {
  LocationSummary* locations =
      new (allocator) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
  locations->SetInAt(0, Location::RequiresRegister());
  locations->SetInAt(1, Location::RequiresRegister());
  locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap);
}

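// Byte reversal: a single REV handles Integer.reverseBytes() and Long.reverseBytes(). For
// Short.reverseBytes(), REV16 swaps the bytes within each halfword and the following SXTH
// sign-extends the low halfword, since the Java method returns a (sign-extended) short.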
static void GenReverseBytes(LocationSummary* locations,
                            DataType::Type type,
                            MacroAssembler* masm) {
  Location in = locations->InAt(0);
  Location out = locations->Out();

  switch (type) {
    case DataType::Type::kInt16:
      __ Rev16(WRegisterFrom(out), WRegisterFrom(in));
      __ Sxth(WRegisterFrom(out), WRegisterFrom(out));
      break;
    case DataType::Type::kInt32:
    case DataType::Type::kInt64:
      __ Rev(RegisterFrom(out, type), RegisterFrom(in, type));
      break;
    default:
      LOG(FATAL) << "Unexpected size for reverse-bytes: " << type;
      UNREACHABLE();
  }
}

void IntrinsicLocationsBuilderARM64::VisitIntegerReverseBytes(HInvoke* invoke) {
  CreateIntToIntLocations(allocator_, invoke);
}

void IntrinsicCodeGeneratorARM64::VisitIntegerReverseBytes(HInvoke* invoke) {
  GenReverseBytes(invoke->GetLocations(), DataType::Type::kInt32, GetVIXLAssembler());
}

void IntrinsicLocationsBuilderARM64::VisitLongReverseBytes(HInvoke* invoke) {
  CreateIntToIntLocations(allocator_, invoke);
}

void IntrinsicCodeGeneratorARM64::VisitLongReverseBytes(HInvoke* invoke) {
  GenReverseBytes(invoke->GetLocations(), DataType::Type::kInt64, GetVIXLAssembler());
}

void IntrinsicLocationsBuilderARM64::VisitShortReverseBytes(HInvoke* invoke) {
  CreateIntToIntLocations(allocator_, invoke);
}

void IntrinsicCodeGeneratorARM64::VisitShortReverseBytes(HInvoke* invoke) {
  GenReverseBytes(invoke->GetLocations(), DataType::Type::kInt16, GetVIXLAssembler());
}

static void GenNumberOfLeadingZeros(LocationSummary* locations,
                                    DataType::Type type,
                                    MacroAssembler* masm) {
  DCHECK(type == DataType::Type::kInt32 || type == DataType::Type::kInt64);

  Location in = locations->InAt(0);
  Location out = locations->Out();

  __ Clz(RegisterFrom(out, type), RegisterFrom(in, type));
}

void IntrinsicLocationsBuilderARM64::VisitIntegerNumberOfLeadingZeros(HInvoke* invoke) {
  CreateIntToIntLocations(allocator_, invoke);
}

void IntrinsicCodeGeneratorARM64::VisitIntegerNumberOfLeadingZeros(HInvoke* invoke) {
  GenNumberOfLeadingZeros(invoke->GetLocations(), DataType::Type::kInt32, GetVIXLAssembler());
}

void IntrinsicLocationsBuilderARM64::VisitLongNumberOfLeadingZeros(HInvoke* invoke) {
  CreateIntToIntLocations(allocator_, invoke);
}

void IntrinsicCodeGeneratorARM64::VisitLongNumberOfLeadingZeros(HInvoke* invoke) {
  GenNumberOfLeadingZeros(invoke->GetLocations(), DataType::Type::kInt64, GetVIXLAssembler());
}

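// The A64 instruction set targeted here has no dedicated count-trailing-zeros instruction, so
// the standard RBIT + CLZ idiom is used: reversing the bits turns trailing zeros into leading
// zeros, which CLZ then counts.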
static void GenNumberOfTrailingZeros(LocationSummary* locations,
                                     DataType::Type type,
                                     MacroAssembler* masm) {
  DCHECK(type == DataType::Type::kInt32 || type == DataType::Type::kInt64);

  Location in = locations->InAt(0);
  Location out = locations->Out();

  __ Rbit(RegisterFrom(out, type), RegisterFrom(in, type));
  __ Clz(RegisterFrom(out, type), RegisterFrom(out, type));
}

void IntrinsicLocationsBuilderARM64::VisitIntegerNumberOfTrailingZeros(HInvoke* invoke) {
  CreateIntToIntLocations(allocator_, invoke);
}

void IntrinsicCodeGeneratorARM64::VisitIntegerNumberOfTrailingZeros(HInvoke* invoke) {
  GenNumberOfTrailingZeros(invoke->GetLocations(), DataType::Type::kInt32, GetVIXLAssembler());
}

void IntrinsicLocationsBuilderARM64::VisitLongNumberOfTrailingZeros(HInvoke* invoke) {
  CreateIntToIntLocations(allocator_, invoke);
}

void IntrinsicCodeGeneratorARM64::VisitLongNumberOfTrailingZeros(HInvoke* invoke) {
  GenNumberOfTrailingZeros(invoke->GetLocations(), DataType::Type::kInt64, GetVIXLAssembler());
}

static void GenReverse(LocationSummary* locations,
                       DataType::Type type,
                       MacroAssembler* masm) {
  DCHECK(type == DataType::Type::kInt32 || type == DataType::Type::kInt64);

  Location in = locations->InAt(0);
  Location out = locations->Out();

  __ Rbit(RegisterFrom(out, type), RegisterFrom(in, type));
}

void IntrinsicLocationsBuilderARM64::VisitIntegerReverse(HInvoke* invoke) {
  CreateIntToIntLocations(allocator_, invoke);
}

void IntrinsicCodeGeneratorARM64::VisitIntegerReverse(HInvoke* invoke) {
  GenReverse(invoke->GetLocations(), DataType::Type::kInt32, GetVIXLAssembler());
}

void IntrinsicLocationsBuilderARM64::VisitLongReverse(HInvoke* invoke) {
  CreateIntToIntLocations(allocator_, invoke);
}

void IntrinsicCodeGeneratorARM64::VisitLongReverse(HInvoke* invoke) {
  GenReverse(invoke->GetLocations(), DataType::Type::kInt64, GetVIXLAssembler());
}

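// Bit counting: A64 has no general-purpose popcount instruction, so the value is moved into a
// SIMD register, CNT computes a per-byte population count, ADDV sums the byte counts into a
// single lane, and the result is moved back to a core register.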
static void GenBitCount(HInvoke* instr, DataType::Type type, MacroAssembler* masm) {
  DCHECK(DataType::IsIntOrLongType(type)) << type;
  DCHECK_EQ(instr->GetType(), DataType::Type::kInt32);
  DCHECK_EQ(DataType::Kind(instr->InputAt(0)->GetType()), type);

  UseScratchRegisterScope temps(masm);

  Register src = InputRegisterAt(instr, 0);
  Register dst = RegisterFrom(instr->GetLocations()->Out(), type);
  VRegister fpr = (type == DataType::Type::kInt64) ? temps.AcquireD() : temps.AcquireS();

  __ Fmov(fpr, src);
  __ Cnt(fpr.V8B(), fpr.V8B());
  __ Addv(fpr.B(), fpr.V8B());
  __ Fmov(dst, fpr);
}

void IntrinsicLocationsBuilderARM64::VisitLongBitCount(HInvoke* invoke) {
  CreateIntToIntLocations(allocator_, invoke);
}

void IntrinsicCodeGeneratorARM64::VisitLongBitCount(HInvoke* invoke) {
  GenBitCount(invoke, DataType::Type::kInt64, GetVIXLAssembler());
}

void IntrinsicLocationsBuilderARM64::VisitIntegerBitCount(HInvoke* invoke) {
  CreateIntToIntLocations(allocator_, invoke);
}

void IntrinsicCodeGeneratorARM64::VisitIntegerBitCount(HInvoke* invoke) {
  GenBitCount(invoke, DataType::Type::kInt32, GetVIXLAssembler());
}

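// Integer/Long.highestOneBit(): for src != 0 the result is (1 << high_bit) >> clz(src). When
// src == 0, CLZ returns the full register width (32 or 64), whose single set bit lands exactly
// on high_bit after the shift in the BIC operand, so BIC clears dst and the result is 0.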
static void GenHighestOneBit(HInvoke* invoke, DataType::Type type, MacroAssembler* masm) {
  DCHECK(type == DataType::Type::kInt32 || type == DataType::Type::kInt64);

  UseScratchRegisterScope temps(masm);

  Register src = InputRegisterAt(invoke, 0);
  Register dst = RegisterFrom(invoke->GetLocations()->Out(), type);
  Register temp = (type == DataType::Type::kInt64) ? temps.AcquireX() : temps.AcquireW();
  size_t high_bit = (type == DataType::Type::kInt64) ? 63u : 31u;
  size_t clz_high_bit = (type == DataType::Type::kInt64) ? 6u : 5u;

  __ Clz(temp, src);
  __ Mov(dst, UINT64_C(1) << high_bit);  // MOV (bitmask immediate)
  __ Bic(dst, dst, Operand(temp, LSL, high_bit - clz_high_bit));  // Clear dst if src was 0.
  __ Lsr(dst, dst, temp);
}

void IntrinsicLocationsBuilderARM64::VisitIntegerHighestOneBit(HInvoke* invoke) {
  CreateIntToIntLocations(allocator_, invoke);
}

void IntrinsicCodeGeneratorARM64::VisitIntegerHighestOneBit(HInvoke* invoke) {
  GenHighestOneBit(invoke, DataType::Type::kInt32, GetVIXLAssembler());
}

void IntrinsicLocationsBuilderARM64::VisitLongHighestOneBit(HInvoke* invoke) {
  CreateIntToIntLocations(allocator_, invoke);
}

void IntrinsicCodeGeneratorARM64::VisitLongHighestOneBit(HInvoke* invoke) {
  GenHighestOneBit(invoke, DataType::Type::kInt64, GetVIXLAssembler());
}

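// Integer/Long.lowestOneBit(): the NEG + AND pair computes the classic src & -src, which
// isolates the lowest set bit and naturally yields 0 for a zero input.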
static void GenLowestOneBit(HInvoke* invoke, DataType::Type type, MacroAssembler* masm) {
  DCHECK(type == DataType::Type::kInt32 || type == DataType::Type::kInt64);

  UseScratchRegisterScope temps(masm);

  Register src = InputRegisterAt(invoke, 0);
  Register dst = RegisterFrom(invoke->GetLocations()->Out(), type);
  Register temp = (type == DataType::Type::kInt64) ? temps.AcquireX() : temps.AcquireW();

  __ Neg(temp, src);
  __ And(dst, temp, src);
}

void IntrinsicLocationsBuilderARM64::VisitIntegerLowestOneBit(HInvoke* invoke) {
  CreateIntToIntLocations(allocator_, invoke);
}

void IntrinsicCodeGeneratorARM64::VisitIntegerLowestOneBit(HInvoke* invoke) {
  GenLowestOneBit(invoke, DataType::Type::kInt32, GetVIXLAssembler());
}

void IntrinsicLocationsBuilderARM64::VisitLongLowestOneBit(HInvoke* invoke) {
  CreateIntToIntLocations(allocator_, invoke);
}

void IntrinsicCodeGeneratorARM64::VisitLongLowestOneBit(HInvoke* invoke) {
  GenLowestOneBit(invoke, DataType::Type::kInt64, GetVIXLAssembler());
}

static void CreateFPToFPLocations(ArenaAllocator* allocator, HInvoke* invoke) {
  LocationSummary* locations =
      new (allocator) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
  locations->SetInAt(0, Location::RequiresFpuRegister());
  locations->SetOut(Location::RequiresFpuRegister(), Location::kNoOutputOverlap);
}

void IntrinsicLocationsBuilderARM64::VisitMathSqrt(HInvoke* invoke) {
  CreateFPToFPLocations(allocator_, invoke);
}

void IntrinsicCodeGeneratorARM64::VisitMathSqrt(HInvoke* invoke) {
  LocationSummary* locations = invoke->GetLocations();
  MacroAssembler* masm = GetVIXLAssembler();
  __ Fsqrt(DRegisterFrom(locations->Out()), DRegisterFrom(locations->InAt(0)));
}

void IntrinsicLocationsBuilderARM64::VisitMathCeil(HInvoke* invoke) {
  CreateFPToFPLocations(allocator_, invoke);
}

void IntrinsicCodeGeneratorARM64::VisitMathCeil(HInvoke* invoke) {
  LocationSummary* locations = invoke->GetLocations();
  MacroAssembler* masm = GetVIXLAssembler();
  __ Frintp(DRegisterFrom(locations->Out()), DRegisterFrom(locations->InAt(0)));
}

void IntrinsicLocationsBuilderARM64::VisitMathFloor(HInvoke* invoke) {
  CreateFPToFPLocations(allocator_, invoke);
}

void IntrinsicCodeGeneratorARM64::VisitMathFloor(HInvoke* invoke) {
  LocationSummary* locations = invoke->GetLocations();
  MacroAssembler* masm = GetVIXLAssembler();
  __ Frintm(DRegisterFrom(locations->Out()), DRegisterFrom(locations->InAt(0)));
}

void IntrinsicLocationsBuilderARM64::VisitMathRint(HInvoke* invoke) {
  CreateFPToFPLocations(allocator_, invoke);
}

void IntrinsicCodeGeneratorARM64::VisitMathRint(HInvoke* invoke) {
  LocationSummary* locations = invoke->GetLocations();
  MacroAssembler* masm = GetVIXLAssembler();
  __ Frintn(DRegisterFrom(locations->Out()), DRegisterFrom(locations->InAt(0)));
}

static void CreateFPToIntPlusFPTempLocations(ArenaAllocator* allocator, HInvoke* invoke) {
  LocationSummary* locations =
      new (allocator) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
  locations->SetInAt(0, Location::RequiresFpuRegister());
  locations->SetOut(Location::RequiresRegister());
  locations->AddTemp(Location::RequiresFpuRegister());
}

static void GenMathRound(HInvoke* invoke, bool is_double, vixl::aarch64::MacroAssembler* masm) {
  // Java 8 API definition for Math.round():
  // Return the closest long or int to the argument, with ties rounding to positive infinity.
  //
  // There is no single instruction in ARMv8 that can support the above definition.
  // We choose to use FCVTAS here because it has the closest semantics.
  // FCVTAS performs rounding to nearest integer, ties away from zero.
  // For most inputs (positive values, zero or NaN), this instruction is enough.
  // We only need a little extra handling after FCVTAS if the input is a negative half value
  // (a negative tie).
  //
  // The reason we did not choose FCVTPS here is that although it rounds toward positive
  // infinity, it does not round to nearest. For example, FCVTPS(-1.9) = -1 and FCVTPS(1.1) = 2.
  // If we used that instruction, more handling code would be needed for most inputs.
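  //
  // Worked example for the negative-tie fix-up below: Math.round(-2.5) must return -2, but
  // FCVTAS(-2.5) yields -3. FRINTA(-2.5) = -3.0, so in - FRINTA(in) = 0.5; the FCMP against 0.5
  // then makes CINC bump the result from -3 to -2. For non-tie negative inputs the fraction is
  // not 0.5 and the FCVTAS result is kept unchanged.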
  LocationSummary* l = invoke->GetLocations();
  VRegister in_reg = is_double ? DRegisterFrom(l->InAt(0)) : SRegisterFrom(l->InAt(0));
  VRegister tmp_fp = is_double ? DRegisterFrom(l->GetTemp(0)) : SRegisterFrom(l->GetTemp(0));
  Register out_reg = is_double ? XRegisterFrom(l->Out()) : WRegisterFrom(l->Out());
  vixl::aarch64::Label done;

  // Round to nearest integer, ties away from zero.
  __ Fcvtas(out_reg, in_reg);

  // For positive values, zero or NaN inputs, rounding is done.
  __ Tbz(out_reg, out_reg.GetSizeInBits() - 1, &done);

  // Handle input < 0 cases.
  // If input is negative but not a tie, previous result (round to nearest) is valid.
  // If input is a negative tie, out_reg += 1.
  __ Frinta(tmp_fp, in_reg);
  __ Fsub(tmp_fp, in_reg, tmp_fp);
  __ Fcmp(tmp_fp, 0.5);
  __ Cinc(out_reg, out_reg, eq);

  __ Bind(&done);
}

void IntrinsicLocationsBuilderARM64::VisitMathRoundDouble(HInvoke* invoke) {
  CreateFPToIntPlusFPTempLocations(allocator_, invoke);
}

void IntrinsicCodeGeneratorARM64::VisitMathRoundDouble(HInvoke* invoke) {
  GenMathRound(invoke, /* is_double= */ true, GetVIXLAssembler());
}

void IntrinsicLocationsBuilderARM64::VisitMathRoundFloat(HInvoke* invoke) {
  CreateFPToIntPlusFPTempLocations(allocator_, invoke);
}

void IntrinsicCodeGeneratorARM64::VisitMathRoundFloat(HInvoke* invoke) {
  GenMathRound(invoke, /* is_double= */ false, GetVIXLAssembler());
}

void IntrinsicLocationsBuilderARM64::VisitMemoryPeekByte(HInvoke* invoke) {
  CreateIntToIntLocations(allocator_, invoke);
}

void IntrinsicCodeGeneratorARM64::VisitMemoryPeekByte(HInvoke* invoke) {
  MacroAssembler* masm = GetVIXLAssembler();
  __ Ldrsb(WRegisterFrom(invoke->GetLocations()->Out()),
           AbsoluteHeapOperandFrom(invoke->GetLocations()->InAt(0), 0));
}

void IntrinsicLocationsBuilderARM64::VisitMemoryPeekIntNative(HInvoke* invoke) {
  CreateIntToIntLocations(allocator_, invoke);
}

void IntrinsicCodeGeneratorARM64::VisitMemoryPeekIntNative(HInvoke* invoke) {
  MacroAssembler* masm = GetVIXLAssembler();
  __ Ldr(WRegisterFrom(invoke->GetLocations()->Out()),
         AbsoluteHeapOperandFrom(invoke->GetLocations()->InAt(0), 0));
}

void IntrinsicLocationsBuilderARM64::VisitMemoryPeekLongNative(HInvoke* invoke) {
  CreateIntToIntLocations(allocator_, invoke);
}

void IntrinsicCodeGeneratorARM64::VisitMemoryPeekLongNative(HInvoke* invoke) {
  MacroAssembler* masm = GetVIXLAssembler();
  __ Ldr(XRegisterFrom(invoke->GetLocations()->Out()),
         AbsoluteHeapOperandFrom(invoke->GetLocations()->InAt(0), 0));
}

void IntrinsicLocationsBuilderARM64::VisitMemoryPeekShortNative(HInvoke* invoke) {
  CreateIntToIntLocations(allocator_, invoke);
}

void IntrinsicCodeGeneratorARM64::VisitMemoryPeekShortNative(HInvoke* invoke) {
  MacroAssembler* masm = GetVIXLAssembler();
  __ Ldrsh(WRegisterFrom(invoke->GetLocations()->Out()),
           AbsoluteHeapOperandFrom(invoke->GetLocations()->InAt(0), 0));
}

static void CreateIntIntToVoidLocations(ArenaAllocator* allocator, HInvoke* invoke) {
  LocationSummary* locations =
      new (allocator) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
  locations->SetInAt(0, Location::RequiresRegister());
  locations->SetInAt(1, Location::RequiresRegister());
}

void IntrinsicLocationsBuilderARM64::VisitMemoryPokeByte(HInvoke* invoke) {
  CreateIntIntToVoidLocations(allocator_, invoke);
}

void IntrinsicCodeGeneratorARM64::VisitMemoryPokeByte(HInvoke* invoke) {
  MacroAssembler* masm = GetVIXLAssembler();
  __ Strb(WRegisterFrom(invoke->GetLocations()->InAt(1)),
          AbsoluteHeapOperandFrom(invoke->GetLocations()->InAt(0), 0));
}

void IntrinsicLocationsBuilderARM64::VisitMemoryPokeIntNative(HInvoke* invoke) {
  CreateIntIntToVoidLocations(allocator_, invoke);
}

void IntrinsicCodeGeneratorARM64::VisitMemoryPokeIntNative(HInvoke* invoke) {
  MacroAssembler* masm = GetVIXLAssembler();
  __ Str(WRegisterFrom(invoke->GetLocations()->InAt(1)),
         AbsoluteHeapOperandFrom(invoke->GetLocations()->InAt(0), 0));
}

void IntrinsicLocationsBuilderARM64::VisitMemoryPokeLongNative(HInvoke* invoke) {
  CreateIntIntToVoidLocations(allocator_, invoke);
}

void IntrinsicCodeGeneratorARM64::VisitMemoryPokeLongNative(HInvoke* invoke) {
  MacroAssembler* masm = GetVIXLAssembler();
  __ Str(XRegisterFrom(invoke->GetLocations()->InAt(1)),
         AbsoluteHeapOperandFrom(invoke->GetLocations()->InAt(0), 0));
}

void IntrinsicLocationsBuilderARM64::VisitMemoryPokeShortNative(HInvoke* invoke) {
  CreateIntIntToVoidLocations(allocator_, invoke);
}

void IntrinsicCodeGeneratorARM64::VisitMemoryPokeShortNative(HInvoke* invoke) {
  MacroAssembler* masm = GetVIXLAssembler();
  __ Strh(WRegisterFrom(invoke->GetLocations()->InAt(1)),
          AbsoluteHeapOperandFrom(invoke->GetLocations()->InAt(0), 0));
}

void IntrinsicLocationsBuilderARM64::VisitThreadCurrentThread(HInvoke* invoke) {
  LocationSummary* locations =
      new (allocator_) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
  locations->SetOut(Location::RequiresRegister());
}

void IntrinsicCodeGeneratorARM64::VisitThreadCurrentThread(HInvoke* invoke) {
  codegen_->Load(DataType::Type::kReference, WRegisterFrom(invoke->GetLocations()->Out()),
                 MemOperand(tr, Thread::PeerOffset<kArm64PointerSize>().Int32Value()));
}

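// Unsafe.get*(): with Baker read barriers enabled, a reference get goes through
// GenerateFieldLoadWithBakerReadBarrier() so the loaded reference is marked if needed.
// Otherwise a plain load (or a load-acquire for the volatile variants) is emitted, followed by
// a slow-path read barrier for reference results when read barriers are enabled.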
static void GenUnsafeGet(HInvoke* invoke,
                         DataType::Type type,
                         bool is_volatile,
                         CodeGeneratorARM64* codegen) {
  LocationSummary* locations = invoke->GetLocations();
  DCHECK((type == DataType::Type::kInt32) ||
         (type == DataType::Type::kInt64) ||
         (type == DataType::Type::kReference));
  Location base_loc = locations->InAt(1);
  Register base = WRegisterFrom(base_loc);      // Object pointer.
  Location offset_loc = locations->InAt(2);
  Register offset = XRegisterFrom(offset_loc);  // Long offset.
  Location trg_loc = locations->Out();
  Register trg = RegisterFrom(trg_loc, type);

  if (type == DataType::Type::kReference && kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
    // UnsafeGetObject/UnsafeGetObjectVolatile with Baker's read barrier case.
    Register temp = WRegisterFrom(locations->GetTemp(0));
    MacroAssembler* masm = codegen->GetVIXLAssembler();
    // Piggy-back on the field load path using introspection for the Baker read barrier.
    __ Add(temp, base, offset.W());  // Offset should not exceed 32 bits.
    codegen->GenerateFieldLoadWithBakerReadBarrier(invoke,
                                                   trg_loc,
                                                   base,
                                                   MemOperand(temp.X()),
                                                   /* needs_null_check= */ false,
                                                   is_volatile);
  } else {
    // Other cases.
    MemOperand mem_op(base.X(), offset);
    if (is_volatile) {
      codegen->LoadAcquire(invoke, trg, mem_op, /* needs_null_check= */ true);
    } else {
      codegen->Load(type, trg, mem_op);
    }

    if (type == DataType::Type::kReference) {
      DCHECK(trg.IsW());
      codegen->MaybeGenerateReadBarrierSlow(invoke, trg_loc, trg_loc, base_loc, 0u, offset_loc);
    }
  }
}

static void CreateIntIntIntToIntLocations(ArenaAllocator* allocator, HInvoke* invoke) {
  bool can_call = kEmitCompilerReadBarrier &&
      (invoke->GetIntrinsic() == Intrinsics::kUnsafeGetObject ||
       invoke->GetIntrinsic() == Intrinsics::kUnsafeGetObjectVolatile);
  LocationSummary* locations =
      new (allocator) LocationSummary(invoke,
                                      can_call
                                          ? LocationSummary::kCallOnSlowPath
                                          : LocationSummary::kNoCall,
                                      kIntrinsified);
  if (can_call && kUseBakerReadBarrier) {
    locations->SetCustomSlowPathCallerSaves(RegisterSet::Empty());  // No caller-save registers.
    // We need a temporary register for the read barrier load in order to use
    // CodeGeneratorARM64::GenerateFieldLoadWithBakerReadBarrier().
    locations->AddTemp(FixedTempLocation());
  }
  locations->SetInAt(0, Location::NoLocation());  // Unused receiver.
  locations->SetInAt(1, Location::RequiresRegister());
  locations->SetInAt(2, Location::RequiresRegister());
  locations->SetOut(Location::RequiresRegister(),
                    (can_call ? Location::kOutputOverlap : Location::kNoOutputOverlap));
}

void IntrinsicLocationsBuilderARM64::VisitUnsafeGet(HInvoke* invoke) {
  CreateIntIntIntToIntLocations(allocator_, invoke);
}
void IntrinsicLocationsBuilderARM64::VisitUnsafeGetVolatile(HInvoke* invoke) {
  CreateIntIntIntToIntLocations(allocator_, invoke);
}
void IntrinsicLocationsBuilderARM64::VisitUnsafeGetLong(HInvoke* invoke) {
  CreateIntIntIntToIntLocations(allocator_, invoke);
}
void IntrinsicLocationsBuilderARM64::VisitUnsafeGetLongVolatile(HInvoke* invoke) {
  CreateIntIntIntToIntLocations(allocator_, invoke);
}
void IntrinsicLocationsBuilderARM64::VisitUnsafeGetObject(HInvoke* invoke) {
  CreateIntIntIntToIntLocations(allocator_, invoke);
}
void IntrinsicLocationsBuilderARM64::VisitUnsafeGetObjectVolatile(HInvoke* invoke) {
  CreateIntIntIntToIntLocations(allocator_, invoke);
}

void IntrinsicCodeGeneratorARM64::VisitUnsafeGet(HInvoke* invoke) {
  GenUnsafeGet(invoke, DataType::Type::kInt32, /* is_volatile= */ false, codegen_);
}
void IntrinsicCodeGeneratorARM64::VisitUnsafeGetVolatile(HInvoke* invoke) {
  GenUnsafeGet(invoke, DataType::Type::kInt32, /* is_volatile= */ true, codegen_);
}
void IntrinsicCodeGeneratorARM64::VisitUnsafeGetLong(HInvoke* invoke) {
  GenUnsafeGet(invoke, DataType::Type::kInt64, /* is_volatile= */ false, codegen_);
}
void IntrinsicCodeGeneratorARM64::VisitUnsafeGetLongVolatile(HInvoke* invoke) {
  GenUnsafeGet(invoke, DataType::Type::kInt64, /* is_volatile= */ true, codegen_);
}
void IntrinsicCodeGeneratorARM64::VisitUnsafeGetObject(HInvoke* invoke) {
  GenUnsafeGet(invoke, DataType::Type::kReference, /* is_volatile= */ false, codegen_);
}
void IntrinsicCodeGeneratorARM64::VisitUnsafeGetObjectVolatile(HInvoke* invoke) {
  GenUnsafeGet(invoke, DataType::Type::kReference, /* is_volatile= */ true, codegen_);
}

static void CreateIntIntIntIntToVoid(ArenaAllocator* allocator, HInvoke* invoke) {
  LocationSummary* locations =
      new (allocator) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
  locations->SetInAt(0, Location::NoLocation());  // Unused receiver.
  locations->SetInAt(1, Location::RequiresRegister());
  locations->SetInAt(2, Location::RequiresRegister());
  locations->SetInAt(3, Location::RequiresRegister());
}

void IntrinsicLocationsBuilderARM64::VisitUnsafePut(HInvoke* invoke) {
  CreateIntIntIntIntToVoid(allocator_, invoke);
}
void IntrinsicLocationsBuilderARM64::VisitUnsafePutOrdered(HInvoke* invoke) {
  CreateIntIntIntIntToVoid(allocator_, invoke);
}
void IntrinsicLocationsBuilderARM64::VisitUnsafePutVolatile(HInvoke* invoke) {
  CreateIntIntIntIntToVoid(allocator_, invoke);
}
void IntrinsicLocationsBuilderARM64::VisitUnsafePutObject(HInvoke* invoke) {
  CreateIntIntIntIntToVoid(allocator_, invoke);
}
void IntrinsicLocationsBuilderARM64::VisitUnsafePutObjectOrdered(HInvoke* invoke) {
  CreateIntIntIntIntToVoid(allocator_, invoke);
}
void IntrinsicLocationsBuilderARM64::VisitUnsafePutObjectVolatile(HInvoke* invoke) {
  CreateIntIntIntIntToVoid(allocator_, invoke);
}
void IntrinsicLocationsBuilderARM64::VisitUnsafePutLong(HInvoke* invoke) {
  CreateIntIntIntIntToVoid(allocator_, invoke);
}
void IntrinsicLocationsBuilderARM64::VisitUnsafePutLongOrdered(HInvoke* invoke) {
  CreateIntIntIntIntToVoid(allocator_, invoke);
}
void IntrinsicLocationsBuilderARM64::VisitUnsafePutLongVolatile(HInvoke* invoke) {
  CreateIntIntIntIntToVoid(allocator_, invoke);
}

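// Unsafe.put*(): both the volatile and the "ordered" (lazySet) variants are emitted as a
// store-release via StoreRelease(), which provides at least the ordering either one requires.
// When heap poisoning is enabled, reference values are poisoned in a scratch register so the
// caller's value register is left untouched, and a GC card is marked after a reference store.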
static void GenUnsafePut(HInvoke* invoke,
                         DataType::Type type,
                         bool is_volatile,
                         bool is_ordered,
                         CodeGeneratorARM64* codegen) {
  LocationSummary* locations = invoke->GetLocations();
  MacroAssembler* masm = codegen->GetVIXLAssembler();

  Register base = WRegisterFrom(locations->InAt(1));    // Object pointer.
  Register offset = XRegisterFrom(locations->InAt(2));  // Long offset.
  Register value = RegisterFrom(locations->InAt(3), type);
  Register source = value;
  MemOperand mem_op(base.X(), offset);

  {
    // We use a block to end the scratch scope before the write barrier, thus
    // freeing the temporary registers so they can be used in `MarkGCCard`.
    UseScratchRegisterScope temps(masm);

    if (kPoisonHeapReferences && type == DataType::Type::kReference) {
      DCHECK(value.IsW());
      Register temp = temps.AcquireW();
      __ Mov(temp.W(), value.W());
      codegen->GetAssembler()->PoisonHeapReference(temp.W());
      source = temp;
    }

    if (is_volatile || is_ordered) {
      codegen->StoreRelease(invoke, type, source, mem_op, /* needs_null_check= */ false);
    } else {
      codegen->Store(type, source, mem_op);
    }
  }

  if (type == DataType::Type::kReference) {
    bool value_can_be_null = true;  // TODO: Worth finding out this information?
    codegen->MarkGCCard(base, value, value_can_be_null);
  }
}

void IntrinsicCodeGeneratorARM64::VisitUnsafePut(HInvoke* invoke) {
  GenUnsafePut(invoke,
               DataType::Type::kInt32,
               /* is_volatile= */ false,
               /* is_ordered= */ false,
               codegen_);
}
void IntrinsicCodeGeneratorARM64::VisitUnsafePutOrdered(HInvoke* invoke) {
  GenUnsafePut(invoke,
               DataType::Type::kInt32,
               /* is_volatile= */ false,
               /* is_ordered= */ true,
               codegen_);
}
void IntrinsicCodeGeneratorARM64::VisitUnsafePutVolatile(HInvoke* invoke) {
  GenUnsafePut(invoke,
               DataType::Type::kInt32,
               /* is_volatile= */ true,
               /* is_ordered= */ false,
               codegen_);
}
void IntrinsicCodeGeneratorARM64::VisitUnsafePutObject(HInvoke* invoke) {
  GenUnsafePut(invoke,
               DataType::Type::kReference,
               /* is_volatile= */ false,
               /* is_ordered= */ false,
               codegen_);
}
void IntrinsicCodeGeneratorARM64::VisitUnsafePutObjectOrdered(HInvoke* invoke) {
  GenUnsafePut(invoke,
               DataType::Type::kReference,
               /* is_volatile= */ false,
               /* is_ordered= */ true,
               codegen_);
}
void IntrinsicCodeGeneratorARM64::VisitUnsafePutObjectVolatile(HInvoke* invoke) {
  GenUnsafePut(invoke,
               DataType::Type::kReference,
               /* is_volatile= */ true,
               /* is_ordered= */ false,
               codegen_);
}
void IntrinsicCodeGeneratorARM64::VisitUnsafePutLong(HInvoke* invoke) {
  GenUnsafePut(invoke,
               DataType::Type::kInt64,
               /* is_volatile= */ false,
               /* is_ordered= */ false,
               codegen_);
}
void IntrinsicCodeGeneratorARM64::VisitUnsafePutLongOrdered(HInvoke* invoke) {
  GenUnsafePut(invoke,
               DataType::Type::kInt64,
               /* is_volatile= */ false,
               /* is_ordered= */ true,
               codegen_);
}
void IntrinsicCodeGeneratorARM64::VisitUnsafePutLongVolatile(HInvoke* invoke) {
  GenUnsafePut(invoke,
               DataType::Type::kInt64,
               /* is_volatile= */ true,
               /* is_ordered= */ false,
               codegen_);
}

static void CreateIntIntIntIntIntToInt(ArenaAllocator* allocator,
                                       HInvoke* invoke,
                                       DataType::Type type) {
  bool can_call = kEmitCompilerReadBarrier &&
      kUseBakerReadBarrier &&
      (invoke->GetIntrinsic() == Intrinsics::kUnsafeCASObject);
  LocationSummary* locations =
      new (allocator) LocationSummary(invoke,
                                      can_call
                                          ? LocationSummary::kCallOnSlowPath
                                          : LocationSummary::kNoCall,
                                      kIntrinsified);
  if (can_call) {
    locations->SetCustomSlowPathCallerSaves(RegisterSet::Empty());  // No caller-save registers.
  }
  locations->SetInAt(0, Location::NoLocation());  // Unused receiver.
  locations->SetInAt(1, Location::RequiresRegister());
  locations->SetInAt(2, Location::RequiresRegister());
  locations->SetInAt(3, Location::RequiresRegister());
  locations->SetInAt(4, Location::RequiresRegister());

  locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap);
  if (type == DataType::Type::kReference && kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
    // We need two non-scratch temporary registers for (Baker) read barrier.
    locations->AddTemp(Location::RequiresRegister());
    locations->AddTemp(Location::RequiresRegister());
  }
}

class BakerReadBarrierCasSlowPathARM64 : public SlowPathCodeARM64 {
 public:
  explicit BakerReadBarrierCasSlowPathARM64(HInvoke* invoke)
      : SlowPathCodeARM64(invoke) {}

  const char* GetDescription() const override { return "BakerReadBarrierCasSlowPathARM64"; }

  void EmitNativeCode(CodeGenerator* codegen) override {
    CodeGeneratorARM64* arm64_codegen = down_cast<CodeGeneratorARM64*>(codegen);
    Arm64Assembler* assembler = arm64_codegen->GetAssembler();
    MacroAssembler* masm = assembler->GetVIXLAssembler();
    __ Bind(GetEntryLabel());

    // Get the locations.
    LocationSummary* locations = instruction_->GetLocations();
    Register base = WRegisterFrom(locations->InAt(1));      // Object pointer.
    Register offset = XRegisterFrom(locations->InAt(2));    // Long offset.
    Register expected = WRegisterFrom(locations->InAt(3));  // Expected.
    Register value = WRegisterFrom(locations->InAt(4));     // Value.

    Register old_value = WRegisterFrom(locations->GetTemp(0));  // The old value from main path.
    Register marked = WRegisterFrom(locations->GetTemp(1));     // The marked old value.

    // Mark the `old_value` from the main path and compare with `expected`. This clobbers the
    // `tmp_ptr` scratch register but we do not want to allocate another non-scratch temporary.
    arm64_codegen->GenerateUnsafeCasOldValueMovWithBakerReadBarrier(marked, old_value);
    __ Cmp(marked, expected);
    __ B(GetExitLabel(), ne);  // If taken, Z=false indicates failure.

    // The `old_value` we have read did not match `expected` (which is always a to-space
    // reference) but after the read barrier in GenerateUnsafeCasOldValueMovWithBakerReadBarrier()
    // the marked to-space value matched, so the `old_value` must be a from-space reference to the
    // same object. Do the same CAS loop as the main path but check for both `expected` and the
    // unmarked old value representing the to-space and from-space references for the same object.

    UseScratchRegisterScope temps(masm);
    Register tmp_ptr = temps.AcquireX();
    Register tmp = temps.AcquireSameSizeAs(value);

    // Recalculate the `tmp_ptr` clobbered above.
    __ Add(tmp_ptr, base.X(), Operand(offset));

    // do {
    //   tmp_value = [tmp_ptr];
    // } while ((tmp_value == expected || tmp == old_value) && failure([tmp_ptr] <- r_new_value));
    // result = (tmp_value == expected || tmp == old_value);

    vixl::aarch64::Label loop_head;
    __ Bind(&loop_head);
    __ Ldaxr(tmp, MemOperand(tmp_ptr));
    assembler->MaybeUnpoisonHeapReference(tmp);
    __ Cmp(tmp, expected);
    __ Ccmp(tmp, old_value, ZFlag, ne);
    __ B(GetExitLabel(), ne);  // If taken, Z=false indicates failure.
    assembler->MaybePoisonHeapReference(value);
    __ Stlxr(tmp.W(), value, MemOperand(tmp_ptr));
    assembler->MaybeUnpoisonHeapReference(value);
    __ Cbnz(tmp.W(), &loop_head);

    // Z=true from the above CMP+CCMP indicates success.
    __ B(GetExitLabel());
  }
};

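// The CAS itself is a classic LDAXR/STLXR exclusive loop: load-acquire-exclusive the current
// value, branch to `failure` if it does not equal `expected`, otherwise attempt the
// store-release-exclusive of the new value and retry while the store's status result (written
// into the reused `old_value` register) is non-zero. The final CSET materializes the boolean
// result from the flags set on exit.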
static void GenCas(HInvoke* invoke, DataType::Type type, CodeGeneratorARM64* codegen) {
  Arm64Assembler* assembler = codegen->GetAssembler();
  MacroAssembler* masm = assembler->GetVIXLAssembler();
  LocationSummary* locations = invoke->GetLocations();

  Register out = WRegisterFrom(locations->Out());               // Boolean result.
  Register base = WRegisterFrom(locations->InAt(1));            // Object pointer.
  Register offset = XRegisterFrom(locations->InAt(2));          // Long offset.
  Register expected = RegisterFrom(locations->InAt(3), type);   // Expected.
  Register value = RegisterFrom(locations->InAt(4), type);      // Value.

  // This needs to be before the temp registers, as MarkGCCard also uses VIXL temps.
  if (type == DataType::Type::kReference) {
    // Mark card for object assuming new value is stored.
    bool value_can_be_null = true;  // TODO: Worth finding out this information?
    codegen->MarkGCCard(base, value, value_can_be_null);
  }

  UseScratchRegisterScope temps(masm);
  Register tmp_ptr = temps.AcquireX();  // Pointer to actual memory.
  Register old_value;                   // Value in memory.

  vixl::aarch64::Label exit_loop_label;
  vixl::aarch64::Label* exit_loop = &exit_loop_label;
  vixl::aarch64::Label* failure = &exit_loop_label;

  if (kEmitCompilerReadBarrier && type == DataType::Type::kReference) {
    // The only read barrier implementation supporting the
    // UnsafeCASObject intrinsic is the Baker-style read barriers.
    DCHECK(kUseBakerReadBarrier);

    BakerReadBarrierCasSlowPathARM64* slow_path =
        new (codegen->GetScopedAllocator()) BakerReadBarrierCasSlowPathARM64(invoke);
    codegen->AddSlowPath(slow_path);
    exit_loop = slow_path->GetExitLabel();
    failure = slow_path->GetEntryLabel();
    // We need to store the `old_value` in a non-scratch register to make sure
    // the Baker read barrier in the slow path does not clobber it.
    old_value = WRegisterFrom(locations->GetTemp(0));
  } else {
    old_value = temps.AcquireSameSizeAs(value);
  }

  __ Add(tmp_ptr, base.X(), Operand(offset));

  // do {
  //   tmp_value = [tmp_ptr];
  // } while (tmp_value == expected && failure([tmp_ptr] <- r_new_value));
  // result = tmp_value == expected;

  vixl::aarch64::Label loop_head;
  __ Bind(&loop_head);
  __ Ldaxr(old_value, MemOperand(tmp_ptr));
  if (type == DataType::Type::kReference) {
    assembler->MaybeUnpoisonHeapReference(old_value);
  }
  __ Cmp(old_value, expected);
  __ B(failure, ne);
  if (type == DataType::Type::kReference) {
    assembler->MaybePoisonHeapReference(value);
  }
  __ Stlxr(old_value.W(), value, MemOperand(tmp_ptr));  // Reuse `old_value` for STLXR result.
  if (type == DataType::Type::kReference) {
    assembler->MaybeUnpoisonHeapReference(value);
  }
  __ Cbnz(old_value.W(), &loop_head);
  __ Bind(exit_loop);
  __ Cset(out, eq);
}

void IntrinsicLocationsBuilderARM64::VisitUnsafeCASInt(HInvoke* invoke) {
  CreateIntIntIntIntIntToInt(allocator_, invoke, DataType::Type::kInt32);
}
void IntrinsicLocationsBuilderARM64::VisitUnsafeCASLong(HInvoke* invoke) {
  CreateIntIntIntIntIntToInt(allocator_, invoke, DataType::Type::kInt64);
}
void IntrinsicLocationsBuilderARM64::VisitUnsafeCASObject(HInvoke* invoke) {
  // The only read barrier implementation supporting the
  // UnsafeCASObject intrinsic is the Baker-style read barriers.
  if (kEmitCompilerReadBarrier && !kUseBakerReadBarrier) {
    return;
  }

  CreateIntIntIntIntIntToInt(allocator_, invoke, DataType::Type::kReference);
}

void IntrinsicCodeGeneratorARM64::VisitUnsafeCASInt(HInvoke* invoke) {
  GenCas(invoke, DataType::Type::kInt32, codegen_);
}
void IntrinsicCodeGeneratorARM64::VisitUnsafeCASLong(HInvoke* invoke) {
  GenCas(invoke, DataType::Type::kInt64, codegen_);
}
void IntrinsicCodeGeneratorARM64::VisitUnsafeCASObject(HInvoke* invoke) {
  // The only read barrier implementation supporting the
  // UnsafeCASObject intrinsic is the Baker-style read barriers.
  DCHECK(!kEmitCompilerReadBarrier || kUseBakerReadBarrier);

  GenCas(invoke, DataType::Type::kReference, codegen_);
}

void IntrinsicLocationsBuilderARM64::VisitStringCompareTo(HInvoke* invoke) {
  LocationSummary* locations =
      new (allocator_) LocationSummary(invoke,
                                       invoke->InputAt(1)->CanBeNull()
                                           ? LocationSummary::kCallOnSlowPath
                                           : LocationSummary::kNoCall,
                                       kIntrinsified);
  locations->SetInAt(0, Location::RequiresRegister());
  locations->SetInAt(1, Location::RequiresRegister());
  locations->AddTemp(Location::RequiresRegister());
  locations->AddTemp(Location::RequiresRegister());
  locations->AddTemp(Location::RequiresRegister());
  // Need temporary registers for String compression's feature.
  if (mirror::kUseStringCompression) {
    locations->AddTemp(Location::RequiresRegister());
  }
  locations->SetOut(Location::RequiresRegister(), Location::kOutputOverlap);
}

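// String.compareTo(): after a reference-equality shortcut and a length comparison, the common
// case compares the string data 8 bytes at a time (4 uncompressed chars, or 8 compressed chars
// when string compression is enabled and both strings are compressed). On a mismatch, the first
// differing character is located with EOR + RBIT + CLZ. Strings with different compression
// styles fall through to a byte-vs-halfword loop at `different_compression`.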
VisitStringCompareTo(HInvoke * invoke)1121 void IntrinsicCodeGeneratorARM64::VisitStringCompareTo(HInvoke* invoke) {
1122 MacroAssembler* masm = GetVIXLAssembler();
1123 LocationSummary* locations = invoke->GetLocations();
1124
1125 Register str = InputRegisterAt(invoke, 0);
1126 Register arg = InputRegisterAt(invoke, 1);
1127 DCHECK(str.IsW());
1128 DCHECK(arg.IsW());
1129 Register out = OutputRegister(invoke);
1130
1131 Register temp0 = WRegisterFrom(locations->GetTemp(0));
1132 Register temp1 = WRegisterFrom(locations->GetTemp(1));
1133 Register temp2 = WRegisterFrom(locations->GetTemp(2));
1134 Register temp3;
1135 if (mirror::kUseStringCompression) {
1136 temp3 = WRegisterFrom(locations->GetTemp(3));
1137 }
1138
1139 vixl::aarch64::Label loop;
1140 vixl::aarch64::Label find_char_diff;
1141 vixl::aarch64::Label end;
1142 vixl::aarch64::Label different_compression;
1143
1144 // Get offsets of count and value fields within a string object.
1145 const int32_t count_offset = mirror::String::CountOffset().Int32Value();
1146 const int32_t value_offset = mirror::String::ValueOffset().Int32Value();
1147
1148 // Note that the null check must have been done earlier.
1149 DCHECK(!invoke->CanDoImplicitNullCheckOn(invoke->InputAt(0)));
1150
1151 // Take slow path and throw if input can be and is null.
1152 SlowPathCodeARM64* slow_path = nullptr;
1153 const bool can_slow_path = invoke->InputAt(1)->CanBeNull();
1154 if (can_slow_path) {
1155 slow_path = new (codegen_->GetScopedAllocator()) IntrinsicSlowPathARM64(invoke);
1156 codegen_->AddSlowPath(slow_path);
1157 __ Cbz(arg, slow_path->GetEntryLabel());
1158 }
1159
1160 // Reference equality check, return 0 if same reference.
1161 __ Subs(out, str, arg);
1162 __ B(&end, eq);
1163
1164 if (mirror::kUseStringCompression) {
1165 // Load `count` fields of this and argument strings.
1166 __ Ldr(temp3, HeapOperand(str, count_offset));
1167 __ Ldr(temp2, HeapOperand(arg, count_offset));
1168 // Clean out compression flag from lengths.
1169 __ Lsr(temp0, temp3, 1u);
1170 __ Lsr(temp1, temp2, 1u);
1171 } else {
1172 // Load lengths of this and argument strings.
1173 __ Ldr(temp0, HeapOperand(str, count_offset));
1174 __ Ldr(temp1, HeapOperand(arg, count_offset));
1175 }
1176 // out = length diff.
1177 __ Subs(out, temp0, temp1);
1178 // temp0 = min(len(str), len(arg)).
1179 __ Csel(temp0, temp1, temp0, ge);
1180 // Shorter string is empty?
1181 __ Cbz(temp0, &end);
1182
1183 if (mirror::kUseStringCompression) {
1184 // Check if both strings using same compression style to use this comparison loop.
1185 __ Eor(temp2, temp2, Operand(temp3));
1186 // Interleave with compression flag extraction which is needed for both paths
1187 // and also set flags which is needed only for the different compressions path.
1188 __ Ands(temp3.W(), temp3.W(), Operand(1));
1189 __ Tbnz(temp2, 0, &different_compression); // Does not use flags.
1190 }
1191 // Store offset of string value in preparation for comparison loop.
1192 __ Mov(temp1, value_offset);
1193 if (mirror::kUseStringCompression) {
1194 // For string compression, calculate the number of bytes to compare (not chars).
1195 // This could in theory exceed INT32_MAX, so treat temp0 as unsigned.
1196 __ Lsl(temp0, temp0, temp3);
1197 }
1198
1199 UseScratchRegisterScope scratch_scope(masm);
1200 Register temp4 = scratch_scope.AcquireX();
1201
1202 // Assertions that must hold in order to compare strings 8 bytes at a time.
1203 DCHECK_ALIGNED(value_offset, 8);
1204 static_assert(IsAligned<8>(kObjectAlignment), "String of odd length is not zero padded");
1205
1206 const size_t char_size = DataType::Size(DataType::Type::kUint16);
1207 DCHECK_EQ(char_size, 2u);
1208
1209 // Promote temp2 to an X reg, ready for LDR.
1210 temp2 = temp2.X();
1211
1212 // Loop to compare 4x16-bit characters at a time (ok because of string data alignment).
1213 __ Bind(&loop);
1214 __ Ldr(temp4, MemOperand(str.X(), temp1.X()));
1215 __ Ldr(temp2, MemOperand(arg.X(), temp1.X()));
1216 __ Cmp(temp4, temp2);
1217 __ B(ne, &find_char_diff);
1218 __ Add(temp1, temp1, char_size * 4);
1219 // With string compression, we have compared 8 bytes, otherwise 4 chars.
1220 __ Subs(temp0, temp0, (mirror::kUseStringCompression) ? 8 : 4);
1221 __ B(&loop, hi);
1222 __ B(&end);
1223
1224 // Promote temp1 to an X reg, ready for EOR.
1225 temp1 = temp1.X();
1226
1227 // Find the single character difference.
1228 __ Bind(&find_char_diff);
1229 // Get the bit position of the first character that differs.
1230 __ Eor(temp1, temp2, temp4);
1231 __ Rbit(temp1, temp1);
1232 __ Clz(temp1, temp1);
1233
1234 // If the number of chars remaining <= the index where the difference occurs (0-3), then
1235 // the difference occurs outside the remaining string data, so just return length diff (out).
1236 // Unlike ARM, we're doing the comparison in one go here, without the subtraction at the
1237 // find_char_diff_2nd_cmp path, so it doesn't matter whether the comparison is signed or
1238 // unsigned when string compression is disabled.
1239 // When it's enabled, the comparison must be unsigned.
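// For example, if the first differing bit is bit 37 of the 64-bit block, LSR #4 yields
// character index 2 (uncompressed, 16-bit chars) and LSR #3 yields byte index 4
// (compressed path, where temp0 counts bytes); if temp0 <= that index, the difference
// lies beyond the shorter string and `out` already holds the answer.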
1240 __ Cmp(temp0, Operand(temp1.W(), LSR, (mirror::kUseStringCompression) ? 3 : 4));
1241 __ B(ls, &end);
1242
1243 // Extract the characters and calculate the difference.
1244 if (mirror::kUseStringCompression) {
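// Round the bit index down to the start of the differing character: BIC #0x7 gives byte
// granularity for the compressed case, and the second BIC additionally clears bit 3
// (via temp3 << 3) when the strings are uncompressed, giving 16-bit granularity.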
1245 __ Bic(temp1, temp1, 0x7);
1246 __ Bic(temp1, temp1, Operand(temp3.X(), LSL, 3u));
1247 } else {
1248 __ Bic(temp1, temp1, 0xf);
1249 }
1250 __ Lsr(temp2, temp2, temp1);
1251 __ Lsr(temp4, temp4, temp1);
1252 if (mirror::kUseStringCompression) {
1253 // Prioritize the compressed-string case and calculate that result first.
1254 __ Uxtb(temp1, temp4);
1255 __ Sub(out, temp1.W(), Operand(temp2.W(), UXTB));
1256 __ Tbz(temp3, 0u, &end); // If actually compressed, we're done.
1257 }
1258 __ Uxth(temp4, temp4);
1259 __ Sub(out, temp4.W(), Operand(temp2.W(), UXTH));
1260
1261 if (mirror::kUseStringCompression) {
1262 __ B(&end);
1263 __ Bind(&different_compression);
1264
1265 // Comparison for different compression style.
1266 const size_t c_char_size = DataType::Size(DataType::Type::kInt8);
1267 DCHECK_EQ(c_char_size, 1u);
1268 temp1 = temp1.W();
1269 temp2 = temp2.W();
1270 temp4 = temp4.W();
1271
1272 // `temp1` will hold the compressed data pointer, `temp2` the uncompressed data pointer.
1273 // Note that flags have been set by the `str` compression flag extraction to `temp3`
1274 // before branching to the `different_compression` label.
1275 __ Csel(temp1, str, arg, eq); // Pointer to the compressed string.
1276 __ Csel(temp2, str, arg, ne); // Pointer to the uncompressed string.
1277
1278 // We want to free up the temp3, currently holding `str` compression flag, for comparison.
1279 // So, we move it to the bottom bit of the iteration count `temp0` which we then need to treat
1280 // as unsigned. Start by freeing the bit with a LSL and continue further down by a SUB which
1281 // will allow `subs temp0, #2; bhi different_compression_loop` to serve as the loop condition.
1282 __ Lsl(temp0, temp0, 1u);
1283
1284 // Adjust temp1 and temp2 from string pointers to data pointers.
1285 __ Add(temp1, temp1, Operand(value_offset));
1286 __ Add(temp2, temp2, Operand(value_offset));
1287
1288 // Complete the move of the compression flag.
1289 __ Sub(temp0, temp0, Operand(temp3));
1290
1291 vixl::aarch64::Label different_compression_loop;
1292 vixl::aarch64::Label different_compression_diff;
1293
1294 __ Bind(&different_compression_loop);
1295 __ Ldrb(temp4, MemOperand(temp1.X(), c_char_size, PostIndex));
1296 __ Ldrh(temp3, MemOperand(temp2.X(), char_size, PostIndex));
1297 __ Subs(temp4, temp4, Operand(temp3));
1298 __ B(&different_compression_diff, ne);
1299 __ Subs(temp0, temp0, 2);
1300 __ B(&different_compression_loop, hi);
1301 __ B(&end);
1302
1303 // Calculate the difference.
1304 __ Bind(&different_compression_diff);
1305 __ Tst(temp0, Operand(1));
1306 static_assert(static_cast<uint32_t>(mirror::StringCompressionFlag::kCompressed) == 0u,
1307 "Expecting 0=compressed, 1=uncompressed");
1308 __ Cneg(out, temp4, ne);
1309 }
1310
1311 __ Bind(&end);
1312
1313 if (can_slow_path) {
1314 __ Bind(slow_path->GetExitLabel());
1315 }
1316 }
1317
1318 // The cutoff for unrolling the loop in the String.equals() intrinsic for const strings.
1319 // The normal loop plus the pre-header is 9 instructions without string compression and 12
1320 // instructions with string compression. We can compare up to 8 bytes in 4 instructions
1321 // (LDR+LDR+CMP+BNE) and up to 16 bytes in 5 instructions (LDP+LDP+CMP+CCMP+BNE). Allow up
1322 // to 10 instructions for the unrolled loop.
1323 constexpr size_t kShortConstStringEqualsCutoffInBytes = 32;
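// As a rough sketch (register names and the offset are placeholders), one unrolled 16-byte
// step emitted by the const-string code further below looks like:
//   ldp  x8, x9,   [str, #offset]
//   ldp  x10, x11, [arg, #offset]
//   cmp  x8, x10
//   ccmp x9, x11, #0, eq   // Only keeps "equal" if the first pair already matched.
//   b.ne return_false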
1324
1325 static const char* GetConstString(HInstruction* candidate, uint32_t* utf16_length) {
1326 if (candidate->IsLoadString()) {
1327 HLoadString* load_string = candidate->AsLoadString();
1328 const DexFile& dex_file = load_string->GetDexFile();
1329 return dex_file.StringDataAndUtf16LengthByIdx(load_string->GetStringIndex(), utf16_length);
1330 }
1331 return nullptr;
1332 }
1333
1334 void IntrinsicLocationsBuilderARM64::VisitStringEquals(HInvoke* invoke) {
1335 LocationSummary* locations =
1336 new (allocator_) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
1337 locations->SetInAt(0, Location::RequiresRegister());
1338 locations->SetInAt(1, Location::RequiresRegister());
1339
1340 // For the generic implementation and for long const strings we need a temporary.
1341 // We do not need it for short const strings of up to 8 bytes; see the code generation below.
1342 uint32_t const_string_length = 0u;
1343 const char* const_string = GetConstString(invoke->InputAt(0), &const_string_length);
1344 if (const_string == nullptr) {
1345 const_string = GetConstString(invoke->InputAt(1), &const_string_length);
1346 }
1347 bool is_compressed =
1348 mirror::kUseStringCompression &&
1349 const_string != nullptr &&
1350 mirror::String::DexFileStringAllASCII(const_string, const_string_length);
1351 if (const_string == nullptr || const_string_length > (is_compressed ? 8u : 4u)) {
1352 locations->AddTemp(Location::RequiresRegister());
1353 }
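// For example, a const all-ASCII string of up to 8 characters is stored compressed in at
// most 8 bytes, so the generated code below can compare it with a single LDR per string
// and needs no extra temporary; longer const strings fall into the LDP loop, which does.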
1354
1355 // TODO: If the String.equals() is used only for an immediately following HIf, we can
1356 // mark it as emitted-at-use-site and emit branches directly to the appropriate blocks.
1357 // Then we shall need an extra temporary register instead of the output register.
1358 locations->SetOut(Location::RequiresRegister(), Location::kOutputOverlap);
1359 }
1360
1361 void IntrinsicCodeGeneratorARM64::VisitStringEquals(HInvoke* invoke) {
1362 MacroAssembler* masm = GetVIXLAssembler();
1363 LocationSummary* locations = invoke->GetLocations();
1364
1365 Register str = WRegisterFrom(locations->InAt(0));
1366 Register arg = WRegisterFrom(locations->InAt(1));
1367 Register out = XRegisterFrom(locations->Out());
1368
1369 UseScratchRegisterScope scratch_scope(masm);
1370 Register temp = scratch_scope.AcquireW();
1371 Register temp1 = scratch_scope.AcquireW();
1372
1373 vixl::aarch64::Label loop;
1374 vixl::aarch64::Label end;
1375 vixl::aarch64::Label return_true;
1376 vixl::aarch64::Label return_false;
1377
1378 // Get offsets of count, value, and class fields within a string object.
1379 const int32_t count_offset = mirror::String::CountOffset().Int32Value();
1380 const int32_t value_offset = mirror::String::ValueOffset().Int32Value();
1381 const int32_t class_offset = mirror::Object::ClassOffset().Int32Value();
1382
1383 // Note that the null check must have been done earlier.
1384 DCHECK(!invoke->CanDoImplicitNullCheckOn(invoke->InputAt(0)));
1385
1386 StringEqualsOptimizations optimizations(invoke);
1387 if (!optimizations.GetArgumentNotNull()) {
1388 // Check if input is null, return false if it is.
1389 __ Cbz(arg, &return_false);
1390 }
1391
1392 // Reference equality check, return true if same reference.
1393 __ Cmp(str, arg);
1394 __ B(&return_true, eq);
1395
1396 if (!optimizations.GetArgumentIsString()) {
1397 // Instanceof check for the argument by comparing class fields.
1398 // All string objects must have the same type since String cannot be subclassed.
1399 // Receiver must be a string object, so its class field is equal to all strings' class fields.
1400 // If the argument is a string object, its class field must be equal to receiver's class field.
1401 //
1402 // As the String class is expected to be non-movable, we can read the class
1403 // field from String.equals' arguments without read barriers.
1404 AssertNonMovableStringClass();
1405 // /* HeapReference<Class> */ temp = str->klass_
1406 __ Ldr(temp, MemOperand(str.X(), class_offset));
1407 // /* HeapReference<Class> */ temp1 = arg->klass_
1408 __ Ldr(temp1, MemOperand(arg.X(), class_offset));
1409 // Also, because we use the previously loaded class references only in the
1410 // following comparison, we don't need to unpoison them.
1411 __ Cmp(temp, temp1);
1412 __ B(&return_false, ne);
1413 }
1414
1415 // Check if one of the inputs is a const string. Do not special-case both strings
1416 // being const; such cases should be handled by constant folding if needed.
1417 uint32_t const_string_length = 0u;
1418 const char* const_string = GetConstString(invoke->InputAt(0), &const_string_length);
1419 if (const_string == nullptr) {
1420 const_string = GetConstString(invoke->InputAt(1), &const_string_length);
1421 if (const_string != nullptr) {
1422 std::swap(str, arg); // Make sure the const string is in `str`.
1423 }
1424 }
1425 bool is_compressed =
1426 mirror::kUseStringCompression &&
1427 const_string != nullptr &&
1428 mirror::String::DexFileStringAllASCII(const_string, const_string_length);
1429
1430 if (const_string != nullptr) {
1431 // Load `count` field of the argument string and check if it matches the const string.
1432 // This also compares the compression style; if it differs, return false.
1433 __ Ldr(temp, MemOperand(arg.X(), count_offset));
1434 // Temporarily release temp1 as we may not be able to embed the flagged count in CMP immediate.
1435 scratch_scope.Release(temp1);
1436 __ Cmp(temp, Operand(mirror::String::GetFlaggedCount(const_string_length, is_compressed)));
1437 temp1 = scratch_scope.AcquireW();
1438 __ B(&return_false, ne);
1439 } else {
1440 // Load `count` fields of this and argument strings.
1441 __ Ldr(temp, MemOperand(str.X(), count_offset));
1442 __ Ldr(temp1, MemOperand(arg.X(), count_offset));
1443 // Check if the `count` fields are equal; return false if they're not.
1444 // This also compares the compression style; if it differs, return false.
1445 __ Cmp(temp, temp1);
1446 __ B(&return_false, ne);
1447 }
1448
1449 // Assertions that must hold in order to compare strings 8 bytes at a time.
1450 // Ok to do this because strings are zero-padded to kObjectAlignment.
1451 DCHECK_ALIGNED(value_offset, 8);
1452 static_assert(IsAligned<8>(kObjectAlignment), "String of odd length is not zero padded");
1453
1454 if (const_string != nullptr &&
1455 const_string_length <= (is_compressed ? kShortConstStringEqualsCutoffInBytes
1456 : kShortConstStringEqualsCutoffInBytes / 2u)) {
1457 // Load and compare the contents. Though we know the contents of the short const string
1458 // at compile time, materializing constants may be more code than loading from memory.
1459 int32_t offset = value_offset;
1460 size_t remaining_bytes =
1461 RoundUp(is_compressed ? const_string_length : const_string_length * 2u, 8u);
1462 temp = temp.X();
1463 temp1 = temp1.X();
1464 while (remaining_bytes > sizeof(uint64_t)) {
1465 Register temp2 = XRegisterFrom(locations->GetTemp(0));
1466 __ Ldp(temp, temp1, MemOperand(str.X(), offset));
1467 __ Ldp(temp2, out, MemOperand(arg.X(), offset));
1468 __ Cmp(temp, temp2);
1469 __ Ccmp(temp1, out, NoFlag, eq);
1470 __ B(&return_false, ne);
1471 offset += 2u * sizeof(uint64_t);
1472 remaining_bytes -= 2u * sizeof(uint64_t);
1473 }
1474 if (remaining_bytes != 0u) {
1475 __ Ldr(temp, MemOperand(str.X(), offset));
1476 __ Ldr(temp1, MemOperand(arg.X(), offset));
1477 __ Cmp(temp, temp1);
1478 __ B(&return_false, ne);
1479 }
1480 } else {
1481 // Return true if both strings are empty. Even with string compression `count == 0` means empty.
1482 static_assert(static_cast<uint32_t>(mirror::StringCompressionFlag::kCompressed) == 0u,
1483 "Expecting 0=compressed, 1=uncompressed");
1484 __ Cbz(temp, &return_true);
1485
1486 if (mirror::kUseStringCompression) {
1487 // For string compression, calculate the number of bytes to compare (not chars).
1488 // This could in theory exceed INT32_MAX, so treat temp as unsigned.
1489 __ And(temp1, temp, Operand(1)); // Extract compression flag.
1490 __ Lsr(temp, temp, 1u); // Extract length.
1491 __ Lsl(temp, temp, temp1); // Calculate number of bytes to compare.
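// For example, count = 11 encodes an uncompressed string of length 5: the flag in temp1
// is 1, the length in temp is 5, and the shift yields 10 bytes to compare; for the
// compressed count = 10 the flag is 0 and 5 bytes are compared.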
1492 }
1493
1494 // Store offset of string value in preparation for comparison loop
1495 __ Mov(temp1, value_offset);
1496
1497 temp1 = temp1.X();
1498 Register temp2 = XRegisterFrom(locations->GetTemp(0));
1499 // Loop to compare strings 8 bytes at a time starting at the front of the string.
1500 __ Bind(&loop);
1501 __ Ldr(out, MemOperand(str.X(), temp1));
1502 __ Ldr(temp2, MemOperand(arg.X(), temp1));
1503 __ Add(temp1, temp1, Operand(sizeof(uint64_t)));
1504 __ Cmp(out, temp2);
1505 __ B(&return_false, ne);
1506 // With string compression, we have compared 8 bytes, otherwise 4 chars.
1507 __ Sub(temp, temp, Operand(mirror::kUseStringCompression ? 8 : 4), SetFlags);
1508 __ B(&loop, hi);
1509 }
1510
1511 // Return true and exit the function.
1512 // If loop does not result in returning false, we return true.
1513 __ Bind(&return_true);
1514 __ Mov(out, 1);
1515 __ B(&end);
1516
1517 // Return false and exit the function.
1518 __ Bind(&return_false);
1519 __ Mov(out, 0);
1520 __ Bind(&end);
1521 }
1522
1523 static void GenerateVisitStringIndexOf(HInvoke* invoke,
1524 MacroAssembler* masm,
1525 CodeGeneratorARM64* codegen,
1526 bool start_at_zero) {
1527 LocationSummary* locations = invoke->GetLocations();
1528
1529 // Note that the null check must have been done earlier.
1530 DCHECK(!invoke->CanDoImplicitNullCheckOn(invoke->InputAt(0)));
1531
1532 // Check for code points > 0xFFFF. Either a slow-path check when we don't know statically,
1533 // or directly dispatch for a large constant, or omit slow-path for a small constant or a char.
1534 SlowPathCodeARM64* slow_path = nullptr;
1535 HInstruction* code_point = invoke->InputAt(1);
1536 if (code_point->IsIntConstant()) {
1537 if (static_cast<uint32_t>(code_point->AsIntConstant()->GetValue()) > 0xFFFFU) {
1538 // Always needs the slow-path. We could directly dispatch to it, but this case should be
1539 // rare, so for simplicity just put the full slow-path down and branch unconditionally.
1540 slow_path = new (codegen->GetScopedAllocator()) IntrinsicSlowPathARM64(invoke);
1541 codegen->AddSlowPath(slow_path);
1542 __ B(slow_path->GetEntryLabel());
1543 __ Bind(slow_path->GetExitLabel());
1544 return;
1545 }
1546 } else if (code_point->GetType() != DataType::Type::kUint16) {
1547 Register char_reg = WRegisterFrom(locations->InAt(1));
1548 __ Tst(char_reg, 0xFFFF0000);
1549 slow_path = new (codegen->GetScopedAllocator()) IntrinsicSlowPathARM64(invoke);
1550 codegen->AddSlowPath(slow_path);
1551 __ B(ne, slow_path->GetEntryLabel());
1552 }
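// For example, a constant code point such as 0x1F600 (> 0xFFFF) always takes the slow path
// above, a constant that fits in 16 bits needs no check at all, and a char-typed argument
// (kUint16) can never exceed 0xFFFF, so the TST is emitted only for a non-constant int
// argument.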
1553
1554 if (start_at_zero) {
1555 // Start-index = 0.
1556 Register tmp_reg = WRegisterFrom(locations->GetTemp(0));
1557 __ Mov(tmp_reg, 0);
1558 }
1559
1560 codegen->InvokeRuntime(kQuickIndexOf, invoke, invoke->GetDexPc(), slow_path);
1561 CheckEntrypointTypes<kQuickIndexOf, int32_t, void*, uint32_t, uint32_t>();
1562
1563 if (slow_path != nullptr) {
1564 __ Bind(slow_path->GetExitLabel());
1565 }
1566 }
1567
1568 void IntrinsicLocationsBuilderARM64::VisitStringIndexOf(HInvoke* invoke) {
1569 LocationSummary* locations = new (allocator_) LocationSummary(
1570 invoke, LocationSummary::kCallOnMainAndSlowPath, kIntrinsified);
1571 // We have a hand-crafted assembly stub that follows the runtime calling convention. So it's
1572 // best to align the inputs accordingly.
1573 InvokeRuntimeCallingConvention calling_convention;
1574 locations->SetInAt(0, LocationFrom(calling_convention.GetRegisterAt(0)));
1575 locations->SetInAt(1, LocationFrom(calling_convention.GetRegisterAt(1)));
1576 locations->SetOut(calling_convention.GetReturnLocation(DataType::Type::kInt32));
1577
1578 // Need to send start_index=0.
1579 locations->AddTemp(LocationFrom(calling_convention.GetRegisterAt(2)));
1580 }
1581
1582 void IntrinsicCodeGeneratorARM64::VisitStringIndexOf(HInvoke* invoke) {
1583 GenerateVisitStringIndexOf(invoke, GetVIXLAssembler(), codegen_, /* start_at_zero= */ true);
1584 }
1585
1586 void IntrinsicLocationsBuilderARM64::VisitStringIndexOfAfter(HInvoke* invoke) {
1587 LocationSummary* locations = new (allocator_) LocationSummary(
1588 invoke, LocationSummary::kCallOnMainAndSlowPath, kIntrinsified);
1589 // We have a hand-crafted assembly stub that follows the runtime calling convention. So it's
1590 // best to align the inputs accordingly.
1591 InvokeRuntimeCallingConvention calling_convention;
1592 locations->SetInAt(0, LocationFrom(calling_convention.GetRegisterAt(0)));
1593 locations->SetInAt(1, LocationFrom(calling_convention.GetRegisterAt(1)));
1594 locations->SetInAt(2, LocationFrom(calling_convention.GetRegisterAt(2)));
1595 locations->SetOut(calling_convention.GetReturnLocation(DataType::Type::kInt32));
1596 }
1597
1598 void IntrinsicCodeGeneratorARM64::VisitStringIndexOfAfter(HInvoke* invoke) {
1599 GenerateVisitStringIndexOf(invoke, GetVIXLAssembler(), codegen_, /* start_at_zero= */ false);
1600 }
1601
1602 void IntrinsicLocationsBuilderARM64::VisitStringNewStringFromBytes(HInvoke* invoke) {
1603 LocationSummary* locations = new (allocator_) LocationSummary(
1604 invoke, LocationSummary::kCallOnMainAndSlowPath, kIntrinsified);
1605 InvokeRuntimeCallingConvention calling_convention;
1606 locations->SetInAt(0, LocationFrom(calling_convention.GetRegisterAt(0)));
1607 locations->SetInAt(1, LocationFrom(calling_convention.GetRegisterAt(1)));
1608 locations->SetInAt(2, LocationFrom(calling_convention.GetRegisterAt(2)));
1609 locations->SetInAt(3, LocationFrom(calling_convention.GetRegisterAt(3)));
1610 locations->SetOut(calling_convention.GetReturnLocation(DataType::Type::kReference));
1611 }
1612
1613 void IntrinsicCodeGeneratorARM64::VisitStringNewStringFromBytes(HInvoke* invoke) {
1614 MacroAssembler* masm = GetVIXLAssembler();
1615 LocationSummary* locations = invoke->GetLocations();
1616
1617 Register byte_array = WRegisterFrom(locations->InAt(0));
1618 __ Cmp(byte_array, 0);
1619 SlowPathCodeARM64* slow_path =
1620 new (codegen_->GetScopedAllocator()) IntrinsicSlowPathARM64(invoke);
1621 codegen_->AddSlowPath(slow_path);
1622 __ B(eq, slow_path->GetEntryLabel());
1623
1624 codegen_->InvokeRuntime(kQuickAllocStringFromBytes, invoke, invoke->GetDexPc(), slow_path);
1625 CheckEntrypointTypes<kQuickAllocStringFromBytes, void*, void*, int32_t, int32_t, int32_t>();
1626 __ Bind(slow_path->GetExitLabel());
1627 }
1628
1629 void IntrinsicLocationsBuilderARM64::VisitStringNewStringFromChars(HInvoke* invoke) {
1630 LocationSummary* locations =
1631 new (allocator_) LocationSummary(invoke, LocationSummary::kCallOnMainOnly, kIntrinsified);
1632 InvokeRuntimeCallingConvention calling_convention;
1633 locations->SetInAt(0, LocationFrom(calling_convention.GetRegisterAt(0)));
1634 locations->SetInAt(1, LocationFrom(calling_convention.GetRegisterAt(1)));
1635 locations->SetInAt(2, LocationFrom(calling_convention.GetRegisterAt(2)));
1636 locations->SetOut(calling_convention.GetReturnLocation(DataType::Type::kReference));
1637 }
1638
1639 void IntrinsicCodeGeneratorARM64::VisitStringNewStringFromChars(HInvoke* invoke) {
1640 // No need to emit code checking whether `locations->InAt(2)` is a null
1641 // pointer, as callers of the native method
1642 //
1643 // java.lang.StringFactory.newStringFromChars(int offset, int charCount, char[] data)
1644 //
1645 // all include a null check on `data` before calling that method.
1646 codegen_->InvokeRuntime(kQuickAllocStringFromChars, invoke, invoke->GetDexPc());
1647 CheckEntrypointTypes<kQuickAllocStringFromChars, void*, int32_t, int32_t, void*>();
1648 }
1649
1650 void IntrinsicLocationsBuilderARM64::VisitStringNewStringFromString(HInvoke* invoke) {
1651 LocationSummary* locations = new (allocator_) LocationSummary(
1652 invoke, LocationSummary::kCallOnMainAndSlowPath, kIntrinsified);
1653 InvokeRuntimeCallingConvention calling_convention;
1654 locations->SetInAt(0, LocationFrom(calling_convention.GetRegisterAt(0)));
1655 locations->SetOut(calling_convention.GetReturnLocation(DataType::Type::kReference));
1656 }
1657
1658 void IntrinsicCodeGeneratorARM64::VisitStringNewStringFromString(HInvoke* invoke) {
1659 MacroAssembler* masm = GetVIXLAssembler();
1660 LocationSummary* locations = invoke->GetLocations();
1661
1662 Register string_to_copy = WRegisterFrom(locations->InAt(0));
1663 __ Cmp(string_to_copy, 0);
1664 SlowPathCodeARM64* slow_path =
1665 new (codegen_->GetScopedAllocator()) IntrinsicSlowPathARM64(invoke);
1666 codegen_->AddSlowPath(slow_path);
1667 __ B(eq, slow_path->GetEntryLabel());
1668
1669 codegen_->InvokeRuntime(kQuickAllocStringFromString, invoke, invoke->GetDexPc(), slow_path);
1670 CheckEntrypointTypes<kQuickAllocStringFromString, void*, void*>();
1671 __ Bind(slow_path->GetExitLabel());
1672 }
1673
1674 static void CreateFPToFPCallLocations(ArenaAllocator* allocator, HInvoke* invoke) {
1675 DCHECK_EQ(invoke->GetNumberOfArguments(), 1U);
1676 DCHECK(DataType::IsFloatingPointType(invoke->InputAt(0)->GetType()));
1677 DCHECK(DataType::IsFloatingPointType(invoke->GetType()));
1678
1679 LocationSummary* const locations =
1680 new (allocator) LocationSummary(invoke, LocationSummary::kCallOnMainOnly, kIntrinsified);
1681 InvokeRuntimeCallingConvention calling_convention;
1682
1683 locations->SetInAt(0, LocationFrom(calling_convention.GetFpuRegisterAt(0)));
1684 locations->SetOut(calling_convention.GetReturnLocation(invoke->GetType()));
1685 }
1686
1687 static void CreateFPFPToFPCallLocations(ArenaAllocator* allocator, HInvoke* invoke) {
1688 DCHECK_EQ(invoke->GetNumberOfArguments(), 2U);
1689 DCHECK(DataType::IsFloatingPointType(invoke->InputAt(0)->GetType()));
1690 DCHECK(DataType::IsFloatingPointType(invoke->InputAt(1)->GetType()));
1691 DCHECK(DataType::IsFloatingPointType(invoke->GetType()));
1692
1693 LocationSummary* const locations =
1694 new (allocator) LocationSummary(invoke, LocationSummary::kCallOnMainOnly, kIntrinsified);
1695 InvokeRuntimeCallingConvention calling_convention;
1696
1697 locations->SetInAt(0, LocationFrom(calling_convention.GetFpuRegisterAt(0)));
1698 locations->SetInAt(1, LocationFrom(calling_convention.GetFpuRegisterAt(1)));
1699 locations->SetOut(calling_convention.GetReturnLocation(invoke->GetType()));
1700 }
1701
1702 static void GenFPToFPCall(HInvoke* invoke,
1703 CodeGeneratorARM64* codegen,
1704 QuickEntrypointEnum entry) {
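// The locations builders above already place the argument(s) in the runtime calling
// convention's FP argument registers and the result in its FP return location, so no
// register moves are needed here before calling the entrypoint.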
1705 codegen->InvokeRuntime(entry, invoke, invoke->GetDexPc());
1706 }
1707
1708 void IntrinsicLocationsBuilderARM64::VisitMathCos(HInvoke* invoke) {
1709 CreateFPToFPCallLocations(allocator_, invoke);
1710 }
1711
1712 void IntrinsicCodeGeneratorARM64::VisitMathCos(HInvoke* invoke) {
1713 GenFPToFPCall(invoke, codegen_, kQuickCos);
1714 }
1715
1716 void IntrinsicLocationsBuilderARM64::VisitMathSin(HInvoke* invoke) {
1717 CreateFPToFPCallLocations(allocator_, invoke);
1718 }
1719
1720 void IntrinsicCodeGeneratorARM64::VisitMathSin(HInvoke* invoke) {
1721 GenFPToFPCall(invoke, codegen_, kQuickSin);
1722 }
1723
1724 void IntrinsicLocationsBuilderARM64::VisitMathAcos(HInvoke* invoke) {
1725 CreateFPToFPCallLocations(allocator_, invoke);
1726 }
1727
1728 void IntrinsicCodeGeneratorARM64::VisitMathAcos(HInvoke* invoke) {
1729 GenFPToFPCall(invoke, codegen_, kQuickAcos);
1730 }
1731
1732 void IntrinsicLocationsBuilderARM64::VisitMathAsin(HInvoke* invoke) {
1733 CreateFPToFPCallLocations(allocator_, invoke);
1734 }
1735
1736 void IntrinsicCodeGeneratorARM64::VisitMathAsin(HInvoke* invoke) {
1737 GenFPToFPCall(invoke, codegen_, kQuickAsin);
1738 }
1739
1740 void IntrinsicLocationsBuilderARM64::VisitMathAtan(HInvoke* invoke) {
1741 CreateFPToFPCallLocations(allocator_, invoke);
1742 }
1743
1744 void IntrinsicCodeGeneratorARM64::VisitMathAtan(HInvoke* invoke) {
1745 GenFPToFPCall(invoke, codegen_, kQuickAtan);
1746 }
1747
1748 void IntrinsicLocationsBuilderARM64::VisitMathCbrt(HInvoke* invoke) {
1749 CreateFPToFPCallLocations(allocator_, invoke);
1750 }
1751
1752 void IntrinsicCodeGeneratorARM64::VisitMathCbrt(HInvoke* invoke) {
1753 GenFPToFPCall(invoke, codegen_, kQuickCbrt);
1754 }
1755
1756 void IntrinsicLocationsBuilderARM64::VisitMathCosh(HInvoke* invoke) {
1757 CreateFPToFPCallLocations(allocator_, invoke);
1758 }
1759
1760 void IntrinsicCodeGeneratorARM64::VisitMathCosh(HInvoke* invoke) {
1761 GenFPToFPCall(invoke, codegen_, kQuickCosh);
1762 }
1763
1764 void IntrinsicLocationsBuilderARM64::VisitMathExp(HInvoke* invoke) {
1765 CreateFPToFPCallLocations(allocator_, invoke);
1766 }
1767
1768 void IntrinsicCodeGeneratorARM64::VisitMathExp(HInvoke* invoke) {
1769 GenFPToFPCall(invoke, codegen_, kQuickExp);
1770 }
1771
1772 void IntrinsicLocationsBuilderARM64::VisitMathExpm1(HInvoke* invoke) {
1773 CreateFPToFPCallLocations(allocator_, invoke);
1774 }
1775
1776 void IntrinsicCodeGeneratorARM64::VisitMathExpm1(HInvoke* invoke) {
1777 GenFPToFPCall(invoke, codegen_, kQuickExpm1);
1778 }
1779
1780 void IntrinsicLocationsBuilderARM64::VisitMathLog(HInvoke* invoke) {
1781 CreateFPToFPCallLocations(allocator_, invoke);
1782 }
1783
1784 void IntrinsicCodeGeneratorARM64::VisitMathLog(HInvoke* invoke) {
1785 GenFPToFPCall(invoke, codegen_, kQuickLog);
1786 }
1787
1788 void IntrinsicLocationsBuilderARM64::VisitMathLog10(HInvoke* invoke) {
1789 CreateFPToFPCallLocations(allocator_, invoke);
1790 }
1791
1792 void IntrinsicCodeGeneratorARM64::VisitMathLog10(HInvoke* invoke) {
1793 GenFPToFPCall(invoke, codegen_, kQuickLog10);
1794 }
1795
1796 void IntrinsicLocationsBuilderARM64::VisitMathSinh(HInvoke* invoke) {
1797 CreateFPToFPCallLocations(allocator_, invoke);
1798 }
1799
1800 void IntrinsicCodeGeneratorARM64::VisitMathSinh(HInvoke* invoke) {
1801 GenFPToFPCall(invoke, codegen_, kQuickSinh);
1802 }
1803
1804 void IntrinsicLocationsBuilderARM64::VisitMathTan(HInvoke* invoke) {
1805 CreateFPToFPCallLocations(allocator_, invoke);
1806 }
1807
1808 void IntrinsicCodeGeneratorARM64::VisitMathTan(HInvoke* invoke) {
1809 GenFPToFPCall(invoke, codegen_, kQuickTan);
1810 }
1811
1812 void IntrinsicLocationsBuilderARM64::VisitMathTanh(HInvoke* invoke) {
1813 CreateFPToFPCallLocations(allocator_, invoke);
1814 }
1815
1816 void IntrinsicCodeGeneratorARM64::VisitMathTanh(HInvoke* invoke) {
1817 GenFPToFPCall(invoke, codegen_, kQuickTanh);
1818 }
1819
1820 void IntrinsicLocationsBuilderARM64::VisitMathAtan2(HInvoke* invoke) {
1821 CreateFPFPToFPCallLocations(allocator_, invoke);
1822 }
1823
1824 void IntrinsicCodeGeneratorARM64::VisitMathAtan2(HInvoke* invoke) {
1825 GenFPToFPCall(invoke, codegen_, kQuickAtan2);
1826 }
1827
1828 void IntrinsicLocationsBuilderARM64::VisitMathPow(HInvoke* invoke) {
1829 CreateFPFPToFPCallLocations(allocator_, invoke);
1830 }
1831
1832 void IntrinsicCodeGeneratorARM64::VisitMathPow(HInvoke* invoke) {
1833 GenFPToFPCall(invoke, codegen_, kQuickPow);
1834 }
1835
1836 void IntrinsicLocationsBuilderARM64::VisitMathHypot(HInvoke* invoke) {
1837 CreateFPFPToFPCallLocations(allocator_, invoke);
1838 }
1839
1840 void IntrinsicCodeGeneratorARM64::VisitMathHypot(HInvoke* invoke) {
1841 GenFPToFPCall(invoke, codegen_, kQuickHypot);
1842 }
1843
1844 void IntrinsicLocationsBuilderARM64::VisitMathNextAfter(HInvoke* invoke) {
1845 CreateFPFPToFPCallLocations(allocator_, invoke);
1846 }
1847
1848 void IntrinsicCodeGeneratorARM64::VisitMathNextAfter(HInvoke* invoke) {
1849 GenFPToFPCall(invoke, codegen_, kQuickNextAfter);
1850 }
1851
1852 void IntrinsicLocationsBuilderARM64::VisitStringGetCharsNoCheck(HInvoke* invoke) {
1853 LocationSummary* locations =
1854 new (allocator_) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
1855 locations->SetInAt(0, Location::RequiresRegister());
1856 locations->SetInAt(1, Location::RequiresRegister());
1857 locations->SetInAt(2, Location::RequiresRegister());
1858 locations->SetInAt(3, Location::RequiresRegister());
1859 locations->SetInAt(4, Location::RequiresRegister());
1860
1861 locations->AddTemp(Location::RequiresRegister());
1862 locations->AddTemp(Location::RequiresRegister());
1863 locations->AddTemp(Location::RequiresRegister());
1864 }
1865
1866 void IntrinsicCodeGeneratorARM64::VisitStringGetCharsNoCheck(HInvoke* invoke) {
1867 MacroAssembler* masm = GetVIXLAssembler();
1868 LocationSummary* locations = invoke->GetLocations();
1869
1870 // Check assumption that sizeof(Char) is 2 (used in scaling below).
1871 const size_t char_size = DataType::Size(DataType::Type::kUint16);
1872 DCHECK_EQ(char_size, 2u);
1873
1874 // Location of data in char array buffer.
1875 const uint32_t data_offset = mirror::Array::DataOffset(char_size).Uint32Value();
1876
1877 // Location of char array data in string.
1878 const uint32_t value_offset = mirror::String::ValueOffset().Uint32Value();
1879
1880 // void getCharsNoCheck(int srcBegin, int srcEnd, char[] dst, int dstBegin);
1881 // Since getChars() calls getCharsNoCheck(), we use registers rather than constants.
1882 Register srcObj = XRegisterFrom(locations->InAt(0));
1883 Register srcBegin = XRegisterFrom(locations->InAt(1));
1884 Register srcEnd = XRegisterFrom(locations->InAt(2));
1885 Register dstObj = XRegisterFrom(locations->InAt(3));
1886 Register dstBegin = XRegisterFrom(locations->InAt(4));
1887
1888 Register src_ptr = XRegisterFrom(locations->GetTemp(0));
1889 Register num_chr = XRegisterFrom(locations->GetTemp(1));
1890 Register tmp1 = XRegisterFrom(locations->GetTemp(2));
1891
1892 UseScratchRegisterScope temps(masm);
1893 Register dst_ptr = temps.AcquireX();
1894 Register tmp2 = temps.AcquireX();
1895
1896 vixl::aarch64::Label done;
1897 vixl::aarch64::Label compressed_string_vector_loop;
1898 vixl::aarch64::Label compressed_string_remainder;
1899 __ Sub(num_chr, srcEnd, srcBegin);
1900 // Early out for valid zero-length retrievals.
1901 __ Cbz(num_chr, &done);
1902
1903 // dst address start to copy to.
1904 __ Add(dst_ptr, dstObj, Operand(data_offset));
1905 __ Add(dst_ptr, dst_ptr, Operand(dstBegin, LSL, 1));
1906
1907 // src address to copy from.
1908 __ Add(src_ptr, srcObj, Operand(value_offset));
1909 vixl::aarch64::Label compressed_string_preloop;
1910 if (mirror::kUseStringCompression) {
1911 // Location of count in string.
1912 const uint32_t count_offset = mirror::String::CountOffset().Uint32Value();
1913 // String's length.
1914 __ Ldr(tmp2, MemOperand(srcObj, count_offset));
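// The low bit of `count` is the compression flag (0 = compressed, 1 = uncompressed),
// so branch to the byte-wide copy path below when it is clear.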
1915 __ Tbz(tmp2, 0, &compressed_string_preloop);
1916 }
1917 __ Add(src_ptr, src_ptr, Operand(srcBegin, LSL, 1));
1918
1919 // Do the copy.
1920 vixl::aarch64::Label loop;
1921 vixl::aarch64::Label remainder;
1922
1923 // Avoid having to repair the value of num_chr on the < 8 character path.
1924 __ Subs(tmp1, num_chr, 8);
1925 __ B(lt, &remainder);
1926
1927 // Keep the result of the earlier subs, we are going to fetch at least 8 characters.
1928 __ Mov(num_chr, tmp1);
1929
1930 // The main loop, used for longer fetches, loads and stores 8x16-bit characters at a time.
1931 // (Unaligned addresses are acceptable here and not worth inlining extra code to rectify.)
1932 __ Bind(&loop);
1933 __ Ldp(tmp1, tmp2, MemOperand(src_ptr, char_size * 8, PostIndex));
1934 __ Subs(num_chr, num_chr, 8);
1935 __ Stp(tmp1, tmp2, MemOperand(dst_ptr, char_size * 8, PostIndex));
1936 __ B(ge, &loop);
1937
1938 __ Adds(num_chr, num_chr, 8);
1939 __ B(eq, &done);
1940
1941 // Main loop for < 8 character case and remainder handling. Loads and stores one
1942 // 16-bit Java character at a time.
1943 __ Bind(&remainder);
1944 __ Ldrh(tmp1, MemOperand(src_ptr, char_size, PostIndex));
1945 __ Subs(num_chr, num_chr, 1);
1946 __ Strh(tmp1, MemOperand(dst_ptr, char_size, PostIndex));
1947 __ B(gt, &remainder);
1948 __ B(&done);
1949
1950 if (mirror::kUseStringCompression) {
1951 // For compressed strings, acquire a SIMD temporary register.
1952 VRegister vtmp1 = temps.AcquireVRegisterOfSize(kQRegSize);
1953 const size_t c_char_size = DataType::Size(DataType::Type::kInt8);
1954 DCHECK_EQ(c_char_size, 1u);
1955 __ Bind(&compressed_string_preloop);
1956 __ Add(src_ptr, src_ptr, Operand(srcBegin));
1957
1958 // Avoid having to repair the value of num_chr on the < 8 character path.
1959 __ Subs(tmp1, num_chr, 8);
1960 __ B(lt, &compressed_string_remainder);
1961
1962 // Keep the result of the earlier subs, we are going to fetch at least 8 characters.
1963 __ Mov(num_chr, tmp1);
1964
1965 // Main loop for compressed src, copying 8 characters (8-bit) to (16-bit) at a time.
1966 // Uses SIMD instructions.
1967 __ Bind(&compressed_string_vector_loop);
1968 __ Ld1(vtmp1.V8B(), MemOperand(src_ptr, c_char_size * 8, PostIndex));
1969 __ Subs(num_chr, num_chr, 8);
1970 __ Uxtl(vtmp1.V8H(), vtmp1.V8B());
1971 __ St1(vtmp1.V8H(), MemOperand(dst_ptr, char_size * 8, PostIndex));
1972 __ B(ge, &compressed_string_vector_loop);
1973
1974 __ Adds(num_chr, num_chr, 8);
1975 __ B(eq, &done);
1976
1977 // Loop for < 8 character case and remainder handling with a compressed src.
1978 // Copies 1 character (8-bit) to (16-bit) at a time.
1979 __ Bind(&compressed_string_remainder);
1980 __ Ldrb(tmp1, MemOperand(src_ptr, c_char_size, PostIndex));
1981 __ Strh(tmp1, MemOperand(dst_ptr, char_size, PostIndex));
1982 __ Subs(num_chr, num_chr, Operand(1));
1983 __ B(gt, &compressed_string_remainder);
1984 }
1985
1986 __ Bind(&done);
1987 }
1988
1989 // Mirrors ARRAYCOPY_SHORT_CHAR_ARRAY_THRESHOLD in libcore, so we can choose to use the native
1990 // implementation there for longer copy lengths.
1991 static constexpr int32_t kSystemArrayCopyCharThreshold = 32;
1992
1993 static void SetSystemArrayCopyLocationRequires(LocationSummary* locations,
1994 uint32_t at,
1995 HInstruction* input) {
1996 HIntConstant* const_input = input->AsIntConstant();
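  // A constant position that cannot be encoded as an AArch64 ADD/SUB immediate (a 12-bit
  // value, optionally shifted left by 12) must live in a register; e.g. 16 or 0x1000 can
  // stay constants, while something like 0x12345 gets a register.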
1997 if (const_input != nullptr && !vixl::aarch64::Assembler::IsImmAddSub(const_input->GetValue())) {
1998 locations->SetInAt(at, Location::RequiresRegister());
1999 } else {
2000 locations->SetInAt(at, Location::RegisterOrConstant(input));
2001 }
2002 }
2003
2004 void IntrinsicLocationsBuilderARM64::VisitSystemArrayCopyChar(HInvoke* invoke) {
2005 // Check to see if we have known failures that will cause us to have to bail out
2006 // to the runtime, and just generate the runtime call directly.
2007 HIntConstant* src_pos = invoke->InputAt(1)->AsIntConstant();
2008 HIntConstant* dst_pos = invoke->InputAt(3)->AsIntConstant();
2009
2010 // The positions must be non-negative.
2011 if ((src_pos != nullptr && src_pos->GetValue() < 0) ||
2012 (dst_pos != nullptr && dst_pos->GetValue() < 0)) {
2013 // We will have to fail anyway.
2014 return;
2015 }
2016
2017 // The length must be >= 0 and not so long that we would (currently) prefer libcore's
2018 // native implementation.
2019 HIntConstant* length = invoke->InputAt(4)->AsIntConstant();
2020 if (length != nullptr) {
2021 int32_t len = length->GetValue();
2022 if (len < 0 || len > kSystemArrayCopyCharThreshold) {
2023 // Just call as normal.
2024 return;
2025 }
2026 }
2027
2028 ArenaAllocator* allocator = invoke->GetBlock()->GetGraph()->GetAllocator();
2029 LocationSummary* locations =
2030 new (allocator) LocationSummary(invoke, LocationSummary::kCallOnSlowPath, kIntrinsified);
2031 // arraycopy(char[] src, int src_pos, char[] dst, int dst_pos, int length).
2032 locations->SetInAt(0, Location::RequiresRegister());
2033 SetSystemArrayCopyLocationRequires(locations, 1, invoke->InputAt(1));
2034 locations->SetInAt(2, Location::RequiresRegister());
2035 SetSystemArrayCopyLocationRequires(locations, 3, invoke->InputAt(3));
2036 SetSystemArrayCopyLocationRequires(locations, 4, invoke->InputAt(4));
2037
2038 locations->AddTemp(Location::RequiresRegister());
2039 locations->AddTemp(Location::RequiresRegister());
2040 locations->AddTemp(Location::RequiresRegister());
2041 }
2042
2043 static void CheckSystemArrayCopyPosition(MacroAssembler* masm,
2044 const Location& pos,
2045 const Register& input,
2046 const Location& length,
2047 SlowPathCodeARM64* slow_path,
2048 const Register& temp,
2049 bool length_is_input_length = false) {
2050 const int32_t length_offset = mirror::Array::LengthOffset().Int32Value();
2051 if (pos.IsConstant()) {
2052 int32_t pos_const = pos.GetConstant()->AsIntConstant()->GetValue();
2053 if (pos_const == 0) {
2054 if (!length_is_input_length) {
2055 // Check that length(input) >= length.
2056 __ Ldr(temp, MemOperand(input, length_offset));
2057 __ Cmp(temp, OperandFrom(length, DataType::Type::kInt32));
2058 __ B(slow_path->GetEntryLabel(), lt);
2059 }
2060 } else {
2061 // Check that length(input) >= pos.
2062 __ Ldr(temp, MemOperand(input, length_offset));
2063 __ Subs(temp, temp, pos_const);
2064 __ B(slow_path->GetEntryLabel(), lt);
2065
2066 // Check that (length(input) - pos) >= length.
2067 __ Cmp(temp, OperandFrom(length, DataType::Type::kInt32));
2068 __ B(slow_path->GetEntryLabel(), lt);
2069 }
2070 } else if (length_is_input_length) {
2071 // The only way the copy can succeed is if pos is zero.
2072 __ Cbnz(WRegisterFrom(pos), slow_path->GetEntryLabel());
2073 } else {
2074 // Check that pos >= 0.
2075 Register pos_reg = WRegisterFrom(pos);
2076 __ Tbnz(pos_reg, pos_reg.GetSizeInBits() - 1, slow_path->GetEntryLabel());
2077
2078 // Check that pos <= length(input) && (length(input) - pos) >= length.
2079 __ Ldr(temp, MemOperand(input, length_offset));
2080 __ Subs(temp, temp, pos_reg);
2081 // Ccmp if length(input) >= pos, else definitely bail to slow path (N!=V == lt).
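    // If the SUBS above went negative (pos > length(input)), the CCMP condition fails and
    // the flags are forced to N, so the following `lt` branch bails out; otherwise the
    // remaining space (length(input) - pos) is compared against `length`.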
2082 __ Ccmp(temp, OperandFrom(length, DataType::Type::kInt32), NFlag, ge);
2083 __ B(slow_path->GetEntryLabel(), lt);
2084 }
2085 }
2086
2087 // Compute base source address, base destination address, and end
2088 // source address for System.arraycopy* intrinsics in `src_base`,
2089 // `dst_base` and `src_end` respectively.
2090 static void GenSystemArrayCopyAddresses(MacroAssembler* masm,
2091 DataType::Type type,
2092 const Register& src,
2093 const Location& src_pos,
2094 const Register& dst,
2095 const Location& dst_pos,
2096 const Location& copy_length,
2097 const Register& src_base,
2098 const Register& dst_base,
2099 const Register& src_end) {
2100 // This routine is used by the SystemArrayCopy and the SystemArrayCopyChar intrinsics.
2101 DCHECK(type == DataType::Type::kReference || type == DataType::Type::kUint16)
2102 << "Unexpected element type: " << type;
2103 const int32_t element_size = DataType::Size(type);
2104 const int32_t element_size_shift = DataType::SizeShift(type);
2105 const uint32_t data_offset = mirror::Array::DataOffset(element_size).Uint32Value();
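  // Roughly: src_base = src + data_offset + src_pos * element_size (the scaled add below
  // uses LSL by the element size shift when the position is in a register), dst_base is
  // formed the same way from dst/dst_pos, and src_end = src_base + copy_length *
  // element_size marks where the copy stops.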
2106
2107 if (src_pos.IsConstant()) {
2108 int32_t constant = src_pos.GetConstant()->AsIntConstant()->GetValue();
2109 __ Add(src_base, src, element_size * constant + data_offset);
2110 } else {
2111 __ Add(src_base, src, data_offset);
2112 __ Add(src_base, src_base, Operand(XRegisterFrom(src_pos), LSL, element_size_shift));
2113 }
2114
2115 if (dst_pos.IsConstant()) {
2116 int32_t constant = dst_pos.GetConstant()->AsIntConstant()->GetValue();
2117 __ Add(dst_base, dst, element_size * constant + data_offset);
2118 } else {
2119 __ Add(dst_base, dst, data_offset);
2120 __ Add(dst_base, dst_base, Operand(XRegisterFrom(dst_pos), LSL, element_size_shift));
2121 }
2122
2123 if (copy_length.IsConstant()) {
2124 int32_t constant = copy_length.GetConstant()->AsIntConstant()->GetValue();
2125 __ Add(src_end, src_base, element_size * constant);
2126 } else {
2127 __ Add(src_end, src_base, Operand(XRegisterFrom(copy_length), LSL, element_size_shift));
2128 }
2129 }
2130
2131 void IntrinsicCodeGeneratorARM64::VisitSystemArrayCopyChar(HInvoke* invoke) {
2132 MacroAssembler* masm = GetVIXLAssembler();
2133 LocationSummary* locations = invoke->GetLocations();
2134 Register src = XRegisterFrom(locations->InAt(0));
2135 Location src_pos = locations->InAt(1);
2136 Register dst = XRegisterFrom(locations->InAt(2));
2137 Location dst_pos = locations->InAt(3);
2138 Location length = locations->InAt(4);
2139
2140 SlowPathCodeARM64* slow_path =
2141 new (codegen_->GetScopedAllocator()) IntrinsicSlowPathARM64(invoke);
2142 codegen_->AddSlowPath(slow_path);
2143
2144 // If source and destination are the same, take the slow path. Overlapping copy regions must be
2145 // copied in reverse and we can't know in all cases if it's needed.
2146 __ Cmp(src, dst);
2147 __ B(slow_path->GetEntryLabel(), eq);
2148
2149 // Bail out if the source is null.
2150 __ Cbz(src, slow_path->GetEntryLabel());
2151
2152 // Bail out if the destination is null.
2153 __ Cbz(dst, slow_path->GetEntryLabel());
2154
2155 if (!length.IsConstant()) {
2156 // Merge the following two comparisons into one:
2157 // If the length is negative, bail out (delegate to libcore's native implementation).
2158 // If the length > 32 then (currently) prefer libcore's native implementation.
2159 __ Cmp(WRegisterFrom(length), kSystemArrayCopyCharThreshold);
2160 __ B(slow_path->GetEntryLabel(), hi);
2161 } else {
2162 // We have already checked in the LocationsBuilder for the constant case.
2163 DCHECK_GE(length.GetConstant()->AsIntConstant()->GetValue(), 0);
2164 DCHECK_LE(length.GetConstant()->AsIntConstant()->GetValue(), 32);
2165 }
2166
2167 Register src_curr_addr = WRegisterFrom(locations->GetTemp(0));
2168 Register dst_curr_addr = WRegisterFrom(locations->GetTemp(1));
2169 Register src_stop_addr = WRegisterFrom(locations->GetTemp(2));
2170
2171 CheckSystemArrayCopyPosition(masm,
2172 src_pos,
2173 src,
2174 length,
2175 slow_path,
2176 src_curr_addr,
2177 false);
2178
2179 CheckSystemArrayCopyPosition(masm,
2180 dst_pos,
2181 dst,
2182 length,
2183 slow_path,
2184 src_curr_addr,
2185 false);
2186
2187 src_curr_addr = src_curr_addr.X();
2188 dst_curr_addr = dst_curr_addr.X();
2189 src_stop_addr = src_stop_addr.X();
2190
2191 GenSystemArrayCopyAddresses(masm,
2192 DataType::Type::kUint16,
2193 src,
2194 src_pos,
2195 dst,
2196 dst_pos,
2197 length,
2198 src_curr_addr,
2199 dst_curr_addr,
2200 src_stop_addr);
2201
2202 // Iterate over the arrays and do a raw copy of the chars.
2203 const int32_t char_size = DataType::Size(DataType::Type::kUint16);
2204 UseScratchRegisterScope temps(masm);
2205 Register tmp = temps.AcquireW();
2206 vixl::aarch64::Label loop, done;
2207 __ Bind(&loop);
2208 __ Cmp(src_curr_addr, src_stop_addr);
2209 __ B(&done, eq);
2210 __ Ldrh(tmp, MemOperand(src_curr_addr, char_size, PostIndex));
2211 __ Strh(tmp, MemOperand(dst_curr_addr, char_size, PostIndex));
2212 __ B(&loop);
2213 __ Bind(&done);
2214
2215 __ Bind(slow_path->GetExitLabel());
2216 }
2217
2218 // We can choose to use libcore's native implementation for longer copy lengths.
2219 static constexpr int32_t kSystemArrayCopyThreshold = 128;
2220
2221 // CodeGenerator::CreateSystemArrayCopyLocationSummary uses three temporary registers.
2222 // We want to use two temporary registers in order to reduce the register pressure on arm64,
2223 // so we don't use CodeGenerator::CreateSystemArrayCopyLocationSummary.
2224 void IntrinsicLocationsBuilderARM64::VisitSystemArrayCopy(HInvoke* invoke) {
2225 // The only read barrier implementation supporting the
2226 // SystemArrayCopy intrinsic is the Baker-style read barriers.
2227 if (kEmitCompilerReadBarrier && !kUseBakerReadBarrier) {
2228 return;
2229 }
2230
2231 // Check to see if we have known failures that will cause us to have to bail out
2232 // to the runtime, and just generate the runtime call directly.
2233 HIntConstant* src_pos = invoke->InputAt(1)->AsIntConstant();
2234 HIntConstant* dest_pos = invoke->InputAt(3)->AsIntConstant();
2235
2236 // The positions must be non-negative.
2237 if ((src_pos != nullptr && src_pos->GetValue() < 0) ||
2238 (dest_pos != nullptr && dest_pos->GetValue() < 0)) {
2239 // We will have to fail anyway.
2240 return;
2241 }
2242
2243 // The length must be >= 0.
2244 HIntConstant* length = invoke->InputAt(4)->AsIntConstant();
2245 if (length != nullptr) {
2246 int32_t len = length->GetValue();
2247 if (len < 0 || len >= kSystemArrayCopyThreshold) {
2248 // Just call as normal.
2249 return;
2250 }
2251 }
2252
2253 SystemArrayCopyOptimizations optimizations(invoke);
2254
2255 if (optimizations.GetDestinationIsSource()) {
2256 if (src_pos != nullptr && dest_pos != nullptr && src_pos->GetValue() < dest_pos->GetValue()) {
2257 // We only support backward copying if source and destination are the same.
2258 return;
2259 }
2260 }
2261
2262 if (optimizations.GetDestinationIsPrimitiveArray() || optimizations.GetSourceIsPrimitiveArray()) {
2263 // We currently don't intrinsify primitive copying.
2264 return;
2265 }
2266
2267 ArenaAllocator* allocator = invoke->GetBlock()->GetGraph()->GetAllocator();
2268 LocationSummary* locations =
2269 new (allocator) LocationSummary(invoke, LocationSummary::kCallOnSlowPath, kIntrinsified);
2270 // arraycopy(Object src, int src_pos, Object dest, int dest_pos, int length).
2271 locations->SetInAt(0, Location::RequiresRegister());
2272 SetSystemArrayCopyLocationRequires(locations, 1, invoke->InputAt(1));
2273 locations->SetInAt(2, Location::RequiresRegister());
2274 SetSystemArrayCopyLocationRequires(locations, 3, invoke->InputAt(3));
2275 SetSystemArrayCopyLocationRequires(locations, 4, invoke->InputAt(4));
2276
2277 locations->AddTemp(Location::RequiresRegister());
2278 locations->AddTemp(Location::RequiresRegister());
2279 if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
2280 // Temporary register IP0, obtained from the VIXL scratch register
2281 // pool, cannot be used in ReadBarrierSystemArrayCopySlowPathARM64
2282 // (because that register is clobbered by ReadBarrierMarkRegX
2283 // entry points). It cannot be used in calls to
2284 // CodeGeneratorARM64::GenerateFieldLoadWithBakerReadBarrier
2285 // either. For these reasons, get a third extra temporary register
2286 // from the register allocator.
2287 locations->AddTemp(Location::RequiresRegister());
2288 } else {
2289 // Cases other than Baker read barriers: the third temporary will
2290 // be acquired from the VIXL scratch register pool.
2291 }
2292 }
2293
2294 void IntrinsicCodeGeneratorARM64::VisitSystemArrayCopy(HInvoke* invoke) {
2295 // The only read barrier implementation supporting the
2296 // SystemArrayCopy intrinsic is the Baker-style read barriers.
2297 DCHECK(!kEmitCompilerReadBarrier || kUseBakerReadBarrier);
2298
2299 MacroAssembler* masm = GetVIXLAssembler();
2300 LocationSummary* locations = invoke->GetLocations();
2301
2302 uint32_t class_offset = mirror::Object::ClassOffset().Int32Value();
2303 uint32_t super_offset = mirror::Class::SuperClassOffset().Int32Value();
2304 uint32_t component_offset = mirror::Class::ComponentTypeOffset().Int32Value();
2305 uint32_t primitive_offset = mirror::Class::PrimitiveTypeOffset().Int32Value();
2306 uint32_t monitor_offset = mirror::Object::MonitorOffset().Int32Value();
2307
2308 Register src = XRegisterFrom(locations->InAt(0));
2309 Location src_pos = locations->InAt(1);
2310 Register dest = XRegisterFrom(locations->InAt(2));
2311 Location dest_pos = locations->InAt(3);
2312 Location length = locations->InAt(4);
2313 Register temp1 = WRegisterFrom(locations->GetTemp(0));
2314 Location temp1_loc = LocationFrom(temp1);
2315 Register temp2 = WRegisterFrom(locations->GetTemp(1));
2316 Location temp2_loc = LocationFrom(temp2);
2317
2318 SlowPathCodeARM64* intrinsic_slow_path =
2319 new (codegen_->GetScopedAllocator()) IntrinsicSlowPathARM64(invoke);
2320 codegen_->AddSlowPath(intrinsic_slow_path);
2321
2322 vixl::aarch64::Label conditions_on_positions_validated;
2323 SystemArrayCopyOptimizations optimizations(invoke);
2324
2325 // If source and destination are the same, we go to slow path if we need to do
2326 // forward copying.
2327 if (src_pos.IsConstant()) {
2328 int32_t src_pos_constant = src_pos.GetConstant()->AsIntConstant()->GetValue();
2329 if (dest_pos.IsConstant()) {
2330 int32_t dest_pos_constant = dest_pos.GetConstant()->AsIntConstant()->GetValue();
2331 if (optimizations.GetDestinationIsSource()) {
2332 // Checked when building locations.
2333 DCHECK_GE(src_pos_constant, dest_pos_constant);
2334 } else if (src_pos_constant < dest_pos_constant) {
2335 __ Cmp(src, dest);
2336 __ B(intrinsic_slow_path->GetEntryLabel(), eq);
2337 }
2338 // Checked when building locations.
2339 DCHECK(!optimizations.GetDestinationIsSource()
2340 || (src_pos_constant >= dest_pos.GetConstant()->AsIntConstant()->GetValue()));
2341 } else {
2342 if (!optimizations.GetDestinationIsSource()) {
2343 __ Cmp(src, dest);
2344 __ B(&conditions_on_positions_validated, ne);
2345 }
2346 __ Cmp(WRegisterFrom(dest_pos), src_pos_constant);
2347 __ B(intrinsic_slow_path->GetEntryLabel(), gt);
2348 }
2349 } else {
2350 if (!optimizations.GetDestinationIsSource()) {
2351 __ Cmp(src, dest);
2352 __ B(&conditions_on_positions_validated, ne);
2353 }
2354 __ Cmp(RegisterFrom(src_pos, invoke->InputAt(1)->GetType()),
2355 OperandFrom(dest_pos, invoke->InputAt(3)->GetType()));
2356 __ B(intrinsic_slow_path->GetEntryLabel(), lt);
2357 }
2358
2359 __ Bind(&conditions_on_positions_validated);
2360
2361 if (!optimizations.GetSourceIsNotNull()) {
2362 // Bail out if the source is null.
2363 __ Cbz(src, intrinsic_slow_path->GetEntryLabel());
2364 }
2365
2366 if (!optimizations.GetDestinationIsNotNull() && !optimizations.GetDestinationIsSource()) {
2367 // Bail out if the destination is null.
2368 __ Cbz(dest, intrinsic_slow_path->GetEntryLabel());
2369 }
2370
2371 // We have already checked in the LocationsBuilder for the constant case.
2372 if (!length.IsConstant() &&
2373 !optimizations.GetCountIsSourceLength() &&
2374 !optimizations.GetCountIsDestinationLength()) {
2375 // Merge the following two comparisons into one:
2376 // If the length is negative, bail out (delegate to libcore's native implementation).
2377 // If the length >= 128 then (currently) prefer native implementation.
2378 __ Cmp(WRegisterFrom(length), kSystemArrayCopyThreshold);
2379 __ B(intrinsic_slow_path->GetEntryLabel(), hs);
2380 }
2381 // Validity checks: source.
2382 CheckSystemArrayCopyPosition(masm,
2383 src_pos,
2384 src,
2385 length,
2386 intrinsic_slow_path,
2387 temp1,
2388 optimizations.GetCountIsSourceLength());
2389
2390 // Validity checks: dest.
2391 CheckSystemArrayCopyPosition(masm,
2392 dest_pos,
2393 dest,
2394 length,
2395 intrinsic_slow_path,
2396 temp1,
2397 optimizations.GetCountIsDestinationLength());
2398 {
2399 // We use a block to end the scratch scope before the write barrier, thus
2400 // freeing the temporary registers so they can be used in `MarkGCCard`.
2401 UseScratchRegisterScope temps(masm);
2402 Location temp3_loc; // Used only for Baker read barrier.
2403 Register temp3;
2404 if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
2405 temp3_loc = locations->GetTemp(2);
2406 temp3 = WRegisterFrom(temp3_loc);
2407 } else {
2408 temp3 = temps.AcquireW();
2409 }
2410
2411 if (!optimizations.GetDoesNotNeedTypeCheck()) {
2412 // Check whether all elements of the source array are assignable to the component
2413 // type of the destination array. We do two checks: the classes are the same,
2414 // or the destination is Object[]. If none of these checks succeed, we go to the
2415 // slow path.
2416
2417 if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
2418 if (!optimizations.GetSourceIsNonPrimitiveArray()) {
2419 // /* HeapReference<Class> */ temp1 = src->klass_
2420 codegen_->GenerateFieldLoadWithBakerReadBarrier(invoke,
2421 temp1_loc,
2422 src.W(),
2423 class_offset,
2424 temp3_loc,
2425 /* needs_null_check= */ false,
2426 /* use_load_acquire= */ false);
2427 // Bail out if the source is not a non primitive array.
2428 // /* HeapReference<Class> */ temp1 = temp1->component_type_
2429 codegen_->GenerateFieldLoadWithBakerReadBarrier(invoke,
2430 temp1_loc,
2431 temp1,
2432 component_offset,
2433 temp3_loc,
2434 /* needs_null_check= */ false,
2435 /* use_load_acquire= */ false);
2436 __ Cbz(temp1, intrinsic_slow_path->GetEntryLabel());
2437 // If heap poisoning is enabled, `temp1` has been unpoisoned
2438 // by the previous call to GenerateFieldLoadWithBakerReadBarrier.
2439 // /* uint16_t */ temp1 = static_cast<uint16>(temp1->primitive_type_);
2440 __ Ldrh(temp1, HeapOperand(temp1, primitive_offset));
2441 static_assert(Primitive::kPrimNot == 0, "Expected 0 for kPrimNot");
2442 __ Cbnz(temp1, intrinsic_slow_path->GetEntryLabel());
2443 }
2444
2445 // /* HeapReference<Class> */ temp1 = dest->klass_
2446 codegen_->GenerateFieldLoadWithBakerReadBarrier(invoke,
2447 temp1_loc,
2448 dest.W(),
2449 class_offset,
2450 temp3_loc,
2451 /* needs_null_check= */ false,
2452 /* use_load_acquire= */ false);
2453
2454 if (!optimizations.GetDestinationIsNonPrimitiveArray()) {
2455 // Bail out if the destination is not a non primitive array.
2456 //
2457 // Register `temp1` is not trashed by the read barrier emitted
2458 // by GenerateFieldLoadWithBakerReadBarrier below, as that
2459 // method produces a call to a ReadBarrierMarkRegX entry point,
2460 // which saves all potentially live registers, including
2461 // temporaries such as `temp1`.
2462 // /* HeapReference<Class> */ temp2 = temp1->component_type_
2463 codegen_->GenerateFieldLoadWithBakerReadBarrier(invoke,
2464 temp2_loc,
2465 temp1,
2466 component_offset,
2467 temp3_loc,
2468 /* needs_null_check= */ false,
2469 /* use_load_acquire= */ false);
2470 __ Cbz(temp2, intrinsic_slow_path->GetEntryLabel());
2471 // If heap poisoning is enabled, `temp2` has been unpoisoned
2472 // by the previous call to GenerateFieldLoadWithBakerReadBarrier.
2473 // /* uint16_t */ temp2 = static_cast<uint16>(temp2->primitive_type_);
2474 __ Ldrh(temp2, HeapOperand(temp2, primitive_offset));
2475 static_assert(Primitive::kPrimNot == 0, "Expected 0 for kPrimNot");
2476 __ Cbnz(temp2, intrinsic_slow_path->GetEntryLabel());
2477 }
2478
2479 // For the same reason given earlier, `temp1` is not trashed by the
2480 // read barrier emitted by GenerateFieldLoadWithBakerReadBarrier below.
2481 // /* HeapReference<Class> */ temp2 = src->klass_
2482 codegen_->GenerateFieldLoadWithBakerReadBarrier(invoke,
2483 temp2_loc,
2484 src.W(),
2485 class_offset,
2486 temp3_loc,
2487 /* needs_null_check= */ false,
2488 /* use_load_acquire= */ false);
2489 // Note: if heap poisoning is on, we are comparing two unpoisoned references here.
2490 __ Cmp(temp1, temp2);
2491
2492 if (optimizations.GetDestinationIsTypedObjectArray()) {
2493 vixl::aarch64::Label do_copy;
2494 __ B(&do_copy, eq);
2495 // /* HeapReference<Class> */ temp1 = temp1->component_type_
2496 codegen_->GenerateFieldLoadWithBakerReadBarrier(invoke,
2497 temp1_loc,
2498 temp1,
2499 component_offset,
2500 temp3_loc,
2501 /* needs_null_check= */ false,
2502 /* use_load_acquire= */ false);
2503 // /* HeapReference<Class> */ temp1 = temp1->super_class_
2504 // We do not need to emit a read barrier for the following
2505 // heap reference load, as `temp1` is only used in a
2506 // comparison with null below, and this reference is not
2507 // kept afterwards.
2508 __ Ldr(temp1, HeapOperand(temp1, super_offset));
2509 __ Cbnz(temp1, intrinsic_slow_path->GetEntryLabel());
2510 __ Bind(&do_copy);
2511 } else {
2512 __ B(intrinsic_slow_path->GetEntryLabel(), ne);
2513 }
2514 } else {
2515 // Non read barrier code.
2516
2517 // /* HeapReference<Class> */ temp1 = dest->klass_
2518 __ Ldr(temp1, MemOperand(dest, class_offset));
2519 // /* HeapReference<Class> */ temp2 = src->klass_
2520 __ Ldr(temp2, MemOperand(src, class_offset));
2521 bool did_unpoison = false;
2522 if (!optimizations.GetDestinationIsNonPrimitiveArray() ||
2523 !optimizations.GetSourceIsNonPrimitiveArray()) {
2524 // One or two of the references need to be unpoisoned. Unpoison them
2525 // both to make the identity check valid.
2526 codegen_->GetAssembler()->MaybeUnpoisonHeapReference(temp1);
2527 codegen_->GetAssembler()->MaybeUnpoisonHeapReference(temp2);
2528 did_unpoison = true;
2529 }
2530
2531 if (!optimizations.GetDestinationIsNonPrimitiveArray()) {
2532 // Bail out if the destination is not a non primitive array.
2533 // /* HeapReference<Class> */ temp3 = temp1->component_type_
2534 __ Ldr(temp3, HeapOperand(temp1, component_offset));
2535 __ Cbz(temp3, intrinsic_slow_path->GetEntryLabel());
2536 codegen_->GetAssembler()->MaybeUnpoisonHeapReference(temp3);
2537           // /* uint16_t */ temp3 = static_cast<uint16_t>(temp3->primitive_type_);
2538 __ Ldrh(temp3, HeapOperand(temp3, primitive_offset));
2539 static_assert(Primitive::kPrimNot == 0, "Expected 0 for kPrimNot");
2540 __ Cbnz(temp3, intrinsic_slow_path->GetEntryLabel());
2541 }
2542
2543 if (!optimizations.GetSourceIsNonPrimitiveArray()) {
2544 // Bail out if the source is not a non primitive array.
2545 // /* HeapReference<Class> */ temp3 = temp2->component_type_
2546 __ Ldr(temp3, HeapOperand(temp2, component_offset));
2547 __ Cbz(temp3, intrinsic_slow_path->GetEntryLabel());
2548 codegen_->GetAssembler()->MaybeUnpoisonHeapReference(temp3);
2549           // /* uint16_t */ temp3 = static_cast<uint16_t>(temp3->primitive_type_);
2550 __ Ldrh(temp3, HeapOperand(temp3, primitive_offset));
2551 static_assert(Primitive::kPrimNot == 0, "Expected 0 for kPrimNot");
2552 __ Cbnz(temp3, intrinsic_slow_path->GetEntryLabel());
2553 }
2554
2555 __ Cmp(temp1, temp2);
2556
2557 if (optimizations.GetDestinationIsTypedObjectArray()) {
2558 vixl::aarch64::Label do_copy;
2559 __ B(&do_copy, eq);
2560 if (!did_unpoison) {
2561 codegen_->GetAssembler()->MaybeUnpoisonHeapReference(temp1);
2562 }
2563 // /* HeapReference<Class> */ temp1 = temp1->component_type_
2564 __ Ldr(temp1, HeapOperand(temp1, component_offset));
2565 codegen_->GetAssembler()->MaybeUnpoisonHeapReference(temp1);
2566 // /* HeapReference<Class> */ temp1 = temp1->super_class_
2567 __ Ldr(temp1, HeapOperand(temp1, super_offset));
2568 // No need to unpoison the result, we're comparing against null.
2569 __ Cbnz(temp1, intrinsic_slow_path->GetEntryLabel());
2570 __ Bind(&do_copy);
2571 } else {
2572 __ B(intrinsic_slow_path->GetEntryLabel(), ne);
2573 }
2574 }
2575 } else if (!optimizations.GetSourceIsNonPrimitiveArray()) {
2576 DCHECK(optimizations.GetDestinationIsNonPrimitiveArray());
2577 // Bail out if the source is not a non primitive array.
2578 if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
2579 // /* HeapReference<Class> */ temp1 = src->klass_
2580 codegen_->GenerateFieldLoadWithBakerReadBarrier(invoke,
2581 temp1_loc,
2582 src.W(),
2583 class_offset,
2584 temp3_loc,
2585 /* needs_null_check= */ false,
2586 /* use_load_acquire= */ false);
2587 // /* HeapReference<Class> */ temp2 = temp1->component_type_
2588 codegen_->GenerateFieldLoadWithBakerReadBarrier(invoke,
2589 temp2_loc,
2590 temp1,
2591 component_offset,
2592 temp3_loc,
2593 /* needs_null_check= */ false,
2594 /* use_load_acquire= */ false);
2595 __ Cbz(temp2, intrinsic_slow_path->GetEntryLabel());
2596 // If heap poisoning is enabled, `temp2` has been unpoisoned
2597         // by the previous call to GenerateFieldLoadWithBakerReadBarrier.
2598 } else {
2599 // /* HeapReference<Class> */ temp1 = src->klass_
2600 __ Ldr(temp1, HeapOperand(src.W(), class_offset));
2601 codegen_->GetAssembler()->MaybeUnpoisonHeapReference(temp1);
2602 // /* HeapReference<Class> */ temp2 = temp1->component_type_
2603 __ Ldr(temp2, HeapOperand(temp1, component_offset));
2604 __ Cbz(temp2, intrinsic_slow_path->GetEntryLabel());
2605 codegen_->GetAssembler()->MaybeUnpoisonHeapReference(temp2);
2606 }
2607       // /* uint16_t */ temp2 = static_cast<uint16_t>(temp2->primitive_type_);
2608 __ Ldrh(temp2, HeapOperand(temp2, primitive_offset));
2609 static_assert(Primitive::kPrimNot == 0, "Expected 0 for kPrimNot");
2610 __ Cbnz(temp2, intrinsic_slow_path->GetEntryLabel());
2611 }
2612
2613 if (length.IsConstant() && length.GetConstant()->AsIntConstant()->GetValue() == 0) {
2614       // Zero constant length: no need to emit the loop code at all.
2615 } else {
2616 Register src_curr_addr = temp1.X();
2617 Register dst_curr_addr = temp2.X();
2618 Register src_stop_addr = temp3.X();
2619 vixl::aarch64::Label done;
2620 const DataType::Type type = DataType::Type::kReference;
2621 const int32_t element_size = DataType::Size(type);
2622
2623 if (length.IsRegister()) {
2624         // Don't enter the copy loop if the length is zero.
2625 __ Cbz(WRegisterFrom(length), &done);
2626 }
2627
2628 if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
2629 // TODO: Also convert this intrinsic to the IsGcMarking strategy?
2630
2631 // SystemArrayCopy implementation for Baker read barriers (see
2632 // also CodeGeneratorARM64::GenerateReferenceLoadWithBakerReadBarrier):
2633 //
2634         //   uint32_t rb_state = LockWord(src->monitor_).ReadBarrierState();
2635 // lfence; // Load fence or artificial data dependency to prevent load-load reordering
2636 // bool is_gray = (rb_state == ReadBarrier::GrayState());
2637 // if (is_gray) {
2638 // // Slow-path copy.
2639 // do {
2640 // *dest_ptr++ = MaybePoison(ReadBarrier::Mark(MaybeUnpoison(*src_ptr++)));
2641 // } while (src_ptr != end_ptr)
2642 // } else {
2643 // // Fast-path copy.
2644 // do {
2645 // *dest_ptr++ = *src_ptr++;
2646 // } while (src_ptr != end_ptr)
2647 // }
2648
2649 // Make sure `tmp` is not IP0, as it is clobbered by
2650 // ReadBarrierMarkRegX entry points in
2651 // ReadBarrierSystemArrayCopySlowPathARM64.
2652 DCHECK(temps.IsAvailable(ip0));
2653 temps.Exclude(ip0);
2654 Register tmp = temps.AcquireW();
2655 DCHECK_NE(LocationFrom(tmp).reg(), IP0);
2656 // Put IP0 back in the pool so that VIXL has at least one
2657 // scratch register available to emit macro-instructions (note
2658         // that IP1 is already used for `tmp`), as some
2659         // macro-instructions used in GenSystemArrayCopyAddresses
2660         // (invoked below) may require a scratch register (for
2661         // instance, to emit a load with a large constant offset).
2662 temps.Include(ip0);
2663
2664 // /* int32_t */ monitor = src->monitor_
2665 __ Ldr(tmp, HeapOperand(src.W(), monitor_offset));
2666 // /* LockWord */ lock_word = LockWord(monitor)
2667 static_assert(sizeof(LockWord) == sizeof(int32_t),
2668 "art::LockWord and int32_t have different sizes.");
2669
2670 // Introduce a dependency on the lock_word including rb_state,
2671 // to prevent load-load reordering, and without using
2672 // a memory barrier (which would be more expensive).
2673 // `src` is unchanged by this operation, but its value now depends
2674 // on `tmp`.
2675 __ Add(src.X(), src.X(), Operand(tmp.X(), LSR, 32));
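        // (The 32-bit load into the W view of `tmp` above zero-extends into
        // tmp.X(), so `Operand(tmp.X(), LSR, 32)` is always zero: the Add
        // leaves the value of `src` unchanged and only establishes the data
        // dependency on the loaded lock word described above.)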
2676
2677 // Compute base source address, base destination address, and end
2678 // source address for System.arraycopy* intrinsics in `src_base`,
2679 // `dst_base` and `src_end` respectively.
2680         // Note that `src_curr_addr` is computed from `src` (and
2681 // `src_pos`) here, and thus honors the artificial dependency
2682 // of `src` on `tmp`.
2683 GenSystemArrayCopyAddresses(masm,
2684 type,
2685 src,
2686 src_pos,
2687 dest,
2688 dest_pos,
2689 length,
2690 src_curr_addr,
2691 dst_curr_addr,
2692 src_stop_addr);
2693
2694 // Slow path used to copy array when `src` is gray.
2695 SlowPathCodeARM64* read_barrier_slow_path =
2696 new (codegen_->GetScopedAllocator()) ReadBarrierSystemArrayCopySlowPathARM64(
2697 invoke, LocationFrom(tmp));
2698 codegen_->AddSlowPath(read_barrier_slow_path);
2699
2700 // Given the numeric representation, it's enough to check the low bit of the rb_state.
2701 static_assert(ReadBarrier::NonGrayState() == 0, "Expecting non-gray to have value 0");
2702 static_assert(ReadBarrier::GrayState() == 1, "Expecting gray to have value 1");
2703 __ Tbnz(tmp, LockWord::kReadBarrierStateShift, read_barrier_slow_path->GetEntryLabel());
2704
2705 // Fast-path copy.
2706 // Iterate over the arrays and do a raw copy of the objects. We don't need to
2707 // poison/unpoison.
2708 vixl::aarch64::Label loop;
2709 __ Bind(&loop);
2710 __ Ldr(tmp, MemOperand(src_curr_addr, element_size, PostIndex));
2711 __ Str(tmp, MemOperand(dst_curr_addr, element_size, PostIndex));
2712 __ Cmp(src_curr_addr, src_stop_addr);
2713 __ B(&loop, ne);
2714
2715 __ Bind(read_barrier_slow_path->GetExitLabel());
2716 } else {
2717 // Non read barrier code.
2718 // Compute base source address, base destination address, and end
2719 // source address for System.arraycopy* intrinsics in `src_base`,
2720 // `dst_base` and `src_end` respectively.
2721 GenSystemArrayCopyAddresses(masm,
2722 type,
2723 src,
2724 src_pos,
2725 dest,
2726 dest_pos,
2727 length,
2728 src_curr_addr,
2729 dst_curr_addr,
2730 src_stop_addr);
2731 // Iterate over the arrays and do a raw copy of the objects. We don't need to
2732 // poison/unpoison.
2733 vixl::aarch64::Label loop;
2734 __ Bind(&loop);
2735 {
2736 Register tmp = temps.AcquireW();
2737 __ Ldr(tmp, MemOperand(src_curr_addr, element_size, PostIndex));
2738 __ Str(tmp, MemOperand(dst_curr_addr, element_size, PostIndex));
2739 }
2740 __ Cmp(src_curr_addr, src_stop_addr);
2741 __ B(&loop, ne);
2742 }
2743 __ Bind(&done);
2744 }
2745 }
2746
2747 // We only need one card marking on the destination array.
2748 codegen_->MarkGCCard(dest.W(), Register(), /* value_can_be_null= */ false);
2749
2750 __ Bind(intrinsic_slow_path->GetExitLabel());
2751 }
2752
2753 static void GenIsInfinite(LocationSummary* locations,
2754 bool is64bit,
2755 MacroAssembler* masm) {
2756 Operand infinity;
2757 Operand tst_mask;
2758 Register out;
2759
2760 if (is64bit) {
2761 infinity = kPositiveInfinityDouble;
2762 tst_mask = MaskLeastSignificant<uint64_t>(63);
2763 out = XRegisterFrom(locations->Out());
2764 } else {
2765 infinity = kPositiveInfinityFloat;
2766 tst_mask = MaskLeastSignificant<uint32_t>(31);
2767 out = WRegisterFrom(locations->Out());
2768 }
2769
2770 MoveFPToInt(locations, is64bit, masm);
2771 // Checks whether exponent bits are all 1 and fraction bits are all 0.
2772 __ Eor(out, out, infinity);
2773 // TST bitmask is used to mask out the sign bit: either 0x7fffffff or 0x7fffffffffffffff
2774 // depending on is64bit.
2775 __ Tst(out, tst_mask);
2776 __ Cset(out, eq);
2777 }
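// Illustrative sketch of the check above (not generated code; raw IEEE-754
// constants shown for the 32-bit case): a value is infinite iff its bits,
// with the sign masked out, equal the positive-infinity pattern.
//
//   bool IsInfiniteFloat(uint32_t bits) {                  // bits = floatToRawIntBits(f)
//     return ((bits ^ 0x7f800000u) & 0x7fffffffu) == 0;    // true for +Inf and -Inf
//   }
//
// The Eor cancels the exponent/fraction bits exactly when they match
// infinity, the Tst masks out the sign bit, and Cset(eq) materializes the
// boolean result.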
2778
2779 void IntrinsicLocationsBuilderARM64::VisitFloatIsInfinite(HInvoke* invoke) {
2780 CreateFPToIntLocations(allocator_, invoke);
2781 }
2782
2783 void IntrinsicCodeGeneratorARM64::VisitFloatIsInfinite(HInvoke* invoke) {
2784 GenIsInfinite(invoke->GetLocations(), /* is64bit= */ false, GetVIXLAssembler());
2785 }
2786
2787 void IntrinsicLocationsBuilderARM64::VisitDoubleIsInfinite(HInvoke* invoke) {
2788 CreateFPToIntLocations(allocator_, invoke);
2789 }
2790
2791 void IntrinsicCodeGeneratorARM64::VisitDoubleIsInfinite(HInvoke* invoke) {
2792 GenIsInfinite(invoke->GetLocations(), /* is64bit= */ true, GetVIXLAssembler());
2793 }
2794
2795 void IntrinsicLocationsBuilderARM64::VisitIntegerValueOf(HInvoke* invoke) {
2796 InvokeRuntimeCallingConvention calling_convention;
2797 IntrinsicVisitor::ComputeIntegerValueOfLocations(
2798 invoke,
2799 codegen_,
2800 calling_convention.GetReturnLocation(DataType::Type::kReference),
2801 Location::RegisterLocation(calling_convention.GetRegisterAt(0).GetCode()));
2802 }
2803
2804 void IntrinsicCodeGeneratorARM64::VisitIntegerValueOf(HInvoke* invoke) {
2805 IntrinsicVisitor::IntegerValueOfInfo info =
2806 IntrinsicVisitor::ComputeIntegerValueOfInfo(invoke, codegen_->GetCompilerOptions());
2807 LocationSummary* locations = invoke->GetLocations();
2808 MacroAssembler* masm = GetVIXLAssembler();
2809
2810 Register out = RegisterFrom(locations->Out(), DataType::Type::kReference);
2811 UseScratchRegisterScope temps(masm);
2812 Register temp = temps.AcquireW();
2813 if (invoke->InputAt(0)->IsConstant()) {
2814 int32_t value = invoke->InputAt(0)->AsIntConstant()->GetValue();
2815 if (static_cast<uint32_t>(value - info.low) < info.length) {
2816 // Just embed the j.l.Integer in the code.
2817 DCHECK_NE(info.value_boot_image_reference, IntegerValueOfInfo::kInvalidReference);
2818 codegen_->LoadBootImageAddress(out, info.value_boot_image_reference);
2819 } else {
2820 DCHECK(locations->CanCall());
2821 // Allocate and initialize a new j.l.Integer.
2822 // TODO: If we JIT, we could allocate the j.l.Integer now, and store it in the
2823 // JIT object table.
2824 codegen_->AllocateInstanceForIntrinsic(invoke->AsInvokeStaticOrDirect(),
2825 info.integer_boot_image_offset);
2826 __ Mov(temp.W(), value);
2827 __ Str(temp.W(), HeapOperand(out.W(), info.value_offset));
2828 // `value` is a final field :-( Ideally, we'd merge this memory barrier with the allocation
2829 // one.
2830 codegen_->GenerateMemoryBarrier(MemBarrierKind::kStoreStore);
2831 }
2832 } else {
2833 DCHECK(locations->CanCall());
2834 Register in = RegisterFrom(locations->InAt(0), DataType::Type::kInt32);
2835 // Check bounds of our cache.
2836 __ Add(out.W(), in.W(), -info.low);
2837 __ Cmp(out.W(), info.length);
2838 vixl::aarch64::Label allocate, done;
2839 __ B(&allocate, hs);
2840 // If the value is within the bounds, load the j.l.Integer directly from the array.
2841 codegen_->LoadBootImageAddress(temp, info.array_data_boot_image_reference);
2842 MemOperand source = HeapOperand(
2843 temp, out.X(), LSL, DataType::SizeShift(DataType::Type::kReference));
2844 codegen_->Load(DataType::Type::kReference, out, source);
2845 codegen_->GetAssembler()->MaybeUnpoisonHeapReference(out);
2846 __ B(&done);
2847 __ Bind(&allocate);
2848 // Otherwise allocate and initialize a new j.l.Integer.
2849 codegen_->AllocateInstanceForIntrinsic(invoke->AsInvokeStaticOrDirect(),
2850 info.integer_boot_image_offset);
2851 __ Str(in.W(), HeapOperand(out.W(), info.value_offset));
2852 // `value` is a final field :-( Ideally, we'd merge this memory barrier with the allocation
2853 // one.
2854 codegen_->GenerateMemoryBarrier(MemBarrierKind::kStoreStore);
2855 __ Bind(&done);
2856 }
2857 }
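// Note on the bounds checks above: `static_cast<uint32_t>(value - info.low) <
// info.length` is the usual single-comparison range check; subtracting `low`
// and comparing unsigned covers both `value < low` (which wraps to a large
// unsigned number) and `value >= low + length`. The non-constant path emits
// the same idea as `Add(out, in, -info.low)` followed by an unsigned
// `Cmp`/`B hs`. Hedged sketch (assuming the default boot-image cache of
// low = -128 and length = 256):
//
//   bool InIntegerCache(int32_t v) {
//     return static_cast<uint32_t>(v - (-128)) < 256u;
//   }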
2858
2859 void IntrinsicLocationsBuilderARM64::VisitThreadInterrupted(HInvoke* invoke) {
2860 LocationSummary* locations =
2861 new (allocator_) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
2862 locations->SetOut(Location::RequiresRegister());
2863 }
2864
2865 void IntrinsicCodeGeneratorARM64::VisitThreadInterrupted(HInvoke* invoke) {
2866 MacroAssembler* masm = GetVIXLAssembler();
2867 Register out = RegisterFrom(invoke->GetLocations()->Out(), DataType::Type::kInt32);
2868 UseScratchRegisterScope temps(masm);
2869 Register temp = temps.AcquireX();
2870
2871 __ Add(temp, tr, Thread::InterruptedOffset<kArm64PointerSize>().Int32Value());
2872 __ Ldar(out.W(), MemOperand(temp));
2873
2874 vixl::aarch64::Label done;
2875 __ Cbz(out.W(), &done);
2876 __ Stlr(wzr, MemOperand(temp));
2877 __ Bind(&done);
2878 }
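// Hedged sketch of the semantics of the code above (field and type names are
// illustrative, not the actual Thread layout): read the per-thread interrupted
// flag with acquire semantics and, only if it was set, clear it with release
// semantics.
//
//   std::atomic<int32_t>& flag = self->interrupted;
//   int32_t interrupted = flag.load(std::memory_order_acquire);   // Ldar
//   if (interrupted != 0) {
//     flag.store(0, std::memory_order_release);                   // Stlr
//   }
//   return interrupted;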
2879
2880 void IntrinsicLocationsBuilderARM64::VisitReachabilityFence(HInvoke* invoke) {
2881 LocationSummary* locations =
2882 new (allocator_) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
2883 locations->SetInAt(0, Location::Any());
2884 }
2885
2886 void IntrinsicCodeGeneratorARM64::VisitReachabilityFence(HInvoke* invoke ATTRIBUTE_UNUSED) { }
2887
2888 void IntrinsicLocationsBuilderARM64::VisitCRC32Update(HInvoke* invoke) {
2889 if (!codegen_->GetInstructionSetFeatures().HasCRC()) {
2890 return;
2891 }
2892
2893 LocationSummary* locations = new (allocator_) LocationSummary(invoke,
2894 LocationSummary::kNoCall,
2895 kIntrinsified);
2896
2897 locations->SetInAt(0, Location::RequiresRegister());
2898 locations->SetInAt(1, Location::RequiresRegister());
2899 locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap);
2900 }
2901
2902 // Lower the invoke of CRC32.update(int crc, int b).
2903 void IntrinsicCodeGeneratorARM64::VisitCRC32Update(HInvoke* invoke) {
2904 DCHECK(codegen_->GetInstructionSetFeatures().HasCRC());
2905
2906 MacroAssembler* masm = GetVIXLAssembler();
2907
2908 Register crc = InputRegisterAt(invoke, 0);
2909 Register val = InputRegisterAt(invoke, 1);
2910 Register out = OutputRegister(invoke);
2911
2912 // The general algorithm of the CRC32 calculation is:
2913 // crc = ~crc
2914 // result = crc32_for_byte(crc, b)
2915 // crc = ~result
2916 // It is directly lowered to three instructions.
2917
2918 UseScratchRegisterScope temps(masm);
2919 Register tmp = temps.AcquireSameSizeAs(out);
2920
2921 __ Mvn(tmp, crc);
2922 __ Crc32b(tmp, tmp, val);
2923 __ Mvn(out, tmp);
2924 }
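// Hedged C sketch of the lowering above, using the ACLE intrinsic that maps to
// the CRC32B instruction (assuming <arm_acle.h> is available):
//
//   uint32_t Crc32UpdateByte(uint32_t crc, uint8_t b) {
//     return ~__crc32b(~crc, b);   // Mvn; Crc32b; Mvn
//   }
//
// The leading and trailing inversions implement the standard CRC-32
// pre-/post-conditioning (XOR with 0xFFFFFFFF) expected by
// java.util.zip.CRC32.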
2925
2926 // Generate code using CRC32 instructions which calculates
2927 // a CRC32 value of a byte.
2928 //
2929 // Parameters:
2930 // masm - VIXL macro assembler
2931 // crc - a register holding an initial CRC value
2932 // ptr - a register holding a memory address of bytes
2933 // length - a register holding a number of bytes to process
2934 // out - a register to put a result of calculation
2935 static void GenerateCodeForCalculationCRC32ValueOfBytes(MacroAssembler* masm,
2936 const Register& crc,
2937 const Register& ptr,
2938 const Register& length,
2939 const Register& out) {
2940 // The algorithm of CRC32 of bytes is:
2941 // crc = ~crc
2942   //   process the first few bytes to make the array 8-byte aligned
2943 // while array has 8 bytes do:
2944 // crc = crc32_of_8bytes(crc, 8_bytes(array))
2945 // if array has 4 bytes:
2946 // crc = crc32_of_4bytes(crc, 4_bytes(array))
2947 // if array has 2 bytes:
2948 // crc = crc32_of_2bytes(crc, 2_bytes(array))
2949 // if array has a byte:
2950 // crc = crc32_of_byte(crc, 1_byte(array))
2951 // crc = ~crc
2952
2953 vixl::aarch64::Label loop, done;
2954 vixl::aarch64::Label process_4bytes, process_2bytes, process_1byte;
2955 vixl::aarch64::Label aligned2, aligned4, aligned8;
2956
2957 // Use VIXL scratch registers as the VIXL macro assembler won't use them in
2958 // instructions below.
2959 UseScratchRegisterScope temps(masm);
2960 Register len = temps.AcquireW();
2961 Register array_elem = temps.AcquireW();
2962
2963 __ Mvn(out, crc);
2964 __ Mov(len, length);
2965
2966 __ Tbz(ptr, 0, &aligned2);
2967 __ Subs(len, len, 1);
2968 __ B(&done, lo);
2969 __ Ldrb(array_elem, MemOperand(ptr, 1, PostIndex));
2970 __ Crc32b(out, out, array_elem);
2971
2972 __ Bind(&aligned2);
2973 __ Tbz(ptr, 1, &aligned4);
2974 __ Subs(len, len, 2);
2975 __ B(&process_1byte, lo);
2976 __ Ldrh(array_elem, MemOperand(ptr, 2, PostIndex));
2977 __ Crc32h(out, out, array_elem);
2978
2979 __ Bind(&aligned4);
2980 __ Tbz(ptr, 2, &aligned8);
2981 __ Subs(len, len, 4);
2982 __ B(&process_2bytes, lo);
2983 __ Ldr(array_elem, MemOperand(ptr, 4, PostIndex));
2984 __ Crc32w(out, out, array_elem);
2985
2986 __ Bind(&aligned8);
2987 __ Subs(len, len, 8);
2988 // If len < 8 go to process data by 4 bytes, 2 bytes and a byte.
2989 __ B(&process_4bytes, lo);
2990
2991 // The main loop processing data by 8 bytes.
2992 __ Bind(&loop);
2993 __ Ldr(array_elem.X(), MemOperand(ptr, 8, PostIndex));
2994 __ Subs(len, len, 8);
2995 __ Crc32x(out, out, array_elem.X());
2996 // if len >= 8, process the next 8 bytes.
2997 __ B(&loop, hs);
2998
2999 // Process the data which is less than 8 bytes.
3000 // The code generated below works with values of len
3001 // which come in the range [-8, 0].
3002 // The first three bits are used to detect whether 4 bytes or 2 bytes or
3003 // a byte can be processed.
3004 // The checking order is from bit 2 to bit 0:
3005 // bit 2 is set: at least 4 bytes available
3006 // bit 1 is set: at least 2 bytes available
3007 // bit 0 is set: at least a byte available
3008 __ Bind(&process_4bytes);
3009 // Goto process_2bytes if less than four bytes available
3010 __ Tbz(len, 2, &process_2bytes);
3011 __ Ldr(array_elem, MemOperand(ptr, 4, PostIndex));
3012 __ Crc32w(out, out, array_elem);
3013
3014 __ Bind(&process_2bytes);
3015   // Goto process_1byte if less than two bytes available
3016 __ Tbz(len, 1, &process_1byte);
3017 __ Ldrh(array_elem, MemOperand(ptr, 2, PostIndex));
3018 __ Crc32h(out, out, array_elem);
3019
3020 __ Bind(&process_1byte);
3021 // Goto done if no bytes available
3022 __ Tbz(len, 0, &done);
3023 __ Ldrb(array_elem, MemOperand(ptr));
3024 __ Crc32b(out, out, array_elem);
3025
3026 __ Bind(&done);
3027 __ Mvn(out, out);
3028 }
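// Hedged C sketch of the same strategy using ACLE intrinsics (assuming
// <arm_acle.h> and <cstring>; pointer/length handling is illustrative, not the
// generated register allocation): align the head with 1/2/4-byte steps, run
// the main 8-byte loop, then mop up the 4/2/1-byte tail.
//
//   uint32_t Crc32Bytes(uint32_t crc, const uint8_t* p, int32_t len) {
//     crc = ~crc;
//     if ((reinterpret_cast<uintptr_t>(p) & 1) != 0 && len >= 1) {
//       crc = __crc32b(crc, *p); p += 1; len -= 1;
//     }
//     if ((reinterpret_cast<uintptr_t>(p) & 2) != 0 && len >= 2) {
//       uint16_t v; memcpy(&v, p, 2); crc = __crc32h(crc, v); p += 2; len -= 2;
//     }
//     if ((reinterpret_cast<uintptr_t>(p) & 4) != 0 && len >= 4) {
//       uint32_t v; memcpy(&v, p, 4); crc = __crc32w(crc, v); p += 4; len -= 4;
//     }
//     for (; len >= 8; p += 8, len -= 8) {
//       uint64_t v; memcpy(&v, p, 8); crc = __crc32d(crc, v);
//     }
//     if ((len & 4) != 0) { uint32_t v; memcpy(&v, p, 4); crc = __crc32w(crc, v); p += 4; }
//     if ((len & 2) != 0) { uint16_t v; memcpy(&v, p, 2); crc = __crc32h(crc, v); p += 2; }
//     if ((len & 1) != 0) { crc = __crc32b(crc, *p); }
//     return ~crc;
//   }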
3029
3030 // The threshold for sizes of arrays to use the library provided implementation
3031 // of CRC32.updateBytes instead of the intrinsic.
3032 static constexpr int32_t kCRC32UpdateBytesThreshold = 64 * 1024;
3033
3034 void IntrinsicLocationsBuilderARM64::VisitCRC32UpdateBytes(HInvoke* invoke) {
3035 if (!codegen_->GetInstructionSetFeatures().HasCRC()) {
3036 return;
3037 }
3038
3039 LocationSummary* locations =
3040 new (allocator_) LocationSummary(invoke,
3041 LocationSummary::kCallOnSlowPath,
3042 kIntrinsified);
3043
3044 locations->SetInAt(0, Location::RequiresRegister());
3045 locations->SetInAt(1, Location::RequiresRegister());
3046 locations->SetInAt(2, Location::RegisterOrConstant(invoke->InputAt(2)));
3047 locations->SetInAt(3, Location::RequiresRegister());
3048 locations->AddTemp(Location::RequiresRegister());
3049 locations->SetOut(Location::RequiresRegister());
3050 }
3051
3052 // Lower the invoke of CRC32.updateBytes(int crc, byte[] b, int off, int len)
3053 //
3054 // Note: The intrinsic is not used if len exceeds a threshold.
3055 void IntrinsicCodeGeneratorARM64::VisitCRC32UpdateBytes(HInvoke* invoke) {
3056 DCHECK(codegen_->GetInstructionSetFeatures().HasCRC());
3057
3058 MacroAssembler* masm = GetVIXLAssembler();
3059 LocationSummary* locations = invoke->GetLocations();
3060
3061 SlowPathCodeARM64* slow_path =
3062 new (codegen_->GetScopedAllocator()) IntrinsicSlowPathARM64(invoke);
3063 codegen_->AddSlowPath(slow_path);
3064
3065 Register length = WRegisterFrom(locations->InAt(3));
3066 __ Cmp(length, kCRC32UpdateBytesThreshold);
3067 __ B(slow_path->GetEntryLabel(), hi);
3068
3069 const uint32_t array_data_offset =
3070 mirror::Array::DataOffset(Primitive::kPrimByte).Uint32Value();
3071 Register ptr = XRegisterFrom(locations->GetTemp(0));
3072 Register array = XRegisterFrom(locations->InAt(1));
3073 Location offset = locations->InAt(2);
3074 if (offset.IsConstant()) {
3075 int32_t offset_value = offset.GetConstant()->AsIntConstant()->GetValue();
3076 __ Add(ptr, array, array_data_offset + offset_value);
3077 } else {
3078 __ Add(ptr, array, array_data_offset);
3079 __ Add(ptr, ptr, XRegisterFrom(offset));
3080 }
3081
3082 Register crc = WRegisterFrom(locations->InAt(0));
3083 Register out = WRegisterFrom(locations->Out());
3084
3085 GenerateCodeForCalculationCRC32ValueOfBytes(masm, crc, ptr, length, out);
3086
3087 __ Bind(slow_path->GetExitLabel());
3088 }
3089
3090 void IntrinsicLocationsBuilderARM64::VisitCRC32UpdateByteBuffer(HInvoke* invoke) {
3091 if (!codegen_->GetInstructionSetFeatures().HasCRC()) {
3092 return;
3093 }
3094
3095 LocationSummary* locations =
3096 new (allocator_) LocationSummary(invoke,
3097 LocationSummary::kNoCall,
3098 kIntrinsified);
3099
3100 locations->SetInAt(0, Location::RequiresRegister());
3101 locations->SetInAt(1, Location::RequiresRegister());
3102 locations->SetInAt(2, Location::RequiresRegister());
3103 locations->SetInAt(3, Location::RequiresRegister());
3104 locations->AddTemp(Location::RequiresRegister());
3105 locations->SetOut(Location::RequiresRegister());
3106 }
3107
3108 // Lower the invoke of CRC32.updateByteBuffer(int crc, long addr, int off, int len)
3109 //
3110 // There is no need to generate code checking whether addr is 0:
3111 // updateByteBuffer is a private method of java.util.zip.CRC32, so it is
3112 // never called from outside that class, and the address passed to it
3113 // always comes from a DirectBuffer. An empty DirectBuffer may use a zero
3114 // address, but then its length must be zero as well, and the generated
3115 // code handles a zero length correctly.
3117 void IntrinsicCodeGeneratorARM64::VisitCRC32UpdateByteBuffer(HInvoke* invoke) {
3118 DCHECK(codegen_->GetInstructionSetFeatures().HasCRC());
3119
3120 MacroAssembler* masm = GetVIXLAssembler();
3121 LocationSummary* locations = invoke->GetLocations();
3122
3123 Register addr = XRegisterFrom(locations->InAt(1));
3124 Register ptr = XRegisterFrom(locations->GetTemp(0));
3125 __ Add(ptr, addr, XRegisterFrom(locations->InAt(2)));
3126
3127 Register crc = WRegisterFrom(locations->InAt(0));
3128 Register length = WRegisterFrom(locations->InAt(3));
3129 Register out = WRegisterFrom(locations->Out());
3130 GenerateCodeForCalculationCRC32ValueOfBytes(masm, crc, ptr, length, out);
3131 }
3132
3133 void IntrinsicLocationsBuilderARM64::VisitFP16ToFloat(HInvoke* invoke) {
3134 if (!codegen_->GetInstructionSetFeatures().HasFP16()) {
3135 return;
3136 }
3137
3138 LocationSummary* locations = new (allocator_) LocationSummary(invoke,
3139 LocationSummary::kNoCall,
3140 kIntrinsified);
3141 locations->SetInAt(0, Location::RequiresRegister());
3142 locations->SetOut(Location::RequiresFpuRegister());
3143 }
3144
3145 void IntrinsicCodeGeneratorARM64::VisitFP16ToFloat(HInvoke* invoke) {
3146 DCHECK(codegen_->GetInstructionSetFeatures().HasFP16());
3147 MacroAssembler* masm = GetVIXLAssembler();
3148 UseScratchRegisterScope scratch_scope(masm);
3149 Register bits = InputRegisterAt(invoke, 0);
3150 VRegister out = SRegisterFrom(invoke->GetLocations()->Out());
3151 VRegister half = scratch_scope.AcquireH();
3152 __ Fmov(half, bits); // ARMv8.2
3153 __ Fcvt(out, half);
3154 }
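// Illustrative note: the two instructions above move the 16 raw half-precision
// bits into an FP16 register (Fmov, ARMv8.2-FP16) and widen them to single
// precision (Fcvt). For example, input bits 0x3C00 (FP16 1.0) produce the
// float 1.0f, and 0xFC00 (FP16 -Inf) produces negative infinity.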
3155
3156 void IntrinsicLocationsBuilderARM64::VisitFP16ToHalf(HInvoke* invoke) {
3157 if (!codegen_->GetInstructionSetFeatures().HasFP16()) {
3158 return;
3159 }
3160
3161 LocationSummary* locations = new (allocator_) LocationSummary(invoke,
3162 LocationSummary::kNoCall,
3163 kIntrinsified);
3164 locations->SetInAt(0, Location::RequiresFpuRegister());
3165 locations->SetOut(Location::RequiresRegister());
3166 }
3167
3168 void IntrinsicCodeGeneratorARM64::VisitFP16ToHalf(HInvoke* invoke) {
3169 DCHECK(codegen_->GetInstructionSetFeatures().HasFP16());
3170 MacroAssembler* masm = GetVIXLAssembler();
3171 UseScratchRegisterScope scratch_scope(masm);
3172 VRegister in = SRegisterFrom(invoke->GetLocations()->InAt(0));
3173 VRegister half = scratch_scope.AcquireH();
3174 Register out = WRegisterFrom(invoke->GetLocations()->Out());
3175 __ Fcvt(half, in);
3176 __ Fmov(out, half);
3177 __ Sxth(out, out); // sign extend due to returning a short type.
3178 }
3179
3180 template<typename OP>
3181 void GenerateFP16Round(HInvoke* invoke,
3182 CodeGeneratorARM64* const codegen_,
3183 MacroAssembler* masm,
3184 const OP roundOp) {
3185 DCHECK(codegen_->GetInstructionSetFeatures().HasFP16());
3186 LocationSummary* locations = invoke->GetLocations();
3187 UseScratchRegisterScope scratch_scope(masm);
3188 Register out = WRegisterFrom(locations->Out());
3189 VRegister half = scratch_scope.AcquireH();
3190 __ Fmov(half, WRegisterFrom(locations->InAt(0)));
3191 roundOp(half, half);
3192 __ Fmov(out, half);
3193 __ Sxth(out, out);
3194 }
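// GenerateFP16Round is instantiated below with Frintm (floor), Frintp (ceil)
// and Frintn (rint): the 16 input bits are moved into an FP16 register, the
// rounding is performed directly in half precision, and the result is moved
// back and sign-extended, since the Java-level half value is a 16-bit short.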
3195
3196 void IntrinsicLocationsBuilderARM64::VisitFP16Floor(HInvoke* invoke) {
3197 if (!codegen_->GetInstructionSetFeatures().HasFP16()) {
3198 return;
3199 }
3200
3201 CreateIntToIntLocations(allocator_, invoke);
3202 }
3203
3204 void IntrinsicCodeGeneratorARM64::VisitFP16Floor(HInvoke* invoke) {
3205 MacroAssembler* masm = GetVIXLAssembler();
3206 auto roundOp = [masm](const VRegister& out, const VRegister& in) {
3207 __ Frintm(out, in); // Round towards Minus infinity
3208 };
3209 GenerateFP16Round(invoke, codegen_, masm, roundOp);
3210 }
3211
3212 void IntrinsicLocationsBuilderARM64::VisitFP16Ceil(HInvoke* invoke) {
3213 if (!codegen_->GetInstructionSetFeatures().HasFP16()) {
3214 return;
3215 }
3216
3217 CreateIntToIntLocations(allocator_, invoke);
3218 }
3219
3220 void IntrinsicCodeGeneratorARM64::VisitFP16Ceil(HInvoke* invoke) {
3221 MacroAssembler* masm = GetVIXLAssembler();
3222 auto roundOp = [masm](const VRegister& out, const VRegister& in) {
3223 __ Frintp(out, in); // Round towards Plus infinity
3224 };
3225 GenerateFP16Round(invoke, codegen_, masm, roundOp);
3226 }
3227
3228 void IntrinsicLocationsBuilderARM64::VisitFP16Rint(HInvoke* invoke) {
3229 if (!codegen_->GetInstructionSetFeatures().HasFP16()) {
3230 return;
3231 }
3232
3233 CreateIntToIntLocations(allocator_, invoke);
3234 }
3235
3236 void IntrinsicCodeGeneratorARM64::VisitFP16Rint(HInvoke* invoke) {
3237 MacroAssembler* masm = GetVIXLAssembler();
3238 auto roundOp = [masm](const VRegister& out, const VRegister& in) {
3239 __ Frintn(out, in); // Round to nearest, with ties to even
3240 };
3241 GenerateFP16Round(invoke, codegen_, masm, roundOp);
3242 }
3243
3244 template<typename OP>
3245 void GenerateFP16Compare(HInvoke* invoke,
3246 CodeGeneratorARM64* codegen,
3247 MacroAssembler* masm,
3248 const OP compareOp) {
3249 DCHECK(codegen->GetInstructionSetFeatures().HasFP16());
3250 LocationSummary* locations = invoke->GetLocations();
3251 Register out = WRegisterFrom(locations->Out());
3252 VRegister half0 = HRegisterFrom(locations->GetTemp(0));
3253 VRegister half1 = HRegisterFrom(locations->GetTemp(1));
3254 __ Fmov(half0, WRegisterFrom(locations->InAt(0)));
3255 __ Fmov(half1, WRegisterFrom(locations->InAt(1)));
3256 compareOp(out, half0, half1);
3257 }
3258
3259 static inline void GenerateFP16Compare(HInvoke* invoke,
3260 CodeGeneratorARM64* codegen,
3261 MacroAssembler* masm,
3262 vixl::aarch64::Condition cond) {
3263 auto compareOp = [masm, cond](const Register out, const VRegister& in0, const VRegister& in1) {
3264 __ Fcmp(in0, in1);
3265 __ Cset(out, cond);
3266 };
3267 GenerateFP16Compare(invoke, codegen, masm, compareOp);
3268 }
3269
3270 void IntrinsicLocationsBuilderARM64::VisitFP16Greater(HInvoke* invoke) {
3271 if (!codegen_->GetInstructionSetFeatures().HasFP16()) {
3272 return;
3273 }
3274
3275 CreateIntIntToIntLocations(allocator_, invoke);
3276 invoke->GetLocations()->AddTemp(Location::RequiresFpuRegister());
3277 invoke->GetLocations()->AddTemp(Location::RequiresFpuRegister());
3278 }
3279
3280 void IntrinsicCodeGeneratorARM64::VisitFP16Greater(HInvoke* invoke) {
3281 MacroAssembler* masm = GetVIXLAssembler();
3282 GenerateFP16Compare(invoke, codegen_, masm, gt);
3283 }
3284
3285 void IntrinsicLocationsBuilderARM64::VisitFP16GreaterEquals(HInvoke* invoke) {
3286 if (!codegen_->GetInstructionSetFeatures().HasFP16()) {
3287 return;
3288 }
3289
3290 CreateIntIntToIntLocations(allocator_, invoke);
3291 invoke->GetLocations()->AddTemp(Location::RequiresFpuRegister());
3292 invoke->GetLocations()->AddTemp(Location::RequiresFpuRegister());
3293 }
3294
3295 void IntrinsicCodeGeneratorARM64::VisitFP16GreaterEquals(HInvoke* invoke) {
3296 MacroAssembler* masm = GetVIXLAssembler();
3297 GenerateFP16Compare(invoke, codegen_, masm, ge);
3298 }
3299
3300 void IntrinsicLocationsBuilderARM64::VisitFP16Less(HInvoke* invoke) {
3301 if (!codegen_->GetInstructionSetFeatures().HasFP16()) {
3302 return;
3303 }
3304
3305 CreateIntIntToIntLocations(allocator_, invoke);
3306 invoke->GetLocations()->AddTemp(Location::RequiresFpuRegister());
3307 invoke->GetLocations()->AddTemp(Location::RequiresFpuRegister());
3308 }
3309
3310 void IntrinsicCodeGeneratorARM64::VisitFP16Less(HInvoke* invoke) {
3311 MacroAssembler* masm = GetVIXLAssembler();
3312 GenerateFP16Compare(invoke, codegen_, masm, mi);
3313 }
3314
3315 void IntrinsicLocationsBuilderARM64::VisitFP16LessEquals(HInvoke* invoke) {
3316 if (!codegen_->GetInstructionSetFeatures().HasFP16()) {
3317 return;
3318 }
3319
3320 CreateIntIntToIntLocations(allocator_, invoke);
3321 invoke->GetLocations()->AddTemp(Location::RequiresFpuRegister());
3322 invoke->GetLocations()->AddTemp(Location::RequiresFpuRegister());
3323 }
3324
3325 void IntrinsicCodeGeneratorARM64::VisitFP16LessEquals(HInvoke* invoke) {
3326 MacroAssembler* masm = GetVIXLAssembler();
3327 GenerateFP16Compare(invoke, codegen_, masm, ls);
3328 }
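// Note on the condition codes used above: gt and ge are already false after an
// unordered (NaN) Fcmp result, while lt and le would be true, so FP16Less and
// FP16LessEquals use mi and ls instead. After Fcmp with a NaN operand the
// flags are N=0, Z=0, C=1, V=1; mi (N set) and ls (C clear or Z set) are both
// false for that pattern, giving the required "comparisons with NaN are false"
// semantics.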
3329
3330 UNIMPLEMENTED_INTRINSIC(ARM64, ReferenceGetReferent)
3331 UNIMPLEMENTED_INTRINSIC(ARM64, IntegerDivideUnsigned)
3332
3333 UNIMPLEMENTED_INTRINSIC(ARM64, StringStringIndexOf);
3334 UNIMPLEMENTED_INTRINSIC(ARM64, StringStringIndexOfAfter);
3335 UNIMPLEMENTED_INTRINSIC(ARM64, StringBufferAppend);
3336 UNIMPLEMENTED_INTRINSIC(ARM64, StringBufferLength);
3337 UNIMPLEMENTED_INTRINSIC(ARM64, StringBufferToString);
3338 UNIMPLEMENTED_INTRINSIC(ARM64, StringBuilderAppendObject);
3339 UNIMPLEMENTED_INTRINSIC(ARM64, StringBuilderAppendString);
3340 UNIMPLEMENTED_INTRINSIC(ARM64, StringBuilderAppendCharSequence);
3341 UNIMPLEMENTED_INTRINSIC(ARM64, StringBuilderAppendCharArray);
3342 UNIMPLEMENTED_INTRINSIC(ARM64, StringBuilderAppendBoolean);
3343 UNIMPLEMENTED_INTRINSIC(ARM64, StringBuilderAppendChar);
3344 UNIMPLEMENTED_INTRINSIC(ARM64, StringBuilderAppendInt);
3345 UNIMPLEMENTED_INTRINSIC(ARM64, StringBuilderAppendLong);
3346 UNIMPLEMENTED_INTRINSIC(ARM64, StringBuilderAppendFloat);
3347 UNIMPLEMENTED_INTRINSIC(ARM64, StringBuilderAppendDouble);
3348 UNIMPLEMENTED_INTRINSIC(ARM64, StringBuilderLength);
3349 UNIMPLEMENTED_INTRINSIC(ARM64, StringBuilderToString);
3350
3351 // 1.8.
3352 UNIMPLEMENTED_INTRINSIC(ARM64, UnsafeGetAndAddInt)
3353 UNIMPLEMENTED_INTRINSIC(ARM64, UnsafeGetAndAddLong)
3354 UNIMPLEMENTED_INTRINSIC(ARM64, UnsafeGetAndSetInt)
3355 UNIMPLEMENTED_INTRINSIC(ARM64, UnsafeGetAndSetLong)
3356 UNIMPLEMENTED_INTRINSIC(ARM64, UnsafeGetAndSetObject)
3357
3358 UNIMPLEMENTED_INTRINSIC(ARM64, VarHandleFullFence)
3359 UNIMPLEMENTED_INTRINSIC(ARM64, VarHandleAcquireFence)
3360 UNIMPLEMENTED_INTRINSIC(ARM64, VarHandleReleaseFence)
3361 UNIMPLEMENTED_INTRINSIC(ARM64, VarHandleLoadLoadFence)
3362 UNIMPLEMENTED_INTRINSIC(ARM64, VarHandleStoreStoreFence)
3363 UNIMPLEMENTED_INTRINSIC(ARM64, MethodHandleInvokeExact)
3364 UNIMPLEMENTED_INTRINSIC(ARM64, MethodHandleInvoke)
3365 UNIMPLEMENTED_INTRINSIC(ARM64, VarHandleCompareAndExchange)
3366 UNIMPLEMENTED_INTRINSIC(ARM64, VarHandleCompareAndExchangeAcquire)
3367 UNIMPLEMENTED_INTRINSIC(ARM64, VarHandleCompareAndExchangeRelease)
3368 UNIMPLEMENTED_INTRINSIC(ARM64, VarHandleCompareAndSet)
3369 UNIMPLEMENTED_INTRINSIC(ARM64, VarHandleGet)
3370 UNIMPLEMENTED_INTRINSIC(ARM64, VarHandleGetAcquire)
3371 UNIMPLEMENTED_INTRINSIC(ARM64, VarHandleGetAndAdd)
3372 UNIMPLEMENTED_INTRINSIC(ARM64, VarHandleGetAndAddAcquire)
3373 UNIMPLEMENTED_INTRINSIC(ARM64, VarHandleGetAndAddRelease)
3374 UNIMPLEMENTED_INTRINSIC(ARM64, VarHandleGetAndBitwiseAnd)
3375 UNIMPLEMENTED_INTRINSIC(ARM64, VarHandleGetAndBitwiseAndAcquire)
3376 UNIMPLEMENTED_INTRINSIC(ARM64, VarHandleGetAndBitwiseAndRelease)
3377 UNIMPLEMENTED_INTRINSIC(ARM64, VarHandleGetAndBitwiseOr)
3378 UNIMPLEMENTED_INTRINSIC(ARM64, VarHandleGetAndBitwiseOrAcquire)
3379 UNIMPLEMENTED_INTRINSIC(ARM64, VarHandleGetAndBitwiseOrRelease)
3380 UNIMPLEMENTED_INTRINSIC(ARM64, VarHandleGetAndBitwiseXor)
3381 UNIMPLEMENTED_INTRINSIC(ARM64, VarHandleGetAndBitwiseXorAcquire)
3382 UNIMPLEMENTED_INTRINSIC(ARM64, VarHandleGetAndBitwiseXorRelease)
3383 UNIMPLEMENTED_INTRINSIC(ARM64, VarHandleGetAndSet)
3384 UNIMPLEMENTED_INTRINSIC(ARM64, VarHandleGetAndSetAcquire)
3385 UNIMPLEMENTED_INTRINSIC(ARM64, VarHandleGetAndSetRelease)
3386 UNIMPLEMENTED_INTRINSIC(ARM64, VarHandleGetOpaque)
3387 UNIMPLEMENTED_INTRINSIC(ARM64, VarHandleGetVolatile)
3388 UNIMPLEMENTED_INTRINSIC(ARM64, VarHandleSet)
3389 UNIMPLEMENTED_INTRINSIC(ARM64, VarHandleSetOpaque)
3390 UNIMPLEMENTED_INTRINSIC(ARM64, VarHandleSetRelease)
3391 UNIMPLEMENTED_INTRINSIC(ARM64, VarHandleSetVolatile)
3392 UNIMPLEMENTED_INTRINSIC(ARM64, VarHandleWeakCompareAndSet)
3393 UNIMPLEMENTED_INTRINSIC(ARM64, VarHandleWeakCompareAndSetAcquire)
3394 UNIMPLEMENTED_INTRINSIC(ARM64, VarHandleWeakCompareAndSetPlain)
3395 UNIMPLEMENTED_INTRINSIC(ARM64, VarHandleWeakCompareAndSetRelease)
3396
3397 UNREACHABLE_INTRINSICS(ARM64)
3398
3399 #undef __
3400
3401 } // namespace arm64
3402 } // namespace art
3403