1 /* Copyright 2019 Google LLC. All Rights Reserved.
2
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6
7 http://www.apache.org/licenses/LICENSE-2.0
8
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15
16 #include "ruy/allocator.h"
17
18 #include "ruy/opt_set.h"
19 #include "ruy/size_util.h"
20 #include "ruy/system_aligned_alloc.h"
21
22 namespace ruy {
23
~Allocator()24 Allocator::~Allocator() {
25 FreeAll();
26 detail::SystemAlignedFree(ptr_);
27 }
28
AllocateFast(std::ptrdiff_t num_bytes)29 void* Allocator::AllocateFast(std::ptrdiff_t num_bytes) {
30 if (current_ + num_bytes > size_) {
31 return nullptr;
32 }
33 void* ret = static_cast<char*>(ptr_) + current_;
34 current_ += num_bytes;
35 return ret;
36 }
37
AllocateSlow(std::ptrdiff_t num_bytes)38 void* Allocator::AllocateSlow(std::ptrdiff_t num_bytes) {
39 void* p = detail::SystemAlignedAlloc(num_bytes);
40 fallback_blocks_total_size_ += num_bytes;
41 fallback_blocks_.push_back(p);
42 return p;
43 }
44
AllocateBytes(std::ptrdiff_t num_bytes)45 void* Allocator::AllocateBytes(std::ptrdiff_t num_bytes) {
46 if (num_bytes == 0) {
47 return nullptr;
48 }
49 const std::ptrdiff_t rounded_num_bytes =
50 round_up_pot(num_bytes, detail::kMinimumBlockAlignment);
51 if (void* p = AllocateFast(rounded_num_bytes)) {
52 return p;
53 }
54 return AllocateSlow(rounded_num_bytes);
55 }
56
AllocateBytesAvoidingAliasingWith(std::ptrdiff_t num_bytes,const void * to_avoid)57 void* Allocator::AllocateBytesAvoidingAliasingWith(std::ptrdiff_t num_bytes,
58 const void* to_avoid) {
59 #if RUY_OPT(AVOID_ALIASING)
60 if (num_bytes == 0) {
61 return nullptr;
62 }
63 // The minimum L1D cache aliasing periodicity in bytes that we expect to
64 // encounter on any device. This does not seem to be documented, but
65 // empirically we observe the following:
66 // Cortex-A53: 1024
67 // Cortex-A55r1: 2048
68 // Cortex-A76: not as easily observable.
69 // Over-estimating this makes the AVOID_ALIASING optimization useless on
70 // devices with lower periodicity.
71 // Under-estimating this by 2x should be harmless.
72 // Under-estimating this by a larger factor should gradually degrade
73 // performance due to cache aliasing causing mutual eviction between
74 // the packed matrix data, and the source matrix data being prefetched by the
75 // CPU ahead of the packing code execution.
76 static constexpr std::uint32_t kMinPeriod = 1024;
77 static_assert(is_pot(kMinPeriod), "");
78 void* p = AllocateBytes(num_bytes + kMinPeriod);
79 auto unsigned_low_bits = [](const void* p) {
80 return static_cast<std::uint32_t>(reinterpret_cast<std::uintptr_t>(p));
81 };
82 // This relies on unsigned integer overflow wrapping around.
83 std::uint32_t diff_modulus =
84 (unsigned_low_bits(p) - unsigned_low_bits(to_avoid)) % kMinPeriod;
85 // diff_modulus is in [0, kMinPeriod).
86 // We want it as close as possible to the middle of that interval,
87 // kMinPeriod/2. The bad 'aliasing' case, that we are working to avoid,
88 // is when diff_modulus is close to the ends of that interval, 0 or
89 // kMinPeriod. So we want to add an offset of kMinPeriod/2 if it is in the
90 // first or the last quarter of that interval.
91 bool need_offset =
92 diff_modulus < kMinPeriod / 4 || diff_modulus > 3 * kMinPeriod / 4;
93 return static_cast<char*>(p) + (need_offset ? (kMinPeriod / 2) : 0);
94 #else
95 (void)to_avoid;
96 return AllocateBytes(num_bytes);
97 #endif
98 }
99
FreeAll()100 void Allocator::FreeAll() {
101 current_ = 0;
102 if (fallback_blocks_.empty()) {
103 return;
104 }
105
106 // No rounding-up of the size means linear instead of logarithmic
107 // bound on the number of allocation in some worst-case calling patterns.
108 // This is considered worth it because minimizing memory usage is important
109 // and actual calling patterns in applications that we care about still
110 // reach the no-further-allocations steady state in a small finite number
111 // of iterations.
112 std::ptrdiff_t new_size = size_ + fallback_blocks_total_size_;
113 detail::SystemAlignedFree(ptr_);
114 ptr_ = detail::SystemAlignedAlloc(new_size);
115 size_ = new_size;
116
117 for (void* p : fallback_blocks_) {
118 detail::SystemAlignedFree(p);
119 }
120 fallback_blocks_.clear();
121 fallback_blocks_total_size_ = 0;
122 }
123
124 } // namespace ruy
125