1 /* Copyright 2019 Google LLC. All Rights Reserved.
2
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6
7 http://www.apache.org/licenses/LICENSE-2.0
8
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15
16 #include "ruy/prepacked_cache.h"
17
18 #include "ruy/mat.h"
19 #include "ruy/profiler/instrumentation.h"
20 #include "ruy/system_aligned_alloc.h"
21
22 namespace ruy {
23
24 namespace {
25
26 // Allocates the `data` and `sums` buffers, and sets the corresponding
27 // pointer fields, in a PEMat whose other fields, particularly `layout`
28 // and the runtime data types, are already populated.
AllocateBuffers(PEMat * packed_matrix)29 int AllocateBuffers(PEMat* packed_matrix) {
30 const int data_bytes = DataBytes(*packed_matrix);
31 packed_matrix->data = detail::SystemAlignedAlloc(data_bytes);
32 int sums_bytes = 0;
33 if (!packed_matrix->sums_type.is_floating_point) {
34 // Integer quantized matrices also need the `sums` buffer.
35 sums_bytes = SumsBytes(*packed_matrix);
36 packed_matrix->sums = detail::SystemAlignedAlloc(sums_bytes);
37 }
38 return data_bytes + sums_bytes;
39 }
40
41 // Frees the `data` and `sums` buffers held by a PEMat.
FreeBuffers(const PEMat & packed_matrix)42 void FreeBuffers(const PEMat& packed_matrix) {
43 detail::SystemAlignedFree(packed_matrix.data);
44 detail::SystemAlignedFree(packed_matrix.sums);
45 }
46
47 } // end anonymous namespace
48
operator ()(const PrepackedCache::Key & key) const49 std::size_t PrepackedCache::KeyHash::operator()(
50 const PrepackedCache::Key& key) const {
51 std::size_t src_data_hash = reinterpret_cast<std::size_t>(key.src_data);
52 // Naive hash of the layout. Based on some heuristic reasoning, not any
53 // benchmarking.
54 // A choice of hash function here is just an optimization matter
55 // anyway, since a hash collision only results in some Key::operator== calls
56 // to disambiguate, and even just returning src_data_hash, ignoring the layout
57 // altogether, would probably be good enough, as the case of multiple entries
58 // with the same data pointer will be uncommon.
59 // Here we multiply-add the layout fields using some small constant prime
60 // numbers as multipliers. The conventional approach of xor-ing bit-rotations
61 // would result in some hash collisions because these values are typically
62 // small positive integers, so bit-rotations are essentially bit-shifts,
63 // and powers of two are common.
64 std::size_t packed_layout_hash =
65 static_cast<int>(key.packed_layout.order) +
66 static_cast<int>(key.packed_layout.kernel.order) * 2 +
67 key.packed_layout.stride * 3 + key.packed_layout.kernel.rows * 5 +
68 key.packed_layout.kernel.cols * 7 + key.packed_layout.rows * 11 +
69 key.packed_layout.cols * 13;
70 return src_data_hash ^ packed_layout_hash;
71 }
72
~PrepackedCache()73 PrepackedCache::~PrepackedCache() {
74 for (const auto& pair : cache_) {
75 FreeBuffers(pair.second.packed_matrix);
76 }
77 }
78
Get(const void * src_data,PEMat * packed_matrix)79 PrepackedCache::Action PrepackedCache::Get(const void* src_data,
80 PEMat* packed_matrix) {
81 // Construct a Key and look up the cache.
82 Key key;
83 key.src_data = src_data;
84 key.packed_layout = packed_matrix->layout;
85 key.zero_point = packed_matrix->zero_point;
86 const auto& itr = cache_.find(key);
87
88 if (itr != cache_.end()) {
89 // Found existing entry. Update its timestamp and return it.
90 itr->second.timestamp = timestamp_++;
91 *packed_matrix = itr->second.packed_matrix;
92 return Action::kGotExistingEntry;
93 }
94
95 // No existing entry found. Allocate new buffers now and insert in the cache.
96 const int new_bytes = AllocateBuffers(packed_matrix);
97 EjectUntilRoomFor(new_bytes);
98 Entry entry{*packed_matrix, timestamp_++};
99 cache_.emplace(key, entry);
100 buffers_bytes_ += new_bytes;
101 return Action::kInsertedNewEntry;
102 }
103
EjectUntilRoomFor(int new_bytes)104 void PrepackedCache::EjectUntilRoomFor(int new_bytes) {
105 profiler::ScopeLabel label("PrepackedCacheEjection");
106 // While we are above the threshold of ejection, eject the LRU entry.
107 while (!cache_.empty() && buffers_bytes_ + new_bytes > max_buffers_bytes_) {
108 EjectOne();
109 }
110 }
111
EjectOne()112 void PrepackedCache::EjectOne() {
113 auto oldest = cache_.begin();
114 Timestamp oldest_timestamp = oldest->second.timestamp;
115 {
116 for (auto itr = cache_.begin(); itr != cache_.end(); ++itr) {
117 if (itr->second.timestamp < oldest_timestamp) {
118 oldest = itr;
119 oldest_timestamp = itr->second.timestamp;
120 }
121 }
122 }
123 const PEMat& packed_matrix = oldest->second.packed_matrix;
124 buffers_bytes_ -= DataBytes(packed_matrix) + SumsBytes(packed_matrix);
125 FreeBuffers(packed_matrix);
126 cache_.erase(oldest);
127 }
128
129 } // namespace ruy
130