/*
 * Copyright (C) 2017 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#pragma once

#include "common.h"  // SLICER_CHECK

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstdio>
#include <memory>
#include <vector>

namespace slicer {

// A specialized Key -> T* map (note that, unlike std:: containers, the values
// are always pointers here and we don't explicitly store the lookup keys)
//
// Implemented as an incrementally resizable hash table: the logical hash table
// is split into two internal fixed-size tables, the "full table" and the
// "insertion table". When the insertion table overflows, we allocate a larger
// partition to replace it: the old "insertion table" becomes the "full table"
// and the values of the old "full table" are rehashed into the new partition.
//
// As in open-addressing hash tables, all the buckets live in a single,
// contiguous array, but the table keeps growing and collisions are still
// handled as chains (using indexes instead of pointers).
//
// The result is faster than std::unordered_map and uses ~25% of the
// memory used by a std::unordered_map<const char*, String*>.
//
// The Hash template argument is a type which must implement:
//   1. hash function   : uint32_t Hash(const Key& key)
//   2. key compare     : bool Compare(const Key& key, T* value)
//   3. key extraction  : Key GetKey(T* value)
//   4. copy semantics
//
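// For illustration only, a Hash policy for const char* keys mapping to a
// hypothetical String node type (String and its c_str() accessor are
// assumptions, not part of this header) could look like this FNV-1a sketch:
//
//   struct CstrHasher {
//     uint32_t Hash(const char* key) const {
//       uint32_t h = 2166136261u;                          // FNV offset basis
//       for (const char* p = key; *p != '\0'; ++p) {
//         h = (h ^ static_cast<uint8_t>(*p)) * 16777619u;  // FNV prime
//       }
//       return h;
//     }
//     bool Compare(const char* key, String* value) const {
//       return std::strcmp(key, value->c_str()) == 0;
//     }
//     const char* GetKey(String* value) const { return value->c_str(); }
//   };
//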
template<class Key, class T, class Hash>
class HashTable {
 private:
  // the index type inside the bucket array
  using Index = uint32_t;

  static constexpr Index kInitialHashBuckets = (1 << 7) - 1;
  static constexpr Index kAvgChainLength = 2;
  static constexpr Index kInvalidIndex = static_cast<Index>(-1);
  static constexpr double kResizeFactor = 1.6;

  struct __attribute__((packed)) Bucket {
    T* value = nullptr;
    Index next = kInvalidIndex;
  };

  class Partition {
   public:
    Partition(Index size, const Hash& hasher);
    bool Insert(T* value);
    T* Lookup(const Key& key, uint32_t hash_value) const;
    Index HashBuckets() const { return hash_buckets_; }
    void InsertAll(const Partition& src);
    void PrintStats(const char* name, bool verbose);

   private:
    std::vector<Bucket> buckets_;
    const Index hash_buckets_;
    Hash hasher_;
  };

 public:
  explicit HashTable(const Hash& hasher = Hash()) : hasher_(hasher) {
    // we start with full_table_ == nullptr
    insertion_table_.reset(new Partition(kInitialHashBuckets, hasher_));
  }

  ~HashTable() = default;

  // No move or copy semantics
  HashTable(const HashTable&) = delete;
  HashTable& operator=(const HashTable&) = delete;

  // Insert a new, non-nullptr T* into the hash table
  // (we only store unique values, so the new value must not be
  // in the table already; see the usage example after the class)
  void Insert(T* value);

  // Lookup an existing value
  // (returns nullptr if the value is not found)
  T* Lookup(const Key& key) const;

  void PrintStats(const char* name, bool verbose);

 private:
  std::unique_ptr<Partition> full_table_;
  std::unique_ptr<Partition> insertion_table_;
  Hash hasher_;
};
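
// Example usage (illustrative only; CstrHasher and String are the
// hypothetical types sketched in the comment above):
//
//   HashTable<const char*, String, CstrHasher> table;
//   table.Insert(new_string);             // new_string must not be in the table
//   String* value = table.Lookup("foo");  // returns nullptr if not found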

template<class Key, class T, class Hash>
HashTable<Key, T, Hash>::Partition::Partition(Index size, const Hash& hasher)
    : hash_buckets_(size), hasher_(hasher) {
  // reserve space for the hash buckets plus the expected collision chain cells
  buckets_.reserve(hash_buckets_ * kAvgChainLength);
  buckets_.resize(hash_buckets_);
}

// Similar to the "cellar" variant of coalesced hashing,
// the buckets array is divided into a fixed set of entries
// addressable by the hash value [0 .. hash_buckets_) plus
// extra buckets for the collision chains [hash_buckets_, buckets_.size()).
// Unlike coalesced hashing, our "cellar" grows, so we never actually
// have to coalesce any chains.
//
// Returns true if the insertion succeeded, false if the table overflows
// (we never insert more than the pre-reserved capacity)
//
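// For example (illustrative), with hash_buckets_ == 4, inserting a value whose
// key hashes to slot 1 while slot 1 is occupied appends a new chain cell at
// index buckets_.size() (say, 5) and splices it in at the head of the chain
// ("*" marks an occupied cell, "-> n" is the .next link):
//
//   before:  [0] [1]* -> 4  [2] [3] | [4]*
//   after:   [0] [1]* -> 5  [2] [3] | [4]*  [5]* -> 4
//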
template<class Key, class T, class Hash>
bool HashTable<Key, T, Hash>::Partition::Insert(T* value) {
  SLICER_CHECK(value != nullptr);
  // overflow?
  if (buckets_.size() + 1 > buckets_.capacity()) {
    return false;
  }
  auto key = hasher_.GetKey(value);
  Index bucket_index = hasher_.Hash(key) % hash_buckets_;
  if (buckets_[bucket_index].value == nullptr) {
    buckets_[bucket_index].value = value;
  } else {
    Bucket new_bucket = {};
    new_bucket.value = value;
    new_bucket.next = buckets_[bucket_index].next;
    buckets_[bucket_index].next = buckets_.size();
    buckets_.push_back(new_bucket);
  }
  return true;
}

template<class Key, class T, class Hash>
T* HashTable<Key, T, Hash>::Partition::Lookup(const Key& key, uint32_t hash_value) const {
  assert(hash_value == hasher_.Hash(key));
  Index bucket_index = hash_value % hash_buckets_;
  for (Index index = bucket_index; index != kInvalidIndex; index = buckets_[index].next) {
    auto value = buckets_[index].value;
    if (value == nullptr) {
      assert(index < hash_buckets_);
      break;
    } else if (hasher_.Compare(key, value)) {
      return value;
    }
  }
  return nullptr;
}

template<class Key, class T, class Hash>
void HashTable<Key, T, Hash>::Partition::InsertAll(const Partition& src) {
  for (const auto& bucket : src.buckets_) {
    if (bucket.value != nullptr) {
      SLICER_CHECK(Insert(bucket.value));
    }
  }
}

// Try to insert into the "insertion table". If it overflows,
// we allocate a new, larger partition, rehash the "full table" values
// into it, and the "insertion table" becomes the new "full table".
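// (For example, starting from kInitialHashBuckets == 127 and with
// kResizeFactor == 1.6, the successive partition sizes are 127, 203, 324,
// 518, ... buckets, the product being truncated to an integral Index.)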
template<class Key, class T, class Hash>
void HashTable<Key, T, Hash>::Insert(T* value) {
  assert(Lookup(hasher_.GetKey(value)) == nullptr);
  if (!insertion_table_->Insert(value)) {
    std::unique_ptr<Partition> new_hash_table(
        new Partition(insertion_table_->HashBuckets() * kResizeFactor, hasher_));
    if (full_table_) {
      new_hash_table->InsertAll(*full_table_);
    }
    SLICER_CHECK(new_hash_table->Insert(value));
    full_table_ = std::move(insertion_table_);
    insertion_table_ = std::move(new_hash_table);
  }
}

// First look in the "full table"; if the value is not
// found there, look in the "insertion table" next
template<class Key, class T, class Hash>
T* HashTable<Key, T, Hash>::Lookup(const Key& key) const {
  auto hash_value = hasher_.Hash(key);
  if (full_table_) {
    auto value = full_table_->Lookup(key, hash_value);
    if (value != nullptr) {
      return value;
    }
  }
  return insertion_table_->Lookup(key, hash_value);
}

template<class Key, class T, class Hash>
void HashTable<Key, T, Hash>::Partition::PrintStats(const char* name, bool verbose) {
  int max_chain_length = 0;
  int sum_chain_length = 0;
  int used_buckets = 0;
  for (Index i = 0; i < hash_buckets_; ++i) {
    if (verbose) printf("%4u : ", i);
    if (buckets_[i].value != nullptr) {
      ++used_buckets;
      int chain_length = 0;
      for (Index ci = i; buckets_[ci].next != kInvalidIndex; ci = buckets_[ci].next) {
        SLICER_CHECK(buckets_[ci].value != nullptr);
        ++chain_length;
        if (verbose) printf("*");
      }
      max_chain_length = std::max(max_chain_length, chain_length);
      sum_chain_length += chain_length;
    }
    if (verbose) printf("\n");
  }

  int avg_chain_length = used_buckets ? sum_chain_length / used_buckets : 0;

  printf("\nHash table partition (%s):\n", name);
  printf("  hash_buckets      : %u\n", hash_buckets_);
  printf("  size/capacity     : %zu / %zu\n", buckets_.size(), buckets_.capacity());
  printf("  used_buckets      : %d\n", used_buckets);
  printf("  max_chain_length  : %d\n", max_chain_length);
  printf("  avg_chain_length  : %d\n", avg_chain_length);
}

template<class Key, class T, class Hash>
void HashTable<Key, T, Hash>::PrintStats(const char* name, bool verbose) {
  printf("\nHash table stats (%s)\n", name);
  if (full_table_) {
    full_table_->PrintStats("full_table", verbose);
  }
  insertion_table_->PrintStats("insertion_table", verbose);
}

}  // namespace slicer