/*
 * Copyright (C) 2017 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#pragma once

// SLICER_CHECK is assumed to come from the project's common header
// (the exact include path is an assumption here)
#include "slicer/common.h"

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstdio>
#include <memory>
#include <utility>
#include <vector>

namespace slicer {

// A specialized Key -> T* map (note that, unlike std:: containers, the values
// are always pointers here, and we don't explicitly store the lookup keys)
//
// Implemented as an incrementally resizable hash table: the logical hash table
// is split into two internal fixed-size tables, the "full table" and an
// "insertion table". When the insertion table overflows, we allocate a larger
// hash table to replace it and the "insertion table" becomes the "full table"
// (the old "full table" is rehashed into the new hash table).
//
// As in open addressing hash tables, all the buckets live in a single,
// contiguous array. Unlike open addressing, though, the table grows and
// collisions are still handled as chains (using indexes instead of pointers).
//
// The result is faster than std::unordered_map and uses ~25% of
// the memory used by a std::unordered_map<const char*, String*>.
//
// The Hash template argument is a type which must implement:
//   1. hash function   : uint32_t Hash(const Key& key)
//   2. key compare     : bool Compare(const Key& key, T* value)
//   3. key extraction  : Key GetKey(T* value)
//   4. copy semantics
//
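// For illustration only (not part of this header), a minimal hasher for
// const char* keys could look like the sketch below; "String" and c_str()
// are assumed placeholder names for the stored value type:
//
//   struct CStrHash {
//     uint32_t Hash(const char* key) const {
//       uint32_t h = 2166136261u;  // FNV-1a offset basis
//       for (; *key != '\0'; ++key) {
//         h = (h ^ static_cast<uint8_t>(*key)) * 16777619u;  // FNV prime
//       }
//       return h;
//     }
//     bool Compare(const char* key, String* value) const {
//       return std::strcmp(key, value->c_str()) == 0;  // needs <cstring>
//     }
//     const char* GetKey(String* value) const { return value->c_str(); }
//   };
//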
template<class Key, class T, class Hash>
class HashTable {
 private:
  // the index type inside the bucket array
  using Index = uint32_t;

  static constexpr Index kInitialHashBuckets = (1 << 7) - 1;
  static constexpr Index kAvgChainLength = 2;
  static constexpr Index kInvalidIndex = static_cast<Index>(-1);
  static constexpr double kResizeFactor = 1.6;
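
  // With the defaults above, the initial partition reserves
  // kInitialHashBuckets * kAvgChainLength = 127 * 2 = 254 bucket slots:
  // 127 "home" slots addressed directly by hash value, plus extra cells for
  // the collision chains. Each resize grows the hash bucket count by ~1.6x.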

  struct __attribute__((packed)) Bucket {
    T* value = nullptr;
    Index next = kInvalidIndex;
  };

  class Partition {
   public:
    Partition(Index size, const Hash& hasher);
    bool Insert(T* value);
    T* Lookup(const Key& key, uint32_t hash_value) const;
    Index HashBuckets() const { return hash_buckets_; }
    void InsertAll(const Partition& src);
    void PrintStats(const char* name, bool verbose);

   private:
    std::vector<Bucket> buckets_;
    const Index hash_buckets_;
    Hash hasher_;
  };

 public:
  explicit HashTable(const Hash& hasher = Hash()) : hasher_(hasher) {
    // we start with full_table_ == nullptr
    insertion_table_.reset(new Partition(kInitialHashBuckets, hasher_));
  }

  ~HashTable() = default;

  // No move or copy semantics
  HashTable(const HashTable&) = delete;
  HashTable& operator=(const HashTable&) = delete;

  // Insert a new, non-nullptr T* into the hash table
  // (we only store unique values so the new value must
  // not be in the table already)
  void Insert(T* value);

  // Lookup an existing value
  // (returns nullptr if the value is not found)
  T* Lookup(const Key& key) const;

  void PrintStats(const char* name, bool verbose);

 private:
  std::unique_ptr<Partition> full_table_;
  std::unique_ptr<Partition> insertion_table_;
  Hash hasher_;
};
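
// Usage sketch (CStrHash and String are the illustrative placeholder names
// from the hasher example above, not part of this header):
//
//   slicer::HashTable<const char*, String, CStrHash> strings;
//   strings.Insert(str);                     // str must not already be in the table
//   String* match = strings.Lookup("foo");   // nullptr if not found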

template<class Key, class T, class Hash>
HashTable<Key, T, Hash>::Partition::Partition(Index size, const Hash& hasher)
    : hash_buckets_(size), hasher_(hasher) {
  // reserve capacity for kAvgChainLength entries per hash bucket:
  // the home slots themselves plus extra cells for the collision chains
  buckets_.reserve(hash_buckets_ * kAvgChainLength);
  buckets_.resize(hash_buckets_);
}

// Similar to the "cellar" version of coalesced hashing,
// the buckets array is divided into a fixed set of entries
// addressable by the hash value [0 .. hash_buckets_) and
// extra buckets for the collision chains [hash_buckets_, buckets_.size()).
// Unlike coalesced hashing, our "cellar" grows, so we never actually
// have to coalesce any chains.
//
// Returns true if the insertion succeeded, false if the table overflows
// (we never insert more than the pre-reserved capacity)
//
template<class Key, class T, class Hash>
bool HashTable<Key, T, Hash>::Partition::Insert(T* value) {
  SLICER_CHECK(value != nullptr);
  // overflow?
  if (buckets_.size() + 1 > buckets_.capacity()) {
    return false;
  }
  auto key = hasher_.GetKey(value);
  Index bucket_index = hasher_.Hash(key) % hash_buckets_;
  if (buckets_[bucket_index].value == nullptr) {
    buckets_[bucket_index].value = value;
  } else {
    Bucket new_bucket = {};
    new_bucket.value = value;
    new_bucket.next = buckets_[bucket_index].next;
    buckets_[bucket_index].next = buckets_.size();
    buckets_.push_back(new_bucket);
  }
  return true;
}

template<class Key, class T, class Hash>
T* HashTable<Key, T, Hash>::Partition::Lookup(const Key& key, uint32_t hash_value) const {
  assert(hash_value == hasher_.Hash(key));
  Index bucket_index = hash_value % hash_buckets_;
  for (Index index = bucket_index; index != kInvalidIndex; index = buckets_[index].next) {
    auto value = buckets_[index].value;
    if (value == nullptr) {
      // an empty slot can only be an unused "home" bucket; chain cells
      // are always occupied, so the lookup can stop here
      assert(index < hash_buckets_);
      break;
    } else if (hasher_.Compare(key, value)) {
      return value;
    }
  }
  return nullptr;
}

template<class Key, class T, class Hash>
void HashTable<Key, T, Hash>::Partition::InsertAll(const Partition& src) {
  for (const auto& bucket : src.buckets_) {
    if (bucket.value != nullptr) {
      SLICER_CHECK(Insert(bucket.value));
    }
  }
}

// Try to insert into the "insertion table". If that overflows,
// we allocate a new, larger hash table, move the "full table" values to it
// and the "insertion table" becomes the new "full table".
template<class Key, class T, class Hash>
void HashTable<Key, T, Hash>::Insert(T* value) {
  assert(Lookup(hasher_.GetKey(value)) == nullptr);
  if (!insertion_table_->Insert(value)) {
    std::unique_ptr<Partition> new_hash_table(
        new Partition(insertion_table_->HashBuckets() * kResizeFactor, hasher_));
    if (full_table_) {
      new_hash_table->InsertAll(*full_table_);
    }
    SLICER_CHECK(new_hash_table->Insert(value));
    full_table_ = std::move(insertion_table_);
    insertion_table_ = std::move(new_hash_table);
  }
}

// First look in the "full table"; if the value is not found
// there, try the "insertion table" next.
template<class Key, class T, class Hash>
T* HashTable<Key, T, Hash>::Lookup(const Key& key) const {
  auto hash_value = hasher_.Hash(key);
  if (full_table_) {
    auto value = full_table_->Lookup(key, hash_value);
    if (value != nullptr) {
      return value;
    }
  }
  return insertion_table_->Lookup(key, hash_value);
}

template<class Key, class T, class Hash>
void HashTable<Key, T, Hash>::Partition::PrintStats(const char* name, bool verbose) {
  int max_chain_length = 0;
  int sum_chain_length = 0;
  int used_buckets = 0;
  for (Index i = 0; i < hash_buckets_; ++i) {
    if (verbose) printf("%4u : ", i);
    if (buckets_[i].value != nullptr) {
      ++used_buckets;
      // count the extra links in this bucket's collision chain
      int chain_length = 0;
      for (Index ci = i; buckets_[ci].next != kInvalidIndex; ci = buckets_[ci].next) {
        SLICER_CHECK(buckets_[ci].value != nullptr);
        ++chain_length;
        if (verbose) printf("*");
      }
      max_chain_length = std::max(max_chain_length, chain_length);
      sum_chain_length += chain_length;
    }
    if (verbose) printf("\n");
  }

  int avg_chain_length = used_buckets ? sum_chain_length / used_buckets : 0;

  printf("\nHash table partition (%s):\n", name);
  printf("  hash_buckets                   : %u\n", hash_buckets_);
  printf("  size/capacity                  : %zu / %zu\n", buckets_.size(), buckets_.capacity());
  printf("  used_buckets                   : %d\n", used_buckets);
  printf("  max_chain_length               : %d\n", max_chain_length);
  printf("  avg_chain_length               : %d\n", avg_chain_length);
}

template<class Key, class T, class Hash>
void HashTable<Key, T, Hash>::PrintStats(const char* name, bool verbose) {
  printf("\nHash table stats (%s)\n", name);
  if (full_table_) {
    full_table_->PrintStats("full_table", verbose);
  }
  insertion_table_->PrintStats("insertion_table", verbose);
}

}  // namespace slicer