1 /*
2  * Copyright (C) 2012 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #ifndef LATINIME_BLOOM_FILTER_H
18 #define LATINIME_BLOOM_FILTER_H
19 
20 #include <bitset>
21 
22 #include "defines.h"
23 
24 namespace latinime {
25 
26 // This bloom filter is used for optimizing bigram retrieval.
27 // Execution times with previous word "this" are as follows:
28 //  without bloom filter (use only hash_map):
29 //   Total 147792.34 (sum of others 147771.57)
30 //  with bloom filter:
31 //   Total 145900.64 (sum of others 145874.30)
32 //  always read binary dictionary:
33 //   Total 148603.14 (sum of others 148579.90)
34 class BloomFilter {
35  public:
BloomFilter()36     BloomFilter() : mFilter() {}
37 
setInFilter(const int position)38     AK_FORCE_INLINE void setInFilter(const int position) {
39         mFilter.set(getIndex(position));
40     }
41 
isInFilter(const int position)42     AK_FORCE_INLINE bool isInFilter(const int position) const {
43         return mFilter.test(getIndex(position));
44     }
45 
46  private:
47     DISALLOW_ASSIGNMENT_OPERATOR(BloomFilter);
48 
getIndex(const int position)49     AK_FORCE_INLINE size_t getIndex(const int position) const {
50         return static_cast<size_t>(position) % BIGRAM_FILTER_MODULO;
51     }
52 
53     // Size, in bits, of the bloom filter index for bigrams
54     // The probability of false positive is (1 - e ** (-kn/m))**k,
55     // where k is the number of hash functions, n the number of bigrams, and m the number of
56     // bits we can test.
57     // At the moment 100 is the maximum number of bigrams for a word with the current main
58     // dictionaries, so n = 100. 1024 buckets give us m = 1024.
59     // With 1 hash function, our false positive rate is about 9.3%, which should be enough for
60     // our uses since we are only using this to increase average performance. For the record,
61     // k = 2 gives 3.1% and k = 3 gives 1.6%. With k = 1, making m = 2048 gives 4.8%,
62     // and m = 4096 gives 2.4%.
63     // This is assigned here because it is used for bitset size.
64     // 1021 is the largest prime under 1024.
65     static const size_t BIGRAM_FILTER_MODULO = 1021;
66     std::bitset<BIGRAM_FILTER_MODULO> mFilter;
67 };
68 } // namespace latinime
69 #endif // LATINIME_BLOOM_FILTER_H
70