summaryrefslogtreecommitdiffstats
path: root/native/jni/src/suggest/core/dictionary/bloom_filter.h
diff options
context:
space:
mode:
authorKeisuke Kuroynagi <ksk@google.com>2013-06-14 20:35:41 +0900
committerKeisuke Kuroynagi <ksk@google.com>2013-06-14 20:35:41 +0900
commit1ff81e889045d35ff8420b266398e73239bd15c9 (patch)
tree6e576bd462a84209b9f94329c44b788aff464829 /native/jni/src/suggest/core/dictionary/bloom_filter.h
parent4f19193560c2eb4ecc9111b6c6daaae83352e649 (diff)
downloadandroid_packages_inputmethods_LatinIME-1ff81e889045d35ff8420b266398e73239bd15c9.tar.gz
android_packages_inputmethods_LatinIME-1ff81e889045d35ff8420b266398e73239bd15c9.tar.bz2
android_packages_inputmethods_LatinIME-1ff81e889045d35ff8420b266398e73239bd15c9.zip
Use bloom filter in multi bigram map.
Evaluated with previous word "this". without bloom filter (use only hash_map): Total 147792.34 (sum of others 147771.57) with bloom filter: Total 145900.64 (sum of others 145874.30) always read binary dictionary: Total 148603.14 (sum of others 148579.90) Bug: 8592527 Change-Id: I821dc39454543826adb73b9eeeef6408fad8ae28
Diffstat (limited to 'native/jni/src/suggest/core/dictionary/bloom_filter.h')
-rw-r--r--native/jni/src/suggest/core/dictionary/bloom_filter.h54
1 files changed, 43 insertions, 11 deletions
diff --git a/native/jni/src/suggest/core/dictionary/bloom_filter.h b/native/jni/src/suggest/core/dictionary/bloom_filter.h
index bcce1f7ea..5205456a8 100644
--- a/native/jni/src/suggest/core/dictionary/bloom_filter.h
+++ b/native/jni/src/suggest/core/dictionary/bloom_filter.h
@@ -23,16 +23,48 @@
namespace latinime {
-// TODO: uint32_t position
-static inline void setInFilter(uint8_t *filter, const int32_t position) {
- const uint32_t bucket = static_cast<uint32_t>(position % BIGRAM_FILTER_MODULO);
- filter[bucket >> 3] |= static_cast<uint8_t>(1 << (bucket & 0x7));
-}
-
-// TODO: uint32_t position
-static inline bool isInFilter(const uint8_t *filter, const int32_t position) {
- const uint32_t bucket = static_cast<uint32_t>(position % BIGRAM_FILTER_MODULO);
- return filter[bucket >> 3] & static_cast<uint8_t>(1 << (bucket & 0x7));
-}
+// This bloom filter is used for optimizing bigram retrieval.
+// Execution times with previous word "this" are as follows:
+// without bloom filter (use only hash_map):
+// Total 147792.34 (sum of others 147771.57)
+// with bloom filter:
+// Total 145900.64 (sum of others 145874.30)
+// always read binary dictionary:
+// Total 148603.14 (sum of others 148579.90)
+class BloomFilter {
+ public:
+ BloomFilter() {
+ ASSERT(BIGRAM_FILTER_BYTE_SIZE * 8 >= BIGRAM_FILTER_MODULO);
+ }
+
+ // TODO: uint32_t position
+ AK_FORCE_INLINE void setInFilter(const int32_t position) {
+ const uint32_t bucket = static_cast<uint32_t>(position % BIGRAM_FILTER_MODULO);
+ mFilter[bucket >> 3] |= static_cast<uint8_t>(1 << (bucket & 0x7));
+ }
+
+ // TODO: uint32_t position
+ AK_FORCE_INLINE bool isInFilter(const int32_t position) const {
+ const uint32_t bucket = static_cast<uint32_t>(position % BIGRAM_FILTER_MODULO);
+ return (mFilter[bucket >> 3] & static_cast<uint8_t>(1 << (bucket & 0x7))) != 0;
+ }
+
+ private:
+ // Size, in bytes, of the bloom filter index for bigrams
+ // 128 gives us 1024 buckets. The probability of false positive is (1 - e ** (-kn/m))**k,
+ // where k is the number of hash functions, n the number of bigrams, and m the number of
+ // bits we can test.
+ // At the moment 100 is the maximum number of bigrams for a word with the current
+ // dictionaries, so n = 100. 1024 buckets give us m = 1024.
+ // With 1 hash function, our false positive rate is about 9.3%, which should be enough for
+ // our uses since we are only using this to increase average performance. For the record,
+ // k = 2 gives 3.1% and k = 3 gives 1.6%. With k = 1, making m = 2048 gives 4.8%,
+ // and m = 4096 gives 2.4%.
+ // This is assigned here because it is used for array size.
+ static const int BIGRAM_FILTER_BYTE_SIZE = 128;
+ static const int BIGRAM_FILTER_MODULO;
+
+ uint8_t mFilter[BIGRAM_FILTER_BYTE_SIZE];
+};
} // namespace latinime
#endif // LATINIME_BLOOM_FILTER_H