From 38a00aea59a787fef29e882981d404f917f3a90a Mon Sep 17 00:00:00 2001
From: Andreas Dewes
Date: Wed, 1 Apr 2015 19:35:12 +0200
Subject: [PATCH] Added nstar and nstar_intersection functions as well as tests.

---
 README.rst         | 11 ++++++++++
 pybloom/pybloom.py | 29 +++++++++++++++++++++++++++
 pybloom/tests.py   | 59 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 99 insertions(+)

diff --git a/README.rst b/README.rst
index 4dd2106..b2a4fb9 100644
--- a/README.rst
+++ b/README.rst
@@ -38,6 +38,16 @@ functions.
     >>> (1.0 - (len(f) / float(f.capacity))) <= f.error_rate + 2e-18
     True
 
+    # f.nstar() returns an estimate of the number of elements in
+    # the Bloom filter f.
+
+    >>> print(f.nstar())
+
+    # f1.nstar_intersection(f2) returns an estimate of the number of elements in
+    # the intersection between the filters f1 and f2.
+
+    >>> print(f1.nstar_intersection(f2))
+
     >>> from pybloom import ScalableBloomFilter
     >>> sbf = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH)
     >>> count = 10000
@@ -50,3 +60,4 @@ functions.
     # len(sbf) may not equal the entire input length. 0.01% error is well
     # below the default 0.1% error threshold. As the capacity goes up, the
     # error will approach 0.1%.
+
diff --git a/pybloom/pybloom.py b/pybloom/pybloom.py
index beeefe4..ec3d25b 100644
--- a/pybloom/pybloom.py
+++ b/pybloom/pybloom.py
@@ -171,6 +171,35 @@ def __len__(self):
         """Return the number of keys stored by this bloom filter."""
         return self.count
 
+    def nstar(self):
+        """Return an estimate of the number of keys in this bloom filter.
+
+        Implements n* = -(m / k) * ln(1 - X / m), where m is num_bits,
+        k is num_slices and X is the number of bits set to True. See
+        http://en.wikipedia.org/wiki/Bloom_filter (The union and
+        intersection of sets).
+
+        Raises ValueError if every bit is set: the estimate diverges and
+        math.log(0) would otherwise fail with a bare "math domain error".
+        """
+        bits_set = self.bitarray.count(True)
+        if bits_set >= self.num_bits:
+            raise ValueError("Cannot estimate the size of a saturated "
+                             "bloom filter")
+        fill_ratio = float(bits_set) / float(self.num_bits)
+        return -self.num_bits * math.log(1 - fill_ratio) / self.num_slices
+
+    def nstar_intersection(self, bloom):
+        """Return an estimate of the number of keys shared with `bloom`.
+
+        Computed by inclusion-exclusion over the nstar() estimates:
+        n*(A) + n*(B) - n*(A union B). See
+        http://en.wikipedia.org/wiki/Bloom_filter (The union and
+        intersection of sets).
+        """
+        new_bloom = self.union(bloom)
+        return self.nstar() + bloom.nstar() - new_bloom.nstar()
+
     def add(self, key, skip_check=False):
         """ Adds a key to this bloom filter. If the key already exists in this
         filter it will return True. Otherwise False.
diff --git a/pybloom/tests.py b/pybloom/tests.py
index 13d9b7d..89b3894 100644
--- a/pybloom/tests.py
+++ b/pybloom/tests.py
@@ -23,6 +23,7 @@ def additional_tests():
     return suite
 
 class TestUnionIntersection(unittest.TestCase):
+
     def test_union(self):
         bloom_one = BloomFilter(100, 0.001)
         bloom_two = BloomFilter(100, 0.001)
@@ -49,6 +50,64 @@ def test_intersection(self):
         for char in chars[int(len(chars)/2):]:
             self.assertTrue(char not in new_bloom)
 
+    def test_nstar(self):
+        bloom = BloomFilter(1000, 0.001)
+        chars = [chr(i) for i in range_fn(0,200)]
+        for char in chars:
+            bloom.add(char)
+        self.assertTrue(bloom.nstar() > len(chars)-10 and bloom.nstar() < len(chars)+10)
+
+    def test_nstar_saturated(self):
+        bloom = BloomFilter(100, 0.001)
+        bloom.bitarray.setall(True)
+        self.assertRaises(ValueError, bloom.nstar)
+
+    def test_nstar_intersection_1(self):
+        bloom_one = BloomFilter(200, 0.001)
+        bloom_two = BloomFilter(200, 0.001)
+        chars = [chr(i) for i in range_fn(0, 200)]
+        for char in chars:
+            bloom_one.add(char)
+        for char in chars[:int(len(chars)/2)]:
+            bloom_two.add(char)
+        new_bloom = bloom_one.intersection(bloom_two)
+
+        self.assertTrue(bloom_one.nstar() > len(chars)-10 and bloom_one.nstar() < len(chars)+10)
+        self.assertTrue(bloom_two.nstar() > len(chars)/2-10 and bloom_two.nstar() < len(chars)/2+10)
+        self.assertTrue(new_bloom.nstar() > len(chars)/2-10 and new_bloom.nstar() < len(chars)/2+10)
+
+    def test_nstar_intersection_2(self):
+        bloom_one = BloomFilter(200, 0.001)
+        bloom_two = BloomFilter(200, 0.001)
+        chars = [chr(i) for i in range_fn(0, 200)]
+        for char in chars[int(len(chars)/2):]:
+            bloom_one.add(char)
+        for char in chars[:int(len(chars)/2)]:
+            bloom_two.add(char)
+        new_bloom = bloom_one.intersection(bloom_two)
+
+        self.assertTrue(bloom_one.nstar() > len(chars)/2-10 and bloom_one.nstar() < len(chars)/2+10)
+        self.assertTrue(bloom_two.nstar() > len(chars)/2-10 and bloom_two.nstar() < len(chars)/2+10)
+
+        # nstar() overestimates on the intersection filter of two disjoint sets.
+        self.assertTrue(new_bloom.nstar() > 10)
+
+        self.assertTrue(bloom_one.nstar_intersection(bloom_two) < 10)
+
+    def test_nstar_union(self):
+        bloom_one = BloomFilter(200, 0.001)
+        bloom_two = BloomFilter(200, 0.001)
+        chars = [chr(i) for i in range_fn(0, 200)]
+        for char in chars[:int(len(chars)/2)]:
+            bloom_one.add(char)
+        for char in chars[int(len(chars)/2):]:
+            bloom_two.add(char)
+        new_bloom = bloom_one.union(bloom_two)
+
+        self.assertTrue(bloom_one.nstar() > len(chars)/2-10 and bloom_one.nstar() < len(chars)/2+10)
+        self.assertTrue(bloom_two.nstar() > len(chars)/2-10 and bloom_two.nstar() < len(chars)/2+10)
+        self.assertTrue(new_bloom.nstar() > len(chars)-10 and new_bloom.nstar() < len(chars)+10)
+
     def test_intersection_capacity_fail(self):
         bloom_one = BloomFilter(1000, 0.001)
         bloom_two = BloomFilter(100, 0.001)