Skip to content
This repository has been archived by the owner on Jul 30, 2024. It is now read-only.

Added nstar and nstar_intersection functions as well as tests. #18

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,16 @@ functions.
>>> (1.0 - (len(f) / float(f.capacity))) <= f.error_rate + 2e-18
True

# f.nstar() returns an estimate of the number of elements stored in
# the Bloom filter f.

>>> print f.nstar()

# f1.nstar_intersection(f2) returns an estimate of the number of elements in
# the intersection between the filters f1 and f2.

>>> print f1.nstar_intersection(f2)

>>> from pybloom import ScalableBloomFilter
>>> sbf = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH)
>>> count = 10000
Expand All @@ -50,3 +60,4 @@ functions.
# len(sbf) may not equal the entire input length. 0.01% error is well
# below the default 0.1% error threshold. As the capacity goes up, the
# error will approach 0.1%.

9 changes: 9 additions & 0 deletions pybloom/pybloom.py
Original file line number Diff line number Diff line change
Expand Up @@ -171,6 +171,15 @@ def __len__(self):
"""Return the number of keys stored by this bloom filter."""
return self.count

def nstar(self):
    """Estimate the number of distinct keys stored in this filter.

    Uses the set-bit estimator described at
    http://en.wikipedia.org/wiki/Bloom_filter (section "The union and
    intersection of sets")::

        n* = -(m / k) * ln(1 - X / m)

    where ``m`` is ``self.num_bits``, ``k`` is ``self.num_slices`` and
    ``X`` is the number of bits currently set in ``self.bitarray``.

    Returns:
        The estimate as a float. When every bit is set the estimator
        diverges, so ``float("inf")`` is returned instead of letting
        ``math.log(0)`` raise a "math domain error".
    """
    set_bits = self.bitarray.count(True)
    if set_bits >= self.num_bits:
        # Saturated filter: 1 - X/m == 0 and the logarithm is undefined.
        return float("inf")
    # float() keeps this a true division under Python 2 as well.
    fill_ratio = float(set_bits) / float(self.num_bits)
    return -self.num_bits * math.log(1 - fill_ratio) / self.num_slices

def nstar_intersection(self, bloom):
    """Estimate how many keys this filter and ``bloom`` have in common.

    Applies inclusion-exclusion to the ``nstar()`` estimates, as
    described at http://en.wikipedia.org/wiki/Bloom_filter (section
    "The union and intersection of sets")::

        n*(A & B) = n*(A) + n*(B) - n*(A | B)

    Args:
        bloom: The other filter. It is passed to ``self.union()``, so
            whatever compatibility requirements ``union()`` imposes
            apply here as well.

    Returns:
        The estimated intersection size as a float, never below 0.0.
        The three terms are independent estimates, so their combination
        can dip slightly below zero; a negative cardinality is
        meaningless and is clamped.
    """
    union_estimate = self.union(bloom).nstar()
    estimate = self.nstar() + bloom.nstar() - union_estimate
    return max(estimate, 0.0)

def add(self, key, skip_check=False):
""" Adds a key to this bloom filter. If the key already exists in this
filter it will return True. Otherwise False.
Expand Down
54 changes: 54 additions & 0 deletions pybloom/tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ def additional_tests():
return suite

class TestUnionIntersection(unittest.TestCase):

def test_union(self):
bloom_one = BloomFilter(100, 0.001)
bloom_two = BloomFilter(100, 0.001)
Expand All @@ -49,6 +50,59 @@ def test_intersection(self):
for char in chars[int(len(chars)/2):]:
self.assertTrue(char not in new_bloom)

def test_nstar(self):
    """nstar() should land within 10 of the number of keys added."""
    keys = [chr(code) for code in range_fn(0, 200)]
    bloom = BloomFilter(1000, 0.001)
    for key in keys:
        bloom.add(key)

    expected = len(keys)
    estimate = bloom.nstar()
    self.assertTrue(expected - 10 < estimate < expected + 10)

def test_nstar_intersection_1(self):
    """Estimates when one filter's keys are a subset of the other's.

    bloom_one receives all 200 keys and bloom_two only the first half,
    so the true intersection is that first half. Every estimate must be
    within 10 of the true count, including the one produced by
    nstar_intersection(), which this test previously never exercised.
    """
    bloom_one = BloomFilter(200, 0.001)
    bloom_two = BloomFilter(200, 0.001)
    chars = [chr(i) for i in range_fn(0, 200)]
    half = len(chars) // 2
    for char in chars:
        bloom_one.add(char)
    for char in chars[:half]:
        bloom_two.add(char)
    new_bloom = bloom_one.intersection(bloom_two)

    # Evaluate each estimate once instead of once per comparison.
    estimate_one = bloom_one.nstar()
    estimate_two = bloom_two.nstar()
    estimate_new = new_bloom.nstar()
    estimate_common = bloom_one.nstar_intersection(bloom_two)

    self.assertTrue(len(chars) - 10 < estimate_one < len(chars) + 10)
    self.assertTrue(half - 10 < estimate_two < half + 10)
    self.assertTrue(half - 10 < estimate_new < half + 10)
    self.assertTrue(half - 10 < estimate_common < half + 10)

def test_nstar_intersection_2(self):
    """Estimates when the two filters hold disjoint halves of the keys."""
    chars = [chr(i) for i in range_fn(0, 200)]
    midpoint = int(len(chars) / 2)
    bloom_one = BloomFilter(200, 0.001)
    bloom_two = BloomFilter(200, 0.001)
    for char in chars[midpoint:]:
        bloom_one.add(char)
    for char in chars[:midpoint]:
        bloom_two.add(char)
    new_bloom = bloom_one.intersection(bloom_two)

    expected = len(chars) / 2
    estimate_one = bloom_one.nstar()
    estimate_two = bloom_two.nstar()
    self.assertTrue(expected - 10 < estimate_one < expected + 10)
    self.assertTrue(expected - 10 < estimate_two < expected + 10)

    # Calling nstar() directly on the filter returned by intersection()
    # is expected to overestimate the (empty) true intersection...
    self.assertTrue(new_bloom.nstar() > 10)

    # ...whereas nstar_intersection() should stay close to zero.
    self.assertTrue(bloom_one.nstar_intersection(bloom_two) < 10)

def test_nstar_union(self):
    """nstar() on a union of disjoint halves should estimate the full key count."""
    chars = [chr(i) for i in range_fn(0, 200)]
    midpoint = int(len(chars) / 2)
    bloom_one = BloomFilter(200, 0.001)
    bloom_two = BloomFilter(200, 0.001)
    for char in chars[:midpoint]:
        bloom_one.add(char)
    for char in chars[midpoint:]:
        bloom_two.add(char)
    new_bloom = bloom_one.union(bloom_two)

    total = len(chars)
    expected_half = len(chars) / 2
    estimate_one = bloom_one.nstar()
    estimate_two = bloom_two.nstar()
    estimate_union = new_bloom.nstar()
    self.assertTrue(expected_half - 10 < estimate_one < expected_half + 10)
    self.assertTrue(expected_half - 10 < estimate_two < expected_half + 10)
    self.assertTrue(total - 10 < estimate_union < total + 10)

def test_intersection_capacity_fail(self):
bloom_one = BloomFilter(1000, 0.001)
bloom_two = BloomFilter(100, 0.001)
Expand Down