bloom_filter_mod.py 2.1 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465
  1. '''Bloom Filter: Probabilistic set membership testing for large sets'''
  2. # Shamelessly borrowed (under MIT license) from http://code.activestate.com/recipes/577686-bloom-filter/
  3. # About Bloom Filters: http://en.wikipedia.org/wiki/Bloom_filter
  4. # Tweaked a bit by Daniel Richard Stromberg, mostly to make it pass pylint
  5. import array
  6. import random
  7. def get_probes(bfilter, key):
  8. '''Generate a bunch of fast hash functions'''
  9. hasher = random.Random(key).randrange
  10. for _ in range(bfilter.num_probes):
  11. array_index = hasher(len(bfilter.arr))
  12. bit_index = hasher(32)
  13. yield array_index, 1 << bit_index
  14. class Bloom_filter:
  15. '''Probabilistic set membership testing for large sets'''
  16. def __init__(self, num_bits, num_probes, probe_func=get_probes):
  17. self.num_bits = num_bits
  18. num_words = (num_bits + 31) // 32
  19. self.arr = array.array('L', [0]) * num_words
  20. self.num_probes = num_probes
  21. self.probe_func = probe_func
  22. def add(self, key):
  23. '''Add an element to the filter'''
  24. for i, mask in self.probe_func(self, key):
  25. self.arr[i] |= mask
  26. def _match_template(self, bfilter):
  27. '''Compare a sort of signature for two bloom filters. Used in preparation for binary operations'''
  28. return (self.num_bits == bfilter.num_bits \
  29. and self.num_probes == bfilter.num_probes \
  30. and self.probe_func == bfilter.probe_func)
  31. def union(self, bfilter):
  32. '''Compute the set union of two bloom filters'''
  33. if self._match_template(bfilter):
  34. self.arr = [a | b for a, b in zip(self.arr, bfilter.arr)]
  35. else:
  36. # Union b/w two unrelated bloom filter raises this
  37. raise ValueError("Mismatched bloom filters")
  38. def __or__(self, bfilter):
  39. return self.union(bfilter)
  40. def intersection(self, bfilter):
  41. '''Compute the set intersection of two bloom filters'''
  42. if self._match_template(bfilter):
  43. self.arr = [a & b for a, b in zip(self.arr, bfilter.arr)]
  44. else:
  45. # Intersection b/w two unrelated bloom filter raises this
  46. raise ValueError("Mismatched bloom filters")
  47. def __and__(self, bfilter):
  48. return self.intersection(bfilter)
  49. def __contains__(self, key):
  50. return all(self.arr[i] & mask for i, mask in self.probe_func(self, key))