bloom_filter_mod.py 2.8 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586
  1. '''Bloom Filter: Probabilistic set membership testing for large sets'''
  2. # Shamelessly borrowed (under MIT license) from http://code.activestate.com/recipes/577686-bloom-filter/
  3. # About Bloom Filters: http://en.wikipedia.org/wiki/Bloom_filter
  4. # Tweaked a bit by Daniel Richard Stromberg, mostly to make it pass pylint
  5. import math
  6. import array
  7. import random
  8. def get_probes(bfilter, key):
  9. '''Generate a bunch of fast hash functions'''
  10. hasher = random.Random(key).randrange
  11. for _ in range(bfilter.num_probes):
  12. array_index = hasher(len(bfilter.array_))
  13. bit_index = hasher(32)
  14. yield array_index, 1 << bit_index
  15. class Bloom_filter:
  16. '''Probabilistic set membership testing for large sets'''
  17. def __init__(self, ideal_num_elements, error_rate, probe_func=get_probes):
  18. if ideal_num_elements <= 0:
  19. raise ValueError('ideal_num_elements must be > 0')
  20. if not (0 < error_rate < 1):
  21. raise ValueError('error_rate must be between 0 and 1 inclusive')
  22. self.error_rate = error_rate
  23. # With fewer elements, we should do very well. With more elements, our error rate "guarantee"
  24. # drops rapidly.
  25. self.ideal_num_elements = ideal_num_elements
  26. self.num_bits = - int((self.ideal_num_elements * math.log(self.error_rate)) / (math.log(2) ** 2))
  27. self.num_words = int((self.num_bits + 31) / 32)
  28. self.array_ = array.array('L', [0]) * self.num_words
  29. self.num_probes = int((self.num_bits / self.ideal_num_elements) * math.log(2))
  30. self.probe_func = probe_func
  31. def __repr__(self):
  32. return 'Bloom_filter(ideal_num_elements=%d, error_rate=%f, num_bits=%d)' % (
  33. self.ideal_num_elements,
  34. self.error_rate,
  35. self.num_bits,
  36. )
  37. def add(self, key):
  38. '''Add an element to the filter'''
  39. for i, mask in self.probe_func(self, key):
  40. self.array_[i] |= mask
  41. def _match_template(self, bfilter):
  42. '''Compare a sort of signature for two bloom filters. Used in preparation for binary operations'''
  43. return (self.num_bits == bfilter.num_bits \
  44. and self.num_probes == bfilter.num_probes \
  45. and self.probe_func == bfilter.probe_func)
  46. def union(self, bfilter):
  47. '''Compute the set union of two bloom filters'''
  48. if self._match_template(bfilter):
  49. self.array_ = [a | b for a, b in zip(self.array_, bfilter.array_)]
  50. else:
  51. # Union b/w two unrelated bloom filter raises this
  52. raise ValueError("Mismatched bloom filters")
  53. def __or__(self, bfilter):
  54. return self.union(bfilter)
  55. def intersection(self, bfilter):
  56. '''Compute the set intersection of two bloom filters'''
  57. if self._match_template(bfilter):
  58. self.array_ = [a & b for a, b in zip(self.array_, bfilter.array_)]
  59. else:
  60. # Intersection b/w two unrelated bloom filter raises this
  61. raise ValueError("Mismatched bloom filters")
  62. def __and__(self, bfilter):
  63. return self.intersection(bfilter)
  64. def __contains__(self, key):
  65. return all(self.array_[i] & mask for i, mask in self.probe_func(self, key))