bloom_filter_mod.py 4.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110
  1. '''Bloom Filter: Probabilistic set membership testing for large sets'''
  2. # Shamelessly borrowed (under MIT license) from http://code.activestate.com/recipes/577686-bloom-filter/
  3. # About Bloom Filters: http://en.wikipedia.org/wiki/Bloom_filter
  4. # Tweaked a bit by Daniel Richard Stromberg, mostly to make it pass pylint and give it a little nicer
  5. # __init__ parameters.
  6. import math
  7. import array
  8. import random
  9. # In the literature:
  10. # k is the number of probes - we call this num_probes_k
  11. # m is the number of bits in the filter - we call this num_bits_m
  12. # n is the ideal number of elements to eventually be stored in the filter - we call this ideal_num_elements_n
  13. # p is the desired error rate when full - we call this error_rate_p
  14. def get_probe_index_and_bitmask(bloom_filter, key):
  15. '''Apply num_probes_k hash functions to key. Generate the array index and bitmask corresponding to each result'''
  16. # We're using key as a seed to a pseudorandom number generator
  17. hasher = random.Random(key).randrange
  18. for _ in range(bloom_filter.num_probes_k):
  19. # We could precompute this length for speed. But we don't
  20. array_index = hasher(bloom_filter.num_words)
  21. bit_index = hasher(32)
  22. yield array_index, 1 << bit_index
  23. class Bloom_filter:
  24. '''Probabilistic set membership testing for large sets'''
  25. def __init__(self, ideal_num_elements_n, error_rate_p, probe_offsetter=get_probe_index_and_bitmask):
  26. if ideal_num_elements_n <= 0:
  27. raise ValueError('ideal_num_elements_n must be > 0')
  28. if not (0 < error_rate_p < 1):
  29. raise ValueError('error_rate_p must be between 0 and 1 inclusive')
  30. self.error_rate_p = error_rate_p
  31. # With fewer elements, we should do very well. With more elements, our error rate "guarantee"
  32. # drops rapidly.
  33. self.ideal_num_elements_n = ideal_num_elements_n
  34. numerator = -1 * self.ideal_num_elements_n * math.log(self.error_rate_p)
  35. denominator = math.log(2) ** 2
  36. #self.num_bits_m = - int((self.ideal_num_elements_n * math.log(self.error_rate_p)) / (math.log(2) ** 2))
  37. real_num_bits_m = numerator / denominator
  38. self.num_bits_m = int(math.ceil(real_num_bits_m))
  39. self.num_words = int((self.num_bits_m + 31) / 32)
  40. self.array_ = array.array('L', [0]) * self.num_words
  41. # AKA num_offsetters
  42. # Verified against http://en.wikipedia.org/wiki/Bloom_filter#Probability_of_false_positives
  43. real_num_probes_k = (self.num_bits_m / self.ideal_num_elements_n) * math.log(2)
  44. self.num_probes_k = int(math.ceil(real_num_probes_k))
  45. self.probe_offsetter = probe_offsetter
  46. def __repr__(self):
  47. return 'Bloom_filter(ideal_num_elements_n=%d, error_rate_p=%f, num_bits_m=%d)' % (
  48. self.ideal_num_elements_n,
  49. self.error_rate_p,
  50. self.num_bits_m,
  51. )
  52. def add(self, key):
  53. '''Add an element to the filter'''
  54. for index, mask in self.probe_offsetter(self, key):
  55. self.array_[index] |= mask
  56. def __iadd__(self, key):
  57. self.add(key)
  58. return self
  59. def _match_template(self, bloom_filter):
  60. '''Compare a sort of signature for two bloom filters. Used in preparation for binary operations'''
  61. return (self.num_bits_m == bloom_filter.num_bits_m \
  62. and self.num_probes_k == bloom_filter.num_probes_k \
  63. and self.probe_offsetter == bloom_filter.probe_offsetter)
  64. def union(self, bloom_filter):
  65. '''Compute the set union of two bloom filters'''
  66. if self._match_template(bloom_filter):
  67. self.array_ = [a | b for a, b in zip(self.array_, bloom_filter.array_)]
  68. else:
  69. # Union b/w two unrelated bloom filter raises this
  70. raise ValueError("Mismatched bloom filters")
  71. def __ior__(self, bloom_filter):
  72. self.union(bloom_filter)
  73. return self
  74. def intersection(self, bloom_filter):
  75. '''Compute the set intersection of two bloom filters'''
  76. if self._match_template(bloom_filter):
  77. self.array_ = [a & b for a, b in zip(self.array_, bloom_filter.array_)]
  78. else:
  79. # Intersection b/w two unrelated bloom filter raises this
  80. raise ValueError("Mismatched bloom filters")
  81. def __iand__(self, bloom_filter):
  82. self.intersection(bloom_filter)
  83. return self
  84. def __contains__(self, key):
  85. return all(self.array_[i] & mask for i, mask in self.probe_offsetter(self, key))