bloom_filter_mod.py 6.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182
  1. '''Bloom Filter: Probabilistic set membership testing for large sets'''
  2. # Shamelessly borrowed (under MIT license) from http://code.activestate.com/recipes/577686-bloom-filter/
  3. # About Bloom Filters: http://en.wikipedia.org/wiki/Bloom_filter
  4. # Tweaked by Daniel Richard Stromberg, mostly to:
  5. # 1) Give it a little nicer __init__ parameters.
  6. # 2) Improve the hash functions to get a much lower rate of false positives
  7. # 3) Make it pass pylint
  8. #mport sys
  9. import math
  10. import array
  11. import random
  12. #mport hashlib
  13. #mport numbers
  14. # In the literature:
  15. # k is the number of probes - we call this num_probes_k
  16. # m is the number of bits in the filter - we call this num_bits_m
  17. # n is the ideal number of elements to eventually be stored in the filter - we call this ideal_num_elements_n
  18. # p is the desired error rate when full - we call this error_rate_p
  19. def get_bitno_seed_rnd(bloom_filter, key):
  20. '''Apply num_probes_k hash functions to key. Generate the array index and bitmask corresponding to each result'''
  21. # We're using key as a seed to a pseudorandom number generator
  22. hasher = random.Random(key).randrange
  23. for dummy in range(bloom_filter.num_probes_k):
  24. bitno = hasher(bloom_filter.num_bits_m)
  25. yield bitno % bloom_filter.num_bits_m
  26. MERSENNES1 = [ 2 ** x - 1 for x in [ 17, 31, 127 ] ]
  27. MERSENNES2 = [ 2 ** x - 1 for x in [ 19, 67, 257 ] ]
  28. def simple_hash(int_list, prime1, prime2, prime3):
  29. '''Compute a hash value from a list of integers and 3 primes'''
  30. result = 0
  31. for integer in int_list:
  32. result += ((result + integer + prime1) * prime2) % prime3
  33. return result
  34. def hash1(int_list):
  35. '''Basic hash function #1'''
  36. return simple_hash(int_list, MERSENNES1[0], MERSENNES1[1], MERSENNES1[2])
  37. def hash2(int_list):
  38. '''Basic hash function #2'''
  39. return simple_hash(int_list, MERSENNES2[0], MERSENNES2[1], MERSENNES2[2])
  40. def get_bitno_lin_comb(bloom_filter, key):
  41. '''Apply num_probes_k hash functions to key. Generate the array index and bitmask corresponding to each result'''
  42. # This one assumes key is either bytes or str (or other list of integers)
  43. # I'd love to check for long too, but that doesn't exist in 3.2, and 2.5 doesn't have the numbers.Integral base type
  44. if isinstance(key, int):
  45. int_list = []
  46. temp = key
  47. while temp:
  48. int_list.append(temp % 256)
  49. temp //= 256
  50. elif isinstance(key[0], int):
  51. int_list = key
  52. elif isinstance(key[0], str):
  53. int_list = [ ord(char) for char in key ]
  54. else:
  55. raise TypeError('Sorry, I do not know how to hash this type')
  56. hash_value1 = hash1(int_list)
  57. hash_value2 = hash2(int_list)
  58. # We're using linear combinations of hash_value1 and hash_value2 to obtain num_probes_k hash functions
  59. for probeno in range(1, bloom_filter.num_probes_k + 1):
  60. bit_index = hash_value1 + probeno * hash_value2
  61. yield bit_index % bloom_filter.num_bits_m
  62. class Bloom_filter:
  63. '''Probabilistic set membership testing for large sets'''
  64. #def __init__(self, ideal_num_elements_n, error_rate_p, probe_offsetter=get_index_bitmask_seed_rnd):
  65. def __init__(self, ideal_num_elements_n, error_rate_p, probe_bitnoer=get_bitno_lin_comb):
  66. if ideal_num_elements_n <= 0:
  67. raise ValueError('ideal_num_elements_n must be > 0')
  68. if not (0 < error_rate_p < 1):
  69. raise ValueError('error_rate_p must be between 0 and 1 inclusive')
  70. self.error_rate_p = error_rate_p
  71. # With fewer elements, we should do very well. With more elements, our error rate "guarantee"
  72. # drops rapidly.
  73. self.ideal_num_elements_n = ideal_num_elements_n
  74. numerator = -1 * self.ideal_num_elements_n * math.log(self.error_rate_p)
  75. denominator = math.log(2) ** 2
  76. #self.num_bits_m = - int((self.ideal_num_elements_n * math.log(self.error_rate_p)) / (math.log(2) ** 2))
  77. real_num_bits_m = numerator / denominator
  78. self.num_bits_m = int(math.ceil(real_num_bits_m))
  79. self.array_ = array.array('L', [0]) * ((self.num_bits_m + 31) // 32)
  80. # AKA num_offsetters
  81. # Verified against http://en.wikipedia.org/wiki/Bloom_filter#Probability_of_false_positives
  82. real_num_probes_k = (self.num_bits_m / self.ideal_num_elements_n) * math.log(2)
  83. self.num_probes_k = int(math.ceil(real_num_probes_k))
  84. # This comes close, but often isn't the same value
  85. # alternative_real_num_probes_k = -math.log(self.error_rate_p) / math.log(2)
  86. #
  87. # if abs(real_num_probes_k - alternative_real_num_probes_k) > 1e-6:
  88. # sys.stderr.write('real_num_probes_k: %f, alternative_real_num_probes_k: %f\n' %
  89. # (real_num_probes_k, alternative_real_num_probes_k)
  90. # )
  91. # sys.exit(1)
  92. self.probe_bitnoer = probe_bitnoer
  93. def __repr__(self):
  94. return 'Bloom_filter(ideal_num_elements_n=%d, error_rate_p=%f, num_bits_m=%d)' % (
  95. self.ideal_num_elements_n,
  96. self.error_rate_p,
  97. self.num_bits_m,
  98. )
  99. def add(self, key):
  100. '''Add an element to the filter'''
  101. for bitno in self.probe_bitnoer(self, key):
  102. index, bit_within_word = divmod(bitno, 32)
  103. mask = 1 << bit_within_word
  104. self.array_[index] |= mask
  105. def __iadd__(self, key):
  106. self.add(key)
  107. return self
  108. def _match_template(self, bloom_filter):
  109. '''Compare a sort of signature for two bloom filters. Used in preparation for binary operations'''
  110. return (self.num_bits_m == bloom_filter.num_bits_m \
  111. and self.num_probes_k == bloom_filter.num_probes_k \
  112. and self.probe_bitnoer == bloom_filter.probe_bitnoer)
  113. def union(self, bloom_filter):
  114. '''Compute the set union of two bloom filters'''
  115. if self._match_template(bloom_filter):
  116. self.array_ = [a | b for a, b in zip(self.array_, bloom_filter.array_)]
  117. else:
  118. # Union b/w two unrelated bloom filter raises this
  119. raise ValueError("Mismatched bloom filters")
  120. def __ior__(self, bloom_filter):
  121. self.union(bloom_filter)
  122. return self
  123. def intersection(self, bloom_filter):
  124. '''Compute the set intersection of two bloom filters'''
  125. if self._match_template(bloom_filter):
  126. self.array_ = [a & b for a, b in zip(self.array_, bloom_filter.array_)]
  127. else:
  128. # Intersection b/w two unrelated bloom filter raises this
  129. raise ValueError("Mismatched bloom filters")
  130. def __iand__(self, bloom_filter):
  131. self.intersection(bloom_filter)
  132. return self
  133. def __contains__(self, key):
  134. for bitno in self.probe_bitnoer(self, key):
  135. wordno, bit_within_word = divmod(bitno, 32)
  136. mask = 1 << bit_within_word
  137. if not (self.array_[wordno] & mask):
  138. return False
  139. return True
  140. #return all(self.array_[i] & mask for i, mask in self.probe_bitnoer(self, key))