bloom_filter_mod.py 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357
  1. '''Bloom Filter: Probabilistic set membership testing for large sets'''
  2. # Shamelessly borrowed (under MIT license) from http://code.activestate.com/recipes/577686-bloom-filter/
  3. # About Bloom Filters: http://en.wikipedia.org/wiki/Bloom_filter
  4. # Tweaked by Daniel Richard Stromberg, mostly to:
  5. # 1) Give it a little nicer __init__ parameters.
  6. # 2) Improve the hash functions to get a much lower rate of false positives
  7. # 3) Make it pass pylint
  8. import os
  9. #mport sys
  10. import math
  11. import array
  12. import random
  13. #mport bufsock
  14. #mport hashlib
  15. #mport numbers
  16. import python2x3
  17. # In the literature:
  18. # k is the number of probes - we call this num_probes_k
  19. # m is the number of bits in the filter - we call this num_bits_m
  20. # n is the ideal number of elements to eventually be stored in the filter - we call this ideal_num_elements_n
  21. # p is the desired error rate when full - we call this error_rate_p
  22. def my_range(num_values):
  23. '''Generate numbers from 0..num_values-1'''
  24. value = 0
  25. while value < num_values:
  26. yield value
  27. value += 1
  28. # In the abstract, this is what we want &= and |= to do, but especially for disk-based filters, this is extremely slow
  29. #class Backend_set_operations:
  30. # '''Provide &= and |= for backends'''
  31. # # pylint: disable=W0232
  32. # # W0232: We don't need an __init__ method; we're never instantiated directly
  33. # def __iand__(self, other):
  34. # assert self.num_bits == other.num_bits
  35. #
  36. # for bitno in my_range(num_bits):
  37. # if self.is_set(bitno) and other.is_set(bitno):
  38. # self[bitno].set()
  39. # else:
  40. # self[bitno].clear()
  41. #
  42. # def __ior__(self, other):
  43. # assert self.num_bits == other.num_bits
  44. #
  45. # for bitno in xrange(num_bits):
  46. # if self[bitno] or other[bitno]:
  47. # self[bitno].set()
  48. # else:
  49. # self[bitno].clear()
  50. class File_seek_backend:
  51. '''Backend storage for our "array of bits" using a file in which we seek'''
  52. effs = 2^8 - 1
  53. def __init__(self, num_bits, filename):
  54. self.num_bits = num_bits
  55. self.num_chars = (self.num_bits + 7) // 8
  56. flags = os.O_RDWR | os.O_CREAT
  57. if hasattr(os, 'O_BINARY'):
  58. flags |= getattr(os, 'O_BINARY')
  59. self.file_ = os.open(filename, flags)
  60. os.lseek(self.file_, self.num_chars + 1, os.SEEK_SET)
  61. os.write(self.file_, python2x3.null_byte)
  62. def is_set(self, bitno):
  63. '''Return true iff bit number bitno is set'''
  64. byteno, bit_within_wordno = divmod(bitno, 8)
  65. mask = 1 << bit_within_wordno
  66. os.lseek(self.file_, byteno, os.SEEK_SET)
  67. char = os.read(self.file_, 1)
  68. if isinstance(char, str):
  69. byte = ord(char)
  70. else:
  71. byte = int(char)
  72. return byte & mask
  73. def set(self, bitno):
  74. '''set bit number bitno to true'''
  75. byteno, bit_within_byteno = divmod(bitno, 8)
  76. mask = 1 << bit_within_byteno
  77. os.lseek(self.file_, byteno, os.SEEK_SET)
  78. char = os.read(self.file_, 1)
  79. if isinstance(char, str):
  80. byte = ord(char)
  81. was_char = True
  82. else:
  83. byte = char
  84. was_char = False
  85. byte |= mask
  86. os.lseek(self.file_, byteno, os.SEEK_SET)
  87. if was_char:
  88. os.write(self.file_, chr(byte))
  89. else:
  90. os.write(self.file_, byte)
  91. def clear(self, bitno):
  92. '''clear bit number bitno - set it to false'''
  93. byteno, bit_within_byteno = divmod(bitno, 8)
  94. mask = 1 << bit_within_byteno
  95. os.lseek(self.file_, byteno, os.SEEK_SET)
  96. char = os.read(self.file_, 1)
  97. if isinstance(char, str):
  98. byte = ord(char)
  99. was_char = True
  100. else:
  101. byte = int(char)
  102. was_char = False
  103. byte &= File_seek_backend.effs - mask
  104. os.lseek(self.file_, byteno, os.SEEK_SET)
  105. if was_char:
  106. os.write(chr(byte))
  107. else:
  108. os.write(byte)
  109. # These are quite slow ways to do iand and ior, but they should work, and a faster version is going to take more time
  110. def __iand__(self, other):
  111. assert self.num_bits == other.num_bits
  112. for bitno in my_range(self.num_bits):
  113. if self.is_set(bitno) and other.is_set(bitno):
  114. self.set(bitno)
  115. else:
  116. self.clear(bitno)
  117. return self
  118. def __ior__(self, other):
  119. assert self.num_bits == other.num_bits
  120. for bitno in my_range(self.num_bits):
  121. if self.is_set(bitno) or other.is_set(bitno):
  122. self.set(bitno)
  123. else:
  124. self.clear(bitno)
  125. return self
  126. def close(self):
  127. '''Close the file'''
  128. os.close(self.file_)
  129. class Array_backend:
  130. '''Backend storage for our "array of bits" using a python array of integers'''
  131. effs = 2^32 - 1
  132. def __init__(self, num_bits):
  133. self.num_bits = num_bits
  134. self.num_words = (self.num_bits + 31) // 32
  135. self.array_ = array.array('L', [0]) * self.num_words
  136. def is_set(self, bitno):
  137. '''Return true iff bit number bitno is set'''
  138. wordno, bit_within_wordno = divmod(bitno, 32)
  139. mask = 1 << bit_within_wordno
  140. return self.array_[wordno] & mask
  141. def set(self, bitno):
  142. '''set bit number bitno to true'''
  143. wordno, bit_within_wordno = divmod(bitno, 32)
  144. mask = 1 << bit_within_wordno
  145. self.array_[wordno] |= mask
  146. def clear(self, bitno):
  147. '''clear bit number bitno - set it to false'''
  148. wordno, bit_within_wordno = divmod(bitno, 32)
  149. mask = Array_backend.effs - (1 << bit_within_wordno)
  150. self.array_[wordno] &= mask
  151. # It'd be nice to do __iand__ and __ior__ in a base class, but that'd be Much slower
  152. def __iand__(self, other):
  153. assert self.num_bits == other.num_bits
  154. for wordno in my_range(self.num_words):
  155. self.array_[wordno] &= other.array_[wordno]
  156. return self
  157. def __ior__(self, other):
  158. assert self.num_bits == other.num_bits
  159. for wordno in my_range(self.num_words):
  160. self.array_[wordno] |= other.array_[wordno]
  161. return self
  162. def close(self):
  163. '''Noop for compatibility with the file+seek backend'''
  164. pass
  165. def get_bitno_seed_rnd(bloom_filter, key):
  166. '''Apply num_probes_k hash functions to key. Generate the array index and bitmask corresponding to each result'''
  167. # We're using key as a seed to a pseudorandom number generator
  168. hasher = random.Random(key).randrange
  169. for dummy in range(bloom_filter.num_probes_k):
  170. bitno = hasher(bloom_filter.num_bits_m)
  171. yield bitno % bloom_filter.num_bits_m
  172. MERSENNES1 = [ 2 ** x - 1 for x in [ 17, 31, 127 ] ]
  173. MERSENNES2 = [ 2 ** x - 1 for x in [ 19, 67, 257 ] ]
  174. def simple_hash(int_list, prime1, prime2, prime3):
  175. '''Compute a hash value from a list of integers and 3 primes'''
  176. result = 0
  177. for integer in int_list:
  178. result += ((result + integer + prime1) * prime2) % prime3
  179. return result
  180. def hash1(int_list):
  181. '''Basic hash function #1'''
  182. return simple_hash(int_list, MERSENNES1[0], MERSENNES1[1], MERSENNES1[2])
  183. def hash2(int_list):
  184. '''Basic hash function #2'''
  185. return simple_hash(int_list, MERSENNES2[0], MERSENNES2[1], MERSENNES2[2])
  186. def get_bitno_lin_comb(bloom_filter, key):
  187. '''Apply num_probes_k hash functions to key. Generate the array index and bitmask corresponding to each result'''
  188. # This one assumes key is either bytes or str (or other list of integers)
  189. # I'd love to check for long too, but that doesn't exist in 3.2, and 2.5 doesn't have the numbers.Integral base type
  190. if isinstance(key, int):
  191. int_list = []
  192. temp = key
  193. while temp:
  194. int_list.append(temp % 256)
  195. temp //= 256
  196. elif isinstance(key[0], int):
  197. int_list = key
  198. elif isinstance(key[0], str):
  199. int_list = [ ord(char) for char in key ]
  200. else:
  201. raise TypeError('Sorry, I do not know how to hash this type')
  202. hash_value1 = hash1(int_list)
  203. hash_value2 = hash2(int_list)
  204. # We're using linear combinations of hash_value1 and hash_value2 to obtain num_probes_k hash functions
  205. for probeno in range(1, bloom_filter.num_probes_k + 1):
  206. bit_index = hash_value1 + probeno * hash_value2
  207. yield bit_index % bloom_filter.num_bits_m
  208. class Bloom_filter:
  209. '''Probabilistic set membership testing for large sets'''
  210. #def __init__(self, ideal_num_elements_n, error_rate_p, probe_offsetter=get_index_bitmask_seed_rnd):
  211. def __init__(self, ideal_num_elements_n, error_rate_p, probe_bitnoer=get_bitno_lin_comb, filename=None):
  212. if ideal_num_elements_n <= 0:
  213. raise ValueError('ideal_num_elements_n must be > 0')
  214. if not (0 < error_rate_p < 1):
  215. raise ValueError('error_rate_p must be between 0 and 1 inclusive')
  216. self.error_rate_p = error_rate_p
  217. # With fewer elements, we should do very well. With more elements, our error rate "guarantee"
  218. # drops rapidly.
  219. self.ideal_num_elements_n = ideal_num_elements_n
  220. numerator = -1 * self.ideal_num_elements_n * math.log(self.error_rate_p)
  221. denominator = math.log(2) ** 2
  222. #self.num_bits_m = - int((self.ideal_num_elements_n * math.log(self.error_rate_p)) / (math.log(2) ** 2))
  223. real_num_bits_m = numerator / denominator
  224. self.num_bits_m = int(math.ceil(real_num_bits_m))
  225. if filename is None:
  226. self.backend = Array_backend(self.num_bits_m)
  227. else:
  228. self.backend = File_seek_backend(self.num_bits_m, filename)
  229. #array.array('L', [0]) * ((self.num_bits_m + 31) // 32)
  230. # AKA num_offsetters
  231. # Verified against http://en.wikipedia.org/wiki/Bloom_filter#Probability_of_false_positives
  232. real_num_probes_k = (self.num_bits_m / self.ideal_num_elements_n) * math.log(2)
  233. self.num_probes_k = int(math.ceil(real_num_probes_k))
  234. # This comes close, but often isn't the same value
  235. # alternative_real_num_probes_k = -math.log(self.error_rate_p) / math.log(2)
  236. #
  237. # if abs(real_num_probes_k - alternative_real_num_probes_k) > 1e-6:
  238. # sys.stderr.write('real_num_probes_k: %f, alternative_real_num_probes_k: %f\n' %
  239. # (real_num_probes_k, alternative_real_num_probes_k)
  240. # )
  241. # sys.exit(1)
  242. self.probe_bitnoer = probe_bitnoer
  243. def __repr__(self):
  244. return 'Bloom_filter(ideal_num_elements_n=%d, error_rate_p=%f, num_bits_m=%d)' % (
  245. self.ideal_num_elements_n,
  246. self.error_rate_p,
  247. self.num_bits_m,
  248. )
  249. def add(self, key):
  250. '''Add an element to the filter'''
  251. for bitno in self.probe_bitnoer(self, key):
  252. self.backend.set(bitno)
  253. def __iadd__(self, key):
  254. self.add(key)
  255. return self
  256. def _match_template(self, bloom_filter):
  257. '''Compare a sort of signature for two bloom filters. Used in preparation for binary operations'''
  258. return (self.num_bits_m == bloom_filter.num_bits_m \
  259. and self.num_probes_k == bloom_filter.num_probes_k \
  260. and self.probe_bitnoer == bloom_filter.probe_bitnoer)
  261. def union(self, bloom_filter):
  262. '''Compute the set union of two bloom filters'''
  263. self.backend |= bloom_filter.backend
  264. def __ior__(self, bloom_filter):
  265. self.union(bloom_filter)
  266. return self
  267. def intersection(self, bloom_filter):
  268. '''Compute the set intersection of two bloom filters'''
  269. self.backend &= bloom_filter.backend
  270. def __iand__(self, bloom_filter):
  271. self.intersection(bloom_filter)
  272. return self
  273. def __contains__(self, key):
  274. for bitno in self.probe_bitnoer(self, key):
  275. #wordno, bit_within_word = divmod(bitno, 32)
  276. #mask = 1 << bit_within_word
  277. #if not (self.array_[wordno] & mask):
  278. if not self.backend.is_set(bitno):
  279. return False
  280. return True
  281. #return all(self.array_[i] & mask for i, mask in self.probe_bitnoer(self, key))