bloom_filter_mod.py 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598
  1. '''Bloom Filter: Probabilistic set membership testing for large sets'''
  2. # Shamelessly borrowed (under MIT license) from http://code.activestate.com/recipes/577686-bloom-filter/
  3. # About Bloom Filters: http://en.wikipedia.org/wiki/Bloom_filter
  4. # Tweaked by Daniel Richard Stromberg, mostly to:
  5. # 1) Give it a little nicer __init__ parameters.
  6. # 2) Improve the hash functions to get a much lower rate of false positives
  7. # 3) Make it pass pylint
  8. import os
  9. #mport sys
  10. import math
  11. import array
  12. import random
  13. #mport bufsock
  14. #mport hashlib
  15. #mport numbers
  16. import python2x3
  17. try:
  18. import mmap as mmap_mod
  19. except ImportError:
  20. # Jython lacks mmap()
  21. HAVE_MMAP = False
  22. else:
  23. HAVE_MMAP = True
  24. # In the literature:
  25. # k is the number of probes - we call this num_probes_k
  26. # m is the number of bits in the filter - we call this num_bits_m
  27. # n is the ideal number of elements to eventually be stored in the filter - we call this ideal_num_elements_n
  28. # p is the desired error rate when full - we call this error_rate_p
  29. def my_range(num_values):
  30. '''Generate numbers from 0..num_values-1'''
  31. value = 0
  32. while value < num_values:
  33. yield value
  34. value += 1
  35. # In the abstract, this is what we want &= and |= to do, but especially for disk-based filters, this is extremely slow
  36. #class Backend_set_operations:
  37. # '''Provide &= and |= for backends'''
  38. # # pylint: disable=W0232
  39. # # W0232: We don't need an __init__ method; we're never instantiated directly
  40. # def __iand__(self, other):
  41. # assert self.num_bits == other.num_bits
  42. #
  43. # for bitno in my_range(num_bits):
  44. # if self.is_set(bitno) and other.is_set(bitno):
  45. # self[bitno].set()
  46. # else:
  47. # self[bitno].clear()
  48. #
  49. # def __ior__(self, other):
  50. # assert self.num_bits == other.num_bits
  51. #
  52. # for bitno in xrange(num_bits):
  53. # if self[bitno] or other[bitno]:
  54. # self[bitno].set()
  55. # else:
  56. # self[bitno].clear()
  57. if HAVE_MMAP:
  58. class Mmap_backend:
  59. '''
  60. Backend storage for our "array of bits" using an mmap'd file.
  61. Please note that this has only been tested on Linux so far: 2011-11-01.
  62. '''
  63. effs = 2 ^ 8 - 1
  64. def __init__(self, num_bits, filename):
  65. self.num_bits = num_bits
  66. self.num_chars = (self.num_bits + 7) // 8
  67. flags = os.O_RDWR | os.O_CREAT
  68. if hasattr(os, 'O_BINARY'):
  69. flags |= getattr(os, 'O_BINARY')
  70. self.file_ = os.open(filename, flags)
  71. os.lseek(self.file_, self.num_chars + 1, os.SEEK_SET)
  72. os.write(self.file_, python2x3.null_byte)
  73. self.mmap = mmap_mod.mmap(self.file_, self.num_chars)
  74. def is_set(self, bitno):
  75. '''Return true iff bit number bitno is set'''
  76. byteno, bit_within_wordno = divmod(bitno, 8)
  77. mask = 1 << bit_within_wordno
  78. char = self.mmap[byteno]
  79. if isinstance(char, str):
  80. byte = ord(char)
  81. else:
  82. byte = int(char)
  83. return byte & mask
  84. def set(self, bitno):
  85. '''set bit number bitno to true'''
  86. byteno, bit_within_byteno = divmod(bitno, 8)
  87. mask = 1 << bit_within_byteno
  88. char = self.mmap[byteno]
  89. byte = ord(char)
  90. byte |= mask
  91. self.mmap[byteno] = chr(byte)
  92. def clear(self, bitno):
  93. '''clear bit number bitno - set it to false'''
  94. byteno, bit_within_byteno = divmod(bitno, 8)
  95. mask = 1 << bit_within_byteno
  96. char = self.mmap[byteno]
  97. byte = ord(char)
  98. byte &= Mmap_backend.effs - mask
  99. self.mmap[byteno] = chr(byte)
  100. def __iand__(self, other):
  101. assert self.num_bits == other.num_bits
  102. for byteno in my_range(self.num_chars):
  103. self.mmap[byteno] = chr(ord(self.mmap[byteno]) & ord(other.mmap[byteno]))
  104. return self
  105. def __ior__(self, other):
  106. assert self.num_bits == other.num_bits
  107. for byteno in my_range(self.num_chars):
  108. self.mmap[byteno] = chr(ord(self.mmap[byteno]) | ord(other.mmap[byteno]))
  109. return self
  110. def close(self):
  111. '''Close the file'''
  112. os.close(self.file_)
  113. class File_seek_backend:
  114. '''Backend storage for our "array of bits" using a file in which we seek'''
  115. effs = 2 ^ 8 - 1
  116. def __init__(self, num_bits, filename):
  117. self.num_bits = num_bits
  118. self.num_chars = (self.num_bits + 7) // 8
  119. flags = os.O_RDWR | os.O_CREAT
  120. if hasattr(os, 'O_BINARY'):
  121. flags |= getattr(os, 'O_BINARY')
  122. self.file_ = os.open(filename, flags)
  123. os.lseek(self.file_, self.num_chars + 1, os.SEEK_SET)
  124. os.write(self.file_, python2x3.null_byte)
  125. def is_set(self, bitno):
  126. '''Return true iff bit number bitno is set'''
  127. byteno, bit_within_wordno = divmod(bitno, 8)
  128. mask = 1 << bit_within_wordno
  129. os.lseek(self.file_, byteno, os.SEEK_SET)
  130. char = os.read(self.file_, 1)
  131. if isinstance(char, str):
  132. byte = ord(char)
  133. else:
  134. byte = char[0]
  135. return byte & mask
  136. def set(self, bitno):
  137. '''set bit number bitno to true'''
  138. byteno, bit_within_byteno = divmod(bitno, 8)
  139. mask = 1 << bit_within_byteno
  140. os.lseek(self.file_, byteno, os.SEEK_SET)
  141. char = os.read(self.file_, 1)
  142. if isinstance(char, str):
  143. byte = ord(char)
  144. was_char = True
  145. else:
  146. byte = char[0]
  147. was_char = False
  148. byte |= mask
  149. os.lseek(self.file_, byteno, os.SEEK_SET)
  150. if was_char:
  151. os.write(self.file_, chr(byte))
  152. else:
  153. char = python2x3.intlist_to_binary([ byte ])
  154. os.write(self.file_, char)
  155. def clear(self, bitno):
  156. '''clear bit number bitno - set it to false'''
  157. byteno, bit_within_byteno = divmod(bitno, 8)
  158. mask = 1 << bit_within_byteno
  159. os.lseek(self.file_, byteno, os.SEEK_SET)
  160. char = os.read(self.file_, 1)
  161. if isinstance(char, str):
  162. byte = ord(char)
  163. was_char = True
  164. else:
  165. byte = int(char)
  166. was_char = False
  167. byte &= File_seek_backend.effs - mask
  168. os.lseek(self.file_, byteno, os.SEEK_SET)
  169. if was_char:
  170. os.write(chr(byte))
  171. else:
  172. char = python2x3.intlist_to_binary([ byte ])
  173. os.write(char)
  174. # These are quite slow ways to do iand and ior, but they should work, and a faster version is going to take more time
  175. def __iand__(self, other):
  176. assert self.num_bits == other.num_bits
  177. for bitno in my_range(self.num_bits):
  178. if self.is_set(bitno) and other.is_set(bitno):
  179. self.set(bitno)
  180. else:
  181. self.clear(bitno)
  182. return self
  183. def __ior__(self, other):
  184. assert self.num_bits == other.num_bits
  185. for bitno in my_range(self.num_bits):
  186. if self.is_set(bitno) or other.is_set(bitno):
  187. self.set(bitno)
  188. else:
  189. self.clear(bitno)
  190. return self
  191. def close(self):
  192. '''Close the file'''
  193. os.close(self.file_)
  194. class Array_then_file_seek_backend:
  195. # pylint: disable=R0902
  196. # R0902: We kinda need a bunch of instance attributes
  197. '''
  198. Backend storage for our "array of bits" using a python array of integers up to some maximum number of bytes, then spilling over to a file.
  199. This is -not- a cache; we instead save the leftmost bits in RAM, and the rightmost bits (if necessary) in a file.
  200. On open, we read from the file to RAM. On close, we write from RAM to the file.
  201. '''
  202. effs = 2 ^ 8 - 1
  203. def __init__(self, num_bits, filename, max_bytes_in_memory):
  204. self.num_bits = num_bits
  205. num_chars = (self.num_bits + 7) // 8
  206. self.filename = filename
  207. self.max_bytes_in_memory = max_bytes_in_memory
  208. self.bits_in_memory = min(num_bits, self.max_bytes_in_memory * 8)
  209. self.bits_in_file = max(self.num_bits - self.bits_in_memory, 0)
  210. self.bytes_in_memory = (self.bits_in_memory + 7) // 8
  211. self.bytes_in_file = (self.bits_in_file + 7) // 8
  212. self.array_ = array.array('B', [0]) * self.bytes_in_memory
  213. flags = os.O_RDWR | os.O_CREAT
  214. if hasattr(os, 'O_BINARY'):
  215. flags |= getattr(os, 'O_BINARY')
  216. self.file_ = os.open(filename, flags)
  217. os.lseek(self.file_, num_chars + 1, os.SEEK_SET)
  218. os.write(self.file_, python2x3.null_byte)
  219. os.lseek(self.file_, 0, os.SEEK_SET)
  220. offset = 0
  221. intended_block_len = 2 ** 17
  222. while True:
  223. if offset + intended_block_len < self.bytes_in_memory:
  224. block = os.read(self.file_, intended_block_len)
  225. elif offset < self.bytes_in_memory:
  226. block = os.read(self.file_, self.bytes_in_memory - offset)
  227. else:
  228. break
  229. for index_in_block, character in enumerate(block):
  230. self.array_[offset + index_in_block] = ord(character)
  231. offset += intended_block_len
  232. def is_set(self, bitno):
  233. '''Return true iff bit number bitno is set'''
  234. byteno, bit_within_byteno = divmod(bitno, 8)
  235. mask = 1 << bit_within_byteno
  236. if byteno < self.bytes_in_memory:
  237. return self.array_[byteno] & mask
  238. else:
  239. os.lseek(self.file_, byteno, os.SEEK_SET)
  240. char = os.read(self.file_, 1)
  241. if isinstance(char, str):
  242. byte = ord(char)
  243. else:
  244. byte = int(char)
  245. return byte & mask
  246. def set(self, bitno):
  247. '''set bit number bitno to true'''
  248. byteno, bit_within_byteno = divmod(bitno, 8)
  249. mask = 1 << bit_within_byteno
  250. if byteno < self.bytes_in_memory:
  251. self.array_[byteno] |= mask
  252. else:
  253. os.lseek(self.file_, byteno, os.SEEK_SET)
  254. char = os.read(self.file_, 1)
  255. if isinstance(char, str):
  256. byte = ord(char)
  257. was_char = True
  258. else:
  259. byte = char
  260. was_char = False
  261. byte |= mask
  262. os.lseek(self.file_, byteno, os.SEEK_SET)
  263. if was_char:
  264. os.write(self.file_, chr(byte))
  265. else:
  266. os.write(self.file_, byte)
  267. def clear(self, bitno):
  268. '''clear bit number bitno - set it to false'''
  269. byteno, bit_within_byteno = divmod(bitno, 8)
  270. mask = Array_backend.effs - (1 << bit_within_byteno)
  271. if byteno < self.bytes_in_memory:
  272. self.array_[byteno] &= mask
  273. else:
  274. os.lseek(self.file_, byteno, os.SEEK_SET)
  275. char = os.read(self.file_, 1)
  276. if isinstance(char, str):
  277. byte = ord(char)
  278. was_char = True
  279. else:
  280. byte = int(char)
  281. was_char = False
  282. byte &= File_seek_backend.effs - mask
  283. os.lseek(self.file_, byteno, os.SEEK_SET)
  284. if was_char:
  285. os.write(chr(byte))
  286. else:
  287. os.write(byte)
  288. # These are quite slow ways to do iand and ior, but they should work, and a faster version is going to take more time
  289. def __iand__(self, other):
  290. assert self.num_bits == other.num_bits
  291. for bitno in my_range(self.num_bits):
  292. if self.is_set(bitno) and other.is_set(bitno):
  293. self.set(bitno)
  294. else:
  295. self.clear(bitno)
  296. return self
  297. def __ior__(self, other):
  298. assert self.num_bits == other.num_bits
  299. for bitno in my_range(self.num_bits):
  300. if self.is_set(bitno) or other.is_set(bitno):
  301. self.set(bitno)
  302. else:
  303. self.clear(bitno)
  304. return self
  305. def close(self):
  306. '''Write the in-memory portion to disk, leave the already-on-disk portion unchanged'''
  307. os.lseek(self.file_, 0, os.SEEK_SET)
  308. for index in my_range(self.bytes_in_memory):
  309. self.file_.write(self.array_[index])
  310. os.close(self.file_)
  311. class Array_backend:
  312. '''Backend storage for our "array of bits" using a python array of integers'''
  313. effs = 2 ^ 32 - 1
  314. def __init__(self, num_bits):
  315. self.num_bits = num_bits
  316. self.num_words = (self.num_bits + 31) // 32
  317. self.array_ = array.array('L', [0]) * self.num_words
  318. def is_set(self, bitno):
  319. '''Return true iff bit number bitno is set'''
  320. wordno, bit_within_wordno = divmod(bitno, 32)
  321. mask = 1 << bit_within_wordno
  322. return self.array_[wordno] & mask
  323. def set(self, bitno):
  324. '''set bit number bitno to true'''
  325. wordno, bit_within_wordno = divmod(bitno, 32)
  326. mask = 1 << bit_within_wordno
  327. self.array_[wordno] |= mask
  328. def clear(self, bitno):
  329. '''clear bit number bitno - set it to false'''
  330. wordno, bit_within_wordno = divmod(bitno, 32)
  331. mask = Array_backend.effs - (1 << bit_within_wordno)
  332. self.array_[wordno] &= mask
  333. # It'd be nice to do __iand__ and __ior__ in a base class, but that'd be Much slower
  334. def __iand__(self, other):
  335. assert self.num_bits == other.num_bits
  336. for wordno in my_range(self.num_words):
  337. self.array_[wordno] &= other.array_[wordno]
  338. return self
  339. def __ior__(self, other):
  340. assert self.num_bits == other.num_bits
  341. for wordno in my_range(self.num_words):
  342. self.array_[wordno] |= other.array_[wordno]
  343. return self
  344. def close(self):
  345. '''Noop for compatibility with the file+seek backend'''
  346. pass
  347. def get_bitno_seed_rnd(bloom_filter, key):
  348. '''Apply num_probes_k hash functions to key. Generate the array index and bitmask corresponding to each result'''
  349. # We're using key as a seed to a pseudorandom number generator
  350. hasher = random.Random(key).randrange
  351. for dummy in range(bloom_filter.num_probes_k):
  352. bitno = hasher(bloom_filter.num_bits_m)
  353. yield bitno % bloom_filter.num_bits_m
  354. MERSENNES1 = [ 2 ** x - 1 for x in [ 17, 31, 127 ] ]
  355. MERSENNES2 = [ 2 ** x - 1 for x in [ 19, 67, 257 ] ]
  356. def simple_hash(int_list, prime1, prime2, prime3):
  357. '''Compute a hash value from a list of integers and 3 primes'''
  358. result = 0
  359. for integer in int_list:
  360. result += ((result + integer + prime1) * prime2) % prime3
  361. return result
  362. def hash1(int_list):
  363. '''Basic hash function #1'''
  364. return simple_hash(int_list, MERSENNES1[0], MERSENNES1[1], MERSENNES1[2])
  365. def hash2(int_list):
  366. '''Basic hash function #2'''
  367. return simple_hash(int_list, MERSENNES2[0], MERSENNES2[1], MERSENNES2[2])
  368. def get_bitno_lin_comb(bloom_filter, key):
  369. '''Apply num_probes_k hash functions to key. Generate the array index and bitmask corresponding to each result'''
  370. # This one assumes key is either bytes or str (or other list of integers)
  371. # I'd love to check for long too, but that doesn't exist in 3.2, and 2.5 doesn't have the numbers.Integral base type
  372. if hasattr(key, '__divmod__'):
  373. int_list = []
  374. temp = key
  375. while temp:
  376. quotient, remainder = divmod(temp, 256)
  377. int_list.append(remainder)
  378. temp = quotient
  379. elif hasattr(key[0], '__divmod__'):
  380. int_list = key
  381. elif isinstance(key[0], str):
  382. int_list = [ ord(char) for char in key ]
  383. else:
  384. raise TypeError('Sorry, I do not know how to hash this type')
  385. hash_value1 = hash1(int_list)
  386. hash_value2 = hash2(int_list)
  387. # We're using linear combinations of hash_value1 and hash_value2 to obtain num_probes_k hash functions
  388. for probeno in range(1, bloom_filter.num_probes_k + 1):
  389. bit_index = hash_value1 + probeno * hash_value2
  390. yield bit_index % bloom_filter.num_bits_m
  391. def try_unlink(filename):
  392. '''unlink a file. Don't complain if it's not there'''
  393. try:
  394. os.unlink(filename)
  395. except OSError:
  396. pass
  397. return
  398. class Bloom_filter:
  399. '''Probabilistic set membership testing for large sets'''
  400. #def __init__(self, ideal_num_elements_n, error_rate_p, probe_offsetter=get_index_bitmask_seed_rnd):
  401. def __init__(self, ideal_num_elements_n, error_rate_p, probe_bitnoer=get_bitno_lin_comb, filename=None, start_fresh=False):
  402. # pylint: disable=R0913
  403. # R0913: We want a few arguments
  404. if ideal_num_elements_n <= 0:
  405. raise ValueError('ideal_num_elements_n must be > 0')
  406. if not (0 < error_rate_p < 1):
  407. raise ValueError('error_rate_p must be between 0 and 1 inclusive')
  408. self.error_rate_p = error_rate_p
  409. # With fewer elements, we should do very well. With more elements, our error rate "guarantee"
  410. # drops rapidly.
  411. self.ideal_num_elements_n = ideal_num_elements_n
  412. numerator = -1 * self.ideal_num_elements_n * math.log(self.error_rate_p)
  413. denominator = math.log(2) ** 2
  414. #self.num_bits_m = - int((self.ideal_num_elements_n * math.log(self.error_rate_p)) / (math.log(2) ** 2))
  415. real_num_bits_m = numerator / denominator
  416. self.num_bits_m = int(math.ceil(real_num_bits_m))
  417. if filename is None:
  418. self.backend = Array_backend(self.num_bits_m)
  419. elif isinstance(filename, tuple) and isinstance(filename[1], int):
  420. if start_fresh:
  421. try_unlink(filename[0])
  422. if filename[1] == -1:
  423. self.backend = Mmap_backend(self.num_bits_m, filename[0])
  424. else:
  425. self.backend = Array_then_file_seek_backend(self.num_bits_m, filename[0], filename[1])
  426. else:
  427. if start_fresh:
  428. try_unlink(filename)
  429. self.backend = File_seek_backend(self.num_bits_m, filename)
  430. #array.array('L', [0]) * ((self.num_bits_m + 31) // 32)
  431. # AKA num_offsetters
  432. # Verified against http://en.wikipedia.org/wiki/Bloom_filter#Probability_of_false_positives
  433. real_num_probes_k = (self.num_bits_m / self.ideal_num_elements_n) * math.log(2)
  434. self.num_probes_k = int(math.ceil(real_num_probes_k))
  435. # This comes close, but often isn't the same value
  436. # alternative_real_num_probes_k = -math.log(self.error_rate_p) / math.log(2)
  437. #
  438. # if abs(real_num_probes_k - alternative_real_num_probes_k) > 1e-6:
  439. # sys.stderr.write('real_num_probes_k: %f, alternative_real_num_probes_k: %f\n' %
  440. # (real_num_probes_k, alternative_real_num_probes_k)
  441. # )
  442. # sys.exit(1)
  443. self.probe_bitnoer = probe_bitnoer
  444. def __repr__(self):
  445. return 'Bloom_filter(ideal_num_elements_n=%d, error_rate_p=%f, num_bits_m=%d)' % (
  446. self.ideal_num_elements_n,
  447. self.error_rate_p,
  448. self.num_bits_m,
  449. )
  450. def add(self, key):
  451. '''Add an element to the filter'''
  452. for bitno in self.probe_bitnoer(self, key):
  453. self.backend.set(bitno)
  454. def __iadd__(self, key):
  455. self.add(key)
  456. return self
  457. def _match_template(self, bloom_filter):
  458. '''Compare a sort of signature for two bloom filters. Used in preparation for binary operations'''
  459. return (self.num_bits_m == bloom_filter.num_bits_m \
  460. and self.num_probes_k == bloom_filter.num_probes_k \
  461. and self.probe_bitnoer == bloom_filter.probe_bitnoer)
  462. def union(self, bloom_filter):
  463. '''Compute the set union of two bloom filters'''
  464. self.backend |= bloom_filter.backend
  465. def __ior__(self, bloom_filter):
  466. self.union(bloom_filter)
  467. return self
  468. def intersection(self, bloom_filter):
  469. '''Compute the set intersection of two bloom filters'''
  470. self.backend &= bloom_filter.backend
  471. def __iand__(self, bloom_filter):
  472. self.intersection(bloom_filter)
  473. return self
  474. def __contains__(self, key):
  475. for bitno in self.probe_bitnoer(self, key):
  476. #wordno, bit_within_word = divmod(bitno, 32)
  477. #mask = 1 << bit_within_word
  478. #if not (self.array_[wordno] & mask):
  479. if not self.backend.is_set(bitno):
  480. return False
  481. return True
  482. #return all(self.array_[i] & mask for i, mask in self.probe_bitnoer(self, key))