bloom_filter_mod.py 20 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600
  1. '''Bloom Filter: Probabilistic set membership testing for large sets'''
  2. # Shamelessly borrowed (under MIT license) from http://code.activestate.com/recipes/577686-bloom-filter/
  3. # About Bloom Filters: http://en.wikipedia.org/wiki/Bloom_filter
  4. # Tweaked by Daniel Richard Stromberg, mostly to:
  5. # 1) Give it a little nicer __init__ parameters.
  6. # 2) Improve the hash functions to get a much lower rate of false positives
  7. # 3) Make it pass pylint
  8. import os
  9. #mport sys
  10. import math
  11. import array
  12. import random
  13. #mport bufsock
  14. #mport hashlib
  15. #mport numbers
  16. import python2x3
  17. try:
  18. import mmap as mmap_mod
  19. except ImportError:
  20. # Jython lacks mmap()
  21. HAVE_MMAP = False
  22. else:
  23. HAVE_MMAP = True
  24. # In the literature:
  25. # k is the number of probes - we call this num_probes_k
  26. # m is the number of bits in the filter - we call this num_bits_m
  27. # n is the ideal number of elements to eventually be stored in the filter - we call this ideal_num_elements_n
  28. # p is the desired error rate when full - we call this error_rate_p
  29. def my_range(num_values):
  30. '''Generate numbers from 0..num_values-1'''
  31. value = 0
  32. while value < num_values:
  33. yield value
  34. value += 1
  35. # In the abstract, this is what we want &= and |= to do, but especially for disk-based filters, this is extremely slow
  36. #class Backend_set_operations:
  37. # '''Provide &= and |= for backends'''
  38. # # pylint: disable=W0232
  39. # # W0232: We don't need an __init__ method; we're never instantiated directly
  40. # def __iand__(self, other):
  41. # assert self.num_bits == other.num_bits
  42. #
  43. # for bitno in my_range(num_bits):
  44. # if self.is_set(bitno) and other.is_set(bitno):
  45. # self[bitno].set()
  46. # else:
  47. # self[bitno].clear()
  48. #
  49. # def __ior__(self, other):
  50. # assert self.num_bits == other.num_bits
  51. #
  52. # for bitno in xrange(num_bits):
  53. # if self[bitno] or other[bitno]:
  54. # self[bitno].set()
  55. # else:
  56. # self[bitno].clear()
  57. if HAVE_MMAP:
  58. class Mmap_backend(object):
  59. '''
  60. Backend storage for our "array of bits" using an mmap'd file.
  61. Please note that this has only been tested on Linux so far: 2 -11-01.
  62. '''
  63. effs = 2 ^ 8 - 1
  64. def __init__(self, num_bits, filename):
  65. self.num_bits = num_bits
  66. self.num_chars = (self.num_bits + 7) // 8
  67. flags = os.O_RDWR | os.O_CREAT
  68. if hasattr(os, 'O_BINARY'):
  69. flags |= getattr(os, 'O_BINARY')
  70. self.file_ = os.open(filename, flags)
  71. os.lseek(self.file_, self.num_chars + 1, os.SEEK_SET)
  72. os.write(self.file_, python2x3.null_byte)
  73. self.mmap = mmap_mod.mmap(self.file_, self.num_chars)
  74. def is_set(self, bitno):
  75. '''Return true iff bit number bitno is set'''
  76. byteno, bit_within_wordno = divmod(bitno, 8)
  77. mask = 1 << bit_within_wordno
  78. char = self.mmap[byteno]
  79. if isinstance(char, str):
  80. byte = ord(char)
  81. else:
  82. byte = int(char)
  83. return byte & mask
  84. def set(self, bitno):
  85. '''set bit number bitno to true'''
  86. byteno, bit_within_byteno = divmod(bitno, 8)
  87. mask = 1 << bit_within_byteno
  88. char = self.mmap[byteno]
  89. byte = ord(char)
  90. byte |= mask
  91. self.mmap[byteno] = chr(byte)
  92. def clear(self, bitno):
  93. '''clear bit number bitno - set it to false'''
  94. byteno, bit_within_byteno = divmod(bitno, 8)
  95. mask = 1 << bit_within_byteno
  96. char = self.mmap[byteno]
  97. byte = ord(char)
  98. byte &= Mmap_backend.effs - mask
  99. self.mmap[byteno] = chr(byte)
  100. def __iand__(self, other):
  101. assert self.num_bits == other.num_bits
  102. for byteno in my_range(self.num_chars):
  103. self.mmap[byteno] = chr(ord(self.mmap[byteno]) & ord(other.mmap[byteno]))
  104. return self
  105. def __ior__(self, other):
  106. assert self.num_bits == other.num_bits
  107. for byteno in my_range(self.num_chars):
  108. self.mmap[byteno] = chr(ord(self.mmap[byteno]) | ord(other.mmap[byteno]))
  109. return self
  110. def close(self):
  111. '''Close the file'''
  112. os.close(self.file_)
  113. class File_seek_backend(object):
  114. '''Backend storage for our "array of bits" using a file in which we seek'''
  115. effs = 2 ^ 8 - 1
  116. def __init__(self, num_bits, filename):
  117. self.num_bits = num_bits
  118. self.num_chars = (self.num_bits + 7) // 8
  119. flags = os.O_RDWR | os.O_CREAT
  120. if hasattr(os, 'O_BINARY'):
  121. flags |= getattr(os, 'O_BINARY')
  122. self.file_ = os.open(filename, flags)
  123. os.lseek(self.file_, self.num_chars + 1, os.SEEK_SET)
  124. os.write(self.file_, python2x3.null_byte)
  125. def is_set(self, bitno):
  126. '''Return true iff bit number bitno is set'''
  127. byteno, bit_within_wordno = divmod(bitno, 8)
  128. mask = 1 << bit_within_wordno
  129. os.lseek(self.file_, byteno, os.SEEK_SET)
  130. char = os.read(self.file_, 1)
  131. if isinstance(char, str):
  132. byte = ord(char)
  133. else:
  134. byte = char[0]
  135. return byte & mask
  136. def set(self, bitno):
  137. '''set bit number bitno to true'''
  138. byteno, bit_within_byteno = divmod(bitno, 8)
  139. mask = 1 << bit_within_byteno
  140. os.lseek(self.file_, byteno, os.SEEK_SET)
  141. char = os.read(self.file_, 1)
  142. if isinstance(char, str):
  143. byte = ord(char)
  144. was_char = True
  145. else:
  146. byte = char[0]
  147. was_char = False
  148. byte |= mask
  149. os.lseek(self.file_, byteno, os.SEEK_SET)
  150. if was_char:
  151. os.write(self.file_, chr(byte))
  152. else:
  153. char = python2x3.intlist_to_binary([byte])
  154. os.write(self.file_, char)
  155. def clear(self, bitno):
  156. '''clear bit number bitno - set it to false'''
  157. byteno, bit_within_byteno = divmod(bitno, 8)
  158. mask = 1 << bit_within_byteno
  159. os.lseek(self.file_, byteno, os.SEEK_SET)
  160. char = os.read(self.file_, 1)
  161. if isinstance(char, str):
  162. byte = ord(char)
  163. was_char = True
  164. else:
  165. byte = int(char)
  166. was_char = False
  167. byte &= File_seek_backend.effs - mask
  168. os.lseek(self.file_, byteno, os.SEEK_SET)
  169. if was_char:
  170. os.write(chr(byte))
  171. else:
  172. char = python2x3.intlist_to_binary([byte])
  173. os.write(char)
  174. # These are quite slow ways to do iand and ior, but they should work, and a faster version is going to take more time
  175. def __iand__(self, other):
  176. assert self.num_bits == other.num_bits
  177. for bitno in my_range(self.num_bits):
  178. if self.is_set(bitno) and other.is_set(bitno):
  179. self.set(bitno)
  180. else:
  181. self.clear(bitno)
  182. return self
  183. def __ior__(self, other):
  184. assert self.num_bits == other.num_bits
  185. for bitno in my_range(self.num_bits):
  186. if self.is_set(bitno) or other.is_set(bitno):
  187. self.set(bitno)
  188. else:
  189. self.clear(bitno)
  190. return self
  191. def close(self):
  192. '''Close the file'''
  193. os.close(self.file_)
  194. class Array_then_file_seek_backend(object):
  195. # pylint: disable=R0902
  196. # R0902: We kinda need a bunch of instance attributes
  197. '''
  198. Backend storage for our "array of bits" using a python array of integers up to some maximum number of bytes,
  199. then spilling over to a file. This is -not- a cache; we instead save the leftmost bits in RAM, and the
  200. rightmost bits (if necessary) in a file. On open, we read from the file to RAM. On close, we write from RAM
  201. to the file.
  202. '''
  203. effs = 2 ** 8 - 1
  204. def __init__(self, num_bits, filename, max_bytes_in_memory):
  205. self.num_bits = num_bits
  206. num_chars = (self.num_bits + 7) // 8
  207. self.filename = filename
  208. self.max_bytes_in_memory = max_bytes_in_memory
  209. self.bits_in_memory = min(num_bits, self.max_bytes_in_memory * 8)
  210. self.bits_in_file = max(self.num_bits - self.bits_in_memory, 0)
  211. self.bytes_in_memory = (self.bits_in_memory + 7) // 8
  212. self.bytes_in_file = (self.bits_in_file + 7) // 8
  213. self.array_ = array.array('B', [0]) * self.bytes_in_memory
  214. flags = os.O_RDWR | os.O_CREAT
  215. if hasattr(os, 'O_BINARY'):
  216. flags |= getattr(os, 'O_BINARY')
  217. self.file_ = os.open(filename, flags)
  218. os.lseek(self.file_, num_chars + 1, os.SEEK_SET)
  219. os.write(self.file_, python2x3.null_byte)
  220. os.lseek(self.file_, 0, os.SEEK_SET)
  221. offset = 0
  222. intended_block_len = 2 ** 17
  223. while True:
  224. if offset + intended_block_len < self.bytes_in_memory:
  225. block = os.read(self.file_, intended_block_len)
  226. elif offset < self.bytes_in_memory:
  227. block = os.read(self.file_, self.bytes_in_memory - offset)
  228. else:
  229. break
  230. for index_in_block, character in enumerate(block):
  231. self.array_[offset + index_in_block] = ord(character)
  232. offset += intended_block_len
  233. def is_set(self, bitno):
  234. '''Return true iff bit number bitno is set'''
  235. byteno, bit_within_byteno = divmod(bitno, 8)
  236. mask = 1 << bit_within_byteno
  237. if byteno < self.bytes_in_memory:
  238. return self.array_[byteno] & mask
  239. else:
  240. os.lseek(self.file_, byteno, os.SEEK_SET)
  241. char = os.read(self.file_, 1)
  242. if isinstance(char, str):
  243. byte = ord(char)
  244. else:
  245. byte = int(char)
  246. return byte & mask
  247. def set(self, bitno):
  248. '''set bit number bitno to true'''
  249. byteno, bit_within_byteno = divmod(bitno, 8)
  250. mask = 1 << bit_within_byteno
  251. if byteno < self.bytes_in_memory:
  252. self.array_[byteno] |= mask
  253. else:
  254. os.lseek(self.file_, byteno, os.SEEK_SET)
  255. char = os.read(self.file_, 1)
  256. if isinstance(char, str):
  257. byte = ord(char)
  258. was_char = True
  259. else:
  260. byte = char
  261. was_char = False
  262. byte |= mask
  263. os.lseek(self.file_, byteno, os.SEEK_SET)
  264. if was_char:
  265. os.write(self.file_, chr(byte))
  266. else:
  267. os.write(self.file_, byte)
  268. def clear(self, bitno):
  269. '''clear bit number bitno - set it to false'''
  270. byteno, bit_within_byteno = divmod(bitno, 8)
  271. mask = Array_backend.effs - (1 << bit_within_byteno)
  272. if byteno < self.bytes_in_memory:
  273. self.array_[byteno] &= mask
  274. else:
  275. os.lseek(self.file_, byteno, os.SEEK_SET)
  276. char = os.read(self.file_, 1)
  277. if isinstance(char, str):
  278. byte = ord(char)
  279. was_char = True
  280. else:
  281. byte = int(char)
  282. was_char = False
  283. byte &= File_seek_backend.effs - mask
  284. os.lseek(self.file_, byteno, os.SEEK_SET)
  285. if was_char:
  286. os.write(chr(byte))
  287. else:
  288. os.write(byte)
  289. # These are quite slow ways to do iand and ior, but they should work, and a faster version is going to take more time
  290. def __iand__(self, other):
  291. assert self.num_bits == other.num_bits
  292. for bitno in my_range(self.num_bits):
  293. if self.is_set(bitno) and other.is_set(bitno):
  294. self.set(bitno)
  295. else:
  296. self.clear(bitno)
  297. return self
  298. def __ior__(self, other):
  299. assert self.num_bits == other.num_bits
  300. for bitno in my_range(self.num_bits):
  301. if self.is_set(bitno) or other.is_set(bitno):
  302. self.set(bitno)
  303. else:
  304. self.clear(bitno)
  305. return self
  306. def close(self):
  307. '''Write the in-memory portion to disk, leave the already-on-disk portion unchanged'''
  308. os.lseek(self.file_, 0, os.SEEK_SET)
  309. for index in my_range(self.bytes_in_memory):
  310. self.file_.write(self.array_[index])
  311. os.close(self.file_)
  312. class Array_backend(object):
  313. '''Backend storage for our "array of bits" using a python array of integers'''
  314. # Note that this has now been split out into a bits_mod for the benefit of other projects.
  315. effs = 2 ** 32 - 1
  316. def __init__(self, num_bits):
  317. self.num_bits = num_bits
  318. self.num_words = (self.num_bits + 31) // 32
  319. self.array_ = array.array('L', [0]) * self.num_words
  320. def is_set(self, bitno):
  321. '''Return true iff bit number bitno is set'''
  322. wordno, bit_within_wordno = divmod(bitno, 32)
  323. mask = 1 << bit_within_wordno
  324. return self.array_[wordno] & mask
  325. def set(self, bitno):
  326. '''set bit number bitno to true'''
  327. wordno, bit_within_wordno = divmod(bitno, 32)
  328. mask = 1 << bit_within_wordno
  329. self.array_[wordno] |= mask
  330. def clear(self, bitno):
  331. '''clear bit number bitno - set it to false'''
  332. wordno, bit_within_wordno = divmod(bitno, 32)
  333. mask = Array_backend.effs - (1 << bit_within_wordno)
  334. self.array_[wordno] &= mask
  335. # It'd be nice to do __iand__ and __ior__ in a base class, but that'd be Much slower
  336. def __iand__(self, other):
  337. assert self.num_bits == other.num_bits
  338. for wordno in my_range(self.num_words):
  339. self.array_[wordno] &= other.array_[wordno]
  340. return self
  341. def __ior__(self, other):
  342. assert self.num_bits == other.num_bits
  343. for wordno in my_range(self.num_words):
  344. self.array_[wordno] |= other.array_[wordno]
  345. return self
  346. def close(self):
  347. '''Noop for compatibility with the file+seek backend'''
  348. pass
  349. def get_bitno_seed_rnd(bloom_filter, key):
  350. '''Apply num_probes_k hash functions to key. Generate the array index and bitmask corresponding to each result'''
  351. # We're using key as a seed to a pseudorandom number generator
  352. hasher = random.Random(key).randrange
  353. for dummy in range(bloom_filter.num_probes_k):
  354. bitno = hasher(bloom_filter.num_bits_m)
  355. yield bitno % bloom_filter.num_bits_m
  356. MERSENNES1 = [2 ** x - 1 for x in [17, 31, 127]]
  357. MERSENNES2 = [2 ** x - 1 for x in [19, 67, 257]]
  358. def simple_hash(int_list, prime1, prime2, prime3):
  359. '''Compute a hash value from a list of integers and 3 primes'''
  360. result = 0
  361. for integer in int_list:
  362. result += ((result + integer + prime1) * prime2) % prime3
  363. return result
  364. def hash1(int_list):
  365. '''Basic hash function #1'''
  366. return simple_hash(int_list, MERSENNES1[0], MERSENNES1[1], MERSENNES1[2])
  367. def hash2(int_list):
  368. '''Basic hash function #2'''
  369. return simple_hash(int_list, MERSENNES2[0], MERSENNES2[1], MERSENNES2[2])
  370. def get_bitno_lin_comb(bloom_filter, key):
  371. '''Apply num_probes_k hash functions to key. Generate the array index and bitmask corresponding to each result'''
  372. # This one assumes key is either bytes or str (or other list of integers)
  373. # I'd love to check for long too, but that doesn't exist in 3.2, and 2.5 doesn't have the numbers.Integral base type
  374. if hasattr(key, '__divmod__'):
  375. int_list = []
  376. temp = key
  377. while temp:
  378. quotient, remainder = divmod(temp, 256)
  379. int_list.append(remainder)
  380. temp = quotient
  381. elif hasattr(key[0], '__divmod__'):
  382. int_list = key
  383. elif isinstance(key[0], str):
  384. int_list = [ord(char) for char in key]
  385. else:
  386. raise TypeError('Sorry, I do not know how to hash this type')
  387. hash_value1 = hash1(int_list)
  388. hash_value2 = hash2(int_list)
  389. # We're using linear combinations of hash_value1 and hash_value2 to obtain num_probes_k hash functions
  390. for probeno in range(1, bloom_filter.num_probes_k + 1):
  391. bit_index = hash_value1 + probeno * hash_value2
  392. yield bit_index % bloom_filter.num_bits_m
  393. def try_unlink(filename):
  394. '''unlink a file. Don't complain if it's not there'''
  395. try:
  396. os.unlink(filename)
  397. except OSError:
  398. pass
  399. return
  400. class Bloom_filter(object):
  401. '''Probabilistic set membership testing for large sets'''
  402. #def __init__(self, ideal_num_elements_n, error_rate_p, probe_offsetter=get_index_bitmask_seed_rnd):
  403. def __init__(self, ideal_num_elements_n, error_rate_p, probe_bitnoer=get_bitno_lin_comb, filename=None, start_fresh=False):
  404. # pylint: disable=R0913
  405. # R0913: We want a few arguments
  406. if ideal_num_elements_n <= 0:
  407. raise ValueError('ideal_num_elements_n must be > 0')
  408. if not (0 < error_rate_p < 1):
  409. raise ValueError('error_rate_p must be between 0 and 1 exclusive')
  410. self.error_rate_p = error_rate_p
  411. # With fewer elements, we should do very well. With more elements, our error rate "guarantee"
  412. # drops rapidly.
  413. self.ideal_num_elements_n = ideal_num_elements_n
  414. numerator = -1 * self.ideal_num_elements_n * math.log(self.error_rate_p)
  415. denominator = math.log(2) ** 2
  416. #self.num_bits_m = - int((self.ideal_num_elements_n * math.log(self.error_rate_p)) / (math.log(2) ** 2))
  417. real_num_bits_m = numerator / denominator
  418. self.num_bits_m = int(math.ceil(real_num_bits_m))
  419. if filename is None:
  420. self.backend = Array_backend(self.num_bits_m)
  421. elif isinstance(filename, tuple) and isinstance(filename[1], int):
  422. if start_fresh:
  423. try_unlink(filename[0])
  424. if filename[1] == -1:
  425. self.backend = Mmap_backend(self.num_bits_m, filename[0])
  426. else:
  427. self.backend = Array_then_file_seek_backend(self.num_bits_m, filename[0], filename[1])
  428. else:
  429. if start_fresh:
  430. try_unlink(filename)
  431. self.backend = File_seek_backend(self.num_bits_m, filename)
  432. #array.array('L', [0]) * ((self.num_bits_m + 31) // 32)
  433. # AKA num_offsetters
  434. # Verified against http://en.wikipedia.org/wiki/Bloom_filter#Probability_of_false_positives
  435. real_num_probes_k = (self.num_bits_m / self.ideal_num_elements_n) * math.log(2)
  436. self.num_probes_k = int(math.ceil(real_num_probes_k))
  437. # This comes close, but often isn't the same value
  438. # alternative_real_num_probes_k = -math.log(self.error_rate_p) / math.log(2)
  439. #
  440. # if abs(real_num_probes_k - alternative_real_num_probes_k) > 1e-6:
  441. # sys.stderr.write('real_num_probes_k: %f, alternative_real_num_probes_k: %f\n' %
  442. # (real_num_probes_k, alternative_real_num_probes_k)
  443. # )
  444. # sys.exit(1)
  445. self.probe_bitnoer = probe_bitnoer
  446. def __repr__(self):
  447. return 'Bloom_filter(ideal_num_elements_n=%d, error_rate_p=%f, num_bits_m=%d)' % (
  448. self.ideal_num_elements_n,
  449. self.error_rate_p,
  450. self.num_bits_m,
  451. )
  452. def add(self, key):
  453. '''Add an element to the filter'''
  454. for bitno in self.probe_bitnoer(self, key):
  455. self.backend.set(bitno)
  456. def __iadd__(self, key):
  457. self.add(key)
  458. return self
  459. def _match_template(self, bloom_filter):
  460. '''Compare a sort of signature for two bloom filters. Used in preparation for binary operations'''
  461. return (self.num_bits_m == bloom_filter.num_bits_m
  462. and self.num_probes_k == bloom_filter.num_probes_k
  463. and self.probe_bitnoer == bloom_filter.probe_bitnoer)
  464. def union(self, bloom_filter):
  465. '''Compute the set union of two bloom filters'''
  466. self.backend |= bloom_filter.backend
  467. def __ior__(self, bloom_filter):
  468. self.union(bloom_filter)
  469. return self
  470. def intersection(self, bloom_filter):
  471. '''Compute the set intersection of two bloom filters'''
  472. self.backend &= bloom_filter.backend
  473. def __iand__(self, bloom_filter):
  474. self.intersection(bloom_filter)
  475. return self
  476. def __contains__(self, key):
  477. for bitno in self.probe_bitnoer(self, key):
  478. #wordno, bit_within_word = divmod(bitno, 32)
  479. #mask = 1 << bit_within_word
  480. #if not (self.array_[wordno] & mask):
  481. if not self.backend.is_set(bitno):
  482. return False
  483. return True
  484. #return all(self.array_[i] & mask for i, mask in self.probe_bitnoer(self, key))