drs_bloom_filter.py 20 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605
  1. # coding=utf-8
  2. # pylint: disable=superfluous-parens,redefined-variable-type
  3. # superfluous-parens: Sometimes extra parens are more clear
  4. '''Bloom Filter: Probabilistic set membership testing for large sets'''
  5. # Shamelessly borrowed (under MIT license) from http://code.activestate.com/recipes/577686-bloom-filter/
  6. # About Bloom Filters: http://en.wikipedia.org/wiki/Bloom_filter
  7. # Tweaked by Daniel Richard Stromberg, mostly to:
  8. # 1) Give it a little nicer __init__ parameters.
  9. # 2) Improve the hash functions to get a much lower rate of false positives.
  10. # 3) Give it a selection of backends.
  11. # 4) Make it pass pylint.
  12. import os
  13. #mport sys
  14. import math
  15. import array
  16. import random
  17. try:
  18. import mmap as mmap_mod
  19. except ImportError:
  20. # Jython lacks mmap()
  21. HAVE_MMAP = False
  22. else:
  23. HAVE_MMAP = True
  24. #mport bufsock
  25. #mport hashlib
  26. #mport numbers
  27. import python2x3
  28. # In the literature:
  29. # k is the number of probes - we call this num_probes_k
  30. # m is the number of bits in the filter - we call this num_bits_m
  31. # n is the ideal number of elements to eventually be stored in the filter - we call this ideal_num_elements_n
  32. # p is the desired error rate when full - we call this error_rate_p
  33. def my_range(num_values):
  34. '''Generate numbers from 0..num_values-1'''
  35. value = 0
  36. while value < num_values:
  37. yield value
  38. value += 1
  39. # In the abstract, this is what we want &= and |= to do, but especially for disk-based filters, this is extremely slow
  40. #class Backend_set_operations:
  41. # '''Provide &= and |= for backends'''
  42. # # pylint: disable=W0232
  43. # # W0232: We don't need an __init__ method; we're never instantiated directly
  44. # def __iand__(self, other):
  45. # assert self.num_bits == other.num_bits
  46. #
  47. # for bitno in my_range(num_bits):
  48. # if self.is_set(bitno) and other.is_set(bitno):
  49. # self[bitno].set()
  50. # else:
  51. # self[bitno].clear()
  52. #
  53. # def __ior__(self, other):
  54. # assert self.num_bits == other.num_bits
  55. #
  56. # for bitno in xrange(num_bits):
  57. # if self[bitno] or other[bitno]:
  58. # self[bitno].set()
  59. # else:
  60. # self[bitno].clear()
  61. if HAVE_MMAP:
  62. class Mmap_backend(object):
  63. '''
  64. Backend storage for our "array of bits" using an mmap'd file.
  65. Please note that this has only been tested on Linux so far: 2 -11-01.
  66. '''
  67. effs = 2 ^ 8 - 1
  68. def __init__(self, num_bits, filename):
  69. self.num_bits = num_bits
  70. self.num_chars = (self.num_bits + 7) // 8
  71. flags = os.O_RDWR | os.O_CREAT
  72. if hasattr(os, 'O_BINARY'):
  73. flags |= getattr(os, 'O_BINARY')
  74. self.file_ = os.open(filename, flags)
  75. os.lseek(self.file_, self.num_chars + 1, os.SEEK_SET)
  76. os.write(self.file_, python2x3.null_byte)
  77. self.mmap = mmap_mod.mmap(self.file_, self.num_chars)
  78. def is_set(self, bitno):
  79. '''Return true iff bit number bitno is set'''
  80. byteno, bit_within_wordno = divmod(bitno, 8)
  81. mask = 1 << bit_within_wordno
  82. char = self.mmap[byteno]
  83. if isinstance(char, str):
  84. byte = ord(char)
  85. else:
  86. byte = int(char)
  87. return byte & mask
  88. def set(self, bitno):
  89. '''set bit number bitno to true'''
  90. byteno, bit_within_byteno = divmod(bitno, 8)
  91. mask = 1 << bit_within_byteno
  92. char = self.mmap[byteno]
  93. byte = ord(char)
  94. byte |= mask
  95. self.mmap[byteno] = chr(byte)
  96. def clear(self, bitno):
  97. '''clear bit number bitno - set it to false'''
  98. byteno, bit_within_byteno = divmod(bitno, 8)
  99. mask = 1 << bit_within_byteno
  100. char = self.mmap[byteno]
  101. byte = ord(char)
  102. byte &= Mmap_backend.effs - mask
  103. self.mmap[byteno] = chr(byte)
  104. def __iand__(self, other):
  105. assert self.num_bits == other.num_bits
  106. for byteno in my_range(self.num_chars):
  107. self.mmap[byteno] = chr(ord(self.mmap[byteno]) & ord(other.mmap[byteno]))
  108. return self
  109. def __ior__(self, other):
  110. assert self.num_bits == other.num_bits
  111. for byteno in my_range(self.num_chars):
  112. self.mmap[byteno] = chr(ord(self.mmap[byteno]) | ord(other.mmap[byteno]))
  113. return self
  114. def close(self):
  115. '''Close the file'''
  116. os.close(self.file_)
  117. class File_seek_backend(object):
  118. '''Backend storage for our "array of bits" using a file in which we seek'''
  119. effs = 2 ^ 8 - 1
  120. def __init__(self, num_bits, filename):
  121. self.num_bits = num_bits
  122. self.num_chars = (self.num_bits + 7) // 8
  123. flags = os.O_RDWR | os.O_CREAT
  124. if hasattr(os, 'O_BINARY'):
  125. flags |= getattr(os, 'O_BINARY')
  126. self.file_ = os.open(filename, flags)
  127. os.lseek(self.file_, self.num_chars + 1, os.SEEK_SET)
  128. os.write(self.file_, python2x3.null_byte)
  129. def is_set(self, bitno):
  130. '''Return true iff bit number bitno is set'''
  131. byteno, bit_within_wordno = divmod(bitno, 8)
  132. mask = 1 << bit_within_wordno
  133. os.lseek(self.file_, byteno, os.SEEK_SET)
  134. char = os.read(self.file_, 1)
  135. if isinstance(char, str):
  136. byte = ord(char)
  137. else:
  138. byte = char[0]
  139. return byte & mask
  140. def set(self, bitno):
  141. '''set bit number bitno to true'''
  142. byteno, bit_within_byteno = divmod(bitno, 8)
  143. mask = 1 << bit_within_byteno
  144. os.lseek(self.file_, byteno, os.SEEK_SET)
  145. char = os.read(self.file_, 1)
  146. if isinstance(char, str):
  147. byte = ord(char)
  148. was_char = True
  149. else:
  150. byte = char[0]
  151. was_char = False
  152. byte |= mask
  153. os.lseek(self.file_, byteno, os.SEEK_SET)
  154. if was_char:
  155. os.write(self.file_, chr(byte))
  156. else:
  157. char = python2x3.intlist_to_binary([byte])
  158. os.write(self.file_, char)
  159. def clear(self, bitno):
  160. '''clear bit number bitno - set it to false'''
  161. byteno, bit_within_byteno = divmod(bitno, 8)
  162. mask = 1 << bit_within_byteno
  163. os.lseek(self.file_, byteno, os.SEEK_SET)
  164. char = os.read(self.file_, 1)
  165. if isinstance(char, str):
  166. byte = ord(char)
  167. was_char = True
  168. else:
  169. byte = int(char)
  170. was_char = False
  171. byte &= File_seek_backend.effs - mask
  172. os.lseek(self.file_, byteno, os.SEEK_SET)
  173. if was_char:
  174. os.write(chr(byte))
  175. else:
  176. char = python2x3.intlist_to_binary([byte])
  177. os.write(char)
  178. # These are quite slow ways to do iand and ior, but they should work, and a faster version is going to take more time
  179. def __iand__(self, other):
  180. assert self.num_bits == other.num_bits
  181. for bitno in my_range(self.num_bits):
  182. if self.is_set(bitno) and other.is_set(bitno):
  183. self.set(bitno)
  184. else:
  185. self.clear(bitno)
  186. return self
  187. def __ior__(self, other):
  188. assert self.num_bits == other.num_bits
  189. for bitno in my_range(self.num_bits):
  190. if self.is_set(bitno) or other.is_set(bitno):
  191. self.set(bitno)
  192. else:
  193. self.clear(bitno)
  194. return self
  195. def close(self):
  196. '''Close the file'''
  197. os.close(self.file_)
  198. class Array_then_file_seek_backend(object):
  199. # pylint: disable=R0902
  200. # R0902: We kinda need a bunch of instance attributes
  201. '''
  202. Backend storage for our "array of bits" using a python array of integers up to some maximum number of bytes,
  203. then spilling over to a file. This is -not- a cache; we instead save the leftmost bits in RAM, and the
  204. rightmost bits (if necessary) in a file. On open, we read from the file to RAM. On close, we write from RAM
  205. to the file.
  206. '''
  207. effs = 2 ** 8 - 1
  208. def __init__(self, num_bits, filename, max_bytes_in_memory):
  209. self.num_bits = num_bits
  210. num_chars = (self.num_bits + 7) // 8
  211. self.filename = filename
  212. self.max_bytes_in_memory = max_bytes_in_memory
  213. self.bits_in_memory = min(num_bits, self.max_bytes_in_memory * 8)
  214. self.bits_in_file = max(self.num_bits - self.bits_in_memory, 0)
  215. self.bytes_in_memory = (self.bits_in_memory + 7) // 8
  216. self.bytes_in_file = (self.bits_in_file + 7) // 8
  217. self.array_ = array.array('B', [0]) * self.bytes_in_memory
  218. flags = os.O_RDWR | os.O_CREAT
  219. if hasattr(os, 'O_BINARY'):
  220. flags |= getattr(os, 'O_BINARY')
  221. self.file_ = os.open(filename, flags)
  222. os.lseek(self.file_, num_chars + 1, os.SEEK_SET)
  223. os.write(self.file_, python2x3.null_byte)
  224. os.lseek(self.file_, 0, os.SEEK_SET)
  225. offset = 0
  226. intended_block_len = 2 ** 17
  227. while True:
  228. if offset + intended_block_len < self.bytes_in_memory:
  229. block = os.read(self.file_, intended_block_len)
  230. elif offset < self.bytes_in_memory:
  231. block = os.read(self.file_, self.bytes_in_memory - offset)
  232. else:
  233. break
  234. for index_in_block, character in enumerate(block):
  235. self.array_[offset + index_in_block] = ord(character)
  236. offset += intended_block_len
  237. def is_set(self, bitno):
  238. '''Return true iff bit number bitno is set'''
  239. byteno, bit_within_byteno = divmod(bitno, 8)
  240. mask = 1 << bit_within_byteno
  241. if byteno < self.bytes_in_memory:
  242. return self.array_[byteno] & mask
  243. else:
  244. os.lseek(self.file_, byteno, os.SEEK_SET)
  245. char = os.read(self.file_, 1)
  246. if isinstance(char, str):
  247. byte = ord(char)
  248. else:
  249. byte = int(char)
  250. return byte & mask
  251. def set(self, bitno):
  252. '''set bit number bitno to true'''
  253. byteno, bit_within_byteno = divmod(bitno, 8)
  254. mask = 1 << bit_within_byteno
  255. if byteno < self.bytes_in_memory:
  256. self.array_[byteno] |= mask
  257. else:
  258. os.lseek(self.file_, byteno, os.SEEK_SET)
  259. char = os.read(self.file_, 1)
  260. if isinstance(char, str):
  261. byte = ord(char)
  262. was_char = True
  263. else:
  264. byte = char
  265. was_char = False
  266. byte |= mask
  267. os.lseek(self.file_, byteno, os.SEEK_SET)
  268. if was_char:
  269. os.write(self.file_, chr(byte))
  270. else:
  271. os.write(self.file_, byte)
  272. def clear(self, bitno):
  273. '''clear bit number bitno - set it to false'''
  274. byteno, bit_within_byteno = divmod(bitno, 8)
  275. mask = Array_backend.effs - (1 << bit_within_byteno)
  276. if byteno < self.bytes_in_memory:
  277. self.array_[byteno] &= mask
  278. else:
  279. os.lseek(self.file_, byteno, os.SEEK_SET)
  280. char = os.read(self.file_, 1)
  281. if isinstance(char, str):
  282. byte = ord(char)
  283. was_char = True
  284. else:
  285. byte = int(char)
  286. was_char = False
  287. byte &= File_seek_backend.effs - mask
  288. os.lseek(self.file_, byteno, os.SEEK_SET)
  289. if was_char:
  290. os.write(chr(byte))
  291. else:
  292. os.write(byte)
  293. # These are quite slow ways to do iand and ior, but they should work, and a faster version is going to take more time
  294. def __iand__(self, other):
  295. assert self.num_bits == other.num_bits
  296. for bitno in my_range(self.num_bits):
  297. if self.is_set(bitno) and other.is_set(bitno):
  298. self.set(bitno)
  299. else:
  300. self.clear(bitno)
  301. return self
  302. def __ior__(self, other):
  303. assert self.num_bits == other.num_bits
  304. for bitno in my_range(self.num_bits):
  305. if self.is_set(bitno) or other.is_set(bitno):
  306. self.set(bitno)
  307. else:
  308. self.clear(bitno)
  309. return self
  310. def close(self):
  311. '''Write the in-memory portion to disk, leave the already-on-disk portion unchanged'''
  312. os.lseek(self.file_, 0, os.SEEK_SET)
  313. for index in my_range(self.bytes_in_memory):
  314. self.file_.write(self.array_[index])
  315. os.close(self.file_)
  316. class Array_backend(object):
  317. '''Backend storage for our "array of bits" using a python array of integers'''
  318. # Note that this has now been split out into a bits_mod for the benefit of other projects.
  319. effs = 2 ** 32 - 1
  320. def __init__(self, num_bits):
  321. self.num_bits = num_bits
  322. self.num_words = (self.num_bits + 31) // 32
  323. self.array_ = array.array('L', [0]) * self.num_words
  324. def is_set(self, bitno):
  325. '''Return true iff bit number bitno is set'''
  326. wordno, bit_within_wordno = divmod(bitno, 32)
  327. mask = 1 << bit_within_wordno
  328. return self.array_[wordno] & mask
  329. def set(self, bitno):
  330. '''set bit number bitno to true'''
  331. wordno, bit_within_wordno = divmod(bitno, 32)
  332. mask = 1 << bit_within_wordno
  333. self.array_[wordno] |= mask
  334. def clear(self, bitno):
  335. '''clear bit number bitno - set it to false'''
  336. wordno, bit_within_wordno = divmod(bitno, 32)
  337. mask = Array_backend.effs - (1 << bit_within_wordno)
  338. self.array_[wordno] &= mask
  339. # It'd be nice to do __iand__ and __ior__ in a base class, but that'd be Much slower
  340. def __iand__(self, other):
  341. assert self.num_bits == other.num_bits
  342. for wordno in my_range(self.num_words):
  343. self.array_[wordno] &= other.array_[wordno]
  344. return self
  345. def __ior__(self, other):
  346. assert self.num_bits == other.num_bits
  347. for wordno in my_range(self.num_words):
  348. self.array_[wordno] |= other.array_[wordno]
  349. return self
  350. def close(self):
  351. '''Noop for compatibility with the file+seek backend'''
  352. pass
  353. def get_bitno_seed_rnd(bloom_filter, key):
  354. '''Apply num_probes_k hash functions to key. Generate the array index and bitmask corresponding to each result'''
  355. # We're using key as a seed to a pseudorandom number generator
  356. hasher = random.Random(key).randrange
  357. for dummy in range(bloom_filter.num_probes_k):
  358. bitno = hasher(bloom_filter.num_bits_m)
  359. yield bitno % bloom_filter.num_bits_m
  360. MERSENNES1 = [2 ** x - 1 for x in [17, 31, 127]]
  361. MERSENNES2 = [2 ** x - 1 for x in [19, 67, 257]]
  362. def simple_hash(int_list, prime1, prime2, prime3):
  363. '''Compute a hash value from a list of integers and 3 primes'''
  364. result = 0
  365. for integer in int_list:
  366. result += ((result + integer + prime1) * prime2) % prime3
  367. return result
  368. def hash1(int_list):
  369. '''Basic hash function #1'''
  370. return simple_hash(int_list, MERSENNES1[0], MERSENNES1[1], MERSENNES1[2])
  371. def hash2(int_list):
  372. '''Basic hash function #2'''
  373. return simple_hash(int_list, MERSENNES2[0], MERSENNES2[1], MERSENNES2[2])
  374. def get_bitno_lin_comb(bloom_filter, key):
  375. '''Apply num_probes_k hash functions to key. Generate the array index and bitmask corresponding to each result'''
  376. # This one assumes key is either bytes or str (or other list of integers)
  377. # I'd love to check for long too, but that doesn't exist in 3.2, and 2.5 doesn't have the numbers.Integral base type
  378. if hasattr(key, '__divmod__'):
  379. int_list = []
  380. temp = key
  381. while temp:
  382. quotient, remainder = divmod(temp, 256)
  383. int_list.append(remainder)
  384. temp = quotient
  385. elif hasattr(key[0], '__divmod__'):
  386. int_list = key
  387. elif isinstance(key[0], str):
  388. int_list = [ord(char) for char in key]
  389. else:
  390. raise TypeError('Sorry, I do not know how to hash this type')
  391. hash_value1 = hash1(int_list)
  392. hash_value2 = hash2(int_list)
  393. # We're using linear combinations of hash_value1 and hash_value2 to obtain num_probes_k hash functions
  394. for probeno in range(1, bloom_filter.num_probes_k + 1):
  395. bit_index = hash_value1 + probeno * hash_value2
  396. yield bit_index % bloom_filter.num_bits_m
  397. def try_unlink(filename):
  398. '''unlink a file. Don't complain if it's not there'''
  399. try:
  400. os.unlink(filename)
  401. except OSError:
  402. pass
  403. return
  404. class Bloom_filter(object):
  405. '''Probabilistic set membership testing for large sets'''
  406. #def __init__(self, ideal_num_elements_n, error_rate_p, probe_offsetter=get_index_bitmask_seed_rnd):
  407. def __init__(self, ideal_num_elements_n, error_rate_p, probe_bitnoer=get_bitno_lin_comb, filename=None, start_fresh=False):
  408. # pylint: disable=R0913
  409. # R0913: We want a few arguments
  410. if ideal_num_elements_n <= 0:
  411. raise ValueError('ideal_num_elements_n must be > 0')
  412. if not (0 < error_rate_p < 1):
  413. raise ValueError('error_rate_p must be between 0 and 1 exclusive')
  414. self.error_rate_p = error_rate_p
  415. # With fewer elements, we should do very well. With more elements, our error rate "guarantee"
  416. # drops rapidly.
  417. self.ideal_num_elements_n = ideal_num_elements_n
  418. numerator = -1 * self.ideal_num_elements_n * math.log(self.error_rate_p)
  419. denominator = math.log(2) ** 2
  420. #self.num_bits_m = - int((self.ideal_num_elements_n * math.log(self.error_rate_p)) / (math.log(2) ** 2))
  421. real_num_bits_m = numerator / denominator
  422. self.num_bits_m = int(math.ceil(real_num_bits_m))
  423. if filename is None:
  424. self.backend = Array_backend(self.num_bits_m)
  425. elif isinstance(filename, tuple) and isinstance(filename[1], int):
  426. if start_fresh:
  427. try_unlink(filename[0])
  428. if filename[1] == -1:
  429. self.backend = Mmap_backend(self.num_bits_m, filename[0])
  430. else:
  431. self.backend = Array_then_file_seek_backend(self.num_bits_m, filename[0], filename[1])
  432. else:
  433. if start_fresh:
  434. try_unlink(filename)
  435. self.backend = File_seek_backend(self.num_bits_m, filename)
  436. #array.array('L', [0]) * ((self.num_bits_m + 31) // 32)
  437. # AKA num_offsetters
  438. # Verified against http://en.wikipedia.org/wiki/Bloom_filter#Probability_of_false_positives
  439. real_num_probes_k = (self.num_bits_m / self.ideal_num_elements_n) * math.log(2)
  440. self.num_probes_k = int(math.ceil(real_num_probes_k))
  441. # This comes close, but often isn't the same value
  442. # alternative_real_num_probes_k = -math.log(self.error_rate_p) / math.log(2)
  443. #
  444. # if abs(real_num_probes_k - alternative_real_num_probes_k) > 1e-6:
  445. # sys.stderr.write('real_num_probes_k: %f, alternative_real_num_probes_k: %f\n' %
  446. # (real_num_probes_k, alternative_real_num_probes_k)
  447. # )
  448. # sys.exit(1)
  449. self.probe_bitnoer = probe_bitnoer
  450. def __repr__(self):
  451. return 'Bloom_filter(ideal_num_elements_n=%d, error_rate_p=%f, num_bits_m=%d)' % (
  452. self.ideal_num_elements_n,
  453. self.error_rate_p,
  454. self.num_bits_m,
  455. )
  456. def add(self, key):
  457. '''Add an element to the filter'''
  458. for bitno in self.probe_bitnoer(self, key):
  459. self.backend.set(bitno)
  460. def __iadd__(self, key):
  461. self.add(key)
  462. return self
  463. def _match_template(self, bloom_filter):
  464. '''Compare a sort of signature for two bloom filters. Used in preparation for binary operations'''
  465. return (self.num_bits_m == bloom_filter.num_bits_m
  466. and self.num_probes_k == bloom_filter.num_probes_k
  467. and self.probe_bitnoer == bloom_filter.probe_bitnoer)
  468. def union(self, bloom_filter):
  469. '''Compute the set union of two bloom filters'''
  470. self.backend |= bloom_filter.backend
  471. def __ior__(self, bloom_filter):
  472. self.union(bloom_filter)
  473. return self
  474. def intersection(self, bloom_filter):
  475. '''Compute the set intersection of two bloom filters'''
  476. self.backend &= bloom_filter.backend
  477. def __iand__(self, bloom_filter):
  478. self.intersection(bloom_filter)
  479. return self
  480. def __contains__(self, key):
  481. for bitno in self.probe_bitnoer(self, key):
  482. #wordno, bit_within_word = divmod(bitno, 32)
  483. #mask = 1 << bit_within_word
  484. #if not (self.array_[wordno] & mask):
  485. if not self.backend.is_set(bitno):
  486. return False
  487. return True
  488. #return all(self.array_[i] & mask for i, mask in self.probe_bitnoer(self, key))