123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598 |
- '''Bloom Filter: Probabilistic set membership testing for large sets'''
- # Shamelessly borrowed (under MIT license) from http://code.activestate.com/recipes/577686-bloom-filter/
- # About Bloom Filters: http://en.wikipedia.org/wiki/Bloom_filter
- # Tweaked by Daniel Richard Stromberg, mostly to:
- # 1) Give it a little nicer __init__ parameters.
- # 2) Improve the hash functions to get a much lower rate of false positives
- # 3) Make it pass pylint
- import os
- #mport sys
- import math
- import array
- import random
- #mport bufsock
- #mport hashlib
- #mport numbers
- import python2x3
- try:
- import mmap as mmap_mod
- except ImportError:
- # Jython lacks mmap()
- HAVE_MMAP = False
- else:
- HAVE_MMAP = True
- # In the literature:
- # k is the number of probes - we call this num_probes_k
- # m is the number of bits in the filter - we call this num_bits_m
- # n is the ideal number of elements to eventually be stored in the filter - we call this ideal_num_elements_n
- # p is the desired error rate when full - we call this error_rate_p
- def my_range(num_values):
- '''Generate numbers from 0..num_values-1'''
- value = 0
- while value < num_values:
- yield value
- value += 1
- # In the abstract, this is what we want &= and |= to do, but especially for disk-based filters, this is extremely slow
- #class Backend_set_operations:
- # '''Provide &= and |= for backends'''
- # # pylint: disable=W0232
- # # W0232: We don't need an __init__ method; we're never instantiated directly
- # def __iand__(self, other):
- # assert self.num_bits == other.num_bits
- #
- # for bitno in my_range(num_bits):
- # if self.is_set(bitno) and other.is_set(bitno):
- # self[bitno].set()
- # else:
- # self[bitno].clear()
- #
- # def __ior__(self, other):
- # assert self.num_bits == other.num_bits
- #
- # for bitno in xrange(num_bits):
- # if self[bitno] or other[bitno]:
- # self[bitno].set()
- # else:
- # self[bitno].clear()
- if HAVE_MMAP:
- class Mmap_backend:
- '''
- Backend storage for our "array of bits" using an mmap'd file.
- Please note that this has only been tested on Linux so far: 2011-11-01.
- '''
- effs = 2 ^ 8 - 1
- def __init__(self, num_bits, filename):
- self.num_bits = num_bits
- self.num_chars = (self.num_bits + 7) // 8
- flags = os.O_RDWR | os.O_CREAT
- if hasattr(os, 'O_BINARY'):
- flags |= getattr(os, 'O_BINARY')
- self.file_ = os.open(filename, flags)
- os.lseek(self.file_, self.num_chars + 1, os.SEEK_SET)
- os.write(self.file_, python2x3.null_byte)
- self.mmap = mmap_mod.mmap(self.file_, self.num_chars)
- def is_set(self, bitno):
- '''Return true iff bit number bitno is set'''
- byteno, bit_within_wordno = divmod(bitno, 8)
- mask = 1 << bit_within_wordno
- char = self.mmap[byteno]
- if isinstance(char, str):
- byte = ord(char)
- else:
- byte = int(char)
- return byte & mask
- def set(self, bitno):
- '''set bit number bitno to true'''
- byteno, bit_within_byteno = divmod(bitno, 8)
- mask = 1 << bit_within_byteno
- char = self.mmap[byteno]
- byte = ord(char)
- byte |= mask
- self.mmap[byteno] = chr(byte)
- def clear(self, bitno):
- '''clear bit number bitno - set it to false'''
- byteno, bit_within_byteno = divmod(bitno, 8)
- mask = 1 << bit_within_byteno
- char = self.mmap[byteno]
- byte = ord(char)
- byte &= Mmap_backend.effs - mask
- self.mmap[byteno] = chr(byte)
- def __iand__(self, other):
- assert self.num_bits == other.num_bits
- for byteno in my_range(self.num_chars):
- self.mmap[byteno] = chr(ord(self.mmap[byteno]) & ord(other.mmap[byteno]))
- return self
- def __ior__(self, other):
- assert self.num_bits == other.num_bits
- for byteno in my_range(self.num_chars):
- self.mmap[byteno] = chr(ord(self.mmap[byteno]) | ord(other.mmap[byteno]))
- return self
- def close(self):
- '''Close the file'''
- os.close(self.file_)
- class File_seek_backend:
- '''Backend storage for our "array of bits" using a file in which we seek'''
- effs = 2 ^ 8 - 1
- def __init__(self, num_bits, filename):
- self.num_bits = num_bits
- self.num_chars = (self.num_bits + 7) // 8
- flags = os.O_RDWR | os.O_CREAT
- if hasattr(os, 'O_BINARY'):
- flags |= getattr(os, 'O_BINARY')
- self.file_ = os.open(filename, flags)
- os.lseek(self.file_, self.num_chars + 1, os.SEEK_SET)
- os.write(self.file_, python2x3.null_byte)
- def is_set(self, bitno):
- '''Return true iff bit number bitno is set'''
- byteno, bit_within_wordno = divmod(bitno, 8)
- mask = 1 << bit_within_wordno
- os.lseek(self.file_, byteno, os.SEEK_SET)
- char = os.read(self.file_, 1)
- if isinstance(char, str):
- byte = ord(char)
- else:
- byte = char[0]
- return byte & mask
- def set(self, bitno):
- '''set bit number bitno to true'''
- byteno, bit_within_byteno = divmod(bitno, 8)
- mask = 1 << bit_within_byteno
- os.lseek(self.file_, byteno, os.SEEK_SET)
- char = os.read(self.file_, 1)
- if isinstance(char, str):
- byte = ord(char)
- was_char = True
- else:
- byte = char[0]
- was_char = False
- byte |= mask
- os.lseek(self.file_, byteno, os.SEEK_SET)
- if was_char:
- os.write(self.file_, chr(byte))
- else:
- char = python2x3.intlist_to_binary([ byte ])
- os.write(self.file_, char)
- def clear(self, bitno):
- '''clear bit number bitno - set it to false'''
- byteno, bit_within_byteno = divmod(bitno, 8)
- mask = 1 << bit_within_byteno
- os.lseek(self.file_, byteno, os.SEEK_SET)
- char = os.read(self.file_, 1)
- if isinstance(char, str):
- byte = ord(char)
- was_char = True
- else:
- byte = int(char)
- was_char = False
- byte &= File_seek_backend.effs - mask
- os.lseek(self.file_, byteno, os.SEEK_SET)
- if was_char:
- os.write(chr(byte))
- else:
- char = python2x3.intlist_to_binary([ byte ])
- os.write(char)
- # These are quite slow ways to do iand and ior, but they should work, and a faster version is going to take more time
- def __iand__(self, other):
- assert self.num_bits == other.num_bits
- for bitno in my_range(self.num_bits):
- if self.is_set(bitno) and other.is_set(bitno):
- self.set(bitno)
- else:
- self.clear(bitno)
- return self
- def __ior__(self, other):
- assert self.num_bits == other.num_bits
- for bitno in my_range(self.num_bits):
- if self.is_set(bitno) or other.is_set(bitno):
- self.set(bitno)
- else:
- self.clear(bitno)
- return self
- def close(self):
- '''Close the file'''
- os.close(self.file_)
- class Array_then_file_seek_backend:
- # pylint: disable=R0902
- # R0902: We kinda need a bunch of instance attributes
- '''
- Backend storage for our "array of bits" using a python array of integers up to some maximum number of bytes, then spilling over to a file.
- This is -not- a cache; we instead save the leftmost bits in RAM, and the rightmost bits (if necessary) in a file.
- On open, we read from the file to RAM. On close, we write from RAM to the file.
- '''
- effs = 2 ^ 8 - 1
- def __init__(self, num_bits, filename, max_bytes_in_memory):
- self.num_bits = num_bits
- num_chars = (self.num_bits + 7) // 8
- self.filename = filename
- self.max_bytes_in_memory = max_bytes_in_memory
- self.bits_in_memory = min(num_bits, self.max_bytes_in_memory * 8)
- self.bits_in_file = max(self.num_bits - self.bits_in_memory, 0)
- self.bytes_in_memory = (self.bits_in_memory + 7) // 8
- self.bytes_in_file = (self.bits_in_file + 7) // 8
- self.array_ = array.array('B', [0]) * self.bytes_in_memory
- flags = os.O_RDWR | os.O_CREAT
- if hasattr(os, 'O_BINARY'):
- flags |= getattr(os, 'O_BINARY')
- self.file_ = os.open(filename, flags)
- os.lseek(self.file_, num_chars + 1, os.SEEK_SET)
- os.write(self.file_, python2x3.null_byte)
- os.lseek(self.file_, 0, os.SEEK_SET)
- offset = 0
- intended_block_len = 2 ** 17
- while True:
- if offset + intended_block_len < self.bytes_in_memory:
- block = os.read(self.file_, intended_block_len)
- elif offset < self.bytes_in_memory:
- block = os.read(self.file_, self.bytes_in_memory - offset)
- else:
- break
- for index_in_block, character in enumerate(block):
- self.array_[offset + index_in_block] = ord(character)
- offset += intended_block_len
- def is_set(self, bitno):
- '''Return true iff bit number bitno is set'''
- byteno, bit_within_byteno = divmod(bitno, 8)
- mask = 1 << bit_within_byteno
- if byteno < self.bytes_in_memory:
- return self.array_[byteno] & mask
- else:
- os.lseek(self.file_, byteno, os.SEEK_SET)
- char = os.read(self.file_, 1)
- if isinstance(char, str):
- byte = ord(char)
- else:
- byte = int(char)
- return byte & mask
- def set(self, bitno):
- '''set bit number bitno to true'''
- byteno, bit_within_byteno = divmod(bitno, 8)
- mask = 1 << bit_within_byteno
- if byteno < self.bytes_in_memory:
- self.array_[byteno] |= mask
- else:
- os.lseek(self.file_, byteno, os.SEEK_SET)
- char = os.read(self.file_, 1)
- if isinstance(char, str):
- byte = ord(char)
- was_char = True
- else:
- byte = char
- was_char = False
- byte |= mask
- os.lseek(self.file_, byteno, os.SEEK_SET)
- if was_char:
- os.write(self.file_, chr(byte))
- else:
- os.write(self.file_, byte)
- def clear(self, bitno):
- '''clear bit number bitno - set it to false'''
- byteno, bit_within_byteno = divmod(bitno, 8)
- mask = Array_backend.effs - (1 << bit_within_byteno)
- if byteno < self.bytes_in_memory:
- self.array_[byteno] &= mask
- else:
- os.lseek(self.file_, byteno, os.SEEK_SET)
- char = os.read(self.file_, 1)
- if isinstance(char, str):
- byte = ord(char)
- was_char = True
- else:
- byte = int(char)
- was_char = False
- byte &= File_seek_backend.effs - mask
- os.lseek(self.file_, byteno, os.SEEK_SET)
- if was_char:
- os.write(chr(byte))
- else:
- os.write(byte)
- # These are quite slow ways to do iand and ior, but they should work, and a faster version is going to take more time
- def __iand__(self, other):
- assert self.num_bits == other.num_bits
- for bitno in my_range(self.num_bits):
- if self.is_set(bitno) and other.is_set(bitno):
- self.set(bitno)
- else:
- self.clear(bitno)
- return self
- def __ior__(self, other):
- assert self.num_bits == other.num_bits
- for bitno in my_range(self.num_bits):
- if self.is_set(bitno) or other.is_set(bitno):
- self.set(bitno)
- else:
- self.clear(bitno)
- return self
- def close(self):
- '''Write the in-memory portion to disk, leave the already-on-disk portion unchanged'''
- os.lseek(self.file_, 0, os.SEEK_SET)
- for index in my_range(self.bytes_in_memory):
- self.file_.write(self.array_[index])
- os.close(self.file_)
- class Array_backend:
- '''Backend storage for our "array of bits" using a python array of integers'''
- effs = 2 ^ 32 - 1
- def __init__(self, num_bits):
- self.num_bits = num_bits
- self.num_words = (self.num_bits + 31) // 32
- self.array_ = array.array('L', [0]) * self.num_words
- def is_set(self, bitno):
- '''Return true iff bit number bitno is set'''
- wordno, bit_within_wordno = divmod(bitno, 32)
- mask = 1 << bit_within_wordno
- return self.array_[wordno] & mask
- def set(self, bitno):
- '''set bit number bitno to true'''
- wordno, bit_within_wordno = divmod(bitno, 32)
- mask = 1 << bit_within_wordno
- self.array_[wordno] |= mask
- def clear(self, bitno):
- '''clear bit number bitno - set it to false'''
- wordno, bit_within_wordno = divmod(bitno, 32)
- mask = Array_backend.effs - (1 << bit_within_wordno)
- self.array_[wordno] &= mask
- # It'd be nice to do __iand__ and __ior__ in a base class, but that'd be Much slower
- def __iand__(self, other):
- assert self.num_bits == other.num_bits
- for wordno in my_range(self.num_words):
- self.array_[wordno] &= other.array_[wordno]
- return self
- def __ior__(self, other):
- assert self.num_bits == other.num_bits
- for wordno in my_range(self.num_words):
- self.array_[wordno] |= other.array_[wordno]
- return self
- def close(self):
- '''Noop for compatibility with the file+seek backend'''
- pass
- def get_bitno_seed_rnd(bloom_filter, key):
- '''Apply num_probes_k hash functions to key. Generate the array index and bitmask corresponding to each result'''
- # We're using key as a seed to a pseudorandom number generator
- hasher = random.Random(key).randrange
- for dummy in range(bloom_filter.num_probes_k):
- bitno = hasher(bloom_filter.num_bits_m)
- yield bitno % bloom_filter.num_bits_m
- MERSENNES1 = [ 2 ** x - 1 for x in [ 17, 31, 127 ] ]
- MERSENNES2 = [ 2 ** x - 1 for x in [ 19, 67, 257 ] ]
- def simple_hash(int_list, prime1, prime2, prime3):
- '''Compute a hash value from a list of integers and 3 primes'''
- result = 0
- for integer in int_list:
- result += ((result + integer + prime1) * prime2) % prime3
- return result
- def hash1(int_list):
- '''Basic hash function #1'''
- return simple_hash(int_list, MERSENNES1[0], MERSENNES1[1], MERSENNES1[2])
- def hash2(int_list):
- '''Basic hash function #2'''
- return simple_hash(int_list, MERSENNES2[0], MERSENNES2[1], MERSENNES2[2])
- def get_bitno_lin_comb(bloom_filter, key):
- '''Apply num_probes_k hash functions to key. Generate the array index and bitmask corresponding to each result'''
- # This one assumes key is either bytes or str (or other list of integers)
- # I'd love to check for long too, but that doesn't exist in 3.2, and 2.5 doesn't have the numbers.Integral base type
- if hasattr(key, '__divmod__'):
- int_list = []
- temp = key
- while temp:
- quotient, remainder = divmod(temp, 256)
- int_list.append(remainder)
- temp = quotient
- elif hasattr(key[0], '__divmod__'):
- int_list = key
- elif isinstance(key[0], str):
- int_list = [ ord(char) for char in key ]
- else:
- raise TypeError('Sorry, I do not know how to hash this type')
- hash_value1 = hash1(int_list)
- hash_value2 = hash2(int_list)
- # We're using linear combinations of hash_value1 and hash_value2 to obtain num_probes_k hash functions
- for probeno in range(1, bloom_filter.num_probes_k + 1):
- bit_index = hash_value1 + probeno * hash_value2
- yield bit_index % bloom_filter.num_bits_m
- def try_unlink(filename):
- '''unlink a file. Don't complain if it's not there'''
- try:
- os.unlink(filename)
- except OSError:
- pass
- return
- class Bloom_filter:
- '''Probabilistic set membership testing for large sets'''
- #def __init__(self, ideal_num_elements_n, error_rate_p, probe_offsetter=get_index_bitmask_seed_rnd):
- def __init__(self, ideal_num_elements_n, error_rate_p, probe_bitnoer=get_bitno_lin_comb, filename=None, start_fresh=False):
- # pylint: disable=R0913
- # R0913: We want a few arguments
- if ideal_num_elements_n <= 0:
- raise ValueError('ideal_num_elements_n must be > 0')
- if not (0 < error_rate_p < 1):
- raise ValueError('error_rate_p must be between 0 and 1 inclusive')
- self.error_rate_p = error_rate_p
- # With fewer elements, we should do very well. With more elements, our error rate "guarantee"
- # drops rapidly.
- self.ideal_num_elements_n = ideal_num_elements_n
- numerator = -1 * self.ideal_num_elements_n * math.log(self.error_rate_p)
- denominator = math.log(2) ** 2
- #self.num_bits_m = - int((self.ideal_num_elements_n * math.log(self.error_rate_p)) / (math.log(2) ** 2))
- real_num_bits_m = numerator / denominator
- self.num_bits_m = int(math.ceil(real_num_bits_m))
- if filename is None:
- self.backend = Array_backend(self.num_bits_m)
- elif isinstance(filename, tuple) and isinstance(filename[1], int):
- if start_fresh:
- try_unlink(filename[0])
- if filename[1] == -1:
- self.backend = Mmap_backend(self.num_bits_m, filename[0])
- else:
- self.backend = Array_then_file_seek_backend(self.num_bits_m, filename[0], filename[1])
- else:
- if start_fresh:
- try_unlink(filename)
- self.backend = File_seek_backend(self.num_bits_m, filename)
- #array.array('L', [0]) * ((self.num_bits_m + 31) // 32)
- # AKA num_offsetters
- # Verified against http://en.wikipedia.org/wiki/Bloom_filter#Probability_of_false_positives
- real_num_probes_k = (self.num_bits_m / self.ideal_num_elements_n) * math.log(2)
- self.num_probes_k = int(math.ceil(real_num_probes_k))
- # This comes close, but often isn't the same value
- # alternative_real_num_probes_k = -math.log(self.error_rate_p) / math.log(2)
- #
- # if abs(real_num_probes_k - alternative_real_num_probes_k) > 1e-6:
- # sys.stderr.write('real_num_probes_k: %f, alternative_real_num_probes_k: %f\n' %
- # (real_num_probes_k, alternative_real_num_probes_k)
- # )
- # sys.exit(1)
- self.probe_bitnoer = probe_bitnoer
- def __repr__(self):
- return 'Bloom_filter(ideal_num_elements_n=%d, error_rate_p=%f, num_bits_m=%d)' % (
- self.ideal_num_elements_n,
- self.error_rate_p,
- self.num_bits_m,
- )
- def add(self, key):
- '''Add an element to the filter'''
- for bitno in self.probe_bitnoer(self, key):
- self.backend.set(bitno)
- def __iadd__(self, key):
- self.add(key)
- return self
- def _match_template(self, bloom_filter):
- '''Compare a sort of signature for two bloom filters. Used in preparation for binary operations'''
- return (self.num_bits_m == bloom_filter.num_bits_m \
- and self.num_probes_k == bloom_filter.num_probes_k \
- and self.probe_bitnoer == bloom_filter.probe_bitnoer)
- def union(self, bloom_filter):
- '''Compute the set union of two bloom filters'''
- self.backend |= bloom_filter.backend
- def __ior__(self, bloom_filter):
- self.union(bloom_filter)
- return self
- def intersection(self, bloom_filter):
- '''Compute the set intersection of two bloom filters'''
- self.backend &= bloom_filter.backend
- def __iand__(self, bloom_filter):
- self.intersection(bloom_filter)
- return self
- def __contains__(self, key):
- for bitno in self.probe_bitnoer(self, key):
- #wordno, bit_within_word = divmod(bitno, 32)
- #mask = 1 << bit_within_word
- #if not (self.array_[wordno] & mask):
- if not self.backend.is_set(bitno):
- return False
- return True
- #return all(self.array_[i] & mask for i, mask in self.probe_bitnoer(self, key))
|