marx
/
bloom-filter


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164
							
'''Bloom Filter: Probabilistic set membership testing for large sets'''

# Shamelessly borrowed (under MIT license) from http://code.activestate.com/recipes/577686-bloom-filter/
# About Bloom Filters: http://en.wikipedia.org/wiki/Bloom_filter

# Tweaked by Daniel Richard Stromberg, mostly to:
# 1) Give it a little nicer __init__ parameters.
# 2) Improve the hash functions to get a much lower rate of false positives
# 3) Make it pass pylint

#mport sys
import math
import array
import random
#mport hashlib

# In the literature:
# k is the number of probes - we call this num_probes_k
# m is the number of bits in the filter - we call this num_bits_m
# n is the ideal number of elements to eventually be stored in the filter - we call this ideal_num_elements_n
# p is the desired error rate when full - we call this error_rate_p

def get_index_bitmask_seed_rnd(bloom_filter, key):
	'''Apply num_probes_k hash functions to key.  Generate the array index and bitmask corresponding to each result'''

	# We're using key as a seed to a pseudorandom number generator
	hasher = random.Random(key).randrange
	for _ in range(bloom_filter.num_probes_k):
		array_index = hasher(bloom_filter.num_words)
		bit_within_word_index = hasher(32)
		yield array_index, 1 << bit_within_word_index


MERSENNES1 = [ 2**x - 1 for x in [ 17, 31, 127 ] ]
MERSENNES2 = [ 2**x - 1 for x in [ 19, 67, 257 ] ]

def simple_hash(int_list, prime1, prime2, prime3):
	'''Compute a hash value from a list of integers and 3 primes'''
	result = 0
	for integer in int_list:
		result += ((result + integer + prime1) * prime2) % prime3
	return result

def hash1(int_list):
	'''Basic hash function #1'''
	return simple_hash(int_list, MERSENNES1[0], MERSENNES1[1], MERSENNES1[2])

def hash2(int_list):
	'''Basic hash function #2'''
	return simple_hash(int_list, MERSENNES2[0], MERSENNES2[1], MERSENNES2[2])

def get_index_bitmask_lin_comb(bloom_filter, key):
	'''Apply num_probes_k hash functions to key.  Generate the array index and bitmask corresponding to each result'''

	# This one assumes key is either bytes or str (or other list of integers)

	if isinstance(key[0], int):
		int_list = key
	elif isinstance(key[0], str):
		int_list = [ ord(char) for char in key ]
	else:
		raise TypeError

	hash_value1 = hash1(int_list)
	hash_value2 = hash2(int_list)

	# We're using linear combinations of hash_value1 and hash_value2 to obtain num_probes_k hash functions
	for probeno in range(1, bloom_filter.num_probes_k + 1):
		bit_index = hash_value1 + probeno * hash_value2
		bit_within_word_index = bit_index % 32
		array_index = (bit_index // 32) % bloom_filter.num_words
		yield array_index, 1 << bit_within_word_index


class Bloom_filter:
	'''Probabilistic set membership testing for large sets'''

	#def __init__(self, ideal_num_elements_n, error_rate_p, probe_offsetter=get_index_bitmask_seed_rnd):
	def __init__(self, ideal_num_elements_n, error_rate_p, probe_offsetter=get_index_bitmask_lin_comb):
		if ideal_num_elements_n <= 0:
			raise ValueError('ideal_num_elements_n must be > 0')
		if not (0 < error_rate_p < 1):
			raise ValueError('error_rate_p must be between 0 and 1 inclusive')

		self.error_rate_p = error_rate_p
		# With fewer elements, we should do very well.  With more elements, our error rate "guarantee"
		# drops rapidly.
		self.ideal_num_elements_n = ideal_num_elements_n

		numerator = -1 * self.ideal_num_elements_n * math.log(self.error_rate_p)
		denominator = math.log(2) ** 2
		#self.num_bits_m = - int((self.ideal_num_elements_n * math.log(self.error_rate_p)) / (math.log(2) ** 2))
		real_num_bits_m = numerator / denominator
		self.num_bits_m = int(math.ceil(real_num_bits_m))

		self.num_words = int((self.num_bits_m + 31) / 32)
		self.array_ = array.array('L', [0]) * self.num_words

		# AKA num_offsetters
		# Verified against http://en.wikipedia.org/wiki/Bloom_filter#Probability_of_false_positives
		real_num_probes_k = (self.num_bits_m / self.ideal_num_elements_n) * math.log(2)
		self.num_probes_k = int(math.ceil(real_num_probes_k))

# This comes close, but often isn't the same value
#		alternative_real_num_probes_k = -math.log(self.error_rate_p) / math.log(2)
#
#		if abs(real_num_probes_k - alternative_real_num_probes_k) > 1e-6:
#			sys.stderr.write('real_num_probes_k: %f, alternative_real_num_probes_k: %f\n' % 
#				(real_num_probes_k, alternative_real_num_probes_k)
#				)
#			sys.exit(1)

		self.probe_offsetter = probe_offsetter

	def __repr__(self):
		return 'Bloom_filter(ideal_num_elements_n=%d, error_rate_p=%f, num_bits_m=%d)' % (
			self.ideal_num_elements_n,
			self.error_rate_p,
			self.num_bits_m,
			)

	def add(self, key):
		'''Add an element to the filter'''
		for index, mask in self.probe_offsetter(self, key):
			self.array_[index] |= mask

	def __iadd__(self, key):
		self.add(key)
		return self

	def _match_template(self, bloom_filter):
		'''Compare a sort of signature for two bloom filters.  Used in preparation for binary operations'''
		return (self.num_bits_m == bloom_filter.num_bits_m \
			and self.num_probes_k == bloom_filter.num_probes_k \
			and self.probe_offsetter == bloom_filter.probe_offsetter)

	def union(self, bloom_filter):
		'''Compute the set union of two bloom filters'''
		if self._match_template(bloom_filter):
			self.array_ = [a | b for a, b in zip(self.array_, bloom_filter.array_)]
		else:
			# Union b/w two unrelated bloom filter raises this
			raise ValueError("Mismatched bloom filters")

	def __ior__(self, bloom_filter):
		self.union(bloom_filter)
		return self

	def intersection(self, bloom_filter):
		'''Compute the set intersection of two bloom filters'''
		if self._match_template(bloom_filter):
			self.array_ = [a & b for a, b in zip(self.array_, bloom_filter.array_)]
		else:
			# Intersection b/w two unrelated bloom filter raises this
			raise ValueError("Mismatched bloom filters")

	def __iand__(self, bloom_filter):
		self.intersection(bloom_filter)
		return self

	def __contains__(self, key):
		return all(self.array_[i] & mask for i, mask in self.probe_offsetter(self, key))