14 år sedan · 5deb38eba4
--- a/bloom_filter_mod.py
+++ b/bloom_filter_mod.py
@@ -7,9 +7,11 @@
 
				 # Tweaked a bit by Daniel Richard Stromberg, mostly to make it pass pylint and give it a little nicer
			
 
				 # __init__ parameters.
			
 
				 
			
 
				+#import sys
			
 
				 import math
			
 
				 import array
			
 
				 import random
			
 
				+#import hashlib
			
 
				 
			
 
				 # In the literature:
			
 
				 # k is the number of probes - we call this num_probes_k
			
@@ -17,22 +19,63 @@ import random
 
				 # n is the ideal number of elements to eventually be stored in the filter - we call this ideal_num_elements_n
			
 
				 # p is the desired error rate when full - we call this error_rate_p
			
 
				 
			
 
				-def get_probe_index_and_bitmask(bloom_filter, key):
			
 
				+def get_index_bitmask_seed_rnd(bloom_filter, key):
			
 
				 	'''Apply num_probes_k hash functions to key.  Generate the array index and bitmask corresponding to each result'''
			
 
				 
			
 
				 	# We're using key as a seed to a pseudorandom number generator
			
 
				 	hasher = random.Random(key).randrange
			
 
				 	for _ in range(bloom_filter.num_probes_k):
			
 
				-		# We could precompute this length for speed.  But we don't
			
 
				 		array_index = hasher(bloom_filter.num_words)
			
 
				-		bit_index = hasher(32)
			
 
				-		yield array_index, 1 << bit_index
			
 
				+		bit_within_word_index = hasher(32)
			
 
				+		yield array_index, 1 << bit_within_word_index
			
 
				+
			
 
				+
			
 
				+MERSENNES1 = [ 2**x - 1 for x in 17, 31, 127 ]
			
 
				+MERSENNES2 = [ 2**x - 1 for x in 19, 67, 257 ]
			
 
				+
			
 
				+def simple_hash(int_list, prime1, prime2, prime3):
			
 
				+	'''Compute a hash value from a list of integers and 3 primes'''
			
 
				+	result = 0
			
 
				+	for integer in int_list:
			
 
				+		result += ((result + integer + prime1) * prime2) % prime3
			
 
				+	return result
			
 
				+
			
 
				+def hash1(int_list):
			
 
				+	'''Basic hash function #1'''
			
 
				+	return simple_hash(int_list, MERSENNES1[0], MERSENNES1[1], MERSENNES1[2])
			
 
				+
			
 
				+def hash2(int_list):
			
 
				+	'''Basic hash function #2'''
			
 
				+	return simple_hash(int_list, MERSENNES2[0], MERSENNES2[1], MERSENNES2[2])
			
 
				+
			
 
				+def get_index_bitmask_lin_comb(bloom_filter, key):
			
 
				+	'''Apply num_probes_k hash functions to key.  Generate the array index and bitmask corresponding to each result'''
			
 
				+
			
 
				+	# This one assumes key is either bytes or str (or other list of integers)
			
 
				+
			
 
				+	if isinstance(key[0], int):
			
 
				+		int_list = key
			
 
				+	elif isinstance(key[0], str):
			
 
				+		int_list = [ ord(char) for char in key ]
			
 
				+	else:
			
 
				+		raise TypeError
			
 
				+
			
 
				+	hash_value1 = hash1(int_list)
			
 
				+	hash_value2 = hash2(int_list)
			
 
				+
			
 
				+	# We're using linear combinations of hash_value1 and hash_value2 to obtain num_probes_k hash functions
			
 
				+	for probeno in range(1, bloom_filter.num_probes_k + 1):
			
 
				+		bit_index = hash_value1 + probeno * hash_value2
			
 
				+		bit_within_word_index = bit_index % 32
			
 
				+		array_index = (bit_index // 32) % bloom_filter.num_words
			
 
				+		yield array_index, 1 << bit_within_word_index
			
 
				 
			
 
				 
			
 
				 class Bloom_filter:
			
 
				 	'''Probabilistic set membership testing for large sets'''
			
 
				 
			
 
				-	def __init__(self, ideal_num_elements_n, error_rate_p, probe_offsetter=get_probe_index_and_bitmask):
			
 
				+	#def __init__(self, ideal_num_elements_n, error_rate_p, probe_offsetter=get_index_bitmask_seed_rnd):
			
 
				+	def __init__(self, ideal_num_elements_n, error_rate_p, probe_offsetter=get_index_bitmask_lin_comb):
			
 
				 		if ideal_num_elements_n <= 0:
			
 
				 			raise ValueError('ideal_num_elements_n must be > 0')
			
 
				 		if not (0 < error_rate_p < 1):
			
@@ -57,6 +100,15 @@ class Bloom_filter:
 
				 		real_num_probes_k = (self.num_bits_m / self.ideal_num_elements_n) * math.log(2)
			
 
				 		self.num_probes_k = int(math.ceil(real_num_probes_k))
			
 
				 
			
 
				+# This comes close, but often isn't the same value
			
 
				+#		alternative_real_num_probes_k = -math.log(self.error_rate_p) / math.log(2)
			
 
				+#
			
 
				+#		if abs(real_num_probes_k - alternative_real_num_probes_k) > 1e-6:
			
 
				+#			sys.stderr.write('real_num_probes_k: %f, alternative_real_num_probes_k: %f\n' % 
			
 
				+#				(real_num_probes_k, alternative_real_num_probes_k)
			
 
				+#				)
			
 
				+#			sys.exit(1)
			
 
				+
			
 
				 		self.probe_offsetter = probe_offsetter
			
 
				 
			
 
				 	def __repr__(self):