Quellcode durchsuchen

Temporarily moved 32-bit concerns into callers, in preparation for abstracting and adding file support

dstromberg vor 13 Jahren
Ursprung
Commit
d46261638d
1 geänderte Dateien mit 21 neuen und 16 gelöschten Zeilen
  1. 21 16
      bloom_filter_mod.py

+ 21 - 16
bloom_filter_mod.py

@@ -23,15 +23,14 @@ import random
 # p is the desired error rate when full - we call this error_rate_p
 
 
-def get_index_bitmask_seed_rnd(bloom_filter, key):
+def get_bitno_seed_rnd(bloom_filter, key):
 	'''Apply num_probes_k hash functions to key.  Generate the array index and bitmask corresponding to each result'''
 
 	# We're using key as a seed to a pseudorandom number generator
 	hasher = random.Random(key).randrange
-	for _ in range(bloom_filter.num_probes_k):
-		array_index = hasher(bloom_filter.num_words)
-		bit_within_word_index = hasher(32)
-		yield array_index, 1 << bit_within_word_index
+	for dummy in range(bloom_filter.num_probes_k):
+		bitno = hasher(bloom_filter.num_bits_m)
+		yield bitno % bloom_filter.num_bits_m
 
 
 MERSENNES1 = [ 2 ** x - 1 for x in [ 17, 31, 127 ] ]
@@ -56,7 +55,7 @@ def hash2(int_list):
 	return simple_hash(int_list, MERSENNES2[0], MERSENNES2[1], MERSENNES2[2])
 
 
-def get_index_bitmask_lin_comb(bloom_filter, key):
+def get_bitno_lin_comb(bloom_filter, key):
 	'''Apply num_probes_k hash functions to key.  Generate the array index and bitmask corresponding to each result'''
 
 	# This one assumes key is either bytes or str (or other list of integers)
@@ -81,16 +80,14 @@ def get_index_bitmask_lin_comb(bloom_filter, key):
 	# We're using linear combinations of hash_value1 and hash_value2 to obtain num_probes_k hash functions
 	for probeno in range(1, bloom_filter.num_probes_k + 1):
 		bit_index = hash_value1 + probeno * hash_value2
-		bit_within_word_index = bit_index % 32
-		array_index = (bit_index // 32) % bloom_filter.num_words
-		yield array_index, 1 << bit_within_word_index
+		yield bit_index % bloom_filter.num_bits_m
 
 
 class Bloom_filter:
 	'''Probabilistic set membership testing for large sets'''
 
 	#def __init__(self, ideal_num_elements_n, error_rate_p, probe_offsetter=get_index_bitmask_seed_rnd):
-	def __init__(self, ideal_num_elements_n, error_rate_p, probe_offsetter=get_index_bitmask_lin_comb):
+	def __init__(self, ideal_num_elements_n, error_rate_p, probe_bitnoer=get_bitno_lin_comb):
 		if ideal_num_elements_n <= 0:
 			raise ValueError('ideal_num_elements_n must be > 0')
 		if not (0 < error_rate_p < 1):
@@ -107,8 +104,7 @@ class Bloom_filter:
 		real_num_bits_m = numerator / denominator
 		self.num_bits_m = int(math.ceil(real_num_bits_m))
 
-		self.num_words = int((self.num_bits_m + 31) / 32)
-		self.array_ = array.array('L', [0]) * self.num_words
+		self.array_ = array.array('L', [0]) * ((self.num_bits_m + 31) // 32)
 
 		# AKA num_offsetters
 		# Verified against http://en.wikipedia.org/wiki/Bloom_filter#Probability_of_false_positives
@@ -124,7 +120,7 @@ class Bloom_filter:
 #				)
 #			sys.exit(1)
 
-		self.probe_offsetter = probe_offsetter
+		self.probe_bitnoer = probe_bitnoer
 
 	def __repr__(self):
 		return 'Bloom_filter(ideal_num_elements_n=%d, error_rate_p=%f, num_bits_m=%d)' % (
@@ -135,7 +131,9 @@ class Bloom_filter:
 
 	def add(self, key):
 		'''Add an element to the filter'''
-		for index, mask in self.probe_offsetter(self, key):
+		for bitno in self.probe_bitnoer(self, key):
+			index, bit_within_word = divmod(bitno, 32)
+			mask = 1 << bit_within_word
 			self.array_[index] |= mask
 
 	def __iadd__(self, key):
@@ -146,7 +144,7 @@ class Bloom_filter:
 		'''Compare a sort of signature for two bloom filters.  Used in preparation for binary operations'''
 		return (self.num_bits_m == bloom_filter.num_bits_m \
 			and self.num_probes_k == bloom_filter.num_probes_k \
-			and self.probe_offsetter == bloom_filter.probe_offsetter)
+			and self.probe_bitnoer == bloom_filter.probe_bitnoer)
 
 	def union(self, bloom_filter):
 		'''Compute the set union of two bloom filters'''
@@ -173,5 +171,12 @@ class Bloom_filter:
 		return self
 
 	def __contains__(self, key):
-		return all(self.array_[i] & mask for i, mask in self.probe_offsetter(self, key))
+		for bitno in self.probe_bitnoer(self, key):
+			wordno, bit_within_word = divmod(bitno, 32)
+			mask = 1 << bit_within_word
+			if not (self.array_[wordno] & mask):
+				return False
+		return True
+				
+		#return all(self.array_[i] & mask for i, mask in self.probe_bitnoer(self, key))