浏览代码

Renamed variables. Improved the documentation by mapping the literature's single-character variable names (k, m, n, p) to ours. Simplified some expressions using temporary variables. Avoided recomputing the length of array_ on every probe.

dstromberg 13 年之前
父节点
当前提交
51b0df0662
共有 1 个文件被更改,包括 51 次插入36 次删除
  1. 51 36
      bloom_filter_mod.py

+ 51 - 36
bloom_filter_mod.py

@@ -11,12 +11,20 @@ import math
 import array
 import random
 
+# In the literature:
+# k is the number of probes - we call this num_probes_k
+# m is the number of bits in the filter - we call this num_bits_m
+# n is the ideal number of elements to eventually be stored in the filter - we call this ideal_num_elements_n
+# p is the desired error rate when full - we call this error_rate_p
 
-def get_probes(bfilter, key):
-	'''Generate a bunch of fast hash functions - the output of this function is knoown tersely in the literature as "K"'''
+def get_probe_index_and_bitmask(bloom_filter, key):
+	'''Apply num_probes_k hash functions to key.  Generate the array index and bitmask corresponding to each result'''
+
+	# We're using key as a seed to a pseudorandom number generator
 	hasher = random.Random(key).randrange
-	for _ in range(bfilter.num_probes):
-		array_index = hasher(len(bfilter.array_))
+	for _ in range(bloom_filter.num_probes_k):
+		# num_words is computed once in __init__, so we don't call len() on the array for every probe
+		array_index = hasher(bloom_filter.num_words)
 		bit_index = hasher(32)
 		yield array_index, 1 << bit_index
 
@@ -24,72 +32,79 @@ def get_probes(bfilter, key):
 class Bloom_filter:
 	'''Probabilistic set membership testing for large sets'''
 
-	def __init__(self, ideal_num_elements, error_rate, probe_func=get_probes):
-		if ideal_num_elements <= 0:
-			raise ValueError('ideal_num_elements must be > 0')
-		if not (0 < error_rate < 1):
-			raise ValueError('error_rate must be between 0 and 1 inclusive')
+	def __init__(self, ideal_num_elements_n, error_rate_p, probe_offsetter=get_probe_index_and_bitmask):
+		if ideal_num_elements_n <= 0:
+			raise ValueError('ideal_num_elements_n must be > 0')
+		if not (0 < error_rate_p < 1):
+			raise ValueError('error_rate_p must be between 0 and 1 inclusive')
 
-		self.error_rate = error_rate
+		self.error_rate_p = error_rate_p
 		# With fewer elements, we should do very well.  With more elements, our error rate "guarantee"
 		# drops rapidly.
-		self.ideal_num_elements = ideal_num_elements
+		self.ideal_num_elements_n = ideal_num_elements_n
 
-		self.num_bits = - int((self.ideal_num_elements * math.log(self.error_rate)) / (math.log(2) ** 2))
+		numerator = -1 * self.ideal_num_elements_n * math.log(self.error_rate_p)
+		denominator = math.log(2) ** 2
+		#self.num_bits_m = - int((self.ideal_num_elements_n * math.log(self.error_rate_p)) / (math.log(2) ** 2))
+		real_num_bits_m = numerator / denominator
+		self.num_bits_m = int(math.ceil(real_num_bits_m))
 
-		self.num_words = int((self.num_bits + 31) / 32)
+		self.num_words = int((self.num_bits_m + 31) / 32)
 		self.array_ = array.array('L', [0]) * self.num_words
 
-		self.num_probes = int((self.num_bits / self.ideal_num_elements) * math.log(2))
+		# AKA num_offsetters
+		# Verified against http://en.wikipedia.org/wiki/Bloom_filter#Probability_of_false_positives
+		real_num_probes_k = (self.num_bits_m / self.ideal_num_elements_n) * math.log(2)
+		self.num_probes_k = int(math.ceil(real_num_probes_k))
 
-		self.probe_func = probe_func
+		self.probe_offsetter = probe_offsetter
 
 	def __repr__(self):
-		return 'Bloom_filter(ideal_num_elements=%d, error_rate=%f, num_bits=%d)' % (
-			self.ideal_num_elements,
-			self.error_rate,
-			self.num_bits,
+		return 'Bloom_filter(ideal_num_elements_n=%d, error_rate_p=%f, num_bits_m=%d)' % (
+			self.ideal_num_elements_n,
+			self.error_rate_p,
+			self.num_bits_m,
 			)
 
 	def add(self, key):
 		'''Add an element to the filter'''
-		for i, mask in self.probe_func(self, key):
-			self.array_[i] |= mask
+		for index, mask in self.probe_offsetter(self, key):
+			self.array_[index] |= mask
 
 	def __iadd__(self, key):
 		self.add(key)
 		return self
 
-	def _match_template(self, bfilter):
+	def _match_template(self, bloom_filter):
 		'''Compare a sort of signature for two bloom filters.  Used in preparation for binary operations'''
-		return (self.num_bits == bfilter.num_bits \
-			and self.num_probes == bfilter.num_probes \
-			and self.probe_func == bfilter.probe_func)
+		return (self.num_bits_m == bloom_filter.num_bits_m \
+			and self.num_probes_k == bloom_filter.num_probes_k \
+			and self.probe_offsetter == bloom_filter.probe_offsetter)
 
-	def union(self, bfilter):
+	def union(self, bloom_filter):
 		'''Compute the set union of two bloom filters'''
-		if self._match_template(bfilter):
-			self.array_ = [a | b for a, b in zip(self.array_, bfilter.array_)]
+		if self._match_template(bloom_filter):
+			self.array_ = [a | b for a, b in zip(self.array_, bloom_filter.array_)]
 		else:
 			# Union b/w two unrelated bloom filter raises this
 			raise ValueError("Mismatched bloom filters")
 
-	def __ior__(self, bfilter):
-		self.union(bfilter)
+	def __ior__(self, bloom_filter):
+		self.union(bloom_filter)
 		return self
 
-	def intersection(self, bfilter):
+	def intersection(self, bloom_filter):
 		'''Compute the set intersection of two bloom filters'''
-		if self._match_template(bfilter):
-			self.array_ = [a & b for a, b in zip(self.array_, bfilter.array_)]
+		if self._match_template(bloom_filter):
+			self.array_ = [a & b for a, b in zip(self.array_, bloom_filter.array_)]
 		else:
 			# Intersection b/w two unrelated bloom filter raises this
 			raise ValueError("Mismatched bloom filters")
 
-	def __iand__(self, bfilter):
-		self.intersection(bfilter)
+	def __iand__(self, bloom_filter):
+		self.intersection(bloom_filter)
 		return self
 
 	def __contains__(self, key):
-		return all(self.array_[i] & mask for i, mask in self.probe_func(self, key))
+		return all(self.array_[i] & mask for i, mask in self.probe_offsetter(self, key))