Quellcode durchsuchen

Now have File_seek_backend and Array_backend

dstromberg vor 13 Jahren
Ursprung
Commit
3f2ba47d7d
1 geänderte Dateien mit 193 neuen und 18 gelöschten Zeilen
  1. 193 18
      bloom_filter_mod.py

+ 193 - 18
bloom_filter_mod.py

@@ -9,12 +9,15 @@
 # 2) Improve the hash functions to get a much lower rate of false positives
 # 3) Make it pass pylint
 
+import os
 #mport sys
 import math
 import array
 import random
+#mport bufsock
 #mport hashlib
 #mport numbers
+import python2x3
 
 # In the literature:
 # k is the number of probes - we call this num_probes_k
@@ -23,6 +26,182 @@ import random
 # p is the desired error rate when full - we call this error_rate_p
 
 
+def my_range(num_values):
+	'''Lazily yield the integers 0, 1, 2, ... for as long as they are less than num_values'''
+
+	current = 0
+	while True:
+		# Stop as soon as the counter is no longer below the limit
+		if not current < num_values:
+			return
+		yield current
+		current += 1
+
+# In the abstract, this is what we want &= and |= to do, but especially for disk-based filters, this is extremely slow
+#class Backend_set_operations:
+#	'''Provide &= and |= for backends'''
+#	# pylint: disable=W0232
+#	# W0232: We don't need an __init__ method; we're never instantiated directly
+#	def __iand__(self, other):
+#		assert self.num_bits == other.num_bits
+#
+#		for bitno in my_range(num_bits):
+#			if self.is_set(bitno) and other.is_set(bitno):
+#				self[bitno].set()
+#			else:
+#				self[bitno].clear()
+#
+#	def __ior__(self, other):
+#		assert self.num_bits == other.num_bits
+#
+#		for bitno in xrange(num_bits):
+#			if self[bitno] or other[bitno]:
+#				self[bitno].set()
+#			else:
+#				self[bitno].clear()
+
+class File_seek_backend:
+	'''
+	Backend storage for our "array of bits" using a file in which we seek.
+
+	Bit number bitno lives in byte bitno // 8 of the file, at bit position bitno % 8 within that byte.
+	Every query is an lseek plus a one-byte read on a raw os-level file descriptor; every update
+	additionally does a second lseek and a one-byte write.
+	'''
+
+	# All-ones mask for one byte.  ** is exponentiation; ^ is XOR, and 2^8 - 1 evaluates to 5, not 255.
+	effs = 2 ** 8 - 1
+
+	def __init__(self, num_bits, filename):
+		'''
+		Open filename read+write (creating it if needed) as storage for num_bits bits.
+
+		num_bits: the number of bits the backend must hold
+		filename: path of the file that stores the bits
+		'''
+		self.num_bits = num_bits
+		self.num_chars = (self.num_bits + 7) // 8
+		# O_BINARY only exists on platforms that distinguish text from binary files
+		flags = os.O_RDWR | os.O_CREAT | getattr(os, 'O_BINARY', 0)
+		self.file_ = os.open(filename, flags)
+		# Write one byte just beyond the last data byte so that every byte offset we will later
+		# read from exists.  The offset is past the data, so reopening an existing file does not
+		# disturb bits that are already stored in it.
+		os.lseek(self.file_, self.num_chars + 1, os.SEEK_SET)
+		os.write(self.file_, python2x3.null_byte)
+
+	def _read_byte(self, byteno):
+		'''Return the byte stored at offset byteno of the file, as an integer'''
+		os.lseek(self.file_, byteno, os.SEEK_SET)
+		# os.read returns a length-1 str on Python 2 and a length-1 bytes on Python 3; ord() accepts both
+		return ord(os.read(self.file_, 1))
+
+	def _write_byte(self, byteno, byte):
+		'''Store the integer byte (0..255) at offset byteno of the file'''
+		os.lseek(self.file_, byteno, os.SEEK_SET)
+		# bytes(bytearray([n])) is a length-1 str on Python 2 and a length-1 bytes on Python 3
+		os.write(self.file_, bytes(bytearray([byte])))
+
+	def is_set(self, bitno):
+		'''Return true iff bit number bitno is set (the result is an integer: nonzero if set, 0 if clear)'''
+		byteno, bit_within_byteno = divmod(bitno, 8)
+		mask = 1 << bit_within_byteno
+		return self._read_byte(byteno) & mask
+
+	def set(self, bitno):
+		'''set bit number bitno to true'''
+		byteno, bit_within_byteno = divmod(bitno, 8)
+		mask = 1 << bit_within_byteno
+		self._write_byte(byteno, self._read_byte(byteno) | mask)
+
+	def clear(self, bitno):
+		'''clear bit number bitno - set it to false'''
+		byteno, bit_within_byteno = divmod(bitno, 8)
+		mask = 1 << bit_within_byteno
+		# effs - mask has every bit of the byte set except the one being cleared
+		self._write_byte(byteno, self._read_byte(byteno) & (File_seek_backend.effs - mask))
+
+	# These are quite slow ways to do iand and ior, but they should work, and a faster version is going to take more time
+	def __iand__(self, other):
+		'''In-place intersection: keep only the bits that are set in both self and other.  Returns self'''
+		assert self.num_bits == other.num_bits
+
+		for bitno in my_range(self.num_bits):
+			# Only a bit that is set here but not in other changes; everything else already holds the right value
+			if self.is_set(bitno) and not other.is_set(bitno):
+				self.clear(bitno)
+
+		return self
+
+	def __ior__(self, other):
+		'''In-place union: set every bit that is set in self or in other.  Returns self'''
+		assert self.num_bits == other.num_bits
+
+		for bitno in my_range(self.num_bits):
+			# Only a bit that is clear here but set in other changes; everything else already holds the right value
+			if other.is_set(bitno) and not self.is_set(bitno):
+				self.set(bitno)
+
+		return self
+
+	def close(self):
+		'''Close the file'''
+		os.close(self.file_)
+
+
+class Array_backend:
+	'''
+	Backend storage for our "array of bits" using a python array of integers.
+
+	Bit number bitno lives in word bitno // 32 of self.array_, at bit position bitno % 32 within that word.
+	'''
+
+	# All-ones mask for a 32 bit word.  ** is exponentiation; ^ is XOR, and 2^32 - 1 evaluates to 29.
+	effs = 2 ** 32 - 1
+
+	def __init__(self, num_bits):
+		'''Allocate a zero-filled array with one word per 32 bits (rounded up) of num_bits'''
+		self.num_bits = num_bits
+		self.num_words = (self.num_bits + 31) // 32
+		self.array_ = array.array('L', [0]) * self.num_words
+
+	def is_set(self, bitno):
+		'''Return true iff bit number bitno is set (the result is an integer: nonzero if set, 0 if clear)'''
+		wordno, bit_within_wordno = divmod(bitno, 32)
+		mask = 1 << bit_within_wordno
+		return self.array_[wordno] & mask
+
+	def set(self, bitno):
+		'''set bit number bitno to true'''
+		wordno, bit_within_wordno = divmod(bitno, 32)
+		mask = 1 << bit_within_wordno
+		self.array_[wordno] |= mask
+
+	def clear(self, bitno):
+		'''clear bit number bitno - set it to false'''
+		wordno, bit_within_wordno = divmod(bitno, 32)
+		# effs minus the bit's own mask has every bit of the word set except the one being cleared
+		mask = Array_backend.effs - (1 << bit_within_wordno)
+		self.array_[wordno] &= mask
+
+	# It'd be nice to do __iand__ and __ior__ in a base class, but that'd be Much slower
+
+	def __iand__(self, other):
+		'''In-place intersection, one word at a time: keep only bits set in both.  Returns self'''
+		assert self.num_bits == other.num_bits
+
+		for wordno, other_word in enumerate(other.array_):
+			self.array_[wordno] &= other_word
+
+		return self
+
+	def __ior__(self, other):
+		'''In-place union, one word at a time: set every bit set in either.  Returns self'''
+		assert self.num_bits == other.num_bits
+
+		for wordno, other_word in enumerate(other.array_):
+			self.array_[wordno] |= other_word
+
+		return self
+
+	def close(self):
+		'''Noop for compatibility with the file+seek backend'''
+		pass
+
 def get_bitno_seed_rnd(bloom_filter, key):
 	'''Apply num_probes_k hash functions to key.  Generate the array index and bitmask corresponding to each result'''
 
@@ -87,7 +266,7 @@ class Bloom_filter:
 	'''Probabilistic set membership testing for large sets'''
 
 	#def __init__(self, ideal_num_elements_n, error_rate_p, probe_offsetter=get_index_bitmask_seed_rnd):
-	def __init__(self, ideal_num_elements_n, error_rate_p, probe_bitnoer=get_bitno_lin_comb):
+	def __init__(self, ideal_num_elements_n, error_rate_p, probe_bitnoer=get_bitno_lin_comb, filename=None):
 		if ideal_num_elements_n <= 0:
 			raise ValueError('ideal_num_elements_n must be > 0')
 		if not (0 < error_rate_p < 1):
@@ -104,7 +283,12 @@ class Bloom_filter:
 		real_num_bits_m = numerator / denominator
 		self.num_bits_m = int(math.ceil(real_num_bits_m))
 
-		self.array_ = array.array('L', [0]) * ((self.num_bits_m + 31) // 32)
+		if filename is None:
+			self.backend = Array_backend(self.num_bits_m)
+		else:
+			self.backend = File_seek_backend(self.num_bits_m, filename)
+
+		#array.array('L', [0]) * ((self.num_bits_m + 31) // 32)
 
 		# AKA num_offsetters
 		# Verified against http://en.wikipedia.org/wiki/Bloom_filter#Probability_of_false_positives
@@ -132,9 +316,7 @@ class Bloom_filter:
 	def add(self, key):
 		'''Add an element to the filter'''
 		for bitno in self.probe_bitnoer(self, key):
-			index, bit_within_word = divmod(bitno, 32)
-			mask = 1 << bit_within_word
-			self.array_[index] |= mask
+			self.backend.set(bitno)
 
 	def __iadd__(self, key):
 		self.add(key)
@@ -148,11 +330,7 @@ class Bloom_filter:
 
 	def union(self, bloom_filter):
 		'''Compute the set union of two bloom filters'''
-		if self._match_template(bloom_filter):
-			self.array_ = [a | b for a, b in zip(self.array_, bloom_filter.array_)]
-		else:
-			# Union b/w two unrelated bloom filter raises this
-			raise ValueError("Mismatched bloom filters")
+		self.backend |= bloom_filter.backend
 
 	def __ior__(self, bloom_filter):
 		self.union(bloom_filter)
@@ -160,11 +338,7 @@ class Bloom_filter:
 
 	def intersection(self, bloom_filter):
 		'''Compute the set intersection of two bloom filters'''
-		if self._match_template(bloom_filter):
-			self.array_ = [a & b for a, b in zip(self.array_, bloom_filter.array_)]
-		else:
-			# Intersection b/w two unrelated bloom filter raises this
-			raise ValueError("Mismatched bloom filters")
+		self.backend &= bloom_filter.backend
 
 	def __iand__(self, bloom_filter):
 		self.intersection(bloom_filter)
@@ -172,9 +346,10 @@ class Bloom_filter:
 
 	def __contains__(self, key):
 		for bitno in self.probe_bitnoer(self, key):
-			wordno, bit_within_word = divmod(bitno, 32)
-			mask = 1 << bit_within_word
-			if not (self.array_[wordno] & mask):
+			#wordno, bit_within_word = divmod(bitno, 32)
+			#mask = 1 << bit_within_word
+			#if not (self.array_[wordno] & mask):
+			if not self.backend.is_set(bitno):
 				return False
 		return True