Browse Source

Initial checkin

dstromberg 14 years ago
commit
c763fde8d6
4 changed files with 170 additions and 0 deletions
  1. 12 0
      Makefile
  2. 65 0
      bloom_filter_mod.py
  3. 47 0
      test-bloom-filter
  4. 46 0
      this-pylint

+ 12 - 0
Makefile

@@ -0,0 +1,12 @@
+
+# Neither target produces a file named after itself, so declare both phony;
+# otherwise a stray file called "go" or "clean" would stop the target from running.
+.PHONY: go clean
+
+# Lint the sources, then run the test script under each installed interpreter.
+go:
+	./this-pylint bloom_filter_mod.py test-bloom-filter
+	/usr/local/cpython-2.7/bin/python ./test-bloom-filter
+	/usr/local/cpython-3.2/bin/python ./test-bloom-filter
+	/usr/local/jython-2.5.2-r7288/bin/jython ./test-bloom-filter
+	/usr/local/pypy-1.4.1/bin/pypy ./test-bloom-filter
+
+# Remove compiled files (*.pyc, *.class) and the __pycache__ directory.
+clean:
+	rm -f *.pyc *.class
+	rm -rf __pycache__
+

+ 65 - 0
bloom_filter_mod.py

@@ -0,0 +1,65 @@
+
+'''Bloom Filter: Probabilistic set membership testing for large sets'''
+
+# Shamelessly borrowed (under MIT license) from http://code.activestate.com/recipes/577686-bloom-filter/
+# About Bloom Filters: http://en.wikipedia.org/wiki/Bloom_filter
+
+# Tweaked a bit by Daniel Richard Stromberg, mostly to make it pass pylint
+
+import array
+import random
+
+def get_probes(bfilter, key):
+	'''Yield bfilter.num_probes (array index, bit mask) pairs for key.
+
+	The pairs come from a random.Random generator seeded with key, so the
+	same key and filter always produce the same sequence of probes.
+	'''
+	rng = random.Random(key)
+	for _ in range(bfilter.num_probes):
+		# Draw order matters: word index first, then the bit position within it
+		word_index = rng.randrange(len(bfilter.arr))
+		shift = rng.randrange(32)
+		yield word_index, 1 << shift
+
+class Bloom_filter:
+	'''Probabilistic set membership testing for large sets.
+
+	A key that was never added may still test as present (a false positive).
+	The bits are kept in self.arr, an array.array of unsigned longs sized at
+	32 bits per word.
+	'''
+
+	def __init__(self, num_bits, num_probes, probe_func=get_probes):
+		'''Create an empty filter.
+
+		num_bits: requested size in bits; storage is rounded up to whole 32-bit words
+		num_probes: number of probes per key (read by the default probe_func)
+		probe_func: callable(bfilter, key) yielding (array index, bit mask) pairs
+		'''
+		self.num_bits = num_bits
+		num_words = (num_bits + 31) // 32
+		self.arr = array.array('L', [0]) * num_words
+		self.num_probes = num_probes
+		self.probe_func = probe_func
+
+	def add(self, key):
+		'''Add an element to the filter'''
+		for i, mask in self.probe_func(self, key):
+			self.arr[i] |= mask
+
+	def _match_template(self, bfilter):
+		'''Compare a sort of signature for two bloom filters.  Used in preparation for binary operations'''
+		return (self.num_bits == bfilter.num_bits
+			and self.num_probes == bfilter.num_probes
+			and self.probe_func == bfilter.probe_func)
+
+	def _copy(self):
+		'''Return a new filter with the same parameters and its own copy of the bits'''
+		duplicate = Bloom_filter(self.num_bits, self.num_probes, self.probe_func)
+		duplicate.arr = self.arr[:]
+		return duplicate
+
+	def union(self, bfilter):
+		'''Merge bfilter into this filter in place (set union).
+
+		Raises ValueError if the two filters differ in num_bits, num_probes or probe_func.
+		'''
+		if not self._match_template(bfilter):
+			# Union b/w two unrelated bloom filters raises this
+			raise ValueError("Mismatched bloom filters")
+		# Update word by word so self.arr stays an array.array instead of turning into a list
+		for index, word in enumerate(bfilter.arr):
+			self.arr[index] |= word
+
+	def __or__(self, bfilter):
+		'''Return a new filter holding the union; neither operand is modified'''
+		result = self._copy()
+		result.union(bfilter)
+		return result
+
+	def __ior__(self, bfilter):
+		'''In-place union, for the |= operator'''
+		self.union(bfilter)
+		return self
+
+	def intersection(self, bfilter):
+		'''Reduce this filter in place to the bits it shares with bfilter (set intersection).
+
+		Raises ValueError if the two filters differ in num_bits, num_probes or probe_func.
+		'''
+		if not self._match_template(bfilter):
+			# Intersection b/w two unrelated bloom filters raises this
+			raise ValueError("Mismatched bloom filters")
+		# Update word by word so self.arr stays an array.array instead of turning into a list
+		for index, word in enumerate(bfilter.arr):
+			self.arr[index] &= word
+
+	def __and__(self, bfilter):
+		'''Return a new filter holding the intersection; neither operand is modified'''
+		result = self._copy()
+		result.intersection(bfilter)
+		return result
+
+	def __iand__(self, bfilter):
+		'''In-place intersection, for the &= operator'''
+		self.intersection(bfilter)
+		return self
+
+	def __contains__(self, key):
+		'''Return True if every bit probed for key is set'''
+		return all(self.arr[i] & mask for i, mask in self.probe_func(self, key))
+

+ 47 - 0
test-bloom-filter

@@ -0,0 +1,47 @@
+#!/usr/bin/python
+
+# pylint: disable=W0402
+# W0402: We want the deprecated string module, for a use that isn't deprecated
+
+'''Unit tests for bloom_filter_mod'''
+
+import random
+import string
+
+import bloom_filter_mod
+
+def tests():
+	'''Some quick automatic tests for the bloom filter class.
+
+	Adds the state names below to a filter, then prints how many of them are
+	found again and how many random 5-letter strings are wrongly reported as
+	present.  Raises AssertionError if any added state is reported missing.
+	'''
+
+	states = '''Alabama Alaska Arizona Arkansas California Colorado Connecticut
+		Delaware Florida Georgia Hawaii Idaho Illinois Indiana Iowa Kansas
+		Kentucky Louisiana Maine Maryland Massachusetts Michigan Minnesota
+		Mississippi Missouri Montana Nebraska Nevada NewHampshire NewJersey
+		NewMexico NewYork NorthCarolina NorthDakota Ohio Oklahoma Oregon
+		Pennsylvania RhodeIsland SouthCarolina SouthDakota Tennessee Texas Utah
+		Vermont Virginia Washington WestVirginia Wisconsin Wyoming'''.split()
+
+	bloom_filter = bloom_filter_mod.Bloom_filter(num_bits=1000, num_probes=14)
+	for state in states:
+		bloom_filter.add(state)
+
+	states_in_count = sum(state in bloom_filter for state in states)
+	print('%d true positives out of %d trials' % (states_in_count, len(states)))
+	# Every key that was added must be found again, so fail loudly instead of only printing.
+	# An explicit raise (rather than an assert statement) still fires under python -O.
+	if states_in_count != len(states):
+		raise AssertionError('%d of %d added states were reported missing' % (len(states) - states_in_count, len(states)))
+
+	# A set keeps the "is this a real state?" check cheap inside the trial loop
+	real_states = set(states)
+
+	trials = 100000
+	false_positives = 0
+	for _ in range(trials):
+		candidate = ''.join(random.sample(string.ascii_letters, 5))
+		# If we accidentally found a real state, try again
+		while candidate in real_states:
+			candidate = ''.join(random.sample(string.ascii_letters, 5))
+		if candidate in bloom_filter:
+			false_positives += 1
+	print('%d true negatives and %d false positives out of %d trials' % (trials - false_positives, false_positives, trials))
+
+tests()
+
+

+ 46 - 0
this-pylint

@@ -0,0 +1,46 @@
+#!/usr/bin/env bash
+
+# Run pylint with this project's settings on the files given as arguments.
+# Exits 1 if pylint died with a traceback, or reported anything other than
+# module header lines and FIXME warnings; exits 0 otherwise.
+
+set -eu
+
+#et -o pipefail
+
+# pylint's own exit status is ignored on purpose (|| true): the captured
+# output is what gets judged below.
+output=$(set -eu; /usr/local/cpython-2.7/bin/pylint \
+	'--init-hook'='import sys; sys.path.append("/home/dstromberg/lib"); sys.path.append(".")' \
+	'--max-line-length'=133 \
+	'--indent-string'="\\t" \
+	'--module-rgx'='[A-Za-z_][-a-zA-Z0-9_]+$' \
+	'--class-rgx'='[A-Za-z_][-a-zA-Z0-9_]+$' \
+	"$@" 2>&1 || true)
+# grep -E replaces egrep, which is obsolescent and warns on recent GNU grep
+if echo "$output" | grep -E -i traceback > /dev/null
+then
+	echo "pylint exited with a traceback:" 1>&2
+	echo "$output" 1>&2
+	exit 1
+fi
+
+# Keep only the "************* Module" headers and the C/W/E/R/F message lines
+pruned_output=$(echo "$output" | grep -E '^\*\*\*|^[CWERF]:' || true)
+
+# Anything left after dropping the headers and FIXME warnings is a real complaint
+if echo "$pruned_output" | grep -E -v '^\*\*\*|^W: *[0-9]*: FIXME:' | grep -E . > /dev/null
+then
+	echo "$pruned_output" 1>&2
+	exit 1
+else
+	exit 0
+fi
+	
+
+# ************* Module drs_buffer_mod
+# C:  7:unpack_slice: Missing docstring
+# /usr/local/lib/python2.6/dist-packages/pylint-0.21.3-py2.6.egg/pylint/checkers/variables.py:308: DeprecationWarning: enumerate exists in builtins since py2.3
+# for i, stmt in enumerate(astmts[1:]):
+# C: 25:DRS_buffer: Missing docstring
+# /usr/local/lib/python2.6/dist-packages/logilab_astng-0.20.3-py2.6.egg/logilab/astng/scoped_nodes.py:904: DeprecationWarning: chain exists in itertools since py2.3
+# for astng in chain(iter((self,)), self.ancestors()):
+# E: 40:DRS_buffer.__getitem__: Undefined variable 'exceptions'
+# E: 55:DRS_buffer.__delitem__: Undefined variable 'exceptions'
+# C: 57:DRS_buffer.extend: Missing docstring
+# E: 64:DRS_buffer.extend: Undefined variable 'exceptions'
+# R: 25:DRS_buffer: Too few public methods (1/2)
+# ************* Module test-drs_buffer_mod
+# E: 15: expected an indented block
+