marx
/
bloom-filter


			
				
					
						
						
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180
							#!/usr/bin/python

# pylint: disable=W0402
# W0402: We want the deprecated string module, for a use that isn't deprecated

'''Unit tests for bloom_filter_mod'''

import sys
import random
import string

import bloom_filter_mod

def my_range(maximum):
	'''A range function with consistent semantics on 2.x and 3.x'''
	value = 0
	while True:
		if value >= maximum:
			break
		yield value
		value += 1

def primary_test(order, included, trials, error_rate):
	'''Some quick automatic tests for the bloom filter class'''

	all_good = True

	bloom_filter = bloom_filter_mod.Bloom_filter(ideal_num_elements=trials, error_rate=error_rate)
	print(repr(bloom_filter))
	for step in order:
		if step == 1:
			for include in included:
				bloom_filter.add(include)

			include_in_count = sum(include in bloom_filter for include in included)
			if include_in_count == len(included):
				# Good
				pass
			else:
				sys.stderr.write('Include count bad: %s, %d\n' % (include_in_count, len(included)))
				all_good = False

		elif step == 2:
			false_positives = 0
			for trialno in my_range(trials):
				if trialno % 10000 == 0:
					sys.stderr.write('trialno countdown: %d\n' % (trials-trialno))
				dummy = trialno
				while True:
					candidate = ''.join(random.sample(string.ascii_letters, 5))
					# If we accidentally found a real include, try again
					if candidate in included:
						continue
					if candidate in bloom_filter:
						false_positives += 1
					break
		else:
			raise ValueError('step is not 1 or 2')

	#print('%d true negatives and %d false positives out of %d trials' % (trials - false_positives, false_positives, trials))
	actual_error_rate = float(false_positives) / trials

	if actual_error_rate > error_rate:
		sys.stderr.write('%s: Too many false positives: actual: %s, expected: %s\n' % (
			sys.argv[0],
			actual_error_rate,
			error_rate,
			))
		all_good = False

	return all_good

def random_string():
	'''Generate a random, 10 character string - for testing purposes'''
	list_ = []
	for chrno in range(10):
		dummy = chrno
		character = string.ascii_letters[int(random.random() * 26)]
		list_.append(character)
	return ''.join(list_)

def and_test():
	'''Test the & operator'''

	all_good = True

	abc = bloom_filter_mod.Bloom_filter(ideal_num_elements=100, error_rate=0.01)
	for character in [ 'a', 'b', 'c' ]:
		abc += character

	bcd = bloom_filter_mod.Bloom_filter(ideal_num_elements=100, error_rate=0.01)
	for character in [ 'b', 'c', 'd' ]:
		bcd += character

	abc_and_bcd = abc
	abc_and_bcd &= bcd

	if 'a' in abc_and_bcd:
		sys.stderr.write('a in abc_and_bcd, but should not be')
		all_good = False
	if not 'b' in abc_and_bcd:
		sys.stderr.write('b not in abc_and_bcd, but should be')
		all_good = False
	if not 'c' in abc_and_bcd:
		sys.stderr.write('c not in abc_and_bcd, but should be')
		all_good = False
	if 'd' in abc_and_bcd:
		sys.stderr.write('d in abc_and_bcd, but should not be')
		all_good = False

	return all_good
	
def or_test():
	'''Test the | operator'''

	all_good = True

	abc = bloom_filter_mod.Bloom_filter(ideal_num_elements=100, error_rate=0.01)
	for character in [ 'a', 'b', 'c' ]:
		abc += character

	bcd = bloom_filter_mod.Bloom_filter(ideal_num_elements=100, error_rate=0.01)
	for character in [ 'b', 'c', 'd' ]:
		bcd += character

	abc_and_bcd = abc
	abc_and_bcd |= bcd

	if not 'a' in abc_and_bcd:
		sys.stderr.write('a not in abc_and_bcd, but should be')
		all_good = False
	if not 'b' in abc_and_bcd:
		sys.stderr.write('b not in abc_and_bcd, but should be')
		all_good = False
	if not 'c' in abc_and_bcd:
		sys.stderr.write('c not in abc_and_bcd, but should be')
		all_good = False
	if not 'd' in abc_and_bcd:
		sys.stderr.write('d not in abc_and_bcd, but should be')
		all_good = False
	if 'e' in abc_and_bcd:
		sys.stderr.write('e in abc_and_bcd, but should be')
		all_good = False

	return all_good
	
def main():
	'''Unit tests for Bloom_filter class'''

	all_good = True

	states = '''Alabama Alaska Arizona Arkansas California Colorado Connecticut
		Delaware Florida Georgia Hawaii Idaho Illinois Indiana Iowa Kansas
		Kentucky Louisiana Maine Maryland Massachusetts Michigan Minnesota
		Mississippi Missouri Montana Nebraska Nevada NewHampshire NewJersey
		NewMexico NewYork NorthCarolina NorthDakota Ohio Oklahoma Oregon
		Pennsylvania RhodeIsland SouthCarolina SouthDakota Tennessee Texas Utah
		Vermont Virginia Washington WestVirginia Wisconsin Wyoming'''.split()

	random_content = [ random_string() for dummy in range(1000) ]

	all_good &= primary_test([1, 2], states, trials=10000, error_rate=0.01)

	all_good &= primary_test([1, 2], random_content, trials=10000, error_rate=0.1)

	all_good &= primary_test([2, 1], [ 'a', 'b', 'c'], trials=100, error_rate=0.000001)

	all_good &= and_test()

	all_good &= or_test()

	if all_good:
		sys.stderr.write('%s: All tests passed\n' % sys.argv[0])
		sys.exit(0)
	else:
		sys.stderr.write('%s: One or more tests failed\n' % sys.argv[0])
		sys.exit(1)

main()