123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330 |
- #!/usr/bin/python
- # pylint: disable=W0402,W0404
- # W0402: We want the deprecated string module, for a use that isn't deprecated
- # W0404: We need to import anydbm or dbm
- '''Unit tests for bloom_filter_mod'''
- #mport os
- import sys
- import math
- import time
- try:
- import anydbm
- except ImportError:
- import dbm as anydbm
- import random
- import bloom_filter_mod
- CHARACTERS = 'abcdefghijklmnopqrstuvwxyz1234567890'
- def my_range(maximum):
- '''A range function with consistent semantics on 2.x and 3.x'''
- value = 0
- while True:
- if value >= maximum:
- break
- yield value
- value += 1
- def test(description, values, trials, error_rate, probe_bitnoer=bloom_filter_mod.get_bitno_lin_comb, filename=None):
- # pylint: disable=R0913,R0914
- # R0913: We want a few arguments
- # R0914: We want some local variables too. This is just test code.
- '''Some quick automatic tests for the bloom filter class'''
- all_good = True
- divisor = 100000
- bloom_filter = bloom_filter_mod.Bloom_filter(
- ideal_num_elements_n=trials * 2,
- error_rate_p=error_rate,
- probe_bitnoer=probe_bitnoer,
- filename=filename,
- start_fresh=True,
- )
- sys.stdout.write('\ndescription: %s num_bits_m: %s num_probes_k: %s\n' %
- (description, bloom_filter.num_bits_m, bloom_filter.num_probes_k))
- print('starting to add values to an empty bloom filter')
- for valueno, value in enumerate(values.generator()):
- reverse_valueno = values.length() - valueno
- if reverse_valueno % divisor == 0:
- print('adding valueno %d' % reverse_valueno)
- bloom_filter.add(value)
- print('testing all known members')
- include_in_count = sum(include in bloom_filter for include in values.generator())
- if include_in_count == values.length():
- # Good
- pass
- else:
- sys.stderr.write('Include count bad: %s, %d\n' % (include_in_count, values.length()))
- all_good = False
- print('testing random non-members')
- false_positives = 0
- for trialno in my_range(trials):
- if trialno % divisor == 0:
- sys.stderr.write('trialno countdown: %d\n' % (trials-trialno))
- while True:
- candidate = ''.join(random.sample(CHARACTERS, 5))
- # If we accidentally found a member, try again
- if values.within(candidate):
- continue
- if candidate in bloom_filter:
- #print 'We erroneously think %s is in the filter' % candidate
- false_positives += 1
- break
- actual_error_rate = float(false_positives) / trials
- if actual_error_rate > error_rate:
- sys.stderr.write('%s: Too many false positives: actual: %s, expected: %s\n' % (
- sys.argv[0],
- actual_error_rate,
- error_rate,
- ))
- all_good = False
- return all_good
- class States:
- '''Generate the USA's state names'''
- def __init__(self):
- pass
- states = '''Alabama Alaska Arizona Arkansas California Colorado Connecticut
- Delaware Florida Georgia Hawaii Idaho Illinois Indiana Iowa Kansas
- Kentucky Louisiana Maine Maryland Massachusetts Michigan Minnesota
- Mississippi Missouri Montana Nebraska Nevada NewHampshire NewJersey
- NewMexico NewYork NorthCarolina NorthDakota Ohio Oklahoma Oregon
- Pennsylvania RhodeIsland SouthCarolina SouthDakota Tennessee Texas Utah
- Vermont Virginia Washington WestVirginia Wisconsin Wyoming'''.split()
- @staticmethod
- def generator():
- '''Generate the states'''
- for state in States.states:
- yield state
- @staticmethod
- def within(value):
- '''Is the vaoue in our list of states?'''
- return value in States.states
- @staticmethod
- def length():
- '''What is the length of our contained values?'''
- return len(States.states)
- def random_string():
- '''Generate a random, 10 character string - for testing purposes'''
- list_ = []
- for chrno in range(10):
- dummy = chrno
- character = CHARACTERS[int(random.random() * len(CHARACTERS))]
- list_.append(character)
- return ''.join(list_)
- class Random_content:
- '''Generated a bunch of random strings in sorted order'''
- random_content = [ random_string() for dummy in range(1000) ]
- def __init__(self):
- pass
- @staticmethod
- def generator():
- '''Generate all values'''
- for item in Random_content.random_content:
- yield item
- @staticmethod
- def within(value):
- '''Test for membership'''
- return value in Random_content.random_content
- @staticmethod
- def length():
- '''How many members?'''
- return len(Random_content.random_content)
- class Evens:
- '''Generate a bunch of even numbers'''
- def __init__(self, maximum):
- self.maximum = maximum
- def generator(self):
- '''Generate all values'''
- for value in my_range(self.maximum):
- if value % 2 == 0:
- yield str(value)
- def within(self, value):
- '''Test for membership'''
- try:
- int_value = int(value)
- except ValueError:
- return False
- if int_value >= 0 and int_value < self.maximum and int_value % 2 == 0:
- return True
- else:
- return False
- def length(self):
- '''How many members?'''
- return int(math.ceil(self.maximum / 2.0))
- def and_test():
- '''Test the & operator'''
- all_good = True
- abc = bloom_filter_mod.Bloom_filter(ideal_num_elements_n=100, error_rate_p=0.01)
- for character in [ 'a', 'b', 'c' ]:
- abc += character
- bcd = bloom_filter_mod.Bloom_filter(ideal_num_elements_n=100, error_rate_p=0.01)
- for character in [ 'b', 'c', 'd' ]:
- bcd += character
- abc_and_bcd = abc
- abc_and_bcd &= bcd
- if 'a' in abc_and_bcd:
- sys.stderr.write('a in abc_and_bcd, but should not be')
- all_good = False
- if not 'b' in abc_and_bcd:
- sys.stderr.write('b not in abc_and_bcd, but should be')
- all_good = False
- if not 'c' in abc_and_bcd:
- sys.stderr.write('c not in abc_and_bcd, but should be')
- all_good = False
- if 'd' in abc_and_bcd:
- sys.stderr.write('d in abc_and_bcd, but should not be')
- all_good = False
- return all_good
- def or_test():
- '''Test the | operator'''
- all_good = True
- abc = bloom_filter_mod.Bloom_filter(ideal_num_elements_n=100, error_rate_p=0.01)
- for character in [ 'a', 'b', 'c' ]:
- abc += character
- bcd = bloom_filter_mod.Bloom_filter(ideal_num_elements_n=100, error_rate_p=0.01)
- for character in [ 'b', 'c', 'd' ]:
- bcd += character
- abc_and_bcd = abc
- abc_and_bcd |= bcd
- if not 'a' in abc_and_bcd:
- sys.stderr.write('a not in abc_and_bcd, but should be')
- all_good = False
- if not 'b' in abc_and_bcd:
- sys.stderr.write('b not in abc_and_bcd, but should be')
- all_good = False
- if not 'c' in abc_and_bcd:
- sys.stderr.write('c not in abc_and_bcd, but should be')
- all_good = False
- if not 'd' in abc_and_bcd:
- sys.stderr.write('d not in abc_and_bcd, but should be')
- all_good = False
- if 'e' in abc_and_bcd:
- sys.stderr.write('e in abc_and_bcd, but should be')
- all_good = False
- return all_good
- def give_description(filename):
- '''Return a description of the filename type - could be array, file or hybrid'''
- if filename is None:
- return 'array'
- elif isinstance(filename, tuple):
- if filename[1] == -1:
- return 'mmap'
- else:
- return 'hybrid'
- else:
- return 'seek'
- def main():
- '''Unit tests for Bloom_filter class'''
- if sys.argv[1:] == [ '--performance-test' ]:
- performance_test = True
- else:
- performance_test = False
- all_good = True
- all_good &= test('states', States(), trials=100000, error_rate=0.01)
- all_good &= test('random', Random_content(), trials=10000, error_rate=0.1)
- all_good &= test('random', Random_content(), trials=10000, error_rate=0.1, probe_bitnoer=bloom_filter_mod.get_bitno_seed_rnd)
- filename = 'bloom-filter-rm-me'
- all_good &= test('random', Random_content(), trials=10000, error_rate=0.1, filename=filename)
- all_good &= and_test()
- all_good &= or_test()
- if performance_test:
- sqrt_of_10 = math.sqrt(10)
- #for exponent in range(5): # this is a lot, but probably not unreasonable
- for exponent in range(19): # this is a lot, but probably not unreasonable
- elements = int(sqrt_of_10 ** exponent + 0.5)
- for filename in [ None, 'bloom-filter-rm-me', ('bloom-filter-rm-me', 768 * 2**20), ('bloom-filter-rm-me', -1) ]:
- description = give_description(filename)
- key = '%s %s' % (description, elements)
- database = anydbm.open('performance-numbers', 'c')
- if key in database:
- database.close()
- continue
- if elements >= 100000000 and description == 'seek':
- continue
- if elements >= 100000000 and description == 'mmap':
- continue
- if elements >= 1000000000 and description == 'array':
- continue
- time0 = time.time()
- all_good &= test(
- 'evens %s elements: %d' % (give_description(filename), elements),
- Evens(elements),
- trials=elements,
- error_rate=1e-2,
- filename=filename,
- )
- time1 = time.time()
- delta_t = time1 - time0
- #file_ = open('%s.txt' % description, 'a')
- #file_.write('%d %f\n' % (elements, delta_t))
- #file_.close()
- database = anydbm.open('performance-numbers', 'c')
- database[key] = '%f' % delta_t
- database.close()
- if all_good:
- sys.stderr.write('%s: All tests passed\n' % sys.argv[0])
- sys.exit(0)
- else:
- sys.stderr.write('%s: One or more tests failed\n' % sys.argv[0])
- sys.exit(1)
- main()
|