test-bloom-filter 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326
  1. #!/usr/bin/python
  2. '''Unit tests for bloom_filter_mod'''
  3. #mport os
  4. import sys
  5. import math
  6. import time
  7. try:
  8. import anydbm
  9. except ImportError:
  10. import dbm as anydbm
  11. import random
  12. import bloom_filter_mod
  13. CHARACTERS = 'abcdefghijklmnopqrstuvwxyz1234567890'
  14. def my_range(maximum):
  15. '''A range function with consistent semantics on 2.x and 3.x'''
  16. value = 0
  17. while True:
  18. if value >= maximum:
  19. break
  20. yield value
  21. value += 1
  22. def test(description, values, trials, error_rate, probe_bitnoer=bloom_filter_mod.get_bitno_lin_comb, filename=None):
  23. # pylint: disable=R0913,R0914
  24. # R0913: We want a few arguments
  25. # R0914: We want some local variables too. This is just test code.
  26. '''Some quick automatic tests for the bloom filter class'''
  27. all_good = True
  28. divisor = 100000
  29. bloom_filter = bloom_filter_mod.Bloom_filter(
  30. ideal_num_elements_n=trials * 2,
  31. error_rate_p=error_rate,
  32. probe_bitnoer=probe_bitnoer,
  33. filename=filename,
  34. start_fresh=True,
  35. )
  36. sys.stdout.write('\ndescription: %s num_bits_m: %s num_probes_k: %s\n' %
  37. (description, bloom_filter.num_bits_m, bloom_filter.num_probes_k))
  38. print('starting to add values to an empty bloom filter')
  39. for valueno, value in enumerate(values.generator()):
  40. reverse_valueno = values.length() - valueno
  41. if reverse_valueno % divisor == 0:
  42. print('adding valueno %d' % reverse_valueno)
  43. bloom_filter.add(value)
  44. print('testing all known members')
  45. include_in_count = sum(include in bloom_filter for include in values.generator())
  46. if include_in_count == values.length():
  47. # Good
  48. pass
  49. else:
  50. sys.stderr.write('Include count bad: %s, %d\n' % (include_in_count, values.length()))
  51. all_good = False
  52. print('testing random non-members')
  53. false_positives = 0
  54. for trialno in my_range(trials):
  55. if trialno % divisor == 0:
  56. sys.stderr.write('trialno countdown: %d\n' % (trials-trialno))
  57. while True:
  58. candidate = ''.join(random.sample(CHARACTERS, 5))
  59. # If we accidentally found a member, try again
  60. if values.within(candidate):
  61. continue
  62. if candidate in bloom_filter:
  63. #print 'We erroneously think %s is in the filter' % candidate
  64. false_positives += 1
  65. break
  66. actual_error_rate = float(false_positives) / trials
  67. if actual_error_rate > error_rate:
  68. sys.stderr.write('%s: Too many false positives: actual: %s, expected: %s\n' % (
  69. sys.argv[0],
  70. actual_error_rate,
  71. error_rate,
  72. ))
  73. all_good = False
  74. return all_good
  75. class States(object):
  76. '''Generate the USA's state names'''
  77. def __init__(self):
  78. pass
  79. states = '''Alabama Alaska Arizona Arkansas California Colorado Connecticut
  80. Delaware Florida Georgia Hawaii Idaho Illinois Indiana Iowa Kansas
  81. Kentucky Louisiana Maine Maryland Massachusetts Michigan Minnesota
  82. Mississippi Missouri Montana Nebraska Nevada NewHampshire NewJersey
  83. NewMexico NewYork NorthCarolina NorthDakota Ohio Oklahoma Oregon
  84. Pennsylvania RhodeIsland SouthCarolina SouthDakota Tennessee Texas Utah
  85. Vermont Virginia Washington WestVirginia Wisconsin Wyoming'''.split()
  86. @staticmethod
  87. def generator():
  88. '''Generate the states'''
  89. for state in States.states:
  90. yield state
  91. @staticmethod
  92. def within(value):
  93. '''Is the value in our list of states?'''
  94. return value in States.states
  95. @staticmethod
  96. def length():
  97. '''What is the length of our contained values?'''
  98. return len(States.states)
  99. def random_string():
  100. '''Generate a random, 10 character string - for testing purposes'''
  101. list_ = []
  102. for chrno in range(10):
  103. dummy = chrno
  104. character = CHARACTERS[int(random.random() * len(CHARACTERS))]
  105. list_.append(character)
  106. return ''.join(list_)
  107. class Random_content(object):
  108. '''Generated a bunch of random strings in sorted order'''
  109. random_content = [ random_string() for dummy in range(1000) ]
  110. def __init__(self):
  111. pass
  112. @staticmethod
  113. def generator():
  114. '''Generate all values'''
  115. for item in Random_content.random_content:
  116. yield item
  117. @staticmethod
  118. def within(value):
  119. '''Test for membership'''
  120. return value in Random_content.random_content
  121. @staticmethod
  122. def length():
  123. '''How many members?'''
  124. return len(Random_content.random_content)
  125. class Evens(object):
  126. '''Generate a bunch of even numbers'''
  127. def __init__(self, maximum):
  128. self.maximum = maximum
  129. def generator(self):
  130. '''Generate all values'''
  131. for value in my_range(self.maximum):
  132. if value % 2 == 0:
  133. yield str(value)
  134. def within(self, value):
  135. '''Test for membership'''
  136. try:
  137. int_value = int(value)
  138. except ValueError:
  139. return False
  140. if int_value >= 0 and int_value < self.maximum and int_value % 2 == 0:
  141. return True
  142. else:
  143. return False
  144. def length(self):
  145. '''How many members?'''
  146. return int(math.ceil(self.maximum / 2.0))
  147. def and_test():
  148. '''Test the & operator'''
  149. all_good = True
  150. abc = bloom_filter_mod.Bloom_filter(ideal_num_elements_n=100, error_rate_p=0.01)
  151. for character in [ 'a', 'b', 'c' ]:
  152. abc += character
  153. bcd = bloom_filter_mod.Bloom_filter(ideal_num_elements_n=100, error_rate_p=0.01)
  154. for character in [ 'b', 'c', 'd' ]:
  155. bcd += character
  156. abc_and_bcd = abc
  157. abc_and_bcd &= bcd
  158. if 'a' in abc_and_bcd:
  159. sys.stderr.write('a in abc_and_bcd, but should not be')
  160. all_good = False
  161. if not 'b' in abc_and_bcd:
  162. sys.stderr.write('b not in abc_and_bcd, but should be')
  163. all_good = False
  164. if not 'c' in abc_and_bcd:
  165. sys.stderr.write('c not in abc_and_bcd, but should be')
  166. all_good = False
  167. if 'd' in abc_and_bcd:
  168. sys.stderr.write('d in abc_and_bcd, but should not be')
  169. all_good = False
  170. return all_good
  171. def or_test():
  172. '''Test the | operator'''
  173. all_good = True
  174. abc = bloom_filter_mod.Bloom_filter(ideal_num_elements_n=100, error_rate_p=0.01)
  175. for character in [ 'a', 'b', 'c' ]:
  176. abc += character
  177. bcd = bloom_filter_mod.Bloom_filter(ideal_num_elements_n=100, error_rate_p=0.01)
  178. for character in [ 'b', 'c', 'd' ]:
  179. bcd += character
  180. abc_and_bcd = abc
  181. abc_and_bcd |= bcd
  182. if not 'a' in abc_and_bcd:
  183. sys.stderr.write('a not in abc_and_bcd, but should be')
  184. all_good = False
  185. if not 'b' in abc_and_bcd:
  186. sys.stderr.write('b not in abc_and_bcd, but should be')
  187. all_good = False
  188. if not 'c' in abc_and_bcd:
  189. sys.stderr.write('c not in abc_and_bcd, but should be')
  190. all_good = False
  191. if not 'd' in abc_and_bcd:
  192. sys.stderr.write('d not in abc_and_bcd, but should be')
  193. all_good = False
  194. if 'e' in abc_and_bcd:
  195. sys.stderr.write('e in abc_and_bcd, but should not be')
  196. all_good = False
  197. return all_good
  198. def give_description(filename):
  199. '''Return a description of the filename type - could be array, file or hybrid'''
  200. if filename is None:
  201. return 'array'
  202. elif isinstance(filename, tuple):
  203. if filename[1] == -1:
  204. return 'mmap'
  205. else:
  206. return 'hybrid'
  207. else:
  208. return 'seek'
  209. def main():
  210. '''Unit tests for Bloom_filter class'''
  211. if sys.argv[1:] == [ '--performance-test' ]:
  212. performance_test = True
  213. else:
  214. performance_test = False
  215. all_good = True
  216. all_good &= test('states', States(), trials=100000, error_rate=0.01)
  217. all_good &= test('random', Random_content(), trials=10000, error_rate=0.1)
  218. all_good &= test('random', Random_content(), trials=10000, error_rate=0.1, probe_bitnoer=bloom_filter_mod.get_bitno_seed_rnd)
  219. filename = 'bloom-filter-rm-me'
  220. all_good &= test('random', Random_content(), trials=10000, error_rate=0.1, filename=filename)
  221. all_good &= and_test()
  222. all_good &= or_test()
  223. if performance_test:
  224. sqrt_of_10 = math.sqrt(10)
  225. #for exponent in range(5): # this is a lot, but probably not unreasonable
  226. for exponent in range(19): # this is a lot, but probably not unreasonable
  227. elements = int(sqrt_of_10 ** exponent + 0.5)
  228. for filename in [ None, 'bloom-filter-rm-me', ('bloom-filter-rm-me', 768 * 2**20), ('bloom-filter-rm-me', -1) ]:
  229. description = give_description(filename)
  230. key = '%s %s' % (description, elements)
  231. database = anydbm.open('performance-numbers', 'c')
  232. if key in database.keys():
  233. database.close()
  234. continue
  235. if elements >= 100000000 and description == 'seek':
  236. continue
  237. if elements >= 100000000 and description == 'mmap':
  238. continue
  239. if elements >= 1000000000 and description == 'array':
  240. continue
  241. time0 = time.time()
  242. all_good &= test(
  243. 'evens %s elements: %d' % (give_description(filename), elements),
  244. Evens(elements),
  245. trials=elements,
  246. error_rate=1e-2,
  247. filename=filename,
  248. )
  249. time1 = time.time()
  250. delta_t = time1 - time0
  251. #file_ = open('%s.txt' % description, 'a')
  252. #file_.write('%d %f\n' % (elements, delta_t))
  253. #file_.close()
  254. database = anydbm.open('performance-numbers', 'c')
  255. database[key] = '%f' % delta_t
  256. database.close()
  257. if all_good:
  258. sys.stderr.write('%s: All tests passed\n' % sys.argv[0])
  259. sys.exit(0)
  260. else:
  261. sys.stderr.write('%s: One or more tests failed\n' % sys.argv[0])
  262. sys.exit(1)
  263. main()