test-bloom-filter 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335
  1. #!/usr/bin/python
  2. # pylint: disable=superfluous-parens
  3. # superfluous-parens: Parentheses are good for clarity and portability
  4. '''Unit tests for bloom_filter_mod'''
  5. #mport os
  6. import sys
  7. import math
  8. import time
  9. try:
  10. import anydbm
  11. except ImportError:
  12. import dbm as anydbm
  13. import random
  14. import bloom_filter_mod
  15. CHARACTERS = 'abcdefghijklmnopqrstuvwxyz1234567890'
  16. def my_range(maximum):
  17. '''A range function with consistent semantics on 2.x and 3.x'''
  18. value = 0
  19. while True:
  20. if value >= maximum:
  21. break
  22. yield value
  23. value += 1
  24. def test(description, values, trials, error_rate, probe_bitnoer=bloom_filter_mod.get_bitno_lin_comb, filename=None):
  25. # pylint: disable=R0913,R0914
  26. # R0913: We want a few arguments
  27. # R0914: We want some local variables too. This is just test code.
  28. '''Some quick automatic tests for the bloom filter class'''
  29. all_good = True
  30. divisor = 100000
  31. bloom_filter = bloom_filter_mod.Bloom_filter(
  32. ideal_num_elements_n=trials * 2,
  33. error_rate_p=error_rate,
  34. probe_bitnoer=probe_bitnoer,
  35. filename=filename,
  36. start_fresh=True,
  37. )
  38. message = '\ndescription: %s num_bits_m: %s num_probes_k: %s\n'
  39. filled_out_message = message % (
  40. description,
  41. bloom_filter.num_bits_m,
  42. bloom_filter.num_probes_k,
  43. )
  44. sys.stdout.write(filled_out_message)
  45. print('starting to add values to an empty bloom filter')
  46. for valueno, value in enumerate(values.generator()):
  47. reverse_valueno = values.length() - valueno
  48. if reverse_valueno % divisor == 0:
  49. print('adding valueno %d' % reverse_valueno)
  50. bloom_filter.add(value)
  51. print('testing all known members')
  52. include_in_count = sum(include in bloom_filter for include in values.generator())
  53. if include_in_count == values.length():
  54. # Good
  55. pass
  56. else:
  57. sys.stderr.write('Include count bad: %s, %d\n' % (include_in_count, values.length()))
  58. all_good = False
  59. print('testing random non-members')
  60. false_positives = 0
  61. for trialno in my_range(trials):
  62. if trialno % divisor == 0:
  63. sys.stderr.write('trialno countdown: %d\n' % (trials-trialno))
  64. while True:
  65. candidate = ''.join(random.sample(CHARACTERS, 5))
  66. # If we accidentally found a member, try again
  67. if values.within(candidate):
  68. continue
  69. if candidate in bloom_filter:
  70. #print 'We erroneously think %s is in the filter' % candidate
  71. false_positives += 1
  72. break
  73. actual_error_rate = float(false_positives) / trials
  74. if actual_error_rate > error_rate:
  75. sys.stderr.write('%s: Too many false positives: actual: %s, expected: %s\n' % (
  76. sys.argv[0],
  77. actual_error_rate,
  78. error_rate,
  79. ))
  80. all_good = False
  81. return all_good
  82. class States(object):
  83. '''Generate the USA's state names'''
  84. def __init__(self):
  85. pass
  86. states = '''Alabama Alaska Arizona Arkansas California Colorado Connecticut
  87. Delaware Florida Georgia Hawaii Idaho Illinois Indiana Iowa Kansas
  88. Kentucky Louisiana Maine Maryland Massachusetts Michigan Minnesota
  89. Mississippi Missouri Montana Nebraska Nevada NewHampshire NewJersey
  90. NewMexico NewYork NorthCarolina NorthDakota Ohio Oklahoma Oregon
  91. Pennsylvania RhodeIsland SouthCarolina SouthDakota Tennessee Texas Utah
  92. Vermont Virginia Washington WestVirginia Wisconsin Wyoming'''.split()
  93. @staticmethod
  94. def generator():
  95. '''Generate the states'''
  96. for state in States.states:
  97. yield state
  98. @staticmethod
  99. def within(value):
  100. '''Is the value in our list of states?'''
  101. return value in States.states
  102. @staticmethod
  103. def length():
  104. '''What is the length of our contained values?'''
  105. return len(States.states)
  106. def random_string():
  107. '''Generate a random, 10 character string - for testing purposes'''
  108. list_ = []
  109. for chrno in range(10):
  110. dummy = chrno
  111. character = CHARACTERS[int(random.random() * len(CHARACTERS))]
  112. list_.append(character)
  113. return ''.join(list_)
  114. class Random_content(object):
  115. '''Generated a bunch of random strings in sorted order'''
  116. random_content = [random_string() for dummy in range(1000)]
  117. def __init__(self):
  118. pass
  119. @staticmethod
  120. def generator():
  121. '''Generate all values'''
  122. for item in Random_content.random_content:
  123. yield item
  124. @staticmethod
  125. def within(value):
  126. '''Test for membership'''
  127. return value in Random_content.random_content
  128. @staticmethod
  129. def length():
  130. '''How many members?'''
  131. return len(Random_content.random_content)
  132. class Evens(object):
  133. '''Generate a bunch of even numbers'''
  134. def __init__(self, maximum):
  135. self.maximum = maximum
  136. def generator(self):
  137. '''Generate all values'''
  138. for value in my_range(self.maximum):
  139. if value % 2 == 0:
  140. yield str(value)
  141. def within(self, value):
  142. '''Test for membership'''
  143. try:
  144. int_value = int(value)
  145. except ValueError:
  146. return False
  147. if int_value >= 0 and int_value < self.maximum and int_value % 2 == 0:
  148. return True
  149. else:
  150. return False
  151. def length(self):
  152. '''How many members?'''
  153. return int(math.ceil(self.maximum / 2.0))
  154. def and_test():
  155. '''Test the & operator'''
  156. all_good = True
  157. abc = bloom_filter_mod.Bloom_filter(ideal_num_elements_n=100, error_rate_p=0.01)
  158. for character in ['a', 'b', 'c']:
  159. abc += character
  160. bcd = bloom_filter_mod.Bloom_filter(ideal_num_elements_n=100, error_rate_p=0.01)
  161. for character in ['b', 'c', 'd']:
  162. bcd += character
  163. abc_and_bcd = abc
  164. abc_and_bcd &= bcd
  165. if 'a' in abc_and_bcd:
  166. sys.stderr.write('a in abc_and_bcd, but should not be')
  167. all_good = False
  168. if not 'b' in abc_and_bcd:
  169. sys.stderr.write('b not in abc_and_bcd, but should be')
  170. all_good = False
  171. if not 'c' in abc_and_bcd:
  172. sys.stderr.write('c not in abc_and_bcd, but should be')
  173. all_good = False
  174. if 'd' in abc_and_bcd:
  175. sys.stderr.write('d in abc_and_bcd, but should not be')
  176. all_good = False
  177. return all_good
  178. def or_test():
  179. '''Test the | operator'''
  180. all_good = True
  181. abc = bloom_filter_mod.Bloom_filter(ideal_num_elements_n=100, error_rate_p=0.01)
  182. for character in ['a', 'b', 'c']:
  183. abc += character
  184. bcd = bloom_filter_mod.Bloom_filter(ideal_num_elements_n=100, error_rate_p=0.01)
  185. for character in ['b', 'c', 'd']:
  186. bcd += character
  187. abc_and_bcd = abc
  188. abc_and_bcd |= bcd
  189. if not 'a' in abc_and_bcd:
  190. sys.stderr.write('a not in abc_and_bcd, but should be')
  191. all_good = False
  192. if not 'b' in abc_and_bcd:
  193. sys.stderr.write('b not in abc_and_bcd, but should be')
  194. all_good = False
  195. if not 'c' in abc_and_bcd:
  196. sys.stderr.write('c not in abc_and_bcd, but should be')
  197. all_good = False
  198. if not 'd' in abc_and_bcd:
  199. sys.stderr.write('d not in abc_and_bcd, but should be')
  200. all_good = False
  201. if 'e' in abc_and_bcd:
  202. sys.stderr.write('e in abc_and_bcd, but should not be')
  203. all_good = False
  204. return all_good
  205. def give_description(filename):
  206. '''Return a description of the filename type - could be array, file or hybrid'''
  207. if filename is None:
  208. return 'array'
  209. elif isinstance(filename, tuple):
  210. if filename[1] == -1:
  211. return 'mmap'
  212. else:
  213. return 'hybrid'
  214. else:
  215. return 'seek'
  216. def main():
  217. '''Unit tests for Bloom_filter class'''
  218. if sys.argv[1:] == ['--performance-test']:
  219. performance_test = True
  220. else:
  221. performance_test = False
  222. all_good = True
  223. all_good &= test('states', States(), trials=100000, error_rate=0.01)
  224. all_good &= test('random', Random_content(), trials=10000, error_rate=0.1)
  225. all_good &= test('random', Random_content(), trials=10000, error_rate=0.1, probe_bitnoer=bloom_filter_mod.get_bitno_seed_rnd)
  226. filename = 'bloom-filter-rm-me'
  227. all_good &= test('random', Random_content(), trials=10000, error_rate=0.1, filename=filename)
  228. all_good &= and_test()
  229. all_good &= or_test()
  230. if performance_test:
  231. sqrt_of_10 = math.sqrt(10)
  232. #for exponent in range(5): # this is a lot, but probably not unreasonable
  233. for exponent in range(19): # this is a lot, but probably not unreasonable
  234. elements = int(sqrt_of_10 ** exponent + 0.5)
  235. for filename in [None, 'bloom-filter-rm-me', ('bloom-filter-rm-me', 768 * 2**20), ('bloom-filter-rm-me', -1)]:
  236. description = give_description(filename)
  237. key = '%s %s' % (description, elements)
  238. database = anydbm.open('performance-numbers', 'c')
  239. if key in database.keys():
  240. database.close()
  241. continue
  242. if elements >= 100000000 and description == 'seek':
  243. continue
  244. if elements >= 100000000 and description == 'mmap':
  245. continue
  246. if elements >= 1000000000 and description == 'array':
  247. continue
  248. time0 = time.time()
  249. all_good &= test(
  250. 'evens %s elements: %d' % (give_description(filename), elements),
  251. Evens(elements),
  252. trials=elements,
  253. error_rate=1e-2,
  254. filename=filename,
  255. )
  256. time1 = time.time()
  257. delta_t = time1 - time0
  258. #file_ = open('%s.txt' % description, 'a')
  259. #file_.write('%d %f\n' % (elements, delta_t))
  260. #file_.close()
  261. database = anydbm.open('performance-numbers', 'c')
  262. database[key] = '%f' % delta_t
  263. database.close()
  264. if all_good:
  265. sys.stderr.write('%s: All tests passed\n' % sys.argv[0])
  266. sys.exit(0)
  267. else:
  268. sys.stderr.write('%s: One or more tests failed\n' % sys.argv[0])
  269. sys.exit(1)
  270. main()