test-bloom-filter 8.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330
  1. #!/usr/bin/python
  2. # pylint: disable=W0402,W0404
  3. # W0402: We want the deprecated string module, for a use that isn't deprecated
  4. # W0404: We need to import anydbm or dbm
  5. '''Unit tests for bloom_filter_mod'''
  6. #mport os
  7. import sys
  8. import math
  9. import time
  10. try:
  11. import anydbm
  12. except ImportError:
  13. import dbm as anydbm
  14. import random
  15. import bloom_filter_mod
  16. CHARACTERS = 'abcdefghijklmnopqrstuvwxyz1234567890'
  17. def my_range(maximum):
  18. '''A range function with consistent semantics on 2.x and 3.x'''
  19. value = 0
  20. while True:
  21. if value >= maximum:
  22. break
  23. yield value
  24. value += 1
  25. def test(description, values, trials, error_rate, probe_bitnoer=bloom_filter_mod.get_bitno_lin_comb, filename=None):
  26. # pylint: disable=R0913,R0914
  27. # R0913: We want a few arguments
  28. # R0914: We want some local variables too. This is just test code.
  29. '''Some quick automatic tests for the bloom filter class'''
  30. all_good = True
  31. divisor = 100000
  32. bloom_filter = bloom_filter_mod.Bloom_filter(
  33. ideal_num_elements_n=trials * 2,
  34. error_rate_p=error_rate,
  35. probe_bitnoer=probe_bitnoer,
  36. filename=filename,
  37. start_fresh=True,
  38. )
  39. sys.stdout.write('\ndescription: %s num_bits_m: %s num_probes_k: %s\n' %
  40. (description, bloom_filter.num_bits_m, bloom_filter.num_probes_k))
  41. print('starting to add values to an empty bloom filter')
  42. for valueno, value in enumerate(values.generator()):
  43. reverse_valueno = values.length() - valueno
  44. if reverse_valueno % divisor == 0:
  45. print('adding valueno %d' % reverse_valueno)
  46. bloom_filter.add(value)
  47. print('testing all known members')
  48. include_in_count = sum(include in bloom_filter for include in values.generator())
  49. if include_in_count == values.length():
  50. # Good
  51. pass
  52. else:
  53. sys.stderr.write('Include count bad: %s, %d\n' % (include_in_count, values.length()))
  54. all_good = False
  55. print('testing random non-members')
  56. false_positives = 0
  57. for trialno in my_range(trials):
  58. if trialno % divisor == 0:
  59. sys.stderr.write('trialno countdown: %d\n' % (trials-trialno))
  60. while True:
  61. candidate = ''.join(random.sample(CHARACTERS, 5))
  62. # If we accidentally found a member, try again
  63. if values.within(candidate):
  64. continue
  65. if candidate in bloom_filter:
  66. #print 'We erroneously think %s is in the filter' % candidate
  67. false_positives += 1
  68. break
  69. actual_error_rate = float(false_positives) / trials
  70. if actual_error_rate > error_rate:
  71. sys.stderr.write('%s: Too many false positives: actual: %s, expected: %s\n' % (
  72. sys.argv[0],
  73. actual_error_rate,
  74. error_rate,
  75. ))
  76. all_good = False
  77. return all_good
  78. class States:
  79. '''Generate the USA's state names'''
  80. def __init__(self):
  81. pass
  82. states = '''Alabama Alaska Arizona Arkansas California Colorado Connecticut
  83. Delaware Florida Georgia Hawaii Idaho Illinois Indiana Iowa Kansas
  84. Kentucky Louisiana Maine Maryland Massachusetts Michigan Minnesota
  85. Mississippi Missouri Montana Nebraska Nevada NewHampshire NewJersey
  86. NewMexico NewYork NorthCarolina NorthDakota Ohio Oklahoma Oregon
  87. Pennsylvania RhodeIsland SouthCarolina SouthDakota Tennessee Texas Utah
  88. Vermont Virginia Washington WestVirginia Wisconsin Wyoming'''.split()
  89. @staticmethod
  90. def generator():
  91. '''Generate the states'''
  92. for state in States.states:
  93. yield state
  94. @staticmethod
  95. def within(value):
  96. '''Is the vaoue in our list of states?'''
  97. return value in States.states
  98. @staticmethod
  99. def length():
  100. '''What is the length of our contained values?'''
  101. return len(States.states)
  102. def random_string():
  103. '''Generate a random, 10 character string - for testing purposes'''
  104. list_ = []
  105. for chrno in range(10):
  106. dummy = chrno
  107. character = CHARACTERS[int(random.random() * len(CHARACTERS))]
  108. list_.append(character)
  109. return ''.join(list_)
  110. class Random_content:
  111. '''Generated a bunch of random strings in sorted order'''
  112. random_content = [ random_string() for dummy in range(1000) ]
  113. def __init__(self):
  114. pass
  115. @staticmethod
  116. def generator():
  117. '''Generate all values'''
  118. for item in Random_content.random_content:
  119. yield item
  120. @staticmethod
  121. def within(value):
  122. '''Test for membership'''
  123. return value in Random_content.random_content
  124. @staticmethod
  125. def length():
  126. '''How many members?'''
  127. return len(Random_content.random_content)
  128. class Evens:
  129. '''Generate a bunch of even numbers'''
  130. def __init__(self, maximum):
  131. self.maximum = maximum
  132. def generator(self):
  133. '''Generate all values'''
  134. for value in my_range(self.maximum):
  135. if value % 2 == 0:
  136. yield str(value)
  137. def within(self, value):
  138. '''Test for membership'''
  139. try:
  140. int_value = int(value)
  141. except ValueError:
  142. return False
  143. if int_value >= 0 and int_value < self.maximum and int_value % 2 == 0:
  144. return True
  145. else:
  146. return False
  147. def length(self):
  148. '''How many members?'''
  149. return int(math.ceil(self.maximum / 2.0))
  150. def and_test():
  151. '''Test the & operator'''
  152. all_good = True
  153. abc = bloom_filter_mod.Bloom_filter(ideal_num_elements_n=100, error_rate_p=0.01)
  154. for character in [ 'a', 'b', 'c' ]:
  155. abc += character
  156. bcd = bloom_filter_mod.Bloom_filter(ideal_num_elements_n=100, error_rate_p=0.01)
  157. for character in [ 'b', 'c', 'd' ]:
  158. bcd += character
  159. abc_and_bcd = abc
  160. abc_and_bcd &= bcd
  161. if 'a' in abc_and_bcd:
  162. sys.stderr.write('a in abc_and_bcd, but should not be')
  163. all_good = False
  164. if not 'b' in abc_and_bcd:
  165. sys.stderr.write('b not in abc_and_bcd, but should be')
  166. all_good = False
  167. if not 'c' in abc_and_bcd:
  168. sys.stderr.write('c not in abc_and_bcd, but should be')
  169. all_good = False
  170. if 'd' in abc_and_bcd:
  171. sys.stderr.write('d in abc_and_bcd, but should not be')
  172. all_good = False
  173. return all_good
  174. def or_test():
  175. '''Test the | operator'''
  176. all_good = True
  177. abc = bloom_filter_mod.Bloom_filter(ideal_num_elements_n=100, error_rate_p=0.01)
  178. for character in [ 'a', 'b', 'c' ]:
  179. abc += character
  180. bcd = bloom_filter_mod.Bloom_filter(ideal_num_elements_n=100, error_rate_p=0.01)
  181. for character in [ 'b', 'c', 'd' ]:
  182. bcd += character
  183. abc_and_bcd = abc
  184. abc_and_bcd |= bcd
  185. if not 'a' in abc_and_bcd:
  186. sys.stderr.write('a not in abc_and_bcd, but should be')
  187. all_good = False
  188. if not 'b' in abc_and_bcd:
  189. sys.stderr.write('b not in abc_and_bcd, but should be')
  190. all_good = False
  191. if not 'c' in abc_and_bcd:
  192. sys.stderr.write('c not in abc_and_bcd, but should be')
  193. all_good = False
  194. if not 'd' in abc_and_bcd:
  195. sys.stderr.write('d not in abc_and_bcd, but should be')
  196. all_good = False
  197. if 'e' in abc_and_bcd:
  198. sys.stderr.write('e in abc_and_bcd, but should be')
  199. all_good = False
  200. return all_good
  201. def give_description(filename):
  202. '''Return a description of the filename type - could be array, file or hybrid'''
  203. if filename is None:
  204. return 'array'
  205. elif isinstance(filename, tuple):
  206. if filename[1] == -1:
  207. return 'mmap'
  208. else:
  209. return 'hybrid'
  210. else:
  211. return 'seek'
  212. def main():
  213. '''Unit tests for Bloom_filter class'''
  214. if sys.argv[1:] == [ '--performance-test' ]:
  215. performance_test = True
  216. else:
  217. performance_test = False
  218. all_good = True
  219. all_good &= test('states', States(), trials=100000, error_rate=0.01)
  220. all_good &= test('random', Random_content(), trials=10000, error_rate=0.1)
  221. all_good &= test('random', Random_content(), trials=10000, error_rate=0.1, probe_bitnoer=bloom_filter_mod.get_bitno_seed_rnd)
  222. filename = 'bloom-filter-rm-me'
  223. all_good &= test('random', Random_content(), trials=10000, error_rate=0.1, filename=filename)
  224. all_good &= and_test()
  225. all_good &= or_test()
  226. if performance_test:
  227. sqrt_of_10 = math.sqrt(10)
  228. #for exponent in range(5): # this is a lot, but probably not unreasonable
  229. for exponent in range(19): # this is a lot, but probably not unreasonable
  230. elements = int(sqrt_of_10 ** exponent + 0.5)
  231. for filename in [ None, 'bloom-filter-rm-me', ('bloom-filter-rm-me', 768 * 2**20), ('bloom-filter-rm-me', -1) ]:
  232. description = give_description(filename)
  233. key = '%s %s' % (description, elements)
  234. database = anydbm.open('performance-numbers', 'c')
  235. if key in database:
  236. database.close()
  237. continue
  238. if elements >= 100000000 and description == 'seek':
  239. continue
  240. if elements >= 100000000 and description == 'mmap':
  241. continue
  242. if elements >= 1000000000 and description == 'array':
  243. continue
  244. time0 = time.time()
  245. all_good &= test(
  246. 'evens %s elements: %d' % (give_description(filename), elements),
  247. Evens(elements),
  248. trials=elements,
  249. error_rate=1e-2,
  250. filename=filename,
  251. )
  252. time1 = time.time()
  253. delta_t = time1 - time0
  254. #file_ = open('%s.txt' % description, 'a')
  255. #file_.write('%d %f\n' % (elements, delta_t))
  256. #file_.close()
  257. database = anydbm.open('performance-numbers', 'c')
  258. database[key] = '%f' % delta_t
  259. database.close()
  260. if all_good:
  261. sys.stderr.write('%s: All tests passed\n' % sys.argv[0])
  262. sys.exit(0)
  263. else:
  264. sys.stderr.write('%s: One or more tests failed\n' % sys.argv[0])
  265. sys.exit(1)
  266. main()