test-bloom-filter 8.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325
  1. #!/usr/bin/python
  2. # pylint: disable=W0402
  3. # W0402: We want the deprecated string module, for a use that isn't deprecated
  4. '''Unit tests for bloom_filter_mod'''
  5. #mport os
  6. import sys
  7. import math
  8. import time
  9. import anydbm
  10. import random
  11. import bloom_filter_mod
  12. CHARACTERS = 'abcdefghijklmnopqrstuvwxyz1234567890'
  13. def my_range(maximum):
  14. '''A range function with consistent semantics on 2.x and 3.x'''
  15. value = 0
  16. while True:
  17. if value >= maximum:
  18. break
  19. yield value
  20. value += 1
  21. def test(description, values, trials, error_rate, probe_bitnoer=bloom_filter_mod.get_bitno_lin_comb, filename=None):
  22. # pylint: disable=R0913,R0914
  23. # R0913: We want a few arguments
  24. # R0914: We want some local variables too. This is just test code.
  25. '''Some quick automatic tests for the bloom filter class'''
  26. all_good = True
  27. divisor = 100000
  28. bloom_filter = bloom_filter_mod.Bloom_filter(
  29. ideal_num_elements_n=trials * 2,
  30. error_rate_p=error_rate,
  31. probe_bitnoer=probe_bitnoer,
  32. filename=filename,
  33. start_fresh=True,
  34. )
  35. sys.stdout.write('\ndescription: %s num_bits_m: %s num_probes_k: %s\n' %
  36. (description, bloom_filter.num_bits_m, bloom_filter.num_probes_k))
  37. print('starting to add values to an empty bloom filter')
  38. for valueno, value in enumerate(values.generator()):
  39. reverse_valueno = values.length() - valueno
  40. if reverse_valueno % divisor == 0:
  41. print('adding valueno %d' % reverse_valueno)
  42. bloom_filter.add(value)
  43. print('testing all known members')
  44. include_in_count = sum(include in bloom_filter for include in values.generator())
  45. if include_in_count == values.length():
  46. # Good
  47. pass
  48. else:
  49. sys.stderr.write('Include count bad: %s, %d\n' % (include_in_count, values.length()))
  50. all_good = False
  51. print('testing random non-members')
  52. false_positives = 0
  53. for trialno in my_range(trials):
  54. if trialno % divisor == 0:
  55. sys.stderr.write('trialno countdown: %d\n' % (trials-trialno))
  56. while True:
  57. candidate = ''.join(random.sample(CHARACTERS, 5))
  58. # If we accidentally found a member, try again
  59. if values.within(candidate):
  60. continue
  61. if candidate in bloom_filter:
  62. #print 'We erroneously think %s is in the filter' % candidate
  63. false_positives += 1
  64. break
  65. actual_error_rate = float(false_positives) / trials
  66. if actual_error_rate > error_rate:
  67. sys.stderr.write('%s: Too many false positives: actual: %s, expected: %s\n' % (
  68. sys.argv[0],
  69. actual_error_rate,
  70. error_rate,
  71. ))
  72. all_good = False
  73. return all_good
  74. class States:
  75. '''Generate the USA's state names'''
  76. def __init__(self):
  77. pass
  78. states = '''Alabama Alaska Arizona Arkansas California Colorado Connecticut
  79. Delaware Florida Georgia Hawaii Idaho Illinois Indiana Iowa Kansas
  80. Kentucky Louisiana Maine Maryland Massachusetts Michigan Minnesota
  81. Mississippi Missouri Montana Nebraska Nevada NewHampshire NewJersey
  82. NewMexico NewYork NorthCarolina NorthDakota Ohio Oklahoma Oregon
  83. Pennsylvania RhodeIsland SouthCarolina SouthDakota Tennessee Texas Utah
  84. Vermont Virginia Washington WestVirginia Wisconsin Wyoming'''.split()
  85. @staticmethod
  86. def generator():
  87. '''Generate the states'''
  88. for state in States.states:
  89. yield state
  90. @staticmethod
  91. def within(value):
  92. '''Is the vaoue in our list of states?'''
  93. return value in States.states
  94. @staticmethod
  95. def length():
  96. '''What is the length of our contained values?'''
  97. return len(States.states)
  98. def random_string():
  99. '''Generate a random, 10 character string - for testing purposes'''
  100. list_ = []
  101. for chrno in range(10):
  102. dummy = chrno
  103. character = CHARACTERS[int(random.random() * len(CHARACTERS))]
  104. list_.append(character)
  105. return ''.join(list_)
  106. class Random_content:
  107. '''Generated a bunch of random strings in sorted order'''
  108. random_content = [ random_string() for dummy in range(1000) ]
  109. def __init__(self):
  110. pass
  111. @staticmethod
  112. def generator():
  113. '''Generate all values'''
  114. for item in Random_content.random_content:
  115. yield item
  116. @staticmethod
  117. def within(value):
  118. '''Test for membership'''
  119. return value in Random_content.random_content
  120. @staticmethod
  121. def length():
  122. '''How many members?'''
  123. return len(Random_content.random_content)
  124. class Evens:
  125. '''Generate a bunch of even numbers'''
  126. def __init__(self, maximum):
  127. self.maximum = maximum
  128. def generator(self):
  129. '''Generate all values'''
  130. for value in my_range(self.maximum):
  131. if value % 2 == 0:
  132. yield str(value)
  133. def within(self, value):
  134. '''Test for membership'''
  135. try:
  136. int_value = int(value)
  137. except ValueError:
  138. return False
  139. if int_value >= 0 and int_value < self.maximum and int_value % 2 == 0:
  140. return True
  141. else:
  142. return False
  143. def length(self):
  144. '''How many members?'''
  145. return int(math.ceil(self.maximum / 2.0))
  146. def and_test():
  147. '''Test the & operator'''
  148. all_good = True
  149. abc = bloom_filter_mod.Bloom_filter(ideal_num_elements_n=100, error_rate_p=0.01)
  150. for character in [ 'a', 'b', 'c' ]:
  151. abc += character
  152. bcd = bloom_filter_mod.Bloom_filter(ideal_num_elements_n=100, error_rate_p=0.01)
  153. for character in [ 'b', 'c', 'd' ]:
  154. bcd += character
  155. abc_and_bcd = abc
  156. abc_and_bcd &= bcd
  157. if 'a' in abc_and_bcd:
  158. sys.stderr.write('a in abc_and_bcd, but should not be')
  159. all_good = False
  160. if not 'b' in abc_and_bcd:
  161. sys.stderr.write('b not in abc_and_bcd, but should be')
  162. all_good = False
  163. if not 'c' in abc_and_bcd:
  164. sys.stderr.write('c not in abc_and_bcd, but should be')
  165. all_good = False
  166. if 'd' in abc_and_bcd:
  167. sys.stderr.write('d in abc_and_bcd, but should not be')
  168. all_good = False
  169. return all_good
  170. def or_test():
  171. '''Test the | operator'''
  172. all_good = True
  173. abc = bloom_filter_mod.Bloom_filter(ideal_num_elements_n=100, error_rate_p=0.01)
  174. for character in [ 'a', 'b', 'c' ]:
  175. abc += character
  176. bcd = bloom_filter_mod.Bloom_filter(ideal_num_elements_n=100, error_rate_p=0.01)
  177. for character in [ 'b', 'c', 'd' ]:
  178. bcd += character
  179. abc_and_bcd = abc
  180. abc_and_bcd |= bcd
  181. if not 'a' in abc_and_bcd:
  182. sys.stderr.write('a not in abc_and_bcd, but should be')
  183. all_good = False
  184. if not 'b' in abc_and_bcd:
  185. sys.stderr.write('b not in abc_and_bcd, but should be')
  186. all_good = False
  187. if not 'c' in abc_and_bcd:
  188. sys.stderr.write('c not in abc_and_bcd, but should be')
  189. all_good = False
  190. if not 'd' in abc_and_bcd:
  191. sys.stderr.write('d not in abc_and_bcd, but should be')
  192. all_good = False
  193. if 'e' in abc_and_bcd:
  194. sys.stderr.write('e in abc_and_bcd, but should be')
  195. all_good = False
  196. return all_good
  197. def give_description(filename):
  198. '''Return a description of the filename type - could be array, file or hybrid'''
  199. if filename is None:
  200. return 'array'
  201. elif isinstance(filename, tuple):
  202. if filename[1] == -1:
  203. return 'mmap'
  204. else:
  205. return 'hybrid'
  206. else:
  207. return 'seek'
  208. def main():
  209. '''Unit tests for Bloom_filter class'''
  210. if sys.argv[1:] == [ '--performance-test' ]:
  211. performance_test = True
  212. else:
  213. performance_test = False
  214. all_good = True
  215. all_good &= test('states', States(), trials=100000, error_rate=0.01)
  216. all_good &= test('random', Random_content(), trials=10000, error_rate=0.1)
  217. all_good &= test('random', Random_content(), trials=10000, error_rate=0.1, probe_bitnoer=bloom_filter_mod.get_bitno_seed_rnd)
  218. filename = 'bloom-filter-rm-me'
  219. all_good &= test('random', Random_content(), trials=10000, error_rate=0.1, filename=filename)
  220. all_good &= and_test()
  221. all_good &= or_test()
  222. if performance_test:
  223. sqrt_of_10 = math.sqrt(10)
  224. #for exponent in range(5): # this is a lot, but probably not unreasonable
  225. for exponent in range(19): # this is a lot, but probably not unreasonable
  226. elements = int(sqrt_of_10 ** exponent + 0.5)
  227. for filename in [ None, 'bloom-filter-rm-me', ('bloom-filter-rm-me', 768 * 2**20), ('bloom-filter-rm-me', -1) ]:
  228. description = give_description(filename)
  229. key = '%s %s' % (description, elements)
  230. database = anydbm.open('performance-numbers', 'c')
  231. if key in database:
  232. database.close()
  233. continue
  234. if elements >= 100000000 and description == 'seek':
  235. continue
  236. if elements >= 100000000 and description == 'mmap':
  237. continue
  238. if elements >= 1000000000 and description == 'array':
  239. continue
  240. time0 = time.time()
  241. all_good &= test(
  242. 'evens %s elements: %d' % (give_description(filename), elements),
  243. Evens(elements),
  244. trials=elements,
  245. error_rate=1e-2,
  246. filename=filename,
  247. )
  248. time1 = time.time()
  249. delta_t = time1 - time0
  250. #file_ = open('%s.txt' % description, 'a')
  251. #file_.write('%d %f\n' % (elements, delta_t))
  252. #file_.close()
  253. database = anydbm.open('performance-numbers', 'c')
  254. database[key] = '%f' % delta_t
  255. database.close()
  256. if all_good:
  257. sys.stderr.write('%s: All tests passed\n' % sys.argv[0])
  258. sys.exit(0)
  259. else:
  260. sys.stderr.write('%s: One or more tests failed\n' % sys.argv[0])
  261. sys.exit(1)
  262. main()