test-bloom-filter 6.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259
  1. #!/usr/local/pypy-1.6/bin/pypy
  2. # pylint: disable=W0402
  3. # W0402: We want the deprecated string module, for a use that isn't deprecated
  4. '''Unit tests for bloom_filter_mod'''
  5. import sys
  6. import math
  7. import random
  8. import bloom_filter_mod
  9. CHARACTERS = 'abcdefghijklmnopqrstuvwxyz1234567890'
  10. def my_range(maximum):
  11. '''A range function with consistent semantics on 2.x and 3.x'''
  12. value = 0
  13. while True:
  14. if value >= maximum:
  15. break
  16. yield value
  17. value += 1
  18. def test(description, values, trials, error_rate, probe_bitnoer=bloom_filter_mod.get_bitno_lin_comb):
  19. '''Some quick automatic tests for the bloom filter class'''
  20. all_good = True
  21. bloom_filter = bloom_filter_mod.Bloom_filter(ideal_num_elements_n=trials * 2, error_rate_p=error_rate, probe_bitnoer=probe_bitnoer)
  22. #print(repr(bloom_filter))
  23. sys.stdout.write('\ndescription: %s num_bits_m: %s num_probes_k: %s\n' %
  24. (description, bloom_filter.num_bits_m, bloom_filter.num_probes_k))
  25. print('adding')
  26. for include in values.generator():
  27. bloom_filter.add(include)
  28. print('testing all known members')
  29. include_in_count = sum(include in bloom_filter for include in values.generator())
  30. if include_in_count == values.length():
  31. # Good
  32. pass
  33. else:
  34. sys.stderr.write('Include count bad: %s, %d\n' % (include_in_count, values.length()))
  35. all_good = False
  36. print('testing random non-members')
  37. false_positives = 0
  38. for trialno in my_range(trials):
  39. if trialno % 100000 == 0:
  40. sys.stderr.write('trialno countdown: %d\n' % (trials-trialno))
  41. #dummy = trialno
  42. while True:
  43. candidate = ''.join(random.sample(CHARACTERS, 5))
  44. # If we accidentally found a member, try again
  45. if values.within(candidate):
  46. continue
  47. if candidate in bloom_filter:
  48. #print 'We erroneously think %s is in the filter' % candidate
  49. false_positives += 1
  50. break
  51. actual_error_rate = float(false_positives) / trials
  52. if actual_error_rate > error_rate:
  53. sys.stderr.write('%s: Too many false positives: actual: %s, expected: %s\n' % (
  54. sys.argv[0],
  55. actual_error_rate,
  56. error_rate,
  57. ))
  58. all_good = False
  59. return all_good
  60. class States:
  61. '''Generate the USA's state names'''
  62. def __init__(self):
  63. pass
  64. states = '''Alabama Alaska Arizona Arkansas California Colorado Connecticut
  65. Delaware Florida Georgia Hawaii Idaho Illinois Indiana Iowa Kansas
  66. Kentucky Louisiana Maine Maryland Massachusetts Michigan Minnesota
  67. Mississippi Missouri Montana Nebraska Nevada NewHampshire NewJersey
  68. NewMexico NewYork NorthCarolina NorthDakota Ohio Oklahoma Oregon
  69. Pennsylvania RhodeIsland SouthCarolina SouthDakota Tennessee Texas Utah
  70. Vermont Virginia Washington WestVirginia Wisconsin Wyoming'''.split()
  71. @staticmethod
  72. def generator():
  73. '''Generate the states'''
  74. for state in States.states:
  75. yield state
  76. @staticmethod
  77. def within(value):
  78. '''Is the vaoue in our list of states?'''
  79. return value in States.states
  80. @staticmethod
  81. def length():
  82. '''What is the length of our contained values?'''
  83. return len(States.states)
  84. def random_string():
  85. '''Generate a random, 10 character string - for testing purposes'''
  86. list_ = []
  87. for chrno in range(10):
  88. dummy = chrno
  89. character = CHARACTERS[int(random.random() * len(CHARACTERS))]
  90. list_.append(character)
  91. return ''.join(list_)
  92. class Random_content:
  93. '''Generated a bunch of random strings in sorted order'''
  94. random_content = [ random_string() for dummy in range(1000) ]
  95. def __init__(self):
  96. pass
  97. @staticmethod
  98. def generator():
  99. '''Generate all values'''
  100. for item in Random_content.random_content:
  101. yield item
  102. @staticmethod
  103. def within(value):
  104. '''Test for membership'''
  105. return value in Random_content.random_content
  106. @staticmethod
  107. def length():
  108. '''How many members?'''
  109. return len(Random_content.random_content)
  110. class Evens:
  111. '''Generate a bunch of even numbers'''
  112. def __init__(self, maximum):
  113. self.maximum = maximum
  114. def generator(self):
  115. '''Generate all values'''
  116. for value in my_range(self.maximum):
  117. if value % 2 == 0:
  118. yield str(value)
  119. def within(self, value):
  120. '''Test for membership'''
  121. try:
  122. int_value = int(value)
  123. except ValueError:
  124. return False
  125. if int_value >= 0 and int_value < self.maximum and int_value % 2 == 0:
  126. return True
  127. else:
  128. return False
  129. def length(self):
  130. '''How many members?'''
  131. return int(math.ceil(self.maximum / 2.0))
  132. def and_test():
  133. '''Test the & operator'''
  134. all_good = True
  135. abc = bloom_filter_mod.Bloom_filter(ideal_num_elements_n=100, error_rate_p=0.01)
  136. for character in [ 'a', 'b', 'c' ]:
  137. abc += character
  138. bcd = bloom_filter_mod.Bloom_filter(ideal_num_elements_n=100, error_rate_p=0.01)
  139. for character in [ 'b', 'c', 'd' ]:
  140. bcd += character
  141. abc_and_bcd = abc
  142. abc_and_bcd &= bcd
  143. if 'a' in abc_and_bcd:
  144. sys.stderr.write('a in abc_and_bcd, but should not be')
  145. all_good = False
  146. if not 'b' in abc_and_bcd:
  147. sys.stderr.write('b not in abc_and_bcd, but should be')
  148. all_good = False
  149. if not 'c' in abc_and_bcd:
  150. sys.stderr.write('c not in abc_and_bcd, but should be')
  151. all_good = False
  152. if 'd' in abc_and_bcd:
  153. sys.stderr.write('d in abc_and_bcd, but should not be')
  154. all_good = False
  155. return all_good
  156. def or_test():
  157. '''Test the | operator'''
  158. all_good = True
  159. abc = bloom_filter_mod.Bloom_filter(ideal_num_elements_n=100, error_rate_p=0.01)
  160. for character in [ 'a', 'b', 'c' ]:
  161. abc += character
  162. bcd = bloom_filter_mod.Bloom_filter(ideal_num_elements_n=100, error_rate_p=0.01)
  163. for character in [ 'b', 'c', 'd' ]:
  164. bcd += character
  165. abc_and_bcd = abc
  166. abc_and_bcd |= bcd
  167. if not 'a' in abc_and_bcd:
  168. sys.stderr.write('a not in abc_and_bcd, but should be')
  169. all_good = False
  170. if not 'b' in abc_and_bcd:
  171. sys.stderr.write('b not in abc_and_bcd, but should be')
  172. all_good = False
  173. if not 'c' in abc_and_bcd:
  174. sys.stderr.write('c not in abc_and_bcd, but should be')
  175. all_good = False
  176. if not 'd' in abc_and_bcd:
  177. sys.stderr.write('d not in abc_and_bcd, but should be')
  178. all_good = False
  179. if 'e' in abc_and_bcd:
  180. sys.stderr.write('e in abc_and_bcd, but should be')
  181. all_good = False
  182. return all_good
  183. def main():
  184. '''Unit tests for Bloom_filter class'''
  185. all_good = True
  186. all_good &= test('states', States(), trials=100000, error_rate=0.01)
  187. all_good &= test('random', Random_content(), trials=10000, error_rate=0.1)
  188. all_good &= test('random', Random_content(), trials=10000, error_rate=0.1, probe_bitnoer=bloom_filter_mod.get_bitno_seed_rnd)
  189. #for elements in [ 1, 10, 100, 1000 ]:
  190. for elements in [ 1, 10, 100, 1000, 10000, 100000, 1000000 ]:
  191. all_good &= test('evens %d' % elements, Evens(elements), trials=elements, error_rate=1e-12)
  192. all_good &= and_test()
  193. all_good &= or_test()
  194. if all_good:
  195. sys.stderr.write('%s: All tests passed\n' % sys.argv[0])
  196. sys.exit(0)
  197. else:
  198. sys.stderr.write('%s: One or more tests failed\n' % sys.argv[0])
  199. sys.exit(1)
  200. main()