test-bloom-filter 6.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256
  1. #!/usr/bin/python
  2. # pylint: disable=W0402
  3. # W0402: We want the deprecated string module, for a use that isn't deprecated
  4. '''Unit tests for bloom_filter_mod'''
  5. import sys
  6. import random
  7. import bloom_filter_mod
  8. CHARACTERS = 'abcdefghijklmnopqrstuvwxyz1234567890'
  9. def my_range(maximum):
  10. '''A range function with consistent semantics on 2.x and 3.x'''
  11. value = 0
  12. while True:
  13. if value >= maximum:
  14. break
  15. yield value
  16. value += 1
  17. def test(description, values, trials, error_rate):
  18. '''Some quick automatic tests for the bloom filter class'''
  19. all_good = True
  20. bloom_filter = bloom_filter_mod.Bloom_filter(ideal_num_elements_n=trials * 2, error_rate_p=error_rate)
  21. #print(repr(bloom_filter))
  22. print '\n', description, bloom_filter.num_words, bloom_filter.num_probes_k
  23. print 'adding'
  24. for include in values.generator():
  25. bloom_filter.add(include)
  26. print 'testing all known members'''
  27. include_in_count = sum(include in bloom_filter for include in values.generator())
  28. if include_in_count == values.length():
  29. # Good
  30. pass
  31. else:
  32. sys.stderr.write('Include count bad: %s, %d\n' % (include_in_count, values.length()))
  33. all_good = False
  34. print 'testing random non-members'''
  35. false_positives = 0
  36. for trialno in my_range(trials):
  37. if trialno % 100000 == 0:
  38. sys.stderr.write('trialno countdown: %d\n' % (trials-trialno))
  39. #dummy = trialno
  40. while True:
  41. candidate = ''.join(random.sample(CHARACTERS, 5))
  42. # If we accidentally found a member, try again
  43. if values.within(candidate):
  44. continue
  45. if candidate in bloom_filter:
  46. #print 'We erroneously think %s is in the filter' % candidate
  47. false_positives += 1
  48. break
  49. actual_error_rate = float(false_positives) / trials
  50. if actual_error_rate > error_rate:
  51. sys.stderr.write('%s: Too many false positives: actual: %s, expected: %s\n' % (
  52. sys.argv[0],
  53. actual_error_rate,
  54. error_rate,
  55. ))
  56. all_good = False
  57. return all_good
  58. class States:
  59. '''Generate the USA's state names'''
  60. def __init__(self):
  61. pass
  62. states = '''Alabama Alaska Arizona Arkansas California Colorado Connecticut
  63. Delaware Florida Georgia Hawaii Idaho Illinois Indiana Iowa Kansas
  64. Kentucky Louisiana Maine Maryland Massachusetts Michigan Minnesota
  65. Mississippi Missouri Montana Nebraska Nevada NewHampshire NewJersey
  66. NewMexico NewYork NorthCarolina NorthDakota Ohio Oklahoma Oregon
  67. Pennsylvania RhodeIsland SouthCarolina SouthDakota Tennessee Texas Utah
  68. Vermont Virginia Washington WestVirginia Wisconsin Wyoming'''.split()
  69. @staticmethod
  70. def generator():
  71. '''Generate the states'''
  72. for state in States.states:
  73. yield state
  74. @staticmethod
  75. def within(value):
  76. '''Is the vaoue in our list of states?'''
  77. return value in States.states
  78. @staticmethod
  79. def length():
  80. '''What is the length of our contained values?'''
  81. return len(States.states)
  82. def random_string():
  83. '''Generate a random, 10 character string - for testing purposes'''
  84. list_ = []
  85. for chrno in range(10):
  86. dummy = chrno
  87. character = CHARACTERS[int(random.random() * len(CHARACTERS))]
  88. list_.append(character)
  89. return ''.join(list_)
  90. class Random_content:
  91. '''Generated a bunch of random strings in sorted order'''
  92. random_content = [ random_string() for dummy in range(1000) ]
  93. def __init__(self):
  94. pass
  95. @staticmethod
  96. def generator():
  97. '''Generate all values'''
  98. for item in Random_content.random_content:
  99. yield item
  100. @staticmethod
  101. def within(value):
  102. '''Test for membership'''
  103. return value in Random_content.random_content
  104. @staticmethod
  105. def length():
  106. '''How many members?'''
  107. return len(Random_content.random_content)
  108. class Evens:
  109. '''Generate a bunch of even numbers'''
  110. def __init__(self, maximum):
  111. self.maximum = maximum
  112. def generator(self):
  113. '''Generate all values'''
  114. for value in my_range(self.maximum):
  115. if value % 2 == 0:
  116. yield str(value)
  117. def within(self, value):
  118. '''Test for membership'''
  119. try:
  120. int_value = int(value)
  121. except ValueError:
  122. return False
  123. if int_value >= 0 and int_value < self.maximum and int_value % 2 == 0:
  124. return True
  125. else:
  126. return False
  127. def length(self):
  128. '''How many members?'''
  129. return self.maximum // 2
  130. def and_test():
  131. '''Test the & operator'''
  132. all_good = True
  133. abc = bloom_filter_mod.Bloom_filter(ideal_num_elements_n=100, error_rate_p=0.01)
  134. for character in [ 'a', 'b', 'c' ]:
  135. abc += character
  136. bcd = bloom_filter_mod.Bloom_filter(ideal_num_elements_n=100, error_rate_p=0.01)
  137. for character in [ 'b', 'c', 'd' ]:
  138. bcd += character
  139. abc_and_bcd = abc
  140. abc_and_bcd &= bcd
  141. if 'a' in abc_and_bcd:
  142. sys.stderr.write('a in abc_and_bcd, but should not be')
  143. all_good = False
  144. if not 'b' in abc_and_bcd:
  145. sys.stderr.write('b not in abc_and_bcd, but should be')
  146. all_good = False
  147. if not 'c' in abc_and_bcd:
  148. sys.stderr.write('c not in abc_and_bcd, but should be')
  149. all_good = False
  150. if 'd' in abc_and_bcd:
  151. sys.stderr.write('d in abc_and_bcd, but should not be')
  152. all_good = False
  153. return all_good
  154. def or_test():
  155. '''Test the | operator'''
  156. all_good = True
  157. abc = bloom_filter_mod.Bloom_filter(ideal_num_elements_n=100, error_rate_p=0.01)
  158. for character in [ 'a', 'b', 'c' ]:
  159. abc += character
  160. bcd = bloom_filter_mod.Bloom_filter(ideal_num_elements_n=100, error_rate_p=0.01)
  161. for character in [ 'b', 'c', 'd' ]:
  162. bcd += character
  163. abc_and_bcd = abc
  164. abc_and_bcd |= bcd
  165. if not 'a' in abc_and_bcd:
  166. sys.stderr.write('a not in abc_and_bcd, but should be')
  167. all_good = False
  168. if not 'b' in abc_and_bcd:
  169. sys.stderr.write('b not in abc_and_bcd, but should be')
  170. all_good = False
  171. if not 'c' in abc_and_bcd:
  172. sys.stderr.write('c not in abc_and_bcd, but should be')
  173. all_good = False
  174. if not 'd' in abc_and_bcd:
  175. sys.stderr.write('d not in abc_and_bcd, but should be')
  176. all_good = False
  177. if 'e' in abc_and_bcd:
  178. sys.stderr.write('e in abc_and_bcd, but should be')
  179. all_good = False
  180. return all_good
  181. def main():
  182. '''Unit tests for Bloom_filter class'''
  183. all_good = True
  184. all_good &= test('states', States(), trials=100000, error_rate=0.01)
  185. all_good &= test('random', Random_content(), trials=10000, error_rate=0.1)
  186. #for elements in [ 1, 10, 100, 1000 ]:
  187. for elements in [ 1, 10, 100, 1000, 10000, 100000, 1000000 ]:
  188. all_good &= test('evens %d' % elements, Evens(elements), trials=elements, error_rate=1e-12)
  189. all_good &= and_test()
  190. all_good &= or_test()
  191. if all_good:
  192. sys.stderr.write('%s: All tests passed\n' % sys.argv[0])
  193. sys.exit(0)
  194. else:
  195. sys.stderr.write('%s: One or more tests failed\n' % sys.argv[0])
  196. sys.exit(1)
  197. main()