test-bloom-filter 6.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257
  1. #!/usr/bin/python
  2. # pylint: disable=W0402
  3. # W0402: We want the deprecated string module, for a use that isn't deprecated
  4. '''Unit tests for bloom_filter_mod'''
  5. import sys
  6. import math
  7. import random
  8. import bloom_filter_mod
  9. CHARACTERS = 'abcdefghijklmnopqrstuvwxyz1234567890'
  10. def my_range(maximum):
  11. '''A range function with consistent semantics on 2.x and 3.x'''
  12. value = 0
  13. while True:
  14. if value >= maximum:
  15. break
  16. yield value
  17. value += 1
  18. def test(description, values, trials, error_rate):
  19. '''Some quick automatic tests for the bloom filter class'''
  20. all_good = True
  21. bloom_filter = bloom_filter_mod.Bloom_filter(ideal_num_elements_n=trials * 2, error_rate_p=error_rate)
  22. #print(repr(bloom_filter))
  23. print '\n', description, bloom_filter.num_words, bloom_filter.num_probes_k
  24. print 'adding'
  25. for include in values.generator():
  26. bloom_filter.add(include)
  27. print 'testing all known members'''
  28. include_in_count = sum(include in bloom_filter for include in values.generator())
  29. if include_in_count == values.length():
  30. # Good
  31. pass
  32. else:
  33. sys.stderr.write('Include count bad: %s, %d\n' % (include_in_count, values.length()))
  34. all_good = False
  35. print 'testing random non-members'''
  36. false_positives = 0
  37. for trialno in my_range(trials):
  38. if trialno % 100000 == 0:
  39. sys.stderr.write('trialno countdown: %d\n' % (trials-trialno))
  40. #dummy = trialno
  41. while True:
  42. candidate = ''.join(random.sample(CHARACTERS, 5))
  43. # If we accidentally found a member, try again
  44. if values.within(candidate):
  45. continue
  46. if candidate in bloom_filter:
  47. #print 'We erroneously think %s is in the filter' % candidate
  48. false_positives += 1
  49. break
  50. actual_error_rate = float(false_positives) / trials
  51. if actual_error_rate > error_rate:
  52. sys.stderr.write('%s: Too many false positives: actual: %s, expected: %s\n' % (
  53. sys.argv[0],
  54. actual_error_rate,
  55. error_rate,
  56. ))
  57. all_good = False
  58. return all_good
  59. class States:
  60. '''Generate the USA's state names'''
  61. def __init__(self):
  62. pass
  63. states = '''Alabama Alaska Arizona Arkansas California Colorado Connecticut
  64. Delaware Florida Georgia Hawaii Idaho Illinois Indiana Iowa Kansas
  65. Kentucky Louisiana Maine Maryland Massachusetts Michigan Minnesota
  66. Mississippi Missouri Montana Nebraska Nevada NewHampshire NewJersey
  67. NewMexico NewYork NorthCarolina NorthDakota Ohio Oklahoma Oregon
  68. Pennsylvania RhodeIsland SouthCarolina SouthDakota Tennessee Texas Utah
  69. Vermont Virginia Washington WestVirginia Wisconsin Wyoming'''.split()
  70. @staticmethod
  71. def generator():
  72. '''Generate the states'''
  73. for state in States.states:
  74. yield state
  75. @staticmethod
  76. def within(value):
  77. '''Is the vaoue in our list of states?'''
  78. return value in States.states
  79. @staticmethod
  80. def length():
  81. '''What is the length of our contained values?'''
  82. return len(States.states)
  83. def random_string():
  84. '''Generate a random, 10 character string - for testing purposes'''
  85. list_ = []
  86. for chrno in range(10):
  87. dummy = chrno
  88. character = CHARACTERS[int(random.random() * len(CHARACTERS))]
  89. list_.append(character)
  90. return ''.join(list_)
  91. class Random_content:
  92. '''Generated a bunch of random strings in sorted order'''
  93. random_content = [ random_string() for dummy in range(1000) ]
  94. def __init__(self):
  95. pass
  96. @staticmethod
  97. def generator():
  98. '''Generate all values'''
  99. for item in Random_content.random_content:
  100. yield item
  101. @staticmethod
  102. def within(value):
  103. '''Test for membership'''
  104. return value in Random_content.random_content
  105. @staticmethod
  106. def length():
  107. '''How many members?'''
  108. return len(Random_content.random_content)
  109. class Evens:
  110. '''Generate a bunch of even numbers'''
  111. def __init__(self, maximum):
  112. self.maximum = maximum
  113. def generator(self):
  114. '''Generate all values'''
  115. for value in my_range(self.maximum):
  116. if value % 2 == 0:
  117. yield str(value)
  118. def within(self, value):
  119. '''Test for membership'''
  120. try:
  121. int_value = int(value)
  122. except ValueError:
  123. return False
  124. if int_value >= 0 and int_value < self.maximum and int_value % 2 == 0:
  125. return True
  126. else:
  127. return False
  128. def length(self):
  129. '''How many members?'''
  130. return int(math.ceil(self.maximum / 2.0))
  131. def and_test():
  132. '''Test the & operator'''
  133. all_good = True
  134. abc = bloom_filter_mod.Bloom_filter(ideal_num_elements_n=100, error_rate_p=0.01)
  135. for character in [ 'a', 'b', 'c' ]:
  136. abc += character
  137. bcd = bloom_filter_mod.Bloom_filter(ideal_num_elements_n=100, error_rate_p=0.01)
  138. for character in [ 'b', 'c', 'd' ]:
  139. bcd += character
  140. abc_and_bcd = abc
  141. abc_and_bcd &= bcd
  142. if 'a' in abc_and_bcd:
  143. sys.stderr.write('a in abc_and_bcd, but should not be')
  144. all_good = False
  145. if not 'b' in abc_and_bcd:
  146. sys.stderr.write('b not in abc_and_bcd, but should be')
  147. all_good = False
  148. if not 'c' in abc_and_bcd:
  149. sys.stderr.write('c not in abc_and_bcd, but should be')
  150. all_good = False
  151. if 'd' in abc_and_bcd:
  152. sys.stderr.write('d in abc_and_bcd, but should not be')
  153. all_good = False
  154. return all_good
  155. def or_test():
  156. '''Test the | operator'''
  157. all_good = True
  158. abc = bloom_filter_mod.Bloom_filter(ideal_num_elements_n=100, error_rate_p=0.01)
  159. for character in [ 'a', 'b', 'c' ]:
  160. abc += character
  161. bcd = bloom_filter_mod.Bloom_filter(ideal_num_elements_n=100, error_rate_p=0.01)
  162. for character in [ 'b', 'c', 'd' ]:
  163. bcd += character
  164. abc_and_bcd = abc
  165. abc_and_bcd |= bcd
  166. if not 'a' in abc_and_bcd:
  167. sys.stderr.write('a not in abc_and_bcd, but should be')
  168. all_good = False
  169. if not 'b' in abc_and_bcd:
  170. sys.stderr.write('b not in abc_and_bcd, but should be')
  171. all_good = False
  172. if not 'c' in abc_and_bcd:
  173. sys.stderr.write('c not in abc_and_bcd, but should be')
  174. all_good = False
  175. if not 'd' in abc_and_bcd:
  176. sys.stderr.write('d not in abc_and_bcd, but should be')
  177. all_good = False
  178. if 'e' in abc_and_bcd:
  179. sys.stderr.write('e in abc_and_bcd, but should be')
  180. all_good = False
  181. return all_good
  182. def main():
  183. '''Unit tests for Bloom_filter class'''
  184. all_good = True
  185. all_good &= test('states', States(), trials=100000, error_rate=0.01)
  186. all_good &= test('random', Random_content(), trials=10000, error_rate=0.1)
  187. #for elements in [ 1, 10, 100, 1000 ]:
  188. for elements in [ 1, 10, 100, 1000, 10000, 100000, 1000000 ]:
  189. all_good &= test('evens %d' % elements, Evens(elements), trials=elements, error_rate=1e-12)
  190. all_good &= and_test()
  191. all_good &= or_test()
  192. if all_good:
  193. sys.stderr.write('%s: All tests passed\n' % sys.argv[0])
  194. sys.exit(0)
  195. else:
  196. sys.stderr.write('%s: One or more tests failed\n' % sys.argv[0])
  197. sys.exit(1)
  198. main()