test-bloom-filter 4.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180
  1. #!/usr/bin/python
  2. # pylint: disable=W0402
  3. # W0402: We want the deprecated string module, for a use that isn't deprecated
  4. '''Unit tests for bloom_filter_mod'''
  5. import sys
  6. import random
  7. import string
  8. import bloom_filter_mod
  9. def my_range(maximum):
  10. '''A range function with consistent semantics on 2.x and 3.x'''
  11. value = 0
  12. while True:
  13. if value >= maximum:
  14. break
  15. yield value
  16. value += 1
  17. def primary_test(order, included, trials, error_rate):
  18. '''Some quick automatic tests for the bloom filter class'''
  19. all_good = True
  20. bloom_filter = bloom_filter_mod.Bloom_filter(ideal_num_elements=trials, error_rate=error_rate)
  21. print(repr(bloom_filter))
  22. for step in order:
  23. if step == 1:
  24. for include in included:
  25. bloom_filter.add(include)
  26. include_in_count = sum(include in bloom_filter for include in included)
  27. if include_in_count == len(included):
  28. # Good
  29. pass
  30. else:
  31. sys.stderr.write('Include count bad: %s, %d\n' % (include_in_count, len(included)))
  32. all_good = False
  33. elif step == 2:
  34. false_positives = 0
  35. for trialno in my_range(trials):
  36. if trialno % 10000 == 0:
  37. sys.stderr.write('trialno countdown: %d\n' % (trials-trialno))
  38. dummy = trialno
  39. while True:
  40. candidate = ''.join(random.sample(string.ascii_letters, 5))
  41. # If we accidentally found a real include, try again
  42. if candidate in included:
  43. continue
  44. if candidate in bloom_filter:
  45. false_positives += 1
  46. break
  47. else:
  48. raise ValueError('step is not 1 or 2')
  49. #print('%d true negatives and %d false positives out of %d trials' % (trials - false_positives, false_positives, trials))
  50. actual_error_rate = float(false_positives) / trials
  51. if actual_error_rate > error_rate:
  52. sys.stderr.write('%s: Too many false positives: actual: %s, expected: %s\n' % (
  53. sys.argv[0],
  54. actual_error_rate,
  55. error_rate,
  56. ))
  57. all_good = False
  58. return all_good
  59. def random_string():
  60. '''Generate a random, 10 character string - for testing purposes'''
  61. list_ = []
  62. for chrno in range(10):
  63. dummy = chrno
  64. character = string.ascii_letters[int(random.random() * 26)]
  65. list_.append(character)
  66. return ''.join(list_)
  67. def and_test():
  68. '''Test the & operator'''
  69. all_good = True
  70. abc = bloom_filter_mod.Bloom_filter(ideal_num_elements=100, error_rate=0.01)
  71. for character in [ 'a', 'b', 'c' ]:
  72. abc += character
  73. bcd = bloom_filter_mod.Bloom_filter(ideal_num_elements=100, error_rate=0.01)
  74. for character in [ 'b', 'c', 'd' ]:
  75. bcd += character
  76. abc_and_bcd = abc
  77. abc_and_bcd &= bcd
  78. if 'a' in abc_and_bcd:
  79. sys.stderr.write('a in abc_and_bcd, but should not be')
  80. all_good = False
  81. if not 'b' in abc_and_bcd:
  82. sys.stderr.write('b not in abc_and_bcd, but should be')
  83. all_good = False
  84. if not 'c' in abc_and_bcd:
  85. sys.stderr.write('c not in abc_and_bcd, but should be')
  86. all_good = False
  87. if 'd' in abc_and_bcd:
  88. sys.stderr.write('d in abc_and_bcd, but should not be')
  89. all_good = False
  90. return all_good
  91. def or_test():
  92. '''Test the | operator'''
  93. all_good = True
  94. abc = bloom_filter_mod.Bloom_filter(ideal_num_elements=100, error_rate=0.01)
  95. for character in [ 'a', 'b', 'c' ]:
  96. abc += character
  97. bcd = bloom_filter_mod.Bloom_filter(ideal_num_elements=100, error_rate=0.01)
  98. for character in [ 'b', 'c', 'd' ]:
  99. bcd += character
  100. abc_and_bcd = abc
  101. abc_and_bcd |= bcd
  102. if not 'a' in abc_and_bcd:
  103. sys.stderr.write('a not in abc_and_bcd, but should be')
  104. all_good = False
  105. if not 'b' in abc_and_bcd:
  106. sys.stderr.write('b not in abc_and_bcd, but should be')
  107. all_good = False
  108. if not 'c' in abc_and_bcd:
  109. sys.stderr.write('c not in abc_and_bcd, but should be')
  110. all_good = False
  111. if not 'd' in abc_and_bcd:
  112. sys.stderr.write('d not in abc_and_bcd, but should be')
  113. all_good = False
  114. if 'e' in abc_and_bcd:
  115. sys.stderr.write('e in abc_and_bcd, but should be')
  116. all_good = False
  117. return all_good
  118. def main():
  119. '''Unit tests for Bloom_filter class'''
  120. all_good = True
  121. states = '''Alabama Alaska Arizona Arkansas California Colorado Connecticut
  122. Delaware Florida Georgia Hawaii Idaho Illinois Indiana Iowa Kansas
  123. Kentucky Louisiana Maine Maryland Massachusetts Michigan Minnesota
  124. Mississippi Missouri Montana Nebraska Nevada NewHampshire NewJersey
  125. NewMexico NewYork NorthCarolina NorthDakota Ohio Oklahoma Oregon
  126. Pennsylvania RhodeIsland SouthCarolina SouthDakota Tennessee Texas Utah
  127. Vermont Virginia Washington WestVirginia Wisconsin Wyoming'''.split()
  128. random_content = [ random_string() for dummy in range(1000) ]
  129. all_good &= primary_test([1, 2], states, trials=10000, error_rate=0.01)
  130. all_good &= primary_test([1, 2], random_content, trials=10000, error_rate=0.1)
  131. all_good &= primary_test([2, 1], [ 'a', 'b', 'c'], trials=100, error_rate=0.000001)
  132. all_good &= and_test()
  133. all_good &= or_test()
  134. if all_good:
  135. sys.stderr.write('%s: All tests passed\n' % sys.argv[0])
  136. sys.exit(0)
  137. else:
  138. sys.stderr.write('%s: One or more tests failed\n' % sys.argv[0])
  139. sys.exit(1)
  140. main()