test_bloom_filter.py 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352
  1. #!/usr/bin/python
  2. # coding=utf-8
  3. # pylint: disable=superfluous-parens
  4. # superfluous-parens: Parentheses are good for clarity and portability
  5. """Unit tests for bloom_filter_mod"""
  6. # mport os
  7. import sys
  8. import math
  9. import time
  10. try:
  11. import anydbm
  12. except ImportError:
  13. import dbm as anydbm
  14. import random
  15. import bloom_filter
  16. CHARACTERS = 'abcdefghijklmnopqrstuvwxyz1234567890'
  17. def my_range(maximum):
  18. """A range function with consistent semantics on 2.x and 3.x"""
  19. value = 0
  20. while True:
  21. if value >= maximum:
  22. break
  23. yield value
  24. value += 1
  25. def _test(description, values, trials, error_rate, probe_bitnoer=None, filename=None):
  26. # pylint: disable=R0913,R0914
  27. # R0913: We want a few arguments
  28. # R0914: We want some local variables too. This is just test code.
  29. """Some quick automatic tests for the bloom filter class"""
  30. if not probe_bitnoer:
  31. probe_bitnoer = bloom_filter.get_bitno_lin_comb
  32. all_good = True
  33. divisor = 100000
  34. bloom = bloom_filter.BloomFilter(
  35. max_elements=trials * 2,
  36. error_rate=error_rate,
  37. probe_bitnoer=probe_bitnoer,
  38. filename=filename,
  39. start_fresh=True,
  40. )
  41. message = '\ndescription: %s num_bits_m: %s num_probes_k: %s\n'
  42. filled_out_message = message % (
  43. description,
  44. bloom.num_bits_m,
  45. bloom.num_probes_k,
  46. )
  47. sys.stdout.write(filled_out_message)
  48. print('starting to add values to an empty bloom filter')
  49. for valueno, value in enumerate(values.generator()):
  50. reverse_valueno = values.length() - valueno
  51. if reverse_valueno % divisor == 0:
  52. print('adding valueno %d' % reverse_valueno)
  53. bloom.add(value)
  54. print('testing all known members')
  55. include_in_count = sum(include in bloom for include in values.generator())
  56. if include_in_count == values.length():
  57. # Good
  58. pass
  59. else:
  60. sys.stderr.write('Include count bad: %s, %d\n' % (include_in_count, values.length()))
  61. all_good = False
  62. print('testing random non-members')
  63. false_positives = 0
  64. for trialno in my_range(trials):
  65. if trialno % divisor == 0:
  66. sys.stderr.write('trialno countdown: %d\n' % (trials - trialno))
  67. while True:
  68. candidate = ''.join(random.sample(CHARACTERS, 5))
  69. # If we accidentally found a member, try again
  70. if values.within(candidate):
  71. continue
  72. if candidate in bloom:
  73. # print 'We erroneously think %s is in the filter' % candidate
  74. false_positives += 1
  75. break
  76. actual_error_rate = float(false_positives) / trials
  77. if actual_error_rate > error_rate:
  78. sys.stderr.write('%s: Too many false positives: actual: %s, expected: %s\n' % (
  79. sys.argv[0],
  80. actual_error_rate,
  81. error_rate,
  82. ))
  83. all_good = False
  84. return all_good
  85. class States(object):
  86. """Generate the USA's state names"""
  87. def __init__(self):
  88. pass
  89. states = """Alabama Alaska Arizona Arkansas California Colorado Connecticut
  90. Delaware Florida Georgia Hawaii Idaho Illinois Indiana Iowa Kansas
  91. Kentucky Louisiana Maine Maryland Massachusetts Michigan Minnesota
  92. Mississippi Missouri Montana Nebraska Nevada NewHampshire NewJersey
  93. NewMexico NewYork NorthCarolina NorthDakota Ohio Oklahoma Oregon
  94. Pennsylvania RhodeIsland SouthCarolina SouthDakota Tennessee Texas Utah
  95. Vermont Virginia Washington WestVirginia Wisconsin Wyoming""".split()
  96. @staticmethod
  97. def generator():
  98. """Generate the states"""
  99. for state in States.states:
  100. yield state
  101. @staticmethod
  102. def within(value):
  103. """Is the value in our list of states?"""
  104. return value in States.states
  105. @staticmethod
  106. def length():
  107. """What is the length of our contained values?"""
  108. return len(States.states)
  109. def random_string():
  110. """Generate a random, 10 character string - for testing purposes"""
  111. list_ = []
  112. for chrno in range(10):
  113. dummy = chrno
  114. character = CHARACTERS[int(random.random() * len(CHARACTERS))]
  115. list_.append(character)
  116. return ''.join(list_)
  117. class Random_content(object):
  118. """Generated a bunch of random strings in sorted order"""
  119. random_content = [random_string() for dummy in range(1000)]
  120. def __init__(self):
  121. pass
  122. @staticmethod
  123. def generator():
  124. """Generate all values"""
  125. for item in Random_content.random_content:
  126. yield item
  127. @staticmethod
  128. def within(value):
  129. """Test for membership"""
  130. return value in Random_content.random_content
  131. @staticmethod
  132. def length():
  133. """How many members?"""
  134. return len(Random_content.random_content)
  135. class Evens(object):
  136. """Generate a bunch of even numbers"""
  137. def __init__(self, maximum):
  138. self.maximum = maximum
  139. def generator(self):
  140. """Generate all values"""
  141. for value in my_range(self.maximum):
  142. if value % 2 == 0:
  143. yield str(value)
  144. def within(self, value):
  145. """Test for membership"""
  146. try:
  147. int_value = int(value)
  148. except ValueError:
  149. return False
  150. if int_value >= 0 and int_value < self.maximum and int_value % 2 == 0:
  151. return True
  152. else:
  153. return False
  154. def length(self):
  155. """How many members?"""
  156. return int(math.ceil(self.maximum / 2.0))
  157. def and_test():
  158. """Test the & operator"""
  159. all_good = True
  160. abc = bloom_filter.BloomFilter(max_elements=100, error_rate=0.01)
  161. for character in ['a', 'b', 'c']:
  162. abc += character
  163. bcd = bloom_filter.BloomFilter(max_elements=100, error_rate=0.01)
  164. for character in ['b', 'c', 'd']:
  165. bcd += character
  166. abc_and_bcd = abc
  167. abc_and_bcd &= bcd
  168. if 'a' in abc_and_bcd:
  169. sys.stderr.write('a in abc_and_bcd, but should not be')
  170. all_good = False
  171. if not 'b' in abc_and_bcd:
  172. sys.stderr.write('b not in abc_and_bcd, but should be')
  173. all_good = False
  174. if not 'c' in abc_and_bcd:
  175. sys.stderr.write('c not in abc_and_bcd, but should be')
  176. all_good = False
  177. if 'd' in abc_and_bcd:
  178. sys.stderr.write('d in abc_and_bcd, but should not be')
  179. all_good = False
  180. return all_good
  181. def or_test():
  182. """Test the | operator"""
  183. all_good = True
  184. abc = bloom_filter.BloomFilter(max_elements=100, error_rate=0.01)
  185. for character in ['a', 'b', 'c']:
  186. abc += character
  187. bcd = bloom_filter.BloomFilter(max_elements=100, error_rate=0.01)
  188. for character in ['b', 'c', 'd']:
  189. bcd += character
  190. abc_and_bcd = abc
  191. abc_and_bcd |= bcd
  192. if not 'a' in abc_and_bcd:
  193. sys.stderr.write('a not in abc_and_bcd, but should be')
  194. all_good = False
  195. if not 'b' in abc_and_bcd:
  196. sys.stderr.write('b not in abc_and_bcd, but should be')
  197. all_good = False
  198. if not 'c' in abc_and_bcd:
  199. sys.stderr.write('c not in abc_and_bcd, but should be')
  200. all_good = False
  201. if not 'd' in abc_and_bcd:
  202. sys.stderr.write('d not in abc_and_bcd, but should be')
  203. all_good = False
  204. if 'e' in abc_and_bcd:
  205. sys.stderr.write('e in abc_and_bcd, but should not be')
  206. all_good = False
  207. return all_good
  208. def give_description(filename):
  209. """Return a description of the filename type - could be array, file or hybrid"""
  210. if filename is None:
  211. return 'array'
  212. elif isinstance(filename, tuple):
  213. if filename[1] == -1:
  214. return 'mmap'
  215. else:
  216. return 'hybrid'
  217. else:
  218. return 'seek'
  219. def test_bloom_filter():
  220. """Unit tests for BloomFilter class"""
  221. if sys.argv[1:] == ['--performance-test']:
  222. performance_test = True
  223. else:
  224. performance_test = False
  225. all_good = True
  226. all_good &= _test('states', States(), trials=100000, error_rate=0.01)
  227. all_good &= _test('random', Random_content(), trials=10000, error_rate=0.1)
  228. all_good &= _test('random', Random_content(), trials=10000, error_rate=0.1,
  229. probe_bitnoer=bloom_filter.get_bitno_seed_rnd)
  230. filename = 'bloom-filter-rm-me'
  231. all_good &= _test('random', Random_content(), trials=10000, error_rate=0.1, filename=filename)
  232. all_good &= and_test()
  233. all_good &= or_test()
  234. if performance_test:
  235. sqrt_of_10 = math.sqrt(10)
  236. # for exponent in range(5): # this is a lot, but probably not unreasonable
  237. for exponent in range(19): # this is a lot, but probably not unreasonable
  238. elements = int(sqrt_of_10 ** exponent + 0.5)
  239. for filename in [None, 'bloom-filter-rm-me', ('bloom-filter-rm-me', 768 * 2 ** 20),
  240. ('bloom-filter-rm-me', -1)]:
  241. description = give_description(filename)
  242. key = '%s %s' % (description, elements)
  243. database = anydbm.open('performance-numbers', 'c')
  244. if key in database.keys():
  245. database.close()
  246. continue
  247. if elements >= 100000000 and description == 'seek':
  248. continue
  249. if elements >= 100000000 and description == 'mmap':
  250. continue
  251. if elements >= 1000000000 and description == 'array':
  252. continue
  253. time0 = time.time()
  254. all_good &= _test(
  255. 'evens %s elements: %d' % (give_description(filename), elements),
  256. Evens(elements),
  257. trials=elements,
  258. error_rate=1e-2,
  259. filename=filename,
  260. )
  261. time1 = time.time()
  262. delta_t = time1 - time0
  263. # file_ = open('%s.txt' % description, 'a')
  264. # file_.write('%d %f\n' % (elements, delta_t))
  265. # file_.close()
  266. database = anydbm.open('performance-numbers', 'c')
  267. database[key] = '%f' % delta_t
  268. database.close()
  269. # test prob count ok
  270. bloom = bloom_filter.BloomFilter(1000000, error_rate=.99)
  271. all_good &= bloom.num_probes_k == 1
  272. if not all_good:
  273. sys.stderr.write('%s: One or more tests failed\n' % sys.argv[0])
  274. sys.exit(1)
  275. if __name__ == '__main__':
  276. test_bloom_filter()