test_bloom_filter.py 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350
  1. #!/usr/bin/python
  2. # coding=utf-8
  3. # pylint: disable=superfluous-parens
  4. # superfluous-parens: Parentheses are good for clarity and portability
  5. """Unit tests for bloom_filter_mod"""
  6. # mport os
  7. import sys
  8. import math
  9. import time
  10. try:
  11. import anydbm
  12. except ImportError:
  13. import dbm as anydbm
  14. import random
  15. import drs_bloom_filter
  16. CHARACTERS = 'abcdefghijklmnopqrstuvwxyz1234567890'
  17. def my_range(maximum):
  18. """A range function with consistent semantics on 2.x and 3.x"""
  19. value = 0
  20. while True:
  21. if value >= maximum:
  22. break
  23. yield value
  24. value += 1
  25. def _test(description, values, trials, error_rate, probe_bitnoer=drs_bloom_filter.get_bitno_lin_comb, filename=None):
  26. # pylint: disable=R0913,R0914
  27. # R0913: We want a few arguments
  28. # R0914: We want some local variables too. This is just test code.
  29. """Some quick automatic tests for the bloom filter class"""
  30. all_good = True
  31. divisor = 100000
  32. bloom_filter = drs_bloom_filter.Bloom_filter(
  33. ideal_num_elements_n=trials * 2,
  34. error_rate_p=error_rate,
  35. probe_bitnoer=probe_bitnoer,
  36. filename=filename,
  37. start_fresh=True,
  38. )
  39. message = '\ndescription: %s num_bits_m: %s num_probes_k: %s\n'
  40. filled_out_message = message % (
  41. description,
  42. bloom_filter.num_bits_m,
  43. bloom_filter.num_probes_k,
  44. )
  45. sys.stdout.write(filled_out_message)
  46. print('starting to add values to an empty bloom filter')
  47. for valueno, value in enumerate(values.generator()):
  48. reverse_valueno = values.length() - valueno
  49. if reverse_valueno % divisor == 0:
  50. print('adding valueno %d' % reverse_valueno)
  51. bloom_filter.add(value)
  52. print('testing all known members')
  53. include_in_count = sum(include in bloom_filter for include in values.generator())
  54. if include_in_count == values.length():
  55. # Good
  56. pass
  57. else:
  58. sys.stderr.write('Include count bad: %s, %d\n' % (include_in_count, values.length()))
  59. all_good = False
  60. print('testing random non-members')
  61. false_positives = 0
  62. for trialno in my_range(trials):
  63. if trialno % divisor == 0:
  64. sys.stderr.write('trialno countdown: %d\n' % (trials - trialno))
  65. while True:
  66. candidate = ''.join(random.sample(CHARACTERS, 5))
  67. # If we accidentally found a member, try again
  68. if values.within(candidate):
  69. continue
  70. if candidate in bloom_filter:
  71. # print 'We erroneously think %s is in the filter' % candidate
  72. false_positives += 1
  73. break
  74. actual_error_rate = float(false_positives) / trials
  75. if actual_error_rate > error_rate:
  76. sys.stderr.write('%s: Too many false positives: actual: %s, expected: %s\n' % (
  77. sys.argv[0],
  78. actual_error_rate,
  79. error_rate,
  80. ))
  81. all_good = False
  82. return all_good
  83. class States(object):
  84. """Generate the USA's state names"""
  85. def __init__(self):
  86. pass
  87. states = """Alabama Alaska Arizona Arkansas California Colorado Connecticut
  88. Delaware Florida Georgia Hawaii Idaho Illinois Indiana Iowa Kansas
  89. Kentucky Louisiana Maine Maryland Massachusetts Michigan Minnesota
  90. Mississippi Missouri Montana Nebraska Nevada NewHampshire NewJersey
  91. NewMexico NewYork NorthCarolina NorthDakota Ohio Oklahoma Oregon
  92. Pennsylvania RhodeIsland SouthCarolina SouthDakota Tennessee Texas Utah
  93. Vermont Virginia Washington WestVirginia Wisconsin Wyoming""".split()
  94. @staticmethod
  95. def generator():
  96. """Generate the states"""
  97. for state in States.states:
  98. yield state
  99. @staticmethod
  100. def within(value):
  101. """Is the value in our list of states?"""
  102. return value in States.states
  103. @staticmethod
  104. def length():
  105. """What is the length of our contained values?"""
  106. return len(States.states)
  107. def random_string():
  108. """Generate a random, 10 character string - for testing purposes"""
  109. list_ = []
  110. for chrno in range(10):
  111. dummy = chrno
  112. character = CHARACTERS[int(random.random() * len(CHARACTERS))]
  113. list_.append(character)
  114. return ''.join(list_)
  115. class Random_content(object):
  116. """Generated a bunch of random strings in sorted order"""
  117. random_content = [random_string() for dummy in range(1000)]
  118. def __init__(self):
  119. pass
  120. @staticmethod
  121. def generator():
  122. """Generate all values"""
  123. for item in Random_content.random_content:
  124. yield item
  125. @staticmethod
  126. def within(value):
  127. """Test for membership"""
  128. return value in Random_content.random_content
  129. @staticmethod
  130. def length():
  131. """How many members?"""
  132. return len(Random_content.random_content)
  133. class Evens(object):
  134. """Generate a bunch of even numbers"""
  135. def __init__(self, maximum):
  136. self.maximum = maximum
  137. def generator(self):
  138. """Generate all values"""
  139. for value in my_range(self.maximum):
  140. if value % 2 == 0:
  141. yield str(value)
  142. def within(self, value):
  143. """Test for membership"""
  144. try:
  145. int_value = int(value)
  146. except ValueError:
  147. return False
  148. if int_value >= 0 and int_value < self.maximum and int_value % 2 == 0:
  149. return True
  150. else:
  151. return False
  152. def length(self):
  153. """How many members?"""
  154. return int(math.ceil(self.maximum / 2.0))
  155. def and_test():
  156. """Test the & operator"""
  157. all_good = True
  158. abc = drs_bloom_filter.Bloom_filter(ideal_num_elements_n=100, error_rate_p=0.01)
  159. for character in ['a', 'b', 'c']:
  160. abc += character
  161. bcd = drs_bloom_filter.Bloom_filter(ideal_num_elements_n=100, error_rate_p=0.01)
  162. for character in ['b', 'c', 'd']:
  163. bcd += character
  164. abc_and_bcd = abc
  165. abc_and_bcd &= bcd
  166. if 'a' in abc_and_bcd:
  167. sys.stderr.write('a in abc_and_bcd, but should not be')
  168. all_good = False
  169. if not 'b' in abc_and_bcd:
  170. sys.stderr.write('b not in abc_and_bcd, but should be')
  171. all_good = False
  172. if not 'c' in abc_and_bcd:
  173. sys.stderr.write('c not in abc_and_bcd, but should be')
  174. all_good = False
  175. if 'd' in abc_and_bcd:
  176. sys.stderr.write('d in abc_and_bcd, but should not be')
  177. all_good = False
  178. return all_good
  179. def or_test():
  180. """Test the | operator"""
  181. all_good = True
  182. abc = drs_bloom_filter.Bloom_filter(ideal_num_elements_n=100, error_rate_p=0.01)
  183. for character in ['a', 'b', 'c']:
  184. abc += character
  185. bcd = drs_bloom_filter.Bloom_filter(ideal_num_elements_n=100, error_rate_p=0.01)
  186. for character in ['b', 'c', 'd']:
  187. bcd += character
  188. abc_and_bcd = abc
  189. abc_and_bcd |= bcd
  190. if not 'a' in abc_and_bcd:
  191. sys.stderr.write('a not in abc_and_bcd, but should be')
  192. all_good = False
  193. if not 'b' in abc_and_bcd:
  194. sys.stderr.write('b not in abc_and_bcd, but should be')
  195. all_good = False
  196. if not 'c' in abc_and_bcd:
  197. sys.stderr.write('c not in abc_and_bcd, but should be')
  198. all_good = False
  199. if not 'd' in abc_and_bcd:
  200. sys.stderr.write('d not in abc_and_bcd, but should be')
  201. all_good = False
  202. if 'e' in abc_and_bcd:
  203. sys.stderr.write('e in abc_and_bcd, but should not be')
  204. all_good = False
  205. return all_good
  206. def give_description(filename):
  207. """Return a description of the filename type - could be array, file or hybrid"""
  208. if filename is None:
  209. return 'array'
  210. elif isinstance(filename, tuple):
  211. if filename[1] == -1:
  212. return 'mmap'
  213. else:
  214. return 'hybrid'
  215. else:
  216. return 'seek'
  217. def test_drs_bloom_filter():
  218. """Unit tests for Bloom_filter class"""
  219. if sys.argv[1:] == ['--performance-test']:
  220. performance_test = True
  221. else:
  222. performance_test = False
  223. all_good = True
  224. all_good &= _test('states', States(), trials=100000, error_rate=0.01)
  225. all_good &= _test('random', Random_content(), trials=10000, error_rate=0.1)
  226. all_good &= _test('random', Random_content(), trials=10000, error_rate=0.1,
  227. probe_bitnoer=drs_bloom_filter.get_bitno_seed_rnd)
  228. filename = 'bloom-filter-rm-me'
  229. all_good &= _test('random', Random_content(), trials=10000, error_rate=0.1, filename=filename)
  230. all_good &= and_test()
  231. all_good &= or_test()
  232. if performance_test:
  233. sqrt_of_10 = math.sqrt(10)
  234. # for exponent in range(5): # this is a lot, but probably not unreasonable
  235. for exponent in range(19): # this is a lot, but probably not unreasonable
  236. elements = int(sqrt_of_10 ** exponent + 0.5)
  237. for filename in [None, 'bloom-filter-rm-me', ('bloom-filter-rm-me', 768 * 2 ** 20),
  238. ('bloom-filter-rm-me', -1)]:
  239. description = give_description(filename)
  240. key = '%s %s' % (description, elements)
  241. database = anydbm.open('performance-numbers', 'c')
  242. if key in database.keys():
  243. database.close()
  244. continue
  245. if elements >= 100000000 and description == 'seek':
  246. continue
  247. if elements >= 100000000 and description == 'mmap':
  248. continue
  249. if elements >= 1000000000 and description == 'array':
  250. continue
  251. time0 = time.time()
  252. all_good &= _test(
  253. 'evens %s elements: %d' % (give_description(filename), elements),
  254. Evens(elements),
  255. trials=elements,
  256. error_rate=1e-2,
  257. filename=filename,
  258. )
  259. time1 = time.time()
  260. delta_t = time1 - time0
  261. # file_ = open('%s.txt' % description, 'a')
  262. # file_.write('%d %f\n' % (elements, delta_t))
  263. # file_.close()
  264. database = anydbm.open('performance-numbers', 'c')
  265. database[key] = '%f' % delta_t
  266. database.close()
  267. if all_good:
  268. sys.stderr.write('%s: All tests passed\n' % sys.argv[0])
  269. # sys.exit(0)
  270. else:
  271. sys.stderr.write('%s: One or more tests failed\n' % sys.argv[0])
  272. sys.exit(1)
  273. if __name__ == '__main__':
  274. test_drs_bloom_filter()