bloom-filter.cpp
Go to the documentation of this file.
1 /* -*- Mode:C++; c-file-style:"gnu"; indent-tabs-mode:nil; -*- */
2 /*
3  * Copyright (c) 2014-2020, The University of Memphis
4  *
5  * This file is part of PSync.
6  * See AUTHORS.md for complete list of PSync authors and contributors.
7  *
8  * PSync is free software: you can redistribute it and/or modify it under the terms
9  * of the GNU Lesser General Public License as published by the Free Software Foundation,
10  * either version 3 of the License, or (at your option) any later version.
11  *
12  * PSync is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
13  * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
14  * PURPOSE. See the GNU Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public License along with
17  * PSync, e.g., in COPYING.md file. If not, see <http://www.gnu.org/licenses/>.
18  *
19 
20  * This file incorporates work covered by the following copyright and
21  * permission notice:
22 
23  * The MIT License (MIT)
24 
25  * Copyright (c) 2000 Arash Partow
26 
27  * Permission is hereby granted, free of charge, to any person obtaining a copy
28  * of this software and associated documentation files (the "Software"), to deal
29  * in the Software without restriction, including without limitation the rights
30  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
31  * copies of the Software, and to permit persons to whom the Software is
32  * furnished to do so, subject to the following conditions:
33 
34  * The above copyright notice and this permission notice shall be included in all
35  * copies or substantial portions of the Software.
36 
37  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
38  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
39  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
40  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
41  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
42  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
43  * SOFTWARE.
44 */
45 
47 #include "PSync/detail/util.hpp"
48 
49 #include <ndn-cxx/util/exception.hpp>
50 
51 #include <algorithm>
52 #include <cmath>
53 #include <cstdlib>
54 #include <limits>
55 
56 namespace psync {
57 namespace detail {
58 
59 // https://github.com/ArashPartow/bloom
60 
61 static const std::size_t bits_per_char = 0x08; // 8 bits in 1 char(unsigned)
62 
63 static const unsigned char bit_mask[bits_per_char] = {
64  0x01, //00000001
65  0x02, //00000010
66  0x04, //00000100
67  0x08, //00001000
68  0x10, //00010000
69  0x20, //00100000
70  0x40, //01000000
71  0x80 //10000000
72 };
73 
74 class bloom_parameters
75 {
76 public:
77 
78  bloom_parameters()
79  : minimum_size(1),
80  maximum_size(std::numeric_limits<unsigned int>::max()),
81  minimum_number_of_hashes(1),
82  maximum_number_of_hashes(std::numeric_limits<unsigned int>::max()),
83  projected_element_count(200),
84  false_positive_probability(1.0 / projected_element_count),
85  random_seed(0xA5A5A5A55A5A5A5AULL)
86  {}
87 
88  inline bool operator!()
89  {
90  return (minimum_size > maximum_size) ||
91  (minimum_number_of_hashes > maximum_number_of_hashes) ||
92  (minimum_number_of_hashes < 1) ||
93  (0 == maximum_number_of_hashes) ||
94  (0 == projected_element_count) ||
95  (false_positive_probability < 0.0) ||
96  (std::numeric_limits<double>::infinity() == std::abs(false_positive_probability)) ||
97  (0 == random_seed) ||
98  (0xFFFFFFFFFFFFFFFFULL == random_seed);
99  }
100 
101  // Allowable min/max size of the bloom filter in bits
102  unsigned int minimum_size;
103  unsigned int maximum_size;
104 
105  // Allowable min/max number of hash functions
106  unsigned int minimum_number_of_hashes;
107  unsigned int maximum_number_of_hashes;
108 
109  // The approximate number of elements to be inserted
110  // into the bloom filter, should be within one order
111  // of magnitude. The default is 200.
112  unsigned int projected_element_count;
113 
114  // The approximate false positive probability expected
115  // from the bloom filter. The default is assumed to be
116  // the reciprocal of the projected_element_count.
117  double false_positive_probability;
118 
119  unsigned long long int random_seed;
120 
121  struct optimal_parameters_t
122  {
123  optimal_parameters_t()
124  : number_of_hashes(0),
125  table_size(0)
126  {}
127 
128  unsigned int number_of_hashes;
129  unsigned int table_size;
130  };
131 
132  optimal_parameters_t optimal_parameters;
133 
134  bool compute_optimal_parameters()
135  {
136  /*
137  Note:
138  The following will attempt to find the number of hash functions
139  and minimum amount of storage bits required to construct a bloom
140  filter consistent with the user defined false positive probability
141  and estimated element insertion count.
142  */
143 
144  if (!(*this))
145  return false;
146 
147  double min_m = std::numeric_limits<double>::infinity();
148  double min_k = 0.0;
149  double k = 1.0;
150 
151  while (k < 1000.0)
152  {
153  const double numerator = (- k * projected_element_count);
154  const double denominator = std::log(1.0 - std::pow(false_positive_probability, 1.0 / k));
155 
156  const double curr_m = numerator / denominator;
157 
158  if (curr_m < min_m)
159  {
160  min_m = curr_m;
161  min_k = k;
162  }
163 
164  k += 1.0;
165  }
166 
167  optimal_parameters_t& optp = optimal_parameters;
168 
169  optp.number_of_hashes = static_cast<unsigned int>(min_k);
170 
171  optp.table_size = static_cast<unsigned int>(min_m);
172 
173  optp.table_size += (((optp.table_size % bits_per_char) != 0) ? (bits_per_char - (optp.table_size % bits_per_char)) : 0);
174 
175  if (optp.number_of_hashes < minimum_number_of_hashes)
176  optp.number_of_hashes = minimum_number_of_hashes;
177  else if (optp.number_of_hashes > maximum_number_of_hashes)
178  optp.number_of_hashes = maximum_number_of_hashes;
179 
180  if (optp.table_size < minimum_size)
181  optp.table_size = minimum_size;
182  else if (optp.table_size > maximum_size)
183  optp.table_size = maximum_size;
184 
185  return true;
186  }
187 
188 };
189 
190 
191 BloomFilter::BloomFilter(const bloom_parameters& p)
192 : projected_element_count_(p.projected_element_count),
193  inserted_element_count_(0),
194  random_seed_((p.random_seed * 0xA5A5A5A5) + 1),
195  desired_false_positive_probability_(p.false_positive_probability)
196 {
197  salt_count_ = p.optimal_parameters.number_of_hashes;
198  table_size_ = p.optimal_parameters.table_size;
199 
200  generate_unique_salt();
201 
202  bit_table_.resize(table_size_ / bits_per_char, static_cast<cell_type>(0x00));
203 }
204 
205 static bloom_parameters
206 makeParameters(unsigned int projected_element_count,
207  double false_positive_probability)
208 {
209  bloom_parameters p;
210  p.projected_element_count = projected_element_count;
211  p.false_positive_probability = false_positive_probability;
212 
213  if (!p.compute_optimal_parameters()) {
214  NDN_THROW(BloomFilter::Error("Bloom filter parameters are not correct!"));
215  }
216  return p;
217 }
218 
219 BloomFilter::BloomFilter(unsigned int projected_element_count,
220  double false_positive_probability)
221  : BloomFilter(makeParameters(projected_element_count, false_positive_probability))
222 {
223 }
224 
225 BloomFilter::BloomFilter(unsigned int projected_element_count,
226  double false_positive_probability,
227  const ndn::name::Component& bfName)
228  : BloomFilter(projected_element_count, false_positive_probability)
229 {
230  std::vector<cell_type> table(bfName.value_begin(), bfName.value_end());
231  if (table.size() != table_size_ / bits_per_char) {
232  NDN_THROW(Error("Bloom filter cannot be decoded!"));
233  }
234  bit_table_ = std::move(table);
235 }
236 
237 void
238 BloomFilter::appendToName(ndn::Name& name) const
239 {
240  name.appendNumber(projected_element_count_);
241  name.appendNumber(static_cast<uint64_t>(desired_false_positive_probability_ * 1000));
242  name.append(bit_table_.begin(), bit_table_.end());
243 }
244 
245 void
247 {
248  std::fill(bit_table_.begin(), bit_table_.end(), static_cast<cell_type>(0x00));
249  inserted_element_count_ = 0;
250 }
251 
252 void
253 BloomFilter::insert(const std::string& key)
254 {
255  std::size_t bit_index = 0;
256  std::size_t bit = 0;
257 
258  for (std::size_t i = 0; i < salt_.size(); ++i)
259  {
260  compute_indices(murmurHash3(salt_[i], key), bit_index, bit);
261 
262  bit_table_[bit_index / bits_per_char] |= bit_mask[bit];
263  }
264 
265  ++inserted_element_count_;
266 }
267 
268 bool
269 BloomFilter::contains(const std::string& key) const
270 {
271  std::size_t bit_index = 0;
272  std::size_t bit = 0;
273 
274  for (std::size_t i = 0; i < salt_.size(); ++i)
275  {
276  compute_indices(murmurHash3(salt_[i], key), bit_index, bit);
277 
278  if ((bit_table_[bit_index / bits_per_char] & bit_mask[bit]) != bit_mask[bit])
279  {
280  return false;
281  }
282  }
283 
284  return true;
285 }
286 
287 void
288 BloomFilter::compute_indices(const bloom_type& hash, std::size_t& bit_index, std::size_t& bit) const
289 {
290  bit_index = hash % table_size_;
291  bit = bit_index % bits_per_char;
292 }
293 
294 void
295 BloomFilter::generate_unique_salt()
296 {
297  /*
298  Note:
299  A distinct hash function need not be implementation-wise
300  distinct. In the current implementation "seeding" a common
301  hash function with different values seems to be adequate.
302  */
303  const unsigned int predef_salt_count = 128;
304 
305  static const bloom_type predef_salt[predef_salt_count] =
306  {
307  0xAAAAAAAA, 0x55555555, 0x33333333, 0xCCCCCCCC,
308  0x66666666, 0x99999999, 0xB5B5B5B5, 0x4B4B4B4B,
309  0xAA55AA55, 0x55335533, 0x33CC33CC, 0xCC66CC66,
310  0x66996699, 0x99B599B5, 0xB54BB54B, 0x4BAA4BAA,
311  0xAA33AA33, 0x55CC55CC, 0x33663366, 0xCC99CC99,
312  0x66B566B5, 0x994B994B, 0xB5AAB5AA, 0xAAAAAA33,
313  0x555555CC, 0x33333366, 0xCCCCCC99, 0x666666B5,
314  0x9999994B, 0xB5B5B5AA, 0xFFFFFFFF, 0xFFFF0000,
315  0xB823D5EB, 0xC1191CDF, 0xF623AEB3, 0xDB58499F,
316  0xC8D42E70, 0xB173F616, 0xA91A5967, 0xDA427D63,
317  0xB1E8A2EA, 0xF6C0D155, 0x4909FEA3, 0xA68CC6A7,
318  0xC395E782, 0xA26057EB, 0x0CD5DA28, 0x467C5492,
319  0xF15E6982, 0x61C6FAD3, 0x9615E352, 0x6E9E355A,
320  0x689B563E, 0x0C9831A8, 0x6753C18B, 0xA622689B,
321  0x8CA63C47, 0x42CC2884, 0x8E89919B, 0x6EDBD7D3,
322  0x15B6796C, 0x1D6FDFE4, 0x63FF9092, 0xE7401432,
323  0xEFFE9412, 0xAEAEDF79, 0x9F245A31, 0x83C136FC,
324  0xC3DA4A8C, 0xA5112C8C, 0x5271F491, 0x9A948DAB,
325  0xCEE59A8D, 0xB5F525AB, 0x59D13217, 0x24E7C331,
326  0x697C2103, 0x84B0A460, 0x86156DA9, 0xAEF2AC68,
327  0x23243DA5, 0x3F649643, 0x5FA495A8, 0x67710DF8,
328  0x9A6C499E, 0xDCFB0227, 0x46A43433, 0x1832B07A,
329  0xC46AFF3C, 0xB9C8FFF0, 0xC9500467, 0x34431BDF,
330  0xB652432B, 0xE367F12B, 0x427F4C1B, 0x224C006E,
331  0x2E7E5A89, 0x96F99AA5, 0x0BEB452A, 0x2FD87C39,
332  0x74B2E1FB, 0x222EFD24, 0xF357F60C, 0x440FCB1E,
333  0x8BBE030F, 0x6704DC29, 0x1144D12F, 0x948B1355,
334  0x6D8FD7E9, 0x1C11A014, 0xADD1592F, 0xFB3C712E,
335  0xFC77642F, 0xF9C4CE8C, 0x31312FB9, 0x08B0DD79,
336  0x318FA6E7, 0xC040D23D, 0xC0589AA7, 0x0CA5C075,
337  0xF874B172, 0x0CF914D5, 0x784D3280, 0x4E8CFEBC,
338  0xC569F575, 0xCDB2A091, 0x2CC016B4, 0x5C5F4421
339  };
340 
341  if (salt_count_ <= predef_salt_count)
342  {
343  std::copy(predef_salt,
344  predef_salt + salt_count_,
345  std::back_inserter(salt_));
346 
347  for (std::size_t i = 0; i < salt_.size(); ++i)
348  {
349  /*
350  Note:
351  This is done to integrate the user defined random seed,
352  so as to allow for the generation of unique bloom filter
353  instances.
354  */
355  salt_[i] = salt_[i] * salt_[(i + 3) % salt_.size()] + static_cast<bloom_type>(random_seed_);
356  }
357  }
358  else
359  {
360  std::copy(predef_salt, predef_salt + predef_salt_count, std::back_inserter(salt_));
361 
362  srand(static_cast<unsigned int>(random_seed_));
363 
364  while (salt_.size() < salt_count_)
365  {
366  bloom_type current_salt = static_cast<bloom_type>(rand()) * static_cast<bloom_type>(rand());
367 
368  if (0 == current_salt)
369  continue;
370 
371  if (salt_.end() == std::find(salt_.begin(), salt_.end(), current_salt))
372  {
373  salt_.push_back(current_salt);
374  }
375  }
376  }
377 }
378 
379 } // namespace detail
380 } // namespace psync
void insert(const std::string &key)
Definition: common.hpp:33
static const unsigned char bit_mask[bits_per_char]
bool contains(const std::string &key) const
static const std::size_t bits_per_char
static bloom_parameters makeParameters(unsigned int projected_element_count, double false_positive_probability)
void appendToName(ndn::Name &name) const
Append our bloom filter to the given name.
uint32_t murmurHash3(const void *key, size_t len, uint32_t seed)
Definition: util.cpp:61