bloom-filter.cpp
Go to the documentation of this file.
1 /* -*- Mode:C++; c-file-style:"gnu"; indent-tabs-mode:nil; -*- */
2 /*
3  * Copyright (c) 2014-2022, The University of Memphis
4  *
5  * This file is part of PSync.
6  * See AUTHORS.md for complete list of PSync authors and contributors.
7  *
8  * PSync is free software: you can redistribute it and/or modify it under the terms
9  * of the GNU Lesser General Public License as published by the Free Software Foundation,
10  * either version 3 of the License, or (at your option) any later version.
11  *
12  * PSync is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
13  * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
14  * PURPOSE. See the GNU Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public License along with
17  * PSync, e.g., in COPYING.md file. If not, see <http://www.gnu.org/licenses/>.
18  *
19 
20  * This file incorporates work covered by the following copyright and
21  * permission notice:
22 
23  * The MIT License (MIT)
24 
25  * Copyright (c) 2000 Arash Partow
26 
27  * Permission is hereby granted, free of charge, to any person obtaining a copy
28  * of this software and associated documentation files (the "Software"), to deal
29  * in the Software without restriction, including without limitation the rights
30  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
31  * copies of the Software, and to permit persons to whom the Software is
32  * furnished to do so, subject to the following conditions:
33 
34  * The above copyright notice and this permission notice shall be included in all
35  * copies or substantial portions of the Software.
36 
37  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
38  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
39  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
40  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
41  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
42  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
43  * SOFTWARE.
44 */
45 
47 #include "PSync/detail/util.hpp"
48 
49 #include <ndn-cxx/util/exception.hpp>
50 
51 #include <algorithm>
52 #include <cmath>
53 #include <cstdlib>
54 #include <limits>
55 
56 namespace psync::detail {
57 
58 // https://github.com/ArashPartow/bloom
59 
60 static const std::size_t bits_per_char = 0x08; // 8 bits in 1 char(unsigned)
61 
62 static const unsigned char bit_mask[bits_per_char] = {
63  0x01, //00000001
64  0x02, //00000010
65  0x04, //00000100
66  0x08, //00001000
67  0x10, //00010000
68  0x20, //00100000
69  0x40, //01000000
70  0x80 //10000000
71 };
72 
73 class bloom_parameters
74 {
75 public:
76 
77  bloom_parameters()
78  : minimum_size(1),
79  maximum_size(std::numeric_limits<unsigned int>::max()),
80  minimum_number_of_hashes(1),
81  maximum_number_of_hashes(std::numeric_limits<unsigned int>::max()),
82  projected_element_count(200),
83  false_positive_probability(1.0 / projected_element_count),
84  random_seed(0xA5A5A5A55A5A5A5AULL)
85  {}
86 
87  inline bool operator!()
88  {
89  return (minimum_size > maximum_size) ||
90  (minimum_number_of_hashes > maximum_number_of_hashes) ||
91  (minimum_number_of_hashes < 1) ||
92  (0 == maximum_number_of_hashes) ||
93  (0 == projected_element_count) ||
94  (false_positive_probability < 0.0) ||
95  (std::numeric_limits<double>::infinity() == std::abs(false_positive_probability)) ||
96  (0 == random_seed) ||
97  (0xFFFFFFFFFFFFFFFFULL == random_seed);
98  }
99 
100  // Allowable min/max size of the bloom filter in bits
101  unsigned int minimum_size;
102  unsigned int maximum_size;
103 
104  // Allowable min/max number of hash functions
105  unsigned int minimum_number_of_hashes;
106  unsigned int maximum_number_of_hashes;
107 
108  // The approximate number of elements to be inserted
109  // into the bloom filter, should be within one order
110  // of magnitude. The default is 200.
111  unsigned int projected_element_count;
112 
113  // The approximate false positive probability expected
114  // from the bloom filter. The default is assumed to be
115  // the reciprocal of the projected_element_count.
116  double false_positive_probability;
117 
118  unsigned long long int random_seed;
119 
120  struct optimal_parameters_t
121  {
122  optimal_parameters_t()
123  : number_of_hashes(0),
124  table_size(0)
125  {}
126 
127  unsigned int number_of_hashes;
128  unsigned int table_size;
129  };
130 
131  optimal_parameters_t optimal_parameters;
132 
133  bool compute_optimal_parameters()
134  {
135  /*
136  Note:
137  The following will attempt to find the number of hash functions
138  and minimum amount of storage bits required to construct a bloom
139  filter consistent with the user defined false positive probability
140  and estimated element insertion count.
141  */
142 
143  if (!(*this))
144  return false;
145 
146  double min_m = std::numeric_limits<double>::infinity();
147  double min_k = 0.0;
148  double k = 1.0;
149 
150  while (k < 1000.0)
151  {
152  const double numerator = (- k * projected_element_count);
153  const double denominator = std::log(1.0 - std::pow(false_positive_probability, 1.0 / k));
154 
155  const double curr_m = numerator / denominator;
156 
157  if (curr_m < min_m)
158  {
159  min_m = curr_m;
160  min_k = k;
161  }
162 
163  k += 1.0;
164  }
165 
166  optimal_parameters_t& optp = optimal_parameters;
167 
168  optp.number_of_hashes = static_cast<unsigned int>(min_k);
169 
170  optp.table_size = static_cast<unsigned int>(min_m);
171 
172  optp.table_size += (((optp.table_size % bits_per_char) != 0) ? (bits_per_char - (optp.table_size % bits_per_char)) : 0);
173 
174  if (optp.number_of_hashes < minimum_number_of_hashes)
175  optp.number_of_hashes = minimum_number_of_hashes;
176  else if (optp.number_of_hashes > maximum_number_of_hashes)
177  optp.number_of_hashes = maximum_number_of_hashes;
178 
179  if (optp.table_size < minimum_size)
180  optp.table_size = minimum_size;
181  else if (optp.table_size > maximum_size)
182  optp.table_size = maximum_size;
183 
184  return true;
185  }
186 
187 };
188 
189 
190 BloomFilter::BloomFilter(const bloom_parameters& p)
191 : projected_element_count_(p.projected_element_count),
192  inserted_element_count_(0),
193  random_seed_((p.random_seed * 0xA5A5A5A5) + 1),
194  desired_false_positive_probability_(p.false_positive_probability)
195 {
196  salt_count_ = p.optimal_parameters.number_of_hashes;
197  table_size_ = p.optimal_parameters.table_size;
198 
199  generate_unique_salt();
200 
201  bit_table_.resize(table_size_ / bits_per_char, static_cast<cell_type>(0x00));
202 }
203 
204 static bloom_parameters
205 makeParameters(unsigned int projected_element_count,
206  double false_positive_probability)
207 {
208  bloom_parameters p;
209  p.projected_element_count = projected_element_count;
210  p.false_positive_probability = false_positive_probability;
211 
212  if (!p.compute_optimal_parameters()) {
213  NDN_THROW(BloomFilter::Error("Bloom filter parameters are not correct!"));
214  }
215  return p;
216 }
217 
218 BloomFilter::BloomFilter(unsigned int projected_element_count,
219  double false_positive_probability)
220  : BloomFilter(makeParameters(projected_element_count, false_positive_probability))
221 {
222 }
223 
224 BloomFilter::BloomFilter(unsigned int projected_element_count,
225  double false_positive_probability,
226  const ndn::name::Component& bfName)
227  : BloomFilter(projected_element_count, false_positive_probability)
228 {
229  std::vector<cell_type> table(bfName.value_begin(), bfName.value_end());
230  if (table.size() != table_size_ / bits_per_char) {
231  NDN_THROW(Error("Bloom filter cannot be decoded!"));
232  }
233  bit_table_ = std::move(table);
234 }
235 
236 void
237 BloomFilter::appendToName(ndn::Name& name) const
238 {
239  name.appendNumber(projected_element_count_);
240  name.appendNumber(static_cast<uint64_t>(desired_false_positive_probability_ * 1000));
241  name.append(bit_table_.begin(), bit_table_.end());
242 }
243 
244 void
246 {
247  std::fill(bit_table_.begin(), bit_table_.end(), static_cast<cell_type>(0x00));
248  inserted_element_count_ = 0;
249 }
250 
251 void
252 BloomFilter::insert(const ndn::Name& key)
253 {
254  std::size_t bit_index = 0;
255  std::size_t bit = 0;
256 
257  for (std::size_t i = 0; i < salt_.size(); ++i)
258  {
259  compute_indices(murmurHash3(salt_[i], key), bit_index, bit);
260 
261  bit_table_[bit_index / bits_per_char] |= bit_mask[bit];
262  }
263 
264  ++inserted_element_count_;
265 }
266 
267 bool
268 BloomFilter::contains(const ndn::Name& key) const
269 {
270  std::size_t bit_index = 0;
271  std::size_t bit = 0;
272 
273  for (std::size_t i = 0; i < salt_.size(); ++i)
274  {
275  compute_indices(murmurHash3(salt_[i], key), bit_index, bit);
276 
277  if ((bit_table_[bit_index / bits_per_char] & bit_mask[bit]) != bit_mask[bit])
278  {
279  return false;
280  }
281  }
282 
283  return true;
284 }
285 
286 void
287 BloomFilter::compute_indices(const bloom_type& hash, std::size_t& bit_index, std::size_t& bit) const
288 {
289  bit_index = hash % table_size_;
290  bit = bit_index % bits_per_char;
291 }
292 
293 void
294 BloomFilter::generate_unique_salt()
295 {
296  /*
297  Note:
298  A distinct hash function need not be implementation-wise
299  distinct. In the current implementation "seeding" a common
300  hash function with different values seems to be adequate.
301  */
302  const unsigned int predef_salt_count = 128;
303 
304  static const bloom_type predef_salt[predef_salt_count] =
305  {
306  0xAAAAAAAA, 0x55555555, 0x33333333, 0xCCCCCCCC,
307  0x66666666, 0x99999999, 0xB5B5B5B5, 0x4B4B4B4B,
308  0xAA55AA55, 0x55335533, 0x33CC33CC, 0xCC66CC66,
309  0x66996699, 0x99B599B5, 0xB54BB54B, 0x4BAA4BAA,
310  0xAA33AA33, 0x55CC55CC, 0x33663366, 0xCC99CC99,
311  0x66B566B5, 0x994B994B, 0xB5AAB5AA, 0xAAAAAA33,
312  0x555555CC, 0x33333366, 0xCCCCCC99, 0x666666B5,
313  0x9999994B, 0xB5B5B5AA, 0xFFFFFFFF, 0xFFFF0000,
314  0xB823D5EB, 0xC1191CDF, 0xF623AEB3, 0xDB58499F,
315  0xC8D42E70, 0xB173F616, 0xA91A5967, 0xDA427D63,
316  0xB1E8A2EA, 0xF6C0D155, 0x4909FEA3, 0xA68CC6A7,
317  0xC395E782, 0xA26057EB, 0x0CD5DA28, 0x467C5492,
318  0xF15E6982, 0x61C6FAD3, 0x9615E352, 0x6E9E355A,
319  0x689B563E, 0x0C9831A8, 0x6753C18B, 0xA622689B,
320  0x8CA63C47, 0x42CC2884, 0x8E89919B, 0x6EDBD7D3,
321  0x15B6796C, 0x1D6FDFE4, 0x63FF9092, 0xE7401432,
322  0xEFFE9412, 0xAEAEDF79, 0x9F245A31, 0x83C136FC,
323  0xC3DA4A8C, 0xA5112C8C, 0x5271F491, 0x9A948DAB,
324  0xCEE59A8D, 0xB5F525AB, 0x59D13217, 0x24E7C331,
325  0x697C2103, 0x84B0A460, 0x86156DA9, 0xAEF2AC68,
326  0x23243DA5, 0x3F649643, 0x5FA495A8, 0x67710DF8,
327  0x9A6C499E, 0xDCFB0227, 0x46A43433, 0x1832B07A,
328  0xC46AFF3C, 0xB9C8FFF0, 0xC9500467, 0x34431BDF,
329  0xB652432B, 0xE367F12B, 0x427F4C1B, 0x224C006E,
330  0x2E7E5A89, 0x96F99AA5, 0x0BEB452A, 0x2FD87C39,
331  0x74B2E1FB, 0x222EFD24, 0xF357F60C, 0x440FCB1E,
332  0x8BBE030F, 0x6704DC29, 0x1144D12F, 0x948B1355,
333  0x6D8FD7E9, 0x1C11A014, 0xADD1592F, 0xFB3C712E,
334  0xFC77642F, 0xF9C4CE8C, 0x31312FB9, 0x08B0DD79,
335  0x318FA6E7, 0xC040D23D, 0xC0589AA7, 0x0CA5C075,
336  0xF874B172, 0x0CF914D5, 0x784D3280, 0x4E8CFEBC,
337  0xC569F575, 0xCDB2A091, 0x2CC016B4, 0x5C5F4421
338  };
339 
340  if (salt_count_ <= predef_salt_count)
341  {
342  std::copy(predef_salt,
343  predef_salt + salt_count_,
344  std::back_inserter(salt_));
345 
346  for (std::size_t i = 0; i < salt_.size(); ++i)
347  {
348  /*
349  Note:
350  This is done to integrate the user defined random seed,
351  so as to allow for the generation of unique bloom filter
352  instances.
353  */
354  salt_[i] = salt_[i] * salt_[(i + 3) % salt_.size()] + static_cast<bloom_type>(random_seed_);
355  }
356  }
357  else
358  {
359  std::copy(predef_salt, predef_salt + predef_salt_count, std::back_inserter(salt_));
360 
361  srand(static_cast<unsigned int>(random_seed_));
362 
363  while (salt_.size() < salt_count_)
364  {
365  bloom_type current_salt = static_cast<bloom_type>(rand()) * static_cast<bloom_type>(rand());
366 
367  if (0 == current_salt)
368  continue;
369 
370  if (salt_.end() == std::find(salt_.begin(), salt_.end(), current_salt))
371  {
372  salt_.push_back(current_salt);
373  }
374  }
375  }
376 }
377 
378 } // namespace psync::detail
void insert(const ndn::Name &key)
void appendToName(ndn::Name &name) const
Append our bloom filter to the given name.
bool contains(const ndn::Name &key) const
uint32_t murmurHash3(const void *key, size_t len, uint32_t seed)
Definition: util.cpp:58