bloom-filter.cpp
Go to the documentation of this file.
1 /* -*- Mode:C++; c-file-style:"gnu"; indent-tabs-mode:nil; -*- */
2 /*
3  * Copyright (c) 2014-2019, The University of Memphis
4  *
5  * This file is part of PSync.
6  * See AUTHORS.md for complete list of PSync authors and contributors.
7  *
8  * PSync is free software: you can redistribute it and/or modify it under the terms
9  * of the GNU Lesser General Public License as published by the Free Software Foundation,
10  * either version 3 of the License, or (at your option) any later version.
11  *
12  * PSync is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
13  * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
14  * PURPOSE. See the GNU Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public License along with
17  * PSync, e.g., in COPYING.md file. If not, see <http://www.gnu.org/licenses/>.
18  *
19 
20  * This file incorporates work covered by the following copyright and
21  * permission notice:
22 
23  * The MIT License (MIT)
24 
25  * Copyright (c) 2000 Arash Partow
26 
27  * Permission is hereby granted, free of charge, to any person obtaining a copy
28  * of this software and associated documentation files (the "Software"), to deal
29  * in the Software without restriction, including without limitation the rights
30  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
31  * copies of the Software, and to permit persons to whom the Software is
32  * furnished to do so, subject to the following conditions:
33 
34  * The above copyright notice and this permission notice shall be included in all
35  * copies or substantial portions of the Software.
36 
37  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
38  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
39  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
40  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
41  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
42  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
43  * SOFTWARE.
44 */
45 
47 #include "PSync/detail/util.hpp"
48 
49 #include <ndn-cxx/util/logger.hpp>
50 
51 #include <algorithm>
52 #include <cmath>
53 #include <cstddef>
54 #include <iterator>
55 #include <limits>
56 #include <cstdlib>
57 
58 // https://github.com/ArashPartow/bloom
59 
60 NDN_LOG_INIT(psync.BloomFilter);
61 
62 namespace psync {
63 
64 static const std::size_t bits_per_char = 0x08;
65 static const unsigned char bit_mask[bits_per_char] = {
66  0x01, //00000001
67  0x02, //00000010
68  0x04, //00000100
69  0x08, //00001000
70  0x10, //00010000
71  0x20, //00100000
72  0x40, //01000000
73  0x80 //10000000
74 };
75 
77 : minimum_size(1)
78 , maximum_size(std::numeric_limits<unsigned int>::max())
79 , minimum_number_of_hashes(1)
80 , maximum_number_of_hashes(std::numeric_limits<unsigned int>::max())
81 , projected_element_count(200)
82 , false_positive_probability(1.0 / projected_element_count)
83 , random_seed(0xA5A5A5A55A5A5A5AULL)
84 {}
85 
86 bool
88 {
89  if (!(*this)) {
90  return false;
91  }
92 
93  double min_m = std::numeric_limits<double>::infinity();
94  double min_k = 0.0;
95  double curr_m = 0.0;
96  double k = 1.0;
97 
98  while (k < 1000.0)
99  {
100  double numerator = (- k * projected_element_count);
101  double denominator = std::log(1.0 - std::pow(false_positive_probability, 1.0 / k));
102  curr_m = numerator / denominator;
103  if (curr_m < min_m)
104  {
105  min_m = curr_m;
106  min_k = k;
107  }
108  k += 1.0;
109  }
110 
112 
113  optp.number_of_hashes = static_cast<unsigned int>(min_k);
114  optp.table_size = static_cast<unsigned int>(min_m);
115  optp.table_size += (((optp.table_size % bits_per_char) != 0) ? (bits_per_char - (optp.table_size % bits_per_char)) : 0);
116 
121 
122  if (optp.table_size < minimum_size)
123  optp.table_size = minimum_size;
124  else if (optp.table_size > maximum_size)
125  optp.table_size = maximum_size;
126 
127  return true;
128 }
129 
131  : bit_table_(0)
132  , salt_count_(0)
133  , table_size_(0)
134  , raw_table_size_(0)
135  , projected_element_count_(0)
136  , inserted_element_count_(0)
137  , random_seed_(0)
138  , desired_false_positive_probability_(0.0)
139 {}
140 
142  : bit_table_(0)
143  , projected_element_count_(p.projected_element_count)
144  , inserted_element_count_(0)
145  , random_seed_((p.random_seed * 0xA5A5A5A5) + 1)
146  , desired_false_positive_probability_(p.false_positive_probability)
147 {
148  salt_count_ = p.optimal_parameters.number_of_hashes;
149  table_size_ = p.optimal_parameters.table_size;
150  generate_unique_salt();
151  raw_table_size_ = table_size_ / bits_per_char;
152  //bit_table_ = new cell_type[static_cast<std::size_t>(raw_table_size_)];
153  bit_table_.resize(static_cast<std::size_t>(raw_table_size_), 0x00);
154 }
155 
156 BloomFilter::BloomFilter(unsigned int projected_element_count,
157  double false_positive_probability)
158  : BloomFilter(getParameters(projected_element_count, false_positive_probability))
159 {
160 }
161 
162 BloomFilter::BloomFilter(unsigned int projected_element_count,
163  double false_positive_probability,
164  const ndn::name::Component& bfName)
165  : BloomFilter(projected_element_count, false_positive_probability)
166 {
167  std::vector<BloomFilter::cell_type> table(bfName.value_begin(), bfName.value_end());
168 
169  if (table.size() != raw_table_size_) {
170  BOOST_THROW_EXCEPTION(Error("Received BloomFilter cannot be decoded!"));
171  }
172  bit_table_ = table;
173 }
174 
176 BloomFilter::getParameters(unsigned int projected_element_count,
177  double false_positive_probability)
178 {
179  BloomParameters opt;
180  opt.false_positive_probability = false_positive_probability;
181  opt.projected_element_count = projected_element_count;
182 
183  if (!opt) {
184  NDN_LOG_WARN("Bloom parameters are not correct!");
185  }
186 
188  return opt;
189 }
190 
191 void
192 BloomFilter::appendToName(ndn::Name& name) const
193 {
194  name.appendNumber(projected_element_count_);
195  name.appendNumber((int)(desired_false_positive_probability_ * 1000));
196  name.append(bit_table_.begin(), bit_table_.end());
197 }
198 
199 void
201 {
202  bit_table_.resize(static_cast<std::size_t>(raw_table_size_), 0x00);
203  inserted_element_count_ = 0;
204 }
205 
206 void
207 BloomFilter::insert(const std::string& key)
208 {
209  std::size_t bit_index = 0;
210  std::size_t bit = 0;
211  for (std::size_t i = 0; i < salt_.size(); ++i)
212  {
213  compute_indices(murmurHash3(salt_[i], key), bit_index, bit);
214  bit_table_[bit_index/bits_per_char] |= bit_mask[bit];
215  }
216  ++inserted_element_count_;
217 }
218 
219 bool
220 BloomFilter::contains(const std::string& key) const
221 {
222  std::size_t bit_index = 0;
223  std::size_t bit = 0;
224 
225  for (std::size_t i = 0; i < salt_.size(); ++i)
226  {
227  compute_indices(murmurHash3(salt_[i], key), bit_index, bit);
228  if ((bit_table_[bit_index/bits_per_char] & bit_mask[bit]) != bit_mask[bit]) {
229  return false;
230  }
231  }
232 
233  return true;
234 }
235 
236 std::vector <BloomFilter::cell_type>
238 {
239  return bit_table_;
240 }
241 
242 void
243 BloomFilter::generate_unique_salt()
244 {
245  /*
246  Note:
247  A distinct hash function need not be implementation-wise
248  distinct. In the current implementation "seeding" a common
249  hash function with different values seems to be adequate.
250  */
251  const unsigned int predef_salt_count = 128;
252  static const bloom_type predef_salt[predef_salt_count] =
253  {
254  0xAAAAAAAA, 0x55555555, 0x33333333, 0xCCCCCCCC,
255  0x66666666, 0x99999999, 0xB5B5B5B5, 0x4B4B4B4B,
256  0xAA55AA55, 0x55335533, 0x33CC33CC, 0xCC66CC66,
257  0x66996699, 0x99B599B5, 0xB54BB54B, 0x4BAA4BAA,
258  0xAA33AA33, 0x55CC55CC, 0x33663366, 0xCC99CC99,
259  0x66B566B5, 0x994B994B, 0xB5AAB5AA, 0xAAAAAA33,
260  0x555555CC, 0x33333366, 0xCCCCCC99, 0x666666B5,
261  0x9999994B, 0xB5B5B5AA, 0xFFFFFFFF, 0xFFFF0000,
262  0xB823D5EB, 0xC1191CDF, 0xF623AEB3, 0xDB58499F,
263  0xC8D42E70, 0xB173F616, 0xA91A5967, 0xDA427D63,
264  0xB1E8A2EA, 0xF6C0D155, 0x4909FEA3, 0xA68CC6A7,
265  0xC395E782, 0xA26057EB, 0x0CD5DA28, 0x467C5492,
266  0xF15E6982, 0x61C6FAD3, 0x9615E352, 0x6E9E355A,
267  0x689B563E, 0x0C9831A8, 0x6753C18B, 0xA622689B,
268  0x8CA63C47, 0x42CC2884, 0x8E89919B, 0x6EDBD7D3,
269  0x15B6796C, 0x1D6FDFE4, 0x63FF9092, 0xE7401432,
270  0xEFFE9412, 0xAEAEDF79, 0x9F245A31, 0x83C136FC,
271  0xC3DA4A8C, 0xA5112C8C, 0x5271F491, 0x9A948DAB,
272  0xCEE59A8D, 0xB5F525AB, 0x59D13217, 0x24E7C331,
273  0x697C2103, 0x84B0A460, 0x86156DA9, 0xAEF2AC68,
274  0x23243DA5, 0x3F649643, 0x5FA495A8, 0x67710DF8,
275  0x9A6C499E, 0xDCFB0227, 0x46A43433, 0x1832B07A,
276  0xC46AFF3C, 0xB9C8FFF0, 0xC9500467, 0x34431BDF,
277  0xB652432B, 0xE367F12B, 0x427F4C1B, 0x224C006E,
278  0x2E7E5A89, 0x96F99AA5, 0x0BEB452A, 0x2FD87C39,
279  0x74B2E1FB, 0x222EFD24, 0xF357F60C, 0x440FCB1E,
280  0x8BBE030F, 0x6704DC29, 0x1144D12F, 0x948B1355,
281  0x6D8FD7E9, 0x1C11A014, 0xADD1592F, 0xFB3C712E,
282  0xFC77642F, 0xF9C4CE8C, 0x31312FB9, 0x08B0DD79,
283  0x318FA6E7, 0xC040D23D, 0xC0589AA7, 0x0CA5C075,
284  0xF874B172, 0x0CF914D5, 0x784D3280, 0x4E8CFEBC,
285  0xC569F575, 0xCDB2A091, 0x2CC016B4, 0x5C5F4421
286  };
287 
288  if (salt_count_ <= predef_salt_count)
289  {
290  std::copy(predef_salt,
291  predef_salt + salt_count_,
292  std::back_inserter(salt_));
293  for (unsigned int i = 0; i < salt_.size(); ++i)
294  {
295  salt_[i] = salt_[i] * salt_[(i + 3) % salt_.size()] + static_cast<bloom_type>(random_seed_);
296  }
297  }
298  else
299  {
300  std::copy(predef_salt,predef_salt + predef_salt_count,std::back_inserter(salt_));
301  srand(static_cast<unsigned int>(random_seed_));
302  while (salt_.size() < salt_count_)
303  {
304  bloom_type current_salt = static_cast<bloom_type>(rand()) * static_cast<bloom_type>(rand());
305  if (0 == current_salt) continue;
306  if (salt_.end() == std::find(salt_.begin(), salt_.end(), current_salt))
307  {
308  salt_.push_back(current_salt);
309  }
310  }
311  }
312 }
313 
314 void
315 BloomFilter::compute_indices(const bloom_type& hash,
316  std::size_t& bit_index, std::size_t& bit) const
317 {
318  bit_index = hash % table_size_;
319  bit = bit_index % bits_per_char;
320 }
321 
322 bool
323 operator==(const BloomFilter& bf1, const BloomFilter& bf2)
324 {
325  auto table1 = bf1.table();
326  auto table2 = bf2.table();
327 
328  if (table1.size() != table2.size()) {
329  return false;
330  }
331 
332  for (size_t i = 0; i < table1.size(); i++) {
333  if (table1[i] != table2[i]) {
334  return false;
335  }
336  }
337  return true;
338 }
339 
340 std::ostream&
341 operator<<(std::ostream& out, const BloomFilter& bf)
342 {
343  for (const auto& element : bf.table()) {
344  out << unsigned(element);
345  }
346  return out;
347 }
348 
349 } // namespace psync
void appendToName(ndn::Name &name) const
Append our bloom filter to the given name.
static const unsigned char bit_mask[bits_per_char]
unsigned int minimum_size
bool contains(const std::string &key) const
uint32_t murmurHash3(uint32_t nHashSeed, const std::vector< unsigned char > &vDataToHash)
Definition: util.cpp:37
STL namespace.
unsigned int minimum_number_of_hashes
bool operator==(const BloomFilter &bf1, const BloomFilter &bf2)
unsigned int projected_element_count
static const std::size_t bits_per_char
unsigned int maximum_number_of_hashes
void insert(const std::string &key)
std::ostream & operator<<(std::ostream &out, const BloomFilter &bf)
unsigned int maximum_size
std::vector< cell_type > table() const
BloomParameters getParameters(unsigned int projected_element_count, double false_positive_probability)
optimal_parameters_t optimal_parameters