Loading...
Searching...
No Matches
bloom-filter.cpp
Go to the documentation of this file.
1/* -*- Mode:C++; c-file-style:"gnu"; indent-tabs-mode:nil; -*- */
2/*
3 * Copyright (c) 2014-2022, The University of Memphis
4 *
5 * This file is part of PSync.
6 * See AUTHORS.md for complete list of PSync authors and contributors.
7 *
8 * PSync is free software: you can redistribute it and/or modify it under the terms
9 * of the GNU Lesser General Public License as published by the Free Software Foundation,
10 * either version 3 of the License, or (at your option) any later version.
11 *
12 * PSync is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
13 * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
14 * PURPOSE. See the GNU Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public License along with
17 * PSync, e.g., in COPYING.md file. If not, see <http://www.gnu.org/licenses/>.
18 *
19
20 * This file incorporates work covered by the following copyright and
21 * permission notice:
22
23 * The MIT License (MIT)
24
25 * Copyright (c) 2000 Arash Partow
26
27 * Permission is hereby granted, free of charge, to any person obtaining a copy
28 * of this software and associated documentation files (the "Software"), to deal
29 * in the Software without restriction, including without limitation the rights
30 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
31 * copies of the Software, and to permit persons to whom the Software is
32 * furnished to do so, subject to the following conditions:
33
34 * The above copyright notice and this permission notice shall be included in all
35 * copies or substantial portions of the Software.
36
37 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
38 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
39 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
40 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
41 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
42 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
43 * SOFTWARE.
44*/
45
47#include "PSync/detail/util.hpp"
48
49#include <ndn-cxx/util/exception.hpp>
50
51#include <algorithm>
52#include <cmath>
53#include <cstdlib>
54#include <limits>
55
56namespace psync::detail {
57
58// https://github.com/ArashPartow/bloom
59
60static const std::size_t bits_per_char = 0x08; // 8 bits in 1 char(unsigned)
61
62static const unsigned char bit_mask[bits_per_char] = {
63 0x01, //00000001
64 0x02, //00000010
65 0x04, //00000100
66 0x08, //00001000
67 0x10, //00010000
68 0x20, //00100000
69 0x40, //01000000
70 0x80 //10000000
71};
72
73class bloom_parameters
74{
75public:
76
77 bloom_parameters()
78 : minimum_size(1),
79 maximum_size(std::numeric_limits<unsigned int>::max()),
80 minimum_number_of_hashes(1),
81 maximum_number_of_hashes(std::numeric_limits<unsigned int>::max()),
82 projected_element_count(200),
83 false_positive_probability(1.0 / projected_element_count),
84 random_seed(0xA5A5A5A55A5A5A5AULL)
85 {}
86
87 inline bool operator!()
88 {
89 return (minimum_size > maximum_size) ||
90 (minimum_number_of_hashes > maximum_number_of_hashes) ||
91 (minimum_number_of_hashes < 1) ||
92 (0 == maximum_number_of_hashes) ||
93 (0 == projected_element_count) ||
94 (false_positive_probability < 0.0) ||
95 (std::numeric_limits<double>::infinity() == std::abs(false_positive_probability)) ||
96 (0 == random_seed) ||
97 (0xFFFFFFFFFFFFFFFFULL == random_seed);
98 }
99
100 // Allowable min/max size of the bloom filter in bits
101 unsigned int minimum_size;
102 unsigned int maximum_size;
103
104 // Allowable min/max number of hash functions
105 unsigned int minimum_number_of_hashes;
106 unsigned int maximum_number_of_hashes;
107
108 // The approximate number of elements to be inserted
109 // into the bloom filter, should be within one order
110 // of magnitude. The default is 200.
111 unsigned int projected_element_count;
112
113 // The approximate false positive probability expected
114 // from the bloom filter. The default is assumed to be
115 // the reciprocal of the projected_element_count.
116 double false_positive_probability;
117
118 unsigned long long int random_seed;
119
120 struct optimal_parameters_t
121 {
122 optimal_parameters_t()
123 : number_of_hashes(0),
124 table_size(0)
125 {}
126
127 unsigned int number_of_hashes;
128 unsigned int table_size;
129 };
130
131 optimal_parameters_t optimal_parameters;
132
133 bool compute_optimal_parameters()
134 {
135 /*
136 Note:
137 The following will attempt to find the number of hash functions
138 and minimum amount of storage bits required to construct a bloom
139 filter consistent with the user defined false positive probability
140 and estimated element insertion count.
141 */
142
143 if (!(*this))
144 return false;
145
146 double min_m = std::numeric_limits<double>::infinity();
147 double min_k = 0.0;
148 double k = 1.0;
149
150 while (k < 1000.0)
151 {
152 const double numerator = (- k * projected_element_count);
153 const double denominator = std::log(1.0 - std::pow(false_positive_probability, 1.0 / k));
154
155 const double curr_m = numerator / denominator;
156
157 if (curr_m < min_m)
158 {
159 min_m = curr_m;
160 min_k = k;
161 }
162
163 k += 1.0;
164 }
165
166 optimal_parameters_t& optp = optimal_parameters;
167
168 optp.number_of_hashes = static_cast<unsigned int>(min_k);
169
170 optp.table_size = static_cast<unsigned int>(min_m);
171
172 optp.table_size += (((optp.table_size % bits_per_char) != 0) ? (bits_per_char - (optp.table_size % bits_per_char)) : 0);
173
174 if (optp.number_of_hashes < minimum_number_of_hashes)
175 optp.number_of_hashes = minimum_number_of_hashes;
176 else if (optp.number_of_hashes > maximum_number_of_hashes)
177 optp.number_of_hashes = maximum_number_of_hashes;
178
179 if (optp.table_size < minimum_size)
180 optp.table_size = minimum_size;
181 else if (optp.table_size > maximum_size)
182 optp.table_size = maximum_size;
183
184 return true;
185 }
186
187};
188
189
190BloomFilter::BloomFilter(const bloom_parameters& p)
191: projected_element_count_(p.projected_element_count),
192 inserted_element_count_(0),
193 random_seed_((p.random_seed * 0xA5A5A5A5) + 1),
194 desired_false_positive_probability_(p.false_positive_probability)
195{
196 salt_count_ = p.optimal_parameters.number_of_hashes;
197 table_size_ = p.optimal_parameters.table_size;
198
199 generate_unique_salt();
200
201 bit_table_.resize(table_size_ / bits_per_char, static_cast<cell_type>(0x00));
202}
203
204static bloom_parameters
205makeParameters(unsigned int projected_element_count,
206 double false_positive_probability)
207{
208 bloom_parameters p;
209 p.projected_element_count = projected_element_count;
210 p.false_positive_probability = false_positive_probability;
211
212 if (!p.compute_optimal_parameters()) {
213 NDN_THROW(BloomFilter::Error("Bloom filter parameters are not correct!"));
214 }
215 return p;
216}
217
218BloomFilter::BloomFilter(unsigned int projected_element_count,
219 double false_positive_probability)
220 : BloomFilter(makeParameters(projected_element_count, false_positive_probability))
221{
222}
223
224BloomFilter::BloomFilter(unsigned int projected_element_count,
225 double false_positive_probability,
226 const ndn::name::Component& bfName)
227 : BloomFilter(projected_element_count, false_positive_probability)
228{
229 std::vector<cell_type> table(bfName.value_begin(), bfName.value_end());
230 if (table.size() != table_size_ / bits_per_char) {
231 NDN_THROW(Error("Bloom filter cannot be decoded!"));
232 }
233 bit_table_ = std::move(table);
234}
235
236void
237BloomFilter::appendToName(ndn::Name& name) const
238{
239 name.appendNumber(projected_element_count_);
240 name.appendNumber(static_cast<uint64_t>(desired_false_positive_probability_ * 1000));
241 name.append(bit_table_.begin(), bit_table_.end());
242}
243
244void
246{
247 std::fill(bit_table_.begin(), bit_table_.end(), static_cast<cell_type>(0x00));
248 inserted_element_count_ = 0;
249}
250
251void
252BloomFilter::insert(const ndn::Name& key)
253{
254 std::size_t bit_index = 0;
255 std::size_t bit = 0;
256
257 for (std::size_t i = 0; i < salt_.size(); ++i)
258 {
259 compute_indices(murmurHash3(salt_[i], key), bit_index, bit);
260
261 bit_table_[bit_index / bits_per_char] |= bit_mask[bit];
262 }
263
264 ++inserted_element_count_;
265}
266
267bool
268BloomFilter::contains(const ndn::Name& key) const
269{
270 std::size_t bit_index = 0;
271 std::size_t bit = 0;
272
273 for (std::size_t i = 0; i < salt_.size(); ++i)
274 {
275 compute_indices(murmurHash3(salt_[i], key), bit_index, bit);
276
277 if ((bit_table_[bit_index / bits_per_char] & bit_mask[bit]) != bit_mask[bit])
278 {
279 return false;
280 }
281 }
282
283 return true;
284}
285
286void
287BloomFilter::compute_indices(const bloom_type& hash, std::size_t& bit_index, std::size_t& bit) const
288{
289 bit_index = hash % table_size_;
290 bit = bit_index % bits_per_char;
291}
292
293void
294BloomFilter::generate_unique_salt()
295{
296 /*
297 Note:
298 A distinct hash function need not be implementation-wise
299 distinct. In the current implementation "seeding" a common
300 hash function with different values seems to be adequate.
301 */
302 const unsigned int predef_salt_count = 128;
303
304 static const bloom_type predef_salt[predef_salt_count] =
305 {
306 0xAAAAAAAA, 0x55555555, 0x33333333, 0xCCCCCCCC,
307 0x66666666, 0x99999999, 0xB5B5B5B5, 0x4B4B4B4B,
308 0xAA55AA55, 0x55335533, 0x33CC33CC, 0xCC66CC66,
309 0x66996699, 0x99B599B5, 0xB54BB54B, 0x4BAA4BAA,
310 0xAA33AA33, 0x55CC55CC, 0x33663366, 0xCC99CC99,
311 0x66B566B5, 0x994B994B, 0xB5AAB5AA, 0xAAAAAA33,
312 0x555555CC, 0x33333366, 0xCCCCCC99, 0x666666B5,
313 0x9999994B, 0xB5B5B5AA, 0xFFFFFFFF, 0xFFFF0000,
314 0xB823D5EB, 0xC1191CDF, 0xF623AEB3, 0xDB58499F,
315 0xC8D42E70, 0xB173F616, 0xA91A5967, 0xDA427D63,
316 0xB1E8A2EA, 0xF6C0D155, 0x4909FEA3, 0xA68CC6A7,
317 0xC395E782, 0xA26057EB, 0x0CD5DA28, 0x467C5492,
318 0xF15E6982, 0x61C6FAD3, 0x9615E352, 0x6E9E355A,
319 0x689B563E, 0x0C9831A8, 0x6753C18B, 0xA622689B,
320 0x8CA63C47, 0x42CC2884, 0x8E89919B, 0x6EDBD7D3,
321 0x15B6796C, 0x1D6FDFE4, 0x63FF9092, 0xE7401432,
322 0xEFFE9412, 0xAEAEDF79, 0x9F245A31, 0x83C136FC,
323 0xC3DA4A8C, 0xA5112C8C, 0x5271F491, 0x9A948DAB,
324 0xCEE59A8D, 0xB5F525AB, 0x59D13217, 0x24E7C331,
325 0x697C2103, 0x84B0A460, 0x86156DA9, 0xAEF2AC68,
326 0x23243DA5, 0x3F649643, 0x5FA495A8, 0x67710DF8,
327 0x9A6C499E, 0xDCFB0227, 0x46A43433, 0x1832B07A,
328 0xC46AFF3C, 0xB9C8FFF0, 0xC9500467, 0x34431BDF,
329 0xB652432B, 0xE367F12B, 0x427F4C1B, 0x224C006E,
330 0x2E7E5A89, 0x96F99AA5, 0x0BEB452A, 0x2FD87C39,
331 0x74B2E1FB, 0x222EFD24, 0xF357F60C, 0x440FCB1E,
332 0x8BBE030F, 0x6704DC29, 0x1144D12F, 0x948B1355,
333 0x6D8FD7E9, 0x1C11A014, 0xADD1592F, 0xFB3C712E,
334 0xFC77642F, 0xF9C4CE8C, 0x31312FB9, 0x08B0DD79,
335 0x318FA6E7, 0xC040D23D, 0xC0589AA7, 0x0CA5C075,
336 0xF874B172, 0x0CF914D5, 0x784D3280, 0x4E8CFEBC,
337 0xC569F575, 0xCDB2A091, 0x2CC016B4, 0x5C5F4421
338 };
339
340 if (salt_count_ <= predef_salt_count)
341 {
342 std::copy(predef_salt,
343 predef_salt + salt_count_,
344 std::back_inserter(salt_));
345
346 for (std::size_t i = 0; i < salt_.size(); ++i)
347 {
348 /*
349 Note:
350 This is done to integrate the user defined random seed,
351 so as to allow for the generation of unique bloom filter
352 instances.
353 */
354 salt_[i] = salt_[i] * salt_[(i + 3) % salt_.size()] + static_cast<bloom_type>(random_seed_);
355 }
356 }
357 else
358 {
359 std::copy(predef_salt, predef_salt + predef_salt_count, std::back_inserter(salt_));
360
361 srand(static_cast<unsigned int>(random_seed_));
362
363 while (salt_.size() < salt_count_)
364 {
365 bloom_type current_salt = static_cast<bloom_type>(rand()) * static_cast<bloom_type>(rand());
366
367 if (0 == current_salt)
368 continue;
369
370 if (salt_.end() == std::find(salt_.begin(), salt_.end(), current_salt))
371 {
372 salt_.push_back(current_salt);
373 }
374 }
375 }
376}
377
378} // namespace psync::detail
void insert(const ndn::Name &key)
void appendToName(ndn::Name &name) const
Append our bloom filter to the given name.
bool contains(const ndn::Name &key) const
uint32_t murmurHash3(const void *key, size_t len, uint32_t seed)
Definition util.cpp:58