fzy
terminal fuzzy finder picker
git clone https://9o.is/git/fzy.git
theft_bloom.c
(4621B)
1 #include <string.h>
2 #include <stdio.h>
3
4 #include "theft.h"
5 #include "theft_bloom.h"
6
7 #define DEFAULT_BLOOM_BITS 17
8 #define DEBUG_BLOOM_FILTER 0
9 #define LOG2_BITS_PER_BYTE 3
10 #define HASH_COUNT 4
11
12 static uint8_t get_bits_set_count(uint8_t counts[256], uint8_t byte);
13
14 /* Initialize a bloom filter. */
15 struct theft_bloom *theft_bloom_init(uint8_t bit_size2) {
16 size_t sz = 1 << (bit_size2 - LOG2_BITS_PER_BYTE);
17 struct theft_bloom *b = malloc(sizeof(struct theft_bloom) + sz);
18 if (b) {
19 b->size = sz;
20 b->bit_count = bit_size2;
21 memset(b->bits, 0, sz);
22 }
23 return b;
24 }
25
26 /* Hash data and mark it in the bloom filter. */
27 void theft_bloom_mark(struct theft_bloom *b, uint8_t *data, size_t data_size) {
28 uint64_t hash = theft_hash_onepass(data, data_size);
29 uint8_t bc = b->bit_count;
30 uint64_t mask = (1 << bc) - 1;
31
32 /* Use HASH_COUNT distinct slices of MASK bits as hashes for the bloom filter. */
33 int bit_inc = (64 - bc) / HASH_COUNT;
34
35 for (int i = 0; i < (64 - bc); i += bit_inc) {
36 uint64_t v = (hash & (mask << i)) >> i;
37 size_t offset = v / 8;
38 uint8_t bit = 1 << (v & 0x07);
39 b->bits[offset] |= bit;
40 }
41 }
42
43 /* Check whether the data's hash is in the bloom filter. */
44 bool theft_bloom_check(struct theft_bloom *b, uint8_t *data, size_t data_size) {
45 uint64_t hash = theft_hash_onepass(data, data_size);
46 uint8_t bc = b->bit_count;
47 uint64_t mask = (1 << bc) - 1;
48
49 int bit_inc = (64 - bc) / HASH_COUNT;
50
51 for (int i = 0; i < (64 - bc); i += bit_inc) {
52 uint64_t v = (hash & (mask << i)) >> i;
53 size_t offset = v / 8;
54 uint8_t bit = 1 << (v & 0x07);
55 if (0 == (b->bits[offset] & bit)) { return false; }
56 }
57
58 return true;
59 }
60
61 /* Free the bloom filter. */
62 void theft_bloom_free(struct theft_bloom *b) { free(b); }
63
64 /* Dump the bloom filter's contents. (Debugging.) */
65 void theft_bloom_dump(struct theft_bloom *b) {
66 uint8_t counts[256];
67 memset(counts, 0xFF, sizeof(counts));
68
69 size_t total = 0;
70 uint16_t row_total = 0;
71
72 for (size_t i = 0; i < b->size; i++) {
73 uint8_t count = get_bits_set_count(counts, b->bits[i]);
74 total += count;
75 row_total += count;
76 #if DEBUG_BLOOM_FILTER > 1
77 char c = (count == 0 ? '.' : '0' + count);
78 printf("%c", c);
79 if ((i & 63) == 0 || i == b->size - 1) {
80 printf(" - %2.1f%%\n",
81 (100 * row_total) / (64.0 * 8));
82 row_total = 0;
83 }
84 #endif
85 }
86
87 #if DEBUG_BLOOM_FILTER
88 printf(" -- bloom filter: %zd of %zd bits set (%2d%%)\n",
89 total, 8*b->size, (int)((100.0 * total)/(8.0*b->size)));
90 #endif
91
92 /* If the total number of bits set is > the number of bytes
93 * in the table (i.e., > 1/8 full) then warn the user. */
94 if (total > b->size) {
95 fprintf(stderr, "\nWARNING: bloom filter is %zd%% full, "
96 "larger bloom_bits value recommended.\n",
97 (size_t)((100 * total) / (8 * b->size)));
98 }
99 }
100
101 /* Recommend a bloom filter size for a given number of trials. */
102 uint8_t theft_bloom_recommendation(int trials) {
103 /* With a preferred priority of false positives under 0.1%,
104 * the required number of bits m in the bloom filter is:
105 * m = -lg(0.001)/(lg(2)^2) == 14.378 ~= 14,
106 * so we want an array with 1 << ceil(log2(14*trials)) cells.
107 *
108 * Note: The above formula is for the *ideal* number of hash
109 * functions, but we're using a hardcoded count. It appears to work
110 * well enough in practice, though, and this can be adjusted later
111 * without impacting the API. */
112 #define TRIAL_MULTIPILER 14
113 uint8_t res = DEFAULT_BLOOM_BITS;
114
115 const uint8_t min = THEFT_BLOOM_BITS_MIN - LOG2_BITS_PER_BYTE;
116 const uint8_t max = THEFT_BLOOM_BITS_MAX - LOG2_BITS_PER_BYTE;
117
118 for (uint8_t i = min; i < max; i++) {
119 int32_t v = (1 << i);
120 if (v > (TRIAL_MULTIPILER * trials)) {
121 res = i + LOG2_BITS_PER_BYTE;
122 break;
123 }
124 }
125
126 #if DEBUG_BLOOM_FILTER
127 size_t sz = 1 << (res - LOG2_BITS_PER_BYTE);
128 printf("Using %zd bytes for bloom filter: %d trials -> bit_size2 %u\n",
129 sizeof(struct theft_bloom) + sz, trials, res);
130 #endif
131
132 return res;
133 }
134
135 /* Check a byte->bits set table, and lazily populate it. */
136 static uint8_t get_bits_set_count(uint8_t counts[256], uint8_t byte) {
137 uint8_t v = counts[byte];
138 if (v != 0xFF) { return v; }
139 uint8_t t = 0;
140 for (uint8_t i = 0; i < 8; i++) {
141 if (byte & (1 << i)) { t++; }
142 }
143 counts[byte] = t;
144 return t;
145 }