fzy

terminal fuzzy finder picker

git clone https://9o.is/git/fzy.git

theft_bloom.c

(4621B)


      1 #include <string.h>
      2 #include <stdio.h>
      3 
      4 #include "theft.h"
      5 #include "theft_bloom.h"
      6 
      7 #define DEFAULT_BLOOM_BITS 17
      8 #define DEBUG_BLOOM_FILTER 0
      9 #define LOG2_BITS_PER_BYTE 3
     10 #define HASH_COUNT 4
     11 
     12 static uint8_t get_bits_set_count(uint8_t counts[256], uint8_t byte);
     13 
     14 /* Initialize a bloom filter. */
     15 struct theft_bloom *theft_bloom_init(uint8_t bit_size2) {
     16     size_t sz = 1 << (bit_size2 - LOG2_BITS_PER_BYTE);
     17     struct theft_bloom *b = malloc(sizeof(struct theft_bloom) + sz);
     18     if (b) {
     19         b->size = sz;
     20         b->bit_count = bit_size2;
     21         memset(b->bits, 0, sz);
     22     }
     23     return b;
     24 }
     25 
     26 /* Hash data and mark it in the bloom filter. */
     27 void theft_bloom_mark(struct theft_bloom *b, uint8_t *data, size_t data_size) {
     28     uint64_t hash = theft_hash_onepass(data, data_size);
     29     uint8_t bc = b->bit_count;
     30     uint64_t mask = (1 << bc) - 1;
     31 
     32     /* Use HASH_COUNT distinct slices of MASK bits as hashes for the bloom filter. */
     33     int bit_inc = (64 - bc) / HASH_COUNT;
     34 
     35     for (int i = 0; i < (64 - bc); i += bit_inc) {
     36         uint64_t v = (hash & (mask << i)) >> i;
     37         size_t offset = v / 8;
     38         uint8_t bit = 1 << (v & 0x07);
     39         b->bits[offset] |= bit;
     40     }
     41 }
     42 
     43 /* Check whether the data's hash is in the bloom filter. */
     44 bool theft_bloom_check(struct theft_bloom *b, uint8_t *data, size_t data_size) {
     45     uint64_t hash = theft_hash_onepass(data, data_size);
     46     uint8_t bc = b->bit_count;
     47     uint64_t mask = (1 << bc) - 1;
     48 
     49     int bit_inc = (64 - bc) / HASH_COUNT;
     50 
     51     for (int i = 0; i < (64 - bc); i += bit_inc) {
     52         uint64_t v = (hash & (mask << i)) >> i;
     53         size_t offset = v / 8;
     54         uint8_t bit = 1 << (v & 0x07);
     55         if (0 == (b->bits[offset] & bit)) { return false; }
     56     }
     57 
     58     return true;
     59 }
     60 
     61 /* Free the bloom filter. */
     62 void theft_bloom_free(struct theft_bloom *b) { free(b); }
     63 
     64 /* Dump the bloom filter's contents. (Debugging.) */
     65 void theft_bloom_dump(struct theft_bloom *b) {
     66     uint8_t counts[256];
     67     memset(counts, 0xFF, sizeof(counts));
     68 
     69     size_t total = 0;
     70     uint16_t row_total = 0;
     71     
     72     for (size_t i = 0; i < b->size; i++) {
     73         uint8_t count = get_bits_set_count(counts, b->bits[i]);
     74         total += count;
     75         row_total += count;
     76         #if DEBUG_BLOOM_FILTER > 1
     77         char c = (count == 0 ? '.' : '0' + count);
     78         printf("%c", c);
     79         if ((i & 63) == 0 || i == b->size - 1) {
     80             printf(" - %2.1f%%\n",
     81                 (100 * row_total) / (64.0 * 8));
     82             row_total = 0;
     83         }
     84         #endif
     85     }
     86 
     87     #if DEBUG_BLOOM_FILTER
     88     printf(" -- bloom filter: %zd of %zd bits set (%2d%%)\n",
     89         total, 8*b->size, (int)((100.0 * total)/(8.0*b->size)));
     90     #endif
     91 
     92     /* If the total number of bits set is > the number of bytes
     93      * in the table (i.e., > 1/8 full) then warn the user. */
     94     if (total > b->size) {
     95         fprintf(stderr, "\nWARNING: bloom filter is %zd%% full, "
     96             "larger bloom_bits value recommended.\n",
     97             (size_t)((100 * total) / (8 * b->size)));
     98     }
     99 }
    100 
    101 /* Recommend a bloom filter size for a given number of trials. */
    102 uint8_t theft_bloom_recommendation(int trials) {
    103     /* With a preferred priority of false positives under 0.1%,
    104      * the required number of bits m in the bloom filter is:
    105      *     m = -lg(0.001)/(lg(2)^2) == 14.378 ~= 14,
    106      * so we want an array with 1 << ceil(log2(14*trials)) cells.
    107      *
    108      * Note: The above formula is for the *ideal* number of hash
    109      * functions, but we're using a hardcoded count. It appears to work
    110      * well enough in practice, though, and this can be adjusted later
    111      * without impacting the API. */
    112     #define TRIAL_MULTIPILER 14
    113     uint8_t res = DEFAULT_BLOOM_BITS;
    114 
    115     const uint8_t min = THEFT_BLOOM_BITS_MIN - LOG2_BITS_PER_BYTE;
    116     const uint8_t max = THEFT_BLOOM_BITS_MAX - LOG2_BITS_PER_BYTE;
    117 
    118     for (uint8_t i = min; i < max; i++) {
    119         int32_t v = (1 << i);
    120         if (v > (TRIAL_MULTIPILER * trials)) {
    121             res = i + LOG2_BITS_PER_BYTE;
    122             break;
    123         }
    124     }
    125 
    126     #if DEBUG_BLOOM_FILTER
    127     size_t sz = 1 << (res - LOG2_BITS_PER_BYTE);
    128     printf("Using %zd bytes for bloom filter: %d trials -> bit_size2 %u\n",
    129         sizeof(struct theft_bloom) + sz, trials, res);
    130     #endif
    131 
    132     return res;
    133 }
    134 
    135 /* Check a byte->bits set table, and lazily populate it. */
    136 static uint8_t get_bits_set_count(uint8_t counts[256], uint8_t byte) {
    137     uint8_t v = counts[byte];
    138     if (v != 0xFF) { return v; }
    139     uint8_t t = 0;
    140     for (uint8_t i = 0; i < 8; i++) {
    141         if (byte & (1 << i)) { t++; }
    142     }
    143     counts[byte] = t;
    144     return t;
    145 }