001/*
002 * Copyright (C) 2011 The Guava Authors
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
005 * in compliance with the License. You may obtain a copy of the License at
006 *
007 * http://www.apache.org/licenses/LICENSE-2.0
008 *
009 * Unless required by applicable law or agreed to in writing, software distributed under the License
010 * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
011 * or implied. See the License for the specific language governing permissions and limitations under
012 * the License.
013 */
014
015package com.google.common.hash;
016
017import static com.google.common.base.Preconditions.checkArgument;
018import static com.google.common.base.Preconditions.checkNotNull;
019
020import com.google.common.annotations.Beta;
021import com.google.common.annotations.VisibleForTesting;
022import com.google.common.base.Objects;
023import com.google.common.base.Predicate;
024import com.google.common.hash.BloomFilterStrategies.BitArray;
025import com.google.common.primitives.SignedBytes;
026import com.google.common.primitives.UnsignedBytes;
027
028import java.io.DataInputStream;
029import java.io.DataOutputStream;
030import java.io.IOException;
031import java.io.InputStream;
032import java.io.OutputStream;
033import java.io.Serializable;
034
035import javax.annotation.Nullable;
036
037/**
038 * A Bloom filter for instances of {@code T}. A Bloom filter offers an approximate containment test
039 * with one-sided error: if it claims that an element is contained in it, this might be in error,
040 * but if it claims that an element is <i>not</i> contained in it, then this is definitely true.
041 *
042 * <p>If you are unfamiliar with Bloom filters, this nice
043 * <a href="http://llimllib.github.com/bloomfilter-tutorial/">tutorial</a> may help you understand
044 * how they work.
045 *
046 * <p>The false positive probability ({@code FPP}) of a bloom filter is defined as the probability
047 * that {@linkplain #mightContain(Object)} will erroneously return {@code true} for an object that
048 * has not actually been put in the {@code BloomFilter}.
049 *
050 * <p>Bloom filters are serializable. They also support a more compact serial representation via
051 * the {@link #writeTo} and {@link #readFrom} methods. Both serialized forms will continue to be
052 * supported by future versions of this library. However, serial forms generated by newer versions
053 * of the code may not be readable by older versions of the code (e.g., a serialized bloom filter
054 * generated today may <i>not</i> be readable by a binary that was compiled 6 months ago).
055 *
056 * @param <T> the type of instances that the {@code BloomFilter} accepts
057 * @author Dimitris Andreou
058 * @author Kevin Bourrillion
059 * @since 11.0
060 */
061@Beta
062public final class BloomFilter<T> implements Predicate<T>, Serializable {
063  /**
064   * A strategy to translate T instances, to {@code numHashFunctions} bit indexes.
065   *
066   * <p>Implementations should be collections of pure functions (i.e. stateless).
067   */
068  interface Strategy extends java.io.Serializable {
069
070    /**
071     * Sets {@code numHashFunctions} bits of the given bit array, by hashing a user element.
072     *
073     * <p>Returns whether any bits changed as a result of this operation.
074     */
075    <T> boolean put(T object, Funnel<? super T> funnel, int numHashFunctions, BitArray bits);
076
077    /**
078     * Queries {@code numHashFunctions} bits of the given bit array, by hashing a user element;
079     * returns {@code true} if and only if all selected bits are set.
080     */
081    <T> boolean mightContain(
082        T object, Funnel<? super T> funnel, int numHashFunctions, BitArray bits);
083
084    /**
085     * Identifier used to encode this strategy, when marshalled as part of a BloomFilter.
086     * Only values in the [-128, 127] range are valid for the compact serial form.
087     * Non-negative values are reserved for enums defined in BloomFilterStrategies;
088     * negative values are reserved for any custom, stateful strategy we may define
089     * (e.g. any kind of strategy that would depend on user input).
090     */
091    int ordinal();
092  }
093
094  /** The bit set of the BloomFilter (not necessarily power of 2!)*/
095  private final BitArray bits;
096
097  /** Number of hashes per element */
098  private final int numHashFunctions;
099
100  /** The funnel to translate Ts to bytes */
101  private final Funnel<? super T> funnel;
102
103  /**
104   * The strategy we employ to map an element T to {@code numHashFunctions} bit indexes.
105   */
106  private final Strategy strategy;
107
108  /**
109   * Creates a BloomFilter.
110   */
111  private BloomFilter(BitArray bits, int numHashFunctions, Funnel<? super T> funnel,
112      Strategy strategy) {
113    checkArgument(numHashFunctions > 0,
114        "numHashFunctions (%s) must be > 0", numHashFunctions);
115    checkArgument(numHashFunctions <= 255,
116        "numHashFunctions (%s) must be <= 255", numHashFunctions);
117    this.bits = checkNotNull(bits);
118    this.numHashFunctions = numHashFunctions;
119    this.funnel = checkNotNull(funnel);
120    this.strategy = checkNotNull(strategy);
121  }
122
123  /**
124   * Creates a new {@code BloomFilter} that's a copy of this instance. The new instance is equal to
125   * this instance but shares no mutable state.
126   *
127   * @since 12.0
128   */
129  public BloomFilter<T> copy() {
130    return new BloomFilter<T>(bits.copy(), numHashFunctions, funnel, strategy);
131  }
132
133  /**
134   * Returns {@code true} if the element <i>might</i> have been put in this Bloom filter,
135   * {@code false} if this is <i>definitely</i> not the case.
136   */
137  public boolean mightContain(T object) {
138    return strategy.mightContain(object, funnel, numHashFunctions, bits);
139  }
140
141  /**
142   * @deprecated Provided only to satisfy the {@link Predicate} interface; use {@link #mightContain}
143   *     instead.
144   */
145  @Deprecated
146  @Override
147  public boolean apply(T input) {
148    return mightContain(input);
149  }
150
151  /**
152   * Puts an element into this {@code BloomFilter}. Ensures that subsequent invocations of
153   * {@link #mightContain(Object)} with the same element will always return {@code true}.
154   *
155   * @return true if the bloom filter's bits changed as a result of this operation. If the bits
156   *     changed, this is <i>definitely</i> the first time {@code object} has been added to the
157   *     filter. If the bits haven't changed, this <i>might</i> be the first time {@code object}
158   *     has been added to the filter. Note that {@code put(t)} always returns the
159   *     <i>opposite</i> result to what {@code mightContain(t)} would have returned at the time
160   *     it is called."
161   * @since 12.0 (present in 11.0 with {@code void} return type})
162   */
163  public boolean put(T object) {
164    return strategy.put(object, funnel, numHashFunctions, bits);
165  }
166
167  /**
168   * Returns the probability that {@linkplain #mightContain(Object)} will erroneously return
169   * {@code true} for an object that has not actually been put in the {@code BloomFilter}.
170   *
171   * <p>Ideally, this number should be close to the {@code fpp} parameter
172   * passed in {@linkplain #create(Funnel, int, double)}, or smaller. If it is
173   * significantly higher, it is usually the case that too many elements (more than
174   * expected) have been put in the {@code BloomFilter}, degenerating it.
175   *
176   * @since 14.0 (since 11.0 as expectedFalsePositiveProbability())
177   */
178  public double expectedFpp() {
179    // You down with FPP? (Yeah you know me!) Who's down with FPP? (Every last homie!)
180    return Math.pow((double) bits.bitCount() / bitSize(), numHashFunctions);
181  }
182
183  /**
184   * Returns the number of bits in the underlying bit array.
185   */
186  @VisibleForTesting long bitSize() {
187    return bits.bitSize();
188  }
189
190  /**
191   * Determines whether a given bloom filter is compatible with this bloom filter. For two
192   * bloom filters to be compatible, they must:
193   *
194   * <ul>
195   * <li>not be the same instance
196   * <li>have the same number of hash functions
197   * <li>have the same bit size
198   * <li>have the same strategy
199   * <li>have equal funnels
200   * <ul>
201   *
202   * @param that The bloom filter to check for compatibility.
203   * @since 15.0
204   */
205  public boolean isCompatible(BloomFilter<T> that) {
206    checkNotNull(that);
207    return (this != that) &&
208        (this.numHashFunctions == that.numHashFunctions) &&
209        (this.bitSize() == that.bitSize()) &&
210        (this.strategy.equals(that.strategy)) &&
211        (this.funnel.equals(that.funnel));
212  }
213
214  /**
215   * Combines this bloom filter with another bloom filter by performing a bitwise OR of the
216   * underlying data. The mutations happen to <b>this</b> instance. Callers must ensure the
217   * bloom filters are appropriately sized to avoid saturating them.
218   *
219   * @param that The bloom filter to combine this bloom filter with. It is not mutated.
220   * @throws IllegalArgumentException if {@code isCompatible(that) == false}
221   *
222   * @since 15.0
223   */
224  public void putAll(BloomFilter<T> that) {
225    checkNotNull(that);
226    checkArgument(this != that, "Cannot combine a BloomFilter with itself.");
227    checkArgument(this.numHashFunctions == that.numHashFunctions,
228        "BloomFilters must have the same number of hash functions (%s != %s)",
229        this.numHashFunctions, that.numHashFunctions);
230    checkArgument(this.bitSize() == that.bitSize(),
231        "BloomFilters must have the same size underlying bit arrays (%s != %s)",
232        this.bitSize(), that.bitSize());
233    checkArgument(this.strategy.equals(that.strategy),
234        "BloomFilters must have equal strategies (%s != %s)",
235        this.strategy, that.strategy);
236    checkArgument(this.funnel.equals(that.funnel),
237        "BloomFilters must have equal funnels (%s != %s)",
238        this.funnel, that.funnel);
239    this.bits.putAll(that.bits);
240  }
241
242  @Override
243  public boolean equals(@Nullable Object object) {
244    if (object == this) {
245      return true;
246    }
247    if (object instanceof BloomFilter) {
248      BloomFilter<?> that = (BloomFilter<?>) object;
249      return this.numHashFunctions == that.numHashFunctions
250          && this.funnel.equals(that.funnel)
251          && this.bits.equals(that.bits)
252          && this.strategy.equals(that.strategy);
253    }
254    return false;
255  }
256
257  @Override
258  public int hashCode() {
259    return Objects.hashCode(numHashFunctions, funnel, strategy, bits);
260  }
261
262  private static final Strategy DEFAULT_STRATEGY =
263      BloomFilterStrategies.MURMUR128_MITZ_64;
264
265  /**
266   * Creates a {@link BloomFilter BloomFilter<T>} with the expected number of
267   * insertions and expected false positive probability.
268   *
269   * <p>Note that overflowing a {@code BloomFilter} with significantly more elements
270   * than specified, will result in its saturation, and a sharp deterioration of its
271   * false positive probability.
272   *
273   * <p>The constructed {@code BloomFilter<T>} will be serializable if the provided
274   * {@code Funnel<T>} is.
275   *
276   * <p>It is recommended that the funnel be implemented as a Java enum. This has the
277   * benefit of ensuring proper serialization and deserialization, which is important
278   * since {@link #equals} also relies on object identity of funnels.
279   *
280   * @param funnel the funnel of T's that the constructed {@code BloomFilter<T>} will use
281   * @param expectedInsertions the number of expected insertions to the constructed
282   *     {@code BloomFilter<T>}; must be positive
283   * @param fpp the desired false positive probability (must be positive and less than 1.0)
284   * @return a {@code BloomFilter}
285   */
286  public static <T> BloomFilter<T> create(
287      Funnel<? super T> funnel, int expectedInsertions /* n */, double fpp) {
288    return create(funnel, expectedInsertions, fpp, DEFAULT_STRATEGY);
289  }
290
291  @VisibleForTesting
292  static <T> BloomFilter<T> create(
293      Funnel<? super T> funnel, int expectedInsertions /* n */, double fpp, Strategy strategy) {
294    checkNotNull(funnel);
295    checkArgument(expectedInsertions >= 0, "Expected insertions (%s) must be >= 0",
296        expectedInsertions);
297    checkArgument(fpp > 0.0, "False positive probability (%s) must be > 0.0", fpp);
298    checkArgument(fpp < 1.0, "False positive probability (%s) must be < 1.0", fpp);
299    checkNotNull(strategy);
300
301    if (expectedInsertions == 0) {
302      expectedInsertions = 1;
303    }
304    /*
305     * TODO(user): Put a warning in the javadoc about tiny fpp values,
306     * since the resulting size is proportional to -log(p), but there is not
307     * much of a point after all, e.g. optimalM(1000, 0.0000000000000001) = 76680
308     * which is less than 10kb. Who cares!
309     */
310    long numBits = optimalNumOfBits(expectedInsertions, fpp);
311    int numHashFunctions = optimalNumOfHashFunctions(expectedInsertions, numBits);
312    try {
313      return new BloomFilter<T>(new BitArray(numBits), numHashFunctions, funnel, strategy);
314    } catch (IllegalArgumentException e) {
315      throw new IllegalArgumentException("Could not create BloomFilter of " + numBits + " bits", e);
316    }
317  }
318
319  /**
320   * Creates a {@link BloomFilter BloomFilter<T>} with the expected number of
321   * insertions and a default expected false positive probability of 3%.
322   *
323   * <p>Note that overflowing a {@code BloomFilter} with significantly more elements
324   * than specified, will result in its saturation, and a sharp deterioration of its
325   * false positive probability.
326   *
327   * <p>The constructed {@code BloomFilter<T>} will be serializable if the provided
328   * {@code Funnel<T>} is.
329   *
330   * @param funnel the funnel of T's that the constructed {@code BloomFilter<T>} will use
331   * @param expectedInsertions the number of expected insertions to the constructed
332   *     {@code BloomFilter<T>}; must be positive
333   * @return a {@code BloomFilter}
334   */
335  public static <T> BloomFilter<T> create(
336      Funnel<? super T> funnel, int expectedInsertions /* n */) {
337    return create(funnel, expectedInsertions, 0.03); // FYI, for 3%, we always get 5 hash functions
338  }
339
340  /*
341   * Cheat sheet:
342   *
343   * m: total bits
344   * n: expected insertions
345   * b: m/n, bits per insertion
346   * p: expected false positive probability
347   *
348   * 1) Optimal k = b * ln2
349   * 2) p = (1 - e ^ (-kn/m))^k
350   * 3) For optimal k: p = 2 ^ (-k) ~= 0.6185^b
351   * 4) For optimal k: m = -nlnp / ((ln2) ^ 2)
352   */
353
354  /**
355   * Computes the optimal k (number of hashes per element inserted in Bloom filter), given the
356   * expected insertions and total number of bits in the Bloom filter.
357   *
358   * See http://en.wikipedia.org/wiki/File:Bloom_filter_fp_probability.svg for the formula.
359   *
360   * @param n expected insertions (must be positive)
361   * @param m total number of bits in Bloom filter (must be positive)
362   */
363  @VisibleForTesting
364  static int optimalNumOfHashFunctions(long n, long m) {
365    // (m / n) * log(2), but avoid truncation due to division!
366    return Math.max(1, (int) Math.round((double) m / n * Math.log(2)));
367  }
368
369  /**
370   * Computes m (total bits of Bloom filter) which is expected to achieve, for the specified
371   * expected insertions, the required false positive probability.
372   *
373   * See http://en.wikipedia.org/wiki/Bloom_filter#Probability_of_false_positives for the formula.
374   *
375   * @param n expected insertions (must be positive)
376   * @param p false positive rate (must be 0 < p < 1)
377   */
378  @VisibleForTesting
379  static long optimalNumOfBits(long n, double p) {
380    if (p == 0) {
381      p = Double.MIN_VALUE;
382    }
383    return (long) (-n * Math.log(p) / (Math.log(2) * Math.log(2)));
384  }
385
386  private Object writeReplace() {
387    return new SerialForm<T>(this);
388  }
389
390  private static class SerialForm<T> implements Serializable {
391    final long[] data;
392    final int numHashFunctions;
393    final Funnel<? super T> funnel;
394    final Strategy strategy;
395
396    SerialForm(BloomFilter<T> bf) {
397      this.data = bf.bits.data;
398      this.numHashFunctions = bf.numHashFunctions;
399      this.funnel = bf.funnel;
400      this.strategy = bf.strategy;
401    }
402    Object readResolve() {
403      return new BloomFilter<T>(new BitArray(data), numHashFunctions, funnel, strategy);
404    }
405    private static final long serialVersionUID = 1;
406  }
407
408  /**
409   * Writes this {@code BloomFilter} to an output stream, with a custom format (not Java
410   * serialization). This has been measured to save at least 400 bytes compared to regular
411   * serialization.
412   *
413   * <p>Use {@linkplain #readFrom(InputStream, Funnel)} to reconstruct the written BloomFilter.
414   */
415  public void writeTo(OutputStream out) throws IOException {
416    /*
417     * Serial form:
418     * 1 signed byte for the strategy
419     * 1 unsigned byte for the number of hash functions
420     * 1 big endian int, the number of longs in our bitset
421     * N big endian longs of our bitset
422     */
423    DataOutputStream dout = new DataOutputStream(out);
424    dout.writeByte(SignedBytes.checkedCast(strategy.ordinal()));
425    dout.writeByte(UnsignedBytes.checkedCast(numHashFunctions)); // note: checked at the c'tor
426    dout.writeInt(bits.data.length);
427    for (long value : bits.data) {
428      dout.writeLong(value);
429    }
430  }
431
432  /**
433   * Reads a byte stream, which was written by {@linkplain #writeTo(OutputStream)}, into
434   * a {@code BloomFilter<T>}.
435   *
436   * The {@code Funnel} to be used is not encoded in the stream, so it must be provided here.
437   * <b>Warning:</b> the funnel provided <b>must</b> behave identically to the one used to
438   * populate the original Bloom filter!
439   *
440   * @throws IOException if the InputStream throws an {@code IOException}, or if its data does
441   *     not appear to be a BloomFilter serialized using the
442   *     {@linkplain #writeTo(OutputStream)} method.
443   */
444  public static <T> BloomFilter<T> readFrom(InputStream in, Funnel<T> funnel) throws IOException {
445    checkNotNull(in, "InputStream");
446    checkNotNull(funnel, "Funnel");
447    int strategyOrdinal = -1;
448    int numHashFunctions = -1;
449    int dataLength = -1;
450    try {
451      DataInputStream din = new DataInputStream(in);
452      // currently this assumes there is no negative ordinal; will have to be updated if we
453      // add non-stateless strategies (for which we've reserved negative ordinals; see
454      // Strategy.ordinal()).
455      strategyOrdinal = din.readByte();
456      numHashFunctions = UnsignedBytes.toInt(din.readByte());
457      dataLength = din.readInt();
458
459      Strategy strategy = BloomFilterStrategies.values()[strategyOrdinal];
460      long[] data = new long[dataLength];
461      for (int i = 0; i < data.length; i++) {
462        data[i] = din.readLong();
463      }
464      return new BloomFilter<T>(new BitArray(data), numHashFunctions, funnel, strategy);
465    } catch (RuntimeException e) {
466      IOException ioException = new IOException(
467          "Unable to deserialize BloomFilter from InputStream."
468          + " strategyOrdinal: " + strategyOrdinal
469          + " numHashFunctions: " + numHashFunctions
470          + " dataLength: " + dataLength);
471      ioException.initCause(e);
472      throw ioException;
473    }
474  }
475}