001/*
002 * Copyright (C) 2008 The Guava Authors
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
005 * in compliance with the License. You may obtain a copy of the License at
006 *
007 * http://www.apache.org/licenses/LICENSE-2.0
008 *
009 * Unless required by applicable law or agreed to in writing, software distributed under the License
010 * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
011 * or implied. See the License for the specific language governing permissions and limitations under
012 * the License.
013 */
014
015package com.google.common.base;
016
017import static com.google.common.base.Preconditions.checkArgument;
018import static com.google.common.base.Preconditions.checkNotNull;
019import static com.google.common.base.Preconditions.checkPositionIndex;
020
021import com.google.common.annotations.GwtCompatible;
022import com.google.common.annotations.GwtIncompatible;
023import com.google.common.annotations.VisibleForTesting;
024import java.util.Arrays;
025import java.util.BitSet;
026
027/**
028 * Determines a true or false value for any Java {@code char} value, just as {@link Predicate} does
029 * for any {@link Object}. Also offers basic text processing methods based on this function.
030 * Implementations are strongly encouraged to be side-effect-free and immutable.
031 *
032 * <p>Throughout the documentation of this class, the phrase "matching character" is used to mean
033 * "any {@code char} value {@code c} for which {@code this.matches(c)} returns {@code true}".
034 *
 * <p><b>Warning:</b> This class deals only with {@code char} values, that is,
 * <a href="http://www.unicode.org/glossary/#BMP_character">BMP characters</a>.
 * It does not understand
 * <a href="http://www.unicode.org/glossary/#supplementary_code_point">supplementary Unicode
 * code points</a> in the range {@code 0x10000} to {@code 0x10FFFF},
 * a range that includes the majority of assigned characters, among them important CJK characters
 * and emoji.
041 *
042 * <p>Supplementary characters are
043 * <a href="https://docs.oracle.com/javase/8/docs/api/java/lang/Character.html#supplementary">
044 * encoded into a {@code String} using surrogate pairs</a>,
045 * and a {@code CharMatcher} treats these just as two separate characters.
046 * {@link #countIn} counts each supplementary character as 2 {@code char}s.
047 *
048 * <p>For up-to-date Unicode character properties (digit, letter, etc.) and support for
049 * supplementary code points, use ICU4J UCharacter and UnicodeSet (freeze() after building).
050 * For basic text processing based on UnicodeSet use the ICU4J UnicodeSetSpanner.
051 *
052 * <p>Example usages:
053 *
054 * <pre>
055 *   String trimmed = {@link #whitespace() whitespace()}.{@link #trimFrom trimFrom}(userInput);
056 *   if ({@link #ascii() ascii()}.{@link #matchesAllOf matchesAllOf}(s)) { ... }</pre>
057 *
058 * <p>See the Guava User Guide article on <a
059 * href="https://github.com/google/guava/wiki/StringsExplained#charmatcher">{@code CharMatcher}
060 * </a>.
061 *
062 * @author Kevin Bourrillion
063 * @since 1.0
064 */
065@GwtCompatible(emulated = true)
066public abstract class CharMatcher implements Predicate<Character> {
067  /*
068   *           N777777777NO
069   *         N7777777777777N
070   *        M777777777777777N
071   *        $N877777777D77777M
072   *       N M77777777ONND777M
073   *       MN777777777NN  D777
074   *     N7ZN777777777NN ~M7778
075   *    N777777777777MMNN88777N
076   *    N777777777777MNZZZ7777O
077   *    DZN7777O77777777777777
078   *     N7OONND7777777D77777N
079   *      8$M++++?N???$77777$
080   *       M7++++N+M77777777N
081   *        N77O777777777777$                              M
082   *          DNNM$$$$777777N                              D
083   *         N$N:=N$777N7777M                             NZ
084   *        77Z::::N777777777                          ODZZZ
085   *       77N::::::N77777777M                         NNZZZ$
086   *     $777:::::::77777777MN                        ZM8ZZZZZ
087   *     777M::::::Z7777777Z77                        N++ZZZZNN
088   *    7777M:::::M7777777$777M                       $++IZZZZM
089   *   M777$:::::N777777$M7777M                       +++++ZZZDN
090   *     NN$::::::7777$$M777777N                      N+++ZZZZNZ
091   *       N::::::N:7$O:77777777                      N++++ZZZZN
092   *       M::::::::::::N77777777+                   +?+++++ZZZM
093   *       8::::::::::::D77777777M                    O+++++ZZ
094   *        ::::::::::::M777777777N                      O+?D
095   *        M:::::::::::M77777777778                     77=
096   *        D=::::::::::N7777777777N                    777
097   *       INN===::::::=77777777777N                  I777N
098   *      ?777N========N7777777777787M               N7777
099   *      77777$D======N77777777777N777N?         N777777
100   *     I77777$$$N7===M$$77777777$77777777$MMZ77777777N
101   *      $$$$$$$$$$$NIZN$$$$$$$$$M$$7777777777777777ON
102   *       M$$$$$$$$M    M$$$$$$$$N=N$$$$7777777$$$ND
103   *      O77Z$$$$$$$     M$$$$$$$$MNI==$DNNNNM=~N
104   *   7 :N MNN$$$$M$      $$$777$8      8D8I
105   *     NMM.:7O           777777778
106   *                       7777777MN
107   *                       M NO .7:
108   *                       M   :   M
109   *                            8
110   */
111
112  // Constant matcher factory methods
113
114  /**
115   * Matches any character.
116   *
117   * @since 19.0 (since 1.0 as constant {@code ANY})
118   */
119  public static CharMatcher any() {
120    return Any.INSTANCE;
121  }
122
123  /**
124   * Matches no characters.
125   *
126   * @since 19.0 (since 1.0 as constant {@code NONE})
127   */
128  public static CharMatcher none() {
129    return None.INSTANCE;
130  }
131
132  /**
133   * Determines whether a character is whitespace according to the latest Unicode standard, as
134   * illustrated
135   * <a href="http://unicode.org/cldr/utility/list-unicodeset.jsp?a=%5Cp%7Bwhitespace%7D">here</a>.
136   * This is not the same definition used by other Java APIs. (See a
137   * <a href="https://goo.gl/Y6SLWx">comparison of several definitions of
138   * "whitespace"</a>.)
139   *
140   * <p>All Unicode White_Space characters are on the BMP and thus supported by this API.
141   *
142   * <p><b>Note:</b> as the Unicode definition evolves, we will modify this matcher to keep it up to
143   * date.
144   *
145   * @since 19.0 (since 1.0 as constant {@code WHITESPACE})
146   */
147  public static CharMatcher whitespace() {
148    return Whitespace.INSTANCE;
149  }
150
151  /**
152   * Determines whether a character is a breaking whitespace (that is, a whitespace which can be
153   * interpreted as a break between words for formatting purposes). See {@link #whitespace()} for a
154   * discussion of that term.
155   *
156   * @since 19.0 (since 2.0 as constant {@code BREAKING_WHITESPACE})
157   */
158  public static CharMatcher breakingWhitespace() {
159    return BreakingWhitespace.INSTANCE;
160  }
161
162  /**
163   * Determines whether a character is ASCII, meaning that its code point is less than 128.
164   *
165   * @since 19.0 (since 1.0 as constant {@code ASCII})
166   */
167  public static CharMatcher ascii() {
168    return Ascii.INSTANCE;
169  }
170
171  /**
172   * Determines whether a character is a BMP digit according to
173   * <a href="http://unicode.org/cldr/utility/list-unicodeset.jsp?a=%5Cp%7Bdigit%7D">Unicode</a>. If
174   * you only care to match ASCII digits, you can use {@code inRange('0', '9')}.
175   *
176   * @deprecated Many digits are supplementary characters; see the class documentation.
177   * @since 19.0 (since 1.0 as constant {@code DIGIT})
178   */
179  @Deprecated
180  public static CharMatcher digit() {
181    return Digit.INSTANCE;
182  }
183
184  /**
185   * Determines whether a character is a BMP digit according to {@linkplain Character#isDigit(char)
186   * Java's definition}. If you only care to match ASCII digits, you can use {@code inRange('0',
187   * '9')}.
188   *
189   * @deprecated Many digits are supplementary characters; see the class documentation.
190   * @since 19.0 (since 1.0 as constant {@code JAVA_DIGIT})
191   */
192  @Deprecated
193  public static CharMatcher javaDigit() {
194    return JavaDigit.INSTANCE;
195  }
196
197  /**
198   * Determines whether a character is a BMP letter according to
199   * {@linkplain Character#isLetter(char) Java's definition}.
200   * If you only care to match letters of the Latin alphabet, you can use
201   * {@code inRange('a', 'z').or(inRange('A', 'Z'))}.
202   *
203   * @deprecated Most letters are supplementary characters; see the class documentation.
204   * @since 19.0 (since 1.0 as constant {@code JAVA_LETTER})
205   */
206  @Deprecated
207  public static CharMatcher javaLetter() {
208    return JavaLetter.INSTANCE;
209  }
210
211  /**
212   * Determines whether a character is a BMP letter or digit according to
213   * {@linkplain Character#isLetterOrDigit(char) Java's definition}.
214   *
215   * @deprecated Most letters and digits are supplementary characters; see the class documentation.
216   * @since 19.0 (since 1.0 as constant {@code JAVA_LETTER_OR_DIGIT}).
217   */
218  @Deprecated
219  public static CharMatcher javaLetterOrDigit() {
220    return JavaLetterOrDigit.INSTANCE;
221  }
222
223  /**
224   * Determines whether a BMP character is upper case according to
225   * {@linkplain Character#isUpperCase(char) Java's definition}.
226   *
227   * @deprecated Some uppercase characters are supplementary characters;
228   *     see the class documentation.
229   * @since 19.0 (since 1.0 as constant {@code JAVA_UPPER_CASE})
230   */
231  @Deprecated
232  public static CharMatcher javaUpperCase() {
233    return JavaUpperCase.INSTANCE;
234  }
235
236  /**
237   * Determines whether a BMP character is lower case according to
238   * {@linkplain Character#isLowerCase(char) Java's definition}.
239   *
240   * @deprecated Some lowercase characters are supplementary characters;
241   *     see the class documentation.
242   * @since 19.0 (since 1.0 as constant {@code JAVA_LOWER_CASE})
243   */
244  @Deprecated
245  public static CharMatcher javaLowerCase() {
246    return JavaLowerCase.INSTANCE;
247  }
248
249  /**
250   * Determines whether a character is an ISO control character as specified by
251   * {@link Character#isISOControl(char)}.
252   *
253   * <p>All ISO control codes are on the BMP and thus supported by this API.
254   *
255   * @since 19.0 (since 1.0 as constant {@code JAVA_ISO_CONTROL})
256   */
257  public static CharMatcher javaIsoControl() {
258    return JavaIsoControl.INSTANCE;
259  }
260
261  /**
262   * Determines whether a character is invisible; that is, if its Unicode category is any of
263   * SPACE_SEPARATOR, LINE_SEPARATOR, PARAGRAPH_SEPARATOR, CONTROL, FORMAT, SURROGATE, and
264   * PRIVATE_USE according to ICU4J.
265   *
266   * <p>See also the Unicode Default_Ignorable_Code_Point property (available via ICU).
267   *
268   * @deprecated Most invisible characters are supplementary characters;
269   *     see the class documentation.
270   * @since 19.0 (since 1.0 as constant {@code INVISIBLE})
271   */
272  @Deprecated
273  public static CharMatcher invisible() {
274    return Invisible.INSTANCE;
275  }
276
277  /**
278   * Determines whether a character is single-width (not double-width). When in doubt, this matcher
279   * errs on the side of returning {@code false} (that is, it tends to assume a character is
280   * double-width).
281   *
282   * <p><b>Note:</b> as the reference file evolves, we will modify this matcher to keep it up to
283   * date.
284   *
285   * <p>See also <a href="http://www.unicode.org/reports/tr11/">UAX #11 East Asian Width</a>.
286   *
287   * @deprecated Many such characters are supplementary characters; see the class documentation.
288   * @since 19.0 (since 1.0 as constant {@code SINGLE_WIDTH})
289   */
290  @Deprecated
291  public static CharMatcher singleWidth() {
292    return SingleWidth.INSTANCE;
293  }
294
295  // Legacy constants
296
  /**
   * Determines whether a character is whitespace according to the latest Unicode
   * standard, as illustrated
   * <a href="http://unicode.org/cldr/utility/list-unicodeset.jsp?a=%5Cp%7Bwhitespace%7D">here</a>.
   * This is not the same definition used by other Java APIs. (See a
   * <a href="https://goo.gl/Y6SLWx">comparison of several definitions of
   * "whitespace"</a>.)
   *
   * <p><b>Note:</b> as the Unicode definition evolves, we will modify this constant
   * to keep it up to date.
   *
   * @deprecated Use {@link #whitespace()} instead. This constant is scheduled to be
   *     removed in June 2018.
   */
  @com.google.common.annotations.Beta
  @Deprecated
  public static final CharMatcher WHITESPACE = whitespace();

  /**
   * Determines whether a character is a breaking whitespace (that is, a whitespace
   * which can be interpreted as a break between words for formatting purposes). See
   * {@link #whitespace} for a discussion of that term.
   *
   * @since 2.0
   * @deprecated Use {@link #breakingWhitespace()} instead. This constant is scheduled
   *     to be removed in June 2018.
   */
  @com.google.common.annotations.Beta
  @Deprecated
  public static final CharMatcher BREAKING_WHITESPACE = breakingWhitespace();

  /**
   * Determines whether a character is ASCII, meaning that its code point is less than
   * 128.
   *
   * @deprecated Use {@link #ascii()} instead. This constant is scheduled to be
   *     removed in June 2018.
   */
  @com.google.common.annotations.Beta
  @Deprecated
  public static final CharMatcher ASCII = ascii();

  /**
   * Determines whether a character is a digit according to
   * <a href="http://unicode.org/cldr/utility/list-unicodeset.jsp?a=%5Cp%7Bdigit%7D">
   * Unicode</a>. If you only care to match ASCII digits, you can use
   * {@code inRange('0', '9')}.
   *
   * @deprecated Many digits are supplementary characters; see the class
   *     documentation. If you need to use this, use {@link #digit()} instead. This
   *     constant is scheduled to be removed in June 2018.
   */
  @com.google.common.annotations.Beta
  @Deprecated
  public static final CharMatcher DIGIT = digit();

  /**
   * Determines whether a character is a digit according to
   * {@linkplain Character#isDigit(char) Java's definition}. If you only care to match
   * ASCII digits, you can use {@code inRange('0', '9')}.
   *
   * @deprecated Many digits are supplementary characters; see the class
   *     documentation. If you need to use this, use {@link #javaDigit()} instead.
   *     This constant is scheduled to be removed in June 2018.
   */
  @com.google.common.annotations.Beta
  @Deprecated
  public static final CharMatcher JAVA_DIGIT = javaDigit();

  /**
   * Determines whether a character is a letter according to
   * {@linkplain Character#isLetter(char) Java's definition}. If you only care to
   * match letters of the Latin alphabet, you can use
   * {@code inRange('a', 'z').or(inRange('A', 'Z'))}.
   *
   * @deprecated Most letters are supplementary characters; see the class
   *     documentation. If you need to use this, use {@link #javaLetter()} instead.
   *     This constant is scheduled to be removed in June 2018.
   */
  @com.google.common.annotations.Beta
  @Deprecated
  public static final CharMatcher JAVA_LETTER = javaLetter();

  /**
   * Determines whether a character is a letter or digit according to
   * {@linkplain Character#isLetterOrDigit(char) Java's definition}.
   *
   * @deprecated Most letters and digits are supplementary characters; see the class
   *     documentation. If you need to use this, use {@link #javaLetterOrDigit()}
   *     instead. This constant is scheduled to be removed in June 2018.
   */
  @com.google.common.annotations.Beta
  @Deprecated
  public static final CharMatcher JAVA_LETTER_OR_DIGIT = javaLetterOrDigit();

  /**
   * Determines whether a character is upper case according to
   * {@linkplain Character#isUpperCase(char) Java's definition}.
   *
   * @deprecated Some uppercase letters are supplementary characters; see the class
   *     documentation. If you need to use this, use {@link #javaUpperCase()} instead.
   *     This constant is scheduled to be removed in June 2018.
   */
  @com.google.common.annotations.Beta
  @Deprecated
  public static final CharMatcher JAVA_UPPER_CASE = javaUpperCase();

  /**
   * Determines whether a character is lower case according to
   * {@linkplain Character#isLowerCase(char) Java's definition}.
   *
   * @deprecated Some lowercase letters are supplementary characters; see the class
   *     documentation. If you need to use this, use {@link #javaLowerCase()} instead.
   *     This constant is scheduled to be removed in June 2018.
   */
  @com.google.common.annotations.Beta
  @Deprecated
  public static final CharMatcher JAVA_LOWER_CASE = javaLowerCase();

  /**
   * Determines whether a character is an ISO control character as specified by
   * {@link Character#isISOControl(char)}.
   *
   * @deprecated Use {@link #javaIsoControl()} instead. This constant is scheduled to
   *     be removed in June 2018.
   */
  @com.google.common.annotations.Beta
  @Deprecated
  public static final CharMatcher JAVA_ISO_CONTROL = javaIsoControl();

  /**
   * Determines whether a character is invisible; that is, if its Unicode category is
   * any of SPACE_SEPARATOR, LINE_SEPARATOR, PARAGRAPH_SEPARATOR, CONTROL, FORMAT,
   * SURROGATE, and PRIVATE_USE according to ICU4J.
   *
   * @deprecated Most invisible characters are supplementary characters; see the class
   *     documentation. If you need to use this, use {@link #invisible()} instead.
   *     This constant is scheduled to be removed in June 2018.
   */
  @com.google.common.annotations.Beta
  @Deprecated
  public static final CharMatcher INVISIBLE = invisible();

  /**
   * Determines whether a character is single-width (not double-width). When in doubt,
   * this matcher errs on the side of returning {@code false} (that is, it tends to
   * assume a character is double-width).
   *
   * <p><b>Note:</b> as the reference file evolves, we will modify this constant to
   * keep it up to date.
   *
   * @deprecated Many such characters are supplementary characters; see the class
   *     documentation. If you need to use this, use {@link #singleWidth()} instead.
   *     This constant is scheduled to be removed in June 2018.
   */
  @com.google.common.annotations.Beta
  @Deprecated
  public static final CharMatcher SINGLE_WIDTH = singleWidth();

  /**
   * Matches any character.
   *
   * @deprecated Use {@link #any()} instead. This constant is scheduled to be
   *     removed in June 2018.
   */
  @com.google.common.annotations.Beta
  @Deprecated
  public static final CharMatcher ANY = any();

  /**
   * Matches no characters.
   *
   * @deprecated Use {@link #none()} instead. This constant is scheduled to be
   *     removed in June 2018.
   */
  @com.google.common.annotations.Beta
  @Deprecated
  public static final CharMatcher NONE = none();
475
476  // Static factories
477
478  /**
479   * Returns a {@code char} matcher that matches only one specified BMP character.
480   */
481  public static CharMatcher is(final char match) {
482    return new Is(match);
483  }
484
485  /**
486   * Returns a {@code char} matcher that matches any character except the BMP character specified.
487   *
488   * <p>To negate another {@code CharMatcher}, use {@link #negate()}.
489   */
490  public static CharMatcher isNot(final char match) {
491    return new IsNot(match);
492  }
493
494  /**
495   * Returns a {@code char} matcher that matches any BMP character present in the given character
496   * sequence. Returns a bogus matcher if the sequence contains supplementary characters.
497   */
498  public static CharMatcher anyOf(final CharSequence sequence) {
499    switch (sequence.length()) {
500      case 0:
501        return none();
502      case 1:
503        return is(sequence.charAt(0));
504      case 2:
505        return isEither(sequence.charAt(0), sequence.charAt(1));
506      default:
507        // TODO(lowasser): is it potentially worth just going ahead and building a precomputed
508        // matcher?
509        return new AnyOf(sequence);
510    }
511  }
512
513  /**
514   * Returns a {@code char} matcher that matches any BMP character not present in the given
515   * character sequence. Returns a bogus matcher if the sequence contains supplementary characters.
516   */
517  public static CharMatcher noneOf(CharSequence sequence) {
518    return anyOf(sequence).negate();
519  }
520
521  /**
522   * Returns a {@code char} matcher that matches any character in a given BMP range (both endpoints
523   * are inclusive). For example, to match any lowercase letter of the English alphabet, use {@code
524   * CharMatcher.inRange('a', 'z')}.
525   *
526   * @throws IllegalArgumentException if {@code endInclusive < startInclusive}
527   */
528  public static CharMatcher inRange(final char startInclusive, final char endInclusive) {
529    return new InRange(startInclusive, endInclusive);
530  }
531
532  /**
533   * Returns a matcher with identical behavior to the given {@link Character}-based predicate, but
534   * which operates on primitive {@code char} instances instead.
535   */
536  public static CharMatcher forPredicate(final Predicate<? super Character> predicate) {
537    return predicate instanceof CharMatcher ? (CharMatcher) predicate : new ForPredicate(predicate);
538  }
539
540  // Constructors
541
542  /**
543   * Constructor for use by subclasses. When subclassing, you may want to override
544   * {@code toString()} to provide a useful description.
545   */
546  protected CharMatcher() {}
547
548  // Abstract methods
549
550  /** Determines a true or false value for the given character. */
551  public abstract boolean matches(char c);
552
553  // Non-static factories
554
555  /**
556   * Returns a matcher that matches any character not matched by this matcher.
557   */
558  // @Override under Java 8 but not under Java 7
559  public CharMatcher negate() {
560    return new Negated(this);
561  }
562
563  /**
564   * Returns a matcher that matches any character matched by both this matcher and {@code other}.
565   */
566  public CharMatcher and(CharMatcher other) {
567    return new And(this, other);
568  }
569
570  /**
571   * Returns a matcher that matches any character matched by either this matcher or {@code other}.
572   */
573  public CharMatcher or(CharMatcher other) {
574    return new Or(this, other);
575  }
576
577  /**
578   * Returns a {@code char} matcher functionally equivalent to this one, but which may be faster to
579   * query than the original; your mileage may vary. Precomputation takes time and is likely to be
580   * worthwhile only if the precomputed matcher is queried many thousands of times.
581   *
582   * <p>This method has no effect (returns {@code this}) when called in GWT: it's unclear whether a
583   * precomputed matcher is faster, but it certainly consumes more memory, which doesn't seem like a
584   * worthwhile tradeoff in a browser.
585   */
586  public CharMatcher precomputed() {
587    return Platform.precomputeCharMatcher(this);
588  }
589
590  private static final int DISTINCT_CHARS = Character.MAX_VALUE - Character.MIN_VALUE + 1;
591
592  /**
593   * This is the actual implementation of {@link #precomputed}, but we bounce calls through a method
594   * on {@link Platform} so that we can have different behavior in GWT.
595   *
596   * <p>This implementation tries to be smart in a number of ways. It recognizes cases where the
597   * negation is cheaper to precompute than the matcher itself; it tries to build small hash tables
598   * for matchers that only match a few characters, and so on. In the worst-case scenario, it
599   * constructs an eight-kilobyte bit array and queries that. In many situations this produces a
600   * matcher which is faster to query than the original.
601   */
602  @GwtIncompatible // SmallCharMatcher
603  CharMatcher precomputedInternal() {
604    final BitSet table = new BitSet();
605    setBits(table);
606    int totalCharacters = table.cardinality();
607    if (totalCharacters * 2 <= DISTINCT_CHARS) {
608      return precomputedPositive(totalCharacters, table, toString());
609    } else {
610      // TODO(lowasser): is it worth it to worry about the last character of large matchers?
611      table.flip(Character.MIN_VALUE, Character.MAX_VALUE + 1);
612      int negatedCharacters = DISTINCT_CHARS - totalCharacters;
613      String suffix = ".negate()";
614      final String description = toString();
615      String negatedDescription =
616          description.endsWith(suffix)
617              ? description.substring(0, description.length() - suffix.length())
618              : description + suffix;
619      return new NegatedFastMatcher(
620          precomputedPositive(negatedCharacters, table, negatedDescription)) {
621        @Override
622        public String toString() {
623          return description;
624        }
625      };
626    }
627  }
628
629  /**
630   * Helper method for {@link #precomputedInternal} that doesn't test if the negation is cheaper.
631   */
632  @GwtIncompatible // SmallCharMatcher
633  private static CharMatcher precomputedPositive(
634      int totalCharacters, BitSet table, String description) {
635    switch (totalCharacters) {
636      case 0:
637        return none();
638      case 1:
639        return is((char) table.nextSetBit(0));
640      case 2:
641        char c1 = (char) table.nextSetBit(0);
642        char c2 = (char) table.nextSetBit(c1 + 1);
643        return isEither(c1, c2);
644      default:
645        return isSmall(totalCharacters, table.length())
646            ? SmallCharMatcher.from(table, description)
647            : new BitSetMatcher(table, description);
648    }
649  }
650
651  @GwtIncompatible // SmallCharMatcher
652  private static boolean isSmall(int totalCharacters, int tableLength) {
653    return totalCharacters <= SmallCharMatcher.MAX_SIZE
654        && tableLength > (totalCharacters * 4 * Character.SIZE);
655    // err on the side of BitSetMatcher
656  }
657
658  /**
659   * Sets bits in {@code table} matched by this matcher.
660   */
661  @GwtIncompatible // used only from other GwtIncompatible code
662  void setBits(BitSet table) {
663    for (int c = Character.MAX_VALUE; c >= Character.MIN_VALUE; c--) {
664      if (matches((char) c)) {
665        table.set(c);
666      }
667    }
668  }
669
670  // Text processing routines
671
672  /**
673   * Returns {@code true} if a character sequence contains at least one matching BMP character.
674   * Equivalent to {@code !matchesNoneOf(sequence)}.
675   *
676   * <p>The default implementation iterates over the sequence, invoking {@link #matches} for each
677   * character, until this returns {@code true} or the end is reached.
678   *
679   * @param sequence the character sequence to examine, possibly empty
680   * @return {@code true} if this matcher matches at least one character in the sequence
681   * @since 8.0
682   */
683  public boolean matchesAnyOf(CharSequence sequence) {
684    return !matchesNoneOf(sequence);
685  }
686
687  /**
688   * Returns {@code true} if a character sequence contains only matching BMP characters.
689   *
690   * <p>The default implementation iterates over the sequence, invoking {@link #matches} for each
691   * character, until this returns {@code false} or the end is reached.
692   *
693   * @param sequence the character sequence to examine, possibly empty
694   * @return {@code true} if this matcher matches every character in the sequence, including when
695   *     the sequence is empty
696   */
697  public boolean matchesAllOf(CharSequence sequence) {
698    for (int i = sequence.length() - 1; i >= 0; i--) {
699      if (!matches(sequence.charAt(i))) {
700        return false;
701      }
702    }
703    return true;
704  }
705
706  /**
707   * Returns {@code true} if a character sequence contains no matching BMP characters. Equivalent to
708   * {@code !matchesAnyOf(sequence)}.
709   *
710   * <p>The default implementation iterates over the sequence, invoking {@link #matches} for each
711   * character, until this returns {@code true} or the end is reached.
712   *
713   * @param sequence the character sequence to examine, possibly empty
714   * @return {@code true} if this matcher matches no characters in the sequence, including when
715   *     the sequence is empty
716   */
717  public boolean matchesNoneOf(CharSequence sequence) {
718    return indexIn(sequence) == -1;
719  }
720
721  /**
722   * Returns the index of the first matching BMP character in a character sequence,
723   * or {@code -1} if no matching character is present.
724   *
725   * <p>The default implementation iterates over the sequence in forward order calling
726   * {@link #matches} for each character.
727   *
728   * @param sequence the character sequence to examine from the beginning
729   * @return an index, or {@code -1} if no character matches
730   */
731  public int indexIn(CharSequence sequence) {
732    return indexIn(sequence, 0);
733  }
734
735  /**
736   * Returns the index of the first matching BMP character in a character sequence, starting from a
737   * given position, or {@code -1} if no character matches after that position.
738   *
739   * <p>The default implementation iterates over the sequence in forward order, beginning at {@code
740   * start}, calling {@link #matches} for each character.
741   *
742   * @param sequence the character sequence to examine
743   * @param start the first index to examine; must be nonnegative and no greater than {@code
744   *        sequence.length()}
745   * @return the index of the first matching character, guaranteed to be no less than {@code start},
746   *     or {@code -1} if no character matches
747   * @throws IndexOutOfBoundsException if start is negative or greater than {@code
748   *         sequence.length()}
749   */
750  public int indexIn(CharSequence sequence, int start) {
751    int length = sequence.length();
752    checkPositionIndex(start, length);
753    for (int i = start; i < length; i++) {
754      if (matches(sequence.charAt(i))) {
755        return i;
756      }
757    }
758    return -1;
759  }
760
761  /**
762   * Returns the index of the last matching BMP character in a character sequence,
763   * or {@code -1} if no matching character is present.
764   *
765   * <p>The default implementation iterates over the sequence in reverse order calling
766   * {@link #matches} for each character.
767   *
768   * @param sequence the character sequence to examine from the end
769   * @return an index, or {@code -1} if no character matches
770   */
771  public int lastIndexIn(CharSequence sequence) {
772    for (int i = sequence.length() - 1; i >= 0; i--) {
773      if (matches(sequence.charAt(i))) {
774        return i;
775      }
776    }
777    return -1;
778  }
779
780  /**
781   * Returns the number of matching {@code char}s found in a character sequence.
782   *
783   * <p>Counts 2 per supplementary character, such as for {@link #whitespace}().{@link #negate}().
784   */
785  public int countIn(CharSequence sequence) {
786    int count = 0;
787    for (int i = 0; i < sequence.length(); i++) {
788      if (matches(sequence.charAt(i))) {
789        count++;
790      }
791    }
792    return count;
793  }
794
795  /**
796   * Returns a string containing all non-matching characters of a character sequence, in order. For
797   * example: <pre>   {@code
798   *
799   *   CharMatcher.is('a').removeFrom("bazaar")}</pre>
800   *
801   * ... returns {@code "bzr"}.
802   */
  public String removeFrom(CharSequence sequence) {
    String string = sequence.toString();
    int pos = indexIn(string);
    if (pos == -1) {
      // No matching character at all: return the string form as is, without copying.
      return string;
    }

    // Compact the surviving characters in place inside a scratch copy of the string.
    char[] chars = string.toCharArray();
    // Number of matching characters dropped so far, i.e. how far each kept character is shifted
    // to the left. Starts at 1 to account for the first match located at pos above.
    int spread = 1;

    // This unusual loop comes from extensive benchmarking
    OUT:
    while (true) {
      // Step past the matching character that ended the previous run.
      pos++;
      while (true) {
        if (pos == chars.length) {
          break OUT;
        }
        if (matches(chars[pos])) {
          // Another character to drop: leave the copy loop so the gap can be widened.
          break;
        }
        // Keep this character by moving it left over the dropped ones.
        chars[pos - spread] = chars[pos];
        pos++;
      }
      spread++;
    }
    // Here pos == chars.length, so pos - spread is the number of characters that were kept.
    return new String(chars, 0, pos - spread);
  }
831
832  /**
833   * Returns a string containing all matching BMP characters of a character sequence, in order. For
834   * example: <pre>   {@code
835   *
836   *   CharMatcher.is('a').retainFrom("bazaar")}</pre>
837   *
838   * ... returns {@code "aaa"}.
839   */
840  public String retainFrom(CharSequence sequence) {
841    return negate().removeFrom(sequence);
842  }
843
844  /**
845   * Returns a string copy of the input character sequence, with each matching BMP character
846   * replaced by a given replacement character. For example: <pre>   {@code
847   *
848   *   CharMatcher.is('a').replaceFrom("radar", 'o')}</pre>
849   *
850   * ... returns {@code "rodor"}.
851   *
852   * <p>The default implementation uses {@link #indexIn(CharSequence)} to find the first matching
853   * character, then iterates the remainder of the sequence calling {@link #matches(char)} for each
854   * character.
855   *
856   * @param sequence the character sequence to replace matching characters in
857   * @param replacement the character to append to the result string in place of each matching
858   *     character in {@code sequence}
859   * @return the new string
860   */
861  public String replaceFrom(CharSequence sequence, char replacement) {
862    String string = sequence.toString();
863    int pos = indexIn(string);
864    if (pos == -1) {
865      return string;
866    }
867    char[] chars = string.toCharArray();
868    chars[pos] = replacement;
869    for (int i = pos + 1; i < chars.length; i++) {
870      if (matches(chars[i])) {
871        chars[i] = replacement;
872      }
873    }
874    return new String(chars);
875  }
876
877  /**
878   * Returns a string copy of the input character sequence, with each matching BMP character
879   * replaced by a given replacement sequence. For example: <pre>   {@code
880   *
881   *   CharMatcher.is('a').replaceFrom("yaha", "oo")}</pre>
882   *
883   * ... returns {@code "yoohoo"}.
884   *
885   * <p><b>Note:</b> If the replacement is a fixed string with only one character, you are better
886   * off calling {@link #replaceFrom(CharSequence, char)} directly.
887   *
888   * @param sequence the character sequence to replace matching characters in
889   * @param replacement the characters to append to the result string in place of each matching
890   *     character in {@code sequence}
891   * @return the new string
892   */
893  public String replaceFrom(CharSequence sequence, CharSequence replacement) {
894    int replacementLen = replacement.length();
895    if (replacementLen == 0) {
896      return removeFrom(sequence);
897    }
898    if (replacementLen == 1) {
899      return replaceFrom(sequence, replacement.charAt(0));
900    }
901
902    String string = sequence.toString();
903    int pos = indexIn(string);
904    if (pos == -1) {
905      return string;
906    }
907
908    int len = string.length();
909    StringBuilder buf = new StringBuilder((len * 3 / 2) + 16);
910
911    int oldpos = 0;
912    do {
913      buf.append(string, oldpos, pos);
914      buf.append(replacement);
915      oldpos = pos + 1;
916      pos = indexIn(string, oldpos);
917    } while (pos != -1);
918
919    buf.append(string, oldpos, len);
920    return buf.toString();
921  }
922
923  /**
924   * Returns a substring of the input character sequence that omits all matching BMP characters
925   * from the beginning and from the end of the string. For example: <pre>   {@code
926   *
927   *   CharMatcher.anyOf("ab").trimFrom("abacatbab")}</pre>
928   *
929   * ... returns {@code "cat"}.
930   *
931   * <p>Note that: <pre>   {@code
932   *
933   *   CharMatcher.inRange('\0', ' ').trimFrom(str)}</pre>
934   *
935   * ... is equivalent to {@link String#trim()}.
936   */
937  public String trimFrom(CharSequence sequence) {
938    int len = sequence.length();
939    int first;
940    int last;
941
942    for (first = 0; first < len; first++) {
943      if (!matches(sequence.charAt(first))) {
944        break;
945      }
946    }
947    for (last = len - 1; last > first; last--) {
948      if (!matches(sequence.charAt(last))) {
949        break;
950      }
951    }
952
953    return sequence.subSequence(first, last + 1).toString();
954  }
955
956  /**
957   * Returns a substring of the input character sequence that omits all matching BMP characters
958   * from the beginning of the string. For example: <pre> {@code
959   *
960   *   CharMatcher.anyOf("ab").trimLeadingFrom("abacatbab")}</pre>
961   *
962   * ... returns {@code "catbab"}.
963   */
964  public String trimLeadingFrom(CharSequence sequence) {
965    int len = sequence.length();
966    for (int first = 0; first < len; first++) {
967      if (!matches(sequence.charAt(first))) {
968        return sequence.subSequence(first, len).toString();
969      }
970    }
971    return "";
972  }
973
974  /**
975   * Returns a substring of the input character sequence that omits all matching BMP characters
976   * from the end of the string. For example: <pre> {@code
977   *
978   *   CharMatcher.anyOf("ab").trimTrailingFrom("abacatbab")}</pre>
979   *
980   * ... returns {@code "abacat"}.
981   */
982  public String trimTrailingFrom(CharSequence sequence) {
983    int len = sequence.length();
984    for (int last = len - 1; last >= 0; last--) {
985      if (!matches(sequence.charAt(last))) {
986        return sequence.subSequence(0, last + 1).toString();
987      }
988    }
989    return "";
990  }
991
992  /**
993   * Returns a string copy of the input character sequence, with each group of consecutive
994   * matching BMP characters replaced by a single replacement character. For example:
995   * <pre>   {@code
996   *
997   *   CharMatcher.anyOf("eko").collapseFrom("bookkeeper", '-')}</pre>
998   *
999   * ... returns {@code "b-p-r"}.
1000   *
1001   * <p>The default implementation uses {@link #indexIn(CharSequence)} to find the first matching
1002   * character, then iterates the remainder of the sequence calling {@link #matches(char)} for each
1003   * character.
1004   *
1005   * @param sequence the character sequence to replace matching groups of characters in
1006   * @param replacement the character to append to the result string in place of each group of
1007   *     matching characters in {@code sequence}
1008   * @return the new string
1009   */
1010  public String collapseFrom(CharSequence sequence, char replacement) {
1011    // This implementation avoids unnecessary allocation.
1012    int len = sequence.length();
1013    for (int i = 0; i < len; i++) {
1014      char c = sequence.charAt(i);
1015      if (matches(c)) {
1016        if (c == replacement && (i == len - 1 || !matches(sequence.charAt(i + 1)))) {
1017          // a no-op replacement
1018          i++;
1019        } else {
1020          StringBuilder builder = new StringBuilder(len).append(sequence, 0, i).append(replacement);
1021          return finishCollapseFrom(sequence, i + 1, len, replacement, builder, true);
1022        }
1023      }
1024    }
1025    // no replacement needed
1026    return sequence.toString();
1027  }
1028
1029  /**
1030   * Collapses groups of matching characters exactly as {@link #collapseFrom} does, except that
1031   * groups of matching BMP characters at the start or end of the sequence are removed without
1032   * replacement.
1033   */
1034  public String trimAndCollapseFrom(CharSequence sequence, char replacement) {
1035    // This implementation avoids unnecessary allocation.
1036    int len = sequence.length();
1037    int first = 0;
1038    int last = len - 1;
1039
1040    while (first < len && matches(sequence.charAt(first))) {
1041      first++;
1042    }
1043
1044    while (last > first && matches(sequence.charAt(last))) {
1045      last--;
1046    }
1047
1048    return (first == 0 && last == len - 1)
1049        ? collapseFrom(sequence, replacement)
1050        : finishCollapseFrom(
1051            sequence, first, last + 1, replacement, new StringBuilder(last + 1 - first), false);
1052  }
1053
1054  private String finishCollapseFrom(
1055      CharSequence sequence,
1056      int start,
1057      int end,
1058      char replacement,
1059      StringBuilder builder,
1060      boolean inMatchingGroup) {
1061    for (int i = start; i < end; i++) {
1062      char c = sequence.charAt(i);
1063      if (matches(c)) {
1064        if (!inMatchingGroup) {
1065          builder.append(replacement);
1066          inMatchingGroup = true;
1067        }
1068      } else {
1069        builder.append(c);
1070        inMatchingGroup = false;
1071      }
1072    }
1073    return builder.toString();
1074  }
1075
1076  /**
1077   * @deprecated Provided only to satisfy the {@link Predicate} interface; use {@link #matches}
1078   *     instead.
1079   */
1080  @Deprecated
1081  @Override
1082  public boolean apply(Character character) {
1083    return matches(character);
1084  }
1085
1086  /**
1087   * Returns a string representation of this {@code CharMatcher}, such as
1088   * {@code CharMatcher.or(WHITESPACE, JAVA_DIGIT)}.
1089   */
1090  @Override
1091  public String toString() {
1092    return super.toString();
1093  }
1094
1095  /**
1096   * Returns the Java Unicode escape sequence for the given {@code char}, in the form "\u12AB"
1097   * where "12AB" is the four hexadecimal digits representing the 16-bit code unit.
1098   */
1099  private static String showCharacter(char c) {
1100    String hex = "0123456789ABCDEF";
1101    char[] tmp = {'\\', 'u', '\0', '\0', '\0', '\0'};
1102    for (int i = 0; i < 4; i++) {
1103      tmp[5 - i] = hex.charAt(c & 0xF);
1104      c = (char) (c >> 4);
1105    }
1106    return String.copyValueOf(tmp);
1107  }
1108
1109  // Fast matchers
1110
1111  /** A matcher for which precomputation will not yield any significant benefit. */
1112  abstract static class FastMatcher extends CharMatcher {
1113
1114    @Override
1115    public final CharMatcher precomputed() {
1116      return this;
1117    }
1118
1119    @Override
1120    public CharMatcher negate() {
1121      return new NegatedFastMatcher(this);
1122    }
1123  }
1124
1125  /** {@link FastMatcher} which overrides {@code toString()} with a custom name. */
1126  abstract static class NamedFastMatcher extends FastMatcher {
1127
1128    private final String description;
1129
1130    NamedFastMatcher(String description) {
1131      this.description = checkNotNull(description);
1132    }
1133
1134    @Override
1135    public final String toString() {
1136      return description;
1137    }
1138  }
1139
1140  /** Negation of a {@link FastMatcher}. */
1141  static class NegatedFastMatcher extends Negated {
1142
1143    NegatedFastMatcher(CharMatcher original) {
1144      super(original);
1145    }
1146
1147    @Override
1148    public final CharMatcher precomputed() {
1149      return this;
1150    }
1151  }
1152
1153  /** Fast matcher using a {@link BitSet} table of matching characters. */
1154  @GwtIncompatible // used only from other GwtIncompatible code
1155  private static final class BitSetMatcher extends NamedFastMatcher {
1156
1157    private final BitSet table;
1158
1159    private BitSetMatcher(BitSet table, String description) {
1160      super(description);
1161      if (table.length() + Long.SIZE < table.size()) {
1162        table = (BitSet) table.clone();
1163        // If only we could actually call BitSet.trimToSize() ourselves...
1164      }
1165      this.table = table;
1166    }
1167
1168    @Override
1169    public boolean matches(char c) {
1170      return table.get(c);
1171    }
1172
1173    @Override
1174    void setBits(BitSet bitSet) {
1175      bitSet.or(table);
1176    }
1177  }
1178
1179  // Static constant implementation classes
1180
1181  /** Implementation of {@link #any()}. */
1182  private static final class Any extends NamedFastMatcher {
1183
1184    static final Any INSTANCE = new Any();
1185
1186    private Any() {
1187      super("CharMatcher.any()");
1188    }
1189
1190    @Override
1191    public boolean matches(char c) {
1192      return true;
1193    }
1194
1195    @Override
1196    public int indexIn(CharSequence sequence) {
1197      return (sequence.length() == 0) ? -1 : 0;
1198    }
1199
1200    @Override
1201    public int indexIn(CharSequence sequence, int start) {
1202      int length = sequence.length();
1203      checkPositionIndex(start, length);
1204      return (start == length) ? -1 : start;
1205    }
1206
1207    @Override
1208    public int lastIndexIn(CharSequence sequence) {
1209      return sequence.length() - 1;
1210    }
1211
1212    @Override
1213    public boolean matchesAllOf(CharSequence sequence) {
1214      checkNotNull(sequence);
1215      return true;
1216    }
1217
1218    @Override
1219    public boolean matchesNoneOf(CharSequence sequence) {
1220      return sequence.length() == 0;
1221    }
1222
1223    @Override
1224    public String removeFrom(CharSequence sequence) {
1225      checkNotNull(sequence);
1226      return "";
1227    }
1228
1229    @Override
1230    public String replaceFrom(CharSequence sequence, char replacement) {
1231      char[] array = new char[sequence.length()];
1232      Arrays.fill(array, replacement);
1233      return new String(array);
1234    }
1235
1236    @Override
1237    public String replaceFrom(CharSequence sequence, CharSequence replacement) {
1238      StringBuilder result = new StringBuilder(sequence.length() * replacement.length());
1239      for (int i = 0; i < sequence.length(); i++) {
1240        result.append(replacement);
1241      }
1242      return result.toString();
1243    }
1244
1245    @Override
1246    public String collapseFrom(CharSequence sequence, char replacement) {
1247      return (sequence.length() == 0) ? "" : String.valueOf(replacement);
1248    }
1249
1250    @Override
1251    public String trimFrom(CharSequence sequence) {
1252      checkNotNull(sequence);
1253      return "";
1254    }
1255
1256    @Override
1257    public int countIn(CharSequence sequence) {
1258      return sequence.length();
1259    }
1260
1261    @Override
1262    public CharMatcher and(CharMatcher other) {
1263      return checkNotNull(other);
1264    }
1265
1266    @Override
1267    public CharMatcher or(CharMatcher other) {
1268      checkNotNull(other);
1269      return this;
1270    }
1271
1272    @Override
1273    public CharMatcher negate() {
1274      return none();
1275    }
1276  }
1277
1278  /** Implementation of {@link #none()}. */
1279  private static final class None extends NamedFastMatcher {
1280
1281    static final None INSTANCE = new None();
1282
1283    private None() {
1284      super("CharMatcher.none()");
1285    }
1286
1287    @Override
1288    public boolean matches(char c) {
1289      return false;
1290    }
1291
1292    @Override
1293    public int indexIn(CharSequence sequence) {
1294      checkNotNull(sequence);
1295      return -1;
1296    }
1297
1298    @Override
1299    public int indexIn(CharSequence sequence, int start) {
1300      int length = sequence.length();
1301      checkPositionIndex(start, length);
1302      return -1;
1303    }
1304
1305    @Override
1306    public int lastIndexIn(CharSequence sequence) {
1307      checkNotNull(sequence);
1308      return -1;
1309    }
1310
1311    @Override
1312    public boolean matchesAllOf(CharSequence sequence) {
1313      return sequence.length() == 0;
1314    }
1315
1316    @Override
1317    public boolean matchesNoneOf(CharSequence sequence) {
1318      checkNotNull(sequence);
1319      return true;
1320    }
1321
1322    @Override
1323    public String removeFrom(CharSequence sequence) {
1324      return sequence.toString();
1325    }
1326
1327    @Override
1328    public String replaceFrom(CharSequence sequence, char replacement) {
1329      return sequence.toString();
1330    }
1331
1332    @Override
1333    public String replaceFrom(CharSequence sequence, CharSequence replacement) {
1334      checkNotNull(replacement);
1335      return sequence.toString();
1336    }
1337
1338    @Override
1339    public String collapseFrom(CharSequence sequence, char replacement) {
1340      return sequence.toString();
1341    }
1342
1343    @Override
1344    public String trimFrom(CharSequence sequence) {
1345      return sequence.toString();
1346    }
1347
1348    @Override
1349    public String trimLeadingFrom(CharSequence sequence) {
1350      return sequence.toString();
1351    }
1352
1353    @Override
1354    public String trimTrailingFrom(CharSequence sequence) {
1355      return sequence.toString();
1356    }
1357
1358    @Override
1359    public int countIn(CharSequence sequence) {
1360      checkNotNull(sequence);
1361      return 0;
1362    }
1363
1364    @Override
1365    public CharMatcher and(CharMatcher other) {
1366      checkNotNull(other);
1367      return this;
1368    }
1369
1370    @Override
1371    public CharMatcher or(CharMatcher other) {
1372      return checkNotNull(other);
1373    }
1374
1375    @Override
1376    public CharMatcher negate() {
1377      return any();
1378    }
1379  }
1380
1381  /** Implementation of {@link #whitespace()}. */
1382  @VisibleForTesting
1383  static final class Whitespace extends NamedFastMatcher {
1384
1385    static final String TABLE =
1386        "\u2002\u3000\r\u0085\u200A\u2005\u2000\u3000"
1387            + "\u2029\u000B\u3000\u2008\u2003\u205F\u3000\u1680"
1388            + "\u0009\u0020\u2006\u2001\u202F\u00A0\u000C\u2009"
1389            + "\u3000\u2004\u3000\u3000\u2028\n\u2007\u3000";
1390    static final int MULTIPLIER = 1682554634;
1391    static final int SHIFT = Integer.numberOfLeadingZeros(TABLE.length() - 1);
1392
1393    static final Whitespace INSTANCE = new Whitespace();
1394
1395    Whitespace() {
1396      super("CharMatcher.whitespace()");
1397    }
1398
1399    @Override
1400    public boolean matches(char c) {
1401      return TABLE.charAt((MULTIPLIER * c) >>> SHIFT) == c;
1402    }
1403
1404    @GwtIncompatible // used only from other GwtIncompatible code
1405    @Override
1406    void setBits(BitSet table) {
1407      for (int i = 0; i < TABLE.length(); i++) {
1408        table.set(TABLE.charAt(i));
1409      }
1410    }
1411  }
1412
1413  /** Implementation of {@link #breakingWhitespace()}. */
1414  private static final class BreakingWhitespace extends CharMatcher {
1415
1416    static final CharMatcher INSTANCE = new BreakingWhitespace();
1417
1418    @Override
1419    public boolean matches(char c) {
1420      switch (c) {
1421        case '\t':
1422        case '\n':
1423        case '\013':
1424        case '\f':
1425        case '\r':
1426        case ' ':
1427        case '\u0085':
1428        case '\u1680':
1429        case '\u2028':
1430        case '\u2029':
1431        case '\u205f':
1432        case '\u3000':
1433          return true;
1434        case '\u2007':
1435          return false;
1436        default:
1437          return c >= '\u2000' && c <= '\u200a';
1438      }
1439    }
1440
1441    @Override
1442    public String toString() {
1443      return "CharMatcher.breakingWhitespace()";
1444    }
1445  }
1446
1447  /** Implementation of {@link #ascii()}. */
1448  private static final class Ascii extends NamedFastMatcher {
1449
1450    static final Ascii INSTANCE = new Ascii();
1451
1452    Ascii() {
1453      super("CharMatcher.ascii()");
1454    }
1455
1456    @Override
1457    public boolean matches(char c) {
1458      return c <= '\u007f';
1459    }
1460  }
1461
1462  /** Implementation that matches characters that fall within multiple ranges. */
1463  private static class RangesMatcher extends CharMatcher {
1464
1465    private final String description;
1466    private final char[] rangeStarts;
1467    private final char[] rangeEnds;
1468
1469    RangesMatcher(String description, char[] rangeStarts, char[] rangeEnds) {
1470      this.description = description;
1471      this.rangeStarts = rangeStarts;
1472      this.rangeEnds = rangeEnds;
1473      checkArgument(rangeStarts.length == rangeEnds.length);
1474      for (int i = 0; i < rangeStarts.length; i++) {
1475        checkArgument(rangeStarts[i] <= rangeEnds[i]);
1476        if (i + 1 < rangeStarts.length) {
1477          checkArgument(rangeEnds[i] < rangeStarts[i + 1]);
1478        }
1479      }
1480    }
1481
1482    @Override
1483    public boolean matches(char c) {
1484      int index = Arrays.binarySearch(rangeStarts, c);
1485      if (index >= 0) {
1486        return true;
1487      } else {
1488        index = ~index - 1;
1489        return index >= 0 && c <= rangeEnds[index];
1490      }
1491    }
1492
1493    @Override
1494    public String toString() {
1495      return description;
1496    }
1497  }
1498
1499  /** Implementation of {@link #digit()}. */
1500  private static final class Digit extends RangesMatcher {
1501    // Plug the following UnicodeSet pattern into
1502    // https://unicode.org/cldr/utility/list-unicodeset.jsp
1503    // [[:Nd:]&[:nv=0:]&[\u0000-\uFFFF]]
1504    // and get the zeroes from there.
1505
1506    // Must be in ascending order.
1507    private static final String ZEROES =
1508        "0\u0660\u06f0\u07c0\u0966\u09e6\u0a66\u0ae6\u0b66\u0be6\u0c66\u0ce6\u0d66\u0de6"
1509            + "\u0e50\u0ed0\u0f20\u1040\u1090\u17e0\u1810\u1946\u19d0\u1a80\u1a90\u1b50\u1bb0"
1510            + "\u1c40\u1c50\ua620\ua8d0\ua900\ua9d0\ua9f0\uaa50\uabf0\uff10";
1511
1512    private static char[] zeroes() {
1513      return ZEROES.toCharArray();
1514    }
1515
1516    private static char[] nines() {
1517      char[] nines = new char[ZEROES.length()];
1518      for (int i = 0; i < ZEROES.length(); i++) {
1519        nines[i] = (char) (ZEROES.charAt(i) + 9);
1520      }
1521      return nines;
1522    }
1523
1524    static final Digit INSTANCE = new Digit();
1525
1526    private Digit() {
1527      super("CharMatcher.digit()", zeroes(), nines());
1528    }
1529  }
1530
1531  /** Implementation of {@link #javaDigit()}. */
1532  private static final class JavaDigit extends CharMatcher {
1533
1534    static final JavaDigit INSTANCE = new JavaDigit();
1535
1536    @Override
1537    public boolean matches(char c) {
1538      return Character.isDigit(c);
1539    }
1540
1541    @Override
1542    public String toString() {
1543      return "CharMatcher.javaDigit()";
1544    }
1545  }
1546
1547  /** Implementation of {@link #javaLetter()}. */
1548  private static final class JavaLetter extends CharMatcher {
1549
1550    static final JavaLetter INSTANCE = new JavaLetter();
1551
1552    @Override
1553    public boolean matches(char c) {
1554      return Character.isLetter(c);
1555    }
1556
1557    @Override
1558    public String toString() {
1559      return "CharMatcher.javaLetter()";
1560    }
1561  }
1562
1563  /** Implementation of {@link #javaLetterOrDigit()}. */
1564  private static final class JavaLetterOrDigit extends CharMatcher {
1565
1566    static final JavaLetterOrDigit INSTANCE = new JavaLetterOrDigit();
1567
1568    @Override
1569    public boolean matches(char c) {
1570      return Character.isLetterOrDigit(c);
1571    }
1572
1573    @Override
1574    public String toString() {
1575      return "CharMatcher.javaLetterOrDigit()";
1576    }
1577  }
1578
1579  /** Implementation of {@link #javaUpperCase()}. */
1580  private static final class JavaUpperCase extends CharMatcher {
1581
1582    static final JavaUpperCase INSTANCE = new JavaUpperCase();
1583
1584    @Override
1585    public boolean matches(char c) {
1586      return Character.isUpperCase(c);
1587    }
1588
1589    @Override
1590    public String toString() {
1591      return "CharMatcher.javaUpperCase()";
1592    }
1593  }
1594
1595  /** Implementation of {@link #javaLowerCase()}. */
1596  private static final class JavaLowerCase extends CharMatcher {
1597
1598    static final JavaLowerCase INSTANCE = new JavaLowerCase();
1599
1600    @Override
1601    public boolean matches(char c) {
1602      return Character.isLowerCase(c);
1603    }
1604
1605    @Override
1606    public String toString() {
1607      return "CharMatcher.javaLowerCase()";
1608    }
1609  }
1610
1611  /** Implementation of {@link #javaIsoControl()}. */
1612  private static final class JavaIsoControl extends NamedFastMatcher {
1613
1614    static final JavaIsoControl INSTANCE = new JavaIsoControl();
1615
1616    private JavaIsoControl() {
1617      super("CharMatcher.javaIsoControl()");
1618    }
1619
1620    @Override
1621    public boolean matches(char c) {
1622      return c <= '\u001f' || (c >= '\u007f' && c <= '\u009f');
1623    }
1624  }
1625
1626  /** Implementation of {@link #invisible()}. */
1627  private static final class Invisible extends RangesMatcher {
1628    // Plug the following UnicodeSet pattern into
1629    // https://unicode.org/cldr/utility/list-unicodeset.jsp
1630    // [[[:Zs:][:Zl:][:Zp:][:Cc:][:Cf:][:Cs:][:Co:]]&[\u0000-\uFFFF]]
1631    // with the "Abbreviate" option, and get the ranges from there.
1632    private static final String RANGE_STARTS =
1633        "\u0000\u007f\u00ad\u0600\u061c\u06dd\u070f\u08e2\u1680\u180e\u2000\u2028\u205f\u2066"
1634            + "\u3000\ud800\ufeff\ufff9";
1635    private static final String RANGE_ENDS =  // inclusive ends
1636        "\u0020\u00a0\u00ad\u0605\u061c\u06dd\u070f\u08e2\u1680\u180e\u200f\u202f\u2064\u206f"
1637            + "\u3000\uf8ff\ufeff\ufffb";
1638
1639    static final Invisible INSTANCE = new Invisible();
1640
1641    private Invisible() {
1642      super("CharMatcher.invisible()", RANGE_STARTS.toCharArray(), RANGE_ENDS.toCharArray());
1643    }
1644  }
1645
1646  /** Implementation of {@link #singleWidth()}. */
1647  private static final class SingleWidth extends RangesMatcher {
1648
1649    static final SingleWidth INSTANCE = new SingleWidth();
1650
1651    private SingleWidth() {
1652      super(
1653          "CharMatcher.singleWidth()",
1654          "\u0000\u05be\u05d0\u05f3\u0600\u0750\u0e00\u1e00\u2100\ufb50\ufe70\uff61".toCharArray(),
1655          "\u04f9\u05be\u05ea\u05f4\u06ff\u077f\u0e7f\u20af\u213a\ufdff\ufeff\uffdc".toCharArray());
1656    }
1657  }
1658
1659  // Non-static factory implementation classes
1660
1661  /** Implementation of {@link #negate()}. */
1662  private static class Negated extends CharMatcher {
1663
1664    final CharMatcher original;
1665
1666    Negated(CharMatcher original) {
1667      this.original = checkNotNull(original);
1668    }
1669
1670    @Override
1671    public boolean matches(char c) {
1672      return !original.matches(c);
1673    }
1674
1675    @Override
1676    public boolean matchesAllOf(CharSequence sequence) {
1677      return original.matchesNoneOf(sequence);
1678    }
1679
1680    @Override
1681    public boolean matchesNoneOf(CharSequence sequence) {
1682      return original.matchesAllOf(sequence);
1683    }
1684
1685    @Override
1686    public int countIn(CharSequence sequence) {
1687      return sequence.length() - original.countIn(sequence);
1688    }
1689
1690    @GwtIncompatible // used only from other GwtIncompatible code
1691    @Override
1692    void setBits(BitSet table) {
1693      BitSet tmp = new BitSet();
1694      original.setBits(tmp);
1695      tmp.flip(Character.MIN_VALUE, Character.MAX_VALUE + 1);
1696      table.or(tmp);
1697    }
1698
1699    @Override
1700    public CharMatcher negate() {
1701      return original;
1702    }
1703
1704    @Override
1705    public String toString() {
1706      return original + ".negate()";
1707    }
1708  }
1709
1710  /** Implementation of {@link #and(CharMatcher)}. */
1711  private static final class And extends CharMatcher {
1712
1713    final CharMatcher first;
1714    final CharMatcher second;
1715
1716    And(CharMatcher a, CharMatcher b) {
1717      first = checkNotNull(a);
1718      second = checkNotNull(b);
1719    }
1720
1721    @Override
1722    public boolean matches(char c) {
1723      return first.matches(c) && second.matches(c);
1724    }
1725
1726    @GwtIncompatible // used only from other GwtIncompatible code
1727    @Override
1728    void setBits(BitSet table) {
1729      BitSet tmp1 = new BitSet();
1730      first.setBits(tmp1);
1731      BitSet tmp2 = new BitSet();
1732      second.setBits(tmp2);
1733      tmp1.and(tmp2);
1734      table.or(tmp1);
1735    }
1736
1737    @Override
1738    public String toString() {
1739      return "CharMatcher.and(" + first + ", " + second + ")";
1740    }
1741  }
1742
1743  /** Implementation of {@link #or(CharMatcher)}. */
1744  private static final class Or extends CharMatcher {
1745
1746    final CharMatcher first;
1747    final CharMatcher second;
1748
1749    Or(CharMatcher a, CharMatcher b) {
1750      first = checkNotNull(a);
1751      second = checkNotNull(b);
1752    }
1753
1754    @GwtIncompatible // used only from other GwtIncompatible code
1755    @Override
1756    void setBits(BitSet table) {
1757      first.setBits(table);
1758      second.setBits(table);
1759    }
1760
1761    @Override
1762    public boolean matches(char c) {
1763      return first.matches(c) || second.matches(c);
1764    }
1765
1766    @Override
1767    public String toString() {
1768      return "CharMatcher.or(" + first + ", " + second + ")";
1769    }
1770  }
1771
1772  // Static factory implementations
1773
1774  /** Implementation of {@link #is(char)}. */
1775  private static final class Is extends FastMatcher {
1776
1777    private final char match;
1778
1779    Is(char match) {
1780      this.match = match;
1781    }
1782
1783    @Override
1784    public boolean matches(char c) {
1785      return c == match;
1786    }
1787
1788    @Override
1789    public String replaceFrom(CharSequence sequence, char replacement) {
1790      return sequence.toString().replace(match, replacement);
1791    }
1792
1793    @Override
1794    public CharMatcher and(CharMatcher other) {
1795      return other.matches(match) ? this : none();
1796    }
1797
1798    @Override
1799    public CharMatcher or(CharMatcher other) {
1800      return other.matches(match) ? other : super.or(other);
1801    }
1802
1803    @Override
1804    public CharMatcher negate() {
1805      return isNot(match);
1806    }
1807
1808    @GwtIncompatible // used only from other GwtIncompatible code
1809    @Override
1810    void setBits(BitSet table) {
1811      table.set(match);
1812    }
1813
1814    @Override
1815    public String toString() {
1816      return "CharMatcher.is('" + showCharacter(match) + "')";
1817    }
1818  }
1819
1820  /** Implementation of {@link #isNot(char)}. */
1821  private static final class IsNot extends FastMatcher {
1822
1823    private final char match;
1824
1825    IsNot(char match) {
1826      this.match = match;
1827    }
1828
1829    @Override
1830    public boolean matches(char c) {
1831      return c != match;
1832    }
1833
1834    @Override
1835    public CharMatcher and(CharMatcher other) {
1836      return other.matches(match) ? super.and(other) : other;
1837    }
1838
1839    @Override
1840    public CharMatcher or(CharMatcher other) {
1841      return other.matches(match) ? any() : this;
1842    }
1843
1844    @GwtIncompatible // used only from other GwtIncompatible code
1845    @Override
1846    void setBits(BitSet table) {
1847      table.set(0, match);
1848      table.set(match + 1, Character.MAX_VALUE + 1);
1849    }
1850
1851    @Override
1852    public CharMatcher negate() {
1853      return is(match);
1854    }
1855
1856    @Override
1857    public String toString() {
1858      return "CharMatcher.isNot('" + showCharacter(match) + "')";
1859    }
1860  }
1861
1862  private static CharMatcher.IsEither isEither(char c1, char c2) {
1863    return new CharMatcher.IsEither(c1, c2);
1864  }
1865
1866  /** Implementation of {@link #anyOf(CharSequence)} for exactly two characters. */
1867  private static final class IsEither extends FastMatcher {
1868
1869    private final char match1;
1870    private final char match2;
1871
1872    IsEither(char match1, char match2) {
1873      this.match1 = match1;
1874      this.match2 = match2;
1875    }
1876
1877    @Override
1878    public boolean matches(char c) {
1879      return c == match1 || c == match2;
1880    }
1881
1882    @GwtIncompatible // used only from other GwtIncompatible code
1883    @Override
1884    void setBits(BitSet table) {
1885      table.set(match1);
1886      table.set(match2);
1887    }
1888
1889    @Override
1890    public String toString() {
1891      return "CharMatcher.anyOf(\"" + showCharacter(match1) + showCharacter(match2) + "\")";
1892    }
1893  }
1894
1895  /** Implementation of {@link #anyOf(CharSequence)} for three or more characters. */
1896  private static final class AnyOf extends CharMatcher {
1897
1898    private final char[] chars;
1899
1900    public AnyOf(CharSequence chars) {
1901      this.chars = chars.toString().toCharArray();
1902      Arrays.sort(this.chars);
1903    }
1904
1905    @Override
1906    public boolean matches(char c) {
1907      return Arrays.binarySearch(chars, c) >= 0;
1908    }
1909
1910    @Override
1911    @GwtIncompatible // used only from other GwtIncompatible code
1912    void setBits(BitSet table) {
1913      for (char c : chars) {
1914        table.set(c);
1915      }
1916    }
1917
1918    @Override
1919    public String toString() {
1920      StringBuilder description = new StringBuilder("CharMatcher.anyOf(\"");
1921      for (char c : chars) {
1922        description.append(showCharacter(c));
1923      }
1924      description.append("\")");
1925      return description.toString();
1926    }
1927  }
1928
1929  /** Implementation of {@link #inRange(char, char)}. */
1930  private static final class InRange extends FastMatcher {
1931
1932    private final char startInclusive;
1933    private final char endInclusive;
1934
1935    InRange(char startInclusive, char endInclusive) {
1936      checkArgument(endInclusive >= startInclusive);
1937      this.startInclusive = startInclusive;
1938      this.endInclusive = endInclusive;
1939    }
1940
1941    @Override
1942    public boolean matches(char c) {
1943      return startInclusive <= c && c <= endInclusive;
1944    }
1945
1946    @GwtIncompatible // used only from other GwtIncompatible code
1947    @Override
1948    void setBits(BitSet table) {
1949      table.set(startInclusive, endInclusive + 1);
1950    }
1951
1952    @Override
1953    public String toString() {
1954      return "CharMatcher.inRange('"
1955          + showCharacter(startInclusive)
1956          + "', '"
1957          + showCharacter(endInclusive)
1958          + "')";
1959    }
1960  }
1961
1962  /** Implementation of {@link #forPredicate(Predicate)}. */
1963  private static final class ForPredicate extends CharMatcher {
1964
1965    private final Predicate<? super Character> predicate;
1966
1967    ForPredicate(Predicate<? super Character> predicate) {
1968      this.predicate = checkNotNull(predicate);
1969    }
1970
1971    @Override
1972    public boolean matches(char c) {
1973      return predicate.apply(c);
1974    }
1975
1976    @SuppressWarnings("deprecation") // intentional; deprecation is for callers primarily
1977    @Override
1978    public boolean apply(Character character) {
1979      return predicate.apply(checkNotNull(character));
1980    }
1981
1982    @Override
1983    public String toString() {
1984      return "CharMatcher.forPredicate(" + predicate + ")";
1985    }
1986  }
1987}