001/*
002 * Copyright (C) 2008 The Guava Authors
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 *
008 * http://www.apache.org/licenses/LICENSE-2.0
009 *
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 */
016
017package com.google.common.base;
018
019import static com.google.common.base.Preconditions.checkArgument;
020import static com.google.common.base.Preconditions.checkNotNull;
021
022import com.google.common.annotations.Beta;
023import com.google.common.annotations.GwtCompatible;
024
025import java.util.Arrays;
026import javax.annotation.CheckReturnValue;
027
028/**
029 * Determines a true or false value for any Java {@code char} value, just as {@link Predicate} does
030 * for any {@link Object}. Also offers basic text processing methods based on this function.
031 * Implementations are strongly encouraged to be side-effect-free and immutable.
032 *
033 * <p>Throughout the documentation of this class, the phrase "matching character" is used to mean
034 * "any character {@code c} for which {@code this.matches(c)} returns {@code true}".
035 *
036 * <p><b>Note:</b> This class deals only with {@code char} values; it does not understand
037 * supplementary Unicode code points in the range {@code 0x10000} to {@code 0x10FFFF}. Such logical
038 * characters are encoded into a {@code String} using surrogate pairs, and a {@code CharMatcher}
039 * treats these just as two separate characters.
040 *
041 * <p>Example usages: <pre>
042 *   String trimmed = {@link #WHITESPACE WHITESPACE}.{@link #trimFrom trimFrom}(userInput);
043 *   if ({@link #ASCII ASCII}.{@link #matchesAllOf matchesAllOf}(s)) { ... }</pre>
044 *
045 * <p>See the Guava User Guide article on <a href=
046 * "http://code.google.com/p/guava-libraries/wiki/StringsExplained#CharMatcher">
047 * {@code CharMatcher}</a>.
048 *
049 * @author Kevin Bourrillion
050 * @since 1.0
051 */
052@Beta // Possibly change from chars to code points; decide constants vs. methods
053@GwtCompatible
054public abstract class CharMatcher implements Predicate<Character> {
055  // Constants
056  /**
057   * Determines whether a character is a breaking whitespace (that is, a whitespace which can be
058   * interpreted as a break between words for formatting purposes). See {@link #WHITESPACE} for a
059   * discussion of that term.
060   *
061   * @since 2.0
062   */
063  public static final CharMatcher BREAKING_WHITESPACE =
064      anyOf("\t\n\013\f\r \u0085\u1680\u2028\u2029\u205f\u3000")
065          .or(inRange('\u2000', '\u2006'))
066          .or(inRange('\u2008', '\u200a'))
067          .withToString("CharMatcher.BREAKING_WHITESPACE")
068          .precomputed();
069
070  /**
071   * Determines whether a character is ASCII, meaning that its code point is less than 128.
072   */
073  public static final CharMatcher ASCII = inRange('\0', '\u007f', "CharMatcher.ASCII");
074
075  /**
076   * Determines whether a character is a digit according to
077   * <a href="http://unicode.org/cldr/utility/list-unicodeset.jsp?a=%5Cp%7Bdigit%7D">Unicode</a>.
078   */
079  public static final CharMatcher DIGIT;
080
081  static {
082    CharMatcher digit = inRange('0', '9');
083    String zeroes =
084        "\u0660\u06f0\u07c0\u0966\u09e6\u0a66\u0ae6\u0b66\u0be6\u0c66"
085            + "\u0ce6\u0d66\u0e50\u0ed0\u0f20\u1040\u1090\u17e0\u1810\u1946"
086            + "\u19d0\u1b50\u1bb0\u1c40\u1c50\ua620\ua8d0\ua900\uaa50\uff10";
087    for (char base : zeroes.toCharArray()) {
088      digit = digit.or(inRange(base, (char) (base + 9)));
089    }
090    DIGIT = digit.withToString("CharMatcher.DIGIT").precomputed();
091  }
092
093  /**
094   * Determines whether a character is a digit according to {@link Character#isDigit(char) Java's
095   * definition}. If you only care to match ASCII digits, you can use {@code inRange('0', '9')}.
096   */
097  public static final CharMatcher JAVA_DIGIT = new CharMatcher("CharMatcher.JAVA_DIGIT") {
098    @Override public boolean matches(char c) {
099      return Character.isDigit(c);
100    }
101  };
102
103  /**
104   * Determines whether a character is a letter according to {@link Character#isLetter(char) Java's
105   * definition}. If you only care to match letters of the Latin alphabet, you can use {@code
106   * inRange('a', 'z').or(inRange('A', 'Z'))}.
107   */
108  public static final CharMatcher JAVA_LETTER = new CharMatcher("CharMatcher.JAVA_LETTER") {
109    @Override public boolean matches(char c) {
110      return Character.isLetter(c);
111    }
112
113    @Override public CharMatcher precomputed() {
114      return this;
115    }
116  };
117
118  /**
119   * Determines whether a character is a letter or digit according to {@link
120   * Character#isLetterOrDigit(char) Java's definition}.
121   */
122  public static final CharMatcher JAVA_LETTER_OR_DIGIT =
123      new CharMatcher("CharMatcher.JAVA_LETTER_OR_DIGIT") {
124    @Override public boolean matches(char c) {
125      return Character.isLetterOrDigit(c);
126    }
127  };
128
129  /**
130   * Determines whether a character is upper case according to {@link Character#isUpperCase(char)
131   * Java's definition}.
132   */
133  public static final CharMatcher JAVA_UPPER_CASE =
134      new CharMatcher("CharMatcher.JAVA_UPPER_CASE") {
135    @Override public boolean matches(char c) {
136      return Character.isUpperCase(c);
137    }
138  };
139
140  /**
141   * Determines whether a character is lower case according to {@link Character#isLowerCase(char)
142   * Java's definition}.
143   */
144  public static final CharMatcher JAVA_LOWER_CASE =
145      new CharMatcher("CharMatcher.JAVA_LOWER_CASE") {
146    @Override public boolean matches(char c) {
147      return Character.isLowerCase(c);
148    }
149  };
150
151  /**
152   * Determines whether a character is an ISO control character as specified by {@link
153   * Character#isISOControl(char)}.
154   */
155  public static final CharMatcher JAVA_ISO_CONTROL =
156      inRange('\u0000', '\u001f')
157      .or(inRange('\u007f', '\u009f'))
158      .withToString("CharMatcher.JAVA_ISO_CONTROL");
159
160  /**
161   * Determines whether a character is invisible; that is, if its Unicode category is any of
162   * SPACE_SEPARATOR, LINE_SEPARATOR, PARAGRAPH_SEPARATOR, CONTROL, FORMAT, SURROGATE, and
163   * PRIVATE_USE according to ICU4J.
164   */
165  public static final CharMatcher INVISIBLE = inRange('\u0000', '\u0020')
166      .or(inRange('\u007f', '\u00a0'))
167      .or(is('\u00ad'))
168      .or(inRange('\u0600', '\u0604'))
169      .or(anyOf("\u06dd\u070f\u1680\u180e"))
170      .or(inRange('\u2000', '\u200f'))
171      .or(inRange('\u2028', '\u202f'))
172      .or(inRange('\u205f', '\u2064'))
173      .or(inRange('\u206a', '\u206f'))
174      .or(is('\u3000'))
175      .or(inRange('\ud800', '\uf8ff'))
176      .or(anyOf("\ufeff\ufff9\ufffa\ufffb"))
177      .withToString("CharMatcher.INVISIBLE")
178      .precomputed();
179
180  /**
181   * Determines whether a character is single-width (not double-width). When in doubt, this matcher
182   * errs on the side of returning {@code false} (that is, it tends to assume a character is
183   * double-width).
184   *
185   * <p><b>Note:</b> as the reference file evolves, we will modify this constant to keep it up to
186   * date.
187   */
188  public static final CharMatcher SINGLE_WIDTH = inRange('\u0000', '\u04f9')
189      .or(is('\u05be'))
190      .or(inRange('\u05d0', '\u05ea'))
191      .or(is('\u05f3'))
192      .or(is('\u05f4'))
193      .or(inRange('\u0600', '\u06ff'))
194      .or(inRange('\u0750', '\u077f'))
195      .or(inRange('\u0e00', '\u0e7f'))
196      .or(inRange('\u1e00', '\u20af'))
197      .or(inRange('\u2100', '\u213a'))
198      .or(inRange('\ufb50', '\ufdff'))
199      .or(inRange('\ufe70', '\ufeff'))
200      .or(inRange('\uff61', '\uffdc'))
201      .withToString("CharMatcher.SINGLE_WIDTH")
202      .precomputed();
203
204  /** Matches any character. */
205  public static final CharMatcher ANY =
206      new CharMatcher("CharMatcher.ANY") {
207        @Override public boolean matches(char c) {
208          return true;
209        }
210
211        @Override public int indexIn(CharSequence sequence) {
212          return (sequence.length() == 0) ? -1 : 0;
213        }
214
215        @Override public int indexIn(CharSequence sequence, int start) {
216          int length = sequence.length();
217          Preconditions.checkPositionIndex(start, length);
218          return (start == length) ? -1 : start;
219        }
220
221        @Override public int lastIndexIn(CharSequence sequence) {
222          return sequence.length() - 1;
223        }
224
225        @Override public boolean matchesAllOf(CharSequence sequence) {
226          checkNotNull(sequence);
227          return true;
228        }
229
230        @Override public boolean matchesNoneOf(CharSequence sequence) {
231          return sequence.length() == 0;
232        }
233
234        @Override public String removeFrom(CharSequence sequence) {
235          checkNotNull(sequence);
236          return "";
237        }
238
239        @Override public String replaceFrom(CharSequence sequence, char replacement) {
240          char[] array = new char[sequence.length()];
241          Arrays.fill(array, replacement);
242          return new String(array);
243        }
244
245        @Override public String replaceFrom(CharSequence sequence, CharSequence replacement) {
246          StringBuilder retval = new StringBuilder(sequence.length() * replacement.length());
247          for (int i = 0; i < sequence.length(); i++) {
248            retval.append(replacement);
249          }
250          return retval.toString();
251        }
252
253        @Override public String collapseFrom(CharSequence sequence, char replacement) {
254          return (sequence.length() == 0) ? "" : String.valueOf(replacement);
255        }
256
257        @Override public String trimFrom(CharSequence sequence) {
258          checkNotNull(sequence);
259          return "";
260        }
261
262        @Override public int countIn(CharSequence sequence) {
263          return sequence.length();
264        }
265
266        @Override public CharMatcher and(CharMatcher other) {
267          return checkNotNull(other);
268        }
269
270        @Override public CharMatcher or(CharMatcher other) {
271          checkNotNull(other);
272          return this;
273        }
274
275        @Override public CharMatcher negate() {
276          return NONE;
277        }
278
279        @Override public CharMatcher precomputed() {
280          return this;
281        }
282      };
283
284  /** Matches no characters. */
285  public static final CharMatcher NONE =
286      new CharMatcher("CharMatcher.NONE") {
287        @Override public boolean matches(char c) {
288          return false;
289        }
290
291        @Override public int indexIn(CharSequence sequence) {
292          checkNotNull(sequence);
293          return -1;
294        }
295
296        @Override public int indexIn(CharSequence sequence, int start) {
297          int length = sequence.length();
298          Preconditions.checkPositionIndex(start, length);
299          return -1;
300        }
301
302        @Override public int lastIndexIn(CharSequence sequence) {
303          checkNotNull(sequence);
304          return -1;
305        }
306
307        @Override public boolean matchesAllOf(CharSequence sequence) {
308          return sequence.length() == 0;
309        }
310
311        @Override public boolean matchesNoneOf(CharSequence sequence) {
312          checkNotNull(sequence);
313          return true;
314        }
315
316        @Override public String removeFrom(CharSequence sequence) {
317          return sequence.toString();
318        }
319
320        @Override public String replaceFrom(CharSequence sequence, char replacement) {
321          return sequence.toString();
322        }
323
324        @Override public String replaceFrom(CharSequence sequence, CharSequence replacement) {
325          checkNotNull(replacement);
326          return sequence.toString();
327        }
328
329        @Override public String collapseFrom(CharSequence sequence, char replacement) {
330          return sequence.toString();
331        }
332
333        @Override public String trimFrom(CharSequence sequence) {
334          return sequence.toString();
335        }
336
337        @Override public int countIn(CharSequence sequence) {
338          checkNotNull(sequence);
339          return 0;
340        }
341
342        @Override public CharMatcher and(CharMatcher other) {
343          checkNotNull(other);
344          return this;
345        }
346
347        @Override public CharMatcher or(CharMatcher other) {
348          return checkNotNull(other);
349        }
350
351        @Override public CharMatcher negate() {
352          return ANY;
353        }
354
355        @Override void setBits(LookupTable table) {}
356
357        @Override public CharMatcher precomputed() {
358          return this;
359        }
360      };
361
362  // Static factories
363
364  /**
365   * Returns a {@code char} matcher that matches only one specified character.
366   */
367  public static CharMatcher is(final char match) {
368    String description = new StringBuilder("CharMatcher.is(")
369        .append(Integer.toHexString(match))
370        .append(")")
371        .toString();
372    return new CharMatcher(description) {
373      @Override public boolean matches(char c) {
374        return c == match;
375      }
376
377      @Override public String replaceFrom(CharSequence sequence, char replacement) {
378        return sequence.toString().replace(match, replacement);
379      }
380
381      @Override public CharMatcher and(CharMatcher other) {
382        return other.matches(match) ? this : NONE;
383      }
384
385      @Override public CharMatcher or(CharMatcher other) {
386        return other.matches(match) ? other : super.or(other);
387      }
388
389      @Override public CharMatcher negate() {
390        return isNot(match);
391      }
392
393      @Override void setBits(LookupTable table) {
394        table.set(match);
395      }
396
397      @Override public CharMatcher precomputed() {
398        return this;
399      }
400    };
401  }
402
403  /**
404   * Returns a {@code char} matcher that matches any character except the one specified.
405   *
406   * <p>To negate another {@code CharMatcher}, use {@link #negate()}.
407   */
408  public static CharMatcher isNot(final char match) {
409    String description = new StringBuilder("CharMatcher.isNot(")
410        .append(Integer.toHexString(match))
411        .append(")")
412        .toString();
413    return new CharMatcher(description) {
414      @Override public boolean matches(char c) {
415        return c != match;
416      }
417
418      @Override public CharMatcher and(CharMatcher other) {
419        return other.matches(match) ? super.and(other) : other;
420      }
421
422      @Override public CharMatcher or(CharMatcher other) {
423        return other.matches(match) ? ANY : this;
424      }
425
426      @Override public CharMatcher negate() {
427        return is(match);
428      }
429    };
430  }
431
432  /**
433   * Returns a {@code char} matcher that matches any character present in the given character
434   * sequence.
435   */
436  public static CharMatcher anyOf(final CharSequence sequence) {
437    switch (sequence.length()) {
438      case 0:
439        return NONE;
440      case 1:
441        return is(sequence.charAt(0));
442      case 2:
443        final char match1 = sequence.charAt(0);
444        final char match2 = sequence.charAt(1);
445        return new CharMatcher(
446            new StringBuilder("CharMatcher.anyOf(\"").append(sequence).append("\")").toString()) {
447          @Override public boolean matches(char c) {
448            return c == match1 || c == match2;
449          }
450
451          @Override void setBits(LookupTable table) {
452            table.set(match1);
453            table.set(match2);
454          }
455
456          @Override public CharMatcher precomputed() {
457            return this;
458          }
459        };
460    }
461    final char[] chars = sequence.toString().toCharArray();
462    Arrays.sort(chars);
463
464    return new CharMatcher(new StringBuilder("CharMatcher.anyOf(\"").append(chars)
465        .append("\")").toString()) {
466          @Override public boolean matches(char c) {
467            return Arrays.binarySearch(chars, c) >= 0;
468          }
469    };
470  }
471
472  /**
473   * Returns a {@code char} matcher that matches any character not present in the given character
474   * sequence.
475   */
476  public static CharMatcher noneOf(CharSequence sequence) {
477    return anyOf(sequence).negate();
478  }
479
480  /**
481   * Returns a {@code char} matcher that matches any character in a given range (both endpoints are
482   * inclusive). For example, to match any lowercase letter of the English alphabet, use {@code
483   * CharMatcher.inRange('a', 'z')}.
484   *
485   * @throws IllegalArgumentException if {@code endInclusive < startInclusive}
486   */
487  public static CharMatcher inRange(final char startInclusive, final char endInclusive) {
488    checkArgument(endInclusive >= startInclusive);
489    String description = new StringBuilder("CharMatcher.inRange(")
490        .append(Integer.toHexString(startInclusive))
491        .append(", ")
492        .append(Integer.toHexString(endInclusive))
493        .append(")")
494        .toString();
495    return inRange(startInclusive, endInclusive, description);
496  }
497
498  static CharMatcher inRange(final char startInclusive, final char endInclusive,
499      String description) {
500    return new CharMatcher(description) {
501      @Override public boolean matches(char c) {
502        return startInclusive <= c && c <= endInclusive;
503      }
504
505      @Override void setBits(LookupTable table) {
506        char c = startInclusive;
507        while (true) {
508          table.set(c);
509          if (c++ == endInclusive) {
510            break;
511          }
512        }
513      }
514
515      @Override public CharMatcher precomputed() {
516        return this;
517      }
518    };
519  }
520
521  /**
522   * Returns a matcher with identical behavior to the given {@link Character}-based predicate, but
523   * which operates on primitive {@code char} instances instead.
524   */
525  public static CharMatcher forPredicate(final Predicate<? super Character> predicate) {
526    checkNotNull(predicate);
527    if (predicate instanceof CharMatcher) {
528      return (CharMatcher) predicate;
529    }
530    String description = new StringBuilder("CharMatcher.forPredicate(")
531        .append(predicate)
532        .append(')')
533        .toString();
534    return new CharMatcher(description) {
535      @Override public boolean matches(char c) {
536        return predicate.apply(c);
537      }
538
539      @Override public boolean apply(Character character) {
540        return predicate.apply(checkNotNull(character));
541      }
542    };
543  }
544
545  // State
546  final String description;
547
548  // Constructors
549
550  /**
551   * Sets the {@code toString()} from the given description.
552   */
553  CharMatcher(String description) {
554    this.description = description;
555  }
556
557  /**
558   * Constructor for use by subclasses. When subclassing, you may want to override
559   * {@code toString()} to provide a useful description.
560   */
561  protected CharMatcher() {
562    description = "UnknownCharMatcher";
563  }
564
565  // Abstract methods
566
567  /** Determines a true or false value for the given character. */
568  public abstract boolean matches(char c);
569
570  // Non-static factories
571
572  /**
573   * Returns a matcher that matches any character not matched by this matcher.
574   */
575  public CharMatcher negate() {
576    final CharMatcher original = this;
577    return new CharMatcher(original + ".negate()") {
578      @Override public boolean matches(char c) {
579        return !original.matches(c);
580      }
581
582      @Override public boolean matchesAllOf(CharSequence sequence) {
583        return original.matchesNoneOf(sequence);
584      }
585
586      @Override public boolean matchesNoneOf(CharSequence sequence) {
587        return original.matchesAllOf(sequence);
588      }
589
590      @Override public int countIn(CharSequence sequence) {
591        return sequence.length() - original.countIn(sequence);
592      }
593
594      @Override public CharMatcher negate() {
595        return original;
596      }
597    };
598  }
599
600  /**
601   * Returns a matcher that matches any character matched by both this matcher and {@code other}.
602   */
603  public CharMatcher and(CharMatcher other) {
604    return new And(this, checkNotNull(other));
605  }
606
607  private static class And extends CharMatcher {
608    final CharMatcher first;
609    final CharMatcher second;
610
611    And(CharMatcher a, CharMatcher b) {
612      this(a, b, "CharMatcher.and(" + a + ", " + b + ")");
613    }
614
615    And(CharMatcher a, CharMatcher b, String description) {
616      super(description);
617      first = checkNotNull(a);
618      second = checkNotNull(b);
619    }
620
621    @Override
622    public CharMatcher and(CharMatcher other) {
623      return new And(this, other);
624    }
625
626    @Override
627    public boolean matches(char c) {
628      return first.matches(c) && second.matches(c);
629    }
630
631    @Override
632    CharMatcher withToString(String description) {
633      return new And(first, second, description);
634    }
635  }
636
637  /**
638   * Returns a matcher that matches any character matched by either this matcher or {@code other}.
639   */
640  public CharMatcher or(CharMatcher other) {
641    return new Or(this, checkNotNull(other));
642  }
643
644  private static class Or extends CharMatcher {
645    final CharMatcher first;
646    final CharMatcher second;
647
648    Or(CharMatcher a, CharMatcher b, String description) {
649      super(description);
650      first = checkNotNull(a);
651      second = checkNotNull(b);
652    }
653
654    Or(CharMatcher a, CharMatcher b) {
655      this(a, b, "CharMatcher.or(" + a + ", " + b + ")");
656    }
657
658    @Override
659    public CharMatcher or(CharMatcher other) {
660      return new Or(this, checkNotNull(other));
661    }
662
663    @Override
664    public boolean matches(char c) {
665      return first.matches(c) || second.matches(c);
666    }
667
668    @Override
669    CharMatcher withToString(String description) {
670      return new Or(first, second, description);
671    }
672  }
673
674  /**
675   * Returns a {@code char} matcher functionally equivalent to this one, but which may be faster to
676   * query than the original; your mileage may vary. Precomputation takes time and is likely to be
677   * worthwhile only if the precomputed matcher is queried many thousands of times.
678   *
679   * <p>This method has no effect (returns {@code this}) when called in GWT: it's unclear whether a
680   * precomputed matcher is faster, but it certainly consumes more memory, which doesn't seem like a
681   * worthwhile tradeoff in a browser.
682   */
683  public CharMatcher precomputed() {
684    return Platform.precomputeCharMatcher(this);
685  }
686
687  /**
688   * Construct an array of all possible chars in the slowest way possible.
689   */
690  char[] slowGetChars() {
691    char[] allChars = new char[65536];
692    int size = 0;
693    for (int c = Character.MIN_VALUE; c <= Character.MAX_VALUE; c++) {
694      if (matches((char) c)) {
695        allChars[size++] = (char) c;
696      }
697    }
698    char[] retValue = new char[size];
699    System.arraycopy(allChars, 0, retValue, 0, size);
700    return retValue;
701  }
702
703  /**
704   * This is the actual implementation of {@link #precomputed}, but we bounce calls through a method
705   * on {@link Platform} so that we can have different behavior in GWT.
706   *
707   * <p>If the number of matched characters is small enough, we try to build a small hash
708   * table to contain all of the characters. Otherwise, we record the characters in eight-kilobyte
709   * bit array. In many situations this produces a matcher which is faster to query
710   * than the original.
711   */
712  CharMatcher precomputedInternal() {
713    final char[] chars = slowGetChars();
714    int totalCharacters = chars.length;
715    if (totalCharacters == 0) {
716      return NONE;
717    } else if (totalCharacters == 1) {
718      return is(chars[0]);
719    } else if (totalCharacters < SmallCharMatcher.MAX_SIZE) {
720      return SmallCharMatcher.from(chars, toString());
721    } else if (totalCharacters < MediumCharMatcher.MAX_SIZE) {
722      return MediumCharMatcher.from(chars, toString());
723    }
724    // Otherwise, make the full lookup table.
725    final LookupTable table = new LookupTable();
726    setBits(table);
727    final CharMatcher outer = this;
728
729    return new CharMatcher(outer.toString()) {
730      @Override public boolean matches(char c) {
731        return table.get(c);
732      }
733
734      // TODO(kevinb): make methods like negate() smart?
735
736      @Override public CharMatcher precomputed() {
737        return this;
738      }
739    };
740  }
741
742  /**
743   * Subclasses should provide a new CharMatcher with the same characteristics as {@code this},
744   * but with their {@code toString} method overridden with the new description.
745   *
746   * <p>This is unsupported by default.
747   */
748  CharMatcher withToString(String description) {
749    throw new UnsupportedOperationException();
750
751  }
752
753  /**
754   * For use by implementors; sets the bit corresponding to each character ('\0' to '{@literal
755   * \}uFFFF') that matches this matcher in the given bit array, leaving all other bits untouched.
756   *
757   * <p>The default implementation loops over every possible character value, invoking {@link
758   * #matches} for each one.
759   */
760  void setBits(LookupTable table) {
761    char c = Character.MIN_VALUE;
762    while (true) {
763      if (matches(c)) {
764        table.set(c);
765      }
766      if (c++ == Character.MAX_VALUE) {
767        break;
768      }
769    }
770  }
771
772  /**
773   * A bit array with one bit per {@code char} value, used by {@link CharMatcher#precomputed}.
774   *
775   * <p>TODO(kevinb): possibly share a common BitArray class with BloomFilter and others... a
776   * simpler java.util.BitSet.
777   */
778  private static final class LookupTable {
779    int[] data = new int[2048];
780
781    void set(char index) {
782      data[index >> 5] |= (1 << index);
783    }
784
785    boolean get(char index) {
786      return (data[index >> 5] & (1 << index)) != 0;
787    }
788  }
789
790  // Text processing routines
791
792  /**
793   * Returns {@code true} if a character sequence contains at least one matching character.
794   * Equivalent to {@code !matchesNoneOf(sequence)}.
795   *
796   * <p>The default implementation iterates over the sequence, invoking {@link #matches} for each
797   * character, until this returns {@code true} or the end is reached.
798   *
799   * @param sequence the character sequence to examine, possibly empty
800   * @return {@code true} if this matcher matches at least one character in the sequence
801   * @since 8.0
802   */
803  public boolean matchesAnyOf(CharSequence sequence) {
804    return !matchesNoneOf(sequence);
805  }
806
807  /**
808   * Returns {@code true} if a character sequence contains only matching characters.
809   *
810   * <p>The default implementation iterates over the sequence, invoking {@link #matches} for each
811   * character, until this returns {@code false} or the end is reached.
812   *
813   * @param sequence the character sequence to examine, possibly empty
814   * @return {@code true} if this matcher matches every character in the sequence, including when
815   *         the sequence is empty
816   */
817  public boolean matchesAllOf(CharSequence sequence) {
818    for (int i = sequence.length() - 1; i >= 0; i--) {
819      if (!matches(sequence.charAt(i))) {
820        return false;
821      }
822    }
823    return true;
824  }
825
826  /**
827   * Returns {@code true} if a character sequence contains no matching characters. Equivalent to
828   * {@code !matchesAnyOf(sequence)}.
829   *
830   * <p>The default implementation iterates over the sequence, invoking {@link #matches} for each
831   * character, until this returns {@code false} or the end is reached.
832   *
833   * @param sequence the character sequence to examine, possibly empty
834   * @return {@code true} if this matcher matches every character in the sequence, including when
835   *         the sequence is empty
836   */
837  public boolean matchesNoneOf(CharSequence sequence) {
838    return indexIn(sequence) == -1;
839  }
840
841  /**
842   * Returns the index of the first matching character in a character sequence, or {@code -1} if no
843   * matching character is present.
844   *
845   * <p>The default implementation iterates over the sequence in forward order calling {@link
846   * #matches} for each character.
847   *
848   * @param sequence the character sequence to examine from the beginning
849   * @return an index, or {@code -1} if no character matches
850   */
851  public int indexIn(CharSequence sequence) {
852    int length = sequence.length();
853    for (int i = 0; i < length; i++) {
854      if (matches(sequence.charAt(i))) {
855        return i;
856      }
857    }
858    return -1;
859  }
860
861  /**
862   * Returns the index of the first matching character in a character sequence, starting from a
863   * given position, or {@code -1} if no character matches after that position.
864   *
865   * <p>The default implementation iterates over the sequence in forward order, beginning at {@code
866   * start}, calling {@link #matches} for each character.
867   *
868   * @param sequence the character sequence to examine
869   * @param start the first index to examine; must be nonnegative and no greater than {@code
870   *        sequence.length()}
871   * @return the index of the first matching character, guaranteed to be no less than {@code start},
872   *         or {@code -1} if no character matches
873   * @throws IndexOutOfBoundsException if start is negative or greater than {@code
874   *         sequence.length()}
875   */
876  public int indexIn(CharSequence sequence, int start) {
877    int length = sequence.length();
878    Preconditions.checkPositionIndex(start, length);
879    for (int i = start; i < length; i++) {
880      if (matches(sequence.charAt(i))) {
881        return i;
882      }
883    }
884    return -1;
885  }
886
887  /**
888   * Returns the index of the last matching character in a character sequence, or {@code -1} if no
889   * matching character is present.
890   *
891   * <p>The default implementation iterates over the sequence in reverse order calling {@link
892   * #matches} for each character.
893   *
894   * @param sequence the character sequence to examine from the end
895   * @return an index, or {@code -1} if no character matches
896   */
897  public int lastIndexIn(CharSequence sequence) {
898    for (int i = sequence.length() - 1; i >= 0; i--) {
899      if (matches(sequence.charAt(i))) {
900        return i;
901      }
902    }
903    return -1;
904  }
905
906  /**
907   * Returns the number of matching characters found in a character sequence.
908   */
909  public int countIn(CharSequence sequence) {
910    int count = 0;
911    for (int i = 0; i < sequence.length(); i++) {
912      if (matches(sequence.charAt(i))) {
913        count++;
914      }
915    }
916    return count;
917  }
918
919  /**
920   * Returns a string containing all non-matching characters of a character sequence, in order. For
921   * example: <pre>   {@code
922   *
923   *   CharMatcher.is('a').removeFrom("bazaar")}</pre>
924   *
925   * ... returns {@code "bzr"}.
926   */
927  @CheckReturnValue
928  public String removeFrom(CharSequence sequence) {
929    String string = sequence.toString();
930    int pos = indexIn(string);
931    if (pos == -1) {
932      return string;
933    }
934
935    char[] chars = string.toCharArray();
936    int spread = 1;
937
938    // This unusual loop comes from extensive benchmarking
939    OUT: while (true) {
940      pos++;
941      while (true) {
942        if (pos == chars.length) {
943          break OUT;
944        }
945        if (matches(chars[pos])) {
946          break;
947        }
948        chars[pos - spread] = chars[pos];
949        pos++;
950      }
951      spread++;
952    }
953    return new String(chars, 0, pos - spread);
954  }
955
956  /**
957   * Returns a string containing all matching characters of a character sequence, in order. For
958   * example: <pre>   {@code
959   *
960   *   CharMatcher.is('a').retainFrom("bazaar")}</pre>
961   *
962   * ... returns {@code "aaa"}.
963   */
964  @CheckReturnValue
965  public String retainFrom(CharSequence sequence) {
966    return negate().removeFrom(sequence);
967  }
968
969  /**
970   * Returns a string copy of the input character sequence, with each character that matches this
971   * matcher replaced by a given replacement character. For example: <pre>   {@code
972   *
973   *   CharMatcher.is('a').replaceFrom("radar", 'o')}</pre>
974   *
975   * ... returns {@code "rodor"}.
976   *
977   * <p>The default implementation uses {@link #indexIn(CharSequence)} to find the first matching
978   * character, then iterates the remainder of the sequence calling {@link #matches(char)} for each
979   * character.
980   *
981   * @param sequence the character sequence to replace matching characters in
982   * @param replacement the character to append to the result string in place of each matching
983   *        character in {@code sequence}
984   * @return the new string
985   */
986  @CheckReturnValue
987  public String replaceFrom(CharSequence sequence, char replacement) {
988    String string = sequence.toString();
989    int pos = indexIn(string);
990    if (pos == -1) {
991      return string;
992    }
993    char[] chars = string.toCharArray();
994    chars[pos] = replacement;
995    for (int i = pos + 1; i < chars.length; i++) {
996      if (matches(chars[i])) {
997        chars[i] = replacement;
998      }
999    }
1000    return new String(chars);
1001  }
1002
1003  /**
1004   * Returns a string copy of the input character sequence, with each character that matches this
1005   * matcher replaced by a given replacement sequence. For example: <pre>   {@code
1006   *
1007   *   CharMatcher.is('a').replaceFrom("yaha", "oo")}</pre>
1008   *
1009   * ... returns {@code "yoohoo"}.
1010   *
1011   * <p><b>Note:</b> If the replacement is a fixed string with only one character, you are better
1012   * off calling {@link #replaceFrom(CharSequence, char)} directly.
1013   *
1014   * @param sequence the character sequence to replace matching characters in
1015   * @param replacement the characters to append to the result string in place of each matching
1016   *        character in {@code sequence}
1017   * @return the new string
1018   */
1019  @CheckReturnValue
1020  public String replaceFrom(CharSequence sequence, CharSequence replacement) {
1021    int replacementLen = replacement.length();
1022    if (replacementLen == 0) {
1023      return removeFrom(sequence);
1024    }
1025    if (replacementLen == 1) {
1026      return replaceFrom(sequence, replacement.charAt(0));
1027    }
1028
1029    String string = sequence.toString();
1030    int pos = indexIn(string);
1031    if (pos == -1) {
1032      return string;
1033    }
1034
1035    int len = string.length();
1036    StringBuilder buf = new StringBuilder((len * 3 / 2) + 16);
1037
1038    int oldpos = 0;
1039    do {
1040      buf.append(string, oldpos, pos);
1041      buf.append(replacement);
1042      oldpos = pos + 1;
1043      pos = indexIn(string, oldpos);
1044    } while (pos != -1);
1045
1046    buf.append(string, oldpos, len);
1047    return buf.toString();
1048  }
1049
1050  /**
1051   * Returns a substring of the input character sequence that omits all characters this matcher
1052   * matches from the beginning and from the end of the string. For example: <pre>   {@code
1053   *
1054   *   CharMatcher.anyOf("ab").trimFrom("abacatbab")}</pre>
1055   *
1056   * ... returns {@code "cat"}.
1057   *
1058   * <p>Note that: <pre>   {@code
1059   *
1060   *   CharMatcher.inRange('\0', ' ').trimFrom(str)}</pre>
1061   *
1062   * ... is equivalent to {@link String#trim()}.
1063   */
1064  @CheckReturnValue
1065  public String trimFrom(CharSequence sequence) {
1066    int len = sequence.length();
1067    int first;
1068    int last;
1069
1070    for (first = 0; first < len; first++) {
1071      if (!matches(sequence.charAt(first))) {
1072        break;
1073      }
1074    }
1075    for (last = len - 1; last > first; last--) {
1076      if (!matches(sequence.charAt(last))) {
1077        break;
1078      }
1079    }
1080
1081    return sequence.subSequence(first, last + 1).toString();
1082  }
1083
1084  /**
1085   * Returns a substring of the input character sequence that omits all characters this matcher
1086   * matches from the beginning of the string. For example: <pre> {@code
1087   *
1088   *   CharMatcher.anyOf("ab").trimLeadingFrom("abacatbab")}</pre>
1089   *
1090   * ... returns {@code "catbab"}.
1091   */
1092  @CheckReturnValue
1093  public String trimLeadingFrom(CharSequence sequence) {
1094    int len = sequence.length();
1095    int first;
1096
1097    for (first = 0; first < len; first++) {
1098      if (!matches(sequence.charAt(first))) {
1099        break;
1100      }
1101    }
1102
1103    return sequence.subSequence(first, len).toString();
1104  }
1105
1106  /**
1107   * Returns a substring of the input character sequence that omits all characters this matcher
1108   * matches from the end of the string. For example: <pre> {@code
1109   *
1110   *   CharMatcher.anyOf("ab").trimTrailingFrom("abacatbab")}</pre>
1111   *
1112   * ... returns {@code "abacat"}.
1113   */
1114  @CheckReturnValue
1115  public String trimTrailingFrom(CharSequence sequence) {
1116    int len = sequence.length();
1117    int last;
1118
1119    for (last = len - 1; last >= 0; last--) {
1120      if (!matches(sequence.charAt(last))) {
1121        break;
1122      }
1123    }
1124
1125    return sequence.subSequence(0, last + 1).toString();
1126  }
1127
1128  /**
1129   * Returns a string copy of the input character sequence, with each group of consecutive
1130   * characters that match this matcher replaced by a single replacement character. For example:
1131   * <pre>   {@code
1132   *
1133   *   CharMatcher.anyOf("eko").collapseFrom("bookkeeper", '-')}</pre>
1134   *
1135   * ... returns {@code "b-p-r"}.
1136   *
1137   * <p>The default implementation uses {@link #indexIn(CharSequence)} to find the first matching
1138   * character, then iterates the remainder of the sequence calling {@link #matches(char)} for each
1139   * character.
1140   *
1141   * @param sequence the character sequence to replace matching groups of characters in
1142   * @param replacement the character to append to the result string in place of each group of
1143   *        matching characters in {@code sequence}
1144   * @return the new string
1145   */
1146  @CheckReturnValue
1147  public String collapseFrom(CharSequence sequence, char replacement) {
1148    int first = indexIn(sequence);
1149    if (first == -1) {
1150      return sequence.toString();
1151    }
1152
1153    // TODO(kevinb): see if this implementation can be made faster
1154    StringBuilder builder = new StringBuilder(sequence.length())
1155        .append(sequence.subSequence(0, first))
1156        .append(replacement);
1157    boolean in = true;
1158    for (int i = first + 1; i < sequence.length(); i++) {
1159      char c = sequence.charAt(i);
1160      if (matches(c)) {
1161        if (!in) {
1162          builder.append(replacement);
1163          in = true;
1164        }
1165      } else {
1166        builder.append(c);
1167        in = false;
1168      }
1169    }
1170    return builder.toString();
1171  }
1172
1173  /**
1174   * Collapses groups of matching characters exactly as {@link #collapseFrom} does, except that
1175   * groups of matching characters at the start or end of the sequence are removed without
1176   * replacement.
1177   */
1178  @CheckReturnValue
1179  public String trimAndCollapseFrom(CharSequence sequence, char replacement) {
1180    int first = negate().indexIn(sequence);
1181    if (first == -1) {
1182      return ""; // everything matches. nothing's left.
1183    }
1184    StringBuilder builder = new StringBuilder(sequence.length());
1185    boolean inMatchingGroup = false;
1186    for (int i = first; i < sequence.length(); i++) {
1187      char c = sequence.charAt(i);
1188      if (matches(c)) {
1189        inMatchingGroup = true;
1190      } else {
1191        if (inMatchingGroup) {
1192          builder.append(replacement);
1193          inMatchingGroup = false;
1194        }
1195        builder.append(c);
1196      }
1197    }
1198    return builder.toString();
1199  }
1200
1201  // Predicate interface
1202
1203  /**
1204   * Returns {@code true} if this matcher matches the given character.
1205   *
1206   * @throws NullPointerException if {@code character} is null
1207   */
1208  @Override public boolean apply(Character character) {
1209    return matches(character);
1210  }
1211
1212  /**
1213   * Returns a string representation of this {@code CharMatcher}, such as
1214   * {@code CharMatcher.or(WHITESPACE, JAVA_DIGIT)}.
1215   */
1216  @Override
1217  public String toString() {
1218    return description;
1219  }
1220
1221  /**
1222   * Determines whether a character is whitespace according to the latest Unicode standard, as
1223   * illustrated
1224   * <a href="http://unicode.org/cldr/utility/list-unicodeset.jsp?a=%5Cp%7Bwhitespace%7D">here</a>.
1225   * This is not the same definition used by other Java APIs. (See a
1226   * <a href="http://spreadsheets.google.com/pub?key=pd8dAQyHbdewRsnE5x5GzKQ">comparison of several
1227   * definitions of "whitespace"</a>.)
1228   *
1229   * <p><b>Note:</b> as the Unicode definition evolves, we will modify this constant to keep it up
1230   * to date.
1231   */
1232  public static final CharMatcher WHITESPACE = new CharMatcher("CharMatcher.WHITESPACE") {
1233    /**
1234     * A special-case CharMatcher for Unicode whitespace characters that is extremely
1235     * efficient both in space required and in time to check for matches.
1236     *
1237     * Implementation details.
1238     * It turns out that all current (early 2012) Unicode characters are unique modulo 79:
1239     * so we can construct a lookup table of exactly 79 entries, and just check the character code
1240     * mod 79, and see if that character is in the table.
1241     *
1242     * There is a 1 at the beginning of the table so that the null character is not listed
1243     * as whitespace.
1244     *
1245     * Other things we tried that did not prove to be beneficial, mostly due to speed concerns:
1246     *
1247     *   * Binary search into the sorted list of characters, i.e., what
1248     *     CharMatcher.anyOf() does</li>
1249     *   * Perfect hash function into a table of size 26 (using an offset table and a special
1250     *     Jenkins hash function)</li>
1251     *   * Perfect-ish hash function that required two lookups into a single table of size 26.</li>
1252     *   * Using a power-of-2 sized hash table (size 64) with linear probing.</li>
1253     *
1254     * --Christopher Swenson, February 2012.
1255     */
1256
1257    // Mod-79 lookup table.
1258    private final char[] table = {1, 0, 160, 0, 0, 0, 0, 0, 0, 9, 10, 11, 12, 13, 0, 0,
1259        8232, 8233, 0, 0, 0, 0, 0, 8239, 0, 0, 0, 0, 0, 0, 0, 0, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1260        12288, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 133, 8192, 8193, 8194, 8195, 8196, 8197, 8198, 8199,
1261        8200, 8201, 8202, 0, 0, 0, 0, 0, 8287, 5760, 0, 0, 6158, 0, 0, 0};
1262
1263    @Override public boolean matches(char c) {
1264      return table[c % 79] == c;
1265    }
1266
1267    @Override public CharMatcher precomputed() {
1268      return this;
1269    }
1270  };
1271}