001/*
002 * Copyright (C) 2009 The Guava Authors
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 *
008 * http://www.apache.org/licenses/LICENSE-2.0
009 *
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 */
016
017package com.google.common.base;
018
019import static com.google.common.base.Preconditions.checkArgument;
020import static com.google.common.base.Preconditions.checkNotNull;
021
022import com.google.common.annotations.Beta;
023import com.google.common.annotations.GwtCompatible;
024import com.google.common.annotations.GwtIncompatible;
025
026import java.util.Collections;
027import java.util.Iterator;
028import java.util.LinkedHashMap;
029import java.util.Map;
030import java.util.regex.Matcher;
031import java.util.regex.Pattern;
032
033import javax.annotation.CheckReturnValue;
034
035/**
036 * An object that divides strings (or other instances of {@code CharSequence})
037 * into substrings, by recognizing a <i>separator</i> (a.k.a. "delimiter")
038 * which can be expressed as a single character, literal string, regular
039 * expression, {@code CharMatcher}, or by using a fixed substring length. This
040 * class provides the complementary functionality to {@link Joiner}.
041 *
042 * <p>Here is the most basic example of {@code Splitter} usage: <pre>   {@code
043 *
044 *   Splitter.on(',').split("foo,bar")}</pre>
045 *
046 * This invocation returns an {@code Iterable<String>} containing {@code "foo"}
047 * and {@code "bar"}, in that order.
048 *
049 * <p>By default {@code Splitter}'s behavior is very simplistic: <pre>   {@code
050 *
051 *   Splitter.on(',').split("foo,,bar, quux")}</pre>
052 *
053 * This returns an iterable containing {@code ["foo", "", "bar", " quux"]}.
054 * Notice that the splitter does not assume that you want empty strings removed,
055 * or that you wish to trim whitespace. If you want features like these, simply
056 * ask for them: <pre> {@code
057 *
058 *   private static final Splitter MY_SPLITTER = Splitter.on(',')
059 *       .trimResults()
060 *       .omitEmptyStrings();}</pre>
061 *
062 * Now {@code MY_SPLITTER.split("foo, ,bar, quux,")} returns an iterable
063 * containing just {@code ["foo", "bar", "quux"]}. Note that the order in which
064 * the configuration methods are called is never significant; for instance,
065 * trimming is always applied first before checking for an empty result,
066 * regardless of the order in which the {@link #trimResults()} and
067 * {@link #omitEmptyStrings()} methods were invoked.
068 *
069 * <p><b>Warning: splitter instances are always immutable</b>; a configuration
070 * method such as {@code omitEmptyStrings} has no effect on the instance it
071 * is invoked on! You must store and use the new splitter instance returned by
072 * the method. This makes splitters thread-safe, and safe to store as {@code
073 * static final} constants (as illustrated above). <pre>   {@code
074 *
075 *   // Bad! Do not do this!
076 *   Splitter splitter = Splitter.on('/');
077 *   splitter.trimResults(); // does nothing!
078 *   return splitter.split("wrong / wrong / wrong");}</pre>
079 *
080 * The separator recognized by the splitter does not have to be a single
081 * literal character as in the examples above. See the methods {@link
082 * #on(String)}, {@link #on(Pattern)} and {@link #on(CharMatcher)} for examples
083 * of other ways to specify separators.
084 *
085 * <p><b>Note:</b> this class does not mimic any of the quirky behaviors of
086 * similar JDK methods; for instance, it does not silently discard trailing
087 * separators, as does {@link String#split(String)}, nor does it have a default
088 * behavior of using five particular whitespace characters as separators, like
089 * {@link java.util.StringTokenizer}.
090 *
091 * <p>See the Guava User Guide article on <a href=
092 * "http://code.google.com/p/guava-libraries/wiki/StringsExplained#Splitter">
093 * {@code Splitter}</a>.
094 *
095 * @author Julien Silland
096 * @author Jesse Wilson
097 * @author Kevin Bourrillion
098 * @author Louis Wasserman
099 * @since 1.0
100 */
101@GwtCompatible(emulated = true)
102public final class Splitter {
103  private final CharMatcher trimmer;
104  private final boolean omitEmptyStrings;
105  private final Strategy strategy;
106  private final int limit;
107
108  private Splitter(Strategy strategy) {
109    this(strategy, false, CharMatcher.NONE, Integer.MAX_VALUE);
110  }
111
112  private Splitter(Strategy strategy, boolean omitEmptyStrings,
113      CharMatcher trimmer, int limit) {
114    this.strategy = strategy;
115    this.omitEmptyStrings = omitEmptyStrings;
116    this.trimmer = trimmer;
117    this.limit = limit;
118  }
119
120  /**
121   * Returns a splitter that uses the given single-character separator. For
122   * example, {@code Splitter.on(',').split("foo,,bar")} returns an iterable
123   * containing {@code ["foo", "", "bar"]}.
124   *
125   * @param separator the character to recognize as a separator
126   * @return a splitter, with default settings, that recognizes that separator
127   */
128  public static Splitter on(char separator) {
129    return on(CharMatcher.is(separator));
130  }
131
132  /**
133   * Returns a splitter that considers any single character matched by the
134   * given {@code CharMatcher} to be a separator. For example, {@code
135   * Splitter.on(CharMatcher.anyOf(";,")).split("foo,;bar,quux")} returns an
136   * iterable containing {@code ["foo", "", "bar", "quux"]}.
137   *
138   * @param separatorMatcher a {@link CharMatcher} that determines whether a
139   *     character is a separator
140   * @return a splitter, with default settings, that uses this matcher
141   */
142  public static Splitter on(final CharMatcher separatorMatcher) {
143    checkNotNull(separatorMatcher);
144
145    return new Splitter(new Strategy() {
146      @Override public SplittingIterator iterator(
147          Splitter splitter, final CharSequence toSplit) {
148        return new SplittingIterator(splitter, toSplit) {
149          @Override int separatorStart(int start) {
150            return separatorMatcher.indexIn(toSplit, start);
151          }
152
153          @Override int separatorEnd(int separatorPosition) {
154            return separatorPosition + 1;
155          }
156        };
157      }
158    });
159  }
160
161  /**
162   * Returns a splitter that uses the given fixed string as a separator. For
163   * example, {@code Splitter.on(", ").split("foo, bar, baz,qux")} returns an
164   * iterable containing {@code ["foo", "bar", "baz,qux"]}.
165   *
166   * @param separator the literal, nonempty string to recognize as a separator
167   * @return a splitter, with default settings, that recognizes that separator
168   */
169  public static Splitter on(final String separator) {
170    checkArgument(separator.length() != 0,
171        "The separator may not be the empty string.");
172
173    return new Splitter(new Strategy() {
174      @Override public SplittingIterator iterator(
175          Splitter splitter, CharSequence toSplit) {
176        return new SplittingIterator(splitter, toSplit) {
177          @Override public int separatorStart(int start) {
178            int delimeterLength = separator.length();
179
180            positions:
181            for (int p = start, last = toSplit.length() - delimeterLength;
182                p <= last; p++) {
183              for (int i = 0; i < delimeterLength; i++) {
184                if (toSplit.charAt(i + p) != separator.charAt(i)) {
185                  continue positions;
186                }
187              }
188              return p;
189            }
190            return -1;
191          }
192
193          @Override public int separatorEnd(int separatorPosition) {
194            return separatorPosition + separator.length();
195          }
196        };
197      }
198    });
199  }
200
201  /**
202   * Returns a splitter that considers any subsequence matching {@code
203   * pattern} to be a separator. For example, {@code
204   * Splitter.on(Pattern.compile("\r?\n")).split(entireFile)} splits a string
205   * into lines whether it uses DOS-style or UNIX-style line terminators.
206   *
207   * @param separatorPattern the pattern that determines whether a subsequence
208   *     is a separator. This pattern may not match the empty string.
209   * @return a splitter, with default settings, that uses this pattern
210   * @throws IllegalArgumentException if {@code separatorPattern} matches the
211   *     empty string
212   */
213  @GwtIncompatible("java.util.regex")
214  public static Splitter on(final Pattern separatorPattern) {
215    checkNotNull(separatorPattern);
216    checkArgument(!separatorPattern.matcher("").matches(),
217        "The pattern may not match the empty string: %s", separatorPattern);
218
219    return new Splitter(new Strategy() {
220      @Override public SplittingIterator iterator(
221          final Splitter splitter, CharSequence toSplit) {
222        final Matcher matcher = separatorPattern.matcher(toSplit);
223        return new SplittingIterator(splitter, toSplit) {
224          @Override public int separatorStart(int start) {
225            return matcher.find(start) ? matcher.start() : -1;
226          }
227
228          @Override public int separatorEnd(int separatorPosition) {
229            return matcher.end();
230          }
231        };
232      }
233    });
234  }
235
236  /**
237   * Returns a splitter that considers any subsequence matching a given
238   * pattern (regular expression) to be a separator. For example, {@code
239   * Splitter.onPattern("\r?\n").split(entireFile)} splits a string into lines
240   * whether it uses DOS-style or UNIX-style line terminators. This is
241   * equivalent to {@code Splitter.on(Pattern.compile(pattern))}.
242   *
243   * @param separatorPattern the pattern that determines whether a subsequence
244   *     is a separator. This pattern may not match the empty string.
245   * @return a splitter, with default settings, that uses this pattern
246   * @throws java.util.regex.PatternSyntaxException if {@code separatorPattern}
247   *     is a malformed expression
248   * @throws IllegalArgumentException if {@code separatorPattern} matches the
249   *     empty string
250   */
251  @GwtIncompatible("java.util.regex")
252  public static Splitter onPattern(String separatorPattern) {
253    return on(Pattern.compile(separatorPattern));
254  }
255
256  /**
257   * Returns a splitter that divides strings into pieces of the given length.
258   * For example, {@code Splitter.fixedLength(2).split("abcde")} returns an
259   * iterable containing {@code ["ab", "cd", "e"]}. The last piece can be
260   * smaller than {@code length} but will never be empty.
261   *
262   * @param length the desired length of pieces after splitting
263   * @return a splitter, with default settings, that can split into fixed sized
264   *     pieces
265   */
266  public static Splitter fixedLength(final int length) {
267    checkArgument(length > 0, "The length may not be less than 1");
268
269    return new Splitter(new Strategy() {
270      @Override public SplittingIterator iterator(
271          final Splitter splitter, CharSequence toSplit) {
272        return new SplittingIterator(splitter, toSplit) {
273          @Override public int separatorStart(int start) {
274            int nextChunkStart = start + length;
275            return (nextChunkStart < toSplit.length() ? nextChunkStart : -1);
276          }
277
278          @Override public int separatorEnd(int separatorPosition) {
279            return separatorPosition;
280          }
281        };
282      }
283    });
284  }
285
286  /**
287   * Returns a splitter that behaves equivalently to {@code this} splitter, but
288   * automatically omits empty strings from the results. For example, {@code
289   * Splitter.on(',').omitEmptyStrings().split(",a,,,b,c,,")} returns an
290   * iterable containing only {@code ["a", "b", "c"]}.
291   *
292   * <p>If either {@code trimResults} option is also specified when creating a
293   * splitter, that splitter always trims results first before checking for
294   * emptiness. So, for example, {@code
295   * Splitter.on(':').omitEmptyStrings().trimResults().split(": : : ")} returns
296   * an empty iterable.
297   *
298   * <p>Note that it is ordinarily not possible for {@link #split(CharSequence)}
299   * to return an empty iterable, but when using this option, it can (if the
300   * input sequence consists of nothing but separators).
301   *
302   * @return a splitter with the desired configuration
303   */
304  @CheckReturnValue
305  public Splitter omitEmptyStrings() {
306    return new Splitter(strategy, true, trimmer, limit);
307  }
308
309  /**
310   * Returns a splitter that behaves equivalently to {@code this} splitter but
311   * stops splitting after it reaches the limit.
312   * The limit defines the maximum number of items returned by the iterator.
313   *
314   * <p>For example,
315   * {@code Splitter.on(',').limit(3).split("a,b,c,d")} returns an iterable
316   * containing {@code ["a", "b", "c,d"]}.  When omitting empty strings, the
317   * omitted strings do no count.  Hence,
318   * {@code Splitter.on(',').limit(3).omitEmptyStrings().split("a,,,b,,,c,d")}
319   * returns an iterable containing {@code ["a", "b", "c,d"}.
320   * When trim is requested, all entries, including the last are trimmed.  Hence
321   * {@code Splitter.on(',').limit(3).trimResults().split(" a , b , c , d ")}
322   * results in @{code ["a", "b", "c , d"]}.
323   *
324   * @param limit the maximum number of items returns
325   * @return a splitter with the desired configuration
326   * @since 9.0
327   */
328  @CheckReturnValue
329  public Splitter limit(int limit) {
330    checkArgument(limit > 0, "must be greater than zero: %s", limit);
331    return new Splitter(strategy, omitEmptyStrings, trimmer, limit);
332  }
333
334  /**
335   * Returns a splitter that behaves equivalently to {@code this} splitter, but
336   * automatically removes leading and trailing {@linkplain
337   * CharMatcher#WHITESPACE whitespace} from each returned substring; equivalent
338   * to {@code trimResults(CharMatcher.WHITESPACE)}. For example, {@code
339   * Splitter.on(',').trimResults().split(" a, b ,c ")} returns an iterable
340   * containing {@code ["a", "b", "c"]}.
341   *
342   * @return a splitter with the desired configuration
343   */
344  @CheckReturnValue
345  public Splitter trimResults() {
346    return trimResults(CharMatcher.WHITESPACE);
347  }
348
349  /**
350   * Returns a splitter that behaves equivalently to {@code this} splitter, but
351   * removes all leading or trailing characters matching the given {@code
352   * CharMatcher} from each returned substring. For example, {@code
353   * Splitter.on(',').trimResults(CharMatcher.is('_')).split("_a ,_b_ ,c__")}
354   * returns an iterable containing {@code ["a ", "b_ ", "c"]}.
355   *
356   * @param trimmer a {@link CharMatcher} that determines whether a character
357   *     should be removed from the beginning/end of a subsequence
358   * @return a splitter with the desired configuration
359   */
360  // TODO(kevinb): throw if a trimmer was already specified!
361  @CheckReturnValue
362  public Splitter trimResults(CharMatcher trimmer) {
363    checkNotNull(trimmer);
364    return new Splitter(strategy, omitEmptyStrings, trimmer, limit);
365  }
366
367  /**
368   * Splits {@code sequence} into string components and makes them available
369   * through an {@link Iterator}, which may be lazily evaluated.
370   *
371   * @param sequence the sequence of characters to split
372   * @return an iteration over the segments split from the parameter.
373   */
374  public Iterable<String> split(final CharSequence sequence) {
375    checkNotNull(sequence);
376
377    return new Iterable<String>() {
378      @Override public Iterator<String> iterator() {
379        return spliterator(sequence);
380      }
381      @Override public String toString() {
382        return Joiner.on(", ")
383            .appendTo(new StringBuilder().append('['), this)
384            .append(']')
385            .toString();
386      }
387    };
388  }
389
390  private Iterator<String> spliterator(CharSequence sequence) {
391    return strategy.iterator(this, sequence);
392  }
393
394  /**
395   * Returns a {@code MapSplitter} which splits entries based on this splitter,
396   * and splits entries into keys and values using the specified separator.
397   *
398   * @since 10.0
399   */
400  @CheckReturnValue
401  @Beta
402  public MapSplitter withKeyValueSeparator(String separator) {
403    return withKeyValueSeparator(on(separator));
404  }
405
406  /**
407   * Returns a {@code MapSplitter} which splits entries based on this splitter,
408   * and splits entries into keys and values using the specified key-value
409   * splitter.
410   *
411   * @since 10.0
412   */
413  @CheckReturnValue
414  @Beta
415  public MapSplitter withKeyValueSeparator(Splitter keyValueSplitter) {
416    return new MapSplitter(this, keyValueSplitter);
417  }
418
419  /**
420   * An object that splits strings into maps as {@code Splitter} splits
421   * iterables and lists. Like {@code Splitter}, it is thread-safe and
422   * immutable.
423   *
424   * @since 10.0
425   */
426  @Beta
427  public static final class MapSplitter {
428    private static final String INVALID_ENTRY_MESSAGE =
429        "Chunk [%s] is not a valid entry";
430    private final Splitter outerSplitter;
431    private final Splitter entrySplitter;
432
433    private MapSplitter(Splitter outerSplitter, Splitter entrySplitter) {
434      this.outerSplitter = outerSplitter; // only "this" is passed
435      this.entrySplitter = checkNotNull(entrySplitter);
436    }
437
438    /**
439     * Splits {@code sequence} into substrings, splits each substring into
440     * an entry, and returns an unmodifiable map with each of the entries. For
441     * example, <code>
442     * Splitter.on(';').trimResults().withKeyValueSeparator("=>")
443     * .split("a=>b ; c=>b")
444     * </code> will return a mapping from {@code "a"} to {@code "b"} and
445     * {@code "c"} to {@code b}.
446     *
447     * <p>The returned map preserves the order of the entries from
448     * {@code sequence}.
449     *
450     * @throws IllegalArgumentException if the specified sequence does not split
451     *         into valid map entries, or if there are duplicate keys
452     */
453    public Map<String, String> split(CharSequence sequence) {
454      Map<String, String> map = new LinkedHashMap<String, String>();
455      for (String entry : outerSplitter.split(sequence)) {
456        Iterator<String> entryFields = entrySplitter.spliterator(entry);
457
458        checkArgument(entryFields.hasNext(), INVALID_ENTRY_MESSAGE, entry);
459        String key = entryFields.next();
460        checkArgument(!map.containsKey(key), "Duplicate key [%s] found.", key);
461
462        checkArgument(entryFields.hasNext(), INVALID_ENTRY_MESSAGE, entry);
463        String value = entryFields.next();
464        map.put(key, value);
465
466        checkArgument(!entryFields.hasNext(), INVALID_ENTRY_MESSAGE, entry);
467      }
468      return Collections.unmodifiableMap(map);
469    }
470  }
471
472  private interface Strategy {
473    Iterator<String> iterator(Splitter splitter, CharSequence toSplit);
474  }
475
476  private abstract static class SplittingIterator extends AbstractIterator<String> {
477    final CharSequence toSplit;
478    final CharMatcher trimmer;
479    final boolean omitEmptyStrings;
480
481    /**
482     * Returns the first index in {@code toSplit} at or after {@code start}
483     * that contains the separator.
484     */
485    abstract int separatorStart(int start);
486
487    /**
488     * Returns the first index in {@code toSplit} after {@code
489     * separatorPosition} that does not contain a separator. This method is only
490     * invoked after a call to {@code separatorStart}.
491     */
492    abstract int separatorEnd(int separatorPosition);
493
494    int offset = 0;
495    int limit;
496
497    protected SplittingIterator(Splitter splitter, CharSequence toSplit) {
498      this.trimmer = splitter.trimmer;
499      this.omitEmptyStrings = splitter.omitEmptyStrings;
500      this.limit = splitter.limit;
501      this.toSplit = toSplit;
502    }
503
504    @Override protected String computeNext() {
505      /*
506       * The returned string will be from the end of the last match to the
507       * beginning of the next one. nextStart is the start position of the
508       * returned substring, while offset is the place to start looking for a
509       * separator.
510       */
511      int nextStart = offset;
512      while (offset != -1) {
513        int start = nextStart;
514        int end;
515
516        int separatorPosition = separatorStart(offset);
517        if (separatorPosition == -1) {
518          end = toSplit.length();
519          offset = -1;
520        } else {
521          end = separatorPosition;
522          offset = separatorEnd(separatorPosition);
523        }
524        if (offset == nextStart) {
525          /*
526           * This occurs when some pattern has an empty match, even if it
527           * doesn't match the empty string -- for example, if it requires
528           * lookahead or the like. The offset must be increased to look for
529           * separators beyond this point, without changing the start position
530           * of the next returned substring -- so nextStart stays the same.
531           */
532          offset++;
533          if (offset >= toSplit.length()) {
534            offset = -1;
535          }
536          continue;
537        }
538
539        while (start < end && trimmer.matches(toSplit.charAt(start))) {
540          start++;
541        }
542        while (end > start && trimmer.matches(toSplit.charAt(end - 1))) {
543          end--;
544        }
545
546        if (omitEmptyStrings && start == end) {
547          // Don't include the (unused) separator in next split string.
548          nextStart = offset;
549          continue;
550        }
551
552        if (limit == 1) {
553          // The limit has been reached, return the rest of the string as the
554          // final item.  This is tested after empty string removal so that
555          // empty strings do not count towards the limit.
556          end = toSplit.length();
557          offset = -1;
558          // Since we may have changed the end, we need to trim it again.
559          while (end > start && trimmer.matches(toSplit.charAt(end - 1))) {
560            end--;
561          }
562        } else {
563          limit--;
564        }
565
566        return toSplit.subSequence(start, end).toString();
567      }
568      return endOfData();
569    }
570  }
571}