001/*
002 * Copyright (C) 2009 The Guava Authors
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 *
008 * http://www.apache.org/licenses/LICENSE-2.0
009 *
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 */
016
017package com.google.common.base;
018
019import static com.google.common.base.Preconditions.checkArgument;
020import static com.google.common.base.Preconditions.checkNotNull;
021
022import com.google.common.annotations.Beta;
023import com.google.common.annotations.GwtCompatible;
024import com.google.common.annotations.GwtIncompatible;
025
026import java.util.ArrayList;
027import java.util.Collections;
028import java.util.Iterator;
029import java.util.LinkedHashMap;
030import java.util.List;
031import java.util.Map;
032import java.util.regex.Matcher;
033import java.util.regex.Pattern;
034
035import javax.annotation.CheckReturnValue;
036
037/**
038 * Extracts non-overlapping substrings from an input string, typically by
039 * recognizing appearances of a <i>separator</i> sequence. This separator can be
040 * specified as a single {@linkplain #on(char) character}, fixed {@linkplain
041 * #on(String) string}, {@linkplain #onPattern regular expression} or {@link
042 * #on(CharMatcher) CharMatcher} instance. Or, instead of using a separator at
043 * all, a splitter can extract adjacent substrings of a given {@linkplain
044 * #fixedLength fixed length}.
045 *
046 * <p>For example, this expression: <pre>   {@code
047 *
048 *   Splitter.on(',').split("foo,bar,qux")}</pre>
049 *
050 * ... produces an {@code Iterable} containing {@code "foo"}, {@code "bar"} and
051 * {@code "qux"}, in that order.
052 *
053 * <p>By default, {@code Splitter}'s behavior is simplistic and unassuming. The
054 * following expression: <pre>   {@code
055 *
056 *   Splitter.on(',').split(" foo,,,  bar ,")}</pre>
057 *
058 * ... yields the substrings {@code [" foo", "", "", "  bar ", ""]}. If this
059 * is not the desired behavior, use configuration methods to obtain a <i>new</i>
060 * splitter instance with modified behavior: <pre>   {@code
061 *
062 *   private static final Splitter MY_SPLITTER = Splitter.on(',')
063 *       .trimResults()
064 *       .omitEmptyStrings();}</pre>
065 *
066 * <p>Now {@code MY_SPLITTER.split("foo,,,  bar ,")} returns just {@code ["foo",
067 * "bar"]}. Note that the order in which these configuration methods are called
068 * is never significant.
069 *
070 * <p><b>Warning:</b> Splitter instances are immutable. Invoking a configuration
071 * method has no effect on the receiving instance; you must store and use the
072 * new splitter instance it returns instead. <pre>   {@code
073 *
074 *   // Do NOT do this
075 *   Splitter splitter = Splitter.on('/');
076 *   splitter.trimResults(); // does nothing!
077 *   return splitter.split("wrong / wrong / wrong");}</pre>
078 *
079 * <p>For separator-based splitters that do not use {@code omitEmptyStrings}, an
080 * input string containing {@code n} occurrences of the separator naturally
081 * yields an iterable of size {@code n + 1}. So if the separator does not occur
082 * anywhere in the input, a single substring is returned containing the entire
083 * input. Consequently, all splitters split the empty string to {@code [""]}
084 * (note: even fixed-length splitters).
085 *
086 * <p>Splitter instances are thread-safe immutable, and are therefore safe to
087 * store as {@code static final} constants.
088 *
089 * <p>The {@link Joiner} class provides the inverse operation to splitting, but
090 * note that a round-trip between the two should be assumed to be lossy.
091 *
092 * <p>See the Guava User Guide article on <a href=
093 * "https://github.com/google/guava/wiki/StringsExplained#splitter">
094 * {@code Splitter}</a>.
095 *
096 * @author Julien Silland
097 * @author Jesse Wilson
098 * @author Kevin Bourrillion
099 * @author Louis Wasserman
100 * @since 1.0
101 */
102@GwtCompatible(emulated = true)
103public final class Splitter {
104  private final CharMatcher trimmer;
105  private final boolean omitEmptyStrings;
106  private final Strategy strategy;
107  private final int limit;
108
109  private Splitter(Strategy strategy) {
110    this(strategy, false, CharMatcher.NONE, Integer.MAX_VALUE);
111  }
112
113  private Splitter(Strategy strategy, boolean omitEmptyStrings, CharMatcher trimmer, int limit) {
114    this.strategy = strategy;
115    this.omitEmptyStrings = omitEmptyStrings;
116    this.trimmer = trimmer;
117    this.limit = limit;
118  }
119
120  /**
121   * Returns a splitter that uses the given single-character separator. For
122   * example, {@code Splitter.on(',').split("foo,,bar")} returns an iterable
123   * containing {@code ["foo", "", "bar"]}.
124   *
125   * @param separator the character to recognize as a separator
126   * @return a splitter, with default settings, that recognizes that separator
127   */
128  @CheckReturnValue
129  public static Splitter on(char separator) {
130    return on(CharMatcher.is(separator));
131  }
132
133  /**
134   * Returns a splitter that considers any single character matched by the
135   * given {@code CharMatcher} to be a separator. For example, {@code
136   * Splitter.on(CharMatcher.anyOf(";,")).split("foo,;bar,quux")} returns an
137   * iterable containing {@code ["foo", "", "bar", "quux"]}.
138   *
139   * @param separatorMatcher a {@link CharMatcher} that determines whether a
140   *     character is a separator
141   * @return a splitter, with default settings, that uses this matcher
142   */
143  @CheckReturnValue
144  public static Splitter on(final CharMatcher separatorMatcher) {
145    checkNotNull(separatorMatcher);
146
147    return new Splitter(
148        new Strategy() {
149          @Override
150          public SplittingIterator iterator(Splitter splitter, final CharSequence toSplit) {
151            return new SplittingIterator(splitter, toSplit) {
152              @Override
153              int separatorStart(int start) {
154                return separatorMatcher.indexIn(toSplit, start);
155              }
156
157              @Override
158              int separatorEnd(int separatorPosition) {
159                return separatorPosition + 1;
160              }
161            };
162          }
163        });
164  }
165
166  /**
167   * Returns a splitter that uses the given fixed string as a separator. For
168   * example, {@code Splitter.on(", ").split("foo, bar,baz")} returns an
169   * iterable containing {@code ["foo", "bar,baz"]}.
170   *
171   * @param separator the literal, nonempty string to recognize as a separator
172   * @return a splitter, with default settings, that recognizes that separator
173   */
174  @CheckReturnValue
175  public static Splitter on(final String separator) {
176    checkArgument(separator.length() != 0, "The separator may not be the empty string.");
177    if (separator.length() == 1) {
178      return Splitter.on(separator.charAt(0));
179    }
180    return new Splitter(
181        new Strategy() {
182          @Override
183          public SplittingIterator iterator(Splitter splitter, CharSequence toSplit) {
184            return new SplittingIterator(splitter, toSplit) {
185              @Override
186              public int separatorStart(int start) {
187                int separatorLength = separator.length();
188
189                positions:
190                for (int p = start, last = toSplit.length() - separatorLength; p <= last; p++) {
191                  for (int i = 0; i < separatorLength; i++) {
192                    if (toSplit.charAt(i + p) != separator.charAt(i)) {
193                      continue positions;
194                    }
195                  }
196                  return p;
197                }
198                return -1;
199              }
200
201              @Override
202              public int separatorEnd(int separatorPosition) {
203                return separatorPosition + separator.length();
204              }
205            };
206          }
207        });
208  }
209
210  /**
211   * Returns a splitter that considers any subsequence matching {@code
212   * pattern} to be a separator. For example, {@code
213   * Splitter.on(Pattern.compile("\r?\n")).split(entireFile)} splits a string
214   * into lines whether it uses DOS-style or UNIX-style line terminators.
215   *
216   * @param separatorPattern the pattern that determines whether a subsequence
217   *     is a separator. This pattern may not match the empty string.
218   * @return a splitter, with default settings, that uses this pattern
219   * @throws IllegalArgumentException if {@code separatorPattern} matches the
220   *     empty string
221   */
222  @CheckReturnValue
223  @GwtIncompatible("java.util.regex")
224  public static Splitter on(final Pattern separatorPattern) {
225    checkNotNull(separatorPattern);
226    checkArgument(
227        !separatorPattern.matcher("").matches(),
228        "The pattern may not match the empty string: %s",
229        separatorPattern);
230
231    return new Splitter(
232        new Strategy() {
233          @Override
234          public SplittingIterator iterator(final Splitter splitter, CharSequence toSplit) {
235            final Matcher matcher = separatorPattern.matcher(toSplit);
236            return new SplittingIterator(splitter, toSplit) {
237              @Override
238              public int separatorStart(int start) {
239                return matcher.find(start) ? matcher.start() : -1;
240              }
241
242              @Override
243              public int separatorEnd(int separatorPosition) {
244                return matcher.end();
245              }
246            };
247          }
248        });
249  }
250
251  /**
252   * Returns a splitter that considers any subsequence matching a given
253   * pattern (regular expression) to be a separator. For example, {@code
254   * Splitter.onPattern("\r?\n").split(entireFile)} splits a string into lines
255   * whether it uses DOS-style or UNIX-style line terminators. This is
256   * equivalent to {@code Splitter.on(Pattern.compile(pattern))}.
257   *
258   * @param separatorPattern the pattern that determines whether a subsequence
259   *     is a separator. This pattern may not match the empty string.
260   * @return a splitter, with default settings, that uses this pattern
261   * @throws java.util.regex.PatternSyntaxException if {@code separatorPattern}
262   *     is a malformed expression
263   * @throws IllegalArgumentException if {@code separatorPattern} matches the
264   *     empty string
265   */
266  @CheckReturnValue
267  @GwtIncompatible("java.util.regex")
268  public static Splitter onPattern(String separatorPattern) {
269    return on(Pattern.compile(separatorPattern));
270  }
271
272  /**
273   * Returns a splitter that divides strings into pieces of the given length.
274   * For example, {@code Splitter.fixedLength(2).split("abcde")} returns an
275   * iterable containing {@code ["ab", "cd", "e"]}. The last piece can be
276   * smaller than {@code length} but will never be empty.
277   *
278   * <p><b>Exception:</b> for consistency with separator-based splitters, {@code
279   * split("")} does not yield an empty iterable, but an iterable containing
280   * {@code ""}. This is the only case in which {@code
281   * Iterables.size(split(input))} does not equal {@code
282   * IntMath.divide(input.length(), length, CEILING)}. To avoid this behavior,
283   * use {@code omitEmptyStrings}.
284   *
285   * @param length the desired length of pieces after splitting, a positive
286   *     integer
287   * @return a splitter, with default settings, that can split into fixed sized
288   *     pieces
289   * @throws IllegalArgumentException if {@code length} is zero or negative
290   */
291  @CheckReturnValue
292  public static Splitter fixedLength(final int length) {
293    checkArgument(length > 0, "The length may not be less than 1");
294
295    return new Splitter(
296        new Strategy() {
297          @Override
298          public SplittingIterator iterator(final Splitter splitter, CharSequence toSplit) {
299            return new SplittingIterator(splitter, toSplit) {
300              @Override
301              public int separatorStart(int start) {
302                int nextChunkStart = start + length;
303                return (nextChunkStart < toSplit.length() ? nextChunkStart : -1);
304              }
305
306              @Override
307              public int separatorEnd(int separatorPosition) {
308                return separatorPosition;
309              }
310            };
311          }
312        });
313  }
314
315  /**
316   * Returns a splitter that behaves equivalently to {@code this} splitter, but
317   * automatically omits empty strings from the results. For example, {@code
318   * Splitter.on(',').omitEmptyStrings().split(",a,,,b,c,,")} returns an
319   * iterable containing only {@code ["a", "b", "c"]}.
320   *
321   * <p>If either {@code trimResults} option is also specified when creating a
322   * splitter, that splitter always trims results first before checking for
323   * emptiness. So, for example, {@code
324   * Splitter.on(':').omitEmptyStrings().trimResults().split(": : : ")} returns
325   * an empty iterable.
326   *
327   * <p>Note that it is ordinarily not possible for {@link #split(CharSequence)}
328   * to return an empty iterable, but when using this option, it can (if the
329   * input sequence consists of nothing but separators).
330   *
331   * @return a splitter with the desired configuration
332   */
333  @CheckReturnValue
334  public Splitter omitEmptyStrings() {
335    return new Splitter(strategy, true, trimmer, limit);
336  }
337
338  /**
339   * Returns a splitter that behaves equivalently to {@code this} splitter but
340   * stops splitting after it reaches the limit.
341   * The limit defines the maximum number of items returned by the iterator, or
342   * the maximum size of the list returned by {@link #splitToList}.
343   *
344   * <p>For example,
345   * {@code Splitter.on(',').limit(3).split("a,b,c,d")} returns an iterable
346   * containing {@code ["a", "b", "c,d"]}.  When omitting empty strings, the
347   * omitted strings do no count.  Hence,
348   * {@code Splitter.on(',').limit(3).omitEmptyStrings().split("a,,,b,,,c,d")}
349   * returns an iterable containing {@code ["a", "b", "c,d"}.
350   * When trim is requested, all entries, including the last are trimmed.  Hence
351   * {@code Splitter.on(',').limit(3).trimResults().split(" a , b , c , d ")}
352   * results in {@code ["a", "b", "c , d"]}.
353   *
354   * @param limit the maximum number of items returned
355   * @return a splitter with the desired configuration
356   * @since 9.0
357   */
358  @CheckReturnValue
359  public Splitter limit(int limit) {
360    checkArgument(limit > 0, "must be greater than zero: %s", limit);
361    return new Splitter(strategy, omitEmptyStrings, trimmer, limit);
362  }
363
364  /**
365   * Returns a splitter that behaves equivalently to {@code this} splitter, but
366   * automatically removes leading and trailing {@linkplain
367   * CharMatcher#WHITESPACE whitespace} from each returned substring; equivalent
368   * to {@code trimResults(CharMatcher.WHITESPACE)}. For example, {@code
369   * Splitter.on(',').trimResults().split(" a, b ,c ")} returns an iterable
370   * containing {@code ["a", "b", "c"]}.
371   *
372   * @return a splitter with the desired configuration
373   */
374  @CheckReturnValue
375  public Splitter trimResults() {
376    return trimResults(CharMatcher.WHITESPACE);
377  }
378
379  /**
380   * Returns a splitter that behaves equivalently to {@code this} splitter, but
381   * removes all leading or trailing characters matching the given {@code
382   * CharMatcher} from each returned substring. For example, {@code
383   * Splitter.on(',').trimResults(CharMatcher.is('_')).split("_a ,_b_ ,c__")}
384   * returns an iterable containing {@code ["a ", "b_ ", "c"]}.
385   *
386   * @param trimmer a {@link CharMatcher} that determines whether a character
387   *     should be removed from the beginning/end of a subsequence
388   * @return a splitter with the desired configuration
389   */
390  // TODO(kevinb): throw if a trimmer was already specified!
391  @CheckReturnValue
392  public Splitter trimResults(CharMatcher trimmer) {
393    checkNotNull(trimmer);
394    return new Splitter(strategy, omitEmptyStrings, trimmer, limit);
395  }
396
397  /**
398   * Splits {@code sequence} into string components and makes them available
399   * through an {@link Iterator}, which may be lazily evaluated. If you want
400   * an eagerly computed {@link List}, use {@link #splitToList(CharSequence)}.
401   *
402   * @param sequence the sequence of characters to split
403   * @return an iteration over the segments split from the parameter.
404   */
405  @CheckReturnValue
406  public Iterable<String> split(final CharSequence sequence) {
407    checkNotNull(sequence);
408
409    return new Iterable<String>() {
410      @Override
411      public Iterator<String> iterator() {
412        return splittingIterator(sequence);
413      }
414
415      @Override
416      public String toString() {
417        return Joiner.on(", ")
418            .appendTo(new StringBuilder().append('['), this)
419            .append(']')
420            .toString();
421      }
422    };
423  }
424
425  private Iterator<String> splittingIterator(CharSequence sequence) {
426    return strategy.iterator(this, sequence);
427  }
428
429  /**
430   * Splits {@code sequence} into string components and returns them as
431   * an immutable list. If you want an {@link Iterable} which may be lazily
432   * evaluated, use {@link #split(CharSequence)}.
433   *
434   * @param sequence the sequence of characters to split
435   * @return an immutable list of the segments split from the parameter
436   * @since 15.0
437   */
438  @CheckReturnValue
439  @Beta
440  public List<String> splitToList(CharSequence sequence) {
441    checkNotNull(sequence);
442
443    Iterator<String> iterator = splittingIterator(sequence);
444    List<String> result = new ArrayList<String>();
445
446    while (iterator.hasNext()) {
447      result.add(iterator.next());
448    }
449
450    return Collections.unmodifiableList(result);
451  }
452
453  /**
454   * Returns a {@code MapSplitter} which splits entries based on this splitter,
455   * and splits entries into keys and values using the specified separator.
456   *
457   * @since 10.0
458   */
459  @CheckReturnValue
460  @Beta
461  public MapSplitter withKeyValueSeparator(String separator) {
462    return withKeyValueSeparator(on(separator));
463  }
464
465  /**
466   * Returns a {@code MapSplitter} which splits entries based on this splitter,
467   * and splits entries into keys and values using the specified separator.
468   *
469   * @since 14.0
470   */
471  @CheckReturnValue
472  @Beta
473  public MapSplitter withKeyValueSeparator(char separator) {
474    return withKeyValueSeparator(on(separator));
475  }
476
477  /**
478   * Returns a {@code MapSplitter} which splits entries based on this splitter,
479   * and splits entries into keys and values using the specified key-value
480   * splitter.
481   *
482   * @since 10.0
483   */
484  @CheckReturnValue
485  @Beta
486  public MapSplitter withKeyValueSeparator(Splitter keyValueSplitter) {
487    return new MapSplitter(this, keyValueSplitter);
488  }
489
490  /**
491   * An object that splits strings into maps as {@code Splitter} splits
492   * iterables and lists. Like {@code Splitter}, it is thread-safe and
493   * immutable.
494   *
495   * @since 10.0
496   */
497  @Beta
498  public static final class MapSplitter {
499    private static final String INVALID_ENTRY_MESSAGE = "Chunk [%s] is not a valid entry";
500    private final Splitter outerSplitter;
501    private final Splitter entrySplitter;
502
503    private MapSplitter(Splitter outerSplitter, Splitter entrySplitter) {
504      this.outerSplitter = outerSplitter; // only "this" is passed
505      this.entrySplitter = checkNotNull(entrySplitter);
506    }
507
508    /**
509     * Splits {@code sequence} into substrings, splits each substring into
510     * an entry, and returns an unmodifiable map with each of the entries. For
511     * example, <code>
512     * Splitter.on(';').trimResults().withKeyValueSeparator("=>")
513     * .split("a=>b ; c=>b")
514     * </code> will return a mapping from {@code "a"} to {@code "b"} and
515     * {@code "c"} to {@code b}.
516     *
517     * <p>The returned map preserves the order of the entries from
518     * {@code sequence}.
519     *
520     * @throws IllegalArgumentException if the specified sequence does not split
521     *         into valid map entries, or if there are duplicate keys
522     */
523    @CheckReturnValue
524    public Map<String, String> split(CharSequence sequence) {
525      Map<String, String> map = new LinkedHashMap<String, String>();
526      for (String entry : outerSplitter.split(sequence)) {
527        Iterator<String> entryFields = entrySplitter.splittingIterator(entry);
528
529        checkArgument(entryFields.hasNext(), INVALID_ENTRY_MESSAGE, entry);
530        String key = entryFields.next();
531        checkArgument(!map.containsKey(key), "Duplicate key [%s] found.", key);
532
533        checkArgument(entryFields.hasNext(), INVALID_ENTRY_MESSAGE, entry);
534        String value = entryFields.next();
535        map.put(key, value);
536
537        checkArgument(!entryFields.hasNext(), INVALID_ENTRY_MESSAGE, entry);
538      }
539      return Collections.unmodifiableMap(map);
540    }
541  }
542
  /**
   * Produces the iterator that performs the actual splitting. Each factory
   * method of {@code Splitter} supplies its own implementation, which decides
   * how separators are located in the input.
   */
  private interface Strategy {
    /**
     * Returns an iterator over the pieces of {@code toSplit}.
     *
     * @param splitter the splitter whose settings the iterator should apply
     * @param toSplit the input to split
     */
    Iterator<String> iterator(Splitter splitter, CharSequence toSplit);
  }
546
547  private abstract static class SplittingIterator extends AbstractIterator<String> {
548    final CharSequence toSplit;
549    final CharMatcher trimmer;
550    final boolean omitEmptyStrings;
551
552    /**
553     * Returns the first index in {@code toSplit} at or after {@code start}
554     * that contains the separator.
555     */
556    abstract int separatorStart(int start);
557
558    /**
559     * Returns the first index in {@code toSplit} after {@code
560     * separatorPosition} that does not contain a separator. This method is only
561     * invoked after a call to {@code separatorStart}.
562     */
563    abstract int separatorEnd(int separatorPosition);
564
565    int offset = 0;
566    int limit;
567
568    protected SplittingIterator(Splitter splitter, CharSequence toSplit) {
569      this.trimmer = splitter.trimmer;
570      this.omitEmptyStrings = splitter.omitEmptyStrings;
571      this.limit = splitter.limit;
572      this.toSplit = toSplit;
573    }
574
575    @Override
576    protected String computeNext() {
577      /*
578       * The returned string will be from the end of the last match to the
579       * beginning of the next one. nextStart is the start position of the
580       * returned substring, while offset is the place to start looking for a
581       * separator.
582       */
583      int nextStart = offset;
584      while (offset != -1) {
585        int start = nextStart;
586        int end;
587
588        int separatorPosition = separatorStart(offset);
589        if (separatorPosition == -1) {
590          end = toSplit.length();
591          offset = -1;
592        } else {
593          end = separatorPosition;
594          offset = separatorEnd(separatorPosition);
595        }
596        if (offset == nextStart) {
597          /*
598           * This occurs when some pattern has an empty match, even if it
599           * doesn't match the empty string -- for example, if it requires
600           * lookahead or the like. The offset must be increased to look for
601           * separators beyond this point, without changing the start position
602           * of the next returned substring -- so nextStart stays the same.
603           */
604          offset++;
605          if (offset >= toSplit.length()) {
606            offset = -1;
607          }
608          continue;
609        }
610
611        while (start < end && trimmer.matches(toSplit.charAt(start))) {
612          start++;
613        }
614        while (end > start && trimmer.matches(toSplit.charAt(end - 1))) {
615          end--;
616        }
617
618        if (omitEmptyStrings && start == end) {
619          // Don't include the (unused) separator in next split string.
620          nextStart = offset;
621          continue;
622        }
623
624        if (limit == 1) {
625          // The limit has been reached, return the rest of the string as the
626          // final item.  This is tested after empty string removal so that
627          // empty strings do not count towards the limit.
628          end = toSplit.length();
629          offset = -1;
630          // Since we may have changed the end, we need to trim it again.
631          while (end > start && trimmer.matches(toSplit.charAt(end - 1))) {
632            end--;
633          }
634        } else {
635          limit--;
636        }
637
638        return toSplit.subSequence(start, end).toString();
639      }
640      return endOfData();
641    }
642  }
643}