View Javadoc

/*
 * Copyright (C) 2003-2006 Christian Siefkes <christian@siefkes.net>.
 * Development of this software is supported by the German Research Society,
 * Berlin-Brandenburg Graduate School in Distributed Information Systems
 * (DFG grant no. GRK 316).
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, visit
 * http://www.gnu.org/licenses/gpl.html or write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 */
22  package de.fu_berlin.ties.text;
23  
24  import java.io.IOException;
25  import java.io.Writer;
26  import java.util.Iterator;
27  import java.util.Map;
28  import java.util.regex.Matcher;
29  import java.util.regex.Pattern;
30  
31  import org.apache.commons.lang.StringUtils;
32  
/**
 * A static class that provides utility constants and methods for working with
 * texts and regular expressions.
 * No instances of this class can be created, only the static members
 * should be used.
 *
 * @author Christian Siefkes
 * @version $Revision: 1.17 $, $Date: 2006/10/21 16:04:25 $, $Author: siefkes $
 */
public final class TextUtils {

    /**
     * The line separator on the current operating system ("\n" on Unix),
     * read from the <code>line.separator</code> system property.
     */
    public static final String LINE_SEPARATOR =
        System.getProperty("line.separator");

    /**
     * Regex fragment listing the newline alternatives used by different
     * systems: "\r\n" (Windows), "\n" (Unix) or "\r" (Mac). The two-character
     * alternative comes first so that "\r\n" is matched as a single newline.
     */
    public static final String NEWLINE_ALTERNATIVES = "\\r\\n|\\n|\\r";

    /**
     * A regular expression matching a non-line-breaking whitespace character
     * (character class containing space and tab).
     */
    public static final Pattern SINGLE_LINE_WS =
        Pattern.compile("[ \\t]");

    /**
     * A regular expression matching a single newline (built by enclosing
     * {@link #NEWLINE_ALTERNATIVES} in a non-capturing group).
     */
    public static final Pattern NEWLINE_PATTERN =
        Pattern.compile("(?:" + NEWLINE_ALTERNATIVES + ")");

    /**
     * A regular expression matching a single newline or a tab character.
     */
    public static final Pattern NEWLINE_TAB_PATTERN =
        Pattern.compile("(?:" + NEWLINE_ALTERNATIVES + "|\\t)");

    /**
     * A regular expression matching newlines, including surrounding whitespace.
     * Will match several newlines if they immediately follow each other or
     * are separated by whitespace only.
     */
    public static final Pattern NEWLINES_PATTERN =
        Pattern.compile("\\s*(?:[\\r\\n]+\\s*)+");

    /**
     * A simple regular expression for strings that contain only
     * punctuation characters.
     */
    public static final Pattern PUNCTUATION_PATTERN =
        Pattern.compile("\\p{P}+");

    /**
     * A simple regular expression for strings that contain only
     * punctuation and symbol characters.
     */
    public static final Pattern PUNCTUATION_SYMBOL_PATTERN =
        Pattern.compile("[\\p{P}\\p{S}]+");

    /**
     * A simple regular expression for whitespace (one or more whitespace
     * characters).
     */
    public static final Pattern WHITESPACE_PATTERN =
        Pattern.compile("\\s+");

    /**
     * Helper method for building a regular expression {@link Pattern} by
     * combining several alternatives in a capturing group.
     *
     * @param alternatives the alternatives to combine
     * @param quote whether to quote the alternatives using the
     * {@link Pattern#quote(java.lang.String)} method
     * @return a pattern string containing the joined alternatives within a
     * capturing group; an empty array yields the empty group
     * <code>"()"</code>
     */
    public static String captureAlternatives(final String[] alternatives,
            final boolean quote) {
        // open capturing group
        final StringBuilder result = new StringBuilder("(");

        for (int i = 0; i < alternatives.length; i++) {
            if (i > 0) {
                result.append('|');   // separate alternatives
            }
            // quote alternatives if required
            result.append(quote
                    ? Pattern.quote(alternatives[i]) : alternatives[i]);
        }

        // always close the group, even if there were no alternatives, so
        // the returned fragment is never an unbalanced "("
        return result.append(')').toString();
    }

    /**
     * Counts how often a character is repeated at the begin of a string.
     *
     * @param str the string to check
     * @param ch the character to count
     * @return how often the character is repeated at the begin of the string
     * (0 if the string starts with another character or is empty)
     */
    public static int countFirst(final String str, final char ch) {
        final int length = str.length();
        int count = 0;

        // advance until the first non-match or the end of the string
        while ((count < length) && (str.charAt(count) == ch)) {
            count++;
        }
        return count;
    }

    /**
     * Counts how often a character is repeated at the end of a string.
     *
     * @param str the string to check
     * @param ch the character to count
     * @return how often the character is repeated at the end of the string
     * (0 if the string ends with another character or is empty)
     */
    public static int countLast(final String str, final char ch) {
        final int length = str.length();
        int count = 0;

        // walk backwards until the last non-match or the start of the string
        while ((count < length) && (str.charAt(length - count - 1) == ch)) {
            count++;
        }
        return count;
    }

    /**
     * Checks that a string is a printable name, meaning it has at least
     * one character and does not contain any whitespace.
     *
     * @param string the string to check
     * @throws IllegalArgumentException if the given string is null or empty
     * or contains whitespace
     */
    public static void ensurePrintableName(final String string)
    throws IllegalArgumentException {
        if ((string == null) || (string.length() == 0)) {
            throw new IllegalArgumentException("Name is null or empty: "
                    + string);
        }
        if (WHITESPACE_PATTERN.matcher(string).find()) {
            throw new IllegalArgumentException("Name contains whitespace: '"
                    + string + "'");
        }
    }

    /**
     * Helper method for building a regular expression {@link Pattern} by
     * combining several alternatives. The alternatives are not quoted.
     *
     * @param alternatives the alternatives to combine
     * @return a pattern string containing the joined alternatives; two or
     * more alternatives are combined in a non-capturing group; a single
     * alternative is just returned as is; if the array is empty, an empty
     * string is returned
     */
    public static String joinAlternatives(final String[] alternatives) {
        final int length = alternatives.length;

        if (length == 0) {
            // nothing to join
            return "";
        }
        if (length == 1) {
            // just return the single element
            return alternatives[0];
        }

        // open non-capturing group
        final StringBuilder result = new StringBuilder("(?:");
        for (int i = 0; i < length; i++) {
            if (i > 0) {
                result.append('|');   // separate alternatives
            }
            result.append(alternatives[i]);
        }
        return result.append(')').toString();   // close group
    }

    /**
     * Performs multiple replace-all operations on a text. The replacements
     * are performed in the order of the key-set iterator of the given map;
     * each replacement operates on the result of the previous one.
     *
     * @param input the character sequence to perform the replacements on
     * @param replacements a mapping of regular expression
     * {@link java.util.regex.Pattern}s to replacement {@link java.lang.String}s
     * @return the string constructed by performing all replacements
     * @throws ClassCastException if a key is not a {@link Pattern} or a value
     * is not a {@link String}
     */
    public static String multipleReplaceAll(final CharSequence input,
            final Map<?, ?> replacements) {
        String text = input.toString();

        for (final Object key : replacements.keySet()) {
            final Pattern currentPattern = (Pattern) key;
            final String currentReplace = (String) replacements.get(key);
            text = replaceAll(text, currentPattern, currentReplace);
        }
        return text;
    }

    /**
     * Normalizes the whitespace in a string, replacing all internal whitespace
     * sequences with a single space character and trimming any leading and
     * trailing whitespace.
     *
     * @param input the string to normalize
     * @return the normalized string
     */
    public static String normalize(final String input) {
        // replace whitespace by space char, then trim the result
        return replaceAll(input, WHITESPACE_PATTERN, " ").trim();
    }

    /**
     * Helper method that handles the actual replacement process.
     *
     * @param input the string to process
     * @param matcher a matcher on the pattern
     * @param replacement the replacement string
     * @param doReset whether to reset this matcher to the <code>input</code>
     * before searching
     * @return the resulting string; or a reference to the <code>input</code>
     * string if no replacements were made
     */
    private static String replaceAll(final String input, final Matcher matcher,
            final String replacement, final boolean doReset) {
        if (doReset) {
            matcher.reset(input);
        }

        if (!matcher.find()) {
            // nothing to replace: hand back the very same object
            return input;
        }

        // Matcher.appendReplacement is used with a StringBuffer here
        final StringBuffer result = new StringBuffer();
        do {
            matcher.appendReplacement(result, replacement);
        } while (matcher.find());

        matcher.appendTail(result);
        return result.toString();
    }

    /**
     * Replaces each substring of the <code>input</code> matched by the
     * given {@linkplain Pattern pattern} matcher with the given
     * replacement. See {@link Matcher#replaceAll(java.lang.String)} for details
     * of the replacement process and special characters in the
     * <code>replacement</code> string.
     *
     * <p>This method only returns a new string if there is at least one match
     * to replace. Otherwise the reference to the <code>input</code> object is
     * returned. Thus you can use the <code>==</code> operator to find out
     * whether replacements have been made, it is not necessary to use
     * {@link String#equals(java.lang.Object)}. When there is nothing to
     * replace, it might be more efficient than
     * {@link Matcher#replaceAll(java.lang.String)} (and certainly than
     * {@link String#replaceAll(java.lang.String, java.lang.String)}), because
     * (as of JDK 1.4.2) these methods always create and return new objects.
     *
     * <p>Matchers are stateful and not thread-safe. It is not necessary to
     * {@link Matcher#reset()} the matcher prior to calling this method (it is
     * reset to the <code>input</code> here) but you should reset it if you
     * want to use it in other matching operations afterwards.
     *
     * @param input the string to process
     * @param matcher a matcher on the pattern
     * @param replacement the replacement string
     * @return the resulting string; or a reference to the <code>input</code>
     * string if no replacements were made
     */
    public static String replaceAll(final String input, final Matcher matcher,
            final String replacement) {
        return replaceAll(input, matcher, replacement, true);
    }

    /**
     * Shortens a string, inserting an ellipsis ("...") in the middle if the
     * string is too long. Specifically:
     *
     * <ul>
     * <li>If the length of the <code>input</code> String isn't larger than
     * <code>startChars + endChars + 3</code>, return it.
     * <li>Otherwise return the first <code>startChars</code> characters of the
     * <code>input</code> String, followed by "..." (an ellipsis) and the last
     * <code>endChars</code> characters of the String
     * </ul>
     *
     * <p>This method is similar to
     * {@link org.apache.commons.lang.StringUtils#abbreviate(String, int)},
     * but the ellipsis is inserted in the middle of the string, not at the
     * end.
     *
     * @param input the input string
     * @param startChars the number of characters to include before the ellipsis
     * @param endChars the number of characters to include after the ellipsis
     * @return a shortened string, as described above
     */
    public static String shorten(final String input, final int startChars,
            final int endChars) {
        final int length = input.length();

        // sum in long arithmetic: very large arguments must not overflow
        // into a small or negative limit
        final long limit = (long) startChars + endChars + 3;

        if (length <= limit) {
            // string is short enough -- return as is
            return input;
        }

        // first <startChars> characters + ellipsis + last <endChars>
        // characters
        return input.substring(0, startChars) + "..."
            + input.substring(length - endChars);
    }

    /**
     * Delegates to {@link #shorten(String, int, int)}, using the same number
     * of characters at the start and the end of the shortened string.
     *
     * @param input the input string
     * @param numChars the number of characters to use for
     * both <code>startChars</code> and <code>endChars</code> parameter
     * @return the shortened string
     */
    public static String shorten(final String input, final int numChars) {
        return shorten(input, numChars, numChars);
    }

    /**
     * Checks whether a string contains only punctuation characters.
     *
     * @param text the text to check
     * @return <code>true</code> iff the text contains one or more
     * punctuation characters and no other characters
     */
    public static boolean punctuation(final CharSequence text) {
        return PUNCTUATION_PATTERN.matcher(text).matches();
    }

    /**
     * Checks whether a string contains only punctuation and symbol characters.
     *
     * @param text the text to check
     * @return <code>true</code> iff the text contains one or more
     * punctuation or symbol characters and no other characters
     */
    public static boolean punctuationOrSymbol(final CharSequence text) {
        return PUNCTUATION_SYMBOL_PATTERN.matcher(text).matches();
    }

    /**
     * Delegates to {@link #shorten(String, int, int)}, showing up to
     * <strong>24</strong> characters at the start and the end of the shortened
     * string.
     *
     * @param input the input string
     * @return the shortened string
     */
    public static String shorten(final String input) {
        return shorten(input, 24);
    }

    /**
     * Replaces each substring of the <code>input</code> that matches the given
     * {@link Pattern} with the given replacement. See
     * {@link Matcher#replaceAll(java.lang.String)} for details of the
     * replacement process and special characters in the
     * <code>replacement</code> string.
     *
     * <p>This method only returns a new string if there is at least one match
     * to replace. Otherwise the reference to the <code>input</code> object is
     * returned. Thus you can use the <code>==</code> operator to find out
     * whether replacements have been made, it is not necessary to use
     * {@link String#equals(java.lang.Object)}.
     *
     * <p>This method is thread-safe since pattern objects are stateless. On the
     * other hand, it needs to create a new {@link Matcher} object, thus
     * {@link #replaceAll(String, Matcher, String)} is more efficient for
     * multiple replacements on the same pattern.
     *
     * @param input the string to process
     * @param pattern the regular expression {@link Pattern} to replace
     * @param replacement the replacement string
     * @return the resulting string; or a reference to the <code>input</code>
     * string if no replacements were made
     */
    public static String replaceAll(final String input, final Pattern pattern,
            final String replacement) {
        // no need to reset a newly created matcher
        return replaceAll(input, pattern.matcher(input), replacement, false);
    }

    /**
     * Splits a text into an array of lines. Only the textual contents of
     * non-empty lines are retained; empty lines and trailing and leading
     * whitespace are removed.
     *
     * <p>The trimmed text is split with {@link Pattern#split(CharSequence)}
     * around {@link #NEWLINES_PATTERN}; if the trimmed text contains no
     * newline at all (including the case that it is empty), the result is a
     * one-element array holding the trimmed text.
     *
     * @param input the text to split
     * @return an array of the lines contained in the text; each line is trimmed
     * (trailing and leading whitespace is removed) and empty lines are
     * suppressed
     */
    public static String[] splitLines(final CharSequence input) {
        // remove whitespace at the very beginning and end
        final String trimmed = input.toString().trim();
        return NEWLINES_PATTERN.split(trimmed);
    }

    /**
     * Splits a text into an array of lines around {@link #NEWLINE_PATTERN},
     * without trimming lines and without discarding empty lines inside the
     * text.
     *
     * @param input the text to split
     * @return an array of the lines contained in the text
     */
    public static String[] splitLinesExact(final CharSequence input) {
        return NEWLINE_PATTERN.split(input);
    }

    /**
     * Splits a string around whitespace, keeping all subsequences.
     *
     * @param input the string to split
     * @return an array of strings computed by splitting the input
     */
    public static String[] splitString(final CharSequence input) {
        return splitString(input, -1);
    }

    /**
     * Splits a string around whitespace. The number of returned subsequences
     * won't be higher than the specified <code>splitMaximum</code>.
     * If splitting results in more subsequences, only the <em>last</em>
     * <code>splitMaximum</code> are kept, while the other ones are discarded.
     * This implementation splits around the {@link #WHITESPACE_PATTERN}.
     *
     * @param input the string to split
     * @param splitMaximum the maximum number of subsequences to keep;
     * or <code>-1</code> if all subsequences should be kept
     * @return an array of strings computed by splitting the input; will
     * contain at most <code>splitMaximum</code> elements if
     * <code>splitMaximum</code> is not negative
     */
    public static String[] splitString(final CharSequence input,
            final int splitMaximum) {
        return splitString(input, WHITESPACE_PATTERN, splitMaximum);
    }

    /**
     * Splits a string around the given pattern. The number of returned
     * subsequences won't be higher than the specified
     * <code>splitMaximum</code>.
     * If splitting results in more subsequences, only the <em>last</em>
     * <code>splitMaximum</code> are kept, while the other ones are discarded.
     *
     * @param input the string to split
     * @param whitespacePattern the pattern around which to split
     * @param splitMaximum the maximum number of subsequences to keep;
     * any negative value (such as <code>-1</code>) keeps all subsequences
     * @return an array of strings computed by splitting the input; will
     * contain at most <code>splitMaximum</code> elements if
     * <code>splitMaximum</code> is not negative
     */
    public static String[] splitString(final CharSequence input,
            final Pattern whitespacePattern, final int splitMaximum) {
        final String[] allSplits = whitespacePattern.split(input);

        if ((splitMaximum < 0) || (allSplits.length <= splitMaximum)) {
            // no limit given or limit not exceeded: keep all splits
            return allSplits;
        }

        // keep only the last splitMaximum splits
        final String[] results = new String[splitMaximum];
        System.arraycopy(allSplits, allSplits.length - splitMaximum,
                results, 0, splitMaximum);
        return results;
    }

    /**
     * Weakly normalizes the whitespace in a string, by replacing each
     * newline and each tab character with a space character (space characters
     * are left as they are).
     * This is in accordance with the attribute-value normalization required
     * by the XML specification.
     *
     * @param input the string to normalize
     * @return the weakly normalized string; or a reference to the
     * <code>input</code> string if it contains neither newlines nor tabs
     */
    public static String weaklyNormalize(final String input) {
        // replace newlines and tabs by space characters
        return replaceAll(input, NEWLINE_TAB_PATTERN, " ");
    }

    /**
     * Convenience method that writes a text to a writer and appends the
     * {@linkplain #LINE_SEPARATOR line separator}.
     *
     * @param writer the writer to write to
     * @param text the text to write
     * @throws IOException if an I/O error occurs
     */
    public static void writeln(final Writer writer, final String text)
    throws IOException {
        writer.write(text);
        writer.write(LINE_SEPARATOR);
    }

    /**
     * Private constructor prevents creation of instances.
     */
    private TextUtils() {
        super();
    }

}
581 
582 }