View Javadoc

1   /*
2    * Copyright (C) 2003-2004 Christian Siefkes <christian@siefkes.net>.
3    * Development of this software is supported by the German Research Society,
4    * Berlin-Brandenburg Graduate School in Distributed Information Systems
5    * (DFG grant no. GRK 316).
6    *
7    * This library is free software; you can redistribute it and/or
8    * modify it under the terms of the GNU Lesser General Public
9    * License as published by the Free Software Foundation; either
10   * version 2.1 of the License, or (at your option) any later version.
11   *
12   * This library is distributed in the hope that it will be useful,
13   * but WITHOUT ANY WARRANTY; without even the implied warranty of
14   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15   * Lesser General Public License for more details.
16   *
17   * You should have received a copy of the GNU Lesser General Public
18   * License along with this library; if not, visit
19   * http://www.gnu.org/licenses/lgpl.html or write to the Free Software
20   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
21   */
22  package de.fu_berlin.ties.text;
23  
24  import java.util.Iterator;
25  import java.util.Map;
26  import java.util.regex.Matcher;
27  import java.util.regex.Pattern;
28  
29  import org.apache.commons.lang.StringUtils;
30  
/**
 * A static class that provides utility constants and methods for working with
 * texts and regular expressions.
 * No instances of this class can be created, only the static members
 * should be used.
 *
 * @author Christian Siefkes
 * @version $Revision: 1.10 $, $Date: 2004/12/06 17:59:28 $, $Author: siefkes $
 */
public final class TextUtils {

    /**
     * The line separator on the current operating system ("\n" on Unix),
     * as reported by the <code>line.separator</code> system property.
     */
    public static final String LINE_SEPARATOR =
        System.getProperty("line.separator");

    /**
     * Regex fragment listing the newline alternatives used by different
     * systems: "\r\n" (Windows), "\n" (Unix) or "\r" (Mac). The two-character
     * alternative is listed first so that a Windows line break is matched as
     * a single newline instead of two.
     */
    public static final String NEWLINE_ALTERNATIVES = "\\r\\n|\\n|\\r";

    /**
     * A regular expression matching a non-line-breaking whitespace character
     * (character class containing space and tab).
     */
    public static final Pattern SINGLE_LINE_WS =
        Pattern.compile("[ \\t]");

    /**
     * A regular expression matching a single newline (built by enclosing
     * {@link #NEWLINE_ALTERNATIVES} in a non-capturing group).
     */
    public static final Pattern NEWLINE_PATTERN =
        Pattern.compile("(?:" + NEWLINE_ALTERNATIVES + ")");

    /**
     * A regular expression matching newlines, including surrounding whitespace.
     * Will match several newlines if they immediately follow each other or
     * are separated by whitespace only.
     */
    public static final Pattern NEWLINES_PATTERN =
        Pattern.compile("\\s*(?:[\\r\\n]+\\s*)+");

    /**
     * A simple regular expression for strings that contain only
     * punctuation characters (Unicode category <code>P</code>).
     */
    public static final Pattern PUNCTUATION_PATTERN =
        Pattern.compile("\\p{P}+");

    /**
     * A simple regular expression for strings that contain only
     * punctuation and symbol characters (Unicode categories <code>P</code>
     * and <code>S</code>).
     */
    public static final Pattern PUNCTUATION_SYMBOL_PATTERN =
        Pattern.compile("[\\p{P}\\p{S}]+");

    /**
     * A simple regular expression for whitespace (one or more whitespace
     * characters).
     */
    public static final Pattern WHITESPACE_PATTERN =
        Pattern.compile("\\s+");

    /**
     * Counts how often a character is repeated at the begin of a string.
     *
     * @param str the string to check
     * @param ch the character to count
     * @return how often the character is repeated at the begin of the string
     * (0 if the string starts with another character or is empty)
     */
    public static int countFirst(final String str, final char ch) {
        final int length = str.length();
        int count = 0;

        // advance while the leading run of <ch> continues
        while ((count < length) && (str.charAt(count) == ch)) {
            count++;
        }
        return count;
    }

    /**
     * Counts how often a character is repeated at the end of a string.
     *
     * @param str the string to check
     * @param ch the character to count
     * @return how often the character is repeated at the end of the string
     * (0 if the string ends with another character or is empty)
     */
    public static int countLast(final String str, final char ch) {
        final int length = str.length();
        int count = 0;

        // walk backwards from the last character while it still matches
        while ((count < length) && (str.charAt(length - count - 1) == ch)) {
            count++;
        }
        return count;
    }

    /**
     * Checks that a string is a printable name, meaning it has at least
     * one character and does not contain any whitespace.
     *
     * @param string the string to check
     * @throws IllegalArgumentException if the given string is null or empty
     * or contains whitespace
     */
    public static void ensurePrintableName(final String string)
    throws IllegalArgumentException {
        // explicit null/empty check (same semantics as StringUtils.isEmpty)
        if ((string == null) || (string.length() == 0)) {
            throw new IllegalArgumentException("Name is null or empty: "
                    + string);
        }
        if (WHITESPACE_PATTERN.matcher(string).find()) {
            throw new IllegalArgumentException("Name contains whitespace: '"
                    + string + "'");
        }
    }

    /**
     * Helper method for building a regular expression {@link Pattern} by
     * combining several alternatives. The alternatives are inserted verbatim,
     * they are not quoted or otherwise escaped.
     *
     * @param alternatives the alternatives to combine
     * @return a pattern string containing the joined alternatives; two or
     * more alternatives are combined in a non-capturing group; a single
     * alternative is just returned as is; if the array is empty, an empty
     * string is returned
     */
    public static String joinAlternatives(final String[] alternatives) {
        final int length = alternatives.length;

        if (length == 0) {
            // nothing to join
            return "";
        }
        if (length == 1) {
            // a single alternative needs no group
            return alternatives[0];
        }

        // two or more: wrap in a non-capturing group, separated by "|"
        final StringBuffer result = new StringBuffer("(?:");
        result.append(alternatives[0]);
        for (int i = 1; i < length; i++) {
            result.append("|");
            result.append(alternatives[i]);
        }
        result.append(")");
        return result.toString();
    }

    /**
     * Performs multiple replace-all operations on a text. The replacements
     * are performed one after the other in the iteration order of the given
     * map, each one working on the result of the previous one. Use a map
     * with a predictable iteration order if the order matters.
     *
     * @param input the character sequence to perform the replacements on
     * @param replacements a mapping of regular expression
     * {@link java.util.regex.Pattern}s to replacement {@link java.lang.String}s
     * @return the string constructed by performing all replacements
     */
    public static String multipleReplaceAll(final CharSequence input,
            final Map replacements) {
        // iterate over the entries to avoid a separate lookup for each key
        final Iterator entryIterator = replacements.entrySet().iterator();
        String text = input.toString();

        while (entryIterator.hasNext()) {
            final Map.Entry entry = (Map.Entry) entryIterator.next();
            text = replaceAll(text, (Pattern) entry.getKey(),
                    (String) entry.getValue());
        }
        return text;
    }

    /**
     * Normalizes the whitespace in a string, replacing all internal whitespace
     * sequences with a single space character and trimming any leading and
     * trailing whitespace.
     *
     * @param input the string to normalize
     * @return the normalized string
     */
    public static String normalize(final String input) {
        // collapse each whitespace run into one space, then trim both ends
        return replaceAll(input, WHITESPACE_PATTERN, " ").trim();
    }

    /**
     * Helper method that handles the actual replacement process.
     *
     * @param input the string to process
     * @param matcher a matcher on the pattern
     * @param replacement the replacement string
     * @param doReset whether to reset this matcher to the <code>input</code>
     * before searching
     * @return the resulting string; or a reference to the <code>input</code>
     * string if no replacements were made
     */
    private static String replaceAll(final String input, final Matcher matcher,
            final String replacement, final boolean doReset) {
        if (doReset) {
            matcher.reset(input);
        }

        if (!matcher.find()) {
            // nothing to replace -- hand back the very same object
            return input;
        }

        final StringBuffer result = new StringBuffer();
        do {
            matcher.appendReplacement(result, replacement);
        } while (matcher.find());

        // copy the remainder after the last match
        matcher.appendTail(result);
        return result.toString();
    }

    /**
     * Replaces each substring of the <code>input</code> matched by the
     * given {@linkplain Pattern pattern} matcher with the given
     * replacement. See {@link Matcher#replaceAll(java.lang.String)} for details
     * of the replacement process and special characters in the
     * <code>replacement</code> string.
     *
     * <p>This method only returns a new string if there is at least one match
     * to replace. Otherwise the reference to the <code>input</code> object is
     * returned. Thus you can use the <code>==</code> operator to find out
     * whether replacements have been made, it is not necessary to use
     * {@link String#equals(java.lang.Object)}. When there is nothing to
     * replace, it might be more efficient than
     * {@link Matcher#replaceAll(java.lang.String)} (and certainly than
     * {@link String#replaceAll(java.lang.String, java.lang.String)}), because
     * (as of JDK 1.4.2) these methods always create and return new objects.
     *
     * <p>Matchers are stateful and not thread-safe. It is not necessary to
     * {@link Matcher#reset()} the matcher prior to calling this method (it is
     * reset to the <code>input</code> here) but you should reset it if you
     * want to use it in other matching operations afterwards.
     *
     * @param input the string to process
     * @param matcher a matcher on the pattern
     * @param replacement the replacement string
     * @return the resulting string; or a reference to the <code>input</code>
     * string if no replacements were made
     */
    public static String replaceAll(final String input, final Matcher matcher,
            final String replacement) {
        return replaceAll(input, matcher, replacement, true);
    }

    /**
     * Shortens a string, inserting an ellipsis ("...") in the middle if the
     * string is too long. Specifically:
     *
     * <ul>
     * <li>If the length of the <code>input</code> String isn't larger than
     * <code>startChars + endChars + 3</code>, return it.
     * <li>Otherwise return the first <code>startChars</code> characters of the
     * <code>input</code> String, followed by "..." (an ellipsis) and the last
     * <code>endChars</code> characters of the String
     * </ul>
     *
     * <p>This method is similar to
     * {@link org.apache.commons.lang.StringUtils#abbreviate(String, int)},
     * but the ellipsis is inserted in the middle of the string, not at the
     * end.
     *
     * <p>Negative values for <code>startChars</code> or <code>endChars</code>
     * are not supported.
     *
     * @param input the input string
     * @param startChars the number of characters to include before the ellipsis
     * @param endChars the number of characters to include after the ellipsis
     * @return a shortened string, as described above
     */
    public static String shorten(final String input, final int startChars,
            final int endChars) {
        final int length = input.length();
        // computed as long: the int sum would overflow for very large
        // arguments and wrongly send short strings into the substring branch
        final long maxUnshortened = (long) startChars + endChars + 3;

        if (length <= maxUnshortened) {
            // string is short enough -- return as is
            return input;
        }

        // first <startChars> characters + ellipsis + last <endChars> characters
        return input.substring(0, startChars) + "..."
            + input.substring(length - endChars);
    }

    /**
     * Delegates to {@link #shorten(String, int, int)}, using the same number
     * of characters at the start and the end of the shortened string.
     *
     * @param input the input string
     * @param numChars the number of characters to use for
     * both <code>startChars</code> and <code>endChars</code> parameter
     * @return the shortened string
     */
    public static String shorten(final String input, final int numChars) {
        return shorten(input, numChars, numChars);
    }

    /**
     * Checks whether a string contains only punctuation characters.
     *
     * @param text the text to check
     * @return <code>true</code> iff the text contains one or more
     * punctuation characters and no other characters
     */
    public static boolean punctuation(final CharSequence text) {
        return PUNCTUATION_PATTERN.matcher(text).matches();
    }

    /**
     * Checks whether a string contains only punctuation and symbol characters.
     *
     * @param text the text to check
     * @return <code>true</code> iff the text contains one or more
     * punctuation or symbol characters and no other characters
     */
    public static boolean punctuationOrSymbol(final CharSequence text) {
        return PUNCTUATION_SYMBOL_PATTERN.matcher(text).matches();
    }


    /**
     * Delegates to {@link #shorten(String, int, int)}, showing up to
     * <strong>24</strong> characters at the start and the end of the shortened
     * string.
     *
     * @param input the input string
     * @return the shortened string
     */
    public static String shorten(final String input) {
        return shorten(input, 24);
    }

    /**
     * Replaces each substring of the <code>input</code> that matches the given
     * {@link Pattern} with the given replacement. See
     * {@link Matcher#replaceAll(java.lang.String)} for details of the
     * replacement process and special characters in the
     * <code>replacement</code> string.
     *
     * <p>This method only returns a new string if there is at least one match
     * to replace. Otherwise the reference to the <code>input</code> object is
     * returned. Thus you can use the <code>==</code> operator to find out
     * whether replacements have been made, it is not necessary to use
     * {@link String#equals(java.lang.Object)}.
     *
     * <p>This method is thread-safe since pattern objects are stateless. On the
     * other hand, it needs to create a new {@link Matcher} object, thus
     * {@link #replaceAll(String, Matcher, String)} is more efficient for
     * multiple replacements on the same pattern.
     *
     * @param input the string to process
     * @param pattern the regular expression {@link Pattern} to replace
     * @param replacement the replacement string
     * @return the resulting string; or a reference to the <code>input</code>
     * string if no replacements were made
     */
    public static String replaceAll(final String input, final Pattern pattern,
            final String replacement) {
        // a newly created matcher is already positioned on the input,
        // so no reset is required
        return replaceAll(input, pattern.matcher(input), replacement, false);
    }

    /**
     * Splits a text into an array of lines. Only the textual contents of
     * non-empty lines are retained; empty lines and trailing and leading
     * whitespace are removed.
     *
     * <p>Note that for an input that is empty or consists of whitespace only,
     * the result is an array holding a single empty string (this is what
     * {@link Pattern#split(CharSequence)} yields if there is nothing to split).
     *
     * @param input the text to split
     * @return an array of the lines contained in the text; each line is trimmed
     * (trailing and leading whitespace is removed) and empty lines are
     * suppressed
     */
    public static String[] splitLines(final CharSequence input) {
        // remove whitespace at the very beginning and end so the split
        // doesn't produce an empty first element
        final String trimmed = input.toString().trim();
        return NEWLINES_PATTERN.split(trimmed);
    }

    /**
     * Splits a text into an array of lines, without trimming lines and
     * without discarding empty lines in the interior of the text. Splitting
     * is done by {@link Pattern#split(CharSequence)} on the
     * {@link #NEWLINE_PATTERN}, so trailing empty lines are not included in
     * the result.
     *
     * @param input the text to split
     * @return an array of the lines contained in the text
     */
    public static String[] splitLinesExact(final CharSequence input) {
        return NEWLINE_PATTERN.split(input);
    }

    /**
     * Splits a string around whitespace, keeping all resulting subsequences.
     *
     * @param input the string to split
     * @return an array of strings computed by splitting the input
     */
    public static String[] splitString(final String input) {
        return splitString(input, -1);
    }

    /**
     * Splits a string around whitespace. The number of returned subsequences
     * won't be higher than the specified <code>splitMaximum</code>.
     * If splitting results in more subsequences, only the <em>last</em>
     * <code>splitMaximum</code> are kept, while the other ones are discarded.
     * This implementation splits around the {@link #WHITESPACE_PATTERN}.
     *
     * @param input the string to split
     * @param splitMaximum the maximum number of subsequences to keep;
     * or <code>-1</code> if all subsequences should be kept
     * @return an array of strings computed by splitting the input; will
     * contain at most <code>splitMaximum</code> elements (unless
     * <code>splitMaximum</code> is negative); the array can be empty, e.g.
     * if the input consists of whitespace only
     */
    public static String[] splitString(final String input,
            final int splitMaximum) {
        return splitString(input, WHITESPACE_PATTERN, splitMaximum);
    }

    /**
     * Splits a string around matches of the given pattern. The number of
     * returned subsequences won't be higher than the specified
     * <code>splitMaximum</code>.
     * If splitting results in more subsequences, only the <em>last</em>
     * <code>splitMaximum</code> are kept, while the other ones are discarded.
     *
     * @param input the string to split
     * @param whitespacePattern the pattern around which to split
     * @param splitMaximum the maximum number of subsequences to keep;
     * or a negative value (such as <code>-1</code>) if all subsequences
     * should be kept
     * @return an array of strings computed by splitting the input; will
     * contain at most <code>splitMaximum</code> elements (unless
     * <code>splitMaximum</code> is negative)
     */
    public static String[] splitString(final String input,
            final Pattern whitespacePattern, final int splitMaximum) {
        final String[] allSplits = whitespacePattern.split(input);

        if ((splitMaximum < 0) || (allSplits.length <= splitMaximum)) {
            // no limit given or limit not exceeded: keep all splits
            return allSplits;
        }

        // keep only the last splitMaximum splits
        final String[] results = new String[splitMaximum];
        System.arraycopy(allSplits, allSplits.length - splitMaximum,
                results, 0, splitMaximum);
        return results;
    }

    /**
     * Private constructor prevents creation of instances.
     */
    private TextUtils() {
        super();
    }

}