1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22 package de.fu_berlin.ties.text;
23
24 import java.util.Iterator;
25 import java.util.Map;
26 import java.util.regex.Matcher;
27 import java.util.regex.Pattern;
28
29 import org.apache.commons.lang.StringUtils;
30
/**
 * A static class that provides utility constants and methods for working with
 * texts and regular expressions.
 * No instances of this class can be created, only the static members
 * should be used.
 *
 * @author Christian Siefkes
 * @version $Revision: 1.10 $, $Date: 2004/12/06 17:59:28 $, $Author: siefkes $
 */
public final class TextUtils {

    /**
     * The line separator on the current operating system ("\n" on Unix).
     */
    public static final String LINE_SEPARATOR =
        System.getProperty("line.separator");

    /**
     * Regex fragment listing the newline alternatives used by different
     * systems: "\r\n" (Windows), "\n" (Unix) or "\r" (Mac). The two-character
     * alternative comes first so that a Windows line end is matched as a
     * single newline instead of two.
     */
    public static final String NEWLINE_ALTERNATIVES = "\\r\\n|\\n|\\r";

    /**
     * A regular expression matching a non-line-breaking whitespace character
     * (character class containing space and tab).
     */
    public static final Pattern SINGLE_LINE_WS =
        Pattern.compile("[ \\t]");

    /**
     * A regular expression matching a single newline (built by enclosing
     * {@link #NEWLINE_ALTERNATIVES} in a non-capturing group).
     */
    public static final Pattern NEWLINE_PATTERN =
        Pattern.compile("(?:" + NEWLINE_ALTERNATIVES + ")");

    /**
     * A regular expression matching newlines, including surrounding whitespace.
     * Will match several newlines if they immediately follow each other or
     * are separated by whitespace only.
     */
    public static final Pattern NEWLINES_PATTERN =
        Pattern.compile("\\s*(?:[\\r\\n]+\\s*)+");

    /**
     * A simple regular expression for strings that contain only
     * punctuation characters.
     */
    public static final Pattern PUNCTUATION_PATTERN =
        Pattern.compile("\\p{P}+");

    /**
     * A simple regular expression for strings that contain only
     * punctuation and symbol characters.
     */
    public static final Pattern PUNCTUATION_SYMBOL_PATTERN =
        Pattern.compile("[\\p{P}\\p{S}]+");

    /**
     * A simple regular expression for whitespace (one or more whitespace
     * characters).
     */
    public static final Pattern WHITESPACE_PATTERN =
        Pattern.compile("\\s+");

    /**
     * Counts how often a character is repeated at the begin of a string.
     *
     * @param str the string to check
     * @param ch the character to count
     * @return how often the character is repeated at the begin of the string
     * (0 if the string starts with another character or is empty)
     */
    public static int countFirst(final String str, final char ch) {
        final int length = str.length();
        int count = 0;

        // advance while we are still inside the leading run of 'ch'
        while (count < length && str.charAt(count) == ch) {
            count++;
        }
        return count;
    }

    /**
     * Counts how often a character is repeated at the end of a string.
     *
     * @param str the string to check
     * @param ch the character to count
     * @return how often the character is repeated at the end of the string
     * (0 if the string ends with another character or is empty)
     */
    public static int countLast(final String str, final char ch) {
        final int length = str.length();
        int count = 0;

        // walk backwards from the last character through the trailing run
        while (count < length && str.charAt(length - count - 1) == ch) {
            count++;
        }
        return count;
    }

    /**
     * Checks that a string is a printable name, meaning it has at least
     * one character and does not contain any whitespace.
     *
     * @param string the string to check
     * @throws IllegalArgumentException if the given string is null or empty
     * or contains whitespace
     */
    public static void ensurePrintableName(final String string)
            throws IllegalArgumentException {
        if (string == null || string.length() == 0) {
            throw new IllegalArgumentException("Name is null or empty: "
                + string);
        }
        if (WHITESPACE_PATTERN.matcher(string).find()) {
            throw new IllegalArgumentException("Name contains whitespace: '"
                + string + "'");
        }
    }

    /**
     * Helper method for building a regular expression {@link Pattern} by
     * combining several alternatives.
     *
     * @param alternatives the alternatives to combine
     * @return a pattern string containing the joined alternatives; two or
     * more alternatives are combined in a non-capturing group; a single
     * alternative is just returned as is; if the array is empty, an empty
     * string is returned
     */
    public static String joinAlternatives(final String[] alternatives) {
        final int length = alternatives.length;

        if (length == 0) {
            return "";
        }
        if (length == 1) {
            // a single alternative needs no grouping
            return alternatives[0];
        }

        final StringBuffer result = new StringBuffer("(?:");
        result.append(alternatives[0]);
        for (int i = 1; i < length; i++) {
            result.append("|").append(alternatives[i]);
        }
        result.append(")");
        return result.toString();
    }

    /**
     * Performs multiple replace-all operations on a text. The replacements
     * are performed in the iteration order of the given map, each one working
     * on the result of the previous one.
     *
     * @param input the character sequence to perform the replacements on
     * @param replacements a mapping of regular expression
     * {@link java.util.regex.Pattern}s to replacement {@link java.lang.String}s
     * @return the string constructed by performing all replacements
     */
    public static String multipleReplaceAll(final CharSequence input,
            final Map replacements) {
        String text = input.toString();

        // iterate over the entries to avoid a separate lookup for each key
        final Iterator entryIterator = replacements.entrySet().iterator();
        while (entryIterator.hasNext()) {
            final Map.Entry entry = (Map.Entry) entryIterator.next();
            final Pattern currentPattern = (Pattern) entry.getKey();
            final String currentReplace = (String) entry.getValue();
            text = replaceAll(text, currentPattern, currentReplace);
        }
        return text;
    }

    /**
     * Normalizes the whitespace in a string, replacing all internal whitespace
     * sequences with a single space character and trimming any leading and
     * trailing whitespace.
     *
     * @param input the string to normalize
     * @return the normalized string
     */
    public static String normalize(final String input) {
        // collapse every whitespace run first, then cut off both ends
        return replaceAll(input, WHITESPACE_PATTERN, " ").trim();
    }

    /**
     * Helper method that handles the actual replacement process.
     *
     * @param input the string to process
     * @param matcher a matcher on the pattern
     * @param replacement the replacement string
     * @param doReset whether to reset this matcher on the <code>input</code>
     * before searching
     * @return the resulting string; or a reference to the <code>input</code>
     * string if no replacements were made
     */
    private static String replaceAll(final String input, final Matcher matcher,
            final String replacement, final boolean doReset) {
        if (doReset) {
            matcher.reset(input);
        }

        if (!matcher.find()) {
            // nothing to replace: hand back the very same object so callers
            // can detect this case with ==
            return input;
        }

        final StringBuffer result = new StringBuffer();
        do {
            matcher.appendReplacement(result, replacement);
        } while (matcher.find());

        matcher.appendTail(result);
        return result.toString();
    }

    /**
     * Replaces each substring of the <code>input</code> matched by the
     * given {@linkplain Pattern pattern} matcher with the given
     * replacement. See {@link Matcher#replaceAll(java.lang.String)} for details
     * of the replacement process and special characters in the
     * <code>replacement</code> string.
     *
     * <p>This method only returns a new string if there is at least one match
     * to replace. Otherwise the reference to the <code>input</code> object is
     * returned. Thus you can use the <code>==</code> operator to find out
     * whether replacements have been made, it is not necessary to use
     * {@link String#equals(java.lang.Object)}. When there is nothing to
     * replace, it might be more efficient than
     * {@link Matcher#replaceAll(java.lang.String)} (and certainly than
     * {@link String#replaceAll(java.lang.String, java.lang.String)}), because
     * (as of JDK 1.4.2) these methods always create and return new objects.
     *
     * <p>Matchers are stateful and not thread-safe. It is not necessary to
     * {@link Matcher#reset()} the matcher prior to calling this method (it is
     * reset on the <code>input</code> here) but you should reset it if you
     * want to use it in other matching operations afterwards.
     *
     * @param input the string to process
     * @param matcher a matcher on the pattern
     * @param replacement the replacement string
     * @return the resulting string; or a reference to the <code>input</code>
     * string if no replacements were made
     */
    public static String replaceAll(final String input, final Matcher matcher,
            final String replacement) {
        return replaceAll(input, matcher, replacement, true);
    }

    /**
     * Shortens a string, inserting an ellipsis ("...") in the middle if the
     * string is too long. Specifically:
     *
     * <ul>
     * <li>If the length of the <code>input</code> String isn't larger than
     * <code>startChars + endChars + 3</code>, return it.
     * <li>Otherwise return the first <code>startChars</code> characters of the
     * <code>input</code> String, followed by "..." (an ellipsis) and the last
     * <code>endChars</code> characters of the String
     * </ul>
     *
     * <p>This method is similar to
     * {@link org.apache.commons.lang.StringUtils#abbreviate(String, int)},
     * but the ellipsis is inserted in the middle of the string, not at the
     * end. The character counts are not validated here; they are expected to
     * be non-negative.
     *
     * @param input the input string
     * @param startChars the number of characters to include before the ellipsis
     * @param endChars the number of characters to include after the ellipsis
     * @return a shortened string, as described above
     */
    public static String shorten(final String input, final int startChars,
            final int endChars) {
        final int length = input.length();

        if (length <= startChars + endChars + 3) {
            // shortening would not save anything
            return input;
        }

        return input.substring(0, startChars) + "..."
            + input.substring(length - endChars);
    }

    /**
     * Delegates to {@link #shorten(String, int, int)}, using the same number
     * of characters at the start and the end of the shortened string.
     *
     * @param input the input string
     * @param numChars the number of characters to use for
     * both <code>startChars</code> and <code>endChars</code> parameter
     * @return the shortened string
     */
    public static String shorten(final String input, final int numChars) {
        return shorten(input, numChars, numChars);
    }

    /**
     * Checks whether a string contains only punctuation characters.
     *
     * @param text the text to check
     * @return <code>true</code> iff the text contains one or more
     * punctuation characters and no other characters
     */
    public static boolean punctuation(final CharSequence text) {
        return PUNCTUATION_PATTERN.matcher(text).matches();
    }

    /**
     * Checks whether a string contains only punctuation and symbol characters.
     *
     * @param text the text to check
     * @return <code>true</code> iff the text contains one or more
     * punctuation or symbol characters and no other characters
     */
    public static boolean punctuationOrSymbol(final CharSequence text) {
        return PUNCTUATION_SYMBOL_PATTERN.matcher(text).matches();
    }

    /**
     * Delegates to {@link #shorten(String, int, int)}, showing up to
     * <strong>24</strong> characters at the start and the end of the shortened
     * string.
     *
     * @param input the input string
     * @return the shortened string
     */
    public static String shorten(final String input) {
        return shorten(input, 24);
    }

    /**
     * Replaces each substring of the <code>input</code> that matches the given
     * {@link Pattern} with the given replacement. See
     * {@link Matcher#replaceAll(java.lang.String)} for details of the
     * replacement process and special characters in the
     * <code>replacement</code> string.
     *
     * <p>This method only returns a new string if there is at least one match
     * to replace. Otherwise the reference to the <code>input</code> object is
     * returned. Thus you can use the <code>==</code> operator to find out
     * whether replacements have been made, it is not necessary to use
     * {@link String#equals(java.lang.Object)}.
     *
     * <p>This method is thread-safe since pattern objects are stateless. On the
     * other hand, it needs to create a new {@link Matcher} object, thus
     * {@link #replaceAll(String, Matcher, String)} is more efficient for
     * multiple replacements on the same pattern.
     *
     * @param input the string to process
     * @param pattern the regular expression {@link Pattern} to replace
     * @param replacement the replacement string
     * @return the resulting string; or a reference to the <code>input</code>
     * string if no replacements were made
     */
    public static String replaceAll(final String input, final Pattern pattern,
            final String replacement) {
        // a freshly created matcher is already positioned on the input,
        // so no reset is required
        return replaceAll(input, pattern.matcher(input), replacement, false);
    }

    /**
     * Splits a text into an array of lines. Only the textual contents of
     * non-empty lines are retained; empty lines and trailing and leading
     * whitespace are removed.
     *
     * @param input the text to split
     * @return an array of the lines contained in the text; each line is trimmed
     * (trailing and leading whitespace is removed) and empty lines are
     * suppressed
     */
    public static String[] splitLines(final CharSequence input) {
        // trimming the whole text first prevents empty elements at the borders
        final String trimmed = input.toString().trim();
        return NEWLINES_PATTERN.split(trimmed);
    }

    /**
     * Splits a text into an array of lines, without trimming lines and
     * discarding empty lines (only trailing empty lines are dropped, as
     * specified by {@link Pattern#split(CharSequence)}).
     *
     * @param input the text to split
     * @return an array of the lines contained in the text
     */
    public static String[] splitLinesExact(final CharSequence input) {
        return NEWLINE_PATTERN.split(input);
    }

    /**
     * Splits a string around whitespace, keeping all subsequences.
     *
     * @param input the string to split
     * @return an array of strings computed by splitting the input
     */
    public static String[] splitString(final String input) {
        return splitString(input, -1);
    }

    /**
     * Splits a string around whitespace. The number of returned subsequences
     * won't be higher than the specified <code>splitMaximum</code>.
     * If splitting results in more subsequences, only the <em>last</em>
     * <code>splitMaximum</code> are kept, while the other ones are discarded.
     * This implementation splits around the {@link #WHITESPACE_PATTERN}.
     *
     * @param input the string to split
     * @param splitMaximum the maximum number of subsequences to keep;
     * or <code>-1</code> (any negative value) if all subsequences should be
     * kept
     * @return an array of strings computed by splitting the input; will
     * contain at most <code>splitMaximum</code> elements unless
     * <code>splitMaximum</code> is negative
     */
    public static String[] splitString(final String input,
            final int splitMaximum) {
        return splitString(input, WHITESPACE_PATTERN, splitMaximum);
    }

    /**
     * Splits a string around matches of the given pattern. The number of
     * returned subsequences won't be higher than the specified
     * <code>splitMaximum</code>.
     * If splitting results in more subsequences, only the <em>last</em>
     * <code>splitMaximum</code> are kept, while the other ones are discarded.
     *
     * @param input the string to split
     * @param whitespacePattern the pattern around which to split
     * @param splitMaximum the maximum number of subsequences to keep;
     * or <code>-1</code> (any negative value) if all subsequences should be
     * kept
     * @return an array of strings computed by splitting the input; will
     * contain at most <code>splitMaximum</code> elements unless
     * <code>splitMaximum</code> is negative
     */
    public static String[] splitString(final String input,
            final Pattern whitespacePattern, final int splitMaximum) {
        final String[] allSplits = whitespacePattern.split(input);

        // a negative maximum means "no limit"
        if (splitMaximum < 0) {
            return allSplits;
        }

        final int surplus = allSplits.length - splitMaximum;
        if (surplus <= 0) {
            // already within the limit
            return allSplits;
        }

        // too many parts: drop the first 'surplus' ones and keep the tail
        final String[] results = new String[splitMaximum];
        System.arraycopy(allSplits, surplus, results, 0, splitMaximum);
        return results;
    }

    /**
     * Private constructor prevents creation of instances.
     */
    private TextUtils() {
        super();
    }

}