1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22 package de.fu_berlin.ties.text;
23
24 import java.io.IOException;
25 import java.io.Writer;
26 import java.util.Iterator;
27 import java.util.Map;
28 import java.util.regex.Matcher;
29 import java.util.regex.Pattern;
30
31 import org.apache.commons.lang.StringUtils;
32
/**
 * A static class that provides utility constants and methods for working with
 * texts and regular expressions.
 * No instances of this class can be created, only the static members
 * should be used.
 *
 * @author Christian Siefkes
 * @version $Revision: 1.17 $, $Date: 2006/10/21 16:04:25 $, $Author: siefkes $
 */
public final class TextUtils {

    /**
     * The line separator on the current operating system ("\n" on Unix),
     * read from the <code>line.separator</code> system property.
     */
    public static final String LINE_SEPARATOR =
        System.getProperty("line.separator");

    /**
     * Regex fragment listing the newline alternatives used by different
     * systems: "\r\n" (Windows), "\n" (Unix) or "\r" (Mac). The two-character
     * alternative comes first so that a Windows line end is matched as a
     * single newline.
     */
    public static final String NEWLINE_ALTERNATIVES = "\\r\\n|\\n|\\r";

    /**
     * A regular expression matching a non-line-breaking whitespace character
     * (character class containing space and tab).
     */
    public static final Pattern SINGLE_LINE_WS =
        Pattern.compile("[ \\t]");

    /**
     * A regular expression matching a single newline (built by enclosing
     * {@link #NEWLINE_ALTERNATIVES} in a non-capturing group).
     */
    public static final Pattern NEWLINE_PATTERN =
        Pattern.compile("(?:" + NEWLINE_ALTERNATIVES + ")");

    /**
     * A regular expression matching a single newline or a tab character.
     */
    public static final Pattern NEWLINE_TAB_PATTERN =
        Pattern.compile("(?:" + NEWLINE_ALTERNATIVES + "|\\t)");

    /**
     * A regular expression matching newlines, including surrounding whitespace.
     * Will match several newlines if they immediately follow each other or
     * are separated by whitespace only.
     */
    public static final Pattern NEWLINES_PATTERN =
        Pattern.compile("\\s*(?:[\\r\\n]+\\s*)+");

    /**
     * A simple regular expression for strings that contain only
     * punctuation characters.
     */
    public static final Pattern PUNCTUATION_PATTERN =
        Pattern.compile("\\p{P}+");

    /**
     * A simple regular expression for strings that contain only
     * punctuation and symbol characters.
     */
    public static final Pattern PUNCTUATION_SYMBOL_PATTERN =
        Pattern.compile("[\\p{P}\\p{S}]+");

    /**
     * A simple regular expression for whitespace.
     */
    public static final Pattern WHITESPACE_PATTERN =
        Pattern.compile("\\s+");

    /**
     * Helper method for building a regular expression {@link Pattern} by
     * combining several alternatives in a capturing group.
     *
     * @param alternatives the alternatives to combine
     * @param quote whether to quote the alternatives using the
     * {@link Pattern#quote(java.lang.String)} method
     * @return a pattern string containing the joined alternatives within a
     * capturing group; an empty array yields the empty group
     * <code>"()"</code>
     */
    public static String captureAlternatives(final String[] alternatives,
            final boolean quote) {
        final StringBuilder result = new StringBuilder("(");

        for (int i = 0; i < alternatives.length; i++) {
            if (i > 0) {
                result.append('|');
            }
            result.append(quote
                ? Pattern.quote(alternatives[i]) : alternatives[i]);
        }

        // close the group unconditionally so that an empty array still
        // yields a balanced (and thus compilable) expression
        return result.append(')').toString();
    }

    /**
     * Counts how often a character is repeated at the begin of a string.
     *
     * @param str the string to check
     * @param ch the character to count
     * @return how often the character is repeated at the begin of the string
     * (0 if the string starts with another character or is empty)
     */
    public static int countFirst(final String str, final char ch) {
        final int length = str.length();
        int count = 0;

        while (count < length && str.charAt(count) == ch) {
            count++;
        }
        return count;
    }

    /**
     * Counts how often a character is repeated at the end of a string.
     *
     * @param str the string to check
     * @param ch the character to count
     * @return how often the character is repeated at the end of the string
     * (0 if the string ends with another character or is empty)
     */
    public static int countLast(final String str, final char ch) {
        final int length = str.length();
        int count = 0;

        // walk backwards from the last character
        while (count < length && str.charAt(length - 1 - count) == ch) {
            count++;
        }
        return count;
    }

    /**
     * Checks that a string is a printable name, meaning it has at least
     * one character and does not contain any whitespace.
     *
     * @param string the string to check
     * @throws IllegalArgumentException if the given string is null or empty
     * or contains whitespace
     */
    public static void ensurePrintableName(final String string)
            throws IllegalArgumentException {
        if (string == null || string.length() == 0) {
            throw new IllegalArgumentException("Name is null or empty: "
                + string);
        }
        if (WHITESPACE_PATTERN.matcher(string).find()) {
            throw new IllegalArgumentException("Name contains whitespace: '"
                + string + "'");
        }
    }

    /**
     * Helper method for building a regular expression {@link Pattern} by
     * combining several alternatives.
     *
     * @param alternatives the alternatives to combine
     * @return a pattern string containing the joined alternatives; two or
     * more alternatives are combined in a non-capturing group; a single
     * alternative is just returned as is; if the array is empty, an empty
     * string is returned
     */
    public static String joinAlternatives(final String[] alternatives) {
        final int length = alternatives.length;

        if (length == 0) {
            return "";
        }
        if (length == 1) {
            // a single alternative needs no grouping
            return alternatives[0];
        }

        final StringBuilder result = new StringBuilder("(?:");
        result.append(alternatives[0]);
        for (int i = 1; i < length; i++) {
            result.append('|').append(alternatives[i]);
        }
        return result.append(')').toString();
    }

    /**
     * Performs multiple replace-all operations on a text. The replacements
     * are performed in the iteration order of the given map, each one working
     * on the result of the previous one.
     *
     * @param input the character sequence to perform the replacements on
     * @param replacements a mapping of regular expression
     * {@link java.util.regex.Pattern}s to replacement {@link java.lang.String}s
     * @return the string constructed by performing all replacements
     * @throws ClassCastException if a key of the map is not a
     * {@link Pattern} or a value is not a {@link String}
     */
    public static String multipleReplaceAll(final CharSequence input,
            final Map<?, ?> replacements) {
        String text = input.toString();

        // the wildcard signature accepts raw and arbitrarily parameterized
        // maps, hence the casts
        for (final Map.Entry<?, ?> entry : replacements.entrySet()) {
            text = replaceAll(text, (Pattern) entry.getKey(),
                (String) entry.getValue());
        }
        return text;
    }

    /**
     * Normalizes the whitespace in a string, replacing all internal whitespace
     * sequences with a single space character and trimming any leading and
     * trailing whitespace.
     *
     * @param input the string to normalize
     * @return the normalized string
     */
    public static String normalize(final String input) {
        // collapse first, so at most one space remains at either end to trim
        return replaceAll(input, WHITESPACE_PATTERN, " ").trim();
    }

    /**
     * Helper method that handles the actual replacement process.
     *
     * @param input the string to process
     * @param matcher a matcher on the pattern
     * @param replacement the replacement string
     * @param doReset whether to reset this matcher on the <code>input</code>
     * before searching
     * @return the resulting string; or a reference to the <code>input</code>
     * string if no replacements were made
     */
    private static String replaceAll(final String input, final Matcher matcher,
            final String replacement, final boolean doReset) {
        if (doReset) {
            matcher.reset(input);
        }

        if (!matcher.find()) {
            // nothing to replace: hand back the very same object
            return input;
        }

        // StringBuffer is what Matcher.appendReplacement accepts in the
        // Java versions this class targets
        final StringBuffer result = new StringBuffer();
        do {
            matcher.appendReplacement(result, replacement);
        } while (matcher.find());

        matcher.appendTail(result);
        return result.toString();
    }

    /**
     * Replaces each substring of the <code>input</code> matched by the
     * given {@linkplain Pattern pattern} matcher with the given
     * replacement. See {@link Matcher#replaceAll(java.lang.String)} for details
     * of the replacement process and special characters in the
     * <code>replacement</code> string.
     *
     * <p>This method only returns a new string if there is at least one match
     * to replace. Otherwise the reference to the <code>input</code> object is
     * returned. Thus you can use the <code>==</code> operator to find out
     * whether replacements have been made, it is not necessary to use
     * {@link String#equals(java.lang.Object)}. When there is nothing to
     * replace, it might be more efficient than
     * {@link Matcher#replaceAll(java.lang.String)} (and certainly than
     * {@link String#replaceAll(java.lang.String, java.lang.String)}), because
     * (as of JDK 1.4.2) these methods always create and return new objects.
     *
     * <p>Matchers are stateful and not thread-safe. It is not necessary to
     * {@link Matcher#reset()} the matcher prior to calling this method (it is
     * reset on the <code>input</code> here) but you should reset it if you
     * want to use it in other matching operations afterwards.
     *
     * @param input the string to process
     * @param matcher a matcher on the pattern
     * @param replacement the replacement string
     * @return the resulting string; or a reference to the <code>input</code>
     * string if no replacements were made
     */
    public static String replaceAll(final String input, final Matcher matcher,
            final String replacement) {
        return replaceAll(input, matcher, replacement, true);
    }

    /**
     * Shortens a string, inserting an ellipsis ("...") in the middle if the
     * string is too long. Specifically:
     *
     * <ul>
     * <li>If the length of the <code>input</code> String isn't larger than
     * <code>startChars + endChars + 3</code>, return it.
     * <li>Otherwise return the first <code>startChars</code> characters of the
     * <code>input</code> String, followed by "..." (an ellipsis) and the last
     * <code>endChars</code> characters of the String
     * </ul>
     *
     * <p>This method is similar to
     * {@link org.apache.commons.lang.StringUtils#abbreviate(String, int)},
     * but the ellipsis is inserted in the middle of the string, not at the
     * end. Negative arguments are not validated.
     *
     * @param input the input string
     * @param startChars the number of characters to include before the ellipsis
     * @param endChars the number of characters to include after the ellipsis
     * @return a shortened string, as described above
     */
    public static String shorten(final String input, final int startChars,
            final int endChars) {
        final int length = input.length();

        // compute the limit as long so that very large arguments cannot
        // overflow into a negative limit
        if (length <= (long) startChars + endChars + 3) {
            return input;
        }

        return input.substring(0, startChars) + "..."
            + input.substring(length - endChars);
    }

    /**
     * Delegates to {@link #shorten(String, int, int)}, using the same number
     * of characters at the start and the end of the shortened string.
     *
     * @param input the input string
     * @param numChars the number of characters to use for
     * both <code>startChars</code> and <code>endChars</code> parameter
     * @return the shortened string
     */
    public static String shorten(final String input, final int numChars) {
        return shorten(input, numChars, numChars);
    }

    /**
     * Checks whether a string contains only punctuation characters.
     *
     * @param text the text to check
     * @return <code>true</code> iff the text contains one or more
     * punctuation characters and no other characters
     */
    public static boolean punctuation(final CharSequence text) {
        return PUNCTUATION_PATTERN.matcher(text).matches();
    }

    /**
     * Checks whether a string contains only punctuation and symbol characters.
     *
     * @param text the text to check
     * @return <code>true</code> iff the text contains one or more
     * punctuation or symbol characters and no other characters
     */
    public static boolean punctuationOrSymbol(final CharSequence text) {
        return PUNCTUATION_SYMBOL_PATTERN.matcher(text).matches();
    }

    /**
     * Delegates to {@link #shorten(String, int, int)}, showing up to
     * <strong>24</strong> characters at the start and the end of the shortened
     * string.
     *
     * @param input the input string
     * @return the shortened string
     */
    public static String shorten(final String input) {
        return shorten(input, 24);
    }

    /**
     * Replaces each substring of the <code>input</code> that matches the given
     * {@link Pattern} with the given replacement. See
     * {@link Matcher#replaceAll(java.lang.String)} for details of the
     * replacement process and special characters in the
     * <code>replacement</code> string.
     *
     * <p>This method only returns a new string if there is at least one match
     * to replace. Otherwise the reference to the <code>input</code> object is
     * returned. Thus you can use the <code>==</code> operator to find out
     * whether replacements have been made, it is not necessary to use
     * {@link String#equals(java.lang.Object)}.
     *
     * <p>This method is thread-safe since pattern objects are stateless. On the
     * other hand, it needs to create a new {@link Matcher} object, thus
     * {@link #replaceAll(String, Matcher, String)} is more efficient for
     * multiple replacements on the same pattern.
     *
     * @param input the string to process
     * @param pattern the regular expression {@link Pattern} to replace
     * @param replacement the replacement string
     * @return the resulting string; or a reference to the <code>input</code>
     * string if no replacements were made
     */
    public static String replaceAll(final String input, final Pattern pattern,
            final String replacement) {
        // a freshly created matcher is already positioned on the input,
        // so no reset is required
        return replaceAll(input, pattern.matcher(input), replacement, false);
    }

    /**
     * Splits a text into an array of lines. Only the textual contents of
     * non-empty lines are retained; empty lines and trailing and leading
     * whitespace are removed.
     *
     * @param input the text to split
     * @return an array of the lines contained in the text; each line is trimmed
     * (trailing and leading whitespace is removed) and empty lines are
     * suppressed; the array is empty if the text is empty or contains
     * only whitespace
     */
    public static String[] splitLines(final CharSequence input) {
        // trimming the whole text first ensures that no separator can match
        // at either end
        final String trimmed = input.toString().trim();

        if (trimmed.length() == 0) {
            // splitting "" would yield a single empty line, which this
            // method promises to suppress
            return new String[0];
        }
        return NEWLINES_PATTERN.split(trimmed);
    }

    /**
     * Splits a text into an array of lines around each single newline,
     * without trimming the lines. Empty lines inside the text are kept;
     * trailing empty lines are dropped, as specified for
     * {@link Pattern#split(java.lang.CharSequence)}.
     *
     * @param input the text to split
     * @return an array of the lines contained in the text
     */
    public static String[] splitLinesExact(final CharSequence input) {
        return NEWLINE_PATTERN.split(input);
    }

    /**
     * Splits a string around whitespace.
     *
     * @param input the string to split
     * @return an array of strings computed by splitting the input
     */
    public static String[] splitString(final CharSequence input) {
        return splitString(input, -1);
    }

    /**
     * Splits a string around whitespace. The number of returned subsequences
     * won't be higher than the specified <code>splitMaximum</code>.
     * If splitting results in more subsequences, only the <em>last</em>
     * <code>splitMaximum</code> are kept, while the other ones are discarded.
     * This implementation splits around the {@link #WHITESPACE_PATTERN}.
     *
     * @param input the string to split
     * @param splitMaximum the maximum number of subsequences to keep;
     * or a negative value such as <code>-1</code> if all subsequences
     * should be kept
     * @return an array of strings computed by splitting the input; will
     * contain at most <code>splitMaximum</code> elements
     */
    public static String[] splitString(final CharSequence input,
            final int splitMaximum) {
        return splitString(input, WHITESPACE_PATTERN, splitMaximum);
    }

    /**
     * Splits a string around matches of a pattern. The number of returned
     * subsequences won't be higher than the specified
     * <code>splitMaximum</code>.
     * If splitting results in more subsequences, only the <em>last</em>
     * <code>splitMaximum</code> are kept, while the other ones are discarded.
     *
     * @param input the string to split
     * @param whitespacePattern the pattern around which to split
     * @param splitMaximum the maximum number of subsequences to keep;
     * or a negative value such as <code>-1</code> if all subsequences
     * should be kept
     * @return an array of strings computed by splitting the input; will
     * contain at most <code>splitMaximum</code> elements
     */
    public static String[] splitString(final CharSequence input,
            final Pattern whitespacePattern, final int splitMaximum) {
        final String[] allSplits = whitespacePattern.split(input);

        if (splitMaximum < 0 || allSplits.length <= splitMaximum) {
            // no limit requested or limit not exceeded
            return allSplits;
        }

        // keep only the trailing splitMaximum elements
        final String[] results = new String[splitMaximum];
        System.arraycopy(allSplits, allSplits.length - splitMaximum,
            results, 0, splitMaximum);
        return results;
    }

    /**
     * Weakly normalizes the whitespace in a string, by replacing each
     * newline ("\r\n", "\n" or "\r") and each tab character with a single
     * space character; existing space characters are left as they are.
     * This is in accordance with the attribute-value normalization required
     * by the XML specification.
     *
     * @param input the string to normalize
     * @return the weakly normalized string
     */
    public static String weaklyNormalize(final String input) {
        return replaceAll(input, NEWLINE_TAB_PATTERN, " ");
    }

    /**
     * Convenience method that writes a text to a writer and appends the
     * {@linkplain #LINE_SEPARATOR line separator}.
     *
     * @param writer the writer to write to
     * @param text the text to write
     * @throws IOException if an I/O error occurs
     */
    public static void writeln(final Writer writer, final String text)
            throws IOException {
        writer.write(text);
        writer.write(LINE_SEPARATOR);
    }

    /**
     * Private constructor prevents creation of instances.
     */
    private TextUtils() {
        super();
    }

}