1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22 package de.fu_berlin.ties.text;
23
24 import java.util.regex.Matcher;
25 import java.util.regex.Pattern;
26 import java.util.regex.PatternSyntaxException;
27
28 import org.apache.commons.lang.builder.ToStringBuilder;
29
30 /***
31 * Splits a text into a sequence of tokens.
32 *
33 * <p>This class is not thread-safe, so if you should want to share
34 * a tokenizer between threads you have to ensure adequate synchronization.
35 *
36 * @author Christian Siefkes
37 * @version $Revision: 1.13 $, $Date: 2006/10/21 16:04:25 $, $Author: siefkes $
38 */
39 public class TextTokenizer {
40
41 /***
42 * The index of the last character matched by the penultimate
43 * token, plus one; or <code>0</code> if there was no match yet.
44 */
45 private int afterPenultimateMatch;
46
47 /***
48 * The normalized whitespace representation prepended if {@link
49 * #normalizedWhitespacePrepended} is <code>true</code>. Defaults to a
50 * space character.
51 */
52 private String normalizedWhitespace = " ";
53
54 /***
55 * Whether whitespace is prepended in a normalized form ({@link
56 * #normalizedWhitespace}) to those tokens where {@link
57 * #hasPrecedingWhitespace()} would return <code>true</code>.
58 * Defaults to <code>false</code>.
59 */
60 private boolean normalizedWhitespacePrepended = false;
61
62 /***
63 * The whitespace preceding the last read token. Might be the empty
64 * string if there is no preceding whitespace, or <code>null</code>
65 * if the whitespace has not yet been determined.
66 */
67 private String precedingWhitespace;
68
69 /***
70 * The index of the first character matched by the last
71 * token; or <code>-1</code> if there was no match yet, or
72 * <code>textToTokenize.length()</code> if the last match wasn't
73 * succeesful.
74 */
75 private int startOfLastMatch;
76
77 /***
78 * The text to tokenize.
79 */
80 private CharSequence textToTokenize;
81
82 /***
83 * The matcher used to split the text into matching tokens.
84 */
85 private final Matcher tokenMatcher;
86
87 /***
88 * The whitespace (non-token) matcher.
89 */
90 private final Matcher whitespaceMatcher;
91
92 /***
93 * Whether whitespace (the text between patterns) is checked to
94 * ensure that the defined whitespace pattern is matched. If set to
95 * true (default), a call to {@link #hasPrecedingWhitespace()} or
96 * {@link #precedingWhitespace()} will throw an {@link
97 * java.lang.IllegalArgumentException} if the whitespace preceding the
98 * last read token does not match.
99 *
100 * <p>It might not be a good idea to enable this, because some characters
101 * that are theoretically not in use in a charset might cause problems.
102 * E.g. the French starting and ending quotes from the Windows charset
103 * (at unused positions in Latin1 + Unicode) are recognized as {@link
104 * java.lang.Character#INITIAL_QUOTE_PUNCTUATION} (<code>Pi</code>
105 * category) or {@link java.lang.Character#FINAL_QUOTE_PUNCTUATION}
106 * (<code>Pf</code>), but in some cases they do not seem to match any
107 * regex category patterns.
108 */
109 private boolean whitespacePatternEnsured = false;
110
111 /***
112 * Creates a new instance. Only use this constructor if you know what you
113 * are doing! Usually it should be sufficient to use one of the factory
114 * methods provided by {@link TokenizerFactory}.
115 *
116 * @param patterns a list of patterns to accept as tokens; patterns
117 * jointed and compiled with the {@link java.util.regex.Pattern#DOTALL}
118 * flag activated
119 * @param whitespacePattern a pattern that should match all text
120 * between tokens ("whitespace"), to ensure that no text is left out
121 * by mistake; the pattern is compiled with the
122 * {@link java.util.regex.Pattern#DOTALL} flag activated
123 * @param text the text to tokenize
124 * @throws PatternSyntaxException if the syntax of the provided
125 * patterns is invalid
126 */
127 public TextTokenizer(final String[] patterns,
128 final String whitespacePattern, final CharSequence text)
129 throws PatternSyntaxException {
130 super();
131 final StringBuilder patternBuffer;
132
133
134 if (patterns.length > 1) {
135 patternBuffer = new StringBuilder("(?:");
136 } else {
137 patternBuffer = new StringBuilder();
138 }
139
140
141 for (int i = 0; i < patterns.length; i++) {
142 patternBuffer.append(patterns[i]);
143
144 if (i == (patterns.length - 1)) {
145 if (patterns.length > 1) {
146
147 patternBuffer.append(")");
148 }
149 } else {
150 patternBuffer.append("|");
151 }
152 }
153
154 textToTokenize = text;
155
156 final Pattern tokenPattern = Pattern.compile(
157 patternBuffer.toString(), Pattern.DOTALL);
158 tokenMatcher = tokenPattern.matcher(text);
159
160 final Pattern wsPattern = Pattern.compile(whitespacePattern,
161 Pattern.DOTALL);
162 whitespaceMatcher = wsPattern.matcher("");
163
164
165 reinit();
166 }
167
168 /***
169 * Returns the text captured within "capturing groups" in the last token.
170 * All captured text sequences are joint in a single string.
171 * If there were no capturing groups involved in the last match, the
172 * empty string is returned.
173 *
174 * @return the joint text matched within captured groups in the last token
175 * match
176 */
177 public final String capturedText() {
178 final StringBuilder result = new StringBuilder();
179 String currentGroup;
180 for (int i = 1; i <= tokenMatcher.groupCount(); i++) {
181 currentGroup = tokenMatcher.group(i);
182 if (currentGroup != null) {
183 result.append(currentGroup);
184 }
185 }
186 return result.toString();
187 }
188
189 /***
190 * Helper method to find the preceding whitespace.
191 *
192 * @throws IllegalStateException if this method is called
193 * without a prior call to {@link #nextToken()}
194 */
195 private void doFindPrecedingWhitespace() throws IllegalStateException {
196 if (startOfLastMatch < 0) {
197 throw new IllegalStateException(
198 "Prior call to nextToken() required");
199 }
200
201 if (precedingWhitespace == null) {
202
203
204
205
206
207
208
209 precedingWhitespace = textToTokenize.subSequence(
210 afterPenultimateMatch, startOfLastMatch).toString();
211 }
212 }
213
214 /***
215 * Helper method to find and optionally validate the preceding whitespace.
216 *
217 * @throws IllegalStateException if this method is called
218 * without a prior call to {@link #nextToken()}
219 * @throws IllegalArgumentException if
220 * {@link #isWhitespacePatternEnsured()} is <code>true</code> and the
221 * whitespace preceding the last read token does not match the defined
222 * whitespace pattern
223 */
224 private void findPrecedingWhitespace() throws IllegalStateException,
225 IllegalArgumentException {
226 doFindPrecedingWhitespace();
227
228
229 if (isWhitespacePatternEnsured() && (!precedingWhitespaceIsValid())) {
230 throw new IllegalArgumentException(
231 "Supposed whitespace '" + precedingWhitespace
232 + "' between position " + afterPenultimateMatch + " and "
233 + startOfLastMatch + " doesn't match specified pattern "
234 + whitespaceMatcher.pattern().pattern());
235 }
236 }
237
238 /***
239 * Returns the normalized whitespace representation prepended if {@link
240 * #isNormalizedWhitespacePrepended()} is <code>true</code>. Defaults to
241 * a space character.
242 *
243 * @return the normalized representation
244 */
245 public final String getNormalizedWhitespace() {
246 return normalizedWhitespace;
247 }
248
249 /***
250 * Whether the token returned by the last call to {@link #nextToken()}
251 * is preceded by whitespace (i.e., text not matched by any token).
252 * If we arrived at the end of the text to tokenize (last call to
253 * {@link #nextToken()} returned <code>null</code>), this is the whitespace
254 * between the last existing token and the end of the text.
255 *
256 * @return whether the last token is preceded by whitespace
257 * @throws IllegalStateException if this method is called
258 * without a prior call to {@link #nextToken()}
259 * @throws IllegalArgumentException if
260 * {@link #isWhitespacePatternEnsured()} is <code>true</code> and the
261 * whitespace preceding the last read token does not match the defined
262 * whitespace pattern
263 */
264 public final boolean hasPrecedingWhitespace() throws IllegalStateException,
265 IllegalArgumentException {
266
267 findPrecedingWhitespace();
268
269 return precedingWhitespace.length() > 0;
270 }
271
272 /***
273 * Convenience method that counts the number of whitespace characters at the
274 * begin of a string, according to the defined whitespace pattern.
275 *
276 * @param text the text to check
277 * @return the number of whitespace characters at the begin, of 0 if there
278 * are none
279 */
280 public int initialWhitespaceCount(final String text) {
281 whitespaceMatcher.reset(text);
282 if (whitespaceMatcher.lookingAt()) {
283 return whitespaceMatcher.end();
284 } else {
285
286 return 0;
287 }
288 }
289
290 /***
291 * Returns whether whitespace is prepended in a normalized form ({@link
292 * #getNormalizedWhitespace()}) to those tokens where {@link
293 * #hasPrecedingWhitespace()} would return <code>true</code>.
294 * Defaults to <code>false</code>.
295 *
296 * @return whether whitespace is prepended
297 */
298 public final boolean isNormalizedWhitespacePrepended() {
299 return normalizedWhitespacePrepended;
300 }
301
302 /***
303 * Convenience method that checks whether a string matches the defined
304 * whitespace pattern.
305 *
306 * @param text the text to match
307 * @return <code>true</code> iff the given text matches the
308 * defined whitespace pattern or is the empty string
309 */
310 public boolean isValidWhitespace(final String text) {
311 if (text.length() > 0) {
312 whitespaceMatcher.reset(text);
313 return whitespaceMatcher.matches();
314 } else {
315
316 return true;
317 }
318 }
319
320 /***
321 * Whether whitespace (the text between patterns) is checked to
322 * ensure that the defined whitespace pattern is matched. If set to
323 * true (default), a call to {@link #hasPrecedingWhitespace()} or
324 * {@link #precedingWhitespace()} will throw an {@link
325 * java.lang.IllegalArgumentException} if the whitespace preceding the
326 * last read token does not match.
327 *
328 * @return the value of this property
329 */
330 public final boolean isWhitespacePatternEnsured() {
331 return whitespacePatternEnsured;
332 }
333
334 /***
335 * Returns the complete text to the left (preceding) the token returned
336 * by the last call to {@link #nextToken()}. This includes any
337 * {@link #precedingWhitespace()}.
338 *
339 * @return the complete text to the left of the last token
340 * @throws IllegalStateException if this method is called
341 * without a prior call to {@link #nextToken()}
342 */
343 public CharSequence leftText() throws IllegalStateException {
344 if (startOfLastMatch < 0) {
345 throw new IllegalStateException(
346 "Prior call to nextToken() required");
347 }
348
349 return textToTokenize.subSequence(0, startOfLastMatch);
350 }
351
352 /***
353 * Returns the next token, or <code>null</code> if there are no
354 * more tokens left in the provided text. When the tokenizer arrived
355 * at the end of the text, all subsequent calls to this method
356 * will return <code>null</code> until you call one of the
357 * {@link #reset() reset} methods. If the token is preceded by whitespace
358 * and {@link #isNormalizedWhitespacePrepended()} is <code>true</code>,
359 * the returned token will start with the normalized whitespace
360 * representation ({@link #getNormalizedWhitespace()}).
361 *
362 * @return the next token read from the provided text (with or without
363 * prepended whitespace), or <code>null</code> if no tokens are left
364 * @throws IllegalArgumentException if
365 * {@link #isWhitespacePatternEnsured()} and
366 * {@link #isNormalizedWhitespacePrepended()} are <code>true</code> and the
367 * whitespace preceding this token does not match the defined whitespace
368 * pattern
369 */
370 public final String nextToken() throws IllegalArgumentException {
371 final String theToken;
372 final String result;
373
374
375 precedingWhitespace = null;
376
377 if ((startOfLastMatch >= 0)
378 && (startOfLastMatch < textToTokenize.length())) {
379
380
381 afterPenultimateMatch = tokenMatcher.end();
382 }
383
384 if (tokenMatcher.find()) {
385 theToken = tokenMatcher.group();
386
387
388 startOfLastMatch = tokenMatcher.start();
389 } else {
390 theToken = null;
391
392
393 startOfLastMatch = textToTokenize.length();
394 }
395
396 if ((theToken != null) && isNormalizedWhitespacePrepended()
397 && hasPrecedingWhitespace()) {
398
399 result = getNormalizedWhitespace() + theToken;
400 } else {
401
402 result = theToken;
403 }
404
405 return result;
406 }
407
408 /***
409 * Returns the whitespace (i.e., text not matched by any token) preceding
410 * the token returned by the last call to {@link #nextToken()}.
411 * If we arrived at the end of the text to tokenize (last call to
412 * {@link #nextToken()} returned <code>null</code>), this is the
413 * whitespace between the last existing token and the end of the text.
414 *
415 * @return the whitespace preceding the last token, or the empty
416 * string if there is no preceding whitespace (i.e.
417 * {@link #hasPrecedingWhitespace()} would return <code>false</code>)
418 * @throws IllegalStateException if this method is called
419 * without a prior call to {@link #nextToken()}
420 * @throws IllegalArgumentException if
421 * {@link #isWhitespacePatternEnsured()} is <code>true</code> and the
422 * whitespace preceding the last read token does not match the defined
423 * whitespace pattern
424 */
425 public final String precedingWhitespace() throws IllegalStateException,
426 IllegalArgumentException {
427
428 findPrecedingWhitespace();
429 return precedingWhitespace;
430 }
431
432 /***
433 * Checks whether the whitespace (i.e., text not matched by any token)
434 * preceding the token returned by the last call to {@link #nextToken()}
435 * matches the defined whitespace pattern. This method is called
436 * automatically if {@link #isWhitespacePatternEnsured()} is
437 * <code>true</code>. Otherwise it can be called externally to check
438 * whether the whitespace is valid and take appropriate action if required.
439 *
440 * @return <code>true</code> iff the preceding whitespace matches the
441 * specified whitespace pattern or if there is no preceding whitespace
442 * @throws IllegalStateException if this method is called
443 * without a prior call to {@link #nextToken()}
444 */
445 public boolean precedingWhitespaceIsValid() throws IllegalStateException {
446 doFindPrecedingWhitespace();
447 return isValidWhitespace(precedingWhitespace);
448 }
449
450 /***
451 * Initializes or re-initializes the state of this tokenizer
452 * at construction or when resetting the text.
453 */
454 private void reinit() {
455 startOfLastMatch = -1;
456 afterPenultimateMatch = 0;
457 precedingWhitespace = null;
458 }
459
460 /***
461 * Resets this tokenizer, so it will restart at the begin of the
462 * current text.
463 */
464 public final void reset() {
465 tokenMatcher.reset();
466
467
468 reinit();
469 }
470
471 /***
472 * Resets this tokenizer, so it will restart at the begin of the
473 * provided text.
474 *
475 * @param newText the new text to tokenize
476 */
477 public final void reset(final CharSequence newText) {
478 tokenMatcher.reset(newText);
479 textToTokenize = newText;
480
481
482 reinit();
483 }
484
485 /***
486 * Returns the complete text to the right (following) the token returned
487 * by the last call to {@link #nextToken()}. This includes any following
488 * whitespace.
489 *
490 * @return the complete text to the right of the last token
491 * @throws IllegalStateException if this method is called
492 * without a prior call to {@link #nextToken()}
493 */
494 public CharSequence rightText() throws IllegalStateException {
495 if (startOfLastMatch < 0) {
496 throw new IllegalStateException(
497 "Prior call to nextToken() required");
498 }
499
500
501 return textToTokenize.subSequence(tokenMatcher.end(),
502 textToTokenize.length());
503 }
504
505 /***
506 * Changes the normalized whitespace representation prepended if {@link
507 * #isNormalizedWhitespacePrepended()} is <code>true</code>.
508 *
509 * @param newValue the new value
510 */
511 public final void setNormalizedWhitespace(final String newValue) {
512 normalizedWhitespace = newValue;
513 }
514
515 /***
516 * Changes whether whitespace is prepended in a normalized form ({@link
517 * #getNormalizedWhitespace()}) to those tokens where {@link
518 * #hasPrecedingWhitespace()} would return <code>true</code>.
519 *
520 * @param newValue the new value
521 */
522 public final void setNormalizedWhitespacePrepended(final boolean newValue) {
523 normalizedWhitespacePrepended = newValue;
524 }
525
526 /***
527 * Specifies whether whitespace (the text between patterns) is checked to
528 * ensure that the defined whitespace pattern is matched. If set to
529 * true (default), a call to {@link #hasPrecedingWhitespace()} or
530 * {@link #precedingWhitespace()} will throw an {@link
531 * java.lang.IllegalArgumentException} if the whitespace preceding the
532 * last read token does not match.
533 *
534 * @param ensured the new value of this property
535 */
536 public final void setWhitespacePatternEnsured(final boolean ensured) {
537 whitespacePatternEnsured = ensured;
538 }
539
540 /***
541 * Returns a string representation of this object.
542 *
543 * @return a textual representation
544 */
545 public String toString() {
546 final ToStringBuilder builder = new ToStringBuilder(this).
547 append("token pattern", tokenMatcher.pattern().pattern()).
548 append("whitespace pattern", whitespaceMatcher.pattern().pattern()).
549 append("whitespace pattern ensured", whitespacePatternEnsured);
550
551 if (normalizedWhitespacePrepended) {
552 builder.append("normalized whitespace (is prepended)",
553 normalizedWhitespace);
554 }
555 return builder.toString();
556 }
557
558 /***
559 * Convenience method that counts the number of whitespace characters at the
560 * end of a string, according to the defined whitespace pattern.
561 *
562 * @param text the text to check
563 * @return the number of whitespace characters at the end, of 0 if there
564 * are none
565 */
566 public int trailingWhitespaceCount(final String text) {
567
568 final Pattern trailingWSPattern = Pattern.compile(
569 whitespaceMatcher.pattern().pattern() + "//z");
570 final Matcher trailingWSMatcher = trailingWSPattern.matcher(text);
571
572 if (trailingWSMatcher.find()) {
573 return trailingWSMatcher.group().length();
574 } else {
575
576 return 0;
577 }
578 }
579
580 }