View Javadoc

1   /*
2    * Copyright (C) 2003-2006 Christian Siefkes <christian@siefkes.net>.
3    * Development of this software is supported by the German Research Society,
4    * Berlin-Brandenburg Graduate School in Distributed Information Systems
5    * (DFG grant no. GRK 316).
6    *
7    * This program is free software; you can redistribute it and/or modify
8    * it under the terms of the GNU General Public License as published by
9    * the Free Software Foundation; either version 2 of the License, or
10   * (at your option) any later version.
11   *
12   * This program is distributed in the hope that it will be useful,
13   * but WITHOUT ANY WARRANTY; without even the implied warranty of
14   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15   * GNU General Public License for more details.
16   *
17   * You should have received a copy of the GNU General Public License
18   * along with this program; if not, visit
19   * http://www.gnu.org/licenses/gpl.html or write to the Free Software
20   * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
21   */
22  package de.fu_berlin.ties.text;
23  
24  import java.util.regex.Matcher;
25  import java.util.regex.Pattern;
26  import java.util.regex.PatternSyntaxException;
27  
28  import org.apache.commons.lang.builder.ToStringBuilder;
29  
30  /***
31   * Splits a text into a sequence of tokens.
32   *
33   * <p>This class is not thread-safe, so if you should want to share
34   * a tokenizer between threads you have to ensure adequate synchronization.
35   *
36   * @author Christian Siefkes
37   * @version $Revision: 1.13 $, $Date: 2006/10/21 16:04:25 $, $Author: siefkes $
38   */
39  public class TextTokenizer {
40  
41      /***
42       * The index of the last character matched by the penultimate
43       * token, plus one; or <code>0</code> if there was no match yet.
44       */
45      private int afterPenultimateMatch;
46  
47      /***
48       * The normalized whitespace representation prepended if {@link
49       * #normalizedWhitespacePrepended} is <code>true</code>. Defaults to a
50       * space character.
51       */
52      private String normalizedWhitespace = " ";
53  
54      /***
55       * Whether whitespace is prepended in a normalized form ({@link
56       * #normalizedWhitespace}) to those tokens where {@link
57       * #hasPrecedingWhitespace()} would return <code>true</code>.
58       * Defaults to <code>false</code>.
59       */
60      private boolean normalizedWhitespacePrepended = false;
61  
62      /***
63       * The whitespace preceding the last read token. Might be the empty
64       * string if there is no preceding whitespace, or <code>null</code>
65       * if the whitespace has not yet been determined.
66       */
67      private String precedingWhitespace;
68  
69      /***
70       * The index of the first character matched by the last
71       * token; or <code>-1</code> if there was no match yet, or
72       * <code>textToTokenize.length()</code> if the last match wasn't
73       * succeesful.
74       */
75      private int startOfLastMatch;
76  
77      /***
78       * The text to tokenize.
79       */
80      private CharSequence textToTokenize;
81  
82      /***
83       * The matcher used to split the text into matching tokens.
84       */
85      private final Matcher tokenMatcher;
86  
87      /***
88       * The whitespace (non-token) matcher.
89       */
90      private final Matcher whitespaceMatcher;
91  
92      /***
93       * Whether whitespace (the text between patterns) is checked to
94       * ensure that the defined whitespace pattern is matched. If set to
95       * true (default), a call to {@link #hasPrecedingWhitespace()} or
96       * {@link #precedingWhitespace()} will throw an {@link
97       * java.lang.IllegalArgumentException} if the whitespace preceding the
98       * last read token does not match.
99       *
100      * <p>It might not be a good idea to enable this, because some characters
101      * that are theoretically not in use in a charset might cause problems.
102      * E.g. the French starting and ending quotes from the Windows charset
103      * (at unused positions in Latin1 + Unicode) are recognized as {@link
104      * java.lang.Character#INITIAL_QUOTE_PUNCTUATION} (<code>Pi</code>
105      * category) or {@link java.lang.Character#FINAL_QUOTE_PUNCTUATION}
106      * (<code>Pf</code>), but in some cases they do not seem to match any
107      * regex category patterns.
108      */
109     private boolean whitespacePatternEnsured = false;
110 
111     /***
112      * Creates a new instance. Only use this constructor if you know what you
113      * are doing! Usually it should be sufficient to use one of the factory
114      * methods provided by {@link TokenizerFactory}.
115      *
116      * @param patterns a list of patterns to accept as tokens; patterns
117      * jointed and compiled with the {@link java.util.regex.Pattern#DOTALL}
118      * flag activated
119      * @param whitespacePattern a pattern that should match all text
120      * between tokens ("whitespace"), to ensure that no text is left out
121      * by mistake; the pattern is compiled with the
122      * {@link java.util.regex.Pattern#DOTALL} flag activated
123      * @param text the text to tokenize
124      * @throws PatternSyntaxException if the syntax of the provided
125      * patterns is invalid
126      */
127     public TextTokenizer(final String[] patterns,
128             final String whitespacePattern, final CharSequence text)
129             throws PatternSyntaxException {
130         super();
131         final StringBuilder patternBuffer;
132 
133         // combine in non-capturing group, if more than one pattern
134         if (patterns.length > 1) {
135             patternBuffer = new StringBuilder("(?:");
136         } else {
137             patternBuffer = new StringBuilder();
138         }
139 
140         // build complete pattern
141         for (int i = 0; i < patterns.length; i++) {
142             patternBuffer.append(patterns[i]);
143 
144             if (i == (patterns.length - 1)) {
145                 if (patterns.length > 1) {
146                     // last of several patterns: close non-capturing group
147                     patternBuffer.append(")");
148                 }
149             } else {
150                 patternBuffer.append("|");
151             }
152         }
153 
154         textToTokenize = text;
155 
156         final Pattern tokenPattern = Pattern.compile(
157             patternBuffer.toString(), Pattern.DOTALL);
158         tokenMatcher = tokenPattern.matcher(text);
159 
160         final Pattern wsPattern = Pattern.compile(whitespacePattern,
161             Pattern.DOTALL);
162         whitespaceMatcher = wsPattern.matcher("");
163 
164         // initialize fields
165         reinit();
166     }
167 
168     /***
169      * Returns the text captured within "capturing groups" in the last token.
170      * All captured text sequences are joint in a single string.
171      * If there were no capturing groups involved in the last match, the
172      * empty string is returned.
173      *
174      * @return the joint text matched within captured groups in the last token
175      * match
176      */
177     public final String capturedText() {
178         final StringBuilder result = new StringBuilder();
179         String currentGroup;
180         for (int i = 1; i <= tokenMatcher.groupCount(); i++) {
181             currentGroup = tokenMatcher.group(i);
182             if (currentGroup != null) {
183                 result.append(currentGroup);
184             }
185         }
186         return result.toString();
187     }
188 
189     /***
190      * Helper method to find the preceding whitespace.
191      *
192      * @throws IllegalStateException if this method is called
193      * without a prior call to {@link #nextToken()}
194      */
195     private void doFindPrecedingWhitespace() throws IllegalStateException {
196         if (startOfLastMatch < 0) {
197             throw new IllegalStateException(
198                 "Prior call to nextToken() required");
199         }
200 
201         if (precedingWhitespace == null) {
202             // otherwise there is nothing to do
203 
204             /* rough debugging
205             System.out.print("/" + afterPenultimateMatch + "/"
206                 + startOfLastMatch + "/");
207             */
208 
209             precedingWhitespace = textToTokenize.subSequence(
210                 afterPenultimateMatch, startOfLastMatch).toString();
211         }
212     }
213 
214     /***
215      * Helper method to find and optionally validate the preceding whitespace.
216      *
217      * @throws IllegalStateException if this method is called
218      * without a prior call to {@link #nextToken()}
219      * @throws IllegalArgumentException if
220      * {@link #isWhitespacePatternEnsured()} is <code>true</code> and the
221      * whitespace preceding the last read token does not match the defined
222      * whitespace pattern
223      */
224     private void findPrecedingWhitespace() throws IllegalStateException,
225             IllegalArgumentException {
226         doFindPrecedingWhitespace();
227 
228         // validate
229         if (isWhitespacePatternEnsured() && (!precedingWhitespaceIsValid())) {
230             throw new IllegalArgumentException(
231                 "Supposed whitespace '" + precedingWhitespace
232                 + "' between position " + afterPenultimateMatch + " and "
233                 + startOfLastMatch + " doesn't match specified pattern "
234                 + whitespaceMatcher.pattern().pattern());
235         }
236     }
237 
238     /***
239      * Returns the normalized whitespace representation prepended if {@link
240      * #isNormalizedWhitespacePrepended()} is <code>true</code>. Defaults to
241      * a space character.
242      *
243      * @return the normalized representation
244      */
245     public final String getNormalizedWhitespace() {
246         return normalizedWhitespace;
247     }
248 
249     /***
250      * Whether the token returned by the last call to {@link #nextToken()}
251      * is preceded by whitespace (i.e., text not matched by any token).
252      * If we arrived at the end of the text to tokenize (last call to
253      * {@link #nextToken()} returned <code>null</code>), this is the whitespace
254      * between the last existing token and the end of the text.
255      *
256      * @return whether the last token is preceded by whitespace
257      * @throws IllegalStateException if this method is called
258      * without a prior call to {@link #nextToken()}
259      * @throws IllegalArgumentException if
260      * {@link #isWhitespacePatternEnsured()} is <code>true</code> and the
261      * whitespace preceding the last read token does not match the defined
262      * whitespace pattern
263      */
264     public final boolean hasPrecedingWhitespace() throws IllegalStateException,
265             IllegalArgumentException {
266         // determine whitespace, if not yet done
267         findPrecedingWhitespace();
268 
269         return precedingWhitespace.length() > 0;
270     }
271 
272     /***
273      * Convenience method that counts the number of whitespace characters at the
274      * begin of a string, according to the defined whitespace pattern.
275      *
276      * @param text the text to check
277      * @return the number of whitespace characters at the begin, of 0 if there
278      * are none
279      */
280     public int initialWhitespaceCount(final String text) {
281         whitespaceMatcher.reset(text);
282         if (whitespaceMatcher.lookingAt()) {
283             return whitespaceMatcher.end();
284         } else {
285             // no initial whitespace
286             return 0;
287         }
288     }
289 
290     /***
291      * Returns whether whitespace is prepended in a normalized form ({@link
292      * #getNormalizedWhitespace()}) to those tokens where {@link
293      * #hasPrecedingWhitespace()} would return <code>true</code>.
294      * Defaults to <code>false</code>.
295      *
296      * @return whether whitespace is prepended
297      */
298     public final boolean isNormalizedWhitespacePrepended() {
299         return normalizedWhitespacePrepended;
300     }
301 
302     /***
303      * Convenience method that checks whether a string matches the defined
304      * whitespace pattern.
305      *
306      * @param text the text to match
307      * @return <code>true</code> iff the given text matches the
308      * defined whitespace pattern or is the empty string
309      */
310     public boolean isValidWhitespace(final String text) {
311         if (text.length() > 0) {
312             whitespaceMatcher.reset(text);
313             return whitespaceMatcher.matches();
314         } else {
315             // empty string always matches
316             return true;
317         }
318     }
319 
320     /***
321      * Whether whitespace (the text between patterns) is checked to
322      * ensure that the defined whitespace pattern is matched. If set to
323      * true (default), a call to {@link #hasPrecedingWhitespace()} or
324      * {@link #precedingWhitespace()} will throw an {@link
325      * java.lang.IllegalArgumentException} if the whitespace preceding the
326      * last read token does not match.
327      *
328      * @return the value of this property
329      */
330     public final boolean isWhitespacePatternEnsured() {
331         return whitespacePatternEnsured;
332     }
333 
334     /***
335      * Returns the complete text to the left (preceding) the token returned
336      * by the last call to {@link #nextToken()}. This includes any
337      * {@link #precedingWhitespace()}.
338      *
339      * @return the complete text to the left of the last token
340      * @throws IllegalStateException if this method is called
341      * without a prior call to {@link #nextToken()}
342      */
343     public CharSequence leftText() throws IllegalStateException {
344         if (startOfLastMatch < 0) {
345             throw new IllegalStateException(
346                 "Prior call to nextToken() required");
347         }
348 
349         return textToTokenize.subSequence(0, startOfLastMatch);
350     }
351 
352     /***
353      * Returns the next token, or <code>null</code> if there are no
354      * more tokens left in the provided text. When the tokenizer arrived
355      * at the end of the text, all subsequent calls to this method
356      * will return <code>null</code> until you call one of the
357      * {@link #reset() reset} methods. If the token is preceded by whitespace
358      * and {@link #isNormalizedWhitespacePrepended()} is <code>true</code>,
359      * the returned token will start with the normalized whitespace
360      * representation ({@link #getNormalizedWhitespace()}).
361      *
362      * @return the next token read from the provided text (with or without
363      * prepended whitespace), or <code>null</code> if no tokens are left
364      * @throws IllegalArgumentException if
365      * {@link #isWhitespacePatternEnsured()} and
366      * {@link #isNormalizedWhitespacePrepended()} are <code>true</code> and the
367      * whitespace preceding this token does not match the defined whitespace
368      * pattern
369      */
370     public final String nextToken() throws IllegalArgumentException {
371         final String theToken;
372         final String result;
373 
374         // reset whitespace
375         precedingWhitespace = null;
376 
377         if ((startOfLastMatch >= 0)
378                 && (startOfLastMatch < textToTokenize.length())) {
379             // there was a successful match before, so we can safely call
380             // this method
381             afterPenultimateMatch = tokenMatcher.end();
382         }
383 
384         if (tokenMatcher.find()) {
385             theToken = tokenMatcher.group();
386 
387             // remember the start of this match
388             startOfLastMatch = tokenMatcher.start();
389         } else {
390             theToken = null;
391 
392             // non-match: matches "after text"
393             startOfLastMatch = textToTokenize.length();
394         }
395 
396         if ((theToken != null) && isNormalizedWhitespacePrepended()
397                 && hasPrecedingWhitespace()) {
398             // prepend whitespace
399             result = getNormalizedWhitespace() + theToken;
400         } else {
401             // return as is
402             result = theToken;
403         }
404 
405         return result;
406     }
407 
408     /***
409      * Returns the whitespace (i.e., text not matched by any token) preceding
410      * the token returned by the last call to {@link #nextToken()}.
411      * If we arrived at the end of the text to tokenize (last call to
412      * {@link #nextToken()} returned <code>null</code>), this is the
413      * whitespace between the last existing token and the end of the text.
414      *
415      * @return the whitespace preceding the last token, or the empty
416      * string if there is no preceding whitespace (i.e.
417      * {@link #hasPrecedingWhitespace()} would return <code>false</code>)
418      * @throws IllegalStateException if this method is called
419      * without a prior call to {@link #nextToken()}
420      * @throws IllegalArgumentException if
421      * {@link #isWhitespacePatternEnsured()} is <code>true</code> and the
422      * whitespace preceding the last read token does not match the defined
423      * whitespace pattern
424      */
425     public final String precedingWhitespace() throws IllegalStateException,
426             IllegalArgumentException {
427         // determine whitespace, if not yet done
428         findPrecedingWhitespace();
429         return precedingWhitespace;
430     }
431 
432     /***
433      * Checks whether the whitespace (i.e., text not matched by any token)
434      * preceding the token returned by the last call to {@link #nextToken()}
435      * matches the defined whitespace pattern. This method is called
436      * automatically if {@link #isWhitespacePatternEnsured()} is
437      * <code>true</code>. Otherwise it can be called externally to check
438      * whether the whitespace is valid and take appropriate action if required.
439      *
440      * @return <code>true</code> iff the preceding whitespace matches the
441      * specified whitespace pattern or if there is no preceding whitespace
442      * @throws IllegalStateException if this method is called
443      * without a prior call to {@link #nextToken()}
444      */
445     public boolean precedingWhitespaceIsValid() throws IllegalStateException {
446         doFindPrecedingWhitespace();
447         return isValidWhitespace(precedingWhitespace);
448     }
449 
450     /***
451      * Initializes or re-initializes the state of this tokenizer
452      * at construction or when resetting the text.
453      */
454     private void reinit() {
455         startOfLastMatch = -1;
456         afterPenultimateMatch = 0;
457         precedingWhitespace = null;
458     }
459 
460     /***
461      * Resets this tokenizer, so it will restart at the begin of the
462      * current text.
463      */
464     public final void reset() {
465         tokenMatcher.reset();
466 
467         // re-initialize fields
468         reinit();
469     }
470 
471     /***
472      * Resets this tokenizer, so it will restart at the begin of the
473      * provided text.
474      *
475      * @param newText the new text to tokenize
476      */
477     public final void reset(final CharSequence newText) {
478         tokenMatcher.reset(newText);
479         textToTokenize = newText;
480 
481         // re-initialize fields
482         reinit();
483     }
484 
485     /***
486      * Returns the complete text to the right (following) the token returned
487      * by the last call to {@link #nextToken()}. This includes any following
488      * whitespace.
489      *
490      * @return the complete text to the right of the last token
491      * @throws IllegalStateException if this method is called
492      * without a prior call to {@link #nextToken()}
493      */
494     public CharSequence rightText() throws IllegalStateException {
495         if (startOfLastMatch < 0) {
496             throw new IllegalStateException(
497                 "Prior call to nextToken() required");
498         }
499 
500         // return everything after the matched sequence
501         return textToTokenize.subSequence(tokenMatcher.end(),
502             textToTokenize.length());
503     }
504 
505     /***
506      * Changes the normalized whitespace representation prepended if {@link
507      * #isNormalizedWhitespacePrepended()} is <code>true</code>.
508      *
509      * @param newValue the new value
510      */
511     public final void setNormalizedWhitespace(final String newValue) {
512         normalizedWhitespace = newValue;
513     }
514 
515     /***
516      * Changes whether whitespace is prepended in a normalized form ({@link
517      * #getNormalizedWhitespace()}) to those tokens where {@link
518      * #hasPrecedingWhitespace()} would return <code>true</code>.
519      *
520      * @param newValue the new value
521      */
522     public final void setNormalizedWhitespacePrepended(final boolean newValue) {
523         normalizedWhitespacePrepended = newValue;
524     }
525 
526     /***
527      * Specifies whether whitespace (the text between patterns) is checked to
528      * ensure that the defined whitespace pattern is matched. If set to
529      * true (default), a call to {@link #hasPrecedingWhitespace()} or
530      * {@link #precedingWhitespace()} will throw an {@link
531      * java.lang.IllegalArgumentException} if the whitespace preceding the
532      * last read token does not match.
533      *
534      * @param ensured the new value of this property
535      */
536     public final void setWhitespacePatternEnsured(final boolean ensured) {
537         whitespacePatternEnsured = ensured;
538     }
539 
540     /***
541      * Returns a string representation of this object.
542      *
543      * @return a textual representation
544      */
545     public String toString() {
546         final ToStringBuilder builder = new ToStringBuilder(this).
547             append("token pattern", tokenMatcher.pattern().pattern()).
548             append("whitespace pattern", whitespaceMatcher.pattern().pattern()).
549             append("whitespace pattern ensured", whitespacePatternEnsured);
550 
551         if (normalizedWhitespacePrepended) {
552             builder.append("normalized whitespace (is prepended)",
553                 normalizedWhitespace);
554         }
555         return builder.toString();
556     }
557 
558     /***
559      * Convenience method that counts the number of whitespace characters at the
560      * end of a string, according to the defined whitespace pattern.
561      *
562      * @param text the text to check
563      * @return the number of whitespace characters at the end, of 0 if there
564      * are none
565      */
566     public int trailingWhitespaceCount(final String text) {
567         // append "\z" (end of input) to whitespace pattern
568         final Pattern trailingWSPattern = Pattern.compile(
569             whitespaceMatcher.pattern().pattern() + "//z");
570         final Matcher trailingWSMatcher = trailingWSPattern.matcher(text);
571 
572         if (trailingWSMatcher.find()) {
573             return trailingWSMatcher.group().length();
574         } else {
575             // no trailing whitespace
576             return 0;
577         }
578     }
579 
580 }