View Javadoc

/*
 * Copyright (C) 2004 Christian Siefkes <christian@siefkes.net>.
 * Development of this software is supported by the German Research Society,
 * Berlin-Brandenburg Graduate School in Distributed Information Systems
 * (DFG grant no. GRK 316).
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, visit
 * http://www.gnu.org/licenses/lgpl.html or write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
 */
22  package de.fu_berlin.ties.text;
23  
24  import org.apache.commons.lang.ArrayUtils;
25  import org.apache.commons.lang.builder.ToStringBuilder;
26  
27  import de.fu_berlin.ties.TiesConfiguration;
28  
29  /***
30   * Factory for creating {@link de.fu_berlin.ties.text.TextTokenizer}s of
31   * different types.
32   *
33   * @author Christian Siefkes
34   * @version $Revision: 1.5 $, $Date: 2004/04/13 07:08:35 $, $Author: siefkes $
35   */
36  public class TokenizerFactory {
37  
38      /***
39       * Configuration key for the array of regular expressions defining the token
40       * types accepted by the tokenizer.
41       */
42      public static final String CONFIG_TOKEN_PATTERNS = "tokenizer.pattern";
43  
44      /***
45       * Configuration key for the regular expression giving the whitespace
46       * accepted by the tokenizer.
47       */
48      public static final String CONFIG_WHITESPACE_PATTERN =
49          "tokenizer.whitespace";
50  
51      /***
52       * Pattern string capturing whitespace and control/other characters.
53       */
54      public static final String WHITESPACE_CONTROL_OTHER = "[//p{Z}//p{C}]*";
55  
56      /***
57       * Static factory method to create an instance for tokenizing alphanumeric
58       * and symbol sequences and puntuation. Token types:
59       *
60       * <ul>
61       * <li>sequences of letters and digits (incl. marks, Unicode categories
62       * "L", "M", "N") -- full token is captured for
63       * {@link TextTokenizer#capturedText()}</li>
64       * <li>sequences of symbols ("S" category) -- nothing is captured</li>
65       * <li>a single punctuation sign, possibly repeated ("P" category) --
66       * nothing is captured</li>
67       * </ul>
68       *
69       * <p>When you are only interested in words and numbers (e.g. for indexing),
70       * you can use the {@link TextTokenizer#capturedText() captured text} --
71       * it will contain the full token for alphanumeric sequences, it will be
72       * empty for symbols and punctuation.
73       *
74       * <p>The whitespace pattern comprised a sequence of whitespace and
75       * control/other characters ("C" and "Z" categories).
76       *
77       * @param text the text to tokenize
78       * @return the created tokenizer
79       */
80      public static TextTokenizer createAlnumTokenizer(final CharSequence text) {
81          final TextTokenizer result = new TextTokenizer(new String[] {
82                  "([//p{L}//p{M}//p{N}]+)", // alphanumeric
83                  "//p{S}+",        // symbols
84                  "(//p{P})//1*",   // single punctuation sign, possibly repeated
85              },
86              WHITESPACE_CONTROL_OTHER, // whitespace + control/other characters
87              text);
88          return result;
89      }
90  
91      /***
92       * Static factory method to create an instance for tokenizing according to
93       * Unicode categories. Token types:
94       *
95       * <ul>
96       * <li>sequences of letters (incl. marks, "L" and "M" categories) -- full
97       * token is captured for {@link TextTokenizer#capturedText()}</li>
98       * <li>sequences of digits (integral numbers, "N" category) -- full token is
99       * captured</li>
100      * <li>sequences of symbols ("S" category) -- nothing is captured</li>
101      * <li>sequences of punctuation ("P" category) -- nothing is captured</li>
102      * </ul>
103      *
104      * <p>When you are only interested in words and numbers (e.g. for indexing),
105      * you can use the {@link TextTokenizer#capturedText() captured text} --
106      * it will contain the full token for letter and digit sequences, it will
107      * be empty for symbols and punctuation.
108      *
109      * <p>The whitespace pattern comprised a sequence of whitespace and
110      * control/other characters ("C" and "Z" categories).
111      *
112      * @param text the text to tokenize
113      * @return the created tokenizer
114      */
115     public static TextTokenizer createCategoryTokenizer(
116                 final CharSequence text) {
117         final TextTokenizer result = new TextTokenizer(new String[] {
118                 "([//p{L}//p{M}]+)",    // letters incl. marks, fully captured
119                 "(//p{N}+)",            // numbers, fully captured
120                 "//p{S}+",              // symbols
121                 "//p{P}+",              // punctuation
122             },
123             WHITESPACE_CONTROL_OTHER,   // whitespace + control/other characters
124             text);
125         return result;
126     }
127 
128     /***
129      * Static factory method to create an instance that uses the "thorough"
130      * patterns listed below.
131      *
132      * <ul>
133      * <li>sequences of letters (incl. marks)</li>
134      * <li>sequences of digits (integral numbers)</li>
135      * <li>sequences of math symbols</li>
136      * <li>sequences of currency symbols</li>
137      * <li>sequences of other symbols (modifiers and misc)</li>
138      * <li>a single punctuation sign, possibly repeated</li>
139      * </ul>
140      *
141      * <p>These patterns don't contain any useful information for
142      * {@link TextTokenizer#capturedText()}.
143      *
144      * <p>The whitespace pattern comprised a sequence of whitespace and
145      * control/other characters.
146      *
147      * @param text the text to tokenize
148      * @return the created tokenizer
149      */
150     public static TextTokenizer createThoroughTokenizer(
151             final CharSequence text) {
152         final TextTokenizer result = new TextTokenizer(new String[] {
153                 "[//p{L}//p{M}]+",    // letters incl. marks
154                 "//p{N}+",            // numbers
155                 "//p{Sm}+",           // math symbols
156                 "//p{Sc}+",           // currency symbols
157                 "[//p{Sk}//p{So}]+",  // modifier and other symbols
158                 "(//p{P})//1*",   // single punctuation sign, possibly repeated
159             },
160             WHITESPACE_CONTROL_OTHER, // whitespace + control/other characters
161             text);
162         return result;
163     }
164 
165     /***
166      * Array of regular expressions strings defining the token types accepted by
167      * the tokenizer.
168      */
169     private final String[] tokenPatterns;
170 
171     /***
172      * Regular expression giving the whitespace accepted by the tokenizer.
173      */
174     private final String whitespacePattern;
175 
176     /***
177      * Creates a new instance from the {@link #CONFIG_TOKEN_PATTERNS} and
178      * {@link #CONFIG_WHITESPACE_PATTERN} keys of the provided configuration.
179      *
180      * @param config the configuration to use
181      */
182     public TokenizerFactory(final TiesConfiguration config) {
183         this(config, null);
184     }
185 
186     /***
187      * Creates a new instance from the {@link #CONFIG_TOKEN_PATTERNS} and
188      * {@link #CONFIG_WHITESPACE_PATTERN} keys of the provided configuration,
189      * {@linkplain TiesConfiguration#adaptKey(String, String) adapted} by
190      * appending the <code>suffix</code>.
191      *
192      * @param config the configuration to use
193      * @param suffix the suffix to append to the keys
194      */
195     public TokenizerFactory(final TiesConfiguration config,
196                             final String suffix) {
197         super();
198         final String tokenKey = config.adaptKey(CONFIG_TOKEN_PATTERNS, suffix);
199         final String whitespaceKey =
200             config.adaptKey(CONFIG_WHITESPACE_PATTERN, suffix);
201         tokenPatterns = config.getStringArray(tokenKey);
202         whitespacePattern = config.getString(whitespaceKey);
203     }
204 
205     /***
206      * Factory method to create an instance from the configured token
207      * and whitespace patterns.
208      *
209      * @param text the text to tokenize
210      * @return the created tokenizer
211      */
212     public TextTokenizer createTokenizer(final CharSequence text) {
213         final TextTokenizer result =
214             new TextTokenizer(tokenPatterns, whitespacePattern, text);
215         return result;
216     }
217 
218     /***
219      * Returns a string representation of this object.
220      *
221      * @return a textual representation
222      */
223     public String toString() {
224         return new ToStringBuilder(this)
225             .append("token patterns", ArrayUtils.toString(tokenPatterns))
226             .append("whitespace pattern", whitespacePattern)
227             .toString();
228     }
229 
230 }