1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22 package de.fu_berlin.ties.text;
23
24 import org.apache.commons.lang.ArrayUtils;
25 import org.apache.commons.lang.builder.ToStringBuilder;
26
27 import de.fu_berlin.ties.TiesConfiguration;
28
29 /***
30 * Factory for creating {@link de.fu_berlin.ties.text.TextTokenizer}s of
31 * different types.
32 *
33 * @author Christian Siefkes
34 * @version $Revision: 1.8 $, $Date: 2006/10/21 16:04:25 $, $Author: siefkes $
35 */
36 public class TokenizerFactory {
37
38 /***
39 * Configuration key for the array of regular expressions defining the token
40 * types accepted by the tokenizer.
41 */
42 public static final String CONFIG_TOKEN_PATTERNS = "tokenizer.pattern";
43
44 /***
45 * Configuration key for the regular expression giving the whitespace
46 * accepted by the tokenizer.
47 */
48 public static final String CONFIG_WHITESPACE_PATTERN =
49 "tokenizer.whitespace";
50
51 /***
52 * Pattern string capturing whitespace and control/other characters.
53 */
54 public static final String WHITESPACE_CONTROL_OTHER = "[//p{Z}//p{C}]*";
55
56 /***
57 * Static factory method to create an instance for tokenizing alphanumeric
58 * and symbol sequences and puntuation. Token types:
59 *
60 * <ul>
61 * <li>sequences of letters and digits (incl. marks, Unicode categories
62 * "L", "M", "N") -- full token is captured for
63 * {@link TextTokenizer#capturedText()}</li>
64 * <li>sequences of symbols ("S" category) -- nothing is captured</li>
65 * <li>a single punctuation sign, possibly repeated ("P" category) --
66 * nothing is captured</li>
67 * </ul>
68 *
69 * <p>When you are only interested in words and numbers (e.g. for indexing),
70 * you can use the {@link TextTokenizer#capturedText() captured text} --
71 * it will contain the full token for alphanumeric sequences, it will be
72 * empty for symbols and punctuation.
73 *
74 * <p>The whitespace pattern comprised a sequence of whitespace and
75 * control/other characters ("C" and "Z" categories).
76 *
77 * @param text the text to tokenize
78 * @return the created tokenizer
79 */
80 public static TextTokenizer createAlnumTokenizer(final CharSequence text) {
81 final TextTokenizer result = new TextTokenizer(new String[] {
82 "([//p{L}//p{M}//p{N}]+)",
83 "//p{S}+",
84 "(//p{P})//1*",
85 },
86 WHITESPACE_CONTROL_OTHER,
87 text);
88 return result;
89 }
90
91 /***
92 * Static factory method to create an instance for tokenizing according to
93 * Unicode categories. Token types:
94 *
95 * <ul>
96 * <li>sequences of letters (incl. marks, "L" and "M" categories) -- full
97 * token is captured for {@link TextTokenizer#capturedText()}</li>
98 * <li>sequences of digits (integral numbers, "N" category) -- full token is
99 * captured</li>
100 * <li>sequences of symbols ("S" category) -- nothing is captured</li>
101 * <li>sequences of punctuation ("P" category) -- nothing is captured</li>
102 * </ul>
103 *
104 * <p>When you are only interested in words and numbers (e.g. for indexing),
105 * you can use the {@link TextTokenizer#capturedText() captured text} --
106 * it will contain the full token for letter and digit sequences, it will
107 * be empty for symbols and punctuation.
108 *
109 * <p>The whitespace pattern comprised a sequence of whitespace and
110 * control/other characters ("C" and "Z" categories).
111 *
112 * @param text the text to tokenize
113 * @return the created tokenizer
114 */
115 public static TextTokenizer createCategoryTokenizer(
116 final CharSequence text) {
117 final TextTokenizer result = new TextTokenizer(new String[] {
118 "([//p{L}//p{M}]+)",
119 "(//p{N}+)",
120 "//p{S}+",
121 "//p{P}+",
122 },
123 WHITESPACE_CONTROL_OTHER,
124 text);
125 return result;
126 }
127
128 /***
129 * Static factory method to create an instance that uses the "thorough"
130 * patterns listed below.
131 *
132 * <ul>
133 * <li>sequences of letters (incl. marks)</li>
134 * <li>sequences of digits (integral numbers)</li>
135 * <li>sequences of math symbols</li>
136 * <li>sequences of currency symbols</li>
137 * <li>sequences of other symbols (modifiers and misc)</li>
138 * <li>a single punctuation sign, possibly repeated</li>
139 * </ul>
140 *
141 * <p>These patterns don't contain any useful information for
142 * {@link TextTokenizer#capturedText()}.
143 *
144 * <p>The whitespace pattern comprised a sequence of whitespace and
145 * control/other characters.
146 *
147 * @param text the text to tokenize
148 * @return the created tokenizer
149 */
150 public static TextTokenizer createThoroughTokenizer(
151 final CharSequence text) {
152 final TextTokenizer result = new TextTokenizer(new String[] {
153 "[//p{L}//p{M}]+",
154 "//p{N}+",
155 "//p{Sm}+",
156 "//p{Sc}+",
157 "[//p{Sk}//p{So}]+",
158 "(//p{P})//1*",
159 },
160 WHITESPACE_CONTROL_OTHER,
161 text);
162 return result;
163 }
164
165 /***
166 * Array of regular expressions strings defining the token types accepted by
167 * the tokenizer.
168 */
169 private final String[] tokenPatterns;
170
171 /***
172 * Regular expression giving the whitespace accepted by the tokenizer.
173 */
174 private final String whitespacePattern;
175
176 /***
177 * Creates a new instance from the {@link #CONFIG_TOKEN_PATTERNS} and
178 * {@link #CONFIG_WHITESPACE_PATTERN} keys of the provided configuration.
179 *
180 * @param config the configuration to use
181 */
182 public TokenizerFactory(final TiesConfiguration config) {
183 this(config, null);
184 }
185
186 /***
187 * Creates a new instance from the {@link #CONFIG_TOKEN_PATTERNS} and
188 * {@link #CONFIG_WHITESPACE_PATTERN} keys of the provided configuration,
189 * {@linkplain TiesConfiguration#adaptKey(String, String) adapted} by
190 * appending the <code>suffix</code>.
191 *
192 * @param config the configuration to use
193 * @param suffix the suffix to append to the keys
194 */
195 public TokenizerFactory(final TiesConfiguration config,
196 final String suffix) {
197 super();
198 final String tokenKey = config.adaptKey(CONFIG_TOKEN_PATTERNS, suffix);
199 final String whitespaceKey =
200 config.adaptKey(CONFIG_WHITESPACE_PATTERN, suffix);
201 tokenPatterns = config.getStringArray(tokenKey);
202 whitespacePattern = config.getString(whitespaceKey);
203 }
204
205 /***
206 * Factory method to create an instance from the configured token
207 * and whitespace patterns.
208 *
209 * @param text the text to tokenize
210 * @return the created tokenizer
211 */
212 public TextTokenizer createTokenizer(final CharSequence text) {
213 final TextTokenizer result =
214 new TextTokenizer(tokenPatterns, whitespacePattern, text);
215 return result;
216 }
217
218 /***
219 * Returns a string representation of this object.
220 *
221 * @return a textual representation
222 */
223 public String toString() {
224 return new ToStringBuilder(this)
225 .append("token patterns", ArrayUtils.toString(tokenPatterns))
226 .append("whitespace pattern", whitespacePattern)
227 .toString();
228 }
229
230 }