1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22 package de.fu_berlin.ties.text;
23
24 import java.io.IOException;
25 import java.io.Reader;
26
27 import org.apache.commons.lang.builder.ToStringBuilder;
28
29 import de.fu_berlin.ties.TiesConfiguration;
30 import de.fu_berlin.ties.classify.feature.DefaultFeatureVector;
31 import de.fu_berlin.ties.classify.feature.FeatureExtractor;
32 import de.fu_berlin.ties.classify.feature.FeatureVector;
33 import de.fu_berlin.ties.io.IOUtils;
34
35 /***
36 * Uses a tokenizer to convert a text into a feature vector. Each token is
37 * stored as a feature, preserving the original order of tokens in a text.
38 *
39 * <p>Instances of this class are not thread-safe and must be synchronizing
40 * externally, if required.
41 *
42 * @author Christian Siefkes
43 * @version $Revision: 1.6 $, $Date: 2006/10/21 16:04:25 $, $Author: siefkes $
44 */
45 public class TokenizingExtractor implements FeatureExtractor {
46
47 /***
48 * Used to split character sequences into tokens.
49 */
50 private final TextTokenizer tokenizer;
51
52
53 /***
54 * Creates a new instance.
55 *
56 * @param conf used to configure this instance
57 * @param suffix optional suffix for
58 * {@linkplain de.fu_berlin.ties.TiesConfiguration#adaptKey(String, String)
59 * adapting configuration keys} if not <code>null</code>
60 */
61 public TokenizingExtractor(final TiesConfiguration conf,
62 final String suffix) {
63 super();
64 final TokenizerFactory factory = new TokenizerFactory(conf, suffix);
65 tokenizer = factory.createTokenizer("");
66 }
67
68
69 /***
70 * {@inheritDoc}
71 */
72 public FeatureVector buildFeatures(final Reader reader) throws IOException {
73 final FeatureVector features = new DefaultFeatureVector();
74 features.addAllTokens(IOUtils.readToString(reader), tokenizer);
75 return features;
76 }
77
78 /***
79 * Returns the tokenizer used by this instance.
80 * @return the value of the attribute
81 */
82 public TextTokenizer getTokenizer() {
83 return tokenizer;
84 }
85
86 /***
87 * Returns a string representation of this object.
88 *
89 * @return a textual representation
90 */
91 public String toString() {
92 return new ToStringBuilder(this)
93 .append("tokenizer", tokenizer)
94 .toString();
95 }
96
97 }