View Javadoc

1   /*
2    * Copyright (C) 2005-2006 Christian Siefkes <christian@siefkes.net>.
3    * Development of this software is supported by the German Research Society,
4    * Berlin-Brandenburg Graduate School in Distributed Information Systems
5    * (DFG grant no. GRK 316).
6    *
7    * This program is free software; you can redistribute it and/or modify
8    * it under the terms of the GNU General Public License as published by
9    * the Free Software Foundation; either version 2 of the License, or
10   * (at your option) any later version.
11   *
12   * This program is distributed in the hope that it will be useful,
13   * but WITHOUT ANY WARRANTY; without even the implied warranty of
14   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15   * GNU General Public License for more details.
16   *
17   * You should have received a copy of the GNU General Public License
18   * along with this program; if not, visit
19   * http://www.gnu.org/licenses/gpl.html or write to the Free Software
20   * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
21   */
22  package de.fu_berlin.ties.text;
23  
24  import java.io.IOException;
25  import java.io.Reader;
26  
27  import org.apache.commons.lang.builder.ToStringBuilder;
28  
29  import de.fu_berlin.ties.TiesConfiguration;
30  import de.fu_berlin.ties.classify.feature.DefaultFeatureVector;
31  import de.fu_berlin.ties.classify.feature.FeatureExtractor;
32  import de.fu_berlin.ties.classify.feature.FeatureVector;
33  import de.fu_berlin.ties.io.IOUtils;
34  
35  /***
36   * Uses a tokenizer to convert a text into a feature vector. Each token is
37   * stored as a feature, preserving the original order of tokens in a text.
38   *
39   * <p>Instances of this class are not thread-safe and must be synchronizing
40   * externally, if required.
41   *
42   * @author Christian Siefkes
43   * @version $Revision: 1.6 $, $Date: 2006/10/21 16:04:25 $, $Author: siefkes $
44   */
45  public class TokenizingExtractor implements FeatureExtractor {
46  
47      /***
48       * Used to split character sequences into tokens.
49       */
50      private final TextTokenizer tokenizer;
51  
52  
53      /***
54       * Creates a new instance.
55       *
56       * @param conf used to configure this instance
57       * @param suffix optional suffix for
58       * {@linkplain de.fu_berlin.ties.TiesConfiguration#adaptKey(String, String)
59       * adapting configuration keys} if not <code>null</code>
60       */
61      public TokenizingExtractor(final TiesConfiguration conf,
62              final String suffix) {
63          super();
64          final TokenizerFactory factory = new TokenizerFactory(conf, suffix);
65          tokenizer = factory.createTokenizer("");
66      }
67  
68  
69      /***
70       * {@inheritDoc}
71       */
72      public FeatureVector buildFeatures(final Reader reader) throws IOException {
73          final FeatureVector features = new DefaultFeatureVector();
74          features.addAllTokens(IOUtils.readToString(reader), tokenizer);
75          return features;
76      }
77  
78      /***
79       * Returns the tokenizer used by this instance.
80       * @return the value of the attribute
81       */
82      public TextTokenizer getTokenizer() {
83          return tokenizer;
84      }
85  
86      /***
87       * Returns a string representation of this object.
88       *
89       * @return a textual representation
90       */
91      public String toString() {
92          return new ToStringBuilder(this)
93              .append("tokenizer", tokenizer)
94              .toString();
95      }
96  
97  }