View Javadoc

1   /*
2    * Copyright (C) 2004-2006 Christian Siefkes <christian@siefkes.net>.
3    * Development of this software is supported by the German Research Society,
4    * Berlin-Brandenburg Graduate School in Distributed Information Systems
5    * (DFG grant no. GRK 316).
6    *
7    * This program is free software; you can redistribute it and/or modify
8    * it under the terms of the GNU General Public License as published by
9    * the Free Software Foundation; either version 2 of the License, or
10   * (at your option) any later version.
11   *
12   * This program is distributed in the hope that it will be useful,
13   * but WITHOUT ANY WARRANTY; without even the implied warranty of
14   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15   * GNU General Public License for more details.
16   *
17   * You should have received a copy of the GNU General Public License
18   * along with this program; if not, visit
19   * http://www.gnu.org/licenses/gpl.html or write to the Free Software
20   * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
21   */
22  package de.fu_berlin.ties.context;
23  
24  import java.util.Iterator;
25  import java.util.Map;
26  
27  import org.apache.commons.lang.builder.ToStringBuilder;
28  import org.dom4j.Attribute;
29  import org.dom4j.Element;
30  
31  import de.fu_berlin.ties.ProcessingException;
32  import de.fu_berlin.ties.TiesConfiguration;
33  import de.fu_berlin.ties.classify.feature.DefaultFeature;
34  import de.fu_berlin.ties.classify.feature.DefaultFeatureVector;
35  import de.fu_berlin.ties.classify.feature.FeatureVector;
36  import de.fu_berlin.ties.io.IOUtils;
37  import de.fu_berlin.ties.text.TextTokenizer;
38  import de.fu_berlin.ties.text.TextUtils;
39  import de.fu_berlin.ties.text.TokenizerFactory;
40  import de.fu_berlin.ties.xml.dom.DOMUtils;
41  
42  /***
43   * A simple representation of an text in an element in an XML document.
44   * Instances of this class are thread-safe.
45   * 
46   * @author Christian Siefkes
47   * @version $Revision: 1.7 $, $Date: 2006/10/21 16:04:03 $, $Author: siefkes $
48   */
49  public class SimpleRepresentation extends AbstractRepresentation {
50  
51      /***
52       * Used to tokenize the textual content of the elements to represent.
53       * Synchronized on itself.
54       */
55      private final TextTokenizer tokenizer;
56  
57      /***
58       * Creates a new instance based on the
59       * {@linkplain TiesConfiguration#CONF standard configuration}.
60       *
61       * @throws ProcessingException if an error occurs while initializing this
62       * instance
63       */
64      public SimpleRepresentation() throws ProcessingException {
65          this(TiesConfiguration.CONF);
66      }
67  
68      /***
69       * Creates a new instance based on the provided configuration.
70       *
71       * @param config used to configure this instance
72       * @throws ProcessingException if an error occurs while initializing this
73       * instance
74       */
75      public SimpleRepresentation(final TiesConfiguration config)
76      throws ProcessingException {
77          this(config, null);
78      }
79  
80      /***
81       * Creates a new instance based on the provided configuration.
82       *
83       * @param config used to configure this instance
84       * @param suffix this suffix can be appended to the used configuration
85       * parameters to give values that are specific for this instance; may be
86       * <code>null</code>
87       * @throws ProcessingException if an error occurs while initializing this
88       * instance
89       */
90      public SimpleRepresentation(final TiesConfiguration config,
91              final String suffix) throws ProcessingException {
92          this(config.getInt(config.adaptKey(CONFIG_RECOGN_NUM, suffix)),
93              config.getInt(config.adaptKey(CONFIG_SPLIT_MAXIMUM, suffix)),
94              config.getInt(config.adaptKey(CONFIG_STORE_NTH, suffix)),
95              config.getString(config.adaptKey(IOUtils.KEY_LOCAL_CHARSET, suffix),
96                      null),
97              new TokenizerFactory(config, suffix).createTokenizer(""));
98      }
99  
100     /***
101      * Creates a new instance.
102      *
103      * @param recogNum the number of preceding recognitions to represent
104      * @param splitMax the maximum number of subsequences to keep when
105      * a feature value must be split (at whitespace)
106      * @param n Each <em>n</em>-th context representation is stored if &gt; 0;
107      * otherwise no representation is stored
108      * @param outCharset the output character set to use (only used to
109      * store some configurations for inspection purposes, if <code>n</code>
110      * &gt; 0); if <code>null</code>, the default charset of the current
111      * platform is used
112      * @param textTokenizer the tokenizer to use
113      */
114     public SimpleRepresentation(final int recogNum, final int splitMax,
115             final int n, final String outCharset,
116             final TextTokenizer textTokenizer) {
117         super(recogNum, splitMax, n, outCharset);
118         tokenizer = textTokenizer;
119     }
120 
121     /***
122      * Creates a feature and adds it to a feature vector. The feature is
123      * created by joining <code>prefix</code> and <code>value</code>, separated
124      * by a colon as separator character.
125      *
126      * @param features the feature vector to append to
127      * @param prefix the prefix of the new feature
128      * @param value the main value of the new feature
129      */
130     protected void addFeature(final FeatureVector features, final String prefix,
131             final String value) {
132         features.add(new DefaultFeature(prefix + ':' + value));
133     }
134 
135     /***
136      * Adds feature(s) representing text to a feature vector, using the
137      * instance tokenizer for splitting the text into tokens.
138      *
139      * @param features the feature vector to append to
140      * @param prefix the prefix of the new feature(s)
141      * @param text to text to tokenize and add
142      */
143     protected void addText(final FeatureVector features, final String prefix,
144             final String text) {
145         // synchronize on the tokenizer because it isn't thread-safe
146         synchronized (tokenizer) {
147             tokenizer.reset(text);
148             String token;
149 
150             while ((token = tokenizer.nextToken()) != null) {
151                 addFeature(features, prefix, token);
152             }
153         }
154     }
155 
156     /***
157      * {@inheritDoc}
158      */
159     protected FeatureVector doBuildContext(final Element element,
160             final String leftText, final String mainText,
161             final String rightText, final PriorRecognitions priorRecognitions,
162             final Map featureCache, final String logPurpose)
163     throws ClassCastException {
164         final FeatureVector result = new DefaultFeatureVector();
165 
166         // store name of element
167         addFeature(result, "type", DOMUtils.name(element));
168 
169         // store attributes, splitting values at whitespace
170         final Iterator attribIter = element.attributeIterator();
171         Attribute attrib;
172         String name;
173         String[] splittedVal;
174         int i;
175 
176         while (attribIter.hasNext()) {
177             attrib = (Attribute) attribIter.next();
178             name = DOMUtils.name(attrib);
179             splittedVal = TextUtils.splitString(attrib.getValue(),
180                     getSplitMaximum());
181 
182             for (i = 0; i < splittedVal.length; i++) {
183                 addFeature(result, '@' + name, splittedVal[i]);
184             }
185         }
186 
187         // tokenize + store left + main + right text
188         addText(result, "left", leftText);
189         addText(result, "text", mainText);
190         addText(result, "right", rightText);
191 
192         // add prior recognitions, if any (only the types, not the text)
193         if (priorRecognitions != null) {
194             PriorRecognitions.Pair pair;
195             Recognition recognition;
196             Iterator priorIter = priorRecognitions.iterator();
197 
198             while (priorIter.hasNext()) {
199                 pair = (PriorRecognitions.Pair) priorIter.next();
200                 recognition = pair.getRecognition();
201                 addFeature(result, "prior", recognition.getType());
202             }
203         }
204 
205         return result;
206     }
207 
208     /***
209      * Returns a string representation of this object.
210      *
211      * @return a textual representation
212      */
213     public String toString() {
214         return new ToStringBuilder(this)
215             .appendSuper(super.toString())
216             .append("tokenizer", tokenizer)
217             .toString();
218     }
219 
220 }