View Javadoc

1   /*
2    * Copyright (C) 2003-2006 Christian Siefkes <christian@siefkes.net>.
3    * Development of this software is supported by the German Research Society,
4    * Berlin-Brandenburg Graduate School in Distributed Information Systems
5    * (DFG grant no. GRK 316).
6    *
7    * This program is free software; you can redistribute it and/or modify
8    * it under the terms of the GNU General Public License as published by
9    * the Free Software Foundation; either version 2 of the License, or
10   * (at your option) any later version.
11   *
12   * This program is distributed in the hope that it will be useful,
13   * but WITHOUT ANY WARRANTY; without even the implied warranty of
14   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15   * GNU General Public License for more details.
16   *
17   * You should have received a copy of the GNU General Public License
18   * along with this program; if not, visit
19   * http://www.gnu.org/licenses/gpl.html or write to the Free Software
20   * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
21   */
22  package de.fu_berlin.ties.context;
23  
24  import java.io.File;
25  import java.io.IOException;
26  import java.io.Reader;
27  import java.util.HashMap;
28  import java.util.List;
29  import java.util.Map;
30  
31  import org.apache.commons.lang.builder.ToStringBuilder;
32  import org.dom4j.Document;
33  import org.dom4j.DocumentException;
34  import org.dom4j.Element;
35  
36  import de.fu_berlin.ties.ProcessingException;
37  import de.fu_berlin.ties.classify.feature.FeatureExtractor;
38  import de.fu_berlin.ties.classify.feature.FeatureVector;
39  import de.fu_berlin.ties.text.TokenizerFactory;
40  import de.fu_berlin.ties.xml.dom.DOMUtils;
41  
42  /***
43   * Abstract class that manages context representations for entity recognition
44   * and information extraction. Subclasses must implement the
45   * {@link #buildContext(Element, String, String, String, PriorRecognitions,
46   * Map, String)} method for building representations.
47   * 
48   * @author Christian Siefkes
49   * @version $Revision: 1.18 $, $Date: 2006/10/21 16:04:03 $, $Author: siefkes $
50   */
51  public abstract class Representation implements FeatureExtractor {
52  
53      /***
54       * The number of preceding recognitions to represent.
55       */
56      private final int recognitionNumber;
57  
58      /***
59       * Creates a new instance.
60       * 
61       * @param recogNum the number of preceding recognitions to represent
62       */
63      public Representation(final int recogNum) {
64          super();
65          recognitionNumber = recogNum;
66      }
67  
68      /***
69       * Builds the context representation of a document. The default
70       * implementation delegates to the {@link #buildContext(Element,
71       * PriorRecognitions, Map, String)} method, using the root element of the
72       * document.
73       * 
74       * @param document the XML document whose context should be represented
75       * @param priorRecognitions a buffer of the last {@link Recognition}s from
76       * the document, created by calling {@link #initDocument};
77       * might be <code>null</code>
78       * @param featureCache a cache of (local) feature, should be re-used between
79       * all calls for the nodes in a single document (but must not be re-used
80       * when building the context of nodes in different documents!)
81       * @param logPurpose the type of contexts of main interest to the caller
82       * (e.g. "Token" or "Sentence"), used for logging
83       * @return a vector of features considered relevant for representation
84       * @throws ClassCastException if the <code>priorRecognitions</code> buffer
85       * contains objects that aren't {@link Recognition}s
86       */
87      public FeatureVector buildContext(final Document document,
88              final PriorRecognitions priorRecognitions,
89              final Map<Element, List<LocalFeature>> featureCache,
90              final String logPurpose) throws ClassCastException {
91          // delegate using root element
92          return buildContext(document.getRootElement(), priorRecognitions,
93                  featureCache, logPurpose);
94      }
95  
96      /***
97       * Builds the context representation of an element. The default
98       * implementation delegates to the {@link #buildContext(Element, String,
99       * String, String, PriorRecognitions, Map, String)} method, using the
100      * {@linkplain DOMUtils#collectText(org.dom4j.Branch) full textual content}
101      * of the element as <code>mainText</code> and empty strings as
102      * <code>leftText</code> and <code>rightText</code>.
103      * 
104      * @param element the element whose context should be represented
105      * @param priorRecognitions a buffer of the last {@link Recognition}s from
106      * the document, created by calling {@link #initDocument};
107      * might be <code>null</code>
108      * @param featureCache a cache of (local) feature, should be re-used between
109      * all calls for the nodes in a single document (but must not be re-used
110      * when building the context of nodes in different documents!)
111      * @param logPurpose the type of contexts of main interest to the caller
112      * (e.g. "Token" or "Sentence"), used for logging
113      * @return a vector of features considered relevant for representation
114      * @throws ClassCastException if the <code>priorRecognitions</code> buffer
115      * contains objects that aren't {@link Recognition}s
116      */
117     public FeatureVector buildContext(final Element element,
118             final PriorRecognitions priorRecognitions,
119             final Map<Element, List<LocalFeature>> featureCache,
120             final String logPurpose) throws ClassCastException {
121         // delegate to abstract method
122         return buildContext(element, "", DOMUtils.collectText(element), "",
123                 priorRecognitions, featureCache, logPurpose);
124     }
125 
126     /***
127      * Builds the context representation of text in an element.
128      * Returns a feature vector of all context features considered relevant for
129      * representation.
130      * 
131      * @param element the element whose context should be represented
132      * @param leftText textual content to the left of (preceding)
133      * <code>mainText</code>, might be empty
134      * @param mainText the main textual content to represent, might be empty
135      * @param rightText textual content to the right of (following)
136      * <code>mainText</code>, might be empty
137      * @param priorRecognitions a buffer of the last {@link Recognition}s from
138      * the document, created by calling {@link #initDocument};
139      * might be <code>null</code>
140      * @param featureCache a cache of (local) feature, should be re-used between
141      * all calls for the nodes in a single document (but must not be re-used
142      * when building the context of nodes in different documents!)
143      * @param logPurpose the type of contexts of main interest to the caller
144      * (e.g. "Token" or "Sentence"), used for logging
145      * @return a vector of features considered relevant for representation
146      * @throws ClassCastException if the <code>priorRecognitions</code> buffer
147      * contains objects that aren't {@link Recognition}s
148      */
149     public abstract FeatureVector buildContext(final Element element,
150             final String leftText, final String mainText,
151             final String rightText, final PriorRecognitions priorRecognitions,
152             final Map<Element, List<LocalFeature>> featureCache,
153             final String logPurpose) throws ClassCastException;
154 
155     /***
156      * {@inheritDoc} The <code>input</code> text must contain a well-formed
157      * XML element, otherwise this method will not work.
158      */
159     public FeatureVector buildFeatures(final Reader reader)
160     throws IOException, ProcessingException {
161         final Document doc;
162 
163         // parse XML input
164         try {
165             doc = DOMUtils.readDocument(reader);
166         } catch (DocumentException de) {
167             // wrap + rethrow
168             throw new ProcessingException(de);
169         }
170 
171         // delegate
172         return buildContext(doc, null,
173                 new HashMap<Element, List<LocalFeature>>(),
174                 "Feature-Extraction");
175     }
176 
177     /***
178      * Initializes the processing of a new document and creates a buffer
179      * to be filled with prior {@link Recognition}s and passed
180      * as argument to the {@link #buildContext(Element, String, String, String,
181      * PriorRecognitions, Map, String)} method. The caller must
182      * {@link PriorRecognitions#add(Recognition)}new recognitions to the buffer
183      * but it is not necessary to remove them -- the buffer will automatically
184      * delete the oldest recognitions when appropriate.
185      * 
186      * @param filename the name of the file
187      * @param tFactory a factory that can be used for creating tokenizers,
188      * if required
189      * @throws ProcessingException if an error occurs while starting to
190      * process the document
191      * @throws IOException if an I/O error occurs
192      * @return a buffer to be used for collecting prior {@link Recognition}s
193      */
194     public PriorRecognitions initDocument(final File filename,
195             final TokenizerFactory tFactory)
196     throws ProcessingException, IOException {
197         return new PriorRecognitions(getRecognitionNumber());
198     }
199 
200     /***
201      * Returns the number of preceding recognitions to represent.
202      * 
203      * @return the value of the attibute
204      */
205     public int getRecognitionNumber() {
206         return recognitionNumber;
207     }
208 
209     /***
210      * Returns a string representation of this object.
211      * 
212      * @return a textual representation
213      */
214     public String toString() {
215         return new ToStringBuilder(this).append("number of recognitions",
216                 recognitionNumber).toString();
217     }
218 
219 }