View Javadoc

1   /*
2    * Copyright (C) 2003-2004 Christian Siefkes <christian@siefkes.net>.
3    * Development of this software is supported by the German Research Society,
4    * Berlin-Brandenburg Graduate School in Distributed Information Systems
5    * (DFG grant no. GRK 316).
6    *
7    * This library is free software; you can redistribute it and/or
8    * modify it under the terms of the GNU Lesser General Public
9    * License as published by the Free Software Foundation; either
10   * version 2.1 of the License, or (at your option) any later version.
11   *
12   * This library is distributed in the hope that it will be useful,
13   * but WITHOUT ANY WARRANTY; without even the implied warranty of
14   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15   * Lesser General Public License for more details.
16   *
17   * You should have received a copy of the GNU Lesser General Public
18   * License along with this library; if not, visit
19   * http://www.gnu.org/licenses/lgpl.html or write to the Free Software
20   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
21   */
22  package de.fu_berlin.ties.context;
23  
24  import java.util.List;
25  import java.util.Map;
26  
27  import org.apache.commons.lang.builder.ToStringBuilder;
28  import org.dom4j.Document;
29  import org.dom4j.Element;
30  
31  import de.fu_berlin.ties.classify.feature.FeatureVector;
32  import de.fu_berlin.ties.xml.dom.DOMUtils;
33  
34  /***
35   * Abstract class that manages context representations for entity recognition
36   * and information extraction. Subclasses must implement the
37   * {@link #buildContext(Element, String, String, String, PriorRecognitions,
38   * Map, String)} method for building representations.
39   * 
40   * @author Christian Siefkes
41   * @version $Revision: 1.13 $, $Date: 2004/11/04 15:26:51 $, $Author: siefkes $
42   */
43  public abstract class Representation {
44  
45      /***
46       * The number of preceding recognitions to represent.
47       */
48      private final int recognitionNumber;
49  
50      /***
51       * Creates a new instance.
52       * 
53       * @param recogNum the number of preceding recognitions to represent
54       */
55      public Representation(final int recogNum) {
56          super();
57          recognitionNumber = recogNum;
58      }
59  
60      /***
61       * Builds the context representation of a document. The default
62       * implementation delegates to the {@link #buildContext(Element,
63       * PriorRecognitions, Map, String)} method, using the root element of the
64       * document.
65       * 
66       * @param document the XML document whose context should be represented
67       * @param priorRecognitions a buffer of the last {@link Recognition}s from
68       * the document, created by calling {@link #createRecognitionBuffer()};
69       * might be <code>null</code>
70       * @param featureCache a cache of (local) feature, should be re-used between
71       * all calls for the nodes in a single document (but must not be re-used
72       * when building the context of nodes in different documents!)
73       * @param logPurpose the type of contexts of main interest to the caller
74       * (e.g. "Token" or "Sentence"), used for logging
75       * @return a vector of features considered relevant for representation
76       * @throws ClassCastException if the <code>priorRecognitions</code> buffer
77       * contains objects that aren't {@link Recognition}s
78       */
79      public FeatureVector buildContext(final Document document,
80              final PriorRecognitions priorRecognitions,
81              final Map<Element, List<LocalFeature>> featureCache,
82              final String logPurpose) throws ClassCastException {
83          // delegate using root element
84          return buildContext(document.getRootElement(), priorRecognitions,
85                  featureCache, logPurpose);
86      }
87  
88      /***
89       * Builds the context representation of an element. The default
90       * implementation delegates to the {@link #buildContext(Element, String,
91       * String, String, PriorRecognitions, Map, String)} method, using the
92       * {@linkplain DOMUtils#collectText(org.dom4j.Branch) full textual content}
93       * of the element as <code>mainText</code> and empty strings as
94       * <code>leftText</code> and <code>rightText</code>.
95       * 
96       * @param element the element whose context should be represented
97       * @param priorRecognitions a buffer of the last {@link Recognition}s from
98       * the document, created by calling {@link #createRecognitionBuffer()};
99       * might be <code>null</code>
100      * @param featureCache a cache of (local) feature, should be re-used between
101      * all calls for the nodes in a single document (but must not be re-used
102      * when building the context of nodes in different documents!)
103      * @param logPurpose the type of contexts of main interest to the caller
104      * (e.g. "Token" or "Sentence"), used for logging
105      * @return a vector of features considered relevant for representation
106      * @throws ClassCastException if the <code>priorRecognitions</code> buffer
107      * contains objects that aren't {@link Recognition}s
108      */
109     public FeatureVector buildContext(final Element element,
110             final PriorRecognitions priorRecognitions,
111             final Map<Element, List<LocalFeature>> featureCache,
112             final String logPurpose) throws ClassCastException {
113         // delegate to abstract method
114         return buildContext(element, "", DOMUtils.collectText(element), "",
115                 priorRecognitions, featureCache, logPurpose);
116     }
117 
118     /***
119      * Builds the context representation of text in an element.
120      * Returns a feature vector of all context features considered relevant for
121      * representation.
122      * 
123      * @param element the element whose context should be represented
124      * @param leftText textual content to the left of (preceding)
125      * <code>mainText</code>, might be empty
126      * @param mainText the main textual content to represent, might be empty
127      * @param rightText textual content to the right of (following)
128      * <code>mainText</code>, might be empty
129      * @param priorRecognitions a buffer of the last {@link Recognition}s from
130      * the document, created by calling {@link #createRecognitionBuffer()};
131      * might be <code>null</code>
132      * @param featureCache a cache of (local) feature, should be re-used between
133      * all calls for the nodes in a single document (but must not be re-used
134      * when building the context of nodes in different documents!)
135      * @param logPurpose the type of contexts of main interest to the caller
136      * (e.g. "Token" or "Sentence"), used for logging
137      * @return a vector of features considered relevant for representation
138      * @throws ClassCastException if the <code>priorRecognitions</code> buffer
139      * contains objects that aren't {@link Recognition}s
140      */
141     public abstract FeatureVector buildContext(final Element element,
142             final String leftText, final String mainText,
143             final String rightText, final PriorRecognitions priorRecognitions,
144             final Map<Element, List<LocalFeature>> featureCache,
145             final String logPurpose) throws ClassCastException;
146 
147     /***
148      * Creates a buffer to be filled with prior {@link Recognition}s and passed
149      * as argument to the {@link #buildContext(Element, String, String, String,
150      * PriorRecognitions, Map, String)} method. The caller must
151      * {@link PriorRecognitions#add(Recognition)}new recognitions to the buffer
152      * but it is not necessary to remove them -- the buffer will automatically
153      * delete the oldest recognitions when appropriate.
154      * 
155      * @return a buffer to be used for collecting prior {@link Recognition}s
156      */
157     public PriorRecognitions createRecognitionBuffer() {
158         return new PriorRecognitions(getRecognitionNumber());
159     }
160 
161     /***
162      * Returns the number of preceding recognitions to represent.
163      * 
164      * @return the value of the attibute
165      */
166     public int getRecognitionNumber() {
167         return recognitionNumber;
168     }
169 
170     /***
171      * Returns a string representation of this object.
172      * 
173      * @return a textual representation
174      */
175     public String toString() {
176         return new ToStringBuilder(this).append("number of recognitions",
177                 recognitionNumber).toString();
178     }
179 
180 }