1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22 package de.fu_berlin.ties.context;
23
24 import java.io.File;
25 import java.io.IOException;
26 import java.io.Reader;
27 import java.util.HashMap;
28 import java.util.List;
29 import java.util.Map;
30
31 import org.apache.commons.lang.builder.ToStringBuilder;
32 import org.dom4j.Document;
33 import org.dom4j.DocumentException;
34 import org.dom4j.Element;
35
36 import de.fu_berlin.ties.ProcessingException;
37 import de.fu_berlin.ties.classify.feature.FeatureExtractor;
38 import de.fu_berlin.ties.classify.feature.FeatureVector;
39 import de.fu_berlin.ties.text.TokenizerFactory;
40 import de.fu_berlin.ties.xml.dom.DOMUtils;
41
42 /***
43 * Abstract class that manages context representations for entity recognition
44 * and information extraction. Subclasses must implement the
45 * {@link #buildContext(Element, String, String, String, PriorRecognitions,
46 * Map, String)} method for building representations.
47 *
48 * @author Christian Siefkes
49 * @version $Revision: 1.18 $, $Date: 2006/10/21 16:04:03 $, $Author: siefkes $
50 */
51 public abstract class Representation implements FeatureExtractor {
52
53 /***
54 * The number of preceding recognitions to represent.
55 */
56 private final int recognitionNumber;
57
58 /***
59 * Creates a new instance.
60 *
61 * @param recogNum the number of preceding recognitions to represent
62 */
63 public Representation(final int recogNum) {
64 super();
65 recognitionNumber = recogNum;
66 }
67
68 /***
69 * Builds the context representation of a document. The default
70 * implementation delegates to the {@link #buildContext(Element,
71 * PriorRecognitions, Map, String)} method, using the root element of the
72 * document.
73 *
74 * @param document the XML document whose context should be represented
75 * @param priorRecognitions a buffer of the last {@link Recognition}s from
76 * the document, created by calling {@link #initDocument};
77 * might be <code>null</code>
78 * @param featureCache a cache of (local) feature, should be re-used between
79 * all calls for the nodes in a single document (but must not be re-used
80 * when building the context of nodes in different documents!)
81 * @param logPurpose the type of contexts of main interest to the caller
82 * (e.g. "Token" or "Sentence"), used for logging
83 * @return a vector of features considered relevant for representation
84 * @throws ClassCastException if the <code>priorRecognitions</code> buffer
85 * contains objects that aren't {@link Recognition}s
86 */
87 public FeatureVector buildContext(final Document document,
88 final PriorRecognitions priorRecognitions,
89 final Map<Element, List<LocalFeature>> featureCache,
90 final String logPurpose) throws ClassCastException {
91
92 return buildContext(document.getRootElement(), priorRecognitions,
93 featureCache, logPurpose);
94 }
95
96 /***
97 * Builds the context representation of an element. The default
98 * implementation delegates to the {@link #buildContext(Element, String,
99 * String, String, PriorRecognitions, Map, String)} method, using the
100 * {@linkplain DOMUtils#collectText(org.dom4j.Branch) full textual content}
101 * of the element as <code>mainText</code> and empty strings as
102 * <code>leftText</code> and <code>rightText</code>.
103 *
104 * @param element the element whose context should be represented
105 * @param priorRecognitions a buffer of the last {@link Recognition}s from
106 * the document, created by calling {@link #initDocument};
107 * might be <code>null</code>
108 * @param featureCache a cache of (local) feature, should be re-used between
109 * all calls for the nodes in a single document (but must not be re-used
110 * when building the context of nodes in different documents!)
111 * @param logPurpose the type of contexts of main interest to the caller
112 * (e.g. "Token" or "Sentence"), used for logging
113 * @return a vector of features considered relevant for representation
114 * @throws ClassCastException if the <code>priorRecognitions</code> buffer
115 * contains objects that aren't {@link Recognition}s
116 */
117 public FeatureVector buildContext(final Element element,
118 final PriorRecognitions priorRecognitions,
119 final Map<Element, List<LocalFeature>> featureCache,
120 final String logPurpose) throws ClassCastException {
121
122 return buildContext(element, "", DOMUtils.collectText(element), "",
123 priorRecognitions, featureCache, logPurpose);
124 }
125
126 /***
127 * Builds the context representation of text in an element.
128 * Returns a feature vector of all context features considered relevant for
129 * representation.
130 *
131 * @param element the element whose context should be represented
132 * @param leftText textual content to the left of (preceding)
133 * <code>mainText</code>, might be empty
134 * @param mainText the main textual content to represent, might be empty
135 * @param rightText textual content to the right of (following)
136 * <code>mainText</code>, might be empty
137 * @param priorRecognitions a buffer of the last {@link Recognition}s from
138 * the document, created by calling {@link #initDocument};
139 * might be <code>null</code>
140 * @param featureCache a cache of (local) feature, should be re-used between
141 * all calls for the nodes in a single document (but must not be re-used
142 * when building the context of nodes in different documents!)
143 * @param logPurpose the type of contexts of main interest to the caller
144 * (e.g. "Token" or "Sentence"), used for logging
145 * @return a vector of features considered relevant for representation
146 * @throws ClassCastException if the <code>priorRecognitions</code> buffer
147 * contains objects that aren't {@link Recognition}s
148 */
149 public abstract FeatureVector buildContext(final Element element,
150 final String leftText, final String mainText,
151 final String rightText, final PriorRecognitions priorRecognitions,
152 final Map<Element, List<LocalFeature>> featureCache,
153 final String logPurpose) throws ClassCastException;
154
155 /***
156 * {@inheritDoc} The <code>input</code> text must contain a well-formed
157 * XML element, otherwise this method will not work.
158 */
159 public FeatureVector buildFeatures(final Reader reader)
160 throws IOException, ProcessingException {
161 final Document doc;
162
163
164 try {
165 doc = DOMUtils.readDocument(reader);
166 } catch (DocumentException de) {
167
168 throw new ProcessingException(de);
169 }
170
171
172 return buildContext(doc, null,
173 new HashMap<Element, List<LocalFeature>>(),
174 "Feature-Extraction");
175 }
176
177 /***
178 * Initializes the processing of a new document and creates a buffer
179 * to be filled with prior {@link Recognition}s and passed
180 * as argument to the {@link #buildContext(Element, String, String, String,
181 * PriorRecognitions, Map, String)} method. The caller must
182 * {@link PriorRecognitions#add(Recognition)}new recognitions to the buffer
183 * but it is not necessary to remove them -- the buffer will automatically
184 * delete the oldest recognitions when appropriate.
185 *
186 * @param filename the name of the file
187 * @param tFactory a factory that can be used for creating tokenizers,
188 * if required
189 * @throws ProcessingException if an error occurs while starting to
190 * process the document
191 * @throws IOException if an I/O error occurs
192 * @return a buffer to be used for collecting prior {@link Recognition}s
193 */
194 public PriorRecognitions initDocument(final File filename,
195 final TokenizerFactory tFactory)
196 throws ProcessingException, IOException {
197 return new PriorRecognitions(getRecognitionNumber());
198 }
199
200 /***
201 * Returns the number of preceding recognitions to represent.
202 *
203 * @return the value of the attibute
204 */
205 public int getRecognitionNumber() {
206 return recognitionNumber;
207 }
208
209 /***
210 * Returns a string representation of this object.
211 *
212 * @return a textual representation
213 */
214 public String toString() {
215 return new ToStringBuilder(this).append("number of recognitions",
216 recognitionNumber).toString();
217 }
218
219 }