1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22 package de.fu_berlin.ties.context;
23
24 import java.util.List;
25 import java.util.Map;
26
27 import org.apache.commons.lang.builder.ToStringBuilder;
28 import org.dom4j.Document;
29 import org.dom4j.Element;
30
31 import de.fu_berlin.ties.classify.feature.FeatureVector;
32 import de.fu_berlin.ties.xml.dom.DOMUtils;
33
34 /***
35 * Abstract class that manages context representations for entity recognition
36 * and information extraction. Subclasses must implement the
37 * {@link #buildContext(Element, String, String, String, PriorRecognitions,
38 * Map, String)} method for building representations.
39 *
40 * @author Christian Siefkes
41 * @version $Revision: 1.13 $, $Date: 2004/11/04 15:26:51 $, $Author: siefkes $
42 */
43 public abstract class Representation {
44
45 /***
46 * The number of preceding recognitions to represent.
47 */
48 private final int recognitionNumber;
49
50 /***
51 * Creates a new instance.
52 *
53 * @param recogNum the number of preceding recognitions to represent
54 */
55 public Representation(final int recogNum) {
56 super();
57 recognitionNumber = recogNum;
58 }
59
60 /***
61 * Builds the context representation of a document. The default
62 * implementation delegates to the {@link #buildContext(Element,
63 * PriorRecognitions, Map, String)} method, using the root element of the
64 * document.
65 *
66 * @param document the XML document whose context should be represented
67 * @param priorRecognitions a buffer of the last {@link Recognition}s from
68 * the document, created by calling {@link #createRecognitionBuffer()};
69 * might be <code>null</code>
70 * @param featureCache a cache of (local) feature, should be re-used between
71 * all calls for the nodes in a single document (but must not be re-used
72 * when building the context of nodes in different documents!)
73 * @param logPurpose the type of contexts of main interest to the caller
74 * (e.g. "Token" or "Sentence"), used for logging
75 * @return a vector of features considered relevant for representation
76 * @throws ClassCastException if the <code>priorRecognitions</code> buffer
77 * contains objects that aren't {@link Recognition}s
78 */
79 public FeatureVector buildContext(final Document document,
80 final PriorRecognitions priorRecognitions,
81 final Map<Element, List<LocalFeature>> featureCache,
82 final String logPurpose) throws ClassCastException {
83
84 return buildContext(document.getRootElement(), priorRecognitions,
85 featureCache, logPurpose);
86 }
87
88 /***
89 * Builds the context representation of an element. The default
90 * implementation delegates to the {@link #buildContext(Element, String,
91 * String, String, PriorRecognitions, Map, String)} method, using the
92 * {@linkplain DOMUtils#collectText(org.dom4j.Branch) full textual content}
93 * of the element as <code>mainText</code> and empty strings as
94 * <code>leftText</code> and <code>rightText</code>.
95 *
96 * @param element the element whose context should be represented
97 * @param priorRecognitions a buffer of the last {@link Recognition}s from
98 * the document, created by calling {@link #createRecognitionBuffer()};
99 * might be <code>null</code>
100 * @param featureCache a cache of (local) feature, should be re-used between
101 * all calls for the nodes in a single document (but must not be re-used
102 * when building the context of nodes in different documents!)
103 * @param logPurpose the type of contexts of main interest to the caller
104 * (e.g. "Token" or "Sentence"), used for logging
105 * @return a vector of features considered relevant for representation
106 * @throws ClassCastException if the <code>priorRecognitions</code> buffer
107 * contains objects that aren't {@link Recognition}s
108 */
109 public FeatureVector buildContext(final Element element,
110 final PriorRecognitions priorRecognitions,
111 final Map<Element, List<LocalFeature>> featureCache,
112 final String logPurpose) throws ClassCastException {
113
114 return buildContext(element, "", DOMUtils.collectText(element), "",
115 priorRecognitions, featureCache, logPurpose);
116 }
117
118 /***
119 * Builds the context representation of text in an element.
120 * Returns a feature vector of all context features considered relevant for
121 * representation.
122 *
123 * @param element the element whose context should be represented
124 * @param leftText textual content to the left of (preceding)
125 * <code>mainText</code>, might be empty
126 * @param mainText the main textual content to represent, might be empty
127 * @param rightText textual content to the right of (following)
128 * <code>mainText</code>, might be empty
129 * @param priorRecognitions a buffer of the last {@link Recognition}s from
130 * the document, created by calling {@link #createRecognitionBuffer()};
131 * might be <code>null</code>
132 * @param featureCache a cache of (local) feature, should be re-used between
133 * all calls for the nodes in a single document (but must not be re-used
134 * when building the context of nodes in different documents!)
135 * @param logPurpose the type of contexts of main interest to the caller
136 * (e.g. "Token" or "Sentence"), used for logging
137 * @return a vector of features considered relevant for representation
138 * @throws ClassCastException if the <code>priorRecognitions</code> buffer
139 * contains objects that aren't {@link Recognition}s
140 */
141 public abstract FeatureVector buildContext(final Element element,
142 final String leftText, final String mainText,
143 final String rightText, final PriorRecognitions priorRecognitions,
144 final Map<Element, List<LocalFeature>> featureCache,
145 final String logPurpose) throws ClassCastException;
146
147 /***
148 * Creates a buffer to be filled with prior {@link Recognition}s and passed
149 * as argument to the {@link #buildContext(Element, String, String, String,
150 * PriorRecognitions, Map, String)} method. The caller must
151 * {@link PriorRecognitions#add(Recognition)}new recognitions to the buffer
152 * but it is not necessary to remove them -- the buffer will automatically
153 * delete the oldest recognitions when appropriate.
154 *
155 * @return a buffer to be used for collecting prior {@link Recognition}s
156 */
157 public PriorRecognitions createRecognitionBuffer() {
158 return new PriorRecognitions(getRecognitionNumber());
159 }
160
161 /***
162 * Returns the number of preceding recognitions to represent.
163 *
164 * @return the value of the attibute
165 */
166 public int getRecognitionNumber() {
167 return recognitionNumber;
168 }
169
170 /***
171 * Returns a string representation of this object.
172 *
173 * @return a textual representation
174 */
175 public String toString() {
176 return new ToStringBuilder(this).append("number of recognitions",
177 recognitionNumber).toString();
178 }
179
180 }