1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22 package de.fu_berlin.ties.context;
23
24 import java.io.File;
25 import java.io.IOException;
26 import java.io.Writer;
27 import java.util.List;
28 import java.util.Map;
29
30 import org.apache.commons.lang.builder.ToStringBuilder;
31 import org.dom4j.Element;
32
33 import de.fu_berlin.ties.classify.feature.FeatureVector;
34 import de.fu_berlin.ties.io.IOUtils;
35 import de.fu_berlin.ties.util.MutableInt;
36 import de.fu_berlin.ties.util.Util;
37 import de.fu_berlin.ties.xml.dom.DOMUtils;
38
39 /***
40 * Provides basic functionality shared by different representations.
41 *
42 * @author Christian Siefkes
43 * @version $Revision: 1.2 $, $Date: 2004/09/06 17:22:41 $, $Author: siefkes $
44 */
45 public abstract class AbstractRepresentation extends Representation {
46
47 /***
48 * Configuration key: The number of preceding recognitions to represent.
49 */
50 public static final String CONFIG_RECOGN_NUM = "representation.recogn.num";
51
52 /***
53 * Configuration key: The maximum number of subsequences to keep when a
54 * feature value must be split.
55 */
56 public static final String CONFIG_SPLIT_MAXIMUM =
57 "representation.split.maximum";
58
59 /***
60 * Configuration key: Each <em>n</em>-th context representation is stored
61 * for debugging and inspection purposes,if > 0.
62 */
63 public static final String CONFIG_STORE_NTH = "representation.store.nth";
64
65 /***
66 * The maximum number of subsequences to keep when a feature value must
67 * be split (at whitespace).
68 */
69 private final int splitMaximum;
70
71 /***
72 * Counts how many representations were already generated (if some of them
73 * are {@linkplain #storeN stored}).
74 */
75 private long repCount = 0;
76
77 /***
78 * Each <em>storeN</em>-th context representation is stored for debugging
79 * and inspection purposes (if > 0, otherwise no representation is
80 * stored).
81 */
82 private final int storeN;
83
84 /***
85 * Last integer suffix used to store a context representation, if enabled
86 * ({@link #storeN} > 0). Also used to synchronize the {@link #repCount}.
87 */
88 private final MutableInt lastStoredSuffix = new MutableInt(0);
89
90 /***
91 * The output character set to use (only used to
92 * {@linkplain #storeIfNth(FeatureVector, String) store} some configurations
93 * for inspection purposes).
94 */
95 private final String outputCharset;
96
97 /***
98 * Creates a new instance.
99 *
100 * @param recogNum the number of preceding recognitions to represent
101 * @param splitMax the maximum number of subsequences to keep when
102 * a feature value must be split (at whitespace).
103 * @param n Each <em>n</em>-th context representation is stored if > 0;
104 * otherwise no representation is stored
105 * @param outCharset the output character set to use (only used to
106 * store some configurations for inspection purposes, if <code>n</code>
107 * > 0); if <code>null</code>, the default charset of the current
108 * platform is used
109 */
110 public AbstractRepresentation(final int recogNum, final int splitMax,
111 final int n, final String outCharset) {
112 super(recogNum);
113 splitMaximum = splitMax;
114 storeN = n;
115 outputCharset = outCharset;
116 }
117
118 /***
119 * Builds the context representation of text in an element. Returns a
120 * feature vector of all context features considered relevant for
121 * representation.
122 *
123 * @param element the element whose context should be represented
124 * @param leftText textual content to the left of (preceding)
125 * <code>mainText</code>, might be empty
126 * @param mainText the main textual content to represent, might be empty
127 * @param rightText textual content to the right of (following)
128 * <code>mainText</code>, might be empty
129 * @param priorRecognitions a buffer of the last {@link Recognition}s from
130 * the document, created by calling {@link #createRecognitionBuffer()};
131 * might be <code>null</code>
132 * @param featureCache a cache of (local) feature, should be re-used
133 * between all calls for the nodes in a single document (but must not be
134 * re-used when building the context of nodes in different documents!)
135 * @param logPurpose the type of contexts of main interest to the caller
136 * (e.g. "Token" or "Sentence"), used for logging
137 * @return a vector of features considered relevant for representation
138 * @throws ClassCastException if the <code>priorRecognitions</code> buffer
139 * contains objects that aren't {@link Recognition}s
140 * @throws IllegalArgumentException if the specified node is of an
141 * unsupported type
142 */
143 public final FeatureVector buildContext(final Element element,
144 final String leftText, final String mainText,
145 final String rightText, final PriorRecognitions priorRecognitions,
146 final Map<Element, List<LocalFeature>> featureCache,
147 final String logPurpose)
148 throws ClassCastException, IllegalArgumentException {
149 FeatureVector result = null;
150
151 try {
152 result = doBuildContext(element, leftText, mainText, rightText,
153 priorRecognitions, featureCache, logPurpose);
154
155
156 if (storeN > 0) {
157 storeIfNth(result, logPurpose);
158 }
159
160 return result;
161 } catch (Error e) {
162
163 Util.LOG.error("Error while building " + logPurpose
164 + " context representation for "
165 + DOMUtils.showToken(element, mainText)
166 + "(current size of rep.: " + result.size()
167 + ", size of feature cache: " + featureCache.size() + ")",
168 e);
169 Util.LOG.debug("Snapshot of context rep.: " + result);
170
171 throw e;
172 }
173 }
174
175 /***
176 * Builds the context representation of text in an element. Returns a
177 * feature vector of all context features considered relevant for
178 * representation.
179 *
180 * @param element the element whose context should be represented
181 * @param leftText textual content to the left of (preceding)
182 * <code>mainText</code>, might be empty
183 * @param mainText the main textual content to represent, might be empty
184 * @param rightText textual content to the right of (following)
185 * <code>mainText</code>, might be empty
186 * @param priorRecognitions a buffer of the last {@link Recognition}s from
187 * the document, created by calling {@link #createRecognitionBuffer()};
188 * might be <code>null</code>
189 * @param featureCache a cache of (local) feature, should be re-used
190 * between all calls for the nodes in a single document (but must not be
191 * re-used when building the context of nodes in different documents!)
192 * @param logPurpose the type of contexts of main interest to the caller
193 * (e.g. "Token" or "Sentence"), used for logging
194 * @return a vector of features considered relevant for representation
195 * @throws ClassCastException if the <code>priorRecognitions</code> buffer
196 * contains objects that aren't {@link Recognition}s
197 * @throws IllegalArgumentException if the specified node is of an
198 * unsupported type
199 */
200 protected abstract FeatureVector doBuildContext(final Element element,
201 final String leftText, final String mainText,
202 final String rightText, final PriorRecognitions priorRecognitions,
203 final Map<Element, List<LocalFeature>> featureCache,
204 final String logPurpose)
205 throws ClassCastException, IllegalArgumentException;
206
207 /***
208 * Returns the maximum number of subsequences to keep when a feature
209 * value must be split (at whitespace).
210 *
211 * @return the maximum number
212 */
213 public int getSplitMaximum() {
214 return splitMaximum;
215 }
216
217 /***
218 * Each <em>storeN</em>-th context representation is stored for debugging
219 * and inspection purposes (if > 0, otherwise no representation is
220 * stored).
221 *
222 * @return the value of the attribute
223 */
224 public int getStoreN() {
225 return storeN;
226 }
227
228 /***
229 * Checks whether the given representation should be stored for inspection
230 * purposes and stores it, if required. This method should be only called
231 * if {@link #storeN} > 0.
232 *
233 * @param representation the representation to store, if required
234 * @param type the type of contexts of main interest to the caller
235 * (e.g. "Token" or "Sentence"), used when storing the representation
236 */
237 private void storeIfNth(final FeatureVector representation,
238 final String type) {
239 File storeFile = null;
240
241 long localCount = -1;
242
243
244 synchronized (lastStoredSuffix) {
245 ++repCount;
246 localCount = repCount;
247 if (repCount % storeN == 0) {
248
249
250 storeFile = IOUtils.createOutFile(IOUtils.getDefaultDirectory(),
251 type + "Context", "rep", lastStoredSuffix);
252 }
253 }
254
255
256 if (storeFile != null) {
257 Writer writer = null;
258 try {
259 writer = IOUtils.openWriter(storeFile, outputCharset);
260 final CharSequence flatRep =
261 representation.flatten(true);
262 writer.write(flatRep.toString());
263 writer.flush();
264 Util.LOG.info("Stored " + type + " representation no. "
265 + localCount + " in file " + storeFile
266 + " for inspection purposes");
267 } catch (IOException ie) {
268 Util.LOG.error("Error while storing " + type
269 + " representation no. " + localCount + " in file "
270 + storeFile + " for inspection purposes", ie);
271 } finally {
272 IOUtils.tryToClose(writer);
273 }
274 }
275 }
276
277 /***
278 * Returns a string representation of this object.
279 *
280 * @return a textual representation
281 */
282 public String toString() {
283 return new ToStringBuilder(this)
284 .appendSuper(super.toString())
285 .append("split maximum", splitMaximum)
286 .append("store n-th", storeN)
287 .toString();
288 }
289
290 }