1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22 package de.fu_berlin.ties.context;
23
24 import java.io.File;
25 import java.io.IOException;
26 import java.io.Writer;
27 import java.util.List;
28 import java.util.Map;
29
30 import org.apache.commons.lang.builder.ToStringBuilder;
31 import org.dom4j.Element;
32
33 import de.fu_berlin.ties.classify.feature.FeatureVector;
34 import de.fu_berlin.ties.io.IOUtils;
35 import de.fu_berlin.ties.util.MutableInt;
36 import de.fu_berlin.ties.util.Util;
37 import de.fu_berlin.ties.xml.dom.DOMUtils;
38
/**
 * Provides basic functionality shared by different representations.
 *
 * @author Christian Siefkes
 * @version $Revision: 1.8 $, $Date: 2006/10/21 16:04:03 $, $Author: siefkes $
 */
45 public abstract class AbstractRepresentation extends Representation {
46
47 /***
48 * Configuration key: The number of preceding recognitions to represent.
49 */
50 public static final String CONFIG_RECOGN_NUM = "representation.recogn.num";
51
52 /***
53 * Configuration key: The maximum number of subsequences to keep when a
54 * feature value must be split.
55 */
56 public static final String CONFIG_SPLIT_MAXIMUM =
57 "representation.split.maximum";
58
59 /***
60 * Configuration key: Each <em>n</em>-th context representation is stored
61 * for debugging and inspection purposes,if > 0.
62 */
63 public static final String CONFIG_STORE_NTH = "representation.store.nth";
64
65 /***
66 * The maximum number of subsequences to keep when a feature value must
67 * be split (at whitespace).
68 */
69 private final int splitMaximum;
70
71 /***
72 * Counts how many representations were already generated (if some of them
73 * are {@linkplain #storeN stored}).
74 */
75 private long repCount = 0;
76
77 /***
78 * Each <em>storeN</em>-th context representation is stored for debugging
79 * and inspection purposes (if > 0, otherwise no representation is
80 * stored).
81 */
82 private final int storeN;
83
84 /***
85 * Last integer suffix used to store a context representation, if enabled
86 * ({@link #storeN} > 0). Also used to synchronize the {@link #repCount}.
87 */
88 private final MutableInt lastStoredSuffix = new MutableInt(0);
89
90 /***
91 * The output character set to use (only used to
92 * {@linkplain #storeIfNth(FeatureVector, String) store} some configurations
93 * for inspection purposes).
94 */
95 private final String outputCharset;
96
97
98 /***
99 * Creates a new instance.
100 *
101 * @param recogNum the number of preceding recognitions to represent
102 * @param splitMax the maximum number of subsequences to keep when
103 * a feature value must be split (at whitespace).
104 * @param n Each <em>n</em>-th context representation is stored if > 0;
105 * otherwise no representation is stored
106 * @param outCharset the output character set to use (only used to
107 * store some configurations for inspection purposes, if <code>n</code>
108 * > 0); if <code>null</code>, the default charset of the current
109 * platform is used
110 */
111 public AbstractRepresentation(final int recogNum, final int splitMax,
112 final int n, final String outCharset) {
113 super(recogNum);
114 splitMaximum = splitMax;
115 storeN = n;
116 outputCharset = outCharset;
117 }
118
119 /***
120 * Builds the context representation of text in an element. Returns a
121 * feature vector of all context features considered relevant for
122 * representation.
123 *
124 * @param element the element whose context should be represented
125 * @param leftText textual content to the left of (preceding)
126 * <code>mainText</code>, might be empty
127 * @param mainText the main textual content to represent, might be empty
128 * @param rightText textual content to the right of (following)
129 * <code>mainText</code>, might be empty
130 * @param priorRecognitions a buffer of the last {@link Recognition}s from
131 * the document, created by calling {@link Representation#initDocument};
132 * might be <code>null</code>
133 * @param featureCache a cache of (local) feature, should be re-used
134 * between all calls for the nodes in a single document (but must not be
135 * re-used when building the context of nodes in different documents!)
136 * @param logPurpose the type of contexts of main interest to the caller
137 * (e.g. "Token" or "Sentence"), used for logging
138 * @return a vector of features considered relevant for representation
139 * @throws ClassCastException if the <code>priorRecognitions</code> buffer
140 * contains objects that aren't {@link Recognition}s
141 * @throws IllegalArgumentException if the specified node is of an
142 * unsupported type
143 */
144 public final FeatureVector buildContext(final Element element,
145 final String leftText, final String mainText,
146 final String rightText, final PriorRecognitions priorRecognitions,
147 final Map<Element, List<LocalFeature>> featureCache,
148 final String logPurpose)
149 throws ClassCastException, IllegalArgumentException {
150 FeatureVector result = null;
151
152 try {
153 result = doBuildContext(element, leftText, mainText, rightText,
154 priorRecognitions, featureCache, logPurpose);
155
156
157 if (storeN > 0) {
158 storeIfNth(result, logPurpose);
159 }
160
161 return result;
162 } catch (Error e) {
163
164 Util.LOG.error("Error while building " + logPurpose
165 + " context representation for "
166 + DOMUtils.showToken(element, mainText) + " ("
167 + (result != null ? "current size of rep.: " + result.size()
168 : "rep. is <null>")
169 + ", size of feature cache: " + featureCache.size() + ")",
170 e);
171 Util.LOG.debug("Snapshot of context rep.: " + result);
172
173 throw e;
174 }
175 }
176
177 /***
178 * Builds the context representation of text in an element. Returns a
179 * feature vector of all context features considered relevant for
180 * representation.
181 *
182 * @param element the element whose context should be represented
183 * @param leftText textual content to the left of (preceding)
184 * <code>mainText</code>, might be empty
185 * @param mainText the main textual content to represent, might be empty
186 * @param rightText textual content to the right of (following)
187 * <code>mainText</code>, might be empty
188 * @param priorRecognitions a buffer of the last {@link Recognition}s from
189 * the document, created by calling {@link Representation#initDocument};
190 * might be <code>null</code>
191 * @param featureCache a cache of (local) feature, should be re-used
192 * between all calls for the nodes in a single document (but must not be
193 * re-used when building the context of nodes in different documents!)
194 * @param logPurpose the type of contexts of main interest to the caller
195 * (e.g. "Token" or "Sentence"), used for logging
196 * @return a vector of features considered relevant for representation
197 * @throws ClassCastException if the <code>priorRecognitions</code> buffer
198 * contains objects that aren't {@link Recognition}s
199 * @throws IllegalArgumentException if the specified node is of an
200 * unsupported type
201 */
202 protected abstract FeatureVector doBuildContext(final Element element,
203 final String leftText, final String mainText,
204 final String rightText, final PriorRecognitions priorRecognitions,
205 final Map<Element, List<LocalFeature>> featureCache,
206 final String logPurpose)
207 throws ClassCastException, IllegalArgumentException;
208
209 /***
210 * Returns the maximum number of subsequences to keep when a feature
211 * value must be split (at whitespace).
212 *
213 * @return the maximum number
214 */
215 public int getSplitMaximum() {
216 return splitMaximum;
217 }
218
219 /***
220 * Each <em>storeN</em>-th context representation is stored for debugging
221 * and inspection purposes (if > 0, otherwise no representation is
222 * stored).
223 *
224 * @return the value of the attribute
225 */
226 public int getStoreN() {
227 return storeN;
228 }
229
230 /***
231 * Checks whether the given representation should be stored for inspection
232 * purposes and stores it, if required. This method should be only called
233 * if {@link #storeN} > 0.
234 *
235 * @param representation the representation to store, if required
236 * @param type the type of contexts of main interest to the caller
237 * (e.g. "Token" or "Sentence"), used when storing the representation
238 */
239 private void storeIfNth(final FeatureVector representation,
240 final String type) {
241 File storeFile = null;
242
243 long localCount = -1;
244
245
246 synchronized (lastStoredSuffix) {
247 ++repCount;
248 localCount = repCount;
249 if (repCount % storeN == 0) {
250
251
252 storeFile = IOUtils.createOutFile(IOUtils.getDefaultDirectory(),
253 type + "Context", "rep", lastStoredSuffix);
254 }
255 }
256
257
258 if (storeFile != null) {
259 Writer writer = null;
260 try {
261 writer = IOUtils.openWriter(storeFile, outputCharset);
262 final CharSequence flatRep =
263 representation.flatten(true);
264 writer.write(flatRep.toString());
265 writer.flush();
266 Util.LOG.info("Stored " + type + " representation no. "
267 + localCount + " in file " + storeFile
268 + " for inspection purposes");
269 } catch (IOException ie) {
270 Util.LOG.error("Error while storing " + type
271 + " representation no. " + localCount + " in file "
272 + storeFile + " for inspection purposes", ie);
273 } finally {
274 IOUtils.tryToClose(writer);
275 }
276 }
277 }
278
279 /***
280 * Returns a string representation of this object.
281 *
282 * @return a textual representation
283 */
284 public String toString() {
285 return new ToStringBuilder(this)
286 .appendSuper(super.toString())
287 .append("split maximum", splitMaximum)
288 .append("store n-th", storeN)
289 .toString();
290 }
291
292 }