1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22 package de.fu_berlin.ties.filter;
23
24 import java.io.File;
25 import java.io.IOException;
26 import java.io.StringReader;
27 import java.io.StringWriter;
28 import java.io.Writer;
29
30 import org.apache.commons.lang.builder.ToStringBuilder;
31 import org.dom4j.Document;
32 import org.dom4j.DocumentException;
33 import org.dom4j.Element;
34 import org.dom4j.io.XMLWriter;
35
36 import de.fu_berlin.ties.ContextMap;
37 import de.fu_berlin.ties.ProcessingException;
38 import de.fu_berlin.ties.TiesConfiguration;
39 import de.fu_berlin.ties.extract.AnswerBuilder;
40 import de.fu_berlin.ties.extract.ExtractionContainer;
41 import de.fu_berlin.ties.extract.ExtractionLocator;
42 import de.fu_berlin.ties.extract.TargetStructure;
43 import de.fu_berlin.ties.io.IOUtils;
44 import de.fu_berlin.ties.text.TokenCounter;
45 import de.fu_berlin.ties.text.TokenDetails;
46 import de.fu_berlin.ties.text.TokenizerFactory;
47 import de.fu_berlin.ties.util.Util;
48 import de.fu_berlin.ties.xml.XMLAdjuster;
49 import de.fu_berlin.ties.xml.dom.DOMUtils;
50 import de.fu_berlin.ties.xml.dom.TokenProcessor;
51 import de.fu_berlin.ties.xml.dom.TokenWalker;
52
53 /***
54 * A prediction rewriter uses predictions from another process (e.g. named
55 * entities) and stores them as XML elements to provide additional semantic
56 * information.
57 *
58 * <p><strong>Generally, you should NOT use this class -- use
59 * {@link de.fu_berlin.ties.filter.PredictionRewriter2} instead.</strong>
60 * Instances of this class are not thread-safe and must not be used to
61 * process multiple documents in parallel.
62 *
63 * @author Christian Siefkes
64 * @version $Revision: 1.5 $, $Date: 2006/10/21 16:04:20 $, $Author: siefkes $
65 */
66 public class PredictionRewriter implements DocumentRewriter, TokenProcessor {
67
68 /***
69 * An instance of a token walker that writes copies of all start and end
70 * elements to an XML writer. Also handles
71 * {@link TokenWalker#trailingWhitespaceHook(ContextMap) trailing
72 * whitespace} by writing whitespace.
73 */
74 private static class WritingTokenWalker extends TokenWalker {
75
76 /***
77 * The used XML writer.
78 */
79 private final XMLWriter xmlWriter;
80
81 /***
82 * Creates a new instance.
83 *
84 * @param processor used to process the tokens
85 * @param tFactory used to instantiate tokenizers
86 * @param writer the XML writer to use
87 */
88 public WritingTokenWalker(final TokenProcessor processor,
89 final TokenizerFactory tFactory, final XMLWriter writer) {
90 super(processor, tFactory);
91 xmlWriter = writer;
92 }
93
94 /***
95 * {@inheritDoc}
96 */
97 protected void endElementHook(final Element element,
98 final ContextMap context) throws IOException {
99
100 xmlWriter.writeClose(element);
101 }
102
103 /***
104 * {@inheritDoc}
105 */
106 protected void startElementHook(final Element element,
107 final ContextMap context) throws IOException {
108
109 xmlWriter.writeOpen(element);
110 }
111
112 /***
113 * {@inheritDoc}
114 */
115 protected void trailingWhitespaceHook(final ContextMap context)
116 throws IOException {
117
118 xmlWriter.write(" ");
119 }
120 }
121
122 /***
123 * Configuration key: extension of prediction files.
124 */
125 public static final String CONFIG_PRED_EXT = "rewriter.pred.ext";
126
127
128 /***
129 * Used to configure this instance.
130 */
131 private final TiesConfiguration config;
132
133 /***
134 * Extension of the files containing predictions.
135 */
136 private final String extension;
137
138 /***
139 * Prediction locator for the current document.
140 */
141 private ExtractionLocator predLocator;
142
143 /***
144 * Dummy target structure (any types are accepted).
145 */
146 private final TargetStructure targetStruct =
147 new TargetStructure(new String[] {});
148
149 /***
150 * Factory used to create tokenizers.
151 */
152 private final TokenizerFactory tFactory;
153
154 /***
155 * Counts how often tokens are repeated in a document -- required to
156 * localize predictions.
157 */
158 private final TokenCounter tCount = new TokenCounter();
159
160 /***
161 * Used to store XML documents in memory while adding prediction elements.
162 */
163 private Writer writer;
164
165 /***
166 * XMLWriter wrapping the raw {@link #writer}.
167 */
168 private XMLWriter xmlWriter;
169
170 /***
171 * Used to walk through documents.
172 */
173 private TokenWalker walker;
174
175 /***
176 * Fixes nesting errors that can occur when interweaving predictions with
177 * original augmented XML file.
178 */
179 private final XMLAdjuster xmlAdjuster;
180
181
182 /***
183 * Creates a new instance.
184 *
185 * @param conf used to configure this instance; must not be
186 * <code>null</code>
187 * @throws ProcessingException if an error occurs while initializing the
188 * combination strategies
189 */
190 public PredictionRewriter(final TiesConfiguration conf)
191 throws ProcessingException {
192 this(conf.getString(CONFIG_PRED_EXT),
193 new TokenizerFactory(conf), conf);
194 }
195
196 /***
197 * Creates a new instance.
198 *
199 * @param fileExtension extension of the files containing predictions
200 * @param factory used to instantiate tokenizers
201 * @param conf used to configure this instance; must not be
202 * <code>null</code>
203 * @throws ProcessingException if an error occurs while initializing the
204 * combination strategies
205 */
206 public PredictionRewriter(final String fileExtension,
207 final TokenizerFactory factory, final TiesConfiguration conf)
208 throws ProcessingException {
209 super();
210 config = conf;
211 extension = fileExtension;
212 tFactory = factory;
213
214
215 xmlAdjuster = new XMLAdjuster(null, null, null, false, false, false,
216 false, conf);
217 }
218
219
220 /***
221 * Initializes a document to process, reading the corresponding prediction
222 * file(s).
223 *
224 * @param filename the file name of the current document
225 * @return <code>true</code> iff any predictions for this document exist
226 * @throws IOException if an I/O error occurs
227 */
228 private boolean initDocument(final File filename)
229 throws IOException {
230 final File directory = filename.getParentFile();
231 final String localName = filename.getName();
232 final String prefix =
233 IOUtils.getBaseName(localName) + IOUtils.EXT_SEPARATOR;
234 File predFile;
235 ExtractionContainer predictions;
236
237
238 tCount.clear();
239
240
241 if (predLocator != null) {
242 predLocator.reachedEndOfDocument();
243 }
244
245
246 predFile = new File(directory, prefix + extension);
247
248 if (predFile.exists()) {
249 predictions = AnswerBuilder.readAnswerKeys(targetStruct,
250 predFile, config);
251 predLocator = new ExtractionLocator(predictions,
252 tFactory.createTokenizer(""));
253 return true;
254 } else {
255 Util.LOG.info("No '" + extension + "' file found for "
256 + localName + " -- assuming there are no predictions");
257 predLocator = null;
258 return false;
259 }
260 }
261
262 /***
263 * {@inheritDoc}
264 */
265 public void processToken(final Element element, final String left,
266 final TokenDetails details, final String right,
267 final ContextMap context) throws IOException {
268 final String token = details.getToken();
269
270 tCount.add(false, token);
271 final int tokenRep = tCount.getLastRep();
272
273
274 if (predLocator != null) {
275
276 if (details.isWhitespaceBefore()) {
277 xmlWriter.write(" ");
278 }
279
280 if (predLocator.startOfExtraction(token, tokenRep)) {
281
282 writer.write("<"
283 + predLocator.getCurrentExtraction().getType() + ">");
284 }
285
286
287 xmlWriter.write(details.getToken());
288
289 if (predLocator.inExtraction()) {
290
291 predLocator.updateExtraction(token, tokenRep);
292
293
294 if (predLocator.endOfExtraction()) {
295
296
297 writer.write("</"
298 + predLocator.getCurrentExtraction().getType()
299 + ">");
300 predLocator.switchToNextExtraction();
301 }
302 }
303 }
304 }
305
306 /***
307 * {@inheritDoc}
308 */
309 public Document rewrite(final Document document, final File filename)
310 throws IOException, ProcessingException {
311 final ContextMap dummyContext = new ContextMap();
312
313 if (initDocument(filename)) {
314
315 writer = new StringWriter();
316 xmlWriter = new XMLWriter(writer);
317 walker = new WritingTokenWalker(this, tFactory, xmlWriter);
318
319
320
321 walker.walk(document, dummyContext);
322
323
324
325 final String rawDoc = writer.toString();
326
327
328
329
330
331 final Writer repairedDoc = new StringWriter();
332 xmlAdjuster.adjust(rawDoc, repairedDoc);
333
334 try {
335
336 return DOMUtils.readDocument(
337 new StringReader(repairedDoc.toString()));
338 } catch (DocumentException de) {
339
340 throw new RuntimeException("Implementation error: "
341 + "failed to repair interweaved document", de);
342 }
343 } else {
344
345 return document;
346 }
347 }
348
349 /***
350 * Returns a string representation of this object.
351 *
352 * @return a textual representation
353 */
354 public String toString() {
355 return new ToStringBuilder(this)
356 .append("extension", extension)
357 .toString();
358 }
359
360 }
361