View Javadoc

1   /*
2    * Copyright (C) 2005-2006 Christian Siefkes <christian@siefkes.net>.
3    * Development of this software is supported by the German Research Society,
4    * Berlin-Brandenburg Graduate School in Distributed Information Systems
5    * (DFG grant no. GRK 316).
6    *
7    * This program is free software; you can redistribute it and/or modify
8    * it under the terms of the GNU General Public License as published by
9    * the Free Software Foundation; either version 2 of the License, or
10   * (at your option) any later version.
11   *
12   * This program is distributed in the hope that it will be useful,
13   * but WITHOUT ANY WARRANTY; without even the implied warranty of
14   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15   * GNU General Public License for more details.
16   *
17   * You should have received a copy of the GNU General Public License
18   * along with this program; if not, visit
19   * http://www.gnu.org/licenses/gpl.html or write to the Free Software
20   * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
21   */
22  package de.fu_berlin.ties.filter;
23  
24  import java.io.File;
25  import java.io.IOException;
26  import java.io.StringReader;
27  import java.io.StringWriter;
28  import java.io.Writer;
29  
30  import org.apache.commons.lang.builder.ToStringBuilder;
31  import org.dom4j.Document;
32  import org.dom4j.DocumentException;
33  import org.dom4j.Element;
34  import org.dom4j.io.XMLWriter;
35  
36  import de.fu_berlin.ties.ContextMap;
37  import de.fu_berlin.ties.ProcessingException;
38  import de.fu_berlin.ties.TiesConfiguration;
39  import de.fu_berlin.ties.extract.AnswerBuilder;
40  import de.fu_berlin.ties.extract.ExtractionContainer;
41  import de.fu_berlin.ties.extract.ExtractionLocator;
42  import de.fu_berlin.ties.extract.TargetStructure;
43  import de.fu_berlin.ties.io.IOUtils;
44  import de.fu_berlin.ties.text.TokenCounter;
45  import de.fu_berlin.ties.text.TokenDetails;
46  import de.fu_berlin.ties.text.TokenizerFactory;
47  import de.fu_berlin.ties.util.Util;
48  import de.fu_berlin.ties.xml.XMLAdjuster;
49  import de.fu_berlin.ties.xml.dom.DOMUtils;
50  import de.fu_berlin.ties.xml.dom.TokenProcessor;
51  import de.fu_berlin.ties.xml.dom.TokenWalker;
52  
53  /***
54   * A prediction rewriter uses predictions from another process (e.g. named
55   * entities) and stores them as XML elements to provide additional semantic
56   * information.
57   *
58   * <p><strong>Generally, you should NOT use this class -- use
59   * {@link de.fu_berlin.ties.filter.PredictionRewriter2} instead.</strong>
60   * Instances of this class are not thread-safe and must not be used to
61   * process multiple documents in parallel. 
62   * 
63   * @author Christian Siefkes
64   * @version $Revision: 1.5 $, $Date: 2006/10/21 16:04:20 $, $Author: siefkes $
65   */
66  public class PredictionRewriter implements DocumentRewriter, TokenProcessor {
67  
68      /***
69       * An instance of a token walker that writes copies of all start and end
70       * elements to an XML writer. Also handles
71       * {@link TokenWalker#trailingWhitespaceHook(ContextMap) trailing
72       * whitespace} by writing whitespace.
73       */
74      private static class WritingTokenWalker extends TokenWalker {
75  
76          /***
77           * The used XML writer.
78           */
79          private final XMLWriter xmlWriter;
80  
81          /***
82           * Creates a new instance.
83           *
84           * @param processor used to process the tokens
85           * @param tFactory used to instantiate tokenizers
86           * @param writer the XML writer to use
87           */
88          public WritingTokenWalker(final TokenProcessor processor,
89                  final TokenizerFactory tFactory, final XMLWriter writer) {
90              super(processor, tFactory);
91              xmlWriter = writer;
92          }
93  
94          /***
95           * {@inheritDoc}
96           */
97          protected void endElementHook(final Element element,
98                  final ContextMap context) throws IOException {
99              // write closing tag
100             xmlWriter.writeClose(element);
101         }
102 
103         /***
104          * {@inheritDoc}
105          */
106         protected void startElementHook(final Element element,
107                 final ContextMap context) throws IOException {
108             // write opening tag with attributes
109             xmlWriter.writeOpen(element);
110         }
111 
112         /***
113          * {@inheritDoc}
114          */
115         protected void trailingWhitespaceHook(final ContextMap context)
116         throws IOException {
117             // write space charater
118             xmlWriter.write(" ");
119         }
120     }
121 
122     /***
123      * Configuration key: extension of prediction files.
124      */
125     public static final String CONFIG_PRED_EXT = "rewriter.pred.ext";
126 
127 
128     /***
129      * Used to configure this instance.
130      */
131     private final TiesConfiguration config;
132 
133     /***
134      * Extension of the files containing predictions.
135      */
136     private final String extension;
137 
138     /***
139      * Prediction locator for the current document.
140      */
141     private ExtractionLocator predLocator;
142 
143     /***
144      * Dummy target structure (any types are accepted).
145      */
146     private final TargetStructure targetStruct =
147         new TargetStructure(new String[] {});
148 
149     /***
150      * Factory used to create tokenizers.
151      */
152     private final TokenizerFactory tFactory;
153 
154     /***
155      * Counts how often tokens are repeated in a document -- required to
156      * localize predictions.
157      */
158     private final TokenCounter tCount = new TokenCounter();
159 
160     /***
161      * Used to store XML documents in memory while adding prediction elements.
162      */
163     private Writer writer;
164 
165     /***
166      * XMLWriter wrapping the raw {@link #writer}.
167      */
168     private XMLWriter xmlWriter;
169 
170     /***
171      * Used to walk through documents.
172      */
173     private TokenWalker walker;
174 
175     /***
176      * Fixes nesting errors that can occur when interweaving predictions with
177      * original augmented XML file.
178      */
179     private final XMLAdjuster xmlAdjuster;
180 
181 
182     /***
183      * Creates a new instance.
184      *
185      * @param conf used to configure this instance; must not be
186      * <code>null</code>
187      * @throws ProcessingException if an error occurs while initializing the
188      * combination strategies
189      */
190     public PredictionRewriter(final TiesConfiguration conf)
191     throws ProcessingException {
192         this(conf.getString(CONFIG_PRED_EXT), 
193                 new TokenizerFactory(conf), conf);
194     }
195 
196     /***
197      * Creates a new instance.
198      *
199      * @param fileExtension extension of the files containing predictions
200      * @param factory used to instantiate tokenizers
201      * @param conf used to configure this instance; must not be
202      * <code>null</code>
203      * @throws ProcessingException if an error occurs while initializing the
204      * combination strategies
205      */
206     public PredictionRewriter(final String fileExtension,
207             final TokenizerFactory factory, final TiesConfiguration conf)
208     throws ProcessingException {
209         super();
210         config = conf;
211         extension = fileExtension;
212         tFactory = factory;
213 
214         // initialize XML adjuster without doing any extras
215         xmlAdjuster = new XMLAdjuster(null, null, null, false, false, false,
216                 false, conf);
217     }
218 
219 
220     /***
221      * Initializes a document to process, reading the corresponding prediction
222      * file(s).
223      *
224      * @param filename the file name of the current document
225      * @return <code>true</code> iff any predictions for this document exist
226      * @throws IOException if an I/O error occurs
227      */
228     private boolean initDocument(final File filename)
229     throws IOException {
230         final File directory = filename.getParentFile();
231         final String localName = filename.getName();
232         final String prefix =
233             IOUtils.getBaseName(localName) + IOUtils.EXT_SEPARATOR;
234         File predFile;
235         ExtractionContainer predictions;
236 
237         // reset token counter
238         tCount.clear();
239 
240         // terminate old locator, if any
241         if (predLocator != null) {
242             predLocator.reachedEndOfDocument();
243         }
244 
245         // read prediction file
246         predFile = new File(directory, prefix + extension);
247 
248         if (predFile.exists()) {
249             predictions = AnswerBuilder.readAnswerKeys(targetStruct,
250                     predFile, config);
251             predLocator = new ExtractionLocator(predictions,
252                     tFactory.createTokenizer(""));
253             return true;
254         } else {
255             Util.LOG.info("No '" + extension + "' file found for "
256                     + localName + " -- assuming there are no predictions");
257             predLocator = null;
258             return false;
259         }
260     }
261 
262     /***
263      * {@inheritDoc}
264      */
265     public void processToken(final Element element, final String left,
266             final TokenDetails details, final String right,
267             final ContextMap context) throws IOException {
268         final String token = details.getToken();
269 
270         tCount.add(false, token);
271         final int tokenRep = tCount.getLastRep();
272 
273         // lookup and flatten predictions of all types
274         if (predLocator != null) {
275             // write space to escaped XML output, if any
276             if (details.isWhitespaceBefore()) {
277                 xmlWriter.write(" ");
278             }
279 
280             if (predLocator.startOfExtraction(token, tokenRep)) {
281                 // write opening tag to raw output (nesting will be fixed later)
282                 writer.write("<"
283                         + predLocator.getCurrentExtraction().getType() + ">");
284             }
285 
286             // write token to escaped XML output
287             xmlWriter.write(details.getToken());
288 
289             if (predLocator.inExtraction()) {
290                 // update extraction (remove current token from remaining)
291                 predLocator.updateExtraction(token, tokenRep);
292 
293                 // check whether this is the last token of the extraction
294                 if (predLocator.endOfExtraction()) {
295                     // reached end of current prediction: write closing tag to
296                     // raw output (nesting will be fixed later) + switch to next
297                     writer.write("</"
298                             + predLocator.getCurrentExtraction().getType()
299                             + ">");
300                     predLocator.switchToNextExtraction();
301                 }
302             }
303         }
304     }
305 
306     /***
307      * {@inheritDoc}
308      */
309     public Document rewrite(final Document document, final File filename)
310             throws IOException, ProcessingException {
311         final ContextMap dummyContext = new ContextMap();
312 
313         if (initDocument(filename)) {
314             // prepare fields to walk thru document
315             writer = new StringWriter();
316             xmlWriter = new XMLWriter(writer);
317             walker = new WritingTokenWalker(this, tFactory, xmlWriter);
318 
319             // walk thru document to add raw prediction elements,
320             // printing the resulting raw XML to the writer
321             walker.walk(document, dummyContext);
322 
323             // fix nesting errors that might have been introduced by
324             // interweaving predictions with original XML
325             final String rawDoc = writer.toString();
326 
327 /*            // testing only
328             Util.LOG.debug("Raw interweaved document: "
329                     + TextUtils.LINE_SEPARATOR + WordUtils.wrap(rawDoc, 78)); */
330 
331             final Writer repairedDoc = new StringWriter();
332             xmlAdjuster.adjust(rawDoc, repairedDoc);
333 
334             try {
335                 // parse modified document and return as DOM tree
336                 return DOMUtils.readDocument(
337                         new StringReader(repairedDoc.toString()));
338             } catch (DocumentException de) {
339                 // not supposed to happen -- repaired doc should be error-free
340                 throw new RuntimeException("Implementation error: "
341                         + "failed to repair interweaved document", de);
342             }
343         } else {
344             // no predictions: return document unchanged
345             return document;
346         }
347     }
348 
349     /***
350      * Returns a string representation of this object.
351      *
352      * @return a textual representation
353      */
354     public String toString() {
355         return new ToStringBuilder(this)
356             .append("extension", extension)
357             .toString();
358     }
359 
360 }
361