View Javadoc

1   /*
2    * Copyright (C) 2003-2006 Christian Siefkes <christian@siefkes.net>.
3    * Development of this software is supported by the German Research Society,
4    * Berlin-Brandenburg Graduate School in Distributed Information Systems
5    * (DFG grant no. GRK 316).
6    *
7    * This program is free software; you can redistribute it and/or modify
8    * it under the terms of the GNU General Public License as published by
9    * the Free Software Foundation; either version 2 of the License, or
10   * (at your option) any later version.
11   *
12   * This program is distributed in the hope that it will be useful,
13   * but WITHOUT ANY WARRANTY; without even the implied warranty of
14   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15   * GNU General Public License for more details.
16   *
17   * You should have received a copy of the GNU General Public License
18   * along with this program; if not, visit
19   * http://www.gnu.org/licenses/gpl.html or write to the Free Software
20   * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
21   */
22  package de.fu_berlin.ties.extract;
23  
24  import java.io.File;
25  import java.io.IOException;
26  import java.io.Writer;
27  import java.util.ArrayList;
28  import java.util.Iterator;
29  import java.util.List;
30  import java.util.Set;
31  
32  import org.dom4j.Document;
33  import org.dom4j.Element;
34  
35  import de.fu_berlin.ties.combi.CombinationState;
36  import de.fu_berlin.ties.combi.CombinationStrategy;
37  import de.fu_berlin.ties.ContextMap;
38  import de.fu_berlin.ties.ProcessingException;
39  import de.fu_berlin.ties.TiesConfiguration;
40  
41  import de.fu_berlin.ties.classify.Classifier;
42  import de.fu_berlin.ties.classify.Prediction;
43  import de.fu_berlin.ties.classify.PredictionDistribution;
44  import de.fu_berlin.ties.classify.Probability;
45  import de.fu_berlin.ties.classify.Reranker;
46  import de.fu_berlin.ties.context.ContextDetails;
47  import de.fu_berlin.ties.context.Recognition;
48  import de.fu_berlin.ties.context.Representation;
49  import de.fu_berlin.ties.eval.FMetricsView;
50  import de.fu_berlin.ties.extract.amend.FinalReextractor;
51  import de.fu_berlin.ties.extract.reestimate.Reestimator;
52  import de.fu_berlin.ties.filter.DocumentRewriter;
53  import de.fu_berlin.ties.filter.EmbeddingElements;
54  import de.fu_berlin.ties.filter.FilteringTokenWalker;
55  import de.fu_berlin.ties.filter.TrainableFilter;
56  import de.fu_berlin.ties.io.FieldContainer;
57  import de.fu_berlin.ties.text.TokenDetails;
58  import de.fu_berlin.ties.text.TokenizerFactory;
59  import de.fu_berlin.ties.util.Util;
60  
61  /***
62   * An extractor runs a local {@link de.fu_berlin.ties.classify.Classifier}
63   * on a list of items/nodes and combines their results using a
64   * {@link de.fu_berlin.ties.combi.CombinationStrategy}.
65   *
66   * <p>Instances of this class are not thread-safe and cannot extract from
67   * several documents in parallel.
68   *
69   * @author Christian Siefkes
70   * @version $Revision: 1.61 $, $Date: 2006/10/21 16:04:13 $, $Author: siefkes $
71   */
72  public class Extractor extends ExtractorBase {
73  
    /***
     * The recommended file extension to use for storing extractions
     * ("ext"); used as the default output extension by the
     * no-argument constructor.
     */
    public static final String EXT_EXTRACTIONS = "ext";
78  
79  
80      /***
81       * Helper method that creates a reranker. The reranker is configured from
82       * the "extract" prefix. E.g. an entry such as "extract.bias.A = 0.99"
83       * could be used to introduce a small malus for the background class used
84       * in most combination strategies, thus favoring recall over precision.
85       *
86       * @param config the configuration to use
87       * @return the initialized reranker
88       */
89      private static Reranker createReranker(final TiesConfiguration config) {
90          return new Reranker(config.subset("extract"));
91      }
92  
93  
    /***
     * The last document processed by this instance; kept so that
     * sentence filtering can later be evaluated against it.
     */
    private Document lastDocument;

    /***
     * The extraction container used for storing the predicted extractions
     * of the document currently being processed; re-initialized per
     * document, which is why instances are not thread-safe.
     */
    private ExtractionContainer predictedExtractions;

    /***
     * A list of punctuation tokens collected between non-punctuation
     * tokens; buffered so they can be appended to an extraction if it
     * continues after them, or discarded otherwise.
     */
    private final List<TokenDetails> punctuationDetails =
        new ArrayList<TokenDetails>();

    /***
     * An optional reranker that recalculates probabilities to introduce a bias.
     * This can be used to favor recall over precision (by setting a bias &lt;
     * 1 for the background class) etc. Never <code>null</code>.
     */
    private final Reranker reranker;
116 
117 
    /***
     * Creates a new instance using the {@link #EXT_EXTRACTIONS default
     * extension}. Delegates to
     * {@link #Extractor(String, TiesConfiguration)} using the
     * {@linkplain TiesConfiguration#CONF standard configuration}.
     *
     * @throws IllegalArgumentException if the combination strategy cannot be
     * initialized (cf. {@link CombinationStrategy#createStrategy(java.util.Set,
     * TiesConfiguration)})
     * @throws ProcessingException if an error occurs during initialization
     */
    public Extractor() throws IllegalArgumentException, ProcessingException {
        this(EXT_EXTRACTIONS);
    }
131 
    /***
     * Creates a new instance. Delegates to
     * {@link #Extractor(String, TiesConfiguration)} using the
     * {@linkplain TiesConfiguration#CONF standard configuration}.
     *
     * @param outExt the extension to use for output files
     * @throws IllegalArgumentException if the combination strategy cannot be
     * initialized (cf. {@link CombinationStrategy#createStrategy(java.util.Set,
     * TiesConfiguration)})
     * @throws ProcessingException if an error occurs during initialization
     */
    public Extractor(final String outExt)
            throws IllegalArgumentException, ProcessingException {
        this(outExt, TiesConfiguration.CONF);
    }
147 
    /***
     * Creates a new instance. Delegates to the corresponding
     * {@linkplain ExtractorBase#ExtractorBase(String, TiesConfiguration) super
     * constructor} to configure the fields; the reranker is built from the
     * "extract" subset of the configuration.
     *
     * @param outExt the extension to use for output files
     * @param config the configuration to use
     * @throws IllegalArgumentException if the combination strategy cannot be
     * initialized
     * (cf. {@link CombinationStrategy#createStrategy(java.util.Set,
     * TiesConfiguration)})
     * @throws ProcessingException if an error occurs during initialization
     */
    public Extractor(final String outExt, final TiesConfiguration config)
            throws IllegalArgumentException, ProcessingException {
        super(outExt, config);
        reranker = createReranker(config);
    }
166 
    /***
     * Creates a new instance. Delegates to the corresponding
     * {@linkplain ExtractorBase#ExtractorBase(String, File, TiesConfiguration)
     * super constructor} to configure the fields; the reranker is built from
     * the "extract" subset of the configuration.
     *
     * @param outExt the extension to use for output files
     * @param runDirectory the directory to run the classifier in; used instead
     * of the
     * {@linkplain de.fu_berlin.ties.classify.ExternalClassifier#CONFIG_DIR
     * configured directory} if not <code>null</code>
     * @param config the configuration to use
     * @throws IllegalArgumentException if the combination strategy cannot be
     * initialized
     * (cf. {@link CombinationStrategy#createStrategy(java.util.Set,
     * TiesConfiguration)})
     * @throws ProcessingException if an error occurs during initialization
     */
    public Extractor(final String outExt, final File runDirectory,
            final TiesConfiguration config)
            throws IllegalArgumentException, ProcessingException {
        super(outExt, runDirectory, config);
        reranker = createReranker(config);
    }
190 
    /***
     * Creates a new instance, re-using the components (target structure,
     * classifiers, representation, strategy, filters etc.) from the provided
     * trainer. Delegates to the full-argument constructor; only the reranker
     * is created anew, from the trainer's configuration.
     *
     * @param outExt the extension to use for output files
     * @param trainer trainer whose components should be re-used
     */
    public Extractor(final String outExt, final Trainer trainer) {
        this(outExt, trainer.getTargetStructure(), trainer.getClassifiers(),
            trainer.getRepresentation(), trainer.getStrategy(),
            trainer.getReextractor(), trainer.getFactory(),
            trainer.getReestimator(), trainer.getDocumentRewriters(),
            trainer.getSentenceFilter(), createReranker(trainer.getConfig()),
            trainer.viewRelevantPunctuation(), trainer.getConfig());
    }
206 
    /***
     * Creates a new instance from explicitly provided components. All
     * components except the reranker are passed to the
     * {@linkplain ExtractorBase super constructor}.
     *
     * @param outExt the extension to use for output files
     * @param targetStruct the target structure specifying the classes to
     * recognize
     * @param theClassifiers the classifiers to use for the local classification
     * decisions
     * @param theRepresentation the context representation to use for local
     * classifications
     * @param combiStrat the combination strategy to use
     * @param reextract an optional re-extractor that can modify extractions in
     * any suitable way
     * @param tFactory used to instantiate tokenizers
     * @param estimator the last element of the re-estimator chain, or
     * <code>null</code> if the chain is empty
     * @param docFilters a list (possibly empty) of document processors that are
     * invoked to modify the XML representations of the documents to process
     * @param sentFilter the filter used in the first step of a double
     * classification approach ("sentence filtering"); if <code>null</code>,
     * no sentence filtering is used
     * @param rerank a reranker that recalculates probabilities to
     * introduce a bias (can be used to favor recall over precision, by setting
     * a bias &lt; 1 for the background class, etc.); must not be
     * <code>null</code>
     * @param relevantPunct a set of punctuation tokens that have been found to
     * be relevant for token classification; might be empty but not
     * <code>null</code>
     * @param config used to configure superclasses; if <code>null</code>,
     * the {@linkplain TiesConfiguration#CONF standard configuration} is used
     */
    public Extractor(final String outExt, final TargetStructure targetStruct,
            final Classifier[] theClassifiers,
            final Representation theRepresentation,
            final CombinationStrategy combiStrat,
            final FinalReextractor reextract, final TokenizerFactory tFactory,
            final Reestimator estimator, final DocumentRewriter[] docFilters,
            final TrainableFilter sentFilter, final Reranker rerank,
            final Set<String> relevantPunct, final TiesConfiguration config) {
        super(outExt, targetStruct, theClassifiers, theRepresentation,
            combiStrat, reextract, tFactory, estimator, docFilters, sentFilter,
            relevantPunct, config);
        reranker = rerank;
    }
251 
252 
    /***
     * Adds an element to the collected punctuation details. The buffered
     * tokens are later either {@linkplain #appendPunctuation(Extraction)
     * appended to an extraction} or {@linkplain #clearPunctuation()
     * discarded}.
     *
     * @param details the element to add
     */
    protected void addPunctuationDetails(final TokenDetails details) {
        punctuationDetails.add(details);
    }
261 
262     /***
263      * Appends the collected punctuation details (if any) to the provided
264      * extraction. Finally delegates to {@link #clearPunctuation()}
265      * to dleetes the processed punctuation.
266      *
267      * @param ext the extraction to append to
268      */
269     protected void appendPunctuation(final Extraction ext) {
270         if (!punctuationDetails.isEmpty()) {
271             final Iterator<TokenDetails> detailsIter =
272                 punctuationDetails.iterator();
273             TokenDetails currentDetails;
274 
275             // append
276             while (detailsIter.hasNext()) {
277                 currentDetails = detailsIter.next();
278                 ext.addToken(currentDetails, null, true);
279             }
280 
281             // and clear
282             clearPunctuation();
283         }
284     }
285 
    /***
     * Clears the collected punctuation details, discarding any tokens
     * buffered by {@link #addPunctuationDetails(TokenDetails)}.
     */
    protected void clearPunctuation() {
        punctuationDetails.clear();
    }
292 
293     /***
294      * {@inheritDoc}
295      */
296     protected FilteringTokenWalker createFilteringTokenWalker(
297             final TrainableFilter repFilter) {
298         return new FilteringTokenWalker(this, getFactory(), repFilter, this);
299     }
300 
    /***
     * Helper method that discards the last extraction, removing it from
     * {@link #getPriorRecognitions() prior recognitions} and
     * {@link #getPredictedExtractions() predicted extractions}. Sanity-checks
     * that both collections agreed on the removed element.
     *
     * @throws IllegalStateException if the two removed elements differ
     */
    private void discardLastExtraction() {
        final Extraction removedFromContainer =
            getPredictedExtractions().removeLast();

        // this one might be null if no prior recognitions are stored
        final Recognition removedFromRecognitions =
            getPriorRecognitions().removeLast();

        // both must be equal, otherwise something would be very wrong
        if ((removedFromRecognitions != null)
                && !removedFromContainer.equals(removedFromRecognitions)) {
            throw new IllegalStateException("Extractions discarded from "
                + "container " + removedFromContainer + " and from prior "
                + "recognitions " + removedFromRecognitions + " differ");
        }

        Util.LOG.debug("Discarded last extraction " + removedFromRecognitions);
    }
324 
325     /***
326      * Destroys the internal classifers. This method must only be used if the
327      * extractor will never be used again.
328      *
329      * @throws ProcessingException if an error occurs while the classifiers are
330      * being destroyed
331      */
332     public void destroy() throws ProcessingException {
333         final Classifier[] classifiers = getClassifiers();
334         for (int i = 0; i < classifiers.length; i++) {
335             classifiers[i].destroy();
336         }
337     }
338 
339     /***
340      * Extracts items of interest from the contents of an XML document, based on
341      * context representation and local classifier.
342      *
343      * @param doc a document whose contents should be classified
344      * @param filename the name of the document
345      * @return a container of all extractions from the document, in document
346      * order
347      * @throws IOException if an I/O error occurs
348      * @throws ProcessingException if an error occurs during processing
349      */
350     public ExtractionContainer extract(final Document doc,
351             final File filename) throws IOException, ProcessingException {
352         // initialize local fields and filter document
353         initFields(filename);
354         final Document document = filterDocument(doc, filename);
355         lastDocument = document;
356         predictedExtractions = new ExtractionContainer(getTargetStructure());
357 
358         // the walker will call back (processToken method) where appropriate
359         getWalker().walk(document, null);
360 
361         // reset strategy and discard last prediction extraction if requested
362         resetStrategy();
363 
364         ExtractionContainer finalExtractions;
365 
366         if (getReestimator() != null) {
367             // invoke estimator chain on each extraction in the container
368             finalExtractions = new ExtractionContainer(getTargetStructure());
369             Iterator extIter = predictedExtractions.iterator();
370             Extraction reestimatedExt;
371 
372             while (extIter.hasNext()) {
373                 reestimatedExt =
374                     getReestimator().reestimate((Extraction) extIter.next());
375 
376                 if (reestimatedExt != null) {
377                     finalExtractions.add(reestimatedExt);
378                 }
379             }
380 
381         } else {
382             // store extractions "as is"
383             finalExtractions = predictedExtractions;
384         }
385 
386         // call reextractor, if any
387         if (getReextractor() != null) {
388             final ContextMap reexContext =
389                 getStrategy().contextForReextractor();
390             finalExtractions = getReextractor().reextract(finalExtractions,
391                     getContextDetails(), reexContext);
392         }
393 
394         return finalExtractions;
395     }
396 
397     /***
398      * Evaluates precision and recall for {@linkplain #isSentenceFiltering()
399      * sentence filtering} on the last processed document.
400      *
401      * @param correctExtractions a container of all correct extractions for the
402      * document
403      * @return the calculated statistics for sentence filtering on the
404      * last document; <code>null</code> if {@linkplain
405      * #isSentenceFiltering() sentence filtering} is disabled
406      */
407     public FMetricsView evaluateSentenceFiltering(
408             final ExtractionContainer correctExtractions) {
409         return evaluateSentenceFiltering(new EmbeddingElements(lastDocument,
410                 correctExtractions, getFactory()));
411     }
412 
    /***
     * Returns the extraction container used for storing the predicted
     * extractions. Only valid during/after a call to
     * {@link #extract(Document, File)}; <code>null</code> before the first
     * document is processed.
     *
     * @return the extraction container
     */
    protected ExtractionContainer getPredictedExtractions() {
        return predictedExtractions;
    }
421 
422     /***
423      * Extracts items of interest from the contents of an XML document and
424      * serializes the extractions.
425      *
426      * @param document the document to read
427      * @param writer the writer to write the extracted items to; flushed
428      * but not closed by this method
429      * @param context a map of objects that are made available for processing
430      * @throws IOException if an I/O error occurs
431      * @throws ProcessingException if an error occurs during processing
432      */
433     public void process(final Document document, final Writer writer,
434             final ContextMap context) throws IOException, ProcessingException {
435         // determine file name
436         final File filename = new File((File) context.get(KEY_DIRECTORY),
437                 (String) context.get(KEY_LOCAL_NAME));
438 
439         // delegate to extract + serialize results
440         final ExtractionContainer extractions = extract(document, filename);
441         serializeExtractions(extractions, writer);
442     }
443 
444     /***
445      * {@inheritDoc}
446      */
447     public void processToken(final Element element, final String left,
448             final TokenDetails details, final String right,
449             final ContextMap context) throws ProcessingException {
450         final CombinationState translatedState;
451         final boolean relevant = isRelevant(details.getToken());
452 
453         if (relevant) {
454             // update context rep + active classes
455             updateState(element, left, details.getToken(), right);
456 
457             final Classifier[] classifiers = getClassifiers();
458             // call classifiers and translate result
459             final PredictionDistribution[] origDists =
460                 new PredictionDistribution[classifiers.length];
461             final PredictionDistribution[] finalDists =
462                 new PredictionDistribution[classifiers.length];
463             final Prediction[] predictions =
464                 new Prediction[classifiers.length];
465             final String[] predictedTypes = new String[classifiers.length];
466             Probability currentProb;
467             final double[] probs = new double[classifiers.length];
468             final double[] pRs = new double[classifiers.length];
469 
470             for (int i = 0; i < origDists.length; i++) {
471                 origDists[i] = classifiers[i].classify(getFeatures(),
472                     getActiveClasses()[i]);
473                 finalDists[i] = reranker.rerank(origDists[i]);
474                 predictions[i] = finalDists[i].best();
475                 predictedTypes[i] = predictions[i].getType();
476                 currentProb = predictions[i].getProbability();
477                 probs[i] = currentProb.getProb();
478                 pRs[i] = currentProb.getPR();
479             }
480 
481             translatedState =
482                 getStrategy().translateResult(finalDists, details);
483 /*            Util.LOG.debug("Predicted types: '"
484                 + ArrayUtils.toString(predictedTypes)
485                 + "'; translated state: " + translatedState); */
486 
487             if (translatedState.isDiscardPreceding()) {
488                 discardLastExtraction();
489             }
490 
491             // update extraction container if the result isn't null
492             if (translatedState.getType() == null) {
493                 // outside: seal last extraction if not yet done
494                 final Extraction lastExtraction =
495                     getPredictedExtractions().last();
496 
497                 if ((lastExtraction != null) && (!lastExtraction.isSealed())) {
498                     lastExtraction.setSealed(true);
499                 }
500 
501                 // discard any collected punctuation tokens
502                 clearPunctuation();
503             } else if (translatedState.isBegin()) {
504                 // start of new instance: add to container & recognition buffer
505                 final Extraction newExtraction =
506                     new Extraction(translatedState.getType(), details,
507                             translatedState.getProbability());
508                 getPredictedExtractions().add(newExtraction);
509                 getPriorRecognitions().add(newExtraction);
510 
511                 // discard any collected punctuation tokens
512                 clearPunctuation();
513             } else {
514                 // continuation of current instance
515                 final Extraction currentExtraction =
516                     getPredictedExtractions().last();
517 
518                 // append and discard any collected punctuation tokens
519                 appendPunctuation(currentExtraction);
520 
521                 // ensure that states match prior to appending
522                 if (!currentExtraction.getType().equals(
523                         translatedState.getType())) {
524                     throw new IllegalStateException("Type mismatch: "
525                         + translatedState + " cannot continue extraction "
526                         + currentExtraction);
527                 }
528                 currentExtraction.addToken(details,
529                         translatedState.getProbability(), true);
530             }
531 
532             // update the state of the strategy
533             getStrategy().updateState(translatedState, finalDists, details);
534         } else {
535             // irrevalant (punctuation) token:
536             translatedState = null;
537             final Extraction lastExtraction = getPredictedExtractions().last();
538 
539             // remember for later appending if in extraction
540             if ((lastExtraction != null) && (!lastExtraction.isSealed())) {
541                 addPunctuationDetails(details);
542 /*                Util.LOG.debug("Keeping irrelevant punctuation token "
543                         + details.getToken()
544                         + " -- might become part of the current "
545                         + lastExtraction.getType() + " extraction"); */
546             } /* else {
547                 Util.LOG.debug("Skipping over irrelevant punctuation token "
548                         + details.getToken());
549             } */
550         }
551 
552         // update contextDetails list
553         addContextDetails(new ContextDetails(details, getFeatures(),
554                 translatedState, relevant));
555     }
556 
557     /***
558      * Reset strategy and discard last prediction extraction if requested.
559      */
560     protected void resetStrategy() {
561         final boolean discardLast = getStrategy().reset();
562         if (discardLast) {
563             discardLastExtraction();
564         }
565     }
566 
567     /***
568      * Helper method that serializes the content of an extraction container
569      * to a writer.
570      *
571      * @param extractions the extraction container to serialize
572      * @param writer the writer to write; will be flushed but not closed by
573      * this method
574      * @throws IOException if an I/O error occurs
575      */
576     public void serializeExtractions(final ExtractionContainer extractions,
577             final Writer writer) throws IOException {
578         // serialize results
579         final FieldContainer storage =
580             FieldContainer.createFieldContainer(getConfig());
581         extractions.storeEntries(storage);
582         storage.store(writer);
583     }
584 
585 }