1   /*
2    * Copyright (C) 2003-2004 Christian Siefkes <christian@siefkes.net>.
3    * Development of this software is supported by the German Research Society,
4    * Berlin-Brandenburg Graduate School in Distributed Information Systems
5    * (DFG grant no. GRK 316).
6    *
7    * This library is free software; you can redistribute it and/or
8    * modify it under the terms of the GNU Lesser General Public
9    * License as published by the Free Software Foundation; either
10   * version 2.1 of the License, or (at your option) any later version.
11   *
12   * This library is distributed in the hope that it will be useful,
13   * but WITHOUT ANY WARRANTY; without even the implied warranty of
14   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15   * Lesser General Public License for more details.
16   *
17   * You should have received a copy of the GNU Lesser General Public
18   * License along with this library; if not, visit
19   * http://www.gnu.org/licenses/lgpl.html or write to the Free Software
20   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
21   */
22  package de.fu_berlin.ties.extract;
23  
24  import java.io.File;
25  import java.io.IOException;
26  import java.io.Writer;
27  import java.util.ArrayList;
28  import java.util.Iterator;
29  import java.util.List;
30  import java.util.Set;
31  
32  import org.apache.commons.lang.ArrayUtils;
33  import org.apache.commons.lang.builder.ToStringBuilder;
34  import org.dom4j.Document;
35  import org.dom4j.Element;
36  
37  import de.fu_berlin.ties.combi.CombinationState;
38  import de.fu_berlin.ties.combi.CombinationStrategy;
39  import de.fu_berlin.ties.ContextMap;
40  import de.fu_berlin.ties.ProcessingException;
41  import de.fu_berlin.ties.TiesConfiguration;
42  
43  import de.fu_berlin.ties.classify.Classifier;
44  import de.fu_berlin.ties.classify.Prediction;
45  import de.fu_berlin.ties.classify.PredictionDistribution;
46  import de.fu_berlin.ties.classify.Probability;
47  import de.fu_berlin.ties.classify.Reranker;
48  import de.fu_berlin.ties.context.Recognition;
49  import de.fu_berlin.ties.context.Representation;
50  import de.fu_berlin.ties.eval.FMetricsView;
51  import de.fu_berlin.ties.filter.EmbeddingElements;
52  import de.fu_berlin.ties.filter.FilteringTokenWalker;
53  import de.fu_berlin.ties.filter.TrainableFilter;
54  import de.fu_berlin.ties.io.FieldContainer;
55  import de.fu_berlin.ties.text.TokenDetails;
56  import de.fu_berlin.ties.text.TokenizerFactory;
57  import de.fu_berlin.ties.util.MathUtils;
58  import de.fu_berlin.ties.util.Util;
59  
60  /***
61   * An extractor runs a local {@link de.fu_berlin.ties.classify.Classifier}
62   * on a list of items/nodes and combines their results using a
63   * {@link de.fu_berlin.ties.combi.CombinationStrategy}.
64   *
65   * <p>Instances of this class are not thread-safe and cannot extract from
66   * several documents in parallel.
67   *
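     * <p>A minimal usage sketch (illustrative only: it assumes a dom4j
     * document parsed elsewhere, e.g. via <code>org.dom4j.io.SAXReader</code>;
     * the input file name is made up and exception handling is omitted):
     *
     * <pre>
     * Document doc = new org.dom4j.io.SAXReader().read(new File("input.xml"));
     * Extractor extractor = new Extractor(); // uses the standard configuration
     * ExtractionContainer extractions = extractor.extract(doc);
     * </pre>
     *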
68   * @author Christian Siefkes
69   * @version $Revision: 1.41 $, $Date: 2004/12/06 09:21:06 $, $Author: siefkes $
70   */
71  public class Extractor extends ExtractorBase {
72  
73      /***
74       * The recommended file extension to use for storing extractions.
75       */
76      public static final String EXT_EXTRACTIONS = "ext";
77  
78  
79      /***
80       * Helper method that creates a reranker. The reranker is configured from
81       * the "extract" prefix. For example, an entry such as "extract.bias.A = 0.99"
82       * could be used to introduce a small malus for the background class used
83       * in most combination strategies, thus favoring recall over precision.
84       *
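       * <p>For illustration (a sketch only; it assumes that
       * <code>TiesConfiguration</code> follows the usual Commons Configuration
       * <code>subset</code> semantics of stripping the prefix): a configuration
       * containing
       * <pre>
       * extract.bias.A = 0.99
       * </pre>
       * would hand the reranker the entry <code>bias.A = 0.99</code>.
       *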
85       * @param config the configuration to use
86       * @return the initialized reranker
87       */
88      private static Reranker createReranker(final TiesConfiguration config) {
89          return new Reranker(config.subset("extract"));        
90      }
91  
92  
93      /***
94       * The last document processed by this instance.
95       */
96      private Document lastDocument;
97  
98      /***
99       * The extraction container used for storing the predicted extractions.
100      */
101     private ExtractionContainer predictedExtractions;
102 
103     /***
104      * A list of punctuation tokens collected between non-punctuation tokens.
105      */
106     private final List<TokenDetails> punctuationDetails =
107         new ArrayList<TokenDetails>();
108 
109     /***
110      * An optional reranker that recalculates probabilities to introduce a bias.
111      * This can be used to favor recall over precision (by setting a bias &lt;
112      * 1 for the background class) etc.
113      */
114     private final Reranker reranker;
115 
116 
117     /***
118      * Creates a new instance using a default extension. Delegates to
119      * {@link #Extractor(String, TiesConfiguration)} using the
120      * {@linkplain TiesConfiguration#CONF standard configuration}.
121      *
122      * @throws IllegalArgumentException if the combination strategy cannot be
123      * initialized (cf. {@link CombinationStrategy#createStrategy(java.util.Set,
124      * TiesConfiguration)})
125      * @throws ProcessingException if an error occurs during initialization
126      */
127     public Extractor() throws IllegalArgumentException, ProcessingException {
128         this(EXT_EXTRACTIONS);
129     }
130 
131     /***
132      * Creates a new instance. Delegates to
133      * {@link #Extractor(String, TiesConfiguration)} using the
134      * {@linkplain TiesConfiguration#CONF standard configuration}.
135      *
136      * @param outExt the extension to use for output files
137      * @throws IllegalArgumentException if the combination strategy cannot be
138      * initialized (cf. {@link CombinationStrategy#createStrategy(java.util.Set,
139      * TiesConfiguration)})
140      * @throws ProcessingException if an error occurs during initialization
141      */
142     public Extractor(final String outExt)
143             throws IllegalArgumentException, ProcessingException {
144         this(outExt, TiesConfiguration.CONF);
145     }
146 
147     /***
148      * Creates a new instance. Delegates to the corresponding
149      * {@linkplain ExtractorBase#ExtractorBase(String, TiesConfiguration) super
150      * constructor} to configure the fields.
151      *
152      * @param outExt the extension to use for output files
153      * @param config the configuration to use
154      * @throws IllegalArgumentException if the combination strategy cannot be
155      * initialized
156      * (cf. {@link CombinationStrategy#createStrategy(java.util.Set,
157      * TiesConfiguration)})
158      * @throws ProcessingException if an error occurs during initialization
159      */
160     public Extractor(final String outExt, final TiesConfiguration config)
161             throws IllegalArgumentException, ProcessingException {
162         super(outExt, config);
163         reranker = createReranker(config);
164     }
165 
166     /***
167      * Creates a new instance. Delegates to the corresponding
168      * {@linkplain ExtractorBase#ExtractorBase(String, File, TiesConfiguration)
169      * super constructor} to configure the fields.
170      *
171      * @param outExt the extension to use for output files
172      * @param runDirectory the directory to run the classifier in; used instead
173      * of the
174      * {@linkplain de.fu_berlin.ties.classify.ExternalClassifier#CONFIG_DIR
175      * configured directory} if not <code>null</code>
176      * @param config the configuration to use
177      * @throws IllegalArgumentException if the combination strategy cannot be
178      * initialized
179      * (cf. {@link CombinationStrategy#createStrategy(java.util.Set,
180      * TiesConfiguration)})
181      * @throws ProcessingException if an error occurs during initialization
182      */
183     public Extractor(final String outExt, final File runDirectory,
184             final TiesConfiguration config)
185             throws IllegalArgumentException, ProcessingException {
186         super(outExt, runDirectory, config);
187         reranker = createReranker(config);
188     }
189 
190     /***
191      * Creates a new instance, re-using the components from the provided
192      * trainer.
193      *
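       * <p>Illustrative only (assumes a trained <code>Trainer</code> instance,
       * <code>trainer</code>, obtained elsewhere):
       *
       * <pre>
       * Extractor extractor = new Extractor(Extractor.EXT_EXTRACTIONS, trainer);
       * </pre>
       *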
194      * @param outExt the extension to use for output files
195      * @param trainer trainer whose components should be re-used
196      */
197     public Extractor(final String outExt, final Trainer trainer) {
198         this(outExt, trainer.getTargetStructure(), trainer.getClassifiers(),
199             trainer.getRepresentation(), trainer.getStrategy(),
200             trainer.getFactory(), trainer.getSentenceFilter(),
201             createReranker(trainer.getConfig()),
202             trainer.viewRelevantPunctuation(), trainer.getConfig());
203     }
204 
205     /***
206      * Creates a new instance.
207      *
208      * @param outExt the extension to use for output files
209      * @param targetStruct the target structure specifying the classes to
210      * recognize
211      * @param theClassifiers the classifiers to use for the local classification
212      * decisions
213      * @param theRepresentation the context representation to use for local
214      * classifications
215      * @param combiStrat the combination strategy to use
216      * @param tFactory used to instantiate tokenizers
217      * @param sentFilter the filter used in the first step of a double
218      * classification approach ("sentence filtering"); if <code>null</code>,
219      * no sentence filtering is used
220      * @param rerank a reranker that recalculates probabilities to
221      * introduce a bias (can be used to favor recall over precision, by setting
222      * a bias &lt; 1 for the background class, etc.); must not be
223      * <code>null</code>
224      * @param relevantPunct a set of punctuation tokens that have been found to
225      * be relevant for token classification; might be empty but not
226      * <code>null</code>
227      * @param config used to configure superclasses; if <code>null</code>,
228      * the {@linkplain TiesConfiguration#CONF standard configuration} is used
229      */
230     public Extractor(final String outExt, final TargetStructure targetStruct,
231             final Classifier[] theClassifiers,
232             final Representation theRepresentation,
233             final CombinationStrategy combiStrat,
234             final TokenizerFactory tFactory, final TrainableFilter sentFilter,
235             final Reranker rerank, final Set<String> relevantPunct,
236             final TiesConfiguration config) {
237         super(outExt, targetStruct, theClassifiers, theRepresentation,
238             combiStrat, tFactory, sentFilter, relevantPunct, config);
239         reranker = rerank;
240     }
241 
242 
243     /***
244      * Adds an element to the collected punctuation details.
245      *
246      * @param details the element to add
247      */
248     protected void addPunctuationDetails(final TokenDetails details) {
249         punctuationDetails.add(details);
250     }
251 
252     /***
253      * Appends the collected punctuation details (if any) to the provided
254      * extraction. Finally delegates to {@link #clearPunctuation()}
255      * to delete the processed punctuation.
256      *
257      * @param ext the extraction to append to
258      */
259     protected void appendPunctuation(final Extraction ext) {
260         if (!punctuationDetails.isEmpty()) {
261             final Iterator<TokenDetails> detailsIter =
262                 punctuationDetails.iterator();
263             TokenDetails currentDetails;
264 
265             // append
266             while (detailsIter.hasNext()) {
267                 currentDetails = detailsIter.next();
268                 ext.addToken(currentDetails, null, true);
269             }
270 
271             // and clear
272             clearPunctuation();            
273         }
274     }
275 
276     /***
277      * Clears the collected punctuation details.
278      */
279     protected void clearPunctuation() {
280         punctuationDetails.clear();
281     }
282 
283     /***
284      * {@inheritDoc}
285      */
286     protected FilteringTokenWalker createFilteringTokenWalker(
287             final TrainableFilter repFilter) {
288         return new FilteringTokenWalker(this, getFactory(), repFilter, this);
289     }
290 
291     /***
292      * Helper method that discards the last extraction, removing it from
293      * {@link #getPriorRecognitions() prior recognitions} and
294      * {@link #getPredictedExtractions() predicted extractions}.
295      */
296     private void discardLastExtraction() {
297         final Extraction removedFromContainer =
298             getPredictedExtractions().removeLast();
299 
300         // this one might be null if no prior recognitions are stored
301         final Recognition removedFromRecognitions =
302             getPriorRecognitions().removeLast();
303 
304         // both must be equal, otherwise something would be very wrong
305         if ((removedFromRecognitions != null)
306                 && !removedFromContainer.equals(removedFromRecognitions)) {
307             throw new IllegalStateException("Extractions discarded from "
308                 + "container " + removedFromContainer + " and from prior "
309                 + "recognitions " + removedFromRecognitions + " differ");
310         }
311 
312         Util.LOG.debug("Discarded last extraction " + removedFromRecognitions);
313     }
314 
315     /***
316      * Extracts items of interest from the contents of an XML document, based on
317      * context representation and local classifier.
318      *
319      * @param document a document whose contents should be classified
320      * @return a container of all extractions from the document, in document
321      * order
322      * @throws IOException if an I/O error occurs
323      * @throws ProcessingException if an error occurs during processing
324      */
325     public ExtractionContainer extract(final Document document)
326             throws IOException, ProcessingException {
327         // initialize local fields
328         initFields();
329         lastDocument = document;
330         predictedExtractions = new ExtractionContainer(getTargetStructure());
331 
332         // the walker will call back (processToken method) where appropriate
333         getWalker().walk(document, null);
334 
335         // reset the strategy and discard the last predicted extraction if requested
336         resetStrategy();
337 
338         // return the container of predicted extractions
339         return getPredictedExtractions();
340     }
341 
342     /***
343      * Evaluates precision and recall for {@linkplain #isSentenceFiltering()
344      * sentence filtering} on the last processed document.
345      *
346      * @param correctExtractions a container of all correct extractions for the
347      * document
348      * @return the calculated statistics for sentence filtering on the
349      * last document; <code>null</code> if {@linkplain
350      * #isSentenceFiltering() sentence filtering} is disabled
351      */
352     public FMetricsView evaluateSentenceFiltering(
353             final ExtractionContainer correctExtractions) {
354         return evaluateSentenceFiltering(new EmbeddingElements(lastDocument,
355                 correctExtractions, getFactory()));
356     }
357 
358     /***
359      * Returns the extraction container used for storing the predicted
360      * extractions.
361      * @return the extraction container
362      */
363     protected ExtractionContainer getPredictedExtractions() {
364         return predictedExtractions;
365     }
366 
367     /***
368      * Extracts items of interest from the contents of an XML document and
369      * serializes the extractions.
370      *
371      * @param document the document to read
372      * @param writer the writer to write the extracted items to; flushed
373      * but not closed by this method
374      * @param context a map of objects that are made available for processing
375      * @throws IOException if an I/O error occurs
376      * @throws ProcessingException if an error occurs during processing
377      */
378     public void process(final Document document, final Writer writer,
379             final ContextMap context) throws IOException, ProcessingException {
380         // delegate to extract
381         final ExtractionContainer extractions = extract(document);
382 
383         // serialize results
384         final FieldContainer storage = FieldContainer.createFieldContainer();
385         extractions.storeEntries(storage);
386         storage.store(writer);
387     }
388 
389     /***
390      * {@inheritDoc}
391      */
392     public void processToken(final Element element, final String left,
393             final TokenDetails details, final String right,
394             final ContextMap context) throws ProcessingException {
395         if (isRelevant(details.getToken())) {
396             // update context rep + active classes
397             updateState(element, left, details.getToken(), right);
398     
399             final Classifier[] classifiers = getClassifiers();
400             // call classifiers and translate result
401             final PredictionDistribution[] origDists =
402                 new PredictionDistribution[classifiers.length];
403             final PredictionDistribution[] finalDists =
404                 new PredictionDistribution[classifiers.length];
405             final Prediction[] predictions =
406                 new Prediction[classifiers.length];
407             final String[] predictedTypes = new String[classifiers.length];
408             Probability currentProb;
409             final double[] probs = new double[classifiers.length];
410             final double[] pRs = new double[classifiers.length];
411     
412             for (int i = 0; i < origDists.length; i++) {
413                 origDists[i] = classifiers[i].classify(getFeatures(),
414                     getActiveClasses()[i]);
415                 finalDists[i] = reranker.rerank(origDists[i]);
416                 predictions[i] = finalDists[i].best();
417                 predictedTypes[i] = predictions[i].getType();
418                 currentProb = predictions[i].getProbability();
419                 probs[i] = currentProb.getProb();
420                 pRs[i] = currentProb.getPR();
421             }
422     
423             final CombinationState translatedState =
424                 getStrategy().translateResult(finalDists);
425             final double meanProb = MathUtils.mean(probs);
426             final double meanPR = MathUtils.mean(pRs);
427             Util.LOG.debug("Predicted types: '"
428                 + ArrayUtils.toString(predictedTypes)
429                 + "'; translated state: " + translatedState + ", mean prob.: "
430                 + meanProb + ", mean pR: " + meanPR);
431     
432             if (translatedState.isDiscardPreceding()) {
433                 discardLastExtraction();
434             }
435     
436             // update the extraction container according to the translated result
437             if (translatedState.getType() == null) {
438                 // outside: seal last extraction if not yet done
439                 final Extraction lastExtraction =
440                     getPredictedExtractions().last();
441 
442                 if ((lastExtraction != null) && (!lastExtraction.isSealed())) {
443                     lastExtraction.setSealed(true);
444                 }
445 
446                 // discard any collected punctuation tokens
447                 clearPunctuation();
448             } else if (translatedState.isBegin()) {
449                 // start of new instance: add to container & recognition buffer
450                 final Extraction newExtraction =
451                     new Extraction(translatedState.getType(), details,
452                             new Probability(meanProb, meanPR));
453                 getPredictedExtractions().add(newExtraction);
454                 getPriorRecognitions().add(newExtraction);
455 
456                 // discard any collected punctuation tokens
457                 clearPunctuation();
458             } else {
459                 // continuation of current instance
460                 final Extraction currentExtraction =
461                     getPredictedExtractions().last();
462     
463                 // append and discard any collected punctuation tokens
464                 appendPunctuation(currentExtraction);
465 
466                 // ensure that states match prior to appending
467                 if (!currentExtraction.getType().equals(
468                         translatedState.getType())) {
469                     throw new IllegalStateException("Type mismatch: "
470                         + translatedState + " cannot continue extraction "
471                         + currentExtraction);
472                 }
473                 currentExtraction.addToken(details,
474                         new Probability(meanProb, meanPR), true);
475             }
476     
477             // update the state of the strategy
478             getStrategy().updateState(translatedState);
479         } else {
480             // irrelevant (punctuation) token:
481             final Extraction lastExtraction = getPredictedExtractions().last();
482 
483             // remember for later appending if in extraction
484             if ((lastExtraction != null) && (!lastExtraction.isSealed())) {
485                 addPunctuationDetails(details);
486                 Util.LOG.debug("Keeping irrelevant punctuation token "
487                         + details.getToken()
488                         + " -- might become part of the current "
489                         + lastExtraction.getType() + " extraction");
490             } else {
491                 Util.LOG.debug("Skipping over irrelevant punctuation token "
492                         + details.getToken());                
493             }
494         }
495     }
496 
497     /***
498      * Resets the strategy and discards the last predicted extraction if requested.
499      */
500     protected void resetStrategy() {        
501         final boolean discardLast = getStrategy().reset();
502         if (discardLast) {
503             discardLastExtraction();
504         }       
505     }
506 
507     /***
508      * Returns a string representation of this object.
509      * @return a textual representation
510      */
511     public String toString() {
512         return new ToStringBuilder(this)
513             .appendSuper(super.toString())
514             .append("reranker", reranker)
515             .append("predicted extractions", predictedExtractions)
516             .toString();
517     }
518 
519 }