View Javadoc

1   /*
2    * Copyright (C) 2004 Christian Siefkes <christian@siefkes.net>.
3    * Development of this software is supported by the German Research Society,
4    * Berlin-Brandenburg Graduate School in Distributed Information Systems
5    * (DFG grant no. GRK 316).
6    *
7    * This library is free software; you can redistribute it and/or
8    * modify it under the terms of the GNU Lesser General Public
9    * License as published by the Free Software Foundation; either
10   * version 2.1 of the License, or (at your option) any later version.
11   *
12   * This library is distributed in the hope that it will be useful,
13   * but WITHOUT ANY WARRANTY; without even the implied warranty of
14   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15   * Lesser General Public License for more details.
16   *
17   * You should have received a copy of the GNU Lesser General Public
18   * License along with this library; if not, visit
19   * http://www.gnu.org/licenses/lgpl.html or write to the Free Software
20   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
21   */
22  package de.fu_berlin.ties.extract;
23  
24  import java.io.File;
25  import java.util.Collections;
26  import java.util.HashMap;
27  import java.util.List;
28  import java.util.Map;
29  import java.util.Set;
30  
31  import org.apache.commons.lang.builder.ToStringBuilder;
32  import org.dom4j.Element;
33  import org.dom4j.NodeFilter;
34  
35  import de.fu_berlin.ties.combi.CombinationStrategy;
36  import de.fu_berlin.ties.DocumentReader;
37  import de.fu_berlin.ties.ProcessingException;
38  import de.fu_berlin.ties.TiesConfiguration;
39  
40  import de.fu_berlin.ties.classify.Classifier;
41  import de.fu_berlin.ties.classify.Reranker;
42  import de.fu_berlin.ties.classify.TrainableClassifier;
43  import de.fu_berlin.ties.classify.feature.FeatureVector;
44  import de.fu_berlin.ties.context.DefaultRepresentation;
45  import de.fu_berlin.ties.context.LocalFeature;
46  import de.fu_berlin.ties.context.PriorRecognitions;
47  import de.fu_berlin.ties.context.Representation;
48  import de.fu_berlin.ties.eval.FMetricsView;
49  import de.fu_berlin.ties.eval.FeatureCount;
50  import de.fu_berlin.ties.eval.FeatureCountView;
51  import de.fu_berlin.ties.filter.EmbeddingElements;
52  import de.fu_berlin.ties.filter.FilterEvaluator;
53  import de.fu_berlin.ties.filter.FilteringTokenWalker;
54  import de.fu_berlin.ties.filter.RepresentationFilter;
55  import de.fu_berlin.ties.filter.SkipHandler;
56  import de.fu_berlin.ties.filter.TrainableFilter;
57  import de.fu_berlin.ties.text.TextUtils;
58  import de.fu_berlin.ties.text.TokenizerFactory;
59  import de.fu_berlin.ties.util.CollectionUtils;
60  import de.fu_berlin.ties.util.Util;
61  import de.fu_berlin.ties.xml.dom.ElementNameFilter;
62  import de.fu_berlin.ties.xml.dom.TokenProcessor;
63  import de.fu_berlin.ties.xml.dom.TokenWalker;
64  
65  /***
66   * Common code base shared by {@link de.fu_berlin.ties.extract.Extractor} and
67   * {@link de.fu_berlin.ties.extract.Trainer}.
68   * <p>
69   * Instances of subclasses are not thread-safe and cannot process several
70   * documents in parallel.
71   *
72   * @author Christian Siefkes
73   * @version $Revision: 1.39 $, $Date: 2004/12/07 12:01:48 $, $Author: siefkes $
74   */
75  public abstract class ExtractorBase extends DocumentReader implements
76          SkipHandler, TokenProcessor {
77  
    /**
     * Configuration key: list of elements to filter.
     */
    public static final String CONFIG_ELEMENTS = "filter.elements";

    /**
     * Configuration key: list of elements that should be avoided when
     * filtering (the parent element is used instead).
     */
    public static final String CONFIG_AVOID = "filter.avoid";

    /**
     * Configuration key: list of punctuation and symbol tokens that are
     * considered as relevant from the very start.
     */
    public static final String CONFIG_RELEVANT_PUNCTUATION =
        "extract.punctuation.relevant";

    /**
     * Configuration suffix/prefix used for sentence filtering.
     */
    public static final String CONFIG_SENTENCE = "sent";
100 
101     /***
102      * Helper methat that initializes the filter to be used for the first step
103      * of a double classification approach ("sentence filtering").
104      * 
105      * @param conf the filter is initialized from the "filter" parameters in
106      * this configuration
107      * @param representation the representation to use
108      * @return the created filter; or <code>null</code> if no sentence
109      * filtering should be used
110      * @throws ProcessingException if an error occurs while creating the filter
111      */
112     public static TrainableFilter createSentenceFilter(
113             final TiesConfiguration conf, final Representation representation)
114             throws ProcessingException {
115         final RepresentationFilter result;
116 
117         if (conf.containsKey(CONFIG_ELEMENTS)) {
118             final String[] filteredElements =
119                 conf.getStringArray(CONFIG_ELEMENTS);
120 
121             if (!TiesConfiguration.arrayIsEmpty(filteredElements)) {
122                 // elements to prefer when filtering
123                 final NodeFilter positiveFilter = new ElementNameFilter(
124                         filteredElements);
125                 final NodeFilter negativeFilter = new ElementNameFilter(
126                         conf.getStringArray(CONFIG_AVOID));
127 
128                 // configure reranker from "sent" prefix
129                 // (e.g. sent.bias.true = 1.4)
130                 final Reranker reranker =
131                     new Reranker(conf.subset(CONFIG_SENTENCE));
132                 result = new RepresentationFilter(conf, positiveFilter,
133                         negativeFilter, reranker, representation, "Sentence");
134                 Util.LOG.debug("Initialized representation filter for sentence "
135                         + "filtering: " + result);
136             } else {
137                 result = null;
138             }
139         } else {
140             result = null;
141         }
142 
143         return result;
144     }
145 
    /**
     * The classifier(s) used for the local classification decisions.
     */
    private final Classifier[] classifiers;

    /**
     * The context representation used for local classifications.
     */
    private final Representation representation;

    /**
     * The target structure specifying the classes to recognize.
     */
    private final TargetStructure targetStructure;

    /**
     * Used to instantiate tokenizers.
     */
    private final TokenizerFactory factory;

    /**
     * The filter used in the first step of a double classification approach
     * ("sentence filtering"); if <code>null</code>, no sentence filtering is
     * used.
     */
    private final TrainableFilter sentenceFilter;

    /**
     * The set of candidate classes to consider for the current element for
     * each classifier.
     */
    // NOTE(review): raw Set[] -- presumably Set<String>[]; confirm before
    // tightening, since getActiveClasses() exposes this type
    private Set[] activeClasses;

    /**
     * The feature cache used by the context representation.
     */
    private Map<Element, List<LocalFeature>> featureCache;

    /**
     * The vector of features representing the currently processed element.
     */
    private FeatureVector features;

    /**
     * Used to count documents, contexts, and features and to calculate
     * averages.
     */
    private final FeatureCount featureCount = new FeatureCount();

    /**
     * A set of punctuation tokens that have been found to be relevant for
     * token classification (because they sometimes occur as the first or
     * last token of an extraction).
     */
    private final Set<String> relevantPunctuation;

    /**
     * A buffer of preceding {@link de.fu_berlin.ties.context.Recognition}s
     * from the current document.
     */
    private PriorRecognitions priorRecognitions;

    /**
     * The combination strategy used.
     */
    private final CombinationStrategy strategy;

    /**
     * Used to walk through documents.
     */
    private TokenWalker walker;
217 
    /**
     * Creates a new instance. Delegates to
     * {@link #ExtractorBase(String, TiesConfiguration)} using the
     * {@linkplain TiesConfiguration#CONF standard configuration}.
     *
     * @param outExt the extension to use for output files
     * @throws IllegalArgumentException if the combination strategy cannot be
     * initialized (cf.
     * {@link CombinationStrategy#createStrategy(Set, TiesConfiguration)})
     * @throws ProcessingException if an error occurs during initialization
     */
    public ExtractorBase(final String outExt) throws IllegalArgumentException,
            ProcessingException {
        this(outExt, TiesConfiguration.CONF);
    }

    /**
     * Creates a new instance, configuring target structure, classifier,
     * {@link DefaultRepresentation}, node filter and combination strategy from
     * the provided configuration.
     *
     * @param outExt the extension to use for output files
     * @param config the configuration to use
     * @throws IllegalArgumentException if the combination strategy cannot be
     * initialized (cf.
     * {@link CombinationStrategy#createStrategy(Set, TiesConfiguration)})
     * @throws ProcessingException if an error occurs during initialization
     */
    public ExtractorBase(final String outExt, final TiesConfiguration config)
            throws IllegalArgumentException, ProcessingException {
        // null run directory: the classifier uses its configured directory
        this(outExt, null, config);
    }
250 
    /**
     * Creates a new instance, configuring target structure, classifier,
     * {@link DefaultRepresentation}, node filter, combination strategy and
     * tokenizer factory from the provided configuration.
     *
     * @param outExt the extension to use for output files
     * @param runDirectory the directory to run the classifier in; used instead
     * of the
     * {@linkplain de.fu_berlin.ties.classify.ExternalClassifier#CONFIG_DIR
     * configured directory} if not <code>null</code>
     * @param config the configuration to use
     * @throws IllegalArgumentException if the combination strategy cannot be
     * initialized (cf.
     * {@link CombinationStrategy#createStrategy(Set, TiesConfiguration)})
     * @throws ProcessingException if an error occurs during initialization
     */
    public ExtractorBase(final String outExt, final File runDirectory,
            final TiesConfiguration config) throws IllegalArgumentException,
            ProcessingException {
        super(outExt, config);
        targetStructure = new TargetStructure(config);
        representation = new DefaultRepresentation(config);
        strategy = CombinationStrategy.createStrategy(
                targetStructure.getClassNames(), config);
        sentenceFilter = createSentenceFilter(config, representation);

        // initialize classifier(s) suitable for the combination strategy:
        // one classifier per class set reported by the strategy
        final Set<String>[] allClasses = strategy.allClasses();
        classifiers = new Classifier[allClasses.length];

        for (int i = 0; i < allClasses.length; i++) {
            classifiers[i] = TrainableClassifier.createClassifier(
                    allClasses[i], runDirectory, config);
        }

        // start with configured set of relevant punctuation+symbol tokens
        relevantPunctuation = CollectionUtils.arrayAsSet(
                config.getStringArray(CONFIG_RELEVANT_PUNCTUATION));
        Util.LOG.debug("Initialized set of relevant punctuation + symbol "
                + "tokens to " + relevantPunctuation);
        factory = new TokenizerFactory(config);
    }
293 
    /**
     * Creates a new instance from fully pre-built collaborators.
     *
     * @param outExt the extension to use for output files
     * @param targetStruct the target structure specifying the classes to
     * recognize
     * @param theClassifiers the array of classifiers to use for the local
     * classification decisions
     * @param theRepresentation the context representation to use for local
     * classifications
     * @param combiStrat the combination strategy to use
     * @param tFactory used to instantiate tokenizers
     * @param sentFilter the filter used in the first step of a double
     * classification approach ("sentence filtering"); if <code>null</code>,
     * no sentence filtering is used
     * @param relevantPunct a set of punctuation tokens that have been found to
     * be relevant for token classification; might be empty but not
     * <code>null</code>
     * @param config used to configure superclasses; if <code>null</code>,
     * the {@linkplain TiesConfiguration#CONF standard configuration} is used
     */
    public ExtractorBase(final String outExt,
            final TargetStructure targetStruct,
            final Classifier[] theClassifiers,
            final Representation theRepresentation,
            final CombinationStrategy combiStrat,
            final TokenizerFactory tFactory, final TrainableFilter sentFilter,
            final Set<String> relevantPunct, final TiesConfiguration config) {
        super(outExt, config);
        targetStructure = targetStruct;
        classifiers = theClassifiers;
        representation = theRepresentation;
        strategy = combiStrat;
        factory = tFactory;
        sentenceFilter = sentFilter;
        relevantPunctuation = relevantPunct;
        Util.LOG.debug("Initialized set of relevant punctuation + symbol "
                + "tokens to " + relevantPunctuation);
    }
333 
    /**
     * Creates a filtering token walker to be used for walking through a
     * document and sentence classification if a double classification
     * approach is used.
     *
     * @param repFilter the trainable filter to use
     * @return the created walker
     */
    protected abstract FilteringTokenWalker createFilteringTokenWalker(
            final TrainableFilter repFilter);
344 
345     /***
346      * Helper method that creates the token walker to use for walking through a
347      * document. This walker automatically handles sentence filtering if a
348      * double classification approach should be used. Delegates to the abstract
349      * {@link #createFilteringTokenWalker(RepresentationFilter)}method if
350      * sentence filtering should be used.
351      * 
352      * @return the created walker
353      * @throws ProcessingException if an error occurs while initializing the
354      * walker
355      */
356     private TokenWalker createTokenWalker() throws ProcessingException {
357         if (isSentenceFiltering()) {
358             // use sentence filter to create a filtering waker
359             return createFilteringTokenWalker(sentenceFilter);
360         } else {
361             // no sentence filtering used -- create simple walker
362             return new TokenWalker(this, getFactory());
363         }
364     }
365 
366     /***
367      * Evaluates precision and recall for {@linkplain #isSentenceFiltering()
368      * sentence filtering} on the last processed document.
369      * 
370      * @param embeddingElements the correct set of embedding elements
371      * @return the calculated statistics for sentence filtering on the last
372      * document; <code>null</code> if {@linkplain #isSentenceFiltering()
373      * sentence filtering} is disabled
374      */
375     protected FMetricsView evaluateSentenceFiltering(
376             final EmbeddingElements embeddingElements) {
377         if (isSentenceFiltering() && (walker != null)) {
378             // we know we're using a filtering walker
379             return FilterEvaluator.evaluate(embeddingElements,
380                     (FilteringTokenWalker) walker);
381         } else {
382             return null;
383         }
384     }
385 
    /**
     * Returns the set of candidate classes to consider for the current element
     * for each classifier.
     *
     * @return the value of the attribute
     */
    protected Set[] getActiveClasses() {
        return activeClasses;
    }

    /**
     * Returns the array of classifiers used for the local classification
     * decisions.
     *
     * @return the local classifiers
     */
    public Classifier[] getClassifiers() {
        return classifiers;
    }

    /**
     * Returns the factory used to instantiate tokenizers.
     *
     * @return the value of the attribute
     */
    public TokenizerFactory getFactory() {
        return factory;
    }

    /**
     * Returns the object used to count documents, contexts, and features and
     * to calculate averages.
     *
     * @return the used feature count
     */
    public FeatureCount getFeatureCount() {
        return featureCount;
    }

    /**
     * Returns the vector of features representing the currently processed
     * element.
     *
     * @return the value of the attribute
     */
    protected FeatureVector getFeatures() {
        return features;
    }

    /**
     * Returns the buffer of preceding
     * {@link de.fu_berlin.ties.context.Recognition}s from the current
     * document.
     *
     * @return the buffer
     */
    public PriorRecognitions getPriorRecognitions() {
        return priorRecognitions;
    }

    /**
     * Returns the context representation used for local classifications.
     *
     * @return the context representation
     */
    public Representation getRepresentation() {
        return representation;
    }

    /**
     * Returns the filter used in the first step of a double classification
     * approach ("sentence filtering").
     *
     * @return the node filter, or <code>null</code> if no sentence filtering
     * is used
     */
    protected TrainableFilter getSentenceFilter() {
        return sentenceFilter;
    }

    /**
     * Returns the combination strategy used.
     *
     * @return the combination strategy
     */
    protected CombinationStrategy getStrategy() {
        return strategy;
    }

    /**
     * Returns the target structure specifying the classes to recognize.
     *
     * @return the used target structure
     */
    public TargetStructure getTargetStructure() {
        return targetStructure;
    }

    /**
     * Returns the token walker used to walk through documents.
     *
     * @return the token walker
     */
    protected TokenWalker getWalker() {
        return walker;
    }
491 
    /**
     * Initializes the fields used for processing a document (feature cache,
     * buffer of prior recognitions, token walker, and statistics) and resets
     * the combination strategy.
     *
     * @throws ProcessingException if an error occurs while initializing
     */
    protected void initFields() throws ProcessingException {
        featureCache = new HashMap<Element, List<LocalFeature>>();
        priorRecognitions = representation.createRecognitionBuffer();
        walker = createTokenWalker();
        // a new document is about to be processed
        featureCount.countDocument();
        strategy.reset();
    }
506 
507     /***
508      * Checks whether a token is relevant for training and extraction.
509      * Tokens containing only punctuation or symbol characters are considered
510      * irrevelant unless they have been {@linkplain #markRelevant(String)
511      * marked to be relevant}.
512      *
513      * @param token the token to check
514      * @return <code>true</code> if the is relevant for training and
515      * extraction; <code>false</code>  it is can be ignored
516      */
517     protected boolean isRelevant(final String token) {
518         // check the relevantPunctuation set for punctuation tokens
519         return !TextUtils.punctuationOrSymbol(token)
520             || relevantPunctuation.contains(token);
521     }
522 
    /**
     * Whether this instance uses sentence filtering (classification of
     * relevant versus irrelevant sentences in a double classification
     * approach).
     *
     * @return <code>true</code> if sentence filtering is used
     */
    public boolean isSentenceFiltering() {
        return sentenceFilter != null;
    }

    /**
     * Marks a punctuation token as relevant for classification
     * (because it did occur as the first or last token of an extraction).
     *
     * @param token the token to mark as relevant
     */
    protected void markRelevant(final String token) {
        relevantPunctuation.add(token);
    }

    /**
     * Reset the combination strategy, handling the boolean result value
     * in an appropriate way.
     */
    protected abstract void resetStrategy();

    /**
     * {@inheritDoc}
     */
    public void skip() {
        // call abstract method to reset the combination strategy
        // -- currently disabled because somehow it degrades F1 measure
        //resetStrategy();
    }
557 
558     /***
559      * Returns a string representation of this object.
560      * 
561      * @return a textual representation
562      */
563     public String toString() {
564         return new ToStringBuilder(this).appendSuper(super.toString())
565                 .append("classifiers", classifiers)
566                 .append("representation", representation)
567                 .append("target structure", targetStructure)
568                 .append("combination strategy", strategy)
569                 .append("sentence filter", sentenceFilter)
570                 .append("active classes", activeClasses)
571                 .append("prior recognitions", priorRecognitions)
572                 .toString();
573     }
574 
    /**
     * Helper that builds the {@linkplain #getFeatures() features} and
     * determines the {@linkplain #getActiveClasses() active classes} for an
     * element.
     *
     * @param element the element to process
     * @param leftText textual content to the left of (preceding)
     * <code>mainText</code>, might be empty
     * @param mainText the main textual content to represent, might be empty
     * @param rightText textual content to the right of (following)
     * <code>mainText</code>, might be empty
     */
    protected void updateState(final Element element, final String leftText,
            final String mainText, final String rightText) {
        // build context representation + update statistics
        features = getRepresentation().buildContext(element, leftText,
                mainText, rightText, priorRecognitions, featureCache, "Token");
        featureCount.update(features);

        // use strategy to determine candidate classes
        activeClasses = getStrategy().activeClasses();
        //Util.LOG.debug("Determined active classes: " + activeClasses);
    }
597 
    /**
     * Returns a read-only view on the counted documents, contexts, and
     * features and the calculated averages. This is not a snapshot but will
     * change whenever a document is processed.
     *
     * @return a view on the counts and averages
     */
    public FeatureCountView viewFeatureCount() {
        return featureCount;
    }

    /**
     * Returns a read-only view on the set of punctuation tokens that have been
     * found to be relevant for token classification (because they sometimes
     * occur as the first or last token of an extraction).
     *
     * @return a read-only view on the relevant punctuation
     */
    public Set<String> viewRelevantPunctuation() {
        return Collections.unmodifiableSet(relevantPunctuation);
    }
619 
620 }