View Javadoc

1   /*
2    * Copyright (C) 2004-2006 Christian Siefkes <christian@siefkes.net>.
3    * Development of this software is supported by the German Research Society,
4    * Berlin-Brandenburg Graduate School in Distributed Information Systems
5    * (DFG grant no. GRK 316).
6    *
7    * This program is free software; you can redistribute it and/or modify
8    * it under the terms of the GNU General Public License as published by
9    * the Free Software Foundation; either version 2 of the License, or
10   * (at your option) any later version.
11   *
12   * This program is distributed in the hope that it will be useful,
13   * but WITHOUT ANY WARRANTY; without even the implied warranty of
14   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15   * GNU General Public License for more details.
16   *
17   * You should have received a copy of the GNU General Public License
18   * along with this program; if not, visit
19   * http://www.gnu.org/licenses/gpl.html or write to the Free Software
20   * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
21   */
22  package de.fu_berlin.ties.extract;
23  
24  import java.io.File;
25  import java.io.IOException;
26  import java.util.ArrayList;
27  import java.util.Collections;
28  import java.util.HashMap;
29  import java.util.List;
30  import java.util.Map;
31  import java.util.Set;
32  
33  import org.apache.commons.lang.builder.ToStringBuilder;
34  import org.dom4j.Document;
35  import org.dom4j.Element;
36  import org.dom4j.NodeFilter;
37  
38  import de.fu_berlin.ties.combi.CombinationStrategy;
39  import de.fu_berlin.ties.filter.DocumentRewriter;
40  import de.fu_berlin.ties.DocumentReader;
41  import de.fu_berlin.ties.ProcessingException;
42  import de.fu_berlin.ties.TiesConfiguration;
43  
44  import de.fu_berlin.ties.classify.Classifier;
45  import de.fu_berlin.ties.classify.Reranker;
46  import de.fu_berlin.ties.classify.TrainableClassifier;
47  import de.fu_berlin.ties.classify.feature.FeatureVector;
48  import de.fu_berlin.ties.context.ContextDetails;
49  import de.fu_berlin.ties.context.DefaultRepresentation;
50  import de.fu_berlin.ties.context.LocalFeature;
51  import de.fu_berlin.ties.context.PriorRecognitions;
52  import de.fu_berlin.ties.context.Representation;
53  import de.fu_berlin.ties.eval.FMetricsView;
54  import de.fu_berlin.ties.extract.amend.FinalReextractor;
55  import de.fu_berlin.ties.extract.reestimate.Reestimator;
56  import de.fu_berlin.ties.filter.EmbeddingElements;
57  import de.fu_berlin.ties.filter.FilterEvaluator;
58  import de.fu_berlin.ties.filter.FilteringTokenWalker;
59  import de.fu_berlin.ties.filter.RepresentationFilter;
60  import de.fu_berlin.ties.filter.SkipHandler;
61  import de.fu_berlin.ties.filter.TrainableFilter;
62  import de.fu_berlin.ties.text.TextUtils;
63  import de.fu_berlin.ties.text.TokenizerFactory;
64  import de.fu_berlin.ties.util.CollUtils;
65  import de.fu_berlin.ties.util.Util;
66  import de.fu_berlin.ties.xml.dom.ElementNameFilter;
67  import de.fu_berlin.ties.xml.dom.TokenProcessor;
68  import de.fu_berlin.ties.xml.dom.TokenWalker;
69  
70  /***
71   * Common code base shared by {@link de.fu_berlin.ties.extract.Extractor} and
72   * {@link de.fu_berlin.ties.extract.Trainer}.
73   * <p>
74   * Instances of subclasses are not thread-safe and cannot process several
75   * documents in parallel.
76   *
77   * @author Christian Siefkes
78   * @version $Revision: 1.54 $, $Date: 2006/10/21 16:04:13 $, $Author: siefkes $
79   */
80  public abstract class ExtractorBase extends DocumentReader implements
81          SkipHandler, TokenProcessor {
82  
    /***
     * Configuration key: List of elements to filter.
     */
    public static final String CONFIG_ELEMENTS = "filter.elements";

    /***
     * Configuration key: List of elements that should be avoided when filtering
     * (using parent element instead).
     */
    public static final String CONFIG_AVOID = "filter.avoid";

    /***
     * Configuration key: list of punctuation and symbol tokens that are
     * considered as relevant from the very start (before any are learned
     * via {@link #markRelevant(String)}).
     */
    public static final String CONFIG_RELEVANT_PUNCTUATION =
        "extract.punctuation.relevant";

    /***
     * Configuration suffix/prefix used for sentence filtering
     * (e.g. <code>sent.bias.true</code>).
     */
    public static final String CONFIG_SENTENCE = "sent";

    /***
     * Configuration suffix used for information extraction--specific settings.
     */
    public static final String CONFIG_SUFFIX_IE = "ie";
110 
111 
112     /***
113      * Initializes the list of document rewriters.
114      *
115      * @param conf the filters are initialized from the optional
116      * "rewriters" parameter in this configuration
117      * @return the created list of filters, might be empty bot not
118      * <code>null</code>
119      * @throws ProcessingException if an error occurred while creating the
120      * classifier
121      */
122     protected static DocumentRewriter[] createDocumentRewriters(
123             final TiesConfiguration conf) throws ProcessingException {
124         final String[] classNames =
125             conf.getStringArray("rewriters");
126         final DocumentRewriter[] docFilters;
127 
128         if (!TiesConfiguration.arrayIsEmpty(classNames)) {
129             // init list elements via reflection,
130             // using the configuration as single constructor parameter
131             docFilters = new DocumentRewriter[classNames.length];
132             final Object[] params = new Object[] {conf};
133 
134             for (int i = 0; i < docFilters.length; i++) {
135                 try {
136                     docFilters[i] = (DocumentRewriter) Util.createObject(
137                         Class.forName(classNames[i]), params,
138                         TiesConfiguration.class);
139                 } catch (ClassNotFoundException cnfe) {
140                     // convert and rethrow exception
141                     throw new ProcessingException(
142                         "Cannot initialize document rewriter " + classNames[i],
143                         cnfe);
144                 } catch (InstantiationException ie) {
145                     // convert and rethrow exception
146                     throw new ProcessingException(
147                         "Cannot initialize document rewriter " + classNames[i],
148                         ie);
149                 }
150 
151                 Util.LOG.debug("Initialized document rewriter no." + i
152                         + ": " + classNames[i]);
153             }
154         } else {
155             // return empty list
156             docFilters = new DocumentRewriter[0];
157         }
158 
159         return docFilters;
160     }
161 
162     /***
163      * Helper methat that initializes the filter to be used for the first step
164      * of a double classification approach ("sentence filtering").
165      * 
166      * @param conf the filter is initialized from the "filter" parameters in
167      * this configuration
168      * @param representation the representation to use
169      * @return the created filter; or <code>null</code> if no sentence
170      * filtering should be used
171      * @throws ProcessingException if an error occurs while creating the filter
172      */
173     private static TrainableFilter createSentenceFilter(
174             final TiesConfiguration conf, final Representation representation)
175             throws ProcessingException {
176         final RepresentationFilter result;
177 
178         if (conf.containsKey(CONFIG_ELEMENTS)) {
179             final String[] filteredElements =
180                 conf.getStringArray(CONFIG_ELEMENTS);
181 
182             if (!TiesConfiguration.arrayIsEmpty(filteredElements)) {
183                 // elements to prefer when filtering
184                 final NodeFilter positiveFilter = new ElementNameFilter(
185                         filteredElements);
186                 final NodeFilter negativeFilter = new ElementNameFilter(
187                         conf.getStringArray(CONFIG_AVOID));
188 
189                 // configure reranker from "sent" prefix
190                 // (e.g. sent.bias.true = 1.4)
191                 final Reranker reranker =
192                     new Reranker(conf.subset(CONFIG_SENTENCE));
193                 result = new RepresentationFilter(conf, positiveFilter,
194                         negativeFilter, reranker, representation, "Sentence");
195                 Util.LOG.debug("Initialized representation filter for sentence "
196                         + "filtering: " + result);
197             } else {
198                 result = null;
199             }
200         } else {
201             result = null;
202         }
203 
204         return result;
205     }
206 
207 
    /***
     * The classifier(s) used for the local classification decisions, one per
     * classifier slot required by the combination strategy.
     */
    private final Classifier[] classifiers;

    /***
     * A list of context details representing all tokens in a document;
     * cleared when processing of a new document starts.
     */
    private final List<ContextDetails> contextDetails =
        new ArrayList<ContextDetails>();

    /***
     * The last element of the re-estimator chain, might be <code>null</code>
     * if the chain is empty.
     */
    private final Reestimator reestimator;

    /***
     * An optional re-extractor that can modify extractions in any suitable
     * way; may be <code>null</code>.
     */
    private final FinalReextractor reextractor;

    /***
     * The context representation used for local classifications.
     */
    private final Representation representation;

    /***
     * The target structure specifying the classes to recognize.
     */
    private final TargetStructure targetStructure;

    /***
     * Used to instantiate tokenizers.
     */
    private final TokenizerFactory factory;

    /***
     * The filter used in the first step of a double classification approach
     * ("sentence filtering"); if <code>null</code>, no sentence filtering is
     * used.
     */
    private final TrainableFilter sentenceFilter;

    /***
     * A list (possibly empty) of document processors that are invoked to modify
     * the XML representations of the documents to process, e.g. by adding
     * semantic information such as named-entity predictions.
     */
    private final DocumentRewriter[] documentRewriters;

    /***
     * The set of candidate classes to consider for the current element for each
     * classifier. (Raw <code>Set[]</code> kept to match the
     * {@link CombinationStrategy} API.)
     */
    private Set[] activeClasses;

    /***
     * The feature cache used by the context representation; re-created for
     * each processed document.
     */
    private Map<Element, List<LocalFeature>> featureCache;

    /***
     * The vector of features representing the currently processed element.
     */
    private FeatureVector features;

    /***
     * Used to count documents, contexts, and features and to calculate
     * averages.
     */
//    private final FeatureCount featureCount = new FeatureCount();

    /***
     * A set of punctuation tokens that have been found to be relevant for
     * token classification (because they sometimes occur as the first or
     * last token of an extraction).
     */
    private final Set<String> relevantPunctuation;

    /***
     * A buffer of preceding {@link de.fu_berlin.ties.context.Recognition}s
     * from the current document.
     */
    private PriorRecognitions priorRecognitions;

    /***
     * The combination strategy used.
     */
    private final CombinationStrategy strategy;

    /***
     * Used to walk thru documents; re-created for each processed document.
     */
    private TokenWalker walker;
303 
    /***
     * Creates a new instance. Delegates to
     * {@link #ExtractorBase(String, TiesConfiguration)} using the
     * {@linkplain TiesConfiguration#CONF standard configuration}.
     *
     * @param outExt the extension to use for output files
     * @throws IllegalArgumentException if the combination strategy cannot be
     * initialized (cf.
     * {@link CombinationStrategy#createStrategy(Set, TiesConfiguration)})
     * @throws ProcessingException if an error occurs during initialization
     */
    public ExtractorBase(final String outExt) throws IllegalArgumentException,
            ProcessingException {
        this(outExt, TiesConfiguration.CONF);
    }
319 
    /***
     * Creates a new instance, configuring target structure, classifier,
     * {@link DefaultRepresentation}, node filter and combination strategy from
     * the provided configuration. Delegates to
     * {@link #ExtractorBase(String, File, TiesConfiguration)} without a
     * run directory.
     *
     * @param outExt the extension to use for output files
     * @param config the configuration to use
     * @throws IllegalArgumentException if the combination strategy cannot be
     * initialized (cf.
     * {@link CombinationStrategy#createStrategy(Set, TiesConfiguration)})
     * @throws ProcessingException if an error occurs during initialization
     */
    public ExtractorBase(final String outExt, final TiesConfiguration config)
            throws IllegalArgumentException, ProcessingException {
        this(outExt, null, config);
    }
336 
    /***
     * Creates a new instance, configuring target structure, classifier,
     * {@link DefaultRepresentation}, node filter, combination strategy and
     * tokenizer factory from the provided configuration.
     *
     * @param outExt the extension to use for output files
     * @param runDirectory the directory to run the classifier in; used instead
     * of the
     * {@linkplain de.fu_berlin.ties.classify.ExternalClassifier#CONFIG_DIR
     * configured directory} if not <code>null</code>
     * @param config the configuration to use
     * @throws IllegalArgumentException if the combination strategy cannot be
     * initialized (cf.
     * {@link CombinationStrategy#createStrategy(Set, TiesConfiguration)})
     * @throws ProcessingException if an error occurs during initialization
     */
    public ExtractorBase(final String outExt, final File runDirectory,
            final TiesConfiguration config) throws IllegalArgumentException,
            ProcessingException {
        super(outExt, config);
        targetStructure = new TargetStructure(config);
        representation = new DefaultRepresentation(config);
        strategy = CombinationStrategy.createStrategy(
                targetStructure.getClassNames(), config);
        // the re-estimator chain must exist before the re-extractor,
        // which wraps it
        reestimator = Reestimator.createReestimators(config);
        reextractor = strategy.initReextractor(reestimator);
        documentRewriters = createDocumentRewriters(config);
        sentenceFilter = createSentenceFilter(config, representation);

        // initialize classifier(s) suitable for the combi strategy:
        // one classifier per class set provided by the strategy
        final Set[] allClasses = strategy.allClasses();
        classifiers = new Classifier[allClasses.length];

        for (int i = 0; i < allClasses.length; i++) {
            classifiers[i] = TrainableClassifier.createClassifier(
                    CollUtils.asStringSet(allClasses[i]), runDirectory,
                    config, CONFIG_SUFFIX_IE);
        }

        // start with configured set of relevant punctuation+symbol tokens
        relevantPunctuation = CollUtils.arrayAsSet(
                config.getStringArray(CONFIG_RELEVANT_PUNCTUATION));
        Util.LOG.debug("Initialized set of relevant punctuation + symbol "
                + "tokens to " + relevantPunctuation);
        factory = new TokenizerFactory(config);
    }
383 
    /***
     * Creates a new instance from explicitly provided components (dependency
     * injection); no component is read from the configuration.
     *
     * @param outExt the extension to use for output files
     * @param targetStruct the target structure specifying the classes to
     * recognize
     * @param theClassifiers the array of classifiers to use for the local
     * classification decisions
     * @param theRepresentation the context representation to use for local
     * classifications
     * @param combiStrat the combination strategy to use
     * @param reextract an optional re-extractor that can modify extractions in
     * any suitable way
     * @param tFactory used to instantiate tokenizers
     * @param estimator the last element of the re-estimator chain, or
     * <code>null</code> if the chain is empty
     * @param docFilters a list (possibly empty) of document processors that are
     * invoked to modify the XML representations of the documents to process
     * @param sentFilter the filter used in the first step of a double
     * classification approach ("sentence filtering"); if <code>null</code>,
     * no sentence filtering is used
     * @param relevantPunct a set of punctuation tokens that have been found to
     * be relevant for token classification; might be empty but not
     * <code>null</code>
     * @param config used to configure superclasses; if <code>null</code>,
     * the {@linkplain TiesConfiguration#CONF standard configuration} is used
     */
    public ExtractorBase(final String outExt,
            final TargetStructure targetStruct,
            final Classifier[] theClassifiers,
            final Representation theRepresentation,
            final CombinationStrategy combiStrat,
            final FinalReextractor reextract, final TokenizerFactory tFactory,
            final Reestimator estimator, final DocumentRewriter[] docFilters,
            final TrainableFilter sentFilter, final Set<String> relevantPunct,
            final TiesConfiguration config) {
        super(outExt, config);
        targetStructure = targetStruct;
        classifiers = theClassifiers;
        representation = theRepresentation;
        strategy = combiStrat;
        reextractor = reextract;
        factory = tFactory;
        reestimator = estimator;
        documentRewriters = docFilters;
        sentenceFilter = sentFilter;
        relevantPunctuation = relevantPunct;
        Util.LOG.debug("Initialized set of relevant punctuation + symbol "
                + "tokens to " + relevantPunctuation);
    }
434 
435 
436     /***
437      * Adds an element to the collected context details.
438      *
439      * @param details the element to add
440      */
441     protected void addContextDetails(final ContextDetails details) {
442         // there should be one context per index position, i.e. the old length
443         // of the list should be identical to the current index
444         if (contextDetails.size() != details.getIndex()) {
445             Util.LOG.warn("Length of context details list is "
446                     + contextDetails.size() + ", but index of next context is "
447                     + details.getIndex() + " (these numbers should be identical"
448                     + " since there is one context per index position)");
449         }
450         contextDetails.add(details);
451     }
452 
    /***
     * Creates a filtering token walker to be used for walking through a
     * document and sentence classification if a double classification approach
     * is used. Implemented by subclasses since training and extraction
     * require different walker behavior.
     *
     * @param repFilter the trainable filter to use
     * @return the created walker
     */
    protected abstract FilteringTokenWalker createFilteringTokenWalker(
            final TrainableFilter repFilter);
463 
    /***
     * Helper method that creates the token walker to use for walking through a
     * document. This walker automatically handles sentence filtering if a
     * double classification approach should be used. Delegates to the abstract
     * {@link #createFilteringTokenWalker(TrainableFilter)} method if
     * sentence filtering should be used.
     *
     * @return the created walker
     * @throws ProcessingException if an error occurs while initializing the
     * walker
     */
    private TokenWalker createTokenWalker() throws ProcessingException {
        if (isSentenceFiltering()) {
            // use sentence filter to create a filtering walker
            return createFilteringTokenWalker(sentenceFilter);
        } else {
            // no sentence filtering used -- create simple walker
            return new TokenWalker(this, getFactory());
        }
    }
484 
485     /***
486      * Evaluates precision and recall for {@linkplain #isSentenceFiltering()
487      * sentence filtering} on the last processed document.
488      * 
489      * @param embeddingElements the correct set of embedding elements
490      * @return the calculated statistics for sentence filtering on the last
491      * document; <code>null</code> if {@linkplain #isSentenceFiltering()
492      * sentence filtering} is disabled
493      */
494     protected FMetricsView evaluateSentenceFiltering(
495             final EmbeddingElements embeddingElements) {
496         if (isSentenceFiltering() && (walker != null)) {
497             // we know we're using a filtering walker
498             return FilterEvaluator.evaluate(embeddingElements,
499                     (FilteringTokenWalker) walker);
500         } else {
501             return null;
502         }
503     }
504 
505     /***
506      * Runs a document though the list of {@link #getDocumentRewriters()
507      * document filters} (if any) to modify it.
508      *
509      * @param orgDocument the original document
510      * @param filename the file name of the document
511      * @return the resulting document as modified by the filters
512      * @throws IOException if an I/O error occurs during filtering
513      * @throws ProcessingException if a processing error occurs during filtering
514      */
515     protected Document filterDocument(final Document orgDocument,
516             final File filename)
517     throws IOException, ProcessingException {
518         if ((documentRewriters != null) && (documentRewriters.length > 0)) {
519             Document doc = orgDocument;
520 
521             // invoke all filters in turn
522             for (int i = 0; i < documentRewriters.length; i++) {
523                 doc = documentRewriters[i].rewrite(doc, filename);
524             }
525 
526             return doc;
527         } else {
528             // no filters defined
529             return orgDocument;
530         }
531     }
532 
    /***
     * Returns the set of candidate classes to consider for the current element
     * for each classifier.
     *
     * @return the value of the attribute
     */
    protected Set[] getActiveClasses() {
        return activeClasses;
    }

    /***
     * Returns the array of classifiers used for the local classification
     * decisions.
     *
     * @return the local classifiers
     */
    public Classifier[] getClassifiers() {
        return classifiers;
    }

    /***
     * Returns the list of context details representing all tokens in the
     * current document.
     *
     * @return the list of context details
     */
    protected List<ContextDetails> getContextDetails() {
        return contextDetails;
    }

    /***
     * Returns the list of document processors that are invoked to modify
     * the XML representations of the documents to process, e.g. by adding
     * semantic information such as named-entity predictions.
     *
     * @return the list of filters used, might be empty
     */
    protected DocumentRewriter[] getDocumentRewriters() {
        return documentRewriters;
    }

    /***
     * Returns the factory used to instantiate tokenizers.
     *
     * @return the value of the attribute
     */
    public TokenizerFactory getFactory() {
        return factory;
    }

    /***
     * Returns the object used to count documents, contexts, and features and to
     * calculate averages.
     *
     * @return the used feature count
     */
/*    public FeatureCount getFeatureCount() {
        return featureCount;
    } */

    /***
     * Returns the vector of features representing the currently processed
     * element.
     *
     * @return the value of the attribute
     */
    protected FeatureVector getFeatures() {
        return features;
    }

    /***
     * Returns the buffer of preceding
     * {@link de.fu_berlin.ties.context.Recognition}s from the current
     * document.
     *
     * @return the buffer
     */
    public PriorRecognitions getPriorRecognitions() {
        return priorRecognitions;
    }

    /***
     * Returns the re-estimator chain.
     *
     * @return the last element of the re-estimator chain, or <code>null</code>
     * if the chain is empty
     */
    protected Reestimator getReestimator() {
        return reestimator;
    }

    /***
     * Returns an optional re-extractor that can modify extractions in any
     * suitable way.
     *
     * @return the re-extractor used; may be <code>null</code>
     */
    protected FinalReextractor getReextractor() {
        return reextractor;
    }

    /***
     * Returns the context representation used for local classifications.
     *
     * @return the context representation
     */
    public Representation getRepresentation() {
        return representation;
    }

    /***
     * Returns the filter used in the first step of a double classification
     * approach ("sentence filtering").
     *
     * @return the node filter, or <code>null</code> if no sentence filtering
     * is used
     */
    protected TrainableFilter getSentenceFilter() {
        return sentenceFilter;
    }

    /***
     * Returns the combination strategy used.
     *
     * @return the combination strategy
     */
    protected CombinationStrategy getStrategy() {
        return strategy;
    }

    /***
     * Returns the target structure specifying the classes to recognize.
     *
     * @return the used target structure
     */
    public TargetStructure getTargetStructure() {
        return targetStructure;
    }

    /***
     * Returns the token walker used to walk thru documents.
     *
     * @return the token walker
     */
    protected TokenWalker getWalker() {
        return walker;
    }
679 
    /***
     * Initializes the fields used for processing a document (feature cache,
     * buffer of prior recognitions, token walker, and statistics) and resets
     * the combination strategy. Must be called before walking a new document.
     *
     * @param filename the name of the document
     * @throws ProcessingException if an error occurs while initializing
     * @throws IOException if an I/O error occurs
     */
    protected void initFields(final File filename)
    throws ProcessingException, IOException {
        // fresh feature cache for the new document
        featureCache = new HashMap<Element, List<LocalFeature>>();
        priorRecognitions = representation.initDocument(filename, getFactory());
        // the walker transparently handles sentence filtering if configured
        walker = createTokenWalker();
//        featureCount.countDocument();
        strategy.reset();
        // discard context details collected for the previous document
        contextDetails.clear();
    }
698 
699     /***
700      * Checks whether a token is relevant for training and extraction.
701      * Tokens containing only punctuation or symbol characters are considered
702      * irrevelant unless they have been {@linkplain #markRelevant(String)
703      * marked to be relevant}.
704      *
705      * @param token the token to check
706      * @return <code>true</code> if the is relevant for training and
707      * extraction; <code>false</code>  it is can be ignored
708      */
709     protected boolean isRelevant(final String token) {
710         // check the relevantPunctuation set for punctuation tokens
711         return !TextUtils.punctuationOrSymbol(token)
712             || relevantPunctuation.contains(token);
713     }
714 
    /***
     * Whether this instance uses sentence filtering (classification of relevant
     * versus irrelevant sentences in a double classification approach).
     * True iff a sentence filter was configured.
     *
     * @return <code>true</code> if sentence filtering is used
     */
    public boolean isSentenceFiltering() {
        return sentenceFilter != null;
    }
724 
    /***
     * Marks a punctuation token as relevant for classification
     * (because it did occur as the first or last token of an extraction).
     *
     * @param token the token to mark as relevant
     */
    protected void markRelevant(final String token) {
        relevantPunctuation.add(token);
    }
734 
    /***
     * Resets the combination strategy, handling the boolean result value
     * in an appropriate way. Implemented by subclasses.
     */
    protected abstract void resetStrategy();
740 
    /***
     * {@inheritDoc}
     */
    public void skip() {
        // call abstract method to reset the combination strategy
        // -- currently disabled because somehow it degrades F1 measure
        //resetStrategy();
    }
749 
750     /***
751      * Returns a string representation of this object.
752      * 
753      * @return a textual representation
754      */
755     public String toString() {
756         final ToStringBuilder result = new ToStringBuilder(this)
757 //                .appendSuper(super.toString())
758                 .append("classifiers", classifiers);
759 //                .append("representation", representation)
760 //                .append("target structure", targetStructure)
761 //               .append("combination strategy", strategy);
762         if (reestimator != null) {
763             result.append("re-estimator", reestimator);
764         }
765         if (reextractor != null) {
766             result.append("re-extractor", reextractor);
767         }
768 //                .append("sentence filter", sentenceFilter)
769 //                .append("active classes", activeClasses)
770 //                .append("prior recognitions", priorRecognitions);
771         return result.toString();
772     }
773 
    /***
     * Helper that builds the {@linkplain #getFeatures() features} and
     * determines the {@linkplain #getActiveClasses() active classes} for an
     * element.
     *
     * @param element the element to process
     * @param leftText textual content to the left of (preceding)
     * <code>mainText</code>, might be empty
     * @param mainText the main textual content to represent, might be empty
     * @param rightText textual content to the right of (following)
     * <code>mainText</code>, might be empty
     */
    protected void updateState(final Element element, final String leftText,
            final String mainText, final String rightText) {
        // build context representation + update statistics
        features = getRepresentation().buildContext(element, leftText,
                mainText, rightText, priorRecognitions, featureCache, "Token");
//        featureCount.update(features);

        // use strategy to determine candidate classes
        activeClasses = getStrategy().activeClasses();
        //Util.LOG.debug("Determined active classes: " + activeClasses);
    }
796 
797     /***
798      * Returns a read-only view on the counted documents, contexts, and features
799      * and the calculated averages. This is not a snapshot but will change
800      * whenever a document is processed.
801      * 
802      * @return a view on the counts and averages
803      */
804 /*    public FeatureCountView viewFeatureCount() {
805         return featureCount;
806     } */
807 
808     /***
809      * Returns a read-only view on the set of punctuation tokens that have been
810      * found to be relevant for token classification (because they sometimes
811      * occur as the first or last token of an extraction).
812      *
813      * @return a read-only view on the relevant punctuation
814      */
815     public Set<String> viewRelevantPunctuation() {
816         return Collections.unmodifiableSet(relevantPunctuation);
817     }
818 
819 }