1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22 package de.fu_berlin.ties.extract;
23
24 import java.io.File;
25 import java.io.IOException;
26 import java.util.ArrayList;
27 import java.util.Collections;
28 import java.util.HashMap;
29 import java.util.List;
30 import java.util.Map;
31 import java.util.Set;
32
33 import org.apache.commons.lang.builder.ToStringBuilder;
34 import org.dom4j.Document;
35 import org.dom4j.Element;
36 import org.dom4j.NodeFilter;
37
38 import de.fu_berlin.ties.combi.CombinationStrategy;
39 import de.fu_berlin.ties.filter.DocumentRewriter;
40 import de.fu_berlin.ties.DocumentReader;
41 import de.fu_berlin.ties.ProcessingException;
42 import de.fu_berlin.ties.TiesConfiguration;
43
44 import de.fu_berlin.ties.classify.Classifier;
45 import de.fu_berlin.ties.classify.Reranker;
46 import de.fu_berlin.ties.classify.TrainableClassifier;
47 import de.fu_berlin.ties.classify.feature.FeatureVector;
48 import de.fu_berlin.ties.context.ContextDetails;
49 import de.fu_berlin.ties.context.DefaultRepresentation;
50 import de.fu_berlin.ties.context.LocalFeature;
51 import de.fu_berlin.ties.context.PriorRecognitions;
52 import de.fu_berlin.ties.context.Representation;
53 import de.fu_berlin.ties.eval.FMetricsView;
54 import de.fu_berlin.ties.extract.amend.FinalReextractor;
55 import de.fu_berlin.ties.extract.reestimate.Reestimator;
56 import de.fu_berlin.ties.filter.EmbeddingElements;
57 import de.fu_berlin.ties.filter.FilterEvaluator;
58 import de.fu_berlin.ties.filter.FilteringTokenWalker;
59 import de.fu_berlin.ties.filter.RepresentationFilter;
60 import de.fu_berlin.ties.filter.SkipHandler;
61 import de.fu_berlin.ties.filter.TrainableFilter;
62 import de.fu_berlin.ties.text.TextUtils;
63 import de.fu_berlin.ties.text.TokenizerFactory;
64 import de.fu_berlin.ties.util.CollUtils;
65 import de.fu_berlin.ties.util.Util;
66 import de.fu_berlin.ties.xml.dom.ElementNameFilter;
67 import de.fu_berlin.ties.xml.dom.TokenProcessor;
68 import de.fu_berlin.ties.xml.dom.TokenWalker;
69
/**
 * Common code base shared by {@link de.fu_berlin.ties.extract.Extractor} and
 * {@link de.fu_berlin.ties.extract.Trainer}.
 * <p>
 * Instances of subclasses are not thread-safe and cannot process several
 * documents in parallel.
 *
 * @author Christian Siefkes
 * @version $Revision: 1.54 $, $Date: 2006/10/21 16:04:13 $, $Author: siefkes $
 */
80 public abstract class ExtractorBase extends DocumentReader implements
81 SkipHandler, TokenProcessor {
82
83 /***
84 * Configuration key: List of elements to filter.
85 */
86 public static final String CONFIG_ELEMENTS = "filter.elements";
87
88 /***
89 * Configuration key: List of elements that should be avoided when filtering
90 * (using parent element instead).
91 */
92 public static final String CONFIG_AVOID = "filter.avoid";
93
94 /***
95 * Configuration key: list of punctuation and symbol tokens that are
96 * considered as relevant from the very start.
97 */
98 public static final String CONFIG_RELEVANT_PUNCTUATION =
99 "extract.punctuation.relevant";
100
101 /***
102 * Configuration suffix/prefix used for sentence filtering.
103 */
104 public static final String CONFIG_SENTENCE = "sent";
105
106 /***
107 * Configuration suffix used for information extraction--specific settings.
108 */
109 public static final String CONFIG_SUFFIX_IE = "ie";
110
111
112 /***
113 * Initializes the list of document rewriters.
114 *
115 * @param conf the filters are initialized from the optional
116 * "rewriters" parameter in this configuration
117 * @return the created list of filters, might be empty bot not
118 * <code>null</code>
119 * @throws ProcessingException if an error occurred while creating the
120 * classifier
121 */
122 protected static DocumentRewriter[] createDocumentRewriters(
123 final TiesConfiguration conf) throws ProcessingException {
124 final String[] classNames =
125 conf.getStringArray("rewriters");
126 final DocumentRewriter[] docFilters;
127
128 if (!TiesConfiguration.arrayIsEmpty(classNames)) {
129
130
131 docFilters = new DocumentRewriter[classNames.length];
132 final Object[] params = new Object[] {conf};
133
134 for (int i = 0; i < docFilters.length; i++) {
135 try {
136 docFilters[i] = (DocumentRewriter) Util.createObject(
137 Class.forName(classNames[i]), params,
138 TiesConfiguration.class);
139 } catch (ClassNotFoundException cnfe) {
140
141 throw new ProcessingException(
142 "Cannot initialize document rewriter " + classNames[i],
143 cnfe);
144 } catch (InstantiationException ie) {
145
146 throw new ProcessingException(
147 "Cannot initialize document rewriter " + classNames[i],
148 ie);
149 }
150
151 Util.LOG.debug("Initialized document rewriter no." + i
152 + ": " + classNames[i]);
153 }
154 } else {
155
156 docFilters = new DocumentRewriter[0];
157 }
158
159 return docFilters;
160 }
161
162 /***
163 * Helper methat that initializes the filter to be used for the first step
164 * of a double classification approach ("sentence filtering").
165 *
166 * @param conf the filter is initialized from the "filter" parameters in
167 * this configuration
168 * @param representation the representation to use
169 * @return the created filter; or <code>null</code> if no sentence
170 * filtering should be used
171 * @throws ProcessingException if an error occurs while creating the filter
172 */
173 private static TrainableFilter createSentenceFilter(
174 final TiesConfiguration conf, final Representation representation)
175 throws ProcessingException {
176 final RepresentationFilter result;
177
178 if (conf.containsKey(CONFIG_ELEMENTS)) {
179 final String[] filteredElements =
180 conf.getStringArray(CONFIG_ELEMENTS);
181
182 if (!TiesConfiguration.arrayIsEmpty(filteredElements)) {
183
184 final NodeFilter positiveFilter = new ElementNameFilter(
185 filteredElements);
186 final NodeFilter negativeFilter = new ElementNameFilter(
187 conf.getStringArray(CONFIG_AVOID));
188
189
190
191 final Reranker reranker =
192 new Reranker(conf.subset(CONFIG_SENTENCE));
193 result = new RepresentationFilter(conf, positiveFilter,
194 negativeFilter, reranker, representation, "Sentence");
195 Util.LOG.debug("Initialized representation filter for sentence "
196 + "filtering: " + result);
197 } else {
198 result = null;
199 }
200 } else {
201 result = null;
202 }
203
204 return result;
205 }
206
207
208 /***
209 * The classifier(s) used for the local classification decisions.
210 */
211 private final Classifier[] classifiers;
212
213 /***
214 * A list of context details representing all tokens in a document.
215 */
216 private final List<ContextDetails> contextDetails =
217 new ArrayList<ContextDetails>();
218
219 /***
220 * The last element of the re-estimator chain, might be <code>null</code>
221 * if the chain is empty.
222 */
223 private final Reestimator reestimator;
224
225 /***
226 * An optional re-extractor that can modify extractions in any suitable way.
227 */
228 private final FinalReextractor reextractor;
229
230 /***
231 * The context representation used for local classifications.
232 */
233 private final Representation representation;
234
235 /***
236 * The target structure specifying the classes to recognize.
237 */
238 private final TargetStructure targetStructure;
239
240 /***
241 * Used to instantiate tokenizers.
242 */
243 private final TokenizerFactory factory;
244
245 /***
246 * The filter used in the first step of a double classification approach
247 * ("sentence filtering"); if <code>null</code>, no sentence filtering is
248 * used.
249 */
250 private final TrainableFilter sentenceFilter;
251
252 /***
253 * A list (possibly empty) of document processors that are invoked to modify
254 * the XML representations of the documents to process, e.g. by adding
255 * semantic information such as named-entity predictions.
256 */
257 private final DocumentRewriter[] documentRewriters;
258
259 /***
260 * The set of candidate classes to consider for the current element for each
261 * classifier.
262 */
263 private Set[] activeClasses;
264
265 /***
266 * The feature cache used by the context representation.
267 */
268 private Map<Element, List<LocalFeature>> featureCache;
269
270 /***
271 * The vector of features representing the currently processed element.
272 */
273 private FeatureVector features;
274
275 /***
276 * Used to count documents, contexts, and features and to calculate
277 * averages.
278 */
279
280
281 /***
282 * A set of punctuation tokens that have been found to be relevant for
283 * token classification (because they sometimes occur as the first or
284 * last token of an extraction).
285 */
286 private final Set<String> relevantPunctuation;
287
288 /***
289 * A buffer of preceding {@link de.fu_berlin.ties.context.Recognition}s
290 * from the current document.
291 */
292 private PriorRecognitions priorRecognitions;
293
294 /***
295 * The combination strategy used.
296 */
297 private final CombinationStrategy strategy;
298
299 /***
300 * Used to walk thru documents.
301 */
302 private TokenWalker walker;
303
/**
 * Creates a new instance based on the
 * {@linkplain TiesConfiguration#CONF standard configuration}. Simply
 * delegates to {@link #ExtractorBase(String, TiesConfiguration)}.
 *
 * @param outExt the extension to use for output files
 * @throws IllegalArgumentException if the combination strategy cannot be
 * initialized (cf.
 * {@link CombinationStrategy#createStrategy(Set, TiesConfiguration)})
 * @throws ProcessingException if an error occurs during initialization
 */
public ExtractorBase(final String outExt)
        throws IllegalArgumentException, ProcessingException {
    this(outExt, TiesConfiguration.CONF);
}
319
/**
 * Creates a new instance, configuring target structure, classifier,
 * {@link DefaultRepresentation}, node filter and combination strategy from
 * the provided configuration. Delegates to
 * {@link #ExtractorBase(String, File, TiesConfiguration)} without a run
 * directory.
 *
 * @param outExt the extension to use for output files
 * @param config the configuration to use
 * @throws IllegalArgumentException if the combination strategy cannot be
 * initialized (cf.
 * {@link CombinationStrategy#createStrategy(Set, TiesConfiguration)})
 * @throws ProcessingException if an error occurs during initialization
 */
public ExtractorBase(final String outExt, final TiesConfiguration config)
        throws IllegalArgumentException, ProcessingException {
    // no explicit run directory is passed on
    this(outExt, (File) null, config);
}
336
/**
 * Creates a new instance, configuring target structure, classifier,
 * {@link DefaultRepresentation}, node filter, combination strategy and
 * tokenizer factory from the provided configuration.
 *
 * @param outExt the extension to use for output files
 * @param runDirectory the directory to run the classifier in; used instead
 * of the
 * {@linkplain de.fu_berlin.ties.classify.ExternalClassifier#CONFIG_DIR
 * configured directory} if not <code>null</code>
 * @param config the configuration to use
 * @throws IllegalArgumentException if the combination strategy cannot be
 * initialized (cf.
 * {@link CombinationStrategy#createStrategy(Set, TiesConfiguration)})
 * @throws ProcessingException if an error occurs during initialization
 */
public ExtractorBase(final String outExt, final File runDirectory,
        final TiesConfiguration config)
        throws IllegalArgumentException, ProcessingException {
    super(outExt, config);

    // components that are built from the configuration
    this.targetStructure = new TargetStructure(config);
    this.representation = new DefaultRepresentation(config);
    this.strategy = CombinationStrategy.createStrategy(
        this.targetStructure.getClassNames(), config);
    this.reestimator = Reestimator.createReestimators(config);
    this.reextractor = this.strategy.initReextractor(this.reestimator);
    this.documentRewriters = createDocumentRewriters(config);
    this.sentenceFilter = createSentenceFilter(config, this.representation);

    // one classifier for each set of classes reported by the strategy
    final Set[] classSets = this.strategy.allClasses();
    final Classifier[] localClassifiers = new Classifier[classSets.length];

    for (int pos = 0; pos < classSets.length; pos++) {
        localClassifiers[pos] = TrainableClassifier.createClassifier(
            CollUtils.asStringSet(classSets[pos]), runDirectory,
            config, CONFIG_SUFFIX_IE);
    }

    this.classifiers = localClassifiers;

    // punctuation and symbol tokens treated as relevant from the start
    this.relevantPunctuation = CollUtils.arrayAsSet(
        config.getStringArray(CONFIG_RELEVANT_PUNCTUATION));
    Util.LOG.debug("Initialized set of relevant punctuation + symbol "
        + "tokens to " + this.relevantPunctuation);
    this.factory = new TokenizerFactory(config);
}
383
/**
 * Creates a new instance from ready-made components; nothing is created
 * from the configuration except by the superclass.
 *
 * @param outExt the extension to use for output files
 * @param targetStruct the target structure specifying the classes to
 * recognize
 * @param theClassifiers the array of classifiers to use for the local
 * classification decisions
 * @param theRepresentation the context representation to use for local
 * classifications
 * @param combiStrat the combination strategy to use
 * @param reextract an optional re-extractor that can modify extractions in
 * any suitable way
 * @param tFactory used to instantiate tokenizers
 * @param estimator the last element of the re-estimator chain, or
 * <code>null</code> if the chain is empty
 * @param docFilters a list (possibly empty) of document processors that are
 * invoked to modify the XML representations of the documents to process
 * @param sentFilter the filter used in the first step of a double
 * classification approach ("sentence filtering"); if <code>null</code>,
 * no sentence filtering is used
 * @param relevantPunct a set of punctuation tokens that have been found to
 * be relevant for token classification; might be empty but not
 * <code>null</code>
 * @param config used to configure superclasses; if <code>null</code>,
 * the {@linkplain TiesConfiguration#CONF standard configuration} is used
 */
public ExtractorBase(final String outExt,
        final TargetStructure targetStruct,
        final Classifier[] theClassifiers,
        final Representation theRepresentation,
        final CombinationStrategy combiStrat,
        final FinalReextractor reextract, final TokenizerFactory tFactory,
        final Reestimator estimator, final DocumentRewriter[] docFilters,
        final TrainableFilter sentFilter, final Set<String> relevantPunct,
        final TiesConfiguration config) {
    super(outExt, config);

    // classification components
    this.targetStructure = targetStruct;
    this.classifiers = theClassifiers;
    this.representation = theRepresentation;
    this.strategy = combiStrat;

    // post-processing of estimates and extractions
    this.reestimator = estimator;
    this.reextractor = reextract;

    // document preparation and tokenization
    this.documentRewriters = docFilters;
    this.sentenceFilter = sentFilter;
    this.factory = tFactory;

    // the set is stored as passed in, not copied
    this.relevantPunctuation = relevantPunct;
    Util.LOG.debug("Initialized set of relevant punctuation + symbol "
        + "tokens to " + this.relevantPunctuation);
}
434
435
436 /***
437 * Adds an element to the collected context details.
438 *
439 * @param details the element to add
440 */
441 protected void addContextDetails(final ContextDetails details) {
442
443
444 if (contextDetails.size() != details.getIndex()) {
445 Util.LOG.warn("Length of context details list is "
446 + contextDetails.size() + ", but index of next context is "
447 + details.getIndex() + " (these numbers should be identical"
448 + " since there is one context per index position)");
449 }
450 contextDetails.add(details);
451 }
452
453 /***
454 * Creates a filtering token walker to be used for walking through a
455 * document and sentence classification if a double classification approach
456 * is used.
457 *
458 * @param repFilter the trainable filter to use
459 * @return the created walker
460 */
461 protected abstract FilteringTokenWalker createFilteringTokenWalker(
462 final TrainableFilter repFilter);
463
464 /***
465 * Helper method that creates the token walker to use for walking through a
466 * document. This walker automatically handles sentence filtering if a
467 * double classification approach should be used. Delegates to the abstract
468 * {@link #createFilteringTokenWalker(RepresentationFilter)}method if
469 * sentence filtering should be used.
470 *
471 * @return the created walker
472 * @throws ProcessingException if an error occurs while initializing the
473 * walker
474 */
475 private TokenWalker createTokenWalker() throws ProcessingException {
476 if (isSentenceFiltering()) {
477
478 return createFilteringTokenWalker(sentenceFilter);
479 } else {
480
481 return new TokenWalker(this, getFactory());
482 }
483 }
484
485 /***
486 * Evaluates precision and recall for {@linkplain #isSentenceFiltering()
487 * sentence filtering} on the last processed document.
488 *
489 * @param embeddingElements the correct set of embedding elements
490 * @return the calculated statistics for sentence filtering on the last
491 * document; <code>null</code> if {@linkplain #isSentenceFiltering()
492 * sentence filtering} is disabled
493 */
494 protected FMetricsView evaluateSentenceFiltering(
495 final EmbeddingElements embeddingElements) {
496 if (isSentenceFiltering() && (walker != null)) {
497
498 return FilterEvaluator.evaluate(embeddingElements,
499 (FilteringTokenWalker) walker);
500 } else {
501 return null;
502 }
503 }
504
505 /***
506 * Runs a document though the list of {@link #getDocumentRewriters()
507 * document filters} (if any) to modify it.
508 *
509 * @param orgDocument the original document
510 * @param filename the file name of the document
511 * @return the resulting document as modified by the filters
512 * @throws IOException if an I/O error occurs during filtering
513 * @throws ProcessingException if a processing error occurs during filtering
514 */
515 protected Document filterDocument(final Document orgDocument,
516 final File filename)
517 throws IOException, ProcessingException {
518 if ((documentRewriters != null) && (documentRewriters.length > 0)) {
519 Document doc = orgDocument;
520
521
522 for (int i = 0; i < documentRewriters.length; i++) {
523 doc = documentRewriters[i].rewrite(doc, filename);
524 }
525
526 return doc;
527 } else {
528
529 return orgDocument;
530 }
531 }
532
533 /***
534 * Returns the set of candidate classes to consider for the current element
535 * for each classifier.
536 *
537 * @return the value of the attribute
538 */
539 protected Set[] getActiveClasses() {
540 return activeClasses;
541 }
542
543 /***
544 * Returns the array of classifiers used for the local classification
545 * decisions.
546 *
547 * @return the local classifier
548 */
549 public Classifier[] getClassifiers() {
550 return classifiers;
551 }
552
553 /***
554 * Returns the list of context details representing all tokens in the
555 * current document.
556 *
557 * @return the list of context details
558 */
559 protected List<ContextDetails> getContextDetails() {
560 return contextDetails;
561 }
562
563 /***
564 * Returns the list of document processors that are invoked to modify
565 * the XML representations of the documents to process, e.g. by adding
566 * semantic information such as named-entity predictions.
567 *
568 * @return the list of filters used, might be empty
569 */
570 protected DocumentRewriter[] getDocumentRewriters() {
571 return documentRewriters;
572 }
573
574 /***
575 * Returns the factory used to instantiate tokenizers.
576 *
577 * @return the value of the attribute
578 */
579 public TokenizerFactory getFactory() {
580 return factory;
581 }
582
583 /***
584 * Returns the object used to count documents, contexts, and features and to
585 * calculate averages.
586 *
587 * @return the used feature count
588 */
589
590
591
592
593 /***
594 * Returns vector of features representing the currently processed element.
595 *
596 * @return the value of the attribute
597 */
598 protected FeatureVector getFeatures() {
599 return features;
600 }
601
602 /***
603 * Returns the buffer of preceding
604 * {@link de.fu_berlin.ties.context.Recognition}s from the current
605 * document.
606 *
607 * @return the buffer
608 */
609 public PriorRecognitions getPriorRecognitions() {
610 return priorRecognitions;
611 }
612
613 /***
614 * Returns the re-estimator chain.
615 *
616 * @return the last element of the re-estimator chain, or <code>null</code>
617 * if the chain is empty
618 */
619 protected Reestimator getReestimator() {
620 return reestimator;
621 }
622
623 /***
624 * Returns an optional re-extractor that can modify extractions in any
625 * suitable way.
626 *
627 * @return the re-extractor used; may be <code>null</code>
628 */
629 protected FinalReextractor getReextractor() {
630 return reextractor;
631 }
632
633 /***
634 * Returns the context representation used for local classifications.
635 *
636 * @return the context representation
637 */
638 public Representation getRepresentation() {
639 return representation;
640 }
641
642 /***
643 * Returns the filter used in the first step of a double classification
644 * approach ("sentence filtering").
645 *
646 * @return the node filter, or <code>null</code> if no sentence filtering
647 * is used
648 */
649 protected TrainableFilter getSentenceFilter() {
650 return sentenceFilter;
651 }
652
653 /***
654 * Returns the combination strategy used.
655 *
656 * @return the combination strategy
657 */
658 protected CombinationStrategy getStrategy() {
659 return strategy;
660 }
661
662 /***
663 * Returns the target structure specifying the classes to recognize.
664 *
665 * @return the used target structure
666 */
667 public TargetStructure getTargetStructure() {
668 return targetStructure;
669 }
670
671 /***
672 * Returns the token walker used to walk thru documents.
673 *
674 * @return the token walker
675 */
676 protected TokenWalker getWalker() {
677 return walker;
678 }
679
680 /***
681 * Initializes the fields used for processing a document (feature cache,
682 * buffer of prior recognitions, token walker, and statistics) and resets
683 * the combination strategy.
684 *
685 * @param filename the name of the document
686 * @throws ProcessingException if an error occurs while initializing
687 * @throws IOException if an I/O error occurs
688 */
689 protected void initFields(final File filename)
690 throws ProcessingException, IOException {
691 featureCache = new HashMap<Element, List<LocalFeature>>();
692 priorRecognitions = representation.initDocument(filename, getFactory());
693 walker = createTokenWalker();
694
695 strategy.reset();
696 contextDetails.clear();
697 }
698
699 /***
700 * Checks whether a token is relevant for training and extraction.
701 * Tokens containing only punctuation or symbol characters are considered
702 * irrevelant unless they have been {@linkplain #markRelevant(String)
703 * marked to be relevant}.
704 *
705 * @param token the token to check
706 * @return <code>true</code> if the is relevant for training and
707 * extraction; <code>false</code> it is can be ignored
708 */
709 protected boolean isRelevant(final String token) {
710
711 return !TextUtils.punctuationOrSymbol(token)
712 || relevantPunctuation.contains(token);
713 }
714
715 /***
716 * Whether this instance uses sentence filtering (classification of relevant
717 * versus irrelevant sentences in a double classification approach).
718 *
719 * @return <code>true</code> if sentence filtering is used
720 */
721 public boolean isSentenceFiltering() {
722 return sentenceFilter != null;
723 }
724
725 /***
726 * Marks a punctuation token as relevant for classification
727 * ((because it did occur as the first or last token of an extraction).
728 *
729 * @param token the token to mark as relevant
730 */
731 protected void markRelevant(final String token) {
732 relevantPunctuation.add(token);
733 }
734
735 /***
736 * Reset the combination strategy, handling the boolean result value
737 * in an appropriate way.
738 */
739 protected abstract void resetStrategy();
740
741 /***
742 * {@inheritDoc}
743 */
744 public void skip() {
745
746
747
748 }
749
750 /***
751 * Returns a string representation of this object.
752 *
753 * @return a textual representation
754 */
755 public String toString() {
756 final ToStringBuilder result = new ToStringBuilder(this)
757
758 .append("classifiers", classifiers);
759
760
761
762 if (reestimator != null) {
763 result.append("re-estimator", reestimator);
764 }
765 if (reextractor != null) {
766 result.append("re-extractor", reextractor);
767 }
768
769
770
771 return result.toString();
772 }
773
774 /***
775 * Helper that build the {@linkplain #getFeatures() features}and determines
776 * the {@linkplain #getActiveClasses() active classes}for an element.
777 *
778 * @param element the element to process
779 * @param leftText textual content to the left of (preceding)
780 * <code>mainText</code>, might be empty
781 * @param mainText the main textual content to represent, might be empty
782 * @param rightText textual content to the right of (following)
783 * <code>mainText</code>, might be empty
784 */
785 protected void updateState(final Element element, final String leftText,
786 final String mainText, final String rightText) {
787
788 features = getRepresentation().buildContext(element, leftText,
789 mainText, rightText, priorRecognitions, featureCache, "Token");
790
791
792
793 activeClasses = getStrategy().activeClasses();
794
795 }
796
797 /***
798 * Returns a read-only view on the counted documents, contexts, and features
799 * and the calculated averages. This is not a snapshot but will change
800 * whenever the a document is processed.
801 *
802 * @return a view on the counts and averages
803 */
804
805
806
807
808 /***
809 * Returns a read-only view on the set of punctuation tokens that have been
810 * found to be relevant for token classification (because they sometimes
811 * occur as the first or last token of an extraction).
812 *
813 * @return a read-only view on the relevant punctuation
814 */
815 public Set<String> viewRelevantPunctuation() {
816 return Collections.unmodifiableSet(relevantPunctuation);
817 }
818
819 }