22 package de.fu_berlin.ties.extract;
23
24 import java.io.File;
25 import java.util.Collections;
26 import java.util.HashMap;
27 import java.util.List;
28 import java.util.Map;
29 import java.util.Set;
30
31 import org.apache.commons.lang.builder.ToStringBuilder;
32 import org.dom4j.Element;
33 import org.dom4j.NodeFilter;
34
35 import de.fu_berlin.ties.combi.CombinationStrategy;
36 import de.fu_berlin.ties.DocumentReader;
37 import de.fu_berlin.ties.ProcessingException;
38 import de.fu_berlin.ties.TiesConfiguration;
39
40 import de.fu_berlin.ties.classify.Classifier;
41 import de.fu_berlin.ties.classify.Reranker;
42 import de.fu_berlin.ties.classify.TrainableClassifier;
43 import de.fu_berlin.ties.classify.feature.FeatureVector;
44 import de.fu_berlin.ties.context.DefaultRepresentation;
45 import de.fu_berlin.ties.context.LocalFeature;
46 import de.fu_berlin.ties.context.PriorRecognitions;
47 import de.fu_berlin.ties.context.Representation;
48 import de.fu_berlin.ties.eval.FMetricsView;
49 import de.fu_berlin.ties.eval.FeatureCount;
50 import de.fu_berlin.ties.eval.FeatureCountView;
51 import de.fu_berlin.ties.filter.EmbeddingElements;
52 import de.fu_berlin.ties.filter.FilterEvaluator;
53 import de.fu_berlin.ties.filter.FilteringTokenWalker;
54 import de.fu_berlin.ties.filter.RepresentationFilter;
55 import de.fu_berlin.ties.filter.SkipHandler;
56 import de.fu_berlin.ties.filter.TrainableFilter;
57 import de.fu_berlin.ties.text.TextUtils;
58 import de.fu_berlin.ties.text.TokenizerFactory;
59 import de.fu_berlin.ties.util.CollectionUtils;
60 import de.fu_berlin.ties.util.Util;
61 import de.fu_berlin.ties.xml.dom.ElementNameFilter;
62 import de.fu_berlin.ties.xml.dom.TokenProcessor;
63 import de.fu_berlin.ties.xml.dom.TokenWalker;
64
65 /***
66 * Common code base shared by {@link de.fu_berlin.ties.extract.Extractor}and
67 * {@link de.fu_berlin.ties.extract.Trainer}.
68 * <p>
69 * Instances of subclasses are not thread-safe and cannot process several
70 * documents in parallel.
71 *
72 * @author Christian Siefkes
73 * @version $Revision: 1.39 $, $Date: 2004/12/07 12:01:48 $, $Author: siefkes $
74 */
75 public abstract class ExtractorBase extends DocumentReader implements
76 SkipHandler, TokenProcessor {
77
78 /***
79 * Configuration key: List of elements to filter.
80 */
81 public static final String CONFIG_ELEMENTS = "filter.elements";
82
83 /***
84 * Configuration key: List of elements that should be avoided when filtering
85 * (using parent element instead).
86 */
87 public static final String CONFIG_AVOID = "filter.avoid";
88
89 /***
90 * Configuration key: list of punctuation and symbol tokens that are
91 * considered as relevant from the very start.
92 */
93 public static final String CONFIG_RELEVANT_PUNCTUATION =
94 "extract.punctuation.relevant";
95
96 /***
97 * Configuration suffix/prefix used for sentence filtering.
98 */
99 public static final String CONFIG_SENTENCE = "sent";
100
101 /***
102 * Helper methat that initializes the filter to be used for the first step
103 * of a double classification approach ("sentence filtering").
104 *
105 * @param conf the filter is initialized from the "filter" parameters in
106 * this configuration
107 * @param representation the representation to use
108 * @return the created filter; or <code>null</code> if no sentence
109 * filtering should be used
110 * @throws ProcessingException if an error occurs while creating the filter
111 */
112 public static TrainableFilter createSentenceFilter(
113 final TiesConfiguration conf, final Representation representation)
114 throws ProcessingException {
115 final RepresentationFilter result;
116
117 if (conf.containsKey(CONFIG_ELEMENTS)) {
118 final String[] filteredElements =
119 conf.getStringArray(CONFIG_ELEMENTS);
120
121 if (!TiesConfiguration.arrayIsEmpty(filteredElements)) {
122
123 final NodeFilter positiveFilter = new ElementNameFilter(
124 filteredElements);
125 final NodeFilter negativeFilter = new ElementNameFilter(
126 conf.getStringArray(CONFIG_AVOID));
127
128
129
130 final Reranker reranker =
131 new Reranker(conf.subset(CONFIG_SENTENCE));
132 result = new RepresentationFilter(conf, positiveFilter,
133 negativeFilter, reranker, representation, "Sentence");
134 Util.LOG.debug("Initialized representation filter for sentence "
135 + "filtering: " + result);
136 } else {
137 result = null;
138 }
139 } else {
140 result = null;
141 }
142
143 return result;
144 }
145
146 /***
147 * The classifier(s) used for the local classification decisions.
148 */
149 private final Classifier[] classifiers;
150
151 /***
152 * The context representation used for local classifications.
153 */
154 private final Representation representation;
155
156 /***
157 * The target structure specifying the classes to recognize.
158 */
159 private final TargetStructure targetStructure;
160
161 /***
162 * Used to instantiate tokenizers.
163 */
164 private final TokenizerFactory factory;
165
166 /***
167 * The filter used in the first step of a double classification approach
168 * ("sentence filtering"); if <code>null</code>, no sentence filtering is
169 * used.
170 */
171 private final TrainableFilter sentenceFilter;
172
173 /***
174 * The set of candidate classes to consider for the current element for each
175 * classifier.
176 */
177 private Set[] activeClasses;
178
179 /***
180 * The feature cache used by the context representation.
181 */
182 private Map<Element, List<LocalFeature>> featureCache;
183
184 /***
185 * The vector of features representing the currently processed element.
186 */
187 private FeatureVector features;
188
189 /***
190 * Used to count documents, contexts, and features and to calculate
191 * averages.
192 */
193 private final FeatureCount featureCount = new FeatureCount();
194
195 /***
196 * A set of punctuation tokens that have been found to be relevant for
197 * token classification (because they sometimes occur as the first or
198 * last token of an extraction).
199 */
200 private final Set<String> relevantPunctuation;
201
202 /***
203 * A buffer of preceding {@link de.fu_berlin.ties.context.Recognition}s
204 * from the current document.
205 */
206 private PriorRecognitions priorRecognitions;
207
208 /***
209 * The combination strategy used.
210 */
211 private final CombinationStrategy strategy;
212
213 /***
214 * Used to walk thru documents.
215 */
216 private TokenWalker walker;
217
218 /***
219 * Creates a new instance. Delegates to
220 * {@link #ExtractorBase(String, TiesConfiguration)}using the
221 * {@linkplain TiesConfiguration#CONF standard configuration}.
222 *
223 * @param outExt the extension to use for output files
224 * @throws IllegalArgumentException if the combination strategy cannot be
225 * initialized (cf.
226 * {@link CombinationStrategy#createStrategy(Set, TiesConfiguration)})
227 * @throws ProcessingException if an error occurs during initialization
228 */
229 public ExtractorBase(final String outExt) throws IllegalArgumentException,
230 ProcessingException {
231 this(outExt, TiesConfiguration.CONF);
232 }
233
234 /***
235 * Creates a new instance, configuring target structure, classifier,
236 * {@link DefaultRepresentation}, node filter and combination strategy from
237 * the provided configuration.
238 *
239 * @param outExt the extension to use for output files
240 * @param config the configuration to use
241 * @throws IllegalArgumentException if the combination strategy cannot be
242 * initialized (cf.
243 * {@link CombinationStrategy#createStrategy(Set, TiesConfiguration)})
244 * @throws ProcessingException if an error occurs during initialization
245 */
246 public ExtractorBase(final String outExt, final TiesConfiguration config)
247 throws IllegalArgumentException, ProcessingException {
248 this(outExt, null, config);
249 }
250
251 /***
252 * Creates a new instance, configuring target structure, classifier,
253 * {@link DefaultRepresentation}, node filter, combination strategy and
254 * tokenizer factory from the provided configuration.
255 *
256 * @param outExt the extension to use for output files
257 * @param runDirectory the directory to run the classifier in; used instead
258 * of the
259 * {@linkplain de.fu_berlin.ties.classify.ExternalClassifier#CONFIG_DIR
260 * configured directory} if not <code>null</code>
261 * @param config the configuration to use
262 * @throws IllegalArgumentException if the combination strategy cannot be
263 * initialized (cf.
264 * {@link CombinationStrategy#createStrategy(Set, TiesConfiguration)})
265 * @throws ProcessingException if an error occurs during initialization
266 */
267 public ExtractorBase(final String outExt, final File runDirectory,
268 final TiesConfiguration config) throws IllegalArgumentException,
269 ProcessingException {
270 super(outExt, config);
271 targetStructure = new TargetStructure(config);
272 representation = new DefaultRepresentation(config);
273 strategy = CombinationStrategy.createStrategy(
274 targetStructure.getClassNames(), config);
275 sentenceFilter = createSentenceFilter(config, representation);
276
277
278 final Set<String>[] allClasses = strategy.allClasses();
279 classifiers = new Classifier[allClasses.length];
280
281 for (int i = 0; i < allClasses.length; i++) {
282 classifiers[i] = TrainableClassifier.createClassifier(
283 allClasses[i], runDirectory, config);
284 }
285
286
287 relevantPunctuation = CollectionUtils.arrayAsSet(
288 config.getStringArray(CONFIG_RELEVANT_PUNCTUATION));
289 Util.LOG.debug("Initialized set of relevant punctuation + symbol "
290 + "tokens to " + relevantPunctuation);
291 factory = new TokenizerFactory(config);
292 }
293
294 /***
295 * Creates a new instance.
296 *
297 * @param outExt the extension to use for output files
298 * @param targetStruct the target structure specifying the classes to
299 * recognize
300 * @param theClassifiers the array of classifiers to use for the local
301 * classification decisions
302 * @param theRepresentation the context representation to use for local
303 * classifications
304 * @param combiStrat the combination strategy to use
305 * @param tFactory used to instantiate tokenizers
306 * @param sentFilter the filter used in the first step of a double
307 * classification approach ("sentence filtering"); if <code>null</code>,
308 * no sentence filtering is used
309 * @param relevantPunct a set of punctuation tokens that have been found to
310 * be relevant for token classification; might be empty but not
311 * <code>null</code>
312 * @param config used to configure superclasses; if <code>null</code>,
313 * the {@linkplain TiesConfiguration#CONF standard configuration}is used
314 */
315 public ExtractorBase(final String outExt,
316 final TargetStructure targetStruct,
317 final Classifier[] theClassifiers,
318 final Representation theRepresentation,
319 final CombinationStrategy combiStrat,
320 final TokenizerFactory tFactory, final TrainableFilter sentFilter,
321 final Set<String> relevantPunct, final TiesConfiguration config) {
322 super(outExt, config);
323 targetStructure = targetStruct;
324 classifiers = theClassifiers;
325 representation = theRepresentation;
326 strategy = combiStrat;
327 factory = tFactory;
328 sentenceFilter = sentFilter;
329 relevantPunctuation = relevantPunct;
330 Util.LOG.debug("Initialized set of relevant punctuation + symbol "
331 + "tokens to " + relevantPunctuation);
332 }
333
334 /***
335 * Creates a filtering token walker to be used for walking through a
336 * document and sentence classification if a double classification approach
337 * is used.
338 *
339 * @param repFilter the trainable filter to use
340 * @return the created walker
341 */
342 protected abstract FilteringTokenWalker createFilteringTokenWalker(
343 final TrainableFilter repFilter);
344
345 /***
346 * Helper method that creates the token walker to use for walking through a
347 * document. This walker automatically handles sentence filtering if a
348 * double classification approach should be used. Delegates to the abstract
349 * {@link #createFilteringTokenWalker(RepresentationFilter)}method if
350 * sentence filtering should be used.
351 *
352 * @return the created walker
353 * @throws ProcessingException if an error occurs while initializing the
354 * walker
355 */
356 private TokenWalker createTokenWalker() throws ProcessingException {
357 if (isSentenceFiltering()) {
358
359 return createFilteringTokenWalker(sentenceFilter);
360 } else {
361
362 return new TokenWalker(this, getFactory());
363 }
364 }
365
366 /***
367 * Evaluates precision and recall for {@linkplain #isSentenceFiltering()
368 * sentence filtering} on the last processed document.
369 *
370 * @param embeddingElements the correct set of embedding elements
371 * @return the calculated statistics for sentence filtering on the last
372 * document; <code>null</code> if {@linkplain #isSentenceFiltering()
373 * sentence filtering} is disabled
374 */
375 protected FMetricsView evaluateSentenceFiltering(
376 final EmbeddingElements embeddingElements) {
377 if (isSentenceFiltering() && (walker != null)) {
378
379 return FilterEvaluator.evaluate(embeddingElements,
380 (FilteringTokenWalker) walker);
381 } else {
382 return null;
383 }
384 }
385
386 /***
387 * Returns the set of candidate classes to consider for the current element
388 * for each classifier.
389 *
390 * @return the value of the attribute
391 */
392 protected Set[] getActiveClasses() {
393 return activeClasses;
394 }
395
396 /***
397 * Returns the array of classifiers used for the local classification
398 * decisions.
399 *
400 * @return the local classifier
401 */
402 public Classifier[] getClassifiers() {
403 return classifiers;
404 }
405
406 /***
407 * Returns the factory used to instantiate tokenizers.
408 *
409 * @return the value of the attribute
410 */
411 public TokenizerFactory getFactory() {
412 return factory;
413 }
414
415 /***
416 * Returns the object used to count documents, contexts, and features and to
417 * calculate averages.
418 *
419 * @return the used feature count
420 */
421 public FeatureCount getFeatureCount() {
422 return featureCount;
423 }
424
425 /***
426 * Returns vector of features representing the currently processed element.
427 *
428 * @return the value of the attribute
429 */
430 protected FeatureVector getFeatures() {
431 return features;
432 }
433
434 /***
435 * Returns the buffer of preceding
436 * {@link de.fu_berlin.ties.context.Recognition}s from the current
437 * document.
438 *
439 * @return the buffer
440 */
441 public PriorRecognitions getPriorRecognitions() {
442 return priorRecognitions;
443 }
444
445 /***
446 * Returns the context representation used for local classifications.
447 *
448 * @return the context representation
449 */
450 public Representation getRepresentation() {
451 return representation;
452 }
453
454 /***
455 * Returns the filter used in the first step of a double classification
456 * approach ("sentence filtering").
457 *
458 * @return the node filter, or <code>null</code> if no sentence filtering
459 * is used
460 */
461 protected TrainableFilter getSentenceFilter() {
462 return sentenceFilter;
463 }
464
465 /***
466 * Returns the combination strategy used.
467 *
468 * @return the combination strategy
469 */
470 protected CombinationStrategy getStrategy() {
471 return strategy;
472 }
473
474 /***
475 * Returns the target structure specifying the classes to recognize.
476 *
477 * @return the used target structure
478 */
479 public TargetStructure getTargetStructure() {
480 return targetStructure;
481 }
482
483 /***
484 * Returns the token walker used to walk thru documents.
485 *
486 * @return the token walker
487 */
488 protected TokenWalker getWalker() {
489 return walker;
490 }
491
492 /***
493 * Initializes the fields used for processing a document (feature cache,
494 * buffer of prior recognitions, token walker, and statistics) and resets
495 * the combination strategy.
496 *
497 * @throws ProcessingException if an error occurs while initializing
498 */
499 protected void initFields() throws ProcessingException {
500 featureCache = new HashMap<Element, List<LocalFeature>>();
501 priorRecognitions = representation.createRecognitionBuffer();
502 walker = createTokenWalker();
503 featureCount.countDocument();
504 strategy.reset();
505 }
506
507 /***
508 * Checks whether a token is relevant for training and extraction.
509 * Tokens containing only punctuation or symbol characters are considered
510 * irrevelant unless they have been {@linkplain #markRelevant(String)
511 * marked to be relevant}.
512 *
513 * @param token the token to check
514 * @return <code>true</code> if the is relevant for training and
515 * extraction; <code>false</code> it is can be ignored
516 */
517 protected boolean isRelevant(final String token) {
518
519 return !TextUtils.punctuationOrSymbol(token)
520 || relevantPunctuation.contains(token);
521 }
522
523 /***
524 * Whether this instance uses sentence filtering (classification of relevant
525 * versus irrelevant sentences in a double classification approach).
526 *
527 * @return <code>true</code> if sentence filtering is used
528 */
529 public boolean isSentenceFiltering() {
530 return sentenceFilter != null;
531 }
532
533 /***
534 * Marks a punctuation token as relevant for classification
535 * ((because it did occur as the first or last token of an extraction).
536 *
537 * @param token the token to mark as relevant
538 */
539 protected void markRelevant(final String token) {
540 relevantPunctuation.add(token);
541 }
542
543 /***
544 * Reset the combination strategy, handling the boolean result value
545 * in an appropriate way.
546 */
547 protected abstract void resetStrategy();
548
549 /***
550 * {@inheritDoc}
551 */
552 public void skip() {
553
554
555
556 }
557
558 /***
559 * Returns a string representation of this object.
560 *
561 * @return a textual representation
562 */
563 public String toString() {
564 return new ToStringBuilder(this).appendSuper(super.toString())
565 .append("classifiers", classifiers)
566 .append("representation", representation)
567 .append("target structure", targetStructure)
568 .append("combination strategy", strategy)
569 .append("sentence filter", sentenceFilter)
570 .append("active classes", activeClasses)
571 .append("prior recognitions", priorRecognitions)
572 .toString();
573 }
574
575 /***
576 * Helper that build the {@linkplain #getFeatures() features}and determines
577 * the {@linkplain #getActiveClasses() active classes}for an element.
578 *
579 * @param element the element to process
580 * @param leftText textual content to the left of (preceding)
581 * <code>mainText</code>, might be empty
582 * @param mainText the main textual content to represent, might be empty
583 * @param rightText textual content to the right of (following)
584 * <code>mainText</code>, might be empty
585 */
586 protected void updateState(final Element element, final String leftText,
587 final String mainText, final String rightText) {
588
589 features = getRepresentation().buildContext(element, leftText,
590 mainText, rightText, priorRecognitions, featureCache, "Token");
591 featureCount.update(features);
592
593
594 activeClasses = getStrategy().activeClasses();
595
596 }
597
598 /***
599 * Returns a read-only view on the counted documents, contexts, and features
600 * and the calculated averages. This is not a snapshot but will change
601 * whenever the a document is processed.
602 *
603 * @return a view on the counts and averages
604 */
605 public FeatureCountView viewFeatureCount() {
606 return featureCount;
607 }
608
609 /***
610 * Returns a read-only view on the set of punctuation tokens that have been
611 * found to be relevant for token classification (because they sometimes
612 * occur as the first or last token of an extraction).
613 *
614 * @return a read-only view on the relevant punctuation
615 */
616 public Set<String> viewRelevantPunctuation() {
617 return Collections.unmodifiableSet(relevantPunctuation);
618 }
619
620 }