22 package de.fu_berlin.ties.extract;
23
24 import java.io.File;
25 import java.io.IOException;
26 import java.io.Writer;
27 import java.util.ArrayList;
28 import java.util.Iterator;
29 import java.util.List;
30 import java.util.Set;
31
32 import org.dom4j.Document;
33 import org.dom4j.Element;
34
35 import de.fu_berlin.ties.combi.CombinationState;
36 import de.fu_berlin.ties.combi.CombinationStrategy;
37 import de.fu_berlin.ties.ContextMap;
38 import de.fu_berlin.ties.ProcessingException;
39 import de.fu_berlin.ties.TiesConfiguration;
40
41 import de.fu_berlin.ties.classify.Classifier;
42 import de.fu_berlin.ties.classify.Prediction;
43 import de.fu_berlin.ties.classify.PredictionDistribution;
44 import de.fu_berlin.ties.classify.Probability;
45 import de.fu_berlin.ties.classify.Reranker;
46 import de.fu_berlin.ties.context.ContextDetails;
47 import de.fu_berlin.ties.context.Recognition;
48 import de.fu_berlin.ties.context.Representation;
49 import de.fu_berlin.ties.eval.FMetricsView;
50 import de.fu_berlin.ties.extract.amend.FinalReextractor;
51 import de.fu_berlin.ties.extract.reestimate.Reestimator;
52 import de.fu_berlin.ties.filter.DocumentRewriter;
53 import de.fu_berlin.ties.filter.EmbeddingElements;
54 import de.fu_berlin.ties.filter.FilteringTokenWalker;
55 import de.fu_berlin.ties.filter.TrainableFilter;
56 import de.fu_berlin.ties.io.FieldContainer;
57 import de.fu_berlin.ties.text.TokenDetails;
58 import de.fu_berlin.ties.text.TokenizerFactory;
59 import de.fu_berlin.ties.util.Util;
60
61 /***
62 * An extractor runs a local {@link de.fu_berlin.ties.classify.Classifier}
63 * on a list of items/nodes and combines their results using a
64 * {@link de.fu_berlin.ties.combi.CombinationStrategy}.
65 *
66 * <p>Instances of this class are not thread-safe and cannot extract from
67 * several documents in parallel.
68 *
69 * @author Christian Siefkes
70 * @version $Revision: 1.61 $, $Date: 2006/10/21 16:04:13 $, $Author: siefkes $
71 */
72 public class Extractor extends ExtractorBase {
73
    /***
     * The recommended file extension ("ext") to use for storing serialized
     * extractions.
     */
    public static final String EXT_EXTRACTIONS = "ext";
78
79
    /***
     * Helper method that creates the reranker used by all constructors.
     * The reranker is configured from the "extract" prefix of the provided
     * configuration. E.g. an entry such as "extract.bias.A = 0.99" could be
     * used to introduce a small malus for the background class used in most
     * combination strategies, thus favoring recall over precision.
     *
     * @param config the configuration to use
     * @return the initialized reranker
     */
    private static Reranker createReranker(final TiesConfiguration config) {
        return new Reranker(config.subset("extract"));
    }
92
93
    /***
     * The last document processed by this instance; used by
     * {@link #evaluateSentenceFiltering(ExtractionContainer)}.
     */
    private Document lastDocument;

    /***
     * The extraction container used for storing the predicted extractions
     * of the document currently being processed.
     */
    private ExtractionContainer predictedExtractions;

    /***
     * A list of punctuation tokens collected between non-punctuation tokens,
     * buffered so they can be appended to an extraction if it is continued.
     */
    private final List<TokenDetails> punctuationDetails =
        new ArrayList<TokenDetails>();

    /***
     * An optional reranker that recalculates probabilities to introduce a
     * bias. This can be used to favor recall over precision (by setting a
     * bias &lt; 1 for the background class) etc.
     */
    private final Reranker reranker;
116
117
    /***
     * Creates a new instance using the default extension
     * ({@link #EXT_EXTRACTIONS}). Delegates to {@link #Extractor(String)},
     * which in turn uses the
     * {@linkplain TiesConfiguration#CONF standard configuration}.
     *
     * @throws IllegalArgumentException if the combination strategy cannot be
     * initialized (cf. {@link CombinationStrategy#createStrategy(java.util.Set,
     * TiesConfiguration)})
     * @throws ProcessingException if an error occurs during initialization
     */
    public Extractor() throws IllegalArgumentException, ProcessingException {
        this(EXT_EXTRACTIONS);
    }
131
    /***
     * Creates a new instance with the given output extension. Delegates to
     * {@link #Extractor(String, TiesConfiguration)} using the
     * {@linkplain TiesConfiguration#CONF standard configuration}.
     *
     * @param outExt the extension to use for output files
     * @throws IllegalArgumentException if the combination strategy cannot be
     * initialized (cf. {@link CombinationStrategy#createStrategy(java.util.Set,
     * TiesConfiguration)})
     * @throws ProcessingException if an error occurs during initialization
     */
    public Extractor(final String outExt)
            throws IllegalArgumentException, ProcessingException {
        this(outExt, TiesConfiguration.CONF);
    }
147
    /***
     * Creates a new instance. Delegates to the corresponding
     * {@linkplain ExtractorBase#ExtractorBase(String, TiesConfiguration) super
     * constructor} to configure the fields, then initializes the
     * {@linkplain #createReranker(TiesConfiguration) reranker}.
     *
     * @param outExt the extension to use for output files
     * @param config the configuration to use
     * @throws IllegalArgumentException if the combination strategy cannot be
     * initialized
     * (cf. {@link CombinationStrategy#createStrategy(java.util.Set,
     * TiesConfiguration)})
     * @throws ProcessingException if an error occurs during initialization
     */
    public Extractor(final String outExt, final TiesConfiguration config)
            throws IllegalArgumentException, ProcessingException {
        super(outExt, config);
        reranker = createReranker(config);
    }
166
    /***
     * Creates a new instance. Delegates to the corresponding
     * {@linkplain ExtractorBase#ExtractorBase(String, File, TiesConfiguration)
     * super constructor} to configure the fields, then initializes the
     * {@linkplain #createReranker(TiesConfiguration) reranker}.
     *
     * @param outExt the extension to use for output files
     * @param runDirectory the directory to run the classifier in; used instead
     * of the
     * {@linkplain de.fu_berlin.ties.classify.ExternalClassifier#CONFIG_DIR
     * configured directory} if not <code>null</code>
     * @param config the configuration to use
     * @throws IllegalArgumentException if the combination strategy cannot be
     * initialized
     * (cf. {@link CombinationStrategy#createStrategy(java.util.Set,
     * TiesConfiguration)})
     * @throws ProcessingException if an error occurs during initialization
     */
    public Extractor(final String outExt, final File runDirectory,
            final TiesConfiguration config)
            throws IllegalArgumentException, ProcessingException {
        super(outExt, runDirectory, config);
        reranker = createReranker(config);
    }
190
    /***
     * Creates a new instance, re-using the components from the provided
     * trainer. The components are taken directly from the trainer's
     * accessors, so they are shared between the trainer and this extractor;
     * only the reranker is created freshly from the trainer's configuration.
     *
     * @param outExt the extension to use for output files
     * @param trainer trainer whose components should be re-used
     */
    public Extractor(final String outExt, final Trainer trainer) {
        this(outExt, trainer.getTargetStructure(), trainer.getClassifiers(),
            trainer.getRepresentation(), trainer.getStrategy(),
            trainer.getReextractor(), trainer.getFactory(),
            trainer.getReestimator(), trainer.getDocumentRewriters(),
            trainer.getSentenceFilter(), createReranker(trainer.getConfig()),
            trainer.viewRelevantPunctuation(), trainer.getConfig());
    }
206
    /***
     * Creates a new instance from explicitly provided components.
     *
     * @param outExt the extension to use for output files
     * @param targetStruct the target structure specifying the classes to
     * recognize
     * @param theClassifiers the classifiers to use for the local
     * classification decisions
     * @param theRepresentation the context representation to use for local
     * classifications
     * @param combiStrat the combination strategy to use
     * @param reextract an optional re-extractor that can modify extractions
     * in any suitable way
     * @param tFactory used to instantiate tokenizers
     * @param estimator the last element of the re-estimator chain, or
     * <code>null</code> if the chain is empty
     * @param docFilters a list (possibly empty) of document processors that
     * are invoked to modify the XML representations of the documents to
     * process
     * @param sentFilter the filter used in the first step of a double
     * classification approach ("sentence filtering"); if <code>null</code>,
     * no sentence filtering is used
     * @param rerank a reranker that recalculates probabilities to
     * introduce a bias (can be used to favor recall over precision, by
     * setting a bias &lt; 1 for the background class, etc.); must not be
     * <code>null</code>
     * @param relevantPunct a set of punctuation tokens that have been found
     * to be relevant for token classification; might be empty but not
     * <code>null</code>
     * @param config used to configure superclasses; if <code>null</code>,
     * the {@linkplain TiesConfiguration#CONF standard configuration} is used
     */
    public Extractor(final String outExt, final TargetStructure targetStruct,
            final Classifier[] theClassifiers,
            final Representation theRepresentation,
            final CombinationStrategy combiStrat,
            final FinalReextractor reextract, final TokenizerFactory tFactory,
            final Reestimator estimator, final DocumentRewriter[] docFilters,
            final TrainableFilter sentFilter, final Reranker rerank,
            final Set<String> relevantPunct, final TiesConfiguration config) {
        super(outExt, targetStruct, theClassifiers, theRepresentation,
            combiStrat, reextract, tFactory, estimator, docFilters, sentFilter,
            relevantPunct, config);
        reranker = rerank;
    }
251
252
    /***
     * Adds an element to the collected punctuation details, buffering it
     * until it is either appended to an extraction or discarded.
     *
     * @param details the element to add
     */
    protected void addPunctuationDetails(final TokenDetails details) {
        punctuationDetails.add(details);
    }
261
262 /***
263 * Appends the collected punctuation details (if any) to the provided
264 * extraction. Finally delegates to {@link #clearPunctuation()}
265 * to dleetes the processed punctuation.
266 *
267 * @param ext the extraction to append to
268 */
269 protected void appendPunctuation(final Extraction ext) {
270 if (!punctuationDetails.isEmpty()) {
271 final Iterator<TokenDetails> detailsIter =
272 punctuationDetails.iterator();
273 TokenDetails currentDetails;
274
275
276 while (detailsIter.hasNext()) {
277 currentDetails = detailsIter.next();
278 ext.addToken(currentDetails, null, true);
279 }
280
281
282 clearPunctuation();
283 }
284 }
285
    /***
     * Clears the collected punctuation details, discarding any tokens that
     * were buffered but not appended to an extraction.
     */
    protected void clearPunctuation() {
        punctuationDetails.clear();
    }
292
    /***
     * {@inheritDoc}
     *
     * <p>This implementation passes itself twice to the created walker —
     * presumably once as the token processor and once as a callback handler;
     * confirm against the {@link FilteringTokenWalker} constructor.
     */
    protected FilteringTokenWalker createFilteringTokenWalker(
            final TrainableFilter repFilter) {
        return new FilteringTokenWalker(this, getFactory(), repFilter, this);
    }
300
    /***
     * Helper method that discards the last extraction, removing it from both
     * the {@link #getPriorRecognitions() prior recognitions} and the
     * {@link #getPredictedExtractions() predicted extractions}.
     *
     * @throws IllegalStateException if the elements removed from the two
     * collections are not equal (sanity check)
     */
    private void discardLastExtraction() {
        final Extraction removedFromContainer =
            getPredictedExtractions().removeLast();
        final Recognition removedFromRecognitions =
            getPriorRecognitions().removeLast();

        // sanity check: both collections must agree on the last extraction
        if ((removedFromRecognitions != null)
                && !removedFromContainer.equals(removedFromRecognitions)) {
            throw new IllegalStateException("Extractions discarded from "
                + "container " + removedFromContainer + " and from prior "
                + "recognitions " + removedFromRecognitions + " differ");
        }

        Util.LOG.debug("Discarded last extraction " + removedFromRecognitions);
    }
324
325 /***
326 * Destroys the internal classifers. This method must only be used if the
327 * extractor will never be used again.
328 *
329 * @throws ProcessingException if an error occurs while the classifiers are
330 * being destroyed
331 */
332 public void destroy() throws ProcessingException {
333 final Classifier[] classifiers = getClassifiers();
334 for (int i = 0; i < classifiers.length; i++) {
335 classifiers[i].destroy();
336 }
337 }
338
339 /***
340 * Extracts items of interest from the contents of an XML document, based on
341 * context representation and local classifier.
342 *
343 * @param doc a document whose contents should be classified
344 * @param filename the name of the document
345 * @return a container of all extractions from the document, in document
346 * order
347 * @throws IOException if an I/O error occurs
348 * @throws ProcessingException if an error occurs during processing
349 */
350 public ExtractionContainer extract(final Document doc,
351 final File filename) throws IOException, ProcessingException {
352
353 initFields(filename);
354 final Document document = filterDocument(doc, filename);
355 lastDocument = document;
356 predictedExtractions = new ExtractionContainer(getTargetStructure());
357
358
359 getWalker().walk(document, null);
360
361
362 resetStrategy();
363
364 ExtractionContainer finalExtractions;
365
366 if (getReestimator() != null) {
367
368 finalExtractions = new ExtractionContainer(getTargetStructure());
369 Iterator extIter = predictedExtractions.iterator();
370 Extraction reestimatedExt;
371
372 while (extIter.hasNext()) {
373 reestimatedExt =
374 getReestimator().reestimate((Extraction) extIter.next());
375
376 if (reestimatedExt != null) {
377 finalExtractions.add(reestimatedExt);
378 }
379 }
380
381 } else {
382
383 finalExtractions = predictedExtractions;
384 }
385
386
387 if (getReextractor() != null) {
388 final ContextMap reexContext =
389 getStrategy().contextForReextractor();
390 finalExtractions = getReextractor().reextract(finalExtractions,
391 getContextDetails(), reexContext);
392 }
393
394 return finalExtractions;
395 }
396
397 /***
398 * Evaluates precision and recall for {@linkplain #isSentenceFiltering()
399 * sentence filtering} on the last processed document.
400 *
401 * @param correctExtractions a container of all correct extractions for the
402 * document
403 * @return the calculated statistics for sentence filtering on the
404 * last document; <code>null</code> if {@linkplain
405 * #isSentenceFiltering() sentence filtering} is disabled
406 */
407 public FMetricsView evaluateSentenceFiltering(
408 final ExtractionContainer correctExtractions) {
409 return evaluateSentenceFiltering(new EmbeddingElements(lastDocument,
410 correctExtractions, getFactory()));
411 }
412
    /***
     * Returns the extraction container used for storing the predicted
     * extractions of the current document.
     *
     * @return the extraction container
     */
    protected ExtractionContainer getPredictedExtractions() {
        return predictedExtractions;
    }
421
422 /***
423 * Extracts items of interest from the contents of an XML document and
424 * serializes the extractions.
425 *
426 * @param document the document to read
427 * @param writer the writer to write the extracted items to; flushed
428 * but not closed by this method
429 * @param context a map of objects that are made available for processing
430 * @throws IOException if an I/O error occurs
431 * @throws ProcessingException if an error occurs during processing
432 */
433 public void process(final Document document, final Writer writer,
434 final ContextMap context) throws IOException, ProcessingException {
435
436 final File filename = new File((File) context.get(KEY_DIRECTORY),
437 (String) context.get(KEY_LOCAL_NAME));
438
439
440 final ExtractionContainer extractions = extract(document, filename);
441 serializeExtractions(extractions, writer);
442 }
443
444 /***
445 * {@inheritDoc}
446 */
447 public void processToken(final Element element, final String left,
448 final TokenDetails details, final String right,
449 final ContextMap context) throws ProcessingException {
450 final CombinationState translatedState;
451 final boolean relevant = isRelevant(details.getToken());
452
453 if (relevant) {
454
455 updateState(element, left, details.getToken(), right);
456
457 final Classifier[] classifiers = getClassifiers();
458
459 final PredictionDistribution[] origDists =
460 new PredictionDistribution[classifiers.length];
461 final PredictionDistribution[] finalDists =
462 new PredictionDistribution[classifiers.length];
463 final Prediction[] predictions =
464 new Prediction[classifiers.length];
465 final String[] predictedTypes = new String[classifiers.length];
466 Probability currentProb;
467 final double[] probs = new double[classifiers.length];
468 final double[] pRs = new double[classifiers.length];
469
470 for (int i = 0; i < origDists.length; i++) {
471 origDists[i] = classifiers[i].classify(getFeatures(),
472 getActiveClasses()[i]);
473 finalDists[i] = reranker.rerank(origDists[i]);
474 predictions[i] = finalDists[i].best();
475 predictedTypes[i] = predictions[i].getType();
476 currentProb = predictions[i].getProbability();
477 probs[i] = currentProb.getProb();
478 pRs[i] = currentProb.getPR();
479 }
480
481 translatedState =
482 getStrategy().translateResult(finalDists, details);
483
484
485
486
487 if (translatedState.isDiscardPreceding()) {
488 discardLastExtraction();
489 }
490
491
492 if (translatedState.getType() == null) {
493
494 final Extraction lastExtraction =
495 getPredictedExtractions().last();
496
497 if ((lastExtraction != null) && (!lastExtraction.isSealed())) {
498 lastExtraction.setSealed(true);
499 }
500
501
502 clearPunctuation();
503 } else if (translatedState.isBegin()) {
504
505 final Extraction newExtraction =
506 new Extraction(translatedState.getType(), details,
507 translatedState.getProbability());
508 getPredictedExtractions().add(newExtraction);
509 getPriorRecognitions().add(newExtraction);
510
511
512 clearPunctuation();
513 } else {
514
515 final Extraction currentExtraction =
516 getPredictedExtractions().last();
517
518
519 appendPunctuation(currentExtraction);
520
521
522 if (!currentExtraction.getType().equals(
523 translatedState.getType())) {
524 throw new IllegalStateException("Type mismatch: "
525 + translatedState + " cannot continue extraction "
526 + currentExtraction);
527 }
528 currentExtraction.addToken(details,
529 translatedState.getProbability(), true);
530 }
531
532
533 getStrategy().updateState(translatedState, finalDists, details);
534 } else {
535
536 translatedState = null;
537 final Extraction lastExtraction = getPredictedExtractions().last();
538
539
540 if ((lastExtraction != null) && (!lastExtraction.isSealed())) {
541 addPunctuationDetails(details);
542
543
544
545
546 }
547
548
549
550 }
551
552
553 addContextDetails(new ContextDetails(details, getFeatures(),
554 translatedState, relevant));
555 }
556
557 /***
558 * Reset strategy and discard last prediction extraction if requested.
559 */
560 protected void resetStrategy() {
561 final boolean discardLast = getStrategy().reset();
562 if (discardLast) {
563 discardLastExtraction();
564 }
565 }
566
567 /***
568 * Helper method that serializes the content of an extraction container
569 * to a writer.
570 *
571 * @param extractions the extraction container to serialize
572 * @param writer the writer to write; will be flushed but not closed by
573 * this method
574 * @throws IOException if an I/O error occurs
575 */
576 public void serializeExtractions(final ExtractionContainer extractions,
577 final Writer writer) throws IOException {
578
579 final FieldContainer storage =
580 FieldContainer.createFieldContainer(getConfig());
581 extractions.storeEntries(storage);
582 storage.store(writer);
583 }
584
585 }