1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22 package de.fu_berlin.ties.extract;
23
24 import java.io.File;
25 import java.io.IOException;
26 import java.io.Writer;
27 import java.util.ArrayList;
28 import java.util.Iterator;
29 import java.util.List;
30 import java.util.Set;
31
32 import org.apache.commons.lang.ArrayUtils;
33 import org.apache.commons.lang.builder.ToStringBuilder;
34 import org.dom4j.Document;
35 import org.dom4j.Element;
36
37 import de.fu_berlin.ties.combi.CombinationState;
38 import de.fu_berlin.ties.combi.CombinationStrategy;
39 import de.fu_berlin.ties.ContextMap;
40 import de.fu_berlin.ties.ProcessingException;
41 import de.fu_berlin.ties.TiesConfiguration;
42
43 import de.fu_berlin.ties.classify.Classifier;
44 import de.fu_berlin.ties.classify.Prediction;
45 import de.fu_berlin.ties.classify.PredictionDistribution;
46 import de.fu_berlin.ties.classify.Probability;
47 import de.fu_berlin.ties.classify.Reranker;
48 import de.fu_berlin.ties.context.Recognition;
49 import de.fu_berlin.ties.context.Representation;
50 import de.fu_berlin.ties.eval.FMetricsView;
51 import de.fu_berlin.ties.filter.EmbeddingElements;
52 import de.fu_berlin.ties.filter.FilteringTokenWalker;
53 import de.fu_berlin.ties.filter.TrainableFilter;
54 import de.fu_berlin.ties.io.FieldContainer;
55 import de.fu_berlin.ties.text.TokenDetails;
56 import de.fu_berlin.ties.text.TokenizerFactory;
57 import de.fu_berlin.ties.util.MathUtils;
58 import de.fu_berlin.ties.util.Util;
59
60 /***
61 * An extractor runs a local {@link de.fu_berlin.ties.classify.Classifier}
62 * on a list of items/nodes and combines their results using a
63 * {@link de.fu_berlin.ties.combi.CombinationStrategy}.
64 *
65 * <p>Instances of this class are not thread-safe and cannot extract from
66 * several documents in parallel.
67 *
68 * @author Christian Siefkes
69 * @version $Revision: 1.41 $, $Date: 2004/12/06 09:21:06 $, $Author: siefkes $
70 */
71 public class Extractor extends ExtractorBase {
72
73 /***
74 * The recommended file extension to use for storing extractions.
75 */
76 public static final String EXT_EXTRACTIONS = "ext";
77
78
79 /***
80 * Helper method that creates a reranker. The reranker is configured from
81 * the "extract" prefix. E.g. an entry such as "extract.bias.A = 0.99"
82 * could be used to introduce a small malus for the background class used
83 * in most combination strategies, thus favoring recall over precision.
84 *
85 * @param config the configuration to use
86 * @return the initialized reranker
87 */
88 private static Reranker createReranker(final TiesConfiguration config) {
89 return new Reranker(config.subset("extract"));
90 }
91
92
93 /***
94 * The last document processed by this instance.
95 */
96 private Document lastDocument;
97
98 /***
99 * The extraction container used for storing the predicted extractions.
100 */
101 private ExtractionContainer predictedExtractions;
102
103 /***
104 * A list of punctuation tokens collected between non-puncutation tokens.
105 */
106 private final List<TokenDetails> punctuationDetails =
107 new ArrayList<TokenDetails>();
108
109 /***
110 * An optional reranker that recalculates probabilities to introduce a bias.
111 * This can be used to favor recall over precision (by setting a bias <
112 * 1 for the background class) etc.
113 */
114 private final Reranker reranker;
115
116
117 /***
118 * Creates a new instance using a default extension. Delegates to
119 * {@link #Extractor(String, TiesConfiguration)} using the
120 * {@linkplain TiesConfiguration#CONF standard configuration}.
121 *
122 * @throws IllegalArgumentException if the combination strategy cannot be
123 * initialized (cf. {@link CombinationStrategy#createStrategy(java.util.Set,
124 * TiesConfiguration)})
125 * @throws ProcessingException if an error occurs during initialization
126 */
127 public Extractor() throws IllegalArgumentException, ProcessingException {
128 this(EXT_EXTRACTIONS);
129 }
130
131 /***
132 * Creates a new instance. Delegates to
133 * {@link #Extractor(String, TiesConfiguration)} using the
134 * {@linkplain TiesConfiguration#CONF standard configuration}.
135 *
136 * @param outExt the extension to use for output files
137 * @throws IllegalArgumentException if the combination strategy cannot be
138 * initialized (cf. {@link CombinationStrategy#createStrategy(java.util.Set,
139 * TiesConfiguration)})
140 * @throws ProcessingException if an error occurs during initialization
141 */
142 public Extractor(final String outExt)
143 throws IllegalArgumentException, ProcessingException {
144 this(outExt, TiesConfiguration.CONF);
145 }
146
147 /***
148 * Creates a new instance. Delegates to the corresponding
149 * {@linkplain ExtractorBase#ExtractorBase(String, TiesConfiguration) super
150 * constructor} to configure the fields.
151 *
152 * @param outExt the extension to use for output files
153 * @param config the configuration to use
154 * @throws IllegalArgumentException if the combination strategy cannot be
155 * initialized
156 * (cf. {@link CombinationStrategy#createStrategy(java.util.Set,
157 * TiesConfiguration)})
158 * @throws ProcessingException if an error occurs during initialization
159 */
160 public Extractor(final String outExt, final TiesConfiguration config)
161 throws IllegalArgumentException, ProcessingException {
162 super(outExt, config);
163 reranker = createReranker(config);
164 }
165
166 /***
167 * Creates a new instance. Delegates to the corresponding
168 * {@linkplain ExtractorBase#ExtractorBase(String, File, TiesConfiguration)
169 * super constructor} to configure the fields.
170 *
171 * @param outExt the extension to use for output files
172 * @param runDirectory the directory to run the classifier in; used instead
173 * of the
174 * {@linkplain de.fu_berlin.ties.classify.ExternalClassifier#CONFIG_DIR
175 * configured directory} if not <code>null</code>
176 * @param config the configuration to use
177 * @throws IllegalArgumentException if the combination strategy cannot be
178 * initialized
179 * (cf. {@link CombinationStrategy#createStrategy(java.util.Set,
180 * TiesConfiguration)})
181 * @throws ProcessingException if an error occurs during initialization
182 */
183 public Extractor(final String outExt, final File runDirectory,
184 final TiesConfiguration config)
185 throws IllegalArgumentException, ProcessingException {
186 super(outExt, runDirectory, config);
187 reranker = createReranker(config);
188 }
189
190 /***
191 * Creates a new instance, re-using the components from the provided
192 * trainer.
193 *
194 * @param outExt the extension to use for output files
195 * @param trainer trainer whose components should be re-used
196 */
197 public Extractor(final String outExt, final Trainer trainer) {
198 this(outExt, trainer.getTargetStructure(), trainer.getClassifiers(),
199 trainer.getRepresentation(), trainer.getStrategy(),
200 trainer.getFactory(), trainer.getSentenceFilter(),
201 createReranker(trainer.getConfig()),
202 trainer.viewRelevantPunctuation(), trainer.getConfig());
203 }
204
205 /***
206 * Creates a new instance.
207 *
208 * @param outExt the extension to use for output files
209 * @param targetStruct the target structure specifying the classes to
210 * recognize
211 * @param theClassifiers the classifiers to use for the local classification
212 * decisions
213 * @param theRepresentation the context representation to use for local
214 * classifications
215 * @param combiStrat the combination strategy to use
216 * @param tFactory used to instantiate tokenizers
217 * @param sentFilter the filter used in the first step of a double
218 * classification approach ("sentence filtering"); if <code>null</code>,
219 * no sentence filtering is used
220 * @param rerank a reranker that recalculates probabilities to
221 * introduce a bias (can be used to favor recall over precision, by setting
222 * a bias < 1 for the background class, etc.); must not be
223 * <code>null</code>
224 * @param relevantPunct a set of punctuation tokens that have been found to
225 * be relevant for token classification; might be empty but not
226 * <code>null</code>
227 * @param config used to configure superclasses; if <code>null</code>,
228 * the {@linkplain TiesConfiguration#CONF standard configuration} is used
229 */
230 public Extractor(final String outExt, final TargetStructure targetStruct,
231 final Classifier[] theClassifiers,
232 final Representation theRepresentation,
233 final CombinationStrategy combiStrat,
234 final TokenizerFactory tFactory, final TrainableFilter sentFilter,
235 final Reranker rerank, final Set<String> relevantPunct,
236 final TiesConfiguration config) {
237 super(outExt, targetStruct, theClassifiers, theRepresentation,
238 combiStrat, tFactory, sentFilter, relevantPunct, config);
239 reranker = rerank;
240 }
241
242
243 /***
244 * Adds an element to the collected punctuation details.
245 *
246 * @param details the element to add
247 */
248 protected void addPunctuationDetails(final TokenDetails details) {
249 punctuationDetails.add(details);
250 }
251
252 /***
253 * Appends the collected punctuation details (if any) to the provided
254 * extraction. Finally delegates to {@link #clearPunctuation()}
255 * to dleetes the processed punctuation.
256 *
257 * @param ext the extraction to append to
258 */
259 protected void appendPunctuation(final Extraction ext) {
260 if (!punctuationDetails.isEmpty()) {
261 final Iterator<TokenDetails> detailsIter =
262 punctuationDetails.iterator();
263 TokenDetails currentDetails;
264
265
266 while (detailsIter.hasNext()) {
267 currentDetails = detailsIter.next();
268 ext.addToken(currentDetails, null, true);
269 }
270
271
272 clearPunctuation();
273 }
274 }
275
276 /***
277 * Clears the collected punctuation details.
278 */
279 protected void clearPunctuation() {
280 punctuationDetails.clear();
281 }
282
283 /***
284 * {@inheritDoc}
285 */
286 protected FilteringTokenWalker createFilteringTokenWalker(
287 final TrainableFilter repFilter) {
288 return new FilteringTokenWalker(this, getFactory(), repFilter, this);
289 }
290
291 /***
292 * Helper method that discards the last extraction, removing it from
293 * {@link #getPredictedExtractions() prior recognitions} and
294 * {@link #getPredictedExtractions() predicted extractions}.
295 */
296 private void discardLastExtraction() {
297 final Extraction removedFromContainer =
298 getPredictedExtractions().removeLast();
299
300
301 final Recognition removedFromRecognitions =
302 getPriorRecognitions().removeLast();
303
304
305 if ((removedFromRecognitions != null)
306 && !removedFromContainer.equals(removedFromRecognitions)) {
307 throw new IllegalStateException("Extractions discarded from "
308 + "container " + removedFromContainer + " and from prior "
309 + "recognitions " + removedFromRecognitions + " differ");
310 }
311
312 Util.LOG.debug("Discarded last extraction " + removedFromRecognitions);
313 }
314
315 /***
316 * Extracts items of interest from the contents of an XML document, based on
317 * context representation and local classifier.
318 *
319 * @param document a document whose contents should be classified
320 * @return a container of all extractions from the document, in document
321 * order
322 * @throws IOException if an I/O error occurs
323 * @throws ProcessingException if an error occurs during processing
324 */
325 public ExtractionContainer extract(final Document document)
326 throws IOException, ProcessingException {
327
328 initFields();
329 lastDocument = document;
330 predictedExtractions = new ExtractionContainer(getTargetStructure());
331
332
333 getWalker().walk(document, null);
334
335
336 resetStrategy();
337
338
339 return getPredictedExtractions();
340 }
341
342 /***
343 * Evaluates precision and recall for {@linkplain #isSentenceFiltering()
344 * sentence filtering} on the last processed document.
345 *
346 * @param correctExtractions a container of all correct extractions for the
347 * document
348 * @return the calculated statistics for sentence filtering on the
349 * last document; <code>null</code> if {@linkplain
350 * #isSentenceFiltering() sentence filtering} is disabled
351 */
352 public FMetricsView evaluateSentenceFiltering(
353 final ExtractionContainer correctExtractions) {
354 return evaluateSentenceFiltering(new EmbeddingElements(lastDocument,
355 correctExtractions, getFactory()));
356 }
357
358 /***
359 * Returns the extraction container used for storing the predicted
360 * extractions.
361 * @return the extraction container
362 */
363 protected ExtractionContainer getPredictedExtractions() {
364 return predictedExtractions;
365 }
366
367 /***
368 * Extracts items of interest from the contents of an XML document and
369 * serializes the extractions.
370 *
371 * @param document the document to read
372 * @param writer the writer to write the extracted items to; flushed
373 * but not closed by this method
374 * @param context a map of objects that are made available for processing
375 * @throws IOException if an I/O error occurs
376 * @throws ProcessingException if an error occurs during processing
377 */
378 public void process(final Document document, final Writer writer,
379 final ContextMap context) throws IOException, ProcessingException {
380
381 final ExtractionContainer extractions = extract(document);
382
383
384 final FieldContainer storage = FieldContainer.createFieldContainer();
385 extractions.storeEntries(storage);
386 storage.store(writer);
387 }
388
389 /***
390 * {@inheritDoc}
391 */
392 public void processToken(final Element element, final String left,
393 final TokenDetails details, final String right,
394 final ContextMap context) throws ProcessingException {
395 if (isRelevant(details.getToken())) {
396
397 updateState(element, left, details.getToken(), right);
398
399 final Classifier[] classifiers = getClassifiers();
400
401 final PredictionDistribution[] origDists =
402 new PredictionDistribution[classifiers.length];
403 final PredictionDistribution[] finalDists =
404 new PredictionDistribution[classifiers.length];
405 final Prediction[] predictions =
406 new Prediction[classifiers.length];
407 final String[] predictedTypes = new String[classifiers.length];
408 Probability currentProb;
409 final double[] probs = new double[classifiers.length];
410 final double[] pRs = new double[classifiers.length];
411
412 for (int i = 0; i < origDists.length; i++) {
413 origDists[i] = classifiers[i].classify(getFeatures(),
414 getActiveClasses()[i]);
415 finalDists[i] = reranker.rerank(origDists[i]);
416 predictions[i] = finalDists[i].best();
417 predictedTypes[i] = predictions[i].getType();
418 currentProb = predictions[i].getProbability();
419 probs[i] = currentProb.getProb();
420 pRs[i] = currentProb.getPR();
421 }
422
423 final CombinationState translatedState =
424 getStrategy().translateResult(finalDists);
425 final double meanProb = MathUtils.mean(probs);
426 final double meanPR = MathUtils.mean(pRs);
427 Util.LOG.debug("Predicted types: '"
428 + ArrayUtils.toString(predictedTypes)
429 + "'; translated state: " + translatedState + ", mean prob.: "
430 + meanProb + ", mean pR: " + meanPR);
431
432 if (translatedState.isDiscardPreceding()) {
433 discardLastExtraction();
434 }
435
436
437 if (translatedState.getType() == null) {
438
439 final Extraction lastExtraction =
440 getPredictedExtractions().last();
441
442 if ((lastExtraction != null) && (!lastExtraction.isSealed())) {
443 lastExtraction.setSealed(true);
444 }
445
446
447 clearPunctuation();
448 } else if (translatedState.isBegin()) {
449
450 final Extraction newExtraction =
451 new Extraction(translatedState.getType(), details,
452 new Probability(meanProb, meanPR));
453 getPredictedExtractions().add(newExtraction);
454 getPriorRecognitions().add(newExtraction);
455
456
457 clearPunctuation();
458 } else {
459
460 final Extraction currentExtraction =
461 getPredictedExtractions().last();
462
463
464 appendPunctuation(currentExtraction);
465
466
467 if (!currentExtraction.getType().equals(
468 translatedState.getType())) {
469 throw new IllegalStateException("Type mismatch: "
470 + translatedState + " cannot continue extraction "
471 + currentExtraction);
472 }
473 currentExtraction.addToken(details,
474 new Probability(meanProb, meanPR), true);
475 }
476
477
478 getStrategy().updateState(translatedState);
479 } else {
480
481 final Extraction lastExtraction = getPredictedExtractions().last();
482
483
484 if ((lastExtraction != null) && (!lastExtraction.isSealed())) {
485 addPunctuationDetails(details);
486 Util.LOG.debug("Keeping irrelevant punctuation token "
487 + details.getToken()
488 + " -- might become part of the current "
489 + lastExtraction.getType() + " extraction");
490 } else {
491 Util.LOG.debug("Skipping over irrelevant punctuation token "
492 + details.getToken());
493 }
494 }
495 }
496
497 /***
498 * Reset strategy and discard last prediction extraction if requested.
499 */
500 protected void resetStrategy() {
501 final boolean discardLast = getStrategy().reset();
502 if (discardLast) {
503 discardLastExtraction();
504 }
505 }
506
507 /***
508 * Returns a string representation of this object.
509 * @return a textual representation
510 */
511 public String toString() {
512 return new ToStringBuilder(this)
513 .appendSuper(super.toString())
514 .append("reranker", reranker)
515 .append("predicted extractions", predictedExtractions)
516 .toString();
517 }
518
519 }