package de.fu_berlin.ties.extract.amend;

import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.SortedMap;
import java.util.SortedSet;
import java.util.TreeMap;
import java.util.TreeSet;

import org.apache.commons.lang.builder.ToStringBuilder;

import de.fu_berlin.ties.ContextMap;
import de.fu_berlin.ties.ProcessingException;
import de.fu_berlin.ties.TiesConfiguration;
import de.fu_berlin.ties.classify.Classifier;
import de.fu_berlin.ties.classify.Prediction;
import de.fu_berlin.ties.classify.PredictionComparator;
import de.fu_berlin.ties.classify.PredictionDistribution;
import de.fu_berlin.ties.classify.Probability;
import de.fu_berlin.ties.classify.TrainableClassifier;
import de.fu_berlin.ties.classify.feature.FeatureVector;
import de.fu_berlin.ties.context.ContextDetails;
import de.fu_berlin.ties.extract.Extraction;
import de.fu_berlin.ties.extract.ExtractionContainer;
import de.fu_berlin.ties.extract.TargetStructure;
import de.fu_berlin.ties.extract.reestimate.LengthFilter;
import de.fu_berlin.ties.filter.TrainableFilter;
import de.fu_berlin.ties.util.Util;

/***
 * A second level for the {@link de.fu_berlin.ties.combi.BeginEndStrategy},
 * similar to the ELIE/2 system by Finn and Kushmerick.
 *
 * @author Christian Siefkes
 * @version $Revision: 1.19 $, $Date: 2006/10/21 16:04:15 $, $Author: siefkes $
 */
public class BeginEndReextractor implements FinalReextractor {

    /***
     * An inner class used for communication with the first level
     * ({@link de.fu_berlin.ties.combi.BeginEndStrategy}). Instances of this
     * class map from integers representing the index of a token in a document
     * to all positive (more likely than "OTHER") predictions of a classifier.
     */
    public static class PositivePredictionsMap
            extends LinkedHashMap<Integer, LinkedHashMap<String, Prediction>> {

        /***
         * Creates a new instance.
         */
        public PositivePredictionsMap() {
            super();
        }

        /***
         * Creates a new instance.
         *
         * @param initialCapacity the initial capacity
         * @throws IllegalArgumentException if the initial capacity is negative
         */
        public PositivePredictionsMap(final int initialCapacity)
                throws IllegalArgumentException {
            super(initialCapacity);
        }

        /***
         * Creates a new instance.
         *
         * @param initialCapacity the initial capacity
         * @param loadFactor the load factor
         * @throws IllegalArgumentException if the initial capacity is negative
         * or the load factor is nonpositive
         */
        public PositivePredictionsMap(final int initialCapacity,
                final float loadFactor) throws IllegalArgumentException {
            super(initialCapacity, loadFactor);
        }

        /***
         * Creates a new instance.
         *
         * @param m the map whose mappings are to be placed in this map
         * @throws NullPointerException if the specified map is null
         */
        public PositivePredictionsMap(
                final Map<Integer, LinkedHashMap<String, Prediction>> m)
                throws NullPointerException {
            super(m);
        }

    }


    /***
     * Context key for the map from token indices to positive predictions of
     * the begin classifier.
     */
    public static final String CONTEXT_BEGIN_MAP = "begin";

    /***
     * Context key for the map from token indices to positive predictions of
     * the end classifier.
     */
    public static final String CONTEXT_END_MAP = "end";

    /***
     * Optional suffix that can be appended to the
     * {@link Classifier#CONFIG_CLASSIFIER} configuration key to modify
     * the type of classifiers used by this instance.
     */
    public static final String SUFFIX_AMEND = "amend";


    /***
     * A map from class names to binary classifiers used to match widowed
     * predictions of the level 1 begin classifiers.
     */
    private final Map<String, TrainableClassifier> beginWidowMatchers
        = new HashMap<String, TrainableClassifier>();

    /***
     * A map from class names to binary classifiers used to match widowed
     * predictions of the level 1 end classifiers.
     */
    private final Map<String, TrainableClassifier> endWidowMatchers
        = new HashMap<String, TrainableClassifier>();

    /***
     * Used to determine the maximum tolerated length of extractions.
     */
    private final LengthFilter lengthFilter;

    /***
     * Creates a new instance.
     *
     * @param classNames a set of valid class names
     * @param config used to configure this instance
     * @param myLengthFilter used to determine the maximum tolerated length of
     * extractions
     * @throws ProcessingException if an error occurs during initialization
     * of the used classifiers
     */
    public BeginEndReextractor(final Set<String> classNames,
            final TiesConfiguration config, final LengthFilter myLengthFilter)
            throws ProcessingException {
        super();
        lengthFilter = myLengthFilter;

        final String[] classifierSpec = config.getStringArray(
            config.adaptKey(Classifier.CONFIG_CLASSIFIER, SUFFIX_AMEND));
        String currentClass;
        TrainableClassifier beginWidowClassifier, endWidowClassifier;

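        // create a separate pair of begin and end widow matchers
        // for each target class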
        for (Iterator<String> iter = classNames.iterator(); iter.hasNext();) {
            currentClass = iter.next();

            beginWidowClassifier = TrainableClassifier.createClassifier(
                TrainableFilter.BOOLEAN_CLASSES, null, null, classifierSpec,
                config);
            beginWidowMatchers.put(currentClass, beginWidowClassifier);
            endWidowClassifier = TrainableClassifier.createClassifier(
                TrainableFilter.BOOLEAN_CLASSES, null, null, classifierSpec,
                config);
            endWidowMatchers.put(currentClass, endWidowClassifier);
        }
    }

    /***
     * Completes the set of contexts considered relevant for a widow matcher.
     *
     * @param relevantContexts the set of relevant contexts to complete
     * @param startIndex the index of the first/last token to consider
     * @param toleratedLength the number of tokens to consider relevant
     * @param completeForwards whether to complete forwards or backwards
     * @param size the overall number of tokens in this document
     */
    private void addRelevantContexts(final SortedSet<Integer> relevantContexts,
            final int startIndex, final int toleratedLength,
            final boolean completeForwards, final int size) {
        int positionToConsider;
        boolean inRange = true;

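        // consider up to toleratedLength positions, moving forwards or
        // backwards from the start index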
        for (int i = 0; (i < toleratedLength) && inRange; i++) {
            if (completeForwards) {
                positionToConsider = startIndex + i;
            } else {
                positionToConsider = startIndex - i;
            }

            inRange = (positionToConsider >= 0) && (positionToConsider < size);

            if (inRange) {
                relevantContexts.add(positionToConsider);
            }
        }
    }

    /***
     * Discards all begin/end candidates starting/ending an extraction from the
     * given maps.
     *
     * @param originalExtractions the original extractions predicted for this
     * document
     * @param beginMap a map of widowed positive predictions of the level 1
     * begin classifiers
     * @param endMap a map of widowed positive predictions of the level 1
     * end classifiers
     */
    private void discardNonWidows(final ExtractionContainer originalExtractions,
            final PositivePredictionsMap beginMap,
            final PositivePredictionsMap endMap) {
        final Iterator<Extraction> iter = originalExtractions.iterator();
        Extraction ext;
        String type;
        int beginIndex, endIndex;

        while (iter.hasNext()) {
            ext = iter.next();
            type = ext.getType();
            beginIndex = ext.getIndex();
            endIndex = ext.getLastIndex();

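            // predictions that begin or end an extraction are not widowed,
            // so they need no completion and are dropped from the maps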
            if (beginMap.containsKey(beginIndex)) {
                beginMap.get(beginIndex).remove(type);
            }

            if (endMap.containsKey(endIndex)) {
                endMap.get(endIndex).remove(type);
            }
        }
    }

    /***
     * Completes widowed predictions by finding for each widow the best match
     * (if any) within the tolerated length window. The returned extraction
     * container might contain overlapping extractions which must be discarded.
     *
     * @param widowMap a map of widowed positive predictions of the level 1
     * classifiers
     * @param widowMatchers a map from class names to binary classifiers to be
     * used to match the widowed predictions
     * @param contextDetails a list of context details representing all
     * tokens in the document
     * @param targetStruct the used target structure
     * @param completeForwards whether to complete forwards (if these are begin
     * widows) or backwards (if these are end widows)
     * @return the new extractions created by finding the best match (if any)
     * for each of the widows
     * @throws ProcessingException might be thrown by the widow matchers during
     * classification
     */
    private ExtractionContainer matchWidows(
            final PositivePredictionsMap widowMap,
            final Map<String, TrainableClassifier> widowMatchers,
            final List<ContextDetails> contextDetails,
            final TargetStructure targetStruct, final boolean completeForwards)
            throws ProcessingException {
        final ExtractionContainer result =
            new ExtractionContainer(targetStruct);
        final Iterator<Integer> outerIter = widowMap.keySet().iterator();
        Integer index;
        Map<String, Prediction> positivePredictions;
        Iterator<String> innerIter;
        String extractionType;
        TrainableClassifier widowMatcher;
        int toleratedLength;
        Prediction widowedPred, newPred, bestCompletionPred;
        int bestCompletionPos;
        final PredictionComparator predComp = new PredictionComparator();
        boolean inRange;
        int i;
        int positionToConsider;
        ContextDetails contextToConsider;
        FeatureVector features;
        Probability prob;
        Extraction newExtraction;
        boolean extractionCompleted;
        final String widowType = (completeForwards ? "begin" : "end");

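        // process the widowed predictions token by token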
        while (outerIter.hasNext()) {
            index = outerIter.next();
            positivePredictions = widowMap.get(index);
            innerIter = positivePredictions.keySet().iterator();

            while (innerIter.hasNext()) {
                extractionType = innerIter.next();
                widowedPred = positivePredictions.get(extractionType);
                toleratedLength = lengthFilter.toleratedLength(extractionType);
                widowMatcher = widowMatchers.get(extractionType);
                bestCompletionPred = null;
                bestCompletionPos = -1;
                inRange = true;

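                // search the tolerated length window for the completion
                // candidate rated best by the widow matcher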
                for (i = 0; (i < toleratedLength) && inRange; i++) {
                    if (completeForwards) {
                        positionToConsider = index + i;
                    } else {
                        positionToConsider = index - i;
                    }

                    inRange = (positionToConsider >= 0)
                        && (positionToConsider < contextDetails.size());

                    if (inRange) {
                        contextToConsider =
                            contextDetails.get(positionToConsider);
                        features = contextToConsider.getContext();

                        if (contextToConsider.isRelevant()) {
                            newPred = widowMatcher.classify(features,
                                widowMatcher.getAllClasses()).best();

                            // only "true" predictions can complete the widow
                            if (newPred.getType().equals(
                                    Boolean.TRUE.toString())) {
                                // keep the completion rated best by the
                                // prediction comparator
                                if ((bestCompletionPred == null)
                                        || (predComp.compare(bestCompletionPred,
                                            newPred) < 0)) {
                                    bestCompletionPred = newPred;
                                    bestCompletionPos = positionToConsider;
                                }
                            }
                        } else {
                            Util.LOG.debug("Ignoring irrelevant token during "
                                + "widow matching: " + contextToConsider);
                        }
                    } else {
                        Util.LOG.debug("Cannot consider token at position "
                            + positionToConsider + " since it does not exist "
                            + "in the current document (length: "
                            + contextDetails.size() + " tokens)");
                    }
                }

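                // if a completion was found, build a new extraction spanning
                // the tokens from the widow to the best completion position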
                if (bestCompletionPred != null) {
                    // combine the probabilities of the widowed prediction
                    // and its best completion
                    final Prediction pred = new Prediction(null,
                        widowedPred.getProbability());
                    pred.addProb(bestCompletionPred.getProbability(), true);
                    prob = pred.getProbability();

                    newExtraction = new Extraction(extractionType,
                        contextDetails.get(index), prob);
                    extractionCompleted = (index == bestCompletionPos);

                    // add tokens until the completion position is reached
                    for (i = 1; !extractionCompleted; i++) {
                        if (completeForwards) {
                            positionToConsider = index + i;
                        } else {
                            positionToConsider = index - i;
                        }
                        newExtraction.addToken(contextDetails.get(
                            positionToConsider), null, completeForwards);

                        extractionCompleted =
                            (positionToConsider == bestCompletionPos);
                    }

                    result.add(newExtraction);
                    Util.LOG.debug("Created new extraction by matching "
                        + widowedPred.getType() + " " + widowType
                        + " prediction " + widowedPred + " with "
                        + bestCompletionPred + ": " + newExtraction);
                } else {
                    Util.LOG.debug("No completion prediction found for "
                        + widowedPred.getType() + " " + widowType
                        + " prediction " + widowedPred);
                }
            }
        }
        return result;
    }

    /***
     * {@inheritDoc}
     */
    public ExtractionContainer reextract(
            final ExtractionContainer originalExtractions,
            final List<ContextDetails> contextDetails, final ContextMap context)
            throws ProcessingException {

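        // fetch the positive predictions of the level 1 begin and end
        // classifiers from the context map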
        final PositivePredictionsMap beginMap =
            (PositivePredictionsMap) context.get(CONTEXT_BEGIN_MAP);
        final PositivePredictionsMap endMap =
            (PositivePredictionsMap) context.get(CONTEXT_END_MAP);

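        // keep only widowed predictions in the maps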
        discardNonWidows(originalExtractions, beginMap, endMap);

        final TargetStructure targetStruct =
            originalExtractions.getTargetStructure();
        final ExtractionContainer completedBeginWidows = matchWidows(beginMap,
            beginWidowMatchers, contextDetails, targetStruct, true);
        final ExtractionContainer completedEndWidows = matchWidows(endMap,
            endWidowMatchers, contextDetails, targetStruct, false);

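        // merge the completed widows into the original extractions and
        // discard overlapping extractions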
        originalExtractions.addAll(completedBeginWidows.iterator());
        originalExtractions.addAll(completedEndWidows.iterator());
        originalExtractions.filterOverlapping();
        return originalExtractions;
    }

    /***
     * Returns a string representation of this object.
     *
     * @return a textual representation
     */
    public String toString() {
        return new ToStringBuilder(this)
            .append("begin widow matchers", beginWidowMatchers)
            .append("end widow matchers", endWidowMatchers)
            .toString();
    }

    /***
     * {@inheritDoc}
     */
    public void train(final ExtractionContainer answerKeys,
            final List<ContextDetails> contextDetails, final ContextMap context)
            throws ProcessingException {

        final PositivePredictionsMap beginMap =
            (PositivePredictionsMap) context.get(CONTEXT_BEGIN_MAP);
        final PositivePredictionsMap endMap =
            (PositivePredictionsMap) context.get(CONTEXT_END_MAP);

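        // keep only widowed predictions, then train the begin and end
        // widow matchers from the answer keys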
        discardNonWidows(answerKeys, beginMap, endMap);

        final TargetStructure targetStruct = answerKeys.getTargetStructure();
        trainWidowMatchers(answerKeys, beginMap, beginWidowMatchers,
            contextDetails, targetStruct, true);
        trainWidowMatchers(answerKeys, endMap, endWidowMatchers,
            contextDetails, targetStruct, false);
    }

    /***
     * Trains the widow matchers, determining for each relevant context within
     * the tolerated length window whether this context should be trained as a
     * positive or negative instance, and training the widow matcher
     * accordingly.
     *
     * @param answerKeys the true extractions to train for this document
     * @param widowMap a map of widowed positive predictions of the level 1
     * classifiers
     * @param widowMatchers a map from class names to binary classifiers to be
     * used to match the widowed predictions
     * @param contextDetails a list of context details representing all
     * tokens in the document
     * @param targetStruct the used target structure
     * @param completeForwards whether to complete forwards (if these are begin
     * widows) or backwards (if these are end widows)
     * @throws ProcessingException might be thrown by the widow matchers during
     * classification
     */
    private void trainWidowMatchers(final ExtractionContainer answerKeys,
            final PositivePredictionsMap widowMap,
            final Map<String, TrainableClassifier> widowMatchers,
            final List<ContextDetails> contextDetails,
            final TargetStructure targetStruct, final boolean completeForwards)
            throws ProcessingException {
        Iterator<String> classIter = widowMatchers.keySet().iterator();
        String extractionType;
        int toleratedLength;
        final SortedMap<String, SortedSet<Integer>> relevantContextsForClass
            = new TreeMap<String, SortedSet<Integer>>();
        SortedSet<Integer> relevantContexts;
        final Map<String, Set<Integer>> positiveContextsForClass
            = new HashMap<String, Set<Integer>>();
        Set<Integer> positiveContexts;
        Iterator<Extraction> answerIter;
        Extraction answerKey;
        int startIndex;
        final String widowType = (completeForwards ? "begin" : "end");

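        // collect the relevant contexts around each answer key and record
        // which positions are positive instances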
        while (classIter.hasNext()) {
            extractionType = classIter.next();
            toleratedLength = lengthFilter.toleratedLength(extractionType);
            relevantContexts = new TreeSet<Integer>();
            relevantContextsForClass.put(extractionType, relevantContexts);
            positiveContexts = new HashSet<Integer>();
            positiveContextsForClass.put(extractionType, positiveContexts);
            answerIter = answerKeys.iterator(extractionType);

            // mark the window around the begin (resp. end) of each answer
            // key as relevant
            while (answerIter.hasNext()) {
                answerKey = answerIter.next();
                startIndex = completeForwards
                    ? answerKey.getIndex() : answerKey.getLastIndex();
                addRelevantContexts(relevantContexts, startIndex,
                    toleratedLength, completeForwards, contextDetails.size());

                // the position where the answer key actually ends (resp.
                // begins) is the positive instance
                if (completeForwards) {
                    positiveContexts.add(answerKey.getLastIndex());
                } else {
                    positiveContexts.add(answerKey.getIndex());
                }
            }
        }

        final Iterator<Integer> outerIter = widowMap.keySet().iterator();
        Map<String, Prediction> positivePredictions;
        Iterator<String> innerIter;
        Integer index;

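        // additionally mark the windows around the widowed predictions
        // as relevant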
        while (outerIter.hasNext()) {
            index = outerIter.next();
            positivePredictions = widowMap.get(index);
            innerIter = positivePredictions.keySet().iterator();

            while (innerIter.hasNext()) {
                extractionType = innerIter.next();
                relevantContexts = relevantContextsForClass.get(extractionType);
                toleratedLength = lengthFilter.toleratedLength(extractionType);
                addRelevantContexts(relevantContexts, index, toleratedLength,
                    completeForwards, contextDetails.size());
            }
        }

        TrainableClassifier widowMatcher;
        Integer positionToConsider;
        ContextDetails contextToConsider;
        FeatureVector features;
        classIter = relevantContextsForClass.keySet().iterator();
        Iterator<Integer> positionIterator;
        String classToTrain;
        PredictionDistribution predDist;
        String logMsgStart;
        String classificationStatus;

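        // train the widow matcher of each class on all collected relevant
        // contexts, using on-error training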
        while (classIter.hasNext()) {
            extractionType = classIter.next();
            relevantContexts = relevantContextsForClass.get(extractionType);
            positiveContexts = positiveContextsForClass.get(extractionType);
            widowMatcher = widowMatchers.get(extractionType);
            positionIterator = relevantContexts.iterator();
            logMsgStart = "Trained " + extractionType + " " + widowType
                + " widow matcher to use ";

            while (positionIterator.hasNext()) {
                positionToConsider = positionIterator.next();
                contextToConsider = contextDetails.get(positionToConsider);
                features = contextToConsider.getContext();

                if (contextToConsider.isRelevant()) {
                    // contexts that complete an answer key are positive
                    // instances, all others are negative ones
                    if (positiveContexts.contains(positionToConsider)) {
                        classToTrain = Boolean.TRUE.toString();
                    } else {
                        classToTrain = Boolean.FALSE.toString();
                    }

                    // train on error: returns null if the instance was
                    // already classified correctly
                    predDist = widowMatcher.trainOnError(features, classToTrain,
                        widowMatcher.getAllClasses());

                    if (predDist == null) {
                        classificationStatus = " classified correctly";
                    } else {
                        classificationStatus = " misclassified";
                    }
                    Util.LOG.debug(logMsgStart + classToTrain + " class for '"
                        + contextToConsider.getToken() + "': "
                        + classificationStatus);
                } else {
                    Util.LOG.debug("Ignoring irrelevant token while training "
                        + "widow matchers: "
                        + contextToConsider.getToken());
                }
            }
        }
    }

}