/*
 * Copyright (C) 2005-2006 Christian Siefkes <christian@siefkes.net>.
 * Development of this software is supported by the German Research Society,
 * Berlin-Brandenburg Graduate School in Distributed Information Systems
 * (DFG grant no. GRK 316).
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, visit
 * http://www.gnu.org/licenses/gpl.html or write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 */
package de.fu_berlin.ties.extract.amend;

import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.SortedMap;
import java.util.SortedSet;
import java.util.TreeMap;
import java.util.TreeSet;

import org.apache.commons.lang.builder.ToStringBuilder;

import de.fu_berlin.ties.ContextMap;
import de.fu_berlin.ties.ProcessingException;
import de.fu_berlin.ties.TiesConfiguration;
import de.fu_berlin.ties.classify.Classifier;
import de.fu_berlin.ties.classify.Prediction;
import de.fu_berlin.ties.classify.PredictionComparator;
import de.fu_berlin.ties.classify.PredictionDistribution;
import de.fu_berlin.ties.classify.Probability;
import de.fu_berlin.ties.classify.TrainableClassifier;
import de.fu_berlin.ties.classify.feature.FeatureVector;
import de.fu_berlin.ties.context.ContextDetails;
import de.fu_berlin.ties.extract.Extraction;
import de.fu_berlin.ties.extract.ExtractionContainer;
import de.fu_berlin.ties.extract.TargetStructure;
import de.fu_berlin.ties.extract.reestimate.LengthFilter;
import de.fu_berlin.ties.filter.TrainableFilter;
import de.fu_berlin.ties.util.Util;

/***
 * A second level for the {@link de.fu_berlin.ties.combi.BeginEndStrategy},
 * similar to the ELIE/2 system by Finn and Kushmerick. This level tries to
 * complete "widowed" positive begin/end predictions of the first level
 * (predictions that did not become part of an extraction) by finding the
 * best matching counterpart within the tolerated length window.
 *
 * @author Christian Siefkes
 * @version $Revision: 1.19 $, $Date: 2006/10/21 16:04:15 $, $Author: siefkes $
 */
public class BeginEndReextractor implements FinalReextractor {

    /***
     * An inner class used for communication with the first level
     * ({@link de.fu_berlin.ties.combi.BeginEndStrategy}). Instances of this
     * class map from integers representing the index of a token in a document
     * to all positive (more likely than "OTHER") predictions of a classifier.
     */
    public static class PositivePredictionsMap
    extends LinkedHashMap<Integer, LinkedHashMap<String, Prediction>> {

        /***
         * Creates a new instance.
         */
        public PositivePredictionsMap() {
            super();
        }

        /***
         * Creates a new instance.
         *
         * @param initialCapacity the initial capacity
         * @throws IllegalArgumentException if the initial capacity is negative
         */
        public PositivePredictionsMap(final int initialCapacity)
                throws IllegalArgumentException {
            super(initialCapacity);
        }

        /***
         * Creates a new instance.
         *
         * @param initialCapacity the initial capacity
         * @param loadFactor the load factor
         * @throws IllegalArgumentException if the initial capacity is negative
         * or the load factor is nonpositive
         */
        public PositivePredictionsMap(final int initialCapacity,
                final float loadFactor) throws IllegalArgumentException {
            super(initialCapacity, loadFactor);
        }

        /***
         * Creates a new instance.
         *
         * @param m the map whose mappings are to be placed in this map
         * @throws NullPointerException if the specified map is null
         */
        public PositivePredictionsMap(
                final Map<Integer, LinkedHashMap<String, Prediction>> m)
        throws NullPointerException {
            super(m);
        }

    }
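
    // Illustration (hypothetical values): an entry {42 -> {"speaker" -> p}}
    // in the begin map means that at token index 42 the level 1 begin
    // classifier produced a positive prediction p for the "speaker" class,
    // i.e. token 42 is likely to start a "speaker" extraction.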


    /***
     * Context key for the map from token indices to positive predictions of
     * the begin classifier.
     */
    public static final String CONTEXT_BEGIN_MAP = "begin";

    /***
     * Context key for the map from token indices to positive predictions of
     * the end classifier.
     */
    public static final String CONTEXT_END_MAP = "end";

    /***
     * Optional suffix that can be appended to the
     * {@link Classifier#CONFIG_CLASSIFIER} configuration key to modify
     * the type of classifiers used by this instance.
     */
    public static final String SUFFIX_AMEND = "amend";


    /***
     * A map from class names to binary classifiers used to match widowed
     * predictions of the level 1 begin classifiers.
     */
    private final Map<String, TrainableClassifier> beginWidowMatchers
            = new HashMap<String, TrainableClassifier>();

    /***
     * A map from class names to binary classifiers used to match widowed
     * predictions of the level 1 end classifiers.
     */
    private final Map<String, TrainableClassifier> endWidowMatchers
            = new HashMap<String, TrainableClassifier>();

    /***
     * Used to determine the maximum tolerated length of extractions.
     */
    private final LengthFilter lengthFilter;

    /***
     * Creates a new instance.
     *
     * @param classNames a set of valid class names (String)
     * @param config used to configure this instance
     * @param myLengthFilter used to determine the maximum tolerated length of
     * extractions
     * @throws ProcessingException if an error occurs during initialization
     * of the used classifiers
     */
    public BeginEndReextractor(final Set<String> classNames,
            final TiesConfiguration config, final LengthFilter myLengthFilter)
    throws ProcessingException {
        super();
        lengthFilter = myLengthFilter;

        final String[] classifierSpec = config.getStringArray(
                config.adaptKey(Classifier.CONFIG_CLASSIFIER, SUFFIX_AMEND));
        String currentClass;
        TrainableClassifier beginWidowClassifier, endWidowClassifier;

        // init begin and end widow matchers for all classes
        for (Iterator<String> iter = classNames.iterator(); iter.hasNext();) {
            currentClass = iter.next();
            // no need for feature transformers since we use the last
            // transformation
            beginWidowClassifier = TrainableClassifier.createClassifier(
                    TrainableFilter.BOOLEAN_CLASSES, null, null, classifierSpec,
                    config);
            beginWidowMatchers.put(currentClass, beginWidowClassifier);
            endWidowClassifier = TrainableClassifier.createClassifier(
                    TrainableFilter.BOOLEAN_CLASSES, null, null, classifierSpec,
                    config);
            endWidowMatchers.put(currentClass, endWidowClassifier);
        }
    }
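
    // Note: the widow matchers are binary classifiers over
    // TrainableFilter.BOOLEAN_CLASSES; a "true" prediction for a token is
    // taken to mean that the token completes an extraction of the
    // corresponding type (cf. matchWidows below).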

    /***
     * Completes the set of contexts considered relevant for a
     * widow matcher.
     *
     * @param relevantContexts the set of relevant contexts to complete
     * @param startIndex the index of the first/last token to consider
     * @param toleratedLength the number of tokens to consider relevant
     * @param completeForwards whether to complete forwards or backwards
     * @param size the overall number of tokens in this document
     */
    private void addRelevantContexts(final SortedSet<Integer> relevantContexts,
            final int startIndex, final int toleratedLength,
            final boolean completeForwards, final int size) {
        int positionToConsider;
        boolean inRange = true;

        // add tokens within the tolerated length window to relevantContexts
        // (starting with the current token)
        for (int i = 0; (i < toleratedLength) && inRange; i++) {
            if (completeForwards) {
                positionToConsider = startIndex + i;
            } else {
                positionToConsider = startIndex - i;
            }

            // check that the calculated position exists within the doc
            inRange = (positionToConsider >= 0) && (positionToConsider < size);

            if (inRange) {
                relevantContexts.add(positionToConsider);
            }
        }
    }
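
    // Example (hypothetical numbers): with startIndex = 10, toleratedLength = 3
    // and completeForwards = true, the indices 10, 11 and 12 are added (as far
    // as they exist in the document); with completeForwards = false, the
    // indices 10, 9 and 8 are added instead.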

    /***
     * Discards all begin/end candidates starting/ending an extraction from the
     * given maps.
     *
     * @param originalExtractions the original extractions predicted for this
     * document
     * @param beginMap a map of widowed positive predictions of the level 1
     * begin classifiers
     * @param endMap a map of widowed positive predictions of the level 1
     * end classifiers
     */
    private void discardNonWidows(final ExtractionContainer originalExtractions,
            final PositivePredictionsMap beginMap,
            final PositivePredictionsMap endMap) {
        final Iterator<Extraction> iter = originalExtractions.iterator();
        Extraction ext;
        String type;
        int beginIndex, endIndex;

        while (iter.hasNext()) {
            ext = iter.next();
            type = ext.getType();
            beginIndex = ext.getIndex();
            endIndex = ext.getLastIndex();

            // remove the positive begin prediction for the first token and the
            // positive end prediction for the last token of each extraction,
            // if they exist
            if (beginMap.containsKey(beginIndex)) {
                beginMap.get(beginIndex).remove(type);
            }

            if (endMap.containsKey(endIndex)) {
                endMap.get(endIndex).remove(type);
            }
        }
    }
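
    // After this method has run, the predictions remaining in the maps are the
    // "widows": positive begin/end predictions that did not become part of any
    // extraction because their counterpart was missing.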

    /***
     * Completes widowed predictions by finding for each widow the best match
     * (if any) within the tolerated length window. The returned extraction
     * container might contain overlapping extractions which must be discarded.
     *
     * @param widowMap a map of widowed positive predictions of the level 1
     * classifiers
     * @param widowMatchers a map from class names to binary classifiers to be
     * used to match the widowed predictions
     * @param contextDetails a list of context details representing all
     * tokens in the document
     * @param targetStruct the used target structure
     * @param completeForwards whether to complete forwards (if these are begin
     * widows) or backwards (if these are end widows)
     * @return the new extractions created by finding the best match (if any)
     * for each of the widows
     * @throws ProcessingException might be thrown by the widow matchers during
     * classification
     */
    private ExtractionContainer matchWidows(
            final PositivePredictionsMap widowMap,
            final Map<String, TrainableClassifier> widowMatchers,
            final List<ContextDetails> contextDetails,
            final TargetStructure targetStruct, final boolean completeForwards)
    throws ProcessingException {
        final ExtractionContainer result =
            new ExtractionContainer(targetStruct);
        final Iterator<Integer> outerIter = widowMap.keySet().iterator();
        Integer index;
        Map<String, Prediction> positivePredictions;
        Iterator<String> innerIter;
        String extractionType;
        TrainableClassifier widowMatcher;
        int toleratedLength;
        Prediction widowedPred, newPred, bestCompletionPred;
        int bestCompletionPos;
        final PredictionComparator predComp = new PredictionComparator();
        boolean inRange;
        int i;
        int positionToConsider;
        ContextDetails contextToConsider;
        FeatureVector features;
        Probability prob;
        Extraction newExtraction;
        boolean extractionCompleted;
        final String widowType = (completeForwards ? "begin" : "end");
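
        // Overall approach: for each widowed prediction, run the binary widow
        // matcher on every relevant token within the tolerated length window
        // (forwards from begin widows, backwards from end widows), keep the
        // most likely positive prediction, and build a new extraction spanning
        // from the widow to that completion token if one is found.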

        // iterate all index positions in the widow map
        while (outerIter.hasNext()) {
            index = outerIter.next();
            positivePredictions = widowMap.get(index);
            innerIter = positivePredictions.keySet().iterator();

            // iterate + complete positive predictions at this index position
            while (innerIter.hasNext()) {
                extractionType = innerIter.next();
                widowedPred = positivePredictions.get(extractionType);
                toleratedLength = lengthFilter.toleratedLength(extractionType);
                widowMatcher = widowMatchers.get(extractionType);
                bestCompletionPred = null;
                bestCompletionPos = -1;
                inRange = true;

                // run completion classifier on tokens within tolerated length
                // window (starting with the current token)
                for (i = 0; (i < toleratedLength) && inRange; i++) {
                    if (completeForwards) {
                        positionToConsider = index + i;
                    } else {
                        positionToConsider = index - i;
                    }

                    // check that the calculated position exists within the doc
                    inRange = (positionToConsider >= 0)
                            && (positionToConsider < contextDetails.size());

                    if (inRange) {
                        // complete remaining widows by finding best match
                        // (if any) within the tolerated length window
                        contextToConsider =
                            contextDetails.get(positionToConsider);
                        features = contextToConsider.getContext();

                        // is the token relevant?
                        if (contextToConsider.isRelevant()) {
                            newPred = widowMatcher.classify(features,
                                    widowMatcher.getAllClasses()).best();

                            // did the positive type win?
                            if (newPred.getType().equals(
                                    Boolean.TRUE.toString())) {
                                // more likely than previous (if any) best pred?
                                if ((bestCompletionPred == null)
                                        || (predComp.compare(bestCompletionPred,
                                                newPred) < 0)) {
/*                                  Util.LOG.debug("New completion prediction "
                                        + newPred + " for '" + contextDetails.
                                        get(positionToConsider).getToken()
                                        + "' token is more likely than "
                                        + bestCompletionPred); */
                                    bestCompletionPred = newPred;
                                    bestCompletionPos = positionToConsider;
                                }
                            }
                        } else {
                            Util.LOG.debug("Ignoring irrelevant token during "
                                    + "widow matching: " + contextToConsider);
                        }
                    } else {
                        Util.LOG.debug("Cannot consider token at position "
                            + positionToConsider + " since it does not exist "
                            + "in the current document (length: "
                            + contextDetails.size() + " tokens)");
                    } // inRange
                } // for i

                // did we find a completion candidate?
                if (bestCompletionPred != null) {
                    // use prediction to determine average probability
                    final Prediction pred = new Prediction(null,
                            widowedPred.getProbability());
                    pred.addProb(bestCompletionPred.getProbability(), true);
                    prob = pred.getProbability();

                    // create new extraction
                    newExtraction = new Extraction(extractionType,
                            contextDetails.get(index), prob);
                    extractionCompleted = (index == bestCompletionPos);

                    // add further tokens (if any)
                    for (i = 1; !extractionCompleted; i++) {
                        if (completeForwards) {
                            positionToConsider = index + i;
                        } else {
                            positionToConsider = index - i;
                        }
                        newExtraction.addToken(contextDetails.get(
                                positionToConsider), null, completeForwards);

                        // done now?
                        extractionCompleted =
                            (positionToConsider == bestCompletionPos);
                    }

                    // store extraction + log
                    result.add(newExtraction);
                    Util.LOG.debug("Created new extraction by matching "
                        + widowedPred.getType() + " " + widowType
                        + " prediction " + widowedPred + " with "
                        + bestCompletionPred + ": " + newExtraction);
                } else {
                    Util.LOG.debug("No completion prediction found for "
                        + widowedPred.getType() + " " + widowType
                        + " prediction " + widowedPred);
                }
            } // while innerIter
        } // while outerIter
        return result;
    }

    /***
     * {@inheritDoc}
     */
    public ExtractionContainer reextract(
            final ExtractionContainer originalExtractions,
            final List<ContextDetails> contextDetails, final ContextMap context)
    throws ProcessingException {
        // retrieve maps of positive predictions
        final PositivePredictionsMap beginMap =
            (PositivePredictionsMap) context.get(CONTEXT_BEGIN_MAP);
        final PositivePredictionsMap endMap =
            (PositivePredictionsMap) context.get(CONTEXT_END_MAP);

        // discard all candidates starting/ending an extraction from the maps
        discardNonWidows(originalExtractions, beginMap, endMap);

        // complete remaining widows
        final TargetStructure targetStruct =
            originalExtractions.getTargetStructure();
        final ExtractionContainer completedBeginWidows = matchWidows(beginMap,
                beginWidowMatchers, contextDetails, targetStruct, true);
        final ExtractionContainer completedEndWidows = matchWidows(endMap,
                endWidowMatchers, contextDetails, targetStruct, false);

        // combine extractions in a single container + filter overlapping ones
        originalExtractions.addAll(completedBeginWidows.iterator());
        originalExtractions.addAll(completedEndWidows.iterator());
        originalExtractions.filterOverlapping();
        return originalExtractions;
    }

    /***
     * Returns a string representation of this object.
     *
     * @return a textual representation
     */
    public String toString() {
        return new ToStringBuilder(this)
            .append("begin widow matchers", beginWidowMatchers)
            .append("end widow matchers", endWidowMatchers)
            .toString();
    }

    /***
     * {@inheritDoc}
     */
    public void train(final ExtractionContainer answerKeys,
            final List<ContextDetails> contextDetails, final ContextMap context)
    throws ProcessingException {
        // retrieve maps of positive predictions
        final PositivePredictionsMap beginMap =
            (PositivePredictionsMap) context.get(CONTEXT_BEGIN_MAP);
        final PositivePredictionsMap endMap =
            (PositivePredictionsMap) context.get(CONTEXT_END_MAP);

        // discard all candidates starting/ending an answer key from the maps
        discardNonWidows(answerKeys, beginMap, endMap);

        // train widow matchers
        final TargetStructure targetStruct = answerKeys.getTargetStructure();
        trainWidowMatchers(answerKeys, beginMap, beginWidowMatchers,
                contextDetails, targetStruct, true);
        trainWidowMatchers(answerKeys, endMap, endWidowMatchers,
                contextDetails, targetStruct, false);
    }

    /***
     * Trains the widow matchers. Determines for each relevant context within
     * the tolerated length window of each widow or answer key whether it
     * should be trained as a positive or negative instance and trains the
     * widow matcher accordingly.
     *
     * @param answerKeys the true extractions to train for this document
     * @param widowMap a map of widowed positive predictions of the level 1
     * classifiers
     * @param widowMatchers a map from class names to binary classifiers to be
     * used to match the widowed predictions
     * @param contextDetails a list of context details representing all
     * tokens in the document
     * @param targetStruct the used target structure
     * @param completeForwards whether to complete forwards (if these are begin
     * widows) or backwards (if these are end widows)
     * @throws ProcessingException might be thrown by the widow matchers during
     * classification
     */
    private void trainWidowMatchers(final ExtractionContainer answerKeys,
            final PositivePredictionsMap widowMap,
            final Map<String, TrainableClassifier> widowMatchers,
            final List<ContextDetails> contextDetails,
            final TargetStructure targetStruct, final boolean completeForwards)
    throws ProcessingException {
        Iterator<String> classIter = widowMatchers.keySet().iterator();
        String extractionType;
        int toleratedLength;
        final SortedMap<String, SortedSet<Integer>> relevantContextsForClass
                = new TreeMap<String, SortedSet<Integer>>();
        SortedSet<Integer> relevantContexts;
        final Map<String, Set<Integer>> positiveContextsForClass
                = new HashMap<String, Set<Integer>>();
        Set<Integer> positiveContexts;
        Iterator<Extraction> answerIter;
        Extraction answerKey;
        int startIndex;
        final String widowType = (completeForwards ? "begin" : "end");

        // iterate all extraction types (class names)
        while (classIter.hasNext()) {
            extractionType = classIter.next();
            toleratedLength = lengthFilter.toleratedLength(extractionType);
            relevantContexts = new TreeSet<Integer>();
            relevantContextsForClass.put(extractionType, relevantContexts);
            positiveContexts = new HashSet<Integer>();
            positiveContextsForClass.put(extractionType, positiveContexts);
            answerIter = answerKeys.iterator(extractionType);

            // all tokens within the tolerated length window after/before the
            // first/last token of an answer key are relevant for the matcher
            while (answerIter.hasNext()) {
                answerKey = answerIter.next();
                startIndex = completeForwards
                        ? answerKey.getIndex() : answerKey.getLastIndex();
                addRelevantContexts(relevantContexts, startIndex,
                    toleratedLength, completeForwards, contextDetails.size());

                // determine positive instance: last (if completing forwards) or
                // first (otherwise) token of an answer key
                if (completeForwards) {
                    positiveContexts.add(answerKey.getLastIndex());
                } else {
                    positiveContexts.add(answerKey.getIndex());
                }
            }
        }
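
        // Example (hypothetical numbers): for an answer key spanning tokens
        // 10-12 with toleratedLength = 4 and completeForwards = true, tokens
        // 10-13 become relevant contexts and token 12 (the last token of the
        // answer key) is the single positive instance among them.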

        final Iterator<Integer> outerIter = widowMap.keySet().iterator();
        Map<String, Prediction> positivePredictions;
        Iterator<String> innerIter;
        Integer index;

        // iterate all index positions in the widow map
        while (outerIter.hasNext()) {
            index = outerIter.next();
            positivePredictions = widowMap.get(index);
            innerIter = positivePredictions.keySet().iterator();

            // all tokens within the tolerated length window after/before a
            // widowed prediction are relevant for the corresponding matcher too
            while (innerIter.hasNext()) {
                extractionType = innerIter.next();
                relevantContexts = relevantContextsForClass.get(extractionType);
                toleratedLength = lengthFilter.toleratedLength(extractionType);
                addRelevantContexts(relevantContexts, index, toleratedLength,
                        completeForwards, contextDetails.size());
            }
        }
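
        // Including the windows around widowed predictions (and not only those
        // around answer keys) presumably ensures that the matchers also see
        // negative training instances from the regions they will later be
        // applied to during re-extraction.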

        TrainableClassifier widowMatcher;
        Integer positionToConsider;
        ContextDetails contextToConsider;
        FeatureVector features;
        classIter = relevantContextsForClass.keySet().iterator();
        Iterator<Integer> positionIterator;
        String classToTrain;
        PredictionDistribution predDist;
        String logMsgStart;
        String classificationStatus;

        // train matchers for all types
        while (classIter.hasNext()) {
            extractionType = classIter.next();
            relevantContexts = relevantContextsForClass.get(extractionType);
            positiveContexts = positiveContextsForClass.get(extractionType);
            widowMatcher = widowMatchers.get(extractionType);
            positionIterator = relevantContexts.iterator();
            logMsgStart = "Trained " + extractionType + " " + widowType
                    + " widow matcher to use ";

            // train each relevant position
            while (positionIterator.hasNext()) {
                positionToConsider = positionIterator.next();
                contextToConsider = contextDetails.get(positionToConsider);
                features = contextToConsider.getContext();

                // ignore irrelevant tokens
                if (contextToConsider.isRelevant()) {
                    // determine class to train
                    if (positiveContexts.contains(positionToConsider)) {
                        classToTrain = Boolean.TRUE.toString();
                    } else {
                        classToTrain = Boolean.FALSE.toString();
                    }

                    // train widow matcher on error (TOE): the classifier is
                    // only updated if its own prediction would have been wrong
                    predDist = widowMatcher.trainOnError(features, classToTrain,
                            widowMatcher.getAllClasses());

                    // log outcome (null means that TOE prediction was correct)
                    if (predDist == null) {
                        classificationStatus = "classified correctly";
                    } else {
                        classificationStatus = "misclassified";
                    }
                    Util.LOG.debug(logMsgStart + classToTrain + " class for '"
                            + contextToConsider.getToken() + "': "
                            + classificationStatus);
                } else {
                    Util.LOG.debug("Ignoring irrelevant token while training "
                            + "widow matchers: "
                            + contextToConsider.getToken());
                }
            }
        }
    }

}