View Javadoc

1   /*
2    * Copyright (C) 2004-2006 Christian Siefkes <christian@siefkes.net>.
3    * Development of this software is supported by the German Research Society,
4    * Berlin-Brandenburg Graduate School in Distributed Information Systems
5    * (DFG grant no. GRK 316).
6    *
7    * This program is free software; you can redistribute it and/or modify
8    * it under the terms of the GNU General Public License as published by
9    * the Free Software Foundation; either version 2 of the License, or
10   * (at your option) any later version.
11   *
12   * This program is distributed in the hope that it will be useful,
13   * but WITHOUT ANY WARRANTY; without even the implied warranty of
14   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15   * GNU General Public License for more details.
16   *
17   * You should have received a copy of the GNU General Public License
18   * along with this program; if not, visit
19   * http://www.gnu.org/licenses/gpl.html or write to the Free Software
20   * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
21   */
22  package de.fu_berlin.ties.combi;
23  
24  import java.util.Collections;
25  import java.util.Iterator;
26  import java.util.LinkedHashMap;
27  import java.util.Set;
28  import java.util.SortedSet;
29  import java.util.TreeSet;
30  
31  import de.fu_berlin.ties.ContextMap;
32  import de.fu_berlin.ties.ProcessingException;
33  import de.fu_berlin.ties.TiesConfiguration;
34  import de.fu_berlin.ties.classify.Prediction;
35  import de.fu_berlin.ties.classify.PredictionComparator;
36  import de.fu_berlin.ties.classify.PredictionDistribution;
37  import de.fu_berlin.ties.classify.Probability;
38  import de.fu_berlin.ties.extract.amend.BeginEndReextractor;
39  import de.fu_berlin.ties.extract.amend.FinalReextractor;
40  import de.fu_berlin.ties.extract.reestimate.LengthFilter;
41  import de.fu_berlin.ties.extract.reestimate.Reestimator;
42  import de.fu_berlin.ties.text.TokenDetails;
43  import de.fu_berlin.ties.util.Util;
44  
45  /***
46   * A combination strategy that uses two classifiers, one to recognize the begin
47   * of extractions and one to recognize the end.
48   *
49   * @author Christian Siefkes
50   * @version $Revision: 1.25 $, $Date: 2006/10/21 16:04:01 $, $Author: siefkes $
51   */
52  public class BeginEndStrategy extends CombinationStrategy {
53  
54      /***
55       * The character terminating each prefix.
56       */
57      private static final char PREFIX_TERMINATOR = '-';
58  
59      /***
60       * Prefix used for the begin classifier.
61       */
62      private static final String BEGIN_PREFIX = "B" + PREFIX_TERMINATOR;
63  
64      /***
65       * Prefix used for the end classifier.
66       */
67      private static final String END_PREFIX = "E" + PREFIX_TERMINATOR;
68  
69      /***
70       * String used to classify other (not begin resp. end). Actually called
71       * "A" so it will be picked in case of a tie.
72       */
73      private static final String OTHER = "A";
74  
75  
76      /***
77       * The set of all classes for the begin classifier, stored for efficiency.
78       */
79      private final SortedSet<String> beginClasses;
80  
81      /***
82       * The set of all classes for the end classifier, stored for efficiency.
83       */
84      private final SortedSet<String> endClasses;
85  
86      /***
87       * An immutable set of all classes (Strings) that can possible occur
88       * during classification.
89       */
90      private final SortedSet[] allClasses;
91  
92      /***
93       * Used to configure this instance.
94       */
95      private final TiesConfiguration config;
96  
97      /***
98       * Whether to use a {@link BeginEndReextractor} as second level.
99       */
100     private final boolean reextract;
101 
102     /***
103      * Used to notify the {@link BeginEndReextractor} (if used) of positive
104      * begin predictions for each token index.
105      */
106     private BeginEndReextractor.PositivePredictionsMap beginMap = null;
107 
108     /***
109      * Used to notify the {@link BeginEndReextractor} (if used) of positive
110      * end predictions for each token index.
111      */
112     private BeginEndReextractor.PositivePredictionsMap endMap = null;
113 
114 
115     /***
116      * Creates a new instance.
117      *
118      * @param theClasses a set of valid class names (String)
119      * @param conf used to configure this instance
120      */
121     public BeginEndStrategy(final Set<String> theClasses,
122             final TiesConfiguration conf) {
123         super(theClasses);
124 
125         // initialize fields
126         config = conf;
127         reextract = conf.getBoolean("combination.begin-end.level2");
128 
129         if (reextract) {
130             // init maps for communication with re-extractor
131             beginMap = new BeginEndReextractor.PositivePredictionsMap();
132             endMap = new BeginEndReextractor.PositivePredictionsMap();
133         }
134 
135         final TreeSet<String> bClasses = new TreeSet<String>();
136         final TreeSet<String> eClasses = new TreeSet<String>();
137         final Iterator iter = theClasses.iterator();
138         String currentClass;
139 
140         bClasses.add(OTHER);
141         eClasses.add(OTHER);
142 
143         // full sets of classes
144         while (iter.hasNext()) {
145             currentClass = (String) iter.next();
146             bClasses.add(BEGIN_PREFIX + currentClass);
147             eClasses.add(END_PREFIX + currentClass);
148         }
149 
150         // initialize unmodifyable sets + set array
151         beginClasses = Collections.unmodifiableSortedSet(bClasses);
152         endClasses = Collections.unmodifiableSortedSet(eClasses);
153         allClasses = new SortedSet[] {beginClasses, endClasses};
154     }
155 
156 
157     /***
158      * {@inheritDoc}
159      */
160     public Set[] activeClasses() {
161         // all classes are always active
162         return allClasses;
163     }
164 
165     /***
166      * {@inheritDoc}
167      */
168     public Set[] allClasses() {
169         return allClasses;
170     }
171 
172     /***
173      * {@inheritDoc}
174      */
175     public ContextMap contextForReextractor() {
176         final ContextMap context;
177 
178         if (reextract) {
179             // store old maps in context and init new ones
180             context = new ContextMap();
181             context.put(BeginEndReextractor.CONTEXT_BEGIN_MAP, beginMap);
182             context.put(BeginEndReextractor.CONTEXT_END_MAP, endMap);
183             beginMap = new BeginEndReextractor.PositivePredictionsMap();
184             endMap = new BeginEndReextractor.PositivePredictionsMap();
185         } else {
186             context = null;
187         }
188 
189         return context;
190     }
191 
192     /***
193      * {@inheritDoc} This implementation returns a {@link BeginEndReextractor},
194      * if configured. In this case the re-estimator chain <strong>must</strong>
195      * contain a {@link LengthFilter}, otherwise this method will throw an
196      * {@link IllegalArgumentException}.
197      */
198     public FinalReextractor initReextractor(final Reestimator reestimatorChain)
199     throws ProcessingException {
200         // use 2nd level, if configured
201         if (reextract) {
202             // find LengthFilter is re-estimator chain
203             Reestimator estimator = reestimatorChain;
204             LengthFilter lengthFilter = null;
205 
206             while ((estimator != null) && (lengthFilter == null)) {
207                 if (estimator instanceof LengthFilter) {
208                     // got it!
209                     lengthFilter = (LengthFilter) estimator;
210                 } else {
211                     // look at previous re-estimator in chain
212                     estimator = estimator.getPrecedingReestimator();
213                 }
214             }
215 
216             if (lengthFilter != null) {
217                 return new BeginEndReextractor(getValidClasses(), config,
218                         lengthFilter);
219             } else {
220                 throw new IllegalArgumentException("No LengthFilter found in "
221                     + "re-estimator chain -- cannot init BeginEndReextractor");
222             }
223         } else {
224             return null;
225         }
226     }
227 
228     /***
229      * {@inheritDoc}
230      */
231     protected boolean resetHook() {
232         final boolean result;
233 
234         if (!state().isEnd() && (state().getType() != null)) {
235             // discard not-yet-finished extraction
236             result = true;
237         } else {
238             return false;
239         }
240         return result;
241     }
242 
243     /***
244      * Helper method that stores the positive (more likely than {@link #OTHER})
245      * predictions of a classifier in a map for communication with the
246      * {@link BeginEndReextractor}. This method should only be called if there
247      * is at least one positive prediction
248      *
249      * @param predIter an iterator over the predictions to store
250      * @param map the map to add to
251      * @param prefix a prefix to strip from the type of predictions
252      * @param tokenIndex the index of the current token
253      * @throws IllegalStateException if there are no positive predictions, i.e.
254      * if {@link #OTHER} is the most likely prediction
255      */
256     private void storePositivePredictions(final Iterator<Prediction> predIter,
257             final BeginEndReextractor.PositivePredictionsMap map,
258             final String prefix, final int tokenIndex)
259     throws IllegalStateException {
260         // build map of all end pred's with prob. greater than O
261         final LinkedHashMap<String, Prediction> bestMap =
262             new LinkedHashMap<String, Prediction>();
263         boolean done = false;
264         Prediction currentPred;
265         String currentType;
266 
267         currentPred = predIter.next();
268         currentType = currentPred.getType();
269 
270         while (predIter.hasNext() && !done) {
271             currentPred = predIter.next();
272             currentType = currentPred.getType();
273 
274             if (currentType.equals(OTHER)) {
275                 // reached O prediction
276                 done = true;
277             } else {
278                 // map from class (w/o prefix) to prediction
279                 bestMap.put(currentType.substring(prefix.length()),
280                         currentPred);
281             }
282         }
283 
284         if (!bestMap.isEmpty()) {
285             // store in provided outer map, using token index as key
286             map.put(tokenIndex, bestMap);
287         } else {
288             // inner map is empty -- method should not have been called
289             throw new IllegalStateException("No positive predictions");
290         }
291     }
292 
293     /***
294      * {@inheritDoc}
295      */
296     public String[] translateCurrentState(final CombinationState currentState)
297             throws IllegalArgumentException {
298         final String beginResult;
299         final String endResult;
300 
301         if (currentState.isBegin()) {
302             // positive instance for the begin classifier
303             beginResult = BEGIN_PREFIX + currentState.getType();
304         } else {
305             beginResult = OTHER;
306         }
307 
308         if (currentState.isEnd()) {
309             // positive instance for the end classifier
310             endResult = END_PREFIX + currentState.getType();
311         } else {
312             endResult = OTHER;
313         }
314 
315         return new String[] {beginResult, endResult};
316     }
317 
318     /***
319      * {@inheritDoc}
320      */
321     public CombinationState translateResult(
322             final PredictionDistribution[] predictions,
323             final TokenDetails details)
324     throws IllegalArgumentException {
325         if (predictions.length != 2) {
326             throw new IllegalArgumentException(
327                 "Illegal number of classifiers: " + predictions.length
328                 + " instead of 2");
329         }
330 
331         final CombinationState result;
332         final Probability prob;
333         final Prediction beginBest = predictions[0].best();
334         final Prediction endBest = predictions[1].best();
335         final PredictionComparator predComp = new PredictionComparator();
336 
337         boolean isBegin = !beginBest.getType().equals(OTHER);
338         boolean isEnd = !endBest.getType().equals(OTHER);
339 
340         // previous state
341         final String lastType = state().getType();
342         final boolean lastEnd = state().isEnd();
343         final String newClass;
344 
345         if (isBegin) {
346             // strip prefix to determine actual class
347             final String beginClass =
348                 beginBest.getType().substring(BEGIN_PREFIX.length());
349 
350             if (isEnd) {
351                 final String endClass =
352                     endBest.getType().substring(END_PREFIX.length());
353 
354                 if (beginClass.equals(endClass)) {
355                     // start new single-token extraction
356                     newClass = beginClass;
357 
358                     // use prediction to determine average probability
359                     final Prediction pred = new Prediction(null,
360                             beginBest.getProbability());
361                     pred.addProb(endBest.getProbability(), true);
362                     prob = pred.getProbability();
363                 } else if (endClass.equals(lastType) && !lastEnd) {
364                     // end class is valid: trust the more confident classifier
365                     // (end classifier in case of a tie)
366                     if (predComp.compare(beginBest, endBest) <= 0) {
367                         // end classifier is more or equally confident
368                         newClass = endClass;
369                         isBegin = false;
370                         Util.LOG.debug("BeginEnd: end classification "
371                             + endBest + " wins over begin classification "
372                             + beginBest + "due to higher/equal confidence");
373 
374                         // use end probability
375                         prob = endBest.getProbability();
376                     } else {
377                         // begin classifier is more confident
378                         newClass = beginClass;
379                         isEnd = false;
380                         Util.LOG.debug("BeginEnd: begin classification "
381                             + beginBest + " wins over end classification "
382                             + endBest + "due to higher confidence");
383 
384                         // use begin probability
385                         prob = beginBest.getProbability();
386                     }
387                 } else {
388                     // end class is invalid: use the begin classifier
389                     newClass = beginClass;
390 
391                     // use begin probability
392                     prob = beginBest.getProbability();
393                 }
394             } else {
395                 // begin of new instance
396                 newClass = beginClass;
397 
398                 // use begin probability
399                 prob = beginBest.getProbability();
400             }
401         } else {
402             if (isEnd) {
403                 final String endClass =
404                     endBest.getType().substring(END_PREFIX.length());
405 
406                 // check whether end prediction is valid
407                 if (endClass.equals(lastType) && !lastEnd) {
408                     newClass = endClass;
409 
410                     // use end probability
411                     prob = endBest.getProbability();
412                 } else {
413                     // discard invalid prediction
414                     isEnd = false;
415 /*                    Util.LOG.debug("BeginEnd: end classification " + endBest
416                         + " is invalid -- using state of preceding token"); */
417 
418                     if (lastEnd) {
419                         // set type to null because last extraction is finished
420                         newClass = null;
421                     } else {
422                         // re-use type from last state
423                         newClass = lastType;
424                     }
425 
426                     // set probability to null (unspecified)
427                     prob = null;
428                 }
429             } else {
430                 // neither begin nor end: use state of preceding token
431                 if (lastEnd) {
432                     // set type to null because last extraction is finished
433                     newClass = null;
434                 } else {
435                     // re-use type from last state
436                     newClass = lastType;
437                 }
438 
439                 // set probability to null (unspecified)
440                 prob = null;
441             }
442         }
443 
444         final boolean discardPrevious;
445 
446         if (isBegin && !lastEnd && (lastType != null)
447                 && !lastType.equals(newClass)) {
448             // starting a new extraction but the last extraction is unfinished
449             // and has an different type: discard it
450             discardPrevious = true;
451         } else {
452             discardPrevious = false;
453         }
454 
455         if (newClass != null) {
456             if (getValidClasses().contains(newClass)) {
457                 result = new CombinationState(newClass, isBegin, isEnd, prob,
458                     discardPrevious);
459             } else {
460                 throw new IllegalArgumentException(
461                     "Type " + newClass + " derived from predictions "
462                         + beginBest + " and " + endBest + " is invalid");
463             }
464         } else {
465             result = CombinationState.OUTSIDE;
466         }
467 
468         return result;
469     }
470 
471     /***
472      * {@inheritDoc}
473      */
474     public void updateState(final CombinationState newState,
475             final PredictionDistribution[] predictions,
476             final TokenDetails details) throws IllegalArgumentException {
477         // delegate to superclass
478         super.updateState(newState, predictions, details);
479 
480         // build maps for communication with re-extractor if required
481         if (reextract) {
482             // during training, prediction distributions will be null if the
483             // classifier was correct
484             if (predictions[0] != null) {
485                 final Prediction beginBest = predictions[0].best();
486                 final boolean isBegin = !beginBest.getType().equals(OTHER);
487 
488                 if (isBegin) {
489                     storePositivePredictions(predictions[0].iterator(),
490                             beginMap, BEGIN_PREFIX, details.getIndex());
491                 }
492             }
493 
494             if (predictions[1] != null) {
495                 final Prediction endBest = predictions[1].best();
496                 final boolean isEnd = !endBest.getType().equals(OTHER);
497 
498                 if (isEnd) {
499                     storePositivePredictions(predictions[1].iterator(), endMap,
500                             END_PREFIX, details.getIndex());
501                 }
502             }
503 
504         }
505     }
506 
507 }