View Javadoc

1   /*
2    * Copyright (C) 2004 Christian Siefkes <christian@siefkes.net>.
3    * Development of this software is supported by the German Research Society,
4    * Berlin-Brandenburg Graduate School in Distributed Information Systems
5    * (DFG grant no. GRK 316).
6    *
7    * This library is free software; you can redistribute it and/or
8    * modify it under the terms of the GNU Lesser General Public
9    * License as published by the Free Software Foundation; either
10   * version 2.1 of the License, or (at your option) any later version.
11   *
12   * This library is distributed in the hope that it will be useful,
13   * but WITHOUT ANY WARRANTY; without even the implied warranty of
14   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15   * Lesser General Public License for more details.
16   *
17   * You should have received a copy of the GNU Lesser General Public
18   * License along with this library; if not, visit
19   * http://www.gnu.org/licenses/lgpl.html or write to the Free Software
20   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
21   */
22  package de.fu_berlin.ties.combi;
23  
24  import java.util.Collections;
25  import java.util.Iterator;
26  import java.util.Set;
27  import java.util.SortedSet;
28  import java.util.TreeSet;
29  
30  import de.fu_berlin.ties.classify.Prediction;
31  import de.fu_berlin.ties.classify.PredictionComparator;
32  import de.fu_berlin.ties.classify.PredictionDistribution;
33  import de.fu_berlin.ties.util.Util;
34  
35  /***
36   * A combination strategy that uses two classifiers, one to recognize the begin
37   * of extractions and one to recognize the end.
38   *
39   * @author Christian Siefkes
40   * @version $Revision: 1.7 $, $Date: 2004/09/15 16:08:45 $, $Author: siefkes $
41   */
42  public class BeginEndStrategy extends CombinationStrategy {
43  
44      /***
45       * The character terminating each prefix.
46       */
47      private static final char PREFIX_TERMINATOR = '-';
48  
49      /***
50       * Prefix used for the begin classifier.
51       */
52      private static final String BEGIN_PREFIX = "B" + PREFIX_TERMINATOR;
53  
54      /***
55       * Prefix used for the end classifier.
56       */
57      private static final String END_PREFIX = "E" + PREFIX_TERMINATOR;
58  
59      /***
60       * String used to classify other (not begin resp. end). Actually called
61       * "A" so it will be picked in case of a tie.
62       */
63      private static final String OTHER = "A";
64  
65      /***
66       * The set of all classes for the begin classifier, stored for efficiency.
67       */
68      private final SortedSet<String> beginClasses;
69  
70      /***
71       * The set of all classes for the end classifier, stored for efficiency.
72       */
73      private final SortedSet<String> endClasses;
74  
75      /***
76       * An immutable set of all classes (Strings) that can possible occur
77       * during classification.
78       */
79      private final SortedSet[] allClasses;
80  
81      /***
82       * Creates a new instance.
83       *
84       * @param theClasses a set of valid class names (String)
85       */
86      public BeginEndStrategy(final Set<String> theClasses) {
87          super(theClasses);
88  
89          final TreeSet<String> bClasses = new TreeSet<String>();
90          final TreeSet<String> eClasses = new TreeSet<String>();
91          final Iterator iter = theClasses.iterator();
92          String currentClass;
93  
94          bClasses.add(OTHER);
95          eClasses.add(OTHER);
96  
97          // full sets of classes
98          while (iter.hasNext()) {
99              currentClass = (String) iter.next();
100             bClasses.add(BEGIN_PREFIX + currentClass);
101             eClasses.add(END_PREFIX + currentClass);
102         }
103 
104         // initialize unmodifyable sets + set array
105         beginClasses = Collections.unmodifiableSortedSet(bClasses);
106         endClasses = Collections.unmodifiableSortedSet(eClasses);
107         allClasses = new SortedSet[] {beginClasses, endClasses};
108     }
109 
110     /***
111      * {@inheritDoc}
112      */
113     public Set<String>[] activeClasses() {
114         // all classes are always active
115         return allClasses;
116     }
117 
118     /***
119      * {@inheritDoc}
120      */
121     public Set<String>[] allClasses() {
122         return allClasses;
123     }
124 
125     /***
126      * {@inheritDoc}
127      */
128     protected boolean resetHook() {
129         final boolean result;
130 
131         if (!state().isEnd() && (state().getType() != null)) {
132             // discard not-yet-finished extraction
133             result = true;
134         } else {
135             return false;
136         }
137         return result;
138     }
139 
140     /***
141      * {@inheritDoc}
142      */
143     public String[] translateCurrentState(final CombinationState currentState)
144             throws IllegalArgumentException {
145         final String beginResult;
146         final String endResult;
147 
148         if (currentState.isBegin()) {
149             // positive instance for the begin classifier
150             beginResult = BEGIN_PREFIX + currentState.getType();
151         } else {
152             beginResult = OTHER;
153         }
154 
155         if (currentState.isEnd()) {
156             // positive instance for the end classifier
157             endResult = END_PREFIX + currentState.getType();
158         } else {
159             endResult = OTHER;
160         }
161 
162         return new String[] {beginResult, endResult};
163     }
164 
165     /***
166      * {@inheritDoc}
167      */
168     public CombinationState translateResult(
169             final PredictionDistribution[] predictions)
170     throws IllegalArgumentException {
171         if (predictions.length != 2) {
172             throw new IllegalArgumentException(
173                 "Illegal number of classifiers: " + predictions.length
174                 + " instead of 2");
175         }
176 
177         final CombinationState result;
178         final Prediction beginBest = predictions[0].best();
179         final Prediction endBest = predictions[1].best();
180         final PredictionComparator predComp = new PredictionComparator();
181 
182         boolean isBegin = !beginBest.getType().equals(OTHER);
183         boolean isEnd = !endBest.getType().equals(OTHER);
184         final String newClass;
185 
186         // previous state
187         final String lastType = state().getType();
188         final boolean lastEnd = state().isEnd();
189 
190         if (isBegin) {
191             // strip prefix to determine actual class
192             final String beginClass =
193                 beginBest.getType().substring(BEGIN_PREFIX.length());
194 
195             if (isEnd) {
196                 final String endClass =
197                     endBest.getType().substring(END_PREFIX.length());
198 
199                 if (beginClass.equals(endClass)) {
200                     // start new single-token extraction
201                     newClass = beginClass;
202                 } else if (endClass.equals(lastType) && !lastEnd) {
203                     // end class is valid: trust the more confident classifier
204                     // (end classifier in case of a type)
205                     if (predComp.compare(beginBest, endBest) <= 0) {
206                         // end classifier is more or equally confident
207                         newClass = endClass;
208                         isBegin = false;
209                         Util.LOG.debug("BeginEnd: end classification "
210                             + endBest + " wins over begin classification "
211                             + beginBest + "due to higher/equal confidence");
212                     } else {
213                         // begin classifier is more confident
214                         newClass = beginClass;
215                         isEnd = false;
216                         Util.LOG.debug("BeginEnd: begin classification "
217                             + beginBest + " wins over end classification "
218                             + endBest + "due to higher confidence");
219                     }
220                 } else {
221                     // end class is invalid: use the begin classifier
222                     newClass = beginClass;
223 /*                    Util.LOG.debug("BeginEnd: end classification " + endBest
224                         + " is invalid -- using begin classification"); */
225                 }
226             } else {
227                 // begin of new instance
228                 newClass = beginClass;
229             }
230         } else {
231             if (isEnd) {
232                 final String endClass =
233                     endBest.getType().substring(END_PREFIX.length());
234 
235                 // check whether end prediction is valid
236                 if (endClass.equals(lastType) && !lastEnd) {
237                     newClass = endClass;
238                 } else {
239                     // discard invalid prediction
240                     isEnd = false;
241 /*                    Util.LOG.debug("BeginEnd: end classification " + endBest
242                         + " is invalid -- using state of preceding token"); */
243 
244                     if (lastEnd) {
245                         // set type to null because last extraction is finished
246                         newClass = null;
247                     } else {
248                         // re-use type from last state
249                         newClass = lastType;
250                     }
251                 }
252             } else {
253                 // neither begin nor end: use state of preceding token
254                 if (lastEnd) {
255                     // set type to null because last extraction is finished
256                     newClass = null;
257                 } else {
258                     // re-use type from last state
259                     newClass = lastType;
260                 }
261             }
262         }
263 
264         final boolean discardPrevious;
265 
266         if (isBegin && !lastEnd && (lastType != null)
267                 && !lastType.equals(newClass)) {
268             // starting a new extraction but the last extraction is unfinished
269             // and has an different type: discard it
270             discardPrevious = true;
271         } else {
272             discardPrevious = false;
273         }
274 
275         if (newClass != null) {
276             if (getValidClasses().contains(newClass)) {
277                 result = new CombinationState(newClass, isBegin, isEnd,
278                     discardPrevious);
279             } else {
280                 throw new IllegalArgumentException(
281                     "Type " + newClass + " derived from predictions "
282                         + beginBest + " and " + endBest + " is invalid");
283             }
284         } else {
285             result = CombinationState.OUTSIDE;
286         }
287 
288         return result;
289     }
290 
291 }