View Javadoc

1   /*
2    * Copyright (C) 2004-2006 Christian Siefkes <christian@siefkes.net>.
3    * Development of this software is supported by the German Research Society,
4    * Berlin-Brandenburg Graduate School in Distributed Information Systems
5    * (DFG grant no. GRK 316).
6    *
7    * This program is free software; you can redistribute it and/or modify
8    * it under the terms of the GNU General Public License as published by
9    * the Free Software Foundation; either version 2 of the License, or
10   * (at your option) any later version.
11   *
12   * This program is distributed in the hope that it will be useful,
13   * but WITHOUT ANY WARRANTY; without even the implied warranty of
14   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15   * GNU General Public License for more details.
16   *
17   * You should have received a copy of the GNU General Public License
18   * along with this program; if not, visit
19   * http://www.gnu.org/licenses/gpl.html or write to the Free Software
20   * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
21   */
22  package de.fu_berlin.ties.combi;
23  
24  import java.util.Collections;
25  import java.util.Iterator;
26  import java.util.Set;
27  import java.util.SortedSet;
28  import java.util.TreeSet;
29  
30  import org.apache.commons.lang.builder.ToStringBuilder;
31  
32  import de.fu_berlin.ties.classify.Prediction;
33  import de.fu_berlin.ties.classify.PredictionDistribution;
34  import de.fu_berlin.ties.text.TokenDetails;
35  
36  /***
37   * A combination strategy using inside/outside tagging. Classes are prefixed
38   * with "I-" (in this class) or "B-" (begin of a new instance of this class),
39   * "A" is used for anything else/outside of any class.
40   *
41   * <p>{@link de.fu_berlin.ties.combi.CombinationState#isEnd()} is not
42   * supported by this class and will always return <code>false</code>.
43   *
44   * @author Christian Siefkes
45   * @version $Revision: 1.16 $, $Date: 2006/10/21 16:04:01 $, $Author: siefkes $
46   */
47  public class InsideOutsideStrategy extends CombinationStrategy {
48  
49      /***
50       * Prefix used to classify the begin of a new instance.
51       */
52      private static final String BEGIN_PREFIX = "B-";
53  
54      /***
55       * Prefix used to classify items within instances.
56       */
57      private static final String IN_PREFIX = "I-";
58  
59      /***
60       * String used to classify outside/other. Actually called "A" so it will be
61       * picked in case of a tie.
62       */
63      private static final String OUTSIDE = "A";
64  
65      /***
66       * Whether the "B-" prefix is used for the first item of each instance
67       * ("O/B/I" mode) or only for the first item of instances immediately
68       * following an instance of the same class ("O/I/B" mode).
69       */
70      private final boolean bStartingAll;
71  
72      /***
73       * The set of all classes prefixed with {@link #BEGIN_PREFIX}, stored for
74       * efficiency.
75       */
76      private final TreeSet<String> bClasses;
77  
78      /***
79       * The set of all classes prefixed with {@link #IN_PREFIX}, stored for
80       * efficiency.
81       */
82      private final TreeSet<String> iClasses;
83  
84      /***
85       * An immutable set of all classes (Strings) that can possible occur
86       * during classification.
87       */
88      private final SortedSet[] allClasses;
89  
90      /***
91       * Creates a new instance, using "O/I/B" mode: the "B-" is only used
92       * where required for disambiguation, otherwise the "I-" prefix is used.
93       *
94       * @param theClasses a set of valid class names (String)
95       */
96      public InsideOutsideStrategy(final Set<String> theClasses) {
97          this(theClasses, false);
98      }
99  
100     /***
101      * Creates a new instance, using "O/I/B" mode (setting
102      * {@link #isBStartingAll()} to <code>false</code>). In this mode, the "B-"
103      * is only used where required for disambiguation, otherwise the "I-" prefix
104      * is used.
105      *
106      * @param theClasses a set of valid class names (String)
107      * @param bStartingAllClasses if <code>true</code>, the "B-" prefix is used
108      * for the first item of each instance ("O/B/I" mode); otherwise it is used
109      * only for first item of instances immediately following an instance of the
110      * same class ("O/I/B" mode)
111      */
112     public InsideOutsideStrategy(final Set<String> theClasses,
113                                  final boolean bStartingAllClasses) {
114         super(theClasses);
115         bStartingAll = bStartingAllClasses;
116         bClasses = new TreeSet<String>();
117         iClasses = new TreeSet<String>();
118         final Iterator iter = theClasses.iterator();
119         String currentClass;
120 
121         // prepend prefixes to all classes
122         while (iter.hasNext()) {
123             currentClass = (String) iter.next();
124             bClasses.add(BEGIN_PREFIX + currentClass);
125             iClasses.add(IN_PREFIX + currentClass);
126         }
127 
128         // initialize immutable set of all classes
129         final SortedSet<String> allValidClasses = new TreeSet<String>();
130         allValidClasses.addAll(bClasses);
131         allValidClasses.addAll(iClasses);
132         allValidClasses.add(OUTSIDE);
133         allClasses = new SortedSet[] {
134             Collections.unmodifiableSortedSet(allValidClasses)
135         };
136     }
137 
138     /***
139      * Builds a set of class names (Strings) to pass to the classifier
140      * to consider for the next decision. This implementation returns the
141      * classes in alphabetic order (using a {@link SortedSet}).
142      *
143      * @return a set of class names
144      */
145     public Set[] activeClasses() {
146         final SortedSet<String> result;
147         if (bStartingAll) {
148             // all B classes allowed
149             result = new TreeSet<String>(bClasses);
150         } else {
151             // all I classes allowed
152             result = new TreeSet<String>(iClasses);
153         }
154 
155         if (state().getType() != null) {
156             if (bStartingAll) {
157                 // I is used to continue current instance
158                 result.add(IN_PREFIX + state().getType());
159             } else {
160                 // B is used to start new instance of same type
161                 result.add(BEGIN_PREFIX + state().getType());
162             }
163         }
164 
165         // O is always allows
166         result.add(OUTSIDE);
167         return new Set[] {result};
168     }
169 
170     /***
171      * {@inheritDoc}
172      */
173     public Set[] allClasses() {
174         return allClasses;
175     }
176 
177     /***
178      * Whether the "B-" prefix is used for the first item of each instance
179      * ("O/B/I" mode) or only for the first item of instances immediately
180      * following an instance of the same class ("O/I/B" mode).
181      *
182      * @return the value of the attribute
183      */
184     public boolean isBStartingAll() {
185         return bStartingAll;
186     }
187 
188     /***
189      * {@inheritDoc}
190      */
191     public String[] translateCurrentState(final CombinationState currentState)
192             throws IllegalArgumentException {
193         final String result;
194         final String type = currentState.getType();
195 
196         if (type == null) {
197             // null: outside
198             result = OUTSIDE;
199         } else if (!currentState.isBegin()) {
200             // continuation of last instance (must have same type)
201             if (type.equals(state().getType())) {
202                 result = IN_PREFIX + type;
203             } else {
204                 throw new IllegalArgumentException(
205                     "Illegal continuation state: last state "
206                         + state().getType() + " differs from current state "
207                         + type);
208             }
209         } else if (type.equals(state().getType())) {
210             // new instance of same type
211             result = BEGIN_PREFIX + type;
212         } else {
213             // new instance, last class was different
214             if (bStartingAll) {
215                 result = BEGIN_PREFIX + type;
216             } else {
217                 result = IN_PREFIX + type;
218             }
219         }
220         return new String[] {result};
221     }
222 
223     /***
224      * {@inheritDoc}
225      */
226     public CombinationState translateResult(
227             final PredictionDistribution[] predictions,
228             final TokenDetails details)
229     throws IllegalArgumentException {
230         final CombinationState result;
231         final Prediction best = predictions[0].best();
232         final String predictedClass = best.getType();
233 
234         if (OUTSIDE.equals(predictedClass)) {
235             // outside any instance
236             result = CombinationState.OUTSIDE;
237         } else {
238             final String newClass;
239             final boolean isBegin;
240 
241             if (predictedClass.startsWith(BEGIN_PREFIX)) {
242                 // begin of a instance
243                 isBegin = true;
244                 newClass = predictedClass.substring(BEGIN_PREFIX.length());
245             } else if (predictedClass.startsWith(IN_PREFIX)) {
246                 // within a instance
247                 isBegin = false;
248                 newClass = predictedClass.substring(IN_PREFIX.length());
249             } else {
250                 throw new IllegalArgumentException(
251                     "Invalid predicted class: " + predictedClass);
252             }
253 
254             if (isBegin) {
255                 if (!bStartingAll) {
256                     // check that this is allowed (last item had same class)
257                     if (!newClass.equals(state().getType())) {
258                         throw new IllegalArgumentException("'O/I/B' mode: "
259                             + "B must occur after item of same class, but "
260                             + "old class (" + state().getType()
261                             + ") and new class (" + newClass + ") differ");
262                     }
263                 }
264 
265                 // start of new instance
266                 result = new CombinationState(newClass, true, false,
267                         best.getProbability());
268             } else {
269                 if (bStartingAll) {
270                     // check that this is allowed (last item had same class)
271                     if (!newClass.equals(state().getType())) {
272                         throw new IllegalArgumentException("'O/B/I' mode: "
273                             + "I must occur after item of same class, but"
274                             + "old class (" + state().getType()
275                             + ") and new class (" + newClass + ") differ");
276                     }
277 
278                     // continue last instance
279                     result = new CombinationState(newClass, false, false,
280                             best.getProbability());
281                 } else {
282                     // O/I/B mode:
283                     if (newClass.equals(state().getType())) {
284                         // continue last instance
285                         result = new CombinationState(newClass, false, false,
286                                 best.getProbability());
287                     } else {
288                         // start of new instance
289                         result = new CombinationState(newClass, true, false,
290                                 best.getProbability());
291                     }
292                 }
293             }
294         }
295         return result;
296     }
297 
298     /***
299      * Returns a string representation of this object.
300      *
301      * @return a textual representation
302      */
303     public String toString() {
304         return new ToStringBuilder(this)
305             .appendSuper(super.toString())
306             .append("B starts all", bStartingAll)
307             .toString();
308     }
309 
310 }