View Javadoc

1   /*
2    * Copyright (C) 2004-2006 Christian Siefkes <christian@siefkes.net>.
3    * Development of this software is supported by the German Research Society,
4    * Berlin-Brandenburg Graduate School in Distributed Information Systems
5    * (DFG grant no. GRK 316).
6    *
7    * This program is free software; you can redistribute it and/or modify
8    * it under the terms of the GNU General Public License as published by
9    * the Free Software Foundation; either version 2 of the License, or
10   * (at your option) any later version.
11   *
12   * This program is distributed in the hope that it will be useful,
13   * but WITHOUT ANY WARRANTY; without even the implied warranty of
14   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15   * GNU General Public License for more details.
16   *
17   * You should have received a copy of the GNU General Public License
18   * along with this program; if not, visit
19   * http://www.gnu.org/licenses/gpl.html or write to the Free Software
20   * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
21   */
22  package de.fu_berlin.ties.combi;
23  
24  import java.util.Collections;
25  import java.util.Iterator;
26  import java.util.Set;
27  import java.util.SortedSet;
28  import java.util.TreeSet;
29  
30  import de.fu_berlin.ties.classify.Prediction;
31  import de.fu_berlin.ties.classify.PredictionDistribution;
32  import de.fu_berlin.ties.text.TokenDetails;
33  
34  /***
35   * A combination strategy using open/close tagging (also called "BIE" tagging
36   * due to the prefixed used). Classes are prefixed with "B-" (first element of
37   * an instance), "I-" (inner element of an instance), "E-" (last element of an
38   * instance). Optionally ("BIE2") tagging, a fourth type "BE-" is used to mark
39   * the extractions consisting in a single token (which is both begin and end);
40   * otherwise ("BIE1") these are simple tagged with the "B-" prefix.
41   * "A" is used for anything else/outside of any class.
42   *
43   * @author Christian Siefkes
44   * @version $Revision: 1.16 $, $Date: 2006/10/21 16:04:01 $, $Author: siefkes $
45   */
46  public class OpenCloseStrategy extends CombinationStrategy {
47  
48      /***
49       * The character terminating each prefix.
50       */
51      private static final char PREFIX_TERMINATOR = '-';
52  
53      /***
54       * Prefix used to classify the first element of an instance.
55       */
56      private static final String BEGIN_PREFIX = "B" + PREFIX_TERMINATOR;
57  
58      /***
59       * Prefix used to classify inner elements of an instance.
60       */
61      private static final String INNER_PREFIX = "I" + PREFIX_TERMINATOR;
62  
63      /***
64       * Prefix used to classify the last element of an instance.
65       */
66      private static final String END_PREFIX = "E" + PREFIX_TERMINATOR;
67  
68      /***
69       * Prefix used to classify the only element of an instance (begin and end)
70       * -- BIE2 tagging only.
71       */
72      private static final String BEGIN_END_PREFIX = "BE" + PREFIX_TERMINATOR;
73  
74      /***
75       * String used to classify outside/other. Actually called "A" so it will be
76       * picked in case of a tie.
77       */
78      private static final String OUTSIDE = "A";
79  
80  
81      /***
82       * Whether the {@link #BEGIN_END_PREFIX} prefix is used as fourth class
83       * ("BIE2" tagging).
84       */
85      private final boolean usingBE;
86  
87      /***
88       * The set of all classes prefixed with {@link #BEGIN_PREFIX}, stored for
89       * efficiency.
90       */
91      private final TreeSet<String> bClasses = new TreeSet<String>();
92  
93      /***
94       * The set of all classes prefixed with {@link #BEGIN_END_PREFIX}, stored
95       * for efficiency.
96       */
97      private final TreeSet<String> beClasses;
98  
99      /***
100      * An immutable set of all classes (Strings) that can possible occur
101      * during classification.
102      */
103     private final SortedSet[] allClasses;
104 
105     /***
106      * Creates a new instance.
107      *
108      * @param theClasses a set of valid class names (String)
109      * @param useBE whether or not to use a fourth prefix type ("BE-") for
110      * single-token extractions (otherwise these are simple tagging with
111      * the begin prefix "B-")
112      */
113     public OpenCloseStrategy(final Set<String> theClasses,
114             final boolean useBE) {
115         super(theClasses);
116         usingBE = useBE;
117 
118         final SortedSet<String> allValidClasses = new TreeSet<String>();
119         final Iterator<String> iter = theClasses.iterator();
120         String currentClass;
121 
122         // init set if required
123         beClasses = (usingBE) ? new TreeSet<String>() : null;
124 
125         // initialize immutable set of all classes + sets of B/BE classes
126         while (iter.hasNext()) {
127             currentClass = iter.next();
128             allValidClasses.add(INNER_PREFIX + currentClass);
129             allValidClasses.add(END_PREFIX + currentClass);
130             bClasses.add(BEGIN_PREFIX + currentClass);
131 
132             if (usingBE) {
133                 beClasses.add(BEGIN_END_PREFIX + currentClass);
134             }
135         }
136 
137         allValidClasses.add(OUTSIDE);
138         allValidClasses.addAll(bClasses);
139 
140         if (usingBE) {
141             allValidClasses.addAll(beClasses);
142         }
143 
144         allClasses = new SortedSet[] {
145             Collections.unmodifiableSortedSet(allValidClasses)
146         };
147     }
148 
149     /***
150      * Builds a set of class names (Strings) to pass to the classifier
151      * to consider for the next decision. This implementation returns the
152      * classes in alphabetic order (using a {@link SortedSet}).
153      *
154      * @return a set of class names
155      */
156     public Set[] activeClasses() {
157         final SortedSet<String> result;
158         final String lastType = state().getType();
159 
160         if ((lastType == null) || state().isEnd()) {
161             // all B + BE classes allowed + O
162             result = new TreeSet<String>(bClasses);
163             if (usingBE) {
164                 result.addAll(beClasses);
165             }
166             result.add(OUTSIDE);
167         } else {
168             // BIE2 tagging: only I + E for current class allowed
169             result = new TreeSet<String>();
170             result.add(INNER_PREFIX + lastType);
171             result.add(END_PREFIX + lastType);
172 
173             // BIE1 tagging: O + all B classes are allowed after B-
174             if (!usingBE && state().isBegin()) {
175                 result.addAll(bClasses);
176                 result.add(OUTSIDE);
177             }
178         }
179         //Util.LOG.debug("Active classes: " + result);
180         return new Set[] {result};
181     }
182 
183     /***
184      * {@inheritDoc}
185      */
186     public Set[] allClasses() {
187         return allClasses;
188     }
189 
190     /***
191      * Whether a special "BE-" prefix is used as fourth type for single-token
192      * extractions ("BIE2" tagging).
193      *
194      * @return the value of the attribute
195      */
196     public boolean isUsingBE() {
197         return usingBE;
198     }
199 
200     /***
201      * {@inheritDoc}
202      */
203     public String[] translateCurrentState(final CombinationState currentState)
204             throws IllegalArgumentException {
205         final String result;
206         final String type = currentState.getType();
207 
208         if (type == null) {
209             // null: outside
210             result = OUTSIDE;
211         } else {
212             final String prefix;
213 
214             if (currentState.isBegin()) {
215                 if (usingBE && currentState.isEnd()) {
216                     prefix = BEGIN_END_PREFIX;
217                 } else {
218                     prefix = BEGIN_PREFIX;
219                 }
220             } else {
221                 if (currentState.isEnd()) {
222                     prefix = END_PREFIX;
223                 } else {
224                     prefix = INNER_PREFIX;
225                 }
226             }
227             result = prefix + type;
228         }
229         //Util.LOG.debug("State " + currentState + " translated to: " + result);
230         return new String[] {result};
231     }
232 
233     /***
234      * {@inheritDoc}
235      */
236     public CombinationState translateResult(
237             final PredictionDistribution[] predictions,
238             final TokenDetails details)
239     throws IllegalArgumentException {
240         final CombinationState result;
241         final Prediction best = predictions[0].best();
242         final String predictedClass = best.getType();
243 
244         if (OUTSIDE.equals(predictedClass)) {
245             // outside any instance
246             result = CombinationState.OUTSIDE;
247         } else {
248             final int prefixEndPos =
249                 predictedClass.indexOf(PREFIX_TERMINATOR);
250 
251             if (prefixEndPos >= 0) {
252                 final String prefix;
253                 final String newClass;
254                 final boolean isBegin;
255                 final boolean isEnd;
256 
257                 prefix = predictedClass.substring(0, prefixEndPos + 1);
258                 newClass = predictedClass.substring(prefixEndPos + 1);
259 
260                 if (BEGIN_PREFIX.equals(prefix)) {
261                     isBegin = true;
262                     isEnd = false;
263                 } else if (usingBE && BEGIN_END_PREFIX.equals(prefix)) {
264                     isBegin = true;
265                     isEnd = true;
266                 } else if (INNER_PREFIX.equals(prefix)) {
267                     isBegin = false;
268                     isEnd = false;
269                 } else if (END_PREFIX.equals(prefix)) {
270                     isBegin = false;
271                     isEnd = true;
272                 } else {
273                     throw new IllegalArgumentException(
274                         "Prefix " + prefix + " of predicted class "
275                             + predictedClass + " is invalid");
276                 }
277 
278                 if (getValidClasses().contains(newClass)) {
279                     result = new CombinationState(newClass, isBegin, isEnd,
280                             best.getProbability());
281                 } else {
282                     throw new IllegalArgumentException(
283                         "Type " + newClass + " of predicted class "
284                             + predictedClass + " is invalid");
285                 }
286             } else {
287                 throw new IllegalArgumentException(
288                     "Invalid predicted class (no prefix): " + predictedClass);
289             }
290         }
291         /* Util.LOG.debug("Translated predicted class " + predictedClass
292                 + " to: "+ result); */
293         return result;
294     }
295 
296 }