1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22 package de.fu_berlin.ties.combi;
23
24 import java.util.Collections;
25 import java.util.Iterator;
26 import java.util.Set;
27 import java.util.SortedSet;
28 import java.util.TreeSet;
29
30 import de.fu_berlin.ties.classify.Prediction;
31 import de.fu_berlin.ties.classify.PredictionDistribution;
32 import de.fu_berlin.ties.text.TokenDetails;
33
34 /***
35 * A combination strategy using open/close tagging (also called "BIE" tagging
36 * due to the prefixed used). Classes are prefixed with "B-" (first element of
37 * an instance), "I-" (inner element of an instance), "E-" (last element of an
38 * instance). Optionally ("BIE2") tagging, a fourth type "BE-" is used to mark
39 * the extractions consisting in a single token (which is both begin and end);
40 * otherwise ("BIE1") these are simple tagged with the "B-" prefix.
41 * "A" is used for anything else/outside of any class.
42 *
43 * @author Christian Siefkes
44 * @version $Revision: 1.16 $, $Date: 2006/10/21 16:04:01 $, $Author: siefkes $
45 */
46 public class OpenCloseStrategy extends CombinationStrategy {
47
48 /***
49 * The character terminating each prefix.
50 */
51 private static final char PREFIX_TERMINATOR = '-';
52
53 /***
54 * Prefix used to classify the first element of an instance.
55 */
56 private static final String BEGIN_PREFIX = "B" + PREFIX_TERMINATOR;
57
58 /***
59 * Prefix used to classify inner elements of an instance.
60 */
61 private static final String INNER_PREFIX = "I" + PREFIX_TERMINATOR;
62
63 /***
64 * Prefix used to classify the last element of an instance.
65 */
66 private static final String END_PREFIX = "E" + PREFIX_TERMINATOR;
67
68 /***
69 * Prefix used to classify the only element of an instance (begin and end)
70 * -- BIE2 tagging only.
71 */
72 private static final String BEGIN_END_PREFIX = "BE" + PREFIX_TERMINATOR;
73
74 /***
75 * String used to classify outside/other. Actually called "A" so it will be
76 * picked in case of a tie.
77 */
78 private static final String OUTSIDE = "A";
79
80
81 /***
82 * Whether the {@link #BEGIN_END_PREFIX} prefix is used as fourth class
83 * ("BIE2" tagging).
84 */
85 private final boolean usingBE;
86
87 /***
88 * The set of all classes prefixed with {@link #BEGIN_PREFIX}, stored for
89 * efficiency.
90 */
91 private final TreeSet<String> bClasses = new TreeSet<String>();
92
93 /***
94 * The set of all classes prefixed with {@link #BEGIN_END_PREFIX}, stored
95 * for efficiency.
96 */
97 private final TreeSet<String> beClasses;
98
99 /***
100 * An immutable set of all classes (Strings) that can possible occur
101 * during classification.
102 */
103 private final SortedSet[] allClasses;
104
105 /***
106 * Creates a new instance.
107 *
108 * @param theClasses a set of valid class names (String)
109 * @param useBE whether or not to use a fourth prefix type ("BE-") for
110 * single-token extractions (otherwise these are simple tagging with
111 * the begin prefix "B-")
112 */
113 public OpenCloseStrategy(final Set<String> theClasses,
114 final boolean useBE) {
115 super(theClasses);
116 usingBE = useBE;
117
118 final SortedSet<String> allValidClasses = new TreeSet<String>();
119 final Iterator<String> iter = theClasses.iterator();
120 String currentClass;
121
122
123 beClasses = (usingBE) ? new TreeSet<String>() : null;
124
125
126 while (iter.hasNext()) {
127 currentClass = iter.next();
128 allValidClasses.add(INNER_PREFIX + currentClass);
129 allValidClasses.add(END_PREFIX + currentClass);
130 bClasses.add(BEGIN_PREFIX + currentClass);
131
132 if (usingBE) {
133 beClasses.add(BEGIN_END_PREFIX + currentClass);
134 }
135 }
136
137 allValidClasses.add(OUTSIDE);
138 allValidClasses.addAll(bClasses);
139
140 if (usingBE) {
141 allValidClasses.addAll(beClasses);
142 }
143
144 allClasses = new SortedSet[] {
145 Collections.unmodifiableSortedSet(allValidClasses)
146 };
147 }
148
149 /***
150 * Builds a set of class names (Strings) to pass to the classifier
151 * to consider for the next decision. This implementation returns the
152 * classes in alphabetic order (using a {@link SortedSet}).
153 *
154 * @return a set of class names
155 */
156 public Set[] activeClasses() {
157 final SortedSet<String> result;
158 final String lastType = state().getType();
159
160 if ((lastType == null) || state().isEnd()) {
161
162 result = new TreeSet<String>(bClasses);
163 if (usingBE) {
164 result.addAll(beClasses);
165 }
166 result.add(OUTSIDE);
167 } else {
168
169 result = new TreeSet<String>();
170 result.add(INNER_PREFIX + lastType);
171 result.add(END_PREFIX + lastType);
172
173
174 if (!usingBE && state().isBegin()) {
175 result.addAll(bClasses);
176 result.add(OUTSIDE);
177 }
178 }
179
180 return new Set[] {result};
181 }
182
183 /***
184 * {@inheritDoc}
185 */
186 public Set[] allClasses() {
187 return allClasses;
188 }
189
190 /***
191 * Whether a special "BE-" prefix is used as fourth type for single-token
192 * extractions ("BIE2" tagging).
193 *
194 * @return the value of the attribute
195 */
196 public boolean isUsingBE() {
197 return usingBE;
198 }
199
200 /***
201 * {@inheritDoc}
202 */
203 public String[] translateCurrentState(final CombinationState currentState)
204 throws IllegalArgumentException {
205 final String result;
206 final String type = currentState.getType();
207
208 if (type == null) {
209
210 result = OUTSIDE;
211 } else {
212 final String prefix;
213
214 if (currentState.isBegin()) {
215 if (usingBE && currentState.isEnd()) {
216 prefix = BEGIN_END_PREFIX;
217 } else {
218 prefix = BEGIN_PREFIX;
219 }
220 } else {
221 if (currentState.isEnd()) {
222 prefix = END_PREFIX;
223 } else {
224 prefix = INNER_PREFIX;
225 }
226 }
227 result = prefix + type;
228 }
229
230 return new String[] {result};
231 }
232
233 /***
234 * {@inheritDoc}
235 */
236 public CombinationState translateResult(
237 final PredictionDistribution[] predictions,
238 final TokenDetails details)
239 throws IllegalArgumentException {
240 final CombinationState result;
241 final Prediction best = predictions[0].best();
242 final String predictedClass = best.getType();
243
244 if (OUTSIDE.equals(predictedClass)) {
245
246 result = CombinationState.OUTSIDE;
247 } else {
248 final int prefixEndPos =
249 predictedClass.indexOf(PREFIX_TERMINATOR);
250
251 if (prefixEndPos >= 0) {
252 final String prefix;
253 final String newClass;
254 final boolean isBegin;
255 final boolean isEnd;
256
257 prefix = predictedClass.substring(0, prefixEndPos + 1);
258 newClass = predictedClass.substring(prefixEndPos + 1);
259
260 if (BEGIN_PREFIX.equals(prefix)) {
261 isBegin = true;
262 isEnd = false;
263 } else if (usingBE && BEGIN_END_PREFIX.equals(prefix)) {
264 isBegin = true;
265 isEnd = true;
266 } else if (INNER_PREFIX.equals(prefix)) {
267 isBegin = false;
268 isEnd = false;
269 } else if (END_PREFIX.equals(prefix)) {
270 isBegin = false;
271 isEnd = true;
272 } else {
273 throw new IllegalArgumentException(
274 "Prefix " + prefix + " of predicted class "
275 + predictedClass + " is invalid");
276 }
277
278 if (getValidClasses().contains(newClass)) {
279 result = new CombinationState(newClass, isBegin, isEnd,
280 best.getProbability());
281 } else {
282 throw new IllegalArgumentException(
283 "Type " + newClass + " of predicted class "
284 + predictedClass + " is invalid");
285 }
286 } else {
287 throw new IllegalArgumentException(
288 "Invalid predicted class (no prefix): " + predictedClass);
289 }
290 }
291
292
293 return result;
294 }
295
296 }