1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22 package de.fu_berlin.ties.extract;
23
24 import java.util.Iterator;
25 import java.util.LinkedList;
26 import java.util.regex.Matcher;
27 import java.util.regex.Pattern;
28
29 import org.apache.commons.lang.builder.EqualsBuilder;
30 import org.apache.commons.lang.builder.HashCodeBuilder;
31
32 import de.fu_berlin.ties.classify.Prediction;
33 import de.fu_berlin.ties.classify.Probability;
34 import de.fu_berlin.ties.context.Recognition;
35 import de.fu_berlin.ties.eval.EvalStatus;
36 import de.fu_berlin.ties.io.FieldMap;
37 import de.fu_berlin.ties.text.TextUtils;
38 import de.fu_berlin.ties.text.TokenDetails;
39 import de.fu_berlin.ties.text.TokenizerFactory;
40 import de.fu_berlin.ties.util.Util;
41
42 /***
43 * Extends a {@link de.fu_berlin.ties.classify.Prediction} by also storing the
44 * extracted text and location data.
45 *
46 * <p>Instances of this class are not thread-safe.
47 *
48 * @author Christian Siefkes
49 * @version $Revision: 1.12 $, $Date: 2004/11/25 13:36:08 $, $Author: siefkes $
50 */
51 public class Extraction extends Prediction implements Recognition {
52
53 /***
54 * Serialization key for the extracted text.
55 */
56 public static final String KEY_TEXT = "Text";
57
58 /***
59 * Serialization key for the repetition of the first token.
60 */
61 public static final String KEY_FIRST_TOKEN_REP = "FirstTokenRep";
62
63 /***
64 * Serialization key for the index.
65 */
66 public static final String KEY_INDEX = "Index";
67
68 /***
69 * A list of {@link TokenDetails} describing the tokens combined in this
70 * extraction.
71 */
72 private final LinkedList<TokenDetails> detailsList =
73 new LinkedList<TokenDetails>();
74
75 /***
76 * The visible characters of the text fragment (everything except whitespace
77 * and control characters).
78 */
79 private final StringBuilder visibleChars;
80
81 /***
82 * Matches characters that are not visible (whitespace and control
83 * characters). Used to delete them for updating {@link #visibleChars}.
84 */
85 private final Matcher nonVisibleMatcher =
86 Pattern.compile(TokenizerFactory.WHITESPACE_CONTROL_OTHER).matcher("");
87
88 /***
89 * A sealed extraction cannot be changed. This means that
90 * {@linkplain #addToken(TokenDetails, Probability, boolean) adding tokens}
91 * is not allowed after sealing.
92 */
93 private boolean sealed = false;
94
95 /***
96 * Whether the {@linkplain TokenDetails#getRep() repetition of the first
97 * token} should be ignored, comparing only the text but not the position of
98 * extractions. Defaults to <code>false</code>.
99 */
100 private boolean firstTokenRepIgnored = false;
101
102 /***
103 * Creates a new instance from a field map, fulfilling the
104 * {@link de.fu_berlin.ties.io.Storable} contract. An extraction created
105 * this way will be immediately {@link #isSealed() sealed}, thus the
106 * extracted text cannot be changed.
107 *
108 * @param fieldMap map containing the serialized fields
109 */
110 public Extraction(final FieldMap fieldMap) {
111
112 super(fieldMap);
113
114
115 final Object rawText = fieldMap.get(KEY_TEXT);
116 final String newText = (rawText == null) ? "" : rawText.toString();
117 final int firstTokenRep = Util.asInt(fieldMap.get(KEY_FIRST_TOKEN_REP));
118 final int index = Util.asInt(fieldMap.get(KEY_INDEX));
119
120
121 detailsList.add(new TokenDetails(newText, firstTokenRep, index, false));
122 visibleChars = new StringBuilder(TextUtils.replaceAll(newText,
123 nonVisibleMatcher, ""));
124
125
126 setSealed(true);
127 }
128
129 /***
130 * Creates a new instance without locating it in a text (using -1 for
131 * first token rep + index), setting the probability to -1 ("confirmed")
132 * and the evaluation status to {@link EvalStatus#TRUTH}.
133 *
134 * @param predicted the predicted class
135 * @param extracted the (first part) extracted text fragment; must not be
136 * <code>null</code>
137 */
138 public Extraction(final String predicted, final String extracted) {
139 this(predicted, new TokenDetails(extracted, -1, -1, false));
140 }
141
142 /***
143 * Creates a new instance, setting the probability to -1 ("confirmed")
144 * and the evaluation status to {@link EvalStatus#TRUTH}.
145 * Use this constructor to build answer keys.
146 *
147 * @param predicted the predicted class
148 * @param details details about the extracted text fragment or its first
149 * token
150 */
151 public Extraction(final String predicted, final TokenDetails details) {
152 this(predicted, details, new Probability(-1.0), EvalStatus.TRUTH);
153 }
154
155 /***
156 * Creates a new instance, setting the evaluation status to
157 * {@link EvalStatus#UNKNOWN}.
158 *
159 * @param predicted the predicted class
160 * @param details details about the extracted text fragment or its first
161 * token
162 * @param prob the probability of the prediction
163 */
164 public Extraction(final String predicted, final TokenDetails details,
165 final Probability prob) {
166 this(predicted, details, prob, EvalStatus.UNKNOWN);
167 }
168
169 /***
170 * Creates a new instance.
171 *
172 * @param predicted the predicted class
173 * @param details details about the extracted text fragment or its first
174 * token
175 * @param prob the probability of the prediction
176 * @param status the {@linkplain EvalStatus evaluation status} of this
177 * instance
178 */
179 public Extraction(final String predicted, final TokenDetails details,
180 final Probability prob, final EvalStatus status) {
181 super(predicted, prob, status);
182 detailsList.add(details);
183 visibleChars = new StringBuilder(TextUtils.replaceAll(
184 details.getToken(), nonVisibleMatcher, ""));
185 }
186
187 /***
188 * Adds a token to this extraction, delegating to
189 * {@link #addToken(TokenDetails, Probability, boolean)} with a probability
190 * of -1 ("confirmed"). Use this method when building answer keys.
191 *
192 * @param details details about the new token
193 * @param atEnd whether to add the new token at the end or at the
194 * start
195 * @throws IllegalStateException if this extraction
196 * {@link #isSealed() is sealed}
197 */
198 public void addToken(final TokenDetails details, final boolean atEnd)
199 throws IllegalStateException {
200 addToken(details, new Probability(-1.0), atEnd);
201 }
202
203 /***
204 * Adds a token to this extraction, recalculating the probability by
205 * multiplying the prior probability value with the probability of the
206 * new text. Increments the token count by 1.
207 *
208 * @param details details about the new token
209 * @param prob the probability of the new token; might be <code>null</code>
210 * if the overall probability of the extraction should not be changed
211 * @param atEnd whether to add the new token at the end or at the
212 * start
213 * @throws IllegalStateException if this extraction
214 * {@link #isSealed() is sealed}; or if new and old probabilities/pRs
215 * cannot be combined
216 */
217 public void addToken(final TokenDetails details, final Probability prob,
218 final boolean atEnd) throws IllegalStateException {
219
220 if (isSealed()) {
221 throw new IllegalStateException(
222 "Cannot change text of sealed extraction");
223 }
224
225
226 super.addProb(prob, atEnd);
227 final String newVisibleChars =
228 TextUtils.replaceAll(details.getToken(), nonVisibleMatcher, "");
229
230 if (atEnd) {
231 detailsList.addLast(details);
232 visibleChars.append(newVisibleChars);
233 } else {
234 detailsList.addFirst(details);
235 visibleChars.insert(0, newVisibleChars);
236 }
237 }
238
239 /***
240 * Indicates whether some other object is "equal to" this one, fulfulling
241 * the {@link Object#equals(java.lang.Object)} contract. The
242 * {@linkplain Prediction#getEvalStatus() evaluation status} is ignored when
243 * checking equality, thus if all other fields of two extractions are equal,
244 * this method will return <code>true</code> even if their evaluation states
245 * differ. Only the {@link #getVisibleChars() visible characters} of the
246 * extractions are compared, whitespace and control characters are ignored.
247 *
248 * @param obj the reference object with which to compare
249 * @return <code>true</code> iff the specified object is an
250 * {@link Extraction} equal to this instance
251 */
252 public boolean equals(final Object obj) {
253 if (obj == this) {
254 return true;
255 } else if ((obj != null) && (getClass().equals(obj.getClass()))) {
256
257
258 final Extraction other = (Extraction) obj;
259 return new EqualsBuilder()
260 .appendSuper(super.equals(obj))
261 .append(getVisibleChars(), other.getVisibleChars())
262 .append(getFirstTokenRep(), other.getFirstTokenRep())
263 .isEquals();
264 } else {
265 return false;
266 }
267 }
268
269 /***
270 * Returns the repetition of the first token of the extraction in the
271 * original text (counting starts with 0, as the first occurrence is the
272 * "0th repetition"), -1 if unknown or if {@link #isFirstTokenRepIgnored()}
273 * is <code>true</code>. This is useful to locate this extraction in the
274 * original text.
275 *
276 * @return the value of the attribute
277 */
278 public int getFirstTokenRep() {
279 if (firstTokenRepIgnored) {
280 return -1;
281 } else {
282 return detailsList.getFirst().getRep();
283 }
284 }
285
286 /***
287 * Returns the index of the first token in the text (indexing starts with
288 * 0); or -1 if unknown/irrelevant.
289 *
290 * @return the value of the attribute
291 */
292 public int getIndex() {
293 return detailsList.getFirst().getIndex();
294 }
295
296 /***
297 * Returns the extracted text fragment.
298 * @return the extracted text
299 */
300 public String getText() {
301 final StringBuilder text = new StringBuilder();
302 final Iterator<TokenDetails> detailsIter = detailsList.iterator();
303 TokenDetails details;
304
305 while (detailsIter.hasNext()) {
306 details = detailsIter.next();
307 if (details.isWhitespaceBefore() && (text.length() > 0)) {
308 text.append(' ');
309 }
310 text.append(details.getToken());
311 }
312 return text.toString();
313 }
314
315 /***
316 * Returns the visible characters of the text fragment (everything except
317 * whitespace and control characters).
318 *
319 * @return the visible characters
320 */
321 public String getVisibleChars() {
322 return visibleChars.toString();
323 }
324
325 /***
326 * Returns a hash code value for this object, fulfulling the
327 * {@link Object#hashCode()} contract.
328 * @return a hash code value for this object
329 */
330 public int hashCode() {
331
332
333 return new HashCodeBuilder(7, 11)
334 .appendSuper(super.hashCode())
335 .append(getVisibleChars())
336 .append(getFirstTokenRep())
337 .toHashCode();
338 }
339
340 /***
341 * Whether the {@linkplain TokenDetails#getRep() repetition of the first
342 * token} should be ignored, comparing only the text but not the position of
343 * extractions. Defaults to <code>false</code>.
344 *
345 * @return the value of the attribute
346 */
347 public boolean isFirstTokenRepIgnored() {
348 return firstTokenRepIgnored;
349 }
350
351 /***
352 * Whether this extraction has been sealed. The text of a sealed extraction
353 * cannot be changed. This means that
354 * {@linkplain #addToken(TokenDetails, Probability, boolean) adding tokens}
355 * is not allowed after sealing.
356 *
357 * @return <code>true</code> iff this extraction is sealed
358 */
359 public boolean isSealed() {
360 return sealed;
361 }
362
363 /***
364 * Deletes one of the tokens from this prediction. At least one
365 * token must always remain, i.e. {@link #tokenCount()} must be 2 or
366 * more prior to calling this method.
367 *
368 * @param atEnd whether to delete the first or the last token
369 * @return details describing the removed token
370 * @throws IllegalStateException if there is only one token left or
371 * if this extraction {@link #isSealed() is sealed}
372 */
373 public TokenDetails removeToken(final boolean atEnd)
374 throws IllegalStateException {
375
376 if (isSealed()) {
377 throw new IllegalStateException(
378 "Cannot change text of sealed extraction");
379 }
380
381 if (tokenCount() != probCount()) {
382 throw new IllegalStateException(
383 "Invariant violation: number of tokens " + tokenCount() +
384 " != number of probabilities " + probCount());
385 }
386
387
388 super.removeProb(atEnd);
389
390
391 final TokenDetails removed;
392 if (atEnd) {
393 removed = detailsList.removeLast();
394 } else {
395 removed = detailsList.removeFirst();
396 }
397
398
399 final String charsToDelete =
400 TextUtils.replaceAll(removed.getToken(), nonVisibleMatcher, "");
401 final int startIndex, endIndex;
402
403 if (atEnd) {
404
405 endIndex = visibleChars.length();
406 startIndex = endIndex - charsToDelete.length();
407 } else {
408
409 endIndex = charsToDelete.length();
410 startIndex = 0;
411 }
412
413
414 if (visibleChars.substring(startIndex, endIndex).equals(
415 charsToDelete)) {
416 visibleChars.delete(startIndex, endIndex);
417 } else {
418 throw new RuntimeException("Invariant violation: "
419 + (atEnd ? "last" : "first")
420 + " visible chararacters are "
421 + visibleChars.substring(startIndex, endIndex)
422 + " instead of " + charsToDelete);
423 }
424
425 return removed;
426 }
427
428
429 /***
430 * Specified whether the {@linkplain TokenDetails#getRep() repetition of
431 * the first token} should be ignored, comparing only the text but not the
432 * position of extractions.
433 *
434 * @param firstTokenRepIgnored the new value of the attribute
435 */
436 public void setFirstTokenRepIgnored(boolean firstTokenRepIgnored) {
437 this.firstTokenRepIgnored = firstTokenRepIgnored;
438 }
439
440 /***
441 * Seals or unseals this extraction. The text of a sealed extraction cannot
442 * be changed. This means that
443 * {@linkplain #addToken(TokenDetails, Probability, boolean) adding tokens}
444 * is not allowed after sealing.
445 *
446 * @param newSealed the new value of the attribute
447 */
448 public void setSealed(final boolean newSealed) {
449 sealed = newSealed;
450 }
451
452 /***
453 * Stores all relevant fields of this object in a field map for
454 * serialization. An equivalent object can be created by calling
455 * {@link de.fu_berlin.ties.io.FieldMap#createObject(Class)} on the created
456 * field map.
457 *
458 * @return the created field map
459 */
460 public FieldMap storeFields() {
461
462 final FieldMap result = super.storeFields();
463 result.put(KEY_TEXT, getText());
464 final TokenDetails firstDetails = detailsList.getFirst();
465
466
467 if (!firstTokenRepIgnored && (firstDetails.getRep() >= 0)) {
468 result.put(KEY_FIRST_TOKEN_REP, new Integer(firstDetails.getRep()));
469 }
470 if (firstDetails.getIndex() >= 0) {
471 result.put(KEY_INDEX, new Integer(firstDetails.getIndex()));
472 }
473
474 return result;
475 }
476
477 /***
478 * Returns the number of tokens in this extraction. This will only be
479 * reliable if a constructor is used to give the first token and operations
480 * such as {@link Extraction#addToken(TokenDetails, Probability, boolean)
481 * addToken} are used for each further token. Omitted when serializing
482 * so it cannot be restored.
483 *
484 * @return the value of the attribute
485 */
486 public int tokenCount() {
487 return detailsList.size();
488 }
489
490 }