1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22 package de.fu_berlin.ties.extract;
23
24 import java.util.HashSet;
25 import java.util.Iterator;
26 import java.util.LinkedList;
27 import java.util.Set;
28 import java.util.regex.Matcher;
29 import java.util.regex.Pattern;
30
31 import org.apache.commons.lang.builder.EqualsBuilder;
32 import org.apache.commons.lang.builder.HashCodeBuilder;
33
34 import de.fu_berlin.ties.classify.Prediction;
35 import de.fu_berlin.ties.classify.Probability;
36 import de.fu_berlin.ties.context.Recognition;
37 import de.fu_berlin.ties.eval.EvalStatus;
38 import de.fu_berlin.ties.io.FieldMap;
39 import de.fu_berlin.ties.text.TextUtils;
40 import de.fu_berlin.ties.text.TokenDetails;
41 import de.fu_berlin.ties.text.TokenizerFactory;
42 import de.fu_berlin.ties.util.Util;
43
44 /***
45 * Extends a {@link de.fu_berlin.ties.classify.Prediction} by also storing the
46 * extracted text and location data.
47 *
48 * <p>Instances of this class are not thread-safe.
49 *
50 * @author Christian Siefkes
51 * @version $Revision: 1.22 $, $Date: 2006/10/21 16:04:13 $, $Author: siefkes $
52 */
53 public class Extraction extends Prediction implements Cloneable, Recognition {
54
/**
 * Serialization key under which {@link #storeFields()} stores the extracted
 * text; the field-map constructor reads the text back from this key.
 */
public static final String KEY_TEXT = "Text";

/**
 * Serialization key for the repetition of the first token.
 * {@link #storeFields()} only writes it when the repetition is not ignored
 * and is non-negative.
 */
public static final String KEY_FIRST_TOKEN_REP = "FirstTokenRep";

/**
 * Serialization key for the index of the first token.
 * {@link #storeFields()} only writes it when the index is non-negative.
 */
public static final String KEY_INDEX = "Index";
69
70 /***
71 * A list of {@link TokenDetails} describing the tokens combined in this
72 * extraction.
73 */
74 private final LinkedList<TokenDetails> detailsList =
75 new LinkedList<TokenDetails>();
76
77 /***
78 * Overrides the index of the first token in the extraction if set to a
79 * non-negative value.
80 */
81 private int explicitIndex = -1;
82
83 /***
84 * Overrides the index of the last token in the extraction if set to a
85 * non-negative value.
86 */
87 private int explicitLastIndex = -1;
88
89 /***
90 * The visible characters of the text fragment (everything except whitespace
91 * and control characters).
92 */
93 private final StringBuilder visibleChars;
94
95 /***
96 * Matches characters that are not visible (whitespace and control
97 * characters). Used to delete them for updating {@link #visibleChars}.
98 */
99 private final Matcher nonVisibleMatcher =
100 Pattern.compile(TokenizerFactory.WHITESPACE_CONTROL_OTHER).matcher("");
101
102 /***
103 * A sealed extraction cannot be changed. This means that
104 * {@linkplain #addToken(TokenDetails, Probability, boolean) adding tokens}
105 * is not allowed after sealing.
106 */
107 private boolean sealed = false;
108
109 /***
110 * Whether the {@linkplain TokenDetails#getRep() repetition of the first
111 * token} should be ignored, comparing only the text but not the position of
112 * extractions. Defaults to <code>false</code>.
113 */
114 private boolean firstTokenRepIgnored = false;
115
116 /***
117 * A set which allows setting any user-defined properties for the
118 * extraction.
119 */
120 private Set<Object> properties = null;
121
122
/**
 * Creates a new instance from a field map, fulfilling the
 * {@link de.fu_berlin.ties.io.Storable} contract. An extraction created
 * this way will be immediately {@link #isSealed() sealed}, thus the
 * extracted text cannot be changed.
 *
 * <p>The whole text is stored as one single token. A missing text is
 * treated as the empty string; a missing first-token repetition or index
 * is treated as -1 ("unknown"), because {@link #storeFields()} leaves these
 * entries out when their values are negative or ignored.
 *
 * @param fieldMap map containing the serialized fields
 */
public Extraction(final FieldMap fieldMap) {
    // the superclass restores its own fields from the same map
    super(fieldMap);

    final Object rawText = fieldMap.get(KEY_TEXT);
    final String newText = (rawText == null) ? "" : rawText.toString();

    // Both entries are optional in a stored map, so never hand a null
    // value to the converter: fall back to the "unknown" marker instead.
    final Object rawFirstTokenRep = fieldMap.get(KEY_FIRST_TOKEN_REP);
    final int firstTokenRep = (rawFirstTokenRep == null)
        ? -1 : Util.asInt(rawFirstTokenRep);
    final Object rawIndex = fieldMap.get(KEY_INDEX);
    final int index = (rawIndex == null) ? -1 : Util.asInt(rawIndex);

    // the complete text becomes the one and only token of this instance
    detailsList.add(new TokenDetails(newText, firstTokenRep, index, false));
    visibleChars = new StringBuilder(TextUtils.replaceAll(newText,
        nonVisibleMatcher, ""));

    // deserialized extractions must not be extended any more
    setSealed(true);
}
149
/**
 * Creates a new instance without locating it in a text: the token details
 * are built with -1 for both the first token repetition and the index and
 * without preceding whitespace. Delegates to
 * {@link #Extraction(String, TokenDetails)}, which sets the probability to
 * -1 ("confirmed") and the evaluation status to {@link EvalStatus#TRUTH}.
 *
 * @param predicted the predicted class
 * @param extracted the (first part of the) extracted text fragment; must
 * not be <code>null</code>
 */
public Extraction(final String predicted, final String extracted) {
    this(predicted, new TokenDetails(extracted, -1, -1, false));
}
162
/**
 * Creates a new instance, setting the probability to -1 ("confirmed")
 * and the evaluation status to {@link EvalStatus#TRUTH}.
 * Use this constructor to build answer keys.
 *
 * @param predicted the predicted class
 * @param details details about the extracted text fragment or its first
 * token
 */
public Extraction(final String predicted, final TokenDetails details) {
    this(predicted, details, new Probability(-1.0), EvalStatus.TRUTH);
}
175
/**
 * Creates a new instance with the given probability, setting the
 * evaluation status to {@link EvalStatus#UNKNOWN}.
 *
 * @param predicted the predicted class
 * @param details details about the extracted text fragment or its first
 * token
 * @param prob the probability of the prediction
 */
public Extraction(final String predicted, final TokenDetails details,
        final Probability prob) {
    this(predicted, details, prob, EvalStatus.UNKNOWN);
}
189
190 /***
191 * Creates a new instance.
192 *
193 * @param predicted the predicted class
194 * @param details details about the extracted text fragment or its first
195 * token
196 * @param prob the probability of the prediction
197 * @param status the {@linkplain EvalStatus evaluation status} of this
198 * instance
199 */
200 public Extraction(final String predicted, final TokenDetails details,
201 final Probability prob, final EvalStatus status) {
202 super(predicted, prob, status);
203 detailsList.add(details);
204 visibleChars = new StringBuilder(TextUtils.replaceAll(
205 details.getToken(), nonVisibleMatcher, ""));
206 }
207
208
209 /***
210 * Adds a token to this extraction, delegating to
211 * {@link #addToken(TokenDetails, Probability, boolean)} with a probability
212 * of -1 ("confirmed"). Use this method when building answer keys.
213 *
214 * @param details details about the new token
215 * @param atEnd whether to add the new token at the end or at the
216 * start
217 * @throws IllegalStateException if this extraction
218 * {@link #isSealed() is sealed}
219 */
220 public void addToken(final TokenDetails details, final boolean atEnd)
221 throws IllegalStateException {
222 addToken(details, new Probability(-1.0), atEnd);
223 }
224
225 /***
226 * Adds a token to this extraction, recalculating the probability by
227 * multiplying the prior probability value with the probability of the
228 * new text. Increments the token count by 1.
229 *
230 * @param details details about the new token
231 * @param prob the probability of the new token; might be <code>null</code>
232 * if the overall probability of the extraction should not be changed
233 * @param atEnd whether to add the new token at the end or at the
234 * start
235 * @throws IllegalStateException if this extraction
236 * {@link #isSealed() is sealed}; or if new and old probabilities/pRs
237 * cannot be combined
238 */
239 public void addToken(final TokenDetails details, final Probability prob,
240 final boolean atEnd) throws IllegalStateException {
241
242 if (isSealed()) {
243 throw new IllegalStateException(
244 "Cannot change text of sealed extraction");
245 }
246
247
248 super.addProb(prob, atEnd);
249 final String newVisibleChars =
250 TextUtils.replaceAll(details.getToken(), nonVisibleMatcher, "");
251
252 if (atEnd) {
253 detailsList.addLast(details);
254 visibleChars.append(newVisibleChars);
255 } else {
256 detailsList.addFirst(details);
257 visibleChars.insert(0, newVisibleChars);
258 }
259 }
260
261 /***
262 * Creates and returns a deep copy of this object. "Deep" means that there
263 * are no dependencies between the two objects -- modifying any fields of
264 * the copy will not affect this object, and vice versa. Any user-set
265 * {@link #setProperty(Object) properties}, however, are only copied,
266 * not cloned.
267 *
268 * @return a deep copy of this object
269 */
270 public Extraction clone() {
271
272 final FieldMap myFields = storeFields();
273 final Extraction clone = new Extraction(myFields);
274
275
276 clone.setLastIndex(getLastIndex());
277 clone.setFirstTokenRepIgnored(isFirstTokenRepIgnored());
278 clone.setSealed(isSealed());
279 if (properties != null) {
280 clone.properties = new HashSet<Object>(properties);
281 }
282
283 return clone;
284 }
285
286 /***
287 * Indicates whether some other object is "equal to" this one, fulfulling
288 * the {@link Object#equals(java.lang.Object)} contract. The
289 * {@linkplain Prediction#getEvalStatus() evaluation status} is ignored when
290 * checking equality, thus if all other fields of two extractions are equal,
291 * this method will return <code>true</code> even if their evaluation states
292 * differ. Only the {@link #getVisibleChars() visible characters} of the
293 * extractions are compared, whitespace and control characters are ignored.
294 *
295 * @param obj the reference object with which to compare
296 * @return <code>true</code> iff the specified object is an
297 * {@link Extraction} equal to this instance
298 */
299 public boolean equals(final Object obj) {
300 if (obj == this) {
301 return true;
302 } else if ((obj != null) && (getClass().equals(obj.getClass()))) {
303
304
305 final Extraction other = (Extraction) obj;
306 return new EqualsBuilder()
307 .appendSuper(super.equals(obj))
308 .append(getVisibleChars(), other.getVisibleChars())
309 .append(getFirstTokenRep(), other.getFirstTokenRep())
310 .isEquals();
311 } else {
312 return false;
313 }
314 }
315
316 /***
317 * Returns the repetition of the first token of the extraction in the
318 * original text (counting starts with 0, as the first occurrence is the
319 * "0th repetition"), -1 if unknown or if {@link #isFirstTokenRepIgnored()}
320 * is <code>true</code>. This is useful to locate this extraction in the
321 * original text.
322 *
323 * @return the value of the attribute
324 */
325 public int getFirstTokenRep() {
326 if (firstTokenRepIgnored) {
327 return -1;
328 } else {
329 return detailsList.getFirst().getRep();
330 }
331 }
332
333 /***
334 * Returns the index of the first token in the text (indexing starts with
335 * 0); or -1 if unknown/irrelevant.
336 *
337 * @return the value of the attribute
338 */
339 public int getIndex() {
340 if (explicitIndex >= 0) {
341
342 return explicitIndex;
343 } else {
344
345 return detailsList.getFirst().getIndex();
346 }
347 }
348
349 /***
350 * Returns the index of the last token in the text (indexing starts with
351 * 0); or -1 if unknown/irrelevant.
352 *
353 * @return the value of the attribute
354 */
355 public int getLastIndex() {
356 if (explicitLastIndex >= 0) {
357
358 return explicitLastIndex;
359 } else {
360
361 return detailsList.getLast().getIndex();
362 }
363 }
364
365 /***
366 * Returns the extracted text fragment.
367 * @return the extracted text
368 */
369 public String getText() {
370 final StringBuilder text = new StringBuilder();
371 final Iterator<TokenDetails> detailsIter = detailsList.iterator();
372 TokenDetails details;
373
374 while (detailsIter.hasNext()) {
375 details = detailsIter.next();
376 if (details.isWhitespaceBefore() && (text.length() > 0)) {
377 text.append(' ');
378 }
379 text.append(details.getToken());
380 }
381 return text.toString();
382 }
383
384 /***
385 * Returns the visible characters of the text fragment (everything except
386 * whitespace and control characters).
387 *
388 * @return the visible characters
389 */
390 public String getVisibleChars() {
391 return visibleChars.toString();
392 }
393
394 /***
395 * Returns a hash code value for this object, fulfulling the
396 * {@link Object#hashCode()} contract.
397 * @return a hash code value for this object
398 */
399 public int hashCode() {
400
401
402 return new HashCodeBuilder(7, 11)
403 .appendSuper(super.hashCode())
404 .append(getVisibleChars())
405 .append(getFirstTokenRep())
406 .toHashCode();
407 }
408
409 /***
410 * Checks if a specific user-defined property is set for this extraction.
411 *
412 * @param prop the property to check
413 * @return <code>true</code> iff the property is set
414 */
415 public boolean hasProperty(final Object prop) {
416 if (properties != null) {
417 return properties.contains(prop);
418 } else {
419 return false;
420 }
421 }
422
423 /***
424 * Sets a user-defined property for this extraction.
425 *
426 * @param prop the property to set
427 * @return <code>true</code> iff the property had not been set before
428 */
429 public boolean setProperty(final Object prop) {
430 if (properties == null) {
431 properties = new HashSet<Object>();
432 }
433 return properties.add(prop);
434 }
435
436 /***
437 * Unsets a user-defined property for this extraction.
438 *
439 * @param prop the property to unset
440 * @return <code>true</code> iff the property had been set before
441 */
442 public boolean unsetProperty(final Object prop) {
443 if (properties != null) {
444 return properties.remove(prop);
445 } else {
446 return false;
447 }
448 }
449
450 /***
451 * Whether the {@linkplain TokenDetails#getRep() repetition of the first
452 * token} should be ignored, comparing only the text but not the position of
453 * extractions. Defaults to <code>false</code>.
454 *
455 * @return the value of the attribute
456 */
457 public boolean isFirstTokenRepIgnored() {
458 return firstTokenRepIgnored;
459 }
460
461 /***
462 * Whether this extraction has been sealed. The text of a sealed extraction
463 * cannot be changed. This means that
464 * {@linkplain #addToken(TokenDetails, Probability, boolean) adding tokens}
465 * is not allowed after sealing.
466 *
467 * @return <code>true</code> iff this extraction is sealed
468 */
469 public boolean isSealed() {
470 return sealed;
471 }
472
473 /***
474 * Modifies the probability of an extraction.
475 *
476 * @param prob the new probability, will be combined with the current
477 * token probabilities to calculate the average
478 */
479 public void modifyProbability(final Probability prob) {
480 super.addProb(prob, true);
481 }
482
483 /***
484 * Deletes one of the tokens from this prediction. At least one
485 * token must always remain, i.e. {@link #tokenCount()} must be 2 or
486 * more prior to calling this method.
487 *
488 * @param atEnd whether to delete the first or the last token
489 * @return details describing the removed token
490 * @throws IllegalStateException if there is only one token left or
491 * if this extraction {@link #isSealed() is sealed}
492 */
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550 /***
551 * Modifies the repetition of the first token of the extraction in the
552 * original text (counting starts with 0, as the first occurrence is the
553 * "0th repetition"). This also sets {@link #isFirstTokenRepIgnored()} to
554 * <code>false</code> (since it wouldn't make much sense to update the
555 * repetition if you want it to be ignored anyway).
556 *
557 * @param newFirstTokenRep the new value of the attribute
558 */
559 public void setFirstTokenRep(final int newFirstTokenRep) {
560 detailsList.getFirst().setRep(newFirstTokenRep);
561 firstTokenRepIgnored = false;
562 }
563
564 /***
565 * Specified whether the {@linkplain TokenDetails#getRep() repetition of
566 * the first token} should be ignored, comparing only the text but not the
567 * position of extractions.
568 *
569 * @param ftRepIgnored the new value of the attribute
570 */
571 public void setFirstTokenRepIgnored(final boolean ftRepIgnored) {
572 this.firstTokenRepIgnored = ftRepIgnored;
573 }
574
575 /***
576 * Overrides the index of the first token in the text (indexing starts with
577 * 0).
578 *
579 * @param newIndex the value of the attribute; if negative, the index of the
580 * first token will be used instead
581 */
582 public void setIndex(final int newIndex) {
583 explicitIndex = newIndex;
584 }
585
586 /***
587 * Overrides the index of the last token in the text (indexing starts with
588 * 0).
589 *
590 * @param newLastIndex the value of the attribute; if negative, the index
591 * of the last token will be used instead
592 */
593 public void setLastIndex(final int newLastIndex) {
594 explicitLastIndex = newLastIndex;
595 }
596
597 /***
598 * Seals or unseals this extraction. The text of a sealed extraction cannot
599 * be changed. This means that
600 * {@linkplain #addToken(TokenDetails, Probability, boolean) adding tokens}
601 * is not allowed after sealing.
602 *
603 * @param newSealed the new value of the attribute
604 */
605 public void setSealed(final boolean newSealed) {
606 sealed = newSealed;
607 }
608
609 /***
610 * Stores all relevant fields of this object in a field map for
611 * serialization. An equivalent object can be created by calling
612 * {@link de.fu_berlin.ties.io.FieldMap#createObject(Class)} on the created
613 * field map.
614 *
615 * @return the created field map
616 */
617 public FieldMap storeFields() {
618
619 final FieldMap result = super.storeFields();
620 result.put(KEY_TEXT, getText());
621 final TokenDetails firstDetails = detailsList.getFirst();
622 final int index = getIndex();
623
624
625 if (!firstTokenRepIgnored && (firstDetails.getRep() >= 0)) {
626 result.put(KEY_FIRST_TOKEN_REP, new Integer(firstDetails.getRep()));
627 }
628 if (index >= 0) {
629 result.put(KEY_INDEX, new Integer(index));
630 }
631
632 return result;
633 }
634
635 /***
636 * Returns the number of tokens in this extraction. This will only be
637 * reliable if a constructor is used to give the first token and operations
638 * such as {@link Extraction#addToken(TokenDetails, Probability, boolean)
639 * addToken} are used for each further token. Omitted when serializing
640 * so it cannot be restored.
641 *
642 * @return the value of the attribute
643 */
644 public int tokenCount() {
645 return detailsList.size();
646 }
647
648 }