View Javadoc

1   /*
2    * Copyright (C) 2003-2006 Christian Siefkes <christian@siefkes.net>.
3    * Development of this software is supported by the German Research Society,
4    * Berlin-Brandenburg Graduate School in Distributed Information Systems
5    * (DFG grant no. GRK 316).
6    *
7    * This program is free software; you can redistribute it and/or modify
8    * it under the terms of the GNU General Public License as published by
9    * the Free Software Foundation; either version 2 of the License, or
10   * (at your option) any later version.
11   *
12   * This program is distributed in the hope that it will be useful,
13   * but WITHOUT ANY WARRANTY; without even the implied warranty of
14   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15   * GNU General Public License for more details.
16   *
17   * You should have received a copy of the GNU General Public License
18   * along with this program; if not, visit
19   * http://www.gnu.org/licenses/gpl.html or write to the Free Software
20   * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
21   */
22  package de.fu_berlin.ties.extract;
23  
24  import java.util.HashSet;
25  import java.util.Iterator;
26  import java.util.LinkedList;
27  import java.util.Set;
28  import java.util.regex.Matcher;
29  import java.util.regex.Pattern;
30  
31  import org.apache.commons.lang.builder.EqualsBuilder;
32  import org.apache.commons.lang.builder.HashCodeBuilder;
33  
34  import de.fu_berlin.ties.classify.Prediction;
35  import de.fu_berlin.ties.classify.Probability;
36  import de.fu_berlin.ties.context.Recognition;
37  import de.fu_berlin.ties.eval.EvalStatus;
38  import de.fu_berlin.ties.io.FieldMap;
39  import de.fu_berlin.ties.text.TextUtils;
40  import de.fu_berlin.ties.text.TokenDetails;
41  import de.fu_berlin.ties.text.TokenizerFactory;
42  import de.fu_berlin.ties.util.Util;
43  
44  /***
45   * Extends a {@link de.fu_berlin.ties.classify.Prediction} by also storing the
46   * extracted text and location data.
47   *
48   * <p>Instances of this class are not thread-safe.
49   *
50   * @author Christian Siefkes
51   * @version $Revision: 1.22 $, $Date: 2006/10/21 16:04:13 $, $Author: siefkes $
52   */
53  public class Extraction extends Prediction implements Cloneable, Recognition {
54  
55      /***
56       * Serialization key for the extracted text.
57       */
58      public static final String KEY_TEXT = "Text";
59  
60      /***
61       * Serialization key for the repetition of the first token.
62       */
63      public static final String KEY_FIRST_TOKEN_REP = "FirstTokenRep";
64  
65      /***
66       * Serialization key for the index.
67       */
68      public static final String KEY_INDEX = "Index";
69  
70      /***
71       * A list of {@link TokenDetails} describing the tokens combined in this
72       * extraction.
73       */
74      private final LinkedList<TokenDetails> detailsList =
75          new LinkedList<TokenDetails>();
76  
77      /***
78       * Overrides the index of the first token in the extraction if set to a
79       * non-negative value.
80       */
81      private int explicitIndex = -1;
82  
83      /***
84       * Overrides the index of the last token in the extraction if set to a
85       * non-negative value.
86       */
87      private int explicitLastIndex = -1;
88  
89      /***
90       * The visible characters of the text fragment (everything except whitespace
91       * and control characters).
92       */
93      private final StringBuilder visibleChars;
94  
95      /***
96       * Matches characters that are not visible (whitespace and control
97       * characters). Used to delete them for updating {@link #visibleChars}.
98       */
99      private final Matcher nonVisibleMatcher =
100         Pattern.compile(TokenizerFactory.WHITESPACE_CONTROL_OTHER).matcher("");
101 
102     /***
103      * A sealed extraction cannot be changed. This means that
104      * {@linkplain #addToken(TokenDetails, Probability, boolean) adding tokens}
105      * is not allowed after sealing.
106      */
107     private boolean sealed = false;
108 
109     /***
110      * Whether the {@linkplain TokenDetails#getRep() repetition of the first
111      * token} should be ignored, comparing only the text but not the position of
112      * extractions. Defaults to <code>false</code>.
113      */
114     private boolean firstTokenRepIgnored = false;
115 
116     /***
117      * A set which allows setting any user-defined properties for the
118      * extraction.
119      */
120     private Set<Object> properties = null;
121 
122 
123     /***
124      * Creates a new instance from a field map, fulfilling the
125      * {@link de.fu_berlin.ties.io.Storable} contract. An extraction created
126      * this way will be immediately {@link #isSealed() sealed}, thus the
127      * extracted text cannot be changed.
128      *
129      * @param fieldMap map containing the serialized fields
130      */
131     public Extraction(final FieldMap fieldMap) {
132         // delegate to super constructor
133         super(fieldMap);
134 
135         // read values from map
136         final Object rawText = fieldMap.get(KEY_TEXT);
137         final String newText = (rawText == null) ? "" : rawText.toString();
138         final int firstTokenRep = Util.asInt(fieldMap.get(KEY_FIRST_TOKEN_REP));
139         final int index = Util.asInt(fieldMap.get(KEY_INDEX));
140 
141         // initialize own fields
142         detailsList.add(new TokenDetails(newText, firstTokenRep, index, false));
143         visibleChars = new StringBuilder(TextUtils.replaceAll(newText,
144             nonVisibleMatcher, ""));
145 
146         // seal extraction
147         setSealed(true);
148     }
149 
150     /***
151      * Creates a new instance without locating it in a text (using -1 for 
152      * first token rep + index), setting the probability to -1 ("confirmed")
153      * and the evaluation status to {@link EvalStatus#TRUTH}.
154      *
155      * @param predicted the predicted class
156      * @param extracted the (first part) extracted text fragment; must not be
157      * <code>null</code>
158      */
159     public Extraction(final String predicted, final String extracted) {
160         this(predicted, new TokenDetails(extracted, -1, -1, false));
161     }
162 
163     /***
164      * Creates a new instance, setting the probability to -1 ("confirmed")
165      * and the evaluation status to {@link EvalStatus#TRUTH}.
166      * Use this constructor to build answer keys.
167      *
168      * @param predicted the predicted class
169      * @param details details about the extracted text fragment or its first
170      * token
171      */
172     public Extraction(final String predicted, final TokenDetails details) {
173         this(predicted, details, new Probability(-1.0), EvalStatus.TRUTH);
174     }
175 
176     /***
177      * Creates a new instance, setting the evaluation status to
178      * {@link EvalStatus#UNKNOWN}.
179      *
180      * @param predicted the predicted class
181      * @param details details about the extracted text fragment or its first
182      * token
183      * @param prob the probability of the prediction
184      */
185     public Extraction(final String predicted, final TokenDetails details,
186             final Probability prob) {
187         this(predicted, details, prob, EvalStatus.UNKNOWN);
188     }
189 
190     /***
191      * Creates a new instance.
192      *
193      * @param predicted the predicted class
194      * @param details details about the extracted text fragment or its first
195      * token
196      * @param prob the probability of the prediction
197      * @param status the {@linkplain EvalStatus evaluation status} of this
198      * instance
199      */
200     public Extraction(final String predicted, final TokenDetails details,
201             final Probability prob, final EvalStatus status) {
202         super(predicted, prob, status);
203         detailsList.add(details);
204         visibleChars = new StringBuilder(TextUtils.replaceAll(
205                 details.getToken(), nonVisibleMatcher, ""));
206     }
207 
208 
209     /***
210      * Adds a token to this extraction, delegating to
211      * {@link #addToken(TokenDetails, Probability, boolean)} with a probability
212      * of -1 ("confirmed"). Use this method when building answer keys.
213      *
214      * @param details details about the new token
215      * @param atEnd whether to add the new token at the end or at the
216      * start
217      * @throws IllegalStateException if this extraction
218      * {@link #isSealed() is sealed}
219      */
220     public void addToken(final TokenDetails details, final boolean atEnd)
221             throws IllegalStateException {
222         addToken(details, new Probability(-1.0), atEnd);
223     }
224 
225     /***
226      * Adds a token to this extraction, recalculating the probability by
227      * multiplying the prior probability value with the probability of the
228      * new text. Increments the token count by 1.
229      *
230      * @param details details about the new token
231      * @param prob the probability of the new token; might be <code>null</code>
232      * if the overall probability of the extraction should not be changed
233      * @param atEnd whether to add the new token at the end or at the
234      * start
235      * @throws IllegalStateException if this extraction
236      * {@link #isSealed() is sealed}; or if new and old probabilities/pRs
237      * cannot be combined
238      */
239     public void addToken(final TokenDetails details, final Probability prob,
240             final boolean atEnd) throws IllegalStateException {
241         // check state
242         if (isSealed()) {
243             throw new IllegalStateException(
244                 "Cannot change text of sealed extraction");
245         }
246 
247         // update superclass + fields
248         super.addProb(prob, atEnd);
249         final String newVisibleChars =
250             TextUtils.replaceAll(details.getToken(), nonVisibleMatcher, "");
251 
252         if (atEnd) {
253             detailsList.addLast(details);
254             visibleChars.append(newVisibleChars);
255         } else {
256             detailsList.addFirst(details);
257             visibleChars.insert(0, newVisibleChars);
258         }
259     }
260 
261     /***
262      * Creates and returns a deep copy of this object. "Deep" means that there
263      * are no dependencies between the two objects -- modifying any fields of
264      * the copy will not affect this object, and vice versa. Any user-set
265      * {@link #setProperty(Object) properties}, however, are only copied,
266      * not cloned.
267      *
268      * @return a deep copy of this object
269      */
270     public Extraction clone() {
271         // most fields will by reliably duplicated in this way
272         final FieldMap myFields = storeFields();
273         final Extraction clone = new Extraction(myFields);
274 
275         // adapt the remaining fields
276         clone.setLastIndex(getLastIndex());
277         clone.setFirstTokenRepIgnored(isFirstTokenRepIgnored());
278         clone.setSealed(isSealed());
279         if (properties != null) {
280             clone.properties = new HashSet<Object>(properties);
281         }
282 
283         return clone;
284     }
285 
286     /***
287      * Indicates whether some other object is "equal to" this one, fulfulling
288      * the {@link Object#equals(java.lang.Object)} contract. The
289      * {@linkplain Prediction#getEvalStatus() evaluation status} is ignored when
290      * checking equality, thus if all other fields of two extractions are equal,
291      * this method will return <code>true</code> even if their evaluation states
292      * differ. Only the {@link #getVisibleChars() visible characters} of the
293      * extractions are compared, whitespace and control characters are ignored.
294      *
295      * @param obj the reference object with which to compare
296      * @return <code>true</code> iff the specified object is an
297      * {@link Extraction} equal to this instance
298      */
299     public boolean equals(final Object obj) {
300         if (obj == this) {
301             return true;
302         } else if ((obj != null) && (getClass().equals(obj.getClass()))) {
303             // used getClass instead of instanceof because otherwise subclasses
304             // with additional fields would break the contract
305             final Extraction other = (Extraction) obj;
306             return new EqualsBuilder()
307                 .appendSuper(super.equals(obj))
308                 .append(getVisibleChars(), other.getVisibleChars())
309                 .append(getFirstTokenRep(), other.getFirstTokenRep())
310                 .isEquals();
311         } else {
312             return false;
313         }
314     }
315 
316     /***
317      * Returns the repetition of the first token of the extraction in the
318      * original text (counting starts with 0, as the first occurrence is the
319      * "0th repetition"), -1 if unknown or if {@link #isFirstTokenRepIgnored()}
320      * is <code>true</code>. This is useful to locate this extraction in the
321      * original text.
322      *
323      * @return the value of the attribute
324      */
325     public int getFirstTokenRep() {
326         if (firstTokenRepIgnored) {
327             return -1;
328         } else {
329             return detailsList.getFirst().getRep();
330         }
331     }
332 
333     /***
334      * Returns the index of the first token in the text (indexing starts with
335      * 0); or -1 if unknown/irrelevant.
336      *
337      * @return the value of the attribute
338      */
339     public int getIndex() {
340         if (explicitIndex >= 0) {
341             // return explicitly set index
342             return explicitIndex;
343         } else {
344             // return index of first token
345             return detailsList.getFirst().getIndex();
346         }
347     }
348 
349     /***
350      * Returns the index of the last token in the text (indexing starts with
351      * 0); or -1 if unknown/irrelevant.
352      *
353      * @return the value of the attribute
354      */
355     public int getLastIndex() {
356         if (explicitLastIndex >= 0) {
357             // return explicitly set last index
358             return explicitLastIndex;
359         } else {
360             // return index of last token
361             return detailsList.getLast().getIndex();
362         }
363     }
364 
365     /***
366      * Returns the extracted text fragment.
367      * @return the extracted text
368      */
369     public String getText() {
370         final StringBuilder text = new StringBuilder();
371         final Iterator<TokenDetails> detailsIter = detailsList.iterator();
372         TokenDetails details;
373 
374         while (detailsIter.hasNext()) {
375             details = detailsIter.next();
376             if (details.isWhitespaceBefore() && (text.length() > 0)) {
377                 text.append(' ');
378             }
379             text.append(details.getToken());
380         }
381         return text.toString();
382     }
383 
384     /***
385      * Returns the visible characters of the text fragment (everything except
386      * whitespace and control characters).
387      *
388      * @return the visible characters
389      */
390     public String getVisibleChars() {
391         return visibleChars.toString();
392     }
393 
394     /***
395      * Returns a hash code value for this object, fulfulling the
396      * {@link Object#hashCode()} contract.
397      * @return a hash code value for this object
398      */
399     public int hashCode() {
400         // you pick two hard-coded, randomly chosen, non-zero, odd numbers
401         // (preferably primes); ideally different for each class
402         return new HashCodeBuilder(7, 11)
403             .appendSuper(super.hashCode())
404             .append(getVisibleChars())
405             .append(getFirstTokenRep())
406             .toHashCode();
407     }
408 
409     /***
410      * Checks if a specific user-defined property is set for this extraction.
411      *
412      * @param prop the property to check
413      * @return <code>true</code> iff the property is set
414      */
415     public boolean hasProperty(final Object prop) {
416         if (properties != null) {
417             return properties.contains(prop);
418         } else {
419             return false;
420         }
421     }
422 
423     /***
424      * Sets a user-defined property for this extraction.
425      *
426      * @param prop the property to set
427      * @return <code>true</code> iff the property had not been set before
428      */
429     public boolean setProperty(final Object prop) {
430         if (properties == null) { // lazy init
431             properties = new HashSet<Object>();
432         }
433         return properties.add(prop);
434     }
435 
436     /***
437      * Unsets a user-defined property for this extraction.
438      *
439      * @param prop the property to unset
440      * @return <code>true</code> iff the property had been set before
441      */
442     public boolean unsetProperty(final Object prop) {
443         if (properties != null) {
444             return properties.remove(prop);
445         } else {
446             return false;
447         }
448     }
449 
450     /***
451      * Whether the {@linkplain TokenDetails#getRep() repetition of the first
452      * token} should be ignored, comparing only the text but not the position of
453      * extractions. Defaults to <code>false</code>.
454      *
455      * @return the value of the attribute
456      */
457     public boolean isFirstTokenRepIgnored() {
458         return firstTokenRepIgnored;
459     }
460 
461     /***
462      * Whether this extraction has been sealed. The text of a sealed extraction
463      * cannot be changed. This means that
464      * {@linkplain #addToken(TokenDetails, Probability, boolean) adding tokens}
465      * is not allowed after sealing.
466      *
467      * @return <code>true</code> iff this extraction is sealed
468      */
469     public boolean isSealed() {
470         return sealed;
471     }
472 
473     /***
474      * Modifies the probability of an extraction.
475      *
476      * @param prob the new probability, will be combined with the current
477      * token probabilities to calculate the average
478      */
479     public void modifyProbability(final Probability prob) {
480         super.addProb(prob, true);
481     }
482 
483     /***
484      * Deletes one of the tokens from this prediction. At least one
485      * token must always remain, i.e. {@link #tokenCount()} must be 2 or
486      * more prior to calling this method.
487      *
488      * @param atEnd whether to delete the first or the last token
489      * @return details describing the removed token
490      * @throws IllegalStateException if there is only one token left or
491      * if this extraction {@link #isSealed() is sealed}
492      */
493 /* Does not work with current modifyProbability implementation
494     public TokenDetails removeToken(final boolean atEnd)
495     throws IllegalStateException {
496         // check state
497         if (isSealed()) {
498             throw new IllegalStateException(
499                 "Cannot change text of sealed extraction");
500         }
501         // check invariant: there must at many probabilities as there are tokens
502         if (tokenCount() != probCount()) {
503             throw new IllegalStateException(
504                     "Invariant violation: number of tokens " + tokenCount() +
505                     " != number of probabilities " + probCount());
506         }
507 
508         // remove probabilities from superclass
509         super.removeProb(atEnd);
510 
511         // remove token details
512         final TokenDetails removed;
513         if (atEnd) {
514             removed = detailsList.removeLast();
515         } else {
516             removed = detailsList.removeFirst();
517         }
518 
519         // prepare to adapt visibleText
520         final String charsToDelete =
521             TextUtils.replaceAll(removed.getToken(), nonVisibleMatcher, "");
522         final int startIndex, endIndex;
523 
524         if (atEnd) {
525             // delete last n visible chars
526             endIndex = visibleChars.length();
527             startIndex = endIndex - charsToDelete.length();
528         } else {
529             // delete first n visible chars
530             endIndex = charsToDelete.length();
531             startIndex = 0;
532         }
533 
534         // delete chars after ensuring that everything is as it should
535         if (visibleChars.substring(startIndex, endIndex).equals(
536                 charsToDelete)) {
537             visibleChars.delete(startIndex, endIndex);
538         } else {
539             throw new RuntimeException("Invariant violation: "
540                     + (atEnd ? "last" : "first")
541                     + " visible chararacters are "
542                     + visibleChars.substring(startIndex, endIndex)
543                     + " instead of " + charsToDelete);
544         }
545 
546         return removed;
547     }
548 */
549 
550     /***
551      * Modifies the repetition of the first token of the extraction in the
552      * original text (counting starts with 0, as the first occurrence is the
553      * "0th repetition"). This also sets {@link #isFirstTokenRepIgnored()} to
554      * <code>false</code> (since it wouldn't make much sense to update the
555      * repetition if you want it to be ignored anyway).
556      *
557      * @param newFirstTokenRep the new value of the attribute
558      */
559     public void setFirstTokenRep(final int newFirstTokenRep) {
560         detailsList.getFirst().setRep(newFirstTokenRep);
561         firstTokenRepIgnored = false;
562     }
563 
564     /***
565      * Specified whether the {@linkplain TokenDetails#getRep() repetition of
566      * the first token} should be ignored, comparing only the text but not the
567      * position of extractions.
568      *
569      * @param ftRepIgnored the new value of the attribute
570      */
571     public void setFirstTokenRepIgnored(final boolean ftRepIgnored) {
572         this.firstTokenRepIgnored = ftRepIgnored;
573     }
574 
575     /***
576      * Overrides the index of the first token in the text (indexing starts with
577      * 0).
578      *
579      * @param newIndex the value of the attribute; if negative, the index of the
580      * first token will be used instead
581      */
582     public void setIndex(final int newIndex) {
583         explicitIndex = newIndex;
584     }
585 
586     /***
587      * Overrides the index of the last token in the text (indexing starts with
588      * 0).
589      *
590      * @param newLastIndex the value of the attribute; if negative, the index
591      * of the last token will be used instead
592      */
593     public void setLastIndex(final int newLastIndex) {
594         explicitLastIndex = newLastIndex;
595     }
596 
597     /***
598      * Seals or unseals this extraction. The text of a sealed extraction cannot
599      * be changed. This means that
600      * {@linkplain #addToken(TokenDetails, Probability, boolean) adding tokens}
601      * is not allowed after sealing.
602      *
603      * @param newSealed the new value of the attribute
604      */
605     public void setSealed(final boolean newSealed) {
606         sealed = newSealed;
607     }
608 
609     /***
610      * Stores all relevant fields of this object in a field map for
611      * serialization. An equivalent object can be created by calling
612      * {@link de.fu_berlin.ties.io.FieldMap#createObject(Class)} on the created
613      * field map.
614      *
615      * @return the created field map
616      */
617     public FieldMap storeFields() {
618         // delegate to super class and add own fields
619         final FieldMap result = super.storeFields();
620         result.put(KEY_TEXT, getText());
621         final TokenDetails firstDetails = detailsList.getFirst();
622         final int index = getIndex();
623 
624         // ignore if ignored or unknown (-1)
625         if (!firstTokenRepIgnored && (firstDetails.getRep() >= 0)) {
626             result.put(KEY_FIRST_TOKEN_REP, new Integer(firstDetails.getRep()));
627         }
628         if (index >= 0) {
629             result.put(KEY_INDEX, new Integer(index));
630         }
631 
632         return result;
633     }
634 
635     /***
636      * Returns the number of tokens in this extraction. This will only be
637      * reliable if a constructor is used to give the first token and operations
638      * such as {@link Extraction#addToken(TokenDetails, Probability, boolean)
639      * addToken} are used for each further token. Omitted when serializing
640      * so it cannot be restored.
641      *
642      * @return the value of the attribute
643      */
644     public int tokenCount() {
645         return detailsList.size();
646     }
647 
648 }