View Javadoc

1   /*
2    * Copyright (C) 2003-2004 Christian Siefkes <christian@siefkes.net>.
3    * Development of this software is supported by the German Research Society,
4    * Berlin-Brandenburg Graduate School in Distributed Information Systems
5    * (DFG grant no. GRK 316).
6    *
7    * This library is free software; you can redistribute it and/or
8    * modify it under the terms of the GNU Lesser General Public
9    * License as published by the Free Software Foundation; either
10   * version 2.1 of the License, or (at your option) any later version.
11   *
12   * This library is distributed in the hope that it will be useful,
13   * but WITHOUT ANY WARRANTY; without even the implied warranty of
14   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15   * Lesser General Public License for more details.
16   *
17   * You should have received a copy of the GNU Lesser General Public
18   * License along with this library; if not, visit
19   * http://www.gnu.org/licenses/lgpl.html or write to the Free Software
20   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
21   */
22  package de.fu_berlin.ties.extract;
23  
24  import java.util.Iterator;
25  import java.util.LinkedList;
26  import java.util.regex.Matcher;
27  import java.util.regex.Pattern;
28  
29  import org.apache.commons.lang.builder.EqualsBuilder;
30  import org.apache.commons.lang.builder.HashCodeBuilder;
31  
32  import de.fu_berlin.ties.classify.Prediction;
33  import de.fu_berlin.ties.classify.Probability;
34  import de.fu_berlin.ties.context.Recognition;
35  import de.fu_berlin.ties.eval.EvalStatus;
36  import de.fu_berlin.ties.io.FieldMap;
37  import de.fu_berlin.ties.text.TextUtils;
38  import de.fu_berlin.ties.text.TokenDetails;
39  import de.fu_berlin.ties.text.TokenizerFactory;
40  import de.fu_berlin.ties.util.Util;
41  
42  /***
43   * Extends a {@link de.fu_berlin.ties.classify.Prediction} by also storing the
44   * extracted text and location data.
45   *
46   * <p>Instances of this class are not thread-safe.
47   *
48   * @author Christian Siefkes
49   * @version $Revision: 1.12 $, $Date: 2004/11/25 13:36:08 $, $Author: siefkes $
50   */
51  public class Extraction extends Prediction implements Recognition {
52  
53      /***
54       * Serialization key for the extracted text.
55       */
56      public static final String KEY_TEXT = "Text";
57  
58      /***
59       * Serialization key for the repetition of the first token.
60       */
61      public static final String KEY_FIRST_TOKEN_REP = "FirstTokenRep";
62  
63      /***
64       * Serialization key for the index.
65       */
66      public static final String KEY_INDEX = "Index";
67  
68      /***
69       * A list of {@link TokenDetails} describing the tokens combined in this
70       * extraction.
71       */
72      private final LinkedList<TokenDetails> detailsList =
73          new LinkedList<TokenDetails>();
74  
75      /***
76       * The visible characters of the text fragment (everything except whitespace
77       * and control characters).
78       */
79      private final StringBuilder visibleChars;
80  
81      /***
82       * Matches characters that are not visible (whitespace and control
83       * characters). Used to delete them for updating {@link #visibleChars}.
84       */
85      private final Matcher nonVisibleMatcher =
86          Pattern.compile(TokenizerFactory.WHITESPACE_CONTROL_OTHER).matcher("");
87  
88      /***
89       * A sealed extraction cannot be changed. This means that
90       * {@linkplain #addToken(TokenDetails, Probability, boolean) adding tokens}
91       * is not allowed after sealing.
92       */
93      private boolean sealed = false;
94  
95      /***
96       * Whether the {@linkplain TokenDetails#getRep() repetition of the first
97       * token} should be ignored, comparing only the text but not the position of
98       * extractions. Defaults to <code>false</code>.
99       */
100     private boolean firstTokenRepIgnored = false;
101 
102     /***
103      * Creates a new instance from a field map, fulfilling the
104      * {@link de.fu_berlin.ties.io.Storable} contract. An extraction created
105      * this way will be immediately {@link #isSealed() sealed}, thus the
106      * extracted text cannot be changed.
107      *
108      * @param fieldMap map containing the serialized fields
109      */
110     public Extraction(final FieldMap fieldMap) {
111         // delegate to super constructor
112         super(fieldMap);
113 
114         // read values from map
115         final Object rawText = fieldMap.get(KEY_TEXT);
116         final String newText = (rawText == null) ? "" : rawText.toString();
117         final int firstTokenRep = Util.asInt(fieldMap.get(KEY_FIRST_TOKEN_REP));
118         final int index = Util.asInt(fieldMap.get(KEY_INDEX));
119 
120         // initialize own fields
121         detailsList.add(new TokenDetails(newText, firstTokenRep, index, false));
122         visibleChars = new StringBuilder(TextUtils.replaceAll(newText,
123             nonVisibleMatcher, ""));
124 
125         // seal extraction
126         setSealed(true);
127     }
128 
129     /***
130      * Creates a new instance without locating it in a text (using -1 for 
131      * first token rep + index), setting the probability to -1 ("confirmed")
132      * and the evaluation status to {@link EvalStatus#TRUTH}.
133      *
134      * @param predicted the predicted class
135      * @param extracted the (first part) extracted text fragment; must not be
136      * <code>null</code>
137      */
138     public Extraction(final String predicted, final String extracted) {
139         this(predicted, new TokenDetails(extracted, -1, -1, false));
140     }
141 
142     /***
143      * Creates a new instance, setting the probability to -1 ("confirmed")
144      * and the evaluation status to {@link EvalStatus#TRUTH}.
145      * Use this constructor to build answer keys.
146      *
147      * @param predicted the predicted class
148      * @param details details about the extracted text fragment or its first
149      * token
150      */
151     public Extraction(final String predicted, final TokenDetails details) {
152         this(predicted, details, new Probability(-1.0), EvalStatus.TRUTH);
153     }
154 
155     /***
156      * Creates a new instance, setting the evaluation status to
157      * {@link EvalStatus#UNKNOWN}.
158      *
159      * @param predicted the predicted class
160      * @param details details about the extracted text fragment or its first
161      * token
162      * @param prob the probability of the prediction
163      */
164     public Extraction(final String predicted, final TokenDetails details,
165             final Probability prob) {
166         this(predicted, details, prob, EvalStatus.UNKNOWN);
167     }
168 
169     /***
170      * Creates a new instance.
171      *
172      * @param predicted the predicted class
173      * @param details details about the extracted text fragment or its first
174      * token
175      * @param prob the probability of the prediction
176      * @param status the {@linkplain EvalStatus evaluation status} of this
177      * instance
178      */
179     public Extraction(final String predicted, final TokenDetails details,
180             final Probability prob, final EvalStatus status) {
181         super(predicted, prob, status);
182         detailsList.add(details);
183         visibleChars = new StringBuilder(TextUtils.replaceAll(
184                 details.getToken(), nonVisibleMatcher, ""));
185     }
186 
187     /***
188      * Adds a token to this extraction, delegating to
189      * {@link #addToken(TokenDetails, Probability, boolean)} with a probability
190      * of -1 ("confirmed"). Use this method when building answer keys.
191      *
192      * @param details details about the new token
193      * @param atEnd whether to add the new token at the end or at the
194      * start
195      * @throws IllegalStateException if this extraction
196      * {@link #isSealed() is sealed}
197      */
198     public void addToken(final TokenDetails details, final boolean atEnd)
199             throws IllegalStateException {
200         addToken(details, new Probability(-1.0), atEnd);
201     }
202 
203     /***
204      * Adds a token to this extraction, recalculating the probability by
205      * multiplying the prior probability value with the probability of the
206      * new text. Increments the token count by 1.
207      *
208      * @param details details about the new token
209      * @param prob the probability of the new token; might be <code>null</code>
210      * if the overall probability of the extraction should not be changed
211      * @param atEnd whether to add the new token at the end or at the
212      * start
213      * @throws IllegalStateException if this extraction
214      * {@link #isSealed() is sealed}; or if new and old probabilities/pRs
215      * cannot be combined
216      */
217     public void addToken(final TokenDetails details, final Probability prob,
218             final boolean atEnd) throws IllegalStateException {
219         // check state
220         if (isSealed()) {
221             throw new IllegalStateException(
222                 "Cannot change text of sealed extraction");
223         }
224 
225         // update superclass + fields
226         super.addProb(prob, atEnd);
227         final String newVisibleChars =
228             TextUtils.replaceAll(details.getToken(), nonVisibleMatcher, "");
229 
230         if (atEnd) {
231             detailsList.addLast(details);
232             visibleChars.append(newVisibleChars);
233         } else {
234             detailsList.addFirst(details);
235             visibleChars.insert(0, newVisibleChars);
236         }
237     }
238 
239     /***
240      * Indicates whether some other object is "equal to" this one, fulfulling
241      * the {@link Object#equals(java.lang.Object)} contract. The
242      * {@linkplain Prediction#getEvalStatus() evaluation status} is ignored when
243      * checking equality, thus if all other fields of two extractions are equal,
244      * this method will return <code>true</code> even if their evaluation states
245      * differ. Only the {@link #getVisibleChars() visible characters} of the
246      * extractions are compared, whitespace and control characters are ignored.
247      *
248      * @param obj the reference object with which to compare
249      * @return <code>true</code> iff the specified object is an
250      * {@link Extraction} equal to this instance
251      */
252     public boolean equals(final Object obj) {
253         if (obj == this) {
254             return true;
255         } else if ((obj != null) && (getClass().equals(obj.getClass()))) {
256             // used getClass instead of instanceof because otherwise subclasses
257             // with additional fields would break the contract
258             final Extraction other = (Extraction) obj;
259             return new EqualsBuilder()
260                 .appendSuper(super.equals(obj))
261                 .append(getVisibleChars(), other.getVisibleChars())
262                 .append(getFirstTokenRep(), other.getFirstTokenRep())
263                 .isEquals();
264         } else {
265             return false;
266         }
267     }
268 
269     /***
270      * Returns the repetition of the first token of the extraction in the
271      * original text (counting starts with 0, as the first occurrence is the
272      * "0th repetition"), -1 if unknown or if {@link #isFirstTokenRepIgnored()}
273      * is <code>true</code>. This is useful to locate this extraction in the
274      * original text.
275      *
276      * @return the value of the attribute
277      */
278     public int getFirstTokenRep() {
279         if (firstTokenRepIgnored) {
280             return -1;
281         } else {
282             return detailsList.getFirst().getRep();
283         }
284     }
285 
286     /***
287      * Returns the index of the first token in the text (indexing starts with
288      * 0); or -1 if unknown/irrelevant.
289      *
290      * @return the value of the attribute
291      */
292     public int getIndex() {
293         return detailsList.getFirst().getIndex();
294     }
295 
296     /***
297      * Returns the extracted text fragment.
298      * @return the extracted text
299      */
300     public String getText() {
301         final StringBuilder text = new StringBuilder();
302         final Iterator<TokenDetails> detailsIter = detailsList.iterator();
303         TokenDetails details;
304 
305         while (detailsIter.hasNext()) {
306             details = detailsIter.next();
307             if (details.isWhitespaceBefore() && (text.length() > 0)) {
308                 text.append(' ');
309             }
310             text.append(details.getToken());
311         }
312         return text.toString();
313     }
314 
315     /***
316      * Returns the visible characters of the text fragment (everything except
317      * whitespace and control characters).
318      *
319      * @return the visible characters
320      */
321     public String getVisibleChars() {
322         return visibleChars.toString();
323     }
324 
325     /***
326      * Returns a hash code value for this object, fulfulling the
327      * {@link Object#hashCode()} contract.
328      * @return a hash code value for this object
329      */
330     public int hashCode() {
331         // you pick two hard-coded, randomly chosen, non-zero, odd numbers
332         // (preferably primes); ideally different for each class
333         return new HashCodeBuilder(7, 11)
334             .appendSuper(super.hashCode())
335             .append(getVisibleChars())
336             .append(getFirstTokenRep())
337             .toHashCode();
338     }
339 
340     /***
341      * Whether the {@linkplain TokenDetails#getRep() repetition of the first
342      * token} should be ignored, comparing only the text but not the position of
343      * extractions. Defaults to <code>false</code>.
344      *
345      * @return the value of the attribute
346      */
347     public boolean isFirstTokenRepIgnored() {
348         return firstTokenRepIgnored;
349     }
350 
351     /***
352      * Whether this extraction has been sealed. The text of a sealed extraction
353      * cannot be changed. This means that
354      * {@linkplain #addToken(TokenDetails, Probability, boolean) adding tokens}
355      * is not allowed after sealing.
356      *
357      * @return <code>true</code> iff this extraction is sealed
358      */
359     public boolean isSealed() {
360         return sealed;
361     }
362 
363     /***
364      * Deletes one of the tokens from this prediction. At least one
365      * token must always remain, i.e. {@link #tokenCount()} must be 2 or
366      * more prior to calling this method.
367      *
368      * @param atEnd whether to delete the first or the last token
369      * @return details describing the removed token
370      * @throws IllegalStateException if there is only one token left or
371      * if this extraction {@link #isSealed() is sealed}
372      */
373     public TokenDetails removeToken(final boolean atEnd)
374     throws IllegalStateException {
375         // check state
376         if (isSealed()) {
377             throw new IllegalStateException(
378                 "Cannot change text of sealed extraction");
379         }
380         // check invariant: there must at many probabilities as there are tokens
381         if (tokenCount() != probCount()) {
382             throw new IllegalStateException(
383                     "Invariant violation: number of tokens " + tokenCount() +
384                     " != number of probabilities " + probCount());
385         }
386 
387         // remove probabilities from superclass
388         super.removeProb(atEnd);
389 
390         // remove token details
391         final TokenDetails removed;
392         if (atEnd) {
393             removed = detailsList.removeLast();
394         } else {
395             removed = detailsList.removeFirst();
396         }
397 
398         // prepare to adapt visibleText
399         final String charsToDelete =
400             TextUtils.replaceAll(removed.getToken(), nonVisibleMatcher, "");
401         final int startIndex, endIndex;
402 
403         if (atEnd) {
404             // delete last n visible chars
405             endIndex = visibleChars.length();
406             startIndex = endIndex - charsToDelete.length();
407         } else {
408             // delete first n visible chars
409             endIndex = charsToDelete.length();
410             startIndex = 0;
411         }
412 
413         // delete chars after ensuring that everything is as it should
414         if (visibleChars.substring(startIndex, endIndex).equals(
415                 charsToDelete)) {
416             visibleChars.delete(startIndex, endIndex);
417         } else {
418             throw new RuntimeException("Invariant violation: "
419                     + (atEnd ? "last" : "first")
420                     + " visible chararacters are "
421                     + visibleChars.substring(startIndex, endIndex)
422                     + " instead of " + charsToDelete);
423         }
424 
425         return removed;
426     }
427 
428 
429     /***
430      * Specified whether the {@linkplain TokenDetails#getRep() repetition of
431      * the first token} should be ignored, comparing only the text but not the
432      * position of extractions.
433      *
434      * @param firstTokenRepIgnored the new value of the attribute
435      */
436     public void setFirstTokenRepIgnored(boolean firstTokenRepIgnored) {
437         this.firstTokenRepIgnored = firstTokenRepIgnored;
438     }
439 
440     /***
441      * Seals or unseals this extraction. The text of a sealed extraction cannot
442      * be changed. This means that
443      * {@linkplain #addToken(TokenDetails, Probability, boolean) adding tokens}
444      * is not allowed after sealing.
445      *
446      * @param newSealed the new value of the attribute
447      */
448     public void setSealed(final boolean newSealed) {
449         sealed = newSealed;
450     }
451 
452     /***
453      * Stores all relevant fields of this object in a field map for
454      * serialization. An equivalent object can be created by calling
455      * {@link de.fu_berlin.ties.io.FieldMap#createObject(Class)} on the created
456      * field map.
457      *
458      * @return the created field map
459      */
460     public FieldMap storeFields() {
461         // delegate to super class and add own fields
462         final FieldMap result = super.storeFields();
463         result.put(KEY_TEXT, getText());
464         final TokenDetails firstDetails = detailsList.getFirst();
465 
466         // ignore if ignored or unknown (-1)
467         if (!firstTokenRepIgnored && (firstDetails.getRep() >= 0)) {
468             result.put(KEY_FIRST_TOKEN_REP, new Integer(firstDetails.getRep()));
469         }
470         if (firstDetails.getIndex() >= 0) {
471             result.put(KEY_INDEX, new Integer(firstDetails.getIndex()));
472         }
473 
474         return result;
475     }
476 
477     /***
478      * Returns the number of tokens in this extraction. This will only be
479      * reliable if a constructor is used to give the first token and operations
480      * such as {@link Extraction#addToken(TokenDetails, Probability, boolean)
481      * addToken} are used for each further token. Omitted when serializing
482      * so it cannot be restored.
483      *
484      * @return the value of the attribute
485      */
486     public int tokenCount() {
487         return detailsList.size();
488     }
489 
490 }