View Javadoc

1   /*
2    * Copyright (C) 2004-2006 Christian Siefkes <christian@siefkes.net>.
3    * Development of this software is supported by the German Research Society,
4    * Berlin-Brandenburg Graduate School in Distributed Information Systems
5    * (DFG grant no. GRK 316).
6    *
7    * This program is free software; you can redistribute it and/or modify
8    * it under the terms of the GNU General Public License as published by
9    * the Free Software Foundation; either version 2 of the License, or
10   * (at your option) any later version.
11   *
12   * This program is distributed in the hope that it will be useful,
13   * but WITHOUT ANY WARRANTY; without even the implied warranty of
14   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15   * GNU General Public License for more details.
16   *
17   * You should have received a copy of the GNU General Public License
18   * along with this program; if not, visit
19   * http://www.gnu.org/licenses/gpl.html or write to the Free Software
20   * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
21   */
22  package de.fu_berlin.ties.extract;
23  
24  import java.util.Iterator;
25  
26  import org.apache.commons.collections.Bag;
27  import org.apache.commons.collections.bag.HashBag;
28  import org.apache.commons.lang.builder.ToStringBuilder;
29  
30  import de.fu_berlin.ties.text.TextTokenizer;
31  import de.fu_berlin.ties.util.Util;
32  
33  /***
34   * Locates extractions in a document.
35   *
36   * @author Christian Siefkes
37   * @version $Revision: 1.11 $, $Date: 2006/10/21 16:04:13 $, $Author: siefkes $
38   */
39  public class ExtractionLocator {
40  
41      /***
42       * An iterator over the extractions to train.
43       */
44      private Iterator extractionIter;
45  
46      /***
47       * The extraction that is currently processed or will be processed next;
48       * or <code>null</code> if there are no more extractions to process.
49       */
50      private Extraction currentExtraction;
51  
52      /***
53       * The first token from the current extaction, utilized together with
54       * {@link Extraction#getFirstTokenRep()} to locate the start of the
55       * extraction.
56       */
57      private String firstToken;
58  
59      /***
60       * <code>true</code> if we are processing the {@link #currentExtraction},
61       * <code>false</code> otherwise (we are waiting for it to start or there
62       * are no more extractions).
63       */
64      private boolean inExtraction;
65  
66      /***
67       * A multiset of the tokens remaing (not yet trained) in the current
68       * extraction.
69       */
70      private Bag remainingTokens = new HashBag();
71  
72      /***
73       * See {@link #isRetrySilently()}.
74       */
75      private final boolean retrySilently;
76  
77      /***
78       * The tokenizer used to split extractions into tokens.
79       */
80      private final TextTokenizer tokenizer;
81  
82      /***
83       * Creates a new instance, setting {@link #isRetrySilently()} to
84       * <code>false</code>.
85       *
86       * @param extractions the extractions in this document
87       * @param textTokenizer the tokenizer used to split extractions into tokens
88       */
89      public ExtractionLocator(final ExtractionContainer extractions,
90                               final TextTokenizer textTokenizer) {
91          this(extractions, textTokenizer, false);
92      }
93  
94      /***
95       * Creates a new instance.
96       *
97       * @param extractions the extractions in this document
98       * @param textTokenizer the tokenizer used to split extractions into tokens
99       * @param doRetrySilently sets the state of {@link #isRetrySilently()}
100      */
101     public ExtractionLocator(final ExtractionContainer extractions,
102                              final TextTokenizer textTokenizer,
103                              final boolean doRetrySilently) {
104         super();
105         tokenizer = textTokenizer;
106         retrySilently = doRetrySilently;
107 
108         // initialize state
109         extractionIter = extractions.iterator();
110         switchToNextExtraction();
111     }
112 
113     /***
114      * Whether we reached the end of the current extraction.
115      * @return <code>true</code> iff the current extraction has ended
116      */
117     public boolean endOfExtraction() {
118         return inExtraction && remainingTokens.isEmpty();
119     }
120 
121     /***
122      * Returns the current extraction.
123      * @return the current extraction
124      */
125     public Extraction getCurrentExtraction() {
126         return currentExtraction;
127     }
128 
129     /***
130      * Whether we are currently within an extraction.
131      *
132      * @return <code>true</code> iff are processing the
133      * {@link #getCurrentExtraction()}, <code>false</code> otherwise (we are
134      * waiting for it to start or there are no more extractions)
135      */
136     public boolean inExtraction() {
137         return inExtraction;
138     }
139 
140     /***
141      * Whether the locator accepts extractions that are not explicitly located
142      * in the document. If <code>true</code>, the locator accepts extractions
143      * that are not explicitly located in the document (negative
144      * ({@link Extraction#getFirstTokenRep() FirstTokenRep}). If such an
145      * extraction is encountered, the locator will try to matching at all
146      * possible positions. When {@link #updateExtraction(String, int)} fails
147      * (returns <code>false</code>) in such a case (indicating that only the
148      * first token(s) of the extraction could be matched, but not the full
149      * extraction), the locator will silently to locate the extraction against
150      * the next possible position.
151      *
152      * @return the value of the attribute, <code>false</code> by default
153      */
154     public boolean isRetrySilently() {
155         return retrySilently;
156     }
157 
158     /***
159      * This method must be called at the end of the current document. It will
160      * log an error when there are still unprocessed or incompletely processed
161      * extractions.
162      */
163     public void reachedEndOfDocument() {
164         if (!remainingTokens.isEmpty()) {
165             if (inExtraction) {
166                 Util.LOG.error("Document ended while processing extraction: "
167                     + currentExtraction + ", unprocessed tokens: "
168                     + remainingTokens);
169             } else {
170                 Util.LOG.error("Document ended while waiting for start of "
171                     + "extraction: " + currentExtraction + ", first token: "
172                     + firstToken);
173             }
174 
175             remainingTokens.clear();
176         }
177         if (extractionIter.hasNext()) {
178             Util.LOG.error("Unprocessed extractions at end of document: "
179                 + extractionIter.next());
180         }
181     }
182 
183     /***
184      * Whether the current token starts a new extraction. <em>This method must
185      * be called once for each token in a document, otherwise we might miss
186      * extractions.</em>
187      *
188      * @param token the token to check
189      * @param tokenRep the repetition of the <code>token</code> in the document
190      * (counting starts with 0, as the first occurrence is the "0th
191      * repetition").
192      * @return <code>true</code> iff the given token starts a new extraction
193      */
194     public boolean startOfExtraction(final String token, final int tokenRep) {
195         if ((!inExtraction) && (currentExtraction != null)
196                 && (token.equals(firstToken))
197                 && (currentExtraction.getFirstTokenRep() <= tokenRep)) {
198             // found first element of extraction
199             inExtraction = true;
200             return true;
201         } else {
202             return false;
203         }
204     }
205 
206     /***
207      * Helper method that switches to a new extraction.
208      *
209      * @param newExt the new extraction to switch to, or <code>null</code> if
210      * there are no more extractions
211      */
212     private void switchToNew(final Extraction newExt) {
213         currentExtraction = newExt;
214         inExtraction = false;
215         firstToken = null;
216         remainingTokens.clear();
217 
218         if (currentExtraction != null) {
219             tokenizer.reset(currentExtraction.getText());
220             String token;
221 
222             // update first token + remaining tokens
223             while ((token = tokenizer.nextToken()) != null) {
224                 if (firstToken == null) {
225                     firstToken = token;
226                 }
227                 remainingTokens.add(token);
228             }
229         }
230     }
231 
232     /***
233      * Switches to the next extraction, updating the current
234      * extraction and related fields. The prior current extraction must have
235      * been fully processed when this method is called, i.e.
236      * {@link #endOfExtraction()} must be <code>true</code>.
237      *
238      * @throws IllegalStateException if {@link #endOfExtraction()} is not
239      * <code>true</code> (there are still remaining tokens to process
240      */
241     public void switchToNextExtraction() throws IllegalStateException {
242         // check state
243         if (!remainingTokens.isEmpty()) {
244             throw new IllegalStateException("Cannot update current extraction "
245                     + "while there are remaining tokens: " + remainingTokens);
246         }
247 
248         final Extraction newExt;
249 
250         if (extractionIter.hasNext()) {
251             newExt = (Extraction) extractionIter.next();
252 
253             if ((!retrySilently) && (newExt.getFirstTokenRep() < 0)) {
254                 // not allowed
255                 Util.LOG.error("Extraction is not explictly located: "
256                         + newExt);
257             }
258         } else {
259             newExt = null;
260         }
261 
262         // actual switch
263         switchToNew(newExt);
264     }
265 
266     /***
267      * Returns a string representation of this object.
268      * @return a textual representation
269      */
270     public String toString() {
271         return new ToStringBuilder(this)
272             .appendSuper(super.toString())
273             .append("tokenizer", tokenizer)
274             .append("in extraction", inExtraction)
275             .append("current extraction", currentExtraction)
276             .append("first token", firstToken)
277             .append("remaining tokens", remainingTokens)
278             .toString();
279     }
280 
281     /***
282      * Updates the currently processed extraction. This method must be called
283      * once for each token in each extraction.
284      *
285      * @param token the token to process
286      * @param tokenRep the repetition of the <code>token</code> in the document
287      * (counting starts with 0, as the first occurrence is the "0th
288      * repetition").
289      * @return <code>true</code> iff the extraction was successfully updated;
290      * <code>false</code> if the token was erroneous (not expected to occur
291      * within the current extraction)
292      */
293     public boolean updateExtraction(final String token, final int tokenRep) {
294         boolean removedToken = remainingTokens.remove(token, 1);
295 
296         if (!removedToken) {
297             if (retrySilently) {
298                 Util.LOG.debug("Will retry locating extraction "
299                         + currentExtraction
300                         + " since it didn't match the current token " + token);
301 
302                 // re-init extraction
303                 switchToNew(currentExtraction);
304 
305                 // try if this token itself could be the begin of the extraction
306                 if (startOfExtraction(token, tokenRep)) {
307                     // call myself recursively
308                     removedToken = updateExtraction(token, tokenRep);
309                 }
310             } else {
311                 // token didn't match though there are still remaing tokens:
312                 // this means document and extraction container are out of sync
313                 Util.LOG.error("Extractions don't match document: "
314                         + "still missing tokens " + remainingTokens
315                         + " from extraction " + currentExtraction
316                         + " but current token '" + token
317                         + "' (token rep=" + tokenRep + ") doesn't match");
318 
319                 // stop handling of current extraction, treat as other/outside
320                 remainingTokens.clear();
321             }
322         }
323 
324         return removedToken;
325     }
326 
327 }