View Javadoc

1   /*
2    * Copyright (C) 2004 Christian Siefkes <christian@siefkes.net>.
3    * Development of this software is supported by the German Research Society,
4    * Berlin-Brandenburg Graduate School in Distributed Information Systems
5    * (DFG grant no. GRK 316).
6    *
7    * This library is free software; you can redistribute it and/or
8    * modify it under the terms of the GNU Lesser General Public
9    * License as published by the Free Software Foundation; either
10   * version 2.1 of the License, or (at your option) any later version.
11   *
12   * This library is distributed in the hope that it will be useful,
13   * but WITHOUT ANY WARRANTY; without even the implied warranty of
14   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15   * Lesser General Public License for more details.
16   *
17   * You should have received a copy of the GNU Lesser General Public
18   * License along with this library; if not, visit
19   * http://www.gnu.org/licenses/lgpl.html or write to the Free Software
20   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
21   */
22  package de.fu_berlin.ties.extract;
23  
24  import java.util.Iterator;
25  
26  import org.apache.commons.collections.Bag;
27  import org.apache.commons.collections.bag.HashBag;
28  import org.apache.commons.lang.builder.ToStringBuilder;
29  import org.dom4j.Document;
30  
31  import de.fu_berlin.ties.text.TextTokenizer;
32  import de.fu_berlin.ties.util.Util;
33  
34  /***
35   * Locates extractions in a document.
36   *
37   * @author Christian Siefkes
38   * @version $Revision: 1.4 $, $Date: 2004/08/30 17:24:43 $, $Author: siefkes $
39   */
40  public class ExtractionLocator {
41  
42      /***
43       * An iterator over the extractions to train.
44       */
45      private Iterator extractionIter;
46  
47      /***
48       * The extraction that is currently processed or will be processed next;
49       * or <code>null</code> if there are no more extractions to process.
50       */
51      private Extraction currentExtraction;
52  
53      /***
54       * The first token from the current extaction, utilized together with
55       * {@link Extraction#getFirstTokenRep()} to locate the start of the
56       * extraction.
57       */
58      private String firstToken;
59  
60      /***
61       * <code>true</code> if we are processing the {@link #currentExtraction},
62       * <code>false</code> otherwise (we are waiting for it to start or there
63       * are no more extractions).
64       */
65      private boolean inExtraction;
66  
67      /***
68       * A multiset of the tokens remaing (not yet trained) in the current
69       * extraction.
70       */
71      private Bag remainingTokens = new HashBag();
72  
73      /***
74       * The tokenizer used to split extractions into tokens.
75       */
76      private final TextTokenizer tokenizer;
77  
78  
79      /***
80       * Creates a new instance.
81       *
82       * @param document the document to use
83       * @param extractions the extractions in this document
84       * @param textTokenizer the tokenizer used to split extractions into tokens
85       */
86      public ExtractionLocator(final Document document,
87                               final ExtractionContainer extractions,
88                               final TextTokenizer textTokenizer) {
89          super();
90          tokenizer = textTokenizer;
91  
92          // initialize state
93          extractionIter = extractions.iterator();
94          switchToNextExtraction();
95      }
96  
97      /***
98       * Whether we reached the end of the current extraction.
99       * @return <code>true</code> iff the current extraction has ended
100      */
101     public boolean endOfExtraction() {
102         return inExtraction && remainingTokens.isEmpty();
103     }
104 
105     /***
106      * Returns the current extraction.
107      * @return the current extraction
108      */
109     public Extraction getCurrentExtraction() {
110         return currentExtraction;
111     }
112 
113     /***
114      * Whether we are currently within an extraction.
115      *
116      * @return <code>true</code> iff are processing the
117      * {@link #getCurrentExtraction()}, <code>false</code> otherwise (we are
118      * waiting for it to start or there are no more extractions)
119      */
120     public boolean inExtraction() {
121         return inExtraction;
122     }
123 
124     /***
125      * This method must be called at the end of the current document. It will
126      * log an error when there are still unprocessed or incompletely processed
127      * extractions.
128      */
129     public void reachedEndOfDocument() {
130         if (!remainingTokens.isEmpty()) {
131             if (inExtraction) {
132                 Util.LOG.error("Document ended while processing extraction: "
133                     + currentExtraction + ", unprocessed tokens: "
134                     + remainingTokens);
135             } else {
136                 Util.LOG.error("Document ended while waiting for start of "
137                     + "extraction: " + currentExtraction);
138             }
139 
140             remainingTokens.clear();
141         }
142         if (extractionIter.hasNext()) {
143             Util.LOG.error("Unprocessed extractions at end of document: "
144                 + extractionIter.next());
145         }
146     }
147 
148     /***
149      * Whether the current token starts a new extraction. <em>This method must
150      * be called once for each token in a document, otherwise we might miss
151      * extractions.</em>
152      *
153      * @param token the token to check
154      * @param tokenRep the repetition of the <code>token</code> in the document
155      * (counting starts with 0, as the first occurrence is the "0th
156      * repetition").
157      * @return <code>true</code> iff the given token starts a new extraction
158      */
159     public boolean startOfExtraction(final String token, final int tokenRep) {
160         if ((!inExtraction) && (currentExtraction != null)
161                 && (token.equals(firstToken))
162                 && (currentExtraction.getFirstTokenRep() <= tokenRep)) {
163             // found first element of extraction
164             inExtraction = true;
165             return true;
166         } else {
167             return false;
168         }
169     }
170 
171     /***
172      * Switches to the next extraction, updating the current
173      * extraction and related fields. The prior current extraction must have
174      * been fully processed when this method is called, i.e.
175      * {@link #endOfExtraction()} must be <code>true</code>.
176      *
177      * @throws IllegalStateException if {@link #endOfExtraction()} is not
178      * <code>true</code> (there are still remaining tokens to process
179      */
180     public void switchToNextExtraction() throws IllegalStateException {
181         // check state
182         if (!remainingTokens.isEmpty()) {
183             throw new IllegalStateException("Cannot update current extraction "
184                     + "while there are remaining tokens: " + remainingTokens);
185         }
186 
187         inExtraction = false;
188         firstToken = null;
189 
190         if (extractionIter.hasNext()) {
191             currentExtraction = (Extraction) extractionIter.next();
192             tokenizer.reset(currentExtraction.getText());
193             String token;
194 
195             // update first token + remaining tokens
196             while ((token = tokenizer.nextToken()) != null) {
197                 if (firstToken == null) {
198                     firstToken = token;
199                 }
200                 remainingTokens.add(token);
201             }
202         } else {
203             // no more extractions
204             currentExtraction = null;
205         }
206     }
207 
208     /***
209      * Returns a string representation of this object.
210      * @return a textual representation
211      */
212     public String toString() {
213         return new ToStringBuilder(this)
214             .appendSuper(super.toString())
215             .append("tokenizer", tokenizer)
216             .append("in extraction", inExtraction)
217             .append("current extraction", currentExtraction)
218             .append("first token", firstToken)
219             .append("remaining tokens", remainingTokens)
220             .toString();
221     }
222 
223     /***
224      * Updates the currently processed extraction. This method must be called
225      * once for each token in each extraction.
226      *
227      * @param token the token to process
228      * @param tokenRep the repetition of the <code>token</code> in the document
229      * (counting starts with 0, as the first occurrence is the "0th
230      * repetition").
231      * @return <code>true</code> iff the extraction was successfully updated;
232      * <code>false</code> if the token was erroneous (not expected to occur
233      * within the current extraction)
234      */
235     public boolean updateExtraction(final String token, final int tokenRep) {
236         boolean removedToken = remainingTokens.remove(token, 1);
237 
238         if (!removedToken) {
239             // token didn't match thru there are still remaing tokens:
240             // this means document and extraction container are out of sync
241             Util.LOG.error("Extractions don't match document: "
242                     + "still missing tokens " + remainingTokens
243                     + " from extraction " + currentExtraction
244                     + " but current token '" + token
245                     + "' (token rep=" + tokenRep + ") doesn't match");
246 
247             // stop handling of current extraction, treat as other/outside
248             remainingTokens.clear();
249         }
250 
251         return removedToken;
252     }
253 
254 }