1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22 package de.fu_berlin.ties.extract;
23
24 import java.util.Iterator;
25
26 import org.apache.commons.collections.Bag;
27 import org.apache.commons.collections.bag.HashBag;
28 import org.apache.commons.lang.builder.ToStringBuilder;
29
30 import de.fu_berlin.ties.text.TextTokenizer;
31 import de.fu_berlin.ties.util.Util;
32
33 /***
34 * Locates extractions in a document.
35 *
36 * @author Christian Siefkes
37 * @version $Revision: 1.11 $, $Date: 2006/10/21 16:04:13 $, $Author: siefkes $
38 */
39 public class ExtractionLocator {
40
41 /***
42 * An iterator over the extractions to train.
43 */
44 private Iterator extractionIter;
45
46 /***
47 * The extraction that is currently processed or will be processed next;
48 * or <code>null</code> if there are no more extractions to process.
49 */
50 private Extraction currentExtraction;
51
52 /***
53 * The first token from the current extaction, utilized together with
54 * {@link Extraction#getFirstTokenRep()} to locate the start of the
55 * extraction.
56 */
57 private String firstToken;
58
59 /***
60 * <code>true</code> if we are processing the {@link #currentExtraction},
61 * <code>false</code> otherwise (we are waiting for it to start or there
62 * are no more extractions).
63 */
64 private boolean inExtraction;
65
66 /***
67 * A multiset of the tokens remaing (not yet trained) in the current
68 * extraction.
69 */
70 private Bag remainingTokens = new HashBag();
71
72 /***
73 * See {@link #isRetrySilently()}.
74 */
75 private final boolean retrySilently;
76
77 /***
78 * The tokenizer used to split extractions into tokens.
79 */
80 private final TextTokenizer tokenizer;
81
82 /***
83 * Creates a new instance, setting {@link #isRetrySilently()} to
84 * <code>false</code>.
85 *
86 * @param extractions the extractions in this document
87 * @param textTokenizer the tokenizer used to split extractions into tokens
88 */
89 public ExtractionLocator(final ExtractionContainer extractions,
90 final TextTokenizer textTokenizer) {
91 this(extractions, textTokenizer, false);
92 }
93
94 /***
95 * Creates a new instance.
96 *
97 * @param extractions the extractions in this document
98 * @param textTokenizer the tokenizer used to split extractions into tokens
99 * @param doRetrySilently sets the state of {@link #isRetrySilently()}
100 */
101 public ExtractionLocator(final ExtractionContainer extractions,
102 final TextTokenizer textTokenizer,
103 final boolean doRetrySilently) {
104 super();
105 tokenizer = textTokenizer;
106 retrySilently = doRetrySilently;
107
108
109 extractionIter = extractions.iterator();
110 switchToNextExtraction();
111 }
112
113 /***
114 * Whether we reached the end of the current extraction.
115 * @return <code>true</code> iff the current extraction has ended
116 */
117 public boolean endOfExtraction() {
118 return inExtraction && remainingTokens.isEmpty();
119 }
120
121 /***
122 * Returns the current extraction.
123 * @return the current extraction
124 */
125 public Extraction getCurrentExtraction() {
126 return currentExtraction;
127 }
128
129 /***
130 * Whether we are currently within an extraction.
131 *
132 * @return <code>true</code> iff are processing the
133 * {@link #getCurrentExtraction()}, <code>false</code> otherwise (we are
134 * waiting for it to start or there are no more extractions)
135 */
136 public boolean inExtraction() {
137 return inExtraction;
138 }
139
140 /***
141 * Whether the locator accepts extractions that are not explicitly located
142 * in the document. If <code>true</code>, the locator accepts extractions
143 * that are not explicitly located in the document (negative
144 * ({@link Extraction#getFirstTokenRep() FirstTokenRep}). If such an
145 * extraction is encountered, the locator will try to matching at all
146 * possible positions. When {@link #updateExtraction(String, int)} fails
147 * (returns <code>false</code>) in such a case (indicating that only the
148 * first token(s) of the extraction could be matched, but not the full
149 * extraction), the locator will silently to locate the extraction against
150 * the next possible position.
151 *
152 * @return the value of the attribute, <code>false</code> by default
153 */
154 public boolean isRetrySilently() {
155 return retrySilently;
156 }
157
158 /***
159 * This method must be called at the end of the current document. It will
160 * log an error when there are still unprocessed or incompletely processed
161 * extractions.
162 */
163 public void reachedEndOfDocument() {
164 if (!remainingTokens.isEmpty()) {
165 if (inExtraction) {
166 Util.LOG.error("Document ended while processing extraction: "
167 + currentExtraction + ", unprocessed tokens: "
168 + remainingTokens);
169 } else {
170 Util.LOG.error("Document ended while waiting for start of "
171 + "extraction: " + currentExtraction + ", first token: "
172 + firstToken);
173 }
174
175 remainingTokens.clear();
176 }
177 if (extractionIter.hasNext()) {
178 Util.LOG.error("Unprocessed extractions at end of document: "
179 + extractionIter.next());
180 }
181 }
182
183 /***
184 * Whether the current token starts a new extraction. <em>This method must
185 * be called once for each token in a document, otherwise we might miss
186 * extractions.</em>
187 *
188 * @param token the token to check
189 * @param tokenRep the repetition of the <code>token</code> in the document
190 * (counting starts with 0, as the first occurrence is the "0th
191 * repetition").
192 * @return <code>true</code> iff the given token starts a new extraction
193 */
194 public boolean startOfExtraction(final String token, final int tokenRep) {
195 if ((!inExtraction) && (currentExtraction != null)
196 && (token.equals(firstToken))
197 && (currentExtraction.getFirstTokenRep() <= tokenRep)) {
198
199 inExtraction = true;
200 return true;
201 } else {
202 return false;
203 }
204 }
205
206 /***
207 * Helper method that switches to a new extraction.
208 *
209 * @param newExt the new extraction to switch to, or <code>null</code> if
210 * there are no more extractions
211 */
212 private void switchToNew(final Extraction newExt) {
213 currentExtraction = newExt;
214 inExtraction = false;
215 firstToken = null;
216 remainingTokens.clear();
217
218 if (currentExtraction != null) {
219 tokenizer.reset(currentExtraction.getText());
220 String token;
221
222
223 while ((token = tokenizer.nextToken()) != null) {
224 if (firstToken == null) {
225 firstToken = token;
226 }
227 remainingTokens.add(token);
228 }
229 }
230 }
231
232 /***
233 * Switches to the next extraction, updating the current
234 * extraction and related fields. The prior current extraction must have
235 * been fully processed when this method is called, i.e.
236 * {@link #endOfExtraction()} must be <code>true</code>.
237 *
238 * @throws IllegalStateException if {@link #endOfExtraction()} is not
239 * <code>true</code> (there are still remaining tokens to process
240 */
241 public void switchToNextExtraction() throws IllegalStateException {
242
243 if (!remainingTokens.isEmpty()) {
244 throw new IllegalStateException("Cannot update current extraction "
245 + "while there are remaining tokens: " + remainingTokens);
246 }
247
248 final Extraction newExt;
249
250 if (extractionIter.hasNext()) {
251 newExt = (Extraction) extractionIter.next();
252
253 if ((!retrySilently) && (newExt.getFirstTokenRep() < 0)) {
254
255 Util.LOG.error("Extraction is not explictly located: "
256 + newExt);
257 }
258 } else {
259 newExt = null;
260 }
261
262
263 switchToNew(newExt);
264 }
265
266 /***
267 * Returns a string representation of this object.
268 * @return a textual representation
269 */
270 public String toString() {
271 return new ToStringBuilder(this)
272 .appendSuper(super.toString())
273 .append("tokenizer", tokenizer)
274 .append("in extraction", inExtraction)
275 .append("current extraction", currentExtraction)
276 .append("first token", firstToken)
277 .append("remaining tokens", remainingTokens)
278 .toString();
279 }
280
281 /***
282 * Updates the currently processed extraction. This method must be called
283 * once for each token in each extraction.
284 *
285 * @param token the token to process
286 * @param tokenRep the repetition of the <code>token</code> in the document
287 * (counting starts with 0, as the first occurrence is the "0th
288 * repetition").
289 * @return <code>true</code> iff the extraction was successfully updated;
290 * <code>false</code> if the token was erroneous (not expected to occur
291 * within the current extraction)
292 */
293 public boolean updateExtraction(final String token, final int tokenRep) {
294 boolean removedToken = remainingTokens.remove(token, 1);
295
296 if (!removedToken) {
297 if (retrySilently) {
298 Util.LOG.debug("Will retry locating extraction "
299 + currentExtraction
300 + " since it didn't match the current token " + token);
301
302
303 switchToNew(currentExtraction);
304
305
306 if (startOfExtraction(token, tokenRep)) {
307
308 removedToken = updateExtraction(token, tokenRep);
309 }
310 } else {
311
312
313 Util.LOG.error("Extractions don't match document: "
314 + "still missing tokens " + remainingTokens
315 + " from extraction " + currentExtraction
316 + " but current token '" + token
317 + "' (token rep=" + tokenRep + ") doesn't match");
318
319
320 remainingTokens.clear();
321 }
322 }
323
324 return removedToken;
325 }
326
327 }