1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22 package de.fu_berlin.ties.extract;
23
24 import java.util.Iterator;
25
26 import org.apache.commons.collections.Bag;
27 import org.apache.commons.collections.bag.HashBag;
28 import org.apache.commons.lang.builder.ToStringBuilder;
29 import org.dom4j.Document;
30
31 import de.fu_berlin.ties.text.TextTokenizer;
32 import de.fu_berlin.ties.util.Util;
33
34 /***
35 * Locates extractions in a document.
36 *
37 * @author Christian Siefkes
38 * @version $Revision: 1.4 $, $Date: 2004/08/30 17:24:43 $, $Author: siefkes $
39 */
40 public class ExtractionLocator {
41
42 /***
43 * An iterator over the extractions to train.
44 */
45 private Iterator extractionIter;
46
47 /***
48 * The extraction that is currently processed or will be processed next;
49 * or <code>null</code> if there are no more extractions to process.
50 */
51 private Extraction currentExtraction;
52
53 /***
54 * The first token from the current extaction, utilized together with
55 * {@link Extraction#getFirstTokenRep()} to locate the start of the
56 * extraction.
57 */
58 private String firstToken;
59
60 /***
61 * <code>true</code> if we are processing the {@link #currentExtraction},
62 * <code>false</code> otherwise (we are waiting for it to start or there
63 * are no more extractions).
64 */
65 private boolean inExtraction;
66
67 /***
68 * A multiset of the tokens remaing (not yet trained) in the current
69 * extraction.
70 */
71 private Bag remainingTokens = new HashBag();
72
73 /***
74 * The tokenizer used to split extractions into tokens.
75 */
76 private final TextTokenizer tokenizer;
77
78
79 /***
80 * Creates a new instance.
81 *
82 * @param document the document to use
83 * @param extractions the extractions in this document
84 * @param textTokenizer the tokenizer used to split extractions into tokens
85 */
86 public ExtractionLocator(final Document document,
87 final ExtractionContainer extractions,
88 final TextTokenizer textTokenizer) {
89 super();
90 tokenizer = textTokenizer;
91
92
93 extractionIter = extractions.iterator();
94 switchToNextExtraction();
95 }
96
97 /***
98 * Whether we reached the end of the current extraction.
99 * @return <code>true</code> iff the current extraction has ended
100 */
101 public boolean endOfExtraction() {
102 return inExtraction && remainingTokens.isEmpty();
103 }
104
105 /***
106 * Returns the current extraction.
107 * @return the current extraction
108 */
109 public Extraction getCurrentExtraction() {
110 return currentExtraction;
111 }
112
113 /***
114 * Whether we are currently within an extraction.
115 *
116 * @return <code>true</code> iff are processing the
117 * {@link #getCurrentExtraction()}, <code>false</code> otherwise (we are
118 * waiting for it to start or there are no more extractions)
119 */
120 public boolean inExtraction() {
121 return inExtraction;
122 }
123
124 /***
125 * This method must be called at the end of the current document. It will
126 * log an error when there are still unprocessed or incompletely processed
127 * extractions.
128 */
129 public void reachedEndOfDocument() {
130 if (!remainingTokens.isEmpty()) {
131 if (inExtraction) {
132 Util.LOG.error("Document ended while processing extraction: "
133 + currentExtraction + ", unprocessed tokens: "
134 + remainingTokens);
135 } else {
136 Util.LOG.error("Document ended while waiting for start of "
137 + "extraction: " + currentExtraction);
138 }
139
140 remainingTokens.clear();
141 }
142 if (extractionIter.hasNext()) {
143 Util.LOG.error("Unprocessed extractions at end of document: "
144 + extractionIter.next());
145 }
146 }
147
148 /***
149 * Whether the current token starts a new extraction. <em>This method must
150 * be called once for each token in a document, otherwise we might miss
151 * extractions.</em>
152 *
153 * @param token the token to check
154 * @param tokenRep the repetition of the <code>token</code> in the document
155 * (counting starts with 0, as the first occurrence is the "0th
156 * repetition").
157 * @return <code>true</code> iff the given token starts a new extraction
158 */
159 public boolean startOfExtraction(final String token, final int tokenRep) {
160 if ((!inExtraction) && (currentExtraction != null)
161 && (token.equals(firstToken))
162 && (currentExtraction.getFirstTokenRep() <= tokenRep)) {
163
164 inExtraction = true;
165 return true;
166 } else {
167 return false;
168 }
169 }
170
171 /***
172 * Switches to the next extraction, updating the current
173 * extraction and related fields. The prior current extraction must have
174 * been fully processed when this method is called, i.e.
175 * {@link #endOfExtraction()} must be <code>true</code>.
176 *
177 * @throws IllegalStateException if {@link #endOfExtraction()} is not
178 * <code>true</code> (there are still remaining tokens to process
179 */
180 public void switchToNextExtraction() throws IllegalStateException {
181
182 if (!remainingTokens.isEmpty()) {
183 throw new IllegalStateException("Cannot update current extraction "
184 + "while there are remaining tokens: " + remainingTokens);
185 }
186
187 inExtraction = false;
188 firstToken = null;
189
190 if (extractionIter.hasNext()) {
191 currentExtraction = (Extraction) extractionIter.next();
192 tokenizer.reset(currentExtraction.getText());
193 String token;
194
195
196 while ((token = tokenizer.nextToken()) != null) {
197 if (firstToken == null) {
198 firstToken = token;
199 }
200 remainingTokens.add(token);
201 }
202 } else {
203
204 currentExtraction = null;
205 }
206 }
207
208 /***
209 * Returns a string representation of this object.
210 * @return a textual representation
211 */
212 public String toString() {
213 return new ToStringBuilder(this)
214 .appendSuper(super.toString())
215 .append("tokenizer", tokenizer)
216 .append("in extraction", inExtraction)
217 .append("current extraction", currentExtraction)
218 .append("first token", firstToken)
219 .append("remaining tokens", remainingTokens)
220 .toString();
221 }
222
223 /***
224 * Updates the currently processed extraction. This method must be called
225 * once for each token in each extraction.
226 *
227 * @param token the token to process
228 * @param tokenRep the repetition of the <code>token</code> in the document
229 * (counting starts with 0, as the first occurrence is the "0th
230 * repetition").
231 * @return <code>true</code> iff the extraction was successfully updated;
232 * <code>false</code> if the token was erroneous (not expected to occur
233 * within the current extraction)
234 */
235 public boolean updateExtraction(final String token, final int tokenRep) {
236 boolean removedToken = remainingTokens.remove(token, 1);
237
238 if (!removedToken) {
239
240
241 Util.LOG.error("Extractions don't match document: "
242 + "still missing tokens " + remainingTokens
243 + " from extraction " + currentExtraction
244 + " but current token '" + token
245 + "' (token rep=" + tokenRep + ") doesn't match");
246
247
248 remainingTokens.clear();
249 }
250
251 return removedToken;
252 }
253
254 }