package de.fu_berlin.ties.extract;

import java.io.IOException;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;

import org.dom4j.Document;
import org.dom4j.Element;

import de.fu_berlin.ties.ContextMap;
import de.fu_berlin.ties.ProcessingException;
import de.fu_berlin.ties.TiesConfiguration;
import de.fu_berlin.ties.text.TokenDetails;
import de.fu_berlin.ties.text.TokenizerFactory;
import de.fu_berlin.ties.util.Util;
import de.fu_berlin.ties.xml.dom.TokenProcessor;
import de.fu_berlin.ties.xml.dom.TokenWalker;
/***
 * Matches all extractions from an extraction container against a
 * preprocessed document, ensuring that they can be located, and orders them
 * by their position in the document. See
 * {@link #matchAndOrderExtractions(ExtractionContainer, Document)} for
 * details.
 *
 * <p>Instances of this class are <em>not</em> thread-safe and cannot be used
 * to match and order extractions from different documents in parallel.
 *
 * @author Christian Siefkes
 * @version $Revision: 1.5 $, $Date: 2006/10/21 16:04:13 $, $Author: siefkes $
 */
public class ExtractionMatcher implements TokenProcessor {

    /***
     * All duplicate extractions will be marked with this property.
     * @see Extraction#hasProperty(Object)
     */
    public static final Object PROP_DUPLICATE = new Object();

    /***
     * Used to instantiate tokenizers.
     */
    private final TokenizerFactory factory;

    /***
     * Used to walk through documents.
     */
    private final TokenWalker walker;

    /***
     * Used to locate extractions.
     */
    private ExtractionLocator[] locators;

    /***
     * For each locator, the extraction that should be located.
     */
    private Extraction[] extToLocate;

    /***
     * The number of extractions found by each locator (including matches
     * that have been discarded due to overlaps with better matches).
     */
    private int[] extractionsFound;

    /***
     * The ordered and corrected extractions will be added to this container.
     */
    private ExtractionContainer resultContainer;

    /***
     * The target structure used by the current extraction container.
     */
    private TargetStructure targetStruct;

    /***
     * Creates a new instance.
     *
     * @param conf the configuration to use
     */
    public ExtractionMatcher(final TiesConfiguration conf) {
        super();
        factory = new TokenizerFactory(conf);
        walker = new TokenWalker(this, factory);
    }

    /***
     * Handles a token with an extraction locator. If the locator finds the
     * beginning or end of an extraction, the Index/LastIndex of the
     * extraction are adjusted as required. If an extraction ends at this
     * token, it is returned.
     *
     * @param locator the locator to use
     * @param details details about the token to process
     * @return the extraction that ended at this token, if any
     */
    private Extraction handleToken(final ExtractionLocator locator,
            final TokenDetails details) {
        final Extraction extraction;
        final String token = details.getToken();
        final int rep = details.getRep();
        final int index = details.getIndex();

        final boolean startOfExtraction =
            locator.startOfExtraction(token, rep);
        final boolean endOfExtraction;

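        // decide which extraction, if any, this token belongs to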
        if (startOfExtraction) {
            extraction = locator.getCurrentExtraction();

            // correct the start index if it does not match the actual position
            if (extraction.getIndex() != index) {
                extraction.setIndex(index);
            }
        } else if (locator.inExtraction()) {
            extraction = locator.getCurrentExtraction();
        } else {
            extraction = null;
        }

        if (extraction != null) {
            // consume the token and check whether the extraction ends here
            locator.updateExtraction(token, rep);
            endOfExtraction = locator.endOfExtraction();
        } else {
            endOfExtraction = false;
        }

        if (endOfExtraction) {
            // correct the end index if it does not match the actual position
            if (extraction.getLastIndex() != index) {
                extraction.setLastIndex(index);
            }

            // the locator is done with this extraction; advance to the next one
            locator.switchToNextExtraction();

            return extraction;
        } else {
            return null;
        }
    }

    /***
     * Helper method that initializes a locator that will just locate a single
     * extraction.
     *
     * @param ext the extraction to locate
     * @return a locator for this extraction
     */
    private ExtractionLocator initLocator(final Extraction ext) {
        final ExtractionContainer container =
            new ExtractionContainer(targetStruct);
        container.add(ext);
        final boolean retrySilently = ext.getFirstTokenRep() < 0;
        return new ExtractionLocator(container, factory.createTokenizer(""),
            retrySilently);
    }

    /***
     * Matches all extractions from an extraction container against a
     * preprocessed document, ensuring that they can be located. Returns
     * a new extraction container containing all extractions in document order
     * (the order in which they occur in the document). Also ensures that
     * {@link Extraction#getFirstTokenRep()}, {@link Extraction#getIndex()},
     * and {@link Extraction#getLastIndex()} are set to the correct values.
     *
     * <p>If the {@link Extraction#getFirstTokenRep() FirstTokenRep} of an
     * extraction is negative, this implies that <em>all</em> instances of the
     * extracted text are to be found in the document, not just a single
     * instance at a specific position. In this case, the returned container
     * will contain a copy of the extraction for each match, with the
     * FirstTokenRep + Index + LastIndex values of each copy set to their
     * correct values. If such an extraction is not found at least
     * <em>once</em>, an error is reported.
     *
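     * <p>A typical invocation might look like the following sketch (assuming
     * a {@link TiesConfiguration} and a preprocessed dom4j document are
     * already available; the variable names are only illustrative):
     *
     * <pre>
     * ExtractionMatcher matcher = new ExtractionMatcher(conf);
     * ExtractionContainer ordered =
     *     matcher.matchAndOrderExtractions(extractions, doc);
     * </pre>
     *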
     * @param orgContainer the original container of extractions to process
     * @param doc the preprocessed document to match the extractions against
     * @return a new extraction container as described above
     * @throws IOException if an I/O error occurs during processing
     * @throws ProcessingException if an error occurs during processing
     */
    public ExtractionContainer matchAndOrderExtractions(
            final ExtractionContainer orgContainer, final Document doc)
            throws IOException, ProcessingException {
        targetStruct = orgContainer.getTargetStructure();
        resultContainer = new ExtractionContainer(targetStruct);

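        // set up one locator, target extraction, and match counter
        // per extraction in the original container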
        final int numExtractions = orgContainer.size();
        locators = new ExtractionLocator[numExtractions];
        extToLocate = new Extraction[numExtractions];
        extractionsFound = new int[numExtractions];
        Iterator<Extraction> extIter = orgContainer.iterator();
        Extraction ext;

        for (int i = 0; i < numExtractions; i++) {
            ext = extIter.next();
            locators[i] = initLocator(ext);
            extToLocate[i] = ext;
            extractionsFound[i] = 0;
        }

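        // walk the document; processToken is called back for each token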
        walker.walk(doc, null);

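        // report any extraction that could not be located at all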
        for (int i = 0; i < numExtractions; i++) {
            if (extractionsFound[i] < 1) {
                Util.LOG.error("Failed to locate extraction: "
                    + extToLocate[i].toString());
            }
        }

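        // mark extractions sharing type and text as duplicates of the first one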
        final Iterator<Extraction> resultIter = resultContainer.iterator();
        final Map<String, Extraction> extMap =
            new HashMap<String, Extraction>(resultContainer.size());
        Extraction newExt, prevExt;
        String key;

        while (resultIter.hasNext()) {
            newExt = resultIter.next();

            key = newExt.getType() + ": " + newExt.getText();
            prevExt = extMap.get(key);

            if (prevExt != null) {
                newExt.setProperty(PROP_DUPLICATE);
                Util.LOG.debug("Marked " + newExt + " as duplicate of "
                    + prevExt);
            } else {
                extMap.put(key, newExt);
            }
        }

        return resultContainer;
    }

    /***
     * {@inheritDoc}
     */
    public void processToken(final Element element, final String left,
            final TokenDetails details, final String right,
            final ContextMap context)
            throws IOException, ProcessingException {
        Extraction ext;

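        // offer the token to each locator in turn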
        for (int i = 0; i < locators.length; i++) {
            ext = handleToken(locators[i], details);

            if (ext != null) {
                if (ext.getFirstTokenRep() < 0) {
                    // all occurrences are wanted: keep searching for further
                    // matches with a fresh copy of the extraction
                    final Extraction clonedExt = ext.clone();
                    locators[i] = initLocator(clonedExt);
                }

                // record the repetition value of this match and count it
                ext.setFirstTokenRep(details.getRep());
                extractionsFound[i]++;
                boolean addNew = true;
                boolean mayOverlap = true;
                Extraction last;
                int lastLength, extLength;

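                // resolve overlaps with previously accepted extractions,
                // keeping the longer match (or the earlier one if equally long)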
                while (mayOverlap && ((last = resultContainer.last()) != null)
                        && (last.getLastIndex() >= ext.getIndex())) {
                    lastLength = last.getLastIndex() - last.getIndex() + 1;
                    extLength = ext.getLastIndex() - ext.getIndex() + 1;

                    if (extLength > lastLength) {
                        if (resultContainer.removeLast() != last) {
                            throw new RuntimeException("Implementation error: "
                                + "container didn't remove last extraction "
                                + last + " as expected");
                        }

                        Util.LOG.debug("Kept longer overlapping match: " + ext
                            + " instead of " + last);
                    } else {
                        addNew = false;
                        mayOverlap = false;

                        if (extLength == lastLength) {
                            Util.LOG.debug("Kept earlier overlapping match: "
                                + last + " instead of " + ext);
                        } else {
                            Util.LOG.debug("Kept longer overlapping match: "
                                + last + " instead of " + ext);
                        }
                    }
                }

                if (addNew) {
                    resultContainer.add(ext);
                }
            }
        }
    }

}