View Javadoc

1   /*
2    * Copyright (C) 2006 Christian Siefkes <christian@siefkes.net>.
3    * Development of this software is supported by the German Research Society,
4    * Berlin-Brandenburg Graduate School in Distributed Information Systems
5    * (DFG grant no. GRK 316).
6    *
7    * This program is free software; you can redistribute it and/or modify
8    * it under the terms of the GNU General Public License as published by
9    * the Free Software Foundation; either version 2 of the License, or
10   * (at your option) any later version.
11   *
12   * This program is distributed in the hope that it will be useful,
13   * but WITHOUT ANY WARRANTY; without even the implied warranty of
14   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15   * GNU General Public License for more details.
16   *
17   * You should have received a copy of the GNU General Public License
18   * along with this program; if not, visit
19   * http://www.gnu.org/licenses/gpl.html or write to the Free Software
20   * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
21   */
22  package de.fu_berlin.ties.extract;
23  
24  import java.io.IOException;
25  import java.util.HashMap;
26  import java.util.Iterator;
27  import java.util.Map;
28  
29  import org.dom4j.Document;
30  import org.dom4j.Element;
31  
32  import de.fu_berlin.ties.ContextMap;
33  import de.fu_berlin.ties.ProcessingException;
34  import de.fu_berlin.ties.TiesConfiguration;
35  import de.fu_berlin.ties.text.TokenDetails;
36  import de.fu_berlin.ties.text.TokenizerFactory;
37  import de.fu_berlin.ties.util.Util;
38  import de.fu_berlin.ties.xml.dom.TokenProcessor;
39  import de.fu_berlin.ties.xml.dom.TokenWalker;
40  
41  /***
42   * Matches all extractions from an extraction container against a
43   * preprocessed document, ensuring that they can be located and ordering them.
44   * See {@link #matchAndOrderExtractions(ExtractionContainer, Document)} for
45   * details.
46   *
47   * <p>Instances of this class are <em>not</em> thread-safe and cannot be used to
48   * match and order extractions from different documents in parallel.
49   * 
50   * @author Christian Siefkes
51   * @version $Revision: 1.5 $, $Date: 2006/10/21 16:04:13 $, $Author: siefkes $
52   */
53  public class ExtractionMatcher implements TokenProcessor {
54  
55      /***
56       * All duplicate extractions will be marked with this property.
57       * @see Extraction#hasProperty(Object)
58       */
59      public static final Object PROP_DUPLICATE = new Object();
60  
61      /***
62       * Used to instantiate tokenizers.
63       */
64      private final TokenizerFactory factory;
65  
66      /***
67       * Used to walk thru documents.
68       */
69      private final TokenWalker walker;
70  
71      /***
72       * Used to locate extractions.
73       */
74      private ExtractionLocator[] locators;
75  
76      /***
77       * For each extractor, we store the extraction that should be located.
78       */
79      private Extraction[] extToLocate;
80  
81      /***
82       * How many extractions that have been found by each locator (including
83       * matches that have been discarded due to overlaps with better matches).
84       */
85      private int[] extractionsFound;
86  
87      /***
88       * The ordered and corrected extractions will be added to this container.
89       */
90      private ExtractionContainer resultContainer;
91  
92      /***
93       * The target structure used by the current extraction container.
94       */
95      private TargetStructure targetStruct;
96  
97  
98      /***
99       * Creates a new instance.
100      *
101      * @param conf the configuration to use
102      */
103     public ExtractionMatcher(final TiesConfiguration conf) {
104         super();
105         factory = new TokenizerFactory(conf);
106         walker = new TokenWalker(this, factory);
107     }
108 
109     /***
110      * Handles a token with an extraction locator. If the locator finds the
111      * begin/end of an extraction, the Index/LastIndex of the extraction is
112      * adapted as required. If an extraction ends at this token, it is returned.
113      *
114      * @param locator the locator to use
115      * @param details details about the token to process
116      * @return the extraction that ended at this token, if any
117      */
118     private Extraction handleToken(final ExtractionLocator locator,
119             final TokenDetails details) {
120         final Extraction extraction;
121         final String token = details.getToken();
122         final int rep = details.getRep();
123         final int index = details.getIndex();
124 
125         final boolean startOfExtraction =
126             locator.startOfExtraction(token, rep);
127         final boolean endOfExtraction;
128 
129         if (startOfExtraction) {
130             extraction = locator.getCurrentExtraction();
131 
132             // correct index if necessary
133             if (extraction.getIndex() != index) {
134                 extraction.setIndex(index);
135             }
136         } else if (locator.inExtraction()) {
137             extraction = locator.getCurrentExtraction();
138         } else {
139             extraction = null;
140         }
141 
142         if (extraction != null) {
143             // update extraction (remove current token from remaining tokens)
144             locator.updateExtraction(token, rep);
145             // check whether this is the last token of the extraction
146             endOfExtraction = locator.endOfExtraction();
147         } else {
148             // outside any extraction
149             endOfExtraction = false;
150         }
151 
152         if (endOfExtraction) {
153             // correct last index of extraction, if necessary
154             if (extraction.getLastIndex() != index) {
155                 extraction.setLastIndex(index);
156             }
157 
158             // reached the end of the current extraction -- switch to next one
159             locator.switchToNextExtraction();
160 
161             // return the extraction that ended here
162             return extraction;
163         } else {
164             return null;
165         }
166     }
167 
168     /***
169      * Helper method that initializes a locator that will just locate a single
170      * extraction.
171      *
172      * @param ext the extraction to locate
173      * @return a locator for this extraction
174      */
175     private ExtractionLocator initLocator(final Extraction ext) {
176         final ExtractionContainer container =
177             new ExtractionContainer(targetStruct);
178         container.add(ext);
179         final boolean retrySilently = (ext.getFirstTokenRep() < 0)
180             ? true : false;
181         return new ExtractionLocator(container, factory.createTokenizer(""),
182                 retrySilently);
183     }
184 
185     /***
186      * Matches all extractions from an extraction container against a
187      * preprocessing document, ensuring that they can be located. Returns
188      * a new extraction container containing all extraction in document order
189      * (the order in which they occur in the document). Also ensures that
190      * {@link Extraction#getFirstTokenRep()} and {@link Extraction#getIndex()}
191      * and {@link Extraction#getLastIndex()} are set to the correct values.
192      *
193      * <p>If the {@link Extraction#getFirstTokenRep() FirstTokenRep} of an
194      * extraction is negative, this implies that <em>all</em> instances of the
195      * extracted text are to be found in the document, not just a single
196      * instance at a specific position. In this case, the returned container
197      * will contain a copy of the extraction for each match, with the
198      * FirstTokenRep + Index + LastIndex values of each copy set to their
199      * correct values.  If each such extraction not at least found
200      * <em>once</em>, an error is reported.
201      *
202      * @param orgContainer the original container of extractions to process
203      * @param doc the preprocessed document to match the extractions agains
204      * @return a new extraction container as described above
205      * @throws IOException if an I/O error occurs during processing
206      * @throws ProcessingException if an error occurs during processing
207      */
208     public ExtractionContainer matchAndOrderExtractions(
209             final ExtractionContainer orgContainer, final Document doc)
210     throws IOException, ProcessingException {
211         targetStruct = orgContainer.getTargetStructure();
212         resultContainer = new ExtractionContainer(targetStruct);
213 
214         // wrap each extraction into a separate container to locate them in
215         // parallel (might not be in original order)
216         final int numExtractions = orgContainer.size();
217         locators = new ExtractionLocator[numExtractions];
218         extToLocate = new Extraction[numExtractions];
219         extractionsFound = new int[numExtractions];
220         Iterator<Extraction> extIter = orgContainer.iterator();
221         Extraction ext;
222 
223         for (int i = 0; i < numExtractions; i++) {
224             ext = extIter.next();
225             locators[i]  = initLocator(ext);
226             extToLocate[i] = ext;
227             extractionsFound[i] = 0;
228         }
229 
230         // the walker will call back (processToken method) where appropriate
231         walker.walk(doc, null);
232 
233         // ensure that at each extraction was found at least once
234         for (int i = 0; i < numExtractions; i++) {
235             if (extractionsFound[i] < 1) {
236                 Util.LOG.error("Failed to locate extraction: "
237                         + extToLocate[i].toString());
238             }
239             // no need to all locators[i].reachedEndOfDocument() since we
240             // ensure ourselves that each extraction has been found
241         }
242 
243         // mark duplicates (same type + text, but different positions) as such
244         final Iterator<Extraction> resultIter = resultContainer.iterator();
245         final Map<String, Extraction> extMap =
246             new HashMap<String, Extraction>(resultContainer.size());
247         Extraction newExt, prevExt;
248         String key;
249 
250         while (resultIter.hasNext()) {
251             newExt =  resultIter.next();
252             // use type + text as key to identify duplicates
253             key = newExt.getType() + ": " + newExt.getText();
254             prevExt = extMap.get(key);
255 
256             if (prevExt != null) {
257                 newExt.setProperty(PROP_DUPLICATE);
258                 Util.LOG.debug("Marked " + newExt + " as duplicate of "
259                         + prevExt);
260             } else {
261                 extMap.put(key, newExt);
262             }
263         }
264 
265         return resultContainer;
266     }
267 
268     /***
269      * {@inheritDoc}
270      */
271     public void processToken(final Element element, final String left,
272             final TokenDetails details, final String right,
273             final ContextMap context)
274             throws IOException, ProcessingException {
275         Extraction ext;
276 
277         // invoke all locators
278         for (int i = 0; i < locators.length; i++) {
279             ext = handleToken(locators[i], details);
280 
281             if (ext != null) { // extraction ended here
282                 if (ext.getFirstTokenRep() < 0) {
283                     // clone extraction to find another occurrence (if any)
284                     final Extraction clonedExt = ext.clone();
285                     locators[i] = initLocator(clonedExt);
286                 }
287 
288                 // correct firstTokenRep + update find count + store extraction
289                 ext.setFirstTokenRep(details.getRep());
290                 extractionsFound[i]++;
291                 boolean addNew = true;
292                 boolean mayOverlap = true;
293                 Extraction last;
294                 int lastLength, extLength;
295 
296                 // Check if it overlaps with last extraction
297                 while (mayOverlap && ((last = resultContainer.last()) != null)
298                         && (last.getLastIndex() >= ext.getIndex())) {
299                     lastLength = last.getLastIndex() - last.getIndex() + 1;
300                     extLength = ext.getLastIndex() - ext.getIndex() + 1;
301 
302                     // keep better (longer) match, or first one in case of a tie
303                     if (extLength > lastLength) {
304                         if (resultContainer.removeLast() != last) {
305                             // should always remove and return the last ext.
306                             throw new RuntimeException("Implementation error: "
307                                     + "container didn't remove last extraction "
308                                     + last + " as expected");
309                         }
310 
311                         Util.LOG.debug("Kept longer overlapping match: " + ext
312                                 + " instead of " + last);
313                     } else {
314                         // discard new extraction -- no more risk of overlaps
315                         addNew = false;
316                         mayOverlap = false;
317 
318                         if (extLength == lastLength) {
319                             Util.LOG.debug("Kept earlier overlapping match: "
320                                     + last + " instead of " + ext);
321                         } else {
322                             Util.LOG.debug("Kept longer overlapping match: "
323                                     + last + " instead of " + ext);
324                         }
325                     }
326                 }
327 
328                 if (addNew) {
329                     // no overlap or other was deleted
330                     resultContainer.add(ext);
331                 }
332             }
333         }
334     }
335 
336 }