View Javadoc

1   /*
2    * Copyright (C) 2004-2006 Christian Siefkes <christian@siefkes.net>.
3    * Development of this software is supported by the German Research Society,
4    * Berlin-Brandenburg Graduate School in Distributed Information Systems
5    * (DFG grant no. GRK 316).
6    *
7    * This program is free software; you can redistribute it and/or modify
8    * it under the terms of the GNU General Public License as published by
9    * the Free Software Foundation; either version 2 of the License, or
10   * (at your option) any later version.
11   *
12   * This program is distributed in the hope that it will be useful,
13   * but WITHOUT ANY WARRANTY; without even the implied warranty of
14   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15   * GNU General Public License for more details.
16   *
17   * You should have received a copy of the GNU General Public License
18   * along with this program; if not, visit
19   * http://www.gnu.org/licenses/gpl.html or write to the Free Software
20   * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
21   */
22  package de.fu_berlin.ties.filter;
23  
24  import java.io.IOException;
25  import java.util.HashSet;
26  import java.util.Set;
27  
28  import org.apache.commons.lang.builder.ToStringBuilder;
29  import org.dom4j.Document;
30  import org.dom4j.Element;
31  
32  import de.fu_berlin.ties.ContextMap;
33  import de.fu_berlin.ties.ProcessingException;
34  import de.fu_berlin.ties.extract.ExtractionContainer;
35  import de.fu_berlin.ties.extract.ExtractionLocator;
36  import de.fu_berlin.ties.text.TokenDetails;
37  import de.fu_berlin.ties.text.TokenizerFactory;
38  import de.fu_berlin.ties.xml.dom.TokenProcessor;
39  import de.fu_berlin.ties.xml.dom.TokenWalker;
40  
41  /***
42   * Determines which elements in a document contain extractions (or parts of
43   * extractions). This class is used for sentence filtering.
44   *
45   * @author Christian Siefkes
46   * @version $Revision: 1.11 $, $Date: 2006/10/21 16:04:19 $, $Author: siefkes $
47   */
48  public class EmbeddingElements implements TokenProcessor {
49  
50      /***
51       * The set of elements containing extractions.
52       */
53      private final Set<Element> elements;
54  
55      /***
56       * Used to locate extractions.
57       */
58      private final ExtractionLocator locator;
59  
60      /***
61       * Creates a new instance.
62       *
63       * @param document the document to use
64       * @param extractions the extractions in this document
65       * @param tFactory used to instantiate tokenizers
66       */
67      public EmbeddingElements(final Document document,
68                               final ExtractionContainer extractions,
69                               final TokenizerFactory tFactory) {
70          super();
71          elements = new HashSet<Element>();
72          locator = new ExtractionLocator(extractions,
73                  tFactory.createTokenizer(""));
74  
75          // walk document to populate elements set
76          final TokenWalker walker = new TokenWalker(this, tFactory);
77  
78          try {
79              // the walker will call back (processToken method) where appropriate
80              walker.walk(document, null);
81          } catch (IOException ioe) {
82              // should never occur since our processToken method doesn't throw
83              throw new RuntimeException(ioe);
84          } catch (ProcessingException pe) {
85              // should never occur since our processToken method doesn't throw
86              throw new RuntimeException(pe);
87          }
88  
89          // ensure that all extractions have been fully processed
90          locator.reachedEndOfDocument();
91      }
92  
93      /***
94       * Checks whether the given element contains an extraction (or parts of an
95       * extraction).
96       *
97       * @param element the element to check
98       * @return <code>true</code> if the element contains an extraction (fully
99       * or partially)
100      */
101     public boolean containsExtraction(final Element element) {
102         return elements.contains(element);
103     }
104 
105     /***
106      * {@inheritDoc}
107      */
108     public void processToken(final Element element, final String left,
109             final TokenDetails details, final String right,
110             final ContextMap context) {
111         // look for start of extraction
112         locator.startOfExtraction(details.getToken(), details.getRep());
113 
114         if (locator.inExtraction()) {
115             // update extraction (remove current token from remaining tokens)
116             final boolean updatedExtraction =
117                 locator.updateExtraction(details.getToken(), details.getRep());
118 
119             if (updatedExtraction) {
120                 // add this element and all its parents to embedding elements
121                 // (stop if an element is already known)
122                 Element currentElement = element;
123 
124                 while ((currentElement != null)
125                         && !elements.contains(currentElement)) {
126                     elements.add(currentElement);
127                     currentElement = currentElement.getParent();
128                 }
129             }
130         }
131 
132         if (locator.endOfExtraction()) {
133             // reached the end of the current extraction -- switch to next one
134             locator.switchToNextExtraction();
135         }
136     }
137 
138     /***
139      * Returns a string representation of this object.
140      *
141      * @return a textual representation
142      */
143     public String toString() {
144         return new ToStringBuilder(this)
145             .append("elements", elements)
146             .toString();
147     }
148 
149 }