1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22 package de.fu_berlin.ties.filter;
23
24 import java.io.IOException;
25 import java.util.HashSet;
26 import java.util.Set;
27
28 import org.apache.commons.lang.builder.ToStringBuilder;
29 import org.dom4j.Document;
30 import org.dom4j.Element;
31
32 import de.fu_berlin.ties.ContextMap;
33 import de.fu_berlin.ties.ProcessingException;
34 import de.fu_berlin.ties.extract.ExtractionContainer;
35 import de.fu_berlin.ties.extract.ExtractionLocator;
36 import de.fu_berlin.ties.text.TokenDetails;
37 import de.fu_berlin.ties.text.TokenizerFactory;
38 import de.fu_berlin.ties.xml.dom.TokenProcessor;
39 import de.fu_berlin.ties.xml.dom.TokenWalker;
40
41 /***
42 * Determines which elements in a document contain extractions (or parts of
43 * extractions). This class is used for sentence filtering.
44 *
45 * @author Christian Siefkes
46 * @version $Revision: 1.7 $, $Date: 2004/11/19 14:05:00 $, $Author: siefkes $
47 */
48 public class EmbeddingElements implements TokenProcessor {
49
50 /***
51 * The set of elements containing extractions.
52 */
53 private final Set<Element> elements;
54
55 /***
56 * Used to locate extractions.
57 */
58 private final ExtractionLocator locator;
59
60 /***
61 * Creates a new instance.
62 *
63 * @param document the document to use
64 * @param extractions the extractions in this document
65 * @param tFactory used to instantiate tokenizers
66 */
67 public EmbeddingElements(final Document document,
68 final ExtractionContainer extractions,
69 final TokenizerFactory tFactory) {
70 super();
71 elements = new HashSet<Element>();
72 locator = new ExtractionLocator(document, extractions,
73 tFactory.createTokenizer(""));
74
75
76 final TokenWalker walker = new TokenWalker(this, tFactory);
77
78 try {
79
80 walker.walk(document, null);
81 } catch (IOException ioe) {
82
83 throw new RuntimeException(ioe);
84 } catch (ProcessingException pe) {
85
86 throw new RuntimeException(pe);
87 }
88
89
90 locator.reachedEndOfDocument();
91 }
92
93 /***
94 * Checks whether the given element contains an extraction (or parts of an
95 * extraction).
96 *
97 * @param element the element to check
98 * @return <code>true</code> if the element contains an extraction (fully
99 * or partially)
100 */
101 public boolean containsExtraction(final Element element) {
102 return elements.contains(element);
103 }
104
105 /***
106 * {@inheritDoc}
107 */
108 public void processToken(final Element element, final String left,
109 final TokenDetails details, final String right,
110 final ContextMap context) {
111
112 locator.startOfExtraction(details.getToken(), details.getRep());
113
114 if (locator.inExtraction()) {
115
116 final boolean updatedExtraction =
117 locator.updateExtraction(details.getToken(), details.getRep());
118
119 if (updatedExtraction) {
120
121
122 Element currentElement = element;
123
124 while ((currentElement != null)
125 && !elements.contains(currentElement)) {
126 elements.add(currentElement);
127 currentElement = currentElement.getParent();
128 }
129 }
130 }
131
132 if (locator.endOfExtraction()) {
133
134 locator.switchToNextExtraction();
135 }
136 }
137
138 /***
139 * Returns a string representation of this object.
140 *
141 * @return a textual representation
142 */
143 public String toString() {
144 return new ToStringBuilder(this)
145 .append("elements", elements)
146 .toString();
147 }
148
149 }