1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22 package de.fu_berlin.ties.filter;
23
24 import java.io.IOException;
25 import java.util.HashSet;
26 import java.util.Set;
27
28 import org.apache.commons.lang.builder.ToStringBuilder;
29 import org.dom4j.Document;
30 import org.dom4j.Element;
31
32 import de.fu_berlin.ties.ContextMap;
33 import de.fu_berlin.ties.ProcessingException;
34 import de.fu_berlin.ties.text.TokenDetails;
35 import de.fu_berlin.ties.text.TokenizerFactory;
36 import de.fu_berlin.ties.util.Util;
37 import de.fu_berlin.ties.xml.dom.DOMUtils;
38 import de.fu_berlin.ties.xml.dom.TokenProcessor;
39 import de.fu_berlin.ties.xml.dom.TokenWalker;
40
41 /***
42 * A token walker that only invokes a provided
43 * {@link de.fu_berlin.ties.xml.dom.TokenProcessor} on the subset of tokens
44 * that are children of an element accepted by a provided
45 * {@link de.fu_berlin.ties.filter.ElementFilter}.
46 *
47 * <p>Instances of this class are <em>not</em> thread-safe.
48 *
49 * @author Christian Siefkes
50 * @version $Revision: 1.17 $, $Date: 2006/10/21 16:04:19 $, $Author: siefkes $
51 */
52 public class FilteringTokenWalker extends TokenWalker {
53
54 /***
55 * The element filter used by this instance.
56 */
57 private final ElementFilter filter;
58
59 /***
60 * A handler that is called whenever some tokens are skipped; may be
61 * <code>null</code>.
62 */
63 private final SkipHandler skipHandler;
64
65 /***
66 * Stores the last decision whether or not to process a token.
67 */
68 private boolean lastDecision = true;
69
70 /***
71 * The elements that so far have been accepted by the filter in the
72 * current document.
73 */
74 private Set<Element> acceptedElements;
75
76 /***
77 * The elements that so far have been rejected by the filter in the
78 * current document.
79 */
80 private Set<Element> rejectedElements;
81
82 /***
83 * Creates a new instance.
84 *
85 * @param processor used to process the tokens
86 * @param tFactory used to instantiate tokenizers
87 * @param elementFilter the element filter to use
88 * @param sHandler a handler that is called whenever some tokens are
89 * skipped; may be <code>null</code>
90 */
91 public FilteringTokenWalker(final TokenProcessor processor,
92 final TokenizerFactory tFactory,
93 final ElementFilter elementFilter,
94 final SkipHandler sHandler) {
95 super(processor, tFactory);
96 filter = elementFilter;
97 skipHandler = sHandler;
98 }
99
100 /***
101 * Returns the set of elements that have been accepted by the filter in the
102 * current document.
103 *
104 * @return the accepted elements
105 */
106 public Set getAcceptedElements() {
107 return acceptedElements;
108 }
109
110 /***
111 * Returns the set of elements that have been rejected by the filter in the
112 * current document.
113 *
114 * @return the rejected elements
115 */
116 public Set getRejectedElements() {
117 return rejectedElements;
118 }
119
120 /***
121 * Returns the element filter used by this instance.
122 * @return the used element filter
123 */
124 protected ElementFilter getFilter() {
125 return filter;
126 }
127
128 /***
129 * This method can be overwritten by subclasses to modify decisions of
130 * the element filter. The standard behavior is to accept the decision
131 * as is.
132 *
133 * @param element the element to test
134 * @param filteredElement the element that was actually filtered
135 * (<code>element</code> or a parent), or <code>null</code> if the decision
136 * had been cached (no filtering took place)
137 * @param decision the decision of the element filer
138 * @return the revised decision
139 * @throws ProcessingException if an error occurs while revising the
140 * decision
141 */
142 protected boolean handleAccept(final Element element,
143 final Element filteredElement, final boolean decision)
144 throws ProcessingException {
145 return decision;
146 }
147
148 /***
149 * {@inheritDoc}
150 */
151 protected void processToken(final Element element, final String left,
152 final TokenDetails details, final String right,
153 final ContextMap context) throws IOException, ProcessingException {
154
155 boolean accepted = false;
156 boolean foundMatch = false;
157 Element currentParent = element;
158 Element nearestPreferred = null;
159 Element nearestAcceptable = null;
160
161
162 while ((currentParent != null) && !foundMatch) {
163 if (acceptedElements.contains(currentParent)) {
164
165 accepted = true;
166 foundMatch = true;
167 } else if (rejectedElements.contains(currentParent)) {
168
169 accepted = false;
170 foundMatch = true;
171 } else {
172 if ((nearestPreferred == null)
173 && (filter.prefers(currentParent))) {
174
175
176 nearestPreferred = currentParent;
177 }
178 if ((nearestAcceptable == null)
179 && (!filter.avoids(currentParent))) {
180
181
182
183 nearestAcceptable = currentParent;
184 }
185
186
187 currentParent = currentParent.getParent();
188 }
189 }
190
191
192
193
194
195
196
197 final Element filteredElement;
198
199 if (!foundMatch) {
200
201
202 if (nearestPreferred != null) {
203
204 filteredElement = nearestPreferred;
205
206
207
208 } else if (nearestAcceptable != null) {
209
210 filteredElement = nearestAcceptable;
211 Util.LOG.debug("Filter nearest accepted (no preferred found): "
212 + DOMUtils.showElement(filteredElement));
213 } else {
214
215 filteredElement = element;
216 Util.LOG.debug("Filter element itself (no preferred or accepted"
217 + " found): " + DOMUtils.showElement(filteredElement));
218 }
219
220
221 accepted = filter.matches(filteredElement);
222
223
224 if (accepted) {
225 acceptedElements.add(filteredElement);
226
227 if (element != filteredElement) {
228 acceptedElements.add(element);
229 }
230 } else {
231 rejectedElements.add(filteredElement);
232
233 if (element != filteredElement) {
234 rejectedElements.add(element);
235 }
236 }
237 } else {
238
239 filteredElement = null;
240 }
241
242
243 final boolean finalDecision =
244 handleAccept(element, filteredElement, accepted);
245
246
247 if (finalDecision) {
248 super.processToken(element, left, details, right, context);
249 } else {
250 if (lastDecision && (skipHandler != null)) {
251
252
253 skipHandler.skip();
254 }
255 }
256
257
258 lastDecision = finalDecision;
259 }
260
261 /***
262 * {@inheritDoc}
263 */
264 public void walk(final Document document, final ContextMap context)
265 throws IOException, ProcessingException {
266
267 acceptedElements = new HashSet<Element>();
268 rejectedElements = new HashSet<Element>();
269 filter.init(document, null);
270
271
272 super.walk(document, context);
273 }
274
275 /***
276 * Returns a string representation of this object.
277 *
278 * @return a textual representation
279 */
280 public String toString() {
281 final ToStringBuilder builder = new ToStringBuilder(this)
282 .appendSuper(super.toString())
283 .append("filter", filter);
284
285 if (skipHandler != null) {
286 builder.append("skip handler", skipHandler);
287 }
288 return builder.toString();
289 }
290
291 }