View Javadoc

1   /*
2    * Copyright (C) 2004 Christian Siefkes <christian@siefkes.net>.
3    * Development of this software is supported by the German Research Society,
4    * Berlin-Brandenburg Graduate School in Distributed Information Systems
5    * (DFG grant no. GRK 316).
6    *
7    * This library is free software; you can redistribute it and/or
8    * modify it under the terms of the GNU Lesser General Public
9    * License as published by the Free Software Foundation; either
10   * version 2.1 of the License, or (at your option) any later version.
11   *
12   * This library is distributed in the hope that it will be useful,
13   * but WITHOUT ANY WARRANTY; without even the implied warranty of
14   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15   * Lesser General Public License for more details.
16   *
17   * You should have received a copy of the GNU Lesser General Public
18   * License along with this library; if not, visit
19   * http://www.gnu.org/licenses/lgpl.html or write to the Free Software
20   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
21   */
22  package de.fu_berlin.ties.filter;
23  
24  import java.io.IOException;
25  import java.util.HashSet;
26  import java.util.Set;
27  
28  import org.apache.commons.lang.builder.ToStringBuilder;
29  import org.dom4j.Document;
30  import org.dom4j.Element;
31  
32  import de.fu_berlin.ties.ContextMap;
33  import de.fu_berlin.ties.ProcessingException;
34  import de.fu_berlin.ties.text.TokenDetails;
35  import de.fu_berlin.ties.text.TokenizerFactory;
36  import de.fu_berlin.ties.util.Util;
37  import de.fu_berlin.ties.xml.dom.DOMUtils;
38  import de.fu_berlin.ties.xml.dom.TokenProcessor;
39  import de.fu_berlin.ties.xml.dom.TokenWalker;
40  
41  /***
42   * A token walker that only invokes a provided
43   * {@link de.fu_berlin.ties.xml.dom.TokenProcessor} on the subset of tokens
44   * that are children of an element accepted by a provided
45   * {@link de.fu_berlin.ties.filter.ElementFilter}.
46   *
47   * <p>Instances of this class are <em>not</em> thread-safe.
48   *
49   * @author Christian Siefkes
50   * @version $Revision: 1.13 $, $Date: 2004/11/19 14:05:00 $, $Author: siefkes $
51   */
52  public class FilteringTokenWalker extends TokenWalker {
53  
54      /***
55       * The element filter used by this instance.
56       */
57      private final ElementFilter filter;
58  
59      /***
60       * A handler that is called whenever some tokens are skipped; may be
61       * <code>null</code>.
62       */
63      private final SkipHandler skipHandler;
64  
65      /***
66       * Stores the last decision whether or not to process a token.
67       */
68      private boolean lastDecision = true;
69  
70      /***
71       * The elements that so far have been accepted by the filter in the
72       * current document.
73       */
74      private Set<Element> acceptedElements;
75  
76      /***
77       * The elements that so far have been rejected by the filter in the
78       * current document.
79       */
80      private Set<Element> rejectedElements;
81  
82      /***
83       * Creates a new instance.
84       *
85       * @param processor used to process the tokens
86       * @param tFactory used to instantiate tokenizers
87       * @param elementFilter the element filter to use
88       * @param sHandler a handler that is called whenever some tokens are
89       * skipped; may be <code>null</code>
90       */
91      public FilteringTokenWalker(final TokenProcessor processor,
92                                  final TokenizerFactory tFactory,
93                                  final ElementFilter elementFilter,
94                                  final SkipHandler sHandler) {
95          super(processor, tFactory);
96          filter = elementFilter;
97          skipHandler = sHandler;
98      }
99  
100     /***
101      * Returns the set of elements that have been accepted by the filter in the
102      * current document.
103      *
104      * @return the accepted elements
105      */
106     public Set getAcceptedElements() {
107         return acceptedElements;
108     }
109 
110     /***
111      * Returns the set of elements that have been rejected by the filter in the
112      * current document.
113      *
114      * @return the rejected elements
115      */
116     public Set getRejectedElements() {
117         return rejectedElements;
118     }
119 
120     /***
121      * Returns the element filter used by this instance.
122      * @return the used element filter
123      */
124     protected ElementFilter getFilter() {
125         return filter;
126     }
127 
128     /***
129      * This method can be overwritten by subclasses to modify decisions of
130      * the element filter. The standard behavior is to accept the decision
131      * as is.
132      *
133      * @param element the element to test
134      * @param filteredElement the element that was actually filtered
135      * (<code>element</code> or a parent), or <code>null</code> if the decision
136      * had been cached (no filtering took place)
137      * @param decision the decision of the element filer
138      * @return the revised decision
139      * @throws ProcessingException if an error occurs while revising the
140      * decision
141      */
142     protected boolean handleAccept(final Element element,
143             final Element filteredElement, final boolean decision)
144     throws ProcessingException {
145         return decision;
146     }
147 
148     /***
149      * {@inheritDoc}
150      */
151     protected void processToken(final Element element, final String left,
152             final TokenDetails details, final String right,
153             final ContextMap context) throws IOException, ProcessingException {
154         // find out whether element is accepted by the filter
155         boolean accepted = false;
156         boolean foundMatch = false;
157         Element currentParent = element;
158         Element nearestPreferred = null;  // first element that is preferred
159         Element nearestAcceptable = null; // first element that is not avoided
160 
161         // did we filter this element or one of its parents already?
162         while ((currentParent != null) && !foundMatch) {
163             if (acceptedElements.contains(currentParent)) {
164                 // found accepted
165                 accepted = true;
166                 foundMatch = true;
167             } else if (rejectedElements.contains(currentParent)) {
168                 // found rejected
169                 accepted = false;
170                 foundMatch = true;
171             } else {
172                 if ((nearestPreferred == null)
173                         && (filter.prefers(currentParent))) {
174                     // found the nearest parent preferred by the filter --
175                     // will use this in case we need to query the filter
176                     nearestPreferred = currentParent;
177                 }
178                 if ((nearestAcceptable == null)
179                         && (!filter.avoids(currentParent))) {
180                     // found the nearest parent not avoided by the filter --
181                     // will use this in case we need to query the filter and
182                     // there is no preferred element
183                     nearestAcceptable = currentParent;
184                 }
185 
186                 // look at parent element
187                 currentParent = currentParent.getParent();
188             }
189         }
190 
191         /* Util.LOG.debug((foundMatch ? "Found" : "Didn't find")
192                 + " match for (parent of) element: "
193                 + DOMUtils.showElement(element) + "; accepted: " + accepted
194                 + "; current parent: "
195                 + DOMUtils.showElement(currentParent)); */
196 
197         final Element filteredElement;
198 
199         if (!foundMatch) {
200             // no match found: need to call the filter
201 
202             if (nearestPreferred != null) {
203                 // filter the first (nearest) element preferred by the filter
204                 filteredElement = nearestPreferred;
205 
206                 /*Util.LOG.debug("Filter nearest preferred: "
207                         + DOMUtils.showElement(filteredElement)); */
208             } else if (nearestAcceptable != null) {
209                 // filter the first (nearest) element not avoided by the filter
210                 filteredElement = nearestAcceptable;
211                 Util.LOG.debug("Filter nearest accepted (no preferred found): "
212                         + DOMUtils.showElement(filteredElement));
213             } else {
214                 // no acceptable element -- filter the current element directly
215                 filteredElement = element;
216                 Util.LOG.debug("Filter element itself (no preferred or accepted"
217                         + " found): " + DOMUtils.showElement(filteredElement));
218             }
219 
220             // query filter
221             accepted = filter.matches(filteredElement);
222 
223             // store decision (both for filteredElement and for current element)
224             if (accepted) {
225                 acceptedElements.add(filteredElement);
226 
227                 if (element != filteredElement) {
228                     acceptedElements.add(element);
229                 }
230             } else {
231                 rejectedElements.add(filteredElement);
232 
233                 if (element != filteredElement) {
234                     rejectedElements.add(element);
235                }
236            }
237         } else {
238             // re-use cached decision: no need to filter any
239             filteredElement = null;
240         }
241 
242         // give subclasses a chance to revise the decision
243         final boolean finalDecision =
244             handleAccept(element, filteredElement, accepted);
245 
246         // delegate to super if accepted, discard text otherwise
247         if (finalDecision) {
248             super.processToken(element, left, details, right, context);
249         } else {
250             if (lastDecision && (skipHandler != null)) {
251                 // last token was accepted but this one wasn't --
252                 // call the skip handler
253                 skipHandler.skip();
254             }
255         }
256 
257         // update stored decision
258         lastDecision = finalDecision;
259     }
260 
261     /***
262      * {@inheritDoc}
263      */
264     public void walk(final Document document, final ContextMap context)
265                            throws IOException, ProcessingException {
266         // (re)-initialize maps + filter
267         acceptedElements = new HashSet<Element>();
268         rejectedElements = new HashSet<Element>();
269         filter.init(document);
270 
271         // delegate to superclass
272         super.walk(document, context);
273     }
274 
275     /***
276      * Returns a string representation of this object.
277      *
278      * @return a textual representation
279      */
280     public String toString() {
281         final ToStringBuilder builder = new ToStringBuilder(this)
282             .appendSuper(super.toString())
283             .append("filter", filter);
284 
285         if (skipHandler != null) {
286             builder.append("skip handler", skipHandler);
287         }
288         return  builder.toString();
289     }
290 
291 }