View Javadoc

1   /*
2    * Copyright (C) 2003-2006 Christian Siefkes <christian@siefkes.net>.
3    * Development of this software is supported by the German Research Society,
4    * Berlin-Brandenburg Graduate School in Distributed Information Systems
5    * (DFG grant no. GRK 316).
6    *
7    * This program is free software; you can redistribute it and/or modify
8    * it under the terms of the GNU General Public License as published by
9    * the Free Software Foundation; either version 2 of the License, or
10   * (at your option) any later version.
11   *
12   * This program is distributed in the hope that it will be useful,
13   * but WITHOUT ANY WARRANTY; without even the implied warranty of
14   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15   * GNU General Public License for more details.
16   *
17   * You should have received a copy of the GNU General Public License
18   * along with this program; if not, visit
19   * http://www.gnu.org/licenses/gpl.html or write to the Free Software
20   * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
21   */
22  package de.fu_berlin.ties.xml.dom;
23  
24  import java.io.IOException;
25  
26  import org.apache.commons.lang.builder.ToStringBuilder;
27  import org.dom4j.Document;
28  import org.dom4j.Element;
29  import org.dom4j.Node;
30  import org.dom4j.NodeFilter;
31  
32  import de.fu_berlin.ties.ContextMap;
33  import de.fu_berlin.ties.ProcessingException;
34  
35  import de.fu_berlin.ties.text.TokenContainer;
36  import de.fu_berlin.ties.text.TokenizerFactory;
37  
38  /***
39   * Walks through a document, handing the elements matched by a
40   * {@link org.dom4j.NodeFilter} over to an
41   * {@link de.fu_berlin.ties.xml.dom.ElementProcessor}.
42   * The textual contents of the document are tokenized; the resulting tokens
43   * are stored in a multi-set ({@link org.apache.commons.collections.Bag}).
44   *
45   * @author Christian Siefkes
46   * @version $Revision: 1.10 $, $Date: 2006/10/21 16:04:33 $, $Author: siefkes $
47   */
48  public class DocumentWalker {
49  
50      /***
51       * A filter used to decide which elements to hand over to the element
52       * processor.
53       */
54      private final NodeFilter elementFilter;
55  
56      /***
57       * Used to process the elements selected by the element filter.
58       */
59      private final ElementProcessor elementProcessor;
60  
61      /***
62       * Used to instantiate tokenizers.
63       */
64      private final TokenizerFactory factory;
65  
66      /***
67       * Creates a new instance.
68       *
69       * @param filter a filter used to decide which elements to hand over to the
70       * element processor
71       * @param processor used to process the elements selected by the filter
72       * @param tFactory used to instantiate tokenizers
73       */
74      public DocumentWalker(final NodeFilter filter,
75                            final ElementProcessor processor,
76                            final TokenizerFactory tFactory) {
77          super();
78          elementFilter = filter;
79          elementProcessor = processor;
80          factory = tFactory;
81      }
82  
83      /***
84       * Walks through the contents of an XML document, tokenizing the textual
85       * contents. The resulting tokens are stored in a {@link TokenContainer}.
86       *
87       * @param document the document to walk through
88       * @param context a map of objects that are made available for processing;
89       * might be <code>null</code> if not requred by the element processor
90       * @throws IOException might be thrown by the element processor
91       * @throws ProcessingException might be thrown by the element processor
92       */
93      public final void walk(final Document document, final ContextMap context)
94                             throws IOException, ProcessingException {
95          final TokenContainer tokenContainer = new TokenContainer(factory);
96          final Element root = document.getRootElement();
97          walk(root, tokenContainer, context);
98      }
99  
100     /***
101      * Walks through the contents of a node, tokenizing textual contents
102      * and recursing through nested elements. For elements matched by the
103      * registered node filter, the registered element processor is called
104      * -- the full textual content of the matched element and its children is
105      * available via {@link TokenContainer#getLast()}. For other elements,
106      * the textual contents are stored and child elements are walked through
107      * and matched recursively.
108      *
109      * <p><i>A successful match stops recursion, i.e. child elements of a
110      * matching element are never handed over to the node filter for
111      * testing (in the case, only the textual contents are recursively
112      * collected).</i>
113      *
114      * @param element the element to walk through
115      * @param tokenContainer container storing all tokens
116      * @param context a map of objects that are made available for processing;
117      * might be <code>null</code> if not required by the element processor
118      * @throws IOException might be thrown by the element processor
119      * @throws ProcessingException might be thrown by the element processor
120      */
121     protected void walk(final Element element,
122                         final TokenContainer tokenContainer,
123                         final ContextMap context)
124                         throws IOException, ProcessingException {
125         final boolean matched = elementFilter.matches(element);
126         final StringBuilder collectedText = new StringBuilder();
127         Node currentChild;
128         int currentType;
129 
130         // process child nodes
131         for (int i = 0; i < element.nodeCount(); i++) {
132             currentChild = element.node(i);
133             currentType = currentChild.getNodeType();
134 
135             if ((currentType == Node.TEXT_NODE)
136                     || (currentType == Node.CDATA_SECTION_NODE)
137                     || (currentType == Node.ENTITY_REFERENCE_NODE)) {
138                 // collect textual content
139                 collectedText.append(currentChild.getText());
140             } else if (currentType == Node.ELEMENT_NODE) {
141                 if (matched) {
142                     // recursively collect text of child elements
143                     DOMUtils.collectText((Element) currentChild, collectedText);
144                 } else {
145                     // flush text
146                     if (collectedText.length() > 0) {
147                         tokenContainer.add(collectedText.toString());
148                         collectedText.setLength(0); // reset StringBuilder
149                     }
150 
151                     // walk though child elements for matching
152                     walk((Element) currentChild, tokenContainer, context);
153                 }
154             }
155         }
156 
157         // tokenize (remaining) text
158         tokenContainer.add(collectedText.toString());
159 
160         if (matched) {
161             // delegate to element processor
162             elementProcessor.processElement(element, tokenContainer, context);
163         }
164     }
165 
166     /***
167      * Returns a string representation of this object.
168      *
169      * @return a textual representation
170      */
171     public String toString() {
172         return new ToStringBuilder(this)
173             .append("element filer", elementFilter)
174             .append("element processor", elementProcessor)
175             .append("factory", factory)
176             .toString();
177     }
178 
179 }