View Javadoc

1   /*
2    * Copyright (C) 2003-2006 Christian Siefkes <christian@siefkes.net>.
3    * Development of this software is supported by the German Research Society,
4    * Berlin-Brandenburg Graduate School in Distributed Information Systems
5    * (DFG grant no. GRK 316).
6    *
7    * This program is free software; you can redistribute it and/or modify
8    * it under the terms of the GNU General Public License as published by
9    * the Free Software Foundation; either version 2 of the License, or
10   * (at your option) any later version.
11   *
12   * This program is distributed in the hope that it will be useful,
13   * but WITHOUT ANY WARRANTY; without even the implied warranty of
14   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15   * GNU General Public License for more details.
16   *
17   * You should have received a copy of the GNU General Public License
18   * along with this program; if not, visit
19   * http://www.gnu.org/licenses/gpl.html or write to the Free Software
20   * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
21   */
22  package de.fu_berlin.ties.xml.dom;
23  
24  import java.io.IOException;
25  
26  import org.apache.commons.lang.builder.ToStringBuilder;
27  import org.dom4j.Document;
28  import org.dom4j.Element;
29  import org.dom4j.Node;
30  
31  import de.fu_berlin.ties.ContextMap;
32  import de.fu_berlin.ties.ProcessingException;
33  
34  import de.fu_berlin.ties.text.TextTokenizer;
35  import de.fu_berlin.ties.text.TokenCounter;
36  import de.fu_berlin.ties.text.TokenDetails;
37  import de.fu_berlin.ties.text.TokenizerFactory;
38  
39  /***
40   * Walks through a document, handing all textual tokens over to a
41   * {@link de.fu_berlin.ties.xml.dom.TokenProcessor}.
42   *
43   * <p>Instances of this class are thread-safe iff the provided
44   * <code>TokenProcessor</code> is -- but subclass implementations might be not.
45   *
46   * @author Christian Siefkes
47   * @version $Revision: 1.14 $, $Date: 2006/10/21 16:04:33 $, $Author: siefkes $
48   */
49  public class TokenWalker {
50  
51      /***
52       * Used to process the tokens.
53       */
54      private final TokenProcessor tokenProcessor;
55  
56      /***
57       * Used to instantiate tokenizers.
58       */
59      private final TokenizerFactory factory;
60  
61      /***
62       * Creates a new instance.
63       *
64       * @param processor used to process the tokens
65       * @param tFactory used to instantiate tokenizers
66       */
67      public TokenWalker(final TokenProcessor processor,
68              final TokenizerFactory tFactory) {
69          super();
70          tokenProcessor = processor;
71          factory = tFactory;
72      }
73  
74      /***
75       * Empty method that can be overwritten by subclasses to handle the end
76       * of elements in a special way.
77       *
78       * @param element the element
79       * @param context a map of objects that are made available for processing
80       * @throws IOException might be thrown if an I/O error occurs
81       * @throws ProcessingException might be thrown if an error occurs during
82       * processing
83       */
84      protected void endElementHook(final Element element,
85              final ContextMap context) throws IOException, ProcessingException {
86          // hook does nothing unless overwritten
87      }
88  
89      /***
90       * Helper method that tokenizes the collected textual contents of an
91       * element and delegates to the token processor for each of them.
92       *
93       * @param element the element to walk through
94       * @param collectedText the collected textual contents (limited to the
95       * text between/before/after child elements in case of mixed content)
96       * @param tokenCounter keeps track of the encountered tokens
97       * @param tokenizer used to tokenize text
98       * @param context a map of objects that are made available for processing
99       * @throws IOException might be thrown by the token processor
100      * @throws ProcessingException might be thrown by the token processor
101      */
102     protected void processCollectedText(final Element element,
103                         final CharSequence collectedText,
104                         final TokenCounter tokenCounter,
105                         final TextTokenizer tokenizer,
106                         final ContextMap context)
107                         throws IOException, ProcessingException {
108         String currentToken;
109         tokenizer.reset(collectedText);
110         TokenDetails details;
111 
112         while ((currentToken = tokenizer.nextToken()) != null) {
113             // add each token to container (incl. whitespace) + call processor
114             tokenCounter.add(tokenizer.hasPrecedingWhitespace(), currentToken);
115             details = new TokenDetails(currentToken, tokenCounter.getLastRep(),
116                     tokenCounter.size() - 1,
117                     tokenCounter.isWhitespaceBeforeLast());
118             processToken(element, tokenizer.leftText().toString(), details,
119                 tokenizer.rightText().toString(), context);
120         }
121 
122         // add trailing whitespace, if any
123         if (tokenizer.hasPrecedingWhitespace()) {
124             tokenCounter.addWhitespace();
125             // invoke hook that might be implemented by subclasses
126             trailingWhitespaceHook(context);
127         }
128     }
129 
130     /***
131      * Processes a token in an XML element by delegating to the configured
132      * {@link TokenProcessor}.
133      *
134      * @param element the element containing the token
135      * @param left the textual contents of the element to the left of the
136      * <code>token</code> (in case of mixed contents, only up to the last
137      * preceding child element, if any)
138      * @param details details about the token to process
139      * @param right the textual contents of the element to the right of the
140      * <code>token</code> (in case of mixed contents, only up to the next
141      * following child element, if any)
142      * @param context a map of objects that are made available for processing
143      * @throws IOException if an I/O error occurs
144      * @throws ProcessingException if an error occurs during processing
145      */
146     protected void processToken(final Element element, final String left,
147             final TokenDetails details, final String right,
148             final ContextMap context) throws IOException, ProcessingException {
149         // simply delegate to tokenProcessor
150         tokenProcessor.processToken(element, left, details, right, context);
151     }
152 
153     /***
154      * Empty method that can be overwritten by subclasses to handle the start
155      * of elements in a special way.
156      *
157      * @param element the element
158      * @param context a map of objects that are made available for processing
159      * @throws IOException might be thrown if an I/O error occurs
160      * @throws ProcessingException might be thrown if an error occurs during
161      * processing
162      */
163     protected void startElementHook(final Element element,
164             final ContextMap context) throws IOException, ProcessingException {
165         // hook does nothing unless overwritten
166     }
167 
168     /***
169      * Empty method that can be overwritten by subclasses to handle whitespace
170      * at the end of element content in a special way.
171      *
172      * @param context a map of objects that are made available for processing
173      * @throws IOException might be thrown if an I/O error occurs
174      * @throws ProcessingException might be thrown if an error occurs during
175      * processing
176      */
177     protected void trailingWhitespaceHook(final ContextMap context)
178     throws IOException, ProcessingException {
179         // hook does nothing unless overwritten
180     }
181 
182     /***
183      * Walks through the contents of an XML document, tokenizing the textual
184      * contents. The resulting tokens are handed over to the stored
185      * {@link TokenProcessor}.
186      *
187      * @param document the document to walk through
188      * @param context a map of objects that are made available for processing;
189      * might be <code>null</code> if not requred by the token processor
190      * @throws IOException might be thrown by the token processor
191      * @throws ProcessingException might be thrown by the token processor
192      */
193     public void walk(final Document document, final ContextMap context)
194                            throws IOException, ProcessingException {
195         final TokenCounter tokenCounter = new TokenCounter();
196         final TextTokenizer tokenizer = factory.createTokenizer("");
197         final Element root = document.getRootElement();
198         walk(root, tokenCounter, tokenizer, context);
199     }
200 
201     /***
202      * Walks through the contents of a node, tokenizing textual contents
203      * and recursing through nested elements. The registered token processor
204      * is called for each token.
205      *
206      * @param element the element to walk through
207      * @param tokenCounter keeps track of the encountered tokens
208      * @param tokenizer used to tokenize text
209      * @param context a map of objects that are made available for processing
210      * @throws IOException might be thrown by the token processor
211      * @throws ProcessingException might be thrown by the token processor
212      */
213     protected void walk(final Element element,
214                         final TokenCounter tokenCounter,
215                         final TextTokenizer tokenizer,
216                         final ContextMap context)
217                         throws IOException, ProcessingException {
218         final StringBuilder collectedText = new StringBuilder();
219         Node currentChild;
220         int currentType;
221 
222         // invoke hook that might be implemented by subclasses
223         startElementHook(element, context);
224 
225         // process child nodes
226         for (int i = 0; i < element.nodeCount(); i++) {
227             currentChild = element.node(i);
228             currentType = currentChild.getNodeType();
229 
230             if ((currentType == Node.TEXT_NODE)
231                     || (currentType == Node.CDATA_SECTION_NODE)
232                     || (currentType == Node.ENTITY_REFERENCE_NODE)) {
233                 // collect textual content
234                 collectedText.append(currentChild.getText());
235             } else if (currentType == Node.ELEMENT_NODE) {
236                 // process any text collected so far
237                 if (collectedText.length() > 0) {
238                     processCollectedText(element, collectedText,
239                         tokenCounter, tokenizer, context);
240                     collectedText.setLength(0); // reset StringBuilder
241                 }
242 
243                 // walk though child elements for matching
244                 walk((Element) currentChild, tokenCounter, tokenizer,
245                     context);
246             }
247         }
248 
249         // process any remaining collected text
250         if (collectedText.length() > 0) {
251             processCollectedText(element, collectedText,
252                 tokenCounter, tokenizer, context);
253         }
254 
255         // invoke hook that might be implemented by subclasses
256         endElementHook(element, context);
257     }
258 
259     /***
260      * Returns a string representation of this object.
261      *
262      * @return a textual representation
263      */
264     public String toString() {
265         return new ToStringBuilder(this)
266             .append("token processor", tokenProcessor)
267             .append("factory", factory)
268             .toString();
269     }
270 
271 }