View Javadoc

1   /*
2    * Copyright (C) 2003-2004 Christian Siefkes <christian@siefkes.net>.
3    * Development of this software is supported by the German Research Society,
4    * Berlin-Brandenburg Graduate School in Distributed Information Systems
5    * (DFG grant no. GRK 316).
6    *
7    * This library is free software; you can redistribute it and/or
8    * modify it under the terms of the GNU Lesser General Public
9    * License as published by the Free Software Foundation; either
10   * version 2.1 of the License, or (at your option) any later version.
11   *
12   * This library is distributed in the hope that it will be useful,
13   * but WITHOUT ANY WARRANTY; without even the implied warranty of
14   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15   * Lesser General Public License for more details.
16   *
17   * You should have received a copy of the GNU Lesser General Public
18   * License along with this library; if not, visit
19   * http://www.gnu.org/licenses/lgpl.html or write to the Free Software
20   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
21   */
22  package de.fu_berlin.ties.xml.dom;
23  
24  import java.io.IOException;
25  
26  import org.apache.commons.lang.builder.ToStringBuilder;
27  import org.dom4j.Document;
28  import org.dom4j.Element;
29  import org.dom4j.Node;
30  
31  import de.fu_berlin.ties.ContextMap;
32  import de.fu_berlin.ties.ProcessingException;
33  
34  import de.fu_berlin.ties.text.TextTokenizer;
35  import de.fu_berlin.ties.text.TokenCounter;
36  import de.fu_berlin.ties.text.TokenDetails;
37  import de.fu_berlin.ties.text.TokenizerFactory;
38  
39  /***
40   * Walks through a document, handing all textual tokens over to a
41   * {@link de.fu_berlin.ties.xml.dom.TokenProcessor}.
42   *
43   * <p>Instances of this class are thread-safe iff the provided
44   * <code>TokenProcessor</code> is -- but subclass implementations might be not.
45   *
46   * @author Christian Siefkes
47   * @version $Revision: 1.9 $, $Date: 2004/11/19 14:05:09 $, $Author: siefkes $
48   */
49  public class TokenWalker {
50  
51      /***
52       * Used to process the tokens.
53       */
54      private final TokenProcessor tokenProcessor;
55  
56      /***
57       * Used to instantiate tokenizers.
58       */
59      private final TokenizerFactory factory;
60  
61      /***
62       * Creates a new instance.
63       *
64       * @param processor used to process the tokens
65       * @param tFactory used to instantiate tokenizers
66       */
67      public TokenWalker(final TokenProcessor processor,
68              final TokenizerFactory tFactory) {
69          super();
70          tokenProcessor = processor;
71          factory = tFactory;
72      }
73  
74      /***
75       * Helper method that tokenizes the collected textual contents of an
76       * element and delegates to the token processor for each of them.
77       *
78       * @param element the element to walk through
79       * @param collectedText the collected textual contents (limited to the
80       * text between/before/after child elements in case of mixed content)
81       * @param tokenCounter keeps track of the encountered tokens
82       * @param tokenizer used to tokenize text
83       * @param context a map of objects that are made available for processing
84       * @throws IOException might be throws by the token processor
85       * @throws ProcessingException might be throws by the token processor
86       */
87      protected void processCollectedText(final Element element,
88                          final CharSequence collectedText,
89                          final TokenCounter tokenCounter,
90                          final TextTokenizer tokenizer,
91                          final ContextMap context)
92                          throws IOException, ProcessingException {
93          String currentToken;
94          tokenizer.reset(collectedText);
95          TokenDetails details;
96  
97          while ((currentToken = tokenizer.nextToken()) != null) {
98              // add each token to container (incl. whitespace) + call processor
99              tokenCounter.add(tokenizer.hasPrecedingWhitespace(), currentToken);
100             details = new TokenDetails(currentToken, tokenCounter.getLastRep(),
101                     tokenCounter.size() - 1,
102                     tokenCounter.isWhitespaceBeforeLast());
103             processToken(element, tokenizer.leftText().toString(), details,
104                 tokenizer.rightText().toString(), context);
105         }
106 
107         // add trailing whitespace, if any
108         if (tokenizer.hasPrecedingWhitespace()) {
109             tokenCounter.addWhitespace();
110         }
111     }
112 
113     /***
114      * Processes a token in an XML element by delegating to the configured
115      * {@link TokenProcessor}.
116      *
117      * @param element the element containing the token
118      * @param left the textual contents of the element to the left of the
119      * <code>token</code> (in case of mixed contents, only up to the last
120      * preceding child element, if any)
121      * @param details details about the token to process
122      * @param right the textual contents of the element to the right of the
123      * <code>token</code> (in case of mixed contents, only up to the next
124      * following child element, if any)
125      * @param context a map of objects that are made available for processing
126      * @throws IOException if an I/O error occurs
127      * @throws ProcessingException if an error occurs during processing
128      */
129     protected void processToken(final Element element, final String left,
130             final TokenDetails details, final String right,
131             final ContextMap context) throws IOException, ProcessingException {
132         // simply delegate to tokenProcessor
133         tokenProcessor.processToken(element, left, details, right, context);
134     }
135 
136     /***
137      * Walks through the contents of an XML document, tokenizing the textual
138      * contents. The resulting tokens are handed over to the stored
139      * {@link TokenProcessor}.
140      *
141      * @param document the document to walk through
142      * @param context a map of objects that are made available for processing;
143      * might be <code>null</code> if not requred by the token processor
144      * @throws IOException might be throws by the token processor
145      * @throws ProcessingException might be throws by the token processor
146      */
147     public void walk(final Document document, final ContextMap context)
148                            throws IOException, ProcessingException {
149         final TokenCounter tokenCounter = new TokenCounter();
150         final TextTokenizer tokenizer = factory.createTokenizer("");
151         final Element root = document.getRootElement();
152         walk(root, tokenCounter, tokenizer, context);
153     }
154 
155     /***
156      * Walks through the contents of a node, tokenizing textual contents
157      * and recursing through nested elements. The registered token processor
158      * is called for each token.
159      *
160      * @param element the element to walk through
161      * @param tokenCounter keeps track of the encountered tokens
162      * @param tokenizer used to tokenize text
163      * @param context a map of objects that are made available for processing
164      * @throws IOException might be throws by the token processor
165      * @throws ProcessingException might be throws by the token processor
166      */
167     protected void walk(final Element element,
168                         final TokenCounter tokenCounter,
169                         final TextTokenizer tokenizer,
170                         final ContextMap context)
171                         throws IOException, ProcessingException {
172         StringBuffer collectedText = new StringBuffer();
173         Node currentChild;
174         int currentType;
175 
176         // process child nodes
177         for (int i = 0; i < element.nodeCount(); i++) {
178             currentChild = element.node(i);
179             currentType = currentChild.getNodeType();
180 
181             if ((currentType == Node.TEXT_NODE)
182                     || (currentType == Node.CDATA_SECTION_NODE)
183                     || (currentType == Node.ENTITY_REFERENCE_NODE)) {
184                 // collect textual content
185                 collectedText.append(currentChild.getText());
186             } else if (currentType == Node.ELEMENT_NODE) {
187                 // process any text collected so far
188                 if (collectedText.length() > 0) {
189                     processCollectedText(element, collectedText,
190                         tokenCounter, tokenizer, context);
191                     collectedText.setLength(0); // reset StringBuffer
192                 }
193 
194                 // walk though child elements for matching
195                 walk((Element) currentChild, tokenCounter, tokenizer,
196                     context);
197             }
198         }
199 
200         // process any remaining collected text
201         if (collectedText.length() > 0) {
202             processCollectedText(element, collectedText,
203                 tokenCounter, tokenizer, context);
204         }
205     }
206 
207     /***
208      * Returns a string representation of this object.
209      *
210      * @return a textual representation
211      */
212     public String toString() {
213         return new ToStringBuilder(this)
214             .append("token processor", tokenProcessor)
215             .append("factory", factory)
216             .toString();
217     }
218 
219 }