1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22 package de.fu_berlin.ties.xml.dom;
23
24 import java.io.IOException;
25
26 import org.apache.commons.lang.builder.ToStringBuilder;
27 import org.dom4j.Document;
28 import org.dom4j.Element;
29 import org.dom4j.Node;
30
31 import de.fu_berlin.ties.ContextMap;
32 import de.fu_berlin.ties.ProcessingException;
33
34 import de.fu_berlin.ties.text.TextTokenizer;
35 import de.fu_berlin.ties.text.TokenCounter;
36 import de.fu_berlin.ties.text.TokenDetails;
37 import de.fu_berlin.ties.text.TokenizerFactory;
38
39 /***
40 * Walks through a document, handing all textual tokens over to a
41 * {@link de.fu_berlin.ties.xml.dom.TokenProcessor}.
42 *
43 * <p>Instances of this class are thread-safe iff the provided
44 * <code>TokenProcessor</code> is -- but subclass implementations might be not.
45 *
46 * @author Christian Siefkes
47 * @version $Revision: 1.14 $, $Date: 2006/10/21 16:04:33 $, $Author: siefkes $
48 */
49 public class TokenWalker {
50
51 /***
52 * Used to process the tokens.
53 */
54 private final TokenProcessor tokenProcessor;
55
56 /***
57 * Used to instantiate tokenizers.
58 */
59 private final TokenizerFactory factory;
60
61 /***
62 * Creates a new instance.
63 *
64 * @param processor used to process the tokens
65 * @param tFactory used to instantiate tokenizers
66 */
67 public TokenWalker(final TokenProcessor processor,
68 final TokenizerFactory tFactory) {
69 super();
70 tokenProcessor = processor;
71 factory = tFactory;
72 }
73
74 /***
75 * Empty method that can be overwritten by subclasses to handle the end
76 * of elements in a special way.
77 *
78 * @param element the element
79 * @param context a map of objects that are made available for processing
80 * @throws IOException might be thrown if an I/O error occurs
81 * @throws ProcessingException might be thrown if an error occurs during
82 * processing
83 */
84 protected void endElementHook(final Element element,
85 final ContextMap context) throws IOException, ProcessingException {
86
87 }
88
89 /***
90 * Helper method that tokenizes the collected textual contents of an
91 * element and delegates to the token processor for each of them.
92 *
93 * @param element the element to walk through
94 * @param collectedText the collected textual contents (limited to the
95 * text between/before/after child elements in case of mixed content)
96 * @param tokenCounter keeps track of the encountered tokens
97 * @param tokenizer used to tokenize text
98 * @param context a map of objects that are made available for processing
99 * @throws IOException might be thrown by the token processor
100 * @throws ProcessingException might be thrown by the token processor
101 */
102 protected void processCollectedText(final Element element,
103 final CharSequence collectedText,
104 final TokenCounter tokenCounter,
105 final TextTokenizer tokenizer,
106 final ContextMap context)
107 throws IOException, ProcessingException {
108 String currentToken;
109 tokenizer.reset(collectedText);
110 TokenDetails details;
111
112 while ((currentToken = tokenizer.nextToken()) != null) {
113
114 tokenCounter.add(tokenizer.hasPrecedingWhitespace(), currentToken);
115 details = new TokenDetails(currentToken, tokenCounter.getLastRep(),
116 tokenCounter.size() - 1,
117 tokenCounter.isWhitespaceBeforeLast());
118 processToken(element, tokenizer.leftText().toString(), details,
119 tokenizer.rightText().toString(), context);
120 }
121
122
123 if (tokenizer.hasPrecedingWhitespace()) {
124 tokenCounter.addWhitespace();
125
126 trailingWhitespaceHook(context);
127 }
128 }
129
130 /***
131 * Processes a token in an XML element by delegating to the configured
132 * {@link TokenProcessor}.
133 *
134 * @param element the element containing the token
135 * @param left the textual contents of the element to the left of the
136 * <code>token</code> (in case of mixed contents, only up to the last
137 * preceding child element, if any)
138 * @param details details about the token to process
139 * @param right the textual contents of the element to the right of the
140 * <code>token</code> (in case of mixed contents, only up to the next
141 * following child element, if any)
142 * @param context a map of objects that are made available for processing
143 * @throws IOException if an I/O error occurs
144 * @throws ProcessingException if an error occurs during processing
145 */
146 protected void processToken(final Element element, final String left,
147 final TokenDetails details, final String right,
148 final ContextMap context) throws IOException, ProcessingException {
149
150 tokenProcessor.processToken(element, left, details, right, context);
151 }
152
153 /***
154 * Empty method that can be overwritten by subclasses to handle the start
155 * of elements in a special way.
156 *
157 * @param element the element
158 * @param context a map of objects that are made available for processing
159 * @throws IOException might be thrown if an I/O error occurs
160 * @throws ProcessingException might be thrown if an error occurs during
161 * processing
162 */
163 protected void startElementHook(final Element element,
164 final ContextMap context) throws IOException, ProcessingException {
165
166 }
167
168 /***
169 * Empty method that can be overwritten by subclasses to handle whitespace
170 * at the end of element content in a special way.
171 *
172 * @param context a map of objects that are made available for processing
173 * @throws IOException might be thrown if an I/O error occurs
174 * @throws ProcessingException might be thrown if an error occurs during
175 * processing
176 */
177 protected void trailingWhitespaceHook(final ContextMap context)
178 throws IOException, ProcessingException {
179
180 }
181
182 /***
183 * Walks through the contents of an XML document, tokenizing the textual
184 * contents. The resulting tokens are handed over to the stored
185 * {@link TokenProcessor}.
186 *
187 * @param document the document to walk through
188 * @param context a map of objects that are made available for processing;
189 * might be <code>null</code> if not requred by the token processor
190 * @throws IOException might be thrown by the token processor
191 * @throws ProcessingException might be thrown by the token processor
192 */
193 public void walk(final Document document, final ContextMap context)
194 throws IOException, ProcessingException {
195 final TokenCounter tokenCounter = new TokenCounter();
196 final TextTokenizer tokenizer = factory.createTokenizer("");
197 final Element root = document.getRootElement();
198 walk(root, tokenCounter, tokenizer, context);
199 }
200
201 /***
202 * Walks through the contents of a node, tokenizing textual contents
203 * and recursing through nested elements. The registered token processor
204 * is called for each token.
205 *
206 * @param element the element to walk through
207 * @param tokenCounter keeps track of the encountered tokens
208 * @param tokenizer used to tokenize text
209 * @param context a map of objects that are made available for processing
210 * @throws IOException might be thrown by the token processor
211 * @throws ProcessingException might be thrown by the token processor
212 */
213 protected void walk(final Element element,
214 final TokenCounter tokenCounter,
215 final TextTokenizer tokenizer,
216 final ContextMap context)
217 throws IOException, ProcessingException {
218 final StringBuilder collectedText = new StringBuilder();
219 Node currentChild;
220 int currentType;
221
222
223 startElementHook(element, context);
224
225
226 for (int i = 0; i < element.nodeCount(); i++) {
227 currentChild = element.node(i);
228 currentType = currentChild.getNodeType();
229
230 if ((currentType == Node.TEXT_NODE)
231 || (currentType == Node.CDATA_SECTION_NODE)
232 || (currentType == Node.ENTITY_REFERENCE_NODE)) {
233
234 collectedText.append(currentChild.getText());
235 } else if (currentType == Node.ELEMENT_NODE) {
236
237 if (collectedText.length() > 0) {
238 processCollectedText(element, collectedText,
239 tokenCounter, tokenizer, context);
240 collectedText.setLength(0);
241 }
242
243
244 walk((Element) currentChild, tokenCounter, tokenizer,
245 context);
246 }
247 }
248
249
250 if (collectedText.length() > 0) {
251 processCollectedText(element, collectedText,
252 tokenCounter, tokenizer, context);
253 }
254
255
256 endElementHook(element, context);
257 }
258
259 /***
260 * Returns a string representation of this object.
261 *
262 * @return a textual representation
263 */
264 public String toString() {
265 return new ToStringBuilder(this)
266 .append("token processor", tokenProcessor)
267 .append("factory", factory)
268 .toString();
269 }
270
271 }