1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22 package de.fu_berlin.ties.xml.dom;
23
24 import java.io.IOException;
25
26 import org.apache.commons.lang.builder.ToStringBuilder;
27 import org.dom4j.Document;
28 import org.dom4j.Element;
29 import org.dom4j.Node;
30
31 import de.fu_berlin.ties.ContextMap;
32 import de.fu_berlin.ties.ProcessingException;
33
34 import de.fu_berlin.ties.text.TextTokenizer;
35 import de.fu_berlin.ties.text.TokenCounter;
36 import de.fu_berlin.ties.text.TokenDetails;
37 import de.fu_berlin.ties.text.TokenizerFactory;
38
39 /***
40 * Walks through a document, handing all textual tokens over to a
41 * {@link de.fu_berlin.ties.xml.dom.TokenProcessor}.
42 *
43 * <p>Instances of this class are thread-safe iff the provided
44 * <code>TokenProcessor</code> is -- but subclass implementations might be not.
45 *
46 * @author Christian Siefkes
47 * @version $Revision: 1.9 $, $Date: 2004/11/19 14:05:09 $, $Author: siefkes $
48 */
49 public class TokenWalker {
50
51 /***
52 * Used to process the tokens.
53 */
54 private final TokenProcessor tokenProcessor;
55
56 /***
57 * Used to instantiate tokenizers.
58 */
59 private final TokenizerFactory factory;
60
61 /***
62 * Creates a new instance.
63 *
64 * @param processor used to process the tokens
65 * @param tFactory used to instantiate tokenizers
66 */
67 public TokenWalker(final TokenProcessor processor,
68 final TokenizerFactory tFactory) {
69 super();
70 tokenProcessor = processor;
71 factory = tFactory;
72 }
73
74 /***
75 * Helper method that tokenizes the collected textual contents of an
76 * element and delegates to the token processor for each of them.
77 *
78 * @param element the element to walk through
79 * @param collectedText the collected textual contents (limited to the
80 * text between/before/after child elements in case of mixed content)
81 * @param tokenCounter keeps track of the encountered tokens
82 * @param tokenizer used to tokenize text
83 * @param context a map of objects that are made available for processing
84 * @throws IOException might be throws by the token processor
85 * @throws ProcessingException might be throws by the token processor
86 */
87 protected void processCollectedText(final Element element,
88 final CharSequence collectedText,
89 final TokenCounter tokenCounter,
90 final TextTokenizer tokenizer,
91 final ContextMap context)
92 throws IOException, ProcessingException {
93 String currentToken;
94 tokenizer.reset(collectedText);
95 TokenDetails details;
96
97 while ((currentToken = tokenizer.nextToken()) != null) {
98
99 tokenCounter.add(tokenizer.hasPrecedingWhitespace(), currentToken);
100 details = new TokenDetails(currentToken, tokenCounter.getLastRep(),
101 tokenCounter.size() - 1,
102 tokenCounter.isWhitespaceBeforeLast());
103 processToken(element, tokenizer.leftText().toString(), details,
104 tokenizer.rightText().toString(), context);
105 }
106
107
108 if (tokenizer.hasPrecedingWhitespace()) {
109 tokenCounter.addWhitespace();
110 }
111 }
112
113 /***
114 * Processes a token in an XML element by delegating to the configured
115 * {@link TokenProcessor}.
116 *
117 * @param element the element containing the token
118 * @param left the textual contents of the element to the left of the
119 * <code>token</code> (in case of mixed contents, only up to the last
120 * preceding child element, if any)
121 * @param details details about the token to process
122 * @param right the textual contents of the element to the right of the
123 * <code>token</code> (in case of mixed contents, only up to the next
124 * following child element, if any)
125 * @param context a map of objects that are made available for processing
126 * @throws IOException if an I/O error occurs
127 * @throws ProcessingException if an error occurs during processing
128 */
129 protected void processToken(final Element element, final String left,
130 final TokenDetails details, final String right,
131 final ContextMap context) throws IOException, ProcessingException {
132
133 tokenProcessor.processToken(element, left, details, right, context);
134 }
135
136 /***
137 * Walks through the contents of an XML document, tokenizing the textual
138 * contents. The resulting tokens are handed over to the stored
139 * {@link TokenProcessor}.
140 *
141 * @param document the document to walk through
142 * @param context a map of objects that are made available for processing;
143 * might be <code>null</code> if not requred by the token processor
144 * @throws IOException might be throws by the token processor
145 * @throws ProcessingException might be throws by the token processor
146 */
147 public void walk(final Document document, final ContextMap context)
148 throws IOException, ProcessingException {
149 final TokenCounter tokenCounter = new TokenCounter();
150 final TextTokenizer tokenizer = factory.createTokenizer("");
151 final Element root = document.getRootElement();
152 walk(root, tokenCounter, tokenizer, context);
153 }
154
155 /***
156 * Walks through the contents of a node, tokenizing textual contents
157 * and recursing through nested elements. The registered token processor
158 * is called for each token.
159 *
160 * @param element the element to walk through
161 * @param tokenCounter keeps track of the encountered tokens
162 * @param tokenizer used to tokenize text
163 * @param context a map of objects that are made available for processing
164 * @throws IOException might be throws by the token processor
165 * @throws ProcessingException might be throws by the token processor
166 */
167 protected void walk(final Element element,
168 final TokenCounter tokenCounter,
169 final TextTokenizer tokenizer,
170 final ContextMap context)
171 throws IOException, ProcessingException {
172 StringBuffer collectedText = new StringBuffer();
173 Node currentChild;
174 int currentType;
175
176
177 for (int i = 0; i < element.nodeCount(); i++) {
178 currentChild = element.node(i);
179 currentType = currentChild.getNodeType();
180
181 if ((currentType == Node.TEXT_NODE)
182 || (currentType == Node.CDATA_SECTION_NODE)
183 || (currentType == Node.ENTITY_REFERENCE_NODE)) {
184
185 collectedText.append(currentChild.getText());
186 } else if (currentType == Node.ELEMENT_NODE) {
187
188 if (collectedText.length() > 0) {
189 processCollectedText(element, collectedText,
190 tokenCounter, tokenizer, context);
191 collectedText.setLength(0);
192 }
193
194
195 walk((Element) currentChild, tokenCounter, tokenizer,
196 context);
197 }
198 }
199
200
201 if (collectedText.length() > 0) {
202 processCollectedText(element, collectedText,
203 tokenCounter, tokenizer, context);
204 }
205 }
206
207 /***
208 * Returns a string representation of this object.
209 *
210 * @return a textual representation
211 */
212 public String toString() {
213 return new ToStringBuilder(this)
214 .append("token processor", tokenProcessor)
215 .append("factory", factory)
216 .toString();
217 }
218
219 }