1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22 package de.fu_berlin.ties.xml.dom;
23
24 import java.io.IOException;
25
26 import org.apache.commons.lang.builder.ToStringBuilder;
27 import org.dom4j.Document;
28 import org.dom4j.Element;
29 import org.dom4j.Node;
30 import org.dom4j.NodeFilter;
31
32 import de.fu_berlin.ties.ContextMap;
33 import de.fu_berlin.ties.ProcessingException;
34
35 import de.fu_berlin.ties.text.TokenContainer;
36 import de.fu_berlin.ties.text.TokenizerFactory;
37
38 /***
39 * Walks through a document, handing the elements matched by a
40 * {@link org.dom4j.NodeFilter} over to an
41 * {@link de.fu_berlin.ties.xml.dom.ElementProcessor}.
42 * The textual contents of the document are tokenized; the resulting tokens
43 * are stored in a multi-set ({@link org.apache.commons.collections.Bag}).
44 *
45 * @author Christian Siefkes
46 * @version $Revision: 1.10 $, $Date: 2006/10/21 16:04:33 $, $Author: siefkes $
47 */
48 public class DocumentWalker {
49
50 /***
51 * A filter used to decide which elements to hand over to the element
52 * processor.
53 */
54 private final NodeFilter elementFilter;
55
56 /***
57 * Used to process the elements selected by the element filter.
58 */
59 private final ElementProcessor elementProcessor;
60
61 /***
62 * Used to instantiate tokenizers.
63 */
64 private final TokenizerFactory factory;
65
66 /***
67 * Creates a new instance.
68 *
69 * @param filter a filter used to decide which elements to hand over to the
70 * element processor
71 * @param processor used to process the elements selected by the filter
72 * @param tFactory used to instantiate tokenizers
73 */
74 public DocumentWalker(final NodeFilter filter,
75 final ElementProcessor processor,
76 final TokenizerFactory tFactory) {
77 super();
78 elementFilter = filter;
79 elementProcessor = processor;
80 factory = tFactory;
81 }
82
83 /***
84 * Walks through the contents of an XML document, tokenizing the textual
85 * contents. The resulting tokens are stored in a {@link TokenContainer}.
86 *
87 * @param document the document to walk through
88 * @param context a map of objects that are made available for processing;
89 * might be <code>null</code> if not requred by the element processor
90 * @throws IOException might be thrown by the element processor
91 * @throws ProcessingException might be thrown by the element processor
92 */
93 public final void walk(final Document document, final ContextMap context)
94 throws IOException, ProcessingException {
95 final TokenContainer tokenContainer = new TokenContainer(factory);
96 final Element root = document.getRootElement();
97 walk(root, tokenContainer, context);
98 }
99
100 /***
101 * Walks through the contents of a node, tokenizing textual contents
102 * and recursing through nested elements. For elements matched by the
103 * registered node filter, the registered element processor is called
104 * -- the full textual content of the matched element and its children is
105 * available via {@link TokenContainer#getLast()}. For other elements,
106 * the textual contents are stored and child elements are walked through
107 * and matched recursively.
108 *
109 * <p><i>A successful match stops recursion, i.e. child elements of a
110 * matching element are never handed over to the node filter for
111 * testing (in the case, only the textual contents are recursively
112 * collected).</i>
113 *
114 * @param element the element to walk through
115 * @param tokenContainer container storing all tokens
116 * @param context a map of objects that are made available for processing;
117 * might be <code>null</code> if not required by the element processor
118 * @throws IOException might be thrown by the element processor
119 * @throws ProcessingException might be thrown by the element processor
120 */
121 protected void walk(final Element element,
122 final TokenContainer tokenContainer,
123 final ContextMap context)
124 throws IOException, ProcessingException {
125 final boolean matched = elementFilter.matches(element);
126 final StringBuilder collectedText = new StringBuilder();
127 Node currentChild;
128 int currentType;
129
130
131 for (int i = 0; i < element.nodeCount(); i++) {
132 currentChild = element.node(i);
133 currentType = currentChild.getNodeType();
134
135 if ((currentType == Node.TEXT_NODE)
136 || (currentType == Node.CDATA_SECTION_NODE)
137 || (currentType == Node.ENTITY_REFERENCE_NODE)) {
138
139 collectedText.append(currentChild.getText());
140 } else if (currentType == Node.ELEMENT_NODE) {
141 if (matched) {
142
143 DOMUtils.collectText((Element) currentChild, collectedText);
144 } else {
145
146 if (collectedText.length() > 0) {
147 tokenContainer.add(collectedText.toString());
148 collectedText.setLength(0);
149 }
150
151
152 walk((Element) currentChild, tokenContainer, context);
153 }
154 }
155 }
156
157
158 tokenContainer.add(collectedText.toString());
159
160 if (matched) {
161
162 elementProcessor.processElement(element, tokenContainer, context);
163 }
164 }
165
166 /***
167 * Returns a string representation of this object.
168 *
169 * @return a textual representation
170 */
171 public String toString() {
172 return new ToStringBuilder(this)
173 .append("element filer", elementFilter)
174 .append("element processor", elementProcessor)
175 .append("factory", factory)
176 .toString();
177 }
178
179 }