1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22 package de.fu_berlin.ties.xml.dom;
23
24 import java.io.File;
25 import java.io.IOException;
26 import java.io.InputStream;
27 import java.io.OutputStream;
28 import java.io.OutputStreamWriter;
29 import java.io.Reader;
30 import java.io.Writer;
31 import java.util.List;
32
33 import org.apache.commons.configuration.Configuration;
34 import org.apache.commons.lang.StringUtils;
35 import org.dom4j.Attribute;
36 import org.dom4j.Branch;
37 import org.dom4j.Document;
38 import org.dom4j.DocumentException;
39 import org.dom4j.Element;
40 import org.dom4j.Node;
41 import org.dom4j.io.OutputFormat;
42 import org.dom4j.io.SAXReader;
43 import org.dom4j.io.XMLWriter;
44 import org.dom4j.io.aelfred2.SAXDriver;
45
46 import de.fu_berlin.ties.io.IOUtils;
47 import de.fu_berlin.ties.text.TextUtils;
48
49 /***
50 * A static class that provides utility constants and methods for working with
51 * DOM-like XML representations, focussing especially on
52 * <a href="http://www.dom4j.org/">dom4j</a>.
53 * No instances of this class can be created, only the static members
54 * should be used.
55 *
56 * @author Christian Siefkes
57 * @version $Revision: 1.14 $, $Date: 2004/11/02 11:36:25 $, $Author: siefkes $
58 */
59 public final class DOMUtils {
60
61 /***
62 * The parser used to read XML documents.
63 */
64 private static final SAXReader XML_READER = new SAXReader(new SAXDriver());
65
66 /***
67 * Private constructor prevents creation of instances.
68 */
69 private DOMUtils() {
70 super();
71 }
72
73 /***
74 * Returns the attribute with the given name, compatible to the name format
75 * returned by {@link #name(Attribute)}. If there are more than one
76 * attributes with the given name (e.g. in different namespaces) then the
77 * first one is returned.
78 *
79 * @param element the element whose attribute to return
80 * @param name the name of the attribute, compatible to the name format
81 * returned by {@link #name(Attribute)}
82 * @return the (first) matching attribute or <code>null</code> if none
83 * exists
84 */
85 public static Attribute attributeByName(final Element element,
86 final String name) {
87 return element.attribute(name);
88 }
89
90 /***
91 * Recursively collects the complete textual content of a branch, i.e.
92 * a document or element.
93 *
94 * @param branch the branch to recurse
95 * @return the collected text of the branch and all its child elements
96 */
97 public static String collectText(final Branch branch) {
98 final StringBuffer appender = new StringBuffer();
99
100
101 collectText(branch, appender);
102 return appender.toString();
103 }
104
105 /***
106 * Recursively collects the complete textual content of a branch, i.e.
107 * a document or element.
108 *
109 * @param branch the branch to recurse
110 * @param appender the collected text of the branch and all its child
111 * elements is appended to this string buffer
112 */
113 public static void collectText(final Branch branch,
114 final StringBuffer appender) {
115
116 try {
117 collectText(branch, appender, null);
118 } catch (IOException ioe) {
119
120 throw new RuntimeException(
121 "Implementation error: " + ioe.toString(), ioe);
122 }
123 }
124
125 /***
126 * Helper method that recursively collects the complete textual content of a
127 * branch and sends the result to a string buffer or writer.
128 *
129 * @param branch the branch to recurse
130 * @param appender the collected text of the branch and all its child
131 * elements is appended to this buffer, if not <code>null</code>
132 * @param writer the collected text of the branch and all its child
133 * elements is appended to this writer if <code>appender</code> is
134 * <code>null</code>
135 * @throws IOException if an I/O error occurs while writing to the writer
136 */
137 private static void collectText(final Branch branch,
138 final StringBuffer appender, final Writer writer)
139 throws IOException {
140 Node currentChild;
141 int currentType;
142
143
144 for (int i = 0; i < branch.nodeCount(); i++) {
145 currentChild = branch.node(i);
146 currentType = currentChild.getNodeType();
147
148 if ((currentType == Node.TEXT_NODE)
149 || (currentType == Node.CDATA_SECTION_NODE)
150 || (currentType == Node.ENTITY_REFERENCE_NODE)) {
151
152 if (appender != null) {
153 appender.append(currentChild.getText());
154 } else {
155 writer.write(currentChild.getText());
156 }
157 } else if (currentType == Node.ELEMENT_NODE) {
158
159 collectText((Element) currentChild, appender, writer);
160 }
161 }
162 }
163
164 /***
165 * Recursively collects the complete textual content of a branch, i.e.
166 * a document or element.
167 *
168 * @param branch the branch to recurse
169 * @param writer the collected text of the branch and all its child
170 * elements is appended to this writer; flushed but not closed by this
171 * method
172 * @throws IOException if an I/O error occurs while writing to the writer
173 */
174 public static void collectText(final Branch branch, final Writer writer)
175 throws IOException {
176
177 collectText(branch, null, writer);
178 writer.flush();
179 }
180
181 /***
182 * Returns the child elements with the given name, compatible to the name
183 * format returned by {@link #name(Element)}. If no elements are found
184 * then this method returns an empty list.
185 *
186 * @param element the element whose child elements to return
187 * @param name the name of the child elements, compatible to the name format
188 * returned by {@link #name(Attribute)}
189 * @return a list of all the child {@link Element}s for the given name
190 */
191 public static List elementsByName(final Element element,
192 final String name) {
193 return element.elements(name);
194 }
195
196 /***
197 * Static method that returns a String representing the name of an attribute
198 * in an XML document. This method should always be used when building
199 * context representations and related structures to ensure that
200 * attributes are represented in a unified way. See {@link #name(Element)}
201 * for details.
202 *
203 * @param attrib the element to name
204 * @return the name to use for this element
205 */
206 public static String name(final Attribute attrib) {
207 return attrib.getName();
208 }
209
210 /***
211 * Static method that returns a String representing the name of an element
212 * in an XML document. This method should always be used when building
213 * context representations and related structures to ensure that
214 * elements are represented in a unified way. Please don't call
215 * {@link org.dom4j.Node#getName()} or {@link Element#getQualifiedName()}
216 * or similar methods directly in such cases.
217 *
218 * <p>Currently, only the local name if used, namespace URIs and namespace
219 * prefixes are ignored. Including namespace prefixes in context
220 * representations would be quite useless, because in different document
221 * different prefixes can represent the same namespace and vice versa.
222 *
223 * <p>Including namespace URIs might lead to higher precision by avoiding
224 * the risk of confusing elements from totally different namespaces. On
225 * other other hand it might lead to lower recall and slower learning
226 * because elements from similar namespaces (e.g. different versions of the
227 * HTML standard) are all considered separated from each other.
228 *
229 * @param element the element to name
230 * @return the name to use for this element
231 */
232 public static String name(final Element element) {
233 return element.getName();
234 }
235
236 /***
237 * Reads an XML document from a local file, using a configured charset.
238 * Delegates to {@link IOUtils#openReader(File, Configuration)} to determine
239 * the character set. Compressed files are automatically decompressed (cf.
240 * {@link IOUtils#openCompressableInStream(InputStream)}).
241 *
242 * @param file the file to read
243 * @param config the configuration to use
244 * @return the newly created document
245 * @throws DocumentException if an error occurs during parsing
246 * @throws IOException if an I/O error occurrs
247 */
248 public static Document readDocument(final File file,
249 final Configuration config)
250 throws DocumentException, IOException {
251 Reader reader = null;
252 try {
253
254 reader = IOUtils.openReader(file, config);
255 return readDocument(reader);
256 } finally {
257
258 IOUtils.tryToClose(reader);
259 }
260 }
261
262 /***
263 * Reads an XML document from a local file, using a given charset.
264 * Compressed files are automatically decompressed (cf.
265 * {@link IOUtils#openCompressableInStream(InputStream)})
266 *
267 * @param file the file to read
268 * @param charset the character set to use for reading the file;
269 * if <code>null</code>, the default charset of the current platform is used
270 * @return the newly created document
271 * @throws DocumentException if an error occurs during parsing
272 * @throws IOException if an I/O error occurrs
273 */
274 public static Document readDocument(final File file, final String charset)
275 throws DocumentException, IOException {
276 Reader reader = null;
277 try {
278
279 reader = IOUtils.openReader(file, charset);
280 return readDocument(reader);
281 } finally {
282
283 IOUtils.tryToClose(reader);
284 }
285 }
286
287 /***
288 * Reads an XML document from a given stream.
289 * Compressed files are automatically decompressed (cf.
290 * {@link IOUtils#openCompressableInStream(InputStream)})
291 *
292 * @param in stream containing the text to parse; not closed by this method
293 * @return the newly created document
294 * @throws DocumentException if an error occurs during parsing
295 * @throws IOException if an I/O error occurrs
296 */
297 public static Document readDocument(final InputStream in)
298 throws DocumentException, IOException {
299 return XML_READER.read(IOUtils.openCompressableInStream(in));
300 }
301
302 /***
303 * Reads an XML document from a given reader.
304 *
305 * @param reader reader containing the text to parse; not closed by this
306 * method
307 * @return the newly created document
308 * @throws DocumentException if an error occurs during parsing
309 */
310 public static Document readDocument(final Reader reader)
311 throws DocumentException {
312 return XML_READER.read(reader);
313 }
314
315 /***
316 * Builds a simple partial representation of an element, containing the
317 * {@linkplain #name(Element) name of the element} and its normalized
318 * and {@linkplain TextUtils#shorten(String) shortened} textual content.
319 * Useful for logging.
320 *
321 * @param element the element to show (may be <code>null</code>)
322 * @return a simple partial representation of the element
323 */
324 public static String showElement(final Element element) {
325 if (element != null) {
326 return '[' + name(element) + ": "
327 + TextUtils.shorten(TextUtils.normalize(collectText(element)))
328 + ']';
329 } else {
330 return null;
331 }
332 }
333
334 /***
335 * Builds a simple partial representation of a textual token in an element,
336 * containing the {@linkplain #name(Element) name of the element} and the
337 * normalized and {@linkplain TextUtils#shorten(String) shortened} text of
338 * the token. Useful for logging.
339 *
340 * @param element the element containing the token; must not be
341 * <code>null</code>
342 * @param token the token to show (may be <code>null</code>)
343 * @return a simple representation joining element and token
344 */
345 public static String showToken(final Element element, final String token) {
346 return '['
347 + (token == null ? null : TextUtils.shorten(TextUtils.normalize(token)))
348 + '/' + name(element) + ']';
349 }
350
351 /***
352 * Writes an XML document to a given stream.
353 *
354 * @param document the document to write
355 * @param out the stream to write the document text to; flushed
356 * but not closed by this method
357 * @throws IOException if an I/O error occurs during writing
358 */
359 public static void writeDocument(final Document document,
360 final OutputStream out) throws IOException {
361 final OutputFormat outFormat = OutputFormat.createPrettyPrint();
362 final XMLWriter xmlWriter = new XMLWriter(out, outFormat);
363 xmlWriter.write(document);
364 xmlWriter.flush();
365 }
366
367 /***
368 * Writes an XML document to a given writer, using the character set of the
369 * underlying output stream.
370 *
371 * @param document the document to write
372 * @param writer the writer to write the document text to; flushed
373 * but not closed by this method
374 * @throws IOException if an I/O error occurs during writing
375 */
376 public static void writeDocument(final Document document,
377 final OutputStreamWriter writer) throws IOException {
378 writeDocument(document, writer, IOUtils.determineCharsetName(writer));
379 }
380
381 /***
382 * Writes an XML document to a given writer, using the given character set.
383 *
384 * @param document the document to write
385 * @param writer the writer to write the document text to; flushed
386 * but not closed by this method
387 * @param charset the character set of the writer; this must be a valid
388 * charset name (not <code>null</code> or empty etc.), it should be
389 * the <em>canonical</em> (standard) name of the used charset
390 * @throws IllegalArgumentException if the specific charset is
391 * <code>null</code> or empty
392 * @throws IOException if an I/O error occurs during writing
393 */
394 public static void writeDocument(final Document document,
395 final Writer writer, final String charset)
396 throws IllegalArgumentException, IOException {
397 if (StringUtils.isEmpty(charset)) {
398 throw new IllegalArgumentException("Charset name must not be "
399 + ((charset == null) ? "null" : "empty"));
400 }
401
402 final OutputFormat outFormat = OutputFormat.createPrettyPrint();
403 outFormat.setEncoding(charset);
404
405 final XMLWriter xmlWriter = new XMLWriter(writer, outFormat);
406 xmlWriter.write(document);
407 xmlWriter.flush();
408 }
409
410 }