1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22 package de.fu_berlin.ties.xml.dom;
23
24 import java.io.File;
25 import java.io.FileInputStream;
26 import java.io.FileOutputStream;
27 import java.io.IOException;
28 import java.io.InputStream;
29 import java.io.OutputStream;
30 import java.io.OutputStreamWriter;
31 import java.io.Reader;
32 import java.io.Writer;
33 import java.util.Iterator;
34 import java.util.List;
35
36 import org.apache.commons.configuration.Configuration;
37 import org.apache.commons.lang.StringUtils;
38 import org.dom4j.Attribute;
39 import org.dom4j.Branch;
40 import org.dom4j.Document;
41 import org.dom4j.DocumentException;
42 import org.dom4j.Element;
43 import org.dom4j.Namespace;
44 import org.dom4j.Node;
45 import org.dom4j.QName;
46 import org.dom4j.io.OutputFormat;
47 import org.dom4j.io.SAXReader;
48 import org.dom4j.io.XMLWriter;
49
50 import de.fu_berlin.ties.TiesConfiguration;
51 import de.fu_berlin.ties.io.IOUtils;
52 import de.fu_berlin.ties.text.TextUtils;
53
54 /***
55 * A static class that provides utility constants and methods for working with
56 * DOM-like XML representations, focussing especially on
57 * <a href="http://www.dom4j.org/">dom4j</a>.
58 * No instances of this class can be created, only the static members
59 * should be used.
60 *
61 * @author Christian Siefkes
62 * @version $Revision: 1.28 $, $Date: 2006/10/21 16:04:33 $, $Author: siefkes $
63 */
64 public final class DOMUtils {
65
66 /***
67 * The parser used to read XML documents. By default, this parser uses the
68 * standard {@link org.xml.sax.XMLReader} -- if this causes problems, it
69 * might be useful to explicitly load the <em>Aelfred2</em> parser instead
70 * (specify <code>SAXDriver</code> instance as constructor argument).
71 */
72 private static final SAXReader XML_READER = new SAXReader();
73
74 /***
75 * Private constructor prevents creation of instances.
76 */
77 private DOMUtils() {
78 super();
79 }
80
81 /***
82 * Returns the attribute with the given name, compatible to the name format
83 * returned by {@link #name(Attribute)}. If there are more than one
84 * attributes with the given name (e.g. in different namespaces) then the
85 * first one is returned.
86 *
87 * @param element the element whose attribute to return
88 * @param name the name of the attribute, compatible to the name format
89 * returned by {@link #name(Attribute)}
90 * @return the (first) matching attribute or <code>null</code> if none
91 * exists
92 */
93 public static Attribute attributeByName(final Element element,
94 final String name) {
95 return element.attribute(name);
96 }
97
98 /***
99 * Recursively collects the complete textual content of a branch, i.e.
100 * a document or element.
101 *
102 * @param branch the branch to recurse
103 * @return the collected text of the branch and all its child elements
104 */
105 public static String collectText(final Branch branch) {
106 final StringBuilder appender = new StringBuilder();
107
108
109 collectText(branch, appender);
110 return appender.toString();
111 }
112
113 /***
114 * Recursively collects the complete textual content of a branch, i.e.
115 * a document or element.
116 *
117 * @param branch the branch to recurse
118 * @param appender the collected text of the branch and all its child
119 * elements is appended to this string buffer
120 */
121 public static void collectText(final Branch branch,
122 final StringBuilder appender) {
123
124 try {
125 collectText(branch, appender, null);
126 } catch (IOException ioe) {
127
128 throw new RuntimeException(
129 "Implementation error: " + ioe.toString(), ioe);
130 }
131 }
132
133 /***
134 * Helper method that recursively collects the complete textual content of a
135 * branch and sends the result to a string buffer or writer.
136 *
137 * @param branch the branch to recurse
138 * @param appender the collected text of the branch and all its child
139 * elements is appended to this buffer, if not <code>null</code>
140 * @param writer the collected text of the branch and all its child
141 * elements is appended to this writer if <code>appender</code> is
142 * <code>null</code>
143 * @throws IOException if an I/O error occurs while writing to the writer
144 */
145 private static void collectText(final Branch branch,
146 final StringBuilder appender, final Writer writer)
147 throws IOException {
148 Node currentChild;
149 int currentType;
150
151
152 for (int i = 0; i < branch.nodeCount(); i++) {
153 currentChild = branch.node(i);
154 currentType = currentChild.getNodeType();
155
156 if ((currentType == Node.TEXT_NODE)
157 || (currentType == Node.CDATA_SECTION_NODE)
158 || (currentType == Node.ENTITY_REFERENCE_NODE)) {
159
160 if (appender != null) {
161 appender.append(currentChild.getText());
162 } else {
163 writer.write(currentChild.getText());
164 }
165 } else if (currentType == Node.ELEMENT_NODE) {
166
167 collectText((Element) currentChild, appender, writer);
168 }
169 }
170 }
171
172 /***
173 * Recursively collects the complete textual content of a branch, i.e.
174 * a document or element.
175 *
176 * @param branch the branch to recurse
177 * @param writer the collected text of the branch and all its child
178 * elements is appended to this writer; flushed but not closed by this
179 * method
180 * @throws IOException if an I/O error occurs while writing to the writer
181 */
182 public static void collectText(final Branch branch, final Writer writer)
183 throws IOException {
184
185 collectText(branch, null, writer);
186 writer.flush();
187 }
188
189 /***
190 * Creates the default output format used by this class for storing XML.
191 * This format adds platform-specific newlines after each element, but does
192 * not indent and trims (normalizes) all other whitespace.
193 *
194 * @return the created output format
195 */
196 public static OutputFormat createDefaultOutFormat() {
197 final OutputFormat format = new OutputFormat();
198 format.setIndent(false);
199 format.setNewlines(true);
200 format.setLineSeparator(TextUtils.LINE_SEPARATOR);
201 format.setTrimText(true);
202 return format;
203 }
204
205 /***
206 * Converts a local name into a qualfied name in the
207 * {@linkplain Namespace#NO_NAMESPACE default namespace}.
208 *
209 * @param localName the local the use
210 * @return a qualified name representing the local name in the
211 * {@linkplain Namespace#NO_NAMESPACE default namespace};
212 * or <code>null</code> if <code>localName</code> is <code>null</code>
213 */
214 public static QName defaultName(final String localName) {
215 return (localName == null) ? null
216 : QName.get(localName, Namespace.NO_NAMESPACE);
217 }
218
219 /***
220 * Deletes all attributes of an element and optionally of all its
221 * descendants.
222 *
223 * @param element the elements whose attributes should be deleted
224 * @param recurse whether to recursively delete the attributes of all
225 * direct and indirect child elements as well
226 */
227 public static void deleteAllAttributes(final Element element,
228 final boolean recurse) {
229
230 element.attributes().clear();
231
232 if (recurse) {
233
234 final Iterator childIter = element.elementIterator();
235
236 while (childIter.hasNext()) {
237 deleteAllAttributes((Element) childIter.next(), recurse);
238 }
239 }
240 }
241
242 /***
243 * Returns the child elements with the given name, compatible to the name
244 * format returned by {@link #name(Element)}. If no elements are found
245 * then this method returns an empty list.
246 *
247 * @param element the element whose child elements to return
248 * @param name the name of the child elements, compatible to the name format
249 * returned by {@link #name(Attribute)}
250 * @return a list of all the child {@link Element}s for the given name
251 */
252 public static List elementsByName(final Element element,
253 final String name) {
254 return element.elements(name);
255 }
256
257 /***
258 * Static method that returns a String representing the name of an attribute
259 * in an XML document. This method should always be used when building
260 * context representations and related structures to ensure that
261 * attributes are represented in a unified way. See {@link #name(Element)}
262 * for details.
263 *
264 * @param attrib the element to name
265 * @return the name to use for this element
266 */
267 public static String name(final Attribute attrib) {
268 return attrib.getName();
269 }
270
271 /***
272 * Static method that returns a String representing the name of an element
273 * in an XML document. This method should always be used when building
274 * context representations and related structures to ensure that
275 * elements are represented in a unified way. Please don't call
276 * {@link org.dom4j.Node#getName()} or {@link Element#getQualifiedName()}
277 * or similar methods directly in such cases.
278 *
279 * <p>Currently, only the local name if used, namespace URIs and namespace
280 * prefixes are ignored. Including namespace prefixes in context
281 * representations would be quite useless, because in different document
282 * different prefixes can represent the same namespace and vice versa.
283 *
284 * <p>Including namespace URIs might lead to higher precision by avoiding
285 * the risk of confusing elements from totally different namespaces. On
286 * other other hand it might lead to lower recall and slower learning
287 * because elements from similar namespaces (e.g. different versions of the
288 * HTML standard) are all considered separated from each other.
289 *
290 * @param element the element to name
291 * @return the name to use for this element
292 */
293 public static String name(final Element element) {
294 return element.getName();
295 }
296
297 /***
298 * Reads an XML document from a local filet. Compressed files are
299 * automatically decompressed (cf.
300 * {@link IOUtils#openCompressableInStream(InputStream)}).
301 *
302 * @param file the file to read
303 * @return the newly created document
304 * @throws DocumentException if an error occurs during parsing
305 * @throws IOException if an I/O error occurrs
306 */
307 public static Document readDocument(final File file)
308 throws DocumentException, IOException {
309 InputStream in = new FileInputStream(file);
310 try {
311
312 return readDocument(in);
313 } finally {
314
315 IOUtils.tryToClose(in);
316 }
317 }
318
319 /***
320 * Reads an XML document from a local file, using a configured charset.
321 * Delegates to {@link IOUtils#openReader(File, Configuration)} to determine
322 * the character set. Compressed files are automatically decompressed (cf.
323 * {@link IOUtils#openCompressableInStream(InputStream)}).
324 *
325 * @param file the file to read
326 * @param config the configuration to use
327 * @return the newly created document
328 * @throws DocumentException if an error occurs during parsing
329 * @throws IOException if an I/O error occurrs
330 */
331 public static Document readDocument(final File file,
332 final Configuration config)
333 throws DocumentException, IOException {
334 Reader reader = null;
335 try {
336
337 reader = IOUtils.openReader(file, config);
338 return readDocument(reader);
339 } finally {
340
341 IOUtils.tryToClose(reader);
342 }
343 }
344
345 /***
346 * Reads an XML document from a local file, using a given charset.
347 * Compressed files are automatically decompressed (cf.
348 * {@link IOUtils#openCompressableInStream(InputStream)})
349 *
350 * @param file the file to read
351 * @param charset the character set to use for reading the file;
352 * if <code>null</code>, the default charset of the current platform is used
353 * @return the newly created document
354 * @throws DocumentException if an error occurs during parsing
355 * @throws IOException if an I/O error occurrs
356 */
357 public static Document readDocument(final File file, final String charset)
358 throws DocumentException, IOException {
359 Reader reader = null;
360 try {
361
362 reader = IOUtils.openReader(file, charset);
363 return readDocument(reader);
364 } finally {
365
366 IOUtils.tryToClose(reader);
367 }
368 }
369
370 /***
371 * Reads an XML document from a given stream.
372 * Compressed files are automatically decompressed (cf.
373 * {@link IOUtils#openCompressableInStream(InputStream)})
374 *
375 * @param in stream containing the text to parse; not closed by this method
376 * @return the newly created document
377 * @throws DocumentException if an error occurs during parsing
378 * @throws IOException if an I/O error occurrs
379 */
380 public static Document readDocument(final InputStream in)
381 throws DocumentException, IOException {
382 return XML_READER.read(IOUtils.openCompressableInStream(in));
383 }
384
385 /***
386 * Reads an XML document from a given reader.
387 *
388 * @param reader reader containing the text to parse; not closed by this
389 * method
390 * @return the newly created document
391 * @throws DocumentException if an error occurs during parsing
392 */
393 public static Document readDocument(final Reader reader)
394 throws DocumentException {
395 return XML_READER.read(reader);
396 }
397
398 /***
399 * Builds a simple partial representation of an element, containing the
400 * {@linkplain #name(Element) name of the element} and its normalized
401 * and {@linkplain TextUtils#shorten(String) shortened} textual content.
402 * Useful for logging.
403 *
404 * @param element the element to show (may be <code>null</code>)
405 * @return a simple partial representation of the element
406 */
407 public static String showElement(final Element element) {
408 if (element != null) {
409 return '[' + name(element) + ": "
410 + TextUtils.shorten(TextUtils.normalize(collectText(element)))
411 + ']';
412 } else {
413 return null;
414 }
415 }
416
417 /***
418 * Builds a simple partial representation of a textual token in an element,
419 * containing the {@linkplain #name(Element) name of the element} and the
420 * normalized and {@linkplain TextUtils#shorten(String) shortened} text of
421 * the token. Useful for logging.
422 *
423 * @param element the element containing the token; must not be
424 * <code>null</code>
425 * @param token the token to show (may be <code>null</code>)
426 * @return a simple representation joining element and token
427 */
428 public static String showToken(final Element element, final String token) {
429 return '['
430 + (token == null ? null : TextUtils.shorten(TextUtils.normalize(token)))
431 + '/' + name(element) + ']';
432 }
433
434 /***
435 * Writes an XML document to a file, consulting a given configuration about
436 * {@linkplain
437 * IOUtils#openCompressableOutStream(OutputStream, TiesConfiguration)
438 * whether to use compression}.
439 *
440 * @param document the document to write
441 * @param file the file to write the document to
442 * @param config used to decide whether to use compression
443 * @param suffix an optional suffix that allows
444 * {@linkplain TiesConfiguration#adaptKey(String, String) overwriting} the
445 * general value of the configuration paramter with a more specified value
446 * @throws IOException if an I/O error occurs while writing
447 */
448 public static void writeDocument(final Document document, final File file,
449 final TiesConfiguration config, final String suffix)
450 throws IOException {
451 final OutputStream out = IOUtils.openCompressableOutStream(
452 new FileOutputStream(file), config, suffix);
453 try {
454 writeDocument(document, out);
455 } finally {
456 IOUtils.tryToClose(out);
457 }
458 }
459
460 /***
461 * Writes an XML document to a given stream.
462 *
463 * @param document the document to write
464 * @param out the stream to write the document to; flushed
465 * but not closed by this method
466 * @throws IOException if an I/O error occurs during writing
467 */
468 public static void writeDocument(final Document document,
469 final OutputStream out) throws IOException {
470 final OutputFormat outFormat = createDefaultOutFormat();
471 final XMLWriter xmlWriter = new XMLWriter(out, outFormat);
472 xmlWriter.write(document);
473 xmlWriter.flush();
474 }
475
476 /***
477 * Writes an XML document to a given writer, using the character set of the
478 * underlying output stream.
479 *
480 * @param document the document to write
481 * @param writer the writer to write the document to; flushed
482 * but not closed by this method
483 * @throws IOException if an I/O error occurs during writing
484 */
485 public static void writeDocument(final Document document,
486 final OutputStreamWriter writer) throws IOException {
487 writeDocument(document, writer, IOUtils.determineCharsetName(writer));
488 }
489
490 /***
491 * Writes an XML document to a given writer, using the given character set.
492 *
493 * @param document the document to write
494 * @param writer the writer to write the document to; flushed
495 * but not closed by this method
496 * @param charset the character set of the writer; this must be a valid
497 * charset name (not <code>null</code> or empty etc.), it should be
498 * the <em>canonical</em> (standard) name of the used charset
499 * @throws IllegalArgumentException if the specific charset is
500 * <code>null</code> or empty
501 * @throws IOException if an I/O error occurs during writing
502 */
503 public static void writeDocument(final Document document,
504 final Writer writer, final String charset)
505 throws IllegalArgumentException, IOException {
506 if (StringUtils.isEmpty(charset)) {
507 throw new IllegalArgumentException("Charset name must not be "
508 + ((charset == null) ? "null" : "empty"));
509 }
510
511 final OutputFormat outFormat = createDefaultOutFormat();
512 outFormat.setEncoding(charset);
513
514 final XMLWriter xmlWriter = new XMLWriter(writer, outFormat);
515 xmlWriter.write(document);
516 xmlWriter.flush();
517 }
518
519 }