View Javadoc

1   /*
2    * Copyright (C) 2003-2006 Christian Siefkes <christian@siefkes.net>.
3    * Development of this software is supported by the German Research Society,
4    * Berlin-Brandenburg Graduate School in Distributed Information Systems
5    * (DFG grant no. GRK 316).
6    *
7    * This program is free software; you can redistribute it and/or modify
8    * it under the terms of the GNU General Public License as published by
9    * the Free Software Foundation; either version 2 of the License, or
10   * (at your option) any later version.
11   *
12   * This program is distributed in the hope that it will be useful,
13   * but WITHOUT ANY WARRANTY; without even the implied warranty of
14   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15   * GNU General Public License for more details.
16   *
17   * You should have received a copy of the GNU General Public License
18   * along with this program; if not, visit
19   * http://www.gnu.org/licenses/gpl.html or write to the Free Software
20   * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
21   */
22  package de.fu_berlin.ties.xml.dom;
23  
24  import java.io.File;
25  import java.io.FileInputStream;
26  import java.io.FileOutputStream;
27  import java.io.IOException;
28  import java.io.InputStream;
29  import java.io.OutputStream;
30  import java.io.OutputStreamWriter;
31  import java.io.Reader;
32  import java.io.Writer;
33  import java.util.Iterator;
34  import java.util.List;
35  
36  import org.apache.commons.configuration.Configuration;
37  import org.apache.commons.lang.StringUtils;
38  import org.dom4j.Attribute;
39  import org.dom4j.Branch;
40  import org.dom4j.Document;
41  import org.dom4j.DocumentException;
42  import org.dom4j.Element;
43  import org.dom4j.Namespace;
44  import org.dom4j.Node;
45  import org.dom4j.QName;
46  import org.dom4j.io.OutputFormat;
47  import org.dom4j.io.SAXReader;
48  import org.dom4j.io.XMLWriter;
49  
50  import de.fu_berlin.ties.TiesConfiguration;
51  import de.fu_berlin.ties.io.IOUtils;
52  import de.fu_berlin.ties.text.TextUtils;
53  
54  /***
55   * A static class that provides utility constants and methods for working with
56   * DOM-like XML representations, focussing especially on
57   * <a href="http://www.dom4j.org/">dom4j</a>.
58   * No instances of this class can be created, only the static members
59   * should be used.
60   *
61   * @author Christian Siefkes
62   * @version $Revision: 1.28 $, $Date: 2006/10/21 16:04:33 $, $Author: siefkes $
63   */
64  public final class DOMUtils {
65  
66      /***
67       * The parser used to read XML documents. By default, this parser uses the
68       * standard {@link org.xml.sax.XMLReader} -- if this causes problems, it
69       * might be useful to explicitly load the <em>Aelfred2</em> parser instead
70       * (specify <code>SAXDriver</code> instance as constructor argument). 
71       */
72      private static final SAXReader XML_READER = new SAXReader();
73  
74      /***
75       * Private constructor prevents creation of instances.
76       */
77      private DOMUtils() {
78          super();
79      }
80  
81      /***
82       * Returns the attribute with the given name, compatible to the name format
83       * returned by {@link #name(Attribute)}.  If there are more than one
84       * attributes with the given name (e.g. in different namespaces) then the
85       * first one is returned.
86       *
87       * @param element the element whose attribute to return
88       * @param name the name of the attribute, compatible to the name format
89       * returned by {@link #name(Attribute)}
90       * @return the (first) matching attribute or <code>null</code> if none
91       * exists
92       */
93      public static Attribute attributeByName(final Element element,
94              final String name) {
95          return element.attribute(name);
96      }
97  
98      /***
99       * Recursively collects the complete textual content of a branch, i.e.
100      * a document or element.
101      *
102      * @param branch the branch to recurse
103      * @return the collected text of the branch and all its child elements
104      */
105     public static String collectText(final Branch branch) {
106         final StringBuilder appender = new StringBuilder();
107 
108         // delegate + return result as string
109         collectText(branch, appender);
110         return appender.toString();
111     }
112 
113     /***
114      * Recursively collects the complete textual content of a branch, i.e.
115      * a document or element.
116      *
117      * @param branch the branch to recurse
118      * @param appender the collected text of the branch and all its child
119      * elements is appended to this string buffer
120      */
121     public static void collectText(final Branch branch,
122                                    final StringBuilder appender) {
123        // delegate to helper method
124        try {
125            collectText(branch, appender, null);
126         } catch (IOException ioe) {
127             // this should never occur when using a string buffer
128             throw new RuntimeException(
129                 "Implementation error: " + ioe.toString(), ioe);
130         }
131     }
132 
133     /***
134      * Helper method that recursively collects the complete textual content of a
135      * branch and sends the result to a string buffer or writer.
136      *
137      * @param branch the branch to recurse
138      * @param appender the collected text of the branch and all its child
139      * elements is appended to this buffer, if not <code>null</code>
140      * @param writer the collected text of the branch and all its child
141      * elements is appended to this writer if <code>appender</code> is
142      * <code>null</code>
143      * @throws IOException if an I/O error occurs while writing to the writer
144      */
145     private static void collectText(final Branch branch,
146             final StringBuilder appender, final Writer writer)
147             throws IOException {
148         Node currentChild;
149         int currentType;
150 
151         // process child nodes
152         for (int i = 0; i < branch.nodeCount(); i++) {
153             currentChild = branch.node(i);
154             currentType = currentChild.getNodeType();
155 
156             if ((currentType == Node.TEXT_NODE)
157                     || (currentType == Node.CDATA_SECTION_NODE)
158                     || (currentType == Node.ENTITY_REFERENCE_NODE)) {
159                 // send textual content to buffer resp. writer
160                 if (appender != null) {
161                     appender.append(currentChild.getText());
162                 } else {
163                     writer.write(currentChild.getText());
164                 }
165             } else if (currentType == Node.ELEMENT_NODE) {
166                 // recursively collect text of child elements
167                 collectText((Element) currentChild, appender, writer);
168             }
169         }
170     }
171 
172     /***
173      * Recursively collects the complete textual content of a branch, i.e.
174      * a document or element.
175      *
176      * @param branch the branch to recurse
177      * @param writer the collected text of the branch and all its child
178      * elements is appended to this writer; flushed but not closed by this
179      * method
180      * @throws IOException if an I/O error occurs while writing to the writer
181      */
182     public static void collectText(final Branch branch, final Writer writer)
183             throws IOException {
184         // delegate to helper method + flush
185         collectText(branch, null, writer);
186         writer.flush();
187     }
188 
189     /***
190      * Creates the default output format used by this class for storing XML.
191      * This format adds platform-specific newlines after each element, but does
192      * not indent and trims (normalizes) all other whitespace.
193      *
194      * @return the created output format
195      */
196     public static OutputFormat createDefaultOutFormat() {
197         final OutputFormat format = new OutputFormat();
198         format.setIndent(false);
199         format.setNewlines(true);
200         format.setLineSeparator(TextUtils.LINE_SEPARATOR);
201         format.setTrimText(true);
202         return format;
203     }
204 
205     /***
206      * Converts a local name into a qualfied name in the
207      * {@linkplain Namespace#NO_NAMESPACE default namespace}.
208      *
209      * @param localName the local the use
210      * @return a qualified name representing the local name in the
211      * {@linkplain Namespace#NO_NAMESPACE default namespace};
212      * or <code>null</code> if <code>localName</code> is <code>null</code>
213      */
214     public static QName defaultName(final String localName) {
215         return (localName == null) ? null
216                 : QName.get(localName, Namespace.NO_NAMESPACE);
217     }
218 
219     /***
220      * Deletes all attributes of an element and optionally of all its
221      * descendants.
222      *
223      * @param element the elements whose attributes should be deleted
224      * @param recurse whether to recursively delete the attributes of all
225      * direct and indirect child elements as well
226      */
227     public static void deleteAllAttributes(final Element element,
228             final boolean recurse) {
229         // delete attributes of this element
230         element.attributes().clear();
231 
232         if (recurse) {
233             // recursively call this method on all child elements
234             final Iterator childIter = element.elementIterator();
235 
236             while (childIter.hasNext()) {
237                 deleteAllAttributes((Element) childIter.next(), recurse);
238             }
239         }
240     }
241 
242     /***
243      * Returns the child elements with the given name, compatible to the name
244      * format returned by {@link #name(Element)}. If no elements are found
245      * then this method returns an empty list.
246      *
247      * @param element the element whose child elements to return
248      * @param name the name of the child elements, compatible to the name format
249      * returned by {@link #name(Attribute)}
250      * @return a list of all the child {@link Element}s for the given name
251      */
252     public static List elementsByName(final Element element,
253             final String name) {
254         return element.elements(name);
255     }
256 
257     /***
258      * Static method that returns a String representing the name of an attribute
259      * in an XML document. This method should always be used when building
260      * context representations and related structures to ensure that
261      * attributes are represented in a unified way. See {@link #name(Element)}
262      * for details.
263      *
264      * @param attrib the element to name
265      * @return the name to use for this element
266      */
267     public static String name(final Attribute attrib) {
268         return attrib.getName();
269     }
270 
271     /***
272      * Static method that returns a String representing the name of an element
273      * in an XML document. This method should always be used when building
274      * context representations and related structures to ensure that
275      * elements are represented in a unified way. Please don't call
276      * {@link org.dom4j.Node#getName()} or {@link Element#getQualifiedName()}
277      * or similar methods directly in such cases.
278      *
279      * <p>Currently, only the local name if used, namespace URIs and namespace
280      * prefixes are ignored. Including namespace prefixes in context
281      * representations would be quite useless, because in different document
282      * different prefixes can represent the same namespace and vice versa.
283      *
284      * <p>Including namespace URIs might lead to higher precision by avoiding
285      * the risk of confusing elements from totally different namespaces. On
286      * other other hand it might lead to lower recall and slower learning
287      * because elements from similar namespaces (e.g. different versions of the
288      * HTML standard) are all considered separated from each other.
289      *
290      * @param element the element to name
291      * @return the name to use for this element
292      */
293     public static String name(final Element element) {
294         return element.getName();
295     }
296 
297     /***
298      * Reads an XML document from a local filet. Compressed files are
299      * automatically decompressed (cf.
300      * {@link IOUtils#openCompressableInStream(InputStream)}).
301      *
302      * @param file the file to read
303      * @return the newly created document
304      * @throws DocumentException if an error occurs during parsing
305      * @throws IOException if an I/O error occurrs
306      */
307     public static Document readDocument(final File file)
308     throws DocumentException, IOException {
309         InputStream in = new FileInputStream(file);
310         try {
311             // delegate (compression will be handled automatically)
312             return readDocument(in);
313         } finally {
314             // close the stream
315             IOUtils.tryToClose(in);
316         }
317     }
318 
319     /***
320      * Reads an XML document from a local file, using a configured charset.
321      * Delegates to {@link IOUtils#openReader(File, Configuration)} to determine
322      * the character set. Compressed files are automatically decompressed (cf.
323      * {@link IOUtils#openCompressableInStream(InputStream)}).
324      *
325      * @param file the file to read
326      * @param config the configuration to use
327      * @return the newly created document
328      * @throws DocumentException if an error occurs during parsing
329      * @throws IOException if an I/O error occurrs
330      */
331     public static Document readDocument(final File file,
332             final Configuration config)
333     throws DocumentException, IOException {
334         Reader reader = null;
335         try {
336             // open reader and delegate
337             reader = IOUtils.openReader(file, config);
338             return readDocument(reader);
339         } finally {
340             // close the reader
341             IOUtils.tryToClose(reader);
342         }
343     }
344 
345     /***
346      * Reads an XML document from a local file, using a given charset.
347      * Compressed files are automatically decompressed (cf.
348      * {@link IOUtils#openCompressableInStream(InputStream)})
349      *
350      * @param file the file to read
351      * @param charset the character set to use for reading the file;
352      * if <code>null</code>, the default charset of the current platform is used
353      * @return the newly created document
354      * @throws DocumentException if an error occurs during parsing
355      * @throws IOException if an I/O error occurrs
356      */
357     public static Document readDocument(final File file, final String charset)
358     throws DocumentException, IOException {
359         Reader reader = null;
360         try {
361             // open reader and delegate
362             reader = IOUtils.openReader(file, charset);
363             return readDocument(reader);
364         } finally {
365             // close the reader
366             IOUtils.tryToClose(reader);
367         }
368     }
369 
370     /***
371      * Reads an XML document from a given stream.
372      * Compressed files are automatically decompressed (cf.
373      * {@link IOUtils#openCompressableInStream(InputStream)})
374      *
375      * @param in stream containing the text to parse; not closed by this method
376      * @return the newly created document
377      * @throws DocumentException if an error occurs during parsing
378      * @throws IOException if an I/O error occurrs
379      */
380     public static Document readDocument(final InputStream in)
381             throws DocumentException, IOException {
382         return XML_READER.read(IOUtils.openCompressableInStream(in));
383     }
384 
385     /***
386      * Reads an XML document from a given reader.
387      *
388      * @param reader reader containing the text to parse; not closed by this
389      * method
390      * @return the newly created document
391      * @throws DocumentException if an error occurs during parsing
392      */
393     public static Document readDocument(final Reader reader)
394             throws DocumentException {
395         return XML_READER.read(reader);
396     }
397 
398     /***
399      * Builds a simple partial representation of an element, containing the
400      * {@linkplain #name(Element) name of the element} and its normalized
401      * and {@linkplain TextUtils#shorten(String) shortened} textual content.
402      * Useful for logging.
403      *
404      * @param element the element to show (may be <code>null</code>)
405      * @return a simple partial representation of the element
406      */
407     public static String showElement(final Element element) {
408         if (element != null) {
409             return '[' + name(element) + ": "
410             + TextUtils.shorten(TextUtils.normalize(collectText(element)))
411             + ']';
412         } else {
413             return null;
414         }
415     }
416 
417     /***
418      * Builds a simple partial representation of a textual token in an element,
419      * containing the {@linkplain #name(Element) name of the element} and the
420      * normalized and {@linkplain TextUtils#shorten(String) shortened} text of
421      * the token. Useful for logging.
422      *
423      * @param element the element containing the token; must not be
424      * <code>null</code>
425      * @param token the token to show (may be <code>null</code>)
426      * @return a simple representation joining element and token
427      */
428     public static String showToken(final Element element, final String token) {
429         return '['
430         + (token == null ? null : TextUtils.shorten(TextUtils.normalize(token)))
431         + '/' + name(element) + ']';
432     }
433 
434     /***
435      * Writes an XML document to a file, consulting a given configuration about
436      * {@linkplain
437      * IOUtils#openCompressableOutStream(OutputStream, TiesConfiguration)
438      * whether to use compression}.
439      *
440      * @param document the document to write
441      * @param file the file to write the document to
442      * @param config used to decide whether to use compression
443      * @param suffix an optional suffix that allows
444      * {@linkplain TiesConfiguration#adaptKey(String, String) overwriting} the
445      * general value of the configuration paramter with a more specified value
446      * @throws IOException if an I/O error occurs while writing
447      */
448     public static void writeDocument(final Document document, final File file,
449             final TiesConfiguration config, final String suffix)
450     throws IOException {
451         final OutputStream out = IOUtils.openCompressableOutStream(
452                 new FileOutputStream(file), config, suffix);
453         try {
454             writeDocument(document, out);
455         } finally {
456             IOUtils.tryToClose(out);
457         }
458     }
459 
460     /***
461      * Writes an XML document to a given stream.
462      *
463      * @param document the document to write
464      * @param out the stream to write the document to; flushed
465      * but not closed by this method
466      * @throws IOException if an I/O error occurs during writing
467      */
468     public static void writeDocument(final Document document,
469             final OutputStream out) throws IOException {
470         final OutputFormat outFormat = createDefaultOutFormat();
471         final XMLWriter xmlWriter = new XMLWriter(out, outFormat);
472         xmlWriter.write(document);
473         xmlWriter.flush();
474     }
475 
476     /***
477      * Writes an XML document to a given writer, using the character set of the
478      * underlying output stream.
479      *
480      * @param document the document to write
481      * @param writer the writer to write the document to; flushed
482      * but not closed by this method
483      * @throws IOException if an I/O error occurs during writing
484      */
485     public static void writeDocument(final Document document,
486             final OutputStreamWriter writer) throws IOException {
487         writeDocument(document, writer, IOUtils.determineCharsetName(writer));
488     }
489 
490     /***
491      * Writes an XML document to a given writer, using the given character set.
492      *
493      * @param document the document to write
494      * @param writer the writer to write the document to; flushed
495      * but not closed by this method
496      * @param charset the character set of the writer; this must be a valid
497      * charset name (not <code>null</code> or empty etc.), it should be
498      * the <em>canonical</em> (standard) name of the used charset
499      * @throws IllegalArgumentException if the specific charset is
500      * <code>null</code> or empty
501      * @throws IOException if an I/O error occurs during writing
502      */
503     public static void writeDocument(final Document document,
504             final Writer writer, final String charset)
505     throws IllegalArgumentException, IOException {
506         if (StringUtils.isEmpty(charset)) {
507             throw new IllegalArgumentException("Charset name must not be "
508                     + ((charset == null) ? "null" : "empty"));
509         }
510 
511         final OutputFormat outFormat = createDefaultOutFormat();
512         outFormat.setEncoding(charset);
513 
514         final XMLWriter xmlWriter = new XMLWriter(writer, outFormat);
515         xmlWriter.write(document);
516         xmlWriter.flush();
517     }
518 
519 }