View Javadoc

1   /*
2    * Copyright (C) 2003-2004 Christian Siefkes <christian@siefkes.net>.
3    * Development of this software is supported by the German Research Society,
4    * Berlin-Brandenburg Graduate School in Distributed Information Systems
5    * (DFG grant no. GRK 316).
6    *
7    * This library is free software; you can redistribute it and/or
8    * modify it under the terms of the GNU Lesser General Public
9    * License as published by the Free Software Foundation; either
10   * version 2.1 of the License, or (at your option) any later version.
11   *
12   * This library is distributed in the hope that it will be useful,
13   * but WITHOUT ANY WARRANTY; without even the implied warranty of
14   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15   * Lesser General Public License for more details.
16   *
17   * You should have received a copy of the GNU Lesser General Public
18   * License along with this library; if not, visit
19   * http://www.gnu.org/licenses/lgpl.html or write to the Free Software
20   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
21   */
22  package de.fu_berlin.ties.xml.dom;
23  
24  import java.io.File;
25  import java.io.IOException;
26  import java.io.InputStream;
27  import java.io.OutputStream;
28  import java.io.OutputStreamWriter;
29  import java.io.Reader;
30  import java.io.Writer;
31  import java.util.List;
32  
33  import org.apache.commons.configuration.Configuration;
34  import org.apache.commons.lang.StringUtils;
35  import org.dom4j.Attribute;
36  import org.dom4j.Branch;
37  import org.dom4j.Document;
38  import org.dom4j.DocumentException;
39  import org.dom4j.Element;
40  import org.dom4j.Node;
41  import org.dom4j.io.OutputFormat;
42  import org.dom4j.io.SAXReader;
43  import org.dom4j.io.XMLWriter;
44  import org.dom4j.io.aelfred2.SAXDriver;
45  
46  import de.fu_berlin.ties.io.IOUtils;
47  import de.fu_berlin.ties.text.TextUtils;
48  
49  /***
50   * A static class that provides utility constants and methods for working with
51   * DOM-like XML representations, focussing especially on
52   * <a href="http://www.dom4j.org/">dom4j</a>.
53   * No instances of this class can be created, only the static members
54   * should be used.
55   *
56   * @author Christian Siefkes
57   * @version $Revision: 1.14 $, $Date: 2004/11/02 11:36:25 $, $Author: siefkes $
58   */
59  public final class DOMUtils {
60  
/**
 * Shared SAX-based parser used by the <code>readDocument</code> methods
 * of this class to build dom4j documents. It is driven by the AElfred2
 * {@link SAXDriver} that ships in the dom4j package tree.
 *
 * <p>NOTE(review): one single instance serves every caller of this class;
 * confirm that {@link SAXReader} tolerates concurrent use before calling
 * the read methods from several threads.
 */
private static final SAXReader XML_READER =
    new SAXReader(new SAXDriver());

/**
 * Hidden constructor: this class only offers static members, so no
 * instances are ever created.
 */
private DOMUtils() {
    // nothing to initialize; the superclass constructor runs implicitly
}
72  
73      /***
74       * Returns the attribute with the given name, compatible to the name format
75       * returned by {@link #name(Attribute)}.  If there are more than one
76       * attributes with the given name (e.g. in different namespaces) then the
77       * first one is returned.
78       *
79       * @param element the element whose attribute to return
80       * @param name the name of the attribute, compatible to the name format
81       * returned by {@link #name(Attribute)}
82       * @return the (first) matching attribute or <code>null</code> if none
83       * exists
84       */
85      public static Attribute attributeByName(final Element element,
86              final String name) {
87          return element.attribute(name);
88      }
89  
90      /***
91       * Recursively collects the complete textual content of a branch, i.e.
92       * a document or element.
93       *
94       * @param branch the branch to recurse
95       * @return the collected text of the branch and all its child elements
96       */
97      public static String collectText(final Branch branch) {
98          final StringBuffer appender = new StringBuffer();
99  
100         // delegate + return result as string
101         collectText(branch, appender);
102         return appender.toString();
103     }
104 
105     /***
106      * Recursively collects the complete textual content of a branch, i.e.
107      * a document or element.
108      *
109      * @param branch the branch to recurse
110      * @param appender the collected text of the branch and all its child
111      * elements is appended to this string buffer
112      */
113     public static void collectText(final Branch branch,
114                                    final StringBuffer appender) {
115        // delegate to helper method
116        try {
117            collectText(branch, appender, null);
118         } catch (IOException ioe) {
119             // this should never occur when using a string buffer
120             throw new RuntimeException(
121                 "Implementation error: " + ioe.toString(), ioe);
122         }
123     }
124 
125     /***
126      * Helper method that recursively collects the complete textual content of a
127      * branch and sends the result to a string buffer or writer.
128      *
129      * @param branch the branch to recurse
130      * @param appender the collected text of the branch and all its child
131      * elements is appended to this buffer, if not <code>null</code>
132      * @param writer the collected text of the branch and all its child
133      * elements is appended to this writer if <code>appender</code> is
134      * <code>null</code>
135      * @throws IOException if an I/O error occurs while writing to the writer
136      */
137     private static void collectText(final Branch branch,
138             final StringBuffer appender, final Writer writer)
139             throws IOException {
140         Node currentChild;
141         int currentType;
142 
143         // process child nodes
144         for (int i = 0; i < branch.nodeCount(); i++) {
145             currentChild = branch.node(i);
146             currentType = currentChild.getNodeType();
147 
148             if ((currentType == Node.TEXT_NODE)
149                     || (currentType == Node.CDATA_SECTION_NODE)
150                     || (currentType == Node.ENTITY_REFERENCE_NODE)) {
151                 // send textual content to buffer resp. writer
152                 if (appender != null) {
153                     appender.append(currentChild.getText());
154                 } else {
155                     writer.write(currentChild.getText());
156                 }
157             } else if (currentType == Node.ELEMENT_NODE) {
158                 // recursively collect text of child elements
159                 collectText((Element) currentChild, appender, writer);
160             }
161         }
162     }
163 
164     /***
165      * Recursively collects the complete textual content of a branch, i.e.
166      * a document or element.
167      *
168      * @param branch the branch to recurse
169      * @param writer the collected text of the branch and all its child
170      * elements is appended to this writer; flushed but not closed by this
171      * method
172      * @throws IOException if an I/O error occurs while writing to the writer
173      */
174     public static void collectText(final Branch branch, final Writer writer)
175             throws IOException {
176         // delegate to helper method + flush
177         collectText(branch, null, writer);
178         writer.flush();
179     }
180 
181     /***
182      * Returns the child elements with the given name, compatible to the name
183      * format returned by {@link #name(Element)}. If no elements are found
184      * then this method returns an empty list.
185      *
186      * @param element the element whose child elements to return
187      * @param name the name of the child elements, compatible to the name format
188      * returned by {@link #name(Attribute)}
189      * @return a list of all the child {@link Element}s for the given name
190      */
191     public static List elementsByName(final Element element,
192             final String name) {
193         return element.elements(name);
194     }
195 
196     /***
197      * Static method that returns a String representing the name of an attribute
198      * in an XML document. This method should always be used when building
199      * context representations and related structures to ensure that
200      * attributes are represented in a unified way. See {@link #name(Element)}
201      * for details.
202      *
203      * @param attrib the element to name
204      * @return the name to use for this element
205      */
206     public static String name(final Attribute attrib) {
207         return attrib.getName();
208     }
209 
210     /***
211      * Static method that returns a String representing the name of an element
212      * in an XML document. This method should always be used when building
213      * context representations and related structures to ensure that
214      * elements are represented in a unified way. Please don't call
215      * {@link org.dom4j.Node#getName()} or {@link Element#getQualifiedName()}
216      * or similar methods directly in such cases.
217      *
218      * <p>Currently, only the local name if used, namespace URIs and namespace
219      * prefixes are ignored. Including namespace prefixes in context
220      * representations would be quite useless, because in different document
221      * different prefixes can represent the same namespace and vice versa.
222      *
223      * <p>Including namespace URIs might lead to higher precision by avoiding
224      * the risk of confusing elements from totally different namespaces. On
225      * other other hand it might lead to lower recall and slower learning
226      * because elements from similar namespaces (e.g. different versions of the
227      * HTML standard) are all considered separated from each other.
228      *
229      * @param element the element to name
230      * @return the name to use for this element
231      */
232     public static String name(final Element element) {
233         return element.getName();
234     }
235 
236     /***
237      * Reads an XML document from a local file, using a configured charset.
238      * Delegates to {@link IOUtils#openReader(File, Configuration)} to determine
239      * the character set. Compressed files are automatically decompressed (cf.
240      * {@link IOUtils#openCompressableInStream(InputStream)}).
241      *
242      * @param file the file to read
243      * @param config the configuration to use
244      * @return the newly created document
245      * @throws DocumentException if an error occurs during parsing
246      * @throws IOException if an I/O error occurrs
247      */
248     public static Document readDocument(final File file,
249             final Configuration config)
250     throws DocumentException, IOException {
251         Reader reader = null;
252         try {
253             // open reader and delegate
254             reader = IOUtils.openReader(file, config);
255             return readDocument(reader);
256         } finally {
257             // close the reader
258             IOUtils.tryToClose(reader);
259         }
260     }
261 
262     /***
263      * Reads an XML document from a local file, using a given charset.
264      * Compressed files are automatically decompressed (cf.
265      * {@link IOUtils#openCompressableInStream(InputStream)})
266      *
267      * @param file the file to read
268      * @param charset the character set to use for reading the file;
269      * if <code>null</code>, the default charset of the current platform is used
270      * @return the newly created document
271      * @throws DocumentException if an error occurs during parsing
272      * @throws IOException if an I/O error occurrs
273      */
274     public static Document readDocument(final File file, final String charset)
275     throws DocumentException, IOException {
276         Reader reader = null;
277         try {
278             // open reader and delegate
279             reader = IOUtils.openReader(file, charset);
280             return readDocument(reader);
281         } finally {
282             // close the reader
283             IOUtils.tryToClose(reader);
284         }
285     }
286 
287     /***
288      * Reads an XML document from a given stream.
289      * Compressed files are automatically decompressed (cf.
290      * {@link IOUtils#openCompressableInStream(InputStream)})
291      *
292      * @param in stream containing the text to parse; not closed by this method
293      * @return the newly created document
294      * @throws DocumentException if an error occurs during parsing
295      * @throws IOException if an I/O error occurrs
296      */
297     public static Document readDocument(final InputStream in)
298             throws DocumentException, IOException {
299         return XML_READER.read(IOUtils.openCompressableInStream(in));
300     }
301 
302     /***
303      * Reads an XML document from a given reader.
304      *
305      * @param reader reader containing the text to parse; not closed by this
306      * method
307      * @return the newly created document
308      * @throws DocumentException if an error occurs during parsing
309      */
310     public static Document readDocument(final Reader reader)
311             throws DocumentException {
312         return XML_READER.read(reader);
313     }
314 
315     /***
316      * Builds a simple partial representation of an element, containing the
317      * {@linkplain #name(Element) name of the element} and its normalized
318      * and {@linkplain TextUtils#shorten(String) shortened} textual content.
319      * Useful for logging.
320      *
321      * @param element the element to show (may be <code>null</code>)
322      * @return a simple partial representation of the element
323      */
324     public static String showElement(final Element element) {
325         if (element != null) {
326             return '[' + name(element) + ": "
327             + TextUtils.shorten(TextUtils.normalize(collectText(element)))
328             + ']';
329         } else {
330             return null;
331         }
332     }
333 
334     /***
335      * Builds a simple partial representation of a textual token in an element,
336      * containing the {@linkplain #name(Element) name of the element} and the
337      * normalized and {@linkplain TextUtils#shorten(String) shortened} text of
338      * the token. Useful for logging.
339      *
340      * @param element the element containing the token; must not be
341      * <code>null</code>
342      * @param token the token to show (may be <code>null</code>)
343      * @return a simple representation joining element and token
344      */
345     public static String showToken(final Element element, final String token) {
346         return '['
347         + (token == null ? null : TextUtils.shorten(TextUtils.normalize(token)))
348         + '/' + name(element) + ']';
349     }
350 
351     /***
352      * Writes an XML document to a given stream.
353      *
354      * @param document the document to write
355      * @param out the stream to write the document text to; flushed
356      * but not closed by this method
357      * @throws IOException if an I/O error occurs during writing
358      */
359     public static void writeDocument(final Document document,
360             final OutputStream out) throws IOException {
361         final OutputFormat outFormat = OutputFormat.createPrettyPrint();
362         final XMLWriter xmlWriter = new XMLWriter(out, outFormat);
363         xmlWriter.write(document);
364         xmlWriter.flush();
365     }
366 
367     /***
368      * Writes an XML document to a given writer, using the character set of the
369      * underlying output stream.
370      *
371      * @param document the document to write
372      * @param writer the writer to write the document text to; flushed
373      * but not closed by this method
374      * @throws IOException if an I/O error occurs during writing
375      */
376     public static void writeDocument(final Document document,
377             final OutputStreamWriter writer) throws IOException {
378         writeDocument(document, writer, IOUtils.determineCharsetName(writer));
379     }
380 
381     /***
382      * Writes an XML document to a given writer, using the given character set.
383      *
384      * @param document the document to write
385      * @param writer the writer to write the document text to; flushed
386      * but not closed by this method
387      * @param charset the character set of the writer; this must be a valid
388      * charset name (not <code>null</code> or empty etc.), it should be
389      * the <em>canonical</em> (standard) name of the used charset
390      * @throws IllegalArgumentException if the specific charset is
391      * <code>null</code> or empty
392      * @throws IOException if an I/O error occurs during writing
393      */
394     public static void writeDocument(final Document document,
395             final Writer writer, final String charset)
396     throws IllegalArgumentException, IOException {
397         if (StringUtils.isEmpty(charset)) {
398             throw new IllegalArgumentException("Charset name must not be "
399                     + ((charset == null) ? "null" : "empty"));
400         }
401 
402         final OutputFormat outFormat = OutputFormat.createPrettyPrint();
403         outFormat.setEncoding(charset);
404 
405         final XMLWriter xmlWriter = new XMLWriter(writer, outFormat);
406         xmlWriter.write(document);
407         xmlWriter.flush();
408     }
409 
410 }