View Javadoc

1   /*
2    * Copyright (C) 2004-2006 Christian Siefkes <christian@siefkes.net>.
3    * Development of this software is supported by the German Research Society,
4    * Berlin-Brandenburg Graduate School in Distributed Information Systems
5    * (DFG grant no. GRK 316).
6    *
7    * This program is free software; you can redistribute it and/or modify
8    * it under the terms of the GNU General Public License as published by
9    * the Free Software Foundation; either version 2 of the License, or
10   * (at your option) any later version.
11   *
12   * This program is distributed in the hope that it will be useful,
13   * but WITHOUT ANY WARRANTY; without even the implied warranty of
14   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15   * GNU General Public License for more details.
16   *
17   * You should have received a copy of the GNU General Public License
18   * along with this program; if not, visit
19   * http://www.gnu.org/licenses/gpl.html or write to the Free Software
20   * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
21   */
22  package de.fu_berlin.ties.xml.dom;
23  
24  import java.io.IOException;
25  import java.io.Writer;
26  
27  import org.dom4j.Branch;
28  import org.dom4j.Document;
29  import org.dom4j.Element;
30  import org.dom4j.tree.DefaultDocument;
31  import org.dom4j.tree.DefaultElement;
32  
33  import de.fu_berlin.ties.ContextMap;
34  import de.fu_berlin.ties.DocumentReader;
35  import de.fu_berlin.ties.TiesConfiguration;
36  import de.fu_berlin.ties.io.IOUtils;
37  import de.fu_berlin.ties.text.TextUtils;
38  
39  /***
40   * An XML stripper converts a XML document to plain text, removing all markup.
41   * <p>This class is thread-safe and can be used to convert several documents
42   * in parallel.
43   *
44   * @author Christian Siefkes
45   * @version $Revision: 1.11 $, $Date: 2006/10/21 16:04:33 $, $Author: siefkes $
46   */
47  public class XMLStripper extends DocumentReader {
48  
49      /***
50       * Whether or not to normalize whitespace.
51       */
52      private final boolean normalizeWS;
53  
54      /***
55       * If this is set to <code>true</code>, the output will be an XML document
56       * instead of a plain text, by preserving the root element (all other
57       * elements + attributes are still discarded).
58       */
59      private final boolean toXML;
60  
61  
62      /***
63       * Creates a new instance, using a default extension and the
64       * {@linkplain TiesConfiguration#CONF standard configuration}.
65       */
66      public XMLStripper() {
67          this("txt");
68      }
69  
70      /***
71       * Creates a new instance, using the
72       * {@linkplain TiesConfiguration#CONF standard configuration}.
73       *
74       * @param outExt the extension to use for output files
75       */
76      public XMLStripper(final String outExt) {
77          this(outExt, TiesConfiguration.CONF);
78      }
79  
80      /***
81       * Creates a new instance from the provided configuration.
82       *
83       * @param outExt the extension to use for output files
84       * @param config used to configure this instance
85       */
86      public XMLStripper(final String outExt, final TiesConfiguration config) {
87          this(outExt, config.getBoolean("strip.to-xml"),
88                  config.getBoolean("strip.normalize"), config);
89      }
90  
91      /***
92       * Creates a new instance.
93       *
94       * @param outExt the extension to use for output files
95       * @param stripToXML if this is set to <code>true</code>, the output will
96       * be an XML document instead of a plain text, by preserving the root
97       * element (all other elements + attributes are still discarded)
98       * @param myNormalizeWS whether to normalize whitespace
99       * @param config used to configure superclasses
100      */
101     public XMLStripper(final String outExt, final boolean stripToXML,
102             final boolean myNormalizeWS, final TiesConfiguration config) {
103         super(outExt, config);
104         toXML = stripToXML;
105         normalizeWS = myNormalizeWS;
106     }
107 
108 
109     /***
110      * Helper method that collects and, if configured, normalizes the textual
111      * content of a document/element.
112      *
113      * @param branch the document/element to collect
114      * @return the collected textual content of the given branch, normalized
115      * if configured
116      */
117     private String collectText(final Branch branch) {
118         final String collectedText = DOMUtils.collectText(branch);
119 
120         if (normalizeWS) {
121             return TextUtils.normalize(collectedText);
122         } else {
123             return collectedText;
124         }
125     }
126 
127     /***
128      * Strips all markup from an XML document and stores the resulting plain
129      * text. This method just delegates to
130      * {@link DOMUtils#collectText(org.dom4j.Branch, StringBuilder)}.
131      *
132      * @param document the document to read
133      * @param writer the writer to write the resulting plain text to; flushed
134      * but not closed by this method
135      * @param context a map of objects that are made available for processing
136      * @throws IOException if an I/O error occurs
137      */
138     public void process(final Document document, final Writer writer,
139             final ContextMap context) throws IOException {
140         if (toXML) {
141             // copy root element + add collected textual content + serialize
142             final Element inputRoot = document.getRootElement();
143             final Element outputRoot = new DefaultElement(inputRoot.getQName());
144             outputRoot.addText(collectText(inputRoot));
145             final Document result = new DefaultDocument(outputRoot);
146             DOMUtils.writeDocument(result, writer,
147                     (String) context.get(IOUtils.KEY_LOCAL_CHARSET));
148         } else {
149             // just serialize textual content
150             writer.write(collectText(document));
151         }
152     }
153 
154 }