1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22 package de.fu_berlin.ties.xml.dom;
23
24 import java.io.IOException;
25 import java.io.Writer;
26
27 import org.dom4j.Branch;
28 import org.dom4j.Document;
29 import org.dom4j.Element;
30 import org.dom4j.tree.DefaultDocument;
31 import org.dom4j.tree.DefaultElement;
32
33 import de.fu_berlin.ties.ContextMap;
34 import de.fu_berlin.ties.DocumentReader;
35 import de.fu_berlin.ties.TiesConfiguration;
36 import de.fu_berlin.ties.io.IOUtils;
37 import de.fu_berlin.ties.text.TextUtils;
38
39 /***
40 * An XML stripper converts a XML document to plain text, removing all markup.
41 * <p>This class is thread-safe and can be used to convert several documents
42 * in parallel.
43 *
44 * @author Christian Siefkes
45 * @version $Revision: 1.11 $, $Date: 2006/10/21 16:04:33 $, $Author: siefkes $
46 */
47 public class XMLStripper extends DocumentReader {
48
49 /***
50 * Whether or not to normalize whitespace.
51 */
52 private final boolean normalizeWS;
53
54 /***
55 * If this is set to <code>true</code>, the output will be an XML document
56 * instead of a plain text, by preserving the root element (all other
57 * elements + attributes are still discarded).
58 */
59 private final boolean toXML;
60
61
62 /***
63 * Creates a new instance, using a default extension and the
64 * {@linkplain TiesConfiguration#CONF standard configuration}.
65 */
66 public XMLStripper() {
67 this("txt");
68 }
69
70 /***
71 * Creates a new instance, using the
72 * {@linkplain TiesConfiguration#CONF standard configuration}.
73 *
74 * @param outExt the extension to use for output files
75 */
76 public XMLStripper(final String outExt) {
77 this(outExt, TiesConfiguration.CONF);
78 }
79
80 /***
81 * Creates a new instance from the provided configuration.
82 *
83 * @param outExt the extension to use for output files
84 * @param config used to configure this instance
85 */
86 public XMLStripper(final String outExt, final TiesConfiguration config) {
87 this(outExt, config.getBoolean("strip.to-xml"),
88 config.getBoolean("strip.normalize"), config);
89 }
90
91 /***
92 * Creates a new instance.
93 *
94 * @param outExt the extension to use for output files
95 * @param stripToXML if this is set to <code>true</code>, the output will
96 * be an XML document instead of a plain text, by preserving the root
97 * element (all other elements + attributes are still discarded)
98 * @param myNormalizeWS whether to normalize whitespace
99 * @param config used to configure superclasses
100 */
101 public XMLStripper(final String outExt, final boolean stripToXML,
102 final boolean myNormalizeWS, final TiesConfiguration config) {
103 super(outExt, config);
104 toXML = stripToXML;
105 normalizeWS = myNormalizeWS;
106 }
107
108
109 /***
110 * Helper method that collects and, if configured, normalizes the textual
111 * content of a document/element.
112 *
113 * @param branch the document/element to collect
114 * @return the collected textual content of the given branch, normalized
115 * if configured
116 */
117 private String collectText(final Branch branch) {
118 final String collectedText = DOMUtils.collectText(branch);
119
120 if (normalizeWS) {
121 return TextUtils.normalize(collectedText);
122 } else {
123 return collectedText;
124 }
125 }
126
127 /***
128 * Strips all markup from an XML document and stores the resulting plain
129 * text. This method just delegates to
130 * {@link DOMUtils#collectText(org.dom4j.Branch, StringBuilder)}.
131 *
132 * @param document the document to read
133 * @param writer the writer to write the resulting plain text to; flushed
134 * but not closed by this method
135 * @param context a map of objects that are made available for processing
136 * @throws IOException if an I/O error occurs
137 */
138 public void process(final Document document, final Writer writer,
139 final ContextMap context) throws IOException {
140 if (toXML) {
141
142 final Element inputRoot = document.getRootElement();
143 final Element outputRoot = new DefaultElement(inputRoot.getQName());
144 outputRoot.addText(collectText(inputRoot));
145 final Document result = new DefaultDocument(outputRoot);
146 DOMUtils.writeDocument(result, writer,
147 (String) context.get(IOUtils.KEY_LOCAL_CHARSET));
148 } else {
149
150 writer.write(collectText(document));
151 }
152 }
153
154 }