1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22 package de.fu_berlin.ties.xml.dom;
23
24 import java.io.IOException;
25 import java.io.Writer;
26
27 import org.dom4j.Document;
28
29 import de.fu_berlin.ties.ContextMap;
30 import de.fu_berlin.ties.DocumentReader;
31 import de.fu_berlin.ties.TiesConfiguration;
32
33 /***
34 * An XML stripper converts a XML document to plain text, removing all markup.
35 * <p>This class is thread-safe and can be used to convert several documents
36 * in parallel.
37 *
38 * @author Christian Siefkes
39 * @version $Revision: 1.4 $, $Date: 2004/08/23 17:11:21 $, $Author: siefkes $
40 */
41 public class XMLStripper extends DocumentReader {
42
43 /***
44 * Creates a new instance, using a default extension and the
45 * {@linkplain TiesConfiguration#CONF standard configuration}.
46 */
47 public XMLStripper() {
48 this("txt");
49 }
50
51 /***
52 * Creates a new instance, using the
53 * {@linkplain TiesConfiguration#CONF standard configuration}.
54 *
55 * @param outExt the extension to use for output files
56 */
57 public XMLStripper(final String outExt) {
58 this(outExt, TiesConfiguration.CONF);
59 }
60
61 /***
62 * Creates a new instance.
63 *
64 * @param outExt the extension to use for output files
65 * @param config used to configure superclasses
66 */
67 public XMLStripper(final String outExt, final TiesConfiguration config) {
68 super(outExt, config);
69 }
70
71 /***
72 * Strips all markup from an XML document and stores the resulting plain
73 * text. This method just delegates to
74 * {@link DOMUtils#collectText(org.dom4j.Branch, StringBuffer)}.
75 *
76 * @param document the document to read
77 * @param writer the writer to write the resulting plain text to; flushed
78 * but not closed by this method
79 * @param context a map of objects that are made available for processing
80 * @throws IOException if an I/O error occurs
81 */
82 public void process(final Document document, final Writer writer,
83 final ContextMap context) throws IOException {
84 DOMUtils.collectText(document, writer);
85 }
86
87 }