1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22 package de.fu_berlin.ties.xml;
23
24 import java.io.IOException;
25 import java.io.Reader;
26 import java.io.Writer;
27 import java.util.Map;
28 import java.util.regex.Matcher;
29 import java.util.regex.Pattern;
30
31 import org.apache.commons.lang.builder.ToStringBuilder;
32
33 import de.fu_berlin.ties.io.IOUtils;
34 import de.fu_berlin.ties.text.TextUtils;
35
36 /***
37 * "Isolates" XML tags and textual contents in an XML/HTML document by printing
38 * each XML/HTML tag on a single line. This class is thread-safe.
39 *
40 * @author Christian Siefkes
41 * @version $Revision: 1.1 $, $Date: 2004/02/02 18:50:16 $, $Author: siefkes $
42 */
43 public class TagIsolator {
44
45 /***
46 * The character class of line separators.
47 */
48 public static final String LINE_SEPARATOR_CLASS = "[//r//n//p{Zl}]";
49
50 /***
51 * The class of whitespace characters that are <i>not</i> line separators.
52 */
53 public static final String SINGLE_LINE_WHITESPACE = "[ //t//x0B//f//p{Zs}]";
54
55 /***
56 * Matches one or more line-separator characters, including any surrounding
57 * whitespace.
58 */
59 private static final Pattern LINE_SEP_PATTERN = Pattern.compile(
60 SINGLE_LINE_WHITESPACE + "*"
61 + LINE_SEPARATOR_CLASS + "+"
62 + SINGLE_LINE_WHITESPACE + "*"
63 );
64
65 /***
66 * Matches the whole contents of an XML tag (anything between < and
67 * >), optionally followed resp. preceded by a line separator.
68 */
69 private static final Pattern TAG_PATTERN = Pattern.compile(
70 "(" + LINE_SEPARATOR_CLASS + "?)"
71 + SINGLE_LINE_WHITESPACE + "*"
72 + "(<.*?>)"
73 + SINGLE_LINE_WHITESPACE + "*"
74 + "(" + LINE_SEPARATOR_CLASS + "?)",
75 Pattern.DOTALL);
76
77 /***
78 * Creates a new instance.
79 */
80 public TagIsolator() {
81 super();
82 }
83
84 /***
85 * "Isolates" XML tags and textual contents in an XML document by
86 * printing each XML tag on a single line. This might introduce
87 * additional whitespace, so it should be used with care in situations
88 * where whitespace is significant. Neither reader nor writer are
89 * closed by this method.
90 *
91 * @param reader the reader to read the original XML file from
92 * @param writer the writer to write the modified XML file to
93 * @throws IOException if an I/O error occurs while reading or writing
94 * the data
95 */
96 public final void isolateTags(final Reader reader, final Writer writer)
97 throws IOException {
98 isolateTags(reader, writer, null);
99 }
100
101
102 /***
103 * "Isolates" XML tags and textual contents in an XML document by
104 * printing each XML tag on a single line. This might introduce
105 * additional whitespace, so it should be used with care in situations
106 * where whitespace is significant. Neither reader nor writer are
107 * closed by this method.
108 *
109 * @param reader the reader to read the original XML file from
110 * @param writer the writer to write the modified XML file to
111 * @param replacements a map of additional replacements to perform within
112 * the XML tags; maps regular expression {@link java.util.regex.Pattern}s
113 * to replacement {@link java.lang.String}s; might be <code>null</code>
114 * @throws IOException if an I/O error occurs while reading or writing
115 * the data
116 */
117 public final void isolateTags(final Reader reader, final Writer writer,
118 final Map replacements) throws IOException {
119 final String contents = IOUtils.readToString(reader);
120 final Matcher tagMatcher = TAG_PATTERN.matcher(contents);
121 final Matcher lineSepMatcher = LINE_SEP_PATTERN.matcher("");
122
123 int endOfLastMatch = 0;
124 String precedingLineSep;
125 String singleLineTagContents, finalTagContents;
126 String followingLineSep;
127 String textBetweenTags;
128 boolean textBetweenExists;
129
130
131 while (tagMatcher.find()) {
132
133 textBetweenTags = contents.substring(endOfLastMatch,
134 tagMatcher.start());
135 textBetweenExists = textBetweenTags.length() > 0;
136 if (textBetweenExists) {
137 writer.write(textBetweenTags);
138 }
139 endOfLastMatch = tagMatcher.end();
140
141
142 precedingLineSep = tagMatcher.group(1);
143
144
145
146 if (precedingLineSep.length() > 0) {
147 writer.write(precedingLineSep);
148 } else if (textBetweenExists) {
149 writer.write(TextUtils.LINE_SEPARATOR);
150 }
151
152
153 singleLineTagContents = TextUtils.replaceAll(tagMatcher.group(2),
154 lineSepMatcher, " ");
155
156
157 if (replacements != null) {
158 finalTagContents = TextUtils.multipleReplaceAll(
159 singleLineTagContents, replacements);
160 } else {
161 finalTagContents = singleLineTagContents;
162 }
163 writer.write(finalTagContents);
164
165
166 followingLineSep = tagMatcher.group(3);
167
168
169 if (followingLineSep.length() > 0) {
170 writer.write(followingLineSep);
171 } else {
172 writer.write(TextUtils.LINE_SEPARATOR);
173 }
174 }
175
176
177 writer.write(contents.substring(endOfLastMatch));
178 writer.flush();
179 }
180
181 /***
182 * Returns a string representation of this object.
183 *
184 * @return a textual representation
185 */
186 public String toString() {
187 return new ToStringBuilder(this).toString();
188 }
189 }