View Javadoc

1   /*
2    * Copyright (C) 2003-2004 Christian Siefkes <christian@siefkes.net>.
3    * Development of this software is supported by the German Research Society,
4    * Berlin-Brandenburg Graduate School in Distributed Information Systems
5    * (DFG grant no. GRK 316).
6    *
7    * This library is free software; you can redistribute it and/or
8    * modify it under the terms of the GNU Lesser General Public
9    * License as published by the Free Software Foundation; either
10   * version 2.1 of the License, or (at your option) any later version.
11   *
12   * This library is distributed in the hope that it will be useful,
13   * but WITHOUT ANY WARRANTY; without even the implied warranty of
14   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15   * Lesser General Public License for more details.
16   *
17   * You should have received a copy of the GNU Lesser General Public
18   * License along with this library; if not, visit
19   * http://www.gnu.org/licenses/lgpl.html or write to the Free Software
20   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
21   */
22  package de.fu_berlin.ties.xml;
23  
24  import java.io.IOException;
25  import java.io.Reader;
26  import java.io.Writer;
27  import java.util.Map;
28  import java.util.regex.Matcher;
29  import java.util.regex.Pattern;
30  
31  import org.apache.commons.lang.builder.ToStringBuilder;
32  
33  import de.fu_berlin.ties.io.IOUtils;
34  import de.fu_berlin.ties.text.TextUtils;
35  
/**
 * "Isolates" XML tags and textual contents in an XML/HTML document by printing
 * each XML/HTML tag on a single line. This class is thread-safe.
 *
 * @author Christian Siefkes
 * @version $Revision: 1.1 $, $Date: 2004/02/02 18:50:16 $, $Author: siefkes $
 */
43  public class TagIsolator {
44  
45      /***
46       * The character class of line separators.
47       */
48      public static final String LINE_SEPARATOR_CLASS = "[//r//n//p{Zl}]";
49  
50      /***
51       * The class of whitespace characters that are <i>not</i> line separators.
52       */
53      public static final String SINGLE_LINE_WHITESPACE = "[ //t//x0B//f//p{Zs}]";
54  
55      /***
56       * Matches one or more line-separator characters, including any surrounding
57       * whitespace.
58       */
59      private static final Pattern LINE_SEP_PATTERN = Pattern.compile(
60          SINGLE_LINE_WHITESPACE + "*"       // optional preceding whitespace
61              + LINE_SEPARATOR_CLASS + "+"   // line-separators
62              + SINGLE_LINE_WHITESPACE + "*" // optional following whitespace
63          );
64  
65      /***
66       * Matches the whole contents of an XML tag (anything between &lt; and
67       * &gt;), optionally followed resp. preceded by a line separator.
68       */
69      private static final Pattern TAG_PATTERN = Pattern.compile(
70          "(" + LINE_SEPARATOR_CLASS + "?)"     // optional preceding line-sep
71          + SINGLE_LINE_WHITESPACE + "*"        // optional preceding whitespace
72          + "(<.*?>)"                           // the actual tag
73          + SINGLE_LINE_WHITESPACE + "*"        // optional following whitespace
74          + "(" + LINE_SEPARATOR_CLASS + "?)", // optional following line-sep
75          Pattern.DOTALL); // "." should also match newlines
76  
77      /***
78       * Creates a new instance.
79       */
80      public TagIsolator() {
81          super();
82      }
83  
84      /***
85       * "Isolates" XML tags and textual contents in an XML document by
86       * printing each XML tag on a single line. This might introduce
87       * additional whitespace, so it should be used with care in situations
88       * where whitespace is significant. Neither reader nor writer are
89       * closed by this method.
90       *
91       * @param reader the reader to read the original XML file from
92       * @param writer the writer to write the modified XML file to
93       * @throws IOException if an I/O error occurs while reading or writing
94       * the data
95       */
96      public final void isolateTags(final Reader reader, final Writer writer)
97              throws IOException {
98          isolateTags(reader, writer, null);
99      }
100 
101 
102     /***
103      * "Isolates" XML tags and textual contents in an XML document by
104      * printing each XML tag on a single line. This might introduce
105      * additional whitespace, so it should be used with care in situations
106      * where whitespace is significant. Neither reader nor writer are
107      * closed by this method.
108      *
109      * @param reader the reader to read the original XML file from
110      * @param writer the writer to write the modified XML file to
111      * @param replacements a map of additional replacements to perform within
112      * the XML tags; maps regular expression {@link java.util.regex.Pattern}s
113      * to replacement {@link java.lang.String}s; might be <code>null</code>
114      * @throws IOException if an I/O error occurs while reading or writing
115      * the data
116      */
117     public final void isolateTags(final Reader reader, final Writer writer,
118             final Map replacements) throws IOException {
119         final String contents = IOUtils.readToString(reader);
120         final Matcher tagMatcher = TAG_PATTERN.matcher(contents);
121         final Matcher lineSepMatcher = LINE_SEP_PATTERN.matcher("");
122 
123         int endOfLastMatch = 0;
124         String precedingLineSep;
125         String singleLineTagContents, finalTagContents;
126         String followingLineSep;
127         String textBetweenTags;
128         boolean textBetweenExists;
129 
130         // iterate over XML tags
131         while (tagMatcher.find()) {
132             // write everything between last and current match
133             textBetweenTags = contents.substring(endOfLastMatch,
134                 tagMatcher.start());
135             textBetweenExists = textBetweenTags.length() > 0;
136             if (textBetweenExists) {
137                 writer.write(textBetweenTags);
138             }
139             endOfLastMatch = tagMatcher.end();
140 
141             // first group: optional preceding line-separator
142             precedingLineSep = tagMatcher.group(1);
143 
144             // insert preceding line-separator, if none exists and if there was
145             // text after the last tag (no need to insert two line-separators)
146             if (precedingLineSep.length() > 0) {
147                 writer.write(precedingLineSep);
148             } else if (textBetweenExists) {
149                 writer.write(TextUtils.LINE_SEPARATOR);
150             }
151 
152             // replace all line-seps in the actual tag by a single whitespace
153             singleLineTagContents = TextUtils.replaceAll(tagMatcher.group(2),
154                 lineSepMatcher, " ");
155 
156             // do additional replacements, if specified
157             if (replacements != null) {
158                 finalTagContents = TextUtils.multipleReplaceAll(
159                     singleLineTagContents, replacements);
160             } else {
161                 finalTagContents = singleLineTagContents;
162             }
163             writer.write(finalTagContents);
164 
165             // third group: optional following line-separator
166             followingLineSep = tagMatcher.group(3);
167 
168             // insert following line-separator, if none exists
169             if (followingLineSep.length() > 0) {
170                 writer.write(followingLineSep);
171             } else {
172                 writer.write(TextUtils.LINE_SEPARATOR);
173             }
174         }
175 
176         // append tail and flush output
177         writer.write(contents.substring(endOfLastMatch));
178         writer.flush();
179     }
180 
181     /***
182      * Returns a string representation of this object.
183      *
184      * @return a textual representation
185      */
186     public String toString() {
187         return new ToStringBuilder(this).toString();
188     }
189 }