View Javadoc

1   /*
2    * Copyright (C) 2004 Christian Siefkes <christian@siefkes.net>.
3    * Development of this software is supported by the German Research Society,
4    * Berlin-Brandenburg Graduate School in Distributed Information Systems
5    * (DFG grant no. GRK 316).
6    *
7    * This library is free software; you can redistribute it and/or
8    * modify it under the terms of the GNU Lesser General Public
9    * License as published by the Free Software Foundation; either
10   * version 2.1 of the License, or (at your option) any later version.
11   *
12   * This library is distributed in the hope that it will be useful,
13   * but WITHOUT ANY WARRANTY; without even the implied warranty of
14   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15   * Lesser General Public License for more details.
16   *
17   * You should have received a copy of the GNU Lesser General Public
18   * License along with this library; if not, visit
19   * http://www.gnu.org/licenses/lgpl.html or write to the Free Software
20   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
21   */
22  package de.fu_berlin.ties.xml;
23  
24  import de.fu_berlin.ties.text.TextTokenizer;
25  
26  /***
27   * Static factory for creating a {@link de.fu_berlin.ties.text.TextTokenizer}s
28   * for XML-like input.
29   *
30   * @author Christian Siefkes
31   * @version $Revision: 1.2 $, $Date: 2004/04/08 16:38:26 $, $Author: siefkes $
32   */
33  public final class XMLTokenizerFactory {
34  
35      /***
36       * Pattern fragment listing allowed whitespace characters in an XML
37       * document. Space, tab, line feed, carriage return, and some other
38       * line-ending characters (according to XML 1.1) are allowed.
39       */
40      public static final String XML_WHITESPACE_CHARS = " //t//n//r\u0085\u2028";
41  
42      /***
43       * Pattern specifying whitespace in an XML document (one or more
44       * {@linkplain #XML_WHITESPACE_CHARS whitespace characters}).
45       */
46      public static final String XML_WHITESPACE =
47          "[" + XML_WHITESPACE_CHARS + "]+";
48  
49      /***
50       * Pattern specifying optional whitespace in an XML document (zero or more
51       * {@linkplain #XML_WHITESPACE_CHARS whitespace characters}).
52       */
53      public static final String XML_OPT_WHITESPACE =
54          "[" + XML_WHITESPACE_CHARS + "]*";
55  
56      /***
57       * Pattern fragment listing valid start characters of XML names (according
58       * to XML 1.1).
59       */
60      private static final String XML_NAME_START_CHARS =
61          ":A-Z_a-z\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u02FF\u0370-\u037D"
62          + "\u037F-\u1FFF\u200C-\u200D\u2070-\u218F\u2C00-\u2FEF\u3001-\uD7FF"
63          + "\uF900-\uFDCF\uFDF0-\uFFFD";
64  
65      /***
66       * Pattern fragment listing valid start characters of XML names that can
67       * only be represented as surrogate pairs in Java: characters "0x10000" to
68       * "0xEFFFF", encoded in the UFT-16 surrogate pairs "0xD800 0xDC00" to
69       * "0xDB7F 0xDFFF".
70       */
71      private static final String XML_NAME_SURROGATE_PAIRS =
72          "[\uD800-\uDB7F][\uDC00-\uDFFF]";
73  
74      /***
75       * Pattern string specifying the class of valid start characters of XML
76       * names.
77       */
78      public static final String XML_NAME_START_CHAR =
79          "(?:[" + XML_NAME_START_CHARS + "]|" + XML_NAME_SURROGATE_PAIRS + ")";
80  
81      /***
82       * Pattern string for XML names (according to XML 1.1).
83       */
84      public static final String XML_NAME = XML_NAME_START_CHAR + "(?:[-"
85          + XML_NAME_START_CHARS + ".0-9\u00B7\u0300-\u036F\u203F-\u2040]|"
86          + XML_NAME_SURROGATE_PAIRS + ")*";
87  
88      /***
89       * Pattern string for strings enclosed in full or half quotes, e.g. XML
90       * attribute values.
91       */
92      public static final String XML_QUOTED_STRING =
93          "(?:\"[^\"]*\"|'[^']*')";
94  
95      /***
96       * Pattern string for the '=' sign, optionally surrounded by whitespace.
97       */
98      public static final String XML_EQUAL_SIGN =
99          XML_OPT_WHITESPACE + "=" + XML_OPT_WHITESPACE;
100 
101     /***
102      * Pattern string specifying an XML attribute (name = quoted-value pair).
103      */
104     public static final String XML_ATTRIBUTE = XML_NAME + XML_EQUAL_SIGN
105         + XML_QUOTED_STRING;
106 
107     /***
108      * Pattern string for a visible textual token in XML documents (contains
109      * neither whitespace nor markup).
110      */
111     public static final String XML_CDATA_TOKEN =
112         "[^<" + XML_WHITESPACE_CHARS + "]+";
113 
114     /***
115      * Pattern string specifying an XML start or empty tag (combined into a
116      * single pattern to avoid unnecessary backtracking).
117      */
118     public static final String XML_START_OR_EMPTY_TAG =
119         "<(" + XML_NAME + ")" + "(?:" + XML_WHITESPACE + XML_ATTRIBUTE + ")*"
120         + XML_OPT_WHITESPACE + "(/)?>";
121 
122     /***
123      * Pattern string specifying an XML end tag.
124      */
125     public static final String XML_END_TAG = "<(/" + XML_NAME
126         + ")" + XML_OPT_WHITESPACE + ">";
127 
128     /***
129      * Helper pattern string used to construct a P.I.
130      */
131     private static final String XML_PI_START = "//?" + XML_NAME;
132 
133     /***
134      * Helper pattern string used to construct a P.I.
135      */
136     private static final String XML_PI_REST = "[^<>]*?//?";
137 
138     /***
139      * Pattern string specifying an XML prolog or processing instruction.
140      */
141     public static final String XML_PROLOG_OR_PI =
142         "<(" + XML_PI_START + ")" + XML_PI_REST + ">";
143 
144     /***
145      * Helper pattern string used to construct a comment.
146      */
147     private static final String XML_COMMENT_START = "!--";
148 
149     /***
150      * Helper pattern string used to construct a comment.
151      */
152     private static final String XML_COMMENT_END = ".*?--";
153 
154     /***
155      * Pattern string specifying an XML comment.
156      */
157     public static final String XML_COMMENT = "(<" + XML_COMMENT_START
158         + ")" + XML_COMMENT_END + ">";
159 
160     /***
161      * Pattern string specifying a PE reference within a doctype declaration.
162      */
163     public static final String PE_REFERENCE = "%" + XML_NAME + ";";
164 
165     /***
166      * Pattern string specifying an markup declaration within a doctype
167      * declaration. A markup declaration either declares an entity, element,
168      * attribute, or notation; or it is a processing instruction or a comment.
169      */
170     public static final String MARKUP_DECL =
171         "<(?:(!?:ELEMENT|ATTLIST|ENTITY|NOTATION)(?:[^\"'>]*"
172             + XML_QUOTED_STRING + ")*[^\"'>]*|" + XML_PI_START + XML_PI_REST
173             + "|" + XML_COMMENT_START + XML_COMMENT_END + ")>";
174 
175     /***
176      * Pattern string specifying an XML document type declaration.
177      */
178     public static final String XML_DOCTYPE =
179         "<(!DOCTYPE)(?:" + XML_WHITESPACE + XML_NAME + ")*(?:" + XML_WHITESPACE
180         + XML_QUOTED_STRING + "){1,2}" + XML_OPT_WHITESPACE + "(?://[(?:"
181         + MARKUP_DECL + "|" + PE_REFERENCE + "|" + XML_WHITESPACE
182         + ")*//])?" + XML_OPT_WHITESPACE + ">";
183 
184     /***
185      * Pattern string specifying a CDATA section in an XML document.
186      */
187     public static final String XML_CDATA_SECTION =
188         "<!(//[CDATA).*?//]//]>";
189 
190     /***
191      * Pattern string specifying textual content (character data) in an XML
192      * document. Starting and trailing whitespace is not included.
193      */
194     public static final String XML_TEXTUAL_CONTENT = "(?:" + XML_CDATA_TOKEN
195         + XML_WHITESPACE + ")*" + XML_CDATA_TOKEN;
196 
197     /***
198      * The array of patterns used for shallow XML parsing.
199      */
200     public static final String[] XML_PATTERNS = {
201         XML_START_OR_EMPTY_TAG, XML_END_TAG,
202         XML_DOCTYPE, XML_PROLOG_OR_PI,
203         XML_COMMENT, XML_CDATA_SECTION, XML_TEXTUAL_CONTENT
204     };
205 
206     /***
207      * Factory method to create an instance for parsing files in XML syntax.
208      * Creates a shallow XML parser that splits XML input into a series of tags
209      * and textual content. The main difference from regular XML parsers is
210      * that tags can occur in any order; nesting constrains are not enforced.
211      * This can be useful to repair XML-like files.
212      *
213      * <p>The type of token returns can be determined by calling
214      * {@link TextTokenizer#capturedText()}:
215      *
216      * <dl>
217      * <dt><em>tagname</em><dt>
218      * <dd>for start tags of type <em>tagname</em></dd>
219      * <dt>/<em>tagname</em><dt>
220      * <dd>for end tags</dd>
221      * <dt><em>tagname/</em>/<dt>
222      * <dd>for empty tags</dd>
223      * <dt>!DOCTYPE<dt>
224      * <dd>for doctype declarations</dd>
225      * <dt>?<em>targetname</em><dt>
226      * <dd>for prolog ("?xml") and processing instructions</dd>
227      * <dt>&lt;!--<dt>
228      * <dd>for comments</dd>
229      * <dt>[CDATA<dt>
230      * <dd>for CDATA section</dd>
231      * <dt>"" (empty string)<dt>
232      * <dd>for textual content (character data)</dd>
233      * </dl>
234      *
235      * <p>Whitespace between tags and before and after textual content can be
236      * retrieved using the {@link TextTokenizer#precedingWhitespace()} method.
237      *
238      * @param text the text to tokenize
239      * @param ensureWhitespace whether to validate whitespace
240      * ({@link TextTokenizer#isWhitespacePatternEnsured()}), throwing an
241      * exception if a document contains serious errors (i.e. an unescaped "&lt;"
242      * within textual content); if <code>false</code>, the caller is responsible
243      * for validating whitespace
244      * @return a new instance suitable for parsing XML
245      */
246     public static TextTokenizer createXMLTokenizer(final CharSequence text,
247             final boolean ensureWhitespace) {
248         final TextTokenizer result = new TextTokenizer(XML_PATTERNS,
249             XML_WHITESPACE, text);
250         result.setWhitespacePatternEnsured(ensureWhitespace);
251         return result;
252     }
253 
254     /***
255      * Private constructor to prevent creation of instances.
256      */
257     private XMLTokenizerFactory() {
258         super();
259     }
260 
261 }