1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22 package de.fu_berlin.ties.xml;
23
24 import de.fu_berlin.ties.text.TextTokenizer;
25
26 /***
27 * Static factory for creating a {@link de.fu_berlin.ties.text.TextTokenizer}s
28 * for XML-like input.
29 *
30 * @author Christian Siefkes
31 * @version $Revision: 1.6 $, $Date: 2006/10/21 16:04:29 $, $Author: siefkes $
32 */
33 public final class XMLTokenizerFactory {
34
35 /***
36 * Pattern fragment listing allowed whitespace characters in an XML
37 * document. Space, tab, line feed, carriage return, and some other
38 * line-ending characters (according to XML 1.1) are allowed.
39 */
40 public static final String XML_WHITESPACE_CHARS = " //t//n//r\u0085\u2028";
41
42 /***
43 * Pattern specifying whitespace in an XML document (one or more
44 * {@linkplain #XML_WHITESPACE_CHARS whitespace characters}).
45 */
46 public static final String XML_WHITESPACE =
47 "[" + XML_WHITESPACE_CHARS + "]+";
48
49 /***
50 * Pattern specifying optional whitespace in an XML document (zero or more
51 * {@linkplain #XML_WHITESPACE_CHARS whitespace characters}).
52 */
53 public static final String XML_OPT_WHITESPACE =
54 "[" + XML_WHITESPACE_CHARS + "]*";
55
56 /***
57 * Pattern fragment listing valid start characters of XML names (according
58 * to XML 1.1).
59 */
60 private static final String XML_NAME_START_CHARS =
61 ":A-Z_a-z\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u02FF\u0370-\u037D"
62 + "\u037F-\u1FFF\u200C-\u200D\u2070-\u218F\u2C00-\u2FEF\u3001-\uD7FF"
63 + "\uF900-\uFDCF\uFDF0-\uFFFD";
64
65 /***
66 * Pattern fragment listing valid start characters of XML names that can
67 * only be represented as surrogate pairs in Java: characters "0x10000" to
68 * "0xEFFFF", encoded in the UFT-16 surrogate pairs "0xD800 0xDC00" to
69 * "0xDB7F 0xDFFF".
70 */
71 private static final String XML_NAME_SURROGATE_PAIRS =
72 "[\uD800-\uDB7F][\uDC00-\uDFFF]";
73
74 /***
75 * Pattern string specifying the class of valid start characters of XML
76 * names.
77 */
78 public static final String XML_NAME_START_CHAR =
79 "(?:[" + XML_NAME_START_CHARS + "]|" + XML_NAME_SURROGATE_PAIRS + ")";
80
81 /***
82 * Pattern string for XML names (according to XML 1.1).
83 */
84 public static final String XML_NAME = XML_NAME_START_CHAR + "(?:[-"
85 + XML_NAME_START_CHARS + ".0-9\u00B7\u0300-\u036F\u203F-\u2040]|"
86 + XML_NAME_SURROGATE_PAIRS + ")*";
87
88 /***
89 * Pattern string for strings enclosed in full or half quotes, e.g. XML
90 * attribute values.
91 */
92 public static final String XML_QUOTED_STRING =
93 "(?:\"[^\"]*\"|'[^']*')";
94
95 /***
96 * Pattern string for the '=' sign, optionally surrounded by whitespace.
97 */
98 public static final String XML_EQUAL_SIGN =
99 XML_OPT_WHITESPACE + "=" + XML_OPT_WHITESPACE;
100
101 /***
102 * Pattern string specifying an XML attribute (name = quoted-value pair).
103 */
104 public static final String XML_ATTRIBUTE = XML_NAME + XML_EQUAL_SIGN
105 + XML_QUOTED_STRING;
106
107 /***
108 * Pattern string for a visible textual token in XML documents (contains
109 * neither whitespace nor markup).
110 */
111 public static final String XML_CDATA_TOKEN =
112 "[^<" + XML_WHITESPACE_CHARS + "]+";
113
114 /***
115 * Pattern string specifying an XML start or empty tag (combined into a
116 * single pattern to avoid unnecessary backtracking).
117 */
118 public static final String XML_START_OR_EMPTY_TAG =
119 "<(" + XML_NAME + ")" + "(?:" + XML_WHITESPACE + XML_ATTRIBUTE + ")*"
120 + XML_OPT_WHITESPACE + "(/)?>";
121
122 /***
123 * Pattern string specifying an XML end tag.
124 */
125 public static final String XML_END_TAG = "<(/" + XML_NAME
126 + ")" + XML_OPT_WHITESPACE + ">";
127
128 /***
129 * Helper pattern string used to construct a P.I.
130 */
131 private static final String XML_PI_START = "//?" + XML_NAME;
132
133 /***
134 * Helper pattern string used to construct a P.I.
135 */
136 private static final String XML_PI_REST = "[^<>]*?//?";
137
138 /***
139 * Pattern string specifying an XML prolog or processing instruction.
140 */
141 public static final String XML_PROLOG_OR_PI =
142 "<(" + XML_PI_START + ")" + XML_PI_REST + ">";
143
144 /***
145 * Helper pattern string used to construct a comment.
146 */
147 private static final String XML_COMMENT_START = "!--";
148
149 /***
150 * Helper pattern string used to construct a comment.
151 */
152 private static final String XML_COMMENT_END = ".*?--";
153
154 /***
155 * Pattern string specifying an XML comment.
156 */
157 public static final String XML_COMMENT = "(<" + XML_COMMENT_START
158 + ")" + XML_COMMENT_END + ">";
159
160 /***
161 * Pattern string specifying a PE reference within a doctype declaration.
162 */
163 public static final String PE_REFERENCE = "%" + XML_NAME + ";";
164
165 /***
166 * Pattern string specifying an markup declaration within a doctype
167 * declaration. A markup declaration either declares an entity, element,
168 * attribute, or notation; or it is a processing instruction or a comment.
169 */
170 public static final String MARKUP_DECL =
171 "<(?:(!?:ELEMENT|ATTLIST|ENTITY|NOTATION)(?:[^\"'>]*"
172 + XML_QUOTED_STRING + ")*[^\"'>]*|" + XML_PI_START + XML_PI_REST
173 + "|" + XML_COMMENT_START + XML_COMMENT_END + ")>";
174
175 /***
176 * Pattern string specifying an XML document type declaration.
177 */
178 public static final String XML_DOCTYPE =
179 "<(!DOCTYPE)(?:" + XML_WHITESPACE + XML_NAME + ")*(?:" + XML_WHITESPACE
180 + XML_QUOTED_STRING + "){1,2}" + XML_OPT_WHITESPACE + "(?://[(?:"
181 + MARKUP_DECL + "|" + PE_REFERENCE + "|" + XML_WHITESPACE
182 + ")*//])?" + XML_OPT_WHITESPACE + ">";
183
184 /***
185 * Pattern string specifying a CDATA section in an XML document.
186 */
187 public static final String XML_CDATA_SECTION =
188 "<!(//[CDATA).*?//]//]>";
189
190 /***
191 * Pattern string specifying textual content (character data) in an XML
192 * document. Starting and trailing whitespace is not included.
193 */
194 public static final String XML_TEXTUAL_CONTENT = "(?:" + XML_CDATA_TOKEN
195 + XML_WHITESPACE + ")*" + XML_CDATA_TOKEN;
196
197 /***
198 * The array of patterns used for shallow XML parsing.
199 */
200 public static final String[] XML_PATTERNS = {
201 XML_START_OR_EMPTY_TAG, XML_END_TAG,
202 XML_DOCTYPE, XML_PROLOG_OR_PI,
203 XML_COMMENT, XML_CDATA_SECTION, XML_TEXTUAL_CONTENT
204 };
205
206 /***
207 * Factory method to create an instance for parsing files in XML syntax.
208 * Creates a shallow XML parser that splits XML input into a series of tags
209 * and textual content. The main difference from regular XML parsers is
210 * that tags can occur in any order; nesting constrains are not enforced.
211 * This can be useful to repair XML-like files.
212 *
213 * <p>The type of token returns can be determined by calling
214 * {@link TextTokenizer#capturedText()}:
215 *
216 * <dl>
217 * <dt><em>tagname</em><dt>
218 * <dd>for start tags of type <em>tagname</em></dd>
219 * <dt>/<em>tagname</em><dt>
220 * <dd>for end tags</dd>
221 * <dt><em>tagname</em>/<dt>
222 * <dd>for empty tags</dd>
223 * <dt>!DOCTYPE<dt>
224 * <dd>for doctype declarations</dd>
225 * <dt>?<em>targetname</em><dt>
226 * <dd>for prolog ("?xml") and processing instructions</dd>
227 * <dt><!--<dt>
228 * <dd>for comments</dd>
229 * <dt>[CDATA<dt>
230 * <dd>for CDATA section</dd>
231 * <dt>"" (empty string)<dt>
232 * <dd>for textual content (character data)</dd>
233 * </dl>
234 *
235 * <p>Whitespace between tags and before and after textual content can be
236 * retrieved using the {@link TextTokenizer#precedingWhitespace()} method.
237 *
238 * @param text the text to tokenize
239 * @param ensureWhitespace whether to validate whitespace
240 * ({@link TextTokenizer#isWhitespacePatternEnsured()}), throwing an
241 * exception if a document contains serious errors (i.e. an unescaped "<"
242 * within textual content); if <code>false</code>, the caller is responsible
243 * for validating whitespace
244 * @return a new instance suitable for parsing XML
245 */
246 public static TextTokenizer createXMLTokenizer(final CharSequence text,
247 final boolean ensureWhitespace) {
248 final TextTokenizer result = new TextTokenizer(XML_PATTERNS,
249 XML_WHITESPACE, text);
250 result.setWhitespacePatternEnsured(ensureWhitespace);
251 return result;
252 }
253
254 /***
255 * Private constructor to prevent creation of instances.
256 */
257 private XMLTokenizerFactory() {
258 super();
259 }
260
261 }