1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22 package de.fu_berlin.ties.xml;
23
24 import java.io.IOException;
25 import java.io.Reader;
26 import java.io.Writer;
27 import java.util.Set;
28 import java.util.regex.Matcher;
29 import java.util.regex.Pattern;
30
31 import org.apache.commons.lang.StringEscapeUtils;
32 import org.apache.commons.lang.builder.ToStringBuilder;
33
34 import de.fu_berlin.ties.ContextMap;
35 import de.fu_berlin.ties.ParsingException;
36 import de.fu_berlin.ties.TextProcessor;
37 import de.fu_berlin.ties.TiesConfiguration;
38
39 import de.fu_berlin.ties.io.IOUtils;
40 import de.fu_berlin.ties.text.TextTokenizer;
41 import de.fu_berlin.ties.text.TextUtils;
42 import de.fu_berlin.ties.util.CollUtils;
43 import de.fu_berlin.ties.util.Util;
44
45 /***
46 * This class tries to fix corrupt XML documents, especially documents
47 * containing nesting errors. Instances of this class are thread-safe and
48 * can fix several documents in parallel.
49 *
50 * @author Christian Siefkes
51 * @version $Revision: 1.24 $, $Date: 2006/10/21 16:04:29 $, $Author: siefkes $
52 */
53 public class XMLAdjuster extends TextProcessor {
54
55 /***
56 * Configuration key: the name to use for the root element if missing.
57 */
58 public static final String CONFIG_MISSING_ROOT = "adjust.missing-root";
59
60 /***
61 * Configuration key: Set of names of tags that can be converted empty tags
62 * when required.
63 */
64 public static final String CONFIG_EMPTIABLE_TAGS = "adjust.emptiable-tags";
65
66 /***
67 * Configuration key: whether to delete
68 * {@link #CONTROL_CHARS control characters} (which are not allowed in
69 * XML 1.0 and discouraged in XML 1.1).
70 */
71 public static final String CONFIG_DELETE_CONTROL_CHARS =
72 "adjust.delete.control-chars";
73
74 /***
75 * Configuration key: whether to delete "pseudo-tags".
76 */
77 public static final String CONFIG_DELETE_PSEUDO_TAGS =
78 "adjust.delete.pseudo-tags";
79
80 /***
81 * Configuration key: whether to delete trailing garbage (illegal content
82 * that occurs after the root tag has been closed).
83 */
84 public static final String CONFIG_DELETE_TRAILING_GARBAGE =
85 "adjust.delete.trailing-garbage";
86
87 /***
88 * Configuration key: whether to escape "&" starting a possible
89 * nonstandard entity reference ("&" at the start of one of the 5
90 * predefined entity references or a character reference is never escaped,
91 * all other "&" are always escaped).
92 */
93 public static final String CONFIG_ESCAPE_PSEUDO_ENTITIES =
94 "adjust.escape.pseudo-entities";
95
96 /***
97 * Pattern string specifying characters that can occur at the start of end
98 * of an unquoted attribute value: everything except '<', '>', '='
99 * and whitespace (whitespace is also allowed, but only in the middle of a
100 * value). Evaluated lazily (reluctant) to avoid missing the "/>" at
101 * the end of an empty tag.
102 */
103 public static final String UNQUOTED_ATTRIB_CHARS =
104 "[^<>=" + XMLTokenizerFactory.XML_WHITESPACE_CHARS + "]+?";
105
106 /***
107 * Pattern string specifying an XML attribute without proper quotes.
108 * Equal sign and value are captured in groups; the value is matched lazily
109 * (reluctant) so we won't miss the start of the next attribute.
110 */
111 public static final String UNQUOTED_ATTRIBUTE = XMLTokenizerFactory.XML_NAME
112 + XMLTokenizerFactory.XML_OPT_WHITESPACE + "(=)"
113 + XMLTokenizerFactory.XML_OPT_WHITESPACE + "((?:"
114 + UNQUOTED_ATTRIB_CHARS + XMLTokenizerFactory.XML_WHITESPACE + ")*?"
115 + UNQUOTED_ATTRIB_CHARS + ")";
116
117 /***
118 * Pattern specifying of a "lax" XML start or empty tag that can contain
119 * unquoted (invalid) attributes (combined into a single pattern to avoid
120 * unnecessary backtracking). Element name, equal signs and values of
121 * the (last) unquoted attribute, and '/' (for empty tags) are captured.
122 */
123 public static final Pattern LAX_START_OR_EMPTY_TAG = Pattern.compile(
124 "<(" + XMLTokenizerFactory.XML_NAME + ")" + "(?:"
125 + XMLTokenizerFactory.XML_WHITESPACE + "(?:"
126 + XMLTokenizerFactory.XML_ATTRIBUTE + "|" + UNQUOTED_ATTRIBUTE + "))*"
127 + XMLTokenizerFactory.XML_OPT_WHITESPACE + "(/)?>");
128
129 /***
130 * A "&" that is not the start of an predefined entity reference or
131 * a character reference and thus should be escaped if
132 * {@link #isEscapingPseudoEntities()} is <code>true</code>.
133 * (A pattern matching the rest of predefined entity or character reference
134 * is included via negative lookahead.)
135 */
136 public static final Pattern PSEUDO_AMP = Pattern.compile(
137 "&(?!(?:amp|lt|gt|apos|quot|#[0-9]+|#x[0-9a-fA-F]+);)");
138
139 /***
140 * A "&" that is not the start of an entity and thus must be escaped.
141 * (A pattern matching the rest of legal entity or character reference is
142 * included via negative lookahead.)
143 */
144 public static final Pattern SPURIOUS_AMP = Pattern.compile("&(?!(?:"
145 + XMLTokenizerFactory.XML_NAME + "|#[0-9]+|#x[0-9a-fA-F]+);)");
146
147 /***
148 * Escape sequence for the "&" character.
149 */
150 public static final String ESCAPED_AMP = "&";
151
152 /***
153 * Pattern specifying sequences of control characters (character codes
154 * below the space character, except tab, line feed and carriage return).
155 */
156 public static final Pattern CONTROL_CHARS =
157 Pattern.compile("[\u0001-\u0008\u000B-\u000C\u000E-\u001F]+");
158
159 /***
160 * Event constant: Converted to empty tag.
161 */
162 protected static final String EVENT_CONVERTED_TO_EMPTY_TAG =
163 "Converted to empty tag";
164
165 /***
166 * Event constant: Inserted missing end tag.
167 */
168 protected static final String EVENT_INSERTED_MISSING_END_TAG =
169 "Inserted missing end tag";
170
171 /***
172 * Event constant: Inserted missing root element.
173 */
174 protected static final String EVENT_INSERTED_MISSING_ROOT_ELEMENT =
175 "Inserted missing root element";
176
177 /***
178 * Event constant: Inserted missing start tag.
179 */
180 protected static final String EVENT_INSERTED_MISSING_START_TAG =
181 "Inserted missing start tag";
182
183 /***
184 * Event constant: Moved end tag up.
185 */
186 protected static final String EVENT_MOVED_END_TAG_UP =
187 "Moved end tag up";
188
189 /***
190 * Event constant: Moved start tag dow.
191 */
192 protected static final String EVENT_MOVED_START_TAG_DOWN =
193 "Moved start tag down";
194
195 /***
196 * Event constant: Split tag.
197 */
198 protected static final String EVENT_SPLIT_TAG =
199 "Split tag";
200
201 /***
202 * Event constant: Deleted control characters.
203 */
204 protected static final String EVENT_DELETED_CONTROL_CHARS =
205 "Deleted control characters";
206
207 /***
208 * Event constant: Deleted pseudo-tag.
209 */
210 protected static final String EVENT_DELETED_PSEUDO_TAG =
211 "Deleted pseudo-tag";
212
213 /***
214 * Event constant: Escaped characters that are illegal or unwanted.
215 */
216 protected static final String EVENT_ESCAPED_CHARS =
217 "Escaped characters";
218
219 /***
220 * Event constant: Quoted attribute values.
221 */
222 protected static final String EVENT_QUOTED_ATTRIBUTE_VALUES =
223 "Quoted attribute values";
224
225 /***
226 * Event constant: Deleted trailing garbage.
227 */
228 protected static final String EVENT_DELETED_TRAILING_GARBAGE =
229 "Deleted trailing garbage";
230
231 /***
232 * The name to use for the root element if missing. A root element
233 * with this name is created when not all elements and textual
234 * content are inclosed within a single element (the root).
235 * If <code>null</code> and {@link #deletingTrailingGarbage}, processing
236 * stops with an exception if the root element is missing.
237 */
238 private final String missingRootName;
239
240 /***
241 * Whether to delete trailing garbage (illegal content that occurs after
242 * the root tag has been closed).
243 */
244 private final boolean deletingTrailingGarbage;
245
246 /***
247 * Contains the names (Strings) of tags that can be converted an empty tags
248 * when required for fixing a document (e.g. "br" when
249 * <code><br></code> may be converted to <code><br/></code>
250 * during repair). Might be <code>null</code>.
251 */
252 private final Set<String> emptiableTags;
253
254 /***
255 * Whether to delete {@link #CONTROL_CHARS control characters} (which are
256 * not allowed in XML 1.0 and discouraged in XML 1.1).
257 */
258 private final boolean deletingControlChars;
259
260 /***
261 * Whether to delete "pseudo-tags", i.e., sequences that cannot be parsed as
262 * tags but look similar to them. "Pseudo-tags" start with '<' and end
263 * with '>', contain a printable character after the initial '<', and
264 * do not contain any inner '<' or '>'). If <code>true</code>, such
265 * sequences will be deleted; otherwise (default) the starting '<' will
266 * be escaped.
267 */
268 private final boolean deletingPseudoTags;
269
270 /***
271 * Whether to escape "&" starting a possible nonstandard entity
272 * reference ("&" at the start of one of the 5 predefined entity
273 * references or a character reference is never escaped, all other "&"
274 * are always escaped).
275 */
276 private final boolean escapingPseudoEntities;
277
278 /***
279 * Creates a new instance using a default extension and the
280 * {@linkplain TiesConfiguration#CONF standard configuration}.
281 */
282 public XMLAdjuster() {
283 this("xml");
284 }
285
286 /***
287 * Creates a new instance, configured from the
288 * {@linkplain TiesConfiguration#CONF standard configuration}.
289 * @param outExt the extension to use for output files
290 */
291 public XMLAdjuster(final String outExt) {
292 this(outExt, TiesConfiguration.CONF);
293 }
294
295 /***
296 * Creates a new instance from the provided configuration.
297 * @param outExt the extension to use for output files
298 * @param config used to configure this instance
299 */
300 public XMLAdjuster(final String outExt, final TiesConfiguration config) {
301 this(outExt, config.getString(CONFIG_MISSING_ROOT, null),
302 config.containsKey(CONFIG_EMPTIABLE_TAGS)
303 ? CollUtils.arrayAsSet(config.getStringArray(
304 CONFIG_EMPTIABLE_TAGS))
305 : null,
306 config.getBoolean(CONFIG_DELETE_CONTROL_CHARS),
307 config.getBoolean(CONFIG_DELETE_PSEUDO_TAGS),
308 config.getBoolean(CONFIG_DELETE_TRAILING_GARBAGE),
309 config.getBoolean(CONFIG_ESCAPE_PSEUDO_ENTITIES), config);
310 }
311
312 /***
313 * Creates a new instance.
314 *
315 * @param outExt the extension to use for output files
316 * @param missingRoot the name to use for the root element if missing, i.e.
317 * if not all elements and textual content are inclosed within a single
318 * element (the root); if <code>null</code>, processing stops with an
319 * exception if the root element is missing
320 * @param emptiableTagSet contains the names (Strings) of tags that can be
321 * converted an empty tags when required for fixing a document (e.g. "br"
322 * when <code><br></code> may be converted to
323 * <code><br/></code> during repair); might be <code>null</code> if
324 * there are none
325 * @param deleteControlChars whether to delete
326 * {@link #CONTROL_CHARS control characters} (which are not allowed in XML
327 * 1.0 and discouraged in XML 1.1)
328 * @param deletePseudoTags whether to
329 * {@linkplain #isDeletingPseudoTags() delete "pseudo-tags"}
330 * @param deleteTrailingGarbage whether to delete trailing garbage
331 * (illegal content that occurs after the root tag has been closed)
332 * @param escapePseudoEntities whether to escape "&" starting a possible
333 * nonstandard entity reference ("&" at the start of one of the 5
334 * predefined entity references or a character reference is never escaped,
335 * all other "&" are always escaped)
336 * @param config used to configure superclasses; if <code>null</code>,
337 * the {@linkplain TiesConfiguration#CONF standard configuration} is used
338 */
339 public XMLAdjuster(final String outExt, final String missingRoot,
340 final Set<String> emptiableTagSet, final boolean deleteControlChars,
341 final boolean deletePseudoTags,
342 final boolean deleteTrailingGarbage,
343 final boolean escapePseudoEntities,
344 final TiesConfiguration config) {
345 super(outExt, config);
346 missingRootName = missingRoot;
347 emptiableTags = emptiableTagSet;
348 deletingControlChars = deleteControlChars;
349 deletingPseudoTags = deletePseudoTags;
350 deletingTrailingGarbage = deleteTrailingGarbage;
351 escapingPseudoEntities = escapePseudoEntities;
352 }
353
354 /***
355 * Tries to fix corrupt XML documents, especially documents
356 * containing nesting errors. Delegates to
357 * {@link #fixedConstituents(CharSequence)} and writes the result to the
358 * specified writer.
359 *
360 * @param input the corrupt XML document
361 * @param out the writer to write the corrected XML document to; flushed but
362 * not closed by this method
363 * @throws IOException if an I/O error occurs while using the writer
364 * @throws ParsingException if the XML input contains an uncorrectable error
365 */
366 public final void adjust(final CharSequence input, final Writer out)
367 throws IOException, ParsingException {
368 final XMLConstituent firstConst = fixedConstituents(input);
369
370
371 XMLConstituent currentConst = firstConst;
372 while (currentConst != null) {
373 out.write(currentConst.getRepresentantion());
374 currentConst = currentConst.nextConstituent();
375 }
376 out.flush();
377 }
378
379 /***
380 * Tries to fix corrupt XML documents, especially documents containing
381 * nesting errors. Delegates to {@link #adjust(CharSequence, Writer)}.
382 *
383 * @param in the reader to read the corrupt XML document from; not closed
384 * by this method
385 * @param out the writer to write the corrected XML document to; flushed but
386 * not closed by this method
387 * @throws IOException if an I/O error occurs while using the reader or
388 * writer
389 * @throws ParsingException if the XML input contains an uncorrectable error
390 */
391 public final void adjust(final Reader in, final Writer out)
392 throws IOException, ParsingException {
393 final String input = IOUtils.readToString(in);
394 adjust(input, out);
395 }
396
397 /***
398 * Method called by the {@link #logEvent(String, String)} methods whenever
399 * an event occurred to ensure the event is acceptable. Subclasses that want
400 * to prevent certain events can overwrite this method and throw an
401 * exception if an "illegal" event is encountered.
402 * This implementation does nothing, letting all events pass.
403 *
404 * @param eventType the event that occurred; should be one of the
405 * EVENT constants defined in this class.
406 * @throws ParsingException could be thrown by subclasses if the event is
407 * considered illicit
408 */
409 protected void checkEvent(final String eventType) throws ParsingException {
410
411 }
412
413 /***
414 * Helper method called after completing an end tag with a missing start tag
415 * or processing a tentative start tag to check whether the next appearance
416 * of this tag type is a start tag or another end tag.
417 * The second case means that another start tag is missing
418 * -- the missing start tag is created as a tentative tag and inserted.
419 *
420 * @param endTag the end tag whose next appearance should be checked
421 * @param insertAfter the tag after which the newly created tentative
422 * tag should be inserted
423 * @param unprocessedTags the container of unprocessed start and end tags
424 * @throws ParsingException might be thrown by {@link #checkEvent(String)}
425 * implementations in subclasses if an "illicit" event occurred
426 */
427 private void checkNextAppearance(final TagConstituent endTag,
428 final TagConstituent insertAfter,
429 final UnprocessedTags unprocessedTags) throws ParsingException {
430
431
432 if (correspondingEndTag(endTag.getName(), -1, unprocessedTags,
433 false) != null) {
434
435
436 final TagConstituent tentative = new TagConstituent(
437 TagConstituent.START_TAG, endTag.getName(),
438 insertAfter.getMarkupSeriesNo());
439 tentative.setVariety(TagVariety.TENTATIVE);
440
441
442 XMLConstituent nextConst = insertAfter.nextConstituent();
443 if (nextConst.getType() == OtherConstituent.OUTER_WHITESPACE) {
444 nextConst.insertAfter(tentative);
445 } else {
446 insertAfter.insertAfter(tentative);
447 }
448
449
450 unprocessedTags.push(tentative, false);
451 logEvent(EVENT_INSERTED_MISSING_START_TAG, tentative);
452 }
453 }
454
455 /***
456 * Helper method that creates an constituent that is part of a markup series
457 * (i.e., that is neither text nor a CDATA section).
458 *
459 * @param currentToken the string representation of the constituent
460 * @param capturedText the text captured by the XML text tokenizer, used to
461 * determine the token type
462 * @param markupSeriesNo the number of the markup series of the constituent
463 * @param startAndEndTags all start and end tags are added to this
464 * container, if it isn't <code>null</code>
465 * @return the created constituent
466 */
467 private XMLConstituent createMarkupConstituent(final String currentToken,
468 final String capturedText, final int markupSeriesNo,
469 final UnprocessedTags startAndEndTags) {
470
471 final char firstChar = capturedText.charAt(0);
472 final char lastChar = capturedText.charAt(capturedText.length() - 1);
473 final XMLConstituent result;
474
475 if (firstChar == '/') {
476 result = new TagConstituent(TagConstituent.END_TAG,
477 capturedText.substring(1), currentToken, markupSeriesNo);
478 if (startAndEndTags != null) {
479 startAndEndTags.push((TagConstituent) result);
480 }
481 } else if (lastChar == '/') {
482 result = new TagConstituent(TagConstituent.EMPTY_TAG,
483 capturedText.substring(0, capturedText.length() - 1),
484 currentToken, markupSeriesNo);
485 } else if (firstChar == '!') {
486 result = new OtherConstituent(OtherConstituent.DOCTYPE,
487 currentToken);
488 } else if (firstChar == '?') {
489
490 if (capturedText.substring(1).equals("xml")) {
491 result = new OtherConstituent(OtherConstituent.XML_PROLOG,
492 currentToken);
493 } else {
494 result = new OtherConstituent(OtherConstituent.PI,
495 currentToken);
496 }
497 } else if (firstChar == '<') {
498 result = new OtherConstituent(OtherConstituent.COMMENT,
499 currentToken);
500 } else {
501 result = new TagConstituent(TagConstituent.START_TAG, capturedText,
502 currentToken, markupSeriesNo);
503 if (startAndEndTags != null) {
504 startAndEndTags.push((TagConstituent) result);
505 }
506 }
507 return result;
508 }
509
510 /***
511 * Checks whether <code>unprocessedTags</code> contains a
512 * matching tag within the specified <code>markupSeriesNo</code>, not
513 * preceeded by a matching start tag. Returns the found end tag, if it
514 * exists; returns <code>null</code> otherwise.
515 *
516 * @param tagname the name of the tag to check
517 * @param markupSeriesNo the markup series number to match;
518 * or <code>-1</code> if the markup series number doesn't matter
519 * @param unprocessedTags the container of start and end tags to check
520 * @param remove whether to remove the found end tag from the
521 * <code>unprocessedTags</code> container
522 * @return the found corresponding end tag,
523 * or <code>null</code> if no matching appearance was found
524 */
525 private TagConstituent correspondingEndTag(final String tagname,
526 final int markupSeriesNo, final UnprocessedTags unprocessedTags,
527 final boolean remove) {
528 final TagConstituent result;
529 final TagConstituent suitableTag;
530
531 if (markupSeriesNo >= 0) {
532 suitableTag =
533 unprocessedTags.findInSeries(tagname, markupSeriesNo, true);
534 } else {
535 suitableTag = unprocessedTags.findFirst(tagname);
536 }
537
538
539 if ((suitableTag != null)
540 && (suitableTag.getType() == TagConstituent.END_TAG)) {
541
542 result = suitableTag;
543 if (remove) {
544 unprocessedTags.forceRemove(suitableTag);
545 }
546 } else {
547
548 result = null;
549 }
550
551 return result;
552 }
553
554 /***
555 * Checks whether <code>openTags</code> contains a matching start tag
556 * (ignoring the root tag), either within the specified
557 * <code>markupSeriesNo</code> or a tentative tag anywhere.
558 * Returns and remove the found end tag, if it exists.
559 * Returns <code>null</code> otherwise.
560 *
561 * @param tagname the name of the tag to check
562 * @param markupSeriesNo the markup series number to match
563 * @param openTags the container of currently open start tags to check
564 * @return the found and removed matching start tag,
565 * or <code>null</code> if no matching non-root appearance was found
566 */
567 private TagConstituent correspondingOpenTag(final String tagname,
568 final int markupSeriesNo, final OpenTags openTags) {
569
570 TagConstituent result =
571 openTags.findInSeries(tagname, markupSeriesNo, false);
572
573
574 if (result == null) {
575 result = openTags.findTentativeTag(tagname);
576 }
577
578 if (result != null) {
579 if (openTags.isRoot(result)) {
580
581 result = null;
582 } else {
583
584 openTags.forceRemove(result);
585 }
586 }
587 return result;
588 }
589
590 /***
591 * Tries to fix corrupt XML documents, especially documents containing
592 * nesting errors. Delegates to {@link #adjust(Reader, Writer)}, ignoring
593 * the <code>context</code>.
594 *
595 * @param reader reader containing the text to process; should not be closed
596 * by this method
597 * @param writer the writer to write the processed text to; might be flushed
598 * but not closed by this method
599 * @param context a map of objects that are made available for processing
600 * @throws IOException if an I/O error occurs while using the reader or
601 * writer
602 * @throws ParsingException if the XML input contains an uncorrectable error
603 */
604 protected void doProcess(final Reader reader, final Writer writer,
605 final ContextMap context) throws IOException, ParsingException {
606 adjust(reader, writer);
607 }
608
609 /***
610 * Helper method that checks whether an end tag is certain to be missing for
611 * a specified open tag. This is the case if (a) the next appearance of
612 * this tag type is NOT an end tag and if (b) there are at least as much
613 * unprocessed start tags than unprocessed end tags for the given type.
614 *
615 * @param openTagName the name of the open tag check
616 * @param unprocessedTags the container of unprocessed start and end tags
617 * @return <code>true</code> iff both conditions stated above are fulfulled
618 */
619 private boolean endTagMissing(final String openTagName,
620 final UnprocessedTags unprocessedTags) {
621
622 final boolean result = (correspondingEndTag(openTagName, -1,
623 unprocessedTags, false) == null)
624
625 && (unprocessedTags.startTagCount(openTagName)
626 >= unprocessedTags.endTagCount(openTagName));
627 return result;
628 }
629
630 /***
631 * Escapes every of a specified pattern in the representation of a
632 * constituent.
633 *
634 * @param constituent the constituent to check
635 * @param pattern the pattern to replace
636 * @param replacement the replacement text
637 * @param eventType the event to log when replacing was necessary
638 * @throws ParsingException might be thrown by {@link #checkEvent(String)}
639 * implementations in subclasses if an "illicit" event occurred
640 */
641 private void replaceInRep(final XMLConstituent constituent,
642 final Pattern pattern, final String replacement,
643 final String eventType) throws ParsingException {
644 final String oldRep = constituent.getRepresentantion();
645 final String newRep =
646 TextUtils.replaceAll(oldRep, pattern, replacement);
647
648 if (oldRep != newRep) {
649
650 logEvent(eventType, "old: '" + oldRep + "', new: '" + newRep + "'");
651 constituent.setRepresentantion(newRep);
652 }
653 }
654
655 /***
656 * Returns the constituents of an XML-like document after fixing possible
657 * nesting errors etc.
658 *
659 * @param input the XML-like input data
660 * @return a reference to the first contained constituent; the list
661 * of constituents can be traversed by calling
662 * {@link XMLConstituent#nextConstituent()} on each constituent untill
663 * <code>null</code> is returned
664 * @throws ParsingException if the XML input contains an uncorrectable error
665 */
666 public final XMLConstituent fixedConstituents(final CharSequence input)
667 throws ParsingException {
668
669
670 final UnprocessedTags unprocessedTags = new UnprocessedTags();
671 XMLConstituent firstConst =
672 rawConstituents(input, true, unprocessedTags);
673
674
675 XMLConstituent currentConst = firstConst;
676 XMLConstituent lastConst = currentConst;
677 TagConstituent currentTag;
678 final OpenTags openTags = new OpenTags();
679
680
681
682 XMLConstituent firstRootContent = null;
683 boolean insertedMissingRoot = false;
684 boolean skipConst;
685
686 while (currentConst != null) {
687 skipConst = false;
688
689 if ((firstRootContent == null) && isRootContent(currentConst)) {
690
691 firstRootContent = currentConst;
692
693 if (currentConst.getType() != TagConstituent.START_TAG) {
694
695 insertMissingRoot(firstRootContent, openTags);
696 insertedMissingRoot = true;
697 }
698 } else if (openTags.isEmpty() && (isRootContent(currentConst))) {
699
700 if (deletingTrailingGarbage) {
701
702 logEvent(EVENT_DELETED_TRAILING_GARBAGE,
703 currentConst.toString());
704 skipConst = true;
705 final XMLConstituent constToRemove = currentConst;
706 currentConst = currentConst.nextConstituent();
707
708 if (constToRemove instanceof TagConstituent) {
709 unprocessedTags.remove((TagConstituent) constToRemove);
710 }
711 constToRemove.remove();
712 } else {
713
714 insertMissingRoot(firstRootContent, openTags);
715 insertedMissingRoot = true;
716 }
717 }
718
719 if (!skipConst) {
720 if (currentConst instanceof TagConstituent) {
721 currentTag = (TagConstituent) currentConst;
722
723 if (currentTag.getType() == TagConstituent.START_TAG) {
724
725 unprocessedTags.forceRemove(currentTag);
726 openTags.push(currentTag);
727 } else if (currentTag.getType() == TagConstituent.END_TAG) {
728
729 unprocessedTags.forceRemove(currentTag);
730 handleEndTag(currentTag, openTags, unprocessedTags);
731 }
732 }
733
734 lastConst = currentConst;
735 currentConst = currentConst.nextConstituent();
736 }
737 }
738
739
740 handleEOF(lastConst, openTags, unprocessedTags, insertedMissingRoot);
741
742
743
744 while (firstConst.hasPrevious()) {
745 firstConst = firstConst.previousConstituent();
746 }
747 return firstConst;
748 }
749
750 /***
751 * Helper method that fixes character errors in the representation of a
752 * constituent, if any.
753 *
754 * @param constituent the constituent to check
755 * @throws ParsingException might be thrown by {@link #checkEvent(String)}
756 * implementations in subclasses if an "illicit" event occurred
757 */
758 private void fixRepresentation(final XMLConstituent constituent)
759 throws ParsingException {
760
761 if (deletingControlChars) {
762 replaceInRep(constituent, CONTROL_CHARS, "",
763 EVENT_DELETED_CONTROL_CHARS);
764 }
765
766 if (needsAmpEscape(constituent)) {
767 if (escapingPseudoEntities) {
768
769
770 replaceInRep(constituent, PSEUDO_AMP, ESCAPED_AMP,
771 EVENT_ESCAPED_CHARS);
772 } else {
773
774 replaceInRep(constituent, SPURIOUS_AMP, ESCAPED_AMP,
775 EVENT_ESCAPED_CHARS);
776 }
777 }
778 }
779
780 /***
781 * Helper method for handling an end tag. Here the actual work of the
782 * algorithm takes place, because we have to ensure a match.
783 *
784 * @param endTag the end tag to handle
785 * @param openTags must contain all currently open tags
786 * @param unprocessedTags must contain all unprocessed start and end tags
787 * @throws ParsingException might be thrown by {@link #checkEvent(String)}
788 * implementations in subclasses if an "illicit" event occurred
789 */
790 protected void handleEndTag(final TagConstituent endTag,
791 final OpenTags openTags, final UnprocessedTags unprocessedTags)
792 throws ParsingException {
793 TagConstituent lastOpenTag;
794 TagVariety lastOpenTagVariety;
795 String lastOpenTagName;
796 TagConstituent correspondingTag;
797 boolean matchFound = false;
798
799
800 while (!matchFound) {
801 lastOpenTag = openTags.peek();
802 lastOpenTagVariety = lastOpenTag.getVariety();
803 lastOpenTagName = lastOpenTag.getName();
804
805
806 if (lastOpenTagName.equals(endTag.getName())) {
807
808
809 if (openTags.popAndRegularize()
810 && (lastOpenTagVariety == TagVariety.TENTATIVE)) {
811 checkNextAppearance(endTag, endTag, unprocessedTags);
812 }
813 matchFound = true;
814 } else if (lastOpenTagVariety == TagVariety.TENTATIVE) {
815
816
817 moveTentativeTag(lastOpenTag, endTag, openTags,
818 unprocessedTags);
819
820 } else if (openTags.containsNonTentative(endTag.getName())
821 && (correspondingTag = correspondingEndTag(
822 lastOpenTagName, endTag.getMarkupSeriesNo(),
823 unprocessedTags, true)) != null) {
824
825
826
827
828 openTags.popAndRegularize();
829
830
831 correspondingTag.remove();
832 endTag.insertBefore(correspondingTag);
833
834
835
836 if (lastOpenTagVariety == TagVariety.TENTATIVE) {
837 checkNextAppearance(correspondingTag, endTag,
838 unprocessedTags);
839 }
840
841
842 logEvent(EVENT_MOVED_END_TAG_UP, correspondingTag);
843 } else if ((correspondingTag = correspondingOpenTag(
844 endTag.getName(), lastOpenTag.getMarkupSeriesNo(),
845 openTags)) != null) {
846
847
848
849
850 correspondingTag.remove();
851 lastOpenTag.insertAfter(correspondingTag);
852
853 if (correspondingTag.getVariety() == TagVariety.REGULAR) {
854
855 logEvent(EVENT_MOVED_START_TAG_DOWN, correspondingTag);
856 } else {
857 if (correspondingTag.getVariety() == TagVariety.TENTATIVE) {
858
859
860
861 checkNextAppearance(endTag, endTag, unprocessedTags);
862 }
863
864
865 correspondingTag.setVariety(TagVariety.REGULAR);
866
867 }
868 matchFound = true;
869 } else if (!openTags.contains(endTag.getName())) {
870
871
872 final TagConstituent missingStartTag =
873 new TagConstituent(TagConstituent.START_TAG,
874 endTag.getName(), lastOpenTag.getMarkupSeriesNo());
875
876
877 lastOpenTag.insertAfter(missingStartTag);
878 logEvent(EVENT_INSERTED_MISSING_START_TAG, missingStartTag);
879
880
881
882 checkNextAppearance(endTag, endTag, unprocessedTags);
883 matchFound = true;
884 } else if ((lastOpenTag.getMarkupSeriesNo()
885 == endTag.getMarkupSeriesNo())
886 && !endTagMissing(lastOpenTagName, unprocessedTags)) {
887
888
889
890
891 openTags.forceRemove(lastOpenTag);
892 lastOpenTag.remove();
893 endTag.insertAfter(lastOpenTag);
894
895
896 unprocessedTags.push(lastOpenTag, false);
897
898
899 if (lastOpenTagVariety == TagVariety.REGULAR) {
900 logEvent(EVENT_MOVED_START_TAG_DOWN, lastOpenTag);
901 }
902
903 } else {
904
905 final boolean endTagMissing =
906 endTagMissing(lastOpenTagName, unprocessedTags);
907
908 if (endTagMissing && isAnEmptiableTag(lastOpenTagName)) {
909
910 final TagConstituent newEmptyTag =
911 replaceWithEmptyCopy(lastOpenTag);
912 logEvent(EVENT_CONVERTED_TO_EMPTY_TAG, newEmptyTag);
913 } else {
914
915
916 final TagConstituent matchingEndTag = new TagConstituent(
917 TagConstituent.END_TAG, lastOpenTagName,
918 endTag.getMarkupSeriesNo());
919 endTag.insertBefore(matchingEndTag);
920
921 if (endTagMissing) {
922
923
924 if (lastOpenTagVariety == TagVariety.REGULAR) {
925 logEvent(EVENT_INSERTED_MISSING_END_TAG,
926 matchingEndTag);
927 }
928 } else {
929
930
931 final TagConstituent continuation = new TagConstituent(
932 TagConstituent.START_TAG, lastOpenTagName,
933 lastOpenTag.getRepresentantion(),
934 endTag.getMarkupSeriesNo());
935 continuation.setVariety(TagVariety.CONTINUATION);
936
937
938 endTag.insertAfter(continuation);
939 unprocessedTags.push(continuation, false);
940 logEvent(EVENT_SPLIT_TAG, lastOpenTag);
941 }
942
943
944 openTags.popAndRegularize();
945 }
946 }
947 }
948 }
949
950 /***
951 * Helper method for handling an the end of a file. Suitable end tags are
952 * created to close any left-over open tags.
953 *
954 * @param lastConst the last constituent in the original input (must not
955 * have a successor)
956 * @param openTags must contain all currently open tags
957 * @param unprocessedTags must contain all unprocessed start and end tags;
958 * should better be empty
959 * @param insertedMissingRoot whether the start tag of a missing root
960 * element was created (in this case we'll insert the corresponding end tag
961 * without logging another event)
962 * @throws ParsingException might be thrown by {@link #checkEvent(String)}
963 * implementations in subclasses if an "illicit" event occurred
964 */
965 protected void handleEOF(final XMLConstituent lastConst,
966 final OpenTags openTags, final UnprocessedTags unprocessedTags,
967 final boolean insertedMissingRoot) throws ParsingException {
968
969 if (lastConst.nextConstituent() != null) {
970 Util.LOG.error("Implementation error: constituents after last "
971 + "constituent: " + lastConst);
972 }
973 if (!unprocessedTags.isEmpty()) {
974 Util.LOG.error("Implementation error: still unprocessed tags at "
975 + "end-of-file -- last constituent: " + lastConst
976 + " unprocessed tags: " + unprocessedTags);
977 }
978
979 XMLConstituent currentConst = lastConst;
980 TagConstituent lastOpenTag = openTags.peek();
981
982 if (lastOpenTag != null) {
983
984 while (!isRootContent(currentConst)) {
985 currentConst = currentConst.previousConstituent();
986 }
987
988
989 TagConstituent missingEndTag;
990 while (lastOpenTag != null) {
991
992 missingEndTag = new TagConstituent(TagConstituent.END_TAG,
993 lastOpenTag.getName());
994
995
996 currentConst.insertAfter(missingEndTag);
997 currentConst = missingEndTag;
998
999 openTags.pop();
1000 lastOpenTag = openTags.peek();
1001
1002
1003 if (!(insertedMissingRoot && openTags.isEmpty())) {
1004 logEvent(EVENT_INSERTED_MISSING_END_TAG, missingEndTag);
1005 }
1006 }
1007 }
1008 }
1009
1010 /***
1011 * Logs the occurance of an event necessary for fixing a document.
1012 * This method variant is used for character errors.
1013 *
1014 * @param eventType the event that occurred; should be one of the
1015 * EVENT constants defined in this class.
1016 * @param details a detailed description of the event
1017 * @throws ParsingException might be thrown by {@link #checkEvent(String)}
1018 * implementations in subclasses if the event is considered illicit
1019 */
1020 protected void logEvent(final String eventType, final String details)
1021 throws ParsingException {
1022 checkEvent(eventType);
1023 Util.LOG.debug("Modified document: " + eventType
1024 + " (" + details + ')');
1025 }
1026
1027 /***
1028 * Logs the occurance of an event necessary for fixing a document.
1029 * This method variant is used for nesting errors and missing root elements.
1030 *
1031 * @param eventType the event that occurred; should be one of the
1032 * EVENT constants defined in this class.
1033 * @param tag the involved tag
1034 * @throws ParsingException might be thrown by {@link #checkEvent(String)}
1035 * implementations in subclasses if the event is considered illicit
1036 */
1037 protected void logEvent(final String eventType, final TagConstituent tag)
1038 throws ParsingException {
1039 checkEvent(eventType);
1040 Util.LOG.debug("Modified document: " + eventType
1041 + " (" + tag.toString() + ')');
1042 }
1043
1044 /***
1045 * Inserts a root element with the configured name
1046 * ({@link #missingRootName}); or throws a parsing exception to finish
1047 * processing if inserting a root element is not allowed. This method
1048 * only creates and inserts a start tag of the configured type; the
1049 * matching end tag will automatically be added by
1050 * {@link #handleEOF(XMLConstituent, OpenTags, UnprocessedTags, boolean)}.
1051 *
1052 * @param firstRootContent the root start will be inserted before this
1053 * content
1054 * @param openTags the container of open tags to push the root tag to,
1055 * assumed to be empty
1056 * @throws ParsingException if inserting a missing root tag is now allowed
1057 * (no name for a missing root tag specified)
1058 */
1059 private void insertMissingRoot(final XMLConstituent firstRootContent,
1060 final OpenTags openTags) throws ParsingException {
1061 if (missingRootName != null) {
1062
1063 final TagConstituent rootStartTag = new TagConstituent(
1064 TagConstituent.START_TAG, missingRootName, 0);
1065
1066
1067 firstRootContent.insertBefore(rootStartTag);
1068
1069
1070 openTags.push(rootStartTag);
1071 logEvent(EVENT_INSERTED_MISSING_ROOT_ELEMENT, rootStartTag);
1072 } else {
1073 throw new ParsingException("Root tag missing (resp. tags or "
1074 + "textual content outside root element)");
1075 }
1076 }
1077
1078 /***
1079 * Whether the specified tag is one of the tags that can be converted an
1080 * empty tags when required for fixing a document. For example, "br" when
1081 * <code><br></code> may be converted to <code><br/></code>
1082 * during repair.
1083 *
1084 * @param tag the name of the tag to look up
1085 * @return <code>true</code> iff this tag is contained in the set of
1086 * emptiable tags
1087 */
1088 protected boolean isAnEmptiableTag(final String tag) {
1089 return (emptiableTags != null) && emptiableTags.contains(tag);
1090 }
1091
1092 /***
1093 * Whether {@link #CONTROL_CHARS control characters} are deleted (these
1094 * characters are not allowed in XML 1.0 and discouraged in XML 1.1).
1095 * @return <code>true</code> iff control characters are deleted
1096 */
1097 public boolean isDeletingControlChars() {
1098 return deletingControlChars;
1099 }
1100
1101 /***
1102 * Whether "pseudo-tags" are deleted, i.e., sequences that cannot be parsed
1103 * as tags but look similar to them. "Pseudo-tags" start with '<' and end
1104 * with '>', contain a printable character after the initial '<', and
1105 * do not contain any inner '<' or '>'). Disabled by default.
1106 *
1107 * @return <code>true</code> iff pseudo-tags are be deleted (otherwise the
1108 * starting '<' is escaped)
1109 */
1110 public boolean isDeletingPseudoTags() {
1111 return deletingPseudoTags;
1112 }
1113
1114 /***
1115 * Whether to escape "&" starting a possible nonstandard entity
1116 * reference ("&" at the start of one of the 5 predefined entity
1117 * references or a character reference is never escaped, all other "&"
1118 * are always escaped).
1119 *
1120 * @return <code>true</code> iff "pseudo entites" are escaped
1121 */
1122 public boolean isEscapingPseudoEntities() {
1123 return escapingPseudoEntities;
1124 }
1125
1126 /***
1127 * Checks whether a constituent is content that must be enclosed in the
1128 * root element. Tags, textual content, and CDATA sections are root
1129 * content; other constituent types can occur outside of the root element.
1130 *
1131 * @param constituent the constituent to check
1132 * @return <code>true</code> iff the constituent is root content
1133 */
1134 private boolean isRootContent(final XMLConstituent constituent) {
1135 final boolean result;
1136 if (constituent instanceof TagConstituent) {
1137
1138 result = true;
1139 } else {
1140 if ((constituent.getType() == OtherConstituent.CDATA_SECTION)
1141 || (constituent.getType() == OtherConstituent.TEXT)) {
1142
1143 result = true;
1144 } else {
1145 result = false;
1146 }
1147 }
1148 return result;
1149 }
1150
1151 /***
1152 * Helper method that moves a tentative tag after a specified end tag.
1153 * The tentative tag must originally be contained in the container of open
1154 * tags; it is popped from this container and re-added to the container of
1155 * unprocessed tags.
1156 *
1157 * @param tentativeTag the tag to move, should be a tentative tag
1158 * @param endTag the tentative tag is moved after this tag
1159 * @param openTags must contain all currently open tags, including the tag
1160 * to move
1161 * @param unprocessedTags must contain all unprocessed start and end tags
1162 */
1163 private void moveTentativeTag(final TagConstituent tentativeTag,
1164 final TagConstituent endTag, final OpenTags openTags,
1165 final UnprocessedTags unprocessedTags) {
1166
1167 openTags.forceRemove(tentativeTag);
1168 tentativeTag.remove();
1169
1170
1171 tentativeTag.setMarkupSeriesNo(endTag.getMarkupSeriesNo());
1172 endTag.insertAfter(tentativeTag);
1173
1174
1175 unprocessedTags.push(tentativeTag, false);
1176 }
1177
1178 /***
1179 * Checks whether escapes for the "&" character are required in the
1180 * representation of a constituent.
1181 *
1182 * @param constituent the constituent to check
1183 * @return <code>true</code> if the {@linkplain XMLConstituent#getType()
1184 * type} of the constituent is {@link TagConstituent#START_TAG},
1185 * {@link TagConstituent#EMPTY_TAG}, or {@link OtherConstituent#TEXT};
1186 * <code>false</code> for all other types (which either cannot contain any
1187 * "&" or do not need to escape "&")
1188 */
1189 private boolean needsAmpEscape(final XMLConstituent constituent) {
1190 final short constType = constituent.getType();
1191 return (constType == TagConstituent.START_TAG)
1192 || (constType == TagConstituent.EMPTY_TAG)
1193 || (constType == OtherConstituent.TEXT);
1194 }
1195
1196 /***
1197 * Returns the raw constituents of an XML-like document. The constituents
1198 * are returned "raw" as they occur in the input, without fixing possible
1199 * nesting errors etc.
1200 *
1201 * @param input the XML-like input data
1202 * @param fixCharacterErrors whether to try to fix character errors, i.e.
1203 * unescaped "<" and "&" and tags with unquoted attribute values; if
1204 * <code>false</code>, unescaped "<" in textual content and unquoted
1205 * attribute values will yield an exception, while any unescaped "&"
1206 * and unescaped "<" in attribute values will be ignored
1207 * @return a reference to the first contained constituent; the list
1208 * of constituents can be traversed by calling
1209 * {@link XMLConstituent#nextConstituent()} on each constituent untill
1210 * <code>null</code> is returned
1211 * @throws ParsingException if the XML input contains an uncorrectable error
1212 */
1213 public final XMLConstituent rawConstituents(final CharSequence input,
1214 final boolean fixCharacterErrors) throws ParsingException {
1215 return rawConstituents(input, fixCharacterErrors, null);
1216 }
1217
1218 /***
1219 * Returns the raw constituents of an XML-like document. The constituents
1220 * are returned "raw" as they occur in the input, without fixing possible
1221 * nesting errors etc.
1222 *
1223 * @param input the XML-like input data
1224 * @param fixCharacterErrors whether to try to fix character errors, i.e.
1225 * unescaped "<" and "&" and tags with unquoted attribute values; if
1226 * <code>false</code>, unescaped "<" in textual content and unquoted
1227 * attribute values will yield an exception, while any unescaped "&"
1228 * and unescaped "<" in attribute values will be ignored
1229 * @param startAndEndTags all start and end tags are added to this
1230 * container, if it isn't <code>null</code>
1231 * @return a reference to the first contained constituent; the list
1232 * of constituents can be traversed by calling
1233 * {@link XMLConstituent#nextConstituent()} on each constituent untill
1234 * <code>null</code> is returned
1235 * @throws ParsingException if the XML input contains an uncorrectable error
1236 */
1237 protected final XMLConstituent rawConstituents(final CharSequence input,
1238 final boolean fixCharacterErrors,
1239 final UnprocessedTags startAndEndTags) throws ParsingException {
1240
1241
1242 final TextTokenizer tokenizer =
1243 XMLTokenizerFactory.createXMLTokenizer(input, !fixCharacterErrors);
1244 String tokenText;
1245 String precWS;
1246 String captured = null;
1247 boolean done = false;
1248 XMLConstituent firstConst = null;
1249 XMLConstituent priorConst = null;
1250 XMLConstituent currentConst;
1251
1252
1253 boolean inMarkupSeries = true;
1254
1255 int markupSeriesNo = 0;
1256
1257
1258 boolean fixedCharError;
1259 XMLConstituent extraConst = null;
1260 String extraWS = null;
1261
1262 try {
1263 while (!done) {
1264 tokenText = tokenizer.nextToken();
1265 if (tokenText == null) {
1266 done = true;
1267 } else {
1268
1269 captured = tokenizer.capturedText();
1270 }
1271
1272 if (tokenizer.hasPrecedingWhitespace()) {
1273 precWS = tokenizer.precedingWhitespace();
1274 if (fixCharacterErrors
1275 && (!tokenizer.precedingWhitespaceIsValid())) {
1276
1277 fixedCharError = false;
1278 if (precWS.endsWith("<") && (captured.length() == 0)) {
1279
1280 final String remainingWS = precWS.substring(0,
1281 precWS.length() - 1);
1282 final int gtPos = tokenText.indexOf('>');
1283
1284 if (tokenizer.isValidWhitespace(remainingWS)
1285 && gtPos > 0) {
1286
1287
1288 final Object[] fixed
1289 = tryToFixTag('<' + tokenText);
1290 if (fixed != null) {
1291
1292 extraConst = (TagConstituent) fixed[0];
1293 tokenText = (String) fixed[1];
1294 fixedCharError = true;
1295 precWS = remainingWS;
1296 } else if (deletingPseudoTags) {
1297
1298 logEvent(EVENT_DELETED_PSEUDO_TAG, '<'
1299 + tokenText.substring(0, gtPos + 1));
1300 tokenText = tokenText.substring(gtPos + 1);
1301 fixedCharError = true;
1302 precWS = remainingWS;
1303 }
1304 }
1305 }
1306
1307 if (fixedCharError) {
1308
1309
1310 final int initWS =
1311 tokenizer.initialWhitespaceCount(tokenText);
1312 if (initWS > 0) {
1313 if (extraConst != null) {
1314
1315 extraWS = tokenText.substring(0, initWS);
1316 } else {
1317
1318 precWS += tokenText.substring(0, initWS);
1319 }
1320 tokenText = tokenText.substring(initWS);
1321 }
1322 } else {
1323
1324 String escape = StringEscapeUtils.escapeXml(precWS);
1325 final int initWS =
1326 tokenizer.initialWhitespaceCount(escape);
1327
1328
1329 if (initWS > 0) {
1330 precWS = escape.substring(0, initWS);
1331 escape = escape.substring(initWS);
1332 } else {
1333 precWS = "";
1334 }
1335 logEvent(EVENT_ESCAPED_CHARS, escape);
1336
1337 if (captured.length() == 0) {
1338
1339 tokenText = escape + tokenText;
1340 } else {
1341 final int trailingWSChars =
1342 tokenizer.trailingWhitespaceCount(escape);
1343 if (trailingWSChars > 0) {
1344
1345 extraWS = escape.substring(
1346 escape.length() - trailingWSChars);
1347 escape = escape.substring(0,
1348 escape.length() - trailingWSChars);
1349 }
1350
1351 extraConst = new OtherConstituent(
1352 OtherConstituent.TEXT, escape);
1353 }
1354 }
1355 }
1356
1357
1358
1359 if (precWS.length() > 0) {
1360 currentConst = new OtherConstituent(
1361 OtherConstituent.OUTER_WHITESPACE, precWS);
1362
1363 if (priorConst == null) {
1364 firstConst = currentConst;
1365 } else {
1366 priorConst.insertAfter(currentConst);
1367 }
1368 priorConst = currentConst;
1369 }
1370
1371 if (extraConst != null) {
1372
1373 if ((extraConst.getType() == OtherConstituent.TEXT)
1374 || (extraConst.getType()
1375 == OtherConstituent.CDATA_SECTION)) {
1376 inMarkupSeries = false;
1377 } else {
1378 if (!inMarkupSeries) {
1379 inMarkupSeries = true;
1380 markupSeriesNo++;
1381 }
1382 if (extraConst instanceof TagConstituent) {
1383
1384 ((TagConstituent) extraConst)
1385 .setMarkupSeriesNo(markupSeriesNo);
1386
1387
1388 if ((startAndEndTags != null)
1389 && (extraConst.getType()
1390 != TagConstituent.EMPTY_TAG)) {
1391 startAndEndTags.push(
1392 (TagConstituent) extraConst);
1393 }
1394 }
1395 }
1396 if (fixCharacterErrors) {
1397
1398 fixRepresentation(extraConst);
1399 }
1400
1401 if (priorConst == null) {
1402 firstConst = extraConst;
1403 } else {
1404 priorConst.insertAfter(extraConst);
1405 }
1406 priorConst = extraConst;
1407 extraConst = null;
1408
1409 if (extraWS != null) {
1410
1411 currentConst = new OtherConstituent(
1412 OtherConstituent.OUTER_WHITESPACE,
1413 extraWS);
1414 priorConst.insertAfter(currentConst);
1415 priorConst = currentConst;
1416 extraWS = null;
1417 }
1418 }
1419 }
1420
1421
1422 if ((!done) && (tokenText.length() > 0)) {
1423
1424 if ((captured.length() == 0)
1425 || (captured.equals("[CDATA"))) {
1426
1427
1428 if (inMarkupSeries) {
1429 inMarkupSeries = false;
1430 }
1431 if (captured.length() == 0) {
1432 currentConst = new OtherConstituent(
1433 OtherConstituent.TEXT, tokenText);
1434 } else {
1435 currentConst = new OtherConstituent(
1436 OtherConstituent.CDATA_SECTION, tokenText);
1437 }
1438 } else {
1439
1440 if (!inMarkupSeries) {
1441 inMarkupSeries = true;
1442 markupSeriesNo++;
1443 }
1444 currentConst = createMarkupConstituent(tokenText,
1445 captured, markupSeriesNo, startAndEndTags);
1446 }
1447 if (fixCharacterErrors) {
1448
1449 fixRepresentation(currentConst);
1450 }
1451
1452 if (priorConst == null) {
1453 firstConst = currentConst;
1454 } else {
1455 priorConst.insertAfter(currentConst);
1456 }
1457 priorConst = currentConst;
1458 }
1459 }
1460 } catch (IllegalArgumentException iae) {
1461
1462 throw new ParsingException(
1463 "Uncorrectable error in XML input: " + iae.getMessage());
1464 }
1465 return firstConst;
1466 }
1467
1468 /***
1469 * Replaces a start tag with an copy that is an empty tag. This means,
1470 * the representation of the created copy ends in "/>" instead of ">"
1471 * and its type is {@link TagConstituent#EMPTY_TAG} instead of
1472 * {@link TagConstituent#START_TAG}.
1473 *
1474 * <p>If the original tag is part of a list, it is removed and the copy
1475 * is inserted instead.
1476 *
1477 * @param startTag the start tag to replace
1478 * @return the created copy
1479 * @throws IllegalArgumentException if the specified tag is not a valid
1480 * start tag
1481 */
1482 private TagConstituent replaceWithEmptyCopy(final TagConstituent startTag)
1483 throws IllegalArgumentException {
1484 if (startTag.getType() != TagConstituent.START_TAG) {
1485 throw new IllegalArgumentException(
1486 "Tag to replace must be a start tag (actual type: "
1487 + startTag.getType() + ')');
1488 }
1489 final StringBuilder representation =
1490 new StringBuilder(startTag.getRepresentantion());
1491 final int endMarker = representation.lastIndexOf(">");
1492 if (endMarker < 0) {
1493 throw new IllegalArgumentException(
1494 "Start tag representation is invalid: '>' missing!");
1495 }
1496
1497
1498 representation.insert(endMarker, '/');
1499 final TagConstituent result =
1500 new TagConstituent(TagConstituent.EMPTY_TAG, startTag.getName(),
1501 representation.toString(), startTag.getMarkupSeriesNo());
1502
1503
1504 final XMLConstituent prevConst = startTag.previousConstituent();
1505 final XMLConstituent nextConst = startTag.nextConstituent();
1506 startTag.remove();
1507
1508 if (prevConst != null) {
1509 prevConst.insertAfter(result);
1510 } else if (nextConst != null) {
1511
1512 result.insertAfter(nextConst);
1513 }
1514
1515
1516
1517 return result;
1518 }
1519
1520 /***
1521 * Returns a string representation of this object.
1522 *
1523 * @return a textual representation
1524 */
1525 public String toString() {
1526 return new ToStringBuilder(this)
1527 .append("missing root name", missingRootName)
1528 .append("emptiable tags", emptiableTags)
1529 .append("delete control characters", deletingControlChars)
1530 .append("delete pseudo-tags", deletingPseudoTags)
1531 .append("escape pseudo-entities", escapingPseudoEntities)
1532 .toString();
1533 }
1534
1535 /***
1536 * Helper method that tries to parse a string as an XML start or empty tag
1537 * that contains unquoted attribute values (might be followed by other
1538 * text). If this is the case, the unquoted values are fixed and a suitable
1539 * tag constituent is created.
1540 *
1541 * <p>Only call this method when you're sure that the input does not contain
1542 * a <em>valid</em> tag, i.e. if there must be at least one unquoted value.
1543 * This method doesn't store a markup series in the tag.
1544 *
1545 * @param text the text to parse
1546 * @return an array of two elements: the created {@link TagConstituent}
1547 * and a String containing the unused rest of the input text (might be empty
1548 * but not <code>null</code>; or <code>null</code> if the text couldn't be
1549 * parsed as a text.
1550 * @throws ParsingException might be thrown by {@link #checkEvent(String)}
1551 * implementations in subclasses if an "illicit" event occurred
1552 */
1553 private Object[] tryToFixTag(final String text) throws ParsingException {
1554 final Object[] result;
1555 final Matcher laxMatcher = LAX_START_OR_EMPTY_TAG.matcher(text);
1556
1557 if (laxMatcher.lookingAt()) {
1558
1559
1560 final String tagName = laxMatcher.group(1);
1561 final short tagType;
1562
1563 result = new Object[2];
1564
1565 result[1] = text.substring(laxMatcher.end());
1566 String oldTagRep = laxMatcher.group();
1567
1568
1569 final String lastGroup = laxMatcher.group(laxMatcher.groupCount());
1570
1571 if (lastGroup == null) {
1572 tagType = TagConstituent.START_TAG;
1573 } else if ("/".equals(lastGroup)) {
1574 tagType = TagConstituent.EMPTY_TAG;
1575 } else {
1576
1577 throw new RuntimeException("Implementation error: last group of"
1578 + " lax tag '" + laxMatcher.group() + "' is '"
1579 + lastGroup + "' instead of '/' or null");
1580 }
1581
1582 StringBuilder newTagRep = null;
1583 String unquotedValue;
1584
1585
1586 while (laxMatcher.groupCount() > 2
1587 && (laxMatcher.group(3) != null)) {
1588
1589
1590 newTagRep = new StringBuilder(oldTagRep.substring(0,
1591 laxMatcher.start(3)));
1592 unquotedValue = laxMatcher.group(3);
1593
1594
1595 if (unquotedValue.startsWith("\"")
1596 || unquotedValue.startsWith("'")) {
1597 unquotedValue = unquotedValue.substring(1);
1598 }
1599 if (unquotedValue.endsWith("\"")
1600 || unquotedValue.endsWith("'")) {
1601 unquotedValue = unquotedValue.substring(0,
1602 unquotedValue.length() - 1);
1603 }
1604
1605
1606 unquotedValue = TextUtils.replaceAll(unquotedValue,
1607 Pattern.compile("\""), """);
1608
1609
1610 newTagRep.append('"');
1611 newTagRep.append(unquotedValue);
1612 newTagRep.append('"');
1613
1614
1615 newTagRep.append(oldTagRep.substring(laxMatcher.end(3),
1616 laxMatcher.end()));
1617
1618
1619 laxMatcher.reset(newTagRep);
1620 if (!laxMatcher.matches()) {
1621
1622 throw new RuntimeException("Implementation error while "
1623 + "trying to fix unquoted attribute values: '"
1624 + newTagRep + "' is no longer parsable as a tag");
1625 }
1626 oldTagRep = newTagRep.toString();
1627 }
1628
1629
1630 result[0] = new TagConstituent(tagType, tagName,
1631 newTagRep.toString());
1632 logEvent(EVENT_QUOTED_ATTRIBUTE_VALUES, newTagRep.toString());
1633 } else {
1634 result = null;
1635 }
1636 return result;
1637 }
1638
1639 }