1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22 package de.fu_berlin.ties.xml;
23
24 import java.io.IOException;
25 import java.io.Reader;
26 import java.io.Writer;
27 import java.util.Set;
28 import java.util.regex.Matcher;
29 import java.util.regex.Pattern;
30
31 import org.apache.commons.lang.StringEscapeUtils;
32 import org.apache.commons.lang.builder.ToStringBuilder;
33
34 import de.fu_berlin.ties.ContextMap;
35 import de.fu_berlin.ties.ParsingException;
36 import de.fu_berlin.ties.TextProcessor;
37 import de.fu_berlin.ties.TiesConfiguration;
38
39 import de.fu_berlin.ties.io.IOUtils;
40 import de.fu_berlin.ties.text.TextTokenizer;
41 import de.fu_berlin.ties.text.TextUtils;
42 import de.fu_berlin.ties.util.CollectionUtils;
43 import de.fu_berlin.ties.util.Util;
44
45 /***
46 * This class tries to fix corrupt XML documents, especially documents
47 * containing nesting errors. Instances of this class are thread-safe and
48 * can fix several documents in parallel.
49 *
50 * @author Christian Siefkes
51 * @version $Revision: 1.14 $, $Date: 2004/12/06 18:00:05 $, $Author: siefkes $
52 */
53 public class XMLAdjuster extends TextProcessor {
54
55 /***
56 * Configuration key: the name to use for the root element if missing.
57 */
58 public static final String CONFIG_MISSING_ROOT = "adjust.missing-root";
59
60 /***
61 * Configuration key: Set of names of tags that can be converted empty tags
62 * when required.
63 */
64 public static final String CONFIG_EMPTIABLE_TAGS = "adjust.emptiable-tags";
65
66 /***
67 * Configuration key: whether to delete
68 * {@link #CONTROL_CHARS control characters} (which are not allowed in
69 * XML 1.0 and discouraged in XML 1.1).
70 */
71 public static final String CONFIG_DELETE_CONTROL_CHARS =
72 "adjust.delete.control-chars";
73
74 /***
75 * Configuration key: whether to delete "pseudo-tags".
76 */
77 public static final String CONFIG_DELETE_PSEUDO_TAGS =
78 "adjust.delete.pseudo-tags";
79
80 /***
81 * Configuration key: whether to escape "&" starting a possible
82 * nonstandard entity reference ("&" at the start of one of the 5
83 * predefined entity references or a character reference is never escaped,
84 * all other "&" are always escaped).
85 */
86 public static final String CONFIG_ESCAPE_PSEUDO_ENTITIES =
87 "adjust.escape.pseudo-entities";
88
89 /***
90 * Pattern string specifying characters that can occur at the start of end
91 * of an unquoted attribute value: everything except '<', '>', '='
92 * and whitespace (whitespace is also allowed, but only in the middle of a
93 * value). Evaluated lazily (reluctant) to avoid missing the "/>" at
94 * the end of an empty tag.
95 */
96 public static final String UNQUOTED_ATTRIB_CHARS =
97 "[^<>=" + XMLTokenizerFactory.XML_WHITESPACE_CHARS + "]+?";
98
99 /***
100 * Pattern string specifying an XML attribute without proper quotes.
101 * Equal sign and value are captured in groups; the value is matched lazily
102 * (reluctant) so we won't miss the start of the next attribute.
103 */
104 public static final String UNQUOTED_ATTRIBUTE = XMLTokenizerFactory.XML_NAME
105 + XMLTokenizerFactory.XML_OPT_WHITESPACE + "(=)"
106 + XMLTokenizerFactory.XML_OPT_WHITESPACE + "((?:"
107 + UNQUOTED_ATTRIB_CHARS + XMLTokenizerFactory.XML_WHITESPACE + ")*?"
108 + UNQUOTED_ATTRIB_CHARS + ")";
109
110 /***
111 * Pattern specifying of a "lax" XML start or empty tag that can contain
112 * unquoted (invalid) attributes (combined into a single pattern to avoid
113 * unnecessary backtracking). Element name, equal signs and values of
114 * the (last) unquoted attribute, and '/' (for empty tags) are captured.
115 */
116 public static final Pattern LAX_START_OR_EMPTY_TAG = Pattern.compile(
117 "<(" + XMLTokenizerFactory.XML_NAME + ")" + "(?:"
118 + XMLTokenizerFactory.XML_WHITESPACE + "(?:"
119 + XMLTokenizerFactory.XML_ATTRIBUTE + "|" + UNQUOTED_ATTRIBUTE + "))*"
120 + XMLTokenizerFactory.XML_OPT_WHITESPACE + "(/)?>");
121
122 /***
123 * A "&" that is not the start of an predefined entity reference or
124 * a character reference and thus should be escaped if
125 * {@link #isEscapingPseudoEntities()} is <code>true</code>.
126 * (A pattern matching the rest of predefined entity or character reference
127 * is included via negative lookahead.)
128 */
129 public static final Pattern PSEUDO_AMP = Pattern.compile(
130 "&(?!(?:amp|lt|gt|apos|quot|#[0-9]+|#x[0-9a-fA-F]+);)");
131
132 /***
133 * A "&" that is not the start of an entity and thus must be escaped.
134 * (A pattern matching the rest of legal entity or character reference is
135 * included via negative lookahead.)
136 */
137 public static final Pattern SPURIOUS_AMP = Pattern.compile("&(?!(?:"
138 + XMLTokenizerFactory.XML_NAME + "|#[0-9]+|#x[0-9a-fA-F]+);)");
139
140 /***
141 * Escape sequence for the "&" character.
142 */
143 public static final String ESCAPED_AMP = "&";
144
145 /***
146 * Pattern specifying sequences of control characters (character codes
147 * below the space character, except tab, line feed and carriage return).
148 */
149 public static final Pattern CONTROL_CHARS =
150 Pattern.compile("[\u0001-\u0008\u000B-\u000C\u000E-\u001F]+");
151
152 /***
153 * Event constant: Converted to empty tag.
154 */
155 protected static final String EVENT_CONVERTED_TO_EMPTY_TAG =
156 "Converted to empty tag";
157
158 /***
159 * Event constant: Inserted missing end tag.
160 */
161 protected static final String EVENT_INSERTED_MISSING_END_TAG =
162 "Inserted missing end tag";
163
164 /***
165 * Event constant: Inserted missing root element.
166 */
167 protected static final String EVENT_INSERTED_MISSING_ROOT_ELEMENT =
168 "Inserted missing root element";
169
170 /***
171 * Event constant: Inserted missing start tag.
172 */
173 protected static final String EVENT_INSERTED_MISSING_START_TAG =
174 "Inserted missing start tag";
175
176 /***
177 * Event constant: Moved end tag up.
178 */
179 protected static final String EVENT_MOVED_END_TAG_UP =
180 "Moved end tag up";
181
182 /***
183 * Event constant: Moved start tag dow.
184 */
185 protected static final String EVENT_MOVED_START_TAG_DOWN =
186 "Moved start tag down";
187
188 /***
189 * Event constant: Split tag.
190 */
191 protected static final String EVENT_SPLIT_TAG =
192 "Split tag";
193
194 /***
195 * Event constant: Deleted control characters.
196 */
197 protected static final String EVENT_DELETED_CONTROL_CHARS =
198 "Deleted control characters";
199
200 /***
201 * Event constant: Deleted pseudo-tag.
202 */
203 protected static final String EVENT_DELETED_PSEUDO_TAG =
204 "Deleted pseudo-tag";
205
206 /***
207 * Event constant: Escaped characters that are illegal or unwanted.
208 */
209 protected static final String EVENT_ESCAPED_CHARS =
210 "Escaped characters";
211
212 /***
213 * Event constant: Quoted attribute values.
214 */
215 protected static final String EVENT_QUOTED_ATTRIBUTE_VALUES =
216 "Quoted attribute values";
217
218 /***
219 * The name to use for the root element if missing. A root element
220 * with this name is created when not all elements and textual
221 * content are inclosed within a single element (the root).
222 * If <code>null</code>, processing stops with an exception if the root
223 * element is missing.
224 */
225 private final String missingRootName;
226
227 /***
228 * Contains the names (Strings) of tags that can be converted an empty tags
229 * when required for fixing a document (e.g. "br" when
230 * <code><br></code> may be converted to <code><br/></code>
231 * during repair). Might be <code>null</code>.
232 */
233 private final Set<String> emptiableTags;
234
235 /***
236 * Whether to delete {@link #CONTROL_CHARS control characters} (which are
237 * not allowed in XML 1.0 and discouraged in XML 1.1).
238 */
239 private final boolean deletingControlChars;
240
241 /***
242 * Whether to delete "pseudo-tags", i.e., sequences that cannot be parsed as
243 * tags but look similar to them. "Pseudo-tags" start with '<' and end
244 * with '>', contain a printable character after the initial '<', and
245 * do not contain any inner '<' or '>'). If <code>true</code>, such
246 * sequences will be deleted; otherwise (default) the starting '<' will
247 * be escaped.
248 */
249 private final boolean deletingPseudoTags;
250
251 /***
252 * Whether to escape "&" starting a possible nonstandard entity
253 * reference ("&" at the start of one of the 5 predefined entity
254 * references or a character reference is never escaped, all other "&"
255 * are always escaped).
256 */
257 private final boolean escapingPseudoEntities;
258
/**
 * Creates a new instance that uses the default output extension ("xml")
 * and is configured from the
 * {@linkplain TiesConfiguration#CONF standard configuration}.
 */
public XMLAdjuster() {
    // delegate to the single-argument constructor with the default extension
    this("xml");
}
266
/**
 * Creates a new instance that is configured from the
 * {@linkplain TiesConfiguration#CONF standard configuration}.
 *
 * @param outExt the extension to use for output files
 */
public XMLAdjuster(final String outExt) {
    // delegate, supplying the standard configuration
    this(outExt, TiesConfiguration.CONF);
}
275
/**
 * Creates a new instance, reading all settings from the provided
 * configuration. The set of emptiable tags is only read if the
 * {@link #CONFIG_EMPTIABLE_TAGS} key is present; otherwise
 * <code>null</code> is passed on.
 *
 * @param outExt the extension to use for output files
 * @param config used to configure this instance
 */
public XMLAdjuster(final String outExt, final TiesConfiguration config) {
    this(
        outExt,
        config.getString(CONFIG_MISSING_ROOT, null),
        config.containsKey(CONFIG_EMPTIABLE_TAGS)
            ? CollectionUtils.arrayAsSet(
                config.getStringArray(CONFIG_EMPTIABLE_TAGS))
            : null,
        config.getBoolean(CONFIG_DELETE_CONTROL_CHARS),
        config.getBoolean(CONFIG_DELETE_PSEUDO_TAGS),
        config.getBoolean(CONFIG_ESCAPE_PSEUDO_ENTITIES),
        config);
}
291
/**
 * Creates a new instance from explicitly given settings.
 *
 * @param outExt the extension to use for output files
 * @param missingRoot the name to use for the root element if it is
 * missing, i.e. if not all elements and textual content are enclosed
 * within a single element (the root); if <code>null</code>, processing
 * stops with an exception when the root element is missing
 * @param emptiableTagSet the names of tags that may be converted to empty
 * tags when required for fixing a document (e.g. "br" when {@code <br>}
 * may be converted to {@code <br/>} during repair); might be
 * <code>null</code> if there are none
 * @param deleteControlChars whether to delete
 * {@link #CONTROL_CHARS control characters} (which are not allowed in XML
 * 1.0 and discouraged in XML 1.1)
 * @param deletePseudoTags whether to
 * {@linkplain #isDeletingPseudoTags() delete "pseudo-tags"}
 * @param escapePseudoEntities whether to escape an ampersand ({@code &})
 * starting a possible nonstandard entity reference (an ampersand starting
 * one of the 5 predefined entity references or a character reference is
 * never escaped, all other ampersands are always escaped)
 * @param config used to configure superclasses; if <code>null</code>,
 * the {@linkplain TiesConfiguration#CONF standard configuration} is used
 */
public XMLAdjuster(final String outExt, final String missingRoot,
        final Set<String> emptiableTagSet, final boolean deleteControlChars,
        final boolean deletePseudoTags,
        final boolean escapePseudoEntities,
        final TiesConfiguration config) {
    super(outExt, config);

    // settings used when repairing the document structure
    this.missingRootName = missingRoot;
    this.emptiableTags = emptiableTagSet;

    // settings used when repairing character-level errors
    this.deletingControlChars = deleteControlChars;
    this.deletingPseudoTags = deletePseudoTags;
    this.escapingPseudoEntities = escapePseudoEntities;
}
329
330 /***
331 * Tries to fix corrupt XML documents, especially documents
332 * containing nesting errors. Delegates to
333 * {@link #fixedConstituents(CharSequence)} and writes the result to the
334 * specified writer.
335 *
336 * @param input the corrupt XML document
337 * @param out the writer to write the corrected XML document to; flushed but
338 * not closed by this method
339 * @throws IOException if an I/O error occurs while using the writer
340 * @throws ParsingException if the XML input contains an uncorrectable error
341 */
342 public final void adjust(final CharSequence input, final Writer out)
343 throws IOException, ParsingException {
344 final XMLConstituent firstConst = fixedConstituents(input);
345
346
347 XMLConstituent currentConst = firstConst;
348 while (currentConst != null) {
349 out.write(currentConst.getRepresentantion());
350 currentConst = currentConst.nextConstituent();
351 }
352 out.flush();
353 }
354
355 /***
356 * Tries to fix corrupt XML documents, especially documents containing
357 * nesting errors. Delegates to {@link #adjust(CharSequence, Writer)}.
358 *
359 * @param in the reader to read the corrupt XML document from; not closed
360 * by this method
361 * @param out the writer to write the corrected XML document to; flushed but
362 * not closed by this method
363 * @throws IOException if an I/O error occurs while using the reader or
364 * writer
365 * @throws ParsingException if the XML input contains an uncorrectable error
366 */
367 public final void adjust(final Reader in, final Writer out)
368 throws IOException, ParsingException {
369 final String input = IOUtils.readToString(in);
370 adjust(input, out);
371 }
372
373 /***
374 * Method called by the {@link #logEvent(String, String)} methods whenever
375 * an event occurred to ensure the event is acceptable. Subclasses that want
376 * to prevent certain events can overwrite this method and throw an
377 * exception if an "illegal" event is encountered.
378 * This implementation does nothing, letting all events pass.
379 *
380 * @param eventType the event that occurred; should be one of the
381 * EVENT constants defined in this class.
382 * @throws ParsingException could be thrown by subclasses if the event is
383 * considered illicit
384 */
385 protected void checkEvent(final String eventType) throws ParsingException {
386 }
387
388 /***
389 * Helper method called after completing an end tag with a missing start tag
390 * or processing a tentative start tag to check whether the next appearance
391 * of this tag type is a start tag or another end tag.
392 * The second case means that another start tag is missing
393 * -- the missing start tag is created as a tentative tag and inserted.
394 *
395 * @param endTag the end tag whose next appearance should be checked
396 * @param insertAfter the tag after which the newly created tentative
397 * tag should be inserted
398 * @param unprocessedTags the container of unprocessed start and end tags
399 * @throws ParsingException might be thrown by {@link #checkEvent(String)}
400 * implementations in subclasses if an "illicit" event occurred
401 */
402 private void checkNextAppearance(final TagConstituent endTag,
403 final TagConstituent insertAfter,
404 final UnprocessedTags unprocessedTags) throws ParsingException {
405
406
407 if (correspondingEndTag(endTag.getName(), -1, unprocessedTags,
408 false) != null) {
409
410
411 final TagConstituent tentative = new TagConstituent(
412 TagConstituent.START_TAG, endTag.getName(),
413 insertAfter.getMarkupSeriesNo());
414 tentative.setVariety(TagVariety.TENTATIVE);
415
416
417 XMLConstituent nextConst = insertAfter.nextConstituent();
418 if (nextConst.getType() == OtherConstituent.OUTER_WHITESPACE) {
419 nextConst.insertAfter(tentative);
420 } else {
421 insertAfter.insertAfter(tentative);
422 }
423
424
425 unprocessedTags.push(tentative, false);
426 logEvent(EVENT_INSERTED_MISSING_START_TAG, tentative);
427 }
428 }
429
430 /***
431 * Helper method that creates an constituent that is part of a markup series
432 * (i.e., that is neither text nor a CDATA section).
433 *
434 * @param currentToken the string representation of the constituent
435 * @param capturedText the text captured by the XML text tokenizer, used to
436 * determine the token type
437 * @param markupSeriesNo the number of the markup series of the constituent
438 * @param startAndEndTags all start and end tags are added to this
439 * container, if it isn't <code>null</code>
440 * @return the created constituent
441 */
442 private XMLConstituent createMarkupConstituent(final String currentToken,
443 final String capturedText, final int markupSeriesNo,
444 final UnprocessedTags startAndEndTags) {
445
446 final char firstChar = capturedText.charAt(0);
447 final char lastChar = capturedText.charAt(capturedText.length() - 1);
448 final XMLConstituent result;
449
450 if (firstChar == '/') {
451 result = new TagConstituent(TagConstituent.END_TAG,
452 capturedText.substring(1), currentToken, markupSeriesNo);
453 if (startAndEndTags != null) {
454 startAndEndTags.push((TagConstituent) result);
455 }
456 } else if (lastChar == '/') {
457 result = new TagConstituent(TagConstituent.EMPTY_TAG,
458 capturedText.substring(0, capturedText.length() - 1),
459 currentToken, markupSeriesNo);
460 } else if (firstChar == '!') {
461 result = new OtherConstituent(OtherConstituent.DOCTYPE,
462 currentToken);
463 } else if (firstChar == '?') {
464
465 if (capturedText.substring(1).equals("xml")) {
466 result = new OtherConstituent(OtherConstituent.XML_PROLOG,
467 currentToken);
468 } else {
469 result = new OtherConstituent(OtherConstituent.PI,
470 currentToken);
471 }
472 } else if (firstChar == '<') {
473 result = new OtherConstituent(OtherConstituent.COMMENT,
474 currentToken);
475 } else {
476 result = new TagConstituent(TagConstituent.START_TAG, capturedText,
477 currentToken, markupSeriesNo);
478 if (startAndEndTags != null) {
479 startAndEndTags.push((TagConstituent) result);
480 }
481 }
482 return result;
483 }
484
485 /***
486 * Checks whether <code>unprocessedTags</code> contains a
487 * matching tag within the specified <code>markupSeriesNo</code>, not
488 * preceeded by a matching start tag. Returns the found end tag, if it
489 * exists; returns <code>null</code> otherwise.
490 *
491 * @param tagname the name of the tag to check
492 * @param markupSeriesNo the markup series number to match;
493 * or <code>-1</code> if the markup series number doesn't matter
494 * @param unprocessedTags the container of start and end tags to check
495 * @param remove whether to remove the found end tag from the
496 * <code>unprocessedTags</code> container
497 * @return the found corresponding end tag,
498 * or <code>null</code> if no matching appearance was found
499 */
500 private TagConstituent correspondingEndTag(final String tagname,
501 final int markupSeriesNo, final UnprocessedTags unprocessedTags,
502 final boolean remove) {
503 final TagConstituent result;
504 final TagConstituent suitableTag;
505
506 if (markupSeriesNo >= 0) {
507 suitableTag =
508 unprocessedTags.findInSeries(tagname, markupSeriesNo, true);
509 } else {
510 suitableTag = unprocessedTags.findFirst(tagname);
511 }
512
513
514 if ((suitableTag != null)
515 && (suitableTag.getType() == TagConstituent.END_TAG)) {
516
517 result = suitableTag;
518 if (remove) {
519 unprocessedTags.forceRemove(suitableTag);
520 }
521 } else {
522
523 result = null;
524 }
525
526 return result;
527 }
528
529 /***
530 * Checks whether <code>openTags</code> contains a matching start tag
531 * (ignoring the root tag), either within the specified
532 * <code>markupSeriesNo</code> or a tentative tag anywhere.
533 * Returns and remove the found end tag, if it exists.
534 * Returns <code>null</code> otherwise.
535 *
536 * @param tagname the name of the tag to check
537 * @param markupSeriesNo the markup series number to match
538 * @param openTags the container of currently open start tags to check
539 * @return the found and removed matching start tag,
540 * or <code>null</code> if no matching non-root appearance was found
541 */
542 private TagConstituent correspondingOpenTag(final String tagname,
543 final int markupSeriesNo, final OpenTags openTags) {
544
545 TagConstituent result =
546 openTags.findInSeries(tagname, markupSeriesNo, false);
547
548
549 if (result == null) {
550 result = openTags.findTentativeTag(tagname);
551 }
552
553 if (result != null) {
554 if (openTags.isRoot(result)) {
555
556 result = null;
557 } else {
558
559 openTags.forceRemove(result);
560 }
561 }
562 return result;
563 }
564
565 /***
566 * Tries to fix corrupt XML documents, especially documents containing
567 * nesting errors. Delegates to {@link #adjust(Reader, Writer)}, ignoring
568 * the <code>context</code>.
569 *
570 * @param reader reader containing the text to process; should not be closed
571 * by this method
572 * @param writer the writer to write the processed text to; might be flushed
573 * but not closed by this method
574 * @param context a map of objects that are made available for processing
575 * @throws IOException if an I/O error occurs while using the reader or
576 * writer
577 * @throws ParsingException if the XML input contains an uncorrectable error
578 */
579 protected void doProcess(final Reader reader, final Writer writer,
580 final ContextMap context) throws IOException, ParsingException {
581 adjust(reader, writer);
582 }
583
584 /***
585 * Helper method that checks whether an end tag is certain to be missing for
586 * a specified open tag. This is the case if (a) the next appearance of
587 * this tag type is NOT an end tag and if (b) there are at least as much
588 * unprocessed start tags than unprocessed end tags for the given type.
589 *
590 * @param openTagName the name of the open tag check
591 * @param unprocessedTags the container of unprocessed start and end tags
592 * @return <code>true</code> iff both conditions stated above are fulfulled
593 */
594 private boolean endTagMissing(final String openTagName,
595 final UnprocessedTags unprocessedTags) {
596
597 final boolean result = (correspondingEndTag(openTagName, -1,
598 unprocessedTags, false) == null)
599
600 && (unprocessedTags.startTagCount(openTagName)
601 >= unprocessedTags.endTagCount(openTagName));
602 return result;
603 }
604
605 /***
606 * Escapes every of a specified pattern in the representation of a
607 * constituent.
608 *
609 * @param constituent the constituent to check
610 * @param pattern the pattern to replace
611 * @param replacement the replacement text
612 * @param eventType the event to log when replacing was necessary
613 * @throws ParsingException might be thrown by {@link #checkEvent(String)}
614 * implementations in subclasses if an "illicit" event occurred
615 */
616 private void replaceInRep(final XMLConstituent constituent,
617 final Pattern pattern, final String replacement,
618 final String eventType) throws ParsingException {
619 final String oldRep = constituent.getRepresentantion();
620 final String newRep =
621 TextUtils.replaceAll(oldRep, pattern, replacement);
622
623 if (oldRep != newRep) {
624
625 logEvent(eventType, "old: '" + oldRep + "', new: '" + newRep + "'");
626 constituent.setRepresentantion(newRep);
627 }
628 }
629
630 /***
631 * Returns the constituents of an XML-like document after fixing possible
632 * nesting errors etc.
633 *
634 * @param input the XML-like input data
635 * @return a reference to the first contained constituent; the list
636 * of constituents can be traversed by calling
637 * {@link XMLConstituent#nextConstituent()} on each constituent untill
638 * <code>null</code> is returned
639 * @throws ParsingException if the XML input contains an uncorrectable error
640 */
641 public final XMLConstituent fixedConstituents(final CharSequence input)
642 throws ParsingException {
643
644
645 final UnprocessedTags unprocessedTags = new UnprocessedTags();
646 XMLConstituent firstConst =
647 rawConstituents(input, true, unprocessedTags);
648
649
650 XMLConstituent currentConst = firstConst;
651 XMLConstituent lastConst = currentConst;
652 TagConstituent currentTag;
653 final OpenTags openTags = new OpenTags();
654
655
656
657 XMLConstituent firstRootContent = null;
658 boolean insertedMissingRoot = false;
659
660 while (currentConst != null) {
661 if ((firstRootContent == null) && isRootContent(currentConst)) {
662
663 firstRootContent = currentConst;
664
665 if (currentConst.getType() != TagConstituent.START_TAG) {
666
667 insertMissingRoot(firstRootContent, openTags);
668 insertedMissingRoot = true;
669 }
670 } else if (openTags.isEmpty() && (isRootContent(currentConst))) {
671
672
673 insertMissingRoot(firstRootContent, openTags);
674 insertedMissingRoot = true;
675 }
676
677 if (currentConst instanceof TagConstituent) {
678 currentTag = (TagConstituent) currentConst;
679 if (currentTag.getType() == TagConstituent.START_TAG) {
680
681 unprocessedTags.forceRemove(currentTag);
682 openTags.push(currentTag);
683 } else if (currentTag.getType() == TagConstituent.END_TAG) {
684
685 unprocessedTags.forceRemove(currentTag);
686 handleEndTag(currentTag, openTags, unprocessedTags);
687 }
688 }
689
690 lastConst = currentConst;
691 currentConst = currentConst.nextConstituent();
692 }
693
694
695 handleEOF(lastConst, openTags, unprocessedTags, insertedMissingRoot);
696
697
698
699 while (firstConst.hasPrevious()) {
700 firstConst = firstConst.previousConstituent();
701 }
702 return firstConst;
703 }
704
705 /***
706 * Helper method that fixes character errors in the representation of a
707 * constituent, if any.
708 *
709 * @param constituent the constituent to check
710 * @throws ParsingException might be thrown by {@link #checkEvent(String)}
711 * implementations in subclasses if an "illicit" event occurred
712 */
713 private void fixRepresentation(final XMLConstituent constituent)
714 throws ParsingException {
715
716 if (deletingControlChars) {
717 replaceInRep(constituent, CONTROL_CHARS, "",
718 EVENT_DELETED_CONTROL_CHARS);
719 }
720
721 if (needsAmpEscape(constituent)) {
722 if (escapingPseudoEntities) {
723
724
725 replaceInRep(constituent, PSEUDO_AMP, ESCAPED_AMP,
726 EVENT_ESCAPED_CHARS);
727 } else {
728
729 replaceInRep(constituent, SPURIOUS_AMP, ESCAPED_AMP,
730 EVENT_ESCAPED_CHARS);
731 }
732 }
733 }
734
735 /***
736 * Helper method for handling an end tag. Here the actual work of the
737 * algorithm takes place, because we have to ensure a match.
738 *
739 * @param endTag the end tag to handle
740 * @param openTags must contain all currently open tags
741 * @param unprocessedTags must contain all unprocessed start and end tags
742 * @throws ParsingException might be thrown by {@link #checkEvent(String)}
743 * implementations in subclasses if an "illicit" event occurred
744 */
745 protected void handleEndTag(final TagConstituent endTag,
746 final OpenTags openTags, final UnprocessedTags unprocessedTags)
747 throws ParsingException {
748 TagConstituent lastOpenTag;
749 TagVariety lastOpenTagVariety;
750 String lastOpenTagName;
751 TagConstituent correspondingTag;
752 boolean matchFound = false;
753
754
755 while (!matchFound) {
756 lastOpenTag = openTags.peek();
757 lastOpenTagVariety = lastOpenTag.getVariety();
758 lastOpenTagName = lastOpenTag.getName();
759
760
761 if (lastOpenTagName.equals(endTag.getName())) {
762
763
764 if (openTags.popAndRegularize()
765 && (lastOpenTagVariety == TagVariety.TENTATIVE)) {
766 checkNextAppearance(endTag, endTag, unprocessedTags);
767 }
768 matchFound = true;
769 } else if (lastOpenTagVariety == TagVariety.TENTATIVE) {
770
771
772 moveTentativeTag(lastOpenTag, endTag, openTags,
773 unprocessedTags);
774
775 } else if (openTags.containsNonTentative(endTag.getName())
776 && (correspondingTag = correspondingEndTag(
777 lastOpenTagName, endTag.getMarkupSeriesNo(),
778 unprocessedTags, true)) != null) {
779
780
781
782
783 openTags.popAndRegularize();
784
785
786 correspondingTag.remove();
787 endTag.insertBefore(correspondingTag);
788
789
790
791 if (lastOpenTagVariety == TagVariety.TENTATIVE) {
792 checkNextAppearance(correspondingTag, endTag,
793 unprocessedTags);
794 }
795
796
797 logEvent(EVENT_MOVED_END_TAG_UP, correspondingTag);
798 } else if ((correspondingTag = correspondingOpenTag(
799 endTag.getName(), lastOpenTag.getMarkupSeriesNo(),
800 openTags)) != null) {
801
802
803
804
805 correspondingTag.remove();
806 lastOpenTag.insertAfter(correspondingTag);
807
808 if (correspondingTag.getVariety() == TagVariety.REGULAR) {
809
810 logEvent(EVENT_MOVED_START_TAG_DOWN, correspondingTag);
811 } else {
812 if (correspondingTag.getVariety() == TagVariety.TENTATIVE) {
813
814
815
816 checkNextAppearance(endTag, endTag, unprocessedTags);
817 }
818
819
820 correspondingTag.setVariety(TagVariety.REGULAR);
821
822 }
823 matchFound = true;
824 } else if (!openTags.contains(endTag.getName())) {
825
826
827 final TagConstituent missingStartTag =
828 new TagConstituent(TagConstituent.START_TAG,
829 endTag.getName(), lastOpenTag.getMarkupSeriesNo());
830
831
832 lastOpenTag.insertAfter(missingStartTag);
833 logEvent(EVENT_INSERTED_MISSING_START_TAG, missingStartTag);
834
835
836
837 checkNextAppearance(endTag, endTag, unprocessedTags);
838 matchFound = true;
839 } else if ((lastOpenTag.getMarkupSeriesNo()
840 == endTag.getMarkupSeriesNo())
841 && !endTagMissing(lastOpenTagName, unprocessedTags)) {
842
843
844
845
846 openTags.forceRemove(lastOpenTag);
847 lastOpenTag.remove();
848 endTag.insertAfter(lastOpenTag);
849
850
851 unprocessedTags.push(lastOpenTag, false);
852
853
854 if (lastOpenTagVariety == TagVariety.REGULAR) {
855 logEvent(EVENT_MOVED_START_TAG_DOWN, lastOpenTag);
856 }
857
858 } else {
859
860 final boolean endTagMissing =
861 endTagMissing(lastOpenTagName, unprocessedTags);
862
863 if (endTagMissing && isAnEmptiableTag(lastOpenTagName)) {
864
865 final TagConstituent newEmptyTag =
866 replaceWithEmptyCopy(lastOpenTag);
867 logEvent(EVENT_CONVERTED_TO_EMPTY_TAG, newEmptyTag);
868 } else {
869
870
871 final TagConstituent matchingEndTag = new TagConstituent(
872 TagConstituent.END_TAG, lastOpenTagName,
873 endTag.getMarkupSeriesNo());
874 endTag.insertBefore(matchingEndTag);
875
876 if (endTagMissing) {
877
878
879 if (lastOpenTagVariety == TagVariety.REGULAR) {
880 logEvent(EVENT_INSERTED_MISSING_END_TAG,
881 matchingEndTag);
882 }
883 } else {
884
885
886 final TagConstituent continuation = new TagConstituent(
887 TagConstituent.START_TAG, lastOpenTagName,
888 lastOpenTag.getRepresentantion(),
889 endTag.getMarkupSeriesNo());
890 continuation.setVariety(TagVariety.CONTINUATION);
891
892
893 endTag.insertAfter(continuation);
894 unprocessedTags.push(continuation, false);
895 logEvent(EVENT_SPLIT_TAG, lastOpenTag);
896 }
897
898
899 openTags.popAndRegularize();
900 }
901 }
902 }
903 }
904
905 /***
906 * Helper method for handling an the end of a file. Suitable end tags are
907 * created to close any left-over open tags.
908 *
909 * @param lastConst the last constituent in the original input (must not
910 * have a successor)
911 * @param openTags must contain all currently open tags
912 * @param unprocessedTags must contain all unprocessed start and end tags;
913 * should better be empty
914 * @param insertedMissingRoot whether the start tag of a missing root
915 * element was created (in this case we'll insert the corresponding end tag
916 * without logging another event)
917 * @throws ParsingException might be thrown by {@link #checkEvent(String)}
918 * implementations in subclasses if an "illicit" event occurred
919 */
920 protected void handleEOF(final XMLConstituent lastConst,
921 final OpenTags openTags, final UnprocessedTags unprocessedTags,
922 final boolean insertedMissingRoot) throws ParsingException {
923
924 if (lastConst.nextConstituent() != null) {
925 Util.LOG.error("Implementation error: constituents after last "
926 + "constituent: " + lastConst);
927 }
928 if (!unprocessedTags.isEmpty()) {
929 Util.LOG.error("Implementation error: still unprocessed tags at "
930 + "end-of-file -- last constituent: " + lastConst
931 + " unprocessed tags: " + unprocessedTags);
932 }
933
934 XMLConstituent currentConst = lastConst;
935 TagConstituent lastOpenTag = openTags.peek();
936
937 if (lastOpenTag != null) {
938
939 while (!isRootContent(currentConst)) {
940 currentConst = currentConst.previousConstituent();
941 }
942
943
944 TagConstituent missingEndTag;
945 while (lastOpenTag != null) {
946
947 missingEndTag = new TagConstituent(TagConstituent.END_TAG,
948 lastOpenTag.getName());
949
950
951 currentConst.insertAfter(missingEndTag);
952 currentConst = missingEndTag;
953
954 openTags.pop();
955 lastOpenTag = openTags.peek();
956
957
958 if (!(insertedMissingRoot && openTags.isEmpty())) {
959 logEvent(EVENT_INSERTED_MISSING_END_TAG, missingEndTag);
960 }
961 }
962 }
963 }
964
965 /***
966 * Logs the occurance of an event necessary for fixing a document.
967 * This method variant is used for character errors.
968 *
969 * @param eventType the event that occurred; should be one of the
970 * EVENT constants defined in this class.
971 * @param details a detailed description of the event
972 * @throws ParsingException might be thrown by {@link #checkEvent(String)}
973 * implementations in subclasses if the event is considered illicit
974 */
975 protected void logEvent(final String eventType, final String details)
976 throws ParsingException {
977 checkEvent(eventType);
978 Util.LOG.debug("Modified document: " + eventType
979 + " (" + details.toString() + ')');
980 }
981
982 /***
983 * Logs the occurance of an event necessary for fixing a document.
984 * This method variant is used for nesting errors and missing root elements.
985 *
986 * @param eventType the event that occurred; should be one of the
987 * EVENT constants defined in this class.
988 * @param tag the involved tag
989 * @throws ParsingException might be thrown by {@link #checkEvent(String)}
990 * implementations in subclasses if the event is considered illicit
991 */
992 protected void logEvent(final String eventType, final TagConstituent tag)
993 throws ParsingException {
994 checkEvent(eventType);
995 Util.LOG.debug("Modified document: " + eventType
996 + " (" + tag.toString() + ')');
997 }
998
999 /***
1000 * Inserts a root element with the configured name
1001 * ({@link #missingRootName}); or throws a parsing exception to finish
1002 * processing if inserting a root element is not allowed. This method
1003 * only creates and inserts a start tag of the configured type; the
1004 * matching end tag will automatically be added by
1005 * {@link #handleEOF(XMLConstituent, OpenTags, UnprocessedTags, boolean)}.
1006 *
1007 * @param firstRootContent the root start will be inserted before this
1008 * content
1009 * @param openTags the container of open tags to push the root tag to,
1010 * assumed to be empty
1011 * @throws ParsingException if inserting a missing root tag is now allowed
1012 * (no name for a missing root tag specified)
1013 */
1014 private void insertMissingRoot(final XMLConstituent firstRootContent,
1015 final OpenTags openTags) throws ParsingException {
1016 if (missingRootName != null) {
1017
1018 final TagConstituent rootStartTag = new TagConstituent(
1019 TagConstituent.START_TAG, missingRootName, 0);
1020
1021
1022 firstRootContent.insertBefore(rootStartTag);
1023
1024
1025 openTags.push(rootStartTag);
1026 logEvent(EVENT_INSERTED_MISSING_ROOT_ELEMENT, rootStartTag);
1027 } else {
1028 throw new ParsingException("Root tag missing (resp. tags or "
1029 + "textual content outside root element)");
1030 }
1031 }
1032
1033 /***
1034 * Whether the specified tag is one of the tags that can be converted an
1035 * empty tags when required for fixing a document. For example, "br" when
1036 * <code><br></code> may be converted to <code><br/></code>
1037 * during repair.
1038 *
1039 * @param tag the name of the tag to look up
1040 * @return <code>true</code> iff this tag is contained in the set of
1041 * emptiable tags
1042 */
1043 protected boolean isAnEmptiableTag(final String tag) {
1044 return (emptiableTags != null) && emptiableTags.contains(tag);
1045 }
1046
1047 /***
1048 * Whether {@link #CONTROL_CHARS control characters} are deleted (these
1049 * characters are not allowed in XML 1.0 and discouraged in XML 1.1).
1050 * @return <code>true</code> iff control characters are deleted
1051 */
1052 public boolean isDeletingControlChars() {
1053 return deletingControlChars;
1054 }
1055
1056 /***
1057 * Whether "pseudo-tags" are deleted, i.e., sequences that cannot be parsed
1058 * as tags but look similar to them. "Pseudo-tags" start with '<' and end
1059 * with '>', contain a printable character after the initial '<', and
1060 * do not contain any inner '<' or '>'). Disabled by default.
1061 *
1062 * @return <code>true</code> iff pseudo-tags are be deleted (otherwise the
1063 * starting '<' is escaped)
1064 */
1065 public boolean isDeletingPseudoTags() {
1066 return deletingPseudoTags;
1067 }
1068
1069 /***
1070 * Whether to escape "&" starting a possible nonstandard entity
1071 * reference ("&" at the start of one of the 5 predefined entity
1072 * references or a character reference is never escaped, all other "&"
1073 * are always escaped).
1074 *
1075 * @return <code>true</code> iff "pseudo entites" are escaped
1076 */
1077 public boolean isEscapingPseudoEntities() {
1078 return escapingPseudoEntities;
1079 }
1080
1081 /***
1082 * Checks whether a constituent is content that must be enclosed in the
1083 * root element. Tags, textual content, and CDATA sections are root
1084 * content; other constituent types can occur outside of the root element.
1085 *
1086 * @param constituent the constituent to check
1087 * @return <code>true</code> iff the constituent is root content
1088 */
1089 private boolean isRootContent(final XMLConstituent constituent) {
1090 final boolean result;
1091 if (constituent instanceof TagConstituent) {
1092
1093 result = true;
1094 } else {
1095 if ((constituent.getType() == OtherConstituent.CDATA_SECTION)
1096 || (constituent.getType() == OtherConstituent.TEXT)) {
1097
1098 result = true;
1099 } else {
1100 result = false;
1101 }
1102 }
1103 return result;
1104 }
1105
1106 /***
1107 * Helper method that moves a tentative tag after a specified end tag.
1108 * The tentative tag must originally be contained in the container of open
1109 * tags; it is popped from this container and re-added to the container of
1110 * unprocessed tags.
1111 *
1112 * @param tentativeTag the tag to move, should be a tentative tag
1113 * @param endTag the tentative tag is moved after this tag
1114 * @param openTags must contain all currently open tags, including the tag
1115 * to move
1116 * @param unprocessedTags must contain all unprocessed start and end tags
1117 */
1118 private void moveTentativeTag(final TagConstituent tentativeTag,
1119 final TagConstituent endTag, final OpenTags openTags,
1120 final UnprocessedTags unprocessedTags) {
1121
1122 openTags.forceRemove(tentativeTag);
1123 tentativeTag.remove();
1124
1125
1126 tentativeTag.setMarkupSeriesNo(endTag.getMarkupSeriesNo());
1127 endTag.insertAfter(tentativeTag);
1128
1129
1130 unprocessedTags.push(tentativeTag, false);
1131 }
1132
1133 /***
1134 * Checks whether escapes for the "&" character are required in the
1135 * representation of a constituent.
1136 *
1137 * @param constituent the constituent to check
1138 * @return <code>true</code> if the {@linkplain XMLConstituent#getType()
1139 * type} of the constituent is {@link TagConstituent#START_TAG},
1140 * {@link TagConstituent#EMPTY_TAG}, or {@link OtherConstituent#TEXT};
1141 * <code>false</code> for all other types (which either cannot contain any
1142 * "&" or do not need to escape "&")
1143 */
1144 private boolean needsAmpEscape(final XMLConstituent constituent) {
1145 final short constType = constituent.getType();
1146 return (constType == TagConstituent.START_TAG)
1147 || (constType == TagConstituent.EMPTY_TAG)
1148 || (constType == OtherConstituent.TEXT);
1149 }
1150
1151 /***
1152 * Returns the raw constituents of an XML-like document. The constituents
1153 * are returned "raw" as they occur in the input, without fixing possible
1154 * nesting errors etc.
1155 *
1156 * @param input the XML-like input data
1157 * @param fixCharacterErrors whether to try to fix character errors, i.e.
1158 * unescaped "<" and "&" and tags with unquoted attribute values; if
1159 * <code>false</code>, unescaped "<" in textual content and unquoted
1160 * attribute values will yield an exception, while any unescaped "&"
1161 * and unescaped "<" in attribute values will be ignored
1162 * @return a reference to the first contained constituent; the list
1163 * of constituents can be traversed by calling
1164 * {@link XMLConstituent#nextConstituent()} on each constituent untill
1165 * <code>null</code> is returned
1166 * @throws ParsingException if the XML input contains an uncorrectable error
1167 */
1168 public final XMLConstituent rawConstituents(final CharSequence input,
1169 final boolean fixCharacterErrors) throws ParsingException {
1170 return rawConstituents(input, fixCharacterErrors, null);
1171 }
1172
1173 /***
1174 * Returns the raw constituents of an XML-like document. The constituents
1175 * are returned "raw" as they occur in the input, without fixing possible
1176 * nesting errors etc.
1177 *
1178 * @param input the XML-like input data
1179 * @param fixCharacterErrors whether to try to fix character errors, i.e.
1180 * unescaped "<" and "&" and tags with unquoted attribute values; if
1181 * <code>false</code>, unescaped "<" in textual content and unquoted
1182 * attribute values will yield an exception, while any unescaped "&"
1183 * and unescaped "<" in attribute values will be ignored
1184 * @param startAndEndTags all start and end tags are added to this
1185 * container, if it isn't <code>null</code>
1186 * @return a reference to the first contained constituent; the list
1187 * of constituents can be traversed by calling
1188 * {@link XMLConstituent#nextConstituent()} on each constituent untill
1189 * <code>null</code> is returned
1190 * @throws ParsingException if the XML input contains an uncorrectable error
1191 */
1192 protected final XMLConstituent rawConstituents(final CharSequence input,
1193 final boolean fixCharacterErrors,
1194 final UnprocessedTags startAndEndTags) throws ParsingException {
1195
1196
1197 final TextTokenizer tokenizer =
1198 XMLTokenizerFactory.createXMLTokenizer(input, !fixCharacterErrors);
1199 String tokenText;
1200 String precWS;
1201 String captured = null;
1202 boolean done = false;
1203 XMLConstituent firstConst = null;
1204 XMLConstituent priorConst = null;
1205 XMLConstituent currentConst;
1206
1207
1208 boolean inMarkupSeries = true;
1209
1210 int markupSeriesNo = 0;
1211
1212
1213 boolean fixedCharError;
1214 XMLConstituent extraConst = null;
1215 String extraWS = null;
1216
1217 try {
1218 while (!done) {
1219 tokenText = tokenizer.nextToken();
1220 if (tokenText == null) {
1221 done = true;
1222 } else {
1223
1224 captured = tokenizer.capturedText();
1225 }
1226
1227 if (tokenizer.hasPrecedingWhitespace()) {
1228 precWS = tokenizer.precedingWhitespace();
1229 if (fixCharacterErrors
1230 && (!tokenizer.precedingWhitespaceIsValid())) {
1231
1232 fixedCharError = false;
1233 if (precWS.endsWith("<") && (captured.length() == 0)) {
1234
1235 final String remainingWS = precWS.substring(0,
1236 precWS.length() - 1);
1237 final int gtPos = tokenText.indexOf('>');
1238
1239 if (tokenizer.isValidWhitespace(remainingWS)
1240 && gtPos > 0) {
1241
1242
1243 final Object[] fixed
1244 = tryToFixTag('<' + tokenText);
1245 if (fixed != null) {
1246
1247 extraConst = (TagConstituent) fixed[0];
1248 tokenText = (String) fixed[1];
1249 fixedCharError = true;
1250 precWS = remainingWS;
1251 } else if (deletingPseudoTags) {
1252
1253 logEvent(EVENT_DELETED_PSEUDO_TAG, '<'
1254 + tokenText.substring(0, gtPos + 1));
1255 tokenText = tokenText.substring(gtPos + 1);
1256 fixedCharError = true;
1257 precWS = remainingWS;
1258 }
1259 }
1260 }
1261
1262 if (fixedCharError) {
1263
1264
1265 final int initWS =
1266 tokenizer.initialWhitespaceCount(tokenText);
1267 if (initWS > 0) {
1268 if (extraConst != null) {
1269
1270 extraWS = tokenText.substring(0, initWS);
1271 } else {
1272
1273 precWS += tokenText.substring(0, initWS);
1274 }
1275 tokenText = tokenText.substring(initWS);
1276 }
1277 } else {
1278
1279 String escape = StringEscapeUtils.escapeXml(precWS);
1280 final int initWS =
1281 tokenizer.initialWhitespaceCount(escape);
1282
1283
1284 if (initWS > 0) {
1285 precWS = escape.substring(0, initWS);
1286 escape = escape.substring(initWS);
1287 } else {
1288 precWS = "";
1289 }
1290 logEvent(EVENT_ESCAPED_CHARS, escape);
1291
1292 if (captured.length() == 0) {
1293
1294 tokenText = escape + tokenText;
1295 } else {
1296 final int trailingWSChars =
1297 tokenizer.trailingWhitespaceCount(escape);
1298 if (trailingWSChars > 0) {
1299
1300 extraWS = escape.substring(
1301 escape.length() - trailingWSChars);
1302 escape = escape.substring(0,
1303 escape.length() - trailingWSChars);
1304 }
1305
1306 extraConst = new OtherConstituent(
1307 OtherConstituent.TEXT, escape);
1308 }
1309 }
1310 }
1311
1312
1313
1314 if (precWS.length() > 0) {
1315 currentConst = new OtherConstituent(
1316 OtherConstituent.OUTER_WHITESPACE, precWS);
1317
1318 if (priorConst == null) {
1319 firstConst = currentConst;
1320 } else {
1321 priorConst.insertAfter(currentConst);
1322 }
1323 priorConst = currentConst;
1324 }
1325
1326 if (extraConst != null) {
1327
1328 if ((extraConst.getType() == OtherConstituent.TEXT)
1329 || (extraConst.getType()
1330 == OtherConstituent.CDATA_SECTION)) {
1331 inMarkupSeries = false;
1332 } else {
1333 if (!inMarkupSeries) {
1334 inMarkupSeries = true;
1335 markupSeriesNo++;
1336 }
1337 if (extraConst instanceof TagConstituent) {
1338
1339 ((TagConstituent) extraConst)
1340 .setMarkupSeriesNo(markupSeriesNo);
1341
1342
1343 if ((startAndEndTags != null)
1344 && (extraConst.getType()
1345 != TagConstituent.EMPTY_TAG)) {
1346 startAndEndTags.push(
1347 (TagConstituent) extraConst);
1348 }
1349 }
1350 }
1351 if (fixCharacterErrors) {
1352
1353 fixRepresentation(extraConst);
1354 }
1355
1356 if (priorConst == null) {
1357 firstConst = extraConst;
1358 } else {
1359 priorConst.insertAfter(extraConst);
1360 }
1361 priorConst = extraConst;
1362 extraConst = null;
1363
1364 if (extraWS != null) {
1365
1366 currentConst = new OtherConstituent(
1367 OtherConstituent.OUTER_WHITESPACE,
1368 extraWS);
1369 priorConst.insertAfter(currentConst);
1370 priorConst = currentConst;
1371 extraWS = null;
1372 }
1373 }
1374 }
1375
1376
1377 if ((!done) && (tokenText.length() > 0)) {
1378
1379 if ((captured.length() == 0)
1380 || (captured.equals("[CDATA"))) {
1381
1382
1383 if (inMarkupSeries) {
1384 inMarkupSeries = false;
1385 }
1386 if (captured.length() == 0) {
1387 currentConst = new OtherConstituent(
1388 OtherConstituent.TEXT, tokenText);
1389 } else {
1390 currentConst = new OtherConstituent(
1391 OtherConstituent.CDATA_SECTION, tokenText);
1392 }
1393 } else {
1394
1395 if (!inMarkupSeries) {
1396 inMarkupSeries = true;
1397 markupSeriesNo++;
1398 }
1399 currentConst = createMarkupConstituent(tokenText,
1400 captured, markupSeriesNo, startAndEndTags);
1401 }
1402 if (fixCharacterErrors) {
1403
1404 fixRepresentation(currentConst);
1405 }
1406
1407 if (priorConst == null) {
1408 firstConst = currentConst;
1409 } else {
1410 priorConst.insertAfter(currentConst);
1411 }
1412 priorConst = currentConst;
1413 }
1414 }
1415 } catch (IllegalArgumentException iae) {
1416
1417 throw new ParsingException(
1418 "Uncorrectable error in XML input: " + iae.getMessage());
1419 }
1420 return firstConst;
1421 }
1422
1423 /***
1424 * Replaces a start tag with an copy that is an empty tag. This means,
1425 * the representation of the created copy ends in "/>" instead of ">"
1426 * and its type is {@link TagConstituent#EMPTY_TAG} instead of
1427 * {@link TagConstituent#START_TAG}.
1428 *
1429 * <p>If the original tag is part of a list, it is removed and the copy
1430 * is inserted instead.
1431 *
1432 * @param startTag the start tag to replace
1433 * @return the created copy
1434 * @throws IllegalArgumentException if the specified tag is not a valid
1435 * start tag
1436 */
1437 private TagConstituent replaceWithEmptyCopy(final TagConstituent startTag)
1438 throws IllegalArgumentException {
1439 if (startTag.getType() != TagConstituent.START_TAG) {
1440 throw new IllegalArgumentException(
1441 "Tag to replace must be a start tag (actual type: "
1442 + startTag.getType() + ')');
1443 }
1444 final StringBuffer representation =
1445 new StringBuffer(startTag.getRepresentantion());
1446 final int endMarker = representation.lastIndexOf(">");
1447 if (endMarker < 0) {
1448 throw new IllegalArgumentException(
1449 "Start tag representation is invalid: '>' missing!");
1450 }
1451
1452
1453 representation.insert(endMarker, '/');
1454 final TagConstituent result =
1455 new TagConstituent(TagConstituent.EMPTY_TAG, startTag.getName(),
1456 representation.toString(), startTag.getMarkupSeriesNo());
1457
1458
1459 final XMLConstituent prevConst = startTag.previousConstituent();
1460 final XMLConstituent nextConst = startTag.nextConstituent();
1461 startTag.remove();
1462
1463 if (prevConst != null) {
1464 prevConst.insertAfter(result);
1465 } else if (nextConst != null) {
1466
1467 result.insertAfter(nextConst);
1468 }
1469
1470
1471
1472 return result;
1473 }
1474
1475 /***
1476 * Returns a string representation of this object.
1477 *
1478 * @return a textual representation
1479 */
1480 public String toString() {
1481 return new ToStringBuilder(this)
1482 .append("missing root name", missingRootName)
1483 .append("emptiable tags", emptiableTags)
1484 .append("delete control characters", deletingControlChars)
1485 .append("delete pseudo-tags", deletingPseudoTags)
1486 .append("escape pseudo-entities", escapingPseudoEntities)
1487 .toString();
1488 }
1489
1490 /***
1491 * Helper method that tries to parse a string as an XML start or empty tag
1492 * that contains unquoted attribute values (might be followed by other
1493 * text). If this is the case, the unquoted values are fixed and a suitable
1494 * tag constituent is created.
1495 *
1496 * <p>Only call this method when you're sure that the input does not contain
1497 * a <em>valid</em> tag, i.e. if there must be at least one unquoted value.
1498 * This method doesn't store a markup series in the tag.
1499 *
1500 * @param text the text to parse
1501 * @return an array of two elements: the created {@link TagConstituent}
1502 * and a String containing the unused rest of the input text (might be empty
1503 * but not <code>null</code>; or <code>null</code> if the text couldn't be
1504 * parsed as a text.
1505 * @throws ParsingException might be thrown by {@link #checkEvent(String)}
1506 * implementations in subclasses if an "illicit" event occurred
1507 */
1508 private Object[] tryToFixTag(final String text) throws ParsingException {
1509 final Object[] result;
1510 final Matcher laxMatcher = LAX_START_OR_EMPTY_TAG.matcher(text);
1511
1512 if (laxMatcher.lookingAt()) {
1513
1514
1515 final String tagName = laxMatcher.group(1);
1516 final short tagType;
1517
1518 result = new Object[2];
1519
1520 result[1] = text.substring(laxMatcher.end());
1521 String oldTagRep = laxMatcher.group();
1522
1523
1524 final String lastGroup = laxMatcher.group(laxMatcher.groupCount());
1525
1526 if (lastGroup == null) {
1527 tagType = TagConstituent.START_TAG;
1528 } else if ("/".equals(lastGroup)) {
1529 tagType = TagConstituent.EMPTY_TAG;
1530 } else {
1531
1532 throw new RuntimeException("Implementation error: last group of"
1533 + " lax tag '" + laxMatcher.group() + "' is '"
1534 + lastGroup + "' instead of '/' or null");
1535 }
1536
1537 StringBuilder newTagRep = null;
1538 String unquotedValue;
1539
1540
1541 while (laxMatcher.groupCount() > 2
1542 && (laxMatcher.group(3) != null)) {
1543
1544
1545 newTagRep = new StringBuilder(oldTagRep.substring(0,
1546 laxMatcher.start(3)));
1547 unquotedValue = laxMatcher.group(3);
1548
1549
1550 if (unquotedValue.startsWith("\"")
1551 || unquotedValue.startsWith("'")) {
1552 unquotedValue = unquotedValue.substring(1);
1553 }
1554 if (unquotedValue.endsWith("\"")
1555 || unquotedValue.endsWith("'")) {
1556 unquotedValue = unquotedValue.substring(0,
1557 unquotedValue.length() - 1);
1558 }
1559
1560
1561 unquotedValue = TextUtils.replaceAll(unquotedValue,
1562 Pattern.compile("\""), """);
1563
1564
1565 newTagRep.append('"');
1566 newTagRep.append(unquotedValue);
1567 newTagRep.append('"');
1568
1569
1570 newTagRep.append(oldTagRep.substring(laxMatcher.end(3),
1571 laxMatcher.end()));
1572
1573
1574 laxMatcher.reset(newTagRep);
1575 if (!laxMatcher.matches()) {
1576
1577 throw new RuntimeException("Implementation error while "
1578 + "trying to fix unquoted attribute values: '"
1579 + newTagRep + "' is no longer parsable as a tag");
1580 }
1581 oldTagRep = newTagRep.toString();
1582 }
1583
1584
1585 result[0] = new TagConstituent(tagType, tagName,
1586 newTagRep.toString());
1587 logEvent(EVENT_QUOTED_ATTRIBUTE_VALUES, newTagRep.toString());
1588 } else {
1589 result = null;
1590 }
1591 return result;
1592 }
1593
1594 }