View Javadoc

1   /*
2    * Copyright (C) 2003-2006 Christian Siefkes <christian@siefkes.net>.
3    * Development of this software is supported by the German Research Society,
4    * Berlin-Brandenburg Graduate School in Distributed Information Systems
5    * (DFG grant no. GRK 316).
6    *
7    * This program is free software; you can redistribute it and/or modify
8    * it under the terms of the GNU General Public License as published by
9    * the Free Software Foundation; either version 2 of the License, or
10   * (at your option) any later version.
11   *
12   * This program is distributed in the hope that it will be useful,
13   * but WITHOUT ANY WARRANTY; without even the implied warranty of
14   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15   * GNU General Public License for more details.
16   *
17   * You should have received a copy of the GNU General Public License
18   * along with this program; if not, visit
19   * http://www.gnu.org/licenses/gpl.html or write to the Free Software
20   * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
21   */
22  package de.fu_berlin.ties.xml;
23  
24  import java.io.IOException;
25  import java.io.Reader;
26  import java.io.Writer;
27  import java.util.Set;
28  import java.util.regex.Matcher;
29  import java.util.regex.Pattern;
30  
31  import org.apache.commons.lang.StringEscapeUtils;
32  import org.apache.commons.lang.builder.ToStringBuilder;
33  
34  import de.fu_berlin.ties.ContextMap;
35  import de.fu_berlin.ties.ParsingException;
36  import de.fu_berlin.ties.TextProcessor;
37  import de.fu_berlin.ties.TiesConfiguration;
38  
39  import de.fu_berlin.ties.io.IOUtils;
40  import de.fu_berlin.ties.text.TextTokenizer;
41  import de.fu_berlin.ties.text.TextUtils;
42  import de.fu_berlin.ties.util.CollUtils;
43  import de.fu_berlin.ties.util.Util;
44  
45  /***
46   * This class tries to fix corrupt XML documents, especially documents
47   * containing nesting errors. Instances of this class are thread-safe and
48   * can fix several documents in parallel.
49   *
50   * @author Christian Siefkes
51   * @version $Revision: 1.24 $, $Date: 2006/10/21 16:04:29 $, $Author: siefkes $
52   */
53  public class XMLAdjuster extends TextProcessor {
54  
55      /***
56       * Configuration key: the name to use for the root element if missing.
57       */
58      public static final String CONFIG_MISSING_ROOT = "adjust.missing-root";
59  
60      /***
61       * Configuration key: Set of names of tags that can be converted empty tags
62       * when required.
63       */
64      public static final String CONFIG_EMPTIABLE_TAGS = "adjust.emptiable-tags";
65  
66      /***
67       * Configuration key: whether to delete
68       * {@link #CONTROL_CHARS control characters} (which are not allowed in
69       * XML 1.0 and discouraged in XML 1.1).
70       */
71      public static final String CONFIG_DELETE_CONTROL_CHARS =
72          "adjust.delete.control-chars";
73  
74      /***
75       * Configuration key: whether to delete "pseudo-tags".
76       */
77      public static final String CONFIG_DELETE_PSEUDO_TAGS =
78          "adjust.delete.pseudo-tags";
79  
80      /***
81       * Configuration key: whether to delete trailing garbage (illegal content
82       * that occurs after the root tag has been closed).
83       */
84      public static final String CONFIG_DELETE_TRAILING_GARBAGE =
85          "adjust.delete.trailing-garbage";
86  
87      /***
88       * Configuration key: whether to escape "&amp;" starting a possible
89       * nonstandard entity reference ("&amp;" at the start of one of the 5
90       * predefined entity references or a character reference is never escaped,
91       * all other "&amp;" are always escaped).
92       */
93      public static final String CONFIG_ESCAPE_PSEUDO_ENTITIES =
94          "adjust.escape.pseudo-entities";
95  
96      /***
97       * Pattern string specifying characters that can occur at the start of end
98       * of an unquoted attribute value: everything except '&lt;', '&gt;', '='
99       * and whitespace (whitespace is also allowed, but only in the middle of a
100      * value). Evaluated lazily (reluctant) to avoid missing the "/&gt;" at
101      * the end of an empty tag.
102      */
103     public static final String UNQUOTED_ATTRIB_CHARS =
104         "[^<>=" + XMLTokenizerFactory.XML_WHITESPACE_CHARS + "]+?";
105 
106     /***
107      * Pattern string specifying an XML attribute without proper quotes.
108      * Equal sign and value are captured in groups; the value is matched lazily
109      * (reluctant) so we won't miss the start of the next attribute.
110      */
111     public static final String UNQUOTED_ATTRIBUTE = XMLTokenizerFactory.XML_NAME
112         + XMLTokenizerFactory.XML_OPT_WHITESPACE + "(=)"
113         + XMLTokenizerFactory.XML_OPT_WHITESPACE + "((?:"
114         + UNQUOTED_ATTRIB_CHARS + XMLTokenizerFactory.XML_WHITESPACE + ")*?"
115         + UNQUOTED_ATTRIB_CHARS + ")";
116 
117     /***
118      * Pattern specifying of a "lax" XML start or empty tag that can contain
119      * unquoted (invalid) attributes (combined into a single pattern to avoid
120      * unnecessary backtracking). Element name, equal signs and values of
121      * the (last) unquoted attribute, and '/' (for empty tags) are captured.
122      */
123     public static final Pattern LAX_START_OR_EMPTY_TAG = Pattern.compile(
124         "<(" + XMLTokenizerFactory.XML_NAME + ")" + "(?:"
125         + XMLTokenizerFactory.XML_WHITESPACE + "(?:"
126         + XMLTokenizerFactory.XML_ATTRIBUTE + "|" + UNQUOTED_ATTRIBUTE + "))*"
127         + XMLTokenizerFactory.XML_OPT_WHITESPACE + "(/)?>");
128 
129     /***
130      * A "&amp;" that is not the start of an predefined entity reference or
131      * a character reference and thus should be escaped if
132      * {@link #isEscapingPseudoEntities()} is <code>true</code>.
133      * (A pattern matching the rest of predefined entity or character reference
134      * is included via negative lookahead.)
135      */
136     public static final Pattern PSEUDO_AMP = Pattern.compile(
137         "&(?!(?:amp|lt|gt|apos|quot|#[0-9]+|#x[0-9a-fA-F]+);)");
138 
139     /***
140      * A "&amp;" that is not the start of an entity and thus must be escaped.
141      * (A pattern matching the rest of legal entity or character reference is
142      * included via negative lookahead.)
143      */
144     public static final Pattern SPURIOUS_AMP = Pattern.compile("&(?!(?:"
145         + XMLTokenizerFactory.XML_NAME + "|#[0-9]+|#x[0-9a-fA-F]+);)");
146 
147     /***
148      * Escape sequence for the "&amp;" character.
149      */
150     public static final String ESCAPED_AMP = "&amp;";
151 
152     /***
153      * Pattern specifying sequences of control characters (character codes
154      * below the space character, except tab, line feed and carriage return).
155      */
156     public static final Pattern CONTROL_CHARS =
157         Pattern.compile("[\u0001-\u0008\u000B-\u000C\u000E-\u001F]+");
158 
159     /***
160      * Event constant: Converted to empty tag.
161      */
162     protected static final String EVENT_CONVERTED_TO_EMPTY_TAG =
163         "Converted to empty tag";
164 
165     /***
166      * Event constant: Inserted missing end tag.
167      */
168     protected static final String EVENT_INSERTED_MISSING_END_TAG  =
169         "Inserted missing end tag";
170 
171     /***
172      * Event constant: Inserted missing root element.
173      */
174     protected static final String EVENT_INSERTED_MISSING_ROOT_ELEMENT  =
175         "Inserted missing root element";
176 
177     /***
178      * Event constant: Inserted missing start tag.
179      */
180     protected static final String EVENT_INSERTED_MISSING_START_TAG  =
181         "Inserted missing start tag";
182 
183     /***
184      * Event constant: Moved end tag up.
185      */
186     protected static final String EVENT_MOVED_END_TAG_UP =
187         "Moved end tag up";
188 
189     /***
190      * Event constant: Moved start tag dow.
191      */
192     protected static final String EVENT_MOVED_START_TAG_DOWN  =
193         "Moved start tag down";
194 
195     /***
196      * Event constant: Split tag.
197      */
198     protected static final String EVENT_SPLIT_TAG  =
199         "Split tag";
200 
201     /***
202      * Event constant: Deleted control characters.
203      */
204     protected static final String EVENT_DELETED_CONTROL_CHARS  =
205         "Deleted control characters";
206 
207     /***
208      * Event constant: Deleted pseudo-tag.
209      */
210     protected static final String EVENT_DELETED_PSEUDO_TAG  =
211         "Deleted pseudo-tag";
212 
213     /***
214      * Event constant: Escaped characters that are illegal or unwanted.
215      */
216     protected static final String EVENT_ESCAPED_CHARS  =
217         "Escaped characters";
218 
219     /***
220      * Event constant: Quoted attribute values.
221      */
222     protected static final String EVENT_QUOTED_ATTRIBUTE_VALUES  =
223         "Quoted attribute values";
224 
225     /***
226      * Event constant: Deleted trailing garbage.
227      */
228     protected static final String EVENT_DELETED_TRAILING_GARBAGE  =
229         "Deleted trailing garbage";
230 
231     /***
232      * The name to use for the root element if missing. A root element
233      * with this name is created when not all elements and textual
234      * content are inclosed within a single element (the root).
235      * If <code>null</code> and {@link #deletingTrailingGarbage}, processing
236      * stops with an exception if the root element is missing.
237      */
238     private final String missingRootName;
239 
240     /***
241      * Whether to delete trailing garbage (illegal content that occurs after
242      * the root tag has been closed).
243      */
244     private final boolean deletingTrailingGarbage;
245 
246     /***
247      * Contains the names (Strings) of tags that can be converted an empty tags
248      * when required for fixing a document (e.g. "br" when
249      * <code>&lt;br&gt;</code> may be converted to <code>&lt;br/&gt;</code>
250      * during repair). Might be <code>null</code>.
251      */
252     private final Set<String> emptiableTags;
253 
254     /***
255      * Whether to delete {@link #CONTROL_CHARS control characters} (which are
256      * not allowed in XML 1.0 and discouraged in XML 1.1).
257      */
258     private final boolean deletingControlChars;
259 
260     /***
261      * Whether to delete "pseudo-tags", i.e., sequences that cannot be parsed as
262      * tags but look similar to them. "Pseudo-tags" start with '&lt;' and end
263      * with '&gt;', contain a printable character after the initial '&lt;', and
264      * do not contain any inner '&lt;' or '&gt;'). If <code>true</code>, such
265      * sequences will be deleted; otherwise (default) the starting '&lt;' will
266      * be escaped.
267      */
268     private final boolean deletingPseudoTags;
269 
270     /***
271      * Whether to escape "&amp;" starting a possible nonstandard entity
272      * reference ("&amp;" at the start of one of the 5 predefined entity
273      * references or a character reference is never escaped, all other "&amp;"
274      * are always escaped).
275      */
276     private final boolean escapingPseudoEntities;
277 
278     /***
279      * Creates a new instance using a default extension and the
280      * {@linkplain TiesConfiguration#CONF standard configuration}.
281      */
282     public XMLAdjuster() {
283         this("xml");
284     }
285 
286     /***
287      * Creates a new instance, configured from the
288      * {@linkplain TiesConfiguration#CONF standard configuration}.
289      * @param outExt the extension to use for output files
290      */
291     public XMLAdjuster(final String outExt) {
292         this(outExt, TiesConfiguration.CONF);
293     }
294 
295     /***
296      * Creates a new instance from the provided configuration.
297      * @param outExt the extension to use for output files
298      * @param config used to configure this instance
299      */
300     public XMLAdjuster(final String outExt, final TiesConfiguration config) {
301         this(outExt, config.getString(CONFIG_MISSING_ROOT, null),
302             config.containsKey(CONFIG_EMPTIABLE_TAGS)
303                     ? CollUtils.arrayAsSet(config.getStringArray(
304                             CONFIG_EMPTIABLE_TAGS))
305                     : null,
306             config.getBoolean(CONFIG_DELETE_CONTROL_CHARS),
307             config.getBoolean(CONFIG_DELETE_PSEUDO_TAGS),
308             config.getBoolean(CONFIG_DELETE_TRAILING_GARBAGE),
309             config.getBoolean(CONFIG_ESCAPE_PSEUDO_ENTITIES), config);
310     }
311 
312     /***
313      * Creates a new instance.
314      *
315      * @param outExt the extension to use for output files
316      * @param missingRoot the name to use for the root element if missing, i.e.
317      * if not all elements and textual content are inclosed within a single
318      * element (the root); if <code>null</code>, processing stops with an
319      * exception if the root element is missing
320      * @param emptiableTagSet contains the names (Strings) of tags that can be
321      * converted an empty tags when required for fixing a document (e.g. "br"
322      * when <code>&lt;br&gt;</code> may be converted to
323      * <code>&lt;br/&gt;</code> during repair); might be <code>null</code> if
324      * there are none
325      * @param deleteControlChars whether to delete
326      * {@link #CONTROL_CHARS control characters} (which are not allowed in XML
327      * 1.0 and discouraged in XML 1.1)
328      * @param deletePseudoTags whether to
329      * {@linkplain #isDeletingPseudoTags() delete "pseudo-tags"}
330      * @param deleteTrailingGarbage whether to delete trailing garbage
331      * (illegal content that occurs after the root tag has been closed)
332      * @param escapePseudoEntities whether to escape "&amp;" starting a possible
333      * nonstandard entity reference ("&amp;" at the start of one of the 5
334      * predefined entity references or a character reference is never escaped,
335      * all other "&amp;" are always escaped)
336      * @param config used to configure superclasses; if <code>null</code>,
337      * the {@linkplain TiesConfiguration#CONF standard configuration} is used
338      */
339     public XMLAdjuster(final String outExt, final String missingRoot,
340             final Set<String> emptiableTagSet, final boolean deleteControlChars,
341             final boolean deletePseudoTags,
342             final boolean deleteTrailingGarbage,
343             final boolean escapePseudoEntities,
344             final TiesConfiguration config) {
345         super(outExt, config);
346         missingRootName = missingRoot;
347         emptiableTags = emptiableTagSet;
348         deletingControlChars = deleteControlChars;
349         deletingPseudoTags = deletePseudoTags;
350         deletingTrailingGarbage = deleteTrailingGarbage;
351         escapingPseudoEntities = escapePseudoEntities;
352     }
353 
354     /***
355      * Tries to fix corrupt XML documents, especially documents
356      * containing nesting errors. Delegates to
357      * {@link #fixedConstituents(CharSequence)} and writes the result to the
358      * specified writer.
359      *
360      * @param input the corrupt XML document
361      * @param out the writer to write the corrected XML document to; flushed but
362      * not closed by this method
363      * @throws IOException if an I/O error occurs while using the writer
364      * @throws ParsingException if the XML input contains an uncorrectable error
365      */
366     public final void adjust(final CharSequence input, final Writer out)
367             throws IOException, ParsingException {
368         final XMLConstituent firstConst = fixedConstituents(input);
369 
370         // write corrected data to out
371         XMLConstituent currentConst = firstConst;
372         while (currentConst != null) {
373             out.write(currentConst.getRepresentantion());
374             currentConst = currentConst.nextConstituent();
375         }
376         out.flush();
377     }
378 
379     /***
380      * Tries to fix corrupt XML documents, especially documents containing
381      * nesting errors. Delegates to {@link #adjust(CharSequence, Writer)}.
382      *
383      * @param in the reader to read the corrupt XML document from; not closed
384      * by this method
385      * @param out the writer to write the corrected XML document to; flushed but
386      * not closed by this method
387      * @throws IOException if an I/O error occurs while using the reader or
388      * writer
389      * @throws ParsingException if the XML input contains an uncorrectable error
390      */
391     public final void adjust(final Reader in, final Writer out)
392             throws IOException, ParsingException {
393         final String input = IOUtils.readToString(in);
394         adjust(input, out);
395     }
396 
397     /***
398      * Method called by the {@link #logEvent(String, String)} methods whenever
399      * an event occurred to ensure the event is acceptable. Subclasses that want
400      * to prevent certain events can overwrite this method and throw an
401      * exception if an "illegal" event is encountered.
402      * This implementation does nothing, letting all events pass.
403      *
404      * @param eventType the event that occurred; should be one of the
405      * EVENT constants defined in this class.
406      * @throws ParsingException could be thrown by subclasses if the event is
407      * considered illicit
408      */
409     protected void checkEvent(final String eventType) throws ParsingException {
410         // can be overwritten by subclasses
411     }
412 
413     /***
414      * Helper method called after completing an end tag with a missing start tag
415      * or processing a tentative start tag to check whether the next appearance
416      * of this tag type is a start tag or another end tag.
417      * The second case means that another start tag is missing
418      * -- the missing start tag is created as a tentative tag and inserted.
419      *
420      * @param endTag the end tag whose next appearance should be checked
421      * @param insertAfter the tag after which the newly created tentative
422      * tag should be inserted
423      * @param unprocessedTags the container of unprocessed start and end tags
424      * @throws ParsingException might be thrown by {@link #checkEvent(String)}
425      * implementations in subclasses if an "illicit" event occurred
426      */
427     private void checkNextAppearance(final TagConstituent endTag,
428             final TagConstituent insertAfter,
429             final UnprocessedTags unprocessedTags) throws ParsingException {
430         // check whether the next appearance is a start tag (ok) or end tag
431         // (insert tentative start tag)
432         if (correspondingEndTag(endTag.getName(), -1, unprocessedTags,
433                 false) != null) {
434             // this means next appearance is another end tag:
435             // create tentative start tag after current end tag
436             final TagConstituent tentative = new TagConstituent(
437                 TagConstituent.START_TAG, endTag.getName(),
438                 insertAfter.getMarkupSeriesNo());
439             tentative.setVariety(TagVariety.TENTATIVE);
440 
441             // insert after specified tag + following whitespace, if any
442             XMLConstituent nextConst = insertAfter.nextConstituent();
443             if (nextConst.getType() == OtherConstituent.OUTER_WHITESPACE) {
444                 nextConst.insertAfter(tentative);
445             } else {
446                 insertAfter.insertAfter(tentative);
447             }
448 
449             // insert at begin of unprocessed tags and log event
450             unprocessedTags.push(tentative, false);
451             logEvent(EVENT_INSERTED_MISSING_START_TAG, tentative);
452         }
453     }
454 
455     /***
456      * Helper method that creates an constituent that is part of a markup series
457      * (i.e., that is neither text nor a CDATA section).
458      *
459      * @param currentToken the string representation of the constituent
460      * @param capturedText the text captured by the XML text tokenizer, used to
461      * determine the token type
462      * @param markupSeriesNo the number of the markup series of the constituent
463      * @param startAndEndTags all start and end tags are added to this
464      * container, if it isn't <code>null</code>
465      * @return the created constituent
466      */
467     private XMLConstituent createMarkupConstituent(final String currentToken,
468             final String capturedText, final int markupSeriesNo,
469             final UnprocessedTags startAndEndTags) {
470         // used to determine the exact token type
471         final char firstChar = capturedText.charAt(0);
472         final char lastChar = capturedText.charAt(capturedText.length() - 1);
473         final XMLConstituent result;
474 
475         if (firstChar == '/') {
476             result = new TagConstituent(TagConstituent.END_TAG,
477                 capturedText.substring(1), currentToken, markupSeriesNo);
478             if (startAndEndTags != null) {
479                 startAndEndTags.push((TagConstituent) result);
480             }
481         } else if (lastChar == '/') {
482             result = new TagConstituent(TagConstituent.EMPTY_TAG,
483                 capturedText.substring(0, capturedText.length() - 1),
484                 currentToken, markupSeriesNo);
485         } else if (firstChar == '!') {
486             result = new OtherConstituent(OtherConstituent.DOCTYPE,
487                 currentToken);
488         } else if (firstChar == '?') {
489             // prolog ("<?xml...") or processing instruction?
490             if (capturedText.substring(1).equals("xml")) {
491                 result = new OtherConstituent(OtherConstituent.XML_PROLOG,
492                     currentToken);
493             } else {
494                 result = new OtherConstituent(OtherConstituent.PI,
495                     currentToken);
496             }
497         } else if (firstChar == '<') {
498             result = new OtherConstituent(OtherConstituent.COMMENT,
499                 currentToken);
500         } else {
501             result = new TagConstituent(TagConstituent.START_TAG, capturedText,
502                 currentToken, markupSeriesNo);
503             if (startAndEndTags != null) {
504                 startAndEndTags.push((TagConstituent) result);
505             }
506         }
507         return result;
508     }
509 
510     /***
511      * Checks whether <code>unprocessedTags</code> contains a
512      * matching tag within the specified <code>markupSeriesNo</code>, not
513      * preceeded by a matching start tag. Returns the found end tag, if it
514      * exists; returns <code>null</code> otherwise.
515      *
516      * @param tagname the name of the tag to check
517      * @param markupSeriesNo the markup series number to match;
518      * or <code>-1</code> if the markup series number doesn't matter
519      * @param unprocessedTags the container of start and end tags to check
520      * @param remove whether to remove the found end tag from the
521      * <code>unprocessedTags</code> container
522      * @return the found corresponding end tag,
523      * or <code>null</code> if no matching appearance was found
524      */
525     private TagConstituent correspondingEndTag(final String tagname,
526             final int markupSeriesNo, final UnprocessedTags unprocessedTags,
527             final boolean remove) {
528         final TagConstituent result;
529         final TagConstituent suitableTag;
530 
531         if (markupSeriesNo >= 0) {
532             suitableTag =
533                 unprocessedTags.findInSeries(tagname, markupSeriesNo, true);
534         } else {
535             suitableTag = unprocessedTags.findFirst(tagname);
536         }
537 
538         // check whether we found a start tag or end tag (if any)
539         if ((suitableTag != null)
540             && (suitableTag.getType() == TagConstituent.END_TAG)) {
541                 // found suitable end tag
542                 result = suitableTag;
543                 if (remove) {
544                     unprocessedTags.forceRemove(suitableTag);
545                 }
546         } else {
547             // no tag or start tag
548             result = null;
549         }
550 
551         return result;
552     }
553 
554     /***
555      * Checks whether <code>openTags</code> contains a matching start tag
556      * (ignoring the root tag), either within the specified
557      * <code>markupSeriesNo</code> or a tentative tag anywhere.
558      * Returns and remove the found end tag, if it exists.
559      * Returns <code>null</code> otherwise.
560      *
561      * @param tagname the name of the tag to check
562      * @param markupSeriesNo the markup series number to match
563      * @param openTags the container of currently open start tags to check
564      * @return the found and removed matching start tag,
565      * or <code>null</code> if no matching non-root appearance was found
566      */
567     private TagConstituent correspondingOpenTag(final String tagname,
568             final int markupSeriesNo, final OpenTags openTags) {
569         // use last match within specified markup series, if any
570         TagConstituent result =
571             openTags.findInSeries(tagname, markupSeriesNo, false);
572 
573         // otherwise look for tentative tag
574         if (result == null) {
575             result = openTags.findTentativeTag(tagname);
576         }
577 
578         if (result != null) {
579             if (openTags.isRoot(result)) {
580                 // the root tag is not accepted -- return null instead
581                 result = null;
582             } else {
583                 // remove found tag
584                 openTags.forceRemove(result);
585             }
586         }
587         return result;
588     }
589 
590     /***
591      * Tries to fix corrupt XML documents, especially documents containing
592      * nesting errors. Delegates to {@link #adjust(Reader, Writer)}, ignoring
593      * the <code>context</code>.
594      *
595      * @param reader reader containing the text to process; should not be closed
596      * by this method
597      * @param writer the writer to write the processed text to; might be flushed
598      * but not closed by this method
599      * @param context a map of objects that are made available for processing
600      * @throws IOException if an I/O error occurs while using the reader or
601      * writer
602      * @throws ParsingException if the XML input contains an uncorrectable error
603      */
604     protected void doProcess(final Reader reader, final Writer writer,
605             final ContextMap context) throws IOException, ParsingException {
606         adjust(reader, writer);
607     }
608 
609     /***
610      * Helper method that checks whether an end tag is certain to be missing for
611      * a specified open tag. This is the case if (a) the next appearance of
612      * this tag type is NOT an end tag and if (b) there are at least as much
613      * unprocessed start tags than unprocessed end tags for the given type.
614      *
615      * @param openTagName the name of the open tag check
616      * @param unprocessedTags the container of unprocessed start and end tags
617      * @return <code>true</code> iff both conditions stated above are fulfulled
618      */
619     private boolean endTagMissing(final String openTagName,
620             final UnprocessedTags unprocessedTags) {
621         // true if the next tag is not an end tag
622         final boolean result = (correspondingEndTag(openTagName, -1,
623                 unprocessedTags, false) == null)
624         // and if there are at least as many start tags as end tags
625             && (unprocessedTags.startTagCount(openTagName)
626                 >= unprocessedTags.endTagCount(openTagName));
627         return result;
628     }
629 
630     /***
631      * Escapes every of a specified pattern in the representation of a
632      * constituent.
633      *
634      * @param constituent the constituent to check
635      * @param pattern the pattern to replace
636      * @param replacement the replacement text
637      * @param eventType the event to log when replacing was necessary
638      * @throws ParsingException might be thrown by {@link #checkEvent(String)}
639      * implementations in subclasses if an "illicit" event occurred
640      */
641     private void replaceInRep(final XMLConstituent constituent,
642             final Pattern pattern, final String replacement,
643             final String eventType) throws ParsingException {
644         final String oldRep = constituent.getRepresentantion();
645         final String newRep =
646             TextUtils.replaceAll(oldRep, pattern, replacement);
647 
648         if (oldRep != newRep) {
649             // representation changed: log event and update representation
650             logEvent(eventType, "old: '" + oldRep + "', new: '" + newRep + "'");
651             constituent.setRepresentantion(newRep);
652         }
653     }
654 
655     /***
656      * Returns the constituents of an XML-like document after fixing possible
657      * nesting errors etc.
658      *
659      * @param input the XML-like input data
660      * @return a reference to the first contained constituent; the list
661      * of constituents can be traversed by calling
662      * {@link XMLConstituent#nextConstituent()} on each constituent untill
663      * <code>null</code> is returned
664      * @throws ParsingException if the XML input contains an uncorrectable error
665      */
666     public final XMLConstituent fixedConstituents(final CharSequence input)
667             throws ParsingException {
668         // first run: create list of all tags + container of start and end tags;
669         // fix character-level errors
670         final UnprocessedTags unprocessedTags = new UnprocessedTags();
671         XMLConstituent firstConst =
672             rawConstituents(input, true, unprocessedTags);
673 
674         // second run: ensure corrent nesting of start + end tags
675         XMLConstituent currentConst = firstConst;
676         XMLConstituent lastConst = currentConst;
677         TagConstituent currentTag;
678         final OpenTags openTags = new OpenTags();
679 
680         // all root content (tags, non-whitespace text, CDATA sections) must
681         // be enclosed in a root element
682         XMLConstituent firstRootContent = null;
683         boolean insertedMissingRoot = false;
684         boolean skipConst;
685 
686         while (currentConst != null) {
687             skipConst = false;
688 
689             if ((firstRootContent == null) && isRootContent(currentConst)) {
690                 // this is the first root content
691                 firstRootContent = currentConst;
692 
693                 if (currentConst.getType() != TagConstituent.START_TAG) {
694                     // not a start tag: try to insert missing root element
695                     insertMissingRoot(firstRootContent, openTags);
696                     insertedMissingRoot = true;
697                 }
698             } else if (openTags.isEmpty() && (isRootContent(currentConst))) {
699                 // all tags have been closed but this is root content:
700                 if (deletingTrailingGarbage) {
701                     // delete this constituent ("trailing garbage")
702                     logEvent(EVENT_DELETED_TRAILING_GARBAGE,
703                             currentConst.toString());
704                     skipConst = true;
705                     final XMLConstituent constToRemove = currentConst;
706                     currentConst = currentConst.nextConstituent();
707 
708                     if (constToRemove instanceof TagConstituent) {
709                         unprocessedTags.remove((TagConstituent) constToRemove);
710                     }
711                     constToRemove.remove();
712                 } else {
713                     // try to insert missing root element
714                     insertMissingRoot(firstRootContent, openTags);
715                     insertedMissingRoot = true;
716                 }
717             }
718 
719             if (!skipConst) {
720                 if (currentConst instanceof TagConstituent) {
721                     currentTag = (TagConstituent) currentConst;
722 
723                     if (currentTag.getType() == TagConstituent.START_TAG) {
724                         // move from unprocessed tags to open tags
725                         unprocessedTags.forceRemove(currentTag);
726                         openTags.push(currentTag);
727                     } else if (currentTag.getType() == TagConstituent.END_TAG) {
728                         // remove from unprocessed tags and do handling
729                         unprocessedTags.forceRemove(currentTag);
730                         handleEndTag(currentTag, openTags, unprocessedTags);
731                     }
732                 }
733 
734                 lastConst = currentConst; // so we don't loose the very last one
735                 currentConst = currentConst.nextConstituent();
736             }
737         }
738 
739         // handle end-of-file
740         handleEOF(lastConst, openTags, unprocessedTags, insertedMissingRoot);
741 
742         // first constituent might have changed (e.g. due to moves in first tag
743         // series, insertion of root element)
744         while (firstConst.hasPrevious()) {
745             firstConst = firstConst.previousConstituent();
746         }
747         return firstConst;
748     }
749 
750     /***
751      * Helper method that fixes character errors in the representation of a
752      * constituent, if any.
753      *
754      * @param constituent the constituent to check
755      * @throws ParsingException might be thrown by {@link #checkEvent(String)}
756      * implementations in subclasses if an "illicit" event occurred
757      */
758     private void fixRepresentation(final XMLConstituent constituent)
759             throws ParsingException {
760         // delete control characters if configured
761         if (deletingControlChars) {
762             replaceInRep(constituent, CONTROL_CHARS, "",
763                 EVENT_DELETED_CONTROL_CHARS);
764         }
765 
766         if (needsAmpEscape(constituent)) {
767             if (escapingPseudoEntities) {
768                 // escape every '&' unless starting predefined entity or
769                 // character reference
770                 replaceInRep(constituent, PSEUDO_AMP, ESCAPED_AMP,
771                     EVENT_ESCAPED_CHARS);
772             } else {
773                 // escape only illegal '&' (not start of any entity)
774                 replaceInRep(constituent, SPURIOUS_AMP, ESCAPED_AMP,
775                     EVENT_ESCAPED_CHARS);
776             }
777         }
778     }
779 
780     /***
781      * Helper method for handling an end tag. Here the actual work of the
782      * algorithm takes place, because we have to ensure a match.
783      *
784      * @param endTag the end tag to handle
785      * @param openTags must contain all currently open tags
786      * @param unprocessedTags must contain all unprocessed start and end tags
787      * @throws ParsingException might be thrown by {@link #checkEvent(String)}
788      * implementations in subclasses if an "illicit" event occurred
789      */
790     protected void handleEndTag(final TagConstituent endTag,
791             final OpenTags openTags, final UnprocessedTags unprocessedTags)
792             throws ParsingException {
793         TagConstituent lastOpenTag;
794         TagVariety lastOpenTagVariety;
795         String lastOpenTagName;
796         TagConstituent correspondingTag;
797         boolean matchFound = false;
798 
799         // iterate until end tag has been processed by finding a match
800         while (!matchFound) {
801             lastOpenTag = openTags.peek();
802             lastOpenTagVariety = lastOpenTag.getVariety();
803             lastOpenTagName = lastOpenTag.getName();
804             //Log.TIES.debug("Mismatch: " + lastOpenTag + " / " + endTag);
805 
806             if (lastOpenTagName.equals(endTag.getName())) {
807                 // last open tag matched: remove it and ensure it's regular
808                 // (checking next appearance for tentative tags)
809                 if (openTags.popAndRegularize()
810                         && (lastOpenTagVariety == TagVariety.TENTATIVE)) {
811                     checkNextAppearance(endTag, endTag, unprocessedTags);
812                 }
813                 matchFound = true; // done
814             } else if (lastOpenTagVariety == TagVariety.TENTATIVE) {
815                 // last open tag is tentative: insert after the end tag; removed
816                 // it from open tags and re-added to unprocessed tags
817                 moveTentativeTag(lastOpenTag, endTag, openTags,
818                     unprocessedTags);
819                 // now retry
820             } else if (openTags.containsNonTentative(endTag.getName())
821                     && (correspondingTag = correspondingEndTag(
822                     lastOpenTagName, endTag.getMarkupSeriesNo(),
823                     unprocessedTags, true)) != null) {
824                 // A non-tentative start tag for the current end tag exists and
825                 // there is a corresponding end tag for the last open tag in the
826                 // current markup series. Remove the last open tag and make it
827                 // regular if necessary
828                 openTags.popAndRegularize();
829 
830                 // move the corresponding end tag before the current end tag
831                 correspondingTag.remove();
832                 endTag.insertBefore(correspondingTag);
833 
834                 // check next appearance if tentative (if necessary inserting
835                 // new tentative tag after the current end tag)
836                 if (lastOpenTagVariety == TagVariety.TENTATIVE) {
837                     checkNextAppearance(correspondingTag, endTag,
838                         unprocessedTags);
839                 }
840 
841                 // log event and retry
842                 logEvent(EVENT_MOVED_END_TAG_UP, correspondingTag);
843             } else if ((correspondingTag = correspondingOpenTag(
844                     endTag.getName(), lastOpenTag.getMarkupSeriesNo(),
845                     openTags)) != null) {
846                 // Open tags contained a non-root tag corresponding to the end
847                 // tag, either within the markup series of the last open tag or
848                 // a tentative tag anywhere.
849                 // Move the corresponding start after the last open tag
850                 correspondingTag.remove();
851                 lastOpenTag.insertAfter(correspondingTag);
852 
853                 if (correspondingTag.getVariety() == TagVariety.REGULAR) {
854                     // log event
855                     logEvent(EVENT_MOVED_START_TAG_DOWN, correspondingTag);
856                 } else {
857                     if (correspondingTag.getVariety() == TagVariety.TENTATIVE) {
858                         // tentative tag: check whether the next appearance is
859                         // a start tag (ok) or end tag (another start tag
860                         // missing: insert tentative start tag)
861                         checkNextAppearance(endTag, endTag, unprocessedTags);
862                     }
863 
864                     // convert to regular tag
865                     correspondingTag.setVariety(TagVariety.REGULAR);
866                     // event was already logged when creating the irregular tag
867                 }
868                 matchFound = true; // done
869             } else if (!openTags.contains(endTag.getName())) {
870                 // no open tag matching the current end tag -- we have to
871                 // create one
872                 final TagConstituent missingStartTag =
873                     new TagConstituent(TagConstituent.START_TAG,
874                         endTag.getName(), lastOpenTag.getMarkupSeriesNo());
875 
876                 // insert new tag after the last open tag
877                 lastOpenTag.insertAfter(missingStartTag);
878                 logEvent(EVENT_INSERTED_MISSING_START_TAG, missingStartTag);
879 
880                 // Check whether the next appearance is a start tag (ok) or end
881                 // tag (another start tag missing: insert tentative start tag)
882                 checkNextAppearance(endTag, endTag, unprocessedTags);
883                 matchFound = true; // done
884             } else if ((lastOpenTag.getMarkupSeriesNo()
885                     == endTag.getMarkupSeriesNo())
886                     && !endTagMissing(lastOpenTagName, unprocessedTags)) {
887                 // the last open tag is within the current markup series and a
888                 // corresponding end tag exists for it (in any markup series):
889                 // Move it after the current end tag, removing it from open tags
890                 // and from current place in list
891                 openTags.forceRemove(lastOpenTag);
892                 lastOpenTag.remove();
893                 endTag.insertAfter(lastOpenTag);
894 
895                 // append at the start of unprocessed tags of this series
896                 unprocessedTags.push(lastOpenTag, false);
897 
898                 // log event if this is a regular tag
899                 if (lastOpenTagVariety == TagVariety.REGULAR) {
900                     logEvent(EVENT_MOVED_START_TAG_DOWN, lastOpenTag);
901                 }
902                 // now retry
903             } else {
904                 // no way out: we have to "get rid of" the last open tag
905                 final boolean endTagMissing =
906                     endTagMissing(lastOpenTagName, unprocessedTags);
907 
908                 if (endTagMissing && isAnEmptiableTag(lastOpenTagName)) {
909                     // convert last open tag to empty tag
910                     final TagConstituent newEmptyTag =
911                         replaceWithEmptyCopy(lastOpenTag);
912                     logEvent(EVENT_CONVERTED_TO_EMPTY_TAG, newEmptyTag);
913                 } else {
914                     // create and insert suitable end tag and insert it before
915                     // the current end tag
916                     final TagConstituent matchingEndTag = new TagConstituent(
917                         TagConstituent.END_TAG, lastOpenTagName,
918                         endTag.getMarkupSeriesNo());
919                     endTag.insertBefore(matchingEndTag);
920 
921                     if (endTagMissing) {
922                         // end tag of lastOpenTag type is missing: created it.
923                         // Log event for regular tags
924                         if (lastOpenTagVariety == TagVariety.REGULAR) {
925                             logEvent(EVENT_INSERTED_MISSING_END_TAG,
926                                 matchingEndTag);
927                         }
928                     } else {
929                         // sufficient end tags available:
930                         // split lastOpenTag by creating continuation copy
931                         final TagConstituent continuation = new TagConstituent(
932                             TagConstituent.START_TAG, lastOpenTagName,
933                             lastOpenTag.getRepresentantion(),
934                             endTag.getMarkupSeriesNo());
935                         continuation.setVariety(TagVariety.CONTINUATION);
936 
937                         // insert copy after current end tag and log event
938                         endTag.insertAfter(continuation);
939                         unprocessedTags.push(continuation, false);
940                         logEvent(EVENT_SPLIT_TAG, lastOpenTag);
941                     }
942 
943                     // Remove from open tags and ensure it's regular; retry
944                     openTags.popAndRegularize();
945                 }
946             }
947         }
948     }
949 
950     /***
951      * Helper method for handling an the end of a file. Suitable end tags are
952      * created to close any left-over open tags.
953      *
954      * @param lastConst the last constituent in the original input (must not
955      * have a successor)
956      * @param openTags must contain all currently open tags
957      * @param unprocessedTags must contain all unprocessed start and end tags;
958      * should better be empty
959      * @param insertedMissingRoot whether the start tag of a missing root
960      * element was created (in this case we'll insert the corresponding end tag
961      * without logging another event)
962      * @throws ParsingException might be thrown by {@link #checkEvent(String)}
963      * implementations in subclasses if an "illicit" event occurred
964      */
965     protected void handleEOF(final XMLConstituent lastConst,
966             final OpenTags openTags, final UnprocessedTags unprocessedTags,
967             final boolean insertedMissingRoot) throws ParsingException {
968         // check state
969         if (lastConst.nextConstituent() != null) {
970             Util.LOG.error("Implementation error: constituents after last "
971                 + "constituent: " + lastConst);
972         }
973         if (!unprocessedTags.isEmpty()) {
974             Util.LOG.error("Implementation error: still unprocessed tags at "
975                 + "end-of-file -- last constituent: " + lastConst
976                 + " unprocessed tags: " + unprocessedTags);
977         }
978 
979         XMLConstituent currentConst = lastConst;
980         TagConstituent lastOpenTag = openTags.peek();
981 
982         if (lastOpenTag != null) {
983             // there are unclosed elements -- look for last root content
984             while (!isRootContent(currentConst)) {
985                 currentConst = currentConst.previousConstituent();
986             }
987 
988             // insert missing end tags after last root content
989             TagConstituent missingEndTag;
990             while (lastOpenTag != null) {
991                 // create suitable end tag for each unclosed open tag
992                 missingEndTag = new TagConstituent(TagConstituent.END_TAG,
993                     lastOpenTag.getName());
994 
995                 // insert after last root content
996                 currentConst.insertAfter(missingEndTag);
997                 currentConst = missingEndTag;
998                 // remove handled tag and peek the previous one
999                 openTags.pop();
1000                 lastOpenTag = openTags.peek();
1001 
1002                 // log an event unless this is the end of a missing root
1003                 if (!(insertedMissingRoot && openTags.isEmpty())) {
1004                     logEvent(EVENT_INSERTED_MISSING_END_TAG, missingEndTag);
1005                 }
1006             }
1007         }
1008     }
1009 
1010     /***
1011      * Logs the occurance of an event necessary for fixing a document.
1012      * This method variant is used for character errors.
1013      *
1014      * @param eventType the event that occurred; should be one of the
1015      * EVENT constants defined in this class.
1016      * @param details a detailed description of the event
1017      * @throws ParsingException might be thrown by {@link #checkEvent(String)}
1018      * implementations in subclasses if the event is considered illicit
1019      */
1020     protected void logEvent(final String eventType, final String details)
1021             throws ParsingException {
1022         checkEvent(eventType);
1023         Util.LOG.debug("Modified document: " + eventType
1024                 + " (" + details + ')');
1025     }
1026 
1027     /***
1028      * Logs the occurance of an event necessary for fixing a document.
1029      * This method variant is used for nesting errors and missing root elements.
1030      *
1031      * @param eventType the event that occurred; should be one of the
1032      * EVENT constants defined in this class.
1033      * @param tag the involved tag
1034      * @throws ParsingException might be thrown by {@link #checkEvent(String)}
1035      * implementations in subclasses if the event is considered illicit
1036      */
1037     protected void logEvent(final String eventType, final TagConstituent tag)
1038             throws ParsingException {
1039         checkEvent(eventType);
1040         Util.LOG.debug("Modified document: " + eventType
1041                 + " (" + tag.toString() + ')');
1042     }
1043 
1044     /***
1045      * Inserts a root element with the configured name
1046      * ({@link #missingRootName}); or throws a parsing exception to finish
1047      * processing if inserting a root element is not allowed. This method
1048      * only creates and inserts a start tag of the configured type; the
1049      * matching end tag will automatically be added by
1050      * {@link #handleEOF(XMLConstituent, OpenTags, UnprocessedTags, boolean)}.
1051      *
1052      * @param firstRootContent the root start will be inserted before this
1053      * content
1054      * @param openTags the container of open tags to push the root tag to,
1055      * assumed to be empty
1056      * @throws ParsingException if inserting a missing root tag is now allowed
1057      * (no name for a missing root tag specified)
1058      */
1059     private void insertMissingRoot(final XMLConstituent firstRootContent,
1060             final OpenTags openTags) throws ParsingException {
1061         if (missingRootName != null) {
1062             // create start tag
1063             final TagConstituent rootStartTag = new TagConstituent(
1064                     TagConstituent.START_TAG, missingRootName, 0);
1065 
1066             // insert before first root content
1067             firstRootContent.insertBefore(rootStartTag);
1068 
1069             // push to open tags and log event
1070             openTags.push(rootStartTag);
1071             logEvent(EVENT_INSERTED_MISSING_ROOT_ELEMENT, rootStartTag);
1072         } else {
1073             throw new ParsingException("Root tag missing (resp. tags or "
1074                     + "textual content outside root element)");
1075         }
1076     }
1077 
1078     /***
1079      * Whether the specified tag is one of the tags that can be converted an
1080      * empty tags when required for fixing a document. For example, "br" when
1081      * <code>&lt;br&gt;</code> may be converted to <code>&lt;br/&gt;</code>
1082      * during repair.
1083      *
1084      * @param tag the name of the tag to look up
1085      * @return <code>true</code> iff this tag is contained in the set of
1086      * emptiable tags
1087      */
1088     protected boolean isAnEmptiableTag(final String tag) {
1089         return (emptiableTags != null) && emptiableTags.contains(tag);
1090     }
1091 
1092     /***
1093      * Whether {@link #CONTROL_CHARS control characters} are deleted (these
1094      * characters are not allowed in XML 1.0 and discouraged in XML 1.1).
1095      * @return <code>true</code> iff control characters are deleted
1096      */
1097     public boolean isDeletingControlChars() {
1098         return deletingControlChars;
1099     }
1100 
1101     /***
1102      * Whether "pseudo-tags" are deleted, i.e., sequences that cannot be parsed
1103      * as tags but look similar to them. "Pseudo-tags" start with '&lt;' and end
1104      * with '&gt;', contain a printable character after the initial '&lt;', and
1105      * do not contain any inner '&lt;' or '&gt;'). Disabled by default.
1106      *
1107      * @return <code>true</code> iff pseudo-tags are be deleted (otherwise the
1108      * starting '&lt;' is escaped)
1109      */
1110     public boolean isDeletingPseudoTags() {
1111         return deletingPseudoTags;
1112     }
1113 
1114     /***
1115      * Whether to escape "&amp;" starting a possible nonstandard entity
1116      * reference ("&amp;" at the start of one of the 5 predefined entity
1117      * references or a character reference is never escaped, all other "&amp;"
1118      * are always escaped).
1119      *
1120      * @return <code>true</code> iff "pseudo entites" are escaped
1121      */
1122     public boolean isEscapingPseudoEntities() {
1123         return escapingPseudoEntities;
1124     }
1125 
1126     /***
1127      * Checks whether a constituent is content that must be enclosed in the
1128      * root element. Tags, textual content, and CDATA sections are root
1129      * content; other constituent types can occur outside of the root element.
1130      *
1131      * @param constituent the constituent to check
1132      * @return <code>true</code> iff the constituent is root content
1133      */
1134     private boolean isRootContent(final XMLConstituent constituent) {
1135         final boolean result;
1136         if (constituent instanceof TagConstituent) {
1137             // all tags are root content
1138             result = true;
1139         } else {
1140             if ((constituent.getType() == OtherConstituent.CDATA_SECTION)
1141                     || (constituent.getType() == OtherConstituent.TEXT)) {
1142                 // text and CDATA sections are root content
1143                 result = true;
1144             } else {
1145                 result = false;
1146             }
1147         }
1148         return result;
1149     }
1150 
1151     /***
1152      * Helper method that moves a tentative tag after a specified end tag.
1153      * The tentative tag must originally be contained in the container of open
1154      * tags; it is popped from this container and re-added to the container of
1155      * unprocessed tags.
1156      *
1157      * @param tentativeTag the tag to move, should be a tentative tag
1158      * @param endTag the tentative tag is moved after this tag
1159      * @param openTags must contain all currently open tags, including the tag
1160      * to move
1161      * @param unprocessedTags must contain all unprocessed start and end tags
1162      */
1163     private void moveTentativeTag(final TagConstituent tentativeTag,
1164             final TagConstituent endTag, final OpenTags openTags,
1165             final UnprocessedTags unprocessedTags) {
1166         // remove from open tags and from current place in list
1167         openTags.forceRemove(tentativeTag);
1168         tentativeTag.remove();
1169 
1170         // adjust series no. and re-add after current tag
1171         tentativeTag.setMarkupSeriesNo(endTag.getMarkupSeriesNo());
1172         endTag.insertAfter(tentativeTag);
1173 
1174         // prepend at the begin of unprocessed tags of this series
1175         unprocessedTags.push(tentativeTag, false);
1176     }
1177 
1178     /***
1179      * Checks whether escapes for the "&amp;" character are required in the
1180      * representation of a constituent.
1181      *
1182      * @param constituent the constituent to check
1183      * @return <code>true</code> if the {@linkplain XMLConstituent#getType()
1184      * type} of the constituent is {@link TagConstituent#START_TAG},
1185      * {@link TagConstituent#EMPTY_TAG}, or {@link OtherConstituent#TEXT};
1186      * <code>false</code> for all other types (which either cannot contain any
1187      * "&amp;" or do not need to escape "&amp;")
1188      */
1189     private boolean needsAmpEscape(final XMLConstituent constituent) {
1190         final short constType = constituent.getType();
1191         return (constType == TagConstituent.START_TAG)
1192             || (constType == TagConstituent.EMPTY_TAG)
1193             || (constType == OtherConstituent.TEXT);
1194     }
1195 
1196     /***
1197      * Returns the raw constituents of an XML-like document. The constituents
1198      * are returned "raw" as they occur in the input, without fixing possible
1199      * nesting errors etc.
1200      *
1201      * @param input the XML-like input data
1202      * @param fixCharacterErrors whether to try to fix character errors, i.e.
1203      * unescaped "&lt;" and "&amp;" and tags with unquoted attribute values; if
1204      * <code>false</code>, unescaped "&lt;" in textual content and unquoted
1205      * attribute values will yield an exception, while any unescaped "&amp"
1206      * and unescaped "&lt;" in attribute values will be ignored
1207      * @return a reference to the first contained constituent; the list
1208      * of constituents can be traversed by calling
1209      * {@link XMLConstituent#nextConstituent()} on each constituent untill
1210      * <code>null</code> is returned
1211      * @throws ParsingException if the XML input contains an uncorrectable error
1212      */
1213     public final XMLConstituent rawConstituents(final CharSequence input,
1214             final boolean fixCharacterErrors) throws ParsingException {
1215         return rawConstituents(input, fixCharacterErrors, null);
1216     }
1217 
1218     /***
1219      * Returns the raw constituents of an XML-like document. The constituents
1220      * are returned "raw" as they occur in the input, without fixing possible
1221      * nesting errors etc.
1222      *
1223      * @param input the XML-like input data
1224      * @param fixCharacterErrors whether to try to fix character errors, i.e.
1225      * unescaped "&lt;" and "&amp;" and tags with unquoted attribute values; if
1226      * <code>false</code>, unescaped "&lt;" in textual content and unquoted
1227      * attribute values will yield an exception, while any unescaped "&amp"
1228      * and unescaped "&lt;" in attribute values will be ignored
1229      * @param startAndEndTags all start and end tags are added to this
1230      * container, if it isn't <code>null</code>
1231      * @return a reference to the first contained constituent; the list
1232      * of constituents can be traversed by calling
1233      * {@link XMLConstituent#nextConstituent()} on each constituent untill
1234      * <code>null</code> is returned
1235      * @throws ParsingException if the XML input contains an uncorrectable error
1236      */
1237     protected final XMLConstituent rawConstituents(final CharSequence input,
1238             final boolean fixCharacterErrors,
1239             final UnprocessedTags startAndEndTags) throws ParsingException {
1240         // tokenizer for XML-like input -- if fixCharacterErrors we'll ensure
1241         // whitespace validity outselves, otherwise let the tokenizer validate
1242         final TextTokenizer tokenizer =
1243             XMLTokenizerFactory.createXMLTokenizer(input, !fixCharacterErrors);
1244         String tokenText;
1245         String precWS;
1246         String captured = null;
1247         boolean done = false;
1248         XMLConstituent firstConst = null;
1249         XMLConstituent priorConst = null;
1250         XMLConstituent currentConst;
1251         // whether we're currently in a markup series (any kind of tags and
1252         // declarations) or outside (textual content or CDATA sections)
1253         boolean inMarkupSeries = true;
1254         // markup series are sequentially numbered, starting at 0
1255         int markupSeriesNo = 0;
1256 
1257         // variables for checking and fixing character errors
1258         boolean fixedCharError;
1259         XMLConstituent extraConst = null;
1260         String extraWS = null;
1261 
1262         try {
1263             while (!done) {
1264                 tokenText = tokenizer.nextToken();
1265                 if (tokenText == null) {
1266                     done = true;    // reached end of input
1267                 } else {
1268                     // create suitable type
1269                     captured = tokenizer.capturedText();
1270                 }
1271 
1272                 if (tokenizer.hasPrecedingWhitespace()) {
1273                     precWS = tokenizer.precedingWhitespace();
1274                     if (fixCharacterErrors
1275                             && (!tokenizer.precedingWhitespaceIsValid())) {
1276                         // "whitespace" is invalid: try to fix it
1277                         fixedCharError = false;
1278                         if (precWS.endsWith("<") && (captured.length() == 0)) {
1279                             // trailing '<' and text token is TEXT:
1280                             final String remainingWS = precWS.substring(0,
1281                                 precWS.length() - 1);
1282                             final int gtPos = tokenText.indexOf('>');
1283 
1284                             if (tokenizer.isValidWhitespace(remainingWS)
1285                                     && gtPos > 0) {
1286                                 // trailing '<' is only problem and
1287                                 // text contains '>' after at least one char
1288                                 final Object[] fixed
1289                                     = tryToFixTag('<' + tokenText);
1290                                 if (fixed != null) {
1291                                     // store tag + set token to unused rest
1292                                     extraConst = (TagConstituent) fixed[0];
1293                                     tokenText = (String) fixed[1];
1294                                     fixedCharError = true;
1295                                     precWS = remainingWS;
1296                                 } else if (deletingPseudoTags) {
1297                                     // delete "pseudo-tags" + set token to rest
1298                                     logEvent(EVENT_DELETED_PSEUDO_TAG, '<'
1299                                         + tokenText.substring(0, gtPos + 1));
1300                                     tokenText = tokenText.substring(gtPos + 1);
1301                                     fixedCharError = true;
1302                                     precWS = remainingWS;
1303                                 }
1304                             }
1305                         }
1306 
1307                         if (fixedCharError) {
1308                             // fixed error: check if next token (TEXT)
1309                             // now starts with whitespace
1310                             final int initWS =
1311                                 tokenizer.initialWhitespaceCount(tokenText);
1312                             if (initWS > 0) {
1313                                 if (extraConst != null) {
1314                                     // store after extra constituent
1315                                     extraWS = tokenText.substring(0, initWS);
1316                                 } else {
1317                                     // append to current whitespace
1318                                     precWS += tokenText.substring(0, initWS);
1319                                 }
1320                                 tokenText = tokenText.substring(initWS);
1321                             }
1322                         } else {
1323                             // escape illegal characters
1324                             String escape = StringEscapeUtils.escapeXml(precWS);
1325                             final int initWS =
1326                                 tokenizer.initialWhitespaceCount(escape);
1327 
1328                             // separate initial whitespace + log event
1329                             if (initWS > 0) {
1330                                 precWS = escape.substring(0, initWS);
1331                                 escape = escape.substring(initWS);
1332                             } else {
1333                                 precWS = "";
1334                             }
1335                             logEvent(EVENT_ESCAPED_CHARS, escape);
1336 
1337                             if (captured.length() == 0) {
1338                                 // next token is TEXT: just prepend
1339                                 tokenText = escape + tokenText;
1340                             } else {
1341                                 final int trailingWSChars =
1342                                     tokenizer.trailingWhitespaceCount(escape);
1343                                 if (trailingWSChars > 0) {
1344                                     // store trailing whitespace separately
1345                                     extraWS = escape.substring(
1346                                         escape.length() - trailingWSChars);
1347                                     escape = escape.substring(0,
1348                                         escape.length() - trailingWSChars);
1349                                 }
1350                                 // create text constituent
1351                                 extraConst = new OtherConstituent(
1352                                     OtherConstituent.TEXT, escape);
1353                             }
1354                         }
1355                     }
1356 
1357                     // create whitespace constituent (if whitespace remains
1358                     // after fixing possible character errors)
1359                     if (precWS.length() > 0) {
1360                         currentConst = new OtherConstituent(
1361                             OtherConstituent.OUTER_WHITESPACE, precWS);
1362                         // add to list
1363                         if (priorConst == null) {
1364                             firstConst = currentConst;
1365                         } else {
1366                             priorConst.insertAfter(currentConst);
1367                         }
1368                         priorConst = currentConst;
1369                     }
1370 
1371                     if (extraConst != null) {
1372                         // handle extra constituent from fixed character error
1373                         if ((extraConst.getType() == OtherConstituent.TEXT)
1374                                 || (extraConst.getType()
1375                                     == OtherConstituent.CDATA_SECTION)) {
1376                             inMarkupSeries = false;   // non-markup
1377                         } else {    // markup
1378                             if (!inMarkupSeries) {    // start new series
1379                                 inMarkupSeries = true;
1380                                 markupSeriesNo++;
1381                             }
1382                             if (extraConst instanceof TagConstituent) {
1383                                 // store markup series in tag constituents
1384                                 ((TagConstituent) extraConst)
1385                                     .setMarkupSeriesNo(markupSeriesNo);
1386 
1387                                 // add to unprocessed tags if start or end tag
1388                                 if ((startAndEndTags != null)
1389                                         && (extraConst.getType()
1390                                                 != TagConstituent.EMPTY_TAG)) {
1391                                     startAndEndTags.push(
1392                                             (TagConstituent) extraConst);
1393                                 }
1394                             }
1395                         }
1396                         if (fixCharacterErrors) {
1397                             // fix illegal characters in representation
1398                             fixRepresentation(extraConst);
1399                         }
1400                         // add extra constituent to list
1401                         if (priorConst == null) {
1402                             firstConst = extraConst;
1403                         } else {
1404                             priorConst.insertAfter(extraConst);
1405                         }
1406                         priorConst = extraConst;
1407                         extraConst = null;  // reset variable
1408 
1409                         if (extraWS != null) {
1410                             // add extra whitespace to list
1411                             currentConst = new OtherConstituent(
1412                                 OtherConstituent.OUTER_WHITESPACE,
1413                                 extraWS);
1414                             priorConst.insertAfter(currentConst);
1415                             priorConst = currentConst;
1416                             extraWS = null; // reset variable
1417                         }
1418                     }
1419                 }
1420 
1421                 // tokenText lenght might be 0 if we fixed an character error
1422                 if ((!done) && (tokenText.length() > 0)) {
1423                     // create suitable type
1424                     if ((captured.length() == 0)
1425                             || (captured.equals("[CDATA"))) {
1426                         // textual content (character data) or CDATA section
1427                         // now we're outside a markup series
1428                         if (inMarkupSeries) {
1429                             inMarkupSeries = false;
1430                         }
1431                         if (captured.length() == 0) {
1432                             currentConst = new OtherConstituent(
1433                                 OtherConstituent.TEXT, tokenText);
1434                         } else {
1435                             currentConst = new OtherConstituent(
1436                                 OtherConstituent.CDATA_SECTION, tokenText);
1437                         }
1438                     } else {
1439                         // now we're inside a markup series
1440                         if (!inMarkupSeries) {
1441                             inMarkupSeries = true;
1442                             markupSeriesNo++;
1443                         }
1444                         currentConst = createMarkupConstituent(tokenText,
1445                             captured, markupSeriesNo, startAndEndTags);
1446                     }
1447                     if (fixCharacterErrors) {
1448                         // fix illegal characters in representation
1449                         fixRepresentation(currentConst);
1450                     }
1451                     // add to list
1452                     if (priorConst == null) {
1453                         firstConst = currentConst;
1454                     } else {
1455                         priorConst.insertAfter(currentConst);
1456                     }
1457                     priorConst = currentConst;
1458                 }
1459             }
1460         } catch (IllegalArgumentException iae) {
1461             // convert unchecked into checked exception
1462             throw new ParsingException(
1463                 "Uncorrectable error in XML input: " + iae.getMessage());
1464         }
1465         return firstConst;
1466     }
1467 
1468     /***
1469      * Replaces a start tag with an copy that is an empty tag. This means,
1470      * the representation of the created copy ends in "/&gt;" instead of "&gt;"
1471      * and its type is {@link TagConstituent#EMPTY_TAG} instead of
1472      * {@link TagConstituent#START_TAG}.
1473      *
1474      * <p>If the original tag is part of a list, it is removed and the copy
1475      * is inserted instead.
1476      *
1477      * @param startTag the start tag to replace
1478      * @return the created copy
1479      * @throws IllegalArgumentException if the specified tag is not a valid
1480      * start tag
1481      */
1482     private TagConstituent replaceWithEmptyCopy(final TagConstituent startTag)
1483             throws IllegalArgumentException {
1484         if (startTag.getType() != TagConstituent.START_TAG) {
1485             throw new IllegalArgumentException(
1486                 "Tag to replace must be a start tag (actual type: "
1487                     + startTag.getType() + ')');
1488         }
1489         final StringBuilder representation =
1490             new StringBuilder(startTag.getRepresentantion());
1491         final int endMarker = representation.lastIndexOf(">");
1492         if (endMarker < 0) {
1493             throw new IllegalArgumentException(
1494                 "Start tag representation is invalid: '>' missing!");
1495         }
1496 
1497         // insert '/' before closing '>'
1498         representation.insert(endMarker, '/');
1499         final TagConstituent result =
1500             new TagConstituent(TagConstituent.EMPTY_TAG, startTag.getName(),
1501                 representation.toString(), startTag.getMarkupSeriesNo());
1502 
1503         // remove original tag from list (if any) and insert new one instead
1504         final XMLConstituent prevConst = startTag.previousConstituent();
1505         final XMLConstituent nextConst = startTag.nextConstituent();
1506         startTag.remove();
1507 
1508         if (prevConst != null) {
1509             prevConst.insertAfter(result);
1510         } else if (nextConst != null) {
1511             // this is the first tag in a list
1512             result.insertAfter(nextConst);
1513         }
1514 
1515         // copy will always be of the "regular" variety--that's ok, after
1516         // processing start tag would be "regularized" too
1517         return result;
1518     }
1519 
1520     /***
1521      * Returns a string representation of this object.
1522      *
1523      * @return a textual representation
1524      */
1525     public String toString() {
1526         return new ToStringBuilder(this)
1527             .append("missing root name", missingRootName)
1528             .append("emptiable tags", emptiableTags)
1529             .append("delete control characters", deletingControlChars)
1530             .append("delete pseudo-tags", deletingPseudoTags)
1531             .append("escape pseudo-entities", escapingPseudoEntities)
1532             .toString();
1533     }
1534 
1535     /***
1536      * Helper method that tries to parse a string as an XML start or empty tag
1537      * that contains unquoted attribute values (might be followed by other
1538      * text). If this is the case, the unquoted values are fixed and a suitable
1539      * tag constituent is created.
1540      *
1541      * <p>Only call this method when you're sure that the input does not contain
1542      * a <em>valid</em> tag, i.e. if there must be at least one unquoted value.
1543      * This method doesn't store a markup series in the tag.
1544      *
1545      * @param text the text to parse
1546      * @return an array of two elements: the created {@link TagConstituent}
1547      * and a String containing the unused rest of the input text (might be empty
1548      * but not <code>null</code>; or <code>null</code> if the text couldn't be
1549      * parsed as a text.
1550      * @throws ParsingException might be thrown by {@link #checkEvent(String)}
1551      * implementations in subclasses if an "illicit" event occurred
1552      */
1553     private Object[] tryToFixTag(final String text) throws ParsingException {
1554         final Object[] result;
1555         final Matcher laxMatcher = LAX_START_OR_EMPTY_TAG.matcher(text);
1556 
1557         if (laxMatcher.lookingAt()) {
1558             // it's a start or empty tag with unquoted
1559             // attribute values -- fix them
1560             final String tagName = laxMatcher.group(1);
1561             final short tagType;
1562 
1563             result = new Object[2];
1564             // split tag + store rest (if any) to be returned
1565             result[1] = text.substring(laxMatcher.end());
1566             String oldTagRep = laxMatcher.group();
1567 
1568             // determine tag type
1569             final String lastGroup = laxMatcher.group(laxMatcher.groupCount());
1570 
1571             if (lastGroup == null) {
1572                 tagType = TagConstituent.START_TAG;
1573             } else if ("/".equals(lastGroup)) {
1574                 tagType = TagConstituent.EMPTY_TAG;
1575             } else {
1576                 // not supposed to happen
1577                 throw new RuntimeException("Implementation error: last group of"
1578                         + " lax tag '" + laxMatcher.group() + "' is '"
1579                         + lastGroup + "' instead of '/' or null");
1580             }
1581 
1582             StringBuilder newTagRep = null;
1583             String unquotedValue;
1584 
1585             // is there still an unquoted attrib (whole value will be captured)?
1586             while (laxMatcher.groupCount() > 2
1587                     && (laxMatcher.group(3) != null)) {
1588                 // create representation, starting with text until unquoted val.
1589                 // (the 2nd groups contain '=' so we have to check the 3rd one)
1590                 newTagRep = new StringBuilder(oldTagRep.substring(0,
1591                         laxMatcher.start(3)));
1592                 unquotedValue = laxMatcher.group(3);
1593 
1594                 // remove starting or ending quote
1595                 if (unquotedValue.startsWith("\"")
1596                         || unquotedValue.startsWith("'")) {
1597                     unquotedValue = unquotedValue.substring(1);
1598                 }
1599                 if (unquotedValue.endsWith("\"")
1600                         || unquotedValue.endsWith("'")) {
1601                     unquotedValue = unquotedValue.substring(0,
1602                         unquotedValue.length() - 1);
1603                 }
1604 
1605                 // escape any embedded quotes
1606                 unquotedValue = TextUtils.replaceAll(unquotedValue,
1607                     Pattern.compile("\""), "&quot;");
1608 
1609                 // append the value within quotes
1610                 newTagRep.append('"');
1611                 newTagRep.append(unquotedValue);
1612                 newTagRep.append('"');
1613 
1614                 // append rest of tag
1615                 newTagRep.append(oldTagRep.substring(laxMatcher.end(3),
1616                     laxMatcher.end()));
1617 
1618                 // match again to check if there are more unquoted attribs
1619                 laxMatcher.reset(newTagRep);
1620                 if (!laxMatcher.matches()) {
1621                     // not supposed to happen
1622                     throw new RuntimeException("Implementation error while "
1623                             + "trying to fix unquoted attribute values: '"
1624                             + newTagRep + "' is no longer parsable as a tag");
1625                 }
1626                 oldTagRep = newTagRep.toString();
1627             }
1628 
1629             // create and store TagConstituent + log event
1630             result[0] = new TagConstituent(tagType, tagName,
1631                     newTagRep.toString());
1632             logEvent(EVENT_QUOTED_ATTRIBUTE_VALUES, newTagRep.toString());
1633         } else {
1634             result = null;
1635         }
1636         return result;
1637     }
1638 
1639 }