View Javadoc

1   /*
2    * Copyright (C) 2003-2004 Christian Siefkes <christian@siefkes.net>.
3    * Development of this software is supported by the German Research Society,
4    * Berlin-Brandenburg Graduate School in Distributed Information Systems
5    * (DFG grant no. GRK 316).
6    *
7    * This library is free software; you can redistribute it and/or
8    * modify it under the terms of the GNU Lesser General Public
9    * License as published by the Free Software Foundation; either
10   * version 2.1 of the License, or (at your option) any later version.
11   *
12   * This library is distributed in the hope that it will be useful,
13   * but WITHOUT ANY WARRANTY; without even the implied warranty of
14   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15   * Lesser General Public License for more details.
16   *
17   * You should have received a copy of the GNU Lesser General Public
18   * License along with this library; if not, visit
19   * http://www.gnu.org/licenses/lgpl.html or write to the Free Software
20   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
21   */
22  package de.fu_berlin.ties.xml;
23  
24  import java.io.IOException;
25  import java.io.Reader;
26  import java.io.Writer;
27  import java.util.Set;
28  import java.util.regex.Matcher;
29  import java.util.regex.Pattern;
30  
31  import org.apache.commons.lang.StringEscapeUtils;
32  import org.apache.commons.lang.builder.ToStringBuilder;
33  
34  import de.fu_berlin.ties.ContextMap;
35  import de.fu_berlin.ties.ParsingException;
36  import de.fu_berlin.ties.TextProcessor;
37  import de.fu_berlin.ties.TiesConfiguration;
38  
39  import de.fu_berlin.ties.io.IOUtils;
40  import de.fu_berlin.ties.text.TextTokenizer;
41  import de.fu_berlin.ties.text.TextUtils;
42  import de.fu_berlin.ties.util.CollectionUtils;
43  import de.fu_berlin.ties.util.Util;
44  
45  /***
46   * This class tries to fix corrupt XML documents, especially documents
47   * containing nesting errors. Instances of this class are thread-safe and
48   * can fix several documents in parallel.
49   *
50   * @author Christian Siefkes
51   * @version $Revision: 1.14 $, $Date: 2004/12/06 18:00:05 $, $Author: siefkes $
52   */
53  public class XMLAdjuster extends TextProcessor {
54  
55      /***
56       * Configuration key: the name to use for the root element if missing.
57       */
58      public static final String CONFIG_MISSING_ROOT = "adjust.missing-root";
59  
60      /***
61       * Configuration key: set of names of tags that can be converted to
62       * empty tags when required.
63       */
64      public static final String CONFIG_EMPTIABLE_TAGS = "adjust.emptiable-tags";
65  
66      /***
67       * Configuration key: whether to delete
68       * {@link #CONTROL_CHARS control characters} (which are not allowed in
69       * XML 1.0 and discouraged in XML 1.1).
70       */
71      public static final String CONFIG_DELETE_CONTROL_CHARS =
72          "adjust.delete.control-chars";
73  
74      /***
75       * Configuration key: whether to delete "pseudo-tags".
76       */
77      public static final String CONFIG_DELETE_PSEUDO_TAGS =
78          "adjust.delete.pseudo-tags";
79  
80      /***
81       * Configuration key: whether to escape "&amp;" starting a possible
82       * nonstandard entity reference ("&amp;" at the start of one of the 5
83       * predefined entity references or a character reference is never escaped,
84       * all other "&amp;" are always escaped).
85       */
86      public static final String CONFIG_ESCAPE_PSEUDO_ENTITIES =
87          "adjust.escape.pseudo-entities";
88  
89      /***
90       * Pattern string specifying characters that can occur at the start or end
91       * of an unquoted attribute value: everything except '&lt;', '&gt;', '='
92       * and whitespace (whitespace is also allowed, but only in the middle of a
93       * value). Evaluated lazily (reluctant) to avoid missing the "/&gt;" at
94       * the end of an empty tag.
95       */
96      public static final String UNQUOTED_ATTRIB_CHARS =
97          "[^<>=" + XMLTokenizerFactory.XML_WHITESPACE_CHARS + "]+?";
98  
99      /***
100      * Pattern string specifying an XML attribute without proper quotes.
101      * Equal sign and value are captured in groups; the value is matched lazily
102      * (reluctant) so we won't miss the start of the next attribute.
103      */
104     public static final String UNQUOTED_ATTRIBUTE = XMLTokenizerFactory.XML_NAME
105         + XMLTokenizerFactory.XML_OPT_WHITESPACE + "(=)"
106         + XMLTokenizerFactory.XML_OPT_WHITESPACE + "((?:"
107         + UNQUOTED_ATTRIB_CHARS + XMLTokenizerFactory.XML_WHITESPACE + ")*?"
108         + UNQUOTED_ATTRIB_CHARS + ")";
109 
110     /***
111      * Pattern specifying a "lax" XML start or empty tag that can contain
112      * unquoted (invalid) attributes (combined into a single pattern to avoid
113      * unnecessary backtracking). Element name, equal sign and value of
114      * the (last) unquoted attribute, and '/' (for empty tags) are captured.
115      */
116     public static final Pattern LAX_START_OR_EMPTY_TAG = Pattern.compile(
117         "<(" + XMLTokenizerFactory.XML_NAME + ")" + "(?:"
118         + XMLTokenizerFactory.XML_WHITESPACE + "(?:"
119         + XMLTokenizerFactory.XML_ATTRIBUTE + "|" + UNQUOTED_ATTRIBUTE + "))*"
120         + XMLTokenizerFactory.XML_OPT_WHITESPACE + "(/)?>");
121 
122     /***
123      * A "&amp;" that is not the start of a predefined entity reference or
124      * a character reference and thus should be escaped if
125      * {@link #isEscapingPseudoEntities()} is <code>true</code>.
126      * (A pattern matching the rest of a predefined entity or character
127      * reference is included via negative lookahead.)
128      */
129     public static final Pattern PSEUDO_AMP = Pattern.compile(
130         "&(?!(?:amp|lt|gt|apos|quot|#[0-9]+|#x[0-9a-fA-F]+);)");
131 
132     /***
133      * A "&amp;" that is not the start of an entity and thus must be escaped.
134      * (A pattern matching the rest of a legal entity or character reference
135      * is included via negative lookahead.)
136      */
137     public static final Pattern SPURIOUS_AMP = Pattern.compile("&(?!(?:"
138         + XMLTokenizerFactory.XML_NAME + "|#[0-9]+|#x[0-9a-fA-F]+);)");
139 
140     /***
141      * Escape sequence for the "&amp;" character.
142      */
143     public static final String ESCAPED_AMP = "&amp;";
144 
145     /***
146      * Pattern specifying sequences of control characters (character codes
147      * below the space character, except tab, line feed and carriage return).
148      */
149     public static final Pattern CONTROL_CHARS =
150         Pattern.compile("[\u0001-\u0008\u000B-\u000C\u000E-\u001F]+");
151 
152     /***
153      * Event constant: Converted to empty tag.
154      */
155     protected static final String EVENT_CONVERTED_TO_EMPTY_TAG =
156         "Converted to empty tag";
157 
158     /***
159      * Event constant: Inserted missing end tag.
160      */
161     protected static final String EVENT_INSERTED_MISSING_END_TAG  =
162         "Inserted missing end tag";
163 
164     /***
165      * Event constant: Inserted missing root element.
166      */
167     protected static final String EVENT_INSERTED_MISSING_ROOT_ELEMENT  =
168         "Inserted missing root element";
169 
170     /***
171      * Event constant: Inserted missing start tag.
172      */
173     protected static final String EVENT_INSERTED_MISSING_START_TAG  =
174         "Inserted missing start tag";
175 
176     /***
177      * Event constant: Moved end tag up.
178      */
179     protected static final String EVENT_MOVED_END_TAG_UP =
180         "Moved end tag up";
181 
182     /***
183      * Event constant: Moved start tag down.
184      */
185     protected static final String EVENT_MOVED_START_TAG_DOWN  =
186         "Moved start tag down";
187 
188     /***
189      * Event constant: Split tag.
190      */
191     protected static final String EVENT_SPLIT_TAG  =
192         "Split tag";
193 
194     /***
195      * Event constant: Deleted control characters.
196      */
197     protected static final String EVENT_DELETED_CONTROL_CHARS  =
198         "Deleted control characters";
199 
200     /***
201      * Event constant: Deleted pseudo-tag.
202      */
203     protected static final String EVENT_DELETED_PSEUDO_TAG  =
204         "Deleted pseudo-tag";
205 
206     /***
207      * Event constant: Escaped characters that are illegal or unwanted.
208      */
209     protected static final String EVENT_ESCAPED_CHARS  =
210         "Escaped characters";
211 
212     /***
213      * Event constant: Quoted attribute values.
214      */
215     protected static final String EVENT_QUOTED_ATTRIBUTE_VALUES  =
216         "Quoted attribute values";
217 
218     /***
219      * The name to use for the root element if missing. A root element
220      * with this name is created when not all elements and textual
221      * content are enclosed within a single element (the root).
222      * If <code>null</code>, processing stops with an exception if the root
223      * element is missing.
224      */
225     private final String missingRootName;
226 
227     /***
228      * Contains the names (Strings) of tags that can be converted to empty tags
229      * when required for fixing a document (e.g. "br" when
230      * <code>&lt;br&gt;</code> may be converted to <code>&lt;br/&gt;</code>
231      * during repair). Might be <code>null</code>.
232      */
233     private final Set<String> emptiableTags;
234 
235     /***
236      * Whether to delete {@link #CONTROL_CHARS control characters} (which are
237      * not allowed in XML 1.0 and discouraged in XML 1.1).
238      */
239     private final boolean deletingControlChars;
240 
241     /***
242      * Whether to delete "pseudo-tags", i.e., sequences that cannot be parsed as
243      * tags but look similar to them ("pseudo-tags" start with '&lt;' and end
244      * with '&gt;', contain a printable character after the initial '&lt;', and
245      * do not contain any inner '&lt;' or '&gt;'). If <code>true</code>, such
246      * sequences will be deleted; otherwise (default) the starting '&lt;' will
247      * be escaped.
248      */
249     private final boolean deletingPseudoTags;
250 
251     /***
252      * Whether to escape "&amp;" starting a possible nonstandard entity
253      * reference ("&amp;" at the start of one of the 5 predefined entity
254      * references or a character reference is never escaped, all other "&amp;"
255      * are always escaped).
256      */
257     private final boolean escapingPseudoEntities;
258 
259     /***
260      * Creates a new instance using the default output extension "xml" and the
261      * {@linkplain TiesConfiguration#CONF standard configuration}.
262      */
263     public XMLAdjuster() {
264         this("xml");
265     }
266 
267     /***
268      * Creates a new instance that reads its settings from the
269      * {@linkplain TiesConfiguration#CONF standard configuration}.
270      * @param outExt the extension to use for output files
271      */
272     public XMLAdjuster(final String outExt) {
273         this(outExt, TiesConfiguration.CONF);
274     }
275 
276     /***
277      * Creates a new instance, reading all settings from the given configuration.
278      * @param outExt the extension to use for output files
279      * @param config used to configure this instance (dereferenced, not null)
280      */
281     public XMLAdjuster(final String outExt, final TiesConfiguration config) {
282         this(outExt, config.getString(CONFIG_MISSING_ROOT, null),
283             config.containsKey(CONFIG_EMPTIABLE_TAGS)
284                     ? CollectionUtils.arrayAsSet(config.getStringArray(
285                             CONFIG_EMPTIABLE_TAGS))
286                     : null,
287             config.getBoolean(CONFIG_DELETE_CONTROL_CHARS),
288             config.getBoolean(CONFIG_DELETE_PSEUDO_TAGS),
289             config.getBoolean(CONFIG_ESCAPE_PSEUDO_ENTITIES), config);
290     }
291 
292     /***
293      * Creates a new instance from explicitly specified settings.
294      *
295      * @param outExt the extension to use for output files
296      * @param missingRoot the name of the root element to create if it is
297      * missing, i.e. if not all elements and textual content are enclosed
298      * within a single element (the root); if <code>null</code>, a missing
299      * root element makes processing stop with an exception
300      * @param emptiableTagSet the names (Strings) of the tags that may be
301      * converted to empty tags when this is required for fixing a document
302      * (e.g. "br" when <code>&lt;br&gt;</code> may be converted to
303      * <code>&lt;br/&gt;</code> during repair); might be <code>null</code> if
304      * there are none
305      * @param deleteControlChars whether to delete
306      * {@link #CONTROL_CHARS control characters} (which are not allowed in XML
307      * 1.0 and discouraged in XML 1.1)
308      * @param deletePseudoTags whether to
309      * {@linkplain #isDeletingPseudoTags() delete "pseudo-tags"}
310      * @param escapePseudoEntities whether to escape a "&amp;" that starts a
311      * possible nonstandard entity reference (a "&amp;" starting one of the 5
312      * predefined entity references or a character reference is never
313      * escaped, any other "&amp;" is always escaped)
314      * @param config used to configure superclasses; if <code>null</code>,
315      * the {@linkplain TiesConfiguration#CONF standard configuration} is used
316      */
317     public XMLAdjuster(final String outExt, final String missingRoot,
318             final Set<String> emptiableTagSet,
319             final boolean deleteControlChars, final boolean deletePseudoTags,
320             final boolean escapePseudoEntities,
321             final TiesConfiguration config) {
322         super(outExt, config);
323         this.missingRootName = missingRoot;
324         this.emptiableTags = emptiableTagSet;
325         this.deletingControlChars = deleteControlChars;
326         this.deletingPseudoTags = deletePseudoTags;
327         this.escapingPseudoEntities = escapePseudoEntities;
328     }
329 
330     /***
331      * Tries to fix corrupt XML documents, especially documents
332      * containing nesting errors. Delegates to
333      * {@link #fixedConstituents(CharSequence)} and writes the result to the
334      * specified writer.
335      *
336      * @param input the corrupt XML document
337      * @param out the writer to write the corrected XML document to; flushed but
338      * not closed by this method
339      * @throws IOException if an I/O error occurs while using the writer
340      * @throws ParsingException if the XML input contains an uncorrectable error
341      */
342     public final void adjust(final CharSequence input, final Writer out)
343             throws IOException, ParsingException {
344         final XMLConstituent firstConst = fixedConstituents(input);
345 
346         // write corrected data to out
347         XMLConstituent currentConst = firstConst;
348         while (currentConst != null) {
349             out.write(currentConst.getRepresentantion());
350             currentConst = currentConst.nextConstituent();
351         }
352         out.flush();
353     }
354 
355     /***
356      * Repairs a corrupt XML document (nesting errors in particular) that is
357      * read from a reader. The complete input is read into a string and handed
358      * to {@link #adjust(CharSequence, Writer)}.
359      *
360      * @param in the reader providing the corrupt XML document; not closed
361      * by this method
362      * @param out the writer receiving the corrected XML document; flushed
363      * but not closed by this method
364      * @throws IOException if an I/O error occurs while using the reader or
365      * writer
366      * @throws ParsingException if the XML input contains an uncorrectable error
367      */
368     public final void adjust(final Reader in, final Writer out)
369             throws IOException, ParsingException {
370         adjust(IOUtils.readToString(in), out);
371     }
372 
373     /***
374      * Method called by the {@link #logEvent(String, String)} methods whenever
375      * an event occurred to ensure the event is acceptable. Subclasses that want
376      * to prevent certain events can override this method and throw an
377      * exception if an "illegal" event is encountered.
378      * This implementation is intentionally empty, letting all events pass.
379      *
380      * @param eventType the event that occurred; should be one of the
381      * EVENT constants defined in this class.
382      * @throws ParsingException could be thrown by subclasses if the event is
383      * considered illicit
384      */
385     protected void checkEvent(final String eventType) throws ParsingException {
386     }
387 
388     /***
389      * Helper method called after completing an end tag with a missing start tag
390      * or processing a tentative start tag to check whether the next appearance
391      * of this tag type is a start tag or another end tag.
392      * The second case means that another start tag is missing
393      * -- the missing start tag is created as a tentative tag and inserted.
394      *
395      * @param endTag the end tag whose next appearance should be checked
396      * @param insertAfter the tag after which the newly created tentative
397      * tag should be inserted
398      * @param unprocessedTags the container of unprocessed start and end tags
399      * @throws ParsingException might be thrown by {@link #checkEvent(String)}
400      * implementations in subclasses if an "illicit" event occurred
401      */
402     private void checkNextAppearance(final TagConstituent endTag,
403             final TagConstituent insertAfter,
404             final UnprocessedTags unprocessedTags) throws ParsingException {
405         // check whether the next appearance is a start tag (ok) or end tag
406         // (insert tentative start tag)
407         if (correspondingEndTag(endTag.getName(), -1, unprocessedTags,
408                 false) != null) {
409             // this means next appearance is another end tag:
410             // create tentative start tag after current end tag
411             final TagConstituent tentative = new TagConstituent(
412                 TagConstituent.START_TAG, endTag.getName(),
413                 insertAfter.getMarkupSeriesNo());
414             tentative.setVariety(TagVariety.TENTATIVE);
415             // insert after specified tag + following whitespace, if any;
416             // nextConst is null if the specified tag is the last constituent
417             final XMLConstituent nextConst = insertAfter.nextConstituent();
418             if ((nextConst != null) && (nextConst.getType()
419                     == OtherConstituent.OUTER_WHITESPACE)) {
420                 nextConst.insertAfter(tentative);
421             } else {
422                 insertAfter.insertAfter(tentative);
423             }
424             // insert at begin of unprocessed tags and log event
425             unprocessedTags.push(tentative, false);
426             logEvent(EVENT_INSERTED_MISSING_START_TAG, tentative);
427         }
428     }
429 
430     /***
431      * Helper method that creates an constituent that is part of a markup series
432      * (i.e., that is neither text nor a CDATA section).
433      *
434      * @param currentToken the string representation of the constituent
435      * @param capturedText the text captured by the XML text tokenizer, used to
436      * determine the token type
437      * @param markupSeriesNo the number of the markup series of the constituent
438      * @param startAndEndTags all start and end tags are added to this
439      * container, if it isn't <code>null</code>
440      * @return the created constituent
441      */
442     private XMLConstituent createMarkupConstituent(final String currentToken,
443             final String capturedText, final int markupSeriesNo,
444             final UnprocessedTags startAndEndTags) {
445         // used to determine the exact token type
446         final char firstChar = capturedText.charAt(0);
447         final char lastChar = capturedText.charAt(capturedText.length() - 1);
448         final XMLConstituent result;
449 
450         if (firstChar == '/') {
451             result = new TagConstituent(TagConstituent.END_TAG,
452                 capturedText.substring(1), currentToken, markupSeriesNo);
453             if (startAndEndTags != null) {
454                 startAndEndTags.push((TagConstituent) result);
455             }
456         } else if (lastChar == '/') {
457             result = new TagConstituent(TagConstituent.EMPTY_TAG,
458                 capturedText.substring(0, capturedText.length() - 1),
459                 currentToken, markupSeriesNo);
460         } else if (firstChar == '!') {
461             result = new OtherConstituent(OtherConstituent.DOCTYPE,
462                 currentToken);
463         } else if (firstChar == '?') {
464             // prolog ("<?xml...") or processing instruction?
465             if (capturedText.substring(1).equals("xml")) {
466                 result = new OtherConstituent(OtherConstituent.XML_PROLOG,
467                     currentToken);
468             } else {
469                 result = new OtherConstituent(OtherConstituent.PI,
470                     currentToken);
471             }
472         } else if (firstChar == '<') {
473             result = new OtherConstituent(OtherConstituent.COMMENT,
474                 currentToken);
475         } else {
476             result = new TagConstituent(TagConstituent.START_TAG, capturedText,
477                 currentToken, markupSeriesNo);
478             if (startAndEndTags != null) {
479                 startAndEndTags.push((TagConstituent) result);
480             }
481         }
482         return result;
483     }
484 
485     /***
486      * Checks whether <code>unprocessedTags</code> contains a
487      * matching tag within the specified <code>markupSeriesNo</code>, not
488      * preceeded by a matching start tag. Returns the found end tag, if it
489      * exists; returns <code>null</code> otherwise.
490      *
491      * @param tagname the name of the tag to check
492      * @param markupSeriesNo the markup series number to match;
493      * or <code>-1</code> if the markup series number doesn't matter
494      * @param unprocessedTags the container of start and end tags to check
495      * @param remove whether to remove the found end tag from the
496      * <code>unprocessedTags</code> container
497      * @return the found corresponding end tag,
498      * or <code>null</code> if no matching appearance was found
499      */
500     private TagConstituent correspondingEndTag(final String tagname,
501             final int markupSeriesNo, final UnprocessedTags unprocessedTags,
502             final boolean remove) {
503         final TagConstituent result;
504         final TagConstituent suitableTag;
505 
506         if (markupSeriesNo >= 0) {
507             suitableTag =
508                 unprocessedTags.findInSeries(tagname, markupSeriesNo, true);
509         } else {
510             suitableTag = unprocessedTags.findFirst(tagname);
511         }
512 
513         // check whether we found a start tag or end tag (if any)
514         if ((suitableTag != null)
515             && (suitableTag.getType() == TagConstituent.END_TAG)) {
516                 // found suitable end tag
517                 result = suitableTag;
518                 if (remove) {
519                     unprocessedTags.forceRemove(suitableTag);
520                 }
521         } else {
522             // no tag or start tag
523             result = null;
524         }
525 
526         return result;
527     }
528 
529     /***
530      * Checks whether <code>openTags</code> contains a matching start tag
531      * (ignoring the root tag), either within the specified
532      * <code>markupSeriesNo</code> or a tentative tag anywhere.
533      * Returns and remove the found end tag, if it exists.
534      * Returns <code>null</code> otherwise.
535      *
536      * @param tagname the name of the tag to check
537      * @param markupSeriesNo the markup series number to match
538      * @param openTags the container of currently open start tags to check
539      * @return the found and removed matching start tag,
540      * or <code>null</code> if no matching non-root appearance was found
541      */
542     private TagConstituent correspondingOpenTag(final String tagname,
543             final int markupSeriesNo, final OpenTags openTags) {
544         // use last match within specified markup series, if any
545         TagConstituent result =
546             openTags.findInSeries(tagname, markupSeriesNo, false);
547 
548         // otherwise look for tentative tag
549         if (result == null) {
550             result = openTags.findTentativeTag(tagname);
551         }
552 
553         if (result != null) {
554             if (openTags.isRoot(result)) {
555                 // the root tag is not accepted -- return null instead
556                 result = null;
557             } else {
558                 // remove found tag
559                 openTags.forceRemove(result);
560             }
561         }
562         return result;
563     }
564 
565     /***
566      * Tries to fix corrupt XML documents, especially documents containing
567      * nesting errors. Simply delegates to {@link #adjust(Reader, Writer)};
568      * the <code>context</code> is not used.
569      *
570      * @param reader reader containing the text to process; should not be closed
571      * by this method
572      * @param writer the writer to write the processed text to; might be flushed
573      * but not closed by this method
574      * @param context a map of objects that are made available for processing
575      * @throws IOException if an I/O error occurs while using the reader or
576      * writer
577      * @throws ParsingException if the XML input contains an uncorrectable error
578      */
579     protected void doProcess(final Reader reader, final Writer writer,
580             final ContextMap context) throws IOException, ParsingException {
581         adjust(reader, writer);
582     }
583 
584     /***
585      * Helper method that decides whether the end tag of an open tag is certain
586      * to be missing. This is the case if (a) the next appearance of this tag
587      * type is NOT an end tag and (b) there are at least as many unprocessed
588      * start tags as unprocessed end tags of this type.
589      *
590      * @param openTagName the name of the open tag to check
591      * @param unprocessedTags the container of unprocessed start and end tags
592      * @return <code>true</code> iff both conditions stated above are fulfilled
593      */
594     private boolean endTagMissing(final String openTagName,
595             final UnprocessedTags unprocessedTags) {
596         // (a) must hold first; only then are the counts compared for (b)
597         if (correspondingEndTag(openTagName, -1, unprocessedTags, false)
598                 != null) {
599             return false;
600         }
601         return unprocessedTags.startTagCount(openTagName)
602             >= unprocessedTags.endTagCount(openTagName);
603     }
604 
605     /***
606      * Replaces every occurrence of a specified pattern in the representation
607      * of a constituent; logs the event if the representation changed.
608      *
609      * @param constituent the constituent to check
610      * @param pattern the pattern to replace
611      * @param replacement the replacement text
612      * @param eventType the event to log when replacing was necessary
613      * @throws ParsingException might be thrown by {@link #checkEvent(String)}
614      * implementations in subclasses if an "illicit" event occurred
615      */
616     private void replaceInRep(final XMLConstituent constituent,
617             final Pattern pattern, final String replacement,
618             final String eventType) throws ParsingException {
619         final String oldRep = constituent.getRepresentantion();
620         final String newRep =
621             TextUtils.replaceAll(oldRep, pattern, replacement);
622         // compare contents, not references: an unchanged text need not be
623         // returned as the very same String instance
624         if (!oldRep.equals(newRep)) {
625             logEvent(eventType, "old: '" + oldRep + "', new: '" + newRep + "'");
626             constituent.setRepresentantion(newRep);
627         }
628     }
629 
630     /***
631      * Returns the constituents of an XML-like document after fixing possible
632      * nesting errors etc.
633      *
634      * @param input the XML-like input data
635      * @return a reference to the first contained constituent; the list
636      * of constituents can be traversed by calling
637      * {@link XMLConstituent#nextConstituent()} on each constituent until
638      * <code>null</code> is returned
639      * @throws ParsingException if the XML input contains an uncorrectable error
640      */
641     public final XMLConstituent fixedConstituents(final CharSequence input)
642             throws ParsingException {
643         // first run: create list of all tags + container of start and end tags;
644         // fix character-level errors
645         final UnprocessedTags unprocessedTags = new UnprocessedTags();
646         XMLConstituent firstConst =
647             rawConstituents(input, true, unprocessedTags);
648 
649         // second run: ensure correct nesting of start + end tags
650         XMLConstituent currentConst = firstConst;
651         XMLConstituent lastConst = currentConst;
652         TagConstituent currentTag;
653         final OpenTags openTags = new OpenTags();
654 
655         // all root content (tags, non-whitespace text, CDATA sections) must
656         // be enclosed in a root element
657         XMLConstituent firstRootContent = null;
658         boolean insertedMissingRoot = false;
659 
660         while (currentConst != null) {
661             if ((firstRootContent == null) && isRootContent(currentConst)) {
662                 // this is the first root content
663                 firstRootContent = currentConst;
664 
665                 if (currentConst.getType() != TagConstituent.START_TAG) {
666                     // not a start tag: try to insert missing root element
667                     insertMissingRoot(firstRootContent, openTags);
668                     insertedMissingRoot = true;
669                 }
670             } else if (openTags.isEmpty() && (isRootContent(currentConst))) {
671                 // all tags have been closed but this is root content:
672                 // try to insert missing root element
673                 insertMissingRoot(firstRootContent, openTags);
674                 insertedMissingRoot = true;
675             }
676 
677             if (currentConst instanceof TagConstituent) {
678                 currentTag = (TagConstituent) currentConst;
679                 if (currentTag.getType() == TagConstituent.START_TAG) {
680                     // move from unprocessed tags to open tags
681                     unprocessedTags.forceRemove(currentTag);
682                     openTags.push(currentTag);
683                 } else if (currentTag.getType() == TagConstituent.END_TAG) {
684                     // remove from unprocessed tags and do handling
685                     unprocessedTags.forceRemove(currentTag);
686                     handleEndTag(currentTag, openTags, unprocessedTags);
687                 }
688             }
689 
690             lastConst = currentConst; // so we don't lose the very last one
691             currentConst = currentConst.nextConstituent();
692         }
693 
694         // handle end-of-file
695         handleEOF(lastConst, openTags, unprocessedTags, insertedMissingRoot);
696 
697         // first constituent might have changed (e.g. due to moves in first tag
698         // series, insertion of root element): rewind to the head of the list
699         while (firstConst.hasPrevious()) {
700             firstConst = firstConst.previousConstituent();
701         }
702         return firstConst;
703     }
704 
705     /***
706      * Helper method that fixes character errors in the representation of a
707      * constituent, if any.
708      *
709      * @param constituent the constituent to check
710      * @throws ParsingException might be thrown by {@link #checkEvent(String)}
711      * implementations in subclasses if an "illicit" event occurred
712      */
713     private void fixRepresentation(final XMLConstituent constituent)
714             throws ParsingException {
715         // delete control characters if configured
716         if (deletingControlChars) {
717             replaceInRep(constituent, CONTROL_CHARS, "",
718                 EVENT_DELETED_CONTROL_CHARS);
719         }
720 
721         if (needsAmpEscape(constituent)) {
722             if (escapingPseudoEntities) {
723                 // escape every '&' unless starting predefined entity or
724                 // character reference
725                 replaceInRep(constituent, PSEUDO_AMP, ESCAPED_AMP,
726                     EVENT_ESCAPED_CHARS);
727             } else {
728                 // escape only illegal '&' (not start of any entity)
729                 replaceInRep(constituent, SPURIOUS_AMP, ESCAPED_AMP,
730                     EVENT_ESCAPED_CHARS);
731             }
732         }
733     }
734 
735     /***
736      * Helper method for handling an end tag. Here the actual work of the
737      * algorithm takes place, because we have to ensure a match.
738      *
739      * @param endTag the end tag to handle
740      * @param openTags must contain all currently open tags
741      * @param unprocessedTags must contain all unprocessed start and end tags
742      * @throws ParsingException might be thrown by {@link #checkEvent(String)}
743      * implementations in subclasses if an "illicit" event occurred
744      */
745     protected void handleEndTag(final TagConstituent endTag,
746             final OpenTags openTags, final UnprocessedTags unprocessedTags)
747             throws ParsingException {
748         TagConstituent lastOpenTag;
749         TagVariety lastOpenTagVariety;
750         String lastOpenTagName;
751         TagConstituent correspondingTag;
752         boolean matchFound = false;
753 
754         // iterate until end tag has been processed by finding a match
755         while (!matchFound) {
756             lastOpenTag = openTags.peek();
757             lastOpenTagVariety = lastOpenTag.getVariety();
758             lastOpenTagName = lastOpenTag.getName();
759             //Log.TIES.debug("Mismatch: " + lastOpenTag + " / " + endTag);
760 
761             if (lastOpenTagName.equals(endTag.getName())) {
762                 // last open tag matched: remove it and ensure it's regular
763                 // (checking next appearance for tentative tags)
764                 if (openTags.popAndRegularize()
765                         && (lastOpenTagVariety == TagVariety.TENTATIVE)) {
766                     checkNextAppearance(endTag, endTag, unprocessedTags);
767                 }
768                 matchFound = true; // done
769             } else if (lastOpenTagVariety == TagVariety.TENTATIVE) {
770                 // last open tag is tentative: insert after the end tag; removed
771                 // it from open tags and re-added to unprocessed tags
772                 moveTentativeTag(lastOpenTag, endTag, openTags,
773                     unprocessedTags);
774                 // now retry
775             } else if (openTags.containsNonTentative(endTag.getName())
776                     && (correspondingTag = correspondingEndTag(
777                     lastOpenTagName, endTag.getMarkupSeriesNo(),
778                     unprocessedTags, true)) != null) {
779                 // A non-tentative start tag for the current end tag exists and
780                 // there is a corresponding end tag for the last open tag in the
781                 // current markup series. Remove the last open tag and make it
782                 // regular if necessary
783                 openTags.popAndRegularize();
784 
785                 // move the corresponding end tag before the current end tag
786                 correspondingTag.remove();
787                 endTag.insertBefore(correspondingTag);
788 
789                 // check next appearance if tentative (if necessary inserting
790                 // new tentative tag after the current end tag)
791                 if (lastOpenTagVariety == TagVariety.TENTATIVE) {
792                     checkNextAppearance(correspondingTag, endTag,
793                         unprocessedTags);
794                 }
795 
796                 // log event and retry
797                 logEvent(EVENT_MOVED_END_TAG_UP, correspondingTag);
798             } else if ((correspondingTag = correspondingOpenTag(
799                     endTag.getName(), lastOpenTag.getMarkupSeriesNo(),
800                     openTags)) != null) {
801                 // Open tags contained a non-root tag corresponding to the end
802                 // tag, either within the markup series of the last open tag or
803                 // a tentative tag anywhere.
804                 // Move the corresponding start after the last open tag
805                 correspondingTag.remove();
806                 lastOpenTag.insertAfter(correspondingTag);
807 
808                 if (correspondingTag.getVariety() == TagVariety.REGULAR) {
809                     // log event
810                     logEvent(EVENT_MOVED_START_TAG_DOWN, correspondingTag);
811                 } else {
812                     if (correspondingTag.getVariety() == TagVariety.TENTATIVE) {
813                         // tentative tag: check whether the next appearance is
814                         // a start tag (ok) or end tag (another start tag
815                         // missing: insert tentative start tag)
816                         checkNextAppearance(endTag, endTag, unprocessedTags);
817                     }
818 
819                     // convert to regular tag
820                     correspondingTag.setVariety(TagVariety.REGULAR);
821                     // event was already logged when creating the irregular tag
822                 }
823                 matchFound = true; // done
824             } else if (!openTags.contains(endTag.getName())) {
825                 // no open tag matching the current end tag -- we have to
826                 // create one
827                 final TagConstituent missingStartTag =
828                     new TagConstituent(TagConstituent.START_TAG,
829                         endTag.getName(), lastOpenTag.getMarkupSeriesNo());
830 
831                 // insert new tag after the last open tag
832                 lastOpenTag.insertAfter(missingStartTag);
833                 logEvent(EVENT_INSERTED_MISSING_START_TAG, missingStartTag);
834 
835                 // Check whether the next appearance is a start tag (ok) or end
836                 // tag (another start tag missing: insert tentative start tag)
837                 checkNextAppearance(endTag, endTag, unprocessedTags);
838                 matchFound = true; // done
839             } else if ((lastOpenTag.getMarkupSeriesNo()
840                     == endTag.getMarkupSeriesNo())
841                     && !endTagMissing(lastOpenTagName, unprocessedTags)) {
842                 // the last open tag is within the current markup series and a
843                 // corresponding end tag exists for it (in any markup series):
844                 // Move it after the current end tag, removing it from open tags
845                 // and from current place in list
846                 openTags.forceRemove(lastOpenTag);
847                 lastOpenTag.remove();
848                 endTag.insertAfter(lastOpenTag);
849 
850                 // append at the start of unprocessed tags of this series
851                 unprocessedTags.push(lastOpenTag, false);
852 
853                 // log event if this is a regular tag
854                 if (lastOpenTagVariety == TagVariety.REGULAR) {
855                     logEvent(EVENT_MOVED_START_TAG_DOWN, lastOpenTag);
856                 }
857                 // now retry
858             } else {
859                 // no way out: we have to "get rid of" the last open tag
860                 final boolean endTagMissing =
861                     endTagMissing(lastOpenTagName, unprocessedTags);
862 
863                 if (endTagMissing && isAnEmptiableTag(lastOpenTagName)) {
864                     // convert last open tag to empty tag
865                     final TagConstituent newEmptyTag =
866                         replaceWithEmptyCopy(lastOpenTag);
867                     logEvent(EVENT_CONVERTED_TO_EMPTY_TAG, newEmptyTag);
868                 } else {
869                     // create and insert suitable end tag and insert it before
870                     // the current end tag
871                     final TagConstituent matchingEndTag = new TagConstituent(
872                         TagConstituent.END_TAG, lastOpenTagName,
873                         endTag.getMarkupSeriesNo());
874                     endTag.insertBefore(matchingEndTag);
875 
876                     if (endTagMissing) {
877                         // end tag of lastOpenTag type is missing: created it.
878                         // Log event for regular tags
879                         if (lastOpenTagVariety == TagVariety.REGULAR) {
880                             logEvent(EVENT_INSERTED_MISSING_END_TAG,
881                                 matchingEndTag);
882                         }
883                     } else {
884                         // sufficient end tags available:
885                         // split lastOpenTag by creating continuation copy
886                         final TagConstituent continuation = new TagConstituent(
887                             TagConstituent.START_TAG, lastOpenTagName,
888                             lastOpenTag.getRepresentantion(),
889                             endTag.getMarkupSeriesNo());
890                         continuation.setVariety(TagVariety.CONTINUATION);
891 
892                         // insert copy after current end tag and log event
893                         endTag.insertAfter(continuation);
894                         unprocessedTags.push(continuation, false);
895                         logEvent(EVENT_SPLIT_TAG, lastOpenTag);
896                     }
897 
898                     // Remove from open tags and ensure it's regular; retry
899                     openTags.popAndRegularize();
900                 }
901             }
902         }
903     }
904 
905     /***
906      * Helper method for handling an the end of a file. Suitable end tags are
907      * created to close any left-over open tags.
908      *
909      * @param lastConst the last constituent in the original input (must not
910      * have a successor)
911      * @param openTags must contain all currently open tags
912      * @param unprocessedTags must contain all unprocessed start and end tags;
913      * should better be empty
914      * @param insertedMissingRoot whether the start tag of a missing root
915      * element was created (in this case we'll insert the corresponding end tag
916      * without logging another event)
917      * @throws ParsingException might be thrown by {@link #checkEvent(String)}
918      * implementations in subclasses if an "illicit" event occurred
919      */
920     protected void handleEOF(final XMLConstituent lastConst,
921             final OpenTags openTags, final UnprocessedTags unprocessedTags,
922             final boolean insertedMissingRoot) throws ParsingException {
923         // check state
924         if (lastConst.nextConstituent() != null) {
925             Util.LOG.error("Implementation error: constituents after last "
926                 + "constituent: " + lastConst);
927         }
928         if (!unprocessedTags.isEmpty()) {
929             Util.LOG.error("Implementation error: still unprocessed tags at "
930                 + "end-of-file -- last constituent: " + lastConst
931                 + " unprocessed tags: " + unprocessedTags);
932         }
933 
934         XMLConstituent currentConst = lastConst;
935         TagConstituent lastOpenTag = openTags.peek();
936 
937         if (lastOpenTag != null) {
938             // there are unclosed elements -- look for last root content
939             while (!isRootContent(currentConst)) {
940                 currentConst = currentConst.previousConstituent();
941             }
942 
943             // insert missing end tags after last root content
944             TagConstituent missingEndTag;
945             while (lastOpenTag != null) {
946                 // create suitable end tag for each unclosed open tag
947                 missingEndTag = new TagConstituent(TagConstituent.END_TAG,
948                     lastOpenTag.getName());
949 
950                 // insert after last root content
951                 currentConst.insertAfter(missingEndTag);
952                 currentConst = missingEndTag;
953                 // remove handled tag and peek the previous one
954                 openTags.pop();
955                 lastOpenTag = openTags.peek();
956 
957                 // log an event unless this is the end of a missing root
958                 if (!(insertedMissingRoot && openTags.isEmpty())) {
959                     logEvent(EVENT_INSERTED_MISSING_END_TAG, missingEndTag);
960                 }
961             }
962         }
963     }
964 
965     /***
966      * Logs the occurance of an event necessary for fixing a document.
967      * This method variant is used for character errors.
968      *
969      * @param eventType the event that occurred; should be one of the
970      * EVENT constants defined in this class.
971      * @param details a detailed description of the event
972      * @throws ParsingException might be thrown by {@link #checkEvent(String)}
973      * implementations in subclasses if the event is considered illicit
974      */
975     protected void logEvent(final String eventType, final String details)
976             throws ParsingException {
977         checkEvent(eventType);
978         Util.LOG.debug("Modified document: " + eventType
979                 + " (" + details.toString() + ')');
980     }
981 
982     /***
983      * Logs the occurance of an event necessary for fixing a document.
984      * This method variant is used for nesting errors and missing root elements.
985      *
986      * @param eventType the event that occurred; should be one of the
987      * EVENT constants defined in this class.
988      * @param tag the involved tag
989      * @throws ParsingException might be thrown by {@link #checkEvent(String)}
990      * implementations in subclasses if the event is considered illicit
991      */
992     protected void logEvent(final String eventType, final TagConstituent tag)
993             throws ParsingException {
994         checkEvent(eventType);
995         Util.LOG.debug("Modified document: " + eventType
996                 + " (" + tag.toString() + ')');
997     }
998 
999     /***
1000      * Inserts a root element with the configured name
1001      * ({@link #missingRootName}); or throws a parsing exception to finish
1002      * processing if inserting a root element is not allowed. This method
1003      * only creates and inserts a start tag of the configured type; the
1004      * matching end tag will automatically be added by
1005      * {@link #handleEOF(XMLConstituent, OpenTags, UnprocessedTags, boolean)}.
1006      *
1007      * @param firstRootContent the root start will be inserted before this
1008      * content
1009      * @param openTags the container of open tags to push the root tag to,
1010      * assumed to be empty
1011      * @throws ParsingException if inserting a missing root tag is now allowed
1012      * (no name for a missing root tag specified)
1013      */
1014     private void insertMissingRoot(final XMLConstituent firstRootContent,
1015             final OpenTags openTags) throws ParsingException {
1016         if (missingRootName != null) {
1017             // create start tag
1018             final TagConstituent rootStartTag = new TagConstituent(
1019                     TagConstituent.START_TAG, missingRootName, 0);
1020 
1021             // insert before first root content
1022             firstRootContent.insertBefore(rootStartTag);
1023 
1024             // push to open tags and log event
1025             openTags.push(rootStartTag);
1026             logEvent(EVENT_INSERTED_MISSING_ROOT_ELEMENT, rootStartTag);
1027         } else {
1028             throw new ParsingException("Root tag missing (resp. tags or "
1029                     + "textual content outside root element)");
1030         }
1031     }
1032 
1033     /***
1034      * Whether the specified tag is one of the tags that can be converted an
1035      * empty tags when required for fixing a document. For example, "br" when
1036      * <code>&lt;br&gt;</code> may be converted to <code>&lt;br/&gt;</code>
1037      * during repair.
1038      *
1039      * @param tag the name of the tag to look up
1040      * @return <code>true</code> iff this tag is contained in the set of
1041      * emptiable tags
1042      */
1043     protected boolean isAnEmptiableTag(final String tag) {
1044         return (emptiableTags != null) && emptiableTags.contains(tag);
1045     }
1046 
1047     /***
1048      * Whether {@link #CONTROL_CHARS control characters} are deleted (these
1049      * characters are not allowed in XML 1.0 and discouraged in XML 1.1).
1050      * @return <code>true</code> iff control characters are deleted
1051      */
1052     public boolean isDeletingControlChars() {
1053         return deletingControlChars;
1054     }
1055 
1056     /***
1057      * Whether "pseudo-tags" are deleted, i.e., sequences that cannot be parsed
1058      * as tags but look similar to them. "Pseudo-tags" start with '&lt;' and end
1059      * with '&gt;', contain a printable character after the initial '&lt;', and
1060      * do not contain any inner '&lt;' or '&gt;'). Disabled by default.
1061      *
1062      * @return <code>true</code> iff pseudo-tags are be deleted (otherwise the
1063      * starting '&lt;' is escaped)
1064      */
1065     public boolean isDeletingPseudoTags() {
1066         return deletingPseudoTags;
1067     }
1068 
1069     /***
1070      * Whether to escape "&amp;" starting a possible nonstandard entity
1071      * reference ("&amp;" at the start of one of the 5 predefined entity
1072      * references or a character reference is never escaped, all other "&amp;"
1073      * are always escaped).
1074      *
1075      * @return <code>true</code> iff "pseudo entites" are escaped
1076      */
1077     public boolean isEscapingPseudoEntities() {
1078         return escapingPseudoEntities;
1079     }
1080 
1081     /***
1082      * Checks whether a constituent is content that must be enclosed in the
1083      * root element. Tags, textual content, and CDATA sections are root
1084      * content; other constituent types can occur outside of the root element.
1085      *
1086      * @param constituent the constituent to check
1087      * @return <code>true</code> iff the constituent is root content
1088      */
1089     private boolean isRootContent(final XMLConstituent constituent) {
1090         final boolean result;
1091         if (constituent instanceof TagConstituent) {
1092             // all tags are root content
1093             result = true;
1094         } else {
1095             if ((constituent.getType() == OtherConstituent.CDATA_SECTION)
1096                     || (constituent.getType() == OtherConstituent.TEXT)) {
1097                 // text and CDATA sections are root content
1098                 result = true;
1099             } else {
1100                 result = false;
1101             }
1102         }
1103         return result;
1104     }
1105 
1106     /***
1107      * Helper method that moves a tentative tag after a specified end tag.
1108      * The tentative tag must originally be contained in the container of open
1109      * tags; it is popped from this container and re-added to the container of
1110      * unprocessed tags.
1111      *
1112      * @param tentativeTag the tag to move, should be a tentative tag
1113      * @param endTag the tentative tag is moved after this tag
1114      * @param openTags must contain all currently open tags, including the tag
1115      * to move
1116      * @param unprocessedTags must contain all unprocessed start and end tags
1117      */
1118     private void moveTentativeTag(final TagConstituent tentativeTag,
1119             final TagConstituent endTag, final OpenTags openTags,
1120             final UnprocessedTags unprocessedTags) {
1121         // remove from open tags and from current place in list
1122         openTags.forceRemove(tentativeTag);
1123         tentativeTag.remove();
1124 
1125         // adjust series no. and re-add after current tag
1126         tentativeTag.setMarkupSeriesNo(endTag.getMarkupSeriesNo());
1127         endTag.insertAfter(tentativeTag);
1128 
1129         // prepend at the begin of unprocessed tags of this series
1130         unprocessedTags.push(tentativeTag, false);
1131     }
1132 
1133     /***
1134      * Checks whether escapes for the "&amp;" character are required in the
1135      * representation of a constituent.
1136      *
1137      * @param constituent the constituent to check
1138      * @return <code>true</code> if the {@linkplain XMLConstituent#getType()
1139      * type} of the constituent is {@link TagConstituent#START_TAG},
1140      * {@link TagConstituent#EMPTY_TAG}, or {@link OtherConstituent#TEXT};
1141      * <code>false</code> for all other types (which either cannot contain any
1142      * "&amp;" or do not need to escape "&amp;")
1143      */
1144     private boolean needsAmpEscape(final XMLConstituent constituent) {
1145         final short constType = constituent.getType();
1146         return (constType == TagConstituent.START_TAG)
1147             || (constType == TagConstituent.EMPTY_TAG)
1148             || (constType == OtherConstituent.TEXT);
1149     }
1150 
1151     /***
1152      * Returns the raw constituents of an XML-like document. The constituents
1153      * are returned "raw" as they occur in the input, without fixing possible
1154      * nesting errors etc.
1155      *
1156      * @param input the XML-like input data
1157      * @param fixCharacterErrors whether to try to fix character errors, i.e.
1158      * unescaped "&lt;" and "&amp;" and tags with unquoted attribute values; if
1159      * <code>false</code>, unescaped "&lt;" in textual content and unquoted
1160      * attribute values will yield an exception, while any unescaped "&amp"
1161      * and unescaped "&lt;" in attribute values will be ignored
1162      * @return a reference to the first contained constituent; the list
1163      * of constituents can be traversed by calling
1164      * {@link XMLConstituent#nextConstituent()} on each constituent untill
1165      * <code>null</code> is returned
1166      * @throws ParsingException if the XML input contains an uncorrectable error
1167      */
1168     public final XMLConstituent rawConstituents(final CharSequence input,
1169             final boolean fixCharacterErrors) throws ParsingException {
1170         return rawConstituents(input, fixCharacterErrors, null);
1171     }
1172 
1173     /***
1174      * Returns the raw constituents of an XML-like document. The constituents
1175      * are returned "raw" as they occur in the input, without fixing possible
1176      * nesting errors etc.
1177      *
1178      * @param input the XML-like input data
1179      * @param fixCharacterErrors whether to try to fix character errors, i.e.
1180      * unescaped "&lt;" and "&amp;" and tags with unquoted attribute values; if
1181      * <code>false</code>, unescaped "&lt;" in textual content and unquoted
1182      * attribute values will yield an exception, while any unescaped "&amp;"
1183      * and unescaped "&lt;" in attribute values will be ignored
1184      * @param startAndEndTags all start and end tags are added to this
1185      * container, if it isn't <code>null</code>
1186      * @return a reference to the first contained constituent; the list
1187      * of constituents can be traversed by calling
1188      * {@link XMLConstituent#nextConstituent()} on each constituent until
1189      * <code>null</code> is returned
1190      * @throws ParsingException if the XML input contains an uncorrectable error
1191      */
1192     protected final XMLConstituent rawConstituents(final CharSequence input,
1193             final boolean fixCharacterErrors,
1194             final UnprocessedTags startAndEndTags) throws ParsingException {
1195         // tokenizer for XML-like input -- if fixCharacterErrors we'll ensure
1196         // whitespace validity ourselves, otherwise let the tokenizer validate
1197         final TextTokenizer tokenizer =
1198             XMLTokenizerFactory.createXMLTokenizer(input, !fixCharacterErrors);
1199         String tokenText;
1200         String precWS;
1201         String captured = null;
1202         boolean done = false;
1203         XMLConstituent firstConst = null;
1204         XMLConstituent priorConst = null;
1205         XMLConstituent currentConst;
1206         // whether we're currently in a markup series (any kind of tags and
1207         // declarations) or outside (textual content or CDATA sections)
1208         boolean inMarkupSeries = true;
1209         // markup series are sequentially numbered, starting at 0
1210         int markupSeriesNo = 0;
1211 
1212         // variables for checking and fixing character errors
1213         boolean fixedCharError;
1214         XMLConstituent extraConst = null;
1215         String extraWS = null;
1216 
1217         try {
1218             while (!done) {
1219                 tokenText = tokenizer.nextToken();
1220                 if (tokenText == null) {
1221                     done = true;    // reached end of input
1222                 } else {
1223                     // create suitable type
1224                     captured = tokenizer.capturedText();
1225                 }
1226 
1227                 if (tokenizer.hasPrecedingWhitespace()) {
1228                     precWS = tokenizer.precedingWhitespace();
1229                     if (fixCharacterErrors
1230                             && (!tokenizer.precedingWhitespaceIsValid())) {
1231                         // "whitespace" is invalid: try to fix it
1232                         fixedCharError = false;
1233                         if (precWS.endsWith("<") && (captured.length() == 0)) {
1234                             // trailing '<' and text token is TEXT:
1235                             final String remainingWS = precWS.substring(0,
1236                                 precWS.length() - 1);
1237                             final int gtPos = tokenText.indexOf('>');
1238 
1239                             if (tokenizer.isValidWhitespace(remainingWS)
1240                                     && gtPos > 0) {
1241                                 // trailing '<' is only problem and
1242                                 // text contains '>' after at least one char
1243                                 final Object[] fixed
1244                                     = tryToFixTag('<' + tokenText);
1245                                 if (fixed != null) {
1246                                     // store tag + set token to unused rest
1247                                     extraConst = (TagConstituent) fixed[0];
1248                                     tokenText = (String) fixed[1];
1249                                     fixedCharError = true;
1250                                     precWS = remainingWS;
1251                                 } else if (deletingPseudoTags) {
1252                                     // delete "pseudo-tags" + set token to rest
1253                                     logEvent(EVENT_DELETED_PSEUDO_TAG, '<'
1254                                         + tokenText.substring(0, gtPos + 1));
1255                                     tokenText = tokenText.substring(gtPos + 1);
1256                                     fixedCharError = true;
1257                                     precWS = remainingWS;
1258                                 }
1259                             }
1260                         }
1261 
1262                         if (fixedCharError) {
1263                             // fixed error: check if next token (TEXT)
1264                             // now starts with whitespace
1265                             final int initWS =
1266                                 tokenizer.initialWhitespaceCount(tokenText);
1267                             if (initWS > 0) {
1268                                 if (extraConst != null) {
1269                                     // store after extra constituent
1270                                     extraWS = tokenText.substring(0, initWS);
1271                                 } else {
1272                                     // append to current whitespace
1273                                     precWS += tokenText.substring(0, initWS);
1274                                 }
1275                                 tokenText = tokenText.substring(initWS);
1276                             }
1277                         } else {
1278                             // escape illegal characters
1279                             String escape = StringEscapeUtils.escapeXml(precWS);
1280                             final int initWS =
1281                                 tokenizer.initialWhitespaceCount(escape);
1282 
1283                             // separate initial whitespace + log event
1284                             if (initWS > 0) {
1285                                 precWS = escape.substring(0, initWS);
1286                                 escape = escape.substring(initWS);
1287                             } else {
1288                                 precWS = "";
1289                             }
1290                             logEvent(EVENT_ESCAPED_CHARS, escape);
1291 
1292                             if (captured.length() == 0) {
1293                                 // next token is TEXT: just prepend
1294                                 tokenText = escape + tokenText;
1295                             } else {
1296                                 final int trailingWSChars =
1297                                     tokenizer.trailingWhitespaceCount(escape);
1298                                 if (trailingWSChars > 0) {
1299                                     // store trailing whitespace separately
1300                                     extraWS = escape.substring(
1301                                         escape.length() - trailingWSChars);
1302                                     escape = escape.substring(0,
1303                                         escape.length() - trailingWSChars);
1304                                 }
1305                                 // create text constituent
1306                                 extraConst = new OtherConstituent(
1307                                     OtherConstituent.TEXT, escape);
1308                             }
1309                         }
1310                     }
1311 
1312                     // create whitespace constituent (if whitespace remains
1313                     // after fixing possible character errors)
1314                     if (precWS.length() > 0) {
1315                         currentConst = new OtherConstituent(
1316                             OtherConstituent.OUTER_WHITESPACE, precWS);
1317                         // add to list
1318                         if (priorConst == null) {
1319                             firstConst = currentConst;
1320                         } else {
1321                             priorConst.insertAfter(currentConst);
1322                         }
1323                         priorConst = currentConst;
1324                     }
1325 
1326                     if (extraConst != null) {
1327                         // handle extra constituent from fixed character error
1328                         if ((extraConst.getType() == OtherConstituent.TEXT)
1329                                 || (extraConst.getType()
1330                                     == OtherConstituent.CDATA_SECTION)) {
1331                             inMarkupSeries = false;   // non-markup
1332                         } else {    // markup
1333                             if (!inMarkupSeries) {    // start new series
1334                                 inMarkupSeries = true;
1335                                 markupSeriesNo++;
1336                             }
1337                             if (extraConst instanceof TagConstituent) {
1338                                 // store markup series in tag constituents
1339                                 ((TagConstituent) extraConst)
1340                                     .setMarkupSeriesNo(markupSeriesNo);
1341 
1342                                 // add to unprocessed tags if start or end tag
1343                                 if ((startAndEndTags != null)
1344                                         && (extraConst.getType()
1345                                                 != TagConstituent.EMPTY_TAG)) {
1346                                     startAndEndTags.push(
1347                                             (TagConstituent) extraConst);
1348                                 }
1349                             }
1350                         }
1351                         if (fixCharacterErrors) {
1352                             // fix illegal characters in representation
1353                             fixRepresentation(extraConst);
1354                         }
1355                         // add extra constituent to list
1356                         if (priorConst == null) {
1357                             firstConst = extraConst;
1358                         } else {
1359                             priorConst.insertAfter(extraConst);
1360                         }
1361                         priorConst = extraConst;
1362                         extraConst = null;  // reset variable
1363 
1364                         if (extraWS != null) {
1365                             // add extra whitespace to list
1366                             currentConst = new OtherConstituent(
1367                                 OtherConstituent.OUTER_WHITESPACE,
1368                                 extraWS);
1369                             priorConst.insertAfter(currentConst);
1370                             priorConst = currentConst;
1371                             extraWS = null; // reset variable
1372                         }
1373                     }
1374                 }
1375 
1376                 // tokenText length might be 0 if we fixed a character error
1377                 if ((!done) && (tokenText.length() > 0)) {
1378                     // create suitable type
1379                     if ((captured.length() == 0)
1380                             || (captured.equals("[CDATA"))) {
1381                         // textual content (character data) or CDATA section
1382                         // now we're outside a markup series
1383                         if (inMarkupSeries) {
1384                             inMarkupSeries = false;
1385                         }
1386                         if (captured.length() == 0) {
1387                             currentConst = new OtherConstituent(
1388                                 OtherConstituent.TEXT, tokenText);
1389                         } else {
1390                             currentConst = new OtherConstituent(
1391                                 OtherConstituent.CDATA_SECTION, tokenText);
1392                         }
1393                     } else {
1394                         // now we're inside a markup series
1395                         if (!inMarkupSeries) {
1396                             inMarkupSeries = true;
1397                             markupSeriesNo++;
1398                         }
1399                         currentConst = createMarkupConstituent(tokenText,
1400                             captured, markupSeriesNo, startAndEndTags);
1401                     }
1402                     if (fixCharacterErrors) {
1403                         // fix illegal characters in representation
1404                         fixRepresentation(currentConst);
1405                     }
1406                     // add to list
1407                     if (priorConst == null) {
1408                         firstConst = currentConst;
1409                     } else {
1410                         priorConst.insertAfter(currentConst);
1411                     }
1412                     priorConst = currentConst;
1413                 }
1414             }
1415         } catch (IllegalArgumentException iae) {
1416             // convert unchecked into checked exception
1417             throw new ParsingException(
1418                 "Uncorrectable error in XML input: " + iae.getMessage());
1419         }
1420         return firstConst;
1421     }
1422 
1423     /***
1424      * Replaces a start tag with an copy that is an empty tag. This means,
1425      * the representation of the created copy ends in "/&gt;" instead of "&gt;"
1426      * and its type is {@link TagConstituent#EMPTY_TAG} instead of
1427      * {@link TagConstituent#START_TAG}.
1428      *
1429      * <p>If the original tag is part of a list, it is removed and the copy
1430      * is inserted instead.
1431      *
1432      * @param startTag the start tag to replace
1433      * @return the created copy
1434      * @throws IllegalArgumentException if the specified tag is not a valid
1435      * start tag
1436      */
1437     private TagConstituent replaceWithEmptyCopy(final TagConstituent startTag)
1438             throws IllegalArgumentException {
1439         if (startTag.getType() != TagConstituent.START_TAG) {
1440             throw new IllegalArgumentException(
1441                 "Tag to replace must be a start tag (actual type: "
1442                     + startTag.getType() + ')');
1443         }
1444         final StringBuffer representation =
1445             new StringBuffer(startTag.getRepresentantion());
1446         final int endMarker = representation.lastIndexOf(">");
1447         if (endMarker < 0) {
1448             throw new IllegalArgumentException(
1449                 "Start tag representation is invalid: '>' missing!");
1450         }
1451 
1452         // insert '/' before closing '>'
1453         representation.insert(endMarker, '/');
1454         final TagConstituent result =
1455             new TagConstituent(TagConstituent.EMPTY_TAG, startTag.getName(),
1456                 representation.toString(), startTag.getMarkupSeriesNo());
1457 
1458         // remove original tag from list (if any) and insert new one instead
1459         final XMLConstituent prevConst = startTag.previousConstituent();
1460         final XMLConstituent nextConst = startTag.nextConstituent();
1461         startTag.remove();
1462 
1463         if (prevConst != null) {
1464             prevConst.insertAfter(result);
1465         } else if (nextConst != null) {
1466             // this is the first tag in a list
1467             result.insertAfter(nextConst);
1468         }
1469 
1470         // copy will always be of the "regular" variety--that's ok, after
1471         // processing start tag would be "regularized" too
1472         return result;
1473     }
1474 
1475     /***
1476      * Returns a string representation of this object.
1477      *
1478      * @return a textual representation
1479      */
1480     public String toString() {
1481         return new ToStringBuilder(this)
1482             .append("missing root name", missingRootName)
1483             .append("emptiable tags", emptiableTags)
1484             .append("delete control characters", deletingControlChars)
1485             .append("delete pseudo-tags", deletingPseudoTags)
1486             .append("escape pseudo-entities", escapingPseudoEntities)
1487             .toString();
1488     }
1489 
1490     /***
1491      * Helper method that tries to parse a string as an XML start or empty tag
1492      * that contains unquoted attribute values (might be followed by other
1493      * text). If this is the case, the unquoted values are fixed and a suitable
1494      * tag constituent is created.
1495      *
1496      * <p>Only call this method when you're sure that the input does not contain
1497      * a <em>valid</em> tag, i.e. if there must be at least one unquoted value.
1498      * This method doesn't store a markup series in the tag.
1499      *
1500      * @param text the text to parse
1501      * @return an array of two elements: the created {@link TagConstituent}
1502      * and a String containing the unused rest of the input text (might be empty
1503      * but not <code>null</code>; or <code>null</code> if the text couldn't be
1504      * parsed as a text.
1505      * @throws ParsingException might be thrown by {@link #checkEvent(String)}
1506      * implementations in subclasses if an "illicit" event occurred
1507      */
1508     private Object[] tryToFixTag(final String text) throws ParsingException {
1509         final Object[] result;
1510         final Matcher laxMatcher = LAX_START_OR_EMPTY_TAG.matcher(text);
1511 
1512         if (laxMatcher.lookingAt()) {
1513             // it's a start or empty tag with unquoted
1514             // attribute values -- fix them
1515             final String tagName = laxMatcher.group(1);
1516             final short tagType;
1517 
1518             result = new Object[2];
1519             // split tag + store rest (if any) to be returned
1520             result[1] = text.substring(laxMatcher.end());
1521             String oldTagRep = laxMatcher.group();
1522 
1523             // determine tag type
1524             final String lastGroup = laxMatcher.group(laxMatcher.groupCount());
1525 
1526             if (lastGroup == null) {
1527                 tagType = TagConstituent.START_TAG;
1528             } else if ("/".equals(lastGroup)) {
1529                 tagType = TagConstituent.EMPTY_TAG;                
1530             } else {
1531                 // not supposed to happen
1532                 throw new RuntimeException("Implementation error: last group of"
1533                         + " lax tag '" + laxMatcher.group() + "' is '"
1534                         + lastGroup + "' instead of '/' or null");
1535             }
1536 
1537             StringBuilder newTagRep = null;
1538             String unquotedValue;
1539 
1540             // is there still an unquoted attrib (whole value will be captured)?
1541             while (laxMatcher.groupCount() > 2
1542                     && (laxMatcher.group(3) != null)) {
1543                 // create representation, starting with text until unquoted val.
1544                 // (the 2nd groups contain '=' so we have to check the 3rd one)
1545                 newTagRep = new StringBuilder(oldTagRep.substring(0,
1546                         laxMatcher.start(3)));
1547                 unquotedValue = laxMatcher.group(3);
1548 
1549                 // remove starting or ending quote
1550                 if (unquotedValue.startsWith("\"")
1551                         || unquotedValue.startsWith("'")) {
1552                     unquotedValue = unquotedValue.substring(1);
1553                 }
1554                 if (unquotedValue.endsWith("\"")
1555                         || unquotedValue.endsWith("'")) {
1556                     unquotedValue = unquotedValue.substring(0,
1557                         unquotedValue.length() - 1);
1558                 }
1559 
1560                 // escape any embedded quotes
1561                 unquotedValue = TextUtils.replaceAll(unquotedValue,
1562                     Pattern.compile("\""), "&quot;");
1563 
1564                 // append the value within quotes
1565                 newTagRep.append('"');
1566                 newTagRep.append(unquotedValue);
1567                 newTagRep.append('"');
1568 
1569                 // append rest of tag
1570                 newTagRep.append(oldTagRep.substring(laxMatcher.end(3),
1571                     laxMatcher.end()));
1572 
1573                 // match again to check if there are more unquoted attribs
1574                 laxMatcher.reset(newTagRep);
1575                 if (!laxMatcher.matches()) {
1576                     // not supposed to happen
1577                     throw new RuntimeException("Implementation error while "
1578                             + "trying to fix unquoted attribute values: '"
1579                             + newTagRep + "' is no longer parsable as a tag");
1580                 }
1581                 oldTagRep = newTagRep.toString();
1582             }
1583 
1584             // create and store TagConstituent + log event
1585             result[0] = new TagConstituent(tagType, tagName,
1586                     newTagRep.toString());
1587             logEvent(EVENT_QUOTED_ATTRIBUTE_VALUES, newTagRep.toString());
1588         } else {
1589             result = null;
1590         }
1591         return result;
1592     }
1593 
1594 }