View Javadoc

1   /*
2    * Copyright (C) 2003-2004 Christian Siefkes <christian@siefkes.net>.
3    * Development of this software is supported by the German Research Society,
4    * Berlin-Brandenburg Graduate School in Distributed Information Systems
5    * (DFG grant no. GRK 316).
6    *
7    * This library is free software; you can redistribute it and/or
8    * modify it under the terms of the GNU Lesser General Public
9    * License as published by the Free Software Foundation; either
10   * version 2.1 of the License, or (at your option) any later version.
11   *
12   * This library is distributed in the hope that it will be useful,
13   * but WITHOUT ANY WARRANTY; without even the implied warranty of
14   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15   * Lesser General Public License for more details.
16   *
17   * You should have received a copy of the GNU Lesser General Public
18   * License along with this library; if not, visit
19   * http://www.gnu.org/licenses/lgpl.html or write to the Free Software
20   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
21   */
22  package de.fu_berlin.ties.preprocess;
23  
24  import java.io.ByteArrayInputStream;
25  import java.io.ByteArrayOutputStream;
26  import java.io.FileWriter;
27  import java.io.IOException;
28  import java.io.InputStream;
29  import java.io.PrintWriter;
30  import java.io.Reader;
31  import java.io.StringReader;
32  import java.io.StringWriter;
33  import java.io.Writer;
34  import java.util.regex.Matcher;
35  import java.util.regex.Pattern;
36  
37  import org.apache.commons.lang.StringUtils;
38  import org.apache.commons.lang.builder.ToStringBuilder;
39  import org.w3c.tidy.Configuration;
40  import org.w3c.tidy.Node;
41  import org.w3c.tidy.Out;
42  import org.w3c.tidy.OutImpl;
43  import org.w3c.tidy.PPrint;
44  import org.w3c.tidy.Tidy;
45  
46  import de.fu_berlin.ties.ContextMap;
47  import de.fu_berlin.ties.ParsingException;
48  import de.fu_berlin.ties.ProcessingException;
49  import de.fu_berlin.ties.TextProcessor;
50  import de.fu_berlin.ties.TiesConfiguration;
51  
52  import de.fu_berlin.ties.io.ContentType;
53  import de.fu_berlin.ties.io.IOUtils;
54  import de.fu_berlin.ties.text.TextUtils;
55  import de.fu_berlin.ties.util.ExternalCommand;
56  import de.fu_berlin.ties.util.Util;
57  import de.fu_berlin.ties.xml.OtherConstituent;
58  import de.fu_berlin.ties.xml.TagConstituent;
59  import de.fu_berlin.ties.xml.XMLAdjuster;
60  import de.fu_berlin.ties.xml.XMLConstituent;
61  
62  /***
63   * Preprocesses documents by converting them a suitable XML format and adding
64   * lingustic information. Instances of this class are thread-safe.
65   *
66   * @author Christian Siefkes
67   * @version $Revision: 1.16 $, $Date: 2004/12/07 12:02:05 $, $Author: siefkes $
68   */
69  public class PreProcessor extends TextProcessor {
70  
71      /***
72       * Configuration key prefix: command name and arguments of an external
73       * converter from a specified type to HTML.
74       */
75      public static final String CONFIG_HTMLCONV_COMMAND
76          = "html-converter.command";
77  
78      /***
79       * Configuration key: Whether plain text is preprocessed to recognize
80       * and reformat definition lists.
81       */
82      public static final String CONFIG_PREPROCESS_TEXT
83          = "preprocess.text";
84  
85      /***
86       * Configuration key: A tagger (or a list of taggers) used to annotate a
87       * text e.g. with linguistic information. Each tagger must implement the
88       * TextProcessor interface and accept a string (the output extension) as
89       * single constructor argument.
90       */
91      public static final String CONFIG_PREPROCESS_TAGGER
92          = "preprocess.tagger";
93  
94      /***
95       * The replacement inserted by {@link #insertLineBreaks(CharSequence)}
96       * (an empty <code>br</code> element is appended after each match).
97       */
98      private static final String NEWLINE_REPLACEMENT = "$0<br/>";
99  
100     /***
101      * The name of the XHTML tag marking preformatted text.
102      */
103     private static final String PRE_TAG = "pre";
104 
105     /***
106      * The name of the XHTML tag marking definitions in a definition list.
107      */
108     private static final String DD_TAG = "dd";
109 
110     /***
111      * Pattern fragment matching a term in a definition list (printable
112      * characters excluding ':' followed by a colon (':'). Following whitespace
113      * is not included in this pattern.
114      */
115     private static final String DL_TERM = "[^//s:]+:";
116 
117     /***
118      * Pattern fragment matching an entry (term + definition) in a definition
119      * text in plain text, formatted similar to RFC 822/2882:
120      * "Term: Definition\n", optionally followed by deeper indented
121      * continuation lines. Group 1 must match the current indentation level
122      * (whitespace at the start of each line, if any).
123      */
124     private static final String DL_ENTRY =
125         // term followed by whitespace and (start of) definition
126         DL_TERM + TextUtils.SINGLE_LINE_WS.pattern() + "+//S.*"
127         // any number of continuation lines (deeper indented)
128         + "(?:" + TextUtils.NEWLINE_PATTERN.pattern() + "//1"
129         + TextUtils.SINGLE_LINE_WS.pattern() + "+.+)*";
130 
131     /***
132      * Pattern matching a paragraph that should be converted to a description
133      * list (when converting plain text to XHTML). A list must have at least
134      * two entries to be recognized.
135      */
136     private static final Pattern DL_PARA = Pattern.compile(
137         // preceded by empty line (or start of text)
138         "(?://A|" + TextUtils.NEWLINE_PATTERN.pattern()
139             + TextUtils.SINGLE_LINE_WS.pattern() + "*"
140             + TextUtils.NEWLINE_PATTERN.pattern() + ")"
141         // group 1: indent level (whitespace that must be present on all lines)
142         + "(" + TextUtils.SINGLE_LINE_WS.pattern() + "*)"
143         // first entry
144         + DL_ENTRY
145         // further entries (starting w/possible indent captured in group 1)
146         + "(?:" + TextUtils.NEWLINE_PATTERN.pattern() + "//1" + DL_ENTRY + ")+"
147         // we also allow a trailing term
148         + "(?:" + TextUtils.NEWLINE_PATTERN.pattern() + "//1" + DL_TERM
149         + TextUtils.SINGLE_LINE_WS.pattern() + "*)?"
150         // followed (lookahead) by empty line (or end of text)
151             + "(?=//Z|" + TextUtils.NEWLINE_PATTERN.pattern()
152             + TextUtils.SINGLE_LINE_WS.pattern() + "*"
153             + TextUtils.NEWLINE_PATTERN.pattern() + ")"
154         );
155 
156     /***
157      * Pattern fragment matching a short line (containing at most 40
158      * characters + optionally starting and trailing whitespace).
159      */
160     private static final String SHORT_LINE =
161         // optional whitespace
162         TextUtils.SINGLE_LINE_WS.pattern() + "*"
163         // opt. whitespace + 1-40 characters (first + last must be visible)
164         + "//S(?:.{1,38}//S)?"
165         // optional whitespace
166         + TextUtils.SINGLE_LINE_WS.pattern() + "*";
167 
168     /***
169      * Pattern matching two consecutive {@link #SHORT_LINE short lines}.
170      */
171     private static final Pattern TWO_SHORT_LINES = Pattern.compile(
172         // first line + line separator
173         "^" + SHORT_LINE + TextUtils.NEWLINE_PATTERN.pattern()
174         // second line (using lookahead so the next match can match it again)
175         + "(?=" + SHORT_LINE + "$)",
176         Pattern.MULTILINE);
177 
178     /***
179      * Pattern used to match an XML declaration not containing an encoding.
180      */
181     private static final Pattern XML_DECLARATION = Pattern.compile(
182             "(<//?xml//s+version=\"1//.0\")//s*(//?>)");
183 
184     /***
185      * Whether plain text is preprocessed to recognize and reformat definition
186      * lists.
187      */
188     private final boolean preprocessingText;
189 
190     /***
191      * Tool for cleaning up malformed and faulty HTML. Synchronized on itself.
192      */
193     private final Tidy tidy;
194 
195     /***
196      * Used for printing the output returned by Tidy. Synchronized on the
197      * {@link #tidy} instance.
198      */
199     private final PPrint tidyPrinter;
200 
201     /***
202      * An array of taggers used to annotate a text e.g. with linguistic
203      * information.
204      */
205     private final TextProcessor[] tagger;
206 
207     /***
208      * XML used for {@linkplain #insertLineBreaks(CharSequence) inserting line
209      * breaks}.
210      */
211     private final XMLAdjuster xmlAdjuster;
212 
213     /***
214      * Creates and configured a new instance, using a default extension and the
215      * {@linkplain TiesConfiguration standard configuration}.
216      */
217     public PreProcessor() {
218         this("aug");
219     }
220 
221     /***
222      * Creates and configured a new instance, using the
223      * {@linkplain TiesConfiguration standard configuration}.
224      *
225      * @param outExt the extension to use for output files
226      * @throws IllegalArgumentException if the configured linguistic tagger(s)
227      * cannot be instantiated
228      */
229     public PreProcessor(final String outExt) throws IllegalArgumentException {
230         this(outExt, TiesConfiguration.CONF);
231     }
232 
233     /***
234      * Creates and configured a new instance.
235      *
236      * @param outExt the extension to use for output files
237      * @param config used to configure superclasses
238      * @throws IllegalArgumentException if the configured linguistic tagger(s)
239      * cannot be instantiated
240      */
241     public PreProcessor(final String outExt, final TiesConfiguration config)
242     throws IllegalArgumentException {
243         // call superclass + store configuration + set parameters
244         super(outExt, config);
245         preprocessingText = config.getBoolean(CONFIG_PREPROCESS_TEXT);
246 
247         // initialize (linguistic) taggers
248         String[] taggerNames = config.getStringArray(CONFIG_PREPROCESS_TAGGER);
249         if (!TiesConfiguration.arrayIsEmpty(taggerNames)) {
250             // each tagger gets a dummy output extension as constructor arg
251             final String[] params = new String[] { "tt" };
252             tagger = new TextProcessor[taggerNames.length];
253 
254             for (int i = 0; i < taggerNames.length; i++) {
255                 try {
256                     tagger[i] = (TextProcessor) Util.createObject(
257                             Class.forName(taggerNames[i]), params);                    
258                 } catch (Exception e) {
259                     // repackage exception
260                     throw new IllegalArgumentException(
261                             "Tagger initialization failed", e);
262                 }
263             }
264         } else {
265             // no taggers are used
266             tagger = null;
267         }
268 
269         // create and configure TJidy
270         tidy = new Tidy();
271 
272         // try to change the output log
273         try {
274             final FileWriter tidyLog = new FileWriter("ties-tidy.log");
275             tidy.setErrout(new PrintWriter(tidyLog, true));
276         } catch (IOException ioe) {
277             Util.LOG.warn(
278                 "PreProcessor: couldn't redirect Tidy output to ties-tidy.log",
279                 ioe);
280         }
281 
282         // adjust Tidy configuration, cf. http://www.w3.org/People/Raggett/tidy/
283         tidy.setCharEncoding(Configuration.UTF8);
284         tidy.setEncloseBlockText(true);
285         tidy.setEncloseText(true);
286         tidy.setDocType("omit");
287         tidy.setDropEmptyParas(true);
288         tidy.setLogicalEmphasis(true);
289         tidy.setMakeClean(true);
290         tidy.setOnlyErrors(true);
291         tidy.setQuoteNbsp(false);
292         tidy.setQuiet(true);
293         tidy.setRawOut(true);
294         tidy.setShowWarnings(false);
295         tidy.setTidyMark(false);
296         tidy.setWraplen(0);  // wrapping disabled
297         tidy.setXmlPi(true);
298         tidy.setXmlPIs(true);
299         tidy.setXmlOut(true);
300 
301         // initialize the Tidy printer
302         tidyPrinter = new PPrint(tidy.getConfiguration());
303 
304         // initialize XML adjuster without doing any extras
305         xmlAdjuster =
306             new XMLAdjuster(null, null, null, false, false, false, config);
307     }
308 
309     /***
310      * Converts HTML input to a clean XHTML representation, if necessary.
311      * Delegates to <code>JTidy</code> for checking and cleaning the HTML code.
312      *
313      * @param input the HTML to tidy
314      * @param charset the character to be used for storing the resulting XHTML
315      * document (required to write the XML Declaration correctly)
316      * @return the cleaned-up XHTML
317      * @throws IOException if the I/O goes wrong
318      */
319     public final String cleanHTML(final String input, final String charset)
320     throws IOException {
321         // convert string to byte array input stream using UTF-8
322         final InputStream inStream = new ByteArrayInputStream(
323             input.getBytes(IOUtils.STANDARD_UNICODE_CHARSET));
324 
325         // store result in byte array output stream, estimating initial size
326         final ByteArrayOutputStream outStream =
327             new ByteArrayOutputStream(input.length());
328 
329         // synchronized on the tidy instance
330         synchronized (tidy) {
331             // do the actual clean-up + conversion
332             final Node document = tidy.parse(inStream, null);
333 
334             // in case of some errors Tidy returns a node but doesn't print to
335             // the output stream, so we'll handle this ourselves
336             final Out out = new OutImpl();
337             out.encoding = Configuration.UTF8;
338             out.out = outStream;
339             tidyPrinter.printTree(out, (short) 0, 0, null, document);
340             tidyPrinter.flushLine(out, 0);
341         }
342 
343         final String rawResult =
344             outStream.toString(IOUtils.STANDARD_UNICODE_CHARSET);
345         final String result;
346 
347         if (StringUtils.isNotEmpty(charset)) {
348             // insert encoding in XML declaration
349             result = XML_DECLARATION.matcher(rawResult).replaceFirst(
350                     "$1 encoding=\"" + charset + "\"$2");
351         } else {
352             Util.LOG.warn("No character set specified -- cannot fix XML"
353                     + " declaration of XHTML document");
354             result = rawResult;
355         }
356 
357         return result;
358     }
359 
360     /***
361      * Preprocesses the contents of a file. Neither input stream nor output
362      * writer are closed by this method.
363      *
364      * @param reader a reader containing the text to preprocess; not closed
365      * by this method
366      * @param writer a writer used to store the preprocessed text; flushed
367      * but not closed by this method
368      * @param context a map of objects that are made available for processing;
369      * the {@link ContentType#KEY_MIME_TYPE} key should to mapped to the MIME
370      * type of the document and the {@link IOUtils#KEY_LOCAL_CHARSET} key to
371      * the character set of the <code>writer</code>
372      * @throws IOException if an I/O error occurred
373      * @throws ProcessingException if the file couldn't be parsed, e.g. due to
374      * an error in the XML input
375      */
376     protected final void doProcess(final Reader reader, final Writer writer,
377             final ContextMap context)
378             throws IOException, ProcessingException {
379         // read contents from provided stream
380         String contents = IOUtils.readToString(reader);
381         final String mimeType = (String) context.get(ContentType.KEY_MIME_TYPE);
382         final boolean html;
383         final boolean xml;
384 
385         // preprocess plain text if configured
386         if (preprocessingText && ContentType.MIME_PLAIN.equals(mimeType)) {
387             contents = preprocessText(contents);
388         }
389 
390         if (ContentType.MIME_HTML.equals(mimeType)) {
391             // this is HTML but not necessarily XHTML
392             html = true;
393             xml = false;
394         } else if ((mimeType != null) && (getConfig().containsKey(
395                 TiesConfiguration.joinKey(CONFIG_HTMLCONV_COMMAND,
396                                             mimeType)))) {
397             // there is an external XHTML converter configured for this type
398             final String[] commandArgs = getConfig().getStringArray(
399                 TiesConfiguration.joinKey(CONFIG_HTMLCONV_COMMAND, mimeType));
400             final ExternalCommand extConverter =
401                 new ExternalCommand(commandArgs);
402 
403             // run external converter and store output
404             contents = extConverter.execute(contents);
405 
406             // output is already XHTML -- no need to send thru Tidy
407             html = true;
408             xml = true;
409         } else {
410             // try to process contents without conversion (XML)
411             html = false;
412             xml = true;
413         }
414 
415         if (html) {
416             if (!xml) {
417                 // clean-up HTML markup + convert to XHTML
418                 contents = cleanHTML(contents,
419                         (String) context.get(IOUtils.KEY_LOCAL_CHARSET));
420             }
421 
422             // insert <br> tags in <pre>
423             contents = insertLineBreaks(contents);
424         }
425 
426         // delegate to taggers, if any
427         if (tagger != null) {
428             StringWriter stringWriter;
429 
430             for (int i = 0; i < tagger.length; i++) {
431                 stringWriter = new StringWriter(contents.length());
432                 tagger[i].process(new StringReader(contents), stringWriter,
433                         context);
434                 contents = stringWriter.toString();
435             }
436         }
437 
438         // store results
439         writer.write(contents);
440         writer.flush();
441     }
442 
443     /***
444      * Adds empty <code>br</code> elements in an XHTML document where
445      * appropriate for better recognizing of the physical formatting.
446      * Currently this is done at the begin of each line in a <code>pre</code>
447      * element; and between short lines (at most 40 visible characters each)
448      * in <code>dd</code> element (definitions within definition lists).
449      * (Text in CDATA sections is not modified; leading and trailing linebreaks
450      * are ignored.)
451      *
452      * @param input the XHTML to process
453      * @return the XHTML with <code>br</code> elements added
454      * @throws ParsingException if the file couldn't be parsed, e.g. due to an
455      * error in the XML input
456      */
457     private String insertLineBreaks(final CharSequence input)
458             throws ParsingException {
459         final StringBuffer result = new StringBuffer();
460         final XMLConstituent firstConst =
461             xmlAdjuster.rawConstituents(input, false);
462         XMLConstituent currentConst = firstConst;
463         TagConstituent currentTag;
464         int currentType;
465         boolean inPre = false;
466         boolean inDD = false;
467         String output;
468         final char namespaceSeparator = ':';
469         final Matcher newlineMatcher = TextUtils.NEWLINE_PATTERN.matcher("");
470         final Matcher shortLinesMatcher = TWO_SHORT_LINES.matcher("");
471 
472         // iterate + insert br elements where required
473         while (currentConst != null) {
474             currentType = currentConst.getType();
475 
476             if ((currentType == TagConstituent.START_TAG)
477                     || (currentType == TagConstituent.END_TAG)) {
478                 currentTag = (TagConstituent) currentConst;
479 
480                 // quick-and-dirty namespace handling
481                 if (currentTag.getName().equals(PRE_TAG) || currentTag
482                         .getName().endsWith(namespaceSeparator + PRE_TAG)) {
483                     // entering or exiting <pre> element
484                     inPre = (currentType == TagConstituent.START_TAG);
485                 } else if (currentTag.getName().equals(DD_TAG) || currentTag
486                     .getName().endsWith(namespaceSeparator + DD_TAG)) {
487                 // entering or exiting <dd> element
488                 inDD = (currentType == TagConstituent.START_TAG);
489                 }
490             }
491 
492             if (inPre && (currentType == OtherConstituent.TEXT)) {
493                 // insert <br/> tags within text content of <pre> elements
494                 output = TextUtils.replaceAll(currentConst.getRepresentantion(),
495                     newlineMatcher, NEWLINE_REPLACEMENT);
496 
497                 if (output != currentConst.getRepresentantion()) {
498                     Util.LOG.debug("Inserted break tags into preformatted text:"
499                         + TextUtils.LINE_SEPARATOR + output);
500                 }
501             } else if (inDD && (currentType == OtherConstituent.TEXT)) {
502                 // insert <br/> tags within two short lines within <dd> elements
503                 output = TextUtils.replaceAll(currentConst.getRepresentantion(),
504                     shortLinesMatcher, NEWLINE_REPLACEMENT);
505 
506                 if (output != currentConst.getRepresentantion()) {
507                     Util.LOG.debug("Inserted break tags between short lines "
508                         + "in <dd> element:" + TextUtils.LINE_SEPARATOR
509                         + output);
510                 }
511             } else {
512                 // output representation as is
513                 output = currentConst.getRepresentantion();
514             }
515 
516             result.append(output);
517             currentConst = currentConst.nextConstituent();
518         }
519 
520         return result.toString();
521     }
522 
523     /***
524      * Preprocessed plain text to bring definition lists in a format recognized
525      * by <code>txt2html</code>.
526      *
527      * @param input the plain text to process
528      * @return the plain text with converted definition lists; or a reference
529      * to <code>input</code> if there was nothing to convert
530      */
531     private String preprocessText(final String input) {
532         // trimming text because a single empty line at start or end could
533         // spoil matches
534         final String trimmedInput = input.trim();
535         final Matcher dlMatcher = DL_PARA.matcher(trimmedInput);
536         boolean found = dlMatcher.find();
537         int afterLastMatch = 0;
538         String orgPara, normalizedPara, convertedPara;
539         String indent;  // indentation level (group 1)
540         String extraIndent;
541         Pattern termToConvert;
542         Pattern continuationIndent;
543 
544         if (found) {
545             final StringBuffer result = new StringBuffer();
546             do {
547                 // append text preceding the match
548                 result.append(trimmedInput.substring(afterLastMatch,
549                     dlMatcher.start()));
550                 afterLastMatch = dlMatcher.end();
551                 indent = dlMatcher.group(1);
552                 orgPara = dlMatcher.group();
553 
554                 // use extra indent of 2 for (first line of) definition
555                 extraIndent = indent + "  ";
556 
557                 // normalize continuation lines to use extra indent
558                 continuationIndent = Pattern.compile(
559                     // must start on new line (group 1)
560                     "(//A|" + TextUtils.NEWLINE_PATTERN.pattern() + ")"
561                     // with deeper intentation
562                     + indent + TextUtils.SINGLE_LINE_WS.pattern() + "+"
563                     );
564                 normalizedPara = TextUtils.replaceAll(orgPara,
565                     continuationIndent, "$1" + extraIndent);
566 
567                 // term to convert
568                 termToConvert = Pattern.compile(
569                     // must start on new line
570                     "((?://A|" + TextUtils.NEWLINE_PATTERN.pattern() + ")"
571                     // with correct indent
572                     + indent + DL_TERM + ")"
573                     // followed by whitespace
574                     + TextUtils.SINGLE_LINE_WS.pattern() + "+"
575                     // and then (lookahead) a printable char
576                     + "(?=//S)");
577 
578                 // replace whitespace after term by newline + extra indent
579                 convertedPara = TextUtils.replaceAll(normalizedPara,
580                     termToConvert,
581                     "$1" + TextUtils.LINE_SEPARATOR + extraIndent);
582 
583 
584                 Util.LOG.debug(
585                     "Reformatted paragraph containing definition list: "
586                     + convertedPara);
587 
588                 // append + look for next one
589                 result.append(convertedPara);
590                 found = dlMatcher.find();
591             } while (found);
592 
593             // append rest of text
594             result.append(trimmedInput.substring(afterLastMatch));
595             return result.toString();
596         } else {
597             // nothing to replace
598             return input;
599         }
600     }
601 
602     /***
603      * Returns a string representation of this object.
604      *
605      * @return a textual representation
606      */
607     public String toString() {
608         final ToStringBuilder builder = new ToStringBuilder(this)
609             .appendSuper(super.toString())
610             .append("preprocessing text", preprocessingText);
611 
612         if (tagger != null) {
613             builder.append("tagger", tagger);
614         }
615 
616         return builder.toString();
617     }
618 
619 }