View Javadoc

1   /*
2    * Copyright (C) 2003-2006 Christian Siefkes <christian@siefkes.net>.
3    * Development of this software is supported by the German Research Society,
4    * Berlin-Brandenburg Graduate School in Distributed Information Systems
5    * (DFG grant no. GRK 316).
6    *
7    * This program is free software; you can redistribute it and/or modify
8    * it under the terms of the GNU General Public License as published by
9    * the Free Software Foundation; either version 2 of the License, or
10   * (at your option) any later version.
11   *
12   * This program is distributed in the hope that it will be useful,
13   * but WITHOUT ANY WARRANTY; without even the implied warranty of
14   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15   * GNU General Public License for more details.
16   *
17   * You should have received a copy of the GNU General Public License
18   * along with this program; if not, visit
19   * http://www.gnu.org/licenses/gpl.html or write to the Free Software
20   * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
21   */
22  package de.fu_berlin.ties.preprocess;
23  
24  import java.io.ByteArrayInputStream;
25  import java.io.ByteArrayOutputStream;
26  import java.io.FileWriter;
27  import java.io.IOException;
28  import java.io.InputStream;
29  import java.io.PrintWriter;
30  import java.io.Reader;
31  import java.io.StringReader;
32  import java.io.StringWriter;
33  import java.io.Writer;
34  import java.util.regex.Matcher;
35  import java.util.regex.Pattern;
36  
37  import org.apache.commons.lang.StringUtils;
38  import org.apache.commons.lang.builder.ToStringBuilder;
39  import org.w3c.tidy.Configuration;
40  import org.w3c.tidy.Node;
41  import org.w3c.tidy.Out;
42  import org.w3c.tidy.OutImpl;
43  import org.w3c.tidy.PPrint;
44  import org.w3c.tidy.Tidy;
45  
46  import de.fu_berlin.ties.ContextMap;
47  import de.fu_berlin.ties.ParsingException;
48  import de.fu_berlin.ties.ProcessingException;
49  import de.fu_berlin.ties.TextProcessor;
50  import de.fu_berlin.ties.TiesConfiguration;
51  
52  import de.fu_berlin.ties.io.ContentType;
53  import de.fu_berlin.ties.io.IOUtils;
54  import de.fu_berlin.ties.text.TextUtils;
55  import de.fu_berlin.ties.util.ExternalCommand;
56  import de.fu_berlin.ties.util.Util;
57  import de.fu_berlin.ties.xml.OtherConstituent;
58  import de.fu_berlin.ties.xml.TagConstituent;
59  import de.fu_berlin.ties.xml.XMLAdjuster;
60  import de.fu_berlin.ties.xml.XMLConstituent;
61  
62  /***
63   * Preprocesses documents by converting them a suitable XML format and adding
64   * lingustic information. Instances of this class are thread-safe.
65   *
66   * @author Christian Siefkes
67   * @version $Revision: 1.23 $, $Date: 2006/10/21 16:04:23 $, $Author: siefkes $
68   */
69  public class PreProcessor extends TextProcessor {
70  
71      /***
72       * Configuration key prefix: command name and arguments of an external
73       * converter from a specified type to HTML.
74       */
75      public static final String CONFIG_HTMLCONV_COMMAND
76          = "html-converter.command";
77  
78      /***
79       * Configuration key: Whether plain text is preprocessed to recognize
80       * and reformat definition lists.
81       */
82      public static final String CONFIG_PREPROCESS_TEXT
83          = "preprocess.text";
84  
85      /***
86       * Configuration key: A tagger (or a list of taggers) used to annotate a
87       * text e.g. with linguistic information. Each tagger must implement the
88       * TextProcessor interface and accept a string (the output extension) as
89       * single constructor argument.
90       */
91      public static final String CONFIG_PREPROCESS_TAGGER
92          = "preprocess.tagger";
93  
94      /***
95       * The extension used by default for preprocessed ("augmented") files.
96       */
97      public static final String FILE_EXT = "aug";
98  
99      /***
100      * The replacement inserted by {@link #insertLineBreaks(CharSequence)}
101      * (an empty <code>br</code> element is appended after each match).
102      */
103     private static final String NEWLINE_REPLACEMENT = "$0<br/>";
104 
105     /***
106      * The name of the XHTML tag marking preformatted text.
107      */
108     private static final String PRE_TAG = "pre";
109 
110     /***
111      * The name of the XHTML tag marking definitions in a definition list.
112      */
113     private static final String DD_TAG = "dd";
114 
115     /***
116      * Pattern fragment matching a term in a definition list (printable
117      * characters excluding ':' followed by a colon (':'). Following whitespace
118      * is not included in this pattern.
119      */
120     private static final String DL_TERM = "[^//s:]+:";
121 
122     /***
123      * Pattern fragment matching an entry (term + definition) in a definition
124      * text in plain text, formatted similar to RFC 822/2882:
125      * "Term: Definition\n", optionally followed by deeper indented
126      * continuation lines. Group 1 must match the current indentation level
127      * (whitespace at the start of each line, if any).
128      */
129     private static final String DL_ENTRY =
130         // term followed by whitespace and (start of) definition
131         DL_TERM + TextUtils.SINGLE_LINE_WS.pattern() + "+//S.*"
132         // any number of continuation lines (deeper indented)
133         + "(?:" + TextUtils.NEWLINE_PATTERN.pattern() + "//1"
134         + TextUtils.SINGLE_LINE_WS.pattern() + "+.+)*";
135 
136     /***
137      * Pattern matching a paragraph that should be converted to a description
138      * list (when converting plain text to XHTML). A list must have at least
139      * two entries to be recognized.
140      */
141     private static final Pattern DL_PARA = Pattern.compile(
142         // preceded by empty line (or start of text)
143         "(?://A|" + TextUtils.NEWLINE_PATTERN.pattern()
144             + TextUtils.SINGLE_LINE_WS.pattern() + "*"
145             + TextUtils.NEWLINE_PATTERN.pattern() + ")"
146         // group 1: indent level (whitespace that must be present on all lines)
147         + "(" + TextUtils.SINGLE_LINE_WS.pattern() + "*)"
148         // first entry
149         + DL_ENTRY
150         // further entries (starting w/possible indent captured in group 1)
151         + "(?:" + TextUtils.NEWLINE_PATTERN.pattern() + "//1" + DL_ENTRY + ")+"
152         // we also allow a trailing term
153         + "(?:" + TextUtils.NEWLINE_PATTERN.pattern() + "//1" + DL_TERM
154         + TextUtils.SINGLE_LINE_WS.pattern() + "*)?"
155         // followed (lookahead) by empty line (or end of text)
156             + "(?=//Z|" + TextUtils.NEWLINE_PATTERN.pattern()
157             + TextUtils.SINGLE_LINE_WS.pattern() + "*"
158             + TextUtils.NEWLINE_PATTERN.pattern() + ")"
159         );
160 
161     /***
162      * Pattern fragment matching a short line (containing at most 40
163      * characters + optionally starting and trailing whitespace).
164      */
165     private static final String SHORT_LINE =
166         // optional whitespace
167         TextUtils.SINGLE_LINE_WS.pattern() + "*"
168         // opt. whitespace + 1-40 characters (first + last must be visible)
169         + "//S(?:.{1,38}//S)?"
170         // optional whitespace
171         + TextUtils.SINGLE_LINE_WS.pattern() + "*";
172 
173     /***
174      * Pattern matching two consecutive {@link #SHORT_LINE short lines}.
175      */
176     private static final Pattern TWO_SHORT_LINES = Pattern.compile(
177         // first line + line separator
178         "^" + SHORT_LINE + TextUtils.NEWLINE_PATTERN.pattern()
179         // second line (using lookahead so the next match can match it again)
180         + "(?=" + SHORT_LINE + "$)",
181         Pattern.MULTILINE);
182 
183     /***
184      * Pattern used to match an XML declaration not containing an encoding.
185      */
186     private static final Pattern XML_DECLARATION = Pattern.compile(
187             "(<//?xml//s+version=\"1//.0\")//s*(//?>)");
188 
189     /***
190      * Whether plain text is preprocessed to recognize and reformat definition
191      * lists.
192      */
193     private final boolean preprocessingText;
194 
195     /***
196      * Tool for cleaning up malformed and faulty HTML. Synchronized on itself.
197      */
198     private final Tidy tidy;
199 
200     /***
201      * Used for printing the output returned by Tidy. Synchronized on the
202      * {@link #tidy} instance.
203      */
204     private final PPrint tidyPrinter;
205 
206     /***
207      * An array of taggers used to annotate a text e.g. with linguistic
208      * information.
209      */
210     private final TextProcessor[] tagger;
211 
212     /***
213      * XML used for {@linkplain #insertLineBreaks(CharSequence) inserting line
214      * breaks}.
215      */
216     private final XMLAdjuster xmlAdjuster;
217 
218     /***
219      * Creates and configured a new instance, using a default extension and the
220      * {@linkplain TiesConfiguration standard configuration}.
221      */
222     public PreProcessor() {
223         this(FILE_EXT);
224     }
225 
226     /***
227      * Creates and configured a new instance, using the
228      * {@linkplain TiesConfiguration standard configuration}.
229      *
230      * @param outExt the extension to use for output files
231      * @throws IllegalArgumentException if the configured linguistic tagger(s)
232      * cannot be instantiated
233      */
234     public PreProcessor(final String outExt) throws IllegalArgumentException {
235         this(outExt, TiesConfiguration.CONF);
236     }
237 
238     /***
239      * Creates and configured a new instance.
240      *
241      * @param outExt the extension to use for output files
242      * @param config used to configure superclasses
243      * @throws IllegalArgumentException if the configured linguistic tagger(s)
244      * cannot be instantiated
245      */
246     public PreProcessor(final String outExt, final TiesConfiguration config)
247     throws IllegalArgumentException {
248         // call superclass + store configuration + set parameters
249         super(outExt, config);
250         preprocessingText = config.getBoolean(CONFIG_PREPROCESS_TEXT);
251 
252         // initialize (linguistic) taggers
253         String[] taggerNames = config.getStringArray(CONFIG_PREPROCESS_TAGGER);
254         if (!TiesConfiguration.arrayIsEmpty(taggerNames)) {
255             // each tagger gets a dummy output extension as constructor arg
256             final String[] params = new String[] {"tt"};
257             tagger = new TextProcessor[taggerNames.length];
258 
259             for (int i = 0; i < taggerNames.length; i++) {
260                 try {
261                     tagger[i] = (TextProcessor) Util.createObject(
262                             Class.forName(taggerNames[i]), params);
263                 } catch (Exception e) {
264                     // repackage exception
265                     throw new IllegalArgumentException(
266                             "Tagger initialization failed", e);
267                 }
268             }
269         } else {
270             // no taggers are used
271             tagger = null;
272         }
273 
274         // create and configure TJidy
275         tidy = new Tidy();
276 
277         // try to change the output log
278         try {
279             final FileWriter tidyLog = new FileWriter("ties-tidy.log");
280             tidy.setErrout(new PrintWriter(tidyLog, true));
281         } catch (IOException ioe) {
282             Util.LOG.warn(
283                 "PreProcessor: couldn't redirect Tidy output to ties-tidy.log",
284                 ioe);
285         }
286 
287         // adjust Tidy configuration, cf. http://www.w3.org/People/Raggett/tidy/
288         tidy.setCharEncoding(Configuration.UTF8);
289         tidy.setEncloseBlockText(true);
290         tidy.setEncloseText(true);
291         tidy.setDocType("omit");
292         tidy.setDropEmptyParas(true);
293         tidy.setLogicalEmphasis(true);
294         tidy.setMakeClean(true);
295         tidy.setOnlyErrors(true);
296         tidy.setQuoteNbsp(false);
297         tidy.setQuiet(true);
298         tidy.setRawOut(true);
299         tidy.setShowWarnings(false);
300         tidy.setTidyMark(false);
301         tidy.setWraplen(0);  // wrapping disabled
302         tidy.setXmlPi(true);
303         tidy.setXmlPIs(true);
304         tidy.setXmlOut(true);
305 
306         // initialize the Tidy printer
307         tidyPrinter = new PPrint(tidy.getConfiguration());
308 
309         // initialize XML adjuster without doing any extras
310         xmlAdjuster = new XMLAdjuster(null, null, null, false, false, false,
311                 false, config);
312     }
313 
314     /***
315      * Converts HTML input to a clean XHTML representation, if necessary.
316      * Delegates to <code>JTidy</code> for checking and cleaning the HTML code.
317      *
318      * @param input the HTML to tidy
319      * @param charset the character to be used for storing the resulting XHTML
320      * document (required to write the XML Declaration correctly)
321      * @return the cleaned-up XHTML
322      * @throws IOException if the I/O goes wrong
323      */
324     public final String cleanHTML(final String input, final String charset)
325     throws IOException {
326         // convert string to byte array input stream using UTF-8
327         final InputStream inStream = new ByteArrayInputStream(
328             input.getBytes(IOUtils.STANDARD_UNICODE_CHARSET));
329 
330         // store result in byte array output stream, estimating initial size
331         final ByteArrayOutputStream outStream =
332             new ByteArrayOutputStream(input.length());
333 
334         // synchronized on the tidy instance
335         synchronized (tidy) {
336             // do the actual clean-up + conversion
337             final Node document = tidy.parse(inStream, null);
338 
339             // in case of some errors Tidy returns a node but doesn't print to
340             // the output stream, so we'll handle this ourselves
341             final Out out = new OutImpl();
342             out.encoding = Configuration.UTF8;
343             out.out = outStream;
344             tidyPrinter.printTree(out, (short) 0, 0, null, document);
345             tidyPrinter.flushLine(out, 0);
346         }
347 
348         final String rawResult =
349             outStream.toString(IOUtils.STANDARD_UNICODE_CHARSET);
350         final String result;
351 
352         if (StringUtils.isNotEmpty(charset)) {
353             // insert encoding in XML declaration
354             result = XML_DECLARATION.matcher(rawResult).replaceFirst(
355                     "$1 encoding=\"" + charset + "\"$2");
356         } else {
357             Util.LOG.warn("No character set specified -- cannot fix XML"
358                     + " declaration of XHTML document");
359             result = rawResult;
360         }
361 
362         return result;
363     }
364 
365     /***
366      * Preprocesses the contents of a file. Neither input stream nor output
367      * writer are closed by this method.
368      *
369      * @param reader a reader containing the text to preprocess; not closed
370      * by this method
371      * @param writer a writer used to store the preprocessed text; flushed
372      * but not closed by this method
373      * @param context a map of objects that are made available for processing;
374      * the {@link ContentType#KEY_MIME_TYPE} key should to mapped to the MIME
375      * type of the document and the {@link IOUtils#KEY_LOCAL_CHARSET} key to
376      * the character set of the <code>writer</code>
377      * @throws IOException if an I/O error occurred
378      * @throws ProcessingException if the file couldn't be parsed, e.g. due to
379      * an error in the XML input
380      */
381     protected final void doProcess(final Reader reader, final Writer writer,
382             final ContextMap context)
383             throws IOException, ProcessingException {
384         // read contents from provided stream
385         String contents = IOUtils.readToString(reader);
386         final String mimeType = (String) context.get(ContentType.KEY_MIME_TYPE);
387         final boolean html;
388         final boolean xml;
389 
390         // preprocess plain text if configured
391         if (preprocessingText && ContentType.MIME_PLAIN.equals(mimeType)) {
392             contents = preprocessText(contents);
393         }
394 
395         if (ContentType.MIME_HTML.equals(mimeType)) {
396             // this is HTML but not necessarily XHTML
397             html = true;
398             xml = false;
399         } else if ((mimeType != null) && (getConfig().containsKey(
400                 TiesConfiguration.joinKey(CONFIG_HTMLCONV_COMMAND,
401                                             mimeType)))) {
402             // there is an external XHTML converter configured for this type
403             final String[] commandArgs = getConfig().getStringArray(
404                 TiesConfiguration.joinKey(CONFIG_HTMLCONV_COMMAND, mimeType));
405             final ExternalCommand extConverter =
406                 new ExternalCommand(commandArgs);
407 
408             // run external converter and store output
409             contents = extConverter.execute(contents);
410 
411             // output is already XHTML -- no need to send thru Tidy
412             html = true;
413             xml = true;
414         } else {
415             // try to process contents without conversion (XML)
416             html = false;
417             xml = true;
418         }
419 
420         if (html) {
421             if (!xml) {
422                 // clean-up HTML markup + convert to XHTML
423                 contents = cleanHTML(contents,
424                         (String) context.get(IOUtils.KEY_LOCAL_CHARSET));
425             }
426 
427             // insert <br> tags in <pre>
428             contents = insertLineBreaks(contents);
429         }
430 
431         // delegate to taggers, if any
432         if (tagger != null) {
433             StringWriter stringWriter;
434 
435             for (int i = 0; i < tagger.length; i++) {
436                 stringWriter = new StringWriter(contents.length());
437                 tagger[i].process(new StringReader(contents), stringWriter,
438                         context);
439                 contents = stringWriter.toString();
440             }
441         }
442 
443         // store results
444         writer.write(contents);
445         writer.flush();
446     }
447 
448     /***
449      * Adds empty <code>br</code> elements in an XHTML document where
450      * appropriate for better recognizing of the physical formatting.
451      * Currently this is done at the begin of each line in a <code>pre</code>
452      * element; and between short lines (at most 40 visible characters each)
453      * in <code>dd</code> element (definitions within definition lists).
454      * (Text in CDATA sections is not modified; leading and trailing linebreaks
455      * are ignored.)
456      *
457      * @param input the XHTML to process
458      * @return the XHTML with <code>br</code> elements added
459      * @throws ParsingException if the file couldn't be parsed, e.g. due to an
460      * error in the XML input
461      */
462     private String insertLineBreaks(final CharSequence input)
463             throws ParsingException {
464         final StringBuilder result = new StringBuilder();
465         final XMLConstituent firstConst =
466             xmlAdjuster.rawConstituents(input, false);
467         XMLConstituent currentConst = firstConst;
468         TagConstituent currentTag;
469         int currentType;
470         boolean inPre = false;
471         boolean inDD = false;
472         String output;
473         final char namespaceSeparator = ':';
474         final Matcher newlineMatcher = TextUtils.NEWLINE_PATTERN.matcher("");
475         final Matcher shortLinesMatcher = TWO_SHORT_LINES.matcher("");
476 
477         // iterate + insert br elements where required
478         while (currentConst != null) {
479             currentType = currentConst.getType();
480 
481             if ((currentType == TagConstituent.START_TAG)
482                     || (currentType == TagConstituent.END_TAG)) {
483                 currentTag = (TagConstituent) currentConst;
484 
485                 // quick-and-dirty namespace handling
486                 if (currentTag.getName().equals(PRE_TAG) || currentTag
487                         .getName().endsWith(namespaceSeparator + PRE_TAG)) {
488                     // entering or exiting <pre> element
489                     inPre = (currentType == TagConstituent.START_TAG);
490                 } else if (currentTag.getName().equals(DD_TAG) || currentTag
491                     .getName().endsWith(namespaceSeparator + DD_TAG)) {
492                 // entering or exiting <dd> element
493                 inDD = (currentType == TagConstituent.START_TAG);
494                 }
495             }
496 
497             if (inPre && (currentType == OtherConstituent.TEXT)) {
498                 // insert <br/> tags within text content of <pre> elements
499                 output = TextUtils.replaceAll(currentConst.getRepresentantion(),
500                     newlineMatcher, NEWLINE_REPLACEMENT);
501 
502                 if (output != currentConst.getRepresentantion()) {
503                     Util.LOG.debug("Inserted break tags into preformatted text:"
504                         + TextUtils.LINE_SEPARATOR + output);
505                 }
506             } else if (inDD && (currentType == OtherConstituent.TEXT)) {
507                 // insert <br/> tags within two short lines within <dd> elements
508                 output = TextUtils.replaceAll(currentConst.getRepresentantion(),
509                     shortLinesMatcher, NEWLINE_REPLACEMENT);
510 
511                 if (output != currentConst.getRepresentantion()) {
512                     Util.LOG.debug("Inserted break tags between short lines "
513                         + "in <dd> element:" + TextUtils.LINE_SEPARATOR
514                         + output);
515                 }
516             } else {
517                 // output representation as is
518                 output = currentConst.getRepresentantion();
519             }
520 
521             result.append(output);
522             currentConst = currentConst.nextConstituent();
523         }
524 
525         return result.toString();
526     }
527 
528     /***
529      * Preprocessed plain text to bring definition lists in a format recognized
530      * by <code>txt2html</code>.
531      *
532      * @param input the plain text to process
533      * @return the plain text with converted definition lists; or a reference
534      * to <code>input</code> if there was nothing to convert
535      */
536     private String preprocessText(final String input) {
537         // trimming text because a single empty line at start or end could
538         // spoil matches
539         final String trimmedInput = input.trim();
540         final Matcher dlMatcher = DL_PARA.matcher(trimmedInput);
541         boolean found = dlMatcher.find();
542         int afterLastMatch = 0;
543         String orgPara, normalizedPara, convertedPara;
544         String indent;  // indentation level (group 1)
545         String extraIndent;
546         Pattern termToConvert;
547         Pattern continuationIndent;
548 
549         if (found) {
550             final StringBuilder result = new StringBuilder();
551             do {
552                 // append text preceding the match
553                 result.append(trimmedInput.substring(afterLastMatch,
554                     dlMatcher.start()));
555                 afterLastMatch = dlMatcher.end();
556                 indent = dlMatcher.group(1);
557                 orgPara = dlMatcher.group();
558 
559                 // use extra indent of 2 for (first line of) definition
560                 extraIndent = indent + "  ";
561 
562                 // normalize continuation lines to use extra indent
563                 continuationIndent = Pattern.compile(
564                     // must start on new line (group 1)
565                     "(//A|" + TextUtils.NEWLINE_PATTERN.pattern() + ")"
566                     // with deeper intentation
567                     + indent + TextUtils.SINGLE_LINE_WS.pattern() + "+"
568                     );
569                 normalizedPara = TextUtils.replaceAll(orgPara,
570                     continuationIndent, "$1" + extraIndent);
571 
572                 // term to convert
573                 termToConvert = Pattern.compile(
574                     // must start on new line
575                     "((?://A|" + TextUtils.NEWLINE_PATTERN.pattern() + ")"
576                     // with correct indent
577                     + indent + DL_TERM + ")"
578                     // followed by whitespace
579                     + TextUtils.SINGLE_LINE_WS.pattern() + "+"
580                     // and then (lookahead) a printable char
581                     + "(?=//S)");
582 
583                 // replace whitespace after term by newline + extra indent
584                 convertedPara = TextUtils.replaceAll(normalizedPara,
585                     termToConvert,
586                     "$1" + TextUtils.LINE_SEPARATOR + extraIndent);
587 
588 
589                 Util.LOG.debug(
590                     "Reformatted paragraph containing definition list: "
591                     + convertedPara);
592 
593                 // append + look for next one
594                 result.append(convertedPara);
595                 found = dlMatcher.find();
596             } while (found);
597 
598             // append rest of text
599             result.append(trimmedInput.substring(afterLastMatch));
600             return result.toString();
601         } else {
602             // nothing to replace
603             return input;
604         }
605     }
606 
607     /***
608      * Returns a string representation of this object.
609      *
610      * @return a textual representation
611      */
612     public String toString() {
613         final ToStringBuilder builder = new ToStringBuilder(this)
614             .appendSuper(super.toString())
615             .append("preprocessing text", preprocessingText);
616 
617         if (tagger != null) {
618             builder.append("tagger", tagger);
619         }
620 
621         return builder.toString();
622     }
623 
624 }