1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22 package de.fu_berlin.ties.preprocess;
23
24 import java.io.ByteArrayInputStream;
25 import java.io.ByteArrayOutputStream;
26 import java.io.FileWriter;
27 import java.io.IOException;
28 import java.io.InputStream;
29 import java.io.PrintWriter;
30 import java.io.Reader;
31 import java.io.StringReader;
32 import java.io.StringWriter;
33 import java.io.Writer;
34 import java.util.regex.Matcher;
35 import java.util.regex.Pattern;
36
37 import org.apache.commons.lang.StringUtils;
38 import org.apache.commons.lang.builder.ToStringBuilder;
39 import org.w3c.tidy.Configuration;
40 import org.w3c.tidy.Node;
41 import org.w3c.tidy.Out;
42 import org.w3c.tidy.OutImpl;
43 import org.w3c.tidy.PPrint;
44 import org.w3c.tidy.Tidy;
45
46 import de.fu_berlin.ties.ContextMap;
47 import de.fu_berlin.ties.ParsingException;
48 import de.fu_berlin.ties.ProcessingException;
49 import de.fu_berlin.ties.TextProcessor;
50 import de.fu_berlin.ties.TiesConfiguration;
51
52 import de.fu_berlin.ties.io.ContentType;
53 import de.fu_berlin.ties.io.IOUtils;
54 import de.fu_berlin.ties.text.TextUtils;
55 import de.fu_berlin.ties.util.ExternalCommand;
56 import de.fu_berlin.ties.util.Util;
57 import de.fu_berlin.ties.xml.OtherConstituent;
58 import de.fu_berlin.ties.xml.TagConstituent;
59 import de.fu_berlin.ties.xml.XMLAdjuster;
60 import de.fu_berlin.ties.xml.XMLConstituent;
61
/**
 * Preprocesses documents by converting them to a suitable XML format and
 * adding linguistic information. Instances of this class are thread-safe.
 *
 * @author Christian Siefkes
 * @version $Revision: 1.16 $, $Date: 2004/12/07 12:02:05 $, $Author: siefkes $
 */
69 public class PreProcessor extends TextProcessor {
70
71 /***
72 * Configuration key prefix: command name and arguments of an external
73 * converter from a specified type to HTML.
74 */
75 public static final String CONFIG_HTMLCONV_COMMAND
76 = "html-converter.command";
77
78 /***
79 * Configuration key: Whether plain text is preprocessed to recognize
80 * and reformat definition lists.
81 */
82 public static final String CONFIG_PREPROCESS_TEXT
83 = "preprocess.text";
84
85 /***
86 * Configuration key: A tagger (or a list of taggers) used to annotate a
87 * text e.g. with linguistic information. Each tagger must implement the
88 * TextProcessor interface and accept a string (the output extension) as
89 * single constructor argument.
90 */
91 public static final String CONFIG_PREPROCESS_TAGGER
92 = "preprocess.tagger";
93
94 /***
95 * The replacement inserted by {@link #insertLineBreaks(CharSequence)}
96 * (an empty <code>br</code> element is appended after each match).
97 */
98 private static final String NEWLINE_REPLACEMENT = "$0<br/>";
99
100 /***
101 * The name of the XHTML tag marking preformatted text.
102 */
103 private static final String PRE_TAG = "pre";
104
105 /***
106 * The name of the XHTML tag marking definitions in a definition list.
107 */
108 private static final String DD_TAG = "dd";
109
110 /***
111 * Pattern fragment matching a term in a definition list (printable
112 * characters excluding ':' followed by a colon (':'). Following whitespace
113 * is not included in this pattern.
114 */
115 private static final String DL_TERM = "[^//s:]+:";
116
117 /***
118 * Pattern fragment matching an entry (term + definition) in a definition
119 * text in plain text, formatted similar to RFC 822/2882:
120 * "Term: Definition\n", optionally followed by deeper indented
121 * continuation lines. Group 1 must match the current indentation level
122 * (whitespace at the start of each line, if any).
123 */
124 private static final String DL_ENTRY =
125
126 DL_TERM + TextUtils.SINGLE_LINE_WS.pattern() + "+//S.*"
127
128 + "(?:" + TextUtils.NEWLINE_PATTERN.pattern() + "//1"
129 + TextUtils.SINGLE_LINE_WS.pattern() + "+.+)*";
130
131 /***
132 * Pattern matching a paragraph that should be converted to a description
133 * list (when converting plain text to XHTML). A list must have at least
134 * two entries to be recognized.
135 */
136 private static final Pattern DL_PARA = Pattern.compile(
137
138 "(?://A|" + TextUtils.NEWLINE_PATTERN.pattern()
139 + TextUtils.SINGLE_LINE_WS.pattern() + "*"
140 + TextUtils.NEWLINE_PATTERN.pattern() + ")"
141
142 + "(" + TextUtils.SINGLE_LINE_WS.pattern() + "*)"
143
144 + DL_ENTRY
145
146 + "(?:" + TextUtils.NEWLINE_PATTERN.pattern() + "//1" + DL_ENTRY + ")+"
147
148 + "(?:" + TextUtils.NEWLINE_PATTERN.pattern() + "//1" + DL_TERM
149 + TextUtils.SINGLE_LINE_WS.pattern() + "*)?"
150
151 + "(?=//Z|" + TextUtils.NEWLINE_PATTERN.pattern()
152 + TextUtils.SINGLE_LINE_WS.pattern() + "*"
153 + TextUtils.NEWLINE_PATTERN.pattern() + ")"
154 );
155
156 /***
157 * Pattern fragment matching a short line (containing at most 40
158 * characters + optionally starting and trailing whitespace).
159 */
160 private static final String SHORT_LINE =
161
162 TextUtils.SINGLE_LINE_WS.pattern() + "*"
163
164 + "//S(?:.{1,38}//S)?"
165
166 + TextUtils.SINGLE_LINE_WS.pattern() + "*";
167
168 /***
169 * Pattern matching two consecutive {@link #SHORT_LINE short lines}.
170 */
171 private static final Pattern TWO_SHORT_LINES = Pattern.compile(
172
173 "^" + SHORT_LINE + TextUtils.NEWLINE_PATTERN.pattern()
174
175 + "(?=" + SHORT_LINE + "$)",
176 Pattern.MULTILINE);
177
178 /***
179 * Pattern used to match an XML declaration not containing an encoding.
180 */
181 private static final Pattern XML_DECLARATION = Pattern.compile(
182 "(<//?xml//s+version=\"1//.0\")//s*(//?>)");
183
184 /***
185 * Whether plain text is preprocessed to recognize and reformat definition
186 * lists.
187 */
188 private final boolean preprocessingText;
189
190 /***
191 * Tool for cleaning up malformed and faulty HTML. Synchronized on itself.
192 */
193 private final Tidy tidy;
194
195 /***
196 * Used for printing the output returned by Tidy. Synchronized on the
197 * {@link #tidy} instance.
198 */
199 private final PPrint tidyPrinter;
200
201 /***
202 * An array of taggers used to annotate a text e.g. with linguistic
203 * information.
204 */
205 private final TextProcessor[] tagger;
206
207 /***
208 * XML used for {@linkplain #insertLineBreaks(CharSequence) inserting line
209 * breaks}.
210 */
211 private final XMLAdjuster xmlAdjuster;
212
/**
 * Creates and configures a new instance that uses the default output
 * extension (<code>aug</code>) and the
 * {@linkplain TiesConfiguration standard configuration}.
 */
public PreProcessor() {
    this("aug");
}

/**
 * Creates and configures a new instance that uses the
 * {@linkplain TiesConfiguration standard configuration}.
 *
 * @param outExt the extension to use for output files
 * @throws IllegalArgumentException if the configured linguistic tagger(s)
 * cannot be instantiated
 */
public PreProcessor(final String outExt) throws IllegalArgumentException {
    this(outExt, TiesConfiguration.CONF);
}
232
/**
 * Creates and configures a new instance.
 *
 * @param outExt the extension to use for output files
 * @param config the configuration to read the settings from; it is also
 * handed on to the superclass and to the XML adjuster
 * @throws IllegalArgumentException if the configured linguistic tagger(s)
 * cannot be instantiated
 */
public PreProcessor(final String outExt, final TiesConfiguration config)
        throws IllegalArgumentException {
    super(outExt, config);
    preprocessingText = config.getBoolean(CONFIG_PREPROCESS_TEXT);

    // instantiate the configured taggers, if there are any
    final String[] taggerClasses =
        config.getStringArray(CONFIG_PREPROCESS_TAGGER);
    TextProcessor[] created = null;

    if (!TiesConfiguration.arrayIsEmpty(taggerClasses)) {
        // every tagger is constructed with this single string argument
        final String[] ctorArgs = new String[] { "tt" };
        created = new TextProcessor[taggerClasses.length];

        for (int idx = 0; idx < taggerClasses.length; idx++) {
            try {
                created[idx] = (TextProcessor) Util.createObject(
                    Class.forName(taggerClasses[idx]), ctorArgs);
            } catch (Exception e) {
                // report any instantiation problem as a configuration error
                throw new IllegalArgumentException(
                    "Tagger initialization failed", e);
            }
        }
    }

    // stays null when no tagger is configured
    tagger = created;

    final Tidy htmlTidy = new Tidy();

    // send Tidy's messages to a log file; on failure only warn and go on
    try {
        final FileWriter logFile = new FileWriter("ties-tidy.log");
        htmlTidy.setErrout(new PrintWriter(logFile, true));
    } catch (IOException ioe) {
        Util.LOG.warn(
            "PreProcessor: couldn't redirect Tidy output to ties-tidy.log",
            ioe);
    }

    // Tidy options (refer to the JTidy documentation for their exact effect)
    htmlTidy.setCharEncoding(Configuration.UTF8);
    htmlTidy.setEncloseBlockText(true);
    htmlTidy.setEncloseText(true);
    htmlTidy.setDocType("omit");
    htmlTidy.setDropEmptyParas(true);
    htmlTidy.setLogicalEmphasis(true);
    htmlTidy.setMakeClean(true);
    htmlTidy.setOnlyErrors(true);
    htmlTidy.setQuoteNbsp(false);
    htmlTidy.setQuiet(true);
    htmlTidy.setRawOut(true);
    htmlTidy.setShowWarnings(false);
    htmlTidy.setTidyMark(false);
    htmlTidy.setWraplen(0);
    htmlTidy.setXmlPi(true);
    htmlTidy.setXmlPIs(true);
    htmlTidy.setXmlOut(true);
    tidy = htmlTidy;

    // the printer shares the configuration of the Tidy instance
    tidyPrinter = new PPrint(htmlTidy.getConfiguration());

    // adjuster used for splitting XHTML into constituents
    xmlAdjuster =
        new XMLAdjuster(null, null, null, false, false, false, config);
}
308
309 /***
310 * Converts HTML input to a clean XHTML representation, if necessary.
311 * Delegates to <code>JTidy</code> for checking and cleaning the HTML code.
312 *
313 * @param input the HTML to tidy
314 * @param charset the character to be used for storing the resulting XHTML
315 * document (required to write the XML Declaration correctly)
316 * @return the cleaned-up XHTML
317 * @throws IOException if the I/O goes wrong
318 */
319 public final String cleanHTML(final String input, final String charset)
320 throws IOException {
321
322 final InputStream inStream = new ByteArrayInputStream(
323 input.getBytes(IOUtils.STANDARD_UNICODE_CHARSET));
324
325
326 final ByteArrayOutputStream outStream =
327 new ByteArrayOutputStream(input.length());
328
329
330 synchronized (tidy) {
331
332 final Node document = tidy.parse(inStream, null);
333
334
335
336 final Out out = new OutImpl();
337 out.encoding = Configuration.UTF8;
338 out.out = outStream;
339 tidyPrinter.printTree(out, (short) 0, 0, null, document);
340 tidyPrinter.flushLine(out, 0);
341 }
342
343 final String rawResult =
344 outStream.toString(IOUtils.STANDARD_UNICODE_CHARSET);
345 final String result;
346
347 if (StringUtils.isNotEmpty(charset)) {
348
349 result = XML_DECLARATION.matcher(rawResult).replaceFirst(
350 "$1 encoding=\"" + charset + "\"$2");
351 } else {
352 Util.LOG.warn("No character set specified -- cannot fix XML"
353 + " declaration of XHTML document");
354 result = rawResult;
355 }
356
357 return result;
358 }
359
360 /***
361 * Preprocesses the contents of a file. Neither input stream nor output
362 * writer are closed by this method.
363 *
364 * @param reader a reader containing the text to preprocess; not closed
365 * by this method
366 * @param writer a writer used to store the preprocessed text; flushed
367 * but not closed by this method
368 * @param context a map of objects that are made available for processing;
369 * the {@link ContentType#KEY_MIME_TYPE} key should to mapped to the MIME
370 * type of the document and the {@link IOUtils#KEY_LOCAL_CHARSET} key to
371 * the character set of the <code>writer</code>
372 * @throws IOException if an I/O error occurred
373 * @throws ProcessingException if the file couldn't be parsed, e.g. due to
374 * an error in the XML input
375 */
376 protected final void doProcess(final Reader reader, final Writer writer,
377 final ContextMap context)
378 throws IOException, ProcessingException {
379
380 String contents = IOUtils.readToString(reader);
381 final String mimeType = (String) context.get(ContentType.KEY_MIME_TYPE);
382 final boolean html;
383 final boolean xml;
384
385
386 if (preprocessingText && ContentType.MIME_PLAIN.equals(mimeType)) {
387 contents = preprocessText(contents);
388 }
389
390 if (ContentType.MIME_HTML.equals(mimeType)) {
391
392 html = true;
393 xml = false;
394 } else if ((mimeType != null) && (getConfig().containsKey(
395 TiesConfiguration.joinKey(CONFIG_HTMLCONV_COMMAND,
396 mimeType)))) {
397
398 final String[] commandArgs = getConfig().getStringArray(
399 TiesConfiguration.joinKey(CONFIG_HTMLCONV_COMMAND, mimeType));
400 final ExternalCommand extConverter =
401 new ExternalCommand(commandArgs);
402
403
404 contents = extConverter.execute(contents);
405
406
407 html = true;
408 xml = true;
409 } else {
410
411 html = false;
412 xml = true;
413 }
414
415 if (html) {
416 if (!xml) {
417
418 contents = cleanHTML(contents,
419 (String) context.get(IOUtils.KEY_LOCAL_CHARSET));
420 }
421
422
423 contents = insertLineBreaks(contents);
424 }
425
426
427 if (tagger != null) {
428 StringWriter stringWriter;
429
430 for (int i = 0; i < tagger.length; i++) {
431 stringWriter = new StringWriter(contents.length());
432 tagger[i].process(new StringReader(contents), stringWriter,
433 context);
434 contents = stringWriter.toString();
435 }
436 }
437
438
439 writer.write(contents);
440 writer.flush();
441 }
442
443 /***
444 * Adds empty <code>br</code> elements in an XHTML document where
445 * appropriate for better recognizing of the physical formatting.
446 * Currently this is done at the begin of each line in a <code>pre</code>
447 * element; and between short lines (at most 40 visible characters each)
448 * in <code>dd</code> element (definitions within definition lists).
449 * (Text in CDATA sections is not modified; leading and trailing linebreaks
450 * are ignored.)
451 *
452 * @param input the XHTML to process
453 * @return the XHTML with <code>br</code> elements added
454 * @throws ParsingException if the file couldn't be parsed, e.g. due to an
455 * error in the XML input
456 */
457 private String insertLineBreaks(final CharSequence input)
458 throws ParsingException {
459 final StringBuffer result = new StringBuffer();
460 final XMLConstituent firstConst =
461 xmlAdjuster.rawConstituents(input, false);
462 XMLConstituent currentConst = firstConst;
463 TagConstituent currentTag;
464 int currentType;
465 boolean inPre = false;
466 boolean inDD = false;
467 String output;
468 final char namespaceSeparator = ':';
469 final Matcher newlineMatcher = TextUtils.NEWLINE_PATTERN.matcher("");
470 final Matcher shortLinesMatcher = TWO_SHORT_LINES.matcher("");
471
472
473 while (currentConst != null) {
474 currentType = currentConst.getType();
475
476 if ((currentType == TagConstituent.START_TAG)
477 || (currentType == TagConstituent.END_TAG)) {
478 currentTag = (TagConstituent) currentConst;
479
480
481 if (currentTag.getName().equals(PRE_TAG) || currentTag
482 .getName().endsWith(namespaceSeparator + PRE_TAG)) {
483
484 inPre = (currentType == TagConstituent.START_TAG);
485 } else if (currentTag.getName().equals(DD_TAG) || currentTag
486 .getName().endsWith(namespaceSeparator + DD_TAG)) {
487
488 inDD = (currentType == TagConstituent.START_TAG);
489 }
490 }
491
492 if (inPre && (currentType == OtherConstituent.TEXT)) {
493
494 output = TextUtils.replaceAll(currentConst.getRepresentantion(),
495 newlineMatcher, NEWLINE_REPLACEMENT);
496
497 if (output != currentConst.getRepresentantion()) {
498 Util.LOG.debug("Inserted break tags into preformatted text:"
499 + TextUtils.LINE_SEPARATOR + output);
500 }
501 } else if (inDD && (currentType == OtherConstituent.TEXT)) {
502
503 output = TextUtils.replaceAll(currentConst.getRepresentantion(),
504 shortLinesMatcher, NEWLINE_REPLACEMENT);
505
506 if (output != currentConst.getRepresentantion()) {
507 Util.LOG.debug("Inserted break tags between short lines "
508 + "in <dd> element:" + TextUtils.LINE_SEPARATOR
509 + output);
510 }
511 } else {
512
513 output = currentConst.getRepresentantion();
514 }
515
516 result.append(output);
517 currentConst = currentConst.nextConstituent();
518 }
519
520 return result.toString();
521 }
522
523 /***
524 * Preprocessed plain text to bring definition lists in a format recognized
525 * by <code>txt2html</code>.
526 *
527 * @param input the plain text to process
528 * @return the plain text with converted definition lists; or a reference
529 * to <code>input</code> if there was nothing to convert
530 */
531 private String preprocessText(final String input) {
532
533
534 final String trimmedInput = input.trim();
535 final Matcher dlMatcher = DL_PARA.matcher(trimmedInput);
536 boolean found = dlMatcher.find();
537 int afterLastMatch = 0;
538 String orgPara, normalizedPara, convertedPara;
539 String indent;
540 String extraIndent;
541 Pattern termToConvert;
542 Pattern continuationIndent;
543
544 if (found) {
545 final StringBuffer result = new StringBuffer();
546 do {
547
548 result.append(trimmedInput.substring(afterLastMatch,
549 dlMatcher.start()));
550 afterLastMatch = dlMatcher.end();
551 indent = dlMatcher.group(1);
552 orgPara = dlMatcher.group();
553
554
555 extraIndent = indent + " ";
556
557
558 continuationIndent = Pattern.compile(
559
560 "(//A|" + TextUtils.NEWLINE_PATTERN.pattern() + ")"
561
562 + indent + TextUtils.SINGLE_LINE_WS.pattern() + "+"
563 );
564 normalizedPara = TextUtils.replaceAll(orgPara,
565 continuationIndent, "$1" + extraIndent);
566
567
568 termToConvert = Pattern.compile(
569
570 "((?://A|" + TextUtils.NEWLINE_PATTERN.pattern() + ")"
571
572 + indent + DL_TERM + ")"
573
574 + TextUtils.SINGLE_LINE_WS.pattern() + "+"
575
576 + "(?=//S)");
577
578
579 convertedPara = TextUtils.replaceAll(normalizedPara,
580 termToConvert,
581 "$1" + TextUtils.LINE_SEPARATOR + extraIndent);
582
583
584 Util.LOG.debug(
585 "Reformatted paragraph containing definition list: "
586 + convertedPara);
587
588
589 result.append(convertedPara);
590 found = dlMatcher.find();
591 } while (found);
592
593
594 result.append(trimmedInput.substring(afterLastMatch));
595 return result.toString();
596 } else {
597
598 return input;
599 }
600 }
601
602 /***
603 * Returns a string representation of this object.
604 *
605 * @return a textual representation
606 */
607 public String toString() {
608 final ToStringBuilder builder = new ToStringBuilder(this)
609 .appendSuper(super.toString())
610 .append("preprocessing text", preprocessingText);
611
612 if (tagger != null) {
613 builder.append("tagger", tagger);
614 }
615
616 return builder.toString();
617 }
618
619 }