1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22 package de.fu_berlin.ties.preprocess;
23
24 import java.io.ByteArrayInputStream;
25 import java.io.ByteArrayOutputStream;
26 import java.io.FileWriter;
27 import java.io.IOException;
28 import java.io.InputStream;
29 import java.io.PrintWriter;
30 import java.io.Reader;
31 import java.io.StringReader;
32 import java.io.StringWriter;
33 import java.io.Writer;
34 import java.util.regex.Matcher;
35 import java.util.regex.Pattern;
36
37 import org.apache.commons.lang.StringUtils;
38 import org.apache.commons.lang.builder.ToStringBuilder;
39 import org.w3c.tidy.Configuration;
40 import org.w3c.tidy.Node;
41 import org.w3c.tidy.Out;
42 import org.w3c.tidy.OutImpl;
43 import org.w3c.tidy.PPrint;
44 import org.w3c.tidy.Tidy;
45
46 import de.fu_berlin.ties.ContextMap;
47 import de.fu_berlin.ties.ParsingException;
48 import de.fu_berlin.ties.ProcessingException;
49 import de.fu_berlin.ties.TextProcessor;
50 import de.fu_berlin.ties.TiesConfiguration;
51
52 import de.fu_berlin.ties.io.ContentType;
53 import de.fu_berlin.ties.io.IOUtils;
54 import de.fu_berlin.ties.text.TextUtils;
55 import de.fu_berlin.ties.util.ExternalCommand;
56 import de.fu_berlin.ties.util.Util;
57 import de.fu_berlin.ties.xml.OtherConstituent;
58 import de.fu_berlin.ties.xml.TagConstituent;
59 import de.fu_berlin.ties.xml.XMLAdjuster;
60 import de.fu_berlin.ties.xml.XMLConstituent;
61
62 /***
63 * Preprocesses documents by converting them a suitable XML format and adding
64 * lingustic information. Instances of this class are thread-safe.
65 *
66 * @author Christian Siefkes
67 * @version $Revision: 1.23 $, $Date: 2006/10/21 16:04:23 $, $Author: siefkes $
68 */
69 public class PreProcessor extends TextProcessor {
70
71 /***
72 * Configuration key prefix: command name and arguments of an external
73 * converter from a specified type to HTML.
74 */
75 public static final String CONFIG_HTMLCONV_COMMAND
76 = "html-converter.command";
77
78 /***
79 * Configuration key: Whether plain text is preprocessed to recognize
80 * and reformat definition lists.
81 */
82 public static final String CONFIG_PREPROCESS_TEXT
83 = "preprocess.text";
84
85 /***
86 * Configuration key: A tagger (or a list of taggers) used to annotate a
87 * text e.g. with linguistic information. Each tagger must implement the
88 * TextProcessor interface and accept a string (the output extension) as
89 * single constructor argument.
90 */
91 public static final String CONFIG_PREPROCESS_TAGGER
92 = "preprocess.tagger";
93
94 /***
95 * The extension used by default for preprocessed ("augmented") files.
96 */
97 public static final String FILE_EXT = "aug";
98
99 /***
100 * The replacement inserted by {@link #insertLineBreaks(CharSequence)}
101 * (an empty <code>br</code> element is appended after each match).
102 */
103 private static final String NEWLINE_REPLACEMENT = "$0<br/>";
104
105 /***
106 * The name of the XHTML tag marking preformatted text.
107 */
108 private static final String PRE_TAG = "pre";
109
110 /***
111 * The name of the XHTML tag marking definitions in a definition list.
112 */
113 private static final String DD_TAG = "dd";
114
115 /***
116 * Pattern fragment matching a term in a definition list (printable
117 * characters excluding ':' followed by a colon (':'). Following whitespace
118 * is not included in this pattern.
119 */
120 private static final String DL_TERM = "[^//s:]+:";
121
122 /***
123 * Pattern fragment matching an entry (term + definition) in a definition
124 * text in plain text, formatted similar to RFC 822/2882:
125 * "Term: Definition\n", optionally followed by deeper indented
126 * continuation lines. Group 1 must match the current indentation level
127 * (whitespace at the start of each line, if any).
128 */
129 private static final String DL_ENTRY =
130
131 DL_TERM + TextUtils.SINGLE_LINE_WS.pattern() + "+//S.*"
132
133 + "(?:" + TextUtils.NEWLINE_PATTERN.pattern() + "//1"
134 + TextUtils.SINGLE_LINE_WS.pattern() + "+.+)*";
135
136 /***
137 * Pattern matching a paragraph that should be converted to a description
138 * list (when converting plain text to XHTML). A list must have at least
139 * two entries to be recognized.
140 */
141 private static final Pattern DL_PARA = Pattern.compile(
142
143 "(?://A|" + TextUtils.NEWLINE_PATTERN.pattern()
144 + TextUtils.SINGLE_LINE_WS.pattern() + "*"
145 + TextUtils.NEWLINE_PATTERN.pattern() + ")"
146
147 + "(" + TextUtils.SINGLE_LINE_WS.pattern() + "*)"
148
149 + DL_ENTRY
150
151 + "(?:" + TextUtils.NEWLINE_PATTERN.pattern() + "//1" + DL_ENTRY + ")+"
152
153 + "(?:" + TextUtils.NEWLINE_PATTERN.pattern() + "//1" + DL_TERM
154 + TextUtils.SINGLE_LINE_WS.pattern() + "*)?"
155
156 + "(?=//Z|" + TextUtils.NEWLINE_PATTERN.pattern()
157 + TextUtils.SINGLE_LINE_WS.pattern() + "*"
158 + TextUtils.NEWLINE_PATTERN.pattern() + ")"
159 );
160
161 /***
162 * Pattern fragment matching a short line (containing at most 40
163 * characters + optionally starting and trailing whitespace).
164 */
165 private static final String SHORT_LINE =
166
167 TextUtils.SINGLE_LINE_WS.pattern() + "*"
168
169 + "//S(?:.{1,38}//S)?"
170
171 + TextUtils.SINGLE_LINE_WS.pattern() + "*";
172
173 /***
174 * Pattern matching two consecutive {@link #SHORT_LINE short lines}.
175 */
176 private static final Pattern TWO_SHORT_LINES = Pattern.compile(
177
178 "^" + SHORT_LINE + TextUtils.NEWLINE_PATTERN.pattern()
179
180 + "(?=" + SHORT_LINE + "$)",
181 Pattern.MULTILINE);
182
183 /***
184 * Pattern used to match an XML declaration not containing an encoding.
185 */
186 private static final Pattern XML_DECLARATION = Pattern.compile(
187 "(<//?xml//s+version=\"1//.0\")//s*(//?>)");
188
189 /***
190 * Whether plain text is preprocessed to recognize and reformat definition
191 * lists.
192 */
193 private final boolean preprocessingText;
194
195 /***
196 * Tool for cleaning up malformed and faulty HTML. Synchronized on itself.
197 */
198 private final Tidy tidy;
199
200 /***
201 * Used for printing the output returned by Tidy. Synchronized on the
202 * {@link #tidy} instance.
203 */
204 private final PPrint tidyPrinter;
205
206 /***
207 * An array of taggers used to annotate a text e.g. with linguistic
208 * information.
209 */
210 private final TextProcessor[] tagger;
211
212 /***
213 * XML used for {@linkplain #insertLineBreaks(CharSequence) inserting line
214 * breaks}.
215 */
216 private final XMLAdjuster xmlAdjuster;
217
218 /***
219 * Creates and configured a new instance, using a default extension and the
220 * {@linkplain TiesConfiguration standard configuration}.
221 */
222 public PreProcessor() {
223 this(FILE_EXT);
224 }
225
226 /***
227 * Creates and configured a new instance, using the
228 * {@linkplain TiesConfiguration standard configuration}.
229 *
230 * @param outExt the extension to use for output files
231 * @throws IllegalArgumentException if the configured linguistic tagger(s)
232 * cannot be instantiated
233 */
234 public PreProcessor(final String outExt) throws IllegalArgumentException {
235 this(outExt, TiesConfiguration.CONF);
236 }
237
238 /***
239 * Creates and configured a new instance.
240 *
241 * @param outExt the extension to use for output files
242 * @param config used to configure superclasses
243 * @throws IllegalArgumentException if the configured linguistic tagger(s)
244 * cannot be instantiated
245 */
246 public PreProcessor(final String outExt, final TiesConfiguration config)
247 throws IllegalArgumentException {
248
249 super(outExt, config);
250 preprocessingText = config.getBoolean(CONFIG_PREPROCESS_TEXT);
251
252
253 String[] taggerNames = config.getStringArray(CONFIG_PREPROCESS_TAGGER);
254 if (!TiesConfiguration.arrayIsEmpty(taggerNames)) {
255
256 final String[] params = new String[] {"tt"};
257 tagger = new TextProcessor[taggerNames.length];
258
259 for (int i = 0; i < taggerNames.length; i++) {
260 try {
261 tagger[i] = (TextProcessor) Util.createObject(
262 Class.forName(taggerNames[i]), params);
263 } catch (Exception e) {
264
265 throw new IllegalArgumentException(
266 "Tagger initialization failed", e);
267 }
268 }
269 } else {
270
271 tagger = null;
272 }
273
274
275 tidy = new Tidy();
276
277
278 try {
279 final FileWriter tidyLog = new FileWriter("ties-tidy.log");
280 tidy.setErrout(new PrintWriter(tidyLog, true));
281 } catch (IOException ioe) {
282 Util.LOG.warn(
283 "PreProcessor: couldn't redirect Tidy output to ties-tidy.log",
284 ioe);
285 }
286
287
288 tidy.setCharEncoding(Configuration.UTF8);
289 tidy.setEncloseBlockText(true);
290 tidy.setEncloseText(true);
291 tidy.setDocType("omit");
292 tidy.setDropEmptyParas(true);
293 tidy.setLogicalEmphasis(true);
294 tidy.setMakeClean(true);
295 tidy.setOnlyErrors(true);
296 tidy.setQuoteNbsp(false);
297 tidy.setQuiet(true);
298 tidy.setRawOut(true);
299 tidy.setShowWarnings(false);
300 tidy.setTidyMark(false);
301 tidy.setWraplen(0);
302 tidy.setXmlPi(true);
303 tidy.setXmlPIs(true);
304 tidy.setXmlOut(true);
305
306
307 tidyPrinter = new PPrint(tidy.getConfiguration());
308
309
310 xmlAdjuster = new XMLAdjuster(null, null, null, false, false, false,
311 false, config);
312 }
313
314 /***
315 * Converts HTML input to a clean XHTML representation, if necessary.
316 * Delegates to <code>JTidy</code> for checking and cleaning the HTML code.
317 *
318 * @param input the HTML to tidy
319 * @param charset the character to be used for storing the resulting XHTML
320 * document (required to write the XML Declaration correctly)
321 * @return the cleaned-up XHTML
322 * @throws IOException if the I/O goes wrong
323 */
324 public final String cleanHTML(final String input, final String charset)
325 throws IOException {
326
327 final InputStream inStream = new ByteArrayInputStream(
328 input.getBytes(IOUtils.STANDARD_UNICODE_CHARSET));
329
330
331 final ByteArrayOutputStream outStream =
332 new ByteArrayOutputStream(input.length());
333
334
335 synchronized (tidy) {
336
337 final Node document = tidy.parse(inStream, null);
338
339
340
341 final Out out = new OutImpl();
342 out.encoding = Configuration.UTF8;
343 out.out = outStream;
344 tidyPrinter.printTree(out, (short) 0, 0, null, document);
345 tidyPrinter.flushLine(out, 0);
346 }
347
348 final String rawResult =
349 outStream.toString(IOUtils.STANDARD_UNICODE_CHARSET);
350 final String result;
351
352 if (StringUtils.isNotEmpty(charset)) {
353
354 result = XML_DECLARATION.matcher(rawResult).replaceFirst(
355 "$1 encoding=\"" + charset + "\"$2");
356 } else {
357 Util.LOG.warn("No character set specified -- cannot fix XML"
358 + " declaration of XHTML document");
359 result = rawResult;
360 }
361
362 return result;
363 }
364
365 /***
366 * Preprocesses the contents of a file. Neither input stream nor output
367 * writer are closed by this method.
368 *
369 * @param reader a reader containing the text to preprocess; not closed
370 * by this method
371 * @param writer a writer used to store the preprocessed text; flushed
372 * but not closed by this method
373 * @param context a map of objects that are made available for processing;
374 * the {@link ContentType#KEY_MIME_TYPE} key should to mapped to the MIME
375 * type of the document and the {@link IOUtils#KEY_LOCAL_CHARSET} key to
376 * the character set of the <code>writer</code>
377 * @throws IOException if an I/O error occurred
378 * @throws ProcessingException if the file couldn't be parsed, e.g. due to
379 * an error in the XML input
380 */
381 protected final void doProcess(final Reader reader, final Writer writer,
382 final ContextMap context)
383 throws IOException, ProcessingException {
384
385 String contents = IOUtils.readToString(reader);
386 final String mimeType = (String) context.get(ContentType.KEY_MIME_TYPE);
387 final boolean html;
388 final boolean xml;
389
390
391 if (preprocessingText && ContentType.MIME_PLAIN.equals(mimeType)) {
392 contents = preprocessText(contents);
393 }
394
395 if (ContentType.MIME_HTML.equals(mimeType)) {
396
397 html = true;
398 xml = false;
399 } else if ((mimeType != null) && (getConfig().containsKey(
400 TiesConfiguration.joinKey(CONFIG_HTMLCONV_COMMAND,
401 mimeType)))) {
402
403 final String[] commandArgs = getConfig().getStringArray(
404 TiesConfiguration.joinKey(CONFIG_HTMLCONV_COMMAND, mimeType));
405 final ExternalCommand extConverter =
406 new ExternalCommand(commandArgs);
407
408
409 contents = extConverter.execute(contents);
410
411
412 html = true;
413 xml = true;
414 } else {
415
416 html = false;
417 xml = true;
418 }
419
420 if (html) {
421 if (!xml) {
422
423 contents = cleanHTML(contents,
424 (String) context.get(IOUtils.KEY_LOCAL_CHARSET));
425 }
426
427
428 contents = insertLineBreaks(contents);
429 }
430
431
432 if (tagger != null) {
433 StringWriter stringWriter;
434
435 for (int i = 0; i < tagger.length; i++) {
436 stringWriter = new StringWriter(contents.length());
437 tagger[i].process(new StringReader(contents), stringWriter,
438 context);
439 contents = stringWriter.toString();
440 }
441 }
442
443
444 writer.write(contents);
445 writer.flush();
446 }
447
448 /***
449 * Adds empty <code>br</code> elements in an XHTML document where
450 * appropriate for better recognizing of the physical formatting.
451 * Currently this is done at the begin of each line in a <code>pre</code>
452 * element; and between short lines (at most 40 visible characters each)
453 * in <code>dd</code> element (definitions within definition lists).
454 * (Text in CDATA sections is not modified; leading and trailing linebreaks
455 * are ignored.)
456 *
457 * @param input the XHTML to process
458 * @return the XHTML with <code>br</code> elements added
459 * @throws ParsingException if the file couldn't be parsed, e.g. due to an
460 * error in the XML input
461 */
462 private String insertLineBreaks(final CharSequence input)
463 throws ParsingException {
464 final StringBuilder result = new StringBuilder();
465 final XMLConstituent firstConst =
466 xmlAdjuster.rawConstituents(input, false);
467 XMLConstituent currentConst = firstConst;
468 TagConstituent currentTag;
469 int currentType;
470 boolean inPre = false;
471 boolean inDD = false;
472 String output;
473 final char namespaceSeparator = ':';
474 final Matcher newlineMatcher = TextUtils.NEWLINE_PATTERN.matcher("");
475 final Matcher shortLinesMatcher = TWO_SHORT_LINES.matcher("");
476
477
478 while (currentConst != null) {
479 currentType = currentConst.getType();
480
481 if ((currentType == TagConstituent.START_TAG)
482 || (currentType == TagConstituent.END_TAG)) {
483 currentTag = (TagConstituent) currentConst;
484
485
486 if (currentTag.getName().equals(PRE_TAG) || currentTag
487 .getName().endsWith(namespaceSeparator + PRE_TAG)) {
488
489 inPre = (currentType == TagConstituent.START_TAG);
490 } else if (currentTag.getName().equals(DD_TAG) || currentTag
491 .getName().endsWith(namespaceSeparator + DD_TAG)) {
492
493 inDD = (currentType == TagConstituent.START_TAG);
494 }
495 }
496
497 if (inPre && (currentType == OtherConstituent.TEXT)) {
498
499 output = TextUtils.replaceAll(currentConst.getRepresentantion(),
500 newlineMatcher, NEWLINE_REPLACEMENT);
501
502 if (output != currentConst.getRepresentantion()) {
503 Util.LOG.debug("Inserted break tags into preformatted text:"
504 + TextUtils.LINE_SEPARATOR + output);
505 }
506 } else if (inDD && (currentType == OtherConstituent.TEXT)) {
507
508 output = TextUtils.replaceAll(currentConst.getRepresentantion(),
509 shortLinesMatcher, NEWLINE_REPLACEMENT);
510
511 if (output != currentConst.getRepresentantion()) {
512 Util.LOG.debug("Inserted break tags between short lines "
513 + "in <dd> element:" + TextUtils.LINE_SEPARATOR
514 + output);
515 }
516 } else {
517
518 output = currentConst.getRepresentantion();
519 }
520
521 result.append(output);
522 currentConst = currentConst.nextConstituent();
523 }
524
525 return result.toString();
526 }
527
528 /***
529 * Preprocessed plain text to bring definition lists in a format recognized
530 * by <code>txt2html</code>.
531 *
532 * @param input the plain text to process
533 * @return the plain text with converted definition lists; or a reference
534 * to <code>input</code> if there was nothing to convert
535 */
536 private String preprocessText(final String input) {
537
538
539 final String trimmedInput = input.trim();
540 final Matcher dlMatcher = DL_PARA.matcher(trimmedInput);
541 boolean found = dlMatcher.find();
542 int afterLastMatch = 0;
543 String orgPara, normalizedPara, convertedPara;
544 String indent;
545 String extraIndent;
546 Pattern termToConvert;
547 Pattern continuationIndent;
548
549 if (found) {
550 final StringBuilder result = new StringBuilder();
551 do {
552
553 result.append(trimmedInput.substring(afterLastMatch,
554 dlMatcher.start()));
555 afterLastMatch = dlMatcher.end();
556 indent = dlMatcher.group(1);
557 orgPara = dlMatcher.group();
558
559
560 extraIndent = indent + " ";
561
562
563 continuationIndent = Pattern.compile(
564
565 "(//A|" + TextUtils.NEWLINE_PATTERN.pattern() + ")"
566
567 + indent + TextUtils.SINGLE_LINE_WS.pattern() + "+"
568 );
569 normalizedPara = TextUtils.replaceAll(orgPara,
570 continuationIndent, "$1" + extraIndent);
571
572
573 termToConvert = Pattern.compile(
574
575 "((?://A|" + TextUtils.NEWLINE_PATTERN.pattern() + ")"
576
577 + indent + DL_TERM + ")"
578
579 + TextUtils.SINGLE_LINE_WS.pattern() + "+"
580
581 + "(?=//S)");
582
583
584 convertedPara = TextUtils.replaceAll(normalizedPara,
585 termToConvert,
586 "$1" + TextUtils.LINE_SEPARATOR + extraIndent);
587
588
589 Util.LOG.debug(
590 "Reformatted paragraph containing definition list: "
591 + convertedPara);
592
593
594 result.append(convertedPara);
595 found = dlMatcher.find();
596 } while (found);
597
598
599 result.append(trimmedInput.substring(afterLastMatch));
600 return result.toString();
601 } else {
602
603 return input;
604 }
605 }
606
607 /***
608 * Returns a string representation of this object.
609 *
610 * @return a textual representation
611 */
612 public String toString() {
613 final ToStringBuilder builder = new ToStringBuilder(this)
614 .appendSuper(super.toString())
615 .append("preprocessing text", preprocessingText);
616
617 if (tagger != null) {
618 builder.append("tagger", tagger);
619 }
620
621 return builder.toString();
622 }
623
624 }