1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22 package de.fu_berlin.ties.preprocess;
23
24 import java.io.IOException;
25 import java.io.InputStream;
26 import java.io.Reader;
27 import java.io.StringReader;
28 import java.io.StringWriter;
29 import java.io.Writer;
30 import java.util.ArrayList;
31 import java.util.Collections;
32 import java.util.Enumeration;
33 import java.util.HashMap;
34 import java.util.List;
35 import java.util.Map;
36 import java.util.Properties;
37 import java.util.regex.Matcher;
38 import java.util.regex.Pattern;
39
40 import org.apache.commons.collections.Bag;
41 import org.apache.commons.collections.bag.HashBag;
42 import org.apache.commons.lang.builder.ToStringBuilder;
43
44 import de.fu_berlin.ties.ContextMap;
45 import de.fu_berlin.ties.ParsingException;
46 import de.fu_berlin.ties.TextProcessor;
47 import de.fu_berlin.ties.TiesConfiguration;
48
49 import de.fu_berlin.ties.io.IOUtils;
50 import de.fu_berlin.ties.text.TextTokenizer;
51 import de.fu_berlin.ties.text.TextUtils;
52 import de.fu_berlin.ties.util.ExternalCommand;
53 import de.fu_berlin.ties.util.Util;
54 import de.fu_berlin.ties.xml.TagIsolator;
55 import de.fu_berlin.ties.xml.XMLAdjuster;
56 import de.fu_berlin.ties.xml.XMLTokenizerFactory;
57
58 /***
59 * Integrates the
60 * <a href="http://www.ims.uni-stuttgart.de/projekte/corplex/TreeTagger/"
61 * >TreeTagger</a>, a linguistic tool for part-of-speech tagging and chunk
62 * parsing. This integration brings XML-based input files in a form that can
63 * be processed by TreeTagger, runs the external TreeTagger command, converts
64 * the output in the augmented text format defined by TIE, inserting tags
65 * marking sentences and unifying the original XML markup and the TreeTagger
66 * output in a single XML tree. This class is thread-safe.
67 *
68 * @author Christian Siefkes
69 * @version $Revision: 1.27 $, $Date: 2006/10/21 16:04:23 $, $Author: siefkes $
70 */
71 public class TreeTagger extends TextProcessor {
72
73 /***
74 * Configuration key: the name of the TreeTagger command
75 * (language-specific).
76 */
77 private static final String CONFIG_COMMAND = "treetagger.command";
78
79 /***
80 * Configuration key: whether to add XML tags around each sentence.
81 */
82 private static final String CONFIG_TAG_SENTENCES =
83 "treetagger.tag-sentences";
84
85 /***
86 * Configuration key: the POS tag marking the end of a sentence
87 * (language-specific).
88 */
89 private static final String CONFIG_END_OF_SENTENCE = "treetagger.eos";
90
91 /***
92 * Configuration key: list of POS tags still to include in a previous
93 * sentence when they occur after an end-of-sentence tag (language-specific,
94 * optional).
95 */
96 private static final String CONFIG_AFTER_EOS = "treetagger.after-eos";
97
98 /***
99 * Tag marking the end of a sentence constituent (XML end tag).
100 */
101 private static final String END_TAG_CONSTITUENT = "</const>";
102
103 /***
104 * Tag marking the end of a sentence (XML end tag).
105 */
106 private static final String END_TAG_SENTENCE = "</sent>";
107
108 /***
109 * The apostrophe character (').
110 */
111 private static final String APOSTROPHE_CHAR = "'";
112
113 /***
114 * A pseudo entity used to replace the apostrophe (') within XML tags,
115 * so it is protected from the TreeTagger which separates clitics
116 * (English version).
117 */
118 private static final String PSEUDO_ENTITY_APOSTROPHE = "&';";
119
120 /***
121 * A pseudo entity used to replace whitespace within XML tags,
122 * so it is protected from the TreeTagger which splits lines at whitespace.
123 */
124 private static final String PSEUDO_ENTITY_WHITESPACE = "&;";
125
126 /***
127 * A map of regexp patterns and the corresponding replacement strings used
128 * to make XML tags safe for use with the TreeTagger. We can use a hash map
129 * as the order or replacements doesn't matter.
130 */
131 private static final Map<Pattern, String> REPLACE_WITHIN_TAGS =
132 new HashMap<Pattern, String>();
133
134 /***
135 * A map of regexp patterns and the corresponding replacement strings to
136 * revert the changes of applying the {@link #REPLACE_WITHIN_TAGS} map. We
137 * can use a hash map as the order or replacements doesn't matter.
138 */
139 private static final Map<Pattern, String> RESTORE_WITHIN_TAGS =
140 new HashMap<Pattern, String>();
141
142 /***
143 * Pattern for recognizing regular XML entities.
144 */
145 private static final Pattern XML_ENTITY =
146 Pattern.compile("(&" + XMLTokenizerFactory.XML_NAME + ");");
147
148 /***
149 * Replacement string to protect regular XML entities from being split
150 * by the TreeTagger tokenizer.
151 */
152 private static final String XML_ENTITY_REPLACEMENT = "$1&";
153
154 /***
155 * Pattern for recognizing the XML entities in the replaced form used to
156 * protect them from TreeTagger.
157 */
158 private static final Pattern REPLACED_XML_ENTITY =
159 Pattern.compile("(&" + XMLTokenizerFactory.XML_NAME + ")&");
160
161 /***
162 * Replacement string to restore protected regular XML entities to their
163 * original form.
164 */
165 private static final String XML_ENTITY_RESTORE = "$1;";
166
167 /***
168 * An unmodifiable map of properties containg the HTML characters and
169 * entities to substitute. Each key is a regular expression
170 * {@link java.util.regex.Pattern} of characters and entities and should
171 * be replaced by the correspinding value String.
172 */
173 private static final Map SUBSTITUTES;
174
175 /***
176 * Name of the properties file containing the HTML characters and entities
177 * to substitute.
178 */
179 private static final String SUBSTITUTES_FILE = "conf/substitutes.cfg";
180
181 /***
182 * Static initialization of the static maps.
183 */
184 static {
185 final Properties substProps = new Properties();
186 final InputStream substStream = TreeTagger.class.getClassLoader()
187 .getResourceAsStream(SUBSTITUTES_FILE);
188
189 if (substStream == null) {
190 Util.LOG.error("Class loader returned null for "
191 + SUBSTITUTES_FILE);
192 }
193
194 final Map<Pattern, String> tempSubst = new HashMap<Pattern, String>();
195
196 try {
197 substProps.load(substStream);
198 Enumeration keys = substProps.propertyNames();
199
200
201 final Matcher nameMatcher = Pattern.compile(
202 "//p{Alpha}+").matcher("");
203 final String nameReplacement = "&$0;";
204
205
206 final Matcher numMatcher = Pattern.compile(
207 "//p{Digit}+").matcher("");
208 final String numReplacement = "&#$0;";
209
210
211 final Matcher whitespaceMatcher =
212 Pattern.compile("[//s|]+").matcher("//p{Digit}+");
213 final String whitespaceReplacement = "|";
214
215 String currentPropKey, currentPropValue;
216 StringBuffer currentPatternValue;
217 Pattern currentPattern;
218 int entityNumber;
219 char entityChar;
220
221 while (keys.hasMoreElements()) {
222 currentPropKey = (String) keys.nextElement();
223 currentPropValue =
224 substProps.getProperty(currentPropKey).trim();
225
226
227 currentPropValue = TextUtils.replaceAll(currentPropValue,
228 whitespaceMatcher, whitespaceReplacement);
229
230 currentPropValue = TextUtils.replaceAll(currentPropValue,
231 nameMatcher, nameReplacement);
232
233 numMatcher.reset(currentPropValue);
234 currentPatternValue = new StringBuffer();
235
236 while (numMatcher.find()) {
237 numMatcher.appendReplacement(currentPatternValue,
238 numReplacement);
239
240 entityNumber = Integer.parseInt(numMatcher.group());
241 entityChar = (char) entityNumber;
242 currentPatternValue.append("|").append(entityChar);
243 }
244 numMatcher.appendTail(currentPatternValue);
245
246
247 currentPattern = Pattern.compile("(?:"
248 + currentPatternValue + ")");
249 tempSubst.put(currentPattern, currentPropKey);
250 }
251 } catch (RuntimeException rte) {
252 Util.LOG.error("Could not initialize substitution table from "
253 + SUBSTITUTES_FILE + " = ", rte);
254 } catch (IOException ioe) {
255 Util.LOG.error("Could not initialize substitution table from "
256 + SUBSTITUTES_FILE + ": ", ioe);
257 }
258 SUBSTITUTES = Collections.unmodifiableMap(tempSubst);
259
260 REPLACE_WITHIN_TAGS.put(Pattern.compile(APOSTROPHE_CHAR),
261 PSEUDO_ENTITY_APOSTROPHE);
262 RESTORE_WITHIN_TAGS.put(Pattern.compile(PSEUDO_ENTITY_APOSTROPHE),
263 APOSTROPHE_CHAR);
264 REPLACE_WITHIN_TAGS.put(Pattern.compile("//s+"),
265 PSEUDO_ENTITY_WHITESPACE);
266 RESTORE_WITHIN_TAGS.put(Pattern.compile(PSEUDO_ENTITY_WHITESPACE),
267 " ");
268 }
269
270
271 /***
272 * XML adjuster for merging the original XML tree with the markup added by
273 * the TreeTagger. Neither missing root nor emptiable tags are used,
274 * control characters and pseudo-tags are not deleted, only illegal "&"
275 * are escaped; no file extension is required.
276 */
277 private final XMLAdjuster xmlAdjuster;
278
279 /***
280 * Used to isolate XML tags so the TreeTagger can handle a resulting file.
281 */
282 private final TagIsolator tagIsolator = new TagIsolator();
283
284 /***
285 * Whether to add XML tags around each sentence.
286 */
287 private final boolean tagSentences;
288
289
290 /***
291 * Creates a new instance, using the
292 * {@linkplain TiesConfiguration standard configuration}.
293 *
294 * @param outExt the extension to use for output files
295 */
296 public TreeTagger(final String outExt) {
297 this(outExt, TiesConfiguration.CONF);
298 }
299
300 /***
301 * Creates a new instance.
302 *
303 * @param outExt the extension to use for output files
304 * @param config used to configure superclasses
305 */
306 public TreeTagger(final String outExt, final TiesConfiguration config) {
307 super(outExt, config);
308 tagSentences = config.getBoolean(CONFIG_TAG_SENTENCES);
309
310
311
312
313 xmlAdjuster = new XMLAdjuster(null, null, null, false, false, false,
314 true, config);
315 }
316
317 /***
318 * Workaround for a strange TreeTagger bug: the tagger not only tends to
319 * omit trailing XML markup (which is not too bad since missing end tags
320 * are completed by the XML adjuster), but sometimes it appends spurious
321 * ones. To work around this, we delete all spurious end tags occurring
322 * in the last markup series (after the last textual content).
323 *
324 * @param input the TreeTagger output
325 * @return a corrected copy of the input where spurious end tags
326 * after the last tetual content have been deleted
327 */
328 protected String deleteSpuriousEndTags(final String input) {
329
330 final TextTokenizer tokenizer =
331 XMLTokenizerFactory.createXMLTokenizer(input, false);
332 final Matcher xmlNameMatcher =
333 Pattern.compile(XMLTokenizerFactory.XML_NAME).matcher("");
334 String token, capturedText, tagType;
335 final Bag endTagCount = new HashBag();
336 final Bag startTagCount = new HashBag();
337 final List<String[]> endTagsInMarkupSeries = new ArrayList<String[]>();
338
339 while ((token = tokenizer.nextToken()) != null) {
340 capturedText = tokenizer.capturedText();
341 xmlNameMatcher.reset(capturedText);
342
343 if ((capturedText.length() == 0)
344 || (capturedText.equals("[CDATA"))) {
345
346 endTagsInMarkupSeries.clear();
347 } else if (capturedText.charAt(0) == '/') {
348
349 tagType = capturedText.substring(1);
350 endTagCount.add(tagType);
351
352
353 endTagsInMarkupSeries.add(new String[] {tagType, token});
354 } else if (xmlNameMatcher.matches()) {
355
356 tagType = capturedText;
357 startTagCount.add(tagType);
358 }
359 }
360
361 String[] tagDetails;
362 Pattern quotedToken;
363 Matcher tokenMatcher;
364 int difference;
365 int startIndex = -1;
366 int endIndex = -1;
367 String correctedInput = input;
368
369
370 for (int i = 0; i < endTagsInMarkupSeries.size(); i++) {
371 tagDetails = endTagsInMarkupSeries.get(i);
372 tagType = tagDetails[0];
373 token = tagDetails[1];
374 difference = endTagCount.getCount(tagType)
375 - startTagCount.getCount(tagType);
376
377 if (difference > 0) {
378
379 quotedToken = Pattern.compile(Pattern.quote(token));
380 tokenMatcher = quotedToken.matcher(correctedInput);
381
382
383 while (tokenMatcher.find()) {
384 startIndex = tokenMatcher.start();
385 endIndex = tokenMatcher.end();
386 }
387 Util.LOG.debug("Found " + difference
388 + " more end tags than start tags of type " + tagType
389 + " in the TreeTagged text -- "
390 + "will delete last occurrence '"
391 + correctedInput.substring(startIndex, endIndex) + "'");
392
393
394 correctedInput = correctedInput.substring(0, startIndex)
395 + correctedInput.substring(endIndex);
396 endTagCount.remove(tagType, 1);
397 }
398 }
399
400 return correctedInput;
401 }
402
403 /***
404 * Augments the <code>input</code> text with the output of the TreeTagger.
405 *
406 * @param in reader containing the text to process; must contain the textual
407 * representation of a well-formed XML document
408 * @param out the writer to write the processed text to; the text will
409 * be augmented with part-of-speech, lemma, and chunk information,
410 * it will be a well-formed XML document (if the input was well-formed)
411 * @param context a map of objects that are made available for processing
412 * @throws IOException if an I/O error occurred
413 * @throws ParsingException if the file couldn't be parsed, e.g. due to an
414 * error in the XML input
415 */
416 protected void doProcess(final Reader in, final Writer out,
417 final ContextMap context) throws IOException, ParsingException {
418 final String input = IOUtils.readToString(in);
419
420
421
422
423
424 final String simplifiedXML = TextUtils.multipleReplaceAll(
425 input, SUBSTITUTES);
426
427
428 final String protectedXML = TextUtils.replaceAll(simplifiedXML,
429 XML_ENTITY, XML_ENTITY_REPLACEMENT);
430
431
432 Writer isolateOut = new StringWriter();
433 tagIsolator.isolateTags(new StringReader(protectedXML), isolateOut,
434 REPLACE_WITHIN_TAGS);
435
436
437
438
439 final String taggerCommandName = getConfig().getString(
440 getConfig().localizeKey(CONFIG_COMMAND));
441 final ExternalCommand taggerCommand = new ExternalCommand(
442 new String[] {taggerCommandName});
443 final String treeTagged = taggerCommand.execute(null,
444 isolateOut.toString());
445
446
447
448
449 final String unprotectedXML = TextUtils.replaceAll(treeTagged,
450 REPLACED_XML_ENTITY, XML_ENTITY_RESTORE);
451
452
453
454
455
456
457 final String unisolatedXML = TextUtils.multipleReplaceAll(
458 unprotectedXML, RESTORE_WITHIN_TAGS);
459
460
461
462
463 final String fixedXML = deleteSpuriousEndTags(unisolatedXML);
464
465
466 final String sentenceTagged;
467 if (tagSentences) {
468 sentenceTagged = tagSentences(fixedXML);
469 } else {
470 sentenceTagged = fixedXML;
471 }
472
473
474 xmlAdjuster.adjust(new StringReader(sentenceTagged), out);
475 }
476
477 /***
478 * Adds tags to mark the sentences in a document. Only the ends of
479 * sentences are tagged by this method by inserted </sent> tags --
480 * the corresponding start tags are later added by the XML adjuster.
481 *
482 * @param input the text to process
483 * @return the processed tag with </sent> tags added
484 */
485 protected final String tagSentences(final String input) {
486
487 final String eosType = getConfig().getString(getConfig().localizeKey(
488 CONFIG_END_OF_SENTENCE));
489 final String[] afterEOSTypes = getConfig().getStringArray(
490 getConfig().localizeKey(CONFIG_AFTER_EOS));
491
492 final StringBuilder patternString =
493 new StringBuilder(typedPos(eosType));
494
495
496 patternString.append("(?://s*");
497 final String trailingTags;
498
499 if ((afterEOSTypes != null) && (afterEOSTypes.length > 0)) {
500 final String afterEOSPattern =
501 TextUtils.joinAlternatives(afterEOSTypes);
502
503 trailingTags = TextUtils.joinAlternatives(new String[] {
504 typedPos(afterEOSPattern), END_TAG_CONSTITUENT
505 }
506 );
507 } else {
508
509 trailingTags = END_TAG_CONSTITUENT;
510 }
511
512
513 patternString.append(trailingTags);
514 patternString.append(")*");
515
516
517
518
519
520
521
522
523 return TextUtils.replaceAll(input,
524 Pattern.compile(patternString.toString()), "$0" + END_TAG_SENTENCE);
525 }
526
527 /***
528 * Returns a string representation of this object.
529 *
530 * @return a textual representation
531 */
532 public String toString() {
533 return new ToStringBuilder(this)
534 .appendSuper(super.toString())
535 .append("tag sentences", tagSentences)
536 .append("tag isolator", tagIsolator)
537 .append("XML adjuster", xmlAdjuster)
538 .toString();
539 }
540
541 /***
542 * Helper method that create a regular expression string for matching a
543 * part-of-speech element of a specified type.
544 *
545 * @param type the type string in {@link Pattern} format
546 * @return a string in {@link Pattern} format matching the POS element
547 */
548 private String typedPos(final String type) {
549
550
551 return "<pos//s+type=\"" + type + ".+?</pos>";
552 }
553
554 }