View Javadoc

1   /*
2    * Copyright (C) 2003-2006 Christian Siefkes <christian@siefkes.net>.
3    * Development of this software is supported by the German Research Society,
4    * Berlin-Brandenburg Graduate School in Distributed Information Systems
5    * (DFG grant no. GRK 316).
6    *
7    * This program is free software; you can redistribute it and/or modify
8    * it under the terms of the GNU General Public License as published by
9    * the Free Software Foundation; either version 2 of the License, or
10   * (at your option) any later version.
11   *
12   * This program is distributed in the hope that it will be useful,
13   * but WITHOUT ANY WARRANTY; without even the implied warranty of
14   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15   * GNU General Public License for more details.
16   *
17   * You should have received a copy of the GNU General Public License
18   * along with this program; if not, visit
19   * http://www.gnu.org/licenses/gpl.html or write to the Free Software
20   * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
21   */
22  package de.fu_berlin.ties.preprocess;
23  
24  import java.io.IOException;
25  import java.io.InputStream;
26  import java.io.Reader;
27  import java.io.StringReader;
28  import java.io.StringWriter;
29  import java.io.Writer;
30  import java.util.ArrayList;
31  import java.util.Collections;
32  import java.util.Enumeration;
33  import java.util.HashMap;
34  import java.util.List;
35  import java.util.Map;
36  import java.util.Properties;
37  import java.util.regex.Matcher;
38  import java.util.regex.Pattern;
39  
40  import org.apache.commons.collections.Bag;
41  import org.apache.commons.collections.bag.HashBag;
42  import org.apache.commons.lang.builder.ToStringBuilder;
43  
44  import de.fu_berlin.ties.ContextMap;
45  import de.fu_berlin.ties.ParsingException;
46  import de.fu_berlin.ties.TextProcessor;
47  import de.fu_berlin.ties.TiesConfiguration;
48  
49  import de.fu_berlin.ties.io.IOUtils;
50  import de.fu_berlin.ties.text.TextTokenizer;
51  import de.fu_berlin.ties.text.TextUtils;
52  import de.fu_berlin.ties.util.ExternalCommand;
53  import de.fu_berlin.ties.util.Util;
54  import de.fu_berlin.ties.xml.TagIsolator;
55  import de.fu_berlin.ties.xml.XMLAdjuster;
56  import de.fu_berlin.ties.xml.XMLTokenizerFactory;
57  
58  /***
 * Integrates the
 * <a href="http://www.ims.uni-stuttgart.de/projekte/corplex/TreeTagger/"
 * >TreeTagger</a>, a linguistic tool for part-of-speech tagging and chunk
 * parsing. This integration brings XML-based input files into a form that
 * can be processed by TreeTagger, runs the external TreeTagger command, and
 * converts the output into the augmented text format defined by TIE,
 * inserting tags marking sentences and unifying the original XML markup and
 * the TreeTagger output in a single XML tree. This class is thread-safe.
67   *
68   * @author Christian Siefkes
69   * @version $Revision: 1.27 $, $Date: 2006/10/21 16:04:23 $, $Author: siefkes $
70   */
71  public class TreeTagger extends TextProcessor {
72  
73      /***
74       * Configuration key: the name of the TreeTagger command
75       * (language-specific).
76       */
77      private static final String CONFIG_COMMAND = "treetagger.command";
78  
79      /***
80       * Configuration key: whether to add XML tags around each sentence.
81       */
82      private static final String CONFIG_TAG_SENTENCES =
83          "treetagger.tag-sentences";
84  
85      /***
86       * Configuration key: the POS tag marking the end of a sentence
87       * (language-specific).
88       */
89      private static final String CONFIG_END_OF_SENTENCE = "treetagger.eos";
90  
91      /***
92       * Configuration key: list of POS tags still to include in a previous
93       * sentence when they occur after an end-of-sentence tag (language-specific,
94       * optional).
95       */
96      private static final String CONFIG_AFTER_EOS = "treetagger.after-eos";
97  
98      /***
99       * Tag marking the end of a sentence constituent (XML end tag).
100      */
101     private static final String END_TAG_CONSTITUENT = "</const>";
102 
103     /***
104      * Tag marking the end of a sentence (XML end tag).
105      */
106     private static final String END_TAG_SENTENCE = "</sent>";
107 
108     /***
109      * The apostrophe character (').
110      */
111     private static final String APOSTROPHE_CHAR = "'";
112 
113     /***
114      * A pseudo entity used to replace the apostrophe (') within XML tags,
115      * so it is protected from the TreeTagger which separates clitics
116      * (English version).
117      */
118     private static final String PSEUDO_ENTITY_APOSTROPHE = "&';";
119 
120     /***
121      * A pseudo entity used to replace whitespace within XML tags,
122      * so it is protected from the TreeTagger which splits lines at whitespace.
123      */
124     private static final String PSEUDO_ENTITY_WHITESPACE = "&;";
125 
126     /***
127      * A map of regexp patterns and the corresponding replacement strings used
128      * to make XML tags safe for use with the TreeTagger. We can use a hash map
129      * as the order or replacements doesn't matter.
130      */
131     private static final Map<Pattern, String> REPLACE_WITHIN_TAGS =
132         new HashMap<Pattern, String>();
133 
134     /***
135      * A map of regexp patterns and the corresponding replacement strings to
136      * revert the changes of applying the {@link #REPLACE_WITHIN_TAGS} map. We
137      * can use a hash map as the order or replacements doesn't matter.
138      */
139     private static final Map<Pattern, String> RESTORE_WITHIN_TAGS =
140         new HashMap<Pattern, String>();
141 
142     /***
143      * Pattern for recognizing regular XML entities.
144      */
145     private static final Pattern XML_ENTITY =
146         Pattern.compile("(&" + XMLTokenizerFactory.XML_NAME + ");");
147 
148     /***
149      * Replacement string to protect regular XML entities from being split
150      * by the TreeTagger tokenizer.
151      */
152     private static final String XML_ENTITY_REPLACEMENT = "$1&";
153 
154     /***
155      * Pattern for recognizing the XML entities in the replaced form used to
156      * protect them from TreeTagger.
157      */
158     private static final Pattern REPLACED_XML_ENTITY =
159         Pattern.compile("(&" + XMLTokenizerFactory.XML_NAME + ")&");
160 
161     /***
162      * Replacement string to restore protected regular XML entities to their
163      * original form.
164      */
165     private static final String XML_ENTITY_RESTORE = "$1;";
166 
167     /***
168      * An unmodifiable map of properties containg the HTML characters and
169      * entities to substitute. Each key is a regular expression
170      * {@link java.util.regex.Pattern} of characters and entities and should
171      * be replaced by the correspinding value String.
172      */
173     private static final Map SUBSTITUTES;
174 
175     /***
176      * Name of the properties file containing the HTML characters and entities
177      * to substitute.
178      */
179     private static final String SUBSTITUTES_FILE = "conf/substitutes.cfg";
180 
181     /***
182      * Static initialization of the static maps.
183      */
184     static {
185         final Properties substProps = new Properties();
186         final InputStream substStream = TreeTagger.class.getClassLoader()
187                 .getResourceAsStream(SUBSTITUTES_FILE);
188 
189         if (substStream == null) {
190             Util.LOG.error("Class loader returned null for "
191                 + SUBSTITUTES_FILE);
192         }
193 
194         final Map<Pattern, String> tempSubst = new HashMap<Pattern, String>();
195 
196         try {
197             substProps.load(substStream);
198             Enumeration keys = substProps.propertyNames();
199 
200             // replace each 'name' by '&name;'
201             final Matcher nameMatcher = Pattern.compile(
202                 "//p{Alpha}+").matcher("");
203             final String nameReplacement = "&$0;";
204 
205             // replace each 'number' by '&#number;|Unicode char'
206             final Matcher numMatcher = Pattern.compile(
207                 "//p{Digit}+").matcher("");
208             final String numReplacement = "&#$0;";
209 
210             // replace whitespace and '|' by a single '|' (to mark alternatives)
211             final Matcher whitespaceMatcher =
212                 Pattern.compile("[//s|]+").matcher("//p{Digit}+");
213             final String whitespaceReplacement = "|";
214 
215             String currentPropKey, currentPropValue;
216             StringBuffer currentPatternValue;
217             Pattern currentPattern;
218             int entityNumber; // the numeric value of the entity
219             char entityChar;
220 
221             while (keys.hasMoreElements()) {
222                 currentPropKey = (String) keys.nextElement();
223                 currentPropValue =
224                     substProps.getProperty(currentPropKey).trim();
225 
226                 // replace whitespace
227                 currentPropValue = TextUtils.replaceAll(currentPropValue,
228                     whitespaceMatcher, whitespaceReplacement);
229 
230                 currentPropValue = TextUtils.replaceAll(currentPropValue,
231                     nameMatcher, nameReplacement);
232 
233                 numMatcher.reset(currentPropValue);
234                 currentPatternValue = new StringBuffer();
235 
236                 while (numMatcher.find()) {
237                     numMatcher.appendReplacement(currentPatternValue,
238                         numReplacement);
239                     // append raw Unicode character as third option
240                     entityNumber = Integer.parseInt(numMatcher.group());
241                     entityChar = (char) entityNumber;
242                     currentPatternValue.append("|").append(entityChar);
243                 }
244                 numMatcher.appendTail(currentPatternValue);
245 
246                 // create final pattern
247                 currentPattern = Pattern.compile("(?:"
248                     + currentPatternValue + ")");
249                 tempSubst.put(currentPattern, currentPropKey);
250             }
251         } catch (RuntimeException rte) {
252             Util.LOG.error("Could not initialize substitution table from "
253                 + SUBSTITUTES_FILE + " = ", rte);
254         } catch (IOException ioe) {
255             Util.LOG.error("Could not initialize substitution table from "
256                 + SUBSTITUTES_FILE + ": ", ioe);
257         }
258         SUBSTITUTES = Collections.unmodifiableMap(tempSubst);
259 
260         REPLACE_WITHIN_TAGS.put(Pattern.compile(APOSTROPHE_CHAR),
261             PSEUDO_ENTITY_APOSTROPHE);
262         RESTORE_WITHIN_TAGS.put(Pattern.compile(PSEUDO_ENTITY_APOSTROPHE),
263             APOSTROPHE_CHAR);
264         REPLACE_WITHIN_TAGS.put(Pattern.compile("//s+"),
265             PSEUDO_ENTITY_WHITESPACE);
266         RESTORE_WITHIN_TAGS.put(Pattern.compile(PSEUDO_ENTITY_WHITESPACE),
267             " ");
268     }
269 
270 
271     /***
272      * XML adjuster for merging the original XML tree with the markup added by
273      * the TreeTagger. Neither missing root nor emptiable tags are used,
274      * control characters and pseudo-tags are not deleted, only illegal "&amp;"
275      * are escaped; no file extension is required.
276      */
277     private final XMLAdjuster xmlAdjuster;
278 
279     /***
280      * Used to isolate XML tags so the TreeTagger can handle a resulting file.
281      */
282     private final TagIsolator tagIsolator = new TagIsolator();
283 
284     /***
285      * Whether to add XML tags around each sentence.
286      */
287     private final boolean tagSentences;
288 
289 
290     /***
291      * Creates a new instance, using the
292      * {@linkplain TiesConfiguration standard configuration}.
293      *
294      * @param outExt the extension to use for output files
295      */
296     public TreeTagger(final String outExt) {
297         this(outExt, TiesConfiguration.CONF);
298     }
299 
300     /***
301      * Creates a new instance.
302      *
303      * @param outExt the extension to use for output files
304      * @param config used to configure superclasses
305      */
306     public TreeTagger(final String outExt, final TiesConfiguration config) {
307         super(outExt, config);
308         tagSentences = config.getBoolean(CONFIG_TAG_SENTENCES);
309 
310         // initialize XML adjuster: Neither missing root nor emptiable tags are
311         // used, nothing is deleted, but illegal "&amp;" are escaped;
312         // no file extension is required
313         xmlAdjuster = new XMLAdjuster(null, null, null, false, false, false,
314                 true, config);
315     }
316 
317     /***
318      * Workaround for a strange TreeTagger bug: the tagger not only tends to
319      * omit trailing XML markup (which is not too bad since missing end tags
320      * are completed by the XML adjuster), but sometimes it appends spurious
321      * ones. To work around this, we delete all spurious end tags occurring
322      * in the last markup series (after the last textual content).
323      *
324      * @param input the TreeTagger output
325      * @return a corrected copy of the input where spurious end tags
326      * after the last tetual content have been deleted
327      */
328     protected String deleteSpuriousEndTags(final String input) {
329         // use XML tokenization to count start + end tags
330         final TextTokenizer tokenizer =
331             XMLTokenizerFactory.createXMLTokenizer(input, false);
332         final Matcher xmlNameMatcher =
333             Pattern.compile(XMLTokenizerFactory.XML_NAME).matcher("");
334         String token, capturedText, tagType;
335         final Bag endTagCount = new HashBag();
336         final Bag startTagCount = new HashBag();
337         final List<String[]> endTagsInMarkupSeries = new ArrayList<String[]>();
338 
339         while ((token = tokenizer.nextToken()) != null) {
340             capturedText = tokenizer.capturedText();
341             xmlNameMatcher.reset(capturedText);
342 
343             if ((capturedText.length() == 0)
344                     || (capturedText.equals("[CDATA"))) {
345                 // textual content: reset markup series
346                 endTagsInMarkupSeries.clear();
347             } else if (capturedText.charAt(0) == '/') {
348                 // this is an end tag: count it
349                 tagType = capturedText.substring(1);
350                 endTagCount.add(tagType);
351 
352                 // store tag type + token representation in markup series list
353                 endTagsInMarkupSeries.add(new String[] {tagType, token});
354             } else if (xmlNameMatcher.matches()) {
355                 // this is a start tag: count it
356                 tagType = capturedText;
357                 startTagCount.add(tagType);
358             }
359         }
360 
361         String[] tagDetails;
362         Pattern quotedToken;
363         Matcher tokenMatcher;
364         int difference;
365         int startIndex = -1;
366         int endIndex = -1;
367         String correctedInput = input;
368 
369         // iterate all end tags in the last markup series
370         for (int i = 0; i < endTagsInMarkupSeries.size(); i++) {
371             tagDetails = endTagsInMarkupSeries.get(i);
372             tagType = tagDetails[0];
373             token = tagDetails[1];
374             difference = endTagCount.getCount(tagType)
375                     - startTagCount.getCount(tagType);
376 
377             if (difference > 0) {
378                 // more end than start tags
379                 quotedToken = Pattern.compile(Pattern.quote(token));
380                 tokenMatcher = quotedToken.matcher(correctedInput);
381 
382                 // find all occurrences of the token in the text
383                 while (tokenMatcher.find()) {
384                     startIndex = tokenMatcher.start();
385                     endIndex = tokenMatcher.end();
386                 }
387                 Util.LOG.debug("Found " + difference
388                         + " more end tags than start tags of type " + tagType
389                         + " in the TreeTagged text -- "
390                         + "will delete last occurrence '"
391                         + correctedInput.substring(startIndex, endIndex) + "'");
392 
393                 // delete last occurrence of end tag + remove instance from bag
394                 correctedInput = correctedInput.substring(0, startIndex)
395                         + correctedInput.substring(endIndex);
396                 endTagCount.remove(tagType, 1);
397             }
398         }
399 
400         return correctedInput;
401     }
402 
403     /***
404      * Augments the <code>input</code> text with the output of the TreeTagger.
405      *
406      * @param in reader containing the text to process; must contain the textual
407      * representation of a well-formed XML document
408      * @param out the writer to write the processed text to; the text will
409      * be augmented with part-of-speech, lemma, and chunk information,
410      * it will be a well-formed XML document (if the input was well-formed)
411      * @param context a map of objects that are made available for processing
412      * @throws IOException if an I/O error occurred
413      * @throws ParsingException if the file couldn't be parsed, e.g. due to an
414      * error in the XML input
415      */
416     protected void doProcess(final Reader in, final Writer out,
417             final ContextMap context) throws IOException, ParsingException {
418         final String input = IOUtils.readToString(in);
419         //Util.LOG.debug("Un-augmented XML: " + input);
420 
421         // replaces some special characters/entities with a simpler
422         // representation (e.g., different types of quotes, dashes,
423         // and special whitespace characters)
424         final String simplifiedXML = TextUtils.multipleReplaceAll(
425             input, SUBSTITUTES);
426 
427         // protect entities from TreeTagger
428         final String protectedXML = TextUtils.replaceAll(simplifiedXML,
429             XML_ENTITY, XML_ENTITY_REPLACEMENT);
430 
431         // isolate tags for TreeTagger
432         Writer isolateOut = new StringWriter();
433         tagIsolator.isolateTags(new StringReader(protectedXML), isolateOut,
434             REPLACE_WITHIN_TAGS);
435 
436         //Util.LOG.debug("Tag-isolated XML: " + isolateOut.toString());
437 
438         // delegate to TreeTagger
439         final String taggerCommandName = getConfig().getString(
440                 getConfig().localizeKey(CONFIG_COMMAND));
441         final ExternalCommand taggerCommand = new ExternalCommand(
442             new String[] {taggerCommandName});
443         final String treeTagged = taggerCommand.execute(null,
444                     isolateOut.toString());
445 
446         //Util.LOG.debug("TreeTagged XML: " + treeTagged);
447 
448         // restore entities to original form
449         final String unprotectedXML = TextUtils.replaceAll(treeTagged,
450             REPLACED_XML_ENTITY, XML_ENTITY_RESTORE);
451 
452         //Util.LOG.debug("Un-protected XML: " + unprotectedXML);
453 
454         // restore whitespace + apostrophes in tags
455         // We're using pseudo-entities  (illegal XML) instead of actual
456         // ones, so we can replace them without fear of replacing too much
457         final String unisolatedXML = TextUtils.multipleReplaceAll(
458             unprotectedXML, RESTORE_WITHIN_TAGS);
459 
460         //Util.LOG.debug("Un-adjusted XML: " + unisolatedXML);
461 
462         // workaround for strange TreeTagger bug
463         final String fixedXML = deleteSpuriousEndTags(unisolatedXML);
464 
465         // add end-of-sentence markers (</sent>), if configured
466         final String sentenceTagged;
467         if (tagSentences) {
468             sentenceTagged = tagSentences(fixedXML);
469         } else {
470             sentenceTagged = fixedXML;
471         }
472 
473         // adjusting XML structure (merging original + TreeTagger trees)
474         xmlAdjuster.adjust(new StringReader(sentenceTagged), out);
475     }
476 
477     /***
478      * Adds tags to mark the sentences in a document. Only the ends of
479      * sentences are tagged by this method by inserted &lt;/sent&gt; tags --
480      * the corresponding start tags are later added by the XML adjuster.
481      *
482      * @param input the text to process
483      * @return the processed tag with &lt;/sent&gt; tags added
484      */
485     protected final String tagSentences(final String input) {
486         // build pattern of text to match
487         final String eosType = getConfig().getString(getConfig().localizeKey(
488                         CONFIG_END_OF_SENTENCE));
489         final String[] afterEOSTypes = getConfig().getStringArray(
490                 getConfig().localizeKey(CONFIG_AFTER_EOS));
491         // start pattern
492         final StringBuilder patternString =
493             new StringBuilder(typedPos(eosType));
494 
495         // open non-capturing group, accept preceding whitespace
496         patternString.append("(?://s*");
497         final String trailingTags;
498 
499         if ((afterEOSTypes != null) && (afterEOSTypes.length > 0)) {
500             final String afterEOSPattern =
501                 TextUtils.joinAlternatives(afterEOSTypes);
502             // these POS tags and ending constituents should be included
503             trailingTags = TextUtils.joinAlternatives(new String[] {
504                     typedPos(afterEOSPattern), END_TAG_CONSTITUENT
505                 }
506             );
507         } else {
508             // ending constituents should be included
509             trailingTags = END_TAG_CONSTITUENT;
510         }
511 
512         // append trailing tags + close group (matching any number of times)
513         patternString.append(trailingTags);
514         patternString.append(")*");
515 
516         /* rough debugging only--very inefficient
517         final Pattern eosPattern = Pattern.compile(patternString.toString());
518         Log.TIES.info("End-of-sentence pattern '" + patternString.toString()
519             + "' matches " + (eosPattern.split(input).length - 1) + " times");
520         */
521 
522         // append sentence end tag after each match of the created pattern
523         return TextUtils.replaceAll(input,
524             Pattern.compile(patternString.toString()), "$0" + END_TAG_SENTENCE);
525     }
526 
527     /***
528      * Returns a string representation of this object.
529      *
530      * @return a textual representation
531      */
532     public String toString() {
533         return new ToStringBuilder(this)
534             .appendSuper(super.toString())
535             .append("tag sentences", tagSentences)
536             .append("tag isolator", tagIsolator)
537             .append("XML adjuster", xmlAdjuster)
538             .toString();
539     }
540 
541     /***
542      * Helper method that create a regular expression string for matching a
543      * part-of-speech element of a specified type.
544      *
545      * @param type the type string in {@link Pattern} format
546      * @return a string in {@link Pattern} format matching the POS element
547      */
548     private String typedPos(final String type) {
549         // we rely on the format written by our TreeTagger adaption (type is
550         // the first attribute, quotes are " not ')
551         return "<pos//s+type=\"" + type + ".+?</pos>";
552     }
553 
554 }