/*
 * Copyright (C) 2003-2004 Christian Siefkes <christian@siefkes.net>.
 * Development of this software is supported by the German Research Society,
 * Berlin-Brandenburg Graduate School in Distributed Information Systems
 * (DFG grant no. GRK 316).
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, visit
 * http://www.gnu.org/licenses/lgpl.html or write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
 */
22  package de.fu_berlin.ties.preprocess;
23  
24  import java.io.IOException;
25  import java.io.InputStream;
26  import java.io.Reader;
27  import java.io.StringReader;
28  import java.io.StringWriter;
29  import java.io.Writer;
30  import java.util.Collections;
31  import java.util.Enumeration;
32  import java.util.HashMap;
33  import java.util.Map;
34  import java.util.Properties;
35  import java.util.regex.Matcher;
36  import java.util.regex.Pattern;
37  
38  import org.apache.commons.lang.builder.ToStringBuilder;
39  
40  import de.fu_berlin.ties.ContextMap;
41  import de.fu_berlin.ties.ParsingException;
42  import de.fu_berlin.ties.TextProcessor;
43  import de.fu_berlin.ties.TiesConfiguration;
44  
45  import de.fu_berlin.ties.io.IOUtils;
46  import de.fu_berlin.ties.text.TextUtils;
47  import de.fu_berlin.ties.util.ExternalCommand;
48  import de.fu_berlin.ties.util.Util;
49  import de.fu_berlin.ties.xml.TagIsolator;
50  import de.fu_berlin.ties.xml.XMLAdjuster;
51  import de.fu_berlin.ties.xml.XMLTokenizerFactory;
52  
53  /***
54   * Integrates the
55   * <a href="http://www.ims.uni-stuttgart.de/projekte/corplex/TreeTagger/"
56   * >TreeTagger</a>, a linguistic tool for part-of-speech tagging and chunk
57   * parsing. This integration brings XML-based input files in a form that can
58   * be processed by TreeTagger, runs the external TreeTagger command, converts
59   * the output in the augmented text format defined by TIE, inserting tags
60   * marking sentences and unifying the original XML markup and the TreeTagger
61   * output in a single XML tree. This class is thread-safe.
62   *
63   * @author Christian Siefkes
64   * @version $Revision: 1.14 $, $Date: 2004/11/08 11:57:35 $, $Author: siefkes $
65   */
66  public class TreeTagger extends TextProcessor {
67  
68      /***
69       * Configuration key: the name of the TreeTagger command
70       * (language-specific).
71       */
72      private static final String CONFIG_COMMAND = "treetagger.command";
73  
74      /***
75       * Configuration key: the POS tag marking the end of a sentence
76       * (language-specific).
77       */
78      private static final String CONFIG_END_OF_SENTENCE = "treetagger.eos";
79  
80      /***
81       * Configuration key: list of POS tags still to include in a previous
82       * sentence when they occur after an end-of-sentence tag (language-specific,
83       * optional).
84       */
85      private static final String CONFIG_AFTER_EOS = "treetagger.after-eos";
86  
87      /***
88       * Tag marking the end of a sentence constituent (XML end tag).
89       */
90      private static final String END_TAG_CONSTITUENT = "</const>";
91  
92      /***
93       * Tag marking the end of a sentence (XML end tag).
94       */
95      private static final String END_TAG_SENTENCE = "</sent>";
96  
97      /***
98       * The apostrophe character (').
99       */
100     private static final String APOSTROPHE_CHAR = "'";
101 
102     /***
103      * A pseudo entity used to replace the apostrophe (') within XML tags,
104      * so it is protected from the TreeTagger which separates clitics
105      * (English version).
106      */
107     private static final String PSEUDO_ENTITY_APOSTROPHE = "&';";
108 
109     /***
110      * A pseudo entity used to replace whitespace within XML tags,
111      * so it is protected from the TreeTagger which splits lines at whitespace.
112      */
113     private static final String PSEUDO_ENTITY_WHITESPACE = "&;";
114 
115     /***
116      * A map of regexp patterns and the corresponding replacement strings used
117      * to make XML tags safe for use with the TreeTagger. We can use a hash map
118      * as the order or replacements doesn't matter.
119      */
120     private static final Map<Pattern, String> REPLACE_WITHIN_TAGS =
121         new HashMap<Pattern, String>();
122 
123     /***
124      * A map of regexp patterns and the corresponding replacement strings to
125      * revert the changes of applying the {@link #REPLACE_WITHIN_TAGS} map. We
126      * can use a hash map as the order or replacements doesn't matter.
127      */
128     private static final Map<Pattern, String> RESTORE_WITHIN_TAGS =
129         new HashMap<Pattern, String>();
130 
131     /***
132      * Pattern for recognizing regular XML entities.
133      */
134     private static final Pattern XML_ENTITY =
135         Pattern.compile("(&" + XMLTokenizerFactory.XML_NAME + ");");
136 
137     /***
138      * Replacement string to protect regular XML entities from being split
139      * by the TreeTagger tokenizer.
140      */
141     private static final String XML_ENTITY_REPLACEMENT = "$1&";
142 
143     /***
144      * Pattern for recognizing the XML entities in the replaced form used to
145      * protect them from TreeTagger.
146      */
147     private static final Pattern REPLACED_XML_ENTITY =
148         Pattern.compile("(&" + XMLTokenizerFactory.XML_NAME + ")&");
149 
150     /***
151      * Replacement string to restore protected regular XML entities to their
152      * original form.
153      */
154     private static final String XML_ENTITY_RESTORE = "$1;";
155 
156     /***
157      * An unmodifiable map of properties containg the HTML characters and
158      * entities to substitute. Each key is a regular expression
159      * {@link java.util.regex.Pattern} of characters and entities and should
160      * be replaced by the correspinding value String.
161      */
162     private static final Map SUBSTITUTES;
163 
164     /***
165      * Name of the properties file containing the HTML characters and entities
166      * to substitute.
167      */
168     private static final String SUBSTITUTES_FILE = "conf/substitutes.cfg";
169 
170     /***
171      * Static initialization of the static maps.
172      */
173     static {
174         final Properties substProps = new Properties();
175         final InputStream substStream =
176             ClassLoader.getSystemResourceAsStream(SUBSTITUTES_FILE);
177 
178         if (substStream == null) {
179             Util.LOG.error("Class loader returned null for "
180                 + SUBSTITUTES_FILE);
181         }
182 
183         final Map<Pattern, String> tempSubst = new HashMap<Pattern, String>();
184 
185         try {
186             substProps.load(substStream);
187             Enumeration keys = substProps.propertyNames();
188 
189             // replace each 'name' by '&name;'
190             final Matcher nameMatcher = Pattern.compile(
191                 "//p{Alpha}+").matcher("");
192             final String nameReplacement = "&$0;";
193 
194             // replace each 'number' by '&#number;|Unicode char'
195             final Matcher numMatcher = Pattern.compile(
196                 "//p{Digit}+").matcher("");
197             final String numReplacement = "&#$0;";
198 
199             // replace whitespace and '|' by a single '|' (to mark alternatives)
200             final Matcher whitespaceMatcher =
201                 Pattern.compile("[//s|]+").matcher("//p{Digit}+");
202             final String whitespaceReplacement = "|";
203 
204             String currentPropKey, currentPropValue;
205             StringBuffer currentPatternValue;
206             Pattern currentPattern;
207             int entityNumber; // the numeric value of the entity
208             char entityChar;
209 
210             while (keys.hasMoreElements()) {
211                 currentPropKey = (String) keys.nextElement();
212                 currentPropValue =
213                     substProps.getProperty(currentPropKey).trim();
214 
215                 // replace whitespace
216                 currentPropValue = TextUtils.replaceAll(currentPropValue,
217                     whitespaceMatcher, whitespaceReplacement);
218 
219                 currentPropValue = TextUtils.replaceAll(currentPropValue,
220                     nameMatcher, nameReplacement);
221 
222                 numMatcher.reset(currentPropValue);
223                 currentPatternValue = new StringBuffer();
224 
225                 while (numMatcher.find()) {
226                     numMatcher.appendReplacement(currentPatternValue,
227                         numReplacement);
228                     // append raw Unicode character as third option
229                     entityNumber = Integer.parseInt(numMatcher.group());
230                     entityChar = (char) entityNumber;
231                     currentPatternValue.append("|").append(entityChar);
232                 }
233                 numMatcher.appendTail(currentPatternValue);
234 
235                 // create final pattern
236                 currentPattern = Pattern.compile("(?:"
237                     + currentPatternValue + ")");
238                 tempSubst.put(currentPattern, currentPropKey);
239             }
240         } catch (RuntimeException rte) {
241             Util.LOG.error("Could not initialize substitution table from "
242                 + SUBSTITUTES_FILE + " = ", rte);
243         } catch (IOException ioe) {
244             Util.LOG.error("Could not initialize substitution table from "
245                 + SUBSTITUTES_FILE + ": ", ioe);
246         }
247         SUBSTITUTES = Collections.unmodifiableMap(tempSubst);
248 
249         REPLACE_WITHIN_TAGS.put(Pattern.compile(APOSTROPHE_CHAR),
250             PSEUDO_ENTITY_APOSTROPHE);
251         RESTORE_WITHIN_TAGS.put(Pattern.compile(PSEUDO_ENTITY_APOSTROPHE),
252             APOSTROPHE_CHAR);
253         REPLACE_WITHIN_TAGS.put(Pattern.compile("//s+"),
254             PSEUDO_ENTITY_WHITESPACE);
255         RESTORE_WITHIN_TAGS.put(Pattern.compile(PSEUDO_ENTITY_WHITESPACE),
256             " ");
257     }
258 
259     /***
260      * XML adjuster for merging the original XML tree with the markup added by
261      * the TreeTagger. Neither missing root nor emptiable tags are used,
262      * control characters and pseudo-tags are not deleted, only illegal "&amp;"
263      * are escaped; no file extension is required.
264      */
265     private final XMLAdjuster xmlAdjuster;
266 
267     /***
268      * Used to isolate XML tags so the TreeTagger can handle a resulting file.
269      */
270     private final TagIsolator tagIsolator = new TagIsolator();
271 
272     /***
273      * Creates a new instance, using the
274      * {@linkplain TiesConfiguration standard configuration}.
275      *
276      * @param outExt the extension to use for output files
277      */
278     public TreeTagger(final String outExt) {
279         this(outExt, TiesConfiguration.CONF);
280     }
281 
282     /***
283      * Creates a new instance.
284      *
285      * @param outExt the extension to use for output files
286      * @param config used to configure superclasses
287      */
288     public TreeTagger(final String outExt, final TiesConfiguration config) {
289         super(outExt, config);
290 
291         // initialize XML adjuster: Neither missing root nor emptiable tags are
292         // used, control characters and pseudo-tags are not deleted, only
293         // illegal "&amp;" are escaped; no file extension is required
294         xmlAdjuster =
295             new XMLAdjuster(null, null, null, false, false, false, config);
296     }
297 
298     /***
299      * Augments the <code>input</code> text with the output of the TreeTagger.
300      *
301      * @param in reader containing the text to process; must contain the textual
302      * representation of a well-formed XML document
303      * @param out the writer to write the processed text to; the text will
304      * be augmented with part-of-speech, lemma, and chunk information,
305      * it will be a well-formed XML document (if the input was well-formed)
306      * @param context a map of objects that are made available for processing
307      * @throws IOException if an I/O error occurred
308      * @throws ParsingException if the file couldn't be parsed, e.g. due to an
309      * error in the XML input
310      */
311     protected void doProcess(final Reader in, final Writer out,
312             final ContextMap context) throws IOException, ParsingException {
313         final String input = IOUtils.readToString(in);
314         //Util.LOG.debug("Un-augmented XML: " + input);
315 
316         // replaces some special characters/entities with a simpler
317         // representation (e.g., different types of quotes, dashes,
318         // and special whitespace characters)
319         final String simplifiedXML = TextUtils.multipleReplaceAll(
320             input, SUBSTITUTES);
321 
322         // protect entities from TreeTagger
323         final String protectedXML = TextUtils.replaceAll(simplifiedXML,
324             XML_ENTITY, XML_ENTITY_REPLACEMENT);
325 
326         // isolate tags for TreeTagger
327         Writer isolateOut = new StringWriter();
328         tagIsolator.isolateTags(new StringReader(protectedXML), isolateOut,
329             REPLACE_WITHIN_TAGS);
330 
331         // delegate to TreeTagger
332         final String taggerCommandName = getConfig().getString(
333                 getConfig().localizeKey(CONFIG_COMMAND));
334         final ExternalCommand taggerCommand = new ExternalCommand(
335             new String[] {taggerCommandName});
336         final String treeTagged =
337             taggerCommand.execute(null, isolateOut.toString());
338 
339         // restore entities to original form
340         final String unprotectedXML = TextUtils.replaceAll(treeTagged,
341             REPLACED_XML_ENTITY, XML_ENTITY_RESTORE);
342 
343         // restore whitespace + apostrophes in tags
344         // We're using pseudo-entities  (illegal XML) instead of actual
345         // ones, so we can replace them without fear of replacing too much
346         final String unisolatedXML = TextUtils.multipleReplaceAll(
347             unprotectedXML, RESTORE_WITHIN_TAGS);
348 
349         //Util.LOG.debug("Un-adjusted XML: " + unisolatedXML);
350 
351         // add end-of-sentence markers (</sent>)
352         final String sentenceTagged = tagSentences(unisolatedXML);
353 
354         // adjusting XML structure (merging original + TreeTagger trees)
355         xmlAdjuster.adjust(new StringReader(sentenceTagged), out);
356     }
357 
358     /***
359      * Adds tags to mark the sentences in a document. Only the ends of
360      * sentences are tagged by this method by inserted &lt;/sent&gt; tags --
361      * the corresponding start tags are later added by the XML adjuster.
362      *
363      * @param input the text to process
364      * @return the processed tag with &lt;/sent&gt; tags added
365      */
366     protected final String tagSentences(final String input) {
367         // build pattern of text to match
368         final String eosType = getConfig().getString(getConfig().localizeKey(
369                         CONFIG_END_OF_SENTENCE));
370         final String[] afterEOSTypes = getConfig().getStringArray(
371                 getConfig().localizeKey(CONFIG_AFTER_EOS));
372         // start pattern
373         final StringBuffer patternString = new StringBuffer(typedPos(eosType));
374 
375         // open non-capturing group, accept preceding whitespace
376         patternString.append("(?://s*");
377         final String trailingTags;
378 
379         if ((afterEOSTypes != null) && (afterEOSTypes.length > 0)) {
380             final String afterEOSPattern =
381                 TextUtils.joinAlternatives(afterEOSTypes);
382             // these POS tags and ending constituents should be included
383             trailingTags = TextUtils.joinAlternatives(new String[] {
384                     typedPos(afterEOSPattern), END_TAG_CONSTITUENT
385                 }
386             );
387         } else {
388             // ending constituents should be included
389             trailingTags = END_TAG_CONSTITUENT;
390         }
391 
392         // append trailing tags + close group (matching any number of times)
393         patternString.append(trailingTags);
394         patternString.append(")*");
395 
396         /* rough debugging only--very inefficient
397         final Pattern eosPattern = Pattern.compile(patternString.toString());
398         Log.TIES.info("End-of-sentence pattern '" + patternString.toString()
399             + "' matches " + (eosPattern.split(input).length - 1) + " times");
400         */
401 
402         // append sentence end tag after each match of the created pattern
403         return TextUtils.replaceAll(input,
404             Pattern.compile(patternString.toString()), "$0" + END_TAG_SENTENCE);
405     }
406 
407     /***
408      * Returns a string representation of this object.
409      *
410      * @return a textual representation
411      */
412     public String toString() {
413         return new ToStringBuilder(this)
414             .appendSuper(super.toString())
415             .append("tag isolator", tagIsolator)
416             .append("XML adjuster", xmlAdjuster)
417             .toString();
418     }
419 
420     /***
421      * Helper method that create a regular expression string for matching a
422      * part-of-speech element of a specified type.
423      *
424      * @param type the type string in {@link Pattern} format
425      * @return a string in {@link Pattern} format matching the POS element
426      */
427     private String typedPos(final String type) {
428         // we rely on the format written by our TreeTagger adaption (type is
429         // the first attribute, quotes are " not ')
430         return "<pos//s+type=\"" + type + ".+?</pos>";
431     }
432 
433 }