1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22 package de.fu_berlin.ties.preprocess;
23
24 import java.io.IOException;
25 import java.io.InputStream;
26 import java.io.Reader;
27 import java.io.StringReader;
28 import java.io.StringWriter;
29 import java.io.Writer;
30 import java.util.Collections;
31 import java.util.Enumeration;
32 import java.util.HashMap;
33 import java.util.Map;
34 import java.util.Properties;
35 import java.util.regex.Matcher;
36 import java.util.regex.Pattern;
37
38 import org.apache.commons.lang.builder.ToStringBuilder;
39
40 import de.fu_berlin.ties.ContextMap;
41 import de.fu_berlin.ties.ParsingException;
42 import de.fu_berlin.ties.TextProcessor;
43 import de.fu_berlin.ties.TiesConfiguration;
44
45 import de.fu_berlin.ties.io.IOUtils;
46 import de.fu_berlin.ties.text.TextUtils;
47 import de.fu_berlin.ties.util.ExternalCommand;
48 import de.fu_berlin.ties.util.Util;
49 import de.fu_berlin.ties.xml.TagIsolator;
50 import de.fu_berlin.ties.xml.XMLAdjuster;
51 import de.fu_berlin.ties.xml.XMLTokenizerFactory;
52
53 /***
54 * Integrates the
55 * <a href="http://www.ims.uni-stuttgart.de/projekte/corplex/TreeTagger/"
56 * >TreeTagger</a>, a linguistic tool for part-of-speech tagging and chunk
 * parsing. This integration brings XML-based input files into a form that
 * can be processed by TreeTagger, runs the external TreeTagger command, and
 * converts the output into the augmented text format defined by TIE,
 * inserting tags marking sentences and unifying the original XML markup and
 * the TreeTagger output in a single XML tree. This class is thread-safe.
62 *
63 * @author Christian Siefkes
64 * @version $Revision: 1.14 $, $Date: 2004/11/08 11:57:35 $, $Author: siefkes $
65 */
66 public class TreeTagger extends TextProcessor {
67
68 /***
69 * Configuration key: the name of the TreeTagger command
70 * (language-specific).
71 */
72 private static final String CONFIG_COMMAND = "treetagger.command";
73
74 /***
75 * Configuration key: the POS tag marking the end of a sentence
76 * (language-specific).
77 */
78 private static final String CONFIG_END_OF_SENTENCE = "treetagger.eos";
79
80 /***
81 * Configuration key: list of POS tags still to include in a previous
82 * sentence when they occur after an end-of-sentence tag (language-specific,
83 * optional).
84 */
85 private static final String CONFIG_AFTER_EOS = "treetagger.after-eos";
86
87 /***
88 * Tag marking the end of a sentence constituent (XML end tag).
89 */
90 private static final String END_TAG_CONSTITUENT = "</const>";
91
92 /***
93 * Tag marking the end of a sentence (XML end tag).
94 */
95 private static final String END_TAG_SENTENCE = "</sent>";
96
97 /***
98 * The apostrophe character (').
99 */
100 private static final String APOSTROPHE_CHAR = "'";
101
102 /***
103 * A pseudo entity used to replace the apostrophe (') within XML tags,
104 * so it is protected from the TreeTagger which separates clitics
105 * (English version).
106 */
107 private static final String PSEUDO_ENTITY_APOSTROPHE = "&';";
108
109 /***
110 * A pseudo entity used to replace whitespace within XML tags,
111 * so it is protected from the TreeTagger which splits lines at whitespace.
112 */
113 private static final String PSEUDO_ENTITY_WHITESPACE = "&;";
114
115 /***
116 * A map of regexp patterns and the corresponding replacement strings used
117 * to make XML tags safe for use with the TreeTagger. We can use a hash map
118 * as the order or replacements doesn't matter.
119 */
120 private static final Map<Pattern, String> REPLACE_WITHIN_TAGS =
121 new HashMap<Pattern, String>();
122
123 /***
124 * A map of regexp patterns and the corresponding replacement strings to
125 * revert the changes of applying the {@link #REPLACE_WITHIN_TAGS} map. We
126 * can use a hash map as the order or replacements doesn't matter.
127 */
128 private static final Map<Pattern, String> RESTORE_WITHIN_TAGS =
129 new HashMap<Pattern, String>();
130
131 /***
132 * Pattern for recognizing regular XML entities.
133 */
134 private static final Pattern XML_ENTITY =
135 Pattern.compile("(&" + XMLTokenizerFactory.XML_NAME + ");");
136
137 /***
138 * Replacement string to protect regular XML entities from being split
139 * by the TreeTagger tokenizer.
140 */
141 private static final String XML_ENTITY_REPLACEMENT = "$1&";
142
143 /***
144 * Pattern for recognizing the XML entities in the replaced form used to
145 * protect them from TreeTagger.
146 */
147 private static final Pattern REPLACED_XML_ENTITY =
148 Pattern.compile("(&" + XMLTokenizerFactory.XML_NAME + ")&");
149
150 /***
151 * Replacement string to restore protected regular XML entities to their
152 * original form.
153 */
154 private static final String XML_ENTITY_RESTORE = "$1;";
155
156 /***
157 * An unmodifiable map of properties containg the HTML characters and
158 * entities to substitute. Each key is a regular expression
159 * {@link java.util.regex.Pattern} of characters and entities and should
160 * be replaced by the correspinding value String.
161 */
162 private static final Map SUBSTITUTES;
163
164 /***
165 * Name of the properties file containing the HTML characters and entities
166 * to substitute.
167 */
168 private static final String SUBSTITUTES_FILE = "conf/substitutes.cfg";
169
/**
 * Static initialization of the static maps. Builds the substitution table
 * from the properties file {@link #SUBSTITUTES_FILE} and fills the maps
 * used to protect and to restore characters within XML tags.
 *
 * <p>Each property key is used as the replacement string. The property
 * value lists entity names and entity numbers separated by whitespace or
 * "|"; it is converted into an alternation that matches the named entity
 * (<code>&amp;name;</code>), the numeric entity (<code>&amp;#num;</code>)
 * and the literal character denoted by each number.
 *
 * <p>Any failure is logged; the table then contains only the entries that
 * could be built before the failure (possibly none).
 */
static {
    final Map<Pattern, String> tempSubst = new HashMap<Pattern, String>();
    final InputStream substStream =
        ClassLoader.getSystemResourceAsStream(SUBSTITUTES_FILE);

    if (substStream == null) {
        // Resource not found: report it and keep an empty table instead of
        // provoking a NullPointerException in Properties.load
        Util.LOG.error("Class loader returned null for "
            + SUBSTITUTES_FILE);
    } else {
        try {
            final Properties substProps = new Properties();
            substProps.load(substStream);
            final Enumeration<?> keys = substProps.propertyNames();

            // The matchers start with an empty placeholder input; the real
            // input is supplied for each property value below (explicitly
            // via reset for numMatcher, presumably inside
            // TextUtils.replaceAll for the other two -- TODO confirm)

            // entity names: a run of letters becomes "&name;"
            final Matcher nameMatcher = Pattern.compile(
                "\\p{Alpha}+").matcher("");
            final String nameReplacement = "&$0;";

            // entity numbers: a run of digits becomes "&#num;"
            final Matcher numMatcher = Pattern.compile(
                "\\p{Digit}+").matcher("");
            final String numReplacement = "&#$0;";

            // separators: runs of whitespace and "|" become a single "|"
            final Matcher whitespaceMatcher =
                Pattern.compile("[\\s|]+").matcher("");
            final String whitespaceReplacement = "|";

            while (keys.hasMoreElements()) {
                final String propKey = (String) keys.nextElement();
                String propValue = substProps.getProperty(propKey).trim();

                // order matters: separators first, then names, then numbers
                propValue = TextUtils.replaceAll(propValue,
                    whitespaceMatcher, whitespaceReplacement);
                propValue = TextUtils.replaceAll(propValue,
                    nameMatcher, nameReplacement);

                numMatcher.reset(propValue);
                final StringBuffer patternValue = new StringBuffer();

                while (numMatcher.find()) {
                    numMatcher.appendReplacement(patternValue,
                        numReplacement);

                    // Also match the character itself. It is quoted so
                    // that regex metacharacters (e.g. 36 "$", 42 "*",
                    // 124 "|") are taken literally. The cast keeps only
                    // the lower 16 bits, so numbers above U+FFFF are not
                    // represented faithfully.
                    final char entityChar =
                        (char) Integer.parseInt(numMatcher.group());
                    patternValue.append("|").append(
                        Pattern.quote(String.valueOf(entityChar)));
                }
                numMatcher.appendTail(patternValue);

                // non-capturing group around the complete alternation
                tempSubst.put(
                    Pattern.compile("(?:" + patternValue + ")"), propKey);
            }
        } catch (RuntimeException rte) {
            Util.LOG.error("Could not initialize substitution table from "
                + SUBSTITUTES_FILE + " = ", rte);
        } catch (IOException ioe) {
            Util.LOG.error("Could not initialize substitution table from "
                + SUBSTITUTES_FILE + ": ", ioe);
        } finally {
            // Properties.load leaves the stream open, so release it here
            try {
                substStream.close();
            } catch (IOException ioe) {
                Util.LOG.error("Could not close " + SUBSTITUTES_FILE
                    + ": ", ioe);
            }
        }
    }
    SUBSTITUTES = Collections.unmodifiableMap(tempSubst);

    // Within tags: protect apostrophes and whitespace by pseudo entities...
    REPLACE_WITHIN_TAGS.put(Pattern.compile(APOSTROPHE_CHAR),
        PSEUDO_ENTITY_APOSTROPHE);
    REPLACE_WITHIN_TAGS.put(Pattern.compile("\\s+"),
        PSEUDO_ENTITY_WHITESPACE);

    // ...and the reverse mapping; protected whitespace of any length is
    // restored as a single space
    RESTORE_WITHIN_TAGS.put(Pattern.compile(PSEUDO_ENTITY_APOSTROPHE),
        APOSTROPHE_CHAR);
    RESTORE_WITHIN_TAGS.put(Pattern.compile(PSEUDO_ENTITY_WHITESPACE),
        " ");
}
258
/**
 * XML adjuster that merges the original XML tree with the markup added by
 * the TreeTagger. It is created by the constructor with three
 * <code>null</code> arguments and three <code>false</code> flags; according
 * to the original author's notes this means that neither missing root nor
 * emptiable tags are used, control characters and pseudo-tags are not
 * deleted, only illegal "&amp;" are escaped, and no file extension is
 * required.
 */
private final XMLAdjuster xmlAdjuster;

/**
 * Isolates XML tags so the TreeTagger can handle the resulting text.
 */
private final TagIsolator tagIsolator;

/**
 * Creates a new instance that is configured from the
 * {@linkplain TiesConfiguration standard configuration}.
 *
 * @param outExt the extension to use for output files
 */
public TreeTagger(final String outExt) {
    this(outExt, TiesConfiguration.CONF);
}

/**
 * Creates a new instance using the given configuration.
 *
 * @param outExt the extension to use for output files
 * @param config used to configure superclasses; also handed to the
 * XML adjuster created by this constructor
 */
public TreeTagger(final String outExt, final TiesConfiguration config) {
    super(outExt, config);

    // helpers are created in declaration-independent but fixed order:
    // first the tag isolator, then the XML adjuster
    tagIsolator = new TagIsolator();
    xmlAdjuster =
        new XMLAdjuster(null, null, null, false, false, false, config);
}
297
298 /***
299 * Augments the <code>input</code> text with the output of the TreeTagger.
300 *
301 * @param in reader containing the text to process; must contain the textual
302 * representation of a well-formed XML document
303 * @param out the writer to write the processed text to; the text will
304 * be augmented with part-of-speech, lemma, and chunk information,
305 * it will be a well-formed XML document (if the input was well-formed)
306 * @param context a map of objects that are made available for processing
307 * @throws IOException if an I/O error occurred
308 * @throws ParsingException if the file couldn't be parsed, e.g. due to an
309 * error in the XML input
310 */
311 protected void doProcess(final Reader in, final Writer out,
312 final ContextMap context) throws IOException, ParsingException {
313 final String input = IOUtils.readToString(in);
314
315
316
317
318
319 final String simplifiedXML = TextUtils.multipleReplaceAll(
320 input, SUBSTITUTES);
321
322
323 final String protectedXML = TextUtils.replaceAll(simplifiedXML,
324 XML_ENTITY, XML_ENTITY_REPLACEMENT);
325
326
327 Writer isolateOut = new StringWriter();
328 tagIsolator.isolateTags(new StringReader(protectedXML), isolateOut,
329 REPLACE_WITHIN_TAGS);
330
331
332 final String taggerCommandName = getConfig().getString(
333 getConfig().localizeKey(CONFIG_COMMAND));
334 final ExternalCommand taggerCommand = new ExternalCommand(
335 new String[] {taggerCommandName});
336 final String treeTagged =
337 taggerCommand.execute(null, isolateOut.toString());
338
339
340 final String unprotectedXML = TextUtils.replaceAll(treeTagged,
341 REPLACED_XML_ENTITY, XML_ENTITY_RESTORE);
342
343
344
345
346 final String unisolatedXML = TextUtils.multipleReplaceAll(
347 unprotectedXML, RESTORE_WITHIN_TAGS);
348
349
350
351
352 final String sentenceTagged = tagSentences(unisolatedXML);
353
354
355 xmlAdjuster.adjust(new StringReader(sentenceTagged), out);
356 }
357
358 /***
359 * Adds tags to mark the sentences in a document. Only the ends of
360 * sentences are tagged by this method by inserted </sent> tags --
361 * the corresponding start tags are later added by the XML adjuster.
362 *
363 * @param input the text to process
364 * @return the processed tag with </sent> tags added
365 */
366 protected final String tagSentences(final String input) {
367
368 final String eosType = getConfig().getString(getConfig().localizeKey(
369 CONFIG_END_OF_SENTENCE));
370 final String[] afterEOSTypes = getConfig().getStringArray(
371 getConfig().localizeKey(CONFIG_AFTER_EOS));
372
373 final StringBuffer patternString = new StringBuffer(typedPos(eosType));
374
375
376 patternString.append("(?://s*");
377 final String trailingTags;
378
379 if ((afterEOSTypes != null) && (afterEOSTypes.length > 0)) {
380 final String afterEOSPattern =
381 TextUtils.joinAlternatives(afterEOSTypes);
382
383 trailingTags = TextUtils.joinAlternatives(new String[] {
384 typedPos(afterEOSPattern), END_TAG_CONSTITUENT
385 }
386 );
387 } else {
388
389 trailingTags = END_TAG_CONSTITUENT;
390 }
391
392
393 patternString.append(trailingTags);
394 patternString.append(")*");
395
396
397
398
399
400
401
402
403 return TextUtils.replaceAll(input,
404 Pattern.compile(patternString.toString()), "$0" + END_TAG_SENTENCE);
405 }
406
407 /***
408 * Returns a string representation of this object.
409 *
410 * @return a textual representation
411 */
412 public String toString() {
413 return new ToStringBuilder(this)
414 .appendSuper(super.toString())
415 .append("tag isolator", tagIsolator)
416 .append("XML adjuster", xmlAdjuster)
417 .toString();
418 }
419
420 /***
421 * Helper method that create a regular expression string for matching a
422 * part-of-speech element of a specified type.
423 *
424 * @param type the type string in {@link Pattern} format
425 * @return a string in {@link Pattern} format matching the POS element
426 */
427 private String typedPos(final String type) {
428
429
430 return "<pos//s+type=\"" + type + ".+?</pos>";
431 }
432
433 }