1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22 package de.fu_berlin.ties;
23
24 import java.io.File;
25 import java.io.FileOutputStream;
26 import java.io.FileWriter;
27 import java.io.IOException;
28 import java.io.InputStreamReader;
29 import java.io.OutputStreamWriter;
30 import java.io.Reader;
31 import java.io.StringReader;
32 import java.io.StringWriter;
33 import java.io.Writer;
34 import java.net.URL;
35 import java.net.URLConnection;
36 import java.util.HashMap;
37 import java.util.Map;
38
39 import org.apache.commons.lang.ArrayUtils;
40 import org.apache.commons.lang.builder.ToStringBuilder;
41
42 import de.fu_berlin.ties.io.ContentType;
43 import de.fu_berlin.ties.io.IOUtils;
44 import de.fu_berlin.ties.util.Util;
45
46 /***
47 * Abstract base class for a {@link de.fu_berlin.ties.Processor} that operates
48 * on text documents. Input is read from a file or URL or
49 * {@link java.io.Reader}, output is written to a file or
50 * {@link java.io.Writer}.
51 *
52 * @author Christian Siefkes
53 * @version $Revision: 1.28 $, $Date: 2006/10/21 16:03:52 $, $Author: siefkes $
54 */
55 public abstract class TextProcessor extends ConfigurableProcessor {
56
57 /***
58 * Configuration prefix for post-processors.
59 */
60 public static final String CONFIG_POST = "post";
61
62 /***
63 * Context key referring to the local name of the processed document.
64 */
65 public static final String KEY_LOCAL_NAME = "local";
66
67 /***
68 * Context key referring to the directory of the processed document,
69 * if it is a local file.
70 */
71 public static final String KEY_DIRECTORY = "directory";
72
/**
 * Context key referring to the output directory; if missing, the value of
 * {@link #KEY_DIRECTORY} is used instead.
 */
77 public static final String KEY_OUT_DIRECTORY = "outdir";
78
79 /***
80 * Context key referring to the URL of the processed document, if loaded
81 * from an URL.
82 */
83 public static final String KEY_URL = "url";
84
85 /***
86 * The extension used for output files.
87 */
88 private final String outFileExt;
89
/**
 * The configuration key for the post-processor (or <code>null</code> if no
 * <code>outFileExt</code> is given).
 */
94 private final String postProcKey;
95
96 /***
97 * A map of post-processors for different languages, if configured.
98 * Maps from localized or global {@link #CONFIG_POST} key to initialized
99 * {@link Processor} instance (or to <code>null</code> if no post-processor
100 * exists). Synchronized on itself.
101 */
102 private final Map<String, Processor> postProcessors =
103 new HashMap<String, Processor>();
104
/**
 * Creates a new instance that relies on the
 * {@linkplain TiesConfiguration#CONF standard configuration}. Simply
 * forwards to the two-argument constructor with a <code>null</code>
 * configuration.
 *
 * @param outExt the extension to use for output files
 */
public TextProcessor(final String outExt) {
    this(outExt, null);
}

/**
 * Creates a new instance. The output extension is stored and, when it is
 * not <code>null</code>, joined with {@link #CONFIG_POST} to form the
 * configuration key under which a post-processor may be declared.
 *
 * @param outExt the extension to use for output files
 * @param conf used to configure this instance; if <code>null</code>,
 * the {@linkplain TiesConfiguration#CONF standard configuration} is used
 */
public TextProcessor(final String outExt, final TiesConfiguration conf) {
    super(conf);
    this.outFileExt = outExt;
    // Without an extension there is no key a post-processor could be
    // configured under
    this.postProcKey = (outExt == null)
        ? null
        : TiesConfiguration.joinKey(CONFIG_POST, outExt);
}
132
133 /***
134 * Processes the contents of a reader, writing a modified version to a
135 * writer.
136 *
137 * @param reader reader containing the text to process; should not be closed
138 * by this method
139 * @param writer the writer to write the processed text to; might be flushed
140 * but not closed by this method; if this method does not use the writer,
141 * the underlying file will be deleted afterwards
142 * @param context a map of objects that are made available for processing;
143 * when called from the implemented <code>process</code> methods in this
144 * class, it will contain mappings from {@link IOUtils#KEY_LOCAL_CHARSET}
145 * to the character set of the output writer;
146 * from {@link #KEY_OUT_DIRECTORY} to the output directory ({@link File});
147 * from {@link ContentType#KEY_MIME_TYPE} to the document's MIME type; from
148 * {@link #KEY_LOCAL_NAME} to the local name (String) and either from
149 * {@link #KEY_DIRECTORY} to the input directory ({@link File}), in case of
150 * a local file) or from {@link #KEY_URL} to the {@link URL} (otherwise) of
151 * the processed document
152 * @throws IOException if an I/O error occurs
153 * @throws ProcessingException if an error occurs during processing
154 */
155 protected abstract void doProcess(final Reader reader, final Writer writer,
156 final ContextMap context) throws IOException, ProcessingException;
157
158 /***
159 * Returns the extension used for output files.
160 * @return the value of the attribute
161 */
162 public String getOutFileExt() {
163 return outFileExt;
164 }
165
166 /***
167 * Delegates to the abstract {@link #doProcess(Reader, Writer, ContextMap)}
168 * method and invokes a post-processor, if configured.
169 *
170 * @param reader reader containing the text to process; should not be closed
171 * by this method
172 * @param writer the writer to write the processed text to; might be flushed
173 * but not closed by this method; if this method does not use the writer,
174 * the underlying file will be deleted afterwards
175 * @param context a map of objects that are made available for processing;
176 * when called from the implemented <code>process</code> methods in this
177 * class, it will contain mappings from {@link IOUtils#KEY_LOCAL_CHARSET}
178 * to the character set of the output writer; from
179 * {@link ContentType#KEY_MIME_TYPE} to the document's MIME type; from
180 * {@link #KEY_LOCAL_NAME} to the local name (String) and either from
181 * {@link #KEY_DIRECTORY} to the directory ({@link File}), in case of a
182 * local file) or from {@link #KEY_URL} to the {@link URL} (otherwise) of
183 * the processed document
184 * @throws IOException if an I/O error occurs
185 * @throws ProcessingException if an error occurs during processing
186 */
187 public final void process(final Reader reader, final Writer writer,
188 final ContextMap context) throws IOException, ProcessingException {
189 final TextProcessor postProc;
190
191
192 if (postProcKey != null) {
193 final String localizedKey = getConfig().localizeKey(postProcKey);
194
195 synchronized (postProcessors) {
196 if (postProcessors.containsKey(localizedKey)) {
197
198 postProc = (TextProcessor) postProcessors.get(localizedKey);
199 } else {
200
201 if (getConfig().containsKey(localizedKey)) {
202
203
204 final String[] classDefinition =
205 TiesConfiguration.CONF.getStringArray(localizedKey);
206
207 try {
208 postProc = (TextProcessor)
209 Util.createObject(classDefinition);
210 Util.LOG.debug(getClass().getName()
211 + ": Initialized post-processor from "
212 + localizedKey + " (definition: "
213 + ArrayUtils.toString(classDefinition) + ")");
214 } catch (Exception e) {
215
216 throw new ProcessingException(
217 "Could not initialize post-processor from "
218 + localizedKey + " (definition: "
219 + ArrayUtils.toString(classDefinition) + ")",
220 e);
221 }
222 } else {
223
224 postProc = null;
225 }
226
227
228 postProcessors.put(localizedKey, postProc);
229 }
230 }
231 } else {
232 postProc = null;
233 }
234
235 if (postProc != null) {
236 final StringWriter interimWriter = new StringWriter();
237
238
239 doProcess(reader, interimWriter, context);
240 postProc.process(new StringReader(interimWriter.toString()),
241 writer, context);
242 } else {
243
244 doProcess(reader, writer, context);
245 }
246 }
247
248 /***
249 * Processes the contents of a file, delegating to the
250 * {@link #process(File, Writer, ContextMap)} method.
251 *
252 * @param file the file to process
253 * @param writer the writer to write the processed text to; not closed by
254 * this method
255 * @throws IOException if an I/O error occurs
256 * @throws ProcessingException if an error occurs during processing
257 */
258 public final void process(final File file, final Writer writer)
259 throws IOException, ProcessingException {
260
261 final ContextMap context = new ContextMap();
262 if (getConfig().containsKey(IOUtils.KEY_LOCAL_CHARSET)) {
263 context.put(IOUtils.KEY_LOCAL_CHARSET,
264 getConfig().getString(IOUtils.KEY_LOCAL_CHARSET));
265 }
266
267 process(file, writer, context);
268 }
269
270 /***
271 * Processes the contents of a file, delegating to the
272 * {@link #process(Reader, Writer, ContextMap)} method. Stores a mapping
273 * from {@link ContentType#KEY_MIME_TYPE} to the document's MIME type in the
274 * {@link ContextMap context}.
275 *
276 * @param file the file to process
277 * @param writer the writer to write the processed text to; not closed by
278 * this method
279 * @param context a map of objects that are made available for processing;
280 * should contain a mapping from {@link IOUtils#KEY_LOCAL_CHARSET} to the
281 * character set to use for local files
282 * @throws IOException if an I/O error occurs
283 * @throws ProcessingException if an error occurs during processing
284 */
285 public final void process(final File file, final Writer writer,
286 final ContextMap context)
287 throws IOException, ProcessingException {
288
289 final ContentType contentType =
290 ContentType.determineContentType(file, getConfig());
291
292
293 context.put(ContentType.KEY_MIME_TYPE, contentType.getMimeType());
294 context.put(KEY_DIRECTORY, file.getParentFile());
295 final String charset = (String) context.get(IOUtils.KEY_LOCAL_CHARSET);
296
297
298 final InputStreamReader reader = IOUtils.openReader(file, charset);
299
300 if (charset == null) {
301
302 context.put(IOUtils.KEY_LOCAL_CHARSET,
303 IOUtils.determineCharsetName(reader));
304 }
305
306 try {
307
308 process(reader, writer, context);
309 } finally {
310 IOUtils.tryToClose(reader);
311 }
312 }
313
314 /***
315 * Processes a file or URL given as input argument, delegating to the
316 * appropriate <code>process</code> method. A warning is logged if the input
317 * is neither a readable file nor a readable URL. Stores a mapping
318 * from {@link IOUtils#KEY_LOCAL_CHARSET} to the character set of the
319 * output writer in the created {@link ContextMap context}.
320 *
321 * @param inputName the name of a readable file or URL to process
322 * @throws IOException if an I/O error occurs during processing
323 * @throws ProcessingException if an error occurs during processing
324 */
325 public final void process(final String inputName)
326 throws IOException, ProcessingException {
327 final long startTime = System.currentTimeMillis();
328 URL inURL = null;
329 URLConnection inConn = null;
330 String dirName = null;
331 String localName = null;
332 boolean isOkay;
333 Util.LOG.debug("Starting to process " + inputName);
334
335
336 final File inFile = new File(inputName);
337 if (inFile.exists() && inFile.canRead()) {
338
339 dirName = inFile.getParent();
340 localName = inFile.getName();
341 isOkay = true;
342 } else {
343
344 try {
345 inURL = new URL(inputName);
346 inConn = inURL.openConnection();
347
348
349 dirName = null;
350 localName = IOUtils.getLocalName(inURL, true);
351 isOkay = true;
352 } catch (IOException ioe) {
353 isOkay = false;
354 }
355 }
356
357
358 final String outDirName =
359 getConfig().getString(KEY_OUT_DIRECTORY, dirName);
360 final File outDir = (outDirName != null) ? new File(outDirName) : null;
361
362 if (isOkay) {
363 final File outFile = IOUtils.createOutFile(outDir,
364 localName, getOutFileExt());
365 final OutputStreamWriter writer;
366 final String localCharset;
367
368
369 if (getConfig().containsKey(IOUtils.KEY_LOCAL_CHARSET)) {
370
371 localCharset = getConfig().getString(IOUtils.KEY_LOCAL_CHARSET);
372 writer = new OutputStreamWriter(
373 new FileOutputStream(outFile), localCharset);
374 } else {
375
376 localCharset = null;
377 writer = new FileWriter(outFile);
378 }
379
380
381 final ContextMap context = new ContextMap();
382 final String usedCharset = (localCharset != null)
383 ? localCharset
384 : IOUtils.determineCharsetName(writer);
385 context.put(IOUtils.KEY_LOCAL_CHARSET, usedCharset);
386 context.put(KEY_LOCAL_NAME, localName);
387 context.put(KEY_OUT_DIRECTORY, outFile.getParentFile());
388
389 try {
390 if (inConn != null) {
391
392 process(inConn, writer, context);
393 } else {
394
395 process(inFile, writer, context);
396 }
397
398 writer.flush();
399 } finally {
400 IOUtils.tryToClose(writer);
401
402
403 if (outFile.length() == 0) {
404 if (!outFile.delete()) {
405 Util.LOG.warn("Could not delete empty output file "
406 + outFile);
407 }
408 }
409 }
410
411 if (outFile.exists() && (outFile.length() > 0)) {
412 Util.LOG.info("Results of processing " + inputName
413 + " stored in " + outFile + " ("
414 + Util.showDuration(startTime) + ")");
415 } else {
416 Util.LOG.info("Processed " + inputName + " ("
417 + Util.showDuration(startTime) + ")");
418 }
419 } else {
420 Util.LOG.warn(inputName + " is neither a readable "
421 + "file nor a readable URL -- ignoring it");
422 }
423 }
424
425 /***
426 * Processes the contents of an URL connection, delegating to the
427 * {@link #process(URLConnection, Writer, ContextMap)} method.
428 * Assumed the {@linkplain IOUtils#STANDARD_HTTP_CHARSET standard HTTP
429 * character set} ("ISO-8859-1") if no {@link IOUtils#KEY_LOCAL_CHARSET}
430 * has been configured.
431 *
432 * @param urlConn the URL connection to process
433 * @param writer the writer to write the processed text to; not closed by
434 * this method
435 * @throws IOException if an I/O error occurs
436 * @throws ProcessingException if an error occurs during processing
437 */
438 public final void process(final URLConnection urlConn, final Writer writer)
439 throws IOException, ProcessingException {
440
441 final ContextMap context = new ContextMap();
442 final String charset = getConfig().getString(IOUtils.KEY_LOCAL_CHARSET,
443 IOUtils.STANDARD_HTTP_CHARSET);
444 context.put(IOUtils.KEY_LOCAL_CHARSET, charset);
445
446 process(urlConn, writer, context);
447 }
448
449 /***
450 * Processes the contents of an URL connection, delegating to the
451 * {@link #process(Reader, Writer, ContextMap)} method. Stores a mapping
452 * from {@link ContentType#KEY_MIME_TYPE} to the document's MIME type in the
453 * {@link ContextMap context}.
454 *
455 * @param urlConn the URL connection to process
456 * @param writer the writer to write the processed text to; not closed by
457 * this method
458 * @param context a map of objects that are made available for processing;
459 * must contain a mapping from {@link IOUtils#KEY_LOCAL_CHARSET} to the
460 * character set to use for local files
461 * @throws IOException if an I/O error occurs
462 * @throws ProcessingException if an error occurs during processing
463 */
464 public final void process(final URLConnection urlConn, final Writer writer,
465 final ContextMap context) throws IOException, ProcessingException {
466
467 final URL url = urlConn.getURL();
468 final String contentTypeHeader = urlConn.getContentType();
469 final ContentType contentType = ContentType.determineContentType(
470 url, contentTypeHeader, getConfig());
471
472
473 context.put(ContentType.KEY_MIME_TYPE,
474 contentType.getMimeType());
475 context.put(KEY_URL, url);
476
477
478 final String charset;
479 if (contentType.getCharset() != null) {
480
481 charset = contentType.getCharset();
482 } else {
483
484 charset = (String) context.get(IOUtils.KEY_LOCAL_CHARSET);
485 }
486
487 final InputStreamReader reader =
488 new InputStreamReader(urlConn.getInputStream(), charset);
489
490 try {
491
492 process(reader, writer, context);
493 } finally {
494 IOUtils.tryToClose(reader);
495 }
496 }
497
498 /***
499 * Returns a string representation of this object.
500 *
501 * @return a textual representation
502 */
503 public String toString() {
504 return new ToStringBuilder(this)
505 .append("output file extension", outFileExt)
506 .toString();
507 }
508
509 }