1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22 package de.fu_berlin.ties;
23
24 import java.io.File;
25 import java.io.FileOutputStream;
26 import java.io.FileWriter;
27 import java.io.IOException;
28 import java.io.InputStreamReader;
29 import java.io.OutputStreamWriter;
30 import java.io.Reader;
31 import java.io.StringReader;
32 import java.io.StringWriter;
33 import java.io.Writer;
34 import java.net.URL;
35 import java.net.URLConnection;
36 import java.util.HashMap;
37 import java.util.Map;
38
39 import org.apache.commons.lang.ArrayUtils;
40 import org.apache.commons.lang.builder.ToStringBuilder;
41
42 import de.fu_berlin.ties.io.ContentType;
43 import de.fu_berlin.ties.io.IOUtils;
44 import de.fu_berlin.ties.util.Util;
45
46 /***
47 * Abstract base class for a {@link de.fu_berlin.ties.Processor} that operates
48 * on text documents. Input is read from a file or URL or
49 * {@link java.io.Reader}, output is written to a file or
50 * {@link java.io.Writer}.
51 *
52 * @author Christian Siefkes
53 * @version $Revision: 1.22 $, $Date: 2004/10/12 14:17:19 $, $Author: siefkes $
54 */
55 public abstract class TextProcessor extends ConfigurableProcessor {
56
57 /***
58 * Configuration prefix for post-processors.
59 */
60 public static final String CONFIG_POST = "post";
61
62 /***
63 * Context key referring to the local name of the processed document.
64 */
65 public static final String KEY_LOCAL_NAME = "local";
66
67 /***
68 * Context key referring to the directory of the processed document,
69 * if it is a local file.
70 */
71 public static final String KEY_DIRECTORY = "directory";
72
73 /***
74 * Context key referring output directory; if missing, the value of
75 * {@link #KEY_DIRECTORY} is used instead.
76 */
77 public static final String KEY_OUT_DIRECTORY = "outdir";
78
79 /***
80 * Context key referring to the URL of the processed document, if loaded
81 * from an URL.
82 */
83 public static final String KEY_URL = "url";
84
85 /***
86 * The extension used for output files.
87 */
88 private final String outFileExt;
89
90 /***
91 * The configuration key for the preprocessor (or <code>null</code> if no
92 * <code>outFileExt</code> is given).
93 */
94 private final String postProcKey;
95
96 /***
97 * A map of post-processors for different languages, if configured.
98 * Maps from localized or global {@link #CONFIG_POST} key to initialized
99 * {@link Processor} instance (or to <code>null</code> if no post-processor
100 * exists). Synchronized on itself.
101 */
102 private final Map<String, Processor> postProcessors =
103 new HashMap<String, Processor>();
104
105 /***
106 * Creates a new instance.
107 *
108 * @param outExt the extension to use for output files
109 * @param conf used to configure this instance; if <code>null</code>,
110 * the {@linkplain TiesConfiguration#CONF standard configuration} is used
111 */
112 public TextProcessor(final String outExt, final TiesConfiguration conf) {
113 super(conf);
114 outFileExt = outExt;
115
116 if (outExt != null) {
117 postProcKey = TiesConfiguration.joinKey(CONFIG_POST, outExt);
118 } else {
119 postProcKey = null;
120 }
121 }
122
123 /***
124 * Processes the contents of a reader, writing a modified version to a
125 * writer.
126 *
127 * @param reader reader containing the text to process; should not be closed
128 * by this method
129 * @param writer the writer to write the processed text to; might be flushed
130 * but not closed by this method; if this method does not use the writer,
131 * the underlying file will be deleted afterwards
132 * @param context a map of objects that are made available for processing;
133 * when called from the implemented <code>process</code> methods in this
134 * class, it will contain mappings from {@link IOUtils#KEY_LOCAL_CHARSET}
135 * to the character set of the output writer; from
136 * {@link ContentType#KEY_MIME_TYPE} to the document's MIME type; from
137 * {@link #KEY_LOCAL_NAME} to the local name (String) and either from
138 * {@link #KEY_DIRECTORY} to the directory ({@link File}), in case of a
139 * local file) or from {@link #KEY_URL} to the {@link URL} (otherwise) of
140 * the processed document
141 * @throws IOException if an I/O error occurs
142 * @throws ProcessingException if an error occurs during processing
143 */
144 protected abstract void doProcess(final Reader reader, final Writer writer,
145 final ContextMap context) throws IOException, ProcessingException;
146
147 /***
148 * Returns the extension used for output files.
149 * @return the value of the attribute
150 */
151 public String getOutFileExt() {
152 return outFileExt;
153 }
154
155 /***
156 * Delegates to the abstract {@link #doProcess(Reader, Writer, ContextMap)}
157 * method and invokes a post-processor, if configured.
158 *
159 * @param reader reader containing the text to process; should not be closed
160 * by this method
161 * @param writer the writer to write the processed text to; might be flushed
162 * but not closed by this method; if this method does not use the writer,
163 * the underlying file will be deleted afterwards
164 * @param context a map of objects that are made available for processing;
165 * when called from the implemented <code>process</code> methods in this
166 * class, it will contain mappings from {@link IOUtils#KEY_LOCAL_CHARSET}
167 * to the character set of the output writer; from
168 * {@link ContentType#KEY_MIME_TYPE} to the document's MIME type; from
169 * {@link #KEY_LOCAL_NAME} to the local name (String) and either from
170 * {@link #KEY_DIRECTORY} to the directory ({@link File}), in case of a
171 * local file) or from {@link #KEY_URL} to the {@link URL} (otherwise) of
172 * the processed document
173 * @throws IOException if an I/O error occurs
174 * @throws ProcessingException if an error occurs during processing
175 */
176 public final void process(final Reader reader, final Writer writer,
177 final ContextMap context) throws IOException, ProcessingException {
178 final TextProcessor postProc;
179
180
181 if (postProcKey != null) {
182 final String localizedKey = getConfig().localizeKey(postProcKey);
183
184 synchronized (postProcessors) {
185 if (postProcessors.containsKey(localizedKey)) {
186
187 postProc = (TextProcessor) postProcessors.get(localizedKey);
188 } else {
189
190 if (getConfig().containsKey(localizedKey)) {
191
192
193 final String[] classDefinition =
194 TiesConfiguration.CONF.getStringArray(localizedKey);
195
196 try {
197 postProc = (TextProcessor)
198 Util.createObject(classDefinition);
199 Util.LOG.debug(getClass().getName()
200 + ": Initialized post-processor from "
201 + localizedKey + " (definition: "
202 + ArrayUtils.toString(classDefinition) + ")");
203 } catch (Exception e) {
204
205 throw new ProcessingException(
206 "Could not initialize post-processor from "
207 + localizedKey + " (definition: "
208 + ArrayUtils.toString(classDefinition) + ")",
209 e);
210 }
211 } else {
212
213 postProc = null;
214 }
215
216
217 postProcessors.put(localizedKey, postProc);
218 }
219 }
220 } else {
221 postProc = null;
222 }
223
224 if (postProc != null) {
225 final StringWriter interimWriter = new StringWriter();
226
227
228 doProcess(reader, interimWriter, context);
229 postProc.process(new StringReader(interimWriter.toString()),
230 writer, context);
231 } else {
232
233 doProcess(reader, writer, context);
234 }
235 }
236
237 /***
238 * Processes the contents of a file, delegating to the
239 * {@link #process(File, Writer, ContextMap)} method.
240 *
241 * @param file the file to process
242 * @param writer the writer to write the processed text to; not closed by
243 * this method
244 * @throws IOException if an I/O error occurs
245 * @throws ProcessingException if an error occurs during processing
246 */
247 public final void process(final File file, final Writer writer)
248 throws IOException, ProcessingException {
249
250 final ContextMap context = new ContextMap();
251 if (getConfig().containsKey(IOUtils.KEY_LOCAL_CHARSET)) {
252 context.put(IOUtils.KEY_LOCAL_CHARSET,
253 getConfig().getString(IOUtils.KEY_LOCAL_CHARSET));
254 }
255
256 process(file, writer, context);
257 }
258
259 /***
260 * Processes the contents of a file, delegating to the
261 * {@link #process(Reader, Writer, ContextMap)} method. Stores a mapping
262 * from {@link ContentType#KEY_MIME_TYPE} to the document's MIME type in the
263 * {@link ContextMap context}.
264 *
265 * @param file the file to process
266 * @param writer the writer to write the processed text to; not closed by
267 * this method
268 * @param context a map of objects that are made available for processing;
269 * should contain a mapping from {@link IOUtils#KEY_LOCAL_CHARSET} to the
270 * character set to use for local files
271 * @throws IOException if an I/O error occurs
272 * @throws ProcessingException if an error occurs during processing
273 */
274 public final void process(final File file, final Writer writer,
275 final ContextMap context)
276 throws IOException, ProcessingException {
277
278 final ContentType contentType =
279 ContentType.determineContentType(file, getConfig());
280
281
282 context.put(ContentType.KEY_MIME_TYPE, contentType.getMimeType());
283 context.put(KEY_DIRECTORY, file.getParentFile());
284 final String charset = (String) context.get(IOUtils.KEY_LOCAL_CHARSET);
285
286
287 final InputStreamReader reader = IOUtils.openReader(file, charset);
288
289 if (charset == null) {
290
291 context.put(IOUtils.KEY_LOCAL_CHARSET,
292 IOUtils.determineCharsetName(reader));
293 }
294
295 try {
296
297 process(reader, writer, context);
298 } finally {
299 IOUtils.tryToClose(reader);
300 }
301 }
302
303 /***
304 * Processes a file or URL given as input argument, delegating to the
305 * appropriate <code>process</code> method. A warning is logged if the input
306 * is neither a readable file nor a readable URL. Stores a mapping
307 * from {@link IOUtils#KEY_LOCAL_CHARSET} to the character set of the
308 * output writer in the created {@link ContextMap context}.
309 *
310 * @param inputName the name of a readable file or URL to process
311 * @throws IOException if an I/O error occurs during processing
312 * @throws ProcessingException if an error occurs during processing
313 */
314 public final void process(final String inputName)
315 throws IOException, ProcessingException {
316 final long startTime = System.currentTimeMillis();
317 URL inURL = null;
318 URLConnection inConn = null;
319 String dirName = null;
320 String localName = null;
321 boolean isOkay;
322 Util.LOG.debug("Starting to process " + inputName);
323
324
325 final File inFile = new File(inputName);
326 if (inFile.exists() && inFile.canRead()) {
327
328 dirName = inFile.getParent();
329 localName = inFile.getName();
330 isOkay = true;
331 } else {
332
333 try {
334 inURL = new URL(inputName);
335 inConn = inURL.openConnection();
336
337
338 dirName = null;
339 localName = IOUtils.getLocalName(inURL, true);
340 isOkay = true;
341 } catch (IOException ioe) {
342 isOkay = false;
343 }
344 }
345
346
347 final String outDirName =
348 getConfig().getString(KEY_OUT_DIRECTORY, dirName);
349 final File outDir = (outDirName != null) ? new File(outDirName) : null;
350
351 if (isOkay) {
352 final File outFile = IOUtils.createOutFile(outDir,
353 localName, getOutFileExt());
354 final OutputStreamWriter writer;
355 final String localCharset;
356
357
358 if (getConfig().containsKey(IOUtils.KEY_LOCAL_CHARSET)) {
359
360 localCharset = getConfig().getString(IOUtils.KEY_LOCAL_CHARSET);
361 writer = new OutputStreamWriter(
362 new FileOutputStream(outFile), localCharset);
363 } else {
364
365 localCharset = null;
366 writer = new FileWriter(outFile);
367 }
368
369
370 final ContextMap context = new ContextMap();
371 final String usedCharset = (localCharset != null)
372 ? localCharset
373 : IOUtils.determineCharsetName(writer);
374 context.put(IOUtils.KEY_LOCAL_CHARSET, usedCharset);
375 context.put(KEY_LOCAL_NAME, localName);
376
377 try {
378 if (inConn != null) {
379
380 process(inConn, writer, context);
381 } else {
382
383 process(inFile, writer, context);
384 }
385
386 writer.flush();
387 } finally {
388 IOUtils.tryToClose(writer);
389
390
391 if (outFile.length() == 0) {
392 if (!outFile.delete()) {
393 Util.LOG.warn("Could not delete empty output file "
394 + outFile);
395 }
396 }
397 }
398
399 if (outFile.exists() && (outFile.length() > 0)) {
400 Util.LOG.info("Results of processing " + inputName
401 + " stored in " + outFile + " ("
402 + Util.showDuration(startTime) + ")");
403 } else {
404 Util.LOG.info("Processed " + inputName + " ("
405 + Util.showDuration(startTime) + ")");
406 }
407 } else {
408 Util.LOG.warn(inputName + " is neither a readable "
409 + "file nor a readable URL -- ignoring it");
410 }
411 }
412
413 /***
414 * Processes the contents of an URL connection, delegating to the
415 * {@link #process(URLConnection, Writer, ContextMap)} method.
416 * Assumed the {@linkplain IOUtils#STANDARD_HTTP_CHARSET standard HTTP
417 * character set} ("ISO-8859-1") if no {@link IOUtils#KEY_LOCAL_CHARSET} has been configured.
418 *
419 * @param urlConn the URL connection to process
420 * @param writer the writer to write the processed text to; not closed by
421 * this method
422 * @throws IOException if an I/O error occurs
423 * @throws ProcessingException if an error occurs during processing
424 */
425 public final void process(final URLConnection urlConn, final Writer writer)
426 throws IOException, ProcessingException {
427
428 final ContextMap context = new ContextMap();
429 final String charset = getConfig().getString(IOUtils.KEY_LOCAL_CHARSET,
430 IOUtils.STANDARD_HTTP_CHARSET);
431 context.put(IOUtils.KEY_LOCAL_CHARSET, charset);
432
433 process(urlConn, writer, context);
434 }
435
436 /***
437 * Processes the contents of an URL connection, delegating to the
438 * {@link #process(Reader, Writer, ContextMap)} method. Stores a mapping
439 * from {@link ContentType#KEY_MIME_TYPE} to the document's MIME type in the
440 * {@link ContextMap context}.
441 *
442 * @param urlConn the URL connection to process
443 * @param writer the writer to write the processed text to; not closed by
444 * this method
445 * @param context a map of objects that are made available for processing;
446 * must contain a mapping from {@link IOUtils#KEY_LOCAL_CHARSET} to the
447 * character set to use for local files
448 * @throws IOException if an I/O error occurs
449 * @throws ProcessingException if an error occurs during processing
450 */
451 public final void process(final URLConnection urlConn, final Writer writer,
452 final ContextMap context) throws IOException, ProcessingException {
453
454 final URL url = urlConn.getURL();
455 final String contentTypeHeader = urlConn.getContentType();
456 final ContentType contentType = ContentType.determineContentType(
457 url, contentTypeHeader, getConfig());
458
459
460 context.put(ContentType.KEY_MIME_TYPE,
461 contentType.getMimeType());
462 context.put(KEY_URL, url);
463
464
465 final String charset;
466 if (contentType.getCharset() != null) {
467
468 charset = contentType.getCharset();
469 } else {
470
471 charset = (String) context.get(IOUtils.KEY_LOCAL_CHARSET);
472 }
473
474 final InputStreamReader reader =
475 new InputStreamReader(urlConn.getInputStream(), charset);
476
477 try {
478
479 process(reader, writer, context);
480 } finally {
481 IOUtils.tryToClose(reader);
482 }
483 }
484
485 /***
486 * Returns a string representation of this object.
487 *
488 * @return a textual representation
489 */
490 public String toString() {
491 return new ToStringBuilder(this)
492 .append("output file extension", outFileExt)
493 .toString();
494 }
495
496 }