View Javadoc

1   /*
2    * Copyright (C) 2003-2006 Christian Siefkes <christian@siefkes.net>.
3    * Development of this software is supported by the German Research Society,
4    * Berlin-Brandenburg Graduate School in Distributed Information Systems
5    * (DFG grant no. GRK 316).
6    *
7    * This program is free software; you can redistribute it and/or modify
8    * it under the terms of the GNU General Public License as published by
9    * the Free Software Foundation; either version 2 of the License, or
10   * (at your option) any later version.
11   *
12   * This program is distributed in the hope that it will be useful,
13   * but WITHOUT ANY WARRANTY; without even the implied warranty of
14   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15   * GNU General Public License for more details.
16   *
17   * You should have received a copy of the GNU General Public License
18   * along with this program; if not, visit
19   * http://www.gnu.org/licenses/gpl.html or write to the Free Software
20   * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
21   */
22  package de.fu_berlin.ties;
23  
24  import java.io.File;
25  import java.io.FileOutputStream;
26  import java.io.FileWriter;
27  import java.io.IOException;
28  import java.io.InputStreamReader;
29  import java.io.OutputStreamWriter;
30  import java.io.Reader;
31  import java.io.StringReader;
32  import java.io.StringWriter;
33  import java.io.Writer;
34  import java.net.URL;
35  import java.net.URLConnection;
36  import java.util.HashMap;
37  import java.util.Map;
38  
39  import org.apache.commons.lang.ArrayUtils;
40  import org.apache.commons.lang.builder.ToStringBuilder;
41  
42  import de.fu_berlin.ties.io.ContentType;
43  import de.fu_berlin.ties.io.IOUtils;
44  import de.fu_berlin.ties.util.Util;
45  
46  /***
47   * Abstract base class for a {@link de.fu_berlin.ties.Processor} that operates
48   * on text documents. Input is read from a file or URL or
49   * {@link java.io.Reader}, output is written to a file or
50   * {@link java.io.Writer}.
51   *
52   * @author Christian Siefkes
53   * @version $Revision: 1.28 $, $Date: 2006/10/21 16:03:52 $, $Author: siefkes $
54   */
55  public abstract class TextProcessor extends ConfigurableProcessor {
56  
57      /***
58       * Configuration prefix for post-processors.
59       */
60      public static final String CONFIG_POST = "post";
61  
62      /***
63       * Context key referring to the local name of the processed document.
64       */
65      public static final String KEY_LOCAL_NAME = "local";
66  
67      /***
68       * Context key referring to the directory of the processed document,
69       * if it is a local file.
70       */
71      public static final String KEY_DIRECTORY = "directory";
72  
73      /***
74       * Context key referring output directory; if missing, the value of
75       * {@link #KEY_DIRECTORY} is used instead.
76       */
77      public static final String KEY_OUT_DIRECTORY = "outdir";
78  
79      /***
80       * Context key referring to the URL of the processed document, if loaded
81       * from an URL.
82       */
83      public static final String KEY_URL = "url";
84  
85      /***
86       * The extension used for output files.
87       */
88      private final String outFileExt;
89  
90      /***
91       * The configuration key for the preprocessor (or <code>null</code> if no
92       * <code>outFileExt</code> is given).
93       */
94      private final String postProcKey;
95  
96      /***
97       * A map of post-processors for different languages, if configured.
98       * Maps from localized or global {@link #CONFIG_POST} key to initialized
99       * {@link Processor} instance (or to <code>null</code> if no post-processor
100      * exists). Synchronized on itself.
101      */
102     private final Map<String, Processor> postProcessors =
103         new HashMap<String, Processor>();
104 
105     /***
106      * Creates a new instance, using the
107      * {@linkplain TiesConfiguration#CONF standard configuration}.
108      *
109      * @param outExt the extension to use for output files
110      */
111     public TextProcessor(final String outExt) {
112         this(outExt, null);
113     }
114 
115     /***
116      * Creates a new instance.
117      *
118      * @param outExt the extension to use for output files
119      * @param conf used to configure this instance; if <code>null</code>,
120      * the {@linkplain TiesConfiguration#CONF standard configuration} is used
121      */
122     public TextProcessor(final String outExt, final TiesConfiguration conf) {
123         super(conf);
124         outFileExt = outExt;
125 
126         if (outExt != null) {
127             postProcKey = TiesConfiguration.joinKey(CONFIG_POST, outExt);
128         } else {
129             postProcKey = null;
130         }
131     }
132 
133     /***
134      * Processes the contents of a reader, writing a modified version to a
135      * writer.
136      *
137      * @param reader reader containing the text to process; should not be closed
138      * by this method
139      * @param writer the writer to write the processed text to; might be flushed
140      * but not closed by this method; if this method does not use the writer,
141      * the underlying file will be deleted afterwards
142      * @param context a map of objects that are made available for processing;
143      * when called from the implemented <code>process</code> methods in this
144      * class, it will contain mappings from {@link IOUtils#KEY_LOCAL_CHARSET}
145      * to the character set of the output writer;
146      * from {@link #KEY_OUT_DIRECTORY} to the output directory ({@link File});
147      * from {@link ContentType#KEY_MIME_TYPE} to the document's MIME type; from
148      * {@link #KEY_LOCAL_NAME} to the local name (String) and either from
149      * {@link #KEY_DIRECTORY} to the input directory ({@link File}), in case of
150      * a local file) or from {@link #KEY_URL} to the {@link URL} (otherwise) of
151      * the processed document
152      * @throws IOException if an I/O error occurs
153      * @throws ProcessingException if an error occurs during processing
154      */
155     protected abstract void doProcess(final Reader reader, final Writer writer,
156             final ContextMap context) throws IOException, ProcessingException;
157 
158     /***
159      * Returns the extension used for output files.
160      * @return the value of the attribute
161      */
162     public String getOutFileExt() {
163         return outFileExt;
164     }
165 
166     /***
167      * Delegates to the abstract {@link #doProcess(Reader, Writer, ContextMap)}
168      * method and invokes a post-processor, if configured.
169      *
170      * @param reader reader containing the text to process; should not be closed
171      * by this method
172      * @param writer the writer to write the processed text to; might be flushed
173      * but not closed by this method; if this method does not use the writer,
174      * the underlying file will be deleted afterwards
175      * @param context a map of objects that are made available for processing;
176      * when called from the implemented <code>process</code> methods in this
177      * class, it will contain mappings from {@link IOUtils#KEY_LOCAL_CHARSET}
178      * to the character set of the output writer; from
179      * {@link ContentType#KEY_MIME_TYPE} to the document's MIME type; from
180      * {@link #KEY_LOCAL_NAME} to the local name (String) and either from
181      * {@link #KEY_DIRECTORY} to the directory ({@link File}), in case of a
182      * local file) or from {@link #KEY_URL} to the {@link URL} (otherwise) of
183      * the processed document
184      * @throws IOException if an I/O error occurs
185      * @throws ProcessingException if an error occurs during processing
186      */
187     public final void process(final Reader reader, final Writer writer,
188             final ContextMap context) throws IOException, ProcessingException {
189         final TextProcessor postProc;
190 
191         // determine whether there is a post-processor to invoke
192         if (postProcKey != null) {
193             final String localizedKey = getConfig().localizeKey(postProcKey);
194 
195             synchronized (postProcessors) {
196                 if (postProcessors.containsKey(localizedKey)) {
197                     // load stored instance resp. "null" if there is none
198                     postProc = (TextProcessor) postProcessors.get(localizedKey);
199                 } else {
200                     // try to load instance from configuration
201                     if (getConfig().containsKey(localizedKey)) {
202                         // first argument is class name, any further arguments
203                         // are constructor parameters
204                         final String[] classDefinition =
205                             TiesConfiguration.CONF.getStringArray(localizedKey);
206 
207                         try {
208                             postProc = (TextProcessor)
209                                 Util.createObject(classDefinition);
210                             Util.LOG.debug(getClass().getName()
211                                 + ": Initialized post-processor from "
212                                 + localizedKey + " (definition: "
213                                 + ArrayUtils.toString(classDefinition) + ")");
214                         } catch (Exception e) {
215                             // wrap and rethrow
216                             throw new ProcessingException(
217                                 "Could not initialize post-processor from "
218                                 + localizedKey + " (definition: "
219                                 + ArrayUtils.toString(classDefinition) + ")",
220                                 e);
221                         }
222                     } else {
223                         // remember that there is no post-processor
224                         postProc = null;
225                     }
226 
227                     // store initialized resp. null processor in map
228                     postProcessors.put(localizedKey, postProc);
229                 }
230             } // synchronized
231         } else {
232             postProc = null;
233         }
234 
235         if (postProc != null) {
236             final StringWriter interimWriter = new StringWriter();
237 
238             // delegate to abstract method and then to post-processor
239             doProcess(reader, interimWriter, context);
240             postProc.process(new StringReader(interimWriter.toString()),
241                 writer, context);
242         } else {
243             // delegate to abstract method
244             doProcess(reader, writer, context);
245         }
246     }
247 
248     /***
249      * Processes the contents of a file, delegating to the
250      * {@link #process(File, Writer, ContextMap)} method.
251      *
252      * @param file the file to process
253      * @param writer the writer to write the processed text to; not closed by
254      * this method
255      * @throws IOException if an I/O error occurs
256      * @throws ProcessingException if an error occurs during processing
257      */
258     public final void process(final File file, final Writer writer)
259     throws IOException, ProcessingException {
260         // create a context and store local charset, if configured
261         final ContextMap context = new ContextMap();
262         if (getConfig().containsKey(IOUtils.KEY_LOCAL_CHARSET)) {
263             context.put(IOUtils.KEY_LOCAL_CHARSET,
264                 getConfig().getString(IOUtils.KEY_LOCAL_CHARSET));
265         }
266 
267         process(file, writer, context);
268     }
269 
270     /***
271      * Processes the contents of a file, delegating to the
272      * {@link #process(Reader, Writer, ContextMap)} method. Stores a mapping
273      * from {@link ContentType#KEY_MIME_TYPE} to the document's MIME type in the
274      * {@link ContextMap context}.
275      *
276      * @param file the file to process
277      * @param writer the writer to write the processed text to; not closed by
278      * this method
279      * @param context a map of objects that are made available for processing;
280      * should contain a mapping from {@link IOUtils#KEY_LOCAL_CHARSET} to the
281      * character set to use for local files
282      * @throws IOException if an I/O error occurs
283      * @throws ProcessingException if an error occurs during processing
284      */
285     public final void process(final File file, final Writer writer,
286             final ContextMap context)
287     throws IOException, ProcessingException {
288         // rely on file extension for content type
289         final ContentType contentType =
290             ContentType.determineContentType(file, getConfig());
291 
292         // store the MIME type + directory in the context
293         context.put(ContentType.KEY_MIME_TYPE, contentType.getMimeType());
294         context.put(KEY_DIRECTORY, file.getParentFile());
295         final String charset = (String) context.get(IOUtils.KEY_LOCAL_CHARSET);
296 
297         // open reader using configured charset
298         final InputStreamReader reader = IOUtils.openReader(file, charset);
299 
300         if (charset == null) {
301             // no charset given-- store the platform-specific default charset
302             context.put(IOUtils.KEY_LOCAL_CHARSET,
303                     IOUtils.determineCharsetName(reader));
304         }
305 
306         try {
307             // delegate to actual method
308             process(reader, writer, context);
309         } finally {
310             IOUtils.tryToClose(reader);
311         }
312     }
313 
314     /***
315      * Processes a file or URL given as input argument, delegating to the
316      * appropriate <code>process</code> method. A warning is logged if the input
317      * is neither a readable file nor a readable URL. Stores a mapping
318      * from {@link IOUtils#KEY_LOCAL_CHARSET} to the character set of the
319      * output writer in the created {@link ContextMap context}.
320      *
321      * @param inputName the name of a readable file or URL to process
322      * @throws IOException if an I/O error occurs during processing
323      * @throws ProcessingException if an error occurs during processing
324      */
325     public final void process(final String inputName)
326             throws IOException, ProcessingException {
327         final long startTime = System.currentTimeMillis();
328         URL inURL = null;
329         URLConnection inConn = null;
330         String dirName = null;
331         String localName = null;
332         boolean isOkay;
333         Util.LOG.debug("Starting to process " + inputName);
334 
335         // check whether it is a file or an URL
336         final File inFile = new File(inputName);
337         if (inFile.exists() && inFile.canRead()) {
338             // for determining output file name
339             dirName = inFile.getParent();
340             localName = inFile.getName();
341             isOkay = true;
342         } else {
343             // if it's not a file check whether it's an URL
344             try {
345                 inURL = new URL(inputName);
346                 inConn = inURL.openConnection();
347 
348                 // for determining output file name
349                 dirName = null;
350                 localName = IOUtils.getLocalName(inURL, true);
351                 isOkay = true;
352             } catch (IOException ioe) {
353                 isOkay = false;
354             }
355         }
356 
357         // where to write the output file
358         final String outDirName =
359             getConfig().getString(KEY_OUT_DIRECTORY, dirName);
360         final File outDir = (outDirName != null) ? new File(outDirName) : null;
361 
362         if (isOkay) {
363             final File outFile = IOUtils.createOutFile(outDir,
364                     localName, getOutFileExt());
365             final OutputStreamWriter writer;
366             final String localCharset;
367 
368             // check whether a local charset is configured
369             if (getConfig().containsKey(IOUtils.KEY_LOCAL_CHARSET)) {
370                 // use configured charset
371                 localCharset = getConfig().getString(IOUtils.KEY_LOCAL_CHARSET);
372                 writer = new OutputStreamWriter(
373                     new FileOutputStream(outFile), localCharset);
374             } else {
375                 // use platform default charset for output
376                 localCharset = null;
377                 writer = new FileWriter(outFile);
378             }
379 
380             // create a context and store local charset + local name + outdir
381             final ContextMap context = new ContextMap();
382             final String usedCharset = (localCharset != null)
383                 ? localCharset
384                 : IOUtils.determineCharsetName(writer);
385             context.put(IOUtils.KEY_LOCAL_CHARSET, usedCharset);
386             context.put(KEY_LOCAL_NAME, localName);
387             context.put(KEY_OUT_DIRECTORY, outFile.getParentFile());
388 
389             try {
390                 if (inConn != null) {
391                     // delegate to URL processing method
392                     process(inConn, writer, context);
393                 } else {
394                     // delegate to file processing method
395                     process(inFile, writer, context);
396                 }
397 
398                 writer.flush();
399             } finally {
400                 IOUtils.tryToClose(writer);
401 
402                 // delete the out file if no output was written
403                 if (outFile.length() == 0) {
404                     if (!outFile.delete()) {
405                         Util.LOG.warn("Could not delete empty output file "
406                             + outFile);
407                     }
408                 }
409             }
410 
411             if (outFile.exists() && (outFile.length() > 0)) {
412                 Util.LOG.info("Results of processing " + inputName
413                     + " stored in " + outFile + " ("
414                     + Util.showDuration(startTime) + ")");
415             } else {
416                 Util.LOG.info("Processed " + inputName + " ("
417                     + Util.showDuration(startTime) + ")");
418             }
419         } else {
420             Util.LOG.warn(inputName + " is neither a readable "
421                 + "file nor a readable URL -- ignoring it");
422         }
423     }
424 
425     /***
426      * Processes the contents of an URL connection, delegating to the
427      * {@link #process(URLConnection, Writer, ContextMap)} method.
428      * Assumed the {@linkplain IOUtils#STANDARD_HTTP_CHARSET standard HTTP
429      * character set} ("ISO-8859-1") if no {@link IOUtils#KEY_LOCAL_CHARSET}
430      * has been configured.
431      *
432      * @param urlConn the URL connection to process
433      * @param writer the writer to write the processed text to; not closed by
434      * this method
435      * @throws IOException if an I/O error occurs
436      * @throws ProcessingException if an error occurs during processing
437      */
438     public final void process(final URLConnection urlConn, final Writer writer)
439     throws IOException, ProcessingException {
440         // create a context and store local charset (default to HTTP standard)
441         final ContextMap context = new ContextMap();
442         final String charset = getConfig().getString(IOUtils.KEY_LOCAL_CHARSET,
443                 IOUtils.STANDARD_HTTP_CHARSET);
444         context.put(IOUtils.KEY_LOCAL_CHARSET, charset);
445 
446         process(urlConn, writer, context);
447     }
448 
449     /***
450      * Processes the contents of an URL connection, delegating to the
451      * {@link #process(Reader, Writer, ContextMap)} method. Stores a mapping
452      * from {@link ContentType#KEY_MIME_TYPE} to the document's MIME type in the
453      * {@link ContextMap context}.
454      *
455      * @param urlConn the URL connection to process
456      * @param writer the writer to write the processed text to; not closed by
457      * this method
458      * @param context a map of objects that are made available for processing;
459      * must contain a mapping from {@link IOUtils#KEY_LOCAL_CHARSET} to the
460      * character set to use for local files
461      * @throws IOException if an I/O error occurs
462      * @throws ProcessingException if an error occurs during processing
463      */
464     public final void process(final URLConnection urlConn, final Writer writer,
465             final ContextMap context) throws IOException, ProcessingException {
466         // use Content-Type header + URL extension to determine content type
467         final URL url = urlConn.getURL();
468         final String contentTypeHeader = urlConn.getContentType();
469         final ContentType contentType = ContentType.determineContentType(
470                 url, contentTypeHeader, getConfig());
471 
472         // store the MIME type + URL in the context
473         context.put(ContentType.KEY_MIME_TYPE,
474             contentType.getMimeType());
475         context.put(KEY_URL, url);
476 
477         // determine charset + open reader
478         final String charset;
479         if (contentType.getCharset() != null) {
480             // use determined charset
481             charset = contentType.getCharset();
482         } else {
483             // use configured local charset
484             charset = (String) context.get(IOUtils.KEY_LOCAL_CHARSET);
485         }
486 
487         final InputStreamReader reader =
488             new InputStreamReader(urlConn.getInputStream(), charset);
489 
490         try {
491             // delegate to actual method
492             process(reader, writer, context);
493         } finally {
494             IOUtils.tryToClose(reader);
495         }
496     }
497 
498     /***
499      * Returns a string representation of this object.
500      *
501      * @return a textual representation
502      */
503     public String toString() {
504         return new ToStringBuilder(this)
505             .append("output file extension", outFileExt)
506             .toString();
507     }
508 
509 }