View Javadoc

1   /*
2    * Copyright (C) 2003-2004 Christian Siefkes <christian@siefkes.net>.
3    * Development of this software is supported by the German Research Society,
4    * Berlin-Brandenburg Graduate School in Distributed Information Systems
5    * (DFG grant no. GRK 316).
6    *
7    * This library is free software; you can redistribute it and/or
8    * modify it under the terms of the GNU Lesser General Public
9    * License as published by the Free Software Foundation; either
10   * version 2.1 of the License, or (at your option) any later version.
11   *
12   * This library is distributed in the hope that it will be useful,
13   * but WITHOUT ANY WARRANTY; without even the implied warranty of
14   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15   * Lesser General Public License for more details.
16   *
17   * You should have received a copy of the GNU Lesser General Public
18   * License along with this library; if not, visit
19   * http://www.gnu.org/licenses/lgpl.html or write to the Free Software
20   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
21   */
22  package de.fu_berlin.ties;
23  
24  import java.io.File;
25  import java.io.FileOutputStream;
26  import java.io.FileWriter;
27  import java.io.IOException;
28  import java.io.InputStreamReader;
29  import java.io.OutputStreamWriter;
30  import java.io.Reader;
31  import java.io.StringReader;
32  import java.io.StringWriter;
33  import java.io.Writer;
34  import java.net.URL;
35  import java.net.URLConnection;
36  import java.util.HashMap;
37  import java.util.Map;
38  
39  import org.apache.commons.lang.ArrayUtils;
40  import org.apache.commons.lang.builder.ToStringBuilder;
41  
42  import de.fu_berlin.ties.io.ContentType;
43  import de.fu_berlin.ties.io.IOUtils;
44  import de.fu_berlin.ties.util.Util;
45  
46  /***
47   * Abstract base class for a {@link de.fu_berlin.ties.Processor} that operates
48   * on text documents. Input is read from a file or URL or
49   * {@link java.io.Reader}, output is written to a file or
50   * {@link java.io.Writer}.
51   *
52   * @author Christian Siefkes
53   * @version $Revision: 1.22 $, $Date: 2004/10/12 14:17:19 $, $Author: siefkes $
54   */
55  public abstract class TextProcessor extends ConfigurableProcessor {
56  
57      /***
58       * Configuration prefix for post-processors.
59       */
60      public static final String CONFIG_POST = "post";
61  
62      /***
63       * Context key referring to the local name of the processed document.
64       */
65      public static final String KEY_LOCAL_NAME = "local";
66  
67      /***
68       * Context key referring to the directory of the processed document,
69       * if it is a local file.
70       */
71      public static final String KEY_DIRECTORY = "directory";
72  
73      /***
74       * Context key referring output directory; if missing, the value of
75       * {@link #KEY_DIRECTORY} is used instead.
76       */
77      public static final String KEY_OUT_DIRECTORY = "outdir";
78  
79      /***
80       * Context key referring to the URL of the processed document, if loaded
81       * from an URL.
82       */
83      public static final String KEY_URL = "url";
84  
85      /***
86       * The extension used for output files.
87       */
88      private final String outFileExt;
89  
90      /***
91       * The configuration key for the preprocessor (or <code>null</code> if no
92       * <code>outFileExt</code> is given).
93       */
94      private final String postProcKey;
95  
96      /***
97       * A map of post-processors for different languages, if configured.
98       * Maps from localized or global {@link #CONFIG_POST} key to initialized
99       * {@link Processor} instance (or to <code>null</code> if no post-processor
100      * exists). Synchronized on itself.
101      */
102     private final Map<String, Processor> postProcessors =
103         new HashMap<String, Processor>();
104 
105     /***
106      * Creates a new instance.
107      *
108      * @param outExt the extension to use for output files
109      * @param conf used to configure this instance; if <code>null</code>,
110      * the {@linkplain TiesConfiguration#CONF standard configuration} is used
111      */
112     public TextProcessor(final String outExt, final TiesConfiguration conf) {
113         super(conf);
114         outFileExt = outExt;
115 
116         if (outExt != null) {
117             postProcKey = TiesConfiguration.joinKey(CONFIG_POST, outExt);
118         } else {
119             postProcKey = null;
120         }
121     }
122 
123     /***
124      * Processes the contents of a reader, writing a modified version to a
125      * writer.
126      *
127      * @param reader reader containing the text to process; should not be closed
128      * by this method
129      * @param writer the writer to write the processed text to; might be flushed
130      * but not closed by this method; if this method does not use the writer,
131      * the underlying file will be deleted afterwards
132      * @param context a map of objects that are made available for processing;
133      * when called from the implemented <code>process</code> methods in this
134      * class, it will contain mappings from {@link IOUtils#KEY_LOCAL_CHARSET}
135      * to the character set of the output writer; from
136      * {@link ContentType#KEY_MIME_TYPE} to the document's MIME type; from
137      * {@link #KEY_LOCAL_NAME} to the local name (String) and either from
138      * {@link #KEY_DIRECTORY} to the directory ({@link File}), in case of a
139      * local file) or from {@link #KEY_URL} to the {@link URL} (otherwise) of
140      * the processed document
141      * @throws IOException if an I/O error occurs
142      * @throws ProcessingException if an error occurs during processing
143      */
144     protected abstract void doProcess(final Reader reader, final Writer writer,
145             final ContextMap context) throws IOException, ProcessingException;
146 
147     /***
148      * Returns the extension used for output files.
149      * @return the value of the attribute
150      */
151     public String getOutFileExt() {
152         return outFileExt;
153     }
154 
155     /***
156      * Delegates to the abstract {@link #doProcess(Reader, Writer, ContextMap)}
157      * method and invokes a post-processor, if configured.
158      *
159      * @param reader reader containing the text to process; should not be closed
160      * by this method
161      * @param writer the writer to write the processed text to; might be flushed
162      * but not closed by this method; if this method does not use the writer,
163      * the underlying file will be deleted afterwards
164      * @param context a map of objects that are made available for processing;
165      * when called from the implemented <code>process</code> methods in this
166      * class, it will contain mappings from {@link IOUtils#KEY_LOCAL_CHARSET}
167      * to the character set of the output writer; from
168      * {@link ContentType#KEY_MIME_TYPE} to the document's MIME type; from
169      * {@link #KEY_LOCAL_NAME} to the local name (String) and either from
170      * {@link #KEY_DIRECTORY} to the directory ({@link File}), in case of a
171      * local file) or from {@link #KEY_URL} to the {@link URL} (otherwise) of
172      * the processed document
173      * @throws IOException if an I/O error occurs
174      * @throws ProcessingException if an error occurs during processing
175      */
176     public final void process(final Reader reader, final Writer writer,
177             final ContextMap context) throws IOException, ProcessingException {
178         final TextProcessor postProc;
179 
180         // determine whether there is a post-processor to invoke
181         if (postProcKey != null) {
182             final String localizedKey = getConfig().localizeKey(postProcKey);
183 
184             synchronized (postProcessors) {
185                 if (postProcessors.containsKey(localizedKey)) {
186                     // load stored instance resp. "null" if there is none
187                     postProc = (TextProcessor) postProcessors.get(localizedKey);
188                 } else {
189                     // try to load instance from configuration
190                     if (getConfig().containsKey(localizedKey)) {
191                         // first argument is class name, any further arguments
192                         // are constructor parameters
193                         final String[] classDefinition =
194                             TiesConfiguration.CONF.getStringArray(localizedKey);
195 
196                         try {
197                             postProc = (TextProcessor)
198                                 Util.createObject(classDefinition);
199                             Util.LOG.debug(getClass().getName()
200                                 + ": Initialized post-processor from "
201                                 + localizedKey + " (definition: "
202                                 + ArrayUtils.toString(classDefinition) + ")");
203                         } catch (Exception e) {
204                             // wrap and rethrow
205                             throw new ProcessingException(
206                                 "Could not initialize post-processor from "
207                                 + localizedKey + " (definition: "
208                                 + ArrayUtils.toString(classDefinition) + ")",
209                                 e);
210                         }
211                     } else {
212                         // remember that there is no post-processor
213                         postProc = null;
214                     }
215 
216                     // store initialized resp. null processor in map
217                     postProcessors.put(localizedKey, postProc);
218                 }
219             } // synchronized
220         } else {
221             postProc = null;
222         }
223 
224         if (postProc != null) {
225             final StringWriter interimWriter = new StringWriter();
226 
227             // delegate to abstract method and then to post-processor
228             doProcess(reader, interimWriter, context);
229             postProc.process(new StringReader(interimWriter.toString()),
230                 writer, context);
231         } else {
232             // delegate to abstract method
233             doProcess(reader, writer, context);
234         }
235     }
236 
237     /***
238      * Processes the contents of a file, delegating to the
239      * {@link #process(File, Writer, ContextMap)} method.
240      *
241      * @param file the file to process
242      * @param writer the writer to write the processed text to; not closed by
243      * this method
244      * @throws IOException if an I/O error occurs
245      * @throws ProcessingException if an error occurs during processing
246      */
247     public final void process(final File file, final Writer writer)
248     throws IOException, ProcessingException {
249         // create a context and store local charset, if configured
250         final ContextMap context = new ContextMap();
251         if (getConfig().containsKey(IOUtils.KEY_LOCAL_CHARSET)) {
252             context.put(IOUtils.KEY_LOCAL_CHARSET,
253                 getConfig().getString(IOUtils.KEY_LOCAL_CHARSET));
254         }
255 
256         process(file, writer, context);
257     }
258 
259     /***
260      * Processes the contents of a file, delegating to the
261      * {@link #process(Reader, Writer, ContextMap)} method. Stores a mapping
262      * from {@link ContentType#KEY_MIME_TYPE} to the document's MIME type in the
263      * {@link ContextMap context}.
264      *
265      * @param file the file to process
266      * @param writer the writer to write the processed text to; not closed by
267      * this method
268      * @param context a map of objects that are made available for processing;
269      * should contain a mapping from {@link IOUtils#KEY_LOCAL_CHARSET} to the
270      * character set to use for local files
271      * @throws IOException if an I/O error occurs
272      * @throws ProcessingException if an error occurs during processing
273      */
274     public final void process(final File file, final Writer writer,
275             final ContextMap context)
276     throws IOException, ProcessingException {
277         // rely on file extension for content type
278         final ContentType contentType =
279             ContentType.determineContentType(file, getConfig());
280 
281         // store the MIME type + directory in the context
282         context.put(ContentType.KEY_MIME_TYPE, contentType.getMimeType());
283         context.put(KEY_DIRECTORY, file.getParentFile());
284         final String charset = (String) context.get(IOUtils.KEY_LOCAL_CHARSET);
285 
286         // open reader using configured charset
287         final InputStreamReader reader = IOUtils.openReader(file, charset);
288 
289         if (charset == null) {
290             // no charset given-- store the platform-specific default charset
291             context.put(IOUtils.KEY_LOCAL_CHARSET,
292                     IOUtils.determineCharsetName(reader));
293         }
294 
295         try {
296             // delegate to actual method
297             process(reader, writer, context);
298         } finally {
299             IOUtils.tryToClose(reader);
300         }
301     }
302 
303     /***
304      * Processes a file or URL given as input argument, delegating to the
305      * appropriate <code>process</code> method. A warning is logged if the input
306      * is neither a readable file nor a readable URL. Stores a mapping
307      * from {@link IOUtils#KEY_LOCAL_CHARSET} to the character set of the
308      * output writer in the created {@link ContextMap context}.
309      *
310      * @param inputName the name of a readable file or URL to process
311      * @throws IOException if an I/O error occurs during processing
312      * @throws ProcessingException if an error occurs during processing
313      */
314     public final void process(final String inputName)
315             throws IOException, ProcessingException {
316         final long startTime = System.currentTimeMillis();
317         URL inURL = null;
318         URLConnection inConn = null;
319         String dirName = null;
320         String localName = null;
321         boolean isOkay;
322         Util.LOG.debug("Starting to process " + inputName);
323 
324         // check whether it is a file or an URL
325         final File inFile = new File(inputName);
326         if (inFile.exists() && inFile.canRead()) {
327             // for determining output file name
328             dirName = inFile.getParent();
329             localName = inFile.getName();
330             isOkay = true;
331         } else {
332             // if it's not a file check whether it's an URL
333             try {
334                 inURL = new URL(inputName);
335                 inConn = inURL.openConnection();
336 
337                 // for determining output file name
338                 dirName = null;
339                 localName = IOUtils.getLocalName(inURL, true);
340                 isOkay = true;
341             } catch (IOException ioe) {
342                 isOkay = false;
343             }
344         }
345 
346         // where to write the output file
347         final String outDirName =
348             getConfig().getString(KEY_OUT_DIRECTORY, dirName);
349         final File outDir = (outDirName != null) ? new File(outDirName) : null;
350 
351         if (isOkay) {
352             final File outFile = IOUtils.createOutFile(outDir,
353                     localName, getOutFileExt());
354             final OutputStreamWriter writer;
355             final String localCharset;
356 
357             // check whether a local charset is configured
358             if (getConfig().containsKey(IOUtils.KEY_LOCAL_CHARSET)) {
359                 // use configured charset
360                 localCharset = getConfig().getString(IOUtils.KEY_LOCAL_CHARSET);
361                 writer = new OutputStreamWriter(
362                     new FileOutputStream(outFile), localCharset);
363             } else {
364                 // use platform default charset for output
365                 localCharset = null;
366                 writer = new FileWriter(outFile);
367             }
368 
369             // create a context and store local charset + local name
370             final ContextMap context = new ContextMap();
371             final String usedCharset = (localCharset != null)
372                 ? localCharset
373                 : IOUtils.determineCharsetName(writer);
374             context.put(IOUtils.KEY_LOCAL_CHARSET, usedCharset);
375             context.put(KEY_LOCAL_NAME, localName);
376 
377             try {
378                 if (inConn != null) {
379                     // delegate to URL processing method
380                     process(inConn, writer, context);
381                 } else {
382                     // delegate to file processing method
383                     process(inFile, writer, context);
384                 }
385 
386                 writer.flush();
387             } finally {
388                 IOUtils.tryToClose(writer);
389 
390                 // delete the out file if no output was written
391                 if (outFile.length() == 0) {
392                     if (!outFile.delete()) {
393                         Util.LOG.warn("Could not delete empty output file "
394                             + outFile);
395                     }
396                 }
397             }
398 
399             if (outFile.exists() && (outFile.length() > 0)) {
400                 Util.LOG.info("Results of processing " + inputName
401                     + " stored in " + outFile + " ("
402                     + Util.showDuration(startTime) + ")");
403             } else {
404                 Util.LOG.info("Processed " + inputName + " ("
405                     + Util.showDuration(startTime) + ")");
406             }
407         } else {
408             Util.LOG.warn(inputName + " is neither a readable "
409                 + "file nor a readable URL -- ignoring it");
410         }
411     }
412 
413     /***
414      * Processes the contents of an URL connection, delegating to the
415      * {@link #process(URLConnection, Writer, ContextMap)} method.
416      * Assumed the {@linkplain IOUtils#STANDARD_HTTP_CHARSET standard HTTP
417      * character set} ("ISO-8859-1") if no {@link IOUtils#KEY_LOCAL_CHARSET} has been configured.
418      *
419      * @param urlConn the URL connection to process
420      * @param writer the writer to write the processed text to; not closed by
421      * this method
422      * @throws IOException if an I/O error occurs
423      * @throws ProcessingException if an error occurs during processing
424      */
425     public final void process(final URLConnection urlConn, final Writer writer)
426     throws IOException, ProcessingException {
427         // create a context and store local charset (default to HTTP standard)
428         final ContextMap context = new ContextMap();
429         final String charset = getConfig().getString(IOUtils.KEY_LOCAL_CHARSET,
430                 IOUtils.STANDARD_HTTP_CHARSET);
431         context.put(IOUtils.KEY_LOCAL_CHARSET, charset);
432 
433         process(urlConn, writer, context);
434     }
435 
436     /***
437      * Processes the contents of an URL connection, delegating to the
438      * {@link #process(Reader, Writer, ContextMap)} method. Stores a mapping
439      * from {@link ContentType#KEY_MIME_TYPE} to the document's MIME type in the
440      * {@link ContextMap context}.
441      *
442      * @param urlConn the URL connection to process
443      * @param writer the writer to write the processed text to; not closed by
444      * this method
445      * @param context a map of objects that are made available for processing;
446      * must contain a mapping from {@link IOUtils#KEY_LOCAL_CHARSET} to the
447      * character set to use for local files
448      * @throws IOException if an I/O error occurs
449      * @throws ProcessingException if an error occurs during processing
450      */
451     public final void process(final URLConnection urlConn, final Writer writer,
452             final ContextMap context) throws IOException, ProcessingException {
453         // use Content-Type header + URL extension to determine content type
454         final URL url = urlConn.getURL();
455         final String contentTypeHeader = urlConn.getContentType();
456         final ContentType contentType = ContentType.determineContentType(
457                 url, contentTypeHeader, getConfig());
458 
459         // store the MIME type + URL in the context
460         context.put(ContentType.KEY_MIME_TYPE,
461             contentType.getMimeType());
462         context.put(KEY_URL, url);
463 
464         // determine charset + open reader
465         final String charset;
466         if (contentType.getCharset() != null) {
467             // use determined charset
468             charset = contentType.getCharset();
469         } else {
470             // use configured local charset
471             charset = (String) context.get(IOUtils.KEY_LOCAL_CHARSET);
472         }
473 
474         final InputStreamReader reader =
475             new InputStreamReader(urlConn.getInputStream(), charset);
476 
477         try {
478             // delegate to actual method
479             process(reader, writer, context);
480         } finally {
481             IOUtils.tryToClose(reader);
482         }
483     }
484 
485     /***
486      * Returns a string representation of this object.
487      *
488      * @return a textual representation
489      */
490     public String toString() {
491         return new ToStringBuilder(this)
492             .append("output file extension", outFileExt)
493             .toString();
494     }
495 
496 }