View Javadoc

1   /*
2    * Copyright (C) 2005-2006 Christian Siefkes <christian@siefkes.net>.
3    * Development of this software is supported by the German Research Society,
4    * Berlin-Brandenburg Graduate School in Distributed Information Systems
5    * (DFG grant no. GRK 316).
6    *
7    * This program is free software; you can redistribute it and/or modify
8    * it under the terms of the GNU General Public License as published by
9    * the Free Software Foundation; either version 2 of the License, or
10   * (at your option) any later version.
11   *
12   * This program is distributed in the hope that it will be useful,
13   * but WITHOUT ANY WARRANTY; without even the implied warranty of
14   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15   * GNU General Public License for more details.
16   *
17   * You should have received a copy of the GNU General Public License
18   * along with this program; if not, visit
19   * http://www.gnu.org/licenses/gpl.html or write to the Free Software
20   * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
21   */
22  package de.fu_berlin.ties.text;
23  
24  import java.io.IOException;
25  import java.io.Reader;
26  import java.io.Writer;
27  import java.util.regex.Pattern;
28  
29  import de.fu_berlin.ties.ContextMap;
30  import de.fu_berlin.ties.ProcessingException;
31  import de.fu_berlin.ties.TextProcessor;
32  import de.fu_berlin.ties.TiesConfiguration;
33  import de.fu_berlin.ties.io.IOUtils;
34  
35  /***
36   * Simplifies different kinds of quotes that can occur in text files, replacing
37   * all kinds of quotes by a &quot; character.
38   *
39   * <p>The {@link #simplifyQuotes(String)} method can be used statically,
40   * instance creation is only necessary if you want to .
41   * 
42   * @author Christian Siefkes
43   * @version $Revision: 1.5 $, $Date: 2006/10/21 16:04:25 $, $Author: siefkes $
44   */
45  public class SimplifyQuotes extends TextProcessor {
46  
47      /***
48       * Pattern matching the different kinds of quotes that are simplified by
49       * this instance.
50       */
51      private static final Pattern QUOTE_PATTERN = Pattern.compile("(?:''|``|"
52          + "\u0171|\u0187|\u8216|\u0145|\u8217|\u0146|\u8218|\u0130|\u8220|"
53          + "\u0147|\u8221|\u0148|\u8222|\u0132|\u8249|\u0139|\u8250|\u0155)");
54  
55  
56      /***
57       * Simplifies different kinds of quotes that can occur in text files,
58       * replacing all kinds of quotes by a &quot; character.
59       *
60       * @param input the input text to simplify
61       * @return a string created by simplifying all quotes in the input
62       */
63      public static String simplifyQuotes(final String input) {
64          return TextUtils.replaceAll(input, QUOTE_PATTERN, "\"");
65      }
66  
67  
68      /***
69       * Creates a new instance.
70       *
71       * @param outExt the extension to use for output files
72       */
73      public SimplifyQuotes(final String outExt) {
74          super(outExt);
75      }
76  
77      /***
78       * Creates a new instance.
79       *
80       * @param outExt the extension to use for output files
81       * @param conf used to configure superclasses
82       */
83      public SimplifyQuotes(final String outExt, final TiesConfiguration conf) {
84          super(outExt, conf);
85      }
86  
87      /***
88       * {@inheritDoc}
89       */
90      protected void doProcess(final Reader reader, final Writer writer,
91              final ContextMap context) throws IOException, ProcessingException {
92          // delegate to static method
93          writer.write(simplifyQuotes(IOUtils.readToString(reader)));
94      }
95  
96  }