View Javadoc

1   /*
2    * Copyright (C) 2005-2006 Christian Siefkes <christian@siefkes.net>.
3    * Development of this software is supported by the German Research Society,
4    * Berlin-Brandenburg Graduate School in Distributed Information Systems
5    * (DFG grant no. GRK 316).
6    *
7    * This program is free software; you can redistribute it and/or modify
8    * it under the terms of the GNU General Public License as published by
9    * the Free Software Foundation; either version 2 of the License, or
10   * (at your option) any later version.
11   *
12   * This program is distributed in the hope that it will be useful,
13   * but WITHOUT ANY WARRANTY; without even the implied warranty of
14   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15   * GNU General Public License for more details.
16   *
17   * You should have received a copy of the GNU General Public License
18   * along with this program; if not, visit
19   * http://www.gnu.org/licenses/gpl.html or write to the Free Software
20   * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
21   */
22  package de.fu_berlin.ties.io;
23  
24  import java.io.File;
25  import java.io.IOException;
26  import java.io.Reader;
27  import java.io.Writer;
28  import java.util.Iterator;
29  
30  import org.apache.commons.lang.StringUtils;
31  
32  import de.fu_berlin.ties.ContextMap;
33  import de.fu_berlin.ties.ProcessingException;
34  import de.fu_berlin.ties.TextProcessor;
35  import de.fu_berlin.ties.TiesConfiguration;
36  import de.fu_berlin.ties.text.TextUtils;
37  
38  /***
39   * Externalizes the contents of a file in
40   * {@linkplain de.fu_berlin.ties.io.DelimSepValues DSV format} (or any other
41   * {@link de.fu_berlin.ties.io.FieldContainer}. For each entry, the contents
42   * of one specified field (read from the value of the {@link #CONFIG_KEY}
43   * configuration parameter) are stored in an external file. The base name of the
44   * external file (without file extension) is stored in the output DSV file
45   * instead of its content.
46   *
47   * <p>Base name and extension of the external files are determined from the
48   * input file. For example, if the input file is named <em>file.data</em> and
49   * contains 87 entries, 87 externalized files named <em>file01.data</em>,
50   * <em>file02.data</em>, ..., <em>file87.data</em> will be created
51   * (the number of leading zeros is determined as required to ensure that all
52   * file names have the same length). Entries are skipped (but still counted for
53   * numbering purposes) if the value of the specified field is empty or missing.
54   *
55   * <p>Instances of this class are thread-safe.
56   *
57   * @author Christian Siefkes
58   * @version $Revision: 1.7 $, $Date: 2006/10/21 16:04:22 $, $Author: siefkes $
59   */
60  public class Externalize extends TextProcessor {
61  
62      /***
63       * Configuration key: The name of the field whose contents to externalize:
64       * {@value}.
65       */
66      public static final String CONFIG_KEY = "externalize.key";
67  
68  
69      /***
70       * Creates a new instance, using the
71       * {@linkplain TiesConfiguration#CONF standard configuration}.
72       *
73       * @param outExt the extension to use for output files
74       */
75      public Externalize(final String outExt) {
76          super(outExt);
77      }
78  
79      /***
80       * Creates a new instance.
81       *
82       * @param outExt the extension to use for output files
83       * @param conf used to configure this instance; if <code>null</code>,
84       * the {@linkplain TiesConfiguration#CONF standard configuration} is used
85       */
86      public Externalize(final String outExt, final TiesConfiguration conf) {
87          super(outExt, conf);
88      }
89  
90      /***
91       * Externalizes the contents of a field container. This method delegates
92       * to {@link #externalize(FieldContainer, File, String, String, String)},
93       * determining the name of the field to externalize from the
94       * {@link #CONFIG_KEY} configuration parameter.
95       *
96       * @param container the container to externalize; will be modified by
97       * replacing the values stored in the <code>key</code> field with the base
98       * names (without extension) of the newly created external files containing
99       * them
100      * @param directory the directory to use for storing the externalized files;
101      * if <code>null</code>, the working directory is used
102      * @param localName the name of the input file, used to determine the names
103      * of externalized files
104      * @param charset  the character set to use for the external files;
105      * if <code>null</code>, the default charset of the current platform is used
106      * @throws IOException if an I/O error occurs while writing the external
107      * files
108      */
109     public void externalize(final FieldContainer container,
110             final File directory, final String localName, final String charset)
111     throws IOException {
112         externalize(container, directory, localName, charset,
113                 getConfig().getString(CONFIG_KEY));
114     }
115 
116     /***
117      * Externalizes the contents of a field container. 
118      *
119      * @param container the container to externalize; will be modified by
120      * replacing the values stored in the <code>key</code> field with the base
121      * names (without extension) of the newly created external files containing
122      * them
123      * @param directory the directory to use for storing the externalized files;
124      * if <code>null</code>, the working directory is used
125      * @param localName the name of the input file, used to determine the names
126      * of externalized files
127      * @param charset the character set to use for the external files;
128      * if <code>null</code>, the default charset of the current platform is used
129      * @param key the name of the field to externalize
130      * @throws IOException if an I/O error occurs while writing the external
131      * files
132      */
133     public void externalize(final FieldContainer container,
134             final File directory, final String localName, final String charset,
135             final String key)
136     throws IOException {
137         final String baseName = IOUtils.getBaseName(localName);
138         final String extension = IOUtils.getExtension(localName);
139 
140         // number of digits to ensure that all numbers have same length 
141         final int numDigits = String.valueOf(container.size()).length();
142 
143         // iterate all entries, counting from 1
144         int i = 1;
145         final Iterator<FieldMap> entryIter = container.entryIterator();
146         FieldMap entry;
147         String value;
148         String extFileBase;
149         String extFileName;
150         File extFile;
151         Writer extWriter;
152 
153         while (entryIter.hasNext()) {
154             entry = entryIter.next();
155             value = (String) entry.get(key);
156 
157             // jump over entry if value is missing or empty
158             if (StringUtils.isNotEmpty(value)) {
159                 // left-pad with zeros so all numbers have same length
160                 extFileBase = baseName + StringUtils.leftPad(
161                         Integer.toString(i), numDigits, '0');
162                 extFileName = extFileBase + IOUtils.EXT_SEPARATOR + extension;
163                 extFile = new File(directory, extFileName);
164                 extWriter = IOUtils.openWriter(extFile, charset);
165 
166                 // write value to external file + append newline
167                 try {
168                     IOUtils.writeToWriter(value, extWriter);
169                     extWriter.write(TextUtils.LINE_SEPARATOR);
170                 } finally {
171                     IOUtils.tryToClose(extWriter);
172                 }
173 
174                 // container: replace value by name of external file (w/o ext.)
175                 entry.put(key, extFileBase);
176             }
177 
178             i++;
179         }
180     }
181 
182     /***
183      * {@inheritDoc} This implementation delegates to
184      * {@link #externalize(FieldContainer, File, String, String)}, using
185      * {@linkplain DelimSepValues DSV format} for input and output.
186      */
187     protected void doProcess(final Reader reader, final Writer writer,
188             final ContextMap context) throws IOException, ProcessingException {
189         // delegate to externalize method, using DSV format for input/output
190         final FieldContainer container = new DelimSepValues(getConfig());
191         container.read(reader);
192         externalize(container, (File) context.get(KEY_OUT_DIRECTORY),
193                 (String) context.get(KEY_LOCAL_NAME),
194                 (String) context.get(IOUtils.KEY_LOCAL_CHARSET));
195         container.store(writer);
196     }
197 
198 }